| @@ -0,0 +1,2 @@ | |||
| # from .log import init_logger | |||
| # init_logger() | |||
| @@ -0,0 +1,159 @@ | |||
| # Copyright (c) Microsoft Corporation. | |||
| # Licensed under the MIT license. | |||
| import logging | |||
| import torch.nn as nn | |||
| from .mutables import Mutable, MutableScope, InputChoice | |||
| from .utils import StructuredMutableTreeNode | |||
| logger = logging.getLogger(__name__) | |||
| logger.setLevel(logging.INFO) | |||
| class BaseMutator(nn.Module): | |||
| """ | |||
| A mutator is responsible for mutating a graph by obtaining the search space from the network and implementing | |||
| callbacks that are called in ``forward`` in mutables. | |||
| Parameters | |||
| ---------- | |||
| model : nn.Module | |||
| PyTorch model to apply mutator on. | |||
| """ | |||
| def __init__(self, model): | |||
| super().__init__() | |||
| self.__dict__["model"] = model | |||
| self._structured_mutables = self._parse_search_space(self.model) | |||
| def _parse_search_space(self, module, root=None, prefix="", memo=None, nested_detection=None): | |||
| if memo is None: | |||
| memo = set() | |||
| if root is None: | |||
| root = StructuredMutableTreeNode(None) | |||
| if module not in memo: | |||
| memo.add(module) | |||
| if isinstance(module, Mutable): | |||
| if nested_detection is not None: | |||
| raise RuntimeError("Cannot have nested search space. Error at {} in {}" | |||
| .format(module, nested_detection)) | |||
| module.name = prefix | |||
| module.set_mutator(self) | |||
| root = root.add_child(module) | |||
| if not isinstance(module, MutableScope): | |||
| nested_detection = module | |||
| if isinstance(module, InputChoice): | |||
| for k in module.choose_from: | |||
| if k != InputChoice.NO_KEY and k not in [m.key for m in memo if isinstance(m, Mutable)]: | |||
| raise RuntimeError("'{}' required by '{}' not found in keys that appeared before, and is not NO_KEY." | |||
| .format(k, module.key)) | |||
| for name, submodule in module._modules.items(): | |||
| if submodule is None: | |||
| continue | |||
| submodule_prefix = prefix + ("." if prefix else "") + name | |||
| self._parse_search_space(submodule, root, submodule_prefix, memo=memo, | |||
| nested_detection=nested_detection) | |||
| return root | |||
| @property | |||
| def mutables(self): | |||
| """ | |||
| A generator of all modules inheriting :class:`~nni.nas.pytorch.mutables.Mutable`. | |||
| Modules are yielded in the order that they are defined in ``__init__``. | |||
| For mutables whose keys appear multiple times, only the first occurrence is yielded. | |||
| """ | |||
| return self._structured_mutables | |||
| @property | |||
| def undedup_mutables(self): | |||
| return self._structured_mutables.traverse(deduplicate=False) | |||
| def forward(self, *inputs): | |||
| """ | |||
| Warnings | |||
| -------- | |||
| Don't call forward of a mutator. | |||
| """ | |||
| raise RuntimeError("Forward is undefined for mutators.") | |||
| def __setattr__(self, name, value): | |||
| if name == "model": | |||
| raise AttributeError("Attribute `model` can be set at most once, and you shouldn't use `self.model = model` to " | |||
| "include you network, as it will include all parameters in model into the mutator.") | |||
| return super().__setattr__(name, value) | |||
| def enter_mutable_scope(self, mutable_scope): | |||
| """ | |||
| Callback when forward of a MutableScope is entered. | |||
| Parameters | |||
| ---------- | |||
| mutable_scope : MutableScope | |||
| The mutable scope that is entered. | |||
| """ | |||
| pass | |||
| def exit_mutable_scope(self, mutable_scope): | |||
| """ | |||
| Callback when forward of a MutableScope is exited. | |||
| Parameters | |||
| ---------- | |||
| mutable_scope : MutableScope | |||
| The mutable scope that is exited. | |||
| """ | |||
| pass | |||
| def on_forward_layer_choice(self, mutable, *args, **kwargs): | |||
| """ | |||
| Callback for the forward pass of a LayerChoice. | |||
| Parameters | |||
| ---------- | |||
| mutable : LayerChoice | |||
| Module whose forward is called. | |||
| args : list of torch.Tensor | |||
| The arguments of its forward function. | |||
| kwargs : dict | |||
| The keyword arguments of its forward function. | |||
| Returns | |||
| ------- | |||
| tuple of torch.Tensor and torch.Tensor | |||
| Output tensor and mask. | |||
| """ | |||
| raise NotImplementedError | |||
| def on_forward_input_choice(self, mutable, tensor_list): | |||
| """ | |||
| Callback for the forward pass of an InputChoice. | |||
| Parameters | |||
| ---------- | |||
| mutable : InputChoice | |||
| Mutable that is called. | |||
| tensor_list : list of torch.Tensor | |||
| The arguments mutable is called with. | |||
| Returns | |||
| ------- | |||
| tuple of torch.Tensor and torch.Tensor | |||
| Output tensor and mask. | |||
| """ | |||
| raise NotImplementedError | |||
| def export(self): | |||
| """ | |||
| Export the data of all decisions. This should output the decisions of all the mutables, so that the whole | |||
| network can be fully determined with these decisions for further training from scratch. | |||
| Returns | |||
| ------- | |||
| dict | |||
| Mappings from mutable keys to decisions. | |||
| """ | |||
| raise NotImplementedError | |||
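| # --- Illustrative sketch (not part of the original file): a minimal concrete mutator. --- | |||
| # It always runs the first candidate of every LayerChoice and forwards only the first | |||
| # input of every InputChoice. The class name is hypothetical; it relies only on APIs | |||
| # used elsewhere in this repo (indexing/len on LayerChoice, ``n_candidates`` on InputChoice). | |||
| # | |||
| #   import torch | |||
| # | |||
| #   class FirstChoiceMutator(BaseMutator): | |||
| #       def on_forward_layer_choice(self, mutable, *args, **kwargs): | |||
| #           out = mutable[0](*args, **kwargs)              # run the first candidate | |||
| #           mask = torch.zeros(len(mutable), dtype=torch.bool) | |||
| #           mask[0] = True | |||
| #           return out, mask | |||
| # | |||
| #       def on_forward_input_choice(self, mutable, tensor_list): | |||
| #           mask = torch.zeros(mutable.n_candidates, dtype=torch.bool) | |||
| #           mask[0] = True | |||
| #           return tensor_list[0], mask | |||
| # | |||
| #       def export(self): | |||
| #           return {m.key: 0 for m in self.mutables if not isinstance(m, MutableScope)} | |||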
| @@ -0,0 +1,40 @@ | |||
| # Copyright (c) Microsoft Corporation. | |||
| # Licensed under the MIT license. | |||
| from abc import ABC, abstractmethod | |||
| class BaseTrainer(ABC): | |||
| @abstractmethod | |||
| def train(self): | |||
| """ | |||
| Override the method to train. | |||
| """ | |||
| raise NotImplementedError | |||
| @abstractmethod | |||
| def validate(self): | |||
| """ | |||
| Override the method to validate. | |||
| """ | |||
| raise NotImplementedError | |||
| @abstractmethod | |||
| def export(self, file): | |||
| """ | |||
| Override the method to export to file. | |||
| Parameters | |||
| ---------- | |||
| file : str | |||
| File path to export to. | |||
| """ | |||
| raise NotImplementedError | |||
| @abstractmethod | |||
| def checkpoint(self): | |||
| """ | |||
| Override to dump a checkpoint. | |||
| """ | |||
| raise NotImplementedError | |||
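| # --- Illustrative sketch (not part of the original file): a minimal concrete trainer. --- | |||
| # Names are placeholders; only the four abstract methods above are required. | |||
| # | |||
| #   import json | |||
| # | |||
| #   class MyTrainer(BaseTrainer): | |||
| #       def __init__(self, model, mutator, optimizer): | |||
| #           self.model, self.mutator, self.optimizer = model, mutator, optimizer | |||
| #       def train(self): | |||
| #           ...  # training loop over epochs goes here | |||
| #       def validate(self): | |||
| #           ...  # evaluation loop goes here | |||
| #       def export(self, file): | |||
| #           # assumes the mutator implements ``export()`` as in BaseMutator | |||
| #           with open(file, "w") as f: | |||
| #               json.dump(self.mutator.export(), f) | |||
| #       def checkpoint(self): | |||
| #           ...  # dump model/optimizer state here | |||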
| @@ -0,0 +1,167 @@ | |||
| # Copyright (c) Microsoft Corporation. | |||
| # Licensed under the MIT license. | |||
| import logging | |||
| import os | |||
| import torch | |||
| import torch.nn as nn | |||
| _logger = logging.getLogger(__name__) | |||
| _logger.setLevel(logging.INFO) | |||
| class Callback: | |||
| """ | |||
| Callback provides an easy way to react to events like begin/end of epochs. | |||
| """ | |||
| def __init__(self): | |||
| self.model = None | |||
| self.optimizer = None | |||
| self.mutator = None | |||
| self.trainer = None | |||
| def build(self, model, optimizer, mutator, trainer): | |||
| """ | |||
| Callback needs to be built with model, mutator, trainer, to get updates from them. | |||
| Parameters | |||
| ---------- | |||
| model : nn.Module | |||
| Model to be trained. | |||
| optimizer : Optimizer | |||
| Optimizer used to train the model. | |||
| mutator : nn.Module | |||
| Mutator that mutates the model. | |||
| trainer : BaseTrainer | |||
| Trainer that is to call the callback. | |||
| """ | |||
| self.model = model | |||
| self.optimizer = optimizer | |||
| self.mutator = mutator | |||
| self.trainer = trainer | |||
| def on_epoch_begin(self, epoch): | |||
| """ | |||
| Implement this to do something at the beginning of each epoch. | |||
| Parameters | |||
| ---------- | |||
| epoch : int | |||
| Epoch number, starting from 0. | |||
| """ | |||
| pass | |||
| def on_epoch_end(self, epoch): | |||
| """ | |||
| Implement this to do something at the end of each epoch. | |||
| Parameters | |||
| ---------- | |||
| epoch : int | |||
| Epoch number, starting from 0. | |||
| """ | |||
| pass | |||
| def on_batch_begin(self, epoch): | |||
| pass | |||
| def on_batch_end(self, epoch): | |||
| pass | |||
| class LRSchedulerCallback(Callback): | |||
| """ | |||
| Calls the scheduler at the end of every epoch. | |||
| Parameters | |||
| ---------- | |||
| scheduler : LRScheduler | |||
| Scheduler to be called. | |||
| """ | |||
| def __init__(self, scheduler, mode="epoch"): | |||
| super().__init__() | |||
| assert mode == "epoch" | |||
| self.scheduler = scheduler | |||
| self.mode = mode | |||
| def on_epoch_end(self, epoch): | |||
| """ | |||
| Call ``self.scheduler.step()`` on epoch end. | |||
| """ | |||
| self.scheduler.step() | |||
| class ArchitectureCheckpoint(Callback): | |||
| """ | |||
| Calls ``trainer.export()`` at the end of every epoch. | |||
| Parameters | |||
| ---------- | |||
| checkpoint_dir : str | |||
| Location to save checkpoints. | |||
| """ | |||
| def __init__(self, checkpoint_dir): | |||
| super().__init__() | |||
| self.checkpoint_dir = checkpoint_dir | |||
| os.makedirs(self.checkpoint_dir, exist_ok=True) | |||
| def on_epoch_end(self, epoch): | |||
| """ | |||
| Dump to ``<checkpoint_dir>/epoch_{number}.json`` at the end of every epoch. | |||
| """ | |||
| dest_path = os.path.join(self.checkpoint_dir, "epoch_{}.json".format(epoch)) | |||
| _logger.info("Saving architecture to %s", dest_path) | |||
| self.trainer.export(dest_path) | |||
| class BestArchitectureCheckpoint(Callback): | |||
| """ | |||
| Calls ``trainer.export()`` at the end of the final epoch. | |||
| Parameters | |||
| ---------- | |||
| checkpoint_path : str | |||
| Path of the JSON file to save the best architecture to. | |||
| epoches : int | |||
| Total number of epochs; the export happens after the last one. | |||
| """ | |||
| def __init__(self, checkpoint_path, epoches): | |||
| super().__init__() | |||
| self.epoches = epoches | |||
| self.checkpoint_path = checkpoint_path | |||
| def on_epoch_end(self, epoch): | |||
| """ | |||
| Dump to ``checkpoint_path`` (e.g. ``./best_selected_space.json``) at the end of the final epoch. | |||
| """ | |||
| if epoch == self.epoches - 1: | |||
| _logger.info("Saving architecture to %s", self.checkpoint_path) | |||
| self.trainer.export(self.checkpoint_path) | |||
| class ModelCheckpoint(Callback): | |||
| """ | |||
| Saves a checkpoint of the model and optimizer at the end of every epoch. | |||
| Parameters | |||
| ---------- | |||
| checkpoint_dir : str | |||
| Location to save checkpoints. | |||
| """ | |||
| def __init__(self, checkpoint_dir): | |||
| super().__init__() | |||
| self.checkpoint_dir = checkpoint_dir | |||
| os.makedirs(self.checkpoint_dir, exist_ok=True) | |||
| def on_epoch_end(self, epoch): | |||
| """ | |||
| Dump to ``<checkpoint_dir>/epoch_{number}.pth.tar`` at the end of every epoch. | |||
| For ``DataParallel`` objects, the state dict of the wrapped module is exported. | |||
| """ | |||
| if isinstance(self.model, nn.DataParallel): | |||
| child_model_state_dict = self.model.module.state_dict() | |||
| else: | |||
| child_model_state_dict = self.model.state_dict() | |||
| save_state = {'child_model_state_dict': child_model_state_dict, | |||
| 'optimizer_state_dict': self.optimizer.state_dict(), | |||
| 'epoch': epoch} | |||
| dest_path = os.path.join(self.checkpoint_dir, "epoch_{}.pth.tar".format(epoch)) | |||
| _logger.info("Saving model to %s", dest_path) | |||
| torch.save(save_state, dest_path) | |||
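| # --- Illustrative sketch (not part of the original file): how a trainer is expected to | |||
| # drive these callbacks. ``num_epochs`` and ``train_one_epoch`` are placeholder names. --- | |||
| # | |||
| #   callbacks = [LRSchedulerCallback(scheduler), ModelCheckpoint("./checkpoints")] | |||
| #   for cb in callbacks: | |||
| #       cb.build(model, optimizer, mutator, trainer) | |||
| #   for epoch in range(num_epochs): | |||
| #       for cb in callbacks: | |||
| #           cb.on_epoch_begin(epoch) | |||
| #       trainer.train_one_epoch(epoch) | |||
| #       for cb in callbacks: | |||
| #           cb.on_epoch_end(epoch) | |||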
| @@ -0,0 +1,150 @@ | |||
| # Copyright (c) Microsoft Corporation. | |||
| # Licensed under the MIT license. | |||
| import json | |||
| import logging | |||
| import sys | |||
| sys.path.append('../..') | |||
| from pytorch.mutables import InputChoice, LayerChoice, MutableScope | |||
| from pytorch.mutator import Mutator | |||
| from pytorch.utils import to_list | |||
| _logger = logging.getLogger(__name__) | |||
| #_logger.setLevel(logging.INFO) | |||
| class FixedArchitecture(Mutator): | |||
| """ | |||
| Fixed architecture mutator that always selects a certain graph. | |||
| Parameters | |||
| ---------- | |||
| model : nn.Module | |||
| A mutable network. | |||
| fixed_arc : dict | |||
| Preloaded architecture object. | |||
| strict : bool | |||
| Force everything that appears in ``fixed_arc`` to be used at least once. | |||
| """ | |||
| def __init__(self, model, fixed_arc, strict=True): | |||
| super().__init__(model) | |||
| self._fixed_arc = fixed_arc | |||
| mutable_keys = set([mutable.key for mutable in self.mutables if not isinstance(mutable, MutableScope)]) | |||
| fixed_arc_keys = set(self._fixed_arc.keys()) | |||
| if fixed_arc_keys - mutable_keys: | |||
| raise RuntimeError("Unexpected keys found in fixed architecture: {}.".format(fixed_arc_keys - mutable_keys)) | |||
| if mutable_keys - fixed_arc_keys: | |||
| raise RuntimeError("Missing keys in fixed architecture: {}.".format(mutable_keys - fixed_arc_keys)) | |||
| self._fixed_arc = self._from_human_readable_architecture(self._fixed_arc) | |||
| def _from_human_readable_architecture(self, human_arc): | |||
| # convert from an exported architecture | |||
| #print('human_arc',human_arc) | |||
| result_arc = {k: to_list(v) for k, v in human_arc.items()} # there could be tensors, numpy arrays, etc. | |||
| #print('result_arc',result_arc) | |||
| # First, convert non-list to list, because there could be {"op1": 0} or {"op1": "conv"}, | |||
| # which means {"op1": [0, ]} or {"op1": ["conv", ]} | |||
| result_arc = {k: v['_value'] if isinstance(v['_value'], list) else [v['_value']] for k, v in result_arc.items()} | |||
| # Second, infer which ones are multi-hot arrays and which ones are in human-readable format. | |||
| # This is non-trivial, since if an array is [0, 1], we cannot know for sure whether it means [false, true] or [true, true]. | |||
| # Here, we assume a multi-hot array has to be a boolean array or a float array whose length matches the number of candidates. | |||
| for mutable in self.mutables: | |||
| if mutable.key not in result_arc: | |||
| continue # skip silently | |||
| choice_arr = result_arc[mutable.key] | |||
| if all(isinstance(v, bool) for v in choice_arr) or all(isinstance(v, float) for v in choice_arr): | |||
| if (isinstance(mutable, LayerChoice) and len(mutable) == len(choice_arr)) or \ | |||
| (isinstance(mutable, InputChoice) and mutable.n_candidates == len(choice_arr)): | |||
| # multihot, do nothing | |||
| continue | |||
| if isinstance(mutable, LayerChoice): | |||
| choice_arr = [mutable.names.index(val) if isinstance(val, str) else val for val in choice_arr] | |||
| choice_arr = [i in choice_arr for i in range(len(mutable))] | |||
| elif isinstance(mutable, InputChoice): | |||
| choice_arr = [mutable.choose_from.index(val) if isinstance(val, str) else val for val in choice_arr] | |||
| choice_arr = [i in choice_arr for i in range(mutable.n_candidates)] | |||
| result_arc[mutable.key] = choice_arr | |||
| return result_arc | |||
| def sample_search(self): | |||
| """ | |||
| Always returns the fixed architecture. | |||
| """ | |||
| return self._fixed_arc | |||
| def sample_final(self): | |||
| """ | |||
| Always returns the fixed architecture. | |||
| """ | |||
| return self._fixed_arc | |||
| def replace_layer_choice(self, module=None, prefix=""): | |||
| """ | |||
| Replace layer choices with selected candidates. It's done with best effort. | |||
| In the case of weighted or multiple choices, candidates whose weight is zero are deleted. | |||
| In the case of a single choice, the LayerChoice module is replaced with the chosen candidate module. | |||
| Parameters | |||
| ---------- | |||
| module : nn.Module | |||
| Module to be processed. | |||
| prefix : str | |||
| Module name under global namespace. | |||
| """ | |||
| if module is None: | |||
| module = self.model | |||
| for name, mutable in module.named_children(): | |||
| global_name = (prefix + "." if prefix else "") + name | |||
| if isinstance(mutable, LayerChoice): | |||
| chosen = self._fixed_arc[mutable.key] | |||
| if sum(chosen) == 1 and max(chosen) == 1 and not mutable.return_mask: | |||
| # sum is one, max is one, there has to be an only one | |||
| # this is compatible with both integer arrays, boolean arrays and float arrays | |||
| _logger.info("Replacing %s with candidate number %d.", global_name, chosen.index(1)) | |||
| setattr(module, name, mutable[chosen.index(1)]) | |||
| else: | |||
| if mutable.return_mask: | |||
| _logger.info("`return_mask` flag of %s is true. As it relies on the behavior of LayerChoice, " \ | |||
| "LayerChoice will not be replaced.") | |||
| # remove unused parameters | |||
| for ch, n in zip(chosen, mutable.names): | |||
| if ch == 0 and not isinstance(ch, float): | |||
| setattr(mutable, n, None) | |||
| else: | |||
| self.replace_layer_choice(mutable, global_name) | |||
| def apply_fixed_architecture(model, fixed_arc): | |||
| """ | |||
| Load architecture from `fixed_arc` and apply to model. | |||
| Parameters | |||
| ---------- | |||
| model : torch.nn.Module | |||
| Model with mutables. | |||
| fixed_arc : str or dict | |||
| Path to the JSON that stores the architecture, or dict that stores the exported architecture. | |||
| Returns | |||
| ------- | |||
| FixedArchitecture | |||
| Mutator that is responsible for fixing the graph. | |||
| """ | |||
| if isinstance(fixed_arc, str): | |||
| with open(fixed_arc) as f: | |||
| fixed_arc = json.load(f) | |||
| architecture = FixedArchitecture(model, fixed_arc) | |||
| architecture.reset() | |||
| # for the convenience of parameters counting | |||
| architecture.replace_layer_choice() | |||
| return architecture | |||
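| # --- Usage sketch (not part of the original file); the path is illustrative. --- | |||
| # | |||
| #   model = Net(hidden_size=512)                     # any model containing mutables | |||
| #   apply_fixed_architecture(model, "./best_selected_space.json") | |||
| #   # the model can now be trained from scratch as a plain nn.Module; LayerChoices with a | |||
| #   # single selected candidate have been replaced by that candidate module. | |||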
| @@ -0,0 +1,206 @@ | |||
| """ | |||
| A deep MNIST classifier using convolutional layers. | |||
| This file is a modification of the official pytorch mnist example: | |||
| https://github.com/pytorch/examples/blob/master/mnist/main.py | |||
| """ | |||
| import os | |||
| import argparse | |||
| import logging | |||
| import sys | |||
| sys.path.append('../..') | |||
| from collections import OrderedDict | |||
| import torch | |||
| import torch.nn as nn | |||
| import torch.nn.functional as F | |||
| import torch.optim as optim | |||
| from torchvision import datasets, transforms | |||
| from pytorch.mutables import LayerChoice, InputChoice | |||
| from mutator import ClassicMutator | |||
| import numpy as np | |||
| import time | |||
| import json | |||
| logger = logging.getLogger('mnist_AutoML') | |||
| class Net(nn.Module): | |||
| def __init__(self, hidden_size): | |||
| super(Net, self).__init__() | |||
| # two options of conv1 | |||
| self.conv1 = LayerChoice(OrderedDict([ | |||
| ("conv5x5", nn.Conv2d(1, 20, 5, 1)), | |||
| ("conv3x3", nn.Conv2d(1, 20, 3, 1)) | |||
| ]), key='first_conv') | |||
| # two options of mid_conv | |||
| self.mid_conv = LayerChoice([ | |||
| nn.Conv2d(20, 20, 3, 1, padding=1), | |||
| nn.Conv2d(20, 20, 5, 1, padding=2) | |||
| ], key='mid_conv') | |||
| self.conv2 = nn.Conv2d(20, 50, 5, 1) | |||
| self.fc1 = nn.Linear(4*4*50, hidden_size) | |||
| self.fc2 = nn.Linear(hidden_size, 10) | |||
| # skip connection over mid_conv | |||
| self.input_switch = InputChoice(n_candidates=2, | |||
| n_chosen=1, | |||
| key='skip') | |||
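| # Note: the keys above ('first_conv', 'mid_conv', 'skip') become the entries of the | |||
| # generated search_space.json; because ``mid_conv`` is built from a plain list instead of | |||
| # an OrderedDict, its candidates are exported under the positional names "0" and "1". | |||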
| def forward(self, x): | |||
| x = F.relu(self.conv1(x)) | |||
| x = F.max_pool2d(x, 2, 2) | |||
| old_x = x | |||
| x = F.relu(self.mid_conv(x)) | |||
| zero_x = torch.zeros_like(old_x) | |||
| skip_x = self.input_switch([zero_x, old_x]) | |||
| x = torch.add(x, skip_x) | |||
| x = F.relu(self.conv2(x)) | |||
| x = F.max_pool2d(x, 2, 2) | |||
| x = x.view(-1, 4*4*50) | |||
| x = F.relu(self.fc1(x)) | |||
| x = self.fc2(x) | |||
| return F.log_softmax(x, dim=1) | |||
| def train(args, model, device, train_loader, optimizer, epoch): | |||
| model.train() | |||
| for batch_idx, (data, target) in enumerate(train_loader): | |||
| data, target = data.to(device), target.to(device) | |||
| optimizer.zero_grad() | |||
| output = model(data) | |||
| loss = F.nll_loss(output, target) | |||
| loss.backward() | |||
| optimizer.step() | |||
| if batch_idx % args['log_interval'] == 0: | |||
| logger.info('Train Epoch: {} [{}/{} ({:.0f}%)]\tLoss: {:.6f}'.format( | |||
| epoch, batch_idx * len(data), len(train_loader.dataset), | |||
| 100. * batch_idx / len(train_loader), loss.item())) | |||
| def test(args, model, device, test_loader): | |||
| model.eval() | |||
| test_loss = 0 | |||
| correct = 0 | |||
| with torch.no_grad(): | |||
| for data, target in test_loader: | |||
| data, target = data.to(device), target.to(device) | |||
| output = model(data) | |||
| # sum up batch loss | |||
| test_loss += F.nll_loss(output, target, reduction='sum').item() | |||
| # get the index of the max log-probability | |||
| pred = output.argmax(dim=1, keepdim=True) | |||
| correct += pred.eq(target.view_as(pred)).sum().item() | |||
| test_loss /= len(test_loader.dataset) | |||
| accuracy = 100. * correct / len(test_loader.dataset) | |||
| logger.info('\nTest set: Average loss: {:.4f}, Accuracy: {}/{} ({:.0f}%)\n'.format( | |||
| test_loss, correct, len(test_loader.dataset), accuracy)) | |||
| return accuracy | |||
| def main(args): | |||
| global_result = {'accuracy': []} | |||
| use_cuda = not args['no_cuda'] and torch.cuda.is_available() | |||
| torch.manual_seed(args['seed']) | |||
| device = torch.device("cuda" if use_cuda else "cpu") | |||
| kwargs = {'num_workers': 1, 'pin_memory': True} if use_cuda else {} | |||
| data_dir = args['data_dir'] | |||
| train_loader = torch.utils.data.DataLoader( | |||
| datasets.MNIST(data_dir, train=True, download=True, | |||
| transform=transforms.Compose([ | |||
| transforms.ToTensor(), | |||
| transforms.Normalize((0.1307,), (0.3081,)) | |||
| ])), | |||
| batch_size=args['batch_size'], shuffle=True, **kwargs) | |||
| test_loader = torch.utils.data.DataLoader( | |||
| datasets.MNIST(data_dir, train=False, transform=transforms.Compose([ | |||
| transforms.ToTensor(), | |||
| transforms.Normalize((0.1307,), (0.3081,)) | |||
| ])), | |||
| batch_size=1000, shuffle=True, **kwargs) | |||
| hidden_size = args['hidden_size'] | |||
| model = Net(hidden_size=hidden_size).to(device) | |||
| #np.random.seed(42) | |||
| #x = np.random.rand(2,1,28,28).astype(np.float32) | |||
| #x= torch.from_numpy(x).to(device) | |||
| ClassicMutator(model, trial_id=args['trial_id'], selected_path=args["selected_space_path"], search_space_path=args["search_space_path"]) | |||
| #y=model(x) | |||
| #print(y) | |||
| optimizer = optim.SGD(model.parameters(), lr=args['lr'], | |||
| momentum=args['momentum']) | |||
| for epoch in range(1, args['epochs'] + 1): | |||
| train(args, model, device, train_loader, optimizer, epoch) | |||
| test_acc = test(args, model, device, test_loader) | |||
| print({"type":"accuracy","result":{"sequence":epoch,"category":"epoch","value":test_acc}} ) | |||
| global_result['accuracy'].append(test_acc) | |||
| return global_result | |||
| def dump_global_result(args,global_result): | |||
| with open(args['result_path'], "w") as ss_file: | |||
| json.dump(global_result, ss_file, sort_keys=True, indent=2) | |||
| def get_params(): | |||
| # Training settings | |||
| parser = argparse.ArgumentParser(description='PyTorch MNIST Example') | |||
| parser.add_argument("--data_dir", type=str, | |||
| default='./data', help="data directory") | |||
| parser.add_argument("--selected_space_path", type=str, | |||
| default='./selected_space.json', help="selected_space_path") | |||
| parser.add_argument("--search_space_path", type=str, | |||
| default='./selected_space.json', help="search_space_path") | |||
| parser.add_argument("--result_path", type=str, | |||
| default='./result.json', help="result_path") | |||
| parser.add_argument('--batch_size', type=int, default=64, metavar='N', | |||
| help='input batch size for training (default: 64)') | |||
| parser.add_argument("--hidden_size", type=int, default=512, metavar='N', | |||
| help='hidden layer size (default: 512)') | |||
| parser.add_argument('--lr', type=float, default=0.01, metavar='LR', | |||
| help='learning rate (default: 0.01)') | |||
| parser.add_argument('--momentum', type=float, default=0.5, metavar='M', | |||
| help='SGD momentum (default: 0.5)') | |||
| parser.add_argument('--epochs', type=int, default=10, metavar='N', | |||
| help='number of epochs to train (default: 10)') | |||
| parser.add_argument('--seed', type=int, default=1, metavar='S', | |||
| help='random seed (default: 1)') | |||
| parser.add_argument('--no_cuda', action='store_true', default=False, | |||
| help='disables CUDA training') | |||
| parser.add_argument('--log_interval', type=int, default=1000, metavar='N', | |||
| help='how many batches to wait before logging training status') | |||
| parser.add_argument('--trial_id', type=int, default=0, metavar='N', | |||
| help='trial_id,start from 0') | |||
| args, _ = parser.parse_known_args() | |||
| return args | |||
| if __name__ == '__main__': | |||
| try: | |||
| start=time.time() | |||
| params = vars(get_params()) | |||
| global_result = main(params) | |||
| global_result['cost_time'] = str(time.time() - start) +'s' | |||
| dump_global_result(params,global_result) | |||
| except Exception as exception: | |||
| logger.exception(exception) | |||
| raise | |||
| @@ -0,0 +1,52 @@ | |||
| import os | |||
| import argparse | |||
| import logging | |||
| import sys | |||
| from collections import OrderedDict | |||
| import torch | |||
| import torch.nn as nn | |||
| import torch.nn.functional as F | |||
| import torch.optim as optim | |||
| from torchvision import datasets, transforms | |||
| from pytorch.mutables import LayerChoice, InputChoice | |||
| from mutator import ClassicMutator | |||
| import numpy as np | |||
| class Net(nn.Module): | |||
| def __init__(self, hidden_size): | |||
| super(Net, self).__init__() | |||
| # two options of conv1 | |||
| self.conv1 = LayerChoice(OrderedDict([ | |||
| ("conv5x5", nn.Conv2d(1, 20, 5, 1)), | |||
| ("conv3x3", nn.Conv2d(1, 20, 3, 1)) | |||
| ]), key='conv1') | |||
| # two options of mid_conv | |||
| self.mid_conv = LayerChoice(OrderedDict([ | |||
| ("conv3x3",nn.Conv2d(20, 20, 3, 1, padding=1)), | |||
| ("conv5x5",nn.Conv2d(20, 20, 5, 1, padding=2)) | |||
| ]), key='mid_conv') | |||
| self.conv2 = nn.Conv2d(20, 50, 5, 1) | |||
| self.fc1 = nn.Linear(4*4*50, hidden_size) | |||
| self.fc2 = nn.Linear(hidden_size, 10) | |||
| # skip connection over mid_conv | |||
| self.input_switch = InputChoice(n_candidates=2, | |||
| n_chosen=1, | |||
| key='skip') | |||
| def forward(self, x): | |||
| x = F.relu(self.conv1(x)) | |||
| x = F.max_pool2d(x, 2, 2) | |||
| old_x = x | |||
| x = F.relu(self.mid_conv(x)) | |||
| zero_x = torch.zeros_like(old_x) | |||
| skip_x = self.input_switch([zero_x, old_x]) | |||
| x = torch.add(x, skip_x) | |||
| x = F.relu(self.conv2(x)) | |||
| x = F.max_pool2d(x, 2, 2) | |||
| x = x.view(-1, 4*4*50) | |||
| x = F.relu(self.fc1(x)) | |||
| x = self.fc2(x) | |||
| return F.log_softmax(x, dim=1) | |||
| @@ -0,0 +1,260 @@ | |||
| # Copyright (c) Microsoft Corporation. | |||
| # Licensed under the MIT license. | |||
| import json | |||
| import logging | |||
| import os | |||
| import sys | |||
| sys.path.append('../..') | |||
| import torch | |||
| from pytorch.mutables import LayerChoice, InputChoice, MutableScope | |||
| from pytorch.mutator import Mutator | |||
| import numpy as np | |||
| import random | |||
| logger = logging.getLogger(__name__) | |||
| NNI_GEN_SEARCH_SPACE = "NNI_GEN_SEARCH_SPACE" | |||
| NNI_PLATFORM = "GPU" | |||
| LAYER_CHOICE = "layer_choice" | |||
| INPUT_CHOICE = "input_choice" | |||
| def get_and_apply_next_architecture(model): | |||
| """ | |||
| Wrapper of :class:`~nni.nas.pytorch.classic_nas.mutator.ClassicMutator` to make it more meaningful, | |||
| similar to ``get_next_parameter`` for HPO. | |||
| It will generate search space based on ``model``. | |||
| If the env variable ``NNI_GEN_SEARCH_SPACE`` exists, this is a dry run that only | |||
| generates the search space for the experiment. | |||
| If not, there are still two modes: one is NNI experiment mode, where users | |||
| use ``nnictl`` to start an experiment; the other is standalone mode, | |||
| where users run the trial command directly. Standalone mode chooses the first | |||
| one(s) for each LayerChoice and InputChoice. | |||
| Parameters | |||
| ---------- | |||
| model : nn.Module | |||
| User's model with search space (e.g., LayerChoice, InputChoice) embedded in it. | |||
| """ | |||
| ClassicMutator(model) | |||
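| # Note: in this modified version ``ClassicMutator`` additionally requires ``trial_id``, | |||
| # ``selected_path`` and ``search_space_path``, so it is usually constructed directly, | |||
| # as done in trainer.py / mnist.py. The paths below are illustrative: | |||
| # | |||
| #   ClassicMutator(model, trial_id=0, | |||
| #                  selected_path="./selected_space.json", | |||
| #                  search_space_path="./search_space.json") | |||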
| class ClassicMutator(Mutator): | |||
| """ | |||
| This mutator is to apply the architecture chosen from tuner. | |||
| It implements the forward function of LayerChoice and InputChoice, | |||
| to only activate the chosen ones. | |||
| Parameters | |||
| ---------- | |||
| model : nn.Module | |||
| User's model with search space (e.g., LayerChoice, InputChoice) embedded in it. | |||
| """ | |||
| def __init__(self, model, trial_id, selected_path, search_space_path, load_selected_space=False): | |||
| super(ClassicMutator, self).__init__(model) | |||
| self._chosen_arch = {} | |||
| self._search_space = self._generate_search_space() | |||
| self.trial_id = trial_id | |||
| #if NNI_GEN_SEARCH_SPACE in os.environ: | |||
| # dry run for only generating search space | |||
| self._dump_search_space(search_space_path) | |||
| #sys.exit(0) | |||
| if load_selected_space: | |||
| logger.warning("load selected space.") | |||
| self._chosen_arch = self.load_selected_space | |||
| else: | |||
| # randomly sample a chosen arch (seeded by trial_id) and dump it to the selected space file | |||
| self._chosen_arch = self.random_generate_chosen() | |||
| self._generate_selected_space(selected_path) | |||
| self.reset() | |||
| def _sample_layer_choice(self, mutable, idx, value, search_space_item): | |||
| """ | |||
| Convert layer choice to tensor representation. | |||
| Parameters | |||
| ---------- | |||
| mutable : Mutable | |||
| idx : int | |||
| The ``idx``-th item of the list will be selected. | |||
| value : str | |||
| The verbose representation of the selected value. | |||
| search_space_item : list | |||
| The list for corresponding search space. | |||
| """ | |||
| # doesn't support multihot for layer choice yet | |||
| onehot_list = [False] * len(mutable) | |||
| assert 0 <= idx < len(mutable) and search_space_item[idx] == value, \ | |||
| "Index '{}' in search space '{}' is not '{}'".format(idx, search_space_item, value) | |||
| onehot_list[idx] = True | |||
| return torch.tensor(onehot_list, dtype=torch.bool) # pylint: disable=not-callable | |||
| def _sample_input_choice(self, mutable, idx, value, search_space_item): | |||
| """ | |||
| Convert input choice to tensor representation. | |||
| Parameters | |||
| ---------- | |||
| mutable : Mutable | |||
| idx : list of int | |||
| Indices of the chosen candidates. | |||
| value : list of str | |||
| The verbose representations of the chosen candidates. | |||
| search_space_item : list | |||
| The list for corresponding search space. | |||
| """ | |||
| candidate_repr = search_space_item["candidates"] | |||
| multihot_list = [False] * mutable.n_candidates | |||
| for i, v in zip(idx, value): | |||
| assert 0 <= i < mutable.n_candidates and candidate_repr[i] == v, \ | |||
| "Index '{}' in search space '{}' is not '{}'".format(i, candidate_repr, v) | |||
| assert not multihot_list[i], "'{}' is selected twice in '{}', which is not allowed.".format(i, idx) | |||
| multihot_list[i] = True | |||
| return torch.tensor(multihot_list, dtype=torch.bool) # pylint: disable=not-callable | |||
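| # Example (illustrative): for a LayerChoice with candidates ["conv5x5", "conv3x3"] and the | |||
| # choice {"_value": "conv5x5", "_idx": 0}, ``_sample_layer_choice`` returns | |||
| # tensor([True, False]); for an InputChoice with two candidates and the choice | |||
| # {"_value": [""], "_idx": [0]}, ``_sample_input_choice`` returns tensor([True, False]). | |||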
| def sample_search(self): | |||
| """ | |||
| See :meth:`sample_final`. | |||
| """ | |||
| return self.sample_final() | |||
| def sample_final(self): | |||
| """ | |||
| Convert the chosen arch and apply it on model. | |||
| """ | |||
| assert set(self._chosen_arch.keys()) == set(self._search_space.keys()), \ | |||
| "Unmatched keys, expected keys '{}' from search space, found '{}'.".format(self._search_space.keys(), | |||
| self._chosen_arch.keys()) | |||
| result = dict() | |||
| for mutable in self.mutables: | |||
| if isinstance(mutable, (LayerChoice, InputChoice)): | |||
| assert mutable.key in self._chosen_arch, \ | |||
| "Expected '{}' in chosen arch, but not found.".format(mutable.key) | |||
| data = self._chosen_arch[mutable.key] | |||
| assert isinstance(data, dict) and "_value" in data and "_idx" in data, \ | |||
| "'{}' is not a valid choice.".format(data) | |||
| if isinstance(mutable, LayerChoice): | |||
| result[mutable.key] = self._sample_layer_choice(mutable, data["_idx"], data["_value"], | |||
| self._search_space[mutable.key]["_value"]) | |||
| elif isinstance(mutable, InputChoice): | |||
| result[mutable.key] = self._sample_input_choice(mutable, data["_idx"], data["_value"], | |||
| self._search_space[mutable.key]["_value"]) | |||
| elif isinstance(mutable, MutableScope): | |||
| logger.info("Mutable scope '%s' is skipped during parsing choices.", mutable.key) | |||
| else: | |||
| raise TypeError("Unsupported mutable type: '%s'." % type(mutable)) | |||
| return result | |||
| def _standalone_generate_chosen(self): | |||
| """ | |||
| Generate the chosen architecture for standalone mode, | |||
| i.e., choose the first one(s) for LayerChoice and InputChoice. | |||
| :: | |||
| { key_name: {"_value": "conv1", | |||
| "_idx": 0} } | |||
| { key_name: {"_value": ["in1"], | |||
| "_idx": [0]} } | |||
| Returns | |||
| ------- | |||
| dict | |||
| the chosen architecture | |||
| """ | |||
| chosen_arch = {} | |||
| for key, val in self._search_space.items(): | |||
| if val["_type"] == LAYER_CHOICE: | |||
| choices = val["_value"] | |||
| chosen_arch[key] = {"_value": choices[0], "_idx": 0} | |||
| elif val["_type"] == INPUT_CHOICE: | |||
| choices = val["_value"]["candidates"] | |||
| n_chosen = val["_value"]["n_chosen"] | |||
| if n_chosen is None: | |||
| n_chosen = len(choices) | |||
| chosen_arch[key] = {"_value": choices[:n_chosen], "_idx": list(range(n_chosen))} | |||
| else: | |||
| raise ValueError("Unknown key '%s' and value '%s'." % (key, val)) | |||
| return chosen_arch | |||
| def random_generate_chosen(self): | |||
| """ | |||
| Randomly generate a chosen architecture, seeded by ``trial_id``, | |||
| i.e., randomly pick candidate(s) for each LayerChoice and InputChoice. | |||
| :: | |||
| { key_name: {"_value": "conv1", | |||
| "_idx": 0} } | |||
| { key_name: {"_value": ["in1"], | |||
| "_idx": [0]} } | |||
| Returns | |||
| ------- | |||
| dict | |||
| the chosen architecture | |||
| """ | |||
| chosen_arch = {} | |||
| np.random.seed(self.trial_id) | |||
| random.seed(self.trial_id) | |||
| for key, val in self._search_space.items(): | |||
| if val["_type"] == LAYER_CHOICE: | |||
| choices = val["_value"] | |||
| chosen_idx = np.random.randint(len(choices)) | |||
| chosen_arch[key] = {"_value": choices[chosen_idx], "_idx": chosen_idx} | |||
| elif val["_type"] == INPUT_CHOICE: | |||
| choices = val["_value"]["candidates"] | |||
| n_chosen = val["_value"]["n_chosen"] | |||
| if n_chosen is None: | |||
| n_chosen = len(choices) | |||
| # sample ``n_chosen`` indices from all candidates, not just the first ``n_chosen`` | |||
| chosen_idx = sorted(random.sample(range(len(choices)), n_chosen)) | |||
| chosen_arch[key] = {"_value": [choices[idx] for idx in chosen_idx], "_idx": chosen_idx} | |||
| else: | |||
| raise ValueError("Unknown key '%s' and value '%s'." % (key, val)) | |||
| return chosen_arch | |||
| def _generate_search_space(self): | |||
| """ | |||
| Generate search space from mutables. | |||
| Here is the search space format: | |||
| :: | |||
| { key_name: {"_type": "layer_choice", | |||
| "_value": ["conv1", "conv2"]} } | |||
| { key_name: {"_type": "input_choice", | |||
| "_value": {"candidates": ["in1", "in2"], | |||
| "n_chosen": 1}} } | |||
| Returns | |||
| ------- | |||
| dict | |||
| the generated search space | |||
| """ | |||
| search_space = {} | |||
| for mutable in self.mutables: | |||
| # for now we only generate flattened search space | |||
| if isinstance(mutable, LayerChoice): | |||
| key = mutable.key | |||
| val = mutable.names | |||
| search_space[key] = {"_type": LAYER_CHOICE, "_value": val} | |||
| elif isinstance(mutable, InputChoice): | |||
| key = mutable.key | |||
| search_space[key] = {"_type": INPUT_CHOICE, | |||
| "_value": {"candidates": mutable.choose_from, | |||
| "n_chosen": mutable.n_chosen}} | |||
| elif isinstance(mutable, MutableScope): | |||
| logger.info("Mutable scope '%s' is skipped during generating search space.", mutable.key) | |||
| else: | |||
| raise TypeError("Unsupported mutable type: '%s'." % type(mutable)) | |||
| return search_space | |||
| def _dump_search_space(self, file_path): | |||
| with open(file_path, "w") as ss_file: | |||
| json.dump(self._search_space, ss_file, sort_keys=True, indent=2) | |||
| def _generate_selected_space(self,file_path): | |||
| with open(file_path, "w") as ss_file: | |||
| json.dump(self._chosen_arch, ss_file, sort_keys=True, indent=2) | |||
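| # Summary of the flow implemented above (see also trainer.py / mnist.py): constructing a | |||
| # ClassicMutator dumps the generated search space to ``search_space_path``, then either | |||
| # loads a previously selected space or randomly samples one (seeded by ``trial_id``), | |||
| # dumps the choice to ``selected_path``, and applies it to the model via ``reset()``. | |||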
| @@ -0,0 +1,64 @@ | |||
| stage1: python trainer.py --trial_id=1 --model_selected_space_path='./exp/train/2/model_selected_space.json' --search_space_path='./search_space.json' --result_path='./exp/train/2/result.json' | |||
| stage2: python selector.py --experiment_dir='./exp' --best_selected_space_path='./best_selected_space.json' | |||
| stage3: python retrainer.py --best_checkpoint_dir='experiment_id/' --best_selected_space_path='./best_selected_space.json' --result_path='result.json' | |||
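| Note: selector.py assumes a per-trial directory layout under the experiment directory, | |||
| i.e. ./exp/train/<trial_id>/result/result.json and | |||
| ./exp/train/<trial_id>/model_selected_space/model_selected_space.json. | |||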
| search_space.json: | |||
| { | |||
| "first_conv": { | |||
| "_type": "layer_choice", | |||
| "_value": [ | |||
| "conv5x5", | |||
| "conv3x3" | |||
| ] | |||
| }, | |||
| "mid_conv": { | |||
| "_type": "layer_choice", | |||
| "_value": [ | |||
| "0", | |||
| "1" | |||
| ] | |||
| }, | |||
| "skip": { | |||
| "_type": "input_choice", | |||
| "_value": { | |||
| "candidates": [ | |||
| "", | |||
| "" | |||
| ], | |||
| "n_chosen": 1 | |||
| } | |||
| } | |||
| } | |||
| selected_space.json: | |||
| { | |||
| "first_conv": { | |||
| "_idx": 0, | |||
| "_value": "conv5x5" | |||
| }, | |||
| "mid_conv": { | |||
| "_idx": 0, | |||
| "_value": "0" | |||
| }, | |||
| "skip": { | |||
| "_idx": [ | |||
| 0 | |||
| ], | |||
| "_value": [ | |||
| "" | |||
| ] | |||
| } | |||
| } | |||
| result.json: | |||
| {"type": "accuracy", "result": {"sequence": 1, "category": "epoch", "value": 96.73815907059875}} | |||
| {"type": "accuracy", "result": {"sequence": 2, "category": "epoch", "value": 97.6988382484361}} | |||
| {"type": "accuracy", "result": {"sequence": 3, "category": "epoch", "value": 98.63717605004469}} | |||
| {"type": "accuracy", "result": {"sequence": 4, "category": "epoch", "value": 98.72654155495978}} | |||
| {"type": "accuracy", "result": {"sequence": 5, "category": "epoch", "value": 99.27390527256479}} | |||
| {"type": "accuracy", "result": {"sequence": 6, "category": "epoch", "value": 99.13985701519213}} | |||
| {"type": "accuracy", "result": {"sequence": 7, "category": "epoch", "value": 99.3632707774799}} | |||
| {"type": "accuracy", "result": {"sequence": 8, "category": "epoch", "value": 99.4414655942806}} | |||
| {"type": "accuracy", "result": {"sequence": 9, "category": "epoch", "value": 99.67605004468275}} | |||
| {"type": "accuracy", "result": {"sequence": 10, "category": "epoch", "value": 99.74307417336908}} | |||
| @@ -0,0 +1,288 @@ | |||
| # Copyright (c) Microsoft Corporation. | |||
| # Licensed under the MIT license. | |||
| import copy | |||
| import logging | |||
| import os | |||
| import argparse | |||
| import logging | |||
| import sys | |||
| sys.path.append('../..') | |||
| from collections import OrderedDict | |||
| import torch | |||
| import torch.nn as nn | |||
| import torch.nn.functional as F | |||
| from torchvision import datasets, transforms | |||
| from model import Net | |||
| from pytorch.trainer import Trainer | |||
| from pytorch.utils import AverageMeterGroup | |||
| from pytorch.utils import mkdirs | |||
| from pytorch.mutables import LayerChoice, InputChoice | |||
| from fixed import apply_fixed_architecture | |||
| from mutator import ClassicMutator | |||
| from abc import ABC, abstractmethod | |||
| from pytorch.retrainer import Retrainer | |||
| import numpy as np | |||
| import time | |||
| import json | |||
| logger = logging.getLogger(__name__) | |||
| #logger.setLevel(logging.INFO) | |||
| class ClassicnasRetrainer(Retrainer): | |||
| """ | |||
| Classic NAS retrainer: retrains a fixed (best selected) architecture from scratch. | |||
| Parameters | |||
| ---------- | |||
| model : nn.Module | |||
| PyTorch model to be trained. | |||
| loss : callable | |||
| Receives logits and ground truth label, return a loss tensor. | |||
| metrics : callable | |||
| Receives logits and ground truth label, return a dict of metrics. | |||
| optimizer : Optimizer | |||
| The optimizer used for optimizing the model. | |||
| epochs : int | |||
| Number of epochs planned for training. | |||
| dataset_train : Dataset | |||
| Dataset for training. Will be split for training weights and architecture weights. | |||
| dataset_valid : Dataset | |||
| Dataset for testing. | |||
| mutator : ClassicMutator | |||
| Use in case of customizing your own ClassicMutator. By default will instantiate a ClassicMutator. | |||
| batch_size : int | |||
| Batch size. | |||
| workers : int | |||
| Workers for data loading. | |||
| device : torch.device | |||
| ``torch.device("cpu")`` or ``torch.device("cuda")``. | |||
| log_frequency : int | |||
| Step count per logging. | |||
| callbacks : list of Callback | |||
| list of callbacks to trigger at events. | |||
| arc_learning_rate : float | |||
| Learning rate of architecture parameters. | |||
| unrolled : bool | |||
| ``True`` if using second order optimization, else first order optimization. | |||
| """ | |||
| def __init__(self, model, loss, metrics, | |||
| optimizer, epochs, dataset_train, dataset_valid, search_space_path,selected_space_path,checkpoint_dir,trial_id, | |||
| mutator=None, batch_size=64, workers=4, device=None, log_frequency=None, | |||
| callbacks=None, arc_learning_rate=3.0E-4, unrolled=False): | |||
| self.model = model | |||
| self.loss = loss | |||
| self.metrics = metrics | |||
| self.optimizer = optimizer | |||
| self.epochs = epochs | |||
| self.device = device | |||
| self.batch_size = batch_size | |||
| self.checkpoint_dir =checkpoint_dir | |||
| self.train_loader = torch.utils.data.DataLoader( | |||
| datasets.MNIST(dataset_train, train=True, download=True, | |||
| transform=transforms.Compose([ | |||
| transforms.ToTensor(), | |||
| transforms.Normalize((0.1307,), (0.3081,)) | |||
| ])), | |||
| batch_size=batch_size, shuffle=True, **kwargs) | |||
| self.test_loader = torch.utils.data.DataLoader( | |||
| datasets.MNIST(dataset_valid, train=False, transform=transforms.Compose([ | |||
| transforms.ToTensor(), | |||
| transforms.Normalize((0.1307,), (0.3081,)) | |||
| ])), | |||
| batch_size=1000, shuffle=True, **kwargs) | |||
| self.search_space_path = search_space_path | |||
| self.selected_space_path =selected_space_path | |||
| self.trial_id = trial_id | |||
| self.result = {"accuracy": [],"cost_time": 0.} | |||
| def train(self): | |||
| # t1 = time() | |||
| # phase 1: fix the architecture using the best selected space | |||
| #print(self.model.state_dict) | |||
| apply_fixed_architecture(self.model, self.selected_space_path) | |||
| #print(self.model.state_dict) | |||
| # phase 2: train the fixed child network | |||
| for child_epoch in range(1, self.epochs + 1): | |||
| self.model.train() | |||
| for batch_idx, (data, target) in enumerate(self.train_loader): | |||
| data, target = data.to(self.device), target.to(self.device) | |||
| self.optimizer.zero_grad() | |||
| output = self.model(data) | |||
| loss = F.nll_loss(output, target) | |||
| loss.backward() | |||
| self.optimizer.step() | |||
| if batch_idx % args['log_interval'] == 0: | |||
| logger.info('Train Epoch: {} [{}/{} ({:.0f}%)]\tLoss: {:.6f}'.format( | |||
| child_epoch, batch_idx * len(data), len(self.train_loader.dataset), | |||
| 100. * batch_idx / len(self.train_loader), loss.item())) | |||
| test_acc = self.validate() | |||
| print({"type":"accuracy","result":{"sequence":child_epoch,"category":"epoch","value":test_acc}} ) | |||
| with open(args['result_path'], "a") as ss_file: | |||
| ss_file.write(json.dumps({"type":"accuracy","result":{"sequence":child_epoch,"category":"epoch","value":test_acc}} ) + '\n') | |||
| self.result['accuracy'].append(test_acc) | |||
| def validate(self): | |||
| self.model.eval() | |||
| test_loss = 0 | |||
| correct = 0 | |||
| with torch.no_grad(): | |||
| for data, target in self.test_loader: | |||
| data, target = data.to(self.device), target.to(self.device) | |||
| output = self.model(data) | |||
| # sum up batch loss | |||
| test_loss += F.nll_loss(output, target, reduction='sum').item() | |||
| # get the index of the max log-probability | |||
| pred = output.argmax(dim=1, keepdim=True) | |||
| correct += pred.eq(target.view_as(pred)).sum().item() | |||
| test_loss /= len(self.test_loader.dataset) | |||
| accuracy = 100. * correct / len(self.test_loader.dataset) | |||
| logger.info('\nTest set: Average loss: {:.4f}, Accuracy: {}/{} ({:.0f}%)\n'.format( | |||
| test_loss, correct, len(self.test_loader.dataset), accuracy)) | |||
| return accuracy | |||
| # | |||
| # def export(self, file): | |||
| # """ | |||
| # Override the method to export to file. | |||
| # Parameters | |||
| # ---------- | |||
| # file : str | |||
| # File path to export to. | |||
| # """ | |||
| # raise NotImplementedError | |||
| def checkpoint(self): | |||
| """ | |||
| Override to dump a checkpoint. | |||
| """ | |||
| if isinstance(self.model, nn.DataParallel): | |||
| state_dict = self.model.module.state_dict() | |||
| else: | |||
| state_dict = self.model.state_dict() | |||
| if not os.path.exists(self.checkpoint_dir): | |||
| os.makedirs(self.checkpoint_dir) | |||
| dest_path = os.path.join(self.checkpoint_dir, f"best_checkpoint_epoch{self.epochs}.pth") | |||
| logger.info("Saving model to %s", dest_path) | |||
| torch.save(state_dict, dest_path) | |||
| def dump_global_result(args,global_result): | |||
| with open(args['result_path'], "w") as ss_file: | |||
| json.dump(global_result, ss_file, sort_keys=True, indent=2) | |||
| def get_params(): | |||
| # Training settings | |||
| parser = argparse.ArgumentParser(description='PyTorch MNIST Example') | |||
| parser.add_argument("--data_dir", type=str, | |||
| default='./data', help="data directory") | |||
| parser.add_argument("--best_selected_space_path", type=str, | |||
| default='./selected_space.json', help="selected_space_path") | |||
| parser.add_argument("--search_space_path", type=str, | |||
| default='./selected_space.json', help="search_space_path") | |||
| parser.add_argument("--result_path", type=str, | |||
| default='./result.json', help="result_path") | |||
| parser.add_argument('--batch_size', type=int, default=64, metavar='N', | |||
| help='input batch size for training (default: 64)') | |||
| parser.add_argument("--hidden_size", type=int, default=512, metavar='N', | |||
| help='hidden layer size (default: 512)') | |||
| parser.add_argument('--lr', type=float, default=0.01, metavar='LR', | |||
| help='learning rate (default: 0.01)') | |||
| parser.add_argument('--momentum', type=float, default=0.5, metavar='M', | |||
| help='SGD momentum (default: 0.5)') | |||
| parser.add_argument('--epochs', type=int, default=10, metavar='N', | |||
| help='number of epochs to train (default: 10)') | |||
| parser.add_argument('--seed', type=int, default=1, metavar='S', | |||
| help='random seed (default: 1)') | |||
| parser.add_argument('--no_cuda', action='store_true', default=False, | |||
| help='disables CUDA training') | |||
| parser.add_argument('--log_interval', type=int, default=1000, metavar='N', | |||
| help='how many batches to wait before logging training status') | |||
| parser.add_argument("--best_checkpoint_dir",type=str,default="path/to/", | |||
| help="Path for saved checkpoints. (default: %(default)s)") | |||
| parser.add_argument('--trial_id', type=int, default=0, metavar='N', | |||
| help='trial_id,start from 0') | |||
| args, _ = parser.parse_known_args() | |||
| return args | |||
| if __name__ == '__main__': | |||
| try: | |||
| start=time.time() | |||
| params = vars(get_params()) | |||
| args =params | |||
| use_cuda = not args['no_cuda'] and torch.cuda.is_available() | |||
| torch.manual_seed(args['seed']) | |||
| device = torch.device("cuda" if use_cuda else "cpu") | |||
| kwargs = {'num_workers': 1, 'pin_memory': True} if use_cuda else {} | |||
| data_dir = args['data_dir'] | |||
| hidden_size = args['hidden_size'] | |||
| model = Net(hidden_size=hidden_size).to(device) | |||
| optimizer = torch.optim.SGD(model.parameters(), lr=args['lr'], | |||
| momentum=args['momentum']) | |||
| #mkdirs(args['search_space_path']) | |||
| mkdirs(args['best_selected_space_path']) | |||
| mkdirs(args['result_path']) | |||
| trainer = ClassicnasRetrainer(model, | |||
| loss=None, | |||
| metrics=None, | |||
| optimizer=optimizer, | |||
| epochs=args['epochs'], | |||
| dataset_train=data_dir, | |||
| dataset_valid=data_dir, | |||
| search_space_path = args['search_space_path'], | |||
| selected_space_path = args['best_selected_space_path'], | |||
| checkpoint_dir = args['best_checkpoint_dir'], | |||
| trial_id = args['trial_id'], | |||
| batch_size=args['batch_size'], | |||
| log_frequency=args['log_interval'], | |||
| device= device, | |||
| unrolled=None, | |||
| callbacks=None) | |||
| with open(args['result_path'], "w") as ss_file: | |||
| ss_file.write('') | |||
| trainer.train() | |||
| trainer.checkpoint() | |||
| global_result = trainer.result | |||
| global_result['cost_time'] = str(time.time() - start) +'s' | |||
| #dump_global_result(params,global_result) | |||
| except Exception as exception: | |||
| logger.exception(exception) | |||
| raise | |||
| @@ -0,0 +1,58 @@ | |||
| import sys | |||
| sys.path.append('../..') | |||
| from pytorch.selector import Selector | |||
| from pytorch.utils import mkdirs | |||
| import shutil | |||
| import argparse | |||
| import os | |||
| import json | |||
| class ClassicnasSelector(Selector): | |||
| def __init__(self, args, single_candidate=True): | |||
| super().__init__(single_candidate) | |||
| self.args = args | |||
| def fit(self): | |||
| """ | |||
| Pick the trial with the highest reported accuracy and copy its selected space to ``best_selected_space_path``. | |||
| """ | |||
| train_dir = os.path.join(self.args['experiment_dir'],'train') | |||
| max_accuracy = 0 | |||
| best_selected_space = '' | |||
| for trialId in os.listdir(train_dir): | |||
| path= os.path.join(train_dir,trialId,'result','result.json') | |||
| max_accuracy_trial = 0 | |||
| with open(path,'r') as f: | |||
| for line in f: | |||
| result_dict = json.loads(line) | |||
| accuracy = result_dict["result"]["value"] | |||
| if accuracy>max_accuracy_trial: | |||
| max_accuracy_trial=accuracy | |||
| print(max_accuracy_trial) | |||
| if max_accuracy_trial > max_accuracy: | |||
| max_accuracy = max_accuracy_trial | |||
| best_selected_space = os.path.join(train_dir,trialId,'model_selected_space','model_selected_space.json') | |||
| print('best trial id:',trialId) | |||
| shutil.copyfile(best_selected_space,self.args['best_selected_space_path']) | |||
| def get_params(): | |||
| # Training settings | |||
| parser = argparse.ArgumentParser(description='PyTorch MNIST Example') | |||
| parser.add_argument("--experiment_dir", type=str, | |||
| default='./experiment_dir', help="data directory") | |||
| parser.add_argument("--best_selected_space_path", type=str, | |||
| default='./best_selected_space.json', help="selected_space_path") | |||
| args, _ = parser.parse_known_args() | |||
| return args | |||
| if __name__ == "__main__": | |||
| params = vars(get_params()) | |||
| args =params | |||
| mkdirs(args['best_selected_space_path']) | |||
| hpo_selector = ClassicnasSelector(args,single_candidate=False) | |||
| hpo_selector.fit() | |||
| @@ -0,0 +1,269 @@ | |||
| # Copyright (c) Microsoft Corporation. | |||
| # Licensed under the MIT license. | |||
| import copy | |||
| import logging | |||
| import os | |||
| import argparse | |||
| import logging | |||
| import sys | |||
| sys.path.append('../..') | |||
| from collections import OrderedDict | |||
| import torch | |||
| import torch.nn as nn | |||
| import torch.nn.functional as F | |||
| from torchvision import datasets, transforms | |||
| from model import Net | |||
| from pytorch.trainer import Trainer | |||
| from pytorch.utils import AverageMeterGroup | |||
| from pytorch.utils import mkdirs | |||
| from pytorch.mutables import LayerChoice, InputChoice | |||
| from mutator import ClassicMutator | |||
| import numpy as np | |||
| import time | |||
| import json | |||
| logger = logging.getLogger(__name__) | |||
| #logger.setLevel(logging.INFO) | |||
| class ClassicnasTrainer(Trainer): | |||
| """ | |||
| Classic NAS trainer: randomly samples architectures and trains them. | |||
| Parameters | |||
| ---------- | |||
| model : nn.Module | |||
| PyTorch model to be trained. | |||
| loss : callable | |||
| Receives logits and ground truth label, return a loss tensor. | |||
| metrics : callable | |||
| Receives logits and ground truth label, return a dict of metrics. | |||
| optimizer : Optimizer | |||
| The optimizer used for optimizing the model. | |||
| epochs : int | |||
| Number of epochs planned for training. | |||
| dataset_train : Dataset | |||
| Dataset for training. Will be split for training weights and architecture weights. | |||
| dataset_valid : Dataset | |||
| Dataset for testing. | |||
| mutator : ClassicMutator | |||
| Use in case of customizing your own ClassicMutator. By default will instantiate a ClassicMutator. | |||
| batch_size : int | |||
| Batch size. | |||
| workers : int | |||
| Workers for data loading. | |||
| device : torch.device | |||
| ``torch.device("cpu")`` or ``torch.device("cuda")``. | |||
| log_frequency : int | |||
| Step count per logging. | |||
| callbacks : list of Callback | |||
| list of callbacks to trigger at events. | |||
| arc_learning_rate : float | |||
| Learning rate of architecture parameters. | |||
| unrolled : bool | |||
| ``True`` if using second order optimization, else first order optimization. | |||
| """ | |||
| def __init__(self, model, loss, metrics, | |||
| optimizer, epochs, dataset_train, dataset_valid, search_space_path,selected_space_path,trial_id, | |||
| mutator=None, batch_size=64, workers=4, device=None, log_frequency=None, | |||
| callbacks=None, arc_learning_rate=3.0E-4, unrolled=False): | |||
| self.model = model | |||
| self.loss = loss | |||
| self.metrics = metrics | |||
| self.optimizer = optimizer | |||
| self.epochs = epochs | |||
| self.device = device | |||
| self.batch_size = batch_size | |||
| self.train_loader = torch.utils.data.DataLoader( | |||
| datasets.MNIST(dataset_train, train=True, download=False, | |||
| transform=transforms.Compose([ | |||
| transforms.ToTensor(), | |||
| transforms.Normalize((0.1307,), (0.3081,)) | |||
| ])), | |||
| batch_size=batch_size, shuffle=True, **kwargs) | |||
| self.test_loader = torch.utils.data.DataLoader( | |||
| datasets.MNIST(dataset_valid, train=False, transform=transforms.Compose([ | |||
| transforms.ToTensor(), | |||
| transforms.Normalize((0.1307,), (0.3081,)) | |||
| ])), | |||
| batch_size=1000, shuffle=True, **kwargs) | |||
| self.search_space_path = search_space_path | |||
| self.selected_space_path =selected_space_path | |||
| self.trial_id = trial_id | |||
| self.num_epochs = 10  # number of architectures to sample, one per outer epoch | |||
| self.classicmutator = ClassicMutator(self.model, trial_id=self.trial_id, selected_path=self.selected_space_path, search_space_path=self.search_space_path) | |||
| self.result = {"accuracy": [],"cost_time": 0.} | |||
| def train_one_epoch(self, epoch): | |||
| # t1 = time() | |||
| # phase 1: randomly sample a new architecture for this outer epoch | |||
| self.classicmutator.trial_id = epoch | |||
| self.classicmutator._chosen_arch=self.classicmutator.random_generate_chosen() | |||
| #print('epoch:',epoch,'\n',self.classicmutator._chosen_arch) | |||
| # phase 2: child network step | |||
| for child_epoch in range(1, self.epochs + 1): | |||
| self.model.train() | |||
| for batch_idx, (data, target) in enumerate(self.train_loader): | |||
| data, target = data.to(self.device), target.to(self.device) | |||
| self.optimizer.zero_grad() | |||
| output = self.model(data) | |||
| loss = F.nll_loss(output, target) | |||
| loss.backward() | |||
| self.optimizer.step() | |||
| if batch_idx % args['log_interval'] == 0: | |||
| logger.info('Train Epoch: {} [{}/{} ({:.0f}%)]\tLoss: {:.6f}'.format( | |||
| child_epoch, batch_idx * len(data), len(self.train_loader.dataset), | |||
| 100. * batch_idx / len(self.train_loader), loss.item())) | |||
| test_acc = self.validate_one_epoch(epoch) | |||
| print({"type":"accuracy","result":{"sequence":child_epoch,"category":"epoch","value":test_acc}} ) | |||
| with open(args['result_path'], "a") as ss_file: | |||
| ss_file.write(json.dumps({"type":"accuracy","result":{"sequence":child_epoch,"category":"epoch","value":test_acc}} ) + '\n') | |||
| self.result['accuracy'].append(test_acc) | |||
| def validate_one_epoch(self, epoch): | |||
| self.model.eval() | |||
| test_loss = 0 | |||
| correct = 0 | |||
| with torch.no_grad(): | |||
| for data, target in self.test_loader: | |||
| data, target = data.to(self.device), target.to(self.device) | |||
| output = self.model(data) | |||
| # sum up batch loss | |||
| test_loss += F.nll_loss(output, target, reduction='sum').item() | |||
| # get the index of the max log-probability | |||
| pred = output.argmax(dim=1, keepdim=True) | |||
| correct += pred.eq(target.view_as(pred)).sum().item() | |||
| test_loss /= len(self.test_loader.dataset) | |||
| accuracy = 100. * correct / len(self.test_loader.dataset) | |||
| logger.info('\nTest set: Average loss: {:.4f}, Accuracy: {}/{} ({:.0f}%)\n'.format( | |||
| test_loss, correct, len(self.test_loader.dataset), accuracy)) | |||
| return accuracy | |||
| def train(self): | |||
| """ | |||
| Train the child network for ``self.num_epochs`` epochs, calling ``train_one_epoch`` once per epoch. | |||
| """ | |||
| for epoch in range(self.num_epochs): | |||
| # training | |||
| self.train_one_epoch(epoch) | |||
| def dump_global_result(args,global_result): | |||
| with open(args['result_path'], "w") as ss_file: | |||
| json.dump(global_result, ss_file, sort_keys=True, indent=2) | |||
| def get_params(): | |||
| # Training settings | |||
| parser = argparse.ArgumentParser(description='PyTorch MNIST Example') | |||
| parser.add_argument("--data_dir", type=str, | |||
| default='./data', help="data directory") | |||
| parser.add_argument("--model_selected_space_path", type=str, | |||
| default='./selected_space.json', help="selected_space_path") | |||
| parser.add_argument("--search_space_path", type=str, | |||
| default='./selected_space.json', help="search_space_path") | |||
| parser.add_argument("--result_path", type=str, | |||
| default='./model_result.json', help="result_path") | |||
| parser.add_argument('--batch_size', type=int, default=64, metavar='N', | |||
| help='input batch size for training (default: 64)') | |||
| parser.add_argument("--hidden_size", type=int, default=512, metavar='N', | |||
| help='hidden layer size (default: 512)') | |||
| parser.add_argument('--lr', type=float, default=0.01, metavar='LR', | |||
| help='learning rate (default: 0.01)') | |||
| parser.add_argument('--momentum', type=float, default=0.5, metavar='M', | |||
| help='SGD momentum (default: 0.5)') | |||
| parser.add_argument('--epochs', type=int, default=10, metavar='N', | |||
| help='number of epochs to train (default: 10)') | |||
| parser.add_argument('--seed', type=int, default=1, metavar='S', | |||
| help='random seed (default: 1)') | |||
| parser.add_argument('--no_cuda', action='store_true', default=False, | |||
| help='disables CUDA training') | |||
| parser.add_argument('--log_interval', type=int, default=1000, metavar='N', | |||
| help='how many batches to wait before logging training status') | |||
| parser.add_argument('--trial_id', type=int, default=0, metavar='N', | |||
| help='trial id, starting from 0') | |||
| args, _ = parser.parse_known_args() | |||
| return args | |||
| if __name__ == '__main__': | |||
| try: | |||
| start = time.time() | |||
| params = vars(get_params()) | |||
| args = params | |||
| use_cuda = not args['no_cuda'] and torch.cuda.is_available() | |||
| torch.manual_seed(args['seed']) | |||
| device = torch.device("cuda" if use_cuda else "cpu") | |||
| kwargs = {'num_workers': 1, 'pin_memory': True} if use_cuda else {} | |||
| data_dir = args['data_dir'] | |||
| hidden_size = args['hidden_size'] | |||
| model = Net(hidden_size=hidden_size).to(device) | |||
| optimizer = torch.optim.SGD(model.parameters(), lr=args['lr'], | |||
| momentum=args['momentum']) | |||
| mkdirs(args['search_space_path']) | |||
| mkdirs(args['model_selected_space_path']) | |||
| mkdirs(args['result_path']) | |||
| trainer = ClassicnasTrainer(model, | |||
| loss=None, | |||
| metrics=None, | |||
| optimizer=optimizer, | |||
| epochs=args['epochs'], | |||
| dataset_train=data_dir, | |||
| dataset_valid=data_dir, | |||
| search_space_path=args['search_space_path'], | |||
| selected_space_path=args['model_selected_space_path'], | |||
| trial_id=args['trial_id'], | |||
| batch_size=args['batch_size'], | |||
| log_frequency=args['log_interval'], | |||
| device=device, | |||
| unrolled=None, | |||
| callbacks=None) | |||
| with open(args['result_path'], "w") as ss_file: | |||
| ss_file.write('') | |||
| trainer.train_one_epoch(args['trial_id']) | |||
| #trainer.train() | |||
| global_result = trainer.result | |||
| #global_result['cost_time'] = str(time.time() - start) +'s' | |||
| #dump_global_result(params,global_result) | |||
| except Exception as exception: | |||
| logger.exception(exception) | |||
| raise | |||
| @@ -0,0 +1,70 @@ | |||
| # Cream of the Crop: Distilling Prioritized Paths For One-Shot Neural Architecture Search | |||
| ## 0x01 requirements | |||
| * Install the following requirements: | |||
| ``` | |||
| future | |||
| thop | |||
| timm<0.4 | |||
| yacs | |||
| ptflops==0.6.4 | |||
| #tensorboardx | |||
| #tensorboard | |||
| #opencv-python | |||
| #torch-scope | |||
| #git+https://github.com/sovrasov/flops-counter.pytorch.git | |||
| #git+https://github.com/Tramac/torchscope.git | |||
| ``` | |||
| * (required) Build and install apex to accelerate training | |||
| (see [yuque](https://www.yuque.com/kcgyxv/ukpea3/mxz5xy)); | |||
| it is slightly faster than PyTorch's DistributedDataParallel. | |||
| * Put the ImageNet data in `./data` using the following script: | |||
| ``` | |||
| cd TADL_DIR/pytorch/cream/ | |||
| ln -s /mnt/data . | |||
| ``` | |||
| ## 0x02 Quick Start | |||
| * Run the following script to search an architecture. | |||
| ``` | |||
| python trainer.py | |||
| ``` | |||
| * Selector (deprecated) | |||
| ``` | |||
| python selector.py | |||
| ``` | |||
| * Train searched architectures. | |||
| > Note: exponential moving average (model_ema) is not available yet. | |||
| ``` | |||
| python retrainer.py | |||
| ``` | |||
| <!-- | |||
| * Test trained models. | |||
| ``` | |||
| $ cp configs/test.yaml.example configs/test.yaml | |||
| $ python -m torch.distributed.launch --nproc_per_node=1 ./test.py --cfg ./configs/test.yaml | |||
| > 01/26 02:06:27 AM | [Model-14] Flops: 13.768M Params: 2.673M | |||
| > 01/26 02:06:30 AM | Training on Process 0 with 1 GPUs. | |||
| > 01/26 02:06:30 AM | Restoring model state from checkpoint... | |||
| > 01/26 02:06:30 AM | Loaded checkpoint './pretrained/14.pth.tar' (epoch 591) | |||
| > 01/26 02:06:30 AM | Loaded state_dict_ema | |||
| > 01/26 02:06:32 AM | Test_EMA: [ 0/390] Time: 1.573 (1.573) Loss: 0.9613 (0.9613) Prec@1: 82.8125 (82.8125) Prec@5: 91.4062 (91.4062) | |||
| > ... | |||
| > 01/26 02:07:50 AM | Test_EMA: [ 390/390] Time: 0.077 (0.203) Loss: 3.4356 (2.0912) Prec@1: 25.0000 (53.7640) Prec@5: 53.7500 (77.2840) | |||
| ``` | |||
| --> | |||
| @@ -0,0 +1,5 @@ | |||
| # Copyright (c) Microsoft Corporation. | |||
| # Licensed under the MIT license. | |||
| from .trainer import CreamSupernetTrainer | |||
| from .mutator import RandomMutator | |||
| @@ -0,0 +1,36 @@ | |||
| import torch | |||
| import torch.nn.functional as F | |||
| from pytorch.mutator import Mutator | |||
| from pytorch.mutables import LayerChoice, InputChoice | |||
| # TODO: This class is duplicate with SPOS. | |||
| class RandomMutator(Mutator): | |||
| """ | |||
| Random mutator that samples a random candidate in the search space each time ``reset()``. | |||
| It uses random function in PyTorch, so users can set seed in PyTorch to ensure deterministic behavior. | |||
| """ | |||
| def sample_search(self): | |||
| """ | |||
| Sample a random candidate. | |||
| """ | |||
| result = dict() | |||
| for mutable in self.mutables: | |||
| if isinstance(mutable, LayerChoice): | |||
| gen_index = torch.randint(high=len(mutable), size=(1, )) | |||
| result[mutable.key] = F.one_hot(gen_index, num_classes=len(mutable)).view(-1).bool() | |||
| elif isinstance(mutable, InputChoice): | |||
| if mutable.n_chosen is None: | |||
| result[mutable.key] = torch.randint(high=2, size=(mutable.n_candidates,)).view(-1).bool() | |||
| else: | |||
| perm = torch.randperm(mutable.n_candidates) | |||
| mask = [i in perm[:mutable.n_chosen] for i in range(mutable.n_candidates)] | |||
| result[mutable.key] = torch.tensor(mask, dtype=torch.bool) # pylint: disable=not-callable | |||
| return result | |||
| def sample_final(self): | |||
| """ | |||
| Same as :meth:`sample_search`. | |||
| """ | |||
| return self.sample_search() | |||
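| # Usage sketch (illustrative only): ``ToyNet`` is a hypothetical two-op search space used | |||
| # to show that every ``reset()`` samples a fresh candidate, and that seeding PyTorch makes | |||
| # the sampled masks reproducible. | |||
| if __name__ == "__main__": | |||
| import torch.nn as nn | |||
| class ToyNet(nn.Module): | |||
| def __init__(self): | |||
| super().__init__() | |||
| self.op = LayerChoice([nn.Conv2d(3, 8, 3, padding=1), nn.Conv2d(3, 8, 5, padding=2)]) | |||
| def forward(self, x): | |||
| return self.op(x) | |||
| torch.manual_seed(0)  # fixes the sampled architecture across runs | |||
| mutator = RandomMutator(ToyNet()) | |||
| mutator.reset()  # samples a candidate and caches it for subsequent forward passes | |||
| print(mutator._cache)  # maps each mutable key to a boolean mask over its candidates | |||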
| @@ -0,0 +1,437 @@ | |||
| # Copyright (c) Microsoft Corporation. | |||
| # Licensed under the MIT license. | |||
| import os | |||
| import json | |||
| import numpy as np | |||
| import torch | |||
| import logging | |||
| from copy import deepcopy | |||
| from pytorch.trainer import Trainer | |||
| from pytorch.utils import AverageMeterGroup | |||
| from .utils import accuracy, reduce_metrics | |||
| logger = logging.getLogger(__name__) | |||
| class CreamSupernetTrainer(Trainer): | |||
| """ | |||
| This trainer trains a supernet and output prioritized architectures that can be used for other tasks. | |||
| Parameters | |||
| ---------- | |||
| model : nn.Module | |||
| Model with mutables. | |||
| loss : callable | |||
| Called with logits and targets. Returns a loss tensor. | |||
| val_loss : callable | |||
| Called with logits and targets for validation only. Returns a loss tensor. | |||
| optimizer : Optimizer | |||
| Optimizer that optimizes the model. | |||
| num_epochs : int | |||
| Number of epochs of training. | |||
| train_loader : iterable | |||
| Data loader for training. Raises ``StopIteration`` when one epoch is exhausted. | |||
| valid_loader : iterable | |||
| Data loader for validation. Raises ``StopIteration`` when one epoch is exhausted. | |||
| mutator : Mutator | |||
| A mutator object that has been initialized with the model. | |||
| batch_size : int | |||
| Batch size. | |||
| log_frequency : int | |||
| Number of mini-batches to log metrics. | |||
| meta_sta_epoch : int | |||
| Epoch from which the meta matching network starts being used to pick the teacher architecture. | |||
| update_iter : int | |||
| Interval (in steps) at which the meta matching network is updated. | |||
| slices : int | |||
| Size of the data slice used when training the meta matching network. | |||
| pool_size : int | |||
| Size of the prioritized board. | |||
| pick_method : str | |||
| Strategy for picking the teacher network, either ``'top1'`` or ``'meta'``. | |||
| choice_num : int | |||
| Number of candidate operations per layer in the supernet. | |||
| sta_num : tuple of int | |||
| Number of layers in each stage of the supernet (the supernet has 5 stages). | |||
| acc_gap : int | |||
| Accuracy margin over the worst board entry above which an architecture is admitted regardless of its FLOPs. | |||
| flops_dict : dict | |||
| FLOPs of each candidate operation in every layer of the supernet. | |||
| flops_fixed : int | |||
| FLOPs of the fixed part of the supernet. | |||
| local_rank : int | |||
| Rank of the current process. | |||
| callbacks : list of Callback | |||
| Callbacks to plug into the trainer. See Callbacks. | |||
| """ | |||
| def __init__(self, selected_space, model, loss, val_loss, | |||
| optimizer, num_epochs, train_loader, valid_loader, | |||
| mutator=None, batch_size=64, log_frequency=None, | |||
| meta_sta_epoch=20, update_iter=200, slices=2, | |||
| pool_size=10, pick_method='meta', choice_num=6, | |||
| sta_num=(4, 4, 4, 4, 4), acc_gap=5, | |||
| flops_dict=None, flops_fixed=0, local_rank=0, callbacks=None, result_path=None): | |||
| assert torch.cuda.is_available() | |||
| super(CreamSupernetTrainer, self).__init__(model, mutator, loss, None, | |||
| optimizer, num_epochs, None, None, | |||
| batch_size, None, None, log_frequency, callbacks) | |||
| self.selected_space = selected_space | |||
| self.model = model | |||
| self.loss = loss | |||
| self.val_loss = val_loss | |||
| self.train_loader = train_loader | |||
| self.valid_loader = valid_loader | |||
| self.log_frequency = log_frequency | |||
| self.batch_size = batch_size | |||
| self.optimizer = optimizer | |||
| self.num_epochs = num_epochs | |||
| self.meta_sta_epoch = meta_sta_epoch | |||
| self.update_iter = update_iter | |||
| self.slices = slices | |||
| self.pick_method = pick_method | |||
| self.pool_size = pool_size | |||
| self.local_rank = local_rank | |||
| self.choice_num = choice_num | |||
| self.sta_num = sta_num | |||
| self.acc_gap = acc_gap | |||
| self.flops_dict = flops_dict | |||
| self.flops_fixed = flops_fixed | |||
| self.current_student_arch = None | |||
| self.current_teacher_arch = None | |||
| self.main_proc = (local_rank == 0) | |||
| self.current_epoch = 0 | |||
| self.prioritized_board = [] | |||
| self.result_path = result_path | |||
| # size of prioritized board | |||
| def _board_size(self): | |||
| return len(self.prioritized_board) | |||
| # select teacher architecture according to the logit difference | |||
| def _select_teacher(self): | |||
| self._replace_mutator_cand(self.current_student_arch) | |||
| if self.pick_method == 'top1': | |||
| meta_value, teacher_cand = 0.5, sorted( | |||
| self.prioritized_board, reverse=True)[0][3] | |||
| elif self.pick_method == 'meta': | |||
| meta_value, cand_idx, teacher_cand = -1000000000, -1, None | |||
| for now_idx, item in enumerate(self.prioritized_board): | |||
| inputx = item[4] | |||
| output = torch.nn.functional.softmax(self.model(inputx), dim=1) | |||
| weight = self.model.forward_meta(output - item[5]) | |||
| if weight > meta_value: | |||
| meta_value = weight | |||
| cand_idx = now_idx | |||
| teacher_cand = self.prioritized_board[cand_idx][3] | |||
| assert teacher_cand is not None | |||
| meta_value = torch.sigmoid(-weight) | |||
| else: | |||
| raise ValueError('Method Not supported') | |||
| return meta_value, teacher_cand | |||
| # check whether to update prioritized board | |||
| def _isUpdateBoard(self, prec1, flops): | |||
| if self.current_epoch <= self.meta_sta_epoch: | |||
| return False | |||
| if len(self.prioritized_board) < self.pool_size: | |||
| return True | |||
| if prec1 > self.prioritized_board[-1][1] + self.acc_gap: | |||
| return True | |||
| if prec1 > self.prioritized_board[-1][1] and flops < self.prioritized_board[-1][2]: | |||
| return True | |||
| return False | |||
| # update prioritized board | |||
| def _update_prioritized_board(self, inputs, teacher_output, outputs, prec1, flops): | |||
| if self._isUpdateBoard(prec1, flops): | |||
| val_prec1 = prec1 | |||
| training_data = deepcopy(inputs[:self.slices].detach()) | |||
| if len(self.prioritized_board) == 0: | |||
| features = deepcopy(outputs[:self.slices].detach()) | |||
| else: | |||
| features = deepcopy(teacher_output[:self.slices].detach()) | |||
| self.prioritized_board.append( | |||
| (val_prec1, | |||
| prec1, | |||
| flops, | |||
| self.current_student_arch, | |||
| training_data, | |||
| torch.nn.functional.softmax( | |||
| features, | |||
| dim=1))) | |||
| self.prioritized_board = sorted( | |||
| self.prioritized_board, reverse=True) | |||
| if len(self.prioritized_board) > self.pool_size: | |||
| self.prioritized_board = sorted( | |||
| self.prioritized_board, reverse=True) | |||
| del self.prioritized_board[-1] | |||
| # only update student network weights | |||
| def _update_student_weights_only(self, grad_1): | |||
| for weight, grad_item in zip( | |||
| self.model.module.rand_parameters(self.current_student_arch), grad_1): | |||
| weight.grad = grad_item | |||
| torch.nn.utils.clip_grad_norm_( | |||
| self.model.module.rand_parameters(self.current_student_arch), 1) | |||
| self.optimizer.step() | |||
| for weight, grad_item in zip( | |||
| self.model.module.rand_parameters(self.current_student_arch), grad_1): | |||
| del weight.grad | |||
| # only update meta networks weights | |||
| def _update_meta_weights_only(self, teacher_cand, grad_teacher): | |||
| for weight, grad_item in zip(self.model.module.rand_parameters( | |||
| teacher_cand, self.pick_method == 'meta'), grad_teacher): | |||
| weight.grad = grad_item | |||
| # clip gradients | |||
| torch.nn.utils.clip_grad_norm_( | |||
| self.model.module.rand_parameters( | |||
| self.current_student_arch, self.pick_method == 'meta'), 1) | |||
| self.optimizer.step() | |||
| for weight, grad_item in zip(self.model.module.rand_parameters( | |||
| teacher_cand, self.pick_method == 'meta'), grad_teacher): | |||
| del weight.grad | |||
| # simulate sgd updating | |||
| def _simulate_sgd_update(self, w, g, optimizer): | |||
| return g * optimizer.param_groups[-1]['lr'] + w | |||
| # split training images into several slices | |||
| def _get_minibatch_input(self, input): | |||
| slice = self.slices | |||
| x = deepcopy(input[:slice].clone().detach()) | |||
| return x | |||
| # calculate 1st gradient of student architectures | |||
| def _calculate_1st_gradient(self, kd_loss): | |||
| self.optimizer.zero_grad() | |||
| grad = torch.autograd.grad( | |||
| kd_loss, | |||
| self.model.module.rand_parameters(self.current_student_arch), | |||
| create_graph=True) | |||
| return grad | |||
| # calculate 2nd gradient of meta networks | |||
| def _calculate_2nd_gradient(self, validation_loss, teacher_cand, students_weight): | |||
| self.optimizer.zero_grad() | |||
| grad_student_val = torch.autograd.grad( | |||
| validation_loss, | |||
| self.model.module.rand_parameters(self.current_student_arch), | |||
| retain_graph=True) | |||
| grad_teacher = torch.autograd.grad( | |||
| students_weight[0], | |||
| self.model.module.rand_parameters( | |||
| teacher_cand, | |||
| self.pick_method == 'meta'), | |||
| grad_outputs=grad_student_val) | |||
| return grad_teacher | |||
| # forward training data | |||
| def _forward_training(self, x, meta_value): | |||
| self._replace_mutator_cand(self.current_student_arch) | |||
| output = self.model(x) | |||
| with torch.no_grad(): | |||
| self._replace_mutator_cand(self.current_teacher_arch) | |||
| teacher_output = self.model(x) | |||
| soft_label = torch.nn.functional.softmax(teacher_output, dim=1) | |||
| kd_loss = meta_value * \ | |||
| self._cross_entropy_loss_with_soft_target(output, soft_label) | |||
| return kd_loss | |||
| # calculate soft target loss | |||
| def _cross_entropy_loss_with_soft_target(self, pred, soft_target): | |||
| logsoftmax = torch.nn.LogSoftmax(dim=1) | |||
| return torch.mean(torch.sum(- soft_target * logsoftmax(pred), 1)) | |||
| # forward validation data | |||
| def _forward_validation(self, input, target): | |||
| slice = self.slices | |||
| x = input[slice:slice * 2].clone() | |||
| self._replace_mutator_cand(self.current_student_arch) | |||
| output_2 = self.model(x) | |||
| validation_loss = self.loss(output_2, target[slice:slice * 2]) | |||
| return validation_loss | |||
| def _isUpdateMeta(self, batch_idx): | |||
| isUpdate = True | |||
| isUpdate &= (self.current_epoch > self.meta_sta_epoch) | |||
| isUpdate &= (batch_idx > 0) | |||
| isUpdate &= (batch_idx % self.update_iter == 0) | |||
| isUpdate &= (self._board_size() > 0) | |||
| return isUpdate | |||
| def _replace_mutator_cand(self, cand): | |||
| self.mutator._cache = cand | |||
| # update meta matching networks | |||
| def _run_update(self, input, target, batch_idx): | |||
| if self._isUpdateMeta(batch_idx): | |||
| x = self._get_minibatch_input(input) | |||
| meta_value, teacher_cand = self._select_teacher() | |||
| kd_loss = self._forward_training(x, meta_value) | |||
| # calculate 1st gradient | |||
| grad_1st = self._calculate_1st_gradient(kd_loss) | |||
| # simulate updated student weights | |||
| students_weight = [ | |||
| self._simulate_sgd_update( | |||
| p, grad_item, self.optimizer) for p, grad_item in zip( | |||
| self.model.module.rand_parameters(self.current_student_arch), grad_1st)] | |||
| # update student weights | |||
| self._update_student_weights_only(grad_1st) | |||
| validation_loss = self._forward_validation(input, target) | |||
| # calculate 2nd gradient | |||
| grad_teacher = self._calculate_2nd_gradient(validation_loss, | |||
| teacher_cand, | |||
| students_weight) | |||
| # update meta matching networks | |||
| self._update_meta_weights_only(teacher_cand, grad_teacher) | |||
| # delete internal variants | |||
| del grad_teacher, grad_1st, x, validation_loss, kd_loss, students_weight | |||
| def _get_cand_flops(self, cand): | |||
| flops = 0 | |||
| for block_id, block in enumerate(cand): | |||
| if block == 'LayerChoice1' or block == 'LayerChoice23': | |||
| continue | |||
| for idx, choice in enumerate(cand[block]): | |||
| flops += self.flops_dict[block_id][idx] * (1 if choice else 0) | |||
| return flops + self.flops_fixed | |||
| def train_one_epoch(self, epoch): | |||
| self.current_epoch = epoch | |||
| meters = AverageMeterGroup() | |||
| self.steps_per_epoch = len(self.train_loader) | |||
| for step, (input_data, target) in enumerate(self.train_loader): | |||
| self.mutator.reset() | |||
| self.current_student_arch = self.mutator._cache | |||
| input_data, target = input_data.cuda(), target.cuda() | |||
| # calculate flops of current architecture | |||
| cand_flops = self._get_cand_flops(self.mutator._cache) | |||
| # update meta matching network | |||
| self._run_update(input_data, target, step) | |||
| if self._board_size() > 0: | |||
| # select teacher architecture | |||
| meta_value, teacher_cand = self._select_teacher() | |||
| self.current_teacher_arch = teacher_cand | |||
| # forward supernet | |||
| if self._board_size() == 0 or epoch <= self.meta_sta_epoch: | |||
| self._replace_mutator_cand(self.current_student_arch) | |||
| output = self.model(input_data) | |||
| loss = self.loss(output, target) | |||
| kd_loss, teacher_output, teacher_cand = None, None, None | |||
| else: | |||
| self._replace_mutator_cand(self.current_student_arch) | |||
| output = self.model(input_data) | |||
| gt_loss = self.loss(output, target) | |||
| with torch.no_grad(): | |||
| self._replace_mutator_cand(self.current_teacher_arch) | |||
| teacher_output = self.model(input_data).detach() | |||
| soft_label = torch.nn.functional.softmax(teacher_output, dim=1) | |||
| kd_loss = self._cross_entropy_loss_with_soft_target(output, soft_label) | |||
| loss = (meta_value * kd_loss + (2 - meta_value) * gt_loss) / 2 | |||
| # update network | |||
| self.optimizer.zero_grad() | |||
| loss.backward() | |||
| self.optimizer.step() | |||
| # update metrics | |||
| prec1, prec5 = accuracy(output, target, topk=(1, 5)) | |||
| metrics = {"prec1": prec1, "prec5": prec5, "loss": loss} | |||
| metrics = reduce_metrics(metrics) | |||
| meters.update(metrics) | |||
| # update prioritized board | |||
| self._update_prioritized_board(input_data, | |||
| teacher_output, | |||
| output, | |||
| metrics['prec1'], | |||
| cand_flops) | |||
| if self.main_proc and (step + 1 == self.steps_per_epoch or ( | |||
| self.log_frequency is not None and step % self.log_frequency == 0)): | |||
| logger.info("Epoch [%d/%d] Step [%d/%d] %s", epoch + 1, self.num_epochs, | |||
| step + 1, len(self.train_loader), meters) | |||
| arch_list = [] | |||
| # if self.main_proc and self.num_epochs == epoch + 1: | |||
| for idx, i in enumerate(self.prioritized_board): | |||
| # logger.info("prioritized_board: No.%s %s", idx, i[:4]) | |||
| if idx == 0: | |||
| for arch in i[3].values(): | |||
| # convert the boolean mask of each mutable into the indices of the chosen ops | |||
| chosen = np.where(arch.numpy())[0].tolist() | |||
| arch_list.append(chosen) | |||
| if len(arch_list) > 0: | |||
| with open(self.selected_space, "w") as f: | |||
| print("dump selected space.") | |||
| json.dump({'selected_space': arch_list}, f) | |||
| def validate_one_epoch(self, epoch): | |||
| self.model.eval() | |||
| meters = AverageMeterGroup() | |||
| with torch.no_grad(): | |||
| for step, (x, y) in enumerate(self.valid_loader): | |||
| self.mutator.reset() | |||
| logits = self.model(x) | |||
| loss = self.val_loss(logits, y) | |||
| prec1, prec5 = accuracy(logits, y, topk=(1, 5)) | |||
| metrics = {"prec1": prec1, "prec5": prec5, "loss": loss} | |||
| metrics = reduce_metrics(metrics) | |||
| meters.update(metrics) | |||
| if self.log_frequency is not None and step % self.log_frequency == 0: | |||
| logger.info("Epoch [%s/%s] Validation Step [%s/%s] %s", epoch + 1, | |||
| self.num_epochs, step + 1, len(self.valid_loader), meters) | |||
| # print({'type': 'Accuracy', 'result': {'sequence': epoch, 'category': 'epoch', | |||
| # 'value': metrics["prec1"]}}) | |||
| if self.result_path is not None: | |||
| with open(self.result_path, "a") as ss_file: | |||
| ss_file.write(json.dumps( | |||
| {'type': 'Accuracy', | |||
| 'result': {'sequence': epoch, | |||
| 'category': 'epoch', | |||
| 'value': metrics["prec1"]}}) + '\n') | |||
| @@ -0,0 +1,39 @@ | |||
| # Copyright (c) Microsoft Corporation. | |||
| # Licensed under the MIT license. | |||
| import os | |||
| import torch | |||
| import torch.distributed as dist | |||
| def accuracy(output, target, topk=(1,)): | |||
| """ Computes the precision@k for the specified values of k """ | |||
| maxk = max(topk) | |||
| batch_size = target.size(0) | |||
| _, pred = output.topk(maxk, 1, True, True) | |||
| pred = pred.t() | |||
| # one-hot case | |||
| if target.ndimension() > 1: | |||
| target = target.max(1)[1] | |||
| correct = pred.eq(target.reshape(1, -1).expand_as(pred)) | |||
| res = [] | |||
| for k in topk: | |||
| correct_k = correct[:k].reshape(-1).float().sum(0) | |||
| res.append(correct_k.mul_(1.0 / batch_size)) | |||
| return res | |||
| def reduce_metrics(metrics): | |||
| return {k: reduce_tensor(v).item() for k, v in metrics.items()} | |||
| def reduce_tensor(tensor): | |||
| rt = torch.sum(tensor) | |||
| # rt = tensor.clone() | |||
| # dist.all_reduce(rt, op=dist.ReduceOp.SUM) | |||
| # rt /= float(os.environ["WORLD_SIZE"]) | |||
| return rt | |||
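| # Minimal self-check (illustrative): top-1/top-5 precision on a random batch. With | |||
| # ``mul_(1.0 / batch_size)`` above, the returned values are fractions in [0, 1]. | |||
| if __name__ == "__main__": | |||
| logits = torch.randn(8, 10) | |||
| labels = torch.randint(0, 10, (8,)) | |||
| top1, top5 = accuracy(logits, labels, topk=(1, 5)) | |||
| print(reduce_metrics({"prec1": top1, "prec5": top5})) | |||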
| @@ -0,0 +1,123 @@ | |||
| # Copyright (c) Microsoft Corporation. | |||
| # Licensed under the MIT License. | |||
| # Written by Hao Du and Houwen Peng | |||
| # email: haodu8-c@my.cityu.edu.hk and houwen.peng@microsoft.com | |||
| from __future__ import absolute_import | |||
| from __future__ import division | |||
| from __future__ import print_function | |||
| from __future__ import unicode_literals | |||
| from yacs.config import CfgNode as CN | |||
| DEFAULT_CROP_PCT = 0.875 | |||
| IMAGENET_DEFAULT_MEAN = (0.485, 0.456, 0.406) | |||
| IMAGENET_DEFAULT_STD = (0.229, 0.224, 0.225) | |||
| __C = CN() | |||
| cfg = __C | |||
| __C.AUTO_RESUME = True | |||
| __C.DATA_DIR = './data/imagenet' | |||
| __C.MODEL = 'cream' | |||
| __C.RESUME_PATH = './experiments/ckps/resume.pth.tar' | |||
| __C.SAVE_PATH = './experiments/ckps/' | |||
| __C.SEED = 42 | |||
| __C.LOG_INTERVAL = 50 | |||
| __C.RECOVERY_INTERVAL = 0 | |||
| __C.WORKERS = 4 | |||
| __C.NUM_GPU = 1 | |||
| __C.SAVE_IMAGES = False | |||
| __C.AMP = False | |||
| __C.ACC_GAP = 5 | |||
| __C.OUTPUT = 'output/path/' | |||
| __C.EVAL_METRICS = 'prec1' | |||
| __C.TTA = 0 # Test or inference time augmentation | |||
| __C.LOCAL_RANK = 0 | |||
| __C.VERBOSE = False | |||
| # dataset configs | |||
| __C.DATASET = CN() | |||
| __C.DATASET.NUM_CLASSES = 1000 | |||
| __C.DATASET.IMAGE_SIZE = 224 # image patch size | |||
| __C.DATASET.INTERPOLATION = 'bilinear' # Image resize interpolation type | |||
| __C.DATASET.BATCH_SIZE = 32 # batch size | |||
| __C.DATASET.NO_PREFECHTER = False | |||
| __C.DATASET.PIN_MEM = True | |||
| __C.DATASET.VAL_BATCH_MUL = 4 | |||
| # model configs | |||
| __C.NET = CN() | |||
| __C.NET.SELECTION = 14 | |||
| __C.NET.GP = 'avg' # type of global pool ["avg", "max", "avgmax", "avgmaxc"] | |||
| __C.NET.DROPOUT_RATE = 0.0 # dropout rate | |||
| __C.NET.INPUT_ARCH = [[0], [3], [3, 3], [3, 1, 3], [3, 3, 3, 3], [3, 3, 3], [0]] | |||
| # model ema parameters | |||
| __C.NET.EMA = CN() | |||
| __C.NET.EMA.USE = True | |||
| __C.NET.EMA.FORCE_CPU = False # force model ema to be tracked on CPU | |||
| __C.NET.EMA.DECAY = 0.9998 | |||
| # optimizer configs | |||
| __C.OPT = 'sgd' | |||
| __C.OPT_EPS = 1e-2 | |||
| __C.MOMENTUM = 0.9 | |||
| __C.WEIGHT_DECAY = 1e-4 | |||
| __C.OPTIMIZER = CN() | |||
| __C.OPTIMIZER.NAME = 'sgd' | |||
| __C.OPTIMIZER.MOMENTUM = 0.9 | |||
| __C.OPTIMIZER.WEIGHT_DECAY = 1e-3 | |||
| # scheduler configs | |||
| __C.SCHED = 'sgd' | |||
| __C.LR_NOISE = None | |||
| __C.LR_NOISE_PCT = 0.67 | |||
| __C.LR_NOISE_STD = 1.0 | |||
| __C.WARMUP_LR = 1e-4 | |||
| __C.MIN_LR = 1e-5 | |||
| __C.EPOCHS = 200 | |||
| __C.START_EPOCH = None | |||
| __C.DECAY_EPOCHS = 30.0 | |||
| __C.WARMUP_EPOCHS = 3 | |||
| __C.COOLDOWN_EPOCHS = 10 | |||
| __C.PATIENCE_EPOCHS = 10 | |||
| __C.DECAY_RATE = 0.1 | |||
| __C.LR = 1e-2 | |||
| __C.META_LR = 1e-4 | |||
| # data augmentation parameters | |||
| __C.AUGMENTATION = CN() | |||
| __C.AUGMENTATION.AA = 'rand-m9-mstd0.5' | |||
| __C.AUGMENTATION.COLOR_JITTER = 0.4 | |||
| __C.AUGMENTATION.RE_PROB = 0.2 # random erase prob | |||
| __C.AUGMENTATION.RE_MODE = 'pixel' # random erase mode | |||
| __C.AUGMENTATION.MIXUP = 0.0 # mixup alpha | |||
| __C.AUGMENTATION.MIXUP_OFF_EPOCH = 0 # turn off mixup after this epoch | |||
| __C.AUGMENTATION.SMOOTHING = 0.1 # label smoothing parameters | |||
| # batch norm parameters (only works with gen_efficientnet based models | |||
| # currently) | |||
| __C.BATCHNORM = CN() | |||
| __C.BATCHNORM.SYNC_BN = False | |||
| __C.BATCHNORM.BN_TF = False | |||
| __C.BATCHNORM.BN_MOMENTUM = 0.1 # batchnorm momentum override | |||
| __C.BATCHNORM.BN_EPS = 1e-5 # batchnorm eps override | |||
| # supernet training hyperparameters | |||
| __C.SUPERNET = CN() | |||
| __C.SUPERNET.UPDATE_ITER = 1300 | |||
| __C.SUPERNET.SLICE = 4 | |||
| __C.SUPERNET.POOL_SIZE = 10 | |||
| __C.SUPERNET.RESUNIT = False | |||
| __C.SUPERNET.DIL_CONV = False | |||
| __C.SUPERNET.UPDATE_2ND = True | |||
| __C.SUPERNET.FLOPS_MAXIMUM = 600 | |||
| __C.SUPERNET.FLOPS_MINIMUM = 0 | |||
| __C.SUPERNET.PICK_METHOD = 'meta' # pick teacher method | |||
| __C.SUPERNET.META_STA_EPOCH = 20 # start using meta picking method | |||
| __C.SUPERNET.HOW_TO_PROB = 'pre_prob' # sample method | |||
| __C.SUPERNET.PRE_PROB = (0.05, 0.2, 0.05, 0.5, 0.05, | |||
| 0.15) # sample prob in 'pre_prob' | |||
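| # Override sketch (illustrative): any key defined above can be overridden from a | |||
| # key/value list, or from a YAML file via ``cfg.merge_from_file(path)``; the values | |||
| # here are arbitrary examples, not recommended settings. | |||
| if __name__ == "__main__": | |||
| cfg.merge_from_list(['LR', 0.05, 'DATASET.BATCH_SIZE', 64]) | |||
| cfg.freeze()  # make the config immutable before training starts | |||
| print(cfg.LR, cfg.DATASET.BATCH_SIZE) | |||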
| @@ -0,0 +1,129 @@ | |||
| import os | |||
| import time | |||
| import timm | |||
| import torch | |||
| import torchvision | |||
| from collections import OrderedDict | |||
| from ..utils.util import AverageMeter, accuracy, reduce_tensor | |||
| def train_epoch( | |||
| epoch, model, loader, optimizer, loss_fn, args, | |||
| lr_scheduler=None, saver=None, output_dir='', use_amp=False, | |||
| model_ema=None, logger=None, writer=None, local_rank=0): | |||
| batch_time_m = AverageMeter() | |||
| data_time_m = AverageMeter() | |||
| losses_m = AverageMeter() | |||
| prec1_m = AverageMeter() | |||
| prec5_m = AverageMeter() | |||
| model.train() | |||
| end = time.time() | |||
| last_idx = len(loader) - 1 | |||
| num_updates = epoch * len(loader) | |||
| optimizer.zero_grad() | |||
| for batch_idx, (input, target) in enumerate(loader): | |||
| last_batch = batch_idx == last_idx | |||
| data_time_m.update(time.time() - end) | |||
| input = input.cuda() | |||
| target = target.cuda() | |||
| output = model(input) | |||
| loss = loss_fn(output, target) | |||
| prec1, prec5 = accuracy(output, target, topk=(1, 5)) | |||
| if args.num_gpu > 1: | |||
| reduced_loss = reduce_tensor(loss.data, args.num_gpu) | |||
| prec1 = reduce_tensor(prec1, args.num_gpu) | |||
| prec5 = reduce_tensor(prec5, args.num_gpu) | |||
| else: | |||
| reduced_loss = loss.data | |||
| optimizer.zero_grad() | |||
| loss.backward() | |||
| optimizer.step() | |||
| torch.cuda.synchronize() | |||
| losses_m.update(reduced_loss.item(), input.size(0)) | |||
| prec1_m.update(prec1.item(), output.size(0)) | |||
| prec5_m.update(prec5.item(), output.size(0)) | |||
| if model_ema is not None: | |||
| model_ema.update(model) | |||
| num_updates += 1 | |||
| batch_time_m.update(time.time() - end) | |||
| if last_batch or batch_idx % args.log_interval == 0: | |||
| lrl = [param_group['lr'] for param_group in optimizer.param_groups] | |||
| lr = sum(lrl) / len(lrl) | |||
| if local_rank == 0: | |||
| logger.info( | |||
| 'Train: {} [{:>4d}/{}] ' | |||
| 'Loss: {loss.val:>9.6f} ({loss.avg:>6.4f}) ' | |||
| 'Prec@1: {top1.val:>7.4f} ({top1.avg:>7.4f}) ' | |||
| 'Prec@5: {top5.val:>7.4f} ({top5.avg:>7.4f}) ' | |||
| 'Time: {batch_time.val:.3f}s, {rate:>7.2f}/s ' | |||
| '({batch_time.avg:.3f}s, {rate_avg:>7.2f}/s) ' | |||
| 'LR: {lr:.3e} ' | |||
| 'Data: {data_time.val:.3f} ({data_time.avg:.3f})'.format( | |||
| epoch, | |||
| batch_idx, | |||
| len(loader), | |||
| loss=losses_m, | |||
| top1=prec1_m, | |||
| top5=prec5_m, | |||
| batch_time=batch_time_m, | |||
| rate=input.size(0) * args.num_gpu / batch_time_m.val, | |||
| rate_avg=input.size(0) * args.num_gpu / batch_time_m.avg, | |||
| lr=lr, | |||
| data_time=data_time_m)) | |||
| # writer.add_scalar( | |||
| # 'Loss/train', prec1_m.avg, epoch * len(loader) + batch_idx) | |||
| # writer.add_scalar( | |||
| # 'Accuracy/train', prec1_m.avg, epoch * len(loader) + batch_idx) | |||
| # writer.add_scalar( | |||
| # 'Learning_Rate', | |||
| # optimizer.param_groups[0]['lr'], epoch * len(loader) + batch_idx) | |||
| if args.save_images and output_dir: | |||
| torchvision.utils.save_image( | |||
| input, os.path.join(output_dir, 'train-batch-%d.jpg' % batch_idx), | |||
| padding=0, normalize=True) | |||
| if saver is not None and args.recovery_interval and ( | |||
| last_batch or (batch_idx + 1) % args.recovery_interval == 0): | |||
| # crude version gate on the minor version digit: the saver API changed around timm 0.3.x | |||
| if int(timm.__version__[2]) >= 3: | |||
| saver.save_recovery( | |||
| epoch, | |||
| batch_idx=batch_idx) | |||
| else: | |||
| saver.save_recovery( | |||
| model, | |||
| optimizer, | |||
| args, | |||
| epoch, | |||
| model_ema=model_ema, | |||
| use_amp=use_amp, | |||
| batch_idx=batch_idx) | |||
| if lr_scheduler is not None: | |||
| lr_scheduler.step_update( | |||
| num_updates=num_updates, | |||
| metric=losses_m.avg) | |||
| end = time.time() | |||
| # end for | |||
| if hasattr(optimizer, 'sync_lookahead'): | |||
| optimizer.sync_lookahead() | |||
| return OrderedDict([('loss', losses_m.avg)]) | |||
| @@ -0,0 +1,100 @@ | |||
| # Copyright (c) Microsoft Corporation. | |||
| # Licensed under the MIT License. | |||
| # Written by Hao Du and Houwen Peng | |||
| # email: haodu8-c@my.cityu.edu.hk and houwen.peng@microsoft.com | |||
| import time | |||
| import torch | |||
| import json | |||
| from collections import OrderedDict | |||
| from ..utils.util import AverageMeter, accuracy, reduce_tensor | |||
| def validate(epoch, model, loader, loss_fn, args, log_suffix='', | |||
| logger=None, writer=None, local_rank=0,result_path=None): | |||
| batch_time_m = AverageMeter() | |||
| losses_m = AverageMeter() | |||
| prec1_m = AverageMeter() | |||
| prec5_m = AverageMeter() | |||
| model.eval() | |||
| end = time.time() | |||
| last_idx = len(loader) - 1 | |||
| with torch.no_grad(): | |||
| for batch_idx, (input, target) in enumerate(loader): | |||
| last_batch = batch_idx == last_idx | |||
| output = model(input) | |||
| if isinstance(output, (tuple, list)): | |||
| output = output[0] | |||
| # augmentation reduction | |||
| reduce_factor = args.tta | |||
| if reduce_factor > 1: | |||
| output = output.unfold( | |||
| 0, | |||
| reduce_factor, | |||
| reduce_factor).mean( | |||
| dim=2) | |||
| target = target[0:target.size(0):reduce_factor] | |||
| loss = loss_fn(output, target) | |||
| prec1, prec5 = accuracy(output, target, topk=(1, 5)) | |||
| if args.num_gpu > 1: | |||
| reduced_loss = reduce_tensor(loss.data, args.num_gpu) | |||
| prec1 = reduce_tensor(prec1, args.num_gpu) | |||
| prec5 = reduce_tensor(prec5, args.num_gpu) | |||
| else: | |||
| reduced_loss = loss.data | |||
| torch.cuda.synchronize() | |||
| losses_m.update(reduced_loss.item(), input.size(0)) | |||
| prec1_m.update(prec1.item(), output.size(0)) | |||
| prec5_m.update(prec5.item(), output.size(0)) | |||
| batch_time_m.update(time.time() - end) | |||
| end = time.time() | |||
| if local_rank == 0 and (last_batch or batch_idx % args.log_interval == 0): | |||
| log_name = 'Test' + log_suffix | |||
| logger.info( | |||
| '{0}: [{1:>4d}/{2}] ' | |||
| 'Time: {batch_time.val:.3f} ({batch_time.avg:.3f}) ' | |||
| 'Loss: {loss.val:>7.4f} ({loss.avg:>6.4f}) ' | |||
| 'Prec@1: {top1.val:>7.4f} ({top1.avg:>7.4f}) ' | |||
| 'Prec@5: {top5.val:>7.4f} ({top5.avg:>7.4f})'.format( | |||
| log_name, batch_idx, last_idx, | |||
| batch_time=batch_time_m, loss=losses_m, | |||
| top1=prec1_m, top5=prec5_m)) | |||
| # print({'type': 'Accuracy', 'result': {'sequence': epoch, 'category': 'epoch', 'value': prec1_m.val}}) | |||
| if result_path is not None: | |||
| with open(result_path, "a") as ss_file: | |||
| ss_file.write(json.dumps( | |||
| {'type': 'Accuracy', | |||
| 'result': {'sequence': epoch, | |||
| 'category': 'epoch', | |||
| 'value': prec1_m.val}}) + '\n') | |||
| # writer.add_scalar( | |||
| # 'Loss' + log_suffix + '/vaild', | |||
| # prec1_m.avg, | |||
| # epoch * len(loader) + batch_idx) | |||
| # writer.add_scalar( | |||
| # 'Accuracy' + | |||
| # log_suffix + | |||
| # '/vaild', | |||
| # prec1_m.avg, | |||
| # epoch * | |||
| # len(loader) + | |||
| # batch_idx) | |||
| metrics = OrderedDict( | |||
| [('loss', losses_m.avg), ('prec1', prec1_m.avg), ('prec5', prec5_m.avg)]) | |||
| return metrics | |||
| @@ -0,0 +1,2 @@ | |||
| from .residual_block import get_Bottleneck, get_BasicBlock | |||
| from .inverted_residual_block import InvertedResidual | |||
| @@ -0,0 +1,113 @@ | |||
| # This file is downloaded from | |||
| # https://github.com/rwightman/pytorch-image-models | |||
| import torch.nn as nn | |||
| from timm.models.layers import create_conv2d | |||
| from timm.models.efficientnet_blocks import make_divisible, resolve_se_args, \ | |||
| SqueezeExcite, drop_path | |||
| class InvertedResidual(nn.Module): | |||
| """ Inverted residual block w/ optional SE and CondConv routing""" | |||
| def __init__( | |||
| self, | |||
| in_chs, | |||
| out_chs, | |||
| dw_kernel_size=3, | |||
| stride=1, | |||
| dilation=1, | |||
| pad_type='', | |||
| act_layer=nn.ReLU, | |||
| noskip=False, | |||
| exp_ratio=1.0, | |||
| exp_kernel_size=1, | |||
| pw_kernel_size=1, | |||
| se_ratio=0., | |||
| se_kwargs=None, | |||
| norm_layer=nn.BatchNorm2d, | |||
| norm_kwargs=None, | |||
| conv_kwargs=None, | |||
| drop_path_rate=0.): | |||
| super(InvertedResidual, self).__init__() | |||
| norm_kwargs = norm_kwargs or {} | |||
| conv_kwargs = conv_kwargs or {} | |||
| mid_chs = make_divisible(in_chs * exp_ratio) | |||
| has_se = se_ratio is not None and se_ratio > 0. | |||
| self.has_residual = (in_chs == out_chs and stride == 1) and not noskip | |||
| self.drop_path_rate = drop_path_rate | |||
| # Point-wise expansion | |||
| self.conv_pw = create_conv2d( | |||
| in_chs, | |||
| mid_chs, | |||
| exp_kernel_size, | |||
| padding=pad_type, | |||
| **conv_kwargs) | |||
| self.bn1 = norm_layer(mid_chs, **norm_kwargs) | |||
| self.act1 = act_layer(inplace=True) | |||
| # Depth-wise convolution | |||
| self.conv_dw = create_conv2d( | |||
| mid_chs, mid_chs, dw_kernel_size, stride=stride, dilation=dilation, | |||
| padding=pad_type, depthwise=True, **conv_kwargs) | |||
| self.bn2 = norm_layer(mid_chs, **norm_kwargs) | |||
| self.act2 = act_layer(inplace=True) | |||
| # Squeeze-and-excitation | |||
| if has_se: | |||
| se_kwargs = resolve_se_args(se_kwargs, in_chs, act_layer) | |||
| self.se = SqueezeExcite(mid_chs, se_ratio=se_ratio, **se_kwargs) | |||
| else: | |||
| self.se = None | |||
| # Point-wise linear projection | |||
| self.conv_pwl = create_conv2d( | |||
| mid_chs, | |||
| out_chs, | |||
| pw_kernel_size, | |||
| padding=pad_type, | |||
| **conv_kwargs) | |||
| self.bn3 = norm_layer(out_chs, **norm_kwargs) | |||
| def feature_info(self, location): | |||
| if location == 'expansion': # after SE, input to PWL | |||
| info = dict( | |||
| module='conv_pwl', | |||
| hook_type='forward_pre', | |||
| num_chs=self.conv_pwl.in_channels) | |||
| else: # location == 'bottleneck', block output | |||
| info = dict( | |||
| module='', | |||
| hook_type='', | |||
| num_chs=self.conv_pwl.out_channels) | |||
| return info | |||
| def forward(self, x): | |||
| residual = x | |||
| # Point-wise expansion | |||
| x = self.conv_pw(x) | |||
| x = self.bn1(x) | |||
| x = self.act1(x) | |||
| # Depth-wise convolution | |||
| x = self.conv_dw(x) | |||
| x = self.bn2(x) | |||
| x = self.act2(x) | |||
| # Squeeze-and-excitation | |||
| if self.se is not None: | |||
| x = self.se(x) | |||
| # Point-wise linear projection | |||
| x = self.conv_pwl(x) | |||
| x = self.bn3(x) | |||
| if self.has_residual: | |||
| if self.drop_path_rate > 0.: | |||
| x = drop_path(x, self.drop_path_rate, self.training) | |||
| x += residual | |||
| return x | |||
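| # Shape sanity check (illustrative): with stride 1 and in_chs == out_chs the block | |||
| # keeps the spatial resolution and adds the identity shortcut; the sizes below are | |||
| # arbitrary example values. | |||
| if __name__ == "__main__": | |||
| import torch | |||
| block = InvertedResidual(in_chs=32, out_chs=32, dw_kernel_size=3, exp_ratio=4.0, se_ratio=0.25) | |||
| x = torch.randn(2, 32, 56, 56) | |||
| print(block(x).shape)  # expected: torch.Size([2, 32, 56, 56]) | |||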
| @@ -0,0 +1,105 @@ | |||
| # Copyright (c) Microsoft Corporation. | |||
| # Licensed under the MIT License. | |||
| # Written by Hao Du and Houwen Peng | |||
| # email: haodu8-c@my.cityu.edu.hk and houwen.peng@microsoft.com | |||
| import torch | |||
| import torch.nn as nn | |||
| import torch.nn.functional as F | |||
| def conv3x3(in_planes, out_planes, stride=1): | |||
| "3x3 convolution with padding" | |||
| return nn.Conv2d(in_planes, out_planes, kernel_size=3, stride=stride, | |||
| padding=1, bias=True) | |||
| class BasicBlock(nn.Module): | |||
| expansion = 1 | |||
| def __init__(self, inplanes, planes, stride=1, downsample=None): | |||
| super(BasicBlock, self).__init__() | |||
| self.conv1 = conv3x3(inplanes, planes, stride) | |||
| self.bn1 = nn.BatchNorm2d(planes) | |||
| self.relu = nn.ReLU(inplace=True) | |||
| self.conv2 = conv3x3(planes, planes) | |||
| self.bn2 = nn.BatchNorm2d(planes) | |||
| self.downsample = downsample | |||
| self.stride = stride | |||
| def forward(self, x): | |||
| residual = x | |||
| out = self.conv1(x) | |||
| out = self.bn1(out) | |||
| out = self.relu(out) | |||
| out = self.conv2(out) | |||
| out = self.bn2(out) | |||
| if self.downsample is not None: | |||
| residual = self.downsample(x) | |||
| out += residual | |||
| out = self.relu(out) | |||
| return out | |||
| class Bottleneck(nn.Module): | |||
| def __init__(self, inplanes, planes, stride=1, expansion=4): | |||
| super(Bottleneck, self).__init__() | |||
| planes = int(planes / expansion) | |||
| self.conv1 = nn.Conv2d(inplanes, planes, kernel_size=1, bias=True) | |||
| self.bn1 = nn.BatchNorm2d(planes) | |||
| self.conv2 = nn.Conv2d(planes, planes, kernel_size=3, stride=stride, | |||
| padding=1, bias=True) | |||
| self.bn2 = nn.BatchNorm2d(planes) | |||
| self.conv3 = nn.Conv2d( | |||
| planes, | |||
| planes * expansion, | |||
| kernel_size=1, | |||
| bias=True) | |||
| self.bn3 = nn.BatchNorm2d(planes * expansion) | |||
| self.relu = nn.ReLU(inplace=True) | |||
| self.stride = stride | |||
| self.expansion = expansion | |||
| if inplanes != planes * self.expansion: | |||
| self.downsample = nn.Sequential( | |||
| nn.Conv2d(inplanes, planes * self.expansion, | |||
| kernel_size=1, stride=stride, bias=True), | |||
| nn.BatchNorm2d(planes * self.expansion), | |||
| ) | |||
| else: | |||
| self.downsample = None | |||
| def forward(self, x): | |||
| residual = x | |||
| out = self.conv1(x) | |||
| out = self.bn1(out) | |||
| out = self.relu(out) | |||
| out = self.conv2(out) | |||
| out = self.bn2(out) | |||
| out = self.relu(out) | |||
| out = self.conv3(out) | |||
| out = self.bn3(out) | |||
| if self.downsample is not None: | |||
| residual = self.downsample(x) | |||
| out += residual | |||
| out = self.relu(out) | |||
| return out | |||
| def get_Bottleneck(in_c, out_c, stride): | |||
| return Bottleneck(in_c, out_c, stride=stride) | |||
| def get_BasicBlock(in_c, out_c, stride): | |||
| return BasicBlock(in_c, out_c, stride=stride) | |||
| @@ -0,0 +1,182 @@ | |||
| from ...utils.util import * | |||
| from collections import OrderedDict | |||
| from timm.models.efficientnet_blocks import * | |||
| class ChildNetBuilder: | |||
| def __init__( | |||
| self, | |||
| channel_multiplier=1.0, | |||
| channel_divisor=8, | |||
| channel_min=None, | |||
| output_stride=32, | |||
| pad_type='', | |||
| act_layer=None, | |||
| se_kwargs=None, | |||
| norm_layer=nn.BatchNorm2d, | |||
| norm_kwargs=None, | |||
| drop_path_rate=0., | |||
| feature_location='', | |||
| verbose=False, | |||
| logger=None): | |||
| self.channel_multiplier = channel_multiplier | |||
| self.channel_divisor = channel_divisor | |||
| self.channel_min = channel_min | |||
| self.output_stride = output_stride | |||
| self.pad_type = pad_type | |||
| self.act_layer = act_layer | |||
| self.se_kwargs = se_kwargs | |||
| self.norm_layer = norm_layer | |||
| self.norm_kwargs = norm_kwargs | |||
| self.drop_path_rate = drop_path_rate | |||
| self.feature_location = feature_location | |||
| assert feature_location in ('pre_pwl', 'post_exp', '') | |||
| self.verbose = verbose | |||
| self.in_chs = None | |||
| self.features = OrderedDict() | |||
| self.logger = logger | |||
| def _round_channels(self, chs): | |||
| return round_channels( | |||
| chs, | |||
| self.channel_multiplier, | |||
| self.channel_divisor, | |||
| self.channel_min) | |||
| def _make_block(self, ba, block_idx, block_count): | |||
| drop_path_rate = self.drop_path_rate * block_idx / block_count | |||
| bt = ba.pop('block_type') | |||
| ba['in_chs'] = self.in_chs | |||
| ba['out_chs'] = self._round_channels(ba['out_chs']) | |||
| if 'fake_in_chs' in ba and ba['fake_in_chs']: | |||
| ba['fake_in_chs'] = self._round_channels(ba['fake_in_chs']) | |||
| ba['norm_layer'] = self.norm_layer | |||
| ba['norm_kwargs'] = self.norm_kwargs | |||
| ba['pad_type'] = self.pad_type | |||
| # block act fn overrides the model default | |||
| ba['act_layer'] = ba['act_layer'] if ba['act_layer'] is not None else self.act_layer | |||
| assert ba['act_layer'] is not None | |||
| if bt == 'ir': | |||
| ba['drop_path_rate'] = drop_path_rate | |||
| ba['se_kwargs'] = self.se_kwargs | |||
| if self.verbose: | |||
| self.logger.info( | |||
| ' InvertedResidual {}, Args: {}'.format( | |||
| block_idx, str(ba))) | |||
| block = InvertedResidual(**ba) | |||
| elif bt == 'ds' or bt == 'dsa': | |||
| ba['drop_path_rate'] = drop_path_rate | |||
| ba['se_kwargs'] = self.se_kwargs | |||
| if self.verbose: | |||
| self.logger.info( | |||
| ' DepthwiseSeparable {}, Args: {}'.format( | |||
| block_idx, str(ba))) | |||
| block = DepthwiseSeparableConv(**ba) | |||
| elif bt == 'cn': | |||
| if self.verbose: | |||
| self.logger.info( | |||
| ' ConvBnAct {}, Args: {}'.format( | |||
| block_idx, str(ba))) | |||
| block = ConvBnAct(**ba) | |||
| else: | |||
| assert False, 'Unknown block type (%s) while building model.' % bt | |||
| self.in_chs = ba['out_chs'] # update in_chs for arg of next block | |||
| return block | |||
| def __call__(self, in_chs, model_block_args): | |||
| """ Build the blocks | |||
| Args: | |||
| in_chs: Number of input-channels passed to first block | |||
| model_block_args: A list of lists, outer list defines stages, inner | |||
| list contains strings defining block configuration(s) | |||
| Return: | |||
| List of block stacks (each stack wrapped in nn.Sequential) | |||
| """ | |||
| if self.verbose: | |||
| self.logger.info( | |||
| 'Building model trunk with %d stages...' % | |||
| len(model_block_args)) | |||
| self.in_chs = in_chs | |||
| total_block_count = sum([len(x) for x in model_block_args]) | |||
| total_block_idx = 0 | |||
| current_stride = 2 | |||
| current_dilation = 1 | |||
| feature_idx = 0 | |||
| stages = [] | |||
| # outer list of block_args defines the stacks ('stages' by some | |||
| # conventions) | |||
| for stage_idx, stage_block_args in enumerate(model_block_args): | |||
| last_stack = stage_idx == (len(model_block_args) - 1) | |||
| if self.verbose: | |||
| self.logger.info('Stack: {}'.format(stage_idx)) | |||
| assert isinstance(stage_block_args, list) | |||
| blocks = [] | |||
| # each stack (stage) contains a list of block arguments | |||
| for block_idx, block_args in enumerate(stage_block_args): | |||
| last_block = block_idx == (len(stage_block_args) - 1) | |||
| extract_features = '' # No features extracted | |||
| if self.verbose: | |||
| self.logger.info(' Block: {}'.format(block_idx)) | |||
| # Sort out stride, dilation, and feature extraction details | |||
| assert block_args['stride'] in (1, 2) | |||
| if block_idx >= 1: | |||
| # only the first block in any stack can have a stride > 1 | |||
| block_args['stride'] = 1 | |||
| do_extract = False | |||
| if self.feature_location == 'pre_pwl': | |||
| if last_block: | |||
| next_stage_idx = stage_idx + 1 | |||
| if next_stage_idx >= len(model_block_args): | |||
| do_extract = True | |||
| else: | |||
| do_extract = model_block_args[next_stage_idx][0]['stride'] > 1 | |||
| elif self.feature_location == 'post_exp': | |||
| if block_args['stride'] > 1 or (last_stack and last_block): | |||
| do_extract = True | |||
| if do_extract: | |||
| extract_features = self.feature_location | |||
| next_dilation = current_dilation | |||
| if block_args['stride'] > 1: | |||
| next_output_stride = current_stride * block_args['stride'] | |||
| if next_output_stride > self.output_stride: | |||
| next_dilation = current_dilation * block_args['stride'] | |||
| block_args['stride'] = 1 | |||
| if self.verbose: | |||
| self.logger.info( | |||
| ' Converting stride to dilation to maintain output_stride=={}'.format( | |||
| self.output_stride)) | |||
| else: | |||
| current_stride = next_output_stride | |||
| block_args['dilation'] = current_dilation | |||
| if next_dilation != current_dilation: | |||
| current_dilation = next_dilation | |||
| # create the block | |||
| block = self._make_block( | |||
| block_args, total_block_idx, total_block_count) | |||
| blocks.append(block) | |||
| # stash feature module name and channel info for model feature | |||
| # extraction | |||
| if extract_features: | |||
| feature_module = block.feature_module(extract_features) | |||
| if feature_module: | |||
| feature_module = 'blocks.{}.{}.'.format( | |||
| stage_idx, block_idx) + feature_module | |||
| feature_channels = block.feature_channels(extract_features) | |||
| self.features[feature_idx] = dict( | |||
| name=feature_module, | |||
| num_chs=feature_channels | |||
| ) | |||
| feature_idx += 1 | |||
| # incr global block idx (across all stacks) | |||
| total_block_idx += 1 | |||
| stages.append(nn.Sequential(*blocks)) | |||
| return stages | |||
| @@ -0,0 +1,214 @@ | |||
| from copy import deepcopy | |||
| from ...utils.builder_util import modify_block_args | |||
| from ..blocks import get_Bottleneck, InvertedResidual | |||
| from timm.models.efficientnet_blocks import * | |||
| from pytorch.mutables import LayerChoice | |||
| class SuperNetBuilder: | |||
| """ Build Trunk Blocks | |||
| """ | |||
| def __init__( | |||
| self, | |||
| choices, | |||
| channel_multiplier=1.0, | |||
| channel_divisor=8, | |||
| channel_min=None, | |||
| output_stride=32, | |||
| pad_type='', | |||
| act_layer=None, | |||
| se_kwargs=None, | |||
| norm_layer=nn.BatchNorm2d, | |||
| norm_kwargs=None, | |||
| drop_path_rate=0., | |||
| feature_location='', | |||
| verbose=False, | |||
| resunit=False, | |||
| dil_conv=False, | |||
| logger=None): | |||
| # dict | |||
| # choices = {'kernel_size': [3, 5, 7], 'exp_ratio': [4, 6]} | |||
| self.choices = [[x, y] for x in choices['kernel_size'] | |||
| for y in choices['exp_ratio']] | |||
| self.choices_num = len(self.choices) - 1 | |||
| self.channel_multiplier = channel_multiplier | |||
| self.channel_divisor = channel_divisor | |||
| self.channel_min = channel_min | |||
| self.output_stride = output_stride | |||
| self.pad_type = pad_type | |||
| self.act_layer = act_layer | |||
| self.se_kwargs = se_kwargs | |||
| self.norm_layer = norm_layer | |||
| self.norm_kwargs = norm_kwargs | |||
| self.drop_path_rate = drop_path_rate | |||
| self.feature_location = feature_location | |||
| assert feature_location in ('pre_pwl', 'post_exp', '') | |||
| self.verbose = verbose | |||
| self.resunit = resunit | |||
| self.dil_conv = dil_conv | |||
| self.logger = logger | |||
| # state updated during build, consumed by model | |||
| self.in_chs = None | |||
| def _round_channels(self, chs): | |||
| return round_channels( | |||
| chs, | |||
| self.channel_multiplier, | |||
| self.channel_divisor, | |||
| self.channel_min) | |||
| def _make_block( | |||
| self, | |||
| ba, | |||
| choice_idx, | |||
| block_idx, | |||
| block_count, | |||
| resunit=False, | |||
| dil_conv=False): | |||
| drop_path_rate = self.drop_path_rate * block_idx / block_count | |||
| bt = ba.pop('block_type') | |||
| ba['in_chs'] = self.in_chs | |||
| ba['out_chs'] = self._round_channels(ba['out_chs']) | |||
| if 'fake_in_chs' in ba and ba['fake_in_chs']: | |||
| # FIXME this is a hack to work around mismatch in origin impl input | |||
| # filters | |||
| ba['fake_in_chs'] = self._round_channels(ba['fake_in_chs']) | |||
| ba['norm_layer'] = self.norm_layer | |||
| ba['norm_kwargs'] = self.norm_kwargs | |||
| ba['pad_type'] = self.pad_type | |||
| # block act fn overrides the model default | |||
| ba['act_layer'] = ba['act_layer'] if ba['act_layer'] is not None else self.act_layer | |||
| assert ba['act_layer'] is not None | |||
| if bt == 'ir': | |||
| ba['drop_path_rate'] = drop_path_rate | |||
| ba['se_kwargs'] = self.se_kwargs | |||
| if self.verbose: | |||
| self.logger.info( | |||
| ' InvertedResidual {}, Args: {}'.format( | |||
| block_idx, str(ba))) | |||
| block = InvertedResidual(**ba) | |||
| elif bt == 'ds' or bt == 'dsa': | |||
| ba['drop_path_rate'] = drop_path_rate | |||
| ba['se_kwargs'] = self.se_kwargs | |||
| if self.verbose: | |||
| self.logger.info( | |||
| ' DepthwiseSeparable {}, Args: {}'.format( | |||
| block_idx, str(ba))) | |||
| block = DepthwiseSeparableConv(**ba) | |||
| elif bt == 'cn': | |||
| if self.verbose: | |||
| self.logger.info( | |||
| ' ConvBnAct {}, Args: {}'.format( | |||
| block_idx, str(ba))) | |||
| block = ConvBnAct(**ba) | |||
| else: | |||
| assert False, 'Unknown block type (%s) while building model.' % bt | |||
| if choice_idx == self.choice_num - 1: | |||
| self.in_chs = ba['out_chs'] # update in_chs for arg of next block | |||
| return block | |||
| def __call__(self, in_chs, model_block_args): | |||
| """ Build the blocks | |||
| Args: | |||
| in_chs: Number of input-channels passed to first block | |||
| model_block_args: A list of lists, outer list defines stages, inner | |||
| list contains strings defining block configuration(s) | |||
| Return: | |||
| List of block stacks (each stack wrapped in nn.Sequential) | |||
| """ | |||
| if self.verbose: | |||
| self.logger.info('Building model trunk with %d stages...' % len(model_block_args)) | |||
| self.in_chs = in_chs | |||
| total_block_count = sum([len(x) for x in model_block_args]) | |||
| total_block_idx = 0 | |||
| current_stride = 2 | |||
| current_dilation = 1 | |||
| feature_idx = 0 | |||
| stages = [] | |||
| # outer list of block_args defines the stacks ('stages' by some conventions) | |||
| for stage_idx, stage_block_args in enumerate(model_block_args): | |||
| last_stack = stage_idx == (len(model_block_args) - 1) | |||
| if self.verbose: | |||
| self.logger.info('Stack: {}'.format(stage_idx)) | |||
| assert isinstance(stage_block_args, list) | |||
| # blocks = [] | |||
| # each stack (stage) contains a list of block arguments | |||
| for block_idx, block_args in enumerate(stage_block_args): | |||
| last_block = block_idx == (len(stage_block_args) - 1) | |||
| if self.verbose: | |||
| self.logger.info(' Block: {}'.format(block_idx)) | |||
| # Sort out stride, dilation, and feature extraction details | |||
| assert block_args['stride'] in (1, 2) | |||
| if block_idx >= 1: | |||
| # only the first block in any stack can have a stride > 1 | |||
| block_args['stride'] = 1 | |||
| next_dilation = current_dilation | |||
| if block_args['stride'] > 1: | |||
| next_output_stride = current_stride * block_args['stride'] | |||
| if next_output_stride > self.output_stride: | |||
| next_dilation = current_dilation * block_args['stride'] | |||
| block_args['stride'] = 1 | |||
| else: | |||
| current_stride = next_output_stride | |||
| block_args['dilation'] = current_dilation | |||
| if next_dilation != current_dilation: | |||
| current_dilation = next_dilation | |||
| if stage_idx == 0 or stage_idx == 6: | |||
| self.choice_num = 1 | |||
| else: | |||
| self.choice_num = len(self.choices) | |||
| if self.dil_conv: | |||
| self.choice_num += 2 | |||
| choice_blocks = [] | |||
| block_args_copy = deepcopy(block_args) | |||
| if self.choice_num == 1: | |||
| # create the block | |||
| block = self._make_block(block_args, 0, total_block_idx, total_block_count) | |||
| choice_blocks.append(block) | |||
| else: | |||
| for choice_idx, choice in enumerate(self.choices): | |||
| # create the block | |||
| block_args = deepcopy(block_args_copy) | |||
| block_args = modify_block_args(block_args, choice[0], choice[1]) | |||
| block = self._make_block(block_args, choice_idx, total_block_idx, total_block_count) | |||
| choice_blocks.append(block) | |||
| if self.dil_conv: | |||
| block_args = deepcopy(block_args_copy) | |||
| block_args = modify_block_args(block_args, 3, 0) | |||
| block = self._make_block(block_args, self.choice_num - 2, total_block_idx, total_block_count, | |||
| resunit=self.resunit, dil_conv=self.dil_conv) | |||
| choice_blocks.append(block) | |||
| block_args = deepcopy(block_args_copy) | |||
| block_args = modify_block_args(block_args, 5, 0) | |||
| block = self._make_block(block_args, self.choice_num - 1, total_block_idx, total_block_count, | |||
| resunit=self.resunit, dil_conv=self.dil_conv) | |||
| choice_blocks.append(block) | |||
| if self.resunit: | |||
| block = get_Bottleneck(block.conv_pw.in_channels, | |||
| block.conv_pwl.out_channels, | |||
| block.conv_dw.stride[0]) | |||
| choice_blocks.append(block) | |||
| choice_block = LayerChoice(choice_blocks) | |||
| stages.append(choice_block) | |||
| # create the block | |||
| # block = self._make_block(block_args, total_block_idx, total_block_count) | |||
| total_block_idx += 1 # incr global block idx (across all stacks) | |||
| # stages.append(blocks) | |||
| return stages | |||
| @@ -0,0 +1,145 @@ | |||
| from ...utils.builder_util import * | |||
| from ..builders.build_childnet import * | |||
| from timm.models.layers import SelectAdaptivePool2d | |||
| from timm.models.layers.activations import hard_sigmoid | |||
| class ChildNet(nn.Module): | |||
| def __init__( | |||
| self, | |||
| block_args, | |||
| num_classes=1000, | |||
| in_chans=3, | |||
| stem_size=16, | |||
| num_features=1280, | |||
| head_bias=True, | |||
| channel_multiplier=1.0, | |||
| pad_type='', | |||
| act_layer=nn.ReLU, | |||
| drop_rate=0., | |||
| drop_path_rate=0., | |||
| se_kwargs=None, | |||
| norm_layer=nn.BatchNorm2d, | |||
| norm_kwargs=None, | |||
| global_pool='avg', | |||
| logger=None, | |||
| verbose=False): | |||
| super(ChildNet, self).__init__() | |||
| self.num_classes = num_classes | |||
| self.num_features = num_features | |||
| self.drop_rate = drop_rate | |||
| self._in_chs = in_chans | |||
| self.logger = logger | |||
| # Stem | |||
| stem_size = round_channels(stem_size, channel_multiplier) | |||
| self.conv_stem = create_conv2d( | |||
| self._in_chs, stem_size, 3, stride=2, padding=pad_type) | |||
| self.bn1 = norm_layer(stem_size, **norm_kwargs) | |||
| self.act1 = act_layer(inplace=True) | |||
| self._in_chs = stem_size | |||
| # Middle stages (IR/ER/DS Blocks) | |||
| builder = ChildNetBuilder( | |||
| channel_multiplier, 8, None, 32, pad_type, act_layer, se_kwargs, | |||
| norm_layer, norm_kwargs, drop_path_rate, verbose=verbose) | |||
| self.blocks = nn.Sequential(*builder(self._in_chs, block_args)) | |||
| # self.blocks = builder(self._in_chs, block_args) | |||
| self._in_chs = builder.in_chs | |||
| # Head + Pooling | |||
| self.global_pool = SelectAdaptivePool2d(pool_type=global_pool) | |||
| self.conv_head = create_conv2d( | |||
| self._in_chs, | |||
| self.num_features, | |||
| 1, | |||
| padding=pad_type, | |||
| bias=head_bias) | |||
| self.act2 = act_layer(inplace=True) | |||
| # Classifier | |||
| self.classifier = nn.Linear( | |||
| self.num_features * | |||
| self.global_pool.feat_mult(), | |||
| self.num_classes) | |||
| efficientnet_init_weights(self) | |||
| def get_classifier(self): | |||
| return self.classifier | |||
| def reset_classifier(self, num_classes, global_pool='avg'): | |||
| self.global_pool = SelectAdaptivePool2d(pool_type=global_pool) | |||
| self.num_classes = num_classes | |||
| self.classifier = nn.Linear( | |||
| self.num_features * self.global_pool.feat_mult(), | |||
| num_classes) if self.num_classes else None | |||
| def forward_features(self, x): | |||
| # architecture = [[0], [], [], [], [], [0]] | |||
| x = self.conv_stem(x) | |||
| x = self.bn1(x) | |||
| x = self.act1(x) | |||
| x = self.blocks(x) | |||
| x = self.global_pool(x) | |||
| x = self.conv_head(x) | |||
| x = self.act2(x) | |||
| return x | |||
| def forward(self, x): | |||
| x = self.forward_features(x) | |||
| x = x.flatten(1) | |||
| if self.drop_rate > 0.: | |||
| x = F.dropout(x, p=self.drop_rate, training=self.training) | |||
| x = self.classifier(x) | |||
| return x | |||
| def gen_childnet(arch_list, arch_def, **kwargs): | |||
| # arch_list = [[0], [], [], [], [], [0]] | |||
| choices = {'kernel_size': [3, 5, 7], 'exp_ratio': [4, 6]} | |||
| choices_list = [[x, y] for x in choices['kernel_size'] | |||
| for y in choices['exp_ratio']] | |||
| num_features = 1280 | |||
| # act_layer = HardSwish | |||
| act_layer = Swish | |||
| new_arch = [] | |||
| # change to child arch_def | |||
| for i, (layer_choice, layer_arch) in enumerate(zip(arch_list, arch_def)): | |||
| if len(layer_arch) == 1: | |||
| new_arch.append(layer_arch) | |||
| continue | |||
| else: | |||
| new_layer = [] | |||
| for j, (block_choice, block_arch) in enumerate( | |||
| zip(layer_choice, layer_arch)): | |||
| kernel_size, exp_ratio = choices_list[block_choice] | |||
| elements = block_arch.split('_') | |||
| block_arch = block_arch.replace( | |||
| elements[2], 'k{}'.format(str(kernel_size))) | |||
| block_arch = block_arch.replace( | |||
| elements[4], 'e{}'.format(str(exp_ratio))) | |||
| new_layer.append(block_arch) | |||
| new_arch.append(new_layer) | |||
| model_kwargs = dict( | |||
| block_args=decode_arch_def(new_arch), | |||
| num_features=num_features, | |||
| stem_size=16, | |||
| norm_kwargs=resolve_bn_args(kwargs), | |||
| act_layer=act_layer, | |||
| se_kwargs=dict( | |||
| act_layer=nn.ReLU, | |||
| gate_fn=hard_sigmoid, | |||
| reduce_mid=True, | |||
| divisor=8), | |||
| **kwargs, | |||
| ) | |||
| model = ChildNet(**model_kwargs) | |||
| return model | |||
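| # Minimal usage sketch (not part of the original source; mirrors the retraining entry | |||
| # point, where arch_list/arch_def come from a completed search): | |||
| #   model = gen_childnet(arch_list, arch_def, num_classes=1000, | |||
| #                        drop_rate=0.0, global_pool='avg') | |||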
| @@ -0,0 +1,202 @@ | |||
| # Copyright (c) Microsoft Corporation. | |||
| # Licensed under the MIT License. | |||
| # Written by Hao Du and Houwen Peng | |||
| # email: haodu8-c@my.cityu.edu.hk and houwen.peng@microsoft.com | |||
| from ...utils.builder_util import * | |||
| from ...utils.search_structure_supernet import * | |||
| from ...utils.op_by_layer_dict import flops_op_dict | |||
| from ..builders.build_supernet import * | |||
| from timm.models.layers import SelectAdaptivePool2d | |||
| from timm.models.layers.activations import hard_sigmoid | |||
| class SuperNet(nn.Module): | |||
| def __init__( | |||
| self, | |||
| block_args, | |||
| choices, | |||
| num_classes=1000, | |||
| in_chans=3, | |||
| stem_size=16, | |||
| num_features=1280, | |||
| head_bias=True, | |||
| channel_multiplier=1.0, | |||
| pad_type='', | |||
| act_layer=nn.ReLU, | |||
| drop_rate=0., | |||
| drop_path_rate=0., | |||
| slice=4, | |||
| se_kwargs=None, | |||
| norm_layer=nn.BatchNorm2d, | |||
| logger=None, | |||
| norm_kwargs=None, | |||
| global_pool='avg', | |||
| resunit=False, | |||
| dil_conv=False, | |||
| verbose=False): | |||
| super(SuperNet, self).__init__() | |||
| self.num_classes = num_classes | |||
| self.num_features = num_features | |||
| self.drop_rate = drop_rate | |||
| self._in_chs = in_chans | |||
| self.logger = logger | |||
| # Stem | |||
| stem_size = round_channels(stem_size, channel_multiplier) | |||
| self.conv_stem = create_conv2d( | |||
| self._in_chs, stem_size, 3, stride=2, padding=pad_type) | |||
| self.bn1 = norm_layer(stem_size, **norm_kwargs) | |||
| self.act1 = act_layer(inplace=True) | |||
| self._in_chs = stem_size | |||
| # Middle stages (IR/ER/DS Blocks) | |||
| builder = SuperNetBuilder( | |||
| choices, | |||
| channel_multiplier, | |||
| 8, | |||
| None, | |||
| 32, | |||
| pad_type, | |||
| act_layer, | |||
| se_kwargs, | |||
| norm_layer, | |||
| norm_kwargs, | |||
| drop_path_rate, | |||
| verbose=verbose, | |||
| resunit=resunit, | |||
| dil_conv=dil_conv, | |||
| logger=self.logger) | |||
| blocks = builder(self._in_chs, block_args) | |||
| self.blocks = nn.Sequential(*blocks) | |||
| self._in_chs = builder.in_chs | |||
| # Head + Pooling | |||
| self.global_pool = SelectAdaptivePool2d(pool_type=global_pool) | |||
| self.conv_head = create_conv2d( | |||
| self._in_chs, | |||
| self.num_features, | |||
| 1, | |||
| padding=pad_type, | |||
| bias=head_bias) | |||
| self.act2 = act_layer(inplace=True) | |||
| # Classifier | |||
| self.classifier = nn.Linear( | |||
| self.num_features * | |||
| self.global_pool.feat_mult(), | |||
| self.num_classes) | |||
| self.meta_layer = nn.Linear(self.num_classes * slice, 1) | |||
| efficientnet_init_weights(self) | |||
| def get_classifier(self): | |||
| return self.classifier | |||
| def reset_classifier(self, num_classes, global_pool='avg'): | |||
| self.global_pool = SelectAdaptivePool2d(pool_type=global_pool) | |||
| self.num_classes = num_classes | |||
| self.classifier = nn.Linear( | |||
| self.num_features * self.global_pool.feat_mult(), | |||
| num_classes) if self.num_classes else None | |||
| def forward_features(self, x): | |||
| x = self.conv_stem(x) | |||
| x = self.bn1(x) | |||
| x = self.act1(x) | |||
| x = self.blocks(x) | |||
| x = self.global_pool(x) | |||
| x = self.conv_head(x) | |||
| x = self.act2(x) | |||
| return x | |||
| def forward(self, x): | |||
| x = self.forward_features(x) | |||
| x = x.flatten(1) | |||
| if self.drop_rate > 0.: | |||
| x = F.dropout(x, p=self.drop_rate, training=self.training) | |||
| return self.classifier(x) | |||
| def forward_meta(self, features): | |||
| return self.meta_layer(features.view(1, -1)) | |||
| def rand_parameters(self, architecture, meta=False): | |||
| for name, param in self.named_parameters(recurse=True): | |||
| if 'meta' in name and meta: | |||
| yield param | |||
| elif 'blocks' not in name and 'meta' not in name and (not meta): | |||
| yield param | |||
| if not meta: | |||
| for layer, layer_arch in zip(self.blocks, architecture): | |||
| for blocks, arch in zip(layer, layer_arch): | |||
| if arch == -1: | |||
| continue | |||
| for name, param in blocks[arch].named_parameters( | |||
| recurse=True): | |||
| yield param | |||
| class Classifier(nn.Module): | |||
| def __init__(self, num_classes=1000): | |||
| super(Classifier, self).__init__() | |||
| self.classifier = nn.Linear(num_classes, num_classes) | |||
| def forward(self, x): | |||
| return self.classifier(x) | |||
| def gen_supernet(flops_minimum=0, flops_maximum=600, **kwargs): | |||
| choices = {'kernel_size': [3, 5, 7], 'exp_ratio': [4, 6]} | |||
| num_features = 1280 | |||
| # act_layer = HardSwish | |||
| act_layer = Swish | |||
| arch_def = [ | |||
| # stage 0, 112x112 in | |||
| ['ds_r1_k3_s1_e1_c16_se0.25'], | |||
| # stage 1, 112x112 in | |||
| ['ir_r1_k3_s2_e4_c24_se0.25', 'ir_r1_k3_s1_e4_c24_se0.25', 'ir_r1_k3_s1_e4_c24_se0.25', | |||
| 'ir_r1_k3_s1_e4_c24_se0.25'], | |||
| # stage 2, 56x56 in | |||
| ['ir_r1_k5_s2_e4_c40_se0.25', 'ir_r1_k5_s1_e4_c40_se0.25', 'ir_r1_k5_s2_e4_c40_se0.25', | |||
| 'ir_r1_k5_s2_e4_c40_se0.25'], | |||
| # stage 3, 28x28 in | |||
| ['ir_r1_k3_s2_e6_c80_se0.25', 'ir_r1_k3_s1_e4_c80_se0.25', 'ir_r1_k3_s1_e4_c80_se0.25', | |||
| 'ir_r2_k3_s1_e4_c80_se0.25'], | |||
| # stage 4, 14x14in | |||
| ['ir_r1_k3_s1_e6_c96_se0.25', 'ir_r1_k3_s1_e6_c96_se0.25', 'ir_r1_k3_s1_e6_c96_se0.25', | |||
| 'ir_r1_k3_s1_e6_c96_se0.25'], | |||
| # stage 5, 14x14in | |||
| ['ir_r1_k5_s2_e6_c192_se0.25', 'ir_r1_k5_s1_e6_c192_se0.25', 'ir_r1_k5_s2_e6_c192_se0.25', | |||
| 'ir_r1_k5_s2_e6_c192_se0.25'], | |||
| # stage 6, 7x7 in | |||
| ['cn_r1_k1_s1_c320_se0.25'], | |||
| ] | |||
| sta_num, arch_def, resolution = search_for_layer( | |||
| flops_op_dict, arch_def, flops_minimum, flops_maximum) | |||
| if sta_num is None or arch_def is None or resolution is None: | |||
| raise ValueError('Invalid FLOPs Settings') | |||
| model_kwargs = dict( | |||
| block_args=decode_arch_def(arch_def), | |||
| choices=choices, | |||
| num_features=num_features, | |||
| stem_size=16, | |||
| norm_kwargs=resolve_bn_args(kwargs), | |||
| act_layer=act_layer, | |||
| se_kwargs=dict( | |||
| act_layer=nn.ReLU, | |||
| gate_fn=hard_sigmoid, | |||
| reduce_mid=True, | |||
| divisor=8), | |||
| **kwargs, | |||
| ) | |||
| model = SuperNet(**model_kwargs) | |||
| return model, sta_num, resolution, arch_def | |||
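| # Minimal usage sketch (not part of the original source; argument values are assumed): | |||
| #   model, sta_num, resolution, arch_def = gen_supernet( | |||
| #       flops_minimum=0, flops_maximum=600, num_classes=1000, verbose=False) | |||
| # The returned resolution is the suggested input image size for the chosen FLOPs window. | |||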
| @@ -0,0 +1,270 @@ | |||
| import re | |||
| import math | |||
| import torch.nn as nn | |||
| from copy import deepcopy | |||
| from timm.utils import * | |||
| from timm.models.layers.activations import Swish | |||
| from timm.models.layers import CondConv2d, get_condconv_initializer | |||
| def parse_ksize(ss): | |||
| if ss.isdigit(): | |||
| return int(ss) | |||
| else: | |||
| return [int(k) for k in ss.split('.')] | |||
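| # Illustrative behaviour of the helper above (added for clarity): | |||
| #   parse_ksize('3')     -> 3 | |||
| #   parse_ksize('3.5.7') -> [3, 5, 7] | |||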
| def decode_arch_def( | |||
| arch_def, | |||
| depth_multiplier=1.0, | |||
| depth_trunc='ceil', | |||
| experts_multiplier=1): | |||
| arch_args = [] | |||
| for stack_idx, block_strings in enumerate(arch_def): | |||
| assert isinstance(block_strings, list) | |||
| stack_args = [] | |||
| repeats = [] | |||
| for block_str in block_strings: | |||
| assert isinstance(block_str, str) | |||
| ba, rep = decode_block_str(block_str) | |||
| if ba.get('num_experts', 0) > 0 and experts_multiplier > 1: | |||
| ba['num_experts'] *= experts_multiplier | |||
| stack_args.append(ba) | |||
| repeats.append(rep) | |||
| arch_args.append( | |||
| scale_stage_depth( | |||
| stack_args, | |||
| repeats, | |||
| depth_multiplier, | |||
| depth_trunc)) | |||
| return arch_args | |||
| def modify_block_args(block_args, kernel_size, exp_ratio): | |||
| block_type = block_args['block_type'] | |||
| if block_type == 'cn': | |||
| block_args['kernel_size'] = kernel_size | |||
| elif block_type == 'er': | |||
| block_args['exp_kernel_size'] = kernel_size | |||
| else: | |||
| block_args['dw_kernel_size'] = kernel_size | |||
| if block_type == 'ir' or block_type == 'er': | |||
| block_args['exp_ratio'] = exp_ratio | |||
| return block_args | |||
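| # Illustrative example (added for clarity): for an inverted-residual ('ir') block, | |||
| #   ba = {'block_type': 'ir', 'dw_kernel_size': 3, 'exp_ratio': 4} | |||
| #   ba = modify_block_args(ba, 5, 6)   # -> dw_kernel_size == 5, exp_ratio == 6 | |||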
| def decode_block_str(block_str): | |||
| """ Decode block definition string | |||
| Decodes a string notation of arguments into a block-args dict plus a repeat count. | |||
| E.g. ir_r2_k3_s2_e1_i32_o16_se0.25_noskip | |||
| All args can exist in any order with the exception of the leading string which | |||
| is assumed to indicate the block type. | |||
| leading string - block type ( | |||
| ir = InvertedResidual, ds = DepthwiseSep, dsa = DepthwiseSep with pw act, cn = ConvBnAct) | |||
| r - number of repeat blocks, | |||
| k - kernel size, | |||
| s - strides (1-9), | |||
| e - expansion ratio, | |||
| c - output channels, | |||
| se - squeeze/excitation ratio | |||
| n - activation fn ('re', 'r6', 'hs', or 'sw') | |||
| Args: | |||
| block_str: a string representation of block arguments. | |||
| Returns: | |||
| A tuple of (block args dict, number of repeats) | |||
| Raises: | |||
| ValueError: if the string definition is not properly specified (TODO) | |||
| """ | |||
| assert isinstance(block_str, str) | |||
| ops = block_str.split('_') | |||
| block_type = ops[0] # take the block type off the front | |||
| ops = ops[1:] | |||
| options = {} | |||
| noskip = False | |||
| for op in ops: | |||
| # string options being checked on individual basis, combine if they | |||
| # grow | |||
| if op == 'noskip': | |||
| noskip = True | |||
| elif op.startswith('n'): | |||
| # activation fn | |||
| key = op[0] | |||
| v = op[1:] | |||
| if v == 're': | |||
| value = nn.ReLU | |||
| elif v == 'r6': | |||
| value = nn.ReLU6 | |||
| elif v == 'sw': | |||
| value = Swish | |||
| else: | |||
| continue | |||
| options[key] = value | |||
| else: | |||
| # all numeric options | |||
| splits = re.split(r'(\d.*)', op) | |||
| if len(splits) >= 2: | |||
| key, value = splits[:2] | |||
| options[key] = value | |||
| # if act_layer is None, the model default (passed to model init) will be | |||
| # used | |||
| act_layer = options['n'] if 'n' in options else None | |||
| exp_kernel_size = parse_ksize(options['a']) if 'a' in options else 1 | |||
| pw_kernel_size = parse_ksize(options['p']) if 'p' in options else 1 | |||
| # FIXME hack to deal with in_chs issue in TPU def | |||
| fake_in_chs = int(options['fc']) if 'fc' in options else 0 | |||
| num_repeat = int(options['r']) | |||
| # each type of block has different valid arguments, fill accordingly | |||
| if block_type == 'ir': | |||
| block_args = dict( | |||
| block_type=block_type, | |||
| dw_kernel_size=parse_ksize(options['k']), | |||
| exp_kernel_size=exp_kernel_size, | |||
| pw_kernel_size=pw_kernel_size, | |||
| out_chs=int(options['c']), | |||
| exp_ratio=float(options['e']), | |||
| se_ratio=float(options['se']) if 'se' in options else None, | |||
| stride=int(options['s']), | |||
| act_layer=act_layer, | |||
| noskip=noskip, | |||
| ) | |||
| if 'cc' in options: | |||
| block_args['num_experts'] = int(options['cc']) | |||
| elif block_type == 'ds' or block_type == 'dsa': | |||
| block_args = dict( | |||
| block_type=block_type, | |||
| dw_kernel_size=parse_ksize(options['k']), | |||
| pw_kernel_size=pw_kernel_size, | |||
| out_chs=int(options['c']), | |||
| se_ratio=float(options['se']) if 'se' in options else None, | |||
| stride=int(options['s']), | |||
| act_layer=act_layer, | |||
| pw_act=block_type == 'dsa', | |||
| noskip=block_type == 'dsa' or noskip, | |||
| ) | |||
| elif block_type == 'cn': | |||
| block_args = dict( | |||
| block_type=block_type, | |||
| kernel_size=int(options['k']), | |||
| out_chs=int(options['c']), | |||
| stride=int(options['s']), | |||
| act_layer=act_layer, | |||
| ) | |||
| else: | |||
| assert False, 'Unknown block type (%s)' % block_type | |||
| return block_args, num_repeat | |||
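| # Worked example of the notation above (added for clarity, derived from the parser's | |||
| # behaviour): decode_block_str('ir_r2_k3_s2_e4_c40_se0.25') returns | |||
| #   ({'block_type': 'ir', 'dw_kernel_size': 3, 'exp_kernel_size': 1, 'pw_kernel_size': 1, | |||
| #     'out_chs': 40, 'exp_ratio': 4.0, 'se_ratio': 0.25, 'stride': 2, | |||
| #     'act_layer': None, 'noskip': False}, 2) | |||
| # i.e. one block-args dict plus a repeat count of 2. | |||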
| def scale_stage_depth( | |||
| stack_args, | |||
| repeats, | |||
| depth_multiplier=1.0, | |||
| depth_trunc='ceil'): | |||
| """ Per-stage depth scaling | |||
| Scales the block repeats in each stage. This depth scaling impl maintains | |||
| compatibility with the EfficientNet scaling method, while allowing sensible | |||
| scaling for other models that may have multiple block arg definitions in each stage. | |||
| """ | |||
| # We scale the total repeat count for each stage, there may be multiple | |||
| # block arg defs per stage so we need to sum. | |||
| num_repeat = sum(repeats) | |||
| if depth_trunc == 'round': | |||
| # Truncating to int by rounding allows stages with few repeats to remain | |||
| # proportionally smaller for longer. This is a good choice when stage definitions | |||
| # include single repeat stages that we'd prefer to keep that way as | |||
| # long as possible | |||
| num_repeat_scaled = max(1, round(num_repeat * depth_multiplier)) | |||
| else: | |||
| # The default for EfficientNet truncates repeats to int via 'ceil'. | |||
| # Any multiplier > 1.0 will result in an increased depth for every | |||
| # stage. | |||
| num_repeat_scaled = int(math.ceil(num_repeat * depth_multiplier)) | |||
| # Proportionally distribute repeat count scaling to each block definition in the stage. | |||
| # Allocation is done in reverse as it results in the first block being less likely to be scaled. | |||
| # The first block makes less sense to repeat in most of the arch | |||
| # definitions. | |||
| repeats_scaled = [] | |||
| for r in repeats[::-1]: | |||
| rs = max(1, round((r / num_repeat * num_repeat_scaled))) | |||
| repeats_scaled.append(rs) | |||
| num_repeat -= r | |||
| num_repeat_scaled -= rs | |||
| repeats_scaled = repeats_scaled[::-1] | |||
| # Apply the calculated scaling to each block arg in the stage | |||
| sa_scaled = [] | |||
| for ba, rep in zip(stack_args, repeats_scaled): | |||
| sa_scaled.extend([deepcopy(ba) for _ in range(rep)]) | |||
| return sa_scaled | |||
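| # Worked example (added for clarity): with stack_args=[ba1, ba2], repeats=[1, 2] and | |||
| # depth_multiplier=2.0 under 'ceil' truncation, the total of 3 repeats scales to 6 and | |||
| # the reverse allocation yields repeats_scaled=[2, 4], so the result is | |||
| #   [ba1, ba1, ba2, ba2, ba2, ba2]   (deep copies of the original dicts) | |||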
| def init_weight_goog(m, n='', fix_group_fanout=True, last_bn=None): | |||
| """ Weight initialization as per Tensorflow official implementations. | |||
| Args: | |||
| m (nn.Module): module to init | |||
| n (str): module name | |||
| fix_group_fanout (bool): enable correct (matching Tensorflow TPU impl) fanout calculation w/ group convs | |||
| Handles layers in EfficientNet, EfficientNet-CondConv, MixNet, MnasNet, MobileNetV3, etc: | |||
| * https://github.com/tensorflow/tpu/blob/master/models/official/mnasnet/mnasnet_model.py | |||
| * https://github.com/tensorflow/tpu/blob/master/models/official/efficientnet/efficientnet_model.py | |||
| """ | |||
| if isinstance(m, CondConv2d): | |||
| fan_out = m.kernel_size[0] * m.kernel_size[1] * m.out_channels | |||
| if fix_group_fanout: | |||
| fan_out //= m.groups | |||
| init_weight_fn = get_condconv_initializer(lambda w: w.data.normal_( | |||
| 0, math.sqrt(2.0 / fan_out)), m.num_experts, m.weight_shape) | |||
| init_weight_fn(m.weight) | |||
| if m.bias is not None: | |||
| m.bias.data.zero_() | |||
| elif isinstance(m, nn.Conv2d): | |||
| fan_out = m.kernel_size[0] * m.kernel_size[1] * m.out_channels | |||
| if fix_group_fanout: | |||
| fan_out //= m.groups | |||
| m.weight.data.normal_(0, math.sqrt(2.0 / fan_out)) | |||
| if m.bias is not None: | |||
| m.bias.data.zero_() | |||
| elif isinstance(m, nn.BatchNorm2d): | |||
| if n in last_bn: | |||
| m.weight.data.zero_() | |||
| m.bias.data.zero_() | |||
| else: | |||
| m.weight.data.fill_(1.0) | |||
| m.bias.data.zero_() | |||
| elif isinstance(m, nn.Linear): | |||
| fan_out = m.weight.size(0) # fan-out | |||
| fan_in = 0 | |||
| if 'routing_fn' in n: | |||
| fan_in = m.weight.size(1) | |||
| init_range = 1.0 / math.sqrt(fan_in + fan_out) | |||
| m.weight.data.uniform_(-init_range, init_range) | |||
| m.bias.data.zero_() | |||
| def efficientnet_init_weights( | |||
| model: nn.Module, | |||
| init_fn=None, | |||
| zero_gamma=False): | |||
| last_bn = [] | |||
| if zero_gamma: | |||
| prev_n = '' | |||
| for n, m in model.named_modules(): | |||
| if isinstance(m, nn.BatchNorm2d): | |||
| if ''.join(prev_n.split('.')[:-1]) != ''.join(n.split('.')[:-1]): | |||
| last_bn.append(prev_n) | |||
| prev_n = n | |||
| last_bn.append(prev_n) | |||
| init_fn = init_fn or init_weight_goog | |||
| for n, m in model.named_modules(): | |||
| init_fn(m, n, last_bn=last_bn) | |||
| @@ -0,0 +1,79 @@ | |||
| # Copyright (c) Microsoft Corporation. | |||
| # Licensed under the MIT License. | |||
| # Written by Hao Du and Houwen Peng | |||
| # email: haodu8-c@my.cityu.edu.hk and houwen.peng@microsoft.com | |||
| import torch | |||
| from ptflops import get_model_complexity_info | |||
| class FlopsEst(object): | |||
| def __init__(self, model, input_shape=(2, 3, 224, 224), device='cpu'): | |||
| self.block_num = len(model.blocks) | |||
| self.choice_num = len(model.blocks[0]) | |||
| self.flops_dict = {} | |||
| self.params_dict = {} | |||
| if device == 'cpu': | |||
| model = model.cpu() | |||
| else: | |||
| model = model.cuda() | |||
| self.params_fixed = 0 | |||
| self.flops_fixed = 0 | |||
| input = torch.randn(input_shape) | |||
| flops, params = get_model_complexity_info( | |||
| model.conv_stem, (3, 224, 224), as_strings=False, print_per_layer_stat=False) | |||
| self.params_fixed += params / 1e6 | |||
| self.flops_fixed += flops / 1e6 | |||
| input = model.conv_stem(input) | |||
| for block_id, block in enumerate(model.blocks): | |||
| self.flops_dict[block_id] = {} | |||
| self.params_dict[block_id] = {} | |||
| for module_id, module in enumerate(block): | |||
| flops, params = get_model_complexity_info(module, tuple( | |||
| input.shape[1:]), as_strings=False, print_per_layer_stat=False) | |||
| # Flops(M) | |||
| self.flops_dict[block_id][module_id] = flops / 1e6 | |||
| # Params(M) | |||
| self.params_dict[block_id][module_id] = params / 1e6 | |||
| input = module(input) | |||
| # conv_last | |||
| flops, params = get_model_complexity_info(model.global_pool, tuple( | |||
| input.shape[1:]), as_strings=False, print_per_layer_stat=False) | |||
| self.params_fixed += params / 1e6 | |||
| self.flops_fixed += flops / 1e6 | |||
| input = model.global_pool(input) | |||
| # globalpool | |||
| flops, params = get_model_complexity_info(model.conv_head, tuple( | |||
| input.shape[1:]), as_strings=False, print_per_layer_stat=False) | |||
| self.params_fixed += params / 1e6 | |||
| self.flops_fixed += flops / 1e6 | |||
| # return params (M) | |||
| def get_params(self, arch): | |||
| params = 0 | |||
| for block_id, block in enumerate(arch): | |||
| if block == -1: | |||
| continue | |||
| params += self.params_dict[block_id][block] | |||
| return params + self.params_fixed | |||
| # return flops (M) | |||
| def get_flops(self, arch): | |||
| flops = 0 | |||
| for block_id, block in enumerate(arch): | |||
| if block == 'LayerChoice1' or block == 'LayerChoice23': | |||
| continue | |||
| for idx, choice in enumerate(arch[block]): | |||
| flops += self.flops_dict[block_id][idx] * (1 if choice else 0) | |||
| return flops + self.flops_fixed | |||
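| # Minimal usage sketch (not part of the original source; assumes a supernet built by | |||
| # gen_supernet elsewhere in this repo). Lookup tables are indexed as | |||
| # flops_dict[block_id][choice_id] and report MFLOPs / MParams: | |||
| #   est = FlopsEst(supernet_model) | |||
| #   print(est.flops_dict[0][0], est.params_dict[0][0])  # cost of choice 0 in block 0 | |||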
| @@ -0,0 +1,42 @@ | |||
| # Copyright (c) Microsoft Corporation. | |||
| # Licensed under the MIT License. | |||
| # Written by Hao Du and Houwen Peng | |||
| # email: haodu8-c@my.cityu.edu.hk and houwen.peng@microsoft.com | |||
| # This dictionary is generated from calculating each operation of each layer to quickly search for layers. | |||
| # flops_op_dict[which_stage][which_operation] = | |||
| # (flops_of_operation_with_stride1, flops_of_operation_with_stride2) | |||
| flops_op_dict = {} | |||
| for i in range(5): | |||
| flops_op_dict[i] = {} | |||
| flops_op_dict[0][0] = (21.828704, 18.820752) | |||
| flops_op_dict[0][1] = (32.669328, 28.16048) | |||
| flops_op_dict[0][2] = (25.039968, 23.637648) | |||
| flops_op_dict[0][3] = (37.486224, 35.385824) | |||
| flops_op_dict[0][4] = (29.856864, 30.862992) | |||
| flops_op_dict[0][5] = (44.711568, 46.22384) | |||
| flops_op_dict[1][0] = (11.808656, 11.86712) | |||
| flops_op_dict[1][1] = (17.68624, 17.780848) | |||
| flops_op_dict[1][2] = (13.01288, 13.87416) | |||
| flops_op_dict[1][3] = (19.492576, 20.791408) | |||
| flops_op_dict[1][4] = (14.819216, 16.88472) | |||
| flops_op_dict[1][5] = (22.20208, 25.307248) | |||
| flops_op_dict[2][0] = (8.198, 10.99632) | |||
| flops_op_dict[2][1] = (12.292848, 16.5172) | |||
| flops_op_dict[2][2] = (8.69976, 11.99984) | |||
| flops_op_dict[2][3] = (13.045488, 18.02248) | |||
| flops_op_dict[2][4] = (9.4524, 13.50512) | |||
| flops_op_dict[2][5] = (14.174448, 20.2804) | |||
| flops_op_dict[3][0] = (12.006112, 15.61632) | |||
| flops_op_dict[3][1] = (18.028752, 23.46096) | |||
| flops_op_dict[3][2] = (13.009632, 16.820544) | |||
| flops_op_dict[3][3] = (19.534032, 25.267296) | |||
| flops_op_dict[3][4] = (14.514912, 18.62688) | |||
| flops_op_dict[3][5] = (21.791952, 27.9768) | |||
| flops_op_dict[4][0] = (11.307456, 15.292416) | |||
| flops_op_dict[4][1] = (17.007072, 23.1504) | |||
| flops_op_dict[4][2] = (11.608512, 15.894528) | |||
| flops_op_dict[4][3] = (17.458656, 24.053568) | |||
| flops_op_dict[4][4] = (12.060096, 16.797696) | |||
| flops_op_dict[4][5] = (18.136032, 25.40832) | |||
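| # Example lookup (added for clarity): the stride-2 FLOPs of operation 3 in stage 2 is | |||
| #   flops_op_dict[2][3][1]   # -> 18.02248 (MFLOPs) | |||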
| @@ -0,0 +1,47 @@ | |||
| # Copyright (c) Microsoft Corporation. | |||
| # Licensed under the MIT License. | |||
| # Written by Hao Du and Houwen Peng | |||
| # email: haodu8-c@my.cityu.edu.hk and houwen.peng@microsoft.com | |||
| def search_for_layer(flops_op_dict, arch_def, flops_minimum, flops_maximum): | |||
| sta_num = [1, 1, 1, 1, 1] | |||
| order = [2, 3, 4, 1, 0, 2, 3, 4, 1, 0] | |||
| limits = [3, 3, 3, 2, 2, 4, 4, 4, 4, 4] | |||
| size_factor = 224 // 32 | |||
| base_min_flops = sum([flops_op_dict[i][0][0] for i in range(5)]) | |||
| base_max_flops = sum([flops_op_dict[i][5][0] for i in range(5)]) | |||
| if base_min_flops > flops_maximum: | |||
| while base_min_flops > flops_maximum and size_factor >= 2: | |||
| size_factor = size_factor - 1 | |||
| flops_minimum = flops_minimum * (7. / size_factor) | |||
| flops_maximum = flops_maximum * (7. / size_factor) | |||
| if size_factor < 2: | |||
| return None, None, None | |||
| elif base_max_flops < flops_minimum: | |||
| cur_ptr = 0 | |||
| while base_max_flops < flops_minimum and cur_ptr <= 9: | |||
| if sta_num[order[cur_ptr]] >= limits[cur_ptr]: | |||
| cur_ptr += 1 | |||
| continue | |||
| base_max_flops = base_max_flops + flops_op_dict[order[cur_ptr]][5][1] | |||
| sta_num[order[cur_ptr]] += 1 | |||
| if cur_ptr > 7 and base_max_flops < flops_minimum: | |||
| return None, None, None | |||
| cur_ptr = 0 | |||
| while cur_ptr <= 9: | |||
| if sta_num[order[cur_ptr]] >= limits[cur_ptr]: | |||
| cur_ptr += 1 | |||
| continue | |||
| base_max_flops = base_max_flops + flops_op_dict[order[cur_ptr]][5][1] | |||
| if base_max_flops <= flops_maximum: | |||
| sta_num[order[cur_ptr]] += 1 | |||
| else: | |||
| break | |||
| arch_def = [item[:i] for i, item in zip([1] + sta_num + [1], arch_def)] | |||
| # print(arch_def) | |||
| return sta_num, arch_def, size_factor * 32 | |||
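| # Illustrative call (added for clarity; mirrors gen_supernet): given the per-stage FLOPs | |||
| # table and a target FLOPs window in MFLOPs, the search returns per-stage block counts, | |||
| # the trimmed arch_def and the input resolution: | |||
| #   sta_num, arch_def, resolution = search_for_layer(flops_op_dict, arch_def, 0, 600) | |||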
| @@ -0,0 +1,178 @@ | |||
| # Copyright (c) Microsoft Corporation. | |||
| # Licensed under the MIT License. | |||
| # Written by Hao Du and Houwen Peng | |||
| # email: haodu8-c@my.cityu.edu.hk and houwen.peng@microsoft.com | |||
| import sys | |||
| import torch | |||
| import logging | |||
| import argparse | |||
| import torch.nn as nn | |||
| from copy import deepcopy | |||
| from torch import optim as optim | |||
| from thop import profile, clever_format | |||
| from timm.utils import * | |||
| from ..config import cfg | |||
| def get_path_acc(model, path, val_loader, args, val_iters=50): | |||
| prec1_m = AverageMeter() | |||
| prec5_m = AverageMeter() | |||
| with torch.no_grad(): | |||
| for batch_idx, (input, target) in enumerate(val_loader): | |||
| if batch_idx >= val_iters: | |||
| break | |||
| if not args.prefetcher: | |||
| input = input.cuda() | |||
| target = target.cuda() | |||
| output = model(input, path) | |||
| if isinstance(output, (tuple, list)): | |||
| output = output[0] | |||
| # augmentation reduction | |||
| reduce_factor = args.tta | |||
| if reduce_factor > 1: | |||
| output = output.unfold( | |||
| 0, | |||
| reduce_factor, | |||
| reduce_factor).mean( | |||
| dim=2) | |||
| target = target[0:target.size(0):reduce_factor] | |||
| prec1, prec5 = accuracy(output, target, topk=(1, 5)) | |||
| torch.cuda.synchronize() | |||
| prec1_m.update(prec1.item(), output.size(0)) | |||
| prec5_m.update(prec5.item(), output.size(0)) | |||
| return (prec1_m.avg, prec5_m.avg) | |||
| def get_logger(file_path): | |||
| """ Make python logger """ | |||
| log_format = '%(asctime)s | %(message)s' | |||
| logging.basicConfig(stream=sys.stdout, level=logging.INFO, | |||
| format=log_format, datefmt='%m/%d %I:%M:%S %p') | |||
| logger = logging.getLogger() | |||
| logger.setLevel(logging.INFO) | |||
| formatter = logging.Formatter(log_format, datefmt='%m/%d %I:%M:%S %p') | |||
| file_handler = logging.FileHandler(file_path) | |||
| file_handler.setFormatter(formatter) | |||
| logger.addHandler(file_handler) | |||
| return logger | |||
| def add_weight_decay_supernet(model, args, weight_decay=1e-5, skip_list=()): | |||
| decay = [] | |||
| no_decay = [] | |||
| meta_layer_no_decay = [] | |||
| meta_layer_decay = [] | |||
| for name, param in model.named_parameters(): | |||
| if not param.requires_grad: | |||
| continue # frozen weights | |||
| if len(param.shape) == 1 or name.endswith( | |||
| ".bias") or name in skip_list: | |||
| if 'meta_layer' in name: | |||
| meta_layer_no_decay.append(param) | |||
| else: | |||
| no_decay.append(param) | |||
| else: | |||
| if 'meta_layer' in name: | |||
| meta_layer_decay.append(param) | |||
| else: | |||
| decay.append(param) | |||
| return [ | |||
| {'params': no_decay, 'weight_decay': 0., 'lr': args.lr}, | |||
| {'params': decay, 'weight_decay': weight_decay, 'lr': args.lr}, | |||
| {'params': meta_layer_no_decay, 'weight_decay': 0., 'lr': args.meta_lr}, | |||
| {'params': meta_layer_decay, 'weight_decay': 0, 'lr': args.meta_lr}, | |||
| ] | |||
| def create_optimizer_supernet(args, model, has_apex=False, filter_bias_and_bn=True): | |||
| weight_decay = args.weight_decay | |||
| if args.opt == 'adamw' or args.opt == 'radam': | |||
| weight_decay /= args.lr | |||
| if weight_decay and filter_bias_and_bn: | |||
| parameters = add_weight_decay_supernet(model, args, weight_decay) | |||
| weight_decay = 0. | |||
| else: | |||
| parameters = model.parameters() | |||
| if 'fused' == args.opt: | |||
| assert has_apex and torch.cuda.is_available( | |||
| ), 'APEX and CUDA required for fused optimizers' | |||
| if args.opt == 'sgd' or args.opt == 'nesterov': | |||
| optimizer = optim.SGD( | |||
| parameters, | |||
| momentum=args.momentum, | |||
| weight_decay=weight_decay, | |||
| nesterov=True) | |||
| elif args.opt == 'momentum': | |||
| optimizer = optim.SGD( | |||
| parameters, | |||
| momentum=args.momentum, | |||
| weight_decay=weight_decay, | |||
| nesterov=False) | |||
| elif args.opt == 'adam': | |||
| optimizer = optim.Adam( | |||
| parameters, weight_decay=weight_decay, eps=args.opt_eps) | |||
| else: | |||
| raise ValueError('Invalid optimizer: {}'.format(args.opt)) | |||
| return optimizer | |||
| def convert_lowercase(cfg): | |||
| keys = cfg.keys() | |||
| lowercase_keys = [key.lower() for key in keys] | |||
| values = [cfg.get(key) for key in keys] | |||
| for lowercase_key, value in zip(lowercase_keys, values): | |||
| cfg.setdefault(lowercase_key, value) | |||
| return cfg | |||
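| # Illustrative behaviour (added for clarity; assumes a dict-like yacs config): the call | |||
| # adds a lower-case alias for every existing key without touching the originals, e.g. | |||
| #   cfg = convert_lowercase(cfg)   # afterwards cfg.get('model') is cfg.get('MODEL') | |||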
| # | |||
| # def parse_config_args(exp_name): | |||
| # parser = argparse.ArgumentParser(description=exp_name) | |||
| # parser.add_argument( | |||
| # '--cfg', | |||
| # type=str, | |||
| # default='../experiments/workspace/retrain/retrain.yaml', | |||
| # help='configuration of cream') | |||
| # parser.add_argument('--local_rank', type=int, default=0, | |||
| # help='local_rank') | |||
| # args = parser.parse_args() | |||
| # | |||
| # cfg.merge_from_file(args.cfg) | |||
| # converted_cfg = convert_lowercase(cfg) | |||
| # | |||
| # return args, converted_cfg | |||
| def get_model_flops_params(model, input_size=(1, 3, 224, 224)): | |||
| input = torch.randn(input_size) | |||
| macs, params = profile(deepcopy(model), inputs=(input,), verbose=False) | |||
| macs, params = clever_format([macs, params], "%.3f") | |||
| return macs, params | |||
| def cross_entropy_loss_with_soft_target(pred, soft_target): | |||
| logsoftmax = nn.LogSoftmax(dim=1) | |||
| return torch.mean(torch.sum(- soft_target * logsoftmax(pred), 1)) | |||
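| # Quick illustrative check (added for clarity): with zero logits and a uniform soft | |||
| # target over 4 classes, the loss equals log(4) ~= 1.386: | |||
| #   pred = torch.zeros(2, 4) | |||
| #   soft = torch.full((2, 4), 0.25) | |||
| #   cross_entropy_loss_with_soft_target(pred, soft) | |||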
| def create_supernet_scheduler(optimizer, epochs, num_gpu, batch_size, lr): | |||
| ITERS = epochs * \ | |||
| (1280000 / (num_gpu * batch_size)) | |||
| lr_scheduler = torch.optim.lr_scheduler.LambdaLR(optimizer, lambda step: ( | |||
| lr - step / ITERS) if step <= ITERS else 0, last_epoch=-1) | |||
| return lr_scheduler, epochs | |||
| @@ -0,0 +1,6 @@ | |||
| ## Pretrained models | |||
| The official 14M/43M/114M/287M/481M/604M pretrained models are available in | |||
| [google drive](https://drive.google.com/drive/folders/1CQjyBryZ4F20Rutj7coF8HWFcedApUn2) or | |||
| [Models-Baidu Disk (password: wqw6)](https://pan.baidu.com/s/1TqQNm2s14oEdyNPimw3T9g). | |||
| @@ -0,0 +1,462 @@ | |||
| # Copyright (c) Microsoft Corporation. | |||
| # Licensed under the MIT License. | |||
| # Written by Hao Du and Houwen Peng | |||
| # email: haodu8-c@my.cityu.edu.hk and houwen.peng@microsoft.com | |||
| import sys | |||
| sys.path.append('../..') | |||
| import os | |||
| import json | |||
| import time | |||
| import timm | |||
| import torch | |||
| import numpy as np | |||
| import torch.nn as nn | |||
| from argparse import ArgumentParser | |||
| # from torch.utils.tensorboard import SummaryWriter | |||
| # import timm packages | |||
| from timm.optim import create_optimizer | |||
| from timm.models import resume_checkpoint | |||
| from timm.scheduler import create_scheduler | |||
| from timm.data import Dataset, create_loader | |||
| from timm.utils import CheckpointSaver, ModelEma, update_summary | |||
| from timm.loss import LabelSmoothingCrossEntropy | |||
| # import apex as distributed package | |||
| try: | |||
| from apex import amp | |||
| from apex.parallel import DistributedDataParallel as DDP | |||
| from apex.parallel import convert_syncbn_model | |||
| HAS_APEX = True | |||
| except ImportError as e: | |||
| print(e) | |||
| from torch.nn.parallel import DistributedDataParallel as DDP | |||
| HAS_APEX = False | |||
| # import models and training functions | |||
| from pytorch.utils import mkdirs, save_best_checkpoint, str2bool | |||
| from lib.core.test import validate | |||
| from lib.core.retrain import train_epoch | |||
| from lib.models.structures.childnet import gen_childnet | |||
| from lib.utils.util import get_logger, get_model_flops_params | |||
| from lib.config import DEFAULT_CROP_PCT, IMAGENET_DEFAULT_MEAN, IMAGENET_DEFAULT_STD | |||
| def parse_args(): | |||
| """See lib.utils.config""" | |||
| parser = ArgumentParser() | |||
| # path | |||
| parser.add_argument("--best_checkpoint_dir", type=str, default='./output/best_checkpoint/') | |||
| parser.add_argument("--checkpoint_dir", type=str, default='./output/checkpoints/') | |||
| parser.add_argument("--data_dir", type=str, default='./data') | |||
| parser.add_argument("--experiment_dir", type=str, default='./') | |||
| parser.add_argument("--model_name", type=str, default='retrainer') | |||
| parser.add_argument("--log_path", type=str, default='output/log') | |||
| parser.add_argument("--result_path", type=str, default='output/result.json') | |||
| parser.add_argument("--best_selected_space_path", type=str, | |||
| default='output/selected_space.json') | |||
| # int | |||
| parser.add_argument("--acc_gap", type=int, default=5) | |||
| parser.add_argument("--batch_size", type=int, default=32) | |||
| parser.add_argument("--cooldown_epochs", type=int, default=10) | |||
| parser.add_argument("--decay_epochs", type=int, default=10) | |||
| parser.add_argument("--epochs", type=int, default=200) | |||
| parser.add_argument("--flops_minimum", type=int, default=0) | |||
| parser.add_argument("--flops_maximum", type=int, default=200) | |||
| parser.add_argument("--image_size", type=int, default=224) | |||
| parser.add_argument("--local_rank", type=int, default=0) | |||
| parser.add_argument("--log_interval", type=int, default=50) | |||
| parser.add_argument("--meta_sta_epoch", type=int, default=20) | |||
| parser.add_argument("--num_classes", type=int, default=1000) | |||
| parser.add_argument("--num_gpu", type=int, default=1) | |||
| parser.add_argument("--parience_epochs", type=int, default=10) | |||
| parser.add_argument("--pool_size", type=int, default=10) | |||
| parser.add_argument("--recovery_interval", type=int, default=10) | |||
| parser.add_argument("--trial_id", type=int, default=42) | |||
| parser.add_argument("--selection", type=int, default=-1) | |||
| parser.add_argument("--slice_num", type=int, default=4) | |||
| parser.add_argument("--tta", type=int, default=0) | |||
| parser.add_argument("--update_iter", type=int, default=1300) | |||
| parser.add_argument("--val_batch_mul", type=int, default=4) | |||
| parser.add_argument("--warmup_epochs", type=int, default=3) | |||
| parser.add_argument("--workers", type=int, default=4) | |||
| # float | |||
| parser.add_argument("--color_jitter", type=float, default=0.4) | |||
| parser.add_argument("--decay_rate", type=float, default=0.1) | |||
| parser.add_argument("--dropout_rate", type=float, default=0.0) | |||
| parser.add_argument("--ema_decay", type=float, default=0.998) | |||
| parser.add_argument("--lr", type=float, default=1e-2) | |||
| parser.add_argument("--meta_lr", type=float, default=1e-4) | |||
| parser.add_argument("--re_prob", type=float, default=0.2) | |||
| parser.add_argument("--opt_eps", type=float, default=1e-2) | |||
| parser.add_argument("--momentum", type=float, default=0.9) | |||
| parser.add_argument("--min_lr", type=float, default=1e-5) | |||
| parser.add_argument("--smoothing", type=float, default=0.1) | |||
| parser.add_argument("--weight_decay", type=float, default=1e-4) | |||
| parser.add_argument("--warmup_lr", type=float, default=1e-4) | |||
| # bool | |||
| parser.add_argument("--auto_resume", type=str2bool, default='False') | |||
| parser.add_argument("--dil_conv", type=str2bool, default='False') | |||
| parser.add_argument("--ema_cpu", type=str2bool, default='False') | |||
| parser.add_argument("--pin_mem", type=str2bool, default='True') | |||
| parser.add_argument("--resunit", type=str2bool, default='False') | |||
| parser.add_argument("--save_images", type=str2bool, default='False') | |||
| parser.add_argument("--sync_bn", type=str2bool, default='False') | |||
| parser.add_argument("--use_ema", type=str2bool, default='False') | |||
| parser.add_argument("--verbose", type=str2bool, default='False') | |||
| # str | |||
| parser.add_argument("--aa", type=str, default='rand-m9-mstd0.5') | |||
| parser.add_argument("--eval_metrics", type=str, default='prec1') | |||
| # gp: type of global pool ["avg", "max", "avgmax", "avgmaxc"] | |||
| parser.add_argument("--gp", type=str, default='avg') | |||
| parser.add_argument("--interpolation", type=str, default='bilinear') | |||
| parser.add_argument("--opt", type=str, default='sgd') | |||
| parser.add_argument("--pick_method", type=str, default='meta') | |||
| parser.add_argument("--re_mode", type=str, default='pixel') | |||
| parser.add_argument("--sched", type=str, default='sgd') | |||
| args = parser.parse_args() | |||
| args.sync_bn = False | |||
| args.verbose = False | |||
| args.data_dir = args.data_dir + "/imagenet" | |||
| return args | |||
| def main(): | |||
| args = parse_args() | |||
| mkdirs(args.checkpoint_dir + "/", | |||
| args.experiment_dir, | |||
| args.best_selected_space_path, | |||
| args.result_path) | |||
| with open(args.result_path, "w") as ss_file: | |||
| ss_file.write('') | |||
| if len(args.checkpoint_dir) > 1: | |||
| mkdirs(args.best_checkpoint_dir + "/") | |||
| args.checkpoint_dir = os.path.join( | |||
| args.checkpoint_dir, | |||
| "{}_{}".format(args.model_name, time.strftime("%Y-%m-%d_%H:%M:%S", time.localtime())) | |||
| ) | |||
| if not os.path.exists(args.checkpoint_dir): | |||
| os.mkdir(args.checkpoint_dir) | |||
| # resolve logging | |||
| if args.local_rank == 0: | |||
| logger = get_logger(args.log_path) | |||
| writer = None # SummaryWriter(os.path.join(output_dir, 'runs')) | |||
| else: | |||
| writer, logger = None, None | |||
| # retrain model selection | |||
| if args.selection == -1: | |||
| if os.path.exists(args.best_selected_space_path): | |||
| with open(args.best_selected_space_path, "r") as f: | |||
| arch_list = json.load(f)['selected_space'] | |||
| else: | |||
| args.selection = 14 | |||
| logger.warning("args.best_selected_space_path is not exist. Set selection to 14.") | |||
| if args.selection == 481: | |||
| arch_list = [ | |||
| [0], [ | |||
| 3, 4, 3, 1], [ | |||
| 3, 2, 3, 0], [ | |||
| 3, 3, 3, 1], [ | |||
| 3, 3, 3, 3], [ | |||
| 3, 3, 3, 3], [0]] | |||
| args.image_size = 224 | |||
| elif args.selection == 43: | |||
| arch_list = [[0], [3], [3, 1], [3, 1], [3, 3, 3], [3, 3], [0]] | |||
| args.image_size = 96 | |||
| elif args.selection == 14: | |||
| arch_list = [[0], [3], [3, 3], [3, 3], [3], [3], [0]] | |||
| args.image_size = 64 | |||
| elif args.selection == 112: | |||
| arch_list = [[0], [3], [3, 3], [3, 3], [3, 3, 3], [3, 3], [0]] | |||
| args.image_size = 160 | |||
| elif args.selection == 287: | |||
| arch_list = [[0], [3], [3, 3], [3, 1, 3], [3, 3, 3, 3], [3, 3, 3], [0]] | |||
| args.image_size = 224 | |||
| elif args.selection == 604: | |||
| arch_list = [ | |||
| [0], [ | |||
| 3, 3, 2, 3, 3], [ | |||
| 3, 2, 3, 2, 3], [ | |||
| 3, 2, 3, 2, 3], [ | |||
| 3, 3, 2, 2, 3, 3], [ | |||
| 3, 3, 2, 3, 3, 3], [0]] | |||
| args.image_size = 224 | |||
| elif args.selection == -1: | |||
| args.image_size = 224 | |||
| else: | |||
| raise ValueError("Model Retrain Selection is not Supported!") | |||
| print(arch_list) | |||
| # define childnet architecture from arch_list | |||
| stem = ['ds_r1_k3_s1_e1_c16_se0.25', 'cn_r1_k1_s1_c320_se0.25'] | |||
| # TODO: this param from NNI is different from microsoft/Cream. | |||
| choice_block_pool = ['ir_r1_k3_s2_e4_c24_se0.25', | |||
| 'ir_r1_k5_s2_e4_c40_se0.25', | |||
| 'ir_r1_k3_s2_e6_c80_se0.25', | |||
| 'ir_r1_k3_s1_e6_c96_se0.25', | |||
| 'ir_r1_k5_s2_e6_c192_se0.25'] | |||
| arch_def = [[stem[0]]] + [[choice_block_pool[idx] | |||
| for repeat_times in range(len(arch_list[idx + 1]))] | |||
| for idx in range(len(choice_block_pool))] + [[stem[1]]] | |||
| # generate childnet | |||
| model = gen_childnet( | |||
| arch_list, | |||
| arch_def, | |||
| num_classes=args.num_classes, | |||
| drop_rate=args.dropout_rate, | |||
| global_pool=args.gp) | |||
| # initialize distributed parameters | |||
| distributed = args.num_gpu > 1 | |||
| torch.cuda.set_device(args.local_rank) | |||
| if args.local_rank == 0: | |||
| logger.info( | |||
| 'Training on Process {} with {} GPUs.'.format( | |||
| args.local_rank, args.num_gpu)) | |||
| # fix random seeds | |||
| torch.manual_seed(args.trial_id) | |||
| torch.cuda.manual_seed_all(args.trial_id) | |||
| np.random.seed(args.trial_id) | |||
| torch.backends.cudnn.deterministic = True | |||
| torch.backends.cudnn.benchmark = False | |||
| # get parameters and FLOPs of model | |||
| if args.local_rank == 0: | |||
| macs, params = get_model_flops_params(model, input_size=( | |||
| 1, 3, args.image_size, args.image_size)) | |||
| logger.info( | |||
| '[Model-{}] Flops: {} Params: {}'.format(args.selection, macs, params)) | |||
| # create optimizer | |||
| model = model.cuda() | |||
| optimizer = create_optimizer(args, model) | |||
| # optionally resume from a checkpoint | |||
| resume_epoch = None | |||
| if args.auto_resume: | |||
| if int(timm.__version__[2]) >= 3: | |||
| resume_epoch = resume_checkpoint(model, args.experiment_dir, optimizer) | |||
| else: | |||
| resume_state, resume_epoch = resume_checkpoint(model, args.experiment_dir) | |||
| optimizer.load_state_dict(resume_state['optimizer']) | |||
| del resume_state | |||
| model_ema = None | |||
| if args.use_ema: | |||
| model_ema = ModelEma( | |||
| model, | |||
| decay=args.ema_decay, | |||
| device='cpu' if args.ema_cpu else '', | |||
| resume=args.experiment_dir if args.auto_resume else None) | |||
| # initialize training parameters | |||
| eval_metric = args.eval_metrics | |||
| best_metric, best_epoch, saver = None, None, None | |||
| if args.local_rank == 0: | |||
| decreasing = True if eval_metric == 'loss' else False | |||
| if int(timm.__version__[2]) >= 3: | |||
| saver = CheckpointSaver(model, optimizer, | |||
| checkpoint_dir=args.checkpoint_dir, | |||
| recovery_dir=args.checkpoint_dir, | |||
| model_ema=model_ema, | |||
| decreasing=decreasing, | |||
| max_history=2) | |||
| else: | |||
| saver = CheckpointSaver( | |||
| checkpoint_dir=args.checkpoint_dir, | |||
| recovery_dir=args.checkpoint_dir, | |||
| decreasing=decreasing, | |||
| max_history=2) | |||
| if distributed: | |||
| torch.distributed.init_process_group(backend='nccl', init_method='env://') | |||
| if args.sync_bn: | |||
| try: | |||
| if HAS_APEX: | |||
| model = convert_syncbn_model(model) | |||
| else: | |||
| model = torch.nn.SyncBatchNorm.convert_sync_batchnorm(model) | |||
| if args.local_rank == 0: | |||
| logger.info('Converted model to use Synchronized BatchNorm.') | |||
| except Exception as e: | |||
| if args.local_rank == 0: | |||
| logger.error( | |||
| 'Failed to enable Synchronized BatchNorm: {}. ' | |||
| 'Install Apex or use Torch >= 1.1.'.format(e)) | |||
| if HAS_APEX: | |||
| model = DDP(model, delay_allreduce=True) | |||
| else: | |||
| if args.local_rank == 0: | |||
| logger.info( | |||
| "Using torch DistributedDataParallel. Install NVIDIA Apex for Apex DDP.") | |||
| # can use device str in Torch >= 1.1 | |||
| model = DDP(model, device_ids=[args.local_rank], find_unused_parameters=True) | |||
| # imagenet train dataset | |||
| train_dir = os.path.join(args.data_dir, 'train') | |||
| if not os.path.exists(train_dir) and args.local_rank == 0: | |||
| logger.error('Training folder does not exist at: {}'.format(train_dir)) | |||
| exit(1) | |||
| dataset_train = Dataset(train_dir) | |||
| loader_train = create_loader( | |||
| dataset_train, | |||
| input_size=(3, args.image_size, args.image_size), | |||
| batch_size=args.batch_size, | |||
| is_training=True, | |||
| color_jitter=args.color_jitter, | |||
| auto_augment=args.aa, | |||
| num_aug_splits=0, | |||
| crop_pct=DEFAULT_CROP_PCT, | |||
| mean=IMAGENET_DEFAULT_MEAN, | |||
| std=IMAGENET_DEFAULT_STD, | |||
| num_workers=args.workers, | |||
| distributed=distributed, | |||
| collate_fn=None, | |||
| pin_memory=args.pin_mem, | |||
| interpolation='random', | |||
| re_mode=args.re_mode, | |||
| re_prob=args.re_prob | |||
| ) | |||
| # imagenet validation dataset | |||
| eval_dir = os.path.join(args.data_dir, 'val') | |||
| if not os.path.exists(eval_dir) and args.local_rank == 0: | |||
| logger.error( | |||
| 'Validation folder does not exist at: {}'.format(eval_dir)) | |||
| exit(1) | |||
| dataset_eval = Dataset(eval_dir) | |||
| loader_eval = create_loader( | |||
| dataset_eval, | |||
| input_size=(3, args.image_size, args.image_size), | |||
| batch_size=args.val_batch_mul * args.batch_size, | |||
| is_training=False, | |||
| interpolation=args.interpolation, | |||
| crop_pct=DEFAULT_CROP_PCT, | |||
| mean=IMAGENET_DEFAULT_MEAN, | |||
| std=IMAGENET_DEFAULT_STD, | |||
| num_workers=args.workers, | |||
| distributed=distributed, | |||
| pin_memory=args.pin_mem | |||
| ) | |||
| # whether to use label smoothing | |||
| if args.smoothing > 0.: | |||
| train_loss_fn = LabelSmoothingCrossEntropy( | |||
| smoothing=args.smoothing).cuda() | |||
| validate_loss_fn = nn.CrossEntropyLoss().cuda() | |||
| else: | |||
| train_loss_fn = nn.CrossEntropyLoss().cuda() | |||
| validate_loss_fn = train_loss_fn | |||
| # create learning rate scheduler | |||
| lr_scheduler, num_epochs = create_scheduler(args, optimizer) | |||
| start_epoch = resume_epoch if resume_epoch is not None else 0 | |||
| if start_epoch > 0: | |||
| lr_scheduler.step(start_epoch) | |||
| if args.local_rank == 0: | |||
| logger.info('Scheduled epochs: {}'.format(num_epochs)) | |||
| try: | |||
| best_record, best_ep = 0, 0 | |||
| for epoch in range(start_epoch, num_epochs): | |||
| if distributed: | |||
| loader_train.sampler.set_epoch(epoch) | |||
| train_metrics = train_epoch( | |||
| epoch, | |||
| model, | |||
| loader_train, | |||
| optimizer, | |||
| train_loss_fn, | |||
| args, | |||
| lr_scheduler=lr_scheduler, | |||
| saver=saver, | |||
| output_dir=args.checkpoint_dir, | |||
| model_ema=model_ema, | |||
| logger=logger, | |||
| writer=writer, | |||
| local_rank=args.local_rank) | |||
| eval_metrics = validate( | |||
| epoch, | |||
| model, | |||
| loader_eval, | |||
| validate_loss_fn, | |||
| args, | |||
| logger=logger, | |||
| writer=writer, | |||
| local_rank=args.local_rank, | |||
| result_path=args.result_path | |||
| ) | |||
| if model_ema is not None and not args.ema_cpu: | |||
| ema_eval_metrics = validate( | |||
| epoch, | |||
| model_ema.ema, | |||
| loader_eval, | |||
| validate_loss_fn, | |||
| args, | |||
| log_suffix='_EMA', | |||
| logger=logger, | |||
| writer=writer, | |||
| local_rank=args.local_rank | |||
| ) | |||
| eval_metrics = ema_eval_metrics | |||
| if lr_scheduler is not None: | |||
| lr_scheduler.step(epoch + 1, eval_metrics[eval_metric]) | |||
| update_summary(epoch, train_metrics, eval_metrics, os.path.join( | |||
| args.checkpoint_dir, 'summary.csv'), write_header=best_metric is None) | |||
| if saver is not None: | |||
| # save proper checkpoint with eval metric | |||
| save_metric = eval_metrics[eval_metric] | |||
| if int(timm.__version__[2]) >= 3: | |||
| best_metric, best_epoch = saver.save_checkpoint(epoch, metric=save_metric) | |||
| else: | |||
| best_metric, best_epoch = saver.save_checkpoint( | |||
| model, optimizer, args, | |||
| epoch=epoch, metric=save_metric) | |||
| if best_record < eval_metrics[eval_metric]: | |||
| best_record = eval_metrics[eval_metric] | |||
| best_ep = epoch | |||
| if args.local_rank == 0: | |||
| logger.info( | |||
| '*** Best metric: {0} (epoch {1})'.format(best_record, best_ep)) | |||
| except KeyboardInterrupt: | |||
| pass | |||
| if best_metric is not None: | |||
| logger.info( | |||
| '*** Best metric: {0} (epoch {1})'.format(best_metric, best_epoch)) | |||
| save_best_checkpoint(args.best_checkpoint_dir, model, optimizer, epoch) | |||
| if __name__ == '__main__': | |||
| main() | |||
| @@ -0,0 +1,21 @@ | |||
| import sys | |||
| sys.path.append('../..') | |||
| from pytorch.selector import Selector | |||
| class ClassicnasSelector(Selector): | |||
| def __init__(self, *args, single_candidate=True): | |||
| super().__init__(single_candidate) | |||
| self.args = args | |||
| def fit(self): | |||
| """ | |||
| Only one candidate, so there is nothing to fit. | |||
| """ | |||
| pass | |||
| if __name__ == "__main__": | |||
| hpo_selector = ClassicnasSelector() | |||
| hpo_selector.fit() | |||
| @@ -0,0 +1,167 @@ | |||
| # Copyright (c) Microsoft Corporation. | |||
| # Licensed under the MIT License. | |||
| # Written by Hao Du and Houwen Peng | |||
| # email: haodu8-c@my.cityu.edu.hk and houwen.peng@microsoft.com | |||
| import os | |||
| import warnings | |||
| import datetime | |||
| import torch | |||
| import torch.nn as nn | |||
| # from torch.utils.tensorboard import SummaryWriter | |||
| # import timm packages | |||
| from timm.utils import ModelEma | |||
| from timm.models import resume_checkpoint | |||
| from timm.data import Dataset, create_loader | |||
| # import apex as distributed package | |||
| try: | |||
| from apex.parallel import convert_syncbn_model | |||
| from apex.parallel import DistributedDataParallel as DDP | |||
| HAS_APEX = True | |||
| except ImportError as e: | |||
| print(e) | |||
| from torch.nn.parallel import DistributedDataParallel as DDP | |||
| HAS_APEX = False | |||
| # import models and training functions | |||
| from lib.core.test import validate | |||
| from lib.models.structures.childnet import gen_childnet | |||
| from lib.utils.util import parse_config_args, get_logger, get_model_flops_params | |||
| from lib.config import DEFAULT_CROP_PCT, IMAGENET_DEFAULT_MEAN, IMAGENET_DEFAULT_STD | |||
| def main(): | |||
| args, cfg = parse_config_args('child net testing') | |||
| # resolve logging | |||
| output_dir = os.path.join(cfg.SAVE_PATH, | |||
| "{}-{}".format(datetime.date.today().strftime('%m%d'), | |||
| cfg.MODEL)) | |||
| if not os.path.exists(output_dir): | |||
| os.mkdir(output_dir) | |||
| if args.local_rank == 0: | |||
| logger = get_logger(os.path.join(output_dir, 'test.log')) | |||
| writer = None # SummaryWriter(os.path.join(output_dir, 'runs')) | |||
| else: | |||
| writer, logger = None, None | |||
| # retrain model selection | |||
| if cfg.NET.SELECTION == 481: | |||
| arch_list = [ | |||
| [0], [ | |||
| 3, 4, 3, 1], [ | |||
| 3, 2, 3, 0], [ | |||
| 3, 3, 3, 1], [ | |||
| 3, 3, 3, 3], [ | |||
| 3, 3, 3, 3], [0]] | |||
| cfg.DATASET.IMAGE_SIZE = 224 | |||
| elif cfg.NET.SELECTION == 43: | |||
| arch_list = [[0], [3], [3, 1], [3, 1], [3, 3, 3], [3, 3], [0]] | |||
| cfg.DATASET.IMAGE_SIZE = 96 | |||
| elif cfg.NET.SELECTION == 14: | |||
| arch_list = [[0], [3], [3, 3], [3, 3], [3], [3], [0]] | |||
| cfg.DATASET.IMAGE_SIZE = 64 | |||
| elif cfg.NET.SELECTION == 112: | |||
| arch_list = [[0], [3], [3, 3], [3, 3], [3, 3, 3], [3, 3], [0]] | |||
| cfg.DATASET.IMAGE_SIZE = 160 | |||
| elif cfg.NET.SELECTION == 287: | |||
| arch_list = [[0], [3], [3, 3], [3, 1, 3], [3, 3, 3, 3], [3, 3, 3], [0]] | |||
| cfg.DATASET.IMAGE_SIZE = 224 | |||
| elif cfg.NET.SELECTION == 604: | |||
| arch_list = [[0], [3, 3, 2, 3, 3], [3, 2, 3, 2, 3], [3, 2, 3, 2, 3], | |||
| [3, 3, 2, 2, 3, 3], [3, 3, 2, 3, 3, 3], [0]] | |||
| cfg.DATASET.IMAGE_SIZE = 224 | |||
| else: | |||
| raise ValueError("Model Test Selection is not Supported!") | |||
| # define childnet architecture from arch_list | |||
| stem = ['ds_r1_k3_s1_e1_c16_se0.25', 'cn_r1_k1_s1_c320_se0.25'] | |||
| # TODO: this param from NNI is different from microsoft/Cream. | |||
| choice_block_pool = ['ir_r1_k3_s2_e4_c24_se0.25', | |||
| 'ir_r1_k5_s2_e4_c40_se0.25', | |||
| 'ir_r1_k3_s2_e6_c80_se0.25', | |||
| 'ir_r1_k3_s1_e6_c96_se0.25', | |||
| 'ir_r1_k5_s2_e6_c192_se0.25'] | |||
| arch_def = [[stem[0]]] + [[choice_block_pool[idx] | |||
| for repeat_times in range(len(arch_list[idx + 1]))] | |||
| for idx in range(len(choice_block_pool))] + [[stem[1]]] | |||
| # generate childnet | |||
| model = gen_childnet( | |||
| arch_list, | |||
| arch_def, | |||
| num_classes=cfg.DATASET.NUM_CLASSES, | |||
| drop_rate=cfg.NET.DROPOUT_RATE, | |||
| global_pool=cfg.NET.GP) | |||
| if args.local_rank == 0: | |||
| macs, params = get_model_flops_params(model, input_size=( | |||
| 1, 3, cfg.DATASET.IMAGE_SIZE, cfg.DATASET.IMAGE_SIZE)) | |||
| logger.info( | |||
| '[Model-{}] Flops: {} Params: {}'.format(cfg.NET.SELECTION, macs, params)) | |||
| # initialize distributed parameters | |||
| torch.cuda.set_device(args.local_rank) | |||
| torch.distributed.init_process_group(backend='nccl', init_method='env://') | |||
| if args.local_rank == 0: | |||
| logger.info( | |||
| "Training on Process {} with {} GPUs.".format( | |||
| args.local_rank, cfg.NUM_GPU)) | |||
| # resume model from checkpoint | |||
| assert cfg.AUTO_RESUME is True and os.path.exists(cfg.RESUME_PATH) | |||
| resume_checkpoint(model, cfg.RESUME_PATH) | |||
| model = model.cuda() | |||
| model_ema = None | |||
| if cfg.NET.EMA.USE: | |||
| # Important to create EMA model after cuda(), DP wrapper, and AMP but | |||
| # before SyncBN and DDP wrapper | |||
| model_ema = ModelEma( | |||
| model, | |||
| decay=cfg.NET.EMA.DECAY, | |||
| device='cpu' if cfg.NET.EMA.FORCE_CPU else '', | |||
| resume=cfg.RESUME_PATH) | |||
| # imagenet validation dataset | |||
| eval_dir = os.path.join(cfg.DATA_DIR, 'val') | |||
| if not os.path.exists(eval_dir) and args.local_rank == 0: | |||
| logger.error( | |||
| 'Validation folder does not exist at: {}'.format(eval_dir)) | |||
| exit(1) | |||
| dataset_eval = Dataset(eval_dir) | |||
| loader_eval = create_loader( | |||
| dataset_eval, | |||
| input_size=(3, cfg.DATASET.IMAGE_SIZE, cfg.DATASET.IMAGE_SIZE), | |||
| batch_size=cfg.DATASET.VAL_BATCH_MUL * cfg.DATASET.BATCH_SIZE, | |||
| is_training=False, | |||
| num_workers=cfg.WORKERS, | |||
| distributed=True, | |||
| pin_memory=cfg.DATASET.PIN_MEM, | |||
| crop_pct=DEFAULT_CROP_PCT, | |||
| mean=IMAGENET_DEFAULT_MEAN, | |||
| std=IMAGENET_DEFAULT_STD | |||
| ) | |||
| # validate the raw model, then (if enabled) its EMA weights | |||
| validate_loss_fn = nn.CrossEntropyLoss().cuda() | |||
| validate(0, model, loader_eval, validate_loss_fn, cfg, | |||
| log_suffix='', logger=logger, | |||
| writer=writer, local_rank=args.local_rank) | |||
| if cfg.NET.EMA.USE: | |||
| validate(0, model_ema.ema, loader_eval, validate_loss_fn, cfg, | |||
| log_suffix='_EMA', logger=logger, | |||
| writer=writer, local_rank=args.local_rank) | |||
| if __name__ == '__main__': | |||
| main() | |||
| @@ -0,0 +1,312 @@ | |||
| # https://github.com/microsoft/nni/blob/v2.0/examples/nas/cream/train.py | |||
| import sys | |||
| sys.path.append('../..') | |||
| import os | |||
| import time | |||
| import json | |||
| import torch | |||
| import numpy as np | |||
| import torch.nn as nn | |||
| from argparse import ArgumentParser | |||
| # import timm packages | |||
| from timm.loss import LabelSmoothingCrossEntropy | |||
| from timm.data import Dataset, create_loader | |||
| from timm.models import resume_checkpoint | |||
| # import apex as distributed package | |||
| # try: | |||
| # from apex.parallel import DistributedDataParallel as DDP | |||
| # from apex.parallel import convert_syncbn_model | |||
| # | |||
| # USE_APEX = True | |||
| # except ImportError as e: | |||
| # print(e) | |||
| # from torch.nn.parallel import DistributedDataParallel as DDP | |||
| # | |||
| # USE_APEX = False | |||
| # import models and training functions | |||
| from lib.utils.flops_table import FlopsEst | |||
| from lib.models.structures.supernet import gen_supernet | |||
| from lib.config import DEFAULT_CROP_PCT, IMAGENET_DEFAULT_STD, IMAGENET_DEFAULT_MEAN | |||
| from lib.utils.util import get_logger, \ | |||
| create_optimizer_supernet, create_supernet_scheduler | |||
| from pytorch.utils import mkdirs, str2bool | |||
| from pytorch.callbacks import LRSchedulerCallback | |||
| from pytorch.callbacks import ModelCheckpoint | |||
| from algorithms import CreamSupernetTrainer | |||
| from algorithms import RandomMutator | |||
| def parse_args(): | |||
| """See lib.utils.config""" | |||
| parser = ArgumentParser() | |||
| # path | |||
| parser.add_argument("--checkpoint_dir", type=str, default='') | |||
| parser.add_argument("--data_dir", type=str, default='./data') | |||
| parser.add_argument("--experiment_dir", type=str, default='./') | |||
| parser.add_argument("--model_name", type=str, default='trainer') | |||
| parser.add_argument("--log_path", type=str, default='output/log') | |||
| parser.add_argument("--result_path", type=str, default='output/result.json') | |||
| parser.add_argument("--search_space_path", type=str, default='output/search_space.json') | |||
| parser.add_argument("--best_selected_space_path", type=str, | |||
| default='output/selected_space.json') | |||
| # int | |||
| parser.add_argument("--acc_gap", type=int, default=5) | |||
| parser.add_argument("--batch_size", type=int, default=1) | |||
| parser.add_argument("--epochs", type=int, default=200) | |||
| parser.add_argument("--flops_minimum", type=int, default=0) | |||
| parser.add_argument("--flops_maximum", type=int, default=200) | |||
| parser.add_argument("--image_size", type=int, default=224) | |||
| parser.add_argument("--local_rank", type=int, default=0) | |||
| parser.add_argument("--log_interval", type=int, default=50) | |||
| parser.add_argument("--meta_sta_epoch", type=int, default=20) | |||
| parser.add_argument("--num_classes", type=int, default=1000) | |||
| parser.add_argument("--num_gpu", type=int, default=1) | |||
| parser.add_argument("--pool_size", type=int, default=10) | |||
| parser.add_argument("--trial_id", type=int, default=42) | |||
| parser.add_argument("--slice_num", type=int, default=4) | |||
| parser.add_argument("--tta", type=int, default=0) | |||
| parser.add_argument("--update_iter", type=int, default=1300) | |||
| parser.add_argument("--workers", type=int, default=4) | |||
| # float | |||
| parser.add_argument("--color_jitter", type=float, default=0.4) | |||
| parser.add_argument("--dropout_rate", type=float, default=0.0) | |||
| parser.add_argument("--lr", type=float, default=1e-2) | |||
| parser.add_argument("--meta_lr", type=float, default=1e-4) | |||
| parser.add_argument("--opt_eps", type=float, default=1e-2) | |||
| parser.add_argument("--re_prob", type=float, default=0.2) | |||
| parser.add_argument("--momentum", type=float, default=0.9) | |||
| parser.add_argument("--smoothing", type=float, default=0.1) | |||
| parser.add_argument("--weight_decay", type=float, default=1e-4) | |||
| # bool | |||
| parser.add_argument("--auto_resume", type=str2bool, default='False') | |||
| parser.add_argument("--dil_conv", type=str2bool, default='False') | |||
| parser.add_argument("--resunit", type=str2bool, default='False') | |||
| parser.add_argument("--sync_bn", type=str2bool, default='False') | |||
| parser.add_argument("--verbose", type=str2bool, default='False') | |||
| # str | |||
| # gp: type of global pool ["avg", "max", "avgmax", "avgmaxc"] | |||
| parser.add_argument("--gp", type=str, default='avg') | |||
| parser.add_argument("--interpolation", type=str, default='bilinear') | |||
| parser.add_argument("--opt", type=str, default='sgd') | |||
| parser.add_argument("--pick_method", type=str, default='meta') | |||
| parser.add_argument("--re_mode", type=str, default='pixel') | |||
| args = parser.parse_args() | |||
| args.sync_bn = False | |||
| args.verbose = False | |||
| args.data_dir = args.data_dir + "/imagenet" | |||
| return args | |||
| def main(): | |||
| args = parse_args() | |||
| mkdirs(args.experiment_dir, | |||
| args.best_selected_space_path, | |||
| args.search_space_path, | |||
| args.result_path, | |||
| args.log_path) | |||
| with open(args.result_path, "w") as ss_file: | |||
| ss_file.write('') | |||
| # resolve logging | |||
| if len(args.checkpoint_dir) > 1: | |||
| mkdirs(args.checkpoint_dir + "/") | |||
| args.checkpoint_dir = os.path.join( | |||
| args.checkpoint_dir, | |||
| "{}_{}".format(args.model_name, time.strftime("%Y-%m-%d_%H:%M:%S", time.localtime())) | |||
| ) | |||
| if not os.path.exists(args.checkpoint_dir): | |||
| os.mkdir(args.checkpoint_dir) | |||
| if args.local_rank == 0: | |||
| logger = get_logger(args.log_path) | |||
| else: | |||
| logger = None | |||
| # initialize distributed parameters | |||
| torch.cuda.set_device(args.local_rank) | |||
| # torch.distributed.init_process_group(backend='nccl', init_method='env://') | |||
| if args.local_rank == 0: | |||
| logger.info( | |||
| 'Training on Process %d with %d GPUs.', | |||
| args.local_rank, args.num_gpu) | |||
| # fix random seeds | |||
| torch.manual_seed(args.trial_id) | |||
| torch.cuda.manual_seed_all(args.trial_id) | |||
| np.random.seed(args.trial_id) | |||
| torch.backends.cudnn.deterministic = True | |||
| torch.backends.cudnn.benchmark = False | |||
| # generate supernet and optimizer | |||
| model, sta_num, resolution, search_space = gen_supernet( | |||
| flops_minimum=args.flops_minimum, | |||
| flops_maximum=args.flops_maximum, | |||
| num_classes=args.num_classes, | |||
| drop_rate=args.dropout_rate, | |||
| global_pool=args.gp, | |||
| resunit=args.resunit, | |||
| dil_conv=args.dil_conv, | |||
| slice=args.slice_num, | |||
| verbose=args.verbose, | |||
| logger=logger) | |||
| optimizer = create_optimizer_supernet(args, model) | |||
| # number of choice blocks in supernet | |||
| choice_num = len(model.blocks[7]) | |||
| if args.local_rank == 0: | |||
| logger.info('Supernet created, param count: %d', ( | |||
| sum([m.numel() for m in model.parameters()]))) | |||
| logger.info('resolution: %d', resolution) | |||
| logger.info('choice number: %d', choice_num) | |||
| with open(args.search_space_path, "w") as f: | |||
| print("dump search space.") | |||
| json.dump({'search_space': search_space}, f) | |||
| # initialize flops look-up table | |||
| model_est = FlopsEst(model) | |||
| flops_dict, flops_fixed = model_est.flops_dict, model_est.flops_fixed | |||
| model = model.cuda() | |||
| # convert model to distributed mode | |||
| if args.sync_bn: | |||
| try: | |||
| # if USE_APEX: | |||
| # model = convert_syncbn_model(model) | |||
| # else: | |||
| model = torch.nn.SyncBatchNorm.convert_sync_batchnorm(model) | |||
| if args.local_rank == 0: | |||
| logger.info('Converted model to use Synchronized BatchNorm.') | |||
| except Exception as exception: | |||
| logger.info( | |||
| 'Failed to enable Synchronized BatchNorm; ' | |||
| 'install Apex or Torch >= 1.1. Exception: %s', exception) | |||
| # if USE_APEX: | |||
| # model = DDP(model, delay_allreduce=True) | |||
| # else: | |||
| # if args.local_rank == 0: | |||
| # logger.info( | |||
| # "Using torch DistributedDataParallel. Install NVIDIA Apex for Apex DDP.") | |||
| # # can use device str in Torch >= 1.1 | |||
| # model = DDP(model, device_ids=[args.local_rank], find_unused_parameters=True) | |||
| # optionally resume from a checkpoint | |||
| resume_epoch = None | |||
| if False: # args.auto_resume: | |||
| checkpoint = torch.load(args.experiment_dir) | |||
| model.load_state_dict(checkpoint['child_model_state_dict']) | |||
| optimizer.load_state_dict(checkpoint['optimizer_state_dict']) | |||
| resume_epoch = checkpoint['epoch'] | |||
| # create learning rate scheduler | |||
| lr_scheduler, num_epochs = create_supernet_scheduler(optimizer, args.epochs, args.num_gpu, | |||
| args.batch_size, args.lr) | |||
| start_epoch = resume_epoch if resume_epoch is not None else 0 | |||
| if start_epoch > 0: | |||
| lr_scheduler.step(start_epoch) | |||
| if args.local_rank == 0: | |||
| logger.info('Scheduled epochs: %d', num_epochs) | |||
| # imagenet train dataset | |||
| train_dir = os.path.join(args.data_dir, 'train') | |||
| if not os.path.exists(train_dir): | |||
| logger.info('Training folder does not exist at: %s', train_dir) | |||
| sys.exit() | |||
| dataset_train = Dataset(train_dir) | |||
| loader_train = create_loader( | |||
| dataset_train, | |||
| input_size=(3, args.image_size, args.image_size), | |||
| batch_size=args.batch_size, | |||
| is_training=True, | |||
| use_prefetcher=True, | |||
| re_prob=args.re_prob, | |||
| re_mode=args.re_mode, | |||
| color_jitter=args.color_jitter, | |||
| interpolation='random', | |||
| num_workers=args.workers, | |||
| distributed=False, | |||
| collate_fn=None, | |||
| crop_pct=DEFAULT_CROP_PCT, | |||
| mean=IMAGENET_DEFAULT_MEAN, | |||
| std=IMAGENET_DEFAULT_STD | |||
| ) | |||
| # imagenet validation dataset | |||
| eval_dir = os.path.join(args.data_dir, 'val') | |||
| if not os.path.isdir(eval_dir): | |||
| logger.info('Validation folder does not exist at: %s', eval_dir) | |||
| sys.exit() | |||
| dataset_eval = Dataset(eval_dir) | |||
| loader_eval = create_loader( | |||
| dataset_eval, | |||
| input_size=(3, args.image_size, args.image_size), | |||
| batch_size=4 * args.batch_size, | |||
| is_training=False, | |||
| use_prefetcher=True, | |||
| num_workers=args.workers, | |||
| distributed=False, | |||
| crop_pct=DEFAULT_CROP_PCT, | |||
| mean=IMAGENET_DEFAULT_MEAN, | |||
| std=IMAGENET_DEFAULT_STD, | |||
| interpolation=args.interpolation | |||
| ) | |||
| # whether to use label smoothing | |||
| if args.smoothing > 0.: | |||
| train_loss_fn = LabelSmoothingCrossEntropy( | |||
| smoothing=args.smoothing).cuda() | |||
| validate_loss_fn = nn.CrossEntropyLoss().cuda() | |||
| else: | |||
| train_loss_fn = nn.CrossEntropyLoss().cuda() | |||
| validate_loss_fn = train_loss_fn | |||
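| # With smoothing = 0.1 (the timm formulation above), the smoothed target keeps a weight of | |||
| # 1 - 0.1 = 0.9 on the ground-truth label and spreads the remaining 0.1 uniformly over all | |||
| # classes, which discourages the supernet from becoming over-confident. | |||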
| mutator = RandomMutator(model) | |||
| _callbacks = [LRSchedulerCallback(lr_scheduler)] | |||
| if len(args.checkpoint_dir) > 1: | |||
| _callbacks.append(ModelCheckpoint(args.checkpoint_dir)) | |||
| trainer = CreamSupernetTrainer(args.best_selected_space_path, model, train_loss_fn, | |||
| validate_loss_fn, | |||
| optimizer, num_epochs, loader_train, loader_eval, | |||
| result_path=args.result_path, | |||
| mutator=mutator, | |||
| batch_size=args.batch_size, | |||
| log_frequency=args.log_interval, | |||
| meta_sta_epoch=args.meta_sta_epoch, | |||
| update_iter=args.update_iter, | |||
| slices=args.slice_num, | |||
| pool_size=args.pool_size, | |||
| pick_method=args.pick_method, | |||
| choice_num=choice_num, | |||
| sta_num=sta_num, | |||
| acc_gap=args.acc_gap, | |||
| flops_dict=flops_dict, | |||
| flops_fixed=flops_fixed, | |||
| local_rank=args.local_rank, | |||
| callbacks=_callbacks) | |||
| trainer.train() | |||
| if __name__ == '__main__': | |||
| main() | |||
| @@ -0,0 +1,2 @@ | |||
| from pytorch.darts.dartstrainer import DartsTrainer | |||
| from pytorch.darts.dartsmutator import DartsMutator | |||
| @@ -0,0 +1,205 @@ | |||
| import sys | |||
| sys.path.append('../..') | |||
| import os | |||
| import logging | |||
| import time | |||
| from argparse import ArgumentParser | |||
| import torch | |||
| import torch.nn as nn | |||
| # from torch.utils.tensorboard import SummaryWriter | |||
| import datasets | |||
| import utils | |||
| from model import CNN | |||
| from pytorch.utils import set_seed, mkdirs, init_logger, save_best_checkpoint, AverageMeter | |||
| from pytorch.fixed import apply_fixed_architecture | |||
| from pytorch.retrainer import Retrainer | |||
| logger = logging.getLogger(__name__) | |||
| device = torch.device("cuda" if torch.cuda.is_available() else "cpu") | |||
| # writer = SummaryWriter() | |||
| class DartsRetrainer(Retrainer): | |||
| def __init__(self, aux_weight, grad_clip, epochs, log_frequency): | |||
| self.aux_weight = aux_weight | |||
| self.grad_clip = grad_clip | |||
| self.epochs = epochs | |||
| self.log_frequency = log_frequency | |||
| def train(self, train_loader, model, optimizer, criterion, epoch): | |||
| top1 = AverageMeter("top1") | |||
| top5 = AverageMeter("top5") | |||
| losses = AverageMeter("losses") | |||
| cur_step = epoch * len(train_loader) | |||
| cur_lr = optimizer.param_groups[0]["lr"] | |||
| logger.info("Epoch %d LR %.6f", epoch, cur_lr) | |||
| # writer.add_scalar("lr", cur_lr, global_step=cur_step) | |||
| model.train() | |||
| for step, (x, y) in enumerate(train_loader): | |||
| x, y = x.to(device, non_blocking=True), y.to(device, non_blocking=True) | |||
| bs = x.size(0) | |||
| optimizer.zero_grad() | |||
| logits, aux_logits = model(x) | |||
| loss = criterion(logits, y) | |||
| if self.aux_weight > 0.: | |||
| loss += self.aux_weight * criterion(aux_logits, y) | |||
| loss.backward() | |||
| # gradient clipping | |||
| nn.utils.clip_grad_norm_(model.parameters(), self.grad_clip) | |||
| optimizer.step() | |||
| accuracy = utils.accuracy(logits, y, topk=(1, 5)) | |||
| losses.update(loss.item(), bs) | |||
| top1.update(accuracy["acc1"], bs) | |||
| top5.update(accuracy["acc5"], bs) | |||
| # writer.add_scalar("loss/train", loss.item(), global_step=cur_step) | |||
| # writer.add_scalar("acc1/train", accuracy["acc1"], global_step=cur_step) | |||
| # writer.add_scalar("acc5/train", accuracy["acc5"], global_step=cur_step) | |||
| if step % self.log_frequency == 0 or step == len(train_loader) - 1: | |||
| logger.info( | |||
| "Train: [{:3d}/{}] Step {:03d}/{:03d} Loss {losses.avg:.3f} " | |||
| "Prec@(1,5) ({top1.avg:.1%}, {top5.avg:.1%})".format( | |||
| epoch + 1, self.epochs, step, len(train_loader) - 1, losses=losses, | |||
| top1=top1, top5=top5)) | |||
| cur_step += 1 | |||
| logger.info("Train: [{:3d}/{}] Final Prec@1 {:.4%}".format(epoch + 1, self.epochs, top1.avg)) | |||
| def validate(self, valid_loader, model, criterion, epoch, cur_step): | |||
| top1 = AverageMeter("top1") | |||
| top5 = AverageMeter("top5") | |||
| losses = AverageMeter("losses") | |||
| model.eval() | |||
| with torch.no_grad(): | |||
| for step, (X, y) in enumerate(valid_loader): | |||
| X, y = X.to(device, non_blocking=True), y.to(device, non_blocking=True) | |||
| bs = X.size(0) | |||
| logits = model(X) | |||
| loss = criterion(logits, y) | |||
| accuracy = utils.accuracy(logits, y, topk=(1, 5)) | |||
| losses.update(loss.item(), bs) | |||
| top1.update(accuracy["acc1"], bs) | |||
| top5.update(accuracy["acc5"], bs) | |||
| if step % self.log_frequency == 0 or step == len(valid_loader) - 1: | |||
| logger.info( | |||
| "Valid: [{:3d}/{}] Step {:03d}/{:03d} Loss {losses.avg:.3f} " | |||
| "Prec@(1,5) ({top1.avg:.1%}, {top5.avg:.1%})".format( | |||
| epoch + 1, self.epochs, step, len(valid_loader) - 1, losses=losses, | |||
| top1=top1, top5=top5)) | |||
| # writer.add_scalar("loss/test", losses.avg, global_step=cur_step) | |||
| # writer.add_scalar("acc1/test", top1.avg, global_step=cur_step) | |||
| # writer.add_scalar("acc5/test", top5.avg, global_step=cur_step) | |||
| logger.info("Valid: [{:3d}/{}] Final Prec@1 {:.4%}".format(epoch + 1, self.epochs, top1.avg)) | |||
| return top1.avg | |||
| if __name__ == "__main__": | |||
| parser = ArgumentParser("DARTS retrain") | |||
| parser.add_argument("--data_dir", type=str, | |||
| default='./data/', help="search_space json file") | |||
| parser.add_argument("--result_path", type=str, | |||
| default='.0/result.json', help="training result") | |||
| parser.add_argument("--log_path", type=str, | |||
| default='.0/log', help="log for info") | |||
| parser.add_argument("--best_selected_space_path", type=str, | |||
| default='./best_selected_space.json', help="final best selected space") | |||
| parser.add_argument("--best_checkpoint_dir", type=str, | |||
| default='./', help="default name is best_checkpoint_epoch{}.pth") | |||
| parser.add_argument('--trial_id', type=int, default=0, metavar='N', | |||
| help='trial_id,start from 0') | |||
| parser.add_argument("--layers", default=20, type=int) | |||
| parser.add_argument("--lr", default=0.025, type=float) | |||
| parser.add_argument("--batch_size", default=128, type=int) | |||
| parser.add_argument("--log_frequency", default=10, type=int) | |||
| parser.add_argument("--epochs", default=5, type=int) | |||
| parser.add_argument("--aux_weight", default=0.4, type=float) | |||
| parser.add_argument("--drop_path_prob", default=0.2, type=float) | |||
| parser.add_argument("--workers", default=4, type=int) | |||
| parser.add_argument("--channels", default=36, type=int) | |||
| parser.add_argument("--grad_clip", default=5., type=float) | |||
| parser.add_argument("--class_num", default=10, type=int, help="cifar10") | |||
| args = parser.parse_args() | |||
| mkdirs(args.result_path, args.log_path, args.best_checkpoint_dir) | |||
| init_logger(args.log_path) | |||
| logger.info(args) | |||
| set_seed(args.trial_id) | |||
| dataset_train, dataset_valid = datasets.get_dataset("cifar10", cutout_length=16, root=args.data_dir) | |||
| model = CNN(32, 3, args.channels, args.class_num, args.layers, auxiliary=True) | |||
| apply_fixed_architecture(model, args.best_selected_space_path) | |||
| criterion = nn.CrossEntropyLoss() | |||
| model.to(device) | |||
| criterion.to(device) | |||
| optimizer = torch.optim.SGD(model.parameters(), args.lr, momentum=0.9, weight_decay=3.0E-4) | |||
| lr_scheduler = torch.optim.lr_scheduler.CosineAnnealingLR(optimizer, args.epochs, eta_min=1E-6) | |||
| train_loader = torch.utils.data.DataLoader(dataset_train, | |||
| batch_size=args.batch_size, | |||
| shuffle=True, | |||
| num_workers=args.workers, | |||
| pin_memory=True) | |||
| valid_loader = torch.utils.data.DataLoader(dataset_valid, | |||
| batch_size=args.batch_size, | |||
| shuffle=False, | |||
| num_workers=args.workers, | |||
| pin_memory=True) | |||
| retrainer = DartsRetrainer(aux_weight=args.aux_weight, | |||
| grad_clip=args.grad_clip, | |||
| epochs=args.epochs, | |||
| log_frequency = args.log_frequency) | |||
| # result = {"Accuracy": [], "Cost_time": ''} | |||
| best_top1 = 0. | |||
| start_time = time.time() | |||
| with open(args.result_path, "w") as file: | |||
| file.write('') | |||
| for epoch in range(args.epochs): | |||
| drop_prob = args.drop_path_prob * epoch / args.epochs | |||
| model.drop_path_prob(drop_prob) | |||
| # training | |||
| retrainer.train(train_loader, model, optimizer, criterion, epoch) | |||
| # validation | |||
| cur_step = (epoch + 1) * len(train_loader) | |||
| top1 = retrainer.validate(valid_loader, model, criterion, epoch, cur_step) | |||
| # The backend filters terminal output for this pattern: {"type": "Accuracy", "result": {"sequence": 1, "category": "epoch", "value": 96.7}} | |||
| logger.info({"type": "Accuracy", "result": {"sequence": epoch, "category": "epoch", "value": top1}}) | |||
| with open(args.result_path, "a") as file: | |||
| file.write(str({"type": "Accuracy", "result": {"sequence": epoch, "category": "epoch", "value": top1}}) + '\n') | |||
| # result["Accuracy"].append(top1) | |||
| best_top1 = max(best_top1, top1) | |||
| lr_scheduler.step() | |||
| logger.info("Final best Prec@1 = {:.4%}".format(best_top1)) | |||
| cost_time = time.time() - start_time | |||
| # The backend filters terminal output for this pattern: {"type": "Cost_time", "result": {"value": "* s"}} | |||
| logger.info({"type": "Cost_time", "result": {"value": str(cost_time) + ' s'}}) | |||
| with open(args.result_path, "a") as file: | |||
| file.write(str({"type": "Cost_time", "result": {"value": str(cost_time) + ' s'}})) | |||
| # result["Cost_time"] = str(cost_time) + ' s' | |||
| # dump_global_result(args.result_path, result) | |||
| save_best_checkpoint(args.best_checkpoint_dir, model, optimizer, epoch) | |||
| logger.info("Save best checkpoint in {}".format(os.path.join(args.best_checkpoint_dir, "best_checkpoint_epoch{}.pth".format(epoch)))) | |||
| @@ -0,0 +1,21 @@ | |||
| import sys | |||
| sys.path.append('../..') | |||
| from pytorch.selector import Selector | |||
| from argparse import ArgumentParser | |||
| class DartsSelector(Selector): | |||
| def __init__(self, single_candidate=True): | |||
| super().__init__(single_candidate) | |||
| def fit(self): | |||
| pass | |||
| if __name__ == "__main__": | |||
| parser = ArgumentParser("DARTS select") | |||
| parser.add_argument("--best_selected_space_path", type=str, | |||
| default='./best_selected_space.json', help="final best selected space") | |||
| args = parser.parse_args() | |||
| darts_selector = DartsSelector(True) | |||
| darts_selector.fit() | |||
| @@ -0,0 +1,85 @@ | |||
| # Copyright (c) Microsoft Corporation. | |||
| # Licensed under the MIT license. | |||
| import sys | |||
| sys.path.append('../..') | |||
| import time | |||
| from argparse import ArgumentParser | |||
| import torch | |||
| import torch.nn as nn | |||
| import datasets | |||
| from model import CNN | |||
| from utils import accuracy | |||
| from dartstrainer import DartsTrainer | |||
| from pytorch.utils import * | |||
| from pytorch.callbacks import BestArchitectureCheckpoint, LRSchedulerCallback | |||
| logger = logging.getLogger(__name__) | |||
| if __name__ == "__main__": | |||
| parser = ArgumentParser("DARTS train") | |||
| parser.add_argument("--data_dir", type=str, | |||
| default='../data/', help="search_space json file") | |||
| parser.add_argument("--result_path", type=str, | |||
| default='.0/result.json', help="training result") | |||
| parser.add_argument("--log_path", type=str, | |||
| default='.0/log', help="log for info") | |||
| parser.add_argument("--search_space_path", type=str, | |||
| default='./search_space.json', help="search space of PDARTS") | |||
| parser.add_argument("--best_selected_space_path", type=str, | |||
| default='./best_selected_space.json', help="final best selected space") | |||
| parser.add_argument('--trial_id', type=int, default=0, metavar='N', | |||
| help='trial_id,start from 0') | |||
| parser.add_argument("--layers", default=8, type=int) | |||
| parser.add_argument("--batch_size", default=64, type=int) | |||
| parser.add_argument("--log_frequency", default=10, type=int) | |||
| parser.add_argument("--epochs", default=5, type=int) | |||
| parser.add_argument("--channels", default=16, type=int) | |||
| parser.add_argument('--model_lr', type=float, default=0.025, help='learning rate for training model weights') | |||
| parser.add_argument('--arch_lr', type=float, default=3e-4, help='learning rate for training architecture') | |||
| parser.add_argument("--unrolled", default=False, action="store_true") | |||
| parser.add_argument("--visualization", default=False, action="store_true") | |||
| parser.add_argument("--class_num", default=10, type=int, help="cifar10") | |||
| args = parser.parse_args() | |||
| mkdirs(args.result_path, args.log_path, args.search_space_path, args.best_selected_space_path) | |||
| init_logger(args.log_path, "info") | |||
| logger.info(args) | |||
| set_seed(args.trial_id) | |||
| dataset_train, dataset_valid = datasets.get_dataset("cifar10", root=args.data_dir) | |||
| model = CNN(32, 3, args.channels, args.class_num, args.layers) | |||
| criterion = nn.CrossEntropyLoss() | |||
| optim = torch.optim.SGD(model.parameters(), args.model_lr, momentum=0.9, weight_decay=3.0E-4) | |||
| lr_scheduler = torch.optim.lr_scheduler.CosineAnnealingLR(optim, args.epochs, eta_min=0.001) | |||
| trainer = DartsTrainer(model, | |||
| loss=criterion, | |||
| metrics=lambda output, target: accuracy(output, target, topk=(1,)), | |||
| optimizer=optim, | |||
| num_epochs=args.epochs, | |||
| dataset_train=dataset_train, | |||
| dataset_valid=dataset_valid, | |||
| search_space_path = args.search_space_path, | |||
| batch_size=args.batch_size, | |||
| log_frequency=args.log_frequency, | |||
| result_path=args.result_path, | |||
| unrolled=args.unrolled, | |||
| arch_lr=args.arch_lr, | |||
| callbacks=[LRSchedulerCallback(lr_scheduler), BestArchitectureCheckpoint(args.best_selected_space_path, args.epochs)]) | |||
| if args.visualization: | |||
| trainer.enable_visualization() | |||
| t1 = time.time() | |||
| trainer.train() | |||
| # res_json = trainer.result | |||
| cost_time = time.time() - t1 | |||
| # The backend filters terminal output for this pattern: {"type": "Cost_time", "result": {"value": "* s"}} | |||
| logger.info({"type": "Cost_time", "result": {"value": str(cost_time) + ' s'}}) | |||
| with open(args.result_path, "a") as file: | |||
| file.write(str({"type": "Cost_time", "result": {"value": str(cost_time) + ' s'}})) | |||
| # res_json["Cost_time"] = str(cost_time) + ' s' | |||
| # dump_global_result(args.result_path, res_json) | |||
| @@ -0,0 +1,134 @@ | |||
| # Copyright (c) Microsoft Corporation. | |||
| # Licensed under the MIT license. | |||
| import logging | |||
| import torch | |||
| import torch.nn as nn | |||
| import torch.nn.functional as F | |||
| from collections import OrderedDict | |||
| from pytorch.mutator import Mutator | |||
| from pytorch.mutables import LayerChoice, InputChoice | |||
| _logger = logging.getLogger(__name__) | |||
| class DartsMutator(Mutator): | |||
| """ | |||
| Connects the model in a DARTS (differentiable) way. | |||
| An extra connection is automatically inserted for each LayerChoice; when this connection is selected, no op is applied | |||
| on that LayerChoice (i.e. a ``ZeroOp``), in which case every element in the exported choice list is ``false`` | |||
| (not chosen). | |||
| All input choices are fully connected during the search phase. On export, an input choice selects its inputs based on | |||
| the keys in ``choose_from``. If a key refers to a LayerChoice, the top logit of that LayerChoice joins the competition | |||
| against the other logits; otherwise its logit is assumed to be 0. | |||
| It's possible to cut branches by setting the corresponding position of parameter ``choices`` to ``-inf``; after softmax | |||
| the value becomes 0, and the framework ignores 0 values and does not connect them. Note that the gradient at the | |||
| ``-inf`` position will be 0, and since arithmetic involving ``-inf`` can produce ``nan``, the gradient update phase | |||
| needs to be handled carefully. | |||
| Attributes | |||
| ---------- | |||
| choices: ParameterDict | |||
| dict that maps keys of LayerChoices to weighted-connection float tensors. | |||
| """ | |||
| def __init__(self, model): | |||
| super().__init__(model) | |||
| self.choices = nn.ParameterDict() | |||
| for mutable in self.mutables: | |||
| if isinstance(mutable, LayerChoice): | |||
| self.choices[mutable.key] = nn.Parameter(1.0E-3 * torch.randn(mutable.length + 1)) | |||
| def device(self): | |||
| for v in self.choices.values(): | |||
| return v.device | |||
| def sample_search(self): | |||
| result = dict() | |||
| for mutable in self.mutables: | |||
| if isinstance(mutable, LayerChoice): | |||
| result[mutable.key] = F.softmax(self.choices[mutable.key], dim=-1)[:-1] | |||
| elif isinstance(mutable, InputChoice): | |||
| result[mutable.key] = torch.ones(mutable.n_candidates, dtype=torch.bool, device=self.device()) | |||
| return result | |||
| def sample_final(self): | |||
| result = dict() | |||
| edges_max = dict() | |||
| for mutable in self.mutables: | |||
| if isinstance(mutable, LayerChoice): | |||
| max_val, index = torch.max(F.softmax(self.choices[mutable.key], dim=-1)[:-1], 0) | |||
| edges_max[mutable.key] = max_val | |||
| result[mutable.key] = F.one_hot(index, num_classes=len(mutable)).view(-1).bool() | |||
| for mutable in self.mutables: | |||
| if isinstance(mutable, InputChoice): | |||
| if mutable.n_chosen is not None: | |||
| weights = [] | |||
| for src_key in mutable.choose_from: | |||
| if src_key not in edges_max: | |||
| _logger.warning("InputChoice.NO_KEY in '%s' is weighted 0 when selecting inputs.", mutable.key) | |||
| weights.append(edges_max.get(src_key, 0.)) | |||
| weights = torch.tensor(weights) # pylint: disable=not-callable | |||
| _, topk_edge_indices = torch.topk(weights, mutable.n_chosen) | |||
| selected_multihot = [] | |||
| for i, src_key in enumerate(mutable.choose_from): | |||
| if i not in topk_edge_indices and src_key in result: | |||
| # If an edge is never selected, there is no need to calculate any op on this edge. | |||
| # This is to eliminate redundant calculation. | |||
| result[src_key] = torch.zeros_like(result[src_key]) | |||
| selected_multihot.append(i in topk_edge_indices) | |||
| result[mutable.key] = torch.tensor(selected_multihot, dtype=torch.bool, device=self.device()) # pylint: disable=not-callable | |||
| else: | |||
| result[mutable.key] = torch.ones(mutable.n_candidates, dtype=torch.bool, device=self.device()) # pylint: disable=not-callable | |||
| return result | |||
| def _generate_search_space(self): | |||
| """ | |||
| Generate search space from mutables. | |||
| Here is the search space format: | |||
| :: | |||
| { key_name: {"_type": "layer_choice", | |||
| "_value": ["conv1", "conv2"]} } | |||
| { key_name: {"_type": "input_choice", | |||
| "_value": {"candidates": ["in1", "in2"], | |||
| "n_chosen": 1}} } | |||
| Returns | |||
| ------- | |||
| dict | |||
| the generated search space | |||
| """ | |||
| res = OrderedDict() | |||
| res["op_list"] = OrderedDict() | |||
| res["search_space"] = OrderedDict() | |||
| # res["normal_cell"] = OrderedDict(), | |||
| # res["reduction_cell"] = OrderedDict() | |||
| keys = [] | |||
| for mutable in self.mutables: | |||
| # for now we only generate flattened search space | |||
| if (len(res["search_space"])) >= 36: | |||
| break | |||
| if isinstance(mutable, LayerChoice): | |||
| key = mutable.key | |||
| if key not in keys: | |||
| val = mutable.names | |||
| if not res["op_list"]: | |||
| res["op_list"] = {"_type": "layer_choice", "_value": val + ["none"]} | |||
| # node_type = "normal_cell" if "normal" in key else "reduction_cell" | |||
| res["search_space"][key] = "op_list" | |||
| keys.append(key) | |||
| elif isinstance(mutable, InputChoice): | |||
| key = mutable.key | |||
| if key not in keys: | |||
| # node_type = "normal_cell" if "normal" in key else "reduction_cell" | |||
| res["search_space"][key] = {"_type": "input_choice", | |||
| "_value": {"candidates": mutable.choose_from, | |||
| "n_chosen": mutable.n_chosen}} | |||
| keys.append(key) | |||
| else: | |||
| raise TypeError("Unsupported mutable type: '%s'." % type(mutable)) | |||
| return res | |||
| @@ -0,0 +1,227 @@ | |||
| # Copyright (c) Microsoft Corporation. | |||
| # Licensed under the MIT license. | |||
| import copy | |||
| import logging | |||
| import torch | |||
| import torch.nn as nn | |||
| from pytorch.trainer import Trainer | |||
| from pytorch.utils import AverageMeterGroup, dump_global_result | |||
| from pytorch.darts.dartsmutator import DartsMutator | |||
| import json | |||
| logger = logging.getLogger(__name__) | |||
| class DartsTrainer(Trainer): | |||
| """ | |||
| DARTS trainer. | |||
| Parameters | |||
| ---------- | |||
| model : nn.Module | |||
| PyTorch model to be trained. | |||
| loss : callable | |||
| Receives logits and ground truth labels, returns a loss tensor. | |||
| metrics : callable | |||
| Receives logits and ground truth labels, returns a dict of metrics. | |||
| optimizer : Optimizer | |||
| The optimizer used for optimizing the model. | |||
| num_epochs : int | |||
| Number of epochs planned for training. | |||
| dataset_train : Dataset | |||
| Dataset for training. Will be split for training weights and architecture weights. | |||
| dataset_valid : Dataset | |||
| Dataset for testing. | |||
| mutator : DartsMutator | |||
| Pass one in to customize the mutator; by default a DartsMutator is instantiated. | |||
| batch_size : int | |||
| Batch size. | |||
| workers : int | |||
| Workers for data loading. | |||
| device : torch.device | |||
| ``torch.device("cpu")`` or ``torch.device("cuda")``. | |||
| log_frequency : int | |||
| Step count per logging. | |||
| callbacks : list of Callback | |||
| list of callbacks to trigger at events. | |||
| arch_lr : float | |||
| Learning rate of architecture parameters. | |||
| unrolled : bool | |||
| ``True`` to use second-order (unrolled) optimization, ``False`` for first-order optimization. | |||
| """ | |||
| def __init__(self, model, loss, metrics, | |||
| optimizer, num_epochs, dataset_train, dataset_valid, search_space_path, result_path, num_pre_epochs=0, | |||
| mutator=None, batch_size=64, workers=4, device=None, log_frequency=None, | |||
| callbacks=None, arch_lr=3.0E-4, unrolled=False): | |||
| super().__init__(model, mutator if mutator is not None else DartsMutator(model), | |||
| loss, metrics, optimizer, num_epochs, dataset_train, dataset_valid, | |||
| batch_size, workers, device, log_frequency, callbacks) | |||
| self.ctrl_optim = torch.optim.Adam(self.mutator.parameters(), arch_lr, betas=(0.5, 0.999), weight_decay=1.0E-3) | |||
| self.unrolled = unrolled | |||
| self.num_pre_epoches = num_pre_epochs | |||
| self.result_path = result_path | |||
| with open(self.result_path, "w") as file: | |||
| file.write('') | |||
| n_train = len(self.dataset_train) | |||
| split = n_train // 2 | |||
| indices = list(range(n_train)) | |||
| train_sampler = torch.utils.data.sampler.SubsetRandomSampler(indices[:split]) | |||
| valid_sampler = torch.utils.data.sampler.SubsetRandomSampler(indices[split:]) | |||
| self.train_loader = torch.utils.data.DataLoader(self.dataset_train, | |||
| batch_size=batch_size, | |||
| sampler=train_sampler, | |||
| num_workers=workers) | |||
| self.valid_loader = torch.utils.data.DataLoader(self.dataset_train, | |||
| batch_size=batch_size, | |||
| sampler=valid_sampler, | |||
| num_workers=workers) | |||
| self.test_loader = torch.utils.data.DataLoader(self.dataset_valid, | |||
| batch_size=batch_size, | |||
| num_workers=workers) | |||
| if search_space_path is not None: | |||
| dump_global_result(search_space_path, self.mutator._generate_search_space()) | |||
| # self.result = {"Accuracy": []} | |||
| def train_one_epoch(self, epoch): | |||
| self.model.train() | |||
| self.mutator.train() | |||
| meters = AverageMeterGroup() | |||
| # t1 = time() | |||
| for step, ((trn_X, trn_y), (val_X, val_y)) in enumerate(zip(self.train_loader, self.valid_loader)): | |||
| trn_X, trn_y = trn_X.to(self.device), trn_y.to(self.device) | |||
| val_X, val_y = val_X.to(self.device), val_y.to(self.device) | |||
| if epoch >= self.num_pre_epoches: | |||
| # phase 1. architecture step | |||
| self.ctrl_optim.zero_grad() | |||
| if self.unrolled: | |||
| self._unrolled_backward(trn_X, trn_y, val_X, val_y) | |||
| else: | |||
| self._backward(val_X, val_y) | |||
| self.ctrl_optim.step() | |||
| # phase 2: child network step | |||
| self.optimizer.zero_grad() | |||
| logits, loss = self._logits_and_loss(trn_X, trn_y) | |||
| loss.backward() | |||
| nn.utils.clip_grad_norm_(self.model.parameters(), 5.) # gradient clipping | |||
| self.optimizer.step() | |||
| metrics = self.metrics(logits, trn_y) | |||
| metrics["loss"] = loss.item() | |||
| meters.update(metrics) | |||
| if self.log_frequency is not None and step % self.log_frequency == 0: | |||
| logger.info("Epoch [%s/%s] Step [%s/%s] %s", epoch + 1, | |||
| self.num_epochs, step + 1, len(self.train_loader), meters) | |||
| def validate_one_epoch(self, epoch, log_print=True): | |||
| self.model.eval() | |||
| self.mutator.eval() | |||
| meters = AverageMeterGroup() | |||
| with torch.no_grad(): | |||
| self.mutator.reset() | |||
| for step, (X, y) in enumerate(self.test_loader): | |||
| X, y = X.to(self.device), y.to(self.device) | |||
| logits = self.model(X) | |||
| metrics = self.metrics(logits, y) | |||
| meters.update(metrics) | |||
| if self.log_frequency is not None and step % self.log_frequency == 0: | |||
| logger.info("Epoch [%s/%s] Step [%s/%s] %s", epoch + 1, | |||
| self.num_epochs, step + 1, len(self.test_loader), meters) | |||
| if log_print: | |||
| # The backend filters terminal output for this pattern: {"type": "Accuracy", "result": {"sequence": 1, "category": "epoch", "value": 96.7}} | |||
| logger.info({"type": "Accuracy", "result": {"sequence": epoch, "category": "epoch", "value": meters.get_last_acc()}}) | |||
| with open(self.result_path, "a") as file: | |||
| file.write(str({"type": "Accuracy", "result": {"sequence": epoch, "category": "epoch", "value": meters.get_last_acc()}}) + '\n') | |||
| # self.result["Accuracy"].append(meters.get_last_acc()) | |||
| def _logits_and_loss(self, X, y): | |||
| self.mutator.reset() | |||
| logits = self.model(X) | |||
| loss = self.loss(logits, y) | |||
| # self._write_graph_status() | |||
| return logits, loss | |||
| def _backward(self, val_X, val_y): | |||
| """ | |||
| Simple backward with gradient descent | |||
| """ | |||
| _, loss = self._logits_and_loss(val_X, val_y) | |||
| loss.backward() | |||
| def _unrolled_backward(self, trn_X, trn_y, val_X, val_y): | |||
| """ | |||
| Compute unrolled loss and backward its gradients | |||
| """ | |||
| backup_params = copy.deepcopy(tuple(self.model.parameters())) | |||
| # do virtual step on training data | |||
| lr = self.optimizer.param_groups[0]["lr"] | |||
| momentum = self.optimizer.param_groups[0]["momentum"] | |||
| weight_decay = self.optimizer.param_groups[0]["weight_decay"] | |||
| self._compute_virtual_model(trn_X, trn_y, lr, momentum, weight_decay) | |||
| # calculate unrolled loss on validation data | |||
| # keep gradients for model here for compute hessian | |||
| _, loss = self._logits_and_loss(val_X, val_y) | |||
| w_model, w_ctrl = tuple(self.model.parameters()), tuple(self.mutator.parameters()) | |||
| w_grads = torch.autograd.grad(loss, w_model + w_ctrl) | |||
| d_model, d_ctrl = w_grads[:len(w_model)], w_grads[len(w_model):] | |||
| # compute hessian and final gradients | |||
| hessian = self._compute_hessian(backup_params, d_model, trn_X, trn_y) | |||
| with torch.no_grad(): | |||
| for param, d, h in zip(w_ctrl, d_ctrl, hessian): | |||
| # gradient = dalpha - lr * hessian | |||
| param.grad = d - lr * h | |||
| # restore weights | |||
| self._restore_weights(backup_params) | |||
| def _compute_virtual_model(self, X, y, lr, momentum, weight_decay): | |||
| """ | |||
| Compute unrolled weights w` | |||
| """ | |||
| # don't need zero_grad, using autograd to calculate gradients | |||
| _, loss = self._logits_and_loss(X, y) | |||
| gradients = torch.autograd.grad(loss, self.model.parameters()) | |||
| with torch.no_grad(): | |||
| for w, g in zip(self.model.parameters(), gradients): | |||
| m = self.optimizer.state[w].get("momentum_buffer", 0.) | |||
| # update in place so the virtual (unrolled) step actually modifies the model weights | |||
| w -= lr * (momentum * m + g + weight_decay * w) | |||
| def _restore_weights(self, backup_params): | |||
| with torch.no_grad(): | |||
| for param, backup in zip(self.model.parameters(), backup_params): | |||
| param.copy_(backup) | |||
| def _compute_hessian(self, backup_params, dw, trn_X, trn_y): | |||
| """ | |||
| dw = dw` { L_val(w`, alpha) } | |||
| w+ = w + eps * dw | |||
| w- = w - eps * dw | |||
| hessian = (dalpha { L_trn(w+, alpha) } - dalpha { L_trn(w-, alpha) }) / (2*eps) | |||
| eps = 0.01 / ||dw|| | |||
| """ | |||
| self._restore_weights(backup_params) | |||
| norm = torch.cat([w.view(-1) for w in dw]).norm() | |||
| eps = 0.01 / norm | |||
| if norm < 1E-8: | |||
| logger.warning("In computing hessian, norm is smaller than 1E-8, cause eps to be %.6f.", norm.item()) | |||
| dalphas = [] | |||
| for e in [eps, -2. * eps]: | |||
| # w+ = w + eps*dw`, w- = w - eps*dw` | |||
| with torch.no_grad(): | |||
| for p, d in zip(self.model.parameters(), dw): | |||
| p += e * d | |||
| _, loss = self._logits_and_loss(trn_X, trn_y) | |||
| dalphas.append(torch.autograd.grad(loss, self.mutator.parameters())) | |||
| dalpha_pos, dalpha_neg = dalphas # dalpha { L_trn(w+) }, # dalpha { L_trn(w-) } | |||
| hessian = [(p - n) / (2. * eps) for p, n in zip(dalpha_pos, dalpha_neg)] | |||
| return hessian | |||
| @@ -0,0 +1,56 @@ | |||
| # Copyright (c) Microsoft Corporation. | |||
| # Licensed under the MIT license. | |||
| import numpy as np | |||
| import torch | |||
| from torchvision import transforms | |||
| from torchvision.datasets import CIFAR10 | |||
| class Cutout(object): | |||
| def __init__(self, length): | |||
| self.length = length | |||
| def __call__(self, img): | |||
| h, w = img.size(1), img.size(2) | |||
| mask = np.ones((h, w), np.float32) | |||
| y = np.random.randint(h) | |||
| x = np.random.randint(w) | |||
| y1 = np.clip(y - self.length // 2, 0, h) | |||
| y2 = np.clip(y + self.length // 2, 0, h) | |||
| x1 = np.clip(x - self.length // 2, 0, w) | |||
| x2 = np.clip(x + self.length // 2, 0, w) | |||
| mask[y1: y2, x1: x2] = 0. | |||
| mask = torch.from_numpy(mask) | |||
| mask = mask.expand_as(img) | |||
| img *= mask | |||
| return img | |||
| def get_dataset(cls, cutout_length=0, root=None): | |||
| MEAN = [0.49139968, 0.48215827, 0.44653124] | |||
| STD = [0.24703233, 0.24348505, 0.26158768] | |||
| transf = [ | |||
| transforms.RandomCrop(32, padding=4), | |||
| transforms.RandomHorizontalFlip() | |||
| ] | |||
| normalize = [ | |||
| transforms.ToTensor(), | |||
| transforms.Normalize(MEAN, STD) | |||
| ] | |||
| cutout = [] | |||
| if cutout_length > 0: | |||
| cutout.append(Cutout(cutout_length)) | |||
| train_transform = transforms.Compose(transf + normalize + cutout) | |||
| valid_transform = transforms.Compose(normalize) | |||
| if cls == "cifar10": | |||
| dataset_train = CIFAR10(root=root, train=True, download=True, transform=train_transform) | |||
| dataset_valid = CIFAR10(root=root, train=False, download=True, transform=valid_transform) | |||
| else: | |||
| raise NotImplementedError | |||
| return dataset_train, dataset_valid | |||
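| # Usage sketch (mirrors the call in darts_retrain.py; the data root is a placeholder): | |||
| # dataset_train, dataset_valid = get_dataset("cifar10", cutout_length=16, root="./data") | |||
| # For CIFAR-10 this yields the standard 50,000-image train split and 10,000-image test split. | |||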
| @@ -0,0 +1,160 @@ | |||
| # Copyright (c) Microsoft Corporation. | |||
| # Licensed under the MIT license. | |||
| from collections import OrderedDict | |||
| import torch | |||
| import torch.nn as nn | |||
| import ops | |||
| import pytorch.mutables as mutables | |||
| class AuxiliaryHead(nn.Module): | |||
| """ Auxiliary head in 2/3 place of network to let the gradient flow well """ | |||
| def __init__(self, input_size, C, n_classes): | |||
| """ assuming input size 7x7 or 8x8 """ | |||
| assert input_size in [7, 8] | |||
| super().__init__() | |||
| self.net = nn.Sequential( | |||
| nn.ReLU(inplace=True), | |||
| nn.AvgPool2d(5, stride=input_size - 5, padding=0, count_include_pad=False), # 2x2 out | |||
| nn.Conv2d(C, 128, kernel_size=1, bias=False), | |||
| nn.BatchNorm2d(128), | |||
| nn.ReLU(inplace=True), | |||
| nn.Conv2d(128, 768, kernel_size=2, bias=False), # 1x1 out | |||
| nn.BatchNorm2d(768), | |||
| nn.ReLU(inplace=True) | |||
| ) | |||
| self.linear = nn.Linear(768, n_classes) | |||
| def forward(self, x): | |||
| out = self.net(x) | |||
| out = out.view(out.size(0), -1) # flatten | |||
| logits = self.linear(out) | |||
| return logits | |||
| class Node(nn.Module): | |||
| def __init__(self, node_id, num_prev_nodes, channels, num_downsample_connect): | |||
| super().__init__() | |||
| self.ops = nn.ModuleList() | |||
| choice_keys = [] | |||
| for i in range(num_prev_nodes): | |||
| stride = 2 if i < num_downsample_connect else 1 | |||
| choice_keys.append("{}_p{}".format(node_id, i)) | |||
| self.ops.append( | |||
| mutables.LayerChoice(OrderedDict([ | |||
| ("maxpool", ops.PoolBN('max', channels, 3, stride, 1, affine=False)), | |||
| ("avgpool", ops.PoolBN('avg', channels, 3, stride, 1, affine=False)), | |||
| ("skipconnect", nn.Identity() if stride == 1 else ops.FactorizedReduce(channels, channels, affine=False)), | |||
| ("sepconv3x3", ops.SepConv(channels, channels, 3, stride, 1, affine=False)), | |||
| ("sepconv5x5", ops.SepConv(channels, channels, 5, stride, 2, affine=False)), | |||
| ("dilconv3x3", ops.DilConv(channels, channels, 3, stride, 2, 2, affine=False)), | |||
| ("dilconv5x5", ops.DilConv(channels, channels, 5, stride, 4, 2, affine=False)) | |||
| ]), key=choice_keys[-1])) | |||
| self.drop_path = ops.DropPath() | |||
| self.input_switch = mutables.InputChoice(choose_from=choice_keys, n_chosen=2, key="{}_switch".format(node_id)) | |||
| def forward(self, prev_nodes): | |||
| assert len(self.ops) == len(prev_nodes) | |||
| out = [op(node) for op, node in zip(self.ops, prev_nodes)] | |||
| out = [self.drop_path(o) if o is not None else None for o in out] | |||
| return self.input_switch(out) | |||
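| # Note: a node constructed with num_prev_nodes = d sees d candidate inputs (the two | |||
| # preprocessed cell inputs plus all earlier nodes). Each edge is a LayerChoice keyed | |||
| # "<node_id>_p<i>" and the InputChoice keyed "<node_id>_switch" keeps exactly 2 of them, | |||
| # which is what entries such as "normal_n2_p0" / "normal_n2_switch" in search_space.json refer to. | |||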
| class Cell(nn.Module): | |||
| def __init__(self, n_nodes, channels_pp, channels_p, channels, reduction_p, reduction): | |||
| super().__init__() | |||
| self.reduction = reduction | |||
| self.n_nodes = n_nodes | |||
| # If previous cell is reduction cell, current input size does not match with | |||
| # output size of cell[k-2]. So the output[k-2] should be reduced by preprocessing. | |||
| if reduction_p: | |||
| self.preproc0 = ops.FactorizedReduce(channels_pp, channels, affine=False) | |||
| else: | |||
| self.preproc0 = ops.StdConv(channels_pp, channels, 1, 1, 0, affine=False) | |||
| self.preproc1 = ops.StdConv(channels_p, channels, 1, 1, 0, affine=False) | |||
| # generate dag | |||
| self.mutable_ops = nn.ModuleList() | |||
| for depth in range(2, self.n_nodes + 2): | |||
| self.mutable_ops.append(Node("{}_n{}".format("reduce" if reduction else "normal", depth), | |||
| depth, channels, 2 if reduction else 0)) | |||
| def forward(self, s0, s1): | |||
| # s0, s1 are the outputs of previous previous cell and previous cell, respectively. | |||
| tensors = [self.preproc0(s0), self.preproc1(s1)] | |||
| for node in self.mutable_ops: | |||
| cur_tensor = node(tensors) | |||
| tensors.append(cur_tensor) | |||
| output = torch.cat(tensors[2:], dim=1) | |||
| return output | |||
| class CNN(nn.Module): | |||
| def __init__(self, input_size, in_channels, channels, n_classes, n_layers, n_nodes=4, | |||
| stem_multiplier=3, auxiliary=False): | |||
| super().__init__() | |||
| self.in_channels = in_channels | |||
| self.channels = channels | |||
| self.n_classes = n_classes | |||
| self.n_layers = n_layers | |||
| self.aux_pos = 2 * n_layers // 3 if auxiliary else -1 | |||
| c_cur = stem_multiplier * self.channels | |||
| self.stem = nn.Sequential( | |||
| nn.Conv2d(in_channels, c_cur, 3, 1, 1, bias=False), | |||
| nn.BatchNorm2d(c_cur) | |||
| ) | |||
| # for the first cell, stem is used for both s0 and s1 | |||
| # [!] channels_pp and channels_p are output channel sizes, while c_cur is the input channel size. | |||
| channels_pp, channels_p, c_cur = c_cur, c_cur, channels | |||
| self.cells = nn.ModuleList() | |||
| reduction_p, reduction = False, False | |||
| for i in range(n_layers): | |||
| reduction_p, reduction = reduction, False | |||
| # Reduce featuremap size and double channels in 1/3 and 2/3 layer. | |||
| if i in [n_layers // 3, 2 * n_layers // 3]: | |||
| c_cur *= 2 | |||
| reduction = True | |||
| cell = Cell(n_nodes, channels_pp, channels_p, c_cur, reduction_p, reduction) | |||
| self.cells.append(cell) | |||
| c_cur_out = c_cur * n_nodes | |||
| channels_pp, channels_p = channels_p, c_cur_out | |||
| if i == self.aux_pos: | |||
| self.aux_head = AuxiliaryHead(input_size // 4, channels_p, n_classes) | |||
| self.gap = nn.AdaptiveAvgPool2d(1) | |||
| self.linear = nn.Linear(channels_p, n_classes) | |||
| def forward(self, x): | |||
| s0 = s1 = self.stem(x) | |||
| aux_logits = None | |||
| for i, cell in enumerate(self.cells): | |||
| s0, s1 = s1, cell(s0, s1) | |||
| if i == self.aux_pos and self.training: | |||
| aux_logits = self.aux_head(s1) | |||
| out = self.gap(s1) | |||
| out = out.view(out.size(0), -1) # flatten | |||
| logits = self.linear(out) | |||
| if aux_logits is not None: | |||
| return logits, aux_logits | |||
| return logits | |||
| def drop_path_prob(self, p): | |||
| for module in self.modules(): | |||
| if isinstance(module, ops.DropPath): | |||
| module.p = p | |||
| @@ -0,0 +1,136 @@ | |||
| # Copyright (c) Microsoft Corporation. | |||
| # Licensed under the MIT license. | |||
| import torch | |||
| import torch.nn as nn | |||
| class DropPath(nn.Module): | |||
| def __init__(self, p=0.): | |||
| """ | |||
| Drop path with probability. | |||
| Parameters | |||
| ---------- | |||
| p : float | |||
| Probability of an path to be zeroed. | |||
| """ | |||
| super().__init__() | |||
| self.p = p | |||
| def forward(self, x): | |||
| if self.training and self.p > 0.: | |||
| keep_prob = 1. - self.p | |||
| # per data point mask | |||
| mask = torch.zeros((x.size(0), 1, 1, 1), device=x.device).bernoulli_(keep_prob) | |||
| return x / keep_prob * mask | |||
| return x | |||
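| # Worked example of the scaling above: with p = 0.2, keep_prob = 0.8, so a surviving sample | |||
| # is multiplied by 1 / 0.8 = 1.25 while a dropped sample is zeroed, keeping the expected | |||
| # activation magnitude unchanged; in eval mode (or p = 0) the input passes through untouched. | |||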
| class PoolBN(nn.Module): | |||
| """ | |||
| AvgPool or MaxPool with BN. `pool_type` must be `max` or `avg`. | |||
| """ | |||
| def __init__(self, pool_type, C, kernel_size, stride, padding, affine=True): | |||
| super().__init__() | |||
| if pool_type.lower() == 'max': | |||
| self.pool = nn.MaxPool2d(kernel_size, stride, padding) | |||
| elif pool_type.lower() == 'avg': | |||
| self.pool = nn.AvgPool2d(kernel_size, stride, padding, count_include_pad=False) | |||
| else: | |||
| raise ValueError() | |||
| self.bn = nn.BatchNorm2d(C, affine=affine) | |||
| def forward(self, x): | |||
| out = self.pool(x) | |||
| out = self.bn(out) | |||
| return out | |||
| class StdConv(nn.Module): | |||
| """ | |||
| Standard conv: ReLU - Conv - BN | |||
| """ | |||
| def __init__(self, C_in, C_out, kernel_size, stride, padding, affine=True): | |||
| super().__init__() | |||
| self.net = nn.Sequential( | |||
| nn.ReLU(), | |||
| nn.Conv2d(C_in, C_out, kernel_size, stride, padding, bias=False), | |||
| nn.BatchNorm2d(C_out, affine=affine) | |||
| ) | |||
| def forward(self, x): | |||
| return self.net(x) | |||
| class FacConv(nn.Module): | |||
| """ | |||
| Factorized conv: ReLU - Conv(Kx1) - Conv(1xK) - BN | |||
| """ | |||
| def __init__(self, C_in, C_out, kernel_length, stride, padding, affine=True): | |||
| super().__init__() | |||
| self.net = nn.Sequential( | |||
| nn.ReLU(), | |||
| nn.Conv2d(C_in, C_in, (kernel_length, 1), stride, padding, bias=False), | |||
| nn.Conv2d(C_in, C_out, (1, kernel_length), stride, padding, bias=False), | |||
| nn.BatchNorm2d(C_out, affine=affine) | |||
| ) | |||
| def forward(self, x): | |||
| return self.net(x) | |||
| class DilConv(nn.Module): | |||
| """ | |||
| (Dilated) depthwise separable conv. | |||
| ReLU - (Dilated) depthwise separable - Pointwise - BN. | |||
| If dilation == 2, 3x3 conv => 5x5 receptive field, 5x5 conv => 9x9 receptive field. | |||
| """ | |||
| def __init__(self, C_in, C_out, kernel_size, stride, padding, dilation, affine=True): | |||
| super().__init__() | |||
| self.net = nn.Sequential( | |||
| nn.ReLU(), | |||
| nn.Conv2d(C_in, C_in, kernel_size, stride, padding, dilation=dilation, groups=C_in, | |||
| bias=False), | |||
| nn.Conv2d(C_in, C_out, 1, stride=1, padding=0, bias=False), | |||
| nn.BatchNorm2d(C_out, affine=affine) | |||
| ) | |||
| def forward(self, x): | |||
| return self.net(x) | |||
| class SepConv(nn.Module): | |||
| """ | |||
| Depthwise separable conv. | |||
| DilConv(dilation=1) * 2. | |||
| """ | |||
| def __init__(self, C_in, C_out, kernel_size, stride, padding, affine=True): | |||
| super().__init__() | |||
| self.net = nn.Sequential( | |||
| DilConv(C_in, C_in, kernel_size, stride, padding, dilation=1, affine=affine), | |||
| DilConv(C_in, C_out, kernel_size, 1, padding, dilation=1, affine=affine) | |||
| ) | |||
| def forward(self, x): | |||
| return self.net(x) | |||
| class FactorizedReduce(nn.Module): | |||
| """ | |||
| Reduce feature map size by factorized pointwise (stride=2). | |||
| """ | |||
| def __init__(self, C_in, C_out, affine=True): | |||
| super().__init__() | |||
| self.relu = nn.ReLU() | |||
| self.conv1 = nn.Conv2d(C_in, C_out // 2, 1, stride=2, padding=0, bias=False) | |||
| self.conv2 = nn.Conv2d(C_in, C_out // 2, 1, stride=2, padding=0, bias=False) | |||
| self.bn = nn.BatchNorm2d(C_out, affine=affine) | |||
| def forward(self, x): | |||
| x = self.relu(x) | |||
| out = torch.cat([self.conv1(x), self.conv2(x[:, :, 1:, 1:])], dim=1) | |||
| out = self.bn(out) | |||
| return out | |||
| @@ -0,0 +1,83 @@ | |||
| # train stage | |||
| `python darts_train.py --data_dir '../data/' --result_path 'trial_id/result.json' --log_path 'trial_id/log' --search_space_path 'experiment_id/search_space.json' --best_selected_space_path 'experiment_id/best_selected_space.json' --trial_id 0 --layers 8 --model_lr 0.025 --arch_lr 3e-4 --epochs 1 --batch_size 64 --channels 16` | |||
| Note: | |||
| `--epochs 1` here is only for a quick debug run; increase it for a real search. | |||
| # select stage | |||
| `python darts_select.py --best_selected_space_path 'experiment_id/best_selected_space.json' ` | |||
| # retrain stage | |||
| `python darts_retrain.py --data_dir '../data/' --result_path 'trial_id/result.json' --log_path 'trial_id/log' --best_selected_space_path 'experiment_id/best_selected_space.json' --best_checkpoint_dir 'experiment_id/' --trial_id 0 --batch_size 96 --epochs 1 --lr 0.025 --layers 20 --channels 36` | |||
| # output file | |||
| `result.json` | |||
| ``` | |||
| {'type': 'Accuracy', 'result': {'sequence': 0, 'category': 'epoch', 'value': 0.1}} | |||
| {'type': 'Accuracy', 'result': {'sequence': 1, 'category': 'epoch', 'value': 0.0}} | |||
| {'type': 'Accuracy', 'result': {'sequence': 2, 'category': 'epoch', 'value': 0.0}} | |||
| {'type': 'Accuracy', 'result': {'sequence': 3, 'category': 'epoch', 'value': 0.0}} | |||
| {'type': 'Accuracy', 'result': {'sequence': 4, 'category': 'epoch', 'value': 0.0}} | |||
| {'type': 'Cost_time', 'result': {'value': '41.614346981048584 s'}} | |||
| ``` | |||
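| Each line of `result.json` is written with Python's `str()` on a dict (note the single quotes), so it is not strict JSON. A minimal parsing sketch, assuming the `trial_id/result.json` path used above (`ast.literal_eval` is just one convenient choice, not something the scripts themselves use): | |||
| ``` | |||
| import ast | |||
|  | |||
| records = [] | |||
| with open("trial_id/result.json") as f: | |||
|     for line in f: | |||
|         line = line.strip() | |||
|         if line: | |||
|             records.append(ast.literal_eval(line)) | |||
| accuracies = [r["result"]["value"] for r in records if r["type"] == "Accuracy"] | |||
| print(accuracies)  # e.g. [0.1, 0.0, 0.0, 0.0, 0.0] for the run above | |||
| ``` | |||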
| `search_space.json` | |||
| ``` | |||
| { | |||
| "op_list": { | |||
| "_type": "layer_choice", | |||
| "_value": [ | |||
| "maxpool", | |||
| "avgpool", | |||
| "skipconnect", | |||
| "sepconv3x3", | |||
| "sepconv5x5", | |||
| "dilconv3x3", | |||
| "dilconv5x5", | |||
| "none" | |||
| ] | |||
| }, | |||
| "search_space": { | |||
| "normal_n2_p0": "op_list", | |||
| "normal_n2_p1": "op_list", | |||
| "normal_n2_switch": { | |||
| "_type": "input_choice", | |||
| "_value": { | |||
| "candidates": [ | |||
| "normal_n2_p0", | |||
| "normal_n2_p1" | |||
| ], | |||
| "n_chosen": 2 | |||
| } | |||
| }, | |||
| ... | |||
| } | |||
| ``` | |||
| `best_selected_space.json` | |||
| ``` | |||
| { | |||
| "normal_n2_p0": "dilconv5x5", | |||
| "normal_n2_p1": "dilconv5x5", | |||
| "normal_n2_switch": [ | |||
| "normal_n2_p0", | |||
| "normal_n2_p1" | |||
| ], | |||
| "normal_n3_p0": "sepconv3x3", | |||
| "normal_n3_p1": "dilconv5x5", | |||
| "normal_n3_p2": [], | |||
| "normal_n3_switch": [ | |||
| "normal_n3_p0", | |||
| "normal_n3_p1" | |||
| ], | |||
| "normal_n4_p0": [], | |||
| "normal_n4_p1": "dilconv5x5", | |||
| "normal_n4_p2": "sepconv5x5", | |||
| "normal_n4_p3": [], | |||
| "normal_n4_switch": [ | |||
| "normal_n4_p1", | |||
| "normal_n4_p2" | |||
| ], | |||
| ... | |||
| } | |||
| ``` | |||
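| The retrain stage consumes `best_selected_space.json` via `--best_selected_space_path`; a minimal, hypothetical way to inspect the chosen architecture by hand: | |||
| ``` | |||
| import json | |||
| with open('experiment_id/best_selected_space.json') as f: | |||
|     arch = json.load(f) | |||
| print(arch['normal_n2_p0'])      # e.g. "dilconv5x5" | |||
| print(arch['normal_n2_switch'])  # e.g. ['normal_n2_p0', 'normal_n2_p1'] | |||
| ``` | |||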
| @@ -0,0 +1,21 @@ | |||
| # Copyright (c) Microsoft Corporation. | |||
| # Licensed under the MIT license. | |||
| def accuracy(output, target, topk=(1,)): | |||
| """ Computes the precision@k for the specified values of k """ | |||
| maxk = max(topk) | |||
| batch_size = target.size(0) | |||
| _, pred = output.topk(maxk, 1, True, True) | |||
| pred = pred.t() | |||
| # one-hot case | |||
| if target.ndimension() > 1: | |||
| target = target.max(1)[1] | |||
| correct = pred.eq(target.view(1, -1).expand_as(pred)) | |||
| res = dict() | |||
| for k in topk: | |||
| correct_k = correct[:k].reshape(-1).float().sum(0) | |||
| res["acc{}".format(k)] = correct_k.mul_(1.0 / batch_size).item() | |||
| return res | |||
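| # Hypothetical usage example, not part of the original file: | |||
| if __name__ == "__main__": | |||
|     import torch | |||
|     logits = torch.tensor([[0.1, 0.9], [0.8, 0.2]])  # two samples, two classes | |||
|     target = torch.tensor([1, 1]) | |||
|     print(accuracy(logits, target, topk=(1,)))  # {'acc1': 0.5} | |||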
| @@ -0,0 +1,331 @@ | |||
| from nvidia.dali.pipeline import Pipeline | |||
| from nvidia.dali import ops | |||
| from nvidia.dali import types | |||
| from nvidia.dali.plugin.pytorch import DALIClassificationIterator | |||
| import numpy as np | |||
| import torch | |||
| from torch import nn | |||
| class HybridTrainPipeline(Pipeline): | |||
| def __init__(self, batch_size, file_root, num_threads, device_id, num_shards, shard_id): | |||
| super(HybridTrainPipeline, self).__init__(batch_size, num_threads, device_id) | |||
| device_type = {0:"cpu"} | |||
| if num_shards == 0: | |||
| self.input = ops.FileReader(file_root = file_root) | |||
| else: | |||
| self.input = ops.FileReader(file_root = file_root, num_shards = num_shards, shard_id = shard_id) | |||
| # ##### The ops below can be modified freely ################### | |||
| self.decode = ops.ImageDecoder(device = device_type.get(num_shards, "mixed"), output_type = types.RGB) | |||
| self.res = ops.RandomResizedCrop(device=device_type.get(num_shards, "gpu"), size = 224) | |||
| self.cmnp = ops.CropMirrorNormalize(device=device_type.get(num_shards, "gpu"), | |||
| dtype = types.FLOAT, # output_dtype=types.FLOAT, | |||
| output_layout=types.NCHW, | |||
| mean=0. ,# if spos_pre else [0.485 * 255, 0.456 * 255, 0.406 * 255], | |||
| std=1. )# if spos_pre else [0.229 * 255, 0.224 * 255, 0.225 * 255]) | |||
| # #################################################### | |||
| def define_graph(self, ): | |||
| jpegs, labels = self.input(name="Reader") | |||
| images = self.decode(jpegs) | |||
| images = self.res(images) | |||
| images = self.cmnp(images) | |||
| return images, labels | |||
| class HybridValPipeline(Pipeline): | |||
| def __init__(self, batch_size, file_root, num_threads, device_id, num_shards, shard_id): | |||
| super(HybridValPipeline, self).__init__(batch_size, num_threads, device_id) | |||
| device_type = {0:"cpu"} | |||
| if num_shards == 0: | |||
| self.input = ops.FileReader(file_root = file_root) | |||
| else: | |||
| self.input = ops.FileReader(file_root = file_root, num_shards = num_shards, shard_id = shard_id) | |||
| # ##### The ops below can be modified freely ################### | |||
| self.decode = ops.ImageDecoder(device = device_type.get(num_shards, "mixed"), output_type = types.RGB) | |||
| self.res = ops.RandomResizedCrop(device=device_type.get(num_shards, "gpu"), size = 224) | |||
| self.cmnp = ops.CropMirrorNormalize(device=device_type.get(num_shards, "gpu"), | |||
| dtype = types.FLOAT, # output_dtype=types.FLOAT, | |||
| output_layout=types.NCHW, | |||
| mean=0. ,# if spos_pre else [0.485 * 255, 0.456 * 255, 0.406 * 255], | |||
| std=1. )# if spos_pre else [0.229 * 255, 0.224 * 255, 0.225 * 255]) | |||
| # #################################################### | |||
| def define_graph(self, ): | |||
| jpegs, labels = self.input(name="Reader") | |||
| images = self.decode(jpegs) | |||
| images = self.res(images) | |||
| images = self.cmnp(images) | |||
| return images, labels | |||
| class TorchWrapper: | |||
| """ | |||
| Wrap one or more DALI pipelines into a single PyTorch-style iterator. | |||
| parameters: | |||
| num_shards : int            number of GPUs used in parallel | |||
| data_loader : DALI iterator(s) built from dali.pipeline.Pipeline outputs | |||
| iter_mode : str             "recursion" or "iter"; how multiple pipelines are merged (default: "recursion") | |||
| """ | |||
| def __init__(self, num_shards, data_loader, iter_mode = "recursion"): | |||
| self.index = 0 | |||
| self.count = 0 | |||
| self.num_shards = num_shards | |||
| self.data_loader = data_loader | |||
| self.iter_mode = iter_mode | |||
| if self.iter_mode not in {"recursion", "iter"}: | |||
| raise Exception("iter_mode should be either 'recursion' or 'iter'") | |||
| def __iter__(self,): | |||
| return self | |||
| def __len__(self, ): | |||
| # return the total number of samples, not the number of batches | |||
| if self.num_shards == 0: | |||
| return self.data_loader.size | |||
| else: | |||
| return len(self.data_loader)*self.data_loader[0].size | |||
| def __next__(self, ): | |||
| if self.num_shards == 0: | |||
| # no GPU is used | |||
| data = next(self.data_loader) | |||
| return data[0]["data"], data[0]["label"].view(-1).long() | |||
| else: | |||
| # one or more GPUs are used | |||
| if self.iter_mode == "recursion": | |||
| return self._get_next_recursion() | |||
| elif self.iter_mode == "iter": | |||
| return self._get_next_iter(self.data_loader[0]) | |||
| def _get_next_iter(self, data_loader): | |||
| if self.count == data_loader.size: | |||
| self.index+=1 | |||
| data_loader = self.data_loader[self.index] | |||
| self.count+=1 | |||
| data = next(data_loader) | |||
| return data[0]["data"], data[0]["label"].view(-1).long() | |||
| def _get_next_recursion(self, ): | |||
| self.index = self.count%self.num_shards | |||
| self.count+=1 | |||
| data_loader = self.data_loader[self.index] | |||
| data = next(data_loader) | |||
| return data[0]["data"], data[0]["label"].view(-1).long() | |||
| def get_iter_dali_cuda(batch_size=256, train_file_root="", val_file_root="", num_threads=4, device_id=[-1], num_shards=0, shard_id=[-1]): | |||
| """ | |||
| Build data iterators that can be fed directly to a PyTorch training loop. | |||
| Reading and preprocessing of the data can be spread across several GPUs: | |||
| 1. create the DALI pipelines | |||
| 2. wrap them into PyTorch-compatible iterators | |||
| 3. merge the per-GPU pipelines into one iterator | |||
| 4. batches are returned as tensors ready for training on CUDA devices | |||
| The dataset directory is expected to look like this: | |||
| images | |||
| |-file_list.txt | |||
| |-images/dog | |||
| |-dog_4.jpg | |||
| |-dog_5.jpg | |||
| |-dog_9.jpg | |||
| |-dog_6.jpg | |||
| |-dog_3.jpg | |||
| |-images/kitten | |||
| |-cat_10.jpg | |||
| |-cat_5.jpg | |||
| |-cat_9.jpg | |||
| |-cat_8.jpg | |||
| |-cat_1.jpg | |||
| parameters: | |||
| batch_size : int          number of samples per batch | |||
| file_root : str           path of the dataset | |||
| num_threads : int         number of CPU threads used for reading data | |||
| device_id : list of int   physical GPU ids | |||
| shard_id : list of int    logical shard ids of the GPUs | |||
| num_shards : int          number of shards (GPUs) the data is split into | |||
| methods: | |||
| get_train_pipeline(shard_id, device_id) : build a DALI pipeline that reads and preprocesses the training data | |||
| get_val_pipeline(shard_id, device_id) : build a DALI pipeline that reads and preprocesses the validation data | |||
| get_dali_iter_for_torch(pipelines, data_num) : wrap the pipelines into PyTorch-compatible iterators | |||
| get_data_size(pipeline) : number of samples each pipeline actually yields (dataset size rounded down to full batches) | |||
| Example: | |||
| # Read training/validation data from TRAIN_PATH and VAL_PATH with batch_size 256, | |||
| # 4 reader threads and 2 GPUs, namely the physical GPUs 0 and 4. | |||
| # For a single GPU, set num_shards = 1, shard_id = [0] and keep device_id as a list. | |||
| # If no GPU should be used, call get_iter_dali_cpu() instead. | |||
| train_data_iter, val_data_iter = get_iter_dali_cuda(batch_size=256, | |||
| train_file_root=TRAIN_PATH, | |||
| val_file_root=VAL_PATH, | |||
| num_threads=4, | |||
| device_id=[0,4], | |||
| num_shards=2, | |||
| shard_id=[0,1]) | |||
| # training loop in PyTorch | |||
| torch_model = TorchModel(para) | |||
| criterion = nn.CrossEntropyLoss() | |||
| optimizer = torch.optim.Adam(torch_model.parameters()) | |||
| for epoch in range(epochs): | |||
| for step, (x, y) in enumerate(train_data_iter): | |||
| # data : x | |||
| # labels : y | |||
| x = x.to("cuda:0") | |||
| y = y.to("cuda:0") | |||
| output = torch_model(x) | |||
| optimizer.zero_grad() | |||
| loss = criterion(output, y) | |||
| loss.backward() | |||
| optimizer.step() | |||
| ... | |||
| ... | |||
| """ | |||
| def get_train_pipeline(shard_id, device_id): | |||
| pipeline = HybridTrainPipeline(batch_size = batch_size, | |||
| file_root = train_file_root, | |||
| num_threads = num_threads, | |||
| num_shards = num_shards, | |||
| shard_id = shard_id, | |||
| device_id = device_id) | |||
| return pipeline | |||
| def get_val_pipeline(shard_id, device_id): | |||
| pipeline = HybridValPipeline(batch_size = batch_size, | |||
| file_root = val_file_root, | |||
| num_threads = num_threads, | |||
| num_shards = num_shards, | |||
| shard_id = shard_id, | |||
| device_id = device_id) | |||
| return pipeline | |||
| pipeline_for_train = [get_train_pipeline(shard_id = shard_id_index, device_id = device_id_index) \ | |||
| for shard_id_index, device_id_index in zip(shard_id, device_id)] | |||
| pipeline_for_val = [get_val_pipeline(shard_id = shard_id_index, device_id = device_id_index) \ | |||
| for shard_id_index, device_id_index in zip(shard_id, device_id)] | |||
| [pipeline.build() for pipeline in pipeline_for_train] | |||
| [pipeline.build() for pipeline in pipeline_for_val] | |||
| def get_data_size(pipeline): | |||
| data_num = pipeline.epoch_size()["Reader"] | |||
| batch_size = pipeline.batch_size | |||
| return data_num//batch_size*batch_size | |||
| data_num_train = get_data_size(pipeline_for_train[0]) | |||
| data_num_val = get_data_size(pipeline_for_val[0]) | |||
| def get_dali_iter_for_torch(pipelines, data_num): | |||
| return [DALIClassificationIterator(pipelines=pipeline, | |||
| last_batch_policy="drop",size = data_num) for pipeline in pipelines] | |||
| data_loader_train = get_dali_iter_for_torch(pipeline_for_train, data_num_train) | |||
| data_loader_val = get_dali_iter_for_torch(pipeline_for_val, data_num_val) | |||
| train_data_iter = TorchWrapper(num_shards, data_loader_train) | |||
| val_data_iter = TorchWrapper(num_shards, data_loader_val) | |||
| return train_data_iter, val_data_iter | |||
| def get_iter_dali_cpu(batch_size=256, train_file_root="", val_file_root="", num_threads=4): | |||
| pipeline_train = HybridTrainPipeline(batch_size = batch_size, | |||
| file_root = train_file_root, | |||
| num_threads = num_threads, | |||
| num_shards = 0, | |||
| shard_id = -1, | |||
| device_id = 0) | |||
| pipeline_val = HybridValPipeline(batch_size = batch_size, | |||
| file_root = val_file_root, | |||
| num_threads = num_threads, | |||
| num_shards = 0, | |||
| shard_id = -1, | |||
| device_id = 0) | |||
| pipeline_train.build() | |||
| pipeline_val.build() | |||
| def get_data_size(pipeline): | |||
| data_num = pipeline.epoch_size()["Reader"] | |||
| batch_size = pipeline.batch_size | |||
| return data_num//batch_size*batch_size | |||
| data_num_train = get_data_size(pipeline_train) | |||
| data_num_val = get_data_size(pipeline_val) | |||
| data_loader_train = DALIClassificationIterator(pipelines=pipeline_train, | |||
| last_batch_policy="drop",size = data_num_train) | |||
| data_loader_val = DALIClassificationIterator(pipelines=pipeline_val, | |||
| last_batch_policy="drop",size = data_num_val) | |||
| train_data_iter = TorchWrapper(0,data_loader_train) | |||
| val_data_iter = TorchWrapper(0,data_loader_val) | |||
| return train_data_iter, val_data_iter | |||
| if __name__ == "__main__": | |||
| PATH = "./imagenet" | |||
| TRAIN_PATH = "./imagenet/train" | |||
| VALID_PATH = "./imagenet/val" | |||
| train_data_iter_cuda, val_data_iter_cuda = get_iter_dali_cuda(batch_size=256, | |||
| train_file_root=TRAIN_PATH, | |||
| val_file_root=TRAIN_PATH, | |||
| num_threads=4, | |||
| device_id=[0,4], | |||
| num_shards=2, | |||
| shard_id=[0,1]) | |||
| train_data_iter_cpu, val_data_iter_cpu = get_iter_dali_cpu(batch_size=256, | |||
| train_file_root=TRAIN_PATH, | |||
| val_file_root=TRAIN_PATH, | |||
| num_threads=4) | |||
| @@ -0,0 +1,46 @@ | |||
| # Efficient Neural Architecture Search (ENAS) | |||
| ## 1. Requirements | |||
| ``` | |||
| torch | |||
| torchvision | |||
| collections | |||
| argparse | |||
| pickle | |||
| pytest-shutil | |||
| ``` | |||
| ## 2. Train | |||
| ### Stage1: search an architecture | |||
| * macro search | |||
| ``` | |||
| python trainer.py --trial_id=0 --search_for macro --best_selected_space_path='./macro_selected_space.json' --result_path='./macro_result.json' | |||
| ``` | |||
| * micro search | |||
| ``` | |||
| python trainer.py --trial_id=0 --search_for micro --best_selected_space_path='./micro_selected_space.json' --result_path='./micro_result.json' | |||
| ``` | |||
| ### Stage2: select (deprecated) | |||
| ``` | |||
| python selector.py | |||
| ``` | |||
| ### Stage3: retrain | |||
| * macro search | |||
| ``` | |||
| python retrainer.py --search_for macro --best_checkpoint_dir='./macro_checkpoint.pth' --best_selected_space_path='./macro_selected_space.json' --result_path='./macro_result.json' | |||
| ``` | |||
| * micro search | |||
| ``` | |||
| python retrainer.py --search_for micro --best_checkpoint_dir='./micro_checkpoint.pth' --best_selected_space_path='./micro_selected_space.json' --result_path='./micro_result.json' | |||
| ``` | |||
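| With the flags used above, the three stages leave behind roughly the following files (macro case shown; the micro case is analogous): | |||
| ``` | |||
| ./macro_selected_space.json   # architecture picked by the controller in stage 1 | |||
| ./macro_checkpoint.pth        # weights saved by retrainer.py in stage 3 | |||
| ./macro_result.json           # accuracy records written during (re)training | |||
| ``` | |||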
| @@ -0,0 +1,5 @@ | |||
| # Copyright (c) Microsoft Corporation. | |||
| # Licensed under the MIT license. | |||
| from .mutator import EnasMutator | |||
| from .trainer import EnasTrainer | |||
| @@ -0,0 +1,28 @@ | |||
| # Copyright (c) Microsoft Corporation. | |||
| # Licensed under the MIT license. | |||
| from torchvision import transforms | |||
| from torchvision.datasets import CIFAR10 | |||
| def get_dataset(cls,datadir): | |||
| MEAN = [0.49139968, 0.48215827, 0.44653124] | |||
| STD = [0.24703233, 0.24348505, 0.26158768] | |||
| transf = [ | |||
| transforms.RandomCrop(32, padding=4), | |||
| transforms.RandomHorizontalFlip() | |||
| ] | |||
| normalize = [ | |||
| transforms.ToTensor(), | |||
| transforms.Normalize(MEAN, STD) | |||
| ] | |||
| train_transform = transforms.Compose(transf + normalize) | |||
| valid_transform = transforms.Compose(normalize) | |||
| if cls == "cifar10": | |||
| dataset_train = CIFAR10(root=datadir, train=True, download=True, transform=train_transform) | |||
| dataset_valid = CIFAR10(root=datadir, train=False, download=True, transform=valid_transform) | |||
| else: | |||
| raise NotImplementedError | |||
| return dataset_train, dataset_valid | |||
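| # Hypothetical smoke test, not part of the original file: downloads CIFAR-10 | |||
| # into ./data on first use (~170 MB) and prints the split sizes. | |||
| if __name__ == "__main__": | |||
|     train_set, valid_set = get_dataset("cifar10", "./data") | |||
|     print(len(train_set), len(valid_set))  # 50000 10000 | |||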
| @@ -0,0 +1,87 @@ | |||
| # Copyright (c) Microsoft Corporation. | |||
| # Licensed under the MIT license. | |||
| import torch.nn as nn | |||
| import sys | |||
| sys.path.append('../..') | |||
| from pytorch import mutables # LayerChoice, InputChoice, MutableScope | |||
| from ops import FactorizedReduce, ConvBranch, PoolBranch | |||
| class ENASLayer(mutables.MutableScope): | |||
| def __init__(self, key, prev_labels, in_filters, out_filters): | |||
| super().__init__(key) | |||
| self.in_filters = in_filters | |||
| self.out_filters = out_filters | |||
| self.mutable = mutables.LayerChoice([ | |||
| ConvBranch(in_filters, out_filters, 3, 1, 1, separable=False), | |||
| ConvBranch(in_filters, out_filters, 3, 1, 1, separable=True), | |||
| ConvBranch(in_filters, out_filters, 5, 1, 2, separable=False), | |||
| ConvBranch(in_filters, out_filters, 5, 1, 2, separable=True), | |||
| PoolBranch('avg', in_filters, out_filters, 3, 1, 1), | |||
| PoolBranch('max', in_filters, out_filters, 3, 1, 1) | |||
| ]) | |||
| if len(prev_labels) > 0: | |||
| self.skipconnect = mutables.InputChoice(choose_from=prev_labels, n_chosen=None) | |||
| else: | |||
| self.skipconnect = None | |||
| self.batch_norm = nn.BatchNorm2d(out_filters, affine=False) | |||
| def forward(self, prev_layers): | |||
| out = self.mutable(prev_layers[-1]) | |||
| if self.skipconnect is not None: | |||
| connection = self.skipconnect(prev_layers[:-1]) | |||
| if connection is not None: | |||
| out += connection | |||
| return self.batch_norm(out) | |||
| class GeneralNetwork(nn.Module): | |||
| def __init__(self, num_layers=12, out_filters=24, in_channels=3, num_classes=10, | |||
| dropout_rate=0.0): | |||
| super().__init__() | |||
| self.num_layers = num_layers | |||
| self.num_classes = num_classes | |||
| self.out_filters = out_filters | |||
| self.stem = nn.Sequential( | |||
| nn.Conv2d(in_channels, out_filters, 3, 1, 1, bias=False), | |||
| nn.BatchNorm2d(out_filters) | |||
| ) | |||
| pool_distance = self.num_layers // 3 | |||
| self.pool_layers_idx = [pool_distance - 1, 2 * pool_distance - 1] | |||
| self.dropout_rate = dropout_rate | |||
| self.dropout = nn.Dropout(self.dropout_rate) | |||
| self.layers = nn.ModuleList() | |||
| self.pool_layers = nn.ModuleList() | |||
| labels = [] | |||
| for layer_id in range(self.num_layers): | |||
| labels.append("layer_{}".format(layer_id)) | |||
| if layer_id in self.pool_layers_idx: | |||
| self.pool_layers.append(FactorizedReduce(self.out_filters, self.out_filters)) | |||
| self.layers.append(ENASLayer(labels[-1], labels[:-1], self.out_filters, self.out_filters)) | |||
| self.gap = nn.AdaptiveAvgPool2d(1) | |||
| self.dense = nn.Linear(self.out_filters, self.num_classes) | |||
| def forward(self, x): | |||
| bs = x.size(0) | |||
| cur = self.stem(x) | |||
| layers = [cur] | |||
| for layer_id in range(self.num_layers): | |||
| cur = self.layers[layer_id](layers) | |||
| layers.append(cur) | |||
| if layer_id in self.pool_layers_idx: | |||
| for i, layer in enumerate(layers): | |||
| layers[i] = self.pool_layers[self.pool_layers_idx.index(layer_id)](layer) | |||
| cur = layers[-1] | |||
| cur = self.gap(cur).view(bs, -1) | |||
| cur = self.dropout(cur) | |||
| logits = self.dense(cur) | |||
| return logits | |||
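| # --------------------------------------------------------------------------- | |||
| # Hypothetical sketch, not part of the original file. GeneralNetwork only | |||
| # declares the search space: before forward() can be called, its LayerChoice / | |||
| # InputChoice modules have to be driven by a mutator (e.g. EnasMutator) or | |||
| # frozen with apply_fixed_architecture(). Here we only build the supernet and | |||
| # count its parameters. | |||
| if __name__ == "__main__": | |||
|     net = GeneralNetwork(num_layers=12, out_filters=24) | |||
|     print(sum(p.numel() for p in net.parameters()), "parameters in the supernet") | |||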
| @@ -0,0 +1,187 @@ | |||
| # Copyright (c) Microsoft Corporation. | |||
| # Licensed under the MIT license. | |||
| import torch | |||
| import torch.nn as nn | |||
| import torch.nn.functional as F | |||
| from pytorch import mutables | |||
| from ops import FactorizedReduce, StdConv, SepConvBN, Pool | |||
| class AuxiliaryHead(nn.Module): | |||
| def __init__(self, in_channels, num_classes): | |||
| super().__init__() | |||
| self.in_channels = in_channels | |||
| self.num_classes = num_classes | |||
| self.pooling = nn.Sequential( | |||
| nn.ReLU(), | |||
| nn.AvgPool2d(5, 3, 2) | |||
| ) | |||
| self.proj = nn.Sequential( | |||
| StdConv(in_channels, 128), | |||
| StdConv(128, 768) | |||
| ) | |||
| self.avg_pool = nn.AdaptiveAvgPool2d(1) | |||
| self.fc = nn.Linear(768, num_classes, bias=False) | |||
| def forward(self, x): | |||
| bs = x.size(0) | |||
| x = self.pooling(x) | |||
| x = self.proj(x) | |||
| x = self.avg_pool(x).view(bs, -1) | |||
| x = self.fc(x) | |||
| return x | |||
| class Cell(nn.Module): | |||
| def __init__(self, cell_name, prev_labels, channels): | |||
| super().__init__() | |||
| self.input_choice = mutables.InputChoice(choose_from=prev_labels, n_chosen=1, return_mask=True, | |||
| key=cell_name + "_input") | |||
| self.op_choice = mutables.LayerChoice([ | |||
| SepConvBN(channels, channels, 3, 1), | |||
| SepConvBN(channels, channels, 5, 2), | |||
| Pool("avg", 3, 1, 1), | |||
| Pool("max", 3, 1, 1), | |||
| nn.Identity() | |||
| ], key=cell_name + "_op") | |||
| def forward(self, prev_layers): | |||
| chosen_input, chosen_mask = self.input_choice(prev_layers) | |||
| cell_out = self.op_choice(chosen_input) | |||
| return cell_out, chosen_mask | |||
| class Node(mutables.MutableScope): | |||
| def __init__(self, node_name, prev_node_names, channels): | |||
| super().__init__(node_name) | |||
| self.cell_x = Cell(node_name + "_x", prev_node_names, channels) | |||
| self.cell_y = Cell(node_name + "_y", prev_node_names, channels) | |||
| def forward(self, prev_layers): | |||
| out_x, mask_x = self.cell_x(prev_layers) | |||
| out_y, mask_y = self.cell_y(prev_layers) | |||
| return out_x + out_y, mask_x | mask_y | |||
| class Calibration(nn.Module): | |||
| def __init__(self, in_channels, out_channels): | |||
| super().__init__() | |||
| self.process = None | |||
| if in_channels != out_channels: | |||
| self.process = StdConv(in_channels, out_channels) | |||
| def forward(self, x): | |||
| if self.process is None: | |||
| return x | |||
| return self.process(x) | |||
| class ReductionLayer(nn.Module): | |||
| def __init__(self, in_channels_pp, in_channels_p, out_channels): | |||
| super().__init__() | |||
| self.reduce0 = FactorizedReduce(in_channels_pp, out_channels, affine=False) | |||
| self.reduce1 = FactorizedReduce(in_channels_p, out_channels, affine=False) | |||
| def forward(self, pprev, prev): | |||
| return self.reduce0(pprev), self.reduce1(prev) | |||
| class ENASLayer(nn.Module): | |||
| def __init__(self, num_nodes, in_channels_pp, in_channels_p, out_channels, reduction): | |||
| super().__init__() | |||
| self.preproc0 = Calibration(in_channels_pp, out_channels) | |||
| self.preproc1 = Calibration(in_channels_p, out_channels) | |||
| self.num_nodes = num_nodes | |||
| name_prefix = "reduce" if reduction else "normal" | |||
| self.nodes = nn.ModuleList() | |||
| node_labels = [mutables.InputChoice.NO_KEY, mutables.InputChoice.NO_KEY] | |||
| for i in range(num_nodes): | |||
| node_labels.append("{}_node_{}".format(name_prefix, i)) | |||
| self.nodes.append(Node(node_labels[-1], node_labels[:-1], out_channels)) | |||
| self.final_conv_w = nn.Parameter(torch.zeros(out_channels, self.num_nodes + 2, out_channels, 1, 1), requires_grad=True) | |||
| self.bn = nn.BatchNorm2d(out_channels, affine=False) | |||
| self.reset_parameters() | |||
| def reset_parameters(self): | |||
| nn.init.kaiming_normal_(self.final_conv_w) | |||
| def forward(self, pprev, prev): | |||
| pprev_, prev_ = self.preproc0(pprev), self.preproc1(prev) | |||
| prev_nodes_out = [pprev_, prev_] | |||
| nodes_used_mask = torch.zeros(self.num_nodes + 2, dtype=torch.bool, device=prev.device) | |||
| for i in range(self.num_nodes): | |||
| node_out, mask = self.nodes[i](prev_nodes_out) | |||
| nodes_used_mask[:mask.size(0)] |= mask.to(node_out.device) | |||
| prev_nodes_out.append(node_out) | |||
| unused_nodes = torch.cat([out for used, out in zip(nodes_used_mask, prev_nodes_out) if not used], 1) | |||
| unused_nodes = F.relu(unused_nodes) | |||
| conv_weight = self.final_conv_w[:, ~nodes_used_mask, :, :, :] | |||
| conv_weight = conv_weight.view(conv_weight.size(0), -1, 1, 1) | |||
| out = F.conv2d(unused_nodes, conv_weight) | |||
| return prev, self.bn(out) | |||
| class MicroNetwork(nn.Module): | |||
| def __init__(self, num_layers=2, num_nodes=5, out_channels=24, in_channels=3, num_classes=10, | |||
| dropout_rate=0.0, use_aux_heads=False): | |||
| super().__init__() | |||
| self.num_layers = num_layers | |||
| self.use_aux_heads = use_aux_heads | |||
| self.stem = nn.Sequential( | |||
| nn.Conv2d(in_channels, out_channels * 3, 3, 1, 1, bias=False), | |||
| nn.BatchNorm2d(out_channels * 3) | |||
| ) | |||
| pool_distance = self.num_layers // 3 | |||
| pool_layers = [pool_distance, 2 * pool_distance + 1] | |||
| self.dropout = nn.Dropout(dropout_rate) | |||
| self.layers = nn.ModuleList() | |||
| c_pp = c_p = out_channels * 3 | |||
| c_cur = out_channels | |||
| for layer_id in range(self.num_layers + 2): | |||
| reduction = False | |||
| if layer_id in pool_layers: | |||
| c_cur, reduction = c_p * 2, True | |||
| self.layers.append(ReductionLayer(c_pp, c_p, c_cur)) | |||
| c_pp = c_p = c_cur | |||
| self.layers.append(ENASLayer(num_nodes, c_pp, c_p, c_cur, reduction)) | |||
| if self.use_aux_heads and layer_id == pool_layers[-1] + 1: | |||
| self.layers.append(AuxiliaryHead(c_cur, num_classes)) | |||
| c_pp, c_p = c_p, c_cur | |||
| self.gap = nn.AdaptiveAvgPool2d(1) | |||
| self.dense = nn.Linear(c_cur, num_classes) | |||
| self.reset_parameters() | |||
| def reset_parameters(self): | |||
| for m in self.modules(): | |||
| if isinstance(m, nn.Conv2d): | |||
| nn.init.kaiming_normal_(m.weight) | |||
| def forward(self, x): | |||
| bs = x.size(0) | |||
| prev = cur = self.stem(x) | |||
| aux_logits = None | |||
| for layer in self.layers: | |||
| if isinstance(layer, AuxiliaryHead): | |||
| if self.training: | |||
| aux_logits = layer(cur) | |||
| else: | |||
| prev, cur = layer(prev, cur) | |||
| cur = self.gap(F.relu(cur)).view(bs, -1) | |||
| cur = self.dropout(cur) | |||
| logits = self.dense(cur) | |||
| if aux_logits is not None: | |||
| return logits, aux_logits | |||
| return logits | |||
| @@ -0,0 +1,197 @@ | |||
| # Copyright (c) Microsoft Corporation. | |||
| # Licensed under the MIT license. | |||
| import torch | |||
| import torch.nn as nn | |||
| import torch.nn.functional as F | |||
| from pytorch.mutator import Mutator | |||
| from pytorch.mutables import LayerChoice, InputChoice, MutableScope | |||
| class StackedLSTMCell(nn.Module): | |||
| def __init__(self, layers, size, bias): | |||
| super().__init__() | |||
| self.lstm_num_layers = layers | |||
| self.lstm_modules = nn.ModuleList([nn.LSTMCell(size, size, bias=bias) | |||
| for _ in range(self.lstm_num_layers)]) | |||
| def forward(self, inputs, hidden): | |||
| prev_c, prev_h = hidden | |||
| next_c, next_h = [], [] | |||
| for i, m in enumerate(self.lstm_modules): | |||
| curr_c, curr_h = m(inputs, (prev_c[i], prev_h[i])) | |||
| next_c.append(curr_c) | |||
| next_h.append(curr_h) | |||
| # current implementation only supports batch size equals 1, | |||
| # but the algorithm does not necessarily have this limitation | |||
| inputs = curr_h[-1].view(1, -1) | |||
| return next_c, next_h | |||
| class EnasMutator(Mutator): | |||
| """ | |||
| A mutator that mutates the graph with RL. | |||
| Parameters | |||
| ---------- | |||
| model : nn.Module | |||
| PyTorch model. | |||
| lstm_size : int | |||
| Controller LSTM hidden units. | |||
| lstm_num_layers : int | |||
| Number of layers for stacked LSTM. | |||
| tanh_constant : float | |||
| Logits will be equal to ``tanh_constant * tanh(logits)``. Don't use ``tanh`` if this value is ``None``. | |||
| cell_exit_extra_step : bool | |||
| If true, RL controller will perform an extra step at the exit of each MutableScope, dump the hidden state | |||
| and mark it as the hidden state of this MutableScope. This aligns with the original implementation of the paper. | |||
| skip_target : float | |||
| Target probability that skipconnect will appear. | |||
| temperature : float | |||
| Temperature constant that divides the logits. | |||
| branch_bias : float | |||
| Manual bias applied to make some operations more likely to be chosen. | |||
| Currently this is implemented with a hardcoded match rule that aligns with the original repo. | |||
| If a mutable has ``reduce`` in its key, all of its op choices | |||
| that contain ``conv`` in their type name will receive an initial bias of ``+self.branch_bias``, while the others | |||
| receive a bias of ``-self.branch_bias``. | |||
| entropy_reduction : str | |||
| Can be one of ``sum`` and ``mean``. How the entropy of multi-input-choice is reduced. | |||
| """ | |||
| def __init__(self, model, lstm_size=64, lstm_num_layers=1, tanh_constant=1.5, cell_exit_extra_step=False, | |||
| skip_target=0.4, temperature=None, branch_bias=0.25, entropy_reduction="sum"): | |||
| super().__init__(model) | |||
| self.lstm_size = lstm_size | |||
| self.lstm_num_layers = lstm_num_layers | |||
| self.tanh_constant = tanh_constant | |||
| self.temperature = temperature | |||
| self.cell_exit_extra_step = cell_exit_extra_step | |||
| self.skip_target = skip_target | |||
| self.branch_bias = branch_bias | |||
| self.lstm = StackedLSTMCell(self.lstm_num_layers, self.lstm_size, False) | |||
| self.attn_anchor = nn.Linear(self.lstm_size, self.lstm_size, bias=False) | |||
| self.attn_query = nn.Linear(self.lstm_size, self.lstm_size, bias=False) | |||
| self.v_attn = nn.Linear(self.lstm_size, 1, bias=False) | |||
| self.g_emb = nn.Parameter(torch.randn(1, self.lstm_size) * 0.1) | |||
| self.skip_targets = nn.Parameter(torch.tensor([1.0 - self.skip_target, self.skip_target]), requires_grad=False) # pylint: disable=not-callable | |||
| assert entropy_reduction in ["sum", "mean"], "Entropy reduction must be one of sum and mean." | |||
| self.entropy_reduction = torch.sum if entropy_reduction == "sum" else torch.mean | |||
| self.cross_entropy_loss = nn.CrossEntropyLoss(reduction="none") | |||
| self.bias_dict = nn.ParameterDict() | |||
| self.max_layer_choice = 0 | |||
| for mutable in self.mutables: | |||
| if isinstance(mutable, LayerChoice): | |||
| if self.max_layer_choice == 0: | |||
| self.max_layer_choice = len(mutable) | |||
| assert self.max_layer_choice == len(mutable), \ | |||
| "ENAS mutator requires all layer choice have the same number of candidates." | |||
| # We are judging by keys and module types to add biases to layer choices. Needs refactor. | |||
| if "reduce" in mutable.key: | |||
| def is_conv(choice): | |||
| return "conv" in str(type(choice)).lower() | |||
| bias = torch.tensor([self.branch_bias if is_conv(choice) else -self.branch_bias # pylint: disable=not-callable | |||
| for choice in mutable]) | |||
| self.bias_dict[mutable.key] = nn.Parameter(bias, requires_grad=False) | |||
| self.embedding = nn.Embedding(self.max_layer_choice + 1, self.lstm_size) | |||
| self.soft = nn.Linear(self.lstm_size, self.max_layer_choice, bias=False) | |||
| def sample_search(self): | |||
| self._initialize() | |||
| self._sample(self.mutables) | |||
| return self._choices | |||
| def sample_final(self): | |||
| return self.sample_search() | |||
| def _sample(self, tree): | |||
| mutable = tree.mutable | |||
| if isinstance(mutable, LayerChoice) and mutable.key not in self._choices: | |||
| self._choices[mutable.key] = self._sample_layer_choice(mutable) | |||
| elif isinstance(mutable, InputChoice) and mutable.key not in self._choices: | |||
| self._choices[mutable.key] = self._sample_input_choice(mutable) | |||
| for child in tree.children: | |||
| self._sample(child) | |||
| if isinstance(mutable, MutableScope) and mutable.key not in self._anchors_hid: | |||
| if self.cell_exit_extra_step: | |||
| self._lstm_next_step() | |||
| self._mark_anchor(mutable.key) | |||
| def _initialize(self): | |||
| self._choices = dict() | |||
| self._anchors_hid = dict() | |||
| self._inputs = self.g_emb.data | |||
| self._c = [torch.zeros((1, self.lstm_size), | |||
| dtype=self._inputs.dtype, | |||
| device=self._inputs.device) for _ in range(self.lstm_num_layers)] | |||
| self._h = [torch.zeros((1, self.lstm_size), | |||
| dtype=self._inputs.dtype, | |||
| device=self._inputs.device) for _ in range(self.lstm_num_layers)] | |||
| self.sample_log_prob = 0 | |||
| self.sample_entropy = 0 | |||
| self.sample_skip_penalty = 0 | |||
| def _lstm_next_step(self): | |||
| self._c, self._h = self.lstm(self._inputs, (self._c, self._h)) | |||
| def _mark_anchor(self, key): | |||
| self._anchors_hid[key] = self._h[-1] | |||
| def _sample_layer_choice(self, mutable): | |||
| self._lstm_next_step() | |||
| logit = self.soft(self._h[-1]) | |||
| if self.temperature is not None: | |||
| logit /= self.temperature | |||
| if self.tanh_constant is not None: | |||
| logit = self.tanh_constant * torch.tanh(logit) | |||
| if mutable.key in self.bias_dict: | |||
| logit += self.bias_dict[mutable.key] | |||
| branch_id = torch.multinomial(F.softmax(logit, dim=-1), 1).view(-1) | |||
| log_prob = self.cross_entropy_loss(logit, branch_id) | |||
| self.sample_log_prob += self.entropy_reduction(log_prob) | |||
| entropy = (log_prob * torch.exp(-log_prob)).detach() # pylint: disable=invalid-unary-operand-type | |||
| self.sample_entropy += self.entropy_reduction(entropy) | |||
| self._inputs = self.embedding(branch_id) | |||
| return F.one_hot(branch_id, num_classes=self.max_layer_choice).bool().view(-1) | |||
| def _sample_input_choice(self, mutable): | |||
| query, anchors = [], [] | |||
| for label in mutable.choose_from: | |||
| if label not in self._anchors_hid: | |||
| self._lstm_next_step() | |||
| self._mark_anchor(label)  # anchor for this label not recorded yet; take an extra LSTM step to create it | |||
| query.append(self.attn_anchor(self._anchors_hid[label])) | |||
| anchors.append(self._anchors_hid[label]) | |||
| query = torch.cat(query, 0) | |||
| query = torch.tanh(query + self.attn_query(self._h[-1])) | |||
| query = self.v_attn(query) | |||
| if self.temperature is not None: | |||
| query /= self.temperature | |||
| if self.tanh_constant is not None: | |||
| query = self.tanh_constant * torch.tanh(query) | |||
| if mutable.n_chosen is None: | |||
| logit = torch.cat([-query, query], 1) # pylint: disable=invalid-unary-operand-type | |||
| skip = torch.multinomial(F.softmax(logit, dim=-1), 1).view(-1) | |||
| skip_prob = torch.sigmoid(logit) | |||
| kl = torch.sum(skip_prob * torch.log(skip_prob / self.skip_targets)) | |||
| self.sample_skip_penalty += kl | |||
| log_prob = self.cross_entropy_loss(logit, skip) | |||
| self._inputs = (torch.matmul(skip.float(), torch.cat(anchors, 0)) / (1. + torch.sum(skip))).unsqueeze(0) | |||
| else: | |||
| assert mutable.n_chosen == 1, "Input choice must select exactly one or any in ENAS." | |||
| logit = query.view(1, -1) | |||
| index = torch.multinomial(F.softmax(logit, dim=-1), 1).view(-1) | |||
| skip = F.one_hot(index, num_classes=mutable.n_candidates).view(-1) | |||
| log_prob = self.cross_entropy_loss(logit, index) | |||
| self._inputs = anchors[index.item()] | |||
| self.sample_log_prob += self.entropy_reduction(log_prob) | |||
| entropy = (log_prob * torch.exp(-log_prob)).detach() # pylint: disable=invalid-unary-operand-type | |||
| self.sample_entropy += self.entropy_reduction(entropy) | |||
| return skip.bool() | |||
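| # --------------------------------------------------------------------------- | |||
| # Hypothetical usage sketch, not part of the original file. It builds a toy | |||
| # one-choice search space and only exercises ``sample_search`` defined above; | |||
| # it assumes the vendored ``pytorch`` NAS package is importable, as the | |||
| # imports at the top of this file already require. | |||
| if __name__ == "__main__": | |||
|     class _ToySpace(nn.Module): | |||
|         def __init__(self): | |||
|             super().__init__() | |||
|             self.op = LayerChoice([nn.Conv2d(3, 8, 3, padding=1), nn.MaxPool2d(3, 1, 1)]) | |||
|         def forward(self, x): | |||
|             return self.op(x) | |||
|     mutator = EnasMutator(_ToySpace()) | |||
|     print(mutator.sample_search())  # one boolean decision mask per mutable key | |||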
| @@ -0,0 +1,129 @@ | |||
| # Copyright (c) Microsoft Corporation. | |||
| # Licensed under the MIT license. | |||
| import torch | |||
| import torch.nn as nn | |||
| class StdConv(nn.Module): | |||
| def __init__(self, C_in, C_out): | |||
| super(StdConv, self).__init__() | |||
| self.conv = nn.Sequential( | |||
| nn.Conv2d(C_in, C_out, 1, stride=1, padding=0, bias=False), | |||
| nn.BatchNorm2d(C_out, affine=False), | |||
| nn.ReLU() | |||
| ) | |||
| def forward(self, x): | |||
| return self.conv(x) | |||
| def __str__(self): | |||
| return 'StdConv' | |||
| class PoolBranch(nn.Module): | |||
| def __init__(self, pool_type, C_in, C_out, kernel_size, stride, padding, affine=False): | |||
| super().__init__() | |||
| self.kernel_size = kernel_size | |||
| self.pool_type = pool_type | |||
| self.preproc = StdConv(C_in, C_out) | |||
| self.pool = Pool(pool_type, kernel_size, stride, padding) | |||
| self.bn = nn.BatchNorm2d(C_out, affine=affine) | |||
| def forward(self, x): | |||
| out = self.preproc(x) | |||
| out = self.pool(out) | |||
| out = self.bn(out) | |||
| return out | |||
| def __str__(self): | |||
| return '{}PoolBranch_{}'.format(self.pool_type, self.kernel_size) | |||
| class SeparableConv(nn.Module): | |||
| def __init__(self, C_in, C_out, kernel_size, stride, padding): | |||
| self.kernel_size = kernel_size | |||
| super(SeparableConv, self).__init__() | |||
| self.depthwise = nn.Conv2d(C_in, C_in, kernel_size=kernel_size, padding=padding, stride=stride, | |||
| groups=C_in, bias=False) | |||
| self.pointwise = nn.Conv2d(C_in, C_out, kernel_size=1, bias=False) | |||
| def forward(self, x): | |||
| out = self.depthwise(x) | |||
| out = self.pointwise(out) | |||
| return out | |||
| def __str__(self): | |||
| return 'SeparableConv_{}'.format(self.kernel_size) | |||
| class ConvBranch(nn.Module): | |||
| def __init__(self, C_in, C_out, kernel_size, stride, padding, separable): | |||
| super(ConvBranch, self).__init__() | |||
| self.kernel_size = kernel_size | |||
| self.preproc = StdConv(C_in, C_out) | |||
| if separable: | |||
| self.conv = SeparableConv(C_out, C_out, kernel_size, stride, padding) | |||
| else: | |||
| self.conv = nn.Conv2d(C_out, C_out, kernel_size, stride=stride, padding=padding) | |||
| self.postproc = nn.Sequential( | |||
| nn.BatchNorm2d(C_out, affine=False), | |||
| nn.ReLU() | |||
| ) | |||
| def forward(self, x): | |||
| out = self.preproc(x) | |||
| out = self.conv(out) | |||
| out = self.postproc(out) | |||
| return out | |||
| def __str__(self): | |||
| return 'ConvBranch_{}'.format(self.kernel_size) | |||
| class FactorizedReduce(nn.Module): | |||
| def __init__(self, C_in, C_out, affine=False): | |||
| super().__init__() | |||
| self.conv1 = nn.Conv2d(C_in, C_out // 2, 1, stride=2, padding=0, bias=False) | |||
| self.conv2 = nn.Conv2d(C_in, C_out // 2, 1, stride=2, padding=0, bias=False) | |||
| self.bn = nn.BatchNorm2d(C_out, affine=affine) | |||
| def forward(self, x): | |||
| out = torch.cat([self.conv1(x), self.conv2(x[:, :, 1:, 1:])], dim=1) | |||
| out = self.bn(out) | |||
| return out | |||
| def __str__(self): | |||
| return 'FactorizedReduce' | |||
| class Pool(nn.Module): | |||
| def __init__(self, pool_type, kernel_size, stride, padding): | |||
| super().__init__() | |||
| self.kernel_size = kernel_size | |||
| self.pool_type = pool_type | |||
| if pool_type.lower() == 'max': | |||
| self.pool = nn.MaxPool2d(kernel_size, stride, padding) | |||
| elif pool_type.lower() == 'avg': | |||
| self.pool = nn.AvgPool2d(kernel_size, stride, padding, count_include_pad=False) | |||
| else: | |||
| raise ValueError() | |||
| def forward(self, x): | |||
| return self.pool(x) | |||
| def __str__(self): | |||
| return '{}Pool_{}'.format(self.pool_type, self.kernel_size) | |||
| class SepConvBN(nn.Module): | |||
| def __init__(self, C_in, C_out, kernel_size, padding): | |||
| super().__init__() | |||
| self.kernel_size = kernel_size | |||
| self.relu = nn.ReLU() | |||
| self.conv = SeparableConv(C_in, C_out, kernel_size, 1, padding) | |||
| self.bn = nn.BatchNorm2d(C_out, affine=True) | |||
| def forward(self, x): | |||
| x = self.relu(x) | |||
| x = self.conv(x) | |||
| x = self.bn(x) | |||
| return x | |||
| def __str__(self): | |||
| return 'SepConvBN_{}'.format(self.kernel_size) | |||
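| # Hypothetical smoke test, not part of the original file: check the output | |||
| # shapes of a few of the ops defined above. | |||
| if __name__ == "__main__": | |||
|     x = torch.randn(1, 16, 32, 32) | |||
|     print(ConvBranch(16, 24, 3, 1, 1, separable=True)(x).shape)  # torch.Size([1, 24, 32, 32]) | |||
|     print(PoolBranch('max', 16, 24, 3, 1, 1)(x).shape)           # torch.Size([1, 24, 32, 32]) | |||
|     print(FactorizedReduce(16, 32)(x).shape)                     # torch.Size([1, 32, 16, 16]) | |||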
| @@ -0,0 +1,490 @@ | |||
| import sys | |||
| sys.path.append('../..') | |||
| import os | |||
| import logging | |||
| import pickle | |||
| import shutil | |||
| import random | |||
| import math | |||
| import time | |||
| import datetime | |||
| import argparse | |||
| import distutils.util | |||
| import numpy as np | |||
| import json | |||
| import torch | |||
| from torch import nn | |||
| from torch import optim | |||
| from torch.utils.data import DataLoader | |||
| import torch.nn.functional as Func | |||
| from macro import GeneralNetwork | |||
| from micro import MicroNetwork | |||
| import datasets | |||
| from utils import accuracy, reward_accuracy | |||
| from pytorch.fixed import apply_fixed_architecture | |||
| from pytorch.utils import AverageMeterGroup, to_device, save_best_checkpoint | |||
| logger = logging.getLogger("enas-retrain") | |||
| # TODO: | |||
| def set_random_seed(seed): | |||
| logger.info("set random seed for data reading: {}".format(seed)) | |||
| random.seed(seed) | |||
| os.environ['PYTHONHASHSEED'] = str(seed) | |||
| np.random.seed(seed) | |||
| torch.manual_seed(seed) | |||
| if FLAGS.is_cuda: | |||
| torch.cuda.manual_seed_all(seed) | |||
| torch.backends.cudnn.deterministic = True | |||
| # TODO: parser args | |||
| def parse_args(): | |||
| parser = argparse.ArgumentParser() | |||
| parser.add_argument( | |||
| "--data_dir", | |||
| type=str, | |||
| default="./data", | |||
| help="Directory containing the dataset and embedding file. (default: %(default)s)") | |||
| parser.add_argument("--search_space_path", type=str, | |||
| default='./search_space.json', help="search_space directory") | |||
| parser.add_argument( | |||
| "--selected_space_path", | |||
| type=str, | |||
| default="./selected_space.json", | |||
| # required=True, | |||
| help="Architecture json file. (default: %(default)s)") | |||
| parser.add_argument("--result_path", type=str, | |||
| default='./result.json', help="res directory") | |||
| parser.add_argument('--trial_id', type=int, default=0, metavar='N', | |||
| help='trial_id,start from 0') | |||
| parser.add_argument( | |||
| "--output_dir", | |||
| type=str, | |||
| default="./output", | |||
| help="The output directory. (default: %(default)s)") | |||
| parser.add_argument( | |||
| "--best_checkpoint_dir", | |||
| type=str, | |||
| default="best_checkpoint", | |||
| help="Path for saved checkpoints. (default: %(default)s)") | |||
| parser.add_argument("--search_for", | |||
| choices=["macro", "micro"], | |||
| default="micro") | |||
| parser.add_argument( | |||
| "--batch_size", | |||
| type=int, | |||
| default=128, | |||
| help="Number of samples each batch for training. (default: %(default)s)") | |||
| parser.add_argument( | |||
| "--eval_batch_size", | |||
| type=int, | |||
| default=128, | |||
| help="Number of samples each batch for evaluation. (default: %(default)s)") | |||
| parser.add_argument( | |||
| "--class_num", | |||
| type=int, | |||
| default=10, | |||
| help="The number of categories. (default: %(default)s)") | |||
| parser.add_argument( | |||
| "--epochs", | |||
| type=int, | |||
| default=10, | |||
| help="The number of training epochs. (default: %(default)s)") | |||
| parser.add_argument( | |||
| "--child_lr", | |||
| type=float, | |||
| default=0.02, | |||
| help="The initial learning rate. (default: %(default)s)") | |||
| parser.add_argument( | |||
| "--is_cuda", | |||
| type=distutils.util.strtobool, | |||
| default=True, | |||
| help="Specify the device type. (default: %(default)s)") | |||
| parser.add_argument( | |||
| "--load_checkpoint", | |||
| type=distutils.util.strtobool, | |||
| default=False, | |||
| help="Whether to load checkpoint. (default: %(default)s)") | |||
| parser.add_argument( | |||
| "--log_every", | |||
| type=int, | |||
| default=50, | |||
| help="How many steps to log. (default: %(default)s)") | |||
| parser.add_argument( | |||
| "--eval_every_epochs", | |||
| type=int, | |||
| default=1, | |||
| help="How many epochs to eval. (default: %(default)s)") | |||
| parser.add_argument( | |||
| "--child_grad_bound", | |||
| type=float, | |||
| default=5.0, | |||
| help="The threshold for gradient clipping. (default: %(default)s)") # | |||
| parser.add_argument( | |||
| "--child_lr_decay_scheme", | |||
| type=str, | |||
| default="cosine", | |||
| help="Learning rate annealing strategy, only 'cosine' supported. (default: %(default)s)") #todo: remove | |||
| parser.add_argument( | |||
| "--child_lr_T_0", | |||
| type=int, | |||
| default=10, | |||
| help="The length of one cycle. (default: %(default)s)") # todo: use for | |||
| parser.add_argument( | |||
| "--child_lr_T_mul", | |||
| type=int, | |||
| default=2, | |||
| help="The multiplication factor per cycle. (default: %(default)s)") # todo: use for | |||
| parser.add_argument( | |||
| "--child_l2_reg", | |||
| type=float, | |||
| default=3e-6, | |||
| help="Weight decay factor. (default: %(default)s)") | |||
| parser.add_argument( | |||
| "--child_lr_max", | |||
| type=float, | |||
| default=0.002, | |||
| help="The max learning rate. (default: %(default)s)") | |||
| parser.add_argument( | |||
| "--child_lr_min", | |||
| type=float, | |||
| default=0.001, | |||
| help="The min learning rate. (default: %(default)s)") | |||
| parser.add_argument( | |||
| "--multi_path", | |||
| type=distutils.util.strtobool, | |||
| default=False, | |||
| help="Search for multiple path in the architecture. (default: %(default)s)") # todo: use for | |||
| parser.add_argument( | |||
| "--is_mask", | |||
| type=distutils.util.strtobool, | |||
| default=True, | |||
| help="Apply mask. (default: %(default)s)") | |||
| global FLAGS | |||
| FLAGS = parser.parse_args() | |||
| def print_user_flags(FLAGS, line_limit=80): | |||
| log_strings = "\n" + "-" * line_limit + "\n" | |||
| for flag_name in sorted(vars(FLAGS)): | |||
| value = "{}".format(getattr(FLAGS, flag_name)) | |||
| log_string = flag_name | |||
| log_string += "." * (line_limit - len(flag_name) - len(value)) | |||
| log_string += value | |||
| log_strings = log_strings + log_string | |||
| log_strings = log_strings + "\n" | |||
| log_strings += "-" * line_limit | |||
| logger.info(log_strings) | |||
| def eval_once(child_model, device, eval_set, criterion, valid_dataloader=None, test_dataloader=None): | |||
| if eval_set == "test": | |||
| assert test_dataloader is not None | |||
| dataloader = test_dataloader | |||
| elif eval_set == "valid": | |||
| assert valid_dataloader is not None | |||
| dataloader = valid_dataloader | |||
| else: | |||
| raise NotImplementedError("Unknown eval_set '{}'".format(eval_set)) | |||
| tot_acc = 0 | |||
| tot = 0 | |||
| losses = [] | |||
| with torch.no_grad(): # save memory | |||
| for batch in dataloader: | |||
| x, y = batch | |||
| x, y = to_device(x, device), to_device(y, device) | |||
| logits = child_model(x) | |||
| if isinstance(logits, tuple): | |||
| logits, aux_logits = logits | |||
| aux_loss = criterion(aux_logits, y) | |||
| else: | |||
| aux_loss = 0. | |||
| loss = criterion(logits, y) | |||
| loss = loss + aux_weight * aux_loss | |||
| # loss = loss.mean() | |||
| preds = logits.argmax(dim=1).long() | |||
| acc = torch.eq(preds, y.long()).long().sum().item() | |||
| losses.append(loss) | |||
| tot_acc += acc | |||
| tot += len(y) | |||
| losses = torch.tensor(losses) | |||
| loss = losses.mean() | |||
| if tot > 0: | |||
| final_acc = float(tot_acc) / tot | |||
| else: | |||
| final_acc = 0 | |||
| logger.info("Error in calculating final_acc") | |||
| return final_acc, loss | |||
| # TODO: learning rate scheduler | |||
| def update_lr( | |||
| optimizer, | |||
| epoch, | |||
| l2_reg=1e-4, | |||
| lr_warmup_val=None, | |||
| lr_init=0.1, | |||
| lr_decay_scheme="cosine", | |||
| lr_max=0.002, | |||
| lr_min=0.000000001, | |||
| lr_T_0=4, | |||
| lr_T_mul=1, | |||
| sync_replicas=False, | |||
| num_aggregate=None, | |||
| num_replicas=None): | |||
| if lr_decay_scheme == "cosine": | |||
| assert lr_max is not None, "Need lr_max to use lr_cosine" | |||
| assert lr_min is not None, "Need lr_min to use lr_cosine" | |||
| assert lr_T_0 is not None, "Need lr_T_0 to use lr_cosine" | |||
| assert lr_T_mul is not None, "Need lr_T_mul to use lr_cosine" | |||
| T_i = lr_T_0 | |||
| t_epoch = epoch | |||
| last_reset = 0 | |||
| while True: | |||
| t_epoch -= T_i | |||
| if t_epoch < 0: | |||
| break | |||
| last_reset += T_i | |||
| T_i *= lr_T_mul | |||
| T_curr = epoch - last_reset | |||
| def _update(): | |||
| rate = T_curr / T_i * math.pi | |||
| lr = lr_min + 0.5 * (lr_max - lr_min) * (1.0 + math.cos(rate)) | |||
| return lr | |||
| learning_rate = _update() | |||
| else: | |||
| raise ValueError("Unknown learning rate decay scheme {}".format(lr_decay_scheme)) | |||
| #update lr in optimizer | |||
| for params_group in optimizer.param_groups: | |||
| params_group['lr'] = learning_rate | |||
| return learning_rate | |||
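| # Worked example for the warm-restart schedule above (explanatory note, not in | |||
| # the original file): with lr_T_0=10 and lr_T_mul=2 the cosine cycles cover | |||
| # epochs [0, 10), [10, 30), [30, 70), ...; inside each cycle | |||
| #     lr = lr_min + 0.5 * (lr_max - lr_min) * (1 + cos(pi * T_curr / T_i)) | |||
| # so the learning rate restarts at lr_max and anneals towards lr_min. | |||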
| def train(device, output_dir='./output'): | |||
| workers = 4 | |||
| data = 'cifar10' | |||
| data_dir = FLAGS.data_dir | |||
| output_dir = FLAGS.output_dir | |||
| checkpoint_dir = FLAGS.best_checkpoint_dir | |||
| batch_size = FLAGS.batch_size | |||
| eval_batch_size = FLAGS.eval_batch_size | |||
| class_num = FLAGS.class_num | |||
| epochs = FLAGS.epochs | |||
| child_lr = FLAGS.child_lr | |||
| is_cuda = FLAGS.is_cuda | |||
| load_checkpoint = FLAGS.load_checkpoint | |||
| log_every = FLAGS.log_every | |||
| eval_every_epochs = FLAGS.eval_every_epochs | |||
| child_grad_bound = FLAGS.child_grad_bound | |||
| child_l2_reg = FLAGS.child_l2_reg | |||
| logger.info("Build dataloader") | |||
| dataset_train, dataset_valid = datasets.get_dataset("cifar10", data_dir) | |||
| n_train = len(dataset_train) | |||
| split = n_train // 10 | |||
| indices = list(range(n_train)) | |||
| train_sampler = torch.utils.data.sampler.SubsetRandomSampler(indices[:-split]) | |||
| valid_sampler = torch.utils.data.sampler.SubsetRandomSampler(indices[-split:]) | |||
| train_dataloader = torch.utils.data.DataLoader(dataset_train, | |||
| batch_size=batch_size, | |||
| sampler=train_sampler, | |||
| num_workers=workers) | |||
| valid_dataloader = torch.utils.data.DataLoader(dataset_train, | |||
| batch_size=batch_size, | |||
| sampler=valid_sampler, | |||
| num_workers=workers) | |||
| test_dataloader = torch.utils.data.DataLoader(dataset_valid, | |||
| batch_size=batch_size, | |||
| num_workers=workers) | |||
| criterion = nn.CrossEntropyLoss() | |||
| optimizer = torch.optim.SGD(child_model.parameters(), 0.05, momentum=0.9, weight_decay=1.0E-4, nesterov=True) | |||
| # optimizer = optim.Adam(child_model.parameters(), eps=1e-3, weight_decay=FLAGS.child_l2_reg) | |||
| # TODO | |||
| lr_scheduler = torch.optim.lr_scheduler.CosineAnnealingLR(optimizer, T_max=epochs, eta_min=0.001) | |||
| # move model to CPU/GPU device | |||
| child_model.to(device) | |||
| criterion.to(device) | |||
| logger.info('Start training') | |||
| start_time = time.time() | |||
| step = 0 | |||
| # save path | |||
| if not os.path.exists(output_dir): | |||
| os.mkdir(output_dir) | |||
| # model_save_path = os.path.join(output_dir, "model.pth") | |||
| # best_model_save_path = os.path.join(output_dir, "best_model.pth") | |||
| best_acc = 0 | |||
| start_epoch = 0 | |||
| # TODO: load checkpoints | |||
| # train | |||
| for epoch in range(start_epoch, epochs): | |||
| lr = update_lr(optimizer, | |||
| epoch, | |||
| l2_reg= 1e-4, | |||
| lr_warmup_val=None, | |||
| lr_init=FLAGS.child_lr, | |||
| lr_decay_scheme=FLAGS.child_lr_decay_scheme, | |||
| lr_max=0.05, | |||
| lr_min=0.001, | |||
| lr_T_0=10, | |||
| lr_T_mul=2) | |||
| child_model.train() | |||
| for batch in train_dataloader: | |||
| step += 1 | |||
| x, y = batch | |||
| x, y = to_device(x, device), to_device(y, device) | |||
| logits = child_model(x) | |||
| if isinstance(logits, tuple): | |||
| logits, aux_logits = logits | |||
| aux_loss = criterion(aux_logits, y) | |||
| else: | |||
| aux_loss = 0. | |||
| acc = accuracy(logits, y) | |||
| loss = criterion(logits, y) | |||
| loss = loss + aux_weight * aux_loss | |||
| optimizer.zero_grad() | |||
| loss.backward() | |||
| # clip gradients by their global norm and keep the norm for logging | |||
| grad_norm = float(nn.utils.clip_grad_norm_(child_model.parameters(), child_grad_bound)) | |||
| optimizer.step() | |||
| if step % log_every == 0: | |||
| curr_time = time.time() | |||
| log_string = "" | |||
| log_string += "epoch={:<6d}".format(epoch) | |||
| log_string += "ch_step={:<6d}".format(step) | |||
| log_string += " loss={:<8.6f}".format(loss) | |||
| log_string += " lr={:<8.4f}".format(lr) | |||
| log_string += " |g|={:<8.4f}".format(grad_norm) | |||
| log_string += " tr_acc={:<8.4f}/{:>3d}".format(acc['acc1'], logits.size()[0]) | |||
| log_string += " mins={:<10.2f}".format(float(curr_time - start_time) / 60) | |||
| logger.info(log_string) | |||
| epoch += 1 | |||
| save_state = { | |||
| 'step': step, | |||
| 'epoch': epoch, | |||
| 'child_model_state_dict': child_model.state_dict(), | |||
| 'optimizer_state_dict': optimizer.state_dict()} | |||
| # print(' Epoch {:<3d} loss: {:<.2f} '.format(epoch, loss)) | |||
| # torch.save(save_state, model_save_path) | |||
| child_model.eval() | |||
| logger.info("Epoch {}: Eval".format(epoch)) | |||
| eval_acc, eval_loss = eval_once(child_model, device, "test", criterion, test_dataloader=test_dataloader) | |||
| logger.info( | |||
| "ch_step={} {}_accuracy={:<6.4f} {}_loss={:<6.4f}".format(step, "test", eval_acc, "test", eval_loss)) | |||
| if eval_acc > best_acc: | |||
| best_acc = eval_acc | |||
| logger.info("Save best model") | |||
| # save_state = { | |||
| # 'step': step, | |||
| # 'epoch': epoch, | |||
| # 'child_model_state_dict': child_model.state_dict(), | |||
| # 'optimizer_state_dict': optimizer.state_dict()} | |||
| # torch.save(save_state, best_model_save_path) | |||
| save_best_checkpoint(checkpoint_dir, child_model, optimizer, epoch) | |||
| result['accuracy'].append('Epoch {} acc: {:<6.4f}'.format(epoch, eval_acc,)) | |||
| acc_l.append(eval_acc) | |||
| print(result['accuracy'][-1]) | |||
| print('max acc %.4f at epoch: %i'%(max(acc_l), np.argmax(np.array(acc_l)))) | |||
| print('Time cost: %.4f hours'%( float(time.time() - start_time) /3600. )) | |||
| return result | |||
| # macro = True | |||
| parse_args() | |||
| child_fixed_arc = FLAGS.selected_space_path # './macro_selected_space' | |||
| search_for = FLAGS.search_for | |||
| # set random seeds | |||
| torch.manual_seed(FLAGS.trial_id) | |||
| torch.cuda.manual_seed_all(FLAGS.trial_id) | |||
| np.random.seed(FLAGS.trial_id) | |||
| random.seed(FLAGS.trial_id) | |||
| aux_weight = 0.4 | |||
| result = {'accuracy':[]} | |||
| acc_l = [] | |||
| # decode human readable search space to model | |||
| def convert_selected_space_format(): | |||
| # with open('./macro_selected_space.json') as js: | |||
| with open(child_fixed_arc) as js: | |||
| selected_space = json.load(js) | |||
| ops = selected_space['op_list'] | |||
| selected_space.pop('op_list') | |||
| new_selected_space = {} | |||
| for key, value in selected_space.items(): | |||
| # for macro | |||
| if FLAGS.search_for == 'macro': | |||
| new_key = key.split('_')[-1] | |||
| # for micro | |||
| elif FLAGS.search_for == 'micro': | |||
| new_key = key | |||
| if len(value) > 1 or len(value)==0: | |||
| new_value = value | |||
| elif len(value) > 0 and value[0] in ops: | |||
| new_value = ops.index(value[0]) | |||
| else: | |||
| new_value = value[0] | |||
| new_selected_space[new_key] = new_value | |||
| return new_selected_space | |||
| fixed_arc = convert_selected_space_format() | |||
| # TODO : macro search or micro search | |||
| if FLAGS.search_for == 'macro': | |||
| child_model = GeneralNetwork() | |||
| elif FLAGS.search_for == 'micro': | |||
| child_model = MicroNetwork(num_layers=6, out_channels=20, num_nodes=5, dropout_rate=0.1, use_aux_heads=True) | |||
| apply_fixed_architecture(child_model,fixed_arc) | |||
| def dump_global_result(res_path,global_result, sort_keys = False): | |||
| with open(res_path, "w") as ss_file: | |||
| json.dump(global_result, ss_file, sort_keys=sort_keys, indent=2) | |||
| def main(): | |||
| os.environ['CUDA_VISIBLE_DEVICES'] = '4' | |||
| # device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu") | |||
| device = torch.device("cuda" if FLAGS.is_cuda else "cpu") | |||
| train(device) | |||
| dump_global_result('result_retrain.json', result['accuracy']) | |||
| if __name__ == "__main__": | |||
| main() | |||
| @@ -0,0 +1,495 @@ | |||
| import sys | |||
| from utils import accuracy | |||
| import torch | |||
| from torch import nn | |||
| from torch.utils.data import DataLoader | |||
| import datasets | |||
| import time | |||
| import logging | |||
| import os | |||
| import argparse | |||
| import distutils.util | |||
| import numpy as np | |||
| import json | |||
| import random | |||
| sys.path.append('../..') | |||
| # import custom packages | |||
| from macro import GeneralNetwork | |||
| from micro import MicroNetwork | |||
| from pytorch.fixed import apply_fixed_architecture | |||
| from pytorch.retrainer import Retrainer | |||
| from pytorch.utils import AverageMeterGroup, to_device, save_best_checkpoint, mkdirs | |||
| class EnasRetrainer(Retrainer): | |||
| """ | |||
| ENAS retrainer. | |||
| Parameters | |||
| ---------- | |||
| model : nn.Module | |||
| PyTorch model to be trained. | |||
| data_dir : str | |||
| The path of the dataset. | |||
| best_checkpoint_dir : str | |||
| The directory for saving the best model checkpoint. | |||
| batch_size : int | |||
| Batch size for training. | |||
| eval_batch_size : int | |||
| Batch size for evaluation. | |||
| num_epochs : int | |||
| Number of epochs planned for training. | |||
| lr : float | |||
| Learning rate. | |||
| is_cuda : bool | |||
| Whether to use GPU for training. | |||
| log_every : int | |||
| Number of steps between log messages. | |||
| child_grad_bound : float | |||
| Gradient clipping bound. | |||
| child_l2_reg : float | |||
| L2 regularization (weight decay) factor. | |||
| eval_every_epochs : int | |||
| Evaluate every this many epochs. | |||
| logger : logging.Logger | |||
| Logger used for progress output. | |||
| workers : int | |||
| Workers for data loading. | |||
| device : torch.device | |||
| ``torch.device("cpu")`` or ``torch.device("cuda")``. | |||
| aux_weight : float | |||
| Weight of auxiliary head loss. ``aux_weight * aux_loss`` will be added to total loss. | |||
| """ | |||
def __init__(self, model, data_dir='./data', best_checkpoint_dir='./best_checkpoint',
batch_size=1024, eval_batch_size=1024, num_epochs=2, lr=0.02, is_cuda=True,
log_every=40, child_grad_bound=0.5, child_l2_reg=3e-6, eval_every_epochs=2,
logger=logging.getLogger("enas-retrain"), result_path='./'):
| self.aux_weight = 0.4 | |||
self.device = torch.device("cuda:0" if is_cuda else "cpu")
| self.workers = 4 | |||
| self.child_model = model | |||
| self.data_dir = data_dir | |||
| self.best_checkpoint_dir = best_checkpoint_dir | |||
| self.batch_size = batch_size | |||
| self.eval_batch_size = eval_batch_size | |||
| self.num_epochs = num_epochs | |||
| self.lr = lr | |||
| self.is_cuda = is_cuda | |||
| self.log_every = log_every | |||
| self.child_grad_bound = child_grad_bound | |||
| self.child_l2_reg = child_l2_reg | |||
| self.eval_every_epochs = eval_every_epochs | |||
| self.logger = logger | |||
| self.optimizer = torch.optim.SGD(self.child_model.parameters(), self.lr, momentum=0.9, weight_decay=1.0E-4, nesterov=True) | |||
| self.criterion = nn.CrossEntropyLoss() | |||
| self.lr_scheduler = torch.optim.lr_scheduler.CosineAnnealingLR(self.optimizer, T_max=self.num_epochs, eta_min=0.001) | |||
| # load dataset | |||
| self.init_dataloader() | |||
| self.child_model.to(self.device) | |||
| self.result_path = result_path | |||
| with open(self.result_path, "w") as file: | |||
| file.write('') | |||
| def train(self): | |||
| """ | |||
Train the child model for ``num_epochs`` epochs, running evaluation and saving a checkpoint after each epoch.
| """ | |||
| self.logger.info('** Start training **') | |||
| self.start_time = time.time() | |||
| for epoch in range(self.num_epochs): | |||
| self.train_one_epoch(epoch) | |||
| self.child_model.eval() | |||
| # if epoch / self.eval_every_epochs == 0: | |||
| self.logger.info("Epoch {}: Eval".format(epoch)) | |||
| self.validate_one_epoch(epoch) | |||
| self.lr_scheduler.step() | |||
| # print('** saving model **') | |||
| self.logger.info("** Save best model **") | |||
| # save_state = { | |||
| # 'epoch': epoch, | |||
| # 'child_model_state_dict': self.child_model.state_dict(), | |||
| # 'optimizer_state_dict': self.optimizer.state_dict()} | |||
| # torch.save(save_state, self.best_checkpoint_dir) | |||
| save_best_checkpoint(self.best_checkpoint_dir, self.child_model, self.optimizer, epoch) | |||
| def validate(self): | |||
| """ | |||
Run one round of validation. The retrainer calls ``validate_one_epoch`` from ``train`` instead.
| """ | |||
| pass | |||
| def export(self, file): | |||
| """ | |||
Dump the architecture to ``file``.
| Parameters | |||
| ---------- | |||
| file : str | |||
| File path to export to. Expected to be a JSON. | |||
| """ | |||
| pass | |||
| def checkpoint(self): | |||
| """ | |||
| Override to dump a checkpoint. | |||
| """ | |||
| pass | |||
| def init_dataloader(self): | |||
| self.logger.info("Build dataloader") | |||
| self.dataset_train, self.dataset_valid = datasets.get_dataset("cifar10", self.data_dir) | |||
| n_train = len(self.dataset_train) | |||
| split = n_train // 10 | |||
| indices = list(range(n_train)) | |||
| train_sampler = torch.utils.data.sampler.SubsetRandomSampler(indices[:-split]) | |||
| valid_sampler = torch.utils.data.sampler.SubsetRandomSampler(indices[-split:]) | |||
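# Note: for CIFAR-10 (50,000 training images) this split holds out the last 5,000 indices for
# validation, so roughly 45,000/5,000 samples feed the train/valid loaders below.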
| self.train_loader = torch.utils.data.DataLoader(self.dataset_train, | |||
| batch_size=self.batch_size, | |||
| sampler=train_sampler, | |||
| num_workers=self.workers) | |||
| self.valid_loader = torch.utils.data.DataLoader(self.dataset_train, | |||
| batch_size=self.eval_batch_size, | |||
| sampler=valid_sampler, | |||
| num_workers=self.workers) | |||
| self.test_loader = torch.utils.data.DataLoader(self.dataset_valid, | |||
| batch_size=self.batch_size, | |||
| num_workers=self.workers) | |||
| # self.train_loader = cycle(self.train_loader) | |||
| # self.valid_loader = cycle(self.valid_loader) | |||
| def train_one_epoch(self,epoch): | |||
| """ | |||
| Train one epoch. | |||
| Parameters | |||
| ---------- | |||
| epoch : int | |||
| Epoch number starting from 0. | |||
| """ | |||
| tot_acc = 0 | |||
| tot = 0 | |||
| losses = [] | |||
| step = 0 | |||
| self.child_model.train() | |||
| meters = AverageMeterGroup() | |||
| for batch in self.train_loader: | |||
| step += 1 | |||
| x, y = batch | |||
| x, y = to_device(x, self.device), to_device(y, self.device) | |||
| logits = self.child_model(x) | |||
| if isinstance(logits, tuple): | |||
| logits, aux_logits = logits | |||
| aux_loss = self.criterion(aux_logits, y) | |||
| else: | |||
| aux_loss = 0. | |||
| acc = accuracy(logits, y) | |||
| loss = self.criterion(logits, y) | |||
| loss = loss + self.aux_weight * aux_loss | |||
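# e.g. with the default aux_weight of 0.4, the objective is CE(logits, y) + 0.4 * CE(aux_logits, y)
# whenever the model returns an auxiliary head output, and plain CE(logits, y) otherwise.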
| self.optimizer.zero_grad() | |||
| loss.backward() | |||
| grad_norm = 0 | |||
| trainable_params = self.child_model.parameters() | |||
| # assert FLAGS.child_grad_bound is not None, "Need grad_bound to clip gradients." | |||
| # # compute the gradient norm value | |||
| # grad_norm = nn.utils.clip_grad_norm_(trainable_params, 99999999) | |||
| # for param in trainable_params: | |||
| # nn.utils.clip_grad_norm_(param, self.child_grad_bound) # clip grad | |||
| # print(param_ == param) | |||
| if self.child_grad_bound is not None: | |||
| grad_norm = nn.utils.clip_grad_norm_(trainable_params, self.child_grad_bound) | |||
# clip_grad_norm_ returns the total gradient norm before clipping; it is logged below as |g|
| self.optimizer.step() | |||
| tot_acc += acc['acc1'] | |||
| tot += 1 | |||
| losses.append(loss) | |||
| acc["loss"] = loss.item() | |||
| meters.update(acc) | |||
| if step % self.log_every == 0: | |||
| curr_time = time.time() | |||
| log_string = "" | |||
| log_string += "epoch={:<6d}".format(epoch) | |||
| log_string += "ch_step={:<6d}".format(step) | |||
| log_string += " loss={:<8.6f}".format(loss) | |||
| log_string += " lr={:<8.4f}".format(self.optimizer.param_groups[0]['lr']) | |||
| log_string += " |g|={:<8.4f}".format(grad_norm) | |||
| log_string += " tr_acc={:<8.4f}/{:>3d}".format(acc['acc1'], logits.size()[0]) | |||
| log_string += " mins={:<10.2f}".format(float(curr_time - self.start_time) / 60) | |||
| self.logger.info(log_string) | |||
| print("Model Epoch [%d/%d] %.3f mins %s \n " % (epoch + 1, | |||
| self.num_epochs, float(time.time() - self.start_time) / 60, meters )) | |||
| final_acc = float(tot_acc) / tot | |||
| losses = torch.tensor(losses) | |||
| loss = losses.mean() | |||
| def validate_one_epoch(self,epoch): | |||
| tot_acc = 0 | |||
| tot = 0 | |||
| losses = [] | |||
| with torch.no_grad(): # save memory | |||
| meters = AverageMeterGroup() | |||
| for batch in self.valid_loader: | |||
| x, y = batch | |||
| x, y = to_device(x, self.device), to_device(y, self.device) | |||
| logits = self.child_model(x) | |||
| if isinstance(logits, tuple): | |||
| logits, aux_logits = logits | |||
| aux_loss = self.criterion(aux_logits, y) | |||
| else: | |||
| aux_loss = 0. | |||
| loss = self.criterion(logits, y) | |||
| loss = loss + self.aux_weight * aux_loss | |||
| # loss = loss.mean() | |||
| preds = logits.argmax(dim=1).long() | |||
| acc = torch.eq(preds, y.long()).long().sum().item() | |||
| acc_v = accuracy(logits, y) | |||
| losses.append(loss) | |||
| tot_acc += acc | |||
| tot += len(y) | |||
| acc_v["loss"] = loss.item() | |||
| meters.update(acc_v) | |||
| losses = torch.tensor(losses) | |||
| loss = losses.mean() | |||
| if tot > 0: | |||
| final_acc = float(tot_acc) / tot | |||
| else: | |||
| final_acc = 0 | |||
| self.logger.info("Error in calculating final_acc") | |||
| with open(self.result_path, "a") as file: | |||
| file.write( | |||
| str({"type": "Accuracy", | |||
| "result": {"sequence": epoch, "category": "epoch", "value": final_acc}}) + '\n') | |||
| # print("Model eval %.3fmins %s \n " % ( | |||
| # float(time.time() - self.start_time) / 60, meters )) | |||
| print({"type": "Accuracy", | |||
| "result": {"sequence": epoch, "category": "epoch", "value": final_acc}}) | |||
| self.logger.info( | |||
| "ch_step= {}_accuracy={:<6.4f} {}_loss={:<6.4f}".format( "test", final_acc, "test", loss)) | |||
| logging.basicConfig(format='%(asctime)s - %(filename)s[line:%(lineno)d] - %(levelname)s: %(message)s', | |||
| level=logging.INFO, | |||
| filename='./retrain.log', | |||
| filemode='a') | |||
| logger = logging.getLogger("enas-retrain") | |||
| def parse_args(): | |||
| parser = argparse.ArgumentParser() | |||
| parser.add_argument( | |||
| "--data_dir", | |||
| type=str, | |||
| default="./data", | |||
| help="Directory containing the dataset and embedding file. (default: %(default)s)") | |||
| parser.add_argument( | |||
| "--model_selected_space_path", | |||
| type=str, | |||
| default="./model_selected_space.json", | |||
| # required=True, | |||
| help="Architecture json file. (default: %(default)s)") | |||
| parser.add_argument("--result_path", type=str, | |||
| default='./result.json', help="res directory") | |||
| parser.add_argument("--search_space_path", type=str, | |||
| default='./search_space.json', help="search_space directory") | |||
| parser.add_argument("--log_path", type=str, default='output/log') | |||
| parser.add_argument( | |||
| "--best_selected_space_path", | |||
| type=str, | |||
| default="./best_selected_space.json", | |||
| # required=True, | |||
| help="Best architecture selected json file by experiment. (default: %(default)s)") | |||
| parser.add_argument( | |||
| "--best_checkpoint_dir", | |||
| type=str, | |||
| default="best_checkpoint", | |||
| help="Path for saved checkpoints. (default: %(default)s)") | |||
| parser.add_argument('--trial_id', type=int, default=0, metavar='N', | |||
help='trial id, starting from 0')
| parser.add_argument("--search_for", | |||
| choices=["macro", "micro"], | |||
| default="macro") | |||
| parser.add_argument( | |||
| "--batch_size", | |||
| type=int, | |||
| default=128, | |||
| help="Number of samples each batch for training. (default: %(default)s)") | |||
| parser.add_argument( | |||
| "--eval_batch_size", | |||
| type=int, | |||
| default=128, | |||
| help="Number of samples each batch for evaluation. (default: %(default)s)") | |||
| parser.add_argument( | |||
| "--epochs", | |||
| type=int, | |||
| default=10, | |||
| help="The number of training epochs. (default: %(default)s)") | |||
| parser.add_argument( | |||
| "--lr", | |||
| type=float, | |||
| default=0.02, | |||
| help="The initial learning rate. (default: %(default)s)") | |||
| parser.add_argument( | |||
| "--is_cuda", | |||
| type=distutils.util.strtobool, | |||
| default=True, | |||
| help="Specify the device type. (default: %(default)s)") | |||
| parser.add_argument( | |||
| "--load_checkpoint", | |||
| type=distutils.util.strtobool, | |||
| default=False, | |||
| help="Whether to load checkpoint. (default: %(default)s)") | |||
| parser.add_argument( | |||
| "--log_every", | |||
| type=int, | |||
| default=50, | |||
| help="How many steps to log. (default: %(default)s)") | |||
| parser.add_argument( | |||
| "--eval_every_epochs", | |||
| type=int, | |||
| default=1, | |||
| help="How many epochs to eval. (default: %(default)s)") | |||
| parser.add_argument( | |||
| "--child_grad_bound", | |||
| type=float, | |||
| default=5.0, | |||
| help="The threshold for gradient clipping. (default: %(default)s)") # | |||
| parser.add_argument( | |||
| "--child_l2_reg", | |||
| type=float, | |||
| default=3e-6, | |||
| help="Weight decay factor. (default: %(default)s)") | |||
| parser.add_argument( | |||
| "--child_lr_decay_scheme", | |||
| type=str, | |||
| default="cosine", | |||
| help="Learning rate annealing strategy, only 'cosine' supported. (default: %(default)s)") #todo: remove | |||
| global FLAGS | |||
| FLAGS = parser.parse_args() | |||
| # decode human readable search space to model | |||
| def convert_selected_space_format(child_fixed_arc): | |||
| # with open('./macro_selected_space.json') as js: | |||
| with open(child_fixed_arc) as js: | |||
| selected_space = json.load(js) | |||
| ops = selected_space['op_list'] | |||
| selected_space.pop('op_list') | |||
| new_selected_space = {} | |||
| for key, value in selected_space.items(): | |||
| # for macro | |||
| if FLAGS.search_for == 'macro': | |||
| new_key = key.split('_')[-1] | |||
| # for micro | |||
| elif FLAGS.search_for == 'micro': | |||
| new_key = key | |||
| if len(value) > 1 or len(value)==0: | |||
| new_value = value | |||
| elif len(value) > 0 and value[0] in ops: | |||
| new_value = ops.index(value[0]) | |||
| else: | |||
| new_value = value[0] | |||
| new_selected_space[new_key] = new_value | |||
| return new_selected_space | |||
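# Illustrative example (hypothetical keys and op names, assuming a macro space exported by the trainer):
#   {"op_list": ["ConvBranch3x3", "PoolBranchAvg"],
#    "layer_0_LayerChoice1": ["PoolBranchAvg"],
#    "layer_1_InputChoice2": [0]}
# is converted into
#   {"LayerChoice1": 1, "InputChoice2": 0}
# i.e. a single operator choice becomes its index in op_list, while other values are unwrapped or kept as lists.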
| def set_random_seed(seed): | |||
| logger.info("set random seed for data reading: {}".format(seed)) | |||
| random.seed(seed) | |||
| os.environ['PYTHONHASHSEED'] = str(seed) | |||
| np.random.seed(seed) | |||
| random.seed(seed) | |||
| torch.manual_seed(seed) | |||
| if FLAGS.is_cuda: | |||
| torch.cuda.manual_seed_all(seed) | |||
| torch.backends.cudnn.deterministic = True | |||
| def main(): | |||
| parse_args() | |||
child_fixed_arc = FLAGS.best_selected_space_path # './macro_selected_space'
| search_for = FLAGS.search_for | |||
# seed everything with the trial id for reproducibility
| set_random_seed(FLAGS.trial_id) | |||
| mkdirs(FLAGS.result_path, FLAGS.log_path, FLAGS.best_checkpoint_dir) | |||
| # define and load model | |||
logger.info('** ' + FLAGS.search_for + ' search **')
| fixed_arc = convert_selected_space_format(child_fixed_arc) | |||
| # Model, macro search or micro search | |||
| if FLAGS.search_for == 'macro': | |||
| child_model = GeneralNetwork() | |||
| elif FLAGS.search_for == 'micro': | |||
| child_model = MicroNetwork(num_layers=6, out_channels=20, num_nodes=5, dropout_rate=0.1, use_aux_heads=True) | |||
| apply_fixed_architecture(child_model, fixed_arc) | |||
| # load model | |||
| if FLAGS.load_checkpoint: | |||
| print('** Load model **') | |||
| logger.info('** Load model **') | |||
| child_model.load_state_dict(torch.load(FLAGS.best_checkpoint_dir)['child_model_state_dict']) | |||
| retrainer = EnasRetrainer(model=child_model, | |||
| data_dir = FLAGS.data_dir, | |||
| best_checkpoint_dir=FLAGS.best_checkpoint_dir, | |||
| batch_size=FLAGS.batch_size, | |||
| eval_batch_size=FLAGS.eval_batch_size, | |||
| num_epochs=FLAGS.epochs, | |||
| lr=FLAGS.lr, | |||
| is_cuda=FLAGS.is_cuda, | |||
| log_every=FLAGS.log_every, | |||
| child_grad_bound=FLAGS.child_grad_bound, | |||
| child_l2_reg=FLAGS.child_l2_reg, | |||
| eval_every_epochs=FLAGS.eval_every_epochs, | |||
| logger=logger, | |||
| result_path=FLAGS.result_path, | |||
| ) | |||
| t1 = time.time() | |||
| retrainer.train() | |||
print('cost time for retrain:', time.time() - t1)
| if __name__ == "__main__": | |||
| main() | |||
| @@ -0,0 +1,135 @@ | |||
| # Copyright (c) Microsoft Corporation. | |||
| # Licensed under the MIT license. | |||
| import sys | |||
sys.path.append('../..')
| import logging | |||
| import time | |||
| from argparse import ArgumentParser | |||
| import torch | |||
| import torch.nn as nn | |||
| import datasets | |||
| from macro import GeneralNetwork | |||
| from micro import MicroNetwork | |||
| from trainer import EnasTrainer | |||
| from mutator import EnasMutator | |||
| from pytorch.callbacks import (ArchitectureCheckpoint, | |||
| LRSchedulerCallback) | |||
| from utils import accuracy, reward_accuracy | |||
| from collections import OrderedDict | |||
| from pytorch.mutables import LayerChoice, InputChoice | |||
import json
import random
import numpy as np
| torch.cuda.set_device(4) | |||
| logger = logging.getLogger('tadl-enas') | |||
| # save search space as search_space.json | |||
| def save_nas_search_space(mutator,file_path): | |||
| result = OrderedDict() | |||
| cur_layer_idx = None | |||
| for mutable in mutator.mutables.traverse(): | |||
| if not isinstance(mutable,(LayerChoice, InputChoice)): | |||
| cur_layer_idx = mutable.key + '_' | |||
| continue | |||
| # macro | |||
| if 'layer' in cur_layer_idx: | |||
| if isinstance(mutable, LayerChoice): | |||
| if 'op_list' not in result: | |||
| result['op_list'] = [str(i) for i in mutable] | |||
| result[cur_layer_idx + mutable.key] = 'op_list' | |||
| else: | |||
| result[cur_layer_idx + mutable.key] = {'skip_connection': False if mutable.n_chosen else True, | |||
| 'n_chosen': mutable.n_chosen if mutable.n_chosen else '', | |||
| 'choose_from': mutable.choose_from if mutable.choose_from else ''} | |||
| # micro | |||
| elif 'node' in cur_layer_idx: | |||
| if isinstance(mutable,LayerChoice): | |||
| if 'op_list' not in result: | |||
| result['op_list'] = [str(i) for i in mutable] | |||
| result[mutable.key] = 'op_list' | |||
| else: | |||
| result[mutable.key] = {'skip_connection':False if mutable.n_chosen else True, | |||
| 'n_chosen': mutable.n_chosen if mutable.n_chosen else '', | |||
| 'choose_from': mutable.choose_from if mutable.choose_from else ''} | |||
| dump_global_result(file_path,result) | |||
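# Illustrative sketch of the dumped search_space.json for a macro space (hypothetical keys and op names):
#   {"op_list": ["ConvBranch3x3", "PoolBranchAvg"],
#    "layer_0_LayerChoice1": "op_list",
#    "layer_0_InputChoice2": {"skip_connection": true, "n_chosen": "", "choose_from": ["layer_0"]}}
# Every LayerChoice refers to the shared op_list; every InputChoice records whether it models a free
# skip connection (n_chosen unset) and which earlier keys it may choose from.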
| # def dump_global_result(args,global_result): | |||
| # with open(args['result_path'], "w") as ss_file: | |||
| # json.dump(global_result, ss_file, sort_keys=True, indent=2) | |||
| def dump_global_result(res_path,global_result, sort_keys = False): | |||
| with open(res_path, "w") as ss_file: | |||
| json.dump(global_result, ss_file, sort_keys=sort_keys, indent=2) | |||
| if __name__ == "__main__": | |||
| parser = ArgumentParser("enas") | |||
| parser.add_argument("--search_space_path", type=str, | |||
| default='./search_space.json', help="search_space directory") | |||
| parser.add_argument("--selected_space_path", type=str, | |||
default='./selected_space.json', help="Output path of the selected space")
| parser.add_argument("--result_path", type=str, | |||
| default='./result.json', help="res directory") | |||
| parser.add_argument('--trial_id', type=int, default=0, metavar='N', | |||
help='trial id, starting from 0')
| parser.add_argument("--batch-size", default=128, type=int) | |||
| parser.add_argument("--log-frequency", default=10, type=int) | |||
| parser.add_argument("--search_for", choices=["macro", "micro"], default="macro") | |||
| parser.add_argument("--epochs", default=None, type=int, help="Number of epochs (default: macro 310, micro 150)") | |||
| args = parser.parse_args() | |||
# set random seed
| torch.manual_seed(args.trial_id) | |||
| torch.cuda.manual_seed_all(args.trial_id) | |||
| np.random.seed(args.trial_id) | |||
| random.seed(args.trial_id) | |||
| dataset_train, dataset_valid = datasets.get_dataset("cifar10") | |||
| if args.search_for == "macro": | |||
| model = GeneralNetwork() | |||
| num_epochs = args.epochs or 310 | |||
mutator = EnasMutator(model)
| elif args.search_for == "micro": | |||
| model = MicroNetwork(num_layers=6, out_channels=20, num_nodes=5, dropout_rate=0.1, use_aux_heads=True) | |||
| num_epochs = args.epochs or 150 | |||
| mutator = EnasMutator(model, tanh_constant=1.1, cell_exit_extra_step=True) | |||
| else: | |||
| raise AssertionError | |||
# dump the whole network search space
| # args.search_spach_path = None#str(args.search_for) + str(args.search_space_path) | |||
| # print( args.search_space_path, args.search_for ) | |||
| save_nas_search_space(mutator, args.search_space_path) | |||
| criterion = nn.CrossEntropyLoss() | |||
| optimizer = torch.optim.SGD(model.parameters(), 0.05, momentum=0.9, weight_decay=1.0E-4) | |||
| lr_scheduler = torch.optim.lr_scheduler.CosineAnnealingLR(optimizer, T_max=num_epochs, eta_min=0.001) | |||
| trainer = EnasTrainer(model, | |||
| loss=criterion, | |||
| metrics=accuracy, | |||
| reward_function=reward_accuracy, | |||
| optimizer=optimizer, | |||
| callbacks=[LRSchedulerCallback(lr_scheduler)], | |||
| batch_size=args.batch_size, | |||
| num_epochs=num_epochs, | |||
| dataset_train=dataset_train, | |||
| dataset_valid=dataset_valid, | |||
| log_frequency=args.log_frequency, | |||
| mutator=mutator, | |||
| child_model_path='./'+args.search_for+'_child_model') | |||
| logger.info(trainer.metrics) | |||
| t1 = time.time() | |||
| trainer.train() | |||
| trainer.result["cost_time"] = time.time() - t1 | |||
| dump_global_result(args.result_path,trainer.result) | |||
| selected_model = trainer.export_child_model(selected_space = True) | |||
| dump_global_result(args.selected_space_path,selected_model) | |||
| @@ -0,0 +1,18 @@ | |||
| import sys | |||
| sys.path.append('../..') | |||
| from pytorch.selector import Selector | |||
| class EnasSelector(Selector): | |||
| def __init__(self, *args, single_candidate=True): | |||
| super().__init__(single_candidate) | |||
| self.args = args | |||
| def fit(self): | |||
| """ | |||
Only one candidate; nothing to fit.
| """ | |||
| pass | |||
| if __name__ == "__main__": | |||
| hpo_selector = EnasSelector() | |||
| hpo_selector.fit() | |||
| @@ -0,0 +1,436 @@ | |||
| from itertools import cycle | |||
| import os | |||
| import sys | |||
sys.path.append('../..')
| import numpy as np | |||
| import random | |||
| import logging | |||
| import time | |||
| from argparse import ArgumentParser | |||
| from collections import OrderedDict | |||
| import json | |||
| import torch | |||
| import torch.nn as nn | |||
| import torch.optim as optim | |||
| # import custom libraries | |||
| import datasets | |||
| from pytorch.trainer import Trainer | |||
| from pytorch.utils import AverageMeterGroup, to_device, mkdirs | |||
| from pytorch.mutables import LayerChoice, InputChoice, MutableScope | |||
| from macro import GeneralNetwork | |||
| from micro import MicroNetwork | |||
| # from trainer import EnasTrainer | |||
| from mutator import EnasMutator | |||
| from pytorch.callbacks import (ArchitectureCheckpoint, | |||
| LRSchedulerCallback) | |||
| from utils import accuracy, reward_accuracy | |||
| torch.cuda.set_device(0) | |||
| logging.basicConfig(format='%(asctime)s - %(filename)s[line:%(lineno)d] - %(levelname)s: %(message)s', | |||
| level=logging.INFO, | |||
| filename='./train.log', | |||
| filemode='a') | |||
| logger = logging.getLogger('enas_train') | |||
| class EnasTrainer(Trainer): | |||
| """ | |||
| ENAS trainer. | |||
| Parameters | |||
| ---------- | |||
| model : nn.Module | |||
| PyTorch model to be trained. | |||
| loss : callable | |||
| Receives logits and ground truth label, return a loss tensor. | |||
| metrics : callable | |||
| Receives logits and ground truth label, return a dict of metrics. | |||
| reward_function : callable | |||
Receives logits and ground truth label, return a tensor, which will be fed to the RL controller as the reward.
| optimizer : Optimizer | |||
| The optimizer used for optimizing the model. | |||
| num_epochs : int | |||
| Number of epochs planned for training. | |||
| dataset_train : Dataset | |||
| Dataset for training. Will be split for training weights and architecture weights. | |||
| dataset_valid : Dataset | |||
| Dataset for testing. | |||
| mutator : EnasMutator | |||
| Use when customizing your own mutator or a mutator with customized parameters. | |||
| batch_size : int | |||
| Batch size. | |||
| workers : int | |||
| Workers for data loading. | |||
| device : torch.device | |||
| ``torch.device("cpu")`` or ``torch.device("cuda")``. | |||
| log_frequency : int | |||
| Step count per logging. | |||
| callbacks : list of Callback | |||
| list of callbacks to trigger at events. | |||
| entropy_weight : float | |||
| Weight of sample entropy loss. | |||
| skip_weight : float | |||
| Weight of skip penalty loss. | |||
| baseline_decay : float | |||
| Decay factor of baseline. New baseline will be equal to ``baseline_decay * baseline_old + reward * (1 - baseline_decay)``. | |||
| child_steps : int | |||
| How many mini-batches for model training per epoch. | |||
| mutator_lr : float | |||
| Learning rate for RL controller. | |||
| mutator_steps_aggregate : int | |||
| Number of steps that will be aggregated into one mini-batch for RL controller. | |||
| mutator_steps : int | |||
| Number of mini-batches for each epoch of RL controller learning. | |||
| aux_weight : float | |||
| Weight of auxiliary head loss. ``aux_weight * aux_loss`` will be added to total loss. | |||
| test_arc_per_epoch : int | |||
| How many architectures are chosen for direct test after each epoch. | |||
| """ | |||
| def __init__(self, model, loss, metrics, reward_function, | |||
| optimizer, num_epochs, dataset_train, dataset_valid, | |||
| mutator=None, batch_size=64, workers=4, device=None, log_frequency=None, callbacks=None, | |||
| entropy_weight=0.0001, skip_weight=0.8, baseline_decay=0.999, child_steps=500, | |||
| mutator_lr=0.00035, mutator_steps_aggregate=20, mutator_steps=50, aux_weight=0.4, | |||
test_arc_per_epoch=1, child_model_path='./', result_path='./'):
| super().__init__(model, mutator if mutator is not None else EnasMutator(model), | |||
| loss, metrics, optimizer, num_epochs, dataset_train, dataset_valid, | |||
| batch_size, workers, device, log_frequency, callbacks) | |||
| self.reward_function = reward_function | |||
| self.mutator_optim = optim.Adam(self.mutator.parameters(), lr=mutator_lr) | |||
| self.batch_size = batch_size | |||
| self.workers = workers | |||
| self.entropy_weight = entropy_weight | |||
| self.skip_weight = skip_weight | |||
| self.baseline_decay = baseline_decay | |||
| self.baseline = 0. | |||
| self.mutator_steps_aggregate = mutator_steps_aggregate | |||
| self.mutator_steps = mutator_steps | |||
| self.child_steps = child_steps | |||
| self.aux_weight = aux_weight | |||
| self.test_arc_per_epoch = test_arc_per_epoch | |||
| self.child_model_path = child_model_path # saving the child model | |||
| self.init_dataloader() | |||
| # self.result = {'accuracy':[], | |||
| # 'cost_time':0} | |||
| self.result_path = result_path | |||
| with open(self.result_path, "w") as file: | |||
| file.write('') | |||
| def init_dataloader(self): | |||
| n_train = len(self.dataset_train) | |||
| split = n_train // 10 | |||
| indices = list(range(n_train)) | |||
| train_sampler = torch.utils.data.sampler.SubsetRandomSampler(indices[:-split]) | |||
| valid_sampler = torch.utils.data.sampler.SubsetRandomSampler(indices[-split:]) | |||
| self.train_loader = torch.utils.data.DataLoader(self.dataset_train, | |||
| batch_size=self.batch_size, | |||
| sampler=train_sampler, | |||
| num_workers=self.workers) | |||
| self.valid_loader = torch.utils.data.DataLoader(self.dataset_train, | |||
| batch_size=self.batch_size, | |||
| sampler=valid_sampler, | |||
| num_workers=self.workers) | |||
| self.test_loader = torch.utils.data.DataLoader(self.dataset_valid, | |||
| batch_size=self.batch_size, | |||
| num_workers=self.workers) | |||
| self.train_loader = cycle(self.train_loader) | |||
| self.valid_loader = cycle(self.valid_loader) | |||
| def train_one_epoch(self, epoch): | |||
| # Sample model and train | |||
| self.model.train() | |||
| self.mutator.eval() | |||
| meters = AverageMeterGroup() | |||
| for step in range(1, self.child_steps + 1): | |||
| x, y = next(self.train_loader) | |||
| x, y = to_device(x, self.device), to_device(y, self.device) | |||
| self.optimizer.zero_grad() | |||
| with torch.no_grad(): | |||
| self.mutator.reset() | |||
| # self._write_graph_status() | |||
| logits = self.model(x) | |||
| if isinstance(logits, tuple): | |||
| logits, aux_logits = logits | |||
| aux_loss = self.loss(aux_logits, y) | |||
| else: | |||
| aux_loss = 0. | |||
| metrics = self.metrics(logits, y) | |||
| loss = self.loss(logits, y) | |||
| loss = loss + self.aux_weight * aux_loss | |||
| loss.backward() | |||
| nn.utils.clip_grad_norm_(self.model.parameters(), 5.) | |||
| self.optimizer.step() | |||
| metrics["loss"] = loss.item() | |||
| meters.update(metrics) | |||
| if self.log_frequency is not None and step % self.log_frequency == 0: | |||
| logger.info("Model Epoch [%d/%d] Step [%d/%d] %s", epoch + 1, | |||
| self.num_epochs, step, self.child_steps, meters) | |||
| # Train sampler (mutator) | |||
| self.model.eval() | |||
| self.mutator.train() | |||
| meters = AverageMeterGroup() | |||
| for mutator_step in range(1, self.mutator_steps + 1): | |||
| self.mutator_optim.zero_grad() | |||
| for step in range(1, self.mutator_steps_aggregate + 1): | |||
| x, y = next(self.valid_loader) | |||
| x, y = to_device(x, self.device), to_device(y, self.device) | |||
| self.mutator.reset() | |||
| with torch.no_grad(): | |||
| logits = self.model(x) | |||
| # self._write_graph_status() | |||
| metrics = self.metrics(logits, y) | |||
| reward = self.reward_function(logits, y) | |||
| if self.entropy_weight: | |||
| reward += self.entropy_weight * self.mutator.sample_entropy.item() | |||
| self.baseline = self.baseline * self.baseline_decay + reward * (1 - self.baseline_decay) | |||
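# The baseline is an exponential moving average of rewards; with baseline_decay=0.999 each reward
# moves it by 0.1%, and (reward - baseline) acts as the advantage in the policy-gradient loss below.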
| loss = self.mutator.sample_log_prob * (reward - self.baseline) | |||
| if self.skip_weight: | |||
| loss += self.skip_weight * self.mutator.sample_skip_penalty | |||
| metrics["reward"] = reward | |||
| metrics["loss"] = loss.item() | |||
| metrics["ent"] = self.mutator.sample_entropy.item() | |||
| metrics["log_prob"] = self.mutator.sample_log_prob.item() | |||
| metrics["baseline"] = self.baseline | |||
| metrics["skip"] = self.mutator.sample_skip_penalty | |||
| loss /= self.mutator_steps_aggregate | |||
| loss.backward() | |||
| meters.update(metrics) | |||
| cur_step = step + (mutator_step - 1) * self.mutator_steps_aggregate | |||
| if self.log_frequency is not None and cur_step % self.log_frequency == 0: | |||
| logger.info("RL Epoch [%d/%d] Step [%d/%d] [%d/%d] %s", epoch + 1, self.num_epochs, | |||
| mutator_step, self.mutator_steps, step, self.mutator_steps_aggregate, | |||
| meters) | |||
| nn.utils.clip_grad_norm_(self.mutator.parameters(), 5.) | |||
| self.mutator_optim.step() | |||
| def validate_one_epoch(self, epoch): | |||
| with torch.no_grad(): | |||
| accuracy = 0 | |||
| for arc_id in range(self.test_arc_per_epoch): | |||
| meters = AverageMeterGroup() | |||
| count, acc_this_round = 0,0 | |||
| for x, y in self.test_loader: | |||
| x, y = to_device(x, self.device), to_device(y, self.device) | |||
| self.mutator.reset() | |||
| child_model = self.export_child_model() | |||
| # self._generate_child_model(epoch, | |||
| # count, | |||
| # arc_id, | |||
| # child_model, | |||
| # self.child_model_path) | |||
| logits = self.model(x) | |||
| if isinstance(logits, tuple): | |||
| logits, _ = logits | |||
| metrics = self.metrics(logits, y) | |||
| loss = self.loss(logits, y) | |||
| metrics["loss"] = loss.item() | |||
| meters.update(metrics) | |||
| count += 1 | |||
| acc_this_round += metrics['acc1'] | |||
| logger.info("Test Epoch [%d/%d] Arc [%d/%d] Summary %s", | |||
| epoch + 1, self.num_epochs, arc_id + 1, self.test_arc_per_epoch, | |||
| meters.summary()) | |||
| acc_this_round /= count | |||
| accuracy += acc_this_round | |||
| # logger.info({"type": "Accuracy", "result": {"sequence": epoch, "category": "epoch", "value": meters.get_last_acc()}}) | |||
| print({"type": "Accuracy", "result": {"sequence": epoch, "category": "epoch", "value": meters.get_last_acc()}}) | |||
| with open(self.result_path, "a") as file: | |||
| file.write(str({"type": "Accuracy", "result": {"sequence": epoch, "category": "epoch", | |||
| "value": meters.get_last_acc()}}) + '\n') | |||
| # self.result['accuracy'].append(accuracy / self.test_arc_per_epoch) | |||
| # export child_model | |||
| def export_child_model(self, selected_space=False): | |||
| if selected_space: | |||
| sampled = self.mutator.sample_final() | |||
| else: | |||
| sampled = self.mutator._cache | |||
| result = OrderedDict() | |||
| cur_layer_id = None | |||
| for mutable in self.mutator.mutables: | |||
| if not isinstance(mutable, (LayerChoice, InputChoice)): | |||
| cur_layer_id = mutable.key | |||
| # not supported as built-in | |||
| continue | |||
| choosed_ops_idx = self.mutator._convert_mutable_decision_to_human_readable(mutable, sampled[mutable.key]) | |||
| if not isinstance(choosed_ops_idx, list): | |||
| choosed_ops_idx = [choosed_ops_idx] | |||
| if isinstance(mutable, LayerChoice): | |||
| if 'op_list' not in result: | |||
| result['op_list'] = [str(i) for i in mutable] | |||
| choosed_ops = [str(mutable[idx]) for idx in choosed_ops_idx] | |||
| else: | |||
| choosed_ops = choosed_ops_idx | |||
| if 'node' in cur_layer_id: | |||
| result[mutable.key] = choosed_ops | |||
| else: | |||
| result[cur_layer_id + '_' + mutable.key] = choosed_ops | |||
| return result | |||
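# Illustrative output (hypothetical keys and op names): for a macro space this returns something like
#   {"op_list": ["ConvBranch3x3", "PoolBranchAvg"],
#    "layer_0_LayerChoice1": ["PoolBranchAvg"],
#    "layer_1_InputChoice2": [0]}
# while micro (node) mutables are written without the cell prefix, using the mutable key directly.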
| def _generate_child_model(self, | |||
| validation_epoch, | |||
| model_idx, | |||
| validation_step, | |||
| child_model, | |||
| file_path): | |||
| # create child_models folder | |||
| # parent_path = os.path.join(file_path, 'child_model') | |||
| parent_path = file_path | |||
| if not os.path.exists(parent_path): | |||
| os.mkdir(parent_path) | |||
| # create secondary directory | |||
| secondary_path = os.path.join(parent_path, 'validation_epoch_{}'.format(validation_epoch)) | |||
| if not os.path.exists(secondary_path): | |||
| os.mkdir(secondary_path) | |||
| # create third directory | |||
| folder_path = os.path.join(secondary_path, 'validation_step_{}'.format(validation_step)) | |||
| if not os.path.exists(folder_path): | |||
| os.mkdir(folder_path) | |||
| # save sampled child_model for validation | |||
| saved_path = os.path.join(folder_path, "child_model_%02d.json" % model_idx) | |||
| with open(saved_path, "w") as ss_file: | |||
| json.dump(child_model, ss_file, indent=2) | |||
| # save search space as search_space.json | |||
| def save_nas_search_space(mutator,file_path): | |||
| result = OrderedDict() | |||
| cur_layer_idx = None | |||
| for mutable in mutator.mutables.traverse(): | |||
| if not isinstance(mutable,(LayerChoice, InputChoice)): | |||
| cur_layer_idx = mutable.key + '_' | |||
| continue | |||
| # macro | |||
| if 'layer' in cur_layer_idx: | |||
| if isinstance(mutable, LayerChoice): | |||
| if 'op_list' not in result: | |||
| result['op_list'] = [str(i) for i in mutable] | |||
| result[cur_layer_idx + mutable.key] = 'op_list' | |||
| else: | |||
| result[cur_layer_idx + mutable.key] = {'skip_connection': False if mutable.n_chosen else True, | |||
| 'n_chosen': mutable.n_chosen if mutable.n_chosen else '', | |||
| 'choose_from': mutable.choose_from if mutable.choose_from else ''} | |||
| # micro | |||
| elif 'node' in cur_layer_idx: | |||
| if isinstance(mutable,LayerChoice): | |||
| if 'op_list' not in result: | |||
| result['op_list'] = [str(i) for i in mutable] | |||
| result[mutable.key] = 'op_list' | |||
| else: | |||
| result[mutable.key] = {'skip_connection':False if mutable.n_chosen else True, | |||
| 'n_chosen': mutable.n_chosen if mutable.n_chosen else '', | |||
| 'choose_from': mutable.choose_from if mutable.choose_from else ''} | |||
| dump_global_result(file_path,result) | |||
| # def dump_global_result(args,global_result): | |||
| # with open(args['result_path'], "w") as ss_file: | |||
| # json.dump(global_result, ss_file, sort_keys=True, indent=2) | |||
| def dump_global_result(res_path,global_result, sort_keys = False): | |||
| with open(res_path, "w") as ss_file: | |||
| json.dump(global_result, ss_file, sort_keys=sort_keys, indent=2) | |||
| if __name__ == "__main__": | |||
| parser = ArgumentParser("enas") | |||
| parser.add_argument( | |||
| "--data_dir", | |||
| type=str, | |||
| default="./data", | |||
| help="Directory containing the dataset and embedding file. (default: %(default)s)") | |||
| parser.add_argument("--model_selected_space_path", type=str, | |||
default='./model_selected_space.json', help="Output path of the selected space")
| parser.add_argument("--result_path", type=str, | |||
| default='./model_result.json', help="res directory") | |||
| parser.add_argument("--search_space_path", type=str, | |||
| default='./search_space.json', help="search_space directory") | |||
| parser.add_argument("--best_selected_space_path", type=str, | |||
default='./model_selected_space.json', help="Output path of the best selected space of the experiment")
| parser.add_argument('--trial_id', type=int, default=0, metavar='N', | |||
help='trial id, starting from 0')
| parser.add_argument('--lr', type=float, default=0.005, metavar='N', | |||
| help='learning rate') | |||
| parser.add_argument("--epochs", default=None, type=int, help="Number of epochs (default: macro 310, micro 150)") | |||
| parser.add_argument("--batch_size", default=128, type=int) | |||
| parser.add_argument("--log_frequency", default=10, type=int) | |||
| parser.add_argument("--search_for", choices=["macro", "micro"], default="macro") | |||
| args = parser.parse_args() | |||
| mkdirs(args.result_path, args.search_space_path, args.best_selected_space_path) | |||
# set random seed
| torch.manual_seed(args.trial_id) | |||
| torch.cuda.manual_seed_all(args.trial_id) | |||
| np.random.seed(args.trial_id) | |||
| random.seed(args.trial_id) | |||
# use deterministic instead of nondeterministic algorithms to
# make sure exact results can be reproduced every time.
| torch.backends.cudnn.deterministic = True | |||
| dataset_train, dataset_valid = datasets.get_dataset("cifar10", args.data_dir) | |||
| if args.search_for == "macro": | |||
| model = GeneralNetwork() | |||
| num_epochs = args.epochs or 310 | |||
mutator = EnasMutator(model)
| elif args.search_for == "micro": | |||
| model = MicroNetwork(num_layers=6, out_channels=20, num_nodes=5, dropout_rate=0.1, use_aux_heads=True) | |||
| num_epochs = args.epochs or 150 | |||
| mutator = EnasMutator(model, tanh_constant=1.1, cell_exit_extra_step=True) | |||
| else: | |||
| raise AssertionError | |||
# dump the whole network search space
| # args.search_spach_path = None#str(args.search_for) + str(args.search_space_path) | |||
| # print( args.search_space_path, args.search_for ) | |||
| save_nas_search_space(mutator, args.search_space_path) | |||
| criterion = nn.CrossEntropyLoss() | |||
| optimizer = torch.optim.SGD(model.parameters(), 0.05, momentum=0.9, weight_decay=1.0E-4) | |||
| lr_scheduler = torch.optim.lr_scheduler.CosineAnnealingLR(optimizer, T_max=num_epochs, eta_min=0.001) | |||
| trainer = EnasTrainer(model, | |||
| loss=criterion, | |||
| metrics=accuracy, | |||
| reward_function=reward_accuracy, | |||
| optimizer=optimizer, | |||
| callbacks=[LRSchedulerCallback(lr_scheduler), ArchitectureCheckpoint("./"+args.search_for+"_checkpoints")], | |||
| batch_size=args.batch_size, | |||
| num_epochs=num_epochs, | |||
| dataset_train=dataset_train, | |||
| dataset_valid=dataset_valid, | |||
| log_frequency=args.log_frequency, | |||
| mutator=mutator, | |||
| child_steps=2, | |||
| mutator_steps=2, | |||
| child_model_path='./'+args.search_for+'_child_model', | |||
| result_path=args.result_path) | |||
| logger.info(trainer.metrics) | |||
| t1 = time.time() | |||
| trainer.train() | |||
| # trainer.result["cost_time"] = time.time() - t1 | |||
| # dump_global_result(args.result_path,trainer.result) | |||
| selected_model = trainer.export_child_model(selected_space = True) | |||
| dump_global_result(args.best_selected_space_path,selected_model) | |||
| @@ -0,0 +1,30 @@ | |||
| # Copyright (c) Microsoft Corporation. | |||
| # Licensed under the MIT license. | |||
| import torch | |||
| def accuracy(output, target, topk=(1,)): | |||
| """ Computes the precision@k for the specified values of k """ | |||
| maxk = max(topk) | |||
| batch_size = target.size(0) | |||
| _, pred = output.topk(maxk, 1, True, True) | |||
| pred = pred.t() | |||
| # one-hot case | |||
| if target.ndimension() > 1: | |||
| target = target.max(1)[1] | |||
| correct = pred.eq(target.view(1, -1).expand_as(pred)) | |||
| res = dict() | |||
| for k in topk: | |||
correct_k = correct[:k].reshape(-1).float().sum(0)
| res["acc{}".format(k)] = correct_k.mul_(1.0 / batch_size).item() | |||
| return res | |||
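# For example (illustrative values): accuracy(logits, target, topk=(1, 5)) returns a dict such as
# {"acc1": 0.78, "acc5": 0.95}, i.e. top-1 and top-5 accuracy averaged over the batch.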
| def reward_accuracy(output, target, topk=(1,)): | |||
| batch_size = target.size(0) | |||
| _, predicted = torch.max(output.data, 1) | |||
| return (predicted == target).sum().item() / batch_size | |||
| @@ -0,0 +1,141 @@ | |||
| # Copyright (c) Microsoft Corporation. | |||
| # Licensed under the MIT license. | |||
| import json | |||
| import logging | |||
| from .mutables import InputChoice, LayerChoice, MutableScope | |||
| from .mutator import Mutator | |||
| from .utils import to_list | |||
| _logger = logging.getLogger(__name__) | |||
| _logger.setLevel(logging.INFO) | |||
| class FixedArchitecture(Mutator): | |||
| """ | |||
| Fixed architecture mutator that always selects a certain graph. | |||
| Parameters | |||
| ---------- | |||
| model : nn.Module | |||
| A mutable network. | |||
| fixed_arc : dict | |||
| Preloaded architecture object. | |||
| strict : bool | |||
| Force everything that appears in ``fixed_arc`` to be used at least once. | |||
| """ | |||
| def __init__(self, model, fixed_arc, strict=True): | |||
| super().__init__(model) | |||
| self._fixed_arc = fixed_arc | |||
| mutable_keys = set([mutable.key for mutable in self.mutables if not isinstance(mutable, MutableScope)]) | |||
| fixed_arc_keys = set(self._fixed_arc.keys()) | |||
| if fixed_arc_keys - mutable_keys: | |||
| raise RuntimeError("Unexpected keys found in fixed architecture: {}.".format(fixed_arc_keys - mutable_keys)) | |||
| if mutable_keys - fixed_arc_keys: | |||
| raise RuntimeError("Missing keys in fixed architecture: {}.".format(mutable_keys - fixed_arc_keys)) | |||
| self._fixed_arc = self._from_human_readable_architecture(self._fixed_arc) | |||
| def _from_human_readable_architecture(self, human_arc): | |||
| # convert from an exported architecture | |||
| result_arc = {k: to_list(v) for k, v in human_arc.items()} # there could be tensors, numpy arrays, etc. | |||
| # First, convert non-list to list, because there could be {"op1": 0} or {"op1": "conv"}, | |||
| # which means {"op1": [0, ]} ir {"op1": ["conv", ]} | |||
| result_arc = {k: v if isinstance(v, list) else [v] for k, v in result_arc.items()} | |||
| # Second, infer which ones are multi-hot arrays and which ones are in human-readable format. | |||
# This is non-trivial, since if an array is [0, 1], we cannot know for sure whether it means [false, true] or [true, true].
# Here, we assume a multi-hot array has to be a boolean array or a float array whose length matches the number of candidates.
| for mutable in self.mutables: | |||
| if mutable.key not in result_arc: | |||
| continue # skip silently | |||
| choice_arr = result_arc[mutable.key] | |||
| if all(isinstance(v, bool) for v in choice_arr) or all(isinstance(v, float) for v in choice_arr): | |||
| if (isinstance(mutable, LayerChoice) and len(mutable) == len(choice_arr)) or \ | |||
| (isinstance(mutable, InputChoice) and mutable.n_candidates == len(choice_arr)): | |||
| # multihot, do nothing | |||
| continue | |||
| if isinstance(mutable, LayerChoice): | |||
| choice_arr = [mutable.names.index(val) if isinstance(val, str) else val for val in choice_arr] | |||
| choice_arr = [i in choice_arr for i in range(len(mutable))] | |||
| elif isinstance(mutable, InputChoice): | |||
| choice_arr = [mutable.choose_from.index(val) if isinstance(val, str) else val for val in choice_arr] | |||
| choice_arr = [i in choice_arr for i in range(mutable.n_candidates)] | |||
| result_arc[mutable.key] = choice_arr | |||
| return result_arc | |||
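# Illustrative conversion (hypothetical keys): given a LayerChoice "op1" with candidates
# ["conv3x3", "conv5x5"] and an InputChoice "skip1" with 3 candidates,
#   {"op1": "conv3x3", "skip1": [0, 2]}
# becomes
#   {"op1": [True, False], "skip1": [True, False, True]}
# i.e. every decision ends up as a multi-hot mask over candidates.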
| def sample_search(self): | |||
| """ | |||
| Always returns the fixed architecture. | |||
| """ | |||
| return self._fixed_arc | |||
| def sample_final(self): | |||
| """ | |||
| Always returns the fixed architecture. | |||
| """ | |||
| return self._fixed_arc | |||
| def replace_layer_choice(self, module=None, prefix=""): | |||
| """ | |||
Replace layer choices with the selected candidate modules, on a best-effort basis.
In case of weighted or multiple choices, candidates weighted with zero are deleted.
In case of a single choice, the layer choice is replaced with a normal module (the chosen candidate).
| Parameters | |||
| ---------- | |||
| module : nn.Module | |||
| Module to be processed. | |||
| prefix : str | |||
| Module name under global namespace. | |||
| """ | |||
| if module is None: | |||
| module = self.model | |||
| for name, mutable in module.named_children(): | |||
| global_name = (prefix + "." if prefix else "") + name | |||
| if isinstance(mutable, LayerChoice): | |||
| chosen = self._fixed_arc[mutable.key] | |||
| if sum(chosen) == 1 and max(chosen) == 1 and not mutable.return_mask: | |||
| # sum is one, max is one, there has to be an only one | |||
| # this is compatible with both integer arrays, boolean arrays and float arrays | |||
| _logger.info("Replacing %s with candidate number %d.", global_name, chosen.index(1)) | |||
| setattr(module, name, mutable[chosen.index(1)]) | |||
| else: | |||
| if mutable.return_mask: | |||
_logger.info("`return_mask` flag of %s is true. As it relies on the behavior of LayerChoice, " \
"LayerChoice will not be replaced.", global_name)
| # remove unused parameters | |||
| for ch, n in zip(chosen, mutable.names): | |||
| if ch == 0 and not isinstance(ch, float): | |||
| setattr(mutable, n, None) | |||
| else: | |||
| self.replace_layer_choice(mutable, global_name) | |||
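# Illustrative effect (hypothetical key): if self._fixed_arc["op1"] == [False, True, False] for a
# 3-candidate LayerChoice named "op1", the whole LayerChoice is replaced by its second candidate,
# so the fixed model no longer carries parameters of the unused candidates.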
| def apply_fixed_architecture(model, fixed_arc): | |||
| """ | |||
| Load architecture from `fixed_arc` and apply to model. | |||
| Parameters | |||
| ---------- | |||
| model : torch.nn.Module | |||
| Model with mutables. | |||
| fixed_arc : str or dict | |||
| Path to the JSON that stores the architecture, or dict that stores the exported architecture. | |||
| Returns | |||
| ------- | |||
| FixedArchitecture | |||
Mutator that is responsible for fixing the graph.
| """ | |||
| if isinstance(fixed_arc, str): | |||
| with open(fixed_arc) as f: | |||
| fixed_arc = json.load(f) | |||
| architecture = FixedArchitecture(model, fixed_arc) | |||
| architecture.reset() | |||
# for the convenience of parameter counting
| architecture.replace_layer_choice() | |||
| return architecture | |||
| @@ -0,0 +1,79 @@ | |||
| # -*- coding: utf-8 -*- | |||
| from datetime import datetime | |||
| from io import TextIOBase | |||
| import logging | |||
| from logging import FileHandler, Formatter, Handler, StreamHandler | |||
| from pathlib import Path | |||
| import sys | |||
| import time | |||
| from typing import Optional | |||
| time_format = '%Y-%m-%d %H:%M:%S' | |||
| formatter = Formatter( | |||
| '[%(asctime)s] %(levelname)s (%(name)s/%(threadName)s) %(message)s', | |||
| time_format | |||
| ) | |||
| def init_logger() -> None: | |||
| _setup_root_logger(StreamHandler(sys.stdout), logging.INFO) | |||
| logging.basicConfig() | |||
| def _prepare_log_dir(path: Optional[str]) -> Path: | |||
| if path is None: | |||
| return Path() | |||
| ret = Path(path) | |||
| ret.mkdir(parents=True, exist_ok=True) | |||
| return ret | |||
| def _setup_root_logger(handler: Handler, level: int) -> None: | |||
| _setup_logger('tadl', handler, level) | |||
| def _setup_logger(name: str, handler: Handler, level: int) -> None: | |||
| handler.setFormatter(formatter) | |||
| logger = logging.getLogger(name) | |||
| logger.addHandler(handler) | |||
| logger.setLevel(level) | |||
| logger.propagate = False | |||
| class _LogFileWrapper(TextIOBase): | |||
| # wrap the logger file so that anything written to it will automatically get formatted | |||
| def __init__(self, log_file: TextIOBase): | |||
| self.file: TextIOBase = log_file | |||
| self.line_buffer: Optional[str] = None | |||
| self.line_start_time: Optional[datetime] = None | |||
| def write(self, s: str) -> int: | |||
| cur_time = datetime.now() | |||
| if self.line_buffer and (cur_time - self.line_start_time).total_seconds() > 0.1: | |||
| self.flush() | |||
| if self.line_buffer: | |||
| self.line_buffer += s | |||
| else: | |||
| self.line_buffer = s | |||
| self.line_start_time = cur_time | |||
| if '\n' not in s: | |||
| return len(s) | |||
| time_str = cur_time.strftime(time_format) | |||
| lines = self.line_buffer.split('\n') | |||
| for line in lines[:-1]: | |||
| self.file.write(f'[{time_str}] PRINT {line}\n') | |||
| self.file.flush() | |||
| self.line_buffer = lines[-1] | |||
| self.line_start_time = cur_time | |||
| return len(s) | |||
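# Illustrative behaviour: two writes "loading" and " done\n" arriving within 0.1 seconds are merged
# and emitted as a single "[<timestamp>] PRINT loading done" line; a partial line older than 0.1
# seconds is flushed on its own first.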
| def flush(self) -> None: | |||
| if self.line_buffer: | |||
| time_str = self.line_start_time.strftime(time_format) | |||
| self.file.write(f'[{time_str}] PRINT {self.line_buffer}\n') | |||
| self.file.flush() | |||
| self.line_buffer = None | |||
| @@ -0,0 +1,340 @@ | |||
| # Copyright (c) Microsoft Corporation. | |||
| # Licensed under the MIT license. | |||
| import logging | |||
| import warnings | |||
| from collections import OrderedDict | |||
| import torch.nn as nn | |||
| from .utils import global_mutable_counting | |||
| logger = logging.getLogger(__name__) | |||
| logger.setLevel(logging.INFO) | |||
| class Mutable(nn.Module): | |||
| """ | |||
| Mutable is designed to function as a normal layer, with all necessary operators' weights. | |||
| States and weights of architectures should be included in mutator, instead of the layer itself. | |||
| Mutable has a key, which marks the identity of the mutable. This key can be used by users to share | |||
| decisions among different mutables. In mutator's implementation, mutators should use the key to | |||
| distinguish different mutables. Mutables that share the same key should be "similar" to each other. | |||
Currently the default scope for keys is global. By default, the key uses a global counter starting from 1 to
produce unique ids.
| Parameters | |||
| ---------- | |||
| key : str | |||
| The key of mutable. | |||
| Notes | |||
| ----- | |||
| The counter is program level, but mutables are model level. In case multiple models are defined, and | |||
| you want to have `counter` starting from 1 in the second model, it's recommended to assign keys manually | |||
| instead of using automatic keys. | |||
| """ | |||
| def __init__(self, key=None): | |||
| super().__init__() | |||
| if key is not None: | |||
| if not isinstance(key, str): | |||
| key = str(key) | |||
| logger.warning("Warning: key \"%s\" is not string, converted to string.", key) | |||
| self._key = key | |||
| else: | |||
| self._key = self.__class__.__name__ + str(global_mutable_counting()) | |||
| self.init_hook = self.forward_hook = None | |||
| def __deepcopy__(self, memodict=None): | |||
| raise NotImplementedError("Deep copy doesn't work for mutables.") | |||
| def __call__(self, *args, **kwargs): | |||
| self._check_built() | |||
| return super().__call__(*args, **kwargs) | |||
| def set_mutator(self, mutator): | |||
| if "mutator" in self.__dict__: | |||
| raise RuntimeError("`set_mutator` is called more than once. Did you parse the search space multiple times? " | |||
| "Or did you apply multiple fixed architectures?") | |||
| self.__dict__["mutator"] = mutator | |||
| @property | |||
| def key(self): | |||
| """ | |||
| Read-only property of key. | |||
| """ | |||
| return self._key | |||
| @property | |||
| def name(self): | |||
| """ | |||
| After the search space is parsed, it will be the module name of the mutable. | |||
| """ | |||
return self._name if hasattr(self, "_name") else self._key
| @name.setter | |||
| def name(self, name): | |||
| self._name = name | |||
| def _check_built(self): | |||
| if not hasattr(self, "mutator"): | |||
| raise ValueError( | |||
| "Mutator not set for {}. You might have forgotten to initialize and apply your mutator. " | |||
| "Or did you initialize a mutable on the fly in forward pass? Move to `__init__` " | |||
| "so that trainer can locate all your mutables. See NNI docs for more details.".format(self)) | |||
| class MutableScope(Mutable): | |||
| """ | |||
| Mutable scope marks a subgraph/submodule to help mutators make better decisions. | |||
| If not annotated with mutable scope, search space will be flattened as a list. However, some mutators might | |||
| need to leverage the concept of a "cell". So if a module is defined as a mutable scope, everything in it will | |||
| look like "sub-search-space" in the scope. Scopes can be nested. | |||
| There are two ways mutators can use mutable scope. One is to traverse the search space as a tree during initialization | |||
| and reset. The other is to implement `enter_mutable_scope` and `exit_mutable_scope`. They are called before and after | |||
| the forward method of the class inheriting mutable scope. | |||
| Mutable scopes are also mutables that are listed in the mutator.mutables (search space), but they are not supposed | |||
| to appear in the dict of choices. | |||
| Parameters | |||
| ---------- | |||
| key : str | |||
| Key of mutable scope. | |||
| """ | |||
| def __init__(self, key): | |||
| super().__init__(key=key) | |||
| def __call__(self, *args, **kwargs): | |||
| try: | |||
| self._check_built() | |||
| self.mutator.enter_mutable_scope(self) | |||
| return super().__call__(*args, **kwargs) | |||
| finally: | |||
| self.mutator.exit_mutable_scope(self) | |||
| class LayerChoice(Mutable): | |||
| """ | |||
Layer choice selects one of the ``op_candidates``, then applies it to inputs and returns the result.
| In rare cases, it can also select zero or many. | |||
| Layer choice does not allow itself to be nested. | |||
| Parameters | |||
| ---------- | |||
| op_candidates : list of nn.Module or OrderedDict | |||
| A module list to be selected from. | |||
| reduction : str | |||
| ``mean``, ``concat``, ``sum`` or ``none``. Policy if multiples are selected. | |||
| If ``none``, a list is returned. ``mean`` returns the average. ``sum`` returns the sum. | |||
``concat`` concatenates the list at dimension 1.
| return_mask : bool | |||
| If ``return_mask``, return output tensor and a mask. Otherwise return tensor only. | |||
| key : str | |||
| Key of the input choice. | |||
| Attributes | |||
| ---------- | |||
| length : int | |||
| Deprecated. Number of ops to choose from. ``len(layer_choice)`` is recommended. | |||
| names : list of str | |||
| Names of candidates. | |||
| choices : list of Module | |||
| Deprecated. A list of all candidate modules in the layer choice module. | |||
| ``list(layer_choice)`` is recommended, which will serve the same purpose. | |||
| Notes | |||
| ----- | |||
``op_candidates`` can be a list of modules or an ordered dict of named modules, for example,
| .. code-block:: python | |||
| self.op_choice = LayerChoice(OrderedDict([ | |||
| ("conv3x3", nn.Conv2d(3, 16, 128)), | |||
| ("conv5x5", nn.Conv2d(5, 16, 128)), | |||
| ("conv7x7", nn.Conv2d(7, 16, 128)) | |||
| ])) | |||
| Elements in layer choice can be modified or deleted. Use ``del self.op_choice["conv5x5"]`` or | |||
| ``self.op_choice[1] = nn.Conv3d(...)``. Adding more choices is not supported yet. | |||
| """ | |||
| def __init__(self, op_candidates, reduction="sum", return_mask=False, key=None): | |||
| super().__init__(key=key) | |||
| self.names = [] | |||
| if isinstance(op_candidates, OrderedDict): | |||
| for name, module in op_candidates.items(): | |||
| assert name not in ["length", "reduction", "return_mask", "_key", "key", "names"], \ | |||
| "Please don't use a reserved name '{}' for your module.".format(name) | |||
| self.add_module(name, module) | |||
| self.names.append(name) | |||
| elif isinstance(op_candidates, list): | |||
| for i, module in enumerate(op_candidates): | |||
| self.add_module(str(i), module) | |||
| self.names.append(str(i)) | |||
| else: | |||
| raise TypeError("Unsupported op_candidates type: {}".format(type(op_candidates))) | |||
| self.reduction = reduction | |||
| self.return_mask = return_mask | |||
| def __getitem__(self, idx): | |||
| if isinstance(idx, str): | |||
| return self._modules[idx] | |||
| return list(self)[idx] | |||
| def __setitem__(self, idx, module): | |||
| key = idx if isinstance(idx, str) else self.names[idx] | |||
| return setattr(self, key, module) | |||
| def __delitem__(self, idx): | |||
| if isinstance(idx, slice): | |||
| for key in self.names[idx]: | |||
| delattr(self, key) | |||
| else: | |||
| if isinstance(idx, str): | |||
| key, idx = idx, self.names.index(idx) | |||
| else: | |||
| key = self.names[idx] | |||
| delattr(self, key) | |||
| del self.names[idx] | |||
| @property | |||
| def length(self): | |||
| warnings.warn("layer_choice.length is deprecated. Use `len(layer_choice)` instead.", DeprecationWarning) | |||
| return len(self) | |||
| def __len__(self): | |||
| return len(self.names) | |||
| def __iter__(self): | |||
| return map(lambda name: self._modules[name], self.names) | |||
| @property | |||
| def choices(self): | |||
| warnings.warn("layer_choice.choices is deprecated. Use `list(layer_choice)` instead.", DeprecationWarning) | |||
| return list(self) | |||
| def forward(self, *args, **kwargs): | |||
| """ | |||
| Returns | |||
| ------- | |||
| tuple of tensors | |||
| Output and selection mask. If ``return_mask`` is ``False``, only output is returned. | |||
| """ | |||
| out, mask = self.mutator.on_forward_layer_choice(self, *args, **kwargs) | |||
| if self.return_mask: | |||
| return out, mask | |||
| return out | |||
| class InputChoice(Mutable): | |||
| """ | |||
| Input choice selects ``n_chosen`` inputs from ``choose_from`` (contains ``n_candidates`` keys). For beginners, | |||
| using ``n_candidates`` instead of ``choose_from`` is a safe option. To get the most power out of it, you might want to | |||
| know about ``choose_from``. | |||
| The keys in ``choose_from`` can be keys that appear in past mutables, or ``NO_KEY`` if there are no suitable ones. | |||
| The keys are designed to be the keys of the sources. To help mutators make better decisions, | |||
| mutators might be interested in how the tensors to choose from come into place. For example, the tensor is the | |||
| output of some operator, some node, some cell, or some module. If this operator happens to be a mutable (e.g., | |||
| ``LayerChoice`` or ``InputChoice``), it has a key naturally that can be used as a source key. If it's a | |||
| module/submodule, it needs to be annotated with a key: that's where a :class:`MutableScope` is needed. | |||
| In the example below, ``input_choice`` is a 4-choose-any. The first three are semantically the output of cell1, | |||
| the output of cell2, and the output of op, respectively. Notice that an extra max pooling follows cell1, | |||
| indicating that x1 is not "actually" the direct output of cell1. | |||
| .. code-block:: python | |||
| class Cell(MutableScope): | |||
| pass | |||
| class Net(nn.Module): | |||
| def __init__(self): | |||
| super().__init__() | |||
| self.cell1 = Cell("cell1") | |||
| self.cell2 = Cell("cell2") | |||
| self.op = LayerChoice([conv3x3(), conv5x5()], key="op") | |||
| self.input_choice = InputChoice(choose_from=["cell1", "cell2", "op", InputChoice.NO_KEY]) | |||
| def forward(self, x): | |||
| x1 = max_pooling(self.cell1(x)) | |||
| x2 = self.cell2(x) | |||
| x3 = self.op(x) | |||
| x4 = torch.zeros_like(x) | |||
| return self.input_choice([x1, x2, x3, x4]) | |||
| Parameters | |||
| ---------- | |||
| n_candidates : int | |||
| Number of inputs to choose from. | |||
| choose_from : list of str | |||
| List of source keys to choose from. At least one of ``choose_from`` and ``n_candidates`` must be provided. | |||
| If ``n_candidates`` has a value but ``choose_from`` is None, it will be automatically treated as ``n_candidates`` | |||
| empty strings. | |||
| n_chosen : int | |||
| Number of inputs to choose. If None, the mutator is instructed to select any number of inputs. | |||
| reduction : str | |||
| ``mean``, ``concat``, ``sum`` or ``none``. See :class:`LayerChoice`. | |||
| return_mask : bool | |||
| If ``return_mask`` is true, return the output tensor and a mask. Otherwise return the output tensor only. | |||
| key : str | |||
| Key of the input choice. | |||
| """ | |||
| NO_KEY = "" | |||
| def __init__(self, n_candidates=None, choose_from=None, n_chosen=None, | |||
| reduction="sum", return_mask=False, key=None): | |||
| super().__init__(key=key) | |||
| # precondition check | |||
| assert n_candidates is not None or choose_from is not None, "At least one of `n_candidates` and `choose_from` " \ | |||
| "must not be None." | |||
| if choose_from is not None and n_candidates is None: | |||
| n_candidates = len(choose_from) | |||
| elif choose_from is None and n_candidates is not None: | |||
| choose_from = [self.NO_KEY] * n_candidates | |||
| assert n_candidates == len(choose_from), "Number of candidates must be equal to the length of `choose_from`." | |||
| assert n_candidates > 0, "Number of candidates must be greater than 0." | |||
| assert n_chosen is None or 0 <= n_chosen <= n_candidates, "Expected selected number must be None or no more " \ | |||
| "than number of candidates." | |||
| self.n_candidates = n_candidates | |||
| self.choose_from = choose_from.copy() | |||
| self.n_chosen = n_chosen | |||
| self.reduction = reduction | |||
| self.return_mask = return_mask | |||
| def forward(self, optional_inputs): | |||
| """ | |||
| Forward method of InputChoice. | |||
| Parameters | |||
| ---------- | |||
| optional_inputs : list or dict | |||
| Recommended to be a dict. As a dict, inputs will be converted to a list that follows the order of | |||
| ``choose_from`` in initialization. As a list, inputs must follow the semantic order that is the same as | |||
| ``choose_from``. | |||
| Returns | |||
| ------- | |||
| tuple of tensors | |||
| Output and selection mask. If ``return_mask`` is ``False``, only output is returned. | |||
| """ | |||
| optional_input_list = optional_inputs | |||
| if isinstance(optional_inputs, dict): | |||
| optional_input_list = [optional_inputs[tag] for tag in self.choose_from] | |||
| assert isinstance(optional_input_list, list), \ | |||
| "Optional input list must be a list, not a {}.".format(type(optional_input_list)) | |||
| assert len(optional_input_list) == self.n_candidates, \ | |||
| "Length of the input list must be equal to number of candidates." | |||
| out, mask = self.mutator.on_forward_input_choice(self, optional_input_list) | |||
| if self.return_mask: | |||
| return out, mask | |||
| return out | |||
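| # The sketch below is purely illustrative (not part of this module's API): it shows how | |||
| # ``LayerChoice`` and ``InputChoice`` are typically combined in a user-defined search space. | |||
| # The class name ``MySpace`` and the layer shapes are assumptions, and ``torch.nn`` is assumed | |||
| # to be imported as ``nn``. | |||
| # | |||
| #     class MySpace(nn.Module): | |||
| #         def __init__(self): | |||
| #             super().__init__() | |||
| #             self.stem = nn.Conv2d(3, 16, 3, padding=1) | |||
| #             self.op = LayerChoice([nn.Conv2d(16, 16, 3, padding=1), | |||
| #                                    nn.Conv2d(16, 16, 5, padding=2)], key="op") | |||
| #             self.skip = InputChoice(n_candidates=2, n_chosen=1, reduction="sum", key="skip") | |||
| # | |||
| #         def forward(self, x): | |||
| #             x = self.stem(x) | |||
| #             out = self.op(x) | |||
| #             return self.skip([out, x]) | |||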
| @@ -0,0 +1,309 @@ | |||
| # Copyright (c) Microsoft Corporation. | |||
| # Licensed under the MIT license. | |||
| import logging | |||
| from collections import defaultdict | |||
| import numpy as np | |||
| import torch | |||
| from .base_mutator import BaseMutator | |||
| from .mutables import LayerChoice, InputChoice | |||
| from .utils import to_list | |||
| logger = logging.getLogger(__name__) | |||
| logger.setLevel(logging.INFO) | |||
| class Mutator(BaseMutator): | |||
| def __init__(self, model): | |||
| super().__init__(model) | |||
| self._cache = dict() | |||
| self._connect_all = False | |||
| def sample_search(self): | |||
| """ | |||
| Override this method to iterate over mutables and make decisions. | |||
| Returns | |||
| ------- | |||
| dict | |||
| A mapping from key of mutables to decisions. | |||
| """ | |||
| raise NotImplementedError | |||
| def sample_final(self): | |||
| """ | |||
| Override this method to iterate over mutables and make final decisions | |||
| for export and retraining. | |||
| Returns | |||
| ------- | |||
| dict | |||
| A mapping from key of mutables to decisions. | |||
| """ | |||
| raise NotImplementedError | |||
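| # A minimal sketch of how a subclass might implement the two methods above (illustrative only; | |||
| # the class name ``_RandomMutatorSketch`` and the sampling strategy are assumptions): sample a | |||
| # one-hot mask for every LayerChoice and a random multi-hot mask for every InputChoice. | |||
| # | |||
| #     class _RandomMutatorSketch(Mutator): | |||
| #         def sample_search(self): | |||
| #             result = dict() | |||
| #             for mutable in self.mutables: | |||
| #                 if isinstance(mutable, LayerChoice): | |||
| #                     index = torch.randint(high=len(mutable), size=(1,)) | |||
| #                     result[mutable.key] = torch.nn.functional.one_hot( | |||
| #                         index, num_classes=len(mutable)).view(-1).bool() | |||
| #                 elif isinstance(mutable, InputChoice): | |||
| #                     result[mutable.key] = torch.randint(high=2, size=(mutable.n_candidates,)).bool() | |||
| #             return result | |||
| # | |||
| #         def sample_final(self): | |||
| #             return self.sample_search() | |||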
| def reset(self): | |||
| """ | |||
| Reset the mutator by calling `sample_search` to resample (for search). Store the result in a local | |||
| variable so that `on_forward_layer_choice` and `on_forward_input_choice` can use the decision directly. | |||
| """ | |||
| self._cache = self.sample_search() | |||
| def export(self): | |||
| """ | |||
| Resample (for final) and return results. | |||
| Returns | |||
| ------- | |||
| dict | |||
| A mapping from key of mutables to decisions. | |||
| """ | |||
| sampled = self.sample_final() | |||
| result = dict() | |||
| for mutable in self.mutables: | |||
| if not isinstance(mutable, (LayerChoice, InputChoice)): | |||
| # not supported as built-in | |||
| continue | |||
| result[mutable.key] = self._convert_mutable_decision_to_human_readable(mutable, sampled.pop(mutable.key)) | |||
| if sampled: | |||
| raise ValueError("Unexpected keys returned from 'sample_final()': %s", list(sampled.keys())) | |||
| return result | |||
| def status(self): | |||
| """ | |||
| Return current selection status of mutator. | |||
| Returns | |||
| ------- | |||
| dict | |||
| A mapping from key of mutables to decisions. All weights (boolean type and float type) | |||
| are converted into real number values. Numpy arrays and tensors are converted into lists. | |||
| """ | |||
| data = dict() | |||
| for k, v in self._cache.items(): | |||
| if torch.is_tensor(v): | |||
| v = v.detach().cpu().numpy() | |||
| if isinstance(v, np.ndarray): | |||
| v = v.astype(np.float32).tolist() | |||
| data[k] = v | |||
| return data | |||
| def graph(self, inputs): | |||
| """ | |||
| Return model supernet graph. | |||
| Parameters | |||
| ---------- | |||
| inputs: tuple of tensor | |||
| Inputs that will be fed into the network. | |||
| Returns | |||
| ------- | |||
| dict | |||
| Containing ``node``, in Tensorboard GraphDef format. | |||
| Additional key ``mutable`` is a map from key to list of modules. | |||
| """ | |||
| if not torch.__version__.startswith("1.4"): | |||
| logger.warning("Graph is only tested with PyTorch 1.4. Other versions might not work.") | |||
| from nni._graph_utils import build_graph | |||
| from google.protobuf import json_format | |||
| # protobuf should be installed as long as tensorboard is installed | |||
| try: | |||
| self._connect_all = True | |||
| graph_def, _ = build_graph(self.model, inputs, verbose=False) | |||
| result = json_format.MessageToDict(graph_def) | |||
| finally: | |||
| self._connect_all = False | |||
| # `mutable` is to map the keys to a list of corresponding modules. | |||
| # A key can be linked to multiple modules, use `dedup=False` to find them all. | |||
| result["mutable"] = defaultdict(list) | |||
| for mutable in self.mutables.traverse(deduplicate=False): | |||
| # A module will be represented in the format of | |||
| # [{"type": "Net", "name": ""}, {"type": "Cell", "name": "cell1"}, {"type": "Conv2d": "name": "conv"}] | |||
| # which will be concatenated into Net/Cell[cell1]/Conv2d[conv] in frontend. | |||
| # This format is aligned with the scope name jit gives. | |||
| modules = mutable.name.split(".") | |||
| path = [ | |||
| {"type": self.model.__class__.__name__, "name": ""} | |||
| ] | |||
| m = self.model | |||
| for module in modules: | |||
| m = getattr(m, module) | |||
| path.append({ | |||
| "type": m.__class__.__name__, | |||
| "name": module | |||
| }) | |||
| result["mutable"][mutable.key].append(path) | |||
| return result | |||
| def on_forward_layer_choice(self, mutable, *args, **kwargs): | |||
| """ | |||
| By default, this method retrieves the decision obtained previously, and selects certain operations. | |||
| Only operations with non-zero weight will be executed. The results will be added to a list. | |||
| Then it will reduce the list of all tensor outputs with the policy specified in `mutable.reduction`. | |||
| Parameters | |||
| ---------- | |||
| mutable : LayerChoice | |||
| Layer choice module. | |||
| args : list of torch.Tensor | |||
| Inputs | |||
| kwargs : dict | |||
| Inputs | |||
| Returns | |||
| ------- | |||
| tuple of torch.Tensor and torch.Tensor | |||
| Output and mask. | |||
| """ | |||
| if self._connect_all: | |||
| return self._all_connect_tensor_reduction(mutable.reduction, | |||
| [op(*args, **kwargs) for op in mutable]), \ | |||
| torch.ones(len(mutable)).bool() | |||
| def _map_fn(op, args, kwargs): | |||
| return op(*args, **kwargs) | |||
| mask = self._get_decision(mutable) | |||
| assert len(mask) == len(mutable), \ | |||
| "Invalid mask, expected {} to be of length {}.".format(mask, len(mutable)) | |||
| out, mask = self._select_with_mask(_map_fn, [(choice, args, kwargs) for choice in mutable], mask) | |||
| return self._tensor_reduction(mutable.reduction, out), mask | |||
| def on_forward_input_choice(self, mutable, tensor_list): | |||
| """ | |||
| By default, this method retrieves the decision obtained previously, and selects certain tensors. | |||
| Then it will reduce the list of all tensor outputs with the policy specified in `mutable.reduction`. | |||
| Parameters | |||
| ---------- | |||
| mutable : InputChoice | |||
| Input choice module. | |||
| tensor_list : list of torch.Tensor | |||
| Tensor list to apply the decision on. | |||
| Returns | |||
| ------- | |||
| tuple of torch.Tensor and torch.Tensor | |||
| Output and mask. | |||
| """ | |||
| if self._connect_all: | |||
| return self._all_connect_tensor_reduction(mutable.reduction, tensor_list), \ | |||
| torch.ones(mutable.n_candidates).bool() | |||
| mask = self._get_decision(mutable) | |||
| assert len(mask) == mutable.n_candidates, \ | |||
| "Invalid mask, expected {} to be of length {}.".format(mask, mutable.n_candidates) | |||
| out, mask = self._select_with_mask(lambda x: x, [(t,) for t in tensor_list], mask) | |||
| return self._tensor_reduction(mutable.reduction, out), mask | |||
| def _select_with_mask(self, map_fn, candidates, mask): | |||
| """ | |||
| Select masked tensors and return a list of tensors. | |||
| Parameters | |||
| ---------- | |||
| map_fn : function | |||
| Convert candidates to target candidates. Can be simply identity. | |||
| candidates : list of torch.Tensor | |||
| Tensor list to apply the decision on. | |||
| mask : list-like object | |||
| Can be a list, a numpy array or a tensor (recommended). Needs to | |||
| have the same length as ``candidates``. | |||
| Returns | |||
| ------- | |||
| tuple of list of torch.Tensor and torch.Tensor | |||
| Output and mask. | |||
| """ | |||
| if (isinstance(mask, list) and len(mask) >= 1 and isinstance(mask[0], bool)) or \ | |||
| (isinstance(mask, np.ndarray) and mask.dtype == np.bool_) or \ | |||
| "BoolTensor" in mask.type(): | |||
| out = [map_fn(*cand) for cand, m in zip(candidates, mask) if m] | |||
| elif (isinstance(mask, list) and len(mask) >= 1 and isinstance(mask[0], (float, int))) or \ | |||
| (isinstance(mask, np.ndarray) and mask.dtype in (np.float32, np.float64, np.int32, np.int64)) or \ | |||
| "FloatTensor" in mask.type(): | |||
| out = [map_fn(*cand) * m for cand, m in zip(candidates, mask) if m] | |||
| else: | |||
| raise ValueError("Unrecognized mask '%s'" % mask) | |||
| if not torch.is_tensor(mask): | |||
| mask = torch.tensor(mask) # pylint: disable=not-callable | |||
| return out, mask | |||
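| # Illustrative behaviour of the helper above (values are hypothetical): with candidates | |||
| # [(a,), (b,), (c,)], a boolean mask [True, False, True] keeps [a, c] unchanged, while a float | |||
| # mask [0.2, 0.0, 0.8] keeps only the non-zero entries and scales them, i.e. [0.2 * a, 0.8 * c]. | |||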
| def _tensor_reduction(self, reduction_type, tensor_list): | |||
| if reduction_type == "none": | |||
| return tensor_list | |||
| if not tensor_list: | |||
| return None # empty. return None for now | |||
| if len(tensor_list) == 1: | |||
| return tensor_list[0] | |||
| if reduction_type == "sum": | |||
| return sum(tensor_list) | |||
| if reduction_type == "mean": | |||
| return sum(tensor_list) / len(tensor_list) | |||
| if reduction_type == "concat": | |||
| return torch.cat(tensor_list, dim=1) | |||
| raise ValueError("Unrecognized reduction policy: \"{}\"".format(reduction_type)) | |||
| def _all_connect_tensor_reduction(self, reduction_type, tensor_list): | |||
| if reduction_type == "none": | |||
| return tensor_list | |||
| if reduction_type == "concat": | |||
| return torch.cat(tensor_list, dim=1) | |||
| return torch.stack(tensor_list).sum(0) | |||
| def _get_decision(self, mutable): | |||
| """ | |||
| By default, this method checks whether `mutable.key` is already in the decision cache, | |||
| and returns the result without double-checking. | |||
| Parameters | |||
| ---------- | |||
| mutable : Mutable | |||
| Returns | |||
| ------- | |||
| object | |||
| """ | |||
| if mutable.key not in self._cache: | |||
| raise ValueError("\"{}\" not found in decision cache.".format(mutable.key)) | |||
| result = self._cache[mutable.key] | |||
| logger.debug("Decision %s: %s", mutable.key, result) | |||
| return result | |||
| def _convert_mutable_decision_to_human_readable(self, mutable, sampled): | |||
| # Assert the existence of mutable.key in returned architecture. | |||
| # Also check if there is anything extra. | |||
| multihot_list = to_list(sampled) | |||
| converted = None | |||
| # If it's a boolean array, we can do optimization. | |||
| if all([t == 0 or t == 1 for t in multihot_list]): | |||
| if isinstance(mutable, LayerChoice): | |||
| assert len(multihot_list) == len(mutable), \ | |||
| "Results returned from 'sample_final()' (%s: %s) either too short or too long." \ | |||
| % (mutable.key, multihot_list) | |||
| # check if all modules have different names and they indeed have names | |||
| if len(set(mutable.names)) == len(mutable) and not all(d.isdigit() for d in mutable.names): | |||
| converted = [name for i, name in enumerate(mutable.names) if multihot_list[i]] | |||
| else: | |||
| converted = [i for i in range(len(multihot_list)) if multihot_list[i]] | |||
| if isinstance(mutable, InputChoice): | |||
| assert len(multihot_list) == mutable.n_candidates, \ | |||
| "Results returned from 'sample_final()' (%s: %s) either too short or too long." \ | |||
| % (mutable.key, multihot_list) | |||
| # check if all input candidates have different names | |||
| if len(set(mutable.choose_from)) == mutable.n_candidates: | |||
| converted = [name for i, name in enumerate(mutable.choose_from) if multihot_list[i]] | |||
| else: | |||
| converted = [i for i in range(len(multihot_list)) if multihot_list[i]] | |||
| if converted is not None: | |||
| # if only one element, then remove the bracket | |||
| if len(converted) == 1: | |||
| converted = converted[0] | |||
| else: | |||
| # do nothing | |||
| converted = multihot_list | |||
| return converted | |||
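| # Illustrative examples of the conversion above (values are hypothetical): a LayerChoice with | |||
| # names ["conv3x3", "conv5x5"] and multi-hot decision [1, 0] is exported as "conv3x3"; an | |||
| # InputChoice over ["cell1", "cell2", "op", ""] with decision [1, 0, 1, 0] is exported as | |||
| # ["cell1", "op"]; a non-binary decision such as [0.3, 0.7] is kept as-is. | |||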
| @@ -0,0 +1,47 @@ | |||
| # Network Morphism | |||
| The implementation of the Network Morphism algorithm is based on | |||
| [Auto-Keras: An Efficient Neural Architecture Search System](https://arxiv.org/pdf/1806.10282.pdf). | |||
| ## Train stage | |||
| ``` | |||
| python network_morphism_train.py | |||
| --trial_id 0 | |||
| --experiment_dir 'tadl' | |||
| --log_path 'tadl/train/0/log' | |||
| --data_dir '../data/' | |||
| --result_path 'trial_id/result.json' | |||
| --log_path 'trial_id/log' | |||
| --search_space_path 'experiment_id/search_space.json' | |||
| --best_selected_space_path 'experiment_id/best_selected_space.json' | |||
| --lr 0.001 --epochs 100 --batch_size 32 --opt 'SGD' | |||
| ``` | |||
| ## Select stage | |||
| ``` | |||
| python network_morphism_select.py | |||
| ``` | |||
| ## Retrain stage | |||
| ``` | |||
| python network_morphism_retrain.py | |||
| --data_dir '../data/' | |||
| --experiment_dir 'tadl' | |||
| --result_path 'trial_id/result.json' | |||
| --log_path 'trial_id/log' | |||
| --best_selected_space_path 'experiment_id/best_selected_space.json' | |||
| --best_checkpoint_dir 'experiment_id/' | |||
| --trial_id 0 --batch_size 32 --opt 'SGD' --epochs 100 --lr 0.001 | |||
| ``` | |||
| The best model found by the search achieved 88.1% accuracy on the CIFAR-10 dataset after 100 trials. | |||
| Dependencies: | |||
| ``` | |||
| Python = 3.6.13 | |||
| pytorch = 1.8.0 | |||
| torchvision = 0.9.0 | |||
| scipy = 1.5.2 | |||
| scikit-learn = 0.24.1 | |||
| ``` | |||
| @@ -0,0 +1,517 @@ | |||
| import math | |||
| import random | |||
| from copy import deepcopy | |||
| from functools import total_ordering | |||
| from queue import PriorityQueue | |||
| import numpy as np | |||
| from scipy.linalg import LinAlgError, cho_solve, cholesky, solve_triangular | |||
| from scipy.optimize import linear_sum_assignment | |||
| from sklearn.metrics.pairwise import rbf_kernel | |||
| from .graph_transformer import transform | |||
| from .layers import is_layer | |||
| from utils import Constant, OptimizeMode | |||
| import logging | |||
| logger = logging.getLogger(__name__) | |||
| # equation(6) dl | |||
| def layer_distance(a, b): | |||
| """The distance between two layers.""" | |||
| # pylint: disable=unidiomatic-typecheck | |||
| if not isinstance(a, type(b)): | |||
| return 1.0 | |||
| if is_layer(a, "Conv"): | |||
| att_diff = [ | |||
| (a.filters, b.filters), | |||
| (a.kernel_size, b.kernel_size), | |||
| (a.stride, b.stride), | |||
| ] | |||
| return attribute_difference(att_diff) | |||
| if is_layer(a, "Pooling"): | |||
| att_diff = [ | |||
| (a.padding, b.padding), | |||
| (a.kernel_size, b.kernel_size), | |||
| (a.stride, b.stride), | |||
| ] | |||
| return attribute_difference(att_diff) | |||
| return 0.0 | |||
| # equation(6) | |||
| def attribute_difference(att_diff): | |||
| ''' The attribute distance. | |||
| ''' | |||
| ret = 0 | |||
| for a_value, b_value in att_diff: | |||
| if max(a_value, b_value) == 0: | |||
| ret += 0 | |||
| else: | |||
| ret += abs(a_value - b_value) * 1.0 / max(a_value, b_value) | |||
| return ret * 1.0 / len(att_diff) | |||
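| # Worked example (hypothetical values): for two conv layers with filters (32, 64), | |||
| # kernel_size (3, 5) and stride (1, 1), the distance is | |||
| # (|32 - 64| / 64 + |3 - 5| / 5 + 0) / 3 = (0.5 + 0.4 + 0.0) / 3 = 0.3. | |||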
| # equation(7) A | |||
| def layers_distance(list_a, list_b): | |||
| """The distance between the layers of two neural networks.""" | |||
| len_a = len(list_a) | |||
| len_b = len(list_b) | |||
| f = np.zeros((len_a + 1, len_b + 1)) | |||
| f[-1][-1] = 0 | |||
| for i in range(-1, len_a): | |||
| f[i][-1] = i + 1 | |||
| for j in range(-1, len_b): | |||
| f[-1][j] = j + 1 | |||
| for i in range(len_a): | |||
| for j in range(len_b): | |||
| f[i][j] = min( | |||
| f[i][j - 1] + 1, | |||
| f[i - 1][j] + 1, | |||
| f[i - 1][j - 1] + layer_distance(list_a[i], list_b[j]), | |||
| ) | |||
| return f[len_a - 1][len_b - 1] | |||
| # equation (9) ds | |||
| # 0: topo rank of the start, 1: rank of the end | |||
| def skip_connection_distance(a, b): | |||
| """The distance between two skip-connections.""" | |||
| if a[2] != b[2]: | |||
| return 1.0 | |||
| len_a = abs(a[1] - a[0]) | |||
| len_b = abs(b[1] - b[0]) | |||
| return (abs(a[0] - b[0]) + abs(len_a - len_b)) / \ | |||
| (max(a[0], b[0]) + max(len_a, len_b)) | |||
| # equation (8) Ds | |||
| # Convert the minimization part of equation (8) into a bipartite graph matching problem and solve it with the Hungarian algorithm (linear_sum_assignment) | |||
| def skip_connections_distance(list_a, list_b): | |||
| """The distance between the skip-connections of two neural networks.""" | |||
| distance_matrix = np.zeros((len(list_a), len(list_b))) | |||
| for i, a in enumerate(list_a): | |||
| for j, b in enumerate(list_b): | |||
| distance_matrix[i][j] = skip_connection_distance(a, b) | |||
| return distance_matrix[linear_sum_assignment(distance_matrix)].sum() + abs( | |||
| len(list_a) - len(list_b) | |||
| ) | |||
| # equation (4) | |||
| def edit_distance(x, y): | |||
| """The distance between two neural networks. | |||
| Args: | |||
| x: An instance of NetworkDescriptor. | |||
| y: An instance of NetworkDescriptor | |||
| Returns: | |||
| The edit-distance between x and y. | |||
| """ | |||
| ret = layers_distance(x.layers, y.layers) | |||
| ret += Constant.KERNEL_LAMBDA * skip_connections_distance( | |||
| x.skip_connections, y.skip_connections | |||
| ) | |||
| return ret | |||
| class IncrementalGaussianProcess: | |||
| """Gaussian process regressor. | |||
| Attributes: | |||
| alpha: A hyperparameter. | |||
| """ | |||
| def __init__(self): | |||
| self.alpha = 1e-10 | |||
| self._distance_matrix = None | |||
| self._x = None | |||
| self._y = None | |||
| self._first_fitted = False | |||
| self._l_matrix = None | |||
| self._alpha_vector = None | |||
| @property | |||
| def kernel_matrix(self): | |||
| ''' Kernel matrix. | |||
| ''' | |||
| return self._distance_matrix | |||
| def fit(self, train_x, train_y): | |||
| """ Fit the regressor with more data. | |||
| Args: | |||
| train_x: A list of NetworkDescriptor. | |||
| train_y: A list of metric values. | |||
| """ | |||
| if self.first_fitted: | |||
| self.incremental_fit(train_x, train_y) | |||
| else: | |||
| self.first_fit(train_x, train_y) | |||
| # compute the kernel matrix k, alpha_vector | |||
| # The difference from first_fit is that the new training samples are used to extend the distance matrix | |||
| def incremental_fit(self, train_x, train_y): | |||
| """ Incrementally fit the regressor. """ | |||
| if not self._first_fitted: | |||
| raise ValueError( | |||
| "The first_fit function needs to be called first.") | |||
| train_x, train_y = np.array(train_x), np.array(train_y) | |||
| # Incrementally compute K | |||
| up_right_k = edit_distance_matrix(self._x, train_x) | |||
| down_left_k = np.transpose(up_right_k) | |||
| down_right_k = edit_distance_matrix(train_x) | |||
| up_k = np.concatenate((self._distance_matrix, up_right_k), axis=1) | |||
| down_k = np.concatenate((down_left_k, down_right_k), axis=1) | |||
| temp_distance_matrix = np.concatenate((up_k, down_k), axis=0) | |||
| k_matrix = bourgain_embedding_matrix(temp_distance_matrix) | |||
| diagonal = np.diag_indices_from(k_matrix) | |||
| diagonal = (diagonal[0][-len(train_x):], diagonal[1][-len(train_x):]) | |||
| k_matrix[diagonal] += self.alpha | |||
| try: | |||
| self._l_matrix = cholesky(k_matrix, lower=True) # Line 2 | |||
| except LinAlgError as err: | |||
| logger.error('LinAlgError: %s', err) | |||
| return self | |||
| self._x = np.concatenate((self._x, train_x), axis=0) | |||
| self._y = np.concatenate((self._y, train_y), axis=0) | |||
| self._distance_matrix = temp_distance_matrix | |||
| self._alpha_vector = cho_solve( | |||
| (self._l_matrix, True), self._y) # Line 3 | |||
| return self | |||
| @property | |||
| def first_fitted(self): | |||
| ''' Whether the regressor has been fitted for the first time. | |||
| ''' | |||
| return self._first_fitted | |||
| # Update process: the first fit. | |||
| def first_fit(self, train_x, train_y): | |||
| """ Fit the regressor for the first time. """ | |||
| train_x, train_y = np.array(train_x), np.array(train_y) | |||
| self._x = np.copy(train_x) | |||
| self._y = np.copy(train_y) | |||
| self._distance_matrix = edit_distance_matrix(self._x) | |||
| k_matrix = bourgain_embedding_matrix(self._distance_matrix) | |||
| k_matrix[np.diag_indices_from(k_matrix)] += self.alpha | |||
| self._l_matrix = cholesky(k_matrix, lower=True) # Line 2 | |||
| # cho_solve Ax = b return x = A^{-1}b | |||
| self._alpha_vector = cho_solve( | |||
| (self._l_matrix, True), self._y) # Line 3 | |||
| self._first_fitted = True | |||
| return self | |||
| # Obtain the mean & std of the predictive distribution | |||
| def predict(self, train_x): | |||
| """Predict the result. | |||
| Args: | |||
| train_x: A list of NetworkDescriptor. | |||
| Returns: | |||
| y_mean: The predicted mean. | |||
| y_std: The predicted standard deviation. | |||
| """ | |||
| k_trans = np.exp(-np.power(edit_distance_matrix(train_x, self._x), 2)) | |||
| y_mean = k_trans.dot(self._alpha_vector) # Line 4 (y_mean = f_star) | |||
| # compute inverse K_inv of K based on its Cholesky | |||
| # decomposition L and its inverse L_inv | |||
| l_inv = solve_triangular( | |||
| self._l_matrix.T, np.eye( | |||
| self._l_matrix.shape[0])) | |||
| k_inv = l_inv.dot(l_inv.T) | |||
| # Compute variance of predictive distribution | |||
| y_var = np.ones(len(train_x), dtype=np.float64) | |||
| y_var -= np.einsum("ij,ij->i", np.dot(k_trans, k_inv), k_trans) | |||
| # Check if any of the variances is negative because of | |||
| # numerical issues. If yes: set the variance to 0. | |||
| y_var_negative = y_var < 0 | |||
| if np.any(y_var_negative): | |||
| y_var[y_var_negative] = 0.0 | |||
| return y_mean, np.sqrt(y_var) | |||
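| # Illustrative usage of the regressor above (descriptor and metric variables are placeholders): | |||
| #     gpr = IncrementalGaussianProcess() | |||
| #     gpr.fit(train_descriptors, train_metrics)      # dispatches to first_fit | |||
| #     gpr.fit(more_descriptors, more_metrics)        # dispatches to incremental_fit | |||
| #     mean, std = gpr.predict(np.array(candidate_descriptors)) | |||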
| def edit_distance_matrix(train_x, train_y=None): | |||
| """Calculate the edit distance. | |||
| Args: | |||
| train_x: A list of neural architectures. | |||
| train_y: A list of neural architectures. | |||
| Returns: | |||
| An edit-distance matrix. | |||
| """ | |||
| if train_y is None: | |||
| ret = np.zeros((train_x.shape[0], train_x.shape[0])) | |||
| for x_index, x in enumerate(train_x): | |||
| for y_index, y in enumerate(train_x): | |||
| if x_index == y_index: | |||
| ret[x_index][y_index] = 0 | |||
| elif x_index < y_index: | |||
| ret[x_index][y_index] = edit_distance(x, y) | |||
| else: | |||
| ret[x_index][y_index] = ret[y_index][x_index] | |||
| return ret | |||
| ret = np.zeros((train_x.shape[0], train_y.shape[0])) | |||
| for x_index, x in enumerate(train_x): | |||
| for y_index, y in enumerate(train_y): | |||
| ret[x_index][y_index] = edit_distance(x, y) | |||
| return ret | |||
| def vector_distance(a, b): | |||
| """The Euclidean distance between two vectors.""" | |||
| a = np.array(a) | |||
| b = np.array(b) | |||
| return np.linalg.norm(a - b) | |||
| # Mapping from the edit-distance matrix space to Euclidean space | |||
| def bourgain_embedding_matrix(distance_matrix): | |||
| """Use Bourgain algorithm to embed the neural architectures based on their edit-distance. | |||
| Args: | |||
| distance_matrix: A matrix of edit-distances. | |||
| Returns: | |||
| A matrix of distances after embedding. | |||
| """ | |||
| distance_matrix = np.array(distance_matrix) | |||
| n = len(distance_matrix) | |||
| if n == 1: | |||
| return distance_matrix | |||
| np.random.seed(123) | |||
| distort_elements = [] | |||
| r = range(n) | |||
| k = int(math.ceil(math.log(n) / math.log(2) - 1)) | |||
| t = int(math.ceil(math.log(n))) | |||
| counter = 0 | |||
| for i in range(0, k + 1): | |||
| for t in range(t): | |||
| s = np.random.choice(r, 2 ** i) | |||
| for j in r: | |||
| d = min([distance_matrix[j][s] for s in s]) | |||
| counter += len(s) | |||
| if i == 0 and t == 0: | |||
| distort_elements.append([d]) | |||
| else: | |||
| distort_elements[j].append(d) | |||
| return rbf_kernel(distort_elements, distort_elements) | |||
| class BayesianOptimizer: | |||
| """ A Bayesian optimizer for neural architectures. | |||
| Attributes: | |||
| searcher: The Searcher which is calling the Bayesian optimizer. | |||
| t_min: The minimum temperature for simulated annealing. | |||
| metric: An instance of the Metric subclasses. | |||
| gpr: A GaussianProcessRegressor for bayesian optimization. | |||
| beta: The beta in the acquisition function (refer to the Auto-Keras paper). | |||
| search_tree: The network morphism search tree. | |||
| """ | |||
| def __init__(self, searcher, t_min, optimizemode, beta=None): | |||
| self.searcher = searcher | |||
| self.t_min = t_min | |||
| self.optimizemode = optimizemode | |||
| self.gpr = IncrementalGaussianProcess() | |||
| self.beta = beta if beta is not None else Constant.BETA | |||
| self.search_tree = SearchTree() | |||
| def fit(self, x_queue, y_queue): | |||
| """ Fit the optimizer with new architectures and performances. | |||
| Args: | |||
| x_queue: A list of NetworkDescriptor. | |||
| y_queue: A list of metric values. | |||
| """ | |||
| self.gpr.fit(x_queue, y_queue) | |||
| # Algorithm 1 | |||
| # optimize acquisition function | |||
| def generate(self, descriptors): | |||
| """Generate new architecture. | |||
| Args: | |||
| descriptors: All the searched neural architectures. (search history) | |||
| Returns: | |||
| graph: An instance of Graph. A morphed neural network with weights. | |||
| father_id: The father node ID in the search tree. | |||
| """ | |||
| model_ids = self.search_tree.adj_list.keys() | |||
| target_graph = None | |||
| father_id = None | |||
| descriptors = deepcopy(descriptors) | |||
| elem_class = Elem | |||
| if self.optimizemode is OptimizeMode.Maximize: | |||
| elem_class = ReverseElem | |||
| # 1. Initialize the priority queue. | |||
| # 2. The elements in the priority queue are all previously generated models. | |||
| pq = PriorityQueue() | |||
| temp_list = [] | |||
| for model_id in model_ids: | |||
| metric_value = self.searcher.get_metric_value_by_id(model_id) | |||
| temp_list.append((metric_value, model_id)) | |||
| temp_list = sorted(temp_list) | |||
| for metric_value, model_id in temp_list: | |||
| graph = self.searcher.load_model_by_id(model_id) | |||
| graph.clear_operation_history() | |||
| graph.clear_weights() | |||
| # For models that have already been generated, father_id is their own id | |||
| pq.put(elem_class(metric_value, model_id, graph)) | |||
| t = 1.0 | |||
| t_min = self.t_min | |||
| alpha = 0.9 | |||
| opt_acq = self._get_init_opt_acq_value() | |||
| num_iter = 0 | |||
| # logger.info('initial queue size ', pq.qsize()) | |||
| while not pq.empty() and t > t_min: | |||
| num_iter += 1 | |||
| elem = pq.get() | |||
| # logger.info("elem.metric_value:{}".format(elem.metric_value)) | |||
| # logger.info("opt_acq:{}".format(opt_acq)) | |||
| if self.optimizemode is OptimizeMode.Maximize: | |||
| temp_exp = min((elem.metric_value - opt_acq) / t, 1.0) | |||
| else: | |||
| temp_exp = min((opt_acq - elem.metric_value) / t, 1.0) | |||
| # logger.info("temp_exp this round ", temp_exp) | |||
| ap = math.exp(temp_exp) | |||
| # logger.info("ap this round ", ap) | |||
| if ap >= random.uniform(0, 1): | |||
| # line 9,10 in algorithm 1 | |||
| for temp_graph in transform(elem.graph): | |||
| # Do not add networks that have already appeared | |||
| if contain(descriptors, temp_graph.extract_descriptor()): | |||
| continue | |||
| # Use the acquisition value as the score given by the Bayesian model | |||
| temp_acq_value = self.acq(temp_graph) | |||
| # The priority queue keeps growing; networks produced by transform also enter it. | |||
| pq.put( | |||
| # Remember which father this model was morphed from | |||
| elem_class( | |||
| temp_acq_value, | |||
| elem.father_id, | |||
| temp_graph)) | |||
| # logger.info('temp_acq_value ', temp_acq_value) | |||
| # logger.info('queue size ', pq.qsize()) | |||
| descriptors.append(temp_graph.extract_descriptor()) | |||
| # Pick the best one as the father | |||
| if self._accept_new_acq_value(opt_acq, temp_acq_value): | |||
| opt_acq = temp_acq_value | |||
| father_id = elem.father_id | |||
| target_graph = deepcopy(temp_graph) | |||
| t *= alpha | |||
| # logger.info('number of iter in this search {}'.format(num_iter)) | |||
| # Did not find a non-duplicated architecture | |||
| if father_id is None: | |||
| return None, None | |||
| nm_graph = self.searcher.load_model_by_id(father_id) | |||
| # Starting from the current father graph, replay the operation_history recorded in target_graph step by step to reach target_graph. | |||
| # Because clear_operation_history() was called before the graph was put into pq, target_graph only stores the operations from the current father to target_graph, | |||
| # while the operation_history in nm_graph keeps the complete history back to the base graph. | |||
| for args in target_graph.operation_history: | |||
| getattr(nm_graph, args[0])(*list(args[1:])) | |||
| # target space | |||
| return nm_graph, father_id | |||
| # equation (10) | |||
| def acq(self, graph): | |||
| ''' Estimate the acquisition value of a generated graph. | |||
| ''' | |||
| mean, std = self.gpr.predict(np.array([graph.extract_descriptor()])) | |||
| if self.optimizemode is OptimizeMode.Maximize: | |||
| return mean + self.beta * std | |||
| return mean - self.beta * std | |||
| def _get_init_opt_acq_value(self): | |||
| if self.optimizemode is OptimizeMode.Maximize: | |||
| return -np.inf | |||
| return np.inf | |||
| def _accept_new_acq_value(self, opt_acq, temp_acq_value): | |||
| if temp_acq_value > opt_acq and self.optimizemode is OptimizeMode.Maximize: | |||
| return True | |||
| if temp_acq_value < opt_acq and self.optimizemode is not OptimizeMode.Maximize: | |||
| return True | |||
| return False | |||
| def add_child(self, father_id, model_id): | |||
| ''' add child to the search tree | |||
| Arguments: | |||
| father_id {int} -- father id | |||
| model_id {int} -- model id | |||
| ''' | |||
| self.search_tree.add_child(father_id, model_id) | |||
| @total_ordering | |||
| class Elem: | |||
| """Elements to be sorted according to metric value.""" | |||
| def __init__(self, metric_value, father_id, graph): | |||
| self.father_id = father_id | |||
| self.graph = graph | |||
| self.metric_value = metric_value | |||
| def __eq__(self, other): | |||
| return self.metric_value == other.metric_value | |||
| def __lt__(self, other): | |||
| return self.metric_value < other.metric_value | |||
| class ReverseElem(Elem): | |||
| """Elements to be reversely sorted according to metric value.""" | |||
| def __lt__(self, other): | |||
| return self.metric_value > other.metric_value | |||
| def contain(descriptors, target_descriptor): | |||
| """Check if the target descriptor is in the descriptors.""" | |||
| for descriptor in descriptors: | |||
| if edit_distance(descriptor, target_descriptor) < 1e-5: | |||
| return True | |||
| return False | |||
| class SearchTree: | |||
| """The network morphism search tree.""" | |||
| def __init__(self): | |||
| self.root = None | |||
| self.adj_list = {} | |||
| def add_child(self, u, v): | |||
| ''' add child to search tree itself. | |||
| Arguments: | |||
| u {int} -- father id | |||
| v {int} -- child id | |||
| ''' | |||
| if u == -1: | |||
| self.root = v | |||
| self.adj_list[v] = [] | |||
| return | |||
| if v not in self.adj_list[u]: | |||
| self.adj_list[u].append(v) | |||
| if v not in self.adj_list: | |||
| self.adj_list[v] = [] | |||
| def get_dict(self, u=None): | |||
| """ A recursive function to return the content of the tree in a dict.""" | |||
| if u is None: | |||
| return self.get_dict(self.root) | |||
| children = [] | |||
| for v in self.adj_list[u]: | |||
| children.append(self.get_dict(v)) | |||
| ret = {"name": u, "children": children} | |||
| return ret | |||
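| # Illustrative usage of SearchTree (model ids are hypothetical): | |||
| #     tree = SearchTree() | |||
| #     tree.add_child(-1, 0)    # model 0 becomes the root | |||
| #     tree.add_child(0, 1) | |||
| #     tree.add_child(0, 2) | |||
| #     tree.get_dict()  # {"name": 0, "children": [{"name": 1, "children": []}, {"name": 2, "children": []}]} | |||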
| @@ -0,0 +1,919 @@ | |||
| import json | |||
| from collections.abc import Iterable | |||
| from copy import deepcopy, copy | |||
| from queue import Queue | |||
| import numpy as np | |||
| import torch | |||
| from .layer_transformer import ( | |||
| add_noise, | |||
| wider_bn, | |||
| wider_next_conv, | |||
| wider_next_dense, | |||
| wider_pre_conv, | |||
| wider_pre_dense, | |||
| init_dense_weight, | |||
| init_conv_weight, | |||
| init_bn_weight, | |||
| ) | |||
| from .layers import ( | |||
| StubAdd, | |||
| StubConcatenate, | |||
| StubReLU, | |||
| get_batch_norm_class, | |||
| get_conv_class, | |||
| is_layer, | |||
| layer_width, | |||
| set_stub_weight_to_torch, | |||
| set_torch_weight_to_stub, | |||
| layer_description_extractor, | |||
| layer_description_builder, | |||
| ) | |||
| from utils import Constant | |||
| class NetworkDescriptor: | |||
| """A class describing the neural architecture for neural network kernel. | |||
| It only records the width of convolutional and dense layers, and the skip-connection types and positions. | |||
| """ | |||
| CONCAT_CONNECT = "concat" | |||
| ADD_CONNECT = "add" | |||
| def __init__(self): | |||
| self.skip_connections = [] | |||
| self.layers = [] | |||
| @property | |||
| def n_layers(self): | |||
| return len(self.layers) | |||
| def add_skip_connection(self, u, v, connection_type): | |||
| """ Add a skip-connection to the descriptor. | |||
| Args: | |||
| u: Number of convolutional layers before the starting point. | |||
| v: Number of convolutional layers before the ending point. | |||
| connection_type: Must be either CONCAT_CONNECT or ADD_CONNECT. | |||
| """ | |||
| if connection_type not in [self.CONCAT_CONNECT, self.ADD_CONNECT]: | |||
| raise ValueError( | |||
| "connection_type should be NetworkDescriptor.CONCAT_CONNECT " | |||
| "or NetworkDescriptor.ADD_CONNECT." | |||
| ) | |||
| self.skip_connections.append((u, v, connection_type)) | |||
| def to_json(self): | |||
| ''' NetworkDescriptor to json representation | |||
| ''' | |||
| skip_list = [] | |||
| for u, v, connection_type in self.skip_connections: | |||
| skip_list.append({"from": u, "to": v, "type": connection_type}) | |||
| return {"node_list": self.layers, "skip_list": skip_list} | |||
| def add_layer(self, layer): | |||
| ''' add one layer | |||
| ''' | |||
| self.layers.append(layer) | |||
| class Node: | |||
| """A class for intermediate output tensor (node) in the Graph. | |||
| Attributes: | |||
| shape: A tuple describing the shape of the tensor. | |||
| """ | |||
| def __init__(self, shape): | |||
| self.shape = shape | |||
| class Graph: | |||
| """A class representing the neural architecture graph of a model. | |||
| Graph extracts the neural architecture graph from a model. | |||
| Each node in the graph is an intermediate tensor between layers. | |||
| Each layer is an edge in the graph. | |||
| Notably, multiple edges may refer to the same layer. | |||
| (e.g., an Add layer adds two tensors into one tensor, so it is related to two edges.) | |||
| Attributes: | |||
| weighted: A boolean of whether the weights and biases in the neural network | |||
| should be included in the graph. | |||
| input_shape: A tuple of integers, which does not include the batch axis. | |||
| node_list: A list of integers. The indices of the list are the identifiers. | |||
| layer_list: A list of stub layers. The indices of the list are the identifiers. | |||
| node_to_id: A dict instance mapping from node integers to their identifiers. | |||
| layer_to_id: A dict instance mapping from stub layers to their identifiers. | |||
| layer_id_to_input_node_ids: A dict instance mapping from layer identifiers | |||
| to their input nodes identifiers. | |||
| layer_id_to_output_node_ids: A dict instance mapping from layer identifiers | |||
| to their output nodes identifiers. | |||
| adj_list: A two dimensional list. The adjacency list of the graph. The first dimension is | |||
| identified by tensor identifiers. In each edge list, the elements are two-element tuples | |||
| of (tensor identifier, layer identifier). | |||
| reverse_adj_list: A reverse adjacent list in the same format as adj_list. | |||
| operation_history: A list saving all the network morphism operations. | |||
| vis: A dictionary of temporary storage for whether a local operation has been done | |||
| during the network morphism. | |||
| """ | |||
| def __init__(self, input_shape, weighted=True): | |||
| """Initializer for Graph. | |||
| """ | |||
| self.input_shape = input_shape | |||
| self.weighted = weighted | |||
| self.node_list = [] | |||
| self.layer_list = [] | |||
| # node ids start with 0 | |||
| self.node_to_id = {} | |||
| self.layer_to_id = {} | |||
| self.layer_id_to_input_node_ids = {} | |||
| self.layer_id_to_output_node_ids = {} | |||
| self.adj_list = {} | |||
| self.reverse_adj_list = {} | |||
| self.operation_history = [] | |||
| self.n_dim = len(input_shape) - 1 | |||
| self.conv = get_conv_class(self.n_dim) | |||
| self.batch_norm = get_batch_norm_class(self.n_dim) | |||
| self.vis = None | |||
| self._add_node(Node(input_shape)) | |||
| def add_layer(self, layer, input_node_id): | |||
| """Add a layer to the Graph. | |||
| Args: | |||
| layer: An instance of the subclasses of StubLayer in layers.py. | |||
| input_node_id: An integer. The ID of the input node of the layer. | |||
| Returns: | |||
| output_node_id: An integer. The ID of the output node of the layer. | |||
| """ | |||
| if isinstance(input_node_id, Iterable): | |||
| layer.input = list(map(lambda x: self.node_list[x], input_node_id)) | |||
| output_node_id = self._add_node(Node(layer.output_shape)) | |||
| for node_id in input_node_id: | |||
| self._add_edge(layer, node_id, output_node_id) | |||
| else: | |||
| layer.input = self.node_list[input_node_id] | |||
| output_node_id = self._add_node(Node(layer.output_shape)) | |||
| self._add_edge(layer, input_node_id, output_node_id) | |||
| layer.output = self.node_list[output_node_id] | |||
| return output_node_id | |||
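| # Illustrative usage of add_layer (the stub layer instances below are placeholders): | |||
| #     graph = Graph((32, 32, 3), weighted=False)   # node 0 is the input node | |||
| #     node_id = graph.add_layer(conv_stub, 0) | |||
| #     node_id = graph.add_layer(relu_stub, node_id) | |||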
| def clear_operation_history(self): | |||
| self.operation_history = [] | |||
| @property | |||
| def n_nodes(self): | |||
| """Return the number of nodes in the model.""" | |||
| return len(self.node_list) | |||
| @property | |||
| def n_layers(self): | |||
| """Return the number of layers in the model.""" | |||
| return len(self.layer_list) | |||
| def _add_node(self, node): | |||
| """Add a new node to node_list and give the node an ID. | |||
| Args: | |||
| node: An instance of Node. | |||
| Returns: | |||
| node_id: An integer. | |||
| """ | |||
| node_id = len(self.node_list) | |||
| self.node_to_id[node] = node_id | |||
| self.node_list.append(node) | |||
| self.adj_list[node_id] = [] | |||
| self.reverse_adj_list[node_id] = [] | |||
| return node_id | |||
| def _add_edge(self, layer, input_id, output_id): | |||
| """Add a new layer to the graph. The nodes should be created in advance.""" | |||
| if layer in self.layer_to_id: | |||
| layer_id = self.layer_to_id[layer] | |||
| if input_id not in self.layer_id_to_input_node_ids[layer_id]: | |||
| self.layer_id_to_input_node_ids[layer_id].append(input_id) | |||
| if output_id not in self.layer_id_to_output_node_ids[layer_id]: | |||
| self.layer_id_to_output_node_ids[layer_id].append(output_id) | |||
| else: | |||
| layer_id = len(self.layer_list) | |||
| self.layer_list.append(layer) | |||
| self.layer_to_id[layer] = layer_id | |||
| self.layer_id_to_input_node_ids[layer_id] = [input_id] | |||
| self.layer_id_to_output_node_ids[layer_id] = [output_id] | |||
| self.adj_list[input_id].append((output_id, layer_id)) | |||
| self.reverse_adj_list[output_id].append((input_id, layer_id)) | |||
| def _redirect_edge(self, u_id, v_id, new_v_id): | |||
| """Redirect the layer to a new node. | |||
| Change the edge originally from `u_id` to `v_id` into an edge from `u_id` to `new_v_id` | |||
| while keeping all other property of the edge the same. | |||
| """ | |||
| layer_id = None | |||
| for index, edge_tuple in enumerate(self.adj_list[u_id]): | |||
| if edge_tuple[0] == v_id: | |||
| layer_id = edge_tuple[1] | |||
| self.adj_list[u_id][index] = (new_v_id, layer_id) | |||
| self.layer_list[layer_id].output = self.node_list[new_v_id] | |||
| break | |||
| for index, edge_tuple in enumerate(self.reverse_adj_list[v_id]): | |||
| if edge_tuple[0] == u_id: | |||
| layer_id = edge_tuple[1] | |||
| self.reverse_adj_list[v_id].remove(edge_tuple) | |||
| break | |||
| self.reverse_adj_list[new_v_id].append((u_id, layer_id)) | |||
| for index, value in enumerate( | |||
| self.layer_id_to_output_node_ids[layer_id]): | |||
| if value == v_id: | |||
| self.layer_id_to_output_node_ids[layer_id][index] = new_v_id | |||
| break | |||
| def _replace_layer(self, layer_id, new_layer): | |||
| """Replace the layer with a new layer.""" | |||
| old_layer = self.layer_list[layer_id] | |||
| new_layer.input = old_layer.input | |||
| new_layer.output = old_layer.output | |||
| new_layer.output.shape = new_layer.output_shape | |||
| self.layer_list[layer_id] = new_layer | |||
| self.layer_to_id[new_layer] = layer_id | |||
| self.layer_to_id.pop(old_layer) | |||
| @property | |||
| def topological_order(self): | |||
| """Return the topological order of the node IDs from the input node to the output node.""" | |||
| q = Queue() | |||
| in_degree = {} | |||
| for i in range(self.n_nodes): | |||
| in_degree[i] = 0 | |||
| for u in range(self.n_nodes): | |||
| for v, _ in self.adj_list[u]: | |||
| in_degree[v] += 1 | |||
| for i in range(self.n_nodes): | |||
| if in_degree[i] == 0: | |||
| q.put(i) | |||
| order_list = [] | |||
| while not q.empty(): | |||
| u = q.get() | |||
| order_list.append(u) | |||
| for v, _ in self.adj_list[u]: | |||
| in_degree[v] -= 1 | |||
| if in_degree[v] == 0: | |||
| q.put(v) | |||
| return order_list | |||
| def _get_pooling_layers(self, start_node_id, end_node_id): | |||
| """ | |||
| Given two node IDs, return all the pooling layers between them. | |||
| A Conv layer with stride > 1 is also considered a pooling layer. | |||
| """ | |||
| layer_list = [] | |||
| node_list = [start_node_id] | |||
| assert self._depth_first_search(end_node_id, layer_list, node_list) | |||
| ret = [] | |||
| for layer_id in layer_list: | |||
| layer = self.layer_list[layer_id] | |||
| if is_layer(layer, "Pooling"): | |||
| ret.append(layer) | |||
| elif is_layer(layer, "Conv") and layer.stride != 1: | |||
| ret.append(layer) | |||
| return ret | |||
| def _depth_first_search(self, target_id, layer_id_list, node_list): | |||
| """Search for all the layers and nodes down the path. | |||
| A recursive function to search all the layers and nodes between the node in the node_list | |||
| and the node with target_id.""" | |||
| assert len(node_list) <= self.n_nodes | |||
| u = node_list[-1] | |||
| if u == target_id: | |||
| return True | |||
| for v, layer_id in self.adj_list[u]: | |||
| layer_id_list.append(layer_id) | |||
| node_list.append(v) | |||
| if self._depth_first_search(target_id, layer_id_list, node_list): | |||
| return True | |||
| layer_id_list.pop() | |||
| node_list.pop() | |||
| return False | |||
| def _search(self, u, start_dim, total_dim, n_add): | |||
| """Search the graph for all the layers to be widened caused by an operation. | |||
| It is a recursive function with a duplication check to avoid deadlock. | |||
| It searches from a starting node u until the corresponding layers have been widened. | |||
| Args: | |||
| u: The starting node ID. | |||
| start_dim: The position to insert the additional dimensions. | |||
| total_dim: The total number of dimensions the layer has before widening. | |||
| n_add: The number of dimensions to add. | |||
| """ | |||
| if (u, start_dim, total_dim, n_add) in self.vis: | |||
| return | |||
| self.vis[(u, start_dim, total_dim, n_add)] = True | |||
| for v, layer_id in self.adj_list[u]: | |||
| layer = self.layer_list[layer_id] | |||
| if is_layer(layer, "Conv"): | |||
| new_layer = wider_next_conv( | |||
| layer, start_dim, total_dim, n_add, self.weighted | |||
| ) | |||
| self._replace_layer(layer_id, new_layer) | |||
| elif is_layer(layer, "Dense"): | |||
| new_layer = wider_next_dense( | |||
| layer, start_dim, total_dim, n_add, self.weighted | |||
| ) | |||
| self._replace_layer(layer_id, new_layer) | |||
| elif is_layer(layer, "BatchNormalization"): | |||
| new_layer = wider_bn( | |||
| layer, start_dim, total_dim, n_add, self.weighted) | |||
| self._replace_layer(layer_id, new_layer) | |||
| self._search(v, start_dim, total_dim, n_add) | |||
| elif is_layer(layer, "Concatenate"): | |||
| if self.layer_id_to_input_node_ids[layer_id][1] == u: | |||
| # u is on the right of the concat | |||
| # next_start_dim += next_total_dim - total_dim | |||
| left_dim = self._upper_layer_width( | |||
| self.layer_id_to_input_node_ids[layer_id][0] | |||
| ) | |||
| next_start_dim = start_dim + left_dim | |||
| next_total_dim = total_dim + left_dim | |||
| else: | |||
| next_start_dim = start_dim | |||
| next_total_dim = total_dim + self._upper_layer_width( | |||
| self.layer_id_to_input_node_ids[layer_id][1] | |||
| ) | |||
| self._search(v, next_start_dim, next_total_dim, n_add) | |||
| else: | |||
| self._search(v, start_dim, total_dim, n_add) | |||
| for v, layer_id in self.reverse_adj_list[u]: | |||
| layer = self.layer_list[layer_id] | |||
| if is_layer(layer, "Conv"): | |||
| new_layer = wider_pre_conv(layer, n_add, self.weighted) | |||
| self._replace_layer(layer_id, new_layer) | |||
| elif is_layer(layer, "Dense"): | |||
| new_layer = wider_pre_dense(layer, n_add, self.weighted) | |||
| self._replace_layer(layer_id, new_layer) | |||
| elif is_layer(layer, "Concatenate"): | |||
| continue | |||
| else: | |||
| self._search(v, start_dim, total_dim, n_add) | |||
| def _upper_layer_width(self, u): | |||
| for v, layer_id in self.reverse_adj_list[u]: | |||
| layer = self.layer_list[layer_id] | |||
| if is_layer(layer, "Conv") or is_layer(layer, "Dense"): | |||
| return layer_width(layer) | |||
| elif is_layer(layer, "Concatenate"): | |||
| a = self.layer_id_to_input_node_ids[layer_id][0] | |||
| b = self.layer_id_to_input_node_ids[layer_id][1] | |||
| return self._upper_layer_width(a) + self._upper_layer_width(b) | |||
| else: | |||
| return self._upper_layer_width(v) | |||
| return self.node_list[0].shape[-1] | |||
| def to_deeper_model(self, target_id, new_layer): | |||
| """Insert a relu-conv-bn block after the target block. | |||
| Args: | |||
| target_id: A convolutional layer ID. The new block should be inserted after the block. | |||
| new_layer: An instance of StubLayer subclasses. | |||
| """ | |||
| self.operation_history.append( | |||
| ("to_deeper_model", target_id, new_layer)) | |||
| input_id = self.layer_id_to_input_node_ids[target_id][0] | |||
| output_id = self.layer_id_to_output_node_ids[target_id][0] | |||
| if self.weighted: | |||
| if is_layer(new_layer, "Dense"): | |||
| init_dense_weight(new_layer) | |||
| elif is_layer(new_layer, "Conv"): | |||
| init_conv_weight(new_layer) | |||
| elif is_layer(new_layer, "BatchNormalization"): | |||
| init_bn_weight(new_layer) | |||
| self._insert_new_layers([new_layer], input_id, output_id) | |||
| def to_wider_model(self, pre_layer_id, n_add): | |||
| """Widen the last dimension of the output of the pre_layer. | |||
| Args: | |||
| pre_layer_id: The ID of a convolutional layer or dense layer. | |||
| n_add: The number of dimensions to add. | |||
| """ | |||
| self.operation_history.append(("to_wider_model", pre_layer_id, n_add)) | |||
| pre_layer = self.layer_list[pre_layer_id] | |||
| output_id = self.layer_id_to_output_node_ids[pre_layer_id][0] | |||
| dim = layer_width(pre_layer) | |||
| self.vis = {} | |||
| self._search(output_id, dim, dim, n_add) | |||
| # Update the tensor shapes. | |||
| for u in self.topological_order: | |||
| for v, layer_id in self.adj_list[u]: | |||
| self.node_list[v].shape = self.layer_list[layer_id].output_shape | |||
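| # Illustrative call (layer id and width are hypothetical): graph.to_wider_model(3, 32) widens | |||
| # the output of layer 3 by 32 channels and records ("to_wider_model", 3, 32) in | |||
| # operation_history, so the same morph can later be replayed on another copy of the graph. | |||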
| def _insert_new_layers(self, new_layers, start_node_id, end_node_id): | |||
| """Insert the new_layers after the node with start_node_id.""" | |||
| new_node_id = self._add_node(deepcopy(self.node_list[end_node_id])) | |||
| temp_output_id = new_node_id | |||
| for layer in new_layers[:-1]: | |||
| temp_output_id = self.add_layer(layer, temp_output_id) | |||
| self._add_edge(new_layers[-1], temp_output_id, end_node_id) | |||
| new_layers[-1].input = self.node_list[temp_output_id] | |||
| new_layers[-1].output = self.node_list[end_node_id] | |||
| self._redirect_edge(start_node_id, end_node_id, new_node_id) | |||
| def _block_end_node(self, layer_id, block_size): | |||
| ret = self.layer_id_to_output_node_ids[layer_id][0] | |||
| for _ in range(block_size - 2): | |||
| ret = self.adj_list[ret][0][0] | |||
| return ret | |||
| def _dense_block_end_node(self, layer_id): | |||
| return self.layer_id_to_input_node_ids[layer_id][0] | |||
| def _conv_block_end_node(self, layer_id): | |||
| """Get the input node ID of the last layer in the block by layer ID. | |||
| Return the input node ID of the last layer in the convolutional block. | |||
| Args: | |||
| layer_id: the convolutional layer ID. | |||
| """ | |||
| return self._block_end_node(layer_id, Constant.CONV_BLOCK_DISTANCE) | |||
| def to_add_skip_model(self, start_id, end_id): | |||
| """Add a weighted add skip-connection from after start node to end node. | |||
| Args: | |||
| start_id: The convolutional layer ID, after which to start the skip-connection. | |||
| end_id: The convolutional layer ID, after which to end the skip-connection. | |||
| """ | |||
| self.operation_history.append(("to_add_skip_model", start_id, end_id)) | |||
| filters_end = self.layer_list[end_id].output.shape[-1] | |||
| filters_start = self.layer_list[start_id].output.shape[-1] | |||
| start_node_id = self.layer_id_to_output_node_ids[start_id][0] | |||
| pre_end_node_id = self.layer_id_to_input_node_ids[end_id][0] | |||
| end_node_id = self.layer_id_to_output_node_ids[end_id][0] | |||
| skip_output_id = self._insert_pooling_layer_chain( | |||
| start_node_id, end_node_id) | |||
| # Add the conv layer in order to align the number of channels with end layer id | |||
| new_conv_layer = get_conv_class( | |||
| self.n_dim)( | |||
| filters_start, | |||
| filters_end, | |||
| 1) | |||
| skip_output_id = self.add_layer(new_conv_layer, skip_output_id) | |||
| # Add the add layer. | |||
| add_input_node_id = self._add_node( | |||
| deepcopy(self.node_list[end_node_id])) | |||
| add_layer = StubAdd() | |||
| self._redirect_edge(pre_end_node_id, end_node_id, add_input_node_id) | |||
| self._add_edge(add_layer, add_input_node_id, end_node_id) | |||
| self._add_edge(add_layer, skip_output_id, end_node_id) | |||
| add_layer.input = [ | |||
| self.node_list[add_input_node_id], | |||
| self.node_list[skip_output_id], | |||
| ] | |||
| add_layer.output = self.node_list[end_node_id] | |||
| self.node_list[end_node_id].shape = add_layer.output_shape | |||
| # Set weights to the additional conv layer. | |||
| if self.weighted: | |||
| filter_shape = (1,) * self.n_dim | |||
| weights = np.zeros((filters_end, filters_start) + filter_shape) | |||
| bias = np.zeros(filters_end) | |||
| new_conv_layer.set_weights( | |||
| (add_noise(weights, np.array([0, 1])), add_noise( | |||
| bias, np.array([0, 1]))) | |||
| ) | |||
| def to_concat_skip_model(self, start_id, end_id): | |||
| """Add a weighted add concatenate connection from after start node to end node. | |||
| Args: | |||
| start_id: The convolutional layer ID, after which to start the skip-connection. | |||
| end_id: The convolutional layer ID, after which to end the skip-connection. | |||
| """ | |||
| self.operation_history.append( | |||
| ("to_concat_skip_model", start_id, end_id)) | |||
| filters_end = self.layer_list[end_id].output.shape[-1] | |||
| filters_start = self.layer_list[start_id].output.shape[-1] | |||
| start_node_id = self.layer_id_to_output_node_ids[start_id][0] | |||
| pre_end_node_id = self.layer_id_to_input_node_ids[end_id][0] | |||
| end_node_id = self.layer_id_to_output_node_ids[end_id][0] | |||
| skip_output_id = self._insert_pooling_layer_chain( | |||
| start_node_id, end_node_id) | |||
| concat_input_node_id = self._add_node( | |||
| deepcopy(self.node_list[end_node_id])) | |||
| self._redirect_edge(pre_end_node_id, end_node_id, concat_input_node_id) | |||
| concat_layer = StubConcatenate() | |||
| concat_layer.input = [ | |||
| self.node_list[concat_input_node_id], | |||
| self.node_list[skip_output_id], | |||
| ] | |||
| concat_output_node_id = self._add_node(Node(concat_layer.output_shape)) | |||
| self._add_edge( | |||
| concat_layer, | |||
| concat_input_node_id, | |||
| concat_output_node_id) | |||
| self._add_edge(concat_layer, skip_output_id, concat_output_node_id) | |||
| concat_layer.output = self.node_list[concat_output_node_id] | |||
| self.node_list[concat_output_node_id].shape = concat_layer.output_shape | |||
| # Concatenation increases the channel count; add a 1x1 conv layer to project back to the original number of channels of the end layer. | |||
| new_conv_layer = get_conv_class(self.n_dim)( | |||
| filters_start + filters_end, filters_end, 1 | |||
| ) | |||
| self._add_edge(new_conv_layer, concat_output_node_id, end_node_id) | |||
| new_conv_layer.input = self.node_list[concat_output_node_id] | |||
| new_conv_layer.output = self.node_list[end_node_id] | |||
| self.node_list[end_node_id].shape = new_conv_layer.output_shape | |||
| if self.weighted: | |||
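| # Initialize the new 1x1 conv as an identity mapping over the original end-layer channels | |||
| # and zeros over the concatenated skip channels, so the output is initially (almost) unchanged. | |||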
| filter_shape = (1,) * self.n_dim | |||
| weights = np.zeros((filters_end, filters_end) + filter_shape) | |||
| for i in range(filters_end): | |||
| filter_weight = np.zeros((filters_end,) + filter_shape) | |||
| center_index = (i,) + (0,) * self.n_dim | |||
| filter_weight[center_index] = 1 | |||
| weights[i, ...] = filter_weight | |||
| weights = np.concatenate( | |||
| (weights, np.zeros((filters_end, filters_start) + filter_shape)), axis=1 | |||
| ) | |||
| bias = np.zeros(filters_end) | |||
| new_conv_layer.set_weights( | |||
| (add_noise(weights, np.array([0, 1])), add_noise( | |||
| bias, np.array([0, 1]))) | |||
| ) | |||
| def _insert_pooling_layer_chain(self, start_node_id, end_node_id): | |||
| """ | |||
| Replicate the pooling (and strided conv) layers found between start_node_id and end_node_id along the skip path, then append a ReLU. | |||
| """ | |||
| skip_output_id = start_node_id | |||
| # Collect every pooling layer between start_node_id and end_node_id (including conv layers with stride > 1). | |||
| for layer in self._get_pooling_layers(start_node_id, end_node_id): | |||
| new_layer = deepcopy(layer) | |||
| # Conv layers are replaced by a fresh 1x1 conv with the same stride, and their weights are re-initialized. | |||
| if is_layer(new_layer, "Conv"): | |||
| # Number of channels at start_node_id. | |||
| filters = self.node_list[start_node_id].shape[-1] | |||
| new_layer = get_conv_class(self.n_dim)( | |||
| filters, filters, 1, layer.stride) | |||
| if self.weighted: | |||
| init_conv_weight(new_layer) | |||
| else: | |||
| new_layer = deepcopy(layer) | |||
| skip_output_id = self.add_layer(new_layer, skip_output_id) | |||
| skip_output_id = self.add_layer(StubReLU(), skip_output_id) | |||
| return skip_output_id | |||
| def extract_descriptor(self): | |||
| """Extract the the description of the Graph as an instance of NetworkDescriptor.""" | |||
| main_chain = self.get_main_chain() | |||
| index_in_main_chain = {} | |||
| for index, u in enumerate(main_chain): | |||
| index_in_main_chain[u] = index | |||
| ret = NetworkDescriptor() | |||
| for u in main_chain: | |||
| for v, layer_id in self.adj_list[u]: | |||
| if v not in index_in_main_chain: | |||
| continue | |||
| layer = self.layer_list[layer_id] | |||
| copied_layer = copy(layer) | |||
| copied_layer.weights = None | |||
| ret.add_layer(deepcopy(copied_layer)) | |||
| for u in index_in_main_chain: | |||
| for v, layer_id in self.adj_list[u]: | |||
| if v not in index_in_main_chain: | |||
| temp_u = u | |||
| temp_v = v | |||
| temp_layer_id = layer_id | |||
| skip_type = None | |||
| while not ( | |||
| temp_v in index_in_main_chain and temp_u in index_in_main_chain): | |||
| if is_layer( | |||
| self.layer_list[temp_layer_id], "Concatenate"): | |||
| skip_type = NetworkDescriptor.CONCAT_CONNECT | |||
| if is_layer(self.layer_list[temp_layer_id], "Add"): | |||
| skip_type = NetworkDescriptor.ADD_CONNECT | |||
| temp_u = temp_v | |||
| temp_v, temp_layer_id = self.adj_list[temp_v][0] | |||
| ret.add_skip_connection( | |||
| index_in_main_chain[u], index_in_main_chain[temp_u], skip_type | |||
| ) | |||
| elif index_in_main_chain[v] - index_in_main_chain[u] != 1: | |||
| skip_type = None | |||
| if is_layer(self.layer_list[layer_id], "Concatenate"): | |||
| skip_type = NetworkDescriptor.CONCAT_CONNECT | |||
| if is_layer(self.layer_list[layer_id], "Add"): | |||
| skip_type = NetworkDescriptor.ADD_CONNECT | |||
| ret.add_skip_connection( | |||
| index_in_main_chain[u], index_in_main_chain[v], skip_type | |||
| ) | |||
| return ret | |||
| def clear_weights(self): | |||
| ''' Clear the weights of all layers in the graph. | |||
| ''' | |||
| self.weighted = False | |||
| for layer in self.layer_list: | |||
| layer.weights = None | |||
| def produce_torch_model(self): | |||
| """Build a new Torch model based on the current graph.""" | |||
| return TorchModel(self) | |||
| def produce_json_model(self): | |||
| """Build a new Json model based on the current graph.""" | |||
| return JSONModel(self).data | |||
| @classmethod | |||
| def parsing_json_model(cls, json_model): | |||
| '''Build a graph from a JSON string. | |||
| ''' | |||
| return json_to_graph(json_model) | |||
| def _layer_ids_in_order(self, layer_ids): | |||
| node_id_to_order_index = {} | |||
| for index, node_id in enumerate(self.topological_order): | |||
| node_id_to_order_index[node_id] = index | |||
| return sorted( | |||
| layer_ids, | |||
| key=lambda layer_id: node_id_to_order_index[ | |||
| self.layer_id_to_output_node_ids[layer_id][0] | |||
| ], | |||
| ) | |||
| def _layer_ids_by_type(self, type_str): | |||
| return list( | |||
| filter( | |||
| lambda layer_id: is_layer(self.layer_list[layer_id], type_str), | |||
| range(self.n_layers), | |||
| ) | |||
| ) | |||
| def get_main_chain_layers(self): | |||
| """Return a list of layer IDs in the main chain.""" | |||
| main_chain = self.get_main_chain() | |||
| ret = [] | |||
| for u in main_chain: | |||
| for v, layer_id in self.adj_list[u]: | |||
| if v in main_chain and u in main_chain: | |||
| ret.append(layer_id) | |||
| return ret | |||
| def _conv_layer_ids_in_order(self): | |||
| return list( | |||
| filter( | |||
| lambda layer_id: is_layer(self.layer_list[layer_id], "Conv"), | |||
| self.get_main_chain_layers(), | |||
| ) | |||
| ) | |||
| def _dense_layer_ids_in_order(self): | |||
| return self._layer_ids_in_order(self._layer_ids_by_type("Dense")) | |||
| def deep_layer_ids(self): | |||
| ret = [] | |||
| for layer_id in self.get_main_chain_layers(): | |||
| layer = self.layer_list[layer_id] | |||
| # Stop before global average pooling; no layers are inserted after it. | |||
| if is_layer(layer, "GlobalAveragePooling"): | |||
| break | |||
| if is_layer(layer, "Add") or is_layer(layer, "Concatenate"): | |||
| continue | |||
| ret.append(layer_id) | |||
| return ret | |||
| def wide_layer_ids(self): | |||
| return ( | |||
| self._conv_layer_ids_in_order( | |||
| )[:-1] + self._dense_layer_ids_in_order()[:-1] | |||
| ) | |||
| def skip_connection_layer_ids(self): | |||
| return self.deep_layer_ids()[:-1] | |||
| def size(self): | |||
| return sum(list(map(lambda x: x.size(), self.layer_list))) | |||
| def get_main_chain(self): | |||
| """Returns the main chain node ID list.""" | |||
| pre_node = {} | |||
| distance = {} | |||
| # Initialize each node's distance to 0 and its predecessor to itself. | |||
| for i in range(self.n_nodes): | |||
| distance[i] = 0 | |||
| pre_node[i] = i | |||
| # Relax every edge n_nodes - 1 times to compute each node's longest-path distance and predecessor. | |||
| for i in range(self.n_nodes - 1): | |||
| for u in range(self.n_nodes): | |||
| for v, _ in self.adj_list[u]: | |||
| if distance[u] + 1 > distance[v]: | |||
| distance[v] = distance[u] + 1 | |||
| pre_node[v] = u | |||
| # temp_id tracks the node with the largest distance. | |||
| temp_id = 0 | |||
| for i in range(self.n_nodes): | |||
| if distance[i] > distance[temp_id]: | |||
| temp_id = i | |||
| # Walk backwards from the farthest node through its predecessors to recover the main chain. | |||
| ret = [] | |||
| for i in range(self.n_nodes + 5): | |||
| ret.append(temp_id) | |||
| if pre_node[temp_id] == temp_id: | |||
| break | |||
| temp_id = pre_node[temp_id] | |||
| assert temp_id == pre_node[temp_id] | |||
| ret.reverse() | |||
| return ret | |||
| class TorchModel(torch.nn.Module): | |||
| """A neural network class using pytorch constructed from an instance of Graph.""" | |||
| def __init__(self, graph): | |||
| super(TorchModel, self).__init__() | |||
| self.graph = graph | |||
| self.layers = torch.nn.ModuleList() | |||
| for layer in graph.layer_list: | |||
| self.layers.append(layer.to_real_layer()) | |||
| if graph.weighted: | |||
| for index, layer in enumerate(self.layers): | |||
| set_stub_weight_to_torch(self.graph.layer_list[index], layer) | |||
| for index, layer in enumerate(self.layers): | |||
| self.add_module(str(index), layer) | |||
| def forward(self, input_tensor): | |||
| topo_node_list = self.graph.topological_order | |||
| output_id = topo_node_list[-1] | |||
| input_id = topo_node_list[0] | |||
| node_list = deepcopy(self.graph.node_list) | |||
| node_list[input_id] = input_tensor | |||
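| # Walk the nodes in topological order; each incoming edge applies its layer to the tensor(s) | |||
| # stored at the edge's source node(s), and the result is stored at the destination node. | |||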
| for v in topo_node_list: | |||
| for u, layer_id in self.graph.reverse_adj_list[v]: | |||
| layer = self.graph.layer_list[layer_id] | |||
| torch_layer = self.layers[layer_id] | |||
| if isinstance(layer, (StubAdd, StubConcatenate)): | |||
| edge_input_tensor = list( | |||
| map( | |||
| lambda x: node_list[x], | |||
| self.graph.layer_id_to_input_node_ids[layer_id], | |||
| ) | |||
| ) | |||
| else: | |||
| edge_input_tensor = node_list[u] | |||
| temp_tensor = torch_layer(edge_input_tensor) | |||
| node_list[v] = temp_tensor | |||
| return node_list[output_id] | |||
| def set_weight_to_graph(self): | |||
| self.graph.weighted = True | |||
| for index, layer in enumerate(self.layers): | |||
| set_torch_weight_to_stub(layer, self.graph.layer_list[index]) | |||
| class JSONModel: | |||
| def __init__(self, graph): | |||
| data = dict() | |||
| node_list = list() | |||
| layer_list = list() | |||
| operation_history = list() | |||
| data["input_shape"] = graph.input_shape | |||
| vis = graph.vis | |||
| data["vis"] = list(vis.keys()) if vis is not None else None | |||
| data["weighted"] = graph.weighted | |||
| for item in graph.operation_history: | |||
| if item[0] == "to_deeper_model": | |||
| operation_history.append( | |||
| [ | |||
| item[0], | |||
| item[1], | |||
| layer_description_extractor(item[2], graph.node_to_id), | |||
| ] | |||
| ) | |||
| else: | |||
| operation_history.append(item) | |||
| data["operation_history"] = operation_history | |||
| data["layer_id_to_input_node_ids"] = graph.layer_id_to_input_node_ids | |||
| data["layer_id_to_output_node_ids"] = graph.layer_id_to_output_node_ids | |||
| data["adj_list"] = graph.adj_list | |||
| data["reverse_adj_list"] = graph.reverse_adj_list | |||
| for node in graph.node_list: | |||
| node_id = graph.node_to_id[node] | |||
| node_information = node.shape | |||
| node_list.append((node_id, node_information)) | |||
| for layer_id, item in enumerate(graph.layer_list): | |||
| layer = graph.layer_list[layer_id] | |||
| layer_information = layer_description_extractor( | |||
| layer, graph.node_to_id) | |||
| layer_list.append((layer_id, layer_information)) | |||
| data["node_list"] = node_list | |||
| data["layer_list"] = layer_list | |||
| self.data = data | |||
| def graph_to_json(graph, json_model_path): | |||
| json_out = graph.produce_json_model() | |||
| with open(json_model_path, "w") as fout: | |||
| json.dump(json_out, fout) | |||
| json_out = json.dumps(json_out) | |||
| return json_out | |||
| def json_to_graph(json_model: str): | |||
| json_model = json.loads(json_model) | |||
| # restore graph data from json data | |||
| input_shape = tuple(json_model["input_shape"]) | |||
| node_list = list() | |||
| node_to_id = dict() | |||
| id_to_node = dict() | |||
| layer_list = list() | |||
| layer_to_id = dict() | |||
| operation_history = list() | |||
| graph = Graph(input_shape, False) | |||
| graph.input_shape = input_shape | |||
| vis = json_model["vis"] | |||
| graph.vis = { | |||
| tuple(item): True for item in vis} if vis is not None else None | |||
| graph.weighted = json_model["weighted"] | |||
| layer_id_to_input_node_ids = json_model["layer_id_to_input_node_ids"] | |||
| graph.layer_id_to_input_node_ids = { | |||
| int(k): v for k, v in layer_id_to_input_node_ids.items() | |||
| } | |||
| layer_id_to_output_node_ids = json_model["layer_id_to_output_node_ids"] | |||
| graph.layer_id_to_output_node_ids = { | |||
| int(k): v for k, v in layer_id_to_output_node_ids.items() | |||
| } | |||
| adj_list = {} | |||
| for k, v in json_model["adj_list"].items(): | |||
| adj_list[int(k)] = [tuple(i) for i in v] | |||
| graph.adj_list = adj_list | |||
| reverse_adj_list = {} | |||
| for k, v in json_model["reverse_adj_list"].items(): | |||
| reverse_adj_list[int(k)] = [tuple(i) for i in v] | |||
| graph.reverse_adj_list = reverse_adj_list | |||
| for item in json_model["node_list"]: | |||
| new_node = Node(tuple(item[1])) | |||
| node_id = item[0] | |||
| node_list.append(new_node) | |||
| node_to_id[new_node] = node_id | |||
| id_to_node[node_id] = new_node | |||
| for item in json_model["operation_history"]: | |||
| if item[0] == "to_deeper_model": | |||
| operation_history.append( | |||
| (item[0], item[1], layer_description_builder(item[2], id_to_node)) | |||
| ) | |||
| else: | |||
| operation_history.append(item) | |||
| graph.operation_history = operation_history | |||
| for item in json_model["layer_list"]: | |||
| new_layer = layer_description_builder(item[1], id_to_node) | |||
| layer_id = int(item[0]) | |||
| layer_list.append(new_layer) | |||
| layer_to_id[new_layer] = layer_id | |||
| graph.node_list = node_list | |||
| graph.node_to_id = node_to_id | |||
| graph.layer_list = layer_list | |||
| graph.layer_to_id = layer_to_id | |||
| return graph | |||
| @@ -0,0 +1,178 @@ | |||
| from copy import deepcopy | |||
| from random import randrange, sample | |||
| from .graph import NetworkDescriptor | |||
| from .layers import ( | |||
| StubDense, | |||
| StubReLU, | |||
| get_batch_norm_class, | |||
| get_conv_class, | |||
| get_dropout_class, | |||
| get_pooling_class, | |||
| is_layer, | |||
| ) | |||
| from utils import Constant | |||
| def to_wider_graph(graph): | |||
| ''' Widen a randomly chosen layer of the graph. Returns None if the chosen layer cannot be widened further. | |||
| ''' | |||
| weighted_layer_ids = graph.wide_layer_ids() | |||
| weighted_layer_ids = list( | |||
| filter( | |||
| lambda x: graph.layer_list[x].output.shape[-1], weighted_layer_ids) | |||
| ) | |||
| wider_layers = sample(weighted_layer_ids, 1) | |||
| # Count the chosen layers whose doubled width would exceed the maximum layer width. | |||
| layer_width_maxed = 0 | |||
| for layer_id in wider_layers: | |||
| layer = graph.layer_list[layer_id] | |||
| if is_layer(layer, "Conv"): | |||
| n_add = layer.filters | |||
| else: | |||
| n_add = layer.units | |||
| if n_add*2 > Constant.MAX_LAYER_WIDTH: | |||
| layer_width_maxed += 1 | |||
| continue | |||
| graph.to_wider_model(layer_id, n_add) | |||
| if layer_width_maxed == len(wider_layers): | |||
| return None | |||
| return graph | |||
| def to_skip_connection_graph(graph): | |||
| ''' Add a randomly chosen skip-connection (add or concatenate) between two layers of the graph. | |||
| ''' | |||
| # The last conv layer cannot be widened since the wider operator cannot be applied | |||
| # across the two sides of the flatten layer. | |||
| weighted_layer_ids = graph.skip_connection_layer_ids() | |||
| valid_connection = [] | |||
| for skip_type in sorted( | |||
| [NetworkDescriptor.ADD_CONNECT, NetworkDescriptor.CONCAT_CONNECT]): | |||
| for index_a in range(len(weighted_layer_ids)): | |||
| for index_b in range(len(weighted_layer_ids))[index_a + 1:]: | |||
| valid_connection.append((index_a, index_b, skip_type)) | |||
| if len(valid_connection) < 1: | |||
| return graph | |||
| for index_a, index_b, skip_type in sample(valid_connection, 1): | |||
| a_id = weighted_layer_ids[index_a] | |||
| b_id = weighted_layer_ids[index_b] | |||
| if skip_type == NetworkDescriptor.ADD_CONNECT: | |||
| graph.to_add_skip_model(a_id, b_id) | |||
| else: | |||
| graph.to_concat_skip_model(a_id, b_id) | |||
| return graph | |||
| def create_new_layer(layer, n_dim): | |||
| ''' Create a new layer for deepening the graph, chosen according to the type of the given layer. | |||
| ''' | |||
| input_shape = layer.output.shape | |||
| # Default candidate layer classes for the dense part and the conv part. | |||
| dense_deeper_classes = [StubDense, get_dropout_class(n_dim), StubReLU] | |||
| conv_deeper_classes = [ | |||
| get_conv_class(n_dim), | |||
| get_batch_norm_class(n_dim), | |||
| StubReLU] | |||
| # Three layer types (ReLU, Dropout, BatchNormalization) use special candidate lists. | |||
| if is_layer(layer, "ReLU"): | |||
| conv_deeper_classes = [ | |||
| get_conv_class(n_dim), | |||
| get_batch_norm_class(n_dim)] | |||
| dense_deeper_classes = [StubDense, get_dropout_class(n_dim)] | |||
| elif is_layer(layer, "Dropout"): | |||
| dense_deeper_classes = [StubDense, StubReLU] | |||
| elif is_layer(layer, "BatchNormalization"): | |||
| conv_deeper_classes = [get_conv_class(n_dim), StubReLU] | |||
| layer_class = None | |||
| if len(input_shape) == 1: | |||
| # It is in the dense layer part. | |||
| layer_class = sample(dense_deeper_classes, 1)[0] | |||
| else: | |||
| # It is in the conv layer part. | |||
| layer_class = sample(conv_deeper_classes, 1)[0] | |||
| if layer_class == StubDense: | |||
| new_layer = StubDense(input_shape[0], input_shape[0]) | |||
| elif layer_class == get_dropout_class(n_dim): | |||
| new_layer = layer_class(Constant.DENSE_DROPOUT_RATE) | |||
| elif layer_class == get_conv_class(n_dim): | |||
| new_layer = layer_class( | |||
| input_shape[-1], input_shape[-1], sample((1, 3, 5), 1)[0], stride=1 | |||
| ) | |||
| elif layer_class == get_batch_norm_class(n_dim): | |||
| new_layer = layer_class(input_shape[-1]) | |||
| elif layer_class == get_pooling_class(n_dim): | |||
| new_layer = layer_class(sample((1, 3, 5), 1)[0]) | |||
| else: | |||
| new_layer = layer_class() | |||
| return new_layer | |||
| def to_deeper_graph(graph): | |||
| ''' Deepen the graph by inserting a new layer after a randomly chosen layer. Returns None if the graph already has the maximum number of layers. | |||
| ''' | |||
| weighted_layer_ids = graph.deep_layer_ids() | |||
| if len(weighted_layer_ids) >= Constant.MAX_LAYERS: | |||
| return None | |||
| deeper_layer_ids = sample(weighted_layer_ids, 1) | |||
| for layer_id in deeper_layer_ids: | |||
| layer = graph.layer_list[layer_id] | |||
| new_layer = create_new_layer(layer, graph.n_dim) | |||
| graph.to_deeper_model(layer_id, new_layer) | |||
| return graph | |||
| def legal_graph(graph): | |||
| '''Judge whether a graph is legal, i.e. it contains no duplicated skip-connections. | |||
| ''' | |||
| descriptor = graph.extract_descriptor() | |||
| skips = descriptor.skip_connections | |||
| if len(skips) != len(set(skips)): | |||
| return False | |||
| return True | |||
| # morph f with operations in O | |||
| def transform(graph): | |||
| '''Core transform function: generate up to Constant.N_NEIGHBOURS morphed variants of the graph. | |||
| ''' | |||
| graphs = [] | |||
| for _ in range(Constant.N_NEIGHBOURS * 2): | |||
| random_num = randrange(3) | |||
| temp_graph = None | |||
| if random_num == 0: | |||
| temp_graph = to_deeper_graph(deepcopy(graph)) | |||
| elif random_num == 1: | |||
| temp_graph = to_wider_graph(deepcopy(graph)) | |||
| elif random_num == 2: | |||
| temp_graph = to_skip_connection_graph(deepcopy(graph)) | |||
| if temp_graph is not None and temp_graph.size() <= Constant.MAX_MODEL_SIZE: | |||
| graphs.append(temp_graph) | |||
| # Stop once enough neighbour graphs have been generated (at most 2 * N_NEIGHBOURS attempts). | |||
| if len(graphs) >= Constant.N_NEIGHBOURS: | |||
| break | |||
| return graphs | |||
| @@ -0,0 +1,213 @@ | |||
| import numpy as np | |||
| from .layers import ( | |||
| StubDense, | |||
| get_batch_norm_class, | |||
| get_conv_class, | |||
| get_n_dim, | |||
| ) | |||
| NOISE_RATIO = 1e-4 | |||
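| # add_noise (below) draws uniform noise whose range is NOISE_RATIO times the peak-to-peak | |||
| # range of a reference weight array; it is used to break symmetry between copied units. | |||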
| def wider_pre_dense(layer, n_add, weighted=True): | |||
| '''Widen the preceding dense layer by n_add units. | |||
| ''' | |||
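| # Widening follows the Net2Net idea: each new unit copies a randomly chosen existing unit's | |||
| # weights (a row of the weight matrix), with small noise added to break symmetry. | |||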
| if not weighted: | |||
| return StubDense(layer.input_units, layer.units + n_add) | |||
| n_units2 = layer.units | |||
| teacher_w, teacher_b = layer.get_weights() | |||
| rand = np.random.randint(n_units2, size=n_add) | |||
| student_w = teacher_w.copy() | |||
| student_b = teacher_b.copy() | |||
| # target layer update (i) | |||
| for i in range(n_add): | |||
| teacher_index = rand[i] | |||
| new_weight = teacher_w[teacher_index, :] | |||
| new_weight = new_weight[np.newaxis, :] | |||
| student_w = np.concatenate( | |||
| (student_w, add_noise(new_weight, student_w)), axis=0) | |||
| student_b = np.append( | |||
| student_b, add_noise( | |||
| teacher_b[teacher_index], student_b)) | |||
| new_pre_layer = StubDense(layer.input_units, n_units2 + n_add) | |||
| new_pre_layer.set_weights((student_w, student_b)) | |||
| return new_pre_layer | |||
| def wider_pre_conv(layer, n_add_filters, weighted=True): | |||
| '''Widen the preceding conv layer by n_add_filters filters. | |||
| ''' | |||
| n_dim = get_n_dim(layer) | |||
| if not weighted: | |||
| return get_conv_class(n_dim)( | |||
| layer.input_channel, | |||
| layer.filters + n_add_filters, | |||
| kernel_size=layer.kernel_size, | |||
| stride=layer.stride | |||
| ) | |||
| n_pre_filters = layer.filters | |||
| rand = np.random.randint(n_pre_filters, size=n_add_filters) | |||
| teacher_w, teacher_b = layer.get_weights() | |||
| student_w = teacher_w.copy() | |||
| student_b = teacher_b.copy() | |||
| # target layer update (i) | |||
| for i in range(len(rand)): | |||
| teacher_index = rand[i] | |||
| new_weight = teacher_w[teacher_index, ...] | |||
| new_weight = new_weight[np.newaxis, ...] | |||
| student_w = np.concatenate((student_w, new_weight), axis=0) | |||
| student_b = np.append(student_b, teacher_b[teacher_index]) | |||
| new_pre_layer = get_conv_class(n_dim)( | |||
| layer.input_channel, | |||
| n_pre_filters + n_add_filters, | |||
| kernel_size=layer.kernel_size, | |||
| stride=layer.stride | |||
| ) | |||
| new_pre_layer.set_weights( | |||
| (add_noise(student_w, teacher_w), add_noise(student_b, teacher_b)) | |||
| ) | |||
| return new_pre_layer | |||
| def wider_next_conv(layer, start_dim, total_dim, n_add, weighted=True): | |||
| '''Widen the following conv layer to accept n_add extra input channels. | |||
| ''' | |||
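| # The n_add new input channels receive zero weights (plus small noise), so the layer's | |||
| # output is initially unaffected by the widened input. | |||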
| n_dim = get_n_dim(layer) | |||
| if not weighted: | |||
| return get_conv_class(n_dim)(layer.input_channel + n_add, | |||
| layer.filters, | |||
| kernel_size=layer.kernel_size, | |||
| stride=layer.stride) | |||
| n_filters = layer.filters | |||
| teacher_w, teacher_b = layer.get_weights() | |||
| new_weight_shape = list(teacher_w.shape) | |||
| new_weight_shape[1] = n_add | |||
| new_weight = np.zeros(tuple(new_weight_shape)) | |||
| student_w = np.concatenate((teacher_w[:, :start_dim, ...].copy(), | |||
| add_noise(new_weight, teacher_w), | |||
| teacher_w[:, start_dim:total_dim, ...].copy()), axis=1) | |||
| new_layer = get_conv_class(n_dim)(layer.input_channel + n_add, | |||
| n_filters, | |||
| kernel_size=layer.kernel_size, | |||
| stride=layer.stride) | |||
| new_layer.set_weights((student_w, teacher_b)) | |||
| return new_layer | |||
| def wider_bn(layer, start_dim, total_dim, n_add, weighted=True): | |||
| '''Widen the batch-norm layer by n_add features. | |||
| ''' | |||
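| # New feature slots are initialized near identity: weight ~ 1, bias ~ 0, running_mean ~ 0, | |||
| # running_var ~ 1, each with small noise, and are spliced in at start_dim. | |||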
| n_dim = get_n_dim(layer) | |||
| if not weighted: | |||
| return get_batch_norm_class(n_dim)(layer.num_features + n_add) | |||
| weights = layer.get_weights() | |||
| new_weights = [ | |||
| add_noise(np.ones(n_add, dtype=np.float32), np.array([0, 1])), | |||
| add_noise(np.zeros(n_add, dtype=np.float32), np.array([0, 1])), | |||
| add_noise(np.zeros(n_add, dtype=np.float32), np.array([0, 1])), | |||
| add_noise(np.ones(n_add, dtype=np.float32), np.array([0, 1])), | |||
| ] | |||
| student_w = tuple() | |||
| for weight, new_weight in zip(weights, new_weights): | |||
| temp_w = weight.copy() | |||
| temp_w = np.concatenate( | |||
| (temp_w[:start_dim], new_weight, temp_w[start_dim:total_dim]) | |||
| ) | |||
| student_w += (temp_w,) | |||
| new_layer = get_batch_norm_class(n_dim)(layer.num_features + n_add) | |||
| new_layer.set_weights(student_w) | |||
| return new_layer | |||
| def wider_next_dense(layer, start_dim, total_dim, n_add, weighted=True): | |||
| '''Widen the following dense layer to accept n_add extra input units. | |||
| ''' | |||
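| # When the widened layer feeds a flatten, each channel corresponds to | |||
| # teacher_w.shape[1] / total_dim dense inputs; the new channels' columns are zero-initialized | |||
| # (plus noise) so the output is initially unchanged. | |||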
| if not weighted: | |||
| return StubDense(layer.input_units + n_add, layer.units) | |||
| teacher_w, teacher_b = layer.get_weights() | |||
| student_w = teacher_w.copy() | |||
| n_units_each_channel = int(teacher_w.shape[1] / total_dim) | |||
| new_weight = np.zeros((teacher_w.shape[0], n_add * n_units_each_channel)) | |||
| student_w = np.concatenate( | |||
| ( | |||
| student_w[:, : start_dim * n_units_each_channel], | |||
| add_noise(new_weight, student_w), | |||
| student_w[ | |||
| :, start_dim * n_units_each_channel: total_dim * n_units_each_channel | |||
| ], | |||
| ), | |||
| axis=1, | |||
| ) | |||
| new_layer = StubDense(layer.input_units + n_add, layer.units) | |||
| new_layer.set_weights((student_w, teacher_b)) | |||
| return new_layer | |||
| def add_noise(weights, other_weights): | |||
| '''Add small uniform noise to the weights, scaled by the value range of other_weights. | |||
| ''' | |||
| w_range = np.ptp(other_weights.flatten()) | |||
| noise_range = NOISE_RATIO * w_range | |||
| noise = np.random.uniform(-noise_range / 2.0, | |||
| noise_range / 2.0, weights.shape) | |||
| return np.add(noise, weights) | |||
| def init_dense_weight(layer): | |||
| '''Initialize dense layer weights as a (noisy) identity matrix. | |||
| ''' | |||
| units = layer.units | |||
| weight = np.eye(units) | |||
| bias = np.zeros(units) | |||
| layer.set_weights( | |||
| (add_noise(weight, np.array([0, 1])), | |||
| add_noise(bias, np.array([0, 1]))) | |||
| ) | |||
| def init_conv_weight(layer): | |||
| '''Initialize conv layer weights as a (noisy) identity kernel. | |||
| ''' | |||
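| # Builds a (noisy) identity kernel: filter i is 1 at its spatial center for input channel i | |||
| # and 0 elsewhere, so the conv initially passes its input through approximately unchanged. | |||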
| n_filters = layer.filters | |||
| filter_shape = (layer.kernel_size,) * get_n_dim(layer) | |||
| weight = np.zeros((n_filters, n_filters) + filter_shape) | |||
| center = tuple(map(lambda x: int((x - 1) / 2), filter_shape)) | |||
| for i in range(n_filters): | |||
| filter_weight = np.zeros((n_filters,) + filter_shape) | |||
| index = (i,) + center | |||
| filter_weight[index] = 1 | |||
| weight[i, ...] = filter_weight | |||
| bias = np.zeros(n_filters) | |||
| layer.set_weights( | |||
| (add_noise(weight, np.array([0, 1])), | |||
| add_noise(bias, np.array([0, 1]))) | |||
| ) | |||
| def init_bn_weight(layer): | |||
| '''Initialize batch-norm layer weights to a (noisy) identity transform. | |||
| ''' | |||
| n_filters = layer.num_features | |||
| new_weights = [ | |||
| add_noise(np.ones(n_filters, dtype=np.float32), np.array([0, 1])), | |||
| add_noise(np.zeros(n_filters, dtype=np.float32), np.array([0, 1])), | |||
| add_noise(np.zeros(n_filters, dtype=np.float32), np.array([0, 1])), | |||
| add_noise(np.ones(n_filters, dtype=np.float32), np.array([0, 1])), | |||
| ] | |||
| layer.set_weights(new_weights) | |||
| @@ -0,0 +1,765 @@ | |||
| from abc import abstractmethod | |||
| from collections.abc import Iterable | |||
| import torch | |||
| from torch import nn | |||
| from torch.nn import functional | |||
| from utils import Constant | |||
| class AvgPool(nn.Module): | |||
| """ | |||
| AvgPool Module. | |||
| """ | |||
| def __init__(self): | |||
| super().__init__() | |||
| @abstractmethod | |||
| def forward(self, input_tensor): | |||
| pass | |||
| class GlobalAvgPool1d(AvgPool): | |||
| """ | |||
| GlobalAvgPool1d Module. | |||
| """ | |||
| def forward(self, input_tensor): | |||
| return functional.avg_pool1d(input_tensor, input_tensor.size()[2:]).view( | |||
| input_tensor.size()[:2] | |||
| ) | |||
| class GlobalAvgPool2d(AvgPool): | |||
| """ | |||
| GlobalAvgPool2d Module. | |||
| """ | |||
| def forward(self, input_tensor): | |||
| return functional.avg_pool2d(input_tensor, input_tensor.size()[2:]).view( | |||
| input_tensor.size()[:2] | |||
| ) | |||
| class GlobalAvgPool3d(AvgPool): | |||
| """ | |||
| GlobalAvgPool3d Module. | |||
| """ | |||
| def forward(self, input_tensor): | |||
| return functional.avg_pool3d(input_tensor, input_tensor.size()[2:]).view( | |||
| input_tensor.size()[:2] | |||
| ) | |||
| class StubLayer: | |||
| """ | |||
| StubLayer Module. Base Module. | |||
| """ | |||
| def __init__(self, input_node=None, output_node=None): | |||
| self.input = input_node | |||
| self.output = output_node | |||
| self.weights = None | |||
| def build(self, shape): | |||
| """ | |||
| build shape. | |||
| """ | |||
| def set_weights(self, weights): | |||
| """ | |||
| set weights. | |||
| """ | |||
| self.weights = weights | |||
| def import_weights(self, torch_layer): | |||
| """ | |||
| import weights. | |||
| """ | |||
| def export_weights(self, torch_layer): | |||
| """ | |||
| export weights. | |||
| """ | |||
| def get_weights(self): | |||
| """ | |||
| get weights. | |||
| """ | |||
| return self.weights | |||
| def size(self): | |||
| """ | |||
| size(). | |||
| """ | |||
| return 0 | |||
| @property | |||
| def output_shape(self): | |||
| """ | |||
| output shape. | |||
| """ | |||
| return self.input.shape | |||
| def to_real_layer(self): | |||
| """ | |||
| to real layer. | |||
| """ | |||
| def __str__(self): | |||
| """ | |||
| str() function to print. | |||
| """ | |||
| return type(self).__name__[4:] | |||
| class StubWeightBiasLayer(StubLayer): | |||
| """ | |||
| StubWeightBiasLayer Module to set the bias. | |||
| """ | |||
| def import_weights(self, torch_layer): | |||
| self.set_weights( | |||
| (torch_layer.weight.data.cpu().numpy(), | |||
| torch_layer.bias.data.cpu().numpy()) | |||
| ) | |||
| def export_weights(self, torch_layer): | |||
| torch_layer.weight.data = torch.Tensor(self.weights[0]) | |||
| torch_layer.bias.data = torch.Tensor(self.weights[1]) | |||
| class StubBatchNormalization(StubWeightBiasLayer): | |||
| """ | |||
| StubBatchNormalization Module. Batch Norm. | |||
| """ | |||
| def __init__(self, num_features, input_node=None, output_node=None): | |||
| super().__init__(input_node, output_node) | |||
| self.num_features = num_features | |||
| def import_weights(self, torch_layer): | |||
| self.set_weights( | |||
| ( | |||
| torch_layer.weight.data.cpu().numpy(), | |||
| torch_layer.bias.data.cpu().numpy(), | |||
| torch_layer.running_mean.cpu().numpy(), | |||
| torch_layer.running_var.cpu().numpy(), | |||
| ) | |||
| ) | |||
| def export_weights(self, torch_layer): | |||
| torch_layer.weight.data = torch.Tensor(self.weights[0]) | |||
| torch_layer.bias.data = torch.Tensor(self.weights[1]) | |||
| torch_layer.running_mean = torch.Tensor(self.weights[2]) | |||
| torch_layer.running_var = torch.Tensor(self.weights[3]) | |||
| def size(self): | |||
| return self.num_features * 4 | |||
| @abstractmethod | |||
| def to_real_layer(self): | |||
| pass | |||
| class StubBatchNormalization1d(StubBatchNormalization): | |||
| """ | |||
| StubBatchNormalization1d Module. | |||
| """ | |||
| def to_real_layer(self): | |||
| return torch.nn.BatchNorm1d(self.num_features) | |||
| class StubBatchNormalization2d(StubBatchNormalization): | |||
| """ | |||
| StubBatchNormalization2d Module. | |||
| """ | |||
| def to_real_layer(self): | |||
| return torch.nn.BatchNorm2d(self.num_features) | |||
| class StubBatchNormalization3d(StubBatchNormalization): | |||
| """ | |||
| StubBatchNormalization3d Module. | |||
| """ | |||
| def to_real_layer(self): | |||
| return torch.nn.BatchNorm3d(self.num_features) | |||
| class StubDense(StubWeightBiasLayer): | |||
| """ | |||
| StubDense Module. Linear. | |||
| """ | |||
| def __init__(self, input_units, units, input_node=None, output_node=None): | |||
| super().__init__(input_node, output_node) | |||
| self.input_units = input_units | |||
| self.units = units | |||
| @property | |||
| def output_shape(self): | |||
| return (self.units,) | |||
| def size(self): | |||
| return self.input_units * self.units + self.units | |||
| def to_real_layer(self): | |||
| return torch.nn.Linear(self.input_units, self.units) | |||
| class StubConv(StubWeightBiasLayer): | |||
| """ | |||
| StubConv Module. Conv. | |||
| """ | |||
| def __init__(self, input_channel, filters, kernel_size, | |||
| stride=1, input_node=None, output_node=None): | |||
| super().__init__(input_node, output_node) | |||
| self.input_channel = input_channel | |||
| self.filters = filters | |||
| self.kernel_size = kernel_size | |||
| self.stride = stride | |||
| self.padding = int(self.kernel_size / 2) | |||
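| # 'Same'-style padding: kernel_size // 2 keeps the spatial size unchanged when stride == 1 | |||
| # (for odd kernel sizes). | |||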
| @property | |||
| def output_shape(self): | |||
| ret = list(self.input.shape[:-1]) | |||
| for index, dim in enumerate(ret): | |||
| ret[index] = ( | |||
| int((dim + 2 * self.padding - self.kernel_size) / self.stride) + 1 | |||
| ) | |||
| ret = ret + [self.filters] | |||
| return tuple(ret) | |||
| def size(self): | |||
| return (self.input_channel * self.kernel_size * | |||
| self.kernel_size + 1) * self.filters | |||
| @abstractmethod | |||
| def to_real_layer(self): | |||
| pass | |||
| def __str__(self): | |||
| return ( | |||
| super().__str__() | |||
| + "(" | |||
| + ", ".join( | |||
| str(item) | |||
| for item in [ | |||
| self.input_channel, | |||
| self.filters, | |||
| self.kernel_size, | |||
| self.stride, | |||
| ] | |||
| ) | |||
| + ")" | |||
| ) | |||
| class StubConv1d(StubConv): | |||
| """ | |||
| StubConv1d Module. | |||
| """ | |||
| def to_real_layer(self): | |||
| return torch.nn.Conv1d( | |||
| self.input_channel, | |||
| self.filters, | |||
| self.kernel_size, | |||
| stride=self.stride, | |||
| padding=self.padding, | |||
| ) | |||
| class StubConv2d(StubConv): | |||
| """ | |||
| StubConv2d Module. | |||
| """ | |||
| def to_real_layer(self): | |||
| return torch.nn.Conv2d( | |||
| self.input_channel, | |||
| self.filters, | |||
| self.kernel_size, | |||
| stride=self.stride, | |||
| padding=self.padding, | |||
| ) | |||
| class StubConv3d(StubConv): | |||
| """ | |||
| StubConv3d Module. | |||
| """ | |||
| def to_real_layer(self): | |||
| return torch.nn.Conv3d( | |||
| self.input_channel, | |||
| self.filters, | |||
| self.kernel_size, | |||
| stride=self.stride, | |||
| padding=self.padding, | |||
| ) | |||
| class StubAggregateLayer(StubLayer): | |||
| """ | |||
| StubAggregateLayer Module. | |||
| """ | |||
| def __init__(self, input_nodes=None, output_node=None): | |||
| if input_nodes is None: | |||
| input_nodes = [] | |||
| super().__init__(input_nodes, output_node) | |||
| class StubConcatenate(StubAggregateLayer): | |||
| """StubConcatenate Module. | |||
| """ | |||
| @property | |||
| def output_shape(self): | |||
| ret = 0 | |||
| for current_input in self.input: | |||
| ret += current_input.shape[-1] | |||
| ret = self.input[0].shape[:-1] + (ret,) | |||
| return ret | |||
| def to_real_layer(self): | |||
| return TorchConcatenate() | |||
| class StubAdd(StubAggregateLayer): | |||
| """ | |||
| StubAdd Module. | |||
| """ | |||
| @property | |||
| def output_shape(self): | |||
| return self.input[0].shape | |||
| def to_real_layer(self): | |||
| return TorchAdd() | |||
| class StubFlatten(StubLayer): | |||
| """ | |||
| StubFlatten Module. | |||
| """ | |||
| @property | |||
| def output_shape(self): | |||
| ret = 1 | |||
| for dim in self.input.shape: | |||
| ret *= dim | |||
| return (ret,) | |||
| def to_real_layer(self): | |||
| return TorchFlatten() | |||
| class StubReLU(StubLayer): | |||
| """ | |||
| StubReLU Module. | |||
| """ | |||
| def to_real_layer(self): | |||
| return torch.nn.ReLU() | |||
| class StubSoftmax(StubLayer): | |||
| """ | |||
| StubSoftmax Module. | |||
| """ | |||
| def to_real_layer(self): | |||
| return torch.nn.LogSoftmax(dim=1) | |||
| class StubDropout(StubLayer): | |||
| """ | |||
| StubDropout Module. | |||
| """ | |||
| def __init__(self, rate, input_node=None, output_node=None): | |||
| super().__init__(input_node, output_node) | |||
| self.rate = rate | |||
| @abstractmethod | |||
| def to_real_layer(self): | |||
| pass | |||
| class StubDropout1d(StubDropout): | |||
| """ | |||
| StubDropout1d Module. | |||
| """ | |||
| def to_real_layer(self): | |||
| return torch.nn.Dropout(self.rate) | |||
| class StubDropout2d(StubDropout): | |||
| """ | |||
| StubDropout2d Module. | |||
| """ | |||
| def to_real_layer(self): | |||
| return torch.nn.Dropout2d(self.rate) | |||
| class StubDropout3d(StubDropout): | |||
| """ | |||
| StubDropout3d Module. | |||
| """ | |||
| def to_real_layer(self): | |||
| return torch.nn.Dropout3d(self.rate) | |||
| class StubInput(StubLayer): | |||
| """ | |||
| StubInput Module. | |||
| """ | |||
| def __init__(self, input_node=None, output_node=None): | |||
| super().__init__(input_node, output_node) | |||
| class StubPooling(StubLayer): | |||
| """ | |||
| StubPooling Module. | |||
| """ | |||
| def __init__(self, | |||
| kernel_size=None, | |||
| stride=None, | |||
| padding=0, | |||
| input_node=None, | |||
| output_node=None): | |||
| super().__init__(input_node, output_node) | |||
| self.kernel_size = ( | |||
| kernel_size if kernel_size is not None else Constant.POOLING_KERNEL_SIZE | |||
| ) | |||
| self.stride = stride if stride is not None else self.kernel_size | |||
| self.padding = padding | |||
| @property | |||
| def output_shape(self): | |||
| ret = tuple() | |||
| for dim in self.input.shape[:-1]: | |||
| ret = ret + (max(int((dim + 2 * self.padding) / self.kernel_size), 1),) | |||
| ret = ret + (self.input.shape[-1],) | |||
| return ret | |||
| @abstractmethod | |||
| def to_real_layer(self): | |||
| pass | |||
| class StubPooling1d(StubPooling): | |||
| """ | |||
| StubPooling1d Module. | |||
| """ | |||
| def to_real_layer(self): | |||
| return torch.nn.MaxPool1d(self.kernel_size, stride=self.stride) | |||
| class StubPooling2d(StubPooling): | |||
| """ | |||
| StubPooling2d Module. | |||
| """ | |||
| def to_real_layer(self): | |||
| return torch.nn.MaxPool2d(self.kernel_size, stride=self.stride) | |||
| class StubPooling3d(StubPooling): | |||
| """ | |||
| StubPooling3d Module. | |||
| """ | |||
| def to_real_layer(self): | |||
| return torch.nn.MaxPool3d(self.kernel_size, stride=self.stride) | |||
| class StubGlobalPooling(StubLayer): | |||
| """ | |||
| StubGlobalPooling Module. | |||
| """ | |||
| def __init__(self, input_node=None, output_node=None): | |||
| super().__init__(input_node, output_node) | |||
| @property | |||
| def output_shape(self): | |||
| return (self.input.shape[-1],) | |||
| @abstractmethod | |||
| def to_real_layer(self): | |||
| pass | |||
| class StubGlobalPooling1d(StubGlobalPooling): | |||
| """ | |||
| StubGlobalPooling1d Module. | |||
| """ | |||
| def to_real_layer(self): | |||
| return GlobalAvgPool1d() | |||
| class StubGlobalPooling2d(StubGlobalPooling): | |||
| """ | |||
| StubGlobalPooling2d Module. | |||
| """ | |||
| def to_real_layer(self): | |||
| return GlobalAvgPool2d() | |||
| class StubGlobalPooling3d(StubGlobalPooling): | |||
| """ | |||
| StubGlobalPooling3d Module. | |||
| """ | |||
| def to_real_layer(self): | |||
| return GlobalAvgPool3d() | |||
| class TorchConcatenate(nn.Module): | |||
| """ | |||
| TorchConcatenate Module. | |||
| """ | |||
| def forward(self, input_list): | |||
| return torch.cat(input_list, dim=1) | |||
| class TorchAdd(nn.Module): | |||
| """ | |||
| TorchAdd Module. | |||
| """ | |||
| def forward(self, input_list): | |||
| return input_list[0] + input_list[1] | |||
| class TorchFlatten(nn.Module): | |||
| """ | |||
| TorchFlatten Module. | |||
| """ | |||
| def forward(self, input_tensor): | |||
| return input_tensor.view(input_tensor.size(0), -1) | |||
| def is_layer(layer, layer_type): | |||
| """ | |||
| Judge whether the given layer is of the given layer_type. | |||
| Returns | |||
| ------- | |||
| bool | |||
| True if the layer matches layer_type, False otherwise (None for an unrecognized layer_type). | |||
| """ | |||
| if layer_type == "Input": | |||
| return isinstance(layer, StubInput) | |||
| elif layer_type == "Conv": | |||
| return isinstance(layer, StubConv) | |||
| elif layer_type == "Dense": | |||
| return isinstance(layer, (StubDense,)) | |||
| elif layer_type == "BatchNormalization": | |||
| return isinstance(layer, (StubBatchNormalization,)) | |||
| elif layer_type == "Concatenate": | |||
| return isinstance(layer, (StubConcatenate,)) | |||
| elif layer_type == "Add": | |||
| return isinstance(layer, (StubAdd,)) | |||
| elif layer_type == "Pooling": | |||
| return isinstance(layer, StubPooling) | |||
| elif layer_type == "Dropout": | |||
| return isinstance(layer, (StubDropout,)) | |||
| elif layer_type == "Softmax": | |||
| return isinstance(layer, (StubSoftmax,)) | |||
| elif layer_type == "ReLU": | |||
| return isinstance(layer, (StubReLU,)) | |||
| elif layer_type == "Flatten": | |||
| return isinstance(layer, (StubFlatten,)) | |||
| elif layer_type == "GlobalAveragePooling": | |||
| return isinstance(layer, StubGlobalPooling) | |||
| return None  # Fallback for an unrecognized layer_type; this line was not written by the original author, modify it if you think it is incorrect. | |||
| def layer_description_extractor(layer, node_to_id): | |||
| """ | |||
| Get layer description. | |||
| """ | |||
| layer_input = layer.input | |||
| layer_output = layer.output | |||
| if layer_input is not None: | |||
| if isinstance(layer_input, Iterable): | |||
| layer_input = list(map(lambda x: node_to_id[x], layer_input)) | |||
| else: | |||
| layer_input = node_to_id[layer_input] | |||
| if layer_output is not None: | |||
| layer_output = node_to_id[layer_output] | |||
| if isinstance(layer, StubConv): | |||
| return ( | |||
| type(layer).__name__, | |||
| layer_input, | |||
| layer_output, | |||
| layer.input_channel, | |||
| layer.filters, | |||
| layer.kernel_size, | |||
| layer.stride, | |||
| layer.padding, | |||
| ) | |||
| elif isinstance(layer, (StubDense,)): | |||
| return [ | |||
| type(layer).__name__, | |||
| layer_input, | |||
| layer_output, | |||
| layer.input_units, | |||
| layer.units, | |||
| ] | |||
| elif isinstance(layer, (StubBatchNormalization,)): | |||
| return (type(layer).__name__, layer_input, | |||
| layer_output, layer.num_features) | |||
| elif isinstance(layer, (StubDropout,)): | |||
| return (type(layer).__name__, layer_input, layer_output, layer.rate) | |||
| elif isinstance(layer, StubPooling): | |||
| return ( | |||
| type(layer).__name__, | |||
| layer_input, | |||
| layer_output, | |||
| layer.kernel_size, | |||
| layer.stride, | |||
| layer.padding, | |||
| ) | |||
| else: | |||
| return (type(layer).__name__, layer_input, layer_output) | |||
| def layer_description_builder(layer_information, id_to_node): | |||
| """build layer from description. | |||
| """ | |||
| layer_type = layer_information[0] | |||
| layer_input_ids = layer_information[1] | |||
| if isinstance(layer_input_ids, Iterable): | |||
| layer_input = list(map(lambda x: id_to_node[x], layer_input_ids)) | |||
| else: | |||
| layer_input = id_to_node[layer_input_ids] | |||
| layer_output = id_to_node[layer_information[2]] | |||
| if layer_type.startswith("StubConv"): | |||
| input_channel = layer_information[3] | |||
| filters = layer_information[4] | |||
| kernel_size = layer_information[5] | |||
| stride = layer_information[6] | |||
| return globals()[layer_type]( | |||
| input_channel, filters, kernel_size, stride, layer_input, layer_output | |||
| ) | |||
| elif layer_type.startswith("StubDense"): | |||
| input_units = layer_information[3] | |||
| units = layer_information[4] | |||
| return globals()[layer_type](input_units, units, layer_input, layer_output) | |||
| elif layer_type.startswith("StubBatchNormalization"): | |||
| num_features = layer_information[3] | |||
| return globals()[layer_type](num_features, layer_input, layer_output) | |||
| elif layer_type.startswith("StubDropout"): | |||
| rate = layer_information[3] | |||
| return globals()[layer_type](rate, layer_input, layer_output) | |||
| elif layer_type.startswith("StubPooling"): | |||
| kernel_size = layer_information[3] | |||
| stride = layer_information[4] | |||
| padding = layer_information[5] | |||
| return globals()[layer_type](kernel_size, stride, padding, layer_input, layer_output) | |||
| else: | |||
| return globals()[layer_type](layer_input, layer_output) | |||
| def layer_width(layer): | |||
| """ | |||
| Get layer width. | |||
| """ | |||
| if is_layer(layer, "Dense"): | |||
| return layer.units | |||
| if is_layer(layer, "Conv"): | |||
| return layer.filters | |||
| raise TypeError("The layer should be either Dense or Conv layer.") | |||
| def set_torch_weight_to_stub(torch_layer, stub_layer): | |||
| stub_layer.import_weights(torch_layer) | |||
| def set_stub_weight_to_torch(stub_layer, torch_layer): | |||
| stub_layer.export_weights(torch_layer) | |||
| def get_conv_class(n_dim): | |||
| conv_class_list = [StubConv1d, StubConv2d, StubConv3d] | |||
| return conv_class_list[n_dim - 1] | |||
| def get_dropout_class(n_dim): | |||
| dropout_class_list = [StubDropout1d, StubDropout2d, StubDropout3d] | |||
| return dropout_class_list[n_dim - 1] | |||
| def get_global_avg_pooling_class(n_dim): | |||
| global_avg_pooling_class_list = [ | |||
| StubGlobalPooling1d, | |||
| StubGlobalPooling2d, | |||
| StubGlobalPooling3d, | |||
| ] | |||
| return global_avg_pooling_class_list[n_dim - 1] | |||
| def get_pooling_class(n_dim): | |||
| pooling_class_list = [StubPooling1d, StubPooling2d, StubPooling3d] | |||
| return pooling_class_list[n_dim - 1] | |||
| def get_batch_norm_class(n_dim): | |||
| batch_norm_class_list = [ | |||
| StubBatchNormalization1d, | |||
| StubBatchNormalization2d, | |||
| StubBatchNormalization3d, | |||
| ] | |||
| return batch_norm_class_list[n_dim - 1] | |||
| def get_n_dim(layer): | |||
| if isinstance(layer, ( | |||
| StubConv1d, | |||
| StubDropout1d, | |||
| StubGlobalPooling1d, | |||
| StubPooling1d, | |||
| StubBatchNormalization1d, | |||
| )): | |||
| return 1 | |||
| if isinstance(layer, ( | |||
| StubConv2d, | |||
| StubDropout2d, | |||
| StubGlobalPooling2d, | |||
| StubPooling2d, | |||
| StubBatchNormalization2d, | |||
| )): | |||
| return 2 | |||
| if isinstance(layer, ( | |||
| StubConv3d, | |||
| StubDropout3d, | |||
| StubGlobalPooling3d, | |||
| StubPooling3d, | |||
| StubBatchNormalization3d, | |||
| )): | |||
| return 3 | |||
| return -1 | |||
| @@ -0,0 +1,293 @@ | |||
| import logging | |||
| import os | |||
| import shutil | |||
| from utils import Constant, OptimizeMode | |||
| from .bayesian import BayesianOptimizer | |||
| from .nn import CnnGenerator, ResNetGenerator, MlpGenerator | |||
| from .graph import graph_to_json, json_to_graph | |||
| logger = logging.getLogger(__name__) | |||
| class NetworkMorphismSearcher: | |||
| """ | |||
| NetworkMorphismSearcher is a tuner which uses network morphism techniques. | |||
| Attributes | |||
| ---------- | |||
| n_classes : int | |||
| The class number or output node number (default: ``10``) | |||
| input_shape : tuple | |||
| A tuple including: (input_width, input_width, input_channel) | |||
| t_min : float | |||
| The minimum temperature for simulated annealing. (default: ``Constant.T_MIN``) | |||
| beta : float | |||
| The beta in acquisition function. (default: ``Constant.BETA``) | |||
| algorithm_name : str | |||
| algorithm name used in the network morphism (default: ``"Bayesian"``) | |||
| optimize_mode : str | |||
| optimize mode "minimize" or "maximize" (default: ``"minimize"``) | |||
| verbose : bool | |||
| verbose to print the log (default: ``True``) | |||
| bo : BayesianOptimizer | |||
| The optimizer used in the network morphism tuner. | |||
| max_model_size : int | |||
| max model size to the graph (default: ``Constant.MAX_MODEL_SIZE``) | |||
| default_model_len : int | |||
| default model length (default: ``Constant.MODEL_LEN``) | |||
| default_model_width : int | |||
| default model width (default: ``Constant.MODEL_WIDTH``) | |||
| search_space : dict | |||
| """ | |||
| def __init__( | |||
| self, | |||
| path, | |||
| best_selected_space_path, | |||
| task="cv", | |||
| input_width=32, | |||
| input_channel=3, | |||
| n_output_node=10, | |||
| algorithm_name="Bayesian", | |||
| optimize_mode="maximize", | |||
| verbose=True, | |||
| beta=Constant.BETA, | |||
| t_min=Constant.T_MIN, | |||
| max_model_size=Constant.MAX_MODEL_SIZE, | |||
| default_model_len=Constant.MODEL_LEN, | |||
| default_model_width=Constant.MODEL_WIDTH, | |||
| ): | |||
| """ | |||
| Initializer of the NetworkMorphismSearcher. | |||
| """ | |||
| self.path = path | |||
| self.best_selected_space_path = best_selected_space_path | |||
| if task == "cv": | |||
| self.generators = [CnnGenerator] | |||
| elif task == "common": | |||
| self.generators = [MlpGenerator] | |||
| else: | |||
| raise NotImplementedError( | |||
| '{} task not supported in List ["cv","common"]'.format(task)) | |||
| self.n_classes = n_output_node | |||
| self.input_shape = (input_width, input_width, input_channel) | |||
| self.t_min = t_min | |||
| self.beta = beta | |||
| self.algorithm_name = algorithm_name | |||
| self.optimize_mode = OptimizeMode(optimize_mode) | |||
| self.json = None | |||
| self.total_data = {} | |||
| self.verbose = verbose | |||
| self.bo = BayesianOptimizer( | |||
| self, self.t_min, self.optimize_mode, self.beta) | |||
| self.training_queue = [] | |||
| self.descriptors = [] | |||
| self.history = [] | |||
| self.max_model_size = max_model_size | |||
| self.default_model_len = default_model_len | |||
| self.default_model_width = default_model_width | |||
| def search(self, parameter_id, args): | |||
| """ | |||
| Return a trial neural architecture as a serializable object. | |||
| Parameters | |||
| ---------- | |||
| parameter_id : int | |||
| """ | |||
| if not self.history: | |||
| self.init_search(args) | |||
| new_father_id = None | |||
| generated_graph = None | |||
| # If the training queue is empty (i.e. the initial architectures from init_search have | |||
| # already been consumed), generate a new architecture from the search history. | |||
| if not self.training_queue: | |||
| new_father_id, generated_graph = self.generate() | |||
| new_model_id = args.trial_id | |||
| self.training_queue.append( | |||
| (generated_graph, new_father_id, new_model_id)) | |||
| self.descriptors.append(generated_graph.extract_descriptor()) | |||
| graph, father_id, model_id = self.training_queue.pop(0) | |||
| # from graph to json | |||
| json_out = graph_to_json(graph, os.path.join(self.path, str(model_id),'model_selected_space.json')) | |||
| self.total_data[parameter_id] = (json_out, father_id, model_id) | |||
| return json_out | |||
| def update_searcher(self, parameter_id, value, **kwargs): | |||
| """ | |||
| Record an observation of the objective function. | |||
| Parameters | |||
| ---------- | |||
| parameter_id : int | |||
| the id of a group of parameters generated by the NNI manager. | |||
| value : dict/float | |||
| if value is dict, it should have "default" key. | |||
| """ | |||
| if parameter_id not in self.total_data: | |||
| raise RuntimeError("Received parameter_id not in total_data.") | |||
| (_, father_id, model_id) = self.total_data[parameter_id] | |||
| graph = self.bo.searcher.load_model_by_id(model_id) | |||
| # to use the value and graph | |||
| self.add_model(value, model_id) | |||
| self.update(father_id, graph, value, model_id) | |||
| def init_search(self,args): | |||
| """ | |||
| Call the generators to generate the initial architectures for the search. | |||
| """ | |||
| if self.verbose: | |||
| logger.info("Initializing search.") | |||
| for generator in self.generators: | |||
| graph = generator(self.n_classes, self.input_shape).generate( | |||
| self.default_model_len, self.default_model_width | |||
| ) | |||
| model_id = args.trial_id | |||
| self.training_queue.append((graph, -1, model_id)) | |||
| self.descriptors.append(graph.extract_descriptor()) | |||
| if self.verbose: | |||
| logger.info("Initialization finished.") | |||
| def generate(self): | |||
| """ | |||
| Generate the next neural architecture. | |||
| Returns | |||
| ------- | |||
| other_info : any object | |||
| Anything to be saved in the training queue together with the architecture. | |||
| generated_graph : Graph | |||
| An instance of Graph. | |||
| """ | |||
| generated_graph, new_father_id = self.bo.generate(self.descriptors) | |||
| if new_father_id is None: | |||
| new_father_id = 0 | |||
| generated_graph = self.generators[0]( | |||
| self.n_classes, self.input_shape | |||
| ).generate(self.default_model_len, self.default_model_width) | |||
| return new_father_id, generated_graph | |||
| def update(self, other_info, graph, metric_value, model_id): | |||
| """ | |||
| Update the controller with evaluation result of a neural architecture. | |||
| Parameters | |||
| ---------- | |||
| other_info: any object | |||
| In our case it is the father ID in the search tree. | |||
| graph: Graph | |||
| An instance of Graph. The trained neural architecture. | |||
| metric_value: float | |||
| The final evaluated metric value. | |||
| model_id: int | |||
| """ | |||
| father_id = other_info | |||
| self.bo.fit([graph.extract_descriptor()], [metric_value]) | |||
| self.bo.add_child(father_id, model_id) | |||
| def add_model(self, metric_value, model_id): | |||
| """ | |||
| Add the model's evaluation result to the search history. | |||
| Parameters | |||
| ---------- | |||
| metric_value : float | |||
| model_id : int | |||
| Returns | |||
| ------- | |||
| model : dict | |||
| """ | |||
| if self.verbose: | |||
| logger.info("Saving model.") | |||
| # Update best_model text file | |||
| ret = {"model_id": model_id, "metric_value": metric_value} | |||
| self.history.append(ret) | |||
| # update best selected space | |||
| if model_id == self.get_best_model_id(): | |||
| best_model_path = os.path.join(self.path, str(model_id),'model_selected_space.json') | |||
| shutil.copy(best_model_path, self.best_selected_space_path) | |||
| return ret | |||
| def get_best_model_id(self): | |||
| """ | |||
| Get the best model_id from history using the metric value | |||
| """ | |||
| if self.optimize_mode is OptimizeMode.Maximize: | |||
| return max(self.history, key=lambda x: x["metric_value"])[ | |||
| "model_id"] | |||
| return min(self.history, key=lambda x: x["metric_value"])["model_id"] | |||
| def load_model_by_id(self, model_id): | |||
| """ | |||
| Get the model by model_id | |||
| Parameters | |||
| ---------- | |||
| model_id : int | |||
| model index | |||
| Returns | |||
| ------- | |||
| load_model : Graph | |||
| the model graph representation | |||
| """ | |||
| with open(os.path.join(self.path, str(model_id), "model_selected_space.json")) as fin: | |||
| json_str = fin.read().replace("\n", "") | |||
| load_model = json_to_graph(json_str) | |||
| return load_model | |||
| def load_best_model(self): | |||
| """ | |||
| Get the best model by model id | |||
| Returns | |||
| ------- | |||
| load_model : Graph | |||
| the model graph representation | |||
| """ | |||
| return self.load_model_by_id(self.get_best_model_id()) | |||
| def get_metric_value_by_id(self, model_id): | |||
| """ | |||
| Get the model metric value by its model_id | |||
| Parameters | |||
| ---------- | |||
| model_id : int | |||
| model index | |||
| Returns | |||
| ------- | |||
| float | |||
| the model metric | |||
| """ | |||
| for item in self.history: | |||
| if item["model_id"] == model_id: | |||
| return item["metric_value"] | |||
| return None | |||
| @@ -0,0 +1,227 @@ | |||
| from abc import abstractmethod | |||
| from .graph import Graph | |||
| from .layers import (StubAdd, StubDense, StubDropout1d, | |||
| StubReLU, get_batch_norm_class, | |||
| get_conv_class, | |||
| get_dropout_class, | |||
| get_global_avg_pooling_class, | |||
| get_pooling_class) | |||
| from utils import Constant | |||
| class NetworkGenerator: | |||
| """The base class for generating a network. | |||
| It can be used to generate a CNN or Multi-Layer Perceptron. | |||
| Attributes: | |||
| n_output_node: Number of output nodes in the network. | |||
| input_shape: A tuple to represent the input shape. | |||
| """ | |||
| def __init__(self, n_output_node, input_shape): | |||
| self.n_output_node = n_output_node | |||
| self.input_shape = input_shape | |||
| @abstractmethod | |||
| def generate(self, model_len, model_width): | |||
| pass | |||
| class CnnGenerator(NetworkGenerator): | |||
| """A class to generate CNN. | |||
| Attributes: | |||
| n_dim: `len(self.input_shape) - 1` | |||
| conv: A class that represents an `n_dim`-dimensional convolution. | |||
| dropout: A class that represents an `n_dim`-dimensional dropout. | |||
| global_avg_pooling: A class that represents an `n_dim`-dimensional global average pooling. | |||
| pooling: A class that represents an `n_dim`-dimensional pooling. | |||
| batch_norm: A class that represents an `n_dim`-dimensional batch normalization. | |||
| """ | |||
| def __init__(self, n_output_node, input_shape): | |||
| super(CnnGenerator, self).__init__(n_output_node, input_shape) | |||
| self.n_dim = len(self.input_shape) - 1 | |||
| if len(self.input_shape) > 4: | |||
| raise ValueError("The input dimension is too high.") | |||
| if len(self.input_shape) < 2: | |||
| raise ValueError("The input dimension is too low.") | |||
| self.conv = get_conv_class(self.n_dim) | |||
| self.dropout = get_dropout_class(self.n_dim) | |||
| self.global_avg_pooling = get_global_avg_pooling_class(self.n_dim) | |||
| self.pooling = get_pooling_class(self.n_dim) | |||
| self.batch_norm = get_batch_norm_class(self.n_dim) | |||
| def generate(self, model_len=None, model_width=None): | |||
| """Generates a CNN. | |||
| Args: | |||
| model_len: An integer. Number of convolutional layers. | |||
| model_width: An integer. Number of filters for the convolutional layers. | |||
| Returns: | |||
| An instance of the class Graph. Represents the neural architecture graph of the generated model. | |||
| """ | |||
| if model_len is None: | |||
| model_len = Constant.MODEL_LEN | |||
| if model_width is None: | |||
| model_width = Constant.MODEL_WIDTH | |||
| pooling_len = int(model_len / 4) | |||
| graph = Graph(self.input_shape, False) | |||
| temp_input_channel = self.input_shape[-1] | |||
| output_node_id = 0 | |||
| stride = 1 | |||
| for i in range(model_len): | |||
| output_node_id = graph.add_layer(StubReLU(), output_node_id) | |||
| output_node_id = graph.add_layer( | |||
| self.batch_norm( | |||
| graph.node_list[output_node_id].shape[-1]), output_node_id | |||
| ) | |||
| output_node_id = graph.add_layer( | |||
| self.conv( | |||
| temp_input_channel, | |||
| model_width, | |||
| kernel_size=3, | |||
| stride=stride), | |||
| output_node_id, | |||
| ) | |||
| temp_input_channel = model_width | |||
| if pooling_len == 0 or ( | |||
| (i + 1) % pooling_len == 0 and i != model_len - 1): | |||
| output_node_id = graph.add_layer( | |||
| self.pooling(), output_node_id) | |||
| output_node_id = graph.add_layer( | |||
| self.global_avg_pooling(), output_node_id) | |||
| output_node_id = graph.add_layer( | |||
| self.dropout(Constant.CONV_DROPOUT_RATE), output_node_id | |||
| ) | |||
| output_node_id = graph.add_layer( | |||
| StubDense(graph.node_list[output_node_id].shape[0], model_width), | |||
| output_node_id, | |||
| ) | |||
| output_node_id = graph.add_layer(StubReLU(), output_node_id) | |||
| graph.add_layer( | |||
| StubDense( | |||
| model_width, | |||
| self.n_output_node), | |||
| output_node_id) | |||
| return graph | |||
| class ResNetGenerator(NetworkGenerator): | |||
| def __init__(self, n_output_node, input_shape): | |||
| super(ResNetGenerator, self).__init__(n_output_node, input_shape) | |||
| # self.layers = [2, 2, 2, 2] | |||
| self.in_planes = 64 | |||
| self.block_expansion = 1 | |||
| self.n_dim = len(self.input_shape) - 1 | |||
| if len(self.input_shape) > 4: | |||
| raise ValueError('The input dimension is too high.') | |||
| elif len(self.input_shape) < 2: | |||
| raise ValueError('The input dimension is too low.') | |||
| self.conv = get_conv_class(self.n_dim) | |||
| self.dropout = get_dropout_class(self.n_dim) | |||
| self.global_avg_pooling = get_global_avg_pooling_class(self.n_dim) | |||
| self.adaptive_avg_pooling = get_global_avg_pooling_class(self.n_dim) | |||
| self.batch_norm = get_batch_norm_class(self.n_dim) | |||
| def generate(self, model_len=None, model_width=None): | |||
| if model_width is None: | |||
| model_width = Constant.MODEL_WIDTH | |||
| graph = Graph(self.input_shape, False) | |||
| temp_input_channel = self.input_shape[-1] | |||
| output_node_id = 0 | |||
| # output_node_id = graph.add_layer(StubReLU(), output_node_id) | |||
| output_node_id = graph.add_layer(self.conv(temp_input_channel, model_width, kernel_size=3), output_node_id) | |||
| output_node_id = graph.add_layer(self.batch_norm(model_width), output_node_id) | |||
| # output_node_id = graph.add_layer(self.pooling(kernel_size=3, stride=2, padding=1), output_node_id) | |||
| output_node_id = self._make_layer(graph, model_width, 2, output_node_id, 1) | |||
| model_width *= 2 | |||
| output_node_id = self._make_layer(graph, model_width, 2, output_node_id, 2) | |||
| model_width *= 2 | |||
| output_node_id = self._make_layer(graph, model_width, 2, output_node_id, 2) | |||
| model_width *= 2 | |||
| output_node_id = self._make_layer(graph, model_width, 2, output_node_id, 2) | |||
| output_node_id = graph.add_layer(self.global_avg_pooling(), output_node_id) | |||
| graph.add_layer(StubDense(model_width * self.block_expansion, self.n_output_node), output_node_id) | |||
| return graph | |||
| def _make_layer(self, graph, planes, blocks, node_id, stride): | |||
| strides = [stride] + [1] * (blocks - 1) | |||
| out = node_id | |||
| for current_stride in strides: | |||
| out = self._make_block(graph, self.in_planes, planes, out, current_stride) | |||
| self.in_planes = planes * self.block_expansion | |||
| return out | |||
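| # _make_block (below) builds a pre-activation residual block: the main branch applies | |||
| # BN -> ReLU -> conv3x3 (stride s) -> BN -> ReLU -> conv3x3; the shortcut branch starts after the | |||
| # first BN -> ReLU, applies another ReLU and a 1x1 conv with the same stride, and the two | |||
| # branches are merged by StubAdd. | |||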
| def _make_block(self, graph, in_planes, planes, node_id, stride=1): | |||
| out = graph.add_layer(self.batch_norm(in_planes), node_id) | |||
| out = graph.add_layer(StubReLU(), out) | |||
| residual_node_id = out | |||
| out = graph.add_layer(self.conv(in_planes, planes, kernel_size=3, stride=stride), out) | |||
| out = graph.add_layer(self.batch_norm(planes), out) | |||
| out = graph.add_layer(StubReLU(), out) | |||
| out = graph.add_layer(self.conv(planes, planes, kernel_size=3), out) | |||
| residual_node_id = graph.add_layer(StubReLU(), residual_node_id) | |||
| residual_node_id = graph.add_layer(self.conv(in_planes, | |||
| planes * self.block_expansion, | |||
| kernel_size=1, | |||
| stride=stride), residual_node_id) | |||
| out = graph.add_layer(StubAdd(), (out, residual_node_id)) | |||
| return out | |||
| class MlpGenerator(NetworkGenerator): | |||
| """A class to generate Multi-Layer Perceptron. | |||
| """ | |||
| def __init__(self, n_output_node, input_shape): | |||
| """Initialize the instance. | |||
| Args: | |||
| n_output_node: An integer. Number of output nodes in the network. | |||
| input_shape: A tuple. Input shape of the network. If the input is 1D, make sure the shape is a | |||
| one-element tuple such as ``(100,)``. | |||
| """ | |||
| super(MlpGenerator, self).__init__(n_output_node, input_shape) | |||
| if len(self.input_shape) > 1: | |||
| raise ValueError("The input dimension is too high.") | |||
| def generate(self, model_len=None, model_width=None): | |||
| """Generates a Multi-Layer Perceptron. | |||
| Args: | |||
| model_len: An integer. Number of hidden layers. | |||
| model_width: An integer or a list of integers of length `model_len`. If it is a list, it represents the | |||
| number of nodes in each hidden layer. If it is an integer, all hidden layers have nodes equal to this | |||
| value. | |||
| Returns: | |||
| An instance of the class Graph. Represents the neural architecture graph of the generated model. | |||
| """ | |||
| if model_len is None: | |||
| model_len = Constant.MODEL_LEN | |||
| if model_width is None: | |||
| model_width = Constant.MODEL_WIDTH | |||
| if isinstance(model_width, list) and len(model_width) != model_len: | |||
| raise ValueError( | |||
| "The length of 'model_width' does not match 'model_len'") | |||
| elif isinstance(model_width, int): | |||
| model_width = [model_width] * model_len | |||
| graph = Graph(self.input_shape, False) | |||
| output_node_id = 0 | |||
| n_nodes_prev_layer = self.input_shape[0] | |||
| for width in model_width: | |||
| output_node_id = graph.add_layer( | |||
| StubDense(n_nodes_prev_layer, width), output_node_id | |||
| ) | |||
| output_node_id = graph.add_layer( | |||
| StubDropout1d(Constant.MLP_DROPOUT_RATE), output_node_id | |||
| ) | |||
| output_node_id = graph.add_layer(StubReLU(), output_node_id) | |||
| n_nodes_prev_layer = width | |||
| graph.add_layer( | |||
| StubDense( | |||
| n_nodes_prev_layer, | |||
| self.n_output_node), | |||
| output_node_id) | |||
| return graph | |||
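| # A minimal usage sketch (illustrative only, not part of the original module): the generators need | |||
| # just the number of output nodes and the input shape, e.g. for 10-class 32x32 RGB images: | |||
| # | |||
| #     cnn_graph = CnnGenerator(n_output_node=10, input_shape=(32, 32, 3)).generate(model_len=3, model_width=64) | |||
| #     mlp_graph = MlpGenerator(n_output_node=10, input_shape=(784,)).generate(model_len=3, model_width=[128, 64, 32]) | |||
| # | |||
| # Both calls return a Graph instance representing the generated architecture. | |||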
| @@ -0,0 +1,64 @@ | |||
| import numpy as np | |||
| import torch | |||
| import torchvision.transforms as transforms | |||
| from utils import Constant | |||
| class Cutout: | |||
| """Randomly mask out one or more patches from an image. | |||
| Args: | |||
| n_holes (int): Number of patches to cut out of each image. | |||
| length (int): The length (in pixels) of each square patch. | |||
| """ | |||
| def __init__(self, n_holes, length): | |||
| self.n_holes = n_holes | |||
| self.length = length | |||
| def __call__(self, img): | |||
| """ | |||
| Args: | |||
| img (Tensor): Tensor image of size (C, H, W). | |||
| Returns: | |||
| Tensor: Image with n_holes of dimension length x length cut out of it. | |||
| """ | |||
| h, w = img.size(1), img.size(2) | |||
| mask = np.ones((h, w), np.float32) | |||
| for _ in range(self.n_holes): | |||
| y = np.random.randint(h) | |||
| x = np.random.randint(w) | |||
| y1 = np.clip(y - self.length // 2, 0, h) | |||
| y2 = np.clip(y + self.length // 2, 0, h) | |||
| x1 = np.clip(x - self.length // 2, 0, w) | |||
| x2 = np.clip(x + self.length // 2, 0, w) | |||
| mask[y1:y2, x1:x2] = 0.0 | |||
| mask = torch.from_numpy(mask) | |||
| mask = mask.expand_as(img) | |||
| img *= mask | |||
| return img | |||
| def data_transforms_cifar10(): | |||
| """ data_transforms for cifar10 dataset | |||
| """ | |||
| cifar_mean = [0.49139968, 0.48215827, 0.44653124] | |||
| cifar_std = [0.24703233, 0.24348505, 0.26158768] | |||
| train_transform = transforms.Compose( | |||
| [ | |||
| transforms.RandomCrop(32, padding=4), | |||
| transforms.RandomHorizontalFlip(), | |||
| transforms.ToTensor(), | |||
| transforms.Normalize(cifar_mean, cifar_std), | |||
| Cutout(n_holes=Constant.CUTOUT_HOLES, | |||
| length=int(32 * Constant.CUTOUT_RATIO)) | |||
| ] | |||
| ) | |||
| valid_transform = transforms.Compose( | |||
| [transforms.ToTensor(), transforms.Normalize(cifar_mean, cifar_std)] | |||
| ) | |||
| return train_transform, valid_transform | |||
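| # A minimal usage sketch (illustrative only; torchvision.datasets is assumed to be available, as in | |||
| # the trainer that consumes these transforms): | |||
| # | |||
| #     train_transform, valid_transform = data_transforms_cifar10() | |||
| #     trainset = torchvision.datasets.CIFAR10(root="./data", train=True, download=True, transform=train_transform) | |||
| # | |||
| # Note that Cutout runs after ToTensor/Normalize, so the square patch is zeroed directly on the normalized tensor. | |||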
| @@ -0,0 +1,57 @@ | |||
| import sys | |||
| sys.path.append('../..') | |||
| import os | |||
| import logging | |||
| from pytorch.network_morphism.network_morphism_trainer import NetworkMorphismTrainer | |||
| import argparse | |||
| from pytorch.utils import init_logger, mkdirs | |||
| import json | |||
| logger = logging.getLogger(__name__) | |||
| class Retrain: | |||
| def __init__(self, args): | |||
| self.args = args | |||
| def run(self): | |||
| logger.info("Retraining the best model.") | |||
| with open(self.args.best_selected_space_path, 'r') as f: | |||
| json_out = json.load(f) | |||
| json_out = json.dumps(json_out) | |||
| trainer = NetworkMorphismTrainer(json_out, self.args) | |||
| trainer.retrain() | |||
| if __name__ == "__main__": | |||
| parser = argparse.ArgumentParser("network_morphism_retrain") | |||
| parser.add_argument("--trial_id", type=int, default=0, help="Trial id") | |||
| parser.add_argument("--log_path", type=str, | |||
| default='./log', help="log for info") | |||
| parser.add_argument( | |||
| "--experiment_dir", type=str, default='./TADL', help="experiment level path" | |||
| ) | |||
| parser.add_argument( | |||
| "--best_selected_space_path", type=str, default='./best_selected_space.json', help="Path to best selected space" | |||
| ) | |||
| parser.add_argument( | |||
| "--result_path", type=str, default='./result.json', help="Path to result" | |||
| ) | |||
| parser.add_argument( | |||
| "--best_checkpoint_dir", type=str, default='./', help="Path to checkpoint saved" | |||
| ) | |||
| parser.add_argument( | |||
| "--data_dir", type=str, default='../data/', help="Path to dataset" | |||
| ) | |||
| parser.add_argument("--batch_size", type=int, | |||
| default=128, help="batch size") | |||
| parser.add_argument("--opt", type=str, default="SGD", help="optimizer") | |||
| parser.add_argument("--epochs", type=int, default=200, help="epoch limit") | |||
| parser.add_argument( | |||
| "--lr", type=float, default=0.001, help="learning rate" | |||
| ) | |||
| args = parser.parse_args() | |||
| mkdirs(args.result_path, args.log_path, args.best_checkpoint_dir) | |||
| init_logger(args.log_path) | |||
| retrain = Retrain(args) | |||
| retrain.run() | |||
| @@ -0,0 +1,22 @@ | |||
| import sys | |||
| sys.path.append('../..') | |||
| from argparse import ArgumentParser | |||
| from pytorch.selector import Selector | |||
| class NetworkMorphismSelector(Selector): | |||
| def __init__(self, single_candidate=True): | |||
| super().__init__(single_candidate) | |||
| def fit(self): | |||
| pass | |||
| if __name__ == "__main__": | |||
| parser = ArgumentParser("NetworkMorphism select") | |||
| parser.add_argument("--best_selected_space_path", type=str, | |||
| default='./best_selected_space.json', help="final best selected space") | |||
| args = parser.parse_args() | |||
| select = NetworkMorphismSelector(True) | |||
| select.fit() | |||
| @@ -0,0 +1,87 @@ | |||
| import sys | |||
| sys.path.append('../..') | |||
| import os | |||
| import logging | |||
| from pytorch.network_morphism.network_morphism_trainer import NetworkMorphismTrainer | |||
| from pytorch.network_morphism.algorithm.networkmorphism_searcher import NetworkMorphismSearcher | |||
| import argparse | |||
| import pickle | |||
| from pytorch.utils import init_logger, mkdirs | |||
| logger = logging.getLogger(__name__) | |||
| def create_dir(path): | |||
| if os.path.exists(path): | |||
| # shutil.rmtree(path) | |||
| return path | |||
| os.makedirs(path) | |||
| return path | |||
| class Train: | |||
| def __init__(self, args): | |||
| self.id = args.trial_id | |||
| self.trial_dir = os.path.join( | |||
| args.experiment_dir, 'train', str(args.trial_id)) | |||
| self.searcher_dir = os.path.join( | |||
| args.experiment_dir, '{}.pkl'.format(NetworkMorphismSearcher.__name__)) | |||
| self.args = args | |||
| self.searcher = None | |||
| # first trial | |||
| if not os.path.exists(self.searcher_dir): | |||
| self.searcher = NetworkMorphismSearcher(os.path.join( | |||
| args.experiment_dir, 'train'), args.best_selected_space_path) | |||
| else: | |||
| # load from previous round | |||
| with open(self.searcher_dir, 'rb') as f: | |||
| self.searcher = pickle.load(f) | |||
| def run_trial_job(self): | |||
| logger.info('trial {}: searching for the next model'.format(self.id)) | |||
| model = self.searcher.search(self.id, self.args) | |||
| trainer = NetworkMorphismTrainer(model, self.args) | |||
| logger.info('trial {}: running the training script'.format(self.id)) | |||
| metric = trainer.train() | |||
| if metric is not None: | |||
| logger.info('trial {}: received the trial result'.format(self.id)) | |||
| self.searcher.update_searcher(self.id, metric) | |||
| with open(self.searcher_dir, 'wb') as f: | |||
| pickle.dump(self.searcher, f) | |||
| if __name__ == "__main__": | |||
| parser = argparse.ArgumentParser("network_morphism") | |||
| parser.add_argument("--trial_id", type=int, default=0, help="Trial id") | |||
| parser.add_argument( | |||
| "--data_dir", type=str, default='../data/', help="Path to dataset" | |||
| ) | |||
| parser.add_argument( | |||
| "--log_path", type=str, default='./log', help="Path to log file" | |||
| ) | |||
| parser.add_argument( | |||
| "--experiment_dir", type=str, default='./TADL', help="experiment level path" | |||
| ) | |||
| parser.add_argument( | |||
| "--result_path", type=str, default='./result.json', help="trial level path to result" | |||
| ) | |||
| parser.add_argument( | |||
| "--search_space_path", type=str, default='./search_space.json', help="experiment level path to search space" | |||
| ) | |||
| parser.add_argument( | |||
| "--best_selected_space_path", type=str, default='./best_selected_space.json', help="experiment level path to best selected space" | |||
| ) | |||
| parser.add_argument("--batch_size", type=int, | |||
| default=128, help="batch size") | |||
| parser.add_argument("--opt", type=str, default="SGD", help="optimizer") | |||
| parser.add_argument("--epochs", type=int, default=2, help="epoch limit") | |||
| parser.add_argument( | |||
| "--lr", type=float, default=0.001, help="learning rate" | |||
| ) | |||
| args = parser.parse_args() | |||
| mkdirs(args.experiment_dir, args.result_path, args.log_path, args.search_space_path, args.best_selected_space_path) | |||
| create_dir(os.path.join(args.experiment_dir,'train',str(args.trial_id))) | |||
| init_logger(args.log_path) | |||
| train = Train(args) | |||
| train.run_trial_job() | |||
| @@ -0,0 +1,175 @@ | |||
| import logging | |||
| from algorithm.graph import json_to_graph | |||
| import torch | |||
| import torch.nn as nn | |||
| import torch.optim as optim | |||
| import torch.utils.data as data | |||
| import torchvision | |||
| import re | |||
| import datasets | |||
| from utils import Constant, EarlyStop, save_json_result | |||
| from pytorch.utils import save_best_checkpoint | |||
| # pylint: disable=W0603 | |||
| # set the logger format | |||
| logger = logging.getLogger(__name__) | |||
| class NetworkMorphismTrainer: | |||
| def __init__(self, model_json, args): | |||
| self.device = "cuda" if torch.cuda.is_available() else "cpu" | |||
| self.batch_size = args.batch_size | |||
| self.epochs = args.epochs | |||
| self.lr = args.lr | |||
| self.optimizer_name = args.opt | |||
| self.data_dir = args.data_dir | |||
| self.trial_id = args.trial_id | |||
| self.args = args | |||
| # Loading Data | |||
| logger.info("Preparing data..") | |||
| transform_train, transform_test = datasets.data_transforms_cifar10() | |||
| trainset = torchvision.datasets.CIFAR10( | |||
| root=self.data_dir, train=True, download=True, transform=transform_train | |||
| ) | |||
| self.trainloader = data.DataLoader( | |||
| trainset, batch_size=self.batch_size, shuffle=True, num_workers=1 | |||
| ) | |||
| testset = torchvision.datasets.CIFAR10( | |||
| root=self.data_dir, train=False, download=True, transform=transform_test | |||
| ) | |||
| self.testloader = data.DataLoader( | |||
| testset, batch_size=self.batch_size, shuffle=False, num_workers=1 | |||
| ) | |||
| # Model | |||
| logger.info("Building model..") | |||
| # build model from json representation | |||
| self.graph = json_to_graph(model_json) | |||
| self.net = self.graph.produce_torch_model() | |||
| if self.device == "cuda" and torch.cuda.device_count() > 1: | |||
| self.net = nn.DataParallel(self.net) | |||
| self.net.to(self.device) | |||
| self.criterion = nn.CrossEntropyLoss() | |||
| if self.optimizer_name == "SGD": | |||
| self.optimizer = optim.SGD( | |||
| self.net.parameters(), lr=self.lr, momentum=0.9, weight_decay=3e-4 | |||
| ) | |||
| if self.optimizer_name == "Adadelta": | |||
| self.optimizer = optim.Adadelta(self.net.parameters(), lr=self.lr) | |||
| if self.optimizer_name == "Adagrad": | |||
| self.optimizer = optim.Adagrad(self.net.parameters(), lr=self.lr) | |||
| if self.optimizer_name == "Adam": | |||
| self.optimizer = optim.Adam(self.net.parameters(), lr=self.lr) | |||
| if self.optimizer_name == "Adamax": | |||
| self.optimizer = optim.Adamax(self.net.parameters(), lr=self.lr) | |||
| if self.optimizer_name == "RMSprop": | |||
| self.optimizer = optim.RMSprop(self.net.parameters(), lr=self.lr) | |||
| self.scheduler = optim.lr_scheduler.CosineAnnealingLR( | |||
| self.optimizer, self.epochs) | |||
| def train_one_epoch(self): | |||
| """ | |||
| Train the model for one epoch on the training set. | |||
| """ | |||
| self.net.train() | |||
| for batch_idx, (inputs, targets) in enumerate(self.trainloader): | |||
| inputs, targets = inputs.to(self.device), targets.to(self.device) | |||
| self.optimizer.zero_grad() | |||
| outputs = self.net(inputs) | |||
| loss = self.criterion(outputs, targets) | |||
| loss.backward() | |||
| self.optimizer.step() | |||
| def validate_one_epoch(self, epoch): | |||
| """ eval model on each epoch in testset | |||
| """ | |||
| self.net.eval() | |||
| test_loss = 0 | |||
| correct = 0 | |||
| total = 0 | |||
| with torch.no_grad(): | |||
| for batch_idx, (inputs, targets) in enumerate(self.testloader): | |||
| inputs, targets = inputs.to( | |||
| self.device), targets.to(self.device) | |||
| outputs = self.net(inputs) | |||
| loss = self.criterion(outputs, targets) | |||
| test_loss += loss.item() | |||
| _, predicted = outputs.max(1) | |||
| total += targets.size(0) | |||
| correct += predicted.eq(targets).sum().item() | |||
| acc = correct / total | |||
| logger.info("Epoch: %d, accuracy: %.3f", epoch, acc) | |||
| result = {"type": "Accuracy", "result": { | |||
| "sequence": epoch, "category": "epoch", "value": acc}} | |||
| save_json_result(self.args.result_path, result) | |||
| return test_loss, acc | |||
| def train(self): | |||
| try: | |||
| max_no_improvement_num = Constant.MAX_NO_IMPROVEMENT_NUM | |||
| early_stop = EarlyStop(max_no_improvement_num) | |||
| early_stop.on_train_begin() | |||
| test_metric_value_list = [] | |||
| for ep in range(self.epochs): | |||
| self.train_one_epoch() | |||
| test_loss, test_acc = self.validate_one_epoch(ep) | |||
| self.scheduler.step() | |||
| test_metric_value_list.append(test_acc) | |||
| decreasing = early_stop.on_epoch_end(test_loss) | |||
| if not decreasing: | |||
| break | |||
| last_num = min(max_no_improvement_num, self.epochs) | |||
| estimated_performance = sum( | |||
| test_metric_value_list[-last_num:]) / last_num | |||
| logger.info("final accuracy: %.3f", estimated_performance) | |||
| except RuntimeError as e: | |||
| if not re.search('out of memory', str(e)): | |||
| raise e | |||
| print( | |||
| '\nCurrent model size is too big. Discontinuing training this model to search for other models.') | |||
| Constant.MAX_MODEL_SIZE = self.graph.size()-1 | |||
| return None | |||
| except Exception as e: | |||
| logger.exception(e) | |||
| raise | |||
| return estimated_performance | |||
| def retrain(self): | |||
| logger.info("here") | |||
| try: | |||
| best_acc = 0.0 | |||
| for ep in range(self.epochs): | |||
| logger.info("Retrain epoch %d", ep) | |||
| self.train_one_epoch() | |||
| _, test_acc = self.validate_one_epoch(ep) | |||
| self.scheduler.step() | |||
| if test_acc > best_acc: | |||
| best_acc = test_acc | |||
| save_best_checkpoint(self.args.best_checkpoint_dir, | |||
| self.net, self.optimizer, self.epochs) | |||
| logger.info("final accuracy: %.3f", best_acc) | |||
| except Exception as exception: | |||
| logger.exception(exception) | |||
| raise | |||
| @@ -0,0 +1,102 @@ | |||
| from enum import Enum | |||
| import json | |||
| class Constant: | |||
| # Data | |||
| CUTOUT_HOLES = 1 | |||
| CUTOUT_RATIO = 0.5 | |||
| # Searcher | |||
| MAX_MODEL_NUM = 1000 | |||
| MAX_LAYERS = 200 | |||
| N_NEIGHBOURS = 8 | |||
| MAX_MODEL_SIZE = (1 << 25) | |||
| MAX_LAYER_WIDTH = 4096 | |||
| KERNEL_LAMBDA = 1.0 | |||
| BETA = 2.576 | |||
| T_MIN = 0.0001 | |||
| MLP_MODEL_LEN = 3 | |||
| MLP_MODEL_WIDTH = 5 | |||
| MODEL_LEN = 3 | |||
| MODEL_WIDTH = 64 | |||
| POOLING_KERNEL_SIZE = 2 | |||
| DENSE_DROPOUT_RATE = 0.5 | |||
| CONV_DROPOUT_RATE = 0.25 | |||
| MLP_DROPOUT_RATE = 0.25 | |||
| CONV_BLOCK_DISTANCE = 2 | |||
| # trainer | |||
| MAX_NO_IMPROVEMENT_NUM = 5 | |||
| MIN_LOSS_DEC = 1e-4 | |||
| class OptimizeMode(Enum): | |||
| """Optimize Mode class | |||
| if OptimizeMode is 'minimize', it means the tuner need to minimize the reward | |||
| that received from Trial. | |||
| if OptimizeMode is 'maximize', it means the tuner need to maximize the reward | |||
| that received from Trial. | |||
| """ | |||
| Minimize = 'minimize' | |||
| Maximize = 'maximize' | |||
| class EarlyStop: | |||
| """A class check for early stop condition. | |||
| Attributes: | |||
| training_losses: Record all the training loss. | |||
| minimum_loss: The minimum loss we achieve so far. Used to compared to determine no improvement condition. | |||
| no_improvement_count: Current no improvement count. | |||
| _max_no_improvement_num: The maximum number specified. | |||
| _done: Whether condition met. | |||
| _min_loss_dec: A threshold for loss improvement. | |||
| """ | |||
| def __init__(self, max_no_improvement_num=None, min_loss_dec=None): | |||
| self.training_losses = [] | |||
| self.minimum_loss = None | |||
| self.no_improvement_count = 0 | |||
| self._max_no_improvement_num = max_no_improvement_num if max_no_improvement_num is not None \ | |||
| else Constant.MAX_NO_IMPROVEMENT_NUM | |||
| self._done = False | |||
| self._min_loss_dec = min_loss_dec if min_loss_dec is not None else Constant.MIN_LOSS_DEC | |||
| def on_train_begin(self): | |||
| """Initiate the early stop condition. | |||
| Call on every time the training iteration begins. | |||
| """ | |||
| self.training_losses = [] | |||
| self.no_improvement_count = 0 | |||
| self._done = False | |||
| self.minimum_loss = float('inf') | |||
| def on_epoch_end(self, loss): | |||
| """Check the early stop condition. | |||
| Call on every time the training iteration end. | |||
| Args: | |||
| loss: The loss function achieved by the epoch. | |||
| Returns: | |||
| True if condition met, otherwise False. | |||
| """ | |||
| self.training_losses.append(loss) | |||
| if self._done and loss > (self.minimum_loss - self._min_loss_dec): | |||
| return False | |||
| if loss > (self.minimum_loss - self._min_loss_dec): | |||
| self.no_improvement_count += 1 | |||
| else: | |||
| self.no_improvement_count = 0 | |||
| self.minimum_loss = loss | |||
| if self.no_improvement_count > self._max_no_improvement_num: | |||
| self._done = True | |||
| return True | |||
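| # A minimal usage sketch of the early-stop protocol (mirroring how the trainer in this project uses it; | |||
| # run_one_epoch is a hypothetical helper that returns the validation loss): | |||
| # | |||
| #     early_stop = EarlyStop(max_no_improvement_num=5) | |||
| #     early_stop.on_train_begin() | |||
| #     for epoch in range(epochs): | |||
| #         val_loss = run_one_epoch() | |||
| #         if not early_stop.on_epoch_end(val_loss): | |||
| #             break  # the loss has not improved for too many epochs | |||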
| def save_json_result(path, data): | |||
| with open(path,'a') as f: | |||
| json.dump(data,f) | |||
| f.write('\n') | |||
| @@ -0,0 +1 @@ | |||
| from pytorch.pcdarts.pcdartsmutator import PCdartsMutator | |||
| @@ -0,0 +1,227 @@ | |||
| # Copyright (c) Microsoft Corporation. | |||
| # Licensed under the MIT license. | |||
| from collections import OrderedDict | |||
| import torch | |||
| import torch.nn as nn | |||
| from pytorch import mutables | |||
| from pytorch.darts import ops | |||
| def random_channel_shuffle(x): | |||
| num_channels = x.data.size()[1] | |||
| indices = torch.randperm(num_channels) | |||
| x = x[:, indices] | |||
| return x | |||
| def channel_shuffle(x, groups): | |||
| batchsize, num_channels, height, width = x.data.size() | |||
| channels_per_group = num_channels // groups | |||
| # reshape | |||
| x = x.view(batchsize, groups, | |||
| channels_per_group, height, width) | |||
| x = torch.transpose(x, 1, 2).contiguous() | |||
| # flatten | |||
| x = x.view(batchsize, -1, height, width) | |||
| return x | |||
| class AuxiliaryHead(nn.Module): | |||
| """ Auxiliary head in 2/3 place of network to let the gradient flow well """ | |||
| def __init__(self, input_size, C, n_classes): | |||
| """ assuming input size 7x7 or 8x8 """ | |||
| assert input_size in [7, 8] | |||
| super().__init__() | |||
| self.net = nn.Sequential( | |||
| nn.ReLU(inplace=True), | |||
| nn.AvgPool2d(5, stride=input_size - 5, padding=0, count_include_pad=False), # 2x2 out | |||
| nn.Conv2d(C, 128, kernel_size=1, bias=False), | |||
| nn.BatchNorm2d(128), | |||
| nn.ReLU(inplace=True), | |||
| nn.Conv2d(128, 768, kernel_size=2, bias=False), # 1x1 out | |||
| nn.BatchNorm2d(768), | |||
| nn.ReLU(inplace=True) | |||
| ) | |||
| self.linear = nn.Linear(768, n_classes) | |||
| def forward(self, x): | |||
| out = self.net(x) | |||
| out = out.view(out.size(0), -1) # flatten | |||
| logits = self.linear(out) | |||
| return logits | |||
| class Node(nn.Module): | |||
| def __init__(self, node_id, num_prev_nodes, channels, k, num_downsample_connect, search): | |||
| super().__init__() | |||
| if search: | |||
| self.k = k | |||
| partial_channels = channels // k | |||
| else: | |||
| partial_channels = channels | |||
| self.search = search | |||
| self.ops = nn.ModuleList() | |||
| choice_keys = [] | |||
| for i in range(num_prev_nodes): | |||
| stride = 2 if i < num_downsample_connect else 1 | |||
| choice_keys.append("{}_p{}".format(node_id, i)) | |||
| self.ops.append( | |||
| mutables.LayerChoice(OrderedDict([ | |||
| ("maxpool", ops.PoolBN('max', partial_channels, 3, stride, 1, affine=False)), | |||
| ("avgpool", ops.PoolBN('avg', partial_channels, 3, stride, 1, affine=False)), | |||
| ("skipconnect", nn.Identity() if stride == 1 else ops.FactorizedReduce(partial_channels, partial_channels, affine=False)), | |||
| ("sepconv3x3", ops.SepConv(partial_channels, partial_channels, 3, stride, 1, affine=False)), | |||
| ("sepconv5x5", ops.SepConv(partial_channels, partial_channels, 5, stride, 2, affine=False)), | |||
| ("dilconv3x3", ops.DilConv(partial_channels, partial_channels, 3, stride, 2, 2, affine=False)), | |||
| ("dilconv5x5", ops.DilConv(partial_channels, partial_channels, 5, stride, 4, 2, affine=False)) | |||
| ]), key=choice_keys[-1])) | |||
| self.drop_path = ops.DropPath() | |||
| self.input_switch = mutables.InputChoice(choose_from=choice_keys, n_chosen=2, key="{}_switch".format(node_id)) | |||
| self.pool = nn.MaxPool2d(2,2) | |||
| def forward(self, prev_nodes): | |||
| assert len(self.ops) == len(prev_nodes), "len(self.ops) != len(prev_nodes) in Node" | |||
| # for each candidate predecessor of each intermediate node | |||
| if self.search: | |||
| # in search | |||
| results = [] | |||
| for op, x in zip(self.ops, prev_nodes): | |||
| # channel shuffle | |||
| channels = x.shape[1] | |||
| # channel proportion k=4 | |||
| temp0 = x[ : , : channels//self.k, : , :] | |||
| temp1 = x[ : ,channels//self.k : , : , :] | |||
| out = op(temp0) | |||
| # normal | |||
| if out.shape[2] == x.shape[2]: | |||
| result = torch.cat([out, temp1], dim=1) | |||
| # reduction | |||
| else: | |||
| result = torch.cat([out, self.pool(temp1)], dim=1) | |||
| results.append(channel_shuffle(result, self.k)) | |||
| # # channel random shuffle | |||
| # channels = random_channel_shuffle(x).shape[1] | |||
| # # channel proportion k=4 | |||
| # temp0 = x[ : , : channels//self.k, : , :] | |||
| # temp1 = x[ : ,channels//self.k : , : , :] | |||
| # out = op(temp0) | |||
| # # normal | |||
| # if out.shape[2] == x.shape[2]: | |||
| # result = torch.cat([out, temp1], dim=1) | |||
| # # reduction | |||
| # else: | |||
| # result = torch.cat([out, self.pool(temp1)], dim=1) | |||
| # results.append(result) | |||
| else: | |||
| # in retrain, no channel shuffle | |||
| results = [op(node) for op, node in zip(self.ops, prev_nodes)] | |||
| output = [self.drop_path(re) if re is not None else None for re in results] | |||
| return self.input_switch(output) | |||
| class Cell(nn.Module): | |||
| def __init__(self, n_nodes, channels_pp, channels_p, channels, reduction_p, reduction, k, search): | |||
| super().__init__() | |||
| self.reduction = reduction | |||
| self.n_nodes = n_nodes | |||
| # If previous cell is reduction cell, current input size does not match with | |||
| # output size of cell[k-2]. So the output[k-2] should be reduced by preprocessing. | |||
| if reduction_p: | |||
| self.preproc0 = ops.FactorizedReduce(channels_pp, channels, affine=False) | |||
| else: | |||
| self.preproc0 = ops.StdConv(channels_pp, channels, 1, 1, 0, affine=False) | |||
| self.preproc1 = ops.StdConv(channels_p, channels, 1, 1, 0, affine=False) | |||
| # generate dag | |||
| self.mutable_ops = nn.ModuleList() | |||
| for depth in range(2, self.n_nodes + 2): | |||
| self.mutable_ops.append(Node("{}_n{}".format("reduce" if reduction else "normal", depth), depth, channels, k, 2 if reduction else 0, search)) | |||
| def forward(self, s0, s1): | |||
| # s0, s1 are the outputs of previous previous cell and previous cell, respectively. | |||
| tensors = [self.preproc0(s0), self.preproc1(s1)] | |||
| for node in self.mutable_ops: | |||
| cur_tensor = node(tensors) | |||
| tensors.append(cur_tensor) | |||
| output = torch.cat(tensors[2:], dim=1) | |||
| return output | |||
| class CNN(nn.Module): | |||
| def __init__(self, input_size, in_channels, channels, n_classes, n_layers, k=4, n_nodes=4, stem_multiplier=3, auxiliary=False, search=True): | |||
| super().__init__() | |||
| self.in_channels = in_channels | |||
| self.channels = channels | |||
| self.n_classes = n_classes | |||
| self.n_layers = n_layers | |||
| self.n_nodes = n_nodes | |||
| self.aux_pos = 2 * n_layers // 3 if auxiliary else -1 | |||
| c_cur = stem_multiplier * self.channels | |||
| self.stem = nn.Sequential( | |||
| nn.Conv2d(in_channels, c_cur, 3, 1, 1, bias=False), | |||
| nn.BatchNorm2d(c_cur) | |||
| ) | |||
| # for the first cell, stem is used for both s0 and s1 | |||
| # [!] channels_pp and channels_p are output channel sizes, while c_cur is the input channel size. | |||
| channels_pp, channels_p, c_cur = c_cur, c_cur, channels | |||
| self.cells = nn.ModuleList() | |||
| reduction_p, reduction = False, False | |||
| for i in range(n_layers): | |||
| reduction_p, reduction = reduction, False | |||
| # Reduce featuremap size and double channels in 1/3 and 2/3 layer. | |||
| if i in [n_layers // 3, 2 * n_layers // 3]: | |||
| c_cur *= 2 | |||
| reduction = True | |||
| cell = Cell(n_nodes, channels_pp, channels_p, c_cur, reduction_p, reduction, k, search) | |||
| self.cells.append(cell) | |||
| c_cur_out = c_cur * n_nodes | |||
| channels_pp, channels_p = channels_p, c_cur_out | |||
| if i == self.aux_pos: | |||
| self.aux_head = AuxiliaryHead(input_size // 4, channels_p, n_classes) | |||
| self.gap = nn.AdaptiveAvgPool2d(1) | |||
| self.linear = nn.Linear(channels_p, n_classes) | |||
| def forward(self, x): | |||
| s0 = s1 = self.stem(x) | |||
| aux_logits = None | |||
| for i, cell in enumerate(self.cells): | |||
| s0, s1 = s1, cell(s0, s1) | |||
| if i == self.aux_pos and self.training: | |||
| aux_logits = self.aux_head(s1) | |||
| out = self.gap(s1) | |||
| out = out.view(out.size(0), -1) # flatten | |||
| logits = self.linear(out) | |||
| if aux_logits is not None: | |||
| return logits, aux_logits | |||
| return logits | |||
| def drop_path_prob(self, p): | |||
| for module in self.modules(): | |||
| if isinstance(module, ops.DropPath): | |||
| module.p = p | |||
| def _loss(self, input, target): | |||
| logits = self(input) | |||
| return self._criterion(logits, target) | |||
| @@ -0,0 +1,204 @@ | |||
| import sys | |||
| sys.path.append('../..') | |||
| import os | |||
| import logging | |||
| import time | |||
| from argparse import ArgumentParser | |||
| import torch | |||
| import torch.nn as nn | |||
| import numpy as np | |||
| # from torch.utils.tensorboard import SummaryWriter | |||
| import torch.backends.cudnn as cudnn | |||
| from model import CNN | |||
| from pytorch.fixed import apply_fixed_architecture | |||
| from pytorch.utils import set_seed, mkdirs, init_logger, save_best_checkpoint, AverageMeter | |||
| from pytorch.darts import utils | |||
| from pytorch.darts import datasets | |||
| from pytorch.retrainer import Retrainer | |||
| logger = logging.getLogger(__name__) | |||
| device = torch.device("cuda" if torch.cuda.is_available() else "cpu") | |||
| # writer = SummaryWriter() | |||
| class PCdartsRetrainer(Retrainer): | |||
| def __init__(self, aux_weight, grad_clip, epochs, log_frequency): | |||
| self.aux_weight = aux_weight | |||
| self.grad_clip = grad_clip | |||
| self.epochs = epochs | |||
| self.log_frequency = log_frequency | |||
| def train(self, train_loader, model, optimizer, criterion, epoch): | |||
| top1 = AverageMeter("top1") | |||
| top5 = AverageMeter("top5") | |||
| losses = AverageMeter("losses") | |||
| cur_step = epoch * len(train_loader) | |||
| cur_lr = optimizer.param_groups[0]["lr"] | |||
| logger.info("Epoch %d LR %.6f", epoch, cur_lr) | |||
| # writer.add_scalar("lr", cur_lr, global_step=cur_step) | |||
| model.train() | |||
| for step, (x, y) in enumerate(train_loader): | |||
| x, y = x.to(device, non_blocking=True), y.to(device, non_blocking=True) | |||
| bs = x.size(0) | |||
| optimizer.zero_grad() | |||
| logits, aux_logits = model(x) | |||
| loss = criterion(logits, y) | |||
| if self.aux_weight > 0.: | |||
| loss += self.aux_weight * criterion(aux_logits, y) | |||
| loss.backward() | |||
| # gradient clipping | |||
| nn.utils.clip_grad_norm_(model.parameters(), self.grad_clip) | |||
| optimizer.step() | |||
| accuracy = utils.accuracy(logits, y, topk=(1, 5)) | |||
| losses.update(loss.item(), bs) | |||
| top1.update(accuracy["acc1"], bs) | |||
| top5.update(accuracy["acc5"], bs) | |||
| # writer.add_scalar("loss/train", loss.item(), global_step=cur_step) | |||
| # writer.add_scalar("acc1/train", accuracy["acc1"], global_step=cur_step) | |||
| # writer.add_scalar("acc5/train", accuracy["acc5"], global_step=cur_step) | |||
| if step % self.log_frequency == 0 or step == len(train_loader) - 1: | |||
| logger.info( | |||
| "Train: [{:3d}/{}] Step {:03d}/{:03d} Loss {losses.avg:.3f} " | |||
| "Prec@(1,5) ({top1.avg:.1%}, {top5.avg:.1%})".format( | |||
| epoch + 1, self.epochs, step, len(train_loader) - 1, losses=losses, | |||
| top1=top1, top5=top5)) | |||
| cur_step += 1 | |||
| logger.info("Train: [{:3d}/{}] Final Prec@1 {:.4%}".format(epoch + 1, self.epochs, top1.avg)) | |||
| def validate(self, valid_loader, model, criterion, epoch, cur_step): | |||
| top1 = AverageMeter("top1") | |||
| top5 = AverageMeter("top5") | |||
| losses = AverageMeter("losses") | |||
| model.eval() | |||
| with torch.no_grad(): | |||
| for step, (X, y) in enumerate(valid_loader): | |||
| X, y = X.to(device, non_blocking=True), y.to(device, non_blocking=True) | |||
| bs = X.size(0) | |||
| logits = model(X) | |||
| loss = criterion(logits, y) | |||
| accuracy = utils.accuracy(logits, y, topk=(1, 5)) | |||
| losses.update(loss.item(), bs) | |||
| top1.update(accuracy["acc1"], bs) | |||
| top5.update(accuracy["acc5"], bs) | |||
| if step % self.log_frequency == 0 or step == len(valid_loader) - 1: | |||
| logger.info( | |||
| "Valid: [{:3d}/{}] Step {:03d}/{:03d} Loss {losses.avg:.3f} " | |||
| "Prec@(1,5) ({top1.avg:.1%}, {top5.avg:.1%})".format( | |||
| epoch + 1, self.epochs, step, len(valid_loader) - 1, losses=losses, | |||
| top1=top1, top5=top5)) | |||
| # writer.add_scalar("loss/test", losses.avg, global_step=cur_step) | |||
| # writer.add_scalar("acc1/test", top1.avg, global_step=cur_step) | |||
| # writer.add_scalar("acc5/test", top5.avg, global_step=cur_step) | |||
| logger.info("Valid: [{:3d}/{}] Final Prec@1 {:.4%}".format(epoch + 1, self.epochs, top1.avg)) | |||
| return top1.avg | |||
| if __name__ == "__main__": | |||
| parser = ArgumentParser("PCDARTS retrain") | |||
| parser.add_argument("--data_dir", type=str, | |||
| default='./', help="search_space json file") | |||
| parser.add_argument("--result_path", type=str, | |||
| default='./result.json', help="training result") | |||
| parser.add_argument("--log_path", type=str, | |||
| default='.0/log', help="log for info") | |||
| parser.add_argument("--best_selected_space_path", type=str, | |||
| default='./best_selected_space.json', help="final best selected space") | |||
| parser.add_argument("--best_checkpoint_dir", type=str, | |||
| default='', help="default name is best_checkpoint_epoch{}.pth") | |||
| parser.add_argument('--trial_id', type=int, default=0, metavar='N', | |||
| help='trial id, starting from 0') | |||
| parser.add_argument("--layers", default=20, type=int) | |||
| parser.add_argument("--lr", default=0.01, type=float) | |||
| parser.add_argument("--batch_size", default=96, type=int) | |||
| parser.add_argument("--log_frequency", default=10, type=int) | |||
| parser.add_argument("--epochs", default=600, type=int) | |||
| parser.add_argument("--aux_weight", default=0.4, type=float) | |||
| parser.add_argument("--drop_path_prob", default=0.2, type=float) | |||
| parser.add_argument("--workers", default=4, type=int) | |||
| parser.add_argument("--class_num", default=10, type=int, help="cifar10") | |||
| parser.add_argument("--channels", default=36, type=int) | |||
| parser.add_argument("--grad_clip", default=6., type=float) | |||
| args = parser.parse_args() | |||
| mkdirs(args.result_path, args.log_path, args.best_checkpoint_dir) | |||
| init_logger(args.log_path) | |||
| logger.info(args) | |||
| set_seed(args.trial_id) | |||
| logger.info("loading data") | |||
| dataset_train, dataset_valid = datasets.get_dataset("cifar10", cutout_length=16, root=args.data_dir) | |||
| model = CNN(32, 3, args.channels, args.class_num, args.layers, auxiliary=True, search=False) | |||
| apply_fixed_architecture(model, args.best_selected_space_path) | |||
| criterion = nn.CrossEntropyLoss() | |||
| model.to(device) | |||
| criterion.to(device) | |||
| optimizer = torch.optim.SGD(model.parameters(), args.lr, momentum=0.9, weight_decay=3.0E-4) | |||
| lr_scheduler = torch.optim.lr_scheduler.CosineAnnealingLR(optimizer, args.epochs, eta_min=1E-6) | |||
| train_loader = torch.utils.data.DataLoader(dataset_train, | |||
| batch_size=args.batch_size, | |||
| shuffle=True, | |||
| num_workers=args.workers, | |||
| pin_memory=True) | |||
| valid_loader = torch.utils.data.DataLoader(dataset_valid, | |||
| batch_size=args.batch_size, | |||
| shuffle=False, | |||
| num_workers=args.workers, | |||
| pin_memory=True) | |||
| retrainer = PCdartsRetrainer(aux_weight=args.aux_weight, | |||
| grad_clip=args.grad_clip, | |||
| epochs=args.epochs, | |||
| log_frequency = args.log_frequency) | |||
| # result = {"Accuracy": [], "Cost_time": ''} | |||
| best_top1 = 0. | |||
| start_time = time.time() | |||
| for epoch in range(args.epochs): | |||
| drop_prob = args.drop_path_prob * epoch / args.epochs | |||
| model.drop_path_prob(drop_prob) | |||
| # training | |||
| retrainer.train(train_loader, model, optimizer, criterion, epoch) | |||
| # validation | |||
| cur_step = (epoch + 1) * len(train_loader) | |||
| top1 = retrainer.validate(valid_loader, model, criterion, epoch, cur_step) | |||
| # Filtered by the backend from terminal output, e.g. {"type": "Accuracy", "result": {"sequence": 1, "category": "epoch", "value":96.7}} | |||
| logger.info({"type": "Accuracy", "result": {"sequence": epoch, "category": "epoch", "value": top1}}) | |||
| with open(args.result_path, "a") as file: | |||
| file.write(str({"type": "Accuracy", "result": {"sequence": epoch, "category": "epoch", "value": top1}}) + '\n') | |||
| # result["Accuracy"].append(top1) | |||
| best_top1 = max(best_top1, top1) | |||
| lr_scheduler.step() | |||
| logger.info("Final best Prec@1 = {:.4%}".format(best_top1)) | |||
| cost_time = time.time() - start_time | |||
| # Filtered by the backend from terminal output, e.g. {"type": "Cost_time", "result": {"value": "* s"}} | |||
| logger.info({"type": "Cost_time", "result": {"value": str(cost_time) + ' s'}}) | |||
| with open(args.result_path, "a") as file: | |||
| file.write(str({"type": "Cost_time", "result": {"value": str(cost_time) + ' s'}})) | |||
| # result["Cost_time"] = str(cost_time) + ' s' | |||
| # dump_global_result(args.result_path, result) | |||
| save_best_checkpoint(args.best_checkpoint_dir, model, optimizer, epoch) | |||
| logger.info("Save best checkpoint in {}".format(os.path.join(args.best_checkpoint_dir, "best_checkpoint_epoch{}.pth".format(epoch)))) | |||
| @@ -0,0 +1,21 @@ | |||
| import sys | |||
| sys.path.append('../..') | |||
| from pytorch.selector import Selector | |||
| from argparse import ArgumentParser | |||
| class PCdartsSelector(Selector): | |||
| def __init__(self, single_candidate=True): | |||
| super().__init__(single_candidate) | |||
| def fit(self): | |||
| pass | |||
| if __name__ == "__main__": | |||
| parser = ArgumentParser("DARTS select") | |||
| parser.add_argument("--best_selected_space_path", type=str, | |||
| default='./best_selected_space.json', help="final best selected space") | |||
| args = parser.parse_args() | |||
| darts_selector = PCdartsSelector(True) | |||
| darts_selector.fit() | |||
| @@ -0,0 +1,93 @@ | |||
| # Copyright (c) Microsoft Corporation. | |||
| # Licensed under the MIT license. | |||
| import sys | |||
| sys.path.append('../..') | |||
| import time | |||
| from argparse import ArgumentParser | |||
| from model import CNN | |||
| import torch | |||
| import torch.nn as nn | |||
| from pytorch.callbacks import BestArchitectureCheckpoint, LRSchedulerCallback | |||
| from pytorch.pcdarts import PCdartsMutator | |||
| from pytorch.darts import DartsTrainer | |||
| from pytorch.darts.utils import accuracy | |||
| from pytorch.darts import datasets | |||
| from pytorch.utils import * | |||
| logger = logging.getLogger(__name__) | |||
| if __name__ == "__main__": | |||
| parser = ArgumentParser("PCDARTS train") | |||
| parser.add_argument("--data_dir", type=str, | |||
| default='../data/', help="search_space json file") | |||
| parser.add_argument("--result_path", type=str, | |||
| default='.0/result.json', help="training result") | |||
| parser.add_argument("--log_path", type=str, | |||
| default='.0/log', help="log for info") | |||
| parser.add_argument("--search_space_path", type=str, | |||
| default='./search_space.json', help="search space of PDARTS") | |||
| parser.add_argument("--best_selected_space_path", type=str, | |||
| default='./best_selected_space.json', help="final best selected space") | |||
| parser.add_argument('--trial_id', type=int, default=0, metavar='N', | |||
| help='trial id, starting from 0') | |||
| parser.add_argument('--model_lr', type=float, default=0.1, help='learning rate for training model weights') | |||
| parser.add_argument('--arch_lr', type=float, default=6e-4, help='learning rate for training architecture') | |||
| parser.add_argument("--nodes", default=4, type=int) | |||
| parser.add_argument("--layers", default=8, type=int) | |||
| parser.add_argument("--channels", default=16, type=int) | |||
| parser.add_argument("--batch_size", default=96, type=int) | |||
| parser.add_argument("--log_frequency", default=50, type=int) | |||
| parser.add_argument("--class_num", default=10, type=int, help="cifar10") | |||
| parser.add_argument("--epochs", default=5, type=int) | |||
| parser.add_argument("--pre_epochs", default=15, type=int, help='pre epochs to train weight only') | |||
| parser.add_argument("--k", default=4, type=int, help="channel portion of channel shuffle") | |||
| parser.add_argument("--unrolled", default=False, action="store_true") | |||
| args = parser.parse_args() | |||
| mkdirs(args.result_path, args.log_path, args.search_space_path, args.best_selected_space_path) | |||
| init_logger(args.log_path, "info") | |||
| logger.info(args) | |||
| set_seed(args.trial_id) | |||
| logger.info("loading data") | |||
| dataset_train, dataset_valid = datasets.get_dataset("cifar10", root=args.data_dir) | |||
| model = CNN(32, 3, args.channels, args.class_num, args.layers, n_nodes=args.nodes, k=args.k) | |||
| criterion = nn.CrossEntropyLoss() | |||
| optim = torch.optim.SGD(model.parameters(), args.model_lr, momentum=0.9, weight_decay=3.0E-4) | |||
| lr_scheduler = torch.optim.lr_scheduler.CosineAnnealingLR(optim, args.epochs, eta_min=0.001) | |||
| logger.info("initializing trainer") | |||
| trainer = DartsTrainer(model, | |||
| loss=criterion, | |||
| metrics=lambda output, target: accuracy(output, target, topk=(1,)), | |||
| optimizer=optim, | |||
| num_epochs=args.epochs, | |||
| dataset_train=dataset_train, | |||
| dataset_valid=dataset_valid, | |||
| mutator=PCdartsMutator(model), | |||
| batch_size=args.batch_size, | |||
| log_frequency=args.log_frequency, | |||
| arch_lr=args.arch_lr, | |||
| unrolled=args.unrolled, | |||
| result_path=args.result_path, | |||
| num_pre_epochs=args.pre_epochs, | |||
| search_space_path=args.search_space_path, | |||
| callbacks=[LRSchedulerCallback(lr_scheduler), BestArchitectureCheckpoint(args.best_selected_space_path, args.epochs)]) | |||
| logger.info("training") | |||
| t1 = time.time() | |||
| trainer.train() | |||
| # res_json = trainer.result | |||
| cost_time = time.time() - t1 | |||
| # Filtered by the backend from terminal output, e.g. {"type": "Cost_time", "result": {"value": "* s"}} | |||
| logger.info({"type": "Cost_time", "result": {"value": str(cost_time) + ' s'}}) | |||
| with open(args.result_path, "a") as file: | |||
| file.write(str({"type": "Cost_time", "result": {"value": str(cost_time) + ' s'}})) | |||
| # res_json["Cost_time"] = str(cost_time) + ' s' | |||
| # dump_global_result(args.result_path, res_json) | |||
| @@ -0,0 +1,146 @@ | |||
| # Copyright (c) Microsoft Corporation. | |||
| # Licensed under the MIT license. | |||
| import logging | |||
| import torch | |||
| import torch.nn as nn | |||
| import torch.nn.functional as F | |||
| from collections import OrderedDict | |||
| from pytorch.mutator import Mutator | |||
| from pytorch.mutables import LayerChoice, InputChoice | |||
| _logger = logging.getLogger(__name__) | |||
| class PCdartsMutator(Mutator): | |||
| """ | |||
| Connects the model in a PC-DARTS (differentiable) way. | |||
| An extra connection is automatically inserted for each LayerChoice; connection strengths are produced by a softmax over | |||
| the architecture weights. On export, the op on each LayerChoice is selected by the highest probability, while during | |||
| search the channels from all candidate predecessors on an InputChoice are combined as a weighted sum. When the extra | |||
| connection is selected, there is no op on that LayerChoice (namely a ``ZeroOp``), in which case every element in the | |||
| exported choice list is ``false`` (not chosen). | |||
| All input choices are fully connected in the search phase. On export, an input choice selects inputs based on the keys | |||
| in ``choose_from``. If a key is the key of a LayerChoice, the top logit of that LayerChoice joins the competition of the | |||
| input choice against the other logits; otherwise, the logit is assumed to be 0. | |||
| It is possible to cut branches by setting the corresponding position of ``choices`` to ``-inf``; after the softmax the | |||
| value becomes 0, and the framework ignores 0 values and does not connect. Note that the gradient at the ``-inf`` location | |||
| will be 0, and since arithmetic with ``-inf`` can produce ``nan``, the gradient update phase must be handled carefully. | |||
| Attributes | |||
| ---------- | |||
| choices: ParameterDict | |||
| dict that maps keys of LayerChoices and InputChoices to their architecture-weight tensors. | |||
| """ | |||
| def __init__(self, model): | |||
| super().__init__(model) | |||
| self.choices = nn.ParameterDict() | |||
| for mutable in self.mutables: | |||
| if isinstance(mutable, LayerChoice): | |||
| self.choices[mutable.key] = nn.Parameter(1.0E-3 * torch.randn(mutable.length + 1)) | |||
| if isinstance(mutable, InputChoice): | |||
| self.choices[mutable.key] = nn.Parameter(1.0E-3 * torch.randn(mutable.n_candidates)) | |||
| def device(self): | |||
| for v in self.choices.values(): | |||
| return v.device | |||
| def sample_search(self): | |||
| result = dict() | |||
| for mutable in self.mutables: | |||
| if isinstance(mutable, LayerChoice): | |||
| result[mutable.key] = F.softmax(self.choices[mutable.key], dim=-1)[:-1] | |||
| elif isinstance(mutable, InputChoice): | |||
| result[mutable.key] = F.softmax(self.choices[mutable.key], dim=-1) | |||
| return result | |||
| def sample_final(self): | |||
| result = dict() | |||
| edges_max = dict() | |||
| choices = dict() | |||
| for mutable in self.mutables: | |||
| if isinstance(mutable, LayerChoice): | |||
| # multiply the normalized coefficients together to select top-1 op in each LayerChoice | |||
| predecessor_idx = int(mutable.key[-1]) | |||
| inputchoice_key = mutable.key[:-2] + "switch" | |||
| choices[mutable.key] = self.choices[mutable.key] * self.choices[inputchoice_key][predecessor_idx] | |||
| for mutable in self.mutables: | |||
| if isinstance(mutable, LayerChoice): | |||
| # select non-none top-1 op | |||
| max_val, index = torch.max(F.softmax(choices[mutable.key], dim=-1)[:-1], 0) | |||
| edges_max[mutable.key] = max_val | |||
| result[mutable.key] = F.one_hot(index, num_classes=len(mutable)).view(-1).bool() | |||
| for mutable in self.mutables: | |||
| if isinstance(mutable, InputChoice): | |||
| if mutable.n_chosen is not None: | |||
| weights = [] | |||
| for src_key in mutable.choose_from: | |||
| if src_key not in edges_max: | |||
| _logger.warning("InputChoice.NO_KEY in '%s' is weighted 0 when selecting inputs.", mutable.key) | |||
| weights.append(edges_max.get(src_key, 0.)) | |||
| weights = torch.tensor(weights) # pylint: disable=not-callable | |||
| # select top-2 strongest predecessor | |||
| _, topk_edge_indices = torch.topk(weights, mutable.n_chosen) | |||
| selected_multihot = [] | |||
| for i, src_key in enumerate(mutable.choose_from): | |||
| if i not in topk_edge_indices and src_key in result: | |||
| # If an edge is never selected, there is no need to calculate any op on this edge. | |||
| # This is to eliminate redundant calculation. | |||
| result[src_key] = torch.zeros_like(result[src_key]) | |||
| selected_multihot.append(i in topk_edge_indices) | |||
| result[mutable.key] = torch.tensor(selected_multihot, dtype=torch.bool, device=self.device()) # pylint: disable=not-callable | |||
| else: | |||
| result[mutable.key] = torch.ones(mutable.n_candidates, dtype=torch.bool, device=self.device()) # pylint: disable=not-callable | |||
| return result | |||
| def _generate_search_space(self): | |||
| """ | |||
| Generate search space from mutables. | |||
| Here is the search space format: | |||
| :: | |||
| { key_name: {"_type": "layer_choice", | |||
| "_value": ["conv1", "conv2"]} } | |||
| { key_name: {"_type": "input_choice", | |||
| "_value": {"candidates": ["in1", "in2"], | |||
| "n_chosen": 1}} } | |||
| Returns | |||
| ------- | |||
| dict | |||
| the generated search space | |||
| """ | |||
| res = OrderedDict() | |||
| res["op_list"] = OrderedDict() | |||
| res["search_space"] = {"reduction_cell": OrderedDict(), "normal_cell": OrderedDict()} | |||
| keys = [] | |||
| for mutable in self.mutables: | |||
| # for now we only generate flattened search space | |||
| if (len(res["search_space"]["reduction_cell"]) + len(res["search_space"]["normal_cell"])) >= 36: | |||
| break | |||
| if isinstance(mutable, LayerChoice): | |||
| key = mutable.key | |||
| if key not in keys: | |||
| val = mutable.names | |||
| if not res["op_list"]: | |||
| res["op_list"] = {"_type": "layer_choice", "_value": val + ["none"]} | |||
| node_type = "normal_cell" if "normal" in key else "reduction_cell" | |||
| res["search_space"][node_type][key] = "op_list" | |||
| keys.append(key) | |||
| elif isinstance(mutable, InputChoice): | |||
| key = mutable.key | |||
| if key not in keys: | |||
| node_type = "normal_cell" if "normal" in key else "reduction_cell" | |||
| res["search_space"][node_type][key] = {"_type": "input_choice", | |||
| "_value": {"candidates": mutable.choose_from, | |||
| "n_chosen": mutable.n_chosen}} | |||
| keys.append(key) | |||
| else: | |||
| raise TypeError("Unsupported mutable type: '%s'." % type(mutable)) | |||
| return res | |||
| @@ -0,0 +1,81 @@ | |||
| # train stage | |||
| `python pcdarts_train.py --data_dir '../data/' --result_path 'trial_id/result.json' --log_path 'trial_id/log' --search_space_path 'experiment_id/search_space.json' --best_selected_space_path 'experiment_id/best_selected_space.json' --trial_id 0 --layers 5 --model_lr 0.025 --arch_lr 3e-4 --epochs 2 --pre_epochs 1 --batch_size 64 --channels 16` | |||
| # select stage | |||
| `python pcdarts_select.py --best_selected_space_path 'experiment_id/best_selected_space.json' ` | |||
| # retrain stage | |||
| `python pcdarts_retrain.py --data_dir '../data/' --result_path 'trial_id/result.json' --log_path 'trial_id/log' --best_selected_space_path 'experiment_id/best_selected_space.json' --best_checkpoint_dir 'experiment_id/' --trial_id 0 --batch_size 96 --epochs 2 --lr 0.01 --layers 20 --channels 36` | |||
| # output file | |||
| `result.json` | |||
| ``` | |||
| {'type': 'Accuracy', 'result': {'sequence': 0, 'category': 'epoch', 'value': 0.1}} | |||
| {'type': 'Accuracy', 'result': {'sequence': 1, 'category': 'epoch', 'value': 0.0}} | |||
| {'type': 'Accuracy', 'result': {'sequence': 2, 'category': 'epoch', 'value': 0.0}} | |||
| {'type': 'Accuracy', 'result': {'sequence': 3, 'category': 'epoch', 'value': 0.0}} | |||
| {'type': 'Accuracy', 'result': {'sequence': 4, 'category': 'epoch', 'value': 0.0}} | |||
| {'type': 'Cost_time', 'result': {'value': '41.614346981048584 s'}} | |||
| ``` | |||
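| Each line of `result.json` is written with Python's `str()`, so the records use single quotes and are not strict JSON. A small reading sketch (illustrative, not part of the provided scripts) can use `ast.literal_eval`: | |||
| ``` | |||
| import ast | |||
| accuracies = [] | |||
| with open("trial_id/result.json") as f:  # path used in the commands above | |||
|     for line in f: | |||
|         if not line.strip(): | |||
|             continue | |||
|         record = ast.literal_eval(line.strip())  # each line is a Python dict literal | |||
|         if record["type"] == "Accuracy": | |||
|             accuracies.append(record["result"]["value"]) | |||
| print("best accuracy:", max(accuracies)) | |||
| ``` | |||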
| `search_space.json` | |||
| ``` | |||
| { | |||
| "op_list": { | |||
| "_type": "layer_choice", | |||
| "_value": [ | |||
| "maxpool", | |||
| "avgpool", | |||
| "skipconnect", | |||
| "sepconv3x3", | |||
| "sepconv5x5", | |||
| "dilconv3x3", | |||
| "dilconv5x5", | |||
| "none" | |||
| ] | |||
| }, | |||
| "search_space": { | |||
| "normal_n2_p0": "op_list", | |||
| "normal_n2_p1": "op_list", | |||
| "normal_n2_switch": { | |||
| "_type": "input_choice", | |||
| "_value": { | |||
| "candidates": [ | |||
| "normal_n2_p0", | |||
| "normal_n2_p1" | |||
| ], | |||
| "n_chosen": 2 | |||
| } | |||
| }, | |||
| ... | |||
| } | |||
| ``` | |||
| `best_selected_space.json` | |||
| ``` | |||
| { | |||
| "normal_n2_p0": "dilconv5x5", | |||
| "normal_n2_p1": "dilconv5x5", | |||
| "normal_n2_switch": [ | |||
| "normal_n2_p0", | |||
| "normal_n2_p1" | |||
| ], | |||
| "normal_n3_p0": "sepconv3x3", | |||
| "normal_n3_p1": "dilconv5x5", | |||
| "normal_n3_p2": [], | |||
| "normal_n3_switch": [ | |||
| "normal_n3_p0", | |||
| "normal_n3_p1" | |||
| ], | |||
| "normal_n4_p0": [], | |||
| "normal_n4_p1": "dilconv5x5", | |||
| "normal_n4_p2": "sepconv5x5", | |||
| "normal_n4_p3": [], | |||
| "normal_n4_switch": [ | |||
| "normal_n4_p1", | |||
| "normal_n4_p2" | |||
| ], | |||
| ... | |||
| } | |||
| ``` | |||
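| In the retrain stage, `best_selected_space.json` is consumed by `apply_fixed_architecture`, which fixes the searched choices inside the model before training, as in the retrain script. A condensed sketch of that step (paths and hyper-parameters taken from the retrain command above): | |||
| ``` | |||
| from model import CNN | |||
| from pytorch.fixed import apply_fixed_architecture | |||
| model = CNN(32, 3, 36, 10, 20, auxiliary=True, search=False)  # channels=36, classes=10, layers=20 | |||
| apply_fixed_architecture(model, "experiment_id/best_selected_space.json") | |||
| ``` | |||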
| @@ -0,0 +1,2 @@ | |||
| from pytorch.pdarts.pdartsmutator import PdartsMutator | |||
| from pytorch.pdarts.pdartstrainer import PdartsTrainer | |||
| @@ -0,0 +1,177 @@ | |||
| # Copyright (c) Microsoft Corporation. | |||
| # Licensed under the MIT license. | |||
| from collections import OrderedDict | |||
| import torch | |||
| import torch.nn as nn | |||
| from pytorch import mutables | |||
| from pytorch.darts import ops | |||
| class AuxiliaryHead(nn.Module): | |||
| """ Auxiliary head in 2/3 place of network to let the gradient flow well """ | |||
| def __init__(self, input_size, C, n_classes): | |||
| """ assuming input size 7x7 or 8x8 """ | |||
| assert input_size in [7, 8] | |||
| super().__init__() | |||
| self.net = nn.Sequential( | |||
| nn.ReLU(inplace=True), | |||
| nn.AvgPool2d(5, stride=input_size - 5, padding=0, count_include_pad=False), # 2x2 out | |||
| nn.Conv2d(C, 128, kernel_size=1, bias=False), | |||
| nn.BatchNorm2d(128), | |||
| nn.ReLU(inplace=True), | |||
| nn.Conv2d(128, 768, kernel_size=2, bias=False), # 1x1 out | |||
| nn.BatchNorm2d(768), | |||
| nn.ReLU(inplace=True) | |||
| ) | |||
| self.linear = nn.Linear(768, n_classes) | |||
| def forward(self, x): | |||
| out = self.net(x) | |||
| out = out.view(out.size(0), -1) # flatten | |||
| logits = self.linear(out) | |||
| return logits | |||
| class Node(nn.Module): | |||
| def __init__(self, node_id, num_prev_nodes, channels, num_downsample_connect, search, dropout_rate): | |||
| super().__init__() | |||
| self.dropout_rate = dropout_rate | |||
| self.ops = nn.ModuleList() | |||
| choice_keys = [] | |||
| for i in range(num_prev_nodes): | |||
| stride = 2 if i < num_downsample_connect else 1 | |||
| choice_keys.append("{}_p{}".format(node_id, i)) | |||
| skip_op = nn.Identity() if stride == 1 else ops.FactorizedReduce(channels, channels, affine=False) | |||
| # In search, op-level dropout for skip-connect | |||
| if search and self.dropout_rate > 0: | |||
| skip_op = nn.Sequential(skip_op, nn.Dropout(self.dropout_rate)) | |||
| self.ops.append( | |||
| mutables.LayerChoice(OrderedDict([ | |||
| ("maxpool", ops.PoolBN('max', channels, 3, stride, 1, affine=False)), | |||
| ("avgpool", ops.PoolBN('avg', channels, 3, stride, 1, affine=False)), | |||
| ("skipconnect", skip_op), | |||
| ("sepconv3x3", ops.SepConv(channels, channels, 3, stride, 1, affine=False)), | |||
| ("sepconv5x5", ops.SepConv(channels, channels, 5, stride, 2, affine=False)), | |||
| ("dilconv3x3", ops.DilConv(channels, channels, 3, stride, 2, 2, affine=False)), | |||
| ("dilconv5x5", ops.DilConv(channels, channels, 5, stride, 4, 2, affine=False)) | |||
| ]), key=choice_keys[-1])) | |||
# In retrain, DropPath is applied to non-skip-connect ops; DropPath's p defaults to 0
| self.drop_path = ops.DropPath() | |||
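# input_switch picks n_chosen=2 of the candidate inputs (one per predecessor node)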
| self.input_switch = mutables.InputChoice(choose_from=choice_keys, n_chosen=2, key="{}_switch".format(node_id)) | |||
| def forward(self, prev_nodes): | |||
| assert len(self.ops) == len(prev_nodes) | |||
| output = [] | |||
| for op, node in zip(self.ops, prev_nodes): | |||
| out = op(node) | |||
| # In retrain | |||
| if out is not None: | |||
| if not isinstance(op, nn.Identity): | |||
| out = self.drop_path(out) | |||
| else: | |||
| out = None | |||
| output.append(out) | |||
| # out = [op(node) for op, node in zip(self.ops, prev_nodes)] | |||
| # out = [self.drop_path(o) if o is not None else None for o in out] | |||
| return self.input_switch(output) | |||
| class Cell(nn.Module): | |||
| def __init__(self, n_nodes, channels_pp, channels_p, channels, reduction_p, reduction, search, dropout_rate): | |||
| super().__init__() | |||
| self.reduction = reduction | |||
| self.n_nodes = n_nodes | |||
| # If previous cell is reduction cell, current input size does not match with | |||
| # output size of cell[k-2]. So the output[k-2] should be reduced by preprocessing. | |||
| if reduction_p: | |||
| self.preproc0 = ops.FactorizedReduce(channels_pp, channels, affine=False) | |||
| else: | |||
| self.preproc0 = ops.StdConv(channels_pp, channels, 1, 1, 0, affine=False) | |||
| self.preproc1 = ops.StdConv(channels_p, channels, 1, 1, 0, affine=False) | |||
| # generate dag | |||
| self.mutable_ops = nn.ModuleList() | |||
| for depth in range(2, self.n_nodes + 2): | |||
| self.mutable_ops.append(Node("{}_n{}".format("reduce" if reduction else "normal", depth), depth, channels, 2 if reduction else 0, search, dropout_rate)) | |||
| def forward(self, s0, s1): | |||
| # s0, s1 are the outputs of previous previous cell and previous cell, respectively. | |||
| tensors = [self.preproc0(s0), self.preproc1(s1)] | |||
| for node in self.mutable_ops: | |||
| cur_tensor = node(tensors) | |||
| tensors.append(cur_tensor) | |||
| output = torch.cat(tensors[2:], dim=1) | |||
| return output | |||
| class CNN(nn.Module): | |||
| def __init__(self, input_size, in_channels, channels, n_classes, n_layers, dropout_rate, n_nodes=4, stem_multiplier=3, auxiliary=False, search=True): | |||
| super().__init__() | |||
| self.in_channels = in_channels | |||
| self.channels = channels | |||
| self.n_classes = n_classes | |||
| self.n_layers = n_layers | |||
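# place the auxiliary head after the cell at 2/3 of the network depth; -1 disables it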
| self.aux_pos = 2 * n_layers // 3 if auxiliary else -1 | |||
| c_cur = stem_multiplier * self.channels | |||
| self.stem = nn.Sequential( | |||
| nn.Conv2d(in_channels, c_cur, 3, 1, 1, bias=False), | |||
| nn.BatchNorm2d(c_cur) | |||
| ) | |||
| # for the first cell, stem is used for both s0 and s1 | |||
# [!] channels_pp and channels_p are output channel sizes, while c_cur is an input channel size.
| channels_pp, channels_p, c_cur = c_cur, c_cur, channels | |||
| self.cells = nn.ModuleList() | |||
| reduction_p, reduction = False, False | |||
| for i in range(n_layers): | |||
| reduction_p, reduction = reduction, False | |||
| # Reduce featuremap size and double channels in 1/3 and 2/3 layer. | |||
| if i in [n_layers // 3, 2 * n_layers // 3]: | |||
| c_cur *= 2 | |||
| reduction = True | |||
| cell = Cell(n_nodes, channels_pp, channels_p, c_cur, reduction_p, reduction, search, dropout_rate) | |||
| self.cells.append(cell) | |||
| c_cur_out = c_cur * n_nodes | |||
| channels_pp, channels_p = channels_p, c_cur_out | |||
| if i == self.aux_pos: | |||
| self.aux_head = AuxiliaryHead(input_size // 4, channels_p, n_classes) | |||
| self.gap = nn.AdaptiveAvgPool2d(1) | |||
| self.linear = nn.Linear(channels_p, n_classes) | |||
| def forward(self, x): | |||
| s0 = s1 = self.stem(x) | |||
| aux_logits = None | |||
| for i, cell in enumerate(self.cells): | |||
| s0, s1 = s1, cell(s0, s1) | |||
| if i == self.aux_pos and self.training: | |||
| aux_logits = self.aux_head(s1) | |||
| out = self.gap(s1) | |||
| out = out.view(out.size(0), -1) # flatten | |||
| logits = self.linear(out) | |||
| if aux_logits is not None: | |||
| return logits, aux_logits | |||
| return logits | |||
| def drop_path_prob(self, p, search=True): | |||
| if search: | |||
| for module in self.modules(): | |||
| # In search, update dropout rate | |||
| if isinstance(module, nn.Sequential) and isinstance(module[0], nn.Identity): | |||
module[1].p = p  # nn.Dropout stores its drop probability in .p
| else: | |||
| # In retrain, update ops.DropPath | |||
| for module in self.modules(): | |||
| if isinstance(module, ops.DropPath): | |||
| module.p = p | |||
| @@ -0,0 +1,203 @@ | |||
| import sys | |||
sys.path.append('../..')
| import os | |||
| import logging | |||
| import time | |||
| import json | |||
| from argparse import ArgumentParser | |||
| import torch | |||
| import torch.nn as nn | |||
| # from torch.utils.tensorboard import SummaryWriter | |||
| from model import CNN | |||
| from pytorch.fixed import apply_fixed_architecture | |||
| from pytorch.utils import set_seed, mkdirs, init_logger, save_best_checkpoint, AverageMeter | |||
| from pytorch.darts import utils | |||
| from pytorch.darts import datasets | |||
| from pytorch.retrainer import Retrainer | |||
| logger = logging.getLogger(__name__) | |||
| device = torch.device("cuda" if torch.cuda.is_available() else "cpu") | |||
| # writer = SummaryWriter() | |||
| class PdartsRetrainer(Retrainer): | |||
| def __init__(self, aux_weight, grad_clip, epochs, log_frequency): | |||
| self.aux_weight = aux_weight | |||
| self.grad_clip = grad_clip | |||
| self.epochs = epochs | |||
| self.log_frequency = log_frequency | |||
| def train(self, train_loader, model, optimizer, criterion, epoch): | |||
| top1 = AverageMeter("top1") | |||
| top5 = AverageMeter("top5") | |||
| losses = AverageMeter("losses") | |||
| cur_step = epoch * len(train_loader) | |||
| cur_lr = optimizer.param_groups[0]["lr"] | |||
| logger.info("Epoch %d LR %.6f", epoch, cur_lr) | |||
| # writer.add_scalar("lr", cur_lr, global_step=cur_step) | |||
| model.train() | |||
| for step, (x, y) in enumerate(train_loader): | |||
| x, y = x.to(device, non_blocking=True), y.to(device, non_blocking=True) | |||
| bs = x.size(0) | |||
| optimizer.zero_grad() | |||
| logits, aux_logits = model(x) | |||
| loss = criterion(logits, y) | |||
| if self.aux_weight > 0.: | |||
| loss += self.aux_weight * criterion(aux_logits, y) | |||
| loss.backward() | |||
| # gradient clipping | |||
| nn.utils.clip_grad_norm_(model.parameters(), self.grad_clip) | |||
| optimizer.step() | |||
| accuracy = utils.accuracy(logits, y, topk=(1, 5)) | |||
| losses.update(loss.item(), bs) | |||
| top1.update(accuracy["acc1"], bs) | |||
| top5.update(accuracy["acc5"], bs) | |||
| # writer.add_scalar("loss/train", loss.item(), global_step=cur_step) | |||
| # writer.add_scalar("acc1/train", accuracy["acc1"], global_step=cur_step) | |||
| # writer.add_scalar("acc5/train", accuracy["acc5"], global_step=cur_step) | |||
| if step % self.log_frequency == 0 or step == len(train_loader) - 1: | |||
| logger.info( | |||
| "Train: [{:3d}/{}] Step {:03d}/{:03d} Loss {losses.avg:.3f} " | |||
| "Prec@(1,5) ({top1.avg:.1%}, {top5.avg:.1%})".format( | |||
| epoch + 1, self.epochs, step, len(train_loader) - 1, losses=losses, | |||
| top1=top1, top5=top5)) | |||
| cur_step += 1 | |||
| logger.info("Train: [{:3d}/{}] Final Prec@1 {:.4%}".format(epoch + 1, self.epochs, top1.avg)) | |||
| def validate(self, valid_loader, model, criterion, epoch, cur_step): | |||
| top1 = AverageMeter("top1") | |||
| top5 = AverageMeter("top5") | |||
| losses = AverageMeter("losses") | |||
| model.eval() | |||
| with torch.no_grad(): | |||
| for step, (X, y) in enumerate(valid_loader): | |||
| X, y = X.to(device, non_blocking=True), y.to(device, non_blocking=True) | |||
| bs = X.size(0) | |||
| logits = model(X) | |||
| loss = criterion(logits, y) | |||
| accuracy = utils.accuracy(logits, y, topk=(1, 5)) | |||
| losses.update(loss.item(), bs) | |||
| top1.update(accuracy["acc1"], bs) | |||
| top5.update(accuracy["acc5"], bs) | |||
| if step % self.log_frequency == 0 or step == len(valid_loader) - 1: | |||
| logger.info( | |||
| "Valid: [{:3d}/{}] Step {:03d}/{:03d} Loss {losses.avg:.3f} " | |||
| "Prec@(1,5) ({top1.avg:.1%}, {top5.avg:.1%})".format( | |||
| epoch + 1, self.epochs, step, len(valid_loader) - 1, losses=losses, | |||
| top1=top1, top5=top5)) | |||
| # writer.add_scalar("loss/test", losses.avg, global_step=cur_step) | |||
| # writer.add_scalar("acc1/test", top1.avg, global_step=cur_step) | |||
| # writer.add_scalar("acc5/test", top5.avg, global_step=cur_step) | |||
| logger.info("Valid: [{:3d}/{}] Final Prec@1 {:.4%}".format(epoch + 1, self.epochs, top1.avg)) | |||
| return top1.avg | |||
| if __name__ == "__main__": | |||
| parser = ArgumentParser("Pdarts retrain") | |||
| parser.add_argument("--data_dir", type=str, | |||
default='./', help="directory of the dataset")
| parser.add_argument("--result_path", type=str, | |||
| default='./result.json', help="training result") | |||
| parser.add_argument("--log_path", type=str, | |||
default='0/log', help="log for info")
| parser.add_argument("--best_selected_space_path", type=str, | |||
| default='./best_selected_space.json', help="final best selected space") | |||
| parser.add_argument("--best_checkpoint_dir", type=str, | |||
| default='', help="default name is best_checkpoint_epoch{}.pth") | |||
| parser.add_argument('--trial_id', type=int, default=0, metavar='N', | |||
help='trial id, starting from 0')
| parser.add_argument("--layers", default=20, type=int) | |||
| parser.add_argument("--batch_size", default=96, type=int) | |||
| parser.add_argument("--log_frequency", default=10, type=int) | |||
| parser.add_argument("--epochs", default=600, type=int) | |||
| parser.add_argument("--lr", default=0.025, type=float) | |||
| parser.add_argument("--channels", default=36, type=int) | |||
| parser.add_argument("--aux_weight", default=0.4, type=float) | |||
| parser.add_argument("--drop_path_prob", default=0.3, type=float) | |||
| parser.add_argument("--workers", default=4) | |||
| parser.add_argument("--grad_clip", default=5., type=float) | |||
| args = parser.parse_args() | |||
| mkdirs(args.result_path, args.log_path, args.best_checkpoint_dir) | |||
| init_logger(args.log_path) | |||
| logger.info(args) | |||
| set_seed(args.trial_id) | |||
| logger.info("loading data") | |||
| dataset_train, dataset_valid = datasets.get_dataset("cifar10", cutout_length=16, root=args.data_dir) | |||
| model = CNN(32, 3, 36, 10, args.layers, auxiliary=True, search=False, dropout_rate=0.0) | |||
| if isinstance(args.best_selected_space_path, str): | |||
| with open(args.best_selected_space_path) as f: | |||
| fixed_arc = json.load(f) | |||
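# fix the mutable choices in the model to the searched architecture before retraining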
| apply_fixed_architecture(model, fixed_arc=fixed_arc["best_selected_space"]) | |||
| criterion = nn.CrossEntropyLoss() | |||
| model.to(device) | |||
| criterion.to(device) | |||
| optimizer = torch.optim.SGD(model.parameters(), args.lr, momentum=0.9, weight_decay=3.0E-4) | |||
| lr_scheduler = torch.optim.lr_scheduler.CosineAnnealingLR(optimizer, args.epochs, eta_min=1E-6) | |||
| train_loader = torch.utils.data.DataLoader(dataset_train, | |||
| batch_size=args.batch_size, | |||
| shuffle=True, | |||
| num_workers=args.workers, | |||
| pin_memory=True) | |||
| valid_loader = torch.utils.data.DataLoader(dataset_valid, | |||
| batch_size=args.batch_size, | |||
| shuffle=False, | |||
| num_workers=args.workers, | |||
| pin_memory=True) | |||
| retrainer = PdartsRetrainer(aux_weight=args.aux_weight, | |||
| grad_clip=args.grad_clip, | |||
| epochs=args.epochs, | |||
log_frequency=args.log_frequency)
| best_top1 = 0. | |||
| start_time = time.time() | |||
| for epoch in range(args.epochs): | |||
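# DropPath probability grows linearly from 0 to --drop_path_prob over the retraining epochs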
| drop_prob = args.drop_path_prob * epoch / args.epochs | |||
| model.drop_path_prob(drop_prob) | |||
| # training | |||
| retrainer.train(train_loader, model, optimizer, criterion, epoch) | |||
| # validation | |||
| cur_step = (epoch + 1) * len(train_loader) | |||
| top1 = retrainer.validate(valid_loader, model, criterion, epoch, cur_step) | |||
# The backend filters lines of this form from the terminal output: {"type": "Accuracy", "result": {"sequence": 1, "category": "epoch", "value": 96.7}}
| logger.info({"type": "Accuracy", "result": {"sequence": epoch, "category": "epoch", "value": top1}}) | |||
| with open(args.result_path, "a") as file: | |||
| file.write(str({"type": "Accuracy", "result": {"sequence": epoch, "category": "epoch", "value": top1}}) + '\n') | |||
| best_top1 = max(best_top1, top1) | |||
| lr_scheduler.step() | |||
| logger.info("Final best Prec@1 = {:.4%}".format(best_top1)) | |||
| cost_time = time.time() - start_time | |||
# The backend filters lines of this form from the terminal output: {"type": "Cost_time", "result": {"value": "* s"}}
| logger.info({"type": "Cost_time", "result": {"value": str(cost_time) + ' s'}}) | |||
| with open(args.result_path, "a") as file: | |||
| file.write(str({"type": "Cost_time", "result": {"value": str(cost_time) + ' s'}})) | |||
| # result["Cost_time"] = str(cost_time) + ' s' | |||
| # dump_global_result(args.result_path, result) | |||
| save_best_checkpoint(args.best_checkpoint_dir, model, optimizer, epoch) | |||
| logger.info("Save best checkpoint in {}".format(os.path.join(args.best_checkpoint_dir, "best_checkpoint_epoch{}.pth".format(epoch)))) | |||
| @@ -0,0 +1,22 @@ | |||
| import sys | |||
| sys.path.append('../..') | |||
| from pytorch.selector import Selector | |||
| from argparse import ArgumentParser | |||
| class PdartsSelector(Selector): | |||
| def __init__(self, single_candidate=True): | |||
| super().__init__(single_candidate) | |||
| def fit(self): | |||
| pass | |||
| if __name__ == "__main__": | |||
| parser = ArgumentParser("PDARTS select") | |||
| parser.add_argument("--best_selected_space_path", type=str, | |||
| default='./best_selected_space.json', help="final best selected space") | |||
| args = parser.parse_args() | |||
| darts_selector = PdartsSelector(True) | |||
| darts_selector.fit() | |||
| @@ -0,0 +1,79 @@ | |||
| import sys | |||
sys.path.append('../..')
| import time | |||
| import logging | |||
| from argparse import ArgumentParser | |||
| from pdartstrainer import PdartsTrainer | |||
| from pytorch.utils import mkdirs, set_seed, init_logger, list_str2int | |||
| logger = logging.getLogger(__name__) | |||
| if __name__ == "__main__": | |||
| parser = ArgumentParser("pdarts") | |||
| parser.add_argument("--data_dir", type=str, | |||
default='../data/', help="directory of the dataset")
| parser.add_argument("--result_path", type=str, | |||
| default='0/result.json', help="training result") | |||
| parser.add_argument("--log_path", type=str, | |||
| default='0/log', help="log for info") | |||
| parser.add_argument("--search_space_path", type=str, | |||
| default='./search_space.json', help="search space of PDARTS") | |||
| parser.add_argument("--best_selected_space_path", type=str, | |||
| default='./best_selected_space.json', help="final best selected space") | |||
parser.add_argument('--trial_id', type=int, default=0, help='trial id, also used as the random seed for reproducibility')
| parser.add_argument('--model_lr', type=float, default=0.025, help='learning rate for training model weights') | |||
| parser.add_argument('--arch_lr', type=float, default=3e-4, help='learning rate for training architecture') | |||
| parser.add_argument("--epochs", default=2, type=int) | |||
| parser.add_argument("--pre_epochs", default=15, type=int) | |||
| parser.add_argument("--batch_size", default=96, type=int) | |||
| parser.add_argument("--init_layers", default=5, type=int) | |||
| parser.add_argument('--add_layers', default=[0, 6, 12], nargs='+', type=int, help='add layers in each stage') | |||
| parser.add_argument('--dropped_ops', default=[3, 2, 1], nargs='+', type=int, help='drop ops in each stage') | |||
| parser.add_argument('--dropout_rates', default=[0.1, 0.4, 0.7], nargs='+', type=float, help='drop ops probability in each stage') | |||
| # parser.add_argument('--add_layers', action='append', help='add layers in each stage') | |||
| # parser.add_argument('--dropped_ops', action='append', help='drop ops in each stage') | |||
| # parser.add_argument('--dropout_rates', action='append', help='drop ops probability in each stage') | |||
| parser.add_argument("--channels", default=16, type=int) | |||
| parser.add_argument("--log_frequency", default=50, type=int) | |||
| parser.add_argument("--class_num", default=10, type=int) | |||
| parser.add_argument("--unrolled", default=False, action="store_true") | |||
| args = parser.parse_args() | |||
| mkdirs(args.result_path, args.log_path, args.search_space_path, args.best_selected_space_path) | |||
| init_logger(args.log_path, "info") | |||
| set_seed(args.trial_id) | |||
| # args.add_layers = list_str2int(args.add_layers) | |||
| # args.dropped_ops = list_str2int(args.dropped_ops) | |||
| # args.dropout_rates = list_str2int(args.dropout_rates) | |||
| logger.info(args) | |||
| logger.info("initializing pdarts trainer") | |||
| trainer = PdartsTrainer( | |||
| init_layers=args.init_layers, | |||
| pdarts_num_layers=args.add_layers, | |||
| pdarts_num_to_drop=args.dropped_ops, | |||
| pdarts_dropout_rates=args.dropout_rates, | |||
| num_epochs=args.epochs, | |||
| num_pre_epochs=args.pre_epochs, | |||
| model_lr=args.model_lr, | |||
| arch_lr=args.arch_lr, | |||
| batch_size=args.batch_size, | |||
| class_num=args.class_num, | |||
| channels=args.channels, | |||
| result_path=args.result_path, | |||
| log_frequency=args.log_frequency, | |||
| unrolled=args.unrolled, | |||
data_dir=args.data_dir,
| search_space_path=args.search_space_path, | |||
| best_selected_space_path=args.best_selected_space_path | |||
| ) | |||
| logger.info("training") | |||
| start_time = time.time() | |||
| trainer.train(validate=True) | |||
| # result = trainer.result | |||
| cost_time = time.time() - start_time | |||
# The backend filters lines of this form from the terminal output: {"type": "Cost_time", "result": {"value": "* s"}}
| logger.info({"type": "Cost_time", "result": {"value": str(cost_time) + ' s'}}) | |||
| with open(args.result_path, "a") as file: | |||
| file.write(str({"type": "Cost_time", "result": {"value": str(cost_time) + ' s'}})) | |||
| @@ -0,0 +1,201 @@ | |||
| import copy | |||
| import numpy as np | |||
| import torch | |||
| import logging | |||
| from collections import OrderedDict | |||
| from torch import nn | |||
| from pytorch.darts.dartsmutator import DartsMutator | |||
| from pytorch.mutables import LayerChoice, InputChoice | |||
| logger = logging.getLogger(__name__) | |||
| class PdartsMutator(DartsMutator): | |||
| """ | |||
Works with PdartsTrainer to compute architecture weights for the candidate operations
and to drop the weakest candidates between P-DARTS stages (pdarts epochs).
| """ | |||
def __init__(self, model, pdarts_epoch_index, pdarts_num_to_drop, switches=None):
| self.pdarts_epoch_index = pdarts_epoch_index | |||
| self.pdarts_num_to_drop = pdarts_num_to_drop | |||
| # save the last two switches and choices for restrict skip | |||
| self.last_two_switches = None | |||
| self.last_two_choices = None | |||
| if switches is None: | |||
| self.switches = {} | |||
| else: | |||
| self.switches = switches | |||
| super(PdartsMutator, self).__init__(model) | |||
# This loop goes through mutables with distinct keys,
# mainly to update the length of each choice.
| for mutable in self.mutables: | |||
| if isinstance(mutable, LayerChoice): | |||
| switches = self.switches.get(mutable.key, [True for j in range(len(mutable))]) | |||
| # choices = self.choices[mutable.key] | |||
| operations_count = np.sum(switches) | |||
# The +1 accounts for the zero operation in the DARTS network:
# it is not a candidate in the LayerChoice (switches), but it still has an architecture weight,
# so one extra weight is allocated for it.
| self.choices[mutable.key] = nn.Parameter(1.0E-3 * torch.randn(operations_count + 1)) | |||
| self.switches[mutable.key] = switches | |||
# Update LayerChoice instances in the model,
# physically removing the dropped candidate operations.
| for module in self.model.modules(): | |||
| if isinstance(module, LayerChoice): | |||
| switches = self.switches.get(module.key) | |||
| choices = self.choices[module.key] | |||
| if len(module) > len(choices): | |||
# Iterate from last to first so that removals don't shift the remaining indices.
for index in range(len(switches)-1, -1, -1):
if not switches[index]:
| del module[index] | |||
| assert len(module) <= len(choices), "Failed to remove dropped choices." | |||
| def export(self, last, switches): | |||
# In the last P-DARTS stage, the number of skip-connects needs to be restricted.
| # Cannot rely on super().export() because P-DARTS has deleted some of the choices and has misaligned length. | |||
| if last: | |||
# restrict to at most 2 skip-connects (normal cells only)
| name = "normal" | |||
| max_num = 2 | |||
| skip_num = self.check_skip_num(name, switches) | |||
| logger.info("Initially, the number of skipconnect is {}.".format(skip_num)) | |||
| while skip_num > max_num: | |||
| logger.info("Restricting {} skipconnect to {}.".format(skip_num, max_num)) | |||
| logger.info("Original normal_switch is {}.".format(switches)) | |||
# update self.choices (set the skip-connect weight to 0) and self.switches (set the skip-connect switch to False)
| switches = self.delete_min_sk(name, switches) | |||
| logger.info("Restricted normal_switch is {}.".format(switches)) | |||
| skip_num = self.check_skip_num(name, switches) | |||
# get the boolean selection from the parent mutator's sample_final()
| results = super().sample_final() | |||
| for mutable in self.mutables: | |||
| if isinstance(mutable, LayerChoice): | |||
# Since some operations were physically removed,
# fill False back in for the dropped positions so the result aligns with the original op list.
| trained_result = results[mutable.key] | |||
| trained_index = 0 | |||
| switches = self.switches[mutable.key] | |||
| result = torch.Tensor(switches).bool() | |||
| for index in range(len(result)): | |||
| if result[index]: | |||
| result[index] = trained_result[trained_index] | |||
| trained_index += 1 | |||
| results[mutable.key] = result | |||
| return results | |||
| def drop_paths(self): | |||
| """ | |||
This method is called when a P-DARTS epoch (stage) finishes.
It prepares the switches for the next epoch;
candidate operations whose switch is False will be dropped in the next epoch.
| """ | |||
| all_switches = copy.deepcopy(self.switches) | |||
| for key in all_switches: | |||
| switches = all_switches[key] | |||
| idxs = [] | |||
| for j in range(len(switches)): | |||
| if switches[j]: | |||
| idxs.append(j) | |||
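# drop the last architecture weight, which is reserved for the zero operation, before ranking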
| sorted_weights = self.choices[key].data.cpu().numpy()[:-1] | |||
| drop = np.argsort(sorted_weights)[:self.pdarts_num_to_drop[self.pdarts_epoch_index]] | |||
| for idx in drop: | |||
| switches[idxs[idx]] = False | |||
| return all_switches | |||
| def check_skip_num(self, name, switches): | |||
| counter = 0 | |||
| for key in switches: | |||
| if name in key: | |||
# the zero operation is not in switches, so "skipconnect" sits at index 2
| if switches[key][2]: | |||
| counter += 1 | |||
| return counter | |||
| def delete_min_sk(self, name, switches): | |||
| def _get_sk_idx(key, switches): | |||
| if not switches[key][2]: | |||
| idx = -1 | |||
| else: | |||
| idx = 0 | |||
| for i in range(2): | |||
# count how many ops before skipconnect are still enabled, to locate its index in the compressed choices
| if self.switches[key][i]: | |||
| idx += 1 | |||
| return idx | |||
| sk_choices = [1.0 for i in range(14)] | |||
| sk_keys = [None for i in range(14)] # key has skip connection | |||
| sk_choices_idx = -1 | |||
| for key in switches: | |||
| if name in key: | |||
| # default key in order | |||
| sk_choices_idx += 1 | |||
| idx = _get_sk_idx(key, switches) | |||
| if not idx == -1: | |||
| sk_keys[sk_choices_idx] = key | |||
| sk_choices[sk_choices_idx] = self.choices[key][idx] | |||
| min_sk_idx = np.argmin(sk_choices) | |||
| idx = _get_sk_idx(sk_keys[min_sk_idx], switches) | |||
# TODO: should this modify self.choices in place, or operate on a deepcopy?
| self.choices[sk_keys[min_sk_idx]][idx] = 0.0 | |||
# TODO: should this modify self.switches in place, or operate on a deepcopy?
# self.switches holds the previous switches, while `switches` holds the present (last-stage) ones
| self.switches[sk_keys[min_sk_idx]][2] = False | |||
| switches[sk_keys[min_sk_idx]][2] = False | |||
| return switches | |||
| def _generate_search_space(self): | |||
| """ | |||
| Generate search space from mutables. | |||
| Here is the search space format: | |||
| :: | |||
| { key_name: {"_type": "layer_choice", | |||
| "_value": ["conv1", "conv2"]} } | |||
| { key_name: {"_type": "input_choice", | |||
| "_value": {"candidates": ["in1", "in2"], | |||
| "n_chosen": 1}} } | |||
| Returns | |||
| ------- | |||
| dict | |||
| the generated search space | |||
| """ | |||
| res = OrderedDict() | |||
| res["op_list"] = OrderedDict() | |||
| res["search_space"] = {"reduction_cell": OrderedDict(), "normal_cell": OrderedDict()} | |||
| keys = [] | |||
| for mutable in self.mutables: | |||
| # for now we only generate flattened search space | |||
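# 36 = one normal cell + one reduction cell, each contributing 18 keys (2+3+4+5 edges plus 4 input switches)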
| if (len(res["search_space"]["reduction_cell"]) + len(res["search_space"]["normal_cell"])) >= 36: | |||
| break | |||
| if isinstance(mutable, LayerChoice): | |||
| key = mutable.key | |||
| if key not in keys: | |||
| val = mutable.names | |||
| if not res["op_list"]: | |||
| res["op_list"] = {"_type": "layer_choice", "_value": val + ["none"]} | |||
| node_type = "normal_cell" if "normal" in key else "reduction_cell" | |||
| res["search_space"][node_type][key] = "op_list" | |||
| keys.append(key) | |||
| elif isinstance(mutable, InputChoice): | |||
| key = mutable.key | |||
| if key not in keys: | |||
| node_type = "normal_cell" if "normal" in key else "reduction_cell" | |||
| res["search_space"][node_type][key] = {"_type": "input_choice", | |||
| "_value": {"candidates": mutable.choose_from, | |||
| "n_chosen": mutable.n_chosen}} | |||
| keys.append(key) | |||
| else: | |||
| raise TypeError("Unsupported mutable type: '%s'." % type(mutable)) | |||
| return res | |||
| @@ -0,0 +1,167 @@ | |||
| import os | |||
| import logging | |||
| import torch | |||
| import torch.nn as nn | |||
| import numpy as np | |||
| from collections import OrderedDict | |||
| import json | |||
| from pytorch.callbacks import LRSchedulerCallback | |||
| from pytorch.trainer import BaseTrainer, TorchTensorEncoder | |||
| from pytorch.utils import dump_global_result | |||
| from model import CNN | |||
| from pdartsmutator import PdartsMutator | |||
| from pytorch.darts.utils import accuracy | |||
| from pytorch.darts import datasets | |||
| from pytorch.darts.dartstrainer import DartsTrainer | |||
| logger = logging.getLogger(__name__) | |||
| class PdartsTrainer(BaseTrainer): | |||
| """ | |||
| This trainer implements the PDARTS algorithm. | |||
P-DARTS builds on the DARTS algorithm and grows the network progressively to find deeper and better networks.
This class relies on the pdarts_num_layers and pdarts_num_to_drop parameters to control how the network grows:
pdarts_num_layers specifies how many layers are added on top of the initial network in each stage,
and pdarts_num_to_drop specifies how many candidate operations are dropped (per LayerChoice) in each stage,
so that the grown network stays at a similar size.
| """ | |||
| def __init__(self, init_layers, pdarts_num_layers, pdarts_num_to_drop, pdarts_dropout_rates, num_epochs, num_pre_epochs, model_lr, class_num, | |||
| arch_lr, channels, batch_size, result_path, log_frequency, unrolled, data_dir, search_space_path, | |||
| best_selected_space_path, device=None, workers=4): | |||
| super(PdartsTrainer, self).__init__() | |||
| self.init_layers = init_layers | |||
| self.class_num = class_num | |||
| self.channels = channels | |||
| self.model_lr = model_lr | |||
| self.num_epochs = num_epochs | |||
| self.class_num = class_num | |||
| self.pdarts_num_layers = pdarts_num_layers | |||
| self.pdarts_num_to_drop = pdarts_num_to_drop | |||
| self.pdarts_dropout_rates = pdarts_dropout_rates | |||
| self.pdarts_epoches = len(pdarts_num_to_drop) | |||
| self.search_space_path = search_space_path | |||
| self.best_selected_space_path = best_selected_space_path | |||
| logger.info("loading data") | |||
| dataset_train, dataset_valid = datasets.get_dataset( | |||
| "cifar10", root=data_dir) | |||
| self.darts_parameters = { | |||
| "metrics": lambda output, target: accuracy(output, target, topk=(1,)), | |||
| "arch_lr": arch_lr, | |||
| "num_epochs": num_epochs, | |||
| "num_pre_epochs": num_pre_epochs, | |||
| "dataset_train": dataset_train, | |||
| "dataset_valid": dataset_valid, | |||
| "batch_size": batch_size, | |||
| "result_path": result_path, | |||
| "workers": workers, | |||
| "device": device, | |||
| "log_frequency": log_frequency, | |||
| "unrolled": unrolled, | |||
| "search_space_path": None | |||
| } | |||
| def train(self, validate=False): | |||
| switches = None | |||
| last = False | |||
| for epoch in range(self.pdarts_epoches): | |||
| if epoch == self.pdarts_epoches - 1: | |||
| last = True | |||
| # create network for each stage | |||
| layers = self.init_layers + self.pdarts_num_layers[epoch] | |||
| init_dropout_rate = float(self.pdarts_dropout_rates[epoch]) | |||
| model = CNN(32, 3, self.channels, self.class_num, layers, | |||
| init_dropout_rate, n_nodes=4, search=True) | |||
| criterion = nn.CrossEntropyLoss() | |||
| optim = torch.optim.SGD( | |||
| model.parameters(), self.model_lr, momentum=0.9, weight_decay=3.0E-4) | |||
| lr_scheduler = torch.optim.lr_scheduler.CosineAnnealingLR( | |||
| optim, self.num_epochs, eta_min=0.001) | |||
| logger.info( | |||
| "############Start PDARTS training epoch %s############", epoch) | |||
| self.mutator = PdartsMutator( | |||
| model, epoch, self.pdarts_num_to_drop, switches) | |||
| if epoch == 0: | |||
| # only write original search space in first stage | |||
| search_space = self.mutator._generate_search_space() | |||
| dump_global_result(self.search_space_path, | |||
| search_space) | |||
| darts_callbacks = [] | |||
| if lr_scheduler is not None: | |||
| darts_callbacks.append(LRSchedulerCallback(lr_scheduler)) | |||
| # darts_callbacks.append(ArchitectureCheckpoint( | |||
| # os.path.join(self.selected_space_path, "stage_{}".format(epoch)))) | |||
| self.trainer = DartsTrainer(model, mutator=self.mutator, loss=criterion, | |||
| optimizer=optim, callbacks=darts_callbacks, **self.darts_parameters) | |||
| for train_epoch in range(self.darts_parameters["num_epochs"]): | |||
| for callback in darts_callbacks: | |||
| callback.on_epoch_begin(train_epoch) | |||
| # training | |||
| logger.info("Epoch %d Training", train_epoch) | |||
| if train_epoch < self.darts_parameters["num_pre_epochs"]: | |||
| dropout_rate = init_dropout_rate * \ | |||
| (self.darts_parameters["num_epochs"] - train_epoch - | |||
| 1) / self.darts_parameters["num_epochs"] | |||
| else: | |||
| # scale_factor = 0.2 | |||
| dropout_rate = init_dropout_rate * \ | |||
| np.exp(-(epoch - | |||
| self.darts_parameters["num_pre_epochs"]) * 0.2) | |||
| model.drop_path_prob(search=True, p=dropout_rate) | |||
| self.trainer.train_one_epoch(train_epoch) | |||
| if validate: | |||
| # validation | |||
| logger.info("Epoch %d Validating", train_epoch + 1) | |||
| self.trainer.validate_one_epoch( | |||
| train_epoch, log_print=True if last else False) | |||
| for callback in darts_callbacks: | |||
| callback.on_epoch_end(train_epoch) | |||
| switches = self.mutator.drop_paths() | |||
# In the last P-DARTS stage, restrict the number of skip-connects and save the best structure
| if last: | |||
| res = OrderedDict() | |||
| op_value = [value for value in search_space["op_list"]["_value"] if value != 'none'] | |||
| res["op_list"] = search_space["op_list"] | |||
| res["op_list"]["_value"] = op_value | |||
| res["best_selected_space"] = self.mutator.export(last, switches) | |||
| logger.info(res) | |||
| dump_global_result(self.best_selected_space_path, res) | |||
| def validate(self): | |||
| self.trainer.validate() | |||
| def export(self, file, last, switches): | |||
| self.mutator.export(last, switches) | |||
| mutator_export = self.mutator.export() | |||
| with open(file, "w") as f: | |||
| json.dump(mutator_export, f, indent=2, sort_keys=True, cls=TorchTensorEncoder) | |||
| def checkpoint(self, file_path, epoch): | |||
| if isinstance(self.model, nn.DataParallel): | |||
| child_model_state_dict = self.model.module.state_dict() | |||
| else: | |||
| child_model_state_dict = self.model.state_dict() | |||
| save_state = {'child_model_state_dict': child_model_state_dict, | |||
| 'optimizer_state_dict': self.optimizer.state_dict(), | |||
| 'epoch': epoch} | |||
| dest_path = os.path.join( | |||
| file_path, "best_checkpoint_epoch_{}.pth.tar".format(epoch)) | |||
| logger.info("Saving model to %s", dest_path) | |||
| torch.save(save_state, dest_path) | |||
| raise NotImplementedError("Not implemented yet") | |||
| @@ -0,0 +1,92 @@ | |||
| # train stage | |||
`python pdarts_train.py --data_dir '../data/' --result_path 'trial_id/result.json' --log_path 'trial_id/log' --search_space_path 'experiment_id/search_space.json' --best_selected_space_path 'experiment_id/best_selected_space.json' --trial_id 0 --model_lr 0.025 --arch_lr 3e-4 --epochs 2 --pre_epochs 1 --batch_size 64 --channels 16 --init_layers 5 --add_layers 0 6 12 --dropped_ops 3 2 1 --dropout_rates 0.1 0.4 0.7`
| # select stage | |||
| `python pdarts_select.py --best_selected_space_path 'experiment_id/best_selected_space.json'` | |||
| # retrain stage | |||
| `python pdarts_retrain.py --data_dir '../data/' --result_path 'trial_id/result.json' --log_path 'trial_id/log' --best_selected_space_path 'experiment_id/best_selected_space.json' --best_checkpoint_dir 'experiment_id/' --trial_id 0 --batch_size 96 --epochs 2 --lr 0.025 --layers 20 --channels 36` | |||
| # output file | |||
| `result.json` | |||
| ``` | |||
| {'type': 'Accuracy', 'result': {'sequence': 0, 'category': 'epoch', 'value': 0.1}} | |||
| {'type': 'Accuracy', 'result': {'sequence': 1, 'category': 'epoch', 'value': 0.0}} | |||
| {'type': 'Accuracy', 'result': {'sequence': 2, 'category': 'epoch', 'value': 0.0}} | |||
| {'type': 'Accuracy', 'result': {'sequence': 3, 'category': 'epoch', 'value': 0.0}} | |||
| {'type': 'Accuracy', 'result': {'sequence': 4, 'category': 'epoch', 'value': 0.0}} | |||
| {'type': 'Cost_time', 'result': {'value': '41.614346981048584 s'}} | |||
| ``` | |||
| `search_space.json` | |||
| ``` | |||
| { | |||
| "op_list": { | |||
| "_type": "layer_choice", | |||
| "_value": [ | |||
| "maxpool", | |||
| "avgpool", | |||
| "skipconnect", | |||
| "sepconv3x3", | |||
| "sepconv5x5", | |||
| "dilconv3x3", | |||
| "dilconv5x5", | |||
| "none" | |||
| ] | |||
| }, | |||
| "search_space": { | |||
| "normal_n2_p0": "op_list", | |||
| "normal_n2_p1": "op_list", | |||
| "normal_n2_switch": { | |||
| "_type": "input_choice", | |||
| "_value": { | |||
| "candidates": [ | |||
| "normal_n2_p0", | |||
| "normal_n2_p1" | |||
| ], | |||
| "n_chosen": 2 | |||
| } | |||
| }, | |||
| ... | |||
| } | |||
| ``` | |||
| `best_selected_space.json` | |||
| ``` | |||
| { | |||
| { | |||
| "op_list": { | |||
| "_type": "layer_choice", | |||
| "_value": [ | |||
| "maxpool", | |||
| "avgpool", | |||
| "skipconnect", | |||
| "sepconv3x3", | |||
| "sepconv5x5", | |||
| "dilconv3x3", | |||
| "dilconv5x5" | |||
| ] | |||
| }, | |||
| "best_selected_space": { | |||
| "normal_n2_p0": [ | |||
| false, | |||
| false, | |||
| false, | |||
| false, | |||
| true, | |||
| false, | |||
| false | |||
| ], | |||
| "normal_n2_p1": [ | |||
| true, | |||
| false, | |||
| false, | |||
| false, | |||
| false, | |||
| false, | |||
| false | |||
| ], | |||
| ... | |||
| } | |||
| ``` | |||
| @@ -0,0 +1,46 @@ | |||
| from abc import ABC, abstractmethod | |||
| class Retrainer(ABC): | |||
| """ | |||
Train the best-performing model from scratch, without structure optimization.
To implement a new retrainer, users need to implement:
| method: "train" | |||
| method: "__init__" | |||
| super().__init__() must be called in __init__ method | |||
Parameters
----------
| candidates: candidates to be evaluated | |||
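Examples
--------
A minimal subclass sketch (``MyRetrainer`` and its arguments are hypothetical):
    class MyRetrainer(Retrainer):
        def __init__(self, epochs):
            super().__init__()
            self.epochs = epochs
        def train(self):
            pass  # train the fixed architecture from scratch here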
| """ | |||
| @abstractmethod | |||
| def train(self): | |||
| """ | |||
| Override the method to train. | |||
| """ | |||
| raise NotImplementedError | |||
| def validate(self): | |||
| """ | |||
| Override the method to validate. | |||
| """ | |||
| raise NotImplementedError | |||
| def export(self, file): | |||
| """ | |||
| Override the method to export to file. | |||
| Parameters | |||
| ---------- | |||
| file : str | |||
| File path to export to. | |||
| """ | |||
| raise NotImplementedError | |||
| def checkpoint(self): | |||
| """ | |||
| Override to dump a checkpoint. | |||
| """ | |||
| raise NotImplementedError | |||
| @@ -0,0 +1,56 @@ | |||
| from abc import ABC, abstractmethod | |||
| class Selector(ABC): | |||
| """ | |||
Choose the best model from a group of candidates.
| To implement a new selector, users need to implement: | |||
| method: "fit" | |||
| method: "__init__" | |||
| super().__init__() must be called in __init__ method | |||
Parameters
----------
| candidates: candidates to be evaluated | |||
| ##### Examples ##### | |||
| # class HPOSelector(Selector): | |||
| # def __init__(self, *args, single_candidate=True): | |||
| # super().__init__(single_candidate) | |||
| # self.args = args | |||
| # def fit(self): | |||
| # | |||
# # only one candidate, so fit simply passes
| # | |||
| # pass | |||
| ########### | |||
| """ | |||
| @abstractmethod | |||
| def __init__(self, single_candidate=True): | |||
| self.single_candidate = single_candidate | |||
| self._valid() | |||
| @abstractmethod | |||
| def fit(self, candidates=None): | |||
| """ | |||
Evaluate the candidates and select the best one.
Any optimization algorithm can be implemented here.
If the input has only one candidate, return it directly.
| """ | |||
| raise NotImplementedError | |||
def _valid(self):
| if self.single_candidate: | |||
| print("### single model, selecting finished ###") | |||
| exit(0) | |||
| @@ -0,0 +1 @@ | |||
| from .mutator import RandomMutator | |||
| @@ -0,0 +1,36 @@ | |||
| import torch | |||
| import torch.nn.functional as F | |||
| from nni.nas.pytorch.mutator import Mutator | |||
| from nni.nas.pytorch.mutables import LayerChoice, InputChoice | |||
| class RandomMutator(Mutator): | |||
| """ | |||
Random mutator that samples a random candidate in the search space each time ``reset()`` is called.
It uses PyTorch's random functions, so users can set the PyTorch seed to ensure deterministic behavior.
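Examples
--------
A minimal usage sketch (assuming ``model`` is a PyTorch model built with NNI mutables):
    mutator = RandomMutator(model)
    mutator.reset()  # sample a new random architecture for subsequent forward passes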
| """ | |||
| def sample_search(self): | |||
| """ | |||
| Sample a random candidate. | |||
| """ | |||
| result = dict() | |||
| for mutable in self.mutables: | |||
| if isinstance(mutable, LayerChoice): | |||
| gen_index = torch.randint(high=len(mutable), size=(1, )) | |||
| result[mutable.key] = F.one_hot(gen_index, num_classes=len(mutable)).view(-1).bool() | |||
| elif isinstance(mutable, InputChoice): | |||
| if mutable.n_chosen is None: | |||
| result[mutable.key] = torch.randint(high=2, size=(mutable.n_candidates,)).view(-1).bool() | |||
| else: | |||
| perm = torch.randperm(mutable.n_candidates) | |||
| mask = [i in perm[:mutable.n_chosen] for i in range(mutable.n_candidates)] | |||
| result[mutable.key] = torch.tensor(mask, dtype=torch.bool) # pylint: disable=not-callable | |||
| return result | |||
| def sample_final(self): | |||
| """ | |||
| Same as :meth:`sample_search`. | |||
| """ | |||
| return self.sample_search() | |||
| @@ -0,0 +1,3 @@ | |||
| from .evolution import SPOSEvolution | |||
| from .mutator import SPOSSupernetTrainingMutator | |||
| from .trainer import SPOSSupernetTrainer | |||