
add tadl algorithm

pull/13/MERGE
enlin 2 years ago
parent
commit
adeddaab09
100 changed files with 15338 additions and 0 deletions
  1. dubhe-tadl/__init__.py  +2 -0
  2. dubhe-tadl/base_mutator.py  +159 -0
  3. dubhe-tadl/base_trainer.py  +40 -0
  4. dubhe-tadl/callbacks.py  +167 -0
  5. dubhe-tadl/classic_nas/fixed.py  +150 -0
  6. dubhe-tadl/classic_nas/mnist.py  +206 -0
  7. dubhe-tadl/classic_nas/model.py  +52 -0
  8. dubhe-tadl/classic_nas/mutator.py  +260 -0
  9. dubhe-tadl/classic_nas/readme  +64 -0
  10. dubhe-tadl/classic_nas/retrainer.py  +288 -0
  11. dubhe-tadl/classic_nas/selector.py  +58 -0
  12. dubhe-tadl/classic_nas/trainer.py  +269 -0
  13. dubhe-tadl/cream/README.md  +70 -0
  14. dubhe-tadl/cream/algorithms/__init__.py  +5 -0
  15. dubhe-tadl/cream/algorithms/mutator.py  +36 -0
  16. dubhe-tadl/cream/algorithms/trainer.py  +437 -0
  17. dubhe-tadl/cream/algorithms/utils.py  +39 -0
  18. dubhe-tadl/cream/lib/config.py  +123 -0
  19. dubhe-tadl/cream/lib/core/retrain.py  +129 -0
  20. dubhe-tadl/cream/lib/core/test.py  +100 -0
  21. dubhe-tadl/cream/lib/models/blocks/__init__.py  +2 -0
  22. dubhe-tadl/cream/lib/models/blocks/inverted_residual_block.py  +113 -0
  23. dubhe-tadl/cream/lib/models/blocks/residual_block.py  +105 -0
  24. dubhe-tadl/cream/lib/models/builders/build_childnet.py  +182 -0
  25. dubhe-tadl/cream/lib/models/builders/build_supernet.py  +214 -0
  26. dubhe-tadl/cream/lib/models/structures/childnet.py  +145 -0
  27. dubhe-tadl/cream/lib/models/structures/supernet.py  +202 -0
  28. dubhe-tadl/cream/lib/utils/builder_util.py  +270 -0
  29. dubhe-tadl/cream/lib/utils/flops_table.py  +79 -0
  30. dubhe-tadl/cream/lib/utils/op_by_layer_dict.py  +42 -0
  31. dubhe-tadl/cream/lib/utils/search_structure_supernet.py  +47 -0
  32. dubhe-tadl/cream/lib/utils/util.py  +178 -0
  33. dubhe-tadl/cream/pretrained/README.md  +6 -0
  34. dubhe-tadl/cream/retrainer.py  +462 -0
  35. dubhe-tadl/cream/selector.py  +21 -0
  36. dubhe-tadl/cream/test.py  +167 -0
  37. dubhe-tadl/cream/trainer.py  +312 -0
  38. dubhe-tadl/darts/__init__.py  +2 -0
  39. dubhe-tadl/darts/darts_retrain.py  +205 -0
  40. dubhe-tadl/darts/darts_select.py  +21 -0
  41. dubhe-tadl/darts/darts_train.py  +85 -0
  42. dubhe-tadl/darts/dartsmutator.py  +134 -0
  43. dubhe-tadl/darts/dartstrainer.py  +227 -0
  44. dubhe-tadl/darts/datasets.py  +56 -0
  45. dubhe-tadl/darts/model.py  +160 -0
  46. dubhe-tadl/darts/ops.py  +136 -0
  47. dubhe-tadl/darts/readme.md  +83 -0
  48. dubhe-tadl/darts/utils.py  +21 -0
  49. dubhe-tadl/dataloader.py  +331 -0
  50. dubhe-tadl/enas/README.md  +46 -0
  51. dubhe-tadl/enas/__init__.py  +5 -0
  52. dubhe-tadl/enas/datasets.py  +28 -0
  53. dubhe-tadl/enas/macro.py  +87 -0
  54. dubhe-tadl/enas/micro.py  +187 -0
  55. dubhe-tadl/enas/mutator.py  +197 -0
  56. dubhe-tadl/enas/ops.py  +129 -0
  57. dubhe-tadl/enas/retrain.py  +490 -0
  58. dubhe-tadl/enas/retrainer.py  +495 -0
  59. dubhe-tadl/enas/search.py  +135 -0
  60. dubhe-tadl/enas/selector.py  +18 -0
  61. dubhe-tadl/enas/trainer.py  +436 -0
  62. dubhe-tadl/enas/utils.py  +30 -0
  63. dubhe-tadl/fixed.py  +141 -0
  64. dubhe-tadl/log.py  +79 -0
  65. dubhe-tadl/mutables.py  +340 -0
  66. dubhe-tadl/mutator.py  +309 -0
  67. dubhe-tadl/network_morphism/README.md  +47 -0
  68. dubhe-tadl/network_morphism/algorithm/bayesian.py  +517 -0
  69. dubhe-tadl/network_morphism/algorithm/graph.py  +919 -0
  70. dubhe-tadl/network_morphism/algorithm/graph_transformer.py  +178 -0
  71. dubhe-tadl/network_morphism/algorithm/layer_transformer.py  +213 -0
  72. dubhe-tadl/network_morphism/algorithm/layers.py  +765 -0
  73. dubhe-tadl/network_morphism/algorithm/networkmorphism_searcher.py  +293 -0
  74. dubhe-tadl/network_morphism/algorithm/nn.py  +227 -0
  75. dubhe-tadl/network_morphism/datasets.py  +64 -0
  76. dubhe-tadl/network_morphism/network_morphism_retrain.py  +57 -0
  77. dubhe-tadl/network_morphism/network_morphism_select.py  +22 -0
  78. dubhe-tadl/network_morphism/network_morphism_train.py  +87 -0
  79. dubhe-tadl/network_morphism/network_morphism_trainer.py  +175 -0
  80. dubhe-tadl/network_morphism/utils.py  +102 -0
  81. dubhe-tadl/pcdarts/__init__.py  +1 -0
  82. dubhe-tadl/pcdarts/model.py  +227 -0
  83. dubhe-tadl/pcdarts/pcdarts_retrain.py  +204 -0
  84. dubhe-tadl/pcdarts/pcdarts_select.py  +21 -0
  85. dubhe-tadl/pcdarts/pcdarts_train.py  +93 -0
  86. dubhe-tadl/pcdarts/pcdartsmutator.py  +146 -0
  87. dubhe-tadl/pcdarts/reademe.md  +81 -0
  88. dubhe-tadl/pdarts/__init__.py  +2 -0
  89. dubhe-tadl/pdarts/model.py  +177 -0
  90. dubhe-tadl/pdarts/pdarts_retrain.py  +203 -0
  91. dubhe-tadl/pdarts/pdarts_select.py  +22 -0
  92. dubhe-tadl/pdarts/pdarts_train.py  +79 -0
  93. dubhe-tadl/pdarts/pdartsmutator.py  +201 -0
  94. dubhe-tadl/pdarts/pdartstrainer.py  +167 -0
  95. dubhe-tadl/pdarts/reademe.md  +92 -0
  96. dubhe-tadl/retrainer.py  +46 -0
  97. dubhe-tadl/selector.py  +56 -0
  98. dubhe-tadl/spos/algorithms/random/__init__.py  +1 -0
  99. dubhe-tadl/spos/algorithms/random/mutator.py  +36 -0
  100. dubhe-tadl/spos/algorithms/spos/__init__.py  +3 -0

dubhe-tadl/__init__.py  +2 -0

@@ -0,0 +1,2 @@
# from .log import init_logger
# init_logger()

dubhe-tadl/base_mutator.py  +159 -0

@@ -0,0 +1,159 @@
# Copyright (c) Microsoft Corporation.
# Licensed under the MIT license.

import logging

import torch.nn as nn
from .mutables import Mutable, MutableScope, InputChoice
from .utils import StructuredMutableTreeNode

logger = logging.getLogger(__name__)
logger.setLevel(logging.INFO)


class BaseMutator(nn.Module):
"""
A mutator is responsible for mutating a graph by obtaining the search space from the network and implementing
callbacks that are called in ``forward`` in mutables.

Parameters
----------
model : nn.Module
PyTorch model to apply mutator on.
"""

def __init__(self, model):
super().__init__()
self.__dict__["model"] = model
self._structured_mutables = self._parse_search_space(self.model)

def _parse_search_space(self, module, root=None, prefix="", memo=None, nested_detection=None):
if memo is None:
memo = set()
if root is None:
root = StructuredMutableTreeNode(None)
if module not in memo:
memo.add(module)

if isinstance(module, Mutable):
if nested_detection is not None:
raise RuntimeError("Cannot have nested search space. Error at {} in {}"
.format(module, nested_detection))
module.name = prefix

module.set_mutator(self)
root = root.add_child(module)
if not isinstance(module, MutableScope):
nested_detection = module
if isinstance(module, InputChoice):
for k in module.choose_from:
if k != InputChoice.NO_KEY and k not in [m.key for m in memo if isinstance(m, Mutable)]:
raise RuntimeError("'{}' required by '{}' not found in keys that appeared before, and is not NO_KEY."
.format(k, module.key))
for name, submodule in module._modules.items():

if submodule is None:
continue
submodule_prefix = prefix + ("." if prefix else "") + name
self._parse_search_space(submodule, root, submodule_prefix, memo=memo,
nested_detection=nested_detection)
return root

@property
def mutables(self):
"""
A generator of all modules inheriting :class:`~nni.nas.pytorch.mutables.Mutable`.
Modules are yielded in the order that they are defined in ``__init__``.
For mutables whose keys appear multiple times, only the first occurrence is yielded.
"""
return self._structured_mutables

@property
def undedup_mutables(self):
return self._structured_mutables.traverse(deduplicate=False)

def forward(self, *inputs):
"""
Warnings
--------
Don't call forward of a mutator.
"""
raise RuntimeError("Forward is undefined for mutators.")

def __setattr__(self, name, value):
if name == "model":
raise AttributeError("Attribute `model` can be set at most once, and you shouldn't use `self.model = model` to "
"include your network, as that would register all of the model's parameters in the mutator.")
return super().__setattr__(name, value)

def enter_mutable_scope(self, mutable_scope):
"""
Callback when forward of a MutableScope is entered.

Parameters
----------
mutable_scope : MutableScope
The mutable scope that is entered.
"""
pass

def exit_mutable_scope(self, mutable_scope):
"""
Callback when forward of a MutableScope is exited.

Parameters
----------
mutable_scope : MutableScope
The mutable scope that is exited.
"""
pass

def on_forward_layer_choice(self, mutable, *args, **kwargs):
"""
Callbacks of forward in LayerChoice.

Parameters
----------
mutable : LayerChoice
Module whose forward is called.
args : list of torch.Tensor
The arguments of its forward function.
kwargs : dict
The keyword arguments of its forward function.

Returns
-------
tuple of torch.Tensor and torch.Tensor
Output tensor and mask.
"""
raise NotImplementedError

def on_forward_input_choice(self, mutable, tensor_list):
"""
Callbacks of forward in InputChoice.

Parameters
----------
mutable : InputChoice
Mutable that is called.
tensor_list : list of torch.Tensor
The arguments mutable is called with.

Returns
-------
tuple of torch.Tensor and torch.Tensor
Output tensor and mask.
"""
raise NotImplementedError

def export(self):
"""
Export the data of all decisions. This should output the decisions of all the mutables, so that the whole
network can be fully determined with these decisions for further training from scratch.

Returns
-------
dict
Mappings from mutable keys to decisions.
"""
raise NotImplementedError
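
# ------------------------------------------------------------------
# Illustrative sketch (not part of the original file): a minimal mutator
# that always activates the first candidate of every LayerChoice and
# forwards the first input of every InputChoice. It only relies on the
# callback contract documented above; the class name is hypothetical.
#
# import torch
#
# class FirstChoiceMutator(BaseMutator):
#     def on_forward_layer_choice(self, mutable, *args, **kwargs):
#         mask = torch.zeros(len(mutable), dtype=torch.bool)
#         mask[0] = True
#         return mutable[0](*args, **kwargs), mask   # run the first candidate only
#
#     def on_forward_input_choice(self, mutable, tensor_list):
#         mask = torch.zeros(mutable.n_candidates, dtype=torch.bool)
#         mask[0] = True
#         return tensor_list[0], mask                # forward the first input only
#
#     def export(self):
#         # one-hot decision on the first candidate for every sized mutable
#         return {m.key: [i == 0 for i in range(len(m))]
#                 for m in self.mutables if hasattr(m, "__len__")}
# ------------------------------------------------------------------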

dubhe-tadl/base_trainer.py  +40 -0

@@ -0,0 +1,40 @@
# Copyright (c) Microsoft Corporation.
# Licensed under the MIT license.

from abc import ABC, abstractmethod


class BaseTrainer(ABC):

@abstractmethod
def train(self):
"""
Override the method to train.
"""
raise NotImplementedError

@abstractmethod
def validate(self):
"""
Override the method to validate.
"""
raise NotImplementedError

@abstractmethod
def export(self, file):
"""
Override the method to export to file.

Parameters
----------
file : str
File path to export to.
"""
raise NotImplementedError

@abstractmethod
def checkpoint(self):
"""
Override to dump a checkpoint.
"""
raise NotImplementedError
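
# ------------------------------------------------------------------
# Illustrative sketch (not part of the original file): the smallest
# concrete trainer satisfying this abstract interface. The bodies are
# placeholders; the real trainers added in this commit (e.g.
# classic_nas/trainer.py) implement full training loops.
#
# class NoopTrainer(BaseTrainer):
#     def train(self):
#         pass
#
#     def validate(self):
#         pass
#
#     def export(self, file):
#         with open(file, "w") as f:
#             f.write("{}")   # export an empty architecture
#
#     def checkpoint(self):
#         pass
# ------------------------------------------------------------------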

dubhe-tadl/callbacks.py  +167 -0

@@ -0,0 +1,167 @@
# Copyright (c) Microsoft Corporation.
# Licensed under the MIT license.

import logging
import os

import torch
import torch.nn as nn

_logger = logging.getLogger(__name__)
_logger.setLevel(logging.INFO)

class Callback:
"""
Callback provides an easy way to react to events like begin/end of epochs.
"""

def __init__(self):
self.model = None
self.optimizer = None
self.mutator = None
self.trainer = None

def build(self, model, optimizer, mutator, trainer):
"""
Callback needs to be built with model, optimizer, mutator and trainer, to get updates from them.

Parameters
----------
model : nn.Module
Model to be trained.
optimizer : Optimizer
Optimizer used to update the model weights.
mutator : nn.Module
Mutator that mutates the model.
trainer : BaseTrainer
Trainer that is to call the callback.
"""
self.model = model
self.optimizer = optimizer
self.mutator = mutator
self.trainer = trainer

def on_epoch_begin(self, epoch):
"""
Implement this to do something at the beginning of an epoch.

Parameters
----------
epoch : int
Epoch number, starting from 0.
"""
pass

def on_epoch_end(self, epoch):
"""
Implement this to do something at the end of an epoch.

Parameters
----------
epoch : int
Epoch number, starting from 0.
"""
pass

def on_batch_begin(self, epoch):
pass

def on_batch_end(self, epoch):
pass
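
# ------------------------------------------------------------------
# Illustrative sketch (not part of the original file): a custom callback
# that simply logs epoch boundaries. The concrete callbacks below
# (LRSchedulerCallback, ArchitectureCheckpoint, ...) follow the same
# pattern of overriding the hooks they care about.
#
# class EpochLogger(Callback):
#     def on_epoch_begin(self, epoch):
#         _logger.info("Epoch %d started", epoch)
#
#     def on_epoch_end(self, epoch):
#         _logger.info("Epoch %d finished", epoch)
# ------------------------------------------------------------------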


class LRSchedulerCallback(Callback):
"""
Calls ``scheduler.step()`` at the end of every epoch.

Parameters
----------
scheduler : LRScheduler
Scheduler to be called.
"""
def __init__(self, scheduler, mode="epoch"):
super().__init__()
assert mode == "epoch"
self.scheduler = scheduler
self.mode = mode

def on_epoch_end(self, epoch):
"""
Call ``self.scheduler.step()`` on epoch end.
"""
self.scheduler.step()


class ArchitectureCheckpoint(Callback):
"""
Calls ``trainer.export()`` at the end of every epoch.

Parameters
----------
checkpoint_dir : str
Location to save checkpoints.
"""
def __init__(self, checkpoint_dir):
super().__init__()
self.checkpoint_dir = checkpoint_dir
os.makedirs(self.checkpoint_dir, exist_ok=True)

def on_epoch_end(self, epoch):
"""
Dump the architecture to ``{checkpoint_dir}/epoch_{number}.json`` at the end of each epoch.
"""
dest_path = os.path.join(self.checkpoint_dir, "epoch_{}.json".format(epoch))
_logger.info("Saving architecture to %s", dest_path)
self.trainer.export(dest_path)

class BestArchitectureCheckpoint(Callback):
"""
Calls ``trainer.export()`` at the end of the final epoch.

Parameters
----------
checkpoint_path : str
Location to save the exported architecture.
epoches : int
Total number of training epochs; the export is triggered when the last one ends.
"""
def __init__(self, checkpoint_path, epoches):
super().__init__()
self.epoches = epoches
self.checkpoint_path = checkpoint_path

def on_epoch_end(self, epoch):
"""
Dump the selected architecture to ``checkpoint_path`` when the final epoch ends.
"""
if epoch == self.epoches -1:
_logger.info("Saving architecture to %s", self.checkpoint_path)
self.trainer.export(self.checkpoint_path)


class ModelCheckpoint(Callback):
"""
Saves the model and optimizer state at the end of every epoch.

Parameters
----------
checkpoint_dir : str
Location to save checkpoints.
"""
def __init__(self, checkpoint_dir):
super().__init__()
self.checkpoint_dir = checkpoint_dir
os.makedirs(self.checkpoint_dir, exist_ok=True)

def on_epoch_end(self, epoch):
"""
Dump the model to ``{checkpoint_dir}/epoch_{number}.pth.tar`` at the end of every epoch.
For ``DataParallel`` models, the inner module's state dict is saved.
"""
if isinstance(self.model, nn.DataParallel):
child_model_state_dict = self.model.module.state_dict()
else:
child_model_state_dict = self.model.state_dict()

save_state = {'child_model_state_dict': child_model_state_dict,
'optimizer_state_dict': self.optimizer.state_dict(),
'epoch': epoch}

dest_path = os.path.join(self.checkpoint_dir, "epoch_{}.pth.tar".format(epoch))
_logger.info("Saving model to %s", dest_path)
torch.save(save_state, dest_path)

dubhe-tadl/classic_nas/fixed.py  +150 -0

@@ -0,0 +1,150 @@
# Copyright (c) Microsoft Corporation.
# Licensed under the MIT license.

import json
import logging

import sys
sys.path.append('..'+ '/' + '..')
from pytorch.mutables import InputChoice, LayerChoice, MutableScope
from pytorch.mutator import Mutator
from pytorch.utils import to_list


_logger = logging.getLogger(__name__)
#_logger.setLevel(logging.INFO)

class FixedArchitecture(Mutator):
"""
Fixed architecture mutator that always selects a certain graph.

Parameters
----------
model : nn.Module
A mutable network.
fixed_arc : dict
Preloaded architecture object.
strict : bool
Force everything that appears in ``fixed_arc`` to be used at least once.
"""

def __init__(self, model, fixed_arc, strict=True):
super().__init__(model)
self._fixed_arc = fixed_arc

mutable_keys = set([mutable.key for mutable in self.mutables if not isinstance(mutable, MutableScope)])
fixed_arc_keys = set(self._fixed_arc.keys())
if fixed_arc_keys - mutable_keys:
raise RuntimeError("Unexpected keys found in fixed architecture: {}.".format(fixed_arc_keys - mutable_keys))
if mutable_keys - fixed_arc_keys:
raise RuntimeError("Missing keys in fixed architecture: {}.".format(mutable_keys - fixed_arc_keys))
self._fixed_arc = self._from_human_readable_architecture(self._fixed_arc)

def _from_human_readable_architecture(self, human_arc):
# convert from an exported architecture
#print('human_arc',human_arc)
result_arc = {k: to_list(v) for k, v in human_arc.items()} # there could be tensors, numpy arrays, etc.
#print('result_arc',result_arc)
# First, convert non-list values to lists, because there could be {"op1": 0} or {"op1": "conv"},
# which mean {"op1": [0, ]} or {"op1": ["conv", ]}.
result_arc = {k: v['_value'] if isinstance(v['_value'], list) else [v['_value']] for k, v in result_arc.items()}
# Second, infer which ones are multi-hot arrays and which ones are in human-readable format.
# This is non-trivial: if an array is [0, 1], we cannot know for sure whether it means [false, true] or [true, true].
# Here, we assume a multi-hot array has to be a boolean or float array whose length matches the number of candidates.
for mutable in self.mutables:

if mutable.key not in result_arc:
continue # skip silently
choice_arr = result_arc[mutable.key]

if all(isinstance(v, bool) for v in choice_arr) or all(isinstance(v, float) for v in choice_arr):
if (isinstance(mutable, LayerChoice) and len(mutable) == len(choice_arr)) or \
(isinstance(mutable, InputChoice) and mutable.n_candidates == len(choice_arr)):
# multihot, do nothing
continue
if isinstance(mutable, LayerChoice):

choice_arr = [mutable.names.index(val) if isinstance(val, str) else val for val in choice_arr]

choice_arr = [i in choice_arr for i in range(len(mutable))]

elif isinstance(mutable, InputChoice):
choice_arr = [mutable.choose_from.index(val) if isinstance(val, str) else val for val in choice_arr]
choice_arr = [i in choice_arr for i in range(mutable.n_candidates)]
result_arc[mutable.key] = choice_arr
return result_arc
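
# Illustrative example of the conversion above (keys and sizes are hypothetical):
# for a 2-candidate LayerChoice "first_conv" (names ["conv5x5", "conv3x3"]) and a
# 2-candidate InputChoice "skip",
#   {"first_conv": {"_value": "conv5x5"}, "skip": {"_value": [1]}}
# is converted to the multi-hot form
#   {"first_conv": [True, False], "skip": [False, True]}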

def sample_search(self):
"""
Always returns the fixed architecture.
"""
return self._fixed_arc

def sample_final(self):
"""
Always returns the fixed architecture.
"""
return self._fixed_arc

def replace_layer_choice(self, module=None, prefix=""):
"""
Replace layer choices with the selected candidates, on a best-effort basis.
For weighted or multiple choices, candidates whose weight is zero are removed.
For a single choice, the whole ``LayerChoice`` module is replaced with the chosen candidate.

Parameters
----------
module : nn.Module
Module to be processed.
prefix : str
Module name under global namespace.
"""
if module is None:
module = self.model
for name, mutable in module.named_children():
global_name = (prefix + "." if prefix else "") + name
if isinstance(mutable, LayerChoice):
chosen = self._fixed_arc[mutable.key]
if sum(chosen) == 1 and max(chosen) == 1 and not mutable.return_mask:
# sum is one, max is one, there has to be an only one
# this is compatible with both integer arrays, boolean arrays and float arrays
_logger.info("Replacing %s with candidate number %d.", global_name, chosen.index(1))
setattr(module, name, mutable[chosen.index(1)])
else:
if mutable.return_mask:
_logger.info("`return_mask` flag of %s is true. As it relies on the behavior of LayerChoice, "
"LayerChoice will not be replaced.", global_name)
# remove unused parameters
for ch, n in zip(chosen, mutable.names):
if ch == 0 and not isinstance(ch, float):
setattr(mutable, n, None)
else:
self.replace_layer_choice(mutable, global_name)


def apply_fixed_architecture(model, fixed_arc):
"""
Load architecture from `fixed_arc` and apply to model.

Parameters
----------
model : torch.nn.Module
Model with mutables.
fixed_arc : str or dict
Path to the JSON that stores the architecture, or dict that stores the exported architecture.

Returns
-------
FixedArchitecture
Mutator that is responsible for fixing the graph.
"""

if isinstance(fixed_arc, str):
with open(fixed_arc) as f:
fixed_arc = json.load(f)
architecture = FixedArchitecture(model, fixed_arc)
architecture.reset()

# for the convenience of parameters counting
architecture.replace_layer_choice()
return architecture
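
# ------------------------------------------------------------------
# Illustrative usage (the file name is a placeholder), mirroring how
# retrainer.py applies a selected space before retraining:
#
#   model = Net(hidden_size=512)                        # model containing LayerChoice/InputChoice
#   apply_fixed_architecture(model, "best_selected_space.json")
#   # `model` is now fixed to the selected sub-network and can be
#   # retrained from scratch like an ordinary nn.Module.
# ------------------------------------------------------------------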

dubhe-tadl/classic_nas/mnist.py  +206 -0

@@ -0,0 +1,206 @@
"""
A deep MNIST classifier using convolutional layers.

This file is a modification of the official pytorch mnist example:
https://github.com/pytorch/examples/blob/master/mnist/main.py
"""

import os
import argparse
import logging
import sys
sys.path.append('..'+ '/' + '..')
from collections import OrderedDict


import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torchvision import datasets, transforms

from pytorch.mutables import LayerChoice, InputChoice
from mutator import ClassicMutator
import numpy as np
import time
import json

logger = logging.getLogger('mnist_AutoML')


class Net(nn.Module):
def __init__(self, hidden_size):
super(Net, self).__init__()
# two options of conv1
self.conv1 = LayerChoice(OrderedDict([
("conv5x5", nn.Conv2d(1, 20, 5, 1)),
("conv3x3", nn.Conv2d(1, 20, 3, 1))
]), key='first_conv')
# two options of mid_conv
self.mid_conv = LayerChoice([
nn.Conv2d(20, 20, 3, 1, padding=1),
nn.Conv2d(20, 20, 5, 1, padding=2)
], key='mid_conv')
self.conv2 = nn.Conv2d(20, 50, 5, 1)
self.fc1 = nn.Linear(4*4*50, hidden_size)
self.fc2 = nn.Linear(hidden_size, 10)
# skip connection over mid_conv
self.input_switch = InputChoice(n_candidates=2,
n_chosen=1,
key='skip')

def forward(self, x):
x = F.relu(self.conv1(x))
x = F.max_pool2d(x, 2, 2)
old_x = x
x = F.relu(self.mid_conv(x))
zero_x = torch.zeros_like(old_x)
skip_x = self.input_switch([zero_x, old_x])
x = torch.add(x, skip_x)
x = F.relu(self.conv2(x))
x = F.max_pool2d(x, 2, 2)
x = x.view(-1, 4*4*50)
x = F.relu(self.fc1(x))
x = self.fc2(x)
return F.log_softmax(x, dim=1)


def train(args, model, device, train_loader, optimizer, epoch):
model.train()
for batch_idx, (data, target) in enumerate(train_loader):
data, target = data.to(device), target.to(device)
optimizer.zero_grad()
output = model(data)
loss = F.nll_loss(output, target)
loss.backward()
optimizer.step()
if batch_idx % args['log_interval'] == 0:
logger.info('Train Epoch: {} [{}/{} ({:.0f}%)]\tLoss: {:.6f}'.format(
epoch, batch_idx * len(data), len(train_loader.dataset),
100. * batch_idx / len(train_loader), loss.item()))


def test(args, model, device, test_loader):
model.eval()
test_loss = 0
correct = 0
with torch.no_grad():
for data, target in test_loader:
data, target = data.to(device), target.to(device)
output = model(data)
# sum up batch loss
test_loss += F.nll_loss(output, target, reduction='sum').item()
# get the index of the max log-probability
pred = output.argmax(dim=1, keepdim=True)
correct += pred.eq(target.view_as(pred)).sum().item()

test_loss /= len(test_loader.dataset)

accuracy = 100. * correct / len(test_loader.dataset)

logger.info('\nTest set: Average loss: {:.4f}, Accuracy: {}/{} ({:.0f}%)\n'.format(
test_loss, correct, len(test_loader.dataset), accuracy))

return accuracy


def main(args):
global_result = {'accuracy': []}
use_cuda = not args['no_cuda'] and torch.cuda.is_available()

torch.manual_seed(args['seed'])

device = torch.device("cuda" if use_cuda else "cpu")

kwargs = {'num_workers': 1, 'pin_memory': True} if use_cuda else {}

data_dir = args['data_dir']

train_loader = torch.utils.data.DataLoader(
datasets.MNIST(data_dir, train=True, download=True,
transform=transforms.Compose([
transforms.ToTensor(),
transforms.Normalize((0.1307,), (0.3081,))
])),
batch_size=args['batch_size'], shuffle=True, **kwargs)
test_loader = torch.utils.data.DataLoader(
datasets.MNIST(data_dir, train=False, transform=transforms.Compose([
transforms.ToTensor(),
transforms.Normalize((0.1307,), (0.3081,))
])),
batch_size=1000, shuffle=True, **kwargs)

hidden_size = args['hidden_size']

model = Net(hidden_size=hidden_size).to(device)
#np.random.seed(42)
#x = np.random.rand(2,1,28,28).astype(np.float32)
#x= torch.from_numpy(x).to(device)
ClassicMutator(model,trial_id=args['trial_id'],selected_path=args["selected_space_path"],search_space_path=args["search_space_path"])
#y=model(x)
#print(y)

optimizer = optim.SGD(model.parameters(), lr=args['lr'],
momentum=args['momentum'])

for epoch in range(1, args['epochs'] + 1):


train(args, model, device, train_loader, optimizer, epoch)
test_acc = test(args, model, device, test_loader)
print({"type":"accuracy","result":{"sequence":epoch,"category":"epoch","value":test_acc}} )
global_result['accuracy'].append(test_acc)

return global_result

def dump_global_result(args,global_result):
with open(args['result_path'], "w") as ss_file:
json.dump(global_result, ss_file, sort_keys=True, indent=2)

def get_params():
# Training settings
parser = argparse.ArgumentParser(description='PyTorch MNIST Example')
parser.add_argument("--data_dir", type=str,
default='./data', help="data directory")
parser.add_argument("--selected_space_path", type=str,
default='./selected_space.json', help="selected_space_path")
parser.add_argument("--search_space_path", type=str,
default='./selected_space.json', help="search_space_path")
parser.add_argument("--result_path", type=str,
default='./result.json', help="result_path")
parser.add_argument('--batch_size', type=int, default=64, metavar='N',
help='input batch size for training (default: 64)')
parser.add_argument("--hidden_size", type=int, default=512, metavar='N',
help='hidden layer size (default: 512)')
parser.add_argument('--lr', type=float, default=0.01, metavar='LR',
help='learning rate (default: 0.01)')
parser.add_argument('--momentum', type=float, default=0.5, metavar='M',
help='SGD momentum (default: 0.5)')
parser.add_argument('--epochs', type=int, default=10, metavar='N',
help='number of epochs to train (default: 10)')
parser.add_argument('--seed', type=int, default=1, metavar='S',
help='random seed (default: 1)')
parser.add_argument('--no_cuda', action='store_true', default=False,
help='disables CUDA training')
parser.add_argument('--log_interval', type=int, default=1000, metavar='N',
help='how many batches to wait before logging training status')
parser.add_argument('--trial_id', type=int, default=0, metavar='N',
help='trial_id,start from 0')

args, _ = parser.parse_known_args()
return args


if __name__ == '__main__':
try:
start=time.time()
params = vars(get_params())
global_result = main(params)
global_result['cost_time'] = str(time.time() - start) +'s'
dump_global_result(params,global_result)
except Exception as exception:
logger.exception(exception)
raise

dubhe-tadl/classic_nas/model.py  +52 -0

@@ -0,0 +1,52 @@
import os
import argparse
import logging
import sys

from collections import OrderedDict

import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torchvision import datasets, transforms

from pytorch.mutables import LayerChoice, InputChoice
from mutator import ClassicMutator
import numpy as np

class Net(nn.Module):
def __init__(self, hidden_size):
super(Net, self).__init__()
# two options of conv1
self.conv1 = LayerChoice(OrderedDict([
("conv5x5", nn.Conv2d(1, 20, 5, 1)),
("conv3x3", nn.Conv2d(1, 20, 3, 1))
]), key='conv1')
# two options of mid_conv
self.mid_conv = LayerChoice(OrderedDict([
("conv3x3",nn.Conv2d(20, 20, 3, 1, padding=1)),
("conv5x5",nn.Conv2d(20, 20, 5, 1, padding=2))
]), key='mid_conv')
self.conv2 = nn.Conv2d(20, 50, 5, 1)
self.fc1 = nn.Linear(4*4*50, hidden_size)
self.fc2 = nn.Linear(hidden_size, 10)
# skip connection over mid_conv
self.input_switch = InputChoice(n_candidates=2,
n_chosen=1,
key='skip')

def forward(self, x):
x = F.relu(self.conv1(x))
x = F.max_pool2d(x, 2, 2)
old_x = x
x = F.relu(self.mid_conv(x))
zero_x = torch.zeros_like(old_x)
skip_x = self.input_switch([zero_x, old_x])
x = torch.add(x, skip_x)
x = F.relu(self.conv2(x))
x = F.max_pool2d(x, 2, 2)
x = x.view(-1, 4*4*50)
x = F.relu(self.fc1(x))
x = self.fc2(x)
return F.log_softmax(x, dim=1)

dubhe-tadl/classic_nas/mutator.py  +260 -0

@@ -0,0 +1,260 @@
# Copyright (c) Microsoft Corporation.
# Licensed under the MIT license.

import json
import logging
import os
import sys
sys.path.append('..'+ '/' + '..')
import torch



from pytorch.mutables import LayerChoice, InputChoice, MutableScope
from pytorch.mutator import Mutator
import numpy as np
import random
logger = logging.getLogger(__name__)

NNI_GEN_SEARCH_SPACE = "NNI_GEN_SEARCH_SPACE"
NNI_PLATFORM = "GPU"
LAYER_CHOICE = "layer_choice"
INPUT_CHOICE = "input_choice"


def get_and_apply_next_architecture(model):
"""
Wrapper of :class:`~nni.nas.pytorch.classic_nas.mutator.ClassicMutator` to make it more meaningful,
similar to ``get_next_parameter`` for HPO.

It will generate search space based on ``model``.
If env ``NNI_GEN_SEARCH_SPACE`` exists, this is in dry run mode for
generating search space for the experiment.
If not, there are still two modes: one is the nni experiment mode, where users
use ``nnictl`` to start an experiment; the other is standalone mode, where users
run the trial command directly and the first candidate(s) of each LayerChoice
and InputChoice are chosen.

Parameters
----------
model : nn.Module
User's model with search space (e.g., LayerChoice, InputChoice) embedded in it.
"""
ClassicMutator(model)


class ClassicMutator(Mutator):
"""
This mutator is to apply the architecture chosen from tuner.
It implements the forward function of LayerChoice and InputChoice,
to only activate the chosen ones.

Parameters
----------
model : nn.Module
User's model with search space (e.g., LayerChoice, InputChoice) embedded in it.
"""

def __init__(self, model,trial_id,selected_path,search_space_path,load_selected_space=False):
super(ClassicMutator, self).__init__(model)
self._chosen_arch = {}
self._search_space = self._generate_search_space()
self.trial_id = trial_id
#if NNI_GEN_SEARCH_SPACE in os.environ:
# dry run for only generating search space
self._dump_search_space(search_space_path)
#sys.exit(0)

if load_selected_space:
logger.warning("load selected space.")
# assumed intent: load a previously selected architecture from `selected_path`
with open(selected_path) as f:
self._chosen_arch = json.load(f)
else:
# get chosen arch from tuner
self._chosen_arch = self.random_generate_chosen()


self._generate_selected_space(selected_path)
self.reset()

def _sample_layer_choice(self, mutable, idx, value, search_space_item):
"""
Convert layer choice to tensor representation.

Parameters
----------
mutable : Mutable
idx : int
Number `idx` of list will be selected.
value : str
The verbose representation of the selected value.
search_space_item : list
The list for corresponding search space.
"""
# doesn't support multihot for layer choice yet
onehot_list = [False] * len(mutable)
assert 0 <= idx < len(mutable) and search_space_item[idx] == value, \
"Index '{}' in search space '{}' is not '{}'".format(idx, search_space_item, value)
onehot_list[idx] = True
return torch.tensor(onehot_list, dtype=torch.bool) # pylint: disable=not-callable

def _sample_input_choice(self, mutable, idx, value, search_space_item):
"""
Convert input choice to tensor representation.

Parameters
----------
mutable : Mutable
idx : int
Number `idx` of list will be selected.
value : str
The verbose representation of the selected value.
search_space_item : list
The list for corresponding search space.
"""
candidate_repr = search_space_item["candidates"]
multihot_list = [False] * mutable.n_candidates
for i, v in zip(idx, value):
assert 0 <= i < mutable.n_candidates and candidate_repr[i] == v, \
"Index '{}' in search space '{}' is not '{}'".format(i, candidate_repr, v)
assert not multihot_list[i], "'{}' is selected twice in '{}', which is not allowed.".format(i, idx)
multihot_list[i] = True
return torch.tensor(multihot_list, dtype=torch.bool) # pylint: disable=not-callable

def sample_search(self):
"""
See :meth:`sample_final`.
"""
return self.sample_final()

def sample_final(self):
"""
Convert the chosen arch and apply it on model.
"""
assert set(self._chosen_arch.keys()) == set(self._search_space.keys()), \
"Unmatched keys, expected keys '{}' from search space, found '{}'.".format(self._search_space.keys(),
self._chosen_arch.keys())
result = dict()
for mutable in self.mutables:
if isinstance(mutable, (LayerChoice, InputChoice)):
assert mutable.key in self._chosen_arch, \
"Expected '{}' in chosen arch, but not found.".format(mutable.key)
data = self._chosen_arch[mutable.key]
assert isinstance(data, dict) and "_value" in data and "_idx" in data, \
"'{}' is not a valid choice.".format(data)
if isinstance(mutable, LayerChoice):
result[mutable.key] = self._sample_layer_choice(mutable, data["_idx"], data["_value"],
self._search_space[mutable.key]["_value"])
elif isinstance(mutable, InputChoice):
result[mutable.key] = self._sample_input_choice(mutable, data["_idx"], data["_value"],
self._search_space[mutable.key]["_value"])
elif isinstance(mutable, MutableScope):
logger.info("Mutable scope '%s' is skipped during parsing choices.", mutable.key)
else:
raise TypeError("Unsupported mutable type: '%s'." % type(mutable))
return result

def _standalone_generate_chosen(self):
"""
Generate the chosen architecture for standalone mode,
i.e., choose the first one(s) for LayerChoice and InputChoice.
::
{ key_name: {"_value": "conv1",
"_idx": 0} }
{ key_name: {"_value": ["in1"],
"_idx": [0]} }
Returns
-------
dict
the chosen architecture
"""
chosen_arch = {}
for key, val in self._search_space.items():
if val["_type"] == LAYER_CHOICE:
choices = val["_value"]
chosen_arch[key] = {"_value": choices[0], "_idx": 0}
elif val["_type"] == INPUT_CHOICE:
choices = val["_value"]["candidates"]
n_chosen = val["_value"]["n_chosen"]
if n_chosen is None:
n_chosen = len(choices)
chosen_arch[key] = {"_value": choices[:n_chosen], "_idx": list(range(n_chosen))}
else:
raise ValueError("Unknown key '%s' and value '%s'." % (key, val))
return chosen_arch

def random_generate_chosen(self):
"""
Randomly generate a chosen architecture, seeded with ``trial_id``,
i.e., pick a random candidate for each LayerChoice and a random subset of candidates for each InputChoice.
::
{ key_name: {"_value": "conv1",
"_idx": 0} }
{ key_name: {"_value": ["in1"],
"_idx": [0]} }
Returns
-------
dict
the chosen architecture
"""
chosen_arch = {}
np.random.seed(self.trial_id)
random.seed(self.trial_id)
for key, val in self._search_space.items():
if val["_type"] == LAYER_CHOICE:
choices = val["_value"]
chosen_idx = np.random.randint(len(choices))
chosen_arch[key] = {"_value": choices[chosen_idx], "_idx": chosen_idx}
elif val["_type"] == INPUT_CHOICE:
choices = val["_value"]["candidates"]
n_chosen = val["_value"]["n_chosen"]
if n_chosen is None:
n_chosen = len(choices)
# assumed intent: sample `n_chosen` distinct candidate indices from all candidates
chosen_idx = sorted(random.sample(range(len(choices)), n_chosen))
chosen_arch[key] = {"_value": [choices[idx] for idx in chosen_idx], "_idx": chosen_idx}
else:
raise ValueError("Unknown key '%s' and value '%s'." % (key, val))
return chosen_arch

def _generate_search_space(self):
"""
Generate search space from mutables.
Here is the search space format:
::
{ key_name: {"_type": "layer_choice",
"_value": ["conv1", "conv2"]} }
{ key_name: {"_type": "input_choice",
"_value": {"candidates": ["in1", "in2"],
"n_chosen": 1}} }
Returns
-------
dict
the generated search space
"""
search_space = {}
for mutable in self.mutables:
# for now we only generate flattened search space
if isinstance(mutable, LayerChoice):
key = mutable.key
val = mutable.names
search_space[key] = {"_type": LAYER_CHOICE, "_value": val}
elif isinstance(mutable, InputChoice):
key = mutable.key
search_space[key] = {"_type": INPUT_CHOICE,
"_value": {"candidates": mutable.choose_from,
"n_chosen": mutable.n_chosen}}
elif isinstance(mutable, MutableScope):
logger.info("Mutable scope '%s' is skipped during generating search space.", mutable.key)
else:
raise TypeError("Unsupported mutable type: '%s'." % type(mutable))

return search_space

def _dump_search_space(self, file_path):
with open(file_path, "w") as ss_file:
json.dump(self._search_space, ss_file, sort_keys=True, indent=2)

def _generate_selected_space(self,file_path):
with open(file_path, "w") as ss_file:
json.dump(self._chosen_arch, ss_file, sort_keys=True, indent=2)
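
# ------------------------------------------------------------------
# Illustrative usage (paths are placeholders), mirroring mnist.py:
#
#   model = Net(hidden_size=512)
#   ClassicMutator(model, trial_id=0,
#                  selected_path="./selected_space.json",
#                  search_space_path="./search_space.json")
#   # search_space.json now describes every LayerChoice/InputChoice in
#   # the model, and selected_space.json records the randomly sampled
#   # architecture that the model will use in its forward passes.
# ------------------------------------------------------------------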

dubhe-tadl/classic_nas/readme  +64 -0

@@ -0,0 +1,64 @@
stage1: python trainer.py --trial_id=1 --model_selected_space_path='./exp/train/2/model_selected_space.json' --search_space_path='./search_space.json' --result_path='./exp/train/2/result.json'
stage2: python selector.py --experiment_dir='./exp' --best_selected_space_path='./best_selected_space.json'
stage3: python retrainer.py --best_checkpoint_dir='experiment_id/' --best_selected_space_path='./best_selected_space.json' --result_path='result.json'
search_space.json:
{
"first_conv": {
"_type": "layer_choice",
"_value": [
"conv5x5",
"conv3x3"
]
},
"mid_conv": {
"_type": "layer_choice",
"_value": [
"0",
"1"
]
},
"skip": {
"_type": "input_choice",
"_value": {
"candidates": [
"",
""
],
"n_chosen": 1
}
}
}


selected_space.json:
{
"first_conv": {
"_idx": 0,
"_value": "conv5x5"
},
"mid_conv": {
"_idx": 0,
"_value": "0"
},
"skip": {
"_idx": [
0
],
"_value": [
""
]
}
}

result.json:
{'type': 'accuracy', 'result': {'sequence': 1, 'category': 'epoch', 'value': 96.73815907059875}}
{'type': 'accuracy', 'result': {'sequence': 2, 'category': 'epoch', 'value': 97.6988382484361}}
{'type': 'accuracy', 'result': {'sequence': 3, 'category': 'epoch', 'value': 98.63717605004469}}
{'type': 'accuracy', 'result': {'sequence': 4, 'category': 'epoch', 'value': 98.72654155495978}}
{'type': 'accuracy', 'result': {'sequence': 5, 'category': 'epoch', 'value': 99.27390527256479}}
{'type': 'accuracy', 'result': {'sequence': 6, 'category': 'epoch', 'value': 99.13985701519213}}
{'type': 'accuracy', 'result': {'sequence': 7, 'category': 'epoch', 'value': 99.3632707774799}}
{'type': 'accuracy', 'result': {'sequence': 8, 'category': 'epoch', 'value': 99.4414655942806}}
{'type': 'accuracy', 'result': {'sequence': 9, 'category': 'epoch', 'value': 99.67605004468275}}
{'type': 'accuracy', 'result': {'sequence': 10, 'category': 'epoch', 'value': 99.74307417336908}}


dubhe-tadl/classic_nas/retrainer.py  +288 -0

@@ -0,0 +1,288 @@
# Copyright (c) Microsoft Corporation.
# Licensed under the MIT license.

import copy
import logging
import os
import argparse
import logging
import sys
sys.path.append('..'+ '/' + '..')
from collections import OrderedDict


import torch
import torch.nn as nn
import torch.nn.functional as F

from torchvision import datasets, transforms
from model import Net

from pytorch.trainer import Trainer
from pytorch.utils import AverageMeterGroup
from pytorch.utils import mkdirs
from pytorch.mutables import LayerChoice, InputChoice
from fixed import apply_fixed_architecture

from mutator import ClassicMutator
from abc import ABC, abstractmethod
from pytorch.retrainer import Retrainer
import numpy as np
import time
import json

logger = logging.getLogger(__name__)
#logger.setLevel(logging.INFO)


class ClassicnasRetrainer(Retrainer):
"""
Classic NAS retrainer.

Parameters
----------
model : nn.Module
PyTorch model to be trained.
loss : callable
Receives logits and ground truth label, return a loss tensor.
metrics : callable
Receives logits and ground truth label, return a dict of metrics.
optimizer : Optimizer
The optimizer used for optimizing the model.
epochs : int
Number of epochs planned for training.
dataset_train : Dataset
Dataset for training. Will be split for training weights and architecture weights.
dataset_valid : Dataset
Dataset for testing.
mutator : ClassicMutator
Use in case of customizing your own ClassicMutator. By default will instantiate a ClassicMutator.
batch_size : int
Batch size.
workers : int
Workers for data loading.
device : torch.device
``torch.device("cpu")`` or ``torch.device("cuda")``.
log_frequency : int
Step count per logging.
callbacks : list of Callback
list of callbacks to trigger at events.
arc_learning_rate : float
Learning rate of architecture parameters.
unrolled : bool
``True`` if using second order optimization, else first order optimization.
"""
def __init__(self, model, loss, metrics,
optimizer, epochs, dataset_train, dataset_valid, search_space_path,selected_space_path,checkpoint_dir,trial_id,
mutator=None, batch_size=64, workers=4, device=None, log_frequency=None,
callbacks=None, arc_learning_rate=3.0E-4, unrolled=False):


self.model = model

self.loss = loss
self.metrics = metrics
self.optimizer = optimizer
self.epochs = epochs
self.device = device
self.batch_size = batch_size
self.checkpoint_dir =checkpoint_dir

self.train_loader = torch.utils.data.DataLoader(
datasets.MNIST(dataset_train, train=True, download=True,
transform=transforms.Compose([
transforms.ToTensor(),
transforms.Normalize((0.1307,), (0.3081,))
])),
batch_size=batch_size, shuffle=True, **kwargs)

self.test_loader = torch.utils.data.DataLoader(
datasets.MNIST(dataset_valid, train=False, transform=transforms.Compose([
transforms.ToTensor(),
transforms.Normalize((0.1307,), (0.3081,))
])),
batch_size=1000, shuffle=True, **kwargs)



self.search_space_path = search_space_path
self.selected_space_path =selected_space_path
self.trial_id = trial_id
self.result = {"accuracy": [],"cost_time": 0.}

def train(self):

# t1 = time()
# phase 1. architecture step
#print(self.model.state_dict)
apply_fixed_architecture(self.model, self.selected_space_path)
#print(self.model.state_dict)
# phase 2: child network step
for child_epoch in range(1, self.epochs + 1):

self.model.train()
for batch_idx, (data, target) in enumerate(self.train_loader):
data, target = data.to(self.device), target.to(self.device)
optimizer.zero_grad()
output = self.model(data)
loss = F.nll_loss(output, target)
loss.backward()
optimizer.step()
if batch_idx % args['log_interval'] == 0:
logger.info('Train Epoch: {} [{}/{} ({:.0f}%)]\tLoss: {:.6f}'.format(
child_epoch, batch_idx * len(data), len(self.train_loader.dataset),
100. * batch_idx / len(self.train_loader), loss.item()))

test_acc = self.validate()
print({"type":"accuracy","result":{"sequence":child_epoch,"category":"epoch","value":test_acc}} )
with open(args['result_path'], "a") as ss_file:
ss_file.write(json.dumps({"type":"accuracy","result":{"sequence":child_epoch,"category":"epoch","value":test_acc}} ) + '\n')
self.result['accuracy'].append(test_acc)

def validate(self):
self.model.eval()
test_loss = 0
correct = 0
with torch.no_grad():
for data, target in self.test_loader:
data, target = data.to(self.device), target.to(self.device)
output = self.model(data)
# sum up batch loss
test_loss += F.nll_loss(output, target, reduction='sum').item()
# get the index of the max log-probability
pred = output.argmax(dim=1, keepdim=True)
correct += pred.eq(target.view_as(pred)).sum().item()

test_loss /= len(self.test_loader.dataset)

accuracy = 100. * correct / len(self.test_loader.dataset)

logger.info('\nTest set: Average loss: {:.4f}, Accuracy: {}/{} ({:.0f}%)\n'.format(
test_loss, correct, len(self.test_loader.dataset), accuracy))

return accuracy

#
# def export(self, file):
# """
# Override the method to export to file.

# Parameters
# ----------
# file : str
# File path to export to.
# """
# raise NotImplementedError

def checkpoint(self):
"""
Override to dump a checkpoint.
"""
if isinstance(self.model, nn.DataParallel):
state_dict = self.model.module.state_dict()
else:
state_dict = self.model.state_dict()
if not os.path.exists(self.checkpoint_dir):
os.makedirs(self.checkpoint_dir)
dest_path = os.path.join(self.checkpoint_dir, f"best_checkpoint_epoch{self.epochs}.pth")
logger.info("Saving model to %s", dest_path)
torch.save(state_dict, dest_path)



def dump_global_result(args,global_result):
with open(args['result_path'], "w") as ss_file:
json.dump(global_result, ss_file, sort_keys=True, indent=2)

def get_params():
# Training settings
parser = argparse.ArgumentParser(description='PyTorch MNIST Example')
parser.add_argument("--data_dir", type=str,
default='./data', help="data directory")
parser.add_argument("--best_selected_space_path", type=str,
default='./selected_space.json', help="selected_space_path")
parser.add_argument("--search_space_path", type=str,
default='./selected_space.json', help="search_space_path")
parser.add_argument("--result_path", type=str,
default='./result.json', help="result_path")
parser.add_argument('--batch_size', type=int, default=64, metavar='N',
help='input batch size for training (default: 64)')
parser.add_argument("--hidden_size", type=int, default=512, metavar='N',
help='hidden layer size (default: 512)')
parser.add_argument('--lr', type=float, default=0.01, metavar='LR',
help='learning rate (default: 0.01)')
parser.add_argument('--momentum', type=float, default=0.5, metavar='M',
help='SGD momentum (default: 0.5)')
parser.add_argument('--epochs', type=int, default=10, metavar='N',
help='number of epochs to train (default: 10)')
parser.add_argument('--seed', type=int, default=1, metavar='S',
help='random seed (default: 1)')
parser.add_argument('--no_cuda', action='store_true', default=False,
help='disables CUDA training')
parser.add_argument('--log_interval', type=int, default=1000, metavar='N',
help='how many batches to wait before logging training status')
parser.add_argument("--best_checkpoint_dir",type=str,default="path/to/",
help="Path for saved checkpoints. (default: %(default)s)")
parser.add_argument('--trial_id', type=int, default=0, metavar='N',
help='trial_id,start from 0')

args, _ = parser.parse_known_args()
return args

if __name__ == '__main__':
try:
start=time.time()

params = vars(get_params())
args =params

use_cuda = not args['no_cuda'] and torch.cuda.is_available()

torch.manual_seed(args['seed'])

device = torch.device("cuda" if use_cuda else "cpu")

kwargs = {'num_workers': 1, 'pin_memory': True} if use_cuda else {}

data_dir = args['data_dir']

hidden_size = args['hidden_size']

model = Net(hidden_size=hidden_size).to(device)

optimizer = torch.optim.SGD(model.parameters(), lr=args['lr'],
momentum=args['momentum'])

#mkdirs(args['search_space_path'])
mkdirs(args['best_selected_space_path'])
mkdirs(args['result_path'])
trainer = ClassicnasRetrainer(model,
loss=None,
metrics=None,
optimizer=optimizer,
epochs=args['epochs'],
dataset_train=data_dir,
dataset_valid=data_dir,
search_space_path = args['search_space_path'],
selected_space_path = args['best_selected_space_path'],
checkpoint_dir = args['best_checkpoint_dir'],
trial_id = args['trial_id'],
batch_size=args['batch_size'],
log_frequency=args['log_interval'],
device= device,
unrolled=None,
callbacks=None)

with open(args['result_path'], "w") as ss_file:
ss_file.write('')
trainer.train()
trainer.checkpoint()
global_result = trainer.result
global_result['cost_time'] = str(time.time() - start) +'s'
#dump_global_result(params,global_result)
except Exception as exception:
logger.exception(exception)
raise

dubhe-tadl/classic_nas/selector.py  +58 -0

@@ -0,0 +1,58 @@
import sys
sys.path.append('../..')
from pytorch.selector import Selector
from pytorch.utils import mkdirs
import shutil
import argparse
import os
import json

class ClassicnasSelector(Selector):
def __init__(self, args, single_candidate=True):
super().__init__(single_candidate)
self.args = args

def fit(self):
"""
Scan all trials under ``experiment_dir/train``, pick the trial with the highest reported accuracy,
and copy its selected space file to ``best_selected_space_path``.
"""
train_dir = os.path.join(self.args['experiment_dir'],'train')
max_accuracy = 0
best_selected_space = ''
for trialId in os.listdir(train_dir):
path= os.path.join(train_dir,trialId,'result','result.json')
max_accuracy_trial = 0
with open(path,'r') as f:
for line in f:
result_dict = json.loads(line)
accuracy = result_dict["result"]["value"]
if accuracy>max_accuracy_trial:
max_accuracy_trial=accuracy
print(max_accuracy_trial)
if max_accuracy_trial > max_accuracy:
max_accuracy = max_accuracy_trial
best_selected_space = os.path.join(train_dir,trialId,'model_selected_space','model_selected_space.json')
print('best trial id:',trialId)
shutil.copyfile(best_selected_space,self.args['best_selected_space_path'])
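
# ------------------------------------------------------------------
# Directory layout that fit() expects (inferred from the path joins
# above; trial ids are placeholders):
#
#   experiment_dir/
#     train/
#       0/result/result.json                                # one JSON line per epoch
#       0/model_selected_space/model_selected_space.json
#       1/result/result.json
#       1/model_selected_space/model_selected_space.json
# ------------------------------------------------------------------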

def get_params():
# Training settings
parser = argparse.ArgumentParser(description='PyTorch MNIST Example')
parser.add_argument("--experiment_dir", type=str,
default='./experiment_dir', help="data directory")
parser.add_argument("--best_selected_space_path", type=str,
default='./best_selected_space.json', help="selected_space_path")

args, _ = parser.parse_known_args()
return args

if __name__ == "__main__":

params = vars(get_params())
args =params
mkdirs(args['best_selected_space_path'])

hpo_selector = ClassicnasSelector(args,single_candidate=False)
hpo_selector.fit()

dubhe-tadl/classic_nas/trainer.py  +269 -0

@@ -0,0 +1,269 @@
# Copyright (c) Microsoft Corporation.
# Licensed under the MIT license.

import copy
import logging
import os
import argparse
import logging
import sys
sys.path.append('..'+ '/' + '..')
from collections import OrderedDict


import torch
import torch.nn as nn
import torch.nn.functional as F

from torchvision import datasets, transforms
from model import Net

from pytorch.trainer import Trainer
from pytorch.utils import AverageMeterGroup
from pytorch.utils import mkdirs
from pytorch.mutables import LayerChoice, InputChoice

from mutator import ClassicMutator
import numpy as np
import time
import json

logger = logging.getLogger(__name__)
#logger.setLevel(logging.INFO)


class ClassicnasTrainer(Trainer):
"""
Classicnas trainer.

Parameters
----------
model : nn.Module
PyTorch model to be trained.
loss : callable
Receives logits and ground truth label, return a loss tensor.
metrics : callable
Receives logits and ground truth label, return a dict of metrics.
optimizer : Optimizer
The optimizer used for optimizing the model.
epochs : int
Number of epochs planned for training.
dataset_train : Dataset
Dataset for training. Will be split for training weights and architecture weights.
dataset_valid : Dataset
Dataset for testing.
mutator : ClassicMutator
Use in case of customizing your own ClassicMutator. By default will instantiate a ClassicMutator.
batch_size : int
Batch size.
workers : int
Workers for data loading.
device : torch.device
``torch.device("cpu")`` or ``torch.device("cuda")``.
log_frequency : int
Step count per logging.
callbacks : list of Callback
list of callbacks to trigger at events.
arc_learning_rate : float
Learning rate of architecture parameters.
unrolled : bool
``True`` if using second order optimization, else first order optimization.
"""
def __init__(self, model, loss, metrics,
optimizer, epochs, dataset_train, dataset_valid, search_space_path,selected_space_path,trial_id,
mutator=None, batch_size=64, workers=4, device=None, log_frequency=None,
callbacks=None, arc_learning_rate=3.0E-4, unrolled=False):


self.model = model

self.loss = loss
self.metrics = metrics
self.optimizer = optimizer
self.epochs = epochs
self.device = device
self.batch_size = batch_size

self.train_loader = torch.utils.data.DataLoader(
datasets.MNIST(dataset_train, train=True, download=False,
transform=transforms.Compose([
transforms.ToTensor(),
transforms.Normalize((0.1307,), (0.3081,))
])),
batch_size=batch_size, shuffle=True, **kwargs)

self.test_loader = torch.utils.data.DataLoader(
datasets.MNIST(dataset_valid, train=False, transform=transforms.Compose([
transforms.ToTensor(),
transforms.Normalize((0.1307,), (0.3081,))
])),
batch_size=1000, shuffle=True, **kwargs)



self.search_space_path = search_space_path
self.selected_space_path =selected_space_path
self.trial_id = trial_id
self.num_epochs = 10
self.classicmutator=ClassicMutator(self.model,trial_id=self.trial_id,selected_path=self.selected_space_path,search_space_path=self.search_space_path)

self.result = {"accuracy": [],"cost_time": 0.}

def train_one_epoch(self, epoch):

# t1 = time()
# phase 1. architecture step
self.classicmutator.trial_id = epoch
self.classicmutator._chosen_arch=self.classicmutator.random_generate_chosen()
#print('epoch:',epoch,'\n',self.classicmutator._chosen_arch)

# phase 2: child network step
for child_epoch in range(1, self.epochs + 1):

self.model.train()
for batch_idx, (data, target) in enumerate(self.train_loader):
data, target = data.to(self.device), target.to(self.device)
optimizer.zero_grad()
output = self.model(data)
loss = F.nll_loss(output, target)
loss.backward()
optimizer.step()
if batch_idx % args['log_interval'] == 0:
logger.info('Train Epoch: {} [{}/{} ({:.0f}%)]\tLoss: {:.6f}'.format(
child_epoch, batch_idx * len(data), len(self.train_loader.dataset),
100. * batch_idx / len(self.train_loader), loss.item()))

test_acc = self.validate_one_epoch(epoch)
print({"type":"accuracy","result":{"sequence":child_epoch,"category":"epoch","value":test_acc}} )
with open(args['result_path'], "a") as ss_file:
ss_file.write(json.dumps({"type":"accuracy","result":{"sequence":child_epoch,"category":"epoch","value":test_acc}} ) + '\n')
self.result['accuracy'].append(test_acc)

def validate_one_epoch(self, epoch):
self.model.eval()
test_loss = 0
correct = 0
with torch.no_grad():
for data, target in self.test_loader:
data, target = data.to(self.device), target.to(self.device)
output = self.model(data)
# sum up batch loss
test_loss += F.nll_loss(output, target, reduction='sum').item()
# get the index of the max log-probability
pred = output.argmax(dim=1, keepdim=True)
correct += pred.eq(target.view_as(pred)).sum().item()

test_loss /= len(self.test_loader.dataset)

accuracy = 100. * correct / len(self.test_loader.dataset)

logger.info('\nTest set: Average loss: {:.4f}, Accuracy: {}/{} ({:.0f}%)\n'.format(
test_loss, correct, len(self.test_loader.dataset), accuracy))

return accuracy
def train(self):
"""
Train for ``self.num_epochs`` epochs, sampling a new architecture and
training the child network in each epoch.
"""
for epoch in range(self.num_epochs):
# training
self.train_one_epoch(epoch)


def dump_global_result(args,global_result):
with open(args['result_path'], "w") as ss_file:
json.dump(global_result, ss_file, sort_keys=True, indent=2)

def get_params():
# Training settings
parser = argparse.ArgumentParser(description='PyTorch MNIST Example')
parser.add_argument("--data_dir", type=str,
default='./data', help="data directory")
parser.add_argument("--model_selected_space_path", type=str,
default='./selected_space.json', help="selected_space_path")
parser.add_argument("--search_space_path", type=str,
default='./selected_space.json', help="search_space_path")
parser.add_argument("--result_path", type=str,
default='./model_result.json', help="result_path")
parser.add_argument('--batch_size', type=int, default=64, metavar='N',
help='input batch size for training (default: 64)')
parser.add_argument("--hidden_size", type=int, default=512, metavar='N',
help='hidden layer size (default: 512)')
parser.add_argument('--lr', type=float, default=0.01, metavar='LR',
help='learning rate (default: 0.01)')
parser.add_argument('--momentum', type=float, default=0.5, metavar='M',
help='SGD momentum (default: 0.5)')
parser.add_argument('--epochs', type=int, default=10, metavar='N',
help='number of epochs to train (default: 10)')
parser.add_argument('--seed', type=int, default=1, metavar='S',
help='random seed (default: 1)')
parser.add_argument('--no_cuda', action='store_true', default=False,
help='disables CUDA training')
parser.add_argument('--log_interval', type=int, default=1000, metavar='N',
help='how many batches to wait before logging training status')
parser.add_argument('--trial_id', type=int, default=0, metavar='N',
help='trial_id,start from 0')

args, _ = parser.parse_known_args()
return args

if __name__ == '__main__':
try:
start=time.time()

params = vars(get_params())
args =params

use_cuda = not args['no_cuda'] and torch.cuda.is_available()

torch.manual_seed(args['seed'])

device = torch.device("cuda" if use_cuda else "cpu")

kwargs = {'num_workers': 1, 'pin_memory': True} if use_cuda else {}

data_dir = args['data_dir']

hidden_size = args['hidden_size']

model = Net(hidden_size=hidden_size).to(device)

optimizer = torch.optim.SGD(model.parameters(), lr=args['lr'],
momentum=args['momentum'])

mkdirs(args['search_space_path'])
mkdirs(args['model_selected_space_path'])
mkdirs(args['result_path'])
trainer = ClassicnasTrainer(model,
loss=None,
metrics=None,
optimizer=optimizer,
epochs=args['epochs'],
dataset_train=data_dir,
dataset_valid=data_dir,
search_space_path = args['search_space_path'],
selected_space_path = args['model_selected_space_path'],
trial_id = args['trial_id'],
batch_size=args['batch_size'],
log_frequency=args['log_interval'],
device= device,
unrolled=None,
callbacks=None)

with open(args['result_path'], "w") as ss_file:
ss_file.write('')
trainer.train_one_epoch(args['trial_id'])
#trainer.train()
global_result = trainer.result
#global_result['cost_time'] = str(time.time() - start) +'s'
#dump_global_result(params,global_result)
except Exception as exception:
logger.exception(exception)
raise

dubhe-tadl/cream/README.md  +70 -0

@@ -0,0 +1,70 @@
# Cream of the Crop: Distilling Prioritized Paths For One-Shot Neural Architecture Search

## 0x01 Requirements

* Install the following requirements:

```
future
thop
timm<0.4
yacs
ptflops==0.6.4
#tensorboardx
#tensorboard
#opencv-python
#torch-scope
#git+https://github.com/sovrasov/flops-counter.pytorch.git
#git+https://github.com/Tramac/torchscope.git
```

* (required) Build and install apex to accelerate training
(see [yuque](https://www.yuque.com/kcgyxv/ukpea3/mxz5xy));
it is slightly faster than PyTorch DistributedDataParallel.

* Put the ImageNet data in `./data` using the following script:

```
cd TADL_DIR/pytorch/cream/
ln -s /mnt/data .
```

## 0x02 Quick Start

* Run the following script to search an architecture.

```
python trainer.py
```

* Selector (deprecated)

```
python selector.py
```

* Train searched architectures.

> Note: the exponential moving average (model_ema) is not available yet.

```
python retrainer.py
```

<!--
* Test trained models.

```
$ cp configs/test.yaml.example configs/test.yaml
$ python -m torch.distributed.launch --nproc_per_node=1 ./test.py --cfg ./configs/test.yaml

> 01/26 02:06:27 AM | [Model-14] Flops: 13.768M Params: 2.673M
> 01/26 02:06:30 AM | Training on Process 0 with 1 GPUs.
> 01/26 02:06:30 AM | Restoring model state from checkpoint...
> 01/26 02:06:30 AM | Loaded checkpoint './pretrained/14.pth.tar' (epoch 591)
> 01/26 02:06:30 AM | Loaded state_dict_ema
> 01/26 02:06:32 AM | Test_EMA: [ 0/390] Time: 1.573 (1.573) Loss: 0.9613 (0.9613) Prec@1: 82.8125 (82.8125) Prec@5: 91.4062 (91.4062)
> ...
> 01/26 02:07:50 AM | Test_EMA: [ 390/390] Time: 0.077 (0.203) Loss: 3.4356 (2.0912) Prec@1: 25.0000 (53.7640) Prec@5: 53.7500 (77.2840)
```
-->

+ 5
- 0
dubhe-tadl/cream/algorithms/__init__.py View File

@@ -0,0 +1,5 @@
# Copyright (c) Microsoft Corporation.
# Licensed under the MIT license.

from .trainer import CreamSupernetTrainer
from .mutator import RandomMutator

+ 36
- 0
dubhe-tadl/cream/algorithms/mutator.py View File

@@ -0,0 +1,36 @@
import torch
import torch.nn.functional as F

from pytorch.mutator import Mutator
from pytorch.mutables import LayerChoice, InputChoice

# TODO: This class duplicates the random mutator used in SPOS.
class RandomMutator(Mutator):
"""
Random mutator that samples a random candidate in the search space each time ``reset()`` is called.
It uses PyTorch's random functions, so users can set the PyTorch seed to ensure deterministic behavior.
"""

def sample_search(self):
"""
Sample a random candidate.
"""
result = dict()
for mutable in self.mutables:
if isinstance(mutable, LayerChoice):
gen_index = torch.randint(high=len(mutable), size=(1, ))
result[mutable.key] = F.one_hot(gen_index, num_classes=len(mutable)).view(-1).bool()
elif isinstance(mutable, InputChoice):
if mutable.n_chosen is None:
result[mutable.key] = torch.randint(high=2, size=(mutable.n_candidates,)).view(-1).bool()
else:
perm = torch.randperm(mutable.n_candidates)
mask = [i in perm[:mutable.n_chosen] for i in range(mutable.n_candidates)]
result[mutable.key] = torch.tensor(mask, dtype=torch.bool) # pylint: disable=not-callable
return result

def sample_final(self):
"""
Same as :meth:`sample_search`.
"""
return self.sample_search()
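# Illustrative usage sketch (assumes `supernet` is an nn.Module containing LayerChoice /
# InputChoice mutables; the name is a placeholder, not defined in this file). The same
# reset() / _cache pattern is used by CreamSupernetTrainer below:
#
#   mutator = RandomMutator(supernet)
#   mutator.reset()          # samples a random architecture and caches it
#   arch = mutator._cache    # dict mapping each mutable key to a boolean mask tensor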

+ 437
- 0
dubhe-tadl/cream/algorithms/trainer.py View File

@@ -0,0 +1,437 @@
# Copyright (c) Microsoft Corporation.
# Licensed under the MIT license.

import os
import json
import numpy as np
import torch
import logging

from copy import deepcopy
from pytorch.trainer import Trainer
from pytorch.utils import AverageMeterGroup

from .utils import accuracy, reduce_metrics

logger = logging.getLogger(__name__)


class CreamSupernetTrainer(Trainer):
"""
This trainer trains a supernet and outputs prioritized architectures that can be used for other tasks.

Parameters
----------
model : nn.Module
Model with mutables.
loss : callable
Called with logits and targets. Returns a loss tensor.
val_loss : callable
Called with logits and targets for validation only. Returns a loss tensor.
optimizer : Optimizer
Optimizer that optimizes the model.
num_epochs : int
Number of epochs of training.
train_loader : iterable
Data loader for training. Raises ``StopIteration`` when one epoch is exhausted.
valid_loader : iterable
Data loader for validation. Raises ``StopIteration`` when one epoch is exhausted.
mutator : Mutator
A mutator object that has been initialized with the model.
batch_size : int
Batch size.
log_frequency : int
Number of mini-batches to log metrics.
meta_sta_epoch : int
Start epoch for using the meta matching network to pick the teacher architecture.
update_iter : int
Interval (in steps) at which the meta matching network is updated.
slices : int
Mini-batch size used when training the meta matching network.
pool_size : int
Size of the prioritized board.
pick_method : str
Strategy for picking the teacher network ('top1' or 'meta').
choice_num : int
Number of candidate operations in the supernet.
sta_num : tuple of int
Number of layers in each stage of the supernet (the supernet has 5 stages).
acc_gap : int
Accuracy improvement above which the FLOPs limitation is ignored.
flops_dict : Dict
Dictionary of the FLOPs of each layer's operations in the supernet.
flops_fixed : int
FLOPs of the fixed part of the supernet.
local_rank : int
Index of the current process rank.
callbacks : list of Callback
Callbacks to plug into the trainer. See Callbacks.
"""

def __init__(self, selected_space, model, loss, val_loss,
optimizer, num_epochs, train_loader, valid_loader,
mutator=None, batch_size=64, log_frequency=None,
meta_sta_epoch=20, update_iter=200, slices=2,
pool_size=10, pick_method='meta', choice_num=6,
sta_num=(4, 4, 4, 4, 4), acc_gap=5,
flops_dict=None, flops_fixed=0, local_rank=0, callbacks=None, result_path=None):
assert torch.cuda.is_available()
super(CreamSupernetTrainer, self).__init__(model, mutator, loss, None,
optimizer, num_epochs, None, None,
batch_size, None, None, log_frequency, callbacks)
self.selected_space = selected_space
self.model = model
self.loss = loss
self.val_loss = val_loss
self.train_loader = train_loader
self.valid_loader = valid_loader
self.log_frequency = log_frequency
self.batch_size = batch_size
self.optimizer = optimizer
self.model = model
self.loss = loss
self.num_epochs = num_epochs
self.meta_sta_epoch = meta_sta_epoch
self.update_iter = update_iter
self.slices = slices
self.pick_method = pick_method
self.pool_size = pool_size
self.local_rank = local_rank
self.choice_num = choice_num
self.sta_num = sta_num
self.acc_gap = acc_gap
self.flops_dict = flops_dict
self.flops_fixed = flops_fixed

self.current_student_arch = None
self.current_teacher_arch = None
self.main_proc = (local_rank == 0)
self.current_epoch = 0

self.prioritized_board = []
self.result_path = result_path

# size of prioritized board
def _board_size(self):
return len(self.prioritized_board)

# select teacher architecture according to the logit difference
def _select_teacher(self):
self._replace_mutator_cand(self.current_student_arch)

if self.pick_method == 'top1':
meta_value, teacher_cand = 0.5, sorted(
self.prioritized_board, reverse=True)[0][3]
elif self.pick_method == 'meta':
meta_value, cand_idx, teacher_cand = -1000000000, -1, None
for now_idx, item in enumerate(self.prioritized_board):
inputx = item[4]
output = torch.nn.functional.softmax(self.model(inputx), dim=1)
weight = self.model.forward_meta(output - item[5])
if weight > meta_value:
meta_value = weight
cand_idx = now_idx
teacher_cand = self.prioritized_board[cand_idx][3]
assert teacher_cand is not None
meta_value = torch.sigmoid(-weight)
else:
raise ValueError('Method Not supported')

return meta_value, teacher_cand

# check whether to update prioritized board
def _isUpdateBoard(self, prec1, flops):
if self.current_epoch <= self.meta_sta_epoch:
return False

if len(self.prioritized_board) < self.pool_size:
return True

if prec1 > self.prioritized_board[-1][1] + self.acc_gap:
return True

if prec1 > self.prioritized_board[-1][1] and flops < self.prioritized_board[-1][2]:
return True

return False

# update prioritized board
def _update_prioritized_board(self, inputs, teacher_output, outputs, prec1, flops):
if self._isUpdateBoard(prec1, flops):
val_prec1 = prec1
training_data = deepcopy(inputs[:self.slices].detach())
if len(self.prioritized_board) == 0:
features = deepcopy(outputs[:self.slices].detach())
else:
features = deepcopy(teacher_output[:self.slices].detach())

self.prioritized_board.append(
(val_prec1,
prec1,
flops,
self.current_student_arch,
training_data,
torch.nn.functional.softmax(
features,
dim=1)))
self.prioritized_board = sorted(
self.prioritized_board, reverse=True)

if len(self.prioritized_board) > self.pool_size:
self.prioritized_board = sorted(
self.prioritized_board, reverse=True)
del self.prioritized_board[-1]

# only update student network weights
def _update_student_weights_only(self, grad_1):
for weight, grad_item in zip(
self.model.module.rand_parameters(self.current_student_arch), grad_1):
weight.grad = grad_item
torch.nn.utils.clip_grad_norm_(
self.model.module.rand_parameters(self.current_student_arch), 1)
self.optimizer.step()
for weight, grad_item in zip(
self.model.module.rand_parameters(self.current_student_arch), grad_1):
del weight.grad

# only update meta networks weights
def _update_meta_weights_only(self, teacher_cand, grad_teacher):
for weight, grad_item in zip(self.model.module.rand_parameters(
teacher_cand, self.pick_method == 'meta'), grad_teacher):
weight.grad = grad_item

# clip gradients
torch.nn.utils.clip_grad_norm_(
self.model.module.rand_parameters(
self.current_student_arch, self.pick_method == 'meta'), 1)

self.optimizer.step()
for weight, grad_item in zip(self.model.module.rand_parameters(
teacher_cand, self.pick_method == 'meta'), grad_teacher):
del weight.grad

# simulate sgd updating
def _simulate_sgd_update(self, w, g, optimizer):
return g * optimizer.param_groups[-1]['lr'] + w

# split training images into several slices
def _get_minibatch_input(self, input):
slice = self.slices
x = deepcopy(input[:slice].clone().detach())
return x

# calculate 1st gradient of student architectures
def _calculate_1st_gradient(self, kd_loss):
self.optimizer.zero_grad()
grad = torch.autograd.grad(
kd_loss,
self.model.module.rand_parameters(self.current_student_arch),
create_graph=True)
return grad

# calculate 2nd gradient of meta networks
def _calculate_2nd_gradient(self, validation_loss, teacher_cand, students_weight):
self.optimizer.zero_grad()
grad_student_val = torch.autograd.grad(
validation_loss,
self.model.module.rand_parameters(self.current_student_arch),
retain_graph=True)

grad_teacher = torch.autograd.grad(
students_weight[0],
self.model.module.rand_parameters(
teacher_cand,
self.pick_method == 'meta'),
grad_outputs=grad_student_val)
return grad_teacher

# forward training data
def _forward_training(self, x, meta_value):
self._replace_mutator_cand(self.current_student_arch)
output = self.model(x)

with torch.no_grad():
self._replace_mutator_cand(self.current_teacher_arch)
teacher_output = self.model(x)
soft_label = torch.nn.functional.softmax(teacher_output, dim=1)

kd_loss = meta_value * \
self._cross_entropy_loss_with_soft_target(output, soft_label)
return kd_loss

# calculate soft target loss
def _cross_entropy_loss_with_soft_target(self, pred, soft_target):
logsoftmax = torch.nn.LogSoftmax(dim=1)
return torch.mean(torch.sum(- soft_target * logsoftmax(pred), 1))

# forward validation data
def _forward_validation(self, input, target):
slice = self.slices
x = input[slice:slice * 2].clone()

self._replace_mutator_cand(self.current_student_arch)
output_2 = self.model(x)

validation_loss = self.loss(output_2, target[slice:slice * 2])
return validation_loss

def _isUpdateMeta(self, batch_idx):
isUpdate = True
isUpdate &= (self.current_epoch > self.meta_sta_epoch)
isUpdate &= (batch_idx > 0)
isUpdate &= (batch_idx % self.update_iter == 0)
isUpdate &= (self._board_size() > 0)
return isUpdate

def _replace_mutator_cand(self, cand):
self.mutator._cache = cand

# update meta matching networks
def _run_update(self, input, target, batch_idx):
if self._isUpdateMeta(batch_idx):
x = self._get_minibatch_input(input)

meta_value, teacher_cand = self._select_teacher()

kd_loss = self._forward_training(x, meta_value)

# calculate 1st gradient
grad_1st = self._calculate_1st_gradient(kd_loss)

# simulate updated student weights
students_weight = [
self._simulate_sgd_update(
p, grad_item, self.optimizer) for p, grad_item in zip(
self.model.module.rand_parameters(self.current_student_arch), grad_1st)]

# update student weights
self._update_student_weights_only(grad_1st)

validation_loss = self._forward_validation(input, target)

# calculate 2nd gradient
grad_teacher = self._calculate_2nd_gradient(validation_loss,
teacher_cand,
students_weight)

# update meta matching networks
self._update_meta_weights_only(teacher_cand, grad_teacher)

# delete internal variants
del grad_teacher, grad_1st, x, validation_loss, kd_loss, students_weight

def _get_cand_flops(self, cand):
flops = 0
for block_id, block in enumerate(cand):
if block == 'LayerChoice1' or block == 'LayerChoice23':
continue
for idx, choice in enumerate(cand[block]):
flops += self.flops_dict[block_id][idx] * (1 if choice else 0)
return flops + self.flops_fixed
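# Illustrative example (placeholder values): with flops_dict = {0: [10.0, 12.0], 1: [20.0, 25.0]},
# flops_fixed = 5 and a sampled cand = {'LayerChoice2': [False, True], 'LayerChoice3': [True, False]},
# the chosen operations contribute 12.0 + 20.0, so _get_cand_flops(cand) returns 37.0
# (any 'LayerChoice1' / 'LayerChoice23' entries are skipped above).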

def train_one_epoch(self, epoch):
self.current_epoch = epoch
meters = AverageMeterGroup()
self.steps_per_epoch = len(self.train_loader)
for step, (input_data, target) in enumerate(self.train_loader):
self.mutator.reset()
self.current_student_arch = self.mutator._cache

input_data, target = input_data.cuda(), target.cuda()

# calculate flops of current architecture
cand_flops = self._get_cand_flops(self.mutator._cache)

# update meta matching network
self._run_update(input_data, target, step)

if self._board_size() > 0:
# select teacher architecture
meta_value, teacher_cand = self._select_teacher()
self.current_teacher_arch = teacher_cand

# forward supernet
if self._board_size() == 0 or epoch <= self.meta_sta_epoch:
self._replace_mutator_cand(self.current_student_arch)
output = self.model(input_data)

loss = self.loss(output, target)
kd_loss, teacher_output, teacher_cand = None, None, None
else:
self._replace_mutator_cand(self.current_student_arch)
output = self.model(input_data)

gt_loss = self.loss(output, target)

with torch.no_grad():
self._replace_mutator_cand(self.current_teacher_arch)
teacher_output = self.model(input_data).detach()

soft_label = torch.nn.functional.softmax(teacher_output, dim=1)
kd_loss = self._cross_entropy_loss_with_soft_target(output, soft_label)

loss = (meta_value * kd_loss + (2 - meta_value) * gt_loss) / 2

# update network
self.optimizer.zero_grad()
loss.backward()
self.optimizer.step()

# update metrics
prec1, prec5 = accuracy(output, target, topk=(1, 5))
metrics = {"prec1": prec1, "prec5": prec5, "loss": loss}
metrics = reduce_metrics(metrics)
meters.update(metrics)

# update prioritized board
self._update_prioritized_board(input_data,
teacher_output,
output,
metrics['prec1'],
cand_flops)

if self.main_proc and (
step % self.log_frequency == 0 or step + 1 == self.steps_per_epoch):
logger.info("Epoch [%d/%d] Step [%d/%d] %s", epoch + 1, self.num_epochs,
step + 1, len(self.train_loader), meters)

arch_list = []
# if self.main_proc and self.num_epochs == epoch + 1:
for idx, i in enumerate(self.prioritized_board):
# logger.info("prioritized_board: No.%s %s", idx, i[:4])
if idx == 0:
for arch in list(i[3].values()):
_ = arch.numpy()
_ = np.where(_)[0].tolist()
arch_list.append(_)

if len(arch_list) > 0:
with open(self.selected_space, "w") as f:
print("dump selected space.")
json.dump({'selected_space': arch_list}, f)

def validate_one_epoch(self, epoch):
self.model.eval()
meters = AverageMeterGroup()
with torch.no_grad():
for step, (x, y) in enumerate(self.valid_loader):
self.mutator.reset()
logits = self.model(x)
loss = self.val_loss(logits, y)
prec1, prec5 = accuracy(logits, y, topk=(1, 5))
metrics = {"prec1": prec1, "prec5": prec5, "loss": loss}
metrics = reduce_metrics(metrics)
meters.update(metrics)

if self.log_frequency is not None and step % self.log_frequency == 0:
logger.info("Epoch [%s/%s] Validation Step [%s/%s] %s", epoch + 1,
self.num_epochs, step + 1, len(self.valid_loader), meters)
# print({'type': 'Accuracy', 'result': {'sequence': epoch, 'category': 'epoch',
# 'value': metrics["prec1"]}})
if self.result_path is not None:
with open(self.result_path, "a") as ss_file:
ss_file.write(json.dumps(
{'type': 'Accuracy',
'result': {'sequence': epoch,
'category': 'epoch',
'value': metrics["prec1"]}}) + '\n')

+ 39
- 0
dubhe-tadl/cream/algorithms/utils.py View File

@@ -0,0 +1,39 @@
# Copyright (c) Microsoft Corporation.
# Licensed under the MIT license.


import os
import torch
import torch.distributed as dist


def accuracy(output, target, topk=(1,)):
""" Computes the precision@k for the specified values of k """
maxk = max(topk)
batch_size = target.size(0)

_, pred = output.topk(maxk, 1, True, True)
pred = pred.t()
# one-hot case
if target.ndimension() > 1:
target = target.max(1)[1]

correct = pred.eq(target.reshape(1, -1).expand_as(pred))

res = []
for k in topk:
correct_k = correct[:k].reshape(-1).float().sum(0)
res.append(correct_k.mul_(1.0 / batch_size))
return res
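# Illustrative example: for two samples with logits [[0.1, 0.7, 0.2], [0.6, 0.1, 0.3]]
# and targets [1, 2], the top-1 prediction is correct for the first sample only while
# the top-2 predictions cover both, so
#   accuracy(torch.tensor([[0.1, 0.7, 0.2], [0.6, 0.1, 0.3]]), torch.tensor([1, 2]), topk=(1, 2))
# returns the fractions [tensor(0.5000), tensor(1.0000)].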


def reduce_metrics(metrics):
return {k: reduce_tensor(v).item() for k, v in metrics.items()}


def reduce_tensor(tensor):
rt = torch.sum(tensor)
# rt = tensor.clone()
# dist.all_reduce(rt, op=dist.ReduceOp.SUM)
# rt /= float(os.environ["WORLD_SIZE"])
return rt

+ 123
- 0
dubhe-tadl/cream/lib/config.py View File

@@ -0,0 +1,123 @@
# Copyright (c) Microsoft Corporation.
# Licensed under the MIT License.
# Written by Hao Du and Houwen Peng
# email: haodu8-c@my.cityu.edu.hk and houwen.peng@microsoft.com

from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
from __future__ import unicode_literals

from yacs.config import CfgNode as CN

DEFAULT_CROP_PCT = 0.875
IMAGENET_DEFAULT_MEAN = (0.485, 0.456, 0.406)
IMAGENET_DEFAULT_STD = (0.229, 0.224, 0.225)

__C = CN()

cfg = __C

__C.AUTO_RESUME = True
__C.DATA_DIR = './data/imagenet'
__C.MODEL = 'cream'
__C.RESUME_PATH = './experiments/ckps/resume.pth.tar'
__C.SAVE_PATH = './experiments/ckps/'
__C.SEED = 42
__C.LOG_INTERVAL = 50
__C.RECOVERY_INTERVAL = 0
__C.WORKERS = 4
__C.NUM_GPU = 1
__C.SAVE_IMAGES = False
__C.AMP = False
__C.ACC_GAP = 5
__C.OUTPUT = 'output/path/'
__C.EVAL_METRICS = 'prec1'
__C.TTA = 0 # Test or inference time augmentation
__C.LOCAL_RANK = 0
__C.VERBOSE = False

# dataset configs
__C.DATASET = CN()
__C.DATASET.NUM_CLASSES = 1000
__C.DATASET.IMAGE_SIZE = 224 # image patch size
__C.DATASET.INTERPOLATION = 'bilinear' # Image resize interpolation type
__C.DATASET.BATCH_SIZE = 32 # batch size
__C.DATASET.NO_PREFECHTER = False
__C.DATASET.PIN_MEM = True
__C.DATASET.VAL_BATCH_MUL = 4


# model configs
__C.NET = CN()
__C.NET.SELECTION = 14
__C.NET.GP = 'avg' # type of global pool ["avg", "max", "avgmax", "avgmaxc"]
__C.NET.DROPOUT_RATE = 0.0 # dropout rate
__C.NET.INPUT_ARCH = [[0], [3], [3, 3], [3, 1, 3], [3, 3, 3, 3], [3, 3, 3], [0]]

# model ema parameters
__C.NET.EMA = CN()
__C.NET.EMA.USE = True
__C.NET.EMA.FORCE_CPU = False # force model ema to be tracked on CPU
__C.NET.EMA.DECAY = 0.9998

# optimizer configs
__C.OPT = 'sgd'
__C.OPT_EPS = 1e-2
__C.MOMENTUM = 0.9
__C.WEIGHT_DECAY = 1e-4
__C.OPTIMIZER = CN()
__C.OPTIMIZER.NAME = 'sgd'
__C.OPTIMIZER.MOMENTUM = 0.9
__C.OPTIMIZER.WEIGHT_DECAY = 1e-3

# scheduler configs
__C.SCHED = 'sgd'
__C.LR_NOISE = None
__C.LR_NOISE_PCT = 0.67
__C.LR_NOISE_STD = 1.0
__C.WARMUP_LR = 1e-4
__C.MIN_LR = 1e-5
__C.EPOCHS = 200
__C.START_EPOCH = None
__C.DECAY_EPOCHS = 30.0
__C.WARMUP_EPOCHS = 3
__C.COOLDOWN_EPOCHS = 10
__C.PATIENCE_EPOCHS = 10
__C.DECAY_RATE = 0.1
__C.LR = 1e-2
__C.META_LR = 1e-4

# data augmentation parameters
__C.AUGMENTATION = CN()
__C.AUGMENTATION.AA = 'rand-m9-mstd0.5'
__C.AUGMENTATION.COLOR_JITTER = 0.4
__C.AUGMENTATION.RE_PROB = 0.2 # random erase prob
__C.AUGMENTATION.RE_MODE = 'pixel' # random erase mode
__C.AUGMENTATION.MIXUP = 0.0 # mixup alpha
__C.AUGMENTATION.MIXUP_OFF_EPOCH = 0 # turn off mixup after this epoch
__C.AUGMENTATION.SMOOTHING = 0.1 # label smoothing parameters

# batch norm parameters (only works with gen_efficientnet based models currently)
__C.BATCHNORM = CN()
__C.BATCHNORM.SYNC_BN = False
__C.BATCHNORM.BN_TF = False
__C.BATCHNORM.BN_MOMENTUM = 0.1 # batchnorm momentum override
__C.BATCHNORM.BN_EPS = 1e-5 # batchnorm eps override

# supernet training hyperparameters
__C.SUPERNET = CN()
__C.SUPERNET.UPDATE_ITER = 1300
__C.SUPERNET.SLICE = 4
__C.SUPERNET.POOL_SIZE = 10
__C.SUPERNET.RESUNIT = False
__C.SUPERNET.DIL_CONV = False
__C.SUPERNET.UPDATE_2ND = True
__C.SUPERNET.FLOPS_MAXIMUM = 600
__C.SUPERNET.FLOPS_MINIMUM = 0
__C.SUPERNET.PICK_METHOD = 'meta' # pick teacher method
__C.SUPERNET.META_STA_EPOCH = 20 # start using meta picking method
__C.SUPERNET.HOW_TO_PROB = 'pre_prob' # sample method
__C.SUPERNET.PRE_PROB = (0.05, 0.2, 0.05, 0.5, 0.05,
0.15) # sample prob in 'pre_prob'
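# Illustrative usage sketch (the YAML path below is a placeholder; merge_from_file and
# freeze are standard yacs CfgNode methods, and the import path assumes the cream
# directory is on sys.path):
#
#   from lib.config import cfg
#   cfg.merge_from_file('./configs/train.yaml')
#   cfg.freeze()
#   print(cfg.DATASET.BATCH_SIZE, cfg.SUPERNET.PICK_METHOD)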

+ 129
- 0
dubhe-tadl/cream/lib/core/retrain.py View File

@@ -0,0 +1,129 @@
import os
import time
import timm
import torch
import torchvision

from collections import OrderedDict

from ..utils.util import AverageMeter, accuracy, reduce_tensor


def train_epoch(
epoch, model, loader, optimizer, loss_fn, args,
lr_scheduler=None, saver=None, output_dir='', use_amp=False,
model_ema=None, logger=None, writer=None, local_rank=0):
batch_time_m = AverageMeter()
data_time_m = AverageMeter()
losses_m = AverageMeter()
prec1_m = AverageMeter()
prec5_m = AverageMeter()

model.train()

end = time.time()
last_idx = len(loader) - 1
num_updates = epoch * len(loader)
optimizer.zero_grad()
for batch_idx, (input, target) in enumerate(loader):
last_batch = batch_idx == last_idx
data_time_m.update(time.time() - end)

input = input.cuda()
target = target.cuda()
output = model(input)

loss = loss_fn(output, target)

prec1, prec5 = accuracy(output, target, topk=(1, 5))

if args.num_gpu > 1:
reduced_loss = reduce_tensor(loss.data, args.num_gpu)
prec1 = reduce_tensor(prec1, args.num_gpu)
prec5 = reduce_tensor(prec5, args.num_gpu)
else:
reduced_loss = loss.data

optimizer.zero_grad()
loss.backward()
optimizer.step()

torch.cuda.synchronize()

losses_m.update(reduced_loss.item(), input.size(0))
prec1_m.update(prec1.item(), output.size(0))
prec5_m.update(prec5.item(), output.size(0))

if model_ema is not None:
model_ema.update(model)
num_updates += 1

batch_time_m.update(time.time() - end)
if last_batch or batch_idx % args.log_interval == 0:
lrl = [param_group['lr'] for param_group in optimizer.param_groups]
lr = sum(lrl) / len(lrl)

if local_rank == 0:
logger.info(
'Train: {} [{:>4d}/{}] '
'Loss: {loss.val:>9.6f} ({loss.avg:>6.4f}) '
'Prec@1: {top1.val:>7.4f} ({top1.avg:>7.4f}) '
'Prec@5: {top5.val:>7.4f} ({top5.avg:>7.4f}) '
'Time: {batch_time.val:.3f}s, {rate:>7.2f}/s '
'({batch_time.avg:.3f}s, {rate_avg:>7.2f}/s) '
'LR: {lr:.3e} '
'Data: {data_time.val:.3f} ({data_time.avg:.3f})'.format(
epoch,
batch_idx,
len(loader),
loss=losses_m,
top1=prec1_m,
top5=prec5_m,
batch_time=batch_time_m,
rate=input.size(0) * args.num_gpu / batch_time_m.val,
rate_avg=input.size(0) * args.num_gpu / batch_time_m.avg,
lr=lr,
data_time=data_time_m))


# writer.add_scalar(
# 'Loss/train', prec1_m.avg, epoch * len(loader) + batch_idx)
# writer.add_scalar(
# 'Accuracy/train', prec1_m.avg, epoch * len(loader) + batch_idx)
# writer.add_scalar(
# 'Learning_Rate',
# optimizer.param_groups[0]['lr'], epoch * len(loader) + batch_idx)

if args.save_images and output_dir:
torchvision.utils.save_image(
input, os.path.join(output_dir, 'train-batch-%d.jpg' % batch_idx),
padding=0, normalize=True)

if saver is not None and args.recovery_interval and (
last_batch or (batch_idx + 1) % args.recovery_interval == 0):
if int(timm.__version__.split('.')[1]) >= 3:  # newer timm versions use a different save_recovery signature
saver.save_recovery(
epoch,
batch_idx=batch_idx)
else:
saver.save_recovery(
model,
optimizer,
args,
epoch,
model_ema=model_ema,
use_amp=use_amp,
batch_idx=batch_idx)

if lr_scheduler is not None:
lr_scheduler.step_update(
num_updates=num_updates,
metric=losses_m.avg)

end = time.time()
# end for

if hasattr(optimizer, 'sync_lookahead'):
optimizer.sync_lookahead()

return OrderedDict([('loss', losses_m.avg)])

+ 100
- 0
dubhe-tadl/cream/lib/core/test.py View File

@@ -0,0 +1,100 @@
# Copyright (c) Microsoft Corporation.
# Licensed under the MIT License.
# Written by Hao Du and Houwen Peng
# email: haodu8-c@my.cityu.edu.hk and houwen.peng@microsoft.com

import time
import torch
import json
from collections import OrderedDict
from ..utils.util import AverageMeter, accuracy, reduce_tensor


def validate(epoch, model, loader, loss_fn, args, log_suffix='',
logger=None, writer=None, local_rank=0,result_path=None):
batch_time_m = AverageMeter()
losses_m = AverageMeter()
prec1_m = AverageMeter()
prec5_m = AverageMeter()

model.eval()

end = time.time()
last_idx = len(loader) - 1
with torch.no_grad():
for batch_idx, (input, target) in enumerate(loader):
last_batch = batch_idx == last_idx

output = model(input)
if isinstance(output, (tuple, list)):
output = output[0]

# augmentation reduction
reduce_factor = args.tta
if reduce_factor > 1:
output = output.unfold(
0,
reduce_factor,
reduce_factor).mean(
dim=2)
target = target[0:target.size(0):reduce_factor]

loss = loss_fn(output, target)
prec1, prec5 = accuracy(output, target, topk=(1, 5))

if args.num_gpu > 1:
reduced_loss = reduce_tensor(loss.data, args.num_gpu)
prec1 = reduce_tensor(prec1, args.num_gpu)
prec5 = reduce_tensor(prec5, args.num_gpu)
else:
reduced_loss = loss.data

torch.cuda.synchronize()

losses_m.update(reduced_loss.item(), input.size(0))
prec1_m.update(prec1.item(), output.size(0))
prec5_m.update(prec5.item(), output.size(0))

batch_time_m.update(time.time() - end)
end = time.time()
if local_rank == 0 and (last_batch or batch_idx % args.log_interval == 0):
log_name = 'Test' + log_suffix
logger.info(
'{0}: [{1:>4d}/{2}] '
'Time: {batch_time.val:.3f} ({batch_time.avg:.3f}) '
'Loss: {loss.val:>7.4f} ({loss.avg:>6.4f}) '
'Prec@1: {top1.val:>7.4f} ({top1.avg:>7.4f}) '
'Prec@5: {top5.val:>7.4f} ({top5.avg:>7.4f})'.format(
log_name, batch_idx, last_idx,
batch_time=batch_time_m, loss=losses_m,
top1=prec1_m, top5=prec5_m))

# print({'type': 'Accuracy', 'result': {'sequence': epoch, 'category': 'epoch', 'value': prec1_m.val}})


if result_path is not None:
with open(result_path, "a") as ss_file:
ss_file.write(json.dumps(
{'type': 'Accuracy',
'result': {'sequence': epoch,
'category': 'epoch',
'value': prec1_m.val}}) + '\n')


# writer.add_scalar(
# 'Loss' + log_suffix + '/vaild',
# prec1_m.avg,
# epoch * len(loader) + batch_idx)
# writer.add_scalar(
# 'Accuracy' +
# log_suffix +
# '/vaild',
# prec1_m.avg,
# epoch *
# len(loader) +
# batch_idx)

metrics = OrderedDict(
[('loss', losses_m.avg), ('prec1', prec1_m.avg), ('prec5', prec5_m.avg)])

return metrics

+ 2
- 0
dubhe-tadl/cream/lib/models/blocks/__init__.py View File

@@ -0,0 +1,2 @@
from .residual_block import get_Bottleneck, get_BasicBlock
from .inverted_residual_block import InvertedResidual

+ 113
- 0
dubhe-tadl/cream/lib/models/blocks/inverted_residual_block.py View File

@@ -0,0 +1,113 @@
# This file is downloaded from
# https://github.com/rwightman/pytorch-image-models

import torch.nn as nn

from timm.models.layers import create_conv2d
from timm.models.efficientnet_blocks import make_divisible, resolve_se_args, \
SqueezeExcite, drop_path


class InvertedResidual(nn.Module):
""" Inverted residual block w/ optional SE and CondConv routing"""

def __init__(
self,
in_chs,
out_chs,
dw_kernel_size=3,
stride=1,
dilation=1,
pad_type='',
act_layer=nn.ReLU,
noskip=False,
exp_ratio=1.0,
exp_kernel_size=1,
pw_kernel_size=1,
se_ratio=0.,
se_kwargs=None,
norm_layer=nn.BatchNorm2d,
norm_kwargs=None,
conv_kwargs=None,
drop_path_rate=0.):
super(InvertedResidual, self).__init__()
norm_kwargs = norm_kwargs or {}
conv_kwargs = conv_kwargs or {}
mid_chs = make_divisible(in_chs * exp_ratio)
has_se = se_ratio is not None and se_ratio > 0.
self.has_residual = (in_chs == out_chs and stride == 1) and not noskip
self.drop_path_rate = drop_path_rate

# Point-wise expansion
self.conv_pw = create_conv2d(
in_chs,
mid_chs,
exp_kernel_size,
padding=pad_type,
**conv_kwargs)
self.bn1 = norm_layer(mid_chs, **norm_kwargs)
self.act1 = act_layer(inplace=True)

# Depth-wise convolution
self.conv_dw = create_conv2d(
mid_chs, mid_chs, dw_kernel_size, stride=stride, dilation=dilation,
padding=pad_type, depthwise=True, **conv_kwargs)
self.bn2 = norm_layer(mid_chs, **norm_kwargs)
self.act2 = act_layer(inplace=True)

# Squeeze-and-excitation
if has_se:
se_kwargs = resolve_se_args(se_kwargs, in_chs, act_layer)
self.se = SqueezeExcite(mid_chs, se_ratio=se_ratio, **se_kwargs)
else:
self.se = None

# Point-wise linear projection
self.conv_pwl = create_conv2d(
mid_chs,
out_chs,
pw_kernel_size,
padding=pad_type,
**conv_kwargs)
self.bn3 = norm_layer(out_chs, **norm_kwargs)

def feature_info(self, location):
if location == 'expansion': # after SE, input to PWL
info = dict(
module='conv_pwl',
hook_type='forward_pre',
num_chs=self.conv_pwl.in_channels)
else: # location == 'bottleneck', block output
info = dict(
module='',
hook_type='',
num_chs=self.conv_pwl.out_channels)
return info

def forward(self, x):
residual = x

# Point-wise expansion
x = self.conv_pw(x)
x = self.bn1(x)
x = self.act1(x)

# Depth-wise convolution
x = self.conv_dw(x)
x = self.bn2(x)
x = self.act2(x)

# Squeeze-and-excitation
if self.se is not None:
x = self.se(x)

# Point-wise linear projection
x = self.conv_pwl(x)
x = self.bn3(x)

if self.has_residual:
if self.drop_path_rate > 0.:
x = drop_path(x, self.drop_path_rate, self.training)
x += residual

return x
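# Illustrative usage sketch (shape and channel values are placeholders; assumes `torch`
# is imported by the caller):
#
#   block = InvertedResidual(in_chs=16, out_chs=24, dw_kernel_size=3, stride=2,
#                            exp_ratio=4.0, se_ratio=0.25)
#   y = block(torch.randn(1, 16, 112, 112))   # -> shape (1, 24, 56, 56), no residual (stride 2)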

+ 105
- 0
dubhe-tadl/cream/lib/models/blocks/residual_block.py View File

@@ -0,0 +1,105 @@
# Copyright (c) Microsoft Corporation.
# Licensed under the MIT License.
# Written by Hao Du and Houwen Peng
# email: haodu8-c@my.cityu.edu.hk and houwen.peng@microsoft.com

import torch
import torch.nn as nn
import torch.nn.functional as F


def conv3x3(in_planes, out_planes, stride=1):
"3x3 convolution with padding"
return nn.Conv2d(in_planes, out_planes, kernel_size=3, stride=stride,
padding=1, bias=True)


class BasicBlock(nn.Module):
expansion = 1

def __init__(self, inplanes, planes, stride=1, downsample=None):
super(BasicBlock, self).__init__()
self.conv1 = conv3x3(inplanes, planes, stride)
self.bn1 = nn.BatchNorm2d(planes)
self.relu = nn.ReLU(inplace=True)
self.conv2 = conv3x3(planes, planes)
self.bn2 = nn.BatchNorm2d(planes)
self.downsample = downsample
self.stride = stride

def forward(self, x):
residual = x

out = self.conv1(x)
out = self.bn1(out)
out = self.relu(out)

out = self.conv2(out)
out = self.bn2(out)

if self.downsample is not None:
residual = self.downsample(x)

out += residual
out = self.relu(out)

return out


class Bottleneck(nn.Module):

def __init__(self, inplanes, planes, stride=1, expansion=4):
super(Bottleneck, self).__init__()
planes = int(planes / expansion)
self.conv1 = nn.Conv2d(inplanes, planes, kernel_size=1, bias=True)
self.bn1 = nn.BatchNorm2d(planes)
self.conv2 = nn.Conv2d(planes, planes, kernel_size=3, stride=stride,
padding=1, bias=True)
self.bn2 = nn.BatchNorm2d(planes)
self.conv3 = nn.Conv2d(
planes,
planes * expansion,
kernel_size=1,
bias=True)
self.bn3 = nn.BatchNorm2d(planes * expansion)
self.relu = nn.ReLU(inplace=True)
self.stride = stride
self.expansion = expansion
if inplanes != planes * self.expansion:
self.downsample = nn.Sequential(
nn.Conv2d(inplanes, planes * self.expansion,
kernel_size=1, stride=stride, bias=True),
nn.BatchNorm2d(planes * self.expansion),
)
else:
self.downsample = None

def forward(self, x):
residual = x

out = self.conv1(x)
out = self.bn1(out)
out = self.relu(out)

out = self.conv2(out)
out = self.bn2(out)
out = self.relu(out)

out = self.conv3(out)
out = self.bn3(out)

if self.downsample is not None:
residual = self.downsample(x)

out += residual
out = self.relu(out)

return out


def get_Bottleneck(in_c, out_c, stride):
return Bottleneck(in_c, out_c, stride=stride)


def get_BasicBlock(in_c, out_c, stride):
return BasicBlock(in_c, out_c, stride=stride)

+ 182
- 0
dubhe-tadl/cream/lib/models/builders/build_childnet.py View File

@@ -0,0 +1,182 @@
from ...utils.util import *

from collections import OrderedDict
from timm.models.efficientnet_blocks import *


class ChildNetBuilder:
def __init__(
self,
channel_multiplier=1.0,
channel_divisor=8,
channel_min=None,
output_stride=32,
pad_type='',
act_layer=None,
se_kwargs=None,
norm_layer=nn.BatchNorm2d,
norm_kwargs=None,
drop_path_rate=0.,
feature_location='',
verbose=False,
logger=None):
self.channel_multiplier = channel_multiplier
self.channel_divisor = channel_divisor
self.channel_min = channel_min
self.output_stride = output_stride
self.pad_type = pad_type
self.act_layer = act_layer
self.se_kwargs = se_kwargs
self.norm_layer = norm_layer
self.norm_kwargs = norm_kwargs
self.drop_path_rate = drop_path_rate
self.feature_location = feature_location
assert feature_location in ('pre_pwl', 'post_exp', '')
self.verbose = verbose
self.in_chs = None
self.features = OrderedDict()
self.logger = logger

def _round_channels(self, chs):
return round_channels(
chs,
self.channel_multiplier,
self.channel_divisor,
self.channel_min)

def _make_block(self, ba, block_idx, block_count):
drop_path_rate = self.drop_path_rate * block_idx / block_count
bt = ba.pop('block_type')
ba['in_chs'] = self.in_chs
ba['out_chs'] = self._round_channels(ba['out_chs'])
if 'fake_in_chs' in ba and ba['fake_in_chs']:
ba['fake_in_chs'] = self._round_channels(ba['fake_in_chs'])
ba['norm_layer'] = self.norm_layer
ba['norm_kwargs'] = self.norm_kwargs
ba['pad_type'] = self.pad_type
# block act fn overrides the model default
ba['act_layer'] = ba['act_layer'] if ba['act_layer'] is not None else self.act_layer
assert ba['act_layer'] is not None
if bt == 'ir':
ba['drop_path_rate'] = drop_path_rate
ba['se_kwargs'] = self.se_kwargs
if self.verbose:
self.logger.info(
' InvertedResidual {}, Args: {}'.format(
block_idx, str(ba)))
block = InvertedResidual(**ba)
elif bt == 'ds' or bt == 'dsa':
ba['drop_path_rate'] = drop_path_rate
ba['se_kwargs'] = self.se_kwargs
if self.verbose:
self.logger.info(
' DepthwiseSeparable {}, Args: {}'.format(
block_idx, str(ba)))
block = DepthwiseSeparableConv(**ba)
elif bt == 'cn':
if self.verbose:
self.logger.info(
' ConvBnAct {}, Args: {}'.format(
block_idx, str(ba)))
block = ConvBnAct(**ba)
else:
assert False, 'Unknown block type (%s) while building model.' % bt
self.in_chs = ba['out_chs'] # update in_chs for arg of next block

return block

def __call__(self, in_chs, model_block_args):
""" Build the blocks
Args:
in_chs: Number of input-channels passed to first block
model_block_args: A list of lists, outer list defines stages, inner
list contains strings defining block configuration(s)
Return:
List of block stacks (each stack wrapped in nn.Sequential)
"""
if self.verbose:
self.logger.info(
'Building model trunk with %d stages...' %
len(model_block_args))
self.in_chs = in_chs
total_block_count = sum([len(x) for x in model_block_args])
total_block_idx = 0
current_stride = 2
current_dilation = 1
feature_idx = 0
stages = []
# outer list of block_args defines the stacks ('stages' by some
# conventions)
for stage_idx, stage_block_args in enumerate(model_block_args):
last_stack = stage_idx == (len(model_block_args) - 1)
if self.verbose:
self.logger.info('Stack: {}'.format(stage_idx))
assert isinstance(stage_block_args, list)

blocks = []
# each stack (stage) contains a list of block arguments
for block_idx, block_args in enumerate(stage_block_args):
last_block = block_idx == (len(stage_block_args) - 1)
extract_features = '' # No features extracted
if self.verbose:
self.logger.info(' Block: {}'.format(block_idx))

# Sort out stride, dilation, and feature extraction details
assert block_args['stride'] in (1, 2)
if block_idx >= 1:
# only the first block in any stack can have a stride > 1
block_args['stride'] = 1

do_extract = False
if self.feature_location == 'pre_pwl':
if last_block:
next_stage_idx = stage_idx + 1
if next_stage_idx >= len(model_block_args):
do_extract = True
else:
do_extract = model_block_args[next_stage_idx][0]['stride'] > 1
elif self.feature_location == 'post_exp':
if block_args['stride'] > 1 or (last_stack and last_block):
do_extract = True
if do_extract:
extract_features = self.feature_location

next_dilation = current_dilation
if block_args['stride'] > 1:
next_output_stride = current_stride * block_args['stride']
if next_output_stride > self.output_stride:
next_dilation = current_dilation * block_args['stride']
block_args['stride'] = 1
if self.verbose:
self.logger.info(
' Converting stride to dilation to maintain output_stride=={}'.format(
self.output_stride))
else:
current_stride = next_output_stride
block_args['dilation'] = current_dilation
if next_dilation != current_dilation:
current_dilation = next_dilation

# create the block
block = self._make_block(
block_args, total_block_idx, total_block_count)
blocks.append(block)

# stash feature module name and channel info for model feature
# extraction
if extract_features:
feature_module = block.feature_module(extract_features)
if feature_module:
feature_module = 'blocks.{}.{}.'.format(
stage_idx, block_idx) + feature_module
feature_channels = block.feature_channels(extract_features)
self.features[feature_idx] = dict(
name=feature_module,
num_chs=feature_channels
)
feature_idx += 1

# incr global block idx (across all stacks)
total_block_idx += 1
stages.append(nn.Sequential(*blocks))
return stages

+ 214
- 0
dubhe-tadl/cream/lib/models/builders/build_supernet.py View File

@@ -0,0 +1,214 @@
from copy import deepcopy

from ...utils.builder_util import modify_block_args
from ..blocks import get_Bottleneck, InvertedResidual

from timm.models.efficientnet_blocks import *

from pytorch.mutables import LayerChoice

class SuperNetBuilder:
""" Build Trunk Blocks
"""

def __init__(
self,
choices,
channel_multiplier=1.0,
channel_divisor=8,
channel_min=None,
output_stride=32,
pad_type='',
act_layer=None,
se_kwargs=None,
norm_layer=nn.BatchNorm2d,
norm_kwargs=None,
drop_path_rate=0.,
feature_location='',
verbose=False,
resunit=False,
dil_conv=False,
logger=None):

# dict
# choices = {'kernel_size': [3, 5, 7], 'exp_ratio': [4, 6]}
self.choices = [[x, y] for x in choices['kernel_size']
for y in choices['exp_ratio']]
self.choices_num = len(self.choices) - 1
self.channel_multiplier = channel_multiplier
self.channel_divisor = channel_divisor
self.channel_min = channel_min
self.output_stride = output_stride
self.pad_type = pad_type
self.act_layer = act_layer
self.se_kwargs = se_kwargs
self.norm_layer = norm_layer
self.norm_kwargs = norm_kwargs
self.drop_path_rate = drop_path_rate
self.feature_location = feature_location
assert feature_location in ('pre_pwl', 'post_exp', '')
self.verbose = verbose
self.resunit = resunit
self.dil_conv = dil_conv
self.logger = logger

# state updated during build, consumed by model
self.in_chs = None

def _round_channels(self, chs):
return round_channels(
chs,
self.channel_multiplier,
self.channel_divisor,
self.channel_min)

def _make_block(
self,
ba,
choice_idx,
block_idx,
block_count,
resunit=False,
dil_conv=False):
drop_path_rate = self.drop_path_rate * block_idx / block_count
bt = ba.pop('block_type')
ba['in_chs'] = self.in_chs
ba['out_chs'] = self._round_channels(ba['out_chs'])
if 'fake_in_chs' in ba and ba['fake_in_chs']:
# FIXME this is a hack to work around mismatch in origin impl input
# filters
ba['fake_in_chs'] = self._round_channels(ba['fake_in_chs'])
ba['norm_layer'] = self.norm_layer
ba['norm_kwargs'] = self.norm_kwargs
ba['pad_type'] = self.pad_type
# block act fn overrides the model default
ba['act_layer'] = ba['act_layer'] if ba['act_layer'] is not None else self.act_layer
assert ba['act_layer'] is not None
if bt == 'ir':
ba['drop_path_rate'] = drop_path_rate
ba['se_kwargs'] = self.se_kwargs
if self.verbose:
self.logger.info(
' InvertedResidual {}, Args: {}'.format(
block_idx, str(ba)))
block = InvertedResidual(**ba)
elif bt == 'ds' or bt == 'dsa':
ba['drop_path_rate'] = drop_path_rate
ba['se_kwargs'] = self.se_kwargs
if self.verbose:
self.logger.info(
' DepthwiseSeparable {}, Args: {}'.format(
block_idx, str(ba)))
block = DepthwiseSeparableConv(**ba)
elif bt == 'cn':
if self.verbose:
self.logger.info(
' ConvBnAct {}, Args: {}'.format(
block_idx, str(ba)))
block = ConvBnAct(**ba)
else:
assert False, 'Unknown block type (%s) while building model.' % bt
if choice_idx == self.choice_num - 1:
self.in_chs = ba['out_chs'] # update in_chs for arg of next block

return block

def __call__(self, in_chs, model_block_args):
""" Build the blocks
Args:
in_chs: Number of input-channels passed to first block
model_block_args: A list of lists, outer list defines stages, inner
list contains strings defining block configuration(s)
Return:
List of block stacks (each stack wrapped in nn.Sequential)
"""
if self.verbose:
self.logger.info('Building model trunk with %d stages...' % len(model_block_args))
self.in_chs = in_chs
total_block_count = sum([len(x) for x in model_block_args])
total_block_idx = 0
current_stride = 2
current_dilation = 1
feature_idx = 0
stages = []
# outer list of block_args defines the stacks ('stages' by some conventions)
for stage_idx, stage_block_args in enumerate(model_block_args):
last_stack = stage_idx == (len(model_block_args) - 1)
if self.verbose:
self.logger.info('Stack: {}'.format(stage_idx))
assert isinstance(stage_block_args, list)

# blocks = []
# each stack (stage) contains a list of block arguments
for block_idx, block_args in enumerate(stage_block_args):
last_block = block_idx == (len(stage_block_args) - 1)
if self.verbose:
self.logger.info(' Block: {}'.format(block_idx))

# Sort out stride, dilation, and feature extraction details
assert block_args['stride'] in (1, 2)
if block_idx >= 1:
# only the first block in any stack can have a stride > 1
block_args['stride'] = 1

next_dilation = current_dilation
if block_args['stride'] > 1:
next_output_stride = current_stride * block_args['stride']
if next_output_stride > self.output_stride:
next_dilation = current_dilation * block_args['stride']
block_args['stride'] = 1
else:
current_stride = next_output_stride
block_args['dilation'] = current_dilation
if next_dilation != current_dilation:
current_dilation = next_dilation


if stage_idx==0 or stage_idx==6:
self.choice_num = 1
else:
self.choice_num = len(self.choices)

if self.dil_conv:
self.choice_num += 2

choice_blocks = []
block_args_copy = deepcopy(block_args)
if self.choice_num == 1:
# create the block
block = self._make_block(block_args, 0, total_block_idx, total_block_count)
choice_blocks.append(block)
else:
for choice_idx, choice in enumerate(self.choices):
# create the block
block_args = deepcopy(block_args_copy)
block_args = modify_block_args(block_args, choice[0], choice[1])
block = self._make_block(block_args, choice_idx, total_block_idx, total_block_count)
choice_blocks.append(block)
if self.dil_conv:
block_args = deepcopy(block_args_copy)
block_args = modify_block_args(block_args, 3, 0)
block = self._make_block(block_args, self.choice_num - 2, total_block_idx, total_block_count,
resunit=self.resunit, dil_conv=self.dil_conv)
choice_blocks.append(block)

block_args = deepcopy(block_args_copy)
block_args = modify_block_args(block_args, 5, 0)
block = self._make_block(block_args, self.choice_num - 1, total_block_idx, total_block_count,
resunit=self.resunit, dil_conv=self.dil_conv)
choice_blocks.append(block)

if self.resunit:
block = get_Bottleneck(block.conv_pw.in_channels,
block.conv_pwl.out_channels,
block.conv_dw.stride[0])
choice_blocks.append(block)

choice_block = LayerChoice(choice_blocks)
stages.append(choice_block)
# create the block
# block = self._make_block(block_args, total_block_idx, total_block_count)
total_block_idx += 1 # incr global block idx (across all stacks)

# stages.append(blocks)
return stages

+ 145
- 0
dubhe-tadl/cream/lib/models/structures/childnet.py View File

@@ -0,0 +1,145 @@
from ...utils.builder_util import *
from ..builders.build_childnet import *

from timm.models.layers import SelectAdaptivePool2d
from timm.models.layers.activations import hard_sigmoid


class ChildNet(nn.Module):

def __init__(
self,
block_args,
num_classes=1000,
in_chans=3,
stem_size=16,
num_features=1280,
head_bias=True,
channel_multiplier=1.0,
pad_type='',
act_layer=nn.ReLU,
drop_rate=0.,
drop_path_rate=0.,
se_kwargs=None,
norm_layer=nn.BatchNorm2d,
norm_kwargs=None,
global_pool='avg',
logger=None,
verbose=False):
super(ChildNet, self).__init__()

self.num_classes = num_classes
self.num_features = num_features
self.drop_rate = drop_rate
self._in_chs = in_chans
self.logger = logger

# Stem
stem_size = round_channels(stem_size, channel_multiplier)
self.conv_stem = create_conv2d(
self._in_chs, stem_size, 3, stride=2, padding=pad_type)
self.bn1 = norm_layer(stem_size, **norm_kwargs)
self.act1 = act_layer(inplace=True)
self._in_chs = stem_size

# Middle stages (IR/ER/DS Blocks)
builder = ChildNetBuilder(
channel_multiplier, 8, None, 32, pad_type, act_layer, se_kwargs,
norm_layer, norm_kwargs, drop_path_rate, verbose=verbose)
self.blocks = nn.Sequential(*builder(self._in_chs, block_args))
# self.blocks = builder(self._in_chs, block_args)
self._in_chs = builder.in_chs

# Head + Pooling
self.global_pool = SelectAdaptivePool2d(pool_type=global_pool)
self.conv_head = create_conv2d(
self._in_chs,
self.num_features,
1,
padding=pad_type,
bias=head_bias)
self.act2 = act_layer(inplace=True)

# Classifier
self.classifier = nn.Linear(
self.num_features *
self.global_pool.feat_mult(),
self.num_classes)

efficientnet_init_weights(self)

def get_classifier(self):
return self.classifier

def reset_classifier(self, num_classes, global_pool='avg'):
self.global_pool = SelectAdaptivePool2d(pool_type=global_pool)
self.num_classes = num_classes
self.classifier = nn.Linear(
self.num_features * self.global_pool.feat_mult(),
num_classes) if self.num_classes else None

def forward_features(self, x):
# architecture = [[0], [], [], [], [], [0]]
x = self.conv_stem(x)
x = self.bn1(x)
x = self.act1(x)
x = self.blocks(x)
x = self.global_pool(x)
x = self.conv_head(x)
x = self.act2(x)
return x

def forward(self, x):
x = self.forward_features(x)
x = x.flatten(1)
if self.drop_rate > 0.:
x = F.dropout(x, p=self.drop_rate, training=self.training)
x = self.classifier(x)
return x


def gen_childnet(arch_list, arch_def, **kwargs):
# arch_list = [[0], [], [], [], [], [0]]
choices = {'kernel_size': [3, 5, 7], 'exp_ratio': [4, 6]}
choices_list = [[x, y] for x in choices['kernel_size']
for y in choices['exp_ratio']]

num_features = 1280

# act_layer = HardSwish
act_layer = Swish

new_arch = []
# change to child arch_def
for i, (layer_choice, layer_arch) in enumerate(zip(arch_list, arch_def)):
if len(layer_arch) == 1:
new_arch.append(layer_arch)
continue
else:
new_layer = []
for j, (block_choice, block_arch) in enumerate(
zip(layer_choice, layer_arch)):
kernel_size, exp_ratio = choices_list[block_choice]
elements = block_arch.split('_')
block_arch = block_arch.replace(
elements[2], 'k{}'.format(str(kernel_size)))
block_arch = block_arch.replace(
elements[4], 'e{}'.format(str(exp_ratio)))
new_layer.append(block_arch)
new_arch.append(new_layer)

model_kwargs = dict(
block_args=decode_arch_def(new_arch),
num_features=num_features,
stem_size=16,
norm_kwargs=resolve_bn_args(kwargs),
act_layer=act_layer,
se_kwargs=dict(
act_layer=nn.ReLU,
gate_fn=hard_sigmoid,
reduce_mid=True,
divisor=8),
**kwargs,
)
model = ChildNet(**model_kwargs)
return model
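# Illustrative usage sketch: `arch_list` holds, per stage, the chosen (kernel_size, exp_ratio)
# index of each block (indices into choices_list above); its format matches NET.INPUT_ARCH in
# lib/config.py. `arch_def` is assumed to be a block-string definition with matching per-stage
# lengths, e.g. the one returned by gen_supernet. Both values below are placeholders:
#
#   arch_list = [[0], [3], [3, 3], [3, 1, 3], [3, 3, 3, 3], [3, 3, 3], [0]]
#   model = gen_childnet(arch_list, arch_def, num_classes=1000, drop_rate=0.0, global_pool='avg')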

+ 202
- 0
dubhe-tadl/cream/lib/models/structures/supernet.py View File

@@ -0,0 +1,202 @@
# Copyright (c) Microsoft Corporation.
# Licensed under the MIT License.
# Written by Hao Du and Houwen Peng
# email: haodu8-c@my.cityu.edu.hk and houwen.peng@microsoft.com

from ...utils.builder_util import *
from ...utils.search_structure_supernet import *
from ...utils.op_by_layer_dict import flops_op_dict
from ..builders.build_supernet import *

from timm.models.layers import SelectAdaptivePool2d
from timm.models.layers.activations import hard_sigmoid


class SuperNet(nn.Module):

def __init__(
self,
block_args,
choices,
num_classes=1000,
in_chans=3,
stem_size=16,
num_features=1280,
head_bias=True,
channel_multiplier=1.0,
pad_type='',
act_layer=nn.ReLU,
drop_rate=0.,
drop_path_rate=0.,
slice=4,
se_kwargs=None,
norm_layer=nn.BatchNorm2d,
logger=None,
norm_kwargs=None,
global_pool='avg',
resunit=False,
dil_conv=False,
verbose=False):
super(SuperNet, self).__init__()

self.num_classes = num_classes
self.num_features = num_features
self.drop_rate = drop_rate
self._in_chs = in_chans
self.logger = logger

# Stem
stem_size = round_channels(stem_size, channel_multiplier)
self.conv_stem = create_conv2d(
self._in_chs, stem_size, 3, stride=2, padding=pad_type)
self.bn1 = norm_layer(stem_size, **norm_kwargs)
self.act1 = act_layer(inplace=True)
self._in_chs = stem_size

# Middle stages (IR/ER/DS Blocks)
builder = SuperNetBuilder(
choices,
channel_multiplier,
8,
None,
32,
pad_type,
act_layer,
se_kwargs,
norm_layer,
norm_kwargs,
drop_path_rate,
verbose=verbose,
resunit=resunit,
dil_conv=dil_conv,
logger=self.logger)
blocks = builder(self._in_chs, block_args)
self.blocks = nn.Sequential(*blocks)
self._in_chs = builder.in_chs

# Head + Pooling
self.global_pool = SelectAdaptivePool2d(pool_type=global_pool)
self.conv_head = create_conv2d(
self._in_chs,
self.num_features,
1,
padding=pad_type,
bias=head_bias)
self.act2 = act_layer(inplace=True)

# Classifier
self.classifier = nn.Linear(
self.num_features *
self.global_pool.feat_mult(),
self.num_classes)

self.meta_layer = nn.Linear(self.num_classes * slice, 1)
efficientnet_init_weights(self)

def get_classifier(self):
return self.classifier

def reset_classifier(self, num_classes, global_pool='avg'):
self.global_pool = SelectAdaptivePool2d(pool_type=global_pool)
self.num_classes = num_classes
self.classifier = nn.Linear(
self.num_features * self.global_pool.feat_mult(),
num_classes) if self.num_classes else None

def forward_features(self, x):
x = self.conv_stem(x)
x = self.bn1(x)
x = self.act1(x)
x = self.blocks(x)
x = self.global_pool(x)
x = self.conv_head(x)
x = self.act2(x)
return x

def forward(self, x):
x = self.forward_features(x)
x = x.flatten(1)
if self.drop_rate > 0.:
x = F.dropout(x, p=self.drop_rate, training=self.training)
return self.classifier(x)

def forward_meta(self, features):
return self.meta_layer(features.view(1, -1))

def rand_parameters(self, architecture, meta=False):
for name, param in self.named_parameters(recurse=True):
if 'meta' in name and meta:
yield param
elif 'blocks' not in name and 'meta' not in name and (not meta):
yield param

if not meta:
for layer, layer_arch in zip(self.blocks, architecture):
for blocks, arch in zip(layer, layer_arch):
if arch == -1:
continue
for name, param in blocks[arch].named_parameters(
recurse=True):
yield param


class Classifier(nn.Module):
def __init__(self, num_classes=1000):
super(Classifier, self).__init__()
self.classifier = nn.Linear(num_classes, num_classes)

def forward(self, x):
return self.classifier(x)


def gen_supernet(flops_minimum=0, flops_maximum=600, **kwargs):
choices = {'kernel_size': [3, 5, 7], 'exp_ratio': [4, 6]}

num_features = 1280

# act_layer = HardSwish
act_layer = Swish
arch_def = [
# stage 0, 112x112 in
['ds_r1_k3_s1_e1_c16_se0.25'],
# stage 1, 112x112 in
['ir_r1_k3_s2_e4_c24_se0.25', 'ir_r1_k3_s1_e4_c24_se0.25', 'ir_r1_k3_s1_e4_c24_se0.25',
'ir_r1_k3_s1_e4_c24_se0.25'],
# stage 2, 56x56 in
['ir_r1_k5_s2_e4_c40_se0.25', 'ir_r1_k5_s1_e4_c40_se0.25', 'ir_r1_k5_s2_e4_c40_se0.25',
'ir_r1_k5_s2_e4_c40_se0.25'],
# stage 3, 28x28 in
['ir_r1_k3_s2_e6_c80_se0.25', 'ir_r1_k3_s1_e4_c80_se0.25', 'ir_r1_k3_s1_e4_c80_se0.25',
'ir_r2_k3_s1_e4_c80_se0.25'],
# stage 4, 14x14in
['ir_r1_k3_s1_e6_c96_se0.25', 'ir_r1_k3_s1_e6_c96_se0.25', 'ir_r1_k3_s1_e6_c96_se0.25',
'ir_r1_k3_s1_e6_c96_se0.25'],
# stage 5, 14x14in
['ir_r1_k5_s2_e6_c192_se0.25', 'ir_r1_k5_s1_e6_c192_se0.25', 'ir_r1_k5_s2_e6_c192_se0.25',
'ir_r1_k5_s2_e6_c192_se0.25'],
# stage 6, 7x7 in
['cn_r1_k1_s1_c320_se0.25'],
]

sta_num, arch_def, resolution = search_for_layer(
flops_op_dict, arch_def, flops_minimum, flops_maximum)

if sta_num is None or arch_def is None or resolution is None:
raise ValueError('Invalid FLOPs Settings')

model_kwargs = dict(
block_args=decode_arch_def(arch_def),
choices=choices,
num_features=num_features,
stem_size=16,
norm_kwargs=resolve_bn_args(kwargs),
act_layer=act_layer,
se_kwargs=dict(
act_layer=nn.ReLU,
gate_fn=hard_sigmoid,
reduce_mid=True,
divisor=8),
**kwargs,
)
model = SuperNet(**model_kwargs)
return model, sta_num, resolution, arch_def
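# Illustrative usage sketch (keyword values are placeholders; extra kwargs such as
# num_classes are forwarded to SuperNet):
#
#   model, sta_num, resolution, arch_def = gen_supernet(
#       flops_minimum=0, flops_maximum=600, num_classes=1000, verbose=False)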

+ 270
- 0
dubhe-tadl/cream/lib/utils/builder_util.py View File

@@ -0,0 +1,270 @@
import re
import math
import torch.nn as nn

from copy import deepcopy

from timm.utils import *
from timm.models.layers.activations import Swish
from timm.models.layers import CondConv2d, get_condconv_initializer


def parse_ksize(ss):
if ss.isdigit():
return int(ss)
else:
return [int(k) for k in ss.split('.')]


def decode_arch_def(
arch_def,
depth_multiplier=1.0,
depth_trunc='ceil',
experts_multiplier=1):
arch_args = []
for stack_idx, block_strings in enumerate(arch_def):
assert isinstance(block_strings, list)
stack_args = []
repeats = []
for block_str in block_strings:
assert isinstance(block_str, str)
ba, rep = decode_block_str(block_str)
if ba.get('num_experts', 0) > 0 and experts_multiplier > 1:
ba['num_experts'] *= experts_multiplier
stack_args.append(ba)
repeats.append(rep)
arch_args.append(
scale_stage_depth(
stack_args,
repeats,
depth_multiplier,
depth_trunc))
return arch_args


def modify_block_args(block_args, kernel_size, exp_ratio):
block_type = block_args['block_type']
if block_type == 'cn':
block_args['kernel_size'] = kernel_size
elif block_type == 'er':
block_args['exp_kernel_size'] = kernel_size
else:
block_args['dw_kernel_size'] = kernel_size

if block_type == 'ir' or block_type == 'er':
block_args['exp_ratio'] = exp_ratio
return block_args


def decode_block_str(block_str):
""" Decode block definition string
Gets a list of block arg (dicts) through a string notation of arguments.
E.g. ir_r2_k3_s2_e1_i32_o16_se0.25_noskip
All args can exist in any order with the exception of the leading string which
is assumed to indicate the block type.
leading string - block type (
ir = InvertedResidual, ds = DepthwiseSep, dsa = DepthwiseSep with pw act, cn = ConvBnAct)
r - number of repeat blocks,
k - kernel size,
s - strides (1-9),
e - expansion ratio,
c - output channels,
se - squeeze/excitation ratio
n - activation fn ('re', 'r6', 'hs', or 'sw')
Args:
block_str: a string representation of block arguments.
Returns:
A tuple of (block args dict, number of block repeats)
Raises:
ValueError: if the string def not properly specified (TODO)
"""
assert isinstance(block_str, str)
ops = block_str.split('_')
block_type = ops[0] # take the block type off the front
ops = ops[1:]
options = {}
noskip = False
for op in ops:
# string options being checked on individual basis, combine if they
# grow
if op == 'noskip':
noskip = True
elif op.startswith('n'):
# activation fn
key = op[0]
v = op[1:]
if v == 're':
value = nn.ReLU
elif v == 'r6':
value = nn.ReLU6
elif v == 'sw':
value = Swish
else:
continue
options[key] = value
else:
# all numeric options
splits = re.split(r'(\d.*)', op)
if len(splits) >= 2:
key, value = splits[:2]
options[key] = value

# if act_layer is None, the model default (passed to model init) will be
# used
act_layer = options['n'] if 'n' in options else None
exp_kernel_size = parse_ksize(options['a']) if 'a' in options else 1
pw_kernel_size = parse_ksize(options['p']) if 'p' in options else 1
# FIXME hack to deal with in_chs issue in TPU def
fake_in_chs = int(options['fc']) if 'fc' in options else 0

num_repeat = int(options['r'])
# each type of block has different valid arguments, fill accordingly
if block_type == 'ir':
block_args = dict(
block_type=block_type,
dw_kernel_size=parse_ksize(options['k']),
exp_kernel_size=exp_kernel_size,
pw_kernel_size=pw_kernel_size,
out_chs=int(options['c']),
exp_ratio=float(options['e']),
se_ratio=float(options['se']) if 'se' in options else None,
stride=int(options['s']),
act_layer=act_layer,
noskip=noskip,
)
if 'cc' in options:
block_args['num_experts'] = int(options['cc'])
elif block_type == 'ds' or block_type == 'dsa':
block_args = dict(
block_type=block_type,
dw_kernel_size=parse_ksize(options['k']),
pw_kernel_size=pw_kernel_size,
out_chs=int(options['c']),
se_ratio=float(options['se']) if 'se' in options else None,
stride=int(options['s']),
act_layer=act_layer,
pw_act=block_type == 'dsa',
noskip=block_type == 'dsa' or noskip,
)
elif block_type == 'cn':
block_args = dict(
block_type=block_type,
kernel_size=int(options['k']),
out_chs=int(options['c']),
stride=int(options['s']),
act_layer=act_layer,
)
else:
assert False, 'Unknown block type (%s)' % block_type

return block_args, num_repeat


def scale_stage_depth(
stack_args,
repeats,
depth_multiplier=1.0,
depth_trunc='ceil'):
""" Per-stage depth scaling
Scales the block repeats in each stage. This depth scaling impl maintains
compatibility with the EfficientNet scaling method, while allowing sensible
scaling for other models that may have multiple block arg definitions in each stage.
"""

# We scale the total repeat count for each stage, there may be multiple
# block arg defs per stage so we need to sum.
num_repeat = sum(repeats)
if depth_trunc == 'round':
# Truncating to int by rounding allows stages with few repeats to remain
# proportionally smaller for longer. This is a good choice when stage definitions
# include single repeat stages that we'd prefer to keep that way as
# long as possible
num_repeat_scaled = max(1, round(num_repeat * depth_multiplier))
else:
# The default for EfficientNet truncates repeats to int via 'ceil'.
# Any multiplier > 1.0 will result in an increased depth for every
# stage.
num_repeat_scaled = int(math.ceil(num_repeat * depth_multiplier))

# Proportionally distribute repeat count scaling to each block definition in the stage.
# Allocation is done in reverse as it results in the first block being less likely to be scaled.
# The first block makes less sense to repeat in most of the arch
# definitions.
repeats_scaled = []
for r in repeats[::-1]:
rs = max(1, round((r / num_repeat * num_repeat_scaled)))
repeats_scaled.append(rs)
num_repeat -= r
num_repeat_scaled -= rs
repeats_scaled = repeats_scaled[::-1]

# Apply the calculated scaling to each block arg in the stage
sa_scaled = []
for ba, rep in zip(stack_args, repeats_scaled):
sa_scaled.extend([deepcopy(ba) for _ in range(rep)])
return sa_scaled
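
# Worked example (a sketch): a stage with three block defs and repeats [1, 2, 2] scaled
# by depth_multiplier=1.2 with the default 'ceil' truncation targets
# ceil(5 * 1.2) = 6 blocks; the reverse allocation above distributes them as [1, 3, 2],
# so the returned list holds 1, 3 and 2 copies of the respective block args.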


def init_weight_goog(m, n='', fix_group_fanout=True, last_bn=None):
""" Weight initialization as per Tensorflow official implementations.
Args:
m (nn.Module): module to init
n (str): module name
fix_group_fanout (bool): enable correct (matching Tensorflow TPU impl) fanout calculation w/ group convs
Handles layers in EfficientNet, EfficientNet-CondConv, MixNet, MnasNet, MobileNetV3, etc:
* https://github.com/tensorflow/tpu/blob/master/models/official/mnasnet/mnasnet_model.py
* https://github.com/tensorflow/tpu/blob/master/models/official/efficientnet/efficientnet_model.py
"""
if isinstance(m, CondConv2d):
fan_out = m.kernel_size[0] * m.kernel_size[1] * m.out_channels
if fix_group_fanout:
fan_out //= m.groups
init_weight_fn = get_condconv_initializer(lambda w: w.data.normal_(
0, math.sqrt(2.0 / fan_out)), m.num_experts, m.weight_shape)
init_weight_fn(m.weight)
if m.bias is not None:
m.bias.data.zero_()
elif isinstance(m, nn.Conv2d):
fan_out = m.kernel_size[0] * m.kernel_size[1] * m.out_channels
if fix_group_fanout:
fan_out //= m.groups
m.weight.data.normal_(0, math.sqrt(2.0 / fan_out))
if m.bias is not None:
m.bias.data.zero_()
elif isinstance(m, nn.BatchNorm2d):
    # when zero_gamma is requested, the affine weight of the last BN in each block is
    # zero-initialized; otherwise the standard (1, 0) BatchNorm init is used
    if last_bn and n in last_bn:
        m.weight.data.zero_()
        m.bias.data.zero_()
    else:
        m.weight.data.fill_(1.0)
        m.bias.data.zero_()
elif isinstance(m, nn.Linear):
fan_out = m.weight.size(0) # fan-out
fan_in = 0
if 'routing_fn' in n:
fan_in = m.weight.size(1)
init_range = 1.0 / math.sqrt(fan_in + fan_out)
m.weight.data.uniform_(-init_range, init_range)
m.bias.data.zero_()


def efficientnet_init_weights(
model: nn.Module,
init_fn=None,
zero_gamma=False):
last_bn = []
if zero_gamma:
prev_n = ''
for n, m in model.named_modules():
if isinstance(m, nn.BatchNorm2d):
if ''.join(prev_n.split('.')[:-1]) != ''.join(n.split('.')[:-1]):
last_bn.append(prev_n)
prev_n = n
last_bn.append(prev_n)

init_fn = init_fn or init_weight_goog
for n, m in model.named_modules():
init_fn(m, n, last_bn=last_bn)
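
# Minimal usage sketch (illustrative): `model` is any nn.Module assembled from the
# blocks above; with zero_gamma=True the affine weight of the last BatchNorm in each
# block is zero-initialized, otherwise the plain Google-style init is applied.
#   efficientnet_init_weights(model)
#   efficientnet_init_weights(model, zero_gamma=True)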

+ 79
- 0
dubhe-tadl/cream/lib/utils/flops_table.py View File

@@ -0,0 +1,79 @@
# Copyright (c) Microsoft Corporation.
# Licensed under the MIT License.
# Written by Hao Du and Houwen Peng
# email: haodu8-c@my.cityu.edu.hk and houwen.peng@microsoft.com

import torch

from ptflops import get_model_complexity_info


class FlopsEst(object):
def __init__(self, model, input_shape=(2, 3, 224, 224), device='cpu'):
self.block_num = len(model.blocks)
self.choice_num = len(model.blocks[0])
self.flops_dict = {}
self.params_dict = {}

if device == 'cpu':
model = model.cpu()
else:
model = model.cuda()

self.params_fixed = 0
self.flops_fixed = 0

input = torch.randn(input_shape)

flops, params = get_model_complexity_info(
model.conv_stem, (3, 224, 224), as_strings=False, print_per_layer_stat=False)
self.params_fixed += params / 1e6
self.flops_fixed += flops / 1e6

input = model.conv_stem(input)

for block_id, block in enumerate(model.blocks):
self.flops_dict[block_id] = {}
self.params_dict[block_id] = {}
for module_id, module in enumerate(block):
flops, params = get_model_complexity_info(module, tuple(
input.shape[1:]), as_strings=False, print_per_layer_stat=False)
# Flops(M)
self.flops_dict[block_id][module_id] = flops / 1e6
# Params(M)
self.params_dict[block_id][module_id] = params / 1e6

input = module(input)

# global_pool
flops, params = get_model_complexity_info(model.global_pool, tuple(
input.shape[1:]), as_strings=False, print_per_layer_stat=False)
self.params_fixed += params / 1e6
self.flops_fixed += flops / 1e6

input = model.global_pool(input)

# conv_head
flops, params = get_model_complexity_info(model.conv_head, tuple(
input.shape[1:]), as_strings=False, print_per_layer_stat=False)
self.params_fixed += params / 1e6
self.flops_fixed += flops / 1e6

# return params (M)
def get_params(self, arch):
params = 0
for block_id, block in enumerate(arch):
if block == -1:
continue
params += self.params_dict[block_id][block]
return params + self.params_fixed

# return flops (M)
def get_flops(self, arch):
flops = 0
for block_id, block in enumerate(arch):
if block == 'LayerChoice1' or block == 'LayerChoice23':
continue
for idx, choice in enumerate(arch[block]):
flops += self.flops_dict[block_id][idx] * (1 if choice else 0)
return flops + self.flops_fixed
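
# Usage sketch, mirroring how cream/trainer.py consumes this class:
#   est = FlopsEst(supernet)      # supernet built by gen_supernet(...)
#   flops_dict, flops_fixed = est.flops_dict, est.flops_fixed
#   # flops_dict[block_id][choice_id] is the MFLOPs of one candidate op, and
#   # flops_fixed is the MFLOPs of the stem/head shared by every architecture.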

+ 42
- 0
dubhe-tadl/cream/lib/utils/op_by_layer_dict.py View File

@@ -0,0 +1,42 @@
# Copyright (c) Microsoft Corporation.
# Licensed under the MIT License.
# Written by Hao Du and Houwen Peng
# email: haodu8-c@my.cityu.edu.hk and houwen.peng@microsoft.com

# This dictionary is generated from calculating each operation of each layer to quickly search for layers.
# flops_op_dict[which_stage][which_operation] =
# (flops_of_operation_with_stride1, flops_of_operation_with_stride2)

flops_op_dict = {}
for i in range(5):
flops_op_dict[i] = {}
flops_op_dict[0][0] = (21.828704, 18.820752)
flops_op_dict[0][1] = (32.669328, 28.16048)
flops_op_dict[0][2] = (25.039968, 23.637648)
flops_op_dict[0][3] = (37.486224, 35.385824)
flops_op_dict[0][4] = (29.856864, 30.862992)
flops_op_dict[0][5] = (44.711568, 46.22384)
flops_op_dict[1][0] = (11.808656, 11.86712)
flops_op_dict[1][1] = (17.68624, 17.780848)
flops_op_dict[1][2] = (13.01288, 13.87416)
flops_op_dict[1][3] = (19.492576, 20.791408)
flops_op_dict[1][4] = (14.819216, 16.88472)
flops_op_dict[1][5] = (22.20208, 25.307248)
flops_op_dict[2][0] = (8.198, 10.99632)
flops_op_dict[2][1] = (12.292848, 16.5172)
flops_op_dict[2][2] = (8.69976, 11.99984)
flops_op_dict[2][3] = (13.045488, 18.02248)
flops_op_dict[2][4] = (9.4524, 13.50512)
flops_op_dict[2][5] = (14.174448, 20.2804)
flops_op_dict[3][0] = (12.006112, 15.61632)
flops_op_dict[3][1] = (18.028752, 23.46096)
flops_op_dict[3][2] = (13.009632, 16.820544)
flops_op_dict[3][3] = (19.534032, 25.267296)
flops_op_dict[3][4] = (14.514912, 18.62688)
flops_op_dict[3][5] = (21.791952, 27.9768)
flops_op_dict[4][0] = (11.307456, 15.292416)
flops_op_dict[4][1] = (17.007072, 23.1504)
flops_op_dict[4][2] = (11.608512, 15.894528)
flops_op_dict[4][3] = (17.458656, 24.053568)
flops_op_dict[4][4] = (12.060096, 16.797696)
flops_op_dict[4][5] = (18.136032, 25.40832)
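
# Lookup sketch: flops_op_dict[stage][op] is a (stride-1, stride-2) MFLOPs pair, so
# e.g. flops_op_dict[2][4] == (9.4524, 13.50512) means operation 4 in stage 2 costs
# about 9.45 MFLOPs at stride 1 and 13.51 MFLOPs at stride 2.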

+ 47
- 0
dubhe-tadl/cream/lib/utils/search_structure_supernet.py View File

@@ -0,0 +1,47 @@
# Copyright (c) Microsoft Corporation.
# Licensed under the MIT License.
# Written by Hao Du and Houwen Peng
# email: haodu8-c@my.cityu.edu.hk and houwen.peng@microsoft.com


def search_for_layer(flops_op_dict, arch_def, flops_minimum, flops_maximum):
sta_num = [1, 1, 1, 1, 1]
order = [2, 3, 4, 1, 0, 2, 3, 4, 1, 0]
limits = [3, 3, 3, 2, 2, 4, 4, 4, 4, 4]
size_factor = 224 // 32
base_min_flops = sum([flops_op_dict[i][0][0] for i in range(5)])
base_max_flops = sum([flops_op_dict[i][5][0] for i in range(5)])

if base_min_flops > flops_maximum:
while base_min_flops > flops_maximum and size_factor >= 2:
size_factor = size_factor - 1
flops_minimum = flops_minimum * (7. / size_factor)
flops_maximum = flops_maximum * (7. / size_factor)
if size_factor < 2:
return None, None, None
elif base_max_flops < flops_minimum:
cur_ptr = 0
while base_max_flops < flops_minimum and cur_ptr <= 9:
if sta_num[order[cur_ptr]] >= limits[cur_ptr]:
cur_ptr += 1
continue
base_max_flops = base_max_flops + flops_op_dict[order[cur_ptr]][5][1]
sta_num[order[cur_ptr]] += 1
if cur_ptr > 7 and base_max_flops < flops_minimum:
return None, None, None

cur_ptr = 0
while cur_ptr <= 9:
if sta_num[order[cur_ptr]] >= limits[cur_ptr]:
cur_ptr += 1
continue
base_max_flops = base_max_flops + flops_op_dict[order[cur_ptr]][5][1]
if base_max_flops <= flops_maximum:
sta_num[order[cur_ptr]] += 1
else:
break

arch_def = [item[:i] for i, item in zip([1] + sta_num + [1], arch_def)]
# print(arch_def)

return sta_num, arch_def, size_factor * 32
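
# Call sketch (arguments as used by gen_supernet in lib/models/structures/supernet.py,
# which is an assumption about the caller):
#   sta_num, arch_def, resolution = search_for_layer(
#       flops_op_dict, arch_def, flops_minimum=0, flops_maximum=200)
# sta_num is the number of blocks kept per searchable stage, arch_def is the block
# definition list truncated to those counts, and resolution is the input size
# (size_factor * 32, i.e. 224 when no shrinking was needed).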

+ 178
- 0
dubhe-tadl/cream/lib/utils/util.py View File

@@ -0,0 +1,178 @@
# Copyright (c) Microsoft Corporation.
# Licensed under the MIT License.
# Written by Hao Du and Houwen Peng
# email: haodu8-c@my.cityu.edu.hk and houwen.peng@microsoft.com

import sys
import torch
import logging
import argparse
import torch.nn as nn

from copy import deepcopy
from torch import optim as optim
from thop import profile, clever_format

from timm.utils import *

from ..config import cfg


def get_path_acc(model, path, val_loader, args, val_iters=50):
prec1_m = AverageMeter()
prec5_m = AverageMeter()
with torch.no_grad():
for batch_idx, (input, target) in enumerate(val_loader):
if batch_idx >= val_iters:
break
if not args.prefetcher:
input = input.cuda()
target = target.cuda()

output = model(input, path)
if isinstance(output, (tuple, list)):
output = output[0]

# augmentation reduction
reduce_factor = args.tta
if reduce_factor > 1:
output = output.unfold(
0,
reduce_factor,
reduce_factor).mean(
dim=2)
target = target[0:target.size(0):reduce_factor]

prec1, prec5 = accuracy(output, target, topk=(1, 5))

torch.cuda.synchronize()

prec1_m.update(prec1.item(), output.size(0))
prec5_m.update(prec5.item(), output.size(0))

return (prec1_m.avg, prec5_m.avg)


def get_logger(file_path):
""" Make python logger """
log_format = '%(asctime)s | %(message)s'
logging.basicConfig(stream=sys.stdout, level=logging.INFO,
format=log_format, datefmt='%m/%d %I:%M:%S %p')
logger = logging.getLogger()
logger.setLevel(logging.INFO)
formatter = logging.Formatter(log_format, datefmt='%m/%d %I:%M:%S %p')
file_handler = logging.FileHandler(file_path)
file_handler.setFormatter(formatter)

logger.addHandler(file_handler)
return logger


def add_weight_decay_supernet(model, args, weight_decay=1e-5, skip_list=()):
decay = []
no_decay = []
meta_layer_no_decay = []
meta_layer_decay = []
for name, param in model.named_parameters():
if not param.requires_grad:
continue # frozen weights
if len(param.shape) == 1 or name.endswith(
".bias") or name in skip_list:
if 'meta_layer' in name:
meta_layer_no_decay.append(param)
else:
no_decay.append(param)
else:
if 'meta_layer' in name:
meta_layer_decay.append(param)
else:
decay.append(param)
return [
{'params': no_decay, 'weight_decay': 0., 'lr': args.lr},
{'params': decay, 'weight_decay': weight_decay, 'lr': args.lr},
{'params': meta_layer_no_decay, 'weight_decay': 0., 'lr': args.meta_lr},
{'params': meta_layer_decay, 'weight_decay': weight_decay, 'lr': args.meta_lr},
]


def create_optimizer_supernet(args, model, has_apex=False, filter_bias_and_bn=True):
weight_decay = args.weight_decay
if args.opt == 'adamw' or args.opt == 'radam':
weight_decay /= args.lr
if weight_decay and filter_bias_and_bn:
parameters = add_weight_decay_supernet(model, args, weight_decay)
weight_decay = 0.
else:
parameters = model.parameters()

if 'fused' == args.opt:
assert has_apex and torch.cuda.is_available(
), 'APEX and CUDA required for fused optimizers'

if args.opt == 'sgd' or args.opt == 'nesterov':
optimizer = optim.SGD(
parameters,
momentum=args.momentum,
weight_decay=weight_decay,
nesterov=True)
elif args.opt == 'momentum':
optimizer = optim.SGD(
parameters,
momentum=args.momentum,
weight_decay=weight_decay,
nesterov=False)
elif args.opt == 'adam':
optimizer = optim.Adam(
parameters, weight_decay=weight_decay, eps=args.opt_eps)
else:
raise ValueError('Invalid optimizer: {}'.format(args.opt))

return optimizer


def convert_lowercase(cfg):
keys = cfg.keys()
lowercase_keys = [key.lower() for key in keys]
values = [cfg.get(key) for key in keys]
for lowercase_key, value in zip(lowercase_keys, values):
cfg.setdefault(lowercase_key, value)
return cfg
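
# Example (sketch): for a dict-like cfg with cfg['LR'] == 0.01, the call above adds a
# lowercase alias so that cfg['lr'] == 0.01 while the original 'LR' entry is kept.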

#
# def parse_config_args(exp_name):
# parser = argparse.ArgumentParser(description=exp_name)
# parser.add_argument(
# '--cfg',
# type=str,
# default='../experiments/workspace/retrain/retrain.yaml',
# help='configuration of cream')
# parser.add_argument('--local_rank', type=int, default=0,
# help='local_rank')
# args = parser.parse_args()
#
# cfg.merge_from_file(args.cfg)
# converted_cfg = convert_lowercase(cfg)
#
# return args, converted_cfg


def get_model_flops_params(model, input_size=(1, 3, 224, 224)):
input = torch.randn(input_size)
macs, params = profile(deepcopy(model), inputs=(input,), verbose=False)
macs, params = clever_format([macs, params], "%.3f")
return macs, params


def cross_entropy_loss_with_soft_target(pred, soft_target):
logsoftmax = nn.LogSoftmax(dim=1)
return torch.mean(torch.sum(- soft_target * logsoftmax(pred), 1))
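
# The quantity computed above (a sketch of the math, used when distilling from teacher
# soft labels): loss = mean_over_batch(-sum_over_classes(soft_target * log_softmax(pred))),
# which reduces to the ordinary cross entropy when soft_target is one-hot.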


def create_supernet_scheduler(optimizer, epochs, num_gpu, batch_size, lr):
ITERS = epochs * \
(1280000 / (num_gpu * batch_size))
lr_scheduler = torch.optim.lr_scheduler.LambdaLR(optimizer, lambda step: (
lr - step / ITERS) if step <= ITERS else 0, last_epoch=-1)
return lr_scheduler, epochs

+ 6
- 0
dubhe-tadl/cream/pretrained/README.md View File

@@ -0,0 +1,6 @@
## Pretrained models

The official 14M/43M/114M/287M/481M/604M pretrained models are available on
[Google Drive](https://drive.google.com/drive/folders/1CQjyBryZ4F20Rutj7coF8HWFcedApUn2) or
[Baidu Disk (password: wqw6)](https://pan.baidu.com/s/1TqQNm2s14oEdyNPimw3T9g).


+ 462
- 0
dubhe-tadl/cream/retrainer.py View File

@@ -0,0 +1,462 @@
# Copyright (c) Microsoft Corporation.
# Licensed under the MIT License.
# Written by Hao Du and Houwen Peng
# email: haodu8-c@my.cityu.edu.hk and houwen.peng@microsoft.com
import sys

sys.path.append('../..')
import os
import json
import time
import timm
import torch
import numpy as np
import torch.nn as nn

from argparse import ArgumentParser
# from torch.utils.tensorboard import SummaryWriter

# import timm packages
from timm.optim import create_optimizer
from timm.models import resume_checkpoint
from timm.scheduler import create_scheduler
from timm.data import Dataset, create_loader
from timm.utils import CheckpointSaver, ModelEma, update_summary
from timm.loss import LabelSmoothingCrossEntropy

# import apex as distributed package
try:
from apex import amp
from apex.parallel import DistributedDataParallel as DDP
from apex.parallel import convert_syncbn_model

HAS_APEX = True
except ImportError as e:
print(e)
from torch.nn.parallel import DistributedDataParallel as DDP

HAS_APEX = False

# import models and training functions
from pytorch.utils import mkdirs, save_best_checkpoint, str2bool
from lib.core.test import validate
from lib.core.retrain import train_epoch
from lib.models.structures.childnet import gen_childnet
from lib.utils.util import get_logger, get_model_flops_params
from lib.config import DEFAULT_CROP_PCT, IMAGENET_DEFAULT_MEAN, IMAGENET_DEFAULT_STD


def parse_args():
"""See lib.utils.config"""
parser = ArgumentParser()

# path
parser.add_argument("--best_checkpoint_dir", type=str, default='./output/best_checkpoint/')
parser.add_argument("--checkpoint_dir", type=str, default='./output/checkpoints/')
parser.add_argument("--data_dir", type=str, default='./data')
parser.add_argument("--experiment_dir", type=str, default='./')
parser.add_argument("--model_name", type=str, default='retrainer')
parser.add_argument("--log_path", type=str, default='output/log')
parser.add_argument("--result_path", type=str, default='output/result.json')
parser.add_argument("--best_selected_space_path", type=str,
default='output/selected_space.json')

# int
parser.add_argument("--acc_gap", type=int, default=5)
parser.add_argument("--batch_size", type=int, default=32)
parser.add_argument("--cooldown_epochs", type=int, default=10)
parser.add_argument("--decay_epochs", type=int, default=10)
parser.add_argument("--epochs", type=int, default=200)
parser.add_argument("--flops_minimum", type=int, default=0)
parser.add_argument("--flops_maximum", type=int, default=200)
parser.add_argument("--image_size", type=int, default=224)
parser.add_argument("--local_rank", type=int, default=0)
parser.add_argument("--log_interval", type=int, default=50)
parser.add_argument("--meta_sta_epoch", type=int, default=20)
parser.add_argument("--num_classes", type=int, default=1000)
parser.add_argument("--num_gpu", type=int, default=1)
parser.add_argument("--parience_epochs", type=int, default=10)
parser.add_argument("--pool_size", type=int, default=10)
parser.add_argument("--recovery_interval", type=int, default=10)
parser.add_argument("--trial_id", type=int, default=42)
parser.add_argument("--selection", type=int, default=-1)
parser.add_argument("--slice_num", type=int, default=4)
parser.add_argument("--tta", type=int, default=0)
parser.add_argument("--update_iter", type=int, default=1300)
parser.add_argument("--val_batch_mul", type=int, default=4)
parser.add_argument("--warmup_epochs", type=int, default=3)
parser.add_argument("--workers", type=int, default=4)

# float
parser.add_argument("--color_jitter", type=float, default=0.4)
parser.add_argument("--decay_rate", type=float, default=0.1)
parser.add_argument("--dropout_rate", type=float, default=0.0)
parser.add_argument("--ema_decay", type=float, default=0.998)
parser.add_argument("--lr", type=float, default=1e-2)
parser.add_argument("--meta_lr", type=float, default=1e-4)
parser.add_argument("--re_prob", type=float, default=0.2)
parser.add_argument("--opt_eps", type=float, default=1e-2)
parser.add_argument("--momentum", type=float, default=0.9)
parser.add_argument("--min_lr", type=float, default=1e-5)
parser.add_argument("--smoothing", type=float, default=0.1)
parser.add_argument("--weight_decay", type=float, default=1e-4)
parser.add_argument("--warmup_lr", type=float, default=1e-4)

# bool
parser.add_argument("--auto_resume", type=str2bool, default='False')
parser.add_argument("--dil_conv", type=str2bool, default='False')
parser.add_argument("--ema_cpu", type=str2bool, default='False')
parser.add_argument("--pin_mem", type=str2bool, default='True')
parser.add_argument("--resunit", type=str2bool, default='False')
parser.add_argument("--save_images", type=str2bool, default='False')
parser.add_argument("--sync_bn", type=str2bool, default='False')
parser.add_argument("--use_ema", type=str2bool, default='False')
parser.add_argument("--verbose", type=str2bool, default='False')

# str
parser.add_argument("--aa", type=str, default='rand-m9-mstd0.5')
parser.add_argument("--eval_metrics", type=str, default='prec1')
# gp: type of global pool ["avg", "max", "avgmax", "avgmaxc"]
parser.add_argument("--gp", type=str, default='avg')
parser.add_argument("--interpolation", type=str, default='bilinear')
parser.add_argument("--opt", type=str, default='sgd')
parser.add_argument("--pick_method", type=str, default='meta')
parser.add_argument("--re_mode", type=str, default='pixel')
parser.add_argument("--sched", type=str, default='sgd')

args = parser.parse_args()
args.sync_bn = False
args.verbose = False
args.data_dir = args.data_dir + "/imagenet"
return args


def main():
args = parse_args()

mkdirs(args.checkpoint_dir + "/",
args.experiment_dir,
args.best_selected_space_path,
args.result_path)
with open(args.result_path, "w") as ss_file:
ss_file.write('')

if len(args.checkpoint_dir) > 1:
mkdirs(args.best_checkpoint_dir + "/")

args.checkpoint_dir = os.path.join(
args.checkpoint_dir,
"{}_{}".format(args.model_name, time.strftime("%Y-%m-%d_%H:%M:%S", time.localtime()))
)
if not os.path.exists(args.checkpoint_dir):
os.mkdir(args.checkpoint_dir)

# resolve logging
if args.local_rank == 0:
logger = get_logger(args.log_path)
writer = None # SummaryWriter(os.path.join(output_dir, 'runs'))
else:
writer, logger = None, None

# retrain model selection

if args.selection == -1:
if os.path.exists(args.best_selected_space_path):
with open(args.best_selected_space_path, "r") as f:
arch_list = json.load(f)['selected_space']
else:
args.selection = 14
logger.warning("args.best_selected_space_path is not exist. Set selection to 14.")

if args.selection == 481:
arch_list = [
[0], [
3, 4, 3, 1], [
3, 2, 3, 0], [
3, 3, 3, 1], [
3, 3, 3, 3], [
3, 3, 3, 3], [0]]
args.image_size = 224
elif args.selection == 43:
arch_list = [[0], [3], [3, 1], [3, 1], [3, 3, 3], [3, 3], [0]]
args.image_size = 96
elif args.selection == 14:
arch_list = [[0], [3], [3, 3], [3, 3], [3], [3], [0]]
args.image_size = 64
elif args.selection == 112:
arch_list = [[0], [3], [3, 3], [3, 3], [3, 3, 3], [3, 3], [0]]
args.image_size = 160
elif args.selection == 287:
arch_list = [[0], [3], [3, 3], [3, 1, 3], [3, 3, 3, 3], [3, 3, 3], [0]]
args.image_size = 224
elif args.selection == 604:
arch_list = [
[0], [
3, 3, 2, 3, 3], [
3, 2, 3, 2, 3], [
3, 2, 3, 2, 3], [
3, 3, 2, 2, 3, 3], [
3, 3, 2, 3, 3, 3], [0]]
args.image_size = 224
elif args.selection == -1:
args.image_size = 224
else:
raise ValueError("Model Retrain Selection is not Supported!")

print(arch_list)
# define childnet architecture from arch_list
stem = ['ds_r1_k3_s1_e1_c16_se0.25', 'cn_r1_k1_s1_c320_se0.25']

# TODO: this param from NNI is different from microsoft/Cream.
choice_block_pool = ['ir_r1_k3_s2_e4_c24_se0.25',
'ir_r1_k5_s2_e4_c40_se0.25',
'ir_r1_k3_s2_e6_c80_se0.25',
'ir_r1_k3_s1_e6_c96_se0.25',
'ir_r1_k5_s2_e6_c192_se0.25']
arch_def = [[stem[0]]] + [[choice_block_pool[idx]
for repeat_times in range(len(arch_list[idx + 1]))]
for idx in range(len(choice_block_pool))] + [[stem[1]]]

# generate childnet
model = gen_childnet(
arch_list,
arch_def,
num_classes=args.num_classes,
drop_rate=args.dropout_rate,
global_pool=args.gp)

# initialize distributed parameters
distributed = args.num_gpu > 1
torch.cuda.set_device(args.local_rank)
if args.local_rank == 0:
logger.info(
'Training on Process {} with {} GPUs.'.format(
args.local_rank, args.num_gpu))

# fix random seeds
torch.manual_seed(args.trial_id)
torch.cuda.manual_seed_all(args.trial_id)
np.random.seed(args.trial_id)
torch.backends.cudnn.deterministic = True
torch.backends.cudnn.benchmark = False

# get parameters and FLOPs of model
if args.local_rank == 0:
macs, params = get_model_flops_params(model, input_size=(
1, 3, args.image_size, args.image_size))
logger.info(
'[Model-{}] Flops: {} Params: {}'.format(args.selection, macs, params))

# create optimizer
model = model.cuda()
optimizer = create_optimizer(args, model)

# optionally resume from a checkpoint
resume_epoch = None
if args.auto_resume:
if int(timm.__version__[2]) >= 3:
resume_epoch = resume_checkpoint(model, args.experiment_dir, optimizer)
else:
resume_state, resume_epoch = resume_checkpoint(model, args.experiment_dir)
optimizer.load_state_dict(resume_state['optimizer'])
del resume_state

model_ema = None
if args.use_ema:
model_ema = ModelEma(
model,
decay=args.ema_decay,
device='cpu' if args.ema_cpu else '',
resume=args.experiment_dir if args.auto_resume else None)

# initialize training parameters
eval_metric = args.eval_metrics
best_metric, best_epoch, saver = None, None, None
if args.local_rank == 0:
decreasing = True if eval_metric == 'loss' else False
if int(timm.__version__[2]) >= 3:
saver = CheckpointSaver(model, optimizer,
checkpoint_dir=args.checkpoint_dir,
recovery_dir=args.checkpoint_dir,
model_ema=model_ema,
decreasing=decreasing,
max_history=2)
else:
saver = CheckpointSaver(
checkpoint_dir=args.checkpoint_dir,
recovery_dir=args.checkpoint_dir,
decreasing=decreasing,
max_history=2)

if distributed:
torch.distributed.init_process_group(backend='nccl', init_method='env://')

if args.sync_bn:
try:
if HAS_APEX:
model = convert_syncbn_model(model)
else:
model = torch.nn.SyncBatchNorm.convert_sync_batchnorm(model)
if args.local_rank == 0:
logger.info('Converted model to use Synchronized BatchNorm.')
except Exception as e:
if args.local_rank == 0:
logger.error(
'Failed to enable Synchronized BatchNorm. '
'Install Apex or Torch >= 1.1 with exception {}'.format(e))
if HAS_APEX:
model = DDP(model, delay_allreduce=True)
else:
if args.local_rank == 0:
logger.info(
"Using torch DistributedDataParallel. Install NVIDIA Apex for Apex DDP.")
# can use device str in Torch >= 1.1
model = DDP(model, device_ids=[args.local_rank], find_unused_parameters=True)

# imagenet train dataset
train_dir = os.path.join(args.data_dir, 'train')
if not os.path.exists(train_dir) and args.local_rank == 0:
logger.error('Training folder does not exist at: {}'.format(train_dir))
exit(1)
dataset_train = Dataset(train_dir)
loader_train = create_loader(
dataset_train,
input_size=(3, args.image_size, args.image_size),
batch_size=args.batch_size,
is_training=True,
color_jitter=args.color_jitter,
auto_augment=args.aa,
num_aug_splits=0,
crop_pct=DEFAULT_CROP_PCT,
mean=IMAGENET_DEFAULT_MEAN,
std=IMAGENET_DEFAULT_STD,
num_workers=args.workers,
distributed=distributed,
collate_fn=None,
pin_memory=args.pin_mem,
interpolation='random',
re_mode=args.re_mode,
re_prob=args.re_prob
)

# imagenet validation dataset
eval_dir = os.path.join(args.data_dir, 'val')
if not os.path.exists(eval_dir) and args.local_rank == 0:
logger.error(
'Validation folder does not exist at: {}'.format(eval_dir))
exit(1)
dataset_eval = Dataset(eval_dir)
loader_eval = create_loader(
dataset_eval,
input_size=(3, args.image_size, args.image_size),
batch_size=args.val_batch_mul * args.batch_size,
is_training=False,
interpolation=args.interpolation,
crop_pct=DEFAULT_CROP_PCT,
mean=IMAGENET_DEFAULT_MEAN,
std=IMAGENET_DEFAULT_STD,
num_workers=args.workers,
distributed=distributed,
pin_memory=args.pin_mem
)

# whether to use label smoothing
if args.smoothing > 0.:
train_loss_fn = LabelSmoothingCrossEntropy(
smoothing=args.smoothing).cuda()
validate_loss_fn = nn.CrossEntropyLoss().cuda()
else:
train_loss_fn = nn.CrossEntropyLoss().cuda()
validate_loss_fn = train_loss_fn

# create learning rate scheduler
lr_scheduler, num_epochs = create_scheduler(args, optimizer)
start_epoch = resume_epoch if resume_epoch is not None else 0
if start_epoch > 0:
lr_scheduler.step(start_epoch)
if args.local_rank == 0:
logger.info('Scheduled epochs: {}'.format(num_epochs))

try:
best_record, best_ep = 0, 0
for epoch in range(start_epoch, num_epochs):
if distributed:
loader_train.sampler.set_epoch(epoch)

train_metrics = train_epoch(
epoch,
model,
loader_train,
optimizer,
train_loss_fn,
args,
lr_scheduler=lr_scheduler,
saver=saver,
output_dir=args.checkpoint_dir,
model_ema=model_ema,
logger=logger,
writer=writer,
local_rank=args.local_rank)

eval_metrics = validate(
epoch,
model,
loader_eval,
validate_loss_fn,
args,
logger=logger,
writer=writer,
local_rank=args.local_rank,
result_path=args.result_path
)

if model_ema is not None and not args.ema_cpu:
ema_eval_metrics = validate(
epoch,
model_ema.ema,
loader_eval,
validate_loss_fn,
args,
log_suffix='_EMA',
logger=logger,
writer=writer,
local_rank=args.local_rank
)
eval_metrics = ema_eval_metrics

if lr_scheduler is not None:
lr_scheduler.step(epoch + 1, eval_metrics[eval_metric])

update_summary(epoch, train_metrics, eval_metrics, os.path.join(
args.checkpoint_dir, 'summary.csv'), write_header=best_metric is None)

if saver is not None:
# save proper checkpoint with eval metric
save_metric = eval_metrics[eval_metric]

if int(timm.__version__[2]) >= 3:
best_metric, best_epoch = saver.save_checkpoint(epoch, metric=save_metric)
else:
best_metric, best_epoch = saver.save_checkpoint(
model, optimizer, args,
epoch=epoch, metric=save_metric)

if best_record < eval_metrics[eval_metric]:
best_record = eval_metrics[eval_metric]
best_ep = epoch

if args.local_rank == 0:
logger.info(
'*** Best metric: {0} (epoch {1})'.format(best_record, best_ep))

except KeyboardInterrupt:
pass

if best_metric is not None:
logger.info(
'*** Best metric: {0} (epoch {1})'.format(best_metric, best_epoch))
save_best_checkpoint(args.best_checkpoint_dir, model, optimizer, epoch)


if __name__ == '__main__':
main()

+ 21
- 0
dubhe-tadl/cream/selector.py View File

@@ -0,0 +1,21 @@
import sys

sys.path.append('../..')
from pytorch.selector import Selector


class ClassicnasSelector(Selector):
def __init__(self, *args, single_candidate=True):
super().__init__(single_candidate)
self.args = args

def fit(self):
"""
Only one candidate, so there is nothing to fit; this method is a no-op.
"""
pass


if __name__ == "__main__":
hpo_selector = ClassicnasSelector()
hpo_selector.fit()

+ 167
- 0
dubhe-tadl/cream/test.py View File

@@ -0,0 +1,167 @@
# Copyright (c) Microsoft Corporation.
# Licensed under the MIT License.
# Written by Hao Du and Houwen Peng
# email: haodu8-c@my.cityu.edu.hk and houwen.peng@microsoft.com

import os
import warnings
import datetime
import torch
import torch.nn as nn

# from torch.utils.tensorboard import SummaryWriter

# import timm packages
from timm.utils import ModelEma
from timm.models import resume_checkpoint
from timm.data import Dataset, create_loader

# import apex as distributed package
try:
from apex.parallel import convert_syncbn_model
from apex.parallel import DistributedDataParallel as DDP

HAS_APEX = True
except ImportError as e:
print(e)
from torch.nn.parallel import DistributedDataParallel as DDP

HAS_APEX = False

# import models and training functions
from lib.core.test import validate
from lib.models.structures.childnet import gen_childnet
from lib.utils.util import parse_config_args, get_logger, get_model_flops_params
from lib.config import DEFAULT_CROP_PCT, IMAGENET_DEFAULT_MEAN, IMAGENET_DEFAULT_STD


def main():
args, cfg = parse_config_args('child net testing')

# resolve logging
output_dir = os.path.join(cfg.SAVE_PATH,
"{}-{}".format(datetime.date.today().strftime('%m%d'),
cfg.MODEL))
if not os.path.exists(output_dir):
os.mkdir(output_dir)

if args.local_rank == 0:
logger = get_logger(os.path.join(output_dir, 'test.log'))
writer = None # SummaryWriter(os.path.join(output_dir, 'runs'))
else:
writer, logger = None, None

# retrain model selection
if cfg.NET.SELECTION == 481:
arch_list = [
[0], [
3, 4, 3, 1], [
3, 2, 3, 0], [
3, 3, 3, 1], [
3, 3, 3, 3], [
3, 3, 3, 3], [0]]
cfg.DATASET.IMAGE_SIZE = 224
elif cfg.NET.SELECTION == 43:
arch_list = [[0], [3], [3, 1], [3, 1], [3, 3, 3], [3, 3], [0]]
cfg.DATASET.IMAGE_SIZE = 96
elif cfg.NET.SELECTION == 14:
arch_list = [[0], [3], [3, 3], [3, 3], [3], [3], [0]]
cfg.DATASET.IMAGE_SIZE = 64
elif cfg.NET.SELECTION == 112:
arch_list = [[0], [3], [3, 3], [3, 3], [3, 3, 3], [3, 3], [0]]
cfg.DATASET.IMAGE_SIZE = 160
elif cfg.NET.SELECTION == 287:
arch_list = [[0], [3], [3, 3], [3, 1, 3], [3, 3, 3, 3], [3, 3, 3], [0]]
cfg.DATASET.IMAGE_SIZE = 224
elif cfg.NET.SELECTION == 604:
arch_list = [[0], [3, 3, 2, 3, 3], [3, 2, 3, 2, 3], [3, 2, 3, 2, 3],
[3, 3, 2, 2, 3, 3], [3, 3, 2, 3, 3, 3], [0]]
cfg.DATASET.IMAGE_SIZE = 224
else:
raise ValueError("Model Test Selection is not Supported!")

# define childnet architecture from arch_list
stem = ['ds_r1_k3_s1_e1_c16_se0.25', 'cn_r1_k1_s1_c320_se0.25']
# TODO: this param from NNI is different from microsoft/Cream.
choice_block_pool = ['ir_r1_k3_s2_e4_c24_se0.25',
'ir_r1_k5_s2_e4_c40_se0.25',
'ir_r1_k3_s2_e6_c80_se0.25',
'ir_r1_k3_s1_e6_c96_se0.25',
'ir_r1_k5_s2_e6_c192_se0.25']
arch_def = [[stem[0]]] + [[choice_block_pool[idx]
for repeat_times in range(len(arch_list[idx + 1]))]
for idx in range(len(choice_block_pool))] + [[stem[1]]]

# generate childnet
model = gen_childnet(
arch_list,
arch_def,
num_classes=cfg.DATASET.NUM_CLASSES,
drop_rate=cfg.NET.DROPOUT_RATE,
global_pool=cfg.NET.GP)

if args.local_rank == 0:
macs, params = get_model_flops_params(model, input_size=(
1, 3, cfg.DATASET.IMAGE_SIZE, cfg.DATASET.IMAGE_SIZE))
logger.info(
'[Model-{}] Flops: {} Params: {}'.format(cfg.NET.SELECTION, macs, params))

# initialize distributed parameters
torch.cuda.set_device(args.local_rank)
torch.distributed.init_process_group(backend='nccl', init_method='env://')
if args.local_rank == 0:
logger.info(
"Training on Process {} with {} GPUs.".format(
args.local_rank, cfg.NUM_GPU))

# resume model from checkpoint
assert cfg.AUTO_RESUME is True and os.path.exists(cfg.RESUME_PATH)
resume_checkpoint(model, cfg.RESUME_PATH)

model = model.cuda()

model_ema = None
if cfg.NET.EMA.USE:
# Important to create EMA model after cuda(), DP wrapper, and AMP but
# before SyncBN and DDP wrapper
model_ema = ModelEma(
model,
decay=cfg.NET.EMA.DECAY,
device='cpu' if cfg.NET.EMA.FORCE_CPU else '',
resume=cfg.RESUME_PATH)

# imagenet validation dataset
eval_dir = os.path.join(cfg.DATA_DIR, 'val')
if not os.path.exists(eval_dir) and args.local_rank == 0:
logger.error(
'Validation folder does not exist at: {}'.format(eval_dir))
exit(1)

dataset_eval = Dataset(eval_dir)
loader_eval = create_loader(
dataset_eval,
input_size=(3, cfg.DATASET.IMAGE_SIZE, cfg.DATASET.IMAGE_SIZE),
batch_size=cfg.DATASET.VAL_BATCH_MUL * cfg.DATASET.BATCH_SIZE,
is_training=False,
num_workers=cfg.WORKERS,
distributed=True,
pin_memory=cfg.DATASET.PIN_MEM,
crop_pct=DEFAULT_CROP_PCT,
mean=IMAGENET_DEFAULT_MEAN,
std=IMAGENET_DEFAULT_STD
)

# only test accuracy of model-EMA
validate_loss_fn = nn.CrossEntropyLoss().cuda()
validate(0, model, loader_eval, validate_loss_fn, cfg,
log_suffix='_EMA', logger=logger,
writer=writer, local_rank=args.local_rank)

if cfg.NET.EMA.USE:
validate(0, model_ema.ema, loader_eval, validate_loss_fn, cfg,
log_suffix='_EMA', logger=logger,
writer=writer, local_rank=args.local_rank)


if __name__ == '__main__':
main()

+ 312
- 0
dubhe-tadl/cream/trainer.py View File

@@ -0,0 +1,312 @@
# https://github.com/microsoft/nni/blob/v2.0/examples/nas/cream/train.py
import sys

sys.path.append('../..')

import os
import sys
import time
import json
import torch
import numpy as np
import torch.nn as nn

from argparse import ArgumentParser

# import timm packages
from timm.loss import LabelSmoothingCrossEntropy
from timm.data import Dataset, create_loader
from timm.models import resume_checkpoint

# import apex as distributed package
# try:
# from apex.parallel import DistributedDataParallel as DDP
# from apex.parallel import convert_syncbn_model
#
# USE_APEX = True
# except ImportError as e:
# print(e)
# from torch.nn.parallel import DistributedDataParallel as DDP
#
# USE_APEX = False

# import models and training functions
from lib.utils.flops_table import FlopsEst
from lib.models.structures.supernet import gen_supernet
from lib.config import DEFAULT_CROP_PCT, IMAGENET_DEFAULT_STD, IMAGENET_DEFAULT_MEAN
from lib.utils.util import get_logger, \
create_optimizer_supernet, create_supernet_scheduler

from pytorch.utils import mkdirs, str2bool
from pytorch.callbacks import LRSchedulerCallback
from pytorch.callbacks import ModelCheckpoint
from algorithms import CreamSupernetTrainer
from algorithms import RandomMutator


def parse_args():
"""See lib.utils.config"""
parser = ArgumentParser()

# path
parser.add_argument("--checkpoint_dir", type=str, default='')
parser.add_argument("--data_dir", type=str, default='./data')
parser.add_argument("--experiment_dir", type=str, default='./')
parser.add_argument("--model_name", type=str, default='trainer')
parser.add_argument("--log_path", type=str, default='output/log')
parser.add_argument("--result_path", type=str, default='output/result.json')
parser.add_argument("--search_space_path", type=str, default='output/search_space.json')
parser.add_argument("--best_selected_space_path", type=str,
default='output/selected_space.json')

# int
parser.add_argument("--acc_gap", type=int, default=5)
parser.add_argument("--batch_size", type=int, default=1)
parser.add_argument("--epochs", type=int, default=200)
parser.add_argument("--flops_minimum", type=int, default=0)
parser.add_argument("--flops_maximum", type=int, default=200)
parser.add_argument("--image_size", type=int, default=224)
parser.add_argument("--local_rank", type=int, default=0)
parser.add_argument("--log_interval", type=int, default=50)
parser.add_argument("--meta_sta_epoch", type=int, default=20)
parser.add_argument("--num_classes", type=int, default=1000)
parser.add_argument("--num_gpu", type=int, default=1)
parser.add_argument("--pool_size", type=int, default=10)
parser.add_argument("--trial_id", type=int, default=42)
parser.add_argument("--slice_num", type=int, default=4)
parser.add_argument("--tta", type=int, default=0)
parser.add_argument("--update_iter", type=int, default=1300)
parser.add_argument("--workers", type=int, default=4)

# float
parser.add_argument("--color_jitter", type=float, default=0.4)
parser.add_argument("--dropout_rate", type=float, default=0.0)
parser.add_argument("--lr", type=float, default=1e-2)
parser.add_argument("--meta_lr", type=float, default=1e-4)
parser.add_argument("--opt_eps", type=float, default=1e-2)
parser.add_argument("--re_prob", type=float, default=0.2)
parser.add_argument("--momentum", type=float, default=0.9)
parser.add_argument("--smoothing", type=float, default=0.1)
parser.add_argument("--weight_decay", type=float, default=1e-4)

# bool
parser.add_argument("--auto_resume", type=str2bool, default='False')
parser.add_argument("--dil_conv", type=str2bool, default='False')
parser.add_argument("--resunit", type=str2bool, default='False')
parser.add_argument("--sync_bn", type=str2bool, default='False')
parser.add_argument("--verbose", type=str2bool, default='False')

# str
# gp: type of global pool ["avg", "max", "avgmax", "avgmaxc"]
parser.add_argument("--gp", type=str, default='avg')
parser.add_argument("--interpolation", type=str, default='bilinear')
parser.add_argument("--opt", type=str, default='sgd')
parser.add_argument("--pick_method", type=str, default='meta')
parser.add_argument("--re_mode", type=str, default='pixel')

args = parser.parse_args()
args.sync_bn = False
args.verbose = False
args.data_dir = args.data_dir + "/imagenet"
return args


def main():
args = parse_args()

mkdirs(args.experiment_dir,
args.best_selected_space_path,
args.search_space_path,
args.result_path,
args.log_path)

with open(args.result_path, "w") as ss_file:
ss_file.write('')

# resolve logging

if len(args.checkpoint_dir) > 1:
mkdirs(args.checkpoint_dir + "/")
args.checkpoint_dir = os.path.join(
args.checkpoint_dir,
"{}_{}".format(args.model_name, time.strftime("%Y-%m-%d_%H:%M:%S", time.localtime()))
)
if not os.path.exists(args.checkpoint_dir):
os.mkdir(args.checkpoint_dir)

if args.local_rank == 0:
logger = get_logger(args.log_path)
else:
logger = None

# initialize distributed parameters
torch.cuda.set_device(args.local_rank)
# torch.distributed.init_process_group(backend='nccl', init_method='env://')
if args.local_rank == 0:
logger.info(
'Training on Process %d with %d GPUs.',
args.local_rank, args.num_gpu)

# fix random seeds
torch.manual_seed(args.trial_id)
torch.cuda.manual_seed_all(args.trial_id)
np.random.seed(args.trial_id)
torch.backends.cudnn.deterministic = True
torch.backends.cudnn.benchmark = False

# generate supernet and optimizer
model, sta_num, resolution, search_space = gen_supernet(
flops_minimum=args.flops_minimum,
flops_maximum=args.flops_maximum,
num_classes=args.num_classes,
drop_rate=args.dropout_rate,
global_pool=args.gp,
resunit=args.resunit,
dil_conv=args.dil_conv,
slice=args.slice_num,
verbose=args.verbose,
logger=logger)
optimizer = create_optimizer_supernet(args, model)

# number of choice blocks in supernet
choice_num = len(model.blocks[7])
if args.local_rank == 0:
logger.info('Supernet created, param count: %d', (
sum([m.numel() for m in model.parameters()])))
logger.info('resolution: %d', resolution)
logger.info('choice number: %d', choice_num)
with open(args.search_space_path, "w") as f:
print("dump search space.")
json.dump({'search_space': search_space}, f)

# initialize flops look-up table
model_est = FlopsEst(model)
flops_dict, flops_fixed = model_est.flops_dict, model_est.flops_fixed
model = model.cuda()

# convert model to distributed mode
if args.sync_bn:
try:
# if USE_APEX:
# model = convert_syncbn_model(model)
# else:
model = torch.nn.SyncBatchNorm.convert_sync_batchnorm(model)
if args.local_rank == 0:
logger.info('Converted model to use Synchronized BatchNorm.')
except Exception as exception:
logger.info(
'Failed to enable Synchronized BatchNorm. '
'Install Apex or Torch >= 1.1 with Exception %s', exception)
# if USE_APEX:
# model = DDP(model, delay_allreduce=True)
# else:
# if args.local_rank == 0:
# logger.info(
# "Using torch DistributedDataParallel. Install NVIDIA Apex for Apex DDP.")
# # can use device str in Torch >= 1.1
# model = DDP(model, device_ids=[args.local_rank], find_unused_parameters=True)

# optionally resume from a checkpoint
resume_epoch = None
if False: # args.auto_resume:
checkpoint = torch.load(args.experiment_dir)

model.load_state_dict(checkpoint['child_model_state_dict'])
optimizer.load_state_dict(checkpoint['optimizer_state_dict'])
resume_epoch = checkpoint['epoch']

# create learning rate scheduler
lr_scheduler, num_epochs = create_supernet_scheduler(optimizer, args.epochs, args.num_gpu,
args.batch_size, args.lr)

start_epoch = resume_epoch if resume_epoch is not None else 0
if start_epoch > 0:
lr_scheduler.step(start_epoch)

if args.local_rank == 0:
logger.info('Scheduled epochs: %d', num_epochs)

# imagenet train dataset
train_dir = os.path.join(args.data_dir, 'train')
if not os.path.exists(train_dir):
logger.info('Training folder does not exist at: %s', train_dir)
sys.exit()

dataset_train = Dataset(train_dir)
loader_train = create_loader(
dataset_train,
input_size=(3, args.image_size, args.image_size),
batch_size=args.batch_size,
is_training=True,
use_prefetcher=True,
re_prob=args.re_prob,
re_mode=args.re_mode,
color_jitter=args.color_jitter,
interpolation='random',
num_workers=args.workers,
distributed=False,
collate_fn=None,
crop_pct=DEFAULT_CROP_PCT,
mean=IMAGENET_DEFAULT_MEAN,
std=IMAGENET_DEFAULT_STD
)

# imagenet validation dataset
eval_dir = os.path.join(args.data_dir, 'val')
if not os.path.isdir(eval_dir):
logger.info('Validation folder does not exist at: %s', eval_dir)
sys.exit()
dataset_eval = Dataset(eval_dir)
loader_eval = create_loader(
dataset_eval,
input_size=(3, args.image_size, args.image_size),
batch_size=4 * args.batch_size,
is_training=False,
use_prefetcher=True,
num_workers=args.workers,
distributed=False,
crop_pct=DEFAULT_CROP_PCT,
mean=IMAGENET_DEFAULT_MEAN,
std=IMAGENET_DEFAULT_STD,
interpolation=args.interpolation
)

# whether to use label smoothing
if args.smoothing > 0.:
train_loss_fn = LabelSmoothingCrossEntropy(
smoothing=args.smoothing).cuda()
validate_loss_fn = nn.CrossEntropyLoss().cuda()
else:
train_loss_fn = nn.CrossEntropyLoss().cuda()
validate_loss_fn = train_loss_fn

mutator = RandomMutator(model)

_callbacks = [LRSchedulerCallback(lr_scheduler)]
if len(args.checkpoint_dir) > 1:
_callbacks.append(ModelCheckpoint(args.checkpoint_dir))
trainer = CreamSupernetTrainer(args.best_selected_space_path, model, train_loss_fn,
validate_loss_fn,
optimizer, num_epochs, loader_train, loader_eval,
result_path=args.result_path,
mutator=mutator,
batch_size=args.batch_size,
log_frequency=args.log_interval,
meta_sta_epoch=args.meta_sta_epoch,
update_iter=args.update_iter,
slices=args.slice_num,
pool_size=args.pool_size,
pick_method=args.pick_method,
choice_num=choice_num,
sta_num=sta_num,
acc_gap=args.acc_gap,
flops_dict=flops_dict,
flops_fixed=flops_fixed,
local_rank=args.local_rank,
callbacks=_callbacks)

trainer.train()


if __name__ == '__main__':
main()

+ 2
- 0
dubhe-tadl/darts/__init__.py View File

@@ -0,0 +1,2 @@
from pytorch.darts.dartstrainer import DartsTrainer
from pytorch.darts.dartsmutator import DartsMutator

+ 205
- 0
dubhe-tadl/darts/darts_retrain.py View File

@@ -0,0 +1,205 @@
import sys
sys.path.append('../..')
import os
import logging
import time
from argparse import ArgumentParser

import torch
import torch.nn as nn
# from torch.utils.tensorboard import SummaryWriter

import datasets
import utils
from model import CNN

from pytorch.utils import set_seed, mkdirs, init_logger, save_best_checkpoint, AverageMeter
from pytorch.fixed import apply_fixed_architecture
from pytorch.retrainer import Retrainer

logger = logging.getLogger(__name__)
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
# writer = SummaryWriter()

class DartsRetrainer(Retrainer):
def __init__(self, aux_weight, grad_clip, epochs, log_frequency):
self.aux_weight = aux_weight
self.grad_clip = grad_clip
self.epochs = epochs
self.log_frequency = log_frequency

def train(self, train_loader, model, optimizer, criterion, epoch):
top1 = AverageMeter("top1")
top5 = AverageMeter("top5")
losses = AverageMeter("losses")

cur_step = epoch * len(train_loader)
cur_lr = optimizer.param_groups[0]["lr"]
logger.info("Epoch %d LR %.6f", epoch, cur_lr)
# writer.add_scalar("lr", cur_lr, global_step=cur_step)

model.train()

for step, (x, y) in enumerate(train_loader):
x, y = x.to(device, non_blocking=True), y.to(device, non_blocking=True)
bs = x.size(0)

optimizer.zero_grad()
logits, aux_logits = model(x)
loss = criterion(logits, y)
if self.aux_weight > 0.:
loss += self.aux_weight * criterion(aux_logits, y)
loss.backward()
# gradient clipping
nn.utils.clip_grad_norm_(model.parameters(), self.grad_clip)
optimizer.step()

accuracy = utils.accuracy(logits, y, topk=(1, 5))
losses.update(loss.item(), bs)
top1.update(accuracy["acc1"], bs)
top5.update(accuracy["acc5"], bs)
# writer.add_scalar("loss/train", loss.item(), global_step=cur_step)
# writer.add_scalar("acc1/train", accuracy["acc1"], global_step=cur_step)
# writer.add_scalar("acc5/train", accuracy["acc5"], global_step=cur_step)

if step % self.log_frequency == 0 or step == len(train_loader) - 1:
logger.info(
"Train: [{:3d}/{}] Step {:03d}/{:03d} Loss {losses.avg:.3f} "
"Prec@(1,5) ({top1.avg:.1%}, {top5.avg:.1%})".format(
epoch + 1, self.epochs, step, len(train_loader) - 1, losses=losses,
top1=top1, top5=top5))

cur_step += 1

logger.info("Train: [{:3d}/{}] Final Prec@1 {:.4%}".format(epoch + 1, self.epochs, top1.avg))


def validate(self, valid_loader, model, criterion, epoch, cur_step):
top1 = AverageMeter("top1")
top5 = AverageMeter("top5")
losses = AverageMeter("losses")

model.eval()

with torch.no_grad():
for step, (X, y) in enumerate(valid_loader):
X, y = X.to(device, non_blocking=True), y.to(device, non_blocking=True)
bs = X.size(0)

logits = model(X)
loss = criterion(logits, y)

accuracy = utils.accuracy(logits, y, topk=(1, 5))
losses.update(loss.item(), bs)
top1.update(accuracy["acc1"], bs)
top5.update(accuracy["acc5"], bs)

if step % self.log_frequency == 0 or step == len(valid_loader) - 1:
logger.info(
"Valid: [{:3d}/{}] Step {:03d}/{:03d} Loss {losses.avg:.3f} "
"Prec@(1,5) ({top1.avg:.1%}, {top5.avg:.1%})".format(
epoch + 1, self.epochs, step, len(valid_loader) - 1, losses=losses,
top1=top1, top5=top5))

# writer.add_scalar("loss/test", losses.avg, global_step=cur_step)
# writer.add_scalar("acc1/test", top1.avg, global_step=cur_step)
# writer.add_scalar("acc5/test", top5.avg, global_step=cur_step)

logger.info("Valid: [{:3d}/{}] Final Prec@1 {:.4%}".format(epoch + 1, self.epochs, top1.avg))

return top1.avg

if __name__ == "__main__":
parser = ArgumentParser("DARTS retrain")
parser.add_argument("--data_dir", type=str,
default='./data/', help="search_space json file")
parser.add_argument("--result_path", type=str,
default='.0/result.json', help="training result")
parser.add_argument("--log_path", type=str,
default='.0/log', help="log for info")
parser.add_argument("--best_selected_space_path", type=str,
default='./best_selected_space.json', help="final best selected space")
parser.add_argument("--best_checkpoint_dir", type=str,
default='./', help="default name is best_checkpoint_epoch{}.pth")
parser.add_argument('--trial_id', type=int, default=0, metavar='N',
help='trial_id,start from 0')
parser.add_argument("--layers", default=20, type=int)
parser.add_argument("--lr", default=0.025, type=float)
parser.add_argument("--batch_size", default=128, type=int)
parser.add_argument("--log_frequency", default=10, type=int)
parser.add_argument("--epochs", default=5, type=int)
parser.add_argument("--aux_weight", default=0.4, type=float)
parser.add_argument("--drop_path_prob", default=0.2, type=float)
parser.add_argument("--workers", default=4, type=int)
parser.add_argument("--channels", default=36, type=int)
parser.add_argument("--grad_clip", default=5., type=float)
parser.add_argument("--class_num", default=10, type=int, help="cifar10")
args = parser.parse_args()

mkdirs(args.result_path, args.log_path, args.best_checkpoint_dir)
init_logger(args.log_path)
logger.info(args)
set_seed(args.trial_id)
dataset_train, dataset_valid = datasets.get_dataset("cifar10", cutout_length=16, root=args.data_dir)

model = CNN(32, 3, args.channels, args.class_num, args.layers, auxiliary=True)
apply_fixed_architecture(model, args.best_selected_space_path)
criterion = nn.CrossEntropyLoss()

model.to(device)
criterion.to(device)

optimizer = torch.optim.SGD(model.parameters(), args.lr, momentum=0.9, weight_decay=3.0E-4)
lr_scheduler = torch.optim.lr_scheduler.CosineAnnealingLR(optimizer, args.epochs, eta_min=1E-6)

train_loader = torch.utils.data.DataLoader(dataset_train,
batch_size=args.batch_size,
shuffle=True,
num_workers=args.workers,
pin_memory=True)
valid_loader = torch.utils.data.DataLoader(dataset_valid,
batch_size=args.batch_size,
shuffle=False,
num_workers=args.workers,
pin_memory=True)
retrainer = DartsRetrainer(aux_weight=args.aux_weight,
grad_clip=args.grad_clip,
epochs=args.epochs,
log_frequency = args.log_frequency)

# result = {"Accuracy": [], "Cost_time": ''}
best_top1 = 0.
start_time = time.time()
with open(args.result_path, "w") as file:
file.write('')
for epoch in range(args.epochs):
drop_prob = args.drop_path_prob * epoch / args.epochs
model.drop_path_prob(drop_prob)

# training
retrainer.train(train_loader, model, optimizer, criterion, epoch)

# validation
cur_step = (epoch + 1) * len(train_loader)
top1 = retrainer.validate(valid_loader, model, criterion, epoch, cur_step)
# the backend filters the terminal output for lines like: {"type": "Accuracy", "result": {"sequence": 1, "category": "epoch", "value": 96.7}}
logger.info({"type": "Accuracy", "result": {"sequence": epoch, "category": "epoch", "value": top1}})
with open(args.result_path, "a") as file:
file.write(str({"type": "Accuracy", "result": {"sequence": epoch, "category": "epoch", "value": top1}}) + '\n')
# result["Accuracy"].append(top1)
best_top1 = max(best_top1, top1)

lr_scheduler.step()

logger.info("Final best Prec@1 = {:.4%}".format(best_top1))
cost_time = time.time() - start_time
# the backend filters the terminal output for lines like: {"type": "Cost_time", "result": {"value": "* s"}}
logger.info({"type": "Cost_time", "result": {"value": str(cost_time) + ' s'}})
with open(args.result_path, "a") as file:
file.write(str({"type": "Cost_time", "result": {"value": str(cost_time) + ' s'}}))

# result["Cost_time"] = str(cost_time) + ' s'
# dump_global_result(args.result_path, result)
save_best_checkpoint(args.best_checkpoint_dir, model, optimizer, epoch)
logger.info("Save best checkpoint in {}".format(os.path.join(args.best_checkpoint_dir, "best_checkpoint_epoch{}.pth".format(epoch))))

+ 21
- 0
dubhe-tadl/darts/darts_select.py View File

@@ -0,0 +1,21 @@
import sys
sys.path.append('../..')
from pytorch.selector import Selector
from argparse import ArgumentParser


class DartsSelector(Selector):
def __init__(self, single_candidate=True):
super().__init__(single_candidate)
def fit(self):
pass

if __name__ == "__main__":
parser = ArgumentParser("DARTS select")
parser.add_argument("--best_selected_space_path", type=str,
default='./best_selected_space.json', help="final best selected space")

args = parser.parse_args()
darts_selector = DartsSelector(True)
darts_selector.fit()

+ 85
- 0
dubhe-tadl/darts/darts_train.py View File

@@ -0,0 +1,85 @@
# Copyright (c) Microsoft Corporation.
# Licensed under the MIT license.
import sys
sys.path.append('../..')

import time
from argparse import ArgumentParser

import torch
import torch.nn as nn
import datasets
from model import CNN
from utils import accuracy
from dartstrainer import DartsTrainer
from pytorch.utils import *
from pytorch.callbacks import BestArchitectureCheckpoint, LRSchedulerCallback

logger = logging.getLogger(__name__)

if __name__ == "__main__":
parser = ArgumentParser("DARTS train")
parser.add_argument("--data_dir", type=str,
default='../data/', help="search_space json file")
parser.add_argument("--result_path", type=str,
default='.0/result.json', help="training result")
parser.add_argument("--log_path", type=str,
default='.0/log', help="log for info")
parser.add_argument("--search_space_path", type=str,
default='./search_space.json', help="search space of PDARTS")
parser.add_argument("--best_selected_space_path", type=str,
default='./best_selected_space.json', help="final best selected space")
parser.add_argument('--trial_id', type=int, default=0, metavar='N',
help='trial_id,start from 0')
parser.add_argument("--layers", default=8, type=int)
parser.add_argument("--batch_size", default=64, type=int)
parser.add_argument("--log_frequency", default=10, type=int)
parser.add_argument("--epochs", default=5, type=int)
parser.add_argument("--channels", default=16, type=int)
parser.add_argument('--model_lr', type=float, default=0.025, help='learning rate for training model weights')
parser.add_argument('--arch_lr', type=float, default=3e-4, help='learning rate for training architecture')
parser.add_argument("--unrolled", default=False, action="store_true")
parser.add_argument("--visualization", default=False, action="store_true")
parser.add_argument("--class_num", default=10, type=int, help="cifar10")
args = parser.parse_args()
mkdirs(args.result_path, args.log_path, args.search_space_path, args.best_selected_space_path)
init_logger(args.log_path, "info")
logger.info(args)
set_seed(args.trial_id)
dataset_train, dataset_valid = datasets.get_dataset("cifar10", root=args.data_dir)
model = CNN(32, 3, args.channels, args.class_num, args.layers)
criterion = nn.CrossEntropyLoss()

optim = torch.optim.SGD(model.parameters(), args.model_lr, momentum=0.9, weight_decay=3.0E-4)
lr_scheduler = torch.optim.lr_scheduler.CosineAnnealingLR(optim, args.epochs, eta_min=0.001)

trainer = DartsTrainer(model,
loss=criterion,
metrics=lambda output, target: accuracy(output, target, topk=(1,)),
optimizer=optim,
num_epochs=args.epochs,
dataset_train=dataset_train,
dataset_valid=dataset_valid,
search_space_path = args.search_space_path,
batch_size=args.batch_size,
log_frequency=args.log_frequency,
result_path=args.result_path,
unrolled=args.unrolled,
arch_lr=args.arch_lr,
callbacks=[LRSchedulerCallback(lr_scheduler), BestArchitectureCheckpoint(args.best_selected_space_path, args.epochs)])

if args.visualization:
trainer.enable_visualization()
t1 = time.time()
trainer.train()
# res_json = trainer.result
cost_time = time.time() - t1
# the backend filters the terminal output for lines like: {"type": "Cost_time", "result": {"value": "* s"}}
logger.info({"type": "Cost_time", "result": {"value": str(cost_time) + ' s'}})
with open(args.result_path, "a") as file:
file.write(str({"type": "Cost_time", "result": {"value": str(cost_time) + ' s'}}))
# res_json["Cost_time"] = str(cost_time) + ' s'
# dump_global_result(args.result_path, res_json)

+ 134
- 0
dubhe-tadl/darts/dartsmutator.py View File

@@ -0,0 +1,134 @@
# Copyright (c) Microsoft Corporation.
# Licensed under the MIT license.

import logging

import torch
import torch.nn as nn
import torch.nn.functional as F
from collections import OrderedDict
from pytorch.mutator import Mutator
from pytorch.mutables import LayerChoice, InputChoice

_logger = logging.getLogger(__name__)

class DartsMutator(Mutator):
"""
Connects the model in a DARTS (differentiable) way.

An extra connection is automatically inserted for each LayerChoice, when this connection is selected, there is no
op on this LayerChoice (namely a ``ZeroOp``), in which case, every element in the exported choice list is ``false``
(not chosen).

All input choices are fully connected during the search phase. On export, each input choice selects its inputs
based on the keys in ``choose_from``. If a key refers to a LayerChoice, the top logit of that LayerChoice joins
the competition among the input-choice logits; otherwise the logit is assumed to be 0.

Branches can be pruned by setting the corresponding entry of ``choices`` to ``-inf``; after softmax its weight
becomes 0, and the framework ignores zero weights and does not connect that branch. Note that the gradient at an
``-inf`` position is 0, and since arithmetic on ``-inf`` produces ``nan``, the gradient update phase must be
handled carefully.

Attributes
----------
choices: ParameterDict
dict that maps keys of LayerChoices to weighted-connection float tensors.
"""
def __init__(self, model):
super().__init__(model)
self.choices = nn.ParameterDict()
for mutable in self.mutables:
if isinstance(mutable, LayerChoice):
self.choices[mutable.key] = nn.Parameter(1.0E-3 * torch.randn(mutable.length + 1))

def device(self):
for v in self.choices.values():
return v.device

def sample_search(self):
result = dict()
for mutable in self.mutables:
if isinstance(mutable, LayerChoice):
result[mutable.key] = F.softmax(self.choices[mutable.key], dim=-1)[:-1]
elif isinstance(mutable, InputChoice):
result[mutable.key] = torch.ones(mutable.n_candidates, dtype=torch.bool, device=self.device())
return result

def sample_final(self):
result = dict()
edges_max = dict()
for mutable in self.mutables:
if isinstance(mutable, LayerChoice):
max_val, index = torch.max(F.softmax(self.choices[mutable.key], dim=-1)[:-1], 0)
edges_max[mutable.key] = max_val
result[mutable.key] = F.one_hot(index, num_classes=len(mutable)).view(-1).bool()
for mutable in self.mutables:
if isinstance(mutable, InputChoice):
if mutable.n_chosen is not None:
weights = []
for src_key in mutable.choose_from:
if src_key not in edges_max:
_logger.warning("InputChoice.NO_KEY in '%s' is weighted 0 when selecting inputs.", mutable.key)
weights.append(edges_max.get(src_key, 0.))
weights = torch.tensor(weights) # pylint: disable=not-callable
_, topk_edge_indices = torch.topk(weights, mutable.n_chosen)
selected_multihot = []
for i, src_key in enumerate(mutable.choose_from):
if i not in topk_edge_indices and src_key in result:
# If an edge is never selected, there is no need to calculate any op on this edge.
# This is to eliminate redundant calculation.
result[src_key] = torch.zeros_like(result[src_key])
selected_multihot.append(i in topk_edge_indices)
result[mutable.key] = torch.tensor(selected_multihot, dtype=torch.bool, device=self.device()) # pylint: disable=not-callable
else:
result[mutable.key] = torch.ones(mutable.n_candidates, dtype=torch.bool, device=self.device()) # pylint: disable=not-callable
return result

def _generate_search_space(self):
"""
Generate search space from mutables.
Here is the search space format:
::
{ key_name: {"_type": "layer_choice",
"_value": ["conv1", "conv2"]} }
{ key_name: {"_type": "input_choice",
"_value": {"candidates": ["in1", "in2"],
"n_chosen": 1}} }
Returns
-------
dict
the generated search space
"""
res = OrderedDict()
res["op_list"] = OrderedDict()
res["search_space"] = OrderedDict()
# res["normal_cell"] = OrderedDict(),
# res["reduction_cell"] = OrderedDict()

keys = []
for mutable in self.mutables:
# for now we only generate flattened search space
if (len(res["search_space"])) >= 36:
break

if isinstance(mutable, LayerChoice):
key = mutable.key
if key not in keys:
val = mutable.names
if not res["op_list"]:
res["op_list"] = {"_type": "layer_choice", "_value": val + ["none"]}
# node_type = "normal_cell" if "normal" in key else "reduction_cell"
res["search_space"][key] = "op_list"
keys.append(key)

elif isinstance(mutable, InputChoice):
key = mutable.key
if key not in keys:
# node_type = "normal_cell" if "normal" in key else "reduction_cell"
res["search_space"][key] = {"_type": "input_choice",
"_value": {"candidates": mutable.choose_from,
"n_chosen": mutable.n_chosen}}
keys.append(key)
else:
raise TypeError("Unsupported mutable type: '%s'." % type(mutable))

return res

+ 227
- 0
dubhe-tadl/darts/dartstrainer.py View File

@@ -0,0 +1,227 @@
# Copyright (c) Microsoft Corporation.
# Licensed under the MIT license.

import copy
import logging

import torch
import torch.nn as nn
from pytorch.trainer import Trainer
from pytorch.utils import AverageMeterGroup, dump_global_result
from pytorch.darts.dartsmutator import DartsMutator
import json

logger = logging.getLogger(__name__)

class DartsTrainer(Trainer):
"""
DARTS trainer.

Parameters
----------
model : nn.Module
PyTorch model to be trained.
loss : callable
Receives logits and ground truth label, return a loss tensor.
metrics : callable
Receives logits and ground truth label, return a dict of metrics.
optimizer : Optimizer
The optimizer used for optimizing the model.
num_epochs : int
Number of epochs planned for training.
dataset_train : Dataset
Dataset for training. Will be split for training weights and architecture weights.
dataset_valid : Dataset
Dataset for testing.
mutator : DartsMutator
Use in case of customizing your own DartsMutator. By default will instantiate a DartsMutator.
batch_size : int
Batch size.
workers : int
Workers for data loading.
device : torch.device
``torch.device("cpu")`` or ``torch.device("cuda")``.
log_frequency : int
Step count per logging.
callbacks : list of Callback
list of callbacks to trigger at events.
arch_lr : float
Learning rate of architecture parameters.
unrolled : bool
``True`` if using second order optimization, else first order optimization.
search_space_path : str
Path the generated search space JSON is dumped to.
result_path : str
File that per-epoch results are appended to.
num_pre_epochs : int
Number of warm-up epochs in which only model weights are trained (the architecture step is skipped).
"""
def __init__(self, model, loss, metrics,
optimizer, num_epochs, dataset_train, dataset_valid, search_space_path, result_path, num_pre_epochs=0,
mutator=None, batch_size=64, workers=4, device=None, log_frequency=None,
callbacks=None, arch_lr=3.0E-4, unrolled=False):
super().__init__(model, mutator if mutator is not None else DartsMutator(model),
loss, metrics, optimizer, num_epochs, dataset_train, dataset_valid,
batch_size, workers, device, log_frequency, callbacks)

self.ctrl_optim = torch.optim.Adam(self.mutator.parameters(), arch_lr, betas=(0.5, 0.999), weight_decay=1.0E-3)
self.unrolled = unrolled
self.num_pre_epochs = num_pre_epochs
self.result_path = result_path
with open(self.result_path, "w") as file:
file.write('')
n_train = len(self.dataset_train)
split = n_train // 2
indices = list(range(n_train))
train_sampler = torch.utils.data.sampler.SubsetRandomSampler(indices[:split])
valid_sampler = torch.utils.data.sampler.SubsetRandomSampler(indices[split:])
self.train_loader = torch.utils.data.DataLoader(self.dataset_train,
batch_size=batch_size,
sampler=train_sampler,
num_workers=workers)
self.valid_loader = torch.utils.data.DataLoader(self.dataset_train,
batch_size=batch_size,
sampler=valid_sampler,
num_workers=workers)
self.test_loader = torch.utils.data.DataLoader(self.dataset_valid,
batch_size=batch_size,
num_workers=workers)
if search_space_path is not None:
dump_global_result(search_space_path, self.mutator._generate_search_space())
# self.result = {"Accuracy": []}

def train_one_epoch(self, epoch):
self.model.train()
self.mutator.train()
meters = AverageMeterGroup()
# t1 = time()
for step, ((trn_X, trn_y), (val_X, val_y)) in enumerate(zip(self.train_loader, self.valid_loader)):
trn_X, trn_y = trn_X.to(self.device), trn_y.to(self.device)
val_X, val_y = val_X.to(self.device), val_y.to(self.device)

if epoch >= self.num_pre_epochs:
# phase 1. architecture step
self.ctrl_optim.zero_grad()
if self.unrolled:
self._unrolled_backward(trn_X, trn_y, val_X, val_y)
else:
self._backward(val_X, val_y)
self.ctrl_optim.step()

# phase 2: child network step
self.optimizer.zero_grad()
logits, loss = self._logits_and_loss(trn_X, trn_y)
loss.backward()
nn.utils.clip_grad_norm_(self.model.parameters(), 5.) # gradient clipping
self.optimizer.step()

metrics = self.metrics(logits, trn_y)
metrics["loss"] = loss.item()
meters.update(metrics)
if self.log_frequency is not None and step % self.log_frequency == 0:
logger.info("Epoch [%s/%s] Step [%s/%s] %s", epoch + 1,
self.num_epochs, step + 1, len(self.train_loader), meters)

def validate_one_epoch(self, epoch, log_print=True):
self.model.eval()
self.mutator.eval()
meters = AverageMeterGroup()
with torch.no_grad():
self.mutator.reset()
for step, (X, y) in enumerate(self.test_loader):
X, y = X.to(self.device), y.to(self.device)
logits = self.model(X)
metrics = self.metrics(logits, y)
meters.update(metrics)
if self.log_frequency is not None and step % self.log_frequency == 0:
logger.info("Epoch [%s/%s] Step [%s/%s] %s", epoch + 1,
self.num_epochs, step + 1, len(self.test_loader), meters)
if log_print:
# The backend filters this pattern from the terminal output: {"type": "Accuracy", "result": {"sequence": 1, "category": "epoch", "value": 96.7}}
logger.info({"type": "Accuracy", "result": {"sequence": epoch, "category": "epoch", "value": meters.get_last_acc()}})
with open(self.result_path, "a") as file:
file.write(str({"type": "Accuracy", "result": {"sequence": epoch, "category": "epoch", "value": meters.get_last_acc()}}) + '\n')
# self.result["Accuracy"].append(meters.get_last_acc())

def _logits_and_loss(self, X, y):
self.mutator.reset()
logits = self.model(X)
loss = self.loss(logits, y)
# self._write_graph_status()
return logits, loss

def _backward(self, val_X, val_y):
"""
Simple backward with gradient descent
"""
_, loss = self._logits_and_loss(val_X, val_y)
loss.backward()

def _unrolled_backward(self, trn_X, trn_y, val_X, val_y):
"""
Compute unrolled loss and backward its gradients
"""
backup_params = copy.deepcopy(tuple(self.model.parameters()))

# do virtual step on training data
lr = self.optimizer.param_groups[0]["lr"]
momentum = self.optimizer.param_groups[0]["momentum"]
weight_decay = self.optimizer.param_groups[0]["weight_decay"]
self._compute_virtual_model(trn_X, trn_y, lr, momentum, weight_decay)

# calculate unrolled loss on validation data
# keep gradients for model here for compute hessian
_, loss = self._logits_and_loss(val_X, val_y)
w_model, w_ctrl = tuple(self.model.parameters()), tuple(self.mutator.parameters())
w_grads = torch.autograd.grad(loss, w_model + w_ctrl)
d_model, d_ctrl = w_grads[:len(w_model)], w_grads[len(w_model):]

# compute hessian and final gradients
hessian = self._compute_hessian(backup_params, d_model, trn_X, trn_y)
with torch.no_grad():
for param, d, h in zip(w_ctrl, d_ctrl, hessian):
# gradient = dalpha - lr * hessian
param.grad = d - lr * h

# restore weights
self._restore_weights(backup_params)

def _compute_virtual_model(self, X, y, lr, momentum, weight_decay):
"""
Compute unrolled weights w`
"""
# don't need zero_grad, using autograd to calculate gradients
_, loss = self._logits_and_loss(X, y)
gradients = torch.autograd.grad(loss, self.model.parameters())
with torch.no_grad():
for w, g in zip(self.model.parameters(), gradients):
m = self.optimizer.state[w].get("momentum_buffer", 0.)
# update in place so the virtual step actually modifies the model parameters
w -= lr * (momentum * m + g + weight_decay * w)

def _restore_weights(self, backup_params):
with torch.no_grad():
for param, backup in zip(self.model.parameters(), backup_params):
param.copy_(backup)

def _compute_hessian(self, backup_params, dw, trn_X, trn_y):
"""
dw = dw` { L_val(w`, alpha) }
w+ = w + eps * dw
w- = w - eps * dw
hessian = (dalpha { L_trn(w+, alpha) } - dalpha { L_trn(w-, alpha) }) / (2*eps)
eps = 0.01 / ||dw||
"""
self._restore_weights(backup_params)
norm = torch.cat([w.view(-1) for w in dw]).norm()
eps = 0.01 / norm
if norm < 1E-8:
logger.warning("In computing hessian, norm is smaller than 1E-8, cause eps to be %.6f.", norm.item())

dalphas = []
for e in [eps, -2. * eps]:
# w+ = w + eps*dw`, w- = w - eps*dw`
with torch.no_grad():
for p, d in zip(self.model.parameters(), dw):
p += e * d

_, loss = self._logits_and_loss(trn_X, trn_y)
dalphas.append(torch.autograd.grad(loss, self.mutator.parameters()))

dalpha_pos, dalpha_neg = dalphas # dalpha { L_trn(w+) }, # dalpha { L_trn(w-) }
hessian = [(p - n) / (2. * eps) for p, n in zip(dalpha_pos, dalpha_neg)]
return hessian

+ 56
- 0
dubhe-tadl/darts/datasets.py View File

@@ -0,0 +1,56 @@
# Copyright (c) Microsoft Corporation.
# Licensed under the MIT license.

import numpy as np
import torch
from torchvision import transforms
from torchvision.datasets import CIFAR10


class Cutout(object):
def __init__(self, length):
self.length = length

def __call__(self, img):
h, w = img.size(1), img.size(2)
mask = np.ones((h, w), np.float32)
y = np.random.randint(h)
x = np.random.randint(w)

y1 = np.clip(y - self.length // 2, 0, h)
y2 = np.clip(y + self.length // 2, 0, h)
x1 = np.clip(x - self.length // 2, 0, w)
x2 = np.clip(x + self.length // 2, 0, w)

mask[y1: y2, x1: x2] = 0.
mask = torch.from_numpy(mask)
mask = mask.expand_as(img)
img *= mask

return img


def get_dataset(cls, cutout_length=0, root=None):
MEAN = [0.49139968, 0.48215827, 0.44653124]
STD = [0.24703233, 0.24348505, 0.26158768]
transf = [
transforms.RandomCrop(32, padding=4),
transforms.RandomHorizontalFlip()
]
normalize = [
transforms.ToTensor(),
transforms.Normalize(MEAN, STD)
]
cutout = []
if cutout_length > 0:
cutout.append(Cutout(cutout_length))

train_transform = transforms.Compose(transf + normalize + cutout)
valid_transform = transforms.Compose(normalize)

if cls == "cifar10":
dataset_train = CIFAR10(root=root, train=True, download=True, transform=train_transform)
dataset_valid = CIFAR10(root=root, train=False, download=True, transform=valid_transform)
else:
raise NotImplementedError
return dataset_train, dataset_valid

+ 160
- 0
dubhe-tadl/darts/model.py View File

@@ -0,0 +1,160 @@
# Copyright (c) Microsoft Corporation.
# Licensed under the MIT license.

from collections import OrderedDict

import torch
import torch.nn as nn

import ops
import pytorch.mutables as mutables


class AuxiliaryHead(nn.Module):
""" Auxiliary head in 2/3 place of network to let the gradient flow well """

def __init__(self, input_size, C, n_classes):
""" assuming input size 7x7 or 8x8 """
assert input_size in [7, 8]
super().__init__()
self.net = nn.Sequential(
nn.ReLU(inplace=True),
nn.AvgPool2d(5, stride=input_size - 5, padding=0, count_include_pad=False), # 2x2 out
nn.Conv2d(C, 128, kernel_size=1, bias=False),
nn.BatchNorm2d(128),
nn.ReLU(inplace=True),
nn.Conv2d(128, 768, kernel_size=2, bias=False), # 1x1 out
nn.BatchNorm2d(768),
nn.ReLU(inplace=True)
)
self.linear = nn.Linear(768, n_classes)

def forward(self, x):
out = self.net(x)
out = out.view(out.size(0), -1) # flatten
logits = self.linear(out)
return logits


class Node(nn.Module):
def __init__(self, node_id, num_prev_nodes, channels, num_downsample_connect):
super().__init__()
self.ops = nn.ModuleList()
choice_keys = []
for i in range(num_prev_nodes):
stride = 2 if i < num_downsample_connect else 1
choice_keys.append("{}_p{}".format(node_id, i))
self.ops.append(
mutables.LayerChoice(OrderedDict([
("maxpool", ops.PoolBN('max', channels, 3, stride, 1, affine=False)),
("avgpool", ops.PoolBN('avg', channels, 3, stride, 1, affine=False)),
("skipconnect", nn.Identity() if stride == 1 else ops.FactorizedReduce(channels, channels, affine=False)),
("sepconv3x3", ops.SepConv(channels, channels, 3, stride, 1, affine=False)),
("sepconv5x5", ops.SepConv(channels, channels, 5, stride, 2, affine=False)),
("dilconv3x3", ops.DilConv(channels, channels, 3, stride, 2, 2, affine=False)),
("dilconv5x5", ops.DilConv(channels, channels, 5, stride, 4, 2, affine=False))
]), key=choice_keys[-1]))
self.drop_path = ops.DropPath()
self.input_switch = mutables.InputChoice(choose_from=choice_keys, n_chosen=2, key="{}_switch".format(node_id))

def forward(self, prev_nodes):
assert len(self.ops) == len(prev_nodes)
out = [op(node) for op, node in zip(self.ops, prev_nodes)]
out = [self.drop_path(o) if o is not None else None for o in out]
return self.input_switch(out)


class Cell(nn.Module):

def __init__(self, n_nodes, channels_pp, channels_p, channels, reduction_p, reduction):
super().__init__()
self.reduction = reduction
self.n_nodes = n_nodes

# If previous cell is reduction cell, current input size does not match with
# output size of cell[k-2]. So the output[k-2] should be reduced by preprocessing.
if reduction_p:
self.preproc0 = ops.FactorizedReduce(channels_pp, channels, affine=False)
else:
self.preproc0 = ops.StdConv(channels_pp, channels, 1, 1, 0, affine=False)
self.preproc1 = ops.StdConv(channels_p, channels, 1, 1, 0, affine=False)

# generate dag
self.mutable_ops = nn.ModuleList()
for depth in range(2, self.n_nodes + 2):
self.mutable_ops.append(Node("{}_n{}".format("reduce" if reduction else "normal", depth),
depth, channels, 2 if reduction else 0))

def forward(self, s0, s1):
# s0, s1 are the outputs of previous previous cell and previous cell, respectively.
tensors = [self.preproc0(s0), self.preproc1(s1)]
for node in self.mutable_ops:
cur_tensor = node(tensors)
tensors.append(cur_tensor)

output = torch.cat(tensors[2:], dim=1)
return output


class CNN(nn.Module):

def __init__(self, input_size, in_channels, channels, n_classes, n_layers, n_nodes=4,
stem_multiplier=3, auxiliary=False):
super().__init__()
self.in_channels = in_channels
self.channels = channels
self.n_classes = n_classes
self.n_layers = n_layers
self.aux_pos = 2 * n_layers // 3 if auxiliary else -1

c_cur = stem_multiplier * self.channels
self.stem = nn.Sequential(
nn.Conv2d(in_channels, c_cur, 3, 1, 1, bias=False),
nn.BatchNorm2d(c_cur)
)

# for the first cell, stem is used for both s0 and s1
# [!] channels_pp and channels_p are output channel sizes, but c_cur is the input channel size.
channels_pp, channels_p, c_cur = c_cur, c_cur, channels

self.cells = nn.ModuleList()
reduction_p, reduction = False, False
for i in range(n_layers):
reduction_p, reduction = reduction, False
# Reduce featuremap size and double channels in 1/3 and 2/3 layer.
if i in [n_layers // 3, 2 * n_layers // 3]:
c_cur *= 2
reduction = True

cell = Cell(n_nodes, channels_pp, channels_p, c_cur, reduction_p, reduction)
self.cells.append(cell)
c_cur_out = c_cur * n_nodes
channels_pp, channels_p = channels_p, c_cur_out

if i == self.aux_pos:
self.aux_head = AuxiliaryHead(input_size // 4, channels_p, n_classes)

self.gap = nn.AdaptiveAvgPool2d(1)
self.linear = nn.Linear(channels_p, n_classes)

def forward(self, x):
s0 = s1 = self.stem(x)

aux_logits = None
for i, cell in enumerate(self.cells):
s0, s1 = s1, cell(s0, s1)
if i == self.aux_pos and self.training:
aux_logits = self.aux_head(s1)

out = self.gap(s1)
out = out.view(out.size(0), -1) # flatten
logits = self.linear(out)

if aux_logits is not None:
return logits, aux_logits
return logits

def drop_path_prob(self, p):
for module in self.modules():
if isinstance(module, ops.DropPath):
module.p = p

+ 136
- 0
dubhe-tadl/darts/ops.py View File

@@ -0,0 +1,136 @@
# Copyright (c) Microsoft Corporation.
# Licensed under the MIT license.

import torch
import torch.nn as nn


class DropPath(nn.Module):
def __init__(self, p=0.):
"""
Drop path with probability.

Parameters
----------
p : float
Probability of a path being zeroed.
"""
super().__init__()
self.p = p

def forward(self, x):
if self.training and self.p > 0.:
keep_prob = 1. - self.p
# per data point mask
mask = torch.zeros((x.size(0), 1, 1, 1), device=x.device).bernoulli_(keep_prob)
return x / keep_prob * mask

return x


class PoolBN(nn.Module):
"""
AvgPool or MaxPool with BN. `pool_type` must be `max` or `avg`.
"""
def __init__(self, pool_type, C, kernel_size, stride, padding, affine=True):
super().__init__()
if pool_type.lower() == 'max':
self.pool = nn.MaxPool2d(kernel_size, stride, padding)
elif pool_type.lower() == 'avg':
self.pool = nn.AvgPool2d(kernel_size, stride, padding, count_include_pad=False)
else:
raise ValueError()

self.bn = nn.BatchNorm2d(C, affine=affine)

def forward(self, x):
out = self.pool(x)
out = self.bn(out)
return out


class StdConv(nn.Module):
"""
Standard conv: ReLU - Conv - BN
"""
def __init__(self, C_in, C_out, kernel_size, stride, padding, affine=True):
super().__init__()
self.net = nn.Sequential(
nn.ReLU(),
nn.Conv2d(C_in, C_out, kernel_size, stride, padding, bias=False),
nn.BatchNorm2d(C_out, affine=affine)
)

def forward(self, x):
return self.net(x)


class FacConv(nn.Module):
"""
Factorized conv: ReLU - Conv(Kx1) - Conv(1xK) - BN
"""
def __init__(self, C_in, C_out, kernel_length, stride, padding, affine=True):
super().__init__()
self.net = nn.Sequential(
nn.ReLU(),
nn.Conv2d(C_in, C_in, (kernel_length, 1), stride, padding, bias=False),
nn.Conv2d(C_in, C_out, (1, kernel_length), stride, padding, bias=False),
nn.BatchNorm2d(C_out, affine=affine)
)

def forward(self, x):
return self.net(x)


class DilConv(nn.Module):
"""
(Dilated) depthwise separable conv.
ReLU - (Dilated) depthwise separable - Pointwise - BN.
If dilation == 2, 3x3 conv => 5x5 receptive field, 5x5 conv => 9x9 receptive field.
"""
def __init__(self, C_in, C_out, kernel_size, stride, padding, dilation, affine=True):
super().__init__()
self.net = nn.Sequential(
nn.ReLU(),
nn.Conv2d(C_in, C_in, kernel_size, stride, padding, dilation=dilation, groups=C_in,
bias=False),
nn.Conv2d(C_in, C_out, 1, stride=1, padding=0, bias=False),
nn.BatchNorm2d(C_out, affine=affine)
)

def forward(self, x):
return self.net(x)


class SepConv(nn.Module):
"""
Depthwise separable conv.
DilConv(dilation=1) * 2.
"""
def __init__(self, C_in, C_out, kernel_size, stride, padding, affine=True):
super().__init__()
self.net = nn.Sequential(
DilConv(C_in, C_in, kernel_size, stride, padding, dilation=1, affine=affine),
DilConv(C_in, C_out, kernel_size, 1, padding, dilation=1, affine=affine)
)

def forward(self, x):
return self.net(x)


class FactorizedReduce(nn.Module):
"""
Reduce feature map size by factorized pointwise (stride=2).
"""
def __init__(self, C_in, C_out, affine=True):
super().__init__()
self.relu = nn.ReLU()
self.conv1 = nn.Conv2d(C_in, C_out // 2, 1, stride=2, padding=0, bias=False)
self.conv2 = nn.Conv2d(C_in, C_out // 2, 1, stride=2, padding=0, bias=False)
self.bn = nn.BatchNorm2d(C_out, affine=affine)

def forward(self, x):
x = self.relu(x)
out = torch.cat([self.conv1(x), self.conv2(x[:, :, 1:, 1:])], dim=1)
out = self.bn(out)
return out

+ 83
- 0
dubhe-tadl/darts/readme.md View File

@@ -0,0 +1,83 @@
# train stage
`python darts_train.py --data_dir '../data/' --result_path 'trial_id/result.json' --log_path 'trial_id/log' --search_space_path 'experiment_id/search_space.json' --best_selected_space_path 'experiment_id/best_selected_space.json' --trial_id 0 --layers 8 --model_lr 0.025 --arch_lr 3e-4 --epochs 1 --batch_size 64 --channels 16`
Note:
`--epochs 1` here is just for quick debugging; use more epochs for a real search.

# select stage
`python darts_select.py --best_selected_space_path 'experiment_id/best_selected_space.json' `

# retrain stage
`python darts_retrain.py --data_dir '../data/' --result_path 'trial_id/result.json' --log_path 'trial_id/log' --best_selected_space_path 'experiment_id/best_selected_space.json' --best_checkpoint_dir 'experiment_id/' --trial_id 0 --batch_size 96 --epochs 1 --lr 0.025 --layers 20 --channels 36`

# output file
`result.json`
```
{'type': 'Accuracy', 'result': {'sequence': 0, 'category': 'epoch', 'value': 0.1}}
{'type': 'Accuracy', 'result': {'sequence': 1, 'category': 'epoch', 'value': 0.0}}
{'type': 'Accuracy', 'result': {'sequence': 2, 'category': 'epoch', 'value': 0.0}}
{'type': 'Accuracy', 'result': {'sequence': 3, 'category': 'epoch', 'value': 0.0}}
{'type': 'Accuracy', 'result': {'sequence': 4, 'category': 'epoch', 'value': 0.0}}
{'type': 'Cost_time', 'result': {'value': '41.614346981048584 s'}}
```
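
Each line of `result.json` is the `str()` of a Python dict (note the single quotes), so it is not strict JSON. A minimal parsing sketch using only the standard library, assuming the `trial_id/result.json` path from the commands above:
```
import ast

# Collect the per-epoch accuracy records written by the trainer.
# Lines are repr'd Python dicts, so ast.literal_eval is used instead of json.loads.
with open("trial_id/result.json") as f:
    records = [ast.literal_eval(line) for line in f if line.strip()]

accuracies = [r["result"]["value"] for r in records if r["type"] == "Accuracy"]
print(accuracies)
```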

`search_space.json`
```
{
"op_list": {
"_type": "layer_choice",
"_value": [
"maxpool",
"avgpool",
"skipconnect",
"sepconv3x3",
"sepconv5x5",
"dilconv3x3",
"dilconv5x5",
"none"
]
},
"search_space": {
"normal_n2_p0": "op_list",
"normal_n2_p1": "op_list",
"normal_n2_switch": {
"_type": "input_choice",
"_value": {
"candidates": [
"normal_n2_p0",
"normal_n2_p1"
],
"n_chosen": 2
}
},

...
}
```

`best_selected_space.json`
```
{
"normal_n2_p0": "dilconv5x5",
"normal_n2_p1": "dilconv5x5",
"normal_n2_switch": [
"normal_n2_p0",
"normal_n2_p1"
],
"normal_n3_p0": "sepconv3x3",
"normal_n3_p1": "dilconv5x5",
"normal_n3_p2": [],
"normal_n3_switch": [
"normal_n3_p0",
"normal_n3_p1"
],
"normal_n4_p0": [],
"normal_n4_p1": "dilconv5x5",
"normal_n4_p2": "sepconv5x5",
"normal_n4_p3": [],
"normal_n4_switch": [
"normal_n4_p1",
"normal_n4_p2"
],
...
}
```
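
For retraining, the exported space is applied back onto the network before ordinary training; the actual flow lives in `darts_retrain.py`. A minimal sketch, assuming `pytorch.fixed.apply_fixed_architecture` (the helper the ENAS retrain script imports) accepts the model and the JSON path:
```
from model import CNN
from pytorch.fixed import apply_fixed_architecture

# Build the larger retrain network (channels=36, layers=20, matching the retrain command above).
model = CNN(32, 3, 36, 10, 20)
# Freeze every LayerChoice/InputChoice to the choices recorded in best_selected_space.json.
apply_fixed_architecture(model, "experiment_id/best_selected_space.json")
# model can now be trained like an ordinary nn.Module.
```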

+ 21
- 0
dubhe-tadl/darts/utils.py View File

@@ -0,0 +1,21 @@
# Copyright (c) Microsoft Corporation.
# Licensed under the MIT license.

def accuracy(output, target, topk=(1,)):
""" Computes the precision@k for the specified values of k """
maxk = max(topk)
batch_size = target.size(0)

_, pred = output.topk(maxk, 1, True, True)
pred = pred.t()
# one-hot case
if target.ndimension() > 1:
target = target.max(1)[1]

correct = pred.eq(target.view(1, -1).expand_as(pred))

res = dict()
for k in topk:
correct_k = correct[:k].reshape(-1).float().sum(0)
res["acc{}".format(k)] = correct_k.mul_(1.0 / batch_size).item()
return res

+ 331
- 0
dubhe-tadl/dataloader.py View File

@@ -0,0 +1,331 @@
from nvidia.dali.pipeline import Pipeline
from nvidia.dali import ops
from nvidia.dali import types
from nvidia.dali.plugin.pytorch import DALIClassificationIterator

import numpy as np
import torch
from torch import nn


class HybridTrainPipeline(Pipeline):
def __init__(self, batch_size, file_root, num_threads, device_id, num_shards, shard_id):
super(HybridTrainPipeline, self).__init__(batch_size, num_threads, device_id)
device_type = {0:"cpu"}
if num_shards == 0:
self.input = ops.FileReader(file_root = file_root)
else:
self.input = ops.FileReader(file_root = file_root, num_shards = num_shards, shard_id = shard_id)
# ##### The ops below can be customized ###################################
self.decode = ops.ImageDecoder(device = device_type.get(num_shards, "mixed"), output_type = types.RGB)
self.res = ops.RandomResizedCrop(device=device_type.get(num_shards, "gpu"), size = 224)
self.cmnp = ops.CropMirrorNormalize(device=device_type.get(num_shards, "gpu"),
dtype = types.FLOAT, # output_dtype=types.FLOAT,
output_layout=types.NCHW,
mean=0. ,# if spos_pre else [0.485 * 255, 0.456 * 255, 0.406 * 255],
std=1. )# if spos_pre else [0.229 * 255, 0.224 * 255, 0.225 * 255])
# ####################################################
def define_graph(self, ):
jpegs, labels = self.input(name="Reader")
images = self.decode(jpegs)
images = self.res(images)
images = self.cmnp(images)
return images, labels
class HybridValPipeline(Pipeline):
def __init__(self, batch_size, file_root, num_threads, device_id, num_shards, shard_id):
super(HybridValPipeline, self).__init__(batch_size, num_threads, device_id)
device_type = {0:"cpu"}
if num_shards == 0:
self.input = ops.FileReader(file_root = file_root)
else:
self.input = ops.FileReader(file_root = file_root, num_shards = num_shards, shard_id = shard_id)
# ##### The ops below can be customized ###################################
self.decode = ops.ImageDecoder(device = device_type.get(num_shards, "mixed"), output_type = types.RGB)
self.res = ops.RandomResizedCrop(device=device_type.get(num_shards, "gpu"), size = 224)
self.cmnp = ops.CropMirrorNormalize(device=device_type.get(num_shards, "gpu"),
dtype = types.FLOAT, # output_dtype=types.FLOAT,
output_layout=types.NCHW,
mean=0. ,# if spos_pre else [0.485 * 255, 0.456 * 255, 0.406 * 255],
std=1. )# if spos_pre else [0.229 * 255, 0.224 * 255, 0.225 * 255])
# ####################################################
def define_graph(self, ):
jpegs, labels = self.input(name="Reader")
images = self.decode(jpegs)
images = self.res(images)
images = self.cmnp(images)
return images, labels
class TorchWrapper:
"""
Wrap one or more DALI pipelines into a single iterator.
parameters:
num_shards : int  number of GPUs used in parallel (0 means CPU only)
data_loader : DALI iterator(s) produced from the pipelines
iter_mode : str  "recursion" or "iter"; how multiple pipelines are merged (default: "recursion")
"""
def __init__(self, num_shards, data_loader, iter_mode = "recursion"):
self.index = 0
self.count = 0
self.num_shards = num_shards
self.data_loader = data_loader
self.iter_mode = iter_mode
if self.iter_mode not in {"recursion", "iter"}:
raise Exception("iter_mode should be either 'recursion' or 'iter'")
def __iter__(self,):
return self
def __len__(self):
# return the total number of samples, not the number of batches
if self.num_shards == 0:
return self.data_loader.size
else:
return len(self.data_loader)*self.data_loader[0].size
def __next__(self):
if self.num_shards == 0:
# no GPU is used
data = next(self.data_loader)
return data[0]["data"], data[0]["label"].view(-1).long()
else:
# one or more GPUs are used
if self.iter_mode == "recursion":
return self._get_next_recursion()
elif self.iter_mode == "iter":
return self._get_next_iter(self.data_loader[0])
def _get_next_iter(self, data_loader):
if self.count == data_loader.size:
self.index+=1
data_loader = self.data_loader[self.index]
self.count+=1
data = next(data_loader)
return data[0]["data"], data[0]["label"].view(-1).long()
def _get_next_recursion(self, ):
self.index = self.count%self.num_shards
self.count+=1
data_loader = self.data_loader[self.index]
data = next(data_loader)

return data[0]["data"], data[0]["label"].view(-1).long()

def get_iter_dali_cuda(batch_size=256, train_file_root="", val_file_root="", num_threads=4, device_id=[-1], num_shards=0, shard_id=[-1]):
"""
获取可用于pytorch训练的数据迭代器
数据的读取和处理部分可以使用多张GPU来完成
1、创建dali pipeline
2、封装为适用于pytorch的数据迭代器
3、将多卡的各个pipeline封装在一起
4、数据输出在cpu端,在cuda中
数据需要保证如下形式:
images
|-file_list.txt
|-images/dog
|-dog_4.jpg
|-dog_5.jpg
|-dog_9.jpg
|-dog_6.jpg
|-dog_3.jpg
|-images/kitten
|-cat_10.jpg
|-cat_5.jpg
|-cat_9.jpg
|-cat_8.jpg
|-cat_1.jpg
parameters:
batch_size : int 每批数据的量
file_root : str 数据的路径
num_threads : int 读取数据的CPU线程数
device_id : list of int GPU的物理编号
shard_id : list of int GPU的虚拟编号
num_shard : int
methods:
get_train_pipeline(shard_id, device_id) : 创建dali的pipeline,用以读取并处理训练数据
get_val_pipeline(shard_id, device_id) : 创建dali的pipeline,用以读取并处理验证数据
get_dali_iter_for_torch(piplines, data_num) : 封装成可用于pytorch的数据迭代器
get_data_size(pipeline) : 计算每个pipeline实际输出的数据总量,数据总量是文件中的数据量,实际输出是去掉了不满一个批次大小的数据
例:
# 分别从TRAIN_PATH和VAL_PATH读取训练和验证数据,batch_size选择256,启动4个线程来读取数据,用2块GPU处理数据,分别是第0号和第4号GPU
# 程序默认使用所有显卡,和4线程
# 如果使用单张GPU,请设置num_shards = 1, shard_id = [0], device_id保持一个列表形式
# 如果不使用GPU,请使用get_iter_dali_cpu()
train_data_iter, val_data_iter = get_iter_dali(batch_size=256,
train_file_root=TRAIN_PATH,
val_file_root=Val_PATH,
num_threads=4,
device_id=[0,4],
num_shards=2,
shard_id=[0,1])
# 在torch中训练
torch_model = TorchModel(para)
criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(torch_model.parameters())
for epoch in range(epoches):
for step, x,y in enumerate(train_data_iter):
# 数据 : x
# 标签 : y
x = x.to("cuda:0")
y = y.to("cuda:0")
output = my_model(x)
optimizer.zero_grad()
loss = criterion(output, y)
loss.backward()
optimizer.step()
...
...
"""
def get_train_pipeline(shard_id, device_id):
pipeline = HybridTrainPipeline(batch_size = batch_size,
file_root = train_file_root,
num_threads = num_threads,
num_shards = num_shards,
shard_id = shard_id,
device_id = device_id)
return pipeline

def get_val_pipeline(shard_id, device_id):
pipeline = HybridValPipeline(batch_size = batch_size,
file_root = val_file_root,
num_threads = num_threads,
num_shards = num_shards,
shard_id = shard_id,
device_id = device_id)
return pipeline
pipeline_for_train = [get_train_pipeline(shard_id = shard_id_index, device_id = device_id_index) \
for shard_id_index, device_id_index in zip(shard_id, device_id)]
pipeline_for_val = [get_val_pipeline(shard_id = shard_id_index, device_id = device_id_index) \
for shard_id_index, device_id_index in zip(shard_id, device_id)]
[pipeline.build() for pipeline in pipeline_for_train]
[pipeline.build() for pipeline in pipeline_for_val]
def get_data_size(pipeline):
data_num = pipeline.epoch_size()["Reader"]
batch_size = pipeline.batch_size
return data_num//batch_size*batch_size
data_num_train = get_data_size(pipeline_for_train[0])
data_num_val = get_data_size(pipeline_for_val[0])
def get_dali_iter_for_torch(pipelines, data_num):
return [DALIClassificationIterator(pipelines=pipeline,
last_batch_policy="drop",size = data_num) for pipeline in pipelines]
data_loader_train = get_dali_iter_for_torch(pipeline_for_train, data_num_train)
data_loader_val = get_dali_iter_for_torch(pipeline_for_val, data_num_val)
train_data_iter = TorchWrapper(num_shards, data_loader_train)
val_data_iter = TorchWrapper(num_shards, data_loader_val)
return train_data_iter, val_data_iter


def get_iter_dali_cpu(batch_size=256, train_file_root="", val_file_root="", num_threads=4):

pipeline_train = HybridTrainPipeline(batch_size = batch_size,
file_root = train_file_root,
num_threads = num_threads,
num_shards = 0,
shard_id = -1,
device_id = 0)
pipeline_val = HybridValPipeline(batch_size = batch_size,
file_root = val_file_root,
num_threads = num_threads,
num_shards = 0,
shard_id = -1,
device_id = 0)
pipeline_train.build()
pipeline_val.build()
def get_data_size(pipeline):
data_num = pipeline.epoch_size()["Reader"]
batch_size = pipeline.batch_size
return data_num//batch_size*batch_size
data_num_train = get_data_size(pipeline_train)
data_num_val = get_data_size(pipeline_val)
data_loader_train = DALIClassificationIterator(pipelines=pipeline_train,
last_batch_policy="drop",size = data_num_train)
data_loader_val = DALIClassificationIterator(pipelines=pipeline_val,
last_batch_policy="drop",size = data_num_val)
train_data_iter = TorchWrapper(0,data_loader_train)
val_data_iter = TorchWrapper(0,data_loader_val)
return train_data_iter, val_data_iter
if __name__ == "__main__":
PATH = "./imagenet"
TRAIN_PATH = "./imagenet/train"
VALID_PATH = "./imagenet/val"
train_data_iter_cuda, val_data_iter_cuda = get_iter_dali_cuda(batch_size=256,
train_file_root=TRAIN_PATH,
val_file_root=VALID_PATH,
num_threads=4,
device_id=[0,4],
num_shards=2,
shard_id=[0,1])
train_data_iter_cpu, val_data_iter_cpu = get_iter_dali_cpu(batch_size=256,
train_file_root=TRAIN_PATH,
val_file_root=VALID_PATH,
num_threads=4)

+ 46
- 0
dubhe-tadl/enas/README.md View File

@@ -0,0 +1,46 @@
# Efficient Neural Architecture Search (ENAS)

## 1. Requirements
```
torch
torchvision
collections
argparse
pickle
pytest-shutil
```

## 2. Train
### Stage1: search an architecture

* macro search

```
python trainer.py --trial_id=0 --search_for macro --best_selected_space_path='./macro_selected_space.json' --result_path='./macro_result.json'
```

* micro search

```
python trainer.py --trial_id=0 --search_for micro --best_selected_space_path='./micro_selected_space.json' --result_path='./micro_result.json'
```

### Stage2: select (deprecated)
```
python selector.py
```

### Stage3: retrain
* macro search

```
python retrainer.py --search_for macro --best_checkpoint_dir='./macro_checkpoint.pth' --best_selected_space_path='./macro_selected_space.json' --result_path='./macro_result.json'
```

* micro search

```
python retrainer.py --search_for micro --best_checkpoint_dir='./micro_checkpoint.pth' --best_selected_space_path='./micro_selected_space.json' --result_path='./micro_result.json'
```
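
Both retrain commands fix the chosen supernet to the exported space before training. A minimal sketch, assuming the helpers imported by `retrain.py` (`GeneralNetwork`, `MicroNetwork`, `apply_fixed_architecture`) and that `apply_fixed_architecture` accepts the model and the JSON path:
```
from macro import GeneralNetwork           # used with --search_for macro
from micro import MicroNetwork             # used with --search_for micro
from pytorch.fixed import apply_fixed_architecture

model = GeneralNetwork()                   # or MicroNetwork() for the micro space
apply_fixed_architecture(model, "./macro_selected_space.json")
# The fixed model is then trained as a plain PyTorch network.
```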

+ 5
- 0
dubhe-tadl/enas/__init__.py View File

@@ -0,0 +1,5 @@
# Copyright (c) Microsoft Corporation.
# Licensed under the MIT license.

from .mutator import EnasMutator
from .trainer import EnasTrainer

+ 28
- 0
dubhe-tadl/enas/datasets.py View File

@@ -0,0 +1,28 @@
# Copyright (c) Microsoft Corporation.
# Licensed under the MIT license.

from torchvision import transforms
from torchvision.datasets import CIFAR10


def get_dataset(cls,datadir):
MEAN = [0.49139968, 0.48215827, 0.44653124]
STD = [0.24703233, 0.24348505, 0.26158768]
transf = [
transforms.RandomCrop(32, padding=4),
transforms.RandomHorizontalFlip()
]
normalize = [
transforms.ToTensor(),
transforms.Normalize(MEAN, STD)
]

train_transform = transforms.Compose(transf + normalize)
valid_transform = transforms.Compose(normalize)

if cls == "cifar10":
dataset_train = CIFAR10(root=datadir, train=True, download=True, transform=train_transform)
dataset_valid = CIFAR10(root=datadir, train=False, download=True, transform=valid_transform)
else:
raise NotImplementedError
return dataset_train, dataset_valid

+ 87
- 0
dubhe-tadl/enas/macro.py View File

@@ -0,0 +1,87 @@
# Copyright (c) Microsoft Corporation.
# Licensed under the MIT license.

import torch.nn as nn

import sys
sys.path.append('..'+ '/' + '..')
from pytorch import mutables # LayerChoice, InputChoice, MutableScope
from ops import FactorizedReduce, ConvBranch, PoolBranch


class ENASLayer(mutables.MutableScope):

def __init__(self, key, prev_labels, in_filters, out_filters):
super().__init__(key)
self.in_filters = in_filters
self.out_filters = out_filters
self.mutable = mutables.LayerChoice([
ConvBranch(in_filters, out_filters, 3, 1, 1, separable=False),
ConvBranch(in_filters, out_filters, 3, 1, 1, separable=True),
ConvBranch(in_filters, out_filters, 5, 1, 2, separable=False),
ConvBranch(in_filters, out_filters, 5, 1, 2, separable=True),
PoolBranch('avg', in_filters, out_filters, 3, 1, 1),
PoolBranch('max', in_filters, out_filters, 3, 1, 1)
])
if len(prev_labels) > 0:
self.skipconnect = mutables.InputChoice(choose_from=prev_labels, n_chosen=None)
else:
self.skipconnect = None
self.batch_norm = nn.BatchNorm2d(out_filters, affine=False)

def forward(self, prev_layers):
out = self.mutable(prev_layers[-1])
if self.skipconnect is not None:
connection = self.skipconnect(prev_layers[:-1])
if connection is not None:
out += connection
return self.batch_norm(out)


class GeneralNetwork(nn.Module):
def __init__(self, num_layers=12, out_filters=24, in_channels=3, num_classes=10,
dropout_rate=0.0):
super().__init__()
self.num_layers = num_layers
self.num_classes = num_classes
self.out_filters = out_filters

self.stem = nn.Sequential(
nn.Conv2d(in_channels, out_filters, 3, 1, 1, bias=False),
nn.BatchNorm2d(out_filters)
)

pool_distance = self.num_layers // 3
self.pool_layers_idx = [pool_distance - 1, 2 * pool_distance - 1]
self.dropout_rate = dropout_rate
self.dropout = nn.Dropout(self.dropout_rate)
self.layers = nn.ModuleList()
self.pool_layers = nn.ModuleList()
labels = []
for layer_id in range(self.num_layers):
labels.append("layer_{}".format(layer_id))
if layer_id in self.pool_layers_idx:
self.pool_layers.append(FactorizedReduce(self.out_filters, self.out_filters))
self.layers.append(ENASLayer(labels[-1], labels[:-1], self.out_filters, self.out_filters))

self.gap = nn.AdaptiveAvgPool2d(1)
self.dense = nn.Linear(self.out_filters, self.num_classes)

def forward(self, x):
bs = x.size(0)
cur = self.stem(x)

layers = [cur]

for layer_id in range(self.num_layers):
cur = self.layers[layer_id](layers)
layers.append(cur)
if layer_id in self.pool_layers_idx:
for i, layer in enumerate(layers):
layers[i] = self.pool_layers[self.pool_layers_idx.index(layer_id)](layer)
cur = layers[-1]

cur = self.gap(cur).view(bs, -1)
cur = self.dropout(cur)
logits = self.dense(cur)
return logits

+ 187
- 0
dubhe-tadl/enas/micro.py View File

@@ -0,0 +1,187 @@
# Copyright (c) Microsoft Corporation.
# Licensed under the MIT license.

import torch
import torch.nn as nn
import torch.nn.functional as F

from pytorch import mutables
from ops import FactorizedReduce, StdConv, SepConvBN, Pool


class AuxiliaryHead(nn.Module):
def __init__(self, in_channels, num_classes):
super().__init__()
self.in_channels = in_channels
self.num_classes = num_classes
self.pooling = nn.Sequential(
nn.ReLU(),
nn.AvgPool2d(5, 3, 2)
)
self.proj = nn.Sequential(
StdConv(in_channels, 128),
StdConv(128, 768)
)
self.avg_pool = nn.AdaptiveAvgPool2d(1)
self.fc = nn.Linear(768, num_classes, bias=False)

def forward(self, x):
bs = x.size(0)
x = self.pooling(x)
x = self.proj(x)
x = self.avg_pool(x).view(bs, -1)
x = self.fc(x)
return x


class Cell(nn.Module):
def __init__(self, cell_name, prev_labels, channels):
super().__init__()
self.input_choice = mutables.InputChoice(choose_from=prev_labels, n_chosen=1, return_mask=True,
key=cell_name + "_input")
self.op_choice = mutables.LayerChoice([
SepConvBN(channels, channels, 3, 1),
SepConvBN(channels, channels, 5, 2),
Pool("avg", 3, 1, 1),
Pool("max", 3, 1, 1),
nn.Identity()
], key=cell_name + "_op")

def forward(self, prev_layers):
chosen_input, chosen_mask = self.input_choice(prev_layers)
cell_out = self.op_choice(chosen_input)
return cell_out, chosen_mask


class Node(mutables.MutableScope):
def __init__(self, node_name, prev_node_names, channels):
super().__init__(node_name)
self.cell_x = Cell(node_name + "_x", prev_node_names, channels)
self.cell_y = Cell(node_name + "_y", prev_node_names, channels)

def forward(self, prev_layers):
out_x, mask_x = self.cell_x(prev_layers)
out_y, mask_y = self.cell_y(prev_layers)
return out_x + out_y, mask_x | mask_y


class Calibration(nn.Module):
def __init__(self, in_channels, out_channels):
super().__init__()
self.process = None
if in_channels != out_channels:
self.process = StdConv(in_channels, out_channels)
def forward(self, x):
if self.process is None:
return x
return self.process(x)


class ReductionLayer(nn.Module):
def __init__(self, in_channels_pp, in_channels_p, out_channels):
super().__init__()
self.reduce0 = FactorizedReduce(in_channels_pp, out_channels, affine=False)
self.reduce1 = FactorizedReduce(in_channels_p, out_channels, affine=False)

def forward(self, pprev, prev):
return self.reduce0(pprev), self.reduce1(prev)


class ENASLayer(nn.Module):
def __init__(self, num_nodes, in_channels_pp, in_channels_p, out_channels, reduction):
super().__init__()
self.preproc0 = Calibration(in_channels_pp, out_channels)
self.preproc1 = Calibration(in_channels_p, out_channels)

self.num_nodes = num_nodes
name_prefix = "reduce" if reduction else "normal"
self.nodes = nn.ModuleList()
node_labels = [mutables.InputChoice.NO_KEY, mutables.InputChoice.NO_KEY]
for i in range(num_nodes):
node_labels.append("{}_node_{}".format(name_prefix, i))
self.nodes.append(Node(node_labels[-1], node_labels[:-1], out_channels))
self.final_conv_w = nn.Parameter(torch.zeros(out_channels, self.num_nodes + 2, out_channels, 1, 1), requires_grad=True)
self.bn = nn.BatchNorm2d(out_channels, affine=False)
self.reset_parameters()

def reset_parameters(self):
nn.init.kaiming_normal_(self.final_conv_w)

def forward(self, pprev, prev):
pprev_, prev_ = self.preproc0(pprev), self.preproc1(prev)

prev_nodes_out = [pprev_, prev_]
nodes_used_mask = torch.zeros(self.num_nodes + 2, dtype=torch.bool, device=prev.device)
for i in range(self.num_nodes):
node_out, mask = self.nodes[i](prev_nodes_out)
nodes_used_mask[:mask.size(0)] |= mask.to(node_out.device)
prev_nodes_out.append(node_out)

unused_nodes = torch.cat([out for used, out in zip(nodes_used_mask, prev_nodes_out) if not used], 1)
unused_nodes = F.relu(unused_nodes)
conv_weight = self.final_conv_w[:, ~nodes_used_mask, :, :, :]
conv_weight = conv_weight.view(conv_weight.size(0), -1, 1, 1)
out = F.conv2d(unused_nodes, conv_weight)
return prev, self.bn(out)


class MicroNetwork(nn.Module):
def __init__(self, num_layers=2, num_nodes=5, out_channels=24, in_channels=3, num_classes=10,
dropout_rate=0.0, use_aux_heads=False):
super().__init__()
self.num_layers = num_layers
self.use_aux_heads = use_aux_heads

self.stem = nn.Sequential(
nn.Conv2d(in_channels, out_channels * 3, 3, 1, 1, bias=False),
nn.BatchNorm2d(out_channels * 3)
)

pool_distance = self.num_layers // 3
pool_layers = [pool_distance, 2 * pool_distance + 1]
self.dropout = nn.Dropout(dropout_rate)

self.layers = nn.ModuleList()
c_pp = c_p = out_channels * 3
c_cur = out_channels
for layer_id in range(self.num_layers + 2):
reduction = False
if layer_id in pool_layers:
c_cur, reduction = c_p * 2, True
self.layers.append(ReductionLayer(c_pp, c_p, c_cur))
c_pp = c_p = c_cur
self.layers.append(ENASLayer(num_nodes, c_pp, c_p, c_cur, reduction))
if self.use_aux_heads and layer_id == pool_layers[-1] + 1:
self.layers.append(AuxiliaryHead(c_cur, num_classes))
c_pp, c_p = c_p, c_cur

self.gap = nn.AdaptiveAvgPool2d(1)
self.dense = nn.Linear(c_cur, num_classes)

self.reset_parameters()

def reset_parameters(self):
for m in self.modules():
if isinstance(m, nn.Conv2d):
nn.init.kaiming_normal_(m.weight)

def forward(self, x):
bs = x.size(0)
prev = cur = self.stem(x)
aux_logits = None

for layer in self.layers:
if isinstance(layer, AuxiliaryHead):
if self.training:
aux_logits = layer(cur)
else:
prev, cur = layer(prev, cur)

cur = self.gap(F.relu(cur)).view(bs, -1)
cur = self.dropout(cur)
logits = self.dense(cur)

if aux_logits is not None:
return logits, aux_logits
return logits

+ 197
- 0
dubhe-tadl/enas/mutator.py View File

@@ -0,0 +1,197 @@
# Copyright (c) Microsoft Corporation.
# Licensed under the MIT license.

import torch
import torch.nn as nn
import torch.nn.functional as F

from pytorch.mutator import Mutator
from pytorch.mutables import LayerChoice, InputChoice, MutableScope


class StackedLSTMCell(nn.Module):
def __init__(self, layers, size, bias):
super().__init__()
self.lstm_num_layers = layers
self.lstm_modules = nn.ModuleList([nn.LSTMCell(size, size, bias=bias)
for _ in range(self.lstm_num_layers)])

def forward(self, inputs, hidden):
prev_c, prev_h = hidden
next_c, next_h = [], []
for i, m in enumerate(self.lstm_modules):
curr_c, curr_h = m(inputs, (prev_c[i], prev_h[i]))
next_c.append(curr_c)
next_h.append(curr_h)
# current implementation only supports batch size equals 1,
# but the algorithm does not necessarily have this limitation
inputs = curr_h[-1].view(1, -1)
return next_c, next_h


class EnasMutator(Mutator):
"""
A mutator that mutates the graph with RL.

Parameters
----------
model : nn.Module
PyTorch model.
lstm_size : int
Controller LSTM hidden units.
lstm_num_layers : int
Number of layers for stacked LSTM.
tanh_constant : float
Logits will be equal to ``tanh_constant * tanh(logits)``. Don't use ``tanh`` if this value is ``None``.
cell_exit_extra_step : bool
If true, RL controller will perform an extra step at the exit of each MutableScope, dump the hidden state
and mark it as the hidden state of this MutableScope. This is to align with the original implementation of paper.
skip_target : float
Target probability that skipconnect will appear.
temperature : float
Temperature constant that divides the logits.
branch_bias : float
Manual bias applied to make some operations more likely to be chosen.
Currently this is implemented with a hardcoded match rule that aligns with original repo.
If a mutable has ``reduce`` in its key, every op choice whose typename contains ``conv`` will receive
an initial bias of ``+self.branch_bias``, while the others receive ``-self.branch_bias``.
entropy_reduction : str
Can be one of ``sum`` and ``mean``. How the entropy of multi-input-choice is reduced.
"""

def __init__(self, model, lstm_size=64, lstm_num_layers=1, tanh_constant=1.5, cell_exit_extra_step=False,
skip_target=0.4, temperature=None, branch_bias=0.25, entropy_reduction="sum"):
super().__init__(model)
self.lstm_size = lstm_size
self.lstm_num_layers = lstm_num_layers
self.tanh_constant = tanh_constant
self.temperature = temperature
self.cell_exit_extra_step = cell_exit_extra_step
self.skip_target = skip_target
self.branch_bias = branch_bias

self.lstm = StackedLSTMCell(self.lstm_num_layers, self.lstm_size, False)
self.attn_anchor = nn.Linear(self.lstm_size, self.lstm_size, bias=False)
self.attn_query = nn.Linear(self.lstm_size, self.lstm_size, bias=False)
self.v_attn = nn.Linear(self.lstm_size, 1, bias=False)
self.g_emb = nn.Parameter(torch.randn(1, self.lstm_size) * 0.1)
self.skip_targets = nn.Parameter(torch.tensor([1.0 - self.skip_target, self.skip_target]), requires_grad=False) # pylint: disable=not-callable
assert entropy_reduction in ["sum", "mean"], "Entropy reduction must be one of sum and mean."
self.entropy_reduction = torch.sum if entropy_reduction == "sum" else torch.mean
self.cross_entropy_loss = nn.CrossEntropyLoss(reduction="none")
self.bias_dict = nn.ParameterDict()

self.max_layer_choice = 0
for mutable in self.mutables:
if isinstance(mutable, LayerChoice):
if self.max_layer_choice == 0:
self.max_layer_choice = len(mutable)
assert self.max_layer_choice == len(mutable), \
"ENAS mutator requires all layer choice have the same number of candidates."
# We are judging by keys and module types to add biases to layer choices. Needs refactor.
if "reduce" in mutable.key:
def is_conv(choice):
return "conv" in str(type(choice)).lower()
bias = torch.tensor([self.branch_bias if is_conv(choice) else -self.branch_bias # pylint: disable=not-callable
for choice in mutable])
self.bias_dict[mutable.key] = nn.Parameter(bias, requires_grad=False)

self.embedding = nn.Embedding(self.max_layer_choice + 1, self.lstm_size)
self.soft = nn.Linear(self.lstm_size, self.max_layer_choice, bias=False)

def sample_search(self):
self._initialize()
self._sample(self.mutables)
return self._choices

def sample_final(self):
return self.sample_search()

def _sample(self, tree):
mutable = tree.mutable
if isinstance(mutable, LayerChoice) and mutable.key not in self._choices:
self._choices[mutable.key] = self._sample_layer_choice(mutable)
elif isinstance(mutable, InputChoice) and mutable.key not in self._choices:
self._choices[mutable.key] = self._sample_input_choice(mutable)
for child in tree.children:
self._sample(child)
if isinstance(mutable, MutableScope) and mutable.key not in self._anchors_hid:
if self.cell_exit_extra_step:
self._lstm_next_step()
self._mark_anchor(mutable.key)

def _initialize(self):
self._choices = dict()
self._anchors_hid = dict()
self._inputs = self.g_emb.data
self._c = [torch.zeros((1, self.lstm_size),
dtype=self._inputs.dtype,
device=self._inputs.device) for _ in range(self.lstm_num_layers)]
self._h = [torch.zeros((1, self.lstm_size),
dtype=self._inputs.dtype,
device=self._inputs.device) for _ in range(self.lstm_num_layers)]
self.sample_log_prob = 0
self.sample_entropy = 0
self.sample_skip_penalty = 0

def _lstm_next_step(self):
self._c, self._h = self.lstm(self._inputs, (self._c, self._h))

def _mark_anchor(self, key):
self._anchors_hid[key] = self._h[-1]

def _sample_layer_choice(self, mutable):
self._lstm_next_step()
logit = self.soft(self._h[-1])
if self.temperature is not None:
logit /= self.temperature
if self.tanh_constant is not None:
logit = self.tanh_constant * torch.tanh(logit)
if mutable.key in self.bias_dict:
logit += self.bias_dict[mutable.key]
branch_id = torch.multinomial(F.softmax(logit, dim=-1), 1).view(-1)
log_prob = self.cross_entropy_loss(logit, branch_id)
self.sample_log_prob += self.entropy_reduction(log_prob)
entropy = (log_prob * torch.exp(-log_prob)).detach() # pylint: disable=invalid-unary-operand-type
self.sample_entropy += self.entropy_reduction(entropy)
self._inputs = self.embedding(branch_id)
return F.one_hot(branch_id, num_classes=self.max_layer_choice).bool().view(-1)

def _sample_input_choice(self, mutable):
query, anchors = [], []
for label in mutable.choose_from:
if label not in self._anchors_hid:
self._lstm_next_step()
self._mark_anchor(label) # empty loop, fill not found
query.append(self.attn_anchor(self._anchors_hid[label]))
anchors.append(self._anchors_hid[label])
query = torch.cat(query, 0)
query = torch.tanh(query + self.attn_query(self._h[-1]))
query = self.v_attn(query)
if self.temperature is not None:
query /= self.temperature
if self.tanh_constant is not None:
query = self.tanh_constant * torch.tanh(query)

if mutable.n_chosen is None:
logit = torch.cat([-query, query], 1) # pylint: disable=invalid-unary-operand-type

skip = torch.multinomial(F.softmax(logit, dim=-1), 1).view(-1)
skip_prob = torch.sigmoid(logit)
kl = torch.sum(skip_prob * torch.log(skip_prob / self.skip_targets))
self.sample_skip_penalty += kl
log_prob = self.cross_entropy_loss(logit, skip)
self._inputs = (torch.matmul(skip.float(), torch.cat(anchors, 0)) / (1. + torch.sum(skip))).unsqueeze(0)
else:
assert mutable.n_chosen == 1, "Input choice must select exactly one or any in ENAS."
logit = query.view(1, -1)
index = torch.multinomial(F.softmax(logit, dim=-1), 1).view(-1)
skip = F.one_hot(index, num_classes=mutable.n_candidates).view(-1)
log_prob = self.cross_entropy_loss(logit, index)
self._inputs = anchors[index.item()]

self.sample_log_prob += self.entropy_reduction(log_prob)
entropy = (log_prob * torch.exp(-log_prob)).detach() # pylint: disable=invalid-unary-operand-type
self.sample_entropy += self.entropy_reduction(entropy)
return skip.bool()

+ 129
- 0
dubhe-tadl/enas/ops.py View File

@@ -0,0 +1,129 @@
# Copyright (c) Microsoft Corporation.
# Licensed under the MIT license.

import torch
import torch.nn as nn


class StdConv(nn.Module):
def __init__(self, C_in, C_out):
super(StdConv, self).__init__()
self.conv = nn.Sequential(
nn.Conv2d(C_in, C_out, 1, stride=1, padding=0, bias=False),
nn.BatchNorm2d(C_out, affine=False),
nn.ReLU()
)

def forward(self, x):
return self.conv(x)

def __str__(self):
return 'StdConv'


class PoolBranch(nn.Module):
def __init__(self, pool_type, C_in, C_out, kernel_size, stride, padding, affine=False):
super().__init__()
self.kernel_size = kernel_size
self.pool_type = pool_type
self.preproc = StdConv(C_in, C_out)
self.pool = Pool(pool_type, kernel_size, stride, padding)
self.bn = nn.BatchNorm2d(C_out, affine=affine)

def forward(self, x):
out = self.preproc(x)
out = self.pool(out)
out = self.bn(out)
return out

def __str__(self):
return '{}PoolBranch_{}'.format(self.pool_type, self.kernel_size)

class SeparableConv(nn.Module):
def __init__(self, C_in, C_out, kernel_size, stride, padding):
super(SeparableConv, self).__init__()
self.kernel_size = kernel_size
self.depthwise = nn.Conv2d(C_in, C_in, kernel_size=kernel_size, padding=padding, stride=stride,
groups=C_in, bias=False)
self.pointwise = nn.Conv2d(C_in, C_out, kernel_size=1, bias=False)

def forward(self, x):
out = self.depthwise(x)
out = self.pointwise(out)
return out

def __str__(self):
return 'SeparableConv_{}'.format(self.kernel_size)

class ConvBranch(nn.Module):
def __init__(self, C_in, C_out, kernel_size, stride, padding, separable):
super(ConvBranch, self).__init__()
self.kernel_size = kernel_size
self.preproc = StdConv(C_in, C_out)
if separable:
self.conv = SeparableConv(C_out, C_out, kernel_size, stride, padding)
else:
self.conv = nn.Conv2d(C_out, C_out, kernel_size, stride=stride, padding=padding)
self.postproc = nn.Sequential(
nn.BatchNorm2d(C_out, affine=False),
nn.ReLU()
)

def forward(self, x):
out = self.preproc(x)
out = self.conv(out)
out = self.postproc(out)
return out

def __str__(self):
return 'ConvBranch_{}'.format(self.kernel_size)

class FactorizedReduce(nn.Module):
def __init__(self, C_in, C_out, affine=False):
super().__init__()
self.conv1 = nn.Conv2d(C_in, C_out // 2, 1, stride=2, padding=0, bias=False)
self.conv2 = nn.Conv2d(C_in, C_out // 2, 1, stride=2, padding=0, bias=False)
self.bn = nn.BatchNorm2d(C_out, affine=affine)

def forward(self, x):
out = torch.cat([self.conv1(x), self.conv2(x[:, :, 1:, 1:])], dim=1)
out = self.bn(out)
return out

def __str__(self):
return 'FactorizedReduce'

class Pool(nn.Module):
def __init__(self, pool_type, kernel_size, stride, padding):
super().__init__()
self.kernel_size = kernel_size
self.pool_type = pool_type
if pool_type.lower() == 'max':
self.pool = nn.MaxPool2d(kernel_size, stride, padding)
elif pool_type.lower() == 'avg':
self.pool = nn.AvgPool2d(kernel_size, stride, padding, count_include_pad=False)
else:
raise ValueError()

def forward(self, x):
return self.pool(x)

def __str__(self):
return '{}Pool_{}'.format(self.pool_type, self.kernel_size)

class SepConvBN(nn.Module):
def __init__(self, C_in, C_out, kernel_size, padding):
super().__init__()
self.kernel_size = kernel_size
self.relu = nn.ReLU()
self.conv = SeparableConv(C_in, C_out, kernel_size, 1, padding)
self.bn = nn.BatchNorm2d(C_out, affine=True)

def forward(self, x):
x = self.relu(x)
x = self.conv(x)
x = self.bn(x)
return x

def __str__(self):
return 'SepConvBN_{}'.format(self.kernel_size)

+ 490
- 0
dubhe-tadl/enas/retrain.py View File

@@ -0,0 +1,490 @@
import sys
sys.path.append('..'+ '/' + '..')
import os
import logging
import pickle
import shutil
import random
import math
import time
import datetime
import argparse
import distutils.util
import numpy as np
import json
import torch
from torch import nn
from torch import optim
from torch.utils.data import DataLoader
import torch.nn.functional as Func

from macro import GeneralNetwork
from micro import MicroNetwork
import datasets
from utils import accuracy, reward_accuracy
from pytorch.fixed import apply_fixed_architecture
from pytorch.utils import AverageMeterGroup, to_device, save_best_checkpoint

logger = logging.getLogger("enas-retrain")

# TODO:
def set_random_seed(seed):
logger.info("set random seed for data reading: {}".format(seed))
random.seed(seed)
os.environ['PYTHONHASHSEED'] = str(seed)
np.random.seed(seed)
torch.manual_seed(seed)
if FLAGS.is_cuda:
torch.cuda.manual_seed_all(seed)
torch.backends.cudnn.deterministic = True


# TODO: parser args
def parse_args():
parser = argparse.ArgumentParser()
parser.add_argument(
"--data_dir",
type=str,
default="./data",
help="Directory containing the dataset and embedding file. (default: %(default)s)")
parser.add_argument("--search_space_path", type=str,
default='./search_space.json', help="search_space directory")
parser.add_argument(
"--selected_space_path",
type=str,
default="./selected_space.json",
# required=True,
help="Architecture json file. (default: %(default)s)")
parser.add_argument("--result_path", type=str,
default='./result.json', help="res directory")
parser.add_argument('--trial_id', type=int, default=0, metavar='N',
help='trial_id,start from 0')
parser.add_argument(
"--output_dir",
type=str,
default="./output",
help="The output directory. (default: %(default)s)")
parser.add_argument(
"--best_checkpoint_dir",
type=str,
default="best_checkpoint",
help="Path for saved checkpoints. (default: %(default)s)")
parser.add_argument("--search_for",
choices=["macro", "micro"],
default="micro")
parser.add_argument(
"--batch_size",
type=int,
default=128,
help="Number of samples each batch for training. (default: %(default)s)")
parser.add_argument(
"--eval_batch_size",
type=int,
default=128,
help="Number of samples each batch for evaluation. (default: %(default)s)")
parser.add_argument(
"--class_num",
type=int,
default=10,
help="The number of categories. (default: %(default)s)")
parser.add_argument(
"--epochs",
type=int,
default=10,
help="The number of training epochs. (default: %(default)s)")
parser.add_argument(
"--child_lr",
type=float,
default=0.02,
help="The initial learning rate. (default: %(default)s)")
parser.add_argument(
"--is_cuda",
type=distutils.util.strtobool,
default=True,
help="Specify the device type. (default: %(default)s)")
parser.add_argument(
"--load_checkpoint",
type=distutils.util.strtobool,
default=False,
help="Whether to load checkpoint. (default: %(default)s)")
parser.add_argument(
"--log_every",
type=int,
default=50,
help="How many steps to log. (default: %(default)s)")
parser.add_argument(
"--eval_every_epochs",
type=int,
default=1,
help="How many epochs to eval. (default: %(default)s)")
parser.add_argument(
"--child_grad_bound",
type=float,
default=5.0,
help="The threshold for gradient clipping. (default: %(default)s)") #
parser.add_argument(
"--child_lr_decay_scheme",
type=str,
default="cosine",
help="Learning rate annealing strategy, only 'cosine' supported. (default: %(default)s)") #todo: remove
parser.add_argument(
"--child_lr_T_0",
type=int,
default=10,
help="The length of one cycle. (default: %(default)s)") # todo: use for
parser.add_argument(
"--child_lr_T_mul",
type=int,
default=2,
help="The multiplication factor per cycle. (default: %(default)s)") # todo: use for
parser.add_argument(
"--child_l2_reg",
type=float,
default=3e-6,
help="Weight decay factor. (default: %(default)s)")
parser.add_argument(
"--child_lr_max",
type=float,
default=0.002,
help="The max learning rate. (default: %(default)s)")
parser.add_argument(
"--child_lr_min",
type=float,
default=0.001,
help="The min learning rate. (default: %(default)s)")
parser.add_argument(
"--multi_path",
type=distutils.util.strtobool,
default=False,
help="Search for multiple path in the architecture. (default: %(default)s)") # todo: use for
parser.add_argument(
"--is_mask",
type=distutils.util.strtobool,
default=True,
help="Apply mask. (default: %(default)s)")
global FLAGS
FLAGS = parser.parse_args()


def print_user_flags(FLAGS, line_limit=80):
log_strings = "\n" + "-" * line_limit + "\n"
for flag_name in sorted(vars(FLAGS)):
value = "{}".format(getattr(FLAGS, flag_name))
log_string = flag_name
log_string += "." * (line_limit - len(flag_name) - len(value))
log_string += value
log_strings = log_strings + log_string
log_strings = log_strings + "\n"
log_strings += "-" * line_limit
logger.info(log_strings)

def eval_once(child_model, device, eval_set, criterion, valid_dataloader=None, test_dataloader=None):
if eval_set == "test":
assert test_dataloader is not None
dataloader = test_dataloader
elif eval_set == "valid":
assert valid_dataloader is not None
dataloader = valid_dataloader
else:
raise NotImplementedError("Unknown eval_set '{}'".format(eval_set))

tot_acc = 0
tot = 0
losses = []

with torch.no_grad(): # save memory
for batch in dataloader:

x, y = batch
x, y = to_device(x, device), to_device(y, device)
logits = child_model(x)

if isinstance(logits, tuple):
logits, aux_logits = logits
aux_loss = criterion(aux_logits, y)
else:
aux_loss = 0.

loss = criterion(logits, y)
loss = loss + aux_weight * aux_loss
# loss = loss.mean()
preds = logits.argmax(dim=1).long()
acc = torch.eq(preds, y.long()).long().sum().item()

losses.append(loss)
tot_acc += acc
tot += len(y)

losses = torch.tensor(losses)
loss = losses.mean()
if tot > 0:
final_acc = float(tot_acc) / tot
else:
final_acc = 0
logger.info("Error in calculating final_acc")
return final_acc, loss

# TODO: learning rate scheduler
def update_lr(
optimizer,
epoch,
l2_reg=1e-4,
lr_warmup_val=None,
lr_init=0.1,
lr_decay_scheme="cosine",
lr_max=0.002,
lr_min=0.000000001,
lr_T_0=4,
lr_T_mul=1,
sync_replicas=False,
num_aggregate=None,
num_replicas=None):
if lr_decay_scheme == "cosine":
assert lr_max is not None, "Need lr_max to use lr_cosine"
assert lr_min is not None, "Need lr_min to use lr_cosine"
assert lr_T_0 is not None, "Need lr_T_0 to use lr_cosine"
assert lr_T_mul is not None, "Need lr_T_mul to use lr_cosine"

T_i = lr_T_0
t_epoch = epoch
last_reset = 0
while True:
t_epoch -= T_i
if t_epoch < 0:
break
last_reset += T_i
T_i *= lr_T_mul

T_curr = epoch - last_reset

def _update():
rate = T_curr / T_i * math.pi
lr = lr_min + 0.5 * (lr_max - lr_min) * (1.0 + math.cos(rate))
return lr

learning_rate = _update()
else:
raise ValueError("Unknown learning rate decay scheme {}".format(lr_decay_scheme))

#update lr in optimizer
for params_group in optimizer.param_groups:
params_group['lr'] = learning_rate
return learning_rate
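
# Worked example of the cosine schedule above (illustrative numbers only): with
# lr_max=0.05, lr_min=0.001, lr_T_0=10 and lr_T_mul=2, cycles restart at epoch 10
# and again at epoch 30 (each cycle is twice as long as the previous one).
#   epoch 0 : lr = lr_min + 0.5 * (lr_max - lr_min) * (1 + cos(0))      = 0.0500
#   epoch 5 : lr = lr_min + 0.5 * (lr_max - lr_min) * (1 + cos(pi / 2)) = 0.0255
#   epoch 10: a new cycle starts (T_i becomes 20), so lr returns to 0.0500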

def train(device, output_dir='./output'):
workers = 4
data = 'cifar10'

data_dir = FLAGS.data_dir
output_dir = FLAGS.output_dir
checkpoint_dir = FLAGS.best_checkpoint_dir
batch_size = FLAGS.batch_size
eval_batch_size = FLAGS.eval_batch_size
class_num = FLAGS.class_num
epochs = FLAGS.epochs
child_lr = FLAGS.child_lr
is_cuda = FLAGS.is_cuda
load_checkpoint = FLAGS.load_checkpoint
log_every = FLAGS.log_every
eval_every_epochs = FLAGS.eval_every_epochs

child_grad_bound = FLAGS.child_grad_bound
child_l2_reg = FLAGS.child_l2_reg

logger.info("Build dataloader")
dataset_train, dataset_valid = datasets.get_dataset("cifar10")
n_train = len(dataset_train)
split = n_train // 10
indices = list(range(n_train))
train_sampler = torch.utils.data.sampler.SubsetRandomSampler(indices[:-split])
valid_sampler = torch.utils.data.sampler.SubsetRandomSampler(indices[-split:])
train_dataloader = torch.utils.data.DataLoader(dataset_train,
batch_size=batch_size,
sampler=train_sampler,
num_workers=workers)
valid_dataloader = torch.utils.data.DataLoader(dataset_train,
batch_size=batch_size,
sampler=valid_sampler,
num_workers=workers)
test_dataloader = torch.utils.data.DataLoader(dataset_valid,
batch_size=batch_size,
num_workers=workers)



criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.SGD(child_model.parameters(), 0.05, momentum=0.9, weight_decay=1.0E-4, nesterov=True)
# optimizer = optim.Adam(child_model.parameters(), eps=1e-3, weight_decay=FLAGS.child_l2_reg)
# TODO
lr_scheduler = torch.optim.lr_scheduler.CosineAnnealingLR(optimizer, T_max=epochs, eta_min=0.001)

# move model to CPU/GPU device
child_model.to(device)
criterion.to(device)

logger.info('Start training')
start_time = time.time()
step = 0

# save path
if not os.path.exists(output_dir):
os.mkdir(output_dir)
# model_save_path = os.path.join(output_dir, "model.pth")
# best_model_save_path = os.path.join(output_dir, "best_model.pth")
best_acc = 0
start_epoch = 0

# TODO: load checkpoints

# train
for epoch in range(start_epoch, epochs):
lr = update_lr(optimizer,
epoch,
l2_reg= 1e-4,
lr_warmup_val=None,
lr_init=FLAGS.child_lr,
lr_decay_scheme=FLAGS.child_lr_decay_scheme,
lr_max=0.05,
lr_min=0.001,
lr_T_0=10,
lr_T_mul=2)
child_model.train()
for batch in train_dataloader:
step += 1

x, y = batch
x, y = to_device(x, device), to_device(y, device)
logits = child_model(x)

if isinstance(logits, tuple):
logits, aux_logits = logits
aux_loss = criterion(aux_logits, y)
else:
aux_loss = 0.

acc = accuracy(logits, y)
loss = criterion(logits, y)
loss = loss + aux_weight * aux_loss

optimizer.zero_grad()
loss.backward()
trainable_params = child_model.parameters()
# clip gradients globally and record the gradient norm for logging
grad_norm = nn.utils.clip_grad_norm_(trainable_params, child_grad_bound)

optimizer.step()

if step % log_every == 0:
curr_time = time.time()
log_string = ""
log_string += "epoch={:<6d}".format(epoch)
log_string += "ch_step={:<6d}".format(step)
log_string += " loss={:<8.6f}".format(loss)
log_string += " lr={:<8.4f}".format(lr)
log_string += " |g|={:<8.4f}".format(grad_norm)
log_string += " tr_acc={:<8.4f}/{:>3d}".format(acc['acc1'], logits.size()[0])
log_string += " mins={:<10.2f}".format(float(curr_time - start_time) / 60)
logger.info(log_string)

epoch += 1
save_state = {
'step': step,
'epoch': epoch,
'child_model_state_dict': child_model.state_dict(),
'optimizer_state_dict': optimizer.state_dict()}
# print(' Epoch {:<3d} loss: {:<.2f} '.format(epoch, loss))
# torch.save(save_state, model_save_path)
child_model.eval()
logger.info("Epoch {}: Eval".format(epoch))
eval_acc, eval_loss = eval_once(child_model, device, "test", criterion, test_dataloader=test_dataloader)
logger.info(
"ch_step={} {}_accuracy={:<6.4f} {}_loss={:<6.4f}".format(step, "test", eval_acc, "test", eval_loss))
if eval_acc > best_acc:
best_acc = eval_acc
logger.info("Save best model")
# save_state = {
# 'step': step,
# 'epoch': epoch,
# 'child_model_state_dict': child_model.state_dict(),
# 'optimizer_state_dict': optimizer.state_dict()}
# torch.save(save_state, best_model_save_path)
save_best_checkpoint(checkpoint_dir, child_model, optimizer, epoch)

result['accuracy'].append('Epoch {} acc: {:<6.4f}'.format(epoch, eval_acc,))

acc_l.append(eval_acc)

print(result['accuracy'][-1])

print('max acc %.4f at epoch: %i'%(max(acc_l), np.argmax(np.array(acc_l))))
print('Time cost: %.4f hours'%( float(time.time() - start_time) /3600. ))
return result

# macro = True
parse_args()
child_fixed_arc = FLAGS.selected_space_path # './macro_selected_space'
search_for = FLAGS.search_for
# set random seed
torch.manual_seed(FLAGS.trial_id)
torch.cuda.manual_seed_all(FLAGS.trial_id)
np.random.seed(FLAGS.trial_id)
random.seed(FLAGS.trial_id)

aux_weight = 0.4
result = {'accuracy':[]}
acc_l = []

# decode human readable search space to model
def convert_selected_space_format():
# with open('./macro_selected_space.json') as js:
with open(child_fixed_arc) as js:
selected_space = json.load(js)

ops = selected_space['op_list']
selected_space.pop('op_list')
new_selected_space = {}

for key, value in selected_space.items():
# for macro
if FLAGS.search_for == 'macro':
new_key = key.split('_')[-1]
# for micro
elif FLAGS.search_for == 'micro':
new_key = key

if len(value) > 1 or len(value)==0:
new_value = value
elif len(value) > 0 and value[0] in ops:
new_value = ops.index(value[0])
else:
new_value = value[0]
new_selected_space[new_key] = new_value
return new_selected_space
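
# Illustrative sketch of the conversion above; the key and op names below are
# hypothetical, the real ones come from the exported selected_space.json.
#   input : {"op_list": ["ConvBranch_3", "PoolBranch_avg"],
#            "layer_0_LayerChoice1": ["ConvBranch_3"],   # op name -> index into op_list
#            "layer_1_InputChoice2": [0],                # single non-op value -> unwrapped
#            "layer_2_InputChoice3": []}                 # empty or multi-valued -> kept as-is
#   output (macro): {"LayerChoice1": 0, "InputChoice2": 0, "InputChoice3": []}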

fixed_arc = convert_selected_space_format()
# TODO : macro search or micro search
if FLAGS.search_for == 'macro':
child_model = GeneralNetwork()
elif FLAGS.search_for == 'micro':
child_model = MicroNetwork(num_layers=6, out_channels=20, num_nodes=5, dropout_rate=0.1, use_aux_heads=True)

apply_fixed_architecture(child_model,fixed_arc)

def dump_global_result(res_path,global_result, sort_keys = False):
with open(res_path, "w") as ss_file:
json.dump(global_result, ss_file, sort_keys=sort_keys, indent=2)


def main():
os.environ['CUDA_VISIBLE_DEVICES'] = '4'
# device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
device = torch.device("cuda" if FLAGS.is_cuda else "cpu")
train(device)
dump_global_result('result_retrain.json', result['accuracy'])

if __name__ == "__main__":
main()


+ 495
- 0
dubhe-tadl/enas/retrainer.py View File

@@ -0,0 +1,495 @@
import sys
from utils import accuracy
import torch
from torch import nn
from torch.utils.data import DataLoader
import datasets
import time
import logging
import os
import argparse
import distutils.util
import numpy as np
import json
import random

sys.path.append('../..')

# import custom packages
from macro import GeneralNetwork
from micro import MicroNetwork
from pytorch.fixed import apply_fixed_architecture
from pytorch.retrainer import Retrainer
from pytorch.utils import AverageMeterGroup, to_device, save_best_checkpoint, mkdirs

class EnasRetrainer(Retrainer):
"""
ENAS retrainer.

Parameters
----------
model : nn.Module
PyTorch model to be trained.
data_dir : str
The path of the dataset.
best_checkpoint_dir : str
The directory for saving the best model checkpoint.
batch_size : int
Batch size for training.
eval_batch_size : int
Batch size for evaluation.
num_epochs : int
Number of epochs planned for training.
lr : float
Learning rate.
is_cuda : bool
Whether to use GPU for training.
log_every : int
Step count per logging.
child_grad_bound : float
Gradient clipping threshold.
child_l2_reg : float
L2 regularization (weight decay) factor.
eval_every_epochs : int
Evaluate every this many epochs.
logger : logging.Logger
Logger used for training messages.
workers : int
Workers for data loading.
device : torch.device
``torch.device("cpu")`` or ``torch.device("cuda")``.
aux_weight : float
Weight of auxiliary head loss. ``aux_weight * aux_loss`` will be added to total loss.
"""
def __init__(self, model, data_dir='./data', best_checkpoint_dir='./best_checkpoint',
batch_size=1024, eval_batch_size=1024, num_epochs=2, lr=0.02, is_cuda=True,
log_every=40, child_grad_bound=0.5, child_l2_reg=3e-6, eval_every_epochs=2,
logger=logging.getLogger("enas-retrain"), result_path='./'):
self.aux_weight = 0.4
self.device = torch.device("cuda:0" if is_cuda else "cpu")
self.workers = 4

self.child_model = model
self.data_dir = data_dir
self.best_checkpoint_dir = best_checkpoint_dir
self.batch_size = batch_size
self.eval_batch_size = eval_batch_size
self.num_epochs = num_epochs
self.lr = lr
self.is_cuda = is_cuda
self.log_every = log_every
self.child_grad_bound = child_grad_bound
self.child_l2_reg = child_l2_reg
self.eval_every_epochs = eval_every_epochs
self.logger = logger

self.optimizer = torch.optim.SGD(self.child_model.parameters(), self.lr, momentum=0.9, weight_decay=1.0E-4, nesterov=True)
self.criterion = nn.CrossEntropyLoss()
self.lr_scheduler = torch.optim.lr_scheduler.CosineAnnealingLR(self.optimizer, T_max=self.num_epochs, eta_min=0.001)

# load dataset
self.init_dataloader()
self.child_model.to(self.device)
self.result_path = result_path
with open(self.result_path, "w") as file:
file.write('')

def train(self):
"""
Train the child model for ``num_epochs`` epochs, evaluating and saving the
best checkpoint after each epoch.
"""
self.logger.info('** Start training **')

self.start_time = time.time()
for epoch in range(self.num_epochs):

self.train_one_epoch(epoch)

self.child_model.eval()

# if epoch / self.eval_every_epochs == 0:
self.logger.info("Epoch {}: Eval".format(epoch))
self.validate_one_epoch(epoch)

self.lr_scheduler.step()

# print('** saving model **')

self.logger.info("** Save best model **")
# save_state = {
# 'epoch': epoch,
# 'child_model_state_dict': self.child_model.state_dict(),
# 'optimizer_state_dict': self.optimizer.state_dict()}
# torch.save(save_state, self.best_checkpoint_dir)
save_best_checkpoint(self.best_checkpoint_dir, self.child_model, self.optimizer, epoch)

def validate(self):
"""
Do one validation. Validate one epoch.
"""
pass

def export(self, file):
"""
dump the architecture to ``file``.

Parameters
----------
file : str
File path to export to. Expected to be a JSON.
"""
pass

def checkpoint(self):
"""
Override to dump a checkpoint.
"""
pass

def init_dataloader(self):
self.logger.info("Build dataloader")
self.dataset_train, self.dataset_valid = datasets.get_dataset("cifar10", self.data_dir)
n_train = len(self.dataset_train)
split = n_train // 10
indices = list(range(n_train))
train_sampler = torch.utils.data.sampler.SubsetRandomSampler(indices[:-split])
valid_sampler = torch.utils.data.sampler.SubsetRandomSampler(indices[-split:])
self.train_loader = torch.utils.data.DataLoader(self.dataset_train,
batch_size=self.batch_size,
sampler=train_sampler,
num_workers=self.workers)
self.valid_loader = torch.utils.data.DataLoader(self.dataset_train,
batch_size=self.eval_batch_size,
sampler=valid_sampler,
num_workers=self.workers)
self.test_loader = torch.utils.data.DataLoader(self.dataset_valid,
batch_size=self.batch_size,
num_workers=self.workers)
# self.train_loader = cycle(self.train_loader)
# self.valid_loader = cycle(self.valid_loader)

def train_one_epoch(self,epoch):
"""
Train one epoch.

Parameters
----------
epoch : int
Epoch number starting from 0.
"""
tot_acc = 0
tot = 0
losses = []
step = 0
self.child_model.train()
meters = AverageMeterGroup()

for batch in self.train_loader:
step += 1

x, y = batch
x, y = to_device(x, self.device), to_device(y, self.device)

logits = self.child_model(x)

if isinstance(logits, tuple):
logits, aux_logits = logits
aux_loss = self.criterion(aux_logits, y)
else:
aux_loss = 0.

acc = accuracy(logits, y)
loss = self.criterion(logits, y)
loss = loss + self.aux_weight * aux_loss

self.optimizer.zero_grad()
loss.backward()
grad_norm = 0
trainable_params = self.child_model.parameters()

# assert FLAGS.child_grad_bound is not None, "Need grad_bound to clip gradients."
# # compute the gradient norm value
# grad_norm = nn.utils.clip_grad_norm_(trainable_params, 99999999)
# for param in trainable_params:
# nn.utils.clip_grad_norm_(param, self.child_grad_bound) # clip grad
# print(param_ == param)
if self.child_grad_bound is not None:
grad_norm = nn.utils.clip_grad_norm_(trainable_params, self.child_grad_bound)

self.optimizer.step()

tot_acc += acc['acc1']
tot += 1
losses.append(loss)
acc["loss"] = loss.item()
meters.update(acc)

if step % self.log_every == 0:
curr_time = time.time()
log_string = ""
log_string += "epoch={:<6d}".format(epoch)
log_string += "ch_step={:<6d}".format(step)
log_string += " loss={:<8.6f}".format(loss)
log_string += " lr={:<8.4f}".format(self.optimizer.param_groups[0]['lr'])
log_string += " |g|={:<8.4f}".format(grad_norm)
log_string += " tr_acc={:<8.4f}/{:>3d}".format(acc['acc1'], logits.size()[0])
log_string += " mins={:<10.2f}".format(float(curr_time - self.start_time) / 60)
self.logger.info(log_string)

print("Model Epoch [%d/%d] %.3f mins %s \n " % (epoch + 1,
self.num_epochs, float(time.time() - self.start_time) / 60, meters ))
final_acc = float(tot_acc) / tot

losses = torch.tensor(losses)
loss = losses.mean()


def validate_one_epoch(self,epoch):
tot_acc = 0
tot = 0
losses = []
meters = AverageMeterGroup()

with torch.no_grad(): # save memory
meters = AverageMeterGroup()
for batch in self.valid_loader:
x, y = batch
x, y = to_device(x, self.device), to_device(y, self.device)
logits = self.child_model(x)

if isinstance(logits, tuple):
logits, aux_logits = logits
aux_loss = self.criterion(aux_logits, y)
else:
aux_loss = 0.

loss = self.criterion(logits, y)
loss = loss + self.aux_weight * aux_loss
# loss = loss.mean()
preds = logits.argmax(dim=1).long()
acc = torch.eq(preds, y.long()).long().sum().item()
acc_v = accuracy(logits, y)

losses.append(loss)
tot_acc += acc
tot += len(y)

acc_v["loss"] = loss.item()
meters.update(acc_v)

losses = torch.tensor(losses)
loss = losses.mean()
if tot > 0:
final_acc = float(tot_acc) / tot
else:
final_acc = 0
self.logger.info("Error in calculating final_acc")

with open(self.result_path, "a") as file:
file.write(
str({"type": "Accuracy",
"result": {"sequence": epoch, "category": "epoch", "value": final_acc}}) + '\n')

# print("Model eval %.3fmins %s \n " % (
# float(time.time() - self.start_time) / 60, meters ))
print({"type": "Accuracy",
"result": {"sequence": epoch, "category": "epoch", "value": final_acc}})

self.logger.info(
"epoch={} {}_accuracy={:<6.4f} {}_loss={:<6.4f}".format(epoch, "valid", final_acc, "valid", loss))


logging.basicConfig(format='%(asctime)s - %(filename)s[line:%(lineno)d] - %(levelname)s: %(message)s',
level=logging.INFO,
filename='./retrain.log',
filemode='a')
logger = logging.getLogger("enas-retrain")

def parse_args():
parser = argparse.ArgumentParser()
parser.add_argument(
"--data_dir",
type=str,
default="./data",
help="Directory containing the dataset and embedding file. (default: %(default)s)")
parser.add_argument(
"--model_selected_space_path",
type=str,
default="./model_selected_space.json",
# required=True,
help="Architecture json file. (default: %(default)s)")
parser.add_argument("--result_path", type=str,
default='./result.json', help="res directory")
parser.add_argument("--search_space_path", type=str,
default='./search_space.json', help="search_space directory")
parser.add_argument("--log_path", type=str, default='output/log')
parser.add_argument(
"--best_selected_space_path",
type=str,
default="./best_selected_space.json",
# required=True,
help="Best architecture selected json file by experiment. (default: %(default)s)")
parser.add_argument(
"--best_checkpoint_dir",
type=str,
default="best_checkpoint",
help="Path for saved checkpoints. (default: %(default)s)")
parser.add_argument('--trial_id', type=int, default=0, metavar='N',
help='trial_id,start from 0')
parser.add_argument("--search_for",
choices=["macro", "micro"],
default="macro")
parser.add_argument(
"--batch_size",
type=int,
default=128,
help="Number of samples each batch for training. (default: %(default)s)")
parser.add_argument(
"--eval_batch_size",
type=int,
default=128,
help="Number of samples each batch for evaluation. (default: %(default)s)")
parser.add_argument(
"--epochs",
type=int,
default=10,
help="The number of training epochs. (default: %(default)s)")
parser.add_argument(
"--lr",
type=float,
default=0.02,
help="The initial learning rate. (default: %(default)s)")
parser.add_argument(
"--is_cuda",
type=distutils.util.strtobool,
default=True,
help="Specify the device type. (default: %(default)s)")
parser.add_argument(
"--load_checkpoint",
type=distutils.util.strtobool,
default=False,
help="Whether to load checkpoint. (default: %(default)s)")
parser.add_argument(
"--log_every",
type=int,
default=50,
help="How many steps to log. (default: %(default)s)")
parser.add_argument(
"--eval_every_epochs",
type=int,
default=1,
help="How many epochs to eval. (default: %(default)s)")
parser.add_argument(
"--child_grad_bound",
type=float,
default=5.0,
help="The threshold for gradient clipping. (default: %(default)s)") #
parser.add_argument(
"--child_l2_reg",
type=float,
default=3e-6,
help="Weight decay factor. (default: %(default)s)")
parser.add_argument(
"--child_lr_decay_scheme",
type=str,
default="cosine",
help="Learning rate annealing strategy, only 'cosine' supported. (default: %(default)s)") #todo: remove
global FLAGS
FLAGS = parser.parse_args()

# decode human readable search space to model
def convert_selected_space_format(child_fixed_arc):
# with open('./macro_selected_space.json') as js:
with open(child_fixed_arc) as js:
selected_space = json.load(js)

ops = selected_space['op_list']
selected_space.pop('op_list')
new_selected_space = {}

for key, value in selected_space.items():
# for macro
if FLAGS.search_for == 'macro':
new_key = key.split('_')[-1]
# for micro
elif FLAGS.search_for == 'micro':
new_key = key

if len(value) > 1 or len(value)==0:
new_value = value
elif len(value) > 0 and value[0] in ops:
new_value = ops.index(value[0])
else:
new_value = value[0]
new_selected_space[new_key] = new_value

return new_selected_space

def set_random_seed(seed):
logger.info("set random seed for data reading: {}".format(seed))
random.seed(seed)
os.environ['PYTHONHASHSEED'] = str(seed)
np.random.seed(seed)
random.seed(seed)
torch.manual_seed(seed)
if FLAGS.is_cuda:
torch.cuda.manual_seed_all(seed)
torch.backends.cudnn.deterministic = True

def main():

parse_args()

child_fixed_arc = FLAGS.best_selected_space_path # './macro_selected_space'
search_for = FLAGS.search_for

# set random seed from trial_id for reproducibility
set_random_seed(FLAGS.trial_id)

mkdirs(FLAGS.result_path, FLAGS.log_path, FLAGS.best_checkpoint_dir)
# define and load model
logger.info('** ' + FLAGS.search_for + ' search **')
fixed_arc = convert_selected_space_format(child_fixed_arc)
# Model, macro search or micro search
if FLAGS.search_for == 'macro':
child_model = GeneralNetwork()
elif FLAGS.search_for == 'micro':
child_model = MicroNetwork(num_layers=6, out_channels=20, num_nodes=5, dropout_rate=0.1, use_aux_heads=True)

apply_fixed_architecture(child_model, fixed_arc)

# load model
if FLAGS.load_checkpoint:
print('** Load model **')
logger.info('** Load model **')
child_model.load_state_dict(torch.load(FLAGS.best_checkpoint_dir)['child_model_state_dict'])

retrainer = EnasRetrainer(model=child_model,
data_dir = FLAGS.data_dir,
best_checkpoint_dir=FLAGS.best_checkpoint_dir,
batch_size=FLAGS.batch_size,
eval_batch_size=FLAGS.eval_batch_size,
num_epochs=FLAGS.epochs,
lr=FLAGS.lr,
is_cuda=FLAGS.is_cuda,
log_every=FLAGS.log_every,
child_grad_bound=FLAGS.child_grad_bound,
child_l2_reg=FLAGS.child_l2_reg,
eval_every_epochs=FLAGS.eval_every_epochs,
logger=logger,
result_path=FLAGS.result_path,
)

t1 = time.time()
retrainer.train()
print('cost time for retrain: ' , time.time() - t1)

if __name__ == "__main__":
main()

+ 135
- 0
dubhe-tadl/enas/search.py View File

@@ -0,0 +1,135 @@
# Copyright (c) Microsoft Corporation.
# Licensed under the MIT license.
import sys
sys.path.append('..'+ '/' + '..')
import logging
import time
from argparse import ArgumentParser

import random

import numpy as np
import torch
import torch.nn as nn

import datasets
from macro import GeneralNetwork
from micro import MicroNetwork
from trainer import EnasTrainer
from mutator import EnasMutator
from pytorch.callbacks import (ArchitectureCheckpoint,
LRSchedulerCallback)
from utils import accuracy, reward_accuracy
from collections import OrderedDict
from pytorch.mutables import LayerChoice, InputChoice
import json
torch.cuda.set_device(4)

logger = logging.getLogger('tadl-enas')

# save search space as search_space.json
def save_nas_search_space(mutator,file_path):
result = OrderedDict()
cur_layer_idx = None
for mutable in mutator.mutables.traverse():
if not isinstance(mutable,(LayerChoice, InputChoice)):
cur_layer_idx = mutable.key + '_'
continue
# macro
if 'layer' in cur_layer_idx:
if isinstance(mutable, LayerChoice):
if 'op_list' not in result:
result['op_list'] = [str(i) for i in mutable]
result[cur_layer_idx + mutable.key] = 'op_list'
else:
result[cur_layer_idx + mutable.key] = {'skip_connection': False if mutable.n_chosen else True,
'n_chosen': mutable.n_chosen if mutable.n_chosen else '',
'choose_from': mutable.choose_from if mutable.choose_from else ''}
# micro
elif 'node' in cur_layer_idx:
if isinstance(mutable,LayerChoice):
if 'op_list' not in result:
result['op_list'] = [str(i) for i in mutable]
result[mutable.key] = 'op_list'
else:
result[mutable.key] = {'skip_connection':False if mutable.n_chosen else True,
'n_chosen': mutable.n_chosen if mutable.n_chosen else '',
'choose_from': mutable.choose_from if mutable.choose_from else ''}

dump_global_result(file_path,result)
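
# Illustrative shape of the dumped search_space.json; the keys below are
# hypothetical, the real ones are derived from the model's mutable scopes and
# mutable keys. LayerChoices map to the shared "op_list", InputChoices map to a
# dict describing the skip-connection decision.
#   {
#     "op_list": ["ConvBranch_3", "PoolBranch_avg", "..."],
#     "layer_0_LayerChoice1": "op_list",
#     "layer_0_InputChoice2": {"skip_connection": false, "n_chosen": 1,
#                              "choose_from": ["..."]}
#   }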

# def dump_global_result(args,global_result):
# with open(args['result_path'], "w") as ss_file:
# json.dump(global_result, ss_file, sort_keys=True, indent=2)

def dump_global_result(res_path,global_result, sort_keys = False):
with open(res_path, "w") as ss_file:
json.dump(global_result, ss_file, sort_keys=sort_keys, indent=2)



if __name__ == "__main__":
parser = ArgumentParser("enas")
parser.add_argument("--search_space_path", type=str,
default='./search_space.json', help="search_space directory")
parser.add_argument("--selected_space_path", type=str,
default='./selected_space.json', help="selected space output path")
parser.add_argument("--result_path", type=str,
default='./result.json', help="res directory")
parser.add_argument('--trial_id', type=int, default=0, metavar='N',
help='trial_id,start from 0')

parser.add_argument("--batch-size", default=128, type=int)
parser.add_argument("--log-frequency", default=10, type=int)
parser.add_argument("--search_for", choices=["macro", "micro"], default="macro")
parser.add_argument("--epochs", default=None, type=int, help="Number of epochs (default: macro 310, micro 150)")
args = parser.parse_args()

# set random seed
torch.manual_seed(args.trial_id)
torch.cuda.manual_seed_all(args.trial_id)
np.random.seed(args.trial_id)
random.seed(args.trial_id)

dataset_train, dataset_valid = datasets.get_dataset("cifar10")
if args.search_for == "macro":
model = GeneralNetwork()
num_epochs = args.epochs or 310
mutator = None
mutator = EnasMutator(model)
elif args.search_for == "micro":
model = MicroNetwork(num_layers=6, out_channels=20, num_nodes=5, dropout_rate=0.1, use_aux_heads=True)
num_epochs = args.epochs or 150
mutator = EnasMutator(model, tanh_constant=1.1, cell_exit_extra_step=True)
else:
raise AssertionError

# save the whole network structure
# args.search_spach_path = None#str(args.search_for) + str(args.search_space_path)
# print( args.search_space_path, args.search_for )
save_nas_search_space(mutator, args.search_space_path)

criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.SGD(model.parameters(), 0.05, momentum=0.9, weight_decay=1.0E-4)
lr_scheduler = torch.optim.lr_scheduler.CosineAnnealingLR(optimizer, T_max=num_epochs, eta_min=0.001)

trainer = EnasTrainer(model,
loss=criterion,
metrics=accuracy,
reward_function=reward_accuracy,
optimizer=optimizer,
callbacks=[LRSchedulerCallback(lr_scheduler)],
batch_size=args.batch_size,
num_epochs=num_epochs,
dataset_train=dataset_train,
dataset_valid=dataset_valid,
log_frequency=args.log_frequency,
mutator=mutator,
child_model_path='./'+args.search_for+'_child_model')

logger.info(trainer.metrics)

t1 = time.time()
trainer.train()
trainer.result["cost_time"] = time.time() - t1
dump_global_result(args.result_path,trainer.result)

selected_model = trainer.export_child_model(selected_space = True)
dump_global_result(args.selected_space_path,selected_model)

+ 18
- 0
dubhe-tadl/enas/selector.py View File

@@ -0,0 +1,18 @@
import sys
sys.path.append('../..')
from pytorch.selector import Selector

class EnasSelector(Selector):
def __init__(self, *args, single_candidate=True):
super().__init__(single_candidate)
self.args = args

def fit(self):
"""
only one candidate, so this function is a no-op
"""
pass

if __name__ == "__main__":
hpo_selector = EnasSelector()
hpo_selector.fit()

+ 436
- 0
dubhe-tadl/enas/trainer.py View File

@@ -0,0 +1,436 @@
from itertools import cycle
import os
import sys
sys.path.append('../..')
import numpy as np
import random
import logging
import time
from argparse import ArgumentParser
from collections import OrderedDict
import json
import torch
import torch.nn as nn
import torch.optim as optim


# import custom libraries
import datasets
from pytorch.trainer import Trainer
from pytorch.utils import AverageMeterGroup, to_device, mkdirs
from pytorch.mutables import LayerChoice, InputChoice, MutableScope
from macro import GeneralNetwork
from micro import MicroNetwork
# from trainer import EnasTrainer
from mutator import EnasMutator
from pytorch.callbacks import (ArchitectureCheckpoint,
LRSchedulerCallback)
from utils import accuracy, reward_accuracy

torch.cuda.set_device(0)

logging.basicConfig(format='%(asctime)s - %(filename)s[line:%(lineno)d] - %(levelname)s: %(message)s',
level=logging.INFO,
filename='./train.log',
filemode='a')
logger = logging.getLogger('enas_train')

class EnasTrainer(Trainer):
"""
ENAS trainer.

Parameters
----------
model : nn.Module
PyTorch model to be trained.
loss : callable
Receives logits and ground truth label, return a loss tensor.
metrics : callable
Receives logits and ground truth label, return a dict of metrics.
reward_function : callable
Receives logits and ground truth label, returns a tensor, which will be fed to the RL controller as reward.
optimizer : Optimizer
The optimizer used for optimizing the model.
num_epochs : int
Number of epochs planned for training.
dataset_train : Dataset
Dataset for training. Will be split for training weights and architecture weights.
dataset_valid : Dataset
Dataset for testing.
mutator : EnasMutator
Use when customizing your own mutator or a mutator with customized parameters.
batch_size : int
Batch size.
workers : int
Workers for data loading.
device : torch.device
``torch.device("cpu")`` or ``torch.device("cuda")``.
log_frequency : int
Step count per logging.
callbacks : list of Callback
list of callbacks to trigger at events.
entropy_weight : float
Weight of sample entropy loss.
skip_weight : float
Weight of skip penalty loss.
baseline_decay : float
Decay factor of baseline. New baseline will be equal to ``baseline_decay * baseline_old + reward * (1 - baseline_decay)``.
child_steps : int
How many mini-batches for model training per epoch.
mutator_lr : float
Learning rate for RL controller.
mutator_steps_aggregate : int
Number of steps that will be aggregated into one mini-batch for RL controller.
mutator_steps : int
Number of mini-batches for each epoch of RL controller learning.
aux_weight : float
Weight of auxiliary head loss. ``aux_weight * aux_loss`` will be added to total loss.
test_arc_per_epoch : int
How many architectures are chosen for direct test after each epoch.
"""
def __init__(self, model, loss, metrics, reward_function,
optimizer, num_epochs, dataset_train, dataset_valid,
mutator=None, batch_size=64, workers=4, device=None, log_frequency=None, callbacks=None,
entropy_weight=0.0001, skip_weight=0.8, baseline_decay=0.999, child_steps=500,
mutator_lr=0.00035, mutator_steps_aggregate=20, mutator_steps=50, aux_weight=0.4,
test_arc_per_epoch=1,child_model_path = './', result_path='./'):
super().__init__(model, mutator if mutator is not None else EnasMutator(model),
loss, metrics, optimizer, num_epochs, dataset_train, dataset_valid,
batch_size, workers, device, log_frequency, callbacks)
self.reward_function = reward_function
self.mutator_optim = optim.Adam(self.mutator.parameters(), lr=mutator_lr)
self.batch_size = batch_size
self.workers = workers

self.entropy_weight = entropy_weight
self.skip_weight = skip_weight
self.baseline_decay = baseline_decay
self.baseline = 0.
self.mutator_steps_aggregate = mutator_steps_aggregate
self.mutator_steps = mutator_steps
self.child_steps = child_steps
self.aux_weight = aux_weight
self.test_arc_per_epoch = test_arc_per_epoch
self.child_model_path = child_model_path # saving the child model
self.init_dataloader()
# self.result = {'accuracy':[],
# 'cost_time':0}
self.result_path = result_path
with open(self.result_path, "w") as file:
file.write('')

def init_dataloader(self):
n_train = len(self.dataset_train)
split = n_train // 10
indices = list(range(n_train))
train_sampler = torch.utils.data.sampler.SubsetRandomSampler(indices[:-split])
valid_sampler = torch.utils.data.sampler.SubsetRandomSampler(indices[-split:])
self.train_loader = torch.utils.data.DataLoader(self.dataset_train,
batch_size=self.batch_size,
sampler=train_sampler,
num_workers=self.workers)
self.valid_loader = torch.utils.data.DataLoader(self.dataset_train,
batch_size=self.batch_size,
sampler=valid_sampler,
num_workers=self.workers)
self.test_loader = torch.utils.data.DataLoader(self.dataset_valid,
batch_size=self.batch_size,
num_workers=self.workers)
self.train_loader = cycle(self.train_loader)
self.valid_loader = cycle(self.valid_loader)

def train_one_epoch(self, epoch):
# Sample model and train
self.model.train()
self.mutator.eval()
meters = AverageMeterGroup()
for step in range(1, self.child_steps + 1):
x, y = next(self.train_loader)
x, y = to_device(x, self.device), to_device(y, self.device)
self.optimizer.zero_grad()

with torch.no_grad():
self.mutator.reset()
# self._write_graph_status()
logits = self.model(x)

if isinstance(logits, tuple):
logits, aux_logits = logits
aux_loss = self.loss(aux_logits, y)
else:
aux_loss = 0.
metrics = self.metrics(logits, y)
loss = self.loss(logits, y)
loss = loss + self.aux_weight * aux_loss
loss.backward()
nn.utils.clip_grad_norm_(self.model.parameters(), 5.)
self.optimizer.step()
metrics["loss"] = loss.item()
meters.update(metrics)

if self.log_frequency is not None and step % self.log_frequency == 0:
logger.info("Model Epoch [%d/%d] Step [%d/%d] %s", epoch + 1,
self.num_epochs, step, self.child_steps, meters)

# Train sampler (mutator)
self.model.eval()
self.mutator.train()
meters = AverageMeterGroup()
for mutator_step in range(1, self.mutator_steps + 1):
self.mutator_optim.zero_grad()
for step in range(1, self.mutator_steps_aggregate + 1):
x, y = next(self.valid_loader)
x, y = to_device(x, self.device), to_device(y, self.device)

self.mutator.reset()
with torch.no_grad():
logits = self.model(x)
# self._write_graph_status()
metrics = self.metrics(logits, y)
reward = self.reward_function(logits, y)
if self.entropy_weight:
reward += self.entropy_weight * self.mutator.sample_entropy.item()
self.baseline = self.baseline * self.baseline_decay + reward * (1 - self.baseline_decay)
loss = self.mutator.sample_log_prob * (reward - self.baseline)
if self.skip_weight:
loss += self.skip_weight * self.mutator.sample_skip_penalty
metrics["reward"] = reward
metrics["loss"] = loss.item()
metrics["ent"] = self.mutator.sample_entropy.item()
metrics["log_prob"] = self.mutator.sample_log_prob.item()
metrics["baseline"] = self.baseline
metrics["skip"] = self.mutator.sample_skip_penalty

loss /= self.mutator_steps_aggregate
loss.backward()
meters.update(metrics)

cur_step = step + (mutator_step - 1) * self.mutator_steps_aggregate
if self.log_frequency is not None and cur_step % self.log_frequency == 0:
logger.info("RL Epoch [%d/%d] Step [%d/%d] [%d/%d] %s", epoch + 1, self.num_epochs,
mutator_step, self.mutator_steps, step, self.mutator_steps_aggregate,
meters)

nn.utils.clip_grad_norm_(self.mutator.parameters(), 5.)
self.mutator_optim.step()

def validate_one_epoch(self, epoch):
with torch.no_grad():
accuracy = 0
for arc_id in range(self.test_arc_per_epoch):
meters = AverageMeterGroup()
count, acc_this_round = 0,0
for x, y in self.test_loader:
x, y = to_device(x, self.device), to_device(y, self.device)
self.mutator.reset()
child_model = self.export_child_model()
# self._generate_child_model(epoch,
# count,
# arc_id,
# child_model,
# self.child_model_path)
logits = self.model(x)
if isinstance(logits, tuple):
logits, _ = logits
metrics = self.metrics(logits, y)
loss = self.loss(logits, y)
metrics["loss"] = loss.item()
meters.update(metrics)
count += 1
acc_this_round += metrics['acc1']

logger.info("Test Epoch [%d/%d] Arc [%d/%d] Summary %s",
epoch + 1, self.num_epochs, arc_id + 1, self.test_arc_per_epoch,
meters.summary())
acc_this_round /= count
accuracy += acc_this_round
# logger.info({"type": "Accuracy", "result": {"sequence": epoch, "category": "epoch", "value": meters.get_last_acc()}})
print({"type": "Accuracy", "result": {"sequence": epoch, "category": "epoch", "value": meters.get_last_acc()}})
with open(self.result_path, "a") as file:
file.write(str({"type": "Accuracy", "result": {"sequence": epoch, "category": "epoch",
"value": meters.get_last_acc()}}) + '\n')
# self.result['accuracy'].append(accuracy / self.test_arc_per_epoch)

# export child_model
def export_child_model(self, selected_space=False):
if selected_space:
sampled = self.mutator.sample_final()
else:
sampled = self.mutator._cache
result = OrderedDict()
cur_layer_id = None
for mutable in self.mutator.mutables:
if not isinstance(mutable, (LayerChoice, InputChoice)):
cur_layer_id = mutable.key
# not supported as built-in
continue
choosed_ops_idx = self.mutator._convert_mutable_decision_to_human_readable(mutable, sampled[mutable.key])
if not isinstance(choosed_ops_idx, list):
choosed_ops_idx = [choosed_ops_idx]
if isinstance(mutable, LayerChoice):
if 'op_list' not in result:
result['op_list'] = [str(i) for i in mutable]
choosed_ops = [str(mutable[idx]) for idx in choosed_ops_idx]
else:

choosed_ops = choosed_ops_idx
if 'node' in cur_layer_id:
result[mutable.key] = choosed_ops
else:
result[cur_layer_id + '_' + mutable.key] = choosed_ops

return result
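
# Illustrative shape of the exported dict (key and op names hypothetical): for a
# macro search, a LayerChoice under scope "layer_0" and an InputChoice export as
#   {"op_list": ["ConvBranch_3", "PoolBranch_avg"],
#    "layer_0_LayerChoice1": ["ConvBranch_3"],
#    "layer_0_InputChoice2": [0]}
# which is the format consumed by convert_selected_space_format in the retrain scripts.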

def _generate_child_model(self,
validation_epoch,
model_idx,
validation_step,
child_model,
file_path):

# create child_models folder
# parent_path = os.path.join(file_path, 'child_model')
parent_path = file_path
if not os.path.exists(parent_path):
os.mkdir(parent_path)

# create secondary directory
secondary_path = os.path.join(parent_path, 'validation_epoch_{}'.format(validation_epoch))
if not os.path.exists(secondary_path):
os.mkdir(secondary_path)

# create third directory
folder_path = os.path.join(secondary_path, 'validation_step_{}'.format(validation_step))
if not os.path.exists(folder_path):
os.mkdir(folder_path)

# save sampled child_model for validation
saved_path = os.path.join(folder_path, "child_model_%02d.json" % model_idx)

with open(saved_path, "w") as ss_file:
json.dump(child_model, ss_file, indent=2)

# save search space as search_space.json
def save_nas_search_space(mutator,file_path):
result = OrderedDict()
cur_layer_idx = None
for mutable in mutator.mutables.traverse():
if not isinstance(mutable,(LayerChoice, InputChoice)):
cur_layer_idx = mutable.key + '_'
continue
# macro
if 'layer' in cur_layer_idx:
if isinstance(mutable, LayerChoice):
if 'op_list' not in result:
result['op_list'] = [str(i) for i in mutable]
result[cur_layer_idx + mutable.key] = 'op_list'
else:
result[cur_layer_idx + mutable.key] = {'skip_connection': False if mutable.n_chosen else True,
'n_chosen': mutable.n_chosen if mutable.n_chosen else '',
'choose_from': mutable.choose_from if mutable.choose_from else ''}
# micro
elif 'node' in cur_layer_idx:
if isinstance(mutable,LayerChoice):
if 'op_list' not in result:
result['op_list'] = [str(i) for i in mutable]
result[mutable.key] = 'op_list'
else:
result[mutable.key] = {'skip_connection':False if mutable.n_chosen else True,
'n_chosen': mutable.n_chosen if mutable.n_chosen else '',
'choose_from': mutable.choose_from if mutable.choose_from else ''}

dump_global_result(file_path,result)

# def dump_global_result(args,global_result):
# with open(args['result_path'], "w") as ss_file:
# json.dump(global_result, ss_file, sort_keys=True, indent=2)

def dump_global_result(res_path,global_result, sort_keys = False):
with open(res_path, "w") as ss_file:
json.dump(global_result, ss_file, sort_keys=sort_keys, indent=2)


if __name__ == "__main__":
parser = ArgumentParser("enas")
parser.add_argument(
"--data_dir",
type=str,
default="./data",
help="Directory containing the dataset and embedding file. (default: %(default)s)")
parser.add_argument("--model_selected_space_path", type=str,
default='./model_selected_space.json', help="selected space output path")
parser.add_argument("--result_path", type=str,
default='./model_result.json', help="res directory")
parser.add_argument("--search_space_path", type=str,
default='./search_space.json', help="search_space directory")
parser.add_argument("--best_selected_space_path", type=str,
default='./model_selected_space.json', help="best selected space output path of the experiment")
parser.add_argument('--trial_id', type=int, default=0, metavar='N',
help='trial_id,start from 0')
parser.add_argument('--lr', type=float, default=0.005, metavar='N',
help='learning rate')
parser.add_argument("--epochs", default=None, type=int, help="Number of epochs (default: macro 310, micro 150)")
parser.add_argument("--batch_size", default=128, type=int)
parser.add_argument("--log_frequency", default=10, type=int)
parser.add_argument("--search_for", choices=["macro", "micro"], default="macro")
args = parser.parse_args()

mkdirs(args.result_path, args.search_space_path, args.best_selected_space_path)
# set random seed
torch.manual_seed(args.trial_id)
torch.cuda.manual_seed_all(args.trial_id)
np.random.seed(args.trial_id)
random.seed(args.trial_id)
# use deterministic instead of nondeterministic algorithms
# to make sure exact results can be reproduced every time.
torch.backends.cudnn.deterministic = True


dataset_train, dataset_valid = datasets.get_dataset("cifar10", args.data_dir)
if args.search_for == "macro":
model = GeneralNetwork()
num_epochs = args.epochs or 310
mutator = None
mutator = EnasMutator(model)
elif args.search_for == "micro":
model = MicroNetwork(num_layers=6, out_channels=20, num_nodes=5, dropout_rate=0.1, use_aux_heads=True)
num_epochs = args.epochs or 150
mutator = EnasMutator(model, tanh_constant=1.1, cell_exit_extra_step=True)
else:
raise AssertionError

# save the whole network structure
# args.search_spach_path = None#str(args.search_for) + str(args.search_space_path)
# print( args.search_space_path, args.search_for )
save_nas_search_space(mutator, args.search_space_path)

criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.SGD(model.parameters(), 0.05, momentum=0.9, weight_decay=1.0E-4)
lr_scheduler = torch.optim.lr_scheduler.CosineAnnealingLR(optimizer, T_max=num_epochs, eta_min=0.001)

trainer = EnasTrainer(model,
loss=criterion,
metrics=accuracy,
reward_function=reward_accuracy,
optimizer=optimizer,
callbacks=[LRSchedulerCallback(lr_scheduler), ArchitectureCheckpoint("./"+args.search_for+"_checkpoints")],
batch_size=args.batch_size,
num_epochs=num_epochs,
dataset_train=dataset_train,
dataset_valid=dataset_valid,
log_frequency=args.log_frequency,
mutator=mutator,
child_steps=2,
mutator_steps=2,
child_model_path='./'+args.search_for+'_child_model',
result_path=args.result_path)

logger.info(trainer.metrics)

t1 = time.time()
trainer.train()
# trainer.result["cost_time"] = time.time() - t1
# dump_global_result(args.result_path,trainer.result)

selected_model = trainer.export_child_model(selected_space = True)
dump_global_result(args.best_selected_space_path,selected_model)

+ 30
- 0
dubhe-tadl/enas/utils.py View File

@@ -0,0 +1,30 @@
# Copyright (c) Microsoft Corporation.
# Licensed under the MIT license.

import torch


def accuracy(output, target, topk=(1,)):
""" Computes the precision@k for the specified values of k """
maxk = max(topk)
batch_size = target.size(0)

_, pred = output.topk(maxk, 1, True, True)
pred = pred.t()
# one-hot case
if target.ndimension() > 1:
target = target.max(1)[1]

correct = pred.eq(target.view(1, -1).expand_as(pred))

res = dict()
for k in topk:
correct_k = correct[:k].view(-1).float().sum(0)
res["acc{}".format(k)] = correct_k.mul_(1.0 / batch_size).item()
return res


def reward_accuracy(output, target, topk=(1,)):
batch_size = target.size(0)
_, predicted = torch.max(output.data, 1)
return (predicted == target).sum().item() / batch_size
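
# Minimal usage sketch (illustrative values): the top-1 prediction is wrong only
# for the third sample, so both metrics report 0.75 on this batch.
#   logits = torch.tensor([[0.1, 0.9], [0.8, 0.2], [0.3, 0.7], [0.6, 0.4]])
#   target = torch.tensor([1, 0, 0, 0])
#   accuracy(logits, target)          # -> {"acc1": 0.75}
#   reward_accuracy(logits, target)   # -> 0.75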

+ 141
- 0
dubhe-tadl/fixed.py View File

@@ -0,0 +1,141 @@
# Copyright (c) Microsoft Corporation.
# Licensed under the MIT license.

import json
import logging

from .mutables import InputChoice, LayerChoice, MutableScope
from .mutator import Mutator
from .utils import to_list


_logger = logging.getLogger(__name__)
_logger.setLevel(logging.INFO)

class FixedArchitecture(Mutator):
"""
Fixed architecture mutator that always selects a certain graph.

Parameters
----------
model : nn.Module
A mutable network.
fixed_arc : dict
Preloaded architecture object.
strict : bool
Force everything that appears in ``fixed_arc`` to be used at least once.
"""

def __init__(self, model, fixed_arc, strict=True):
super().__init__(model)
self._fixed_arc = fixed_arc

mutable_keys = set([mutable.key for mutable in self.mutables if not isinstance(mutable, MutableScope)])
fixed_arc_keys = set(self._fixed_arc.keys())
if fixed_arc_keys - mutable_keys:
raise RuntimeError("Unexpected keys found in fixed architecture: {}.".format(fixed_arc_keys - mutable_keys))
if mutable_keys - fixed_arc_keys:
raise RuntimeError("Missing keys in fixed architecture: {}.".format(mutable_keys - fixed_arc_keys))
self._fixed_arc = self._from_human_readable_architecture(self._fixed_arc)

def _from_human_readable_architecture(self, human_arc):
# convert from an exported architecture
result_arc = {k: to_list(v) for k, v in human_arc.items()} # there could be tensors, numpy arrays, etc.
# First, convert non-list to list, because there could be {"op1": 0} or {"op1": "conv"},
# which means {"op1": [0, ]} or {"op1": ["conv", ]}
result_arc = {k: v if isinstance(v, list) else [v] for k, v in result_arc.items()}
# Second, infer which ones are multi-hot arrays and which ones are in human-readable format.
# This is non-trivial, since if an array in [0, 1], we cannot know for sure it means [false, true] or [true, true].
# Here, we assume a multi-hot array has to be a boolean or float array whose length matches the mutable.
for mutable in self.mutables:
if mutable.key not in result_arc:
continue # skip silently
choice_arr = result_arc[mutable.key]
if all(isinstance(v, bool) for v in choice_arr) or all(isinstance(v, float) for v in choice_arr):
if (isinstance(mutable, LayerChoice) and len(mutable) == len(choice_arr)) or \
(isinstance(mutable, InputChoice) and mutable.n_candidates == len(choice_arr)):
# multihot, do nothing
continue
if isinstance(mutable, LayerChoice):
choice_arr = [mutable.names.index(val) if isinstance(val, str) else val for val in choice_arr]
choice_arr = [i in choice_arr for i in range(len(mutable))]
elif isinstance(mutable, InputChoice):
choice_arr = [mutable.choose_from.index(val) if isinstance(val, str) else val for val in choice_arr]
choice_arr = [i in choice_arr for i in range(mutable.n_candidates)]
result_arc[mutable.key] = choice_arr
return result_arc
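
# Illustrative example of the conversion above (keys and op names hypothetical):
# given a LayerChoice "op1" with candidates ["conv3x3", "conv5x5"],
#   {"op1": "conv3x3"}   -> {"op1": [True, False]}   # human-readable name -> multi-hot
#   {"op1": 0}           -> {"op1": [True, False]}   # integer index -> multi-hot
#   {"op1": [1.0, 0.0]}  -> kept as-is (already a multi-hot float array of matching length)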

def sample_search(self):
"""
Always returns the fixed architecture.
"""
return self._fixed_arc

def sample_final(self):
"""
Always returns the fixed architecture.
"""
return self._fixed_arc

def replace_layer_choice(self, module=None, prefix=""):
"""
Replace layer choices with the selected candidates. This is done on a best-effort basis.
In the case of weighted or multiple choices, candidates whose weight is zero are deleted.
If there is a single choice, the module is replaced with a normal module.

Parameters
----------
module : nn.Module
Module to be processed.
prefix : str
Module name under global namespace.
"""
if module is None:
module = self.model
for name, mutable in module.named_children():
global_name = (prefix + "." if prefix else "") + name
if isinstance(mutable, LayerChoice):
chosen = self._fixed_arc[mutable.key]
if sum(chosen) == 1 and max(chosen) == 1 and not mutable.return_mask:
# sum is one, max is one, there has to be an only one
# this is compatible with both integer arrays, boolean arrays and float arrays
_logger.info("Replacing %s with candidate number %d.", global_name, chosen.index(1))
setattr(module, name, mutable[chosen.index(1)])
else:
if mutable.return_mask:
_logger.info("`return_mask` flag of %s is true. As it relies on the behavior of LayerChoice, " \
"LayerChoice will not be replaced.")
# remove unused parameters
for ch, n in zip(chosen, mutable.names):
if ch == 0 and not isinstance(ch, float):
setattr(mutable, n, None)
else:
self.replace_layer_choice(mutable, global_name)


def apply_fixed_architecture(model, fixed_arc):
"""
Load architecture from `fixed_arc` and apply to model.

Parameters
----------
model : torch.nn.Module
Model with mutables.
fixed_arc : str or dict
Path to the JSON that stores the architecture, or dict that stores the exported architecture.

Returns
-------
FixedArchitecture
Mutator that is responsible for fixing the graph.
"""

if isinstance(fixed_arc, str):
with open(fixed_arc) as f:
fixed_arc = json.load(f)
architecture = FixedArchitecture(model, fixed_arc)
architecture.reset()

# for the convenience of parameters counting
architecture.replace_layer_choice()
return architecture
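
# Minimal usage sketch (the model class and file name below are hypothetical):
#
#   model = MyNetworkWithMutables()
#   apply_fixed_architecture(model, "selected_space.json")
#   # every LayerChoice / InputChoice in `model` is now fixed to the choices in
#   # the JSON, and single-candidate LayerChoices are replaced by the chosen module.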

+ 79
- 0
dubhe-tadl/log.py View File

@@ -0,0 +1,79 @@
# -*- coding: utf-8 -*-
from datetime import datetime
from io import TextIOBase
import logging
from logging import FileHandler, Formatter, Handler, StreamHandler
from pathlib import Path
import sys
import time
from typing import Optional

time_format = '%Y-%m-%d %H:%M:%S'

formatter = Formatter(
'[%(asctime)s] %(levelname)s (%(name)s/%(threadName)s) %(message)s',
time_format
)

def init_logger() -> None:
_setup_root_logger(StreamHandler(sys.stdout), logging.INFO)

logging.basicConfig()


def _prepare_log_dir(path: Optional[str]) -> Path:
if path is None:
return Path()
ret = Path(path)
ret.mkdir(parents=True, exist_ok=True)
return ret

def _setup_root_logger(handler: Handler, level: int) -> None:
_setup_logger('tadl', handler, level)

def _setup_logger(name: str, handler: Handler, level: int) -> None:
handler.setFormatter(formatter)
logger = logging.getLogger(name)
logger.addHandler(handler)
logger.setLevel(level)
logger.propagate = False

class _LogFileWrapper(TextIOBase):
# wrap the logger file so that anything written to it will automatically get formatted

def __init__(self, log_file: TextIOBase):
self.file: TextIOBase = log_file
self.line_buffer: Optional[str] = None
self.line_start_time: Optional[datetime] = None

def write(self, s: str) -> int:
cur_time = datetime.now()
if self.line_buffer and (cur_time - self.line_start_time).total_seconds() > 0.1:
self.flush()

if self.line_buffer:
self.line_buffer += s
else:
self.line_buffer = s
self.line_start_time = cur_time

if '\n' not in s:
return len(s)

time_str = cur_time.strftime(time_format)
lines = self.line_buffer.split('\n')
for line in lines[:-1]:
self.file.write(f'[{time_str}] PRINT {line}\n')
self.file.flush()

self.line_buffer = lines[-1]
self.line_start_time = cur_time
return len(s)

def flush(self) -> None:
if self.line_buffer:
time_str = self.line_start_time.strftime(time_format)
self.file.write(f'[{time_str}] PRINT {self.line_buffer}\n')
self.file.flush()
self.line_buffer = None


+ 340
- 0
dubhe-tadl/mutables.py View File

@@ -0,0 +1,340 @@
# Copyright (c) Microsoft Corporation.
# Licensed under the MIT license.

import logging
import warnings
from collections import OrderedDict

import torch.nn as nn

from .utils import global_mutable_counting

logger = logging.getLogger(__name__)
logger.setLevel(logging.INFO)


class Mutable(nn.Module):
"""
Mutable is designed to function as a normal layer, with all necessary operators' weights.
States and weights of architectures should be included in mutator, instead of the layer itself.

Mutable has a key, which marks the identity of the mutable. This key can be used by users to share
decisions among different mutables. In mutator's implementation, mutators should use the key to
distinguish different mutables. Mutables that share the same key should be "similar" to each other.

Currently the default scope for keys is global. By default, keys use a global counter starting from 1 to
produce unique ids.

Parameters
----------
key : str
The key of mutable.

Notes
-----
The counter is program level, but mutables are model level. In case multiple models are defined, and
you want to have `counter` starting from 1 in the second model, it's recommended to assign keys manually
instead of using automatic keys.
"""

def __init__(self, key=None):
super().__init__()
if key is not None:
if not isinstance(key, str):
key = str(key)
logger.warning("Warning: key \"%s\" is not string, converted to string.", key)
self._key = key
else:
self._key = self.__class__.__name__ + str(global_mutable_counting())
self.init_hook = self.forward_hook = None

def __deepcopy__(self, memodict=None):
raise NotImplementedError("Deep copy doesn't work for mutables.")

def __call__(self, *args, **kwargs):
self._check_built()
return super().__call__(*args, **kwargs)

def set_mutator(self, mutator):
if "mutator" in self.__dict__:
raise RuntimeError("`set_mutator` is called more than once. Did you parse the search space multiple times? "
"Or did you apply multiple fixed architectures?")
self.__dict__["mutator"] = mutator

@property
def key(self):
"""
Read-only property of key.
"""
return self._key

@property
def name(self):
"""
After the search space is parsed, it will be the module name of the mutable.
"""
return self._name if hasattr(self, "_name") else self._key

@name.setter
def name(self, name):
self._name = name

def _check_built(self):
if not hasattr(self, "mutator"):
raise ValueError(
"Mutator not set for {}. You might have forgotten to initialize and apply your mutator. "
"Or did you initialize a mutable on the fly in forward pass? Move to `__init__` "
"so that trainer can locate all your mutables. See NNI docs for more details.".format(self))


class MutableScope(Mutable):
"""
Mutable scope marks a subgraph/submodule to help mutators make better decisions.

If not annotated with mutable scope, search space will be flattened as a list. However, some mutators might
need to leverage the concept of a "cell". So if a module is defined as a mutable scope, everything in it will
look like "sub-search-space" in the scope. Scopes can be nested.

There are two ways mutators can use mutable scope. One is to traverse the search space as a tree during initialization
and reset. The other is to implement `enter_mutable_scope` and `exit_mutable_scope`. They are called before and after
the forward method of the class inheriting mutable scope.

Mutable scopes are also mutables that are listed in the mutator.mutables (search space), but they are not supposed
to appear in the dict of choices.

Parameters
----------
key : str
Key of mutable scope.
"""
def __init__(self, key):
super().__init__(key=key)

def __call__(self, *args, **kwargs):
try:
self._check_built()
self.mutator.enter_mutable_scope(self)
return super().__call__(*args, **kwargs)
finally:
self.mutator.exit_mutable_scope(self)


class LayerChoice(Mutable):
"""
    Layer choice selects one of the ``op_candidates``, applies it to the inputs, and returns the result.
In rare cases, it can also select zero or many.

Layer choice does not allow itself to be nested.

Parameters
----------
op_candidates : list of nn.Module or OrderedDict
A module list to be selected from.
reduction : str
        ``mean``, ``concat``, ``sum`` or ``none``. The reduction policy when multiple candidates are selected.
        If ``none``, a list is returned. ``mean`` returns the average. ``sum`` returns the sum.
        ``concat`` concatenates the list along dimension 1.
return_mask : bool
If ``return_mask``, return output tensor and a mask. Otherwise return tensor only.
key : str
Key of the input choice.

Attributes
----------
length : int
Deprecated. Number of ops to choose from. ``len(layer_choice)`` is recommended.
names : list of str
Names of candidates.
choices : list of Module
Deprecated. A list of all candidate modules in the layer choice module.
``list(layer_choice)`` is recommended, which will serve the same purpose.

Notes
-----
    ``op_candidates`` can be a list of modules or an ordered dict of named modules, for example,

.. code-block:: python

self.op_choice = LayerChoice(OrderedDict([
("conv3x3", nn.Conv2d(3, 16, 128)),
("conv5x5", nn.Conv2d(5, 16, 128)),
("conv7x7", nn.Conv2d(7, 16, 128))
]))

Elements in layer choice can be modified or deleted. Use ``del self.op_choice["conv5x5"]`` or
``self.op_choice[1] = nn.Conv3d(...)``. Adding more choices is not supported yet.
"""

def __init__(self, op_candidates, reduction="sum", return_mask=False, key=None):
super().__init__(key=key)
self.names = []
if isinstance(op_candidates, OrderedDict):
for name, module in op_candidates.items():
assert name not in ["length", "reduction", "return_mask", "_key", "key", "names"], \
"Please don't use a reserved name '{}' for your module.".format(name)
self.add_module(name, module)
self.names.append(name)
elif isinstance(op_candidates, list):
for i, module in enumerate(op_candidates):
self.add_module(str(i), module)
self.names.append(str(i))
else:
raise TypeError("Unsupported op_candidates type: {}".format(type(op_candidates)))
self.reduction = reduction
self.return_mask = return_mask

def __getitem__(self, idx):
if isinstance(idx, str):
return self._modules[idx]
return list(self)[idx]

def __setitem__(self, idx, module):
key = idx if isinstance(idx, str) else self.names[idx]
return setattr(self, key, module)

def __delitem__(self, idx):
if isinstance(idx, slice):
for key in self.names[idx]:
delattr(self, key)
else:
if isinstance(idx, str):
key, idx = idx, self.names.index(idx)
else:
key = self.names[idx]
delattr(self, key)
del self.names[idx]

@property
def length(self):
warnings.warn("layer_choice.length is deprecated. Use `len(layer_choice)` instead.", DeprecationWarning)
return len(self)

def __len__(self):
return len(self.names)

def __iter__(self):
return map(lambda name: self._modules[name], self.names)

@property
def choices(self):
warnings.warn("layer_choice.choices is deprecated. Use `list(layer_choice)` instead.", DeprecationWarning)
return list(self)

def forward(self, *args, **kwargs):
"""
Returns
-------
tuple of tensors
Output and selection mask. If ``return_mask`` is ``False``, only output is returned.
"""
out, mask = self.mutator.on_forward_layer_choice(self, *args, **kwargs)
if self.return_mask:
return out, mask
return out


class InputChoice(Mutable):
"""
    Input choice selects ``n_chosen`` inputs from ``choose_from`` (which contains ``n_candidates`` keys). For beginners,
    using ``n_candidates`` instead of ``choose_from`` is a safe option. To get the most power out of it, you might want to
know about ``choose_from``.

The keys in ``choose_from`` can be keys that appear in past mutables, or ``NO_KEY`` if there are no suitable ones.
The keys are designed to be the keys of the sources. To help mutators make better decisions,
mutators might be interested in how the tensors to choose from come into place. For example, the tensor is the
output of some operator, some node, some cell, or some module. If this operator happens to be a mutable (e.g.,
``LayerChoice`` or ``InputChoice``), it has a key naturally that can be used as a source key. If it's a
module/submodule, it needs to be annotated with a key: that's where a :class:`MutableScope` is needed.

    In the example below, ``input_choice`` is a 4-choose-any. The first three are semantically the output of cell1,
    the output of cell2, and the output of op, respectively. Notice that an extra max pooling follows cell1,
    so x1 is not "actually" the direct output of cell1.

.. code-block:: python

class Cell(MutableScope):
pass

class Net(nn.Module):
def __init__(self):
self.cell1 = Cell("cell1")
self.cell2 = Cell("cell2")
self.op = LayerChoice([conv3x3(), conv5x5()], key="op")
self.input_choice = InputChoice(choose_from=["cell1", "cell2", "op", InputChoice.NO_KEY])

def forward(self, x):
x1 = max_pooling(self.cell1(x))
x2 = self.cell2(x)
x3 = self.op(x)
x4 = torch.zeros_like(x)
return self.input_choice([x1, x2, x3, x4])

Parameters
----------
n_candidates : int
Number of inputs to choose from.
choose_from : list of str
        List of source keys to choose from. At least one of ``choose_from`` and ``n_candidates`` must be provided.
        If ``n_candidates`` has a value but ``choose_from`` is None, it will be automatically treated as ``n_candidates``
        empty strings.
n_chosen : int
        Recommended number of inputs to choose. If None, the mutator is instructed to select any number.
reduction : str
``mean``, ``concat``, ``sum`` or ``none``. See :class:`LayerChoice`.
return_mask : bool
If ``return_mask``, return output tensor and a mask. Otherwise return tensor only.
key : str
Key of the input choice.
"""

NO_KEY = ""

def __init__(self, n_candidates=None, choose_from=None, n_chosen=None,
reduction="sum", return_mask=False, key=None):
super().__init__(key=key)
# precondition check
assert n_candidates is not None or choose_from is not None, "At least one of `n_candidates` and `choose_from`" \
"must be not None."
if choose_from is not None and n_candidates is None:
n_candidates = len(choose_from)
elif choose_from is None and n_candidates is not None:
choose_from = [self.NO_KEY] * n_candidates
assert n_candidates == len(choose_from), "Number of candidates must be equal to the length of `choose_from`."
assert n_candidates > 0, "Number of candidates must be greater than 0."
assert n_chosen is None or 0 <= n_chosen <= n_candidates, "Expected selected number must be None or no more " \
"than number of candidates."

self.n_candidates = n_candidates
self.choose_from = choose_from.copy()
self.n_chosen = n_chosen
self.reduction = reduction
self.return_mask = return_mask

def forward(self, optional_inputs):
"""
        Forward method of InputChoice.

Parameters
----------
optional_inputs : list or dict
Recommended to be a dict. As a dict, inputs will be converted to a list that follows the order of
``choose_from`` in initialization. As a list, inputs must follow the semantic order that is the same as
``choose_from``.

Returns
-------
tuple of tensors
Output and selection mask. If ``return_mask`` is ``False``, only output is returned.
"""
optional_input_list = optional_inputs
if isinstance(optional_inputs, dict):
optional_input_list = [optional_inputs[tag] for tag in self.choose_from]
assert isinstance(optional_input_list, list), \
"Optional input list must be a list, not a {}.".format(type(optional_input_list))
assert len(optional_inputs) == self.n_candidates, \
"Length of the input list must be equal to number of candidates."
out, mask = self.mutator.on_forward_input_choice(self, optional_input_list)
if self.return_mask:
return out, mask
return out


+ 309
- 0
dubhe-tadl/mutator.py View File

@@ -0,0 +1,309 @@
# Copyright (c) Microsoft Corporation.
# Licensed under the MIT license.

import logging
from collections import defaultdict

import numpy as np
import torch

from .base_mutator import BaseMutator
from .mutables import LayerChoice, InputChoice
from .utils import to_list

logger = logging.getLogger(__name__)
logger.setLevel(logging.INFO)


class Mutator(BaseMutator):
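    """
    A mutator samples architecture decisions for every mutable in ``model``. Subclasses implement
    ``sample_search`` (decisions used during search) and ``sample_final`` (decisions used for export
    and retraining). A minimal sketch of a subclass that samples uniformly at random (illustrative
    only):

    .. code-block:: python

        class RandomMutator(Mutator):
            def sample_search(self):
                result = dict()
                for mutable in self.mutables:
                    if isinstance(mutable, LayerChoice):
                        # pick exactly one op candidate
                        index = torch.randint(high=len(mutable), size=(1,)).item()
                        result[mutable.key] = torch.tensor(
                            [i == index for i in range(len(mutable))], dtype=torch.bool)
                    elif isinstance(mutable, InputChoice):
                        # pick n_chosen inputs (default to 1 when unconstrained)
                        n_chosen = mutable.n_chosen or 1
                        chosen = torch.randperm(mutable.n_candidates)[:n_chosen].tolist()
                        result[mutable.key] = torch.tensor(
                            [i in chosen for i in range(mutable.n_candidates)], dtype=torch.bool)
                return result

            def sample_final(self):
                return self.sample_search()
    """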

def __init__(self, model):
super().__init__(model)
self._cache = dict()
self._connect_all = False

def sample_search(self):
"""
Override to implement this method to iterate over mutables and make decisions.

Returns
-------
dict
A mapping from key of mutables to decisions.
"""
raise NotImplementedError

def sample_final(self):
"""
        Override to implement this method to iterate over mutables and make decisions that are final
for export and retraining.

Returns
-------
dict
A mapping from key of mutables to decisions.
"""
raise NotImplementedError

def reset(self):
"""
        Reset the mutator by calling `sample_search` to resample (for search). The result is stored in a local
variable so that `on_forward_layer_choice` and `on_forward_input_choice` can use the decision directly.
"""
self._cache = self.sample_search()

def export(self):
"""
Resample (for final) and return results.

Returns
-------
dict
A mapping from key of mutables to decisions.
"""
sampled = self.sample_final()
result = dict()
for mutable in self.mutables:
if not isinstance(mutable, (LayerChoice, InputChoice)):
# not supported as built-in
continue
result[mutable.key] = self._convert_mutable_decision_to_human_readable(mutable, sampled.pop(mutable.key))
if sampled:
            raise ValueError("Unexpected keys returned from 'sample_final()': %s" % list(sampled.keys()))
return result

def status(self):
"""
Return current selection status of mutator.

Returns
-------
dict
            A mapping from key of mutables to decisions. All weights (boolean type and float type)
            are converted into real number values. Numpy arrays and tensors are converted into lists.
"""
data = dict()
for k, v in self._cache.items():
if torch.is_tensor(v):
v = v.detach().cpu().numpy()
if isinstance(v, np.ndarray):
v = v.astype(np.float32).tolist()
data[k] = v
return data

def graph(self, inputs):
"""
Return model supernet graph.

Parameters
----------
inputs: tuple of tensor
            Inputs that will be fed into the network.

Returns
-------
dict
Containing ``node``, in Tensorboard GraphDef format.
Additional key ``mutable`` is a map from key to list of modules.
"""
if not torch.__version__.startswith("1.4"):
logger.warning("Graph is only tested with PyTorch 1.4. Other versions might not work.")
from nni._graph_utils import build_graph
from google.protobuf import json_format
# protobuf should be installed as long as tensorboard is installed
try:
self._connect_all = True
graph_def, _ = build_graph(self.model, inputs, verbose=False)
result = json_format.MessageToDict(graph_def)
finally:
self._connect_all = False

# `mutable` is to map the keys to a list of corresponding modules.
# A key can be linked to multiple modules, use `dedup=False` to find them all.
result["mutable"] = defaultdict(list)
for mutable in self.mutables.traverse(deduplicate=False):
            # A module will be represented in the format of
            # [{"type": "Net", "name": ""}, {"type": "Cell", "name": "cell1"}, {"type": "Conv2d", "name": "conv"}]
            # which will be concatenated into Net/Cell[cell1]/Conv2d[conv] in the frontend.
# This format is aligned with the scope name jit gives.
modules = mutable.name.split(".")
path = [
{"type": self.model.__class__.__name__, "name": ""}
]
m = self.model
for module in modules:
m = getattr(m, module)
path.append({
"type": m.__class__.__name__,
"name": module
})
result["mutable"][mutable.key].append(path)
return result

def on_forward_layer_choice(self, mutable, *args, **kwargs):
"""
        By default, this method retrieves the decision obtained previously and selects certain operations.
Only operations with non-zero weight will be executed. The results will be added to a list.
Then it will reduce the list of all tensor outputs with the policy specified in `mutable.reduction`.

Parameters
----------
mutable : LayerChoice
Layer choice module.
args : list of torch.Tensor
Inputs
kwargs : dict
Inputs

Returns
-------
tuple of torch.Tensor and torch.Tensor
Output and mask.
"""
if self._connect_all:
return self._all_connect_tensor_reduction(mutable.reduction,
[op(*args, **kwargs) for op in mutable]), \
torch.ones(len(mutable)).bool()

def _map_fn(op, args, kwargs):
return op(*args, **kwargs)

mask = self._get_decision(mutable)
assert len(mask) == len(mutable), \
"Invalid mask, expected {} to be of length {}.".format(mask, len(mutable))
out, mask = self._select_with_mask(_map_fn, [(choice, args, kwargs) for choice in mutable], mask)
return self._tensor_reduction(mutable.reduction, out), mask

def on_forward_input_choice(self, mutable, tensor_list):
"""
        By default, this method retrieves the decision obtained previously and selects certain tensors.
Then it will reduce the list of all tensor outputs with the policy specified in `mutable.reduction`.

Parameters
----------
mutable : InputChoice
Input choice module.
tensor_list : list of torch.Tensor
Tensor list to apply the decision on.

Returns
-------
tuple of torch.Tensor and torch.Tensor
Output and mask.
"""
if self._connect_all:
return self._all_connect_tensor_reduction(mutable.reduction, tensor_list), \
torch.ones(mutable.n_candidates).bool()
mask = self._get_decision(mutable)
assert len(mask) == mutable.n_candidates, \
"Invalid mask, expected {} to be of length {}.".format(mask, mutable.n_candidates)
out, mask = self._select_with_mask(lambda x: x, [(t,) for t in tensor_list], mask)
return self._tensor_reduction(mutable.reduction, out), mask

def _select_with_mask(self, map_fn, candidates, mask):
"""
Select masked tensors and return a list of tensors.

Parameters
----------
map_fn : function
Convert candidates to target candidates. Can be simply identity.
candidates : list of torch.Tensor
Tensor list to apply the decision on.
mask : list-like object
            Can be a list, a numpy array or a tensor (recommended). Needs to
have the same length as ``candidates``.

Returns
-------
tuple of list of torch.Tensor and torch.Tensor
Output and mask.
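
        Notes
        -----
        For example (illustrative): with ``mask = [True, False, True]``, only the first and third
        candidates are evaluated and returned unchanged; with ``mask = [0.3, 0.0, 0.7]``, the first
        and third candidates are evaluated and scaled by 0.3 and 0.7 respectively, while entries
        with zero weight are skipped entirely.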
"""
if (isinstance(mask, list) and len(mask) >= 1 and isinstance(mask[0], bool)) or \
                (isinstance(mask, np.ndarray) and mask.dtype == np.bool_) or \
"BoolTensor" in mask.type():
out = [map_fn(*cand) for cand, m in zip(candidates, mask) if m]
elif (isinstance(mask, list) and len(mask) >= 1 and isinstance(mask[0], (float, int))) or \
(isinstance(mask, np.ndarray) and mask.dtype in (np.float32, np.float64, np.int32, np.int64)) or \
"FloatTensor" in mask.type():
out = [map_fn(*cand) * m for cand, m in zip(candidates, mask) if m]
else:
raise ValueError("Unrecognized mask '%s'" % mask)
if not torch.is_tensor(mask):
mask = torch.tensor(mask) # pylint: disable=not-callable
return out, mask

def _tensor_reduction(self, reduction_type, tensor_list):
if reduction_type == "none":
return tensor_list
if not tensor_list:
return None # empty. return None for now
if len(tensor_list) == 1:
return tensor_list[0]
if reduction_type == "sum":
return sum(tensor_list)
if reduction_type == "mean":
return sum(tensor_list) / len(tensor_list)
if reduction_type == "concat":
return torch.cat(tensor_list, dim=1)
raise ValueError("Unrecognized reduction policy: \"{}\"".format(reduction_type))

def _all_connect_tensor_reduction(self, reduction_type, tensor_list):
if reduction_type == "none":
return tensor_list
if reduction_type == "concat":
return torch.cat(tensor_list, dim=1)
return torch.stack(tensor_list).sum(0)

def _get_decision(self, mutable):
"""
By default, this method checks whether `mutable.key` is already in the decision cache,
        and returns the cached result without further checks.

Parameters
----------
mutable : Mutable

Returns
-------
object
"""
if mutable.key not in self._cache:
raise ValueError("\"{}\" not found in decision cache.".format(mutable.key))
result = self._cache[mutable.key]
logger.debug("Decision %s: %s", mutable.key, result)
return result

def _convert_mutable_decision_to_human_readable(self, mutable, sampled):
# Assert the existence of mutable.key in returned architecture.
# Also check if there is anything extra.
multihot_list = to_list(sampled)
converted = None
# If it's a boolean array, we can do optimization.
if all([t == 0 or t == 1 for t in multihot_list]):
if isinstance(mutable, LayerChoice):
assert len(multihot_list) == len(mutable), \
"Results returned from 'sample_final()' (%s: %s) either too short or too long." \
% (mutable.key, multihot_list)
# check if all modules have different names and they indeed have names
if len(set(mutable.names)) == len(mutable) and not all(d.isdigit() for d in mutable.names):
converted = [name for i, name in enumerate(mutable.names) if multihot_list[i]]
else:
converted = [i for i in range(len(multihot_list)) if multihot_list[i]]
if isinstance(mutable, InputChoice):
assert len(multihot_list) == mutable.n_candidates, \
"Results returned from 'sample_final()' (%s: %s) either too short or too long." \
% (mutable.key, multihot_list)
# check if all input candidates have different names
if len(set(mutable.choose_from)) == mutable.n_candidates:
converted = [name for i, name in enumerate(mutable.choose_from) if multihot_list[i]]
else:
converted = [i for i in range(len(multihot_list)) if multihot_list[i]]
if converted is not None:
# if only one element, then remove the bracket
if len(converted) == 1:
converted = converted[0]
else:
# do nothing
converted = multihot_list
return converted

+ 47
- 0
dubhe-tadl/network_morphism/README.md View File

@@ -0,0 +1,47 @@
# Network Morphism
The implementation of the Network Morphism algorithm is based on
[Auto-Keras: An Efficient Neural Architecture Search System](https://arxiv.org/pdf/1806.10282.pdf)

Train stage
```
python network_morphism_train.py
--trial_id 0
--experiment_dir 'tadl'
--data_dir '../data/'
--result_path 'trial_id/result.json'
--log_path 'trial_id/log'
--search_space_path 'experiment_id/search_space.json'
--best_selected_space_path 'experiment_id/best_selected_space.json'
--lr 0.001 --epochs 100 --batch_size 32 --opt 'SGD'
```

Select stage
```
python network_morphism_select.py
```

Retrain stage
```
python network_morphism_retrain.py
--data_dir '../data/'
--experiment_dir 'tadl'
--result_path 'trial_id/result.json'
--log_path 'trial_id/log'
--best_selected_space_path 'experiment_id/best_selected_space.json'
--best_checkpoint_dir 'experiment_id/'
--trial_id 0 --batch_size 32 --opt 'SGD' --epochs 100 --lr 0.001
```

The best searched model achieved 88.1% accuracy on the CIFAR-10 dataset after 100 trials.

Dependencies:
```
Python = 3.6.13
pytorch = 1.8.0
torchvision = 0.9.0
scipy = 1.5.2
scikit-learn = 0.24.1
```

+ 517
- 0
dubhe-tadl/network_morphism/algorithm/bayesian.py View File

@@ -0,0 +1,517 @@

import math
import random
from copy import deepcopy
from functools import total_ordering
from queue import PriorityQueue

import numpy as np
from scipy.linalg import LinAlgError, cho_solve, cholesky, solve_triangular
from scipy.optimize import linear_sum_assignment
from sklearn.metrics.pairwise import rbf_kernel

from .graph_transformer import transform
from .layers import is_layer
from utils import Constant, OptimizeMode
import logging

logger = logging.getLogger(__name__)

# equation(6) dl
def layer_distance(a, b):
"""The distance between two layers."""
# pylint: disable=unidiomatic-typecheck
if not isinstance(a, type(b)):
return 1.0
if is_layer(a, "Conv"):
att_diff = [
(a.filters, b.filters),
(a.kernel_size, b.kernel_size),
(a.stride, b.stride),
]
return attribute_difference(att_diff)
if is_layer(a, "Pooling"):
att_diff = [
(a.padding, b.padding),
(a.kernel_size, b.kernel_size),
(a.stride, b.stride),
]
return attribute_difference(att_diff)
return 0.0

# equation(6)
def attribute_difference(att_diff):
''' The attribute distance.
'''

ret = 0
for a_value, b_value in att_diff:
if max(a_value, b_value) == 0:
ret += 0
else:
ret += abs(a_value - b_value) * 1.0 / max(a_value, b_value)
return ret * 1.0 / len(att_diff)

# equation(7) A
def layers_distance(list_a, list_b):
"""The distance between the layers of two neural networks."""
len_a = len(list_a)
len_b = len(list_b)
f = np.zeros((len_a + 1, len_b + 1))
f[-1][-1] = 0
for i in range(-1, len_a):
f[i][-1] = i + 1
for j in range(-1, len_b):
f[-1][j] = j + 1
for i in range(len_a):
for j in range(len_b):
f[i][j] = min(
f[i][j - 1] + 1,
f[i - 1][j] + 1,
f[i - 1][j - 1] + layer_distance(list_a[i], list_b[j]),
)
return f[len_a - 1][len_b - 1]

# equation (9) ds
# 0: topo rank of the start, 1: rank of the end
def skip_connection_distance(a, b):
"""The distance between two skip-connections."""
if a[2] != b[2]:
return 1.0
len_a = abs(a[1] - a[0])
len_b = abs(b[1] - b[0])
return (abs(a[0] - b[0]) + abs(len_a - len_b)) / \
(max(a[0], b[0]) + max(len_a, len_b))

# equation (8) Ds
# convert equation (8) minimization part into a bipartite graph matching problem and solved by hungarian algorithm(linear_sum_assignment)
def skip_connections_distance(list_a, list_b):
"""The distance between the skip-connections of two neural networks."""
distance_matrix = np.zeros((len(list_a), len(list_b)))
for i, a in enumerate(list_a):
for j, b in enumerate(list_b):
distance_matrix[i][j] = skip_connection_distance(a, b)
return distance_matrix[linear_sum_assignment(distance_matrix)].sum() + abs(
len(list_a) - len(list_b)
)

# equation (4)
def edit_distance(x, y):
"""The distance between two neural networks.
Args:
x: An instance of NetworkDescriptor.
y: An instance of NetworkDescriptor
Returns:
The edit-distance between x and y.
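        Computed as layers_distance(x, y) + Constant.KERNEL_LAMBDA * skip_connections_distance(x, y),
        which corresponds to equation (4) in the Auto-Keras paper.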
"""

ret = layers_distance(x.layers, y.layers)
ret += Constant.KERNEL_LAMBDA * skip_connections_distance(
x.skip_connections, y.skip_connections
)
return ret


class IncrementalGaussianProcess:
"""Gaussian process regressor.
Attributes:
alpha: A hyperparameter.
"""

def __init__(self):
self.alpha = 1e-10
self._distance_matrix = None
self._x = None
self._y = None
self._first_fitted = False
self._l_matrix = None
self._alpha_vector = None

@property
def kernel_matrix(self):
        ''' Kernel matrix.
'''
return self._distance_matrix

def fit(self, train_x, train_y):
""" Fit the regressor with more data.
Args:
train_x: A list of NetworkDescriptor.
train_y: A list of metric values.
"""
if self.first_fitted:
self.incremental_fit(train_x, train_y)
else:
self.first_fit(train_x, train_y)

# compute the kernel matrix k, alpha_vector
# 和first fit区别就是需要加入新的训练样本扩充distance matrix
def incremental_fit(self, train_x, train_y):
""" Incrementally fit the regressor. """
if not self._first_fitted:
raise ValueError(
"The first_fit function needs to be called first.")

train_x, train_y = np.array(train_x), np.array(train_y)

# Incrementally compute K
up_right_k = edit_distance_matrix(self._x, train_x)
down_left_k = np.transpose(up_right_k)
down_right_k = edit_distance_matrix(train_x)
up_k = np.concatenate((self._distance_matrix, up_right_k), axis=1)
down_k = np.concatenate((down_left_k, down_right_k), axis=1)
temp_distance_matrix = np.concatenate((up_k, down_k), axis=0)

k_matrix = bourgain_embedding_matrix(temp_distance_matrix)

diagonal = np.diag_indices_from(k_matrix)
diagonal = (diagonal[0][-len(train_x):], diagonal[1][-len(train_x):])
k_matrix[diagonal] += self.alpha
try:
self._l_matrix = cholesky(k_matrix, lower=True) # Line 2
except LinAlgError as err:
            logger.error('LinAlgError: %s', err)
return self

self._x = np.concatenate((self._x, train_x), axis=0)
self._y = np.concatenate((self._y, train_y), axis=0)
self._distance_matrix = temp_distance_matrix
self._alpha_vector = cho_solve(
(self._l_matrix, True), self._y) # Line 3

return self

@property
def first_fitted(self):
        ''' Whether the regressor has been fitted for the first time.
'''
return self._first_fitted

    # Update procedure: the first fit.
def first_fit(self, train_x, train_y):
""" Fit the regressor for the first time. """
train_x, train_y = np.array(train_x), np.array(train_y)

self._x = np.copy(train_x)
self._y = np.copy(train_y)

self._distance_matrix = edit_distance_matrix(self._x)
k_matrix = bourgain_embedding_matrix(self._distance_matrix)
k_matrix[np.diag_indices_from(k_matrix)] += self.alpha

self._l_matrix = cholesky(k_matrix, lower=True) # Line 2

# cho_solve Ax = b return x = A^{-1}b
self._alpha_vector = cho_solve(
(self._l_matrix, True), self._y) # Line 3

self._first_fitted = True
return self
    # Obtain the mean & std of the predictive distribution.
def predict(self, train_x):
"""Predict the result.
Args:
train_x: A list of NetworkDescriptor.
Returns:
y_mean: The predicted mean.
y_std: The predicted standard deviation.
"""
k_trans = np.exp(-np.power(edit_distance_matrix(train_x, self._x), 2))
y_mean = k_trans.dot(self._alpha_vector) # Line 4 (y_mean = f_star)

# compute inverse K_inv of K based on its Cholesky
# decomposition L and its inverse L_inv
l_inv = solve_triangular(
self._l_matrix.T, np.eye(
self._l_matrix.shape[0]))
k_inv = l_inv.dot(l_inv.T)
# Compute variance of predictive distribution
        y_var = np.ones(len(train_x), dtype=float)
y_var -= np.einsum("ij,ij->i", np.dot(k_trans, k_inv), k_trans)

# Check if any of the variances is negative because of
# numerical issues. If yes: set the variance to 0.
y_var_negative = y_var < 0
if np.any(y_var_negative):
y_var[y_var_negative] = 0.0
return y_mean, np.sqrt(y_var)


def edit_distance_matrix(train_x, train_y=None):
"""Calculate the edit distance.
Args:
train_x: A list of neural architectures.
train_y: A list of neural architectures.
Returns:
An edit-distance matrix.
"""
if train_y is None:
ret = np.zeros((train_x.shape[0], train_x.shape[0]))
for x_index, x in enumerate(train_x):
for y_index, y in enumerate(train_x):
if x_index == y_index:
ret[x_index][y_index] = 0
elif x_index < y_index:
ret[x_index][y_index] = edit_distance(x, y)
else:
ret[x_index][y_index] = ret[y_index][x_index]
return ret
ret = np.zeros((train_x.shape[0], train_y.shape[0]))
for x_index, x in enumerate(train_x):
for y_index, y in enumerate(train_y):
ret[x_index][y_index] = edit_distance(x, y)
return ret


def vector_distance(a, b):
"""The Euclidean distance between two vectors."""
a = np.array(a)
b = np.array(b)
return np.linalg.norm(a - b)

# Mapping from the edit-distance matrix space into Euclidean space.
def bourgain_embedding_matrix(distance_matrix):
"""Use Bourgain algorithm to embed the neural architectures based on their edit-distance.
Args:
distance_matrix: A matrix of edit-distances.
Returns:
A matrix of distances after embedding.
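        The embedded vectors are passed through an RBF kernel, so the returned matrix can be used
        directly as the kernel (Gram) matrix of the Gaussian process.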
"""
distance_matrix = np.array(distance_matrix)
n = len(distance_matrix)
if n == 1:
return distance_matrix
np.random.seed(123)
distort_elements = []
r = range(n)
k = int(math.ceil(math.log(n) / math.log(2) - 1))
t = int(math.ceil(math.log(n)))
counter = 0
for i in range(0, k + 1):
        for t_index in range(t):
            s = np.random.choice(r, 2 ** i)
            for j in r:
                d = min([distance_matrix[j][s_index] for s_index in s])
                counter += len(s)
                if i == 0 and t_index == 0:
distort_elements.append([d])
else:
distort_elements[j].append(d)
return rbf_kernel(distort_elements, distort_elements)


class BayesianOptimizer:
""" A Bayesian optimizer for neural architectures.
Attributes:
searcher: The Searcher which is calling the Bayesian optimizer.
t_min: The minimum temperature for simulated annealing.
metric: An instance of the Metric subclasses.
gpr: A GaussianProcessRegressor for bayesian optimization.
beta: The beta in acquisition function. (refer to our paper)
search_tree: The network morphism search tree.
"""

def __init__(self, searcher, t_min, optimizemode, beta=None):
self.searcher = searcher
self.t_min = t_min
self.optimizemode = optimizemode
self.gpr = IncrementalGaussianProcess()
self.beta = beta if beta is not None else Constant.BETA
self.search_tree = SearchTree()

def fit(self, x_queue, y_queue):
""" Fit the optimizer with new architectures and performances.
Args:
x_queue: A list of NetworkDescriptor.
y_queue: A list of metric values.
"""
self.gpr.fit(x_queue, y_queue)

# Algorithm 1
# optimize acquisition function
def generate(self, descriptors):
"""Generate new architecture.
Args:
descriptors: All the searched neural architectures. (search history)
Returns:
graph: An instance of Graph. A morphed neural network with weights.
father_id: The father node ID in the search tree.
"""
model_ids = self.search_tree.adj_list.keys()

target_graph = None
father_id = None
descriptors = deepcopy(descriptors)
elem_class = Elem
if self.optimizemode is OptimizeMode.Maximize:
elem_class = ReverseElem

        '''
        1. Initialize the priority queue.
        2. The elements of the priority queue are all previously generated models.
        '''
pq = PriorityQueue()
temp_list = []
for model_id in model_ids:
metric_value = self.searcher.get_metric_value_by_id(model_id)
temp_list.append((metric_value, model_id))
temp_list = sorted(temp_list)
for metric_value, model_id in temp_list:
graph = self.searcher.load_model_by_id(model_id)
graph.clear_operation_history()
graph.clear_weights()
            # For models that have already been generated, father_id is the model's own id.
pq.put(elem_class(metric_value, model_id, graph))

t = 1.0
t_min = self.t_min
alpha = 0.9
opt_acq = self._get_init_opt_acq_value()
num_iter = 0
# logger.info('initial queue size ', pq.qsize())
while not pq.empty() and t > t_min:
num_iter += 1
elem = pq.get()
# logger.info("elem.metric_value:{}".format(elem.metric_value))
# logger.info("opt_acq:{}".format(opt_acq))
if self.optimizemode is OptimizeMode.Maximize:
temp_exp = min((elem.metric_value - opt_acq) / t, 1.0)
else:
temp_exp = min((opt_acq - elem.metric_value) / t, 1.0)
# logger.info("temp_exp this round ", temp_exp)
ap = math.exp(temp_exp)
# logger.info("ap this round ", ap)
if ap >= random.uniform(0, 1):
# line 9,10 in algorithm 1
for temp_graph in transform(elem.graph):
                    # Skip networks that have already appeared.
if contain(descriptors, temp_graph.extract_descriptor()):
continue
                    # Use acq as the score given by the Bayesian model.
temp_acq_value = self.acq(temp_graph)

                    # The priority queue keeps growing; networks produced by transform are pushed as well.
pq.put(
                        # Remember which father this model was morphed from.
elem_class(
temp_acq_value,
elem.father_id,
temp_graph))
# logger.info('temp_acq_value ', temp_acq_value)
# logger.info('queue size ', pq.qsize())
descriptors.append(temp_graph.extract_descriptor())
                    # Pick the best one as the father.
if self._accept_new_acq_value(opt_acq, temp_acq_value):
opt_acq = temp_acq_value
father_id = elem.father_id
target_graph = deepcopy(temp_graph)
t *= alpha
# logger.info('number of iter in this search {}'.format(num_iter))
        # Did not find a non-duplicated architecture.
if father_id is None:
return None, None
nm_graph = self.searcher.load_model_by_id(father_id)
        # Starting from the current father graph, replay the operation_history recorded in target_graph
        # step by step to morph the father network into target_graph.
        # Because clear_operation_history() was called before pushing into pq, target_graph only keeps
        # the operations from the current father network to target_graph, whereas nm_graph's
        # operation_history keeps the full history back to the base model.
for args in target_graph.operation_history:
getattr(nm_graph, args[0])(*list(args[1:]))
# target space
return nm_graph, father_id

# equation (10)
def acq(self, graph):
        ''' Estimate the acquisition value of the generated graph.
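        For maximization this is the upper confidence bound mean + beta * std; for minimization it
        is mean - beta * std (equation (10) in the Auto-Keras paper).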
'''
mean, std = self.gpr.predict(np.array([graph.extract_descriptor()]))
if self.optimizemode is OptimizeMode.Maximize:
return mean + self.beta * std
return mean - self.beta * std

def _get_init_opt_acq_value(self):
if self.optimizemode is OptimizeMode.Maximize:
return -np.inf
return np.inf

def _accept_new_acq_value(self, opt_acq, temp_acq_value):
if temp_acq_value > opt_acq and self.optimizemode is OptimizeMode.Maximize:
return True
        if temp_acq_value < opt_acq and self.optimizemode is not OptimizeMode.Maximize:
return True
return False

def add_child(self, father_id, model_id):
''' add child to the search tree
Arguments:
father_id {int} -- father id
model_id {int} -- model id
'''

self.search_tree.add_child(father_id, model_id)


@total_ordering
class Elem:
"""Elements to be sorted according to metric value."""

def __init__(self, metric_value, father_id, graph):
self.father_id = father_id
self.graph = graph
self.metric_value = metric_value

def __eq__(self, other):
return self.metric_value == other.metric_value

def __lt__(self, other):
return self.metric_value < other.metric_value


class ReverseElem(Elem):
"""Elements to be reversely sorted according to metric value."""

def __lt__(self, other):
return self.metric_value > other.metric_value


def contain(descriptors, target_descriptor):
"""Check if the target descriptor is in the descriptors."""
for descriptor in descriptors:
if edit_distance(descriptor, target_descriptor) < 1e-5:
return True
return False


class SearchTree:
"""The network morphism search tree."""

def __init__(self):
self.root = None
self.adj_list = {}

def add_child(self, u, v):
''' add child to search tree itself.
Arguments:
u {int} -- father id
v {int} -- child id
'''

if u == -1:
self.root = v
self.adj_list[v] = []
return
if v not in self.adj_list[u]:
self.adj_list[u].append(v)
if v not in self.adj_list:
self.adj_list[v] = []

def get_dict(self, u=None):
""" A recursive function to return the content of the tree in a dict."""
if u is None:
return self.get_dict(self.root)
children = []
for v in self.adj_list[u]:
children.append(self.get_dict(v))
ret = {"name": u, "children": children}
return ret

+ 919
- 0
dubhe-tadl/network_morphism/algorithm/graph.py View File

@@ -0,0 +1,919 @@
import json
from collections.abc import Iterable
from copy import deepcopy, copy
from queue import Queue

import numpy as np
import torch

from .layer_transformer import (
add_noise,
wider_bn,
wider_next_conv,
wider_next_dense,
wider_pre_conv,
wider_pre_dense,
init_dense_weight,
init_conv_weight,
init_bn_weight,
)
from .layers import (
StubAdd,
StubConcatenate,
StubReLU,
get_batch_norm_class,
get_conv_class,
is_layer,
layer_width,
set_stub_weight_to_torch,
set_torch_weight_to_stub,
layer_description_extractor,
layer_description_builder,
)
from utils import Constant


class NetworkDescriptor:
"""A class describing the neural architecture for neural network kernel.
    It only records the width of convolutional and dense layers, and the skip-connection types and positions.
"""

CONCAT_CONNECT = "concat"
ADD_CONNECT = "add"

def __init__(self):
self.skip_connections = []
self.layers = []

@property
def n_layers(self):
return len(self.layers)

def add_skip_connection(self, u, v, connection_type):
""" Add a skip-connection to the descriptor.
Args:
u: Number of convolutional layers before the starting point.
v: Number of convolutional layers before the ending point.
connection_type: Must be either CONCAT_CONNECT or ADD_CONNECT.
"""
if connection_type not in [self.CONCAT_CONNECT, self.ADD_CONNECT]:
raise ValueError(
"connection_type should be NetworkDescriptor.CONCAT_CONNECT "
"or NetworkDescriptor.ADD_CONNECT."
)

self.skip_connections.append((u, v, connection_type))

def to_json(self):
''' NetworkDescriptor to json representation
'''

skip_list = []
for u, v, connection_type in self.skip_connections:
skip_list.append({"from": u, "to": v, "type": connection_type})
return {"node_list": self.layers, "skip_list": skip_list}

def add_layer(self, layer):
''' add one layer
'''
self.layers.append(layer)


class Node:
"""A class for intermediate output tensor (node) in the Graph.
Attributes:
shape: A tuple describing the shape of the tensor.
"""

def __init__(self, shape):
self.shape = shape


class Graph:
"""A class representing the neural architecture graph of a model.
Graph extracts the neural architecture graph from a model.
    Each node in the graph is an intermediate tensor between layers.
Each layer is an edge in the graph.
Notably, multiple edges may refer to the same layer.
    (e.g. an Add layer adds two tensors into one tensor, so it is related to two edges.)
Attributes:
weighted: A boolean of whether the weights and biases in the neural network
should be included in the graph.
input_shape: A tuple of integers, which does not include the batch axis.
node_list: A list of integers. The indices of the list are the identifiers.
layer_list: A list of stub layers. The indices of the list are the identifiers.
node_to_id: A dict instance mapping from node integers to their identifiers.
layer_to_id: A dict instance mapping from stub layers to their identifiers.
layer_id_to_input_node_ids: A dict instance mapping from layer identifiers
to their input nodes identifiers.
layer_id_to_output_node_ids: A dict instance mapping from layer identifiers
to their output nodes identifiers.
adj_list: A two dimensional list. The adjacency list of the graph. The first dimension is
identified by tensor identifiers. In each edge list, the elements are two-element tuples
of (tensor identifier, layer identifier).
reverse_adj_list: A reverse adjacent list in the same format as adj_list.
operation_history: A list saving all the network morphism operations.
        vis: A dictionary of temporary storage for whether a local operation has been done
during the network morphism.
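
    Example:
        A minimal sketch of building a graph by hand (the constructor arguments follow how the stub
        layers are used elsewhere in this file; the 1x1 conv and the input shape are illustrative):

            graph = Graph((32, 32, 3), weighted=False)
            conv = get_conv_class(graph.n_dim)(3, 16, 1)   # in_channels, out_channels, kernel_size
            out_id = graph.add_layer(conv, 0)              # node 0 is the input node created in __init__
            out_id = graph.add_layer(StubReLU(), out_id)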
"""

def __init__(self, input_shape, weighted=True):
"""Initializer for Graph.
"""
self.input_shape = input_shape
self.weighted = weighted
self.node_list = []
self.layer_list = []
# node id start with 0
self.node_to_id = {}
self.layer_to_id = {}
self.layer_id_to_input_node_ids = {}
self.layer_id_to_output_node_ids = {}
self.adj_list = {}
self.reverse_adj_list = {}
self.operation_history = []
self.n_dim = len(input_shape) - 1
self.conv = get_conv_class(self.n_dim)
self.batch_norm = get_batch_norm_class(self.n_dim)

self.vis = None
self._add_node(Node(input_shape))

def add_layer(self, layer, input_node_id):
"""Add a layer to the Graph.
Args:
layer: An instance of the subclasses of StubLayer in layers.py.
input_node_id: An integer. The ID of the input node of the layer.
Returns:
output_node_id: An integer. The ID of the output node of the layer.
"""
if isinstance(input_node_id, Iterable):
layer.input = list(map(lambda x: self.node_list[x], input_node_id))
output_node_id = self._add_node(Node(layer.output_shape))
for node_id in input_node_id:
self._add_edge(layer, node_id, output_node_id)

else:
layer.input = self.node_list[input_node_id]
output_node_id = self._add_node(Node(layer.output_shape))
self._add_edge(layer, input_node_id, output_node_id)

layer.output = self.node_list[output_node_id]
return output_node_id

def clear_operation_history(self):
self.operation_history = []

@property
def n_nodes(self):
"""Return the number of nodes in the model."""
return len(self.node_list)

@property
def n_layers(self):
"""Return the number of layers in the model."""
return len(self.layer_list)

def _add_node(self, node):
"""Add a new node to node_list and give the node an ID.
Args:
node: An instance of Node.
Returns:
node_id: An integer.
"""
node_id = len(self.node_list)
self.node_to_id[node] = node_id
self.node_list.append(node)
self.adj_list[node_id] = []
self.reverse_adj_list[node_id] = []
return node_id

def _add_edge(self, layer, input_id, output_id):
"""Add a new layer to the graph. The nodes should be created in advance."""

if layer in self.layer_to_id:
layer_id = self.layer_to_id[layer]
if input_id not in self.layer_id_to_input_node_ids[layer_id]:
self.layer_id_to_input_node_ids[layer_id].append(input_id)
if output_id not in self.layer_id_to_output_node_ids[layer_id]:
self.layer_id_to_output_node_ids[layer_id].append(output_id)
else:
layer_id = len(self.layer_list)
self.layer_list.append(layer)
self.layer_to_id[layer] = layer_id
self.layer_id_to_input_node_ids[layer_id] = [input_id]
self.layer_id_to_output_node_ids[layer_id] = [output_id]

self.adj_list[input_id].append((output_id, layer_id))
self.reverse_adj_list[output_id].append((input_id, layer_id))

def _redirect_edge(self, u_id, v_id, new_v_id):
"""Redirect the layer to a new node.
Change the edge originally from `u_id` to `v_id` into an edge from `u_id` to `new_v_id`
while keeping all other property of the edge the same.
"""
layer_id = None
for index, edge_tuple in enumerate(self.adj_list[u_id]):
if edge_tuple[0] == v_id:
layer_id = edge_tuple[1]
self.adj_list[u_id][index] = (new_v_id, layer_id)
self.layer_list[layer_id].output = self.node_list[new_v_id]
break

for index, edge_tuple in enumerate(self.reverse_adj_list[v_id]):
if edge_tuple[0] == u_id:
layer_id = edge_tuple[1]
self.reverse_adj_list[v_id].remove(edge_tuple)
break
self.reverse_adj_list[new_v_id].append((u_id, layer_id))
for index, value in enumerate(
self.layer_id_to_output_node_ids[layer_id]):
if value == v_id:
self.layer_id_to_output_node_ids[layer_id][index] = new_v_id
break

def _replace_layer(self, layer_id, new_layer):
"""Replace the layer with a new layer."""
old_layer = self.layer_list[layer_id]
new_layer.input = old_layer.input
new_layer.output = old_layer.output
new_layer.output.shape = new_layer.output_shape
self.layer_list[layer_id] = new_layer
self.layer_to_id[new_layer] = layer_id
self.layer_to_id.pop(old_layer)

@property
def topological_order(self):
"""Return the topological order of the node IDs from the input node to the output node."""
q = Queue()
in_degree = {}
for i in range(self.n_nodes):
in_degree[i] = 0
for u in range(self.n_nodes):
for v, _ in self.adj_list[u]:
in_degree[v] += 1
for i in range(self.n_nodes):
if in_degree[i] == 0:
q.put(i)

order_list = []
while not q.empty():
u = q.get()
order_list.append(u)
for v, _ in self.adj_list[u]:
in_degree[v] -= 1
if in_degree[v] == 0:
q.put(v)
return order_list

def _get_pooling_layers(self, start_node_id, end_node_id):
"""
Given two node IDs, return all the pooling layers between them.
        A Conv layer with stride > 1 is also considered a pooling layer.
"""

layer_list = []
node_list = [start_node_id]
assert self._depth_first_search(end_node_id, layer_list, node_list)
ret = []
for layer_id in layer_list:
layer = self.layer_list[layer_id]
if is_layer(layer, "Pooling"):
ret.append(layer)
elif is_layer(layer, "Conv") and layer.stride != 1:
ret.append(layer)
return ret

def _depth_first_search(self, target_id, layer_id_list, node_list):
"""Search for all the layers and nodes down the path.
A recursive function to search all the layers and nodes between the node in the node_list
and the node with target_id."""
assert len(node_list) <= self.n_nodes
u = node_list[-1]
if u == target_id:
return True

for v, layer_id in self.adj_list[u]:
layer_id_list.append(layer_id)
node_list.append(v)
if self._depth_first_search(target_id, layer_id_list, node_list):
return True
layer_id_list.pop()
node_list.pop()

return False

def _search(self, u, start_dim, total_dim, n_add):
"""Search the graph for all the layers to be widened caused by an operation.
        It is a recursive function with a duplication check to avoid infinite recursion.
        It searches from a starting node u until the corresponding layers have been widened.
Args:
u: The starting node ID.
start_dim: The position to insert the additional dimensions.
total_dim: The total number of dimensions the layer has before widening.
n_add: The number of dimensions to add.
"""
if (u, start_dim, total_dim, n_add) in self.vis:
return
self.vis[(u, start_dim, total_dim, n_add)] = True
for v, layer_id in self.adj_list[u]:
layer = self.layer_list[layer_id]

if is_layer(layer, "Conv"):
new_layer = wider_next_conv(
layer, start_dim, total_dim, n_add, self.weighted
)
self._replace_layer(layer_id, new_layer)

elif is_layer(layer, "Dense"):
new_layer = wider_next_dense(
layer, start_dim, total_dim, n_add, self.weighted
)
self._replace_layer(layer_id, new_layer)

elif is_layer(layer, "BatchNormalization"):
new_layer = wider_bn(
layer, start_dim, total_dim, n_add, self.weighted)
self._replace_layer(layer_id, new_layer)
self._search(v, start_dim, total_dim, n_add)

elif is_layer(layer, "Concatenate"):
if self.layer_id_to_input_node_ids[layer_id][1] == u:
# u is on the right of the concat
# next_start_dim += next_total_dim - total_dim
left_dim = self._upper_layer_width(
self.layer_id_to_input_node_ids[layer_id][0]
)
next_start_dim = start_dim + left_dim
next_total_dim = total_dim + left_dim
else:
next_start_dim = start_dim
next_total_dim = total_dim + self._upper_layer_width(
self.layer_id_to_input_node_ids[layer_id][1]
)
self._search(v, next_start_dim, next_total_dim, n_add)

else:
self._search(v, start_dim, total_dim, n_add)

for v, layer_id in self.reverse_adj_list[u]:
layer = self.layer_list[layer_id]
if is_layer(layer, "Conv"):
new_layer = wider_pre_conv(layer, n_add, self.weighted)
self._replace_layer(layer_id, new_layer)
elif is_layer(layer, "Dense"):
new_layer = wider_pre_dense(layer, n_add, self.weighted)
self._replace_layer(layer_id, new_layer)
elif is_layer(layer, "Concatenate"):
continue
else:
self._search(v, start_dim, total_dim, n_add)

def _upper_layer_width(self, u):
for v, layer_id in self.reverse_adj_list[u]:
layer = self.layer_list[layer_id]
if is_layer(layer, "Conv") or is_layer(layer, "Dense"):
return layer_width(layer)
elif is_layer(layer, "Concatenate"):
a = self.layer_id_to_input_node_ids[layer_id][0]
b = self.layer_id_to_input_node_ids[layer_id][1]
return self._upper_layer_width(a) + self._upper_layer_width(b)
else:
return self._upper_layer_width(v)
return self.node_list[0].shape[-1]

def to_deeper_model(self, target_id, new_layer):
"""Insert a relu-conv-bn block after the target block.
Args:
target_id: A convolutional layer ID. The new block should be inserted after the block.
new_layer: An instance of StubLayer subclasses.
"""
self.operation_history.append(
("to_deeper_model", target_id, new_layer))
input_id = self.layer_id_to_input_node_ids[target_id][0]
output_id = self.layer_id_to_output_node_ids[target_id][0]
if self.weighted:
if is_layer(new_layer, "Dense"):
init_dense_weight(new_layer)
elif is_layer(new_layer, "Conv"):
init_conv_weight(new_layer)
elif is_layer(new_layer, "BatchNormalization"):
init_bn_weight(new_layer)

self._insert_new_layers([new_layer], input_id, output_id)

def to_wider_model(self, pre_layer_id, n_add):
"""Widen the last dimension of the output of the pre_layer.
Args:
pre_layer_id: The ID of a convolutional layer or dense layer.
n_add: The number of dimensions to add.
"""
self.operation_history.append(("to_wider_model", pre_layer_id, n_add))
pre_layer = self.layer_list[pre_layer_id]
output_id = self.layer_id_to_output_node_ids[pre_layer_id][0]
dim = layer_width(pre_layer)
self.vis = {}
self._search(output_id, dim, dim, n_add)
# Update the tensor shapes.
for u in self.topological_order:
for v, layer_id in self.adj_list[u]:
self.node_list[v].shape = self.layer_list[layer_id].output_shape

def _insert_new_layers(self, new_layers, start_node_id, end_node_id):
"""Insert the new_layers after the node with start_node_id."""
new_node_id = self._add_node(deepcopy(self.node_list[end_node_id]))
temp_output_id = new_node_id
for layer in new_layers[:-1]:
temp_output_id = self.add_layer(layer, temp_output_id)

self._add_edge(new_layers[-1], temp_output_id, end_node_id)
new_layers[-1].input = self.node_list[temp_output_id]
new_layers[-1].output = self.node_list[end_node_id]
self._redirect_edge(start_node_id, end_node_id, new_node_id)

def _block_end_node(self, layer_id, block_size):
ret = self.layer_id_to_output_node_ids[layer_id][0]
for _ in range(block_size - 2):
ret = self.adj_list[ret][0][0]
return ret

def _dense_block_end_node(self, layer_id):
return self.layer_id_to_input_node_ids[layer_id][0]

def _conv_block_end_node(self, layer_id):
"""Get the input node ID of the last layer in the block by layer ID.
Return the input node ID of the last layer in the convolutional block.
Args:
layer_id: the convolutional layer ID.
"""
return self._block_end_node(layer_id, Constant.CONV_BLOCK_DISTANCE)

def to_add_skip_model(self, start_id, end_id):
"""Add a weighted add skip-connection from after start node to end node.
Args:
start_id: The convolutional layer ID, after which to start the skip-connection.
end_id: The convolutional layer ID, after which to end the skip-connection.
"""
self.operation_history.append(("to_add_skip_model", start_id, end_id))
filters_end = self.layer_list[end_id].output.shape[-1]
filters_start = self.layer_list[start_id].output.shape[-1]
start_node_id = self.layer_id_to_output_node_ids[start_id][0]

pre_end_node_id = self.layer_id_to_input_node_ids[end_id][0]
end_node_id = self.layer_id_to_output_node_ids[end_id][0]

skip_output_id = self._insert_pooling_layer_chain(
start_node_id, end_node_id)

# Add the conv layer in order to align the number of channels with end layer id
new_conv_layer = get_conv_class(
self.n_dim)(
filters_start,
filters_end,
1)
skip_output_id = self.add_layer(new_conv_layer, skip_output_id)

# Add the add layer.
add_input_node_id = self._add_node(
deepcopy(self.node_list[end_node_id]))
add_layer = StubAdd()

self._redirect_edge(pre_end_node_id, end_node_id, add_input_node_id)
self._add_edge(add_layer, add_input_node_id, end_node_id)
self._add_edge(add_layer, skip_output_id, end_node_id)
add_layer.input = [
self.node_list[add_input_node_id],
self.node_list[skip_output_id],
]
add_layer.output = self.node_list[end_node_id]
self.node_list[end_node_id].shape = add_layer.output_shape

# Set weights to the additional conv layer.
if self.weighted:
filter_shape = (1,) * self.n_dim
weights = np.zeros((filters_end, filters_start) + filter_shape)
bias = np.zeros(filters_end)
new_conv_layer.set_weights(
(add_noise(weights, np.array([0, 1])), add_noise(
bias, np.array([0, 1])))
)

    def to_concat_skip_model(self, start_id, end_id):
        """Add a weighted concatenate skip-connection from after the start node to the end node.
Args:
start_id: The convolutional layer ID, after which to start the skip-connection.
end_id: The convolutional layer ID, after which to end the skip-connection.
"""
self.operation_history.append(
("to_concat_skip_model", start_id, end_id))
filters_end = self.layer_list[end_id].output.shape[-1]
filters_start = self.layer_list[start_id].output.shape[-1]
start_node_id = self.layer_id_to_output_node_ids[start_id][0]

pre_end_node_id = self.layer_id_to_input_node_ids[end_id][0]
end_node_id = self.layer_id_to_output_node_ids[end_id][0]

skip_output_id = self._insert_pooling_layer_chain(
start_node_id, end_node_id)

concat_input_node_id = self._add_node(
deepcopy(self.node_list[end_node_id]))
self._redirect_edge(pre_end_node_id, end_node_id, concat_input_node_id)

concat_layer = StubConcatenate()
concat_layer.input = [
self.node_list[concat_input_node_id],
self.node_list[skip_output_id],
]
concat_output_node_id = self._add_node(Node(concat_layer.output_shape))
self._add_edge(
concat_layer,
concat_input_node_id,
concat_output_node_id)
self._add_edge(concat_layer, skip_output_id, concat_output_node_id)
concat_layer.output = self.node_list[concat_output_node_id]
self.node_list[concat_output_node_id].shape = concat_layer.output_shape

# Add the concatenate layer.
        # Concatenation increases the channel count; add a conv layer to map back to the original channel count.
new_conv_layer = get_conv_class(self.n_dim)(
filters_start + filters_end, filters_end, 1
)
self._add_edge(new_conv_layer, concat_output_node_id, end_node_id)
new_conv_layer.input = self.node_list[concat_output_node_id]
new_conv_layer.output = self.node_list[end_node_id]
self.node_list[end_node_id].shape = new_conv_layer.output_shape

if self.weighted:
filter_shape = (1,) * self.n_dim
weights = np.zeros((filters_end, filters_end) + filter_shape)
for i in range(filters_end):
filter_weight = np.zeros((filters_end,) + filter_shape)
center_index = (i,) + (0,) * self.n_dim
filter_weight[center_index] = 1
weights[i, ...] = filter_weight
weights = np.concatenate(
(weights, np.zeros((filters_end, filters_start) + filter_shape)), axis=1
)
bias = np.zeros(filters_end)
new_conv_layer.set_weights(
(add_noise(weights, np.array([0, 1])), add_noise(
bias, np.array([0, 1])))
)

    def _insert_pooling_layer_chain(self, start_node_id, end_node_id):
        """
        Insert a chain of pooling layers between start_node_id and end_node_id.
        """
skip_output_id = start_node_id
        # Collect all pooling layers between start_node_id and end_node_id (including conv layers with stride > 1).
for layer in self._get_pooling_layers(start_node_id, end_node_id):
new_layer = deepcopy(layer)
            # Conv layers need their weights re-initialized.
if is_layer(new_layer, "Conv"):
                # Number of channels at start_node_id.
filters = self.node_list[start_node_id].shape[-1]
new_layer = get_conv_class(self.n_dim)(
filters, filters, 1, layer.stride)
if self.weighted:
init_conv_weight(new_layer)
else:
new_layer = deepcopy(layer)
skip_output_id = self.add_layer(new_layer, skip_output_id)
skip_output_id = self.add_layer(StubReLU(), skip_output_id)
return skip_output_id

    def extract_descriptor(self):
        """Extract the description of the Graph as an instance of NetworkDescriptor."""
main_chain = self.get_main_chain()
index_in_main_chain = {}
for index, u in enumerate(main_chain):
index_in_main_chain[u] = index

ret = NetworkDescriptor()
for u in main_chain:
for v, layer_id in self.adj_list[u]:
if v not in index_in_main_chain:
continue
layer = self.layer_list[layer_id]
copied_layer = copy(layer)
copied_layer.weights = None
ret.add_layer(deepcopy(copied_layer))

for u in index_in_main_chain:
for v, layer_id in self.adj_list[u]:
if v not in index_in_main_chain:
temp_u = u
temp_v = v
temp_layer_id = layer_id
skip_type = None
while not (
temp_v in index_in_main_chain and temp_u in index_in_main_chain):
if is_layer(
self.layer_list[temp_layer_id], "Concatenate"):
skip_type = NetworkDescriptor.CONCAT_CONNECT
if is_layer(self.layer_list[temp_layer_id], "Add"):
skip_type = NetworkDescriptor.ADD_CONNECT
temp_u = temp_v
temp_v, temp_layer_id = self.adj_list[temp_v][0]
ret.add_skip_connection(
index_in_main_chain[u], index_in_main_chain[temp_u], skip_type
)

elif index_in_main_chain[v] - index_in_main_chain[u] != 1:
skip_type = None
if is_layer(self.layer_list[layer_id], "Concatenate"):
skip_type = NetworkDescriptor.CONCAT_CONNECT
if is_layer(self.layer_list[layer_id], "Add"):
skip_type = NetworkDescriptor.ADD_CONNECT
ret.add_skip_connection(
index_in_main_chain[u], index_in_main_chain[v], skip_type
)

return ret

def clear_weights(self):
''' clear weights of the graph
'''
self.weighted = False
for layer in self.layer_list:
layer.weights = None

def produce_torch_model(self):
"""Build a new Torch model based on the current graph."""
return TorchModel(self)

def produce_json_model(self):
"""Build a new Json model based on the current graph."""
return JSONModel(self).data

@classmethod
def parsing_json_model(cls, json_model):
'''build a graph from json
'''
return json_to_graph(json_model)

def _layer_ids_in_order(self, layer_ids):
node_id_to_order_index = {}
for index, node_id in enumerate(self.topological_order):
node_id_to_order_index[node_id] = index
return sorted(
layer_ids,
key=lambda layer_id: node_id_to_order_index[
self.layer_id_to_output_node_ids[layer_id][0]
],
)

def _layer_ids_by_type(self, type_str):
return list(
filter(
lambda layer_id: is_layer(self.layer_list[layer_id], type_str),
range(self.n_layers),
)
)

def get_main_chain_layers(self):
"""Return a list of layer IDs in the main chain."""
main_chain = self.get_main_chain()
ret = []
for u in main_chain:
for v, layer_id in self.adj_list[u]:
if v in main_chain and u in main_chain:
ret.append(layer_id)
return ret

def _conv_layer_ids_in_order(self):
return list(
filter(
lambda layer_id: is_layer(self.layer_list[layer_id], "Conv"),
self.get_main_chain_layers(),
)
)

def _dense_layer_ids_in_order(self):
return self._layer_ids_in_order(self._layer_ids_by_type("Dense"))

def deep_layer_ids(self):
ret = []
for layer_id in self.get_main_chain_layers():
layer = self.layer_list[layer_id]
            # Do not insert layers after global average pooling (GAP).
if is_layer(layer, "GlobalAveragePooling"):
break
if is_layer(layer, "Add") or is_layer(layer, "Concatenate"):
continue
ret.append(layer_id)
return ret

def wide_layer_ids(self):
return (
self._conv_layer_ids_in_order(
)[:-1] + self._dense_layer_ids_in_order()[:-1]
)

def skip_connection_layer_ids(self):
return self.deep_layer_ids()[:-1]

def size(self):
return sum(list(map(lambda x: x.size(), self.layer_list)))

def get_main_chain(self):
"""Returns the main chain node ID list."""
pre_node = {}
distance = {}
        # Initialize each node's distance to 0 and set its predecessor to itself.
for i in range(self.n_nodes):
distance[i] = 0
pre_node[i] = i

        # Traverse all nodes and, via the adjacency list, update each node's longest distance and predecessor.
for i in range(self.n_nodes - 1):
for u in range(self.n_nodes):
for v, _ in self.adj_list[u]:
if distance[u] + 1 > distance[v]:
distance[v] = distance[u] + 1
pre_node[v] = u

        # temp_id records the node with the largest distance.
temp_id = 0
for i in range(self.n_nodes):
if distance[i] > distance[temp_id]:
temp_id = i

        # Starting from the node with the largest distance, repeatedly follow predecessors to recover the main chain.
ret = []
for i in range(self.n_nodes + 5):
ret.append(temp_id)
if pre_node[temp_id] == temp_id:
break
temp_id = pre_node[temp_id]
assert temp_id == pre_node[temp_id]
ret.reverse()
return ret


class TorchModel(torch.nn.Module):
"""A neural network class using pytorch constructed from an instance of Graph."""

def __init__(self, graph):
super(TorchModel, self).__init__()
self.graph = graph
self.layers = torch.nn.ModuleList()
for layer in graph.layer_list:
self.layers.append(layer.to_real_layer())
if graph.weighted:
for index, layer in enumerate(self.layers):
set_stub_weight_to_torch(self.graph.layer_list[index], layer)
for index, layer in enumerate(self.layers):
self.add_module(str(index), layer)

def forward(self, input_tensor):
topo_node_list = self.graph.topological_order
output_id = topo_node_list[-1]
input_id = topo_node_list[0]

node_list = deepcopy(self.graph.node_list)
node_list[input_id] = input_tensor

for v in topo_node_list:
for u, layer_id in self.graph.reverse_adj_list[v]:
layer = self.graph.layer_list[layer_id]
torch_layer = self.layers[layer_id]

if isinstance(layer, (StubAdd, StubConcatenate)):
edge_input_tensor = list(
map(
lambda x: node_list[x],
self.graph.layer_id_to_input_node_ids[layer_id],
)
)
else:
edge_input_tensor = node_list[u]

temp_tensor = torch_layer(edge_input_tensor)
node_list[v] = temp_tensor
return node_list[output_id]

def set_weight_to_graph(self):
self.graph.weighted = True
for index, layer in enumerate(self.layers):
set_torch_weight_to_stub(layer, self.graph.layer_list[index])


class JSONModel:
def __init__(self, graph):
data = dict()
node_list = list()
layer_list = list()
operation_history = list()

data["input_shape"] = graph.input_shape
vis = graph.vis
data["vis"] = list(vis.keys()) if vis is not None else None
data["weighted"] = graph.weighted

for item in graph.operation_history:
if item[0] == "to_deeper_model":
operation_history.append(
[
item[0],
item[1],
layer_description_extractor(item[2], graph.node_to_id),
]
)
else:
operation_history.append(item)
data["operation_history"] = operation_history
data["layer_id_to_input_node_ids"] = graph.layer_id_to_input_node_ids
data["layer_id_to_output_node_ids"] = graph.layer_id_to_output_node_ids
data["adj_list"] = graph.adj_list
data["reverse_adj_list"] = graph.reverse_adj_list

for node in graph.node_list:
node_id = graph.node_to_id[node]
node_information = node.shape
node_list.append((node_id, node_information))

for layer_id, item in enumerate(graph.layer_list):
layer = graph.layer_list[layer_id]
layer_information = layer_description_extractor(
layer, graph.node_to_id)
layer_list.append((layer_id, layer_information))

data["node_list"] = node_list
data["layer_list"] = layer_list

self.data = data


def graph_to_json(graph, json_model_path):
json_out = graph.produce_json_model()
with open(json_model_path, "w") as fout:
json.dump(json_out, fout)
json_out = json.dumps(json_out)
return json_out


def json_to_graph(json_model: str):
json_model = json.loads(json_model)
# restore graph data from json data
input_shape = tuple(json_model["input_shape"])
node_list = list()
node_to_id = dict()
id_to_node = dict()
layer_list = list()
layer_to_id = dict()
operation_history = list()
graph = Graph(input_shape, False)

graph.input_shape = input_shape
vis = json_model["vis"]
graph.vis = {
tuple(item): True for item in vis} if vis is not None else None
graph.weighted = json_model["weighted"]
layer_id_to_input_node_ids = json_model["layer_id_to_input_node_ids"]
graph.layer_id_to_input_node_ids = {
int(k): v for k, v in layer_id_to_input_node_ids.items()
}
layer_id_to_output_node_ids = json_model["layer_id_to_output_node_ids"]
graph.layer_id_to_output_node_ids = {
int(k): v for k, v in layer_id_to_output_node_ids.items()
}
adj_list = {}
for k, v in json_model["adj_list"].items():
adj_list[int(k)] = [tuple(i) for i in v]
graph.adj_list = adj_list
reverse_adj_list = {}
for k, v in json_model["reverse_adj_list"].items():
reverse_adj_list[int(k)] = [tuple(i) for i in v]
graph.reverse_adj_list = reverse_adj_list

for item in json_model["node_list"]:
new_node = Node(tuple(item[1]))
node_id = item[0]
node_list.append(new_node)
node_to_id[new_node] = node_id
id_to_node[node_id] = new_node

for item in json_model["operation_history"]:
if item[0] == "to_deeper_model":
operation_history.append(
(item[0], item[1], layer_description_builder(item[2], id_to_node))
)
else:
operation_history.append(item)
graph.operation_history = operation_history

for item in json_model["layer_list"]:
new_layer = layer_description_builder(item[1], id_to_node)
layer_id = int(item[0])
layer_list.append(new_layer)
layer_to_id[new_layer] = layer_id

graph.node_list = node_list
graph.node_to_id = node_to_id
graph.layer_list = layer_list
graph.layer_to_id = layer_to_id

return graph
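
For quick reference, a minimal usage sketch of the JSON round-trip above (not part of the diff); it assumes `graph` is a Graph instance built elsewhere, e.g. by one of the generators in nn.py, and only uses names defined in this file:

def roundtrip_example(graph, path="model_selected_space.json"):
    # graph_to_json writes the JSON file and returns the JSON string
    json_str = graph_to_json(graph, path)
    # json_to_graph rebuilds an equivalent Graph from that string
    restored = json_to_graph(json_str)
    assert restored.weighted == graph.weighted
    return restored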

+ 178
- 0
dubhe-tadl/network_morphism/algorithm/graph_transformer.py View File

@@ -0,0 +1,178 @@
from copy import deepcopy

from random import randrange, sample

from .graph import NetworkDescriptor
from .layers import (
StubDense,
StubReLU,
get_batch_norm_class,
get_conv_class,
get_dropout_class,
get_pooling_class,
is_layer,
)
from utils import Constant


def to_wider_graph(graph):
    '''Randomly pick a weighted layer and widen it (add filters or units).
    '''
weighted_layer_ids = graph.wide_layer_ids()
weighted_layer_ids = list(
filter(
lambda x: graph.layer_list[x].output.shape[-1], weighted_layer_ids)
)
wider_layers = sample(weighted_layer_ids, 1)

# count the number of layers with width larger than the max width
layer_width_maxed = 0

for layer_id in wider_layers:
layer = graph.layer_list[layer_id]
if is_layer(layer, "Conv"):
n_add = layer.filters
else:
n_add = layer.units
if n_add*2 > Constant.MAX_LAYER_WIDTH:
layer_width_maxed += 1
continue

graph.to_wider_model(layer_id, n_add)

if layer_width_maxed == len(wider_layers):
return None
return graph


def to_skip_connection_graph(graph):
    '''Randomly add a skip connection (add or concat) between two layers in the graph.
    '''
    # The last conv layer cannot be widened, since the wider operator cannot be
    # applied across the two sides of the flatten layer.
weighted_layer_ids = graph.skip_connection_layer_ids()
valid_connection = []
for skip_type in sorted(
[NetworkDescriptor.ADD_CONNECT, NetworkDescriptor.CONCAT_CONNECT]):
for index_a in range(len(weighted_layer_ids)):
for index_b in range(len(weighted_layer_ids))[index_a + 1:]:
valid_connection.append((index_a, index_b, skip_type))

if len(valid_connection) < 1:
return graph
for index_a, index_b, skip_type in sample(valid_connection, 1):
a_id = weighted_layer_ids[index_a]
b_id = weighted_layer_ids[index_b]
if skip_type == NetworkDescriptor.ADD_CONNECT:
graph.to_add_skip_model(a_id, b_id)
else:
graph.to_concat_skip_model(a_id, b_id)
return graph


def create_new_layer(layer, n_dim):
''' create new layer for the graph
'''

input_shape = layer.output.shape
    # default candidate layer classes
dense_deeper_classes = [StubDense, get_dropout_class(n_dim), StubReLU]
conv_deeper_classes = [
get_conv_class(n_dim),
get_batch_norm_class(n_dim),
StubReLU]
    # three special cases that use different candidate layer classes
if is_layer(layer, "ReLU"):
conv_deeper_classes = [
get_conv_class(n_dim),
get_batch_norm_class(n_dim)]
dense_deeper_classes = [StubDense, get_dropout_class(n_dim)]
elif is_layer(layer, "Dropout"):
dense_deeper_classes = [StubDense, StubReLU]
elif is_layer(layer, "BatchNormalization"):
conv_deeper_classes = [get_conv_class(n_dim), StubReLU]

layer_class = None
if len(input_shape) == 1:
# It is in the dense layer part.
layer_class = sample(dense_deeper_classes, 1)[0]
else:
# It is in the conv layer part.
layer_class = sample(conv_deeper_classes, 1)[0]

if layer_class == StubDense:
new_layer = StubDense(input_shape[0], input_shape[0])

elif layer_class == get_dropout_class(n_dim):
new_layer = layer_class(Constant.DENSE_DROPOUT_RATE)

elif layer_class == get_conv_class(n_dim):
new_layer = layer_class(
input_shape[-1], input_shape[-1], sample((1, 3, 5), 1)[0], stride=1
)

elif layer_class == get_batch_norm_class(n_dim):
new_layer = layer_class(input_shape[-1])

elif layer_class == get_pooling_class(n_dim):
new_layer = layer_class(sample((1, 3, 5), 1)[0])

else:
new_layer = layer_class()

return new_layer


def to_deeper_graph(graph):
    '''Randomly pick a layer and insert a new layer after it to deepen the graph.
    '''

weighted_layer_ids = graph.deep_layer_ids()
if len(weighted_layer_ids) >= Constant.MAX_LAYERS:
return None

deeper_layer_ids = sample(weighted_layer_ids, 1)

for layer_id in deeper_layer_ids:
layer = graph.layer_list[layer_id]
new_layer = create_new_layer(layer, graph.n_dim)
graph.to_deeper_model(layer_id, new_layer)
return graph


def legal_graph(graph):
    '''Judge whether a graph is legal, i.e. it has no duplicated skip connections.
    '''

descriptor = graph.extract_descriptor()
skips = descriptor.skip_connections
if len(skips) != len(set(skips)):
return False
return True


# morph f with operations in O
def transform(graph):
'''core transform function for graph.
'''

graphs = []
for _ in range(Constant.N_NEIGHBOURS * 2):
random_num = randrange(3)
temp_graph = None
if random_num == 0:
temp_graph = to_deeper_graph(deepcopy(graph))
elif random_num == 1:
temp_graph = to_wider_graph(deepcopy(graph))
elif random_num == 2:
temp_graph = to_skip_connection_graph(deepcopy(graph))

if temp_graph is not None and temp_graph.size() <= Constant.MAX_MODEL_SIZE:
graphs.append(temp_graph)

        # keep at most N_NEIGHBOURS candidate graphs
if len(graphs) >= Constant.N_NEIGHBOURS:
break
return graphs
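
A hedged sketch (not part of the diff) of how the morphism operators above can be driven to propose candidate architectures; `graph` is assumed to be a Graph instance, e.g. produced by CnnGenerator in nn.py:

def propose_candidates(graph):
    # transform() applies deepen / widen / skip operators and already caps the
    # number of neighbours at Constant.N_NEIGHBOURS
    return [g for g in transform(graph) if legal_graph(g)]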

+ 213
- 0
dubhe-tadl/network_morphism/algorithm/layer_transformer.py View File

@@ -0,0 +1,213 @@
import numpy as np

from .layers import (
StubDense,
get_batch_norm_class,
get_conv_class,
get_n_dim,
)

NOISE_RATIO = 1e-4

def wider_pre_dense(layer, n_add, weighted=True):
'''wider previous dense layer.
'''
if not weighted:
return StubDense(layer.input_units, layer.units + n_add)

n_units2 = layer.units

teacher_w, teacher_b = layer.get_weights()
rand = np.random.randint(n_units2, size=n_add)
student_w = teacher_w.copy()
student_b = teacher_b.copy()

# target layer update (i)
for i in range(n_add):
teacher_index = rand[i]
new_weight = teacher_w[teacher_index, :]
new_weight = new_weight[np.newaxis, :]
student_w = np.concatenate(
(student_w, add_noise(new_weight, student_w)), axis=0)
student_b = np.append(
student_b, add_noise(
teacher_b[teacher_index], student_b))

new_pre_layer = StubDense(layer.input_units, n_units2 + n_add)
new_pre_layer.set_weights((student_w, student_b))

return new_pre_layer


def wider_pre_conv(layer, n_add_filters, weighted=True):
'''wider previous conv layer.
'''
n_dim = get_n_dim(layer)
if not weighted:
return get_conv_class(n_dim)(
layer.input_channel,
layer.filters + n_add_filters,
kernel_size=layer.kernel_size,
stride=layer.stride
)

n_pre_filters = layer.filters
rand = np.random.randint(n_pre_filters, size=n_add_filters)
teacher_w, teacher_b = layer.get_weights()

student_w = teacher_w.copy()
student_b = teacher_b.copy()
# target layer update (i)
for i in range(len(rand)):
teacher_index = rand[i]
new_weight = teacher_w[teacher_index, ...]
new_weight = new_weight[np.newaxis, ...]
student_w = np.concatenate((student_w, new_weight), axis=0)
student_b = np.append(student_b, teacher_b[teacher_index])
new_pre_layer = get_conv_class(n_dim)(
layer.input_channel,
n_pre_filters + n_add_filters,
kernel_size=layer.kernel_size,
stride=layer.stride
)
new_pre_layer.set_weights(
(add_noise(student_w, teacher_w), add_noise(student_b, teacher_b))
)
return new_pre_layer


def wider_next_conv(layer, start_dim, total_dim, n_add, weighted=True):
'''wider next conv layer.
'''
n_dim = get_n_dim(layer)
if not weighted:
return get_conv_class(n_dim)(layer.input_channel + n_add,
layer.filters,
kernel_size=layer.kernel_size,
stride=layer.stride)
n_filters = layer.filters
teacher_w, teacher_b = layer.get_weights()

new_weight_shape = list(teacher_w.shape)
new_weight_shape[1] = n_add
new_weight = np.zeros(tuple(new_weight_shape))

student_w = np.concatenate((teacher_w[:, :start_dim, ...].copy(),
add_noise(new_weight, teacher_w),
teacher_w[:, start_dim:total_dim, ...].copy()), axis=1)
new_layer = get_conv_class(n_dim)(layer.input_channel + n_add,
n_filters,
kernel_size=layer.kernel_size,
stride=layer.stride)
new_layer.set_weights((student_w, teacher_b))
return new_layer


def wider_bn(layer, start_dim, total_dim, n_add, weighted=True):
'''wider batch norm layer.
'''
n_dim = get_n_dim(layer)
if not weighted:
return get_batch_norm_class(n_dim)(layer.num_features + n_add)

weights = layer.get_weights()

new_weights = [
add_noise(np.ones(n_add, dtype=np.float32), np.array([0, 1])),
add_noise(np.zeros(n_add, dtype=np.float32), np.array([0, 1])),
add_noise(np.zeros(n_add, dtype=np.float32), np.array([0, 1])),
add_noise(np.ones(n_add, dtype=np.float32), np.array([0, 1])),
]

student_w = tuple()
for weight, new_weight in zip(weights, new_weights):
temp_w = weight.copy()
temp_w = np.concatenate(
(temp_w[:start_dim], new_weight, temp_w[start_dim:total_dim])
)
student_w += (temp_w,)
new_layer = get_batch_norm_class(n_dim)(layer.num_features + n_add)
new_layer.set_weights(student_w)
return new_layer


def wider_next_dense(layer, start_dim, total_dim, n_add, weighted=True):
'''wider next dense layer.
'''
if not weighted:
return StubDense(layer.input_units + n_add, layer.units)
teacher_w, teacher_b = layer.get_weights()
student_w = teacher_w.copy()
n_units_each_channel = int(teacher_w.shape[1] / total_dim)

new_weight = np.zeros((teacher_w.shape[0], n_add * n_units_each_channel))
student_w = np.concatenate(
(
student_w[:, : start_dim * n_units_each_channel],
add_noise(new_weight, student_w),
student_w[
:, start_dim * n_units_each_channel: total_dim * n_units_each_channel
],
),
axis=1,
)

new_layer = StubDense(layer.input_units + n_add, layer.units)
new_layer.set_weights((student_w, teacher_b))
return new_layer


def add_noise(weights, other_weights):
'''add noise to the layer.
'''
w_range = np.ptp(other_weights.flatten())
noise_range = NOISE_RATIO * w_range
noise = np.random.uniform(-noise_range / 2.0,
noise_range / 2.0, weights.shape)
return np.add(noise, weights)


def init_dense_weight(layer):
    '''Initialize dense layer weights.
'''
units = layer.units
weight = np.eye(units)
bias = np.zeros(units)
layer.set_weights(
(add_noise(weight, np.array([0, 1])),
add_noise(bias, np.array([0, 1])))
)


def init_conv_weight(layer):
    '''Initialize conv layer weights.
'''
n_filters = layer.filters
filter_shape = (layer.kernel_size,) * get_n_dim(layer)
weight = np.zeros((n_filters, n_filters) + filter_shape)

center = tuple(map(lambda x: int((x - 1) / 2), filter_shape))
for i in range(n_filters):
filter_weight = np.zeros((n_filters,) + filter_shape)
index = (i,) + center
filter_weight[index] = 1
weight[i, ...] = filter_weight
bias = np.zeros(n_filters)

layer.set_weights(
(add_noise(weight, np.array([0, 1])),
add_noise(bias, np.array([0, 1])))
)


def init_bn_weight(layer):
    '''Initialize batch norm layer weights.
'''
n_filters = layer.num_features
new_weights = [
add_noise(np.ones(n_filters, dtype=np.float32), np.array([0, 1])),
add_noise(np.zeros(n_filters, dtype=np.float32), np.array([0, 1])),
add_noise(np.zeros(n_filters, dtype=np.float32), np.array([0, 1])),
add_noise(np.ones(n_filters, dtype=np.float32), np.array([0, 1])),
]
layer.set_weights(new_weights)
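
A hedged sketch (not part of the diff) of the Net2Net-style widening these helpers implement; it assumes two consecutive StubDense layers, where the second directly follows the first and both already carry weights:

def widen_dense_pair(pre_layer, next_layer, n_add):
    # the previous layer gains n_add output units copied (with noise) from existing ones
    new_pre = wider_pre_dense(pre_layer, n_add)
    # the next layer gains matching zero-initialised input columns, so the
    # overall function is (approximately) preserved
    new_next = wider_next_dense(next_layer, start_dim=pre_layer.units,
                                total_dim=pre_layer.units, n_add=n_add)
    return new_pre, new_next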

+ 765
- 0
dubhe-tadl/network_morphism/algorithm/layers.py View File

@@ -0,0 +1,765 @@
from abc import abstractmethod
from collections.abc import Iterable

import torch
from torch import nn
from torch.nn import functional
from utils import Constant


class AvgPool(nn.Module):
"""
AvgPool Module.
"""

def __init__(self):
super().__init__()

@abstractmethod
def forward(self, input_tensor):
pass


class GlobalAvgPool1d(AvgPool):
"""
GlobalAvgPool1d Module.
"""

def forward(self, input_tensor):
return functional.avg_pool1d(input_tensor, input_tensor.size()[2:]).view(
input_tensor.size()[:2]
)


class GlobalAvgPool2d(AvgPool):
"""
GlobalAvgPool2d Module.
"""

def forward(self, input_tensor):
return functional.avg_pool2d(input_tensor, input_tensor.size()[2:]).view(
input_tensor.size()[:2]
)


class GlobalAvgPool3d(AvgPool):
"""
GlobalAvgPool3d Module.
"""

def forward(self, input_tensor):
return functional.avg_pool3d(input_tensor, input_tensor.size()[2:]).view(
input_tensor.size()[:2]
)


class StubLayer:
"""
StubLayer Module. Base Module.
"""

def __init__(self, input_node=None, output_node=None):
self.input = input_node
self.output = output_node
self.weights = None

def build(self, shape):
"""
build shape.
"""

def set_weights(self, weights):
"""
set weights.
"""
self.weights = weights

def import_weights(self, torch_layer):
"""
import weights.
"""

def export_weights(self, torch_layer):
"""
export weights.
"""

def get_weights(self):
"""
get weights.
"""
return self.weights

def size(self):
"""
size().
"""
return 0

@property
def output_shape(self):
"""
output shape.
"""
return self.input.shape

def to_real_layer(self):
"""
to real layer.
"""

def __str__(self):
"""
str() function to print.
"""
return type(self).__name__[4:]


class StubWeightBiasLayer(StubLayer):
"""
StubWeightBiasLayer Module to set the bias.
"""

def import_weights(self, torch_layer):
self.set_weights(
(torch_layer.weight.data.cpu().numpy(),
torch_layer.bias.data.cpu().numpy())
)

def export_weights(self, torch_layer):
torch_layer.weight.data = torch.Tensor(self.weights[0])
torch_layer.bias.data = torch.Tensor(self.weights[1])



class StubBatchNormalization(StubWeightBiasLayer):
"""
StubBatchNormalization Module. Batch Norm.
"""

def __init__(self, num_features, input_node=None, output_node=None):
super().__init__(input_node, output_node)
self.num_features = num_features

def import_weights(self, torch_layer):
self.set_weights(
(
torch_layer.weight.data.cpu().numpy(),
torch_layer.bias.data.cpu().numpy(),
torch_layer.running_mean.cpu().numpy(),
torch_layer.running_var.cpu().numpy(),
)
)

def export_weights(self, torch_layer):
torch_layer.weight.data = torch.Tensor(self.weights[0])
torch_layer.bias.data = torch.Tensor(self.weights[1])
torch_layer.running_mean = torch.Tensor(self.weights[2])
torch_layer.running_var = torch.Tensor(self.weights[3])

def size(self):
return self.num_features * 4

@abstractmethod
def to_real_layer(self):
pass


class StubBatchNormalization1d(StubBatchNormalization):
"""
StubBatchNormalization1d Module.
"""

def to_real_layer(self):
return torch.nn.BatchNorm1d(self.num_features)


class StubBatchNormalization2d(StubBatchNormalization):
"""
StubBatchNormalization2d Module.
"""

def to_real_layer(self):
return torch.nn.BatchNorm2d(self.num_features)


class StubBatchNormalization3d(StubBatchNormalization):
"""
StubBatchNormalization3d Module.
"""

def to_real_layer(self):
return torch.nn.BatchNorm3d(self.num_features)


class StubDense(StubWeightBiasLayer):
"""
StubDense Module. Linear.
"""

def __init__(self, input_units, units, input_node=None, output_node=None):
super().__init__(input_node, output_node)
self.input_units = input_units
self.units = units

@property
def output_shape(self):
return (self.units,)

def size(self):
return self.input_units * self.units + self.units

def to_real_layer(self):
return torch.nn.Linear(self.input_units, self.units)


class StubConv(StubWeightBiasLayer):
"""
StubConv Module. Conv.
"""

def __init__(self, input_channel, filters, kernel_size,
stride=1, input_node=None, output_node=None):
super().__init__(input_node, output_node)
self.input_channel = input_channel
self.filters = filters
self.kernel_size = kernel_size
self.stride = stride
self.padding = int(self.kernel_size / 2)

@property
def output_shape(self):
ret = list(self.input.shape[:-1])
for index, dim in enumerate(ret):
ret[index] = (
int((dim + 2 * self.padding - self.kernel_size) / self.stride) + 1
)
ret = ret + [self.filters]
return tuple(ret)

def size(self):
return (self.input_channel * self.kernel_size *
self.kernel_size + 1) * self.filters

@abstractmethod
def to_real_layer(self):
pass

def __str__(self):
return (
super().__str__()
+ "("
+ ", ".join(
str(item)
for item in [
self.input_channel,
self.filters,
self.kernel_size,
self.stride,
]
)
+ ")"
)


class StubConv1d(StubConv):
"""
StubConv1d Module.
"""

def to_real_layer(self):
return torch.nn.Conv1d(
self.input_channel,
self.filters,
self.kernel_size,
stride=self.stride,
padding=self.padding,
)


class StubConv2d(StubConv):
"""
StubConv2d Module.
"""

def to_real_layer(self):
return torch.nn.Conv2d(
self.input_channel,
self.filters,
self.kernel_size,
stride=self.stride,
padding=self.padding,
)


class StubConv3d(StubConv):
"""
StubConv3d Module.
"""

def to_real_layer(self):
return torch.nn.Conv3d(
self.input_channel,
self.filters,
self.kernel_size,
stride=self.stride,
padding=self.padding,
)


class StubAggregateLayer(StubLayer):
"""
StubAggregateLayer Module.
"""

def __init__(self, input_nodes=None, output_node=None):
if input_nodes is None:
input_nodes = []
super().__init__(input_nodes, output_node)


class StubConcatenate(StubAggregateLayer):
"""StubConcatenate Module.
"""
@property
def output_shape(self):
ret = 0
for current_input in self.input:
ret += current_input.shape[-1]
ret = self.input[0].shape[:-1] + (ret,)
return ret

def to_real_layer(self):
return TorchConcatenate()


class StubAdd(StubAggregateLayer):
"""
StubAdd Module.
"""
@property
def output_shape(self):
return self.input[0].shape

def to_real_layer(self):
return TorchAdd()


class StubFlatten(StubLayer):
"""
StubFlatten Module.
"""
@property
def output_shape(self):
ret = 1
for dim in self.input.shape:
ret *= dim
return (ret,)

def to_real_layer(self):
return TorchFlatten()


class StubReLU(StubLayer):
"""
StubReLU Module.
"""

def to_real_layer(self):
return torch.nn.ReLU()


class StubSoftmax(StubLayer):
"""
StubSoftmax Module.
"""

def to_real_layer(self):
return torch.nn.LogSoftmax(dim=1)


class StubDropout(StubLayer):
"""
StubDropout Module.
"""

def __init__(self, rate, input_node=None, output_node=None):
super().__init__(input_node, output_node)
self.rate = rate

@abstractmethod
def to_real_layer(self):
pass


class StubDropout1d(StubDropout):
"""
StubDropout1d Module.
"""

def to_real_layer(self):
return torch.nn.Dropout(self.rate)


class StubDropout2d(StubDropout):
"""
StubDropout2d Module.
"""

def to_real_layer(self):
return torch.nn.Dropout2d(self.rate)


class StubDropout3d(StubDropout):
"""
StubDropout3d Module.
"""

def to_real_layer(self):
return torch.nn.Dropout3d(self.rate)


class StubInput(StubLayer):
"""
StubInput Module.
"""

def __init__(self, input_node=None, output_node=None):
super().__init__(input_node, output_node)


class StubPooling(StubLayer):
"""
StubPooling Module.
"""

def __init__(self,
kernel_size=None,
stride=None,
padding=0,
input_node=None,
output_node=None):
super().__init__(input_node, output_node)
self.kernel_size = (
kernel_size if kernel_size is not None else Constant.POOLING_KERNEL_SIZE
)
self.stride = stride if stride is not None else self.kernel_size
self.padding = padding

@property
def output_shape(self):
ret = tuple()
for dim in self.input.shape[:-1]:
ret = ret + (max(int((dim + 2 * self.padding) / self.kernel_size), 1),)
ret = ret + (self.input.shape[-1],)
return ret

@abstractmethod
def to_real_layer(self):
pass


class StubPooling1d(StubPooling):
"""
StubPooling1d Module.
"""

def to_real_layer(self):
return torch.nn.MaxPool1d(self.kernel_size, stride=self.stride)


class StubPooling2d(StubPooling):
"""
StubPooling2d Module.
"""

def to_real_layer(self):
return torch.nn.MaxPool2d(self.kernel_size, stride=self.stride)


class StubPooling3d(StubPooling):
"""
StubPooling3d Module.
"""

def to_real_layer(self):
return torch.nn.MaxPool3d(self.kernel_size, stride=self.stride)


class StubGlobalPooling(StubLayer):
"""
StubGlobalPooling Module.
"""

def __init__(self, input_node=None, output_node=None):
super().__init__(input_node, output_node)

@property
def output_shape(self):
return (self.input.shape[-1],)

@abstractmethod
def to_real_layer(self):
pass


class StubGlobalPooling1d(StubGlobalPooling):
"""
StubGlobalPooling1d Module.
"""

def to_real_layer(self):
return GlobalAvgPool1d()


class StubGlobalPooling2d(StubGlobalPooling):
"""
StubGlobalPooling2d Module.
"""

def to_real_layer(self):
return GlobalAvgPool2d()


class StubGlobalPooling3d(StubGlobalPooling):
"""
StubGlobalPooling3d Module.
"""

def to_real_layer(self):
return GlobalAvgPool3d()


class TorchConcatenate(nn.Module):
"""
TorchConcatenate Module.
"""

def forward(self, input_list):
return torch.cat(input_list, dim=1)


class TorchAdd(nn.Module):
"""
TorchAdd Module.
"""

def forward(self, input_list):
return input_list[0] + input_list[1]


class TorchFlatten(nn.Module):
"""
TorchFlatten Module.
"""

def forward(self, input_tensor):
return input_tensor.view(input_tensor.size(0), -1)


def is_layer(layer, layer_type):
"""
    Check whether ``layer`` is of the given ``layer_type``.

    Returns
    -------
    bool
        True or False for recognized layer types; None if ``layer_type`` is unrecognized.
"""

if layer_type == "Input":
return isinstance(layer, StubInput)
elif layer_type == "Conv":
return isinstance(layer, StubConv)
elif layer_type == "Dense":
return isinstance(layer, (StubDense,))
elif layer_type == "BatchNormalization":
return isinstance(layer, (StubBatchNormalization,))
elif layer_type == "Concatenate":
return isinstance(layer, (StubConcatenate,))
elif layer_type == "Add":
return isinstance(layer, (StubAdd,))
elif layer_type == "Pooling":
return isinstance(layer, StubPooling)
elif layer_type == "Dropout":
return isinstance(layer, (StubDropout,))
elif layer_type == "Softmax":
return isinstance(layer, (StubSoftmax,))
elif layer_type == "ReLU":
return isinstance(layer, (StubReLU,))
elif layer_type == "Flatten":
return isinstance(layer, (StubFlatten,))
elif layer_type == "GlobalAveragePooling":
return isinstance(layer, StubGlobalPooling)
    return None  # fallback for unrecognized layer types (not from the original author; adjust if needed)


def layer_description_extractor(layer, node_to_id):
"""
Get layer description.
"""

layer_input = layer.input
layer_output = layer.output
if layer_input is not None:
if isinstance(layer_input, Iterable):
layer_input = list(map(lambda x: node_to_id[x], layer_input))
else:
layer_input = node_to_id[layer_input]

if layer_output is not None:
layer_output = node_to_id[layer_output]

if isinstance(layer, StubConv):
return (
type(layer).__name__,
layer_input,
layer_output,
layer.input_channel,
layer.filters,
layer.kernel_size,
layer.stride,
layer.padding,
)
elif isinstance(layer, (StubDense,)):
return [
type(layer).__name__,
layer_input,
layer_output,
layer.input_units,
layer.units,
]
elif isinstance(layer, (StubBatchNormalization,)):
return (type(layer).__name__, layer_input,
layer_output, layer.num_features)
elif isinstance(layer, (StubDropout,)):
return (type(layer).__name__, layer_input, layer_output, layer.rate)
elif isinstance(layer, StubPooling):
return (
type(layer).__name__,
layer_input,
layer_output,
layer.kernel_size,
layer.stride,
layer.padding,
)
else:
return (type(layer).__name__, layer_input, layer_output)


def layer_description_builder(layer_information, id_to_node):
"""build layer from description.
"""
layer_type = layer_information[0]

layer_input_ids = layer_information[1]
if isinstance(layer_input_ids, Iterable):
layer_input = list(map(lambda x: id_to_node[x], layer_input_ids))
else:
layer_input = id_to_node[layer_input_ids]
layer_output = id_to_node[layer_information[2]]
if layer_type.startswith("StubConv"):
input_channel = layer_information[3]
filters = layer_information[4]
kernel_size = layer_information[5]
stride = layer_information[6]
return globals()[layer_type](
input_channel, filters, kernel_size, stride, layer_input, layer_output
)
elif layer_type.startswith("StubDense"):
input_units = layer_information[3]
units = layer_information[4]
return globals()[layer_type](input_units, units, layer_input, layer_output)
elif layer_type.startswith("StubBatchNormalization"):
num_features = layer_information[3]
return globals()[layer_type](num_features, layer_input, layer_output)
elif layer_type.startswith("StubDropout"):
rate = layer_information[3]
return globals()[layer_type](rate, layer_input, layer_output)
elif layer_type.startswith("StubPooling"):
kernel_size = layer_information[3]
stride = layer_information[4]
padding = layer_information[5]
return globals()[layer_type](kernel_size, stride, padding, layer_input, layer_output)
else:
return globals()[layer_type](layer_input, layer_output)


def layer_width(layer):
"""
Get layer width.
"""

if is_layer(layer, "Dense"):
return layer.units
if is_layer(layer, "Conv"):
return layer.filters
raise TypeError("The layer should be either Dense or Conv layer.")


def set_torch_weight_to_stub(torch_layer, stub_layer):
stub_layer.import_weights(torch_layer)


def set_stub_weight_to_torch(stub_layer, torch_layer):
stub_layer.export_weights(torch_layer)


def get_conv_class(n_dim):
conv_class_list = [StubConv1d, StubConv2d, StubConv3d]
return conv_class_list[n_dim - 1]


def get_dropout_class(n_dim):
dropout_class_list = [StubDropout1d, StubDropout2d, StubDropout3d]
return dropout_class_list[n_dim - 1]


def get_global_avg_pooling_class(n_dim):
global_avg_pooling_class_list = [
StubGlobalPooling1d,
StubGlobalPooling2d,
StubGlobalPooling3d,
]
return global_avg_pooling_class_list[n_dim - 1]


def get_pooling_class(n_dim):
pooling_class_list = [StubPooling1d, StubPooling2d, StubPooling3d]
return pooling_class_list[n_dim - 1]


def get_batch_norm_class(n_dim):
batch_norm_class_list = [
StubBatchNormalization1d,
StubBatchNormalization2d,
StubBatchNormalization3d,
]
return batch_norm_class_list[n_dim - 1]


def get_n_dim(layer):
if isinstance(layer, (
StubConv1d,
StubDropout1d,
StubGlobalPooling1d,
StubPooling1d,
StubBatchNormalization1d,
)):
return 1
if isinstance(layer, (
StubConv2d,
StubDropout2d,
StubGlobalPooling2d,
StubPooling2d,
StubBatchNormalization2d,
)):
return 2
if isinstance(layer, (
StubConv3d,
StubDropout3d,
StubGlobalPooling3d,
StubPooling3d,
StubBatchNormalization3d,
)):
return 3
return -1
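
A hedged sketch (not part of the diff) showing how stub layers are materialised into real torch modules and exchange weights with them, using only names defined in this file:

def stub_to_torch_example():
    stub = StubConv2d(input_channel=3, filters=16, kernel_size=3)
    real = stub.to_real_layer()            # torch.nn.Conv2d(3, 16, 3, stride=1, padding=1)
    set_torch_weight_to_stub(real, stub)   # copy the torch weights into the stub
    clone = stub.to_real_layer()
    set_stub_weight_to_torch(stub, clone)  # push them back into a fresh module
    return is_layer(stub, "Conv"), stub.size()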

+ 293
- 0
dubhe-tadl/network_morphism/algorithm/networkmorphism_searcher.py View File

@@ -0,0 +1,293 @@
import logging
import os
import shutil
from utils import Constant, OptimizeMode
from .bayesian import BayesianOptimizer
from .nn import CnnGenerator, ResNetGenerator, MlpGenerator
from .graph import graph_to_json, json_to_graph

logger = logging.getLogger(__name__)

class NetworkMorphismSearcher:
"""
    NetworkMorphismSearcher is a tuner that uses network morphism techniques.

Attributes
----------
n_classes : int
The class number or output node number (default: ``10``)
input_shape : tuple
A tuple including: (input_width, input_width, input_channel)
t_min : float
The minimum temperature for simulated annealing. (default: ``Constant.T_MIN``)
beta : float
The beta in acquisition function. (default: ``Constant.BETA``)
algorithm_name : str
algorithm name used in the network morphism (default: ``"Bayesian"``)
optimize_mode : str
optimize mode "minimize" or "maximize" (default: ``"minimize"``)
verbose : bool
verbose to print the log (default: ``True``)
bo : BayesianOptimizer
        The Bayesian optimizer used in the network morphism tuner.
max_model_size : int
max model size to the graph (default: ``Constant.MAX_MODEL_SIZE``)
default_model_len : int
default model length (default: ``Constant.MODEL_LEN``)
default_model_width : int
default model width (default: ``Constant.MODEL_WIDTH``)
search_space : dict
"""

def __init__(
self,
path,
best_selected_space_path,
task="cv",
input_width=32,
input_channel=3,
n_output_node=10,
algorithm_name="Bayesian",
optimize_mode="maximize",
verbose=True,
beta=Constant.BETA,
t_min=Constant.T_MIN,
max_model_size=Constant.MAX_MODEL_SIZE,
default_model_len=Constant.MODEL_LEN,
default_model_width=Constant.MODEL_WIDTH,
):
"""
        Initializer of the NetworkMorphismSearcher.
"""
self.path = path
self.best_selected_space_path = best_selected_space_path

if task == "cv":
self.generators = [CnnGenerator]
elif task == "common":
self.generators = [MlpGenerator]
else:
            raise NotImplementedError(
                '{} task not supported; expected one of ["cv", "common"]'.format(task))

self.n_classes = n_output_node
self.input_shape = (input_width, input_width, input_channel)

self.t_min = t_min
self.beta = beta
self.algorithm_name = algorithm_name
self.optimize_mode = OptimizeMode(optimize_mode)
self.json = None
self.total_data = {}
self.verbose = verbose

self.bo = BayesianOptimizer(
self, self.t_min, self.optimize_mode, self.beta)

self.training_queue = []
self.descriptors = []
self.history = []

self.max_model_size = max_model_size
self.default_model_len = default_model_len
self.default_model_width = default_model_width

def search(self, parameter_id, args):
"""
        Return a trial neural architecture as a serializable object.

Parameters
----------
parameter_id : int
"""
if not self.history:
self.init_search(args)

new_father_id = None
generated_graph = None
        # If the training queue already has an entry, it came from init_search;
        # otherwise there is already a history, so generate a new architecture.
if not self.training_queue:
new_father_id, generated_graph = self.generate()
new_model_id = args.trial_id
self.training_queue.append(
(generated_graph, new_father_id, new_model_id))
self.descriptors.append(generated_graph.extract_descriptor())

graph, father_id, model_id = self.training_queue.pop(0)

# from graph to json
json_out = graph_to_json(graph, os.path.join(self.path, str(model_id),'model_selected_space.json'))
self.total_data[parameter_id] = (json_out, father_id, model_id)

return json_out

def update_searcher(self, parameter_id, value, **kwargs):
"""
Record an observation of the objective function.

Parameters
----------
parameter_id : int
            the id of a group of parameters generated by the nni manager.
value : dict/float
if value is dict, it should have "default" key.
"""

if parameter_id not in self.total_data:
raise RuntimeError("Received parameter_id not in total_data.")

(_, father_id, model_id) = self.total_data[parameter_id]

graph = self.bo.searcher.load_model_by_id(model_id)

# to use the value and graph
self.add_model(value, model_id)
self.update(father_id, graph, value, model_id)


def init_search(self,args):
"""
Call the generators to generate the initial architectures for the search.
"""
if self.verbose:
logger.info("Initializing search.")
for generator in self.generators:
graph = generator(self.n_classes, self.input_shape).generate(
self.default_model_len, self.default_model_width
)
model_id = args.trial_id
self.training_queue.append((graph, -1, model_id))
self.descriptors.append(graph.extract_descriptor())

if self.verbose:
logger.info("Initialization finished.")


def generate(self):
"""
Generate the next neural architecture.

Returns
-------
other_info : any object
Anything to be saved in the training queue together with the architecture.
generated_graph : Graph
An instance of Graph.
"""
generated_graph, new_father_id = self.bo.generate(self.descriptors)
if new_father_id is None:
new_father_id = 0
generated_graph = self.generators[0](
self.n_classes, self.input_shape
).generate(self.default_model_len, self.default_model_width)

return new_father_id, generated_graph

def update(self, other_info, graph, metric_value, model_id):
"""
Update the controller with evaluation result of a neural architecture.

Parameters
----------
other_info: any object
In our case it is the father ID in the search tree.
graph: Graph
An instance of Graph. The trained neural architecture.
metric_value: float
The final evaluated metric value.
model_id: int
"""
father_id = other_info
self.bo.fit([graph.extract_descriptor()], [metric_value])
self.bo.add_child(father_id, model_id)

def add_model(self, metric_value, model_id):
"""
Add model to the history, x_queue and y_queue

Parameters
----------
metric_value : float
graph : dict
model_id : int

Returns
-------
model : dict
"""
if self.verbose:
logger.info("Saving model.")

# Update best_model text file
ret = {"model_id": model_id, "metric_value": metric_value}
self.history.append(ret)
# update best selected space
if model_id == self.get_best_model_id():
best_model_path = os.path.join(self.path, str(model_id),'model_selected_space.json')
shutil.copy(best_model_path, self.best_selected_space_path)
return ret


def get_best_model_id(self):
"""
Get the best model_id from history using the metric value
"""

if self.optimize_mode is OptimizeMode.Maximize:
return max(self.history, key=lambda x: x["metric_value"])[
"model_id"]
return min(self.history, key=lambda x: x["metric_value"])["model_id"]


def load_model_by_id(self, model_id):
"""
Get the model by model_id

Parameters
----------
model_id : int
model index

Returns
-------
load_model : Graph
the model graph representation
"""

with open(os.path.join(self.path, str(model_id), "model_selected_space.json")) as fin:
json_str = fin.read().replace("\n", "")

load_model = json_to_graph(json_str)
return load_model

def load_best_model(self):
"""
Get the best model by model id

Returns
-------
load_model : Graph
the model graph representation
"""
return self.load_model_by_id(self.get_best_model_id())

def get_metric_value_by_id(self, model_id):
"""
        Get the model metric value by its model_id

Parameters
----------
model_id : int
model index

Returns
-------
float
the model metric
"""
for item in self.history:
if item["model_id"] == model_id:
return item["metric_value"]
return None
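
A hedged per-trial driver sketch (not part of the diff); `args` is assumed to be an argparse.Namespace with at least `trial_id`, mirroring how network_morphism_train.py below uses the searcher:

def run_one_trial(searcher, args, train_fn):
    model_json = searcher.search(args.trial_id, args)    # propose an architecture as JSON
    metric = train_fn(model_json)                         # caller trains it and returns, e.g., accuracy
    if metric is not None:
        searcher.update_searcher(args.trial_id, metric)   # feed the result back to the Bayesian optimizer
    return metric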

+ 227
- 0
dubhe-tadl/network_morphism/algorithm/nn.py View File

@@ -0,0 +1,227 @@
from abc import abstractmethod

from .graph import Graph
from .layers import (StubAdd, StubDense, StubDropout1d,
StubReLU, get_batch_norm_class,
get_conv_class,
get_dropout_class,
get_global_avg_pooling_class,
get_pooling_class)
from utils import Constant


class NetworkGenerator:
"""The base class for generating a network.
It can be used to generate a CNN or Multi-Layer Perceptron.
Attributes:
n_output_node: Number of output nodes in the network.
input_shape: A tuple to represent the input shape.
"""

def __init__(self, n_output_node, input_shape):
self.n_output_node = n_output_node
self.input_shape = input_shape

@abstractmethod
def generate(self, model_len, model_width):
pass


class CnnGenerator(NetworkGenerator):
"""A class to generate CNN.
Attributes:
n_dim: `len(self.input_shape) - 1`
conv: A class that represents `(n_dim-1)` dimensional convolution.
dropout: A class that represents `(n_dim-1)` dimensional dropout.
global_avg_pooling: A class that represents `(n_dim-1)` dimensional Global Average Pooling.
pooling: A class that represents `(n_dim-1)` dimensional pooling.
batch_norm: A class that represents `(n_dim-1)` dimensional batch normalization.
"""

def __init__(self, n_output_node, input_shape):
super(CnnGenerator, self).__init__(n_output_node, input_shape)
self.n_dim = len(self.input_shape) - 1
if len(self.input_shape) > 4:
raise ValueError("The input dimension is too high.")
if len(self.input_shape) < 2:
raise ValueError("The input dimension is too low.")
self.conv = get_conv_class(self.n_dim)
self.dropout = get_dropout_class(self.n_dim)
self.global_avg_pooling = get_global_avg_pooling_class(self.n_dim)
self.pooling = get_pooling_class(self.n_dim)
self.batch_norm = get_batch_norm_class(self.n_dim)

def generate(self, model_len=None, model_width=None):
"""Generates a CNN.
Args:
model_len: An integer. Number of convolutional layers.
model_width: An integer. Number of filters for the convolutional layers.
Returns:
An instance of the class Graph. Represents the neural architecture graph of the generated model.
"""

if model_len is None:
model_len = Constant.MODEL_LEN
if model_width is None:
model_width = Constant.MODEL_WIDTH
pooling_len = int(model_len / 4)
graph = Graph(self.input_shape, False)
temp_input_channel = self.input_shape[-1]
output_node_id = 0
stride = 1
for i in range(model_len):
output_node_id = graph.add_layer(StubReLU(), output_node_id)
output_node_id = graph.add_layer(
self.batch_norm(
graph.node_list[output_node_id].shape[-1]), output_node_id
)
output_node_id = graph.add_layer(
self.conv(
temp_input_channel,
model_width,
kernel_size=3,
stride=stride),
output_node_id,
)
temp_input_channel = model_width
if pooling_len == 0 or (
(i + 1) % pooling_len == 0 and i != model_len - 1):
output_node_id = graph.add_layer(
self.pooling(), output_node_id)

output_node_id = graph.add_layer(
self.global_avg_pooling(), output_node_id)
output_node_id = graph.add_layer(
self.dropout(Constant.CONV_DROPOUT_RATE), output_node_id
)
output_node_id = graph.add_layer(
StubDense(graph.node_list[output_node_id].shape[0], model_width),
output_node_id,
)
output_node_id = graph.add_layer(StubReLU(), output_node_id)
graph.add_layer(
StubDense(
model_width,
self.n_output_node),
output_node_id)
return graph

class ResNetGenerator(NetworkGenerator):
def __init__(self, n_output_node, input_shape):
super(ResNetGenerator, self).__init__(n_output_node, input_shape)
# self.layers = [2, 2, 2, 2]
self.in_planes = 64
self.block_expansion = 1
self.n_dim = len(self.input_shape) - 1
if len(self.input_shape) > 4:
raise ValueError('The input dimension is too high.')
elif len(self.input_shape) < 2:
raise ValueError('The input dimension is too low.')
self.conv = get_conv_class(self.n_dim)
self.dropout = get_dropout_class(self.n_dim)
self.global_avg_pooling = get_global_avg_pooling_class(self.n_dim)
self.adaptive_avg_pooling = get_global_avg_pooling_class(self.n_dim)
self.batch_norm = get_batch_norm_class(self.n_dim)

def generate(self, model_len=None, model_width=None):
if model_width is None:
model_width = Constant.MODEL_WIDTH
graph = Graph(self.input_shape, False)
temp_input_channel = self.input_shape[-1]
output_node_id = 0
# output_node_id = graph.add_layer(StubReLU(), output_node_id)
output_node_id = graph.add_layer(self.conv(temp_input_channel, model_width, kernel_size=3), output_node_id)
output_node_id = graph.add_layer(self.batch_norm(model_width), output_node_id)
# output_node_id = graph.add_layer(self.pooling(kernel_size=3, stride=2, padding=1), output_node_id)

output_node_id = self._make_layer(graph, model_width, 2, output_node_id, 1)
model_width *= 2
output_node_id = self._make_layer(graph, model_width, 2, output_node_id, 2)
model_width *= 2
output_node_id = self._make_layer(graph, model_width, 2, output_node_id, 2)
model_width *= 2
output_node_id = self._make_layer(graph, model_width, 2, output_node_id, 2)

output_node_id = graph.add_layer(self.global_avg_pooling(), output_node_id)
graph.add_layer(StubDense(model_width * self.block_expansion, self.n_output_node), output_node_id)
return graph

def _make_layer(self, graph, planes, blocks, node_id, stride):
strides = [stride] + [1] * (blocks - 1)
out = node_id
for current_stride in strides:
out = self._make_block(graph, self.in_planes, planes, out, current_stride)
self.in_planes = planes * self.block_expansion
return out

def _make_block(self, graph, in_planes, planes, node_id, stride=1):
out = graph.add_layer(self.batch_norm(in_planes), node_id)
out = graph.add_layer(StubReLU(), out)
residual_node_id = out
out = graph.add_layer(self.conv(in_planes, planes, kernel_size=3, stride=stride), out)
out = graph.add_layer(self.batch_norm(planes), out)
out = graph.add_layer(StubReLU(), out)
out = graph.add_layer(self.conv(planes, planes, kernel_size=3), out)

residual_node_id = graph.add_layer(StubReLU(), residual_node_id)
residual_node_id = graph.add_layer(self.conv(in_planes,
planes * self.block_expansion,
kernel_size=1,
stride=stride), residual_node_id)
out = graph.add_layer(StubAdd(), (out, residual_node_id))
return out

class MlpGenerator(NetworkGenerator):
"""A class to generate Multi-Layer Perceptron.
"""

def __init__(self, n_output_node, input_shape):
"""Initialize the instance.
Args:
n_output_node: An integer. Number of output nodes in the network.
            input_shape: A tuple. Input shape of the network. If it is 1D, make sure the tuple keeps
                its trailing comma, e.g. ``(100,)``.
"""
super(MlpGenerator, self).__init__(n_output_node, input_shape)
if len(self.input_shape) > 1:
raise ValueError("The input dimension is too high.")

def generate(self, model_len=None, model_width=None):
"""Generates a Multi-Layer Perceptron.
Args:
model_len: An integer. Number of hidden layers.
model_width: An integer or a list of integers of length `model_len`. If it is a list, it represents the
number of nodes in each hidden layer. If it is an integer, all hidden layers have nodes equal to this
value.
Returns:
An instance of the class Graph. Represents the neural architecture graph of the generated model.
"""
if model_len is None:
model_len = Constant.MODEL_LEN
if model_width is None:
model_width = Constant.MODEL_WIDTH
if isinstance(model_width, list) and not len(model_width) == model_len:
raise ValueError(
"The length of 'model_width' does not match 'model_len'")
elif isinstance(model_width, int):
model_width = [model_width] * model_len

graph = Graph(self.input_shape, False)
output_node_id = 0
n_nodes_prev_layer = self.input_shape[0]
for width in model_width:
output_node_id = graph.add_layer(
StubDense(n_nodes_prev_layer, width), output_node_id
)
output_node_id = graph.add_layer(
StubDropout1d(Constant.MLP_DROPOUT_RATE), output_node_id
)
output_node_id = graph.add_layer(StubReLU(), output_node_id)
n_nodes_prev_layer = width

graph.add_layer(
StubDense(
n_nodes_prev_layer,
self.n_output_node),
output_node_id)
return graph
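
A hedged sketch (not part of the diff): generating an initial CNN graph for CIFAR-10-like input and turning it into a trainable torch module via Graph.produce_torch_model():

def build_initial_model(n_classes=10, input_shape=(32, 32, 3)):
    graph = CnnGenerator(n_classes, input_shape).generate(model_len=3, model_width=64)
    return graph.produce_torch_model()   # a torch.nn.Module assembled from the stub layers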

+ 64
- 0
dubhe-tadl/network_morphism/datasets.py View File

@@ -0,0 +1,64 @@
import numpy as np
import torch
import torchvision.transforms as transforms
from utils import Constant

class Cutout:
"""Randomly mask out one or more patches from an image.
Args:
n_holes (int): Number of patches to cut out of each image.
length (int): The length (in pixels) of each square patch.
"""

def __init__(self, n_holes, length):
self.n_holes = n_holes
self.length = length

def __call__(self, img):
"""
Args:
img (Tensor): Tensor image of size (C, H, W).
Returns:
Tensor: Image with n_holes of dimension length x length cut out of it.
"""
h, w = img.size(1), img.size(2)
mask = np.ones((h, w), np.float32)

for _ in range(self.n_holes):
y = np.random.randint(h)
x = np.random.randint(w)

y1 = np.clip(y - self.length // 2, 0, h)
y2 = np.clip(y + self.length // 2, 0, h)
x1 = np.clip(x - self.length // 2, 0, w)
x2 = np.clip(x + self.length // 2, 0, w)

mask[y1:y2, x1:x2] = 0.0

mask = torch.from_numpy(mask)
mask = mask.expand_as(img)
img *= mask
return img

def data_transforms_cifar10():
""" data_transforms for cifar10 dataset
"""

cifar_mean = [0.49139968, 0.48215827, 0.44653124]
cifar_std = [0.24703233, 0.24348505, 0.26158768]

train_transform = transforms.Compose(
[
transforms.RandomCrop(32, padding=4),
transforms.RandomHorizontalFlip(),
transforms.ToTensor(),
transforms.Normalize(cifar_mean, cifar_std),
Cutout(n_holes=Constant.CUTOUT_HOLES,
length=int(32 * Constant.CUTOUT_RATIO))
]
)

valid_transform = transforms.Compose(
[transforms.ToTensor(), transforms.Normalize(cifar_mean, cifar_std)]
)
return train_transform, valid_transform
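
A hedged sketch (not part of the diff): wiring the transforms above into CIFAR-10 data loaders, mirroring what NetworkMorphismTrainer does; the data directory and batch size are illustrative defaults:

import torchvision
import torch.utils.data as data

def cifar10_loaders(data_dir="./data", batch_size=128):
    train_tf, valid_tf = data_transforms_cifar10()
    trainset = torchvision.datasets.CIFAR10(root=data_dir, train=True, download=True, transform=train_tf)
    testset = torchvision.datasets.CIFAR10(root=data_dir, train=False, download=True, transform=valid_tf)
    return (data.DataLoader(trainset, batch_size=batch_size, shuffle=True),
            data.DataLoader(testset, batch_size=batch_size, shuffle=False))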

+ 57
- 0
dubhe-tadl/network_morphism/network_morphism_retrain.py View File

@@ -0,0 +1,57 @@
import sys
sys.path.append('../..')
import os
import logging
from pytorch.network_morphism.network_morphism_trainer import NetworkMorphismTrainer
import argparse
from pytorch.utils import init_logger, mkdirs
import json

logger = logging.getLogger(__name__)

class Retrain:
def __init__(self, args):
self.args = args

def run(self):
logger.info("Retraining the best model.")
        with open(self.args.best_selected_space_path, 'r') as f:
json_out = json.load(f)
json_out = json.dumps(json_out)

trainer = NetworkMorphismTrainer(json_out, self.args)
trainer.retrain()


if __name__ == "__main__":
parser = argparse.ArgumentParser("network_morphism_retrain")
parser.add_argument("--trial_id", type=int, default=0, help="Trial id")
parser.add_argument("--log_path", type=str,
default='./log', help="log for info")
parser.add_argument(
"--experiment_dir", type=str, default='./TADL', help="experiment level path"
)
parser.add_argument(
"--best_selected_space_path", type=str, default='./best_selected_space.json', help="Path to best selected space"
)
parser.add_argument(
"--result_path", type=str, default='./result.json', help="Path to result"
)
parser.add_argument(
"--best_checkpoint_dir", type=str, default='./', help="Path to checkpoint saved"
)
parser.add_argument(
"--data_dir", type=str, default='../data/', help="Path to dataset"
)
parser.add_argument("--batch_size", type=int,
default=128, help="batch size")
parser.add_argument("--opt", type=str, default="SGD", help="optimizer")
parser.add_argument("--epochs", type=int, default=200, help="epoch limit")
parser.add_argument(
"--lr", type=float, default=0.001, help="learning rate"
)
args = parser.parse_args()
mkdirs(args.result_path, args.log_path, args.best_checkpoint_dir)
init_logger(args.log_path)
retrain = Retrain(args)
retrain.run()

+ 22
- 0
dubhe-tadl/network_morphism/network_morphism_select.py View File

@@ -0,0 +1,22 @@
import sys
sys.path.append('../..')
from argparse import ArgumentParser
from pytorch.selector import Selector


class NetworkMorphismSelector(Selector):
def __init__(self, single_candidate=True):
super().__init__(single_candidate)

def fit(self):
pass


if __name__ == "__main__":
parser = ArgumentParser("NetworkMorphism select")
parser.add_argument("--best_selected_space_path", type=str,
default='./best_selected_space.json', help="final best selected space")

args = parser.parse_args()
select = NetworkMorphismSelector(True)
select.fit()

+ 87
- 0
dubhe-tadl/network_morphism/network_morphism_train.py View File

@@ -0,0 +1,87 @@
import sys
sys.path.append('../..')
import os
import logging
from pytorch.network_morphism.network_morphism_trainer import NetworkMorphismTrainer
from pytorch.network_morphism.algorithm.networkmorphism_searcher import NetworkMorphismSearcher
import argparse
import pickle
from pytorch.utils import init_logger, mkdirs

logger = logging.getLogger(__name__)

def create_dir(path):
if os.path.exists(path):
# shutil.rmtree(path)
return path
os.makedirs(path)
return path

class Train:
def __init__(self, args):
self.id = args.trial_id
self.trial_dir = os.path.join(
args.experiment_dir, 'train', str(args.trial_id))
self.searcher_dir = os.path.join(
args.experiment_dir, '{}.pkl'.format(NetworkMorphismSearcher.__name__))
self.args = args
self.searcher = None
# first trial
if not os.path.exists(self.searcher_dir):
self.searcher = NetworkMorphismSearcher(os.path.join(
args.experiment_dir, 'train'), args.best_selected_space_path)
else:
# load from previous round
with open(self.searcher_dir, 'rb') as f:
self.searcher = pickle.load(f)

def run_trial_job(self):
logger.info('trial {} search next model'.format(self.id))
model = self.searcher.search(self.id,self.args)

trainer = NetworkMorphismTrainer(model, self.args)
logger.info('trial {} run training script'.format(self.id))
metric = trainer.train()

        if metric is not None:
logger.info('trial {} receive trial result'.format(self.id))
self.searcher.update_searcher(self.id, metric)

with open(self.searcher_dir, 'wb') as f:
pickle.dump(self.searcher, f)


if __name__ == "__main__":
parser = argparse.ArgumentParser("network_morphism")
parser.add_argument("--trial_id", type=int, default=0, help="Trial id")
parser.add_argument(
"--data_dir", type=str, default='../data/', help="Path to dataset"
)
parser.add_argument(
"--log_path", type=str, default='./log', help="Path to log file"
)
parser.add_argument(
"--experiment_dir", type=str, default='./TADL', help="experiment level path"
)
parser.add_argument(
"--result_path", type=str, default='./result.json', help="trial level path to result"
)
parser.add_argument(
"--search_space_path", type=str, default='./search_space.json', help="experiment level path to search space"
)
parser.add_argument(
"--best_selected_space_path", type=str, default='./best_selected_space.json', help="experiment level path to best selected space"
)
parser.add_argument("--batch_size", type=int,
default=128, help="batch size")
parser.add_argument("--opt", type=str, default="SGD", help="optimizer")
parser.add_argument("--epochs", type=int, default=2, help="epoch limit")
parser.add_argument(
"--lr", type=float, default=0.001, help="learning rate"
)
args = parser.parse_args()
mkdirs(args.experiment_dir, args.result_path, args.log_path, args.search_space_path, args.best_selected_space_path)
create_dir(os.path.join(args.experiment_dir,'train',str(args.trial_id)))
init_logger(args.log_path)
train = Train(args)
train.run_trial_job()

+ 175
- 0
dubhe-tadl/network_morphism/network_morphism_trainer.py View File

@@ -0,0 +1,175 @@
import logging
from algorithm.graph import json_to_graph

import torch
import torch.nn as nn
import torch.optim as optim
import torch.utils.data as data
import torchvision
import re

import datasets
from utils import Constant, EarlyStop, save_json_result
from pytorch.utils import save_best_checkpoint

# pylint: disable=W0603
# set the logger format
logger = logging.getLogger(__name__)

class NetworkMorphismTrainer:
def __init__(self, model_json, args):
self.device = "cuda" if torch.cuda.is_available() else "cpu"
self.batch_size = args.batch_size
self.epochs = args.epochs
self.lr = args.lr
self.optimizer_name = args.opt
self.data_dir = args.data_dir
self.trial_id = args.trial_id
self.args = args

# Loading Data
logger.info("Preparing data..")

transform_train, transform_test = datasets.data_transforms_cifar10()

trainset = torchvision.datasets.CIFAR10(
root=self.data_dir, train=True, download=True, transform=transform_train
)
self.trainloader = data.DataLoader(
trainset, batch_size=self.batch_size, shuffle=True, num_workers=1
)

testset = torchvision.datasets.CIFAR10(
root=self.data_dir, train=False, download=True, transform=transform_test
)
self.testloader = data.DataLoader(
testset, batch_size=self.batch_size, shuffle=False, num_workers=1
)

# Model
logger.info("Building model..")
# build model from json representation
self.graph = json_to_graph(model_json)

self.net = self.graph.produce_torch_model()

if self.device == "cuda" and torch.cuda.device_count() > 1:
self.net = nn.DataParallel(self.net)
self.net.to(self.device)

self.criterion = nn.CrossEntropyLoss()
if self.optimizer_name == "SGD":
self.optimizer = optim.SGD(
self.net.parameters(), lr=self.lr, momentum=0.9, weight_decay=3e-4
)
if self.optimizer_name == "Adadelta":
self.optimizer = optim.Adadelta(self.net.parameters(), lr=self.lr)
if self.optimizer_name == "Adagrad":
self.optimizer = optim.Adagrad(self.net.parameters(), lr=self.lr)
if self.optimizer_name == "Adam":
self.optimizer = optim.Adam(self.net.parameters(), lr=self.lr)
if self.optimizer_name == "Adamax":
self.optimizer = optim.Adamax(self.net.parameters(), lr=self.lr)
if self.optimizer_name == "RMSprop":
self.optimizer = optim.RMSprop(self.net.parameters(), lr=self.lr)

self.scheduler = optim.lr_scheduler.CosineAnnealingLR(
self.optimizer, self.epochs)

def train_one_epoch(self):
"""
        Train the model for one epoch on the training set.
"""
self.net.train()

for batch_idx, (inputs, targets) in enumerate(self.trainloader):
inputs, targets = inputs.to(self.device), targets.to(self.device)
self.optimizer.zero_grad()
outputs = self.net(inputs)
loss = self.criterion(outputs, targets)
loss.backward()
self.optimizer.step()

def validate_one_epoch(self, epoch):
""" eval model on each epoch in testset
"""
self.net.eval()
test_loss = 0
correct = 0
total = 0
with torch.no_grad():
for batch_idx, (inputs, targets) in enumerate(self.testloader):
inputs, targets = inputs.to(
self.device), targets.to(self.device)
outputs = self.net(inputs)
loss = self.criterion(outputs, targets)

test_loss += loss.item()
_, predicted = outputs.max(1)
total += targets.size(0)
correct += predicted.eq(targets).sum().item()

acc = correct / total
logger.info("Epoch: %d, accuracy: %.3f", epoch, acc)
result = {"type": "Accuracy", "result": {
"sequence": epoch, "category": "epoch", "value": acc}}

save_json_result(self.args.result_path, result)
return test_loss, acc

def train(self):
try:
max_no_improvement_num = Constant.MAX_NO_IMPROVEMENT_NUM
early_stop = EarlyStop(max_no_improvement_num)
early_stop.on_train_begin()
test_metric_value_list = []

for ep in range(self.epochs):
self.train_one_epoch()
test_loss, test_acc = self.validate_one_epoch(ep)
self.scheduler.step()
test_metric_value_list.append(test_acc)
decreasing = early_stop.on_epoch_end(test_loss)
if not decreasing:
break

last_num = min(max_no_improvement_num, self.epochs)
estimated_performance = sum(
test_metric_value_list[-last_num:]) / last_num

logger.info("final accuracy: %.3f", estimated_performance)


except RuntimeError as e:
if not re.search('out of memory', str(e)):
raise e
            print(
                '\nCurrent model size is too big. Discontinuing training of this model to search for other models.')
Constant.MAX_MODEL_SIZE = self.graph.size()-1
return None
except Exception as e:
logger.exception(e)
raise

return estimated_performance

def retrain(self):
logger.info("here")
try:
best_acc = 0.0
for ep in range(self.epochs):
                logger.info("Retrain epoch %d", ep)
self.train_one_epoch()
_, test_acc = self.validate_one_epoch(ep)
self.scheduler.step()
if test_acc > best_acc:
best_acc = test_acc
save_best_checkpoint(self.args.best_checkpoint_dir,
self.net, self.optimizer, self.epochs)

logger.info("final accuracy: %.3f", best_acc)

except Exception as exception:
logger.exception(exception)
raise

+ 102
- 0
dubhe-tadl/network_morphism/utils.py View File

@@ -0,0 +1,102 @@
from enum import Enum
import json

class Constant:
# Data
CUTOUT_HOLES = 1
CUTOUT_RATIO = 0.5

# Searcher
MAX_MODEL_NUM = 1000
MAX_LAYERS = 200
N_NEIGHBOURS = 8
MAX_MODEL_SIZE = (1 << 25)
MAX_LAYER_WIDTH = 4096
KERNEL_LAMBDA = 1.0
BETA = 2.576
T_MIN = 0.0001


MLP_MODEL_LEN = 3
MLP_MODEL_WIDTH = 5
MODEL_LEN = 3
MODEL_WIDTH = 64
POOLING_KERNEL_SIZE = 2
DENSE_DROPOUT_RATE = 0.5
CONV_DROPOUT_RATE = 0.25
MLP_DROPOUT_RATE = 0.25
CONV_BLOCK_DISTANCE = 2

# trainer
MAX_NO_IMPROVEMENT_NUM = 5
MIN_LOSS_DEC = 1e-4


class OptimizeMode(Enum):
"""Optimize Mode class

if OptimizeMode is 'minimize', it means the tuner need to minimize the reward
that received from Trial.

if OptimizeMode is 'maximize', it means the tuner need to maximize the reward
that received from Trial.
"""
Minimize = 'minimize'
Maximize = 'maximize'

class EarlyStop:
"""A class check for early stop condition.
Attributes:
training_losses: Record all the training loss.
minimum_loss: The minimum loss we achieve so far. Used to compared to determine no improvement condition.
no_improvement_count: Current no improvement count.
_max_no_improvement_num: The maximum number specified.
_done: Whether condition met.
_min_loss_dec: A threshold for loss improvement.
"""

def __init__(self, max_no_improvement_num=None, min_loss_dec=None):
self.training_losses = []
self.minimum_loss = None
self.no_improvement_count = 0
self._max_no_improvement_num = max_no_improvement_num if max_no_improvement_num is not None \
else Constant.MAX_NO_IMPROVEMENT_NUM
self._done = False
self._min_loss_dec = min_loss_dec if min_loss_dec is not None else Constant.MIN_LOSS_DEC

def on_train_begin(self):
"""Initiate the early stop condition.
Call on every time the training iteration begins.
"""
self.training_losses = []
self.no_improvement_count = 0
self._done = False
self.minimum_loss = float('inf')

def on_epoch_end(self, loss):
"""Check the early stop condition.
Call on every time the training iteration end.
Args:
loss: The loss function achieved by the epoch.
Returns:
True if condition met, otherwise False.
"""
self.training_losses.append(loss)
if self._done and loss > (self.minimum_loss - self._min_loss_dec):
return False

if loss > (self.minimum_loss - self._min_loss_dec):
self.no_improvement_count += 1
else:
self.no_improvement_count = 0
self.minimum_loss = loss

if self.no_improvement_count > self._max_no_improvement_num:
self._done = True

return True
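
# A minimal usage sketch of EarlyStop (the loss values are illustrative only):
#
#   early_stop = EarlyStop(max_no_improvement_num=3)
#   early_stop.on_train_begin()
#   for loss in [1.0, 0.8, 0.81, 0.82, 0.83, 0.84, 0.85]:
#       if not early_stop.on_epoch_end(loss):
#           break  # on_epoch_end returns False once the no-improvement streak exceeds the limit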

def save_json_result(path, data):
with open(path,'a') as f:
json.dump(data,f)
f.write('\n')
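
# save_json_result appends one JSON object per line to the result file; an
# illustrative line (values are examples only) looks like:
#   {"type": "Accuracy", "result": {"sequence": 0, "category": "epoch", "value": 0.91}}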

+ 1
- 0
dubhe-tadl/pcdarts/__init__.py

@@ -0,0 +1 @@
from pytorch.pcdarts.pcdartsmutator import PCdartsMutator

+ 227
- 0
dubhe-tadl/pcdarts/model.py

@@ -0,0 +1,227 @@
# Copyright (c) Microsoft Corporation.
# Licensed under the MIT license.

from collections import OrderedDict

import torch
import torch.nn as nn

from pytorch import mutables
from pytorch.darts import ops

def random_channel_shuffle(x):
num_channels = x.data.size()[1]
indices = torch.randperm(num_channels)
x = x[:, indices]
return x

def channel_shuffle(x, groups):
batchsize, num_channels, height, width = x.data.size()

channels_per_group = num_channels // groups
# reshape
x = x.view(batchsize, groups,
channels_per_group, height, width)

x = torch.transpose(x, 1, 2).contiguous()

# flatten
x = x.view(batchsize, -1, height, width)

return x
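
# Illustrative example (assuming 8 channels and groups=4): channel order
# [0, 1, 2, 3, 4, 5, 6, 7] becomes [0, 2, 4, 6, 1, 3, 5, 7], so the op-processed
# and bypassed partial channels are mixed across groups.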

class AuxiliaryHead(nn.Module):
""" Auxiliary head in 2/3 place of network to let the gradient flow well """

def __init__(self, input_size, C, n_classes):
""" assuming input size 7x7 or 8x8 """
assert input_size in [7, 8]
super().__init__()
self.net = nn.Sequential(
nn.ReLU(inplace=True),
nn.AvgPool2d(5, stride=input_size - 5, padding=0, count_include_pad=False), # 2x2 out
nn.Conv2d(C, 128, kernel_size=1, bias=False),
nn.BatchNorm2d(128),
nn.ReLU(inplace=True),
nn.Conv2d(128, 768, kernel_size=2, bias=False), # 1x1 out
nn.BatchNorm2d(768),
nn.ReLU(inplace=True)
)
self.linear = nn.Linear(768, n_classes)

def forward(self, x):
out = self.net(x)
out = out.view(out.size(0), -1) # flatten
logits = self.linear(out)
return logits


class Node(nn.Module):
def __init__(self, node_id, num_prev_nodes, channels, k, num_downsample_connect, search):
super().__init__()
if search:
self.k = k
partial_channels = channels // k
else:
partial_channels = channels
self.search = search
self.ops = nn.ModuleList()
choice_keys = []
for i in range(num_prev_nodes):
stride = 2 if i < num_downsample_connect else 1
choice_keys.append("{}_p{}".format(node_id, i))
self.ops.append(
mutables.LayerChoice(OrderedDict([
("maxpool", ops.PoolBN('max', partial_channels, 3, stride, 1, affine=False)),
("avgpool", ops.PoolBN('avg', partial_channels, 3, stride, 1, affine=False)),
("skipconnect", nn.Identity() if stride == 1 else ops.FactorizedReduce(partial_channels, partial_channels, affine=False)),
("sepconv3x3", ops.SepConv(partial_channels, partial_channels, 3, stride, 1, affine=False)),
("sepconv5x5", ops.SepConv(partial_channels, partial_channels, 5, stride, 2, affine=False)),
("dilconv3x3", ops.DilConv(partial_channels, partial_channels, 3, stride, 2, 2, affine=False)),
("dilconv5x5", ops.DilConv(partial_channels, partial_channels, 5, stride, 4, 2, affine=False))
]), key=choice_keys[-1]))
self.drop_path = ops.DropPath()
self.input_switch = mutables.InputChoice(choose_from=choice_keys, n_chosen=2, key="{}_switch".format(node_id))
self.pool = nn.MaxPool2d(2,2)

def forward(self, prev_nodes):
assert len(self.ops) == len(prev_nodes), "len(self.ops) != len(prev_nodes) in Node"
# for each candidate predecessor of each intermediate node
if self.search:
# in search
results = []
for op, x in zip(self.ops, prev_nodes):
# channel shuffle
channels = x.shape[1]
# channel proportion k=4
temp0 = x[ : , : channels//self.k, : , :]
temp1 = x[ : ,channels//self.k : , : , :]
out = op(temp0)
# normal
if out.shape[2] == x.shape[2]:
result = torch.cat([out, temp1], dim=1)
# reduction
else:
result = torch.cat([out, self.pool(temp1)], dim=1)
results.append(channel_shuffle(result, self.k))

# # channel random shuffle
# channels = random_channel_shuffle(x).shape[1]
# # channel proportion k=4
# temp0 = x[ : , : channels//self.k, : , :]
# temp1 = x[ : ,channels//self.k : , : , :]
# out = op(temp0)
# # normal
# if out.shape[2] == x.shape[2]:
# result = torch.cat([out, temp1], dim=1)
# # reduction
# else:
# result = torch.cat([out, self.pool(temp1)], dim=1)
# results.append(result)
else:
# in retrain, no channel shuffle
results = [op(node) for op, node in zip(self.ops, prev_nodes)]
output = [self.drop_path(re) if re is not None else None for re in results]
return self.input_switch(output)
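
# Partial-channel illustration (values are examples only): with channels=64 and
# k=4, each candidate op in the search branch above processes only the first 16
# channels of its input; the remaining 48 channels bypass the op (max-pooled by
# self.pool when the op reduces spatial size) and are concatenated back before
# channel_shuffle is applied.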


class Cell(nn.Module):

def __init__(self, n_nodes, channels_pp, channels_p, channels, reduction_p, reduction, k, search):
super().__init__()
self.reduction = reduction
self.n_nodes = n_nodes

# If previous cell is reduction cell, current input size does not match with
# output size of cell[k-2]. So the output[k-2] should be reduced by preprocessing.
if reduction_p:
self.preproc0 = ops.FactorizedReduce(channels_pp, channels, affine=False)
else:
self.preproc0 = ops.StdConv(channels_pp, channels, 1, 1, 0, affine=False)
self.preproc1 = ops.StdConv(channels_p, channels, 1, 1, 0, affine=False)

# generate dag
self.mutable_ops = nn.ModuleList()
for depth in range(2, self.n_nodes + 2):
self.mutable_ops.append(Node("{}_n{}".format("reduce" if reduction else "normal", depth), depth, channels, k, 2 if reduction else 0, search))

def forward(self, s0, s1):
# s0, s1 are the outputs of previous previous cell and previous cell, respectively.
tensors = [self.preproc0(s0), self.preproc1(s1)]
for node in self.mutable_ops:
cur_tensor = node(tensors)
tensors.append(cur_tensor)

output = torch.cat(tensors[2:], dim=1)
return output


class CNN(nn.Module):

def __init__(self, input_size, in_channels, channels, n_classes, n_layers, k=4, n_nodes=4, stem_multiplier=3, auxiliary=False, search=True):
super().__init__()
self.in_channels = in_channels
self.channels = channels
self.n_classes = n_classes
self.n_layers = n_layers
self.n_nodes = n_nodes
self.aux_pos = 2 * n_layers // 3 if auxiliary else -1

c_cur = stem_multiplier * self.channels
self.stem = nn.Sequential(
nn.Conv2d(in_channels, c_cur, 3, 1, 1, bias=False),
nn.BatchNorm2d(c_cur)
)

# for the first cell, stem is used for both s0 and s1
# [!] channels_pp and channels_p is output channel size, but c_cur is input channel size.
channels_pp, channels_p, c_cur = c_cur, c_cur, channels

self.cells = nn.ModuleList()
reduction_p, reduction = False, False
for i in range(n_layers):
reduction_p, reduction = reduction, False
# Reduce featuremap size and double channels in 1/3 and 2/3 layer.
if i in [n_layers // 3, 2 * n_layers // 3]:
c_cur *= 2
reduction = True

cell = Cell(n_nodes, channels_pp, channels_p, c_cur, reduction_p, reduction, k, search)
self.cells.append(cell)
c_cur_out = c_cur * n_nodes
channels_pp, channels_p = channels_p, c_cur_out

if i == self.aux_pos:
self.aux_head = AuxiliaryHead(input_size // 4, channels_p, n_classes)

self.gap = nn.AdaptiveAvgPool2d(1)
self.linear = nn.Linear(channels_p, n_classes)

def forward(self, x):
s0 = s1 = self.stem(x)

aux_logits = None
for i, cell in enumerate(self.cells):
s0, s1 = s1, cell(s0, s1)
if i == self.aux_pos and self.training:
aux_logits = self.aux_head(s1)

out = self.gap(s1)
out = out.view(out.size(0), -1) # flatten
logits = self.linear(out)

if aux_logits is not None:
return logits, aux_logits
return logits

def drop_path_prob(self, p):
for module in self.modules():
if isinstance(module, ops.DropPath):
module.p = p

def _loss(self, input, target):
logits = self(input)
return self._criterion(logits, target)
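
# Note: _loss assumes a criterion has been attached to the model elsewhere
# (e.g. model._criterion = nn.CrossEntropyLoss()); it is not set in this file.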

+ 204
- 0
dubhe-tadl/pcdarts/pcdarts_retrain.py

@@ -0,0 +1,204 @@
import sys
sys.path.append('../..')
import os
import logging
import time
from argparse import ArgumentParser

import torch
import torch.nn as nn
import numpy as np
# from torch.utils.tensorboard import SummaryWriter
import torch.backends.cudnn as cudnn

from model import CNN
from pytorch.fixed import apply_fixed_architecture
from pytorch.utils import set_seed, mkdirs, init_logger, save_best_checkpoint, AverageMeter
from pytorch.darts import utils
from pytorch.darts import datasets
from pytorch.retrainer import Retrainer

logger = logging.getLogger(__name__)
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
# writer = SummaryWriter()

class PCdartsRetrainer(Retrainer):
def __init__(self, aux_weight, grad_clip, epochs, log_frequency):
self.aux_weight = aux_weight
self.grad_clip = grad_clip
self.epochs = epochs
self.log_frequency = log_frequency

def train(self, train_loader, model, optimizer, criterion, epoch):
top1 = AverageMeter("top1")
top5 = AverageMeter("top5")
losses = AverageMeter("losses")

cur_step = epoch * len(train_loader)
cur_lr = optimizer.param_groups[0]["lr"]
logger.info("Epoch %d LR %.6f", epoch, cur_lr)
# writer.add_scalar("lr", cur_lr, global_step=cur_step)

model.train()

for step, (x, y) in enumerate(train_loader):
x, y = x.to(device, non_blocking=True), y.to(device, non_blocking=True)
bs = x.size(0)

optimizer.zero_grad()
logits, aux_logits = model(x)
loss = criterion(logits, y)
if self.aux_weight > 0.:
loss += self.aux_weight * criterion(aux_logits, y)
loss.backward()
# gradient clipping
nn.utils.clip_grad_norm_(model.parameters(), self.grad_clip)
optimizer.step()

accuracy = utils.accuracy(logits, y, topk=(1, 5))
losses.update(loss.item(), bs)
top1.update(accuracy["acc1"], bs)
top5.update(accuracy["acc5"], bs)
# writer.add_scalar("loss/train", loss.item(), global_step=cur_step)
# writer.add_scalar("acc1/train", accuracy["acc1"], global_step=cur_step)
# writer.add_scalar("acc5/train", accuracy["acc5"], global_step=cur_step)

if step % self.log_frequency == 0 or step == len(train_loader) - 1:
logger.info(
"Train: [{:3d}/{}] Step {:03d}/{:03d} Loss {losses.avg:.3f} "
"Prec@(1,5) ({top1.avg:.1%}, {top5.avg:.1%})".format(
epoch + 1, self.epochs, step, len(train_loader) - 1, losses=losses,
top1=top1, top5=top5))

cur_step += 1

logger.info("Train: [{:3d}/{}] Final Prec@1 {:.4%}".format(epoch + 1, self.epochs, top1.avg))


def validate(self, valid_loader, model, criterion, epoch, cur_step):
top1 = AverageMeter("top1")
top5 = AverageMeter("top5")
losses = AverageMeter("losses")

model.eval()

with torch.no_grad():
for step, (X, y) in enumerate(valid_loader):
X, y = X.to(device, non_blocking=True), y.to(device, non_blocking=True)
bs = X.size(0)

logits = model(X)
loss = criterion(logits, y)

accuracy = utils.accuracy(logits, y, topk=(1, 5))
losses.update(loss.item(), bs)
top1.update(accuracy["acc1"], bs)
top5.update(accuracy["acc5"], bs)

if step % self.log_frequency == 0 or step == len(valid_loader) - 1:
logger.info(
"Valid: [{:3d}/{}] Step {:03d}/{:03d} Loss {losses.avg:.3f} "
"Prec@(1,5) ({top1.avg:.1%}, {top5.avg:.1%})".format(
epoch + 1, self.epochs, step, len(valid_loader) - 1, losses=losses,
top1=top1, top5=top5))

# writer.add_scalar("loss/test", losses.avg, global_step=cur_step)
# writer.add_scalar("acc1/test", top1.avg, global_step=cur_step)
# writer.add_scalar("acc5/test", top5.avg, global_step=cur_step)

logger.info("Valid: [{:3d}/{}] Final Prec@1 {:.4%}".format(epoch + 1, self.epochs, top1.avg))

return top1.avg

if __name__ == "__main__":
parser = ArgumentParser("PCDARTS retrain")
parser.add_argument("--data_dir", type=str,
default='./', help="search_space json file")
parser.add_argument("--result_path", type=str,
default='./result.json', help="training result")
parser.add_argument("--log_path", type=str,
default='.0/log', help="log for info")
parser.add_argument("--best_selected_space_path", type=str,
default='./best_selected_space.json', help="final best selected space")
parser.add_argument("--best_checkpoint_dir", type=str,
default='', help="default name is best_checkpoint_epoch{}.pth")
parser.add_argument('--trial_id', type=int, default=0, metavar='N',
help='trial_id, starting from 0')
parser.add_argument("--layers", default=20, type=int)
parser.add_argument("--lr", default=0.01, type=float)
parser.add_argument("--batch_size", default=96, type=int)
parser.add_argument("--log_frequency", default=10, type=int)
parser.add_argument("--epochs", default=600, type=int)
parser.add_argument("--aux_weight", default=0.4, type=float)
parser.add_argument("--drop_path_prob", default=0.2, type=float)
parser.add_argument("--workers", default=4, type=int)
parser.add_argument("--class_num", default=10, type=int, help="cifar10")
parser.add_argument("--channels", default=36, type=int)
parser.add_argument("--grad_clip", default=6., type=float)
args = parser.parse_args()

mkdirs(args.result_path, args.log_path, args.best_checkpoint_dir)
init_logger(args.log_path)
logger.info(args)
set_seed(args.trial_id)
logger.info("loading data")
dataset_train, dataset_valid = datasets.get_dataset("cifar10", cutout_length=16, root=args.data_dir)

model = CNN(32, 3, args.channels, args.class_num, args.layers, auxiliary=True, search=False)
apply_fixed_architecture(model, args.best_selected_space_path)
criterion = nn.CrossEntropyLoss()

model.to(device)
criterion.to(device)

optimizer = torch.optim.SGD(model.parameters(), args.lr, momentum=0.9, weight_decay=3.0E-4)
lr_scheduler = torch.optim.lr_scheduler.CosineAnnealingLR(optimizer, args.epochs, eta_min=1E-6)

train_loader = torch.utils.data.DataLoader(dataset_train,
batch_size=args.batch_size,
shuffle=True,
num_workers=args.workers,
pin_memory=True)
valid_loader = torch.utils.data.DataLoader(dataset_valid,
batch_size=args.batch_size,
shuffle=False,
num_workers=args.workers,
pin_memory=True)

retrainer = PCdartsRetrainer(aux_weight=args.aux_weight,
grad_clip=args.grad_clip,
epochs=args.epochs,
log_frequency = args.log_frequency)
# result = {"Accuracy": [], "Cost_time": ''}
best_top1 = 0.
start_time = time.time()
for epoch in range(args.epochs):
drop_prob = args.drop_path_prob * epoch / args.epochs
model.drop_path_prob(drop_prob)

# training
retrainer.train(train_loader, model, optimizer, criterion, epoch)

# validation
cur_step = (epoch + 1) * len(train_loader)
top1 = retrainer.validate(valid_loader, model, criterion, epoch, cur_step)
# filtered by the backend from the terminal output, e.g. {"type": "Accuracy", "result": {"sequence": 1, "category": "epoch", "value":96.7}}
logger.info({"type": "Accuracy", "result": {"sequence": epoch, "category": "epoch", "value": top1}})
with open(args.result_path, "a") as file:
file.write(str({"type": "Accuracy", "result": {"sequence": epoch, "category": "epoch", "value": top1}}) + '\n')
# result["Accuracy"].append(top1)
best_top1 = max(best_top1, top1)

lr_scheduler.step()

logger.info("Final best Prec@1 = {:.4%}".format(best_top1))
cost_time = time.time() - start_time
# filtered by the backend from the terminal output, e.g. {"type": "Cost_time", "result": {"value": "* s"}}
logger.info({"type": "Cost_time", "result": {"value": str(cost_time) + ' s'}})
with open(args.result_path, "a") as file:
file.write(str({"type": "Cost_time", "result": {"value": str(cost_time) + ' s'}}))
# result["Cost_time"] = str(cost_time) + ' s'
# dump_global_result(args.result_path, result)
save_best_checkpoint(args.best_checkpoint_dir, model, optimizer, epoch)
logger.info("Save best checkpoint in {}".format(os.path.join(args.best_checkpoint_dir, "best_checkpoint_epoch{}.pth".format(epoch))))

+ 21
- 0
dubhe-tadl/pcdarts/pcdarts_select.py

@@ -0,0 +1,21 @@
import sys
sys.path.append('../..')
from pytorch.selector import Selector
from argparse import ArgumentParser


class PCdartsSelector(Selector):
def __init__(self, single_candidate=True):
super().__init__(single_candidate)
def fit(self):
pass

if __name__ == "__main__":
parser = ArgumentParser("DARTS select")
parser.add_argument("--best_selected_space_path", type=str,
default='./best_selected_space.json', help="final best selected space")

args = parser.parse_args()
darts_selector = PCdartsSelector(True)
darts_selector.fit()

+ 93
- 0
dubhe-tadl/pcdarts/pcdarts_train.py

@@ -0,0 +1,93 @@
# Copyright (c) Microsoft Corporation.
# Licensed under the MIT license.
import sys
sys.path.append('../..')
import time
from argparse import ArgumentParser

from model import CNN
import torch
import torch.nn as nn

from pytorch.callbacks import BestArchitectureCheckpoint, LRSchedulerCallback
from pytorch.pcdarts import PCdartsMutator
from pytorch.darts import DartsTrainer
from pytorch.darts.utils import accuracy
from pytorch.darts import datasets
from pytorch.utils import *

logger = logging.getLogger(__name__)

if __name__ == "__main__":
parser = ArgumentParser("PCDARTS train")
parser.add_argument("--data_dir", type=str,
default='../data/', help="search_space json file")
parser.add_argument("--result_path", type=str,
default='.0/result.json', help="training result")
parser.add_argument("--log_path", type=str,
default='.0/log', help="log for info")
parser.add_argument("--search_space_path", type=str,
default='./search_space.json', help="search space of PDARTS")
parser.add_argument("--best_selected_space_path", type=str,
default='./best_selected_space.json', help="final best selected space")
parser.add_argument('--trial_id', type=int, default=0, metavar='N',
help='trial_id, starting from 0')
parser.add_argument('--model_lr', type=float, default=0.1, help='learning rate for training model weights')
parser.add_argument('--arch_lr', type=float, default=6e-4, help='learning rate for training architecture')
parser.add_argument("--nodes", default=4, type=int)
parser.add_argument("--layers", default=8, type=int)
parser.add_argument("--channels", default=16, type=int)
parser.add_argument("--batch_size", default=96, type=int)
parser.add_argument("--log_frequency", default=50, type=int)
parser.add_argument("--class_num", default=10, type=int, help="cifar10")
parser.add_argument("--epochs", default=5, type=int)
parser.add_argument("--pre_epochs", default=15, type=int, help='pre epochs to train weight only')
parser.add_argument("--k", default=4, type=int, help="channel portion of channel shuffle")
parser.add_argument("--unrolled", default=False, action="store_true")
args = parser.parse_args()
mkdirs(args.result_path, args.log_path, args.search_space_path, args.best_selected_space_path)
init_logger(args.log_path, "info")
logger.info(args)
set_seed(args.trial_id)

logger.info("loading data")
dataset_train, dataset_valid = datasets.get_dataset("cifar10", root=args.data_dir)
model = CNN(32, 3, args.channels, args.class_num, args.layers, n_nodes=args.nodes, k=args.k)
criterion = nn.CrossEntropyLoss()

optim = torch.optim.SGD(model.parameters(), args.model_lr, momentum=0.9, weight_decay=3.0E-4)
lr_scheduler = torch.optim.lr_scheduler.CosineAnnealingLR(optim, args.epochs, eta_min=0.001)

logger.info("initializing trainer")
trainer = DartsTrainer(model,
loss=criterion,
metrics=lambda output, target: accuracy(output, target, topk=(1,)),
optimizer=optim,
num_epochs=args.epochs,
dataset_train=dataset_train,
dataset_valid=dataset_valid,
mutator=PCdartsMutator(model),
batch_size=args.batch_size,
log_frequency=args.log_frequency,
arch_lr=args.arch_lr,
unrolled=args.unrolled,
result_path=args.result_path,
num_pre_epochs=args.pre_epochs,
search_space_path=args.search_space_path,
callbacks=
[LRSchedulerCallback(lr_scheduler), BestArchitectureCheckpoint(args.best_selected_space_path, args.epochs)])

logger.info("training")
t1 = time.time()
trainer.train()
# res_json = trainer.result
cost_time = time.time() - t1
# filtered by the backend from the terminal output, e.g. {"type": "Cost_time", "result": {"value": "* s"}}
logger.info({"type": "Cost_time", "result": {"value": str(cost_time) + ' s'}})
with open(args.result_path, "a") as file:
file.write(str({"type": "Cost_time", "result": {"value": str(cost_time) + ' s'}}))

# res_json["Cost_time"] = str(cost_time) + ' s'
# dump_global_result(args.result_path, res_json)

+ 146
- 0
dubhe-tadl/pcdarts/pcdartsmutator.py

@@ -0,0 +1,146 @@

# Copyright (c) Microsoft Corporation.
# Licensed under the MIT license.

import logging

import torch
import torch.nn as nn
import torch.nn.functional as F
from collections import OrderedDict
from pytorch.mutator import Mutator
from pytorch.mutables import LayerChoice, InputChoice

_logger = logging.getLogger(__name__)

class PCdartsMutator(Mutator):
"""
Connects the model in a PC-DARTS (differentiable) way.

For each LayerChoice, candidate ops are weighted by a softmax over architecture parameters, and the op with the
largest weight is kept on export. For each InputChoice, the candidate predecessors are combined as a weighted sum
during search, and the two strongest edges are kept on export. A LayerChoice may also resolve to no op (a ``ZeroOp``),
in which case every element in the exported choice list is ``false`` (not chosen).

All input choices are fully connected in the search phase. On export, an input choice selects inputs based on the
keys in ``choose_from``. If a key refers to a LayerChoice, the top logit of that LayerChoice joins the competition
against the other logits; otherwise the logit is assumed to be 0.

It is possible to cut branches by setting the corresponding entry of ``choices`` to ``-inf``. After softmax that value
becomes 0, and the framework ignores 0 values and does not connect the edge. Note that the gradient at the ``-inf``
location will be 0; since arithmetic with ``-inf`` yields ``nan``, the gradient update phase must be handled carefully.

Attributes
----------
choices: ParameterDict
dict that maps keys of LayerChoices to weighted-connection float tensors.
"""
def __init__(self, model):
super().__init__(model)
self.choices = nn.ParameterDict()
for mutable in self.mutables:
if isinstance(mutable, LayerChoice):
self.choices[mutable.key] = nn.Parameter(1.0E-3 * torch.randn(mutable.length + 1))
if isinstance(mutable, InputChoice):
self.choices[mutable.key] = nn.Parameter(1.0E-3 * torch.randn(mutable.n_candidates))
def device(self):
for v in self.choices.values():
return v.device

def sample_search(self):
result = dict()
for mutable in self.mutables:
if isinstance(mutable, LayerChoice):
result[mutable.key] = F.softmax(self.choices[mutable.key], dim=-1)[:-1]
elif isinstance(mutable, InputChoice):
result[mutable.key] = F.softmax(self.choices[mutable.key], dim=-1)
return result

def sample_final(self):
result = dict()
edges_max = dict()
choices = dict()
for mutable in self.mutables:
if isinstance(mutable, LayerChoice):
# multiply the normalized coefficients together to select top-1 op in each LayerChoice
predecessor_idx = int(mutable.key[-1])
inputchoice_key = mutable.key[:-2] + "switch"
choices[mutable.key] = self.choices[mutable.key] * self.choices[inputchoice_key][predecessor_idx]
for mutable in self.mutables:
if isinstance(mutable, LayerChoice):
# select non-none top-1 op
max_val, index = torch.max(F.softmax(choices[mutable.key], dim=-1)[:-1], 0)
edges_max[mutable.key] = max_val
result[mutable.key] = F.one_hot(index, num_classes=len(mutable)).view(-1).bool()
for mutable in self.mutables:
if isinstance(mutable, InputChoice):
if mutable.n_chosen is not None:
weights = []
for src_key in mutable.choose_from:
if src_key not in edges_max:
_logger.warning("InputChoice.NO_KEY in '%s' is weighted 0 when selecting inputs.", mutable.key)
weights.append(edges_max.get(src_key, 0.))
weights = torch.tensor(weights) # pylint: disable=not-callable
# select top-2 strongest predecessor
_, topk_edge_indices = torch.topk(weights, mutable.n_chosen)
selected_multihot = []
for i, src_key in enumerate(mutable.choose_from):
if i not in topk_edge_indices and src_key in result:
# If an edge is never selected, there is no need to calculate any op on this edge.
# This is to eliminate redundant calculation.
result[src_key] = torch.zeros_like(result[src_key])
selected_multihot.append(i in topk_edge_indices)
result[mutable.key] = torch.tensor(selected_multihot, dtype=torch.bool, device=self.device()) # pylint: disable=not-callable
else:
result[mutable.key] = torch.ones(mutable.n_candidates, dtype=torch.bool, device=self.device()) # pylint: disable=not-callable
return result

def _generate_search_space(self):
"""
Generate search space from mutables.
Here is the search space format:
::
{ key_name: {"_type": "layer_choice",
"_value": ["conv1", "conv2"]} }
{ key_name: {"_type": "input_choice",
"_value": {"candidates": ["in1", "in2"],
"n_chosen": 1}} }
Returns
-------
dict
the generated search space
"""
res = OrderedDict()
res["op_list"] = OrderedDict()
res["search_space"] = {"reduction_cell": OrderedDict(), "normal_cell": OrderedDict()}
keys = []
for mutable in self.mutables:
# for now we only generate flattened search space
if (len(res["search_space"]["reduction_cell"]) + len(res["search_space"]["normal_cell"])) >= 36:
break

if isinstance(mutable, LayerChoice):
key = mutable.key
if key not in keys:
val = mutable.names
if not res["op_list"]:
res["op_list"] = {"_type": "layer_choice", "_value": val + ["none"]}
node_type = "normal_cell" if "normal" in key else "reduction_cell"
res["search_space"][node_type][key] = "op_list"
keys.append(key)

elif isinstance(mutable, InputChoice):
key = mutable.key
if key not in keys:
node_type = "normal_cell" if "normal" in key else "reduction_cell"
res["search_space"][node_type][key] = {"_type": "input_choice",
"_value": {"candidates": mutable.choose_from,
"n_chosen": mutable.n_chosen}}
keys.append(key)
else:
raise TypeError("Unsupported mutable type: '%s'." % type(mutable))

return res


+ 81
- 0
dubhe-tadl/pcdarts/reademe.md

@@ -0,0 +1,81 @@
# train stage
`python pcdarts_train.py --data_dir '../data/' --result_path 'trial_id/result.json' --log_path 'trial_id/log' --search_space_path 'experiment_id/search_space.json' --best_selected_space_path 'experiment_id/best_selected_space.json' --trial_id 0 --layers 5 --model_lr 0.025 --arch_lr 3e-4 --epochs 2 --pre_epochs 1 --batch_size 64 --channels 16`

# select stage
`python pcdarts_select.py --best_selected_space_path 'experiment_id/best_selected_space.json' `

# retrain stage
`python pcdarts_retrain.py --data_dir '../data/' --result_path 'trial_id/result.json' --log_path 'trial_id/log' --best_selected_space_path 'experiment_id/best_selected_space.json' --best_checkpoint_dir 'experiment_id/' --trial_id 0 --batch_size 96 --epochs 2 --lr 0.01 --layers 20 --channels 36`

# output file
`result.json`
```
{'type': 'Accuracy', 'result': {'sequence': 0, 'category': 'epoch', 'value': 0.1}}
{'type': 'Accuracy', 'result': {'sequence': 1, 'category': 'epoch', 'value': 0.0}}
{'type': 'Accuracy', 'result': {'sequence': 2, 'category': 'epoch', 'value': 0.0}}
{'type': 'Accuracy', 'result': {'sequence': 3, 'category': 'epoch', 'value': 0.0}}
{'type': 'Accuracy', 'result': {'sequence': 4, 'category': 'epoch', 'value': 0.0}}
{'type': 'Cost_time', 'result': {'value': '41.614346981048584 s'}}
```

`search_space.json`
```
{
"op_list": {
"_type": "layer_choice",
"_value": [
"maxpool",
"avgpool",
"skipconnect",
"sepconv3x3",
"sepconv5x5",
"dilconv3x3",
"dilconv5x5",
"none"
]
},
"search_space": {
"normal_n2_p0": "op_list",
"normal_n2_p1": "op_list",
"normal_n2_switch": {
"_type": "input_choice",
"_value": {
"candidates": [
"normal_n2_p0",
"normal_n2_p1"
],
"n_chosen": 2
}
},

...
}
```

`best_selected_space.json`
```
{
"normal_n2_p0": "dilconv5x5",
"normal_n2_p1": "dilconv5x5",
"normal_n2_switch": [
"normal_n2_p0",
"normal_n2_p1"
],
"normal_n3_p0": "sepconv3x3",
"normal_n3_p1": "dilconv5x5",
"normal_n3_p2": [],
"normal_n3_switch": [
"normal_n3_p0",
"normal_n3_p1"
],
"normal_n4_p0": [],
"normal_n4_p1": "dilconv5x5",
"normal_n4_p2": "sepconv5x5",
"normal_n4_p3": [],
"normal_n4_switch": [
"normal_n4_p1",
"normal_n4_p2"
],
...
}
```
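
In the retrain stage, `best_selected_space.json` is consumed by `apply_fixed_architecture` (see `pcdarts_retrain.py`), which fixes each `LayerChoice`/`InputChoice` in the model to the selected candidate. A minimal sketch of that step, mirroring `pcdarts_retrain.py` (the `sys.path` setup is omitted and the path is a placeholder):
```
from model import CNN
from pytorch.fixed import apply_fixed_architecture

model = CNN(32, 3, 36, 10, 20, auxiliary=True, search=False)
apply_fixed_architecture(model, "experiment_id/best_selected_space.json")
# the fixed model is then retrained with SGD and cosine annealing, as in pcdarts_retrain.py
```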

+ 2
- 0
dubhe-tadl/pdarts/__init__.py

@@ -0,0 +1,2 @@
from pytorch.pdarts.pdartsmutator import PdartsMutator
from pytorch.pdarts.pdartstrainer import PdartsTrainer

+ 177
- 0
dubhe-tadl/pdarts/model.py

@@ -0,0 +1,177 @@
# Copyright (c) Microsoft Corporation.
# Licensed under the MIT license.
from collections import OrderedDict
import torch
import torch.nn as nn
from pytorch import mutables
from pytorch.darts import ops

class AuxiliaryHead(nn.Module):
""" Auxiliary head in 2/3 place of network to let the gradient flow well """

def __init__(self, input_size, C, n_classes):
""" assuming input size 7x7 or 8x8 """
assert input_size in [7, 8]
super().__init__()
self.net = nn.Sequential(
nn.ReLU(inplace=True),
nn.AvgPool2d(5, stride=input_size - 5, padding=0, count_include_pad=False), # 2x2 out
nn.Conv2d(C, 128, kernel_size=1, bias=False),
nn.BatchNorm2d(128),
nn.ReLU(inplace=True),
nn.Conv2d(128, 768, kernel_size=2, bias=False), # 1x1 out
nn.BatchNorm2d(768),
nn.ReLU(inplace=True)
)
self.linear = nn.Linear(768, n_classes)

def forward(self, x):
out = self.net(x)
out = out.view(out.size(0), -1) # flatten
logits = self.linear(out)
return logits


class Node(nn.Module):
def __init__(self, node_id, num_prev_nodes, channels, num_downsample_connect, search, dropout_rate):
super().__init__()
self.dropout_rate = dropout_rate
self.ops = nn.ModuleList()
choice_keys = []
for i in range(num_prev_nodes):
stride = 2 if i < num_downsample_connect else 1
choice_keys.append("{}_p{}".format(node_id, i))
skip_op = nn.Identity() if stride == 1 else ops.FactorizedReduce(channels, channels, affine=False)
# In search, op-level dropout for skip-connect
if search and self.dropout_rate > 0:
skip_op = nn.Sequential(skip_op, nn.Dropout(self.dropout_rate))
self.ops.append(
mutables.LayerChoice(OrderedDict([
("maxpool", ops.PoolBN('max', channels, 3, stride, 1, affine=False)),
("avgpool", ops.PoolBN('avg', channels, 3, stride, 1, affine=False)),
("skipconnect", skip_op),
("sepconv3x3", ops.SepConv(channels, channels, 3, stride, 1, affine=False)),
("sepconv5x5", ops.SepConv(channels, channels, 5, stride, 2, affine=False)),
("dilconv3x3", ops.DilConv(channels, channels, 3, stride, 2, 2, affine=False)),
("dilconv5x5", ops.DilConv(channels, channels, 5, stride, 4, 2, affine=False))
]), key=choice_keys[-1]))
# In retrain, DropPath is applied to non-skip-connect ops; p in DropPath defaults to 0
self.drop_path = ops.DropPath()
self.input_switch = mutables.InputChoice(choose_from=choice_keys, n_chosen=2, key="{}_switch".format(node_id))

def forward(self, prev_nodes):
assert len(self.ops) == len(prev_nodes)
output = []
for op, node in zip(self.ops, prev_nodes):
out = op(node)
# In retrain
if out is not None:
if not isinstance(op, nn.Identity):
out = self.drop_path(out)
else:
out = None
output.append(out)
# out = [op(node) for op, node in zip(self.ops, prev_nodes)]
# out = [self.drop_path(o) if o is not None else None for o in out]
return self.input_switch(output)


class Cell(nn.Module):

def __init__(self, n_nodes, channels_pp, channels_p, channels, reduction_p, reduction, search, dropout_rate):
super().__init__()
self.reduction = reduction
self.n_nodes = n_nodes

# If previous cell is reduction cell, current input size does not match with
# output size of cell[k-2]. So the output[k-2] should be reduced by preprocessing.
if reduction_p:
self.preproc0 = ops.FactorizedReduce(channels_pp, channels, affine=False)
else:
self.preproc0 = ops.StdConv(channels_pp, channels, 1, 1, 0, affine=False)
self.preproc1 = ops.StdConv(channels_p, channels, 1, 1, 0, affine=False)

# generate dag
self.mutable_ops = nn.ModuleList()
for depth in range(2, self.n_nodes + 2):
self.mutable_ops.append(Node("{}_n{}".format("reduce" if reduction else "normal", depth), depth, channels, 2 if reduction else 0, search, dropout_rate))

def forward(self, s0, s1):
# s0, s1 are the outputs of previous previous cell and previous cell, respectively.
tensors = [self.preproc0(s0), self.preproc1(s1)]
for node in self.mutable_ops:
cur_tensor = node(tensors)
tensors.append(cur_tensor)

output = torch.cat(tensors[2:], dim=1)
return output


class CNN(nn.Module):

def __init__(self, input_size, in_channels, channels, n_classes, n_layers, dropout_rate, n_nodes=4, stem_multiplier=3, auxiliary=False, search=True):
super().__init__()
self.in_channels = in_channels
self.channels = channels
self.n_classes = n_classes
self.n_layers = n_layers
self.aux_pos = 2 * n_layers // 3 if auxiliary else -1

c_cur = stem_multiplier * self.channels
self.stem = nn.Sequential(
nn.Conv2d(in_channels, c_cur, 3, 1, 1, bias=False),
nn.BatchNorm2d(c_cur)
)

# for the first cell, stem is used for both s0 and s1
# [!] channels_pp and channels_p is output channel size, but c_cur is input channel size.
channels_pp, channels_p, c_cur = c_cur, c_cur, channels

self.cells = nn.ModuleList()
reduction_p, reduction = False, False
for i in range(n_layers):
reduction_p, reduction = reduction, False
# Reduce featuremap size and double channels in 1/3 and 2/3 layer.
if i in [n_layers // 3, 2 * n_layers // 3]:
c_cur *= 2
reduction = True

cell = Cell(n_nodes, channels_pp, channels_p, c_cur, reduction_p, reduction, search, dropout_rate)
self.cells.append(cell)
c_cur_out = c_cur * n_nodes
channels_pp, channels_p = channels_p, c_cur_out

if i == self.aux_pos:
self.aux_head = AuxiliaryHead(input_size // 4, channels_p, n_classes)

self.gap = nn.AdaptiveAvgPool2d(1)
self.linear = nn.Linear(channels_p, n_classes)

def forward(self, x):
s0 = s1 = self.stem(x)

aux_logits = None
for i, cell in enumerate(self.cells):
s0, s1 = s1, cell(s0, s1)
if i == self.aux_pos and self.training:
aux_logits = self.aux_head(s1)

out = self.gap(s1)
out = out.view(out.size(0), -1) # flatten
logits = self.linear(out)

if aux_logits is not None:
return logits, aux_logits
return logits

def drop_path_prob(self, p, search=True):
if search:
for module in self.modules():
# In search, update dropout rate
if isinstance(module, nn.Sequential) and isinstance(module[0], nn.Identity):
module[1].p = p  # nn.Dropout stores its drop probability in .p
else:
# In retrain, update ops.DropPath
for module in self.modules():
if isinstance(module, ops.DropPath):
module.p = p

+ 203
- 0
dubhe-tadl/pdarts/pdarts_retrain.py

@@ -0,0 +1,203 @@
import sys
sys.path.append('../..')
import os
import logging
import time
import json
from argparse import ArgumentParser

import torch
import torch.nn as nn
# from torch.utils.tensorboard import SummaryWriter

from model import CNN
from pytorch.fixed import apply_fixed_architecture
from pytorch.utils import set_seed, mkdirs, init_logger, save_best_checkpoint, AverageMeter
from pytorch.darts import utils
from pytorch.darts import datasets
from pytorch.retrainer import Retrainer

logger = logging.getLogger(__name__)
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
# writer = SummaryWriter()

class PdartsRetrainer(Retrainer):
def __init__(self, aux_weight, grad_clip, epochs, log_frequency):
self.aux_weight = aux_weight
self.grad_clip = grad_clip
self.epochs = epochs
self.log_frequency = log_frequency

def train(self, train_loader, model, optimizer, criterion, epoch):
top1 = AverageMeter("top1")
top5 = AverageMeter("top5")
losses = AverageMeter("losses")

cur_step = epoch * len(train_loader)
cur_lr = optimizer.param_groups[0]["lr"]
logger.info("Epoch %d LR %.6f", epoch, cur_lr)
# writer.add_scalar("lr", cur_lr, global_step=cur_step)

model.train()

for step, (x, y) in enumerate(train_loader):
x, y = x.to(device, non_blocking=True), y.to(device, non_blocking=True)
bs = x.size(0)

optimizer.zero_grad()
logits, aux_logits = model(x)
loss = criterion(logits, y)
if self.aux_weight > 0.:
loss += self.aux_weight * criterion(aux_logits, y)
loss.backward()
# gradient clipping
nn.utils.clip_grad_norm_(model.parameters(), self.grad_clip)
optimizer.step()

accuracy = utils.accuracy(logits, y, topk=(1, 5))
losses.update(loss.item(), bs)
top1.update(accuracy["acc1"], bs)
top5.update(accuracy["acc5"], bs)
# writer.add_scalar("loss/train", loss.item(), global_step=cur_step)
# writer.add_scalar("acc1/train", accuracy["acc1"], global_step=cur_step)
# writer.add_scalar("acc5/train", accuracy["acc5"], global_step=cur_step)

if step % self.log_frequency == 0 or step == len(train_loader) - 1:
logger.info(
"Train: [{:3d}/{}] Step {:03d}/{:03d} Loss {losses.avg:.3f} "
"Prec@(1,5) ({top1.avg:.1%}, {top5.avg:.1%})".format(
epoch + 1, self.epochs, step, len(train_loader) - 1, losses=losses,
top1=top1, top5=top5))

cur_step += 1

logger.info("Train: [{:3d}/{}] Final Prec@1 {:.4%}".format(epoch + 1, self.epochs, top1.avg))


def validate(self, valid_loader, model, criterion, epoch, cur_step):
top1 = AverageMeter("top1")
top5 = AverageMeter("top5")
losses = AverageMeter("losses")

model.eval()

with torch.no_grad():
for step, (X, y) in enumerate(valid_loader):
X, y = X.to(device, non_blocking=True), y.to(device, non_blocking=True)
bs = X.size(0)

logits = model(X)
loss = criterion(logits, y)

accuracy = utils.accuracy(logits, y, topk=(1, 5))
losses.update(loss.item(), bs)
top1.update(accuracy["acc1"], bs)
top5.update(accuracy["acc5"], bs)

if step % self.log_frequency == 0 or step == len(valid_loader) - 1:
logger.info(
"Valid: [{:3d}/{}] Step {:03d}/{:03d} Loss {losses.avg:.3f} "
"Prec@(1,5) ({top1.avg:.1%}, {top5.avg:.1%})".format(
epoch + 1, self.epochs, step, len(valid_loader) - 1, losses=losses,
top1=top1, top5=top5))

# writer.add_scalar("loss/test", losses.avg, global_step=cur_step)
# writer.add_scalar("acc1/test", top1.avg, global_step=cur_step)
# writer.add_scalar("acc5/test", top5.avg, global_step=cur_step)

logger.info("Valid: [{:3d}/{}] Final Prec@1 {:.4%}".format(epoch + 1, self.epochs, top1.avg))

return top1.avg


if __name__ == "__main__":
parser = ArgumentParser("Pdarts retrain")
parser.add_argument("--data_dir", type=str,
default='./', help="search_space json file")
parser.add_argument("--result_path", type=str,
default='./result.json', help="training result")
parser.add_argument("--log_path", type=str,
default='.0/log', help="log for info")
parser.add_argument("--best_selected_space_path", type=str,
default='./best_selected_space.json', help="final best selected space")
parser.add_argument("--best_checkpoint_dir", type=str,
default='', help="default name is best_checkpoint_epoch{}.pth")
parser.add_argument('--trial_id', type=int, default=0, metavar='N',
help='trial_id, starting from 0')
parser.add_argument("--layers", default=20, type=int)
parser.add_argument("--batch_size", default=96, type=int)
parser.add_argument("--log_frequency", default=10, type=int)
parser.add_argument("--epochs", default=600, type=int)
parser.add_argument("--lr", default=0.025, type=float)
parser.add_argument("--channels", default=36, type=int)
parser.add_argument("--aux_weight", default=0.4, type=float)
parser.add_argument("--drop_path_prob", default=0.3, type=float)
parser.add_argument("--workers", default=4)
parser.add_argument("--grad_clip", default=5., type=float)
args = parser.parse_args()
mkdirs(args.result_path, args.log_path, args.best_checkpoint_dir)
init_logger(args.log_path)
logger.info(args)
set_seed(args.trial_id)
logger.info("loading data")
dataset_train, dataset_valid = datasets.get_dataset("cifar10", cutout_length=16, root=args.data_dir)

model = CNN(32, 3, args.channels, 10, args.layers, auxiliary=True, search=False, dropout_rate=0.0)
if isinstance(args.best_selected_space_path, str):
with open(args.best_selected_space_path) as f:
fixed_arc = json.load(f)
apply_fixed_architecture(model, fixed_arc=fixed_arc["best_selected_space"])
criterion = nn.CrossEntropyLoss()

model.to(device)
criterion.to(device)

optimizer = torch.optim.SGD(model.parameters(), args.lr, momentum=0.9, weight_decay=3.0E-4)
lr_scheduler = torch.optim.lr_scheduler.CosineAnnealingLR(optimizer, args.epochs, eta_min=1E-6)

train_loader = torch.utils.data.DataLoader(dataset_train,
batch_size=args.batch_size,
shuffle=True,
num_workers=args.workers,
pin_memory=True)
valid_loader = torch.utils.data.DataLoader(dataset_valid,
batch_size=args.batch_size,
shuffle=False,
num_workers=args.workers,
pin_memory=True)
retrainer = PdartsRetrainer(aux_weight=args.aux_weight,
grad_clip=args.grad_clip,
epochs=args.epochs,
log_frequency = args.log_frequency)
best_top1 = 0.
start_time = time.time()
for epoch in range(args.epochs):
drop_prob = args.drop_path_prob * epoch / args.epochs
model.drop_path_prob(drop_prob)

# training
retrainer.train(train_loader, model, optimizer, criterion, epoch)

# validation
cur_step = (epoch + 1) * len(train_loader)
top1 = retrainer.validate(valid_loader, model, criterion, epoch, cur_step)
# filtered by the backend from the terminal output, e.g. {"type": "Accuracy", "result": {"sequence": 1, "category": "epoch", "value":96.7}}
logger.info({"type": "Accuracy", "result": {"sequence": epoch, "category": "epoch", "value": top1}})
with open(args.result_path, "a") as file:
file.write(str({"type": "Accuracy", "result": {"sequence": epoch, "category": "epoch", "value": top1}}) + '\n')
best_top1 = max(best_top1, top1)

lr_scheduler.step()

logger.info("Final best Prec@1 = {:.4%}".format(best_top1))
cost_time = time.time() - start_time
# filtered by the backend from the terminal output, e.g. {"type": "Cost_time", "result": {"value": "* s"}}
logger.info({"type": "Cost_time", "result": {"value": str(cost_time) + ' s'}})
with open(args.result_path, "a") as file:
file.write(str({"type": "Cost_time", "result": {"value": str(cost_time) + ' s'}}))
# result["Cost_time"] = str(cost_time) + ' s'
# dump_global_result(args.result_path, result)
save_best_checkpoint(args.best_checkpoint_dir, model, optimizer, epoch)
logger.info("Save best checkpoint in {}".format(os.path.join(args.best_checkpoint_dir, "best_checkpoint_epoch{}.pth".format(epoch))))

+ 22
- 0
dubhe-tadl/pdarts/pdarts_select.py

@@ -0,0 +1,22 @@
import sys
sys.path.append('../..')
from pytorch.selector import Selector
from argparse import ArgumentParser


class PdartsSelector(Selector):
def __init__(self, single_candidate=True):
super().__init__(single_candidate)
def fit(self):
pass

if __name__ == "__main__":
parser = ArgumentParser("PDARTS select")
parser.add_argument("--best_selected_space_path", type=str,
default='./best_selected_space.json', help="final best selected space")

args = parser.parse_args()
darts_selector = PdartsSelector(True)
darts_selector.fit()

+ 79
- 0
dubhe-tadl/pdarts/pdarts_train.py

@@ -0,0 +1,79 @@
import sys
sys.path.append('../..')
import time
import logging
from argparse import ArgumentParser
from pdartstrainer import PdartsTrainer
from pytorch.utils import mkdirs, set_seed, init_logger, list_str2int

logger = logging.getLogger(__name__)

if __name__ == "__main__":
parser = ArgumentParser("pdarts")
parser.add_argument("--data_dir", type=str,
default='../data/', help="search_space json file")
parser.add_argument("--result_path", type=str,
default='0/result.json', help="training result")
parser.add_argument("--log_path", type=str,
default='0/log', help="log for info")
parser.add_argument("--search_space_path", type=str,
default='./search_space.json', help="search space of PDARTS")
parser.add_argument("--best_selected_space_path", type=str,
default='./best_selected_space.json', help="final best selected space")
parser.add_argument('--trial_id', type=int, default=0, help='for ensuring reproducibility ')
parser.add_argument('--model_lr', type=float, default=0.025, help='learning rate for training model weights')
parser.add_argument('--arch_lr', type=float, default=3e-4, help='learning rate for training architecture')
parser.add_argument("--epochs", default=2, type=int)
parser.add_argument("--pre_epochs", default=15, type=int)
parser.add_argument("--batch_size", default=96, type=int)
parser.add_argument("--init_layers", default=5, type=int)
parser.add_argument('--add_layers', default=[0, 6, 12], nargs='+', type=int, help='add layers in each stage')
parser.add_argument('--dropped_ops', default=[3, 2, 1], nargs='+', type=int, help='drop ops in each stage')
parser.add_argument('--dropout_rates', default=[0.1, 0.4, 0.7], nargs='+', type=float, help='drop ops probability in each stage')
# parser.add_argument('--add_layers', action='append', help='add layers in each stage')
# parser.add_argument('--dropped_ops', action='append', help='drop ops in each stage')
# parser.add_argument('--dropout_rates', action='append', help='drop ops probability in each stage')
parser.add_argument("--channels", default=16, type=int)
parser.add_argument("--log_frequency", default=50, type=int)
parser.add_argument("--class_num", default=10, type=int)
parser.add_argument("--unrolled", default=False, action="store_true")
args = parser.parse_args()

mkdirs(args.result_path, args.log_path, args.search_space_path, args.best_selected_space_path)
init_logger(args.log_path, "info")
set_seed(args.trial_id)
# args.add_layers = list_str2int(args.add_layers)
# args.dropped_ops = list_str2int(args.dropped_ops)
# args.dropout_rates = list_str2int(args.dropout_rates)
logger.info(args)
logger.info("initializing pdarts trainer")
trainer = PdartsTrainer(
init_layers=args.init_layers,
pdarts_num_layers=args.add_layers,
pdarts_num_to_drop=args.dropped_ops,
pdarts_dropout_rates=args.dropout_rates,
num_epochs=args.epochs,
num_pre_epochs=args.pre_epochs,
model_lr=args.model_lr,
arch_lr=args.arch_lr,
batch_size=args.batch_size,
class_num=args.class_num,
channels=args.channels,
result_path=args.result_path,
log_frequency=args.log_frequency,
unrolled=args.unrolled,
data_dir = args.data_dir,
search_space_path=args.search_space_path,
best_selected_space_path=args.best_selected_space_path
)
logger.info("training")
start_time = time.time()
trainer.train(validate=True)
# result = trainer.result
cost_time = time.time() - start_time
# filtered by the backend from the terminal output, e.g. {"type": "Cost_time", "result": {"value": "* s"}}
logger.info({"type": "Cost_time", "result": {"value": str(cost_time) + ' s'}})
with open(args.result_path, "a") as file:
file.write(str({"type": "Cost_time", "result": {"value": str(cost_time) + ' s'}}))

+ 201
- 0
dubhe-tadl/pdarts/pdartsmutator.py

@@ -0,0 +1,201 @@
import copy

import numpy as np
import torch
import logging
from collections import OrderedDict
from torch import nn

from pytorch.darts.dartsmutator import DartsMutator
from pytorch.mutables import LayerChoice, InputChoice

logger = logging.getLogger(__name__)

class PdartsMutator(DartsMutator):
"""
It works with PdartsTrainer to calculate ops weights,
and drop weights in different PDARTS epochs.
"""
def __init__(self, model, pdarts_epoch_index, pdarts_num_to_drop, switches={}):
self.pdarts_epoch_index = pdarts_epoch_index
self.pdarts_num_to_drop = pdarts_num_to_drop
# save the last two switches and choices for restricting skip-connections
self.last_two_switches = None
self.last_two_choices = None

if switches is None:
self.switches = {}
else:
self.switches = switches

super(PdartsMutator, self).__init__(model)

# this loop goes through mutables with different keys;
# it mainly updates the length of choices.
for mutable in self.mutables:
if isinstance(mutable, LayerChoice):
switches = self.switches.get(mutable.key, [True for j in range(len(mutable))])
# choices = self.choices[mutable.key]

operations_count = np.sum(switches)
# +1 and -1 are caused by the zero operation in the DARTS network:
# the zero operation is not in the choices list (switches) of the network, but its weight is,
# so one extra weight and switch are needed for zero.
self.choices[mutable.key] = nn.Parameter(1.0E-3 * torch.randn(operations_count + 1))
self.switches[mutable.key] = switches

# update LayerChoice instances in the model;
# this physically removes the dropped candidate operations.
for module in self.model.modules():
if isinstance(module, LayerChoice):
switches = self.switches.get(module.key)
choices = self.choices[module.key]
if len(module) > len(choices):
# iterate from last to first so that removing an entry does not affect earlier indexes.
for index in range(len(switches)-1, -1, -1):
if not switches[index]:
del module[index]
assert len(module) <= len(choices), "Failed to remove dropped choices."

def export(self, last, switches):
# In the last PDARTS epoch, the number of skip-connections needs to be restricted.
# Cannot rely on super().export() because P-DARTS has physically deleted some choices, so lengths are misaligned.
if last:
# restrict to at most 2 skip-connections (normal cells only)
name = "normal"
max_num = 2
skip_num = self.check_skip_num(name, switches)
logger.info("Initially, the number of skipconnect is {}.".format(skip_num))
while skip_num > max_num:
logger.info("Restricting {} skipconnect to {}.".format(skip_num, max_num))
logger.info("Original normal_switch is {}.".format(switches))
# update self.choices setting skip prob to 0 and self.switches setting skip prob to False
switches = self.delete_min_sk(name, switches)
logger.info("Restricted normal_switch is {}.".format(switches))
skip_num = self.check_skip_num(name, switches)
# get boolean selections via the parent's sample_final(); Mutator's export() converts them to a human-readable form later
results = super().sample_final()
for mutable in self.mutables:
if isinstance(mutable, LayerChoice):
# Since some operations have been physically dropped,
# False entries are filled back in to keep track of the dropped positions.
trained_result = results[mutable.key]
trained_index = 0
switches = self.switches[mutable.key]
result = torch.Tensor(switches).bool()
for index in range(len(result)):
if result[index]:
result[index] = trained_result[trained_index]
trained_index += 1
results[mutable.key] = result
return results

def drop_paths(self):
"""
This method is called when a PDARTS epoch is finished.
It prepares the switches for the next epoch.
Candidate operations whose switch is False will be dropped in the next epoch.
"""
all_switches = copy.deepcopy(self.switches)
for key in all_switches:
switches = all_switches[key]
idxs = []
for j in range(len(switches)):
if switches[j]:
idxs.append(j)
sorted_weights = self.choices[key].data.cpu().numpy()[:-1]
drop = np.argsort(sorted_weights)[:self.pdarts_num_to_drop[self.pdarts_epoch_index]]
for idx in drop:
switches[idxs[idx]] = False
return all_switches
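
# Illustrative example (assuming 7 active candidate ops on an edge and
# pdarts_num_to_drop = [3, 2, 1], as in pdarts_train.py): after stage 0 the 3
# ops with the smallest architecture weights on that edge get their switch set
# to False, so stage 1 builds its LayerChoices from the remaining 4 ops, and so on.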


def check_skip_num(self, name, switches):
counter = 0
for key in switches:
if name in key:
# the zero operation is not in switches, so "skipconnect" is at index 2
if switches[key][2]:
counter += 1
return counter

def delete_min_sk(self, name, switches):
def _get_sk_idx(key, switches):
if not switches[key][2]:
idx = -1
else:
idx = 0
for i in range(2):
# switches has 1 True, self.switches has 2 True
if self.switches[key][i]:
idx += 1
return idx
sk_choices = [1.0 for i in range(14)]
sk_keys = [None for i in range(14)] # key has skip connection
sk_choices_idx = -1
for key in switches:
if name in key:
# default key in order
sk_choices_idx += 1
idx = _get_sk_idx(key, switches)
if not idx == -1:
sk_keys[sk_choices_idx] = key
sk_choices[sk_choices_idx] = self.choices[key][idx]
min_sk_idx = np.argmin(sk_choices)
idx = _get_sk_idx(sk_keys[min_sk_idx], switches)
# modify self.choices or copy.deepcopy ?
self.choices[sk_keys[min_sk_idx]][idx] = 0.0
# modify self.switches or copy.deepcopy ?
# self.switches indicate last two switches, and switches indicate present(last) switches
self.switches[sk_keys[min_sk_idx]][2] = False
switches[sk_keys[min_sk_idx]][2] = False
return switches


def _generate_search_space(self):
"""
Generate search space from mutables.
Here is the search space format:
::
{ key_name: {"_type": "layer_choice",
"_value": ["conv1", "conv2"]} }
{ key_name: {"_type": "input_choice",
"_value": {"candidates": ["in1", "in2"],
"n_chosen": 1}} }
Returns
-------
dict
the generated search space
"""
res = OrderedDict()
res["op_list"] = OrderedDict()
res["search_space"] = {"reduction_cell": OrderedDict(), "normal_cell": OrderedDict()}
keys = []
for mutable in self.mutables:
# for now we only generate flattened search space
if (len(res["search_space"]["reduction_cell"]) + len(res["search_space"]["normal_cell"])) >= 36:
break

if isinstance(mutable, LayerChoice):
key = mutable.key
if key not in keys:
val = mutable.names
if not res["op_list"]:
res["op_list"] = {"_type": "layer_choice", "_value": val + ["none"]}
node_type = "normal_cell" if "normal" in key else "reduction_cell"
res["search_space"][node_type][key] = "op_list"
keys.append(key)

elif isinstance(mutable, InputChoice):
key = mutable.key
if key not in keys:
node_type = "normal_cell" if "normal" in key else "reduction_cell"
res["search_space"][node_type][key] = {"_type": "input_choice",
"_value": {"candidates": mutable.choose_from,
"n_chosen": mutable.n_chosen}}
keys.append(key)
else:
raise TypeError("Unsupported mutable type: '%s'." % type(mutable))

return res

+ 167
- 0
dubhe-tadl/pdarts/pdartstrainer.py

@@ -0,0 +1,167 @@
import os
import logging
import torch
import torch.nn as nn
import numpy as np
from collections import OrderedDict
import json
from pytorch.callbacks import LRSchedulerCallback
from pytorch.trainer import BaseTrainer, TorchTensorEncoder
from pytorch.utils import dump_global_result

from model import CNN
from pdartsmutator import PdartsMutator
from pytorch.darts.utils import accuracy
from pytorch.darts import datasets
from pytorch.darts.dartstrainer import DartsTrainer

logger = logging.getLogger(__name__)

class PdartsTrainer(BaseTrainer):
"""
This trainer implements the PDARTS algorithm.
PDARTS builds on the DARTS algorithm and provides a network-growth approach to find deeper and better networks.
This class relies on the pdarts_num_layers and pdarts_num_to_drop parameters to control how the network grows:
pdarts_num_layers specifies how many layers are added on top of the first stage, and
pdarts_num_to_drop specifies how many candidate operations are dropped in each stage,
so that the grown network stays at a similar size.
"""

def __init__(self, init_layers, pdarts_num_layers, pdarts_num_to_drop, pdarts_dropout_rates, num_epochs, num_pre_epochs, model_lr, class_num,
arch_lr, channels, batch_size, result_path, log_frequency, unrolled, data_dir, search_space_path,
best_selected_space_path, device=None, workers=4):
super(PdartsTrainer, self).__init__()
self.init_layers = init_layers
self.class_num = class_num
self.channels = channels
self.model_lr = model_lr
self.num_epochs = num_epochs
self.class_num = class_num
self.pdarts_num_layers = pdarts_num_layers
self.pdarts_num_to_drop = pdarts_num_to_drop
self.pdarts_dropout_rates = pdarts_dropout_rates
self.pdarts_epoches = len(pdarts_num_to_drop)
self.search_space_path = search_space_path
self.best_selected_space_path = best_selected_space_path

logger.info("loading data")
dataset_train, dataset_valid = datasets.get_dataset(
"cifar10", root=data_dir)
self.darts_parameters = {
"metrics": lambda output, target: accuracy(output, target, topk=(1,)),
"arch_lr": arch_lr,
"num_epochs": num_epochs,
"num_pre_epochs": num_pre_epochs,
"dataset_train": dataset_train,
"dataset_valid": dataset_valid,
"batch_size": batch_size,
"result_path": result_path,
"workers": workers,
"device": device,
"log_frequency": log_frequency,
"unrolled": unrolled,
"search_space_path": None
}

def train(self, validate=False):
switches = None
last = False
for epoch in range(self.pdarts_epoches):
if epoch == self.pdarts_epoches - 1:
last = True
# create network for each stage
layers = self.init_layers + self.pdarts_num_layers[epoch]
init_dropout_rate = float(self.pdarts_dropout_rates[epoch])
model = CNN(32, 3, self.channels, self.class_num, layers,
init_dropout_rate, n_nodes=4, search=True)
criterion = nn.CrossEntropyLoss()
optim = torch.optim.SGD(
model.parameters(), self.model_lr, momentum=0.9, weight_decay=3.0E-4)
lr_scheduler = torch.optim.lr_scheduler.CosineAnnealingLR(
optim, self.num_epochs, eta_min=0.001)

logger.info(
"############Start PDARTS training epoch %s############", epoch)
self.mutator = PdartsMutator(
model, epoch, self.pdarts_num_to_drop, switches)
if epoch == 0:
# only write original search space in first stage
search_space = self.mutator._generate_search_space()
dump_global_result(self.search_space_path,
search_space)

darts_callbacks = []
if lr_scheduler is not None:
darts_callbacks.append(LRSchedulerCallback(lr_scheduler))
# darts_callbacks.append(ArchitectureCheckpoint(
# os.path.join(self.selected_space_path, "stage_{}".format(epoch))))
self.trainer = DartsTrainer(model, mutator=self.mutator, loss=criterion,
optimizer=optim, callbacks=darts_callbacks, **self.darts_parameters)

for train_epoch in range(self.darts_parameters["num_epochs"]):
for callback in darts_callbacks:
callback.on_epoch_begin(train_epoch)

# training
logger.info("Epoch %d Training", train_epoch)
if train_epoch < self.darts_parameters["num_pre_epochs"]:
dropout_rate = init_dropout_rate * \
(self.darts_parameters["num_epochs"] - train_epoch -
1) / self.darts_parameters["num_epochs"]
else:
# scale_factor = 0.2
dropout_rate = init_dropout_rate * \
np.exp(-(train_epoch -
self.darts_parameters["num_pre_epochs"]) * 0.2)

model.drop_path_prob(search=True, p=dropout_rate)
self.trainer.train_one_epoch(train_epoch)

if validate:
# validation
logger.info("Epoch %d Validating", train_epoch + 1)
self.trainer.validate_one_epoch(
train_epoch, log_print=True if last else False)

for callback in darts_callbacks:
callback.on_epoch_end(train_epoch)

switches = self.mutator.drop_paths()
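# drop_paths() prunes the weakest candidate operations found in this stage;
# the returned switches restrict the search space of the next, deeper stage.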

# In the last pdarts epoch, restrict the number of skip-connections and save the best structure
if last:
res = OrderedDict()
op_value = [value for value in search_space["op_list"]["_value"] if value != 'none']
res["op_list"] = search_space["op_list"]
res["op_list"]["_value"] = op_value
res["best_selected_space"] = self.mutator.export(last, switches)
logger.info(res)
dump_global_result(self.best_selected_space_path, res)

def validate(self):
self.trainer.validate()

def export(self, file, last, switches):
mutator_export = self.mutator.export(last, switches)
with open(file, "w") as f:
json.dump(mutator_export, f, indent=2, sort_keys=True, cls=TorchTensorEncoder)

def checkpoint(self, file_path, epoch):
if isinstance(self.model, nn.DataParallel):
child_model_state_dict = self.model.module.state_dict()
else:
child_model_state_dict = self.model.state_dict()

save_state = {'child_model_state_dict': child_model_state_dict,
'optimizer_state_dict': self.optimizer.state_dict(),
'epoch': epoch}

dest_path = os.path.join(
file_path, "best_checkpoint_epoch_{}.pth.tar".format(epoch))
logger.info("Saving model to %s", dest_path)
torch.save(save_state, dest_path)



+ 92
- 0
dubhe-tadl/pdarts/reademe.md View File

@@ -0,0 +1,92 @@
# train stage
`python pdarts_train.py --data_dir '../data/' --result_path 'trial_id/result.json' --log_path 'trial_id/log' --search_space_path 'experiment_id/search_space.json' --best_selected_space_path 'experiment_id/best_selected_space.json' --trial_id 0 --model_lr 0.025 --arch_lr 3e-4 --epochs 2 --pre_epochs 1 --batch_size 64 --channels 16 --init_layers 5 --add_layer 0 6 12 --dropped_ops 3 2 1 --dropout_rates 0.1 0.4 0.7`

# select stage
`python pdarts_select.py --best_selected_space_path 'experiment_id/best_selected_space.json'`

# retrain stage
`python pdarts_retrain.py --data_dir '../data/' --result_path 'trial_id/result.json' --log_path 'trial_id/log' --best_selected_space_path 'experiment_id/best_selected_space.json' --best_checkpoint_dir 'experiment_id/' --trial_id 0 --batch_size 96 --epochs 2 --lr 0.025 --layers 20 --channels 36`

# output file
`result.json`
```
{'type': 'Accuracy', 'result': {'sequence': 0, 'category': 'epoch', 'value': 0.1}}
{'type': 'Accuracy', 'result': {'sequence': 1, 'category': 'epoch', 'value': 0.0}}
{'type': 'Accuracy', 'result': {'sequence': 2, 'category': 'epoch', 'value': 0.0}}
{'type': 'Accuracy', 'result': {'sequence': 3, 'category': 'epoch', 'value': 0.0}}
{'type': 'Accuracy', 'result': {'sequence': 4, 'category': 'epoch', 'value': 0.0}}
{'type': 'Cost_time', 'result': {'value': '41.614346981048584 s'}}
```

`search_space.json`
```
{
"op_list": {
"_type": "layer_choice",
"_value": [
"maxpool",
"avgpool",
"skipconnect",
"sepconv3x3",
"sepconv5x5",
"dilconv3x3",
"dilconv5x5",
"none"
]
},
"search_space": {
"normal_n2_p0": "op_list",
"normal_n2_p1": "op_list",
"normal_n2_switch": {
"_type": "input_choice",
"_value": {
"candidates": [
"normal_n2_p0",
"normal_n2_p1"
],
"n_chosen": 2
}
},

...
}
```

`best_selected_space.json`
```
{
"op_list": {
"_type": "layer_choice",
"_value": [
"maxpool",
"avgpool",
"skipconnect",
"sepconv3x3",
"sepconv5x5",
"dilconv3x3",
"dilconv5x5"
]
},
"best_selected_space": {
"normal_n2_p0": [
false,
false,
false,
false,
true,
false,
false
],
"normal_n2_p1": [
true,
false,
false,
false,
false,
false,
false
],
...
}
```

+ 46
- 0
dubhe-tadl/retrainer.py View File

@@ -0,0 +1,46 @@
from abc import ABC, abstractmethod
class Retrainer(ABC):
"""
Train the best-performing model from scratch, without further structure optimization.
To implement a new retrainer, users need to implement:
method: "train"
method: "__init__"
super().__init__() must be called in the __init__ method
Parameters
----------
candidates: candidates to be evaluated
"""
@abstractmethod
def train(self):
"""
Override the method to train.
"""
raise NotImplementedError
def validate(self):
"""
Override the method to validate.
"""
raise NotImplementedError
def export(self, file):
"""
Override the method to export to file.
Parameters
----------
file : str
File path to export to.
"""
raise NotImplementedError
def checkpoint(self):
"""
Override to dump a checkpoint.
"""
raise NotImplementedError
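
As a minimal sketch, a concrete subclass might look like the following (assuming the abstract base above is importable as `retrainer.Retrainer`; the model, dataset, and hyper-parameters are illustrative placeholders, not part of the repository):
```
import torch
import torch.nn as nn
from torch.utils.data import DataLoader

from retrainer import Retrainer  # assumed import path for the base class above


class FixedModelRetrainer(Retrainer):
    """Illustrative retrainer: trains one already-selected model from scratch."""

    def __init__(self, model, train_dataset, epochs=2, lr=0.025):
        super().__init__()
        self.model = model
        self.loader = DataLoader(train_dataset, batch_size=64, shuffle=True)
        self.epochs = epochs
        self.optimizer = torch.optim.SGD(model.parameters(), lr=lr, momentum=0.9)
        self.criterion = nn.CrossEntropyLoss()

    def train(self):
        self.model.train()
        for _ in range(self.epochs):
            for inputs, targets in self.loader:
                self.optimizer.zero_grad()
                loss = self.criterion(self.model(inputs), targets)
                loss.backward()
                self.optimizer.step()

    def export(self, file):
        # dump the retrained weights so the result can be reused later
        torch.save(self.model.state_dict(), file)
```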

+ 56
- 0
dubhe-tadl/selector.py View File

@@ -0,0 +1,56 @@
from abc import ABC, abstractmethod
class Selector(ABC):
"""
Choose the best model from a group of candidates.
To implement a new selector, users need to implement:
method: "fit"
method: "__init__"
super().__init__() must be called in the __init__ method
Parameters
----------
candidates: candidates to be evaluated
##### Examples #####
# class HPOSelector(Selector):
#     def __init__(self, *args, single_candidate=True):
#         super().__init__(single_candidate)
#         self.args = args
#     def fit(self):
#         # only one candidate, so fit simply passes
#         pass
###########
"""
@abstractmethod
def __init__(self, single_candidate=True):
self.single_candidate = single_candidate
self._valid()
@abstractmethod
def fit(self, candidates=None):
"""
Evaluate the candidates and select the best one.
Any optimization algorithm can be implemented here.
If the input has only one candidate, just return that candidate directly.
"""
raise NotImplementedError
def _valid(self, ):
if self.single_candidate:
print("### single model, selecting finished ###")
exit(0)

+ 1
- 0
dubhe-tadl/spos/algorithms/random/__init__.py View File

@@ -0,0 +1 @@
from .mutator import RandomMutator

+ 36
- 0
dubhe-tadl/spos/algorithms/random/mutator.py View File

@@ -0,0 +1,36 @@
import torch
import torch.nn.functional as F

from nni.nas.pytorch.mutator import Mutator
from nni.nas.pytorch.mutables import LayerChoice, InputChoice


class RandomMutator(Mutator):
"""
Random mutator that samples a random candidate from the search space each time ``reset()`` is called.
It uses PyTorch's random functions, so users can set the PyTorch seed to ensure deterministic behavior.
"""

def sample_search(self):
"""
Sample a random candidate.
"""
result = dict()
for mutable in self.mutables:
if isinstance(mutable, LayerChoice):
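# pick exactly one operation: build a one-hot boolean mask over the candidates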
gen_index = torch.randint(high=len(mutable), size=(1, ))
result[mutable.key] = F.one_hot(gen_index, num_classes=len(mutable)).view(-1).bool()
elif isinstance(mutable, InputChoice):
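# with no n_chosen constraint, keep each input independently with probability 0.5;
# otherwise choose n_chosen inputs uniformly at random without replacement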
if mutable.n_chosen is None:
result[mutable.key] = torch.randint(high=2, size=(mutable.n_candidates,)).view(-1).bool()
else:
perm = torch.randperm(mutable.n_candidates)
mask = [i in perm[:mutable.n_chosen] for i in range(mutable.n_candidates)]
result[mutable.key] = torch.tensor(mask, dtype=torch.bool) # pylint: disable=not-callable
return result

def sample_final(self):
"""
Same as :meth:`sample_search`.
"""
return self.sample_search()
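
A brief usage sketch, assuming the `nni.nas.pytorch` legacy one-shot NAS API imported above is installed; `ToySpace` is a made-up two-operation search space used only for illustration:
```
import torch
import torch.nn as nn
from nni.nas.pytorch.mutables import LayerChoice


class ToySpace(nn.Module):
    # hypothetical search space: one choice between a 3x3 and a 5x5 convolution
    def __init__(self):
        super().__init__()
        self.conv = LayerChoice([nn.Conv2d(3, 8, 3, padding=1),
                                 nn.Conv2d(3, 8, 5, padding=2)])

    def forward(self, x):
        return self.conv(x)


torch.manual_seed(0)            # PyTorch seed makes the sampling reproducible
model = ToySpace()
mutator = RandomMutator(model)
mutator.reset()                 # samples and applies a random architecture
out = model(torch.randn(1, 3, 32, 32))
```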

+ 3
- 0
dubhe-tadl/spos/algorithms/spos/__init__.py View File

@@ -0,0 +1,3 @@
from .evolution import SPOSEvolution
from .mutator import SPOSSupernetTrainingMutator
from .trainer import SPOSSupernetTrainer

Some files were not shown because too many files changed in this diff
