- """
- Copyright 2020 Tianshu AI Platform. All Rights Reserved.
-
- Licensed under the Apache License, Version 2.0 (the "License");
- you may not use this file except in compliance with the License.
- You may obtain a copy of the License at
-
- http://www.apache.org/licenses/LICENSE-2.0
-
- Unless required by applicable law or agreed to in writing, software
- distributed under the License is distributed on an "AS IS" BASIS,
- WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- See the License for the specific language governing permissions and
- limitations under the License.
- =============================================================
- """

import math
from copy import deepcopy
from typing import Callable

import torch
import torch.nn as nn
import torch.nn.functional as F

from kamal.core.engine.engine import Engine
from kamal.core.engine.trainer import KDTrainer
from kamal.core.engine.hooks import FeatureHook
from kamal.core import tasks
from kamal.slim.prunning import Pruner, strategy
from kamal.utils import set_mode


def _assert_same_type(layers, layer_type=None):
    if layer_type is None:
        layer_type = type(layers[0])
    assert all(isinstance(l, layer_type) for l in layers), 'Model architectures must be the same'


def _get_layers(model_list):
    """Iterate over the corresponding layers of several models in lockstep."""
    submodel = [model.modules() for model in model_list]
    for layers in zip(*submodel):
        _assert_same_type(layers)
        yield layers


def bn_combine_fn(layers):
    """Combine 2D Batch Normalization layers.

    **Parameters:**
        - **layers** (BatchNorm2d): Batch Normalization layers to be combined.
    """
    _assert_same_type(layers, nn.BatchNorm2d)
    num_features = sum(l.num_features for l in layers)
    combined_bn = nn.BatchNorm2d(num_features=num_features,
                                 eps=layers[0].eps,
                                 momentum=layers[0].momentum,
                                 affine=layers[0].affine,
                                 track_running_stats=layers[0].track_running_stats)
    combined_bn.running_mean = torch.cat(
        [l.running_mean for l in layers], dim=0).clone()
    combined_bn.running_var = torch.cat(
        [l.running_var for l in layers], dim=0).clone()

    if combined_bn.affine:
        combined_bn.weight = torch.nn.Parameter(
            torch.cat([l.weight.data.clone() for l in layers], dim=0).clone())
        combined_bn.bias = torch.nn.Parameter(
            torch.cat([l.bias.data.clone() for l in layers], dim=0).clone())
    return combined_bn
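
# Illustrative sketch (not part of the original module): combining the first BN
# layers of two torchvision ResNet-18 models. `torchvision`/`resnet18` are
# assumptions used only for this example.
#
#   from torchvision.models import resnet18
#   bns = [resnet18().bn1, resnet18().bn1]    # two BatchNorm2d(64) layers
#   merged = bn_combine_fn(bns)               # -> BatchNorm2d(128)
#   assert merged.num_features == sum(l.num_features for l in bns)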


def conv2d_combine_fn(layers):
    """Combine 2D convolution layers.

    **Parameters:**
        - **layers** (Conv2d): convolution layers to be combined.
    """
    _assert_same_type(layers, nn.Conv2d)

    # Total output/input channels of the combined convolution.
    CO, CI = 0, 0
    for l in layers:
        O, I, H, W = l.weight.shape
        CO += O
        CI += I

    dtype = layers[0].weight.dtype
    device = layers[0].weight.device

    # Build plain tensors and fill them block-diagonally; wrapping them in
    # nn.Parameter before the in-place writes would trigger an autograd error.
    combined_weight = torch.zeros(CO, CI, H, W, dtype=dtype, device=device)
    if layers[0].bias is not None:
        combined_bias = torch.zeros(CO, dtype=dtype, device=device)
    else:
        combined_bias = None
    co_offset = 0
    ci_offset = 0
    for idx, l in enumerate(layers):
        co_len, ci_len = l.weight.shape[0], l.weight.shape[1]
        combined_weight[co_offset: co_offset+co_len,
                        ci_offset: ci_offset+ci_len, :, :] = l.weight.data.clone()
        if combined_bias is not None:
            combined_bias[co_offset: co_offset+co_len] = l.bias.data.clone()
        co_offset += co_len
        ci_offset += ci_len
    combined_conv2d = nn.Conv2d(in_channels=CI,
                                out_channels=CO,
                                kernel_size=layers[0].weight.shape[-2:],
                                stride=layers[0].stride,
                                padding=layers[0].padding,
                                bias=layers[0].bias is not None)
    combined_conv2d.weight.data = combined_weight
    if combined_bias is not None:
        combined_conv2d.bias.data = combined_bias
    for p in combined_conv2d.parameters():
        p.requires_grad = True
    return combined_conv2d
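
# Illustrative sketch (layer shapes are assumptions, not from the original
# source): the combined convolution is block-diagonal, so each source model's
# channels are processed independently.
#
#   convs = [nn.Conv2d(16, 32, 3, padding=1), nn.Conv2d(16, 32, 3, padding=1)]
#   merged = conv2d_combine_fn(convs)         # -> Conv2d(32, 64, 3, padding=1)
#   x = torch.randn(1, 32, 8, 8)              # inputs of both models, concatenated
#   y = merged(x)                             # -> shape (1, 64, 8, 8)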


def combine_models(models):
    """Combine a list of structurally identical models into a single wide model.

    Corresponding Conv2d/BatchNorm2d layers are merged with conv2d_combine_fn
    and bn_combine_fn; all other layers are taken from the first model.

    **Parameters:**
        - **models** (list of nn.Module): models to be combined.
    """
    def _recursively_combine(module):
        module_output = module

        if isinstance(module, nn.Conv2d):
            combined_module = conv2d_combine_fn(layer_mapping[module])
        elif isinstance(module, nn.BatchNorm2d):
            combined_module = bn_combine_fn(layer_mapping[module])
        else:
            combined_module = module

        if combined_module is not None:
            module_output = combined_module

        for name, child in module.named_children():
            module_output.add_module(name, _recursively_combine(child))
        return module_output

    models = deepcopy(models)
    combined_model = deepcopy(models[0])  # copy the model architecture and modify it with _recursively_combine

    layer_mapping = {}
    for combined_layer, layers in zip(combined_model.modules(), _get_layers(models)):
        layer_mapping[combined_layer] = layers  # link each layer of the copy to the teachers' layers
    combined_model = _recursively_combine(combined_model)
    return combined_model
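
# Illustrative sketch (the model choice is an assumption): combining two
# teachers with identical architectures into a single wide model.
#
#   from torchvision.models import resnet18
#   teachers = [resnet18(), resnet18()]
#   wide = combine_models(teachers)
#   # wide.conv1 is now a Conv2d(6, 128, ...) instead of Conv2d(3, 64, ...),
#   # and every BatchNorm2d carries the concatenated statistics of both teachers.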


class CombinedModel(nn.Module):
    def __init__(self, models):
        super(CombinedModel, self).__init__()
        self.combined_model = combine_models(models)
        self.expand = len(models)

    def forward(self, x):
        # The combined layers expect the input channels of all source models,
        # so the input is repeated `len(models)` times along the channel dim.
        x = x.repeat(1, self.expand, 1, 1)
        return self.combined_model(x)
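
# Illustrative sketch: the wrapper repeats the input channels so that a normal
# 3-channel batch can be fed to the widened network.
#
#   combined = CombinedModel([resnet18(), resnet18()])
#   logits = combined(torch.randn(4, 3, 224, 224))   # repeated to 6 channels internally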


class PruningKDTrainer(KDTrainer):
    def setup(
        self,
        student,
        teachers,
        task,
        dataloader: torch.utils.data.DataLoader,
        get_optimizer_and_scheduler: Callable = None,
        pruning_rounds=5,
        device=None,
    ):
        if device is None:
            device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
        self._device = device
        self._dataloader = dataloader
        self.model = self.student = student.to(self.device)
        self.teachers = nn.ModuleList(teachers).to(self.device)
        self.task = task  # keep a reference for the parent KDTrainer's step_fn
        self.get_optimizer_and_scheduler = get_optimizer_and_scheduler

    @property
    def device(self):
        return self._device

    def run(self, max_iter, start_iter=0, epoch_length=None, pruning_rounds=3, target_model_size=0.6):
        # Prune the same fraction of channels every round so that, after
        # `pruning_rounds` rounds, roughly `target_model_size` of the model remains:
        # (1 - pruning_size_per_round) ** pruning_rounds == target_model_size.
        pruning_size_per_round = 1 - math.pow(target_model_size, 1 / pruning_rounds)
        pruner = Pruner(strategy.LNStrategy(n=1))
        step_iter = (max_iter - start_iter) // pruning_rounds
        for pruning_round in range(pruning_rounds):
            pruner.prune(self.student, rate=pruning_size_per_round, example_inputs=torch.randn(1, 3, 240, 240))
            self.student.to(self.device)
            if self.get_optimizer_and_scheduler:
                self.optimizer, self.scheduler = self.get_optimizer_and_scheduler(self.student)
            else:
                self.optimizer = torch.optim.Adam(self.student.parameters(), lr=1e-4, weight_decay=1e-5)
                self.scheduler = torch.optim.lr_scheduler.CosineAnnealingLR(self.optimizer, T_max=(max_iter - start_iter) // pruning_rounds)

            # Fine-tune the pruned student on its share of the iteration budget.
            with set_mode(self.student, training=True), \
                 set_mode(self.teachers, training=False):
                super(PruningKDTrainer, self).run(self.step_fn, self._dataloader,
                    start_iter=start_iter + step_iter * pruning_round,
                    max_iter=start_iter + step_iter * (pruning_round + 1),
                    epoch_length=epoch_length)

    def step_fn(self, engine, batch):
        metrics = super(PruningKDTrainer, self).step_fn(engine, batch)
        self.scheduler.step()
        return metrics


class RecombinationAmalgamator(PruningKDTrainer):
    pass
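
# End-to-end sketch of one plausible wiring of this module. Everything below is
# illustrative: the task object, dataloader, models and constructor arguments
# are assumptions, not definitions from this file.
#
#   teachers = [teacher_a, teacher_b]            # pretrained, identical architectures
#   student = CombinedModel(teachers)            # wide model recombined from the teachers
#   amalgamator = RecombinationAmalgamator()     # constructor follows KDTrainer
#   amalgamator.setup(student=student, teachers=teachers, task=kd_task,
#                     dataloader=train_loader, device=torch.device('cuda'))
#   amalgamator.run(max_iter=30000, pruning_rounds=3, target_model_size=0.6)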