
Delete 'model/backbone/LeViT.py'

v1
limingjuan 2 years ago
parent commit fa3591bb11
1 changed file with 0 additions and 582 deletions
  1. model/backbone/LeViT.py  +0  -582

model/backbone/LeViT.py  (+0, -582)

@@ -1,582 +0,0 @@
# pylint: disable=E0401
# pylint: disable=W0201
"""
MindSpore implementation of `LeViT`.
Refer to "LeViT: a Vision Transformer in ConvNet’s Clothing for Faster Inference"
"""
import itertools

import mindspore as ms
from mindspore import Tensor
from mindspore import nn
from mindspore import context
from mindspore import numpy as np
from mindspore.common import initializer as init


from model.helper import load_pretrained
from model.registry import register_model


__all__ = [
"LeViT",
"LeViT_128S",
"LeViT_128",
"LeViT_192",
"LeViT_256",
"LeViT_384",
]


def _cfg(url='', **kwargs):
return {
'url': url,
'num_classes': 1000,
'input_size': (3, 224, 224),
# 'mean': IMAGENET_DEFAULT_MEAN, 'std': IMAGENET_DEFAULT_STD,
'first_conv': 'patch_embed.proj', 'classifier': 'head',
**kwargs
}
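# Illustrative only: the merged configuration this helper produces with the
# defaults assumed throughout this file.
#   cfg = _cfg()
#   cfg['input_size']   # (3, 224, 224)
#   cfg['classifier']   # 'head'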


default_cfgs = {
'LeViT_128S': _cfg(url=''),
'LeViT_128': _cfg(url=''),
'LeViT_192': _cfg(url=''),
'LeViT_256': _cfg(url=''),
'LeViT_384': _cfg(url='')
}

FLOPS_COUNTER = 0


class Conv2d_BN(nn.SequentialCell):
""" Conv2d and BatchNorm """
def __init__(self,
a: int,
b: int,
ks: int = 1,
stride: int = 1,
pad: int = 0,
dilation: int = 1,
group: int = 1) -> None:
super().__init__()
self.conv = nn.Conv2d(in_channels=a,
out_channels=b,
kernel_size=ks,
stride=stride,
padding=pad,
dilation=dilation,
group=group,
has_bias=False,
pad_mode="pad")

self.bn = nn.BatchNorm2d(num_features=b,
gamma_init="ones",
beta_init="zeros",
use_batch_statistics=True,
momentum=0.9) # MindSpore momentum=0.9 corresponds to PyTorch BatchNorm momentum=0.1

def construct(self, input_data: Tensor) -> Tensor:
x = self.conv(input_data)
x = self.bn(x)
return x
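# Illustrative shape sketch for Conv2d_BN (not executed as part of the model):
#   blk = Conv2d_BN(3, 16, ks=3, stride=2, pad=1)
#   blk(ms.ops.ones((1, 3, 224, 224), ms.float32)).shape   # (1, 16, 112, 112)
# The stride-2 convolution halves H and W; BatchNorm keeps the shape unchanged.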


class Linear_BN(nn.SequentialCell):
""" Dense and BatchNorm """
def __init__(self,
a: int,
b: int) -> None:
super().__init__()

self.linear = nn.Dense(a,
b,
weight_init='Uniform',
bias_init='Uniform',
has_bias=False)

self.bn1d = nn.BatchNorm1d(num_features=b,
gamma_init="ones",
beta_init="zeros",
momentum=0.9)

def construct(self, input_data: Tensor) -> Tensor:
x = self.linear(input_data)
x1, x2, x3 = x.shape
new_x = ms.ops.reshape(x, (x1 * x2, x3))
x = self.bn1d(new_x).reshape(x.shape)
return x
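# Linear_BN applies BatchNorm1d across tokens by flattening (B, N, C) to (B * N, C)
# and restoring the shape afterwards. Illustrative sketch (not executed here):
#   lb = Linear_BN(128, 256)
#   lb(ms.ops.ones((2, 196, 128), ms.float32)).shape   # (2, 196, 256)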


class BN_Linear(nn.SequentialCell):
""" BatchNorm and Dense """
def __init__(self,
a: int,
b: int,
bias: bool = True,
std: float = 0.02) -> None:
super().__init__()

self.bn1d = nn.BatchNorm1d(num_features=a,
gamma_init="ones",
beta_init="zeros",
momentum=0.9)

self.linear = nn.Dense(a,
b,
weight_init=init.TruncatedNormal(sigma=std),
bias_init='zeros',
has_bias=bias)

def construct(self, input_data: Tensor) -> Tensor:
x = self.bn1d(input_data)
x = self.linear(x)
return x
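# BN_Linear is the classifier-head variant: normalise the pooled features first,
# then project them with a TruncatedNormal-initialised Dense layer. Sketch:
#   head = BN_Linear(384, 1000)
#   head(ms.ops.ones((4, 384), ms.float32)).shape   # (4, 1000)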


class Residual(nn.Cell):
""" Residual connection with optional stochastic depth on the wrapped branch """
def __init__(self,
m: nn.Cell = None,
drop: float = 0.) -> None:
super().__init__()
self.m = m
self.drop = drop

def construct(self, x: Tensor) -> Tensor:
if self.training and self.drop > 0:
# Drop the residual branch per sample with probability `drop` and rescale the
# survivors by 1 / (1 - drop); the reference LeViT draws this mask from a
# uniform distribution, so np.rand is used here rather than np.randn.
mask = (np.rand((x.shape[0], 1, 1)) > self.drop).astype(x.dtype) / (1 - self.drop)
return x + self.m(x) * mask
return x + self.m(x)
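# In evaluation mode Residual is a plain skip connection; in training mode with
# drop > 0 the wrapped branch is dropped per sample and rescaled (stochastic
# depth). Illustrative sketch (not executed here):
#   res = Residual(nn.Dense(64, 64), drop=0.1)
#   res(ms.ops.ones((2, 196, 64), ms.float32)).shape   # (2, 196, 64)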


def b16(n, activation=nn.HSwish):
""" Convolutional stem: four stride-2 Conv2d_BN blocks (overall stride 16) """
return nn.SequentialCell(
Conv2d_BN(3, n // 8, 3, 2, 1),
activation(),
Conv2d_BN(n // 8, n // 4, 3, 2, 1),
activation(),
Conv2d_BN(n // 4, n // 2, 3, 2, 1),
activation(),
Conv2d_BN(n // 2, n, 3, 2, 1))
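# The stem reduces spatial resolution by 16x over four stride-2 stages while the
# channel count grows n // 8 -> n // 4 -> n // 2 -> n. Illustrative shapes for
# n = 128 (a sketch, not executed here):
#   stem = b16(128)
#   stem(ms.ops.ones((1, 3, 224, 224), ms.float32)).shape   # (1, 128, 14, 14)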


class Subsample(nn.Cell):
""" DownSample """
def __init__(self,
stride: int,
resolution: int):
super().__init__()
self.stride = stride
self.resolution = resolution

def construct(self, x: Tensor) -> Tensor:
B, _, C = x.shape
x = x.view(B, self.resolution, self.resolution, C)[
:, ::self.stride, ::self.stride].reshape(B, -1, C)
return x
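# Subsample keeps every `stride`-th token along both axes of the token grid, e.g.
# a 14x14 grid becomes 7x7. Illustrative sketch (not executed here):
#   sub = Subsample(stride=2, resolution=14)
#   sub(ms.ops.ones((2, 196, 128), ms.float32)).shape   # (2, 49, 128)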


class Attention(nn.Cell):
""" Attention """
def __init__(self,
dim: int,
key_dim: int,
num_heads: int = 8,
attn_ratio: int = 4,
activation: type = None,
resolution: int = 14) -> None:

super().__init__()

self.num_heads = num_heads
self.scale = key_dim ** -0.5
self.key_dim = key_dim
self.nh_kd = nh_kd = key_dim * num_heads
self.d = int(attn_ratio * key_dim)
self.dh = int(attn_ratio * key_dim) * num_heads
self.attn_ratio = attn_ratio
h = self.dh + nh_kd * 2
self.qkv = Linear_BN(dim, h)
self.proj = nn.SequentialCell(activation(), Linear_BN(self.dh, dim))

points = list(itertools.product(range(resolution), range(resolution)))  # all (row, col) positions on the token grid
self.N = len(points)
self.softmax = nn.Softmax(axis=-1)

attention_offsets = {}
idxs = []
for p1 in points:
for p2 in points:
offset = (abs(p1[0] - p2[0]), abs(p1[1] - p2[1]))
if offset not in attention_offsets:
attention_offsets[offset] = len(attention_offsets)
idxs.append(attention_offsets[offset])

self.attention_biases = ms.Parameter(
Tensor(np.zeros([num_heads, len(attention_offsets)], np.float32)))

attention_bias_idxs = ms.Tensor(idxs, dtype=ms.int64).view(self.N, self.N)
self.attention_bias_idxs = ms.Parameter(attention_bias_idxs, requires_grad=False)

# Bias table gathered once at construction time and reused at inference;
# the training branch in construct() re-gathers it so the biases stay learnable.
self.ab = self.attention_biases[:, self.attention_bias_idxs]

def construct(self, x: Tensor) -> Tensor:
B, N, _ = x.shape
atte = self.qkv(x).view(B, N, self.num_heads, -1)
q, k, v = ms.ops.split(atte, [self.key_dim, self.key_dim, self.d], axis=3)  # d = attn_ratio * key_dim channels for v
q = ms.ops.transpose(q, (0, 2, 1, 3))
k = ms.ops.transpose(k, (0, 2, 1, 3))
v = ms.ops.transpose(v, (0, 2, 1, 3))

attn = (
(ms.ops.matmul(q, ms.ops.transpose(k, (0, 1, 3, 2)))) * self.scale
+
(self.attention_biases[:, self.attention_bias_idxs]
if self.training else self.ab)
)

attn = self.softmax(attn)

x = ms.ops.transpose((ms.ops.matmul(attn, v)), (0, 2, 1, 3))

x = x.reshape(B, N, self.dh)

x = self.proj(x)

return x
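# Illustrative shape sketch (not executed here). With dim=128, key_dim=16,
# num_heads=4 and attn_ratio=2, the qkv projection emits h = dh + 2 * nh_kd
# = 128 + 128 = 256 channels per token and the block maps (B, N, 128) back to
# (B, N, 128):
#   attn = Attention(128, 16, num_heads=4, attn_ratio=2,
#                    activation=nn.HSwish, resolution=14)
#   attn(ms.ops.ones((2, 196, 128), ms.float32)).shape   # (2, 196, 128)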


class AttentionSubsample(nn.Cell):
""" Attention SubSample """
def __init__(self,
in_dim: int,
out_dim: int,
key_dim: int,
num_heads: int = 8,
attn_ratio: int = 2,
activation: type = None,
stride: int = 2,
resolution: int = 14,
resolution_: int = 7) -> None:
super().__init__()

self.num_heads = num_heads
self.scale = key_dim ** -0.5
self.key_dim = key_dim
self.nh_kd = nh_kd = key_dim * num_heads
self.d = int(attn_ratio * key_dim)
self.dh = int(attn_ratio * key_dim) * self.num_heads
self.attn_ratio = attn_ratio
self.resolution_ = resolution_
self.resolution_2 = resolution_ ** 2
h = self.dh + nh_kd
self.kv = Linear_BN(in_dim, h)

self.q = nn.SequentialCell(
Subsample(stride, resolution),
Linear_BN(in_dim, nh_kd))
self.proj = nn.SequentialCell(activation(), Linear_BN(self.dh, out_dim))
self.softmax = nn.Softmax(axis=-1)
self.stride = stride
self.resolution = resolution
points = list(itertools.product(range(resolution), range(resolution)))
points_ = list(itertools.product(range(resolution_), range(resolution_)))

N = len(points)
N_ = len(points_)
attention_offsets = {}
idxs = []
for p1 in points_:
for p2 in points:
size = 1
offset = (
abs(p1[0] * stride - p2[0] + (size - 1) / 2),
abs(p1[1] * stride - p2[1] + (size - 1) / 2))
if offset not in attention_offsets:
attention_offsets[offset] = len(attention_offsets)
idxs.append(attention_offsets[offset])

self.attention_biases = ms.Parameter(
Tensor(np.zeros([num_heads, len(attention_offsets)], np.float32)))

attention_bias_idxs = (ms.Tensor(idxs, dtype=ms.int64)).view((N_, N))
self.attention_bias_idxs = ms.Parameter(attention_bias_idxs, requires_grad=False)

self.ab = self.attention_biases[:, self.attention_bias_idxs]

def construct(self,
x: Tensor) -> Tensor:

B, N, _ = x.shape
atte = self.kv(x).view(B, N, self.num_heads, -1)
k, v = ms.ops.split(atte, [self.key_dim, atte.shape[3] - self.key_dim], axis=3)
v = ms.ops.transpose(v, (0, 2, 1, 3))
k = ms.ops.transpose(k, (0, 2, 1, 3))

q = self.q(x).view(B, self.resolution_2, self.num_heads, self.key_dim)
q = ms.ops.transpose(q, (0, 2, 1, 3))

attn = (
ms.ops.matmul(q, ms.ops.transpose(k, (0, 1, 3, 2))) * self.scale +
(self.attention_biases[:, self.attention_bias_idxs]
if self.training else self.ab)
)

attn = self.softmax(attn)

x = ms.ops.transpose((ms.ops.matmul(attn, v)), (0, 2, 1, 3))
x = x.reshape(B, -1, self.dh)
x = self.proj(x)
return x
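# AttentionSubsample queries from the subsampled 7x7 grid against keys/values on
# the full 14x14 grid, so it downsamples the tokens and widens the embedding in
# one step. Illustrative shapes for the first LeViT-128S transition (sketch only):
#   asub = AttentionSubsample(128, 256, key_dim=16, num_heads=8, attn_ratio=4,
#                             activation=nn.HSwish, stride=2,
#                             resolution=14, resolution_=7)
#   asub(ms.ops.ones((2, 196, 128), ms.float32)).shape   # (2, 49, 256)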


class LeViT(nn.Cell):
""" Vision Transformer with support for patch or hybrid CNN input stage
"""

def __init__(self,
img_size: int = 224,
patch_size: int = 16,
num_classes: int = 1000,
embed_dim=None,
key_dim=None,
depth=None,
num_heads=None,
attn_ratio=None,
mlp_ratio=None,
hybrid_backbone: nn.Cell = None,
down_ops=None,
attention_activation: type = nn.HSwish,
mlp_activation: type = nn.HSwish,
distillation: bool = True,
drop_path: float = 0.):
super().__init__()

if embed_dim is None:
embed_dim = [128, 256, 384]
if key_dim is None:
key_dim = [16, 16, 16]
if depth is None:
depth = [2, 3, 4]
if num_heads is None:
num_heads = [4, 6, 8]
if attn_ratio is None:
attn_ratio = [2, 2, 2]
if mlp_ratio is None:
mlp_ratio = [2, 2, 2]
if down_ops is None:
down_ops = [['Subsample', 16, 128 // 16, 4, 2, 2], ['Subsample', 16, 256 // 16, 4, 2, 2]]
if hybrid_backbone is None:
# Built here rather than as a default argument so every LeViT instance
# gets its own stem instead of sharing one created at import time.
hybrid_backbone = b16(128, activation=nn.HSwish)
self.num_classes = num_classes
self.num_features = embed_dim[-1]
self.embed_dim = embed_dim
self.distillation = distillation
self.patch_embed = hybrid_backbone
self.blocks = []

down_ops.append([''])
resolution = img_size // patch_size
for i, (ed, kd, dpth, nh, ar, mr, do) in enumerate(
zip(embed_dim, key_dim, depth, num_heads, attn_ratio, mlp_ratio, down_ops)):
for _ in range(dpth):
self.blocks.append(
Residual(Attention(
ed, kd, nh,
attn_ratio=ar,
activation=attention_activation,
resolution=resolution,
), drop_path))
if mr > 0:
h = int(ed * mr)
self.blocks.append(
Residual(nn.SequentialCell(
Linear_BN(ed, h),
mlp_activation(),
Linear_BN(h, ed),
), drop_path))

if do[0] == 'Subsample':
resolution_ = (resolution - 1) // do[5] + 1
self.blocks.append(
AttentionSubsample(
*embed_dim[i:i + 2], key_dim=do[1], num_heads=do[2],
attn_ratio=do[3],
activation=attention_activation,
stride=do[5],
resolution=resolution,
resolution_=resolution_))
resolution = resolution_
if do[4] > 0: # mlp_ratio
h = int(embed_dim[i + 1] * do[4])
self.blocks.append(
Residual(nn.SequentialCell(
Linear_BN(embed_dim[i + 1], h),
mlp_activation(),
Linear_BN(
h, embed_dim[i + 1]),
), drop_path))
self.blocks = nn.SequentialCell(*self.blocks)

# Classifier head (identity when num_classes == 0, as in the reference implementation)
self.head = BN_Linear(embed_dim[-1], num_classes) if num_classes > 0 else nn.Identity()
if distillation:
self.head_dist = BN_Linear(embed_dim[-1], num_classes) if num_classes > 0 else nn.Identity()

self.apply(self._init_weights)

def _init_weights(self, cell):
""" initialize weights """
if isinstance(cell, nn.Dense):
cell.weight.set_data(init.initializer(init.TruncatedNormal(sigma=.02), cell.weight.data.shape))
if cell.bias is not None:
cell.bias.set_data(init.initializer('zeros', cell.bias.shape))
elif isinstance(cell, nn.LayerNorm):
cell.gamma.set_data(init.initializer('ones', cell.gamma.shape))
cell.beta.set_data(init.initializer('zeros', cell.beta.shape))

def construct(self, x: Tensor) -> Tensor:

x = self.patch_embed(x)
B, C, H, W = x.shape
x = x.reshape(B, C, H * W)
x = ms.ops.transpose(x, (0, 2, 1))
x = self.blocks(x)
x = x.mean(1)
if self.distillation:
x = self.head(x), self.head_dist(x)
if not self.training:
x = (x[0] + x[1]) / 2
else:
x = self.head(x)
return x
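# End-to-end shape sketch for the default configuration (illustrative): the b16
# stem turns (B, 3, 224, 224) into a (B, 196, 128) token sequence, the two
# AttentionSubsample transitions shrink it to (B, 49, 256) and then (B, 16, 384),
# and mean pooling plus the BN_Linear head(s) produce (B, num_classes) logits.
#   net = LeViT(num_classes=1000)
#   net.set_train(False)
#   net(ms.ops.ones((1, 3, 224, 224), ms.float32)).shape   # (1, 1000) in eval mode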


@register_model
def LeViT_128S(pretrained: bool = False, num_classes: int = 1000, in_channels=3, **kwargs) -> LeViT:
""" LeViT_128S """
default_cfg = default_cfgs['LeViT_128S']
model = LeViT(num_classes=num_classes,
embed_dim=[128, 256, 384],
num_heads=[4, 6, 8],
key_dim=[16, 16, 16],
depth=[2, 3, 4],
down_ops=[
['Subsample', 16, 128 // 16, 4, 2, 2],
['Subsample', 16, 256 // 16, 4, 2, 2],
],
hybrid_backbone=b16(128),
**kwargs)
model.default_cfg = default_cfg

if pretrained:
load_pretrained(model, default_cfg, num_classes=num_classes, in_channels=in_channels)

return model


@register_model
def LeViT_128(pretrained: bool = False, num_classes: int = 1000, in_channels=3, **kwargs) -> LeViT:
""" LeViT_128 """
default_cfg = default_cfgs['LeViT_128']
model = LeViT(num_classes=num_classes,
embed_dim=[128, 256, 384],
num_heads=[4, 8, 12],
key_dim=[16, 16, 16],
depth=[4, 4, 4],
down_ops=[
['Subsample', 16, 128 // 16, 4, 2, 2],
['Subsample', 16, 256 // 16, 4, 2, 2],
],
hybrid_backbone=b16(128),
**kwargs)
model.default_cfg = default_cfg

if pretrained:
load_pretrained(model, default_cfg, num_classes=num_classes, in_channels=in_channels)

return model


@register_model
def LeViT_192(pretrained: bool = False, num_classes: int = 1000, in_channels=3, **kwargs) -> LeViT:
""" LeViT_192 """
default_cfg = default_cfgs['LeViT_192']
model = LeViT(num_classes=num_classes,
embed_dim=[192, 288, 384],
num_heads=[3, 5, 6],
key_dim=[32, 32, 32],
depth=[4, 4, 4],
down_ops=[
['Subsample', 32, 192 // 32, 4, 2, 2],
['Subsample', 32, 288 // 32, 4, 2, 2],
],
hybrid_backbone=b16(192),
**kwargs)
model.default_cfg = default_cfg

if pretrained:
load_pretrained(model, default_cfg, num_classes=num_classes, in_channels=in_channels)

return model


@register_model
def LeViT_256(pretrained: bool = False, num_classes: int = 1000, in_channels=3, **kwargs) -> LeViT:
""" LeViT_256 """
default_cfg = default_cfgs['LeViT_256']
model = LeViT(num_classes=num_classes,
embed_dim=[256, 384, 512],
num_heads=[4, 6, 8],
key_dim=[32, 32, 32],
depth=[4, 4, 4],
down_ops=[
['Subsample', 32, 256 // 32, 4, 2, 2],
['Subsample', 32, 384 // 32, 4, 2, 2],
],
hybrid_backbone=b16(256),
**kwargs)
model.default_cfg = default_cfg

if pretrained:
load_pretrained(model, default_cfg, num_classes=num_classes, in_channels=in_channels)

return model


@register_model
def LeViT_384(pretrained: bool = False, num_classes: int = 1000, in_channels=3, **kwargs) -> LeViT:
""" LeViT_384 """
default_cfg = default_cfgs['LeViT_384']
model = LeViT(num_classes=num_classes,
embed_dim=[384, 512, 768],
num_heads=[6, 9, 12],
key_dim=[32, 32, 32],
depth=[4, 4, 4],
down_ops=[
['Subsample', 32, 384 // 32, 4, 2, 2],
['Subsample', 32, 512 // 32, 4, 2, 2],
],
hybrid_backbone=b16(384),
**kwargs)
model.default_cfg = default_cfg

if pretrained:
load_pretrained(model, default_cfg, num_classes=num_classes, in_channels=in_channels)

return model


if __name__ == '__main__':
context.set_context(mode=context.PYNATIVE_MODE, device_target="CPU")

net = LeViT_128S()
# print(net)
dummy_input = ms.ops.rand((4, 3, 224, 224))
output = net(dummy_input)
print(output.shape)
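# A possible extension of this smoke test (left as a comment; assumes the standard
# Cell.trainable_params() and Tensor.size APIs): report the parameter count as a
# rough sanity check.
#   n_params = sum(p.size for p in net.trainable_params())
#   print(f"trainable parameters: {n_params}")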
