|
- # Copyright (c) OpenMMLab. All rights reserved.
- import copy
-
- import torch
- import torch.nn as nn
- import torch.nn.functional as F
- from mmcv.cnn import Linear, bias_init_with_prob, constant_init
- from mmcv.runner import force_fp32
-
- from mmdet.core import multi_apply
- from mmdet.models.utils.transformer import inverse_sigmoid
- from ..builder import HEADS
- from .detr_head import DETRHead
-
-
@HEADS.register_module()
class DeformableDETRHead(DETRHead):
    """Head of DeformDETR: Deformable DETR: Deformable Transformers for End-to-
    End Object Detection.

    Code is modified from the `official github repo
    <https://github.com/fundamentalvision/Deformable-DETR>`_.

    More details can be found in the `paper
    <https://arxiv.org/abs/2010.04159>`_ .

    Args:
        with_box_refine (bool): Whether to refine the reference points
            in the decoder. Defaults to False.
        as_two_stage (bool): Whether to generate the proposal from
            the outputs of encoder. Defaults to False.
        transformer (obj:`ConfigDict`): ConfigDict is used for building
            the Encoder and Decoder. Defaults to None.
    """

    def __init__(self,
                 *args,
                 with_box_refine=False,
                 as_two_stage=False,
                 transformer=None,
                 **kwargs):
        # These flags are set before the parent constructor runs because
        # ``_init_layers`` reads them.
        # NOTE(review): this relies on the parent ``__init__`` being the one
        # that calls ``_init_layers`` -- confirm against ``DETRHead``.
        self.with_box_refine = with_box_refine
        self.as_two_stage = as_two_stage
        if self.as_two_stage:
            # NOTE: this writes into the caller's config object in place and
            # requires ``transformer`` to be a mapping (not None).
            transformer['as_two_stage'] = self.as_two_stage

        super(DeformableDETRHead, self).__init__(
            *args, transformer=transformer, **kwargs)

    def _init_layers(self):
        """Initialize classification branch and regression branch of head."""

        fc_cls = Linear(self.embed_dims, self.cls_out_channels)

        # Regression branch: ``num_reg_fcs`` (Linear + ReLU) pairs followed
        # by a final Linear that outputs 4 values per query.
        reg_branch = []
        for _ in range(self.num_reg_fcs):
            reg_branch.append(Linear(self.embed_dims, self.embed_dims))
            reg_branch.append(nn.ReLU())
        reg_branch.append(Linear(self.embed_dims, 4))
        reg_branch = nn.Sequential(*reg_branch)

        def _get_clones(module, N):
            """Return a ModuleList holding N independent deep copies."""
            return nn.ModuleList([copy.deepcopy(module) for _ in range(N)])

        # last reg_branch is used to generate proposal from
        # encode feature map when as_two_stage is True.
        num_pred = (self.transformer.decoder.num_layers + 1) if \
            self.as_two_stage else self.transformer.decoder.num_layers

        if self.with_box_refine:
            # Every prediction layer gets its own parameters.
            self.cls_branches = _get_clones(fc_cls, num_pred)
            self.reg_branches = _get_clones(reg_branch, num_pred)
        else:
            # The same module instance is repeated, so all prediction
            # layers share one set of parameters.
            self.cls_branches = nn.ModuleList(
                [fc_cls for _ in range(num_pred)])
            self.reg_branches = nn.ModuleList(
                [reg_branch for _ in range(num_pred)])

        if not self.as_two_stage:
            # Learned query embedding, only created in the one-stage setup.
            # NOTE(review): the ``embed_dims * 2`` width is presumably split
            # by the transformer into two ``embed_dims`` parts -- confirm.
            self.query_embedding = nn.Embedding(self.num_query,
                                                self.embed_dims * 2)

    def init_weights(self):
        """Initialize weights of the DeformDETR head."""
        self.transformer.init_weights()
        if self.loss_cls.use_sigmoid:
            bias_init = bias_init_with_prob(0.01)
            for m in self.cls_branches:
                nn.init.constant_(m.bias, bias_init)
        # Zero the weight and bias of the last layer of every reg branch.
        for m in self.reg_branches:
            constant_init(m[-1], 0, bias=0)
        # Set bias entries 2: of the first branch's last layer to -2.0.
        # Without box refinement all branches are the same module, so this
        # affects every entry of ``reg_branches``.
        nn.init.constant_(self.reg_branches[0][-1].bias.data[2:], -2.0)
        if self.as_two_stage:
            # In two-stage mode those bias entries are reset to 0.0 for
            # every branch, which also overrides the -2.0 set just above.
            for m in self.reg_branches:
                nn.init.constant_(m[-1].bias.data[2:], 0.0)

    def forward(self, mlvl_feats, img_metas):
        """Forward function.

        Args:
            mlvl_feats (tuple[Tensor]): Features from the upstream
                network, each is a 4D-tensor with shape
                (N, C, H, W).
            img_metas (list[dict]): List of image information. The keys
                ``batch_input_shape`` and ``img_shape`` are read here.

        Returns:
            all_cls_scores (Tensor): Outputs from the classification head, \
                shape [nb_dec, bs, num_query, cls_out_channels]. Note \
                cls_out_channels should include background.
            all_bbox_preds (Tensor): Sigmoid outputs from the regression \
                head with normalized coordinate format (cx, cy, w, h). \
                Shape [nb_dec, bs, num_query, 4].
            enc_outputs_class (Tensor): The score of each point on encode \
                feature map, has shape (N, h*w, num_class). Only when \
                as_two_stage is True it would be returned, otherwise \
                `None` would be returned.
            enc_outputs_coord (Tensor): The proposal generate from the \
                encode feature map, has shape (N, h*w, 4). Only when \
                as_two_stage is True it would be returned, otherwise \
                `None` would be returned.
        """

        # Build a per-image mask over the batch input canvas: start from
        # all ones and zero out the region covered by each image, so ones
        # are left only where the image does not reach.
        batch_size = mlvl_feats[0].size(0)
        input_img_h, input_img_w = img_metas[0]['batch_input_shape']
        img_masks = mlvl_feats[0].new_ones(
            (batch_size, input_img_h, input_img_w))
        for img_id in range(batch_size):
            img_h, img_w, _ = img_metas[img_id]['img_shape']
            img_masks[img_id, :img_h, :img_w] = 0

        # Resize the mask to each feature level's spatial size and compute
        # the matching positional encoding.
        mlvl_masks = []
        mlvl_positional_encodings = []
        for feat in mlvl_feats:
            mlvl_masks.append(
                F.interpolate(img_masks[None],
                              size=feat.shape[-2:]).to(torch.bool).squeeze(0))
            mlvl_positional_encodings.append(
                self.positional_encoding(mlvl_masks[-1]))

        query_embeds = None
        if not self.as_two_stage:
            query_embeds = self.query_embedding.weight

        # The branches are handed to the transformer only when the
        # corresponding feature is enabled.
        reg_branches = self.reg_branches if self.with_box_refine else None
        cls_branches = self.cls_branches if self.as_two_stage else None
        hs, init_reference, inter_references, \
            enc_outputs_class, enc_outputs_coord = self.transformer(
                mlvl_feats,
                mlvl_masks,
                query_embeds,
                mlvl_positional_encodings,
                reg_branches=reg_branches,
                cls_branches=cls_branches)
        # Swap dims 1 and 2.
        # NOTE(review): presumably (nb_dec, num_query, bs, dims) ->
        # (nb_dec, bs, num_query, dims), per the documented output shape --
        # confirm against the transformer.
        hs = hs.permute(0, 2, 1, 3)
        outputs_classes = []
        outputs_coords = []

        for lvl in range(hs.shape[0]):
            # Layer 0 uses the initial reference; later layers use the
            # reference stored for the previous layer.
            if lvl == 0:
                reference = init_reference
            else:
                reference = inter_references[lvl - 1]
            reference = inverse_sigmoid(reference)
            outputs_class = self.cls_branches[lvl](hs[lvl])
            tmp = self.reg_branches[lvl](hs[lvl])
            # The regression output is added to the un-sigmoided reference
            # before the final sigmoid; a 2-value reference only offsets
            # the first two box entries.
            if reference.shape[-1] == 4:
                tmp += reference
            else:
                assert reference.shape[-1] == 2
                tmp[..., :2] += reference
            outputs_coord = tmp.sigmoid()
            outputs_classes.append(outputs_class)
            outputs_coords.append(outputs_coord)

        outputs_classes = torch.stack(outputs_classes)
        outputs_coords = torch.stack(outputs_coords)
        if self.as_two_stage:
            return outputs_classes, outputs_coords, \
                enc_outputs_class, \
                enc_outputs_coord.sigmoid()
        else:
            return outputs_classes, outputs_coords, \
                None, None

    # ``apply_to`` must name parameters of the decorated function, so it
    # lists ``all_cls_scores`` / ``all_bbox_preds`` exactly as spelled in
    # the signature below.
    @force_fp32(apply_to=('all_cls_scores', 'all_bbox_preds'))
    def loss(self,
             all_cls_scores,
             all_bbox_preds,
             enc_cls_scores,
             enc_bbox_preds,
             gt_bboxes_list,
             gt_labels_list,
             img_metas,
             gt_bboxes_ignore=None):
        """Loss function.

        Args:
            all_cls_scores (Tensor): Classification score of all
                decoder layers, has shape
                [nb_dec, bs, num_query, cls_out_channels].
            all_bbox_preds (Tensor): Sigmoid regression
                outputs of all decode layers. Each is a 4D-tensor with
                normalized coordinate format (cx, cy, w, h) and shape
                [nb_dec, bs, num_query, 4].
            enc_cls_scores (Tensor): Classification scores of
                points on encode feature map , has shape
                (N, h*w, num_classes). Only be passed when as_two_stage is
                True, otherwise is None.
            enc_bbox_preds (Tensor): Regression results of each points
                on the encode feature map, has shape (N, h*w, 4). Only be
                passed when as_two_stage is True, otherwise is None.
            gt_bboxes_list (list[Tensor]): Ground truth bboxes for each image
                with shape (num_gts, 4) in [tl_x, tl_y, br_x, br_y] format.
            gt_labels_list (list[Tensor]): Ground truth class indices for each
                image with shape (num_gts, ).
            img_metas (list[dict]): List of image meta information.
            gt_bboxes_ignore (list[Tensor], optional): Bounding boxes
                which can be ignored for each image. Default None.

        Returns:
            dict[str, Tensor]: A dictionary of loss components. The last
            decoder layer is stored under ``loss_cls`` / ``loss_bbox`` /
            ``loss_iou``, earlier layers under ``d{i}.loss_*`` and, when
            ``enc_cls_scores`` is given, the encoder proposals under
            ``enc_loss_*``.
        """
        assert gt_bboxes_ignore is None, \
            f'{self.__class__.__name__} only supports ' \
            f'for gt_bboxes_ignore setting to None.'

        # Repeat the targets once per decoder layer so that every layer's
        # predictions are scored against the same ground truth.
        num_dec_layers = len(all_cls_scores)
        all_gt_bboxes_list = [gt_bboxes_list for _ in range(num_dec_layers)]
        all_gt_labels_list = [gt_labels_list for _ in range(num_dec_layers)]
        all_gt_bboxes_ignore_list = [
            gt_bboxes_ignore for _ in range(num_dec_layers)
        ]
        img_metas_list = [img_metas for _ in range(num_dec_layers)]

        losses_cls, losses_bbox, losses_iou = multi_apply(
            self.loss_single, all_cls_scores, all_bbox_preds,
            all_gt_bboxes_list, all_gt_labels_list, img_metas_list,
            all_gt_bboxes_ignore_list)

        loss_dict = dict()
        # loss of proposal generated from encode feature map.
        if enc_cls_scores is not None:
            # Every ground-truth label is replaced by 0 for the encoder
            # proposal loss.
            binary_labels_list = [
                torch.zeros_like(gt_labels_list[i])
                for i in range(len(img_metas))
            ]
            enc_loss_cls, enc_losses_bbox, enc_losses_iou = \
                self.loss_single(enc_cls_scores, enc_bbox_preds,
                                 gt_bboxes_list, binary_labels_list,
                                 img_metas, gt_bboxes_ignore)
            loss_dict['enc_loss_cls'] = enc_loss_cls
            loss_dict['enc_loss_bbox'] = enc_losses_bbox
            loss_dict['enc_loss_iou'] = enc_losses_iou

        # loss from the last decoder layer
        loss_dict['loss_cls'] = losses_cls[-1]
        loss_dict['loss_bbox'] = losses_bbox[-1]
        loss_dict['loss_iou'] = losses_iou[-1]
        # loss from other decoder layers
        for num_dec_layer, (loss_cls_i, loss_bbox_i, loss_iou_i) in enumerate(
                zip(losses_cls[:-1], losses_bbox[:-1], losses_iou[:-1])):
            loss_dict[f'd{num_dec_layer}.loss_cls'] = loss_cls_i
            loss_dict[f'd{num_dec_layer}.loss_bbox'] = loss_bbox_i
            loss_dict[f'd{num_dec_layer}.loss_iou'] = loss_iou_i
        return loss_dict

    # ``apply_to`` must name parameters of the decorated function, so it
    # lists ``all_cls_scores`` / ``all_bbox_preds`` exactly as spelled in
    # the signature below.
    @force_fp32(apply_to=('all_cls_scores', 'all_bbox_preds'))
    def get_bboxes(self,
                   all_cls_scores,
                   all_bbox_preds,
                   enc_cls_scores,
                   enc_bbox_preds,
                   img_metas,
                   rescale=False):
        """Transform network outputs for a batch into bbox predictions.

        Only the outputs of the last decoder layer are used;
        ``enc_cls_scores`` and ``enc_bbox_preds`` are accepted but not read.

        Args:
            all_cls_scores (Tensor): Classification score of all
                decoder layers, has shape
                [nb_dec, bs, num_query, cls_out_channels].
            all_bbox_preds (Tensor): Sigmoid regression
                outputs of all decode layers. Each is a 4D-tensor with
                normalized coordinate format (cx, cy, w, h) and shape
                [nb_dec, bs, num_query, 4].
            enc_cls_scores (Tensor): Classification scores of
                points on encode feature map , has shape
                (N, h*w, num_classes). Only be passed when as_two_stage is
                True, otherwise is None.
            enc_bbox_preds (Tensor): Regression results of each points
                on the encode feature map, has shape (N, h*w, 4). Only be
                passed when as_two_stage is True, otherwise is None.
            img_metas (list[dict]): Meta information of each image. The keys
                ``img_shape`` and ``scale_factor`` are read here.
            rescale (bool, optional): If True, return boxes in original
                image space. Default False.

        Returns:
            list[list[Tensor, Tensor]]: Each item in result_list is 2-tuple. \
                The first item is an (n, 5) tensor, where the first 4 columns \
                are bounding box positions (tl_x, tl_y, br_x, br_y) and the \
                5-th column is a score between 0 and 1. The second item is a \
                (n,) tensor where each item is the predicted class label of \
                the corresponding box.
        """
        cls_scores = all_cls_scores[-1]
        bbox_preds = all_bbox_preds[-1]

        result_list = []
        for img_id, img_meta in enumerate(img_metas):
            cls_score = cls_scores[img_id]
            bbox_pred = bbox_preds[img_id]
            img_shape = img_meta['img_shape']
            scale_factor = img_meta['scale_factor']
            # ``_get_bboxes_single`` is not defined in this class body
            # (presumably inherited from ``DETRHead``).
            proposals = self._get_bboxes_single(cls_score, bbox_pred,
                                                img_shape, scale_factor,
                                                rescale)
            result_list.append(proposals)
        return result_list
|