# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
# Modified from DETR (https://github.com/facebookresearch/detr)
# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function

import paddle
import paddle.nn as nn
import paddle.nn.functional as F

from paddlers_slim.models.ppdet.core.workspace import register
from ..layers import MultiHeadAttention, _convert_attention_mask
from .position_encoding import PositionEmbedding
from .utils import _get_clones
from ..initializer import linear_init_, conv_init_, xavier_uniform_, normal_

__all__ = ['DETRTransformer']
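
# This module implements the original DETR transformer: a post-norm (or
# optionally pre-norm) encoder over flattened backbone features, plus a decoder
# driven by a fixed set of learned object queries, following
# https://github.com/facebookresearch/detr.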


class TransformerEncoderLayer(nn.Layer):
    def __init__(self,
                 d_model,
                 nhead,
                 dim_feedforward=2048,
                 dropout=0.1,
                 activation="relu",
                 attn_dropout=None,
                 act_dropout=None,
                 normalize_before=False):
        super(TransformerEncoderLayer, self).__init__()
        attn_dropout = dropout if attn_dropout is None else attn_dropout
        act_dropout = dropout if act_dropout is None else act_dropout
        self.normalize_before = normalize_before

        self.self_attn = MultiHeadAttention(d_model, nhead, attn_dropout)
        # Implementation of Feedforward model
        self.linear1 = nn.Linear(d_model, dim_feedforward)
        self.dropout = nn.Dropout(act_dropout, mode="upscale_in_train")
        self.linear2 = nn.Linear(dim_feedforward, d_model)
        self.norm1 = nn.LayerNorm(d_model)
        self.norm2 = nn.LayerNorm(d_model)
        self.dropout1 = nn.Dropout(dropout, mode="upscale_in_train")
        self.dropout2 = nn.Dropout(dropout, mode="upscale_in_train")
        self.activation = getattr(F, activation)
        self._reset_parameters()

    def _reset_parameters(self):
        linear_init_(self.linear1)
        linear_init_(self.linear2)

    @staticmethod
    def with_pos_embed(tensor, pos_embed):
        return tensor if pos_embed is None else tensor + pos_embed

    def forward(self, src, src_mask=None, pos_embed=None):
        src_mask = _convert_attention_mask(src_mask, src.dtype)

        residual = src
        if self.normalize_before:
            src = self.norm1(src)
        q = k = self.with_pos_embed(src, pos_embed)
        src = self.self_attn(q, k, value=src, attn_mask=src_mask)
        src = residual + self.dropout1(src)
        if not self.normalize_before:
            src = self.norm1(src)

        residual = src
        if self.normalize_before:
            src = self.norm2(src)
        src = self.linear2(self.dropout(self.activation(self.linear1(src))))
        src = residual + self.dropout2(src)
        if not self.normalize_before:
            src = self.norm2(src)
        return src
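
# Rough usage sketch for TransformerEncoderLayer (illustrative values only; the
# layer is normally built and driven by DETRTransformer below):
#
#   layer = TransformerEncoderLayer(d_model=256, nhead=8)
#   feats = paddle.rand([2, 625, 256])   # [batch, h*w, d_model]
#   out = layer(feats)                   # same shape, [2, 625, 256]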


class TransformerEncoder(nn.Layer):
    def __init__(self, encoder_layer, num_layers, norm=None):
        super(TransformerEncoder, self).__init__()
        self.layers = _get_clones(encoder_layer, num_layers)
        self.num_layers = num_layers
        self.norm = norm

    def forward(self, src, src_mask=None, pos_embed=None):
        src_mask = _convert_attention_mask(src_mask, src.dtype)

        output = src
        for layer in self.layers:
            output = layer(output, src_mask=src_mask, pos_embed=pos_embed)

        if self.norm is not None:
            output = self.norm(output)

        return output
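
# Note: TransformerEncoder applies `num_layers` deep copies of the same encoder
# layer in sequence; the optional final LayerNorm is only created by
# DETRTransformer when `normalize_before=True` (pre-norm).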


class TransformerDecoderLayer(nn.Layer):
    def __init__(self,
                 d_model,
                 nhead,
                 dim_feedforward=2048,
                 dropout=0.1,
                 activation="relu",
                 attn_dropout=None,
                 act_dropout=None,
                 normalize_before=False):
        super(TransformerDecoderLayer, self).__init__()
        attn_dropout = dropout if attn_dropout is None else attn_dropout
        act_dropout = dropout if act_dropout is None else act_dropout
        self.normalize_before = normalize_before

        self.self_attn = MultiHeadAttention(d_model, nhead, attn_dropout)
        self.cross_attn = MultiHeadAttention(d_model, nhead, attn_dropout)
        # Implementation of Feedforward model
        self.linear1 = nn.Linear(d_model, dim_feedforward)
        self.dropout = nn.Dropout(act_dropout, mode="upscale_in_train")
        self.linear2 = nn.Linear(dim_feedforward, d_model)
        self.norm1 = nn.LayerNorm(d_model)
        self.norm2 = nn.LayerNorm(d_model)
        self.norm3 = nn.LayerNorm(d_model)
        self.dropout1 = nn.Dropout(dropout, mode="upscale_in_train")
        self.dropout2 = nn.Dropout(dropout, mode="upscale_in_train")
        self.dropout3 = nn.Dropout(dropout, mode="upscale_in_train")
        self.activation = getattr(F, activation)
        self._reset_parameters()

    def _reset_parameters(self):
        linear_init_(self.linear1)
        linear_init_(self.linear2)

    @staticmethod
    def with_pos_embed(tensor, pos_embed):
        return tensor if pos_embed is None else tensor + pos_embed

    def forward(self,
                tgt,
                memory,
                tgt_mask=None,
                memory_mask=None,
                pos_embed=None,
                query_pos_embed=None):
        tgt_mask = _convert_attention_mask(tgt_mask, tgt.dtype)
        memory_mask = _convert_attention_mask(memory_mask, memory.dtype)

        residual = tgt
        if self.normalize_before:
            tgt = self.norm1(tgt)
        q = k = self.with_pos_embed(tgt, query_pos_embed)
        tgt = self.self_attn(q, k, value=tgt, attn_mask=tgt_mask)
        tgt = residual + self.dropout1(tgt)
        if not self.normalize_before:
            tgt = self.norm1(tgt)

        residual = tgt
        if self.normalize_before:
            tgt = self.norm2(tgt)
        q = self.with_pos_embed(tgt, query_pos_embed)
        k = self.with_pos_embed(memory, pos_embed)
        tgt = self.cross_attn(q, k, value=memory, attn_mask=memory_mask)
        tgt = residual + self.dropout2(tgt)
        if not self.normalize_before:
            tgt = self.norm2(tgt)

        residual = tgt
        if self.normalize_before:
            tgt = self.norm3(tgt)
        tgt = self.linear2(self.dropout(self.activation(self.linear1(tgt))))
        tgt = residual + self.dropout3(tgt)
        if not self.normalize_before:
            tgt = self.norm3(tgt)
        return tgt
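
# Rough usage sketch for TransformerDecoderLayer (illustrative shapes only):
#
#   layer = TransformerDecoderLayer(d_model=256, nhead=8)
#   tgt = paddle.zeros([2, 100, 256])     # object queries, [batch, num_queries, d_model]
#   memory = paddle.rand([2, 625, 256])   # encoder output, [batch, h*w, d_model]
#   out = layer(tgt, memory)              # [2, 100, 256]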


class TransformerDecoder(nn.Layer):
    def __init__(self,
                 decoder_layer,
                 num_layers,
                 norm=None,
                 return_intermediate=False):
        super(TransformerDecoder, self).__init__()
        self.layers = _get_clones(decoder_layer, num_layers)
        self.num_layers = num_layers
        self.norm = norm
        self.return_intermediate = return_intermediate

    def forward(self,
                tgt,
                memory,
                tgt_mask=None,
                memory_mask=None,
                pos_embed=None,
                query_pos_embed=None):
        tgt_mask = _convert_attention_mask(tgt_mask, tgt.dtype)
        memory_mask = _convert_attention_mask(memory_mask, memory.dtype)

        output = tgt
        intermediate = []
        for layer in self.layers:
            output = layer(
                output,
                memory,
                tgt_mask=tgt_mask,
                memory_mask=memory_mask,
                pos_embed=pos_embed,
                query_pos_embed=query_pos_embed)
            if self.return_intermediate:
                intermediate.append(self.norm(output))

        if self.norm is not None:
            output = self.norm(output)

        if self.return_intermediate:
            return paddle.stack(intermediate)

        return output.unsqueeze(0)
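
# Note: with `return_intermediate=True` the decoder stacks the (normalized)
# output of every layer, giving [num_layers, batch, num_queries, d_model];
# otherwise it returns a single level of shape [1, batch, num_queries, d_model].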


@register
class DETRTransformer(nn.Layer):
    __shared__ = ['hidden_dim']

    def __init__(self,
                 num_queries=100,
                 position_embed_type='sine',
                 return_intermediate_dec=True,
                 backbone_num_channels=2048,
                 hidden_dim=256,
                 nhead=8,
                 num_encoder_layers=6,
                 num_decoder_layers=6,
                 dim_feedforward=2048,
                 dropout=0.1,
                 activation="relu",
                 attn_dropout=None,
                 act_dropout=None,
                 normalize_before=False):
        super(DETRTransformer, self).__init__()
        assert position_embed_type in ['sine', 'learned'],\
            f'ValueError: position_embed_type not supported {position_embed_type}!'
        self.hidden_dim = hidden_dim
        self.nhead = nhead

        encoder_layer = TransformerEncoderLayer(
            hidden_dim, nhead, dim_feedforward, dropout, activation,
            attn_dropout, act_dropout, normalize_before)
        encoder_norm = nn.LayerNorm(hidden_dim) if normalize_before else None
        self.encoder = TransformerEncoder(encoder_layer, num_encoder_layers,
                                          encoder_norm)

        decoder_layer = TransformerDecoderLayer(
            hidden_dim, nhead, dim_feedforward, dropout, activation,
            attn_dropout, act_dropout, normalize_before)
        decoder_norm = nn.LayerNorm(hidden_dim)
        self.decoder = TransformerDecoder(
            decoder_layer,
            num_decoder_layers,
            decoder_norm,
            return_intermediate=return_intermediate_dec)

        self.input_proj = nn.Conv2D(
            backbone_num_channels, hidden_dim, kernel_size=1)
        self.query_pos_embed = nn.Embedding(num_queries, hidden_dim)
        self.position_embedding = PositionEmbedding(
            hidden_dim // 2,
            normalize=True if position_embed_type == 'sine' else False,
            embed_type=position_embed_type)

        self._reset_parameters()

    def _reset_parameters(self):
        for p in self.parameters():
            if p.dim() > 1:
                xavier_uniform_(p)
        conv_init_(self.input_proj)
        normal_(self.query_pos_embed.weight)

    @classmethod
    def from_config(cls, cfg, input_shape):
        return {
            'backbone_num_channels': [i.channels for i in input_shape][-1],
        }
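
    # Note: when DETRTransformer is built through the config system,
    # `from_config` above fills in `backbone_num_channels` from the channel
    # count of the last backbone feature level, which is the only level the
    # transformer consumes (see `src[-1]` in `forward`).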

    def forward(self, src, src_mask=None):
        r"""
        Applies the Transformer model to the backbone features.

        Parameters:
            src (List(Tensor)): Backbone feature maps with shape [[bs, c, h, w]].
            src_mask (Tensor, optional): A tensor used in multi-head attention
                to prevent attention to some unwanted positions, usually the
                paddings or the subsequent positions. It is a tensor with shape
                [bs, H, W]. When the data type is bool, the unwanted positions
                have `False` values and the others have `True` values. When the
                data type is int, the unwanted positions have 0 values and the
                others have 1 values. When the data type is float, the unwanted
                positions have `-INF` values and the others have 0 values. It
                can be None when nothing needs to be prevented from being
                attended to. Default: None.

        Returns:
            output (Tensor): [num_levels, batch_size, num_queries, hidden_dim]
            memory (Tensor): [batch_size, hidden_dim, h, w]
            src_proj (Tensor): [batch_size, hidden_dim, h, w]
            src_mask (Tensor): [batch_size, 1, 1, h, w]
        """
        # use last level feature map
        src_proj = self.input_proj(src[-1])
        bs, c, h, w = src_proj.shape
        # flatten [B, C, H, W] to [B, HxW, C]
        src_flatten = src_proj.flatten(2).transpose([0, 2, 1])
        if src_mask is not None:
            src_mask = F.interpolate(
                src_mask.unsqueeze(0).astype(src_flatten.dtype),
                size=(h, w))[0].astype('bool')
        else:
            src_mask = paddle.ones([bs, h, w], dtype='bool')
        pos_embed = self.position_embedding(src_mask).flatten(2).transpose(
            [0, 2, 1])

        src_mask = _convert_attention_mask(src_mask, src_flatten.dtype)
        src_mask = src_mask.reshape([bs, 1, 1, -1])

        memory = self.encoder(
            src_flatten, src_mask=src_mask, pos_embed=pos_embed)

        query_pos_embed = self.query_pos_embed.weight.unsqueeze(0).tile(
            [bs, 1, 1])
        tgt = paddle.zeros_like(query_pos_embed)
        output = self.decoder(
            tgt,
            memory,
            memory_mask=src_mask,
            pos_embed=pos_embed,
            query_pos_embed=query_pos_embed)

        return (output, memory.transpose([0, 2, 1]).reshape([bs, c, h, w]),
                src_proj, src_mask.reshape([bs, 1, 1, h, w]))
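
# Rough end-to-end sketch (illustrative values only; in PaddleDetection-style
# configs this module is normally created via `@register` / `from_config` and
# fed by a backbone):
#
#   transformer = DETRTransformer(backbone_num_channels=2048, hidden_dim=256)
#   feat = paddle.rand([2, 2048, 25, 25])        # last backbone feature map
#   out, memory, src_proj, src_mask = transformer([feat])
#   # out:    [num_decoder_layers, 2, 100, 256] (return_intermediate_dec=True)
#   # memory: [2, 256, 25, 25]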