first commit

Author: IamZLT
Date: 2024-08-05 11:19:19 +08:00
Commit: 8b2e804ccc
39 changed files with 2795 additions and 0 deletions

model/SGraFormer.py (new file, 176 lines)
@@ -0,0 +1,176 @@
## Our model is adapted from https://github.com/zczcwh/PoseFormer/blob/main/common/model_poseformer.py
import torch
import torch.nn as nn
from functools import partial
from einops import rearrange
from timm.models.layers import DropPath
from common.opt import opts
from model.Spatial_encoder import First_view_Spatial_features, Spatial_features
from model.Temporal_encoder import Temporal__features
opt = opts().parse()
#######################################################################################################################
class sgraformer(nn.Module):
def __init__(self, num_frame=9, num_joints=17, in_chans=2, embed_dim_ratio=32, depth=4,
num_heads=8, mlp_ratio=2., qkv_bias=True, qk_scale=None,
drop_rate=0., attn_drop_rate=0., drop_path_rate=0.2, norm_layer=None):
""" ##########hybrid_backbone=None, representation_size=None,
Args:
num_frame (int, tuple): input frame number
num_joints (int, tuple): joints number
in_chans (int): number of input channels, 2D joints have 2 channels: (x,y)
embed_dim_ratio (int): embedding dimension ratio
depth (int): depth of transformer
num_heads (int): number of attention heads
mlp_ratio (int): ratio of mlp hidden dim to embedding dim
qkv_bias (bool): enable bias for qkv if True
qk_scale (float): override default qk scale of head_dim ** -0.5 if set
drop_rate (float): dropout rate
attn_drop_rate (float): attention dropout rate
drop_path_rate (float): stochastic depth rate
norm_layer: (nn.Module): normalization layer
"""
super().__init__()
embed_dim = embed_dim_ratio * num_joints
out_dim = num_joints * 3 #### output dimension is num_joints * 3
## Spatial feature encoders, one per camera view
self.SF1 = First_view_Spatial_features(num_frame, num_joints, in_chans, embed_dim_ratio, depth,
num_heads, mlp_ratio, qkv_bias, qk_scale,
drop_rate, attn_drop_rate, drop_path_rate, norm_layer)
self.SF2 = Spatial_features(num_frame, num_joints, in_chans, embed_dim_ratio, depth,
num_heads, mlp_ratio, qkv_bias, qk_scale,
drop_rate, attn_drop_rate, drop_path_rate, norm_layer)
self.SF3 = Spatial_features(num_frame, num_joints, in_chans, embed_dim_ratio, depth,
num_heads, mlp_ratio, qkv_bias, qk_scale,
drop_rate, attn_drop_rate, drop_path_rate, norm_layer)
self.SF4 = Spatial_features(num_frame, num_joints, in_chans, embed_dim_ratio, depth,
num_heads, mlp_ratio, qkv_bias, qk_scale,
drop_rate, attn_drop_rate, drop_path_rate, norm_layer)
## Multi-view fusion (MVF)
self.view_pos_embed = nn.Parameter(torch.zeros(1, 4, num_frame, embed_dim))
self.pos_drop = nn.Dropout(p=0.)
self.conv = nn.Sequential(
nn.BatchNorm2d(4, momentum=0.1),
nn.Conv2d(4, 1, kernel_size=opt.mvf_kernel, stride=1, padding=int(opt.mvf_kernel // 2), bias=False),
nn.ReLU(inplace=True),
)
self.conv_hop = nn.Sequential(
nn.BatchNorm2d(4, momentum=0.1),
nn.Conv2d(4, 1, kernel_size=opt.mvf_kernel, stride=1, padding=int(opt.mvf_kernel // 2), bias=False),
nn.ReLU(inplace=True),
)
self.conv_norm = nn.LayerNorm(embed_dim)
self.conv_hop_norm = nn.LayerNorm(embed_dim)
# Temporal encoder
self.TF = Temporal__features(num_frame, num_joints, in_chans, embed_dim_ratio, depth,
num_heads, mlp_ratio, qkv_bias, qk_scale,
drop_rate, attn_drop_rate, drop_path_rate, norm_layer)
self.head = nn.Sequential(
nn.LayerNorm(embed_dim),
nn.Linear(embed_dim, out_dim),
)
# NOTE: the hop weight parameters and linear_hop below are not referenced in forward()
self.hop_w0 = nn.Parameter(torch.ones(17, 17))
self.hop_w1 = nn.Parameter(torch.ones(17, 17))
self.hop_w2 = nn.Parameter(torch.ones(17, 17))
self.hop_w3 = nn.Parameter(torch.ones(17, 17))
self.hop_w4 = nn.Parameter(torch.ones(17, 17))
self.hop_global = nn.Parameter(torch.ones(17, 17))
self.linear_hop = nn.Linear(8, 2)
# self.max_pool = nn.MaxPool1d(2)
self.edge_embedding = nn.Linear(17*17*4, 17*17)
def forward(self, x, hops):
b, f, v, j, c = x.shape
# hop matrices are assumed identical across the batch: only hops[0] feeds the edge embedding
edge_embedding = self.edge_embedding(hops[0].reshape(1, -1))
############### global feature #################
# normalized pairwise squared distances between joints, used to weight the hop matrices
x_hop_global = x.unsqueeze(3).repeat(1, 1, 1, 17, 1, 1)
x_hop_global = x_hop_global - x_hop_global.permute(0, 1, 2, 4, 3, 5)
x_hop_global = torch.sum(x_hop_global ** 2, dim=-1)
hop_global = x_hop_global / torch.sum(x_hop_global, dim=-1).unsqueeze(-1)
hops = hops.unsqueeze(1).unsqueeze(2).repeat(1, f, v, 1, 1, 1)
hops1 = hop_global * hops[:, :, :, 0]
hops2 = hop_global * hops[:, :, :, 1]
hops3 = hop_global * hops[:, :, :, 2]
hops4 = hop_global * hops[:, :, :, 3]
hops = torch.cat((hops1, hops2, hops3, hops4), dim=-1)
# split the four camera views
x1 = x[:, :, 0]
x2 = x[:, :, 1]
x3 = x[:, :, 2]
x4 = x[:, :, 3]
x1 = x1.permute(0, 3, 1, 2)
x2 = x2.permute(0, 3, 1, 2)
x3 = x3.permute(0, 3, 1, 2)
x4 = x4.permute(0, 3, 1, 2)
# split the distance-weighted hop features per view
hop1 = hops[:, :, 0]
hop2 = hops[:, :, 1]
hop3 = hops[:, :, 2]
hop4 = hops[:, :, 3]
hop1 = hop1.permute(0, 3, 1, 2)
hop2 = hop2.permute(0, 3, 1, 2)
hop3 = hop3.permute(0, 3, 1, 2)
hop4 = hop4.permute(0, 3, 1, 2)
### Semantic graph transformer encoder
x1, hop1, MSA1, MSA2, MSA3, MSA4 = self.SF1(x1, hop1, edge_embedding)
x2, hop2, MSA1, MSA2, MSA3, MSA4 = self.SF2(x2, hop2, MSA1, MSA2, MSA3, MSA4, edge_embedding)
x3, hop3, MSA1, MSA2, MSA3, MSA4 = self.SF3(x3, hop3, MSA1, MSA2, MSA3, MSA4, edge_embedding)
x4, hop4, MSA1, MSA2, MSA3, MSA4 = self.SF4(x4, hop4, MSA1, MSA2, MSA3, MSA4, edge_embedding)
### Multi-view cross-channel fusion
x = torch.cat((x1.unsqueeze(1), x2.unsqueeze(1), x3.unsqueeze(1), x4.unsqueeze(1)), dim=1) + self.view_pos_embed
x = self.pos_drop(x)
x = self.conv(x).squeeze(1) + x1 + x2 + x3 + x4
x = self.conv_norm(x)
hop = torch.cat((hop1.unsqueeze(1), hop2.unsqueeze(1), hop3.unsqueeze(1), hop4.unsqueeze(1)), dim=1) + self.view_pos_embed
hop = self.pos_drop(hop)
# NOTE: conv_hop / conv_hop_norm are defined above but unused; the hop branch reuses self.conv
hop = self.conv(hop).squeeze(1) + hop1 + hop2 + hop3 + hop4
hop = self.conv_norm(hop)
x = x * hop
### Temporal transformer encoder
x = self.TF(x)
x = self.head(x)
x = x.view(b, opt.frames, j, -1)
print("=============> x.shape", x.shape)
return x
# Smoke test:
# x = torch.rand((8, 27, 4, 17, 2))
# hops = torch.rand((8, 4, 17, 17))
# model = sgraformer(num_frame=opt.frames, num_joints=17, in_chans=2, embed_dim_ratio=32, depth=4,
#                    num_heads=8, mlp_ratio=2., qkv_bias=True, qk_scale=None, drop_path_rate=0.1)
# print(model(x, hops).shape)
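# Shape walkthrough (a sketch, assuming opt.frames == 27 and the 4-view / 17-joint setup;
# adjust for other configs):
#   x:    (b, 27, 4, 17, 2)  2D joints per view  -> x1..x4: (b, 2, 27, 17)
#   hops: (b, 4, 17, 17)     k-hop adjacency     -> distance-weighted and concatenated
#                                                   to (b, 27, 4, 17, 68)
#   SF1..SF4:                one view each       -> (b, 27, 544), 544 = 17 joints * 32 dims
#   MVF conv:                (b, 4, 27, 544)     -> (b, 27, 544), gated by the fused hop features
#   TF + head:               (b, 27, 544)        -> (b, 27, 17, 3), a 3D pose per frame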

model/Spatial_encoder.py (new file, 343 lines)
@@ -0,0 +1,343 @@
## Our model is adapted from https://github.com/zczcwh/PoseFormer/blob/main/common/model_poseformer.py
import torch
import torch.nn as nn
from functools import partial
from einops import rearrange
from timm.models.layers import DropPath
#######################################################################################################################
class Mlp(nn.Module):
def __init__(self, in_features, hidden_features=None, out_features=None, act_layer=nn.GELU, drop=0.):
super().__init__()
out_features = out_features or in_features
hidden_features = hidden_features or in_features
self.fc1 = nn.Linear(in_features, hidden_features)
self.act = act_layer()
self.fc2 = nn.Linear(hidden_features, out_features)
self.drop = nn.Dropout(drop)
def forward(self, x):
x = self.fc1(x)
x = self.act(x)
x = self.drop(x)
x = self.fc2(x)
x = self.drop(x)
return x
#######################################################################################################################
class Attention(nn.Module):
def __init__(self, dim, num_heads=8, qkv_bias=False, qk_scale=None, attn_drop=0., proj_drop=0.):
super().__init__()
self.num_heads = num_heads
head_dim = dim // num_heads
# NOTE scale factor was wrong in my original version, can set manually to be compat with prev weights
self.scale = qk_scale or head_dim ** -0.5
self.qkv = nn.Linear(dim, dim * 3, bias=qkv_bias)
self.attn_drop = nn.Dropout(attn_drop)
self.proj = nn.Linear(dim, dim)
self.proj_drop = nn.Dropout(proj_drop)
self.edge_embedding = nn.Linear(17*17, 17*17)
def forward(self, x, edge_embedding):
B, N, C = x.shape
qkv = self.qkv(x).reshape(B, N, 3, self.num_heads, C // self.num_heads).permute(2, 0, 3, 1, 4)
q, k, v = qkv[0], qkv[1], qkv[2] # make torchscript happy (cannot use tensor as tuple)
attn = (q @ k.transpose(-2, -1)) * self.scale
edge_embedding = self.edge_embedding(edge_embedding)
edge_embedding = edge_embedding.reshape(1, 17, 17).unsqueeze(0).repeat(B, self.num_heads, 1, 1)
# print(edge_embedding.shape)
attn = attn + edge_embedding
attn = attn.softmax(dim=-1)
attn = self.attn_drop(attn)
x = (attn @ v).transpose(1, 2).reshape(B, N, C)
x = self.proj(x)
x = self.proj_drop(x)
return x
#######################################################################################################################
class CVA_Attention(nn.Module):
def __init__(self, dim, num_heads=8, qkv_bias=False, qk_scale=None, attn_drop=0., proj_drop=0.):
super().__init__()
self.num_heads = num_heads
head_dim = dim // num_heads
# NOTE scale factor was wrong in my original version, can set manually to be compat with prev weights
self.scale = qk_scale or head_dim ** -0.5
self.Qnorm = nn.LayerNorm(dim)
self.Knorm = nn.LayerNorm(dim)
self.Vnorm = nn.LayerNorm(dim)
self.QLinear = nn.Linear(dim, dim)
self.KLinear = nn.Linear(dim, dim)
self.VLinear = nn.Linear(dim, dim)
self.qkv = nn.Linear(dim, dim * 3, bias=qkv_bias)
self.attn_drop = nn.Dropout(attn_drop)
self.proj = nn.Linear(dim, dim)
self.proj_drop = nn.Dropout(proj_drop)
self.edge_embedding = nn.Linear(17*17, 17*17)
def forward(self, x, CVA_input, edge_embedding):
B, N, C = x.shape
q = self.QLinear(self.Qnorm(CVA_input)).reshape(B, N, self.num_heads, C // self.num_heads).permute(0, 2, 1, 3)
k = self.KLinear(self.Knorm(CVA_input)).reshape(B, N, self.num_heads, C // self.num_heads).permute(0, 2, 1, 3)
v = self.VLinear(self.Vnorm(x)).reshape(B, N, self.num_heads, C // self.num_heads).permute(0, 2, 1, 3)
attn = (q @ k.transpose(-2, -1)) * self.scale
edge_embedding = self.edge_embedding(edge_embedding)
edge_embedding = edge_embedding.reshape(1, 17, 17).unsqueeze(0).repeat(B, self.num_heads, 1, 1)
attn = attn + edge_embedding  # apply the edge bias, mirroring Attention above
attn = attn.softmax(dim=-1)
attn = self.attn_drop(attn)
x = (attn @ v).transpose(1, 2).reshape(B, N, C)
x = self.proj(x)
x = self.proj_drop(x)
return x
#######################################################################################################################
class Block(nn.Module):
def __init__(self, dim, num_heads, mlp_ratio=4., qkv_bias=False, qk_scale=None, drop=0., attn_drop=0.,
drop_path=0., act_layer=nn.GELU, norm_layer=nn.LayerNorm):
super().__init__()
self.norm1 = norm_layer(dim)
self.attn = Attention(
dim, num_heads=num_heads, qkv_bias=qkv_bias, qk_scale=qk_scale, attn_drop=attn_drop, proj_drop=drop)
# NOTE: drop path for stochastic depth, we shall see if this is better than dropout here
self.drop_path = DropPath(drop_path) if drop_path > 0. else nn.Identity()
self.norm2 = norm_layer(dim)
mlp_hidden_dim = int(dim * mlp_ratio)
self.mlp = Mlp(in_features=dim, hidden_features=mlp_hidden_dim, act_layer=act_layer, drop=drop)
def forward(self, x, edge_embedding):
x = x + self.drop_path(self.attn(self.norm1(x), edge_embedding))
x = x + self.drop_path(self.mlp(self.norm2(x)))
return x
#######################################################################################################################
class Multi_Out_Block(nn.Module):
def __init__(self, dim, num_heads, mlp_ratio=4., qkv_bias=False, qk_scale=None, drop=0., attn_drop=0.,
drop_path=0., act_layer=nn.GELU, norm_layer=nn.LayerNorm):
super().__init__()
self.norm1 = norm_layer(dim)
self.attn = Attention(
dim, num_heads=num_heads, qkv_bias=qkv_bias, qk_scale=qk_scale, attn_drop=attn_drop, proj_drop=drop)
# NOTE: drop path for stochastic depth, we shall see if this is better than dropout here
self.drop_path = DropPath(drop_path) if drop_path > 0. else nn.Identity()
self.norm2 = norm_layer(dim)
mlp_hidden_dim = int(dim * mlp_ratio)
self.mlp = Mlp(in_features=dim, hidden_features=mlp_hidden_dim, act_layer=act_layer, drop=drop)
self.norm_hop1 = norm_layer(dim)
self.norm_hop2 = norm_layer(dim)
self.mlp_hop = Mlp(in_features=dim, hidden_features=mlp_hidden_dim, act_layer=act_layer, drop=drop)
def forward(self, x, hops, edge_embedding):
MSA = self.drop_path(self.attn(self.norm1(x), edge_embedding))
MSA = self.norm_hop1(hops) * MSA
x = x + MSA
x = x + self.drop_path(self.mlp(self.norm2(x)))
hops = hops + MSA
hops = hops + self.drop_path(self.mlp_hop(self.norm_hop2(hops)))
return x, hops, MSA
#######################################################################################################################
class Multi_In_Out_Block(nn.Module):
def __init__(self, dim, num_heads, mlp_ratio=4., qkv_bias=False, qk_scale=None, drop=0., attn_drop=0.,
drop_path=0., act_layer=nn.GELU, norm_layer=nn.LayerNorm):
super().__init__()
self.norm1 = norm_layer(dim)
self.attn = Attention(
dim, num_heads=num_heads, qkv_bias=qkv_bias, qk_scale=qk_scale, attn_drop=attn_drop, proj_drop=drop)
# NOTE: drop path for stochastic depth, we shall see if this is better than dropout here
self.drop_path = DropPath(drop_path) if drop_path > 0. else nn.Identity()
self.cva_attn = CVA_Attention(
dim, num_heads=num_heads, qkv_bias=qkv_bias, qk_scale=qk_scale, attn_drop=attn_drop, proj_drop=drop)
self.norm2 = norm_layer(dim)
mlp_hidden_dim = int(dim * mlp_ratio)
self.mlp = Mlp(in_features=dim, hidden_features=mlp_hidden_dim, act_layer=act_layer, drop=drop)
# self.max_pool = nn.MaxPool1d(3, stride=1, padding=1, dilation=1, return_indices=False, ceil_mode=False)
self.norm_hop1 = norm_layer(dim)
self.norm_hop2 = norm_layer(dim)
self.mlp_hop = Mlp(in_features=dim, hidden_features=mlp_hidden_dim, act_layer=act_layer, drop=drop)
def forward(self, x, hops, CVA_input, edge_embedding):
MSA = self.drop_path(self.cva_attn(x, CVA_input, edge_embedding))
MSA = self.norm_hop1(hops) * MSA
x = x + MSA
x = x + self.drop_path(self.mlp(self.norm2(x)))
hops = hops + MSA
hops = hops + self.drop_path(self.mlp_hop(self.norm_hop2(hops)))
return x, hops, MSA
#######################################################################################################################
class First_view_Spatial_features(nn.Module):
def __init__(self, num_frame=9, num_joints=17, in_chans=2, embed_dim_ratio=32, depth=4,
num_heads=8, mlp_ratio=2., qkv_bias=True, qk_scale=None,
drop_rate=0., attn_drop_rate=0., drop_path_rate=0.2, norm_layer=None):
super().__init__()
norm_layer = norm_layer or partial(nn.LayerNorm, eps=1e-6)
### spatial patch embedding
self.Spatial_patch_to_embedding = nn.Linear(in_chans, embed_dim_ratio)
self.Spatial_pos_embed = nn.Parameter(torch.zeros(1, num_joints, embed_dim_ratio))
self.hop_to_embedding = nn.Linear(68, embed_dim_ratio)  # 68 = 17 joints * 4 hop maps
self.hop_pos_embed = nn.Parameter(torch.zeros(1, num_joints, embed_dim_ratio))
self.pos_drop = nn.Dropout(p=drop_rate)
dpr = [x.item() for x in torch.linspace(0, drop_path_rate, depth)] # stochastic depth decay rule
self.block1 = Multi_Out_Block(dim=embed_dim_ratio, num_heads=num_heads, mlp_ratio=mlp_ratio, qkv_bias=qkv_bias,
qk_scale=qk_scale, drop=drop_rate, attn_drop=attn_drop_rate, drop_path=dpr[0],
norm_layer=norm_layer)
self.block2 = Multi_Out_Block(dim=embed_dim_ratio, num_heads=num_heads, mlp_ratio=mlp_ratio, qkv_bias=qkv_bias,
qk_scale=qk_scale, drop=drop_rate, attn_drop=attn_drop_rate, drop_path=dpr[1],
norm_layer=norm_layer)
self.block3 = Multi_Out_Block(dim=embed_dim_ratio, num_heads=num_heads, mlp_ratio=mlp_ratio, qkv_bias=qkv_bias,
qk_scale=qk_scale, drop=drop_rate, attn_drop=attn_drop_rate, drop_path=dpr[2],
norm_layer=norm_layer)
self.block4 = Multi_Out_Block(dim=embed_dim_ratio, num_heads=num_heads, mlp_ratio=mlp_ratio, qkv_bias=qkv_bias,
qk_scale=qk_scale, drop=drop_rate, attn_drop=attn_drop_rate, drop_path=dpr[3],
norm_layer=norm_layer)
self.Spatial_norm = norm_layer(embed_dim_ratio)
self.hop_norm = norm_layer(embed_dim_ratio)
def forward(self, x, hops, edge_embedding):
b, _, f, p = x.shape ##### b is batch size, f is number of frames, p is number of joints
x = rearrange(x, 'b c f p -> (b f) p c', )
x = self.Spatial_patch_to_embedding(x)
x += self.Spatial_pos_embed
x = self.pos_drop(x)
hops = rearrange(hops, 'b c f p -> (b f) p c', )
hops = self.hop_to_embedding(hops)
hops += self.hop_pos_embed
hops = self.pos_drop(hops)
x, hops, MSA1 = self.block1(x, hops, edge_embedding)
x, hops, MSA2 = self.block2(x, hops, edge_embedding)
x, hops, MSA3 = self.block3(x, hops, edge_embedding)
x, hops, MSA4 = self.block4(x, hops, edge_embedding)
x = self.Spatial_norm(x)
x = rearrange(x, '(b f) w c -> b f (w c)', f=f)
hops = self.hop_norm(hops)
hops = rearrange(hops, '(b f) w c -> b f (w c)', f=f)
return x, hops, MSA1, MSA2, MSA3, MSA4
#######################################################################################################################
class Spatial_features(nn.Module):
def __init__(self, num_frame=9, num_joints=17, in_chans=2, embed_dim_ratio=32, depth=4,
num_heads=8, mlp_ratio=2., qkv_bias=True, qk_scale=None,
drop_rate=0., attn_drop_rate=0., drop_path_rate=0.2, norm_layer=None):
super().__init__()
norm_layer = norm_layer or partial(nn.LayerNorm, eps=1e-6)
### spatial patch embedding
self.Spatial_patch_to_embedding = nn.Linear(in_chans, embed_dim_ratio)
self.Spatial_pos_embed = nn.Parameter(torch.zeros(1, num_joints, embed_dim_ratio))
self.hop_to_embedding = nn.Linear(68, embed_dim_ratio)  # 68 = 17 joints * 4 hop maps
self.hop_pos_embed = nn.Parameter(torch.zeros(1, num_joints, embed_dim_ratio))
self.pos_drop = nn.Dropout(p=drop_rate)
dpr = [x.item() for x in torch.linspace(0, drop_path_rate, depth)] # stochastic depth decay rule
self.block1 = Multi_In_Out_Block(
dim=embed_dim_ratio, num_heads=num_heads, mlp_ratio=mlp_ratio, qkv_bias=qkv_bias, qk_scale=qk_scale,
drop=drop_rate, attn_drop=attn_drop_rate, drop_path=dpr[0], norm_layer=norm_layer)
self.block2 = Multi_In_Out_Block(
dim=embed_dim_ratio, num_heads=num_heads, mlp_ratio=mlp_ratio, qkv_bias=qkv_bias, qk_scale=qk_scale,
drop=drop_rate, attn_drop=attn_drop_rate, drop_path=dpr[1], norm_layer=norm_layer)
self.block3 = Multi_In_Out_Block(
dim=embed_dim_ratio, num_heads=num_heads, mlp_ratio=mlp_ratio, qkv_bias=qkv_bias, qk_scale=qk_scale,
drop=drop_rate, attn_drop=attn_drop_rate, drop_path=dpr[2], norm_layer=norm_layer)
self.block4 = Multi_In_Out_Block(
dim=embed_dim_ratio, num_heads=num_heads, mlp_ratio=mlp_ratio, qkv_bias=qkv_bias, qk_scale=qk_scale,
drop=drop_rate, attn_drop=attn_drop_rate, drop_path=dpr[3], norm_layer=norm_layer)
self.Spatial_norm = norm_layer(embed_dim_ratio)
self.hop_norm = norm_layer(embed_dim_ratio)
def forward(self, x, hops, MSA1, MSA2, MSA3, MSA4, edge_embedding):
b, _, f, p = x.shape ##### b is batch size, f is number of frames, p is number of joints
x = rearrange(x, 'b c f p -> (b f) p c', )
x = self.Spatial_patch_to_embedding(x)
x += self.Spatial_pos_embed
x = self.pos_drop(x)
hops = rearrange(hops, 'b c f p -> (b f) p c', )
hops = self.hop_to_embedding(hops)
hops += self.hop_pos_embed
hops = self.pos_drop(hops)
x, hops, MSA1 = self.block1(x, hops, MSA1, edge_embedding)
x, hops, MSA2 = self.block2(x, hops, MSA2, edge_embedding)
x, hops, MSA3 = self.block3(x, hops, MSA3, edge_embedding)
x, hops, MSA4 = self.block4(x, hops, MSA4, edge_embedding)
x = self.Spatial_norm(x)
x = rearrange(x, '(b f) w c -> b f (w c)', f=f)
hops = self.hop_norm(hops)
hops = rearrange(hops, '(b f) w c -> b f (w c)', f=f)
return x, hops, MSA1, MSA2, MSA3, MSA4
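# Usage sketch (hypothetical shapes, assuming 27-frame clips and a batch of 2):
# sf1 = First_view_Spatial_features(num_frame=27)
# x, hops, m1, m2, m3, m4 = sf1(torch.rand(2, 2, 27, 17),   # (b, xy, frames, joints)
#                               torch.rand(2, 68, 27, 17),   # 68 = 17 joints * 4 hop maps
#                               torch.rand(1, 17 * 17))      # flattened edge bias
# x: (2, 27, 544); the returned MSA tensors seed the CVA attention inside the
# Spatial_features encoders of the remaining views.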

model/Temporal_encoder.py (new file, 159 lines)
@@ -0,0 +1,159 @@
## Our model is adapted from https://github.com/zczcwh/PoseFormer/blob/main/common/model_poseformer.py
import torch
import torch.nn as nn
from functools import partial
from einops import rearrange
from timm.models.layers import DropPath
from common.opt import opts
opt = opts().parse()
#######################################################################################################################
class Mlp(nn.Module):
def __init__(self, in_features, hidden_features=None, out_features=None, act_layer=nn.GELU, drop=0.):
super().__init__()
out_features = out_features or in_features
hidden_features = hidden_features or in_features
self.fc1 = nn.Linear(in_features, hidden_features)
self.act = act_layer()
self.fc2 = nn.Linear(hidden_features, out_features)
self.drop = nn.Dropout(drop)
def forward(self, x):
x = self.fc1(x)
x = self.act(x)
x = self.drop(x)
x = self.fc2(x)
x = self.drop(x)
return x
#######################################################################################################################
class Attention(nn.Module):
def __init__(self, dim, num_heads=8, qkv_bias=False, qk_scale=None, attn_drop=0., proj_drop=0.):
super().__init__()
self.num_heads = num_heads
head_dim = dim // num_heads
# NOTE scale factor was wrong in my original version, can set manually to be compat with prev weights
self.scale = qk_scale or head_dim ** -0.5
self.qkv = nn.Linear(dim, dim * 3, bias=qkv_bias)
self.attn_drop = nn.Dropout(attn_drop)
self.proj = nn.Linear(dim, dim)
self.proj_drop = nn.Dropout(proj_drop)
def forward(self, x):
B, N, C = x.shape
qkv = self.qkv(x).reshape(B, N, 3, self.num_heads, C // self.num_heads).permute(2, 0, 3, 1, 4)
q, k, v = qkv[0], qkv[1], qkv[2] # make torchscript happy (cannot use tensor as tuple)
attn = (q @ k.transpose(-2, -1)) * self.scale
attn = attn.softmax(dim=-1)
attn = self.attn_drop(attn)
x = (attn @ v).transpose(1, 2).reshape(B, N, C)
x = self.proj(x)
x = self.proj_drop(x)
return x
#######################################################################################################################
class CVA_Attention(nn.Module):
def __init__(self, dim, num_heads=8, qkv_bias=False, qk_scale=None, attn_drop=0., proj_drop=0.):
super().__init__()
self.num_heads = num_heads
head_dim = dim // num_heads
# NOTE scale factor was wrong in my original version, can set manually to be compat with prev weights
self.scale = qk_scale or head_dim ** -0.5
self.Qnorm = nn.LayerNorm(dim)
self.Knorm = nn.LayerNorm(dim)
self.Vnorm = nn.LayerNorm(dim)
self.QLinear = nn.Linear(dim, dim)
self.KLinear = nn.Linear(dim, dim)
self.VLinear = nn.Linear(dim, dim)
self.qkv = nn.Linear(dim, dim * 3, bias=qkv_bias)
self.attn_drop = nn.Dropout(attn_drop)
self.proj = nn.Linear(dim, dim)
self.proj_drop = nn.Dropout(proj_drop)
def forward(self, x, CVA_input):
B, N, C = x.shape
q = self.QLinear(self.Qnorm(CVA_input)).reshape(B, N, self.num_heads, C // self.num_heads).permute(0, 2, 1, 3)
k = self.KLinear(self.Knorm(CVA_input)).reshape(B, N, self.num_heads, C // self.num_heads).permute(0, 2, 1, 3)
v = self.VLinear(self.Vnorm(x)).reshape(B, N, self.num_heads, C // self.num_heads).permute(0, 2, 1, 3)
attn = (q @ k.transpose(-2, -1)) * self.scale
attn = attn.softmax(dim=-1)
attn = self.attn_drop(attn)
x = (attn @ v).transpose(1, 2).reshape(B, N, C)
x = self.proj(x)
x = self.proj_drop(x)
return x
#######################################################################################################################
class Block(nn.Module):
def __init__(self, dim, num_heads, mlp_ratio=4., qkv_bias=False, qk_scale=None, drop=0., attn_drop=0.,
drop_path=0., act_layer=nn.GELU, norm_layer=nn.LayerNorm):
super().__init__()
self.norm1 = norm_layer(dim)
self.attn = Attention(
dim, num_heads=num_heads, qkv_bias=qkv_bias, qk_scale=qk_scale, attn_drop=attn_drop, proj_drop=drop)
# NOTE: drop path for stochastic depth, we shall see if this is better than dropout here
self.drop_path = DropPath(drop_path) if drop_path > 0. else nn.Identity()
self.norm2 = norm_layer(dim)
mlp_hidden_dim = int(dim * mlp_ratio)
self.mlp = Mlp(in_features=dim, hidden_features=mlp_hidden_dim, act_layer=act_layer, drop=drop)
def forward(self, x):
x = x + self.drop_path(self.attn(self.norm1(x)))
x = x + self.drop_path(self.mlp(self.norm2(x)))
return x
#######################################################################################################################
class Temporal__features(nn.Module):
def __init__(self, num_frame=9, num_joints=17, in_chans=2, embed_dim_ratio=32, depth=4,
num_heads=8, mlp_ratio=2., qkv_bias=True, qk_scale=None,
drop_rate=0., attn_drop_rate=0., drop_path_rate=0.2, norm_layer=None):
super().__init__()
norm_layer = norm_layer or partial(nn.LayerNorm, eps=1e-6)
embed_dim = embed_dim_ratio * num_joints #### temporal embed_dim is num_joints * spatial embedding dim ratio
out_dim = num_joints * 3 #### output dimension is num_joints * 3
### Temporal patch embedding
self.Temporal_pos_embed = nn.Parameter(torch.zeros(1, num_frame, embed_dim))
self.pos_drop = nn.Dropout(p=drop_rate)
dpr = [x.item() for x in torch.linspace(0, drop_path_rate, depth)] # stochastic depth decay rule
self.blocks = nn.ModuleList([
Block(
dim=embed_dim, num_heads=num_heads, mlp_ratio=mlp_ratio, qkv_bias=qkv_bias, qk_scale=qk_scale,
drop=drop_rate, attn_drop=attn_drop_rate, drop_path=dpr[i], norm_layer=norm_layer)
for i in range(depth)])
self.Temporal_norm = norm_layer(embed_dim)
####### An easy way to implement a weighted mean
self.weighted_mean = torch.nn.Conv1d(in_channels=num_frame, out_channels=1, kernel_size=1)
def forward(self, x):
b = x.shape[0]
x += self.Temporal_pos_embed
x = self.pos_drop(x)
for blk in self.blocks:
x = blk(x)
x = self.Temporal_norm(x)
##### x size [b, f, emb_dim]; the weighted mean over frames (center-frame prediction)
##### is disabled below, so per-frame features are returned instead
# x = self.weighted_mean(x)
x = x.view(b, opt.frames, -1)
return x
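# Usage sketch (hypothetical; assumes opt.frames == 27):
# tf = Temporal__features(num_frame=27, num_joints=17, embed_dim_ratio=32)
# y = tf(torch.rand(2, 27, 544))   # (b, frames, 17 * 32) -> (2, 27, 544)
# With self.weighted_mean enabled, the frame dimension would collapse to 1
# (center-frame prediction, as in PoseFormer).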

8 binary files not shown.