Support skeleton (#155)

* pose * pose * pose * pose * 你的提交消息 * pose * pose * Delete train1.sh * pretreatment * configs * pose * reference * Update gaittr.py * naming * naming * Update transform.py * update for datasets * update README * update name and README * update * Update transform.py
2023-09-27 16:20:00 +08:00
parent 853bb1821d
commit 2c29afadf3
41 changed files with 4251 additions and 12 deletions
@@ -0,0 +1,75 @@
+import torch
+from ..base_model import BaseModel
+from ..backbones.resgcn import ResGCN
+from ..modules import Graph
+import torch.nn.functional as F
+
+class GaitGraph1(BaseModel):
+    """
+        GaitGraph1: Gaitgraph: Graph Convolutional Network for Skeleton-Based Gait Recognition
+        Paper:    https://ieeexplore.ieee.org/document/9506717
+        Github:   https://github.com/tteepe/GaitGraph
+    """
+    def build_network(self, model_cfg):
+         
+        self.joint_format = model_cfg['joint_format']
+        self.input_num = model_cfg['input_num']
+        self.block = model_cfg['block']
+        self.input_branch = model_cfg['input_branch']
+        self.main_stream = model_cfg['main_stream']
+        self.num_class = model_cfg['num_class']
+        self.reduction = model_cfg['reduction']
+        self.tta = model_cfg['tta']
+        
+        ## Graph Init ##
+        self.graph = Graph(joint_format=self.joint_format,max_hop=3)
+        self.A = torch.tensor(self.graph.A, dtype=torch.float32, requires_grad=False)
+        ## Network ##
+        self.ResGCN = ResGCN(input_num=self.input_num, input_branch=self.input_branch, 
+                             main_stream=self.main_stream, num_class=self.num_class,
+                             reduction=self.reduction, block=self.block,graph=self.A)
+
+    def forward(self, inputs):
+
+        ipts, labs, type_, view_, seqL = inputs
+        x_input = ipts[0] # N T C V I
+        # x = N, T, C, V, M -> N, C, T, V, M
+        x_input = x_input.permute(0, 2, 3, 4, 1).contiguous()
+        N, T, V, I, C = x_input.size() 
+        
+        pose  = x_input
+        if self.training:
+            x_input = torch.cat([x_input[:,:int(T/2),...],x_input[:,int(T/2):,...]],dim=0) #[8, 60, 17, 1, 3]
+        elif self.tta:
+            data_flipped = torch.flip(x_input,dims=[1])
+            x_input = torch.cat([x_input,data_flipped], dim=0)
+
+        x = x_input.permute(0, 3, 4, 1, 2).contiguous()
+
+        # resgcn
+        x = self.ResGCN(x)
+        x = F.normalize(x, dim=1, p=2) # norm #only for GaitGraph1 # Remove from GaitGraph2
+        
+        if self.training:
+            f1, f2 = torch.split(x, [N, N], dim=0)
+            embed = torch.cat([f1.unsqueeze(1), f2.unsqueeze(1)], dim=1) #[4, 2, 128]
+            
+        elif self.tta:
+            f1, f2 = torch.split(x, [N, N], dim=0)
+            embed = torch.mean(torch.stack([f1, f2]), dim=0)
+            embed = embed.unsqueeze(-1)
+        else:
+            embed = embed.unsqueeze(-1)
+        
+        retval = {
+            'training_feat': {
+                'SupConLoss': {'features': embed , 'labels': labs}, # loss
+            },
+            'visual_summary': {
+                'image/pose': pose.view(N*T, 1, I*V, C).contiguous() # visualization
+            },
+            'inference_feat': {
+                'embeddings':   embed # for metric
+            }
+        }
+        return retval
@@ -0,0 +1,110 @@
+import torch
+import torch.nn as nn
+from ..base_model import BaseModel
+from ..backbones.resgcn import ResGCN
+from ..modules import Graph
+import numpy as np
+
+
+class GaitGraph2(BaseModel):
+    """
+        GaitGraph2: Towards a Deeper Understanding of Skeleton-based Gait Recognition
+        Paper:    https://openaccess.thecvf.com/content/CVPR2022W/Biometrics/papers/Teepe_Towards_a_Deeper_Understanding_of_Skeleton-Based_Gait_Recognition_CVPRW_2022_paper
+        Github:   https://github.com/tteepe/GaitGraph2
+
+        A ResGCN backbone over a skeleton graph. With ``tta`` enabled,
+        evaluation concatenates the features of the original, time-reversed
+        and joint-permuted sequences.
+    """
+    def build_network(self, model_cfg):
+        """Build the skeleton graph and the ResGCN backbone from ``model_cfg``.
+
+        Args:
+            model_cfg: mapping that must provide the keys 'joint_format',
+                'input_num', 'block', 'input_branch', 'main_stream',
+                'num_class', 'reduction' and 'tta'.
+        """
+         
+        self.joint_format = model_cfg['joint_format']
+        self.input_num = model_cfg['input_num']
+        self.block = model_cfg['block']
+        self.input_branch = model_cfg['input_branch']
+        self.main_stream = model_cfg['main_stream']
+        self.num_class = model_cfg['num_class']
+        self.reduction = model_cfg['reduction']
+        # Enables the three-view test-time augmentation in forward().
+        self.tta = model_cfg['tta']
+        ## Graph Init ##
+        self.graph = Graph(joint_format=self.joint_format,max_hop=3)
+        self.A = torch.tensor(self.graph.A, dtype=torch.float32, requires_grad=False)
+        ## Network ##
+        self.ResGCN = ResGCN(input_num=self.input_num, input_branch=self.input_branch, 
+                             main_stream=self.main_stream, num_class=self.num_class,
+                             reduction=self.reduction, block=self.block,graph=self.A)
+
+    def forward(self, inputs):
+        """Compute embeddings for a batch of skeleton sequences.
+
+        Args:
+            inputs: tuple ``(ipts, labs, type_, view_, seqL)``. Only
+                ``ipts[0]`` (a 5-D tensor unpacked as N, T, V, I, C) and
+                ``labs`` are used here.
+
+        Returns:
+            dict with the keys 'training_feat', 'visual_summary' and
+            'inference_feat'. The inference embedding is the backbone output
+            with a trailing singleton axis appended.
+        """
+
+        ipts, labs, type_, view_, seqL = inputs
+        x_input = ipts[0] 
+        N, T, V, I, C = x_input.size()
+        pose  = x_input
+        # Joint permutation supplied by the graph; presumably the left/right
+        # mirror mapping -- confirm against Graph.
+        flip_idx = self.graph.flip_idx
+
+        if not self.training and self.tta:
+            # Test-time augmentation: rebuild the multi-branch input from the
+            # first three channels of branch 0 for two extra views.
+            multi_input = MultiInput(self.graph.connect_joint, self.graph.center)
+            x1 = []
+            x2 = []
+            for i in range(N):
+                # View 1: sequence reversed along the time axis.
+                x1.append(multi_input(x_input[i,:,:,0,:3].flip(0)))
+                # View 2: joints re-ordered by flip_idx.
+                x2.append(multi_input(x_input[i,:,flip_idx,0,:3]))
+            # Stack the three views along the batch axis (batch becomes 3N).
+            x_input = torch.cat([x_input, torch.stack(x1,0), torch.stack(x2,0)], dim=0)
+        
+        # (N, T, V, I, C) -> (N, I, C, T, V) for the backbone.
+        x = x_input.permute(0, 3, 4, 1, 2).contiguous()
+
+        # resgcn
+        x = self.ResGCN(x)
+
+        if not self.training and self.tta:
+            # Split the 3N batch back into views and concatenate their
+            # features along dim 1.
+            f1, f2, f3 = torch.split(x, [N, N, N], dim=0)
+            x = torch.cat((f1, f2, f3), dim=1)
+             
+        embed = torch.unsqueeze(x,-1)
+        
+        retval = {
+            'training_feat': {
+                'SupConLoss': {'features': x , 'labels': labs}, # loss
+            },
+            'visual_summary': {
+                'image/pose': pose.view(N*T, 1, I*V, C).contiguous() # visualization
+            },
+            'inference_feat': {
+                'embeddings': embed # for metric
+            }
+        }
+        return retval
+    
+class MultiInput:
+    """Expand a (T, V, C) joint sequence into three feature branches.
+
+    The output has shape (T, V, 3, C + 2); along the third axis, index 0 holds
+    joint features, index 1 velocity features and index 2 bone features.
+    The slice assignments below only line up when C == 3.
+    """
+    def __init__(self, connect_joint, center):
+        """Store the per-joint parent index list and the center joint index."""
+        self.connect_joint = connect_joint
+        self.center = center
+
+    def __call__(self, data):
+        """Return the (T, V, 3, C + 2) multi-branch tensor built from ``data``."""
+
+        # T, V, C -> T, V, I=3, C + 2
+        T, V, C = data.shape
+        x_new = torch.zeros((T, V, 3, C + 2), device=data.device)
+
+        # Joints
+        # Channels [:C] are the raw input; channels [C:] are the first two
+        # input channels relative to the center joint.
+        x = data
+        x_new[:, :, 0, :C] = x
+        for i in range(V):
+            x_new[:, i, 0, C:] = x[:, i, :2] - x[:, self.center, :2]
+
+        # Velocity
+        # Channels [:2] are the one-frame difference, channels [3:] the
+        # two-frame difference. The last two frames keep their zeros.
+        for i in range(T - 2):
+            x_new[i, :, 1, :2] = x[i + 1, :, :2] - x[i, :, :2]
+            x_new[i, :, 1, 3:] = x[i + 2, :, :2] - x[i, :, :2]
+        # NOTE(review): this overwrites channel 3 (first component of the
+        # two-frame difference) with input channel 2 -- confirm it is intended.
+        x_new[:, :, 1, 3] = x[:, :, 2]
+
+        # Bones
+        # Channels [:2] are the offset of each joint from its connected joint.
+        for i in range(V):
+            x_new[:, i, 2, :2] = x[:, i, :2] - x[:, self.connect_joint[i], :2]
+        # Euclidean length over the first C - 1 bone channels; the small
+        # constant avoids division by zero.
+        bone_length = 0
+        for i in range(C - 1):
+            bone_length += torch.pow(x_new[:, :, 2, i], 2)
+        bone_length = torch.sqrt(bone_length) + 0.0001
+        # Channels [C:] are the arccos of each bone component over the length.
+        for i in range(C - 1):
+            x_new[:, :, 2, C+i] = torch.acos(x_new[:, :, 2, i] / bone_length)
+        # NOTE(review): with C == 3 this overwrites channel 3 (the first
+        # angle) with input channel 2 -- confirm it is intended.
+        x_new[:, :, 2, 3] = x[:, :, 2]
+
+        data = x_new
+        return data
+
@@ -0,0 +1,186 @@
+import torch
+import torch.nn as nn
+import torch.nn.functional as F
+from ..base_model import BaseModel
+from ..modules import Graph, SpatialAttention
+import numpy as np
+import math
+
+
+class Mish(nn.Module):
+    """Mish activation: ``x * tanh(softplus(x))``, applied element-wise."""
+    def __init__(self):
+        super().__init__()
+    def forward(self,x):
+        """Return ``x * tanh(softplus(x))``."""
+        return x * (torch.tanh(F.softplus(x)))
+
+class STModule(nn.Module):
+    def __init__(self,in_channels, out_channels, incidence, num_point):
+        super(STModule, self).__init__()
+        """
+        This class implements augmented graph spatial convolution in case of Spatial Transformer
+        Fucntion adapated from: https://github.com/Chiaraplizz/ST-TR/blob/master/code/st_gcn/net/gcn_attention.py
+        """
+        self.in_channels = in_channels
+        self.out_channels = out_channels
+        self.incidence = incidence
+        self.num_point = num_point
+        self.relu = Mish()
+        self.bn = nn.BatchNorm2d(out_channels)
+        self.data_bn = nn.BatchNorm1d(self.in_channels * self.num_point)
+        self.attention_conv = SpatialAttention(in_channels=in_channels,out_channel=out_channels,A=self.incidence,num_point=self.num_point)
+    def forward(self,x):
+        N, C, T, V = x.size()
+        # data normlization
+        x = x.permute(0, 1, 3, 2).reshape(N, C * V, T)
+        x = self.data_bn(x)
+        x = x.reshape(N, C, V, T).permute(0, 1, 3, 2)
+        # adjacency matrix
+        self.incidence = self.incidence.cuda(x.get_device())
+        # N, T, C, V > NT, C, 1, V
+        xa = x.permute(0, 2, 1, 3).reshape(-1, C, 1, V)
+        # spatial attention
+        attn_out = self.attention_conv(xa)
+        # N, T, C, V > N, C, T, V
+        attn_out = attn_out.reshape(N, T, -1, V).permute(0, 2, 1, 3)
+        y = attn_out
+        y = self.bn(self.relu(y))
+        return y
+
+class UnitConv2D(nn.Module):
+    '''
+    Temporal convolution unit used in the GaitTR [TCN_ST] block.
+
+    Applies dropout, a (kernel_size x 1) convolution over the third axis,
+    Mish and batch norm, in that order.
+    '''
+
+    def __init__(self, D_in, D_out, kernel_size=9, stride=1, dropout=0.1, bias=True):
+        """
+        Args:
+            D_in: input channels.
+            D_out: output channels.
+            kernel_size: kernel extent along the third axis; the padding of
+                (kernel_size - 1) / 2 keeps the length for odd kernels at
+                stride 1.
+            stride: stride along the third axis.
+            dropout: dropout probability applied to the input.
+            bias: whether the convolution has a bias term.
+        """
+        super(UnitConv2D,self).__init__()
+        pad = int((kernel_size-1)/2)
+        self.conv = nn.Conv2d(D_in,D_out,kernel_size=(kernel_size,1)
+                            ,padding=(pad,0),stride=(stride,1),bias=bias)
+        self.bn = nn.BatchNorm2d(D_out)
+        self.relu = Mish()
+        self.dropout = nn.Dropout(dropout, inplace=False)
+        # initialize the convolution weights
+        self.conv_init(self.conv)
+
+    def forward(self,x):
+        """Return ``bn(mish(conv(dropout(x))))``."""
+        x = self.dropout(x)
+        x = self.bn(self.relu(self.conv(x)))
+        return x
+
+    def conv_init(self,module):
+        """Draw the weights from N(0, 2 / n), n = out_channels * kernel area.
+
+        Only ``module.weight`` is touched; the bias keeps its default values.
+        """
+        n = module.out_channels
+        for k in module.kernel_size:
+            n = n*k
+        module.weight.data.normal_(0, math.sqrt(2. / n))
+
+class TCN_ST(nn.Module):
+    """
+    Block of GaitTR: https://arxiv.org/pdf/2204.03873.pdf
+    TCN: Temporal Convolution Network
+    ST: Spatial Temporal Graph Convolution Network
+
+    A temporal convolution with an identity residual, followed by the spatial
+    attention module with its own residual, plus a skip connection from the
+    block input.
+    """
+    def __init__(self,in_channel,out_channel,A,num_point):
+        """
+        Args:
+            in_channel: channels of the block input.
+            out_channel: channels of the block output.
+            A: adjacency tensor forwarded to STModule.
+            num_point: number of joints, forwarded to STModule.
+        """
+        super(TCN_ST, self).__init__()
+        #params
+        self.in_channel = in_channel
+        self.out_channel = out_channel
+        self.A = A
+        self.num_point = num_point
+        #network
+        self.tcn = UnitConv2D(D_in=self.in_channel,D_out=self.in_channel,kernel_size=9)
+        self.st = STModule(in_channels=self.in_channel,out_channels=self.out_channel,incidence=self.A,num_point=self.num_point)
+        # The temporal stage keeps the channel count, so its residual is the
+        # identity.
+        self.residual = lambda x: x
+        if (in_channel != out_channel):
+            # Channel count changes: project both the spatial residual and the
+            # outer skip connection to out_channel.
+            self.residual_s = nn.Sequential(
+                nn.Conv2d(in_channel, out_channel, 1),
+                nn.BatchNorm2d(out_channel),
+            )
+            self.down = UnitConv2D(D_in=self.in_channel,D_out=out_channel,kernel_size=1,dropout=0)
+        else:
+            self.residual_s = lambda x: x
+            self.down = None
+
+    def forward(self,x):
+        """Run the temporal stage, the spatial stage, and add the block-level skip."""
+        x0 = self.tcn(x) + self.residual(x)
+        y = self.st(x0) + self.residual_s(x0)
+        # skip residual
+        y = y + (x if(self.down is None) else self.down(x))
+        return y
+
+
+
+class GaitTR(BaseModel):
+    """
+        GaitTR: Spatial Transformer Network on Skeleton-based Gait Recognition
+        Arxiv : https://arxiv.org/abs/2204.03873.pdf
+
+        Input batch norm, a stack of TCN_ST blocks, average pooling over
+        joints, instances and frames, and a 1x1 convolution head.
+    """
+    def build_network(self, model_cfg):
+        """Build the graph, the input batch norm, the backbone and the head.
+
+        Args:
+            model_cfg: mapping providing 'in_channels' (list of channel sizes,
+                one TCN_ST block per consecutive pair), 'num_class' and
+                'joint_format'.
+        """
+
+        in_c = model_cfg['in_channels']
+        self.num_class = model_cfg['num_class']
+        self.joint_format = model_cfg['joint_format']
+        self.graph = Graph(joint_format=self.joint_format,max_hop=3)
+
+        #### Network Define ####
+
+        # adjacency matrix
+        self.A = torch.from_numpy(self.graph.A.astype(np.float32))
+
+        #data normalization
+        num_point = self.A.shape[-1]
+        # Sized for in_c[0] * num_point features. forward() feeds M * V * C
+        # features, so the two must match -- presumably M == 1; confirm.
+        self.data_bn = nn.BatchNorm1d(in_c[0] * num_point)
+        
+        #backbone
+        backbone = []
+        for i in range(len(in_c)-1):
+            backbone.append(TCN_ST(in_channel= in_c[i],out_channel= in_c[i+1],A=self.A,num_point=num_point))
+        self.backbone = nn.ModuleList(backbone)
+
+        self.fcn = nn.Conv1d(in_c[-1], self.num_class, kernel_size=1)
+
+    def forward(self, inputs):
+        """Compute embeddings for a batch of skeleton sequences.
+
+        Args:
+            inputs: tuple ``(ipts, labs, _, _, seqL)``. Only ``ipts[0]`` (a
+                5-D tensor) and ``labs`` are used here.
+
+        Returns:
+            dict with the keys 'training_feat', 'visual_summary' and
+            'inference_feat'. The embedding has shape (N, num_class, 1).
+        """
+        ipts, labs, _, _, seqL = inputs
+
+        x= ipts[0] 
+        pose = x
+        # x = N, T, C, V, M -> N, C, T, V, M
+        x = x.permute(0, 2, 1, 3, 4)
+        N, C, T, V, M = x.size()
+        # NOTE(review): this branch cannot be taken; the 5-axis permute and
+        # the 5-way unpacking above already require a 5-D tensor.
+        if len(x.size()) == 4:
+            x = x.unsqueeze(1)
+        del ipts
+
+        # Flatten (M, V, C) into one feature axis for the input batch norm.
+        x = x.permute(0, 4, 3, 1, 2).contiguous().view(N, M * V * C, T)
+
+        x = self.data_bn(x)
+        # Back to (N * M, C, T, V) for the backbone.
+        x = x.view(N, M, V, C, T).permute(0, 1, 3, 4, 2).contiguous().view(
+                N * M, C, T, V)
+        #backbone
+        for _,m in enumerate(self.backbone):
+            x = m(x)
+        # V pooling
+        x = F.avg_pool2d(x, kernel_size=(1,V))
+        # M pooling
+        c = x.size(1)
+        t = x.size(2)
+        x = x.view(N, M, c, t).mean(dim=1).view(N, c, t)#[n,c,t]
+        # T pooling
+        x = F.avg_pool1d(x, kernel_size=x.size()[2]) #[n,c,1]
+        # C fcn
+        x = self.fcn(x) #[n,c',1]
+        x = F.avg_pool1d(x, x.size()[2:]) # [n,c',1]
+        x = x.view(N, self.num_class) # n,c
+        embed = x.unsqueeze(-1) # n,c,1
+
+        retval = {
+            'training_feat': {
+                'triplet': {'embeddings': embed, 'labels': labs}
+            },
+            'visual_summary': {
+                'image/pose': pose.view(N*T, M, V, C)
+            },
+            'inference_feat': {
+                'embeddings': embed
+            }
+        }
+        return retval
@@ -0,0 +1,484 @@
+import torch
+import copy
+import torch.nn as nn
+import torch.nn.functional as F
+import numpy as np
+
+from ..base_model import BaseModel
+
+class MultiScaleGaitGraph(BaseModel):
+    """
+        Learning Rich Features for Gait Recognition by Integrating Skeletons and Silhouettes
+        Github: https://github.com/YunjiePeng/BimodalFusion
+
+        Three parallel ST-GCN streams run on graphs of decreasing node count
+        (semantic levels 0, 1 and 2). After every layer the finer stream is
+        pooled and added into the next coarser one.
+    """
+
+    def build_network(self, model_cfg):
+        """Build the three adjacency buffers, the three ST-GCN stacks and the head.
+
+        Args:
+            model_cfg: mapping providing 'in_channels' (list of channel
+                sizes), 'out_channels', 'num_id', 'temporal_kernel_size' and
+                'graph_cfg' (keyword arguments for SpatialGraph).
+        """
+        in_c = model_cfg['in_channels']
+        out_c = model_cfg['out_channels']
+        num_id = model_cfg['num_id']
+
+        temporal_kernel_size = model_cfg['temporal_kernel_size']
+
+        # load spatial graph
+        self.graph = SpatialGraph(**model_cfg['graph_cfg'])
+        A_lowSemantic = torch.tensor(self.graph.get_adjacency(semantic_level=0), dtype=torch.float32, requires_grad=False)
+        A_mediumSemantic =  torch.tensor(self.graph.get_adjacency(semantic_level=1), dtype=torch.float32, requires_grad=False)
+        A_highSemantic = torch.tensor(self.graph.get_adjacency(semantic_level=2), dtype=torch.float32, requires_grad=False)
+
+        # Registered as buffers so they follow the module across devices and
+        # are saved in the state dict without being trained.
+        self.register_buffer('A_lowSemantic', A_lowSemantic)
+        self.register_buffer('A_mediumSemantic', A_mediumSemantic)
+        self.register_buffer('A_highSemantic', A_highSemantic)
+
+        # build networks
+        spatial_kernel_size = self.graph.num_A
+        # NOTE(review): the next assignment is a no-op.
+        temporal_kernel_size = temporal_kernel_size
+        kernel_size = (temporal_kernel_size, spatial_kernel_size)
+
+        self.st_gcn_networks_lowSemantic = nn.ModuleList()
+        self.st_gcn_networks_mediumSemantic = nn.ModuleList()
+        self.st_gcn_networks_highSemantic = nn.ModuleList()
+        # Two blocks per channel transition: one changing the channel count
+        # (without a residual for the very first block) and one keeping it.
+        for i in range(len(in_c)-1):
+            if i == 0:
+                self.st_gcn_networks_lowSemantic.append(st_gcn_block(in_c[i], in_c[i+1], kernel_size, 1, residual=False))
+                self.st_gcn_networks_mediumSemantic.append(st_gcn_block(in_c[i], in_c[i+1], kernel_size, 1, residual=False))
+                self.st_gcn_networks_highSemantic.append(st_gcn_block(in_c[i], in_c[i+1], kernel_size, 1, residual=False))
+            else:
+                self.st_gcn_networks_lowSemantic.append(st_gcn_block(in_c[i], in_c[i+1], kernel_size, 1))
+                self.st_gcn_networks_mediumSemantic.append(st_gcn_block(in_c[i], in_c[i+1], kernel_size, 1))
+                self.st_gcn_networks_highSemantic.append(st_gcn_block(in_c[i], in_c[i+1], kernel_size, 1))
+
+            self.st_gcn_networks_lowSemantic.append(st_gcn_block(in_c[i+1], in_c[i+1], kernel_size, 1))
+            self.st_gcn_networks_mediumSemantic.append(st_gcn_block(in_c[i+1], in_c[i+1], kernel_size, 1))
+            self.st_gcn_networks_highSemantic.append(st_gcn_block(in_c[i+1], in_c[i+1], kernel_size, 1))
+
+        # One learnable mask per block, same shape as the adjacency and
+        # initialised to ones; it multiplies the adjacency in forward().
+        self.edge_importance_lowSemantic = nn.ParameterList([
+            nn.Parameter(torch.ones(self.A_lowSemantic.size()))
+            for i in self.st_gcn_networks_lowSemantic])
+
+        self.edge_importance_mediumSemantic = nn.ParameterList([
+            nn.Parameter(torch.ones(self.A_mediumSemantic.size()))
+            for i in self.st_gcn_networks_mediumSemantic])
+
+        self.edge_importance_highSemantic = nn.ParameterList([
+            nn.Parameter(torch.ones(self.A_highSemantic.size()))
+            for i in self.st_gcn_networks_highSemantic])
+
+        self.fc = nn.Linear(in_c[-1], out_c)
+        self.bn_neck = nn.BatchNorm1d(out_c)
+        self.encoder_cls = nn.Linear(out_c, num_id, bias=False)
+
+    def semantic_pooling(self, x):
+        """Halve the node axis by averaging its first and second halves.
+
+        The last axis of ``x`` is split into chunks of size ``V // 2``, which
+        must yield exactly two chunks (even V).
+        """
+        cur_node_num = x.size()[-1]
+        half_x_1, half_x_2 = torch.split(x, int(cur_node_num / 2), dim=-1)
+        x_sp = torch.add(half_x_1, half_x_2) / 2
+        return x_sp
+
+    def forward(self, inputs):
+        """Compute multi-scale embeddings and identity logits.
+
+        Args:
+            inputs: tuple ``(ipts, labs, _, _, seqL)``. Only ``ipts[0]`` of
+                shape [N, T, V, C] and ``labs`` are used here.
+
+        Returns:
+            dict with the keys 'training_feat' (three triplet entries and a
+            softmax entry), 'visual_summary' (empty) and 'inference_feat'.
+        """
+        ipts, labs, _, _, seqL = inputs
+        
+        x = ipts[0]  # [N, T, V, C]
+        del ipts
+        """
+           N - the number of videos.
+           T - the number of frames in one video.
+           V - the number of keypoints.
+           C - the number of features for one keypoint.
+        """
+        N, T, V, C = x.size()
+        x = x.permute(0, 3, 1, 2).contiguous()
+        x = x.view(N, C, T, V)
+
+        # Inputs of the medium and high level streams: V/2 and V/4 nodes.
+        y = self.semantic_pooling(x)
+        z = self.semantic_pooling(y)
+        for gcn_lowSemantic, importance_lowSemantic, gcn_mediumSemantic, importance_mediumSemantic, gcn_highSemantic, importance_highSemantic in zip(self.st_gcn_networks_lowSemantic, self.edge_importance_lowSemantic, self.st_gcn_networks_mediumSemantic, self.edge_importance_mediumSemantic, self.st_gcn_networks_highSemantic, self.edge_importance_highSemantic):
+            x, _ = gcn_lowSemantic(x, self.A_lowSemantic * importance_lowSemantic)
+            y, _ = gcn_mediumSemantic(y, self.A_mediumSemantic * importance_mediumSemantic)
+            z, _ = gcn_highSemantic(z, self.A_highSemantic * importance_highSemantic)
+
+            # Cross-scale Message Passing
+            x_sp = self.semantic_pooling(x)
+            y = torch.add(y, x_sp)
+            y_sp = self.semantic_pooling(y)
+            z = torch.add(z, y_sp)
+        
+        # global pooling for each layer
+        # Each stream is averaged over all frames and nodes; the names
+        # N, C, T, V are rebound to the pooled sizes (T == V == 1).
+        x_sp = F.avg_pool2d(x, x.size()[2:])
+        N, C, T, V = x_sp.size()
+        x_sp = x_sp.view(N, C, T*V).contiguous()
+
+        y_sp = F.avg_pool2d(y, y.size()[2:])
+        N, C, T, V = y_sp.size()
+        y_sp = y_sp.view(N, C, T*V).contiguous()
+
+        z = F.avg_pool2d(z, z.size()[2:])
+        N, C, T, V = z.size()
+        z = z.permute(0, 2, 3, 1).contiguous()
+        z = z.view(N, T*V, C)
+
+        # Head on the high level stream only: linear, batch norm, classifier.
+        z_fc = self.fc(z.view(N, -1))
+        bn_z_fc = self.bn_neck(z_fc)
+        z_cls_score = self.encoder_cls(bn_z_fc)
+
+        z_fc = z_fc.unsqueeze(-1).contiguous() # [n, c, p]
+        z_cls_score = z_cls_score.unsqueeze(-1).contiguous() # [n, c, p]
+
+        retval = {
+            'training_feat': {
+                'triplet_joints': {'embeddings': x_sp, 'labels': labs},
+                'triplet_limbs': {'embeddings': y_sp, 'labels': labs},
+                'triplet_bodyparts': {'embeddings': z_fc, 'labels': labs},
+                'softmax': {'logits': z_cls_score, 'labels': labs}
+            },
+            'visual_summary': {},
+            'inference_feat': {
+                'embeddings': z_fc
+            }
+        }
+        return retval
+
+class st_gcn_block(nn.Module):
+    r"""Applies a spatial temporal graph convolution over an input graph sequence.
+
+    The block is a graph convolution (SCN) followed by a temporal convolution
+    stack, an optional residual connection, and a final ReLU.
+
+    Args:
+        in_channels (int): Number of channels in the input sequence data
+        out_channels (int): Number of channels produced by the convolution
+        kernel_size (tuple): Size of the temporal convolving kernel and graph convolving kernel
+        stride (int, optional): Stride of the temporal convolution. Default: 1
+        dropout (int, optional): Dropout rate of the final output. Default: 0
+        residual (bool, optional): If ``True``, applies a residual mechanism. Default: ``True``
+    Shape:
+        - Input[0]: Input graph sequence in :math:`(N, in_channels, T_{in}, V)` format
+        - Input[1]: Input graph adjacency matrix in :math:`(K, V, V)` format
+        - Output[0]: Output graph sequence in :math:`(N, out_channels, T_{out}, V)` format
+        - Output[1]: Graph adjacency matrix for output data in :math:`(K, V, V)` format
+        where
+            :math:`N` is a batch size, i.e. the number of videos.
+            :math:`K` is the spatial kernel size, as :math:`K == kernel_size[1]`.
+            :math:`T_{in}/T_{out}` is a length of input/output sequence, i.e. the number of frames in a video.
+            :math:`V` is the number of graph nodes.
+    """
+
+    def __init__(self,
+                 in_channels,
+                 out_channels,
+                 kernel_size,
+                 stride=1,
+                 dropout=0,
+                 residual=True):
+        super().__init__()
+
+        # The temporal kernel must be odd so the symmetric padding below
+        # keeps the sequence length at stride 1.
+        assert len(kernel_size) == 2
+        assert kernel_size[0] % 2 == 1
+        padding = ((kernel_size[0] - 1) // 2, 0)
+
+        self.gcn = SCN(in_channels, out_channels, kernel_size[1])
+
+        self.tcn = nn.Sequential(
+            nn.BatchNorm2d(out_channels),
+            nn.ReLU(inplace=True),
+            nn.Conv2d(
+                out_channels,
+                out_channels,
+                (kernel_size[0], 1),
+                (stride, 1),
+                padding,
+            ),
+            nn.BatchNorm2d(out_channels),
+            nn.Dropout(dropout, inplace=True),
+        )
+
+        # Residual path: disabled, identity when shapes already match, or a
+        # 1x1 convolution + batch norm to match channels and stride.
+        if not residual:
+            self.residual = lambda x: 0
+
+        elif (in_channels == out_channels) and (stride == 1):
+            self.residual = lambda x: x
+
+        else:
+            self.residual = nn.Sequential(
+                nn.Conv2d(
+                    in_channels,
+                    out_channels,
+                    kernel_size=1,
+                    stride=(stride, 1)),
+                nn.BatchNorm2d(out_channels),
+            )
+
+        self.relu = nn.ReLU(inplace=True)
+
+    def forward(self, x, A):
+        """Return ``(relu(tcn(gcn(x, A)) + residual(x)), A)``."""
+        res = self.residual(x)
+        x, A = self.gcn(x, A)
+        x = self.tcn(x) + res
+
+        return self.relu(x), A
+
+class SCN(nn.Module):
+    r"""The basic module for applying a graph convolution.
+    Args:
+        in_channels (int): Number of channels in the input sequence data
+        out_channels (int): Number of channels produced by the convolution
+        kernel_size (int): Size of the graph convolving kernel
+        t_kernel_size (int): Size of the temporal convolving kernel
+        t_stride (int, optional): Stride of the temporal convolution. Default: 1
+        t_padding (int, optional): Temporal zero-padding added to both sides of
+            the input. Default: 0
+        t_dilation (int, optional): Spacing between temporal kernel elements.
+            Default: 1
+        bias (bool, optional): If ``True``, adds a learnable bias to the output.
+            Default: ``True``
+    Shape:
+        - Input[0]: Input graph sequence in :math:`(N, in_channels, T_{in}, V)` format
+        - Input[1]: Input graph adjacency matrix in :math:`(K, V, V)` format
+        - Output[0]: Output graph sequence in :math:`(N, out_channels, T_{out}, V)` format
+        - Output[1]: Graph adjacency matrix for output data in :math:`(K, V, V)` format
+        where
+            :math:`N` is a batch size,
+            :math:`K` is the spatial kernel size, as :math:`K == kernel_size`,
+            :math:`T_{in}/T_{out}` is a length of input/output sequence,
+            :math:`V` is the number of graph nodes.
+    """
+    def __init__(self,
+                 in_channels,
+                 out_channels,
+                 kernel_size,
+                 t_kernel_size=1,
+                 t_stride=1,
+                 t_padding=0,
+                 t_dilation=1,
+                 bias=True):
+        super().__init__()
+        # The defined module SCN is responsible only for the Spatial Graph (i.e. the graph in one frame),
+        # and the parameter t_kernel_size in this situation is always set to 1.
+
+        self.kernel_size = kernel_size
+        # One group of out_channels filters per adjacency partition; the
+        # groups are separated again in forward().
+        self.conv = nn.Conv2d(in_channels,
+                              out_channels * kernel_size,
+                              kernel_size=(t_kernel_size, 1),
+                              padding=(t_padding, 0),
+                              stride=(t_stride, 1),
+                              dilation=(t_dilation, 1),
+                              bias=bias)
+        """
+        The 1x1 conv operation here stands for the weight metrix W.
+        The kernel_size here stands for the number of different adjacency matrix, 
+            which are defined according to the partitioning strategy.
+        Because for neighbor nodes in the same subset (in one adjacency matrix), the weights are shared. 
+        It is reasonable to apply 1x1 conv as the implementation of weight function.
+        """
+
+
+    def forward(self, x, A):
+        """Apply the graph convolution and return ``(output, A)``.
+
+        ``A`` must have ``kernel_size`` matrices along its first axis.
+        """
+        assert A.size(0) == self.kernel_size
+
+        x = self.conv(x)
+
+        # Split the channel axis into (K, C_out), then aggregate neighbours:
+        # sum over partitions k and source nodes v.
+        n, kc, t, v = x.size()
+        x = x.view(n, self.kernel_size, kc // self.kernel_size, t, v)
+        x = torch.einsum('nkctv,kvw->nctw', (x, A))
+
+        return x.contiguous(), A
+
+class SpatialGraph():
+    """ Use skeleton sequences extracted by Openpose/HRNet to construct Spatial-Temporal Graph
+
+    Args:
+        strategy (string): must be one of the follow candidates
+        - uniform: Uniform Labeling
+        - distance: Distance Partitioning
+        - spatial: Spatial Configuration Partitioning
+        - gait_temporal: Gait Temporal Configuration Partitioning
+            For more information, please refer to the section 'Partition Strategies' in PGG.
+        layout (string): must be one of the follow candidates
+        - body_12: Is consists of 12 joints.
+            (right shoulder, right elbow, right knee, right hip, left elbow, left knee,
+             left shoulder, right wrist, right ankle, left hip, left wrist, left ankle).
+            For more information, please refer to the section 'Data Processing' in PGG.
+        max_hop (int): the maximal distance between two connected nodes # 1-neighbor
+        dilation (int): controls the spacing between the kernel points
+    """
+    def __init__(self,
+                 layout='body_12', # Openpose here represents for body_12
+                 strategy='spatial',
+                 semantic_level=0,
+                 max_hop=1,
+                 dilation=1):
+        self.layout = layout
+        self.strategy = strategy
+        self.max_hop = max_hop
+        self.dilation = dilation
+        self.num_node, self.neighbor_link_dic = self.get_layout_info(layout)
+        self.num_A = self.get_A_num(strategy)
+
+    def __str__(self):
+        return self.A
+
+    def get_A_num(self, strategy):
+        if self.strategy == 'uniform':
+            return 1
+        elif self.strategy == 'distance':
+            return 2
+        elif (self.strategy == 'spatial') or (self.strategy == 'gait_temporal'):
+            return 3
+        else:
+            raise ValueError("Do Not Exist This Strategy")
+
+    def get_layout_info(self, layout):
+        if layout == 'body_12':
+            num_node = 12
+            neighbor_link_dic = {
+                0: [(7, 1), (1, 0), (10, 4), (4, 6),
+                     (8, 2), (2, 3), (11, 5), (5, 9),
+                     (9, 3), (3, 0), (9, 6), (6, 0)],
+                1: [(1, 0), (4, 0), (0, 3), (2, 3), (5, 3)],
+                2: [(1, 0), (2, 0)]
+            }
+            return num_node, neighbor_link_dic
+        else:
+            raise ValueError("Do Not Exist This Layout.")
+
+    def get_edge(self, semantic_level):
+        # edge is a list of [child, parent] pairs, regarding the center node as root node
+        self_link = [(i, i) for i in range(int(self.num_node / (2 ** semantic_level)))]
+        neighbor_link = self.neighbor_link_dic[semantic_level]
+        edge = self_link + neighbor_link
+        center = []
+        if self.layout == 'body_12':
+            if semantic_level == 0:
+                center = [0, 3, 6, 9]
+            elif semantic_level == 1:
+                center = [0, 3]
+            elif semantic_level == 2:
+                center = [0]
+        return edge, center
+
+    def get_gait_temporal_partitioning(self, semantic_level):
+        if semantic_level == 0:
+            if self.layout == 'body_12':
+                positive_node = {1, 2, 4, 5, 7, 8, 10, 11}
+                negative_node = {0, 3, 6, 9}
+        elif semantic_level == 1:
+            if self.layout == 'body_12':
+                positive_node = {1, 2, 4, 5}
+                negative_node = {0, 3}
+        elif semantic_level == 2:
+            if self.layout == 'body_12':
+                positive_node = {1, 2}
+                negative_node = {0}
+        return positive_node, negative_node
+            
+    def get_adjacency(self, semantic_level):
+        edge, center = self.get_edge(semantic_level)
+        num_node = int(self.num_node / (2 ** semantic_level))
+        hop_dis = get_hop_distance(num_node, edge, max_hop=self.max_hop)
+                
+        valid_hop = range(0, self.max_hop + 1, self.dilation)
+        adjacency = np.zeros((num_node, num_node))
+        for hop in valid_hop:
+            adjacency[hop_dis == hop] = 1
+
+        normalize_adjacency = normalize_digraph(adjacency)
+        # normalize_adjacency = adjacency # withoutNodeNorm
+
+        # normalize_adjacency[a][b] = x
+        # when x = 0, node b has no connection with node a within valid hop.
+        # when x ≠ 0, the normalized adjacency from node b to node a is x.
+        # the value of x is normalized by the number of adjacent neighbor nodes around the node b.
+
+        if self.strategy == 'uniform':
+            A = np.zeros((1, num_node, num_node))
+            A[0] = normalize_adjacency
+            return A
+        elif self.strategy == 'distance':
+            A = np.zeros((len(valid_hop), num_node, num_node))
+            for i, hop in enumerate(valid_hop):
+                A[i][hop_dis == hop] = normalize_adjacency[hop_dis == hop]
+            return A
+        elif self.strategy == 'spatial':
+            A = []
+            for hop in valid_hop:
+                a_root = np.zeros((num_node, num_node))
+                a_close = np.zeros((num_node, num_node))
+                a_further = np.zeros((num_node, num_node))
+                for i in range(num_node):
+                    for j in range(num_node):
+                        if hop_dis[j, i] == hop:
+                            j_hop_dis = min([hop_dis[j, _center] for _center in center])
+                            i_hop_dis = min([hop_dis[i, _center] for _center in center])
+                            if j_hop_dis == i_hop_dis:
+                                a_root[j, i] = normalize_adjacency[j, i]
+                            elif j_hop_dis > i_hop_dis:
+                                a_close[j, i] = normalize_adjacency[j, i]
+                            else:
+                                a_further[j, i] = normalize_adjacency[j, i]
+                if hop == 0:
+                    A.append(a_root)
+                else:
+                    A.append(a_root + a_close)
+                    A.append(a_further)
+            A = np.stack(A)
+            self.A = A
+            return A
+        elif self.strategy == 'gait_temporal':
+            A = []
+            positive_node, negative_node = self.get_gait_temporal_partitioning(semantic_level)
+            for hop in valid_hop:
+                a_root = np.zeros((num_node, num_node))
+                a_positive = np.zeros((num_node, num_node))
+                a_negative = np.zeros((num_node, num_node))
+                for i in range(num_node):
+                    for j in range(num_node):
+                        if hop_dis[j, i] == hop:
+                            if i == j:
+                                a_root[j, i] = normalize_adjacency[j, i]
+                            elif j in positive_node:
+                                a_positive[j, i] = normalize_adjacency[j, i]
+                            else:
+                                a_negative[j, i] = normalize_adjacency[j, i]
+                
+                if hop == 0:
+                    A.append(a_root)
+                else:
+                    A.append(a_negative)
+                    A.append(a_positive)
+            A = np.stack(A)
+            return A
+        else:
+            raise ValueError("Do Not Exist This Strategy")
+
+
+def get_hop_distance(num_node, edge, max_hop=1):
+    """Return the (num_node, num_node) matrix of hop distances up to ``max_hop``.
+
+    Args:
+        num_node: number of graph nodes.
+        edge: iterable of (i, j) node-index pairs, treated as undirected.
+        max_hop: largest distance to resolve.
+
+    Returns:
+        Array whose entry [a, b] is the smallest d in 0..max_hop for which
+        the d-th power of the adjacency matrix is positive at [a, b], or
+        ``np.inf`` when there is none.
+    """
+    # Calculate the shortest path between nodes
+    # i.e. The minimum number of steps needed to walk from one node to another
+    A = np.zeros((num_node, num_node)) # Adjacency Matrix
+    for i, j in edge:
+        A[j, i] = 1
+        A[i, j] = 1
+
+    # compute hop steps
+    hop_dis = np.zeros((num_node, num_node)) + np.inf
+    transfer_mat = [np.linalg.matrix_power(A, d) for d in range(max_hop + 1)]
+    arrive_mat = (np.stack(transfer_mat) > 0)
+    # Write larger distances first so smaller ones overwrite them.
+    for d in range(max_hop, -1, -1):
+        hop_dis[arrive_mat[d]] = d
+    return hop_dis
+
+
+def normalize_digraph(A):
+    """Return ``A @ D^-1``, where D is the diagonal matrix of column sums of ``A``.
+
+    Every column with a positive sum is divided by that sum; the other
+    columns are multiplied by zero.
+    """
+    Dl = np.sum(A, 0)
+    num_node = A.shape[0]
+    Dn = np.zeros((num_node, num_node))
+    for i in range(num_node):
+        if Dl[i] > 0:
+            Dn[i, i] = Dl[i]**(-1)
+    AD = np.dot(A, Dn)
+    return AD
+
+
+def normalize_undigraph(A):
+    """Return ``D^-1/2 @ A @ D^-1/2``, where D is the diagonal matrix of column sums of ``A``.
+
+    Diagonal entries of ``D^-1/2`` stay zero for columns whose sum is not
+    positive.
+    """
+    Dl = np.sum(A, 0)
+    num_node = A.shape[0]
+    Dn = np.zeros((num_node, num_node))
+    for i in range(num_node):
+        if Dl[i] > 0:
+            Dn[i, i] = Dl[i]**(-0.5)
+    DAD = np.dot(np.dot(Dn, A), Dn)
+    return DAD