add BiggerGait
This commit is contained in:
@@ -0,0 +1,127 @@
|
|||||||
|
data_cfg:
|
||||||
|
dataset_name: CCPG
|
||||||
|
# TODO
|
||||||
|
dataset_root: your_path # use datasets/pretreatment_rgb.py for data preprocessing!
|
||||||
|
dataset_partition: ./datasets/CCPG/CCPG.json
|
||||||
|
data_in_use: [True, False] # images / real_ratios
|
||||||
|
num_workers: 8
|
||||||
|
remove_no_gallery: false # Remove probe if no gallery for it
|
||||||
|
test_dataset_name: CCPG
|
||||||
|
|
||||||
|
evaluator_cfg:
|
||||||
|
enable_float16: true
|
||||||
|
restore_ckpt_strict: False # Rename some module name for clarity, so it is fasle.
|
||||||
|
restore_hint: 30000 # BiggerGait__SmallDINOv2_Gaitbase_84Frame30_448224_6432HPP32_NoAlign_Sep12B_WiMask-30000.pt in HuggingFace
|
||||||
|
save_name: BiggerGait__Dinov2
|
||||||
|
eval_func: evaluate_CCPG
|
||||||
|
sampler:
|
||||||
|
batch_shuffle: false
|
||||||
|
batch_size: 8 # GPUs number
|
||||||
|
sample_type: all_ordered # all indicates whole sequence used to test, while ordered means input sequence by its natural order; Other options: fixed_unordered
|
||||||
|
frames_all_limit: 250 # limit the number of sampled frames to prevent out of memory
|
||||||
|
metric: euc # cos
|
||||||
|
transform:
|
||||||
|
- type: BaseRgbTransform
|
||||||
|
|
||||||
|
loss_cfg:
|
||||||
|
- loss_term_weight: 1.0
|
||||||
|
margin: 0.2
|
||||||
|
type: TripletLoss
|
||||||
|
log_prefix: triplet
|
||||||
|
- loss_term_weight: 1.0
|
||||||
|
scale: 16
|
||||||
|
type: CrossEntropyLoss
|
||||||
|
log_prefix: softmax
|
||||||
|
log_accuracy: true
|
||||||
|
|
||||||
|
model_cfg:
|
||||||
|
model: BiggerGait__DINOv2
|
||||||
|
pretrained_lvm: ./pretrained_LVMs/dinov2-small # DINOv2-S HuggingFace Link: "git clone https://huggingface.co/facebook/dinov2-small" + "git lfs pull"
|
||||||
|
pretrained_mask_branch: ./pretrained_LVMs/MaskBranch_vits14.pt # Using BigGait Code for pretraining at first, it is very lightly. pretrained_mask_branch: None or MaskBranch Download Link: https://drive.google.com/drive/folders/1zrWPUsrbCpwxoLgfom3d2irgxkBqtXqc?usp=sharing
|
||||||
|
image_size: 224 # 448x224
|
||||||
|
sils_size: 32 # 64x32
|
||||||
|
|
||||||
|
source_dim: 384
|
||||||
|
num_unknown: 16
|
||||||
|
total_layer_num: 12
|
||||||
|
group_layer_num: 1
|
||||||
|
head_num: 12
|
||||||
|
|
||||||
|
Mask_Branch:
|
||||||
|
source_dim: 384
|
||||||
|
target_dim: 2
|
||||||
|
p: 0.
|
||||||
|
softmax: True
|
||||||
|
|
||||||
|
backbone_cfg:
|
||||||
|
type: ResNet9
|
||||||
|
block: BasicBlock
|
||||||
|
in_channel: 16
|
||||||
|
channels: # Layers configuration for automatically model construction
|
||||||
|
- 64
|
||||||
|
- 128
|
||||||
|
- 256
|
||||||
|
- 512
|
||||||
|
layers:
|
||||||
|
- 1
|
||||||
|
- 1
|
||||||
|
- 1
|
||||||
|
- 1
|
||||||
|
strides:
|
||||||
|
- 1
|
||||||
|
- 2
|
||||||
|
- 1
|
||||||
|
- 1
|
||||||
|
maxpool: false
|
||||||
|
SeparateFCs:
|
||||||
|
in_channels: 512
|
||||||
|
out_channels: 256
|
||||||
|
parts_num: 32
|
||||||
|
SeparateBNNecks:
|
||||||
|
class_num: 100
|
||||||
|
in_channels: 256
|
||||||
|
parts_num: 32
|
||||||
|
bin_num:
|
||||||
|
- 32
|
||||||
|
|
||||||
|
optimizer_cfg:
|
||||||
|
lr: 0.1
|
||||||
|
momentum: 0.9
|
||||||
|
solver: SGD
|
||||||
|
weight_decay: 0.0005
|
||||||
|
|
||||||
|
scheduler_cfg:
|
||||||
|
gamma: 0.1
|
||||||
|
milestones: # Learning Rate Reduction at each milestones
|
||||||
|
- 15000
|
||||||
|
- 25000
|
||||||
|
scheduler: MultiStepLR
|
||||||
|
|
||||||
|
trainer_cfg:
|
||||||
|
find_unused_parameters: True
|
||||||
|
enable_float16: true # half_percesion float for memory reduction and speedup
|
||||||
|
fix_BN: false
|
||||||
|
log_iter: 100
|
||||||
|
with_test: true
|
||||||
|
restore_ckpt_strict: true
|
||||||
|
restore_hint: 0
|
||||||
|
save_iter: 10000
|
||||||
|
save_name: BiggerGait__Dinov2
|
||||||
|
sync_BN: true
|
||||||
|
total_iter: 30000
|
||||||
|
sampler:
|
||||||
|
batch_shuffle: true
|
||||||
|
batch_size:
|
||||||
|
- 8 # TripletSampler, batch_size[0] indicates Number of Identity
|
||||||
|
- 4 # batch_size[1] indicates Samples sequqnce for each Identity
|
||||||
|
frames_num_fixed: 30 # fixed frames number for training
|
||||||
|
frames_skip_num: 4
|
||||||
|
frames_num_max: 40 # max frames number for unfixed training
|
||||||
|
frames_num_min: 20 # min frames number for unfixed traing
|
||||||
|
sample_type: fixed_unordered # fixed control input frames number, unordered for controlling order of input tensor; Other options: unfixed_ordered or all_ordered
|
||||||
|
type: TripletSampler
|
||||||
|
transform:
|
||||||
|
- type: Compose
|
||||||
|
trf_cfg:
|
||||||
|
- type: RandomHorizontalFlip
|
||||||
|
- type: BaseRgbTransform
|
||||||
@@ -0,0 +1,127 @@
|
|||||||
|
data_cfg:
|
||||||
|
dataset_name: CCPG
|
||||||
|
# TODO
|
||||||
|
dataset_root: your_path # use datasets/pretreatment_rgb.py for data preprocessing!
|
||||||
|
dataset_partition: ./datasets/CCPG/CCPG.json
|
||||||
|
data_in_use: [True, False] # images / real_ratios
|
||||||
|
num_workers: 8
|
||||||
|
remove_no_gallery: false # Remove probe if no gallery for it
|
||||||
|
test_dataset_name: CCPG
|
||||||
|
|
||||||
|
evaluator_cfg:
|
||||||
|
enable_float16: true
|
||||||
|
restore_ckpt_strict: False # Rename some module name for clarity, so it is fasle.
|
||||||
|
restore_hint: 30000 # BiggerGait__SmallDINOv2_Gaitbase_84Frame30_448224_6432HPP32_NoAlign_Sep12B_WiMask_2B_6G-30000.pt in HuggingFace
|
||||||
|
save_name: BiggerGait__Dinov2_Group
|
||||||
|
eval_func: evaluate_CCPG
|
||||||
|
sampler:
|
||||||
|
batch_shuffle: false
|
||||||
|
batch_size: 8 # GPUs number
|
||||||
|
sample_type: all_ordered # all indicates whole sequence used to test, while ordered means input sequence by its natural order; Other options: fixed_unordered
|
||||||
|
frames_all_limit: 250 # limit the number of sampled frames to prevent out of memory
|
||||||
|
metric: euc # cos
|
||||||
|
transform:
|
||||||
|
- type: BaseRgbTransform
|
||||||
|
|
||||||
|
loss_cfg:
|
||||||
|
- loss_term_weight: 1.0
|
||||||
|
margin: 0.2
|
||||||
|
type: TripletLoss
|
||||||
|
log_prefix: triplet
|
||||||
|
- loss_term_weight: 1.0
|
||||||
|
scale: 16
|
||||||
|
type: CrossEntropyLoss
|
||||||
|
log_prefix: softmax
|
||||||
|
log_accuracy: true
|
||||||
|
|
||||||
|
model_cfg:
|
||||||
|
model: BiggerGait__DINOv2
|
||||||
|
pretrained_lvm: ./pretrained_LVMs/dinov2-small # DINOv2-S HuggingFace Link: "git clone https://huggingface.co/facebook/dinov2-small" + "git lfs pull"
|
||||||
|
pretrained_mask_branch: ./pretrained_LVMs/MaskBranch_vits14.pt # Using BigGait Code for pretraining at first, it is very lightly. pretrained_mask_branch: None or MaskBranch Download Link: https://drive.google.com/drive/folders/1zrWPUsrbCpwxoLgfom3d2irgxkBqtXqc?usp=sharing
|
||||||
|
image_size: 224 # 448x224
|
||||||
|
sils_size: 32 # 64x32
|
||||||
|
|
||||||
|
source_dim: 384
|
||||||
|
num_unknown: 16
|
||||||
|
total_layer_num: 12
|
||||||
|
group_layer_num: 2
|
||||||
|
head_num: 2
|
||||||
|
|
||||||
|
Mask_Branch:
|
||||||
|
source_dim: 384
|
||||||
|
target_dim: 2
|
||||||
|
p: 0.
|
||||||
|
softmax: True
|
||||||
|
|
||||||
|
backbone_cfg:
|
||||||
|
type: ResNet9
|
||||||
|
block: BasicBlock
|
||||||
|
in_channel: 16
|
||||||
|
channels: # Layers configuration for automatically model construction
|
||||||
|
- 64
|
||||||
|
- 128
|
||||||
|
- 256
|
||||||
|
- 512
|
||||||
|
layers:
|
||||||
|
- 1
|
||||||
|
- 1
|
||||||
|
- 1
|
||||||
|
- 1
|
||||||
|
strides:
|
||||||
|
- 1
|
||||||
|
- 2
|
||||||
|
- 1
|
||||||
|
- 1
|
||||||
|
maxpool: false
|
||||||
|
SeparateFCs:
|
||||||
|
in_channels: 512
|
||||||
|
out_channels: 256
|
||||||
|
parts_num: 32
|
||||||
|
SeparateBNNecks:
|
||||||
|
class_num: 100
|
||||||
|
in_channels: 256
|
||||||
|
parts_num: 32
|
||||||
|
bin_num:
|
||||||
|
- 32
|
||||||
|
|
||||||
|
optimizer_cfg:
|
||||||
|
lr: 0.1
|
||||||
|
momentum: 0.9
|
||||||
|
solver: SGD
|
||||||
|
weight_decay: 0.0005
|
||||||
|
|
||||||
|
scheduler_cfg:
|
||||||
|
gamma: 0.1
|
||||||
|
milestones: # Learning Rate Reduction at each milestones
|
||||||
|
- 15000
|
||||||
|
- 25000
|
||||||
|
scheduler: MultiStepLR
|
||||||
|
|
||||||
|
trainer_cfg:
|
||||||
|
find_unused_parameters: True
|
||||||
|
enable_float16: true # half_percesion float for memory reduction and speedup
|
||||||
|
fix_BN: false
|
||||||
|
log_iter: 100
|
||||||
|
with_test: true
|
||||||
|
restore_ckpt_strict: true
|
||||||
|
restore_hint: 0
|
||||||
|
save_iter: 10000
|
||||||
|
save_name: BiggerGait__Dinov2_Group
|
||||||
|
sync_BN: true
|
||||||
|
total_iter: 30000
|
||||||
|
sampler:
|
||||||
|
batch_shuffle: true
|
||||||
|
batch_size:
|
||||||
|
- 8 # TripletSampler, batch_size[0] indicates Number of Identity
|
||||||
|
- 4 # batch_size[1] indicates Samples sequqnce for each Identity
|
||||||
|
frames_num_fixed: 30 # fixed frames number for training
|
||||||
|
frames_skip_num: 4
|
||||||
|
frames_num_max: 40 # max frames number for unfixed training
|
||||||
|
frames_num_min: 20 # min frames number for unfixed traing
|
||||||
|
sample_type: fixed_unordered # fixed control input frames number, unordered for controlling order of input tensor; Other options: unfixed_ordered or all_ordered
|
||||||
|
type: TripletSampler
|
||||||
|
transform:
|
||||||
|
- type: Compose
|
||||||
|
trf_cfg:
|
||||||
|
- type: RandomHorizontalFlip
|
||||||
|
- type: BaseRgbTransform
|
||||||
@@ -188,3 +188,93 @@ class Baseline(nn.Module):
|
|||||||
_, logits = self.BNNecks(embed_1) # [n, c, p]
|
_, logits = self.BNNecks(embed_1) # [n, c, p]
|
||||||
# return embed_1, logits, heat_mapt
|
# return embed_1, logits, heat_mapt
|
||||||
return embed_1, logits
|
return embed_1, logits
|
||||||
|
|
||||||
|
|
||||||
|
class Baseline_Single(nn.Module):
|
||||||
|
def __init__(self, model_cfg):
|
||||||
|
super(Baseline_Single, self).__init__()
|
||||||
|
self.pre_rgb = SetBlockWrapper(Pre_ResNet9(**model_cfg['backbone_cfg']))
|
||||||
|
self.post_backbone = SetBlockWrapper(Post_ResNet9(**model_cfg['backbone_cfg']))
|
||||||
|
self.FCs = SeparateFCs(**model_cfg['SeparateFCs'])
|
||||||
|
self.BNNecks = SeparateBNNecks(**model_cfg['SeparateBNNecks'])
|
||||||
|
self.TP = PackSequenceWrapper(torch.max)
|
||||||
|
self.HPP = HorizontalPoolingPyramid(bin_num=model_cfg['bin_num'])
|
||||||
|
|
||||||
|
def get_backbone(self, backbone_cfg):
|
||||||
|
"""Get the backbone of the model."""
|
||||||
|
if is_dict(backbone_cfg):
|
||||||
|
Backbone = get_attr_from([backbones], backbone_cfg['type'])
|
||||||
|
valid_args = get_valid_args(Backbone, backbone_cfg, ['type'])
|
||||||
|
return Backbone(**valid_args)
|
||||||
|
if is_list(backbone_cfg):
|
||||||
|
Backbone = nn.ModuleList([self.get_backbone(cfg)
|
||||||
|
for cfg in backbone_cfg])
|
||||||
|
return Backbone
|
||||||
|
raise ValueError(
|
||||||
|
"Error type for -Backbone-Cfg-, supported: (A list of) dict.")
|
||||||
|
|
||||||
|
def pre_forward(self, appearance, *args, **kwargs):
|
||||||
|
outs = self.pre_rgb(appearance, *args, **kwargs) # [n, c, s, h, w]
|
||||||
|
outs = self.post_backbone(outs, *args, **kwargs)
|
||||||
|
return outs
|
||||||
|
|
||||||
|
def forward(self, appearance, seqL, *args, **kwargs):
|
||||||
|
outs = self.pre_rgb(appearance, *args, **kwargs) # [n, c, s, h, w]
|
||||||
|
outs = self.post_backbone(outs, *args, **kwargs)
|
||||||
|
# Temporal Pooling, TP
|
||||||
|
outs = self.TP(outs, seqL, options={"dim": 2})[0] # [n, c, h, w]
|
||||||
|
# Horizontal Pooling Matching, HPM
|
||||||
|
outs = self.HPP(outs) # [n, c, p]
|
||||||
|
embed_1 = self.FCs(outs) # [n, c, p]
|
||||||
|
_, logits = self.BNNecks(embed_1) # [n, c, p]
|
||||||
|
return embed_1, logits
|
||||||
|
|
||||||
|
def test_1(self, appearance, *args, **kwargs):
|
||||||
|
outs = self.pre_rgb(appearance, *args, **kwargs) # [n, c, s, h, w]
|
||||||
|
outs = self.post_backbone(outs, *args, **kwargs)
|
||||||
|
return outs
|
||||||
|
|
||||||
|
def test_2(self, outs, seqL):
|
||||||
|
outs = self.TP(outs, seqL, options={"dim": 2})[0] # [n, c, h, w]
|
||||||
|
outs = self.HPP(outs) # [n, c, p]
|
||||||
|
embed_1 = self.FCs(outs) # [n, c, p]
|
||||||
|
_, logits = self.BNNecks(embed_1) # [n, c, p]
|
||||||
|
return embed_1, logits
|
||||||
|
|
||||||
|
class Baseline_Share(nn.Module):
|
||||||
|
def __init__(self, model_cfg):
|
||||||
|
super(Baseline_Share, self).__init__()
|
||||||
|
self.head_num = model_cfg['head_num']
|
||||||
|
self.num_FPN = model_cfg['total_layer_num'] // model_cfg['group_layer_num']
|
||||||
|
self.real_gait = nn.ModuleList([
|
||||||
|
Baseline_Single(model_cfg) for _ in range(self.head_num)
|
||||||
|
])
|
||||||
|
self.Gait_List = nn.ModuleList([
|
||||||
|
self.real_gait[_ // (self.num_FPN // self.head_num)] for _ in range(self.num_FPN)
|
||||||
|
])
|
||||||
|
|
||||||
|
def forward(self, x, seqL):
|
||||||
|
x = self.test_1(x)
|
||||||
|
embed_list, log_list = self.test_2(x, seqL)
|
||||||
|
return embed_list, log_list
|
||||||
|
|
||||||
|
def test_1(self, x, *args, **kwargs):
|
||||||
|
# x: [n, c, s, h, w]
|
||||||
|
n,c,s,h,w = x.shape
|
||||||
|
x_list = list(torch.chunk(x, self.num_FPN, dim=1))
|
||||||
|
for i in range(self.num_FPN):
|
||||||
|
x_list[i] = self.Gait_List[i].test_1(x_list[i], *args, **kwargs)
|
||||||
|
x = torch.concat(x_list, dim=1)
|
||||||
|
return x
|
||||||
|
|
||||||
|
def test_2(self, x, seqL):
|
||||||
|
# x: [n, c, s, h, w]
|
||||||
|
# embed_1: [n, c, p]
|
||||||
|
x_list = torch.chunk(x, self.num_FPN, dim=1)
|
||||||
|
embed_list = []
|
||||||
|
log_list = []
|
||||||
|
for i in range(self.num_FPN):
|
||||||
|
embed_1, logits = self.Gait_List[i].test_2(x_list[i], seqL)
|
||||||
|
embed_list.append(embed_1)
|
||||||
|
log_list.append(logits)
|
||||||
|
return embed_list, log_list
|
||||||
|
|||||||
@@ -0,0 +1,233 @@
|
|||||||
|
# Copyright (c) Meta Platforms, Inc. and affiliates.
|
||||||
|
# All rights reserved.
|
||||||
|
#
|
||||||
|
# This source code is licensed under the license found in the
|
||||||
|
# LICENSE file in the root directory of this source tree.
|
||||||
|
|
||||||
|
# References:
|
||||||
|
# https://github.com/facebookresearch/dino/blob/main/vision_transformer.py
|
||||||
|
# https://github.com/rwightman/pytorch-image-models/tree/master/timm/models/vision_transformer.py
|
||||||
|
|
||||||
|
|
||||||
|
import torch
|
||||||
|
import torch.nn as nn
|
||||||
|
import torch.utils.checkpoint
|
||||||
|
from einops import rearrange
|
||||||
|
from ..base_model import BaseModel
|
||||||
|
from torch.nn import functional as F
|
||||||
|
from kornia import morphology as morph
|
||||||
|
import random
|
||||||
|
|
||||||
|
from .BigGait_utils.BigGait_GaitBase import *
|
||||||
|
from .BigGait_utils.save_img import save_image, pca_image
|
||||||
|
from functools import partial
|
||||||
|
|
||||||
|
# ######################################## BiggerGait ###########################################
|
||||||
|
|
||||||
|
class infoDistillation(nn.Module):
|
||||||
|
def __init__(self, source_dim, target_dim, p, softmax):
|
||||||
|
super(infoDistillation, self).__init__()
|
||||||
|
self.dropout = nn.Dropout(p=p)
|
||||||
|
self.bn_s = nn.BatchNorm1d(source_dim, affine=False)
|
||||||
|
self.bn_t = nn.BatchNorm1d(target_dim, affine=False)
|
||||||
|
self.down_sampling = nn.Linear(source_dim, target_dim)
|
||||||
|
self.up_sampling = nn.Linear(target_dim, source_dim)
|
||||||
|
self.softmax = softmax
|
||||||
|
self.mse = nn.MSELoss()
|
||||||
|
|
||||||
|
def forward(self, x, mse=True):
|
||||||
|
# [n, c]
|
||||||
|
d_x = self.down_sampling(self.bn_s(self.dropout(x)))
|
||||||
|
d_x = F.softmax(d_x, dim=1)
|
||||||
|
if mse:
|
||||||
|
u_x = self.up_sampling(d_x)
|
||||||
|
return d_x, torch.mean(self.mse(u_x, x))
|
||||||
|
else:
|
||||||
|
return d_x, None
|
||||||
|
|
||||||
|
class ResizeToHW(torch.nn.Module):
|
||||||
|
def __init__(self, target_size):
|
||||||
|
super().__init__()
|
||||||
|
self.target_size = target_size
|
||||||
|
|
||||||
|
def forward(self, x):
|
||||||
|
return F.interpolate(x, size=self.target_size, mode='bilinear', align_corners=False)
|
||||||
|
|
||||||
|
class BiggerGait__DINOv2(BaseModel):
|
||||||
|
def build_network(self, model_cfg):
|
||||||
|
# get pretained models
|
||||||
|
self.pretrained_lvm = model_cfg["pretrained_lvm"]
|
||||||
|
self.pretrained_mask_branch = model_cfg["pretrained_mask_branch"]
|
||||||
|
|
||||||
|
# set input size
|
||||||
|
self.image_size = model_cfg["image_size"]
|
||||||
|
self.sils_size = model_cfg["sils_size"]
|
||||||
|
|
||||||
|
# set feature dim
|
||||||
|
self.f4_dim = model_cfg['source_dim']
|
||||||
|
self.num_unknown = model_cfg["num_unknown"]
|
||||||
|
|
||||||
|
# set layer / group / gait_head number
|
||||||
|
self.total_layer_num = model_cfg["total_layer_num"] # total layer number is 12
|
||||||
|
self.group_layer_num = model_cfg["group_layer_num"] # each group have 2 layers
|
||||||
|
self.head_num = model_cfg["head_num"] # 2 gait heads
|
||||||
|
assert self.total_layer_num % self.group_layer_num == 0
|
||||||
|
assert (self.total_layer_num // self.group_layer_num) % self.head_num == 0
|
||||||
|
self.num_FPN = self.total_layer_num // self.group_layer_num
|
||||||
|
|
||||||
|
self.Gait_Net = Baseline_Share(model_cfg)
|
||||||
|
|
||||||
|
self.HumanSpace_Conv = nn.ModuleList([
|
||||||
|
nn.Sequential(
|
||||||
|
nn.BatchNorm2d(self.f4_dim*self.group_layer_num, affine=False),
|
||||||
|
nn.Conv2d(self.f4_dim*self.group_layer_num, self.f4_dim//2, kernel_size=1),
|
||||||
|
nn.BatchNorm2d(self.f4_dim//2, affine=False),
|
||||||
|
nn.GELU(),
|
||||||
|
nn.Conv2d(self.f4_dim//2, self.num_unknown, kernel_size=1),
|
||||||
|
ResizeToHW((self.sils_size*2, self.sils_size)),
|
||||||
|
nn.BatchNorm2d(self.num_unknown, affine=False),
|
||||||
|
nn.Sigmoid()
|
||||||
|
) for _ in range(self.num_FPN)
|
||||||
|
])
|
||||||
|
self.Mask_Branch = infoDistillation(**model_cfg["Mask_Branch"])
|
||||||
|
|
||||||
|
def init_DINOv2(self):
|
||||||
|
from transformers import Dinov2Config, Dinov2Model
|
||||||
|
from transformers.modeling_outputs import BaseModelOutputWithPooling
|
||||||
|
config = Dinov2Config.from_pretrained(self.pretrained_lvm + "/config.json")
|
||||||
|
self.Backbone = Dinov2Model.from_pretrained(
|
||||||
|
self.pretrained_lvm,
|
||||||
|
config=config,
|
||||||
|
)
|
||||||
|
self.Backbone.cpu()
|
||||||
|
self.msg_mgr.log_info(f'load model from: {self.pretrained_lvm}')
|
||||||
|
|
||||||
|
def init_Mask_Branch(self):
|
||||||
|
self.msg_mgr.log_info(f'load model from: {self.pretrained_mask_branch}')
|
||||||
|
load_dict = torch.load(self.pretrained_mask_branch, map_location=torch.device("cpu"))['model']
|
||||||
|
msg = self.Mask_Branch.load_state_dict(load_dict, strict=True)
|
||||||
|
n_parameters = sum(p.numel() for p in self.Mask_Branch.parameters())
|
||||||
|
self.msg_mgr.log_info('Missing keys: {}'.format(msg.missing_keys))
|
||||||
|
self.msg_mgr.log_info('Unexpected keys: {}'.format(msg.unexpected_keys))
|
||||||
|
self.msg_mgr.log_info(f"=> loaded successfully '{self.pretrained_mask_branch}'")
|
||||||
|
self.msg_mgr.log_info('SegmentationBranch Count: {:.5f}M'.format(n_parameters / 1e6))
|
||||||
|
|
||||||
|
def init_parameters(self):
|
||||||
|
for m in self.modules():
|
||||||
|
if isinstance(m, (nn.Conv3d, nn.Conv2d, nn.Conv1d)):
|
||||||
|
nn.init.xavier_uniform_(m.weight.data)
|
||||||
|
if m.bias is not None:
|
||||||
|
nn.init.constant_(m.bias.data, 0.0)
|
||||||
|
elif isinstance(m, nn.Linear):
|
||||||
|
nn.init.xavier_uniform_(m.weight.data)
|
||||||
|
if m.bias is not None:
|
||||||
|
nn.init.constant_(m.bias.data, 0.0)
|
||||||
|
elif isinstance(m, (nn.BatchNorm3d, nn.BatchNorm2d, nn.BatchNorm1d)):
|
||||||
|
if m.affine:
|
||||||
|
nn.init.normal_(m.weight.data, 1.0, 0.02)
|
||||||
|
nn.init.constant_(m.bias.data, 0.0)
|
||||||
|
|
||||||
|
n_parameters = sum(p.numel() for p in self.parameters())
|
||||||
|
self.msg_mgr.log_info('Expect Backbone Count: {:.5f}M'.format(n_parameters / 1e6))
|
||||||
|
|
||||||
|
self.init_DINOv2()
|
||||||
|
self.init_Mask_Branch()
|
||||||
|
|
||||||
|
# # Cal GFlops
|
||||||
|
if self.training:
|
||||||
|
from fvcore.nn import FlopCountAnalysis
|
||||||
|
self.eval()
|
||||||
|
with torch.no_grad():
|
||||||
|
device = torch.distributed.get_rank()
|
||||||
|
inputs = ([[torch.randn((1,1,3,448,224),dtype=torch.float32).to(device), torch.rand(1,dtype=torch.float32).to(device)], None, None, None, None],)
|
||||||
|
flops = FlopCountAnalysis(self.to(device), inputs).total() / 1e9 # GFLOPs
|
||||||
|
self.train()
|
||||||
|
|
||||||
|
self.Backbone.eval()
|
||||||
|
self.Backbone.requires_grad_(False)
|
||||||
|
self.Mask_Branch.eval()
|
||||||
|
self.Mask_Branch.requires_grad_(False)
|
||||||
|
|
||||||
|
n_parameters = sum(p.numel() for p in self.parameters())
|
||||||
|
if self.training:
|
||||||
|
self.msg_mgr.log_info('All Backbone Count: {:.5f}M, {:.2f} GFLOPs'.format(n_parameters / 1e6, flops))
|
||||||
|
else:
|
||||||
|
self.msg_mgr.log_info('All Backbone Count: {:.5f}M'.format(n_parameters / 1e6))
|
||||||
|
|
||||||
|
self.msg_mgr.log_info("=> init successfully")
|
||||||
|
|
||||||
|
# resize image
|
||||||
|
def preprocess(self, sils, image_size, mode='bilinear'):
|
||||||
|
# shape: [nxs,c,h,w] / [nxs,c,224,112]
|
||||||
|
return F.interpolate(sils, (image_size*2, image_size), mode=mode, align_corners=False)
|
||||||
|
|
||||||
|
def min_max_norm(self, x):
|
||||||
|
return (x - x.min())/(x.max() - x.min())
|
||||||
|
|
||||||
|
# # ############################# For Train ##############################
|
||||||
|
|
||||||
|
def forward(self, inputs):
|
||||||
|
ipts, labs, ty, vi, seqL = inputs
|
||||||
|
rgb = ipts[0]
|
||||||
|
del ipts
|
||||||
|
|
||||||
|
# adjust gpu
|
||||||
|
rgb_chunks = torch.chunk(rgb, (rgb.size(1)//96)+1, dim=1)
|
||||||
|
all_outs = []
|
||||||
|
for _, rgb_img in enumerate(rgb_chunks):
|
||||||
|
with torch.no_grad():
|
||||||
|
# get RGB
|
||||||
|
n,s,c,h,w = rgb_img.size()
|
||||||
|
rgb_img = rearrange(rgb_img, 'n s c h w -> (n s) c h w').contiguous()
|
||||||
|
outs = self.preprocess(rgb_img, self.image_size)
|
||||||
|
outs = self.Backbone(outs,output_hidden_states=True).hidden_states[1:] # [ns,h*w,c]
|
||||||
|
|
||||||
|
intermediates = partial(nn.LayerNorm, eps=1e-6)(self.f4_dim*len(outs), elementwise_affine=False)(torch.concat(outs, dim=-1))[:,1:]
|
||||||
|
intermediates = rearrange(intermediates.view(n, s, self.image_size//7, self.image_size//14, -1), 'n s h w c -> (n s) c h w').contiguous()
|
||||||
|
intermediates = list(torch.chunk(intermediates, self.total_layer_num, dim=1))
|
||||||
|
|
||||||
|
human_mask = partial(nn.LayerNorm, eps=1e-6)(self.f4_dim, elementwise_affine=False)(outs[-1])[:,1:].contiguous()
|
||||||
|
human_mask, _ = self.Mask_Branch(human_mask.view(-1, self.f4_dim), mse=False)
|
||||||
|
human_mask = (human_mask[:,1] > 0.5).float() # check which is the foreground at first!!! 0 or 1; 50%;
|
||||||
|
human_mask = human_mask.view(n*s, 1, self.image_size//7, self.image_size//14)
|
||||||
|
human_mask = self.preprocess(human_mask, self.sils_size).detach().clone()
|
||||||
|
|
||||||
|
intermediates = [torch.cat(intermediates[i:i+self.group_layer_num], dim=1).contiguous() for i in range(0, self.total_layer_num, self.group_layer_num)]
|
||||||
|
for i in range(self.num_FPN):
|
||||||
|
intermediates[i] = self.HumanSpace_Conv[i](intermediates[i])
|
||||||
|
intermediates = torch.concat(intermediates, dim=1)
|
||||||
|
intermediates = intermediates * (human_mask > 0.5).to(intermediates)
|
||||||
|
intermediates = rearrange(intermediates.view(n, s, -1, self.sils_size*2, self.sils_size), 'n s c h w -> n c s h w').contiguous()
|
||||||
|
|
||||||
|
outs = self.Gait_Net.test_1(intermediates)
|
||||||
|
all_outs.append(outs)
|
||||||
|
|
||||||
|
embed_list, log_list = self.Gait_Net.test_2(
|
||||||
|
torch.cat(all_outs, dim=2),
|
||||||
|
seqL,
|
||||||
|
)
|
||||||
|
|
||||||
|
if self.training:
|
||||||
|
retval = {
|
||||||
|
'training_feat': {
|
||||||
|
'triplet': {'embeddings': torch.concat(embed_list, dim=-1), 'labels': labs},
|
||||||
|
'softmax': {'logits': torch.concat(log_list, dim=-1), 'labels': labs},
|
||||||
|
},
|
||||||
|
'visual_summary': {
|
||||||
|
'image/rgb_img': rgb_img.view(n*s, c, h, w)[:5].float(),
|
||||||
|
'image/human_mask': self.min_max_norm(human_mask.view(n*s, -1, self.sils_size*2, self.sils_size)[:5].float()),
|
||||||
|
},
|
||||||
|
'inference_feat': {
|
||||||
|
'embeddings': torch.concat(embed_list, dim=-1),
|
||||||
|
}
|
||||||
|
}
|
||||||
|
else:
|
||||||
|
retval = {
|
||||||
|
'training_feat': {},
|
||||||
|
'visual_summary': {},
|
||||||
|
|
||||||
|
'inference_feat': {
|
||||||
|
'embeddings': torch.concat(embed_list, dim=-1),
|
||||||
|
}
|
||||||
|
}
|
||||||
|
return retval
|
||||||
Reference in New Issue
Block a user