From 67940f6561af8a2685973de07cb831b1ad1afced Mon Sep 17 00:00:00 2001 From: bugjudger Date: Tue, 7 Oct 2025 18:15:04 -0400 Subject: [PATCH] add BiggerGait --- .../biggergait/biggergait__DINOv2_CCPG.yaml | 127 ++++++++++ .../biggergait__DINOv2_Group_CCPG.yaml | 127 ++++++++++ .../models/BigGait_utils/BigGait_GaitBase.py | 90 +++++++ opengait/modeling/models/BiggerGait_DINOv2.py | 233 ++++++++++++++++++ 4 files changed, 577 insertions(+) create mode 100644 configs/biggergait/biggergait__DINOv2_CCPG.yaml create mode 100644 configs/biggergait/biggergait__DINOv2_Group_CCPG.yaml create mode 100644 opengait/modeling/models/BiggerGait_DINOv2.py diff --git a/configs/biggergait/biggergait__DINOv2_CCPG.yaml b/configs/biggergait/biggergait__DINOv2_CCPG.yaml new file mode 100644 index 0000000..93cd52e --- /dev/null +++ b/configs/biggergait/biggergait__DINOv2_CCPG.yaml @@ -0,0 +1,127 @@ +data_cfg: + dataset_name: CCPG + # TODO + dataset_root: your_path # use datasets/pretreatment_rgb.py for data preprocessing! + dataset_partition: ./datasets/CCPG/CCPG.json + data_in_use: [True, False] # images / real_ratios + num_workers: 8 + remove_no_gallery: false # Remove probe if no gallery for it + test_dataset_name: CCPG + +evaluator_cfg: + enable_float16: true + restore_ckpt_strict: False # Some modules were renamed for clarity, so strict loading is false.
+ restore_hint: 30000 # BiggerGait__SmallDINOv2_Gaitbase_84Frame30_448224_6432HPP32_NoAlign_Sep12B_WiMask-30000.pt in HuggingFace + save_name: BiggerGait__Dinov2 + eval_func: evaluate_CCPG + sampler: + batch_shuffle: false + batch_size: 8 # GPUs number + sample_type: all_ordered # all indicates whole sequence used to test, while ordered means input sequence by its natural order; Other options: fixed_unordered + frames_all_limit: 250 # limit the number of sampled frames to prevent out of memory + metric: euc # cos + transform: + - type: BaseRgbTransform + +loss_cfg: + - loss_term_weight: 1.0 + margin: 0.2 + type: TripletLoss + log_prefix: triplet + - loss_term_weight: 1.0 + scale: 16 + type: CrossEntropyLoss + log_prefix: softmax + log_accuracy: true + +model_cfg: + model: BiggerGait__DINOv2 + pretrained_lvm: ./pretrained_LVMs/dinov2-small # DINOv2-S HuggingFace Link: "git clone https://huggingface.co/facebook/dinov2-small" + "git lfs pull" + pretrained_mask_branch: ./pretrained_LVMs/MaskBranch_vits14.pt # Pretrain it with the BigGait code first; it is very lightweight. pretrained_mask_branch: None or MaskBranch Download Link: https://drive.google.com/drive/folders/1zrWPUsrbCpwxoLgfom3d2irgxkBqtXqc?usp=sharing + image_size: 224 # 448x224 + sils_size: 32 # 64x32 + + source_dim: 384 + num_unknown: 16 + total_layer_num: 12 + group_layer_num: 1 + head_num: 12 + + Mask_Branch: + source_dim: 384 + target_dim: 2 + p: 0.
+ softmax: True + + backbone_cfg: + type: ResNet9 + block: BasicBlock + in_channel: 16 + channels: # Layers configuration for automatic model construction + - 64 + - 128 + - 256 + - 512 + layers: + - 1 + - 1 + - 1 + - 1 + strides: + - 1 + - 2 + - 1 + - 1 + maxpool: false + SeparateFCs: + in_channels: 512 + out_channels: 256 + parts_num: 32 + SeparateBNNecks: + class_num: 100 + in_channels: 256 + parts_num: 32 + bin_num: + - 32 + +optimizer_cfg: + lr: 0.1 + momentum: 0.9 + solver: SGD + weight_decay: 0.0005 + +scheduler_cfg: + gamma: 0.1 + milestones: # Learning rate reduction at each milestone + - 15000 + - 25000 + scheduler: MultiStepLR + +trainer_cfg: + find_unused_parameters: True + enable_float16: true # half-precision float for memory reduction and speedup + fix_BN: false + log_iter: 100 + with_test: true + restore_ckpt_strict: true + restore_hint: 0 + save_iter: 10000 + save_name: BiggerGait__Dinov2 + sync_BN: true + total_iter: 30000 + sampler: + batch_shuffle: true + batch_size: + - 8 # TripletSampler, batch_size[0] indicates Number of Identity + - 4 # batch_size[1] indicates Samples sequence for each Identity + frames_num_fixed: 30 # fixed frames number for training + frames_skip_num: 4 + frames_num_max: 40 # max frames number for unfixed training + frames_num_min: 20 # min frames number for unfixed training + sample_type: fixed_unordered # fixed control input frames number, unordered for controlling order of input tensor; Other options: unfixed_ordered or all_ordered + type: TripletSampler + transform: + - type: Compose + trf_cfg: + - type: RandomHorizontalFlip + - type: BaseRgbTransform diff --git a/configs/biggergait/biggergait__DINOv2_Group_CCPG.yaml b/configs/biggergait/biggergait__DINOv2_Group_CCPG.yaml new file mode 100644 index 0000000..1d240e6 --- /dev/null +++ b/configs/biggergait/biggergait__DINOv2_Group_CCPG.yaml @@ -0,0 +1,127 @@ +data_cfg: + dataset_name: CCPG + # TODO + dataset_root: your_path # use datasets/pretreatment_rgb.py for data
preprocessing! + dataset_partition: ./datasets/CCPG/CCPG.json + data_in_use: [True, False] # images / real_ratios + num_workers: 8 + remove_no_gallery: false # Remove probe if no gallery for it + test_dataset_name: CCPG + +evaluator_cfg: + enable_float16: true + restore_ckpt_strict: False # Some modules were renamed for clarity, so strict loading is false. + restore_hint: 30000 # BiggerGait__SmallDINOv2_Gaitbase_84Frame30_448224_6432HPP32_NoAlign_Sep12B_WiMask_2B_6G-30000.pt in HuggingFace + save_name: BiggerGait__Dinov2_Group + eval_func: evaluate_CCPG + sampler: + batch_shuffle: false + batch_size: 8 # GPUs number + sample_type: all_ordered # all indicates whole sequence used to test, while ordered means input sequence by its natural order; Other options: fixed_unordered + frames_all_limit: 250 # limit the number of sampled frames to prevent out of memory + metric: euc # cos + transform: + - type: BaseRgbTransform + +loss_cfg: + - loss_term_weight: 1.0 + margin: 0.2 + type: TripletLoss + log_prefix: triplet + - loss_term_weight: 1.0 + scale: 16 + type: CrossEntropyLoss + log_prefix: softmax + log_accuracy: true + +model_cfg: + model: BiggerGait__DINOv2 + pretrained_lvm: ./pretrained_LVMs/dinov2-small # DINOv2-S HuggingFace Link: "git clone https://huggingface.co/facebook/dinov2-small" + "git lfs pull" + pretrained_mask_branch: ./pretrained_LVMs/MaskBranch_vits14.pt # Pretrain it with the BigGait code first; it is very lightweight. pretrained_mask_branch: None or MaskBranch Download Link: https://drive.google.com/drive/folders/1zrWPUsrbCpwxoLgfom3d2irgxkBqtXqc?usp=sharing + image_size: 224 # 448x224 + sils_size: 32 # 64x32 + + source_dim: 384 + num_unknown: 16 + total_layer_num: 12 + group_layer_num: 2 + head_num: 2 + + Mask_Branch: + source_dim: 384 + target_dim: 2 + p: 0.
+    softmax: True
+
+  backbone_cfg:
+    type: ResNet9
+    block: BasicBlock
+    in_channel: 16
+    channels: # Layers configuration for automatic model construction
+      - 64
+      - 128
+      - 256
+      - 512
+    layers:
+      - 1
+      - 1
+      - 1
+      - 1
+    strides:
+      - 1
+      - 2
+      - 1
+      - 1
+    maxpool: false
+  SeparateFCs:
+    in_channels: 512
+    out_channels: 256
+    parts_num: 32
+  SeparateBNNecks:
+    class_num: 100
+    in_channels: 256
+    parts_num: 32
+  bin_num:
+    - 32
+
+optimizer_cfg:
+  lr: 0.1
+  momentum: 0.9
+  solver: SGD
+  weight_decay: 0.0005
+
+scheduler_cfg:
+  gamma: 0.1
+  milestones: # Learning rate reduction at each milestone
+    - 15000
+    - 25000
+  scheduler: MultiStepLR
+
+trainer_cfg:
+  find_unused_parameters: True
+  enable_float16: true # half-precision float for memory reduction and speedup
+  fix_BN: false
+  log_iter: 100
+  with_test: true
+  restore_ckpt_strict: true
+  restore_hint: 0
+  save_iter: 10000
+  save_name: BiggerGait__Dinov2_Group
+  sync_BN: true
+  total_iter: 30000
+  sampler:
+    batch_shuffle: true
+    batch_size:
+      - 8 # TripletSampler, batch_size[0] indicates Number of Identity
+      - 4 # batch_size[1] indicates Samples sequence for each Identity
+    frames_num_fixed: 30 # fixed frames number for training
+    frames_skip_num: 4
+    frames_num_max: 40 # max frames number for unfixed training
+    frames_num_min: 20 # min frames number for unfixed training
+    sample_type: fixed_unordered # fixed control input frames number, unordered for controlling order of input tensor; Other options: unfixed_ordered or all_ordered
+    type: TripletSampler
+  transform:
+    - type: Compose
+      trf_cfg:
+        - type: RandomHorizontalFlip
+        - type: BaseRgbTransform
diff --git a/opengait/modeling/models/BigGait_utils/BigGait_GaitBase.py b/opengait/modeling/models/BigGait_utils/BigGait_GaitBase.py
index 0d104bc..61f3b8a 100644
--- a/opengait/modeling/models/BigGait_utils/BigGait_GaitBase.py
+++ b/opengait/modeling/models/BigGait_utils/BigGait_GaitBase.py
@@ -188,3 +188,111 @@ class Baseline(nn.Module):
         _, logits = self.BNNecks(embed_1)  # [n, c, p]
         # return embed_1, logits, heat_mapt
         return embed_1, logits
+
+
+class Baseline_Single(nn.Module):
+    """One gait head: frame-level ResNet9 stages, then TP, HPP, FCs and BNNecks.
+
+    ``test_1`` runs the frame-level stages and ``test_2`` the sequence-level
+    head; ``forward`` chains the two and ``pre_forward`` is an alias of ``test_1``.
+    """
+
+    def __init__(self, model_cfg):
+        super(Baseline_Single, self).__init__()
+        self.pre_rgb = SetBlockWrapper(Pre_ResNet9(**model_cfg['backbone_cfg']))
+        self.post_backbone = SetBlockWrapper(Post_ResNet9(**model_cfg['backbone_cfg']))
+        self.FCs = SeparateFCs(**model_cfg['SeparateFCs'])
+        self.BNNecks = SeparateBNNecks(**model_cfg['SeparateBNNecks'])
+        self.TP = PackSequenceWrapper(torch.max)
+        self.HPP = HorizontalPoolingPyramid(bin_num=model_cfg['bin_num'])
+
+    def get_backbone(self, backbone_cfg):
+        """Get the backbone of the model."""
+        if is_dict(backbone_cfg):
+            Backbone = get_attr_from([backbones], backbone_cfg['type'])
+            valid_args = get_valid_args(Backbone, backbone_cfg, ['type'])
+            return Backbone(**valid_args)
+        if is_list(backbone_cfg):
+            Backbone = nn.ModuleList([self.get_backbone(cfg)
+                                      for cfg in backbone_cfg])
+            return Backbone
+        raise ValueError(
+            "Error type for -Backbone-Cfg-, supported: (A list of) dict.")
+
+    def pre_forward(self, appearance, *args, **kwargs):
+        """Alias of ``test_1`` (same computation)."""
+        return self.test_1(appearance, *args, **kwargs)
+
+    def forward(self, appearance, seqL, *args, **kwargs):
+        """Run ``test_1`` then ``test_2``; return ``(embed_1, logits)``."""
+        outs = self.test_1(appearance, *args, **kwargs)
+        return self.test_2(outs, seqL)
+
+    def test_1(self, appearance, *args, **kwargs):
+        """Frame-level stage: ``pre_rgb`` followed by ``post_backbone``."""
+        outs = self.pre_rgb(appearance, *args, **kwargs)  # [n, c, s, h, w]
+        outs = self.post_backbone(outs, *args, **kwargs)
+        return outs
+
+    def test_2(self, outs, seqL):
+        """Sequence-level stage (TP -> HPP -> FCs -> BNNecks); return ``(embed_1, logits)``."""
+        # Temporal Pooling, TP
+        outs = self.TP(outs, seqL, options={"dim": 2})[0]  # [n, c, h, w]
+        # Horizontal Pooling Matching, HPM
+        outs = self.HPP(outs)  # [n, c, p]
+        embed_1 = self.FCs(outs)  # [n, c, p]
+        _, logits = self.BNNecks(embed_1)  # [n, c, p]
+        return embed_1, logits
+
+
+class Baseline_Share(nn.Module):
+    """``num_FPN`` channel groups served by ``head_num`` shared ``Baseline_Single`` heads.
+
+    Consecutive groups share a head: group ``i`` is handled by head
+    ``i // (num_FPN // head_num)``.
+    """
+
+    def __init__(self, model_cfg):
+        super(Baseline_Share, self).__init__()
+        self.head_num = model_cfg['head_num']
+        self.num_FPN = model_cfg['total_layer_num'] // model_cfg['group_layer_num']
+        # Fail early with a clear message; otherwise the head lookup below dies
+        # with a ZeroDivisionError or IndexError.
+        if self.head_num <= 0 or self.num_FPN % self.head_num != 0:
+            raise ValueError(
+                "head_num ({}) must be a positive divisor of the number of groups ({})."
+                .format(self.head_num, self.num_FPN))
+        groups_per_head = self.num_FPN // self.head_num
+        self.real_gait = nn.ModuleList([
+            Baseline_Single(model_cfg) for _ in range(self.head_num)
+        ])
+        # Gait_List[i] is the (shared) head that handles the i-th group.
+        self.Gait_List = nn.ModuleList([
+            self.real_gait[i // groups_per_head] for i in range(self.num_FPN)
+        ])
+
+    def forward(self, x, seqL):
+        """Run ``test_1`` then ``test_2``; return ``(embed_list, log_list)``."""
+        x = self.test_1(x)
+        return self.test_2(x, seqL)
+
+    def test_1(self, x, *args, **kwargs):
+        """Chunk ``x`` along dim 1 into ``num_FPN`` groups, run each head's ``test_1`` and re-concatenate."""
+        # x: [n, c, s, h, w]
+        x_list = list(torch.chunk(x, self.num_FPN, dim=1))
+        for i in range(self.num_FPN):
+            x_list[i] = self.Gait_List[i].test_1(x_list[i], *args, **kwargs)
+        return torch.cat(x_list, dim=1)
+
+    def test_2(self, x, seqL):
+        """Chunk ``x`` along dim 1 and run each head's ``test_2``; return ``(embed_list, log_list)``."""
+        # x: [n, c, s, h, w]
+        # embed_1: [n, c, p]
+        x_list = torch.chunk(x, self.num_FPN, dim=1)
+        embed_list = []
+        log_list = []
+        for i in range(self.num_FPN):
+            embed_1, logits = self.Gait_List[i].test_2(x_list[i], seqL)
+            embed_list.append(embed_1)
+            log_list.append(logits)
+        return embed_list, log_list
diff --git a/opengait/modeling/models/BiggerGait_DINOv2.py b/opengait/modeling/models/BiggerGait_DINOv2.py
new file mode 100644
index 0000000..2b9d474
--- /dev/null
+++ b/opengait/modeling/models/BiggerGait_DINOv2.py
@@ -0,0 +1,292 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+# All rights reserved.
+#
+# This source code is licensed under the license found in the
+# LICENSE file in the root directory of this source tree.
+
+# References:
+#   https://github.com/facebookresearch/dino/blob/main/vision_transformer.py
+#   https://github.com/rwightman/pytorch-image-models/tree/master/timm/models/vision_transformer.py
+
+
+import torch
+import torch.nn as nn
+import torch.utils.checkpoint
+from einops import rearrange
+from ..base_model import BaseModel
+from torch.nn import functional as F
+from kornia import morphology as morph
+import random
+
+from .BigGait_utils.BigGait_GaitBase import *
+from .BigGait_utils.save_img import save_image, pca_image
+from functools import partial
+
+# ######################################## BiggerGait ###########################################
+
+class infoDistillation(nn.Module):
+    """Linear bottleneck from ``source_dim`` features to ``target_dim`` scores.
+
+    Args:
+        source_dim: channel number of the input features.
+        target_dim: channel number of the bottleneck output.
+        p: dropout probability applied to the input.
+        softmax: if True the output is a softmax over ``dim=1``; otherwise it
+            is ``sigmoid(bn_t(.))`` of the bottleneck features.
+    """
+
+    def __init__(self, source_dim, target_dim, p, softmax):
+        super(infoDistillation, self).__init__()
+        self.dropout = nn.Dropout(p=p)
+        self.bn_s = nn.BatchNorm1d(source_dim, affine=False)
+        self.bn_t = nn.BatchNorm1d(target_dim, affine=False)
+        self.down_sampling = nn.Linear(source_dim, target_dim)
+        self.up_sampling = nn.Linear(target_dim, source_dim)
+        self.softmax = softmax
+        self.mse = nn.MSELoss()
+
+    def forward(self, x, mse=True):
+        """Return ``(d_x, loss)`` for ``x`` of shape [n, c].
+
+        ``loss`` is the MSE between ``x`` and its reconstruction by
+        ``up_sampling``; it is None when ``mse`` is False.
+        """
+        # [n, c]
+        d_x = self.down_sampling(self.bn_s(self.dropout(x)))
+        # Honor the ``softmax`` flag (it used to be stored but never read).
+        if self.softmax:
+            d_x = F.softmax(d_x, dim=1)
+        # nn.MSELoss() already reduces to a scalar mean.
+        loss = self.mse(self.up_sampling(d_x), x) if mse else None
+        if not self.softmax:
+            # NOTE(review): sigmoid(bn_t(.)) is the only use of ``bn_t``; presumably
+            # this is the intended non-softmax output -- confirm against BigGait.
+            d_x = torch.sigmoid(self.bn_t(d_x))
+        return d_x, loss
+
+class ResizeToHW(torch.nn.Module):
+    """Bilinearly resize the input to the fixed spatial size ``target_size``."""
+
+    def __init__(self, target_size):
+        super().__init__()
+        self.target_size = target_size
+
+    def forward(self, x):
+        return F.interpolate(x, size=self.target_size, mode='bilinear', align_corners=False)
+
+class BiggerGait__DINOv2(BaseModel):
+    """BiggerGait with a frozen DINOv2 backbone and a frozen mask branch.
+
+    The hidden states of the backbone layers are grouped (``group_layer_num``
+    layers per group), projected to ``num_unknown`` channels by
+    ``HumanSpace_Conv``, multiplied by the mask from ``Mask_Branch`` and fed to
+    the gait heads in ``Gait_Net``.
+    """
+
+    def build_network(self, model_cfg):
+        """Read ``model_cfg`` and build ``Gait_Net``, ``HumanSpace_Conv`` and ``Mask_Branch``."""
+        # get pretrained models
+        self.pretrained_lvm = model_cfg["pretrained_lvm"]
+        self.pretrained_mask_branch = model_cfg["pretrained_mask_branch"]
+
+        # set input size
+        self.image_size = model_cfg["image_size"]
+        self.sils_size = model_cfg["sils_size"]
+
+        # set feature dim
+        self.f4_dim = model_cfg['source_dim']
+        self.num_unknown = model_cfg["num_unknown"]
+
+        # set layer / group / gait_head number
+        self.total_layer_num = model_cfg["total_layer_num"] # number of backbone layers used
+        self.group_layer_num = model_cfg["group_layer_num"] # number of layers per group
+        self.head_num = model_cfg["head_num"] # number of gait heads
+        # Explicit checks instead of asserts, which are stripped under ``python -O``.
+        if self.total_layer_num % self.group_layer_num != 0:
+            raise ValueError("total_layer_num must be divisible by group_layer_num.")
+        self.num_FPN = self.total_layer_num // self.group_layer_num
+        if self.num_FPN % self.head_num != 0:
+            raise ValueError("total_layer_num // group_layer_num must be divisible by head_num.")
+
+        self.Gait_Net = Baseline_Share(model_cfg)
+
+        self.HumanSpace_Conv = nn.ModuleList([
+            nn.Sequential(
+                nn.BatchNorm2d(self.f4_dim*self.group_layer_num, affine=False),
+                nn.Conv2d(self.f4_dim*self.group_layer_num, self.f4_dim//2, kernel_size=1),
+                nn.BatchNorm2d(self.f4_dim//2, affine=False),
+                nn.GELU(),
+                nn.Conv2d(self.f4_dim//2, self.num_unknown, kernel_size=1),
+                ResizeToHW((self.sils_size*2, self.sils_size)),
+                nn.BatchNorm2d(self.num_unknown, affine=False),
+                nn.Sigmoid()
+            ) for _ in range(self.num_FPN)
+        ])
+        self.Mask_Branch = infoDistillation(**model_cfg["Mask_Branch"])
+
+    def init_DINOv2(self):
+        """Load the DINOv2 backbone from ``self.pretrained_lvm`` into ``self.Backbone``."""
+        from transformers import Dinov2Config, Dinov2Model
+        config = Dinov2Config.from_pretrained(self.pretrained_lvm + "/config.json")
+        self.Backbone = Dinov2Model.from_pretrained(
+            self.pretrained_lvm,
+            config=config,
+        )
+        self.Backbone.cpu()
+        self.msg_mgr.log_info(f'load model from: {self.pretrained_lvm}')
+
+    def init_Mask_Branch(self):
+        """Load the pretrained ``Mask_Branch`` weights, if a checkpoint is configured."""
+        # The configs document "None" as a valid value; unquoted YAML ``None`` is the
+        # string "None", so accept both spellings instead of crashing in torch.load.
+        if self.pretrained_mask_branch in (None, "None"):
+            self.msg_mgr.log_info('no pretrained mask branch given, Mask_Branch keeps its initial weights')
+            return
+        self.msg_mgr.log_info(f'load model from: {self.pretrained_mask_branch}')
+        # NOTE: torch.load unpickles the file -- only load checkpoints from trusted sources.
+        load_dict = torch.load(self.pretrained_mask_branch, map_location=torch.device("cpu"))['model']
+        msg = self.Mask_Branch.load_state_dict(load_dict, strict=True)
+        n_parameters = sum(p.numel() for p in self.Mask_Branch.parameters())
+        self.msg_mgr.log_info('Missing keys: {}'.format(msg.missing_keys))
+        self.msg_mgr.log_info('Unexpected keys: {}'.format(msg.unexpected_keys))
+        self.msg_mgr.log_info(f"=> loaded successfully '{self.pretrained_mask_branch}'")
+        self.msg_mgr.log_info('SegmentationBranch Count: {:.5f}M'.format(n_parameters / 1e6))
+
+    def init_parameters(self):
+        """Initialize trainable weights, load the pretrained parts and freeze them."""
+        for m in self.modules():
+            if isinstance(m, (nn.Conv3d, nn.Conv2d, nn.Conv1d, nn.Linear)):
+                nn.init.xavier_uniform_(m.weight.data)
+                if m.bias is not None:
+                    nn.init.constant_(m.bias.data, 0.0)
+            elif isinstance(m, (nn.BatchNorm3d, nn.BatchNorm2d, nn.BatchNorm1d)):
+                if m.affine:
+                    nn.init.normal_(m.weight.data, 1.0, 0.02)
+                    nn.init.constant_(m.bias.data, 0.0)
+
+        n_parameters = sum(p.numel() for p in self.parameters())
+        self.msg_mgr.log_info('Expect Backbone Count: {:.5f}M'.format(n_parameters / 1e6))
+
+        self.init_DINOv2()
+        self.init_Mask_Branch()
+
+        # Cal GFlops (logging only, so a missing fvcore must not abort training)
+        flops = None
+        if self.training:
+            try:
+                from fvcore.nn import FlopCountAnalysis
+            except ImportError:
+                self.msg_mgr.log_info('fvcore is not installed, skip GFLOPs counting')
+            else:
+                self.eval()
+                with torch.no_grad():
+                    device = torch.distributed.get_rank()
+                    inputs = ([[torch.randn((1,1,3,448,224),dtype=torch.float32).to(device), torch.rand(1,dtype=torch.float32).to(device)], None, None, None, None],)
+                    flops = FlopCountAnalysis(self.to(device), inputs).total() / 1e9 # GFLOPs
+                self.train()
+
+        # NOTE(review): a later ``model.train()`` call puts these modules back into
+        # train mode (BatchNorm in Mask_Branch then uses batch statistics) -- confirm intended.
+        self.Backbone.eval()
+        self.Backbone.requires_grad_(False)
+        self.Mask_Branch.eval()
+        self.Mask_Branch.requires_grad_(False)
+
+        n_parameters = sum(p.numel() for p in self.parameters())
+        if flops is not None:
+            self.msg_mgr.log_info('All Backbone Count: {:.5f}M, {:.2f} GFLOPs'.format(n_parameters / 1e6, flops))
+        else:
+            self.msg_mgr.log_info('All Backbone Count: {:.5f}M'.format(n_parameters / 1e6))
+
+        self.msg_mgr.log_info("=> init successfully")
+
+    # resize image
+    def preprocess(self, sils, image_size, mode='bilinear'):
+        """Resize ``sils`` ([n*s, c, h, w]) to ``(image_size*2, image_size)``."""
+        # shape: [nxs,c,h,w] / [nxs,c,224,112]
+        # F.interpolate only accepts ``align_corners`` for the interpolating modes.
+        align_corners = False if mode in ('linear', 'bilinear', 'bicubic', 'trilinear') else None
+        return F.interpolate(sils, (image_size*2, image_size), mode=mode, align_corners=align_corners)
+
+    def min_max_norm(self, x, eps=1e-12):
+        """Rescale ``x`` to [0, 1]; a constant ``x`` gives zeros instead of NaN."""
+        x_min = x.min()
+        # Clamp the range so an all-equal input (e.g. an empty mask) does not divide by zero.
+        return (x - x_min) / (x.max() - x_min).clamp_min(eps)
+
+    # ############################# For Train ##############################
+
+    def forward(self, inputs):
+        """Compute gait embeddings (and, in training, logits and visual summaries).
+
+        ``inputs`` is ``(ipts, labs, ty, vi, seqL)``; ``ipts[0]`` holds the RGB
+        frames as [n, s, c, h, w]. Returns the dict with the keys
+        ``training_feat``, ``visual_summary`` and ``inference_feat``.
+        """
+        ipts, labs, ty, vi, seqL = inputs
+        rgb = ipts[0]
+        del ipts
+
+        # adjust gpu: split the sequence along time so each backbone pass stays small
+        rgb_chunks = torch.chunk(rgb, (rgb.size(1)//96)+1, dim=1)
+        all_outs = []
+        for rgb_img in rgb_chunks:
+            with torch.no_grad():
+                # get RGB
+                n,s,c,h,w = rgb_img.size()
+                rgb_img = rearrange(rgb_img, 'n s c h w -> (n s) c h w').contiguous()
+                outs = self.preprocess(rgb_img, self.image_size)
+                outs = self.Backbone(outs,output_hidden_states=True).hidden_states[1:] # [ns,h*w,c]
+
+                # Parameter-free layer norm via F.layer_norm (no module rebuilt per chunk); [:,1:] drops the first token.
+                intermediates = F.layer_norm(torch.concat(outs, dim=-1), (self.f4_dim*len(outs),), eps=1e-6)[:,1:]
+                intermediates = rearrange(intermediates.view(n, s, self.image_size//7, self.image_size//14, -1), 'n s h w c -> (n s) c h w').contiguous()
+                intermediates = list(torch.chunk(intermediates, self.total_layer_num, dim=1))
+
+                human_mask = F.layer_norm(outs[-1], (self.f4_dim,), eps=1e-6)[:,1:].contiguous()
+                human_mask, _ = self.Mask_Branch(human_mask.view(-1, self.f4_dim), mse=False)
+                human_mask = (human_mask[:,1] > 0.5).float() # channel 1 is used as foreground -- check first which channel (0 or 1) is foreground for your mask branch
+                human_mask = human_mask.view(n*s, 1, self.image_size//7, self.image_size//14)
+                human_mask = self.preprocess(human_mask, self.sils_size).detach().clone()
+
+            intermediates = [torch.cat(intermediates[i:i+self.group_layer_num], dim=1).contiguous() for i in range(0, self.total_layer_num, self.group_layer_num)]
+            for i in range(self.num_FPN):
+                intermediates[i] = self.HumanSpace_Conv[i](intermediates[i])
+            intermediates = torch.concat(intermediates, dim=1)
+            intermediates = intermediates * (human_mask > 0.5).to(intermediates)
+            intermediates = rearrange(intermediates.view(n, s, -1, self.sils_size*2, self.sils_size), 'n s c h w -> n c s h w').contiguous()
+
+            outs = self.Gait_Net.test_1(intermediates)
+            all_outs.append(outs)
+
+        embed_list, log_list = self.Gait_Net.test_2(
+            torch.cat(all_outs, dim=2),
+            seqL,
+        )
+        embeddings = torch.concat(embed_list, dim=-1)
+
+        if self.training:
+            retval = {
+                'training_feat': {
+                    'triplet': {'embeddings': embeddings, 'labels': labs},
+                    'softmax': {'logits': torch.concat(log_list, dim=-1), 'labels': labs},
+                },
+                'visual_summary': {
+                    # only the last temporal chunk is visualized
+                    'image/rgb_img': rgb_img.view(n*s, c, h, w)[:5].float(),
+                    'image/human_mask': self.min_max_norm(human_mask.view(n*s, -1, self.sils_size*2, self.sils_size)[:5].float()),
+                },
+                'inference_feat': {
+                    'embeddings': embeddings,
+                }
+            }
+        else:
+            retval = {
+                'training_feat': {},
+                'visual_summary': {},
+
+                'inference_feat': {
+                    'embeddings': embeddings,
+                }
+            }
+        return retval