docs: add uv workflow and ScoNet eval reproduction notes

2026-02-26 14:19:00 +08:00
parent 0fdd35bd78
commit 5c06a80d93
8 changed files with 2693 additions and 0 deletions
@@ -145,3 +145,5 @@ dmypy.json
 # Cython debug symbols
 cython_debug/
 ckpt/
@@ -44,6 +44,7 @@ OpenGait/
 ## CONVENTIONS
 - Launch pattern is DDP-first (`python -m torch.distributed.launch ... opengait/main.py --cfgs ... --phase ...`).
 - DDP Constraints: `world_size` must equal number of visible GPUs; test `evaluator_cfg.sampler.batch_size` must equal `world_size`.
 - Model/loss/backbone discoverability is filesystem-driven via package-level dynamic imports.
 - Experiment config semantics: custom YAML overlays `configs/default.yaml` (local key precedence).
 - Outputs are keyed by config identity: `output/${dataset_name}/${model}/${save_name}`.
@@ -61,6 +62,18 @@ OpenGait/
 ## COMMANDS
 ```bash
 # install (uv)
 uv sync --extra torch
 # train (uv)
 CUDA_VISIBLE_DEVICES=0,1 uv run python -m torch.distributed.launch --nproc_per_node=2 opengait/main.py --cfgs ./configs/baseline/baseline.yaml --phase train
 # test (uv)
 CUDA_VISIBLE_DEVICES=0,1 uv run python -m torch.distributed.launch --nproc_per_node=2 opengait/main.py --cfgs ./configs/baseline/baseline.yaml --phase test
 # ScoNet 1-GPU eval
 CUDA_VISIBLE_DEVICES=0 uv run python -m torch.distributed.launch --nproc_per_node=1 opengait/main.py --cfgs ./configs/sconet/sconet_scoliosis1k_local_eval_1gpu.yaml --phase test
 # train
 CUDA_VISIBLE_DEVICES=0,1 python -m torch.distributed.launch --nproc_per_node=2 opengait/main.py --cfgs ./configs/baseline/baseline.yaml --phase train
@@ -76,6 +76,20 @@ See [here](https://github.com/jdyjjj/All-in-One-Gait) for details.
 - **Nice log**: We use [`tensorboard`](https://pytorch.org/docs/stable/tensorboard.html) and `logging` to log everything, which looks pretty.
 ## Getting Started
 ### Quick Start (uv)
 ```bash
 # Install dependencies
 uv sync --extra torch
 # Train
 CUDA_VISIBLE_DEVICES=0,1 uv run python -m torch.distributed.launch --nproc_per_node=2 opengait/main.py --cfgs ./configs/baseline/baseline.yaml --phase train
 # Test
 CUDA_VISIBLE_DEVICES=0,1 uv run python -m torch.distributed.launch --nproc_per_node=2 opengait/main.py --cfgs ./configs/baseline/baseline.yaml --phase test
 ```
 > **Note:** The `--nproc_per_node` argument must exactly match the number of GPUs specified in `CUDA_VISIBLE_DEVICES`. For single-GPU evaluation, use `CUDA_VISIBLE_DEVICES=0` and `--nproc_per_node=1` with the DDP launcher.
 Please see [0.get_started.md](docs/0.get_started.md). We also provide the following tutorials for your reference:
@@ -0,0 +1,101 @@
 data_cfg:
  dataset_name: Scoliosis1K
  dataset_root: ./datasets/Scoliosis1K/Scoliosis1K-sil-pkl
  dataset_partition: ./datasets/Scoliosis1K/Scoliosis1K_1116.json
  num_workers: 1
  remove_no_gallery: false # Remove probe if no gallery for it
  test_dataset_name: Scoliosis1K
 evaluator_cfg:
  enable_float16: true
  restore_ckpt_strict: true
  restore_hint: ./ckpt/ScoNet-20000.pt
  save_name: ScoNet
  eval_func: evaluate_scoliosis
  sampler:
    batch_shuffle: false
    batch_size: 8
    sample_type: all_ordered # "all" means the whole sequence is used for testing; "ordered" means frames are fed in their natural order. Other option: fixed_unordered
    frames_all_limit: 720 # limit the number of sampled frames to prevent out of memory
  metric: euc # cos
  transform:
    - type: BaseSilCuttingTransform
 loss_cfg:
  - loss_term_weight: 1.0
    margin: 0.2
    type: TripletLoss
    log_prefix: triplet
  - loss_term_weight: 1.0
    scale: 16
    type: CrossEntropyLoss
    log_prefix: softmax
    log_accuracy: true
 model_cfg:
  model: ScoNet
  backbone_cfg:
    type: ResNet9
    block: BasicBlock
    channels: # Layer configuration for automatic model construction
      - 64
      - 128
      - 256
      - 512
    layers:
      - 1
      - 1
      - 1
      - 1
    strides:
      - 1
      - 2
      - 2
      - 1
    maxpool: false
  SeparateFCs:
    in_channels: 512
    out_channels: 256
    parts_num: 16
  SeparateBNNecks:
    class_num: 3
    in_channels: 256
    parts_num: 16
  bin_num:
    - 16
 optimizer_cfg:
  lr: 0.1
  momentum: 0.9
  solver: SGD
  weight_decay: 0.0005
 scheduler_cfg:
  gamma: 0.1
  milestones: # Learning rate is reduced at each milestone
    - 10000
    - 14000
    - 18000
  scheduler: MultiStepLR
 trainer_cfg:
  enable_float16: true # half-precision floats for memory reduction and speedup
  fix_BN: false
  with_test: false
  log_iter: 100
  restore_ckpt_strict: true
  restore_hint: 0
  save_iter: 20000
  save_name: ScoNet
  sync_BN: true
  total_iter: 20000
  sampler:
    batch_shuffle: true
    batch_size:
      - 8 # TripletSampler: batch_size[0] is the number of identities
      - 8 #                 batch_size[1] is the number of sample sequences per identity
    frames_num_fixed: 30 # fixed frames number for training
    sample_type: fixed_unordered # "fixed" fixes the number of input frames; "unordered" controls the frame order of the input tensor. Other options: unfixed_ordered or all_ordered
    type: TripletSampler
  transform:
    - type: BaseSilCuttingTransform
@@ -0,0 +1,101 @@
 data_cfg:
  dataset_name: Scoliosis1K
  dataset_root: ./datasets/Scoliosis1K/Scoliosis1K-sil-pkl
  dataset_partition: ./datasets/Scoliosis1K/Scoliosis1K_1116.json
  num_workers: 1
  remove_no_gallery: false # Remove probe if no gallery for it
  test_dataset_name: Scoliosis1K
 evaluator_cfg:
  enable_float16: true
  restore_ckpt_strict: true
  restore_hint: ./ckpt/ScoNet-20000.pt
  save_name: ScoNet
  eval_func: evaluate_scoliosis
  sampler:
    batch_shuffle: false
    batch_size: 1
    sample_type: all_ordered # "all" means the whole sequence is used for testing; "ordered" means frames are fed in their natural order. Other option: fixed_unordered
    frames_all_limit: 720 # limit the number of sampled frames to prevent out of memory
  metric: euc # cos
  transform:
    - type: BaseSilCuttingTransform
 loss_cfg:
  - loss_term_weight: 1.0
    margin: 0.2
    type: TripletLoss
    log_prefix: triplet
  - loss_term_weight: 1.0
    scale: 16
    type: CrossEntropyLoss
    log_prefix: softmax
    log_accuracy: true
 model_cfg:
  model: ScoNet
  backbone_cfg:
    type: ResNet9
    block: BasicBlock
    channels: # Layer configuration for automatic model construction
      - 64
      - 128
      - 256
      - 512
    layers:
      - 1
      - 1
      - 1
      - 1
    strides:
      - 1
      - 2
      - 2
      - 1
    maxpool: false
  SeparateFCs:
    in_channels: 512
    out_channels: 256
    parts_num: 16
  SeparateBNNecks:
    class_num: 3
    in_channels: 256
    parts_num: 16
  bin_num:
    - 16
 optimizer_cfg:
  lr: 0.1
  momentum: 0.9
  solver: SGD
  weight_decay: 0.0005
 scheduler_cfg:
  gamma: 0.1
  milestones: # Learning rate is reduced at each milestone
    - 10000
    - 14000
    - 18000
  scheduler: MultiStepLR
 trainer_cfg:
  enable_float16: true # half-precision floats for memory reduction and speedup
  fix_BN: false
  with_test: false
  log_iter: 100
  restore_ckpt_strict: true
  restore_hint: 0
  save_iter: 20000
  save_name: ScoNet
  sync_BN: true
  total_iter: 20000
  sampler:
    batch_shuffle: true
    batch_size:
      - 8 # TripletSampler: batch_size[0] is the number of identities
      - 8 #                 batch_size[1] is the number of sample sequences per identity
    frames_num_fixed: 30 # fixed frames number for training
    sample_type: fixed_unordered # "fixed" fixes the number of input frames; "unordered" controls the frame order of the input tensor. Other options: unfixed_ordered or all_ordered
    type: TripletSampler
  transform:
    - type: BaseSilCuttingTransform
@@ -0,0 +1,74 @@
 # ScoNet Checkpoint Evaluation Reproduction Notes
 This document records the findings and successful procedure for reproducing ScoNet checkpoint evaluation using `uv` and the OpenGait framework.
 ## Observed Failure Sequence and Root Causes
 ### 1. Missing Dependencies (Eager Auto-Import)
 OpenGait uses a dynamic registration pattern in `opengait/modeling/models/__init__.py`. When `main.py` imports `models`, it attempts to iterate through all modules in the `models/` directory. If any model file (e.g., `BiggerGait_DINOv2.py`) has dependencies not installed in the current environment (like `timm`), the entire program fails even if you are not using that specific model.
 **Root Cause:** `iter_modules` in `opengait/modeling/models/__init__.py` triggers imports of all sibling files.
 ### 2. GPU/World Size Mismatch
 The runtime enforces a strict equality between the number of visible GPUs and the DDP world size in `opengait/main.py`:
 ```python
 # opengait/main.py
 if torch.distributed.get_world_size() != torch.cuda.device_count():
    raise ValueError("Expect number of available GPUs({}) equals to the world size({}).".format(
        torch.cuda.device_count(), torch.distributed.get_world_size()))
 ```
 **Error Message:** `ValueError: Expect number of available GPUs(2) equals to the world size(1)`
 ### 3. Evaluator Sampler Batch Size Rule
 The evaluator enforces that the total batch size must equal the number of GPUs in testing mode, as checked in `opengait/modeling/base_model.py`.
 **Error Message:** `ValueError: The batch size (8) must be equal to the number of GPUs (1) in testing mode!`
 ## Successful Reproduction Environment
 - **Runtime:** `uv` with PEP 621 (`pyproject.toml`)
 - **Hardware:** 1 Visible GPU
 - **Dataset Path:** Symlinked at `datasets/Scoliosis1K` (user-created link pointing to the actual data root).
 ## Successful Command and Config
 ### Command
 ```bash
 CUDA_VISIBLE_DEVICES=0 uv run python -m torch.distributed.launch \
    --nproc_per_node=1 \
    opengait/main.py \
    --cfgs ./configs/sconet/sconet_scoliosis1k_local_eval_1gpu.yaml \
    --phase test
 ```
 ### Config Highlights (`configs/sconet/sconet_scoliosis1k_local_eval_1gpu.yaml`)
 ```yaml
 data_cfg:
  dataset_root: ./datasets/Scoliosis1K/Scoliosis1K-sil-pkl
  dataset_partition: ./datasets/Scoliosis1K/Scoliosis1K_1116.json
 evaluator_cfg:
  restore_hint: ./ckpt/ScoNet-20000.pt
  sampler:
    batch_size: 1  # Must be an integer equal to the number of GPUs (1 here)
    sample_type: all_ordered
 ```
 ## Final Metrics
 The successful evaluation of the `ScoNet-20000.pt` checkpoint yielded:
 | Metric | Value |
 | :--- | :--- |
 | **Accuracy** | 80.88% |
 | **Macro Precision** | 81.50% |
 | **Macro Recall** | 78.82% |
 | **Macro F1** | 75.14% |
 ## Troubleshooting Checklist
 1. **Environment:** Ensure all dependencies for *all* registered models are installed (e.g., `timm` for `BiggerGait_DINOv2.py`) to avoid eager import failures in `opengait/modeling/models/__init__.py`.
 2. **GPU Visibility:** Match `CUDA_VISIBLE_DEVICES` count exactly with `--nproc_per_node` (checked in `opengait/main.py`).
 3. **Config Check:** Verify `evaluator_cfg.sampler.batch_size` equals the number of GPUs (checked in `opengait/modeling/base_model.py`).
 4. **Data Paths:** Ensure `dataset_root` and `dataset_partition` in the YAML point to valid paths (use symlinks under `datasets/` for convenience).
@@ -0,0 +1,31 @@
 [build-system]
 requires = ["setuptools>=61.0"]
 build-backend = "setuptools.build_meta"
 [project]
 name = "opengait"
 version = "0.0.0"
 requires-python = ">=3.10"
 dependencies = [
    "pyyaml",
    "tensorboard",
    "opencv-python",
    "tqdm",
    "py7zr",
    "kornia",
    "einops",
    "numpy",
    "imageio",
    "Pillow",
    "scikit-learn",
    "matplotlib",
 ]
 [project.optional-dependencies]
 torch = [
    "torch>=1.10",
    "torchvision",
 ]
 [tool.setuptools]
 packages = ["opengait"]