docs: add uv workflow and ScoNet eval reproduction notes
This commit is contained in:
@@ -145,3 +145,5 @@ dmypy.json
|
||||
|
||||
# Cython debug symbols
|
||||
cython_debug/
|
||||
|
||||
ckpt/
|
||||
|
||||
@@ -44,6 +44,7 @@ OpenGait/
|
||||
|
||||
## CONVENTIONS
|
||||
- Launch pattern is DDP-first (`python -m torch.distributed.launch ... opengait/main.py --cfgs ... --phase ...`).
|
||||
- DDP Constraints: `world_size` must equal number of visible GPUs; test `evaluator_cfg.sampler.batch_size` must equal `world_size`.
|
||||
- Model/loss/backbone discoverability is filesystem-driven via package-level dynamic imports.
|
||||
- Experiment config semantics: custom YAML overlays `configs/default.yaml` (local key precedence).
|
||||
- Outputs are keyed by config identity: `output/${dataset_name}/${model}/${save_name}`.
|
||||
@@ -61,6 +62,18 @@ OpenGait/
|
||||
|
||||
## COMMANDS
|
||||
```bash
|
||||
# install (uv)
|
||||
uv sync --extra torch
|
||||
|
||||
# train (uv)
|
||||
CUDA_VISIBLE_DEVICES=0,1 uv run python -m torch.distributed.launch --nproc_per_node=2 opengait/main.py --cfgs ./configs/baseline/baseline.yaml --phase train
|
||||
|
||||
# test (uv)
|
||||
CUDA_VISIBLE_DEVICES=0,1 uv run python -m torch.distributed.launch --nproc_per_node=2 opengait/main.py --cfgs ./configs/baseline/baseline.yaml --phase test
|
||||
|
||||
# ScoNet 1-GPU eval
|
||||
CUDA_VISIBLE_DEVICES=0 uv run python -m torch.distributed.launch --nproc_per_node=1 opengait/main.py --cfgs ./configs/sconet/sconet_scoliosis1k_local_eval_1gpu.yaml --phase test
|
||||
|
||||
# train
|
||||
CUDA_VISIBLE_DEVICES=0,1 python -m torch.distributed.launch --nproc_per_node=2 opengait/main.py --cfgs ./configs/baseline/baseline.yaml --phase train
|
||||
|
||||
|
||||
@@ -76,6 +76,20 @@ See [here](https://github.com/jdyjjj/All-in-One-Gait) for details.
|
||||
- **Nice log**: We use [`tensorboard`](https://pytorch.org/docs/stable/tensorboard.html) and `logging` to log everything, which looks pretty.
|
||||
|
||||
## Getting Started
|
||||
### Quick Start (uv)
|
||||
```bash
|
||||
# Install dependencies
|
||||
uv sync --extra torch
|
||||
|
||||
# Train
|
||||
CUDA_VISIBLE_DEVICES=0,1 uv run python -m torch.distributed.launch --nproc_per_node=2 opengait/main.py --cfgs ./configs/baseline/baseline.yaml --phase train
|
||||
|
||||
# Test
|
||||
CUDA_VISIBLE_DEVICES=0,1 uv run python -m torch.distributed.launch --nproc_per_node=2 opengait/main.py --cfgs ./configs/baseline/baseline.yaml --phase test
|
||||
```
|
||||
|
||||
> **Note:** The `--nproc_per_node` argument must exactly match the number of GPUs specified in `CUDA_VISIBLE_DEVICES`. For single-GPU evaluation, use `CUDA_VISIBLE_DEVICES=0` and `--nproc_per_node=1` with the DDP launcher.
|
||||
|
||||
|
||||
|
||||
Please see [0.get_started.md](docs/0.get_started.md). We also provide the following tutorials for your reference:
|
||||
|
||||
@@ -0,0 +1,101 @@
|
||||
data_cfg:
|
||||
dataset_name: Scoliosis1K
|
||||
dataset_root: ./datasets/Scoliosis1K/Scoliosis1K-sil-pkl
|
||||
dataset_partition: ./datasets/Scoliosis1K/Scoliosis1K_1116.json
|
||||
num_workers: 1
|
||||
remove_no_gallery: false # Remove probe if no gallery for it
|
||||
test_dataset_name: Scoliosis1K
|
||||
|
||||
evaluator_cfg:
|
||||
enable_float16: true
|
||||
restore_ckpt_strict: true
|
||||
restore_hint: ./ckpt/ScoNet-20000.pt
|
||||
save_name: ScoNet
|
||||
eval_func: evaluate_scoliosis
|
||||
sampler:
|
||||
batch_shuffle: false
|
||||
batch_size: 8
|
||||
sample_type: all_ordered # all indicates whole sequence used to test, while ordered means input sequence by its natural order; Other options: fixed_unordered
|
||||
frames_all_limit: 720 # limit the number of sampled frames to prevent out of memory
|
||||
metric: euc # cos
|
||||
transform:
|
||||
- type: BaseSilCuttingTransform
|
||||
|
||||
loss_cfg:
|
||||
- loss_term_weight: 1.0
|
||||
margin: 0.2
|
||||
type: TripletLoss
|
||||
log_prefix: triplet
|
||||
- loss_term_weight: 1.0
|
||||
scale: 16
|
||||
type: CrossEntropyLoss
|
||||
log_prefix: softmax
|
||||
log_accuracy: true
|
||||
|
||||
|
||||
model_cfg:
|
||||
model: ScoNet
|
||||
backbone_cfg:
|
||||
type: ResNet9
|
||||
block: BasicBlock
|
||||
  channels: # Layers configuration for automatic model construction
|
||||
- 64
|
||||
- 128
|
||||
- 256
|
||||
- 512
|
||||
layers:
|
||||
- 1
|
||||
- 1
|
||||
- 1
|
||||
- 1
|
||||
strides:
|
||||
- 1
|
||||
- 2
|
||||
- 2
|
||||
- 1
|
||||
maxpool: false
|
||||
SeparateFCs:
|
||||
in_channels: 512
|
||||
out_channels: 256
|
||||
parts_num: 16
|
||||
SeparateBNNecks:
|
||||
class_num: 3
|
||||
in_channels: 256
|
||||
parts_num: 16
|
||||
bin_num:
|
||||
- 16
|
||||
|
||||
optimizer_cfg:
|
||||
lr: 0.1
|
||||
momentum: 0.9
|
||||
solver: SGD
|
||||
weight_decay: 0.0005
|
||||
|
||||
scheduler_cfg:
|
||||
gamma: 0.1
|
||||
  milestones: # Learning Rate Reduction at each milestone
|
||||
- 10000
|
||||
- 14000
|
||||
- 18000
|
||||
scheduler: MultiStepLR
|
||||
trainer_cfg:
|
||||
  enable_float16: true # half-precision float for memory reduction and speedup
|
||||
fix_BN: false
|
||||
with_test: false
|
||||
log_iter: 100
|
||||
restore_ckpt_strict: true
|
||||
restore_hint: 0
|
||||
save_iter: 20000
|
||||
save_name: ScoNet
|
||||
sync_BN: true
|
||||
total_iter: 20000
|
||||
sampler:
|
||||
batch_shuffle: true
|
||||
batch_size:
|
||||
- 8 # TripletSampler, batch_size[0] indicates Number of Identity
|
||||
    - 8 # batch_size[1] indicates samples sequence for each identity
|
||||
frames_num_fixed: 30 # fixed frames number for training
|
||||
  sample_type: fixed_unordered # fixed controls the number of input frames; unordered controls the order of the input tensor; other options: unfixed_ordered or all_ordered
|
||||
type: TripletSampler
|
||||
transform:
|
||||
- type: BaseSilCuttingTransform
|
||||
@@ -0,0 +1,101 @@
|
||||
data_cfg:
|
||||
dataset_name: Scoliosis1K
|
||||
dataset_root: ./datasets/Scoliosis1K/Scoliosis1K-sil-pkl
|
||||
dataset_partition: ./datasets/Scoliosis1K/Scoliosis1K_1116.json
|
||||
num_workers: 1
|
||||
remove_no_gallery: false # Remove probe if no gallery for it
|
||||
test_dataset_name: Scoliosis1K
|
||||
|
||||
evaluator_cfg:
|
||||
enable_float16: true
|
||||
restore_ckpt_strict: true
|
||||
restore_hint: ./ckpt/ScoNet-20000.pt
|
||||
save_name: ScoNet
|
||||
eval_func: evaluate_scoliosis
|
||||
sampler:
|
||||
batch_shuffle: false
|
||||
batch_size: 1
|
||||
sample_type: all_ordered # all indicates whole sequence used to test, while ordered means input sequence by its natural order; Other options: fixed_unordered
|
||||
frames_all_limit: 720 # limit the number of sampled frames to prevent out of memory
|
||||
metric: euc # cos
|
||||
transform:
|
||||
- type: BaseSilCuttingTransform
|
||||
|
||||
loss_cfg:
|
||||
- loss_term_weight: 1.0
|
||||
margin: 0.2
|
||||
type: TripletLoss
|
||||
log_prefix: triplet
|
||||
- loss_term_weight: 1.0
|
||||
scale: 16
|
||||
type: CrossEntropyLoss
|
||||
log_prefix: softmax
|
||||
log_accuracy: true
|
||||
|
||||
|
||||
model_cfg:
|
||||
model: ScoNet
|
||||
backbone_cfg:
|
||||
type: ResNet9
|
||||
block: BasicBlock
|
||||
  channels: # Layers configuration for automatic model construction
|
||||
- 64
|
||||
- 128
|
||||
- 256
|
||||
- 512
|
||||
layers:
|
||||
- 1
|
||||
- 1
|
||||
- 1
|
||||
- 1
|
||||
strides:
|
||||
- 1
|
||||
- 2
|
||||
- 2
|
||||
- 1
|
||||
maxpool: false
|
||||
SeparateFCs:
|
||||
in_channels: 512
|
||||
out_channels: 256
|
||||
parts_num: 16
|
||||
SeparateBNNecks:
|
||||
class_num: 3
|
||||
in_channels: 256
|
||||
parts_num: 16
|
||||
bin_num:
|
||||
- 16
|
||||
|
||||
optimizer_cfg:
|
||||
lr: 0.1
|
||||
momentum: 0.9
|
||||
solver: SGD
|
||||
weight_decay: 0.0005
|
||||
|
||||
scheduler_cfg:
|
||||
gamma: 0.1
|
||||
  milestones: # Learning Rate Reduction at each milestone
|
||||
- 10000
|
||||
- 14000
|
||||
- 18000
|
||||
scheduler: MultiStepLR
|
||||
trainer_cfg:
|
||||
  enable_float16: true # half-precision float for memory reduction and speedup
|
||||
fix_BN: false
|
||||
with_test: false
|
||||
log_iter: 100
|
||||
restore_ckpt_strict: true
|
||||
restore_hint: 0
|
||||
save_iter: 20000
|
||||
save_name: ScoNet
|
||||
sync_BN: true
|
||||
total_iter: 20000
|
||||
sampler:
|
||||
batch_shuffle: true
|
||||
batch_size:
|
||||
- 8 # TripletSampler, batch_size[0] indicates Number of Identity
|
||||
    - 8 # batch_size[1] indicates samples sequence for each identity
|
||||
frames_num_fixed: 30 # fixed frames number for training
|
||||
  sample_type: fixed_unordered # fixed controls the number of input frames; unordered controls the order of the input tensor; other options: unfixed_ordered or all_ordered
|
||||
type: TripletSampler
|
||||
transform:
|
||||
- type: BaseSilCuttingTransform
|
||||
@@ -0,0 +1,74 @@
|
||||
# ScoNet Checkpoint Evaluation Reproduction Notes
|
||||
|
||||
This document records the findings and successful procedure for reproducing ScoNet checkpoint evaluation using `uv` and the OpenGait framework.
|
||||
|
||||
## Observed Failure Sequence and Root Causes
|
||||
|
||||
### 1. Missing Dependencies (Eager Auto-Import)
|
||||
OpenGait uses a dynamic registration pattern in `opengait/modeling/models/__init__.py`. When `main.py` imports `models`, it attempts to iterate through all modules in the `models/` directory. If any model file (e.g., `BiggerGait_DINOv2.py`) has dependencies not installed in the current environment (like `timm`), the entire program fails even if you are not using that specific model.
|
||||
|
||||
**Root Cause:** `iter_modules` in `opengait/modeling/models/__init__.py` triggers imports of all sibling files.
|
||||
|
||||
### 2. GPU/World Size Mismatch
|
||||
The runtime enforces a strict equality between the number of visible GPUs and the DDP world size in `opengait/main.py`:
|
||||
|
||||
```python
|
||||
# opengait/main.py
|
||||
if torch.distributed.get_world_size() != torch.cuda.device_count():
|
||||
raise ValueError("Expect number of available GPUs({}) equals to the world size({}).".format(
|
||||
torch.cuda.device_count(), torch.distributed.get_world_size()))
|
||||
```
|
||||
|
||||
**Error Message:** `ValueError: Expect number of available GPUs(2) equals to the world size(1)`
|
||||
|
||||
### 3. Evaluator Sampler Batch Size Rule
|
||||
The evaluator enforces that the total batch size must equal the number of GPUs in testing mode, as checked in `opengait/modeling/base_model.py`.
|
||||
|
||||
**Error Message:** `ValueError: The batch size (8) must be equal to the number of GPUs (1) in testing mode!`
|
||||
|
||||
## Successful Reproduction Environment
|
||||
|
||||
- **Runtime:** `uv` with PEP 621 (`pyproject.toml`)
|
||||
- **Hardware:** 1 Visible GPU
|
||||
- **Dataset Path:** Symlinked at `datasets/Scoliosis1K` (user-created link pointing to the actual data root).
|
||||
|
||||
## Successful Command and Config
|
||||
|
||||
### Command
|
||||
```bash
|
||||
CUDA_VISIBLE_DEVICES=0 uv run python -m torch.distributed.launch \
|
||||
--nproc_per_node=1 \
|
||||
opengait/main.py \
|
||||
--cfgs ./configs/sconet/sconet_scoliosis1k_local_eval_1gpu.yaml \
|
||||
--phase test
|
||||
```
|
||||
|
||||
### Config Highlights (`configs/sconet/sconet_scoliosis1k_local_eval_1gpu.yaml`)
|
||||
```yaml
|
||||
data_cfg:
|
||||
dataset_root: ./datasets/Scoliosis1K/Scoliosis1K-sil-pkl
|
||||
dataset_partition: ./datasets/Scoliosis1K/Scoliosis1K_1116.json
|
||||
|
||||
evaluator_cfg:
|
||||
restore_hint: ./ckpt/ScoNet-20000.pt
|
||||
sampler:
|
||||
    batch_size: 1 # Must be a plain integer equal to the number of GPUs (1 here), per the base_model.py check
|
||||
sample_type: all_ordered
|
||||
```
|
||||
|
||||
## Final Metrics
|
||||
The successful evaluation of the `ScoNet-20000.pt` checkpoint yielded:
|
||||
|
||||
| Metric | Value |
|
||||
| :--- | :--- |
|
||||
| **Accuracy** | 80.88% |
|
||||
| **Macro Precision** | 81.50% |
|
||||
| **Macro Recall** | 78.82% |
|
||||
| **Macro F1** | 75.14% |
|
||||
|
||||
## Troubleshooting Checklist
|
||||
|
||||
1. **Environment:** Ensure all dependencies for *all* registered models are installed (e.g., `timm` for `BiggerGait_DINOv2.py`) to avoid eager import failures in `opengait/modeling/models/__init__.py`.
|
||||
2. **GPU Visibility:** Match `CUDA_VISIBLE_DEVICES` count exactly with `--nproc_per_node` (checked in `opengait/main.py`).
|
||||
3. **Config Check:** Verify `evaluator_cfg.sampler.batch_size` equals the number of GPUs (checked in `opengait/modeling/base_model.py`).
|
||||
4. **Data Paths:** Ensure `dataset_root` and `dataset_partition` in the YAML point to valid paths (use symlinks under `datasets/` for convenience).
|
||||
@@ -0,0 +1,31 @@
|
||||
[build-system]
|
||||
requires = ["setuptools>=61.0"]
|
||||
build-backend = "setuptools.build_meta"
|
||||
|
||||
[project]
|
||||
name = "opengait"
|
||||
version = "0.0.0"
|
||||
requires-python = ">=3.10"
|
||||
dependencies = [
|
||||
"pyyaml",
|
||||
"tensorboard",
|
||||
"opencv-python",
|
||||
"tqdm",
|
||||
"py7zr",
|
||||
"kornia",
|
||||
"einops",
|
||||
"numpy",
|
||||
"imageio",
|
||||
"Pillow",
|
||||
"scikit-learn",
|
||||
"matplotlib",
|
||||
]
|
||||
|
||||
[project.optional-dependencies]
|
||||
torch = [
|
||||
"torch>=1.10",
|
||||
"torchvision",
|
||||
]
|
||||
|
||||
[tool.setuptools]
|
||||
packages = ["opengait"]
|
||||
Reference in New Issue
Block a user