docs: add uv workflow and ScoNet eval reproduction notes
This commit is contained in:
@@ -145,3 +145,5 @@ dmypy.json
|
||||
|
||||
# Cython debug symbols
|
||||
cython_debug/
|
||||
|
||||
ckpt/
|
||||
|
||||
@@ -44,6 +44,7 @@ OpenGait/
|
||||
|
||||
## CONVENTIONS
|
||||
- Launch pattern is DDP-first (`python -m torch.distributed.launch ... opengait/main.py --cfgs ... --phase ...`).
|
||||
- DDP Constraints: `world_size` must equal number of visible GPUs; test `evaluator_cfg.sampler.batch_size` must equal `world_size`.
|
||||
- Model/loss/backbone discoverability is filesystem-driven via package-level dynamic imports.
|
||||
- Experiment config semantics: custom YAML overlays `configs/default.yaml` (local key precedence).
|
||||
- Outputs are keyed by config identity: `output/${dataset_name}/${model}/${save_name}`.
|
||||
@@ -61,6 +62,18 @@ OpenGait/
|
||||
|
||||
## COMMANDS
|
||||
```bash
|
||||
# install (uv)
|
||||
uv sync --extra torch
|
||||
|
||||
# train (uv)
|
||||
CUDA_VISIBLE_DEVICES=0,1 uv run python -m torch.distributed.launch --nproc_per_node=2 opengait/main.py --cfgs ./configs/baseline/baseline.yaml --phase train
|
||||
|
||||
# test (uv)
|
||||
CUDA_VISIBLE_DEVICES=0,1 uv run python -m torch.distributed.launch --nproc_per_node=2 opengait/main.py --cfgs ./configs/baseline/baseline.yaml --phase test
|
||||
|
||||
# ScoNet 1-GPU eval
|
||||
CUDA_VISIBLE_DEVICES=0 uv run python -m torch.distributed.launch --nproc_per_node=1 opengait/main.py --cfgs ./configs/sconet/sconet_scoliosis1k_local_eval_1gpu.yaml --phase test
|
||||
|
||||
# train
|
||||
CUDA_VISIBLE_DEVICES=0,1 python -m torch.distributed.launch --nproc_per_node=2 opengait/main.py --cfgs ./configs/baseline/baseline.yaml --phase train
|
||||
|
||||
|
||||
@@ -76,6 +76,20 @@ See [here](https://github.com/jdyjjj/All-in-One-Gait) for details.
|
||||
- **Nice log**: We use [`tensorboard`](https://pytorch.org/docs/stable/tensorboard.html) and `logging` to log everything, which looks pretty.
|
||||
|
||||
## Getting Started
|
||||
### Quick Start (uv)
|
||||
```bash
|
||||
# Install dependencies
|
||||
uv sync --extra torch
|
||||
|
||||
# Train
|
||||
CUDA_VISIBLE_DEVICES=0,1 uv run python -m torch.distributed.launch --nproc_per_node=2 opengait/main.py --cfgs ./configs/baseline/baseline.yaml --phase train
|
||||
|
||||
# Test
|
||||
CUDA_VISIBLE_DEVICES=0,1 uv run python -m torch.distributed.launch --nproc_per_node=2 opengait/main.py --cfgs ./configs/baseline/baseline.yaml --phase test
|
||||
```
|
||||
|
||||
> **Note:** The `--nproc_per_node` argument must exactly match the number of GPUs specified in `CUDA_VISIBLE_DEVICES`. For single-GPU evaluation, use `CUDA_VISIBLE_DEVICES=0` and `--nproc_per_node=1` with the DDP launcher.
|
||||
|
||||
|
||||
|
||||
Please see [0.get_started.md](docs/0.get_started.md). We also provide the following tutorials for your reference:
|
||||
|
||||
@@ -0,0 +1,101 @@
|
||||
data_cfg:
|
||||
dataset_name: Scoliosis1K
|
||||
dataset_root: ./datasets/Scoliosis1K/Scoliosis1K-sil-pkl
|
||||
dataset_partition: ./datasets/Scoliosis1K/Scoliosis1K_1116.json
|
||||
num_workers: 1
|
||||
remove_no_gallery: false # Remove probe if no gallery for it
|
||||
test_dataset_name: Scoliosis1K
|
||||
|
||||
evaluator_cfg:
|
||||
enable_float16: true
|
||||
restore_ckpt_strict: true
|
||||
restore_hint: ./ckpt/ScoNet-20000.pt
|
||||
save_name: ScoNet
|
||||
eval_func: evaluate_scoliosis
|
||||
sampler:
|
||||
batch_shuffle: false
|
||||
batch_size: 8
|
||||
sample_type: all_ordered # all indicates whole sequence used to test, while ordered means input sequence by its natural order; Other options: fixed_unordered
|
||||
frames_all_limit: 720 # limit the number of sampled frames to prevent out of memory
|
||||
metric: euc # cos
|
||||
transform:
|
||||
- type: BaseSilCuttingTransform
|
||||
|
||||
loss_cfg:
|
||||
- loss_term_weight: 1.0
|
||||
margin: 0.2
|
||||
type: TripletLoss
|
||||
log_prefix: triplet
|
||||
- loss_term_weight: 1.0
|
||||
scale: 16
|
||||
type: CrossEntropyLoss
|
||||
log_prefix: softmax
|
||||
log_accuracy: true
|
||||
|
||||
|
||||
model_cfg:
|
||||
model: ScoNet
|
||||
backbone_cfg:
|
||||
type: ResNet9
|
||||
block: BasicBlock
|
||||
  channels: # Layers configuration for automatic model construction
|
||||
- 64
|
||||
- 128
|
||||
- 256
|
||||
- 512
|
||||
layers:
|
||||
- 1
|
||||
- 1
|
||||
- 1
|
||||
- 1
|
||||
strides:
|
||||
- 1
|
||||
- 2
|
||||
- 2
|
||||
- 1
|
||||
maxpool: false
|
||||
SeparateFCs:
|
||||
in_channels: 512
|
||||
out_channels: 256
|
||||
parts_num: 16
|
||||
SeparateBNNecks:
|
||||
class_num: 3
|
||||
in_channels: 256
|
||||
parts_num: 16
|
||||
bin_num:
|
||||
- 16
|
||||
|
||||
optimizer_cfg:
|
||||
lr: 0.1
|
||||
momentum: 0.9
|
||||
solver: SGD
|
||||
weight_decay: 0.0005
|
||||
|
||||
scheduler_cfg:
|
||||
gamma: 0.1
|
||||
  milestones: # Learning Rate Reduction at each milestone
|
||||
- 10000
|
||||
- 14000
|
||||
- 18000
|
||||
scheduler: MultiStepLR
|
||||
trainer_cfg:
|
||||
  enable_float16: true # half-precision float for memory reduction and speedup
|
||||
fix_BN: false
|
||||
with_test: false
|
||||
log_iter: 100
|
||||
restore_ckpt_strict: true
|
||||
restore_hint: 0
|
||||
save_iter: 20000
|
||||
save_name: ScoNet
|
||||
sync_BN: true
|
||||
total_iter: 20000
|
||||
sampler:
|
||||
batch_shuffle: true
|
||||
batch_size:
|
||||
- 8 # TripletSampler, batch_size[0] indicates Number of Identity
|
||||
    - 8 # batch_size[1] indicates samples sequence for each identity
|
||||
frames_num_fixed: 30 # fixed frames number for training
|
||||
  sample_type: fixed_unordered # fixed controls the number of input frames; unordered controls the order of the input tensor; other options: unfixed_ordered or all_ordered
|
||||
type: TripletSampler
|
||||
transform:
|
||||
- type: BaseSilCuttingTransform
|
||||
@@ -0,0 +1,101 @@
|
||||
data_cfg:
|
||||
dataset_name: Scoliosis1K
|
||||
dataset_root: ./datasets/Scoliosis1K/Scoliosis1K-sil-pkl
|
||||
dataset_partition: ./datasets/Scoliosis1K/Scoliosis1K_1116.json
|
||||
num_workers: 1
|
||||
remove_no_gallery: false # Remove probe if no gallery for it
|
||||
test_dataset_name: Scoliosis1K
|
||||
|
||||
evaluator_cfg:
|
||||
enable_float16: true
|
||||
restore_ckpt_strict: true
|
||||
restore_hint: ./ckpt/ScoNet-20000.pt
|
||||
save_name: ScoNet
|
||||
eval_func: evaluate_scoliosis
|
||||
sampler:
|
||||
batch_shuffle: false
|
||||
batch_size: 1
|
||||
sample_type: all_ordered # all indicates whole sequence used to test, while ordered means input sequence by its natural order; Other options: fixed_unordered
|
||||
frames_all_limit: 720 # limit the number of sampled frames to prevent out of memory
|
||||
metric: euc # cos
|
||||
transform:
|
||||
- type: BaseSilCuttingTransform
|
||||
|
||||
loss_cfg:
|
||||
- loss_term_weight: 1.0
|
||||
margin: 0.2
|
||||
type: TripletLoss
|
||||
log_prefix: triplet
|
||||
- loss_term_weight: 1.0
|
||||
scale: 16
|
||||
type: CrossEntropyLoss
|
||||
log_prefix: softmax
|
||||
log_accuracy: true
|
||||
|
||||
|
||||
model_cfg:
|
||||
model: ScoNet
|
||||
backbone_cfg:
|
||||
type: ResNet9
|
||||
block: BasicBlock
|
||||
  channels: # Layers configuration for automatic model construction
|
||||
- 64
|
||||
- 128
|
||||
- 256
|
||||
- 512
|
||||
layers:
|
||||
- 1
|
||||
- 1
|
||||
- 1
|
||||
- 1
|
||||
strides:
|
||||
- 1
|
||||
- 2
|
||||
- 2
|
||||
- 1
|
||||
maxpool: false
|
||||
SeparateFCs:
|
||||
in_channels: 512
|
||||
out_channels: 256
|
||||
parts_num: 16
|
||||
SeparateBNNecks:
|
||||
class_num: 3
|
||||
in_channels: 256
|
||||
parts_num: 16
|
||||
bin_num:
|
||||
- 16
|
||||
|
||||
optimizer_cfg:
|
||||
lr: 0.1
|
||||
momentum: 0.9
|
||||
solver: SGD
|
||||
weight_decay: 0.0005
|
||||
|
||||
scheduler_cfg:
|
||||
gamma: 0.1
|
||||
  milestones: # Learning Rate Reduction at each milestone
|
||||
- 10000
|
||||
- 14000
|
||||
- 18000
|
||||
scheduler: MultiStepLR
|
||||
trainer_cfg:
|
||||
  enable_float16: true # half-precision float for memory reduction and speedup
|
||||
fix_BN: false
|
||||
with_test: false
|
||||
log_iter: 100
|
||||
restore_ckpt_strict: true
|
||||
restore_hint: 0
|
||||
save_iter: 20000
|
||||
save_name: ScoNet
|
||||
sync_BN: true
|
||||
total_iter: 20000
|
||||
sampler:
|
||||
batch_shuffle: true
|
||||
batch_size:
|
||||
- 8 # TripletSampler, batch_size[0] indicates Number of Identity
|
||||
    - 8 # batch_size[1] indicates samples sequence for each identity
|
||||
frames_num_fixed: 30 # fixed frames number for training
|
||||
  sample_type: fixed_unordered # fixed controls the number of input frames; unordered controls the order of the input tensor; other options: unfixed_ordered or all_ordered
|
||||
type: TripletSampler
|
||||
transform:
|
||||
- type: BaseSilCuttingTransform
|
||||
@@ -0,0 +1,74 @@
|
||||
# ScoNet Checkpoint Evaluation Reproduction Notes
|
||||
|
||||
This document records the findings and successful procedure for reproducing ScoNet checkpoint evaluation using `uv` and the OpenGait framework.
|
||||
|
||||
## Observed Failure Sequence and Root Causes
|
||||
|
||||
### 1. Missing Dependencies (Eager Auto-Import)
|
||||
OpenGait uses a dynamic registration pattern in `opengait/modeling/models/__init__.py`. When `main.py` imports `models`, it attempts to iterate through all modules in the `models/` directory. If any model file (e.g., `BiggerGait_DINOv2.py`) has dependencies not installed in the current environment (like `timm`), the entire program fails even if you are not using that specific model.
|
||||
|
||||
**Root Cause:** `iter_modules` in `opengait/modeling/models/__init__.py` triggers imports of all sibling files.
|
||||
|
||||
### 2. GPU/World Size Mismatch
|
||||
The runtime enforces a strict equality between the number of visible GPUs and the DDP world size in `opengait/main.py`:
|
||||
|
||||
```python
|
||||
# opengait/main.py
|
||||
if torch.distributed.get_world_size() != torch.cuda.device_count():
|
||||
raise ValueError("Expect number of available GPUs({}) equals to the world size({}).".format(
|
||||
torch.cuda.device_count(), torch.distributed.get_world_size()))
|
||||
```
|
||||
|
||||
**Error Message:** `ValueError: Expect number of available GPUs(2) equals to the world size(1)`
|
||||
|
||||
### 3. Evaluator Sampler Batch Size Rule
|
||||
The evaluator enforces that the total batch size must equal the number of GPUs in testing mode, as checked in `opengait/modeling/base_model.py`.
|
||||
|
||||
**Error Message:** `ValueError: The batch size (8) must be equal to the number of GPUs (1) in testing mode!`
|
||||
|
||||
## Successful Reproduction Environment
|
||||
|
||||
- **Runtime:** `uv` with PEP 621 (`pyproject.toml`)
|
||||
- **Hardware:** 1 Visible GPU
|
||||
- **Dataset Path:** Symlinked at `datasets/Scoliosis1K` (user-created link pointing to the actual data root).
|
||||
|
||||
## Successful Command and Config
|
||||
|
||||
### Command
|
||||
```bash
|
||||
CUDA_VISIBLE_DEVICES=0 uv run python -m torch.distributed.launch \
|
||||
--nproc_per_node=1 \
|
||||
opengait/main.py \
|
||||
--cfgs ./configs/sconet/sconet_scoliosis1k_local_eval_1gpu.yaml \
|
||||
--phase test
|
||||
```
|
||||
|
||||
### Config Highlights (`configs/sconet/sconet_scoliosis1k_local_eval_1gpu.yaml`)
|
||||
```yaml
|
||||
data_cfg:
|
||||
dataset_root: ./datasets/Scoliosis1K/Scoliosis1K-sil-pkl
|
||||
dataset_partition: ./datasets/Scoliosis1K/Scoliosis1K_1116.json
|
||||
|
||||
evaluator_cfg:
|
||||
restore_hint: ./ckpt/ScoNet-20000.pt
|
||||
sampler:
|
||||
    batch_size: 1 # Must be a plain integer equal to the number of GPUs (1 here), per the base_model.py check
|
||||
sample_type: all_ordered
|
||||
```
|
||||
|
||||
## Final Metrics
|
||||
The successful evaluation of the `ScoNet-20000.pt` checkpoint yielded:
|
||||
|
||||
| Metric | Value |
|
||||
| :--- | :--- |
|
||||
| **Accuracy** | 80.88% |
|
||||
| **Macro Precision** | 81.50% |
|
||||
| **Macro Recall** | 78.82% |
|
||||
| **Macro F1** | 75.14% |
|
||||
|
||||
## Troubleshooting Checklist
|
||||
|
||||
1. **Environment:** Ensure all dependencies for *all* registered models are installed (e.g., `timm` for `BiggerGait_DINOv2.py`) to avoid eager import failures in `opengait/modeling/models/__init__.py`.
|
||||
2. **GPU Visibility:** Match `CUDA_VISIBLE_DEVICES` count exactly with `--nproc_per_node` (checked in `opengait/main.py`).
|
||||
3. **Config Check:** Verify `evaluator_cfg.sampler.batch_size` equals the number of GPUs (checked in `opengait/modeling/base_model.py`).
|
||||
4. **Data Paths:** Ensure `dataset_root` and `dataset_partition` in the YAML point to valid paths (use symlinks under `datasets/` for convenience).
|
||||
@@ -0,0 +1,31 @@
|
||||
[build-system]
|
||||
requires = ["setuptools>=61.0"]
|
||||
build-backend = "setuptools.build_meta"
|
||||
|
||||
[project]
|
||||
name = "opengait"
|
||||
version = "0.0.0"
|
||||
requires-python = ">=3.10"
|
||||
dependencies = [
|
||||
"pyyaml",
|
||||
"tensorboard",
|
||||
"opencv-python",
|
||||
"tqdm",
|
||||
"py7zr",
|
||||
"kornia",
|
||||
"einops",
|
||||
"numpy",
|
||||
"imageio",
|
||||
"Pillow",
|
||||
"scikit-learn",
|
||||
"matplotlib",
|
||||
]
|
||||
|
||||
[project.optional-dependencies]
|
||||
torch = [
|
||||
"torch>=1.10",
|
||||
"torchvision",
|
||||
]
|
||||
|
||||
[tool.setuptools]
|
||||
packages = ["opengait"]
|
||||
Reference in New Issue
Block a user