feat: retain best checkpoints and support alternate output roots

This commit is contained in:
2026-03-11 01:14:05 +08:00
parent 63e2ed1097
commit a0150c791f
14 changed files with 852 additions and 9 deletions
@@ -0,0 +1,69 @@
data_cfg:
dataset_name: Scoliosis1K
dataset_root: /mnt/public/data/Scoliosis1K/Scoliosis1K-drf-pkl-118-sigma15-joint8-bodyonly
dataset_partition: ./datasets/Scoliosis1K/Scoliosis1K_112.json
num_workers: 1
remove_no_gallery: false
evaluator_cfg:
enable_float16: true
restore_ckpt_strict: true
restore_hint: 2000
save_name: DRF_skeleton_112_sigma15_joint8_bodyonly_plaince_bridge_1gpu_10k
eval_func: evaluate_scoliosis
sampler:
batch_shuffle: false
batch_size: 1
sample_type: all_ordered
type: InferenceSampler
frames_all_limit: 720
metric: euc
transform:
- type: BaseSilCuttingTransform
- type: NoOperation
loss_cfg:
- loss_term_weight: 1.0
margin: 0.2
type: TripletLoss
log_prefix: triplet
- loss_term_weight: 1.0
scale: 16
type: CrossEntropyLoss
log_prefix: softmax
log_accuracy: true
model_cfg:
model: DRF
num_pairs: 8
num_metrics: 3
backbone_cfg:
type: ResNet9
block: BasicBlock
in_channel: 2
channels:
- 64
- 128
- 256
- 512
layers:
- 1
- 1
- 1
- 1
strides:
- 1
- 2
- 2
- 1
maxpool: false
SeparateFCs:
in_channels: 512
out_channels: 256
parts_num: 16
SeparateBNNecks:
class_num: 3
in_channels: 256
parts_num: 16
bin_num:
- 16
@@ -0,0 +1,115 @@
data_cfg:
dataset_name: Scoliosis1K
dataset_root: /mnt/public/data/Scoliosis1K/Scoliosis1K-drf-pkl-118-sigma15-joint8-bodyonly
dataset_partition: ./datasets/Scoliosis1K/Scoliosis1K_112.json
data_in_use:
- true
- false
num_workers: 1
remove_no_gallery: false
test_dataset_name: Scoliosis1K
evaluator_cfg:
enable_float16: true
restore_ckpt_strict: true
restore_hint: 80000
save_name: ScoNet_skeleton_112_sigma15_joint8_bodyonly_plaince_adamw_cosine_finetune_1gpu_80k
output_root: /mnt/hddl/data/OpenGait-output
eval_func: evaluate_scoliosis
sampler:
batch_shuffle: false
batch_size: 1
sample_type: all_ordered
type: InferenceSampler
frames_all_limit: 720
metric: euc
transform:
- type: BaseSilCuttingTransform
loss_cfg:
- loss_term_weight: 1.0
margin: 0.2
type: TripletLoss
log_prefix: triplet
- loss_term_weight: 1.0
scale: 16
type: CrossEntropyLoss
log_prefix: softmax
log_accuracy: true
model_cfg:
model: ScoNet
backbone_cfg:
type: ResNet9
block: BasicBlock
in_channel: 2
channels:
- 64
- 128
- 256
- 512
layers:
- 1
- 1
- 1
- 1
strides:
- 1
- 2
- 2
- 1
maxpool: false
SeparateFCs:
in_channels: 512
out_channels: 256
parts_num: 16
SeparateBNNecks:
class_num: 3
in_channels: 256
parts_num: 16
bin_num:
- 16
optimizer_cfg:
lr: 0.0005
solver: AdamW
weight_decay: 0.0005
scheduler_cfg:
scheduler: CosineAnnealingLR
T_max: 60000
eta_min: 0.00001
trainer_cfg:
enable_float16: true
fix_BN: false
with_test: true
log_iter: 100
restore_ckpt_strict: true
optimizer_reset: false
scheduler_reset: false
restore_hint: /mnt/hddl/data/OpenGait-output/Scoliosis1K/ScoNet/ScoNet_skeleton_112_sigma15_joint8_bodyonly_plaince_adamw_cosine_finetune_1gpu_80k/checkpoints/latest.pt
output_root: /mnt/hddl/data/OpenGait-output
auto_resume_latest: true
resume_every_iter: 500
resume_keep: 6
best_ckpt_cfg:
keep_n: 3
metric_names:
- scalar/test_f1/
- scalar/test_accuracy/
eval_iter: 1000
save_iter: 500
save_name: ScoNet_skeleton_112_sigma15_joint8_bodyonly_plaince_adamw_cosine_finetune_1gpu_80k
sync_BN: false
total_iter: 80000
sampler:
batch_shuffle: true
batch_size:
- 8
- 8
frames_num_fixed: 30
sample_type: fixed_unordered
type: TripletSampler
transform:
- type: BaseSilCuttingTransform
@@ -0,0 +1,111 @@
data_cfg:
dataset_name: Scoliosis1K
dataset_root: /mnt/public/data/Scoliosis1K/Scoliosis1K-drf-pkl-118-sigma15-joint8-bodyonly
dataset_partition: ./datasets/Scoliosis1K/Scoliosis1K_112.json
data_in_use:
- true
- false
num_workers: 1
remove_no_gallery: false
test_dataset_name: Scoliosis1K
evaluator_cfg:
enable_float16: true
restore_ckpt_strict: true
restore_hint: 20000
save_name: ScoNet_skeleton_112_sigma15_joint8_bodyonly_plaince_adamw_finetune_1gpu_20k
eval_func: evaluate_scoliosis
sampler:
batch_shuffle: false
batch_size: 1
sample_type: all_ordered
type: InferenceSampler
frames_all_limit: 720
metric: euc
transform:
- type: BaseSilCuttingTransform
loss_cfg:
- loss_term_weight: 1.0
margin: 0.2
type: TripletLoss
log_prefix: triplet
- loss_term_weight: 1.0
scale: 16
type: CrossEntropyLoss
log_prefix: softmax
log_accuracy: true
model_cfg:
model: ScoNet
backbone_cfg:
type: ResNet9
block: BasicBlock
in_channel: 2
channels:
- 64
- 128
- 256
- 512
layers:
- 1
- 1
- 1
- 1
strides:
- 1
- 2
- 2
- 1
maxpool: false
SeparateFCs:
in_channels: 512
out_channels: 256
parts_num: 16
SeparateBNNecks:
class_num: 3
in_channels: 256
parts_num: 16
bin_num:
- 16
optimizer_cfg:
lr: 0.001
solver: AdamW
weight_decay: 0.0005
scheduler_cfg:
gamma: 0.1
milestones:
- 5000
- 7000
- 9000
scheduler: MultiStepLR
trainer_cfg:
enable_float16: true
fix_BN: false
with_test: true
log_iter: 100
restore_ckpt_strict: true
optimizer_reset: true
scheduler_reset: true
restore_hint: output/Scoliosis1K/ScoNet/ScoNet_skeleton_112_sigma15_joint8_bodyonly_plaince_bridge_1gpu_10k/checkpoints/latest.pt
auto_resume_latest: true
resume_every_iter: 500
resume_keep: 6
eval_iter: 1000
save_iter: 1000
save_name: ScoNet_skeleton_112_sigma15_joint8_bodyonly_plaince_adamw_finetune_1gpu_20k
sync_BN: false
total_iter: 20000
sampler:
batch_shuffle: true
batch_size:
- 8
- 8
frames_num_fixed: 30
sample_type: fixed_unordered
type: TripletSampler
transform:
- type: BaseSilCuttingTransform
@@ -0,0 +1,111 @@
data_cfg:
dataset_name: Scoliosis1K
dataset_root: /mnt/public/data/Scoliosis1K/Scoliosis1K-drf-pkl-118-sigma15-joint8-bodyonly
dataset_partition: ./datasets/Scoliosis1K/Scoliosis1K_112.json
data_in_use:
- true
- false
num_workers: 1
remove_no_gallery: false
test_dataset_name: Scoliosis1K
evaluator_cfg:
enable_float16: true
restore_ckpt_strict: true
restore_hint: 40000
save_name: ScoNet_skeleton_112_sigma15_joint8_bodyonly_plaince_adamw_finetune_1gpu_40k
eval_func: evaluate_scoliosis
sampler:
batch_shuffle: false
batch_size: 1
sample_type: all_ordered
type: InferenceSampler
frames_all_limit: 720
metric: euc
transform:
- type: BaseSilCuttingTransform
loss_cfg:
- loss_term_weight: 1.0
margin: 0.2
type: TripletLoss
log_prefix: triplet
- loss_term_weight: 1.0
scale: 16
type: CrossEntropyLoss
log_prefix: softmax
log_accuracy: true
model_cfg:
model: ScoNet
backbone_cfg:
type: ResNet9
block: BasicBlock
in_channel: 2
channels:
- 64
- 128
- 256
- 512
layers:
- 1
- 1
- 1
- 1
strides:
- 1
- 2
- 2
- 1
maxpool: false
SeparateFCs:
in_channels: 512
out_channels: 256
parts_num: 16
SeparateBNNecks:
class_num: 3
in_channels: 256
parts_num: 16
bin_num:
- 16
optimizer_cfg:
lr: 0.001
solver: AdamW
weight_decay: 0.0005
scheduler_cfg:
gamma: 0.1
milestones:
- 10000
- 20000
- 30000
scheduler: MultiStepLR
trainer_cfg:
enable_float16: true
fix_BN: false
with_test: true
log_iter: 100
restore_ckpt_strict: true
optimizer_reset: true
scheduler_reset: true
restore_hint: output/Scoliosis1K/ScoNet/ScoNet_skeleton_112_sigma15_joint8_bodyonly_plaince_bridge_1gpu_10k/checkpoints/latest.pt
auto_resume_latest: true
resume_every_iter: 500
resume_keep: 6
eval_iter: 1000
save_iter: 1000
save_name: ScoNet_skeleton_112_sigma15_joint8_bodyonly_plaince_adamw_finetune_1gpu_40k
sync_BN: false
total_iter: 40000
sampler:
batch_shuffle: true
batch_size:
- 8
- 8
frames_num_fixed: 30
sample_type: fixed_unordered
type: TripletSampler
transform:
- type: BaseSilCuttingTransform
@@ -0,0 +1,110 @@
data_cfg:
dataset_name: Scoliosis1K
dataset_root: /mnt/public/data/Scoliosis1K/Scoliosis1K-drf-pkl-118-sigma15-joint8-bodyonly
dataset_partition: ./datasets/Scoliosis1K/Scoliosis1K_112.json
data_in_use:
- true
- false
num_workers: 1
remove_no_gallery: false
test_dataset_name: Scoliosis1K
evaluator_cfg:
enable_float16: true
restore_ckpt_strict: true
restore_hint: 20000
save_name: ScoNet_skeleton_112_sigma15_joint8_bodyonly_plaince_main_1gpu_20k
eval_func: evaluate_scoliosis
sampler:
batch_shuffle: false
batch_size: 1
sample_type: all_ordered
type: InferenceSampler
frames_all_limit: 720
metric: euc
transform:
- type: BaseSilCuttingTransform
loss_cfg:
- loss_term_weight: 1.0
margin: 0.2
type: TripletLoss
log_prefix: triplet
- loss_term_weight: 1.0
scale: 16
type: CrossEntropyLoss
log_prefix: softmax
log_accuracy: true
model_cfg:
model: ScoNet
backbone_cfg:
type: ResNet9
block: BasicBlock
in_channel: 2
channels:
- 64
- 128
- 256
- 512
layers:
- 1
- 1
- 1
- 1
strides:
- 1
- 2
- 2
- 1
maxpool: false
SeparateFCs:
in_channels: 512
out_channels: 256
parts_num: 16
SeparateBNNecks:
class_num: 3
in_channels: 256
parts_num: 16
bin_num:
- 16
optimizer_cfg:
lr: 0.1
momentum: 0.9
solver: SGD
weight_decay: 0.0005
scheduler_cfg:
gamma: 0.1
milestones:
- 10000
- 14000
- 18000
scheduler: MultiStepLR
trainer_cfg:
enable_float16: true
fix_BN: false
with_test: true
log_iter: 100
restore_ckpt_strict: true
restore_hint: 0
auto_resume_latest: true
resume_every_iter: 500
resume_keep: 6
eval_iter: 1000
save_iter: 1000
save_name: ScoNet_skeleton_112_sigma15_joint8_bodyonly_plaince_main_1gpu_20k
sync_BN: false
total_iter: 20000
sampler:
batch_shuffle: true
batch_size:
- 8
- 8
frames_num_fixed: 30
sample_type: fixed_unordered
type: TripletSampler
transform:
- type: BaseSilCuttingTransform
@@ -0,0 +1,112 @@
data_cfg:
dataset_name: Scoliosis1K
dataset_root: /mnt/public/data/Scoliosis1K/Scoliosis1K-drf-pkl-118-sigma15-joint8-bodyonly
dataset_partition: ./datasets/Scoliosis1K/Scoliosis1K_112.json
data_in_use:
- true
- false
num_workers: 1
remove_no_gallery: false
test_dataset_name: Scoliosis1K
evaluator_cfg:
enable_float16: true
restore_ckpt_strict: true
restore_hint: 20000
save_name: ScoNet_skeleton_112_sigma15_joint8_bodyonly_plaince_resume_1gpu_20k
eval_func: evaluate_scoliosis
sampler:
batch_shuffle: false
batch_size: 1
sample_type: all_ordered
type: InferenceSampler
frames_all_limit: 720
metric: euc
transform:
- type: BaseSilCuttingTransform
loss_cfg:
- loss_term_weight: 1.0
margin: 0.2
type: TripletLoss
log_prefix: triplet
- loss_term_weight: 1.0
scale: 16
type: CrossEntropyLoss
log_prefix: softmax
log_accuracy: true
model_cfg:
model: ScoNet
backbone_cfg:
type: ResNet9
block: BasicBlock
in_channel: 2
channels:
- 64
- 128
- 256
- 512
layers:
- 1
- 1
- 1
- 1
strides:
- 1
- 2
- 2
- 1
maxpool: false
SeparateFCs:
in_channels: 512
out_channels: 256
parts_num: 16
SeparateBNNecks:
class_num: 3
in_channels: 256
parts_num: 16
bin_num:
- 16
optimizer_cfg:
lr: 0.1
momentum: 0.9
solver: SGD
weight_decay: 0.0005
scheduler_cfg:
gamma: 0.1
milestones:
- 10000
- 14000
- 18000
scheduler: MultiStepLR
trainer_cfg:
enable_float16: true
fix_BN: false
with_test: true
log_iter: 100
restore_ckpt_strict: true
optimizer_reset: false
scheduler_reset: false
restore_hint: output/Scoliosis1K/ScoNet/ScoNet_skeleton_112_sigma15_joint8_bodyonly_plaince_bridge_1gpu_10k/checkpoints/latest.pt
auto_resume_latest: true
resume_every_iter: 500
resume_keep: 6
eval_iter: 1000
save_iter: 1000
save_name: ScoNet_skeleton_112_sigma15_joint8_bodyonly_plaince_resume_1gpu_20k
sync_BN: false
total_iter: 20000
sampler:
batch_shuffle: true
batch_size:
- 8
- 8
frames_num_fixed: 30
sample_type: fixed_unordered
type: TripletSampler
transform:
- type: BaseSilCuttingTransform
+24
View File
@@ -86,3 +86,27 @@
>> summary_writer.add_video('outs', outs.mean(2).unsqueeze(2), self.iteration) >> summary_writer.add_video('outs', outs.mean(2).unsqueeze(2), self.iteration)
>> ``` >> ```
> Note that this example requires the [`moviepy`](https://github.com/Zulko/moviepy) package, and hence you should run `pip install moviepy` first. > Note that this example requires the [`moviepy`](https://github.com/Zulko/moviepy) package, and hence you should run `pip install moviepy` first.
### Keep Best Checkpoints
> If you want to retain the strongest evaluation checkpoints instead of relying only on the latest or final save, you can enable best-checkpoint tracking in `trainer_cfg`.
>
> Example:
>> ```yaml
>> trainer_cfg:
>> with_test: true
>> eval_iter: 1000
>> save_iter: 1000
>> best_ckpt_cfg:
>> keep_n: 3
>> metric_names:
>> - scalar/test_f1/
>> - scalar/test_accuracy/
>> ```
>
> Behavior:
> * The normal numbered checkpoints are still written by `save_iter`.
> * After each eval, the trainer checks the configured scalar metrics and keeps the top `N` checkpoints separately for each metric.
> * Best checkpoints are saved under `<output_root>/.../checkpoints/best/<metric>/` (the output root defaults to `output/` and can be overridden with `output_root`).
> * Each best-metric directory contains an `index.json` file with the retained iterations, scores, and paths.
>
> This is useful for long or unstable runs where the best checkpoint may appear well before the final iteration.
+5
View File
@@ -164,6 +164,7 @@ Conclusion:
- on the same split, `body-only + plain CE` improved that further to `83.16 Acc / 68.24 Prec / 80.02 Rec / 68.47 F1` at `7000` - on the same split, `body-only + plain CE` improved that further to `83.16 Acc / 68.24 Prec / 80.02 Rec / 68.47 F1` at `7000`
- a later explicit rerun of the `body-only + plain CE` `7000` full-test eval reproduced that same `83.16 / 68.24 / 80.02 / 68.47` result - a later explicit rerun of the `body-only + plain CE` `7000` full-test eval reproduced that same `83.16 / 68.24 / 80.02 / 68.47` result
- adding back limited head context via `head-lite` did not improve the full-test score; its `7000` checkpoint reached only `78.07 Acc / 65.42 Prec / 80.50 Rec / 62.08 F1` - adding back limited head context via `head-lite` did not improve the full-test score; its `7000` checkpoint reached only `78.07 Acc / 65.42 Prec / 80.50 Rec / 62.08 F1`
- the first practical DRF bridge on the same `1:1:2` body-only recipe peaked early and still underperformed the plain skeleton baseline; its best retained `2000` checkpoint reached only `80.21 Acc / 58.92 Prec / 59.23 Rec / 57.84 F1` on the full test set
### Not reproducible with current evidence ### Not reproducible with current evidence
@@ -179,6 +180,10 @@ Conclusion:
- the `1:1:8` class ratio is not just a nuisance; it appears to be a major driver of the current skeleton/DRF failure mode - the `1:1:8` class ratio is not just a nuisance; it appears to be a major driver of the current skeleton/DRF failure mode
- on the easier `1:1:2` split, weighted CE is not currently the winning recipe; the best local full-test result so far came from plain CE - on the easier `1:1:2` split, weighted CE is not currently the winning recipe; the best local full-test result so far came from plain CE
- `head-lite` may help the small fixed proxy subset, but that gain did not transfer to the full `TEST_SET`, so `body-only + plain CE` remains the best practical skeleton recipe - `head-lite` may help the small fixed proxy subset, but that gain did not transfer to the full `TEST_SET`, so `body-only + plain CE` remains the best practical skeleton recipe
- DRF currently looks worse than the plain skeleton baseline not because the skeleton path is dead, but because the additional prior branch is not yet providing a selective or stable complement. The current local evidence points to three likely causes:
- the body-only skeleton baseline already captures most of the useful torso signal on `1:1:2`, so PAV may be largely redundant in this setting
- the current PGA/PAV path appears weakly selective in local diagnostics, so the prior is not clearly emphasizing a few clinically relevant parts
- DRF peaks very early and then degrades, which suggests the added branch is making optimization less stable without improving the final decision boundary
## Recommended standard for future work in this repo ## Recommended standard for future work in this repo
+6 -1
View File
@@ -37,7 +37,11 @@ Use it for:
| 2026-03-10 | `ScoNet_skeleton_112_sigma15_joint8_bodyonly_plaince_bridge_2gpu_10k` | ScoNet-MT-ske bridge | `Scoliosis1K-drf-pkl-118-sigma15-joint8-bodyonly` + `Scoliosis1K_112.json` | Same `1:1:2` body-only bridge as above, but removed weighted CE to test whether class weighting was suppressing precision on the easier split | interrupted | superseded before meaningful progress by the user-requested 1-GPU rerun on the `5070 Ti` | | 2026-03-10 | `ScoNet_skeleton_112_sigma15_joint8_bodyonly_plaince_bridge_2gpu_10k` | ScoNet-MT-ske bridge | `Scoliosis1K-drf-pkl-118-sigma15-joint8-bodyonly` + `Scoliosis1K_112.json` | Same `1:1:2` body-only bridge as above, but removed weighted CE to test whether class weighting was suppressing precision on the easier split | interrupted | superseded before meaningful progress by the user-requested 1-GPU rerun on the `5070 Ti` |
| 2026-03-10 | `ScoNet_skeleton_112_sigma15_joint8_bodyonly_plaince_bridge_1gpu_10k` | ScoNet-MT-ske bridge | `Scoliosis1K-drf-pkl-118-sigma15-joint8-bodyonly` + `Scoliosis1K_112.json` | Same plain-CE `1:1:2` bridge, relaunched on the `5070 Ti` only per user request | complete | best proxy subset at `7000`: `88.28/69.12/74.15/68.80`; full test at `7000`: `83.16/68.24/80.02/68.47`; final proxy at `10000`: `75.00/65.00/63.41/54.55` (Acc/Prec/Rec/F1) | | 2026-03-10 | `ScoNet_skeleton_112_sigma15_joint8_bodyonly_plaince_bridge_1gpu_10k` | ScoNet-MT-ske bridge | `Scoliosis1K-drf-pkl-118-sigma15-joint8-bodyonly` + `Scoliosis1K_112.json` | Same plain-CE `1:1:2` bridge, relaunched on the `5070 Ti` only per user request | complete | best proxy subset at `7000`: `88.28/69.12/74.15/68.80`; full test at `7000`: `83.16/68.24/80.02/68.47`; final proxy at `10000`: `75.00/65.00/63.41/54.55` (Acc/Prec/Rec/F1) |
| 2026-03-10 | `ScoNet_skeleton_112_sigma15_joint8_headlite_plaince_bridge_1gpu_10k` | ScoNet-MT-ske bridge | `Scoliosis1K-drf-pkl-112-sigma15-joint8-headlite` + `Scoliosis1K_112.json` | Added `head-lite` structure (nose plus shoulder links, no eyes/ears) on top of the plain-CE `1:1:2` bridge; first `3090` launch OOMed due unrelated occupancy, then relaunched on the UUID-pinned `5070 Ti` | complete | best proxy subset at `7000`: `86.72/70.15/89.00/70.44`; full test at `7000`: `78.07/65.42/80.50/62.08` (Acc/Prec/Rec/F1) | | 2026-03-10 | `ScoNet_skeleton_112_sigma15_joint8_headlite_plaince_bridge_1gpu_10k` | ScoNet-MT-ske bridge | `Scoliosis1K-drf-pkl-112-sigma15-joint8-headlite` + `Scoliosis1K_112.json` | Added `head-lite` structure (nose plus shoulder links, no eyes/ears) on top of the plain-CE `1:1:2` bridge; first `3090` launch OOMed due unrelated occupancy, then relaunched on the UUID-pinned `5070 Ti` | complete | best proxy subset at `7000`: `86.72/70.15/89.00/70.44`; full test at `7000`: `78.07/65.42/80.50/62.08` (Acc/Prec/Rec/F1) |
| 2026-03-10 | `DRF_skeleton_112_sigma15_joint8_bodyonly_plaince_bridge_1gpu_10k` | DRF bridge | `Scoliosis1K-drf-pkl-118-sigma15-joint8-bodyonly` + `Scoliosis1K_112.json` | First practical DRF run on the winning `1:1:2` skeleton recipe: `body-only`, plain CE, SGD, `10k` bridge schedule, fixed proxy subset seed `112` | running | pending | | 2026-03-10 | `DRF_skeleton_112_sigma15_joint8_bodyonly_plaince_bridge_1gpu_10k` | DRF bridge | `Scoliosis1K-drf-pkl-118-sigma15-joint8-bodyonly` + `Scoliosis1K_112.json` | First practical DRF run on the winning `1:1:2` skeleton recipe: `body-only`, plain CE, SGD, `10k` bridge schedule, fixed proxy subset seed `112` | complete | best proxy subset at `2000`: `88.28/61.79/60.31/60.93`; full test at `2000`: `80.21/58.92/59.23/57.84` (Acc/Prec/Rec/F1) |
| 2026-03-10 | `ScoNet_skeleton_112_sigma15_joint8_bodyonly_plaince_main_1gpu_20k` | ScoNet-MT-ske mainline | `Scoliosis1K-drf-pkl-118-sigma15-joint8-bodyonly` + `Scoliosis1K_112.json` | Promoted the winning practical skeleton recipe to a longer `20k` run with full `TEST_SET` eval and checkpoint save every `1000`; no proxy subset, same plain CE + SGD setup | interrupted | superseded by the true-resume continuation below |
| 2026-03-10 | `ScoNet_skeleton_112_sigma15_joint8_bodyonly_plaince_resume_1gpu_20k` | ScoNet-MT-ske mainline | `Scoliosis1K-drf-pkl-118-sigma15-joint8-bodyonly` + `Scoliosis1K_112.json` | True continuation of the earlier plain-CE `1:1:2` `10k` bridge from its `latest.pt`, extended to `20k` with full `TEST_SET` eval and checkpoint save every `1000` | interrupted | superseded by the AdamW finetune branch below |
| 2026-03-10 | `ScoNet_skeleton_112_sigma15_joint8_bodyonly_plaince_adamw_finetune_1gpu_20k` | ScoNet-MT-ske finetune | `Scoliosis1K-drf-pkl-118-sigma15-joint8-bodyonly` + `Scoliosis1K_112.json` | AdamW finetune from the earlier plain-CE `1:1:2` `10k` checkpoint; restores model weights only, resets optimizer/scheduler state, keeps full `TEST_SET` eval and checkpoint save every `1000` | interrupted | superseded by the longer overnight 40k finetune below |
| 2026-03-10 | `ScoNet_skeleton_112_sigma15_joint8_bodyonly_plaince_adamw_finetune_1gpu_40k` | ScoNet-MT-ske finetune | `Scoliosis1K-drf-pkl-118-sigma15-joint8-bodyonly` + `Scoliosis1K_112.json` | Longer overnight AdamW finetune from the same `10k` plain-CE checkpoint; restores model weights only, resets optimizer/scheduler state, extends to `40000` total iterations with full `TEST_SET` eval every `1000` | running | pending |
## Current best skeleton baseline ## Current best skeleton baseline
@@ -63,3 +67,4 @@ Current best `ScoNet-MT-ske`-style result:
- Removing weighted CE on the `1:1:2` bridge improved the current best full-test result further: `body-only + plain CE` reached `83.16 Acc / 68.24 Prec / 80.02 Rec / 68.47 F1` at `7000`, so weighted CE does not currently look beneficial on the easier split. - Removing weighted CE on the `1:1:2` bridge improved the current best full-test result further: `body-only + plain CE` reached `83.16 Acc / 68.24 Prec / 80.02 Rec / 68.47 F1` at `7000`, so weighted CE does not currently look beneficial on the easier split.
- A later full-test rerun of the retained `body-only + plain CE` `7000` checkpoint reproduced the same `83.16 / 68.24 / 80.02 / 68.47` result exactly, so that number is now explicitly reconfirmed rather than just carried forward from the original run log. - A later full-test rerun of the retained `body-only + plain CE` `7000` checkpoint reproduced the same `83.16 / 68.24 / 80.02 / 68.47` result exactly, so that number is now explicitly reconfirmed rather than just carried forward from the original run log.
- `Head-lite` looked stronger than `body-only` on the fixed 128-sequence proxy subset at `7000`, but it did not transfer to the full test set: `78.07 Acc / 65.42 Prec / 80.50 Rec / 62.08 F1`, which is clearly below the `body-only + plain CE` full-test result. - `Head-lite` looked stronger than `body-only` on the fixed 128-sequence proxy subset at `7000`, but it did not transfer to the full test set: `78.07 Acc / 65.42 Prec / 80.50 Rec / 62.08 F1`, which is clearly below the `body-only + plain CE` full-test result.
- The first practical DRF bridge on the winning `1:1:2` recipe did not beat the plain skeleton baseline. Its best retained checkpoint (`2000`) reached only `80.21 Acc / 58.92 Prec / 59.23 Rec / 57.84 F1` on the full test set, versus `83.16 / 68.24 / 80.02 / 68.47` for `body-only + plain CE` at `7000`. The working local interpretation is that the added PAV/PGA path is currently injecting a weak or noisy prior rather than a useful complementary signal.
+27
View File
@@ -124,6 +124,33 @@ The launcher configures both:
This makes it easier to recover logs even if the original shell or tool session disappears. This makes it easier to recover logs even if the original shell or tool session disappears.
## Moving outputs off the SSD
OpenGait writes checkpoints, TensorBoard summaries, best-checkpoint snapshots, and file logs under a run output root.
By default that root is `output/`, but you can override it per run with `output_root` in the engine config:
```yaml
trainer_cfg:
output_root: /mnt/hddl/data/OpenGait-output
evaluator_cfg:
output_root: /mnt/hddl/data/OpenGait-output
```
The final path layout stays the same under that root:
```text
<output_root>/<dataset>/<model>/<save_name>/
```
For long scoliosis runs, using an HDD-backed root is recommended so local SSD space is not consumed by:
- numbered checkpoints
- rolling resume checkpoints
- best-N retained checkpoints
- TensorBoard summary files
## GPU selection ## GPU selection
Prefer GPU UUIDs, not ordinal indices. Prefer GPU UUIDs, not ordinal indices.
+9 -3
View File
@@ -4,7 +4,14 @@ import argparse
import torch import torch
import torch.nn as nn import torch.nn as nn
from modeling import models from modeling import models
from opengait.utils import config_loader, get_ddp_module, init_seeds, params_count, get_msg_mgr from opengait.utils import (
config_loader,
get_ddp_module,
get_msg_mgr,
init_seeds,
params_count,
resolve_output_path,
)
parser = argparse.ArgumentParser(description='Main program for opengait.') parser = argparse.ArgumentParser(description='Main program for opengait.')
parser.add_argument('--local_rank', type=int, default=0, parser.add_argument('--local_rank', type=int, default=0,
@@ -25,8 +32,7 @@ def initialization(cfgs, training):
msg_mgr = get_msg_mgr() msg_mgr = get_msg_mgr()
engine_cfg = cfgs['trainer_cfg'] if training else cfgs['evaluator_cfg'] engine_cfg = cfgs['trainer_cfg'] if training else cfgs['evaluator_cfg']
logger_cfg = cfgs.get('logger_cfg', {}) logger_cfg = cfgs.get('logger_cfg', {})
output_path = os.path.join('output/', cfgs['data_cfg']['dataset_name'], output_path = resolve_output_path(cfgs, engine_cfg)
cfgs['model_cfg']['model'], engine_cfg['save_name'])
if training: if training:
msg_mgr.init_manager( msg_mgr.init_manager(
output_path, output_path,
+133 -3
View File
@@ -10,8 +10,10 @@ BaseModel.run_train(model)
BaseModel.run_test(model) BaseModel.run_test(model)
""" """
import json import json
import math
import os import os
import random import random
import re
from typing import Any from typing import Any
import numpy as np import numpy as np
@@ -33,7 +35,7 @@ from data.transform import get_transform
from data.collate_fn import CollateFn from data.collate_fn import CollateFn
from data.dataset import DataSet from data.dataset import DataSet
import data.sampler as Samplers import data.sampler as Samplers
from opengait.utils import Odict, mkdir, ddp_all_gather from opengait.utils import Odict, mkdir, ddp_all_gather, resolve_output_path
from opengait.utils import get_valid_args, is_list, is_dict, np2var, ts2np, list2var, get_attr_from from opengait.utils import get_valid_args, is_list, is_dict, np2var, ts2np, list2var, get_attr_from
from evaluation import evaluator as eval_functions from evaluation import evaluator as eval_functions
from opengait.utils import NoOp from opengait.utils import NoOp
@@ -144,8 +146,7 @@ class BaseModel(MetaModel, nn.Module):
if training and self.engine_cfg['enable_float16']: if training and self.engine_cfg['enable_float16']:
self.Scaler = GradScaler() self.Scaler = GradScaler()
self.save_path = osp.join('output/', cfgs['data_cfg']['dataset_name'], self.save_path = resolve_output_path(cfgs, self.engine_cfg)
cfgs['model_cfg']['model'], self.engine_cfg['save_name'])
self.build_network(cfgs['model_cfg']) self.build_network(cfgs['model_cfg'])
self.init_parameters() self.init_parameters()
@@ -317,6 +318,134 @@ class BaseModel(MetaModel, nn.Module):
return candidate return candidate
return None return None
def _best_ckpt_cfg(self) -> dict[str, Any] | None:
best_ckpt_cfg = self.engine_cfg.get('best_ckpt_cfg')
if not isinstance(best_ckpt_cfg, dict):
return None
keep_n = int(best_ckpt_cfg.get('keep_n', 0))
metric_names = best_ckpt_cfg.get('metric_names', [])
if keep_n <= 0 or not isinstance(metric_names, list) or not metric_names:
return None
return best_ckpt_cfg
def _best_ckpt_root(self) -> str:
    """Return the directory holding all best-checkpoint subdirectories.

    Best snapshots live in a dedicated ``best/`` folder inside the normal
    checkpoint directory, so they never collide with numbered checkpoints.
    """
    checkpoint_dir = self._checkpoint_dir()
    return osp.join(checkpoint_dir, "best")
def _best_metric_dir(self, metric_name: str) -> str:
    """Map a (possibly slashed) metric name to its best-checkpoint directory.

    Runs of characters outside ``[A-Za-z0-9_.-]`` are collapsed into ``_``
    so scalar names such as ``scalar/test_f1/`` become safe directory names.
    """
    slug = re.sub(r"[^A-Za-z0-9_.-]+", "_", metric_name).strip("._")
    return osp.join(self._best_ckpt_root(), slug)
def _best_metric_index_path(self, metric_name: str) -> str:
    """Return the path of the JSON index listing retained ckpts for a metric."""
    metric_dir = self._best_metric_dir(metric_name)
    return osp.join(metric_dir, "index.json")
def _load_best_metric_index(self, metric_name: str) -> list[dict[str, Any]]:
    """Load the on-disk best-checkpoint index for *metric_name*.

    Only well-formed dict entries whose checkpoint file still exists are
    returned, so checkpoints deleted by hand silently drop out of the
    ranking. A missing, unreadable, or corrupt index yields an empty list
    instead of aborting training mid-eval; the index is simply rebuilt on
    the next best-checkpoint save.
    """
    index_path = self._best_metric_index_path(metric_name)
    if not osp.isfile(index_path):
        return []
    try:
        with open(index_path, "r", encoding="utf-8") as handle:
            raw_entries = json.load(handle)
    except (OSError, ValueError):
        # json.JSONDecodeError subclasses ValueError; a truncated or
        # corrupt index must not kill a long-running training job.
        return []
    if not isinstance(raw_entries, list):
        return []
    entries: list[dict[str, Any]] = []
    for entry in raw_entries:
        if not isinstance(entry, dict):
            continue
        path = entry.get("path")
        if isinstance(path, str) and osp.isfile(path):
            entries.append(entry)
    return entries
def _write_best_metric_index(
    self,
    metric_name: str,
    entries: list[dict[str, Any]],
) -> None:
    """Atomically persist *entries* as the metric's ``index.json``."""
    target = self._best_metric_index_path(metric_name)
    mkdir(osp.dirname(target))
    # Write to a scratch file and swap it in with os.replace so a reader
    # never observes a half-written index.
    scratch = target + ".tmp"
    with open(scratch, "w", encoding="utf-8") as handle:
        json.dump(entries, handle, indent=2, sort_keys=True)
    os.replace(scratch, target)
def _summary_scalar(self, value: Any) -> float | None:
if isinstance(value, torch.Tensor):
return float(value.detach().float().mean().item())
if isinstance(value, np.ndarray):
return float(np.mean(value))
if isinstance(value, (float, int, np.floating, np.integer)):
return float(value)
return None
def _save_best_ckpts(
    self,
    iteration: int,
    result_dict: dict[str, Any],
) -> None:
    """Retain the top ``keep_n`` checkpoints per configured eval metric.

    For every metric in ``best_ckpt_cfg.metric_names`` that appears in
    *result_dict*, the current iteration is ranked against the previously
    retained checkpoints (ties broken toward the later iteration). If it
    makes the top ``keep_n``: a snapshot is written into the metric's
    best-checkpoint directory, files that fell out of the ranking are
    deleted, and the metric's ``index.json`` is rewritten.

    Only DDP rank 0 performs any work; other ranks return immediately.
    """
    if torch.distributed.get_rank() != 0:
        return
    best_ckpt_cfg = self._best_ckpt_cfg()
    if best_ckpt_cfg is None:
        return
    keep_n = int(best_ckpt_cfg['keep_n'])
    metric_names = [metric for metric in best_ckpt_cfg['metric_names'] if metric in result_dict]
    if not metric_names:
        return
    # The checkpoint payload is built lazily, at most once, and shared by
    # every metric that retains this iteration.
    checkpoint: dict[str, Any] | None = None
    save_name = self.engine_cfg['save_name']
    for metric_name in metric_names:
        score = self._summary_scalar(result_dict.get(metric_name))
        if score is None or not math.isfinite(score):
            continue
        prior_entries = self._load_best_metric_index(metric_name)
        # Drop any stale record of this same iteration so a re-evaluation
        # replaces (rather than duplicates) its earlier entry.
        entries = [
            entry for entry in prior_entries
            if int(entry.get("iteration", -1)) != iteration
        ]
        ranked_entries = sorted(
            entries + [{"iteration": iteration, "score": score, "path": ""}],
            key=lambda entry: (float(entry["score"]), int(entry["iteration"])),
            reverse=True,
        )
        kept_entries = ranked_entries[:keep_n]
        if not any(int(entry["iteration"]) == iteration for entry in kept_entries):
            # Current iteration did not make the top-N for this metric.
            continue
        metric_dir = self._best_metric_dir(metric_name)
        mkdir(metric_dir)
        metric_slug = osp.basename(metric_dir)
        best_path = osp.join(
            metric_dir,
            f"{save_name}-iter-{iteration:0>5}-score-{score:.4f}-{metric_slug}.pt",
        )
        if checkpoint is None:
            checkpoint = self._build_checkpoint(iteration)
        self._save_checkpoint_file(checkpoint, best_path)
        refreshed_entries = []
        for entry in kept_entries:
            if int(entry["iteration"]) == iteration:
                refreshed_entries.append(
                    {
                        "iteration": iteration,
                        "score": score,
                        "path": best_path,
                    }
                )
            else:
                refreshed_entries.append(entry)
        keep_paths = {entry["path"] for entry in refreshed_entries if isinstance(entry.get("path"), str)}
        # Scan *prior_entries* (not the same-iteration-filtered list) so an
        # older snapshot of this very iteration -- saved under a different
        # score filename -- is removed too instead of being orphaned on disk.
        for stale_entry in prior_entries:
            stale_path = stale_entry.get("path")
            if isinstance(stale_path, str) and stale_path not in keep_paths and osp.isfile(stale_path):
                os.remove(stale_path)
        self._write_best_metric_index(metric_name, refreshed_entries)
def save_ckpt(self, iteration): def save_ckpt(self, iteration):
if torch.distributed.get_rank() == 0: if torch.distributed.get_rank() == 0:
save_name = self.engine_cfg['save_name'] save_name = self.engine_cfg['save_name']
@@ -589,6 +718,7 @@ class BaseModel(MetaModel, nn.Module):
if result_dict: if result_dict:
model.msg_mgr.write_to_tensorboard(result_dict) model.msg_mgr.write_to_tensorboard(result_dict)
model.msg_mgr.write_to_wandb(result_dict) model.msg_mgr.write_to_wandb(result_dict)
model._save_best_ckpts(model.iteration, result_dict)
model.msg_mgr.reset_time() model.msg_mgr.reset_time()
if model.iteration >= model.engine_cfg['total_iter']: if model.iteration >= model.engine_cfg['total_iter']:
break break
+1
View File
@@ -7,4 +7,5 @@ from .common import mkdir, clones
from .common import MergeCfgsDict from .common import MergeCfgsDict
from .common import get_attr_from from .common import get_attr_from
from .common import NoOp from .common import NoOp
from .common import resolve_output_path
from .msg_manager import get_msg_mgr from .msg_manager import get_msg_mgr
+17
View File
@@ -2,6 +2,7 @@ import copy
import os import os
import inspect import inspect
import logging import logging
from pathlib import Path
import torch import torch
import numpy as np import numpy as np
import torch.nn as nn import torch.nn as nn
@@ -203,3 +204,19 @@ def get_ddp_module(module, find_unused_parameters=False, **kwargs):
def params_count(net): def params_count(net):
n_parameters = sum(p.numel() for p in net.parameters()) n_parameters = sum(p.numel() for p in net.parameters())
return 'Parameters Count: {:.5f}M'.format(n_parameters / 1e6) return 'Parameters Count: {:.5f}M'.format(n_parameters / 1e6)
def resolve_output_path(cfgs, engine_cfg):
    """Build the run output directory ``<root>/<dataset>/<model>/<save_name>``.

    The root is taken from the first truthy source, in order: the
    engine-level ``output_root``, the top-level ``output_root``, the
    ``OPENGAIT_OUTPUT_ROOT`` environment variable, and finally the
    relative default ``output``. A leading ``~`` in the root is expanded.
    """
    root = engine_cfg.get('output_root')
    if not root:
        root = cfgs.get('output_root')
    if not root:
        root = os.environ.get('OPENGAIT_OUTPUT_ROOT')
    if not root:
        root = 'output'
    root = str(Path(root).expanduser())
    subdirs = (
        cfgs['data_cfg']['dataset_name'],
        cfgs['model_cfg']['model'],
        engine_cfg['save_name'],
    )
    return os.path.join(root, *subdirs)