diff --git a/configs/drf/drf_scoliosis1k_112_sigma15_joint8_bodyonly_plaince_bridge_eval_2000_1gpu.yaml b/configs/drf/drf_scoliosis1k_112_sigma15_joint8_bodyonly_plaince_bridge_eval_2000_1gpu.yaml new file mode 100644 index 0000000..500137c --- /dev/null +++ b/configs/drf/drf_scoliosis1k_112_sigma15_joint8_bodyonly_plaince_bridge_eval_2000_1gpu.yaml @@ -0,0 +1,69 @@ +data_cfg: + dataset_name: Scoliosis1K + dataset_root: /mnt/public/data/Scoliosis1K/Scoliosis1K-drf-pkl-118-sigma15-joint8-bodyonly + dataset_partition: ./datasets/Scoliosis1K/Scoliosis1K_112.json + num_workers: 1 + remove_no_gallery: false + +evaluator_cfg: + enable_float16: true + restore_ckpt_strict: true + restore_hint: 2000 + save_name: DRF_skeleton_112_sigma15_joint8_bodyonly_plaince_bridge_1gpu_10k + eval_func: evaluate_scoliosis + sampler: + batch_shuffle: false + batch_size: 1 + sample_type: all_ordered + type: InferenceSampler + frames_all_limit: 720 + metric: euc + transform: + - type: BaseSilCuttingTransform + - type: NoOperation + +loss_cfg: + - loss_term_weight: 1.0 + margin: 0.2 + type: TripletLoss + log_prefix: triplet + - loss_term_weight: 1.0 + scale: 16 + type: CrossEntropyLoss + log_prefix: softmax + log_accuracy: true + +model_cfg: + model: DRF + num_pairs: 8 + num_metrics: 3 + backbone_cfg: + type: ResNet9 + block: BasicBlock + in_channel: 2 + channels: + - 64 + - 128 + - 256 + - 512 + layers: + - 1 + - 1 + - 1 + - 1 + strides: + - 1 + - 2 + - 2 + - 1 + maxpool: false + SeparateFCs: + in_channels: 512 + out_channels: 256 + parts_num: 16 + SeparateBNNecks: + class_num: 3 + in_channels: 256 + parts_num: 16 + bin_num: + - 16 diff --git a/configs/sconet/sconet_scoliosis1k_skeleton_112_sigma15_joint8_bodyonly_plaince_adamw_cosine_finetune_1gpu_80k.yaml b/configs/sconet/sconet_scoliosis1k_skeleton_112_sigma15_joint8_bodyonly_plaince_adamw_cosine_finetune_1gpu_80k.yaml new file mode 100644 index 0000000..5a35b01 --- /dev/null +++ 
b/configs/sconet/sconet_scoliosis1k_skeleton_112_sigma15_joint8_bodyonly_plaince_adamw_cosine_finetune_1gpu_80k.yaml @@ -0,0 +1,115 @@ +data_cfg: + dataset_name: Scoliosis1K + dataset_root: /mnt/public/data/Scoliosis1K/Scoliosis1K-drf-pkl-118-sigma15-joint8-bodyonly + dataset_partition: ./datasets/Scoliosis1K/Scoliosis1K_112.json + data_in_use: + - true + - false + num_workers: 1 + remove_no_gallery: false + test_dataset_name: Scoliosis1K + +evaluator_cfg: + enable_float16: true + restore_ckpt_strict: true + restore_hint: 80000 + save_name: ScoNet_skeleton_112_sigma15_joint8_bodyonly_plaince_adamw_cosine_finetune_1gpu_80k + output_root: /mnt/hddl/data/OpenGait-output + eval_func: evaluate_scoliosis + sampler: + batch_shuffle: false + batch_size: 1 + sample_type: all_ordered + type: InferenceSampler + frames_all_limit: 720 + metric: euc + transform: + - type: BaseSilCuttingTransform + +loss_cfg: + - loss_term_weight: 1.0 + margin: 0.2 + type: TripletLoss + log_prefix: triplet + - loss_term_weight: 1.0 + scale: 16 + type: CrossEntropyLoss + log_prefix: softmax + log_accuracy: true + +model_cfg: + model: ScoNet + backbone_cfg: + type: ResNet9 + block: BasicBlock + in_channel: 2 + channels: + - 64 + - 128 + - 256 + - 512 + layers: + - 1 + - 1 + - 1 + - 1 + strides: + - 1 + - 2 + - 2 + - 1 + maxpool: false + SeparateFCs: + in_channels: 512 + out_channels: 256 + parts_num: 16 + SeparateBNNecks: + class_num: 3 + in_channels: 256 + parts_num: 16 + bin_num: + - 16 + +optimizer_cfg: + lr: 0.0005 + solver: AdamW + weight_decay: 0.0005 + +scheduler_cfg: + scheduler: CosineAnnealingLR + T_max: 60000 + eta_min: 0.00001 + +trainer_cfg: + enable_float16: true + fix_BN: false + with_test: true + log_iter: 100 + restore_ckpt_strict: true + optimizer_reset: false + scheduler_reset: false + restore_hint: /mnt/hddl/data/OpenGait-output/Scoliosis1K/ScoNet/ScoNet_skeleton_112_sigma15_joint8_bodyonly_plaince_adamw_cosine_finetune_1gpu_80k/checkpoints/latest.pt + output_root: 
/mnt/hddl/data/OpenGait-output + auto_resume_latest: true + resume_every_iter: 500 + resume_keep: 6 + best_ckpt_cfg: + keep_n: 3 + metric_names: + - scalar/test_f1/ + - scalar/test_accuracy/ + eval_iter: 1000 + save_iter: 500 + save_name: ScoNet_skeleton_112_sigma15_joint8_bodyonly_plaince_adamw_cosine_finetune_1gpu_80k + sync_BN: false + total_iter: 80000 + sampler: + batch_shuffle: true + batch_size: + - 8 + - 8 + frames_num_fixed: 30 + sample_type: fixed_unordered + type: TripletSampler + transform: + - type: BaseSilCuttingTransform diff --git a/configs/sconet/sconet_scoliosis1k_skeleton_112_sigma15_joint8_bodyonly_plaince_adamw_finetune_1gpu_20k.yaml b/configs/sconet/sconet_scoliosis1k_skeleton_112_sigma15_joint8_bodyonly_plaince_adamw_finetune_1gpu_20k.yaml new file mode 100644 index 0000000..40f601d --- /dev/null +++ b/configs/sconet/sconet_scoliosis1k_skeleton_112_sigma15_joint8_bodyonly_plaince_adamw_finetune_1gpu_20k.yaml @@ -0,0 +1,111 @@ +data_cfg: + dataset_name: Scoliosis1K + dataset_root: /mnt/public/data/Scoliosis1K/Scoliosis1K-drf-pkl-118-sigma15-joint8-bodyonly + dataset_partition: ./datasets/Scoliosis1K/Scoliosis1K_112.json + data_in_use: + - true + - false + num_workers: 1 + remove_no_gallery: false + test_dataset_name: Scoliosis1K + +evaluator_cfg: + enable_float16: true + restore_ckpt_strict: true + restore_hint: 20000 + save_name: ScoNet_skeleton_112_sigma15_joint8_bodyonly_plaince_adamw_finetune_1gpu_20k + eval_func: evaluate_scoliosis + sampler: + batch_shuffle: false + batch_size: 1 + sample_type: all_ordered + type: InferenceSampler + frames_all_limit: 720 + metric: euc + transform: + - type: BaseSilCuttingTransform + +loss_cfg: + - loss_term_weight: 1.0 + margin: 0.2 + type: TripletLoss + log_prefix: triplet + - loss_term_weight: 1.0 + scale: 16 + type: CrossEntropyLoss + log_prefix: softmax + log_accuracy: true + +model_cfg: + model: ScoNet + backbone_cfg: + type: ResNet9 + block: BasicBlock + in_channel: 2 + channels: + - 64 + - 128 + - 
256 + - 512 + layers: + - 1 + - 1 + - 1 + - 1 + strides: + - 1 + - 2 + - 2 + - 1 + maxpool: false + SeparateFCs: + in_channels: 512 + out_channels: 256 + parts_num: 16 + SeparateBNNecks: + class_num: 3 + in_channels: 256 + parts_num: 16 + bin_num: + - 16 + +optimizer_cfg: + lr: 0.001 + solver: AdamW + weight_decay: 0.0005 + +scheduler_cfg: + gamma: 0.1 + milestones: + - 5000 + - 7000 + - 9000 + scheduler: MultiStepLR + +trainer_cfg: + enable_float16: true + fix_BN: false + with_test: true + log_iter: 100 + restore_ckpt_strict: true + optimizer_reset: true + scheduler_reset: true + restore_hint: output/Scoliosis1K/ScoNet/ScoNet_skeleton_112_sigma15_joint8_bodyonly_plaince_bridge_1gpu_10k/checkpoints/latest.pt + auto_resume_latest: true + resume_every_iter: 500 + resume_keep: 6 + eval_iter: 1000 + save_iter: 1000 + save_name: ScoNet_skeleton_112_sigma15_joint8_bodyonly_plaince_adamw_finetune_1gpu_20k + sync_BN: false + total_iter: 20000 + sampler: + batch_shuffle: true + batch_size: + - 8 + - 8 + frames_num_fixed: 30 + sample_type: fixed_unordered + type: TripletSampler + transform: + - type: BaseSilCuttingTransform diff --git a/configs/sconet/sconet_scoliosis1k_skeleton_112_sigma15_joint8_bodyonly_plaince_adamw_finetune_1gpu_40k.yaml b/configs/sconet/sconet_scoliosis1k_skeleton_112_sigma15_joint8_bodyonly_plaince_adamw_finetune_1gpu_40k.yaml new file mode 100644 index 0000000..7df1ff4 --- /dev/null +++ b/configs/sconet/sconet_scoliosis1k_skeleton_112_sigma15_joint8_bodyonly_plaince_adamw_finetune_1gpu_40k.yaml @@ -0,0 +1,111 @@ +data_cfg: + dataset_name: Scoliosis1K + dataset_root: /mnt/public/data/Scoliosis1K/Scoliosis1K-drf-pkl-118-sigma15-joint8-bodyonly + dataset_partition: ./datasets/Scoliosis1K/Scoliosis1K_112.json + data_in_use: + - true + - false + num_workers: 1 + remove_no_gallery: false + test_dataset_name: Scoliosis1K + +evaluator_cfg: + enable_float16: true + restore_ckpt_strict: true + restore_hint: 40000 + save_name: 
ScoNet_skeleton_112_sigma15_joint8_bodyonly_plaince_adamw_finetune_1gpu_40k + eval_func: evaluate_scoliosis + sampler: + batch_shuffle: false + batch_size: 1 + sample_type: all_ordered + type: InferenceSampler + frames_all_limit: 720 + metric: euc + transform: + - type: BaseSilCuttingTransform + +loss_cfg: + - loss_term_weight: 1.0 + margin: 0.2 + type: TripletLoss + log_prefix: triplet + - loss_term_weight: 1.0 + scale: 16 + type: CrossEntropyLoss + log_prefix: softmax + log_accuracy: true + +model_cfg: + model: ScoNet + backbone_cfg: + type: ResNet9 + block: BasicBlock + in_channel: 2 + channels: + - 64 + - 128 + - 256 + - 512 + layers: + - 1 + - 1 + - 1 + - 1 + strides: + - 1 + - 2 + - 2 + - 1 + maxpool: false + SeparateFCs: + in_channels: 512 + out_channels: 256 + parts_num: 16 + SeparateBNNecks: + class_num: 3 + in_channels: 256 + parts_num: 16 + bin_num: + - 16 + +optimizer_cfg: + lr: 0.001 + solver: AdamW + weight_decay: 0.0005 + +scheduler_cfg: + gamma: 0.1 + milestones: + - 10000 + - 20000 + - 30000 + scheduler: MultiStepLR + +trainer_cfg: + enable_float16: true + fix_BN: false + with_test: true + log_iter: 100 + restore_ckpt_strict: true + optimizer_reset: true + scheduler_reset: true + restore_hint: output/Scoliosis1K/ScoNet/ScoNet_skeleton_112_sigma15_joint8_bodyonly_plaince_bridge_1gpu_10k/checkpoints/latest.pt + auto_resume_latest: true + resume_every_iter: 500 + resume_keep: 6 + eval_iter: 1000 + save_iter: 1000 + save_name: ScoNet_skeleton_112_sigma15_joint8_bodyonly_plaince_adamw_finetune_1gpu_40k + sync_BN: false + total_iter: 40000 + sampler: + batch_shuffle: true + batch_size: + - 8 + - 8 + frames_num_fixed: 30 + sample_type: fixed_unordered + type: TripletSampler + transform: + - type: BaseSilCuttingTransform diff --git a/configs/sconet/sconet_scoliosis1k_skeleton_112_sigma15_joint8_bodyonly_plaince_main_1gpu_20k.yaml b/configs/sconet/sconet_scoliosis1k_skeleton_112_sigma15_joint8_bodyonly_plaince_main_1gpu_20k.yaml new file mode 100644 index 
0000000..9cd158e --- /dev/null +++ b/configs/sconet/sconet_scoliosis1k_skeleton_112_sigma15_joint8_bodyonly_plaince_main_1gpu_20k.yaml @@ -0,0 +1,110 @@ +data_cfg: + dataset_name: Scoliosis1K + dataset_root: /mnt/public/data/Scoliosis1K/Scoliosis1K-drf-pkl-118-sigma15-joint8-bodyonly + dataset_partition: ./datasets/Scoliosis1K/Scoliosis1K_112.json + data_in_use: + - true + - false + num_workers: 1 + remove_no_gallery: false + test_dataset_name: Scoliosis1K + +evaluator_cfg: + enable_float16: true + restore_ckpt_strict: true + restore_hint: 20000 + save_name: ScoNet_skeleton_112_sigma15_joint8_bodyonly_plaince_main_1gpu_20k + eval_func: evaluate_scoliosis + sampler: + batch_shuffle: false + batch_size: 1 + sample_type: all_ordered + type: InferenceSampler + frames_all_limit: 720 + metric: euc + transform: + - type: BaseSilCuttingTransform + +loss_cfg: + - loss_term_weight: 1.0 + margin: 0.2 + type: TripletLoss + log_prefix: triplet + - loss_term_weight: 1.0 + scale: 16 + type: CrossEntropyLoss + log_prefix: softmax + log_accuracy: true + +model_cfg: + model: ScoNet + backbone_cfg: + type: ResNet9 + block: BasicBlock + in_channel: 2 + channels: + - 64 + - 128 + - 256 + - 512 + layers: + - 1 + - 1 + - 1 + - 1 + strides: + - 1 + - 2 + - 2 + - 1 + maxpool: false + SeparateFCs: + in_channels: 512 + out_channels: 256 + parts_num: 16 + SeparateBNNecks: + class_num: 3 + in_channels: 256 + parts_num: 16 + bin_num: + - 16 + +optimizer_cfg: + lr: 0.1 + momentum: 0.9 + solver: SGD + weight_decay: 0.0005 + +scheduler_cfg: + gamma: 0.1 + milestones: + - 10000 + - 14000 + - 18000 + scheduler: MultiStepLR + +trainer_cfg: + enable_float16: true + fix_BN: false + with_test: true + log_iter: 100 + restore_ckpt_strict: true + restore_hint: 0 + auto_resume_latest: true + resume_every_iter: 500 + resume_keep: 6 + eval_iter: 1000 + save_iter: 1000 + save_name: ScoNet_skeleton_112_sigma15_joint8_bodyonly_plaince_main_1gpu_20k + sync_BN: false + total_iter: 20000 + sampler: + batch_shuffle: 
true + batch_size: + - 8 + - 8 + frames_num_fixed: 30 + sample_type: fixed_unordered + type: TripletSampler + transform: + - type: BaseSilCuttingTransform diff --git a/configs/sconet/sconet_scoliosis1k_skeleton_112_sigma15_joint8_bodyonly_plaince_resume_1gpu_20k.yaml b/configs/sconet/sconet_scoliosis1k_skeleton_112_sigma15_joint8_bodyonly_plaince_resume_1gpu_20k.yaml new file mode 100644 index 0000000..149680a --- /dev/null +++ b/configs/sconet/sconet_scoliosis1k_skeleton_112_sigma15_joint8_bodyonly_plaince_resume_1gpu_20k.yaml @@ -0,0 +1,112 @@ +data_cfg: + dataset_name: Scoliosis1K + dataset_root: /mnt/public/data/Scoliosis1K/Scoliosis1K-drf-pkl-118-sigma15-joint8-bodyonly + dataset_partition: ./datasets/Scoliosis1K/Scoliosis1K_112.json + data_in_use: + - true + - false + num_workers: 1 + remove_no_gallery: false + test_dataset_name: Scoliosis1K + +evaluator_cfg: + enable_float16: true + restore_ckpt_strict: true + restore_hint: 20000 + save_name: ScoNet_skeleton_112_sigma15_joint8_bodyonly_plaince_resume_1gpu_20k + eval_func: evaluate_scoliosis + sampler: + batch_shuffle: false + batch_size: 1 + sample_type: all_ordered + type: InferenceSampler + frames_all_limit: 720 + metric: euc + transform: + - type: BaseSilCuttingTransform + +loss_cfg: + - loss_term_weight: 1.0 + margin: 0.2 + type: TripletLoss + log_prefix: triplet + - loss_term_weight: 1.0 + scale: 16 + type: CrossEntropyLoss + log_prefix: softmax + log_accuracy: true + +model_cfg: + model: ScoNet + backbone_cfg: + type: ResNet9 + block: BasicBlock + in_channel: 2 + channels: + - 64 + - 128 + - 256 + - 512 + layers: + - 1 + - 1 + - 1 + - 1 + strides: + - 1 + - 2 + - 2 + - 1 + maxpool: false + SeparateFCs: + in_channels: 512 + out_channels: 256 + parts_num: 16 + SeparateBNNecks: + class_num: 3 + in_channels: 256 + parts_num: 16 + bin_num: + - 16 + +optimizer_cfg: + lr: 0.1 + momentum: 0.9 + solver: SGD + weight_decay: 0.0005 + +scheduler_cfg: + gamma: 0.1 + milestones: + - 10000 + - 14000 + - 18000 + 
scheduler: MultiStepLR + +trainer_cfg: + enable_float16: true + fix_BN: false + with_test: true + log_iter: 100 + restore_ckpt_strict: true + optimizer_reset: false + scheduler_reset: false + restore_hint: output/Scoliosis1K/ScoNet/ScoNet_skeleton_112_sigma15_joint8_bodyonly_plaince_bridge_1gpu_10k/checkpoints/latest.pt + auto_resume_latest: true + resume_every_iter: 500 + resume_keep: 6 + eval_iter: 1000 + save_iter: 1000 + save_name: ScoNet_skeleton_112_sigma15_joint8_bodyonly_plaince_resume_1gpu_20k + sync_BN: false + total_iter: 20000 + sampler: + batch_shuffle: true + batch_size: + - 8 + - 8 + frames_num_fixed: 30 + sample_type: fixed_unordered + type: TripletSampler + transform: + - type: BaseSilCuttingTransform diff --git a/docs/5.advanced_usages.md b/docs/5.advanced_usages.md index 54adf40..80a3efd 100644 --- a/docs/5.advanced_usages.md +++ b/docs/5.advanced_usages.md @@ -85,4 +85,28 @@ >> if torch.distributed.get_rank() == 0 and self.training and self.iteration % 100==0: >> summary_writer.add_video('outs', outs.mean(2).unsqueeze(2), self.iteration) >> ``` -> Note that this example requires the [`moviepy`](https://github.com/Zulko/moviepy) package, and hence you should run `pip install moviepy` first. \ No newline at end of file +> Note that this example requires the [`moviepy`](https://github.com/Zulko/moviepy) package, and hence you should run `pip install moviepy` first. + +### Keep Best Checkpoints +> If you want to retain the strongest evaluation checkpoints instead of relying only on the latest or final save, you can enable best-checkpoint tracking in `trainer_cfg`. +> +> Example: +>> ```yaml +>> trainer_cfg: +>> with_test: true +>> eval_iter: 1000 +>> save_iter: 1000 +>> best_ckpt_cfg: +>> keep_n: 3 +>> metric_names: +>> - scalar/test_f1/ +>> - scalar/test_accuracy/ +>> ``` +> +> Behavior: +> * The normal numbered checkpoints are still written by `save_iter`. 
+> * After each eval, the trainer checks the configured scalar metrics and keeps the top `N` checkpoints separately for each metric. +> * Best checkpoints are saved under `output/.../checkpoints/best/<metric_slug>/`. +> * Each best-metric directory contains an `index.json` file with the retained iterations, scores, and paths. +> +> This is useful for long or unstable runs where the best checkpoint may appear well before the final iteration. diff --git a/docs/scoliosis_reproducibility_audit.md b/docs/scoliosis_reproducibility_audit.md index e583360..2414507 100644 --- a/docs/scoliosis_reproducibility_audit.md +++ b/docs/scoliosis_reproducibility_audit.md @@ -164,6 +164,7 @@ Conclusion: - on the same split, `body-only + plain CE` improved that further to `83.16 Acc / 68.24 Prec / 80.02 Rec / 68.47 F1` at `7000` - a later explicit rerun of the `body-only + plain CE` `7000` full-test eval reproduced that same `83.16 / 68.24 / 80.02 / 68.47` result - adding back limited head context via `head-lite` did not improve the full-test score; its `7000` checkpoint reached only `78.07 Acc / 65.42 Prec / 80.50 Rec / 62.08 F1` +- the first practical DRF bridge on the same `1:1:2` body-only recipe peaked early and still underperformed the plain skeleton baseline; its best retained `2000` checkpoint reached only `80.21 Acc / 58.92 Prec / 59.23 Rec / 57.84 F1` on the full test set ### Not reproducible with current evidence @@ -179,6 +180,10 @@ Conclusion: - the `1:1:8` class ratio is not just a nuisance; it appears to be a major driver of the current skeleton/DRF failure mode - on the easier `1:1:2` split, weighted CE is not currently the winning recipe; the best local full-test result so far came from plain CE - `head-lite` may help the small fixed proxy subset, but that gain did not transfer to the full `TEST_SET`, so `body-only + plain CE` remains the best practical skeleton recipe +- DRF currently looks worse than the plain skeleton baseline not because the skeleton path is dead, but because 
the additional prior branch is not yet providing a selective or stable complement. The current local evidence points to three likely causes: + - the body-only skeleton baseline already captures most of the useful torso signal on `1:1:2`, so PAV may be largely redundant in this setting + - the current PGA/PAV path appears weakly selective in local diagnostics, so the prior is not clearly emphasizing a few clinically relevant parts + - DRF peaks very early and then degrades, which suggests the added branch is making optimization less stable without improving the final decision boundary ## Recommended standard for future work in this repo diff --git a/docs/scoliosis_training_change_log.md b/docs/scoliosis_training_change_log.md index e5745e7..3279e10 100644 --- a/docs/scoliosis_training_change_log.md +++ b/docs/scoliosis_training_change_log.md @@ -37,7 +37,11 @@ Use it for: | 2026-03-10 | `ScoNet_skeleton_112_sigma15_joint8_bodyonly_plaince_bridge_2gpu_10k` | ScoNet-MT-ske bridge | `Scoliosis1K-drf-pkl-118-sigma15-joint8-bodyonly` + `Scoliosis1K_112.json` | Same `1:1:2` body-only bridge as above, but removed weighted CE to test whether class weighting was suppressing precision on the easier split | interrupted | superseded before meaningful progress by the user-requested 1-GPU rerun on the `5070 Ti` | | 2026-03-10 | `ScoNet_skeleton_112_sigma15_joint8_bodyonly_plaince_bridge_1gpu_10k` | ScoNet-MT-ske bridge | `Scoliosis1K-drf-pkl-118-sigma15-joint8-bodyonly` + `Scoliosis1K_112.json` | Same plain-CE `1:1:2` bridge, relaunched on the `5070 Ti` only per user request | complete | best proxy subset at `7000`: `88.28/69.12/74.15/68.80`; full test at `7000`: `83.16/68.24/80.02/68.47`; final proxy at `10000`: `75.00/65.00/63.41/54.55` (Acc/Prec/Rec/F1) | | 2026-03-10 | `ScoNet_skeleton_112_sigma15_joint8_headlite_plaince_bridge_1gpu_10k` | ScoNet-MT-ske bridge | `Scoliosis1K-drf-pkl-112-sigma15-joint8-headlite` + `Scoliosis1K_112.json` | Added `head-lite` structure (nose plus 
shoulder links, no eyes/ears) on top of the plain-CE `1:1:2` bridge; first `3090` launch OOMed due unrelated occupancy, then relaunched on the UUID-pinned `5070 Ti` | complete | best proxy subset at `7000`: `86.72/70.15/89.00/70.44`; full test at `7000`: `78.07/65.42/80.50/62.08` (Acc/Prec/Rec/F1) | -| 2026-03-10 | `DRF_skeleton_112_sigma15_joint8_bodyonly_plaince_bridge_1gpu_10k` | DRF bridge | `Scoliosis1K-drf-pkl-118-sigma15-joint8-bodyonly` + `Scoliosis1K_112.json` | First practical DRF run on the winning `1:1:2` skeleton recipe: `body-only`, plain CE, SGD, `10k` bridge schedule, fixed proxy subset seed `112` | running | pending | +| 2026-03-10 | `DRF_skeleton_112_sigma15_joint8_bodyonly_plaince_bridge_1gpu_10k` | DRF bridge | `Scoliosis1K-drf-pkl-118-sigma15-joint8-bodyonly` + `Scoliosis1K_112.json` | First practical DRF run on the winning `1:1:2` skeleton recipe: `body-only`, plain CE, SGD, `10k` bridge schedule, fixed proxy subset seed `112` | complete | best proxy subset at `2000`: `88.28/61.79/60.31/60.93`; full test at `2000`: `80.21/58.92/59.23/57.84` (Acc/Prec/Rec/F1) | +| 2026-03-10 | `ScoNet_skeleton_112_sigma15_joint8_bodyonly_plaince_main_1gpu_20k` | ScoNet-MT-ske mainline | `Scoliosis1K-drf-pkl-118-sigma15-joint8-bodyonly` + `Scoliosis1K_112.json` | Promoted the winning practical skeleton recipe to a longer `20k` run with full `TEST_SET` eval and checkpoint save every `1000`; no proxy subset, same plain CE + SGD setup | interrupted | superseded by the true-resume continuation below | +| 2026-03-10 | `ScoNet_skeleton_112_sigma15_joint8_bodyonly_plaince_resume_1gpu_20k` | ScoNet-MT-ske mainline | `Scoliosis1K-drf-pkl-118-sigma15-joint8-bodyonly` + `Scoliosis1K_112.json` | True continuation of the earlier plain-CE `1:1:2` `10k` bridge from its `latest.pt`, extended to `20k` with full `TEST_SET` eval and checkpoint save every `1000` | interrupted | superseded by the AdamW finetune branch below | +| 2026-03-10 | 
`ScoNet_skeleton_112_sigma15_joint8_bodyonly_plaince_adamw_finetune_1gpu_20k` | ScoNet-MT-ske finetune | `Scoliosis1K-drf-pkl-118-sigma15-joint8-bodyonly` + `Scoliosis1K_112.json` | AdamW finetune from the earlier plain-CE `1:1:2` `10k` checkpoint; restores model weights only, resets optimizer/scheduler state, keeps full `TEST_SET` eval and checkpoint save every `1000` | interrupted | superseded by the longer overnight 40k finetune below | +| 2026-03-10 | `ScoNet_skeleton_112_sigma15_joint8_bodyonly_plaince_adamw_finetune_1gpu_40k` | ScoNet-MT-ske finetune | `Scoliosis1K-drf-pkl-118-sigma15-joint8-bodyonly` + `Scoliosis1K_112.json` | Longer overnight AdamW finetune from the same `10k` plain-CE checkpoint; restores model weights only, resets optimizer/scheduler state, extends to `40000` total iterations with full `TEST_SET` eval every `1000` | running | pending | ## Current best skeleton baseline @@ -63,3 +67,4 @@ Current best `ScoNet-MT-ske`-style result: - Removing weighted CE on the `1:1:2` bridge improved the current best full-test result further: `body-only + plain CE` reached `83.16 Acc / 68.24 Prec / 80.02 Rec / 68.47 F1` at `7000`, so weighted CE does not currently look beneficial on the easier split. - A later full-test rerun of the retained `body-only + plain CE` `7000` checkpoint reproduced the same `83.16 / 68.24 / 80.02 / 68.47` result exactly, so that number is now explicitly reconfirmed rather than just carried forward from the original run log. - `Head-lite` looked stronger than `body-only` on the fixed 128-sequence proxy subset at `7000`, but it did not transfer to the full test set: `78.07 Acc / 65.42 Prec / 80.50 Rec / 62.08 F1`, which is clearly below the `body-only + plain CE` full-test result. +- The first practical DRF bridge on the winning `1:1:2` recipe did not beat the plain skeleton baseline. 
Its best retained checkpoint (`2000`) reached only `80.21 Acc / 58.92 Prec / 59.23 Rec / 57.84 F1` on the full test set, versus `83.16 / 68.24 / 80.02 / 68.47` for `body-only + plain CE` at `7000`. The working local interpretation is that the added PAV/PGA path is currently injecting a weak or noisy prior rather than a useful complementary signal. diff --git a/docs/systemd-run-training.md b/docs/systemd-run-training.md index faeb3bd..b003ca1 100644 --- a/docs/systemd-run-training.md +++ b/docs/systemd-run-training.md @@ -124,6 +124,33 @@ The launcher configures both: This makes it easier to recover logs even if the original shell or tool session disappears. +## Moving outputs off the SSD + +OpenGait writes checkpoints, TensorBoard summaries, best-checkpoint snapshots, and file logs under a run output root. + +By default that root is `output/`, but you can override it per run with `output_root` in the engine config: + +```yaml +trainer_cfg: + output_root: /mnt/hddl/data/OpenGait-output + +evaluator_cfg: + output_root: /mnt/hddl/data/OpenGait-output +``` + +The final path layout stays the same under that root: + +```text +<output_root>/<dataset_name>/<model>/<save_name>/ +``` + +For long scoliosis runs, using an HDD-backed root is recommended so local SSD space is not consumed by: + +- numbered checkpoints +- rolling resume checkpoints +- best-N retained checkpoints +- TensorBoard summary files + ## GPU selection Prefer GPU UUIDs, not ordinal indices. 
diff --git a/opengait/main.py b/opengait/main.py index dc2eec2..3da1ff8 100644 --- a/opengait/main.py +++ b/opengait/main.py @@ -4,7 +4,14 @@ import argparse import torch import torch.nn as nn from modeling import models -from opengait.utils import config_loader, get_ddp_module, init_seeds, params_count, get_msg_mgr +from opengait.utils import ( + config_loader, + get_ddp_module, + get_msg_mgr, + init_seeds, + params_count, + resolve_output_path, +) parser = argparse.ArgumentParser(description='Main program for opengait.') parser.add_argument('--local_rank', type=int, default=0, @@ -25,8 +32,7 @@ def initialization(cfgs, training): msg_mgr = get_msg_mgr() engine_cfg = cfgs['trainer_cfg'] if training else cfgs['evaluator_cfg'] logger_cfg = cfgs.get('logger_cfg', {}) - output_path = os.path.join('output/', cfgs['data_cfg']['dataset_name'], - cfgs['model_cfg']['model'], engine_cfg['save_name']) + output_path = resolve_output_path(cfgs, engine_cfg) if training: msg_mgr.init_manager( output_path, diff --git a/opengait/modeling/base_model.py b/opengait/modeling/base_model.py index 28d7ac9..3dcfca1 100644 --- a/opengait/modeling/base_model.py +++ b/opengait/modeling/base_model.py @@ -10,8 +10,10 @@ BaseModel.run_train(model) BaseModel.run_test(model) """ import json +import math import os import random +import re from typing import Any import numpy as np @@ -33,7 +35,7 @@ from data.transform import get_transform from data.collate_fn import CollateFn from data.dataset import DataSet import data.sampler as Samplers -from opengait.utils import Odict, mkdir, ddp_all_gather +from opengait.utils import Odict, mkdir, ddp_all_gather, resolve_output_path from opengait.utils import get_valid_args, is_list, is_dict, np2var, ts2np, list2var, get_attr_from from evaluation import evaluator as eval_functions from opengait.utils import NoOp @@ -144,8 +146,7 @@ class BaseModel(MetaModel, nn.Module): if training and self.engine_cfg['enable_float16']: self.Scaler = GradScaler() - 
self.save_path = osp.join('output/', cfgs['data_cfg']['dataset_name'], - cfgs['model_cfg']['model'], self.engine_cfg['save_name']) + self.save_path = resolve_output_path(cfgs, self.engine_cfg) self.build_network(cfgs['model_cfg']) self.init_parameters() @@ -317,6 +318,134 @@ class BaseModel(MetaModel, nn.Module): return candidate return None + def _best_ckpt_cfg(self) -> dict[str, Any] | None: + best_ckpt_cfg = self.engine_cfg.get('best_ckpt_cfg') + if not isinstance(best_ckpt_cfg, dict): + return None + keep_n = int(best_ckpt_cfg.get('keep_n', 0)) + metric_names = best_ckpt_cfg.get('metric_names', []) + if keep_n <= 0 or not isinstance(metric_names, list) or not metric_names: + return None + return best_ckpt_cfg + + def _best_ckpt_root(self) -> str: + return osp.join(self._checkpoint_dir(), "best") + + def _best_metric_dir(self, metric_name: str) -> str: + metric_slug = re.sub(r"[^A-Za-z0-9_.-]+", "_", metric_name).strip("._") + return osp.join(self._best_ckpt_root(), metric_slug) + + def _best_metric_index_path(self, metric_name: str) -> str: + return osp.join(self._best_metric_dir(metric_name), "index.json") + + def _load_best_metric_index(self, metric_name: str) -> list[dict[str, Any]]: + index_path = self._best_metric_index_path(metric_name) + if not osp.isfile(index_path): + return [] + with open(index_path, "r", encoding="utf-8") as handle: + raw_entries = json.load(handle) + if not isinstance(raw_entries, list): + return [] + entries: list[dict[str, Any]] = [] + for entry in raw_entries: + if not isinstance(entry, dict): + continue + path = entry.get("path") + if isinstance(path, str) and osp.isfile(path): + entries.append(entry) + return entries + + def _write_best_metric_index( + self, + metric_name: str, + entries: list[dict[str, Any]], + ) -> None: + index_path = self._best_metric_index_path(metric_name) + mkdir(osp.dirname(index_path)) + tmp_path = index_path + ".tmp" + with open(tmp_path, "w", encoding="utf-8") as handle: + json.dump(entries, handle, 
indent=2, sort_keys=True) + os.replace(tmp_path, index_path) + + def _summary_scalar(self, value: Any) -> float | None: + if isinstance(value, torch.Tensor): + return float(value.detach().float().mean().item()) + if isinstance(value, np.ndarray): + return float(np.mean(value)) + if isinstance(value, (float, int, np.floating, np.integer)): + return float(value) + return None + + def _save_best_ckpts( + self, + iteration: int, + result_dict: dict[str, Any], + ) -> None: + if torch.distributed.get_rank() != 0: + return + best_ckpt_cfg = self._best_ckpt_cfg() + if best_ckpt_cfg is None: + return + + keep_n = int(best_ckpt_cfg['keep_n']) + metric_names = [metric for metric in best_ckpt_cfg['metric_names'] if metric in result_dict] + if not metric_names: + return + + checkpoint: dict[str, Any] | None = None + save_name = self.engine_cfg['save_name'] + + for metric_name in metric_names: + score = self._summary_scalar(result_dict.get(metric_name)) + if score is None or not math.isfinite(score): + continue + + entries = [ + entry for entry in self._load_best_metric_index(metric_name) + if int(entry.get("iteration", -1)) != iteration + ] + ranked_entries = sorted( + entries + [{"iteration": iteration, "score": score, "path": ""}], + key=lambda entry: (float(entry["score"]), int(entry["iteration"])), + reverse=True, + ) + kept_entries = ranked_entries[:keep_n] + if not any(int(entry["iteration"]) == iteration for entry in kept_entries): + continue + + metric_dir = self._best_metric_dir(metric_name) + mkdir(metric_dir) + metric_slug = osp.basename(metric_dir) + best_path = osp.join( + metric_dir, + f"{save_name}-iter-{iteration:0>5}-score-{score:.4f}-{metric_slug}.pt", + ) + + if checkpoint is None: + checkpoint = self._build_checkpoint(iteration) + self._save_checkpoint_file(checkpoint, best_path) + + refreshed_entries = [] + for entry in kept_entries: + if int(entry["iteration"]) == iteration: + refreshed_entries.append( + { + "iteration": iteration, + "score": score, + 
"path": best_path, + } + ) + else: + refreshed_entries.append(entry) + + keep_paths = {entry["path"] for entry in refreshed_entries if isinstance(entry.get("path"), str)} + for stale_entry in entries: + stale_path = stale_entry.get("path") + if isinstance(stale_path, str) and stale_path not in keep_paths and osp.isfile(stale_path): + os.remove(stale_path) + + self._write_best_metric_index(metric_name, refreshed_entries) + def save_ckpt(self, iteration): if torch.distributed.get_rank() == 0: save_name = self.engine_cfg['save_name'] @@ -589,6 +718,7 @@ class BaseModel(MetaModel, nn.Module): if result_dict: model.msg_mgr.write_to_tensorboard(result_dict) model.msg_mgr.write_to_wandb(result_dict) + model._save_best_ckpts(model.iteration, result_dict) model.msg_mgr.reset_time() if model.iteration >= model.engine_cfg['total_iter']: break diff --git a/opengait/utils/__init__.py b/opengait/utils/__init__.py index 8f72cd7..ffad61b 100644 --- a/opengait/utils/__init__.py +++ b/opengait/utils/__init__.py @@ -7,4 +7,5 @@ from .common import mkdir, clones from .common import MergeCfgsDict from .common import get_attr_from from .common import NoOp -from .msg_manager import get_msg_mgr \ No newline at end of file +from .common import resolve_output_path +from .msg_manager import get_msg_mgr diff --git a/opengait/utils/common.py b/opengait/utils/common.py index dfd39e0..adeef57 100644 --- a/opengait/utils/common.py +++ b/opengait/utils/common.py @@ -2,6 +2,7 @@ import copy import os import inspect import logging +from pathlib import Path import torch import numpy as np import torch.nn as nn @@ -203,3 +204,19 @@ def get_ddp_module(module, find_unused_parameters=False, **kwargs): def params_count(net): n_parameters = sum(p.numel() for p in net.parameters()) return 'Parameters Count: {:.5f}M'.format(n_parameters / 1e6) + + +def resolve_output_path(cfgs, engine_cfg): + output_root = ( + engine_cfg.get('output_root') + or cfgs.get('output_root') + or 
os.environ.get('OPENGAIT_OUTPUT_ROOT') + or 'output' + ) + output_root = str(Path(output_root).expanduser()) + return os.path.join( + output_root, + cfgs['data_cfg']['dataset_name'], + cfgs['model_cfg']['model'], + engine_cfg['save_name'], + )