feat: retain best checkpoints and support alternate output roots
This commit is contained in:
+69
@@ -0,0 +1,69 @@
|
||||
data_cfg:
|
||||
dataset_name: Scoliosis1K
|
||||
dataset_root: /mnt/public/data/Scoliosis1K/Scoliosis1K-drf-pkl-118-sigma15-joint8-bodyonly
|
||||
dataset_partition: ./datasets/Scoliosis1K/Scoliosis1K_112.json
|
||||
num_workers: 1
|
||||
remove_no_gallery: false
|
||||
|
||||
evaluator_cfg:
|
||||
enable_float16: true
|
||||
restore_ckpt_strict: true
|
||||
restore_hint: 2000
|
||||
save_name: DRF_skeleton_112_sigma15_joint8_bodyonly_plaince_bridge_1gpu_10k
|
||||
eval_func: evaluate_scoliosis
|
||||
sampler:
|
||||
batch_shuffle: false
|
||||
batch_size: 1
|
||||
sample_type: all_ordered
|
||||
type: InferenceSampler
|
||||
frames_all_limit: 720
|
||||
metric: euc
|
||||
transform:
|
||||
- type: BaseSilCuttingTransform
|
||||
- type: NoOperation
|
||||
|
||||
loss_cfg:
|
||||
- loss_term_weight: 1.0
|
||||
margin: 0.2
|
||||
type: TripletLoss
|
||||
log_prefix: triplet
|
||||
- loss_term_weight: 1.0
|
||||
scale: 16
|
||||
type: CrossEntropyLoss
|
||||
log_prefix: softmax
|
||||
log_accuracy: true
|
||||
|
||||
model_cfg:
|
||||
model: DRF
|
||||
num_pairs: 8
|
||||
num_metrics: 3
|
||||
backbone_cfg:
|
||||
type: ResNet9
|
||||
block: BasicBlock
|
||||
in_channel: 2
|
||||
channels:
|
||||
- 64
|
||||
- 128
|
||||
- 256
|
||||
- 512
|
||||
layers:
|
||||
- 1
|
||||
- 1
|
||||
- 1
|
||||
- 1
|
||||
strides:
|
||||
- 1
|
||||
- 2
|
||||
- 2
|
||||
- 1
|
||||
maxpool: false
|
||||
SeparateFCs:
|
||||
in_channels: 512
|
||||
out_channels: 256
|
||||
parts_num: 16
|
||||
SeparateBNNecks:
|
||||
class_num: 3
|
||||
in_channels: 256
|
||||
parts_num: 16
|
||||
bin_num:
|
||||
- 16
|
||||
+115
@@ -0,0 +1,115 @@
|
||||
data_cfg:
|
||||
dataset_name: Scoliosis1K
|
||||
dataset_root: /mnt/public/data/Scoliosis1K/Scoliosis1K-drf-pkl-118-sigma15-joint8-bodyonly
|
||||
dataset_partition: ./datasets/Scoliosis1K/Scoliosis1K_112.json
|
||||
data_in_use:
|
||||
- true
|
||||
- false
|
||||
num_workers: 1
|
||||
remove_no_gallery: false
|
||||
test_dataset_name: Scoliosis1K
|
||||
|
||||
evaluator_cfg:
|
||||
enable_float16: true
|
||||
restore_ckpt_strict: true
|
||||
restore_hint: 80000
|
||||
save_name: ScoNet_skeleton_112_sigma15_joint8_bodyonly_plaince_adamw_cosine_finetune_1gpu_80k
|
||||
output_root: /mnt/hddl/data/OpenGait-output
|
||||
eval_func: evaluate_scoliosis
|
||||
sampler:
|
||||
batch_shuffle: false
|
||||
batch_size: 1
|
||||
sample_type: all_ordered
|
||||
type: InferenceSampler
|
||||
frames_all_limit: 720
|
||||
metric: euc
|
||||
transform:
|
||||
- type: BaseSilCuttingTransform
|
||||
|
||||
loss_cfg:
|
||||
- loss_term_weight: 1.0
|
||||
margin: 0.2
|
||||
type: TripletLoss
|
||||
log_prefix: triplet
|
||||
- loss_term_weight: 1.0
|
||||
scale: 16
|
||||
type: CrossEntropyLoss
|
||||
log_prefix: softmax
|
||||
log_accuracy: true
|
||||
|
||||
model_cfg:
|
||||
model: ScoNet
|
||||
backbone_cfg:
|
||||
type: ResNet9
|
||||
block: BasicBlock
|
||||
in_channel: 2
|
||||
channels:
|
||||
- 64
|
||||
- 128
|
||||
- 256
|
||||
- 512
|
||||
layers:
|
||||
- 1
|
||||
- 1
|
||||
- 1
|
||||
- 1
|
||||
strides:
|
||||
- 1
|
||||
- 2
|
||||
- 2
|
||||
- 1
|
||||
maxpool: false
|
||||
SeparateFCs:
|
||||
in_channels: 512
|
||||
out_channels: 256
|
||||
parts_num: 16
|
||||
SeparateBNNecks:
|
||||
class_num: 3
|
||||
in_channels: 256
|
||||
parts_num: 16
|
||||
bin_num:
|
||||
- 16
|
||||
|
||||
optimizer_cfg:
|
||||
lr: 0.0005
|
||||
solver: AdamW
|
||||
weight_decay: 0.0005
|
||||
|
||||
scheduler_cfg:
|
||||
scheduler: CosineAnnealingLR
|
||||
T_max: 60000
|
||||
eta_min: 0.00001
|
||||
|
||||
trainer_cfg:
|
||||
enable_float16: true
|
||||
fix_BN: false
|
||||
with_test: true
|
||||
log_iter: 100
|
||||
restore_ckpt_strict: true
|
||||
optimizer_reset: false
|
||||
scheduler_reset: false
|
||||
restore_hint: /mnt/hddl/data/OpenGait-output/Scoliosis1K/ScoNet/ScoNet_skeleton_112_sigma15_joint8_bodyonly_plaince_adamw_cosine_finetune_1gpu_80k/checkpoints/latest.pt
|
||||
output_root: /mnt/hddl/data/OpenGait-output
|
||||
auto_resume_latest: true
|
||||
resume_every_iter: 500
|
||||
resume_keep: 6
|
||||
best_ckpt_cfg:
|
||||
keep_n: 3
|
||||
metric_names:
|
||||
- scalar/test_f1/
|
||||
- scalar/test_accuracy/
|
||||
eval_iter: 1000
|
||||
save_iter: 500
|
||||
save_name: ScoNet_skeleton_112_sigma15_joint8_bodyonly_plaince_adamw_cosine_finetune_1gpu_80k
|
||||
sync_BN: false
|
||||
total_iter: 80000
|
||||
sampler:
|
||||
batch_shuffle: true
|
||||
batch_size:
|
||||
- 8
|
||||
- 8
|
||||
frames_num_fixed: 30
|
||||
sample_type: fixed_unordered
|
||||
type: TripletSampler
|
||||
transform:
|
||||
- type: BaseSilCuttingTransform
|
||||
+111
@@ -0,0 +1,111 @@
|
||||
data_cfg:
|
||||
dataset_name: Scoliosis1K
|
||||
dataset_root: /mnt/public/data/Scoliosis1K/Scoliosis1K-drf-pkl-118-sigma15-joint8-bodyonly
|
||||
dataset_partition: ./datasets/Scoliosis1K/Scoliosis1K_112.json
|
||||
data_in_use:
|
||||
- true
|
||||
- false
|
||||
num_workers: 1
|
||||
remove_no_gallery: false
|
||||
test_dataset_name: Scoliosis1K
|
||||
|
||||
evaluator_cfg:
|
||||
enable_float16: true
|
||||
restore_ckpt_strict: true
|
||||
restore_hint: 20000
|
||||
save_name: ScoNet_skeleton_112_sigma15_joint8_bodyonly_plaince_adamw_finetune_1gpu_20k
|
||||
eval_func: evaluate_scoliosis
|
||||
sampler:
|
||||
batch_shuffle: false
|
||||
batch_size: 1
|
||||
sample_type: all_ordered
|
||||
type: InferenceSampler
|
||||
frames_all_limit: 720
|
||||
metric: euc
|
||||
transform:
|
||||
- type: BaseSilCuttingTransform
|
||||
|
||||
loss_cfg:
|
||||
- loss_term_weight: 1.0
|
||||
margin: 0.2
|
||||
type: TripletLoss
|
||||
log_prefix: triplet
|
||||
- loss_term_weight: 1.0
|
||||
scale: 16
|
||||
type: CrossEntropyLoss
|
||||
log_prefix: softmax
|
||||
log_accuracy: true
|
||||
|
||||
model_cfg:
|
||||
model: ScoNet
|
||||
backbone_cfg:
|
||||
type: ResNet9
|
||||
block: BasicBlock
|
||||
in_channel: 2
|
||||
channels:
|
||||
- 64
|
||||
- 128
|
||||
- 256
|
||||
- 512
|
||||
layers:
|
||||
- 1
|
||||
- 1
|
||||
- 1
|
||||
- 1
|
||||
strides:
|
||||
- 1
|
||||
- 2
|
||||
- 2
|
||||
- 1
|
||||
maxpool: false
|
||||
SeparateFCs:
|
||||
in_channels: 512
|
||||
out_channels: 256
|
||||
parts_num: 16
|
||||
SeparateBNNecks:
|
||||
class_num: 3
|
||||
in_channels: 256
|
||||
parts_num: 16
|
||||
bin_num:
|
||||
- 16
|
||||
|
||||
optimizer_cfg:
|
||||
lr: 0.001
|
||||
solver: AdamW
|
||||
weight_decay: 0.0005
|
||||
|
||||
scheduler_cfg:
|
||||
gamma: 0.1
|
||||
milestones:
|
||||
- 5000
|
||||
- 7000
|
||||
- 9000
|
||||
scheduler: MultiStepLR
|
||||
|
||||
trainer_cfg:
|
||||
enable_float16: true
|
||||
fix_BN: false
|
||||
with_test: true
|
||||
log_iter: 100
|
||||
restore_ckpt_strict: true
|
||||
optimizer_reset: true
|
||||
scheduler_reset: true
|
||||
restore_hint: output/Scoliosis1K/ScoNet/ScoNet_skeleton_112_sigma15_joint8_bodyonly_plaince_bridge_1gpu_10k/checkpoints/latest.pt
|
||||
auto_resume_latest: true
|
||||
resume_every_iter: 500
|
||||
resume_keep: 6
|
||||
eval_iter: 1000
|
||||
save_iter: 1000
|
||||
save_name: ScoNet_skeleton_112_sigma15_joint8_bodyonly_plaince_adamw_finetune_1gpu_20k
|
||||
sync_BN: false
|
||||
total_iter: 20000
|
||||
sampler:
|
||||
batch_shuffle: true
|
||||
batch_size:
|
||||
- 8
|
||||
- 8
|
||||
frames_num_fixed: 30
|
||||
sample_type: fixed_unordered
|
||||
type: TripletSampler
|
||||
transform:
|
||||
- type: BaseSilCuttingTransform
|
||||
+111
@@ -0,0 +1,111 @@
|
||||
data_cfg:
|
||||
dataset_name: Scoliosis1K
|
||||
dataset_root: /mnt/public/data/Scoliosis1K/Scoliosis1K-drf-pkl-118-sigma15-joint8-bodyonly
|
||||
dataset_partition: ./datasets/Scoliosis1K/Scoliosis1K_112.json
|
||||
data_in_use:
|
||||
- true
|
||||
- false
|
||||
num_workers: 1
|
||||
remove_no_gallery: false
|
||||
test_dataset_name: Scoliosis1K
|
||||
|
||||
evaluator_cfg:
|
||||
enable_float16: true
|
||||
restore_ckpt_strict: true
|
||||
restore_hint: 40000
|
||||
save_name: ScoNet_skeleton_112_sigma15_joint8_bodyonly_plaince_adamw_finetune_1gpu_40k
|
||||
eval_func: evaluate_scoliosis
|
||||
sampler:
|
||||
batch_shuffle: false
|
||||
batch_size: 1
|
||||
sample_type: all_ordered
|
||||
type: InferenceSampler
|
||||
frames_all_limit: 720
|
||||
metric: euc
|
||||
transform:
|
||||
- type: BaseSilCuttingTransform
|
||||
|
||||
loss_cfg:
|
||||
- loss_term_weight: 1.0
|
||||
margin: 0.2
|
||||
type: TripletLoss
|
||||
log_prefix: triplet
|
||||
- loss_term_weight: 1.0
|
||||
scale: 16
|
||||
type: CrossEntropyLoss
|
||||
log_prefix: softmax
|
||||
log_accuracy: true
|
||||
|
||||
model_cfg:
|
||||
model: ScoNet
|
||||
backbone_cfg:
|
||||
type: ResNet9
|
||||
block: BasicBlock
|
||||
in_channel: 2
|
||||
channels:
|
||||
- 64
|
||||
- 128
|
||||
- 256
|
||||
- 512
|
||||
layers:
|
||||
- 1
|
||||
- 1
|
||||
- 1
|
||||
- 1
|
||||
strides:
|
||||
- 1
|
||||
- 2
|
||||
- 2
|
||||
- 1
|
||||
maxpool: false
|
||||
SeparateFCs:
|
||||
in_channels: 512
|
||||
out_channels: 256
|
||||
parts_num: 16
|
||||
SeparateBNNecks:
|
||||
class_num: 3
|
||||
in_channels: 256
|
||||
parts_num: 16
|
||||
bin_num:
|
||||
- 16
|
||||
|
||||
optimizer_cfg:
|
||||
lr: 0.001
|
||||
solver: AdamW
|
||||
weight_decay: 0.0005
|
||||
|
||||
scheduler_cfg:
|
||||
gamma: 0.1
|
||||
milestones:
|
||||
- 10000
|
||||
- 20000
|
||||
- 30000
|
||||
scheduler: MultiStepLR
|
||||
|
||||
trainer_cfg:
|
||||
enable_float16: true
|
||||
fix_BN: false
|
||||
with_test: true
|
||||
log_iter: 100
|
||||
restore_ckpt_strict: true
|
||||
optimizer_reset: true
|
||||
scheduler_reset: true
|
||||
restore_hint: output/Scoliosis1K/ScoNet/ScoNet_skeleton_112_sigma15_joint8_bodyonly_plaince_bridge_1gpu_10k/checkpoints/latest.pt
|
||||
auto_resume_latest: true
|
||||
resume_every_iter: 500
|
||||
resume_keep: 6
|
||||
eval_iter: 1000
|
||||
save_iter: 1000
|
||||
save_name: ScoNet_skeleton_112_sigma15_joint8_bodyonly_plaince_adamw_finetune_1gpu_40k
|
||||
sync_BN: false
|
||||
total_iter: 40000
|
||||
sampler:
|
||||
batch_shuffle: true
|
||||
batch_size:
|
||||
- 8
|
||||
- 8
|
||||
frames_num_fixed: 30
|
||||
sample_type: fixed_unordered
|
||||
type: TripletSampler
|
||||
transform:
|
||||
- type: BaseSilCuttingTransform
|
||||
+110
@@ -0,0 +1,110 @@
|
||||
data_cfg:
|
||||
dataset_name: Scoliosis1K
|
||||
dataset_root: /mnt/public/data/Scoliosis1K/Scoliosis1K-drf-pkl-118-sigma15-joint8-bodyonly
|
||||
dataset_partition: ./datasets/Scoliosis1K/Scoliosis1K_112.json
|
||||
data_in_use:
|
||||
- true
|
||||
- false
|
||||
num_workers: 1
|
||||
remove_no_gallery: false
|
||||
test_dataset_name: Scoliosis1K
|
||||
|
||||
evaluator_cfg:
|
||||
enable_float16: true
|
||||
restore_ckpt_strict: true
|
||||
restore_hint: 20000
|
||||
save_name: ScoNet_skeleton_112_sigma15_joint8_bodyonly_plaince_main_1gpu_20k
|
||||
eval_func: evaluate_scoliosis
|
||||
sampler:
|
||||
batch_shuffle: false
|
||||
batch_size: 1
|
||||
sample_type: all_ordered
|
||||
type: InferenceSampler
|
||||
frames_all_limit: 720
|
||||
metric: euc
|
||||
transform:
|
||||
- type: BaseSilCuttingTransform
|
||||
|
||||
loss_cfg:
|
||||
- loss_term_weight: 1.0
|
||||
margin: 0.2
|
||||
type: TripletLoss
|
||||
log_prefix: triplet
|
||||
- loss_term_weight: 1.0
|
||||
scale: 16
|
||||
type: CrossEntropyLoss
|
||||
log_prefix: softmax
|
||||
log_accuracy: true
|
||||
|
||||
model_cfg:
|
||||
model: ScoNet
|
||||
backbone_cfg:
|
||||
type: ResNet9
|
||||
block: BasicBlock
|
||||
in_channel: 2
|
||||
channels:
|
||||
- 64
|
||||
- 128
|
||||
- 256
|
||||
- 512
|
||||
layers:
|
||||
- 1
|
||||
- 1
|
||||
- 1
|
||||
- 1
|
||||
strides:
|
||||
- 1
|
||||
- 2
|
||||
- 2
|
||||
- 1
|
||||
maxpool: false
|
||||
SeparateFCs:
|
||||
in_channels: 512
|
||||
out_channels: 256
|
||||
parts_num: 16
|
||||
SeparateBNNecks:
|
||||
class_num: 3
|
||||
in_channels: 256
|
||||
parts_num: 16
|
||||
bin_num:
|
||||
- 16
|
||||
|
||||
optimizer_cfg:
|
||||
lr: 0.1
|
||||
momentum: 0.9
|
||||
solver: SGD
|
||||
weight_decay: 0.0005
|
||||
|
||||
scheduler_cfg:
|
||||
gamma: 0.1
|
||||
milestones:
|
||||
- 10000
|
||||
- 14000
|
||||
- 18000
|
||||
scheduler: MultiStepLR
|
||||
|
||||
trainer_cfg:
|
||||
enable_float16: true
|
||||
fix_BN: false
|
||||
with_test: true
|
||||
log_iter: 100
|
||||
restore_ckpt_strict: true
|
||||
restore_hint: 0
|
||||
auto_resume_latest: true
|
||||
resume_every_iter: 500
|
||||
resume_keep: 6
|
||||
eval_iter: 1000
|
||||
save_iter: 1000
|
||||
save_name: ScoNet_skeleton_112_sigma15_joint8_bodyonly_plaince_main_1gpu_20k
|
||||
sync_BN: false
|
||||
total_iter: 20000
|
||||
sampler:
|
||||
batch_shuffle: true
|
||||
batch_size:
|
||||
- 8
|
||||
- 8
|
||||
frames_num_fixed: 30
|
||||
sample_type: fixed_unordered
|
||||
type: TripletSampler
|
||||
transform:
|
||||
- type: BaseSilCuttingTransform
|
||||
+112
@@ -0,0 +1,112 @@
|
||||
data_cfg:
|
||||
dataset_name: Scoliosis1K
|
||||
dataset_root: /mnt/public/data/Scoliosis1K/Scoliosis1K-drf-pkl-118-sigma15-joint8-bodyonly
|
||||
dataset_partition: ./datasets/Scoliosis1K/Scoliosis1K_112.json
|
||||
data_in_use:
|
||||
- true
|
||||
- false
|
||||
num_workers: 1
|
||||
remove_no_gallery: false
|
||||
test_dataset_name: Scoliosis1K
|
||||
|
||||
evaluator_cfg:
|
||||
enable_float16: true
|
||||
restore_ckpt_strict: true
|
||||
restore_hint: 20000
|
||||
save_name: ScoNet_skeleton_112_sigma15_joint8_bodyonly_plaince_resume_1gpu_20k
|
||||
eval_func: evaluate_scoliosis
|
||||
sampler:
|
||||
batch_shuffle: false
|
||||
batch_size: 1
|
||||
sample_type: all_ordered
|
||||
type: InferenceSampler
|
||||
frames_all_limit: 720
|
||||
metric: euc
|
||||
transform:
|
||||
- type: BaseSilCuttingTransform
|
||||
|
||||
loss_cfg:
|
||||
- loss_term_weight: 1.0
|
||||
margin: 0.2
|
||||
type: TripletLoss
|
||||
log_prefix: triplet
|
||||
- loss_term_weight: 1.0
|
||||
scale: 16
|
||||
type: CrossEntropyLoss
|
||||
log_prefix: softmax
|
||||
log_accuracy: true
|
||||
|
||||
model_cfg:
|
||||
model: ScoNet
|
||||
backbone_cfg:
|
||||
type: ResNet9
|
||||
block: BasicBlock
|
||||
in_channel: 2
|
||||
channels:
|
||||
- 64
|
||||
- 128
|
||||
- 256
|
||||
- 512
|
||||
layers:
|
||||
- 1
|
||||
- 1
|
||||
- 1
|
||||
- 1
|
||||
strides:
|
||||
- 1
|
||||
- 2
|
||||
- 2
|
||||
- 1
|
||||
maxpool: false
|
||||
SeparateFCs:
|
||||
in_channels: 512
|
||||
out_channels: 256
|
||||
parts_num: 16
|
||||
SeparateBNNecks:
|
||||
class_num: 3
|
||||
in_channels: 256
|
||||
parts_num: 16
|
||||
bin_num:
|
||||
- 16
|
||||
|
||||
optimizer_cfg:
|
||||
lr: 0.1
|
||||
momentum: 0.9
|
||||
solver: SGD
|
||||
weight_decay: 0.0005
|
||||
|
||||
scheduler_cfg:
|
||||
gamma: 0.1
|
||||
milestones:
|
||||
- 10000
|
||||
- 14000
|
||||
- 18000
|
||||
scheduler: MultiStepLR
|
||||
|
||||
trainer_cfg:
|
||||
enable_float16: true
|
||||
fix_BN: false
|
||||
with_test: true
|
||||
log_iter: 100
|
||||
restore_ckpt_strict: true
|
||||
optimizer_reset: false
|
||||
scheduler_reset: false
|
||||
restore_hint: output/Scoliosis1K/ScoNet/ScoNet_skeleton_112_sigma15_joint8_bodyonly_plaince_bridge_1gpu_10k/checkpoints/latest.pt
|
||||
auto_resume_latest: true
|
||||
resume_every_iter: 500
|
||||
resume_keep: 6
|
||||
eval_iter: 1000
|
||||
save_iter: 1000
|
||||
save_name: ScoNet_skeleton_112_sigma15_joint8_bodyonly_plaince_resume_1gpu_20k
|
||||
sync_BN: false
|
||||
total_iter: 20000
|
||||
sampler:
|
||||
batch_shuffle: true
|
||||
batch_size:
|
||||
- 8
|
||||
- 8
|
||||
frames_num_fixed: 30
|
||||
sample_type: fixed_unordered
|
||||
type: TripletSampler
|
||||
transform:
|
||||
- type: BaseSilCuttingTransform
|
||||
@@ -85,4 +85,28 @@
|
||||
>> if torch.distributed.get_rank() == 0 and self.training and self.iteration % 100==0:
|
||||
>> summary_writer.add_video('outs', outs.mean(2).unsqueeze(2), self.iteration)
|
||||
>> ```
|
||||
> Note that this example requires the [`moviepy`](https://github.com/Zulko/moviepy) package, and hence you should run `pip install moviepy` first.
|
||||
> Note that this example requires the [`moviepy`](https://github.com/Zulko/moviepy) package, and hence you should run `pip install moviepy` first.
|
||||
|
||||
### Keep Best Checkpoints
|
||||
> If you want to retain the strongest evaluation checkpoints instead of relying only on the latest or final save, you can enable best-checkpoint tracking in `trainer_cfg`.
|
||||
>
|
||||
> Example:
|
||||
>> ```yaml
|
||||
>> trainer_cfg:
|
||||
>> with_test: true
|
||||
>> eval_iter: 1000
|
||||
>> save_iter: 1000
|
||||
>> best_ckpt_cfg:
|
||||
>> keep_n: 3
|
||||
>> metric_names:
|
||||
>> - scalar/test_f1/
|
||||
>> - scalar/test_accuracy/
|
||||
>> ```
|
||||
>
|
||||
> Behavior:
|
||||
> * The normal numbered checkpoints are still written by `save_iter`.
|
||||
> * After each eval, the trainer checks the configured scalar metrics and keeps the top `N` checkpoints separately for each metric.
|
||||
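> * Best checkpoints are saved under `<output_root>/.../checkpoints/best/<metric>/` (`output/` is the default root), where `<metric>` is the metric name with every run of characters outside `A-Za-z0-9_.-` replaced by `_` (for example, `scalar/test_f1/` becomes `scalar_test_f1`).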
|
||||
> * Each best-metric directory contains an `index.json` file with the retained iterations, scores, and paths.
|
||||
>
|
||||
> This is useful for long or unstable runs where the best checkpoint may appear well before the final iteration.
|
||||
|
||||
@@ -164,6 +164,7 @@ Conclusion:
|
||||
- on the same split, `body-only + plain CE` improved that further to `83.16 Acc / 68.24 Prec / 80.02 Rec / 68.47 F1` at `7000`
|
||||
- a later explicit rerun of the `body-only + plain CE` `7000` full-test eval reproduced that same `83.16 / 68.24 / 80.02 / 68.47` result
|
||||
- adding back limited head context via `head-lite` did not improve the full-test score; its `7000` checkpoint reached only `78.07 Acc / 65.42 Prec / 80.50 Rec / 62.08 F1`
|
||||
- the first practical DRF bridge on the same `1:1:2` body-only recipe peaked early and still underperformed the plain skeleton baseline; its best retained `2000` checkpoint reached only `80.21 Acc / 58.92 Prec / 59.23 Rec / 57.84 F1` on the full test set
|
||||
|
||||
### Not reproducible with current evidence
|
||||
|
||||
@@ -179,6 +180,10 @@ Conclusion:
|
||||
- the `1:1:8` class ratio is not just a nuisance; it appears to be a major driver of the current skeleton/DRF failure mode
|
||||
- on the easier `1:1:2` split, weighted CE is not currently the winning recipe; the best local full-test result so far came from plain CE
|
||||
- `head-lite` may help the small fixed proxy subset, but that gain did not transfer to the full `TEST_SET`, so `body-only + plain CE` remains the best practical skeleton recipe
|
||||
- DRF currently looks worse than the plain skeleton baseline not because the skeleton path is dead, but because the additional prior branch is not yet providing a selective or stable complement. The current local evidence points to three likely causes:
|
||||
- the body-only skeleton baseline already captures most of the useful torso signal on `1:1:2`, so PAV may be largely redundant in this setting
|
||||
- the current PGA/PAV path appears weakly selective in local diagnostics, so the prior is not clearly emphasizing a few clinically relevant parts
|
||||
- DRF peaks very early and then degrades, which suggests the added branch is making optimization less stable without improving the final decision boundary
|
||||
|
||||
## Recommended standard for future work in this repo
|
||||
|
||||
|
||||
@@ -37,7 +37,11 @@ Use it for:
|
||||
| 2026-03-10 | `ScoNet_skeleton_112_sigma15_joint8_bodyonly_plaince_bridge_2gpu_10k` | ScoNet-MT-ske bridge | `Scoliosis1K-drf-pkl-118-sigma15-joint8-bodyonly` + `Scoliosis1K_112.json` | Same `1:1:2` body-only bridge as above, but removed weighted CE to test whether class weighting was suppressing precision on the easier split | interrupted | superseded before meaningful progress by the user-requested 1-GPU rerun on the `5070 Ti` |
|
||||
| 2026-03-10 | `ScoNet_skeleton_112_sigma15_joint8_bodyonly_plaince_bridge_1gpu_10k` | ScoNet-MT-ske bridge | `Scoliosis1K-drf-pkl-118-sigma15-joint8-bodyonly` + `Scoliosis1K_112.json` | Same plain-CE `1:1:2` bridge, relaunched on the `5070 Ti` only per user request | complete | best proxy subset at `7000`: `88.28/69.12/74.15/68.80`; full test at `7000`: `83.16/68.24/80.02/68.47`; final proxy at `10000`: `75.00/65.00/63.41/54.55` (Acc/Prec/Rec/F1) |
|
||||
| 2026-03-10 | `ScoNet_skeleton_112_sigma15_joint8_headlite_plaince_bridge_1gpu_10k` | ScoNet-MT-ske bridge | `Scoliosis1K-drf-pkl-112-sigma15-joint8-headlite` + `Scoliosis1K_112.json` | Added `head-lite` structure (nose plus shoulder links, no eyes/ears) on top of the plain-CE `1:1:2` bridge; first `3090` launch OOMed due to unrelated occupancy, then relaunched on the UUID-pinned `5070 Ti` | complete | best proxy subset at `7000`: `86.72/70.15/89.00/70.44`; full test at `7000`: `78.07/65.42/80.50/62.08` (Acc/Prec/Rec/F1) |
|
||||
| 2026-03-10 | `DRF_skeleton_112_sigma15_joint8_bodyonly_plaince_bridge_1gpu_10k` | DRF bridge | `Scoliosis1K-drf-pkl-118-sigma15-joint8-bodyonly` + `Scoliosis1K_112.json` | First practical DRF run on the winning `1:1:2` skeleton recipe: `body-only`, plain CE, SGD, `10k` bridge schedule, fixed proxy subset seed `112` | running | pending |
|
||||
| 2026-03-10 | `DRF_skeleton_112_sigma15_joint8_bodyonly_plaince_bridge_1gpu_10k` | DRF bridge | `Scoliosis1K-drf-pkl-118-sigma15-joint8-bodyonly` + `Scoliosis1K_112.json` | First practical DRF run on the winning `1:1:2` skeleton recipe: `body-only`, plain CE, SGD, `10k` bridge schedule, fixed proxy subset seed `112` | complete | best proxy subset at `2000`: `88.28/61.79/60.31/60.93`; full test at `2000`: `80.21/58.92/59.23/57.84` (Acc/Prec/Rec/F1) |
|
||||
| 2026-03-10 | `ScoNet_skeleton_112_sigma15_joint8_bodyonly_plaince_main_1gpu_20k` | ScoNet-MT-ske mainline | `Scoliosis1K-drf-pkl-118-sigma15-joint8-bodyonly` + `Scoliosis1K_112.json` | Promoted the winning practical skeleton recipe to a longer `20k` run with full `TEST_SET` eval and checkpoint save every `1000`; no proxy subset, same plain CE + SGD setup | interrupted | superseded by the true-resume continuation below |
|
||||
| 2026-03-10 | `ScoNet_skeleton_112_sigma15_joint8_bodyonly_plaince_resume_1gpu_20k` | ScoNet-MT-ske mainline | `Scoliosis1K-drf-pkl-118-sigma15-joint8-bodyonly` + `Scoliosis1K_112.json` | True continuation of the earlier plain-CE `1:1:2` `10k` bridge from its `latest.pt`, extended to `20k` with full `TEST_SET` eval and checkpoint save every `1000` | interrupted | superseded by the AdamW finetune branch below |
|
||||
| 2026-03-10 | `ScoNet_skeleton_112_sigma15_joint8_bodyonly_plaince_adamw_finetune_1gpu_20k` | ScoNet-MT-ske finetune | `Scoliosis1K-drf-pkl-118-sigma15-joint8-bodyonly` + `Scoliosis1K_112.json` | AdamW finetune from the earlier plain-CE `1:1:2` `10k` checkpoint; restores model weights only, resets optimizer/scheduler state, keeps full `TEST_SET` eval and checkpoint save every `1000` | interrupted | superseded by the longer overnight 40k finetune below |
|
||||
| 2026-03-10 | `ScoNet_skeleton_112_sigma15_joint8_bodyonly_plaince_adamw_finetune_1gpu_40k` | ScoNet-MT-ske finetune | `Scoliosis1K-drf-pkl-118-sigma15-joint8-bodyonly` + `Scoliosis1K_112.json` | Longer overnight AdamW finetune from the same `10k` plain-CE checkpoint; restores model weights only, resets optimizer/scheduler state, extends to `40000` total iterations with full `TEST_SET` eval every `1000` | running | pending |
|
||||
|
||||
## Current best skeleton baseline
|
||||
|
||||
@@ -63,3 +67,4 @@ Current best `ScoNet-MT-ske`-style result:
|
||||
- Removing weighted CE on the `1:1:2` bridge improved the current best full-test result further: `body-only + plain CE` reached `83.16 Acc / 68.24 Prec / 80.02 Rec / 68.47 F1` at `7000`, so weighted CE does not currently look beneficial on the easier split.
|
||||
- A later full-test rerun of the retained `body-only + plain CE` `7000` checkpoint reproduced the same `83.16 / 68.24 / 80.02 / 68.47` result exactly, so that number is now explicitly reconfirmed rather than just carried forward from the original run log.
|
||||
- `Head-lite` looked stronger than `body-only` on the fixed 128-sequence proxy subset at `7000`, but it did not transfer to the full test set: `78.07 Acc / 65.42 Prec / 80.50 Rec / 62.08 F1`, which is clearly below the `body-only + plain CE` full-test result.
|
||||
- The first practical DRF bridge on the winning `1:1:2` recipe did not beat the plain skeleton baseline. Its best retained checkpoint (`2000`) reached only `80.21 Acc / 58.92 Prec / 59.23 Rec / 57.84 F1` on the full test set, versus `83.16 / 68.24 / 80.02 / 68.47` for `body-only + plain CE` at `7000`. The working local interpretation is that the added PAV/PGA path is currently injecting a weak or noisy prior rather than a useful complementary signal.
|
||||
|
||||
@@ -124,6 +124,33 @@ The launcher configures both:
|
||||
|
||||
This makes it easier to recover logs even if the original shell or tool session disappears.
|
||||
|
||||
## Moving outputs off the SSD
|
||||
|
||||
OpenGait writes checkpoints, TensorBoard summaries, best-checkpoint snapshots, and file logs under a run output root.
|
||||
|
||||
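By default that root is `output/`. When the engine config does not set `output_root`, a top-level `output_root` key in the config file is used, then the `OPENGAIT_OUTPUT_ROOT` environment variable, before falling back to the default. To override it per run, set `output_root` in the engine config: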
|
||||
|
||||
```yaml
|
||||
trainer_cfg:
|
||||
output_root: /mnt/hddl/data/OpenGait-output
|
||||
|
||||
evaluator_cfg:
|
||||
output_root: /mnt/hddl/data/OpenGait-output
|
||||
```
|
||||
|
||||
The final path layout stays the same under that root:
|
||||
|
||||
```text
|
||||
<output_root>/<dataset>/<model>/<save_name>/
|
||||
```
|
||||
|
||||
For long scoliosis runs, using an HDD-backed root is recommended so local SSD space is not consumed by:
|
||||
|
||||
- numbered checkpoints
|
||||
- rolling resume checkpoints
|
||||
- best-N retained checkpoints
|
||||
- TensorBoard summary files
|
||||
|
||||
## GPU selection
|
||||
|
||||
Prefer GPU UUIDs, not ordinal indices.
|
||||
|
||||
+9
-3
@@ -4,7 +4,14 @@ import argparse
|
||||
import torch
|
||||
import torch.nn as nn
|
||||
from modeling import models
|
||||
from opengait.utils import config_loader, get_ddp_module, init_seeds, params_count, get_msg_mgr
|
||||
from opengait.utils import (
|
||||
config_loader,
|
||||
get_ddp_module,
|
||||
get_msg_mgr,
|
||||
init_seeds,
|
||||
params_count,
|
||||
resolve_output_path,
|
||||
)
|
||||
|
||||
parser = argparse.ArgumentParser(description='Main program for opengait.')
|
||||
parser.add_argument('--local_rank', type=int, default=0,
|
||||
@@ -25,8 +32,7 @@ def initialization(cfgs, training):
|
||||
msg_mgr = get_msg_mgr()
|
||||
engine_cfg = cfgs['trainer_cfg'] if training else cfgs['evaluator_cfg']
|
||||
logger_cfg = cfgs.get('logger_cfg', {})
|
||||
output_path = os.path.join('output/', cfgs['data_cfg']['dataset_name'],
|
||||
cfgs['model_cfg']['model'], engine_cfg['save_name'])
|
||||
output_path = resolve_output_path(cfgs, engine_cfg)
|
||||
if training:
|
||||
msg_mgr.init_manager(
|
||||
output_path,
|
||||
|
||||
@@ -10,8 +10,10 @@ BaseModel.run_train(model)
|
||||
BaseModel.run_test(model)
|
||||
"""
|
||||
import json
|
||||
import math
|
||||
import os
|
||||
import random
|
||||
import re
|
||||
from typing import Any
|
||||
|
||||
import numpy as np
|
||||
@@ -33,7 +35,7 @@ from data.transform import get_transform
|
||||
from data.collate_fn import CollateFn
|
||||
from data.dataset import DataSet
|
||||
import data.sampler as Samplers
|
||||
from opengait.utils import Odict, mkdir, ddp_all_gather
|
||||
from opengait.utils import Odict, mkdir, ddp_all_gather, resolve_output_path
|
||||
from opengait.utils import get_valid_args, is_list, is_dict, np2var, ts2np, list2var, get_attr_from
|
||||
from evaluation import evaluator as eval_functions
|
||||
from opengait.utils import NoOp
|
||||
@@ -144,8 +146,7 @@ class BaseModel(MetaModel, nn.Module):
|
||||
|
||||
if training and self.engine_cfg['enable_float16']:
|
||||
self.Scaler = GradScaler()
|
||||
self.save_path = osp.join('output/', cfgs['data_cfg']['dataset_name'],
|
||||
cfgs['model_cfg']['model'], self.engine_cfg['save_name'])
|
||||
self.save_path = resolve_output_path(cfgs, self.engine_cfg)
|
||||
|
||||
self.build_network(cfgs['model_cfg'])
|
||||
self.init_parameters()
|
||||
@@ -317,6 +318,134 @@ class BaseModel(MetaModel, nn.Module):
|
||||
return candidate
|
||||
return None
|
||||
|
||||
def _best_ckpt_cfg(self) -> dict[str, Any] | None:
|
||||
best_ckpt_cfg = self.engine_cfg.get('best_ckpt_cfg')
|
||||
if not isinstance(best_ckpt_cfg, dict):
|
||||
return None
|
||||
keep_n = int(best_ckpt_cfg.get('keep_n', 0))
|
||||
metric_names = best_ckpt_cfg.get('metric_names', [])
|
||||
if keep_n <= 0 or not isinstance(metric_names, list) or not metric_names:
|
||||
return None
|
||||
return best_ckpt_cfg
|
||||
|
||||
def _best_ckpt_root(self) -> str:
    """Return the ``best`` sub-directory of the checkpoint directory."""
    checkpoint_dir = self._checkpoint_dir()
    return osp.join(checkpoint_dir, "best")
|
||||
|
||||
def _best_metric_dir(self, metric_name: str) -> str:
    """Return the directory that holds the best checkpoints for one metric.

    The metric name is turned into a filesystem-friendly slug: every run of
    characters outside ``A-Za-z0-9_.-`` becomes a single ``_`` and leading or
    trailing ``.``/``_`` characters are stripped, so ``scalar/test_f1/``
    maps to ``scalar_test_f1``.

    Args:
        metric_name: Key of the metric as it appears in the eval result dict.

    Returns:
        ``<best checkpoint root>/<slug>``.
    """
    metric_slug = re.sub(r"[^A-Za-z0-9_.-]+", "_", metric_name).strip("._")
    if not metric_slug:
        # A name made only of separators would otherwise resolve to the shared
        # best-checkpoint root, mixing this metric's files and index.json with
        # the per-metric sub-directories.
        metric_slug = "unnamed_metric"
    return osp.join(self._best_ckpt_root(), metric_slug)
|
||||
|
||||
def _best_metric_index_path(self, metric_name: str) -> str:
    """Return the path of the ``index.json`` file inside the metric's directory."""
    metric_dir = self._best_metric_dir(metric_name)
    return osp.join(metric_dir, "index.json")
|
||||
|
||||
def _load_best_metric_index(self, metric_name: str) -> list[dict[str, Any]]:
    """Load the retained-checkpoint index for ``metric_name``.

    Only entries that can actually be ranked and restored are returned:

    * the entry must be a ``dict``;
    * its ``path`` must be a string naming an existing file;
    * its ``score`` must convert to ``float`` and its ``iteration`` to ``int``.

    Args:
        metric_name: Metric whose ``index.json`` should be read.

    Returns:
        The usable entries in file order. An empty list is returned when the
        index file is missing, unreadable, not valid JSON, or not a JSON list.
    """
    index_path = self._best_metric_index_path(metric_name)
    if not osp.isfile(index_path):
        return []

    try:
        with open(index_path, "r", encoding="utf-8") as handle:
            raw_entries = json.load(handle)
    except (OSError, ValueError) as err:
        # json.JSONDecodeError and UnicodeDecodeError are ValueError subclasses.
        # A damaged index should not abort a long training run; start afresh
        # and let the next write replace the file.
        import logging

        logging.getLogger(__name__).warning(
            "Ignoring unreadable best-checkpoint index %s: %s", index_path, err
        )
        return []

    if not isinstance(raw_entries, list):
        return []

    entries: list[dict[str, Any]] = []
    for entry in raw_entries:
        if not isinstance(entry, dict):
            continue
        path = entry.get("path")
        if not (isinstance(path, str) and osp.isfile(path)):
            continue
        # Callers rank entries by float(entry["score"]) and
        # int(entry["iteration"]); drop anything that would raise there.
        try:
            float(entry["score"])
            int(entry["iteration"])
        except (KeyError, TypeError, ValueError, OverflowError):
            continue
        entries.append(entry)
    return entries
|
||||
|
||||
def _write_best_metric_index(
    self,
    metric_name: str,
    entries: list[dict[str, Any]],
) -> None:
    """Write ``entries`` as the ``index.json`` of ``metric_name``.

    The JSON is first written to a sibling ``index.json.tmp`` file and then
    moved over the real index with ``os.replace``, so the final path is only
    ever replaced by a fully written file.

    Args:
        metric_name: Metric whose index should be written.
        entries: JSON-serialisable records to store (indent 2, sorted keys).
    """
    final_path = self._best_metric_index_path(metric_name)
    mkdir(osp.dirname(final_path))

    staging_path = final_path + ".tmp"
    with open(staging_path, "w", encoding="utf-8") as staging_file:
        json.dump(entries, staging_file, indent=2, sort_keys=True)

    os.replace(staging_path, final_path)
|
||||
|
||||
def _summary_scalar(self, value: Any) -> float | None:
|
||||
if isinstance(value, torch.Tensor):
|
||||
return float(value.detach().float().mean().item())
|
||||
if isinstance(value, np.ndarray):
|
||||
return float(np.mean(value))
|
||||
if isinstance(value, (float, int, np.floating, np.integer)):
|
||||
return float(value)
|
||||
return None
|
||||
|
||||
def _save_best_ckpts(
    self,
    iteration: int,
    result_dict: dict[str, Any],
) -> None:
    """Keep the top ``keep_n`` checkpoints for each configured metric.

    Runs on rank 0 only and does nothing unless ``_best_ckpt_cfg()`` returns a
    config. For every configured metric present in ``result_dict`` whose value
    reduces to a finite scalar, the current iteration is ranked against the
    entries already in that metric's index (higher score first, later
    iteration breaking ties). If it makes the top ``keep_n``:

    * the checkpoint is written into the metric's directory,
    * files of entries that dropped out of the top ``keep_n`` are deleted,
    * the file of an earlier evaluation of this same iteration is deleted
      (unless it is the file just written),
    * the metric's ``index.json`` is rewritten.

    Args:
        iteration: Training iteration the evaluation results belong to.
        result_dict: Mapping from metric name to the evaluated value.
    """
    if torch.distributed.get_rank() != 0:
        return
    best_ckpt_cfg = self._best_ckpt_cfg()
    if best_ckpt_cfg is None:
        return

    keep_n = int(best_ckpt_cfg['keep_n'])
    metric_names = [metric for metric in best_ckpt_cfg['metric_names'] if metric in result_dict]
    if not metric_names:
        return

    # Built lazily and shared by all metrics so the state is serialised once.
    checkpoint: dict[str, Any] | None = None
    save_name = self.engine_cfg['save_name']

    for metric_name in metric_names:
        score = self._summary_scalar(result_dict.get(metric_name))
        if score is None or not math.isfinite(score):
            continue

        # Split the stored index into entries for other iterations and any
        # earlier record of this same iteration (e.g. from a run that was
        # resumed from before this evaluation point).
        entries: list[dict[str, Any]] = []
        superseded: list[dict[str, Any]] = []
        for entry in self._load_best_metric_index(metric_name):
            if int(entry.get("iteration", -1)) == iteration:
                superseded.append(entry)
            else:
                entries.append(entry)

        current_entry: dict[str, Any] = {"iteration": iteration, "score": score, "path": ""}
        ranked_entries = sorted(
            entries + [current_entry],
            key=lambda entry: (float(entry["score"]), int(entry["iteration"])),
            reverse=True,
        )
        kept_entries = ranked_entries[:keep_n]
        if not any(entry is current_entry for entry in kept_entries):
            # Not good enough to be retained; leave index and files untouched.
            continue

        metric_dir = self._best_metric_dir(metric_name)
        mkdir(metric_dir)
        metric_slug = osp.basename(metric_dir)
        best_path = osp.join(
            metric_dir,
            f"{save_name}-iter-{iteration:0>5}-score-{score:.4f}-{metric_slug}.pt",
        )

        if checkpoint is None:
            checkpoint = self._build_checkpoint(iteration)
        self._save_checkpoint_file(checkpoint, best_path)

        # current_entry is one of kept_entries, so filling in its path
        # completes the list that will be persisted.
        current_entry["path"] = best_path
        refreshed_entries = kept_entries

        keep_paths = {
            osp.realpath(entry["path"])
            for entry in refreshed_entries
            if isinstance(entry.get("path"), str)
        }
        # The filename embeds the score, so a re-evaluated iteration with a
        # different score leaves its old file behind unless it is removed
        # here along with the entries that fell out of the top keep_n.
        for stale_entry in entries + superseded:
            stale_path = stale_entry.get("path")
            if (
                isinstance(stale_path, str)
                and osp.realpath(stale_path) not in keep_paths
                and osp.isfile(stale_path)
            ):
                os.remove(stale_path)

        self._write_best_metric_index(metric_name, refreshed_entries)
|
||||
|
||||
def save_ckpt(self, iteration):
|
||||
if torch.distributed.get_rank() == 0:
|
||||
save_name = self.engine_cfg['save_name']
|
||||
@@ -589,6 +718,7 @@ class BaseModel(MetaModel, nn.Module):
|
||||
if result_dict:
|
||||
model.msg_mgr.write_to_tensorboard(result_dict)
|
||||
model.msg_mgr.write_to_wandb(result_dict)
|
||||
model._save_best_ckpts(model.iteration, result_dict)
|
||||
model.msg_mgr.reset_time()
|
||||
if model.iteration >= model.engine_cfg['total_iter']:
|
||||
break
|
||||
|
||||
@@ -7,4 +7,5 @@ from .common import mkdir, clones
|
||||
from .common import MergeCfgsDict
|
||||
from .common import get_attr_from
|
||||
from .common import NoOp
|
||||
from .msg_manager import get_msg_mgr
|
||||
from .common import resolve_output_path
|
||||
from .msg_manager import get_msg_mgr
|
||||
|
||||
@@ -2,6 +2,7 @@ import copy
|
||||
import os
|
||||
import inspect
|
||||
import logging
|
||||
from pathlib import Path
|
||||
import torch
|
||||
import numpy as np
|
||||
import torch.nn as nn
|
||||
@@ -203,3 +204,19 @@ def get_ddp_module(module, find_unused_parameters=False, **kwargs):
|
||||
def params_count(net):
|
||||
n_parameters = sum(p.numel() for p in net.parameters())
|
||||
return 'Parameters Count: {:.5f}M'.format(n_parameters / 1e6)
|
||||
|
||||
|
||||
def resolve_output_path(cfgs, engine_cfg):
    """Build the output directory ``<root>/<dataset>/<model>/<save_name>``.

    The root is the first non-empty value among, in order:
    ``engine_cfg['output_root']``, ``cfgs['output_root']``, the
    ``OPENGAIT_OUTPUT_ROOT`` environment variable, and the literal ``'output'``.
    A leading ``~`` in the root is expanded.

    Args:
        cfgs: Full config mapping; ``data_cfg.dataset_name`` and
            ``model_cfg.model`` are read from it.
        engine_cfg: Trainer or evaluator config; ``save_name`` is read from it.

    Returns:
        The joined output path as a string.
    """
    candidates = (
        engine_cfg.get('output_root'),
        cfgs.get('output_root'),
        os.environ.get('OPENGAIT_OUTPUT_ROOT'),
    )
    chosen_root = next((root for root in candidates if root), 'output')
    expanded_root = str(Path(chosen_root).expanduser())

    dataset_name = cfgs['data_cfg']['dataset_name']
    model_name = cfgs['model_cfg']['model']
    run_name = engine_cfg['save_name']
    return os.path.join(expanded_root, dataset_name, model_name, run_name)
|
||||
|
||||
Reference in New Issue
Block a user