Add resumable ScoNet skeleton training diagnostics

2026-03-09 15:57:13 +08:00
parent 4e0b0a18dc
commit 36aef46a0d
15 changed files with 1226 additions and 44 deletions
@@ -89,6 +89,8 @@ CUDA_VISIBLE_DEVICES=0,1 uv run python -m torch.distributed.launch --nproc_per_n
 ```

 > **Note:** The `--nproc_per_node` argument must exactly match the number of GPUs specified in `CUDA_VISIBLE_DEVICES`. For single-GPU evaluation, use `CUDA_VISIBLE_DEVICES=0` and `--nproc_per_node=1` with the DDP launcher.
+>
+> **Resume Tip:** To survive interrupted training runs, set `trainer_cfg.resume_every_iter` to a positive value and optionally `trainer_cfg.auto_resume_latest: true`. OpenGait will keep `output/.../checkpoints/latest.pt` updated for crash recovery.



@@ -68,6 +68,9 @@ trainer_cfg:
  optimizer_reset: false
  scheduler_reset: false
  restore_hint: 0
+  auto_resume_latest: false
+  resume_every_iter: 0
+  resume_keep: 3
  save_iter: 2000
  save_name: tmp
  sync_BN: false
@@ -0,0 +1,25 @@
+coco18tococo17_args:  # kwargs passed to the COCO18toCOCO17 transform
+  transfer_to_coco17: false
+
+padkeypoints_args:
+  pad_method: knn
+  use_conf: true
+
+norm_args:
+  pose_format: coco
+  use_conf: ${padkeypoints_args.use_conf}
+  heatmap_image_height: 128
+
+heatmap_generator_args:
+  sigma: 1.5  # used for both the limb and the joint channel in this config
+  use_score: ${padkeypoints_args.use_conf}
+  img_h: ${norm_args.heatmap_image_height}
+  img_w: ${norm_args.heatmap_image_height}
+  with_limb: null  # set per channel by GenerateHeatmapTransform
+  with_kp: null  # set per channel by GenerateHeatmapTransform
+
+align_args:
+  align: true
+  final_img_size: 64  # side length of the square map produced by HeatmapAlignment
+  offset: 0
+  heatmap_image_size: ${norm_args.heatmap_image_height}
@@ -0,0 +1,28 @@
+coco18tococo17_args:  # kwargs passed to the COCO18toCOCO17 transform
+  transfer_to_coco17: false
+
+padkeypoints_args:
+  pad_method: knn
+  use_conf: true
+
+norm_args:
+  pose_format: coco
+  use_conf: ${padkeypoints_args.use_conf}
+  heatmap_image_height: 128
+
+heatmap_generator_args:
+  sigma: 1.5  # shared default; replaced per channel by sigma_limb / sigma_joint below
+  use_score: ${padkeypoints_args.use_conf}
+  img_h: ${norm_args.heatmap_image_height}
+  img_w: ${norm_args.heatmap_image_height}
+  with_limb: null  # set per channel by GenerateHeatmapTransform
+  with_kp: null  # set per channel by GenerateHeatmapTransform
+
+sigma_limb: 1.5  # optional top-level override of sigma for the limb channel
+sigma_joint: 8.0  # optional top-level override of sigma for the joint channel
+
+align_args:
+  align: true
+  final_img_size: 64  # side length of the square map produced by HeatmapAlignment
+  offset: 0
+  heatmap_image_size: ${norm_args.heatmap_image_height}
@@ -0,0 +1,105 @@
+data_cfg:  # dataset location and split; the root name points at the DRF export with sigma 1.5
+  dataset_name: Scoliosis1K
+  dataset_root: /mnt/public/data/Scoliosis1K/Scoliosis1K-drf-pkl-118-sigma15
+  dataset_partition: ./datasets/Scoliosis1K/Scoliosis1K_118.json
+  data_in_use:  # presumably one switch per .pkl in a sequence (0_heatmap.pkl on, 1_pav.pkl off) -- confirm
+    - true
+    - false
+  num_workers: 1
+  remove_no_gallery: false
+  test_dataset_name: Scoliosis1K
+
+evaluator_cfg:
+  enable_float16: true
+  restore_ckpt_strict: true
+  restore_hint: 20000  # iteration of the checkpoint to evaluate; equals trainer_cfg.total_iter
+  save_name: ScoNet_skeleton_118_sigma15
+  eval_func: evaluate_scoliosis
+  sampler:
+    batch_shuffle: false
+    batch_size: 1
+    sample_type: all_ordered
+    frames_all_limit: 720
+  metric: euc
+  transform:
+    - type: BaseSilCuttingTransform
+
+loss_cfg:
+  - loss_term_weight: 1.0
+    margin: 0.2
+    type: TripletLoss
+    log_prefix: triplet
+  - loss_term_weight: 1.0
+    scale: 16
+    type: CrossEntropyLoss
+    log_prefix: softmax
+    log_accuracy: true
+
+model_cfg:
+  model: ScoNet
+  backbone_cfg:
+    type: ResNet9
+    block: BasicBlock
+    in_channel: 2  # skeleton-map input: limb channel + joint channel
+    channels:
+      - 64
+      - 128
+      - 256
+      - 512
+    layers:
+      - 1
+      - 1
+      - 1
+      - 1
+    strides:
+      - 1
+      - 2
+      - 2
+      - 1
+    maxpool: false
+  SeparateFCs:
+    in_channels: 512
+    out_channels: 256
+    parts_num: 16
+  SeparateBNNecks:
+    class_num: 3  # presumably negative / neutral / positive -- confirm against the dataset labels
+    in_channels: 256
+    parts_num: 16
+  bin_num:
+    - 16
+
+optimizer_cfg:
+  lr: 0.1
+  momentum: 0.9
+  solver: SGD
+  weight_decay: 0.0005
+
+scheduler_cfg:
+  gamma: 0.1
+  milestones:
+    - 10000
+    - 14000
+    - 18000
+  scheduler: MultiStepLR
+
+trainer_cfg:
+  enable_float16: true
+  fix_BN: false
+  with_test: false
+  log_iter: 100
+  restore_ckpt_strict: true
+  restore_hint: 0  # 0 means no checkpoint is restored at start
+  save_iter: 20000  # equals total_iter, so only the final checkpoint is written
+  save_name: ScoNet_skeleton_118_sigma15
+  sync_BN: true
+  total_iter: 20000
+  sampler:
+    batch_shuffle: true
+    batch_size:  # presumably [subjects per batch, sequences per subject] -- confirm against TripletSampler
+      - 8
+      - 8
+    frames_num_fixed: 30
+    sample_type: fixed_unordered
+    type: TripletSampler
+  transform:
+    - type: BaseSilCuttingTransform  # with this transform the network sees 64x44 instead of the stored 64x64
@@ -0,0 +1,105 @@
+data_cfg:  # dataset location and split; the root name points at the DRF export with sigma 1.5
+  dataset_name: Scoliosis1K
+  dataset_root: /mnt/public/data/Scoliosis1K/Scoliosis1K-drf-pkl-118-sigma15
+  dataset_partition: ./datasets/Scoliosis1K/Scoliosis1K_118.json
+  data_in_use:  # presumably one switch per .pkl in a sequence (0_heatmap.pkl on, 1_pav.pkl off) -- confirm
+    - true
+    - false
+  num_workers: 1
+  remove_no_gallery: false
+  test_dataset_name: Scoliosis1K
+
+evaluator_cfg:
+  enable_float16: true
+  restore_ckpt_strict: true
+  restore_hint: 20000  # iteration of the checkpoint to evaluate; equals trainer_cfg.total_iter
+  save_name: ScoNet_skeleton_118_sigma15_bs12x8
+  eval_func: evaluate_scoliosis
+  sampler:
+    batch_shuffle: false
+    batch_size: 1
+    sample_type: all_ordered
+    frames_all_limit: 720
+  metric: euc
+  transform:
+    - type: BaseSilCuttingTransform
+
+loss_cfg:
+  - loss_term_weight: 1.0
+    margin: 0.2
+    type: TripletLoss
+    log_prefix: triplet
+  - loss_term_weight: 1.0
+    scale: 16
+    type: CrossEntropyLoss
+    log_prefix: softmax
+    log_accuracy: true
+
+model_cfg:
+  model: ScoNet
+  backbone_cfg:
+    type: ResNet9
+    block: BasicBlock
+    in_channel: 2  # skeleton-map input: limb channel + joint channel
+    channels:
+      - 64
+      - 128
+      - 256
+      - 512
+    layers:
+      - 1
+      - 1
+      - 1
+      - 1
+    strides:
+      - 1
+      - 2
+      - 2
+      - 1
+    maxpool: false
+  SeparateFCs:
+    in_channels: 512
+    out_channels: 256
+    parts_num: 16
+  SeparateBNNecks:
+    class_num: 3  # presumably negative / neutral / positive -- confirm against the dataset labels
+    in_channels: 256
+    parts_num: 16
+  bin_num:
+    - 16
+
+optimizer_cfg:
+  lr: 0.1
+  momentum: 0.9
+  solver: SGD
+  weight_decay: 0.0005
+
+scheduler_cfg:
+  gamma: 0.1
+  milestones:
+    - 10000
+    - 14000
+    - 18000
+  scheduler: MultiStepLR
+
+trainer_cfg:
+  enable_float16: true
+  fix_BN: false
+  with_test: false
+  log_iter: 100
+  restore_ckpt_strict: true
+  restore_hint: 0  # 0 means no checkpoint is restored at start
+  save_iter: 20000  # equals total_iter, so only the final checkpoint is written
+  save_name: ScoNet_skeleton_118_sigma15_bs12x8
+  sync_BN: true
+  total_iter: 20000
+  sampler:
+    batch_shuffle: true
+    batch_size:  # the 12 x 8 batch named by the bs12x8 suffix of save_name
+      - 12
+      - 8
+    frames_num_fixed: 30
+    sample_type: fixed_unordered
+    type: TripletSampler
+  transform:
+    - type: BaseSilCuttingTransform  # with this transform the network sees 64x44 instead of the stored 64x64
@@ -0,0 +1,105 @@
+data_cfg:  # dataset location and split; the root name points at the sigma15 / joint8 / shared-alignment export
+  dataset_name: Scoliosis1K
+  dataset_root: /mnt/public/data/Scoliosis1K/Scoliosis1K-drf-pkl-118-sigma15-joint8-sharedalign
+  dataset_partition: ./datasets/Scoliosis1K/Scoliosis1K_118.json
+  data_in_use:  # presumably one switch per .pkl in a sequence (0_heatmap.pkl on, 1_pav.pkl off) -- confirm
+    - true
+    - false
+  num_workers: 1
+  remove_no_gallery: false
+  test_dataset_name: Scoliosis1K
+
+evaluator_cfg:
+  enable_float16: true
+  restore_ckpt_strict: true
+  restore_hint: 20000  # iteration of the checkpoint to evaluate; equals trainer_cfg.total_iter
+  save_name: ScoNet_skeleton_118_sigma15_joint8_sharedalign_bs12x8
+  eval_func: evaluate_scoliosis
+  sampler:
+    batch_shuffle: false
+    batch_size: 1
+    sample_type: all_ordered
+    frames_all_limit: 720
+  metric: euc
+  transform:
+    - type: BaseSilCuttingTransform
+
+loss_cfg:
+  - loss_term_weight: 1.0
+    margin: 0.2
+    type: TripletLoss
+    log_prefix: triplet
+  - loss_term_weight: 1.0
+    scale: 16
+    type: CrossEntropyLoss
+    log_prefix: softmax
+    log_accuracy: true
+
+model_cfg:
+  model: ScoNet
+  backbone_cfg:
+    type: ResNet9
+    block: BasicBlock
+    in_channel: 2  # skeleton-map input: limb channel + joint channel
+    channels:
+      - 64
+      - 128
+      - 256
+      - 512
+    layers:
+      - 1
+      - 1
+      - 1
+      - 1
+    strides:
+      - 1
+      - 2
+      - 2
+      - 1
+    maxpool: false
+  SeparateFCs:
+    in_channels: 512
+    out_channels: 256
+    parts_num: 16
+  SeparateBNNecks:
+    class_num: 3  # presumably negative / neutral / positive -- confirm against the dataset labels
+    in_channels: 256
+    parts_num: 16
+  bin_num:
+    - 16
+
+optimizer_cfg:
+  lr: 0.1
+  momentum: 0.9
+  solver: SGD
+  weight_decay: 0.0005
+
+scheduler_cfg:
+  gamma: 0.1
+  milestones:
+    - 10000
+    - 14000
+    - 18000
+  scheduler: MultiStepLR
+
+trainer_cfg:
+  enable_float16: true
+  fix_BN: false
+  with_test: false
+  log_iter: 100
+  restore_ckpt_strict: true
+  restore_hint: 0  # 0 means no checkpoint is restored at start
+  save_iter: 20000  # equals total_iter, so only the final checkpoint is written
+  save_name: ScoNet_skeleton_118_sigma15_joint8_sharedalign_bs12x8
+  sync_BN: true
+  total_iter: 20000
+  sampler:
+    batch_shuffle: true
+    batch_size:  # the 12 x 8 batch named by the bs12x8 suffix of save_name
+      - 12
+      - 8
+    frames_num_fixed: 30
+    sample_type: fixed_unordered
+    type: TripletSampler
+  transform:
+    - type: BaseSilCuttingTransform  # with this transform the network sees 64x44 instead of the stored 64x64
@@ -0,0 +1,108 @@
+data_cfg:  # dataset location and split; the root name points at the sigma15 / joint8 / shared-alignment export
+  dataset_name: Scoliosis1K
+  dataset_root: /mnt/public/data/Scoliosis1K/Scoliosis1K-drf-pkl-118-sigma15-joint8-sharedalign
+  dataset_partition: ./datasets/Scoliosis1K/Scoliosis1K_118.json
+  data_in_use:  # presumably one switch per .pkl in a sequence (0_heatmap.pkl on, 1_pav.pkl off) -- confirm
+    - true
+    - false
+  num_workers: 1
+  remove_no_gallery: false
+  test_dataset_name: Scoliosis1K
+
+evaluator_cfg:
+  enable_float16: true
+  restore_ckpt_strict: true
+  restore_hint: 20000  # iteration of the checkpoint to evaluate; equals trainer_cfg.total_iter
+  save_name: ScoNet_skeleton_118_sigma15_joint8_sharedalign_2gpu_bs12x8
+  eval_func: evaluate_scoliosis
+  sampler:
+    batch_shuffle: false
+    batch_size: 2  # NOTE(review): presumably one sequence per GPU for the 2-GPU run -- confirm
+    sample_type: all_ordered
+    frames_all_limit: 720
+  metric: euc
+  transform:
+    - type: BaseSilCuttingTransform
+
+loss_cfg:
+  - loss_term_weight: 1.0
+    margin: 0.2
+    type: TripletLoss
+    log_prefix: triplet
+  - loss_term_weight: 1.0
+    scale: 16
+    type: CrossEntropyLoss
+    log_prefix: softmax
+    log_accuracy: true
+
+model_cfg:
+  model: ScoNet
+  backbone_cfg:
+    type: ResNet9
+    block: BasicBlock
+    in_channel: 2  # skeleton-map input: limb channel + joint channel
+    channels:
+      - 64
+      - 128
+      - 256
+      - 512
+    layers:
+      - 1
+      - 1
+      - 1
+      - 1
+    strides:
+      - 1
+      - 2
+      - 2
+      - 1
+    maxpool: false
+  SeparateFCs:
+    in_channels: 512
+    out_channels: 256
+    parts_num: 16
+  SeparateBNNecks:
+    class_num: 3  # presumably negative / neutral / positive -- confirm against the dataset labels
+    in_channels: 256
+    parts_num: 16
+  bin_num:
+    - 16
+
+optimizer_cfg:
+  lr: 0.1
+  momentum: 0.9
+  solver: SGD
+  weight_decay: 0.0005
+
+scheduler_cfg:
+  gamma: 0.1
+  milestones:
+    - 10000
+    - 14000
+    - 18000
+  scheduler: MultiStepLR
+
+trainer_cfg:
+  enable_float16: true
+  fix_BN: false
+  with_test: false
+  log_iter: 100
+  restore_ckpt_strict: true
+  restore_hint: 0  # 0 leaves the restore decision to auto_resume_latest
+  auto_resume_latest: true  # resume from checkpoints/latest.pt when it exists
+  resume_every_iter: 500  # interval, in iterations, between rolling resume checkpoints
+  resume_keep: 3  # number of rolling resume checkpoints retained under checkpoints/resume/
+  save_iter: 20000  # equals total_iter, so only the final regular checkpoint is written
+  save_name: ScoNet_skeleton_118_sigma15_joint8_sharedalign_2gpu_bs12x8
+  sync_BN: true
+  total_iter: 20000
+  sampler:
+    batch_shuffle: true
+    batch_size:  # the 12 x 8 batch named by the bs12x8 suffix of save_name
+      - 12
+      - 8
+    frames_num_fixed: 30
+    sample_type: fixed_unordered
+    type: TripletSampler
+  transform:
+    - type: BaseSilCuttingTransform  # with this transform the network sees 64x44 instead of the stored 64x64
@@ -75,6 +75,7 @@ The silhouette and skeleton-map pipelines are different experiments and should n

 * `Scoliosis1K-sil-pkl` is the silhouette modality used by the standard ScoNet configs.
 * pose-derived heatmap roots such as `Scoliosis1K_sigma_8.0/pkl` or DRF exports are skeleton-map inputs and require `in_channel: 2`.
+* DRF does **not** use the silhouette stream as an input. It uses `0_heatmap.pkl` plus `1_pav.pkl`.

 Naming note:

@@ -89,6 +90,18 @@ A strong silhouette checkpoint does not validate the skeleton-map path. In parti

 So if you are debugging DRF or `ScoNet-MT-ske` reproduction, do not use `ScoNet-20000-better.pt` as evidence that the heatmap preprocessing is correct.

+### Overlay caveat
+
+Do not treat a direct overlay between `Scoliosis1K-sil-pkl` and pose-derived skeleton maps as a valid alignment test.
+
+Reason:
+
+* the released silhouette modality is an estimated segmentation output from `PP-HumanSeg v2`
+* the released pose modality is an estimated keypoint output from `ViTPose`
+* the two modalities are normalized by different preprocessing pipelines before they reach OpenGait
+
+So a silhouette-vs-skeleton mismatch in a debug figure is usually a cross-modality frame-of-reference issue, not proof that the raw dataset is bad. The more important check for skeleton-map debugging is whether the **limb and joint channels align with each other** inside `0_heatmap.pkl`.
+
 ---

 ## Pose-to-Heatmap Conversion
@@ -146,6 +159,21 @@ If you explicitly want train-only PAV min-max statistics, add:
  --stats_partition=./datasets/Scoliosis1K/Scoliosis1K_118.json
 ```

+### Heatmap debugging notes
+
+Current confirmed findings from local debugging:
+
+* the raw pose dataset itself looks healthy; poor `ScoNet-MT-ske` results are not explained by obvious missing-joint collapse
+* a larger heatmap sigma can materially blur away the articulated structure; `sigma=8` was much broader than the silhouette geometry, while smaller sigma values recovered more structure
+* an earlier bug aligned the limb and joint channels separately; that made the two channels of `0_heatmap.pkl` slightly misregistered
+* the heatmap path is now patched so limb and joint channels share one alignment crop
+
+Remaining caution:
+
+* the exported skeleton map is stored as `64x64`
+* if the runtime config uses `BaseSilCuttingTransform`, the network actually sees `64x44`
+* that symmetric left/right crop is not automatically wrong, but it is still a meaningful ablation point for skeleton-map experiments
+
 The output layout is:

 ```text
@@ -8,7 +8,8 @@ import pickle
 import argparse
 import numpy as np
 from glob import glob 
-from typing import Literal
+from copy import deepcopy
+from typing import Any, Literal
 from tqdm import tqdm
 import matplotlib.cm as cm
 import torch.distributed as dist
@@ -516,7 +517,7 @@ class GatherTransform(object):
    """
    Gather the different transforms.
    """
-    def __init__(self, base_transform, transform_bone, transform_joint):
+    def __init__(self, base_transform, transform_bone, transform_joint, align_transform=None):

        """
        base_transform: Some common transform, e.g., COCO18toCOCO17, PadKeypoints, CenterAndScale
@@ -526,12 +527,15 @@ class GatherTransform(object):
        self.base_transform = base_transform
        self.transform_bone = transform_bone
        self.transform_joint = transform_joint
+        self.align_transform = align_transform

    def __call__(self, pose_data):
        x = self.base_transform(pose_data)
        heatmap_bone = self.transform_bone(x) # [T, 1, H, W]
        heatmap_joint = self.transform_joint(x) # [T, 1, H, W]
        heatmap = np.concatenate([heatmap_bone, heatmap_joint], axis=1)
+        if self.align_transform is not None:
+            heatmap = self.align_transform(heatmap)
        return heatmap

 class HeatmapAlignment():
@@ -543,23 +547,32 @@ class HeatmapAlignment():

    def center_crop(self, heatmap):
        """
-        Input: [1, heatmap_image_size, heatmap_image_size]
-        Output: [1, final_img_size, final_img_size]
+        Input: [C, heatmap_image_size, heatmap_image_size]
+        Output: [C, final_img_size, final_img_size]
        """
-        raw_heatmap = heatmap[0]
-        if self.align: 
-            y_sum = raw_heatmap.sum(axis=1)
-            y_top = (y_sum != 0).argmax(axis=0)
-            y_btm = (y_sum != 0).cumsum(axis=0).argmax(axis=0)
-            height = y_btm - y_top + 1
-            raw_heatmap = raw_heatmap[y_top - self.offset: y_btm + 1 + self.offset, (self.heatmap_image_size // 2) - (height // 2) : (self.heatmap_image_size // 2) + (height // 2) + 1]
-        raw_heatmap = cv2.resize(raw_heatmap, (self.final_img_size, self.final_img_size), interpolation=cv2.INTER_AREA)
-        return raw_heatmap[np.newaxis, :, :] # [1, final_img_size, final_img_size]
+        raw_heatmap = heatmap
+        if self.align:
+            support_map = raw_heatmap.max(axis=0)
+            y_sum = support_map.sum(axis=1)
+            nonzero_rows = np.flatnonzero(y_sum != 0)
+            if nonzero_rows.size != 0:
+                y_top = max(int(nonzero_rows[0]) - self.offset, 0)
+                y_btm = min(int(nonzero_rows[-1]) + self.offset, self.heatmap_image_size - 1)
+                height = y_btm - y_top + 1
+                x_center = self.heatmap_image_size // 2
+                x_left = max(x_center - (height // 2), 0)
+                x_right = min(x_center + (height // 2) + 1, self.heatmap_image_size)
+                raw_heatmap = raw_heatmap[:, y_top:y_btm + 1, x_left:x_right]
+        resized = np.stack([
+            cv2.resize(channel, (self.final_img_size, self.final_img_size), interpolation=cv2.INTER_AREA)
+            for channel in raw_heatmap
+        ], axis=0)
+        return resized # [C, final_img_size, final_img_size]

    def __call__(self, heatmap_imgs):
        """
-        heatmap_imgs: (T, 1, raw_size, raw_size)
-        return (T, 1, final_img_size, final_img_size)
+        heatmap_imgs: (T, C, raw_size, raw_size)
+        return (T, C, final_img_size, final_img_size)
        """
        original_dtype = heatmap_imgs.dtype
        heatmap_imgs = heatmap_imgs.astype(np.float32) / 255.0
@@ -570,12 +583,14 @@ class HeatmapAlignment():
        return heatmap_imgs.astype(original_dtype)

 def GenerateHeatmapTransform(
-    coco18tococo17_args,
-    padkeypoints_args,
-    norm_args,
-    heatmap_generator_args,
-    align_args,
+    coco18tococo17_args: dict[str, Any],
+    padkeypoints_args: dict[str, Any],
+    norm_args: dict[str, Any],
+    heatmap_generator_args: dict[str, Any],
+    align_args: dict[str, Any],
    reduction: Literal["upstream", "max", "sum"] = "upstream",
+    sigma_limb: float | None = None,
+    sigma_joint: float | None = None,
 ):

    base_transform = T.Compose([
@@ -584,34 +599,44 @@ def GenerateHeatmapTransform(
        CenterAndScaleNormalizer(**norm_args), 
    ])

-    heatmap_generator_args["with_limb"] = True
-    heatmap_generator_args["with_kp"] = False
+    bone_generator_args = deepcopy(heatmap_generator_args)
+    joint_generator_args = deepcopy(heatmap_generator_args)
+
+    bone_generator_args["with_limb"] = True
+    bone_generator_args["with_kp"] = False
+    if sigma_limb is not None:
+        bone_generator_args["sigma"] = sigma_limb
    bone_image_transform = (
        HeatmapToImage()
        if reduction == "upstream"
        else HeatmapReducer(reduction=reduction)
    )
    transform_bone = T.Compose([
-        GeneratePoseTarget(**heatmap_generator_args), 
+        GeneratePoseTarget(**bone_generator_args), 
        bone_image_transform,
-        HeatmapAlignment(**align_args) 
    ])

-    heatmap_generator_args["with_limb"] = False
-    heatmap_generator_args["with_kp"] = True
+    joint_generator_args["with_limb"] = False
+    joint_generator_args["with_kp"] = True
+    if sigma_joint is not None:
+        joint_generator_args["sigma"] = sigma_joint
    joint_image_transform = (
        HeatmapToImage()
        if reduction == "upstream"
        else HeatmapReducer(reduction=reduction)
    )
    transform_joint = T.Compose([
-        GeneratePoseTarget(**heatmap_generator_args), 
+        GeneratePoseTarget(**joint_generator_args), 
        joint_image_transform,
-        HeatmapAlignment(**align_args) 
    ])

    transform = T.Compose([
-        GatherTransform(base_transform, transform_bone, transform_joint) # [T, 2, H, W]
+        GatherTransform(
+            base_transform,
+            transform_bone,
+            transform_joint,
+            HeatmapAlignment(**align_args),
+        ) # [T, 2, H, W]
    ])

    return transform
@@ -98,6 +98,15 @@ def load_heatmap_cfg(cfg_path: str) -> dict[str, Any]:
    return cast(dict[str, Any], replaced)


+def optional_cfg_float(cfg: dict[str, Any], key: str) -> float | None:
+    """Read an optional numeric setting from ``cfg`` and return it as a float.
+
+    Args:
+        cfg: Parsed configuration mapping.
+        key: Name of the optional entry to read.
+
+    Returns:
+        ``None`` when the key is missing or explicitly null, otherwise the
+        value converted with ``float()``.
+
+    Raises:
+        TypeError: If the value is present but is not an int or float.
+            Booleans are rejected too: ``bool`` is a subclass of ``int``, so
+            a stray ``true`` in the config would otherwise become ``1.0``.
+    """
+    value = cfg.get(key)
+    if value is None:
+        return None
+    if isinstance(value, bool) or not isinstance(value, (int, float)):
+        raise TypeError(f"Expected numeric value for {key}, got {type(value).__name__}")
+    return float(value)
+
+
 def build_pose_transform(cfg: dict[str, Any]) -> T.Compose:
    return T.Compose([
        heatmap_prep.COCO18toCOCO17(**cfg["coco18tococo17_args"]),
@@ -192,6 +201,8 @@ def main() -> None:
        heatmap_generator_args=heatmap_cfg["heatmap_generator_args"],
        align_args=heatmap_cfg["align_args"],
        reduction=cast(HeatmapReduction, args.heatmap_reduction),
+        sigma_limb=optional_cfg_float(heatmap_cfg, "sigma_limb"),
+        sigma_joint=optional_cfg_float(heatmap_cfg, "sigma_joint"),
    )

    pose_paths = iter_pose_paths(args.pose_data_path)
@@ -59,9 +59,12 @@
 ### trainer_cfg
 * Trainer configuration
 >  * Args
->     * restore_hint: `int` value indicates the iteration number of restored checkpoint; `str` value indicates the path to restored checkpoint. The option is often used to finetune on new dataset or restore the interrupted training process.
+>     * restore_hint: `int` value indicates the iteration number of restored checkpoint; `str` value indicates the path to restored checkpoint. Use `latest` to restore the latest rolling resume checkpoint. The option is often used to finetune on new dataset or restore the interrupted training process.
+>     * auto_resume_latest: If `True` and `restore_hint==0`, training automatically resumes from `output/.../checkpoints/latest.pt` when it exists (falling back to the checkpoint path recorded in `checkpoints/latest.json`).
 >     * fix_BN: If `True`, we fix the weight of all `BatchNorm` layers.
 >     * log_iter: Log the information per `log_iter` iterations.
+>     * resume_every_iter: Save a rolling resume checkpoint every `resume_every_iter` iterations. These checkpoints update `checkpoints/latest.pt` and are intended for crash recovery. Set `0` to disable them.
+>     * resume_keep: Number of rolling resume checkpoints retained under `checkpoints/resume/`. Set `0` to keep all of them.
 >     * save_iter: Save the checkpoint per `save_iter` iterations.
 >     * with_test: If `True`, we test the model every `save_iter` iterations. A bit of performance impact.(*Disable in Default*)
 >     * optimizer_reset: If `True` and `restore_hint!=0`, reset the optimizer while restoring the model.
@@ -168,6 +171,9 @@ trainer_cfg:
  log_iter: 100
  restore_ckpt_strict: true
  restore_hint: 0
+  auto_resume_latest: false
+  resume_every_iter: 500
+  resume_keep: 3
  save_iter: 10000
  save_name: Baseline
  sync_BN: true
@@ -9,8 +9,13 @@ Typical usage:
 BaseModel.run_train(model)
 BaseModel.run_test(model)
 """
-import torch
+import json
+import os
+import random
+from typing import Any
+
 import numpy as np
+import torch
 import os.path as osp
 import torch.nn as nn
 import torch.optim as optim
@@ -169,6 +174,13 @@ class BaseModel(MetaModel, nn.Module):
        restore_hint = self.engine_cfg['restore_hint']
        if restore_hint != 0:
            self.resume_ckpt(restore_hint)
+        elif training and self.engine_cfg.get('auto_resume_latest', False):
+            latest_ckpt = self._get_latest_resume_ckpt_path()
+            if latest_ckpt is not None:
+                self.msg_mgr.log_info(
+                    "Auto-resuming from latest checkpoint %s", latest_ckpt
+                )
+                self.resume_ckpt(latest_ckpt)

    def get_backbone(self, backbone_cfg):
        """Get the backbone of the model."""
@@ -234,23 +246,112 @@ class BaseModel(MetaModel, nn.Module):
        scheduler = Scheduler(self.optimizer, **valid_arg)
        return scheduler

+    def _build_checkpoint(self, iteration: int) -> dict[str, Any]:
+        """Collect everything needed to restart training at ``iteration``.
+
+        The returned dict holds the model, optimizer and scheduler state
+        dicts, the iteration counter, and the Python / NumPy / torch CPU RNG
+        states. CUDA RNG states are added when CUDA is available, and the
+        scaler state when ``enable_float16`` is set and ``self.Scaler`` exists.
+        """
+        state: dict[str, Any] = dict(
+            model=self.state_dict(),
+            optimizer=self.optimizer.state_dict(),
+            scheduler=self.scheduler.state_dict(),
+            iteration=iteration,
+            random_state=random.getstate(),
+            numpy_random_state=np.random.get_state(),
+            torch_random_state=torch.get_rng_state(),
+        )
+        if torch.cuda.is_available():
+            state['cuda_random_state_all'] = torch.cuda.get_rng_state_all()
+        uses_float16 = self.engine_cfg.get('enable_float16', False)
+        if uses_float16 and hasattr(self, 'Scaler'):
+            state['scaler'] = self.Scaler.state_dict()
+        return state
+
+    def _checkpoint_dir(self) -> str:
+        """Return the ``checkpoints`` directory under ``self.save_path``."""
+        root = self.save_path
+        return osp.join(root, "checkpoints")
+
+    def _resume_dir(self) -> str:
+        """Return the ``resume`` sub-directory of the checkpoint directory."""
+        parent = self._checkpoint_dir()
+        return osp.join(parent, "resume")
+
+    def _save_checkpoint_file(
+        self,
+        checkpoint: dict[str, Any],
+        save_path: str,
+    ) -> None:
+        """Write ``checkpoint`` to ``save_path`` via a temporary sibling file.
+
+        The data is first saved to ``save_path + ".tmp"`` and then moved over
+        the final name with ``os.replace``, so ``save_path`` itself is never
+        written in place.
+        """
+        target_dir = osp.dirname(save_path)
+        mkdir(target_dir)
+        staging_path = save_path + ".tmp"
+        torch.save(checkpoint, staging_path)
+        os.replace(staging_path, save_path)
+
+    def _write_resume_meta(self, iteration: int, resume_path: str) -> None:
+        """Record the newest resume checkpoint in ``latest.json``.
+
+        The JSON file stores the iteration and the checkpoint path. It is
+        written to a ``.tmp`` sibling first and then moved into place with
+        ``os.replace``.
+        """
+        meta_path = osp.join(self._checkpoint_dir(), "latest.json")
+        staging_path = meta_path + ".tmp"
+        payload = {"iteration": iteration, "path": resume_path}
+        with open(staging_path, "w", encoding="utf-8") as stream:
+            json.dump(payload, stream, indent=2, sort_keys=True)
+        os.replace(staging_path, meta_path)
+
+    def _prune_resume_checkpoints(self, keep_count: int) -> None:
+        """Delete all but the newest ``keep_count`` rolling resume checkpoints.
+
+        Only files named ``<save_name>-resume-<digits>.pt`` inside the resume
+        directory are considered; anything else is left alone.
+
+        Args:
+            keep_count: Number of checkpoints to retain. Values ``<= 0``
+                disable pruning.
+        """
+        if keep_count <= 0:
+            return
+        resume_dir = self._resume_dir()
+        if not osp.isdir(resume_dir):
+            return
+        prefix = f"{self.engine_cfg['save_name']}-resume-"
+        suffix = ".pt"
+        candidates = []
+        for file_name in os.listdir(resume_dir):
+            if not (file_name.startswith(prefix) and file_name.endswith(suffix)):
+                continue
+            stem = file_name[len(prefix):-len(suffix)]
+            if stem.isdigit():
+                candidates.append((int(stem), file_name))
+        # Order by the parsed iteration, not by file name: names are only
+        # zero-padded to five digits, so "100000" sorts before "99500" as text
+        # and the newest checkpoints would be deleted.
+        candidates.sort()
+        for _, file_name in candidates[:-keep_count]:
+            os.remove(osp.join(resume_dir, file_name))
+
+    def _get_latest_resume_ckpt_path(self) -> str | None:
+        """Locate the newest resume checkpoint, if there is one.
+
+        ``latest.pt`` in the checkpoint directory is preferred. When it is
+        missing, the ``path`` entry of ``latest.json`` is used, provided it is
+        a string naming an existing file. Returns ``None`` otherwise.
+        """
+        latest_path = osp.join(self._checkpoint_dir(), "latest.pt")
+        if osp.isfile(latest_path):
+            return latest_path
+
+        meta_path = osp.join(self._checkpoint_dir(), "latest.json")
+        if not osp.isfile(meta_path):
+            return None
+        with open(meta_path, "r", encoding="utf-8") as stream:
+            recorded_path = json.load(stream).get("path")
+        is_usable = isinstance(recorded_path, str) and osp.isfile(recorded_path)
+        return recorded_path if is_usable else None
+
    def save_ckpt(self, iteration):
        if torch.distributed.get_rank() == 0:
-            mkdir(osp.join(self.save_path, "checkpoints/"))
            save_name = self.engine_cfg['save_name']
-            checkpoint = {
-                'model': self.state_dict(),
-                'optimizer': self.optimizer.state_dict(),
-                'scheduler': self.scheduler.state_dict(),
-                'iteration': iteration}
-            torch.save(checkpoint,
-                       osp.join(self.save_path, 'checkpoints/{}-{:0>5}.pt'.format(save_name, iteration)))
+            checkpoint = self._build_checkpoint(iteration)
+            ckpt_path = osp.join(
+                self._checkpoint_dir(),
+                '{}-{:0>5}.pt'.format(save_name, iteration),
+            )
+            self._save_checkpoint_file(checkpoint, ckpt_path)
+
+    def save_resume_ckpt(self, iteration: int) -> None:
+        """Write a rolling resume checkpoint for ``iteration`` (rank 0 only).
+
+        The checkpoint is stored under the resume directory and again as
+        ``latest.pt``. ``latest.json`` is then updated, and older resume
+        checkpoints are pruned according to ``resume_keep`` (default 3).
+        """
+        if torch.distributed.get_rank() == 0:
+            state = self._build_checkpoint(iteration)
+            save_name = self.engine_cfg['save_name']
+            resume_path = osp.join(
+                self._resume_dir(),
+                f"{save_name}-resume-{iteration:0>5}.pt",
+            )
+            targets = (resume_path, osp.join(self._checkpoint_dir(), "latest.pt"))
+            for target in targets:
+                self._save_checkpoint_file(state, target)
+            self._write_resume_meta(iteration, resume_path)
+            keep_count = int(self.engine_cfg.get('resume_keep', 3))
+            self._prune_resume_checkpoints(keep_count)

    def _load_ckpt(self, save_name):
        load_ckpt_strict = self.engine_cfg['restore_ckpt_strict']

-        checkpoint = torch.load(save_name, map_location=torch.device(
-            "cuda", self.device))
+        checkpoint = torch.load(
+            save_name,
+            map_location=torch.device("cuda", self.device),
+            weights_only=False,
+        )
        model_state_dict = checkpoint['model']

        if not load_ckpt_strict:
@@ -271,6 +372,33 @@ class BaseModel(MetaModel, nn.Module):
            else:
                self.msg_mgr.log_warning(
                    "Restore NO Scheduler from %s !!!" % save_name)
+            if (
+                self.engine_cfg.get('enable_float16', False)
+                and hasattr(self, 'Scaler')
+                and 'scaler' in checkpoint
+            ):
+                self.Scaler.load_state_dict(checkpoint['scaler'])
+            if 'random_state' in checkpoint:
+                random.setstate(checkpoint['random_state'])
+            if 'numpy_random_state' in checkpoint:
+                np.random.set_state(checkpoint['numpy_random_state'])
+            if 'torch_random_state' in checkpoint:
+                torch_random_state = checkpoint['torch_random_state']
+                if not isinstance(torch_random_state, torch.Tensor):
+                    torch_random_state = torch.as_tensor(
+                        torch_random_state,
+                        dtype=torch.uint8,
+                    )
+                torch.set_rng_state(torch_random_state.cpu())
+            if 'cuda_random_state_all' in checkpoint and torch.cuda.is_available():
+                cuda_random_state_all = checkpoint['cuda_random_state_all']
+                normalized_cuda_states = []
+                for state in cuda_random_state_all:
+                    if not isinstance(state, torch.Tensor):
+                        state = torch.as_tensor(state, dtype=torch.uint8)
+                    normalized_cuda_states.append(state.cpu())
+                torch.cuda.set_rng_state_all(normalized_cuda_states)
+        self.iteration = int(checkpoint.get('iteration', self.iteration))
        self.msg_mgr.log_info("Restore Parameters from %s !!!" % save_name)

    def resume_ckpt(self, restore_hint):
@@ -278,10 +406,15 @@ class BaseModel(MetaModel, nn.Module):
            save_name = self.engine_cfg['save_name']
            save_name = osp.join(
                self.save_path, 'checkpoints/{}-{:0>5}.pt'.format(save_name, restore_hint))
-            self.iteration = restore_hint
        elif isinstance(restore_hint, str):
-            save_name = restore_hint
-            self.iteration = 0
+            if restore_hint == 'latest':
+                save_name = self._get_latest_resume_ckpt_path()
+                if save_name is None:
+                    raise FileNotFoundError(
+                        f"No latest checkpoint found under {self._checkpoint_dir()}"
+                    )
+            else:
+                save_name = restore_hint
        else:
            raise ValueError(
                "Error type for -Restore_Hint-, supported: int or string.")
@@ -417,6 +550,9 @@ class BaseModel(MetaModel, nn.Module):
            visual_summary['scalar/learning_rate'] = model.optimizer.param_groups[0]['lr']

            model.msg_mgr.train_step(loss_info, visual_summary)
+            resume_every_iter = int(model.engine_cfg.get('resume_every_iter', 0))
+            if resume_every_iter > 0 and model.iteration % resume_every_iter == 0:
+                model.save_resume_ckpt(model.iteration)
            if model.iteration % model.engine_cfg['save_iter'] == 0:
                # save the checkpoint
                model.save_ckpt(model.iteration)
@@ -0,0 +1,421 @@
+from __future__ import annotations
+
+import json
+import pickle
+import sys
+from collections import defaultdict
+from dataclasses import dataclass
+from pathlib import Path
+from typing import Iterable
+
+import numpy as np
+from jaxtyping import Float
+from numpy.typing import NDArray
+
+REPO_ROOT = Path(__file__).resolve().parent.parent
+if str(REPO_ROOT) not in sys.path:
+    sys.path.append(str(REPO_ROOT))
+
+from datasets import pretreatment_scoliosis_drf as drf_prep
+
+# Root of the raw pose pickles; iter_pose_paths() globs `*/*/*/*.pkl` beneath it.
+POSE_ROOT = Path("/mnt/public/data/Scoliosis1K/Scoliosis1K-pose-pkl")
+# Root of the exported skeleton maps; iter_heatmap_paths() globs `*/*/*/0_heatmap.pkl`
+# beneath it and analyze() reads the sibling `1_pav.pkl` from the same directory.
+HEATMAP_ROOT = Path("/mnt/public/data/Scoliosis1K/Scoliosis1K-drf-pkl-118-sigma15-joint8-sharedalign")
+# JSON partition file providing the "TRAIN_SET" and "TEST_SET" id lists.
+PARTITION_PATH = REPO_ROOT / "datasets/Scoliosis1K/Scoliosis1K_118.json"
+# Heatmap pretreatment config handed to drf_prep.load_heatmap_cfg() in analyze().
+HEATMAP_CFG_PATH = REPO_ROOT / "configs/drf/pretreatment_heatmap_drf_sigma15_joint8.yaml"
+# Output files written by main(): the markdown report and the raw results as JSON.
+REPORT_PATH = REPO_ROOT / "research/scoliosis_dataset_analysis_118_sharedalign.md"
+JSON_PATH = REPO_ROOT / "research/scoliosis_dataset_analysis_118_sharedalign.json"
+
+# Small positive floor used to guard divisions (mass ratio, F1, feature std).
+EPS = 1e-6
+# Default intensity cutoff for "active" pixels in sequence_bbox_metrics().
+# NOTE(review): presumably assumes heatmaps on a 0-255 intensity scale; confirm against the exporter.
+THRESHOLD = 13.0
+# Number of columns on each side whose intensity is counted as "clipped" mass.
+SIDE_CUT = 10
+# Integer class ids used as targets for the softmax probe.
+LABEL_TO_INT = {"negative": 0, "neutral": 1, "positive": 2}
+# Annotation alias for float32 numpy arrays.
+FloatArray = NDArray[np.float32]
+
+
+@dataclass(frozen=True)
+class SequenceKey:
+    """Identity of one sequence, taken from the directory components of its pickle path."""
+
+    # Subject id: fourth-from-last path component (see sequence_key_from_path).
+    pid: str
+    # Class label directory name: third-from-last path component.
+    label: str
+    # Sequence directory name: second-from-last path component.
+    seq: str
+
+
+@dataclass
+class RunningStats:
+    """Accumulates a weighted sum and a sample count so a mean can be read at any time."""
+
+    total: float = 0.0
+    count: int = 0
+
+    def update(self, value: float, n: int = 1) -> None:
+        """Add ``value * n`` to the running total and ``n`` to the sample count."""
+        weighted = value * n
+        self.total = self.total + weighted
+        self.count = self.count + n
+
+    @property
+    def mean(self) -> float:
+        """Total divided by the count, with the divisor floored at one to avoid dividing by zero."""
+        denominator = max(self.count, 1)
+        return self.total / denominator
+
+
+def load_partition_ids() -> tuple[set[str], set[str]]:
+    """Read the partition JSON and return ``(train_ids, test_ids)`` as sets."""
+    raw_text = PARTITION_PATH.read_text(encoding="utf-8")
+    partition = json.loads(raw_text)
+    train_ids = set(partition["TRAIN_SET"])
+    test_ids = set(partition["TEST_SET"])
+    return train_ids, test_ids
+
+
+def sequence_key_from_path(path: Path) -> SequenceKey:
+    """Build a SequenceKey from the three directory components directly above the file name."""
+    components = path.parts
+    pid = components[-4]
+    label = components[-3]
+    seq = components[-2]
+    return SequenceKey(pid=pid, label=label, seq=seq)
+
+
+def iter_pose_paths() -> list[Path]:
+    """Return every ``*.pkl`` file three directory levels below POSE_ROOT, sorted."""
+    matches = list(POSE_ROOT.glob("*/*/*/*.pkl"))
+    matches.sort()
+    return matches
+
+
+def iter_heatmap_paths() -> list[Path]:
+    """Return every ``0_heatmap.pkl`` file three directory levels below HEATMAP_ROOT, sorted."""
+    matches = list(HEATMAP_ROOT.glob("*/*/*/0_heatmap.pkl"))
+    matches.sort()
+    return matches
+
+
+def read_pickle(path: Path) -> object:
+    """Load and return the first pickled object stored in ``path``."""
+    # NOTE(review): unpickling can execute arbitrary code; only point this at trusted dataset exports.
+    payload = path.read_bytes()
+    return pickle.loads(payload)
+
+
+def bbox_from_mask(mask: NDArray[np.bool_]) -> tuple[float, float, float, float] | None:
+    """Return ``(width, height, center_x, center_y)`` of the true pixels, or ``None`` if there are none.
+
+    Width and height are inclusive pixel counts; the centers are the midpoints
+    of the first and last occupied column / row.
+    """
+    row_idx = np.nonzero(mask.any(axis=1))[0]
+    col_idx = np.nonzero(mask.any(axis=0))[0]
+    if min(row_idx.size, col_idx.size) == 0:
+        return None
+
+    top, bottom = int(row_idx[0]), int(row_idx[-1])
+    left, right = int(col_idx[0]), int(col_idx[-1])
+    return (
+        float(right - left + 1),
+        float(bottom - top + 1),
+        float((left + right) / 2.0),
+        float((top + bottom) / 2.0),
+    )
+
+
+def sequence_bbox_metrics(
+    heatmap: Float[FloatArray, "frames channels height width"],
+    threshold: float = THRESHOLD,
+) -> dict[str, float]:
+    """Summarise the per-frame bounding-box geometry of one heatmap sequence.
+
+    Each frame is thresholded (``> threshold``) on the per-pixel maximum over
+    channels, and separately on channel 0 and channel 1. Frames with no pixel
+    above the threshold contribute nothing to the bbox statistics, and frames
+    whose total intensity is not above ``EPS`` contribute nothing to the
+    clipped-mass ratio.
+
+    Returns:
+        Sequence-level means/stds keyed by metric name. A metric with no
+        contributing frame is reported as ``0.0``.
+    """
+    support = heatmap.max(axis=1)
+    bbox_fields = ("width", "height", "center_x", "center_y")
+    samples: dict[str, list[float]] = {
+        name: []
+        for name in (
+            *bbox_fields,
+            "active_fraction",
+            "cut_mass_ratio",
+            "bone_joint_dx",
+            "bone_joint_dy",
+        )
+    }
+
+    # Channel 0 is treated as the limb ("bone") map and channel 1 as the joint map.
+    for frame, bone_frame, joint_frame in zip(support, heatmap[:, 0], heatmap[:, 1]):
+        mask = frame > threshold
+        bbox = bbox_from_mask(mask)
+        if bbox is not None:
+            for field, value in zip(bbox_fields, bbox):
+                samples[field].append(value)
+            samples["active_fraction"].append(float(mask.mean()))
+
+        total_mass = float(frame.sum())
+        if total_mass > EPS:
+            # Intensity in the SIDE_CUT left-most plus SIDE_CUT right-most columns.
+            clipped_mass = float(frame[:, :SIDE_CUT].sum() + frame[:, -SIDE_CUT:].sum())
+            samples["cut_mass_ratio"].append(clipped_mass / total_mass)
+
+        bone_bbox = bbox_from_mask(bone_frame > threshold)
+        joint_bbox = bbox_from_mask(joint_frame > threshold)
+        if bone_bbox is None or joint_bbox is None:
+            continue
+        # Indices 2 and 3 of a bbox tuple are center_x and center_y.
+        samples["bone_joint_dx"].append(abs(bone_bbox[2] - joint_bbox[2]))
+        samples["bone_joint_dy"].append(abs(bone_bbox[3] - joint_bbox[3]))
+
+    def reduce(name: str, how: str) -> float:
+        """Collapse one sample list with float32 ``mean`` or ``std``; empty lists give 0.0."""
+        values = np.asarray(list(samples[name]), dtype=np.float32)
+        if values.size == 0:
+            return 0.0
+        return float(values.mean() if how == "mean" else values.std())
+
+    return {
+        "width_mean": reduce("width", "mean"),
+        "height_mean": reduce("height", "mean"),
+        "center_x_std": reduce("center_x", "std"),
+        "center_y_std": reduce("center_y", "std"),
+        "width_std": reduce("width", "std"),
+        "height_std": reduce("height", "std"),
+        "active_fraction_mean": reduce("active_fraction", "mean"),
+        "cut_mass_ratio_mean": reduce("cut_mass_ratio", "mean"),
+        "bone_joint_dx_mean": reduce("bone_joint_dx", "mean"),
+        "bone_joint_dy_mean": reduce("bone_joint_dy", "mean"),
+    }
+
+
+def softmax_rows(logits: NDArray[np.float64]) -> NDArray[np.float64]:
+    """Row-wise softmax; each row's maximum is subtracted before exponentiating."""
+    row_max = logits.max(axis=1, keepdims=True)
+    unnormalized = np.exp(logits - row_max)
+    row_total = unnormalized.sum(axis=1, keepdims=True)
+    return unnormalized / row_total
+
+
+def fit_softmax_regression(
+    x: NDArray[np.float64],
+    y: NDArray[np.int64],
+    num_classes: int,
+    steps: int = 4000,
+    lr: float = 0.05,
+    reg: float = 1e-4,
+) -> tuple[NDArray[np.float64], NDArray[np.float64]]:
+    """Fit a linear softmax classifier with full-batch gradient descent.
+
+    Args:
+        x: Feature matrix, one row per sample.
+        y: Integer class id per sample, used to index one-hot targets.
+        num_classes: Number of output classes.
+        steps: Number of gradient steps.
+        lr: Step size.
+        reg: L2 penalty coefficient, applied to the weights only (not the bias).
+
+    Returns:
+        ``(weights, bias)`` with shapes ``(features, num_classes)`` and ``(num_classes,)``.
+    """
+    sample_count, feature_count = x.shape[0], x.shape[1]
+    weights = np.zeros((feature_count, num_classes), dtype=np.float64)
+    bias = np.zeros(num_classes, dtype=np.float64)
+    targets = np.eye(num_classes, dtype=np.float64)[y]
+
+    for _step in range(steps):
+        residual = softmax_rows(x @ weights + bias) - targets
+        weight_grad = (x.T @ residual) / sample_count + reg * weights
+        bias_grad = residual.mean(axis=0)
+        weights = weights - lr * weight_grad
+        bias = bias - lr * bias_grad
+
+    return weights, bias
+
+
+def evaluate_predictions(
+    y_true: NDArray[np.int64],
+    y_pred: NDArray[np.int64],
+    num_classes: int,
+) -> dict[str, float]:
+    """Score predictions with accuracy and macro-averaged precision / recall / F1, in percent.
+
+    A class with no predicted (or no true) samples gets precision (or recall)
+    0 because the denominators are floored at one.
+    """
+    accuracy = float((y_true == y_pred).mean())
+
+    per_class: list[tuple[float, float, float]] = []
+    for class_id in range(num_classes):
+        is_true = y_true == class_id
+        is_pred = y_pred == class_id
+        not_true = y_true != class_id
+        not_pred = y_pred != class_id
+        tp = int((is_true & is_pred).sum())
+        fp = int((not_true & is_pred).sum())
+        fn = int((is_true & not_pred).sum())
+        precision = tp / max(tp + fp, 1)
+        recall = tp / max(tp + fn, 1)
+        f1 = 2 * precision * recall / max(precision + recall, EPS)
+        per_class.append((precision, recall, f1))
+
+    precisions = [scores[0] for scores in per_class]
+    recalls = [scores[1] for scores in per_class]
+    f1s = [scores[2] for scores in per_class]
+
+    return {
+        "accuracy": 100.0 * accuracy,
+        "macro_precision": 100.0 * float(np.mean(precisions)),
+        "macro_recall": 100.0 * float(np.mean(recalls)),
+        "macro_f1": 100.0 * float(np.mean(f1s)),
+    }
+
+
+def analyze() -> dict[str, object]:
+    """Gather the statistics that back the dataset report.
+
+    Walks the pose pickles for per-split / per-label sequence counts and
+    confidence statistics, then walks the heatmap exports for bounding-box
+    metrics and fits a linear softmax probe on the per-sequence PAV vectors
+    (fit on train sequences, scored on test sequences).
+
+    Sequences whose subject id appears in neither ``TRAIN_SET`` nor
+    ``TEST_SET`` are skipped and reported on stderr, instead of being
+    silently counted as test data.
+
+    Returns:
+        A dict with the keys ``split_label_counts``, ``pose_confidence_mean``,
+        ``pose_valid_ratio_mean``, ``pav_label_means``, ``pav_softmax_probe``
+        and ``heatmap_metrics``; main() serialises it with ``json.dumps``.
+    """
+    train_ids, test_ids = load_partition_ids()
+
+    def split_of(pid: str) -> str | None:
+        """Map a subject id to its split, or ``None`` when the partition does not list it."""
+        if pid in train_ids:
+            return "train"
+        if pid in test_ids:
+            return "test"
+        return None
+
+    # NOTE(review): `pose_transform` is not used anywhere below; the calls are kept
+    # in case the helpers validate the config. Confirm, then drop if they do not.
+    heatmap_cfg = drf_prep.load_heatmap_cfg(str(HEATMAP_CFG_PATH))
+    pose_transform = drf_prep.build_pose_transform(heatmap_cfg)
+
+    split_label_counts: dict[str, dict[str, int]] = {
+        "train": defaultdict(int),
+        "test": defaultdict(int),
+    }
+    pose_quality: dict[str, dict[str, RunningStats]] = {
+        "train": defaultdict(RunningStats),
+        "test": defaultdict(RunningStats),
+    }
+    valid_ratio: dict[str, dict[str, RunningStats]] = {
+        "train": defaultdict(RunningStats),
+        "test": defaultdict(RunningStats),
+    }
+    unpartitioned = 0
+
+    for pose_path in iter_pose_paths():
+        key = sequence_key_from_path(pose_path)
+        split = split_of(key.pid)
+        if split is None:
+            unpartitioned += 1
+            continue
+        split_label_counts[split][key.label] += 1
+
+        pose = drf_prep.read_pose(str(pose_path))
+        # Use the third value of the last axis as confidence when present; otherwise treat every entry as fully confident.
+        conf = pose[..., 2] if pose.shape[-1] >= 3 else np.ones(pose.shape[:-1], dtype=np.float32)
+        # Each sequence contributes one sample (its own mean), so sequences are weighted equally.
+        pose_quality[split][key.label].update(float(conf.mean()))
+        valid_ratio[split][key.label].update(float((conf > 0.05).mean()))
+
+    heatmap_metrics: dict[str, list[float]] = defaultdict(list)
+    pav_vectors_train: list[NDArray[np.float64]] = []
+    pav_vectors_test: list[NDArray[np.float64]] = []
+    labels_train: list[int] = []
+    labels_test: list[int] = []
+    pav_means: dict[str, list[float]] = defaultdict(list)
+
+    for heatmap_path in iter_heatmap_paths():
+        key = sequence_key_from_path(heatmap_path)
+        split = split_of(key.pid)
+        if split is None:
+            unpartitioned += 1
+            continue
+        heatmap = np.asarray(read_pickle(heatmap_path), dtype=np.float32)
+        metrics = sequence_bbox_metrics(heatmap)
+        for metric_name, metric_value in metrics.items():
+            heatmap_metrics[f"{split}.{metric_name}"].append(metric_value)
+            heatmap_metrics[f"all.{metric_name}"].append(metric_value)
+
+        pav_path = heatmap_path.with_name("1_pav.pkl")
+        pav_seq = np.asarray(read_pickle(pav_path), dtype=np.float32)
+        # Only the first entry of the PAV file is used as the sequence-level vector.
+        pav_vector = pav_seq[0].reshape(-1).astype(np.float64)
+        pav_means[key.label].append(float(pav_vector.mean()))
+        if split == "train":
+            pav_vectors_train.append(pav_vector)
+            labels_train.append(LABEL_TO_INT[key.label])
+        else:
+            pav_vectors_test.append(pav_vector)
+            labels_test.append(LABEL_TO_INT[key.label])
+
+    if unpartitioned:
+        print(
+            f"Skipped {unpartitioned} files whose subject id is in neither TRAIN_SET nor TEST_SET",
+            file=sys.stderr,
+        )
+
+    x_train = np.stack(pav_vectors_train, axis=0)
+    x_test = np.stack(pav_vectors_test, axis=0)
+    y_train = np.asarray(labels_train, dtype=np.int64)
+    y_test = np.asarray(labels_test, dtype=np.int64)
+
+    # Standardise with train-split statistics only, then reuse them for the test split.
+    mean = x_train.mean(axis=0, keepdims=True)
+    std = np.maximum(x_train.std(axis=0, keepdims=True), EPS)
+    x_train_std = (x_train - mean) / std
+    x_test_std = (x_test - mean) / std
+    weights, bias = fit_softmax_regression(x_train_std, y_train, num_classes=3)
+    y_pred = np.argmax(x_test_std @ weights + bias, axis=1).astype(np.int64)
+    pav_classifier = evaluate_predictions(y_test, y_pred, num_classes=3)
+
+    results: dict[str, object] = {
+        "split_label_counts": split_label_counts,
+        "pose_confidence_mean": {
+            split: {label: stats.mean for label, stats in per_label.items()}
+            for split, per_label in pose_quality.items()
+        },
+        "pose_valid_ratio_mean": {
+            split: {label: stats.mean for label, stats in per_label.items()}
+            for split, per_label in valid_ratio.items()
+        },
+        "pav_label_means": {
+            label: float(np.mean(values))
+            for label, values in pav_means.items()
+        },
+        "pav_softmax_probe": pav_classifier,
+        "heatmap_metrics": {
+            key: {
+                "mean": float(np.mean(values)),
+                "p95": float(np.percentile(values, 95)),
+            }
+            for key, values in heatmap_metrics.items()
+        },
+    }
+    return results
+
+
+def format_report(results: dict[str, object]) -> str:
+    """Render the dict returned by analyze() as the markdown dataset report.
+
+    Args:
+        results: Must contain ``split_label_counts``, ``pose_confidence_mean``,
+            ``pose_valid_ratio_mean``, ``heatmap_metrics`` (with ``all.<metric>``
+            entries holding ``mean`` and ``p95``), ``pav_softmax_probe`` and
+            ``pav_label_means``.
+
+    Returns:
+        The full markdown document as one string.
+
+    Raises:
+        KeyError: If an expected section, split, label or metric is missing.
+    """
+    split_counts = results["split_label_counts"]
+    pose_conf = results["pose_confidence_mean"]
+    pose_valid = results["pose_valid_ratio_mean"]
+    heat = results["heatmap_metrics"]
+    pav_probe = results["pav_softmax_probe"]
+    pav_means = results["pav_label_means"]
+
+    def heat_stat(name: str) -> tuple[float, float]:
+        """Return ``(mean, p95)`` of one metric aggregated over all sequences."""
+        entry = heat[f"all.{name}"]
+        return entry["mean"], entry["p95"]
+
+    center_x_std_mean, center_x_std_p95 = heat_stat("center_x_std")
+    center_y_std_mean, center_y_std_p95 = heat_stat("center_y_std")
+    width_std_mean, width_std_p95 = heat_stat("width_std")
+    height_std_mean, height_std_p95 = heat_stat("height_std")
+    cut_ratio_mean, cut_ratio_p95 = heat_stat("cut_mass_ratio_mean")
+    bone_joint_dx_mean, bone_joint_dx_p95 = heat_stat("bone_joint_dx_mean")
+    bone_joint_dy_mean, bone_joint_dy_p95 = heat_stat("bone_joint_dy_mean")
+    width_mean, width_p95 = heat_stat("width_mean")
+    height_mean, height_p95 = heat_stat("height_mean")
+    active_fraction_mean, active_fraction_p95 = heat_stat("active_fraction_mean")
+
+    # The figures quoted in the "Reading" section are interpolated from the
+    # measured values so they cannot drift from the tables above; the
+    # qualitative wording around them is still static text.
+    return f"""# Scoliosis1K Dataset Analysis (1:1:8, shared-align skeleton maps)
+
+## Split
+
+Train counts:
+- negative: {split_counts["train"]["negative"]}
+- neutral: {split_counts["train"]["neutral"]}
+- positive: {split_counts["train"]["positive"]}
+
+Test counts:
+- negative: {split_counts["test"]["negative"]}
+- neutral: {split_counts["test"]["neutral"]}
+- positive: {split_counts["test"]["positive"]}
+
+## Raw pose quality
+
+Mean keypoint confidence by split/class:
+- train negative: {pose_conf["train"]["negative"]:.4f}
+- train neutral: {pose_conf["train"]["neutral"]:.4f}
+- train positive: {pose_conf["train"]["positive"]:.4f}
+- test negative: {pose_conf["test"]["negative"]:.4f}
+- test neutral: {pose_conf["test"]["neutral"]:.4f}
+- test positive: {pose_conf["test"]["positive"]:.4f}
+
+Mean valid-joint ratio (`conf > 0.05`) by split/class:
+- train negative: {pose_valid["train"]["negative"]:.4f}
+- train neutral: {pose_valid["train"]["neutral"]:.4f}
+- train positive: {pose_valid["train"]["positive"]:.4f}
+- test negative: {pose_valid["test"]["negative"]:.4f}
+- test neutral: {pose_valid["test"]["neutral"]:.4f}
+- test positive: {pose_valid["test"]["positive"]:.4f}
+
+## PAV signal
+
+Mean normalized PAV value by label:
+- negative: {pav_means["negative"]:.4f}
+- neutral: {pav_means["neutral"]:.4f}
+- positive: {pav_means["positive"]:.4f}
+
+Linear softmax probe over sequence-level PAV (fit on the train split, evaluated on the test split):
+- accuracy: {pav_probe["accuracy"]:.2f}%
+- macro precision: {pav_probe["macro_precision"]:.2f}%
+- macro recall: {pav_probe["macro_recall"]:.2f}%
+- macro F1: {pav_probe["macro_f1"]:.2f}%
+
+## Shared-align heatmap geometry
+
+Combined support bbox stats over all sequences:
+- width mean / p95: {width_mean:.2f} / {width_p95:.2f}
+- height mean / p95: {height_mean:.2f} / {height_p95:.2f}
+- active fraction mean / p95: {active_fraction_mean:.4f} / {active_fraction_p95:.4f}
+
+Per-sequence temporal jitter (std over frames):
+- center-x std mean / p95: {center_x_std_mean:.3f} / {center_x_std_p95:.3f}
+- center-y std mean / p95: {center_y_std_mean:.3f} / {center_y_std_p95:.3f}
+- width std mean / p95: {width_std_mean:.3f} / {width_std_p95:.3f}
+- height std mean / p95: {height_std_mean:.3f} / {height_std_p95:.3f}
+
+Residual limb-vs-joint bbox-center mismatch after shared alignment:
+- dx mean / p95: {bone_joint_dx_mean:.3f} / {bone_joint_dx_p95:.3f}
+- dy mean / p95: {bone_joint_dy_mean:.3f} / {bone_joint_dy_p95:.3f}
+
+Estimated intensity mass in the columns removed by `BaseSilCuttingTransform`:
+- mean clipped-mass ratio: {cut_ratio_mean:.4f}
+- p95 clipped-mass ratio: {cut_ratio_p95:.4f}
+
+## Reading
+
+- The raw pose data does not look broken. Confidence and valid-joint ratios are high and similar across classes.
+- The sequence-level PAV still carries useful label signal, so the dataset is not devoid of scoliosis information.
+- Shared alignment removed the old limb-vs-joint registration bug; residual channel-center mismatch is now small.
+- The remaining suspicious area is the visual branch: the skeleton map still has frame-to-frame bbox jitter, and the support bbox is almost full-height (`~{height_mean:.1f} / 64`) and fairly dense (`~{100.0 * active_fraction_mean:.0f}%` active pixels), which may be washing out subtle asymmetry cues.
+- `BaseSilCuttingTransform` does not appear to be the main failure source for this shared-align export; the measured mass in the removed side margins is near zero.
+- The dataset itself looks usable; the bigger issue still appears to be how the current skeleton-map preprocessing/runtime path presents that data to ScoNet.
+"""
+
+
+def main() -> None:
+    """Run the analysis, then write the markdown report and the raw JSON results."""
+    results = analyze()
+    # Create the output directories first so a missing folder cannot discard
+    # the finished (and slow to recompute) analysis at write time.
+    for target in (REPORT_PATH, JSON_PATH):
+        target.parent.mkdir(parents=True, exist_ok=True)
+    REPORT_PATH.write_text(format_report(results), encoding="utf-8")
+    JSON_PATH.write_text(json.dumps(results, indent=2, sort_keys=True), encoding="utf-8")
+    print(f"Wrote {REPORT_PATH}")
+    print(f"Wrote {JSON_PATH}")
+
+
+if __name__ == "__main__":
+    main()
@@ -0,0 +1,74 @@
+# Scoliosis1K Dataset Analysis (1:1:8, shared-align skeleton maps)
+
+## Split
+
+Train counts:
+- negative: 596
+- neutral: 74
+- positive: 74
+
+Test counts:
+- negative: 204
+- neutral: 126
+- positive: 419
+
+## Raw pose quality
+
+Mean keypoint confidence by split/class:
+- train negative: 0.9016
+- train neutral: 0.9023
+- train positive: 0.8987
+- test negative: 0.9009
+- test neutral: 0.9020
+- test positive: 0.8999
+
+Mean valid-joint ratio (`conf > 0.05`) by split/class:
+- train negative: 1.0000
+- train neutral: 1.0000
+- train positive: 1.0000
+- test negative: 1.0000
+- test neutral: 1.0000
+- test positive: 1.0000
+
+## PAV signal
+
+Mean normalized PAV value by label:
+- negative: 0.3068
+- neutral: 0.3546
+- positive: 0.3635
+
+Linear softmax probe over sequence-level PAV (fit on the train split, evaluated on the test split):
+- accuracy: 50.87%
+- macro precision: 50.50%
+- macro recall: 48.19%
+- macro F1: 39.88%
+
+## Shared-align heatmap geometry
+
+Combined support bbox stats over all sequences:
+- width mean / p95: 32.13 / 33.57
+- height mean / p95: 61.52 / 61.61
+- active fraction mean / p95: 0.3634 / 0.3738
+
+Per-sequence temporal jitter (std over frames):
+- center-x std mean / p95: 0.864 / 1.243
+- center-y std mean / p95: 0.516 / 0.704
+- width std mean / p95: 2.152 / 2.804
+- height std mean / p95: 0.507 / 0.545
+
+Residual limb-vs-joint bbox-center mismatch after shared alignment:
+- dx mean / p95: 0.195 / 0.229
+- dy mean / p95: 0.251 / 0.357
+
+Estimated intensity mass in the columns removed by `BaseSilCuttingTransform`:
+- mean clipped-mass ratio: 0.0000
+- p95 clipped-mass ratio: 0.0000
+
+## Reading
+
+- The raw pose data does not look broken. Confidence and valid-joint ratios are high and similar across classes.
+- The sequence-level PAV still carries useful label signal, so the dataset is not devoid of scoliosis information.
+- Shared alignment removed the old limb-vs-joint registration bug; residual channel-center mismatch is now small.
+- The remaining suspicious area is the visual branch: the skeleton map still has frame-to-frame bbox jitter, and the support bbox is almost full-height (`~61.5 / 64`) and fairly dense (`~36%` active pixels), which may be washing out subtle asymmetry cues.
+- `BaseSilCuttingTransform` does not appear to be the main failure source for this shared-align export; the measured mass in the removed side margins is near zero.
+- The dataset itself looks usable; the bigger issue still appears to be how the current skeleton-map preprocessing/runtime path presents that data to ScoNet.