feat: add fixed-pool scoliosis partition helper

2026-03-14 17:45:31 +08:00
parent 4a12bd64b9
commit 7b98e066e4
4 changed files with 1206 additions and 0 deletions
@@ -69,6 +69,33 @@ python -m torch.distributed.launch --nproc_per_node=4 \
 opengait/main.py --cfgs configs/sconet/sconet_scoliosis1k.yaml --phase test --log_to_file
 ```

+### Fixed-pool ratio comparison
+
+If you want to compare `1:1:2` against `1:1:8` without changing the evaluation
+pool, do not compare `Scoliosis1K_112.json` against `Scoliosis1K_118.json`
+directly. Those two files differ substantially in train/test membership.
+
+For a cleaner same-pool comparison, use:
+
+* `datasets/Scoliosis1K/Scoliosis1K_118.json`
+  * original `1:1:8` split
+* `datasets/Scoliosis1K/Scoliosis1K_118_fixedpool_train112.json`
+  * same `TEST_SET` as `118`
+  * same positive/neutral `TRAIN_SET` ids as `118`
+  * downsampled `TRAIN_SET` negatives to `148`, giving train counts
+    `74 positive / 74 neutral / 148 negative`
+
+The helper used to generate that derived partition is:
+
+```bash
+uv run python scripts/build_scoliosis_fixedpool_partition.py \
+  --base-partition datasets/Scoliosis1K/Scoliosis1K_118.json \
+  --dataset-root /mnt/public/data/Scoliosis1K/Scoliosis1K-sil-pkl \
+  --negative-multiplier 2 \
+  --output-path datasets/Scoliosis1K/Scoliosis1K_118_fixedpool_train112.json \
+  --seed 118
+```
+
 ### Modality sanity check

 The silhouette and skeleton-map pipelines are different experiments and should not be mixed when you interpret results.
@@ -212,6 +212,13 @@ One practical caveat on `1:1:2` vs `1:1:8` comparisons in this repo:
 - so local `112` vs `118` results should not be overinterpreted as a pure
  class-balance ablation unless the train/test pool is explicitly held fixed

+To support a clean same-pool comparison, the repo now also includes:
+- `datasets/Scoliosis1K/Scoliosis1K_118_fixedpool_train112.json`
+
+That partition keeps the full `118` `TEST_SET` unchanged and keeps the same
+positive/neutral `TRAIN_SET` ids as `118`, but downsamples `TRAIN_SET` negatives
+to `148` so the train ratio becomes `74 / 74 / 148` (`1:1:2`).
+
 The strongest recovered result:
 - `80.24 / 76.73 / 76.40 / 76.56`

@@ -0,0 +1,121 @@
+from __future__ import annotations
+
+import json
+import random
+from collections import Counter
+from pathlib import Path
+from typing import TypedDict, cast
+
+import click
+
+
+class Partition(TypedDict):
+    """Shape of a partition JSON file as read and written by this script."""

+    # Subject ids (pids) that make up the training pool.
+    TRAIN_SET: list[str]
+    # Subject ids (pids) that make up the test pool; copied through unchanged.
+    TEST_SET: list[str]
+
+
+def infer_pid_label(dataset_root: Path, pid: str) -> str:
+    """Return the class label of ``pid`` as encoded by its directory layout.

+    The label is the lower-cased name of the single sub-directory found
+    directly under ``dataset_root / pid``.

+    Raises:
+        FileNotFoundError: ``dataset_root / pid`` does not exist.
+        ValueError: the pid directory does not hold exactly one
+            sub-directory, or that sub-directory is not named
+            positive / neutral / negative (compared case-insensitively).
+    """
+    subject_dir = dataset_root / pid
+    if not subject_dir.exists():
+        raise FileNotFoundError(f"PID root not found under dataset root: {subject_dir}")

+    # Only directories are considered; plain files beside the class dir are ignored.
+    class_names = [child.name.lower() for child in subject_dir.iterdir() if child.is_dir()]
+    class_names.sort()
+    if len(class_names) != 1:
+        raise ValueError(f"Expected exactly one class dir for pid {pid}, got {class_names}")

+    (label,) = class_names
+    if label in ("positive", "neutral", "negative"):
+        return label
+    raise ValueError(f"Unexpected label directory for pid {pid}: {label}")
+
+
+@click.command()
+@click.option(
+    "--base-partition",
+    type=click.Path(path_type=Path, exists=True, dir_okay=False),
+    required=True,
+    help="Path to the source partition JSON, e.g. datasets/Scoliosis1K/Scoliosis1K_118.json",
+)
+@click.option(
+    "--dataset-root",
+    type=click.Path(path_type=Path, exists=True, file_okay=False),
+    required=True,
+    help="Dataset root used to infer each pid class label, e.g. /mnt/public/data/Scoliosis1K/Scoliosis1K-sil-pkl",
+)
+@click.option(
+    "--negative-multiplier",
+    # Reject negative values at the CLI boundary; otherwise they only fail
+    # later inside random.sample with an unrelated-looking message.
+    type=click.IntRange(min=0),
+    required=True,
+    help="Target negative count as a multiple of the positive/neutral count, e.g. 2 for 1:1:2",
+)
+@click.option(
+    "--output-path",
+    type=click.Path(path_type=Path, dir_okay=False),
+    required=True,
+    help="Path to write the derived partition JSON.",
+)
+@click.option(
+    "--seed",
+    type=int,
+    default=118,
+    show_default=True,
+    help="Random seed used when downsampling negatives.",
+)
+def main(
+    base_partition: Path,
+    dataset_root: Path,
+    negative_multiplier: int,
+    output_path: Path,
+    seed: int,
+) -> None:
+    """Derive a fixed-pool partition by downsampling the train negatives.

+    TEST_SET and the positive/neutral TRAIN_SET ids are copied from the base
+    partition; only the negative TRAIN_SET ids are randomly reduced to
+    NEGATIVE_MULTIPLIER times the positive count.
+    """
+    with base_partition.open("r", encoding="utf-8") as handle:
+        partition = cast(Partition, json.load(handle))

+    train_ids = list(partition["TRAIN_SET"])
+    test_ids = list(partition["TEST_SET"])

+    # Bucket every train pid by the label encoded in its dataset directory.
+    train_by_label: dict[str, list[str]] = {"positive": [], "neutral": [], "negative": []}
+    for pid in train_ids:
+        train_by_label[infer_pid_label(dataset_root, pid)].append(pid)

+    # Label the test pool before anything is written, so a missing or
+    # malformed test pid aborts the run without leaving an output file behind.
+    test_counts = Counter(infer_pid_label(dataset_root, pid) for pid in test_ids)

+    pos_count = len(train_by_label["positive"])
+    neu_count = len(train_by_label["neutral"])
+    neg_count = len(train_by_label["negative"])
+    if pos_count != neu_count:
+        raise ValueError(
+            "This helper assumes equal positive/neutral train counts so that only "
+            + f"negative downsampling changes the ratio. Got positive={pos_count}, neutral={neu_count}."
+        )

+    target_negative_count = negative_multiplier * pos_count
+    if target_negative_count > neg_count:
+        raise ValueError(
+            f"Requested {target_negative_count} negatives but only {neg_count} are available "
+            + f"in base partition {base_partition}."
+        )

+    # NOTE: the draw depends on the order of negatives in the base partition.
+    # Do not sort before sampling: that would change which ids a given seed
+    # selects and break reproduction of previously generated partitions.
+    rng = random.Random(seed)
+    sampled_negatives = sorted(rng.sample(train_by_label["negative"], target_negative_count))
+    derived_train = (
+        sorted(train_by_label["positive"])
+        + sorted(train_by_label["neutral"])
+        + sampled_negatives
+    )

+    derived_partition: Partition = {
+        "TRAIN_SET": derived_train,
+        "TEST_SET": test_ids,
+    }
+    output_path.parent.mkdir(parents=True, exist_ok=True)
+    with output_path.open("w", encoding="utf-8") as handle:
+        json.dump(derived_partition, handle, indent=2)
+        _ = handle.write("\n")

+    # Train labels are already known from the bucketing above, so the summary
+    # is built from those counts instead of re-reading the dataset tree.
+    # Zero-count labels are omitted, as a Counter over the ids would do.
+    train_counts = {
+        label: count
+        for label, count in (
+            ("positive", pos_count),
+            ("neutral", neu_count),
+            ("negative", target_negative_count),
+        )
+        if count
+    }
+    click.echo(f"wrote {output_path}")
+    click.echo(f"train_counts={train_counts}")
+    click.echo(f"test_counts={dict(test_counts)}")
+
+
+# Invoke the click command only when run as a script, not when imported.
+if __name__ == "__main__":
+    main()