feat: add fixed-pool scoliosis partition helper
This commit is contained in:
@@ -69,6 +69,33 @@ python -m torch.distributed.launch --nproc_per_node=4 \
|
|||||||
opengait/main.py --cfgs configs/sconet/sconet_scoliosis1k.yaml --phase test --log_to_file
|
opengait/main.py --cfgs configs/sconet/sconet_scoliosis1k.yaml --phase test --log_to_file
|
||||||
```
|
```
|
||||||
|
|
||||||
|
### Fixed-pool ratio comparison
|
||||||
|
|
||||||
|
If you want to compare `1:1:2` against `1:1:8` without changing the evaluation
|
||||||
|
pool, do not compare `Scoliosis1K_112.json` against `Scoliosis1K_118.json`
|
||||||
|
directly. Those two files differ substantially in train/test membership.
|
||||||
|
|
||||||
|
For a cleaner same-pool comparison, use:
|
||||||
|
|
||||||
|
* `datasets/Scoliosis1K/Scoliosis1K_118.json`
|
||||||
|
* original `1:1:8` split
|
||||||
|
* `datasets/Scoliosis1K/Scoliosis1K_118_fixedpool_train112.json`
|
||||||
|
* same `TEST_SET` as `118`
|
||||||
|
* same positive/neutral `TRAIN_SET` ids as `118`
|
||||||
|
* downsampled `TRAIN_SET` negatives to `148`, giving train counts
|
||||||
|
`74 positive / 74 neutral / 148 negative`
|
||||||
|
|
||||||
|
The helper used to generate that derived partition is:
|
||||||
|
|
||||||
|
```bash
|
||||||
|
uv run python scripts/build_scoliosis_fixedpool_partition.py \
|
||||||
|
--base-partition datasets/Scoliosis1K/Scoliosis1K_118.json \
|
||||||
|
--dataset-root /mnt/public/data/Scoliosis1K/Scoliosis1K-sil-pkl \
|
||||||
|
--negative-multiplier 2 \
|
||||||
|
--output-path datasets/Scoliosis1K/Scoliosis1K_118_fixedpool_train112.json \
|
||||||
|
--seed 118
|
||||||
|
```
|
||||||
|
|
||||||
### Modality sanity check
|
### Modality sanity check
|
||||||
|
|
||||||
The silhouette and skeleton-map pipelines are different experiments and should not be mixed when you interpret results.
|
The silhouette and skeleton-map pipelines are different experiments and should not be mixed when you interpret results.
|
||||||
|
|||||||
File diff suppressed because it is too large
Load Diff
@@ -212,6 +212,13 @@ One practical caveat on `1:1:2` vs `1:1:8` comparisons in this repo:
|
|||||||
- so local `112` vs `118` results should not be overinterpreted as a pure
|
- so local `112` vs `118` results should not be overinterpreted as a pure
|
||||||
class-balance ablation unless the train/test pool is explicitly held fixed
|
class-balance ablation unless the train/test pool is explicitly held fixed
|
||||||
|
|
||||||
|
To support a clean same-pool comparison, the repo now also includes:
|
||||||
|
- `datasets/Scoliosis1K/Scoliosis1K_118_fixedpool_train112.json`
|
||||||
|
|
||||||
|
That partition keeps the full `118` `TEST_SET` unchanged and keeps the same
|
||||||
|
positive/neutral `TRAIN_SET` ids as `118`, but downsamples `TRAIN_SET` negatives
|
||||||
|
to `148` so the train ratio becomes `74 / 74 / 148` (`1:1:2`).
|
||||||
|
|
||||||
The strongest recovered result:
|
The strongest recovered result:
|
||||||
- `80.24 / 76.73 / 76.40 / 76.56`
|
- `80.24 / 76.73 / 76.40 / 76.56`
|
||||||
|
|
||||||
|
|||||||
@@ -0,0 +1,121 @@
|
|||||||
|
from __future__ import annotations
|
||||||
|
|
||||||
|
import json
|
||||||
|
import random
|
||||||
|
from collections import Counter
|
||||||
|
from pathlib import Path
|
||||||
|
from typing import TypedDict, cast
|
||||||
|
|
||||||
|
import click
|
||||||
|
|
||||||
|
|
||||||
|
class Partition(TypedDict):
    """Shape of a partition JSON file as read and written by this script."""

    # Subject (pid) identifiers assigned to the training split.
    TRAIN_SET: list[str]
    # Subject (pid) identifiers assigned to the evaluation split.
    TEST_SET: list[str]
|
||||||
|
|
||||||
|
|
||||||
|
def infer_pid_label(dataset_root: Path, pid: str) -> str:
    """Return the class label of *pid*, read from its directory layout.

    The label is the lower-cased name of the single sub-directory found in
    ``dataset_root / pid``; plain files in that directory are ignored.

    Args:
        dataset_root: Directory that contains one sub-directory per pid.
        pid: Subject identifier, used as the directory name under the root.

    Returns:
        One of ``"positive"``, ``"neutral"`` or ``"negative"``.

    Raises:
        FileNotFoundError: If ``dataset_root / pid`` does not exist.
        ValueError: If the pid directory does not contain exactly one
            sub-directory, or that sub-directory is not a known label.
    """
    subject_dir = dataset_root / pid
    if not subject_dir.exists():
        raise FileNotFoundError(f"PID root not found under dataset root: {subject_dir}")

    # Only directories count as class markers; sorting keeps the error
    # message deterministic when several are present.
    class_names = [child.name.lower() for child in subject_dir.iterdir() if child.is_dir()]
    class_names.sort()
    if len(class_names) != 1:
        raise ValueError(f"Expected exactly one class dir for pid {pid}, got {class_names}")

    (only_class,) = class_names
    if only_class in ("positive", "neutral", "negative"):
        return only_class
    raise ValueError(f"Unexpected label directory for pid {pid}: {only_class}")
|
||||||
|
|
||||||
|
|
||||||
|
@click.command()
@click.option(
    "--base-partition",
    type=click.Path(path_type=Path, exists=True, dir_okay=False),
    required=True,
    help="Path to the source partition JSON, e.g. datasets/Scoliosis1K/Scoliosis1K_118.json",
)
@click.option(
    "--dataset-root",
    type=click.Path(path_type=Path, exists=True, file_okay=False),
    required=True,
    help="Dataset root used to infer each pid class label, e.g. /mnt/public/data/Scoliosis1K/Scoliosis1K-sil-pkl",
)
@click.option(
    "--negative-multiplier",
    # A negative multiplier would ask random.sample for a negative sample
    # size; reject it at the CLI boundary with a clear usage error instead.
    type=click.IntRange(min=0),
    required=True,
    help="Target negative count as a multiple of the positive/neutral count, e.g. 2 for 1:1:2",
)
@click.option(
    "--output-path",
    type=click.Path(path_type=Path, dir_okay=False),
    required=True,
    help="Path to write the derived partition JSON.",
)
@click.option(
    "--seed",
    type=int,
    default=118,
    show_default=True,
    help="Random seed used when downsampling negatives.",
)
def main(
    base_partition: Path,
    dataset_root: Path,
    negative_multiplier: int,
    output_path: Path,
    seed: int,
) -> None:
    """Derive a partition with downsampled TRAIN_SET negatives.

    TEST_SET and the positive/neutral TRAIN_SET ids are copied from the base
    partition unchanged; only TRAIN_SET negatives are randomly downsampled to
    NEGATIVE_MULTIPLIER times the positive count.
    """
    # NOTE: click shows the docstring above as the --help text, so it is kept
    # short and user-facing; implementation notes live in comments instead.
    #
    # Raises ValueError when positive/neutral train counts differ or when more
    # negatives are requested than the base partition provides. Errors from
    # infer_pid_label (FileNotFoundError / ValueError) propagate unchanged.
    with base_partition.open("r", encoding="utf-8") as handle:
        partition = cast(Partition, json.load(handle))

    train_ids = list(partition["TRAIN_SET"])
    test_ids = list(partition["TEST_SET"])

    # Bucket the training pids by the label found on disk.
    train_by_label: dict[str, list[str]] = {"positive": [], "neutral": [], "negative": []}
    for pid in train_ids:
        train_by_label[infer_pid_label(dataset_root, pid)].append(pid)

    pos_count = len(train_by_label["positive"])
    neu_count = len(train_by_label["neutral"])
    neg_count = len(train_by_label["negative"])
    if pos_count != neu_count:
        raise ValueError(
            "This helper assumes equal positive/neutral train counts so that only "
            f"negative downsampling changes the ratio. Got positive={pos_count}, neutral={neu_count}."
        )

    target_negative_count = negative_multiplier * pos_count
    if target_negative_count > neg_count:
        raise ValueError(
            f"Requested {target_negative_count} negatives but only {neg_count} are available "
            f"in base partition {base_partition}."
        )

    # Label the test pids *before* writing anything, so a missing or malformed
    # test pid aborts the run without leaving a derived partition behind.
    test_counts = Counter(infer_pid_label(dataset_root, pid) for pid in test_ids)

    # A dedicated RNG keeps the sample reproducible for a given seed without
    # touching the global random state.
    rng = random.Random(seed)
    sampled_negatives = sorted(rng.sample(train_by_label["negative"], target_negative_count))
    derived_train = (
        sorted(train_by_label["positive"])
        + sorted(train_by_label["neutral"])
        + sampled_negatives
    )

    derived_partition = {
        "TRAIN_SET": derived_train,
        "TEST_SET": test_ids,
    }
    output_path.parent.mkdir(parents=True, exist_ok=True)
    with output_path.open("w", encoding="utf-8") as handle:
        json.dump(derived_partition, handle, indent=2)
        _ = handle.write("\n")

    # The train labels are already known from the bucketing above, so the
    # summary is built from those counts rather than re-reading the dataset.
    # Labels with a zero count are omitted, as a Counter over the ids would do.
    train_counts = {
        label: count
        for label, count in (
            ("positive", pos_count),
            ("neutral", neu_count),
            ("negative", target_negative_count),
        )
        if count
    }

    click.echo(f"wrote {output_path}")
    click.echo(f"train_counts={train_counts}")
    click.echo(f"test_counts={dict(test_counts)}")
|
||||||
|
|
||||||
|
|
||||||
|
# Run the click command only when executed as a script, not on import.
if __name__ == "__main__":
    main()
|
||||||
Reference in New Issue
Block a user