PyTorch¶

ddacs.pytorch.DDACSDataset is a torch.utils.data.IterableDataset over a Croissant view. It is available only when the [torch] extra is installed (see Installation).

`DDACSDataset`¶

`DDACSDataset` ¶

Bases: IterableDataset

Streaming PyTorch dataset for a single Croissant view.

Yields a dict[str, numpy.ndarray] per simulation. Field selection (which dataset path, optional timestep slicing) is derived from the Croissant view + field-map; no manual extraction code lives here.

Sharding is decided inside __iter__ via get_worker_info() and torch.distributed, so the same instance works under num_workers=0, num_workers=N and DDP without constructor changes.

Parameters:

Name	Type	Description	Default
`view`	`str`	Name of the RecordSet to stream (e.g. "springback-minimal").	required
`source`	`str \| Path \| None`	Override the manifest URL / path. Defaults to the resolution chain in `ddacs.croissant.resolve_source`.	`None`
`data_dir`	`str \| Path \| None`	Override the local data directory. Defaults to `ddacs.spec.DDACS_SPEC.default_data_dir`. Pass `None` to skip local-file discovery.	`DEFAULT_DATA_DIR`
`dataset`		A pre-loaded `mlcroissant.Dataset` (e.g. one mutated by `ddacs.add_view`). When supplied, `source`/`data_dir` are ignored for the manifest parse and the caller's object is used as-is. This is the way to stream a custom view from `DDACSDataset`.	`None`
`sim_ids`	`list[int] \| None`	Explicit allowlist of simulation ids to stream (default: every sim in `process_parameters.csv`).	`None`
`where`	`Callable[[Series], bool] \| None`	Predicate applied to each `process_parameters.csv` row before any zip is opened. Combined with `sim_ids` if both are given.	`None`
`shuffle`	`bool`	If true, each shard shuffles its own sim_id list with a seed derived from `seed`, the current epoch and `shard_id` (`seed + epoch * 1_000_003 + shard_id`). Call `set_epoch` between epochs to get a fresh permutation.	`False`
`seed`	`int`	Base seed for the per-shard shuffle.	`0`

Source code in ddacs/pytorch.py

class DDACSDataset(IterableDataset):
    """Streaming PyTorch dataset for a single Croissant view.

    Yields a `dict[str, numpy.ndarray]` per simulation. Field selection
    (which dataset path, optional timestep slicing) is derived from the
    Croissant view + field-map; no manual extraction code lives here.

    Sharding is decided inside `__iter__` via `get_worker_info()` and
    `torch.distributed`, so the same instance works under
    `num_workers=0`, `num_workers=N` and DDP without constructor changes.

    Args:
        view: Name of the RecordSet to stream (e.g. "springback-minimal").
        source: Override the manifest URL / path. Defaults to the resolution
            chain in `ddacs.croissant.resolve_source`.
        data_dir: Override the local data directory. Defaults to
            `ddacs.spec.DDACS_SPEC.default_data_dir`. Pass `None` to skip local-file
            discovery.
        dataset: A pre-loaded `mlcroissant.Dataset` (e.g. one mutated by
            `ddacs.add_view`). When supplied, `source`/`data_dir` are ignored
            for the manifest parse and the caller's object is used as-is.
            This is the way to stream a custom view from `DDACSDataset`.
        sim_ids: Explicit allowlist of simulation ids to stream (default:
            every sim in `process_parameters.csv`).
        where: Predicate applied to each `process_parameters.csv` row before
            any zip is opened. Combined with `sim_ids` if both are given.
        shuffle: If true, each shard shuffles its own sim_id list with a
            seed derived from `seed`, the current epoch and `shard_id`
            (`seed + epoch * 1_000_003 + shard_id`). Call `set_epoch`
            between epochs to get a fresh permutation.
        seed: Base seed for the per-shard shuffle.
        spec: Dataset specification supplying the field-map RecordSet id,
            the process-parameters file name, the id column and the id
            format used to build `<id>.h5` member names. Defaults to
            `DDACS_SPEC`.

    Raises:
        ValueError: If `view` or the field-map RecordSet is missing from the
            manifest, if a view field sources anything other than a known
            field-map field, if the process-parameters CSV lacks the id
            column, or if `where` is given but no process-parameters CSV is
            found under `data_dir`.
    """

    def __init__(
        self,
        view: str,
        source: str | Path | None = None,
        data_dir: str | Path | None = DEFAULT_DATA_DIR,
        dataset=None,
        sim_ids: list[int] | None = None,
        where: Callable[[pd.Series], bool] | None = None,
        shuffle: bool = False,
        seed: int = 0,
        spec: DatasetSpec = DDACS_SPEC,
    ):
        super().__init__()
        self.view = view
        self.source = source
        self.data_dir = Path(data_dir) if data_dir is not None else None
        self.where = where
        self.shuffle = shuffle
        self.seed = seed
        self.spec = spec
        self._epoch = 0

        # Use a caller-provided `dataset` (e.g. one mutated by `ddacs.add_view`)
        # so the custom view is visible here; otherwise parse the manifest fresh.
        ds = (
            dataset
            if dataset is not None
            else _croissant.load(source=source, data_dir=data_dir, spec=spec)
        )
        # Order matters: `_resolve_sim_ids` falls back to the keys of
        # `self._h5_index` when no CSV and no explicit ids are available.
        self._field_specs = self._build_field_specs(ds)
        self._h5_index = self._build_h5_index(ds.mapping or {})
        self._sim_ids = self._resolve_sim_ids(sim_ids)
        if sim_ids is not None:
            from .streaming import _warn_missing

            _warn_missing(sim_ids, self._sim_ids, self._h5_index, self.data_dir)

    # --- public ------------------------------------------------------------

    def set_epoch(self, epoch: int) -> None:
        """Update the epoch used in shuffle seeding.

        Call once per epoch (analogous to `DistributedSampler.set_epoch`)
        to get a different per-shard permutation each pass.
        """
        self._epoch = int(epoch)

    def __iter__(self):
        """Yield one extracted record per simulation assigned to this shard.

        Simulations without an entry in the zip index, or whose `.h5` member
        is absent from the indexed zip, are skipped silently.
        """
        shard_id, total_shards = self._shard_position()
        # Strided slice: shard k takes every `total_shards`-th id from k on.
        my_ids = self._sim_ids[shard_id::total_shards]

        if self.shuffle:
            rng = random.Random(self.seed + self._epoch * 1_000_003 + shard_id)
            my_ids = list(my_ids)
            rng.shuffle(my_ids)

        # Reuse the open zip across consecutive sims that land in it. Corner
        # blocks group ~2200 sims per zip, so the cache hits nearly every time.
        # Only the most recent zip is kept open, so with `shuffle=True` ids
        # that alternate between zips reopen the archive on each switch.
        last_path: str | None = None
        last_zf: zipfile.ZipFile | None = None
        try:
            for sim_id in my_ids:
                zip_path = self._h5_index.get(int(sim_id))
                if zip_path is None:
                    continue
                if zip_path != last_path:
                    if last_zf is not None:
                        last_zf.close()
                    last_zf = zipfile.ZipFile(zip_path)
                    last_path = zip_path
                try:
                    data = last_zf.read(f"{self.spec.id_format.format(int(sim_id))}.h5")
                except KeyError:
                    continue
                # The member is read fully into memory and opened from bytes.
                with h5py.File(io.BytesIO(data), "r") as f:
                    yield self._extract_record(f)
        finally:
            # Runs on exhaustion, on error, and when the generator is closed.
            if last_zf is not None:
                last_zf.close()

    # --- internals ---------------------------------------------------------

    def _shard_position(self) -> tuple[int, int]:
        """Return `(shard_id, total_shards)` for the current worker × rank."""
        worker = get_worker_info()
        num_workers = worker.num_workers if worker else 1
        worker_id = worker.id if worker else 0
        rank, world = self._ddp_info()
        return rank * num_workers + worker_id, world * num_workers

    @staticmethod
    def _ddp_info() -> tuple[int, int]:
        """Return `(rank, world_size)`, or `(0, 1)` when DDP is not in use.

        The fallback also covers `torch.distributed` being unimportable or
        raising `RuntimeError` while being queried.
        """
        try:
            import torch.distributed as dist

            if dist.is_available() and dist.is_initialized():
                return dist.get_rank(), dist.get_world_size()
        except (ImportError, RuntimeError):
            pass
        return 0, 1

    def _build_field_specs(self, ds) -> dict[str, tuple[str, Any]]:
        """For each view-field, return `(h5_path, slicing)`.

        `slicing` is `None`, an `int` (single timestep), or a `list[int]`
        (multiple timesteps), parsed from the view-field's JSONPath transform.

        Raises:
            ValueError: If the view or the field-map RecordSet is missing, or
                a view field does not source a known field-map field.
        """
        view_rs = next((r for r in ds.metadata.record_sets if r.id == self.view), None)
        if view_rs is None:
            raise ValueError(f"view {self.view!r} not found in manifest")
        fm_rs = next(
            (r for r in ds.metadata.record_sets if r.id == self.spec.field_map_record_set),
            None,
        )
        if fm_rs is None:
            raise ValueError(
                f"{self.spec.field_map_record_set!r} RecordSet missing — manifest is malformed"
            )
        fm = {f.name: f for f in fm_rs.fields}

        specs: dict[str, tuple[str, Any]] = {}
        for f in view_rs.fields:
            # The source uuid has the form "<record_set_id>/<field_id>".
            rs_id, _, source_field_id = f.source.uuid.partition("/")
            if rs_id != self.spec.field_map_record_set:
                raise ValueError(
                    f"view field {f.name!r} sources RecordSet {rs_id!r}, but this dataset "
                    f"only streams {self.spec.field_map_record_set!r} (HDF5) fields. For views "
                    "that include process-parameters metadata columns use streaming.iter_view, "
                    "or build the view from field-map fields only."
                )
            if source_field_id not in fm:
                raise ValueError(f"view field {f.name!r} sources unknown field {source_field_id!r}")
            # The field-map field carries the HDF5 path in its first transform.
            h5_path = fm[source_field_id].source.transforms[0].regex
            slicing = None
            if f.source.transforms:
                slicing = self._parse_jsonpath(f.source.transforms[0].json_path)
            specs[f.name] = (h5_path, slicing)
        return specs

    @staticmethod
    def _parse_jsonpath(expr: str | None) -> Any:
        """Parse `$[N]` -> int, `$[a,b,c]` -> list[int]. Anything else -> None."""
        if not expr:
            return None
        m = _JSONPATH_RE.match(expr)
        if not m:
            return None
        inner = m.group(1)
        # Both branches share the guard so a malformed list such as `$[1,]`
        # yields None (as documented) instead of leaking a ValueError.
        try:
            if "," in inner:
                return [int(s) for s in inner.split(",")]
            return int(inner)
        except ValueError:
            return None

    @staticmethod
    def _build_h5_index(mapping: dict[str, str]) -> dict[int, str]:
        """Map sim_id (int) -> path of the zip containing `<sim_id>.h5`.

        Paths are stored as `str(path)` exactly as they appear in `mapping`.
        Non-zip entries, unreadable zips and members whose stem is not an
        integer are skipped.
        """
        index: dict[int, str] = {}
        for path in mapping.values():
            path_str = str(path)
            if not path_str.endswith(".zip"):
                continue
            try:
                with zipfile.ZipFile(path_str) as zf:
                    for name in zf.namelist():
                        if not name.endswith(".h5"):
                            continue
                        try:
                            sim_id = int(Path(name).stem)
                        except ValueError:
                            continue
                        index[sim_id] = path_str
            except zipfile.BadZipFile:
                continue
        return index

    def _resolve_sim_ids(self, sim_ids_arg: list[int] | None) -> list[int]:
        """Apply sim_ids + where to produce the final ordered list of sim ids.

        The list is built once at construction time (before any worker fork)
        so every shard sees the same ordering.

        Raises:
            ValueError: If the CSV lacks the id column, or `where` is set but
                no process-parameters CSV exists under `data_dir`.
        """
        if self.data_dir is not None:
            csv_path = self.data_dir / self.spec.process_parameters_file
            if csv_path.is_file():
                df = pd.read_csv(csv_path)
                if self.spec.id_column not in df.columns:
                    raise ValueError(f"{csv_path} missing required {self.spec.id_column!r} column")
                if sim_ids_arg is not None:
                    df = df[df[self.spec.id_column].isin(set(sim_ids_arg))]
                # `DataFrame.apply(axis=1)` on a zero-row frame does not give
                # a boolean row mask, and indexing with its result can drop
                # the columns. An empty frame has nothing to filter anyway.
                if self.where is not None and not df.empty:
                    df = df[df.apply(self.where, axis=1)]
                return [int(x) for x in df[self.spec.id_column].tolist()]

        if self.where is not None:
            raise ValueError(
                f"`where` filter requires {self.spec.process_parameters_file} under data_dir"
            )
        if sim_ids_arg is not None:
            return [int(x) for x in sim_ids_arg]
        return sorted(self._h5_index.keys())

    def _extract_record(self, f: h5py.File) -> dict[str, np.ndarray]:
        """Build the `{alias: array}` record for one open HDF5 file."""
        # Cache the full array per h5 path so multiple view fields sharing a
        # source field (e.g. `forming` + `springback` both reading
        # `OP10/blank/node_displacement`) only read it once.
        cache: dict[str, np.ndarray] = {}
        rec: dict[str, np.ndarray] = {}
        for alias, (h5_path, slicing) in self._field_specs.items():
            arr = cache.get(h5_path)
            if arr is None:
                arr = f[h5_path][...]
                cache[h5_path] = arr
            rec[alias] = arr[slicing] if slicing is not None else arr
        return rec

`__init__(view: str, source: str | Path | None = None, data_dir: str | Path | None = DEFAULT_DATA_DIR, dataset=None, sim_ids: list[int] | None = None, where: Callable[[pd.Series], bool] | None = None, shuffle: bool = False, seed: int = 0, spec: DatasetSpec = DDACS_SPEC)` ¶

Source code in ddacs/pytorch.py

def __init__(
    self,
    view: str,
    source: str | Path | None = None,
    data_dir: str | Path | None = DEFAULT_DATA_DIR,
    dataset=None,
    sim_ids: list[int] | None = None,
    where: Callable[[pd.Series], bool] | None = None,
    shuffle: bool = False,
    seed: int = 0,
    spec: DatasetSpec = DDACS_SPEC,
):
    """Store the streaming options and resolve which simulations to stream.

    The manifest is taken from `dataset` when one is passed, otherwise it is
    loaded via `_croissant.load`. Field specs, the zip index and the final
    sim-id list are all computed here, once.
    """
    super().__init__()
    self.view, self.source = view, source
    if data_dir is None:
        self.data_dir = None
    else:
        self.data_dir = Path(data_dir)
    self.where, self.shuffle, self.seed = where, shuffle, seed
    self.spec = spec
    self._epoch = 0

    # A caller-supplied manifest (e.g. one mutated by `ddacs.add_view`) wins
    # so that custom views are visible; only parse afresh when none is given.
    if dataset is None:
        manifest = _croissant.load(source=source, data_dir=data_dir, spec=spec)
    else:
        manifest = dataset

    self._field_specs = self._build_field_specs(manifest)
    self._h5_index = self._build_h5_index(manifest.mapping or {})
    self._sim_ids = self._resolve_sim_ids(sim_ids)

    # Missing-id warnings only make sense for an explicit allowlist.
    if sim_ids is None:
        return
    from .streaming import _warn_missing

    _warn_missing(sim_ids, self._sim_ids, self._h5_index, self.data_dir)

`__iter__()` ¶

Source code in ddacs/pytorch.py

def __iter__(self):
    """Yield one extracted record per simulation assigned to this shard.

    Ids with no zip in the index, or whose `.h5` member is absent from the
    indexed zip, are skipped.
    """
    shard_id, total_shards = self._shard_position()
    shard_ids = self._sim_ids[shard_id::total_shards]

    if self.shuffle:
        shard_ids = list(shard_ids)
        shuffle_seed = self.seed + self._epoch * 1_000_003 + shard_id
        random.Random(shuffle_seed).shuffle(shard_ids)

    # Keep the most recently used archive open: consecutive sims usually live
    # in the same zip (corner blocks group ~2200 sims each), so reopening is
    # rarely needed.
    open_path: str | None = None
    open_zip: zipfile.ZipFile | None = None
    try:
        for raw_id in shard_ids:
            wanted = self._h5_index.get(int(raw_id))
            if wanted is None:
                continue
            if wanted != open_path:
                if open_zip is not None:
                    open_zip.close()
                open_zip = zipfile.ZipFile(wanted)
                open_path = wanted
            try:
                payload = open_zip.read(f"{self.spec.id_format.format(int(raw_id))}.h5")
            except KeyError:
                continue
            with h5py.File(io.BytesIO(payload), "r") as h5:
                yield self._extract_record(h5)
    finally:
        if open_zip is not None:
            open_zip.close()

`set_epoch(epoch: int) -> None` ¶

Update the epoch used in shuffle seeding.

Call once per epoch (analogous to DistributedSampler.set_epoch) to get a different per-shard permutation each pass.

Source code in ddacs/pytorch.py

def set_epoch(self, epoch: int) -> None:
    """Record the epoch number that feeds the shuffle seed.

    Intended to be called once per epoch, in the same spirit as
    `DistributedSampler.set_epoch`, so that each pass gets a different
    per-shard permutation.
    """
    epoch_number = int(epoch)
    self._epoch = epoch_number

PyTorch¶

DDACSDataset¶

DDACSDataset ¶

__init__(view: str, source: str | Path | None = None, data_dir: str | Path | None = DEFAULT_DATA_DIR, dataset=None, sim_ids: list[int] | None = None, where: Callable[[pd.Series], bool] | None = None, shuffle: bool = False, seed: int = 0, spec: DatasetSpec = DDACS_SPEC) ¶

__iter__() ¶

set_epoch(epoch: int) -> None ¶

`DDACSDataset`¶

`DDACSDataset` ¶

`__init__(view: str, source: str | Path | None = None, data_dir: str | Path | None = DEFAULT_DATA_DIR, dataset=None, sim_ids: list[int] | None = None, where: Callable[[pd.Series], bool] | None = None, shuffle: bool = False, seed: int = 0, spec: DatasetSpec = DDACS_SPEC)` ¶

`__iter__()` ¶

`set_epoch(epoch: int) -> None` ¶