Skip to content

Generators

Generator functions for streaming DDACS simulation data.

generators

Simple generator functions for DDACS data streaming.

This module provides lightweight generator functions for iterating over DDACS simulation data without class overhead.

count_available_simulations(data_dir, h5_subdir='h5', metadata_file='metadata.csv')

Count available simulations (with existing H5 files).

Parameters:

Name Type Description Default
data_dir str | Path

Root directory of the dataset.

required
h5_subdir str

Subdirectory containing H5 files (default: "h5").

'h5'
metadata_file str

Name of the metadata CSV file (default: "metadata.csv").

'metadata.csv'

Returns:

Name Type Description
int int

Number of simulations with existing H5 files.

Raises:

Type Description
FileNotFoundError

If the H5 directory or the metadata file does not exist.

Examples:

>>> count = count_available_simulations('/data/ddacs')
>>> print(f"Dataset contains {count} available simulations")
Source code in ddacs/generators.py
def count_available_simulations(
    data_dir: str | Path,
    h5_subdir: str = "h5",
    metadata_file: str = "metadata.csv",
) -> int:
    """
    Count available simulations (with existing H5 files).

    Args:
        data_dir: Root directory of the dataset.
        h5_subdir: Subdirectory containing H5 files (default: "h5").
        metadata_file: Name of the metadata CSV file (default: "metadata.csv").

    Returns:
        int: Number of simulations with existing H5 files.

    Raises:
        FileNotFoundError: If the H5 directory or metadata file don't exist.

    Examples:
        >>> count = count_available_simulations('/data/ddacs')
        >>> print(f"Dataset contains {count} available simulations")
    """
    data_dir = Path(data_dir)
    metadata_path = data_dir / metadata_file
    h5_dir = data_dir / h5_subdir

    if not h5_dir.exists():
        raise FileNotFoundError(f"H5 directory not found: {h5_dir}")
    if not metadata_path.exists():
        raise FileNotFoundError(f"Metadata file not found: {metadata_path}")

    metadata = pd.read_csv(metadata_path)
    mask = metadata["ID"].apply(lambda sim_id: (h5_dir / f"{int(sim_id)}.h5").exists())
    return mask.sum()

get_simulation_by_id(sim_id, data_dir, h5_subdir='h5', metadata_file='metadata.csv')

Get a specific simulation by its ID.

Parameters:

Name Type Description Default
sim_id int

The simulation ID to retrieve.

required
data_dir str | Path

Root directory of the dataset.

required
h5_subdir str

Subdirectory containing H5 files (default: "h5").

'h5'
metadata_file str

Name of the metadata CSV file (default: "metadata.csv").

'metadata.csv'

Returns:

Type Description
tuple[int, ndarray, Path] | None

Optional[Tuple[int, np.ndarray, Path]]: Simulation data if found, None otherwise. Tuple contains (simulation_id, metadata_values, h5_file_path).

Raises:

Type Description
FileNotFoundError

If the H5 directory or the metadata file does not exist.

Examples:

>>> sim_data = get_simulation_by_id(113525, '/data/ddacs')
>>> if sim_data:
...     sim_id, metadata, h5_path = sim_data
...     print(f"Found simulation {sim_id}")
>>> # Check if simulation exists
>>> if get_simulation_by_id(999999, '/data/ddacs') is None:
...     print("Simulation not found")
Source code in ddacs/generators.py
def get_simulation_by_id(
    sim_id: int,
    data_dir: str | Path,
    h5_subdir: str = "h5",
    metadata_file: str = "metadata.csv",
) -> tuple[int, np.ndarray, Path] | None:
    """
    Get a specific simulation by its ID.

    Args:
        sim_id: The simulation ID to retrieve.
        data_dir: Root directory of the dataset.
        h5_subdir: Subdirectory containing H5 files (default: "h5").
        metadata_file: Name of the metadata CSV file (default: "metadata.csv").

    Returns:
        Optional[Tuple[int, np.ndarray, Path]]: Simulation data if found, None otherwise.
            Tuple contains (simulation_id, metadata_values, h5_file_path).

    Raises:
        FileNotFoundError: If the H5 directory or metadata file don't exist.

    Examples:
        >>> sim_data = get_simulation_by_id(113525, '/data/ddacs')
        >>> if sim_data:
        ...     sim_id, metadata, h5_path = sim_data
        ...     print(f"Found simulation {sim_id}")

        >>> # Check if simulation exists
        >>> if get_simulation_by_id(999999, '/data/ddacs') is None:
        ...     print("Simulation not found")
    """
    data_dir = Path(data_dir)
    metadata_path = data_dir / metadata_file
    h5_dir = data_dir / h5_subdir

    if not h5_dir.exists():
        raise FileNotFoundError(f"H5 directory not found: {h5_dir}")
    if not metadata_path.exists():
        raise FileNotFoundError(f"Metadata file not found: {metadata_path}")

    metadata = pd.read_csv(metadata_path)
    row = metadata[metadata["ID"] == sim_id]

    if row.empty:
        return None

    row = row.iloc[0]
    h5_path = h5_dir / f"{sim_id}.h5"

    if not h5_path.exists():
        return None

    metadata_vals = np.asarray(row.values[1:], copy=False)  # Skip ID, no copy
    return sim_id, metadata_vals, h5_path

iter_ddacs(data_dir, h5_subdir='h5', metadata_file='metadata.csv', skip_missing=False)

Generator for streaming DDACS data.

Parameters:

Name Type Description Default
data_dir str | Path

Root directory of the dataset.

required
h5_subdir str

Subdirectory containing H5 files (default: "h5").

'h5'
metadata_file str

Name of the metadata CSV file (default: "metadata.csv").

'metadata.csv'
skip_missing bool

If True, log a single warning up front and then skip simulations whose H5 file is missing. If False, raise FileNotFoundError on the first missing H5 file (default: False).

False

Yields:

Type Description
tuple[int, ndarray, Path]

Tuple[int, np.ndarray, Path]: Simulation ID, metadata values array, and path to corresponding H5 file.

Raises:

Type Description
FileNotFoundError

If the H5 directory or the metadata file doesn't exist, or if skip_missing=False and an H5 file is missing.

Examples:

>>> for sim_id, metadata, h5_path in iter_ddacs('/data/ddacs'):
...     print(f"Simulation {sim_id}: {h5_path}")
>>> # Skip missing files (for partial downloads)
>>> for sim_id, metadata, h5_path in iter_ddacs('/data/ddacs', skip_missing=True):
...     print(f"Processing {sim_id}")
Source code in ddacs/generators.py
def iter_ddacs(
    data_dir: str | Path,
    h5_subdir: str = "h5",
    metadata_file: str = "metadata.csv",
    skip_missing: bool = False,
) -> Generator[tuple[int, np.ndarray, Path], None, None]:
    """
    Generator for streaming DDACS data.

    Args:
        data_dir: Root directory of the dataset.
        h5_subdir: Subdirectory containing H5 files (default: "h5").
        metadata_file: Name of the metadata CSV file (default: "metadata.csv").
        skip_missing: If True, skip missing H5 files with a warning.
            If False, raise FileNotFoundError (default: False).

    Yields:
        Tuple[int, np.ndarray, Path]: Simulation ID, metadata values array,
            and path to corresponding H5 file.

    Raises:
        FileNotFoundError: If H5 directory doesn't exist, or if skip_missing=False
            and an H5 file is missing.

    Examples:
        >>> for sim_id, metadata, h5_path in iter_ddacs('/data/ddacs'):
        ...     print(f"Simulation {sim_id}: {h5_path}")

        >>> # Skip missing files (for partial downloads)
        >>> for sim_id, metadata, h5_path in iter_ddacs('/data/ddacs', skip_missing=True):
        ...     print(f"Processing {sim_id}")
    """
    data_dir = Path(data_dir)
    metadata_path = data_dir / metadata_file
    h5_dir = data_dir / h5_subdir

    if not h5_dir.exists():
        raise FileNotFoundError(f"H5 directory not found: {h5_dir}")

    metadata = pd.read_csv(metadata_path)

    if skip_missing:
        logger.warning("skip_missing=True: Missing H5 files will be skipped")

    for _, row in metadata.iterrows():
        sim_id = int(row["ID"])
        h5_path = h5_dir / f"{sim_id}.h5"

        if h5_path.exists():
            metadata_vals = np.asarray(row.values[1:], copy=False)  # Skip ID, no copy
            yield sim_id, metadata_vals, h5_path
        elif skip_missing:
            continue
        else:
            raise FileNotFoundError(f"H5 file not found: {h5_path}")

iter_h5_files(data_dir, h5_subdir='h5')

Minimal generator for H5 file paths only.

Parameters:

Name Type Description Default
data_dir str | Path

Root directory of the dataset.

required
h5_subdir str

Subdirectory containing H5 files (default: "h5").

'h5'

Yields:

Name Type Description
Path Path

Path to each H5 file found in the specified directory, built from data_dir (absolute only if data_dir is absolute).

Raises:

Type Description
FileNotFoundError

If the H5 directory doesn't exist.

Examples:

>>> for h5_path in iter_h5_files('/data/ddacs'):
...     print(f"Found H5 file: {h5_path.name}")
>>> # Count all H5 files
>>> h5_count = sum(1 for _ in iter_h5_files('/data/ddacs'))
>>> print(f"Total H5 files: {h5_count}")
Note

Yields all .h5 files found in the directory, regardless of metadata.

Source code in ddacs/generators.py
def iter_h5_files(data_dir: str | Path, h5_subdir: str = "h5") -> Generator[Path, None, None]:
    """
    Minimal generator for H5 file paths only.

    Args:
        data_dir: Root directory of the dataset.
        h5_subdir: Subdirectory containing H5 files (default: "h5").

    Yields:
        Path: Absolute path to each H5 file found in the specified directory.

    Raises:
        FileNotFoundError: If the H5 directory doesn't exist.

    Examples:
        >>> for h5_path in iter_h5_files('/data/ddacs'):
        ...     print(f"Found H5 file: {h5_path.name}")

        >>> # Count all H5 files
        >>> h5_count = sum(1 for _ in iter_h5_files('/data/ddacs'))
        >>> print(f"Total H5 files: {h5_count}")

    Note:
        Yields all .h5 files found in the directory, regardless of metadata.
    """
    h5_dir = Path(data_dir) / h5_subdir

    if not h5_dir.exists():
        raise FileNotFoundError(f"H5 directory not found: {h5_dir}")

    yield from h5_dir.glob("*.h5")

sample_simulations(n, data_dir, h5_subdir='h5', metadata_file='metadata.csv')

Randomly sample simulations from the dataset.

Parameters:

Name Type Description Default
n int

Number of simulations to sample.

required
data_dir str | Path

Root directory of the dataset.

required
h5_subdir str

Subdirectory containing H5 files (default: "h5").

'h5'
metadata_file str

Name of the metadata CSV file (default: "metadata.csv").

'metadata.csv'

Yields:

Type Description
tuple[int, ndarray, Path]

Tuple[int, np.ndarray, Path]: Simulation ID, metadata values array, and path to corresponding H5 file.

Raises:

Type Description
FileNotFoundError

If the H5 directory or the metadata file does not exist.

Examples:

>>> # Sample 5 random simulations
>>> for sim_id, metadata, h5_path in sample_simulations(5, '/data/ddacs'):
...     print(f"Sampled simulation {sim_id}")
>>> # Convert to list for further processing
>>> samples = list(sample_simulations(10, '/data/ddacs'))
>>> print(f"Got {len(samples)} samples")
Source code in ddacs/generators.py
def sample_simulations(
    n: int,
    data_dir: str | Path,
    h5_subdir: str = "h5",
    metadata_file: str = "metadata.csv",
) -> Generator[tuple[int, np.ndarray, Path], None, None]:
    """
    Randomly sample simulations from the dataset.

    Args:
        n: Number of simulations to sample.
        data_dir: Root directory of the dataset.
        h5_subdir: Subdirectory containing H5 files (default: "h5").
        metadata_file: Name of the metadata CSV file (default: "metadata.csv").

    Yields:
        Tuple[int, np.ndarray, Path]: Simulation ID, metadata values array,
            and path to corresponding H5 file.

    Raises:
        FileNotFoundError: If the H5 directory or metadata file don't exist.

    Examples:
        >>> # Sample 5 random simulations
        >>> for sim_id, metadata, h5_path in sample_simulations(5, '/data/ddacs'):
        ...     print(f"Sampled simulation {sim_id}")

        >>> # Convert to list for further processing
        >>> samples = list(sample_simulations(10, '/data/ddacs'))
        >>> print(f"Got {len(samples)} samples")
    """
    data_dir = Path(data_dir)
    metadata_path = data_dir / metadata_file
    h5_dir = data_dir / h5_subdir

    if not h5_dir.exists():
        raise FileNotFoundError(f"H5 directory not found: {h5_dir}")
    if not metadata_path.exists():
        raise FileNotFoundError(f"Metadata file not found: {metadata_path}")

    metadata = pd.read_csv(metadata_path)

    # Filter to only existing H5 files
    mask = metadata["ID"].apply(lambda sim_id: (h5_dir / f"{int(sim_id)}.h5").exists())
    available_metadata = metadata[mask]

    if len(available_metadata) == 0:
        logger.warning("No simulations with existing H5 files found")
        return

    # Sample the requested number (or all available if less)
    n_sample = min(n, len(available_metadata))
    sampled = available_metadata.sample(n=n_sample)

    for _, row in sampled.iterrows():
        sim_id = int(row["ID"])
        h5_path = h5_dir / f"{sim_id}.h5"
        metadata_vals = np.asarray(row.values[1:], copy=False)  # Skip ID, no copy
        yield sim_id, metadata_vals, h5_path