Generators¶

Generator functions for streaming DDACS simulation data.

`generators` ¶

Simple generator functions for DDACS data streaming.

This module provides lightweight generator functions for iterating over DDACS simulation data without class overhead.

`count_available_simulations(data_dir, h5_subdir='h5', metadata_file='metadata.csv')` ¶

Count available simulations (with existing H5 files).

Parameters:

Name	Type	Description	Default
`data_dir`	`str \| Path`	Root directory of the dataset.	required
`h5_subdir`	`str`	Subdirectory containing H5 files (default: "h5").	`'h5'`
`metadata_file`	`str`	Name of the metadata CSV file (default: "metadata.csv").	`'metadata.csv'`

Returns:

Name	Type	Description
`int`	`int`	Number of simulations with existing H5 files.

Raises:

Type	Description
`FileNotFoundError`	If the H5 directory or metadata file don't exist.

Examples:

>>> count = count_available_simulations('/data/ddacs')
>>> print(f"Dataset contains {count} available simulations")

Source code in ddacs/generators.py

def count_available_simulations(
    data_dir: str | Path,
    h5_subdir: str = "h5",
    metadata_file: str = "metadata.csv",
) -> int:
    """
    Count available simulations (with existing H5 files).

    Args:
        data_dir: Root directory of the dataset.
        h5_subdir: Subdirectory containing H5 files (default: "h5").
        metadata_file: Name of the metadata CSV file (default: "metadata.csv").

    Returns:
        int: Number of simulations with existing H5 files.

    Raises:
        FileNotFoundError: If the H5 directory or metadata file don't exist.

    Examples:
        >>> count = count_available_simulations('/data/ddacs')
        >>> print(f"Dataset contains {count} available simulations")
    """
    data_dir = Path(data_dir)
    metadata_path = data_dir / metadata_file
    h5_dir = data_dir / h5_subdir

    if not h5_dir.exists():
        raise FileNotFoundError(f"H5 directory not found: {h5_dir}")
    if not metadata_path.exists():
        raise FileNotFoundError(f"Metadata file not found: {metadata_path}")

    metadata = pd.read_csv(metadata_path)
    mask = metadata["ID"].apply(lambda sim_id: (h5_dir / f"{int(sim_id)}.h5").exists())
    return mask.sum()

`get_simulation_by_id(sim_id, data_dir, h5_subdir='h5', metadata_file='metadata.csv')` ¶

Get a specific simulation by its ID.

Parameters:

Name	Type	Description	Default
`sim_id`	`int`	The simulation ID to retrieve.	required
`data_dir`	`str \| Path`	Root directory of the dataset.	required
`h5_subdir`	`str`	Subdirectory containing H5 files (default: "h5").	`'h5'`
`metadata_file`	`str`	Name of the metadata CSV file (default: "metadata.csv").	`'metadata.csv'`

Returns:

Type	Description
`tuple[int, ndarray, Path] \| None`	`(simulation_id, metadata_values, h5_file_path)` if the ID is listed in the metadata and its H5 file exists; `None` otherwise.

Raises:

Type	Description
`FileNotFoundError`	If the H5 directory or metadata file don't exist.

Examples:

>>> sim_data = get_simulation_by_id(113525, '/data/ddacs')
>>> if sim_data:
...     sim_id, metadata, h5_path = sim_data
...     print(f"Found simulation {sim_id}")

>>> # Check if simulation exists
>>> if get_simulation_by_id(999999, '/data/ddacs') is None:
...     print("Simulation not found")

Source code in ddacs/generators.py

def get_simulation_by_id(
    sim_id: int,
    data_dir: str | Path,
    h5_subdir: str = "h5",
    metadata_file: str = "metadata.csv",
) -> tuple[int, np.ndarray, Path] | None:
    """
    Get a specific simulation by its ID.

    Args:
        sim_id: The simulation ID to retrieve.
        data_dir: Root directory of the dataset.
        h5_subdir: Subdirectory containing H5 files (default: "h5").
        metadata_file: Name of the metadata CSV file (default: "metadata.csv").

    Returns:
        Optional[Tuple[int, np.ndarray, Path]]: Simulation data if found, None otherwise.
            Tuple contains (simulation_id, metadata_values, h5_file_path).

    Raises:
        FileNotFoundError: If the H5 directory or metadata file don't exist.

    Examples:
        >>> sim_data = get_simulation_by_id(113525, '/data/ddacs')
        >>> if sim_data:
        ...     sim_id, metadata, h5_path = sim_data
        ...     print(f"Found simulation {sim_id}")

        >>> # Check if simulation exists
        >>> if get_simulation_by_id(999999, '/data/ddacs') is None:
        ...     print("Simulation not found")
    """
    data_dir = Path(data_dir)
    metadata_path = data_dir / metadata_file
    h5_dir = data_dir / h5_subdir

    if not h5_dir.exists():
        raise FileNotFoundError(f"H5 directory not found: {h5_dir}")
    if not metadata_path.exists():
        raise FileNotFoundError(f"Metadata file not found: {metadata_path}")

    metadata = pd.read_csv(metadata_path)
    row = metadata[metadata["ID"] == sim_id]

    if row.empty:
        return None

    row = row.iloc[0]
    h5_path = h5_dir / f"{sim_id}.h5"

    if not h5_path.exists():
        return None

    metadata_vals = np.asarray(row.values[1:], copy=False)  # Skip ID, no copy
    return sim_id, metadata_vals, h5_path

`iter_ddacs(data_dir, h5_subdir='h5', metadata_file='metadata.csv', skip_missing=False)` ¶

Generator for streaming DDACS data.

Parameters:

Name	Type	Description	Default
`data_dir`	`str \| Path`	Root directory of the dataset.	required
`h5_subdir`	`str`	Subdirectory containing H5 files (default: "h5").	`'h5'`
`metadata_file`	`str`	Name of the metadata CSV file (default: "metadata.csv").	`'metadata.csv'`
`skip_missing`	`bool`	If True, skip missing H5 files with a warning. If False, raise FileNotFoundError (default: False).	`False`

Yields:

Type	Description
`tuple[int, ndarray, Path]`	Tuple[int, np.ndarray, Path]: Simulation ID, metadata values array, and path to corresponding H5 file.

Raises:

Type	Description
`FileNotFoundError`	If the H5 directory or the metadata file doesn't exist, or if skip_missing=False and an H5 file is missing.

Examples:

>>> for sim_id, metadata, h5_path in iter_ddacs('/data/ddacs'):
...     print(f"Simulation {sim_id}: {h5_path}")

>>> # Skip missing files (for partial downloads)
>>> for sim_id, metadata, h5_path in iter_ddacs('/data/ddacs', skip_missing=True):
...     print(f"Processing {sim_id}")

Source code in ddacs/generators.py

def iter_ddacs(
    data_dir: str | Path,
    h5_subdir: str = "h5",
    metadata_file: str = "metadata.csv",
    skip_missing: bool = False,
) -> Generator[tuple[int, np.ndarray, Path], None, None]:
    """
    Generator for streaming DDACS data.

    Args:
        data_dir: Root directory of the dataset.
        h5_subdir: Subdirectory containing H5 files (default: "h5").
        metadata_file: Name of the metadata CSV file (default: "metadata.csv").
        skip_missing: If True, skip missing H5 files with a warning.
            If False, raise FileNotFoundError (default: False).

    Yields:
        Tuple[int, np.ndarray, Path]: Simulation ID, metadata values array,
            and path to corresponding H5 file.

    Raises:
        FileNotFoundError: If H5 directory doesn't exist, or if skip_missing=False
            and an H5 file is missing.

    Examples:
        >>> for sim_id, metadata, h5_path in iter_ddacs('/data/ddacs'):
        ...     print(f"Simulation {sim_id}: {h5_path}")

        >>> # Skip missing files (for partial downloads)
        >>> for sim_id, metadata, h5_path in iter_ddacs('/data/ddacs', skip_missing=True):
        ...     print(f"Processing {sim_id}")
    """
    data_dir = Path(data_dir)
    metadata_path = data_dir / metadata_file
    h5_dir = data_dir / h5_subdir

    if not h5_dir.exists():
        raise FileNotFoundError(f"H5 directory not found: {h5_dir}")

    metadata = pd.read_csv(metadata_path)

    if skip_missing:
        logger.warning("skip_missing=True: Missing H5 files will be skipped")

    for _, row in metadata.iterrows():
        sim_id = int(row["ID"])
        h5_path = h5_dir / f"{sim_id}.h5"

        if h5_path.exists():
            metadata_vals = np.asarray(row.values[1:], copy=False)  # Skip ID, no copy
            yield sim_id, metadata_vals, h5_path
        elif skip_missing:
            continue
        else:
            raise FileNotFoundError(f"H5 file not found: {h5_path}")

`iter_h5_files(data_dir, h5_subdir='h5')` ¶

Minimal generator for H5 file paths only.

Parameters:

Name	Type	Description	Default
`data_dir`	`str \| Path`	Root directory of the dataset.	required
`h5_subdir`	`str`	Subdirectory containing H5 files (default: "h5").	`'h5'`

Yields:

Name	Type	Description
`Path`	`Path`	Path to each `.h5` file found in the specified directory (absolute only when `data_dir` is absolute).

Raises:

Type	Description
`FileNotFoundError`	If the H5 directory doesn't exist.

Examples:

>>> for h5_path in iter_h5_files('/data/ddacs'):
...     print(f"Found H5 file: {h5_path.name}")

>>> # Count all H5 files
>>> h5_count = sum(1 for _ in iter_h5_files('/data/ddacs'))
>>> print(f"Total H5 files: {h5_count}")

Note

Yields all .h5 files found in the directory, regardless of metadata.

Source code in ddacs/generators.py

def iter_h5_files(data_dir: str | Path, h5_subdir: str = "h5") -> Generator[Path, None, None]:
    """
    Minimal generator for H5 file paths only.

    Args:
        data_dir: Root directory of the dataset.
        h5_subdir: Subdirectory containing H5 files (default: "h5").

    Yields:
        Path: Absolute path to each H5 file found in the specified directory.

    Raises:
        FileNotFoundError: If the H5 directory doesn't exist.

    Examples:
        >>> for h5_path in iter_h5_files('/data/ddacs'):
        ...     print(f"Found H5 file: {h5_path.name}")

        >>> # Count all H5 files
        >>> h5_count = sum(1 for _ in iter_h5_files('/data/ddacs'))
        >>> print(f"Total H5 files: {h5_count}")

    Note:
        Yields all .h5 files found in the directory, regardless of metadata.
    """
    h5_dir = Path(data_dir) / h5_subdir

    if not h5_dir.exists():
        raise FileNotFoundError(f"H5 directory not found: {h5_dir}")

    yield from h5_dir.glob("*.h5")

`sample_simulations(n, data_dir, h5_subdir='h5', metadata_file='metadata.csv')` ¶

Randomly sample simulations from the dataset.

Parameters:

Name	Type	Description	Default
`n`	`int`	Number of simulations to sample.	required
`data_dir`	`str \| Path`	Root directory of the dataset.	required
`h5_subdir`	`str`	Subdirectory containing H5 files (default: "h5").	`'h5'`
`metadata_file`	`str`	Name of the metadata CSV file (default: "metadata.csv").	`'metadata.csv'`

Yields:

Type	Description
`tuple[int, ndarray, Path]`	Tuple[int, np.ndarray, Path]: Simulation ID, metadata values array, and path to corresponding H5 file.

Raises:

Type	Description
`FileNotFoundError`	If the H5 directory or metadata file don't exist.

Examples:

>>> # Sample 5 random simulations
>>> for sim_id, metadata, h5_path in sample_simulations(5, '/data/ddacs'):
...     print(f"Sampled simulation {sim_id}")

>>> # Convert to list for further processing
>>> samples = list(sample_simulations(10, '/data/ddacs'))
>>> print(f"Got {len(samples)} samples")

Source code in ddacs/generators.py

def sample_simulations(
    n: int,
    data_dir: str | Path,
    h5_subdir: str = "h5",
    metadata_file: str = "metadata.csv",
) -> Generator[tuple[int, np.ndarray, Path], None, None]:
    """
    Randomly sample simulations from the dataset.

    Args:
        n: Number of simulations to sample.
        data_dir: Root directory of the dataset.
        h5_subdir: Subdirectory containing H5 files (default: "h5").
        metadata_file: Name of the metadata CSV file (default: "metadata.csv").

    Yields:
        Tuple[int, np.ndarray, Path]: Simulation ID, metadata values array,
            and path to corresponding H5 file.

    Raises:
        FileNotFoundError: If the H5 directory or metadata file don't exist.

    Examples:
        >>> # Sample 5 random simulations
        >>> for sim_id, metadata, h5_path in sample_simulations(5, '/data/ddacs'):
        ...     print(f"Sampled simulation {sim_id}")

        >>> # Convert to list for further processing
        >>> samples = list(sample_simulations(10, '/data/ddacs'))
        >>> print(f"Got {len(samples)} samples")
    """
    data_dir = Path(data_dir)
    metadata_path = data_dir / metadata_file
    h5_dir = data_dir / h5_subdir

    if not h5_dir.exists():
        raise FileNotFoundError(f"H5 directory not found: {h5_dir}")
    if not metadata_path.exists():
        raise FileNotFoundError(f"Metadata file not found: {metadata_path}")

    metadata = pd.read_csv(metadata_path)

    # Filter to only existing H5 files
    mask = metadata["ID"].apply(lambda sim_id: (h5_dir / f"{int(sim_id)}.h5").exists())
    available_metadata = metadata[mask]

    if len(available_metadata) == 0:
        logger.warning("No simulations with existing H5 files found")
        return

    # Sample the requested number (or all available if less)
    n_sample = min(n, len(available_metadata))
    sampled = available_metadata.sample(n=n_sample)

    for _, row in sampled.iterrows():
        sim_id = int(row["ID"])
        h5_path = h5_dir / f"{sim_id}.h5"
        metadata_vals = np.asarray(row.values[1:], copy=False)  # Skip ID, no copy
        yield sim_id, metadata_vals, h5_path

Generators¶

generators ¶

count_available_simulations(data_dir, h5_subdir='h5', metadata_file='metadata.csv') ¶

get_simulation_by_id(sim_id, data_dir, h5_subdir='h5', metadata_file='metadata.csv') ¶

iter_ddacs(data_dir, h5_subdir='h5', metadata_file='metadata.csv', skip_missing=False) ¶

iter_h5_files(data_dir, h5_subdir='h5') ¶

sample_simulations(n, data_dir, h5_subdir='h5', metadata_file='metadata.csv') ¶

`generators` ¶

`count_available_simulations(data_dir, h5_subdir='h5', metadata_file='metadata.csv')` ¶

`get_simulation_by_id(sim_id, data_dir, h5_subdir='h5', metadata_file='metadata.csv')` ¶

`iter_ddacs(data_dir, h5_subdir='h5', metadata_file='metadata.csv', skip_missing=False)` ¶

`iter_h5_files(data_dir, h5_subdir='h5')` ¶

`sample_simulations(n, data_dir, h5_subdir='h5', metadata_file='metadata.csv')` ¶