# Source code for eyefeatures.data.utils

"""
Simple data loading utilities for the eye-tracking collection.

Collection data lives in this repo at ``data/collection`` as Parquet files
(tracked with Git LFS). You can pass a custom path or use the default.

Column conventions:
- Primary key (pk): columns starting with ``group_``
- Labels: columns ending with ``_label``
- Meta: columns starting with ``meta_``
"""

import json
from pathlib import Path
from typing import Any

import pandas as pd

#: Default root directory for collection Parquet files (``data/collection`` in the repo, Git LFS).
DEFAULT_COLLECTION_DIR = Path("data/collection")


def _classify_dataset_type(dataset_name: str) -> str:
    """Classify dataset type by name suffix: 'gaze' or 'fixation'."""
    if dataset_name.endswith("_gaze") or dataset_name.endswith("_gazes"):
        return "gaze"
    if dataset_name.endswith("_fixations") or dataset_name.endswith("_fixation"):
        return "fixation"
    # Default: treat as fixation
    return "fixation"


[docs] def list_datasets( collection_dir: str | Path | None = None, *, include_extensive_collection: bool = True, extensive_collection_only: bool = False, include_extracted_fixations: bool = True, extracted_fixations_only: bool = False, dataset_type: str | None = None, ) -> list[str]: """List available dataset names in the collection directory. Parameters ---------- collection_dir : path, optional Root directory containing collection Parquet files. Defaults to ``data/collection`` (repo data tracked with Git LFS). include_extensive_collection : bool, default True If True, also search in extensive_collection subfolder. Ignored when extensive_collection_only or extracted_fixations_only is True. extensive_collection_only : bool, default False If True, list only datasets from extensive_collection subfolder (main directory is not scanned). include_extracted_fixations : bool, default True If True, also search in extracted_fixations subfolder. Ignored when extensive_collection_only or extracted_fixations_only is True. extracted_fixations_only : bool, default False If True, list only datasets from extracted_fixations subfolder (main directory is not scanned). dataset_type : str, optional If "gaze", return only gaze datasets (names ending with _gaze/_gazes). If "fixation", return only fixation datasets (names ending with _fixations/_fixation or default). If None, return all. Returns ------- list of str Sorted list of dataset names (without .parquet extension). 
""" collection_path = ( Path(collection_dir) if collection_dir is not None else DEFAULT_COLLECTION_DIR ) dataset_names = set() if extracted_fixations_only: extracted_dir = collection_path / "extracted_fixations" if extracted_dir.exists(): for f in extracted_dir.glob("*.parquet"): dataset_names.add(f.stem) elif extensive_collection_only: extensive_dir = collection_path / "extensive_collection" if extensive_dir.exists(): for f in extensive_dir.glob("*.parquet"): dataset_names.add(f.stem) else: for f in collection_path.glob("*.parquet"): dataset_names.add(f.stem) if include_extensive_collection: extensive_dir = collection_path / "extensive_collection" if extensive_dir.exists(): for f in extensive_dir.glob("*.parquet"): dataset_names.add(f.stem) if include_extracted_fixations: extracted_dir = collection_path / "extracted_fixations" if extracted_dir.exists(): for f in extracted_dir.glob("*.parquet"): dataset_names.add(f.stem) if dataset_type is not None: dataset_names = { name for name in dataset_names if _classify_dataset_type(name) == dataset_type } return sorted(dataset_names)
[docs] def load_dataset( dataset_name: str, collection_dir: str | Path | None = None, *, normalize: bool = True, ) -> tuple[pd.DataFrame, dict]: """Load a collection dataset by name. Parameters ---------- dataset_name : str Name of the dataset (e.g. "ASD_ready_data_fixations"). Will search for {dataset_name}.parquet in collection_dir. collection_dir : path, optional Root directory containing collection Parquet files. Defaults to ``data/collection`` (repo data tracked with Git LFS). normalize : bool, default True If True and dataset has unnormalized x/y columns, normalize them and rename to norm_pos_x/norm_pos_y. Returns ------- tuple (DataFrame, meta_info) - DataFrame: loaded and optionally normalized data - meta_info: dict with 'pk', 'labels', 'meta' column lists and 'info' (from collection_dir/meta.json under key dataset_name, if present). """ collection_path = ( Path(collection_dir) if collection_dir is not None else DEFAULT_COLLECTION_DIR ) dataset_path = collection_path / f"{dataset_name}.parquet" if not dataset_path.exists(): # Try in extensive_collection extensive_path = ( collection_path / "extensive_collection" / f"{dataset_name}.parquet" ) if extensive_path.exists(): dataset_path = extensive_path else: # Try in extracted_fixations extracted_path = ( collection_path / "extracted_fixations" / f"{dataset_name}.parquet" ) if extracted_path.exists(): dataset_path = extracted_path else: raise FileNotFoundError( f"Dataset '{dataset_name}' not found in {collection_path}, " f"{collection_path / 'extensive_collection'}, or " f"{collection_path / 'extracted_fixations'}" ) df = pd.read_parquet(dataset_path) # Parquet preserves types; ensure numeric for x/y if present (e.g. 
from older exports) if "x" in df.columns and not pd.api.types.is_numeric_dtype(df["x"]): df["x"] = pd.to_numeric( df["x"].astype(str).str.replace(",", "."), errors="coerce" ) if "y" in df.columns and not pd.api.types.is_numeric_dtype(df["y"]): df["y"] = pd.to_numeric( df["y"].astype(str).str.replace(",", "."), errors="coerce" ) # Handle left/right eye columns if "x_left" in df.columns and "x_right" in df.columns: if "x" not in df.columns: df["x"] = (df["x_left"] + df["x_right"]) / 2 if "y" not in df.columns: df["y"] = (df["y_left"] + df["y_right"]) / 2 # Normalize if requested and needed if normalize and "x" in df.columns and "y" in df.columns: if "norm_pos_x" not in df.columns: max_x = df["x"].max() max_y = df["y"].max() df["norm_pos_x"] = df["x"] / max_x if max_x > 0 else df["x"] df["norm_pos_y"] = df["y"] / max_y if max_y > 0 else df["y"] # Build meta info meta_info = { "pk": get_pk(df), "labels": get_labels(df), "meta": get_meta(df), "info": _load_meta_info(collection_path, dataset_name), } return df, meta_info
def _load_meta_info(collection_path: Path, dataset_name: str) -> Any | None: """Load meta.json from collection dir and return value for dataset_name key.""" meta_path = collection_path / "meta.json" if not meta_path.exists(): return None try: with open(meta_path, encoding="utf-8") as f: data = json.load(f) return data.get(dataset_name) except (json.JSONDecodeError, OSError): return None
def get_pk(df: pd.DataFrame) -> list[str]:
    r"""Get primary key column names (columns starting with ``group\_``).

    Parameters
    ----------
    df : DataFrame
        Benchmark dataset DataFrame.

    Returns
    -------
    list of str
        Primary key column names, in DataFrame column order. Column labels
        that are not strings are skipped.
    """
    # Non-string labels (e.g. integers) have no ``startswith`` method.
    return [
        col
        for col in df.columns
        if isinstance(col, str) and col.startswith("group_")
    ]
def get_labels(df: pd.DataFrame) -> list[str]:
    """Get label column names (columns ending with ``_label``).

    Parameters
    ----------
    df : DataFrame
        Benchmark dataset DataFrame.

    Returns
    -------
    list of str
        Label column names, in DataFrame column order. Column labels that
        are not strings are skipped.
    """
    # Non-string labels (e.g. integers) have no ``endswith`` method.
    return [
        col
        for col in df.columns
        if isinstance(col, str) and col.endswith("_label")
    ]
def get_meta(df: pd.DataFrame) -> list[str]:
    r"""Get meta column names (columns starting with ``meta\_``).

    Parameters
    ----------
    df : DataFrame
        Benchmark dataset DataFrame.

    Returns
    -------
    list of str
        Meta column names, in DataFrame column order. Column labels that
        are not strings are skipped.
    """
    # Non-string labels (e.g. integers) have no ``startswith`` method.
    return [
        col
        for col in df.columns
        if isinstance(col, str) and col.startswith("meta_")
    ]