Skip to content

Dataset

kloppy.domain.Dataset dataclass

Dataset(records, metadata)

Bases: ABC, Generic[T]

Base class for datasets.

A dataset describes specific aspects of what happened during a single match as a sequence of DataRecord entities.

ATTRIBUTE DESCRIPTION
dataset_type

The type of the dataset.

TYPE: DatasetType

records

List of records in the dataset.

TYPE: list[T]

metadata

Metadata for the dataset.

TYPE: Metadata

records instance-attribute

records

metadata instance-attribute

metadata

dataset_type abstractmethod property

dataset_type

transform

transform(*args, **kwargs)

See transform

Source code in kloppy/domain/models/common.py
def transform(self, *args, **kwargs):
    """
    Transform this dataset; see [transform][kloppy.helpers.transform].

    All positional and keyword arguments are forwarded unchanged.
    """
    # Imported lazily to avoid a circular import between domain and helpers.
    from kloppy.helpers import transform as transform_helper

    return transform_helper(self, *args, **kwargs)

filter

filter(filter_)

Filter all records using filter_

PARAMETER DESCRIPTION
filter_

The filter to be used to filter the records. It can be a callable that takes a record and returns a boolean, or a string representing a css-like selector.

TYPE: Union[str, Callable[[T], bool]]

Examples:

1
2
3
>>> from kloppy.domain import EventType
>>> dataset = dataset.filter(lambda event: event.event_type == EventType.PASS)
>>> dataset = dataset.filter('pass')
Source code in kloppy/domain/models/common.py
def filter(self, filter_: Union[str, Callable[[T], bool]]):
    """
    Return a new dataset containing only the records matching `filter_`.

    Args:
        filter_: The filter to be used to filter the records. It can be a
            callable that takes a record and returns a boolean, or a string
            representing a css-like selector.

    Examples:

        >>> from kloppy.domain import EventType
        >>> dataset = dataset.filter(lambda event: event.event_type == EventType.PASS)
        >>> dataset = dataset.filter('pass')
    """
    matching_records = self.find_all(filter_)

    source_class = type(self)
    if isinstance(self, FilteredDataset):
        # Filtering an already-filtered dataset: keep using the same class.
        target_class = source_class
    else:
        # Create (or reuse from the cache) a dynamic subclass such as
        # FilteredEventDataset(FilteredDataset, EventDataset); FilteredDataset
        # is listed first so its behavior takes precedence in the MRO.
        target_class = _FILTERED_CLASS_CACHE.get(source_class)
        if target_class is None:
            target_class = type(
                f"Filtered{source_class.__name__}",
                (FilteredDataset, source_class),
                {},
            )
            _FILTERED_CLASS_CACHE[source_class] = target_class

    # Rebuild the dataset from its __init__-visible fields, swapping in the
    # filtered records; __init__/__post_init__ of the target class run normally.
    constructor_kwargs = {
        field.name: getattr(self, field.name)
        for field in fields(self)
        if field.init
    }
    constructor_kwargs["records"] = matching_records
    return target_class(**constructor_kwargs)

map

map(mapper)
Source code in kloppy/domain/models/common.py
def map(self, mapper):
    """Return a copy of this dataset with `mapper` applied to every record."""
    transformed = [mapper(record) for record in self.records]
    return replace(self, records=transformed)

find_all

find_all(filter_)
Source code in kloppy/domain/models/common.py
def find_all(self, filter_) -> list[T]:
    """Return every record in this dataset that matches `filter_`."""
    matching = (candidate for candidate in self.records if candidate.matches(filter_))
    return list(matching)

find

find(filter_)
Source code in kloppy/domain/models/common.py
def find(self, filter_) -> Optional[T]:
    """Return the first record matching `filter_`, or None when nothing matches."""
    return next(
        (record for record in self.records if record.matches(filter_)),
        None,
    )

from_dataset classmethod

from_dataset(dataset, mapper_fn)

Create a new Dataset from another dataset

PARAMETER DESCRIPTION
mapper_fn

TYPE: Callable[[Self], Self]

Examples:

>>> from kloppy.domain import Code, CodeDataset
>>> code_dataset = (
>>>     CodeDataset
>>>     .from_dataset(
>>>         dataset,
>>>         lambda event: Code(
>>>             code_id=event.event_id,
>>>             code=event.event_name,
>>>             period=event.period,
>>>             timestamp=event.timestamp - 7,
>>>             end_timestamp=event.timestamp + 5,
>>>             labels={
>>>                 'Player': str(event.player),
>>>                 'Team': str(event.team)
>>>             }
>>>         )
>>>     )
>>> )
Source code in kloppy/domain/models/common.py
@classmethod
def from_dataset(
    cls, dataset: "Dataset", mapper_fn: Callable[[Self], Self]
):
    """
    Create a new Dataset from other dataset

    Arguments:
        mapper_fn:

    Examples:
        >>> from kloppy.domain import Code,     CodeDataset

        >>> code_dataset = (
        >>>     CodeDataset
        >>>     .from_dataset(
        >>>         dataset,
        >>>         lambda event: Code(
        >>>             code_id=event.event_id,
        >>>             code=event.event_name,
        >>>             period=event.period,
        >>>             timestamp=event.timestamp - 7,
        >>>             end_timestamp=event.timestamp + 5,
        >>>             labels={
        >>>                 'Player': str(event.player),
        >>>                 'Team': str(event.team)
        >>>             }
        >>>         )
        >>>     )
        >>> )
    """
    return cls(
        metadata=dataset.metadata,
        records=[mapper_fn(record) for record in dataset.records],
    )

get_record_by_id

get_record_by_id(record_id)
Source code in kloppy/domain/models/common.py
def get_record_by_id(self, record_id: Union[int, str]) -> Optional[T]:
    """Return the record whose `record_id` equals `record_id`, or None if absent."""
    return next(
        (record for record in self.records if record.record_id == record_id),
        None,
    )

to_records

to_records(*columns: Unpack[tuple[Column]], as_list: Literal[True] = True, **named_columns: NamedColumns) -> list[dict[str, Any]]
to_records(*columns: Unpack[tuple[Column]], as_list: Literal[False] = False, **named_columns: NamedColumns) -> Iterable[dict[str, Any]]
to_records(*columns, as_list=True, **named_columns)
Source code in kloppy/domain/models/common.py
def to_records(
    self,
    *columns: Unpack[tuple["Column"]],
    as_list: bool = True,
    **named_columns: "NamedColumns",
) -> Union[list[dict[str, Any]], Iterable[dict[str, Any]]]:
    """
    Convert every record to a dict of column values.

    Args:
        columns: positional column selectors.
        as_list: when True (default) materialize a list; otherwise return
            a lazy iterable over the converted records.
        named_columns: extra output columns keyed by name.
    """
    # Imported lazily to keep the domain model free of service imports.
    from ..services.transformers.data_record import get_transformer_cls

    transformer_cls = get_transformer_cls(self.dataset_type)
    record_transformer = transformer_cls(*columns, **named_columns)
    transformed = map(record_transformer, self.records)
    return list(transformed) if as_list else transformed

to_dict

to_dict(*columns, orient='list', **named_columns)
Source code in kloppy/domain/models/common.py
def to_dict(
    self,
    *columns: Unpack[tuple["Column"]],
    orient: Literal["list"] = "list",
    **named_columns: "NamedColumns",
) -> dict[str, list[Any]]:
    """
    Convert the dataset to a columnar dict mapping column name -> list of values.

    Args:
        columns: positional column selectors.
        orient: only 'list' is supported.
        named_columns: extra output columns keyed by name.

    Raises:
        KloppyParameterError: when `orient` is anything other than 'list'.
    """
    if orient != "list":
        raise KloppyParameterError(
            f"Orient {orient} is not supported. Only orient='list' is supported"
        )

    # Imported lazily to keep the domain model free of service imports.
    from ..services.transformers.data_record import get_transformer_cls

    record_transformer = get_transformer_cls(self.dataset_type)(
        *columns, **named_columns
    )

    record_count = len(self.records)
    # Pre-fill each column with None so records that omit a key stay aligned.
    columnar = defaultdict(lambda: [None] * record_count)
    for row_index, record in enumerate(self.records):
        for column_name, value in record_transformer(record).items():
            columnar[column_name][row_index] = value

    return columnar

to_df

to_df(*columns, engine=None, **named_columns)
Source code in kloppy/domain/models/common.py
def to_df(
    self,
    *columns: Unpack[tuple["Column"]],
    engine: Optional[Literal["polars", "pandas", "pandas[pyarrow]"]] = None,
    **named_columns: "NamedColumns",
):
    """
    Export the dataset as a dataframe.

    Args:
        columns: positional column selectors.
        engine: dataframe backend; falls back to the configured
            'dataframe.engine' when not given.
        named_columns: extra output columns keyed by name.

    Raises:
        ImportError: when the selected backend is not installed.
        AttributeError: when pandas is too old for ArrowDtype.
        KloppyParameterError: when `engine` is not a supported value.
    """
    from kloppy.config import get_config

    if not engine:
        engine = get_config("dataframe.engine")

    if engine == "pandas[pyarrow]":
        # Import pandas first: ArrowDtype only exists from pandas 1.5 on.
        try:
            import pandas as pd

            arrow_dtype = pd.ArrowDtype
        except ImportError:
            raise ImportError(
                "Seems like you don't have pandas installed. Please"
                " install it using: pip install pandas"
            )
        except AttributeError:
            raise AttributeError(
                "Seems like you have an older version of pandas installed. Please"
                " upgrade to at least 1.5 using: pip install pandas>=1.5"
            )

        try:
            import pyarrow as pa
        except ImportError:
            raise ImportError(
                "Seems like you don't have pyarrow installed. Please"
                " install it using: pip install pyarrow"
            )

        arrow_table = pa.Table.from_pydict(
            self.to_dict(*columns, orient="list", **named_columns)
        )
        return arrow_table.to_pandas(types_mapper=arrow_dtype)

    if engine == "pandas":
        try:
            from pandas import DataFrame
        except ImportError:
            raise ImportError(
                "Seems like you don't have pandas installed. Please"
                " install it using: pip install pandas"
            )

        return DataFrame.from_dict(
            self.to_dict(*columns, orient="list", **named_columns)
        )

    if engine == "polars":
        try:
            from polars import from_dict
        except ImportError:
            raise ImportError(
                "Seems like you don't have polars installed. Please"
                " install it using: pip install polars"
            )

        return from_dict(
            self.to_dict(*columns, orient="list", **named_columns)
        )

    raise KloppyParameterError(f"Engine {engine} is not valid")