Skip to content

kloppy.io

kloppy.io

I/O utilities for reading raw data.

FileLike module-attribute

FileLike = Union[FileOrPath, Source]

Source dataclass

Source(data, optional=False, skip_if_missing=False)

A wrapper around a file-like object to enable optional inputs.

PARAMETER DESCRIPTION
data

The file-like object.

TYPE: FileLike

optional

Whether the file is optional. Defaults to False.

TYPE: bool DEFAULT: False

skip_if_missing

Whether to skip the file if it is missing. Defaults to False.

TYPE: bool DEFAULT: False

Example:

1
>>> open_as_file(Source.create("example.csv", optional=True))

data instance-attribute

data

optional class-attribute instance-attribute

optional = False

skip_if_missing class-attribute instance-attribute

skip_if_missing = False

create classmethod

create(input_, **kwargs)
Source code in kloppy/io.py
@classmethod
def create(cls, input_: Optional[FileOrPath], **kwargs):
    """Build a ``Source`` from a raw input or from an existing ``Source``.

    Args:
        input_: The file-like object to wrap, or an existing ``Source``.
        **kwargs: Field values for the resulting ``Source`` (for example
            ``optional`` or ``skip_if_missing``).

    Returns:
        A ``Source`` wrapping ``input_``. When ``input_`` is already a
        ``Source``, the result of ``replace(input_, **kwargs)`` is returned
        instead of wrapping it a second time.
    """
    if not isinstance(input_, Source):
        return Source(data=input_, **kwargs)
    # NOTE(review): `replace` is presumably `dataclasses.replace`, i.e. a copy
    # of the given Source with the supplied fields overridden -- confirm
    # against the module imports.
    return replace(input_, **kwargs)

open_as_file

open_as_file(input_)

Open a byte stream to the given input object.

The following input types are supported
  • A string or pathlib.Path object representing a local file path.
  • A string representing a URL. It should start with 'http://' or 'https://'.
  • A string representing a path to a file in an Amazon S3 cloud storage bucket. It should start with 's3://'.
  • An XML or JSON string containing the data. The string should contain a '{' or '<' character. Otherwise, it will be treated as a file path.
  • A bytes object containing the data.
  • A buffered binary stream that inherits from io.BufferedIOBase.
  • A Source object that wraps any of the above input types.
PARAMETER DESCRIPTION
input_

The input object to be opened.

TYPE: FileLike

RETURNS DESCRIPTION
BinaryIO

A binary stream to the input object.

TYPE: ContextManager[Optional[BinaryIO]]

RAISES DESCRIPTION
ValueError

If the input is required but not provided.

InputNotFoundError

If the input file is not found and should not be skipped.

TypeError

If the input type is not supported.

Example:

1
2
>>> with open_as_file("example.txt") as f:
...     contents = f.read()
Note

To support reading data from other sources, see the Adapter class.

If the given file path or URL ends with '.gz', '.xz', or '.bz2', the file will be decompressed before being read.

Source code in kloppy/io.py
def open_as_file(input_: FileLike) -> ContextManager[Optional[BinaryIO]]:
    """Return a binary stream for the given input object.

    Supported inputs, checked in this order:
        - A [Source](`kloppy.io.Source`) wrapping any of the inputs below.
        - A string containing a '{' or '<' character, which is treated as
          inline JSON or XML data.
        - A bytes object containing the data.
        - Any other string, or an object with `__fspath__`, which is treated
          as a location (local path, 'http://' / 'https://' URL, or 's3://'
          path) and read through the matching adapter.
        - An `io.TextIOWrapper`, whose underlying binary buffer is returned.
        - Any object exposing `readinto` (a buffered binary stream).

    Args:
        input_ (FileLike): The input object to be opened.

    Returns:
        ContextManager[Optional[BinaryIO]]: A binary stream to the input
        object, or a dummy context manager when a `Source` is optional and
        empty, or missing and marked `skip_if_missing`.

    Raises:
        ValueError: If the input is required but not provided.
        InputNotFoundError: If the input file is not found and should not be
            skipped.
        AdapterError: If no adapter is available for the given location.
        TypeError: If the input type is not supported.

    Example:

        >>> with open_as_file("example.txt") as f:
        ...     contents = f.read()

    Note:
        To support reading data from other sources, see the
        [Adapter](`kloppy.io.adapters.Adapter`) class.

        Per the module documentation, a file path or URL ending with '.gz',
        '.xz', or '.bz2' is decompressed before being read; that step is not
        performed in this function body itself.
    """
    if isinstance(input_, Source):
        wrapped = input_.data
        if wrapped is None:
            if input_.optional:
                # Returning a dummy context manager spares every vendor
                # specific loader from special-casing absent optional inputs.
                return dummy_context_mgr()
            raise ValueError("Input required but not provided.")

        try:
            return open_as_file(wrapped)
        except InputNotFoundError as exc:
            if not input_.skip_if_missing:
                raise exc
            logging.info(f"Input {input_.data} not found. Skipping")
            return dummy_context_mgr()

    if isinstance(input_, str) and any(marker in input_ for marker in "{<"):
        # Inline JSON / XML payload: expose it as an in-memory byte stream.
        return BytesIO(input_.encode("utf8"))

    if isinstance(input_, bytes):
        # Raw bytes payload: expose it as an in-memory byte stream.
        return BytesIO(input_)

    if isinstance(input_, str) or hasattr(input_, "__fspath__"):
        # Path-like input: let the matching adapter copy the content into an
        # in-memory buffer and hand that buffer back, rewound to the start.
        uri = _filepath_from_path_or_filelike(input_)
        adapter = get_adapter(uri)
        if not adapter:
            raise AdapterError(f"No adapter found for {uri}")
        buffer = BytesIO()
        adapter.read_to_stream(uri, buffer)
        buffer.seek(0)
        return buffer

    if isinstance(input_, TextIOWrapper):
        # Text wrapper: hand back the binary buffer it wraps.
        return input_.buffer

    if hasattr(input_, "readinto"):
        # Already a binary file-like object: delegate to the _open helper.
        return _open(input_)  # type: ignore

    raise TypeError(f"Unsupported input type: {type(input_)}")

get_file_extension

get_file_extension(file_or_path)

Determine the file extension of the given file-like object.

If the file has compression extensions such as '.gz', '.xz', or '.bz2', they will be stripped before determining the extension.

PARAMETER DESCRIPTION
file_or_path

The file-like object whose extension needs to be determined.

TYPE: FileLike

RETURNS DESCRIPTION
str

The file extension, including the dot ('.') if present.

TYPE: str

RAISES DESCRIPTION
TypeError

If the extension cannot be determined.

Example:

1
2
3
4
5
6
>>> get_file_extension("example.xml.gz")
'.xml'
>>> get_file_extension(Path("example.txt"))
'.txt'
>>> get_file_extension(Source(data="example.csv"))
'.csv'
Source code in kloppy/io.py
def get_file_extension(file_or_path: FileLike) -> str:
    """Determine the file extension of the given file-like object.

    If the file has compression extensions such as '.gz', '.xz', or '.bz2',
    they will be stripped before determining the extension.

    Args:
        file_or_path (FileLike): The file-like object whose extension needs
            to be determined. Strings, bytes paths, `os.PathLike` objects and
            `Source` objects wrapping any of these are supported.

    Returns:
        str: The file extension, including the dot ('.') if present. An empty
        string is returned when the path has no extension.

    Raises:
        TypeError: If the extension cannot be determined for the input type.

    Example:

        >>> get_file_extension("example.xml.gz")
        '.xml'
        >>> get_file_extension(Path("example.txt"))
        '.txt'
        >>> get_file_extension(Source(data="example.csv"))
        '.csv'
    """
    if isinstance(file_or_path, (str, bytes)) or hasattr(
        file_or_path, "__fspath__"
    ):
        # os.fsdecode accepts str, bytes and os.PathLike and always returns a
        # str. With os.fspath a bytes path stayed bytes and the str suffix
        # checks below raised a TypeError.
        path = os.fsdecode(file_or_path)  # type: ignore
        # Single pass in a fixed order, so at most one of each compression
        # suffix is removed.
        for ext in (".gz", ".xz", ".bz2"):
            if path.endswith(ext):
                path = path[: -len(ext)]
        return os.path.splitext(path)[1]

    if isinstance(file_or_path, Source):
        # Unwrap the Source and resolve the extension of the wrapped input.
        return get_file_extension(file_or_path.data)

    raise TypeError(
        f"Could not determine extension for input type: {type(file_or_path)}"
    )