Loading data from external sources¶
Since version 3.3, kloppy uses adapters to load data from external sources. kloppy ships with support for HTTP and S3, but you can add your own adapters to support other external sources.
S3¶
kloppy uses s3fs to access files on S3. If preferred, you can create an s3fs.S3FileSystem instance and pass it via set_config.
In [ ]:
Copied!
import s3fs
from kloppy import statsbomb
from kloppy.config import set_config
# Anonymous S3 filesystem that kloppy's s3 adapter should use.
file_system = s3fs.S3FileSystem(anon=True)
set_config('adapters.s3.s3fs', file_system)

# Expected to fail: 'some-bucket' is a placeholder bucket we cannot access.
dataset = statsbomb.load(
    event_data='s3://some-bucket/1234/events.json',
    lineup_data='s3://some-bucket/1234/lineup.json',
)
import s3fs
from kloppy import statsbomb
from kloppy.config import set_config
# Anonymous S3 filesystem that kloppy's s3 adapter should use.
file_system = s3fs.S3FileSystem(anon=True)
set_config('adapters.s3.s3fs', file_system)

# Expected to fail: 'some-bucket' is a placeholder bucket we cannot access.
dataset = statsbomb.load(
    event_data='s3://some-bucket/1234/events.json',
    lineup_data='s3://some-bucket/1234/lineup.json',
)
Custom adapter - database¶
It's possible to create your own adapter, for example a database adapter.
First, create a table in a SQLite database to hold the file content.
In [45]:
Copied!
import sqlite3
# Set up a table holding one raw file per (match_id, file_type) pair.
con = sqlite3.connect('database.db')
# IF NOT EXISTS keeps this cell re-runnable without swallowing unrelated
# sqlite3.OperationalError failures (locked database, unwritable path, ...).
con.execute(
    "CREATE TABLE IF NOT EXISTS files("
    "match_id INT, file_type TEXT, data TEXT, UNIQUE(match_id, file_type))"
)
con.commit()
# Load some open data into the database
import requests
def load_into_table(match_id):
    """Download one match's StatsBomb open-data files and store them in ``files``.

    Fetches the events and lineup JSON for ``match_id`` from the StatsBomb
    open-data repository on GitHub and writes each response body as a row
    keyed by ``(match_id, file_type)`` using the module-level ``con``.

    Args:
        match_id: StatsBomb match id used in the download URLs and as row key.

    Raises:
        requests.RequestException: If a download fails, times out, or
            returns an HTTP error status.
    """
    base_url = "https://raw.githubusercontent.com/statsbomb/open-data/master/data"
    rows = []
    for file_type, folder in (("events", "events"), ("lineup", "lineups")):
        response = requests.get(f"{base_url}/{folder}/{match_id}.json", timeout=30)
        # Without this check an error page (e.g. a 404 body) would be stored
        # as if it were match data.
        response.raise_for_status()
        rows.append((match_id, file_type, response.content))

    # `with con` commits both rows together, or rolls back if an insert fails.
    # OR REPLACE keeps the function re-runnable despite the UNIQUE constraint.
    with con:
        con.executemany("INSERT OR REPLACE INTO files VALUES (?, ?, ?)", rows)
import sqlite3
# Set up a table holding one raw file per (match_id, file_type) pair.
con = sqlite3.connect('database.db')
# IF NOT EXISTS keeps this cell re-runnable without swallowing unrelated
# sqlite3.OperationalError failures (locked database, unwritable path, ...).
con.execute(
    "CREATE TABLE IF NOT EXISTS files("
    "match_id INT, file_type TEXT, data TEXT, UNIQUE(match_id, file_type))"
)
con.commit()
# Load some open data into the database
import requests
def load_into_table(match_id):
    """Download one match's StatsBomb open-data files and store them in ``files``.

    Fetches the events and lineup JSON for ``match_id`` from the StatsBomb
    open-data repository on GitHub and writes each response body as a row
    keyed by ``(match_id, file_type)`` using the module-level ``con``.

    Args:
        match_id: StatsBomb match id used in the download URLs and as row key.

    Raises:
        requests.RequestException: If a download fails, times out, or
            returns an HTTP error status.
    """
    base_url = "https://raw.githubusercontent.com/statsbomb/open-data/master/data"
    rows = []
    for file_type, folder in (("events", "events"), ("lineup", "lineups")):
        response = requests.get(f"{base_url}/{folder}/{match_id}.json", timeout=30)
        # Without this check an error page (e.g. a 404 body) would be stored
        # as if it were match data.
        response.raise_for_status()
        rows.append((match_id, file_type, response.content))

    # `with con` commits both rows together, or rolls back if an insert fails.
    # OR REPLACE keeps the function re-runnable despite the UNIQUE constraint.
    with con:
        con.executemany("INSERT OR REPLACE INTO files VALUES (?, ?, ?)", rows)
Next, load the data from two matches into the database.
In [46]:
Copied!
# Store the events and lineup files for both example matches.
for match_id in (9636, 9609):
    load_into_table(match_id)
# Store the events and lineup files for both example matches.
for match_id in (9636, 9609):
    load_into_table(match_id)
Define our database adapter. The adapter supports all URLs starting with db://.
In [60]:
Copied!
from typing import BinaryIO
from kloppy.infra.io.adapters import Adapter, adapters
class DBAdapter(Adapter):
    """Adapter that reads ``db://<match_id>/<file_type>`` URLs from the ``files`` table.

    Args:
        con: Database connection whose cursor is used to query the ``files``
            table (a sqlite3 connection in this example).
    """

    def __init__(self, con):
        self.con = con

    def supports(self, url: str) -> bool:
        """Return True for every URL that starts with ``db://``."""
        return url.startswith("db://")

    def read_to_stream(self, url: str, output: BinaryIO):
        """Write the stored ``data`` value for ``url`` to ``output``.

        Args:
            url: Location in the form ``db://<match_id>/<file_type>``.
            output: Stream that receives the stored file content.

        Raises:
            ValueError: If the part after ``db://`` does not split into
                exactly a match id and a file type.
            FileNotFoundError: If no row matches the match id and file type.
        """
        match_id, file_type = url[len("db://"):].split("/")
        # Query through the connection given to this adapter rather than a
        # module-level ``con`` that merely happens to exist.
        cursor = self.con.cursor()
        cursor.execute(
            "SELECT data FROM files WHERE match_id = ? AND file_type = ?",
            (match_id, file_type),
        )
        result = cursor.fetchone()
        if result is None:
            # fetchone() returns None when nothing matched; fail with a
            # readable message instead of a TypeError on result[0].
            raise FileNotFoundError(f"No file stored in the database for {url}")
        output.write(result[0])
# Set the 'cache' option to None (presumably disables caching so reads go
# through the adapter; confirm against the kloppy config docs).
set_config('cache', None)

# Register an adapter instance bound to the database connection.
# When you rerun this code you might need to run: adapters.clear()
# NOTE(review): clear() empties the whole list, not only earlier DBAdapter
# entries; confirm that is acceptable before using it.
db_adapter = DBAdapter(con)
adapters.append(db_adapter)
from typing import BinaryIO
from kloppy.infra.io.adapters import Adapter, adapters
class DBAdapter(Adapter):
    """Adapter that reads ``db://<match_id>/<file_type>`` URLs from the ``files`` table.

    Args:
        con: Database connection whose cursor is used to query the ``files``
            table (a sqlite3 connection in this example).
    """

    def __init__(self, con):
        self.con = con

    def supports(self, url: str) -> bool:
        """Return True for every URL that starts with ``db://``."""
        return url.startswith("db://")

    def read_to_stream(self, url: str, output: BinaryIO):
        """Write the stored ``data`` value for ``url`` to ``output``.

        Args:
            url: Location in the form ``db://<match_id>/<file_type>``.
            output: Stream that receives the stored file content.

        Raises:
            ValueError: If the part after ``db://`` does not split into
                exactly a match id and a file type.
            FileNotFoundError: If no row matches the match id and file type.
        """
        match_id, file_type = url[len("db://"):].split("/")
        # Query through the connection given to this adapter rather than a
        # module-level ``con`` that merely happens to exist.
        cursor = self.con.cursor()
        cursor.execute(
            "SELECT data FROM files WHERE match_id = ? AND file_type = ?",
            (match_id, file_type),
        )
        result = cursor.fetchone()
        if result is None:
            # fetchone() returns None when nothing matched; fail with a
            # readable message instead of a TypeError on result[0].
            raise FileNotFoundError(f"No file stored in the database for {url}")
        output.write(result[0])
# Set the 'cache' option to None (presumably disables caching so reads go
# through the adapter; confirm against the kloppy config docs).
set_config('cache', None)

# Register an adapter instance bound to the database connection.
# When you rerun this code you might need to run: adapters.clear()
# NOTE(review): clear() empties the whole list, not only earlier DBAdapter
# entries; confirm that is acceptable before using it.
db_adapter = DBAdapter(con)
adapters.append(db_adapter)
Use the adapter!
In [61]:
Copied!
# Both inputs use db:// URLs, so they are served by the database adapter.
dataset = statsbomb.load(event_data="db://9636/events", lineup_data="db://9636/lineup")
# Both inputs use db:// URLs, so they are served by the database adapter.
dataset = statsbomb.load(event_data="db://9636/events", lineup_data="db://9636/lineup")
In [63]:
Copied!
# Show which match was loaded by printing both teams from the metadata.
home_team, away_team = dataset.metadata.teams
summary = f"Loaded from database: {home_team} - {away_team}"
print(summary)
# Show which match was loaded by printing both teams from the metadata.
home_team, away_team = dataset.metadata.teams
summary = f"Loaded from database: {home_team} - {away_team}"
print(summary)
Loaded from database: Barcelona - Las Palmas