| """Implements serializers for Opta data.""" |
|
|
| import copy |
| import datetime |
| import glob |
| import os |
| import re |
| import warnings |
| from collections.abc import Mapping |
| from pathlib import Path |
| from typing import Any, Optional, Union, cast |
|
|
| import pandas as pd |
| from pandera.typing import DataFrame |
|
|
| from socceraction.data.base import EventDataLoader |
|
|
| from .parsers import ( |
| F1JSONParser, |
| F7XMLParser, |
| F9JSONParser, |
| F24JSONParser, |
| F24XMLParser, |
| MA1JSONParser, |
| MA3JSONParser, |
| OptaParser, |
| WhoScoredParser, |
| ) |
| from .schema import ( |
| OptaCompetitionSchema, |
| OptaEventSchema, |
| OptaGameSchema, |
| OptaPlayerSchema, |
| OptaTeamSchema, |
| ) |
|
|
# Registry of parsers for Opta's classic JSON feeds. Keys are feed names
# (lower-case) as used in the ``feeds`` argument of :class:`OptaLoader`.
_jsonparsers = {
    "f1": F1JSONParser,
    "f9": F9JSONParser,
    "f24": F24JSONParser,
    "ma1": MA1JSONParser,
    "ma3": MA3JSONParser,
}

# Registry of parsers for Opta's classic XML feeds.
_xmlparsers = {
    "f7": F7XMLParser,
    "f24": F24XMLParser,
}

# Registry of parsers for the newer Stats Perform (MA*) JSON feeds.
_statsperformparsers = {
    "ma1": MA1JSONParser,
    "ma3": MA3JSONParser,
}

# Registry of parsers for JSON data scraped from WhoScored.com.
_whoscoredparsers = {
    "whoscored": WhoScoredParser,
}
|
|
| _eventtypesdf = pd.DataFrame( |
| [ |
| (1, "pass"), |
| (2, "offside pass"), |
| (3, "take on"), |
| (4, "foul"), |
| (5, "out"), |
| (6, "corner awarded"), |
| (7, "tackle"), |
| (8, "interception"), |
| (9, "turnover"), |
| (10, "save"), |
| (11, "claim"), |
| (12, "clearance"), |
| (13, "miss"), |
| (14, "post"), |
| (15, "attempt saved"), |
| (16, "goal"), |
| (17, "card"), |
| (18, "player off"), |
| (19, "player on"), |
| (20, "player retired"), |
| (21, "player returns"), |
| (22, "player becomes goalkeeper"), |
| (23, "goalkeeper becomes player"), |
| (24, "condition change"), |
| (25, "official change"), |
| (26, "unknown26"), |
| (27, "start delay"), |
| (28, "end delay"), |
| (29, "unknown29"), |
| (30, "end"), |
| (31, "unknown31"), |
| (32, "start"), |
| (33, "unknown33"), |
| (34, "team set up"), |
| (35, "player changed position"), |
| (36, "player changed jersey number"), |
| (37, "collection end"), |
| (38, "temp_goal"), |
| (39, "temp_attempt"), |
| (40, "formation change"), |
| (41, "punch"), |
| (42, "good skill"), |
| (43, "deleted event"), |
| (44, "aerial"), |
| (45, "challenge"), |
| (46, "unknown46"), |
| (47, "rescinded card"), |
| (48, "unknown46"), |
| (49, "ball recovery"), |
| (50, "dispossessed"), |
| (51, "error"), |
| (52, "keeper pick-up"), |
| (53, "cross not claimed"), |
| (54, "smother"), |
| (55, "offside provoked"), |
| (56, "shield ball opp"), |
| (57, "foul throw in"), |
| (58, "penalty faced"), |
| (59, "keeper sweeper"), |
| (60, "chance missed"), |
| (61, "ball touch"), |
| (62, "unknown62"), |
| (63, "temp_save"), |
| (64, "resume"), |
| (65, "contentious referee decision"), |
| (66, "possession data"), |
| (67, "50/50"), |
| (68, "referee drop ball"), |
| (69, "failed to block"), |
| (70, "injury time announcement"), |
| (71, "coach setup"), |
| (72, "caught offside"), |
| (73, "other ball contact"), |
| (74, "blocked pass"), |
| (75, "delayed start"), |
| (76, "early end"), |
| (77, "player off pitch"), |
| (78, "temp card"), |
| (79, "coverage interruption"), |
| (80, "drop of ball"), |
| (81, "obstacle"), |
| (83, "attempted tackle"), |
| (84, "deleted after review"), |
| (10000, "offside given"), |
| ], |
| columns=["type_id", "type_name"], |
| ) |
|
|
|
|
| def _deepupdate(target: dict[Any, Any], src: dict[Any, Any]) -> None: |
| """Deep update target dict with src. |
| |
| For each k,v in src: if k doesn't exist in target, it is deep copied from |
| src to target. Otherwise, if v is a list, target[k] is extended with |
| src[k]. If v is a set, target[k] is updated with v, If v is a dict, |
| recursively deep-update it. |
| |
| Parameters |
| ---------- |
| target: dict |
| The original dictionary which is updated. |
| src: dict |
| The dictionary with which `target` is updated. |
| |
| Examples |
| -------- |
| >>> t = {'name': 'ferry', 'hobbies': ['programming', 'sci-fi']} |
| >>> deepupdate(t, {'hobbies': ['gaming']}) |
| >>> print(t) |
| {'name': 'ferry', 'hobbies': ['programming', 'sci-fi', 'gaming']} |
| """ |
| for k, v in src.items(): |
| if isinstance(v, list): |
| if k not in target: |
| target[k] = copy.deepcopy(v) |
| else: |
| target[k].extend(v) |
| elif isinstance(v, dict): |
| if k not in target: |
| target[k] = copy.deepcopy(v) |
| else: |
| _deepupdate(target[k], v) |
| elif isinstance(v, set): |
| if k not in target: |
| target[k] = v.copy() |
| else: |
| target[k].update(v.copy()) |
| else: |
| target[k] = copy.copy(v) |
|
|
|
|
| def _extract_ids_from_path(path: str, pattern: str) -> dict[str, Union[str, int]]: |
| regex = re.compile( |
| ".+?" |
| + re.escape(pattern) |
| .replace(r"\{competition_id\}", r"(?P<competition_id>[a-zA-Zà-üÀ-Ü0-9-_ ]+)") |
| .replace(r"\{season_id\}", r"(?P<season_id>[a-zA-Zà-üÀ-Ü0-9-_ ]+)") |
| .replace(r"\{game_id\}", r"(?P<game_id>[a-zA-Zà-üÀ-Ü0-9-_ ]+)") |
| ) |
| m = re.match(regex, path) |
| if m is None: |
| raise ValueError(f"The filepath {path} does not match the format {pattern}.") |
| ids = m.groupdict() |
| return {k: int(v) if v.isdigit() else v for k, v in ids.items()} |
|
|
|
|
class OptaLoader(EventDataLoader):
    """Load Opta data feeds from a local folder.

    Parameters
    ----------
    root : str
        Root-path of the data.
    parser : str or dict
        Either 'xml', 'json', 'statsperform', 'whoscored' or a dict with
        a custom parser for each feed. The default xml parser supports F7 and
        F24 feeds; the default json parser supports F1, F9 and F24 feeds, the
        StatsPerform parser supports MA1 and MA3 feeds. Custom parsers can be
        specified as::

            {
                'feed1_name': Feed1Parser
                'feed2_name': Feed2Parser
            }

        where Feed1Parser and Feed2Parser are classes implementing
        :class:`~socceraction.spadl.opta.OptaParser` and 'feed1_name' and
        'feed2_name' are a unique ID for each feed that matches to the keys in
        `feeds`.
    feeds : dict
        Glob pattern describing from which files the data from a specific game
        can be retrieved. For example, if files are named::

            f7-1-2021-17362.xml
            f24-1-2021-17362.xml

        use::

            feeds = {
                'f7': "f7-{competition_id}-{season_id}-{game_id}.xml",
                'f24': "f24-{competition_id}-{season_id}-{game_id}.xml"
            }

    Raises
    ------
    ValueError
        If an invalid parser is provided.
    """

    def __init__(
        self,
        root: str,
        parser: Union[str, Mapping[str, type[OptaParser]]] = "xml",
        feeds: Optional[dict[str, str]] = None,
    ) -> None:
        self.root = root
        # Pick the parser registry and default feed patterns matching the
        # requested parser flavour.
        if parser == "json":
            if feeds is None:
                feeds = {
                    "f1": "f1-{competition_id}-{season_id}.json",
                    "f9": "f9-{competition_id}-{season_id}-{game_id}.json",
                    "f24": "f24-{competition_id}-{season_id}-{game_id}.json",
                }
            self.parsers = self._get_parsers_for_feeds(_jsonparsers, feeds)
        elif parser == "xml":
            if feeds is None:
                feeds = {
                    "f7": "f7-{competition_id}-{season_id}-{game_id}.xml",
                    "f24": "f24-{competition_id}-{season_id}-{game_id}.xml",
                }
            self.parsers = self._get_parsers_for_feeds(_xmlparsers, feeds)
        elif parser == "statsperform":
            if feeds is None:
                feeds = {
                    "ma1": "ma1-{competition_id}-{season_id}.json",
                    "ma3": "ma3-{competition_id}-{season_id}-{game_id}.json",
                }
            self.parsers = self._get_parsers_for_feeds(_statsperformparsers, feeds)
        elif parser == "whoscored":
            if feeds is None:
                feeds = {
                    "whoscored": "{competition_id}-{season_id}-{game_id}.json",
                }
            self.parsers = self._get_parsers_for_feeds(_whoscoredparsers, feeds)
        elif isinstance(parser, dict):
            if feeds is None:
                raise ValueError("You must specify a feed for each parser.")
            self.parsers = self._get_parsers_for_feeds(parser, feeds)
        else:
            raise ValueError("Invalid parser provided.")
        # Normalize path separators for the current OS.
        self.feeds = {k: str(Path(v)) for k, v in feeds.items()}

    def _get_parsers_for_feeds(
        self, available_parsers: Mapping[str, type[OptaParser]], feeds: dict[str, str]
    ) -> Mapping[str, type[OptaParser]]:
        """Select the appropriate parser for each feed.

        Parameters
        ----------
        available_parsers : dict(str, OptaParser)
            Dictionary with all available parsers.
        feeds : dict(str, str)
            All feeds that should be parsed.

        Returns
        -------
        dict(str, OptaParser)
            A mapping between all feeds that should be parsed and the
            corresponding parser class.

        Warns
        -----
        Raises a warning if there is no parser available for any of the
        provided feeds.
        """
        parsers = {}
        for feed in feeds:
            if feed in available_parsers:
                parsers[feed] = available_parsers[feed]
            else:
                warnings.warn(f"No parser available for {feed} feeds. This feed is ignored.")
        return parsers

    def _collect(
        self,
        extractor: str,
        competition_id: Union[int, str] = "*",
        season_id: Union[int, str] = "*",
        game_id: Union[int, str] = "*",
    ) -> dict[Any, dict[str, Any]]:
        """Run the given extractor on every feed file matching the given IDs.

        This is the shared glob -> parse -> merge loop behind :meth:`games`,
        :meth:`teams`, :meth:`players` and :meth:`events`.

        Parameters
        ----------
        extractor : str
            Name of the parser method to call (e.g. ``"extract_events"``).
        competition_id, season_id, game_id : int or str
            Concrete IDs to select, or ``"*"`` to match anything.

        Returns
        -------
        dict
            The merged output of the extractor across all matching files.
        """
        data: dict[Any, dict[str, Any]] = {}
        for feed, feed_pattern in self.feeds.items():
            glob_pattern = feed_pattern.format(
                competition_id=competition_id, season_id=season_id, game_id=game_id
            )
            feed_files = glob.glob(os.path.join(self.root, glob_pattern))
            for ffp in feed_files:
                ids = _extract_ids_from_path(ffp, feed_pattern)
                parser = self.parsers[feed](ffp, **ids)
                _deepupdate(data, getattr(parser, extractor)())
        return data

    def competitions(self) -> DataFrame[OptaCompetitionSchema]:
        """Return a dataframe with all available competitions and seasons.

        Returns
        -------
        pd.DataFrame
            A dataframe containing all available competitions and seasons. See
            :class:`~socceraction.spadl.opta.OptaCompetitionSchema` for the schema.
        """
        data: dict[int, dict[str, Any]] = {}
        # Parse at most one file per (competition, season) pair; additional
        # files for the same season carry the same competition metadata.
        loaded_seasons = set()
        for feed, feed_pattern in self.feeds.items():
            glob_pattern = feed_pattern.format(competition_id="*", season_id="*", game_id="*")
            feed_files = glob.glob(os.path.join(self.root, glob_pattern))
            for ffp in feed_files:
                ids = _extract_ids_from_path(ffp, feed_pattern)
                competition_id = ids.get("competition_id")
                season_id = ids.get("season_id")
                if competition_id is not None and season_id is not None:
                    if (competition_id, season_id) in loaded_seasons:
                        continue
                    loaded_seasons.add((competition_id, season_id))
                parser = self.parsers[feed](ffp, **ids)
                _deepupdate(data, parser.extract_competitions())
        return cast(DataFrame[OptaCompetitionSchema], pd.DataFrame(list(data.values())))

    def games(self, competition_id: int, season_id: int) -> DataFrame[OptaGameSchema]:
        """Return a dataframe with all available games in a season.

        Parameters
        ----------
        competition_id : int
            The ID of the competition.
        season_id : int
            The ID of the season.

        Returns
        -------
        pd.DataFrame
            A dataframe containing all available games. See
            :class:`~socceraction.spadl.opta.OptaGameSchema` for the schema.
        """
        data = self._collect(
            "extract_games", competition_id=competition_id, season_id=season_id
        )
        return cast(DataFrame[OptaGameSchema], pd.DataFrame(list(data.values())))

    def teams(self, game_id: int) -> DataFrame[OptaTeamSchema]:
        """Return a dataframe with both teams that participated in a game.

        Parameters
        ----------
        game_id : int
            The ID of the game.

        Returns
        -------
        pd.DataFrame
            A dataframe containing both teams. See
            :class:`~socceraction.spadl.opta.OptaTeamSchema` for the schema.
        """
        data = self._collect("extract_teams", game_id=game_id)
        return cast(DataFrame[OptaTeamSchema], pd.DataFrame(list(data.values())))

    def players(self, game_id: int) -> DataFrame[OptaPlayerSchema]:
        """Return a dataframe with all players that participated in a game.

        Parameters
        ----------
        game_id : int
            The ID of the game.

        Returns
        -------
        pd.DataFrame
            A dataframe containing all players. See
            :class:`~socceraction.spadl.opta.OptaPlayerSchema` for the schema.
        """
        data = self._collect("extract_players", game_id=game_id)
        df_players = pd.DataFrame(list(data.values()))
        df_players["game_id"] = game_id
        return cast(DataFrame[OptaPlayerSchema], df_players)

    def events(self, game_id: int) -> DataFrame[OptaEventSchema]:
        """Return a dataframe with the event stream of a game.

        Parameters
        ----------
        game_id : int
            The ID of the game.

        Returns
        -------
        pd.DataFrame
            A dataframe containing the event stream. See
            :class:`~socceraction.spadl.opta.OptaEventSchema` for the schema.
        """
        data = self._collect("extract_events", game_id=game_id)
        events = pd.DataFrame(list(data.values())).merge(
            _eventtypesdf, on="type_id", how="left"
        )

        # Sanitize the clock before sorting: some feeds report negative
        # seconds. (The original sorted both before and after clamping; the
        # first sort was redundant, so only the post-clamp sort is kept.)
        events.loc[events.second < 0, "second"] = 0
        events = events.sort_values(
            ["game_id", "period_id", "minute", "second", "timestamp"], kind="mergesort"
        ).reset_index(drop=True)

        # Drop deleted events (type 43) and events with an implausible
        # timestamp, which indicate collection errors.
        events = events[events.type_id != 43]
        events = events[
            ~(
                (events.timestamp < datetime.datetime(1900, 1, 1))
                | (events.timestamp > datetime.datetime(2100, 1, 1))
            )
        ]

        return cast(DataFrame[OptaEventSchema], events)
|
|