| """Implements serializers for Wyscout data.""" |
|
|
| import glob |
| import os |
| import re |
| import warnings |
| from pathlib import Path |
| from typing import Any, Callable, Optional, Union, cast |
| from urllib.error import HTTPError |
| from urllib.parse import urlparse |
| from urllib.request import urlopen, urlretrieve |
| from zipfile import ZipFile, is_zipfile |
|
|
| import pandas as pd |
| from pandera.typing import DataFrame |
|
|
| from ..base import ( |
| EventDataLoader, |
| JSONType, |
| MissingDataError, |
| ParseError, |
| _auth_remoteloadjson, |
| _expand_minute, |
| _has_auth, |
| _localloadjson, |
| _remoteloadjson, |
| ) |
| from .schema import ( |
| WyscoutCompetitionSchema, |
| WyscoutEventSchema, |
| WyscoutGameSchema, |
| WyscoutPlayerSchema, |
| WyscoutTeamSchema, |
| ) |
|
|
|
|
| class PublicWyscoutLoader(EventDataLoader): |
| """ |
| Load the public Wyscout dataset. |
| |
| This dataset is a public release of event stream data, collected by Wyscout |
| (https://wyscout.com/) containing all matches of the 2017/18 season of the |
| top-5 European leagues (La Liga, Serie A, Bundesliga, Premier League, Ligue |
| 1), the FIFA World Cup 2018, and UEFA Euro Cup 2016. For a detailed |
| description, see Pappalardo et al. [1]_. |
| |
| Parameters |
| ---------- |
| root : str |
| Path where a local copy of the dataset is stored or where the |
| downloaded dataset should be stored. |
| download : bool |
| Whether to force a redownload of the data. |
| |
| References |
| ---------- |
| .. [1] Pappalardo, L., Cintia, P., Rossi, A. et al. A public data set of |
| spatio-temporal match events in soccer competitions. Sci Data 6, 236 |
| (2019). https://doi.org/10.1038/s41597-019-0247-7 |
| """ |
|
|
| def __init__(self, root: Optional[str] = None, download: bool = False) -> None: |
| if root is None: |
| self.root = os.path.join(os.getcwd(), "wyscout_data") |
| os.makedirs(self.root, exist_ok=True) |
| else: |
| self.root = root |
|
|
| self.get = _localloadjson |
|
|
| if download or len(os.listdir(self.root)) == 0: |
| self._download_repo() |
|
|
| self._index = pd.DataFrame( |
| [ |
| { |
| "competition_id": 524, |
| "season_id": 181248, |
| "season_name": "2017/2018", |
| "db_matches": "matches_Italy.json", |
| "db_events": "events_Italy.json", |
| }, |
| { |
| "competition_id": 364, |
| "season_id": 181150, |
| "season_name": "2017/2018", |
| "db_matches": "matches_England.json", |
| "db_events": "events_England.json", |
| }, |
| { |
| "competition_id": 795, |
| "season_id": 181144, |
| "season_name": "2017/2018", |
| "db_matches": "matches_Spain.json", |
| "db_events": "events_Spain.json", |
| }, |
| { |
| "competition_id": 412, |
| "season_id": 181189, |
| "season_name": "2017/2018", |
| "db_matches": "matches_France.json", |
| "db_events": "events_France.json", |
| }, |
| { |
| "competition_id": 426, |
| "season_id": 181137, |
| "season_name": "2017/2018", |
| "db_matches": "matches_Germany.json", |
| "db_events": "events_Germany.json", |
| }, |
| { |
| "competition_id": 102, |
| "season_id": 9291, |
| "season_name": "2016", |
| "db_matches": "matches_European_Championship.json", |
| "db_events": "events_European_Championship.json", |
| }, |
| { |
| "competition_id": 28, |
| "season_id": 10078, |
| "season_name": "2018", |
| "db_matches": "matches_World_Cup.json", |
| "db_events": "events_World_Cup.json", |
| }, |
| ] |
| ).set_index(["competition_id", "season_id"]) |
| self._match_index = self._create_match_index().set_index("match_id") |
| self._cache: Optional[dict[str, Any]] = None |
|
|
| def _download_repo(self) -> None: |
| dataset_urls = { |
| "competitions": "https://ndownloader.figshare.com/files/15073685", |
| "teams": "https://ndownloader.figshare.com/files/15073697", |
| "players": "https://ndownloader.figshare.com/files/15073721", |
| "matches": "https://ndownloader.figshare.com/files/14464622", |
| "events": "https://ndownloader.figshare.com/files/14464685", |
| } |
| |
| for url in dataset_urls.values(): |
| url_obj = urlopen(url).geturl() |
| path = Path(urlparse(url_obj).path) |
| file_name = os.path.join(self.root, path.name) |
| file_local, _ = urlretrieve(url_obj, file_name) |
| if is_zipfile(file_local): |
| with ZipFile(file_local) as zip_file: |
| zip_file.extractall(self.root) |
|
|
| def _create_match_index(self) -> pd.DataFrame: |
| df_matches = pd.concat( |
| [pd.DataFrame(self.get(path)) for path in glob.iglob(f"{self.root}/matches_*.json")] |
| ) |
| df_matches.rename( |
| columns={ |
| "wyId": "match_id", |
| "competitionId": "competition_id", |
| "seasonId": "season_id", |
| }, |
| inplace=True, |
| ) |
| return pd.merge( |
| df_matches[["match_id", "competition_id", "season_id"]], |
| self._index, |
| on=["competition_id", "season_id"], |
| how="left", |
| ) |
|
|
| def competitions(self) -> DataFrame[WyscoutCompetitionSchema]: |
| """Return a dataframe with all available competitions and seasons. |
| |
| Returns |
| ------- |
| pd.DataFrame |
| A dataframe containing all available competitions and seasons. See |
| :class:`~socceraction.spadl.wyscout.WyscoutCompetitionSchema` for the schema. |
| """ |
| path = os.path.join(self.root, "competitions.json") |
| df_competitions = pd.DataFrame(self.get(path)) |
| df_competitions.rename( |
| columns={"wyId": "competition_id", "name": "competition_name"}, inplace=True |
| ) |
| df_competitions["country_name"] = df_competitions.apply( |
| lambda x: x.area["name"] if x.area["name"] != "" else "International", axis=1 |
| ) |
| df_competitions["competition_gender"] = "male" |
| df_competitions = pd.merge( |
| df_competitions, |
| self._index.reset_index()[["competition_id", "season_id", "season_name"]], |
| on="competition_id", |
| how="left", |
| ) |
| return cast( |
| DataFrame[WyscoutCompetitionSchema], |
| df_competitions.reset_index()[ |
| [ |
| "competition_id", |
| "season_id", |
| "country_name", |
| "competition_name", |
| "competition_gender", |
| "season_name", |
| ] |
| ], |
| ) |
|
|
| def games(self, competition_id: int, season_id: int) -> DataFrame[WyscoutGameSchema]: |
| """Return a dataframe with all available games in a season. |
| |
| Parameters |
| ---------- |
| competition_id : int |
| The ID of the competition. |
| season_id : int |
| The ID of the season. |
| |
| Returns |
| ------- |
| pd.DataFrame |
| A dataframe containing all available games. See |
| :class:`~socceraction.spadl.wyscout.WyscoutGameSchema` for the schema. |
| """ |
| path = os.path.join(self.root, self._index.at[(competition_id, season_id), "db_matches"]) |
| df_matches = pd.DataFrame(self.get(path)) |
| return cast(DataFrame[WyscoutGameSchema], _convert_games(df_matches)) |
|
|
| def _lineups(self, game_id: int) -> list[dict[str, Any]]: |
| competition_id, season_id = self._match_index.loc[game_id, ["competition_id", "season_id"]] |
| path = os.path.join(self.root, self._index.at[(competition_id, season_id), "db_matches"]) |
| df_matches = pd.DataFrame(self.get(path)).set_index("wyId") |
| return list(df_matches.at[game_id, "teamsData"].values()) |
|
|
| def teams(self, game_id: int) -> DataFrame[WyscoutTeamSchema]: |
| """Return a dataframe with both teams that participated in a game. |
| |
| Parameters |
| ---------- |
| game_id : int |
| The ID of the game. |
| |
| Returns |
| ------- |
| pd.DataFrame |
| A dataframe containing both teams. See |
| :class:`~socceraction.spadl.wyscout.WyscoutTeamSchema` for the schema. |
| """ |
| path = os.path.join(self.root, "teams.json") |
| df_teams = pd.DataFrame(self.get(path)).set_index("wyId") |
| df_teams_match_id = pd.DataFrame(self._lineups(game_id))["teamId"] |
| df_teams_match = df_teams.loc[df_teams_match_id].reset_index() |
| return cast(DataFrame[WyscoutTeamSchema], _convert_teams(df_teams_match)) |
|
|
| def players(self, game_id: int) -> DataFrame[WyscoutPlayerSchema]: |
| """Return a dataframe with all players that participated in a game. |
| |
| Parameters |
| ---------- |
| game_id : int |
| The ID of the game. |
| |
| Returns |
| ------- |
| pd.DataFrame |
| A dataframe containing all players. See |
| :class:`~socceraction.spadl.wyscout.WyscoutPlayerSchema` for the schema. |
| """ |
| path = os.path.join(self.root, "players.json") |
| df_players = pd.DataFrame(self.get(path)).set_index("wyId") |
| lineups = self._lineups(game_id) |
| players_match = [] |
| for team in lineups: |
| playerlist = team["formation"]["lineup"] |
| if team["formation"]["substitutions"] != "null": |
| for p in team["formation"]["substitutions"]: |
| try: |
| playerlist.append( |
| next( |
| item |
| for item in team["formation"]["bench"] |
| if item["playerId"] == p["playerIn"] |
| ) |
| ) |
| except StopIteration: |
| warnings.warn( |
| f'A player with ID={p["playerIn"]} was substituted ' |
| f'in the {p["minute"]}th minute of game {game_id}, but ' |
| "could not be found on the bench." |
| ) |
| df = pd.DataFrame(playerlist) |
| df["side"] = team["side"] |
| df["team_id"] = team["teamId"] |
| players_match.append(df) |
| df_players_match = ( |
| pd.concat(players_match) |
| .rename(columns={"playerId": "wyId"}) |
| .set_index("wyId") |
| .join(df_players, how="left") |
| ) |
| df_players_match.reset_index(inplace=True) |
| for c in ["shortName", "lastName", "firstName"]: |
| df_players_match[c] = df_players_match[c].apply( |
| lambda x: x.encode().decode("unicode-escape") |
| ) |
| df_players_match = _convert_players(df_players_match) |
|
|
| |
| competition_id, season_id = self._match_index.loc[game_id, ["competition_id", "season_id"]] |
| path = os.path.join(self.root, self._index.at[(competition_id, season_id), "db_events"]) |
| if self._cache is not None and self._cache["path"] == path: |
| df_events = self._cache["events"] |
| else: |
| df_events = pd.DataFrame(self.get(path)).set_index("matchId") |
| |
| |
| self._cache = {"path": path, "events": df_events} |
| match_events = df_events.loc[game_id].reset_index().to_dict("records") |
| mp = _get_minutes_played(lineups, match_events) |
| df_players_match = pd.merge(df_players_match, mp, on="player_id", how="right") |
| df_players_match["minutes_played"] = df_players_match.minutes_played.fillna(0) |
| df_players_match["game_id"] = game_id |
| return cast(DataFrame[WyscoutPlayerSchema], df_players_match) |
|
|
| def events(self, game_id: int) -> DataFrame[WyscoutEventSchema]: |
| """Return a dataframe with the event stream of a game. |
| |
| Parameters |
| ---------- |
| game_id : int |
| The ID of the game. |
| |
| Returns |
| ------- |
| pd.DataFrame |
| A dataframe containing the event stream. See |
| :class:`~socceraction.spadl.wyscout.WyscoutEventSchema` for the schema. |
| """ |
| competition_id, season_id = self._match_index.loc[game_id, ["competition_id", "season_id"]] |
| path = os.path.join(self.root, self._index.at[(competition_id, season_id), "db_events"]) |
| if self._cache is not None and self._cache["path"] == path: |
| df_events = self._cache["events"] |
| else: |
| df_events = pd.DataFrame(self.get(path)).set_index("matchId") |
| |
| |
| self._cache = {"path": path, "events": df_events} |
| return cast( |
| DataFrame[WyscoutEventSchema], _convert_events(df_events.loc[game_id].reset_index()) |
| ) |
|
|
|
|
| class WyscoutLoader(EventDataLoader): |
| """Load event data either from a remote location or from a local folder. |
| |
| Parameters |
| ---------- |
| root : str |
| Root-path of the data. |
| getter : str or callable, default: "remote" |
| "remote", "local" or a function that returns loads JSON data from a path. |
| feeds : dict(str, str) |
| Glob pattern for each feed that should be parsed. The default feeds for |
| a "remote" getter are:: |
| |
| { |
| 'competitions': 'competitions', |
| 'seasons': 'competitions/{season_id}/seasons', |
| 'games': 'seasons/{season_id}/matches', |
| 'events': 'matches/{game_id}/events?fetch=teams,players,match,substitutions' |
| } |
| |
| The default feeds for a "local" getter are:: |
| |
| { |
| 'competitions': 'competitions.json', |
| 'seasons': 'seasons_{competition_id}.json', |
| 'games': 'matches_{season_id}.json', |
| 'events': 'matches/events_{game_id}.json', |
| } |
| |
| creds: dict, optional |
| Login credentials in the format {"user": "", "passwd": ""}. Only used |
| when getter is "remote". |
| """ |
|
|
| _wyscout_api: str = "https://apirest.wyscout.com/v2/" |
|
|
| def __init__( |
| self, |
| root: str = _wyscout_api, |
| getter: Union[str, Callable[[str], JSONType]] = "remote", |
| feeds: Optional[dict[str, str]] = None, |
| creds: Optional[dict[str, str]] = None, |
| ) -> None: |
| self.root = root |
|
|
| |
| if creds is None: |
| creds = { |
| "user": os.environ.get("WY_USERNAME", ""), |
| "passwd": os.environ.get("WY_PASSWORD", ""), |
| } |
|
|
| |
| if getter == "remote": |
| self.get = _remoteloadjson |
| if _has_auth(creds): |
| _auth_remoteloadjson(creds["user"], creds["passwd"]) |
| elif getter == "local": |
| self.get = _localloadjson |
| else: |
| self.get = getter |
|
|
| |
| if feeds is not None: |
| self.feeds = feeds |
| elif getter == "remote": |
| self.feeds = { |
| "seasons": "competitions/{competition_id}/seasons?fetch=competition", |
| "games": "seasons/{season_id}/matches", |
| "events": "matches/{game_id}/events?fetch=teams,players,match,coaches,referees,formations,substitutions", |
| } |
| elif getter == "local": |
| self.feeds = { |
| "competitions": "competitions.json", |
| "seasons": "seasons_{competition_id}.json", |
| "games": "matches_{season_id}.json", |
| "events": "matches/events_{game_id}.json", |
| } |
| else: |
| raise ValueError("No feeds specified.") |
|
|
| def _get_file_or_url( |
| self, |
| feed: str, |
| competition_id: Optional[int] = None, |
| season_id: Optional[int] = None, |
| game_id: Optional[int] = None, |
| ) -> list[str]: |
| competition_id_glob = "*" if competition_id is None else competition_id |
| season_id_glob = "*" if season_id is None else season_id |
| game_id_glob = "*" if game_id is None else game_id |
| glob_pattern = self.feeds[feed].format( |
| competition_id=competition_id_glob, season_id=season_id_glob, game_id=game_id_glob |
| ) |
| if "*" in glob_pattern: |
| files = glob.glob(os.path.join(self.root, glob_pattern)) |
| if len(files) == 0: |
| raise MissingDataError |
| return files |
| return [glob_pattern] |
|
|
| def competitions( |
| self, competition_id: Optional[int] = None |
| ) -> DataFrame[WyscoutCompetitionSchema]: |
| """Return a dataframe with all available competitions and seasons. |
| |
| Parameters |
| ---------- |
| competition_id : int, optional |
| The ID of the competition. |
| |
| Raises |
| ------ |
| ParseError |
| When the raw data does not adhere to the expected format. |
| |
| Returns |
| ------- |
| pd.DataFrame |
| A dataframe containing all available competitions and seasons. See |
| :class:`~socceraction.spadl.wyscout.WyscoutCompetitionSchema` for the schema. |
| """ |
| |
| if "competitions" in self.feeds: |
| competitions_url = self._get_file_or_url("competitions")[0] |
| path = os.path.join(self.root, competitions_url) |
| obj = self.get(path) |
| if not isinstance(obj, dict) or "competitions" not in obj: |
| raise ParseError(f"{path} should contain a list of competitions") |
| seasons_urls = [ |
| self._get_file_or_url("seasons", competition_id=c["wyId"])[0] |
| for c in obj["competitions"] |
| ] |
| else: |
| seasons_urls = self._get_file_or_url("seasons", competition_id=competition_id) |
| |
| competitions = [] |
| seasons = [] |
| for seasons_url in seasons_urls: |
| try: |
| path = os.path.join(self.root, seasons_url) |
| obj = self.get(path) |
| if not isinstance(obj, dict) or "competition" not in obj or "seasons" not in obj: |
| raise ParseError( |
| f"{path} should contain a list of competition and list of seasons" |
| ) |
| competitions.append(obj["competition"]) |
| seasons.extend([s["season"] for s in obj["seasons"]]) |
| except FileNotFoundError: |
| warnings.warn(f"File not found: {seasons_url}") |
| df_competitions = _convert_competitions(pd.DataFrame(competitions)) |
| df_seasons = _convert_seasons(pd.DataFrame(seasons)) |
| |
| return cast( |
| DataFrame[WyscoutCompetitionSchema], |
| pd.merge(df_competitions, df_seasons, on="competition_id"), |
| ) |
|
|
| def games(self, competition_id: int, season_id: int) -> DataFrame[WyscoutGameSchema]: |
| """Return a dataframe with all available games in a season. |
| |
| Parameters |
| ---------- |
| competition_id : int |
| The ID of the competition. |
| season_id : int |
| The ID of the season. |
| |
| Raises |
| ------ |
| ParseError |
| When the raw data does not adhere to the expected format. |
| |
| Returns |
| ------- |
| pd.DataFrame |
| A dataframe containing all available games. See |
| :class:`~socceraction.spadl.wyscout.WyscoutGameSchema` for the schema. |
| """ |
| |
| if "games" in self.feeds: |
| games_url = self._get_file_or_url( |
| "games", competition_id=competition_id, season_id=season_id |
| )[0] |
| path = os.path.join(self.root, games_url) |
| obj = self.get(path) |
| if not isinstance(obj, dict) or "matches" not in obj: |
| raise ParseError(f"{path} should contain a list of matches") |
| gamedetails_urls = [ |
| self._get_file_or_url( |
| "events", |
| competition_id=competition_id, |
| season_id=season_id, |
| game_id=g["matchId"], |
| )[0] |
| for g in obj["matches"] |
| ] |
| else: |
| gamedetails_urls = self._get_file_or_url( |
| "events", competition_id=competition_id, season_id=season_id |
| ) |
| games = [] |
| for gamedetails_url in gamedetails_urls: |
| try: |
| path = os.path.join(self.root, gamedetails_url) |
| obj = self.get(path) |
| if not isinstance(obj, dict) or "match" not in obj: |
| raise ParseError(f"{path} should contain a match") |
| games.append(obj["match"]) |
| except FileNotFoundError: |
| warnings.warn(f"File not found: {gamedetails_url}") |
| except HTTPError: |
| warnings.warn(f"Resource not found: {gamedetails_url}") |
| df_games = _convert_games(pd.DataFrame(games)) |
| return cast(DataFrame[WyscoutGameSchema], df_games) |
|
|
| def teams(self, game_id: int) -> DataFrame[WyscoutTeamSchema]: |
| """Return a dataframe with both teams that participated in a game. |
| |
| Parameters |
| ---------- |
| game_id : int |
| The ID of the game. |
| |
| Raises |
| ------ |
| ParseError |
| When the raw data does not adhere to the expected format. |
| |
| Returns |
| ------- |
| pd.DataFrame |
| A dataframe containing both teams. See |
| :class:`~socceraction.spadl.wyscout.WyscoutTeamSchema` for the schema. |
| """ |
| events_url = self._get_file_or_url("events", game_id=game_id)[0] |
| path = os.path.join(self.root, events_url) |
| obj = self.get(path) |
| if not isinstance(obj, dict) or "teams" not in obj: |
| raise ParseError(f"{path} should contain a list of matches") |
| teams = [t["team"] for t in obj["teams"].values() if t.get("team")] |
| df_teams = _convert_teams(pd.DataFrame(teams)) |
| return cast(DataFrame[WyscoutTeamSchema], df_teams) |
|
|
| def players(self, game_id: int) -> DataFrame[WyscoutPlayerSchema]: |
| """Return a dataframe with all players that participated in a game. |
| |
| Parameters |
| ---------- |
| game_id : int |
| The ID of the game. |
| |
| Raises |
| ------ |
| ParseError |
| When the raw data does not adhere to the expected format. |
| |
| Returns |
| ------- |
| pd.DataFrame |
| A dataframe containing all players. See |
| :class:`~socceraction.spadl.wyscout.WyscoutPlayerSchema` for the schema. |
| """ |
| events_url = self._get_file_or_url("events", game_id=game_id)[0] |
| path = os.path.join(self.root, events_url) |
| obj = self.get(path) |
| if not isinstance(obj, dict) or "players" not in obj: |
| raise ParseError(f"{path} should contain a list of players") |
| players = [ |
| player["player"] |
| for team in obj["players"].values() |
| for player in team |
| if player.get("player") |
| ] |
| df_players = _convert_players(pd.DataFrame(players).drop_duplicates("wyId")) |
| df_players = pd.merge( |
| df_players, |
| _get_minutes_played(obj["match"]["teamsData"], obj["events"]), |
| on="player_id", |
| how="right", |
| ) |
| df_players["minutes_played"] = df_players.minutes_played.fillna(0) |
| df_players["game_id"] = game_id |
| return cast(DataFrame[WyscoutPlayerSchema], df_players) |
|
|
| def events(self, game_id: int) -> DataFrame[WyscoutEventSchema]: |
| """Return a dataframe with the event stream of a game. |
| |
| Parameters |
| ---------- |
| game_id : int |
| The ID of the game. |
| |
| Raises |
| ------ |
| ParseError |
| When the raw data does not adhere to the expected format. |
| |
| Returns |
| ------- |
| pd.DataFrame |
| A dataframe containing the event stream. See |
| :class:`~socceraction.spadl.wyscout.WyscoutEventSchema` for the schema. |
| """ |
| events_url = self._get_file_or_url("events", game_id=game_id)[0] |
| path = os.path.join(self.root, events_url) |
| obj = self.get(path) |
| if not isinstance(obj, dict) or "events" not in obj: |
| raise ParseError(f"{path} should contain a list of events") |
| df_events = _convert_events(pd.DataFrame(obj["events"])) |
| return cast(DataFrame[WyscoutEventSchema], df_events) |
|
|
|
|
| def _convert_competitions(competitions: pd.DataFrame) -> pd.DataFrame: |
| competitionsmapping = { |
| "wyId": "competition_id", |
| "name": "competition_name", |
| "gender": "competition_gender", |
| } |
| cols = ["competition_id", "competition_name", "country_name", "competition_gender"] |
| competitions["country_name"] = competitions.apply( |
| lambda x: x.area["name"] if x.area["name"] != "" else "International", axis=1 |
| ) |
| competitions = competitions.rename(columns=competitionsmapping)[cols] |
| return competitions |
|
|
|
|
| def _convert_seasons(seasons: pd.DataFrame) -> pd.DataFrame: |
| seasonsmapping = { |
| "wyId": "season_id", |
| "name": "season_name", |
| "competitionId": "competition_id", |
| } |
| cols = ["season_id", "season_name", "competition_id"] |
| seasons = seasons.rename(columns=seasonsmapping)[cols] |
| return seasons |
|
|
|
|
| def _convert_games(matches: pd.DataFrame) -> pd.DataFrame: |
| gamesmapping = { |
| "wyId": "game_id", |
| "dateutc": "game_date", |
| "competitionId": "competition_id", |
| "seasonId": "season_id", |
| "gameweek": "game_day", |
| } |
| cols = ["game_id", "competition_id", "season_id", "game_date", "game_day"] |
| games = matches.rename(columns=gamesmapping)[cols] |
| games["game_date"] = pd.to_datetime(games["game_date"]) |
| games["home_team_id"] = matches.teamsData.apply(lambda x: _get_team_id(x, "home")) |
| games["away_team_id"] = matches.teamsData.apply(lambda x: _get_team_id(x, "away")) |
| return games |
|
|
|
|
| def _get_team_id(teamsData: dict[int, Any], side: str) -> int: |
| for team_id, data in teamsData.items(): |
| if data["side"] == side: |
| return int(team_id) |
| raise ValueError() |
|
|
|
|
| def _convert_players(players: pd.DataFrame) -> pd.DataFrame: |
| playermapping = { |
| "wyId": "player_id", |
| "shortName": "nickname", |
| "firstName": "firstname", |
| "lastName": "lastname", |
| "birthDate": "birth_date", |
| } |
| cols = ["player_id", "nickname", "firstname", "lastname", "birth_date"] |
| df_players = players.rename(columns=playermapping)[cols] |
| df_players["player_name"] = df_players[["firstname", "lastname"]].agg(" ".join, axis=1) |
| df_players["birth_date"] = pd.to_datetime(df_players["birth_date"]) |
| return df_players |
|
|
|
|
| def _convert_teams(teams: pd.DataFrame) -> pd.DataFrame: |
| teammapping = { |
| "wyId": "team_id", |
| "name": "team_name_short", |
| "officialName": "team_name", |
| } |
| cols = ["team_id", "team_name_short", "team_name"] |
| return teams.rename(columns=teammapping)[cols] |
|
|
|
|
| def _convert_events(raw_events: pd.DataFrame) -> pd.DataFrame: |
| eventmapping = { |
| "id": "event_id", |
| "match_id": "game_id", |
| "event_name": "type_name", |
| "sub_event_name": "subtype_name", |
| } |
| cols = [ |
| "event_id", |
| "game_id", |
| "period_id", |
| "milliseconds", |
| "team_id", |
| "player_id", |
| "type_id", |
| "type_name", |
| "subtype_id", |
| "subtype_name", |
| "positions", |
| "tags", |
| ] |
| events = raw_events.copy() |
| |
| pattern = re.compile(r"(?<!^)(?=[A-Z])") |
| events.columns = [pattern.sub("_", c).lower() for c in events.columns] |
| |
| events["type_id"] = ( |
| pd.to_numeric( |
| events["event_id"] if "event_id" in events.columns else None, errors="coerce" |
| ) |
| .fillna(0) |
| .astype(int) |
| ) |
| del events["event_id"] |
| events["subtype_id"] = ( |
| pd.to_numeric( |
| events["sub_event_id"] if "sub_event_id" in events.columns else None, errors="coerce" |
| ) |
| .fillna(0) |
| .astype(int) |
| ) |
| del events["sub_event_id"] |
| events["period_id"] = events.match_period.apply(lambda x: wyscout_periods[x]) |
| events["milliseconds"] = events.event_sec * 1000 |
| return events.rename(columns=eventmapping)[cols] |
|
|
|
|
| def _get_minutes_played( |
| teamsData: list[dict[str, Any]], events: list[dict[str, Any]] |
| ) -> pd.DataFrame: |
| |
| periods_ts = {i: [0] for i in range(6)} |
| for e in events: |
| period_id = wyscout_periods[e["matchPeriod"]] |
| periods_ts[period_id].append(e["eventSec"]) |
| periods_duration = [ |
| round(max(periods_ts[i]) / 60) for i in range(5) if max(periods_ts[i]) != 0 |
| ] |
| |
| duration = sum(periods_duration) |
|
|
| |
| playergames: dict[int, dict[str, Any]] = {} |
| if isinstance(teamsData, dict): |
| teamsData = list(teamsData.values()) |
| for teamData in teamsData: |
| formation = teamData.get("formation", {}) |
| substitutions = formation.get("substitutions", []) |
| red_cards = { |
| player["playerId"]: _expand_minute(int(player["redCards"]), periods_duration) |
| for key in ["bench", "lineup"] |
| for player in formation.get(key, []) |
| if player["redCards"] != "0" |
| } |
| pg = { |
| player["playerId"]: { |
| "team_id": teamData["teamId"], |
| "player_id": player["playerId"], |
| "jersey_number": player.get("shirtNumber", 0), |
| "minutes_played": red_cards.get(player["playerId"], duration), |
| "is_starter": True, |
| } |
| for player in formation.get("lineup", []) |
| } |
|
|
| |
| if substitutions != "null": |
| for substitution in substitutions: |
| expanded_minute_sub = _expand_minute(substitution["minute"], periods_duration) |
| substitute = { |
| "team_id": teamData["teamId"], |
| "player_id": substitution["playerIn"], |
| "jersey_number": next( |
| ( |
| p.get("shirtNumber", 0) |
| for p in formation.get("bench", []) |
| if p["playerId"] == substitution["playerIn"] |
| ), |
| 0, |
| ), |
| "minutes_played": duration - expanded_minute_sub, |
| "is_starter": False, |
| } |
| if substitution["playerIn"] in red_cards: |
| substitute["minutes_played"] = ( |
| red_cards[substitution["playerIn"]] - expanded_minute_sub |
| ) |
| pg[substitution["playerIn"]] = substitute |
| pg[substitution["playerOut"]]["minutes_played"] = expanded_minute_sub |
|
|
| playergames = {**playergames, **pg} |
| return pd.DataFrame(playergames.values()) |
|
|
|
|
| wyscout_periods = {"1H": 1, "2H": 2, "E1": 3, "E2": 4, "P": 5} |
|
|