library(gt)
library(gtExtras)
library(rvest)
library(tidyverse)
library(worldfootballR)
#set seed for reproducibility
set.seed(20231228)
#fetch current table using the worldfootballr package
prem_2024_table <- fb_season_team_stats(country = "ENG",
gender = "M",
season_end_year = "2024",
tier = "1st",
stat_type = "league_table")
pl_table <- prem_2024_table %>%
select(Squad, MP, Pts, W, D, L, GF, GA, GD, xG, xGA, Rk) %>%
rename(Points = Pts,
Goals = GF,
Goals_Allowed = GA,
Rank = Rk,
Goal_Dif = GD,
xG_Allowed = xGA)
#read data from fb ref. Since there's only one table, it's pretty simple.
pl_url <- read_html("https://fbref.com/en/comps/9/schedule/Premier-League-Scores-and-Fixtures")
pl_tbls <- html_nodes(pl_url, "table")
pl_scores <- pl_tbls %>%
html_table(fill = TRUE)
#rather than use a case statement over and over, just make this a function
calculate_points <- function(a, b){
case_when(a > b ~ 3,
a == b ~ 1,
a > b ~ 0,
TRUE ~ 0)
}
#how many seasons we want to simulate
number_of_sims <- 20000
#function to simulate games. Accepts goals as an average,
#so could use average goals or average xG
sim_games <- function(home_goals, away_goals){
n_sims <- number_of_sims
home_goals <- rpois(n_sims, home_goals)
away_goals<- rpois(n_sims, away_goals)
results <- cbind(home_goals, away_goals) %>%
as_tibble %>%
mutate(goal_dif = home_goals - away_goals,
home_points = calculate_points(home_goals, away_goals),
away_points = calculate_points(away_goals, home_goals),
sim_num = row_number()) %>%
relocate(sim_num)
results
}
#clean our data
pl_prepped <- pl_scores[[1]] %>%
as_tibble(.name_repair = "unique") %>%
rename(HomexG = `xG...6`,
AwayxG = `xG...8`) %>%
separate(Score, c('HomeGoal', 'AwayGoal')) %>%
mutate(HomePoints = calculate_points(HomeGoal, AwayGoal),
AwayPoints = calculate_points(AwayGoal, HomeGoal)) %>%
filter(is.na(Wk) == FALSE) %>%
mutate(HomeGoal = as.numeric(HomeGoal),
AwayGoal = as.numeric(AwayGoal))
#get max date with a game
Max_Date <- pl_prepped %>%
filter(`Match Report` == 'Match Report') %>%
summarise(Max_Date = max(Date)) %>%
pull()
#get unplayed games - we don't need to sim games that have already been played
pl_remaning <- pl_prepped %>%
filter(`Match Report` != 'Match Report' | Notes == 'Match Suspended')
#do some light prep to bring in average goals for each team from the table,
#then run our sime
pl_simmed <- pl_remaning %>%
select(Wk, Date, Home, Away) %>%
inner_join(pl_table %>% select(Squad, Goals, MP) %>% rename(Home_Goals = Goals, Home_MP = MP), join_by(Home == Squad)) %>%
inner_join(pl_table %>% select(Squad, Goals, MP) %>% rename(Away_Goals = Goals, Away_MP = MP), join_by(Away == Squad)) %>%
mutate(Home_G_per_Match = Home_Goals / Home_MP,
Away_G_per_Match = Away_Goals / Away_MP) %>%
mutate(simmed_games = map2(Home_G_per_Match, Away_G_per_Match, ~sim_games(.x, .y)))
#unnest our sims, split into home and away
pl_simmed_long <- pl_simmed %>%
select(Wk, Date, Home, Away, simmed_games) %>%
unnest(simmed_games)
pl_simmed_home <- pl_simmed_long %>%
select(sim_num, Home, home_points, home_goals, away_goals) %>%
rename(Squad = Home,
Goals = home_goals,
Goals_Allowed = away_goals,
Points = home_points
) %>%
mutate(W = case_when(Goals > Goals_Allowed ~ 1, TRUE ~ 0),
D = case_when(Goals == Goals_Allowed ~ 1, TRUE ~ 0),
L = case_when(Goals < Goals_Allowed ~ 1, TRUE ~ 0))
pl_simmed_away <- pl_simmed_long %>%
select(sim_num, Away, away_points, away_goals, home_goals) %>%
rename(Squad = Away,
Goals = away_goals,
Goals_Allowed = home_goals,
Points = away_points
) %>%
mutate(W = case_when(Goals > Goals_Allowed ~ 1, TRUE ~ 0),
D = case_when(Goals == Goals_Allowed ~ 1, TRUE ~ 0),
L = case_when(Goals < Goals_Allowed ~ 1, TRUE ~ 0))
#we need actual in the data as well, so we repliate the table
#to match our number of sims
table_rep <- seq(1:number_of_sims) %>%
as_tibble() %>%
rename(sim_num = value) %>%
crossing(pl_table %>% select(Squad, Points, Goals, Goals_Allowed, W, D, L))
#take all of our sims and summarize. Note this includes the sim number if we want to play with that
pl_aggregated <- pl_simmed_home %>%
rbind(pl_simmed_away) %>%
rbind(table_rep) %>%
group_by(sim_num, Squad) %>%
summarise_if(is.numeric, sum, na.rm = TRUE) %>%
mutate(Goal_Dif = Goals - Goals_Allowed) %>%
group_by(sim_num) %>%
arrange(desc(Points), desc(Goal_Dif)) %>%
mutate(Rank = rank(desc(Points), ties.method="first"),
win_PL = case_when(Rank == 1 ~ 1, TRUE ~ 0),
qualify_UCL = case_when(Rank <= 4 ~ 1, TRUE ~ 0),
relgated = case_when(Rank >= 18 ~ 1, TRUE ~ 0))
pl_team <- pl_aggregated %>%
ungroup() %>%
select(-sim_num, -Rank) %>%
group_by(Squad) %>%
summarise_if(is.numeric, mean, na.rm = TRUE) %>%
arrange(desc(Points))
pl_sim_table <- pl_table %>%
inner_join(pl_team, by = join_by(Squad)) %>%
select(-xG, -xG_Allowed) %>%
arrange(desc(win_PL), relgated)
pl_simmed_long %>%
filter(Date == '2023-12-30' & Home == 'Manchester City') %>%
mutate(W = case_when(home_goals > away_goals ~ 1, TRUE ~ 0),
D = case_when(home_goals == away_goals ~ 1, TRUE ~ 0),
L = case_when(home_goals < away_goals ~ 1, TRUE ~ 0)) %>%
group_by(Date, Home, Away) %>%
summarise_if(is.numeric, mean, na.rm = TRUE) %>%
select(-Wk, -sim_num) %>%
ungroup() %>%
gt() %>%
gt_theme_espn() %>%
cols_label(home_goals = 'Home Goals',
away_goals = 'Away Goals',
goal_dif = 'GD',
home_points = 'Home Points',
away_points = 'Away Points') %>%
fmt_percent(columns = c(W, D, L), decimals = 1) %>%
fmt_number(columns = c(home_goals, away_goals, goal_dif, home_points, away_points),
decimals = 2)