#chunk of cleaning my data
#read the CSV of premier league matches
matches <- read.csv("~/Desktop/PremierLeagueMatches.csv")
#trims the last 90 rows of NAs
matches <- head(matches, -90)
# convert Date properly
matches$Date <- as.Date(matches$Date, format = "%m/%d/%y")
#converts Date, Time, Scores, Predicts, Attendence to workable numeric
matches <- matches %>%
mutate(
#makes date and time easier to work w
Time = hm(Time),
homeScore = as.numeric(homeScore),
awayScore = as.numeric(awayScore),
homeXG = as.numeric(homeXG),
awayXG = as.numeric(awayXG),
# remove commas
Attendance = as.numeric(str_remove(Attendance, ","))
)
#adds season names column, helpful when comparing across time
season_names <- c("Season 1","Season 2","Season 3") # adjust as needed
matches <- matches %>%
mutate(Season = case_when(
Date >= as.Date("2022-08-05") & Date <= as.Date("2023-05-28") ~ 1,
Date >= as.Date("2023-08-11") & Date <= as.Date("2024-05-19") ~ 2,
Date >= as.Date("2024-08-11") & Date <= as.Date("2025-05-31") ~ 3,
))
#adds a column for away error and home error for each team
matches <- matches %>%
mutate(
HError = (homeScore - homeXG),
AError = (awayScore - awayXG)
)
# Define relegated teams by season
relegated_list <- list(
"Season 1" = c("Leeds United", "Leicester City", "Southampton"),
"Season 2" = c("Burnley", "Luton Town", "Sheffield Utd"),
"Season 3" = c("Southampton", "Leicester City", "Ipswich Town")
)
# Add columns for home and away relegation status
matches <- matches %>%
mutate(
HomeRelegatedS = case_when(
(Season == 1 & Home.Team %in% relegated_list[["Season 1"]]) |
(Season == 2 & Home.Team %in% relegated_list[["Season 2"]]) |
(Season == 3 & Home.Team %in% relegated_list[["Season 3"]]) ~ TRUE,
TRUE ~ FALSE
),
AwayRelegatedS = case_when(
(Season == 1 & Away.Team %in% relegated_list[["Season 1"]]) |
(Season == 2 & Away.Team %in% relegated_list[["Season 2"]]) |
(Season == 3 & Away.Team %in% relegated_list[["Season 3"]]) ~ TRUE,
TRUE ~ FALSE
)
)
#computes total Match Error
matches <- matches %>%
mutate(
TotalMatchError = abs(HError) + abs(AError)
)
write.csv(matches, "~/Desktop/matches.csv", row.names = FALSE)
##Output correlation of XG v. Home Score
#Outputs correlation v. homescore
corrH <- cor(matches$homeScore, matches$homeXG, use = "complete.obs")
corrA <- cor(matches$awayScore, matches$awayXG, use = "complete.obs")
corrH
## [1] 0.5783566
corrA
## [1] 0.6121344
# creates Team Error csv, of each team separated by their home and away avg E
# used in tableau
#computes home average error
home_avg <- matches %>%
group_by(Team = Home.Team) %>%
summarise(Home = mean(HError, na.rm = TRUE), .groups = "drop")
#computes away average
away_avg <- matches %>%
group_by(Team = Away.Team) %>%
summarise(Away = mean(AError, na.rm = TRUE), .groups = "drop")
#combines the average away and home error
team_error <- full_join(home_avg, away_avg, by = "Team") %>%
mutate(overall_mean = (Home + Away) / 2)
#exports this to use in tableau
write.csv(team_error, "~/Desktop/teamErrorR.csv", row.names = FALSE)
# heatmap
ggplot(matches, aes(x = AError, y = HError)) +
geom_bin2d(bins = 30) +
scale_fill_gradient(low = "darkblue", high = "darkorange", name = "Match Count") +
labs(
title = "Distribution of Home vs Away Prediction Errors",
x = "Away Error",
y = "Home Error"
) +
# point at the origin: no aes() for constants
geom_point(x = 0, y = 0, shape = 4, color = "white", size = 4, stroke = 1) +
theme_minimal(base_size = 12)
#plot of refferees
#used plotly website for references
plot_ly(
data = matches,
x = ~Referee,
y = ~TotalMatchError,
type = "box",
boxpoints = "outliers",
marker = list(color = "darkblue"),
line = list(color = "gray30"),
fillcolor = "lightblue"
) %>%
layout(
title = "Total Prediction Error by Referee",
xaxis = list(
#makes the angled x axis, to be more readable
title = "Referee",
tickangle = 65
),
yaxis = list(title = "Total Error Per Match"),
#adds white space at the bottom so that
margin = list(b = 150),
font = list(size = 12)
)
##https://plotly.com/r/axes/
#Plotly about attendence
#had to take out the NAs, one NA in attn that was breaking code
attn <- matches %>%
filter(!is.na(Attendance), !is.na(TotalMatchError))
#creates the linear regression line, to show lack of relationship even more
fit <- lm(TotalMatchError ~ Attendance, data = attn)
#plotly scatter of attendance v. total match error
plot_ly(attn, x = ~Attendance, y = ~TotalMatchError, type = "scatter",
mode = "markers", marker = list(color = "navy", opacity = 0.8, size = 6)
) %>%
#adds in the fitted regression that was made above
#have to add in attn$Attendance, because plotly cannot read it otherwise
add_lines(x = attn$Attendance,
y = fitted(fit),
line = list(color = "darkorange", opacity = 1, width = 4),
) %>%
layout(
title = "Attendance vs Total Match Error",
xaxis = list(title = "Attendance"),
yaxis = list(title = "Total Match Error"),
font = list(size = 12),
showlegend = FALSE
)
## A marker object has been specified, but markers is not in the mode
## Adding markers to the mode...
#creates barbell plot of home v. away prediction error by team
ggplot(team_error, aes(y = reorder(Team, overall_mean))) +
#draws center gray segment
geom_segment(aes(x = Away, xend = Home, yend = Team),
color = "gray70", linewidth = 1.2) +
#draws home and away points
geom_point(aes(x = Home, color = "Home"), size = 3) +
geom_point(aes(x = Away, color = "Away"), size = 3) +
geom_vline(xintercept = 0, linetype = "dashed", color = "gray50") +
scale_color_manual(name = "Game Type",values = c("Home" = "darkblue",
"Away" = "lightblue")) +
theme_minimal(base_size = 12) +
labs(
title = "Home vs Away xG Prediction Error by Team",
subtitle = "Error = Goals − xG",
x = "Average Prediction Error (Goals − xG)",
y = "Team"
) +
theme(
plot.title = element_text(face = "bold"),
axis.text.y = element_text(size = 8)
)
#https://r-graph-gallery.com/web-extended-dumbbell-plot-ggplot2.html
#change the colors here to be
highlight_teams <- c("Arsenal", "Manchester City", "Tottenham",
"Leeds United", "Crystal Palace", "Leicester City")
team_error_filtered <- team_error %>%
filter(Team %in% highlight_teams)
ggplot(team_error_filtered, aes(y = reorder(Team, overall_mean))) +
geom_segment(aes(x = Away, xend = Home, yend = Team),
color = "gray70", linewidth = 1.2) +
geom_point(aes(x = Home, color = "Home"), size = 3) +
geom_point(aes(x = Away, color = "Away"), size = 3) +
geom_vline(xintercept = 0, linetype = "dashed", color = "gray50") +
scale_color_manual(values = c("Home" = "darkblue", "Away" = "lightblue")) +
theme_minimal(base_size = 12) +
labs(
title = "Home vs Away xG Prediction Error (Highlighted Teams Only)",
subtitle = "Error = Goals − xG",
x = "Average Prediction Error (Goals − xG)",
y = "Team"
) +
theme(
plot.title = element_text(face = "bold"),
axis.text.y = element_text(size = 8)
)
#sep out the teams by highest and lowest error (hard v. easy to predict)
top_teams <- c("Arsenal", "Manchester City", "Tottenham")
bottom_teams <- c("Leeds United", "Crystal Palace", "Leicester City")
team_order <- c(top_teams, bottom_teams)
#makes a DF of just the 6 teams above of all their home games
#to build the home side of the plot
home_df <- matches %>%
filter(Home.Team %in% team_order) %>%
mutate(
Team = factor(Home.Team, levels = team_order),
ErrorValue = HError,
#adds a column either H or E to predict, for coloring in the actual plot
Group = case_when(
Home.Team %in% top_teams ~ "Hardest to Predict",
Home.Team %in% bottom_teams ~ "Easiest to Predict"
)
)
#draws the home violin plot
p_home <- ggplot(home_df, aes(x = Team, y = ErrorValue, fill = Group)) +
geom_violin(color = "black") +
geom_boxplot(width = 0.15, outlier.shape = 1) +
labs(
title = "Home v. Away xG Prediction Errors",
x = "Team",
y = "Home Error (Goals − xG)"
) +
scale_fill_manual(values = c(
"Hardest to Predict" = "#3964DB",
"Easiest to Predict" = "darkorange"
)) +
theme_minimal(base_size = 12) +
theme(axis.text.x = element_text(angle = 45, hjust = 1),
legend.position = "none")
#makes a DF of just the 6 teams above of all their away games
#to build the home side of the plot
away_df <- matches %>%
filter(Away.Team %in% team_order) %>%
mutate(
Team = factor(Away.Team, levels = team_order),
ErrorValue = AError,
#adds a column either H or E to predict, for coloring in the actual plot
Group = case_when(
Away.Team %in% top_teams ~ "Hardest to Predict",
Away.Team %in% bottom_teams ~ "Easiest to Predict"
)
)
#draws the away violin plot
p_away <- ggplot(away_df, aes(x = Team, y = ErrorValue, fill = Group)) +
geom_violin(color = "black") +
geom_boxplot(width = 0.15, outlier.shape = 1) +
labs(
x = "Team",
y = "Away Error (Goals − xG)"
) +
scale_fill_manual(values = c(
"Hardest to Predict" = "#3964DB",
"Easiest to Predict" = "darkorange"
)) +
theme_minimal(base_size = 12) +
theme(axis.text.x = element_text(angle = 45, hjust = 1))
#combines both using patchwork
p_home + p_away
Their medians are quite similar, what might be causing them to be more volitile
#creates a small multiples graph of the goals v. prediction error for each team
#for HOME games
ggplot(home_df, aes(x = homeScore, y = HError, color = Group)) +
geom_point(size = 2, alpha = 0.8) +
geom_smooth(method = "lm", se = FALSE, linewidth = 1) +
scale_color_manual(
values = c(
"Hardest to Predict" = "darkblue",
"Easiest to Predict" = "darkorange"
)
) +
#facet wrap makes it draw based on the home team, separating it
facet_wrap(~ Home.Team, ncol = 3) +
labs(
title = "Relationship Between Goals Scored and Prediction Error (Home)",
x = "Home Goals Scored",
y = "Home Error",
color = "Team Group"
) +
theme_minimal(base_size = 12) +
theme(
panel.background = element_rect(fill = "white", color = NA),
)
## `geom_smooth()` using formula = 'y ~ x'
```
Now that we know that number of goals are the impactful piece, lets understand that more via interactive shiny apps
Which teams are unpredictable
What makes matches unpredictable