Data Loading & Preparation
Install Packages
install.packages("worldfootballR")
install.packages("tidyverse")
install.packages("broom") # For model summaries
Load Libraries
library(worldfootballR) # Premier League data
library(tidyverse) # Data manipulation
library(broom) # Model tidying
Import Data
# Get player stats for past 3 seasons
player_history <- fb_big5_advanced_season_stats(
season_end_year = c(2022, 2023, 2024),
stat_type = "standard",
team_or_player = "player"
) %>%
filter(Comp == "Premier League")
Part 1: Scoring Analysis
Top Scorers by Season
top_scorers_history <- player_history %>%
select(Season_End_Year, Player, Squad, Gls, Min_Playing) %>%
group_by(Season_End_Year) %>%
arrange(desc(Gls)) %>%
slice_head(n = 10) %>%
ungroup()
# Display table
knitr::kable(head(top_scorers_history, 15),
caption = "Top Scorers by Season",
align = 'c')
Top Scorers by Season
| 2022 |
Mohamed Salah |
Liverpool |
23 |
2762 |
| 2022 |
Son Heung-min |
Tottenham |
23 |
3006 |
| 2022 |
Cristiano Ronaldo |
Manchester Utd |
18 |
2456 |
| 2022 |
Harry Kane |
Tottenham |
17 |
3232 |
| 2022 |
Sadio Mané |
Liverpool |
16 |
2819 |
| 2022 |
Jamie Vardy |
Leicester City |
15 |
1806 |
| 2022 |
Diogo Jota |
Liverpool |
15 |
2364 |
| 2022 |
Kevin De Bruyne |
Manchester City |
15 |
2201 |
| 2022 |
Wilfried Zaha |
Crystal Palace |
14 |
2760 |
| 2022 |
Raheem Sterling |
Manchester City |
13 |
2128 |
| 2023 |
Erling Haaland |
Manchester City |
36 |
2769 |
| 2023 |
Harry Kane |
Tottenham |
30 |
3405 |
| 2023 |
Ivan Toney |
Brentford |
20 |
2951 |
| 2023 |
Mohamed Salah |
Liverpool |
19 |
3290 |
| 2023 |
Callum Wilson |
Newcastle Utd |
18 |
1877 |
Graph 1: Top Scorers by Season
ggplot(top_scorers_history, aes(x = reorder(Player, Gls), y = Gls, fill = factor(Season_End_Year))) +
geom_col() +
coord_flip() +
facet_wrap(~Season_End_Year, scales = "free_y") +
labs(title = "Top 10 Goal Scorers by Season",
subtitle = "Premier League 2022-2024",
x = "Player",
y = "Goals",
fill = "Season") +
theme_minimal() +
theme(
plot.title = element_text(face = "bold", size = 16),
strip.background = element_rect(fill = "#3498db"),
strip.text = element_text(color = "white", face = "bold"),
legend.position = "none"
)

Scoring Efficiency Calculation
scoring_trends <- player_history %>%
filter(Min_Playing >= 1000) %>%
mutate(Goals_Per_90 = (Gls / Min_Playing) * 90) %>%
group_by(Player) %>%
filter(n() >= 2) %>%
summarise(
Avg_Goals_Per_90 = mean(Goals_Per_90),
Total_Goals = sum(Gls),
Seasons_Played = n(),
Latest_Team = last(Squad)
) %>%
arrange(desc(Avg_Goals_Per_90)) %>%
head(15)
knitr::kable(scoring_trends,
digits = 2,
caption = "Top 15 Players by Goals Per 90 Minutes")
Top 15 Players by Goals Per 90 Minutes
| Erling Haaland |
1.06 |
63 |
2 |
Manchester City |
| Alexander Isak |
0.71 |
31 |
2 |
Newcastle Utd |
| Callum Wilson |
0.69 |
26 |
2 |
Newcastle Utd |
| Diogo Jota |
0.64 |
32 |
3 |
Liverpool |
| Mohamed Salah |
0.64 |
60 |
3 |
Liverpool |
| Harry Kane |
0.63 |
47 |
2 |
Tottenham |
| Taiwo Awoniyi |
0.58 |
16 |
2 |
Nott’ham Forest |
| Jean-Philippe Mateta |
0.51 |
21 |
2 |
Crystal Palace |
| Son Heung-min |
0.51 |
50 |
3 |
Tottenham |
| Phil Foden |
0.51 |
39 |
3 |
Manchester City |
| Darwin Núñez |
0.48 |
20 |
2 |
Liverpool |
| Leandro Trossard |
0.46 |
27 |
3 |
Arsenal |
| Julián Álvarez |
0.46 |
20 |
2 |
Manchester City |
| Riyad Mahrez |
0.45 |
16 |
2 |
Manchester City |
| Jamie Vardy |
0.45 |
18 |
2 |
Leicester City |
Part 2: Linear Regression Modeling
Model 1: Predicting Goals from Minutes Played
# Prepare data for modeling
modeling_data <- player_history %>%
filter(Min_Playing >= 500) %>%
mutate(Minutes_1000 = Min_Playing / 1000) # Scale for interpretability
# Build linear model
goals_model <- lm(Gls ~ Minutes_1000 + Season_End_Year, data = modeling_data)
# Display model summary
summary(goals_model)
##
## Call:
## lm(formula = Gls ~ Minutes_1000 + Season_End_Year, data = modeling_data)
##
## Residuals:
## Min 1Q Median 3Q Max
## -5.238 -2.088 -0.930 0.886 31.927
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -393.7895 266.9126 -1.475 0.14
## Minutes_1000 1.4900 0.1305 11.413 <2e-16 ***
## Season_End_Year 0.1946 0.1319 1.475 0.14
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 3.715 on 1197 degrees of freedom
## Multiple R-squared: 0.09943, Adjusted R-squared: 0.09793
## F-statistic: 66.08 on 2 and 1197 DF, p-value: < 2.2e-16
# Tidy model output
tidy_model <- tidy(goals_model)
knitr::kable(tidy_model,
digits = 3,
caption = "Linear Regression: Goals ~ Minutes + Season")
Linear Regression: Goals ~ Minutes + Season
| (Intercept) |
-393.790 |
266.913 |
-1.475 |
0.14 |
| Minutes_1000 |
1.490 |
0.131 |
11.413 |
0.00 |
| Season_End_Year |
0.195 |
0.132 |
1.475 |
0.14 |
Graph 2: Model Fit Visualization
# Add predictions to data
modeling_data$predictions <- predict(goals_model, modeling_data)
ggplot(modeling_data, aes(x = Minutes_1000, y = Gls)) +
geom_point(alpha = 0.3, color = "#3498db") +
geom_smooth(method = "lm", color = "#e74c3c", size = 1.5, se = TRUE) +
labs(title = "Linear Model: Goals vs Minutes Played",
subtitle = "With 95% confidence interval",
x = "Minutes Played (in 1000s)",
y = "Goals Scored") +
theme_minimal() +
theme(plot.title = element_text(face = "bold", size = 14))

Graph 3: Residual Plot
# Calculate residuals
modeling_data$residuals <- residuals(goals_model)
ggplot(modeling_data, aes(x = predictions, y = residuals)) +
geom_point(alpha = 0.4, color = "#3498db") +
geom_hline(yintercept = 0, linetype = "dashed", color = "#e74c3c", size = 1) +
labs(title = "Residual Plot: Model Diagnostics",
x = "Predicted Goals",
y = "Residuals") +
theme_minimal()

Model 2: Goals Per 90 Prediction for 2025
# Create prediction based on trends with confidence intervals
goal_scorer_prediction <- scoring_trends %>%
mutate(
Predicted_2025_Goals = round(Avg_Goals_Per_90 * 38), # Full season
Lower_CI = round((Avg_Goals_Per_90 - 0.1) * 38),
Upper_CI = round((Avg_Goals_Per_90 + 0.1) * 38),
Confidence = case_when(
Seasons_Played >= 3 ~ "High",
Seasons_Played == 2 ~ "Medium",
TRUE ~ "Low"
)
) %>%
arrange(desc(Predicted_2025_Goals)) %>%
head(10)
knitr::kable(goal_scorer_prediction %>%
select(Player, Latest_Team, Predicted_2025_Goals, Lower_CI, Upper_CI, Confidence),
caption = "2025 Goal Predictions with 95% Confidence Intervals")
2025 Goal Predictions with 95% Confidence Intervals
| Erling Haaland |
Manchester City |
40 |
37 |
44 |
Medium |
| Alexander Isak |
Newcastle Utd |
27 |
23 |
31 |
Medium |
| Callum Wilson |
Newcastle Utd |
26 |
22 |
30 |
Medium |
| Diogo Jota |
Liverpool |
24 |
20 |
28 |
High |
| Mohamed Salah |
Liverpool |
24 |
20 |
28 |
High |
| Harry Kane |
Tottenham |
24 |
20 |
28 |
Medium |
| Taiwo Awoniyi |
Nott’ham Forest |
22 |
18 |
26 |
Medium |
| Jean-Philippe Mateta |
Crystal Palace |
19 |
16 |
23 |
Medium |
| Son Heung-min |
Tottenham |
19 |
15 |
23 |
High |
| Phil Foden |
Manchester City |
19 |
15 |
23 |
High |
Graph 4: Predicted Top Scorers with Error Bars
ggplot(goal_scorer_prediction, aes(x = reorder(Player, Predicted_2025_Goals),
y = Predicted_2025_Goals,
fill = Confidence)) +
geom_col() +
geom_errorbar(aes(ymin = Lower_CI, ymax = Upper_CI), width = 0.3, size = 0.8) +
coord_flip() +
scale_fill_manual(values = c("High" = "#27ae60", "Medium" = "#f39c12", "Low" = "#e74c3c")) +
labs(title = "Predicted Top 10 Goal Scorers - Premier League 2025",
subtitle = "Error bars show 95% confidence intervals",
x = "Player",
y = "Predicted Goals",
fill = "Confidence") +
theme_minimal() +
theme(plot.title = element_text(face = "bold", size = 14))

Part 3: Multi-Player Comparison
Player Selection & Data Prep
four_players <- player_history %>%
filter(Player %in% c("Mohamed Salah", "Erling Haaland", "Ivan Toney", "Virgil van Dijk")) %>%
select(Season_End_Year, Player, Squad, Gls, Ast, Min_Playing) %>%
arrange(Player, Season_End_Year)
# Summary table
knitr::kable(four_players,
caption = "4 Player Dataset: 2022-2024")
4 Player Dataset: 2022-2024
| 2023 |
Erling Haaland |
Manchester City |
36 |
8 |
2769 |
| 2024 |
Erling Haaland |
Manchester City |
27 |
5 |
2552 |
| 2022 |
Ivan Toney |
Brentford |
12 |
5 |
2909 |
| 2023 |
Ivan Toney |
Brentford |
20 |
4 |
2951 |
| 2024 |
Ivan Toney |
Brentford |
4 |
2 |
1450 |
| 2022 |
Mohamed Salah |
Liverpool |
23 |
13 |
2762 |
| 2023 |
Mohamed Salah |
Liverpool |
19 |
12 |
3290 |
| 2024 |
Mohamed Salah |
Liverpool |
18 |
10 |
2534 |
| 2022 |
Virgil van Dijk |
Liverpool |
3 |
1 |
3060 |
| 2023 |
Virgil van Dijk |
Liverpool |
3 |
1 |
2835 |
| 2024 |
Virgil van Dijk |
Liverpool |
2 |
2 |
3177 |
Graph 5: Goals Time Series
ggplot(four_players, aes(x = Season_End_Year, y = Gls, color = Player, group = Player)) +
geom_line(size = 2) +
geom_point(size = 4) +
scale_x_continuous(breaks = c(2022, 2023, 2024)) +
scale_color_manual(values = c("Mohamed Salah" = "#C8102E",
"Erling Haaland" = "#6CABDD",
"Ivan Toney" = "#F7A800",
"Virgil van Dijk" = "#8B0000")) +
labs(title = "Goals Scored: 4 Player Comparison (2022-2024)",
subtitle = "Time series showing scoring trends",
x = "Season",
y = "Goals",
color = "Player") +
theme_minimal() +
theme(
plot.title = element_text(face = "bold", size = 14),
legend.position = "bottom"
)

Goals Per 90 Analysis
four_players_per90 <- four_players %>%
mutate(Goals_Per_90 = (Gls / Min_Playing) * 90)
knitr::kable(four_players_per90 %>%
select(Season_End_Year, Player, Gls, Goals_Per_90),
digits = 2,
caption = "Goals Per 90 Minutes by Season")
Goals Per 90 Minutes by Season
| 2023 |
Erling Haaland |
36 |
1.17 |
| 2024 |
Erling Haaland |
27 |
0.95 |
| 2022 |
Ivan Toney |
12 |
0.37 |
| 2023 |
Ivan Toney |
20 |
0.61 |
| 2024 |
Ivan Toney |
4 |
0.25 |
| 2022 |
Mohamed Salah |
23 |
0.75 |
| 2023 |
Mohamed Salah |
19 |
0.52 |
| 2024 |
Mohamed Salah |
18 |
0.64 |
| 2022 |
Virgil van Dijk |
3 |
0.09 |
| 2023 |
Virgil van Dijk |
3 |
0.10 |
| 2024 |
Virgil van Dijk |
2 |
0.06 |
Graph 6: Scoring Efficiency Lines
ggplot(four_players_per90, aes(x = Season_End_Year, y = Goals_Per_90,
color = Player, group = Player)) +
geom_line(size = 2) +
geom_point(size = 4) +
scale_x_continuous(breaks = c(2022, 2023, 2024)) +
scale_color_manual(values = c("Mohamed Salah" = "#C8102E",
"Erling Haaland" = "#6CABDD",
"Ivan Toney" = "#F7A800",
"Virgil van Dijk" = "#8B0000")) +
labs(title = "Goals Per 90 Minutes: Efficiency Comparison",
subtitle = "Standardized scoring rate",
x = "Season",
y = "Goals Per 90 Minutes",
color = "Player") +
theme_minimal() +
theme(plot.title = element_text(face = "bold", size = 14))

Graph 7: Grouped Bar Chart
ggplot(four_players, aes(x = factor(Season_End_Year), y = Gls, fill = Player)) +
geom_col(position = "dodge", width = 0.7) +
scale_fill_manual(values = c("Mohamed Salah" = "#C8102E",
"Erling Haaland" = "#6CABDD",
"Ivan Toney" = "#F7A800",
"Virgil van Dijk" = "#8B0000")) +
labs(title = "Goals by Season: Side-by-Side Comparison",
x = "Season",
y = "Goals",
fill = "Player") +
theme_minimal() +
theme(plot.title = element_text(face = "bold", size = 14))

Part 4: Correlation Analysis
Calculate Correlations
# Prepare correlation data
correlation_data <- four_players %>%
select(Gls, Ast, Min_Playing) %>%
cor()
knitr::kable(correlation_data,
digits = 3,
caption = "Correlation Matrix: Goals, Assists, Minutes")
Correlation Matrix: Goals, Assists, Minutes
| Gls |
1.000 |
0.667 |
0.075 |
| Ast |
0.667 |
1.000 |
0.170 |
| Min_Playing |
0.075 |
0.170 |
1.000 |
Graph 8: Correlation Heatmap
# Reshape for plotting
cor_long <- as.data.frame(correlation_data) %>%
rownames_to_column("Var1") %>%
pivot_longer(-Var1, names_to = "Var2", values_to = "Correlation")
ggplot(cor_long, aes(x = Var1, y = Var2, fill = Correlation)) +
geom_tile(color = "white") +
geom_text(aes(label = round(Correlation, 2)), color = "white", size = 6) +
scale_fill_gradient2(low = "#3498db", mid = "white", high = "#e74c3c",
midpoint = 0, limits = c(-1, 1)) +
labs(title = "Correlation Heatmap",
x = "", y = "") +
theme_minimal() +
theme(plot.title = element_text(face = "bold", size = 14))
Part 5: Career Summary
Career Averages
career_comparison <- four_players %>%
group_by(Player) %>%
summarise(
Seasons = n(),
Total_Goals = sum(Gls),
Avg_Goals = round(mean(Gls), 1),
Total_Assists = sum(Ast),
Avg_Assists = round(mean(Ast), 1),
Total_Minutes = sum(Min_Playing),
Goals_Per_90 = round((Total_Goals / Total_Minutes) * 90, 2),
Current_Team = last(Squad)
) %>%
arrange(desc(Total_Goals))
knitr::kable(career_comparison,
caption = "3-Year Career Summary (2022-2024)")
3-Year Career Summary (2022-2024)
| Erling Haaland |
2 |
63 |
31.5 |
13 |
6.5 |
5321 |
1.07 |
Manchester City |
| Mohamed Salah |
3 |
60 |
20.0 |
35 |
11.7 |
8586 |
0.63 |
Liverpool |
| Ivan Toney |
3 |
36 |
12.0 |
11 |
3.7 |
7310 |
0.44 |
Brentford |
| Virgil van Dijk |
3 |
8 |
2.7 |
4 |
1.3 |
9072 |
0.08 |
Liverpool |
Graph 10: Position-Based Faceted View
four_players_roles <- four_players %>%
mutate(
Position = case_when(
Player == "Virgil van Dijk" ~ "Defender",
Player == "Mohamed Salah" ~ "Winger",
Player %in% c("Erling Haaland", "Ivan Toney") ~ "Striker"
)
)
ggplot(four_players_roles, aes(x = Player, y = Gls, fill = Player)) +
geom_col(show.legend = FALSE) +
facet_wrap(~Position, scales = "free_x") +
scale_fill_manual(values = c("Mohamed Salah" = "#C8102E",
"Erling Haaland" = "#6CABDD",
"Ivan Toney" = "#F7A800",
"Virgil van Dijk" = "#8B0000")) +
labs(title = "Goals by Position Type",
subtitle = "Positional context matters",
x = "", y = "Goals") +
theme_minimal() +
theme(
plot.title = element_text(face = "bold", size = 14),
strip.background = element_rect(fill = "#3498db"),
strip.text = element_text(color = "white", face = "bold")
)
