Introduction

This tutorial uses historical Premier League data to:


Data Loading & Preparation

Install Packages

install.packages("worldfootballR")
install.packages("tidyverse")
install.packages("broom")  # For model summaries

Load Libraries

library(worldfootballR)  # Premier League data
library(tidyverse)       # Data manipulation
library(broom)          # Model tidying

Import Data

# Get player stats for past 3 seasons
player_history <- fb_big5_advanced_season_stats(
  season_end_year = c(2022, 2023, 2024),
  stat_type = "standard",
  team_or_player = "player"
) %>%
  filter(Comp == "Premier League")

Part 1: Scoring Analysis

Top Scorers by Season

top_scorers_history <- player_history %>%
  select(Season_End_Year, Player, Squad, Gls, Min_Playing) %>%
  group_by(Season_End_Year) %>%
  arrange(desc(Gls)) %>%
  slice_head(n = 10) %>%
  ungroup()

# Display table
knitr::kable(head(top_scorers_history, 15), 
             caption = "Top Scorers by Season",
             align = 'c')
Top Scorers by Season
Season_End_Year Player Squad Gls Min_Playing
2022 Mohamed Salah Liverpool 23 2762
2022 Son Heung-min Tottenham 23 3006
2022 Cristiano Ronaldo Manchester Utd 18 2456
2022 Harry Kane Tottenham 17 3232
2022 Sadio Mané Liverpool 16 2819
2022 Jamie Vardy Leicester City 15 1806
2022 Diogo Jota Liverpool 15 2364
2022 Kevin De Bruyne Manchester City 15 2201
2022 Wilfried Zaha Crystal Palace 14 2760
2022 Raheem Sterling Manchester City 13 2128
2023 Erling Haaland Manchester City 36 2769
2023 Harry Kane Tottenham 30 3405
2023 Ivan Toney Brentford 20 2951
2023 Mohamed Salah Liverpool 19 3290
2023 Callum Wilson Newcastle Utd 18 1877

Graph 1: Top Scorers by Season

ggplot(top_scorers_history, aes(x = reorder(Player, Gls), y = Gls, fill = factor(Season_End_Year))) +
  geom_col() +
  coord_flip() +
  facet_wrap(~Season_End_Year, scales = "free_y") +
  labs(title = "Top 10 Goal Scorers by Season",
       subtitle = "Premier League 2022-2024",
       x = "Player",
       y = "Goals",
       fill = "Season") +
  theme_minimal() +
  theme(
    plot.title = element_text(face = "bold", size = 16),
    strip.background = element_rect(fill = "#3498db"),
    strip.text = element_text(color = "white", face = "bold"),
    legend.position = "none"
  )

Scoring Efficiency Calculation

scoring_trends <- player_history %>%
  filter(Min_Playing >= 1000) %>%
  mutate(Goals_Per_90 = (Gls / Min_Playing) * 90) %>%
  group_by(Player) %>%
  filter(n() >= 2) %>%
  summarise(
    Avg_Goals_Per_90 = mean(Goals_Per_90),
    Total_Goals = sum(Gls),
    Seasons_Played = n(),
    Latest_Team = last(Squad)
  ) %>%
  arrange(desc(Avg_Goals_Per_90)) %>%
  head(15)

knitr::kable(scoring_trends, 
             digits = 2,
             caption = "Top 15 Players by Goals Per 90 Minutes")
Top 15 Players by Goals Per 90 Minutes
Player Avg_Goals_Per_90 Total_Goals Seasons_Played Latest_Team
Erling Haaland 1.06 63 2 Manchester City
Alexander Isak 0.71 31 2 Newcastle Utd
Callum Wilson 0.69 26 2 Newcastle Utd
Diogo Jota 0.64 32 3 Liverpool
Mohamed Salah 0.64 60 3 Liverpool
Harry Kane 0.63 47 2 Tottenham
Taiwo Awoniyi 0.58 16 2 Nott’ham Forest
Jean-Philippe Mateta 0.51 21 2 Crystal Palace
Son Heung-min 0.51 50 3 Tottenham
Phil Foden 0.51 39 3 Manchester City
Darwin Núñez 0.48 20 2 Liverpool
Leandro Trossard 0.46 27 3 Arsenal
Julián Álvarez 0.46 20 2 Manchester City
Riyad Mahrez 0.45 16 2 Manchester City
Jamie Vardy 0.45 18 2 Leicester City

Part 2: Linear Regression Modeling

Model 1: Predicting Goals from Minutes Played

# Prepare data for modeling
modeling_data <- player_history %>%
  filter(Min_Playing >= 500) %>%
  mutate(Minutes_1000 = Min_Playing / 1000)  # Scale for interpretability

# Build linear model
goals_model <- lm(Gls ~ Minutes_1000 + Season_End_Year, data = modeling_data)

# Display model summary
summary(goals_model)
## 
## Call:
## lm(formula = Gls ~ Minutes_1000 + Season_End_Year, data = modeling_data)
## 
## Residuals:
##    Min     1Q Median     3Q    Max 
## -5.238 -2.088 -0.930  0.886 31.927 
## 
## Coefficients:
##                  Estimate Std. Error t value Pr(>|t|)    
## (Intercept)     -393.7895   266.9126  -1.475     0.14    
## Minutes_1000       1.4900     0.1305  11.413   <2e-16 ***
## Season_End_Year    0.1946     0.1319   1.475     0.14    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 3.715 on 1197 degrees of freedom
## Multiple R-squared:  0.09943,    Adjusted R-squared:  0.09793 
## F-statistic: 66.08 on 2 and 1197 DF,  p-value: < 2.2e-16
# Tidy model output
tidy_model <- tidy(goals_model)
knitr::kable(tidy_model, 
             digits = 3,
             caption = "Linear Regression: Goals ~ Minutes + Season")
Linear Regression: Goals ~ Minutes + Season
term estimate std.error statistic p.value
(Intercept) -393.790 266.913 -1.475 0.14
Minutes_1000 1.490 0.131 11.413 0.00
Season_End_Year 0.195 0.132 1.475 0.14

Graph 2: Model Fit Visualization

# Add predictions to data
modeling_data$predictions <- predict(goals_model, modeling_data)

ggplot(modeling_data, aes(x = Minutes_1000, y = Gls)) +
  geom_point(alpha = 0.3, color = "#3498db") +
  geom_smooth(method = "lm", color = "#e74c3c", size = 1.5, se = TRUE) +
  labs(title = "Linear Model: Goals vs Minutes Played",
       subtitle = "With 95% confidence interval",
       x = "Minutes Played (in 1000s)",
       y = "Goals Scored") +
  theme_minimal() +
  theme(plot.title = element_text(face = "bold", size = 14))

Graph 3: Residual Plot

# Calculate residuals
modeling_data$residuals <- residuals(goals_model)

ggplot(modeling_data, aes(x = predictions, y = residuals)) +
  geom_point(alpha = 0.4, color = "#3498db") +
  geom_hline(yintercept = 0, linetype = "dashed", color = "#e74c3c", size = 1) +
  labs(title = "Residual Plot: Model Diagnostics",
       x = "Predicted Goals",
       y = "Residuals") +
  theme_minimal()

Model 2: Goals Per 90 Prediction for 2025

# Create prediction based on trends with confidence intervals
goal_scorer_prediction <- scoring_trends %>%
  mutate(
    Predicted_2025_Goals = round(Avg_Goals_Per_90 * 38),  # Full season
    Lower_CI = round((Avg_Goals_Per_90 - 0.1) * 38),
    Upper_CI = round((Avg_Goals_Per_90 + 0.1) * 38),
    Confidence = case_when(
      Seasons_Played >= 3 ~ "High",
      Seasons_Played == 2 ~ "Medium",
      TRUE ~ "Low"
    )
  ) %>%
  arrange(desc(Predicted_2025_Goals)) %>%
  head(10)

knitr::kable(goal_scorer_prediction %>% 
               select(Player, Latest_Team, Predicted_2025_Goals, Lower_CI, Upper_CI, Confidence),
             caption = "2025 Goal Predictions with 95% Confidence Intervals")
2025 Goal Predictions with 95% Confidence Intervals
Player Latest_Team Predicted_2025_Goals Lower_CI Upper_CI Confidence
Erling Haaland Manchester City 40 37 44 Medium
Alexander Isak Newcastle Utd 27 23 31 Medium
Callum Wilson Newcastle Utd 26 22 30 Medium
Diogo Jota Liverpool 24 20 28 High
Mohamed Salah Liverpool 24 20 28 High
Harry Kane Tottenham 24 20 28 Medium
Taiwo Awoniyi Nott’ham Forest 22 18 26 Medium
Jean-Philippe Mateta Crystal Palace 19 16 23 Medium
Son Heung-min Tottenham 19 15 23 High
Phil Foden Manchester City 19 15 23 High

Graph 4: Predicted Top Scorers with Error Bars

ggplot(goal_scorer_prediction, aes(x = reorder(Player, Predicted_2025_Goals), 
                                     y = Predicted_2025_Goals,
                                     fill = Confidence)) +
  geom_col() +
  geom_errorbar(aes(ymin = Lower_CI, ymax = Upper_CI), width = 0.3, size = 0.8) +
  coord_flip() +
  scale_fill_manual(values = c("High" = "#27ae60", "Medium" = "#f39c12", "Low" = "#e74c3c")) +
  labs(title = "Predicted Top 10 Goal Scorers - Premier League 2025",
       subtitle = "Error bars show 95% confidence intervals",
       x = "Player",
       y = "Predicted Goals",
       fill = "Confidence") +
  theme_minimal() +
  theme(plot.title = element_text(face = "bold", size = 14))


Part 3: Multi-Player Comparison

Player Selection & Data Prep

four_players <- player_history %>%
  filter(Player %in% c("Mohamed Salah", "Erling Haaland", "Ivan Toney", "Virgil van Dijk")) %>%
  select(Season_End_Year, Player, Squad, Gls, Ast, Min_Playing) %>%
  arrange(Player, Season_End_Year)

# Summary table
knitr::kable(four_players, 
             caption = "4 Player Dataset: 2022-2024")
4 Player Dataset: 2022-2024
Season_End_Year Player Squad Gls Ast Min_Playing
2023 Erling Haaland Manchester City 36 8 2769
2024 Erling Haaland Manchester City 27 5 2552
2022 Ivan Toney Brentford 12 5 2909
2023 Ivan Toney Brentford 20 4 2951
2024 Ivan Toney Brentford 4 2 1450
2022 Mohamed Salah Liverpool 23 13 2762
2023 Mohamed Salah Liverpool 19 12 3290
2024 Mohamed Salah Liverpool 18 10 2534
2022 Virgil van Dijk Liverpool 3 1 3060
2023 Virgil van Dijk Liverpool 3 1 2835
2024 Virgil van Dijk Liverpool 2 2 3177

Graph 5: Goals Time Series

ggplot(four_players, aes(x = Season_End_Year, y = Gls, color = Player, group = Player)) +
  geom_line(size = 2) +
  geom_point(size = 4) +
  scale_x_continuous(breaks = c(2022, 2023, 2024)) +
  scale_color_manual(values = c("Mohamed Salah" = "#C8102E", 
                                "Erling Haaland" = "#6CABDD",
                                "Ivan Toney" = "#F7A800",
                                "Virgil van Dijk" = "#8B0000")) +
  labs(title = "Goals Scored: 4 Player Comparison (2022-2024)",
       subtitle = "Time series showing scoring trends",
       x = "Season",
       y = "Goals",
       color = "Player") +
  theme_minimal() +
  theme(
    plot.title = element_text(face = "bold", size = 14),
    legend.position = "bottom"
  )

Goals Per 90 Analysis

four_players_per90 <- four_players %>%
  mutate(Goals_Per_90 = (Gls / Min_Playing) * 90)

knitr::kable(four_players_per90 %>% 
               select(Season_End_Year, Player, Gls, Goals_Per_90), 
             digits = 2,
             caption = "Goals Per 90 Minutes by Season")
Goals Per 90 Minutes by Season
Season_End_Year Player Gls Goals_Per_90
2023 Erling Haaland 36 1.17
2024 Erling Haaland 27 0.95
2022 Ivan Toney 12 0.37
2023 Ivan Toney 20 0.61
2024 Ivan Toney 4 0.25
2022 Mohamed Salah 23 0.75
2023 Mohamed Salah 19 0.52
2024 Mohamed Salah 18 0.64
2022 Virgil van Dijk 3 0.09
2023 Virgil van Dijk 3 0.10
2024 Virgil van Dijk 2 0.06

Graph 6: Scoring Efficiency Lines

ggplot(four_players_per90, aes(x = Season_End_Year, y = Goals_Per_90, 
                                color = Player, group = Player)) +
  geom_line(size = 2) +
  geom_point(size = 4) +
  scale_x_continuous(breaks = c(2022, 2023, 2024)) +
  scale_color_manual(values = c("Mohamed Salah" = "#C8102E", 
                                "Erling Haaland" = "#6CABDD",
                                "Ivan Toney" = "#F7A800",
                                "Virgil van Dijk" = "#8B0000")) +
  labs(title = "Goals Per 90 Minutes: Efficiency Comparison",
       subtitle = "Standardized scoring rate",
       x = "Season",
       y = "Goals Per 90 Minutes",
       color = "Player") +
  theme_minimal() +
  theme(plot.title = element_text(face = "bold", size = 14))

Graph 7: Grouped Bar Chart

ggplot(four_players, aes(x = factor(Season_End_Year), y = Gls, fill = Player)) +
  geom_col(position = "dodge", width = 0.7) +
  scale_fill_manual(values = c("Mohamed Salah" = "#C8102E", 
                               "Erling Haaland" = "#6CABDD",
                               "Ivan Toney" = "#F7A800",
                               "Virgil van Dijk" = "#8B0000")) +
  labs(title = "Goals by Season: Side-by-Side Comparison",
       x = "Season",
       y = "Goals",
       fill = "Player") +
  theme_minimal() +
  theme(plot.title = element_text(face = "bold", size = 14))


Part 4: Correlation Analysis

Calculate Correlations

# Prepare correlation data
correlation_data <- four_players %>%
  select(Gls, Ast, Min_Playing) %>%
  cor()

knitr::kable(correlation_data, 
             digits = 3,
             caption = "Correlation Matrix: Goals, Assists, Minutes")
Correlation Matrix: Goals, Assists, Minutes
Gls Ast Min_Playing
Gls 1.000 0.667 0.075
Ast 0.667 1.000 0.170
Min_Playing 0.075 0.170 1.000

Graph 8: Correlation Heatmap

# Reshape for plotting
cor_long <- as.data.frame(correlation_data) %>%
  rownames_to_column("Var1") %>%
  pivot_longer(-Var1, names_to = "Var2", values_to = "Correlation")

ggplot(cor_long, aes(x = Var1, y = Var2, fill = Correlation)) +
  geom_tile(color = "white") +
  geom_text(aes(label = round(Correlation, 2)), color = "white", size = 6) +
  scale_fill_gradient2(low = "#3498db", mid = "white", high = "#e74c3c", 
                       midpoint = 0, limits = c(-1, 1)) +
  labs(title = "Correlation Heatmap",
       x = "", y = "") +
  theme_minimal() +
  theme(plot.title = element_text(face = "bold", size = 14))

Part 5: Career Summary

Career Averages

career_comparison <- four_players %>%
  group_by(Player) %>%
  summarise(
    Seasons = n(),
    Total_Goals = sum(Gls),
    Avg_Goals = round(mean(Gls), 1),
    Total_Assists = sum(Ast),
    Avg_Assists = round(mean(Ast), 1),
    Total_Minutes = sum(Min_Playing),
    Goals_Per_90 = round((Total_Goals / Total_Minutes) * 90, 2),
    Current_Team = last(Squad)
  ) %>%
  arrange(desc(Total_Goals))

knitr::kable(career_comparison,
             caption = "3-Year Career Summary (2022-2024)")
3-Year Career Summary (2022-2024)
Player Seasons Total_Goals Avg_Goals Total_Assists Avg_Assists Total_Minutes Goals_Per_90 Current_Team
Erling Haaland 2 63 31.5 13 6.5 5321 1.07 Manchester City
Mohamed Salah 3 60 20.0 35 11.7 8586 0.63 Liverpool
Ivan Toney 3 36 12.0 11 3.7 7310 0.44 Brentford
Virgil van Dijk 3 8 2.7 4 1.3 9072 0.08 Liverpool

Graph 9: Performance Heatmap

heatmap_data <- career_comparison %>%
  select(Player, Total_Goals, Avg_Goals, Total_Assists, Goals_Per_90) %>%
  pivot_longer(-Player, names_to = "Metric", values_to = "Value")

ggplot(heatmap_data, aes(x = Metric, y = Player, fill = Value)) +
  geom_tile(color = "white") +
  geom_text(aes(label = round(Value, 1)), color = "white", size = 5, fontface = "bold") +
  scale_fill_gradient(low = "#3498db", high = "#e74c3c") +
  labs(title = "Career Performance Heatmap (2022-2024)",
       x = "", y = "",
       fill = "Value") +
  theme_minimal() +
  theme(
    plot.title = element_text(face = "bold", size = 14),
    axis.text.x = element_text(angle = 45, hjust = 1)
  )

Graph 10: Position-Based Faceted View

four_players_roles <- four_players %>%
  mutate(
    Position = case_when(
      Player == "Virgil van Dijk" ~ "Defender",
      Player == "Mohamed Salah" ~ "Winger",
      Player %in% c("Erling Haaland", "Ivan Toney") ~ "Striker"
    )
  )

ggplot(four_players_roles, aes(x = Player, y = Gls, fill = Player)) +
  geom_col(show.legend = FALSE) +
  facet_wrap(~Position, scales = "free_x") +
  scale_fill_manual(values = c("Mohamed Salah" = "#C8102E", 
                               "Erling Haaland" = "#6CABDD",
                               "Ivan Toney" = "#F7A800",
                               "Virgil van Dijk" = "#8B0000")) +
  labs(title = "Goals by Position Type",
       subtitle = "Positional context matters",
       x = "", y = "Goals") +
  theme_minimal() +
  theme(
    plot.title = element_text(face = "bold", size = 14),
    strip.background = element_rect(fill = "#3498db"),
    strip.text = element_text(color = "white", face = "bold")
  )