# Core packages
library(worldfootballR)
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(ggplot2)
library(scales)
library(tidyr)
# Load Understat shot-level data for the EPL through worldfootballR.
# The preview below shows one row per shot; the table spans several seasons
# (2014 appears in the preview and 2024 is selected further down).
epl_shots <- load_understat_league_shots(league = "EPL")
## → Data last updated 2025-09-18 18:43:25.4242129325867 UTC
# Display first few rows to check the available columns
head(epl_shots)
## league id minute result X Y xG player h_a
## 1 EPL 14511 12 SavedShot 0.728 0.501 0.01874101 Wayne Rooney h
## 2 EPL 14512 16 BlockedShot 0.789 0.336 0.01559763 Wayne Rooney h
## 3 EPL 14513 25 SavedShot 0.914 0.188 0.06492316 Juan Mata h
## 4 EPL 14514 26 SavedShot 0.920 0.482 0.05778754 Wayne Rooney h
## 5 EPL 14516 33 MissedShots 0.922 0.590 0.04880100 Wayne Rooney h
## 6 EPL 14517 40 BlockedShot 0.774 0.466 0.03097105 Darren Fletcher h
## player_id situation season shotType match_id home_team
## 1 629 OpenPlay 2014 RightFoot 4749 Manchester United
## 2 629 OpenPlay 2014 RightFoot 4749 Manchester United
## 3 554 DirectFreekick 2014 LeftFoot 4749 Manchester United
## 4 629 OpenPlay 2014 Head 4749 Manchester United
## 5 629 FromCorner 2014 Head 4749 Manchester United
## 6 809 OpenPlay 2014 RightFoot 4749 Manchester United
## away_team home_goals away_goals date player_assisted
## 1 Swansea 1 2 2014-08-16 12:45:00 Darren Fletcher
## 2 Swansea 1 2 2014-08-16 12:45:00 Chicharito
## 3 Swansea 1 2 2014-08-16 12:45:00 <NA>
## 4 Swansea 1 2 2014-08-16 12:45:00 Ashley Young
## 5 Swansea 1 2 2014-08-16 12:45:00 Juan Mata
## 6 Swansea 1 2 2014-08-16 12:45:00 Juan Mata
## lastAction home_away
## 1 Pass <NA>
## 2 TakeOn <NA>
## 3 Standard <NA>
## 4 Aerial <NA>
## 5 Cross <NA>
## 6 Pass <NA>
# Choose the season to analyse: 2024/25 when Understat already has it,
# otherwise fall back to 2023/24 (seasons are keyed by their starting year)
if ("2024" %in% epl_shots$season) {
  epl_shots_filtered <- filter(epl_shots, season == "2024")
  cat("Using 2024/25 season data\n")
} else {
  epl_shots_filtered <- filter(epl_shots, season == "2023")
  cat("2024/25 not available yet, using 2023/24 season\n")
}
## Using 2024/25 season data
# Report how many shots survive the season filter
cat("Number of shots in selected season:", nrow(epl_shots_filtered), "\n")
## Number of shots in selected season: 9878
# Completed assists: goals whose shot record names an assisting player,
# counted per assister together with the summed xG of those goals
assists_data <- epl_shots_filtered %>%
  filter(result == "Goal" & !is.na(player_assisted)) %>%
  rename(player_name = player_assisted) %>%
  group_by(player_name) %>%
  summarise(
    assists = n(),
    xA_from_goals = sum(xG, na.rm = TRUE),
    .groups = "drop"
  )
# Key passes: every shot (scored or not) that credits an assisting player.
# The summed xG of those shots is the player's total expected assists.
key_passes_data <- epl_shots_filtered %>%
  drop_na(player_assisted) %>%
  rename(player_name = player_assisted) %>%
  group_by(player_name) %>%
  summarise(
    key_passes = n(),
    total_xA = sum(xG, na.rm = TRUE),
    .groups = "drop"
  )
# Combine assists and key passes into one row per player
playmakers <- full_join(assists_data, key_passes_data, by = "player_name") %>%
  # Players missing from one side of the join get zero counts;
  # xA prefers the key-pass total and falls back to the goals-only sum
  mutate(
    assists = replace_na(assists, 0),
    key_passes = replace_na(key_passes, 0),
    xA = coalesce(total_xA, xA_from_goals, 0)
  ) %>%
  # Keep only players with meaningful creative output
  filter(key_passes >= 15 | assists >= 5) %>%
  # Quality ratios (guarded against division by zero) and
  # the gap between actual and expected assists
  mutate(
    xA_per_key_pass = ifelse(key_passes > 0, xA / key_passes, 0),
    xA_per_assist = ifelse(assists > 0, xA / assists, 0),
    xA_difference = assists - xA
  ) %>%
  select(
    player_name, assists, xA, key_passes,
    xA_per_key_pass, xA_per_assist, xA_difference
  ) %>%
  arrange(desc(xA))
# Headline count and distribution summary for the filtered playmaker table
cat("Players meeting threshold (5+ assists OR 15+ key passes):", nrow(playmakers), "\n\n")
## Players meeting threshold (5+ assists OR 15+ key passes): 182
cat("Summary statistics:\n")
## Summary statistics:
playmakers %>%
  select(assists, xA, key_passes) %>%
  summary()
## assists xA key_passes
## Min. : 0.000 Min. : 0.7262 Min. :15.00
## 1st Qu.: 2.000 1st Qu.: 2.1458 1st Qu.:19.00
## Median : 3.000 Median : 3.3109 Median :26.50
## Mean : 3.577 Mean : 4.0023 Mean :31.98
## 3rd Qu.: 5.000 3rd Qu.: 5.2316 3rd Qu.:42.00
## Max. :18.000 Max. :15.3738 Max. :92.00
# Fifteen highest-xA creators, rounded to two decimals for the table
top_xA <- playmakers %>%
  arrange(desc(xA)) %>%
  slice_head(n = 15) %>%
  select(player = player_name, xA) %>%
  mutate(xA = round(xA, 2))
knitr::kable(top_xA, caption = "Top 15 Players by Expected Assists")
| player | xA |
|---|---|
| Mohamed Salah | 15.37 |
| Cole Palmer | 12.95 |
| Bukayo Saka | 11.58 |
| Bruno Fernandes | 10.23 |
| Ismaila Sarr | 10.06 |
| Bryan Mbeumo | 10.02 |
| Jacob Murphy | 9.34 |
| Mikkel Damsgaard | 9.25 |
| Declan Rice | 9.06 |
| Son Heung-Min | 9.04 |
| Kevin De Bruyne | 8.84 |
| Alex Iwobi | 8.46 |
| Morgan Rogers | 8.15 |
| Trent Alexander-Arnold | 8.06 |
| Youri Tielemans | 8.04 |
# Fifteen players with the most key passes, formatted for the table
top_key_passes <- playmakers %>%
  arrange(desc(key_passes)) %>%
  slice_head(n = 15) %>%
  select(Player = player_name, `Key Passes` = key_passes) %>%
  mutate(`Key Passes` = round(`Key Passes`, 2))
knitr::kable(top_key_passes, caption = "Top 15 Players by Key Passes")
| Player | Key Passes |
|---|---|
| Bruno Fernandes | 92 |
| Mohamed Salah | 87 |
| Cole Palmer | 86 |
| Enzo Fernández | 77 |
| Bryan Mbeumo | 70 |
| Dejan Kulusevski | 64 |
| Mikkel Damsgaard | 63 |
| Martin Odegaard | 63 |
| Leif Davis | 61 |
| Declan Rice | 59 |
| Bukayo Saka | 58 |
| Alex Iwobi | 58 |
| Youri Tielemans | 58 |
| Eberechi Eze | 58 |
| Kevin De Bruyne | 57 |
# Fifteen players with the most assists, formatted for the table
top_assists <- playmakers %>%
  arrange(desc(assists)) %>%
  slice_head(n = 15) %>%
  select(Player = player_name, Assists = assists) %>%
  mutate(Assists = round(Assists, 2))
knitr::kable(top_assists, caption = "Top 15 Players by Assists")
| Player | Assists |
|---|---|
| Mohamed Salah | 18 |
| Jacob Murphy | 11 |
| Anthony Elanga | 11 |
| Bukayo Saka | 10 |
| Bruno Fernandes | 10 |
| Morgan Rogers | 10 |
| Antonee Robinson | 10 |
| Mikkel Damsgaard | 9 |
| Son Heung-Min | 9 |
| Cole Palmer | 8 |
| Martin Odegaard | 8 |
| Jarrod Bowen | 8 |
| Eberechi Eze | 8 |
| Sávio | 8 |
| Morgan Gibbs-White | 8 |
# The 15 leading creators by xA; reused later as the data for plot labels
top_15_label <- playmakers %>%
  arrange(desc(xA)) %>%
  slice_head(n = 15)
# Tag every playmaker as inside or outside that top-15 group
playmakers <- mutate(
  playmakers,
  top_15 = ifelse(
    player_name %in% top_15_label$player_name,
    "Top 15 Creators",
    "Other Players"
  )
)
head(playmakers)
## # A tibble: 6 × 8
## player_name assists xA key_passes xA_per_key_pass xA_per_assist
## <chr> <int> <dbl> <int> <dbl> <dbl>
## 1 Mohamed Salah 18 15.4 87 0.177 0.854
## 2 Cole Palmer 8 13.0 86 0.151 1.62
## 3 Bukayo Saka 10 11.6 58 0.200 1.16
## 4 Bruno Fernandes 10 10.2 92 0.111 1.02
## 5 Ismaila Sarr 6 10.1 44 0.229 1.68
## 6 Bryan Mbeumo 7 10.0 70 0.143 1.43
## # ℹ 2 more variables: xA_difference <dbl>, top_15 <chr>
# Install ggplot2 only when it is genuinely missing. An unconditional
# install.packages() call re-installs the package on every run and, because
# ggplot2 is already attached at the top of this script, it merely produced
# the warning "package 'ggplot2' is in use and will not be installed".
if (!requireNamespace("ggplot2", quietly = TRUE)) {
  install.packages("ggplot2")
}
We will first look at a scatter plot comparing the quality of chances created (xA) with the actual number of assists. The diagonal line represents equal performance of assists and xA: players above the line have overperformed their xA, and players below the line are underperforming it (generally down to missed chances by teammates).
# First pass: xA against assists for every playmaker. Point size encodes key
# passes, the 15 leading creators are labelled, and the dashed line marks
# assists == xA.
ggplot(data = playmakers, mapping = aes(xA, assists)) +
  geom_point(
    mapping = aes(size = key_passes),
    color = "steelblue",
    alpha = 0.6
  ) +
  geom_abline(
    intercept = 0,
    slope = 1,
    color = "red",
    linetype = "dashed",
    linewidth = 1
  ) +
  geom_text(
    mapping = aes(label = player_name),
    data = top_15_label,
    hjust = -0.1,
    vjust = 0.5,
    size = 3
  ) +
  labs(
    title = "Premier League Creative Playmakers:",
    subtitle = "Quality of chances created (xA) vs Actual output (Assists)",
    caption = "Data: Understat via worldfootballR\nPlayers above red line outperform their xA",
    x = "Expected Assists (xA)",
    y = "Actual Assists",
    size = "Key Passes"
  ) +
  theme_minimal() +
  theme(
    plot.subtitle = element_text(size = 11),
    plot.title = element_text(size = 14, face = "bold")
  )
# When the scatter plot was completed, it was extremely congested, with
labels overlapping each other and making the data difficult to read. To
solve this, we use the ggrepel package.
library(ggrepel)
library(ggplot2)
# Decluttered version of the xA vs assists scatter: the rest of the field is
# drawn faintly, the top 15 creators are highlighted and labelled with
# non-overlapping (repelled) text.
ggplot(data = playmakers, mapping = aes(xA, assists)) +
  # Background players
  geom_point(
    data = filter(playmakers, top_15 == "Other Players"),
    mapping = aes(size = key_passes),
    alpha = 0.25,
    color = "grey70"
  ) +
  # Top 15 highlighted
  geom_point(
    data = filter(playmakers, top_15 == "Top 15 Creators"),
    mapping = aes(size = key_passes),
    alpha = 0.8,
    color = "firebrick"
  ) +
  # Repelled labels for the top 15 only
  geom_text_repel(
    data = filter(playmakers, top_15 == "Top 15 Creators"),
    mapping = aes(label = player_name),
    color = "black",
    size = 3,
    max.overlaps = 20,
    box.padding = 0.5,
    point.padding = 0.3
  ) +
  # Reference line where assists equal xA
  geom_abline(
    intercept = 0,
    slope = 1,
    color = "darkred",
    linetype = "dashed",
    alpha = 0.5
  ) +
  labs(
    title = "Premier League Creative Playmakers:",
    subtitle = "Top 15 creators highlighted by Expected Assists (xA)",
    caption = "Data: Understat via worldfootballR",
    x = "Expected Assists (xA)",
    y = "Assists",
    size = "Key Passes"
  ) +
  theme_minimal() +
  theme(
    legend.position = "right",
    plot.subtitle = element_text(size = 11),
    plot.title = element_text(size = 14, face = "bold")
  )
# As you can see, the graph is much cleaner and easier to read, with the
league's top creative players clearly visualised. It is also worth
noting that in Salah's historic season, in which he scored 28 goals and
achieved a phenomenal 18 assists, he had his teammates to thank for
finishing efficiently: his assists outperformed his xA by over 2.5.
library(ggrepel)
# Labels for this plot: the 15 leading creators by xA
top_15_label <- playmakers %>%
  arrange(desc(xA)) %>%
  head(15)
# Build a readable season label (e.g. "2024/25") from the filtered data.
# Seasons are keyed by their starting year, as the season filter's own
# messages show ("2024" -> 2024/25). The subtitle previously rendered only
# the bare placeholder word "Season".
season_start <- as.integer(as.character(unique(epl_shots_filtered$season)))
season_label <- paste(
  sprintf("%d/%02d", season_start, (season_start + 1L) %% 100L),
  collapse = ", "
)
# Volume (key passes) against quality (xA per key pass); colour and size both
# encode assists so the most productive creators stand out.
ggplot(playmakers, aes(x = key_passes, y = xA_per_key_pass)) +
  geom_point(aes(size = assists, color = assists), alpha = 0.6) +
  scale_color_gradient(low = "lightblue", high = "darkred") +
  geom_text_repel(
    data = top_15_label,
    aes(label = player_name),
    size = 3,
    box.padding = 0.5,
    point.padding = 0.5,
    max.overlaps = 20,
    segment.color = "grey50"
  ) +
  labs(
    title = "Creative Volume vs Quality: PL Playmakers",
    subtitle = paste("Season", season_label),
    x = "Key Passes (Volume)",
    y = "xA per Key Pass (Quality)",
    size = "Assists",
    color = "Assists",
    # Package name spelled as in the other plot captions
    caption = "Data: Understat via worldfootballR\nTop-Right = High volume + High Quality"
  ) +
  theme_minimal() +
  theme(plot.title = element_text(size = 14, face = "bold"))
# From the above scatter plot, we can deduce a few key takeaways:
1. Players towards the top right of the graph (Salah, Palmer) produce high quality chances in abundance.
2. Players towards the top left of the chart (Saka, Sarr, Murphy) produce excellent chances but in lower quantity. They are selective and efficient playmakers.
3. Players towards the bottom right of the chart (Fernandes, Mbeumo) produce a high quantity of chances, but each chance has a lower likelihood of being scored. They are volume creators.
# Conclusion
# Throughout this assignment, I have found a few key takeaways.
1. Some of my findings are not groundbreaking. Salah, Palmer, Bruno and Saka are top creators. Water is wet. But it was interesting to explore the output of some underrated playmakers such as Sarr, Murphy and Damsgaard.
2. This analysis could also be beneficial for teams analysing market needs. For Cole Palmer and Chelsea, it is clear that he is an amazing playmaker and that sloppy finishing has let him and his team down. It also highlights a market opportunity for someone to take a chance on Ismaila Sarr's playmaking quality. With over 10 xA, he could thrive and add value to a team with better finishers.
3. The difference between quantity and quality playmakers is evident and reflects teams' playing styles. Controlling, possession-based teams prioritize waiting for the right moment to produce a high-quality chance, whereas transition-based teams focus on producing a high volume of chances to score goals.
4. There were also some limitations. Having information on minutes played would be useful to compare playmakers more accurately. Also, the analysis doesn't account for game context. An assist (or expected assist) when a team is already winning 4-0 is nowhere near as valuable as producing the assist for the winner in a 1-0 game.
5. Salah is an all-time great.
6. As my first experience with an R project, I am excited to explore further what this tool can offer, especially packages like worldfootballR and StatsBombR.