GEOG 588 with Marcela Suárez, Penn State

Load necessary packages

library(tidyverse)
library(here)
library(dplyr)
library(janitor)
library(StatsBombR)
library(SBpitch)
library(ggplot2)
library(ggforce)
library(ggsoccer)
library(scales)

Load required data

The data was accessed from a paper titled “A public data set of spatio-temporal match events in soccer competitions” found here: https://www.nature.com/articles/s41597-019-0247-7

The data described in this paper have been collected and provided by Wyscout, a leading company in the soccer industry which connects soccer professionals worldwide, supports more than 50 soccer associations and more than 1,000 professional clubs around the world. The procedure of data collection is performed by expert video analysts (the operators), who are trained and focused on data collection for soccer, through a proprietary software (the tagger).

Note: I used python to transfer the JSON data into a csv format

wcevents <- read.csv(here("wcevents.csv"))
head(wcevents)
##   eventId subEventName                        tags playerId
## 1       8  Simple pass              [{'id': 1801}]   122671
## 2       8    High pass              [{'id': 1801}]   139393
## 3       1     Air duel [{'id': 703}, {'id': 1801}]   103668
## 4       1     Air duel [{'id': 701}, {'id': 1802}]   122940
## 5       8  Simple pass              [{'id': 1801}]   122847
## 6       8  Simple pass              [{'id': 1801}]   122832
##                                  positions matchId eventName teamId matchPeriod
## 1 [{'y': 50, 'x': 50}, {'y': 53, 'x': 35}] 2057954      Pass  16521          1H
## 2 [{'y': 53, 'x': 35}, {'y': 19, 'x': 75}] 2057954      Pass  16521          1H
## 3 [{'y': 81, 'x': 25}, {'y': 83, 'x': 37}] 2057954      Duel  14358          1H
## 4 [{'y': 19, 'x': 75}, {'y': 17, 'x': 63}] 2057954      Duel  16521          1H
## 5 [{'y': 17, 'x': 63}, {'y': 15, 'x': 71}] 2057954      Pass  16521          1H
## 6 [{'y': 15, 'x': 71}, {'y': 11, 'x': 92}] 2057954      Pass  16521          1H
##    eventSec subEventId        id
## 1  1.656214         85 258612104
## 2  4.487814         83 258612106
## 3  5.937411         10 258612077
## 4  6.406961         10 258612112
## 5  8.562167         85 258612110
## 6 10.991292         85 258612113

Ask questions about the data:

Was Luka Modric or Kevin De Bruyne a better passer in the 2018 World Cup?

I filtered the data to only include these two players in my analysis.

croatiadata <- read.csv(here("CroatiaTeamData.csv"))
belgiumdata <- read.csv(here("BelgiumTeamData.csv"))

#Filter the data to get passes from Kevin De Bruyne and Luka Modric across all games in the 2018 World Cup
wc_debruyne <- belgiumdata %>%
  filter(player_id == 38021 & event_name == "Pass")

wc_modric <- croatiadata %>%
  filter(player_id == 8287 & event_name == "Pass")

# Count the number of records to get total passes from the filtered data
debruyne_record_count <- nrow(wc_debruyne)
modric_record_count <- nrow(wc_modric)

While this is great, a table view of the data is not very helpful

modric_kdb_data <- rbind(wc_modric, wc_debruyne)
glimpse(modric_kdb_data)
## Rows: 772
## Columns: 15
## $ event_id       <int> 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8…
## $ sub_event_name <chr> "Simple pass", "Simple pass", "Simple pass", "Simple pa…
## $ tags           <chr> "[{'id': 1801}]", "[{'id': 1801}]", "[{'id': 1801}]", "…
## $ player_id      <int> 8287, 8287, 8287, 8287, 8287, 8287, 8287, 8287, 8287, 8…
## $ start_y        <int> 51, 38, 88, 67, 88, 94, 77, 73, 56, 8, 45, 21, 71, 59, …
## $ start_x        <int> 29, 64, 44, 36, 55, 52, 53, 28, 45, 79, 33, 56, 46, 46,…
## $ end_y          <int> 11, 39, 97, 57, 82, 84, 35, 96, 26, 36, 21, 11, 85, 8, …
## $ end_x          <int> 29, 51, 42, 35, 54, 58, 61, 53, 92, 95, 28, 76, 58, 76,…
## $ match_id       <int> 2057973, 2057973, 2057973, 2057973, 2057973, 2057973, 2…
## $ event_name     <chr> "Pass", "Pass", "Pass", "Pass", "Pass", "Pass", "Pass",…
## $ team_id        <int> 9598, 9598, 9598, 9598, 9598, 9598, 9598, 9598, 9598, 9…
## $ match_period   <chr> "1H", "1H", "1H", "1H", "1H", "1H", "1H", "1H", "1H", "…
## $ event_sec      <dbl> 49.49613, 60.22805, 83.27966, 187.61858, 223.42676, 232…
## $ sub_event_id   <int> 85, 85, 85, 85, 85, 85, 85, 85, 83, 80, 85, 85, 85, 83,…
## $ id             <int> 258792824, 258792841, 258792859, 258792919, 258792929, …

I believe it is better to visualise the data spatially

I used the following dimensions to plot the data and generate a field to visualize the passes.

Field Dimensions
Field Dimensions
# Define pitch dimensions
pitch_wyscout <- list(
  length = 100,
  width = 100,
  penalty_box_length = 16,
  penalty_box_width = 62,
  six_yard_box_length = 6,
  six_yard_box_width = 26,
  penalty_spot_distance = 10,
  goal_width = 12,
  origin_x = 0,
  origin_y = 0
)

create_wyscout_pitch <- function() {
  ggplot() +
    # Full pitch outline
    geom_rect(aes(xmin = pitch_wyscout$origin_x, xmax = pitch_wyscout$length,
                  ymin = pitch_wyscout$origin_y, ymax = pitch_wyscout$width), 
              color = "grey", fill = NA) +
    
    # Penalty box (left)
    geom_rect(aes(xmin = pitch_wyscout$origin_x, 
                  xmax = pitch_wyscout$penalty_box_length,
                  ymin = (pitch_wyscout$width - pitch_wyscout$penalty_box_width) / 2,
                  ymax = (pitch_wyscout$width + pitch_wyscout$penalty_box_width) / 2),
              color = "grey", fill = NA) +
    
    # Penalty box (right)
    geom_rect(aes(xmin = pitch_wyscout$length - pitch_wyscout$penalty_box_length,
                  xmax = pitch_wyscout$length,
                  ymin = (pitch_wyscout$width - pitch_wyscout$penalty_box_width) / 2,
                  ymax = (pitch_wyscout$width + pitch_wyscout$penalty_box_width) / 2),
              color = "grey", fill = NA) +
    
    # Six-yard box (left)
    geom_rect(aes(xmin = pitch_wyscout$origin_x, 
                  xmax = pitch_wyscout$six_yard_box_length,
                  ymin = (pitch_wyscout$width - pitch_wyscout$six_yard_box_width) / 2,
                  ymax = (pitch_wyscout$width + pitch_wyscout$six_yard_box_width) / 2),
              color = "grey", fill = NA) +
    
    # Six-yard box (right)
    geom_rect(aes(xmin = pitch_wyscout$length - pitch_wyscout$six_yard_box_length,
                  xmax = pitch_wyscout$length,
                  ymin = (pitch_wyscout$width - pitch_wyscout$six_yard_box_width) / 2,
                  ymax = (pitch_wyscout$width + pitch_wyscout$six_yard_box_width) / 2),
              color = "grey", fill = NA) +
    
    # Center circle
    geom_point(aes(x = pitch_wyscout$length / 2, y = pitch_wyscout$width / 2), 
               size = 0.5, color = "grey") +
    geom_circle(aes(x0 = pitch_wyscout$length / 2, y0 = pitch_wyscout$width / 2, r = 9.15),
                color = "grey") +
    
    # Goals
    geom_segment(aes(x = pitch_wyscout$origin_x, 
                     xend = pitch_wyscout$origin_x, 
                     y = (pitch_wyscout$width - pitch_wyscout$goal_width) / 2, 
                     yend = (pitch_wyscout$width + pitch_wyscout$goal_width) / 2),
                 color = "black", size = 3) +
    geom_segment(aes(x = pitch_wyscout$length, 
                     xend = pitch_wyscout$length, 
                     y = (pitch_wyscout$width - pitch_wyscout$goal_width) / 2, 
                     yend = (pitch_wyscout$width + pitch_wyscout$goal_width) / 2),
                 color = "black", size = 3) +
    
    # Center line
    geom_segment(aes(x = pitch_wyscout$length / 2, xend = pitch_wyscout$length / 2,
                     y = pitch_wyscout$origin_y, yend = pitch_wyscout$width),
                 color = "grey") +
    
    # Adjust theme
    theme(
      panel.grid = element_blank(),
      panel.background = element_rect(fill = "white", color = NA),
      plot.background = element_rect(fill = "white", color = NA),
      axis.text = element_blank(),
      axis.ticks = element_blank(),
      axis.title = element_blank()
      ) +
    coord_fixed() +
    labs(title = "Football Pitch")
}

So plotting the data, we get something like this:

# Plot Kevin De Bruyne's pass distribution
create_wyscout_pitch() +
  
  # Pass arrows (Red for De Bruyne)
  geom_segment(data = wc_debruyne, 
               aes(x = start_x, y = start_y, xend = end_x, yend = end_y), 
               lineend = "round", linewidth = 0.2,  
               arrow = arrow(length = unit(0.08, "inches")),
               color = "red", alpha = 1) +
    # Adjust transparency
  labs(
    title = "Kevin De Bruyne Pass Distribution (World Cup 2018)",
  ) +
  theme(
    legend.position = "right",
  ) +
  coord_fixed(ratio = 60 / 100)
## Coordinate system already present. Adding new coordinate system, which will
## replace the existing one.

ggsave("Lab3DeBruynePasses.png")
## Saving 7 x 5 in image

With this data, we can also filter data to only show “Dangerous Passes” i.e. Passes that enter the “Final Third” of the pitch, the area where most goals occur

To do this, we filter the data where the pass ends in the central location to the final third of the pitch

#Filter passes to end in final third
wc_debruyne_finalthird <- belgiumdata %>%
  filter(player_id == 38021 & event_name == "Pass" & end_x > 75 & end_y >20 & end_y <80)

# Plot Kevin De Bruyne's pass distribution
create_wyscout_pitch() +
  
  # Pass arrows (Red for De Bruyne)
  geom_segment(data = wc_debruyne_finalthird, 
               aes(x = start_x, y = start_y, xend = end_x, yend = end_y), 
               lineend = "round", linewidth = 0.2,  
               arrow = arrow(length = unit(0.08, "inches")),
               color = "red", alpha = 1) + # Adjust transparency
  
    # Attack direction arrow (adjust coordinates as needed)
  annotate("segment", x = 10, xend = 40, y = 105, yend = 105, 
           arrow = arrow(length = unit(0.3, "inches")), color = "black", size = 1.2) +
  annotate("text", x = 25, y = 108, label = "Attacking", size = 5, fontface = "bold") +
  
  labs(
    title = "Kevin De Bruyne Pass Distribution (World Cup 2018)",
  ) +
  theme(
    legend.position = "right",
  ) +
  coord_fixed(ratio = 60 / 100)
## Coordinate system already present. Adding new coordinate system, which will
## replace the existing one.

ggsave("Lab3DeBruyneFinalThirdPasses.png")
## Saving 7 x 5 in image

Let’s Compare this to Luka Modric

#Filter passes to end in final third
wc_modric_finalthird <- croatiadata %>%
  filter(player_id == 8287 & event_name == "Pass" & end_x > 75 & end_y >20 & end_y <80)

# Plot Luka Modric's pass distribution
create_wyscout_pitch() +
  
  # Pass arrows (Blue for Modric)
  geom_segment(data = wc_modric_finalthird, 
               aes(x = start_x, y = start_y, xend = end_x, yend = end_y), 
               lineend = "round", linewidth = 0.2,  
               arrow = arrow(length = unit(0.08, "inches")),
               color = "blue", alpha = 1) + # Adjust transparency
  
    # Attack direction arrow (adjust coordinates as needed)
  annotate("segment", x = 10, xend = 40, y = 105, yend = 105, 
           arrow = arrow(length = unit(0.3, "inches")), color = "black", size = 1.2) +
  annotate("text", x = 25, y = 108, label = "Attacking", size = 5, fontface = "bold") +
  
  labs(
    title = "Luka Modric Pass Distribution (World Cup 2018)",
  ) +
  theme(
    legend.position = "right",
  ) +
  coord_fixed(ratio = 60 / 100)
## Coordinate system already present. Adding new coordinate system, which will
## replace the existing one.

ggsave("Lab3ModricFinalThirdPasses.png")
## Saving 7 x 5 in image

To compare the two datasets, let’s visualize the density of passes in a heatmap format and overlay the passes into the final third

# Generate the subtitle with the record count
subtitle_text <- paste("Kevin DeBruyne had", debruyne_record_count, "total passes across the seven matches in the 2018 World Cup, with",wc_debruyne_finalthird, "being categorized as 'dangerous passes'.")

# Plot Kevin De Bruyne's pass distribution
create_wyscout_pitch() +
  
  # Pass arrows (Red for De Bruyne)
  geom_segment(data = wc_debruyne_finalthird, 
               aes(x = start_x, y = start_y, xend = end_x, yend = end_y), 
               lineend = "round", linewidth = 0.2,  
               arrow = arrow(length = unit(0.08, "inches")),
               color = "red", alpha = 1) +
  
  # Density heatmap using pass counts
  stat_density_2d_filled(
    data = wc_debruyne,
    aes(x = start_x, y = start_y, fill = as.numeric(after_stat(level))),  # Only use `fill = after_stat(level)`
    alpha = 0.7,
    contour = TRUE
  ) +
  
  # Adjust legend to show number of passes with white-to-blue scale
  scale_fill_gradient(
    low = "white", high = "red",  # White for low density, blue for high density
    name = "Number of Passes",
    labels = scales::comma_format()  # Format large numbers properly
  ) +
  
  # Attack direction arrow (adjust coordinates as needed)
  annotate("segment", x = 10, xend = 40, y = 105, yend = 105, 
           arrow = arrow(length = unit(0.3, "inches")), color = "black", size = 1.2) +
  annotate("text", x = 25, y = 108, label = "Attacking", size = 5, fontface = "bold") +
  
  # Adjust transparency
  labs(
    title = "Kevin De Bruyne Pass Distribution (World Cup 2018)",
    subtitle = subtitle_text
  ) +
  theme(
    legend.position = "right",
  ) +
  coord_fixed(ratio = 60 / 100)
## Coordinate system already present. Adding new coordinate system, which will
## replace the existing one.

ggsave("Lab3DeBruyneHeatMap.png")
## Saving 7 x 5 in image

We will now do the same for Luka Modric

# Generate the subtitle with the record count
subtitle_text <- paste("Luka Modric had", modric_record_count, "total passes across the seven matches in the 2018 World Cup, with",wc_modric_finalthird, "being categorized as 'dangerous passes'.")

# Plot Luka Modric's pass distribution
create_wyscout_pitch() +
  
  # Pass arrows (Blue for Modric)
  geom_segment(data = wc_modric_finalthird, 
               aes(x = start_x, y = start_y, xend = end_x, yend = end_y), 
               lineend = "round", linewidth = 0.2,  
               arrow = arrow(length = unit(0.08, "inches")),
               color = "blue", alpha = 1) +
  
  # Density heatmap using pass counts
  stat_density_2d_filled(
    data = wc_modric,
    aes(x = start_x, y = start_y, fill = as.numeric(after_stat(level))),  # Only use `fill = after_stat(level)`
    alpha = 0.7,
    contour = TRUE
  ) +
  
  # Adjust legend to show number of passes with white-to-blue scale
  scale_fill_gradient(
    low = "white", high = "blue",  # White for low density, blue for high density
    name = "Number of Passes",
    labels = scales::comma_format()  # Format large numbers properly
  ) +
  
  # Attack direction arrow (adjust coordinates as needed)
  annotate("segment", x = 10, xend = 40, y = 105, yend = 105, 
           arrow = arrow(length = unit(0.3, "inches")), color = "black", size = 1.2) +
  annotate("text", x = 25, y = 108, label = "Attacking", size = 5, fontface = "bold") +
  
  # Adjust transparency
  labs(
    title = "Luka Modric Pass Distribution (World Cup 2018)",
    subtitle = subtitle_text
  ) +
  theme(
    legend.position = "right",
  ) +
  coord_fixed(ratio = 60 / 100)
## Coordinate system already present. Adding new coordinate system, which will
## replace the existing one.

ggsave("Lab3ModricHeatMap.png")
## Saving 7 x 5 in image

We will now compare the two players passing density to see where each player passes from in relation to the goal

# Add a "player" column based on player_id
wc_debruyne_start <- wc_debruyne %>% mutate(player = ifelse(player_id == 38021, "De Bruyne", NA))
wc_modric_start <- wc_modric %>% mutate(player = ifelse(player_id == 8287, "Modric", NA))

# Combine both datasets into a single dataframe
wc_combined <- bind_rows(wc_debruyne_start, wc_modric_start)
# Generate the subtitle with the record count
subtitle_text <- paste("De Bruyne Total Passes:", debruyne_record_count, " // Modric Total Passes:", modric_record_count)

# Plot both players' passing distributions
create_wyscout_pitch() +
  stat_density_2d_filled(
    data = wc_combined,
    aes(x = start_x, y = start_y, fill = player, alpha = as.numeric(after_stat(level))),  # Reference `player` column
    contour = TRUE
  ) +
  scale_fill_manual(
    values = c("De Bruyne" = "red", "Modric" = "blue", "NA"="white"),  # Assign colors manually
    name = "Player",
    na.value = "white"
  ) +
  scale_alpha(range = c(0, 0.6), guide = "none") + 
  
  # Attack direction arrow (adjust coordinates as needed)
  annotate("segment", x = 10, xend = 40, y = 105, yend = 105, 
           arrow = arrow(length = unit(0.3, "inches")), color = "black", size = 1.2) +
  annotate("text", x = 25, y = 108, label = "Attacking", size = 5, fontface = "bold") +
  
  # Adjust transparency
  labs(
    title = "Kevin De Bruyne & Luka Modric Pass Distribution (World Cup 2018)",
    subtitle = subtitle_text
  ) +
  theme(
    legend.position = "right",
  ) +
  coord_fixed(ratio = 60 / 100)
## Coordinate system already present. Adding new coordinate system, which will
## replace the existing one.

ggsave("Lab3ComparisonHeatMap.png")
## Saving 7 x 5 in image

From this chart, we can see that while Kevin De Bruyne has more passes into the final third, Luka Modric’s starting position is much farther away in relation to the goal.