GEOG 588 with Marcela Suárez, Penn State

Load necessary packages

library(tidyverse)
library(here)
library(dplyr)
library(janitor)

Load required data

The data was accessed from a paper titled “A public data set of spatio-temporal match events in soccer competitions” found here: https://www.nature.com/articles/s41597-019-0247-7

The data described in this paper have been collected and provided by Wyscout, a leading company in the soccer industry which connects soccer professionals worldwide, supports more than 50 soccer associations and more than 1,000 professional clubs around the world. The procedure of data collection is performed by expert video analysts (the operators), who are trained and focused on data collection for soccer, through a proprietary software (the tagger).

Note: I used python to transfer the JSON data into a csv format

wcevents <- read.csv(here("wcevents.csv"))
head(wcevents)
##   eventId subEventName                        tags playerId
## 1       8  Simple pass              [{'id': 1801}]   122671
## 2       8    High pass              [{'id': 1801}]   139393
## 3       1     Air duel [{'id': 703}, {'id': 1801}]   103668
## 4       1     Air duel [{'id': 701}, {'id': 1802}]   122940
## 5       8  Simple pass              [{'id': 1801}]   122847
## 6       8  Simple pass              [{'id': 1801}]   122832
##                                  positions matchId eventName teamId matchPeriod
## 1 [{'y': 50, 'x': 50}, {'y': 53, 'x': 35}] 2057954      Pass  16521          1H
## 2 [{'y': 53, 'x': 35}, {'y': 19, 'x': 75}] 2057954      Pass  16521          1H
## 3 [{'y': 81, 'x': 25}, {'y': 83, 'x': 37}] 2057954      Duel  14358          1H
## 4 [{'y': 19, 'x': 75}, {'y': 17, 'x': 63}] 2057954      Duel  16521          1H
## 5 [{'y': 17, 'x': 63}, {'y': 15, 'x': 71}] 2057954      Pass  16521          1H
## 6 [{'y': 15, 'x': 71}, {'y': 11, 'x': 92}] 2057954      Pass  16521          1H
##    eventSec subEventId        id
## 1  1.656214         85 258612104
## 2  4.487814         83 258612106
## 3  5.937411         10 258612077
## 4  6.406961         10 258612112
## 5  8.562167         85 258612110
## 6 10.991292         85 258612113

Tidy names

head(clean_names(wcevents))
##   event_id sub_event_name                        tags player_id
## 1        8    Simple pass              [{'id': 1801}]    122671
## 2        8      High pass              [{'id': 1801}]    139393
## 3        1       Air duel [{'id': 703}, {'id': 1801}]    103668
## 4        1       Air duel [{'id': 701}, {'id': 1802}]    122940
## 5        8    Simple pass              [{'id': 1801}]    122847
## 6        8    Simple pass              [{'id': 1801}]    122832
##                                  positions match_id event_name team_id
## 1 [{'y': 50, 'x': 50}, {'y': 53, 'x': 35}]  2057954       Pass   16521
## 2 [{'y': 53, 'x': 35}, {'y': 19, 'x': 75}]  2057954       Pass   16521
## 3 [{'y': 81, 'x': 25}, {'y': 83, 'x': 37}]  2057954       Duel   14358
## 4 [{'y': 19, 'x': 75}, {'y': 17, 'x': 63}]  2057954       Duel   16521
## 5 [{'y': 17, 'x': 63}, {'y': 15, 'x': 71}]  2057954       Pass   16521
## 6 [{'y': 15, 'x': 71}, {'y': 11, 'x': 92}]  2057954       Pass   16521
##   match_period event_sec sub_event_id        id
## 1           1H  1.656214           85 258612104
## 2           1H  4.487814           83 258612106
## 3           1H  5.937411           10 258612077
## 4           1H  6.406961           10 258612112
## 5           1H  8.562167           85 258612110
## 6           1H 10.991292           85 258612113
cleanwcevents <- clean_names(wcevents)
# rename match_period to half (use format: new name = old name)
cleanwcevents <- rename(cleanwcevents, half = match_period)
names(cleanwcevents) #list the new names
##  [1] "event_id"       "sub_event_name" "tags"           "player_id"     
##  [5] "positions"      "match_id"       "event_name"     "team_id"       
##  [9] "half"           "event_sec"      "sub_event_id"   "id"

I changed the match_period name to half, which is more widely understood

Filter data to World Cup Final match

wcfinal <- cleanwcevents %>%
  filter(match_id == 2058017)

head(wcfinal)
##   event_id         sub_event_name                        tags player_id
## 1        8            Simple pass                          []     14943
## 2        8            Simple pass              [{'id': 1801}]     69968
## 3        8            Simple pass              [{'id': 1801}]      8287
## 4        8            Simple pass              [{'id': 1801}]     69409
## 5        8                 Launch              [{'id': 1801}]    135747
## 6        1 Ground loose ball duel [{'id': 703}, {'id': 1801}]     14943
##                                  positions match_id event_name team_id half
## 1                     [{'y': 51, 'x': 50}]  2058017       Pass    9598   1H
## 2 [{'y': 52, 'x': 39}, {'y': 74, 'x': 34}]  2058017       Pass    9598   1H
## 3 [{'y': 74, 'x': 34}, {'y': 93, 'x': 30}]  2058017       Pass    9598   1H
## 4 [{'y': 93, 'x': 30}, {'y': 65, 'x': 11}]  2058017       Pass    9598   1H
## 5 [{'y': 65, 'x': 11}, {'y': 51, 'x': 64}]  2058017       Pass    9598   1H
## 6 [{'y': 51, 'x': 64}, {'y': 20, 'x': 63}]  2058017       Duel    9598   1H
##   event_sec sub_event_id        id
## 1  1.892339           85 263883958
## 2  3.889375           85 263883959
## 3  6.140946           85 263883960
## 4  9.226570           85 263883963
## 5 12.658969           84 263883964
## 6 15.685687           13 263885674

Ask questions about the data:

Which players were best passers in the World Cup Final? (Comparing each player’s total passes against the average number of passes)

Here I convert the event text data and convert it into a numerical format, with 1 indicating a pass was attempted, and 0 indicating there was not a pass. This is then used to find the mean which can be used to analyze if a player passed above the mean or not.

player_pass_summary <- wcfinal %>%
  mutate(pass_numeric = ifelse(event_name == "Pass", 1, 0)) %>%
  group_by(player_id )%>%
  summarize(total_passes = sum(pass_numeric))%>%
  mutate(avg_passes = mean(total_passes))%>%
  mutate(passing_above_mean = total_passes > avg_passes)
knitr::kable(player_pass_summary, "simple")
player_id total_passes avg_passes passing_above_mean
3309 22 28.11111 FALSE
3476 62 28.11111 TRUE
3682 22 28.11111 FALSE
7936 29 28.11111 TRUE
8200 14 28.11111 FALSE
8287 66 28.11111 TRUE
14812 35 28.11111 TRUE
14943 28 28.11111 FALSE
25381 16 28.11111 FALSE
25393 65 28.11111 TRUE
25397 20 28.11111 FALSE
25437 22 28.11111 FALSE
26010 21 28.11111 FALSE
28115 2 28.11111 FALSE
31528 13 28.11111 FALSE
69396 46 28.11111 TRUE
69409 62 28.11111 TRUE
69411 10 28.11111 FALSE
69616 14 28.11111 FALSE
69968 90 28.11111 TRUE
105361 25 28.11111 FALSE
135747 13 28.11111 FALSE
135810 2 28.11111 FALSE
209091 7 28.11111 FALSE
279545 20 28.11111 FALSE
340646 20 28.11111 FALSE
353833 13 28.11111 FALSE

Clean data to change positions column into four usable columns

The positions column contains four key data metrics (start y, start x, end y, end x). I used the separate function from tidyr to split the positions column based on the symbol delimiters “[,:}{}]”. I then removed the columns I did not want to leave only the four relevant columns.

wcfinaltable <- wcfinal %>%
  separate(positions, into = c("a", "a1", "start_y", "a2", "start_x", "a3", "a4", "a5", "end_y", "a6", "end_x"), 
           #separate out commas, semicolon, and all brackets
           sep = "[,:}{}]") %>%
  #remove if it starts with a
  select(-starts_with("a"))
head(wcfinaltable)
##   event_id         sub_event_name                        tags player_id start_y
## 1        8            Simple pass                          []     14943      51
## 2        8            Simple pass              [{'id': 1801}]     69968      52
## 3        8            Simple pass              [{'id': 1801}]      8287      74
## 4        8            Simple pass              [{'id': 1801}]     69409      93
## 5        8                 Launch              [{'id': 1801}]    135747      65
## 6        1 Ground loose ball duel [{'id': 703}, {'id': 1801}]     14943      51
##   start_x end_y end_x match_id event_name team_id half event_sec sub_event_id
## 1      50  <NA>  <NA>  2058017       Pass    9598   1H  1.892339           85
## 2      39    74    34  2058017       Pass    9598   1H  3.889375           85
## 3      34    93    30  2058017       Pass    9598   1H  6.140946           85
## 4      30    65    11  2058017       Pass    9598   1H  9.226570           85
## 5      11    51    64  2058017       Pass    9598   1H 12.658969           84
## 6      64    20    63  2058017       Duel    9598   1H 15.685687           13
##          id
## 1 263883958
## 2 263883959
## 3 263883960
## 4 263883963
## 5 263883964
## 6 263885674

Write csv’s for both passing summary and clean data table for future use

write_csv(player_pass_summary, "player_pass_summary_stats.csv")
write_csv(wcfinaltable, "cleanwctable.csv")

Ethan’s lab 2 is complete!