Interested in learning more about the predictive abilities of AI in sports, this article discusses the probabilities of teams progressing through the World Cup given their Soccer Power Index (SPI) ratings. Looking at the relationship between SPI and tournament performance can help inform the value of this metric.
# Read in CSV and assign it to data frame 'x'
x <- read.csv(url("https://projects.fivethirtyeight.com/soccer-api/international/2022/wc_matches.csv"))
head(x)
## date league_id league team1 team2 spi1 spi2 prob1
## 1 2022-11-20 1908 FIFA World Cup Qatar Ecuador 51.00 72.74 0.2369
## 2 2022-11-21 1908 FIFA World Cup England Iran 85.96 62.17 0.6274
## 3 2022-11-21 1908 FIFA World Cup Senegal Netherlands 73.84 86.01 0.2235
## 4 2022-11-21 1908 FIFA World Cup USA Wales 74.83 65.58 0.4489
## 5 2022-11-22 1908 FIFA World Cup Argentina Saudi Arabia 87.21 56.87 0.7228
## 6 2022-11-22 1908 FIFA World Cup Denmark Tunisia 80.02 65.85 0.5001
## prob2 probtie proj_score1 proj_score2 score1 score2 xg1 xg2 nsxg1 nsxg2
## 1 0.5045 0.2586 1.13 1.75 0 2 0.23 1.14 0.24 1.35
## 2 0.1187 0.2539 1.70 0.58 6 2 1.04 1.45 1.50 0.32
## 3 0.5053 0.2712 0.99 1.63 0 2 0.70 0.68 1.22 1.83
## 4 0.2591 0.2920 1.42 1.01 1 1 0.33 1.78 0.48 0.95
## 5 0.0807 0.1966 2.11 0.54 1 2 1.63 0.15 2.40 0.53
## 6 0.2054 0.2945 1.44 0.82 0 0 0.66 1.16 1.33 0.69
## adj_score1 adj_score2
## 1 0.00 2.10
## 2 5.78 2.10
## 3 0.00 1.58
## 4 1.05 1.05
## 5 1.05 2.10
## 6 0.00 0.00
# Check data classes
lapply(x, class)
## $date
## [1] "character"
##
## $league_id
## [1] "integer"
##
## $league
## [1] "character"
##
## $team1
## [1] "character"
##
## $team2
## [1] "character"
##
## $spi1
## [1] "numeric"
##
## $spi2
## [1] "numeric"
##
## $prob1
## [1] "numeric"
##
## $prob2
## [1] "numeric"
##
## $probtie
## [1] "numeric"
##
## $proj_score1
## [1] "numeric"
##
## $proj_score2
## [1] "numeric"
##
## $score1
## [1] "integer"
##
## $score2
## [1] "integer"
##
## $xg1
## [1] "numeric"
##
## $xg2
## [1] "numeric"
##
## $nsxg1
## [1] "numeric"
##
## $nsxg2
## [1] "numeric"
##
## $adj_score1
## [1] "numeric"
##
## $adj_score2
## [1] "numeric"
# Change date column data type to date
x$date <- as.Date(x$date)
# Confirm update
class(x$date)
## [1] "Date"
# Rename columns
colnames(x)[colnames(x) == "spi1"] <- "team1_power_index"
colnames(x)[colnames(x) == "spi2"] <- "team2_power_index"
colnames(x)[colnames(x) == "prob1"] <- "team1_win_prob"
colnames(x)[colnames(x) == "prob2"] <- "team2_win_prob"
colnames(x)[colnames(x) == "proj_score1"] <- "team1_proj_score"
colnames(x)[colnames(x) == "proj_score2"] <- "team2_proj_score"
colnames(x)[colnames(x) == "score1"] <- "team1_score"
colnames(x)[colnames(x) == "score2"] <- "team2_score"
colnames(x)[colnames(x) == "xg1"] <- "team1_expected_goals"
colnames(x)[colnames(x) == "xg2"] <- "team2_expected_goals"
colnames(x)[colnames(x) == "nsxg1"] <- "non_shot_team1_expected_goals"
colnames(x)[colnames(x) == "nsxg2"] <- "non_shot_team2_expected_goals"
# Create 'Stage' column based on the dates in which the games occurred
x$stage <- ifelse(x$date < as.Date("2022-12-03"), 'Group Stage',
ifelse(x$date >= as.Date("2022-12-03") & x$date < as.Date("2022-12-09"), 'Round of 16',
ifelse(x$date >= as.Date("2022-12-09") & x$date < as.Date("2022-12-13"), 'Quarterfinals',
ifelse(x$date >= as.Date("2022-12-13") & x$date < as.Date("2022-12-17"), 'Semifinals',
ifelse(x$date == as.Date("2022-12-18"), 'Finals', '3rd Place')))))
head(x[, c('date', "team1", "team2", 'team1_score', 'team2_score', 'stage')])
## date team1 team2 team1_score team2_score stage
## 1 2022-11-20 Qatar Ecuador 0 2 Group Stage
## 2 2022-11-21 England Iran 6 2 Group Stage
## 3 2022-11-21 Senegal Netherlands 0 2 Group Stage
## 4 2022-11-21 USA Wales 1 1 Group Stage
## 5 2022-11-22 Argentina Saudi Arabia 1 2 Group Stage
## 6 2022-11-22 Denmark Tunisia 0 0 Group Stage
# Create 'Result' column based on the dates in which the games occurred
x$result <- ifelse(x$team1_score > x$team2_score, paste(x$team1, 'beats', x$team2, x$team1_score, '-', x$team2_score, sep = ' '),
ifelse(x$team2_score > x$team1_score, paste(x$team2, 'beats', x$team1, x$team2_score, '-', x$team1_score, sep = ' '),
ifelse(x$team2_score == x$team1_score & x$date < as.Date("2022-12-03"), paste(x$team1, 'and', x$team2, 'tie', x$team1_score, '-', x$team2_score, sep = ' '), paste(x$team1, 'tied', x$team2, x$team1_score, '-', x$team2_score, 'in regular time, heading to ET', sep = ' '))))
head(x[, c('date', "team1", "team2", 'team1_score', 'team2_score', 'stage', 'result')])
## date team1 team2 team1_score team2_score stage
## 1 2022-11-20 Qatar Ecuador 0 2 Group Stage
## 2 2022-11-21 England Iran 6 2 Group Stage
## 3 2022-11-21 Senegal Netherlands 0 2 Group Stage
## 4 2022-11-21 USA Wales 1 1 Group Stage
## 5 2022-11-22 Argentina Saudi Arabia 1 2 Group Stage
## 6 2022-11-22 Denmark Tunisia 0 0 Group Stage
## result
## 1 Ecuador beats Qatar 2 - 0
## 2 England beats Iran 6 - 2
## 3 Netherlands beats Senegal 2 - 0
## 4 USA and Wales tie 1 - 1
## 5 Saudi Arabia beats Argentina 2 - 1
## 6 Denmark and Tunisia tie 0 - 0
# Create new df with specified columns
final_df <- (x[, c("date",'stage', "team1", "team2", 'team1_power_index', 'team2_power_index', 'team1_expected_goals', 'team2_expected_goals', 'result')])
head(final_df)
## date stage team1 team2 team1_power_index
## 1 2022-11-20 Group Stage Qatar Ecuador 51.00
## 2 2022-11-21 Group Stage England Iran 85.96
## 3 2022-11-21 Group Stage Senegal Netherlands 73.84
## 4 2022-11-21 Group Stage USA Wales 74.83
## 5 2022-11-22 Group Stage Argentina Saudi Arabia 87.21
## 6 2022-11-22 Group Stage Denmark Tunisia 80.02
## team2_power_index team1_expected_goals team2_expected_goals
## 1 72.74 0.23 1.14
## 2 62.17 1.04 1.45
## 3 86.01 0.70 0.68
## 4 65.58 0.33 1.78
## 5 56.87 1.63 0.15
## 6 65.85 0.66 1.16
## result
## 1 Ecuador beats Qatar 2 - 0
## 2 England beats Iran 6 - 2
## 3 Netherlands beats Senegal 2 - 0
## 4 USA and Wales tie 1 - 1
## 5 Saudi Arabia beats Argentina 2 - 1
## 6 Denmark and Tunisia tie 0 - 0
Which game had the biggest winning margin? Which team won and what was the score?
# Store the absolute values of the game margins
difference <- abs(x$team1_score - x$team2_score)
# Identify largest absolute value
largest_margin <- which.max(difference)
# Locate the row in which the largest margin exists
big_diff <- x[largest_margin,]
# Create conditional logic to concatenate result and tell us where the biggest whooping was
big_diff$result <- ifelse(big_diff$team1_score > big_diff$team2_score, paste(big_diff$team1, 'whooped', big_diff$team2, big_diff$team1_score, '-', big_diff$team2_score), paste(big_diff$team2, 'whooped', big_diff$team1, big_diff$team2_score, '-', big_diff$score1))
print(big_diff$result)
## [1] "Spain whooped Costa Rica 7 - 0"
What does the relationship between the difference in Soccer Power Index ratings and the difference in scores look like?
# Calculate absolute differeence
x$power_index_difference <- abs(x$team1_power_index - x$team2_power_index)
x$score_difference <- abs(x$team1_score - x$team2_score)
# Plot the differences
plot(x$power_index_difference, x$score_difference,
xlab = "Power Index Difference",
ylab = "Score Difference",
main = "Relationship Between Winning Margin and SPI Difference")
# Add a line of best fit
fit <- lm(score_difference ~ power_index_difference, data = x)
abline(fit, col = "red")
The final visualization, perhaps being the most informative, is still lacking the amount of data needed to feel confident in the value of the SPI metric. Early indiciation are promising but I would like to have more data from previous tournaments or World Cups and see how the trendline changes (or doesn’t). Given more time and resources, more advanced metrics using expected goals and other ways of determining a teams likelihood of success would need to be tested in comparison to the SPI.