Introduction:

Interested in learning more about the predictive abilities of AI in sports, this article discusses the probabilities of teams progressing through the World Cup given their Soccer Power Index (SPI) ratings. Looking at the relationship between SPI and tournament performance can help inform the value of this metric.

Read in CSV

# Read in CSV and assign it to data frame 'x'
x <- read.csv(url("https://projects.fivethirtyeight.com/soccer-api/international/2022/wc_matches.csv"))
head(x)
##         date league_id         league     team1        team2  spi1  spi2  prob1
## 1 2022-11-20      1908 FIFA World Cup     Qatar      Ecuador 51.00 72.74 0.2369
## 2 2022-11-21      1908 FIFA World Cup   England         Iran 85.96 62.17 0.6274
## 3 2022-11-21      1908 FIFA World Cup   Senegal  Netherlands 73.84 86.01 0.2235
## 4 2022-11-21      1908 FIFA World Cup       USA        Wales 74.83 65.58 0.4489
## 5 2022-11-22      1908 FIFA World Cup Argentina Saudi Arabia 87.21 56.87 0.7228
## 6 2022-11-22      1908 FIFA World Cup   Denmark      Tunisia 80.02 65.85 0.5001
##    prob2 probtie proj_score1 proj_score2 score1 score2  xg1  xg2 nsxg1 nsxg2
## 1 0.5045  0.2586        1.13        1.75      0      2 0.23 1.14  0.24  1.35
## 2 0.1187  0.2539        1.70        0.58      6      2 1.04 1.45  1.50  0.32
## 3 0.5053  0.2712        0.99        1.63      0      2 0.70 0.68  1.22  1.83
## 4 0.2591  0.2920        1.42        1.01      1      1 0.33 1.78  0.48  0.95
## 5 0.0807  0.1966        2.11        0.54      1      2 1.63 0.15  2.40  0.53
## 6 0.2054  0.2945        1.44        0.82      0      0 0.66 1.16  1.33  0.69
##   adj_score1 adj_score2
## 1       0.00       2.10
## 2       5.78       2.10
## 3       0.00       1.58
## 4       1.05       1.05
## 5       1.05       2.10
## 6       0.00       0.00

Check and fix and problematic data types

# Check data classes
lapply(x, class)
## $date
## [1] "character"
## 
## $league_id
## [1] "integer"
## 
## $league
## [1] "character"
## 
## $team1
## [1] "character"
## 
## $team2
## [1] "character"
## 
## $spi1
## [1] "numeric"
## 
## $spi2
## [1] "numeric"
## 
## $prob1
## [1] "numeric"
## 
## $prob2
## [1] "numeric"
## 
## $probtie
## [1] "numeric"
## 
## $proj_score1
## [1] "numeric"
## 
## $proj_score2
## [1] "numeric"
## 
## $score1
## [1] "integer"
## 
## $score2
## [1] "integer"
## 
## $xg1
## [1] "numeric"
## 
## $xg2
## [1] "numeric"
## 
## $nsxg1
## [1] "numeric"
## 
## $nsxg2
## [1] "numeric"
## 
## $adj_score1
## [1] "numeric"
## 
## $adj_score2
## [1] "numeric"
# Change date column data type to date
x$date <- as.Date(x$date)

# Confirm update
class(x$date)
## [1] "Date"

Change column names as needed

# Rename columns
colnames(x)[colnames(x) == "spi1"] <- "team1_power_index"
colnames(x)[colnames(x) == "spi2"] <- "team2_power_index"
colnames(x)[colnames(x) == "prob1"] <- "team1_win_prob"
colnames(x)[colnames(x) == "prob2"] <- "team2_win_prob"
colnames(x)[colnames(x) == "proj_score1"] <- "team1_proj_score"
colnames(x)[colnames(x) == "proj_score2"] <- "team2_proj_score"
colnames(x)[colnames(x) == "score1"] <- "team1_score"
colnames(x)[colnames(x) == "score2"] <- "team2_score"
colnames(x)[colnames(x) == "xg1"] <- "team1_expected_goals"
colnames(x)[colnames(x) == "xg2"] <- "team2_expected_goals"
colnames(x)[colnames(x) == "nsxg1"] <- "non_shot_team1_expected_goals"
colnames(x)[colnames(x) == "nsxg2"] <- "non_shot_team2_expected_goals"

Create ‘Stage’ column for missing information

# Create 'Stage' column based on the dates in which the games occurred
x$stage <- ifelse(x$date < as.Date("2022-12-03"), 'Group Stage',
           ifelse(x$date >= as.Date("2022-12-03") & x$date < as.Date("2022-12-09"), 'Round of 16',
           ifelse(x$date >= as.Date("2022-12-09") & x$date < as.Date("2022-12-13"), 'Quarterfinals',
           ifelse(x$date >= as.Date("2022-12-13") & x$date < as.Date("2022-12-17"), 'Semifinals',
           ifelse(x$date == as.Date("2022-12-18"), 'Finals', '3rd Place')))))

head(x[, c('date', "team1", "team2", 'team1_score', 'team2_score', 'stage')])    
##         date     team1        team2 team1_score team2_score       stage
## 1 2022-11-20     Qatar      Ecuador           0           2 Group Stage
## 2 2022-11-21   England         Iran           6           2 Group Stage
## 3 2022-11-21   Senegal  Netherlands           0           2 Group Stage
## 4 2022-11-21       USA        Wales           1           1 Group Stage
## 5 2022-11-22 Argentina Saudi Arabia           1           2 Group Stage
## 6 2022-11-22   Denmark      Tunisia           0           0 Group Stage

Create ‘Result’ based on the scores of each team

# Create 'Result' column based on the dates in which the games occurred
x$result <- ifelse(x$team1_score > x$team2_score, paste(x$team1, 'beats', x$team2, x$team1_score, '-', x$team2_score, sep = ' '),
            ifelse(x$team2_score > x$team1_score, paste(x$team2, 'beats', x$team1, x$team2_score, '-', x$team1_score, sep = ' '),
            ifelse(x$team2_score == x$team1_score & x$date < as.Date("2022-12-03"), paste(x$team1, 'and', x$team2, 'tie', x$team1_score, '-', x$team2_score, sep = ' '), paste(x$team1, 'tied', x$team2, x$team1_score, '-', x$team2_score, 'in regular time, heading to ET', sep = ' '))))

head(x[, c('date', "team1", "team2", 'team1_score', 'team2_score', 'stage', 'result')])
##         date     team1        team2 team1_score team2_score       stage
## 1 2022-11-20     Qatar      Ecuador           0           2 Group Stage
## 2 2022-11-21   England         Iran           6           2 Group Stage
## 3 2022-11-21   Senegal  Netherlands           0           2 Group Stage
## 4 2022-11-21       USA        Wales           1           1 Group Stage
## 5 2022-11-22 Argentina Saudi Arabia           1           2 Group Stage
## 6 2022-11-22   Denmark      Tunisia           0           0 Group Stage
##                               result
## 1          Ecuador beats Qatar 2 - 0
## 2           England beats Iran 6 - 2
## 3    Netherlands beats Senegal 2 - 0
## 4            USA and Wales tie 1 - 1
## 5 Saudi Arabia beats Argentina 2 - 1
## 6      Denmark and Tunisia tie 0 - 0

Create subset Data Frame with new columns and column names

# Create new df with specified columns
final_df <- (x[, c("date",'stage', "team1", "team2", 'team1_power_index', 'team2_power_index', 'team1_expected_goals', 'team2_expected_goals', 'result')])
head(final_df)
##         date       stage     team1        team2 team1_power_index
## 1 2022-11-20 Group Stage     Qatar      Ecuador             51.00
## 2 2022-11-21 Group Stage   England         Iran             85.96
## 3 2022-11-21 Group Stage   Senegal  Netherlands             73.84
## 4 2022-11-21 Group Stage       USA        Wales             74.83
## 5 2022-11-22 Group Stage Argentina Saudi Arabia             87.21
## 6 2022-11-22 Group Stage   Denmark      Tunisia             80.02
##   team2_power_index team1_expected_goals team2_expected_goals
## 1             72.74                 0.23                 1.14
## 2             62.17                 1.04                 1.45
## 3             86.01                 0.70                 0.68
## 4             65.58                 0.33                 1.78
## 5             56.87                 1.63                 0.15
## 6             65.85                 0.66                 1.16
##                               result
## 1          Ecuador beats Qatar 2 - 0
## 2           England beats Iran 6 - 2
## 3    Netherlands beats Senegal 2 - 0
## 4            USA and Wales tie 1 - 1
## 5 Saudi Arabia beats Argentina 2 - 1
## 6      Denmark and Tunisia tie 0 - 0

Extra Analyses

Which game had the biggest winning margin? Which team won and what was the score?

# Store the absolute values of the game margins
difference <- abs(x$team1_score - x$team2_score)

# Identify largest absolute value
largest_margin <- which.max(difference)

# Locate the row in which the largest margin exists
big_diff <- x[largest_margin,]

# Create conditional logic to concatenate result and tell us where the biggest whooping was
big_diff$result <- ifelse(big_diff$team1_score > big_diff$team2_score, paste(big_diff$team1, 'whooped', big_diff$team2, big_diff$team1_score, '-', big_diff$team2_score), paste(big_diff$team2, 'whooped', big_diff$team1, big_diff$team2_score, '-', big_diff$score1))
print(big_diff$result)
## [1] "Spain whooped Costa Rica 7 - 0"

What does the relationship between the difference in Soccer Power Index ratings and the difference in scores look like?

# Calculate absolute differeence
x$power_index_difference <- abs(x$team1_power_index - x$team2_power_index)
x$score_difference <- abs(x$team1_score - x$team2_score)

# Plot the differences
plot(x$power_index_difference, x$score_difference,
     xlab = "Power Index Difference",
     ylab = "Score Difference",
     main = "Relationship Between Winning Margin and SPI Difference")

# Add a line of best fit
fit <- lm(score_difference ~ power_index_difference, data = x)
abline(fit, col = "red")

Conclusion:

The final visualization, perhaps being the most informative, is still lacking the amount of data needed to feel confident in the value of the SPI metric. Early indiciation are promising but I would like to have more data from previous tournaments or World Cups and see how the trendline changes (or doesn’t). Given more time and resources, more advanced metrics using expected goals and other ways of determining a teams likelihood of success would need to be tested in comparison to the SPI.