Week 1 Assignment

2022 World Cup Predictions

library(tidyverse)
── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
✔ dplyr     1.1.2     ✔ readr     2.1.4
✔ forcats   1.0.0     ✔ stringr   1.5.0
✔ ggplot2   3.4.3     ✔ tibble    3.2.1
✔ lubridate 1.9.2     ✔ tidyr     1.3.0
✔ purrr     1.0.2     
── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
✖ dplyr::filter() masks stats::filter()
✖ dplyr::lag()    masks stats::lag()
ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(ggthemes)
library(gt)
library(ggplot2)
echo = FALSE

Introduction:

This article is about predicting the chances of winning, losing or getting tied of each soccer team that participated in the Qatar World Cup 2022. In addition, this article is showing, in a table, how likely each team will be placed first or second in their groups as well as their chances to move up to the next stage of the competition.

Reading the dataset

There are two data sets: one is the forecasts and the other one is the matches played.

library(readr)
wc_forecasts <- read_csv("https://raw.githubusercontent.com/SalouaDaouki/Data-607-Week-1-assignment-/main/wc_forecasts.csv")
Rows: 256 Columns: 22
── Column specification ────────────────────────────────────────────────────────
Delimiter: ","
chr  (4): forecast_timestamp, team, group, timestamp
dbl (18): spi, global_o, global_d, sim_wins, sim_ties, sim_losses, sim_goal_...

ℹ Use `spec()` to retrieve the full column specification for this data.
ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
glimpse(wc_forecasts)
Rows: 256
Columns: 22
$ forecast_timestamp <chr> "2022-12-18 17:56:03 UTC", "2022-12-18 17:56:03 UTC…
$ team               <chr> "Argentina", "France", "Morocco", "Croatia", "Engla…
$ group              <chr> "C", "D", "F", "F", "B", "A", "H", "G", "E", "A", "…
$ spi                <dbl> 89.64860, 88.30043, 73.16416, 78.82038, 87.82131, 8…
$ global_o           <dbl> 2.83610, 2.96765, 1.74313, 2.20264, 2.71564, 2.5271…
$ global_d           <dbl> 0.39397, 0.54381, 0.53433, 0.60290, 0.44261, 0.5494…
$ sim_wins           <dbl> 2, 2, 2, 1, 2, 2, 2, 2, 2, 2, 2, 2, 1, 1, 1, 1, 1, …
$ sim_ties           <dbl> 0, 0, 1, 2, 1, 1, 0, 0, 0, 0, 0, 0, 2, 1, 1, 1, 1, …
$ sim_losses         <dbl> 1, 1, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, …
$ sim_goal_diff      <dbl> 3, 3, 3, 3, 7, 4, 2, 2, 1, 1, 1, -1, 1, 6, 0, 0, 1,…
$ goals_scored       <dbl> 5, 6, 4, 4, 9, 5, 6, 3, 4, 5, 4, 3, 2, 9, 4, 2, 6, …
$ goals_against      <dbl> 2, 3, 1, 1, 2, 1, 4, 1, 3, 4, 3, 4, 1, 3, 4, 2, 5, …
$ group_1            <dbl> 1, 1, 1, 0, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, …
$ group_2            <dbl> 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 0, …
$ group_3            <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, …
$ group_4            <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
$ make_round_of_16   <dbl> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, …
$ make_quarters      <dbl> 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
$ make_semis         <dbl> 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
$ make_final         <dbl> 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
$ win_league         <dbl> 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
$ timestamp          <chr> "2022-12-18 17:56:44 UTC", "2022-12-18 17:56:44 UTC…
library(readr)
wc_matches <- read_csv("https://raw.githubusercontent.com/SalouaDaouki/Data-607-Week-1-assignment-/main/wc_matches.csv")
Rows: 64 Columns: 20
── Column specification ────────────────────────────────────────────────────────
Delimiter: ","
chr   (3): league, team1, team2
dbl  (16): league_id, spi1, spi2, prob1, prob2, probtie, proj_score1, proj_s...
date  (1): date

ℹ Use `spec()` to retrieve the full column specification for this data.
ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
glimpse(wc_matches)
Rows: 64
Columns: 20
$ date        <date> 2022-11-20, 2022-11-21, 2022-11-21, 2022-11-21, 2022-11-2…
$ league_id   <dbl> 1908, 1908, 1908, 1908, 1908, 1908, 1908, 1908, 1908, 1908…
$ league      <chr> "FIFA World Cup", "FIFA World Cup", "FIFA World Cup", "FIF…
$ team1       <chr> "Qatar", "England", "Senegal", "USA", "Argentina", "Denmar…
$ team2       <chr> "Ecuador", "Iran", "Netherlands", "Wales", "Saudi Arabia",…
$ spi1        <dbl> 51.00, 85.96, 73.84, 74.83, 87.21, 80.02, 74.30, 87.71, 75…
$ spi2        <dbl> 72.74, 62.17, 86.01, 65.58, 56.87, 65.85, 68.28, 60.83, 78…
$ prob1       <dbl> 0.2369, 0.6274, 0.2235, 0.4489, 0.7228, 0.5001, 0.4238, 0.…
$ prob2       <dbl> 0.5045, 0.1187, 0.5053, 0.2591, 0.0807, 0.2054, 0.2802, 0.…
$ probtie     <dbl> 0.2586, 0.2539, 0.2712, 0.2920, 0.1966, 0.2945, 0.2960, 0.…
$ proj_score1 <dbl> 1.13, 1.70, 0.99, 1.42, 2.11, 1.44, 1.37, 2.09, 1.18, 2.14…
$ proj_score2 <dbl> 1.75, 0.58, 1.63, 1.01, 0.54, 0.82, 1.06, 0.65, 1.34, 1.06…
$ score1      <dbl> 0, 6, 0, 1, 1, 0, 0, 4, 0, 1, 7, 1, 1, 0, 3, 2, 0, 1, 1, 0…
$ score2      <dbl> 2, 2, 2, 1, 2, 0, 0, 1, 0, 2, 0, 0, 0, 0, 2, 0, 2, 3, 1, 0…
$ xg1         <dbl> 0.23, 1.04, 0.70, 0.33, 1.63, 0.66, 0.45, 3.03, 0.28, 3.10…
$ xg2         <dbl> 1.14, 1.45, 0.68, 1.78, 0.15, 1.16, 1.02, 0.26, 0.88, 1.20…
$ nsxg1       <dbl> 0.24, 1.50, 1.22, 0.48, 2.40, 1.33, 1.19, 3.01, 0.54, 3.10…
$ nsxg2       <dbl> 1.35, 0.32, 1.83, 0.95, 0.53, 0.69, 0.49, 0.30, 0.64, 0.85…
$ adj_score1  <dbl> 0.00, 5.78, 0.00, 1.05, 1.05, 0.00, 0.00, 4.18, 0.00, 1.05…
$ adj_score2  <dbl> 2.10, 2.10, 1.58, 1.05, 2.10, 0.00, 0.00, 1.05, 0.00, 2.10…

Creating a data frame with subset of columns:

Forecasts subset

wc_forecasts_sub = subset(wc_forecasts, spi>82, select = c(team, group, spi, make_final))
glimpse(wc_forecasts_sub)
Rows: 65
Columns: 4
$ team       <chr> "Argentina", "France", "England", "Netherlands", "Portugal"…
$ group      <chr> "C", "D", "B", "A", "H", "G", "E", "E", "C", "D", "B", "A",…
$ spi        <dbl> 89.64860, 88.30043, 87.82131, 83.97533, 87.02373, 93.18946,…
$ make_final <dbl> 1.00000, 1.00000, 0.00000, 0.00000, 0.00000, 0.00000, 0.000…
colnames(wc_forecasts_sub)[4] = "make_final" # this line of the code is to undo the mistake I did; I renamed the 4th column instead of the 3.
colnames(wc_forecasts_sub)[3] = "Soccer Power Index"
glimpse(wc_forecasts_sub)
Rows: 65
Columns: 4
$ team                 <chr> "Argentina", "France", "England", "Netherlands", …
$ group                <chr> "C", "D", "B", "A", "H", "G", "E", "E", "C", "D",…
$ `Soccer Power Index` <dbl> 89.64860, 88.30043, 87.82131, 83.97533, 87.02373,…
$ make_final           <dbl> 1.00000, 1.00000, 0.00000, 0.00000, 0.00000, 0.00…
wc_forecasts_sub %>% 
  group_by(group) %>%
  gt() %>%
  tab_header(
    title = md("The `Soccer Power Index`"),
    subtitle = md("The **best** teams of the `World Cup 2022`"))
The Soccer Power Index
The best teams of the World Cup 2022
team Soccer Power Index make_final
C
Argentina 89.64860 1.00000
Argentina 88.85631 1.00000
Argentina 87.45777 0.64042
Argentina 87.32471 0.24429
Argentina 87.98006 0.24966
Argentina 86.02625 0.12681
Argentina 86.09354 0.09295
Argentina 87.20776 0.15944
D
France 88.30043 1.00000
France 88.41321 1.00000
France 87.71519 0.66212
France 87.52870 0.27046
France 88.57378 0.24482
France 90.14910 0.27224
France 89.25241 0.22312
France 87.70516 0.17187
B
England 87.82131 0.00000
England 87.82131 0.00000
England 87.82131 0.00000
England 87.58969 0.29599
England 86.96946 0.19286
England 85.40758 0.14332
England 86.26103 0.15614
England 85.95712 0.14689
A
Netherlands 83.97533 0.00000
Netherlands 83.97533 0.00000
Netherlands 83.97533 0.00000
Netherlands 84.18963 0.15471
Netherlands 83.97368 0.13043
Netherlands 84.38370 0.12204
Netherlands 86.07414 0.14615
Netherlands 86.01102 0.11908
H
Portugal 87.02373 0.00000
Portugal 87.02373 0.00000
Portugal 87.02373 0.00000
Portugal 87.92472 0.32756
Portugal 85.79833 0.14492
Portugal 87.55187 0.18387
Portugal 87.30309 0.15419
Portugal 87.77456 0.15248
G
Brazil 93.18946 0.00000
Brazil 93.18946 0.00000
Brazil 93.18946 0.00000
Brazil 93.46631 0.50710
Brazil 92.89519 0.42125
Brazil 93.48139 0.39419
Brazil 93.65727 0.36793
Brazil 93.54699 0.32477
E
Spain 88.50675 0.00000
Germany 89.95279 0.00000
Spain 88.50675 0.00000
Germany 89.95279 0.00000
Spain 88.50675 0.00000
Germany 89.95279 0.00000
Spain 88.50675 0.00000
Germany 89.95279 0.00000
Spain 89.20054 0.25872
Germany 89.95279 0.00000
Spain 90.62031 0.23196
Germany 88.86004 0.12928
Spain 91.21880 0.23944
Germany 88.72557 0.06744
Spain 89.50604 0.18857
Germany 88.77370 0.14385
F
Belgium 82.49005 0.06141

Matches subset

wc_matches_sub = subset(wc_matches, select = c(team1, team2, spi1, spi2, adj_score1, adj_score2))
glimpse((wc_matches_sub))
Rows: 64
Columns: 6
$ team1      <chr> "Qatar", "England", "Senegal", "USA", "Argentina", "Denmark…
$ team2      <chr> "Ecuador", "Iran", "Netherlands", "Wales", "Saudi Arabia", …
$ spi1       <dbl> 51.00, 85.96, 73.84, 74.83, 87.21, 80.02, 74.30, 87.71, 75.…
$ spi2       <dbl> 72.74, 62.17, 86.01, 65.58, 56.87, 65.85, 68.28, 60.83, 78.…
$ adj_score1 <dbl> 0.00, 5.78, 0.00, 1.05, 1.05, 0.00, 0.00, 4.18, 0.00, 1.05,…
$ adj_score2 <dbl> 2.10, 2.10, 1.58, 1.05, 2.10, 0.00, 0.00, 1.05, 0.00, 2.10,…
  gt(wc_matches_sub) %>%
    tab_header(
      title = md("`The mathces played during the WC2022`"),
      subtitle = md("`Adjusted scores for each team`")
    )
The mathces played during the WC2022
Adjusted scores for each team
team1 team2 spi1 spi2 adj_score1 adj_score2
Qatar Ecuador 51.00 72.74 0.00 2.10
England Iran 85.96 62.17 5.78 2.10
Senegal Netherlands 73.84 86.01 0.00 1.58
USA Wales 74.83 65.58 1.05 1.05
Argentina Saudi Arabia 87.21 56.87 1.05 2.10
Denmark Tunisia 80.02 65.85 0.00 0.00
Mexico Poland 74.30 68.28 0.00 0.00
France Australia 87.71 60.83 4.18 1.05
Morocco Croatia 75.62 78.84 0.00 0.00
Germany Japan 88.77 71.44 1.05 2.10
Spain Costa Rica 89.51 55.46 6.22 0.00
Belgium Canada 82.49 71.59 1.05 0.00
Switzerland Cameroon 77.65 64.16 1.05 0.00
Uruguay South Korea 80.90 66.12 0.00 0.00
Portugal Ghana 87.77 58.63 2.90 2.10
Brazil Serbia 93.55 75.84 2.02 0.00
Wales Iran 67.64 63.33 0.00 1.26
Qatar Senegal 48.16 73.23 1.05 2.81
Netherlands Ecuador 86.07 74.37 1.05 1.05
England USA 86.26 72.63 0.00 0.00
Tunisia Australia 66.48 58.65 0.00 1.05
Poland Saudi Arabia 68.14 59.01 1.80 0.00
France Denmark 89.25 78.87 2.10 1.05
Argentina Mexico 86.09 73.48 1.66 0.00
Japan Costa Rica 71.98 51.99 0.00 1.05
Belgium Morocco 80.54 74.67 0.00 1.58
Croatia Canada 78.46 73.84 3.68 1.05
Spain Germany 91.22 88.73 1.05 1.05
Cameroon Serbia 63.62 74.55 3.15 3.15
South Korea Ghana 66.44 60.03 2.10 3.15
Brazil Switzerland 93.66 77.52 1.05 0.00
Portugal Uruguay 87.30 79.70 1.58 0.00
Netherlands Qatar 84.38 48.46 2.10 0.00
Ecuador Senegal 75.82 73.00 1.05 2.10
Iran USA 65.77 72.55 0.00 1.05
Wales England 65.17 85.41 0.00 3.15
Tunisia France 65.92 90.15 1.05 0.00
Australia Denmark 58.80 77.68 1.05 0.00
Poland Argentina 68.95 86.03 0.00 2.10
Saudi Arabia Mexico 58.41 72.44 1.05 2.10
Canada Morocco 71.52 74.99 1.05 2.10
Croatia Belgium 80.87 79.44 0.00 0.00
Costa Rica Germany 52.90 88.86 2.10 3.72
Japan Spain 70.43 90.62 2.10 1.05
Ghana Uruguay 60.50 79.28 0.00 2.10
South Korea Portugal 66.93 87.55 2.10 1.05
Serbia Switzerland 74.72 77.00 2.10 3.15
Cameroon Brazil 64.48 93.48 1.05 0.00
Netherlands USA 83.97 73.07 2.87 1.05
Argentina Australia 87.98 59.35 2.10 1.05
France Poland 88.57 65.77 2.52 1.05
England Senegal 86.97 75.47 3.15 0.00
Japan Croatia 73.02 79.14 1.05 1.05
Brazil South Korea 92.90 69.40 4.20 1.05
Morocco Spain 74.42 89.20 0.00 0.00
Portugal Switzerland 85.80 78.51 5.78 1.05
Croatia Brazil 78.99 93.47 1.05 1.05
Netherlands Argentina 84.19 87.32 2.10 2.03
Morocco Portugal 74.45 87.92 1.05 0.00
England France 87.59 87.53 1.05 2.10
Argentina Croatia 87.46 79.37 3.15 0.00
France Morocco 87.72 75.13 1.87 0.00
Croatia Morocco 77.65 73.92 2.10 1.05
Argentina France 88.86 88.41 3.15 3.15
ggplot(wc_matches_sub, aes(spi1, adj_score1)) +
  geom_point() +
  geom_smooth()
`geom_smooth()` using method = 'loess' and formula = 'y ~ x'

cor(wc_matches_sub$spi1, wc_matches_sub$adj_score1)
[1] 0.3982406
ggplot(wc_matches_sub, aes(spi2, adj_score2)) +
  geom_point () +
  geom_smooth ()
`geom_smooth()` using method = 'loess' and formula = 'y ~ x'

cor(wc_matches_sub$spi2, wc_matches_sub$adj_score2)
[1] 0.08885976

Conclusion:

  1. After filtering and summarizing the forecasts data to include only the teams that have the Soccer Power Index greater than 82%, I noticed that all these teams were placed in either first or second within their groups, except for groups E & F.
  2. In the matches table above, the team has higher Soccer Power Index score tends to have higher adjusted score. This doesn’t show in the scatter plots above because they are showing the relationship between the spi and the adj_score for all teams while they are not grouped by high spi and low spi.