Week 1 Assignment

2022 World Cup Predictions

library(tidyverse)

── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
✔ dplyr     1.1.2     ✔ readr     2.1.4
✔ forcats   1.0.0     ✔ stringr   1.5.0
✔ ggplot2   3.4.3     ✔ tibble    3.2.1
✔ lubridate 1.9.2     ✔ tidyr     1.3.0
✔ purrr     1.0.2     
── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
✖ dplyr::filter() masks stats::filter()
✖ dplyr::lag()    masks stats::lag()
ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors

library(ggthemes)
library(gt)
library(ggplot2)
echo = FALSE

Introduction:

This article is about predicting the chances of winning, losing or getting tied of each soccer team that participated in the Qatar World Cup 2022. In addition, this article is showing, in a table, how likely each team will be placed first or second in their groups as well as their chances to move up to the next stage of the competition.

Reading the dataset

There are two data sets: one is the forecasts and the other one is the matches played.

library(readr)
wc_forecasts <- read_csv("https://raw.githubusercontent.com/SalouaDaouki/Data-607-Week-1-assignment-/main/wc_forecasts.csv")

Rows: 256 Columns: 22
── Column specification ────────────────────────────────────────────────────────
Delimiter: ","
chr  (4): forecast_timestamp, team, group, timestamp
dbl (18): spi, global_o, global_d, sim_wins, sim_ties, sim_losses, sim_goal_...

ℹ Use `spec()` to retrieve the full column specification for this data.
ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.

glimpse(wc_forecasts)

Rows: 256
Columns: 22
$ forecast_timestamp <chr> "2022-12-18 17:56:03 UTC", "2022-12-18 17:56:03 UTC…
$ team               <chr> "Argentina", "France", "Morocco", "Croatia", "Engla…
$ group              <chr> "C", "D", "F", "F", "B", "A", "H", "G", "E", "A", "…
$ spi                <dbl> 89.64860, 88.30043, 73.16416, 78.82038, 87.82131, 8…
$ global_o           <dbl> 2.83610, 2.96765, 1.74313, 2.20264, 2.71564, 2.5271…
$ global_d           <dbl> 0.39397, 0.54381, 0.53433, 0.60290, 0.44261, 0.5494…
$ sim_wins           <dbl> 2, 2, 2, 1, 2, 2, 2, 2, 2, 2, 2, 2, 1, 1, 1, 1, 1, …
$ sim_ties           <dbl> 0, 0, 1, 2, 1, 1, 0, 0, 0, 0, 0, 0, 2, 1, 1, 1, 1, …
$ sim_losses         <dbl> 1, 1, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, …
$ sim_goal_diff      <dbl> 3, 3, 3, 3, 7, 4, 2, 2, 1, 1, 1, -1, 1, 6, 0, 0, 1,…
$ goals_scored       <dbl> 5, 6, 4, 4, 9, 5, 6, 3, 4, 5, 4, 3, 2, 9, 4, 2, 6, …
$ goals_against      <dbl> 2, 3, 1, 1, 2, 1, 4, 1, 3, 4, 3, 4, 1, 3, 4, 2, 5, …
$ group_1            <dbl> 1, 1, 1, 0, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, …
$ group_2            <dbl> 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 0, …
$ group_3            <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, …
$ group_4            <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
$ make_round_of_16   <dbl> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, …
$ make_quarters      <dbl> 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
$ make_semis         <dbl> 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
$ make_final         <dbl> 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
$ win_league         <dbl> 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
$ timestamp          <chr> "2022-12-18 17:56:44 UTC", "2022-12-18 17:56:44 UTC…

library(readr)
wc_matches <- read_csv("https://raw.githubusercontent.com/SalouaDaouki/Data-607-Week-1-assignment-/main/wc_matches.csv")

Rows: 64 Columns: 20
── Column specification ────────────────────────────────────────────────────────
Delimiter: ","
chr   (3): league, team1, team2
dbl  (16): league_id, spi1, spi2, prob1, prob2, probtie, proj_score1, proj_s...
date  (1): date

ℹ Use `spec()` to retrieve the full column specification for this data.
ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.

glimpse(wc_matches)

Rows: 64
Columns: 20
$ date        <date> 2022-11-20, 2022-11-21, 2022-11-21, 2022-11-21, 2022-11-2…
$ league_id   <dbl> 1908, 1908, 1908, 1908, 1908, 1908, 1908, 1908, 1908, 1908…
$ league      <chr> "FIFA World Cup", "FIFA World Cup", "FIFA World Cup", "FIF…
$ team1       <chr> "Qatar", "England", "Senegal", "USA", "Argentina", "Denmar…
$ team2       <chr> "Ecuador", "Iran", "Netherlands", "Wales", "Saudi Arabia",…
$ spi1        <dbl> 51.00, 85.96, 73.84, 74.83, 87.21, 80.02, 74.30, 87.71, 75…
$ spi2        <dbl> 72.74, 62.17, 86.01, 65.58, 56.87, 65.85, 68.28, 60.83, 78…
$ prob1       <dbl> 0.2369, 0.6274, 0.2235, 0.4489, 0.7228, 0.5001, 0.4238, 0.…
$ prob2       <dbl> 0.5045, 0.1187, 0.5053, 0.2591, 0.0807, 0.2054, 0.2802, 0.…
$ probtie     <dbl> 0.2586, 0.2539, 0.2712, 0.2920, 0.1966, 0.2945, 0.2960, 0.…
$ proj_score1 <dbl> 1.13, 1.70, 0.99, 1.42, 2.11, 1.44, 1.37, 2.09, 1.18, 2.14…
$ proj_score2 <dbl> 1.75, 0.58, 1.63, 1.01, 0.54, 0.82, 1.06, 0.65, 1.34, 1.06…
$ score1      <dbl> 0, 6, 0, 1, 1, 0, 0, 4, 0, 1, 7, 1, 1, 0, 3, 2, 0, 1, 1, 0…
$ score2      <dbl> 2, 2, 2, 1, 2, 0, 0, 1, 0, 2, 0, 0, 0, 0, 2, 0, 2, 3, 1, 0…
$ xg1         <dbl> 0.23, 1.04, 0.70, 0.33, 1.63, 0.66, 0.45, 3.03, 0.28, 3.10…
$ xg2         <dbl> 1.14, 1.45, 0.68, 1.78, 0.15, 1.16, 1.02, 0.26, 0.88, 1.20…
$ nsxg1       <dbl> 0.24, 1.50, 1.22, 0.48, 2.40, 1.33, 1.19, 3.01, 0.54, 3.10…
$ nsxg2       <dbl> 1.35, 0.32, 1.83, 0.95, 0.53, 0.69, 0.49, 0.30, 0.64, 0.85…
$ adj_score1  <dbl> 0.00, 5.78, 0.00, 1.05, 1.05, 0.00, 0.00, 4.18, 0.00, 1.05…
$ adj_score2  <dbl> 2.10, 2.10, 1.58, 1.05, 2.10, 0.00, 0.00, 1.05, 0.00, 2.10…

Creating a data frame with subset of columns:

Forecasts subset

wc_forecasts_sub = subset(wc_forecasts, spi>82, select = c(team, group, spi, make_final))
glimpse(wc_forecasts_sub)

Rows: 65
Columns: 4
$ team       <chr> "Argentina", "France", "England", "Netherlands", "Portugal"…
$ group      <chr> "C", "D", "B", "A", "H", "G", "E", "E", "C", "D", "B", "A",…
$ spi        <dbl> 89.64860, 88.30043, 87.82131, 83.97533, 87.02373, 93.18946,…
$ make_final <dbl> 1.00000, 1.00000, 0.00000, 0.00000, 0.00000, 0.00000, 0.000…

colnames(wc_forecasts_sub)[4] = "make_final" # this line of the code is to undo the mistake I did; I renamed the 4th column instead of the 3.
colnames(wc_forecasts_sub)[3] = "Soccer Power Index"
glimpse(wc_forecasts_sub)

Rows: 65
Columns: 4
$ team                 <chr> "Argentina", "France", "England", "Netherlands", …
$ group                <chr> "C", "D", "B", "A", "H", "G", "E", "E", "C", "D",…
$ `Soccer Power Index` <dbl> 89.64860, 88.30043, 87.82131, 83.97533, 87.02373,…
$ make_final           <dbl> 1.00000, 1.00000, 0.00000, 0.00000, 0.00000, 0.00…

wc_forecasts_sub %>% 
  group_by(group) %>%
  gt() %>%
  tab_header(
    title = md("The `Soccer Power Index`"),
    subtitle = md("The **best** teams of the `World Cup 2022`"))

team	Soccer Power Index	make_final
The `Soccer Power Index`
The best teams of the `World Cup 2022`
C
Argentina	89.64860	1.00000
Argentina	88.85631	1.00000
Argentina	87.45777	0.64042
Argentina	87.32471	0.24429
Argentina	87.98006	0.24966
Argentina	86.02625	0.12681
Argentina	86.09354	0.09295
Argentina	87.20776	0.15944
D
France	88.30043	1.00000
France	88.41321	1.00000
France	87.71519	0.66212
France	87.52870	0.27046
France	88.57378	0.24482
France	90.14910	0.27224
France	89.25241	0.22312
France	87.70516	0.17187
B
England	87.82131	0.00000
England	87.82131	0.00000
England	87.82131	0.00000
England	87.58969	0.29599
England	86.96946	0.19286
England	85.40758	0.14332
England	86.26103	0.15614
England	85.95712	0.14689
A
Netherlands	83.97533	0.00000
Netherlands	83.97533	0.00000
Netherlands	83.97533	0.00000
Netherlands	84.18963	0.15471
Netherlands	83.97368	0.13043
Netherlands	84.38370	0.12204
Netherlands	86.07414	0.14615
Netherlands	86.01102	0.11908
H
Portugal	87.02373	0.00000
Portugal	87.02373	0.00000
Portugal	87.02373	0.00000
Portugal	87.92472	0.32756
Portugal	85.79833	0.14492
Portugal	87.55187	0.18387
Portugal	87.30309	0.15419
Portugal	87.77456	0.15248
G
Brazil	93.18946	0.00000
Brazil	93.18946	0.00000
Brazil	93.18946	0.00000
Brazil	93.46631	0.50710
Brazil	92.89519	0.42125
Brazil	93.48139	0.39419
Brazil	93.65727	0.36793
Brazil	93.54699	0.32477
E
Spain	88.50675	0.00000
Germany	89.95279	0.00000
Spain	88.50675	0.00000
Germany	89.95279	0.00000
Spain	88.50675	0.00000
Germany	89.95279	0.00000
Spain	88.50675	0.00000
Germany	89.95279	0.00000
Spain	89.20054	0.25872
Germany	89.95279	0.00000
Spain	90.62031	0.23196
Germany	88.86004	0.12928
Spain	91.21880	0.23944
Germany	88.72557	0.06744
Spain	89.50604	0.18857
Germany	88.77370	0.14385
F
Belgium	82.49005	0.06141

Matches subset

wc_matches_sub = subset(wc_matches, select = c(team1, team2, spi1, spi2, adj_score1, adj_score2))
glimpse((wc_matches_sub))

Rows: 64
Columns: 6
$ team1      <chr> "Qatar", "England", "Senegal", "USA", "Argentina", "Denmark…
$ team2      <chr> "Ecuador", "Iran", "Netherlands", "Wales", "Saudi Arabia", …
$ spi1       <dbl> 51.00, 85.96, 73.84, 74.83, 87.21, 80.02, 74.30, 87.71, 75.…
$ spi2       <dbl> 72.74, 62.17, 86.01, 65.58, 56.87, 65.85, 68.28, 60.83, 78.…
$ adj_score1 <dbl> 0.00, 5.78, 0.00, 1.05, 1.05, 0.00, 0.00, 4.18, 0.00, 1.05,…
$ adj_score2 <dbl> 2.10, 2.10, 1.58, 1.05, 2.10, 0.00, 0.00, 1.05, 0.00, 2.10,…

  gt(wc_matches_sub) %>%
    tab_header(
      title = md("`The mathces played during the WC2022`"),
      subtitle = md("`Adjusted scores for each team`")
    )

team1	team2	spi1	spi2	adj_score1	adj_score2
`The mathces played during the WC2022`
`Adjusted scores for each team`
Qatar	Ecuador	51.00	72.74	0.00	2.10
England	Iran	85.96	62.17	5.78	2.10
Senegal	Netherlands	73.84	86.01	0.00	1.58
USA	Wales	74.83	65.58	1.05	1.05
Argentina	Saudi Arabia	87.21	56.87	1.05	2.10
Denmark	Tunisia	80.02	65.85	0.00	0.00
Mexico	Poland	74.30	68.28	0.00	0.00
France	Australia	87.71	60.83	4.18	1.05
Morocco	Croatia	75.62	78.84	0.00	0.00
Germany	Japan	88.77	71.44	1.05	2.10
Spain	Costa Rica	89.51	55.46	6.22	0.00
Belgium	Canada	82.49	71.59	1.05	0.00
Switzerland	Cameroon	77.65	64.16	1.05	0.00
Uruguay	South Korea	80.90	66.12	0.00	0.00
Portugal	Ghana	87.77	58.63	2.90	2.10
Brazil	Serbia	93.55	75.84	2.02	0.00
Wales	Iran	67.64	63.33	0.00	1.26
Qatar	Senegal	48.16	73.23	1.05	2.81
Netherlands	Ecuador	86.07	74.37	1.05	1.05
England	USA	86.26	72.63	0.00	0.00
Tunisia	Australia	66.48	58.65	0.00	1.05
Poland	Saudi Arabia	68.14	59.01	1.80	0.00
France	Denmark	89.25	78.87	2.10	1.05
Argentina	Mexico	86.09	73.48	1.66	0.00
Japan	Costa Rica	71.98	51.99	0.00	1.05
Belgium	Morocco	80.54	74.67	0.00	1.58
Croatia	Canada	78.46	73.84	3.68	1.05
Spain	Germany	91.22	88.73	1.05	1.05
Cameroon	Serbia	63.62	74.55	3.15	3.15
South Korea	Ghana	66.44	60.03	2.10	3.15
Brazil	Switzerland	93.66	77.52	1.05	0.00
Portugal	Uruguay	87.30	79.70	1.58	0.00
Netherlands	Qatar	84.38	48.46	2.10	0.00
Ecuador	Senegal	75.82	73.00	1.05	2.10
Iran	USA	65.77	72.55	0.00	1.05
Wales	England	65.17	85.41	0.00	3.15
Tunisia	France	65.92	90.15	1.05	0.00
Australia	Denmark	58.80	77.68	1.05	0.00
Poland	Argentina	68.95	86.03	0.00	2.10
Saudi Arabia	Mexico	58.41	72.44	1.05	2.10
Canada	Morocco	71.52	74.99	1.05	2.10
Croatia	Belgium	80.87	79.44	0.00	0.00
Costa Rica	Germany	52.90	88.86	2.10	3.72
Japan	Spain	70.43	90.62	2.10	1.05
Ghana	Uruguay	60.50	79.28	0.00	2.10
South Korea	Portugal	66.93	87.55	2.10	1.05
Serbia	Switzerland	74.72	77.00	2.10	3.15
Cameroon	Brazil	64.48	93.48	1.05	0.00
Netherlands	USA	83.97	73.07	2.87	1.05
Argentina	Australia	87.98	59.35	2.10	1.05
France	Poland	88.57	65.77	2.52	1.05
England	Senegal	86.97	75.47	3.15	0.00
Japan	Croatia	73.02	79.14	1.05	1.05
Brazil	South Korea	92.90	69.40	4.20	1.05
Morocco	Spain	74.42	89.20	0.00	0.00
Portugal	Switzerland	85.80	78.51	5.78	1.05
Croatia	Brazil	78.99	93.47	1.05	1.05
Netherlands	Argentina	84.19	87.32	2.10	2.03
Morocco	Portugal	74.45	87.92	1.05	0.00
England	France	87.59	87.53	1.05	2.10
Argentina	Croatia	87.46	79.37	3.15	0.00
France	Morocco	87.72	75.13	1.87	0.00
Croatia	Morocco	77.65	73.92	2.10	1.05
Argentina	France	88.86	88.41	3.15	3.15

ggplot(wc_matches_sub, aes(spi1, adj_score1)) +
  geom_point() +
  geom_smooth()

`geom_smooth()` using method = 'loess' and formula = 'y ~ x'

cor(wc_matches_sub$spi1, wc_matches_sub$adj_score1)

[1] 0.3982406

ggplot(wc_matches_sub, aes(spi2, adj_score2)) +
  geom_point () +
  geom_smooth ()

`geom_smooth()` using method = 'loess' and formula = 'y ~ x'

cor(wc_matches_sub$spi2, wc_matches_sub$adj_score2)

[1] 0.08885976

Conclusion:

After filtering and summarizing the forecasts data to include only the teams that have the Soccer Power Index greater than 82%, I noticed that all these teams were placed in either first or second within their groups, except for groups E & F.
In the matches table above, the team has higher Soccer Power Index score tends to have higher adjusted score. This doesn’t show in the scatter plots above because they are showing the relationship between the spi and the adj_score for all teams while they are not grouped by high spi and low spi.