## Overview
This is Assignment #1 for DATA 607. For this assignment I chose to use some of the data analyzed in the article “How Our 2022 World Cup Predictions Work”. I found it interesting because it is very difficult to predict who is going to win the World Cup. The entire data set is available on GitHub at
https://raw.githubusercontent.com/tanzil64/Tanzil_Data_607_Assignment01/refs/heads/main/wc_forecasts.csv
In this project we view and summarize the data set and look for a relationship (if any) between FiveThirtyEight’s Soccer Power Index (SPI) ratings and goals scored.
## Loading the data from the website
# Download the World Cup forecast CSV from the GitHub raw URL into a data
# frame, then show its first rows and a per-column summary.
data_soccer <- utils::read.csv(
  file = "https://raw.githubusercontent.com/tanzil64/Tanzil_Data_607_Assignment01/refs/heads/main/wc_forecasts.csv"
)
utils::head(data_soccer)
summary(object = data_soccer)
## forecast_timestamp team group spi
## Length:256 Length:256 Length:256 Min. :48.16
## Class :character Class :character Class :character 1st Qu.:67.78
## Mode :character Mode :character Mode :character Median :74.07
## Mean :74.32
## 3rd Qu.:82.86
## Max. :93.66
## global_o global_d sim_wins sim_ties
## Min. :1.279 Min. :0.2366 Min. :0.0000 Min. :0.0000
## 1st Qu.:1.721 1st Qu.:0.5494 1st Qu.:0.8219 1st Qu.:0.0000
## Median :1.963 Median :0.6516 Median :1.0000 Median :0.8430
## Mean :2.109 Mean :0.6774 Mean :1.1507 Mean :0.6985
## 3rd Qu.:2.497 3rd Qu.:0.7980 3rd Qu.:2.0000 3rd Qu.:1.0000
## Max. :3.428 Max. :1.3164 Max. :2.8129 Max. :2.3153
## sim_losses sim_goal_diff goals_scored goals_against
## Min. :0.0000 Min. :-10.089 Min. : 0.6043 Min. : 0.3131
## 1st Qu.:0.8716 1st Qu.: -2.000 1st Qu.: 2.0292 1st Qu.: 2.0000
## Median :1.0000 Median : 0.000 Median : 3.8747 Median : 3.0000
## Mean :1.1507 Mean : 0.000 Mean : 3.7556 Mean : 3.7556
## 3rd Qu.:1.6129 3rd Qu.: 2.000 3rd Qu.: 5.0000 3rd Qu.: 4.7572
## Max. :3.0000 Max. : 8.542 Max. :10.6056 Max. :11.3878
## group_1 group_2 group_3 group_4
## Min. :0.0000 Min. :0.00000 Min. :0.00000 Min. :0.0000
## 1st Qu.:0.0000 1st Qu.:0.00000 1st Qu.:0.00000 1st Qu.:0.0000
## Median :0.0000 Median :0.04063 Median :0.03056 Median :0.0000
## Mean :0.2500 Mean :0.25000 Mean :0.25000 Mean :0.2500
## 3rd Qu.:0.3714 3rd Qu.:0.30622 3rd Qu.:0.30750 3rd Qu.:0.3572
## Max. :1.0000 Max. :1.00000 Max. :1.00000 Max. :1.0000
## make_round_of_16 make_quarters make_semis make_final
## Min. :0.0000 Min. :0.00000 Min. :0.000000 Min. :0.00000
## 1st Qu.:0.0000 1st Qu.:0.00000 1st Qu.:0.000000 1st Qu.:0.00000
## Median :0.4717 Median :0.06148 Median :0.002385 Median :0.00000
## Mean :0.5000 Mean :0.25000 Mean :0.125000 Mean :0.06250
## 3rd Qu.:1.0000 3rd Qu.:0.35581 3rd Qu.:0.118722 3rd Qu.:0.04166
## Max. :1.0000 Max. :1.00000 Max. :1.000000 Max. :1.00000
## win_league timestamp
## Min. :0.00000 Length:256
## 1st Qu.:0.00000 Class :character
## Median :0.00000 Mode :character
## Mean :0.03125
## 3rd Qu.:0.01428
## Max. :1.00000
# Compact column-by-column overview (name, type, first values) of the data.
# glimpse() is namespace-qualified because no library(dplyr) call is visible
# in this document; the qualified call works whether or not dplyr is attached.
dplyr::glimpse(data_soccer)
## Rows: 256
## Columns: 22
## $ forecast_timestamp <chr> "2022-12-18 17:56:03 UTC", "2022-12-18 17:56:03 UTC…
## $ team <chr> "Argentina", "France", "Morocco", "Croatia", "Engla…
## $ group <chr> "C", "D", "F", "F", "B", "A", "H", "G", "E", "A", "…
## $ spi <dbl> 89.64860, 88.30043, 73.16416, 78.82038, 87.82131, 8…
## $ global_o <dbl> 2.83610, 2.96765, 1.74313, 2.20264, 2.71564, 2.5271…
## $ global_d <dbl> 0.39397, 0.54381, 0.53433, 0.60290, 0.44261, 0.5494…
## $ sim_wins <dbl> 2, 2, 2, 1, 2, 2, 2, 2, 2, 2, 2, 2, 1, 1, 1, 1, 1, …
## $ sim_ties <dbl> 0, 0, 1, 2, 1, 1, 0, 0, 0, 0, 0, 0, 2, 1, 1, 1, 1, …
## $ sim_losses <dbl> 1, 1, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, …
## $ sim_goal_diff <dbl> 3, 3, 3, 3, 7, 4, 2, 2, 1, 1, 1, -1, 1, 6, 0, 0, 1,…
## $ goals_scored <dbl> 5, 6, 4, 4, 9, 5, 6, 3, 4, 5, 4, 3, 2, 9, 4, 2, 6, …
## $ goals_against <dbl> 2, 3, 1, 1, 2, 1, 4, 1, 3, 4, 3, 4, 1, 3, 4, 2, 5, …
## $ group_1 <dbl> 1, 1, 1, 0, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ group_2 <dbl> 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 0, …
## $ group_3 <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, …
## $ group_4 <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ make_round_of_16 <dbl> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, …
## $ make_quarters <dbl> 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ make_semis <dbl> 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ make_final <dbl> 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ win_league <dbl> 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ timestamp <chr> "2022-12-18 17:56:44 UTC", "2022-12-18 17:56:44 UTC…
# Keep only the columns used in the plot below: team, group, SPI rating and
# goals scored.
# select() is called directly and namespace-qualified, so this step needs
# neither the %>% operator nor an attached dplyr (no library() call is
# visible in this document).
sub_soccer <- dplyr::select(data_soccer, team, group, spi, goals_scored)
head(sub_soccer)
## Graphical Display
# Scatter plot of goals scored against SPI rating, one point per row of
# sub_soccer and colored by team, with a smoothed trend line on top.
# ggplot2 functions are namespace-qualified because no library(ggplot2) call
# is visible in this document.
ggplot2::ggplot(sub_soccer, ggplot2::aes(x = spi, y = goals_scored)) +
  ggplot2::geom_point(ggplot2::aes(color = team)) +
  # Defaults are kept on purpose: ggplot2 picks the smoother and reports its
  # choice in a message.
  ggplot2::geom_smooth()
## `geom_smooth()` using method = 'loess' and formula = 'y ~ x'
## Conclusion
From the graph we can see a positive, roughly linear relationship between FiveThirtyEight’s Soccer Power Index (SPI) ratings and the goals scored by the teams: teams with higher SPI ratings tend to score more goals.