## Overview

This is Assignment #1 for DATA 607. For this assignment I chose to use some of the data analyzed in the article “How Our 2022 World Cup Predictions Work”. I found it interesting because it is very difficult to predict who is going to win the World Cup. The entire data set is available on GitHub at

https://raw.githubusercontent.com/tanzil64/Tanzil_Data_607_Assignment01/refs/heads/main/wc_forecasts.csv

In this project we view and summarize the data set and look for a relationship (if any) between FiveThirtyEight’s Soccer Power Index (SPI) ratings and goals scored.

## Loading data from the website

# Read the World Cup forecast CSV directly from its raw GitHub URL into a
# data frame.
data_soccer <- read.csv(
  file = "https://raw.githubusercontent.com/tanzil64/Tanzil_Data_607_Assignment01/refs/heads/main/wc_forecasts.csv"
)


# Show the first six rows (the default row count for head()).
head(data_soccer, n = 6L)
# Print a per-column summary: length/class/mode for character columns and
# min/quartiles/mean/max for numeric columns.
summary(object = data_soccer)
##  forecast_timestamp     team              group                spi       
##  Length:256         Length:256         Length:256         Min.   :48.16  
##  Class :character   Class :character   Class :character   1st Qu.:67.78  
##  Mode  :character   Mode  :character   Mode  :character   Median :74.07  
##                                                           Mean   :74.32  
##                                                           3rd Qu.:82.86  
##                                                           Max.   :93.66  
##     global_o        global_d         sim_wins         sim_ties     
##  Min.   :1.279   Min.   :0.2366   Min.   :0.0000   Min.   :0.0000  
##  1st Qu.:1.721   1st Qu.:0.5494   1st Qu.:0.8219   1st Qu.:0.0000  
##  Median :1.963   Median :0.6516   Median :1.0000   Median :0.8430  
##  Mean   :2.109   Mean   :0.6774   Mean   :1.1507   Mean   :0.6985  
##  3rd Qu.:2.497   3rd Qu.:0.7980   3rd Qu.:2.0000   3rd Qu.:1.0000  
##  Max.   :3.428   Max.   :1.3164   Max.   :2.8129   Max.   :2.3153  
##    sim_losses     sim_goal_diff      goals_scored     goals_against    
##  Min.   :0.0000   Min.   :-10.089   Min.   : 0.6043   Min.   : 0.3131  
##  1st Qu.:0.8716   1st Qu.: -2.000   1st Qu.: 2.0292   1st Qu.: 2.0000  
##  Median :1.0000   Median :  0.000   Median : 3.8747   Median : 3.0000  
##  Mean   :1.1507   Mean   :  0.000   Mean   : 3.7556   Mean   : 3.7556  
##  3rd Qu.:1.6129   3rd Qu.:  2.000   3rd Qu.: 5.0000   3rd Qu.: 4.7572  
##  Max.   :3.0000   Max.   :  8.542   Max.   :10.6056   Max.   :11.3878  
##     group_1          group_2           group_3           group_4      
##  Min.   :0.0000   Min.   :0.00000   Min.   :0.00000   Min.   :0.0000  
##  1st Qu.:0.0000   1st Qu.:0.00000   1st Qu.:0.00000   1st Qu.:0.0000  
##  Median :0.0000   Median :0.04063   Median :0.03056   Median :0.0000  
##  Mean   :0.2500   Mean   :0.25000   Mean   :0.25000   Mean   :0.2500  
##  3rd Qu.:0.3714   3rd Qu.:0.30622   3rd Qu.:0.30750   3rd Qu.:0.3572  
##  Max.   :1.0000   Max.   :1.00000   Max.   :1.00000   Max.   :1.0000  
##  make_round_of_16 make_quarters       make_semis         make_final     
##  Min.   :0.0000   Min.   :0.00000   Min.   :0.000000   Min.   :0.00000  
##  1st Qu.:0.0000   1st Qu.:0.00000   1st Qu.:0.000000   1st Qu.:0.00000  
##  Median :0.4717   Median :0.06148   Median :0.002385   Median :0.00000  
##  Mean   :0.5000   Mean   :0.25000   Mean   :0.125000   Mean   :0.06250  
##  3rd Qu.:1.0000   3rd Qu.:0.35581   3rd Qu.:0.118722   3rd Qu.:0.04166  
##  Max.   :1.0000   Max.   :1.00000   Max.   :1.000000   Max.   :1.00000  
##    win_league       timestamp        
##  Min.   :0.00000   Length:256        
##  1st Qu.:0.00000   Class :character  
##  Median :0.00000   Mode  :character  
##  Mean   :0.03125                     
##  3rd Qu.:0.01428                     
##  Max.   :1.00000
# Compact transposed overview: one line per column with its type and first
# few values. Namespace-qualified so the call reads unambiguously as dplyr's.
dplyr::glimpse(data_soccer)
## Rows: 256
## Columns: 22
## $ forecast_timestamp <chr> "2022-12-18 17:56:03 UTC", "2022-12-18 17:56:03 UTC…
## $ team               <chr> "Argentina", "France", "Morocco", "Croatia", "Engla…
## $ group              <chr> "C", "D", "F", "F", "B", "A", "H", "G", "E", "A", "…
## $ spi                <dbl> 89.64860, 88.30043, 73.16416, 78.82038, 87.82131, 8…
## $ global_o           <dbl> 2.83610, 2.96765, 1.74313, 2.20264, 2.71564, 2.5271…
## $ global_d           <dbl> 0.39397, 0.54381, 0.53433, 0.60290, 0.44261, 0.5494…
## $ sim_wins           <dbl> 2, 2, 2, 1, 2, 2, 2, 2, 2, 2, 2, 2, 1, 1, 1, 1, 1, …
## $ sim_ties           <dbl> 0, 0, 1, 2, 1, 1, 0, 0, 0, 0, 0, 0, 2, 1, 1, 1, 1, …
## $ sim_losses         <dbl> 1, 1, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, …
## $ sim_goal_diff      <dbl> 3, 3, 3, 3, 7, 4, 2, 2, 1, 1, 1, -1, 1, 6, 0, 0, 1,…
## $ goals_scored       <dbl> 5, 6, 4, 4, 9, 5, 6, 3, 4, 5, 4, 3, 2, 9, 4, 2, 6, …
## $ goals_against      <dbl> 2, 3, 1, 1, 2, 1, 4, 1, 3, 4, 3, 4, 1, 3, 4, 2, 5, …
## $ group_1            <dbl> 1, 1, 1, 0, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ group_2            <dbl> 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 0, …
## $ group_3            <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, …
## $ group_4            <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ make_round_of_16   <dbl> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, …
## $ make_quarters      <dbl> 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ make_semis         <dbl> 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ make_final         <dbl> 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ win_league         <dbl> 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ timestamp          <chr> "2022-12-18 17:56:44 UTC", "2022-12-18 17:56:44 UTC…
# Keep only the columns used in the SPI vs. goals-scored comparison.
sub_soccer <- dplyr::select(data_soccer, team, group, spi, goals_scored)


# Show the first six rows of the reduced table.
head(sub_soccer, n = 6L)

## Graphic Display

# Scatter plot of goals scored against SPI, with points coloured by team and
# the default geom_smooth() trend line drawn over them.
ggplot2::ggplot(
  data = sub_soccer,
  mapping = ggplot2::aes(x = spi, y = goals_scored)
) +
  ggplot2::geom_point(mapping = ggplot2::aes(color = team)) +
  ggplot2::geom_smooth()
## `geom_smooth()` using method = 'loess' and formula = 'y ~ x'

## Conclusion

From the graph, we can see a positive linear relationship between FiveThirtyEight’s Soccer Power Index (SPI) ratings and the goals scored by the teams.