Lab 7: Multiple Regression and Bootstrapping

Author

Kathleen Strachota

Lab 7: Multiple Regression and Bootstrapping

Load Packages

library(tidyverse)
library(broom)
library(data.table)
library(performance)
library(patchwork)
library(car) 
library(rsample)
library(ggsci)
library(dotwhisker)

Essentials

Load data ‘soccer’ from tidytuesday

find soccer here

soccer <- readr::read_csv('https://raw.githubusercontent.com/rfordatascience/tidytuesday/master/data/2023/2023-04-04/soccer21-22.csv')
Rows: 380 Columns: 22
── Column specification ────────────────────────────────────────────────────────
Delimiter: ","
chr  (6): Date, HomeTeam, AwayTeam, FTR, HTR, Referee
dbl (16): FTHG, FTAG, HTHG, HTAG, HS, AS, HST, AST, HF, AF, HC, AC, HY, AY, ...

ℹ Use `spec()` to retrieve the full column specification for this data.
ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
str(soccer)
spc_tbl_ [380 × 22] (S3: spec_tbl_df/tbl_df/tbl/data.frame)
 $ Date    : chr [1:380] "13/08/2021" "14/08/2021" "14/08/2021" "14/08/2021" ...
 $ HomeTeam: chr [1:380] "Brentford" "Man United" "Burnley" "Chelsea" ...
 $ AwayTeam: chr [1:380] "Arsenal" "Leeds" "Brighton" "Crystal Palace" ...
 $ FTHG    : num [1:380] 2 5 1 3 3 1 3 0 2 1 ...
 $ FTAG    : num [1:380] 0 1 2 0 1 0 2 3 4 0 ...
 $ FTR     : chr [1:380] "H" "H" "A" "H" ...
 $ HTHG    : num [1:380] 1 1 1 2 0 1 2 0 2 0 ...
 $ HTAG    : num [1:380] 0 0 0 0 1 0 0 1 1 0 ...
 $ HTR     : chr [1:380] "H" "H" "H" "H" ...
 $ Referee : chr [1:380] "M Oliver" "P Tierney" "D Coote" "J Moss" ...
 $ HS      : num [1:380] 8 16 14 13 14 9 13 14 17 13 ...
 $ AS      : num [1:380] 22 10 14 4 6 17 11 19 8 18 ...
 $ HST     : num [1:380] 3 8 3 6 6 5 7 3 3 3 ...
 $ AST     : num [1:380] 4 3 8 1 3 3 2 8 9 4 ...
 $ HF      : num [1:380] 12 11 10 15 13 6 18 4 4 11 ...
 $ AF      : num [1:380] 8 9 7 11 15 10 13 14 3 8 ...
 $ HC      : num [1:380] 2 5 7 5 6 5 2 3 7 3 ...
 $ AC      : num [1:380] 5 4 6 2 8 4 4 11 6 11 ...
 $ HY      : num [1:380] 0 1 2 0 2 1 3 1 1 2 ...
 $ AY      : num [1:380] 0 2 1 0 0 2 1 1 0 1 ...
 $ HR      : num [1:380] 0 0 0 0 0 0 0 0 0 0 ...
 $ AR      : num [1:380] 0 0 0 0 0 0 0 0 0 0 ...
 - attr(*, "spec")=
  .. cols(
  ..   Date = col_character(),
  ..   HomeTeam = col_character(),
  ..   AwayTeam = col_character(),
  ..   FTHG = col_double(),
  ..   FTAG = col_double(),
  ..   FTR = col_character(),
  ..   HTHG = col_double(),
  ..   HTAG = col_double(),
  ..   HTR = col_character(),
  ..   Referee = col_character(),
  ..   HS = col_double(),
  ..   AS = col_double(),
  ..   HST = col_double(),
  ..   AST = col_double(),
  ..   HF = col_double(),
  ..   AF = col_double(),
  ..   HC = col_double(),
  ..   AC = col_double(),
  ..   HY = col_double(),
  ..   AY = col_double(),
  ..   HR = col_double(),
  ..   AR = col_double()
  .. )
 - attr(*, "problems")=<externalptr> 

After you load the data, record which variables are categorical and which are numeric.

CATEGORICAL: Date, HomeTeam, AwayTeam, Full time tesult (FTR), Halftime results (HTR), Referee

NUMERIC: Full time home Ggals (FTHG), Full time away goals (FTAG), Halftime home goals (HTHG), Halftime away goals (HTAG), Number of shots taken by the home team (HS), Number of shots taken by the away team (AS), Number of shots on target by the home team (HST), Number of shots on target by the away team (AST), Number of fouls by the home team (HF), Number of fouls by the away team (AF), Number of corners taken by the home team (HC), Number of corners taken by the away team (AC), Number of yellow cards received by the home team (HY), Number of yellow cards received by the away team (AY), Number of red cards received by the home team (HR), Number of red cards received by the away team (AR)

Let’s consider the effects of home team shots (HS), home team (HomeTeam), and home team fouls (HF) on home team goals (fullt time home goals). Build a fully interactive multiple linear regression model. Assess model fit and then model assumptions. How well does the model fit the data? Is the model valid?

soc_1 <- lm(FTHG ~ HS*HomeTeam*HF, soccer)
summary(soc_1)

Call:
lm(formula = FTHG ~ HS * HomeTeam * HF, data = soccer)

Residuals:
    Min      1Q  Median      3Q     Max 
-2.7138 -0.6469 -0.0615  0.5152  3.9047 

Coefficients:
                               Estimate Std. Error t value Pr(>|t|)  
(Intercept)                    0.697352   3.708178   0.188   0.8510  
HS                             0.055902   0.190075   0.294   0.7689  
HomeTeamAston Villa            0.881435   4.996848   0.176   0.8601  
HomeTeamBrentford             -2.533050   5.203392  -0.487   0.6268  
HomeTeamBrighton               3.717225   6.575577   0.565   0.5723  
HomeTeamBurnley               -0.276222   6.857200  -0.040   0.9679  
HomeTeamChelsea                6.034787   4.925771   1.225   0.2215  
HomeTeamCrystal Palace        -2.318297   4.283831  -0.541   0.5888  
HomeTeamEverton                0.319386   4.645420   0.069   0.9452  
HomeTeamLeeds                 -3.624261   5.001969  -0.725   0.4693  
HomeTeamLeicester             -0.792986   4.114557  -0.193   0.8473  
HomeTeamLiverpool              4.175053   5.026751   0.831   0.4069  
HomeTeamMan City             -10.661735   5.330983  -2.000   0.0464 *
HomeTeamMan United             7.326569   4.781359   1.532   0.1265  
HomeTeamNewcastle              0.168113   4.800783   0.035   0.9721  
HomeTeamNorwich                2.094630   5.334412   0.393   0.6948  
HomeTeamSouthampton           -0.310685   4.470009  -0.070   0.9446  
HomeTeamTottenham             -2.750661   4.805536  -0.572   0.5675  
HomeTeamWatford               -2.913636   4.142058  -0.703   0.4823  
HomeTeamWest Ham              -2.027195   4.796066  -0.423   0.6728  
HomeTeamWolves                -1.557139   4.102273  -0.380   0.7045  
HF                            -0.009934   0.365871  -0.027   0.9784  
HS:HomeTeamAston Villa         0.061853   0.328940   0.188   0.8510  
HS:HomeTeamBrentford           0.182702   0.332776   0.549   0.5834  
HS:HomeTeamBrighton           -0.262377   0.425503  -0.617   0.5379  
HS:HomeTeamBurnley             0.031909   0.467206   0.068   0.9456  
HS:HomeTeamChelsea            -0.391915   0.267354  -1.466   0.1437  
HS:HomeTeamCrystal Palace      0.105923   0.251576   0.421   0.6740  
HS:HomeTeamEverton             0.011687   0.286931   0.041   0.9675  
HS:HomeTeamLeeds               0.243965   0.297532   0.820   0.4129  
HS:HomeTeamLeicester           0.146672   0.235410   0.623   0.5337  
HS:HomeTeamLiverpool          -0.109337   0.242931  -0.450   0.6530  
HS:HomeTeamMan City            0.558982   0.261335   2.139   0.0332 *
HS:HomeTeamMan United         -0.414803   0.258899  -1.602   0.1102  
HS:HomeTeamNewcastle          -0.025273   0.307031  -0.082   0.9345  
HS:HomeTeamNorwich            -0.290202   0.363840  -0.798   0.4257  
HS:HomeTeamSouthampton         0.053585   0.266735   0.201   0.8409  
HS:HomeTeamTottenham           0.208991   0.306756   0.681   0.4962  
HS:HomeTeamWatford             0.257706   0.248203   1.038   0.3000  
HS:HomeTeamWest Ham            0.147201   0.312313   0.471   0.6378  
HS:HomeTeamWolves              0.010637   0.248269   0.043   0.9659  
HS:HF                          0.001280   0.019025   0.067   0.9464  
HomeTeamAston Villa:HF        -0.084654   0.465462  -0.182   0.8558  
HomeTeamBrentford:HF           0.221657   0.505506   0.438   0.6613  
HomeTeamBrighton:HF           -0.268913   0.596677  -0.451   0.6525  
HomeTeamBurnley:HF            -0.085970   0.651113  -0.132   0.8950  
HomeTeamChelsea:HF            -0.450434   0.447628  -1.006   0.3151  
HomeTeamCrystal Palace:HF      0.196644   0.417779   0.471   0.6382  
HomeTeamEverton:HF            -0.128741   0.491577  -0.262   0.7936  
HomeTeamLeeds:HF               0.213802   0.446558   0.479   0.6324  
HomeTeamLeicester:HF           0.048414   0.403071   0.120   0.9045  
HomeTeamLiverpool:HF          -0.289467   0.533930  -0.542   0.5881  
HomeTeamMan City:HF            1.036326   0.560155   1.850   0.0653 .
HomeTeamMan United:HF         -0.787337   0.499734  -1.576   0.1162  
HomeTeamNewcastle:HF          -0.073170   0.446549  -0.164   0.8700  
HomeTeamNorwich:HF            -0.254151   0.505784  -0.502   0.6157  
HomeTeamSouthampton:HF         0.024514   0.429623   0.057   0.9545  
HomeTeamTottenham:HF           0.187767   0.454811   0.413   0.6800  
HomeTeamWatford:HF             0.117282   0.411028   0.285   0.7756  
HomeTeamWest Ham:HF            0.451202   0.482874   0.934   0.3508  
HomeTeamWolves:HF              0.302944   0.420129   0.721   0.4714  
HS:HomeTeamAston Villa:HF     -0.005724   0.028970  -0.198   0.8435  
HS:HomeTeamBrentford:HF       -0.018001   0.032204  -0.559   0.5766  
HS:HomeTeamBrighton:HF         0.015377   0.038286   0.402   0.6882  
HS:HomeTeamBurnley:HF          0.002547   0.044694   0.057   0.9546  
HS:HomeTeamChelsea:HF          0.031442   0.024617   1.277   0.2025  
HS:HomeTeamCrystal Palace:HF  -0.008544   0.024429  -0.350   0.7268  
HS:HomeTeamEverton:HF          0.006583   0.032144   0.205   0.8379  
HS:HomeTeamLeeds:HF           -0.017376   0.025839  -0.672   0.5018  
HS:HomeTeamLeicester:HF       -0.010627   0.024129  -0.440   0.6599  
HS:HomeTeamLiverpool:HF        0.006820   0.025865   0.264   0.7922  
HS:HomeTeamMan City:HF        -0.048830   0.028501  -1.713   0.0877 .
HS:HomeTeamMan United:HF       0.046000   0.028069   1.639   0.1023  
HS:HomeTeamNewcastle:HF        0.006266   0.028067   0.223   0.8235  
HS:HomeTeamNorwich:HF          0.027014   0.033845   0.798   0.4254  
HS:HomeTeamSouthampton:HF     -0.006353   0.025158  -0.253   0.8008  
HS:HomeTeamTottenham:HF       -0.011332   0.028415  -0.399   0.6903  
HS:HomeTeamWatford:HF         -0.013823   0.024557  -0.563   0.5739  
HS:HomeTeamWest Ham:HF        -0.030930   0.030862  -1.002   0.3171  
HS:HomeTeamWolves:HF          -0.015949   0.027111  -0.588   0.5568  
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

Residual standard error: 1.16 on 300 degrees of freedom
Multiple R-squared:  0.3949,    Adjusted R-squared:  0.2355 
F-statistic: 2.478 on 79 and 300 DF,  p-value: 1.774e-08

The p-value is 1.0774e-08 which means that at least one of these variables has an effect on home team goals. The adjusted R-Square is 0.2355 which means that approximately 23.55% of the variance is explained by this model.

Checking Assumptions

model_performance(soc_1)
# Indices of model performance

AIC      |     AICc |      BIC |    R2 | R2 (adj.) |  RMSE | Sigma
------------------------------------------------------------------
1263.266 | 1307.843 | 1582.420 | 0.395 |     0.236 | 1.031 | 1.160
check_model(soc_1)
Variable `Component` is not in your data frame :/

Assumptions: 1. Linearity: The assumption regarding linearity has been met because the reference line is pretty flat and horizontal. 2. Normality of Residuals: The residuals are looking pretty normal according to the qq plot. 3. Equal Variance: The reference line appears to be mostly flat and horizontal so I would say that the assumption of homogeneity of variance is mostly met. 4. Colinearity: The VIFs are extremely high which indicates that there is some effect of multicollinearity and thus the assumption is not met. 5. Independence: We don’t know much about the experimental design so it’s hard to say much about the assumption of independence. Since this is a lab assignment, I will continue using this data.

This model does not fit the data very well and multiple assumptions are violated so I would conclude the model is not valid.

Run through a top-down modeling approach to find the best fit model! Be sure to check assumptions after each change and compare performance. What model is the best fit?

Note: Any model that is commented out are models that don’t meet the collinearity assumption

# soc_2 <- lm(FTHG ~ HS*HomeTeam+HF, soccer)
# check_model(soc_2)
# soc_3 <- lm(FTHG ~ HS+HomeTeam*HF, soccer)
# check_model(soc_3)
soc_4 <- lm(FTHG ~ HS+HomeTeam+HF, soccer)
check_model(soc_4)
Variable `Component` is not in your data frame :/

# soc_5 <- lm(FTHG ~ HS*HomeTeam, soccer)
# check_model(soc_5)
soc_6 <- lm(FTHG ~ HS+HomeTeam, soccer)
check_model(soc_6)
Variable `Component` is not in your data frame :/

# soc_7 <- lm(FTHG ~ HS*HF, soccer)
# check_model(soc_7)
soc_8 <- lm(FTHG ~ HS+HF, soccer)
check_model(soc_8)
Variable `Component` is not in your data frame :/

# soc_9 <- lm(FTHG ~ HomeTeam*HF, soccer)
# check_model(soc_9)
soc_10 <- lm(FTHG ~ HomeTeam+HF, soccer)
check_model(soc_10)
Variable `Component` is not in your data frame :/

compare_performance(soc_4, soc_6, soc_8, soc_10, rank = TRUE)
# Comparison of Model Performance Indices

Name   | Model |    R2 | R2 (adj.) |  RMSE | Sigma | AIC weights | AICc weights | BIC weights | Performance-Score
-----------------------------------------------------------------------------------------------------------------
soc_6  |    lm | 0.260 |     0.218 | 1.140 | 1.173 |       0.727 |        0.746 |    9.91e-14 |            85.70%
soc_4  |    lm | 0.260 |     0.216 | 1.140 | 1.174 |       0.270 |        0.243 |    5.13e-15 |            66.25%
soc_8  |    lm | 0.162 |     0.158 | 1.213 | 1.217 |       0.003 |        0.012 |       1.000 |            19.40%
soc_10 |    lm | 0.190 |     0.145 | 1.192 | 1.226 |    2.95e-08 |     3.02e-08 |    4.01e-21 |             8.16%

According to compare-performance(), the model that fits best is soc_6 which is the additive model with the effects of HomeTeam shots and HomeTeam on HomeTeam goals. But it is not the most complex. The most complex model that does not violate assumptions is soc_4 which is a fully additive model with the effects of hometeam shots, hometeam, and hometeam fouls on hometeam goals: \(FTHG = \beta_0 + \beta_1(HS) + \beta_2(HomeTeam) + \beta_3(HF) + \varepsilon\). Since the adjusted \(R^2\) for soc_4 is only slightly lower than that of soc_6, I would probably pick the soc_4 model.

After identifying the best fit model, build the appropriate graph! See our multiple regression tutorial. Next, Build a coef plot for the model. Using patchwork, show me a 2-panel figure with the coef plot and the graph for the model

horrible <- soc_4 %>% 
  augment() %>%
  ggplot(aes(x= HS, y = FTHG, color = HomeTeam))+
  geom_point(aes(size=HF))+
  geom_line(aes(y=.fitted), size=1) +
  theme_classic()
Warning: Using `size` aesthetic for lines was deprecated in ggplot2 3.4.0.
ℹ Please use `linewidth` instead.
test <- dwplot(soc_4) +
  theme_classic() +
  geom_vline(xintercept = 0, linetype=2)

coefs<-tidy(soc_4, quick=FALSE)
ci<-data.table(confint(soc_4), keep.rownames='term')
cidf<-cbind(coefs,ci)
cidf<-cidf[,-6]

cidf<- cidf %>%
  rename("lower"="2.5 %",
         "upper"="97.5 %")
cidf$term=as.factor(cidf$term)

coef.plot <- ggplot(data=cidf, aes(x=estimate, y=term))+
  geom_vline(xintercept = 0, linetype=2)+
  geom_point(size=3)+
  geom_errorbarh(aes(xmax=lower, xmin=upper),height=0.2)+
  theme_classic()

horrible+coef.plot

horrible+test

Depth

Bootstrap the coef plot from Essential #4, above.

set.seed(420)

soccer_intervals<- reg_intervals(FTHG ~ HS+HomeTeam+HF, data=soccer, 
                                   type='percentile',
                                   keep_reps=FALSE)

soccer_intervals
# A tibble: 21 × 6
   term                    .lower .estimate .upper .alpha .method   
   <chr>                    <dbl>     <dbl>  <dbl>  <dbl> <chr>     
 1 HF                     -0.0375  -0.00240 0.0325   0.05 percentile
 2 HS                      0.0450   0.0708  0.0975   0.05 percentile
 3 HomeTeamAston Villa    -0.623    0.0983  0.833    0.05 percentile
 4 HomeTeamBrentford      -0.987   -0.273   0.466    0.05 percentile
 5 HomeTeamBrighton       -1.38    -0.606   0.214    0.05 percentile
 6 HomeTeamBurnley        -1.18    -0.463   0.263    0.05 percentile
 7 HomeTeamChelsea        -0.636    0.175   1.03     0.05 percentile
 8 HomeTeamCrystal Palace -0.669    0.00141 0.728    0.05 percentile
 9 HomeTeamEverton        -0.717    0.00608 0.687    0.05 percentile
10 HomeTeamLeeds          -1.16    -0.535   0.102    0.05 percentile
# ℹ 11 more rows
#plot the results
cleats <-ggplot(data=soccer_intervals, aes(x=.estimate, y=term))+
  geom_vline(xintercept=0, linetype=2)+
  geom_errorbarh(aes(xmin=.lower, xmax=.upper),height=0.2)+
  geom_point(size=3)+
  theme_classic()
cleats

Calculate means and 95% CIs of full time home goals and full time away goals (using bootstrapping). Plot the results and interpret the plot (is there a home advantage or not?)

soccer1 <- soccer
n = nrow(soccer1)

soccer.bs<- 1:1000 %>% #1000 = number of trials / resamples
  map_dfr(
    ~soccer1 %>%
      slice_sample(n=n, replace=TRUE) %>%
      summarize(meanhg=mean(FTHG), meanag = mean(FTAG))) %>%
  mutate(n=n)

calc_CIs<-soccer.bs %>%
  summarize(meanhgboot=mean(meanhg), meanagboot=mean(meanag), CIhg=1.96*sd(meanhg), CIag=1.96*sd(meanag))

calc_CIs
# A tibble: 1 × 4
  meanhgboot meanagboot  CIhg  CIag
       <dbl>      <dbl> <dbl> <dbl>
1       1.51       1.31 0.132 0.126

Plots

CIlong <- calc_CIs %>%
  pivot_longer(meanhgboot:meanagboot, names_to = 'team', values_to = 'goals') %>%
  pivot_longer(CIhg:CIag, names_to = 'CIcat', values_to = 'CI') 

CIlong <- CIlong[c(1,4),]
CIlong$team <- recode_factor(CIlong$team, meanhgboot = "Home", meanagboot = "Away")

CIlong
# A tibble: 2 × 4
  team  goals CIcat    CI
  <fct> <dbl> <chr> <dbl>
1 Home   1.51 CIhg  0.132
2 Away   1.31 CIag  0.126
ggplot(data = CIlong, aes(x = team, y = goals)) +
  geom_point() +
  geom_errorbar(aes(ymin = goals-CI, ymax = goals+CI), width = 0.2) +
  theme_classic() +
  labs(x = "Team", y = "Mean goals scored")

Because the 95% confidence intervals overlap, so there is no statistically significant difference between the mean goals scored by the home team and the away team. So, we conclude that there is not a home advantage.

Add raw data behind your 95% CI plot above!

soccer2 <- soccer1 %>%
  select(FTHG, FTAG) %>%
  pivot_longer(FTHG:FTAG, names_to = 'team', values_to = 'goals')

soccer2$team <- recode_factor(soccer2$team, FTHG = "Home", FTAG = "Away")
  
soccer2
# A tibble: 760 × 2
   team  goals
   <fct> <dbl>
 1 Home      2
 2 Away      0
 3 Home      5
 4 Away      1
 5 Home      1
 6 Away      2
 7 Home      3
 8 Away      0
 9 Home      3
10 Away      1
# ℹ 750 more rows
ggplot(data = CIlong, aes(x = team, y = goals)) +
  geom_point() +
  geom_errorbar(aes(ymin = goals-CI, ymax = goals+CI), width = 0.2) +
  geom_jitter(data = soccer2, aes(x = team, y = goals, color = team), alpha = 0.6, size =0.3) +
  theme_classic() +
  labs(x = "Team", y = "Mean goals scored") +
  scale_color_npg()