Lab 7: Multiple Regression and Bootstrapping

Author

Maya Frey

Lab 7: Multiple Regression and Bootstrapping

load packages

library(tidyverse)
library(broom)
library(data.table)
library(performance)
library(patchwork)
library(car) 
library(rsample)

Essentials

1.) Load data ‘soccer’ from tidytuesday

soccer <- readr::read_csv('https://raw.githubusercontent.com/rfordatascience/tidytuesday/master/data/2023/2023-04-04/soccer21-22.csv')
Rows: 380 Columns: 22
── Column specification ────────────────────────────────────────────────────────
Delimiter: ","
chr  (6): Date, HomeTeam, AwayTeam, FTR, HTR, Referee
dbl (16): FTHG, FTAG, HTHG, HTAG, HS, AS, HST, AST, HF, AF, HC, AC, HY, AY, ...

ℹ Use `spec()` to retrieve the full column specification for this data.
ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
str(soccer)
spc_tbl_ [380 × 22] (S3: spec_tbl_df/tbl_df/tbl/data.frame)
 $ Date    : chr [1:380] "13/08/2021" "14/08/2021" "14/08/2021" "14/08/2021" ...
 $ HomeTeam: chr [1:380] "Brentford" "Man United" "Burnley" "Chelsea" ...
 $ AwayTeam: chr [1:380] "Arsenal" "Leeds" "Brighton" "Crystal Palace" ...
 $ FTHG    : num [1:380] 2 5 1 3 3 1 3 0 2 1 ...
 $ FTAG    : num [1:380] 0 1 2 0 1 0 2 3 4 0 ...
 $ FTR     : chr [1:380] "H" "H" "A" "H" ...
 $ HTHG    : num [1:380] 1 1 1 2 0 1 2 0 2 0 ...
 $ HTAG    : num [1:380] 0 0 0 0 1 0 0 1 1 0 ...
 $ HTR     : chr [1:380] "H" "H" "H" "H" ...
 $ Referee : chr [1:380] "M Oliver" "P Tierney" "D Coote" "J Moss" ...
 $ HS      : num [1:380] 8 16 14 13 14 9 13 14 17 13 ...
 $ AS      : num [1:380] 22 10 14 4 6 17 11 19 8 18 ...
 $ HST     : num [1:380] 3 8 3 6 6 5 7 3 3 3 ...
 $ AST     : num [1:380] 4 3 8 1 3 3 2 8 9 4 ...
 $ HF      : num [1:380] 12 11 10 15 13 6 18 4 4 11 ...
 $ AF      : num [1:380] 8 9 7 11 15 10 13 14 3 8 ...
 $ HC      : num [1:380] 2 5 7 5 6 5 2 3 7 3 ...
 $ AC      : num [1:380] 5 4 6 2 8 4 4 11 6 11 ...
 $ HY      : num [1:380] 0 1 2 0 2 1 3 1 1 2 ...
 $ AY      : num [1:380] 0 2 1 0 0 2 1 1 0 1 ...
 $ HR      : num [1:380] 0 0 0 0 0 0 0 0 0 0 ...
 $ AR      : num [1:380] 0 0 0 0 0 0 0 0 0 0 ...
 - attr(*, "spec")=
  .. cols(
  ..   Date = col_character(),
  ..   HomeTeam = col_character(),
  ..   AwayTeam = col_character(),
  ..   FTHG = col_double(),
  ..   FTAG = col_double(),
  ..   FTR = col_character(),
  ..   HTHG = col_double(),
  ..   HTAG = col_double(),
  ..   HTR = col_character(),
  ..   Referee = col_character(),
  ..   HS = col_double(),
  ..   AS = col_double(),
  ..   HST = col_double(),
  ..   AST = col_double(),
  ..   HF = col_double(),
  ..   AF = col_double(),
  ..   HC = col_double(),
  ..   AC = col_double(),
  ..   HY = col_double(),
  ..   AY = col_double(),
  ..   HR = col_double(),
  ..   AR = col_double()
  .. )
 - attr(*, "problems")=<externalptr> 

find soccer here

After you load the data, record which variables are categorical and which are numeric. Categorical: Date, HomeTeam, AwayTeam, FTR, HTR, Referee Numerical: FTHG, FTAG, HTHG, HTAG, HS, AS, HST, AST, HF, AF, HC, AC, HY, AY, HR, AR, ### 2.) Let’s consider the effects of home team shots (HS), home team (HomeTeam), and home team fouls (HF) on home team goals (fullt time home goals). Build a fully interactive multiple linear regression model. Assess model fit and then model assumptions. How well does the model fit the data? Is the model valid?

drop_na(soccer)
# A tibble: 380 × 22
   Date  HomeT…¹ AwayT…²  FTHG  FTAG FTR    HTHG  HTAG HTR   Referee    HS    AS
   <chr> <chr>   <chr>   <dbl> <dbl> <chr> <dbl> <dbl> <chr> <chr>   <dbl> <dbl>
 1 13/0… Brentf… Arsenal     2     0 H         1     0 H     M Oliv…     8    22
 2 14/0… Man Un… Leeds       5     1 H         1     0 H     P Tier…    16    10
 3 14/0… Burnley Bright…     1     2 A         1     0 H     D Coote    14    14
 4 14/0… Chelsea Crysta…     3     0 H         2     0 H     J Moss     13     4
 5 14/0… Everton Southa…     3     1 H         0     1 A     A Madl…    14     6
 6 14/0… Leices… Wolves      1     0 H         1     0 H     C Paws…     9    17
 7 14/0… Watford Aston …     3     2 H         2     0 H     M Dean     13    11
 8 14/0… Norwich Liverp…     0     3 A         0     1 A     A Marr…    14    19
 9 15/0… Newcas… West H…     2     4 A         2     1 H     M Atki…    17     8
10 15/0… Totten… Man Ci…     1     0 H         0     0 D     A Tayl…    13    18
# … with 370 more rows, 10 more variables: HST <dbl>, AST <dbl>, HF <dbl>,
#   AF <dbl>, HC <dbl>, AC <dbl>, HY <dbl>, AY <dbl>, HR <dbl>, AR <dbl>, and
#   abbreviated variable names ¹​HomeTeam, ²​AwayTeam
lms <- lm(FTHG ~ HS * HomeTeam * HF, data = soccer)
summary(lms)

Call:
lm(formula = FTHG ~ HS * HomeTeam * HF, data = soccer)

Residuals:
    Min      1Q  Median      3Q     Max 
-2.7138 -0.6469 -0.0615  0.5152  3.9047 

Coefficients:
                               Estimate Std. Error t value Pr(>|t|)  
(Intercept)                    0.697352   3.708178   0.188   0.8510  
HS                             0.055902   0.190075   0.294   0.7689  
HomeTeamAston Villa            0.881435   4.996848   0.176   0.8601  
HomeTeamBrentford             -2.533050   5.203392  -0.487   0.6268  
HomeTeamBrighton               3.717225   6.575577   0.565   0.5723  
HomeTeamBurnley               -0.276222   6.857200  -0.040   0.9679  
HomeTeamChelsea                6.034787   4.925771   1.225   0.2215  
HomeTeamCrystal Palace        -2.318297   4.283831  -0.541   0.5888  
HomeTeamEverton                0.319386   4.645420   0.069   0.9452  
HomeTeamLeeds                 -3.624261   5.001969  -0.725   0.4693  
HomeTeamLeicester             -0.792986   4.114557  -0.193   0.8473  
HomeTeamLiverpool              4.175053   5.026751   0.831   0.4069  
HomeTeamMan City             -10.661735   5.330983  -2.000   0.0464 *
HomeTeamMan United             7.326569   4.781359   1.532   0.1265  
HomeTeamNewcastle              0.168113   4.800783   0.035   0.9721  
HomeTeamNorwich                2.094630   5.334412   0.393   0.6948  
HomeTeamSouthampton           -0.310685   4.470009  -0.070   0.9446  
HomeTeamTottenham             -2.750661   4.805536  -0.572   0.5675  
HomeTeamWatford               -2.913636   4.142058  -0.703   0.4823  
HomeTeamWest Ham              -2.027195   4.796066  -0.423   0.6728  
HomeTeamWolves                -1.557139   4.102273  -0.380   0.7045  
HF                            -0.009934   0.365871  -0.027   0.9784  
HS:HomeTeamAston Villa         0.061853   0.328940   0.188   0.8510  
HS:HomeTeamBrentford           0.182702   0.332776   0.549   0.5834  
HS:HomeTeamBrighton           -0.262377   0.425503  -0.617   0.5379  
HS:HomeTeamBurnley             0.031909   0.467206   0.068   0.9456  
HS:HomeTeamChelsea            -0.391915   0.267354  -1.466   0.1437  
HS:HomeTeamCrystal Palace      0.105923   0.251576   0.421   0.6740  
HS:HomeTeamEverton             0.011687   0.286931   0.041   0.9675  
HS:HomeTeamLeeds               0.243965   0.297532   0.820   0.4129  
HS:HomeTeamLeicester           0.146672   0.235410   0.623   0.5337  
HS:HomeTeamLiverpool          -0.109337   0.242931  -0.450   0.6530  
HS:HomeTeamMan City            0.558982   0.261335   2.139   0.0332 *
HS:HomeTeamMan United         -0.414803   0.258899  -1.602   0.1102  
HS:HomeTeamNewcastle          -0.025273   0.307031  -0.082   0.9345  
HS:HomeTeamNorwich            -0.290202   0.363840  -0.798   0.4257  
HS:HomeTeamSouthampton         0.053585   0.266735   0.201   0.8409  
HS:HomeTeamTottenham           0.208991   0.306756   0.681   0.4962  
HS:HomeTeamWatford             0.257706   0.248203   1.038   0.3000  
HS:HomeTeamWest Ham            0.147201   0.312313   0.471   0.6378  
HS:HomeTeamWolves              0.010637   0.248269   0.043   0.9659  
HS:HF                          0.001280   0.019025   0.067   0.9464  
HomeTeamAston Villa:HF        -0.084654   0.465462  -0.182   0.8558  
HomeTeamBrentford:HF           0.221657   0.505506   0.438   0.6613  
HomeTeamBrighton:HF           -0.268913   0.596677  -0.451   0.6525  
HomeTeamBurnley:HF            -0.085970   0.651113  -0.132   0.8950  
HomeTeamChelsea:HF            -0.450434   0.447628  -1.006   0.3151  
HomeTeamCrystal Palace:HF      0.196644   0.417779   0.471   0.6382  
HomeTeamEverton:HF            -0.128741   0.491577  -0.262   0.7936  
HomeTeamLeeds:HF               0.213802   0.446558   0.479   0.6324  
HomeTeamLeicester:HF           0.048414   0.403071   0.120   0.9045  
HomeTeamLiverpool:HF          -0.289467   0.533930  -0.542   0.5881  
HomeTeamMan City:HF            1.036326   0.560155   1.850   0.0653 .
HomeTeamMan United:HF         -0.787337   0.499734  -1.576   0.1162  
HomeTeamNewcastle:HF          -0.073170   0.446549  -0.164   0.8700  
HomeTeamNorwich:HF            -0.254151   0.505784  -0.502   0.6157  
HomeTeamSouthampton:HF         0.024514   0.429623   0.057   0.9545  
HomeTeamTottenham:HF           0.187767   0.454811   0.413   0.6800  
HomeTeamWatford:HF             0.117282   0.411028   0.285   0.7756  
HomeTeamWest Ham:HF            0.451202   0.482874   0.934   0.3508  
HomeTeamWolves:HF              0.302944   0.420129   0.721   0.4714  
HS:HomeTeamAston Villa:HF     -0.005724   0.028970  -0.198   0.8435  
HS:HomeTeamBrentford:HF       -0.018001   0.032204  -0.559   0.5766  
HS:HomeTeamBrighton:HF         0.015377   0.038286   0.402   0.6882  
HS:HomeTeamBurnley:HF          0.002547   0.044694   0.057   0.9546  
HS:HomeTeamChelsea:HF          0.031442   0.024617   1.277   0.2025  
HS:HomeTeamCrystal Palace:HF  -0.008544   0.024429  -0.350   0.7268  
HS:HomeTeamEverton:HF          0.006583   0.032144   0.205   0.8379  
HS:HomeTeamLeeds:HF           -0.017376   0.025839  -0.672   0.5018  
HS:HomeTeamLeicester:HF       -0.010627   0.024129  -0.440   0.6599  
HS:HomeTeamLiverpool:HF        0.006820   0.025865   0.264   0.7922  
HS:HomeTeamMan City:HF        -0.048830   0.028501  -1.713   0.0877 .
HS:HomeTeamMan United:HF       0.046000   0.028069   1.639   0.1023  
HS:HomeTeamNewcastle:HF        0.006266   0.028067   0.223   0.8235  
HS:HomeTeamNorwich:HF          0.027014   0.033845   0.798   0.4254  
HS:HomeTeamSouthampton:HF     -0.006353   0.025158  -0.253   0.8008  
HS:HomeTeamTottenham:HF       -0.011332   0.028415  -0.399   0.6903  
HS:HomeTeamWatford:HF         -0.013823   0.024557  -0.563   0.5739  
HS:HomeTeamWest Ham:HF        -0.030930   0.030862  -1.002   0.3171  
HS:HomeTeamWolves:HF          -0.015949   0.027111  -0.588   0.5568  
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

Residual standard error: 1.16 on 300 degrees of freedom
Multiple R-squared:  0.3949,    Adjusted R-squared:  0.2355 
F-statistic: 2.478 on 79 and 300 DF,  p-value: 1.774e-08
model_performance(lms)
# Indices of model performance

AIC      |     AICc |      BIC |    R2 | R2 (adj.) |  RMSE | Sigma
------------------------------------------------------------------
1263.266 | 1307.843 | 1582.420 | 0.395 |     0.236 | 1.031 | 1.160
check_model(lms)
Variable `Component` is not in your data frame :/

The adjusted R2 is 0.236, so the model explains less than 25% of the variation in the data. This is consistent with the AIC and BIC, which are both above 1000 and very high. This means that the model does not fit the data well at all. For the assumptions, linearity and normality are met, but homogeneity of variance does not appear to be met and collinearity is much higher than it should be so the assumption is not met. We do not know how the data was collected, so we do not know about independence of observations. Based on this assessment, the model is not valid as multiple assumptions are not met and the model does not fit the data.

3.) Run through a top-down modeling approach to find the best fit model! Be sure to check assumptions after each change and compare performance. What model is the best fit?

# Model 1
lms1 <- lm(FTHG ~ HS * HomeTeam + HF, data = soccer)
#summary(lms1)
#model_performance(lms1)
#check_model(lms1)
# Model 2
lms2 <- lm(FTHG ~ HS + HomeTeam * HF, data = soccer)
#summary(lms2)
#model_performance(lms2)
#check_model(lms2)
# Model 3 
lms3 <- lm(FTHG ~ HS + HomeTeam + HF, data = soccer)
#summary(lms3)
model_performance(lms3)
# Indices of model performance

AIC      |     AICc |      BIC |    R2 | R2 (adj.) |  RMSE | Sigma
------------------------------------------------------------------
1223.866 | 1226.967 | 1314.490 | 0.260 |     0.216 | 1.140 | 1.174
check_model(lms3)
Variable `Component` is not in your data frame :/

# Model 4
lms4 <- lm(FTHG ~ HS * HomeTeam, data = soccer)
#summary(lms4)
#model_performance(lms4)
#check_model(lms4)
# Model 5
lms5 <- lm(FTHG ~ HS * HF, data = soccer)
#summary(lms5)
#model_performance(lms5)
#check_model(lms5)
# Model 6 
lms6 <- lm(FTHG ~ HomeTeam * HF, data = soccer)
#summary(lms6)
#model_performance(lms6)
#check_model(lms6)
# Model 7 
lms7 <- lm(FTHG ~ HS + HomeTeam, data = soccer)
#summary(lms7)
model_performance(lms7)
# Indices of model performance

AIC      |     AICc |      BIC |    R2 | R2 (adj.) |  RMSE | Sigma
------------------------------------------------------------------
1221.886 | 1224.721 | 1308.570 | 0.260 |     0.218 | 1.140 | 1.173
check_model(lms7)
Variable `Component` is not in your data frame :/

# Model 8 
lms8 <- lm(FTHG ~ HS + HF, data = soccer)
#summary(lms8)
model_performance(lms8)
# Indices of model performance

AIC      |     AICc |      BIC |    R2 | R2 (adj.) |  RMSE | Sigma
------------------------------------------------------------------
1232.923 | 1233.030 | 1248.684 | 0.162 |     0.158 | 1.213 | 1.217
check_model(lms8)
Variable `Component` is not in your data frame :/

# Model 9 
lms9 <- lm(FTHG ~ HomeTeam + HF, data = soccer)
#summary(lms9)
model_performance(lms9)
# Indices of model performance

AIC      |     AICc |      BIC |    R2 | R2 (adj.) |  RMSE | Sigma
------------------------------------------------------------------
1255.929 | 1258.764 | 1342.613 | 0.190 |     0.145 | 1.192 | 1.226
check_model(lms9)
Variable `Component` is not in your data frame :/

# Compare models
compare_performance(lms3, lms7, lms8, lms9, rank = TRUE)
# Comparison of Model Performance Indices

Name | Model |    R2 | R2 (adj.) |  RMSE | Sigma | AIC weights | AICc weights | BIC weights | Performance-Score
---------------------------------------------------------------------------------------------------------------
lms7 |    lm | 0.260 |     0.218 | 1.140 | 1.173 |       0.727 |        0.746 |    9.91e-14 |            85.70%
lms3 |    lm | 0.260 |     0.216 | 1.140 | 1.174 |       0.270 |        0.243 |    5.13e-15 |            66.25%
lms8 |    lm | 0.162 |     0.158 | 1.213 | 1.217 |       0.003 |        0.012 |       1.000 |            19.40%
lms9 |    lm | 0.190 |     0.145 | 1.192 | 1.226 |    2.95e-08 |     3.02e-08 |    4.01e-21 |             8.16%

The model that is the best fit is lms7, which is HS + HomeTeam. However, it is not the most complex model with a relatively good fit, and lms3 (HS + HomeTeam + HF) fits the data only slightly worse and is more complex. The more complex model, that has a very similar fit to the less complex model, is the one we should use.

4.) After identifying the best fit model, build the appropriate graph! See our multiple regression tutorial. Next, Build a coef plot for the model. Using patchwork, show me a 2-panel figure with the coef plot and the graph for the model

# Build appropriate graph
lms3g <- lms3 %>%
  augment() %>%
  ggplot(aes(x = HS, y = FTHG, color = HomeTeam)) +
  geom_point(aes(size = HF)) +
  geom_point() +
  geom_line(aes(y = .fitted), size = 1) +
  theme_bw()
Warning: Using `size` aesthetic for lines was deprecated in ggplot2 3.4.0.
ℹ Please use `linewidth` instead.
lms3g

# Build a coef plot
lms3_coef <- tidy(lms3, quick=FALSE)
lms3_ci <- data.table(confint(lms3), keep.rownames='term')
lms3_df <- cbind(lms3_coef, lms3_ci)
lms3_df<-lms3_df [,-6]
lms3_df <- lms3_df %>%
  rename("lower"="2.5 %",
         "upper"="97.5 %")
lms3_df$term <- as.factor(lms3_df$term)

lms3g1 <- ggplot(data = lms3_df, aes(x = estimate, y = term)) +
  geom_vline(xintercept = 0, linetype = 2) +
  geom_point(size = 3)+
  geom_errorbarh(aes(xmax = lower, xmin = upper),height=0.2) +
  theme_bw()
lms3g1

lms3g + lms3g1

Depth

1.) Bootstrap the coef plot from Essential #4, above.

soccer_intervals <- reg_intervals(FTHG ~ HS + HomeTeam + HF, data = soccer, 
                                   type='percentile',
                                   keep_reps=FALSE)
# Plot results
soccleats <- ggplot(data = soccer_intervals, aes(x = .estimate, y = term)) +
  geom_vline(xintercept = 0, linetype = 2) +
  geom_errorbarh(aes(xmin = .lower, xmax = .upper),height = 0.2) +
  geom_point(size = 3) +
  theme_classic()
soccleats

2.) Calculate means and 95% CIs of full time home goals and full time away goals (using bootstrapping). Plot the results and interpret the plot (is there a home advantage or not?)

# Calculate means and CIs
soccer1 <- soccer
n = nrow(soccer1)
soccer_bs <- 1:1000 %>%
  map_dfr(
    ~soccer1 %>%
      slice_sample(n=n, replace=TRUE) %>%
      summarize(meanhg = mean(FTHG), meanag = mean(FTAG))) %>%
  mutate(n=n)
cibs <- soccer_bs %>%
  summarize(meanhgboot = mean(meanhg), meanagboot = mean(meanag), CI_hg = 1.96*sd(meanhg), CI_ag = 1.96*sd(meanag))
cibs
# A tibble: 1 × 4
  meanhgboot meanagboot CI_hg CI_ag
       <dbl>      <dbl> <dbl> <dbl>
1       1.51       1.31 0.134 0.126
# Plot the results
cibs_long <- cibs %>%
  pivot_longer(meanhgboot:meanagboot, names_to = 'team', values_to = 'goals') %>%
  pivot_longer(CI_hg:CI_ag, names_to = 'cicat', values_to = 'ci')
cibs_long = cibs_long[c(1,4),]
cibs_long$team <- recode_factor(cibs_long$team, meanhgboot = 'Home Team', meanagboot = 'Away Team')
cibs_long
# A tibble: 2 × 4
  team      goals cicat    ci
  <fct>     <dbl> <chr> <dbl>
1 Home Team  1.51 CI_hg 0.134
2 Away Team  1.31 CI_ag 0.126
ggplot(data = cibs_long, aes(x = team, y = goals)) + 
  geom_point(size = 2) +
  geom_errorbar(aes(ymin = goals - ci, ymax = goals + ci)) +
  theme_bw()

The confidence interval error bars overlap, therefore the mean goals for home and away teams are not significantly different and there is not a home team advantage.

3.) Add raw data behind your 95% CI plot above!

soccer1 <- soccer1 %>%
  select(FTHG, FTAG)
soccer1
# A tibble: 380 × 2
    FTHG  FTAG
   <dbl> <dbl>
 1     2     0
 2     5     1
 3     1     2
 4     3     0
 5     3     1
 6     1     0
 7     3     2
 8     0     3
 9     2     4
10     1     0
# … with 370 more rows
soccer2 <- soccer1 %>%
  pivot_longer(FTHG : FTAG, names_to = 'team', values_to = 'goals')
soccer2$team <- recode_factor(soccer2$team, FTHG = 'Home Team', FTAG = 'Away Team')

ggplot(data = cibs_long, aes(x = team, y = goals)) + 
  geom_errorbar(aes(ymin = goals - ci, ymax = goals + ci)) +
  geom_jitter(data = soccer2, aes(x = team, y = goals, color = team)) +
  geom_point(size = 3) +
  theme_bw()