library(tidyverse)
library(broom)
library(data.table)
library(performance)
library(patchwork)
library(car)
library(rsample)Lab 7: Multiple Regression and Bootstrapping
Lab 7: Multiple Regression and Bootstrapping
load packages
Essentials
1.) Load data ‘soccer’ from tidytuesday
soccer <- readr::read_csv('https://raw.githubusercontent.com/rfordatascience/tidytuesday/master/data/2023/2023-04-04/soccer21-22.csv')Rows: 380 Columns: 22
── Column specification ────────────────────────────────────────────────────────
Delimiter: ","
chr (6): Date, HomeTeam, AwayTeam, FTR, HTR, Referee
dbl (16): FTHG, FTAG, HTHG, HTAG, HS, AS, HST, AST, HF, AF, HC, AC, HY, AY, ...
ℹ Use `spec()` to retrieve the full column specification for this data.
ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
str(soccer)spc_tbl_ [380 × 22] (S3: spec_tbl_df/tbl_df/tbl/data.frame)
$ Date : chr [1:380] "13/08/2021" "14/08/2021" "14/08/2021" "14/08/2021" ...
$ HomeTeam: chr [1:380] "Brentford" "Man United" "Burnley" "Chelsea" ...
$ AwayTeam: chr [1:380] "Arsenal" "Leeds" "Brighton" "Crystal Palace" ...
$ FTHG : num [1:380] 2 5 1 3 3 1 3 0 2 1 ...
$ FTAG : num [1:380] 0 1 2 0 1 0 2 3 4 0 ...
$ FTR : chr [1:380] "H" "H" "A" "H" ...
$ HTHG : num [1:380] 1 1 1 2 0 1 2 0 2 0 ...
$ HTAG : num [1:380] 0 0 0 0 1 0 0 1 1 0 ...
$ HTR : chr [1:380] "H" "H" "H" "H" ...
$ Referee : chr [1:380] "M Oliver" "P Tierney" "D Coote" "J Moss" ...
$ HS : num [1:380] 8 16 14 13 14 9 13 14 17 13 ...
$ AS : num [1:380] 22 10 14 4 6 17 11 19 8 18 ...
$ HST : num [1:380] 3 8 3 6 6 5 7 3 3 3 ...
$ AST : num [1:380] 4 3 8 1 3 3 2 8 9 4 ...
$ HF : num [1:380] 12 11 10 15 13 6 18 4 4 11 ...
$ AF : num [1:380] 8 9 7 11 15 10 13 14 3 8 ...
$ HC : num [1:380] 2 5 7 5 6 5 2 3 7 3 ...
$ AC : num [1:380] 5 4 6 2 8 4 4 11 6 11 ...
$ HY : num [1:380] 0 1 2 0 2 1 3 1 1 2 ...
$ AY : num [1:380] 0 2 1 0 0 2 1 1 0 1 ...
$ HR : num [1:380] 0 0 0 0 0 0 0 0 0 0 ...
$ AR : num [1:380] 0 0 0 0 0 0 0 0 0 0 ...
- attr(*, "spec")=
.. cols(
.. Date = col_character(),
.. HomeTeam = col_character(),
.. AwayTeam = col_character(),
.. FTHG = col_double(),
.. FTAG = col_double(),
.. FTR = col_character(),
.. HTHG = col_double(),
.. HTAG = col_double(),
.. HTR = col_character(),
.. Referee = col_character(),
.. HS = col_double(),
.. AS = col_double(),
.. HST = col_double(),
.. AST = col_double(),
.. HF = col_double(),
.. AF = col_double(),
.. HC = col_double(),
.. AC = col_double(),
.. HY = col_double(),
.. AY = col_double(),
.. HR = col_double(),
.. AR = col_double()
.. )
- attr(*, "problems")=<externalptr>
After you load the data, record which variables are categorical and which are numeric. Categorical: Date, HomeTeam, AwayTeam, FTR, HTR, Referee Numerical: FTHG, FTAG, HTHG, HTAG, HS, AS, HST, AST, HF, AF, HC, AC, HY, AY, HR, AR, ### 2.) Let’s consider the effects of home team shots (HS), home team (HomeTeam), and home team fouls (HF) on home team goals (fullt time home goals). Build a fully interactive multiple linear regression model. Assess model fit and then model assumptions. How well does the model fit the data? Is the model valid?
drop_na(soccer)# A tibble: 380 × 22
Date HomeT…¹ AwayT…² FTHG FTAG FTR HTHG HTAG HTR Referee HS AS
<chr> <chr> <chr> <dbl> <dbl> <chr> <dbl> <dbl> <chr> <chr> <dbl> <dbl>
1 13/0… Brentf… Arsenal 2 0 H 1 0 H M Oliv… 8 22
2 14/0… Man Un… Leeds 5 1 H 1 0 H P Tier… 16 10
3 14/0… Burnley Bright… 1 2 A 1 0 H D Coote 14 14
4 14/0… Chelsea Crysta… 3 0 H 2 0 H J Moss 13 4
5 14/0… Everton Southa… 3 1 H 0 1 A A Madl… 14 6
6 14/0… Leices… Wolves 1 0 H 1 0 H C Paws… 9 17
7 14/0… Watford Aston … 3 2 H 2 0 H M Dean 13 11
8 14/0… Norwich Liverp… 0 3 A 0 1 A A Marr… 14 19
9 15/0… Newcas… West H… 2 4 A 2 1 H M Atki… 17 8
10 15/0… Totten… Man Ci… 1 0 H 0 0 D A Tayl… 13 18
# … with 370 more rows, 10 more variables: HST <dbl>, AST <dbl>, HF <dbl>,
# AF <dbl>, HC <dbl>, AC <dbl>, HY <dbl>, AY <dbl>, HR <dbl>, AR <dbl>, and
# abbreviated variable names ¹HomeTeam, ²AwayTeam
lms <- lm(FTHG ~ HS * HomeTeam * HF, data = soccer)
summary(lms)
Call:
lm(formula = FTHG ~ HS * HomeTeam * HF, data = soccer)
Residuals:
Min 1Q Median 3Q Max
-2.7138 -0.6469 -0.0615 0.5152 3.9047
Coefficients:
Estimate Std. Error t value Pr(>|t|)
(Intercept) 0.697352 3.708178 0.188 0.8510
HS 0.055902 0.190075 0.294 0.7689
HomeTeamAston Villa 0.881435 4.996848 0.176 0.8601
HomeTeamBrentford -2.533050 5.203392 -0.487 0.6268
HomeTeamBrighton 3.717225 6.575577 0.565 0.5723
HomeTeamBurnley -0.276222 6.857200 -0.040 0.9679
HomeTeamChelsea 6.034787 4.925771 1.225 0.2215
HomeTeamCrystal Palace -2.318297 4.283831 -0.541 0.5888
HomeTeamEverton 0.319386 4.645420 0.069 0.9452
HomeTeamLeeds -3.624261 5.001969 -0.725 0.4693
HomeTeamLeicester -0.792986 4.114557 -0.193 0.8473
HomeTeamLiverpool 4.175053 5.026751 0.831 0.4069
HomeTeamMan City -10.661735 5.330983 -2.000 0.0464 *
HomeTeamMan United 7.326569 4.781359 1.532 0.1265
HomeTeamNewcastle 0.168113 4.800783 0.035 0.9721
HomeTeamNorwich 2.094630 5.334412 0.393 0.6948
HomeTeamSouthampton -0.310685 4.470009 -0.070 0.9446
HomeTeamTottenham -2.750661 4.805536 -0.572 0.5675
HomeTeamWatford -2.913636 4.142058 -0.703 0.4823
HomeTeamWest Ham -2.027195 4.796066 -0.423 0.6728
HomeTeamWolves -1.557139 4.102273 -0.380 0.7045
HF -0.009934 0.365871 -0.027 0.9784
HS:HomeTeamAston Villa 0.061853 0.328940 0.188 0.8510
HS:HomeTeamBrentford 0.182702 0.332776 0.549 0.5834
HS:HomeTeamBrighton -0.262377 0.425503 -0.617 0.5379
HS:HomeTeamBurnley 0.031909 0.467206 0.068 0.9456
HS:HomeTeamChelsea -0.391915 0.267354 -1.466 0.1437
HS:HomeTeamCrystal Palace 0.105923 0.251576 0.421 0.6740
HS:HomeTeamEverton 0.011687 0.286931 0.041 0.9675
HS:HomeTeamLeeds 0.243965 0.297532 0.820 0.4129
HS:HomeTeamLeicester 0.146672 0.235410 0.623 0.5337
HS:HomeTeamLiverpool -0.109337 0.242931 -0.450 0.6530
HS:HomeTeamMan City 0.558982 0.261335 2.139 0.0332 *
HS:HomeTeamMan United -0.414803 0.258899 -1.602 0.1102
HS:HomeTeamNewcastle -0.025273 0.307031 -0.082 0.9345
HS:HomeTeamNorwich -0.290202 0.363840 -0.798 0.4257
HS:HomeTeamSouthampton 0.053585 0.266735 0.201 0.8409
HS:HomeTeamTottenham 0.208991 0.306756 0.681 0.4962
HS:HomeTeamWatford 0.257706 0.248203 1.038 0.3000
HS:HomeTeamWest Ham 0.147201 0.312313 0.471 0.6378
HS:HomeTeamWolves 0.010637 0.248269 0.043 0.9659
HS:HF 0.001280 0.019025 0.067 0.9464
HomeTeamAston Villa:HF -0.084654 0.465462 -0.182 0.8558
HomeTeamBrentford:HF 0.221657 0.505506 0.438 0.6613
HomeTeamBrighton:HF -0.268913 0.596677 -0.451 0.6525
HomeTeamBurnley:HF -0.085970 0.651113 -0.132 0.8950
HomeTeamChelsea:HF -0.450434 0.447628 -1.006 0.3151
HomeTeamCrystal Palace:HF 0.196644 0.417779 0.471 0.6382
HomeTeamEverton:HF -0.128741 0.491577 -0.262 0.7936
HomeTeamLeeds:HF 0.213802 0.446558 0.479 0.6324
HomeTeamLeicester:HF 0.048414 0.403071 0.120 0.9045
HomeTeamLiverpool:HF -0.289467 0.533930 -0.542 0.5881
HomeTeamMan City:HF 1.036326 0.560155 1.850 0.0653 .
HomeTeamMan United:HF -0.787337 0.499734 -1.576 0.1162
HomeTeamNewcastle:HF -0.073170 0.446549 -0.164 0.8700
HomeTeamNorwich:HF -0.254151 0.505784 -0.502 0.6157
HomeTeamSouthampton:HF 0.024514 0.429623 0.057 0.9545
HomeTeamTottenham:HF 0.187767 0.454811 0.413 0.6800
HomeTeamWatford:HF 0.117282 0.411028 0.285 0.7756
HomeTeamWest Ham:HF 0.451202 0.482874 0.934 0.3508
HomeTeamWolves:HF 0.302944 0.420129 0.721 0.4714
HS:HomeTeamAston Villa:HF -0.005724 0.028970 -0.198 0.8435
HS:HomeTeamBrentford:HF -0.018001 0.032204 -0.559 0.5766
HS:HomeTeamBrighton:HF 0.015377 0.038286 0.402 0.6882
HS:HomeTeamBurnley:HF 0.002547 0.044694 0.057 0.9546
HS:HomeTeamChelsea:HF 0.031442 0.024617 1.277 0.2025
HS:HomeTeamCrystal Palace:HF -0.008544 0.024429 -0.350 0.7268
HS:HomeTeamEverton:HF 0.006583 0.032144 0.205 0.8379
HS:HomeTeamLeeds:HF -0.017376 0.025839 -0.672 0.5018
HS:HomeTeamLeicester:HF -0.010627 0.024129 -0.440 0.6599
HS:HomeTeamLiverpool:HF 0.006820 0.025865 0.264 0.7922
HS:HomeTeamMan City:HF -0.048830 0.028501 -1.713 0.0877 .
HS:HomeTeamMan United:HF 0.046000 0.028069 1.639 0.1023
HS:HomeTeamNewcastle:HF 0.006266 0.028067 0.223 0.8235
HS:HomeTeamNorwich:HF 0.027014 0.033845 0.798 0.4254
HS:HomeTeamSouthampton:HF -0.006353 0.025158 -0.253 0.8008
HS:HomeTeamTottenham:HF -0.011332 0.028415 -0.399 0.6903
HS:HomeTeamWatford:HF -0.013823 0.024557 -0.563 0.5739
HS:HomeTeamWest Ham:HF -0.030930 0.030862 -1.002 0.3171
HS:HomeTeamWolves:HF -0.015949 0.027111 -0.588 0.5568
---
Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
Residual standard error: 1.16 on 300 degrees of freedom
Multiple R-squared: 0.3949, Adjusted R-squared: 0.2355
F-statistic: 2.478 on 79 and 300 DF, p-value: 1.774e-08
model_performance(lms)# Indices of model performance
AIC | AICc | BIC | R2 | R2 (adj.) | RMSE | Sigma
------------------------------------------------------------------
1263.266 | 1307.843 | 1582.420 | 0.395 | 0.236 | 1.031 | 1.160
check_model(lms)Variable `Component` is not in your data frame :/
The adjusted R2 is 0.236, so the model explains less than 25% of the variation in the data. This is consistent with the AIC and BIC, which are both above 1000 and very high. This means that the model does not fit the data well at all. For the assumptions, linearity and normality are met, but homogeneity of variance does not appear to be met and collinearity is much higher than it should be so the assumption is not met. We do not know how the data was collected, so we do not know about independence of observations. Based on this assessment, the model is not valid as multiple assumptions are not met and the model does not fit the data.
3.) Run through a top-down modeling approach to find the best fit model! Be sure to check assumptions after each change and compare performance. What model is the best fit?
# Model 1
lms1 <- lm(FTHG ~ HS * HomeTeam + HF, data = soccer)
#summary(lms1)
#model_performance(lms1)
#check_model(lms1)
# Model 2
lms2 <- lm(FTHG ~ HS + HomeTeam * HF, data = soccer)
#summary(lms2)
#model_performance(lms2)
#check_model(lms2)
# Model 3
lms3 <- lm(FTHG ~ HS + HomeTeam + HF, data = soccer)
#summary(lms3)
model_performance(lms3)# Indices of model performance
AIC | AICc | BIC | R2 | R2 (adj.) | RMSE | Sigma
------------------------------------------------------------------
1223.866 | 1226.967 | 1314.490 | 0.260 | 0.216 | 1.140 | 1.174
check_model(lms3)Variable `Component` is not in your data frame :/
# Model 4
lms4 <- lm(FTHG ~ HS * HomeTeam, data = soccer)
#summary(lms4)
#model_performance(lms4)
#check_model(lms4)
# Model 5
lms5 <- lm(FTHG ~ HS * HF, data = soccer)
#summary(lms5)
#model_performance(lms5)
#check_model(lms5)
# Model 6
lms6 <- lm(FTHG ~ HomeTeam * HF, data = soccer)
#summary(lms6)
#model_performance(lms6)
#check_model(lms6)
# Model 7
lms7 <- lm(FTHG ~ HS + HomeTeam, data = soccer)
#summary(lms7)
model_performance(lms7)# Indices of model performance
AIC | AICc | BIC | R2 | R2 (adj.) | RMSE | Sigma
------------------------------------------------------------------
1221.886 | 1224.721 | 1308.570 | 0.260 | 0.218 | 1.140 | 1.173
check_model(lms7)Variable `Component` is not in your data frame :/
# Model 8
lms8 <- lm(FTHG ~ HS + HF, data = soccer)
#summary(lms8)
model_performance(lms8)# Indices of model performance
AIC | AICc | BIC | R2 | R2 (adj.) | RMSE | Sigma
------------------------------------------------------------------
1232.923 | 1233.030 | 1248.684 | 0.162 | 0.158 | 1.213 | 1.217
check_model(lms8)Variable `Component` is not in your data frame :/
# Model 9
lms9 <- lm(FTHG ~ HomeTeam + HF, data = soccer)
#summary(lms9)
model_performance(lms9)# Indices of model performance
AIC | AICc | BIC | R2 | R2 (adj.) | RMSE | Sigma
------------------------------------------------------------------
1255.929 | 1258.764 | 1342.613 | 0.190 | 0.145 | 1.192 | 1.226
check_model(lms9)Variable `Component` is not in your data frame :/
# Compare models
compare_performance(lms3, lms7, lms8, lms9, rank = TRUE)# Comparison of Model Performance Indices
Name | Model | R2 | R2 (adj.) | RMSE | Sigma | AIC weights | AICc weights | BIC weights | Performance-Score
---------------------------------------------------------------------------------------------------------------
lms7 | lm | 0.260 | 0.218 | 1.140 | 1.173 | 0.727 | 0.746 | 9.91e-14 | 85.70%
lms3 | lm | 0.260 | 0.216 | 1.140 | 1.174 | 0.270 | 0.243 | 5.13e-15 | 66.25%
lms8 | lm | 0.162 | 0.158 | 1.213 | 1.217 | 0.003 | 0.012 | 1.000 | 19.40%
lms9 | lm | 0.190 | 0.145 | 1.192 | 1.226 | 2.95e-08 | 3.02e-08 | 4.01e-21 | 8.16%
The model that is the best fit is lms7, which is HS + HomeTeam. However, it is not the most complex model with a relatively good fit, and lms3 (HS + HomeTeam + HF) fits the data only slightly worse and is more complex. The more complex model, that has a very similar fit to the less complex model, is the one we should use.
4.) After identifying the best fit model, build the appropriate graph! See our multiple regression tutorial. Next, Build a coef plot for the model. Using patchwork, show me a 2-panel figure with the coef plot and the graph for the model
# Build appropriate graph
lms3g <- lms3 %>%
augment() %>%
ggplot(aes(x = HS, y = FTHG, color = HomeTeam)) +
geom_point(aes(size = HF)) +
geom_point() +
geom_line(aes(y = .fitted), size = 1) +
theme_bw()Warning: Using `size` aesthetic for lines was deprecated in ggplot2 3.4.0.
ℹ Please use `linewidth` instead.
lms3g# Build a coef plot
lms3_coef <- tidy(lms3, quick=FALSE)
lms3_ci <- data.table(confint(lms3), keep.rownames='term')
lms3_df <- cbind(lms3_coef, lms3_ci)
lms3_df<-lms3_df [,-6]
lms3_df <- lms3_df %>%
rename("lower"="2.5 %",
"upper"="97.5 %")
lms3_df$term <- as.factor(lms3_df$term)
lms3g1 <- ggplot(data = lms3_df, aes(x = estimate, y = term)) +
geom_vline(xintercept = 0, linetype = 2) +
geom_point(size = 3)+
geom_errorbarh(aes(xmax = lower, xmin = upper),height=0.2) +
theme_bw()
lms3g1lms3g + lms3g1Depth
1.) Bootstrap the coef plot from Essential #4, above.
soccer_intervals <- reg_intervals(FTHG ~ HS + HomeTeam + HF, data = soccer,
type='percentile',
keep_reps=FALSE)
# Plot results
soccleats <- ggplot(data = soccer_intervals, aes(x = .estimate, y = term)) +
geom_vline(xintercept = 0, linetype = 2) +
geom_errorbarh(aes(xmin = .lower, xmax = .upper),height = 0.2) +
geom_point(size = 3) +
theme_classic()
soccleats2.) Calculate means and 95% CIs of full time home goals and full time away goals (using bootstrapping). Plot the results and interpret the plot (is there a home advantage or not?)
# Calculate means and CIs
soccer1 <- soccer
n = nrow(soccer1)
soccer_bs <- 1:1000 %>%
map_dfr(
~soccer1 %>%
slice_sample(n=n, replace=TRUE) %>%
summarize(meanhg = mean(FTHG), meanag = mean(FTAG))) %>%
mutate(n=n)
cibs <- soccer_bs %>%
summarize(meanhgboot = mean(meanhg), meanagboot = mean(meanag), CI_hg = 1.96*sd(meanhg), CI_ag = 1.96*sd(meanag))
cibs# A tibble: 1 × 4
meanhgboot meanagboot CI_hg CI_ag
<dbl> <dbl> <dbl> <dbl>
1 1.51 1.31 0.134 0.126
# Plot the results
cibs_long <- cibs %>%
pivot_longer(meanhgboot:meanagboot, names_to = 'team', values_to = 'goals') %>%
pivot_longer(CI_hg:CI_ag, names_to = 'cicat', values_to = 'ci')
cibs_long = cibs_long[c(1,4),]
cibs_long$team <- recode_factor(cibs_long$team, meanhgboot = 'Home Team', meanagboot = 'Away Team')
cibs_long# A tibble: 2 × 4
team goals cicat ci
<fct> <dbl> <chr> <dbl>
1 Home Team 1.51 CI_hg 0.134
2 Away Team 1.31 CI_ag 0.126
ggplot(data = cibs_long, aes(x = team, y = goals)) +
geom_point(size = 2) +
geom_errorbar(aes(ymin = goals - ci, ymax = goals + ci)) +
theme_bw()The confidence interval error bars overlap, therefore the mean goals for home and away teams are not significantly different and there is not a home team advantage.
3.) Add raw data behind your 95% CI plot above!
soccer1 <- soccer1 %>%
select(FTHG, FTAG)
soccer1# A tibble: 380 × 2
FTHG FTAG
<dbl> <dbl>
1 2 0
2 5 1
3 1 2
4 3 0
5 3 1
6 1 0
7 3 2
8 0 3
9 2 4
10 1 0
# … with 370 more rows
soccer2 <- soccer1 %>%
pivot_longer(FTHG : FTAG, names_to = 'team', values_to = 'goals')
soccer2$team <- recode_factor(soccer2$team, FTHG = 'Home Team', FTAG = 'Away Team')
ggplot(data = cibs_long, aes(x = team, y = goals)) +
geom_errorbar(aes(ymin = goals - ci, ymax = goals + ci)) +
geom_jitter(data = soccer2, aes(x = team, y = goals, color = team)) +
geom_point(size = 3) +
theme_bw()