library(tidyverse)
library(broom)
library(data.table)
library(performance)
library(patchwork)
library(car)
library(rsample)
library(ggsci)
library(dotwhisker)Lab 7: Multiple Regression and Bootstrapping
Lab 7: Multiple Regression and Bootstrapping
Load Packages
Essentials
Load data ‘soccer’ from tidytuesday
soccer <- readr::read_csv('https://raw.githubusercontent.com/rfordatascience/tidytuesday/master/data/2023/2023-04-04/soccer21-22.csv')Rows: 380 Columns: 22
── Column specification ────────────────────────────────────────────────────────
Delimiter: ","
chr (6): Date, HomeTeam, AwayTeam, FTR, HTR, Referee
dbl (16): FTHG, FTAG, HTHG, HTAG, HS, AS, HST, AST, HF, AF, HC, AC, HY, AY, ...
ℹ Use `spec()` to retrieve the full column specification for this data.
ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
str(soccer)spc_tbl_ [380 × 22] (S3: spec_tbl_df/tbl_df/tbl/data.frame)
$ Date : chr [1:380] "13/08/2021" "14/08/2021" "14/08/2021" "14/08/2021" ...
$ HomeTeam: chr [1:380] "Brentford" "Man United" "Burnley" "Chelsea" ...
$ AwayTeam: chr [1:380] "Arsenal" "Leeds" "Brighton" "Crystal Palace" ...
$ FTHG : num [1:380] 2 5 1 3 3 1 3 0 2 1 ...
$ FTAG : num [1:380] 0 1 2 0 1 0 2 3 4 0 ...
$ FTR : chr [1:380] "H" "H" "A" "H" ...
$ HTHG : num [1:380] 1 1 1 2 0 1 2 0 2 0 ...
$ HTAG : num [1:380] 0 0 0 0 1 0 0 1 1 0 ...
$ HTR : chr [1:380] "H" "H" "H" "H" ...
$ Referee : chr [1:380] "M Oliver" "P Tierney" "D Coote" "J Moss" ...
$ HS : num [1:380] 8 16 14 13 14 9 13 14 17 13 ...
$ AS : num [1:380] 22 10 14 4 6 17 11 19 8 18 ...
$ HST : num [1:380] 3 8 3 6 6 5 7 3 3 3 ...
$ AST : num [1:380] 4 3 8 1 3 3 2 8 9 4 ...
$ HF : num [1:380] 12 11 10 15 13 6 18 4 4 11 ...
$ AF : num [1:380] 8 9 7 11 15 10 13 14 3 8 ...
$ HC : num [1:380] 2 5 7 5 6 5 2 3 7 3 ...
$ AC : num [1:380] 5 4 6 2 8 4 4 11 6 11 ...
$ HY : num [1:380] 0 1 2 0 2 1 3 1 1 2 ...
$ AY : num [1:380] 0 2 1 0 0 2 1 1 0 1 ...
$ HR : num [1:380] 0 0 0 0 0 0 0 0 0 0 ...
$ AR : num [1:380] 0 0 0 0 0 0 0 0 0 0 ...
- attr(*, "spec")=
.. cols(
.. Date = col_character(),
.. HomeTeam = col_character(),
.. AwayTeam = col_character(),
.. FTHG = col_double(),
.. FTAG = col_double(),
.. FTR = col_character(),
.. HTHG = col_double(),
.. HTAG = col_double(),
.. HTR = col_character(),
.. Referee = col_character(),
.. HS = col_double(),
.. AS = col_double(),
.. HST = col_double(),
.. AST = col_double(),
.. HF = col_double(),
.. AF = col_double(),
.. HC = col_double(),
.. AC = col_double(),
.. HY = col_double(),
.. AY = col_double(),
.. HR = col_double(),
.. AR = col_double()
.. )
- attr(*, "problems")=<externalptr>
After you load the data, record which variables are categorical and which are numeric.
CATEGORICAL: Date, HomeTeam, AwayTeam, Full time tesult (FTR), Halftime results (HTR), Referee
NUMERIC: Full time home Ggals (FTHG), Full time away goals (FTAG), Halftime home goals (HTHG), Halftime away goals (HTAG), Number of shots taken by the home team (HS), Number of shots taken by the away team (AS), Number of shots on target by the home team (HST), Number of shots on target by the away team (AST), Number of fouls by the home team (HF), Number of fouls by the away team (AF), Number of corners taken by the home team (HC), Number of corners taken by the away team (AC), Number of yellow cards received by the home team (HY), Number of yellow cards received by the away team (AY), Number of red cards received by the home team (HR), Number of red cards received by the away team (AR)
Let’s consider the effects of home team shots (HS), home team (HomeTeam), and home team fouls (HF) on home team goals (fullt time home goals). Build a fully interactive multiple linear regression model. Assess model fit and then model assumptions. How well does the model fit the data? Is the model valid?
soc_1 <- lm(FTHG ~ HS*HomeTeam*HF, soccer)
summary(soc_1)
Call:
lm(formula = FTHG ~ HS * HomeTeam * HF, data = soccer)
Residuals:
Min 1Q Median 3Q Max
-2.7138 -0.6469 -0.0615 0.5152 3.9047
Coefficients:
Estimate Std. Error t value Pr(>|t|)
(Intercept) 0.697352 3.708178 0.188 0.8510
HS 0.055902 0.190075 0.294 0.7689
HomeTeamAston Villa 0.881435 4.996848 0.176 0.8601
HomeTeamBrentford -2.533050 5.203392 -0.487 0.6268
HomeTeamBrighton 3.717225 6.575577 0.565 0.5723
HomeTeamBurnley -0.276222 6.857200 -0.040 0.9679
HomeTeamChelsea 6.034787 4.925771 1.225 0.2215
HomeTeamCrystal Palace -2.318297 4.283831 -0.541 0.5888
HomeTeamEverton 0.319386 4.645420 0.069 0.9452
HomeTeamLeeds -3.624261 5.001969 -0.725 0.4693
HomeTeamLeicester -0.792986 4.114557 -0.193 0.8473
HomeTeamLiverpool 4.175053 5.026751 0.831 0.4069
HomeTeamMan City -10.661735 5.330983 -2.000 0.0464 *
HomeTeamMan United 7.326569 4.781359 1.532 0.1265
HomeTeamNewcastle 0.168113 4.800783 0.035 0.9721
HomeTeamNorwich 2.094630 5.334412 0.393 0.6948
HomeTeamSouthampton -0.310685 4.470009 -0.070 0.9446
HomeTeamTottenham -2.750661 4.805536 -0.572 0.5675
HomeTeamWatford -2.913636 4.142058 -0.703 0.4823
HomeTeamWest Ham -2.027195 4.796066 -0.423 0.6728
HomeTeamWolves -1.557139 4.102273 -0.380 0.7045
HF -0.009934 0.365871 -0.027 0.9784
HS:HomeTeamAston Villa 0.061853 0.328940 0.188 0.8510
HS:HomeTeamBrentford 0.182702 0.332776 0.549 0.5834
HS:HomeTeamBrighton -0.262377 0.425503 -0.617 0.5379
HS:HomeTeamBurnley 0.031909 0.467206 0.068 0.9456
HS:HomeTeamChelsea -0.391915 0.267354 -1.466 0.1437
HS:HomeTeamCrystal Palace 0.105923 0.251576 0.421 0.6740
HS:HomeTeamEverton 0.011687 0.286931 0.041 0.9675
HS:HomeTeamLeeds 0.243965 0.297532 0.820 0.4129
HS:HomeTeamLeicester 0.146672 0.235410 0.623 0.5337
HS:HomeTeamLiverpool -0.109337 0.242931 -0.450 0.6530
HS:HomeTeamMan City 0.558982 0.261335 2.139 0.0332 *
HS:HomeTeamMan United -0.414803 0.258899 -1.602 0.1102
HS:HomeTeamNewcastle -0.025273 0.307031 -0.082 0.9345
HS:HomeTeamNorwich -0.290202 0.363840 -0.798 0.4257
HS:HomeTeamSouthampton 0.053585 0.266735 0.201 0.8409
HS:HomeTeamTottenham 0.208991 0.306756 0.681 0.4962
HS:HomeTeamWatford 0.257706 0.248203 1.038 0.3000
HS:HomeTeamWest Ham 0.147201 0.312313 0.471 0.6378
HS:HomeTeamWolves 0.010637 0.248269 0.043 0.9659
HS:HF 0.001280 0.019025 0.067 0.9464
HomeTeamAston Villa:HF -0.084654 0.465462 -0.182 0.8558
HomeTeamBrentford:HF 0.221657 0.505506 0.438 0.6613
HomeTeamBrighton:HF -0.268913 0.596677 -0.451 0.6525
HomeTeamBurnley:HF -0.085970 0.651113 -0.132 0.8950
HomeTeamChelsea:HF -0.450434 0.447628 -1.006 0.3151
HomeTeamCrystal Palace:HF 0.196644 0.417779 0.471 0.6382
HomeTeamEverton:HF -0.128741 0.491577 -0.262 0.7936
HomeTeamLeeds:HF 0.213802 0.446558 0.479 0.6324
HomeTeamLeicester:HF 0.048414 0.403071 0.120 0.9045
HomeTeamLiverpool:HF -0.289467 0.533930 -0.542 0.5881
HomeTeamMan City:HF 1.036326 0.560155 1.850 0.0653 .
HomeTeamMan United:HF -0.787337 0.499734 -1.576 0.1162
HomeTeamNewcastle:HF -0.073170 0.446549 -0.164 0.8700
HomeTeamNorwich:HF -0.254151 0.505784 -0.502 0.6157
HomeTeamSouthampton:HF 0.024514 0.429623 0.057 0.9545
HomeTeamTottenham:HF 0.187767 0.454811 0.413 0.6800
HomeTeamWatford:HF 0.117282 0.411028 0.285 0.7756
HomeTeamWest Ham:HF 0.451202 0.482874 0.934 0.3508
HomeTeamWolves:HF 0.302944 0.420129 0.721 0.4714
HS:HomeTeamAston Villa:HF -0.005724 0.028970 -0.198 0.8435
HS:HomeTeamBrentford:HF -0.018001 0.032204 -0.559 0.5766
HS:HomeTeamBrighton:HF 0.015377 0.038286 0.402 0.6882
HS:HomeTeamBurnley:HF 0.002547 0.044694 0.057 0.9546
HS:HomeTeamChelsea:HF 0.031442 0.024617 1.277 0.2025
HS:HomeTeamCrystal Palace:HF -0.008544 0.024429 -0.350 0.7268
HS:HomeTeamEverton:HF 0.006583 0.032144 0.205 0.8379
HS:HomeTeamLeeds:HF -0.017376 0.025839 -0.672 0.5018
HS:HomeTeamLeicester:HF -0.010627 0.024129 -0.440 0.6599
HS:HomeTeamLiverpool:HF 0.006820 0.025865 0.264 0.7922
HS:HomeTeamMan City:HF -0.048830 0.028501 -1.713 0.0877 .
HS:HomeTeamMan United:HF 0.046000 0.028069 1.639 0.1023
HS:HomeTeamNewcastle:HF 0.006266 0.028067 0.223 0.8235
HS:HomeTeamNorwich:HF 0.027014 0.033845 0.798 0.4254
HS:HomeTeamSouthampton:HF -0.006353 0.025158 -0.253 0.8008
HS:HomeTeamTottenham:HF -0.011332 0.028415 -0.399 0.6903
HS:HomeTeamWatford:HF -0.013823 0.024557 -0.563 0.5739
HS:HomeTeamWest Ham:HF -0.030930 0.030862 -1.002 0.3171
HS:HomeTeamWolves:HF -0.015949 0.027111 -0.588 0.5568
---
Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
Residual standard error: 1.16 on 300 degrees of freedom
Multiple R-squared: 0.3949, Adjusted R-squared: 0.2355
F-statistic: 2.478 on 79 and 300 DF, p-value: 1.774e-08
The p-value is 1.0774e-08 which means that at least one of these variables has an effect on home team goals. The adjusted R-Square is 0.2355 which means that approximately 23.55% of the variance is explained by this model.
Checking Assumptions
model_performance(soc_1)# Indices of model performance
AIC | AICc | BIC | R2 | R2 (adj.) | RMSE | Sigma
------------------------------------------------------------------
1263.266 | 1307.843 | 1582.420 | 0.395 | 0.236 | 1.031 | 1.160
check_model(soc_1)Variable `Component` is not in your data frame :/
Assumptions: 1. Linearity: The assumption regarding linearity has been met because the reference line is pretty flat and horizontal. 2. Normality of Residuals: The residuals are looking pretty normal according to the qq plot. 3. Equal Variance: The reference line appears to be mostly flat and horizontal so I would say that the assumption of homogeneity of variance is mostly met. 4. Colinearity: The VIFs are extremely high which indicates that there is some effect of multicollinearity and thus the assumption is not met. 5. Independence: We don’t know much about the experimental design so it’s hard to say much about the assumption of independence. Since this is a lab assignment, I will continue using this data.
This model does not fit the data very well and multiple assumptions are violated so I would conclude the model is not valid.
Run through a top-down modeling approach to find the best fit model! Be sure to check assumptions after each change and compare performance. What model is the best fit?
Note: Any model that is commented out are models that don’t meet the collinearity assumption
# soc_2 <- lm(FTHG ~ HS*HomeTeam+HF, soccer)
# check_model(soc_2)
# soc_3 <- lm(FTHG ~ HS+HomeTeam*HF, soccer)
# check_model(soc_3)
soc_4 <- lm(FTHG ~ HS+HomeTeam+HF, soccer)
check_model(soc_4)Variable `Component` is not in your data frame :/
# soc_5 <- lm(FTHG ~ HS*HomeTeam, soccer)
# check_model(soc_5)
soc_6 <- lm(FTHG ~ HS+HomeTeam, soccer)
check_model(soc_6)Variable `Component` is not in your data frame :/
# soc_7 <- lm(FTHG ~ HS*HF, soccer)
# check_model(soc_7)
soc_8 <- lm(FTHG ~ HS+HF, soccer)
check_model(soc_8)Variable `Component` is not in your data frame :/
# soc_9 <- lm(FTHG ~ HomeTeam*HF, soccer)
# check_model(soc_9)
soc_10 <- lm(FTHG ~ HomeTeam+HF, soccer)
check_model(soc_10)Variable `Component` is not in your data frame :/
compare_performance(soc_4, soc_6, soc_8, soc_10, rank = TRUE)# Comparison of Model Performance Indices
Name | Model | R2 | R2 (adj.) | RMSE | Sigma | AIC weights | AICc weights | BIC weights | Performance-Score
-----------------------------------------------------------------------------------------------------------------
soc_6 | lm | 0.260 | 0.218 | 1.140 | 1.173 | 0.727 | 0.746 | 9.91e-14 | 85.70%
soc_4 | lm | 0.260 | 0.216 | 1.140 | 1.174 | 0.270 | 0.243 | 5.13e-15 | 66.25%
soc_8 | lm | 0.162 | 0.158 | 1.213 | 1.217 | 0.003 | 0.012 | 1.000 | 19.40%
soc_10 | lm | 0.190 | 0.145 | 1.192 | 1.226 | 2.95e-08 | 3.02e-08 | 4.01e-21 | 8.16%
According to compare-performance(), the model that fits best is soc_6 which is the additive model with the effects of HomeTeam shots and HomeTeam on HomeTeam goals. But it is not the most complex. The most complex model that does not violate assumptions is soc_4 which is a fully additive model with the effects of hometeam shots, hometeam, and hometeam fouls on hometeam goals: \(FTHG = \beta_0 + \beta_1(HS) + \beta_2(HomeTeam) + \beta_3(HF) + \varepsilon\). Since the adjusted \(R^2\) for soc_4 is only slightly lower than that of soc_6, I would probably pick the soc_4 model.
After identifying the best fit model, build the appropriate graph! See our multiple regression tutorial. Next, Build a coef plot for the model. Using patchwork, show me a 2-panel figure with the coef plot and the graph for the model
horrible <- soc_4 %>%
augment() %>%
ggplot(aes(x= HS, y = FTHG, color = HomeTeam))+
geom_point(aes(size=HF))+
geom_line(aes(y=.fitted), size=1) +
theme_classic()Warning: Using `size` aesthetic for lines was deprecated in ggplot2 3.4.0.
ℹ Please use `linewidth` instead.
test <- dwplot(soc_4) +
theme_classic() +
geom_vline(xintercept = 0, linetype=2)
coefs<-tidy(soc_4, quick=FALSE)
ci<-data.table(confint(soc_4), keep.rownames='term')
cidf<-cbind(coefs,ci)
cidf<-cidf[,-6]
cidf<- cidf %>%
rename("lower"="2.5 %",
"upper"="97.5 %")
cidf$term=as.factor(cidf$term)
coef.plot <- ggplot(data=cidf, aes(x=estimate, y=term))+
geom_vline(xintercept = 0, linetype=2)+
geom_point(size=3)+
geom_errorbarh(aes(xmax=lower, xmin=upper),height=0.2)+
theme_classic()
horrible+coef.plothorrible+testDepth
Bootstrap the coef plot from Essential #4, above.
set.seed(420)
soccer_intervals<- reg_intervals(FTHG ~ HS+HomeTeam+HF, data=soccer,
type='percentile',
keep_reps=FALSE)
soccer_intervals# A tibble: 21 × 6
term .lower .estimate .upper .alpha .method
<chr> <dbl> <dbl> <dbl> <dbl> <chr>
1 HF -0.0375 -0.00240 0.0325 0.05 percentile
2 HS 0.0450 0.0708 0.0975 0.05 percentile
3 HomeTeamAston Villa -0.623 0.0983 0.833 0.05 percentile
4 HomeTeamBrentford -0.987 -0.273 0.466 0.05 percentile
5 HomeTeamBrighton -1.38 -0.606 0.214 0.05 percentile
6 HomeTeamBurnley -1.18 -0.463 0.263 0.05 percentile
7 HomeTeamChelsea -0.636 0.175 1.03 0.05 percentile
8 HomeTeamCrystal Palace -0.669 0.00141 0.728 0.05 percentile
9 HomeTeamEverton -0.717 0.00608 0.687 0.05 percentile
10 HomeTeamLeeds -1.16 -0.535 0.102 0.05 percentile
# ℹ 11 more rows
#plot the results
cleats <-ggplot(data=soccer_intervals, aes(x=.estimate, y=term))+
geom_vline(xintercept=0, linetype=2)+
geom_errorbarh(aes(xmin=.lower, xmax=.upper),height=0.2)+
geom_point(size=3)+
theme_classic()
cleatsCalculate means and 95% CIs of full time home goals and full time away goals (using bootstrapping). Plot the results and interpret the plot (is there a home advantage or not?)
soccer1 <- soccer
n = nrow(soccer1)
soccer.bs<- 1:1000 %>% #1000 = number of trials / resamples
map_dfr(
~soccer1 %>%
slice_sample(n=n, replace=TRUE) %>%
summarize(meanhg=mean(FTHG), meanag = mean(FTAG))) %>%
mutate(n=n)
calc_CIs<-soccer.bs %>%
summarize(meanhgboot=mean(meanhg), meanagboot=mean(meanag), CIhg=1.96*sd(meanhg), CIag=1.96*sd(meanag))
calc_CIs# A tibble: 1 × 4
meanhgboot meanagboot CIhg CIag
<dbl> <dbl> <dbl> <dbl>
1 1.51 1.31 0.132 0.126
Plots
CIlong <- calc_CIs %>%
pivot_longer(meanhgboot:meanagboot, names_to = 'team', values_to = 'goals') %>%
pivot_longer(CIhg:CIag, names_to = 'CIcat', values_to = 'CI')
CIlong <- CIlong[c(1,4),]
CIlong$team <- recode_factor(CIlong$team, meanhgboot = "Home", meanagboot = "Away")
CIlong# A tibble: 2 × 4
team goals CIcat CI
<fct> <dbl> <chr> <dbl>
1 Home 1.51 CIhg 0.132
2 Away 1.31 CIag 0.126
ggplot(data = CIlong, aes(x = team, y = goals)) +
geom_point() +
geom_errorbar(aes(ymin = goals-CI, ymax = goals+CI), width = 0.2) +
theme_classic() +
labs(x = "Team", y = "Mean goals scored")Because the 95% confidence intervals overlap, so there is no statistically significant difference between the mean goals scored by the home team and the away team. So, we conclude that there is not a home advantage.
Add raw data behind your 95% CI plot above!
soccer2 <- soccer1 %>%
select(FTHG, FTAG) %>%
pivot_longer(FTHG:FTAG, names_to = 'team', values_to = 'goals')
soccer2$team <- recode_factor(soccer2$team, FTHG = "Home", FTAG = "Away")
soccer2# A tibble: 760 × 2
team goals
<fct> <dbl>
1 Home 2
2 Away 0
3 Home 5
4 Away 1
5 Home 1
6 Away 2
7 Home 3
8 Away 0
9 Home 3
10 Away 1
# ℹ 750 more rows
ggplot(data = CIlong, aes(x = team, y = goals)) +
geom_point() +
geom_errorbar(aes(ymin = goals-CI, ymax = goals+CI), width = 0.2) +
geom_jitter(data = soccer2, aes(x = team, y = goals, color = team), alpha = 0.6, size =0.3) +
theme_classic() +
labs(x = "Team", y = "Mean goals scored") +
scale_color_npg()