library(tidyverse)
library(broom)
library(data.table)
library(performance)
library(patchwork)
library(car)
library(rsample)Lab_7_Multiple_Regression_and_Bootstrapping
Lab 7: Multiple Regression and Bootstrapping
load packages
Essentials
1.) Load data ‘soccer’ from tidytuesday
After you load the data, record which variables are categorical and which are numeric.
# install.packages("tidytuesdayR")
# Download the TidyTuesday release for 2023-04-04 (2023, week 14) once.
# The date form and the (year, week) form of tt_load() fetch the same file
# (`soccer21-22.csv`), so a second call only repeats the download.
tuesdata <- tidytuesdayR::tt_load("2023-04-04")
--- Compiling #TidyTuesday Information for 2023-04-04 ----
--- There is 1 file available ---
--- Starting Download ---
Downloading file 1 of 1: `soccer21-22.csv`
--- Download complete ---
# Pull the match table out of the downloaded release.
soccer <- tuesdata$soccer
head(soccer)# A tibble: 6 × 22
Date HomeT…¹ AwayT…² FTHG FTAG FTR HTHG HTAG HTR Referee HS AS
<chr> <chr> <chr> <dbl> <dbl> <chr> <dbl> <dbl> <chr> <chr> <dbl> <dbl>
1 13/08… Brentf… Arsenal 2 0 H 1 0 H M Oliv… 8 22
2 14/08… Man Un… Leeds 5 1 H 1 0 H P Tier… 16 10
3 14/08… Burnley Bright… 1 2 A 1 0 H D Coote 14 14
4 14/08… Chelsea Crysta… 3 0 H 2 0 H J Moss 13 4
5 14/08… Everton Southa… 3 1 H 0 1 A A Madl… 14 6
6 14/08… Leices… Wolves 1 0 H 1 0 H C Paws… 9 17
# … with 10 more variables: HST <dbl>, AST <dbl>, HF <dbl>, AF <dbl>, HC <dbl>,
# AC <dbl>, HY <dbl>, AY <dbl>, HR <dbl>, AR <dbl>, and abbreviated variable
# names ¹HomeTeam, ²AwayTeam
soccer# A tibble: 380 × 22
Date HomeT…¹ AwayT…² FTHG FTAG FTR HTHG HTAG HTR Referee HS AS
<chr> <chr> <chr> <dbl> <dbl> <chr> <dbl> <dbl> <chr> <chr> <dbl> <dbl>
1 13/0… Brentf… Arsenal 2 0 H 1 0 H M Oliv… 8 22
2 14/0… Man Un… Leeds 5 1 H 1 0 H P Tier… 16 10
3 14/0… Burnley Bright… 1 2 A 1 0 H D Coote 14 14
4 14/0… Chelsea Crysta… 3 0 H 2 0 H J Moss 13 4
5 14/0… Everton Southa… 3 1 H 0 1 A A Madl… 14 6
6 14/0… Leices… Wolves 1 0 H 1 0 H C Paws… 9 17
7 14/0… Watford Aston … 3 2 H 2 0 H M Dean 13 11
8 14/0… Norwich Liverp… 0 3 A 0 1 A A Marr… 14 19
9 15/0… Newcas… West H… 2 4 A 2 1 H M Atki… 17 8
10 15/0… Totten… Man Ci… 1 0 H 0 0 D A Tayl… 13 18
# … with 370 more rows, 10 more variables: HST <dbl>, AST <dbl>, HF <dbl>,
# AF <dbl>, HC <dbl>, AC <dbl>, HY <dbl>, AY <dbl>, HR <dbl>, AR <dbl>, and
# abbreviated variable names ¹HomeTeam, ²AwayTeam
# Categorical (stored as <chr>): Date, HomeTeam, AwayTeam, FTR, HTR, Referee
# Numerical (<dbl>): FTHG, FTAG, HTHG, HTAG, HS, AS, HST, AST, HF, AF, HC, AC, HY, AY, HR, AR
2.) Let’s consider the effects of home team shots (HS), home team (HomeTeam), and home team fouls (HF) on home team goals (full-time home goals, FTHG). Build a fully interactive multiple linear regression model. Assess model fit and then model assumptions. How well does the model fit the data? Is the model valid?
# Fully interactive model: full-time home goals as a function of home shots,
# home team, and home fouls, with every two-way and the three-way interaction.
lm1 <- lm(FTHG ~ HS * HomeTeam * HF, data = soccer)
# Check the overall p-value and the R-squared at the bottom of the summary.
summary(lm1)
Call:
lm(formula = FTHG ~ HS * HomeTeam * HF, data = soccer)
Residuals:
Min 1Q Median 3Q Max
-2.7138 -0.6469 -0.0615 0.5152 3.9047
Coefficients:
Estimate Std. Error t value Pr(>|t|)
(Intercept) 0.697352 3.708178 0.188 0.8510
HS 0.055902 0.190075 0.294 0.7689
HomeTeamAston Villa 0.881435 4.996848 0.176 0.8601
HomeTeamBrentford -2.533050 5.203392 -0.487 0.6268
HomeTeamBrighton 3.717225 6.575577 0.565 0.5723
HomeTeamBurnley -0.276222 6.857200 -0.040 0.9679
HomeTeamChelsea 6.034787 4.925771 1.225 0.2215
HomeTeamCrystal Palace -2.318297 4.283831 -0.541 0.5888
HomeTeamEverton 0.319386 4.645420 0.069 0.9452
HomeTeamLeeds -3.624261 5.001969 -0.725 0.4693
HomeTeamLeicester -0.792986 4.114557 -0.193 0.8473
HomeTeamLiverpool 4.175053 5.026751 0.831 0.4069
HomeTeamMan City -10.661735 5.330983 -2.000 0.0464 *
HomeTeamMan United 7.326569 4.781359 1.532 0.1265
HomeTeamNewcastle 0.168113 4.800783 0.035 0.9721
HomeTeamNorwich 2.094630 5.334412 0.393 0.6948
HomeTeamSouthampton -0.310685 4.470009 -0.070 0.9446
HomeTeamTottenham -2.750661 4.805536 -0.572 0.5675
HomeTeamWatford -2.913636 4.142058 -0.703 0.4823
HomeTeamWest Ham -2.027195 4.796066 -0.423 0.6728
HomeTeamWolves -1.557139 4.102273 -0.380 0.7045
HF -0.009934 0.365871 -0.027 0.9784
HS:HomeTeamAston Villa 0.061853 0.328940 0.188 0.8510
HS:HomeTeamBrentford 0.182702 0.332776 0.549 0.5834
HS:HomeTeamBrighton -0.262377 0.425503 -0.617 0.5379
HS:HomeTeamBurnley 0.031909 0.467206 0.068 0.9456
HS:HomeTeamChelsea -0.391915 0.267354 -1.466 0.1437
HS:HomeTeamCrystal Palace 0.105923 0.251576 0.421 0.6740
HS:HomeTeamEverton 0.011687 0.286931 0.041 0.9675
HS:HomeTeamLeeds 0.243965 0.297532 0.820 0.4129
HS:HomeTeamLeicester 0.146672 0.235410 0.623 0.5337
HS:HomeTeamLiverpool -0.109337 0.242931 -0.450 0.6530
HS:HomeTeamMan City 0.558982 0.261335 2.139 0.0332 *
HS:HomeTeamMan United -0.414803 0.258899 -1.602 0.1102
HS:HomeTeamNewcastle -0.025273 0.307031 -0.082 0.9345
HS:HomeTeamNorwich -0.290202 0.363840 -0.798 0.4257
HS:HomeTeamSouthampton 0.053585 0.266735 0.201 0.8409
HS:HomeTeamTottenham 0.208991 0.306756 0.681 0.4962
HS:HomeTeamWatford 0.257706 0.248203 1.038 0.3000
HS:HomeTeamWest Ham 0.147201 0.312313 0.471 0.6378
HS:HomeTeamWolves 0.010637 0.248269 0.043 0.9659
HS:HF 0.001280 0.019025 0.067 0.9464
HomeTeamAston Villa:HF -0.084654 0.465462 -0.182 0.8558
HomeTeamBrentford:HF 0.221657 0.505506 0.438 0.6613
HomeTeamBrighton:HF -0.268913 0.596677 -0.451 0.6525
HomeTeamBurnley:HF -0.085970 0.651113 -0.132 0.8950
HomeTeamChelsea:HF -0.450434 0.447628 -1.006 0.3151
HomeTeamCrystal Palace:HF 0.196644 0.417779 0.471 0.6382
HomeTeamEverton:HF -0.128741 0.491577 -0.262 0.7936
HomeTeamLeeds:HF 0.213802 0.446558 0.479 0.6324
HomeTeamLeicester:HF 0.048414 0.403071 0.120 0.9045
HomeTeamLiverpool:HF -0.289467 0.533930 -0.542 0.5881
HomeTeamMan City:HF 1.036326 0.560155 1.850 0.0653 .
HomeTeamMan United:HF -0.787337 0.499734 -1.576 0.1162
HomeTeamNewcastle:HF -0.073170 0.446549 -0.164 0.8700
HomeTeamNorwich:HF -0.254151 0.505784 -0.502 0.6157
HomeTeamSouthampton:HF 0.024514 0.429623 0.057 0.9545
HomeTeamTottenham:HF 0.187767 0.454811 0.413 0.6800
HomeTeamWatford:HF 0.117282 0.411028 0.285 0.7756
HomeTeamWest Ham:HF 0.451202 0.482874 0.934 0.3508
HomeTeamWolves:HF 0.302944 0.420129 0.721 0.4714
HS:HomeTeamAston Villa:HF -0.005724 0.028970 -0.198 0.8435
HS:HomeTeamBrentford:HF -0.018001 0.032204 -0.559 0.5766
HS:HomeTeamBrighton:HF 0.015377 0.038286 0.402 0.6882
HS:HomeTeamBurnley:HF 0.002547 0.044694 0.057 0.9546
HS:HomeTeamChelsea:HF 0.031442 0.024617 1.277 0.2025
HS:HomeTeamCrystal Palace:HF -0.008544 0.024429 -0.350 0.7268
HS:HomeTeamEverton:HF 0.006583 0.032144 0.205 0.8379
HS:HomeTeamLeeds:HF -0.017376 0.025839 -0.672 0.5018
HS:HomeTeamLeicester:HF -0.010627 0.024129 -0.440 0.6599
HS:HomeTeamLiverpool:HF 0.006820 0.025865 0.264 0.7922
HS:HomeTeamMan City:HF -0.048830 0.028501 -1.713 0.0877 .
HS:HomeTeamMan United:HF 0.046000 0.028069 1.639 0.1023
HS:HomeTeamNewcastle:HF 0.006266 0.028067 0.223 0.8235
HS:HomeTeamNorwich:HF 0.027014 0.033845 0.798 0.4254
HS:HomeTeamSouthampton:HF -0.006353 0.025158 -0.253 0.8008
HS:HomeTeamTottenham:HF -0.011332 0.028415 -0.399 0.6903
HS:HomeTeamWatford:HF -0.013823 0.024557 -0.563 0.5739
HS:HomeTeamWest Ham:HF -0.030930 0.030862 -1.002 0.3171
HS:HomeTeamWolves:HF -0.015949 0.027111 -0.588 0.5568
---
Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
Residual standard error: 1.16 on 300 degrees of freedom
Multiple R-squared: 0.3949, Adjusted R-squared: 0.2355
F-statistic: 2.478 on 79 and 300 DF, p-value: 1.774e-08
check_model(lm1) # Linearity, homogeneity of variance, and normality look good, but the VIFs are high (collinearity).
# Console message: Variable `Component` is not in your data frame :/
3.) Run through a top-down modeling approach to find the best fit model! Be sure to check assumptions after each change and compare performance. What model is the best fit?
# Top-down simplification of lm1: remove interaction terms (and then
# predictors) step by step, re-checking assumptions with check_model() each time.

# HS additive; HomeTeam x HF interaction kept.
lm2 <- lm(FTHG ~ HS + HomeTeam * HF, data = soccer)
check_model(lm2) # We still have high VIF (3 high VIF)
# Console message: Variable `Component` is not in your data frame :/

# HS x HomeTeam interaction kept; HF additive.
lm3 <- lm(FTHG ~ HS * HomeTeam + HF, data = soccer)
check_model(lm3) # We still have high VIF (3 high VIF)
# Console message: Variable `Component` is not in your data frame :/

# Purely additive model.
lm4 <- lm(FTHG ~ HS + HomeTeam + HF, data = soccer)
check_model(lm4) # Homogeneity, linearity, and normality look good. All VIFs are low and close to 1. This can be the best fit model.
# Console message: Variable `Component` is not in your data frame :/

# HF dropped; HS x HomeTeam interaction.
lm5 <- lm(FTHG ~ HS * HomeTeam, data = soccer)
check_model(lm5) # VIFs are high
# Console message: Variable `Component` is not in your data frame :/

# HS dropped; HomeTeam x HF interaction.
lm6 <- lm(FTHG ~ HomeTeam * HF, data = soccer)
check_model(lm6) # VIFs are high
# Console message: Variable `Component` is not in your data frame :/
# 1. Linearity
# 2. Normality
# 3. Equal variance
# 4. Independence
4.) After identifying the best fit model, build the appropriate graph! See our multiple regression tutorial. Next, build a coef plot for the model. Using patchwork, show me a 2-panel figure with the coef plot and the graph for the model.
# lm4 <- lm(FTHG ~ HS + HomeTeam + HF, data = soccer) is the best fit model

# Version 1: one panel per home team, points and fitted lines coloured by the
# number of home fouls (treated as a factor).
lm4g1 <- lm4 %>%
  augment() %>%
  ggplot(aes(x = HS, y = FTHG, color = as.factor(HF))) +
  geom_point() +
  geom_line(aes(y = .fitted)) +
  theme_classic() +
  facet_wrap(~HomeTeam)
lm4g1

# Version 2: a single panel coloured by home team, point size mapped to fouls.
# Line thickness is set with `linewidth`; using `size` for lines is deprecated
# since ggplot2 3.4.0 and raised a warning here.
lm4g2 <- lm4 %>%
  augment() %>%
  ggplot(aes(x = HS, y = FTHG, color = HomeTeam)) +
  geom_point(aes(size = HF)) +
  geom_line(aes(y = .fitted), linewidth = 1) +
  theme_classic()
lm4g2
# build a coef plot for the model
# Step 1: get data frames from the model.
# tidy() returns one row per lm4 term with estimate, std.error, statistic,
# and p.value.
coefs <- lm4 %>%
  tidy(quick = FALSE)
coefs# A tibble: 22 × 5
term estimate std.error statistic p.value
<chr> <dbl> <dbl> <dbl> <dbl>
1 (Intercept) 0.567 0.396 1.43 0.152
2 HS 0.0711 0.0123 5.79 0.0000000151
3 HomeTeamAston Villa 0.105 0.388 0.270 0.787
4 HomeTeamBrentford -0.256 0.388 -0.661 0.509
5 HomeTeamBrighton -0.587 0.384 -1.53 0.127
6 HomeTeamBurnley -0.450 0.389 -1.16 0.248
7 HomeTeamChelsea 0.188 0.383 0.493 0.623
8 HomeTeamCrystal Palace 0.0173 0.389 0.0445 0.965
9 HomeTeamEverton 0.0265 0.389 0.0681 0.946
10 HomeTeamLeeds -0.524 0.388 -1.35 0.178
# … with 12 more rows
# Confidence limits (2.5 % / 97.5 %) for every lm4 coefficient; the row names
# of the confint() matrix are kept as a `term` column.
ci <- data.table(confint(lm4), keep.rownames = "term")
ci term 2.5 % 97.5 %
1: (Intercept) -0.21075026 1.34567111
2: HS 0.04697836 0.09525245
3: HomeTeamAston Villa -0.65790223 0.86770425
4: HomeTeamBrentford -1.01962630 0.50674849
5: HomeTeamBrighton -1.34330647 0.16847085
6: HomeTeamBurnley -1.21505431 0.31484437
7: HomeTeamChelsea -0.56404249 0.94094445
8: HomeTeamCrystal Palace -0.74815835 0.78277336
9: HomeTeamEverton -0.73870205 0.79165904
10: HomeTeamLeeds -1.28763336 0.24012543
11: HomeTeamLeicester -0.40505935 1.12026992
12: HomeTeamLiverpool -0.23029360 1.27539679
13: HomeTeamMan City 0.40498557 1.90586943
14: HomeTeamMan United -0.65931506 0.84986417
15: HomeTeamNewcastle -0.82887822 0.69807362
16: HomeTeamNorwich -1.44200427 0.10053024
17: HomeTeamSouthampton -0.97626468 0.55131870
18: HomeTeamTottenham -0.27164995 1.24653097
19: HomeTeamWatford -1.22220073 0.31782064
20: HomeTeamWest Ham -0.44147985 1.08723763
21: HomeTeamWolves -1.09133427 0.44009047
22: HF -0.03898277 0.03385659
term 2.5 % 97.5 %
# Put the coefficient table and its confidence limits side by side. Rows are
# matched by position, and both tables carry their own `term` column.
cidf <- cbind(coefs, ci)
cidf term estimate std.error statistic p.value
1 (Intercept) 0.56746042 0.39571125 1.43402653 1.524375e-01
2 HS 0.07111540 0.01227341 5.79426619 1.505048e-08
3 HomeTeamAston Villa 0.10490101 0.38787673 0.27044935 7.869704e-01
4 HomeTeamBrentford -0.25643891 0.38807207 -0.66080227 5.091641e-01
5 HomeTeamBrighton -0.58741781 0.38436075 -1.52829811 1.273216e-01
6 HomeTeamBurnley -0.45010497 0.38896800 -1.15717736 2.479714e-01
7 HomeTeamChelsea 0.18845098 0.38263433 0.49250932 6.226612e-01
8 HomeTeamCrystal Palace 0.01730751 0.38923064 0.04446594 9.645578e-01
9 HomeTeamEverton 0.02647849 0.38908557 0.06805314 9.457813e-01
10 HomeTeamLeeds -0.52375396 0.38842394 -1.34840803 1.783796e-01
11 HomeTeamLeicester 0.35760529 0.38780626 0.92212356 3.570851e-01
12 HomeTeamLiverpool 0.52255160 0.38281318 1.36503032 1.731007e-01
13 HomeTeamMan City 1.15542750 0.38159115 3.02792008 2.640982e-03
14 HomeTeamMan United 0.09527456 0.38370020 0.24830468 8.040410e-01
15 HomeTeamNewcastle -0.06540230 0.38821878 -0.16846763 8.663105e-01
16 HomeTeamNorwich -0.67073702 0.39218059 -1.71027591 8.808125e-02
17 HomeTeamSouthampton -0.21247299 0.38837935 -0.54707591 5.846674e-01
18 HomeTeamTottenham 0.48744051 0.38598883 1.26283581 2.074701e-01
19 HomeTeamWatford -0.45219004 0.39154164 -1.15489644 2.489031e-01
20 HomeTeamWest Ham 0.32287889 0.38866769 0.83073252 4.066784e-01
21 HomeTeamWolves -0.32562190 0.38935599 -0.83630895 4.035390e-01
22 HF -0.00256309 0.01851899 -0.13840334 8.899995e-01
term 2.5 % 97.5 %
1 (Intercept) -0.21075026 1.34567111
2 HS 0.04697836 0.09525245
3 HomeTeamAston Villa -0.65790223 0.86770425
4 HomeTeamBrentford -1.01962630 0.50674849
5 HomeTeamBrighton -1.34330647 0.16847085
6 HomeTeamBurnley -1.21505431 0.31484437
7 HomeTeamChelsea -0.56404249 0.94094445
8 HomeTeamCrystal Palace -0.74815835 0.78277336
9 HomeTeamEverton -0.73870205 0.79165904
10 HomeTeamLeeds -1.28763336 0.24012543
11 HomeTeamLeicester -0.40505935 1.12026992
12 HomeTeamLiverpool -0.23029360 1.27539679
13 HomeTeamMan City 0.40498557 1.90586943
14 HomeTeamMan United -0.65931506 0.84986417
15 HomeTeamNewcastle -0.82887822 0.69807362
16 HomeTeamNorwich -1.44200427 0.10053024
17 HomeTeamSouthampton -0.97626468 0.55131870
18 HomeTeamTottenham -0.27164995 1.24653097
19 HomeTeamWatford -1.22220073 0.31782064
20 HomeTeamWest Ham -0.44147985 1.08723763
21 HomeTeamWolves -1.09133427 0.44009047
22 HF -0.03898277 0.03385659
colnames(cidf)[1] "term" "estimate" "std.error" "statistic" "p.value" "term"
[7] "2.5 %" "97.5 %"
# Column 6 is the second `term` column that came in with `ci`; drop it so the
# column names are unique, then give the interval bounds plain names.
cidf <- cidf[, -6]
cidf <- rename(cidf, lower = "2.5 %", upper = "97.5 %")
cidf term estimate std.error statistic p.value
1 (Intercept) 0.56746042 0.39571125 1.43402653 1.524375e-01
2 HS 0.07111540 0.01227341 5.79426619 1.505048e-08
3 HomeTeamAston Villa 0.10490101 0.38787673 0.27044935 7.869704e-01
4 HomeTeamBrentford -0.25643891 0.38807207 -0.66080227 5.091641e-01
5 HomeTeamBrighton -0.58741781 0.38436075 -1.52829811 1.273216e-01
6 HomeTeamBurnley -0.45010497 0.38896800 -1.15717736 2.479714e-01
7 HomeTeamChelsea 0.18845098 0.38263433 0.49250932 6.226612e-01
8 HomeTeamCrystal Palace 0.01730751 0.38923064 0.04446594 9.645578e-01
9 HomeTeamEverton 0.02647849 0.38908557 0.06805314 9.457813e-01
10 HomeTeamLeeds -0.52375396 0.38842394 -1.34840803 1.783796e-01
11 HomeTeamLeicester 0.35760529 0.38780626 0.92212356 3.570851e-01
12 HomeTeamLiverpool 0.52255160 0.38281318 1.36503032 1.731007e-01
13 HomeTeamMan City 1.15542750 0.38159115 3.02792008 2.640982e-03
14 HomeTeamMan United 0.09527456 0.38370020 0.24830468 8.040410e-01
15 HomeTeamNewcastle -0.06540230 0.38821878 -0.16846763 8.663105e-01
16 HomeTeamNorwich -0.67073702 0.39218059 -1.71027591 8.808125e-02
17 HomeTeamSouthampton -0.21247299 0.38837935 -0.54707591 5.846674e-01
18 HomeTeamTottenham 0.48744051 0.38598883 1.26283581 2.074701e-01
19 HomeTeamWatford -0.45219004 0.39154164 -1.15489644 2.489031e-01
20 HomeTeamWest Ham 0.32287889 0.38866769 0.83073252 4.066784e-01
21 HomeTeamWolves -0.32562190 0.38935599 -0.83630895 4.035390e-01
22 HF -0.00256309 0.01851899 -0.13840334 8.899995e-01
lower upper
1 -0.21075026 1.34567111
2 0.04697836 0.09525245
3 -0.65790223 0.86770425
4 -1.01962630 0.50674849
5 -1.34330647 0.16847085
6 -1.21505431 0.31484437
7 -0.56404249 0.94094445
8 -0.74815835 0.78277336
9 -0.73870205 0.79165904
10 -1.28763336 0.24012543
11 -0.40505935 1.12026992
12 -0.23029360 1.27539679
13 0.40498557 1.90586943
14 -0.65931506 0.84986417
15 -0.82887822 0.69807362
16 -1.44200427 0.10053024
17 -0.97626468 0.55131870
18 -0.27164995 1.24653097
19 -1.22220073 0.31782064
20 -0.44147985 1.08723763
21 -1.09133427 0.44009047
22 -0.03898277 0.03385659
cidf$term <- as.factor(cidf$term)
# Make a plot
# Coefficient plot: point = estimate, horizontal bar = 95% confidence interval,
# dashed line = zero. geom_errorbarh() is the horizontal error bar that takes
# xmin/xmax and `height`; geom_errorbar() ignored `height` (with a warning),
# and the lower/upper bounds had been mapped the wrong way round.
coefs_g <- ggplot(data = cidf, aes(x = estimate, y = term)) +
  geom_vline(xintercept = 0, linetype = 2) +
  geom_point(size = 3) +
  geom_errorbarh(aes(xmin = lower, xmax = upper), height = 0.2) +
  theme_classic()
coefs_g
# Two-panel figure with patchwork: model graph on the left, coef plot on the right.
p1 <- lm4g2
p2 <- coefs_g
p1 + p2
Depth
1.) Bootstrap the coef plot from Essential #4, above.
https://rpubs.com/jbaumann3/CIs
# lm4 <- lm(FTHG ~ HS + HomeTeam + HF, data = soccer) is the best fit model.
# Reuse that fitted model under a new name and have a look at the results.
simple_mod <- lm4
summary(simple_mod)
Call:
lm(formula = FTHG ~ HS + HomeTeam + HF, data = soccer)
Residuals:
Min 1Q Median 3Q Max
-2.9901 -0.7640 -0.1282 0.7084 4.6443
Coefficients:
Estimate Std. Error t value Pr(>|t|)
(Intercept) 0.567460 0.395711 1.434 0.15244
HS 0.071115 0.012273 5.794 1.51e-08 ***
HomeTeamAston Villa 0.104901 0.387877 0.270 0.78697
HomeTeamBrentford -0.256439 0.388072 -0.661 0.50916
HomeTeamBrighton -0.587418 0.384361 -1.528 0.12732
HomeTeamBurnley -0.450105 0.388968 -1.157 0.24797
HomeTeamChelsea 0.188451 0.382634 0.493 0.62266
HomeTeamCrystal Palace 0.017308 0.389231 0.044 0.96456
HomeTeamEverton 0.026478 0.389086 0.068 0.94578
HomeTeamLeeds -0.523754 0.388424 -1.348 0.17838
HomeTeamLeicester 0.357605 0.387806 0.922 0.35709
HomeTeamLiverpool 0.522552 0.382813 1.365 0.17310
HomeTeamMan City 1.155428 0.381591 3.028 0.00264 **
HomeTeamMan United 0.095275 0.383700 0.248 0.80404
HomeTeamNewcastle -0.065402 0.388219 -0.168 0.86631
HomeTeamNorwich -0.670737 0.392181 -1.710 0.08808 .
HomeTeamSouthampton -0.212473 0.388379 -0.547 0.58467
HomeTeamTottenham 0.487441 0.385989 1.263 0.20747
HomeTeamWatford -0.452190 0.391542 -1.155 0.24890
HomeTeamWest Ham 0.322879 0.388668 0.831 0.40668
HomeTeamWolves -0.325622 0.389356 -0.836 0.40354
HF -0.002563 0.018519 -0.138 0.89000
---
Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
Residual standard error: 1.174 on 358 degrees of freedom
Multiple R-squared: 0.2597, Adjusted R-squared: 0.2163
F-statistic: 5.982 on 21 and 358 DF, p-value: 3.927e-14
tidy(simple_mod)# A tibble: 22 × 5
term estimate std.error statistic p.value
<chr> <dbl> <dbl> <dbl> <dbl>
1 (Intercept) 0.567 0.396 1.43 0.152
2 HS 0.0711 0.0123 5.79 0.0000000151
3 HomeTeamAston Villa 0.105 0.388 0.270 0.787
4 HomeTeamBrentford -0.256 0.388 -0.661 0.509
5 HomeTeamBrighton -0.587 0.384 -1.53 0.127
6 HomeTeamBurnley -0.450 0.389 -1.16 0.248
7 HomeTeamChelsea 0.188 0.383 0.493 0.623
8 HomeTeamCrystal Palace 0.0173 0.389 0.0445 0.965
9 HomeTeamEverton 0.0265 0.389 0.0681 0.946
10 HomeTeamLeeds -0.524 0.388 -1.35 0.178
# … with 12 more rows
# Bootstrap: a resampling technique.
# Fix the random seed so the resampled intervals can be reproduced
# (any number is fine).
set.seed(1)
# Percentile bootstrap intervals for the terms of the lm4 formula; the
# individual replicate estimates are not kept.
soccer_intervals <- reg_intervals(
  FTHG ~ HS + HomeTeam + HF,
  data = soccer,
  type = "percentile",
  keep_reps = FALSE
)
soccer_intervals# A tibble: 21 × 6
term .lower .estimate .upper .alpha .method
<chr> <dbl> <dbl> <dbl> <dbl> <chr>
1 HF -0.0374 -0.00274 0.0325 0.05 percentile
2 HomeTeamAston Villa -0.615 0.101 0.829 0.05 percentile
3 HomeTeamBrentford -0.963 -0.262 0.397 0.05 percentile
4 HomeTeamBrighton -1.38 -0.584 0.219 0.05 percentile
5 HomeTeamBurnley -1.18 -0.453 0.255 0.05 percentile
6 HomeTeamChelsea -0.668 0.183 1.09 0.05 percentile
7 HomeTeamCrystal Palace -0.708 0.0161 0.723 0.05 percentile
8 HomeTeamEverton -0.724 0.0130 0.748 0.05 percentile
9 HomeTeamLeeds -1.16 -0.523 0.107 0.05 percentile
10 HomeTeamLeicester -0.442 0.357 1.15 0.05 percentile
# … with 11 more rows
# Plot the results: bootstrapped estimate (point) with its percentile interval
# (horizontal bar) for each term; the dashed line marks zero.
soccerboots <- soccer_intervals %>%
  ggplot(aes(x = .estimate, y = term)) +
  geom_vline(xintercept = 0, linetype = 2) +
  geom_errorbarh(aes(xmin = .lower, xmax = .upper), height = 0.2) +
  geom_point(size = 3) +
  theme_classic()
soccerboots
2.) Calculate means and 95% CIs of full time home goals and full time away goals (using bootstrapping). Plot the results and interpret the plot (is there a home advantage or not?)
# Draw a random subset of n = 200 matches from `soccer` without replacement.
# This subset is the "original sample" that the bootstrap below resamples from.
# (With replace = TRUE the same match could be drawn more than once.)
# NOTE(review): this is a subset of the 380 matches, not the full data set;
# resampling from `soccer` itself would use every match.
n <- 200
orig_sample <- slice_sample(soccer, n = n, replace = FALSE)
orig_sample
# A tibble: 200 × 22
Date HomeT…¹ AwayT…² FTHG FTAG FTR HTHG HTAG HTR Referee HS AS
<chr> <chr> <chr> <dbl> <dbl> <chr> <dbl> <dbl> <chr> <chr> <dbl> <dbl>
1 11/0… Wolves Man Ci… 1 5 A 1 3 A M Atki… 7 16
2 19/0… Aston … Watford 0 1 A 0 0 D R Jones 20 8
3 28/1… Chelsea Man Un… 1 1 D 0 0 D A Tayl… 24 3
4 15/0… Newcas… Watford 1 1 D 0 0 D P Tier… 12 18
5 22/0… Man Un… West H… 1 0 H 0 0 D J Moss 18 6
6 12/0… Man Un… Southa… 1 1 D 1 0 H S Attw… 12 13
7 15/0… Totten… Burnley 1 0 H 1 0 H K Frie… 21 8
8 28/0… Bright… Everton 0 2 A 0 1 A J Moss 14 14
9 05/0… Leices… Leeds 1 0 H 0 0 D D Coote 7 19
10 13/0… Chelsea Newcas… 1 0 H 0 0 D D Coote 8 7
# … with 190 more rows, 10 more variables: HST <dbl>, AST <dbl>, HF <dbl>,
# AF <dbl>, HC <dbl>, AC <dbl>, HY <dbl>, AY <dbl>, HR <dbl>, AR <dbl>, and
# abbreviated variable names ¹HomeTeam, ²AwayTeam
# Bootstrap!
# 1000 trials: each one redraws n rows from orig_sample with replacement and
# records the mean home goals and mean away goals of that resample.
# summarize() is called as dplyr::summarize() because the bare call failed with
# 'argument "by" is missing, with no default'.
soccer_bs <- map_dfr(
  1:1000,
  function(trial) {
    resampled <- slice_sample(orig_sample, n = n, replace = TRUE)
    dplyr::summarize(resampled, meanFTHG = mean(FTHG), meanFTAG = mean(FTAG))
  }
) %>%
  mutate(n = n)
soccer_bs # We got means for 1000 trials
# A tibble: 1,000 × 3
meanFTHG meanFTAG n
<dbl> <dbl> <dbl>
1 1.44 1.59 200
2 1.36 1.4 200
3 1.54 1.30 200
4 1.34 1.37 200
5 1.59 1.42 200
6 1.42 1.40 200
7 1.34 1.20 200
8 1.35 1.42 200
9 1.52 1.44 200
10 1.28 1.5 200
# … with 990 more rows
# Mean and 95% CI of FTHG and FTAG from the bootstrap replicates.
# FTHG / FTAG: average of the bootstrapped means.
# CI1 / CI2: interval half-widths, 1.96 x the standard deviation of the
# bootstrapped means (CI1 belongs to FTHG, CI2 to FTAG).
calc_CIs <- dplyr::summarize(
  soccer_bs,
  FTHG = mean(meanFTHG),
  FTAG = mean(meanFTAG),
  CI1 = 1.96 * sd(meanFTHG),
  CI2 = 1.96 * sd(meanFTAG)
)
calc_CIs
# A tibble: 1 × 4
FTHG FTAG CI1 CI2
<dbl> <dbl> <dbl> <dbl>
1 1.45 1.38 0.173 0.172
# Reshape to long form for plotting. The means and the half-widths are pivoted
# in two separate steps, so every team is paired with both CI1 and CI2
# (4 rows in total).
CIslong <- calc_CIs %>%
  pivot_longer(
    FTHG:FTAG,
    names_to = "team",
    values_to = "goals"
  ) %>%
  pivot_longer(
    CI1:CI2,
    names_to = "CIcat",
    values_to = "CI2"
  )
CIslong
# A tibble: 4 × 4
team goals CIcat CI2
<chr> <dbl> <chr> <dbl>
1 FTHG 1.45 CI1 0.173
2 FTHG 1.45 CI2 0.172
3 FTAG 1.38 CI1 0.173
4 FTAG 1.38 CI2 0.172
# Keep row 1 (FTHG with CI1) and row 4 (FTAG with CI2), i.e. the rows where
# each mean sits next to its own half-width.
# Indexing is [rows, columns]; c() concatenates the row numbers into a vector.
CIslong2 <- CIslong[c(1, 4), ]
CIslong2
# A tibble: 2 × 4
team goals CIcat CI2
<chr> <dbl> <chr> <dbl>
1 FTHG 1.45 CI1 0.173
2 FTAG 1.38 CI2 0.172
# Bootstrapped mean goals with 95% CIs for the home (FTHG) and away (FTAG) side.
# Interpretation: the home mean (1.45) is slightly higher than the away mean
# (1.38), but the two intervals (about +/- 0.17 each) overlap almost entirely,
# so this plot does not show a clear home advantage.
CIslong2 %>%
  ggplot(aes(x = team, y = goals, color = team)) +
  geom_point() +
  geom_errorbar(aes(ymin = goals - CI2, ymax = goals + CI2), width = 0.2) +
  theme_classic() +
  ylim(0, NA)
3.) Add raw data behind your 95% CI plot above!
head(soccer)# A tibble: 6 × 22
Date HomeT…¹ AwayT…² FTHG FTAG FTR HTHG HTAG HTR Referee HS AS
<chr> <chr> <chr> <dbl> <dbl> <chr> <dbl> <dbl> <chr> <chr> <dbl> <dbl>
1 13/08… Brentf… Arsenal 2 0 H 1 0 H M Oliv… 8 22
2 14/08… Man Un… Leeds 5 1 H 1 0 H P Tier… 16 10
3 14/08… Burnley Bright… 1 2 A 1 0 H D Coote 14 14
4 14/08… Chelsea Crysta… 3 0 H 2 0 H J Moss 13 4
5 14/08… Everton Southa… 3 1 H 0 1 A A Madl… 14 6
6 14/08… Leices… Wolves 1 0 H 1 0 H C Paws… 9 17
# … with 10 more variables: HST <dbl>, AST <dbl>, HF <dbl>, AF <dbl>, HC <dbl>,
# AC <dbl>, HY <dbl>, AY <dbl>, HR <dbl>, AR <dbl>, and abbreviated variable
# names ¹HomeTeam, ²AwayTeam
soccer# A tibble: 380 × 22
Date HomeT…¹ AwayT…² FTHG FTAG FTR HTHG HTAG HTR Referee HS AS
<chr> <chr> <chr> <dbl> <dbl> <chr> <dbl> <dbl> <chr> <chr> <dbl> <dbl>
1 13/0… Brentf… Arsenal 2 0 H 1 0 H M Oliv… 8 22
2 14/0… Man Un… Leeds 5 1 H 1 0 H P Tier… 16 10
3 14/0… Burnley Bright… 1 2 A 1 0 H D Coote 14 14
4 14/0… Chelsea Crysta… 3 0 H 2 0 H J Moss 13 4
5 14/0… Everton Southa… 3 1 H 0 1 A A Madl… 14 6
6 14/0… Leices… Wolves 1 0 H 1 0 H C Paws… 9 17
7 14/0… Watford Aston … 3 2 H 2 0 H M Dean 13 11
8 14/0… Norwich Liverp… 0 3 A 0 1 A A Marr… 14 19
9 15/0… Newcas… West H… 2 4 A 2 1 H M Atki… 17 8
10 15/0… Totten… Man Ci… 1 0 H 0 0 D A Tayl… 13 18
# … with 370 more rows, 10 more variables: HST <dbl>, AST <dbl>, HF <dbl>,
# AF <dbl>, HC <dbl>, AC <dbl>, HY <dbl>, AY <dbl>, HR <dbl>, AR <dbl>, and
# abbreviated variable names ¹HomeTeam, ²AwayTeam
# Keep only the two goal columns: full-time home goals and full-time away goals.
soccer_1 <- select(soccer, FTHG, FTAG)
soccer_1# A tibble: 380 × 2
FTHG FTAG
<dbl> <dbl>
1 2 0
2 5 1
3 1 2
4 3 0
5 3 1
6 1 0
7 3 2
8 0 3
9 2 4
10 1 0
# … with 370 more rows
# Long form: one row per match and side, with `team` (FTHG or FTAG) and
# `goals`, matching the column names used in CIslong2.
soccer_2 <- pivot_longer(
  soccer_1,
  FTHG:FTAG,
  names_to = "team",
  values_to = "goals"
)
soccer_2 # A tibble: 760 × 2
team goals
<chr> <dbl>
1 FTHG 2
2 FTAG 0
3 FTHG 5
4 FTAG 1
5 FTHG 1
6 FTAG 2
7 FTHG 3
8 FTAG 0
9 FTHG 3
10 FTAG 1
# … with 750 more rows
# 95% CI plot with the raw match-level goals behind it.
# Layers are drawn in the order they are added, so the raw data comes first and
# the bootstrapped means and error bars are drawn on top of it.
# Goal counts are integers, so the raw points are jittered sideways only
# (height = 0 keeps the true counts) and made semi-transparent to show how many
# matches share each value.
ggplot(data = CIslong2, aes(x = team, y = goals, color = team)) +
  geom_jitter(data = soccer_2, width = 0.2, height = 0, alpha = 0.3) +
  geom_point(size = 3) +
  geom_errorbar(aes(ymin = goals - CI2, ymax = goals + CI2), width = 0.2) +
  theme_classic()