library( dplyr )
library( ggplot2 )
library( gridExtra )
library( openintro )
library( tidyverse )
library( modelr )
library( UsingR )
library( broom )
library( plotly )
library( Stat2Data )
Parallel Slopes
What if you have two groups?
#glimpse( mpg )
mpg_manual <- subset( mpg, substr( trans, 1, 1 ) == "m" )
glimpse( mpg_manual )
## Rows: 77
## Columns: 11
## $ manufacturer <chr> "audi", "audi", "audi", "audi", "audi", "audi", "audi", …
## $ model <chr> "a4", "a4", "a4", "a4 quattro", "a4 quattro", "a4 quattr…
## $ displ <dbl> 1.8, 2.0, 2.8, 1.8, 2.0, 2.8, 3.1, 5.7, 6.2, 7.0, 3.7, 3…
## $ year <int> 1999, 2008, 1999, 1999, 2008, 1999, 2008, 1999, 2008, 20…
## $ cyl <int> 4, 4, 6, 4, 4, 6, 6, 8, 8, 8, 6, 6, 8, 8, 8, 8, 8, 6, 6,…
## $ trans <chr> "manual(m5)", "manual(m6)", "manual(m5)", "manual(m5)", …
## $ drv <chr> "f", "f", "f", "4", "4", "4", "4", "r", "r", "r", "4", "…
## $ cty <int> 21, 20, 18, 18, 20, 17, 15, 16, 16, 15, 15, 14, 11, 12, …
## $ hwy <int> 29, 31, 26, 26, 28, 25, 25, 26, 26, 24, 19, 17, 17, 16, …
## $ fl <chr> "p", "p", "p", "p", "p", "p", "p", "p", "p", "p", "r", "…
## $ class <chr> "compact", "compact", "compact", "compact", "compact", "…
ggplot( data = mpg_manual, aes( x = displ, y = hwy ) ) +
geom_point()
The data above are from two different years
unique( mpg_manual$year )
## [1] 1999 2008
Did the mileage change between these two years?
#a look at fuel efficiency
ggplot( data = mpg_manual, aes( x = factor( year ), y = hwy ) ) +
geom_boxplot()
We want to assess the effects of engine size and year simultaneously.
Parallel Lines Model: used when one of the explanatory variables is categorical and the other is numeric.
mod <- lm( hwy ~ displ + factor( year ), data = mpg )
summary( mod )
##
## Call:
## lm(formula = hwy ~ displ + factor(year), data = mpg)
##
## Residuals:
## Min 1Q Median 3Q Max
## -7.7616 -2.5187 -0.2899 1.8701 15.5852
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 35.2757 0.7257 48.610 < 2e-16 ***
## displ -3.6110 0.1938 -18.630 < 2e-16 ***
## factor(year)2008 1.4021 0.4998 2.806 0.00545 **
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 3.78 on 231 degrees of freedom
## Multiple R-squared: 0.6004, Adjusted R-squared: 0.5969
## F-statistic: 173.5 on 2 and 231 DF, p-value: < 2.2e-16
glimpse( mariokart )
## Rows: 143
## Columns: 12
## $ id <dbl> 150377422259, 260483376854, 320432342985, 280405224677, 1…
## $ duration <int> 3, 7, 3, 3, 1, 3, 1, 1, 3, 7, 1, 1, 1, 1, 7, 7, 3, 3, 1, …
## $ n_bids <int> 20, 13, 16, 18, 20, 19, 13, 15, 29, 8, 15, 15, 13, 16, 6,…
## $ cond <fct> new, used, new, new, new, new, used, new, used, used, new…
## $ start_pr <dbl> 0.99, 0.99, 0.99, 0.99, 0.01, 0.99, 0.01, 1.00, 0.99, 19.…
## $ ship_pr <dbl> 4.00, 3.99, 3.50, 0.00, 0.00, 4.00, 0.00, 2.99, 4.00, 4.0…
## $ total_pr <dbl> 51.55, 37.04, 45.50, 44.00, 71.00, 45.00, 37.02, 53.99, 4…
## $ ship_sp <fct> standard, firstClass, firstClass, standard, media, standa…
## $ seller_rate <int> 1580, 365, 998, 7, 820, 270144, 7284, 4858, 27, 201, 4858…
## $ stock_photo <fct> yes, yes, no, yes, yes, yes, yes, yes, yes, no, yes, yes,…
## $ wheels <int> 1, 1, 1, 1, 2, 0, 0, 2, 1, 1, 2, 2, 2, 2, 1, 0, 1, 1, 2, …
## $ title <fct> "~~ Wii MARIO KART & WHEEL ~ NINTENDO Wii ~ BRAND NEW…
# fit parallel slopes
lm( total_pr ~ wheels + cond, data = mariokart )
##
## Call:
## lm(formula = total_pr ~ wheels + cond, data = mariokart)
##
## Coefficients:
## (Intercept) wheels condused
## 37.6673 10.2161 0.8457
Visualizing Parallel Slopes Models
ggplot( data = mpg_manual, aes( x = displ, y = hwy, color = factor( year ) ) ) +
geom_point()
Setting up our Model
- Define
\[newer = \left\{ \begin{array}{rcl} 1 & \mbox{if } year = 2008 \\ 0 & \mbox{if } year = 1999 \end{array}\right. \]
- Our model is:
\[\hat{hwy} = \hat{\beta_0} + \hat{\beta}_1 \cdot displ + \hat{\beta}_2 \cdot newer\]
Use R’s lm() to find the coefficients
mod
##
## Call:
## lm(formula = hwy ~ displ + factor(year), data = mpg)
##
## Coefficients:
## (Intercept) displ factor(year)2008
## 35.276 -3.611 1.402
From this, we can define the parallel line fits for this model:
For year = 2008, we have:
\[\hat{hwy} = 35.276 - 3.611 \cdot displ + 1.402 \cdot (1) = (35.276 + 1.402) - 3.611 \cdot displ\]
For year = 1999, we have: \[\hat{hwy} = 35.276 - 3.611 \cdot displ + 1.402 \cdot (0) = 35.276 - 3.611 \cdot displ\]
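The same intercepts and the shared slope can be read directly off the fitted model object; a small sketch using coef() on mod from above:
beta <- coef( mod )
beta[ "(Intercept)" ]                                # 1999 intercept: 35.276
beta[ "(Intercept)" ] + beta[ "factor(year)2008" ]   # 2008 intercept: 35.276 + 1.402
beta[ "displ" ]                                      # shared slope: -3.611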
Visualizing the parallel lines:
#glimpse( augment( mod ) )
ggplot( data = mpg_manual, aes( x = displ, y = hwy, color = factor( year ) ) ) +
geom_point() +
geom_line( data = augment( mod ), aes( y = .fitted, color = `factor(year)` ) )
# Augment the model
mariokart_filt <- mariokart %>%
filter(total_pr < 100)
mod2 <- lm( total_pr ~ wheels + cond, data = mariokart_filt )
augmented_mod <- augment( mod2 )
#glimpse(augmented_mod)
# scatterplot, with color
data_space <- ggplot(data = augmented_mod, aes(x = wheels, y = total_pr, color = cond)) +
geom_point()
# single call to geom_line()
data_space +
geom_line(aes(y = .fitted, color = cond))
Interpreting parallel slopes coefficients
Common misunderstandings when interpreting parallel slopes models:
- There is only one slope. Yes, there are 2 lines, but only the numeric explanatory variable is associated with a slope.
- What is the reference level of the categorical feature? (See the sketch after this list.)
- What are the units?
- “After controlling for …”: coefficients need to be interpreted in the context of the other explanatory variables in the model.
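A small sketch (reusing the mariokart fit from above) showing how to inspect the reference level of cond and change it with relevel(); the name mariokart_relevel is my own:
levels( mariokart$cond )      # "new" is listed first, so it is the reference level
mariokart_relevel <- mariokart %>%
mutate( cond = relevel( cond, ref = "used" ) )
lm( total_pr ~ wheels + cond, data = mariokart_relevel )   # now reports a condnew coefficient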
Three ways to describe a model
Mathematical Description \[\mbox{Equation: }y = \beta_0 + \beta_1x_1 + \beta_2x_2 + \epsilon\] \[\mbox{Residuals: } \epsilon \sim N(0,\sigma_{\epsilon})\] \[\mbox{Coefficients: } \beta_0, \beta_1, \beta_2\]
Geometric Description: visualizing the model graphically
Syntactic Description: using R
Multiple Regression:
- \(y = \beta_0 + \beta_1x_1 + \beta_2x_2 + \cdots + \beta_px_p + \epsilon\)
- `y ~ x1 + x2 + … + xp`
- one best fit line becomes multiple lines or a plane or even multiple planes.
glimpse( babies )
## Rows: 1,236
## Columns: 23
## $ id <dbl> 15, 20, 58, 61, 72, 100, 102, 129, 142, 148, 164, 171, 175,…
## $ pluralty <dbl> 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5,…
## $ outcome <dbl> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,…
## $ date <dbl> 1411, 1499, 1576, 1504, 1425, 1673, 1449, 1562, 1408, 1568,…
## $ gestation <dbl> 284, 282, 279, 999, 282, 286, 244, 245, 289, 299, 351, 282,…
## $ sex <dbl> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,…
## $ wt <dbl> 120, 113, 128, 123, 108, 136, 138, 132, 120, 143, 140, 144,…
## $ parity <dbl> 1, 2, 1, 2, 1, 4, 4, 2, 3, 3, 2, 4, 3, 5, 3, 4, 3, 3, 2, 3,…
## $ race <dbl> 8, 0, 0, 0, 0, 0, 7, 7, 0, 0, 0, 0, 0, 8, 7, 7, 4, 3, 0, 0,…
## $ age <dbl> 27, 33, 28, 36, 23, 25, 33, 23, 25, 30, 27, 32, 23, 36, 30,…
## $ ed <dbl> 5, 5, 2, 5, 5, 2, 2, 1, 4, 5, 5, 2, 1, 5, 2, 2, 7, 2, 2, 2,…
## $ ht <dbl> 62, 64, 64, 69, 67, 62, 62, 65, 62, 66, 68, 64, 63, 61, 63,…
## $ wt1 <dbl> 100, 135, 115, 190, 125, 93, 178, 140, 125, 136, 120, 124, …
## $ drace <dbl> 8, 0, 5, 3, 0, 3, 7, 7, 3, 0, 5, 0, 5, 0, 7, 7, 7, 3, 5, 0,…
## $ dage <dbl> 31, 38, 32, 43, 24, 28, 37, 23, 26, 34, 28, 36, 28, 32, 42,…
## $ ded <dbl> 5, 5, 1, 4, 5, 2, 4, 4, 1, 5, 4, 1, 2, 5, 0, 0, 1, 2, 4, 2,…
## $ dht <dbl> 65, 70, 99, 68, 99, 64, 99, 71, 70, 99, 99, 74, 99, 99, 99,…
## $ dwt <dbl> 110, 148, 999, 197, 999, 130, 999, 192, 180, 999, 999, 185,…
## $ marital <dbl> 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,…
## $ inc <dbl> 1, 4, 2, 8, 1, 4, 98, 2, 2, 2, 98, 2, 2, 2, 1, 1, 1, 4, 7, …
## $ smoke <dbl> 0, 0, 1, 3, 1, 2, 0, 0, 0, 1, 3, 1, 1, 1, 0, 0, 1, 1, 0, 1,…
## $ time <dbl> 0, 0, 1, 5, 1, 2, 0, 0, 0, 1, 4, 1, 1, 1, 0, 0, 1, 1, 0, 1,…
## $ number <dbl> 0, 0, 1, 5, 5, 2, 0, 0, 0, 4, 2, 1, 1, 2, 0, 0, 5, 5, 0, 6,…
# build model
mod3 <- lm( data = babies, wt ~ age + factor( smoke ) )
mod3
##
## Call:
## lm(formula = wt ~ age + factor(smoke), data = babies)
##
## Coefficients:
## (Intercept) age factor(smoke)1 factor(smoke)2 factor(smoke)3
## 121.76727 0.03655 -8.64003 0.33830 1.62719
## factor(smoke)9
## 3.83272
Evaluating and Extending Parallel Slopes Model
Model fit, residuals, and prediction
If the model fits the data better, the residuals are smaller, the SSE is smaller and the \(R^2\) is larger.
Models that incorporate more features automatically have a larger \(R^2\) value than simple univariate regression models. Therefore, to make appropriate comparisons of model performance, the adjusted \(R^2\) value is preferred: \[R^2_{adj} = 1 - \frac{SSE}{SST} \cdot \frac{n-1}{n-p-1}\] \(R^2_{adj}\) applies a penalty that grows with the number of explanatory variables \(p\) in the model.
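A quick check of the adjusted \(R^2\) formula against summary(), reusing the mpg model mod fitted above (a sketch; the intermediate variable names are my own):
n_obs <- nobs( mod )
p <- length( coef( mod ) ) - 1                # number of explanatory terms
sse <- sum( residuals( mod )^2 )
sst <- sum( ( mpg$hwy - mean( mpg$hwy ) )^2 )
1 - ( sse / sst ) * ( n_obs - 1 ) / ( n_obs - p - 1 )   # manual adjusted R^2
summary( mod )$adj.r.squared                            # reported value: 0.5969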
#return just the predicted values:
predict( mod )
## 1 2 3 4 5 6 7 8
## 28.77593 28.77593 29.45587 29.45587 25.16494 25.16494 25.48379 28.77593
## 9 10 11 12 13 14 15 16
## 28.77593 29.45587 29.45587 25.16494 25.16494 25.48379 25.48379 25.16494
## 17 18 19 20 21 22 23 24
## 25.48379 21.51170 17.53962 17.53962 17.53962 14.69308 15.01193 14.69308
## 25 26 27 28 29 30 31 32
## 14.69308 14.28973 14.28973 11.40094 17.53962 17.53962 14.69308 11.80430
## 33 34 35 36 37 38 39 40
## 26.60934 28.01148 24.08165 24.03939 23.67829 26.60934 24.44275 23.35945
## 41 42 43 44 45 46 47 48
## 23.35945 24.76159 24.76159 24.76159 21.55396 21.55396 22.95610 22.23390
## 49 50 51 52 53 54 55 56
## 23.31719 23.31719 21.19286 21.19286 19.70621 19.70621 19.70621 16.49858
## 57 58 59 60 61 62 63 64
## 16.49858 21.19286 19.70621 19.70621 19.70621 16.49858 16.09522 13.97089
## 65 66 67 68 69 70 71 72
## 19.70621 19.70621 19.70621 19.70621 19.70621 19.70621 16.49858 16.49858
## 73 74 75 76 77 78 79 80
## 16.09522 13.97089 18.66517 15.77638 17.17852 20.83176 20.83176 20.83176
## 81 82 83 84 85 86 87 88
## 22.23390 20.06731 17.22078 20.10956 20.10956 18.66517 18.66517 20.06731
## 89 90 91 92 93 94 95 96
## 15.77638 17.17852 21.55396 21.55396 22.23390 22.23390 18.66517 18.66517
## 97 98 99 100 101 102 103 104
## 20.06731 20.06731 17.17852 29.49813 29.49813 29.49813 29.49813 29.49813
## 105 106 107 108 109 110 111 112
## 30.17807 30.17807 30.17807 29.45587 26.60934 26.60934 28.01148 28.01148
## 113 114 115 116 117 118 119 120
## 26.24824 26.24824 24.76159 28.05373 28.05373 29.45587 29.45587 26.92818
## 121 122 123 124 125 126 127 128
## 26.92818 26.92818 25.84488 23.31719 20.83176 18.30407 19.70621 19.70621
## 129 130 131 132 133 134 135 136
## 16.09522 14.65083 20.83176 21.51170 20.78950 18.66517 15.77638 15.77638
## 137 138 139 140 141 142 143 144
## 17.17852 20.83176 22.23390 20.06731 17.22078 26.60934 26.60934 27.65038
## 145 146 147 148 149 150 151 152
## 27.65038 24.03939 24.03939 24.44275 24.44275 24.03939 23.35945 23.35945
## 153 154 155 156 157 158 159 160
## 22.23390 16.45632 24.08165 21.55396 21.55396 22.95610 17.53962 26.24824
## 161 162 163 164 165 166 167 168
## 26.24824 27.65038 27.65038 27.65038 27.65038 27.33154 27.33154 26.24824
## 169 170 171 172 173 174 175 176
## 26.24824 27.65038 27.65038 27.65038 27.65038 25.52604 25.52604 22.99835
## 177 178 179 180 181 182 183 184
## 22.99835 22.23390 19.70621 27.33154 27.33154 28.01148 28.01148 24.44275
## 185 186 187 188 189 190 191 192
## 24.44275 24.03939 27.33154 27.33154 28.01148 28.01148 24.44275 24.44275
## 193 194 195 196 197 198 199 200
## 24.76159 28.77593 28.77593 28.77593 30.17807 30.17807 18.30407 16.09522
## 201 202 203 204 205 206 207 208
## 25.52604 25.52604 26.92818 22.99835 22.99835 22.23390 22.23390 28.05373
## 209 210 211 212 213 214 215 216
## 28.05373 29.45587 29.45587 25.16494 28.41483 28.05373 28.05373 29.45587
## 217 218 219 220 221 222 223 224
## 29.45587 27.65038 27.65038 25.16494 25.16494 28.41483 28.41483 28.05373
## 225 226 227 228 229 230 231 232
## 28.05373 27.65038 27.65038 28.77593 28.77593 29.45587 29.45587 25.16494
## 233 234
## 25.16494 23.67829
#return a dataframe with the fitted values, data and other metrics
augment( mod )
## # A tibble: 234 x 8
## hwy displ `factor(year)` .fitted .std.resid .hat .sigma .cooksd
## <int> <dbl> <fct> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 29 1.8 1999 28.8 0.0597 0.0143 3.79 0.0000173
## 2 29 1.8 1999 28.8 0.0597 0.0143 3.79 0.0000173
## 3 31 2 2008 29.5 0.412 0.0158 3.79 0.000908
## 4 30 2 2008 29.5 0.145 0.0158 3.79 0.000113
## 5 26 2.8 1999 25.2 0.222 0.00916 3.79 0.000152
## 6 26 2.8 1999 25.2 0.222 0.00916 3.79 0.000152
## 7 27 3.1 2008 25.5 0.403 0.00938 3.79 0.000512
## 8 26 1.8 1999 28.8 -0.740 0.0143 3.78 0.00265
## 9 25 1.8 1999 28.8 -1.01 0.0143 3.78 0.00490
## 10 28 2 2008 29.5 -0.388 0.0158 3.79 0.000807
## # … with 224 more rows
#some new data
new_obs <- data.frame( displ = 1.8, year = 2008 )
#return a prediction for this new data
predict( mod, newdata = new_obs )
## 1
## 30.17807
#return a data.frame
augment( mod, newdata = new_obs )
## # A tibble: 1 x 3
## displ year .fitted
## <dbl> <dbl> <dbl>
## 1 1.8 2008 30.2
glimpse( mariokart )
## Rows: 143
## Columns: 12
## $ id <dbl> 150377422259, 260483376854, 320432342985, 280405224677, 1…
## $ duration <int> 3, 7, 3, 3, 1, 3, 1, 1, 3, 7, 1, 1, 1, 1, 7, 7, 3, 3, 1, …
## $ n_bids <int> 20, 13, 16, 18, 20, 19, 13, 15, 29, 8, 15, 15, 13, 16, 6,…
## $ cond <fct> new, used, new, new, new, new, used, new, used, used, new…
## $ start_pr <dbl> 0.99, 0.99, 0.99, 0.99, 0.01, 0.99, 0.01, 1.00, 0.99, 19.…
## $ ship_pr <dbl> 4.00, 3.99, 3.50, 0.00, 0.00, 4.00, 0.00, 2.99, 4.00, 4.0…
## $ total_pr <dbl> 51.55, 37.04, 45.50, 44.00, 71.00, 45.00, 37.02, 53.99, 4…
## $ ship_sp <fct> standard, firstClass, firstClass, standard, media, standa…
## $ seller_rate <int> 1580, 365, 998, 7, 820, 270144, 7284, 4858, 27, 201, 4858…
## $ stock_photo <fct> yes, yes, no, yes, yes, yes, yes, yes, yes, no, yes, yes,…
## $ wheels <int> 1, 1, 1, 1, 2, 0, 0, 2, 1, 1, 2, 2, 2, 2, 1, 0, 1, 1, 2, …
## $ title <fct> "~~ Wii MARIO KART & WHEEL ~ NINTENDO Wii ~ BRAND NEW…
mod_mk1 <- lm( data = mariokart, total_pr ~ wheels + cond )
# R^2 and adjusted R^2
summary( mod_mk1 )
##
## Call:
## lm(formula = total_pr ~ wheels + cond, data = mariokart)
##
## Residuals:
## Min 1Q Median 3Q Max
## -15.719 -6.462 -2.513 0.487 267.565
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 37.6673 5.2795 7.135 4.78e-11 ***
## wheels 10.2161 2.6740 3.821 0.0002 ***
## condused 0.8457 4.5855 0.184 0.8539
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 24.42 on 140 degrees of freedom
## Multiple R-squared: 0.1091, Adjusted R-squared: 0.09638
## F-statistic: 8.573 on 2 and 140 DF, p-value: 0.0003075
# add random noise
mario_kart_noisy <- mariokart %>% mutate( noise = rnorm( n() ) )
# compute new model
mod_mk2 <- lm( data = mario_kart_noisy, total_pr ~ wheels + cond + noise )
# new R^2 and adjusted R^2
summary( mod_mk2 )
##
## Call:
## lm(formula = total_pr ~ wheels + cond + noise, data = mario_kart_noisy)
##
## Residuals:
## Min 1Q Median 3Q Max
## -16.605 -6.658 -2.503 1.231 266.150
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 37.552 5.288 7.101 5.84e-11 ***
## wheels 10.181 2.678 3.802 0.000214 ***
## condused 1.017 4.596 0.221 0.825239
## noise 1.805 2.248 0.803 0.423354
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 24.45 on 139 degrees of freedom
## Multiple R-squared: 0.1132, Adjusted R-squared: 0.09408
## F-statistic: 5.916 on 3 and 139 DF, p-value: 0.0007913
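Note what happened: the multiple \(R^2\) increased slightly even though the noise term carries no information, while the adjusted \(R^2\) decreased. A quick side-by-side check of the two fits (values from the run above):
summary( mod_mk1 )$r.squared       # 0.1091
summary( mod_mk2 )$r.squared       # 0.1132 (always increases when a term is added)
summary( mod_mk1 )$adj.r.squared   # 0.0964
summary( mod_mk2 )$adj.r.squared   # 0.0941 (penalized for the useless noise term)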
Understanding interaction
Interaction terms allow the effect of one explanatory variable on the response to change with the level of another explanatory variable.
ex:
\[\hat{mpg} = \hat{\beta_0} + \hat{\beta_1} \cdot displ + \hat{\beta_2}\cdot is\_newer + \hat{\beta_3} \cdot displ \cdot is\_newer\] This results in two best fit lines:
- For older cars: \(\hat{mpg} = \hat{\beta_0} + \hat{\beta_1} \cdot displ\)
- For newer cars: \(\hat{mpg} = (\hat{\beta_0} + \hat{\beta_2}) + (\hat{\beta_1} + \hat{\beta_3})\cdot displ\)
Syntax for Interactions:
# include interaction
# the : operator specifies an interaction term (the product of the two variables)
lm( total_pr ~ cond + duration + cond:duration, data = mariokart_filt )
##
## Call:
## lm(formula = total_pr ~ cond + duration + cond:duration, data = mariokart_filt)
##
## Coefficients:
## (Intercept) condused duration condused:duration
## 58.268 -17.122 -1.966 2.325
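In R’s formula syntax, cond * duration is shorthand for cond + duration + cond:duration, so the same interaction model can be written more compactly:
lm( total_pr ~ cond * duration, data = mariokart_filt )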
# interaction plot
ggplot(data = mariokart_filt, aes(x = duration, y = total_pr, color = cond)) +
geom_point() +
geom_smooth(method = 'lm', se = FALSE )
## `geom_smooth()` using formula 'y ~ x'
Simpson’s Paradox
glimpse( SAT )
## Rows: 50
## Columns: 8
## $ state <fct> Alabama, Alaska, Arizona, Arkansas, California, Colorado, Conn…
## $ expend <dbl> 4.405, 8.963, 4.778, 4.459, 4.992, 5.443, 8.817, 7.030, 5.718,…
## $ ratio <dbl> 17.2, 17.6, 19.3, 17.1, 24.0, 18.4, 14.4, 16.6, 19.1, 16.3, 17…
## $ salary <dbl> 31.144, 47.951, 32.175, 28.934, 41.078, 34.571, 50.045, 39.076…
## $ perc <int> 8, 47, 27, 6, 45, 29, 81, 68, 48, 65, 57, 15, 13, 58, 5, 9, 11…
## $ verbal <int> 491, 445, 448, 482, 417, 462, 431, 429, 420, 406, 407, 468, 48…
## $ math <int> 538, 489, 496, 523, 485, 518, 477, 468, 469, 448, 482, 511, 56…
## $ total <int> 1029, 934, 944, 1005, 902, 980, 908, 897, 889, 854, 889, 979, …
ggplot( data = SAT, aes( x = salary, y = total ) ) +
geom_point() +
geom_smooth( method = 'lm', se = 0 )
## `geom_smooth()` using formula 'y ~ x'
Revisualize the data using a parallel slopes model to evaluate the effect of the percentage of students taking the SAT:
SAT_wbin <- SAT %>%
mutate( sat_bin = cut( perc, 3 ) )
mod_satbin <- lm( formula = total ~ salary + sat_bin, data = SAT_wbin )
mod_satbin
##
## Call:
## lm(formula = total ~ salary + sat_bin, data = SAT_wbin)
##
## Coefficients:
## (Intercept) salary sat_bin(29.7,55.3] sat_bin(55.3,81.1]
## 1000.7173 0.8697 -116.3174 -143.5428
glimpse( SAT_wbin )
## Rows: 50
## Columns: 9
## $ state <fct> Alabama, Alaska, Arizona, Arkansas, California, Colorado, Con…
## $ expend <dbl> 4.405, 8.963, 4.778, 4.459, 4.992, 5.443, 8.817, 7.030, 5.718…
## $ ratio <dbl> 17.2, 17.6, 19.3, 17.1, 24.0, 18.4, 14.4, 16.6, 19.1, 16.3, 1…
## $ salary <dbl> 31.144, 47.951, 32.175, 28.934, 41.078, 34.571, 50.045, 39.07…
## $ perc <int> 8, 47, 27, 6, 45, 29, 81, 68, 48, 65, 57, 15, 13, 58, 5, 9, 1…
## $ verbal <int> 491, 445, 448, 482, 417, 462, 431, 429, 420, 406, 407, 468, 4…
## $ math <int> 538, 489, 496, 523, 485, 518, 477, 468, 469, 448, 482, 511, 5…
## $ total <int> 1029, 934, 944, 1005, 902, 980, 908, 897, 889, 854, 889, 979,…
## $ sat_bin <fct> "(3.92,29.7]", "(29.7,55.3]", "(3.92,29.7]", "(3.92,29.7]", "…
ggplot( data = SAT_wbin, aes( x = salary, y = total, color = sat_bin ) ) +
geom_point() +
geom_line( data = broom::augment( mod_satbin ), aes( y = .fitted ) )
Ignoring the SAT participation percentage, there was a negative trend: SAT scores decreased as teacher salary increased. However, once the participation percentage is discretized into bins, every group shows a positive trend as a function of salary. This is an example of Simpson’s paradox: the marginal association between two variables is qualitatively different from the partial association between the same two variables after controlling for one or more other variables.
When Simpson’s paradox is present, the relationship between variables changes when subgroups are considered, and the subgrouping variable is an important effect to include in the model.
slr <- ggplot(mariokart_filt, aes(y = total_pr, x = duration)) +
geom_point() +
geom_smooth(method = "lm", se = FALSE)
# model with one slope
lm( data = mariokart_filt, total_pr ~ duration )
##
## Call:
## lm(formula = total_pr ~ duration, data = mariokart_filt)
##
## Coefficients:
## (Intercept) duration
## 52.374 -1.317
# plot with two slopes
slr2 <- slr + aes(color = cond )
grid.arrange( slr, slr2, ncol = 2 )
## `geom_smooth()` using formula 'y ~ x'
## `geom_smooth()` using formula 'y ~ x'
Multiple Regression
Adding a Numerical Explanatory Variable
Mathematical: \(\hat{wt} = \hat{\beta_0} + \hat{\beta_1} \cdot gestation + \hat{\beta_2} \cdot age\)
- Syntactical:
lm( wt ~ gestation + age, data = babies )
Geometric:
#glimpse( babies )
babies_filt <- babies %>%
filter( age < 50 & gestation < 500 )
data_space <- ggplot( babies_filt, aes( x = gestation, y = age ) ) +
geom_point( aes( color = wt ) )
data_space
Tiling the Plane
grid <- babies_filt %>%
data_grid(
gestation = seq_range( gestation, by = 1 ),
age = seq_range( age, by = 1 )
)
mod_tiling <- lm( wt ~ gestation + age, data = babies )
bwt_hats <- augment( mod_tiling, newdata = grid )
#glimpse( bwt_hats )
data_space + geom_tile( data = bwt_hats, aes( fill = .fitted, alpha = 0.5 ) ) +
scale_fill_continuous( 'wt', limits = range( babies_filt$wt ))
Adding a Third (categorical) variable
glimpse( babies_filt )
## Rows: 1,221
## Columns: 23
## $ id <dbl> 15, 20, 58, 72, 100, 102, 129, 142, 148, 164, 171, 175, 183…
## $ pluralty <dbl> 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5,…
## $ outcome <dbl> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,…
## $ date <dbl> 1411, 1499, 1576, 1425, 1673, 1449, 1562, 1408, 1568, 1554,…
## $ gestation <dbl> 284, 282, 279, 282, 286, 244, 245, 289, 299, 351, 282, 279,…
## $ sex <dbl> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,…
## $ wt <dbl> 120, 113, 128, 108, 136, 138, 132, 120, 143, 140, 144, 141,…
## $ parity <dbl> 1, 2, 1, 1, 4, 4, 2, 3, 3, 2, 4, 3, 5, 3, 4, 3, 3, 2, 3, 3,…
## $ race <dbl> 8, 0, 0, 0, 0, 7, 7, 0, 0, 0, 0, 0, 8, 7, 7, 4, 3, 0, 0, 9,…
## $ age <dbl> 27, 33, 28, 23, 25, 33, 23, 25, 30, 27, 32, 23, 36, 30, 38,…
## $ ed <dbl> 5, 5, 2, 5, 2, 2, 1, 4, 5, 5, 2, 1, 5, 2, 2, 7, 2, 2, 2, 2,…
## $ ht <dbl> 62, 64, 64, 67, 62, 62, 65, 62, 66, 68, 64, 63, 61, 63, 63,…
## $ wt1 <dbl> 100, 135, 115, 125, 93, 178, 140, 125, 136, 120, 124, 128, …
## $ drace <dbl> 8, 0, 5, 0, 3, 7, 7, 3, 0, 5, 0, 5, 0, 7, 7, 7, 3, 5, 0, 10…
## $ dage <dbl> 31, 38, 32, 24, 28, 37, 23, 26, 34, 28, 36, 28, 32, 42, 37,…
## $ ded <dbl> 5, 5, 1, 5, 2, 4, 4, 1, 5, 4, 1, 2, 5, 0, 0, 1, 2, 4, 2, 1,…
## $ dht <dbl> 65, 70, 99, 99, 64, 99, 71, 70, 99, 99, 74, 99, 99, 99, 71,…
## $ dwt <dbl> 110, 148, 999, 999, 130, 999, 192, 180, 999, 999, 185, 999,…
## $ marital <dbl> 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,…
## $ inc <dbl> 1, 4, 2, 1, 4, 98, 2, 2, 2, 98, 2, 2, 2, 1, 1, 1, 4, 7, 5, …
## $ smoke <dbl> 0, 0, 1, 1, 2, 0, 0, 0, 1, 3, 1, 1, 1, 0, 0, 1, 1, 0, 1, 0,…
## $ time <dbl> 0, 0, 1, 1, 2, 0, 0, 0, 1, 4, 1, 1, 1, 0, 0, 1, 1, 0, 1, 0,…
## $ number <dbl> 0, 0, 1, 5, 2, 0, 0, 0, 4, 2, 1, 1, 2, 0, 0, 5, 5, 0, 6, 0,…
mod_bab3 <- lm( wt ~ gestation + age + smoke, data = babies_filt )
mod_bab3
##
## Call:
## lm(formula = wt ~ gestation + age + smoke, data = babies_filt)
##
## Coefficients:
## (Intercept) gestation age smoke
## -15.5181 0.4680 0.1661 -0.1388
Geometry:
- numeric + categorical: parallel lines
- numeric + numeric: a plane
- numeric + numeric + categorical: parallel planes
Use plotly for a parallel-planes visualization; a sketch follows below.
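A minimal plotly sketch of the parallel planes, reusing mod_bab3 from above; the grid resolution, the helper plane_for(), and the marker size are my own choices:
ges_seq <- seq( min( babies_filt$gestation ), max( babies_filt$gestation ), length.out = 30 )
age_seq <- seq( min( babies_filt$age ), max( babies_filt$age ), length.out = 30 )
# predicted weight over the grid for a given smoke value, as a matrix (rows = age, cols = gestation)
plane_for <- function( smoke_level ) {
grid <- expand.grid( gestation = ges_seq, age = age_seq )
grid$smoke <- smoke_level
matrix( predict( mod_bab3, newdata = grid ), nrow = length( age_seq ), byrow = TRUE )
}
plot_ly( babies_filt, x = ~gestation, y = ~age, z = ~wt ) %>%
add_markers( marker = list( size = 2 ) ) %>%
add_surface( x = ges_seq, y = age_seq, z = plane_for( 0 ), showscale = FALSE ) %>%
add_surface( x = ges_seq, y = age_seq, z = plane_for( 1 ), showscale = FALSE )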
Higher Dimensions
Kitchen sink model: throw everything in and see what happens
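For example, a kitchen-sink fit on the babies data might look like the sketch below (purely illustrative; several of these columns are coded identifiers rather than meaningful predictors, which is exactly why this approach is risky):
mod_sink <- lm( wt ~ ., data = babies_filt )
summary( mod_sink )$adj.r.squared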
Logistic Regression
What is Logistic Regression?
A regression model for a categorical (binary) response variable.
glimpse( heart_transplant )
## Rows: 103
## Columns: 8
## $ id <int> 15, 43, 61, 75, 6, 42, 54, 38, 85, 2, 103, 12, 48, 102, 35…
## $ acceptyear <int> 68, 70, 71, 72, 68, 70, 71, 70, 73, 68, 67, 68, 71, 74, 70…
## $ age <int> 53, 43, 52, 52, 54, 36, 47, 41, 47, 51, 39, 53, 56, 40, 43…
## $ survived <fct> dead, dead, dead, dead, dead, dead, dead, dead, dead, dead…
## $ survtime <int> 1, 2, 2, 2, 3, 3, 3, 5, 5, 6, 6, 8, 9, 11, 12, 16, 16, 16,…
## $ prior <fct> no, no, no, no, no, no, no, no, no, no, no, no, no, no, no…
## $ transplant <fct> control, control, control, control, control, control, cont…
## $ wait <int> NA, NA, NA, NA, NA, NA, NA, 5, NA, NA, NA, NA, NA, NA, NA,…
ggplot( data = heart_transplant, aes( x = age, y = survived ) ) +
geom_jitter( width = 0, height = 0.05, alpha = 0.5 )
We have to transform the survived feature into a binary variable:
heartTr <- heart_transplant %>%
mutate( is_alive = ifelse( survived == 'alive', 1 ,0 ) )
Now, visualize the binary response
data_space_hearTr <- ggplot( data = heartTr, aes( x = age, y = is_alive ) ) +
geom_jitter( width = 0, height = 0.05, alpha = 0.5 )
data_space_hearTr
Just fitting a linear regression line here is inappropriate and will not capture the trend in the data.
Generalized Linear Models:
- Generalization of multiple regression.
- models non-normal response distributions
- GLM special case: logistic regression
- model a binary response variable
- uses logit link function
- GLM in a nutshell: apply a link function to scale the response variable appropriately so that it matches the output of a linear model. The link function used in logistic regression is the logit function; a quick sketch follows below.
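A quick sketch of the logit link and its inverse (base R’s qlogis() and plogis() implement the same pair):
logit <- function( p ) log( p / ( 1 - p ) )
inv_logit <- function( x ) 1 / ( 1 + exp( -x ) )
inv_logit( logit( 0.25 ) )   # recovers 0.25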
Fitting a GLM:
glm( is_alive ~ age, data = heartTr, family = binomial )
##
## Call: glm(formula = is_alive ~ age, family = binomial, data = heartTr)
##
## Coefficients:
## (Intercept) age
## 1.56438 -0.05847
##
## Degrees of Freedom: 102 Total (i.e. Null); 101 Residual
## Null Deviance: 120.5
## Residual Deviance: 113.7 AIC: 117.7
binomial()
##
## Family: binomial
## Link function: logit
data("MedGPA")
head( MedGPA )
## Accept Acceptance Sex BCPM GPA VR PS WS BS MCAT Apps
## 1 D 0 F 3.59 3.62 11 9 9 9 38 5
## 2 A 1 M 3.75 3.84 12 13 8 12 45 3
## 3 A 1 F 3.24 3.23 9 10 5 9 33 19
## 4 A 1 F 3.74 3.69 12 11 7 10 40 5
## 5 A 1 F 3.53 3.38 9 11 4 11 35 11
## 6 A 1 M 3.59 3.72 10 9 7 10 36 5
Example categorical response variable plotted with a simple linear regression line:
# scatterplot with jitter
data_space <- ggplot( data = MedGPA, aes( x = GPA, y = Acceptance ) ) +
geom_jitter(width = 0, height = 0.05, alpha = 0.5)
# linear regression line
data_space +
geom_smooth( method = 'lm', se = FALSE )
## `geom_smooth()` using formula 'y ~ x'
For predictions close to the median, a linear regression model might seem appropriate:
# filter
MedGPA_middle <- MedGPA %>%
filter( GPA > 3.375 & GPA <= 3.77 )
# scatterplot with jitter
data_space2 <- ggplot(data = MedGPA_middle, aes( x = GPA, y = Acceptance )) +
geom_jitter(width = 0, height = 0.05, alpha = 0.5)
# linear regression line
data_space2 +
geom_smooth( method = 'lm', se = FALSE )
## `geom_smooth()` using formula 'y ~ x'
However, linear regression does not make good predictions at the extreme ends of the data distribution (very high or very low GPA scores).
Fit a GLM model
# fit model
glm(Acceptance ~ GPA, data = MedGPA, family = binomial)
##
## Call: glm(formula = Acceptance ~ GPA, family = binomial, data = MedGPA)
##
## Coefficients:
## (Intercept) GPA
## -19.207 5.454
##
## Degrees of Freedom: 54 Total (i.e. Null); 53 Residual
## Null Deviance: 75.79
## Residual Deviance: 56.84 AIC: 60.84
Visualizing Logistic Regression
data_space
data_space +
geom_smooth( method = 'lm', se = FALSE ) +
geom_smooth( method = 'glm', se = FALSE, color = 'red',
method.args = list( family = 'binomial' ) )
## `geom_smooth()` using formula 'y ~ x'
## `geom_smooth()` using formula 'y ~ x'
data_space_hearTr +
geom_smooth( method = 'lm', se = FALSE ) +
geom_smooth( method = 'glm', se = FALSE, color = 'red',
method.args = list( family = 'binomial' ) )
## `geom_smooth()` using formula 'y ~ x'
## `geom_smooth()` using formula 'y ~ x'
The sigmoid-shaped logistic curve never reaches 0 or 1, which eliminates the problematic prediction values produced by the lm model.
MedGPA_binned <- MedGPA %>%
mutate( bin = cut( GPA, 6 ) ) %>%
group_by( bin ) %>%
summarise( mean_GPA = mean( GPA ),
acceptance_rate = sum(Acceptance)/n())
## `summarise()` ungrouping output (override with `.groups` argument)
glimpse( MedGPA_binned )
## Rows: 6
## Columns: 3
## $ bin <fct> "(2.72,2.93]", "(2.93,3.14]", "(3.14,3.35]", "(3.35,3…
## $ mean_GPA <dbl> 2.760000, 3.065000, 3.255714, 3.437857, 3.646667, 3.8…
## $ acceptance_rate <dbl> 0.0000000, 0.0000000, 0.2857143, 0.4285714, 0.5333333…
glmmod <- glm(formula = Acceptance ~ GPA, family = binomial, data = MedGPA)
# binned points and line
data_space <- ggplot( data = MedGPA_binned, aes( x = mean_GPA, y = acceptance_rate ) ) +
geom_line()
#data_space
# augmented model
MedGPA_plus <- augment( glmmod, type.predict = 'response' )
#glimpse( MedGPA_plus )
# logistic model on probability scale
data_space +
geom_line(data = MedGPA_plus, aes( x = GPA, y = .fitted), color = "red")
Observe that the logistic predictions follow the binned values very well
Three scales approach to interpretation
hTrmod <- glm( is_alive ~ age, data = heartTr, family = binomial )
heartTr_plus <- hTrmod %>%
augment( type.predict = 'response' ) %>%
mutate( y_hat = .fitted )
Probability scale plot
ggplot( heartTr_plus, aes( x = age, y = y_hat ) ) +
geom_point() +
geom_line() +
scale_y_continuous( 'Probability of Being Alive', limits = c( 0,1 ) )
The logistic line is curved, so we can no longer say that each additional year of age is associated with a fixed change (slope) in P(alive), as we could in a linear model.
To combat this, we can change the scale of the y-axis.
Odds scale \[\mbox{odds}(\hat{y}) = \frac{\hat{y}}{1-\hat{y}} = \exp(\hat{\beta_0} + \hat{\beta_1} \cdot x)\]
#glimpse( heartTr_plus )
heartTr_plus <- heartTr_plus %>%
mutate( odds_hat = y_hat / (1 - y_hat ) )
ggplot( heartTr_plus, aes( x = age, y = odds_hat ) ) +
geom_point() +
geom_line() +
scale_y_continuous( 'Odds of Being Alive', limits = c( 0,1 ) )
## Warning: Removed 7 rows containing missing values (geom_point).
## Warning: Removed 7 row(s) containing missing values (geom_path).
The model now has the form of an exponential function
Log-odds scale:
\[\mbox{logit}(\hat{y}) = \log \left[ \frac{\hat{y}}{1-\hat{y}} \right] = \hat{\beta_0} + \hat{\beta_1} \cdot x\]
heartTr_plus <- heartTr_plus %>%
mutate( log_odds_hat = log( odds_hat ) )
ggplot( heartTr_plus, aes( x = age, y = log_odds_hat ) ) +
geom_point() +
geom_line() +
scale_y_continuous( 'Log-Odds of Being Alive', limits = c( 0,0.5 ) ) +
scale_x_continuous( limits = c( 0,70 ) )
## Warning: Removed 97 rows containing missing values (geom_point).
## Warning: Removed 97 row(s) containing missing values (geom_path).
Comparison of the 3 scales:
- Probability Scale
- scale: intuitive, easy to read
- function: non-linear and hard to interpret
- Odds Scale
- scale: harder to interpret
- function: exponential; hard to interpret
- Log-Odds Scale
- scale: impossible to interpret
- function: linear; easy to interpret
Odds ratios
exp( coef(mod) )
## (Intercept) displ factor(year)2008
## 2.089510e+15 2.702518e-02 4.063874e+00
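The block above exponentiates the coefficients of the linear mpg model; the odds-ratio interpretation properly applies to a logistic fit. A sketch using the heart-transplant model hTrmod fitted earlier:
exp( coef( hTrmod ) )
# exp of the age coefficient is the multiplicative change in the odds of
# being alive associated with one additional year of age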
# compute odds for bins
MedGPA_binned <- MedGPA_binned %>%
mutate( odds = acceptance_rate/(1-acceptance_rate))
# plot binned odds
data_space <- ggplot( MedGPA_binned, aes( x = mean_GPA, y = odds ) ) +
geom_point() +
geom_line()
# compute odds for observations
MedGPA_plus <- MedGPA_plus %>%
mutate( odds_hat = .fitted/(1-.fitted))
# logistic model on odds scale
data_space +
geom_line(data = MedGPA_plus, aes( x = GPA, y = odds_hat ), color = "red")
Log-odds scale
# compute log odds for bins
MedGPA_binned <- MedGPA_binned %>%
mutate( log_odds = log( acceptance_rate/( 1-acceptance_rate ) ) )
# plot binned log odds
data_space <- ggplot( MedGPA_binned, aes( x = mean_GPA, y = log_odds ) ) +
geom_point() +
geom_line()
# compute log odds for observations
MedGPA_plus <- MedGPA_plus %>%
mutate( log_odds_hat = log( .fitted/( 1-.fitted ) ) )
# logistic model on log odds scale
data_space +
geom_line(data = MedGPA_plus, aes( x = GPA, y = log_odds_hat), color = "red")
Using a Logistic Model
Learning from the coefficients about the underlying processes
#fitted probabilities on the log-odds scale (not very useful):
augment( hTrmod )
## # A tibble: 103 x 8
## is_alive age .fitted .resid .std.resid .hat .sigma .cooksd
## <dbl> <int> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 0 53 -1.53 -0.625 -0.630 0.0156 1.06 0.00174
## 2 0 43 -0.950 -0.809 -0.813 0.0106 1.06 0.00210
## 3 0 52 -1.48 -0.642 -0.646 0.0147 1.06 0.00173
## 4 0 52 -1.48 -0.642 -0.646 0.0147 1.06 0.00173
## 5 0 54 -1.59 -0.608 -0.614 0.0166 1.06 0.00175
## 6 0 36 -0.540 -0.958 -0.967 0.0181 1.06 0.00548
## 7 0 47 -1.18 -0.731 -0.735 0.0111 1.06 0.00174
## 8 0 41 -0.833 -0.850 -0.855 0.0115 1.06 0.00257
## 9 0 47 -1.18 -0.731 -0.735 0.0111 1.06 0.00174
## 10 0 51 -1.42 -0.659 -0.663 0.0138 1.06 0.00172
## # … with 93 more rows
#return on probability scale (more familiar)
augment( hTrmod, type.predict = 'response' )
## # A tibble: 103 x 8
## is_alive age .fitted .resid .std.resid .hat .sigma .cooksd
## <dbl> <int> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 0 53 0.177 -0.625 -0.630 0.0156 1.06 0.00174
## 2 0 43 0.279 -0.809 -0.813 0.0106 1.06 0.00210
## 3 0 52 0.186 -0.642 -0.646 0.0147 1.06 0.00173
## 4 0 52 0.186 -0.642 -0.646 0.0147 1.06 0.00173
## 5 0 54 0.169 -0.608 -0.614 0.0166 1.06 0.00175
## 6 0 36 0.368 -0.958 -0.967 0.0181 1.06 0.00548
## 7 0 47 0.234 -0.731 -0.735 0.0111 1.06 0.00174
## 8 0 41 0.303 -0.850 -0.855 0.0115 1.06 0.00257
## 9 0 47 0.234 -0.731 -0.735 0.0111 1.06 0.00174
## 10 0 51 0.195 -0.659 -0.663 0.0138 1.06 0.00172
## # … with 93 more rows
Out-of-Sample predictions
cheney <- data.frame( age = 71, transplant = 'treatment' )
augment( hTrmod, newdata = cheney, type.predict = 'response' )
## # A tibble: 1 x 3
## age transplant .fitted
## <dbl> <chr> <dbl>
## 1 71 treatment 0.0700
Making Binary Predictions
E.g., for the heart transplant data, a person either lives or dies. Does it make sense to predict a probability?
hTrmod_plus <- augment( hTrmod, type.predict = 'response' ) %>%
mutate( alive_hat = round( .fitted ) )
#glimpse( hTrmod_plus)
hTrmod_plus %>%
select( is_alive, age, .fitted, alive_hat )
## # A tibble: 103 x 4
## is_alive age .fitted alive_hat
## <dbl> <int> <dbl> <dbl>
## 1 0 53 0.177 0
## 2 0 43 0.279 0
## 3 0 52 0.186 0
## 4 0 52 0.186 0
## 5 0 54 0.169 0
## 6 0 36 0.368 0
## 7 0 47 0.234 0
## 8 0 41 0.303 0
## 9 0 47 0.234 0
## 10 0 51 0.195 0
## # … with 93 more rows
A look at the confusion matrix for a simple rounding scheme:
hTrmod_plus %>%
select( is_alive, alive_hat ) %>%
table()
## alive_hat
## is_alive 0 1
## 0 71 4
## 1 25 3
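From the table above, this simple rounding rule classifies (71 + 3) of the 103 patients correctly; a quick accuracy check using hTrmod_plus:
mean( hTrmod_plus$is_alive == hTrmod_plus$alive_hat )   # (71 + 3) / 103 ≈ 0.72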
Back to the MedGPA data:
# create new data frame
new_data <- data.frame( 'GPA' = 3.51 )
# make predictions
augment( glmmod, newdata = new_data, type.predict = 'response' )
## # A tibble: 1 x 2
## GPA .fitted
## <dbl> <dbl>
## 1 3.51 0.484
# data frame with binary predictions
tidy_mod <- glmmod %>%
augment( type.predict = 'response' ) %>%
mutate( Acceptance_hat = round( .fitted ) )
# confusion matrix
tidy_mod %>%
select(Acceptance, Acceptance_hat) %>%
table()
## Acceptance_hat
## Acceptance 0 1
## 0 16 9
## 1 6 24
Case Study: Italian Restaurants in NYC
Italian Restaurants in NYC
What are the factors that contribute to the price of a meal at an Italian Restaurant in NYC?
#Zagat reviews data
nyc_url <- 'https://assets.datacamp.com/production/repositories/845/datasets/639a7a3f9020edb51bcbc4bfdb7b71cbd8b9a70e/nyc.csv'
nyc <- read.csv( nyc_url )
glimpse( nyc )
## Rows: 168
## Columns: 7
## $ Case <int> 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17,…
## $ Restaurant <chr> "Daniella Ristorante", "Tello's Ristorante", "Biricchino",…
## $ Price <int> 43, 32, 34, 41, 54, 52, 34, 34, 39, 44, 45, 47, 52, 35, 47…
## $ Food <int> 22, 20, 21, 20, 24, 22, 22, 20, 22, 21, 19, 21, 21, 19, 20…
## $ Decor <int> 18, 19, 13, 20, 19, 22, 16, 18, 19, 17, 17, 19, 19, 17, 18…
## $ Service <int> 20, 19, 18, 17, 21, 21, 21, 21, 22, 19, 20, 21, 20, 19, 21…
## $ East <int> 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1…
EDA
How are the variables distributed?
nyc %>% select( -Restaurant ) %>% pairs()
# Price by Food plot
ggplot( data = nyc, aes( x = Food, y = Price ) ) +
geom_point()
# Price by Food model
lm( Price ~ Food, data = nyc )
##
## Call:
## lm(formula = Price ~ Food, data = nyc)
##
## Coefficients:
## (Intercept) Food
## -17.832 2.939
Incorporating another variable
How does location to the East or West of 5th Avenue affect the price of an Italian meal?
nyc %>% group_by( East ) %>%
dplyr::summarize( mean_price = mean( Price ) )
## `summarise()` ungrouping output (override with `.groups` argument)
## # A tibble: 2 x 2
## East mean_price
## <int> <dbl>
## 1 0 40.4
## 2 1 44.0
But is there a confounding variable? Could the food quality have more to do with the price than location? How does the quality of service affect price?
lm( data = nyc, Price ~ Food + East )
##
## Call:
## lm(formula = Price ~ Food + East, data = nyc)
##
## Coefficients:
## (Intercept) Food East
## -17.430 2.875 1.459
Higher Dimensions
How does the perceived quality of the food vary with the price of a meal? How is this effect moderated by the quality of the service and/or decor?
Collinear variables: one variable varies in lockstep with another (they are perfectly correlated). Having collinear variables in a model detracts from the power of the model by adding redundant information.
An example of perfect collinearity:
#Comparing the price of a model in dollars to the price in cents
nyc %>%
mutate( Price_cents = Price / 100 ) %>%
dplyr::summarise( cor_collinear = cor( Price, Price_cents ) )
## cor_collinear
## 1 1
Multicollinearity
- Explanatory variables can become highly correlated, especially as more and more features are added to a model (checked below for the Zagat ratings)
- leads to unstable coefficient estimates
- However, it does not detract from the \(R^2\), or the explanatory power of the model as a whole
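Before interpreting the multi-predictor fits below, a quick check of how correlated the Zagat ratings are with one another:
nyc %>%
dplyr::select( Food, Decor, Service ) %>%
cor()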
# Price by Food and Service and East
lm( data = nyc, Price ~ Food + Service + East )
##
## Call:
## lm(formula = Price ~ Food + Service + East, data = nyc)
##
## Coefficients:
## (Intercept) Food Service East
## -20.8155 1.4863 1.6647 0.9649
Higher dimensions
hdimmod <- lm( data = nyc, Price ~ Food + Decor + Service + East )
summary( hdimmod )
##
## Call:
## lm(formula = Price ~ Food + Decor + Service + East, data = nyc)
##
## Residuals:
## Min 1Q Median 3Q Max
## -14.0465 -3.8837 0.0373 3.3942 17.7491
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -24.023800 4.708359 -5.102 9.24e-07 ***
## Food 1.538120 0.368951 4.169 4.96e-05 ***
## Decor 1.910087 0.217005 8.802 1.87e-15 ***
## Service -0.002727 0.396232 -0.007 0.9945
## East 2.068050 0.946739 2.184 0.0304 *
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 5.738 on 163 degrees of freedom
## Multiple R-squared: 0.6279, Adjusted R-squared: 0.6187
## F-statistic: 68.76 on 4 and 163 DF, p-value: < 2.2e-16