Multiple and Logistic Regression in R

DataCamp: Statistics with R

Bonnie Cooper

library( dplyr )
library( ggplot2 )
library( gridExtra )
library( openintro )
library( tidyverse )
library( modelr )
library( UsingR )
library( broom )
library( plotly )
library( Stat2Data )

Parallel Slopes

What if you have two groups?

#glimpse( mpg )
mpg_manual <- subset( mpg, substr( trans, 1, 1 ) == "m" )
glimpse( mpg_manual )
## Rows: 77
## Columns: 11
## $ manufacturer <chr> "audi", "audi", "audi", "audi", "audi", "audi", "audi", …
## $ model        <chr> "a4", "a4", "a4", "a4 quattro", "a4 quattro", "a4 quattr…
## $ displ        <dbl> 1.8, 2.0, 2.8, 1.8, 2.0, 2.8, 3.1, 5.7, 6.2, 7.0, 3.7, 3…
## $ year         <int> 1999, 2008, 1999, 1999, 2008, 1999, 2008, 1999, 2008, 20…
## $ cyl          <int> 4, 4, 6, 4, 4, 6, 6, 8, 8, 8, 6, 6, 8, 8, 8, 8, 8, 6, 6,…
## $ trans        <chr> "manual(m5)", "manual(m6)", "manual(m5)", "manual(m5)", …
## $ drv          <chr> "f", "f", "f", "4", "4", "4", "4", "r", "r", "r", "4", "…
## $ cty          <int> 21, 20, 18, 18, 20, 17, 15, 16, 16, 15, 15, 14, 11, 12, …
## $ hwy          <int> 29, 31, 26, 26, 28, 25, 25, 26, 26, 24, 19, 17, 17, 16, …
## $ fl           <chr> "p", "p", "p", "p", "p", "p", "p", "p", "p", "p", "r", "…
## $ class        <chr> "compact", "compact", "compact", "compact", "compact", "…
ggplot( data = mpg_manual, aes( x = displ, y = hwy ) ) +
  geom_point()

The data above come from two different model years:

unique( mpg_manual$year )
## [1] 1999 2008

Did the mileage change between these two years?

#a look at fuel efficiency
ggplot( data = mpg_manual, aes( x = factor( year ), y = hwy ) ) +
  geom_boxplot()

We want to assess the effects of engine size and year simultaneously.

Parallel Lines Model: used when one of the explanatory variables is categorical and the other is numeric.

mod <- lm( hwy ~ displ + factor( year ), data = mpg )
summary( mod )
## 
## Call:
## lm(formula = hwy ~ displ + factor(year), data = mpg)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -7.7616 -2.5187 -0.2899  1.8701 15.5852 
## 
## Coefficients:
##                  Estimate Std. Error t value Pr(>|t|)    
## (Intercept)       35.2757     0.7257  48.610  < 2e-16 ***
## displ             -3.6110     0.1938 -18.630  < 2e-16 ***
## factor(year)2008   1.4021     0.4998   2.806  0.00545 ** 
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 3.78 on 231 degrees of freedom
## Multiple R-squared:  0.6004, Adjusted R-squared:  0.5969 
## F-statistic: 173.5 on 2 and 231 DF,  p-value: < 2.2e-16
glimpse( mariokart )
## Rows: 143
## Columns: 12
## $ id          <dbl> 150377422259, 260483376854, 320432342985, 280405224677, 1…
## $ duration    <int> 3, 7, 3, 3, 1, 3, 1, 1, 3, 7, 1, 1, 1, 1, 7, 7, 3, 3, 1, …
## $ n_bids      <int> 20, 13, 16, 18, 20, 19, 13, 15, 29, 8, 15, 15, 13, 16, 6,…
## $ cond        <fct> new, used, new, new, new, new, used, new, used, used, new…
## $ start_pr    <dbl> 0.99, 0.99, 0.99, 0.99, 0.01, 0.99, 0.01, 1.00, 0.99, 19.…
## $ ship_pr     <dbl> 4.00, 3.99, 3.50, 0.00, 0.00, 4.00, 0.00, 2.99, 4.00, 4.0…
## $ total_pr    <dbl> 51.55, 37.04, 45.50, 44.00, 71.00, 45.00, 37.02, 53.99, 4…
## $ ship_sp     <fct> standard, firstClass, firstClass, standard, media, standa…
## $ seller_rate <int> 1580, 365, 998, 7, 820, 270144, 7284, 4858, 27, 201, 4858…
## $ stock_photo <fct> yes, yes, no, yes, yes, yes, yes, yes, yes, no, yes, yes,…
## $ wheels      <int> 1, 1, 1, 1, 2, 0, 0, 2, 1, 1, 2, 2, 2, 2, 1, 0, 1, 1, 2, …
## $ title       <fct> "~~ Wii MARIO KART &amp; WHEEL ~ NINTENDO Wii ~ BRAND NEW…
# fit parallel slopes
lm( total_pr ~ wheels + cond, data = mariokart )
## 
## Call:
## lm(formula = total_pr ~ wheels + cond, data = mariokart)
## 
## Coefficients:
## (Intercept)       wheels     condused  
##     37.6673      10.2161       0.8457

Visualizing Parallel Slopes Models

ggplot( data = mpg_manual, aes( x = displ, y = hwy, color = factor( year ) ) ) +
  geom_point()

Setting up our Model

  • Define

\[newer = \left\{ \begin{array}{rcl} 1 & \mbox{if } year = 2008 \\ 0 & \mbox{if } year = 1999 \end{array}\right. \]

  • Our model is:

\[\hat{hwy} = \hat{\beta_0} + \hat{\beta}_1 \cdot displ + \hat{\beta}_2 \cdot newer\]

Use R’s lm() to find the coefficients:

mod
## 
## Call:
## lm(formula = hwy ~ displ + factor(year), data = mpg)
## 
## Coefficients:
##      (Intercept)             displ  factor(year)2008  
##           35.276            -3.611             1.402

From this, we can define the parallel line fits for this model:

For year = 2008, we have: \[\hat{hwy} = 35.276 - 3.611 \cdot displ + 1.402 \cdot (1) = (35.276 + 1.402) - 3.611 \cdot displ\]

For year = 1999, we have: \[\hat{hwy} = 35.276 - 3.611 \cdot displ + 1.402 \cdot (0) = 35.276 - 3.611 \cdot displ\]

Visualizing the parallel lines:

#glimpse( augment( mod ) )
ggplot( data = mpg_manual, aes( x = displ, y = hwy, color = factor( year ) ) ) +
  geom_point() +
  geom_line( data = augment( mod ), aes( y = .fitted, color = `factor(year)` ) )

# Augment the model
mariokart_filt <- mariokart %>%
  filter(total_pr < 100)
mod2 <- lm( total_pr ~ wheels + cond, data = mariokart_filt )
augmented_mod <- augment( mod2 )
#glimpse(augmented_mod)

# scatterplot, with color
data_space <- ggplot(data = augmented_mod, aes(x = wheels, y = total_pr, color = cond)) + 
  geom_point()
  
# single call to geom_line()
data_space + 
  geom_line(aes(y = .fitted, color = cond))

Interpreting parallel slopes coefficients

Common misunderstandings when interpreting parallel slopes models:

  • There is only one slope. Yes, there are two lines, but only the numeric feature is associated with a slope.
  • What is the reference level of the categorical feature?
  • What are the units?
  • “After controlling for …”: coefficients need to be interpreted in the context of the other explanatory variables in the model (see the worked example after this list).
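
For example, here is one reading of the coefficients printed above (my own annotation, not the course’s wording):

coef( mod )
# (Intercept)       35.276: predicted hwy for a (hypothetical) 1999 car with displ = 0
# displ             -3.611: each additional liter of displacement is associated with
#                           ~3.6 mpg lower highway mileage, holding model year fixed
# factor(year)2008   1.402: 2008 cars average ~1.4 mpg higher than 1999 cars
#                           (the reference level) at the same displacement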

Three ways to describe a model

Mathematical Description: \[\mbox{Equation: }y = \beta_0 + \beta_1x_1 + \beta_2x_2 + \epsilon\] \[\mbox{Residuals: } \epsilon \sim N(0,\sigma_{\epsilon})\] \[\mbox{Coefficients: } \beta_0, \beta_1, \beta_2\]

Geometric Description: visualizing the model graphically

Syntactic Description: using R

Multiple Regression:

  • \(y = \beta_0 + \beta_1x_1 + \beta_2x_2 + \cdots + \beta_px_p + \epsilon\)
  • y ~ x1 + x2 + … + xp
  • one best fit line becomes multiple lines or a plane or even multiple planes.
glimpse( babies )
## Rows: 1,236
## Columns: 23
## $ id        <dbl> 15, 20, 58, 61, 72, 100, 102, 129, 142, 148, 164, 171, 175,…
## $ pluralty  <dbl> 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5,…
## $ outcome   <dbl> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,…
## $ date      <dbl> 1411, 1499, 1576, 1504, 1425, 1673, 1449, 1562, 1408, 1568,…
## $ gestation <dbl> 284, 282, 279, 999, 282, 286, 244, 245, 289, 299, 351, 282,…
## $ sex       <dbl> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,…
## $ wt        <dbl> 120, 113, 128, 123, 108, 136, 138, 132, 120, 143, 140, 144,…
## $ parity    <dbl> 1, 2, 1, 2, 1, 4, 4, 2, 3, 3, 2, 4, 3, 5, 3, 4, 3, 3, 2, 3,…
## $ race      <dbl> 8, 0, 0, 0, 0, 0, 7, 7, 0, 0, 0, 0, 0, 8, 7, 7, 4, 3, 0, 0,…
## $ age       <dbl> 27, 33, 28, 36, 23, 25, 33, 23, 25, 30, 27, 32, 23, 36, 30,…
## $ ed        <dbl> 5, 5, 2, 5, 5, 2, 2, 1, 4, 5, 5, 2, 1, 5, 2, 2, 7, 2, 2, 2,…
## $ ht        <dbl> 62, 64, 64, 69, 67, 62, 62, 65, 62, 66, 68, 64, 63, 61, 63,…
## $ wt1       <dbl> 100, 135, 115, 190, 125, 93, 178, 140, 125, 136, 120, 124, …
## $ drace     <dbl> 8, 0, 5, 3, 0, 3, 7, 7, 3, 0, 5, 0, 5, 0, 7, 7, 7, 3, 5, 0,…
## $ dage      <dbl> 31, 38, 32, 43, 24, 28, 37, 23, 26, 34, 28, 36, 28, 32, 42,…
## $ ded       <dbl> 5, 5, 1, 4, 5, 2, 4, 4, 1, 5, 4, 1, 2, 5, 0, 0, 1, 2, 4, 2,…
## $ dht       <dbl> 65, 70, 99, 68, 99, 64, 99, 71, 70, 99, 99, 74, 99, 99, 99,…
## $ dwt       <dbl> 110, 148, 999, 197, 999, 130, 999, 192, 180, 999, 999, 185,…
## $ marital   <dbl> 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,…
## $ inc       <dbl> 1, 4, 2, 8, 1, 4, 98, 2, 2, 2, 98, 2, 2, 2, 1, 1, 1, 4, 7, …
## $ smoke     <dbl> 0, 0, 1, 3, 1, 2, 0, 0, 0, 1, 3, 1, 1, 1, 0, 0, 1, 1, 0, 1,…
## $ time      <dbl> 0, 0, 1, 5, 1, 2, 0, 0, 0, 1, 4, 1, 1, 1, 0, 0, 1, 1, 0, 1,…
## $ number    <dbl> 0, 0, 1, 5, 5, 2, 0, 0, 0, 4, 2, 1, 1, 2, 0, 0, 5, 5, 0, 6,…
# build model
mod3 <- lm( data = babies, wt ~ age + factor( smoke ) )
mod3
## 
## Call:
## lm(formula = wt ~ age + factor(smoke), data = babies)
## 
## Coefficients:
##    (Intercept)             age  factor(smoke)1  factor(smoke)2  factor(smoke)3  
##      121.76727         0.03655        -8.64003         0.33830         1.62719  
## factor(smoke)9  
##        3.83272

Evaluating and Extending Parallel Slopes Models

Model fit, residuals, and prediction

If the model fits the data better, the residuals are smaller, the SSE is smaller, and the \(R^2\) is larger.
Models that incorporate more features automatically have a larger \(R^2\) value than simpler univariate regression models. Therefore, to make fair comparisons of model performance, the adjusted \(R^2\) value is preferred: \[R^2_{adj} = 1 - \frac{SSE}{SST} \cdot \frac{n-1}{n-p-1}\] \(R^2_{adj}\) applies a penalty that grows with \(p\), the number of explanatory variables in the model.
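
As a quick sanity check (my own, not part of the course), this formula reproduces the \(R^2\) and adjusted \(R^2\) that summary( mod ) reported above:

# recompute R^2 and adjusted R^2 for `mod` by hand
sse <- sum( residuals( mod )^2 )
sst <- sum( ( mpg$hwy - mean( mpg$hwy ) )^2 )
n <- nrow( mpg )
p <- 2                                          # displ and the year indicator
1 - sse / sst                                   # multiple R^2, ~0.600
1 - ( sse / sst ) * ( n - 1 ) / ( n - p - 1 )   # adjusted R^2, ~0.597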

#return just the predicted values:
predict( mod )
##        1        2        3        4        5        6        7        8 
## 28.77593 28.77593 29.45587 29.45587 25.16494 25.16494 25.48379 28.77593 
##        9       10       11       12       13       14       15       16 
## 28.77593 29.45587 29.45587 25.16494 25.16494 25.48379 25.48379 25.16494 
##       17       18       19       20       21       22       23       24 
## 25.48379 21.51170 17.53962 17.53962 17.53962 14.69308 15.01193 14.69308 
##       25       26       27       28       29       30       31       32 
## 14.69308 14.28973 14.28973 11.40094 17.53962 17.53962 14.69308 11.80430 
##       33       34       35       36       37       38       39       40 
## 26.60934 28.01148 24.08165 24.03939 23.67829 26.60934 24.44275 23.35945 
##       41       42       43       44       45       46       47       48 
## 23.35945 24.76159 24.76159 24.76159 21.55396 21.55396 22.95610 22.23390 
##       49       50       51       52       53       54       55       56 
## 23.31719 23.31719 21.19286 21.19286 19.70621 19.70621 19.70621 16.49858 
##       57       58       59       60       61       62       63       64 
## 16.49858 21.19286 19.70621 19.70621 19.70621 16.49858 16.09522 13.97089 
##       65       66       67       68       69       70       71       72 
## 19.70621 19.70621 19.70621 19.70621 19.70621 19.70621 16.49858 16.49858 
##       73       74       75       76       77       78       79       80 
## 16.09522 13.97089 18.66517 15.77638 17.17852 20.83176 20.83176 20.83176 
##       81       82       83       84       85       86       87       88 
## 22.23390 20.06731 17.22078 20.10956 20.10956 18.66517 18.66517 20.06731 
##       89       90       91       92       93       94       95       96 
## 15.77638 17.17852 21.55396 21.55396 22.23390 22.23390 18.66517 18.66517 
##       97       98       99      100      101      102      103      104 
## 20.06731 20.06731 17.17852 29.49813 29.49813 29.49813 29.49813 29.49813 
##      105      106      107      108      109      110      111      112 
## 30.17807 30.17807 30.17807 29.45587 26.60934 26.60934 28.01148 28.01148 
##      113      114      115      116      117      118      119      120 
## 26.24824 26.24824 24.76159 28.05373 28.05373 29.45587 29.45587 26.92818 
##      121      122      123      124      125      126      127      128 
## 26.92818 26.92818 25.84488 23.31719 20.83176 18.30407 19.70621 19.70621 
##      129      130      131      132      133      134      135      136 
## 16.09522 14.65083 20.83176 21.51170 20.78950 18.66517 15.77638 15.77638 
##      137      138      139      140      141      142      143      144 
## 17.17852 20.83176 22.23390 20.06731 17.22078 26.60934 26.60934 27.65038 
##      145      146      147      148      149      150      151      152 
## 27.65038 24.03939 24.03939 24.44275 24.44275 24.03939 23.35945 23.35945 
##      153      154      155      156      157      158      159      160 
## 22.23390 16.45632 24.08165 21.55396 21.55396 22.95610 17.53962 26.24824 
##      161      162      163      164      165      166      167      168 
## 26.24824 27.65038 27.65038 27.65038 27.65038 27.33154 27.33154 26.24824 
##      169      170      171      172      173      174      175      176 
## 26.24824 27.65038 27.65038 27.65038 27.65038 25.52604 25.52604 22.99835 
##      177      178      179      180      181      182      183      184 
## 22.99835 22.23390 19.70621 27.33154 27.33154 28.01148 28.01148 24.44275 
##      185      186      187      188      189      190      191      192 
## 24.44275 24.03939 27.33154 27.33154 28.01148 28.01148 24.44275 24.44275 
##      193      194      195      196      197      198      199      200 
## 24.76159 28.77593 28.77593 28.77593 30.17807 30.17807 18.30407 16.09522 
##      201      202      203      204      205      206      207      208 
## 25.52604 25.52604 26.92818 22.99835 22.99835 22.23390 22.23390 28.05373 
##      209      210      211      212      213      214      215      216 
## 28.05373 29.45587 29.45587 25.16494 28.41483 28.05373 28.05373 29.45587 
##      217      218      219      220      221      222      223      224 
## 29.45587 27.65038 27.65038 25.16494 25.16494 28.41483 28.41483 28.05373 
##      225      226      227      228      229      230      231      232 
## 28.05373 27.65038 27.65038 28.77593 28.77593 29.45587 29.45587 25.16494 
##      233      234 
## 25.16494 23.67829
#return a dataframe with the fitted values, data and other metrics
augment( mod )
## # A tibble: 234 x 8
##      hwy displ `factor(year)` .fitted .std.resid    .hat .sigma   .cooksd
##    <int> <dbl> <fct>            <dbl>      <dbl>   <dbl>  <dbl>     <dbl>
##  1    29   1.8 1999              28.8     0.0597 0.0143    3.79 0.0000173
##  2    29   1.8 1999              28.8     0.0597 0.0143    3.79 0.0000173
##  3    31   2   2008              29.5     0.412  0.0158    3.79 0.000908 
##  4    30   2   2008              29.5     0.145  0.0158    3.79 0.000113 
##  5    26   2.8 1999              25.2     0.222  0.00916   3.79 0.000152 
##  6    26   2.8 1999              25.2     0.222  0.00916   3.79 0.000152 
##  7    27   3.1 2008              25.5     0.403  0.00938   3.79 0.000512 
##  8    26   1.8 1999              28.8    -0.740  0.0143    3.78 0.00265  
##  9    25   1.8 1999              28.8    -1.01   0.0143    3.78 0.00490  
## 10    28   2   2008              29.5    -0.388  0.0158    3.79 0.000807 
## # … with 224 more rows
#some new data
new_obs <- data.frame( displ = 1.8, year = 2008 )
#return a prediction for this new data
predict( mod, newdata = new_obs )
##        1 
## 30.17807
#return a data.frame
augment( mod, newdata =  new_obs )
## # A tibble: 1 x 3
##   displ  year .fitted
##   <dbl> <dbl>   <dbl>
## 1   1.8  2008    30.2
glimpse( mariokart )
## Rows: 143
## Columns: 12
## $ id          <dbl> 150377422259, 260483376854, 320432342985, 280405224677, 1…
## $ duration    <int> 3, 7, 3, 3, 1, 3, 1, 1, 3, 7, 1, 1, 1, 1, 7, 7, 3, 3, 1, …
## $ n_bids      <int> 20, 13, 16, 18, 20, 19, 13, 15, 29, 8, 15, 15, 13, 16, 6,…
## $ cond        <fct> new, used, new, new, new, new, used, new, used, used, new…
## $ start_pr    <dbl> 0.99, 0.99, 0.99, 0.99, 0.01, 0.99, 0.01, 1.00, 0.99, 19.…
## $ ship_pr     <dbl> 4.00, 3.99, 3.50, 0.00, 0.00, 4.00, 0.00, 2.99, 4.00, 4.0…
## $ total_pr    <dbl> 51.55, 37.04, 45.50, 44.00, 71.00, 45.00, 37.02, 53.99, 4…
## $ ship_sp     <fct> standard, firstClass, firstClass, standard, media, standa…
## $ seller_rate <int> 1580, 365, 998, 7, 820, 270144, 7284, 4858, 27, 201, 4858…
## $ stock_photo <fct> yes, yes, no, yes, yes, yes, yes, yes, yes, no, yes, yes,…
## $ wheels      <int> 1, 1, 1, 1, 2, 0, 0, 2, 1, 1, 2, 2, 2, 2, 1, 0, 1, 1, 2, …
## $ title       <fct> "~~ Wii MARIO KART &amp; WHEEL ~ NINTENDO Wii ~ BRAND NEW…
mod_mk1 <- lm( data = mariokart, total_pr ~ wheels + cond )
# R^2 and adjusted R^2
summary( mod_mk1 )
## 
## Call:
## lm(formula = total_pr ~ wheels + cond, data = mariokart)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -15.719  -6.462  -2.513   0.487 267.565 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  37.6673     5.2795   7.135 4.78e-11 ***
## wheels       10.2161     2.6740   3.821   0.0002 ***
## condused      0.8457     4.5855   0.184   0.8539    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 24.42 on 140 degrees of freedom
## Multiple R-squared:  0.1091, Adjusted R-squared:  0.09638 
## F-statistic: 8.573 on 2 and 140 DF,  p-value: 0.0003075
# add random noise
mario_kart_noisy <- mariokart %>% mutate( noise = rnorm( n() ) )
  
# compute new model
mod_mk2 <- lm( data = mario_kart_noisy, total_pr ~ wheels + cond + noise )

# new R^2 and adjusted R^2
summary( mod_mk2 )
## 
## Call:
## lm(formula = total_pr ~ wheels + cond + noise, data = mario_kart_noisy)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -16.605  -6.658  -2.503   1.231 266.150 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)   37.552      5.288   7.101 5.84e-11 ***
## wheels        10.181      2.678   3.802 0.000214 ***
## condused       1.017      4.596   0.221 0.825239    
## noise          1.805      2.248   0.803 0.423354    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 24.45 on 139 degrees of freedom
## Multiple R-squared:  0.1132, Adjusted R-squared:  0.09408 
## F-statistic: 5.916 on 3 and 139 DF,  p-value: 0.0007913

Understanding interaction

Interaction terms allow the effect of one explanatory variable on the response to change depending on the value of another explanatory variable.
ex:
\[\hat{mpg} = \hat{\beta_0} + \hat{\beta_1} \cdot displ + \hat{\beta_2}\cdot is\_newer + \hat{\beta_3} \cdot displ \cdot is\_newer\] This results in two best fit lines:

  • For older cars: \(\hat{mpg} = \hat{\beta_0} + \hat{\beta_1} \cdot displ\)
  • For newer cars: \(\hat{mpg} = (\hat{\beta_0} + \hat{\beta_2}) + (\hat{\beta_1} + \hat{\beta_3})\cdot displ\)

Syntax for Interactions:

# include interaction
# cond:duration adds the interaction term between cond and duration
lm( total_pr ~ cond + duration + cond:duration, data = mariokart_filt )
## 
## Call:
## lm(formula = total_pr ~ cond + duration + cond:duration, data = mariokart_filt)
## 
## Coefficients:
##       (Intercept)           condused           duration  condused:duration  
##            58.268            -17.122             -1.966              2.325
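
Equivalently, R’s * formula operator expands to both main effects plus their interaction, so the following call (an aside, not shown in the output above) fits the same model:

# cond * duration is shorthand for cond + duration + cond:duration
lm( total_pr ~ cond * duration, data = mariokart_filt )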
# interaction plot
ggplot(data = mariokart_filt, aes(x = duration, y = total_pr, color = cond)) + 
  geom_point() + 
  geom_smooth(method = 'lm', se = FALSE )
## `geom_smooth()` using formula 'y ~ x'

Simpson’s Paradox

glimpse( SAT )
## Rows: 50
## Columns: 8
## $ state  <fct> Alabama, Alaska, Arizona, Arkansas, California, Colorado, Conn…
## $ expend <dbl> 4.405, 8.963, 4.778, 4.459, 4.992, 5.443, 8.817, 7.030, 5.718,…
## $ ratio  <dbl> 17.2, 17.6, 19.3, 17.1, 24.0, 18.4, 14.4, 16.6, 19.1, 16.3, 17…
## $ salary <dbl> 31.144, 47.951, 32.175, 28.934, 41.078, 34.571, 50.045, 39.076…
## $ perc   <int> 8, 47, 27, 6, 45, 29, 81, 68, 48, 65, 57, 15, 13, 58, 5, 9, 11…
## $ verbal <int> 491, 445, 448, 482, 417, 462, 431, 429, 420, 406, 407, 468, 48…
## $ math   <int> 538, 489, 496, 523, 485, 518, 477, 468, 469, 448, 482, 511, 56…
## $ total  <int> 1029, 934, 944, 1005, 902, 980, 908, 897, 889, 854, 889, 979, …
ggplot( data = SAT, aes( x = salary, y = total ) ) +
  geom_point() +
  geom_smooth( method = 'lm', se = FALSE )
## `geom_smooth()` using formula 'y ~ x'

Revisualize the data using a parallel slopes model to evaluate the effect of the percentage of students taking the SAT:

SAT_wbin <- SAT %>%
  mutate( sat_bin = cut( perc, 3 ) )

mod_satbin <- lm( formula = total ~ salary + sat_bin, data = SAT_wbin )
mod_satbin
## 
## Call:
## lm(formula = total ~ salary + sat_bin, data = SAT_wbin)
## 
## Coefficients:
##        (Intercept)              salary  sat_bin(29.7,55.3]  sat_bin(55.3,81.1]  
##          1000.7173              0.8697           -116.3174           -143.5428
glimpse( SAT_wbin )
## Rows: 50
## Columns: 9
## $ state   <fct> Alabama, Alaska, Arizona, Arkansas, California, Colorado, Con…
## $ expend  <dbl> 4.405, 8.963, 4.778, 4.459, 4.992, 5.443, 8.817, 7.030, 5.718…
## $ ratio   <dbl> 17.2, 17.6, 19.3, 17.1, 24.0, 18.4, 14.4, 16.6, 19.1, 16.3, 1…
## $ salary  <dbl> 31.144, 47.951, 32.175, 28.934, 41.078, 34.571, 50.045, 39.07…
## $ perc    <int> 8, 47, 27, 6, 45, 29, 81, 68, 48, 65, 57, 15, 13, 58, 5, 9, 1…
## $ verbal  <int> 491, 445, 448, 482, 417, 462, 431, 429, 420, 406, 407, 468, 4…
## $ math    <int> 538, 489, 496, 523, 485, 518, 477, 468, 469, 448, 482, 511, 5…
## $ total   <int> 1029, 934, 944, 1005, 902, 980, 908, 897, 889, 854, 889, 979,…
## $ sat_bin <fct> "(3.92,29.7]", "(29.7,55.3]", "(3.92,29.7]", "(3.92,29.7]", "…
ggplot( data = SAT_wbin, aes( x = salary, y = total, color = sat_bin ) ) +
  geom_point() +
  geom_line( data = broom::augment( mod_satbin ), aes( y = .fitted ) )

Ignoring the percentage of students taking the SAT, there is a negative trend: SAT scores decrease as teacher salary increases. However, once the SAT-taking percentage is discretized into bins, all groups show a positive trend as a function of salary. This is an example of Simpson’s paradox: the marginal association between two variables is qualitatively different from the partial association between the same two variables after controlling for one or more other variables.

When Simpson’s phenomenon is present, the relationship between variables changes when subgroups are considered. When this paradox is present, the subgrouping variable is an important effect to include in the model.

slr <- ggplot(mariokart_filt, aes(y = total_pr, x = duration)) + 
  geom_point() + 
  geom_smooth(method = "lm", se = FALSE)

# model with one slope
lm( data = mariokart_filt, total_pr ~ duration )
## 
## Call:
## lm(formula = total_pr ~ duration, data = mariokart_filt)
## 
## Coefficients:
## (Intercept)     duration  
##      52.374       -1.317
# plot with two slopes
slr2 <- slr + aes(color = cond )

grid.arrange( slr, slr2, ncol = 2 )
## `geom_smooth()` using formula 'y ~ x'
## `geom_smooth()` using formula 'y ~ x'

Multiple Regression

Adding a Numerical Explanatory Variable

  • Mathematical: \(\hat{wt} = \hat{\beta_0} + \hat{\beta_1} \cdot gestation + \hat{\beta_2} \cdot age\)

  • Syntactical: lm( wt ~ gestation + age, data = babies )
  • Geometric:

#glimpse( babies )
babies_filt <- babies %>%
  filter( age < 50 & gestation < 500 )
data_space <- ggplot( babies_filt, aes( x = gestation, y = age ) ) +
  geom_point( aes( color = wt ) )
data_space

Tiling the Plane

grid <- babies_filt %>%
  data_grid(
    gestation = seq_range( gestation, by = 1 ),
    age = seq_range( age, by = 1 )
  )
mod_tiling <- lm( wt ~ gestation + age, data = babies )
bwt_hats <- augment( mod_tiling, newdata = grid )
#glimpse( bwt_hats )
data_space + geom_tile( data = bwt_hats, aes( fill = .fitted, alpha = 0.5 ) ) +
  scale_fill_continuous( 'wt', limits = range( babies_filt$wt ))

Adding a Third (categorical) variable

glimpse( babies_filt )
## Rows: 1,221
## Columns: 23
## $ id        <dbl> 15, 20, 58, 72, 100, 102, 129, 142, 148, 164, 171, 175, 183…
## $ pluralty  <dbl> 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5,…
## $ outcome   <dbl> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,…
## $ date      <dbl> 1411, 1499, 1576, 1425, 1673, 1449, 1562, 1408, 1568, 1554,…
## $ gestation <dbl> 284, 282, 279, 282, 286, 244, 245, 289, 299, 351, 282, 279,…
## $ sex       <dbl> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,…
## $ wt        <dbl> 120, 113, 128, 108, 136, 138, 132, 120, 143, 140, 144, 141,…
## $ parity    <dbl> 1, 2, 1, 1, 4, 4, 2, 3, 3, 2, 4, 3, 5, 3, 4, 3, 3, 2, 3, 3,…
## $ race      <dbl> 8, 0, 0, 0, 0, 7, 7, 0, 0, 0, 0, 0, 8, 7, 7, 4, 3, 0, 0, 9,…
## $ age       <dbl> 27, 33, 28, 23, 25, 33, 23, 25, 30, 27, 32, 23, 36, 30, 38,…
## $ ed        <dbl> 5, 5, 2, 5, 2, 2, 1, 4, 5, 5, 2, 1, 5, 2, 2, 7, 2, 2, 2, 2,…
## $ ht        <dbl> 62, 64, 64, 67, 62, 62, 65, 62, 66, 68, 64, 63, 61, 63, 63,…
## $ wt1       <dbl> 100, 135, 115, 125, 93, 178, 140, 125, 136, 120, 124, 128, …
## $ drace     <dbl> 8, 0, 5, 0, 3, 7, 7, 3, 0, 5, 0, 5, 0, 7, 7, 7, 3, 5, 0, 10…
## $ dage      <dbl> 31, 38, 32, 24, 28, 37, 23, 26, 34, 28, 36, 28, 32, 42, 37,…
## $ ded       <dbl> 5, 5, 1, 5, 2, 4, 4, 1, 5, 4, 1, 2, 5, 0, 0, 1, 2, 4, 2, 1,…
## $ dht       <dbl> 65, 70, 99, 99, 64, 99, 71, 70, 99, 99, 74, 99, 99, 99, 71,…
## $ dwt       <dbl> 110, 148, 999, 999, 130, 999, 192, 180, 999, 999, 185, 999,…
## $ marital   <dbl> 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,…
## $ inc       <dbl> 1, 4, 2, 1, 4, 98, 2, 2, 2, 98, 2, 2, 2, 1, 1, 1, 4, 7, 5, …
## $ smoke     <dbl> 0, 0, 1, 1, 2, 0, 0, 0, 1, 3, 1, 1, 1, 0, 0, 1, 1, 0, 1, 0,…
## $ time      <dbl> 0, 0, 1, 1, 2, 0, 0, 0, 1, 4, 1, 1, 1, 0, 0, 1, 1, 0, 1, 0,…
## $ number    <dbl> 0, 0, 1, 5, 2, 0, 0, 0, 4, 2, 1, 1, 2, 0, 0, 5, 5, 0, 6, 0,…
mod_bab3 <- lm( wt ~ gestation + age + smoke, data = babies_filt )
mod_bab3
## 
## Call:
## lm(formula = wt ~ gestation + age + smoke, data = babies_filt)
## 
## Coefficients:
## (Intercept)    gestation          age        smoke  
##    -15.5181       0.4680       0.1661      -0.1388

Geometry:

  • numeric + categorical: parallel lines
  • numeric + numeric: a plane
  • numeric + numeric + categorical: parallel planes

Use plotly for a parallel planes visualization; a sketch follows.
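
A minimal sketch of what that could look like for mod_bab3 (my own code, not from the course; the grid ranges and marker size are arbitrary choices):

# predicted-weight surfaces for smoke = 0 and smoke = 1 over a grid of
# gestation and age values, overlaid on the raw data points
gest_seq <- seq( 200, 350, by = 5 )
age_seq  <- seq( 15, 45, by = 1 )

plane_for <- function( smoke_val ) {
  outer( gest_seq, age_seq, function( g, a )
    predict( mod_bab3, newdata = data.frame( gestation = g, age = a, smoke = smoke_val ) ) )
}

plot_ly() %>%
  add_trace( data = babies_filt, x = ~gestation, y = ~age, z = ~wt,
             type = "scatter3d", mode = "markers", marker = list( size = 2 ) ) %>%
  add_surface( x = gest_seq, y = age_seq, z = t( plane_for( 0 ) ), showscale = FALSE ) %>%
  add_surface( x = gest_seq, y = age_seq, z = t( plane_for( 1 ) ), showscale = FALSE )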

Higher Dimensions

Kitchen sink model: throw everything in and see what happens
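
For instance, a kitchen-sink fit on the babies data might look like the following (an illustrative sketch of my own; the chosen predictors are arbitrary):

# throw several plausible predictors at birth weight and see what sticks
mod_sink <- lm( wt ~ gestation + age + ht + wt1 + parity + factor( smoke ),
                data = babies_filt )
summary( mod_sink )$adj.r.squared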

Logistic Regression

What is Logistic Regression?

A regression model for a binary (two-level categorical) response variable.

glimpse( heart_transplant )
## Rows: 103
## Columns: 8
## $ id         <int> 15, 43, 61, 75, 6, 42, 54, 38, 85, 2, 103, 12, 48, 102, 35…
## $ acceptyear <int> 68, 70, 71, 72, 68, 70, 71, 70, 73, 68, 67, 68, 71, 74, 70…
## $ age        <int> 53, 43, 52, 52, 54, 36, 47, 41, 47, 51, 39, 53, 56, 40, 43…
## $ survived   <fct> dead, dead, dead, dead, dead, dead, dead, dead, dead, dead…
## $ survtime   <int> 1, 2, 2, 2, 3, 3, 3, 5, 5, 6, 6, 8, 9, 11, 12, 16, 16, 16,…
## $ prior      <fct> no, no, no, no, no, no, no, no, no, no, no, no, no, no, no…
## $ transplant <fct> control, control, control, control, control, control, cont…
## $ wait       <int> NA, NA, NA, NA, NA, NA, NA, 5, NA, NA, NA, NA, NA, NA, NA,…
ggplot( data = heart_transplant, aes( x = age, y = survived ) ) +
  geom_jitter( width = 0, height = 0.05, alpha = 0.5 )

We have to transform the survived feature to a binary variable

heartTr <- heart_transplant %>%
  mutate( is_alive = ifelse( survived == 'alive', 1 ,0 ) )

Now, visualize the binary response

data_space_hearTr <- ggplot( data = heartTr, aes( x = age, y = is_alive ) ) +
  geom_jitter( width = 0, height = 0.05, alpha = 0.5 )
data_space_hearTr

Just fitting a linear regression line here is inappropriate and will not capture the trend in the data.

Generalized Linear Models:

  • Generalization of multiple regression.
    • model non-normal response distributions
  • GLM special case: logistic regression
    • model a binary response variable
    • uses logit link function
  • GLM in a nutshell: apply a link function to appropriately scale the response variable to match the output of a linear model. The link function used in logistic regression is the logit function.
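
Concretely, the logit link and its inverse (the logistic function) take the standard forms:

\[logit(p) = \log\left(\frac{p}{1-p}\right) = \beta_0 + \beta_1 \cdot x, \qquad p = \frac{1}{1 + e^{-(\beta_0 + \beta_1 \cdot x)}}\]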

Fitting a GLM:

glm( is_alive ~ age, data = heartTr, family = binomial )
## 
## Call:  glm(formula = is_alive ~ age, family = binomial, data = heartTr)
## 
## Coefficients:
## (Intercept)          age  
##     1.56438     -0.05847  
## 
## Degrees of Freedom: 102 Total (i.e. Null);  101 Residual
## Null Deviance:       120.5 
## Residual Deviance: 113.7     AIC: 117.7
binomial()
## 
## Family: binomial 
## Link function: logit
data("MedGPA")
head( MedGPA )
##   Accept Acceptance Sex BCPM  GPA VR PS WS BS MCAT Apps
## 1      D          0   F 3.59 3.62 11  9  9  9   38    5
## 2      A          1   M 3.75 3.84 12 13  8 12   45    3
## 3      A          1   F 3.24 3.23  9 10  5  9   33   19
## 4      A          1   F 3.74 3.69 12 11  7 10   40    5
## 5      A          1   F 3.53 3.38  9 11  4 11   35   11
## 6      A          1   M 3.59 3.72 10  9  7 10   36    5

Example categorical response variable plotted with a simple linear regression line:

# scatterplot with jitter
data_space <- ggplot( data = MedGPA, aes( x = GPA, y = Acceptance ) ) + 
  geom_jitter(width = 0, height = 0.05, alpha = 0.5)

# linear regression line
data_space + 
  geom_smooth( method = 'lm', se = FALSE )
## `geom_smooth()` using formula 'y ~ x'

For predictions close to the median, a linear regression model might seem appropriate:

# filter
MedGPA_middle <- MedGPA %>%
filter( GPA > 3.375 & GPA <= 3.77 )

# scatterplot with jitter
data_space2 <- ggplot(data = MedGPA_middle, aes( x = GPA, y = Acceptance )) + 
  geom_jitter(width = 0, height = 0.05, alpha = 0.5)

# linear regression line
data_space2 + 
  geom_smooth( method = 'lm', se = FALSE )
## `geom_smooth()` using formula 'y ~ x'

However, linear regression does not make good predictions at the extreme ends of the data distribution (very high or very low GPA scores).

Fit a GLM model

# fit model
glm(Acceptance ~ GPA, data = MedGPA, family = binomial)
## 
## Call:  glm(formula = Acceptance ~ GPA, family = binomial, data = MedGPA)
## 
## Coefficients:
## (Intercept)          GPA  
##     -19.207        5.454  
## 
## Degrees of Freedom: 54 Total (i.e. Null);  53 Residual
## Null Deviance:       75.79 
## Residual Deviance: 56.84     AIC: 60.84

Visualizing Logistic Regression

data_space

data_space +
  geom_smooth( method = 'lm', se = FALSE ) +
  geom_smooth( method = 'glm', se = FALSE, color = 'red',
               method.args = list( family = 'binomial' ) )
## `geom_smooth()` using formula 'y ~ x'
## `geom_smooth()` using formula 'y ~ x'

data_space_hearTr +
  geom_smooth( method = 'lm', se = FALSE ) +
  geom_smooth( method = 'glm', se = FALSE, color = 'red',
               method.args = list( family = 'binomial' ) )
## `geom_smooth()` using formula 'y ~ x'
## `geom_smooth()` using formula 'y ~ x'

The sigmoid-shaped logistic curve never reaches 0 or 1, which eliminates the problematic predicted values of the lm model.
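
As a quick check (my own, using a hypothetical refit named gpa_mod), the fitted probabilities stay strictly inside (0, 1) even at extreme GPAs:

# refit the logistic model and predict at extreme GPA values
gpa_mod <- glm( Acceptance ~ GPA, data = MedGPA, family = binomial )
predict( gpa_mod, newdata = data.frame( GPA = c( 2.0, 4.0 ) ), type = "response" )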

MedGPA_binned <- MedGPA %>%
  mutate( bin = cut( GPA, 6 ) ) %>%
  group_by( bin ) %>%
  summarise( mean_GPA = mean( GPA ),
             acceptance_rate = sum(Acceptance)/n())
## `summarise()` ungrouping output (override with `.groups` argument)
glimpse( MedGPA_binned )
## Rows: 6
## Columns: 3
## $ bin             <fct> "(2.72,2.93]", "(2.93,3.14]", "(3.14,3.35]", "(3.35,3…
## $ mean_GPA        <dbl> 2.760000, 3.065000, 3.255714, 3.437857, 3.646667, 3.8…
## $ acceptance_rate <dbl> 0.0000000, 0.0000000, 0.2857143, 0.4285714, 0.5333333…
glmmod <- glm(formula = Acceptance ~ GPA, family = binomial, data = MedGPA)

# binned points and line
data_space <- ggplot( data = MedGPA_binned, aes( x = mean_GPA, y = acceptance_rate ) ) +
geom_line()
#data_space

# augmented model
MedGPA_plus <- augment( glmmod, type.predict = 'response' )
#glimpse( MedGPA_plus )
# logistic model on probability scale
data_space +
  geom_line(data = MedGPA_plus, aes( x = GPA, y = .fitted), color = "red")

Observe that the logistic predictions follow the binned values very well

Three scales approach to interpretation

hTrmod <- glm( is_alive ~ age, data = heartTr, family = binomial )
heartTr_plus <- hTrmod %>%
  augment( type.predict = 'response' ) %>%
  mutate( y_hat = .fitted )

Probability scale plot

ggplot( heartTr_plus, aes( x = age, y = y_hat ) ) +
  geom_point() +
  geom_line() +
  scale_y_continuous( 'Probability of Being Alive', limits = c( 0,1 ) )

The logistic line is curved, so we can no longer say that each additional year of age is associated with a constant change (slope) in P(alive), as we could for a linear model.

To combat this, we can change the scale of the y-axis.
Odds scale: \[odds(\hat{y}) = \frac{\hat{y}}{1-\hat{y}} = \mbox{exp}(\hat{\beta_0} + \hat{\beta_1} \cdot x)\]

#glimpse( heartTr_plus )
heartTr_plus <- heartTr_plus %>%
  mutate( odds_hat = y_hat / (1 - y_hat ) )

ggplot( heartTr_plus, aes( x = age, y = odds_hat ) ) +
  geom_point() +
  geom_line() +
  scale_y_continuous( 'Odds of Being Alive', limits = c( 0,1 ) )
## Warning: Removed 7 rows containing missing values (geom_point).
## Warning: Removed 7 row(s) containing missing values (geom_path).

The model now has the form of an exponential function

Log-odds scale:
\[logit(\hat{y}) = \log \left[ \frac{\hat{y}}{1-\hat{y}} \right] = \hat{\beta_0} + \hat{\beta_1} \cdot x\]

heartTr_plus <- heartTr_plus %>%
  mutate( log_odds_hat = log( odds_hat ) )

ggplot( heartTr_plus, aes( x = age, y = log_odds_hat ) ) +
  geom_point() +
  geom_line() +
  scale_y_continuous( 'Log-Odds of Being Alive', limits = c( 0,0.5 ) ) +
  scale_x_continuous( limits = c( 0,70 ) )
## Warning: Removed 97 rows containing missing values (geom_point).
## Warning: Removed 97 row(s) containing missing values (geom_path).

Comparison of the 3 scales:

  1. Probability Scale
  • scale: intuitive, easy to read
  • function: non-linear and hard to interpret
  2. Odds Scale
  • scale: harder to interpret
  • function: exponential; hard to interpret
  3. Log-Odds Scale
  • scale: impossible to interpret
  • function: linear; easy to interpret

Odds ratios

# note: `mod` is the earlier linear model; exponentiated coefficients are odds ratios only for a logistic fit
exp( coef(mod) )
##      (Intercept)            displ factor(year)2008 
##     2.089510e+15     2.702518e-02     4.063874e+00
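
Applied to the logistic heart transplant model hTrmod fit above, the exponentiated age coefficient is an odds ratio per additional year of age (my own aside, using the coefficients printed earlier):

# exp(-0.05847) is roughly 0.94: each additional year of age multiplies the
# odds of being alive by about 0.94, i.e. a ~6% decrease in the odds
exp( coef( hTrmod ) )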
# compute odds for bins
MedGPA_binned <- MedGPA_binned %>%
mutate( odds = acceptance_rate/(1-acceptance_rate))

# plot binned odds
data_space <- ggplot( MedGPA_binned, aes( x = mean_GPA, y = odds ) ) +
geom_point() +
geom_line()


# compute odds for observations
MedGPA_plus <- MedGPA_plus %>%
mutate( odds_hat = .fitted/(1-.fitted))

# logistic model on odds scale
data_space +
  geom_line(data = MedGPA_plus, aes( x = GPA, y = odds_hat ), color = "red")

Log-odds scale

# compute log odds for bins
MedGPA_binned <- MedGPA_binned %>%
mutate( log_odds = log( acceptance_rate/( 1-acceptance_rate ) ) )

# plot binned log odds
data_space <- ggplot( MedGPA_binned, aes( x = mean_GPA, y = log_odds ) ) +
geom_point() +
geom_line()


# compute log odds for observations
MedGPA_plus <- MedGPA_plus %>%
mutate( log_odds_hat = log( .fitted/( 1-.fitted ) ) )

# logistic model on log odds scale
data_space +
  geom_line(data = MedGPA_plus, aes( x = GPA, y = log_odds_hat), color = "red")

Using a Logistic Model

Learning from the coefficients about the underlying processes

#fitted probabilities on the log-odds scale (not very useful):
augment( hTrmod )  
## # A tibble: 103 x 8
##    is_alive   age .fitted .resid .std.resid   .hat .sigma .cooksd
##       <dbl> <int>   <dbl>  <dbl>      <dbl>  <dbl>  <dbl>   <dbl>
##  1        0    53  -1.53  -0.625     -0.630 0.0156   1.06 0.00174
##  2        0    43  -0.950 -0.809     -0.813 0.0106   1.06 0.00210
##  3        0    52  -1.48  -0.642     -0.646 0.0147   1.06 0.00173
##  4        0    52  -1.48  -0.642     -0.646 0.0147   1.06 0.00173
##  5        0    54  -1.59  -0.608     -0.614 0.0166   1.06 0.00175
##  6        0    36  -0.540 -0.958     -0.967 0.0181   1.06 0.00548
##  7        0    47  -1.18  -0.731     -0.735 0.0111   1.06 0.00174
##  8        0    41  -0.833 -0.850     -0.855 0.0115   1.06 0.00257
##  9        0    47  -1.18  -0.731     -0.735 0.0111   1.06 0.00174
## 10        0    51  -1.42  -0.659     -0.663 0.0138   1.06 0.00172
## # … with 93 more rows
#return on probability scale (more familiar)
augment( hTrmod, type.predict = 'response' )
## # A tibble: 103 x 8
##    is_alive   age .fitted .resid .std.resid   .hat .sigma .cooksd
##       <dbl> <int>   <dbl>  <dbl>      <dbl>  <dbl>  <dbl>   <dbl>
##  1        0    53   0.177 -0.625     -0.630 0.0156   1.06 0.00174
##  2        0    43   0.279 -0.809     -0.813 0.0106   1.06 0.00210
##  3        0    52   0.186 -0.642     -0.646 0.0147   1.06 0.00173
##  4        0    52   0.186 -0.642     -0.646 0.0147   1.06 0.00173
##  5        0    54   0.169 -0.608     -0.614 0.0166   1.06 0.00175
##  6        0    36   0.368 -0.958     -0.967 0.0181   1.06 0.00548
##  7        0    47   0.234 -0.731     -0.735 0.0111   1.06 0.00174
##  8        0    41   0.303 -0.850     -0.855 0.0115   1.06 0.00257
##  9        0    47   0.234 -0.731     -0.735 0.0111   1.06 0.00174
## 10        0    51   0.195 -0.659     -0.663 0.0138   1.06 0.00172
## # … with 93 more rows

Out-of-Sample predictions

cheney <- data.frame( age = 71, transplant = 'treatment' )
augment( hTrmod, newdata = cheney, type.predict = 'response' )
## # A tibble: 1 x 3
##     age transplant .fitted
##   <dbl> <chr>        <dbl>
## 1    71 treatment   0.0700

Making Binary Predictions
e.g., for the heart transplant data, a person either lives or dies. Does it make sense to predict a probability?

hTrmod_plus <- augment( hTrmod, type.predict = 'response' ) %>%
  mutate( alive_hat = round( .fitted ) )
#glimpse( hTrmod_plus)
hTrmod_plus %>%
  select( is_alive, age, .fitted, alive_hat )
## # A tibble: 103 x 4
##    is_alive   age .fitted alive_hat
##       <dbl> <int>   <dbl>     <dbl>
##  1        0    53   0.177         0
##  2        0    43   0.279         0
##  3        0    52   0.186         0
##  4        0    52   0.186         0
##  5        0    54   0.169         0
##  6        0    36   0.368         0
##  7        0    47   0.234         0
##  8        0    41   0.303         0
##  9        0    47   0.234         0
## 10        0    51   0.195         0
## # … with 93 more rows

A look at the confusion matrix for a simple rounding scheme:

hTrmod_plus %>%
  select( is_alive, alive_hat ) %>%
  table()
##         alive_hat
## is_alive  0  1
##        0 71  4
##        1 25  3
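
From this table, 71 + 3 = 74 of the 103 cases are classified correctly, or roughly 72% accuracy; the same figure can be computed directly (my own check):

# overall accuracy of the simple rounding classifier
mean( hTrmod_plus$is_alive == hTrmod_plus$alive_hat )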

Back to the MedGPA data

# create new data frame
new_data <- data.frame( 'GPA' = 3.51 )

# make predictions
augment( glmmod, newdata = new_data, type.predict = 'response' )
## # A tibble: 1 x 2
##     GPA .fitted
##   <dbl>   <dbl>
## 1  3.51   0.484
# data frame with binary predictions
tidy_mod <- glmmod %>%
augment( type.predict = 'response' ) %>%
mutate( Acceptance_hat = round( .fitted ) )
  
# confusion matrix
tidy_mod %>%
  select(Acceptance, Acceptance_hat) %>% 
  table()
##           Acceptance_hat
## Acceptance  0  1
##          0 16  9
##          1  6 24

Case Study: Italian Restaurants in NYC

Italian Restaurants in NYC

What are the factors that contribute to the price of a meal at an Italian Restaurant in NYC?

#Zagat reviews data
nyc_url <- 'https://assets.datacamp.com/production/repositories/845/datasets/639a7a3f9020edb51bcbc4bfdb7b71cbd8b9a70e/nyc.csv'
nyc <- read.csv( nyc_url )
glimpse( nyc )
## Rows: 168
## Columns: 7
## $ Case       <int> 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17,…
## $ Restaurant <chr> "Daniella Ristorante", "Tello's Ristorante", "Biricchino",…
## $ Price      <int> 43, 32, 34, 41, 54, 52, 34, 34, 39, 44, 45, 47, 52, 35, 47…
## $ Food       <int> 22, 20, 21, 20, 24, 22, 22, 20, 22, 21, 19, 21, 21, 19, 20…
## $ Decor      <int> 18, 19, 13, 20, 19, 22, 16, 18, 19, 17, 17, 19, 19, 17, 18…
## $ Service    <int> 20, 19, 18, 17, 21, 21, 21, 21, 22, 19, 20, 21, 20, 19, 21…
## $ East       <int> 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1…

EDA
How are the variables distributed?

nyc %>% select( -Restaurant ) %>% pairs()

# Price by Food plot
ggplot( data = nyc, aes( x = Food, y = Price ) ) +
geom_point()

# Price by Food model
lm( Price ~ Food, data = nyc )
## 
## Call:
## lm(formula = Price ~ Food, data = nyc)
## 
## Coefficients:
## (Intercept)         Food  
##     -17.832        2.939

Incorporating another variable

How does location to the East or West of 5th Avenue affect the price of an Italian meal?

nyc %>% group_by( East ) %>%
  dplyr::summarize( mean_price = mean( Price ) )
## `summarise()` ungrouping output (override with `.groups` argument)
## # A tibble: 2 x 2
##    East mean_price
##   <int>      <dbl>
## 1     0       40.4
## 2     1       44.0

But is there a confounding variable? Could the food quality have more to do with the price than location? How does the quality of service affect price?

lm( data = nyc, Price ~ Food + East )
## 
## Call:
## lm(formula = Price ~ Food + East, data = nyc)
## 
## Coefficients:
## (Intercept)         Food         East  
##     -17.430        2.875        1.459

Higher Dimensions

How does the perceived quality of the food vary with the price of a meal? How is this effect moderated by the quality of the service and/or décor?

Collinear variables: one variable varies in lockstep with another variable. Having collinear variables in a model detracts from the power of the model by adding redundant information.

An example of perfect collinearity:

#Comparing the price of a meal in dollars to the price in cents
nyc %>%
  mutate( Price_cents = Price * 100 ) %>%
  dplyr::summarise( cor_collinear = cor( Price, Price_cents ) )
##   cor_collinear
## 1             1

Multicollinearity

  • Explanatory variables can become highly correlated, especially as more and more features are added to a model (see the correlation check below)
  • leads to unstable coefficient estimates
  • However, it does not detract from the \(R^2\), or the explanatory power of the model as a whole
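
For the NYC data, a quick look at how correlated the ratings are with one another (my own check, not from the course):

# pairwise correlations among the candidate explanatory variables
nyc %>% dplyr::select( Food, Decor, Service ) %>% cor()
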
# Price by Food and Service and East
lm( data = nyc, Price ~ Food + Service + East )
## 
## Call:
## lm(formula = Price ~ Food + Service + East, data = nyc)
## 
## Coefficients:
## (Intercept)         Food      Service         East  
##    -20.8155       1.4863       1.6647       0.9649

Higher dimensions

hdimmod <- lm( data = nyc, Price ~ Food + Decor + Service + East )
summary( hdimmod )
## 
## Call:
## lm(formula = Price ~ Food + Decor + Service + East, data = nyc)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -14.0465  -3.8837   0.0373   3.3942  17.7491 
## 
## Coefficients:
##               Estimate Std. Error t value Pr(>|t|)    
## (Intercept) -24.023800   4.708359  -5.102 9.24e-07 ***
## Food          1.538120   0.368951   4.169 4.96e-05 ***
## Decor         1.910087   0.217005   8.802 1.87e-15 ***
## Service      -0.002727   0.396232  -0.007   0.9945    
## East          2.068050   0.946739   2.184   0.0304 *  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 5.738 on 163 degrees of freedom
## Multiple R-squared:  0.6279, Adjusted R-squared:  0.6187 
## F-statistic: 68.76 on 4 and 163 DF,  p-value: < 2.2e-16