Sales of Child Car Seats

We’ll be using the Carseat data set in the ISLR package. Its a simulated dataset about the sales of car seats at different stores.

library(ISLR)
data(Carseats)
names(Carseats)
##  [1] "Sales"       "CompPrice"   "Income"      "Advertising" "Population" 
##  [6] "Price"       "ShelveLoc"   "Age"         "Education"   "Urban"      
## [11] "US"
str(Carseats)
## 'data.frame':    400 obs. of  11 variables:
##  $ Sales      : num  9.5 11.22 10.06 7.4 4.15 ...
##  $ CompPrice  : num  138 111 113 117 141 124 115 136 132 132 ...
##  $ Income     : num  73 48 35 100 64 113 105 81 110 113 ...
##  $ Advertising: num  11 16 10 4 3 13 0 15 0 0 ...
##  $ Population : num  276 260 269 466 340 501 45 425 108 131 ...
##  $ Price      : num  120 83 80 97 128 72 108 120 124 124 ...
##  $ ShelveLoc  : Factor w/ 3 levels "Bad","Good","Medium": 1 2 3 3 1 1 3 2 3 3 ...
##  $ Age        : num  42 65 59 55 38 78 71 67 76 76 ...
##  $ Education  : num  17 10 12 14 13 16 15 10 10 17 ...
##  $ Urban      : Factor w/ 2 levels "No","Yes": 2 2 2 2 2 1 2 2 1 1 ...
##  $ US         : Factor w/ 2 levels "No","Yes": 2 2 2 2 1 2 1 2 1 2 ...

Observe that there are a few variables that were “Factors” this means that they are categorical. Using the str() function we can also see how many levels each have.

Dummy variables

The method of using dummy variables is also called contrasts. We can use the contrast() function to see how this coding is done.

contrasts(Carseats$ShelveLoc)
##        Good Medium
## Bad       0      0
## Good      1      0
## Medium    0      1

Fitting a Simple Model

First, lets fit a simple model of sales as a function of location:

carMod1<-lm(Sales~ShelveLoc, data=Carseats)
summary(carMod1)
## 
## Call:
## lm(formula = Sales ~ ShelveLoc, data = Carseats)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -7.3066 -1.6282 -0.0416  1.5666  6.1471 
## 
## Coefficients:
##                 Estimate Std. Error t value Pr(>|t|)    
## (Intercept)       5.5229     0.2388  23.131  < 2e-16 ***
## ShelveLocGood     4.6911     0.3484  13.464  < 2e-16 ***
## ShelveLocMedium   1.7837     0.2864   6.229  1.2e-09 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 2.339 on 397 degrees of freedom
## Multiple R-squared:  0.3172, Adjusted R-squared:  0.3138 
## F-statistic: 92.23 on 2 and 397 DF,  p-value: < 2.2e-16
# Visualize the model
library(tidyverse)
ggplot(Carseats, aes(y=Sales, x=ShelveLoc, fill=ShelveLoc))+
  geom_boxplot()

# ANOVA output
anova(carMod1)
## Analysis of Variance Table
## 
## Response: Sales
##            Df Sum Sq Mean Sq F value    Pr(>F)    
## ShelveLoc   2 1009.5  504.77   92.23 < 2.2e-16 ***
## Residuals 397 2172.7    5.47                      
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

Second Model: Parallel lines

Second, lets include Price in the model:

carMod2<-lm(Sales~ShelveLoc+Price, data=Carseats)
summary(carMod2)
## 
## Call:
## lm(formula = Sales ~ ShelveLoc + Price, data = Carseats)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -5.8229 -1.3930 -0.0179  1.3868  5.0780 
## 
## Coefficients:
##                  Estimate Std. Error t value Pr(>|t|)    
## (Intercept)     12.001802   0.503447  23.839  < 2e-16 ***
## ShelveLocGood    4.895848   0.285921  17.123  < 2e-16 ***
## ShelveLocMedium  1.862022   0.234748   7.932 2.23e-14 ***
## Price           -0.056698   0.004059 -13.967  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 1.917 on 396 degrees of freedom
## Multiple R-squared:  0.5426, Adjusted R-squared:  0.5391 
## F-statistic: 156.6 on 3 and 396 DF,  p-value: < 2.2e-16
# BAD
# intercept = 12.001802
# slope = -0.056698

# GOOD
# intercept = 12.001802+4.895848
# slope = -0.056698

# MEDIUM
# intercept = 12.001802+1.862022
# slope = -0.056698

# VIZ: Shifts in intercept
ggplot(Carseats, aes(x=Price, y=Sales, color=ShelveLoc))+
  geom_point()+
  geom_abline(intercept = carMod2$coefficients[1], slope=carMod2$coefficients[4],
              color="red", lwd=1)+
  geom_abline(intercept = carMod2$coefficients[1]+carMod2$coefficients[2], slope=carMod2$coefficients[4],
              color="forestgreen", lwd=1)+
  geom_abline(intercept = carMod2$coefficients[1]+carMod2$coefficients[3], slope=carMod2$coefficients[4],
              color="blue", lwd=1)

# ANOVA
anova(carMod2)
## Analysis of Variance Table
## 
## Response: Sales
##            Df Sum Sq Mean Sq F value    Pr(>F)    
## ShelveLoc   2 1009.5  504.77  137.32 < 2.2e-16 ***
## Price       1  717.1  717.10  195.08 < 2.2e-16 ***
## Residuals 396 1455.6    3.68                      
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

Third Model: Interaction

Third, lets include an interaction with Price in the model:

carMod3<-lm(Sales~ShelveLoc*Price, data=Carseats)
summary(carMod3)
## 
## Call:
## lm(formula = Sales ~ ShelveLoc * Price, data = Carseats)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -5.9037 -1.3461 -0.0595  1.3679  4.9037 
## 
## Coefficients:
##                        Estimate Std. Error t value Pr(>|t|)    
## (Intercept)           11.832984   0.965788  12.252  < 2e-16 ***
## ShelveLocGood          6.135880   1.392844   4.405 1.36e-05 ***
## ShelveLocMedium        1.630481   1.171616   1.392    0.165    
## Price                 -0.055220   0.008276  -6.672 8.57e-11 ***
## ShelveLocGood:Price   -0.010564   0.011742  -0.900    0.369    
## ShelveLocMedium:Price  0.001984   0.010007   0.198    0.843    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 1.918 on 394 degrees of freedom
## Multiple R-squared:  0.5444, Adjusted R-squared:  0.5386 
## F-statistic: 94.17 on 5 and 394 DF,  p-value: < 2.2e-16
# BAD
# intercept = 11.832984
# slope = -0.055220

# GOOD
# intercept = 11.832984+6.135880
# slope = -0.055220-0.010564

# MEDIUM
# intercept = 11.832984+1.630481
# slope = -0.055220+0.001984

# VIZ: Shift intercepts and slopes
ggplot(Carseats, aes(x=Price, y=Sales, color=ShelveLoc))+
  geom_point()+
  geom_abline(intercept = carMod3$coefficients[1], slope=carMod3$coefficients[4],
              color="red", lwd=1)+
  geom_abline(intercept = carMod3$coefficients[1]+carMod3$coefficients[2], slope=carMod3$coefficients[4]+carMod3$coefficients[5],
              color="forestgreen", lwd=1)+
  geom_abline(intercept = carMod3$coefficients[1]+carMod3$coefficients[3], slope=carMod3$coefficients[4]+carMod3$coefficients[6],
              color="blue", lwd=1)

# ANOVA
anova(carMod3)
## Analysis of Variance Table
## 
## Response: Sales
##                  Df  Sum Sq Mean Sq  F value Pr(>F)    
## ShelveLoc         2 1009.53  504.77 137.1807 <2e-16 ***
## Price             1  717.10  717.10 194.8878 <2e-16 ***
## ShelveLoc:Price   2    5.89    2.95   0.8005 0.4498    
## Residuals       394 1449.75    3.68                    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

Re-leveling

One last comment, you might want to set a different reference level, rather than using the alphabetic order.

Carseats$ShelveLoc <- factor(Carseats$ShelveLoc, levels = c("Medium", "Bad", "Good"))

contrasts(Carseats$ShelveLoc)
##        Bad Good
## Medium   0    0
## Bad      1    0
## Good     0    1