We’ll be using the Carseat data set in the ISLR package. Its a simulated dataset about the sales of car seats at different stores.
library(ISLR)
data(Carseats)
names(Carseats)
## [1] "Sales" "CompPrice" "Income" "Advertising" "Population"
## [6] "Price" "ShelveLoc" "Age" "Education" "Urban"
## [11] "US"
str(Carseats)
## 'data.frame': 400 obs. of 11 variables:
## $ Sales : num 9.5 11.22 10.06 7.4 4.15 ...
## $ CompPrice : num 138 111 113 117 141 124 115 136 132 132 ...
## $ Income : num 73 48 35 100 64 113 105 81 110 113 ...
## $ Advertising: num 11 16 10 4 3 13 0 15 0 0 ...
## $ Population : num 276 260 269 466 340 501 45 425 108 131 ...
## $ Price : num 120 83 80 97 128 72 108 120 124 124 ...
## $ ShelveLoc : Factor w/ 3 levels "Bad","Good","Medium": 1 2 3 3 1 1 3 2 3 3 ...
## $ Age : num 42 65 59 55 38 78 71 67 76 76 ...
## $ Education : num 17 10 12 14 13 16 15 10 10 17 ...
## $ Urban : Factor w/ 2 levels "No","Yes": 2 2 2 2 2 1 2 2 1 1 ...
## $ US : Factor w/ 2 levels "No","Yes": 2 2 2 2 1 2 1 2 1 2 ...
Observe that there are a few variables that were “Factors” this means that they are categorical. Using the str() function we can also see how many levels each have.
The method of using dummy variables is also called contrasts. We can use the contrast() function to see how this coding is done.
contrasts(Carseats$ShelveLoc)
## Good Medium
## Bad 0 0
## Good 1 0
## Medium 0 1
First, lets fit a simple model of sales as a function of location:
carMod1<-lm(Sales~ShelveLoc, data=Carseats)
summary(carMod1)
##
## Call:
## lm(formula = Sales ~ ShelveLoc, data = Carseats)
##
## Residuals:
## Min 1Q Median 3Q Max
## -7.3066 -1.6282 -0.0416 1.5666 6.1471
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 5.5229 0.2388 23.131 < 2e-16 ***
## ShelveLocGood 4.6911 0.3484 13.464 < 2e-16 ***
## ShelveLocMedium 1.7837 0.2864 6.229 1.2e-09 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 2.339 on 397 degrees of freedom
## Multiple R-squared: 0.3172, Adjusted R-squared: 0.3138
## F-statistic: 92.23 on 2 and 397 DF, p-value: < 2.2e-16
# Visualize the model
library(tidyverse)
ggplot(Carseats, aes(y=Sales, x=ShelveLoc, fill=ShelveLoc))+
geom_boxplot()
# ANOVA output
anova(carMod1)
## Analysis of Variance Table
##
## Response: Sales
## Df Sum Sq Mean Sq F value Pr(>F)
## ShelveLoc 2 1009.5 504.77 92.23 < 2.2e-16 ***
## Residuals 397 2172.7 5.47
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
Second, lets include Price in the model:
carMod2<-lm(Sales~ShelveLoc+Price, data=Carseats)
summary(carMod2)
##
## Call:
## lm(formula = Sales ~ ShelveLoc + Price, data = Carseats)
##
## Residuals:
## Min 1Q Median 3Q Max
## -5.8229 -1.3930 -0.0179 1.3868 5.0780
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 12.001802 0.503447 23.839 < 2e-16 ***
## ShelveLocGood 4.895848 0.285921 17.123 < 2e-16 ***
## ShelveLocMedium 1.862022 0.234748 7.932 2.23e-14 ***
## Price -0.056698 0.004059 -13.967 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 1.917 on 396 degrees of freedom
## Multiple R-squared: 0.5426, Adjusted R-squared: 0.5391
## F-statistic: 156.6 on 3 and 396 DF, p-value: < 2.2e-16
# BAD
# intercept = 12.001802
# slope = -0.056698
# GOOD
# intercept = 12.001802+4.895848
# slope = -0.056698
# MEDIUM
# intercept = 12.001802+1.862022
# slope = -0.056698
# VIZ: Shifts in intercept
ggplot(Carseats, aes(x=Price, y=Sales, color=ShelveLoc))+
geom_point()+
geom_abline(intercept = carMod2$coefficients[1], slope=carMod2$coefficients[4],
color="red", lwd=1)+
geom_abline(intercept = carMod2$coefficients[1]+carMod2$coefficients[2], slope=carMod2$coefficients[4],
color="forestgreen", lwd=1)+
geom_abline(intercept = carMod2$coefficients[1]+carMod2$coefficients[3], slope=carMod2$coefficients[4],
color="blue", lwd=1)
# ANOVA
anova(carMod2)
## Analysis of Variance Table
##
## Response: Sales
## Df Sum Sq Mean Sq F value Pr(>F)
## ShelveLoc 2 1009.5 504.77 137.32 < 2.2e-16 ***
## Price 1 717.1 717.10 195.08 < 2.2e-16 ***
## Residuals 396 1455.6 3.68
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
Third, lets include an interaction with Price in the model:
carMod3<-lm(Sales~ShelveLoc*Price, data=Carseats)
summary(carMod3)
##
## Call:
## lm(formula = Sales ~ ShelveLoc * Price, data = Carseats)
##
## Residuals:
## Min 1Q Median 3Q Max
## -5.9037 -1.3461 -0.0595 1.3679 4.9037
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 11.832984 0.965788 12.252 < 2e-16 ***
## ShelveLocGood 6.135880 1.392844 4.405 1.36e-05 ***
## ShelveLocMedium 1.630481 1.171616 1.392 0.165
## Price -0.055220 0.008276 -6.672 8.57e-11 ***
## ShelveLocGood:Price -0.010564 0.011742 -0.900 0.369
## ShelveLocMedium:Price 0.001984 0.010007 0.198 0.843
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 1.918 on 394 degrees of freedom
## Multiple R-squared: 0.5444, Adjusted R-squared: 0.5386
## F-statistic: 94.17 on 5 and 394 DF, p-value: < 2.2e-16
# BAD
# intercept = 11.832984
# slope = -0.055220
# GOOD
# intercept = 11.832984+6.135880
# slope = -0.055220-0.010564
# MEDIUM
# intercept = 11.832984+1.630481
# slope = -0.055220+0.001984
# VIZ: Shift intercepts and slopes
ggplot(Carseats, aes(x=Price, y=Sales, color=ShelveLoc))+
geom_point()+
geom_abline(intercept = carMod3$coefficients[1], slope=carMod3$coefficients[4],
color="red", lwd=1)+
geom_abline(intercept = carMod3$coefficients[1]+carMod3$coefficients[2], slope=carMod3$coefficients[4]+carMod3$coefficients[5],
color="forestgreen", lwd=1)+
geom_abline(intercept = carMod3$coefficients[1]+carMod3$coefficients[3], slope=carMod3$coefficients[4]+carMod3$coefficients[6],
color="blue", lwd=1)
# ANOVA
anova(carMod3)
## Analysis of Variance Table
##
## Response: Sales
## Df Sum Sq Mean Sq F value Pr(>F)
## ShelveLoc 2 1009.53 504.77 137.1807 <2e-16 ***
## Price 1 717.10 717.10 194.8878 <2e-16 ***
## ShelveLoc:Price 2 5.89 2.95 0.8005 0.4498
## Residuals 394 1449.75 3.68
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
One last comment, you might want to set a different reference level, rather than using the alphabetic order.
Carseats$ShelveLoc <- factor(Carseats$ShelveLoc, levels = c("Medium", "Bad", "Good"))
contrasts(Carseats$ShelveLoc)
## Bad Good
## Medium 0 0
## Bad 1 0
## Good 0 1