Multiple Linear Regression

Categorical Explanatory Variables and Interactions

Example 1: Shipping Books

When you buy a book off Amazon, you get a quote for how much it costs to ship. This is based on the weight of the book. If you didn’t know the weight of the book, what other characteristics of it could you measure to help predict the weight?

Step 0: Load the data into R.

library(tidyverse)

#install.packages("DAAG")
library(DAAG)
data(allbacks)

books <- allbacks[, c(3, 1, 4)]
head(books)
##   weight volume cover
## 1    800    885    hb
## 2    950   1016    hb
## 3   1050   1125    hb
## 4    350    239    hb
## 5    750    701    hb
## 6    600    641    hb

Step 1: Look at the data

# Scatterplot of all books together
ggplot(data=books, aes(x=volume, y=weight))+
  geom_point()+
  ggtitle("Scatterplot of Volume vs Weight of Books")+
  theme_bw()

Step 2: Create a simple linear model

m3<-lm(weight~volume, data=books)
summary(m3)
## 
## Call:
## lm(formula = weight ~ volume, data = books)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -189.97 -109.86   38.08  109.73  145.57 
## 
## Coefficients:
##              Estimate Std. Error t value Pr(>|t|)    
## (Intercept) 107.67931   88.37758   1.218    0.245    
## volume        0.70864    0.09746   7.271 6.26e-06 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 123.9 on 13 degrees of freedom
## Multiple R-squared:  0.8026, Adjusted R-squared:  0.7875 
## F-statistic: 52.87 on 1 and 13 DF,  p-value: 6.262e-06

… and plot the model

# Scatterplot of all books together and fitted model
ggplot(data=books, aes(x=volume, y=weight))+
  geom_point()+
  ggtitle("Scatterplot of Volume vs Weight of Books")+
  theme_bw()+
  geom_abline(slope=m3$coefficients[2], intercept = m3$coefficients[1])

What about considering cover type in the model?

# What about considering cover type 
ggplot(data=books, aes(x=volume, y=weight, color=cover))+
  geom_point()+
  ggtitle("Scatterplot of Volume vs Weight of Books")+
  theme_bw()

Step 3: Now create a MLR model

# Include indicator to shift y-intercept
m4<-lm(weight~volume+cover, data=books)
summary(m4)
## 
## Call:
## lm(formula = weight ~ volume + cover, data = books)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -110.10  -32.32  -16.10   28.93  210.95 
## 
## Coefficients:
##               Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  197.96284   59.19274   3.344 0.005841 ** 
## volume         0.71795    0.06153  11.669  6.6e-08 ***
## coverpb     -184.04727   40.49420  -4.545 0.000672 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 78.2 on 12 degrees of freedom
## Multiple R-squared:  0.9275, Adjusted R-squared:  0.9154 
## F-statistic: 76.73 on 2 and 12 DF,  p-value: 1.455e-07
m4$coefficients
##  (Intercept)       volume      coverpb 
##  197.9628436    0.7179537 -184.0472714

… and plot the new mlr model

# plot with shift of y-intercept
ggplot(data=books, aes(x=volume, y=weight, color=cover))+
  geom_point()+
  ggtitle("Scatterplot of Volume vs Weight of Books")+
  theme_bw()+
  geom_abline(slope=m4$coefficients[2], intercept = m4$coefficients[1], col=2)+
  geom_abline(slope=m4$coefficients[2], intercept = m4$coefficients[1]+m4$coefficients[3], col=4)

Step 4: Include interaction to shift intercept and change slope. Fit a new MLR

# Include interaction to shift intercept and change slope
m5<-lm(weight~volume*cover, data=books)
summary(m5)
## 
## Call:
## lm(formula = weight ~ volume * cover, data = books)
## 
## Residuals:
##    Min     1Q Median     3Q    Max 
## -89.67 -32.07 -21.82  17.94 215.91 
## 
## Coefficients:
##                  Estimate Std. Error t value Pr(>|t|)    
## (Intercept)     161.58654   86.51918   1.868   0.0887 .  
## volume            0.76159    0.09718   7.837 7.94e-06 ***
## coverpb        -120.21407  115.65899  -1.039   0.3209    
## volume:coverpb   -0.07573    0.12802  -0.592   0.5661    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 80.41 on 11 degrees of freedom
## Multiple R-squared:  0.9297, Adjusted R-squared:  0.9105 
## F-statistic:  48.5 on 3 and 11 DF,  p-value: 1.245e-06
m5$coefficients
##    (Intercept)         volume        coverpb volume:coverpb 
##   161.58654141     0.76159284  -120.21406570    -0.07573366

… and plot the new mlr model with the interaction

# plot with shift of y-intercept
ggplot(data=books, aes(x=volume, y=weight, color=cover))+
  geom_point()+
  ggtitle("Scatterplot of Volume vs Weight of Books")+
  theme_bw()+
  geom_abline(slope=m5$coefficients[2], intercept = m5$coefficients[1], col=2)+
  geom_abline(slope=m5$coefficients[2]+m5$coefficients[4], intercept = m5$coefficients[1]+m5$coefficients[3], col=4)

Example 2: NYC Zagat Ratings

Step 0: Load the data into R.

## Multiple linear regression 
nyc <- read.csv("http://andrewpbray.github.io/data/nyc.csv")

head(nyc)
##   Case          Restaurant Price Food Decor Service East
## 1    1 Daniella Ristorante    43   22    18      20    0
## 2    2  Tello's Ristorante    32   20    19      19    0
## 3    3          Biricchino    34   21    13      18    0
## 4    4             Bottino    41   20    20      17    0
## 5    5          Da Umberto    54   24    19      21    0
## 6    6            Le Madri    52   22    22      21    0

Step 1: Look at the data

nyc%>%
  select(Price, Food, Decor)%>%
  pairs()

Step 2: Create a multiple linear model

m1<-lm(Price~Food+Decor, data=nyc)
summary(m1)
## 
## Call:
## lm(formula = Price ~ Food + Decor, data = nyc)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -14.945  -3.766  -0.153   3.701  18.757 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept) -24.5002     4.7230  -5.187 6.19e-07 ***
## Food          1.6461     0.2615   6.294 2.68e-09 ***
## Decor         1.8820     0.1919   9.810  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 5.788 on 165 degrees of freedom
## Multiple R-squared:  0.6167, Adjusted R-squared:  0.6121 
## F-statistic: 132.7 on 2 and 165 DF,  p-value: < 2.2e-16

Step 3: ANOVA

anova(m1)
## Analysis of Variance Table
## 
## Response: Price
##            Df Sum Sq Mean Sq F value    Pr(>F)    
## Food        1 5670.3  5670.3 169.262 < 2.2e-16 ***
## Decor       1 3223.7  3223.7  96.228 < 2.2e-16 ***
## Residuals 165 5527.5    33.5                      
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
Explore Multiple Colinearlity with VIF

A way to quantify the impact of multicollinearity is to look at the VIF (variance inflation factor). A VIF close to 1 would imply no correlation. The larger the VIF the larger the amount of multicollinearity.

library(car)
vif(lm(Price~Food+Decor, data=nyc))
##     Food    Decor 
## 1.340359 1.340359

BONUS: SLR Start to Finish

Example: Climate Change and Fish Habitats

As the climate grows warmer, we expect many animal species to move toward the poles in an attempt to maintain their preferred temperature range.

Do data on fish in the North Sea confirm this suspicion?

The data are 25 years of mean winter temperatures at the bottom of the North Sea (degrees Celsius) and the center of the distribution of anglerfish (sometimes called monkfish) in degrees of north latitude.

Step 0: Load the data into R. Create a data frame.

# EXAMPLE: Climate Change and Fish Habitats
# Data on anglerfish distribution
# Explanatory: Temp in C (mean winter temperature at bottom of North Sea)
# Response: Latitude of center for distribution of anglerfish

year<-c(1977:2001)
temp<-c(6.26, 6.26, 6.27, 6.31, 6.34, 6.32, 6.37, 6.39, 6.42, 
        6.52, 6.68, 6.76, 6.78, 6.89, 6.90, 6.93, 6.98, 
        7.02, 7.09, 7.13, 7.15, 7.29, 7.34, 7.57, 7.65)
lat<-c(57.20, 57.96, 57.65, 57.59, 58.01, 59.06, 56.85, 56.87, 57.43,
       57.72, 57.83, 57.87, 57.48, 58.13, 58.52, 58.48, 57.89,
       58.71, 58.07, 58.49, 58.28, 58.49, 58.01, 58.57, 58.90)

# Make a dataframe
fish<-data.frame(year, temp, lat)

Step 1: Look at the data! Create a scatterplot!

library(tidyverse)
# Scatterplot of data looks linear
ggplot(data=fish, aes(temp, lat))+
  geom_point()+
  ggtitle("Scatterplot of Temp vs Fish Latitude")+
  xlab("Temperature (C)")+
  ylab("Fish Latitude")+
  theme_bw()

Step 2: Create a simple linear model

# Fit a simple linear model 
mod2<-lm(lat~temp)
summary(mod2)
## 
## Call:
## lm(formula = lat ~ temp)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -0.81309 -0.27207 -0.02401  0.20523  1.43781 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  52.4524     1.5324  34.229  < 2e-16 ***
## temp          0.8180     0.2254   3.629  0.00141 ** 
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.4734 on 23 degrees of freedom
## Multiple R-squared:  0.3641, Adjusted R-squared:  0.3364 
## F-statistic: 13.17 on 1 and 23 DF,  p-value: 0.001408
names(mod2)
##  [1] "coefficients"  "residuals"     "effects"       "rank"         
##  [5] "fitted.values" "assign"        "qr"            "df.residual"  
##  [9] "xlevels"       "call"          "terms"         "model"

… and add the fitted line to the scatterplot

# Scatterplot with fitted line
ggplot(data=fish, aes(temp, lat))+
  geom_point()+
  ggtitle("Scatterplot of Temp vs Fish Latitude")+
  xlab("Temperature (C)")+
  ylab("Fish Latitude")+
  theme_bw()+
  geom_abline(slope=mod2$coefficients[2], intercept=mod2$coefficients[1],
              color="blue", lty=2, lwd=1)

Step 3: Check the model conditions

  1. Are the data independent?
  2. Is there a linear relationship between x and y?
  3. Are the residuals normally distributed?
  4. Are the residuals centered around zero and do they have constant spread?
# QQ NORM Plot 
hist(mod2$residuals)

qqnorm(mod2$residuals)
qqline(mod2$residuals)

fish<-cbind(fish, 
            fit=mod2$fitted.values,
            residual=mod2$residuals)

ggplot(data=fish, aes(residual))+
  geom_histogram(bins=8)+
  ggtitle("Histogram of Residuals")+
  theme_bw()

# Residual Plot
ggplot(data=fish, aes(temp, residual))+
  geom_point()+
  ggtitle("Residual Plot")+
  xlab("Temperature (C)")+
  ylab("Residuals")+
  theme_bw()+
  geom_hline(yintercept = 0,
             color="blue", lty=2, lwd=1)