Class 5B

Fitting Simple Linear Regression Models in R

I am borrowing and adding to the example from our textbook (Introduction to Statistical Learning)

Housing in Boston

The MASS library contains the Boston data set, which records medv (median house value) for 506 neighborhoods around Boston. We will seek to predict medv using 13 predictors such as rm (average number of rooms per house), age (average age of houses), and lstat (percent of households with low socioeconomic status).

library(MASS)

## Warning: package 'MASS' was built under R version 3.4.4

data(Boston)
attach(Boston)

names(Boston)

##  [1] "crim"    "zn"      "indus"   "chas"    "nox"     "rm"      "age"    
##  [8] "dis"     "rad"     "tax"     "ptratio" "black"   "lstat"   "medv"

Create plots

We want to visualize the relationship between lstat (percent of households with low socioeconomic status) and medv (median house value).

# in base r
plot(lstat, medv)

# in tidyverse
library(tidyverse)

## Warning: package 'tidyverse' was built under R version 3.4.2

## ── Attaching packages ──────────────────────── tidyverse 1.2.1 ──

## ✔ ggplot2 3.2.0       ✔ purrr   0.3.2  
## ✔ tibble  2.1.1       ✔ dplyr   0.8.0.1
## ✔ tidyr   0.8.3       ✔ stringr 1.4.0  
## ✔ readr   1.3.1       ✔ forcats 0.4.0

## Warning: package 'tibble' was built under R version 3.4.4

## Warning: package 'tidyr' was built under R version 3.4.4

## Warning: package 'readr' was built under R version 3.4.4

## Warning: package 'purrr' was built under R version 3.4.4

## Warning: package 'dplyr' was built under R version 3.4.4

## Warning: package 'stringr' was built under R version 3.4.4

## Warning: package 'forcats' was built under R version 3.4.4

## ── Conflicts ─────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
## ✖ dplyr::select() masks MASS::select()

ggplot(Boston, aes(x=lstat, y=medv))+
  geom_point()+
  theme_bw()

Fit an SLR

We are going to save the model output because we want to reference it later in the code. Also use the summary function to see what other information about the model is accessible.

mod<-lm(medv~lstat)
mod

## 
## Call:
## lm(formula = medv ~ lstat)
## 
## Coefficients:
## (Intercept)        lstat  
##       34.55        -0.95

summary(mod)

## 
## Call:
## lm(formula = medv ~ lstat)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -15.168  -3.990  -1.318   2.034  24.500 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept) 34.55384    0.56263   61.41   <2e-16 ***
## lstat       -0.95005    0.03873  -24.53   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 6.216 on 504 degrees of freedom
## Multiple R-squared:  0.5441, Adjusted R-squared:  0.5432 
## F-statistic: 601.6 on 1 and 504 DF,  p-value: < 2.2e-16

names(mod)

##  [1] "coefficients"  "residuals"     "effects"       "rank"         
##  [5] "fitted.values" "assign"        "qr"            "df.residual"  
##  [9] "xlevels"       "call"          "terms"         "model"

Adding a line to our scatter plot

# in tidyverse add a line
ggplot(Boston, aes(x=lstat, y=medv))+
  geom_point()+
  geom_abline(slope=mod$coefficients[2], intercept=mod$coefficients[1],
              color="blue", lty=2, lwd=1)+
  theme_bw()

Confidence intervals for the parameters

# confidence intervals for coefficents
confint(mod)

##                 2.5 %     97.5 %
## (Intercept) 33.448457 35.6592247
## lstat       -1.026148 -0.8739505

Confidence and Prediction intervals for a given value

# confidence intervals for a given value
newdata<-data.frame(lstat=c(5, 10, 15))

predict(mod, newdata, 
        interval="confidence")

##        fit      lwr      upr
## 1 29.80359 29.00741 30.59978
## 2 25.05335 24.47413 25.63256
## 3 20.30310 19.73159 20.87461

# prediction interval 
predict(mod, newdata, 
        interval="predict")

##        fit       lwr      upr
## 1 29.80359 17.565675 42.04151
## 2 25.05335 12.827626 37.27907
## 3 20.30310  8.077742 32.52846

Adding Confidence and Prediction Bands

# add confidence and prediction bands to plot
confBand<-predict(mod, interval="confidence")
predBand<-predict(mod, interval="predict")

## Warning in predict.lm(mod, interval = "predict"): predictions on current data refer to _future_ responses

colnames(predBand)<-c("fit2", "lwr2", "upr2")


newDF<-cbind(Boston, confBand, predBand)

ggplot(newDF, aes(x=lstat, y=medv))+
  geom_point(alpha=.3)+
  geom_abline(slope=mod$coefficients[2], intercept=mod$coefficients[1],
              color="blue", lty=2, lwd=1)+
  geom_line(aes(y=lwr), color="green", lty=2, lwd=1)+
  geom_line(aes(y=upr), color="green", lty=2, lwd=1)+
  geom_line(aes(y=lwr2), color="red", lty=2, lwd=1)+
  geom_line(aes(y=upr2), color="red", lty=2, lwd=1)+
  theme_bw()

ANOVA

# ANOVA
anova(mod)

## Analysis of Variance Table
## 
## Response: medv
##            Df Sum Sq Mean Sq F value    Pr(>F)    
## lstat       1  23244 23243.9  601.62 < 2.2e-16 ***
## Residuals 504  19472    38.6                      
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1