I am borrowing and adding to the example from our textbook (Introduction to Statistical Learning)
The MASS library contains the Boston data set, which records medv (median house value) for 506 neighborhoods around Boston. We will seek to predict medv using 13 predictors such as rm (average number of rooms per house), age (average age of houses), and lstat (percent of households with low socioeconomic status).
library(MASS)
## Warning: package 'MASS' was built under R version 3.4.4
data(Boston)
attach(Boston)
names(Boston)
## [1] "crim" "zn" "indus" "chas" "nox" "rm" "age"
## [8] "dis" "rad" "tax" "ptratio" "black" "lstat" "medv"
We want to visualize the relationship between lstat (percent of households with low socioeconomic status) and medv (median house value).
# in base r
plot(lstat, medv)
# in tidyverse
library(tidyverse)
## Warning: package 'tidyverse' was built under R version 3.4.2
## ── Attaching packages ──────────────────────── tidyverse 1.2.1 ──
## ✔ ggplot2 3.2.0 ✔ purrr 0.3.2
## ✔ tibble 2.1.1 ✔ dplyr 0.8.0.1
## ✔ tidyr 0.8.3 ✔ stringr 1.4.0
## ✔ readr 1.3.1 ✔ forcats 0.4.0
## Warning: package 'tibble' was built under R version 3.4.4
## Warning: package 'tidyr' was built under R version 3.4.4
## Warning: package 'readr' was built under R version 3.4.4
## Warning: package 'purrr' was built under R version 3.4.4
## Warning: package 'dplyr' was built under R version 3.4.4
## Warning: package 'stringr' was built under R version 3.4.4
## Warning: package 'forcats' was built under R version 3.4.4
## ── Conflicts ─────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
## ✖ dplyr::select() masks MASS::select()
ggplot(Boston, aes(x=lstat, y=medv))+
geom_point()+
theme_bw()
We are going to save the model output because we want to reference it later in the code. Also use the summary function to see what other information about the model is accessible.
mod<-lm(medv~lstat)
mod
##
## Call:
## lm(formula = medv ~ lstat)
##
## Coefficients:
## (Intercept) lstat
## 34.55 -0.95
summary(mod)
##
## Call:
## lm(formula = medv ~ lstat)
##
## Residuals:
## Min 1Q Median 3Q Max
## -15.168 -3.990 -1.318 2.034 24.500
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 34.55384 0.56263 61.41 <2e-16 ***
## lstat -0.95005 0.03873 -24.53 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 6.216 on 504 degrees of freedom
## Multiple R-squared: 0.5441, Adjusted R-squared: 0.5432
## F-statistic: 601.6 on 1 and 504 DF, p-value: < 2.2e-16
names(mod)
## [1] "coefficients" "residuals" "effects" "rank"
## [5] "fitted.values" "assign" "qr" "df.residual"
## [9] "xlevels" "call" "terms" "model"
# in tidyverse add a line
ggplot(Boston, aes(x=lstat, y=medv))+
geom_point()+
geom_abline(slope=mod$coefficients[2], intercept=mod$coefficients[1],
color="blue", lty=2, lwd=1)+
theme_bw()
# confidence intervals for coefficents
confint(mod)
## 2.5 % 97.5 %
## (Intercept) 33.448457 35.6592247
## lstat -1.026148 -0.8739505
# confidence intervals for a given value
newdata<-data.frame(lstat=c(5, 10, 15))
predict(mod, newdata,
interval="confidence")
## fit lwr upr
## 1 29.80359 29.00741 30.59978
## 2 25.05335 24.47413 25.63256
## 3 20.30310 19.73159 20.87461
# prediction interval
predict(mod, newdata,
interval="predict")
## fit lwr upr
## 1 29.80359 17.565675 42.04151
## 2 25.05335 12.827626 37.27907
## 3 20.30310 8.077742 32.52846
# add confidence and prediction bands to plot
confBand<-predict(mod, interval="confidence")
predBand<-predict(mod, interval="predict")
## Warning in predict.lm(mod, interval = "predict"): predictions on current data refer to _future_ responses
colnames(predBand)<-c("fit2", "lwr2", "upr2")
newDF<-cbind(Boston, confBand, predBand)
ggplot(newDF, aes(x=lstat, y=medv))+
geom_point(alpha=.3)+
geom_abline(slope=mod$coefficients[2], intercept=mod$coefficients[1],
color="blue", lty=2, lwd=1)+
geom_line(aes(y=lwr), color="green", lty=2, lwd=1)+
geom_line(aes(y=upr), color="green", lty=2, lwd=1)+
geom_line(aes(y=lwr2), color="red", lty=2, lwd=1)+
geom_line(aes(y=upr2), color="red", lty=2, lwd=1)+
theme_bw()
# ANOVA
anova(mod)
## Analysis of Variance Table
##
## Response: medv
## Df Sum Sq Mean Sq F value Pr(>F)
## lstat 1 23244 23243.9 601.62 < 2.2e-16 ***
## Residuals 504 19472 38.6
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1