Setup

library(MASS)
library(tidyverse)
## ── Attaching packages ────────────────────────────── tidyverse 1.2.1 ──
## ✔ ggplot2 3.1.0     ✔ purrr   0.2.5
## ✔ tibble  1.4.2     ✔ dplyr   0.7.8
## ✔ tidyr   0.8.2     ✔ stringr 1.3.1
## ✔ readr   1.1.1     ✔ forcats 0.3.0
## ── Conflicts ───────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
## ✖ dplyr::select() masks MASS::select()
library(caret)
## Loading required package: lattice
## 
## Attaching package: 'caret'
## The following object is masked from 'package:purrr':
## 
##     lift
library(ISLR)

Linear Model Tasks

  1. Split data into train and test subsets
  2. Identify target
  3. Select list(s) of predictors
  4. Run models and examine goodness of fit with train
  5. Measure goodness of fit with test.

Splitting

The following code splits the dataframe Auto from the ISLR package into a training set with 80% of the data and a testing set with the remaining 20%.

length = nrow(Auto)
set.seed(245)
indices = sample(1:length)
scrambled = Auto[indices,]
split = round(length*.8)
train = scrambled[1:split,]
test = scrambled[(split+1):length,]
head(Auto)
##   mpg cylinders displacement horsepower weight acceleration year origin
## 1  18         8          307        130   3504         12.0   70      1
## 2  15         8          350        165   3693         11.5   70      1
## 3  18         8          318        150   3436         11.0   70      1
## 4  16         8          304        150   3433         12.0   70      1
## 5  17         8          302        140   3449         10.5   70      1
## 6  15         8          429        198   4341         10.0   70      1
##                        name
## 1 chevrolet chevelle malibu
## 2         buick skylark 320
## 3        plymouth satellite
## 4             amc rebel sst
## 5               ford torino
## 6          ford galaxie 500
head(train)
##      mpg cylinders displacement horsepower weight acceleration year origin
## 369 27.0         4          112         88   2640         18.6   82      1
## 192 22.0         6          225        100   3233         15.4   76      1
## 93  13.0         8          351        158   4363         13.0   73      1
## 266 17.5         8          318        140   4080         13.7   78      1
## 10  15.0         8          390        190   3850          8.5   70      1
## 112 18.0         3           70         90   2124         13.5   73      3
##                         name
## 369 chevrolet cavalier wagon
## 192         plymouth valiant
## 93                  ford ltd
## 266          dodge magnum xe
## 10        amc ambassador dpl
## 112                maxda rx3

Target and Predictors

Use glimpse to see the variables in Auto.

glimpse(Auto)
## Observations: 392
## Variables: 9
## $ mpg          <dbl> 18, 15, 18, 16, 17, 15, 14, 14, 14, 15, 15, 14, 1...
## $ cylinders    <dbl> 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 4, 6, 6...
## $ displacement <dbl> 307, 350, 318, 304, 302, 429, 454, 440, 455, 390,...
## $ horsepower   <dbl> 130, 165, 150, 150, 140, 198, 220, 215, 225, 190,...
## $ weight       <dbl> 3504, 3693, 3436, 3433, 3449, 4341, 4354, 4312, 4...
## $ acceleration <dbl> 12.0, 11.5, 11.0, 12.0, 10.5, 10.0, 9.0, 8.5, 10....
## $ year         <dbl> 70, 70, 70, 70, 70, 70, 70, 70, 70, 70, 70, 70, 7...
## $ origin       <dbl> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 3, 1, 1...
## $ name         <fct> chevrolet chevelle malibu, buick skylark 320, ply...

We’ll use mpg as the target. There are numerous variables that might be useful. Try displacement first.

lm1 = lm(mpg~displacement,data=train)
summary(lm1)
## 
## Call:
## lm(formula = mpg ~ displacement, data = train)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -13.007  -3.045  -0.469   2.272  18.570 
## 
## Coefficients:
##              Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  35.22699    0.55892   63.03   <2e-16 ***
## displacement -0.06029    0.00256  -23.55   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 4.733 on 312 degrees of freedom
## Multiple R-squared:   0.64,  Adjusted R-squared:  0.6389 
## F-statistic: 554.8 on 1 and 312 DF,  p-value: < 2.2e-16

Try Horsepower.

lm2 = lm(mpg~horsepower,data=train)
summary(lm2)
## 
## Call:
## lm(formula = mpg ~ horsepower, data = train)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -13.7937  -3.3414  -0.4231   2.6630  16.6895 
## 
## Coefficients:
##              Estimate Std. Error t value Pr(>|t|)    
## (Intercept) 40.280166   0.781079   51.57   <2e-16 ***
## horsepower  -0.159534   0.007025  -22.71   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 4.843 on 312 degrees of freedom
## Multiple R-squared:  0.6231, Adjusted R-squared:  0.6219 
## F-statistic: 515.8 on 1 and 312 DF,  p-value: < 2.2e-16

Try adding year to each of the previous models.

lm3 = lm(mpg~displacement + year,data=train)
summary(lm3)
## 
## Call:
## lm(formula = mpg ~ displacement + year, data = train)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -10.2995  -2.6841  -0.3894   2.2320  14.7688 
## 
## Coefficients:
##               Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  -20.83537    5.21680  -3.994 8.12e-05 ***
## displacement  -0.05073    0.00236 -21.497  < 2e-16 ***
## year           0.71286    0.06606  10.792  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 4.043 on 311 degrees of freedom
## Multiple R-squared:  0.7381, Adjusted R-squared:  0.7364 
## F-statistic: 438.3 on 2 and 311 DF,  p-value: < 2.2e-16
lm4 = lm(mpg~horsepower + year,data=train)
summary(lm4)
## 
## Call:
## lm(formula = mpg ~ horsepower + year, data = train)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -12.2469  -2.9673  -0.4467   2.5250  15.2299 
## 
## Coefficients:
##               Estimate Std. Error t value Pr(>|t|)    
## (Intercept) -11.115439   5.934263  -1.873    0.062 .  
## horsepower   -0.132694   0.007017 -18.909   <2e-16 ***
## year          0.638884   0.073250   8.722   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 4.348 on 311 degrees of freedom
## Multiple R-squared:  0.6972, Adjusted R-squared:  0.6952 
## F-statistic:   358 on 2 and 311 DF,  p-value: < 2.2e-16

Check goodness of fit (RMSE) using test.

First create a function to calculate RMSE based on a predicted vector and an observed vector.

RMSE = function(predicted,observed){
  residual = observed - predicted
  return(sqrt(mean(residual^2)))
}
pred1 = predict(lm1, newdata = test)
pred2 = predict(lm2, newdata = test)
pred3 = predict(lm3, newdata = test)
pred4 = predict(lm4, newdata = test)
RMSE(pred1,test$mpg)
## [1] 4.224143
RMSE(pred2,test$mpg)
## [1] 5.164706
RMSE(pred3,test$mpg)
## [1] 3.737896
RMSE(pred4,test$mpg) 
## [1] 4.555032

Exercise

Use some different variables to predict mpg.

Exercise

Look at the Carseats data in ISLR. Build the best model you can for sales with no more than three variables. Assess the quality of your model using the performance of your model on a 20% test dataset.

Exercise

Follow the same procedure to build and test a few models for Wage in the ISLR package.