library(MASS)
library(tidyverse)
## ── Attaching packages ────────────────────────────── tidyverse 1.2.1 ──
## ✔ ggplot2 3.1.0 ✔ purrr 0.2.5
## ✔ tibble 1.4.2 ✔ dplyr 0.7.8
## ✔ tidyr 0.8.2 ✔ stringr 1.3.1
## ✔ readr 1.1.1 ✔ forcats 0.3.0
## ── Conflicts ───────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
## ✖ dplyr::select() masks MASS::select()
library(caret)
## Loading required package: lattice
##
## Attaching package: 'caret'
## The following object is masked from 'package:purrr':
##
## lift
library(ISLR)
The following code splits the dataframe Auto from the ISLR package into a training set with 80% of the data and a testing set with the remaining 20%.
length = nrow(Auto)
set.seed(245)
indices = sample(1:length)
scrambled = Auto[indices,]
split = round(length*.8)
train = scrambled[1:split,]
test = scrambled[(split+1):length,]
head(Auto)
## mpg cylinders displacement horsepower weight acceleration year origin
## 1 18 8 307 130 3504 12.0 70 1
## 2 15 8 350 165 3693 11.5 70 1
## 3 18 8 318 150 3436 11.0 70 1
## 4 16 8 304 150 3433 12.0 70 1
## 5 17 8 302 140 3449 10.5 70 1
## 6 15 8 429 198 4341 10.0 70 1
## name
## 1 chevrolet chevelle malibu
## 2 buick skylark 320
## 3 plymouth satellite
## 4 amc rebel sst
## 5 ford torino
## 6 ford galaxie 500
head(train)
## mpg cylinders displacement horsepower weight acceleration year origin
## 369 27.0 4 112 88 2640 18.6 82 1
## 192 22.0 6 225 100 3233 15.4 76 1
## 93 13.0 8 351 158 4363 13.0 73 1
## 266 17.5 8 318 140 4080 13.7 78 1
## 10 15.0 8 390 190 3850 8.5 70 1
## 112 18.0 3 70 90 2124 13.5 73 3
## name
## 369 chevrolet cavalier wagon
## 192 plymouth valiant
## 93 ford ltd
## 266 dodge magnum xe
## 10 amc ambassador dpl
## 112 maxda rx3
Use glimpse to see the variables in Auto.
glimpse(Auto)
## Observations: 392
## Variables: 9
## $ mpg <dbl> 18, 15, 18, 16, 17, 15, 14, 14, 14, 15, 15, 14, 1...
## $ cylinders <dbl> 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 4, 6, 6...
## $ displacement <dbl> 307, 350, 318, 304, 302, 429, 454, 440, 455, 390,...
## $ horsepower <dbl> 130, 165, 150, 150, 140, 198, 220, 215, 225, 190,...
## $ weight <dbl> 3504, 3693, 3436, 3433, 3449, 4341, 4354, 4312, 4...
## $ acceleration <dbl> 12.0, 11.5, 11.0, 12.0, 10.5, 10.0, 9.0, 8.5, 10....
## $ year <dbl> 70, 70, 70, 70, 70, 70, 70, 70, 70, 70, 70, 70, 7...
## $ origin <dbl> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 3, 1, 1...
## $ name <fct> chevrolet chevelle malibu, buick skylark 320, ply...
We’ll use mpg as the target. There are numerous variables that might be useful. Try displacement first.
lm1 = lm(mpg~displacement,data=train)
summary(lm1)
##
## Call:
## lm(formula = mpg ~ displacement, data = train)
##
## Residuals:
## Min 1Q Median 3Q Max
## -13.007 -3.045 -0.469 2.272 18.570
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 35.22699 0.55892 63.03 <2e-16 ***
## displacement -0.06029 0.00256 -23.55 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 4.733 on 312 degrees of freedom
## Multiple R-squared: 0.64, Adjusted R-squared: 0.6389
## F-statistic: 554.8 on 1 and 312 DF, p-value: < 2.2e-16
Try Horsepower.
lm2 = lm(mpg~horsepower,data=train)
summary(lm2)
##
## Call:
## lm(formula = mpg ~ horsepower, data = train)
##
## Residuals:
## Min 1Q Median 3Q Max
## -13.7937 -3.3414 -0.4231 2.6630 16.6895
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 40.280166 0.781079 51.57 <2e-16 ***
## horsepower -0.159534 0.007025 -22.71 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 4.843 on 312 degrees of freedom
## Multiple R-squared: 0.6231, Adjusted R-squared: 0.6219
## F-statistic: 515.8 on 1 and 312 DF, p-value: < 2.2e-16
Try adding year to each of the previous models.
lm3 = lm(mpg~displacement + year,data=train)
summary(lm3)
##
## Call:
## lm(formula = mpg ~ displacement + year, data = train)
##
## Residuals:
## Min 1Q Median 3Q Max
## -10.2995 -2.6841 -0.3894 2.2320 14.7688
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -20.83537 5.21680 -3.994 8.12e-05 ***
## displacement -0.05073 0.00236 -21.497 < 2e-16 ***
## year 0.71286 0.06606 10.792 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 4.043 on 311 degrees of freedom
## Multiple R-squared: 0.7381, Adjusted R-squared: 0.7364
## F-statistic: 438.3 on 2 and 311 DF, p-value: < 2.2e-16
lm4 = lm(mpg~horsepower + year,data=train)
summary(lm4)
##
## Call:
## lm(formula = mpg ~ horsepower + year, data = train)
##
## Residuals:
## Min 1Q Median 3Q Max
## -12.2469 -2.9673 -0.4467 2.5250 15.2299
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -11.115439 5.934263 -1.873 0.062 .
## horsepower -0.132694 0.007017 -18.909 <2e-16 ***
## year 0.638884 0.073250 8.722 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 4.348 on 311 degrees of freedom
## Multiple R-squared: 0.6972, Adjusted R-squared: 0.6952
## F-statistic: 358 on 2 and 311 DF, p-value: < 2.2e-16
First create a function to calculate RMSE based on a predicted vector and an observed vector.
RMSE = function(predicted,observed){
residual = observed - predicted
return(sqrt(mean(residual^2)))
}
pred1 = predict(lm1, newdata = test)
pred2 = predict(lm2, newdata = test)
pred3 = predict(lm3, newdata = test)
pred4 = predict(lm4, newdata = test)
RMSE(pred1,test$mpg)
## [1] 4.224143
RMSE(pred2,test$mpg)
## [1] 5.164706
RMSE(pred3,test$mpg)
## [1] 3.737896
RMSE(pred4,test$mpg)
## [1] 4.555032
Use some different variables to predict mpg.
Look at the Carseats data in ISLR. Build the best model you can for sales with no more than three variables. Assess the quality of your model using the performance of your model on a 20% test dataset.
Follow the same procedure to build and test a few models for Wage in the ISLR package.