# install the mlba package from GitHub if it is not yet available
if (!require(mlba)) {
  library(devtools)
  install_github("gedeck/mlba/mlba", force=TRUE)
}
options(scipen=999, digits = 3)

Estimating the Regression Equation and Prediction

Example: Predicting the Price of Used Toyota Corolla Cars

library(caret)
car.df <- mlba::ToyotaCorolla
# select variables for regression
outcome <- "Price"
predictors <- c("Age_08_04", "KM", "Fuel_Type", "HP", "Met_Color", "Automatic",
                "CC", "Doors", "Quarterly_Tax", "Weight")
# reduce data set to first 1000 rows and selected variables
car.df <- car.df[1:1000, c(outcome, predictors)]

# partition data
set.seed(1)  # set seed for reproducing the partition
idx <- createDataPartition(car.df$Price, p=0.6, list=FALSE)
train.df <- car.df[idx, ]
holdout.df <- car.df[-idx, ]
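
A quick sanity check of the partition (an optional addition, using only the data frames created above) confirms that roughly 60% of the rows landed in the training set; createDataPartition() samples within quantiles of the outcome, so both partitions cover the full price range.

# optional: verify the 60%/40% split
dim(train.df)
dim(holdout.df)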

# use lm() to run a linear regression of Price on all 10 predictors in the
# training set; the . after ~ includes all remaining columns of train.df as
# predictors (the categorical Fuel_Type expands into two dummy variables).
car.lm <- lm(Price ~ ., data = train.df)
# use options() to ensure numbers are not displayed in scientific notation.
options(scipen = 999)
summary(car.lm)
#> 
#> Call:
#> lm(formula = Price ~ ., data = train.df)
#> 
#> Residuals:
#>    Min     1Q Median     3Q    Max 
#>  -9047   -831     -6    832   6057 
#> 
#> Coefficients:
#>                    Estimate  Std. Error t value             Pr(>|t|)    
#> (Intercept)     -3725.59270  1913.92374   -1.95              0.05206 .  
#> Age_08_04        -133.98649     4.92047  -27.23 < 0.0000000000000002 ***
#> KM                 -0.01741     0.00231   -7.53  0.00000000000019238 ***
#> Fuel_TypeDiesel  1179.18603   724.71141    1.63              0.10425    
#> Fuel_TypePetrol  2173.64897   729.55378    2.98              0.00301 ** 
#> HP                 36.34253     4.75838    7.64  0.00000000000008997 ***
#> Met_Color          -7.60255   119.54320   -0.06              0.94931    
#> Automatic         276.55860   267.85985    1.03              0.30227    
#> CC                  0.01517     0.09440    0.16              0.87236    
#> Doors               2.28016    62.30556    0.04              0.97082    
#> Quarterly_Tax       9.64453     2.60048    3.71              0.00023 ***
#> Weight             15.25566     1.81726    8.39  0.00000000000000035 ***
#> ---
#> Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
#> 
#> Residual standard error: 1340 on 589 degrees of freedom
#> Multiple R-squared:  0.869,  Adjusted R-squared:  0.867 
#> F-statistic:  356 on 11 and 589 DF,  p-value: <0.0000000000000002
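
Beyond the printed summary, the fitted coefficients and their confidence intervals can be extracted programmatically. This is an optional sketch using base R's coef() and confint(); it is not part of the original example.

# extract coefficient estimates and 95% confidence intervals (optional)
coef(car.lm)
confint(car.lm, level=0.95)
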
# use predict() to make predictions on the holdout set.
pred <- predict(car.lm, holdout.df)

options(scipen=999, digits=1)
data.frame(
  'Predicted' = pred[1:20],
  'Actual' = holdout.df$Price[1:20],
  'Residual' = holdout.df$Price[1:20] - pred[1:20]
)
#>    Predicted Actual Residual
#> 1      16652  13500    -3152
#> 14     19941  21500     1559
#> 15     19613  22500     2887
#> 16     20424  22000     1576
#> 18     16553  17950     1397
#> 19     15247  16750     1503
#> 20     15006  16950     1944
#> 21     14949  15950     1001
#> 22     16917  16950       33
#> 24     16063  16950      887
#> 25     16040  16250      210
#> 26     16530  15950     -580
#> 30     16163  17950     1787
#> 32     16034  15750     -284
#> 36     15370  15750      380
#> 37     15817  15950      133
#> 39     14866  15750      884
#> 40     15506  14750     -756
#> 41     15800  13950    -1850
#> 45     17475  16950     -525
options(scipen=999, digits = 3)

# calculate performance metrics
rbind(
  Training=mlba::regressionSummary(predict(car.lm, train.df), train.df$Price),
  Holdout=mlba::regressionSummary(pred, holdout.df$Price)
)
#>          ME              RMSE MAE 
#> Training 0.0000000000842 1329 1009
#> Holdout  1.7             1423 1054
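
For reference, the numbers reported by mlba::regressionSummary() can be reproduced by hand. The sketch below is an added illustration, not part of the original example; it computes the mean error (ME), RMSE, and MAE directly from the holdout residuals.

# reproduce ME, RMSE, and MAE from the holdout residuals (illustrative)
err <- holdout.df$Price - pred
c(ME=mean(err), RMSE=sqrt(mean(err^2)), MAE=mean(abs(err)))
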
library(ggplot2)
pred <- predict(car.lm, holdout.df)
all.residuals <- holdout.df$Price - pred

ggplot() +
    geom_histogram(aes(x=all.residuals), fill="lightgray", color="grey") +
    labs(x="Residuals", y="Frequency")

g <- ggplot() +
         geom_histogram(aes(x=all.residuals), fill="lightgray", color="grey") +
         labs(x="Residuals", y="Frequency") +
         theme_bw()
ggsave(file=file.path(getwd(), "figures", "chapter_06", "residuals-histogram.pdf"),
       g, width=5, height=3, units="in")

Cross-validation and caret

set.seed(1)
library(caret)
# define 5-fold cross-validation
trControl <- caret::trainControl(method="cv", number=5, allowParallel=TRUE)
model <- caret::train(Price ~ ., data=car.df,
                      method="lm",  # specify the model
                      trControl=trControl)
model
#> Linear Regression 
#> 
#> 1000 samples
#>   10 predictor
#> 
#> No pre-processing
#> Resampling: Cross-Validated (5 fold) 
#> Summary of sample sizes: 799, 801, 800, 799, 801 
#> Resampling results:
#> 
#>   RMSE  Rsquared  MAE 
#>   1949  0.753     1086
#> 
#> Tuning parameter 'intercept' was held constant at a value of TRUE
coef(model$finalModel)
#>     (Intercept)       Age_08_04              KM Fuel_TypeDiesel Fuel_TypePetrol 
#>      -6419.2947       -132.4105         -0.0187        654.4718       2536.4747 
#>              HP       Met_Color       Automatic              CC           Doors 
#>         35.0697         58.8612        239.5717         -0.0210        -69.0173 
#>   Quarterly_Tax          Weight 
#>         15.9124         17.4002
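
The cross-validated RMSE, R-squared, and MAE above are averages over the five folds. The train object also retains the per-fold results in its resample field; inspecting them (an optional step) shows how much performance varies from fold to fold.

# per-fold performance (optional); each row is one of the 5 CV folds
model$resample
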
library(tidyverse)
# collect cross-validation, training, and holdout metrics for a caret model;
# if nPredictors is not supplied, derive it from the number of coefficients
collectMetrics <- function(model, train.df, holdout.df, nPredictors) {
  if (missing(nPredictors)) {
    coefs <- coef(model$finalModel)
    nPredictors <- length(coefs) - 1  # exclude the intercept
  }
  return(cbind(
    CV=model$results %>% slice_min(RMSE) %>% dplyr::select(c(RMSE, MAE)),
    Training=mlba::regressionSummary(predict(model, train.df), train.df$Price),
    Holdout=mlba::regressionSummary(predict(model, holdout.df), holdout.df$Price),
    nPredictors=nPredictors
  ))
}

metric.full <- collectMetrics(model, train.df, holdout.df)
predict(model, car.df[1:3,])
#>     1     2     3 
#> 16892 16408 16858

Variable Selection in Linear Regression

How to Reduce the Number of Predictors

Regularization (Shrinkage Models)

set.seed(1)
library(caret)
trControl <- caret::trainControl(method='cv', number=5, allowParallel=TRUE)
# alpha=0 selects ridge regression; tune lambda on a log-spaced grid
tuneGrid <- expand.grid(lambda=10^seq(5, 2, by=-0.1), alpha=0)
model <- caret::train(Price ~ ., data=train.df,
                 method='glmnet',
                 family='gaussian',  # set the family for linear regression
                 trControl=trControl,
                 tuneGrid=tuneGrid)
model$bestTune
#>    alpha lambda
#> 11     0   1000
coef(model$finalModel, s=model$bestTune$lambda)
#> 12 x 1 sparse Matrix of class "dgCMatrix"
#>                         s1
#> (Intercept)     -2658.9954
#> Age_08_04        -100.9073
#> KM                 -0.0223
#> Fuel_TypeDiesel    -8.4115
#> Fuel_TypePetrol   266.9359
#> HP                 33.9810
#> Met_Color          97.2147
#> Automatic         195.7619
#> CC                  0.0726
#> Doors              99.6551
#> Quarterly_Tax       6.5570
#> Weight             14.7082
metric.ridge <- collectMetrics(model, train.df, holdout.df,
  length(coef(model$finalModel, s=model$bestTune$lambda)) - 1)
ridge.model <- model
rbind(
  Training=mlba::regressionSummary(predict(model, train.df), train.df$Price),
  Holdout=mlba::regressionSummary(predict(model, holdout.df), holdout.df$Price)
)
#>          ME              RMSE MAE 
#> Training 0.0000000000181 1423 1061
#> Holdout  6.63            1590 1162
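
caret's method='glmnet' is a wrapper around the glmnet package. For readers who prefer to call it directly, the sketch below fits an approximately equivalent ridge model; model.matrix() builds the dummy coding that caret otherwise handles, and the coefficients may differ slightly because glmnet standardizes internally and fits its own lambda path.

# optional: an approximately equivalent ridge fit using glmnet directly
library(glmnet)
X <- model.matrix(Price ~ ., data=train.df)[, -1]  # drop the intercept column
y <- train.df$Price
ridge.direct <- glmnet(X, y, alpha=0)
coef(ridge.direct, s=model$bestTune$lambda)
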
set.seed(1)
# alpha=1 selects the lasso; tune lambda on a log-spaced grid
tuneGrid <- expand.grid(lambda=10^seq(4, 0, by=-0.1), alpha=1)
model <- caret::train(Price ~ ., data=train.df,
                 method='glmnet',
                 family='gaussian',  # set the family for linear regression
                 trControl=trControl,
                 tuneGrid=tuneGrid)
model$bestTune
#>    alpha lambda
#> 23     1    158
coef(model$finalModel, s=model$bestTune$lambda)
#> 12 x 1 sparse Matrix of class "dgCMatrix"
#>                       s1
#> (Intercept)      368.168
#> Age_08_04       -132.917
#> KM                -0.015
#> Fuel_TypeDiesel    .    
#> Fuel_TypePetrol    .    
#> HP                30.759
#> Met_Color          .    
#> Automatic          .    
#> CC                 .    
#> Doors              .    
#> Quarterly_Tax      .    
#> Weight            14.536
lasso.model <- model
metric.lasso <- collectMetrics(lasso.model, train.df, holdout.df,
  sum(coef(lasso.model$finalModel, s=lasso.model$bestTune$lambda) != 0) - 1)
rbind(
  Training=mlba::regressionSummary(predict(model, train.df), train.df$Price),
  Holdout=mlba::regressionSummary(predict(model, holdout.df), holdout.df$Price)
)
#>          ME              RMSE MAE 
#> Training 0.0000000000124 1372 1026
#> Holdout  -24             1557 1112
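
Because the lasso shrinks some coefficients exactly to zero, the selected predictors can be listed programmatically. The short sketch below is an addition that uses only objects defined above; it filters the coefficient matrix for nonzero entries.

# list the predictors the lasso retained (nonzero coefficients)
beta <- as.matrix(coef(lasso.model$finalModel, s=lasso.model$bestTune$lambda))
setdiff(rownames(beta)[beta[, 1] != 0], "(Intercept)")
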
library(tidyverse)
library(gridExtra)
g1 <- ggplot(ridge.model$results, aes(x=lambda, y=RMSE)) +
    geom_pointrange(aes(ymin=RMSE-RMSESD, ymax=RMSE+RMSESD), color='grey') +
    geom_line() +
    geom_point(data=ridge.model$results %>% subset(RMSE == min(RMSE)), color='red') +
    labs(x=expression(paste('Ridge parameter ', lambda)),
         y='RMSE (cross-validation)') +
    scale_x_log10()
g2 <- ggplot(lasso.model$results, aes(x=lambda, y=RMSE)) +
    geom_pointrange(aes(ymin=RMSE-RMSESD, ymax=RMSE+RMSESD), color='grey') +
    geom_line() +
    geom_point(data=lasso.model$results %>% subset(RMSE == min(RMSE)), color='red') +
    labs(x=expression(paste('Lasso parameter ', lambda)),
         y='RMSE (cross-validation)') +
    scale_x_log10()
grid.arrange(g1, g2, ncol=2)

g <- arrangeGrob(g1 + theme_bw(), g2 + theme_bw(), ncol=2)
ggsave(file=file.path(getwd(), 'figures', 'chapter_06', 'shrinkage-parameter-tuning.pdf'),
       g, width=6, height=2.5, units='in')
# combine the metrics collected for all models; metric.exhaustive and
# metric.stepwise come from the exhaustive search and stepwise regression
# code (not shown in this section)
data.frame(rbind(
  'full' = metric.full,
  'exhaustive' = metric.exhaustive,
  'stepwise' = metric.stepwise,
  'ridge' = metric.ridge,
  'lasso' = metric.lasso
))
#>            CV.RMSE CV.MAE     Training.ME Training.RMSE Training.MAE Holdout.ME
#> full          1949   1086 0.7284815029209          1344         1009      -1.10
#> exhaustive    1379   1031 0.9735056879862          1343         1012      -1.47
#> stepwise      1379   1031 0.9735056879862          1343         1012      -1.47
#> ridge         1507   1092 0.0000000000181          1423         1061       6.63
#> lasso         1396   1039 0.0000000000124          1372         1026     -23.96
#>            Holdout.RMSE Holdout.MAE nPredictors
#> full               1366        1030          11
#> exhaustive         1373        1034           7
#> stepwise           1373        1034           7
#> ridge              1590        1162          11
#> lasso              1557        1112           4