if (!require(mlba)) {
library(devtools)
install_github("gedeck/mlba/mlba", force=TRUE)
}
options(scipen=999, digits = 3)
library(caret)
car.df <- mlba::ToyotaCorolla
# select variables for regression
outcome <- "Price"
predictors <- c("Age_08_04", "KM", "Fuel_Type", "HP", "Met_Color", "Automatic",
"CC", "Doors", "Quarterly_Tax", "Weight")
# reduce data set to first 1000 rows and selected variables
car.df <- car.df[1:1000, c(outcome, predictors)]
# partition data
set.seed(1) # set seed for reproducing the partition
idx <- createDataPartition(car.df$Price, p=0.6, list=FALSE)
train.df <- car.df[idx, ]
holdout.df <- car.df[-idx, ]
# use lm() to run a linear regression of Price on all 11 predictors in the
# training set.
# use . after ~ to include all the remaining columns in train.df as predictors.
car.lm <- lm(Price ~ ., data = train.df)
# use options() to ensure numbers are not displayed in scientific notation.
options(scipen = 999)
summary(car.lm)
#>
#> Call:
#> lm(formula = Price ~ ., data = train.df)
#>
#> Residuals:
#> Min 1Q Median 3Q Max
#> -9047 -831 -6 832 6057
#>
#> Coefficients:
#> Estimate Std. Error t value Pr(>|t|)
#> (Intercept) -3725.59270 1913.92374 -1.95 0.05206 .
#> Age_08_04 -133.98649 4.92047 -27.23 < 0.0000000000000002 ***
#> KM -0.01741 0.00231 -7.53 0.00000000000019238 ***
#> Fuel_TypeDiesel 1179.18603 724.71141 1.63 0.10425
#> Fuel_TypePetrol 2173.64897 729.55378 2.98 0.00301 **
#> HP 36.34253 4.75838 7.64 0.00000000000008997 ***
#> Met_Color -7.60255 119.54320 -0.06 0.94931
#> Automatic 276.55860 267.85985 1.03 0.30227
#> CC 0.01517 0.09440 0.16 0.87236
#> Doors 2.28016 62.30556 0.04 0.97082
#> Quarterly_Tax 9.64453 2.60048 3.71 0.00023 ***
#> Weight 15.25566 1.81726 8.39 0.00000000000000035 ***
#> ---
#> Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
#>
#> Residual standard error: 1340 on 589 degrees of freedom
#> Multiple R-squared: 0.869, Adjusted R-squared: 0.867
#> F-statistic: 356 on 11 and 589 DF, p-value: <0.0000000000000002
# use predict() to make predictions on a new set.
pred <- predict(car.lm, holdout.df)
options(scipen=999, digits=1)
data.frame(
'Predicted' = pred[1:20],
'Actual' = holdout.df$Price[1:20],
'Residual' = holdout.df$Price[1:20] - pred[1:20]
)
#> Predicted Actual Residual
#> 1 16652 13500 -3152
#> 14 19941 21500 1559
#> 15 19613 22500 2887
#> 16 20424 22000 1576
#> 18 16553 17950 1397
#> 19 15247 16750 1503
#> 20 15006 16950 1944
#> 21 14949 15950 1001
#> 22 16917 16950 33
#> 24 16063 16950 887
#> 25 16040 16250 210
#> 26 16530 15950 -580
#> 30 16163 17950 1787
#> 32 16034 15750 -284
#> 36 15370 15750 380
#> 37 15817 15950 133
#> 39 14866 15750 884
#> 40 15506 14750 -756
#> 41 15800 13950 -1850
#> 45 17475 16950 -525
options(scipen=999, digits = 3)
# calculate performance metrics
rbind(
Training=mlba::regressionSummary(predict(car.lm, train.df), train.df$Price),
Holdout=mlba::regressionSummary(pred, holdout.df$Price)
)
#> ME RMSE MAE
#> Training 0.0000000000842 1329 1009
#> Holdout 1.7 1423 1054
library(ggplot2)
pred <- predict(car.lm, holdout.df)
all.residuals <- holdout.df$Price - pred
ggplot() +
geom_histogram(aes(x=all.residuals), fill="lightgray", color="grey") +
labs(x="Residuals", y="Frequency")
g <- ggplot() +
geom_histogram(aes(x=all.residuals), fill="lightgray", color="grey") +
labs(x="Residuals", y="Frequency") +
theme_bw()
ggsave(file=file.path(getwd(), "figures", "chapter_06", "residuals-histogram.pdf"),
g, width=5, height=3, units="in")
set.seed(1)
library(caret)
# define 5-fold
trControl <- caret::trainControl(method="cv", number=5, allowParallel=TRUE)
model <- caret::train(Price ~ ., data=car.df,
method="lm", # specify the model
trControl=trControl)
model
#> Linear Regression
#>
#> 1000 samples
#> 10 predictor
#>
#> No pre-processing
#> Resampling: Cross-Validated (5 fold)
#> Summary of sample sizes: 799, 801, 800, 799, 801
#> Resampling results:
#>
#> RMSE Rsquared MAE
#> 1949 0.753 1086
#>
#> Tuning parameter 'intercept' was held constant at a value of TRUE
coef(model$finalModel)
#> (Intercept) Age_08_04 KM Fuel_TypeDiesel Fuel_TypePetrol
#> -6419.2947 -132.4105 -0.0187 654.4718 2536.4747
#> HP Met_Color Automatic CC Doors
#> 35.0697 58.8612 239.5717 -0.0210 -69.0173
#> Quarterly_Tax Weight
#> 15.9124 17.4002
library(tidyverse)
collectMetrics <- function(model, train.df, holdout.df, nPredictors) {
if (missing(nPredictors)) {
coefs = coef(model$finalModel)
nPredictors = length(coefs) - 1
}
return (cbind(
CV=model$results %>% slice_min(RMSE) %>% dplyr::select(c(RMSE, MAE)),
Training=mlba::regressionSummary(predict(model, train.df), train.df$Price),
Holdout=mlba::regressionSummary(predict(model, holdout.df), holdout.df$Price),
nPredictors=nPredictors
))
}
metric.full <- collectMetrics(model, train.df, holdout.df)
predict(model, car.df[1:3,])
#> 1 2 3
#> 16892 16408 16858
# use regsubsets() in package leaps to run an exhaustive search.
# unlike with lm, categorical predictors must be turned into dummies manually.
library(leaps)
library(fastDummies)
# create dummies for fuel type
leaps.train.df <- dummy_cols(train.df, remove_first_dummy=TRUE,
remove_selected_columns=TRUE)
search <- regsubsets(Price ~ ., data=leaps.train.df, nbest=1,
nvmax=ncol(leaps.train.df), method="exhaustive")
sum <- summary(search)
# show models
sum$which
#> (Intercept) Age_08_04 KM HP Met_Color Automatic CC Doors
#> 1 TRUE TRUE FALSE FALSE FALSE FALSE FALSE FALSE
#> 2 TRUE TRUE FALSE TRUE FALSE FALSE FALSE FALSE
#> 3 TRUE TRUE FALSE TRUE FALSE FALSE FALSE FALSE
#> 4 TRUE TRUE TRUE TRUE FALSE FALSE FALSE FALSE
#> 5 TRUE TRUE TRUE TRUE FALSE FALSE FALSE FALSE
#> 6 TRUE TRUE TRUE TRUE FALSE FALSE FALSE FALSE
#> 7 TRUE TRUE TRUE TRUE FALSE FALSE FALSE FALSE
#> 8 TRUE TRUE TRUE TRUE FALSE TRUE FALSE FALSE
#> 9 TRUE TRUE TRUE TRUE FALSE TRUE TRUE FALSE
#> 10 TRUE TRUE TRUE TRUE TRUE TRUE TRUE FALSE
#> 11 TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE
#> Quarterly_Tax Weight Fuel_Type_Diesel Fuel_Type_Petrol
#> 1 FALSE FALSE FALSE FALSE
#> 2 FALSE FALSE FALSE FALSE
#> 3 FALSE TRUE FALSE FALSE
#> 4 FALSE TRUE FALSE FALSE
#> 5 TRUE TRUE FALSE FALSE
#> 6 TRUE TRUE FALSE TRUE
#> 7 TRUE TRUE TRUE TRUE
#> 8 TRUE TRUE TRUE TRUE
#> 9 TRUE TRUE TRUE TRUE
#> 10 TRUE TRUE TRUE TRUE
#> 11 TRUE TRUE TRUE TRUE
# show metrics
sum$rsq
#> [1] 0.773 0.816 0.848 0.865 0.866 0.868 0.869 0.869 0.869 0.869 0.869
sum$adjr2
#> [1] 0.773 0.815 0.847 0.864 0.865 0.867 0.867 0.867 0.867 0.867 0.867
sum$cp
#> [1] 422.90 234.33 92.94 17.09 14.05 5.73 5.20 6.03 8.01 10.00
#> [11] 12.00
optimal <- which.min(sum$cp)
# determine the variable names for the optimal model
X <- summary(search)$which[, -1] # information about included predictors
xvars <- dimnames(X)[[2]] ## column names (all covariates except intercept)
xvars <- xvars[X[optimal,]]
# the optimal model contains all dummy variables of Fuel_Type
xvars <- c("Age_08_04", "KM", "HP", "Quarterly_Tax", "Weight", "Fuel_Type")
# rebuild model for best predictor set
set.seed(1)
trControl <- caret::trainControl(method="cv", number=5, allowParallel=TRUE)
model <- caret::train(Price ~ ., data=car.df[, c("Price", xvars)],
method="lm", # specify the model
trControl=trControl)
model
#> Linear Regression
#>
#> 1000 samples
#> 6 predictor
#>
#> No pre-processing
#> Resampling: Cross-Validated (5 fold)
#> Summary of sample sizes: 799, 801, 800, 799, 801
#> Resampling results:
#>
#> RMSE Rsquared MAE
#> 1379 0.867 1031
#>
#> Tuning parameter 'intercept' was held constant at a value of TRUE
coef(model$finalModel)
#> (Intercept) Age_08_04 KM HP Quarterly_Tax
#> -6363.8962 -132.0947 -0.0189 35.1790 15.6829
#> Weight Fuel_TypeDiesel Fuel_TypePetrol
#> 17.1561 641.5680 2486.1642
metric.exhaustive <- collectMetrics(model, train.df, holdout.df)
# as model performance is estimated using AIC, we don't need to use cross-validation
trControl <- caret::trainControl(method="none")
model <- caret::train(Price ~ ., data=train.df, trControl=trControl,
# select backward elmination
method="glmStepAIC", direction='backward')
#> Start: AIC=10377
#> .outcome ~ Age_08_04 + KM + Fuel_TypeDiesel + Fuel_TypePetrol +
#> HP + Met_Color + Automatic + CC + Doors + Quarterly_Tax +
#> Weight
#>
#> Df Deviance AIC
#> - Doors 1 1062310738 10375
#> - Met_Color 1 1062315617 10375
#> - CC 1 1062354916 10375
#> - Automatic 1 1064230947 10376
#> <none> 1062308322 10377
#> - Fuel_TypeDiesel 1 1067083282 10378
#> - Fuel_TypePetrol 1 1078318623 10384
#> - Quarterly_Tax 1 1087116311 10389
#> - KM 1 1164556303 10430
#> - HP 1 1167515944 10432
#> - Weight 1 1189413064 10443
#> - Age_08_04 1 2399657299 10865
#>
#> Step: AIC=10375
#> .outcome ~ Age_08_04 + KM + Fuel_TypeDiesel + Fuel_TypePetrol +
#> HP + Met_Color + Automatic + CC + Quarterly_Tax + Weight
#>
#> Df Deviance AIC
#> - Met_Color 1 1062317361 10373
#> - CC 1 1062357828 10373
#> - Automatic 1 1064231132 10374
#> <none> 1062310738 10375
#> - Fuel_TypeDiesel 1 1067084546 10376
#> - Fuel_TypePetrol 1 1078459491 10382
#> - Quarterly_Tax 1 1087171002 10387
#> - KM 1 1164578923 10428
#> - HP 1 1169381434 10431
#> - Weight 1 1202134106 10447
#> - Age_08_04 1 2399657526 10863
#>
#> Step: AIC=10373
#> .outcome ~ Age_08_04 + KM + Fuel_TypeDiesel + Fuel_TypePetrol +
#> HP + Automatic + CC + Quarterly_Tax + Weight
#>
#> Df Deviance AIC
#> - CC 1 1062363137 10371
#> - Automatic 1 1064260284 10372
#> <none> 1062317361 10373
#> - Fuel_TypeDiesel 1 1067086942 10374
#> - Fuel_TypePetrol 1 1078460073 10380
#> - Quarterly_Tax 1 1087227750 10385
#> - KM 1 1164601826 10426
#> - HP 1 1169381865 10429
#> - Weight 1 1202694121 10446
#> - Age_08_04 1 2404025593 10862
#>
#> Step: AIC=10371
#> .outcome ~ Age_08_04 + KM + Fuel_TypeDiesel + Fuel_TypePetrol +
#> HP + Automatic + Quarterly_Tax + Weight
#>
#> Df Deviance AIC
#> - Automatic 1 1064479599 10370
#> <none> 1062363137 10371
#> - Fuel_TypeDiesel 1 1067205988 10372
#> - Fuel_TypePetrol 1 1078509401 10378
#> - Quarterly_Tax 1 1087307523 10383
#> - KM 1 1164621006 10424
#> - HP 1 1171040817 10428
#> - Weight 1 1203279589 10444
#> - Age_08_04 1 2404329389 10860
#>
#> Step: AIC=10370
#> .outcome ~ Age_08_04 + KM + Fuel_TypeDiesel + Fuel_TypePetrol +
#> HP + Quarterly_Tax + Weight
#>
#> Df Deviance AIC
#> <none> 1064479599 10370
#> - Fuel_TypeDiesel 1 1069038155 10371
#> - Fuel_TypePetrol 1 1080915857 10377
#> - Quarterly_Tax 1 1089142014 10382
#> - KM 1 1167120338 10424
#> - HP 1 1171287677 10426
#> - Weight 1 1219525385 10450
#> - Age_08_04 1 2411897369 10860
coef(model$finalModel)
#> (Intercept) Age_08_04 KM Fuel_TypeDiesel Fuel_TypePetrol
#> -4129.8231 -133.3675 -0.0174 1147.8554 2193.7124
#> HP Quarterly_Tax Weight
#> 35.9268 9.6020 15.6732
model <- caret::train(Price ~ ., data=train.df, trControl=trControl,
method="glmStepAIC", direction='forward')
#> Start: AIC=11577
#> .outcome ~ 1
#>
#> Df Deviance AIC
#> + Age_08_04 1 1839473922 10687
#> + Weight 1 4942095895 11281
#> + KM 1 5428350510 11337
#> + HP 1 7015561395 11492
#> + Doors 1 7621442286 11541
#> + Quarterly_Tax 1 7707385696 11548
#> + Met_Color 1 7959808301 11567
#> + CC 1 7965963079 11568
#> + Fuel_TypeDiesel 1 8080351818 11576
#> <none> 8119928181 11577
#> + Fuel_TypePetrol 1 8095103402 11578
#> + Automatic 1 8108208602 11579
#>
#> Step: AIC=10687
#> .outcome ~ Age_08_04
#>
#> Df Deviance AIC
#> + HP 1 1495755628 10565
#> + Weight 1 1568531621 10593
#> + KM 1 1731231021 10653
#> + Doors 1 1804938849 10678
#> + Automatic 1 1811625126 10680
#> + CC 1 1822653805 10683
#> + Fuel_TypePetrol 1 1827199292 10685
#> + Quarterly_Tax 1 1828134693 10685
#> + Fuel_TypeDiesel 1 1832172362 10687
#> <none> 1839473922 10687
#> + Met_Color 1 1836889395 10688
#>
#> Step: AIC=10565
#> .outcome ~ Age_08_04 + HP
#>
#> Df Deviance AIC
#> + Weight 1 1237149544 10453
#> + Quarterly_Tax 1 1381514233 10519
#> + Fuel_TypeDiesel 1 1425944734 10538
#> + Fuel_TypePetrol 1 1449296951 10548
#> + KM 1 1464314768 10554
#> + Doors 1 1472806525 10557
#> + Automatic 1 1478233353 10560
#> + CC 1 1480195311 10560
#> <none> 1495755628 10565
#> + Met_Color 1 1495343756 10567
#>
#> Step: AIC=10453
#> .outcome ~ Age_08_04 + HP + Weight
#>
#> Df Deviance AIC
#> + KM 1 1096745657 10382
#> + Fuel_TypePetrol 1 1195162204 10434
#> + Fuel_TypeDiesel 1 1213907602 10443
#> + Automatic 1 1230466003 10451
#> <none> 1237149544 10453
#> + Doors 1 1234977609 10454
#> + Quarterly_Tax 1 1236283850 10454
#> + CC 1 1237134720 10455
#> + Met_Color 1 1237148719 10455
#>
#> Step: AIC=10382
#> .outcome ~ Age_08_04 + HP + Weight + KM
#>
#> Df Deviance AIC
#> + Quarterly_Tax 1 1087643838 10379
#> <none> 1096745657 10382
#> + Fuel_TypePetrol 1 1094391693 10383
#> + Automatic 1 1094527602 10383
#> + CC 1 1096518027 10384
#> + Doors 1 1096547709 10384
#> + Fuel_TypeDiesel 1 1096571992 10384
#> + Met_Color 1 1096707124 10384
#>
#> Step: AIC=10379
#> .outcome ~ Age_08_04 + HP + Weight + KM + Quarterly_Tax
#>
#> Df Deviance AIC
#> + Fuel_TypePetrol 1 1069038155 10371
#> + Fuel_TypeDiesel 1 1080915857 10377
#> <none> 1087643838 10379
#> + Automatic 1 1084134742 10379
#> + Doors 1 1086976100 10381
#> + CC 1 1087573859 10381
#> + Met_Color 1 1087641402 10381
#>
#> Step: AIC=10371
#> .outcome ~ Age_08_04 + HP + Weight + KM + Quarterly_Tax + Fuel_TypePetrol
#>
#> Df Deviance AIC
#> + Fuel_TypeDiesel 1 1064479599 10370
#> <none> 1069038155 10371
#> + Automatic 1 1067205988 10372
#> + CC 1 1068706629 10373
#> + Met_Color 1 1069024990 10373
#> + Doors 1 1069029498 10373
#>
#> Step: AIC=10370
#> .outcome ~ Age_08_04 + HP + Weight + KM + Quarterly_Tax + Fuel_TypePetrol +
#> Fuel_TypeDiesel
#>
#> Df Deviance AIC
#> <none> 1064479599 10370
#> + Automatic 1 1062363137 10371
#> + CC 1 1064260284 10372
#> + Met_Color 1 1064454849 10372
#> + Doors 1 1064479146 10372
coef(model$finalModel)
#> (Intercept) Age_08_04 HP Weight KM
#> -4129.8231 -133.3675 35.9268 15.6732 -0.0174
#> Quarterly_Tax Fuel_TypePetrol Fuel_TypeDiesel
#> 9.6020 2193.7124 1147.8554
model <- caret::train(Price ~ ., data=train.df, trControl=trControl,
method="glmStepAIC", direction='both')
#> Start: AIC=10377
#> .outcome ~ Age_08_04 + KM + Fuel_TypeDiesel + Fuel_TypePetrol +
#> HP + Met_Color + Automatic + CC + Doors + Quarterly_Tax +
#> Weight
#>
#> Df Deviance AIC
#> - Doors 1 1062310738 10375
#> - Met_Color 1 1062315617 10375
#> - CC 1 1062354916 10375
#> - Automatic 1 1064230947 10376
#> <none> 1062308322 10377
#> - Fuel_TypeDiesel 1 1067083282 10378
#> - Fuel_TypePetrol 1 1078318623 10384
#> - Quarterly_Tax 1 1087116311 10389
#> - KM 1 1164556303 10430
#> - HP 1 1167515944 10432
#> - Weight 1 1189413064 10443
#> - Age_08_04 1 2399657299 10865
#>
#> Step: AIC=10375
#> .outcome ~ Age_08_04 + KM + Fuel_TypeDiesel + Fuel_TypePetrol +
#> HP + Met_Color + Automatic + CC + Quarterly_Tax + Weight
#>
#> Df Deviance AIC
#> - Met_Color 1 1062317361 10373
#> - CC 1 1062357828 10373
#> - Automatic 1 1064231132 10374
#> <none> 1062310738 10375
#> - Fuel_TypeDiesel 1 1067084546 10376
#> + Doors 1 1062308322 10377
#> - Fuel_TypePetrol 1 1078459491 10382
#> - Quarterly_Tax 1 1087171002 10387
#> - KM 1 1164578923 10428
#> - HP 1 1169381434 10431
#> - Weight 1 1202134106 10447
#> - Age_08_04 1 2399657526 10863
#>
#> Step: AIC=10373
#> .outcome ~ Age_08_04 + KM + Fuel_TypeDiesel + Fuel_TypePetrol +
#> HP + Automatic + CC + Quarterly_Tax + Weight
#>
#> Df Deviance AIC
#> - CC 1 1062363137 10371
#> - Automatic 1 1064260284 10372
#> <none> 1062317361 10373
#> - Fuel_TypeDiesel 1 1067086942 10374
#> + Met_Color 1 1062310738 10375
#> + Doors 1 1062315617 10375
#> - Fuel_TypePetrol 1 1078460073 10380
#> - Quarterly_Tax 1 1087227750 10385
#> - KM 1 1164601826 10426
#> - HP 1 1169381865 10429
#> - Weight 1 1202694121 10446
#> - Age_08_04 1 2404025593 10862
#>
#> Step: AIC=10371
#> .outcome ~ Age_08_04 + KM + Fuel_TypeDiesel + Fuel_TypePetrol +
#> HP + Automatic + Quarterly_Tax + Weight
#>
#> Df Deviance AIC
#> - Automatic 1 1064479599 10370
#> <none> 1062363137 10371
#> - Fuel_TypeDiesel 1 1067205988 10372
#> + CC 1 1062317361 10373
#> + Met_Color 1 1062357828 10373
#> + Doors 1 1062360907 10373
#> - Fuel_TypePetrol 1 1078509401 10378
#> - Quarterly_Tax 1 1087307523 10383
#> - KM 1 1164621006 10424
#> - HP 1 1171040817 10428
#> - Weight 1 1203279589 10444
#> - Age_08_04 1 2404329389 10860
#>
#> Step: AIC=10370
#> .outcome ~ Age_08_04 + KM + Fuel_TypeDiesel + Fuel_TypePetrol +
#> HP + Quarterly_Tax + Weight
#>
#> Df Deviance AIC
#> <none> 1064479599 10370
#> - Fuel_TypeDiesel 1 1069038155 10371
#> + Automatic 1 1062363137 10371
#> + CC 1 1064260284 10372
#> + Met_Color 1 1064454849 10372
#> + Doors 1 1064479146 10372
#> - Fuel_TypePetrol 1 1080915857 10377
#> - Quarterly_Tax 1 1089142014 10382
#> - KM 1 1167120338 10424
#> - HP 1 1171287677 10426
#> - Weight 1 1219525385 10450
#> - Age_08_04 1 2411897369 10860
coef(model$finalModel)
#> (Intercept) Age_08_04 KM Fuel_TypeDiesel Fuel_TypePetrol
#> -4129.8231 -133.3675 -0.0174 1147.8554 2193.7124
#> HP Quarterly_Tax Weight
#> 35.9268 9.6020 15.6732
rbind(Training=mlba::regressionSummary(predict(model, train.df), train.df$Price),
Holdout=mlba::regressionSummary(predict(model, holdout.df), holdout.df$Price))
#> ME RMSE MAE
#> Training 0.0000000000876 1331 1010
#> Holdout 1.37 1423 1054
# The models are identical to the best model obtained from the exhaustive search.
# We therefore duplicate the metrics.
metric.stepwise <- metric.exhaustive
set.seed(1)
library(caret)
trControl <- caret::trainControl(method='cv', number=5, allowParallel=TRUE)
tuneGrid <- expand.grid(lambda=10^seq(5, 2, by=-0.1), alpha=0)
model <- caret::train(Price ~ ., data=train.df,
method='glmnet',
family='gaussian', # set the family for linear regression
trControl=trControl,
tuneGrid=tuneGrid)
model$bestTune
#> alpha lambda
#> 11 0 1000
coef(model$finalModel, s=model$bestTune$lambda)
#> 12 x 1 sparse Matrix of class "dgCMatrix"
#> s1
#> (Intercept) -2658.9954
#> Age_08_04 -100.9073
#> KM -0.0223
#> Fuel_TypeDiesel -8.4115
#> Fuel_TypePetrol 266.9359
#> HP 33.9810
#> Met_Color 97.2147
#> Automatic 195.7619
#> CC 0.0726
#> Doors 99.6551
#> Quarterly_Tax 6.5570
#> Weight 14.7082
metric.ridge <- collectMetrics(model, train.df, holdout.df,
length(coef(model$finalModel, s=model$bestTune$lambda)) - 1)
ridge.model <- model
rbind(
Training=mlba::regressionSummary(predict(model, train.df), train.df$Price),
Holdout=mlba::regressionSummary(predict(model, holdout.df), holdout.df$Price)
)
#> ME RMSE MAE
#> Training 0.0000000000181 1423 1061
#> Holdout 6.63 1590 1162
set.seed(1)
tuneGrid <- expand.grid(lambda=10^seq(4, 0, by=-0.1), alpha=1)
model <- caret::train(Price ~ ., data=train.df,
method='glmnet',
family='gaussian', # set the family for linear regression
trControl=trControl,
tuneGrid=tuneGrid)
model$bestTune
#> alpha lambda
#> 23 1 158
coef(model$finalModel, s=model$bestTune$lambda)
#> 12 x 1 sparse Matrix of class "dgCMatrix"
#> s1
#> (Intercept) 368.168
#> Age_08_04 -132.917
#> KM -0.015
#> Fuel_TypeDiesel .
#> Fuel_TypePetrol .
#> HP 30.759
#> Met_Color .
#> Automatic .
#> CC .
#> Doors .
#> Quarterly_Tax .
#> Weight 14.536
lasso.model <- model
metric.lasso <- collectMetrics(lasso.model, train.df, holdout.df,
sum(coef(lasso.model$finalModel, s=lasso.model$bestTune$lambda) != 0) - 1)
rbind(
Training=mlba::regressionSummary(predict(model, train.df), train.df$Price),
Holdout=mlba::regressionSummary(predict(model, holdout.df), holdout.df$Price)
)
#> ME RMSE MAE
#> Training 0.0000000000124 1372 1026
#> Holdout -24 1557 1112
library(tidyverse)
library(gridExtra)
g1 <- ggplot(ridge.model$results, aes(x=lambda, y=RMSE)) +
geom_pointrange(aes(ymin=RMSE-RMSESD, ymax=RMSE+RMSESD), color='grey') +
geom_line() +
geom_point(data=ridge.model$results %>% subset(RMSE == min(RMSE)), color='red') +
labs(x=expression(paste('Ridge parameter ', lambda)),
y='RMSE (cross-validation)') +
scale_x_log10()
g2 <- ggplot(lasso.model$results, aes(x=lambda, y=RMSE)) +
geom_pointrange(aes(ymin=RMSE-RMSESD, ymax=RMSE+RMSESD), color='grey') +
geom_line() +
geom_point(data=lasso.model$results %>% subset(RMSE == min(RMSE)), color='red') +
labs(x=expression(paste('Lasso parameter ', lambda)),
y='RMSE (cross-validation)') +
scale_x_log10()
grid.arrange(g1, g2, ncol=2)
g <- arrangeGrob(g1 + theme_bw(), g2 + theme_bw(), ncol=2)
ggsave(file=file.path(getwd(), 'figures', 'chapter_06', 'shrinkage-parameter-tuning.pdf'),
g, width=6, height=2.5, units='in')
data.frame(rbind(
'full'= metric.full,
'exhaustive' = metric.exhaustive,
'stepwise' = metric.stepwise,
'ridge' = metric.ridge,
'lasso' = metric.lasso
))
#> CV.RMSE CV.MAE Training.ME Training.RMSE Training.MAE Holdout.ME
#> full 1949 1086 0.7284815029209 1344 1009 -1.10
#> exhaustive 1379 1031 0.9735056879862 1343 1012 -1.47
#> stepwise 1379 1031 0.9735056879862 1343 1012 -1.47
#> ridge 1507 1092 0.0000000000181 1423 1061 6.63
#> lasso 1396 1039 0.0000000000124 1372 1026 -23.96
#> Holdout.RMSE Holdout.MAE nPredictors
#> full 1366 1030 11
#> exhaustive 1373 1034 7
#> stepwise 1373 1034 7
#> ridge 1590 1162 11
#> lasso 1557 1112 4