# install the mlba package from GitHub if it is not yet available
if (!require(mlba)) {
  library(devtools)
  install_github("gedeck/mlba/mlba", force=TRUE)
}
options(scipen=999, digits = 3)

Estimating the Regression Equation and Prediction

Example: Predicting the Price of Used Toyota Corolla Cars

library(caret)
car.df <- mlba::ToyotaCorolla
# select variables for regression
outcome <- "Price"
predictors <- c("Age_08_04", "KM", "Fuel_Type", "HP", "Met_Color", "Automatic",
                "CC", "Doors", "Quarterly_Tax", "Weight")
# reduce data set to first 1000 rows and selected variables
car.df <- car.df[1:1000, c(outcome, predictors)]

# partition data
set.seed(1)  # set seed for reproducing the partition
idx <- createDataPartition(car.df$Price, p=0.6, list=FALSE)
train.df <- car.df[idx, ]
holdout.df <- car.df[-idx, ]
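
A quick sanity check of the partition (an optional addition, using only the data frames created above) confirms that roughly 60% of the rows landed in the training set; createDataPartition() samples within quantiles of the outcome, so both partitions cover the full price range.

# optional: verify the 60%/40% split
dim(train.df)
dim(holdout.df)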

# use lm() to run a linear regression of Price on all 10 predictors in the
# training set; the . after ~ includes all remaining columns of train.df as
# predictors (the categorical Fuel_Type expands into two dummy variables).
car.lm <- lm(Price ~ ., data = train.df)
# use options() to ensure numbers are not displayed in scientific notation.
options(scipen = 999)
summary(car.lm)
#> 
#> Call:
#> lm(formula = Price ~ ., data = train.df)
#> 
#> Residuals:
#>    Min     1Q Median     3Q    Max 
#>  -9047   -831     -6    832   6057 
#> 
#> Coefficients:
#>                    Estimate  Std. Error t value             Pr(>|t|)    
#> (Intercept)     -3725.59270  1913.92374   -1.95              0.05206 .  
#> Age_08_04        -133.98649     4.92047  -27.23 < 0.0000000000000002 ***
#> KM                 -0.01741     0.00231   -7.53  0.00000000000019238 ***
#> Fuel_TypeDiesel  1179.18603   724.71141    1.63              0.10425    
#> Fuel_TypePetrol  2173.64897   729.55378    2.98              0.00301 ** 
#> HP                 36.34253     4.75838    7.64  0.00000000000008997 ***
#> Met_Color          -7.60255   119.54320   -0.06              0.94931    
#> Automatic         276.55860   267.85985    1.03              0.30227    
#> CC                  0.01517     0.09440    0.16              0.87236    
#> Doors               2.28016    62.30556    0.04              0.97082    
#> Quarterly_Tax       9.64453     2.60048    3.71              0.00023 ***
#> Weight             15.25566     1.81726    8.39  0.00000000000000035 ***
#> ---
#> Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
#> 
#> Residual standard error: 1340 on 589 degrees of freedom
#> Multiple R-squared:  0.869,  Adjusted R-squared:  0.867 
#> F-statistic:  356 on 11 and 589 DF,  p-value: <0.0000000000000002
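
Beyond the printed summary, the fitted coefficients and their confidence intervals can be extracted programmatically. This is an optional sketch using base R's coef() and confint(); it is not part of the original example.

# extract coefficient estimates and 95% confidence intervals (optional)
coef(car.lm)
confint(car.lm, level=0.95)
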
# use predict() to make predictions on the holdout set.
pred <- predict(car.lm, holdout.df)

options(scipen=999, digits=1)
data.frame(
  'Predicted' = pred[1:20],
  'Actual' = holdout.df$Price[1:20],
  'Residual' = holdout.df$Price[1:20] - pred[1:20]
)
#>    Predicted Actual Residual
#> 1      16652  13500    -3152
#> 14     19941  21500     1559
#> 15     19613  22500     2887
#> 16     20424  22000     1576
#> 18     16553  17950     1397
#> 19     15247  16750     1503
#> 20     15006  16950     1944
#> 21     14949  15950     1001
#> 22     16917  16950       33
#> 24     16063  16950      887
#> 25     16040  16250      210
#> 26     16530  15950     -580
#> 30     16163  17950     1787
#> 32     16034  15750     -284
#> 36     15370  15750      380
#> 37     15817  15950      133
#> 39     14866  15750      884
#> 40     15506  14750     -756
#> 41     15800  13950    -1850
#> 45     17475  16950     -525
options(scipen=999, digits = 3)

# calculate performance metrics
rbind(
  Training=mlba::regressionSummary(predict(car.lm, train.df), train.df$Price),
  Holdout=mlba::regressionSummary(pred, holdout.df$Price)
)
#>          ME              RMSE MAE 
#> Training 0.0000000000842 1329 1009
#> Holdout  1.7             1423 1054
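
For reference, the numbers reported by mlba::regressionSummary() can be reproduced by hand. The sketch below is an added illustration, not part of the original example; it computes the mean error (ME), RMSE, and MAE directly from the holdout residuals.

# reproduce ME, RMSE, and MAE from the holdout residuals (illustrative)
err <- holdout.df$Price - pred
c(ME=mean(err), RMSE=sqrt(mean(err^2)), MAE=mean(abs(err)))
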
library(ggplot2)
pred <- predict(car.lm, holdout.df)
all.residuals <- holdout.df$Price - pred

ggplot() +
    geom_histogram(aes(x=all.residuals), fill="lightgray", color="grey") +
    labs(x="Residuals", y="Frequency")

g <- ggplot() +
         geom_histogram(aes(x=all.residuals), fill="lightgray", color="grey") +
         labs(x="Residuals", y="Frequency") +
         theme_bw()
ggsave(file=file.path(getwd(), "figures", "chapter_06", "residuals-histogram.pdf"),
       g, width=5, height=3, units="in")

Cross-validation and caret

set.seed(1)
library(caret)
# define 5-fold cross-validation
trControl <- caret::trainControl(method="cv", number=5, allowParallel=TRUE)
model <- caret::train(Price ~ ., data=car.df,
                      method="lm",  # specify the model
                      trControl=trControl)
model
#> Linear Regression 
#> 
#> 1000 samples
#>   10 predictor
#> 
#> No pre-processing
#> Resampling: Cross-Validated (5 fold) 
#> Summary of sample sizes: 799, 801, 800, 799, 801 
#> Resampling results:
#> 
#>   RMSE  Rsquared  MAE 
#>   1949  0.753     1086
#> 
#> Tuning parameter 'intercept' was held constant at a value of TRUE
coef(model$finalModel)
#>     (Intercept)       Age_08_04              KM Fuel_TypeDiesel Fuel_TypePetrol 
#>      -6419.2947       -132.4105         -0.0187        654.4718       2536.4747 
#>              HP       Met_Color       Automatic              CC           Doors 
#>         35.0697         58.8612        239.5717         -0.0210        -69.0173 
#>   Quarterly_Tax          Weight 
#>         15.9124         17.4002
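
The cross-validated RMSE, R-squared, and MAE above are averages over the five folds. The train object also retains the per-fold results in its resample field; inspecting them (an optional step) shows how much performance varies from fold to fold.

# per-fold performance (optional); each row is one of the 5 CV folds
model$resample
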
library(tidyverse)
# collect cross-validation, training, and holdout metrics for a caret model;
# if nPredictors is not supplied, derive it from the number of coefficients
collectMetrics <- function(model, train.df, holdout.df, nPredictors) {
  if (missing(nPredictors)) {
    coefs <- coef(model$finalModel)
    nPredictors <- length(coefs) - 1  # exclude the intercept
  }
  return(cbind(
    CV=model$results %>% slice_min(RMSE) %>% dplyr::select(c(RMSE, MAE)),
    Training=mlba::regressionSummary(predict(model, train.df), train.df$Price),
    Holdout=mlba::regressionSummary(predict(model, holdout.df), holdout.df$Price),
    nPredictors=nPredictors
  ))
}

metric.full <- collectMetrics(model, train.df, holdout.df)
predict(model, car.df[1:3,])
#>     1     2     3 
#> 16892 16408 16858

Variable Selection in Linear Regression

How to Reduce the Number of Predictors

Regularization (Shrinkage Models)

set.seed(1)
library(caret)
trControl <- caret::trainControl(method='cv', number=5, allowParallel=TRUE)
# alpha=0 selects ridge regression; tune lambda on a log-spaced grid
tuneGrid <- expand.grid(lambda=10^seq(5, 2, by=-0.1), alpha=0)
model <- caret::train(Price ~ ., data=train.df,
                 method='glmnet',
                 family='gaussian',  # set the family for linear regression
                 trControl=trControl,
                 tuneGrid=tuneGrid)
model$bestTune
#>    alpha lambda
#> 11     0   1000
coef(model$finalModel, s=model$bestTune$lambda)
#> 12 x 1 sparse Matrix of class "dgCMatrix"
#>                         s1
#> (Intercept)     -2658.9954
#> Age_08_04        -100.9073
#> KM                 -0.0223
#> Fuel_TypeDiesel    -8.4115
#> Fuel_TypePetrol   266.9359
#> HP                 33.9810
#> Met_Color          97.2147
#> Automatic         195.7619
#> CC                  0.0726
#> Doors              99.6551
#> Quarterly_Tax       6.5570
#> Weight             14.7082
metric.ridge <- collectMetrics(model, train.df, holdout.df,
  length(coef(model$finalModel, s=model$bestTune$lambda)) - 1)
ridge.model <- model
rbind(
  Training=mlba::regressionSummary(predict(model, train.df), train.df$Price),
  Holdout=mlba::regressionSummary(predict(model, holdout.df), holdout.df$Price)
)
#>          ME              RMSE MAE 
#> Training 0.0000000000181 1423 1061
#> Holdout  6.63            1590 1162
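
caret's method='glmnet' is a wrapper around the glmnet package. For readers who prefer to call it directly, the sketch below fits an approximately equivalent ridge model; model.matrix() builds the dummy coding that caret otherwise handles, and the coefficients may differ slightly because glmnet standardizes internally and fits its own lambda path.

# optional: an approximately equivalent ridge fit using glmnet directly
library(glmnet)
X <- model.matrix(Price ~ ., data=train.df)[, -1]  # drop the intercept column
y <- train.df$Price
ridge.direct <- glmnet(X, y, alpha=0)
coef(ridge.direct, s=model$bestTune$lambda)
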
set.seed(1)
# alpha=1 selects the lasso; tune lambda on a log-spaced grid
tuneGrid <- expand.grid(lambda=10^seq(4, 0, by=-0.1), alpha=1)
model <- caret::train(Price ~ ., data=train.df,
                 method='glmnet',
                 family='gaussian',  # set the family for linear regression
                 trControl=trControl,
                 tuneGrid=tuneGrid)
model$bestTune
#>    alpha lambda
#> 23     1    158
coef(model$finalModel, s=model$bestTune$lambda)
#> 12 x 1 sparse Matrix of class "dgCMatrix"
#>                       s1
#> (Intercept)      368.168
#> Age_08_04       -132.917
#> KM                -0.015
#> Fuel_TypeDiesel    .    
#> Fuel_TypePetrol    .    
#> HP                30.759
#> Met_Color          .    
#> Automatic          .    
#> CC                 .    
#> Doors              .    
#> Quarterly_Tax      .    
#> Weight            14.536
lasso.model <- model
metric.lasso <- collectMetrics(lasso.model, train.df, holdout.df,
  sum(coef(lasso.model$finalModel, s=lasso.model$bestTune$lambda) != 0) - 1)
rbind(
  Training=mlba::regressionSummary(predict(model, train.df), train.df$Price),
  Holdout=mlba::regressionSummary(predict(model, holdout.df), holdout.df$Price)
)
#>          ME              RMSE MAE 
#> Training 0.0000000000124 1372 1026
#> Holdout  -24             1557 1112
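
Because the lasso shrinks some coefficients exactly to zero, the selected predictors can be listed programmatically. The short sketch below is an addition that uses only objects defined above; it filters the coefficient matrix for nonzero entries.

# list the predictors the lasso retained (nonzero coefficients)
beta <- as.matrix(coef(lasso.model$finalModel, s=lasso.model$bestTune$lambda))
setdiff(rownames(beta)[beta[, 1] != 0], "(Intercept)")
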
library(tidyverse)
library(gridExtra)
g1 <- ggplot(ridge.model$results, aes(x=lambda, y=RMSE)) +
    geom_pointrange(aes(ymin=RMSE-RMSESD, ymax=RMSE+RMSESD), color='grey') +
    geom_line() +
    geom_point(data=ridge.model$results %>% subset(RMSE == min(RMSE)), color='red') +
    labs(x=expression(paste('Ridge parameter ', lambda)),
         y='RMSE (cross-validation)') +
    scale_x_log10()
g2 <- ggplot(lasso.model$results, aes(x=lambda, y=RMSE)) +
    geom_pointrange(aes(ymin=RMSE-RMSESD, ymax=RMSE+RMSESD), color='grey') +
    geom_line() +
    geom_point(data=lasso.model$results %>% subset(RMSE == min(RMSE)), color='red') +
    labs(x=expression(paste('Lasso parameter ', lambda)),
         y='RMSE (cross-validation)') +
    scale_x_log10()
grid.arrange(g1, g2, ncol=2)

g <- arrangeGrob(g1 + theme_bw(), g2 + theme_bw(), ncol=2)
ggsave(file=file.path(getwd(), 'figures', 'chapter_06', 'shrinkage-parameter-tuning.pdf'),
       g, width=6, height=2.5, units='in')
# combine the metrics collected for all models; metric.exhaustive and
# metric.stepwise come from the exhaustive search and stepwise regression
# code (not shown in this section)
data.frame(rbind(
  'full' = metric.full,
  'exhaustive' = metric.exhaustive,
  'stepwise' = metric.stepwise,
  'ridge' = metric.ridge,
  'lasso' = metric.lasso
))
#>            CV.RMSE CV.MAE     Training.ME Training.RMSE Training.MAE Holdout.ME
#> full          1949   1086 0.7284815029209          1344         1009      -1.10
#> exhaustive    1379   1031 0.9735056879862          1343         1012      -1.47
#> stepwise      1379   1031 0.9735056879862          1343         1012      -1.47
#> ridge         1507   1092 0.0000000000181          1423         1061       6.63
#> lasso         1396   1039 0.0000000000124          1372         1026     -23.96
#>            Holdout.RMSE Holdout.MAE nPredictors
#> full               1366        1030          11
#> exhaustive         1373        1034           7
#> stepwise           1373        1034           7
#> ridge              1590        1162          11
#> lasso              1557        1112           4