# Introduce all used packages
library(ggplot2)
library(lattice)
library(caret)
library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr     1.1.2     ✔ readr     2.1.4
## ✔ forcats   1.0.0     ✔ stringr   1.5.0
## ✔ lubridate 1.9.2     ✔ tibble    3.2.1
## ✔ purrr     1.0.1     ✔ tidyr     1.3.0
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
## ✖ purrr::lift()   masks caret::lift()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(leaps)
library(fastDummies)
## Thank you for using fastDummies!
## To acknowledge our work, please cite the package:
## Kaplan, J. & Schlegel, B. (2023). fastDummies: Fast Creation of Dummy (Binary) Columns and Rows from Categorical Variables. Version 1.7.1. URL: https://github.com/jacobkap/fastDummies, https://jacobkap.github.io/fastDummies/.
library(gridExtra)
## 
## 载入程辑包:'gridExtra'
## 
## The following object is masked from 'package:dplyr':
## 
##     combine




Read Data

options(scipen=999, digits = 3)

# read data
westroxbury.df <- mlba::WestRoxbury

# select variables for regression
outcome <- "TOTAL.VALUE"
predictors <- c("LOT.SQFT", "YR.BUILT", "GROSS.AREA", "LIVING.AREA", "FLOORS",
                "ROOMS", "BEDROOMS", "FULL.BATH", "HALF.BATH", "KITCHEN", "FIREPLACE", "REMODEL")

# reduce data set to first 3000 rows and selected variable
west.df <- westroxbury.df[1:3000, c(outcome, predictors)]

Train Data and Test Data

# partition data
set.seed(1)  # set seed for reproducing the partition
idx <- createDataPartition(west.df$TOTAL.VALUE, p=0.6, list=FALSE)
train.df <- west.df[idx, ]
holdout.df <- west.df[-idx, ]




Model Training and Testing

# use lm() to run a linear regression of Price on all 11 predictors in the
# training set.
# use . after ~ to include all the remaining columns in train.df as predictors.
west.lm <- lm(TOTAL.VALUE ~ ., data = train.df)
#  use options() to ensure numbers are not displayed in scientific notation.
options(scipen = 999)
summary(west.lm)
## 
## Call:
## lm(formula = TOTAL.VALUE ~ ., data = train.df)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -178.44  -27.42   -0.86   25.87  198.53 
## 
## Coefficients:
##                 Estimate Std. Error t value             Pr(>|t|)    
## (Intercept)   -35.295638  41.002262   -0.86               0.3895    
## LOT.SQFT        0.007807   0.000392   19.93 < 0.0000000000000002 ***
## YR.BUILT        0.037122   0.019960    1.86               0.0631 .  
## GROSS.AREA      0.030378   0.002772   10.96 < 0.0000000000000002 ***
## LIVING.AREA     0.050552   0.005024   10.06 < 0.0000000000000002 ***
## FLOORS         41.417184   2.862168   14.47 < 0.0000000000000002 ***
## ROOMS           3.650320   1.231809    2.96               0.0031 ** 
## BEDROOMS       -2.626316   1.859669   -1.41               0.1581    
## FULL.BATH      15.182365   2.407331    6.31        0.00000000036 ***
## HALF.BATH      19.608046   2.213825    8.86 < 0.0000000000000002 ***
## KITCHEN       -12.894560   8.719267   -1.48               0.1394    
## FIREPLACE      17.145103   1.920669    8.93 < 0.0000000000000002 ***
## REMODELOld      4.670305   3.527513    1.32               0.1857    
## REMODELRecent  27.143402   3.129025    8.67 < 0.0000000000000002 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 43.1 on 1787 degrees of freedom
## Multiple R-squared:  0.796,  Adjusted R-squared:  0.794 
## F-statistic:  536 on 13 and 1787 DF,  p-value: <0.0000000000000002
# use predict() to make predictions on a new set.
pred <- predict(west.lm, holdout.df)

options(scipen=999, digits=1)
data.frame(
  'Predicted' = pred[1:20],
  'Actual' = holdout.df$TOTAL.VALUE[1:20],
  'Residual' = holdout.df$TOTAL.VALUE[1:20] - pred[1:20]
)
##    Predicted Actual Residual
## 1        373    344      -29
## 6        275    337       63
## 7        393    359      -34
## 10       492    409      -82
## 13       305    316       11
## 14       555    575       20
## 15       333    326       -7
## 18       350    345       -5
## 20       376    348      -28
## 21       278    318       39
## 22       326    331        5
## 26       336    346       10
## 30       327    321       -6
## 32       314    294      -20
## 38       340    296      -44
## 40       343    318      -24
## 42       364    350      -14
## 47       394    372      -23
## 50       379    432       53
## 56       392    346      -46




Model Evaluation

options(scipen=999, digits = 3)

# calculate performance metrics
rbind(
  Training=mlba::regressionSummary(predict(west.lm, train.df), train.df$TOTAL.VALUE),
  Holdout=mlba::regressionSummary(pred, holdout.df$TOTAL.VALUE)
)
##          ME                RMSE MAE 
## Training -0.00000000000119 42.9 33.1
## Holdout  -0.202            42.3 32.9
pred <- predict(west.lm, holdout.df)
all.residuals <- holdout.df$TOTAL.VALUE - pred

ggplot() +
  geom_histogram(aes(x=all.residuals), fill="pink", color="blue") +
  labs(x="Residuals", y="Frequency")
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

g <- ggplot() +
  geom_histogram(aes(x=all.residuals), fill="pink", color="blue") +
  labs(x="Residuals", y="Frequency") +
  theme_bw()
# ggsave(file=file.path("E:\\Documentation\\课程\\R\\test1", "residuals-histogram.pdf"), g, width=5, height=3, units="in")





Cross-validation and caret

set.seed(1)
# define 5-fold
trControl <- caret::trainControl(method="cv", number=5, allowParallel=TRUE)
model <- caret::train(TOTAL.VALUE ~ ., data=west.df,
                      method="lm",  # specify the model
                      trControl=trControl)
model
## Linear Regression 
## 
## 3000 samples
##   12 predictor
## 
## No pre-processing
## Resampling: Cross-Validated (5 fold) 
## Summary of sample sizes: 2400, 2402, 2399, 2399, 2400 
## Resampling results:
## 
##   RMSE  Rsquared  MAE 
##   43.5  0.782     33.1
## 
## Tuning parameter 'intercept' was held constant at a value of TRUE
coef(model$finalModel)
##   (Intercept)      LOT.SQFT      YR.BUILT    GROSS.AREA   LIVING.AREA 
##      -84.2348        0.0082        0.0579        0.0323        0.0436 
##        FLOORS         ROOMS      BEDROOMS     FULL.BATH     HALF.BATH 
##       42.8958        3.3285       -0.7456       16.1256       19.7283 
##       KITCHEN     FIREPLACE    REMODELOld REMODELRecent 
##       -8.6909       17.8466        4.1979       25.3402
collectMetrics <- function(model, train.df, holdout.df, nPredictors) {
  if (missing(nPredictors)) {
    coefs = coef(model$finalModel)
    nPredictors = length(coefs) - 1
  }
  return (cbind(
    CV=model$results %>% slice_min(RMSE) %>% dplyr::select(c(RMSE, MAE)),
    Training=mlba::regressionSummary(predict(model, train.df), train.df$TOTAL.VALUE),
    Holdout=mlba::regressionSummary(predict(model, holdout.df), holdout.df$TOTAL.VALUE),
    nPredictors=nPredictors
  ))
}

metric.full <- collectMetrics(model, train.df, holdout.df)

predict(model, west.df[1:3,])
##   1   2   3 
## 375 454 357





Variable Selection in Linear Regression

Reduce the Number of Predictors

Regularization (Shrinkage Models)

set.seed(1)
library(caret)
trControl <- caret::trainControl(method='cv', number=5, allowParallel=TRUE)
tuneGrid <- expand.grid(lambda=10^seq(5, 2, by=-0.1), alpha=0)
model <- caret::train(TOTAL.VALUE ~ ., data=west.df,
                      method='glmnet',
                      family='gaussian',  # set the family for linear regression
                      trControl=trControl,
                      tuneGrid=tuneGrid)
## Warning in nominalTrainWorkflow(x = x, y = y, wts = weights, info = trainInfo,
## : There were missing values in resampled performance measures.
model$bestTune
##   alpha lambda
## 1     0    100
coef(model$finalModel, s=model$bestTune$lambda)
## 14 x 1 sparse Matrix of class "dgCMatrix"
##                     s1
## (Intercept)   90.24293
## LOT.SQFT       0.00481
## YR.BUILT       0.00288
## GROSS.AREA     0.02006
## LIVING.AREA    0.03159
## FLOORS        24.38370
## ROOMS          6.55513
## BEDROOMS       7.26270
## FULL.BATH     13.13814
## HALF.BATH     14.27822
## KITCHEN       -7.55701
## FIREPLACE     13.32439
## REMODELOld     2.37126
## REMODELRecent 15.48961
metric.ridge <- collectMetrics(model, train.df, holdout.df,
                               length(coef(model$finalModel, s=model$bestTune$lambda)) - 1)
ridge.model <- model

rbind(
  Training=mlba::regressionSummary(predict(model, train.df), train.df$TOTAL.VALUE),
  Holdout=mlba::regressionSummary(predict(model, holdout.df), holdout.df$TOTAL.VALUE)
)
##          ME     RMSE MAE 
## Training 0.427  48.7 37.1
## Holdout  -0.641 47   36.7
set.seed(1)
tuneGrid <- expand.grid(lambda=10^seq(4, 0, by=-0.1), alpha=1)
model <- caret::train(TOTAL.VALUE ~ ., data=train.df,
                      method='glmnet',
                      family='gaussian',  # set the family for linear regression
                      trControl=trControl,
                      tuneGrid=tuneGrid)
## Warning in nominalTrainWorkflow(x = x, y = y, wts = weights, info = trainInfo,
## : There were missing values in resampled performance measures.
model$bestTune
##   alpha lambda
## 5     1   2.51
coef(model$finalModel, s=model$bestTune$lambda)
## 14 x 1 sparse Matrix of class "dgCMatrix"
##                     s1
## (Intercept)   42.36603
## LOT.SQFT       0.00711
## YR.BUILT       .      
## GROSS.AREA     0.02994
## LIVING.AREA    0.05266
## FLOORS        38.09561
## ROOMS          2.31567
## BEDROOMS       .      
## FULL.BATH     11.18611
## HALF.BATH     15.99640
## KITCHEN        .      
## FIREPLACE     14.82108
## REMODELOld     .      
## REMODELRecent 20.13996
lasso.model <- model
metric.lasso <- collectMetrics(lasso.model, train.df, holdout.df,
                               sum(coef(lasso.model$finalModel, s=lasso.model$bestTune$lambda) != 0) - 1)
rbind(
  Training=mlba::regressionSummary(predict(model, train.df), train.df$TOTAL.VALUE),
  Holdout=mlba::regressionSummary(predict(model, holdout.df), holdout.df$TOTAL.VALUE)
)
##          ME                 RMSE MAE 
## Training -0.000000000000135 43.3 33.3
## Holdout  -0.368             42.7 33.1
g1 <- ggplot(ridge.model$results, aes(x=lambda, y=RMSE)) +
  geom_pointrange(aes(ymin=RMSE-RMSESD, ymax=RMSE+RMSESD), color='green') +
  geom_line() +
  geom_point(data=ridge.model$results %>% subset(RMSE == min(RMSE)), color='red') +
  labs(x=expression(paste('Ridge parameter ', lambda)),
       y='RMSE (cross-validation)') +
  scale_x_log10()
g2 <- ggplot(lasso.model$results, aes(x=lambda, y=RMSE)) +
  geom_pointrange(aes(ymin=RMSE-RMSESD, ymax=RMSE+RMSESD), color='green') +
  geom_line() +
  geom_point(data=lasso.model$results %>% subset(RMSE == min(RMSE)), color='red') +
  labs(x=expression(paste('Lasso parameter ', lambda)),
       y='RMSE (cross-validation)') +
  scale_x_log10()
grid.arrange(g1, g2, ncol=2)

g <- arrangeGrob(g1 + theme_bw(), g2 + theme_bw(), ncol=2)
# ggsave(file=file.path("E:\\Documentation\\课程\\R\\test1", "shrinkage-parameter-tuning.pdf"), g, width=6, height=2.5, units='in')

data.frame(rbind(
  'full'= metric.full,
  'exhaustive' = metric.exhaustive,
  'stepwise' = metric.stepwise,
  'ridge' = metric.ridge,
  'lasso' = metric.lasso
))
##            CV.RMSE CV.MAE        Training.ME Training.RMSE Training.MAE
## full          43.5   33.1  0.134712559086481          43.0         33.0
## exhaustive    42.9   33.1  0.192950992986573          43.0         33.1
## stepwise      42.9   33.1  0.192950992986573          43.0         33.1
## ridge         48.2   37.1  0.426909206328931          48.7         37.1
## lasso         43.9   33.5 -0.000000000000135          43.3         33.3
##            Holdout.ME Holdout.RMSE Holdout.MAE nPredictors
## full           -0.202         42.0        32.6          13
## exhaustive     -0.290         42.1        32.7          10
## stepwise       -0.290         42.1        32.7          10
## ridge          -0.641         47.0        36.7          13
## lasso          -0.368         42.7        33.1           9