Haiyuan Gui’s Document

# Introduce all used packages
library(ggplot2)
library(lattice)
library(caret)
library(tidyverse)

## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr     1.1.2     ✔ readr     2.1.4
## ✔ forcats   1.0.0     ✔ stringr   1.5.0
## ✔ lubridate 1.9.2     ✔ tibble    3.2.1
## ✔ purrr     1.0.1     ✔ tidyr     1.3.0
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
## ✖ purrr::lift()   masks caret::lift()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors

library(leaps)
library(fastDummies)

## Thank you for using fastDummies!
## To acknowledge our work, please cite the package:
## Kaplan, J. & Schlegel, B. (2023). fastDummies: Fast Creation of Dummy (Binary) Columns and Rows from Categorical Variables. Version 1.7.1. URL: https://github.com/jacobkap/fastDummies, https://jacobkap.github.io/fastDummies/.

library(gridExtra)

## 
## 载入程辑包：'gridExtra'
## 
## The following object is masked from 'package:dplyr':
## 
##     combine

Read Data

options(scipen=999, digits = 3)

# read data
westroxbury.df <- mlba::WestRoxbury

# select variables for regression
outcome <- "TOTAL.VALUE"
predictors <- c("LOT.SQFT", "YR.BUILT", "GROSS.AREA", "LIVING.AREA", "FLOORS",
                "ROOMS", "BEDROOMS", "FULL.BATH", "HALF.BATH", "KITCHEN", "FIREPLACE", "REMODEL")

# reduce data set to first 3000 rows and selected variable
west.df <- westroxbury.df[1:3000, c(outcome, predictors)]

Train Data and Test Data

# partition data
set.seed(1)  # set seed for reproducing the partition
idx <- createDataPartition(west.df$TOTAL.VALUE, p=0.6, list=FALSE)
train.df <- west.df[idx, ]
holdout.df <- west.df[-idx, ]

Model Training and Testing

# use lm() to run a linear regression of Price on all 11 predictors in the
# training set.
# use . after ~ to include all the remaining columns in train.df as predictors.
west.lm <- lm(TOTAL.VALUE ~ ., data = train.df)
#  use options() to ensure numbers are not displayed in scientific notation.
options(scipen = 999)
summary(west.lm)

## 
## Call:
## lm(formula = TOTAL.VALUE ~ ., data = train.df)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -178.44  -27.42   -0.86   25.87  198.53 
## 
## Coefficients:
##                 Estimate Std. Error t value             Pr(>|t|)    
## (Intercept)   -35.295638  41.002262   -0.86               0.3895    
## LOT.SQFT        0.007807   0.000392   19.93 < 0.0000000000000002 ***
## YR.BUILT        0.037122   0.019960    1.86               0.0631 .  
## GROSS.AREA      0.030378   0.002772   10.96 < 0.0000000000000002 ***
## LIVING.AREA     0.050552   0.005024   10.06 < 0.0000000000000002 ***
## FLOORS         41.417184   2.862168   14.47 < 0.0000000000000002 ***
## ROOMS           3.650320   1.231809    2.96               0.0031 ** 
## BEDROOMS       -2.626316   1.859669   -1.41               0.1581    
## FULL.BATH      15.182365   2.407331    6.31        0.00000000036 ***
## HALF.BATH      19.608046   2.213825    8.86 < 0.0000000000000002 ***
## KITCHEN       -12.894560   8.719267   -1.48               0.1394    
## FIREPLACE      17.145103   1.920669    8.93 < 0.0000000000000002 ***
## REMODELOld      4.670305   3.527513    1.32               0.1857    
## REMODELRecent  27.143402   3.129025    8.67 < 0.0000000000000002 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 43.1 on 1787 degrees of freedom
## Multiple R-squared:  0.796,  Adjusted R-squared:  0.794 
## F-statistic:  536 on 13 and 1787 DF,  p-value: <0.0000000000000002

# use predict() to make predictions on a new set.
pred <- predict(west.lm, holdout.df)

options(scipen=999, digits=1)
data.frame(
  'Predicted' = pred[1:20],
  'Actual' = holdout.df$TOTAL.VALUE[1:20],
  'Residual' = holdout.df$TOTAL.VALUE[1:20] - pred[1:20]
)

##    Predicted Actual Residual
## 1        373    344      -29
## 6        275    337       63
## 7        393    359      -34
## 10       492    409      -82
## 13       305    316       11
## 14       555    575       20
## 15       333    326       -7
## 18       350    345       -5
## 20       376    348      -28
## 21       278    318       39
## 22       326    331        5
## 26       336    346       10
## 30       327    321       -6
## 32       314    294      -20
## 38       340    296      -44
## 40       343    318      -24
## 42       364    350      -14
## 47       394    372      -23
## 50       379    432       53
## 56       392    346      -46

Model Evaluation

options(scipen=999, digits = 3)

# calculate performance metrics
rbind(
  Training=mlba::regressionSummary(predict(west.lm, train.df), train.df$TOTAL.VALUE),
  Holdout=mlba::regressionSummary(pred, holdout.df$TOTAL.VALUE)
)

##          ME                RMSE MAE 
## Training -0.00000000000119 42.9 33.1
## Holdout  -0.202            42.3 32.9

pred <- predict(west.lm, holdout.df)
all.residuals <- holdout.df$TOTAL.VALUE - pred

ggplot() +
  geom_histogram(aes(x=all.residuals), fill="pink", color="blue") +
  labs(x="Residuals", y="Frequency")

## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

g <- ggplot() +
  geom_histogram(aes(x=all.residuals), fill="pink", color="blue") +
  labs(x="Residuals", y="Frequency") +
  theme_bw()
# ggsave(file=file.path("E:\\Documentation\\课程\\R\\test1", "residuals-histogram.pdf"), g, width=5, height=3, units="in")

Cross-validation and caret

set.seed(1)
# define 5-fold
trControl <- caret::trainControl(method="cv", number=5, allowParallel=TRUE)
model <- caret::train(TOTAL.VALUE ~ ., data=west.df,
                      method="lm",  # specify the model
                      trControl=trControl)
model

## Linear Regression 
## 
## 3000 samples
##   12 predictor
## 
## No pre-processing
## Resampling: Cross-Validated (5 fold) 
## Summary of sample sizes: 2400, 2402, 2399, 2399, 2400 
## Resampling results:
## 
##   RMSE  Rsquared  MAE 
##   43.5  0.782     33.1
## 
## Tuning parameter 'intercept' was held constant at a value of TRUE

coef(model$finalModel)

##   (Intercept)      LOT.SQFT      YR.BUILT    GROSS.AREA   LIVING.AREA 
##      -84.2348        0.0082        0.0579        0.0323        0.0436 
##        FLOORS         ROOMS      BEDROOMS     FULL.BATH     HALF.BATH 
##       42.8958        3.3285       -0.7456       16.1256       19.7283 
##       KITCHEN     FIREPLACE    REMODELOld REMODELRecent 
##       -8.6909       17.8466        4.1979       25.3402

collectMetrics <- function(model, train.df, holdout.df, nPredictors) {
  if (missing(nPredictors)) {
    coefs = coef(model$finalModel)
    nPredictors = length(coefs) - 1
  }
  return (cbind(
    CV=model$results %>% slice_min(RMSE) %>% dplyr::select(c(RMSE, MAE)),
    Training=mlba::regressionSummary(predict(model, train.df), train.df$TOTAL.VALUE),
    Holdout=mlba::regressionSummary(predict(model, holdout.df), holdout.df$TOTAL.VALUE),
    nPredictors=nPredictors
  ))
}

metric.full <- collectMetrics(model, train.df, holdout.df)

predict(model, west.df[1:3,])

##   1   2   3 
## 375 454 357

Variable Selection in Linear Regression

Reduce the Number of Predictors

Exhaustive Search

# use regsubsets() in package leaps to run an exhaustive search.
# unlike with lm, categorical predictors must be turned into dummies manually.
# create dummies for fuel type
# create dummies for fuel type
leaps.train.df <- dummy_cols(train.df, remove_first_dummy=TRUE,
                             remove_selected_columns=TRUE)
search <- regsubsets(TOTAL.VALUE ~ ., data=leaps.train.df, nbest=1,
                     nvmax=ncol(leaps.train.df), method="exhaustive")
sum <- summary(search)
sum

## Subset selection object
## Call: regsubsets.formula(TOTAL.VALUE ~ ., data = leaps.train.df, nbest = 1, 
##     nvmax = ncol(leaps.train.df), method = "exhaustive")
## 13 Variables  (and intercept)
##                Forced in Forced out
## LOT.SQFT           FALSE      FALSE
## YR.BUILT           FALSE      FALSE
## GROSS.AREA         FALSE      FALSE
## LIVING.AREA        FALSE      FALSE
## FLOORS             FALSE      FALSE
## ROOMS              FALSE      FALSE
## BEDROOMS           FALSE      FALSE
## FULL.BATH          FALSE      FALSE
## HALF.BATH          FALSE      FALSE
## KITCHEN            FALSE      FALSE
## FIREPLACE          FALSE      FALSE
## REMODEL_Old        FALSE      FALSE
## REMODEL_Recent     FALSE      FALSE
## 1 subsets of each size up to 13
## Selection Algorithm: exhaustive
##           LOT.SQFT YR.BUILT GROSS.AREA LIVING.AREA FLOORS ROOMS BEDROOMS
## 1  ( 1 )  " "      " "      " "        "*"         " "    " "   " "     
## 2  ( 1 )  "*"      " "      " "        "*"         " "    " "   " "     
## 3  ( 1 )  "*"      " "      " "        "*"         "*"    " "   " "     
## 4  ( 1 )  "*"      " "      "*"        "*"         "*"    " "   " "     
## 5  ( 1 )  "*"      " "      "*"        "*"         "*"    " "   " "     
## 6  ( 1 )  "*"      " "      "*"        "*"         "*"    " "   " "     
## 7  ( 1 )  "*"      " "      "*"        "*"         "*"    " "   " "     
## 8  ( 1 )  "*"      " "      "*"        "*"         "*"    " "   " "     
## 9  ( 1 )  "*"      " "      "*"        "*"         "*"    "*"   " "     
## 10  ( 1 ) "*"      "*"      "*"        "*"         "*"    "*"   " "     
## 11  ( 1 ) "*"      "*"      "*"        "*"         "*"    "*"   " "     
## 12  ( 1 ) "*"      "*"      "*"        "*"         "*"    "*"   "*"     
## 13  ( 1 ) "*"      "*"      "*"        "*"         "*"    "*"   "*"     
##           FULL.BATH HALF.BATH KITCHEN FIREPLACE REMODEL_Old REMODEL_Recent
## 1  ( 1 )  " "       " "       " "     " "       " "         " "           
## 2  ( 1 )  " "       " "       " "     " "       " "         " "           
## 3  ( 1 )  " "       " "       " "     " "       " "         " "           
## 4  ( 1 )  " "       " "       " "     " "       " "         " "           
## 5  ( 1 )  " "       " "       " "     "*"       " "         " "           
## 6  ( 1 )  " "       " "       " "     "*"       " "         "*"           
## 7  ( 1 )  " "       "*"       " "     "*"       " "         "*"           
## 8  ( 1 )  "*"       "*"       " "     "*"       " "         "*"           
## 9  ( 1 )  "*"       "*"       " "     "*"       " "         "*"           
## 10  ( 1 ) "*"       "*"       " "     "*"       " "         "*"           
## 11  ( 1 ) "*"       "*"       "*"     "*"       " "         "*"           
## 12  ( 1 ) "*"       "*"       "*"     "*"       " "         "*"           
## 13  ( 1 ) "*"       "*"       "*"     "*"       "*"         "*"

# show models
sum$which

##    (Intercept) LOT.SQFT YR.BUILT GROSS.AREA LIVING.AREA FLOORS ROOMS BEDROOMS
## 1         TRUE    FALSE    FALSE      FALSE        TRUE  FALSE FALSE    FALSE
## 2         TRUE     TRUE    FALSE      FALSE        TRUE  FALSE FALSE    FALSE
## 3         TRUE     TRUE    FALSE      FALSE        TRUE   TRUE FALSE    FALSE
## 4         TRUE     TRUE    FALSE       TRUE        TRUE   TRUE FALSE    FALSE
## 5         TRUE     TRUE    FALSE       TRUE        TRUE   TRUE FALSE    FALSE
## 6         TRUE     TRUE    FALSE       TRUE        TRUE   TRUE FALSE    FALSE
## 7         TRUE     TRUE    FALSE       TRUE        TRUE   TRUE FALSE    FALSE
## 8         TRUE     TRUE    FALSE       TRUE        TRUE   TRUE FALSE    FALSE
## 9         TRUE     TRUE    FALSE       TRUE        TRUE   TRUE  TRUE    FALSE
## 10        TRUE     TRUE     TRUE       TRUE        TRUE   TRUE  TRUE    FALSE
## 11        TRUE     TRUE     TRUE       TRUE        TRUE   TRUE  TRUE    FALSE
## 12        TRUE     TRUE     TRUE       TRUE        TRUE   TRUE  TRUE     TRUE
## 13        TRUE     TRUE     TRUE       TRUE        TRUE   TRUE  TRUE     TRUE
##    FULL.BATH HALF.BATH KITCHEN FIREPLACE REMODEL_Old REMODEL_Recent
## 1      FALSE     FALSE   FALSE     FALSE       FALSE          FALSE
## 2      FALSE     FALSE   FALSE     FALSE       FALSE          FALSE
## 3      FALSE     FALSE   FALSE     FALSE       FALSE          FALSE
## 4      FALSE     FALSE   FALSE     FALSE       FALSE          FALSE
## 5      FALSE     FALSE   FALSE      TRUE       FALSE          FALSE
## 6      FALSE     FALSE   FALSE      TRUE       FALSE           TRUE
## 7      FALSE      TRUE   FALSE      TRUE       FALSE           TRUE
## 8       TRUE      TRUE   FALSE      TRUE       FALSE           TRUE
## 9       TRUE      TRUE   FALSE      TRUE       FALSE           TRUE
## 10      TRUE      TRUE   FALSE      TRUE       FALSE           TRUE
## 11      TRUE      TRUE    TRUE      TRUE       FALSE           TRUE
## 12      TRUE      TRUE    TRUE      TRUE       FALSE           TRUE
## 13      TRUE      TRUE    TRUE      TRUE        TRUE           TRUE

# show metrics
sum$rsq

##  [1] 0.661 0.707 0.740 0.761 0.773 0.782 0.789 0.794 0.795 0.795 0.796 0.796
## [13] 0.796

sum$adjr2

##  [1] 0.661 0.706 0.740 0.760 0.772 0.781 0.788 0.793 0.794 0.794 0.794 0.794
## [13] 0.794

# The smaller the value of cp, the better the fit of the model. cp portrays the balance between the predictive power of the model and the complexity of the model. It takes into account the sum of squares of the fitted residuals and the number of variables used in the model, preferring models with smaller residuals but not overly complex.
sum$cp

##  [1] 1167.7  775.1  480.6  304.5  203.3  120.2   64.5   19.2   15.4   14.0
## [11]   13.9   13.8   14.0

# In this list, -0.696 is the smallest Cp value, so the 3rd model corresponds to the lowest Cp value and can be considered as the best-fitting model
optimal <- which.min(sum$cp)
optimal

## [1] 12

# determine the variable names for the optimal model
X <- summary(search)$which[, -1]  # information about included predictors
xvars <- dimnames(X)[[2]] ## column names (all covariates except intercept)
xvars <- xvars[X[optimal,]]

# By sum$adjr2, it can be seen that the parameters contained in line 9 can make the model optimal.
xvars <- c("LOT.SQFT", "GROSS.AREA", "LIVING.AREA", "FLOORS", "ROOMS", "FULL.BATH", "HALF.BATH", "FIREPLACE", "REMODEL")

# rebuild model for best predictor set
set.seed(1)
trControl <- caret::trainControl(method="cv", number=5, allowParallel=TRUE)
model <- caret::train(TOTAL.VALUE ~ ., data=west.df[, c("TOTAL.VALUE", xvars)],
                      method="lm",  # specify the model
                      trControl=trControl)
model

## Linear Regression 
## 
## 3000 samples
##    9 predictor
## 
## No pre-processing
## Resampling: Cross-Validated (5 fold) 
## Summary of sample sizes: 2400, 2402, 2399, 2399, 2400 
## Resampling results:
## 
##   RMSE  Rsquared  MAE 
##   42.9  0.788     33.1
## 
## Tuning parameter 'intercept' was held constant at a value of TRUE

coef(model$finalModel)

##   (Intercept)      LOT.SQFT    GROSS.AREA   LIVING.AREA        FLOORS 
##      21.19583       0.00814       0.03206       0.04344      42.49249 
##         ROOMS     FULL.BATH     HALF.BATH     FIREPLACE    REMODELOld 
##       2.72186      16.89665      20.53439      18.45088       3.60474 
## REMODELRecent 
##      24.49414

metric.exhaustive <- collectMetrics(model, train.df, holdout.df)

Popular Subset Selection Algorithms

# as model performance is estimated using AIC, we don't need to use cross-validation
trControl <- caret::trainControl(method="none")
model <- caret::train(TOTAL.VALUE ~ ., data=train.df, trControl=trControl,
                      # select backward elmination
                      method="glmStepAIC", direction='backward')

## Start:  AIC=18681
## .outcome ~ LOT.SQFT + YR.BUILT + GROSS.AREA + LIVING.AREA + FLOORS + 
##     ROOMS + BEDROOMS + FULL.BATH + HALF.BATH + KITCHEN + FIREPLACE + 
##     REMODELOld + REMODELRecent
## 
##                 Df Deviance   AIC
## - REMODELOld     1  3318722 18681
## <none>              3315470 18681
## - BEDROOMS       1  3319171 18681
## - KITCHEN        1  3319528 18681
## - YR.BUILT       1  3321887 18682
## - ROOMS          1  3331763 18688
## - FULL.BATH      1  3389265 18719
## - REMODELRecent  1  3455085 18753
## - HALF.BATH      1  3461017 18756
## - FIREPLACE      1  3463312 18758
## - LIVING.AREA    1  3503285 18778
## - GROSS.AREA     1  3538281 18796
## - FLOORS         1  3703971 18879
## - LOT.SQFT       1  4052241 19040
## 
## Step:  AIC=18681
## .outcome ~ LOT.SQFT + YR.BUILT + GROSS.AREA + LIVING.AREA + FLOORS + 
##     ROOMS + BEDROOMS + FULL.BATH + HALF.BATH + KITCHEN + FIREPLACE + 
##     REMODELRecent
## 
##                 Df Deviance   AIC
## <none>              3318722 18681
## - BEDROOMS       1  3322623 18681
## - KITCHEN        1  3322700 18681
## - YR.BUILT       1  3324777 18682
## - ROOMS          1  3335619 18688
## - FULL.BATH      1  3392077 18718
## - REMODELRecent  1  3455102 18751
## - HALF.BATH      1  3463426 18756
## - FIREPLACE      1  3465011 18756
## - LIVING.AREA    1  3504282 18777
## - GROSS.AREA     1  3547442 18799
## - FLOORS         1  3714283 18882
## - LOT.SQFT       1  4056808 19040

coef(model$finalModel)

##   (Intercept)      LOT.SQFT      YR.BUILT    GROSS.AREA   LIVING.AREA 
##     -33.52125       0.00781       0.03603       0.03068       0.05016 
##        FLOORS         ROOMS      BEDROOMS     FULL.BATH     HALF.BATH 
##      41.68629       3.71444      -2.69520      15.13529      19.54688 
##       KITCHEN     FIREPLACE REMODELRecent 
##     -12.76574      17.04039      26.55641

model <- caret::train(TOTAL.VALUE ~ ., data=train.df, trControl=trControl,
                      method="glmStepAIC", direction='forward')

## Start:  AIC=21517
## .outcome ~ 1
## 
##                 Df Deviance   AIC
## + LIVING.AREA    1  5500449 19569
## + GROSS.AREA     1  6356404 19829
## + ROOMS          1  9619216 20575
## + BEDROOMS       1 11602082 20913
## + FLOORS         1 12045563 20980
## + LOT.SQFT       1 12285640 21016
## + HALF.BATH      1 13796424 21225
## + FULL.BATH      1 13911304 21240
## + FIREPLACE      1 14154297 21271
## + REMODELRecent  1 15388524 21422
## + YR.BUILT       1 16188466 21513
## + REMODELOld     1 16223082 21517
## <none>             16248252 21517
## + KITCHEN        1 16243845 21519
## 
## Step:  AIC=19569
## .outcome ~ LIVING.AREA
## 
##                 Df Deviance   AIC
## + LOT.SQFT       1  4768338 19313
## + FIREPLACE      1  5131870 19446
## + FLOORS         1  5139417 19448
## + GROSS.AREA     1  5171419 19460
## + HALF.BATH      1  5283037 19498
## + KITCHEN        1  5344442 19519
## + REMODELRecent  1  5352405 19522
## + ROOMS          1  5387655 19533
## + FULL.BATH      1  5446162 19553
## + BEDROOMS       1  5483440 19565
## <none>              5500449 19569
## + REMODELOld     1  5494607 19569
## + YR.BUILT       1  5497436 19570
## 
## Step:  AIC=19313
## .outcome ~ LIVING.AREA + LOT.SQFT
## 
##                 Df Deviance   AIC
## + FLOORS         1  4218363 19095
## + FIREPLACE      1  4474573 19201
## + GROSS.AREA     1  4548797 19231
## + HALF.BATH      1  4550101 19231
## + REMODELRecent  1  4602135 19252
## + KITCHEN        1  4624278 19260
## + ROOMS          1  4654462 19272
## + FULL.BATH      1  4711614 19294
## + BEDROOMS       1  4739825 19305
## <none>              4768338 19313
## + REMODELOld     1  4764306 19314
## + YR.BUILT       1  4768152 19315
## 
## Step:  AIC=19095
## .outcome ~ LIVING.AREA + LOT.SQFT + FLOORS
## 
##                 Df Deviance   AIC
## + GROSS.AREA     1  3887877 18950
## + FIREPLACE      1  3958271 18982
## + REMODELRecent  1  4072117 19033
## + HALF.BATH      1  4111917 19051
## + FULL.BATH      1  4141574 19064
## + KITCHEN        1  4173991 19078
## + ROOMS          1  4175529 19078
## + YR.BUILT       1  4211512 19094
## <none>              4218363 19095
## + BEDROOMS       1  4217419 19096
## + REMODELOld     1  4218002 19097
## 
## Step:  AIC=18950
## .outcome ~ LIVING.AREA + LOT.SQFT + FLOORS + GROSS.AREA
## 
##                 Df Deviance   AIC
## + FIREPLACE      1  3696340 18861
## + REMODELRecent  1  3752161 18888
## + HALF.BATH      1  3761930 18892
## + FULL.BATH      1  3826082 18923
## + ROOMS          1  3857616 18938
## + YR.BUILT       1  3863302 18940
## + KITCHEN        1  3881828 18949
## <none>              3887877 18950
## + BEDROOMS       1  3886374 18951
## + REMODELOld     1  3887098 18951
## 
## Step:  AIC=18861
## .outcome ~ LIVING.AREA + LOT.SQFT + FLOORS + GROSS.AREA + FIREPLACE
## 
##                 Df Deviance   AIC
## + REMODELRecent  1  3538487 18784
## + HALF.BATH      1  3604075 18817
## + FULL.BATH      1  3639321 18835
## + ROOMS          1  3669397 18850
## + YR.BUILT       1  3686000 18858
## <none>              3696340 18861
## + KITCHEN        1  3693246 18861
## + BEDROOMS       1  3695361 18862
## + REMODELOld     1  3696238 18863
## 
## Step:  AIC=18784
## .outcome ~ LIVING.AREA + LOT.SQFT + FLOORS + GROSS.AREA + FIREPLACE + 
##     REMODELRecent
## 
##              Df Deviance   AIC
## + HALF.BATH   1  3431407 18731
## + FULL.BATH   1  3506718 18770
## + ROOMS       1  3515086 18774
## + YR.BUILT    1  3518357 18776
## <none>           3538487 18784
## + REMODELOld  1  3536335 18785
## + KITCHEN     1  3536471 18785
## + BEDROOMS    1  3537861 18786
## 
## Step:  AIC=18731
## .outcome ~ LIVING.AREA + LOT.SQFT + FLOORS + GROSS.AREA + FIREPLACE + 
##     REMODELRecent + HALF.BATH
## 
##              Df Deviance   AIC
## + FULL.BATH   1  3343679 18686
## + ROOMS       1  3412106 18723
## + YR.BUILT    1  3417217 18725
## <none>           3431407 18731
## + REMODELOld  1  3428624 18731
## + KITCHEN     1  3430623 18732
## + BEDROOMS    1  3431061 18733
## 
## Step:  AIC=18686
## .outcome ~ LIVING.AREA + LOT.SQFT + FLOORS + GROSS.AREA + FIREPLACE + 
##     REMODELRecent + HALF.BATH + FULL.BATH
## 
##              Df Deviance   AIC
## + ROOMS       1  3332827 18682
## + YR.BUILT    1  3338394 18685
## <none>           3343679 18686
## + REMODELOld  1  3340289 18686
## + KITCHEN     1  3340898 18687
## + BEDROOMS    1  3343605 18688
## 
## Step:  AIC=18682
## .outcome ~ LIVING.AREA + LOT.SQFT + FLOORS + GROSS.AREA + FIREPLACE + 
##     REMODELRecent + HALF.BATH + FULL.BATH + ROOMS
## 
##              Df Deviance   AIC
## + YR.BUILT    1  3326634 18681
## + BEDROOMS    1  3328719 18682
## + KITCHEN     1  3328850 18682
## <none>           3332827 18682
## + REMODELOld  1  3329826 18683
## 
## Step:  AIC=18681
## .outcome ~ LIVING.AREA + LOT.SQFT + FLOORS + GROSS.AREA + FIREPLACE + 
##     REMODELRecent + HALF.BATH + FULL.BATH + ROOMS + YR.BUILT
## 
##              Df Deviance   AIC
## + KITCHEN     1  3322623 18681
## + BEDROOMS    1  3322700 18681
## <none>           3326634 18681
## + REMODELOld  1  3323265 18681
## 
## Step:  AIC=18681
## .outcome ~ LIVING.AREA + LOT.SQFT + FLOORS + GROSS.AREA + FIREPLACE + 
##     REMODELRecent + HALF.BATH + FULL.BATH + ROOMS + YR.BUILT + 
##     KITCHEN
## 
##              Df Deviance   AIC
## + BEDROOMS    1  3318722 18681
## <none>           3322623 18681
## + REMODELOld  1  3319171 18681
## 
## Step:  AIC=18681
## .outcome ~ LIVING.AREA + LOT.SQFT + FLOORS + GROSS.AREA + FIREPLACE + 
##     REMODELRecent + HALF.BATH + FULL.BATH + ROOMS + YR.BUILT + 
##     KITCHEN + BEDROOMS
## 
##              Df Deviance   AIC
## <none>           3318722 18681
## + REMODELOld  1  3315470 18681

coef(model$finalModel)

##   (Intercept)   LIVING.AREA      LOT.SQFT        FLOORS    GROSS.AREA 
##     -33.52125       0.05016       0.00781      41.68629       0.03068 
##     FIREPLACE REMODELRecent     HALF.BATH     FULL.BATH         ROOMS 
##      17.04039      26.55641      19.54688      15.13529       3.71444 
##      YR.BUILT       KITCHEN      BEDROOMS 
##       0.03603     -12.76574      -2.69520

odel <- caret::train(TOTAL.VALUE ~ ., data=train.df, trControl=trControl,
                     method="glmStepAIC", direction='both')

## Start:  AIC=18681
## .outcome ~ LOT.SQFT + YR.BUILT + GROSS.AREA + LIVING.AREA + FLOORS + 
##     ROOMS + BEDROOMS + FULL.BATH + HALF.BATH + KITCHEN + FIREPLACE + 
##     REMODELOld + REMODELRecent
## 
##                 Df Deviance   AIC
## - REMODELOld     1  3318722 18681
## <none>              3315470 18681
## - BEDROOMS       1  3319171 18681
## - KITCHEN        1  3319528 18681
## - YR.BUILT       1  3321887 18682
## - ROOMS          1  3331763 18688
## - FULL.BATH      1  3389265 18719
## - REMODELRecent  1  3455085 18753
## - HALF.BATH      1  3461017 18756
## - FIREPLACE      1  3463312 18758
## - LIVING.AREA    1  3503285 18778
## - GROSS.AREA     1  3538281 18796
## - FLOORS         1  3703971 18879
## - LOT.SQFT       1  4052241 19040
## 
## Step:  AIC=18681
## .outcome ~ LOT.SQFT + YR.BUILT + GROSS.AREA + LIVING.AREA + FLOORS + 
##     ROOMS + BEDROOMS + FULL.BATH + HALF.BATH + KITCHEN + FIREPLACE + 
##     REMODELRecent
## 
##                 Df Deviance   AIC
## <none>              3318722 18681
## - BEDROOMS       1  3322623 18681
## - KITCHEN        1  3322700 18681
## + REMODELOld     1  3315470 18681
## - YR.BUILT       1  3324777 18682
## - ROOMS          1  3335619 18688
## - FULL.BATH      1  3392077 18718
## - REMODELRecent  1  3455102 18751
## - HALF.BATH      1  3463426 18756
## - FIREPLACE      1  3465011 18756
## - LIVING.AREA    1  3504282 18777
## - GROSS.AREA     1  3547442 18799
## - FLOORS         1  3714283 18882
## - LOT.SQFT       1  4056808 19040

coef(model$finalModel)

##   (Intercept)   LIVING.AREA      LOT.SQFT        FLOORS    GROSS.AREA 
##     -33.52125       0.05016       0.00781      41.68629       0.03068 
##     FIREPLACE REMODELRecent     HALF.BATH     FULL.BATH         ROOMS 
##      17.04039      26.55641      19.54688      15.13529       3.71444 
##      YR.BUILT       KITCHEN      BEDROOMS 
##       0.03603     -12.76574      -2.69520

rbind(Training=mlba::regressionSummary(predict(model, train.df), train.df$TOTAL.VALUE),
      Holdout=mlba::regressionSummary(predict(model, holdout.df), holdout.df$TOTAL.VALUE))

##          ME                RMSE MAE 
## Training -0.00000000000119 42.9 33.1
## Holdout  -0.0958           42.3 32.9

# The models are identical to the best model obtained from the exhaustive search.
# We therefore duplicate the metrics.
metric.stepwise <- metric.exhaustive
metric.stepwise

##   CV.RMSE CV.MAE Training.ME Training.RMSE Training.MAE Holdout.ME Holdout.RMSE
## 1    42.9   33.1       0.193            43         33.1      -0.29         42.1
##   Holdout.MAE nPredictors
## 1        32.7          10

Regularization (Shrinkage Models)

set.seed(1)
library(caret)
trControl <- caret::trainControl(method='cv', number=5, allowParallel=TRUE)
tuneGrid <- expand.grid(lambda=10^seq(5, 2, by=-0.1), alpha=0)
model <- caret::train(TOTAL.VALUE ~ ., data=west.df,
                      method='glmnet',
                      family='gaussian',  # set the family for linear regression
                      trControl=trControl,
                      tuneGrid=tuneGrid)

## Warning in nominalTrainWorkflow(x = x, y = y, wts = weights, info = trainInfo,
## : There were missing values in resampled performance measures.

model$bestTune

##   alpha lambda
## 1     0    100

coef(model$finalModel, s=model$bestTune$lambda)

## 14 x 1 sparse Matrix of class "dgCMatrix"
##                     s1
## (Intercept)   90.24293
## LOT.SQFT       0.00481
## YR.BUILT       0.00288
## GROSS.AREA     0.02006
## LIVING.AREA    0.03159
## FLOORS        24.38370
## ROOMS          6.55513
## BEDROOMS       7.26270
## FULL.BATH     13.13814
## HALF.BATH     14.27822
## KITCHEN       -7.55701
## FIREPLACE     13.32439
## REMODELOld     2.37126
## REMODELRecent 15.48961

metric.ridge <- collectMetrics(model, train.df, holdout.df,
                               length(coef(model$finalModel, s=model$bestTune$lambda)) - 1)
ridge.model <- model

rbind(
  Training=mlba::regressionSummary(predict(model, train.df), train.df$TOTAL.VALUE),
  Holdout=mlba::regressionSummary(predict(model, holdout.df), holdout.df$TOTAL.VALUE)
)

##          ME     RMSE MAE 
## Training 0.427  48.7 37.1
## Holdout  -0.641 47   36.7

set.seed(1)
tuneGrid <- expand.grid(lambda=10^seq(4, 0, by=-0.1), alpha=1)
model <- caret::train(TOTAL.VALUE ~ ., data=train.df,
                      method='glmnet',
                      family='gaussian',  # set the family for linear regression
                      trControl=trControl,
                      tuneGrid=tuneGrid)

## Warning in nominalTrainWorkflow(x = x, y = y, wts = weights, info = trainInfo,
## : There were missing values in resampled performance measures.

model$bestTune

##   alpha lambda
## 5     1   2.51

coef(model$finalModel, s=model$bestTune$lambda)

## 14 x 1 sparse Matrix of class "dgCMatrix"
##                     s1
## (Intercept)   42.36603
## LOT.SQFT       0.00711
## YR.BUILT       .      
## GROSS.AREA     0.02994
## LIVING.AREA    0.05266
## FLOORS        38.09561
## ROOMS          2.31567
## BEDROOMS       .      
## FULL.BATH     11.18611
## HALF.BATH     15.99640
## KITCHEN        .      
## FIREPLACE     14.82108
## REMODELOld     .      
## REMODELRecent 20.13996

lasso.model <- model
metric.lasso <- collectMetrics(lasso.model, train.df, holdout.df,
                               sum(coef(lasso.model$finalModel, s=lasso.model$bestTune$lambda) != 0) - 1)
rbind(
  Training=mlba::regressionSummary(predict(model, train.df), train.df$TOTAL.VALUE),
  Holdout=mlba::regressionSummary(predict(model, holdout.df), holdout.df$TOTAL.VALUE)
)

##          ME                 RMSE MAE 
## Training -0.000000000000135 43.3 33.3
## Holdout  -0.368             42.7 33.1

g1 <- ggplot(ridge.model$results, aes(x=lambda, y=RMSE)) +
  geom_pointrange(aes(ymin=RMSE-RMSESD, ymax=RMSE+RMSESD), color='green') +
  geom_line() +
  geom_point(data=ridge.model$results %>% subset(RMSE == min(RMSE)), color='red') +
  labs(x=expression(paste('Ridge parameter ', lambda)),
       y='RMSE (cross-validation)') +
  scale_x_log10()
g2 <- ggplot(lasso.model$results, aes(x=lambda, y=RMSE)) +
  geom_pointrange(aes(ymin=RMSE-RMSESD, ymax=RMSE+RMSESD), color='green') +
  geom_line() +
  geom_point(data=lasso.model$results %>% subset(RMSE == min(RMSE)), color='red') +
  labs(x=expression(paste('Lasso parameter ', lambda)),
       y='RMSE (cross-validation)') +
  scale_x_log10()
grid.arrange(g1, g2, ncol=2)

g <- arrangeGrob(g1 + theme_bw(), g2 + theme_bw(), ncol=2)
# ggsave(file=file.path("E:\\Documentation\\课程\\R\\test1", "shrinkage-parameter-tuning.pdf"), g, width=6, height=2.5, units='in')

data.frame(rbind(
  'full'= metric.full,
  'exhaustive' = metric.exhaustive,
  'stepwise' = metric.stepwise,
  'ridge' = metric.ridge,
  'lasso' = metric.lasso
))

##            CV.RMSE CV.MAE        Training.ME Training.RMSE Training.MAE
## full          43.5   33.1  0.134712559086481          43.0         33.0
## exhaustive    42.9   33.1  0.192950992986573          43.0         33.1
## stepwise      42.9   33.1  0.192950992986573          43.0         33.1
## ridge         48.2   37.1  0.426909206328931          48.7         37.1
## lasso         43.9   33.5 -0.000000000000135          43.3         33.3
##            Holdout.ME Holdout.RMSE Holdout.MAE nPredictors
## full           -0.202         42.0        32.6          13
## exhaustive     -0.290         42.1        32.7          10
## stepwise       -0.290         42.1        32.7          10
## ridge          -0.641         47.0        36.7          13
## lasso          -0.368         42.7        33.1           9

Haiyuan Gui’s Document

ghy

2023-07-07

Read Data

Train Data and Test Data

Model Training and Testing

Model Evaluation

Cross-validation and caret

Variable Selection in Linear Regression

Reduce the Number of Predictors

Exhaustive Search

Popular Subset Selection Algorithms

Regularization (Shrinkage Models)