if (!require(mlba)) {
library(devtools)
install_github("gedeck/mlba/mlba", force=TRUE)
}
## 载入需要的程辑包:mlba
## 载入需要的程辑包:caret
## 载入需要的程辑包:ggplot2
## 载入需要的程辑包:lattice
## 载入需要的程辑包:forecast
## Registered S3 method overwritten by 'quantmod':
## method from
## as.zoo.data.frame zoo
options(scipen=999, digits = 3)
library(caret)
car.da <- mlba::WestRoxbury
# select variables for regression
outcome <- "TOTAL.VALUE"
predictors <- c("LOT.SQFT", "YR.BUILT", "GROSS.AREA", "LIVING.AREA", "FLOORS",
"ROOMS", "BEDROOMS", "FULL.BATH", "HALF.BATH", "KITCHEN", "FIREPLACE", "REMODEL")
# reduce data set to first 1000 rows and selected variables
car.da <- car.da[1:1000, c(outcome, predictors)]
# partition data
set.seed(1) # set seed for reproducing the partition
idx <- createDataPartition(car.da$TOTAL.VALUE, p=0.6, list=FALSE)
train.da <- car.da[idx, ]
holdout.da <- car.da[-idx, ]
# use lm() to run a linear regression of TOTAL.VALUE on all 11 predictors in the
# training set.
# use . after ~ to include all the remaining columns in train.da as predictors.
car.lm <- lm(TOTAL.VALUE ~ ., data = train.da)
# use options() to ensure numbers are not displayed in scientific notation.
options(scipen = 999)
summary(car.lm)
##
## Call:
## lm(formula = TOTAL.VALUE ~ ., data = train.da)
##
## Residuals:
## Min 1Q Median 3Q Max
## -95.73 -18.29 -1.57 18.68 90.99
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -1271.434742 120.400264 -10.56 < 0.0000000000000002 ***
## LOT.SQFT 0.006246 0.000533 11.73 < 0.0000000000000002 ***
## YR.BUILT 0.684596 0.061597 11.11 < 0.0000000000000002 ***
## GROSS.AREA 0.028697 0.003062 9.37 < 0.0000000000000002 ***
## LIVING.AREA 0.039129 0.005285 7.40 0.000000000000462 ***
## FLOORS 24.739882 3.143669 7.87 0.000000000000017 ***
## ROOMS 0.678755 1.476382 0.46 0.64587
## BEDROOMS 1.123924 2.279470 0.49 0.62215
## FULL.BATH 17.267366 3.243260 5.32 0.000000144683044 ***
## HALF.BATH 10.217786 2.884375 3.54 0.00043 ***
## KITCHEN 8.406670 5.952869 1.41 0.15842
## FIREPLACE 1.234309 2.205038 0.56 0.57585
## REMODELOld 6.730961 4.596683 1.46 0.14365
## REMODELRecent 28.944668 4.069081 7.11 0.000000000003311 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 29 on 587 degrees of freedom
## Multiple R-squared: 0.801, Adjusted R-squared: 0.797
## F-statistic: 182 on 13 and 587 DF, p-value: <0.0000000000000002
# use predict() to make predictions on a new set.
pred <- predict(car.lm, holdout.da)
options(scipen=999, digits=1)
data.frame(
'Predicted' = pred[1:20],
'Actual' = holdout.da$TOTAL.VALUE[1:20],
'Residual' = holdout.da$TOTAL.VALUE[1:20] - pred[1:20]
)
## Predicted Actual Residual
## 1 293 344 51
## 11 324 313 -11
## 13 249 316 66
## 16 259 298 40
## 17 278 313 35
## 19 277 331 53
## 21 242 318 75
## 22 275 331 55
## 26 287 346 59
## 28 261 317 56
## 29 220 247 27
## 31 289 329 40
## 33 324 280 -44
## 36 298 336 38
## 39 263 240 -23
## 41 404 432 27
## 42 304 350 46
## 44 306 358 52
## 45 312 346 34
## 46 474 491 17
options(scipen=999, digits = 3)
# calculate performance metrics
rbind(
Training=mlba::regressionSummary(predict(car.lm, train.da), train.da$TOTAL.VALUE),
Holdout=mlba::regressionSummary(pred, holdout.da$TOTAL.VALUE)
)
## ME RMSE MAE
## Training 0.00000000000352 28.6 22.5
## Holdout -2.05 33.5 25.8
library(ggplot2)
pred <- predict(car.lm, holdout.da)
all.residuals <- holdout.da$TOTAL.VALUE - pred
ggplot() +
geom_histogram(aes(x=all.residuals), fill="lightgray", color="grey") +
labs(x="Residuals", y="Frequency")
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

g <- ggplot() +
geom_histogram(aes(x=all.residuals), fill="lightgray", color="grey") +
labs(x="Residuals", y="Frequency") +
theme_bw()
ggsave(file=file.path("E:\\R\\Document", "residuals-histogram.jpg"),
g, width=5, height=3, units="in")
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
set.seed(1)
library(caret)
# define 5-fold
trControl <- caret::trainControl(method="cv", number=5, allowParallel=TRUE)
model <- caret::train(TOTAL.VALUE ~ ., data=car.da,
method="lm", # specify the model
trControl=trControl)
model
## Linear Regression
##
## 1000 samples
## 12 predictor
##
## No pre-processing
## Resampling: Cross-Validated (5 fold)
## Summary of sample sizes: 801, 799, 800, 800, 800
## Resampling results:
##
## RMSE Rsquared MAE
## 31.1 0.765 24.1
##
## Tuning parameter 'intercept' was held constant at a value of TRUE
coef(model$finalModel)
## (Intercept) LOT.SQFT YR.BUILT GROSS.AREA LIVING.AREA
## -1212.50467 0.00626 0.65851 0.02625 0.03854
## FLOORS ROOMS BEDROOMS FULL.BATH HALF.BATH
## 25.06836 0.73489 1.80020 16.18717 9.56406
## KITCHEN FIREPLACE REMODELOld REMODELRecent
## 5.66075 1.18755 8.40823 28.06191
library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr 1.1.2 ✔ readr 2.1.4
## ✔ forcats 1.0.0 ✔ stringr 1.5.0
## ✔ lubridate 1.9.2 ✔ tibble 3.2.1
## ✔ purrr 1.0.1 ✔ tidyr 1.3.0
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
## ✖ purrr::lift() masks caret::lift()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
collectMetrics <- function(model, train.da, holdout.da, nPredictors) {
if (missing(nPredictors)) {
coefs = coef(model$finalModel)
nPredictors = length(coefs) - 1
}
return (cbind(
CV=model$results %>% slice_min(RMSE) %>% dplyr::select(c(RMSE, MAE)),
Training=mlba::regressionSummary(predict(model, train.da), train.da$TOTAL.VALUE),
Holdout=mlba::regressionSummary(predict(model, holdout.da), holdout.da$TOTAL.VALUE),
nPredictors=nPredictors
))
}
metric.full <- collectMetrics(model, train.da, holdout.da)
predict(model, car.da[1:3,])
## 1 2 3
## 295 408 287
# use regsubsets() in package leaps to run an exhaustive search.
# unlike with lm, categorical predictors must be turned into dummies manually.
library(leaps)
library(fastDummies)
## Thank you for using fastDummies!
## To acknowledge our work, please cite the package:
## Kaplan, J. & Schlegel, B. (2023). fastDummies: Fast Creation of Dummy (Binary) Columns and Rows from Categorical Variables. Version 1.7.1. URL: https://github.com/jacobkap/fastDummies, https://jacobkap.github.io/fastDummies/.
# create dummies for fuel type
leaps.train.da <- dummy_cols(train.da, remove_first_dummy=TRUE,
remove_selected_columns=TRUE)
search <- regsubsets(TOTAL.VALUE ~ ., data=leaps.train.da, nbest=1,
nvmax=ncol(leaps.train.da), method="exhaustive")
sum <- summary(search)
# show models
sum$which
## (Intercept) LOT.SQFT YR.BUILT GROSS.AREA LIVING.AREA FLOORS ROOMS BEDROOMS
## 1 TRUE FALSE FALSE FALSE TRUE FALSE FALSE FALSE
## 2 TRUE TRUE FALSE FALSE TRUE FALSE FALSE FALSE
## 3 TRUE FALSE TRUE TRUE TRUE FALSE FALSE FALSE
## 4 TRUE TRUE TRUE TRUE TRUE FALSE FALSE FALSE
## 5 TRUE TRUE TRUE TRUE TRUE TRUE FALSE FALSE
## 6 TRUE TRUE TRUE TRUE TRUE TRUE FALSE FALSE
## 7 TRUE TRUE TRUE TRUE TRUE TRUE FALSE FALSE
## 8 TRUE TRUE TRUE TRUE TRUE TRUE FALSE FALSE
## 9 TRUE TRUE TRUE TRUE TRUE TRUE FALSE FALSE
## 10 TRUE TRUE TRUE TRUE TRUE TRUE FALSE FALSE
## 11 TRUE TRUE TRUE TRUE TRUE TRUE TRUE FALSE
## 12 TRUE TRUE TRUE TRUE TRUE TRUE FALSE TRUE
## 13 TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE
## FULL.BATH HALF.BATH KITCHEN FIREPLACE REMODEL_Old REMODEL_Recent
## 1 FALSE FALSE FALSE FALSE FALSE FALSE
## 2 FALSE FALSE FALSE FALSE FALSE FALSE
## 3 FALSE FALSE FALSE FALSE FALSE FALSE
## 4 FALSE FALSE FALSE FALSE FALSE FALSE
## 5 FALSE FALSE FALSE FALSE FALSE FALSE
## 6 FALSE FALSE FALSE FALSE FALSE TRUE
## 7 TRUE FALSE FALSE FALSE FALSE TRUE
## 8 TRUE TRUE FALSE FALSE FALSE TRUE
## 9 TRUE TRUE TRUE FALSE FALSE TRUE
## 10 TRUE TRUE TRUE FALSE TRUE TRUE
## 11 TRUE TRUE TRUE FALSE TRUE TRUE
## 12 TRUE TRUE TRUE TRUE TRUE TRUE
## 13 TRUE TRUE TRUE TRUE TRUE TRUE
# show metrics
sum$rsq
## [1] 0.596 0.646 0.700 0.736 0.767 0.787 0.794 0.799 0.800 0.801 0.801 0.801
## [13] 0.801
sum$adjr2
## [1] 0.595 0.645 0.698 0.734 0.765 0.785 0.791 0.797 0.797 0.797 0.797 0.797
## [13] 0.797
sum$cp
## [1] 597.07 449.74 293.98 189.41 99.19 42.56 23.46 9.73 9.37 9.20
## [11] 10.54 12.21 14.00
optimal <- which.min(sum$cp)
# determine the variable names for the optimal model
X <- summary(search)$which[, -1] # information about included predictors
xvars <- dimnames(X)[[2]] ## column names (all covariates except intercept)
xvars <- xvars[X[optimal,]]
# the optimal model contains all dummy variables of Fuel_Type
xvars <- c("LOT.SQFT", "GROSS.AREA", "LIVING.AREA", "FLOORS", "ROOMS", "FULL.BATH", "HALF.BATH", "FIREPLACE", "REMODEL")
# rebuild model for best predictor set
set.seed(1)
trControl <- caret::trainControl(method="cv", number=5, allowParallel=TRUE)
model <- caret::train(TOTAL.VALUE ~ ., data=car.da[, c("TOTAL.VALUE", xvars)],
method="lm", # specify the model
trControl=trControl)
model
## Linear Regression
##
## 1000 samples
## 9 predictor
##
## No pre-processing
## Resampling: Cross-Validated (5 fold)
## Summary of sample sizes: 801, 799, 800, 800, 800
## Resampling results:
##
## RMSE Rsquared MAE
## 33.7 0.723 25.4
##
## Tuning parameter 'intercept' was held constant at a value of TRUE
coef(model$finalModel)
## (Intercept) LOT.SQFT GROSS.AREA LIVING.AREA FLOORS
## 73.85927 0.00593 0.01975 0.05349 16.80213
## ROOMS FULL.BATH HALF.BATH FIREPLACE REMODELOld
## -0.36284 28.67122 21.47026 8.02969 2.52769
## REMODELRecent
## 21.94942
metric.exhaustive <- collectMetrics(model, train.da, holdout.da)
# as model performance is estimated using AIC, we don't need to use cross-validation
trControl <- caret::trainControl(method="none")
model <- caret::train(TOTAL.VALUE ~ ., data=train.da, trControl=trControl,
# select backward elmination
method="glmStepAIC", direction='backward')
## Start: AIC=5768
## .outcome ~ LOT.SQFT + YR.BUILT + GROSS.AREA + LIVING.AREA + FLOORS +
## ROOMS + BEDROOMS + FULL.BATH + HALF.BATH + KITCHEN + FIREPLACE +
## REMODELOld + REMODELRecent
##
## Df Deviance AIC
## - ROOMS 1 492967 5766
## - BEDROOMS 1 492994 5766
## - FIREPLACE 1 493053 5766
## <none> 492790 5768
## - KITCHEN 1 494464 5768
## - REMODELOld 1 494590 5768
## - HALF.BATH 1 503325 5779
## - FULL.BATH 1 516586 5794
## - REMODELRecent 1 535268 5816
## - LIVING.AREA 1 538806 5819
## - FLOORS 1 544783 5826
## - GROSS.AREA 1 566511 5850
## - YR.BUILT 1 596490 5881
## - LOT.SQFT 1 608258 5892
##
## Step: AIC=5766
## .outcome ~ LOT.SQFT + YR.BUILT + GROSS.AREA + LIVING.AREA + FLOORS +
## BEDROOMS + FULL.BATH + HALF.BATH + KITCHEN + FIREPLACE +
## REMODELOld + REMODELRecent
##
## Df Deviance AIC
## - FIREPLACE 1 493257 5764
## - BEDROOMS 1 493504 5765
## <none> 492967 5766
## - REMODELOld 1 494782 5766
## - KITCHEN 1 494793 5766
## - HALF.BATH 1 504128 5777
## - FULL.BATH 1 517506 5793
## - REMODELRecent 1 535768 5814
## - FLOORS 1 544955 5824
## - LIVING.AREA 1 545135 5824
## - GROSS.AREA 1 566566 5848
## - YR.BUILT 1 597340 5879
## - LOT.SQFT 1 609497 5892
##
## Step: AIC=5764
## .outcome ~ LOT.SQFT + YR.BUILT + GROSS.AREA + LIVING.AREA + FLOORS +
## BEDROOMS + FULL.BATH + HALF.BATH + KITCHEN + REMODELOld +
## REMODELRecent
##
## Df Deviance AIC
## - BEDROOMS 1 493796 5763
## <none> 493257 5764
## - KITCHEN 1 495047 5765
## - REMODELOld 1 495098 5765
## - HALF.BATH 1 504283 5776
## - FULL.BATH 1 517681 5791
## - REMODELRecent 1 536070 5812
## - FLOORS 1 545339 5823
## - LIVING.AREA 1 545743 5823
## - GROSS.AREA 1 572134 5852
## - YR.BUILT 1 611275 5891
## - LOT.SQFT 1 613496 5893
##
## Step: AIC=5763
## .outcome ~ LOT.SQFT + YR.BUILT + GROSS.AREA + LIVING.AREA + FLOORS +
## FULL.BATH + HALF.BATH + KITCHEN + REMODELOld + REMODELRecent
##
## Df Deviance AIC
## <none> 493796 5763
## - REMODELOld 1 495617 5763
## - KITCHEN 1 495715 5763
## - HALF.BATH 1 505628 5775
## - FULL.BATH 1 520435 5793
## - REMODELRecent 1 536747 5811
## - FLOORS 1 547545 5823
## - LIVING.AREA 1 553653 5830
## - GROSS.AREA 1 572364 5850
## - YR.BUILT 1 611602 5890
## - LOT.SQFT 1 613570 5892
coef(model$finalModel)
## (Intercept) LOT.SQFT YR.BUILT GROSS.AREA LIVING.AREA
## -1279.70645 0.00629 0.69082 0.02897 0.04039
## FLOORS FULL.BATH HALF.BATH KITCHEN REMODELOld
## 25.01876 17.85302 10.62199 8.92141 6.76882
## REMODELRecent
## 29.07483
model <- caret::train(TOTAL.VALUE ~ ., data=train.da, trControl=trControl,
method="glmStepAIC", direction='forward')
## Start: AIC=6713
## .outcome ~ 1
##
## Df Deviance AIC
## + LIVING.AREA 1 1002430 6171
## + GROSS.AREA 1 1210022 6284
## + ROOMS 1 1742812 6503
## + FULL.BATH 1 1803972 6524
## + BEDROOMS 1 1956883 6573
## + YR.BUILT 1 2003950 6587
## + LOT.SQFT 1 2041754 6598
## + HALF.BATH 1 2320169 6675
## + FLOORS 1 2338215 6680
## + REMODELRecent 1 2400231 6695
## + FIREPLACE 1 2407421 6697
## + KITCHEN 1 2410160 6698
## <none> 2478386 6713
## + REMODELOld 1 2478067 6715
##
## Step: AIC=6171
## .outcome ~ LIVING.AREA
##
## Df Deviance AIC
## + LOT.SQFT 1 877061 6092
## + YR.BUILT 1 883869 6097
## + GROSS.AREA 1 897905 6106
## + FULL.BATH 1 927387 6126
## + FIREPLACE 1 949137 6140
## + REMODELRecent 1 970678 6153
## + FLOORS 1 979511 6159
## + HALF.BATH 1 980085 6159
## + BEDROOMS 1 994459 6168
## + ROOMS 1 995184 6168
## + KITCHEN 1 995690 6169
## <none> 1002430 6171
## + REMODELOld 1 1002057 6172
##
## Step: AIC=6092
## .outcome ~ LIVING.AREA + LOT.SQFT
##
## Df Deviance AIC
## + YR.BUILT 1 769374 6016
## + GROSS.AREA 1 793847 6034
## + FULL.BATH 1 807860 6045
## + FLOORS 1 820328 6054
## + REMODELRecent 1 843127 6071
## + FIREPLACE 1 846139 6073
## + HALF.BATH 1 857659 6081
## + BEDROOMS 1 864533 6086
## + ROOMS 1 871610 6091
## + KITCHEN 1 872310 6091
## <none> 877061 6092
## + REMODELOld 1 877016 6094
##
## Step: AIC=6016
## .outcome ~ LIVING.AREA + LOT.SQFT + YR.BUILT
##
## Df Deviance AIC
## + GROSS.AREA 1 655163 5921
## + FLOORS 1 678253 5942
## + REMODELRecent 1 712846 5972
## + FULL.BATH 1 719045 5977
## + BEDROOMS 1 759199 6010
## + KITCHEN 1 760858 6011
## + ROOMS 1 761111 6011
## + FIREPLACE 1 761892 6012
## + HALF.BATH 1 765194 6014
## <none> 769374 6016
## + REMODELOld 1 767633 6016
##
## Step: AIC=5921
## .outcome ~ LIVING.AREA + LOT.SQFT + YR.BUILT + GROSS.AREA
##
## Df Deviance AIC
## + FLOORS 1 577741 5847
## + REMODELRecent 1 608598 5879
## + FULL.BATH 1 622639 5892
## + BEDROOMS 1 644248 5913
## + ROOMS 1 645373 5914
## + HALF.BATH 1 649490 5918
## <none> 655163 5921
## + REMODELOld 1 654384 5922
## + KITCHEN 1 654939 5923
## + FIREPLACE 1 655006 5923
##
## Step: AIC=5847
## .outcome ~ LIVING.AREA + LOT.SQFT + YR.BUILT + GROSS.AREA + FLOORS
##
## Df Deviance AIC
## + REMODELRecent 1 528519 5796
## + FULL.BATH 1 552707 5823
## + ROOMS 1 571739 5843
## + BEDROOMS 1 573374 5845
## + KITCHEN 1 574827 5846
## <none> 577741 5847
## + REMODELOld 1 577139 5849
## + HALF.BATH 1 577326 5849
## + FIREPLACE 1 577625 5849
##
## Step: AIC=5796
## .outcome ~ LIVING.AREA + LOT.SQFT + YR.BUILT + GROSS.AREA + FLOORS +
## REMODELRecent
##
## Df Deviance AIC
## + FULL.BATH 1 510808 5777
## + KITCHEN 1 523764 5792
## + ROOMS 1 524196 5793
## + BEDROOMS 1 525027 5794
## + REMODELOld 1 525681 5795
## <none> 528519 5796
## + HALF.BATH 1 527771 5797
## + FIREPLACE 1 528400 5798
##
## Step: AIC=5777
## .outcome ~ LIVING.AREA + LOT.SQFT + YR.BUILT + GROSS.AREA + FLOORS +
## REMODELRecent + FULL.BATH
##
## Df Deviance AIC
## + HALF.BATH 1 497600 5764
## + KITCHEN 1 507995 5776
## + ROOMS 1 508227 5776
## + REMODELOld 1 508334 5776
## <none> 510808 5777
## + BEDROOMS 1 509195 5778
## + FIREPLACE 1 510675 5779
##
## Step: AIC=5764
## .outcome ~ LIVING.AREA + LOT.SQFT + YR.BUILT + GROSS.AREA + FLOORS +
## REMODELRecent + FULL.BATH + HALF.BATH
##
## Df Deviance AIC
## + KITCHEN 1 495617 5763
## + REMODELOld 1 495715 5763
## <none> 497600 5764
## + ROOMS 1 496739 5765
## + BEDROOMS 1 496951 5765
## + FIREPLACE 1 497322 5765
##
## Step: AIC=5763
## .outcome ~ LIVING.AREA + LOT.SQFT + YR.BUILT + GROSS.AREA + FLOORS +
## REMODELRecent + FULL.BATH + HALF.BATH + KITCHEN
##
## Df Deviance AIC
## + REMODELOld 1 493796 5763
## <none> 495617 5763
## + ROOMS 1 495051 5765
## + BEDROOMS 1 495098 5765
## + FIREPLACE 1 495300 5765
##
## Step: AIC=5763
## .outcome ~ LIVING.AREA + LOT.SQFT + YR.BUILT + GROSS.AREA + FLOORS +
## REMODELRecent + FULL.BATH + HALF.BATH + KITCHEN + REMODELOld
##
## Df Deviance AIC
## <none> 493796 5763
## + ROOMS 1 493245 5764
## + BEDROOMS 1 493257 5764
## + FIREPLACE 1 493504 5765
coef(model$finalModel)
## (Intercept) LIVING.AREA LOT.SQFT YR.BUILT GROSS.AREA
## -1279.70645 0.04039 0.00629 0.69082 0.02897
## FLOORS REMODELRecent FULL.BATH HALF.BATH KITCHEN
## 25.01876 29.07483 17.85302 10.62199 8.92141
## REMODELOld
## 6.76882
model <- caret::train(TOTAL.VALUE ~ ., data=train.da, trControl=trControl,
method="glmStepAIC", direction='both')
## Start: AIC=5768
## .outcome ~ LOT.SQFT + YR.BUILT + GROSS.AREA + LIVING.AREA + FLOORS +
## ROOMS + BEDROOMS + FULL.BATH + HALF.BATH + KITCHEN + FIREPLACE +
## REMODELOld + REMODELRecent
##
## Df Deviance AIC
## - ROOMS 1 492967 5766
## - BEDROOMS 1 492994 5766
## - FIREPLACE 1 493053 5766
## <none> 492790 5768
## - KITCHEN 1 494464 5768
## - REMODELOld 1 494590 5768
## - HALF.BATH 1 503325 5779
## - FULL.BATH 1 516586 5794
## - REMODELRecent 1 535268 5816
## - LIVING.AREA 1 538806 5819
## - FLOORS 1 544783 5826
## - GROSS.AREA 1 566511 5850
## - YR.BUILT 1 596490 5881
## - LOT.SQFT 1 608258 5892
##
## Step: AIC=5766
## .outcome ~ LOT.SQFT + YR.BUILT + GROSS.AREA + LIVING.AREA + FLOORS +
## BEDROOMS + FULL.BATH + HALF.BATH + KITCHEN + FIREPLACE +
## REMODELOld + REMODELRecent
##
## Df Deviance AIC
## - FIREPLACE 1 493257 5764
## - BEDROOMS 1 493504 5765
## <none> 492967 5766
## - REMODELOld 1 494782 5766
## - KITCHEN 1 494793 5766
## + ROOMS 1 492790 5768
## - HALF.BATH 1 504128 5777
## - FULL.BATH 1 517506 5793
## - REMODELRecent 1 535768 5814
## - FLOORS 1 544955 5824
## - LIVING.AREA 1 545135 5824
## - GROSS.AREA 1 566566 5848
## - YR.BUILT 1 597340 5879
## - LOT.SQFT 1 609497 5892
##
## Step: AIC=5764
## .outcome ~ LOT.SQFT + YR.BUILT + GROSS.AREA + LIVING.AREA + FLOORS +
## BEDROOMS + FULL.BATH + HALF.BATH + KITCHEN + REMODELOld +
## REMODELRecent
##
## Df Deviance AIC
## - BEDROOMS 1 493796 5763
## <none> 493257 5764
## - KITCHEN 1 495047 5765
## - REMODELOld 1 495098 5765
## + FIREPLACE 1 492967 5766
## + ROOMS 1 493053 5766
## - HALF.BATH 1 504283 5776
## - FULL.BATH 1 517681 5791
## - REMODELRecent 1 536070 5812
## - FLOORS 1 545339 5823
## - LIVING.AREA 1 545743 5823
## - GROSS.AREA 1 572134 5852
## - YR.BUILT 1 611275 5891
## - LOT.SQFT 1 613496 5893
##
## Step: AIC=5763
## .outcome ~ LOT.SQFT + YR.BUILT + GROSS.AREA + LIVING.AREA + FLOORS +
## FULL.BATH + HALF.BATH + KITCHEN + REMODELOld + REMODELRecent
##
## Df Deviance AIC
## <none> 493796 5763
## - REMODELOld 1 495617 5763
## - KITCHEN 1 495715 5763
## + ROOMS 1 493245 5764
## + BEDROOMS 1 493257 5764
## + FIREPLACE 1 493504 5765
## - HALF.BATH 1 505628 5775
## - FULL.BATH 1 520435 5793
## - REMODELRecent 1 536747 5811
## - FLOORS 1 547545 5823
## - LIVING.AREA 1 553653 5830
## - GROSS.AREA 1 572364 5850
## - YR.BUILT 1 611602 5890
## - LOT.SQFT 1 613570 5892
coef(model$finalModel)
## (Intercept) LOT.SQFT YR.BUILT GROSS.AREA LIVING.AREA
## -1279.70645 0.00629 0.69082 0.02897 0.04039
## FLOORS FULL.BATH HALF.BATH KITCHEN REMODELOld
## 25.01876 17.85302 10.62199 8.92141 6.76882
## REMODELRecent
## 29.07483
rbind(Training=mlba::regressionSummary(predict(model, train.da), train.da$TOTAL.VALUE),
Holdout=mlba::regressionSummary(predict(model, holdout.da), holdout.da$TOTAL.VALUE))
## ME RMSE MAE
## Training 0.00000000000335 28.7 22.5
## Holdout -2.25 33.5 26
# The models are identical to the best model obtained from the exhaustive search.
# We therefore duplicate the metrics.
metric.stepwise <- metric.exhaustive
set.seed(1)
library(caret)
trControl <- caret::trainControl(method='cv', number=5, allowParallel=TRUE)
tuneGrid <- expand.grid(lambda=10^seq(5, 2, by=-0.1), alpha=0)
model <- caret::train(TOTAL.VALUE ~ ., data=train.da,
method='glmnet',
family='gaussian', # set the family for linear regression
trControl=trControl,
tuneGrid=tuneGrid)
## Warning in nominalTrainWorkflow(x = x, y = y, wts = weights, info = trainInfo,
## : There were missing values in resampled performance measures.
model$bestTune
## alpha lambda
## 1 0 100
coef(model$finalModel, s=model$bestTune$lambda)
## 14 x 1 sparse Matrix of class "dgCMatrix"
## s1
## (Intercept) -443.12932
## LOT.SQFT 0.00292
## YR.BUILT 0.29685
## GROSS.AREA 0.01622
## LIVING.AREA 0.02388
## FLOORS 9.28919
## ROOMS 4.10898
## BEDROOMS 5.47217
## FULL.BATH 13.17695
## HALF.BATH 7.28355
## KITCHEN 5.62324
## FIREPLACE 3.65591
## REMODELOld 1.06974
## REMODELRecent 11.35971
metric.ridge <- collectMetrics(model, train.da, holdout.da,
length(coef(model$finalModel, s=model$bestTune$lambda)) - 1)
ridge.model <- model
rbind(
Training=mlba::regressionSummary(predict(model, train.da), train.da$TOTAL.VALUE),
Holdout=mlba::regressionSummary(predict(model, holdout.da), holdout.da$TOTAL.VALUE)
)
## ME RMSE MAE
## Training -0.00000000000038 36.1 26.3
## Holdout -0.815 38.3 27.8
set.seed(1)
tuneGrid <- expand.grid(lambda=10^seq(4, 0, by=-0.1), alpha=1)
model <- caret::train(TOTAL.VALUE ~ ., data=train.da,
method='glmnet',
family='gaussian', # set the family for linear regression
trControl=trControl,
tuneGrid=tuneGrid)
## Warning in nominalTrainWorkflow(x = x, y = y, wts = weights, info = trainInfo,
## : There were missing values in resampled performance measures.
model$bestTune
## alpha lambda
## 1 1 1
coef(model$finalModel, s=model$bestTune$lambda)
## 14 x 1 sparse Matrix of class "dgCMatrix"
## s1
## (Intercept) -1187.00444
## LOT.SQFT 0.00583
## YR.BUILT 0.64903
## GROSS.AREA 0.02797
## LIVING.AREA 0.04154
## FLOORS 21.66798
## ROOMS 0.83653
## BEDROOMS 0.64074
## FULL.BATH 16.43354
## HALF.BATH 8.88729
## KITCHEN 2.58017
## FIREPLACE 0.01609
## REMODELOld 2.22904
## REMODELRecent 24.70495
lasso.model <- model
metric.lasso <- collectMetrics(lasso.model, train.da, holdout.da,
sum(coef(lasso.model$finalModel, s=lasso.model$bestTune$lambda) != 0) - 1)
rbind(
Training=mlba::regressionSummary(predict(model, train.da), train.da$TOTAL.VALUE),
Holdout=mlba::regressionSummary(predict(model, holdout.da), holdout.da$TOTAL.VALUE)
)
## ME RMSE MAE
## Training -0.000000000000841 28.8 22.6
## Holdout -2.17 33.4 25.7
library(tidyverse)
library(gridExtra)
##
## 载入程辑包:'gridExtra'
##
## The following object is masked from 'package:dplyr':
##
## combine
g1 <- ggplot(ridge.model$results, aes(x=lambda, y=RMSE)) +
geom_pointrange(aes(ymin=RMSE-RMSESD, ymax=RMSE+RMSESD), color='grey') +
geom_line() +
geom_point(data=ridge.model$results %>% subset(RMSE == min(RMSE)), color='red') +
labs(x=expression(paste('Ridge parameter ', lambda)),
y='RMSE (cross-validation)') +
scale_x_log10()
g2 <- ggplot(lasso.model$results, aes(x=lambda, y=RMSE)) +
geom_pointrange(aes(ymin=RMSE-RMSESD, ymax=RMSE+RMSESD), color='grey') +
geom_line() +
geom_point(data=lasso.model$results %>% subset(RMSE == min(RMSE)), color='red') +
labs(x=expression(paste('Lasso parameter ', lambda)),
y='RMSE (cross-validation)') +
scale_x_log10()
grid.arrange(g1, g2, ncol=2)

g <- arrangeGrob(g1 + theme_bw(), g2 + theme_bw(), ncol=2)
ggsave(file=file.path("E:\\R\\Document", "residuals-histogram.pdf"),
g, width=6, height=2.5, units='in')
data.frame(rbind(
'full'= metric.full,
'exhaustive' = metric.exhaustive,
'stepwise' = metric.stepwise,
'ridge' = metric.ridge,
'lasso' = metric.lasso
))
## CV.RMSE CV.MAE Training.ME Training.RMSE Training.MAE
## full 31.1 24.1 0.792955841120971 28.7 22.6
## exhaustive 33.7 25.4 0.785918742680181 31.6 24.3
## stepwise 33.7 25.4 0.785918742680181 31.6 24.3
## ridge 36.2 26.5 -0.000000000000380 36.1 26.3
## lasso 29.4 23.2 -0.000000000000841 28.8 22.6
## Holdout.ME Holdout.RMSE Holdout.MAE nPredictors
## full -1.194 33.1 25.4 13
## exhaustive -1.184 35.5 26.0 10
## stepwise -1.184 35.5 26.0 10
## ridge -0.815 38.3 27.8 13
## lasso -2.172 33.4 25.7 13