# Introduce all used packages
library(ggplot2)
library(lattice)
library(caret)
library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr 1.1.2 ✔ readr 2.1.4
## ✔ forcats 1.0.0 ✔ stringr 1.5.0
## ✔ lubridate 1.9.2 ✔ tibble 3.2.1
## ✔ purrr 1.0.1 ✔ tidyr 1.3.0
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
## ✖ purrr::lift() masks caret::lift()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(leaps)
library(fastDummies)
## Thank you for using fastDummies!
## To acknowledge our work, please cite the package:
## Kaplan, J. & Schlegel, B. (2023). fastDummies: Fast Creation of Dummy (Binary) Columns and Rows from Categorical Variables. Version 1.7.1. URL: https://github.com/jacobkap/fastDummies, https://jacobkap.github.io/fastDummies/.
library(gridExtra)
##
## 载入程辑包:'gridExtra'
##
## The following object is masked from 'package:dplyr':
##
## combine
options(scipen=999, digits = 3)
# read data
westroxbury.df <- mlba::WestRoxbury
# select variables for regression
outcome <- "TOTAL.VALUE"
predictors <- c("LOT.SQFT", "YR.BUILT", "GROSS.AREA", "LIVING.AREA", "FLOORS",
"ROOMS", "BEDROOMS", "FULL.BATH", "HALF.BATH", "KITCHEN", "FIREPLACE", "REMODEL")
# reduce data set to first 3000 rows and selected variable
west.df <- westroxbury.df[1:3000, c(outcome, predictors)]
# partition data
set.seed(1) # set seed for reproducing the partition
idx <- createDataPartition(west.df$TOTAL.VALUE, p=0.6, list=FALSE)
train.df <- west.df[idx, ]
holdout.df <- west.df[-idx, ]
# use lm() to run a linear regression of Price on all 11 predictors in the
# training set.
# use . after ~ to include all the remaining columns in train.df as predictors.
west.lm <- lm(TOTAL.VALUE ~ ., data = train.df)
# use options() to ensure numbers are not displayed in scientific notation.
options(scipen = 999)
summary(west.lm)
##
## Call:
## lm(formula = TOTAL.VALUE ~ ., data = train.df)
##
## Residuals:
## Min 1Q Median 3Q Max
## -178.44 -27.42 -0.86 25.87 198.53
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -35.295638 41.002262 -0.86 0.3895
## LOT.SQFT 0.007807 0.000392 19.93 < 0.0000000000000002 ***
## YR.BUILT 0.037122 0.019960 1.86 0.0631 .
## GROSS.AREA 0.030378 0.002772 10.96 < 0.0000000000000002 ***
## LIVING.AREA 0.050552 0.005024 10.06 < 0.0000000000000002 ***
## FLOORS 41.417184 2.862168 14.47 < 0.0000000000000002 ***
## ROOMS 3.650320 1.231809 2.96 0.0031 **
## BEDROOMS -2.626316 1.859669 -1.41 0.1581
## FULL.BATH 15.182365 2.407331 6.31 0.00000000036 ***
## HALF.BATH 19.608046 2.213825 8.86 < 0.0000000000000002 ***
## KITCHEN -12.894560 8.719267 -1.48 0.1394
## FIREPLACE 17.145103 1.920669 8.93 < 0.0000000000000002 ***
## REMODELOld 4.670305 3.527513 1.32 0.1857
## REMODELRecent 27.143402 3.129025 8.67 < 0.0000000000000002 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 43.1 on 1787 degrees of freedom
## Multiple R-squared: 0.796, Adjusted R-squared: 0.794
## F-statistic: 536 on 13 and 1787 DF, p-value: <0.0000000000000002
# use predict() to make predictions on a new set.
pred <- predict(west.lm, holdout.df)
options(scipen=999, digits=1)
data.frame(
'Predicted' = pred[1:20],
'Actual' = holdout.df$TOTAL.VALUE[1:20],
'Residual' = holdout.df$TOTAL.VALUE[1:20] - pred[1:20]
)
## Predicted Actual Residual
## 1 373 344 -29
## 6 275 337 63
## 7 393 359 -34
## 10 492 409 -82
## 13 305 316 11
## 14 555 575 20
## 15 333 326 -7
## 18 350 345 -5
## 20 376 348 -28
## 21 278 318 39
## 22 326 331 5
## 26 336 346 10
## 30 327 321 -6
## 32 314 294 -20
## 38 340 296 -44
## 40 343 318 -24
## 42 364 350 -14
## 47 394 372 -23
## 50 379 432 53
## 56 392 346 -46
options(scipen=999, digits = 3)
# calculate performance metrics
rbind(
Training=mlba::regressionSummary(predict(west.lm, train.df), train.df$TOTAL.VALUE),
Holdout=mlba::regressionSummary(pred, holdout.df$TOTAL.VALUE)
)
## ME RMSE MAE
## Training -0.00000000000119 42.9 33.1
## Holdout -0.202 42.3 32.9
pred <- predict(west.lm, holdout.df)
all.residuals <- holdout.df$TOTAL.VALUE - pred
ggplot() +
geom_histogram(aes(x=all.residuals), fill="pink", color="blue") +
labs(x="Residuals", y="Frequency")
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
g <- ggplot() +
geom_histogram(aes(x=all.residuals), fill="pink", color="blue") +
labs(x="Residuals", y="Frequency") +
theme_bw()
# ggsave(file=file.path("E:\\Documentation\\课程\\R\\test1", "residuals-histogram.pdf"), g, width=5, height=3, units="in")
set.seed(1)
# define 5-fold
trControl <- caret::trainControl(method="cv", number=5, allowParallel=TRUE)
model <- caret::train(TOTAL.VALUE ~ ., data=west.df,
method="lm", # specify the model
trControl=trControl)
model
## Linear Regression
##
## 3000 samples
## 12 predictor
##
## No pre-processing
## Resampling: Cross-Validated (5 fold)
## Summary of sample sizes: 2400, 2402, 2399, 2399, 2400
## Resampling results:
##
## RMSE Rsquared MAE
## 43.5 0.782 33.1
##
## Tuning parameter 'intercept' was held constant at a value of TRUE
coef(model$finalModel)
## (Intercept) LOT.SQFT YR.BUILT GROSS.AREA LIVING.AREA
## -84.2348 0.0082 0.0579 0.0323 0.0436
## FLOORS ROOMS BEDROOMS FULL.BATH HALF.BATH
## 42.8958 3.3285 -0.7456 16.1256 19.7283
## KITCHEN FIREPLACE REMODELOld REMODELRecent
## -8.6909 17.8466 4.1979 25.3402
collectMetrics <- function(model, train.df, holdout.df, nPredictors) {
if (missing(nPredictors)) {
coefs = coef(model$finalModel)
nPredictors = length(coefs) - 1
}
return (cbind(
CV=model$results %>% slice_min(RMSE) %>% dplyr::select(c(RMSE, MAE)),
Training=mlba::regressionSummary(predict(model, train.df), train.df$TOTAL.VALUE),
Holdout=mlba::regressionSummary(predict(model, holdout.df), holdout.df$TOTAL.VALUE),
nPredictors=nPredictors
))
}
metric.full <- collectMetrics(model, train.df, holdout.df)
predict(model, west.df[1:3,])
## 1 2 3
## 375 454 357
# use regsubsets() in package leaps to run an exhaustive search.
# unlike with lm, categorical predictors must be turned into dummies manually.
# create dummies for fuel type
# create dummies for fuel type
leaps.train.df <- dummy_cols(train.df, remove_first_dummy=TRUE,
remove_selected_columns=TRUE)
search <- regsubsets(TOTAL.VALUE ~ ., data=leaps.train.df, nbest=1,
nvmax=ncol(leaps.train.df), method="exhaustive")
sum <- summary(search)
sum
## Subset selection object
## Call: regsubsets.formula(TOTAL.VALUE ~ ., data = leaps.train.df, nbest = 1,
## nvmax = ncol(leaps.train.df), method = "exhaustive")
## 13 Variables (and intercept)
## Forced in Forced out
## LOT.SQFT FALSE FALSE
## YR.BUILT FALSE FALSE
## GROSS.AREA FALSE FALSE
## LIVING.AREA FALSE FALSE
## FLOORS FALSE FALSE
## ROOMS FALSE FALSE
## BEDROOMS FALSE FALSE
## FULL.BATH FALSE FALSE
## HALF.BATH FALSE FALSE
## KITCHEN FALSE FALSE
## FIREPLACE FALSE FALSE
## REMODEL_Old FALSE FALSE
## REMODEL_Recent FALSE FALSE
## 1 subsets of each size up to 13
## Selection Algorithm: exhaustive
## LOT.SQFT YR.BUILT GROSS.AREA LIVING.AREA FLOORS ROOMS BEDROOMS
## 1 ( 1 ) " " " " " " "*" " " " " " "
## 2 ( 1 ) "*" " " " " "*" " " " " " "
## 3 ( 1 ) "*" " " " " "*" "*" " " " "
## 4 ( 1 ) "*" " " "*" "*" "*" " " " "
## 5 ( 1 ) "*" " " "*" "*" "*" " " " "
## 6 ( 1 ) "*" " " "*" "*" "*" " " " "
## 7 ( 1 ) "*" " " "*" "*" "*" " " " "
## 8 ( 1 ) "*" " " "*" "*" "*" " " " "
## 9 ( 1 ) "*" " " "*" "*" "*" "*" " "
## 10 ( 1 ) "*" "*" "*" "*" "*" "*" " "
## 11 ( 1 ) "*" "*" "*" "*" "*" "*" " "
## 12 ( 1 ) "*" "*" "*" "*" "*" "*" "*"
## 13 ( 1 ) "*" "*" "*" "*" "*" "*" "*"
## FULL.BATH HALF.BATH KITCHEN FIREPLACE REMODEL_Old REMODEL_Recent
## 1 ( 1 ) " " " " " " " " " " " "
## 2 ( 1 ) " " " " " " " " " " " "
## 3 ( 1 ) " " " " " " " " " " " "
## 4 ( 1 ) " " " " " " " " " " " "
## 5 ( 1 ) " " " " " " "*" " " " "
## 6 ( 1 ) " " " " " " "*" " " "*"
## 7 ( 1 ) " " "*" " " "*" " " "*"
## 8 ( 1 ) "*" "*" " " "*" " " "*"
## 9 ( 1 ) "*" "*" " " "*" " " "*"
## 10 ( 1 ) "*" "*" " " "*" " " "*"
## 11 ( 1 ) "*" "*" "*" "*" " " "*"
## 12 ( 1 ) "*" "*" "*" "*" " " "*"
## 13 ( 1 ) "*" "*" "*" "*" "*" "*"
# show models
sum$which
## (Intercept) LOT.SQFT YR.BUILT GROSS.AREA LIVING.AREA FLOORS ROOMS BEDROOMS
## 1 TRUE FALSE FALSE FALSE TRUE FALSE FALSE FALSE
## 2 TRUE TRUE FALSE FALSE TRUE FALSE FALSE FALSE
## 3 TRUE TRUE FALSE FALSE TRUE TRUE FALSE FALSE
## 4 TRUE TRUE FALSE TRUE TRUE TRUE FALSE FALSE
## 5 TRUE TRUE FALSE TRUE TRUE TRUE FALSE FALSE
## 6 TRUE TRUE FALSE TRUE TRUE TRUE FALSE FALSE
## 7 TRUE TRUE FALSE TRUE TRUE TRUE FALSE FALSE
## 8 TRUE TRUE FALSE TRUE TRUE TRUE FALSE FALSE
## 9 TRUE TRUE FALSE TRUE TRUE TRUE TRUE FALSE
## 10 TRUE TRUE TRUE TRUE TRUE TRUE TRUE FALSE
## 11 TRUE TRUE TRUE TRUE TRUE TRUE TRUE FALSE
## 12 TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE
## 13 TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE
## FULL.BATH HALF.BATH KITCHEN FIREPLACE REMODEL_Old REMODEL_Recent
## 1 FALSE FALSE FALSE FALSE FALSE FALSE
## 2 FALSE FALSE FALSE FALSE FALSE FALSE
## 3 FALSE FALSE FALSE FALSE FALSE FALSE
## 4 FALSE FALSE FALSE FALSE FALSE FALSE
## 5 FALSE FALSE FALSE TRUE FALSE FALSE
## 6 FALSE FALSE FALSE TRUE FALSE TRUE
## 7 FALSE TRUE FALSE TRUE FALSE TRUE
## 8 TRUE TRUE FALSE TRUE FALSE TRUE
## 9 TRUE TRUE FALSE TRUE FALSE TRUE
## 10 TRUE TRUE FALSE TRUE FALSE TRUE
## 11 TRUE TRUE TRUE TRUE FALSE TRUE
## 12 TRUE TRUE TRUE TRUE FALSE TRUE
## 13 TRUE TRUE TRUE TRUE TRUE TRUE
# show metrics
sum$rsq
## [1] 0.661 0.707 0.740 0.761 0.773 0.782 0.789 0.794 0.795 0.795 0.796 0.796
## [13] 0.796
sum$adjr2
## [1] 0.661 0.706 0.740 0.760 0.772 0.781 0.788 0.793 0.794 0.794 0.794 0.794
## [13] 0.794
# The smaller the value of cp, the better the fit of the model. cp portrays the balance between the predictive power of the model and the complexity of the model. It takes into account the sum of squares of the fitted residuals and the number of variables used in the model, preferring models with smaller residuals but not overly complex.
sum$cp
## [1] 1167.7 775.1 480.6 304.5 203.3 120.2 64.5 19.2 15.4 14.0
## [11] 13.9 13.8 14.0
# In this list, -0.696 is the smallest Cp value, so the 3rd model corresponds to the lowest Cp value and can be considered as the best-fitting model
optimal <- which.min(sum$cp)
optimal
## [1] 12
# determine the variable names for the optimal model
X <- summary(search)$which[, -1] # information about included predictors
xvars <- dimnames(X)[[2]] ## column names (all covariates except intercept)
xvars <- xvars[X[optimal,]]
# By sum$adjr2, it can be seen that the parameters contained in line 9 can make the model optimal.
xvars <- c("LOT.SQFT", "GROSS.AREA", "LIVING.AREA", "FLOORS", "ROOMS", "FULL.BATH", "HALF.BATH", "FIREPLACE", "REMODEL")
# rebuild model for best predictor set
set.seed(1)
trControl <- caret::trainControl(method="cv", number=5, allowParallel=TRUE)
model <- caret::train(TOTAL.VALUE ~ ., data=west.df[, c("TOTAL.VALUE", xvars)],
method="lm", # specify the model
trControl=trControl)
model
## Linear Regression
##
## 3000 samples
## 9 predictor
##
## No pre-processing
## Resampling: Cross-Validated (5 fold)
## Summary of sample sizes: 2400, 2402, 2399, 2399, 2400
## Resampling results:
##
## RMSE Rsquared MAE
## 42.9 0.788 33.1
##
## Tuning parameter 'intercept' was held constant at a value of TRUE
coef(model$finalModel)
## (Intercept) LOT.SQFT GROSS.AREA LIVING.AREA FLOORS
## 21.19583 0.00814 0.03206 0.04344 42.49249
## ROOMS FULL.BATH HALF.BATH FIREPLACE REMODELOld
## 2.72186 16.89665 20.53439 18.45088 3.60474
## REMODELRecent
## 24.49414
metric.exhaustive <- collectMetrics(model, train.df, holdout.df)
# as model performance is estimated using AIC, we don't need to use cross-validation
trControl <- caret::trainControl(method="none")
model <- caret::train(TOTAL.VALUE ~ ., data=train.df, trControl=trControl,
# select backward elmination
method="glmStepAIC", direction='backward')
## Start: AIC=18681
## .outcome ~ LOT.SQFT + YR.BUILT + GROSS.AREA + LIVING.AREA + FLOORS +
## ROOMS + BEDROOMS + FULL.BATH + HALF.BATH + KITCHEN + FIREPLACE +
## REMODELOld + REMODELRecent
##
## Df Deviance AIC
## - REMODELOld 1 3318722 18681
## <none> 3315470 18681
## - BEDROOMS 1 3319171 18681
## - KITCHEN 1 3319528 18681
## - YR.BUILT 1 3321887 18682
## - ROOMS 1 3331763 18688
## - FULL.BATH 1 3389265 18719
## - REMODELRecent 1 3455085 18753
## - HALF.BATH 1 3461017 18756
## - FIREPLACE 1 3463312 18758
## - LIVING.AREA 1 3503285 18778
## - GROSS.AREA 1 3538281 18796
## - FLOORS 1 3703971 18879
## - LOT.SQFT 1 4052241 19040
##
## Step: AIC=18681
## .outcome ~ LOT.SQFT + YR.BUILT + GROSS.AREA + LIVING.AREA + FLOORS +
## ROOMS + BEDROOMS + FULL.BATH + HALF.BATH + KITCHEN + FIREPLACE +
## REMODELRecent
##
## Df Deviance AIC
## <none> 3318722 18681
## - BEDROOMS 1 3322623 18681
## - KITCHEN 1 3322700 18681
## - YR.BUILT 1 3324777 18682
## - ROOMS 1 3335619 18688
## - FULL.BATH 1 3392077 18718
## - REMODELRecent 1 3455102 18751
## - HALF.BATH 1 3463426 18756
## - FIREPLACE 1 3465011 18756
## - LIVING.AREA 1 3504282 18777
## - GROSS.AREA 1 3547442 18799
## - FLOORS 1 3714283 18882
## - LOT.SQFT 1 4056808 19040
coef(model$finalModel)
## (Intercept) LOT.SQFT YR.BUILT GROSS.AREA LIVING.AREA
## -33.52125 0.00781 0.03603 0.03068 0.05016
## FLOORS ROOMS BEDROOMS FULL.BATH HALF.BATH
## 41.68629 3.71444 -2.69520 15.13529 19.54688
## KITCHEN FIREPLACE REMODELRecent
## -12.76574 17.04039 26.55641
model <- caret::train(TOTAL.VALUE ~ ., data=train.df, trControl=trControl,
method="glmStepAIC", direction='forward')
## Start: AIC=21517
## .outcome ~ 1
##
## Df Deviance AIC
## + LIVING.AREA 1 5500449 19569
## + GROSS.AREA 1 6356404 19829
## + ROOMS 1 9619216 20575
## + BEDROOMS 1 11602082 20913
## + FLOORS 1 12045563 20980
## + LOT.SQFT 1 12285640 21016
## + HALF.BATH 1 13796424 21225
## + FULL.BATH 1 13911304 21240
## + FIREPLACE 1 14154297 21271
## + REMODELRecent 1 15388524 21422
## + YR.BUILT 1 16188466 21513
## + REMODELOld 1 16223082 21517
## <none> 16248252 21517
## + KITCHEN 1 16243845 21519
##
## Step: AIC=19569
## .outcome ~ LIVING.AREA
##
## Df Deviance AIC
## + LOT.SQFT 1 4768338 19313
## + FIREPLACE 1 5131870 19446
## + FLOORS 1 5139417 19448
## + GROSS.AREA 1 5171419 19460
## + HALF.BATH 1 5283037 19498
## + KITCHEN 1 5344442 19519
## + REMODELRecent 1 5352405 19522
## + ROOMS 1 5387655 19533
## + FULL.BATH 1 5446162 19553
## + BEDROOMS 1 5483440 19565
## <none> 5500449 19569
## + REMODELOld 1 5494607 19569
## + YR.BUILT 1 5497436 19570
##
## Step: AIC=19313
## .outcome ~ LIVING.AREA + LOT.SQFT
##
## Df Deviance AIC
## + FLOORS 1 4218363 19095
## + FIREPLACE 1 4474573 19201
## + GROSS.AREA 1 4548797 19231
## + HALF.BATH 1 4550101 19231
## + REMODELRecent 1 4602135 19252
## + KITCHEN 1 4624278 19260
## + ROOMS 1 4654462 19272
## + FULL.BATH 1 4711614 19294
## + BEDROOMS 1 4739825 19305
## <none> 4768338 19313
## + REMODELOld 1 4764306 19314
## + YR.BUILT 1 4768152 19315
##
## Step: AIC=19095
## .outcome ~ LIVING.AREA + LOT.SQFT + FLOORS
##
## Df Deviance AIC
## + GROSS.AREA 1 3887877 18950
## + FIREPLACE 1 3958271 18982
## + REMODELRecent 1 4072117 19033
## + HALF.BATH 1 4111917 19051
## + FULL.BATH 1 4141574 19064
## + KITCHEN 1 4173991 19078
## + ROOMS 1 4175529 19078
## + YR.BUILT 1 4211512 19094
## <none> 4218363 19095
## + BEDROOMS 1 4217419 19096
## + REMODELOld 1 4218002 19097
##
## Step: AIC=18950
## .outcome ~ LIVING.AREA + LOT.SQFT + FLOORS + GROSS.AREA
##
## Df Deviance AIC
## + FIREPLACE 1 3696340 18861
## + REMODELRecent 1 3752161 18888
## + HALF.BATH 1 3761930 18892
## + FULL.BATH 1 3826082 18923
## + ROOMS 1 3857616 18938
## + YR.BUILT 1 3863302 18940
## + KITCHEN 1 3881828 18949
## <none> 3887877 18950
## + BEDROOMS 1 3886374 18951
## + REMODELOld 1 3887098 18951
##
## Step: AIC=18861
## .outcome ~ LIVING.AREA + LOT.SQFT + FLOORS + GROSS.AREA + FIREPLACE
##
## Df Deviance AIC
## + REMODELRecent 1 3538487 18784
## + HALF.BATH 1 3604075 18817
## + FULL.BATH 1 3639321 18835
## + ROOMS 1 3669397 18850
## + YR.BUILT 1 3686000 18858
## <none> 3696340 18861
## + KITCHEN 1 3693246 18861
## + BEDROOMS 1 3695361 18862
## + REMODELOld 1 3696238 18863
##
## Step: AIC=18784
## .outcome ~ LIVING.AREA + LOT.SQFT + FLOORS + GROSS.AREA + FIREPLACE +
## REMODELRecent
##
## Df Deviance AIC
## + HALF.BATH 1 3431407 18731
## + FULL.BATH 1 3506718 18770
## + ROOMS 1 3515086 18774
## + YR.BUILT 1 3518357 18776
## <none> 3538487 18784
## + REMODELOld 1 3536335 18785
## + KITCHEN 1 3536471 18785
## + BEDROOMS 1 3537861 18786
##
## Step: AIC=18731
## .outcome ~ LIVING.AREA + LOT.SQFT + FLOORS + GROSS.AREA + FIREPLACE +
## REMODELRecent + HALF.BATH
##
## Df Deviance AIC
## + FULL.BATH 1 3343679 18686
## + ROOMS 1 3412106 18723
## + YR.BUILT 1 3417217 18725
## <none> 3431407 18731
## + REMODELOld 1 3428624 18731
## + KITCHEN 1 3430623 18732
## + BEDROOMS 1 3431061 18733
##
## Step: AIC=18686
## .outcome ~ LIVING.AREA + LOT.SQFT + FLOORS + GROSS.AREA + FIREPLACE +
## REMODELRecent + HALF.BATH + FULL.BATH
##
## Df Deviance AIC
## + ROOMS 1 3332827 18682
## + YR.BUILT 1 3338394 18685
## <none> 3343679 18686
## + REMODELOld 1 3340289 18686
## + KITCHEN 1 3340898 18687
## + BEDROOMS 1 3343605 18688
##
## Step: AIC=18682
## .outcome ~ LIVING.AREA + LOT.SQFT + FLOORS + GROSS.AREA + FIREPLACE +
## REMODELRecent + HALF.BATH + FULL.BATH + ROOMS
##
## Df Deviance AIC
## + YR.BUILT 1 3326634 18681
## + BEDROOMS 1 3328719 18682
## + KITCHEN 1 3328850 18682
## <none> 3332827 18682
## + REMODELOld 1 3329826 18683
##
## Step: AIC=18681
## .outcome ~ LIVING.AREA + LOT.SQFT + FLOORS + GROSS.AREA + FIREPLACE +
## REMODELRecent + HALF.BATH + FULL.BATH + ROOMS + YR.BUILT
##
## Df Deviance AIC
## + KITCHEN 1 3322623 18681
## + BEDROOMS 1 3322700 18681
## <none> 3326634 18681
## + REMODELOld 1 3323265 18681
##
## Step: AIC=18681
## .outcome ~ LIVING.AREA + LOT.SQFT + FLOORS + GROSS.AREA + FIREPLACE +
## REMODELRecent + HALF.BATH + FULL.BATH + ROOMS + YR.BUILT +
## KITCHEN
##
## Df Deviance AIC
## + BEDROOMS 1 3318722 18681
## <none> 3322623 18681
## + REMODELOld 1 3319171 18681
##
## Step: AIC=18681
## .outcome ~ LIVING.AREA + LOT.SQFT + FLOORS + GROSS.AREA + FIREPLACE +
## REMODELRecent + HALF.BATH + FULL.BATH + ROOMS + YR.BUILT +
## KITCHEN + BEDROOMS
##
## Df Deviance AIC
## <none> 3318722 18681
## + REMODELOld 1 3315470 18681
coef(model$finalModel)
## (Intercept) LIVING.AREA LOT.SQFT FLOORS GROSS.AREA
## -33.52125 0.05016 0.00781 41.68629 0.03068
## FIREPLACE REMODELRecent HALF.BATH FULL.BATH ROOMS
## 17.04039 26.55641 19.54688 15.13529 3.71444
## YR.BUILT KITCHEN BEDROOMS
## 0.03603 -12.76574 -2.69520
odel <- caret::train(TOTAL.VALUE ~ ., data=train.df, trControl=trControl,
method="glmStepAIC", direction='both')
## Start: AIC=18681
## .outcome ~ LOT.SQFT + YR.BUILT + GROSS.AREA + LIVING.AREA + FLOORS +
## ROOMS + BEDROOMS + FULL.BATH + HALF.BATH + KITCHEN + FIREPLACE +
## REMODELOld + REMODELRecent
##
## Df Deviance AIC
## - REMODELOld 1 3318722 18681
## <none> 3315470 18681
## - BEDROOMS 1 3319171 18681
## - KITCHEN 1 3319528 18681
## - YR.BUILT 1 3321887 18682
## - ROOMS 1 3331763 18688
## - FULL.BATH 1 3389265 18719
## - REMODELRecent 1 3455085 18753
## - HALF.BATH 1 3461017 18756
## - FIREPLACE 1 3463312 18758
## - LIVING.AREA 1 3503285 18778
## - GROSS.AREA 1 3538281 18796
## - FLOORS 1 3703971 18879
## - LOT.SQFT 1 4052241 19040
##
## Step: AIC=18681
## .outcome ~ LOT.SQFT + YR.BUILT + GROSS.AREA + LIVING.AREA + FLOORS +
## ROOMS + BEDROOMS + FULL.BATH + HALF.BATH + KITCHEN + FIREPLACE +
## REMODELRecent
##
## Df Deviance AIC
## <none> 3318722 18681
## - BEDROOMS 1 3322623 18681
## - KITCHEN 1 3322700 18681
## + REMODELOld 1 3315470 18681
## - YR.BUILT 1 3324777 18682
## - ROOMS 1 3335619 18688
## - FULL.BATH 1 3392077 18718
## - REMODELRecent 1 3455102 18751
## - HALF.BATH 1 3463426 18756
## - FIREPLACE 1 3465011 18756
## - LIVING.AREA 1 3504282 18777
## - GROSS.AREA 1 3547442 18799
## - FLOORS 1 3714283 18882
## - LOT.SQFT 1 4056808 19040
coef(model$finalModel)
## (Intercept) LIVING.AREA LOT.SQFT FLOORS GROSS.AREA
## -33.52125 0.05016 0.00781 41.68629 0.03068
## FIREPLACE REMODELRecent HALF.BATH FULL.BATH ROOMS
## 17.04039 26.55641 19.54688 15.13529 3.71444
## YR.BUILT KITCHEN BEDROOMS
## 0.03603 -12.76574 -2.69520
rbind(Training=mlba::regressionSummary(predict(model, train.df), train.df$TOTAL.VALUE),
Holdout=mlba::regressionSummary(predict(model, holdout.df), holdout.df$TOTAL.VALUE))
## ME RMSE MAE
## Training -0.00000000000119 42.9 33.1
## Holdout -0.0958 42.3 32.9
# The models are identical to the best model obtained from the exhaustive search.
# We therefore duplicate the metrics.
metric.stepwise <- metric.exhaustive
metric.stepwise
## CV.RMSE CV.MAE Training.ME Training.RMSE Training.MAE Holdout.ME Holdout.RMSE
## 1 42.9 33.1 0.193 43 33.1 -0.29 42.1
## Holdout.MAE nPredictors
## 1 32.7 10
set.seed(1)
library(caret)
trControl <- caret::trainControl(method='cv', number=5, allowParallel=TRUE)
tuneGrid <- expand.grid(lambda=10^seq(5, 2, by=-0.1), alpha=0)
model <- caret::train(TOTAL.VALUE ~ ., data=west.df,
method='glmnet',
family='gaussian', # set the family for linear regression
trControl=trControl,
tuneGrid=tuneGrid)
## Warning in nominalTrainWorkflow(x = x, y = y, wts = weights, info = trainInfo,
## : There were missing values in resampled performance measures.
model$bestTune
## alpha lambda
## 1 0 100
coef(model$finalModel, s=model$bestTune$lambda)
## 14 x 1 sparse Matrix of class "dgCMatrix"
## s1
## (Intercept) 90.24293
## LOT.SQFT 0.00481
## YR.BUILT 0.00288
## GROSS.AREA 0.02006
## LIVING.AREA 0.03159
## FLOORS 24.38370
## ROOMS 6.55513
## BEDROOMS 7.26270
## FULL.BATH 13.13814
## HALF.BATH 14.27822
## KITCHEN -7.55701
## FIREPLACE 13.32439
## REMODELOld 2.37126
## REMODELRecent 15.48961
metric.ridge <- collectMetrics(model, train.df, holdout.df,
length(coef(model$finalModel, s=model$bestTune$lambda)) - 1)
ridge.model <- model
rbind(
Training=mlba::regressionSummary(predict(model, train.df), train.df$TOTAL.VALUE),
Holdout=mlba::regressionSummary(predict(model, holdout.df), holdout.df$TOTAL.VALUE)
)
## ME RMSE MAE
## Training 0.427 48.7 37.1
## Holdout -0.641 47 36.7
set.seed(1)
tuneGrid <- expand.grid(lambda=10^seq(4, 0, by=-0.1), alpha=1)
model <- caret::train(TOTAL.VALUE ~ ., data=train.df,
method='glmnet',
family='gaussian', # set the family for linear regression
trControl=trControl,
tuneGrid=tuneGrid)
## Warning in nominalTrainWorkflow(x = x, y = y, wts = weights, info = trainInfo,
## : There were missing values in resampled performance measures.
model$bestTune
## alpha lambda
## 5 1 2.51
coef(model$finalModel, s=model$bestTune$lambda)
## 14 x 1 sparse Matrix of class "dgCMatrix"
## s1
## (Intercept) 42.36603
## LOT.SQFT 0.00711
## YR.BUILT .
## GROSS.AREA 0.02994
## LIVING.AREA 0.05266
## FLOORS 38.09561
## ROOMS 2.31567
## BEDROOMS .
## FULL.BATH 11.18611
## HALF.BATH 15.99640
## KITCHEN .
## FIREPLACE 14.82108
## REMODELOld .
## REMODELRecent 20.13996
lasso.model <- model
metric.lasso <- collectMetrics(lasso.model, train.df, holdout.df,
sum(coef(lasso.model$finalModel, s=lasso.model$bestTune$lambda) != 0) - 1)
rbind(
Training=mlba::regressionSummary(predict(model, train.df), train.df$TOTAL.VALUE),
Holdout=mlba::regressionSummary(predict(model, holdout.df), holdout.df$TOTAL.VALUE)
)
## ME RMSE MAE
## Training -0.000000000000135 43.3 33.3
## Holdout -0.368 42.7 33.1
g1 <- ggplot(ridge.model$results, aes(x=lambda, y=RMSE)) +
geom_pointrange(aes(ymin=RMSE-RMSESD, ymax=RMSE+RMSESD), color='green') +
geom_line() +
geom_point(data=ridge.model$results %>% subset(RMSE == min(RMSE)), color='red') +
labs(x=expression(paste('Ridge parameter ', lambda)),
y='RMSE (cross-validation)') +
scale_x_log10()
g2 <- ggplot(lasso.model$results, aes(x=lambda, y=RMSE)) +
geom_pointrange(aes(ymin=RMSE-RMSESD, ymax=RMSE+RMSESD), color='green') +
geom_line() +
geom_point(data=lasso.model$results %>% subset(RMSE == min(RMSE)), color='red') +
labs(x=expression(paste('Lasso parameter ', lambda)),
y='RMSE (cross-validation)') +
scale_x_log10()
grid.arrange(g1, g2, ncol=2)
g <- arrangeGrob(g1 + theme_bw(), g2 + theme_bw(), ncol=2)
# ggsave(file=file.path("E:\\Documentation\\课程\\R\\test1", "shrinkage-parameter-tuning.pdf"), g, width=6, height=2.5, units='in')
data.frame(rbind(
'full'= metric.full,
'exhaustive' = metric.exhaustive,
'stepwise' = metric.stepwise,
'ridge' = metric.ridge,
'lasso' = metric.lasso
))
## CV.RMSE CV.MAE Training.ME Training.RMSE Training.MAE
## full 43.5 33.1 0.134712559086481 43.0 33.0
## exhaustive 42.9 33.1 0.192950992986573 43.0 33.1
## stepwise 42.9 33.1 0.192950992986573 43.0 33.1
## ridge 48.2 37.1 0.426909206328931 48.7 37.1
## lasso 43.9 33.5 -0.000000000000135 43.3 33.3
## Holdout.ME Holdout.RMSE Holdout.MAE nPredictors
## full -0.202 42.0 32.6 13
## exhaustive -0.290 42.1 32.7 10
## stepwise -0.290 42.1 32.7 10
## ridge -0.641 47.0 36.7 13
## lasso -0.368 42.7 33.1 9