library(caret)
## 载入需要的程辑包:ggplot2
## 载入需要的程辑包:lattice
# Keep an untouched copy of the West Roxbury data from the mlba package.
housing01.df <- mlba::WestRoxbury
# Drop the second column by position; the str() output below lists the 13
# columns that remain.
# NOTE(review): presumably column 2 is the tax amount -- confirm.
housing.df <- housing01.df[-2]
str(object = housing.df)
## 'data.frame':    5802 obs. of  13 variables:
##  $ TOTAL.VALUE: num  344 413 330 499 332 ...
##  $ LOT.SQFT   : int  9965 6590 7500 13773 5000 5142 5000 10000 6835 5093 ...
##  $ YR.BUILT   : int  1880 1945 1890 1957 1910 1950 1954 1950 1958 1900 ...
##  $ GROSS.AREA : int  2436 3108 2294 5032 2370 2124 3220 2208 2582 4818 ...
##  $ LIVING.AREA: int  1352 1976 1371 2608 1438 1060 1916 1200 1092 2992 ...
##  $ FLOORS     : num  2 2 2 1 2 1 2 1 1 2 ...
##  $ ROOMS      : int  6 10 8 9 7 6 7 6 5 8 ...
##  $ BEDROOMS   : int  3 4 4 5 3 3 3 3 3 4 ...
##  $ FULL.BATH  : int  1 2 1 1 2 1 1 1 1 2 ...
##  $ HALF.BATH  : int  1 1 1 1 0 0 1 0 0 0 ...
##  $ KITCHEN    : int  1 1 1 1 1 1 1 1 1 1 ...
##  $ FIREPLACE  : int  0 0 0 1 0 1 0 0 1 0 ...
##  $ REMODEL    : chr  "None" "Recent" "None" "None" ...
# Name of the response column; used when the data is subset further down.
outcom <- "TOTAL.VALUE"
str(object = housing.df)
## 'data.frame':    5802 obs. of  13 variables:
##  $ TOTAL.VALUE: num  344 413 330 499 332 ...
##  $ LOT.SQFT   : int  9965 6590 7500 13773 5000 5142 5000 10000 6835 5093 ...
##  $ YR.BUILT   : int  1880 1945 1890 1957 1910 1950 1954 1950 1958 1900 ...
##  $ GROSS.AREA : int  2436 3108 2294 5032 2370 2124 3220 2208 2582 4818 ...
##  $ LIVING.AREA: int  1352 1976 1371 2608 1438 1060 1916 1200 1092 2992 ...
##  $ FLOORS     : num  2 2 2 1 2 1 2 1 1 2 ...
##  $ ROOMS      : int  6 10 8 9 7 6 7 6 5 8 ...
##  $ BEDROOMS   : int  3 4 4 5 3 3 3 3 3 4 ...
##  $ FULL.BATH  : int  1 2 1 1 2 1 1 1 1 2 ...
##  $ HALF.BATH  : int  1 1 1 1 0 0 1 0 0 0 ...
##  $ KITCHEN    : int  1 1 1 1 1 1 1 1 1 1 ...
##  $ FIREPLACE  : int  0 0 0 1 0 1 0 0 1 0 ...
##  $ REMODEL    : chr  "None" "Recent" "None" "None" ...
# Columns offered to the models as explanatory variables.
predictors <- c(
  "LOT.SQFT", "YR.BUILT", "GROSS.AREA", "LIVING.AREA", "FLOORS", "ROOMS",
  "BEDROOMS", "FULL.BATH", "HALF.BATH", "KITCHEN", "FIREPLACE", "REMODEL"
)
# Work with the first 1000 records, response column first.
housing.df <- housing.df[seq_len(1000), c(outcom, predictors)]

# 60% of the rows go to training, the rest to the holdout set. The seed makes
# the partition reproducible.
set.seed(1)
idx <- createDataPartition(y = housing.df[[outcom]], p = 0.6, list = FALSE)
train.df <- housing.df[idx, ]
holdout.df <- housing.df[-idx, ]

# Print numbers in fixed notation instead of scientific notation.
options(scipen = 999)
# Regress TOTAL.VALUE on every other column of the training set.
housing.lm <- lm(formula = TOTAL.VALUE ~ ., data = train.df)
summary(object = housing.lm)
## 
## Call:
## lm(formula = TOTAL.VALUE ~ ., data = train.df)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -95.727 -18.289  -1.575  18.677  90.987 
## 
## Coefficients:
##                    Estimate    Std. Error t value             Pr(>|t|)    
## (Intercept)   -1271.4347418   120.4002636 -10.560 < 0.0000000000000002 ***
## LOT.SQFT          0.0062462     0.0005326  11.728 < 0.0000000000000002 ***
## YR.BUILT          0.6845956     0.0615966  11.114 < 0.0000000000000002 ***
## GROSS.AREA        0.0286970     0.0030623   9.371 < 0.0000000000000002 ***
## LIVING.AREA       0.0391289     0.0052851   7.404   0.0000000000004623 ***
## FLOORS           24.7398816     3.1436691   7.870   0.0000000000000172 ***
## ROOMS             0.6787551     1.4763820   0.460             0.645871    
## BEDROOMS          1.1239237     2.2794705   0.493             0.622152    
## FULL.BATH        17.2673662     3.2432597   5.324   0.0000001446830437 ***
## HALF.BATH        10.2177861     2.8843747   3.542             0.000428 ***
## KITCHEN           8.4066697     5.9528691   1.412             0.158420    
## FIREPLACE         1.2343088     2.2050379   0.560             0.575851    
## REMODELOld        6.7309615     4.5966832   1.464             0.143645    
## REMODELRecent    28.9446681     4.0690805   7.113   0.0000000000033110 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 28.97 on 587 degrees of freedom
## Multiple R-squared:  0.8012, Adjusted R-squared:  0.7968 
## F-statistic: 181.9 on 13 and 587 DF,  p-value: < 0.00000000000000022
# Score the holdout set with the fitted linear model.
pred <- predict(housing.lm, newdata = holdout.df)

# Show the first 20 holdout records: prediction, actual value and their
# difference (actual minus predicted), with minimal digits.
options(scipen = 999, digits = 1)
head(
  data.frame(
    Predicted = pred,
    Actual = holdout.df$TOTAL.VALUE,
    Residual = holdout.df$TOTAL.VALUE - pred
  ),
  n = 20
)
##    Predicted Actual Residual
## 1        293    344       51
## 11       324    313      -11
## 13       249    316       66
## 16       259    298       40
## 17       278    313       35
## 19       277    331       53
## 21       242    318       75
## 22       275    331       55
## 26       287    346       59
## 28       261    317       56
## 29       220    247       27
## 31       289    329       40
## 33       324    280      -44
## 36       298    336       38
## 39       263    240      -23
## 41       404    432       27
## 42       304    350       46
## 44       306    358       52
## 45       312    346       34
## 46       474    491       17
options(scipen = 999, digits = 3)
# Error metrics of the linear model on the training and holdout sets, one row
# per data set.
do.call(
  rbind,
  list(
    Training = mlba::regressionSummary(
      predict(housing.lm, train.df),
      train.df$TOTAL.VALUE
    ),
    Holdout = mlba::regressionSummary(pred, holdout.df$TOTAL.VALUE)
  )
)
##          ME               RMSE MAE 
## Training 0.00000000000352 28.6 22.5
## Holdout  -2.05            33.5 25.8
library(ggplot2)
library(caret)
# Holdout residuals (actual minus predicted) of the linear model.
pred <- predict(housing.lm, newdata = holdout.df)
all.residuals <- holdout.df$TOTAL.VALUE - pred

# Histogram of the holdout residuals.
ggplot(data.frame(residual = all.residuals), aes(x = residual)) +
  geom_histogram(fill = "lightgray", colour = "grey") +
  labs(x = "Residuals", y = "Frequency")
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

# Linear regression on all predictors, assessed with 5-fold cross-validation.
# NOTE(review): the model is fit on housing.df, which still contains the rows
# of holdout.df, so holdout metrics computed for this model later are not
# out-of-sample. Consider data = train.df and regenerate the recorded output.
set.seed(1)
trControl <- caret::trainControl(
  method = "cv",
  number = 5,
  allowParallel = TRUE
)
model <- caret::train(
  TOTAL.VALUE ~ .,
  data = housing.df,
  method = "lm",
  trControl = trControl
)
model
## Linear Regression 
## 
## 1000 samples
##   12 predictor
## 
## No pre-processing
## Resampling: Cross-Validated (5 fold) 
## Summary of sample sizes: 801, 799, 800, 800, 800 
## Resampling results:
## 
##   RMSE  Rsquared  MAE 
##   31.1  0.765     24.1
## 
## Tuning parameter 'intercept' was held constant at a value of TRUE
# Coefficients of the model caret refit after cross-validation.
stats::coef(model$finalModel)
##   (Intercept)      LOT.SQFT      YR.BUILT    GROSS.AREA   LIVING.AREA 
##   -1212.50467       0.00626       0.65851       0.02625       0.03854 
##        FLOORS         ROOMS      BEDROOMS     FULL.BATH     HALF.BATH 
##      25.06836       0.73489       1.80020      16.18717       9.56406 
##       KITCHEN     FIREPLACE    REMODELOld REMODELRecent 
##       5.66075       1.18755       8.40823      28.06191
library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr     1.1.2     ✔ readr     2.1.4
## ✔ forcats   1.0.0     ✔ stringr   1.5.0
## ✔ lubridate 1.9.2     ✔ tibble    3.2.1
## ✔ purrr     1.0.1     ✔ tidyr     1.3.0
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
## ✖ purrr::lift()   masks caret::lift()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
# Gather cross-validation, training and holdout error metrics for a caret model.
#
# Arguments:
#   model        Fitted caret::train object. Its `results` table must contain
#                RMSE and MAE columns.
#   train.df     Data frame scored for the "Training" metrics.
#   holdout.df   Data frame scored for the "Holdout" metrics.
#   nPredictors  Number of predictors to report. When omitted or NULL it is
#                the number of coefficients of model$finalModel minus one.
#   outcome      Name of the response column in train.df and holdout.df.
#
# Returns the column-wise combination (cbind) of the RMSE and MAE of the
# resampling row with the lowest RMSE, the mlba::regressionSummary() of each
# data set, and nPredictors.
collectMetrics <- function(model, train.df, holdout.df, nPredictors = NULL,
                           outcome = "TOTAL.VALUE") {
  if (is.null(nPredictors)) {
    # Discount one coefficient so the intercept is not counted as a predictor.
    nPredictors <- length(coef(model$finalModel)) - 1
  }

  # with_ties = FALSE guarantees a single row even when several resampling
  # rows share the lowest RMSE, so the combined result stays one row.
  best.cv <- model$results %>%
    slice_min(RMSE, n = 1, with_ties = FALSE) %>%
    dplyr::select(c(RMSE, MAE))

  cbind(
    CV = best.cv,
    Training = mlba::regressionSummary(
      predict(model, train.df),
      train.df[[outcome]]
    ),
    Holdout = mlba::regressionSummary(
      predict(model, holdout.df),
      holdout.df[[outcome]]
    ),
    nPredictors = nPredictors
  )
}

# Metrics of the model that uses every predictor.
metric.full <- collectMetrics(
  model = model,
  train.df = train.df,
  holdout.df = holdout.df
)

# Predictions for the first three records.
predict(model, newdata = head(housing.df, 3))
##   1   2   3 
## 295 408 287
library(leaps)
library(fastDummies)
## Thank you for using fastDummies!
## To acknowledge our work, please cite the package:
## Kaplan, J. & Schlegel, B. (2023). fastDummies: Fast Creation of Dummy (Binary) Columns and Rows from Categorical Variables. Version 1.7.1. URL: https://github.com/jacobkap/fastDummies, https://jacobkap.github.io/fastDummies/.
# regsubsets works on numeric columns, so expand the categorical columns of
# the training set into dummies (first level dropped, source column removed).
leaps.train.df <- fastDummies::dummy_cols(
  train.df,
  remove_first_dummy = TRUE,
  remove_selected_columns = TRUE
)

# Exhaustive search: keep the single best model for each number of predictors.
search <- leaps::regsubsets(
  TOTAL.VALUE ~ .,
  data = leaps.train.df,
  nbest = 1,
  nvmax = ncol(leaps.train.df),
  method = "exhaustive"
)

# NOTE(review): `sum` and `search` shadow base::sum() and base::search().
# Function calls still resolve to the base functions, but the names are easy
# to misread.
sum <- summary(search)
# One row per model size; TRUE marks the predictors included in that model.
sum$which
##    (Intercept) LOT.SQFT YR.BUILT GROSS.AREA LIVING.AREA FLOORS ROOMS BEDROOMS
## 1         TRUE    FALSE    FALSE      FALSE        TRUE  FALSE FALSE    FALSE
## 2         TRUE     TRUE    FALSE      FALSE        TRUE  FALSE FALSE    FALSE
## 3         TRUE    FALSE     TRUE       TRUE        TRUE  FALSE FALSE    FALSE
## 4         TRUE     TRUE     TRUE       TRUE        TRUE  FALSE FALSE    FALSE
## 5         TRUE     TRUE     TRUE       TRUE        TRUE   TRUE FALSE    FALSE
## 6         TRUE     TRUE     TRUE       TRUE        TRUE   TRUE FALSE    FALSE
## 7         TRUE     TRUE     TRUE       TRUE        TRUE   TRUE FALSE    FALSE
## 8         TRUE     TRUE     TRUE       TRUE        TRUE   TRUE FALSE    FALSE
## 9         TRUE     TRUE     TRUE       TRUE        TRUE   TRUE FALSE    FALSE
## 10        TRUE     TRUE     TRUE       TRUE        TRUE   TRUE FALSE    FALSE
## 11        TRUE     TRUE     TRUE       TRUE        TRUE   TRUE  TRUE    FALSE
## 12        TRUE     TRUE     TRUE       TRUE        TRUE   TRUE FALSE     TRUE
## 13        TRUE     TRUE     TRUE       TRUE        TRUE   TRUE  TRUE     TRUE
##    FULL.BATH HALF.BATH KITCHEN FIREPLACE REMODEL_Old REMODEL_Recent
## 1      FALSE     FALSE   FALSE     FALSE       FALSE          FALSE
## 2      FALSE     FALSE   FALSE     FALSE       FALSE          FALSE
## 3      FALSE     FALSE   FALSE     FALSE       FALSE          FALSE
## 4      FALSE     FALSE   FALSE     FALSE       FALSE          FALSE
## 5      FALSE     FALSE   FALSE     FALSE       FALSE          FALSE
## 6      FALSE     FALSE   FALSE     FALSE       FALSE           TRUE
## 7       TRUE     FALSE   FALSE     FALSE       FALSE           TRUE
## 8       TRUE      TRUE   FALSE     FALSE       FALSE           TRUE
## 9       TRUE      TRUE    TRUE     FALSE       FALSE           TRUE
## 10      TRUE      TRUE    TRUE     FALSE        TRUE           TRUE
## 11      TRUE      TRUE    TRUE     FALSE        TRUE           TRUE
## 12      TRUE      TRUE    TRUE      TRUE        TRUE           TRUE
## 13      TRUE      TRUE    TRUE      TRUE        TRUE           TRUE
# Fit statistics of the best model for each number of predictors.
sum$rsq
##  [1] 0.596 0.646 0.700 0.736 0.767 0.787 0.794 0.799 0.800 0.801 0.801 0.801
## [13] 0.801
sum$adjr2
##  [1] 0.595 0.645 0.698 0.734 0.765 0.785 0.791 0.797 0.797 0.797 0.797 0.797
## [13] 0.797
sum$cp
##  [1] 597.07 449.74 293.98 189.41  99.19  42.56  23.46   9.73   9.37   9.20
## [11]  10.54  12.21  14.00

# Choose the model size with the smallest Cp value.
optimal <- which.min(sum$cp)
# Selection matrix without the intercept column.
X <- summary(search)$which[, -1]
# The search ran on dummy columns named "<column>_<level>" (e.g. REMODEL_Old),
# which do not exist in housing.df. Map each selected dummy back to its source
# column so xvars can index the original data. A categorical column is kept as
# a whole as soon as any of its dummies was selected.
xvars <- local({
  selected <- colnames(X)[X[optimal, ]]
  is.categorical <- vapply(
    train.df,
    function(column) is.character(column) || is.factor(column),
    logical(1)
  )
  for (source.column in names(train.df)[is.categorical]) {
    selected[startsWith(selected, paste0(source.column, "_"))] <- source.column
  }
  unique(selected)
})
# Refit the linear regression on the predictors chosen by the exhaustive
# search, again assessed with 5-fold cross-validation.
# NOTE(review): the model is fit on housing.df, which still contains the rows
# of holdout.df, so holdout metrics computed for this model later are not
# out-of-sample. Consider train.df and regenerate the recorded output.
set.seed(1)
trControl <- caret::trainControl(
  method = "cv",
  number = 5,
  allowParallel = TRUE
)
model <- caret::train(
  TOTAL.VALUE ~ .,
  data = housing.df[, c("TOTAL.VALUE", xvars)],
  method = "lm",
  trControl = trControl
)
model
## Linear Regression 
## 
## 1000 samples
##    9 predictor
## 
## No pre-processing
## Resampling: Cross-Validated (5 fold) 
## Summary of sample sizes: 801, 799, 800, 800, 800 
## Resampling results:
## 
##   RMSE  Rsquared  MAE 
##   31    0.766     24.1
## 
## Tuning parameter 'intercept' was held constant at a value of TRUE
# Coefficients of the reduced model caret refit after cross-validation.
stats::coef(model$finalModel)
##   (Intercept)      LOT.SQFT      YR.BUILT    GROSS.AREA   LIVING.AREA 
##   -1216.68115       0.00627       0.66325       0.02650       0.04007 
##        FLOORS     FULL.BATH     HALF.BATH       KITCHEN    REMODELOld 
##      25.40125      17.11530      10.24455       6.39764       8.65626 
## REMODELRecent 
##      28.26329
# Metrics of the model built on the exhaustive-search predictors.
metric.exhaustive <- collectMetrics(
  model = model,
  train.df = train.df,
  holdout.df = holdout.df
)

# Backward elimination driven by AIC; no resampling is requested.
trControl <- caret::trainControl(method = "none")
model <- caret::train(
  TOTAL.VALUE ~ .,
  data = train.df,
  trControl = trControl,
  method = "glmStepAIC",
  direction = "backward"
)
## Start:  AIC=5768
## .outcome ~ LOT.SQFT + YR.BUILT + GROSS.AREA + LIVING.AREA + FLOORS + 
##     ROOMS + BEDROOMS + FULL.BATH + HALF.BATH + KITCHEN + FIREPLACE + 
##     REMODELOld + REMODELRecent
## 
##                 Df Deviance  AIC
## - ROOMS          1   492967 5766
## - BEDROOMS       1   492994 5766
## - FIREPLACE      1   493053 5766
## <none>               492790 5768
## - KITCHEN        1   494464 5768
## - REMODELOld     1   494590 5768
## - HALF.BATH      1   503325 5779
## - FULL.BATH      1   516586 5794
## - REMODELRecent  1   535268 5816
## - LIVING.AREA    1   538806 5819
## - FLOORS         1   544783 5826
## - GROSS.AREA     1   566511 5850
## - YR.BUILT       1   596490 5881
## - LOT.SQFT       1   608258 5892
## 
## Step:  AIC=5766
## .outcome ~ LOT.SQFT + YR.BUILT + GROSS.AREA + LIVING.AREA + FLOORS + 
##     BEDROOMS + FULL.BATH + HALF.BATH + KITCHEN + FIREPLACE + 
##     REMODELOld + REMODELRecent
## 
##                 Df Deviance  AIC
## - FIREPLACE      1   493257 5764
## - BEDROOMS       1   493504 5765
## <none>               492967 5766
## - REMODELOld     1   494782 5766
## - KITCHEN        1   494793 5766
## - HALF.BATH      1   504128 5777
## - FULL.BATH      1   517506 5793
## - REMODELRecent  1   535768 5814
## - FLOORS         1   544955 5824
## - LIVING.AREA    1   545135 5824
## - GROSS.AREA     1   566566 5848
## - YR.BUILT       1   597340 5879
## - LOT.SQFT       1   609497 5892
## 
## Step:  AIC=5764
## .outcome ~ LOT.SQFT + YR.BUILT + GROSS.AREA + LIVING.AREA + FLOORS + 
##     BEDROOMS + FULL.BATH + HALF.BATH + KITCHEN + REMODELOld + 
##     REMODELRecent
## 
##                 Df Deviance  AIC
## - BEDROOMS       1   493796 5763
## <none>               493257 5764
## - KITCHEN        1   495047 5765
## - REMODELOld     1   495098 5765
## - HALF.BATH      1   504283 5776
## - FULL.BATH      1   517681 5791
## - REMODELRecent  1   536070 5812
## - FLOORS         1   545339 5823
## - LIVING.AREA    1   545743 5823
## - GROSS.AREA     1   572134 5852
## - YR.BUILT       1   611275 5891
## - LOT.SQFT       1   613496 5893
## 
## Step:  AIC=5763
## .outcome ~ LOT.SQFT + YR.BUILT + GROSS.AREA + LIVING.AREA + FLOORS + 
##     FULL.BATH + HALF.BATH + KITCHEN + REMODELOld + REMODELRecent
## 
##                 Df Deviance  AIC
## <none>               493796 5763
## - REMODELOld     1   495617 5763
## - KITCHEN        1   495715 5763
## - HALF.BATH      1   505628 5775
## - FULL.BATH      1   520435 5793
## - REMODELRecent  1   536747 5811
## - FLOORS         1   547545 5823
## - LIVING.AREA    1   553653 5830
## - GROSS.AREA     1   572364 5850
## - YR.BUILT       1   611602 5890
## - LOT.SQFT       1   613570 5892
# Coefficients of the model left after backward elimination.
stats::coef(model$finalModel)
##   (Intercept)      LOT.SQFT      YR.BUILT    GROSS.AREA   LIVING.AREA 
##   -1279.70645       0.00629       0.69082       0.02897       0.04039 
##        FLOORS     FULL.BATH     HALF.BATH       KITCHEN    REMODELOld 
##      25.01876      17.85302      10.62199       8.92141       6.76882 
## REMODELRecent 
##      29.07483
# Forward selection driven by AIC, starting from the intercept-only model.
model <- caret::train(
  TOTAL.VALUE ~ .,
  data = train.df,
  trControl = trControl,
  method = "glmStepAIC",
  direction = "forward"
)
## Start:  AIC=6713
## .outcome ~ 1
## 
##                 Df Deviance  AIC
## + LIVING.AREA    1  1002430 6171
## + GROSS.AREA     1  1210022 6284
## + ROOMS          1  1742812 6503
## + FULL.BATH      1  1803972 6524
## + BEDROOMS       1  1956883 6573
## + YR.BUILT       1  2003950 6587
## + LOT.SQFT       1  2041754 6598
## + HALF.BATH      1  2320169 6675
## + FLOORS         1  2338215 6680
## + REMODELRecent  1  2400231 6695
## + FIREPLACE      1  2407421 6697
## + KITCHEN        1  2410160 6698
## <none>              2478386 6713
## + REMODELOld     1  2478067 6715
## 
## Step:  AIC=6171
## .outcome ~ LIVING.AREA
## 
##                 Df Deviance  AIC
## + LOT.SQFT       1   877061 6092
## + YR.BUILT       1   883869 6097
## + GROSS.AREA     1   897905 6106
## + FULL.BATH      1   927387 6126
## + FIREPLACE      1   949137 6140
## + REMODELRecent  1   970678 6153
## + FLOORS         1   979511 6159
## + HALF.BATH      1   980085 6159
## + BEDROOMS       1   994459 6168
## + ROOMS          1   995184 6168
## + KITCHEN        1   995690 6169
## <none>              1002430 6171
## + REMODELOld     1  1002057 6172
## 
## Step:  AIC=6092
## .outcome ~ LIVING.AREA + LOT.SQFT
## 
##                 Df Deviance  AIC
## + YR.BUILT       1   769374 6016
## + GROSS.AREA     1   793847 6034
## + FULL.BATH      1   807860 6045
## + FLOORS         1   820328 6054
## + REMODELRecent  1   843127 6071
## + FIREPLACE      1   846139 6073
## + HALF.BATH      1   857659 6081
## + BEDROOMS       1   864533 6086
## + ROOMS          1   871610 6091
## + KITCHEN        1   872310 6091
## <none>               877061 6092
## + REMODELOld     1   877016 6094
## 
## Step:  AIC=6016
## .outcome ~ LIVING.AREA + LOT.SQFT + YR.BUILT
## 
##                 Df Deviance  AIC
## + GROSS.AREA     1   655163 5921
## + FLOORS         1   678253 5942
## + REMODELRecent  1   712846 5972
## + FULL.BATH      1   719045 5977
## + BEDROOMS       1   759199 6010
## + KITCHEN        1   760858 6011
## + ROOMS          1   761111 6011
## + FIREPLACE      1   761892 6012
## + HALF.BATH      1   765194 6014
## <none>               769374 6016
## + REMODELOld     1   767633 6016
## 
## Step:  AIC=5921
## .outcome ~ LIVING.AREA + LOT.SQFT + YR.BUILT + GROSS.AREA
## 
##                 Df Deviance  AIC
## + FLOORS         1   577741 5847
## + REMODELRecent  1   608598 5879
## + FULL.BATH      1   622639 5892
## + BEDROOMS       1   644248 5913
## + ROOMS          1   645373 5914
## + HALF.BATH      1   649490 5918
## <none>               655163 5921
## + REMODELOld     1   654384 5922
## + KITCHEN        1   654939 5923
## + FIREPLACE      1   655006 5923
## 
## Step:  AIC=5847
## .outcome ~ LIVING.AREA + LOT.SQFT + YR.BUILT + GROSS.AREA + FLOORS
## 
##                 Df Deviance  AIC
## + REMODELRecent  1   528519 5796
## + FULL.BATH      1   552707 5823
## + ROOMS          1   571739 5843
## + BEDROOMS       1   573374 5845
## + KITCHEN        1   574827 5846
## <none>               577741 5847
## + REMODELOld     1   577139 5849
## + HALF.BATH      1   577326 5849
## + FIREPLACE      1   577625 5849
## 
## Step:  AIC=5796
## .outcome ~ LIVING.AREA + LOT.SQFT + YR.BUILT + GROSS.AREA + FLOORS + 
##     REMODELRecent
## 
##              Df Deviance  AIC
## + FULL.BATH   1   510808 5777
## + KITCHEN     1   523764 5792
## + ROOMS       1   524196 5793
## + BEDROOMS    1   525027 5794
## + REMODELOld  1   525681 5795
## <none>            528519 5796
## + HALF.BATH   1   527771 5797
## + FIREPLACE   1   528400 5798
## 
## Step:  AIC=5777
## .outcome ~ LIVING.AREA + LOT.SQFT + YR.BUILT + GROSS.AREA + FLOORS + 
##     REMODELRecent + FULL.BATH
## 
##              Df Deviance  AIC
## + HALF.BATH   1   497600 5764
## + KITCHEN     1   507995 5776
## + ROOMS       1   508227 5776
## + REMODELOld  1   508334 5776
## <none>            510808 5777
## + BEDROOMS    1   509195 5778
## + FIREPLACE   1   510675 5779
## 
## Step:  AIC=5764
## .outcome ~ LIVING.AREA + LOT.SQFT + YR.BUILT + GROSS.AREA + FLOORS + 
##     REMODELRecent + FULL.BATH + HALF.BATH
## 
##              Df Deviance  AIC
## + KITCHEN     1   495617 5763
## + REMODELOld  1   495715 5763
## <none>            497600 5764
## + ROOMS       1   496739 5765
## + BEDROOMS    1   496951 5765
## + FIREPLACE   1   497322 5765
## 
## Step:  AIC=5763
## .outcome ~ LIVING.AREA + LOT.SQFT + YR.BUILT + GROSS.AREA + FLOORS + 
##     REMODELRecent + FULL.BATH + HALF.BATH + KITCHEN
## 
##              Df Deviance  AIC
## + REMODELOld  1   493796 5763
## <none>            495617 5763
## + ROOMS       1   495051 5765
## + BEDROOMS    1   495098 5765
## + FIREPLACE   1   495300 5765
## 
## Step:  AIC=5763
## .outcome ~ LIVING.AREA + LOT.SQFT + YR.BUILT + GROSS.AREA + FLOORS + 
##     REMODELRecent + FULL.BATH + HALF.BATH + KITCHEN + REMODELOld
## 
##             Df Deviance  AIC
## <none>           493796 5763
## + ROOMS      1   493245 5764
## + BEDROOMS   1   493257 5764
## + FIREPLACE  1   493504 5765
# Coefficients of the model reached by forward selection.
stats::coef(model$finalModel)
##   (Intercept)   LIVING.AREA      LOT.SQFT      YR.BUILT    GROSS.AREA 
##   -1279.70645       0.04039       0.00629       0.69082       0.02897 
##        FLOORS REMODELRecent     FULL.BATH     HALF.BATH       KITCHEN 
##      25.01876      29.07483      17.85302      10.62199       8.92141 
##    REMODELOld 
##       6.76882
# Stepwise selection driven by AIC; terms may be dropped or re-added at each
# step.
model <- caret::train(
  TOTAL.VALUE ~ .,
  data = train.df,
  trControl = trControl,
  method = "glmStepAIC",
  direction = "both"
)
## Start:  AIC=5768
## .outcome ~ LOT.SQFT + YR.BUILT + GROSS.AREA + LIVING.AREA + FLOORS + 
##     ROOMS + BEDROOMS + FULL.BATH + HALF.BATH + KITCHEN + FIREPLACE + 
##     REMODELOld + REMODELRecent
## 
##                 Df Deviance  AIC
## - ROOMS          1   492967 5766
## - BEDROOMS       1   492994 5766
## - FIREPLACE      1   493053 5766
## <none>               492790 5768
## - KITCHEN        1   494464 5768
## - REMODELOld     1   494590 5768
## - HALF.BATH      1   503325 5779
## - FULL.BATH      1   516586 5794
## - REMODELRecent  1   535268 5816
## - LIVING.AREA    1   538806 5819
## - FLOORS         1   544783 5826
## - GROSS.AREA     1   566511 5850
## - YR.BUILT       1   596490 5881
## - LOT.SQFT       1   608258 5892
## 
## Step:  AIC=5766
## .outcome ~ LOT.SQFT + YR.BUILT + GROSS.AREA + LIVING.AREA + FLOORS + 
##     BEDROOMS + FULL.BATH + HALF.BATH + KITCHEN + FIREPLACE + 
##     REMODELOld + REMODELRecent
## 
##                 Df Deviance  AIC
## - FIREPLACE      1   493257 5764
## - BEDROOMS       1   493504 5765
## <none>               492967 5766
## - REMODELOld     1   494782 5766
## - KITCHEN        1   494793 5766
## + ROOMS          1   492790 5768
## - HALF.BATH      1   504128 5777
## - FULL.BATH      1   517506 5793
## - REMODELRecent  1   535768 5814
## - FLOORS         1   544955 5824
## - LIVING.AREA    1   545135 5824
## - GROSS.AREA     1   566566 5848
## - YR.BUILT       1   597340 5879
## - LOT.SQFT       1   609497 5892
## 
## Step:  AIC=5764
## .outcome ~ LOT.SQFT + YR.BUILT + GROSS.AREA + LIVING.AREA + FLOORS + 
##     BEDROOMS + FULL.BATH + HALF.BATH + KITCHEN + REMODELOld + 
##     REMODELRecent
## 
##                 Df Deviance  AIC
## - BEDROOMS       1   493796 5763
## <none>               493257 5764
## - KITCHEN        1   495047 5765
## - REMODELOld     1   495098 5765
## + FIREPLACE      1   492967 5766
## + ROOMS          1   493053 5766
## - HALF.BATH      1   504283 5776
## - FULL.BATH      1   517681 5791
## - REMODELRecent  1   536070 5812
## - FLOORS         1   545339 5823
## - LIVING.AREA    1   545743 5823
## - GROSS.AREA     1   572134 5852
## - YR.BUILT       1   611275 5891
## - LOT.SQFT       1   613496 5893
## 
## Step:  AIC=5763
## .outcome ~ LOT.SQFT + YR.BUILT + GROSS.AREA + LIVING.AREA + FLOORS + 
##     FULL.BATH + HALF.BATH + KITCHEN + REMODELOld + REMODELRecent
## 
##                 Df Deviance  AIC
## <none>               493796 5763
## - REMODELOld     1   495617 5763
## - KITCHEN        1   495715 5763
## + ROOMS          1   493245 5764
## + BEDROOMS       1   493257 5764
## + FIREPLACE      1   493504 5765
## - HALF.BATH      1   505628 5775
## - FULL.BATH      1   520435 5793
## - REMODELRecent  1   536747 5811
## - FLOORS         1   547545 5823
## - LIVING.AREA    1   553653 5830
## - GROSS.AREA     1   572364 5850
## - YR.BUILT       1   611602 5890
## - LOT.SQFT       1   613570 5892
# Coefficients of the model reached by bidirectional stepwise selection.
stats::coef(model$finalModel)
##   (Intercept)      LOT.SQFT      YR.BUILT    GROSS.AREA   LIVING.AREA 
##   -1279.70645       0.00629       0.69082       0.02897       0.04039 
##        FLOORS     FULL.BATH     HALF.BATH       KITCHEN    REMODELOld 
##      25.01876      17.85302      10.62199       8.92141       6.76882 
## REMODELRecent 
##      29.07483
# Error metrics of the stepwise model on the training and holdout sets.
rbind(
  Training = mlba::regressionSummary(
    predict(model, train.df),
    train.df$TOTAL.VALUE
  ),
  Holdout = mlba::regressionSummary(
    predict(model, holdout.df),
    holdout.df$TOTAL.VALUE
  )
)
##          ME               RMSE MAE 
## Training 0.00000000000335 28.7 22.5
## Holdout  -2.25            33.5 26
# The stepwise search kept the same predictors as the exhaustive search, so
# the exhaustive-search metrics are reused for the stepwise entry.
# NOTE(review): metric.exhaustive comes from a model fit on housing.df, while
# the stepwise models are fit on train.df, so these copied numbers are not the
# stepwise model's own metrics (compare the table above). Confirm this is
# intended.
metric.stepwise <- metric.exhaustive