housing.df <- mlba::WestRoxbury
names(housing.df)[3]<- "LOT.SIZE"  #change the column name if so desired
names(housing.df) <- tolower(names(housing.df)) #change to lowercase if so desired

#partition data
set.seed(1010)
library(caret)
## 载入需要的程辑包:ggplot2
## 载入需要的程辑包:lattice
idx <- createDataPartition(housing.df$total.value,p=.6,list=FALSE)
train.df <- housing.df[idx,]
holdout.df <- housing.df[-idx,]

library(fastDummies)
## Thank you for using fastDummies!
## To acknowledge our work, please cite the package:
## Kaplan, J. & Schlegel, B. (2023). fastDummies: Fast Creation of Dummy (Binary) Columns and Rows from Categorical Variables. Version 1.7.1. URL: https://github.com/jacobkap/fastDummies, https://jacobkap.github.io/fastDummies/.
housing.train.df <- dummy_cols(train.df,remove_first_dummy = TRUE,remove_selected_columns = TRUE)

library(leaps)
search <- regsubsets(total.value ~ lot.size+yr.built+gross.area+living.area+floors+rooms+bedrooms+full.bath+half.bath+kitchen+fireplace+remodel_Old+remodel_Recent ,data=housing.train.df,nbest = 1,nvmax = ncol(housing.train.df),method = "exhaustive")
sum <- summary(search)
#show models
sum$which
##    (Intercept) lot.size yr.built gross.area living.area floors rooms bedrooms
## 1         TRUE    FALSE    FALSE      FALSE        TRUE  FALSE FALSE    FALSE
## 2         TRUE     TRUE    FALSE      FALSE        TRUE  FALSE FALSE    FALSE
## 3         TRUE     TRUE    FALSE      FALSE        TRUE   TRUE FALSE    FALSE
## 4         TRUE     TRUE    FALSE       TRUE        TRUE   TRUE FALSE    FALSE
## 5         TRUE     TRUE    FALSE       TRUE        TRUE   TRUE FALSE    FALSE
## 6         TRUE     TRUE    FALSE       TRUE        TRUE   TRUE FALSE    FALSE
## 7         TRUE     TRUE    FALSE       TRUE        TRUE   TRUE FALSE    FALSE
## 8         TRUE     TRUE    FALSE       TRUE        TRUE   TRUE FALSE    FALSE
## 9         TRUE     TRUE    FALSE       TRUE        TRUE   TRUE FALSE     TRUE
## 10        TRUE     TRUE    FALSE       TRUE        TRUE   TRUE FALSE     TRUE
## 11        TRUE     TRUE     TRUE       TRUE        TRUE   TRUE FALSE     TRUE
## 12        TRUE     TRUE     TRUE       TRUE        TRUE   TRUE FALSE     TRUE
## 13        TRUE     TRUE     TRUE       TRUE        TRUE   TRUE  TRUE     TRUE
##    full.bath half.bath kitchen fireplace remodel_Old remodel_Recent
## 1      FALSE     FALSE   FALSE     FALSE       FALSE          FALSE
## 2      FALSE     FALSE   FALSE     FALSE       FALSE          FALSE
## 3      FALSE     FALSE   FALSE     FALSE       FALSE          FALSE
## 4      FALSE     FALSE   FALSE     FALSE       FALSE          FALSE
## 5      FALSE     FALSE   FALSE      TRUE       FALSE          FALSE
## 6      FALSE     FALSE   FALSE      TRUE       FALSE           TRUE
## 7       TRUE      TRUE   FALSE      TRUE       FALSE          FALSE
## 8       TRUE      TRUE   FALSE      TRUE       FALSE           TRUE
## 9       TRUE      TRUE   FALSE      TRUE       FALSE           TRUE
## 10      TRUE      TRUE    TRUE      TRUE       FALSE           TRUE
## 11      TRUE      TRUE    TRUE      TRUE       FALSE           TRUE
## 12      TRUE      TRUE    TRUE      TRUE        TRUE           TRUE
## 13      TRUE      TRUE    TRUE      TRUE        TRUE           TRUE
#show metrics
sum$rsq
##  [1] 0.7053867 0.7495648 0.7642047 0.7780689 0.7895952 0.8005015 0.8059147
##  [8] 0.8142845 0.8146123 0.8148203 0.8150152 0.8151666 0.8151910
sum$adjr2
##  [1] 0.7053021 0.7494208 0.7640014 0.7778136 0.7892926 0.8001571 0.8055238
##  [8] 0.8138568 0.8141319 0.8142869 0.8144290 0.8145274 0.8144984
sum$cp
##  [1] 2051.10593 1223.85127  951.04896  692.80866  478.45103  275.73187
##  [7]  176.12128   21.01544   16.86123   14.95847   13.29837   12.45710
## [13]   14.00000
# Regression Model using train.df
housing.lm <- lm(total.value ~lot.size+yr.built+gross.area+living.area+floors+bedrooms+full.bath+half.bath+kitchen+fireplace+remodel,data=train.df)
options(scipen = 999)
summary(housing.lm)
## 
## Call:
## lm(formula = total.value ~ lot.size + yr.built + gross.area + 
##     living.area + floors + bedrooms + full.bath + half.bath + 
##     kitchen + fireplace + remodel, data = train.df)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -259.817  -26.309    0.032   24.411  285.244 
## 
## Coefficients:
##                  Estimate  Std. Error t value            Pr(>|t|)    
## (Intercept)   -19.0575706  37.6264516  -0.506              0.6125    
## lot.size        0.0082582   0.0003005  27.479 <0.0000000000000002 ***
## yr.built        0.0373258   0.0186782   1.998              0.0458 *  
## gross.area      0.0314581   0.0020969  15.002 <0.0000000000000002 ***
## living.area     0.0554714   0.0038541  14.393 <0.0000000000000002 ***
## floors         39.3303752   2.1751942  18.081 <0.0000000000000002 ***
## bedrooms       -2.6057485   1.1552778  -2.256              0.0242 *  
## full.bath      21.7191989   1.7140035  12.672 <0.0000000000000002 ***
## half.bath      20.2846883   1.5791925  12.845 <0.0000000000000002 ***
## kitchen       -12.3137316   6.0575764  -2.033              0.0422 *  
## fireplace      17.7326973   1.3773722  12.874 <0.0000000000000002 ***
## remodelOld      4.2846300   2.5416915   1.686              0.0919 .  
## remodelRecent  26.9602049   2.1281861  12.668 <0.0000000000000002 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 43.23 on 3470 degrees of freedom
## Multiple R-squared:  0.8152, Adjusted R-squared:  0.8145 
## F-statistic:  1275 on 12 and 3470 DF,  p-value: < 0.00000000000000022
# use predict() to make predictions on a new set.
pred <- predict(housing.lm,holdout.df)


data.frame(
   'Predicted' = pred[1:20],
   'Actual' = holdout.df$total.value[1:20],
   'Residual' = holdout.df$total.value[1:20] - pred[1:20]
)
##    Predicted Actual     Residual
## 2   461.9532  412.6  -49.3532273
## 3   359.5688  330.1  -29.4688198
## 4   544.4203  498.6  -45.8203118
## 7   403.2800  359.4  -43.8799557
## 10  510.8184  409.4 -101.4183522
## 13  313.6309  315.5    1.8690630
## 15  344.1066  326.2  -17.9065647
## 18  361.4325  344.9  -16.5325461
## 19  331.0687  330.7   -0.3687411
## 21  281.9881  317.5   35.5119356
## 22  335.6481  330.8   -4.8480914
## 24  496.0637  414.7  -81.3636809
## 25  346.4231  318.8  -27.6230632
## 26  342.7774  346.2    3.4226491
## 27  259.4506  245.1  -14.3506429
## 29  266.2893  247.3  -18.9893018
## 32  319.4732  293.6  -25.8732379
## 35  396.8875  337.2  -59.6874567
## 36  359.3556  336.5  -22.8556078
## 38  354.3208  296.5  -57.8208412
rbind(
  Training=mlba::regressionSummary(predict(housing.lm,train.df),train.df$total.value),Holdout=mlba::regressionSummary(pred,holdout.df$total.value)
)
##          ME                    RMSE     MAE     
## Training -0.000000000003965788 43.1501  32.42148
## Holdout  0.3522007             42.47474 32.55664
library(ggplot2)
pred <- predict(housing.lm, holdout.df)
all.residuals <- holdout.df$total.value - pred

ggplot() + geom_histogram(aes(x=all.residuals), fill="lightgray", color="grey") +
     labs(x="Residuals", Y="Frequency")
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.