housing.df <- mlba::WestRoxbury
names(housing.df)[3]<- "LOT.SIZE" #change the column name if so desired
names(housing.df) <- tolower(names(housing.df)) #change to lowercase if so desired
#partition data
set.seed(1010)
library(caret)
## 载入需要的程辑包:ggplot2
## 载入需要的程辑包:lattice
idx <- createDataPartition(housing.df$total.value,p=.6,list=FALSE)
train.df <- housing.df[idx,]
holdout.df <- housing.df[-idx,]
library(fastDummies)
## Thank you for using fastDummies!
## To acknowledge our work, please cite the package:
## Kaplan, J. & Schlegel, B. (2023). fastDummies: Fast Creation of Dummy (Binary) Columns and Rows from Categorical Variables. Version 1.7.1. URL: https://github.com/jacobkap/fastDummies, https://jacobkap.github.io/fastDummies/.
housing.train.df <- dummy_cols(train.df,remove_first_dummy = TRUE,remove_selected_columns = TRUE)
library(leaps)
search <- regsubsets(total.value ~ lot.size+yr.built+gross.area+living.area+floors+rooms+bedrooms+full.bath+half.bath+kitchen+fireplace+remodel_Old+remodel_Recent ,data=housing.train.df,nbest = 1,nvmax = ncol(housing.train.df),method = "exhaustive")
sum <- summary(search)
#show models
sum$which
## (Intercept) lot.size yr.built gross.area living.area floors rooms bedrooms
## 1 TRUE FALSE FALSE FALSE TRUE FALSE FALSE FALSE
## 2 TRUE TRUE FALSE FALSE TRUE FALSE FALSE FALSE
## 3 TRUE TRUE FALSE FALSE TRUE TRUE FALSE FALSE
## 4 TRUE TRUE FALSE TRUE TRUE TRUE FALSE FALSE
## 5 TRUE TRUE FALSE TRUE TRUE TRUE FALSE FALSE
## 6 TRUE TRUE FALSE TRUE TRUE TRUE FALSE FALSE
## 7 TRUE TRUE FALSE TRUE TRUE TRUE FALSE FALSE
## 8 TRUE TRUE FALSE TRUE TRUE TRUE FALSE FALSE
## 9 TRUE TRUE FALSE TRUE TRUE TRUE FALSE TRUE
## 10 TRUE TRUE FALSE TRUE TRUE TRUE FALSE TRUE
## 11 TRUE TRUE TRUE TRUE TRUE TRUE FALSE TRUE
## 12 TRUE TRUE TRUE TRUE TRUE TRUE FALSE TRUE
## 13 TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE
## full.bath half.bath kitchen fireplace remodel_Old remodel_Recent
## 1 FALSE FALSE FALSE FALSE FALSE FALSE
## 2 FALSE FALSE FALSE FALSE FALSE FALSE
## 3 FALSE FALSE FALSE FALSE FALSE FALSE
## 4 FALSE FALSE FALSE FALSE FALSE FALSE
## 5 FALSE FALSE FALSE TRUE FALSE FALSE
## 6 FALSE FALSE FALSE TRUE FALSE TRUE
## 7 TRUE TRUE FALSE TRUE FALSE FALSE
## 8 TRUE TRUE FALSE TRUE FALSE TRUE
## 9 TRUE TRUE FALSE TRUE FALSE TRUE
## 10 TRUE TRUE TRUE TRUE FALSE TRUE
## 11 TRUE TRUE TRUE TRUE FALSE TRUE
## 12 TRUE TRUE TRUE TRUE TRUE TRUE
## 13 TRUE TRUE TRUE TRUE TRUE TRUE
#show metrics
sum$rsq
## [1] 0.7053867 0.7495648 0.7642047 0.7780689 0.7895952 0.8005015 0.8059147
## [8] 0.8142845 0.8146123 0.8148203 0.8150152 0.8151666 0.8151910
sum$adjr2
## [1] 0.7053021 0.7494208 0.7640014 0.7778136 0.7892926 0.8001571 0.8055238
## [8] 0.8138568 0.8141319 0.8142869 0.8144290 0.8145274 0.8144984
sum$cp
## [1] 2051.10593 1223.85127 951.04896 692.80866 478.45103 275.73187
## [7] 176.12128 21.01544 16.86123 14.95847 13.29837 12.45710
## [13] 14.00000
# Regression Model using train.df
housing.lm <- lm(total.value ~lot.size+yr.built+gross.area+living.area+floors+bedrooms+full.bath+half.bath+kitchen+fireplace+remodel,data=train.df)
options(scipen = 999)
summary(housing.lm)
##
## Call:
## lm(formula = total.value ~ lot.size + yr.built + gross.area +
## living.area + floors + bedrooms + full.bath + half.bath +
## kitchen + fireplace + remodel, data = train.df)
##
## Residuals:
## Min 1Q Median 3Q Max
## -259.817 -26.309 0.032 24.411 285.244
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -19.0575706 37.6264516 -0.506 0.6125
## lot.size 0.0082582 0.0003005 27.479 <0.0000000000000002 ***
## yr.built 0.0373258 0.0186782 1.998 0.0458 *
## gross.area 0.0314581 0.0020969 15.002 <0.0000000000000002 ***
## living.area 0.0554714 0.0038541 14.393 <0.0000000000000002 ***
## floors 39.3303752 2.1751942 18.081 <0.0000000000000002 ***
## bedrooms -2.6057485 1.1552778 -2.256 0.0242 *
## full.bath 21.7191989 1.7140035 12.672 <0.0000000000000002 ***
## half.bath 20.2846883 1.5791925 12.845 <0.0000000000000002 ***
## kitchen -12.3137316 6.0575764 -2.033 0.0422 *
## fireplace 17.7326973 1.3773722 12.874 <0.0000000000000002 ***
## remodelOld 4.2846300 2.5416915 1.686 0.0919 .
## remodelRecent 26.9602049 2.1281861 12.668 <0.0000000000000002 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 43.23 on 3470 degrees of freedom
## Multiple R-squared: 0.8152, Adjusted R-squared: 0.8145
## F-statistic: 1275 on 12 and 3470 DF, p-value: < 0.00000000000000022
# use predict() to make predictions on a new set.
pred <- predict(housing.lm,holdout.df)
data.frame(
'Predicted' = pred[1:20],
'Actual' = holdout.df$total.value[1:20],
'Residual' = holdout.df$total.value[1:20] - pred[1:20]
)
## Predicted Actual Residual
## 2 461.9532 412.6 -49.3532273
## 3 359.5688 330.1 -29.4688198
## 4 544.4203 498.6 -45.8203118
## 7 403.2800 359.4 -43.8799557
## 10 510.8184 409.4 -101.4183522
## 13 313.6309 315.5 1.8690630
## 15 344.1066 326.2 -17.9065647
## 18 361.4325 344.9 -16.5325461
## 19 331.0687 330.7 -0.3687411
## 21 281.9881 317.5 35.5119356
## 22 335.6481 330.8 -4.8480914
## 24 496.0637 414.7 -81.3636809
## 25 346.4231 318.8 -27.6230632
## 26 342.7774 346.2 3.4226491
## 27 259.4506 245.1 -14.3506429
## 29 266.2893 247.3 -18.9893018
## 32 319.4732 293.6 -25.8732379
## 35 396.8875 337.2 -59.6874567
## 36 359.3556 336.5 -22.8556078
## 38 354.3208 296.5 -57.8208412
rbind(
Training=mlba::regressionSummary(predict(housing.lm,train.df),train.df$total.value),Holdout=mlba::regressionSummary(pred,holdout.df$total.value)
)
## ME RMSE MAE
## Training -0.000000000003965788 43.1501 32.42148
## Holdout 0.3522007 42.47474 32.55664
library(ggplot2)
pred <- predict(housing.lm, holdout.df)
all.residuals <- holdout.df$total.value - pred
ggplot() + geom_histogram(aes(x=all.residuals), fill="lightgray", color="grey") +
labs(x="Residuals", Y="Frequency")
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
