The column remodel is a categorial variable with three options {None, Old, Recent}. It needs to be converted into dummy variables. library(dummies) is used for this purpose. As shown below, we will select specific columns as predictors.
df <- dummy.data.frame(df, sep = ".")
names(df)
## [1] "total.value" "tax" "lot.size" "yr.built"
## [5] "gross.area" "living.area" "floors" "rooms"
## [9] "bedrooms" "full.bath" "half.bath" "kitchen"
## [13] "fireplace" "remodel.None" "remodel.Old" "remodel.Recent"
cols <- c("total.value", "lot.size", "gross.area", "living.area", "floors")
cols <- c(cols, "rooms", "bedrooms","remodel.Old", "remodel.Recent")
df<- df[, cols]
##remove categorical columns (remodel.Old & remodel.Recent for heat map
df1 <- df[ , -(8:9) ]
par(cex.main=1)
heatmap.2(cor(df1), Rowv = FALSE, Colv = FALSE, dendrogram = "none",
cellnote = round(cor(df1),2),main = "Correlation values",
cexRow = 1, cexCol = 1, notecol = "black", key = FALSE,
trace = 'none', margins = c(10,10))

##regression with all predictors included
reg1 <- lm(total.value ~ ., data = df)
# use options() to ensure numbers are not displayed in scientific notation.
options(scipen = 999)
summary(reg1)
##
## Call:
## lm(formula = total.value ~ ., data = df)
##
## Residuals:
## Min 1Q Median 3Q Max
## -295.376 -27.668 -0.416 26.157 300.560
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 45.8630929 3.5945519 12.759 < 0.0000000000000002 ***
## lot.size 0.0089850 0.0002551 35.218 < 0.0000000000000002 ***
## gross.area 0.0327789 0.0016753 19.566 < 0.0000000000000002 ***
## living.area 0.0628466 0.0031160 20.169 < 0.0000000000000002 ***
## floors 42.5133205 1.6831957 25.258 < 0.0000000000000002 ***
## rooms 2.3119471 0.6872242 3.364 0.000773 ***
## bedrooms -0.4945594 1.0599964 -0.467 0.640827
## remodel.Old 2.2498146 2.0382432 1.104 0.269725
## remodel.Recent 26.6510108 1.7315189 15.392 < 0.0000000000000002 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 45.93 on 5793 degrees of freedom
## Multiple R-squared: 0.7858, Adjusted R-squared: 0.7855
## F-statistic: 2657 on 8 and 5793 DF, p-value: < 0.00000000000000022
###########################################
###Partitioning the dataset
set.seed(1) # set seed for reproducing the partition
train.rows <- sample(1:nrow(df), 3500)
train.df <- df[train.rows, ]
valid.df <- df[-train.rows, ]
reg2 <- lm(total.value ~ ., data = train.df)
summary(reg2)
##
## Call:
## lm(formula = total.value ~ ., data = train.df)
##
## Residuals:
## Min 1Q Median 3Q Max
## -232.123 -27.938 -0.602 25.863 195.057
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 39.0323360 4.5350984 8.607 < 0.0000000000000002 ***
## lot.size 0.0093371 0.0003184 29.325 < 0.0000000000000002 ***
## gross.area 0.0323227 0.0021283 15.187 < 0.0000000000000002 ***
## living.area 0.0596082 0.0039330 15.156 < 0.0000000000000002 ***
## floors 43.5623795 2.1186645 20.561 < 0.0000000000000002 ***
## rooms 3.6762897 0.8799811 4.178 0.0000302 ***
## bedrooms -0.4024796 1.3831603 -0.291 0.771
## remodel.Old 2.8335024 2.6166912 1.083 0.279
## remodel.Recent 27.1272282 2.1854218 12.413 < 0.0000000000000002 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 45.47 on 3491 degrees of freedom
## Multiple R-squared: 0.797, Adjusted R-squared: 0.7965
## F-statistic: 1713 on 8 and 3491 DF, p-value: < 0.00000000000000022
lm.pred <- predict(reg2, valid.df)
options(scipen=999, digits = 0)
some.residuals <- valid.df$total.value[1:20] - lm.pred[1:20]
data.frame("Actual" = valid.df$total.value[1:20],"Predicted" = lm.pred[1:20], "Residual" = some.residuals)
## Actual Predicted Residual
## 2 413 468 -56
## 5 332 360 -28
## 6 337 286 51
## 8 320 340 -19
## 9 334 339 -6
## 10 409 536 -126
## 11 313 345 -32
## 12 344 367 -23
## 13 316 342 -26
## 16 298 274 24
## 20 348 424 -76
## 21 318 311 7
## 23 358 401 -43
## 24 415 491 -77
## 25 319 358 -39
## 31 329 348 -19
## 33 280 444 -164
## 35 337 419 -81
## 42 350 386 -36
## 43 344 424 -80
options(scipen=999, digits = 3)
# use accuracy() to compute common accuracy measures.
t(accuracy(lm.pred, valid.df$total.value))
## Test set
## ME -1.05
## RMSE 46.71
## MAE 34.85
## MPE -1.53
## MAPE 9.07
all.residuals <- valid.df$total.value - lm.pred
df2 <- data.frame(all.residuals, lm.pred)
h1 <- ggplot(df2, aes(x= all.residuals)) +
geom_histogram(fill = "blue", col = "red", binwidth = 20)
h1 <- h1 + labs(x = "residuals", y = "count", title = "Histogram of residuals")
h1

p1 <- ggplot(df2, aes(x = lm.pred, y = all.residuals)) + geom_point()
p1 <- p1 + labs(x = "predicted values", y = "residuals", title = "Residuals vs. predicted values")
p1 <- p1+ geom_smooth()
p1

search <- regsubsets(total.value ~ ., data = train.df, nbest = 1, nvmax = dim(train.df)[2],
method = "exhaustive")
sum <- summary(search)
# show models
t(sum$which)
## 1 2 3 4 5 6 7 8
## (Intercept) TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE
## lot.size FALSE TRUE TRUE TRUE TRUE TRUE TRUE TRUE
## gross.area FALSE FALSE FALSE TRUE TRUE TRUE TRUE TRUE
## living.area TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE
## floors FALSE FALSE TRUE TRUE TRUE TRUE TRUE TRUE
## rooms FALSE FALSE FALSE FALSE FALSE TRUE TRUE TRUE
## bedrooms FALSE FALSE FALSE FALSE FALSE FALSE FALSE TRUE
## remodel.Old FALSE FALSE FALSE FALSE FALSE FALSE TRUE TRUE
## remodel.Recent FALSE FALSE FALSE FALSE TRUE TRUE TRUE TRUE
# show metrics
sum$rsq
## [1] 0.705 0.753 0.772 0.787 0.796 0.797 0.797 0.797
sum$adjr2
## [1] 0.705 0.753 0.772 0.786 0.795 0.797 0.797 0.797
##Find the model with the highest adjusted R2.
x <- which(sum$adjr2== max(sum$adjr2))
t(t(sum$which[x,]))
## [,1]
## (Intercept) TRUE
## lot.size TRUE
## gross.area TRUE
## living.area TRUE
## floors TRUE
## rooms TRUE
## bedrooms FALSE
## remodel.Old TRUE
## remodel.Recent TRUE
# use step() to run stepwise regression.
reg.step <- step(reg1, direction = "backward")
## Start: AIC=44419
## total.value ~ lot.size + gross.area + living.area + floors +
## rooms + bedrooms + remodel.Old + remodel.Recent
##
## Df Sum of Sq RSS AIC
## - bedrooms 1 459 12222339 44418
## - remodel.Old 1 2570 12224451 44419
## <none> 12221880 44419
## - rooms 1 23878 12245758 44429
## - remodel.Recent 1 499813 12721693 44650
## - gross.area 1 807646 13029526 44789
## - living.area 1 858223 13080103 44811
## - floors 1 1345908 13567788 45024
## - lot.size 1 2616726 14838607 45543
##
## Step: AIC=44418
## total.value ~ lot.size + gross.area + living.area + floors +
## rooms + remodel.Old + remodel.Recent
##
## Df Sum of Sq RSS AIC
## - remodel.Old 1 2567 12224906 44417
## <none> 12222339 44418
## - rooms 1 26338 12248678 44428
## - remodel.Recent 1 499692 12722031 44648
## - gross.area 1 807192 13029531 44787
## - living.area 1 863450 13085790 44812
## - floors 1 1363363 13585703 45029
## - lot.size 1 2618080 14840419 45542
##
## Step: AIC=44417
## total.value ~ lot.size + gross.area + living.area + floors +
## rooms + remodel.Recent
##
## Df Sum of Sq RSS AIC
## <none> 12224906 44417
## - rooms 1 26709 12251615 44428
## - remodel.Recent 1 500432 12725338 44648
## - gross.area 1 813485 13038391 44789
## - living.area 1 861448 13086354 44810
## - floors 1 1370979 13595885 45032
## - lot.size 1 2617603 14842509 45541
summary(reg.step) # Which variables were dropped?
##
## Call:
## lm(formula = total.value ~ lot.size + gross.area + living.area +
## floors + rooms + remodel.Recent, data = df)
##
## Residuals:
## Min 1Q Median 3Q Max
## -295.6 -27.6 -0.3 26.1 300.9
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 45.695689 3.586374 12.74 < 0.0000000000000002 ***
## lot.size 0.008986 0.000255 35.23 < 0.0000000000000002 ***
## gross.area 0.032848 0.001673 19.64 < 0.0000000000000002 ***
## living.area 0.062596 0.003098 20.21 < 0.0000000000000002 ***
## floors 42.488048 1.666662 25.49 < 0.0000000000000002 ***
## rooms 2.182100 0.613252 3.56 0.00038 ***
## remodel.Recent 26.357106 1.711281 15.40 < 0.0000000000000002 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 45.9 on 5795 degrees of freedom
## Multiple R-squared: 0.786, Adjusted R-squared: 0.786
## F-statistic: 3.54e+03 on 6 and 5795 DF, p-value: <0.0000000000000002
reg.step.pred <- predict(reg.step, valid.df)
accuracy(reg.step.pred, valid.df$total.value)
## ME RMSE MAE MPE MAPE
## Test set -0.611 46.6 34.8 -1.43 9.06
reg.step <- step(reg1, direction = "both")
## Start: AIC=44419
## total.value ~ lot.size + gross.area + living.area + floors +
## rooms + bedrooms + remodel.Old + remodel.Recent
##
## Df Sum of Sq RSS AIC
## - bedrooms 1 459 12222339 44418
## - remodel.Old 1 2570 12224451 44419
## <none> 12221880 44419
## - rooms 1 23878 12245758 44429
## - remodel.Recent 1 499813 12721693 44650
## - gross.area 1 807646 13029526 44789
## - living.area 1 858223 13080103 44811
## - floors 1 1345908 13567788 45024
## - lot.size 1 2616726 14838607 45543
##
## Step: AIC=44418
## total.value ~ lot.size + gross.area + living.area + floors +
## rooms + remodel.Old + remodel.Recent
##
## Df Sum of Sq RSS AIC
## - remodel.Old 1 2567 12224906 44417
## <none> 12222339 44418
## + bedrooms 1 459 12221880 44419
## - rooms 1 26338 12248678 44428
## - remodel.Recent 1 499692 12722031 44648
## - gross.area 1 807192 13029531 44787
## - living.area 1 863450 13085790 44812
## - floors 1 1363363 13585703 45029
## - lot.size 1 2618080 14840419 45542
##
## Step: AIC=44417
## total.value ~ lot.size + gross.area + living.area + floors +
## rooms + remodel.Recent
##
## Df Sum of Sq RSS AIC
## <none> 12224906 44417
## + remodel.Old 1 2567 12222339 44418
## + bedrooms 1 455 12224451 44419
## - rooms 1 26709 12251615 44428
## - remodel.Recent 1 500432 12725338 44648
## - gross.area 1 813485 13038391 44789
## - living.area 1 861448 13086354 44810
## - floors 1 1370979 13595885 45032
## - lot.size 1 2617603 14842509 45541
summary(reg.step) # Which variables were dropped?
##
## Call:
## lm(formula = total.value ~ lot.size + gross.area + living.area +
## floors + rooms + remodel.Recent, data = df)
##
## Residuals:
## Min 1Q Median 3Q Max
## -295.6 -27.6 -0.3 26.1 300.9
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 45.695689 3.586374 12.74 < 0.0000000000000002 ***
## lot.size 0.008986 0.000255 35.23 < 0.0000000000000002 ***
## gross.area 0.032848 0.001673 19.64 < 0.0000000000000002 ***
## living.area 0.062596 0.003098 20.21 < 0.0000000000000002 ***
## floors 42.488048 1.666662 25.49 < 0.0000000000000002 ***
## rooms 2.182100 0.613252 3.56 0.00038 ***
## remodel.Recent 26.357106 1.711281 15.40 < 0.0000000000000002 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 45.9 on 5795 degrees of freedom
## Multiple R-squared: 0.786, Adjusted R-squared: 0.786
## F-statistic: 3.54e+03 on 6 and 5795 DF, p-value: <0.0000000000000002
reg.step.pred <- predict(reg.step, valid.df)
accuracy(reg.step.pred, valid.df$total.value)
## ME RMSE MAE MPE MAPE
## Test set -0.611 46.6 34.8 -1.43 9.06