This is the final part of the Capstone Project for Coursera's Statistics with R Specialization.
The Capstone Project aims to develop a model to predict the selling price of a given house in Ames, Iowa. This information helps assess whether the asking price of a house is higher or lower than the true value of the house. If the home is undervalued, it may be a good investment.
The data for the project comes from the Ames Assessor’s Office information used in computing assessed values for individual residential properties sold in Ames, IA from 2006 to 2010. More information can be found in the Codebook. The data have been randomly divided into three separate data sets: a training data set, a test data set, and a validation data set.
Load the data and necessary packages:
load("./data/1220_SR-CS_RealEstate/ames_train.Rdata")
library(statsr); library(dplyr); library(BAS)
library(ggplot2); library(plotly); library(corrplot)
library(knitr); library(gridExtra); library(DT)
library(car); library(MASS); library(tidyverse)
library(pander)

Explore the structure of the data and the relationships between the variables in the data set.
Start work with the ames_train data set:
datatable(ames_train, extensions = 'Responsive')

ames_train includes \(1\,000\) observations on \(81\) variables.
There are two categorical variables coded as type int (Overall.Qual, Overall.Cond), so convert them to factors in a working data set, train:
train <- ames_train %>% mutate(Overall.Qual= factor(Overall.Qual),
                               Overall.Cond= factor(Overall.Cond))

Also convert Year.Built/ Year.Remod.Add to age/ age.Remod for convenience:

train <- ames_train %>% mutate(Overall.Qual= factor(Overall.Qual),
                               Overall.Cond= factor(Overall.Cond)) %>%
  mutate(age = 2020-Year.Built, age.Remod = 2020-Year.Remod.Add) %>%
  dplyr::select(-c(Year.Built, Year.Remod.Add))

Examine the distributions and summary statistics of the seemingly most important non-categorical variables in the train data set: price, area, age and age.Remod:
# helper: extract the legend grob from a ggplot so it can be placed separately
get_legend <- function(myggplot){
tmp <- ggplot_gtable(ggplot_build(myggplot))
leg <- which(sapply(tmp$grobs, function(x) x$name) == "guide-box")
legend <- tmp$grobs[[leg]]
return(legend)
}
gprice <-ggplot(train, aes(x = price/1000, y = ..density..)) +
geom_histogram(fill = "cornflowerblue", colour = "darkgray") +
geom_density(size = 1, colour = "brown")+
geom_vline(aes(xintercept=mean(price/1000,na.rm=TRUE), color="mean"), size=1)+
geom_vline(aes(
xintercept=median(price/1000,na.rm=TRUE), color="median"),
size=1) +
scale_color_manual(name=NULL, values=c(mean="purple", median="violet"))+
theme(legend.position = "top")+
labs(x= "price ($, thsnd)",title = "Distribution of House Price")
garea <-ggplot(train, aes(x = area, y = ..density..)) +
geom_histogram(fill = "cornflowerblue", colour = "darkgray") +
geom_density(size = 1, colour = "brown")+
geom_vline(aes(xintercept=mean(area,na.rm=TRUE), color="mean"), size=1)+
geom_vline(aes(
xintercept=median(area,na.rm=TRUE), color="median"), size=1) +
scale_color_manual(name=NULL, values=c(mean="purple", median="violet"))+
theme(legend.position = "none")+
labs(x= "area (SF)", y="", title = "Distribution of House Area")
gage <-ggplot(train, aes(x = age, y = ..density..)) +
geom_histogram(fill = "cornflowerblue", colour = "darkgray") +
geom_density(size = 1, colour = "brown")+
geom_vline(aes(xintercept=mean(age,na.rm=TRUE), color="mean"), size=1)+
geom_vline(aes(
xintercept=median(age,na.rm=TRUE), color="median"), size=1) +
scale_color_manual(name=NULL, values=c(mean="purple", median="violet"))+
theme(legend.position = "none")+
labs(x= "age of house (years)", title = "Distribution of House Age")
gageR <-ggplot(train, aes(x = age.Remod, y = ..density..)) +
geom_histogram(fill = "cornflowerblue", colour = "darkgray") +
geom_density(size = 1, colour = "brown")+
geom_vline(aes(xintercept=mean(age.Remod,na.rm=TRUE), color="mean"), size=1)+
geom_vline(aes(
xintercept=median(age.Remod,na.rm=TRUE), color="median"), size=1) +
scale_color_manual(name=NULL, values=c(mean="purple", median="violet"))+
theme(legend.position = "none")+
labs(x= "age of house remodel (years)", y="",
title = "Distribution of House Remodel Age")
legend <- get_legend(gprice)
gprice <- gprice + theme(legend.position="none")
blankPlot <- ggplot()+geom_blank(aes(1,1)) +
cowplot::theme_nothing()
grid.arrange(legend, gprice, garea, gage, gageR, ncol=2, nrow =3,
layout_matrix = rbind(c(1,1), c(2,3), c(4,5)),
             widths = c(2.7, 2.7), heights = c(0.2, 2.5, 2.5))

price/ area/ age/ age.Remod are right-skewed, so medians (not means) should be used for analysis.
age/ age.Remod distributions are also multimodal.

# summary statistics plus a crude skew direction flag (mean vs median)
skew <- function(x) {
train %>%
summarize(Q1 = quantile(x, 0.25), MEDIAN = median(x), MEAN = mean(x),
Q3 = quantile(x, 0.75), IQR = IQR(x), STDEV = sd(x)) %>%
mutate(SKEW = ifelse(MEAN > MEDIAN, "RIGHT", "LEFT"))
}
pr <- skew(train$price)
ar <- skew(train$area)
ag <- skew(train$age)
agR <- skew(train$age.Remod)
summ<- rbind(pr,ar,ag, agR)
summ<- cbind(variable=c('price','area','age', 'remodel age'), summ)
kable(summ,
caption = "Summary statistics for `price`, `area`, `age`,
      `age.Remod` variables")

| variable | Q1 | MEDIAN | MEAN | Q3 | IQR | STDEV | SKEW |
|---|---|---|---|---|---|---|---|
| price | 129762.5 | 159467.0 | 181190.076 | 213000.00 | 83237.50 | 81909.78780 | RIGHT |
| area | 1092.0 | 1411.0 | 1476.615 | 1743.25 | 651.25 | 505.17419 | RIGHT |
| age | 19.0 | 45.0 | 47.797 | 65.00 | 46.00 | 29.63741 | RIGHT |
| remodel age | 16.0 | 27.5 | 35.662 | 54.00 | 38.00 | 20.55749 | RIGHT |
There are 17 blank ("") entries and 4711 NA values in train. Convert the blanks to NA and examine all of these missing values:
for (i in 1:ncol(train)) train[i][train[i] == ""] <- NA
nas <- colSums(is.na(train))
nas <- tibble(variable = names(nas), count.na = nas) %>%
filter(count.na !=0) %>% arrange(desc(count.na))
astr<- train %>% dplyr::select(PID,c(nas$variable)) %>%
dplyr::select(where(is.factor))
astr1<- train %>% dplyr::select(c(PID,nas$variable)) %>%
dplyr::select(!(where(is.factor)))
gg<-ggplot(data = nas, aes(x = variable, y = count.na, fill = count.na)) +
geom_bar(stat = "identity")+
ggtitle("Number of NA's per variable (interactive)") +
theme(axis.text.x = element_text(angle = 60, hjust = 1))
ggplotly(gg)

A total of 26 variables with missing values have been detected, 15 of them categorical and 12 integer.
Most problematic variables:

- Pool.QC (997 NA values)
- Misc.Feature (971 NA values)
- Alley (933 NA values)
- Fence (798 NA values)
- Fireplace.Qu (491 NA values)

The Codebook reveals that NA values for categorical variables have been applied when an item does not exist (e.g. Garage.Qual == NA means No garage).
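This interpretation can be spot-checked directly; a minimal sketch (the Garage.Qual/ Garage.Area pairing is used purely as an illustration, not part of the original analysis):

# Sketch: if NA in Garage.Qual really means "no garage",
# these rows should also report zero garage area
train %>%
  filter(is.na(Garage.Qual)) %>%
  summarise(n = n(), max.garage.area = max(Garage.Area, na.rm = TRUE))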
Then explore the NA values in the integer variables:
bsmt<- train %>% filter(is.na(BsmtFin.SF.1)==TRUE |
is.na(BsmtFin.SF.2)==TRUE|
is.na(Bsmt.Unf.SF)==TRUE|
is.na(Total.Bsmt.SF)==TRUE|
is.na(Bsmt.Full.Bath)==TRUE|
is.na(Bsmt.Half.Bath)==TRUE) %>%
dplyr::select(c(35,37:39,47:48,31))
kable(bsmt, caption = "NA's in `Basement.*` non-categorical variables")

| BsmtFin.SF.1 | BsmtFin.SF.2 | Bsmt.Unf.SF | Total.Bsmt.SF | Bsmt.Full.Bath | Bsmt.Half.Bath | Bsmt.Qual |
|---|---|---|---|---|---|---|
| NA | NA | NA | NA | NA | NA | NA |
garage <- train %>% filter(is.na(Garage.Yr.Blt)==TRUE |
is.na(Garage.Finish)==TRUE) %>%
dplyr::select(c(59:63))
frontAll <- train %>% filter(is.na(Lot.Frontage)==TRUE) %>%
dplyr::select(c(6,5,13))
front<- frontAll %>% filter(!(Lot.Config == "Inside"))
datatable(garage, caption = "NA's in `Garage.*` variables",
options = list(columnDefs = list(list(
targets = 1:5,
render = JS(
"function(data, type, row, meta) {",
"return data === null ? 'NA' : data;",
"}")
    ))))

datatable(frontAll, caption = "NA's in `Lot.Frontage` variable",
options = list(columnDefs = list(list(
targets = 1:3,
render = JS(
"function(data, type, row, meta) {",
"return data === null ? 'NA' : data;",
"}")
    ))))

There is only one case in which all six Basement.* non-categorical variables equal NA, and it is a house with no basement, as Bsmt.Qual == NA too. So it makes sense to replace all of these missing values (which describe areas and counts) with zero.
The same holds for the Garage.* variables: they equal NA only when there is no garage (Garage.Qual == NA). So replace all of them, except for the year of construction Garage.Yr.Blt, with zero as well.
The situation with the Lot.Frontage variable is slightly different. Most cases with NA values (89 of 167) are Inside lots (Lot.Config == Inside); the remaining cases lie in the Floating Village Residential or Residential Low Density zones (MS.Zoning == FV, RL). In both situations it is reasonable to treat Lot.Frontage (linear feet of street connected to the property) as zero.
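A minimal sketch to check this breakdown (the counts quoted above come from the report itself, not from re-running this code):

# Sketch: tabulate Lot.Config and MS.Zoning for the rows where Lot.Frontage is missing
train %>%
  filter(is.na(Lot.Frontage)) %>%
  count(Lot.Config, MS.Zoning, sort = TRUE)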
Missing values handling strategy:

- add a None level to each categorical variable and recode its NA values to None
- recode NA values to zero (0) for all remaining variables except those describing dates (age, age.Remod, Garage.Yr.Blt, Mo.Sold)

astr <- train %>% dplyr::select(where(is.factor))
astr1<- train %>% dplyr::select(-c(age, age.Remod, Garage.Yr.Blt,
Mo.Sold)) %>%
dplyr::select(!(where(is.factor)))
astr2 <- train %>% dplyr::select(age, age.Remod, Garage.Yr.Blt,
Mo.Sold)
for (i in 1:ncol(astr)) levels(astr[[i]]) <- c(levels(astr[[i]]), "None")
astr[is.na(astr)] <- "None"
astr1[is.na(astr1)] <- 0
train <- tibble(cbind(astr, astr1, astr2))

Construct a correlogram to investigate relationships between variables and decide which of them could be chosen for further analysis. Consider only variables whose correlation with price exceeds \(0.51\) as the most realistic candidates for a regression model:
data.corr <- as.data.frame(sapply(train, as.numeric))
correlations <- cor(data.corr, method = "s")
corr.price <- as.matrix(correlations[,'price'])
corr.id <- names(which(apply(corr.price, 1, function(x) (abs(x) > 0.51))))
col <- colorRampPalette(
c("#BB4444", "#EE9988", "#FFFFFF", "#77AADD", "#4477AA"))
corrplot(as.matrix(correlations[corr.id,corr.id]), type = 'upper',
method='color', col=col(200), order="FPC", number.font = 4,
number.cex=.8, tl.col = "purple3", addCoef.col = 'black', tl.cex=.9,
         mar = c(0, 0, 1, 0))

Several predictor pairs are strongly correlated with each other, so keep only one variable from each pair:

- Garage.Area and Garage.Cars (\(0.88\)) \(=>\) choose Garage.Cars as having the higher correlation with price (\(0.72\))
- Full.Bath and area (\(0.68\)) \(=>\) choose area (corr with price: \(0.74\))
- X1st.Flr.SF and Total.Bsmt.SF (\(0.84\)) \(=>\) choose Total.Bsmt.SF (corr with price: \(0.62\))
- TotRms.AbvGrd and area (\(0.82\)) \(=>\) choose area (corr with price: \(0.74\))
- Garage.Finish and age (\(0.62\)) \(=>\) choose age (corr with price: \(-0.71\))
- Bsmt.Qual and age (\(0.67\)) \(=>\) choose age (corr with price: \(-0.71\))
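Such strongly correlated pairs can also be listed programmatically; a minimal sketch, reusing correlations and corr.id from the chunk above (the 0.6 threshold is chosen purely for illustration):

# Sketch: pairs of retained variables with absolute Spearman correlation above 0.6
sub <- correlations[corr.id, corr.id]
idx <- which(abs(sub) > 0.6 & upper.tri(sub), arr.ind = TRUE)
data.frame(var1 = rownames(sub)[idx[, 1]],
           var2 = colnames(sub)[idx[, 2]],
           corr = round(sub[idx], 2))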
Start by creating a simple, intuitive initial model: select 10 predictor variables from ames_train and create a linear model for price using those variables.

Justification for the variables chosen

From the above EDA, choose for the initial model:

- Garage.Cars, Total.Bsmt.SF, area, age

Take also important variables having a noticeable correlation with price:

- Overall.Qual (corr with price: \(0.81\))
- age.Remod (corr with price: \(-0.62\))
- Foundation (corr with price: \(0.56\))
- Heating.QC (corr with price: \(-0.53\))
- Kitchen.Qual (corr with price: \(-0.61\))
- Exter.Qual (corr with price: \(-0.64\))

R code and summary output table
Fit the price.all model with all of these variables, not forgetting to log-transform the price/ area/ age/ age.Remod variables due to their skewness (see the histograms above):
price.all <- lm(log(price) ~ Garage.Cars+ Total.Bsmt.SF+ log(area)+
log(age)+
Overall.Qual + log(age.Remod) + Foundation+
Heating.QC + Kitchen.Qual+ Exter.Qual, data = train)
pander(summary(price.all))

|   | Estimate | Std. Error | t value | Pr(>|t|) |
|---|---|---|---|---|
| (Intercept) | 9.24 | 0.2268 | 40.74 | 2.89e-212 |
| Garage.Cars | 0.06621 | 0.009854 | 6.719 | 0.00000000003113 |
| Total.Bsmt.SF | 0.0001777 | 0.00001721 | 10.33 | 8.696e-24 |
| log(area) | 0.3631 | 0.0225 | 16.14 | 4.42e-52 |
| log(age) | -0.06708 | 0.01749 | -3.835 | 0.0001335 |
| Overall.Qual2 | -0.322 | 0.1918 | -1.679 | 0.0934 |
| Overall.Qual3 | 0.06454 | 0.1918 | 0.3365 | 0.7366 |
| Overall.Qual4 | 0.1107 | 0.1883 | 0.588 | 0.5566 |
| Overall.Qual5 | 0.2708 | 0.1892 | 1.431 | 0.1527 |
| Overall.Qual6 | 0.317 | 0.1903 | 1.665 | 0.09617 |
| Overall.Qual7 | 0.3825 | 0.1912 | 2.001 | 0.04572 |
| Overall.Qual8 | 0.4958 | 0.1922 | 2.58 | 0.01003 |
| Overall.Qual9 | 0.5847 | 0.1958 | 2.986 | 0.002901 |
| Overall.Qual10 | 0.519 | 0.2048 | 2.533 | 0.01145 |
| log(age.Remod) | -0.03493 | 0.01407 | -2.482 | 0.01322 |
| FoundationCBlock | 0.07834 | 0.02074 | 3.777 | 0.0001685 |
| FoundationPConc | 0.02743 | 0.02613 | 1.05 | 0.2942 |
| FoundationSlab | 0.1293 | 0.05579 | 2.317 | 0.02069 |
| FoundationStone | 0.05593 | 0.09811 | 0.5701 | 0.5687 |
| Heating.QCFa | -0.09616 | 0.03879 | -2.479 | 0.01334 |
| Heating.QCGd | -0.01124 | 0.01672 | -0.6723 | 0.5016 |
| Heating.QCPo | -0.2923 | 0.1648 | -1.774 | 0.07644 |
| Heating.QCTA | -0.07162 | 0.01573 | -4.553 | 0.000005971 |
| Kitchen.QualFa | -0.111 | 0.05164 | -2.149 | 0.03189 |
| Kitchen.QualGd | -0.04578 | 0.03098 | -1.478 | 0.1398 |
| Kitchen.QualPo | -0.1597 | 0.1682 | -0.9493 | 0.3427 |
| Kitchen.QualTA | -0.105 | 0.0342 | -3.071 | 0.002196 |
| Exter.QualFa | -0.256 | 0.07035 | -3.639 | 0.0002878 |
| Exter.QualGd | -0.07607 | 0.04146 | -1.835 | 0.06687 |
| Exter.QualTA | -0.06559 | 0.04605 | -1.424 | 0.1547 |
| Observations | Residual Std. Error | \(R^2\) | Adjusted \(R^2\) |
|---|---|---|---|
| 1000 | 0.1634 | 0.8535 | 0.8492 |
Discussion of model results

Model as a whole: the model is statistically significant and explains about \(85\%\) of the variation in log(price) (adjusted \(R^2 = 0.8492\)).

Predictor coefficients:

- Garage.Cars, Total.Bsmt.SF, log(area) and log(age) seem to be utterly important (very small p-values)
- log(age.Remod) is also quite important, at the significance level of \(0.05\) (p-value = 0.0132)
- several levels of the categorical predictors are not significant: Overall.Qual levels \(2-6\) (initial level \(=1\)), Foundation levels PConc (Poured Concrete)/ Stone (initial level: BrkTil, Brick & Tile), Heating.QC levels Gd/ Po (initial level: Ex, Excellent), Kitchen.Qual levels Gd/ Po (initial level: Ex), Exter.Qual levels Gd/ TA (initial level: Ex)

One can try to recode the categorical variables.
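Before collapsing levels, it helps to see how many observations fall in each level; a quick sketch (inspection only, not part of the modelling):

# Sketch: level frequencies for the categorical predictors about to be recoded
sapply(train[c("Overall.Qual", "Foundation", "Heating.QC",
               "Kitchen.Qual", "Exter.Qual")], table)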
Choose the best model, using the initial model as a starting point.
From the price.all results above, recode the categorical variables so that they contain only LOW/ HIGH levels (plus, for Overall.Qual/ Kitchen.Qual, one more level: TA, Typical/Average). Then rerun lm() to get the price.all model with the recoded variables:
train$Overall.Qual <- fct_collapse(train$Overall.Qual,
LOW = c("1", "2", "3", "4", "5"),
TA= c("6","7"), HIGH = c("8", "9", "10"))
train$Foundation <- fct_collapse(train$Foundation,
LOW = c("CBlock", "PConc", "Slab"),
HIGH = c("BrkTil", "Stone", "Wood"))
train$Heating.QC <- fct_collapse(train$Heating.QC,
LOW = c("Po", "Fa", "TA"),
HIGH = c("Gd", "Ex"))
train$Kitchen.Qual <- fct_collapse(train$Kitchen.Qual,
LOW = c("Po", "Fa"),
TA=c("TA", "Gd"),
HIGH = c("Ex"))
train$Exter.Qual <- fct_collapse(train$Exter.Qual,
LOW = c("Fa"),
HIGH = c("TA", "Gd", "Ex"))
price.all <- lm(log(price) ~ Garage.Cars+ Total.Bsmt.SF+
log(area)+ log(age)+
Overall.Qual + log(age.Remod) + Foundation+
Heating.QC + Kitchen.Qual+ Exter.Qual,
data = train)
pander(summary(price.all))

|   | Estimate | Std. Error | t value | Pr(>|t|) |
|---|---|---|---|---|
| (Intercept) | 8.975 | 0.1632 | 55 | 8.182e-303 |
| Garage.Cars | 0.05917 | 0.0104 | 5.69 | 0.00000001677 |
| Total.Bsmt.SF | 0.0002072 | 0.00001709 | 12.12 | 1.271e-31 |
| log(area) | 0.4276 | 0.02254 | 18.97 | 1.138e-68 |
| log(age) | -0.06167 | 0.01575 | -3.915 | 0.00009649 |
| Overall.QualTA | 0.08911 | 0.01543 | 5.776 | 0.00000001024 |
| Overall.QualHIGH | 0.2211 | 0.02495 | 8.86 | 3.653e-18 |
| log(age.Remod) | -0.05758 | 0.01433 | -4.017 | 0.00006329 |
| FoundationLOW | 0.1039 | 0.02136 | 4.863 | 0.000001347 |
| Heating.QCLOW | -0.07704 | 0.01415 | -5.443 | 0.00000006619 |
| Kitchen.QualLOW | -0.1703 | 0.0482 | -3.534 | 0.000428 |
| Kitchen.QualTA | -0.1031 | 0.02651 | -3.89 | 0.0001071 |
| Exter.QualLOW | -0.2792 | 0.05413 | -5.158 | 0.0000003019 |
| Observations | Residual Std. Error | \(R^2\) | Adjusted \(R^2\) |
|---|---|---|---|
| 1000 | 0.1761 | 0.8267 | 0.8246 |
The model with the categorical variables recoded (price.all) as a whole seems almost the same as the previous one, apart from a slightly lower adjusted \(R^2\) (0.8246 versus 0.8492). However, all predictors now appear to be utterly important.
Models Selection

Starting point: price.all, predicting log(price) based on all ten chosen variables (with categorical variables recoded). Two selection approaches are compared:

- backwards elimination with the stepAIC() function (AIC and BIC penalties)
- Bayesian model averaging with the bas.lm function (prior = "ZS-null", modelprior = uniform())

Backwards elimination
price.aic <- stepAIC(price.all, direction = "backward", trace = 0, k=2)
price.bic <-stepAIC(price.all, direction = "backward", trace = 0,
k= log(nrow(train)))
step<- cbind(coef(price.all), coef(price.aic), coef(price.bic),
summary(price.all)$coefficients[,4])
colnames(step) <- c("price.all", "price.aic", "price.bic", "Pr(>|t|)")
datatable(round(step,4), options = list(pageLength = 13),
caption = "Backwards elimination coefficients")price.all predictorsprice.all model characteristics)Bayesian model averaging
price.bma <- bas.lm(log(price) ~ Garage.Cars+ Total.Bsmt.SF+
log(area)+ log(age)+
Overall.Qual + log(age.Remod) + Foundation+
Heating.QC + Kitchen.Qual+ Exter.Qual,
data = train,
modelprior = uniform(), prior = "ZS-null",
method = "MCMC")
datatable(data.frame(round(summary(price.bma)[1:13,1],6)),
options = list(pageLength = 13),
          colnames = c("", "P(B != 0 | Y)"))

Only Kitchen.QualLOW/ Kitchen.QualTA stand out with noticeably lower posterior inclusion probabilities.

Look at the convergence plot for the posterior inclusion probabilities (pip):
diagnostics(price.bma, type="pip", col = "purple", pch = 16, cex = 1.5)

Model space visualization:
image(price.bma, rotate=FALSE)

Choosing the Bayesian approach, select the price.bma model as the initial model:
initial.model <- price.bma

One way to assess the performance of a model is to examine the model's residuals. Create a residual plot for the preferred model from above and use it to assess whether the model appears to fit the data well.
plot(initial.model, which=1, sub.caption = "")

out <- cbind(row = c(310,428,741),
             train[c(310,428,741), c(48,68,56,47,78,14,79,23,30,33,21)])
datatable(out, extensions = 'Responsive')

Possible sources of problems
gg<-ggplot(data = train, aes(x=area, y=price))+
geom_jitter(col="cornflowerblue")+
stat_smooth(colour="brown")+
ggtitle("Price vs area")
ggplotly(gg)

gg <- ggplot(data = train, aes(x=age, y=price))+
geom_jitter(col="cornflowerblue")+
stat_smooth(method="lm", colour="brown")+
ggtitle("Price vs age")
ggplotly(gg)

The plots of price against area and age point to a few unusually large houses whose prices do not follow the general pattern.

Possible solutions

- exclude observations describing the price of houses of over \(4\,000\) square feet

RMSE (root mean squared error) can be calculated directly based on the model output.
pred.train <- predict(initial.model, train, estimator = "BMA")
resid.train <- na.omit(train$price - exp(pred.train$fit))
rmse.train <- sqrt(mean(resid.train^2))

RMSE (root mean squared error) for the BMA initial.model is 33789.08 dollars.
train.no4 <- train %>% filter(area<4000)
initial.no4 <- bas.lm(log(price) ~ Garage.Cars+ Total.Bsmt.SF+
log(area)+ log(age)+
Overall.Qual + log(age.Remod) + Foundation+
Heating.QC + Kitchen.Qual+ Exter.Qual,
data = train.no4,
modelprior = uniform(), prior = "ZS-null", method = "MCMC")
pred.train.no4 <- predict(initial.no4, train.no4, estimator = "BMA")
resid.train.no4 <- na.omit(train.no4$price - exp(pred.train.no4$fit))
rmse.train.no4 <- sqrt(mean(resid.train.no4^2))
rmse<- cbind(with.4000=rmse.train, without.4000=rmse.train.no4)
rownames(rmse) <- "RMSE"
datatable(round(rmse,2),
caption = "RMSE for BMA `initial.model` with and without area of more than 4 000 SF")Removing observations with area over \(4\ 000SF\) results in a significantly improved RMSE value.
The model may do well on training data but perform poorly out-of-sample (that is, on a data set other than the original training data) because it is overly tuned to fit the training data specifically. This is called “overfitting”. To determine whether the model is overfitting, compare its performance on both the in-sample (ames_train) and out-of-sample (ames_test) data sets.
load("./data/1220_SR-CS_RealEstate/ames_test.Rdata")Use the model from above to generate predictions for the housing prices in the test data set.
test <- trans(ames_test)  # Appendix: trans() applies the same transformations as for the training data
pred.test <- predict(initial.model, test, estimator = "BMA")
# the prediction from the refit without the >4 000 SF houses supersedes the one above
pred.test <- predict(initial.no4, test, estimator = "BMA")
resid.test <- na.omit(test$price - exp(pred.test$fit))
rmse.test <- sqrt(mean(resid.test^2))
rmse<- cbind(train=rmse.train, test=rmse.test)
rownames(rmse) <- "RMSE"
datatable(round(rmse,2),
caption = "RMSE for BMA `initial.model`: `train` and `test` data")To compare models prediction accuracy/ out-of-sample performance is used RMSE (root mean squared error) metric. The lower RMSE, the better is prediction. In general, RMSE for prediction on a training data set is lower than that for testing one. It’s because model is built on the training data and fit to it better. Yet, in this case, the situation is different: initial.model shows larger RMSE on training data set. This implies that there is no overfitting of the model.
Now create a final model with up to 20 variables to predict housing prices in Ames, IA, selecting from the full array of variables in the data set.
The final model, price.aic, is obtained by applying stepAIC() backwards elimination:
load("./data/1220_SR-CS_RealEstate/ames_train.Rdata")
train <- ames_train %>% mutate(Overall.Qual= factor(Overall.Qual),
Overall.Cond= factor(Overall.Cond)) %>%
mutate(age= 2020-Year.Built,age.Remod= 2020-Year.Remod.Add) %>%
dplyr::select(-c(Year.Built, Year.Remod.Add, Foundation))
# "" -->> NA, NA -->> None/ 0
for (i in 1:ncol(train)) train[i][train[i] == ""] <- NA
astr0<- train %>% dplyr::select(where(is.factor))
astr01<- train %>% dplyr::select(-c(age, age.Remod, Garage.Yr.Blt,
Mo.Sold)) %>%
dplyr::select(!(where(is.factor)))
astr02 <- train %>% dplyr::select(age, age.Remod, Garage.Yr.Blt,
Mo.Sold)
for (i in 1:ncol(astr0)) levels(astr0[[i]]) <- c(levels(astr0[[i]]),
"None")
astr0[is.na(astr0)] <- "None"
astr01[is.na(astr01)] <- 0
train <- tibble(cbind(astr0, astr01, astr02))
# corr>0.5
data.corr <- as.data.frame(sapply(train, as.numeric))
correlations <- cor(data.corr, method = "s")
corr.price <- as.matrix(correlations[,'price'])
corr.id <- names(which(apply(corr.price, 1, function(x) (abs(x) > 0.5))))
train<- train %>% filter(area<4000) # exclude outlier
# log-transform
train<- train %>% dplyr::select(all_of(corr.id)) %>%
mutate("log(1+price)"=log(1+price),"log(1+area)" = log(1+area),
"log(1+age)"= log(1+age),"log(1+age.Remod)"= log(1+age.Remod),
"log(1+X1st.Flr.SF)"= log(1+X1st.Flr.SF),
"log(1+Full.Bath)"= log(1+Full.Bath),
"log(1+TotRms.AbvGrd)"= log(1+TotRms.AbvGrd),
"log(1+Fireplaces)"= log(1+Fireplaces),
"log(1+Garage.Cars)"= log(1+Garage.Cars)) %>%
dplyr::select(-c(price, area, age, age.Remod, X1st.Flr.SF, Full.Bath,
TotRms.AbvGrd, Fireplaces, Garage.Cars))
# modelling
price.all <- lm(`log(1+price)` ~ ., data = train)
price.aic <- stepAIC(price.all, direction = "backward", trace = 0, k=2)
pander(summary(price.aic))

|   | Estimate | Std. Error | t value | Pr(>|t|) |
|---|---|---|---|---|
| (Intercept) | 9.216 | 0.2653 | 34.74 | 7.615e-172 |
| Overall.Qual2 | -0.4208 | 0.1784 | -2.358 | 0.01856 |
| Overall.Qual3 | 0.006049 | 0.1804 | 0.03353 | 0.9733 |
| Overall.Qual4 | 0.04687 | 0.1773 | 0.2643 | 0.7916 |
| Overall.Qual5 | 0.1779 | 0.179 | 0.9943 | 0.3203 |
| Overall.Qual6 | 0.2137 | 0.1796 | 1.189 | 0.2346 |
| Overall.Qual7 | 0.2664 | 0.1803 | 1.478 | 0.1399 |
| Overall.Qual8 | 0.354 | 0.1812 | 1.954 | 0.05101 |
| Overall.Qual9 | 0.4114 | 0.1849 | 2.224 | 0.02635 |
| Overall.Qual10 | 0.4904 | 0.194 | 2.529 | 0.01161 |
| Exter.QualFa | -0.2191 | 0.06632 | -3.304 | 0.00099 |
| Exter.QualGd | -0.06293 | 0.0393 | -1.601 | 0.1096 |
| Exter.QualTA | -0.05482 | 0.04346 | -1.261 | 0.2075 |
| Bsmt.QualFa | -0.1517 | 0.04382 | -3.461 | 0.000561 |
| Bsmt.QualGd | -0.07183 | 0.0246 | -2.92 | 0.003584 |
| Bsmt.QualPo | -0.207 | 0.1586 | -1.306 | 0.192 |
| Bsmt.QualTA | -0.1025 | 0.02982 | -3.437 | 0.0006141 |
| Bsmt.QualNone | -0.09752 | 0.05395 | -1.807 | 0.071 |
| Heating.QCFa | -0.08755 | 0.03643 | -2.404 | 0.01643 |
| Heating.QCGd | 0.00121 | 0.01561 | 0.07752 | 0.9382 |
| Heating.QCPo | -0.175 | 0.1539 | -1.137 | 0.2558 |
| Heating.QCTA | -0.05185 | 0.0143 | -3.627 | 0.0003022 |
| Kitchen.QualFa | -0.07635 | 0.04865 | -1.569 | 0.1169 |
| Kitchen.QualGd | -0.03094 | 0.02943 | -1.052 | 0.2932 |
| Kitchen.QualPo | -0.1717 | 0.1581 | -1.086 | 0.2776 |
| Kitchen.QualTA | -0.08277 | 0.03223 | -2.568 | 0.01038 |
| Garage.TypeAttchd | 0.05034 | 0.05125 | 0.9822 | 0.3262 |
| Garage.TypeBasment | 0.0125 | 0.06945 | 0.18 | 0.8572 |
| Garage.TypeBuiltIn | 0.06377 | 0.05621 | 1.134 | 0.2569 |
| Garage.TypeCarPort | 0.1451 | 0.1621 | 0.895 | 0.371 |
| Garage.TypeDetchd | 0.00534 | 0.05153 | 0.1036 | 0.9175 |
| Garage.TypeNone | -0.03951 | 0.06023 | -0.6561 | 0.5119 |
| Total.Bsmt.SF | 0.0001365 | 0.00002861 | 4.772 | 0.000002103 |
| Garage.Area | 0.0001505 | 0.00003557 | 4.232 | 0.00002539 |
| log(1+area) | 0.3489 | 0.02695 | 12.95 | 1.854e-35 |
| log(1+age) | -0.0603 | 0.01774 | -3.4 | 0.0007028 |
| log(1+age.Remod) | -0.04239 | 0.01375 | -3.083 | 0.00211 |
| log(1+X1st.Flr.SF) | 0.05984 | 0.03457 | 1.731 | 0.08376 |
| log(1+Full.Bath) | -0.07591 | 0.03539 | -2.145 | 0.0322 |
| log(1+Fireplaces) | 0.09028 | 0.01546 | 5.84 | 0.000000007157 |
| Observations | Residual Std. Error | \(R^2\) | Adjusted \(R^2\) |
|---|---|---|---|
| 999 | 0.1528 | 0.8733 | 0.8682 |
The model is statistically significant (p-value \(< 2.2\cdot 10^{-16}\)), contains 14 predictors and explains 86.82\(\%\) of the variance in price around its mean.
All non-categorical variables except Total.Bsmt.SF and Garage.Area have been log-transformed, with 1 added beforehand so that zero values do not produce log(0) (a quick check is sketched below).
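The added 1 matters because some of these variables contain zeros (Fireplaces, for instance), and log(0) is -Inf; a minimal check, for illustration only:

# Sketch: zeros in Fireplaces would break a plain log(), while log(1 + x) stays finite
c(zero.fireplaces = sum(ames_train$Fireplaces == 0),
  min.log1p       = min(log1p(ames_train$Fireplaces)))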
The response price has been log-transformed in the same way; for example, compare the relationship between price and area on the original and log scales:

gg1 <- ggplot(ames_train, aes(x = area, y = price)) +
geom_point(color= "cornflowerblue") +
stat_smooth(method = 'lm', color="brown")+
ggtitle("price ~ area")
gg2 <- ggplot(ames_train, aes(x = log(area), y = price)) +
geom_point(color= "cornflowerblue") +
stat_smooth(method = 'lm', color="brown")+
ggtitle("price ~ log(area)")
gg3 <- ggplot(ames_train, aes(x = area, y = log(price))) +
geom_point(color= "cornflowerblue") +
stat_smooth(method = 'lm', color="brown")+
ggtitle("log(price) ~ area")
gg4 <- ggplot(ames_train, aes(x = log(area), y = log(price))) +
geom_point(color= "cornflowerblue") +
stat_smooth(method = 'lm', color="brown")+
ggtitle("log(price) ~ log(area)")
grid.arrange(gg1, gg2, gg3, gg4, ncol = 2)

Variable interactions, in the sense of, for instance, multiplying two predictors, would be worth considering if strong multicollinearity were present in the regression model. Use variance/ generalized variance inflation factors (VIF/ GVIF) to check for it:
pander(vif(price.aic))

|   | GVIF | Df | GVIF^(1/(2*Df)) |
|---|---|---|---|
| Overall.Qual | 21.28 | 9 | 1.185 |
| Exter.Qual | 9.143 | 3 | 1.446 |
| Bsmt.Qual | 11.96 | 5 | 1.282 |
| Heating.QC | 2.068 | 4 | 1.095 |
| Kitchen.Qual | 6.618 | 4 | 1.266 |
| Garage.Type | 3.895 | 6 | 1.12 |
| Total.Bsmt.SF | 6.249 | 1 | 2.5 |
| Garage.Area | 2.592 | 1 | 1.61 |
| log(1+area) | 3.486 | 1 | 1.867 |
| log(1+age) | 6.062 | 1 | 2.462 |
| log(1+age.Remod) | 2.841 | 1 | 1.685 |
| log(1+X1st.Flr.SF) | 5.402 | 1 | 2.324 |
| log(1+Full.Bath) | 2.517 | 1 | 1.586 |
| log(1+Fireplaces) | 1.617 | 1 | 1.271 |
Explanatory variables are not strongly associated with each other \(=>\) no need to include any variable interactions.
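Had an interaction nevertheless seemed worthwhile, it could be added and tested against the chosen model; a purely hypothetical illustration (the Overall.Qual : log(1+area) term is an arbitrary choice, not part of this report's final model):

# Hypothetical sketch: add one interaction term to the final model and test whether it helps
price.inter <- update(price.aic, . ~ . + Overall.Qual:`log(1+area)`)
anova(price.aic, price.inter)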
Selection of variables includes several stages:

- convert Year.Built and Year.Remod.Add to age and age.Remod, respectively, for convenience
- drop the Foundation variable, as there are no observations in ames_train having one of its levels (Wood), while in ames_test there are
- keep only variables whose (Spearman) correlation with price exceeds \(0.50\) in absolute value: age, Exter.Qual, Garage.Finish, age.Remod, Bsmt.Qual, Kitchen.Qual, Heating.QC, Fireplace.Qu, Garage.Type, Fireplaces, TotRms.AbvGrd, X1st.Flr.SF, Total.Bsmt.SF, Full.Bath, Garage.Area, area, Garage.Cars, Overall.Qual
- exclude the outlying houses with an area of over \(4\,000\) SF and log-transform (with 1 added first) the skewed non-categorical variables
- run backwards elimination with stepAIC(), comparing the AIC and BIC penalties:
price.bic <-stepAIC(price.all, direction = "backward", trace = 0,
k= log(nrow(train)))
aicrsq<- round(summary(price.aic)$adj.r.squared,4)
bicrsq<- round(summary(price.bic)$adj.r.squared,4)
pred.train.aic <- predict(price.aic, train)
resid.train.aic <- na.omit(exp(train$`log(1+price)`) - exp(pred.train.aic))
rmse.train.aic <- sqrt(mean(resid.train.aic^2))
pred.train.bic <- predict(price.bic, train)
resid.train.bic <- na.omit(exp(train$`log(1+price)`) - exp(pred.train.bic))
rmse.train.bic <- sqrt(mean(resid.train.bic^2))
aic<- round(c(AIC(price.aic), AIC(price.bic)),2)
bic<- round(c(BIC(price.aic), BIC(price.bic)),2)
fpvalue <- round(anova(price.aic, price.bic)$`Pr(>F)`,4)
model<- c("price.aic", "price.bic")
compar.mdls<- cbind(model= model, `Adj.R-sq`=c(aicrsq, bicrsq),
RMSE=round(c(rmse.train.aic, rmse.train.bic),2),
AIC=aic, BIC=bic)
kable(compar.mdls, caption = "Comparison of models: `price.aic` versus `price.bic`")

| model | Adj.R-sq | RMSE | AIC | BIC |
|---|---|---|---|---|
| price.aic | 0.8682 | 26223.04 | -877.55 | -676.37 |
| price.bic | 0.8591 | 27528.63 | -833.69 | -745.37 |
RMSE is again used to compare prediction accuracy and out-of-sample performance: the lower the RMSE, the better the prediction. In general, RMSE on the training data set is lower than on the test data set, because the model is built on the training data and fits it better. Yet, here too the situation is different: the final price.aic model shows a larger RMSE on the training data set:
test <- transfin(ames_test)
pred.test.aic <- predict(price.aic, test)
resid.test.aic <- na.omit(exp(test$`log(1+price)`) - exp(pred.test.aic))
rmse.test.aic <- sqrt(mean(resid.test.aic^2))
ds <- c("ames_train", "ames_test")
RMSE<- cbind(`data set` = ds, RMSE = round(c(rmse.train.aic,rmse.test.aic),2))
kable(RMSE, caption = "RMSE for final model on train and test data sets")| data set | RMSE |
|---|---|
| ames_train | 26223.04 |
| ames_test | 24202.1 |
This implies that the final price.aic model performs well on both the training and testing sets and that there is no overfitting. The model has not overreacted to minor fluctuations in the data, so no further changes are needed. The residuals-vs-fitted plot for the final model:
gg<-ggplot(data = price.aic, aes(x = .fitted, y = .resid))+
geom_point(colour= "cornflowerblue") +
geom_smooth(colour="brown") +
xlab("predicted log(price)") +ylab("residuals")+
ggtitle("Residuals vs Fitted (interactive)")
ggplotly(gg)

Compare RMSE of the final and initial models:

ds <- c("ames_train", "ames_test")
RMSE<- cbind(`data set` = ds,
RMSE.initial = round(c(rmse.train,rmse.test),2),
RMSE.final = round(c(rmse.train.aic,rmse.test.aic),2))
kable(RMSE, caption = "RMSE: final model vs initial model")| data set | RMSE.initial | RMSE.final |
|---|---|---|
| ames_train | 33789.08 | 26223.04 |
| ames_test | 27144.8 | 24202.1 |
Relative to the initial model (Section 2.4), RMSE has improved significantly: from 33789.08 dollars on the training data with initial.model to 24202.1 dollars on the test data with the final price.aic model, a quite substantial 28.37\(\%\) reduction.
Testing the final model on the validation data set will show the model's reliability.
Strengths and weaknesses of the model

Strengths

- the model is statistically significant and explains 86.82\(\%\) of the variance in price around its mean

Weaknesses
Testing the final model on a separate validation data set is a great way to determine how it will perform in real-life practice. Use the ames_validation data set to do some additional assessment of the final model.
load("./data/1220_SR-CS_RealEstate/ames_validation.Rdata")RMSE of final price.aic model on the validation data
valid <- transfin(ames_validation)
pred.valid.aic <- predict(price.aic, valid)
resid.valid.aic <- na.omit(exp(valid$`log(1+price)`) - exp(pred.valid.aic))
rmse.valid.aic <- sqrt(mean(resid.valid.aic^2))
ds <- c("ames_train", "ames_test", "ames_validation")
RMSE<- cbind(`data set` = ds,
RMSE.final = round(c(rmse.train.aic,rmse.test.aic, rmse.valid.aic),2))
kable(RMSE, caption = "RMSE: final model")| data set | RMSE.final |
|---|---|
| ames_train | 26223.04 |
| ames_test | 24202.1 |
| ames_validation | 23649.92 |
RMSE of the final model on the ames_validation data set reaches 23649.92 dollars.

Comparison of RMSE on the training/ test/ validation data sets

In general, the lower the RMSE, the better the model fit. By that measure the final price.aic model actually fits the validation data best, showing the lowest RMSE of the three data sets.
Percentage of the 95% predictive confidence intervals containing the true price of the house in the validation data set
pred.int <- predict(price.aic, valid, interval = "prediction",
level = 0.95)
coverage <- mean(valid$`log(1+price)` > pred.int[,"lwr"] &
                    valid$`log(1+price)` < pred.int[,"upr"])

Proper reflection of uncertainty
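A value of coverage close to the nominal 95\(\%\) level suggests the model reflects uncertainty properly; a minimal sketch for reporting it (the exact figure is not reproduced here):

# Sketch: empirical coverage of the 95% prediction intervals vs the nominal level
data.frame(nominal = 0.95, empirical = round(coverage, 4))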
The final model explains 86.82% of variance in price around its mean and contains 14 predictors: Overall.Qual, Exter.Qual, Bsmt.Qual, Heating.QC, Kitchen.Qual, Garage.Type, Total.Bsmt.SF, Garage.Area, area, age, age.Remod, X1st.Flr.SF, Full.Bath, Fireplaces
Feature engineering and cleaning/transforming the data are among the major tasks in choosing the best model.
Measuring improvements via adjusted R-squared, root mean squared error and visual analysis of residuals-vs-fitted plots is vital for understanding changes. Managing outliers/ high-leverage points has a great impact on model improvement. Following a stepwise approach (improving and measuring) helps to find a robust model.
In short, to have a good model one needs to obtain good data, explore the data to understand it well, build the model, diagnose it, test it and validate it.
trans

trans <- function(data) {
# Overall.Qual/ .Cond -->> factors, Year.Built/ .Remod.Add -->> age/ age.Remod
trans <- data %>% mutate(Overall.Qual= factor(Overall.Qual),
Overall.Cond= factor(Overall.Cond)) %>%
mutate(age= 2020-Year.Built,age.Remod= 2020-Year.Remod.Add) %>%
dplyr::select(-c(Year.Built, Year.Remod.Add))
# "" -->> NA, NA -->> None/ 0
for (i in 1:ncol(trans)) trans[i][trans[i] == ""] <- NA
astr0<- trans %>% dplyr::select(where(is.factor))
astr01<- trans %>% dplyr::select(-c(age, age.Remod, Garage.Yr.Blt,
Mo.Sold)) %>%
dplyr::select(!(where(is.factor)))
astr02 <- trans %>% dplyr::select(age, age.Remod, Garage.Yr.Blt,
Mo.Sold)
for (i in 1:ncol(astr0)) levels(astr0[[i]]) <- c(levels(astr0[[i]]),
"None")
astr0[is.na(astr0)] <- "None"
astr01[is.na(astr01)] <- 0
trans <- tibble(cbind(astr0, astr01, astr02))
# categorical vars -->> recoded
trans$Overall.Qual <- fct_collapse(trans$Overall.Qual,
LOW = c("1", "2", "3", "4", "5"),
TA= c("6","7"), HIGH = c("8", "9", "10"))
trans$Foundation <- fct_collapse(trans$Foundation,
LOW = c("CBlock", "PConc", "Slab"),
HIGH = c("BrkTil", "Stone", "Wood"))
trans$Heating.QC <- fct_collapse(trans$Heating.QC,
LOW = c("Po", "Fa", "TA"),
HIGH = c("Gd", "Ex"))
trans$Kitchen.Qual <- fct_collapse(trans$Kitchen.Qual,
LOW = c("Po", "Fa"),
TA=c("TA", "Gd"),
HIGH = c("Ex"))
trans$Exter.Qual <- fct_collapse(trans$Exter.Qual,
LOW = c("Fa"),
HIGH = c("TA", "Gd", "Ex"))
trans
}

transfin

transfin <- function(data) {
# Overall.Qual/ .Cond -->> factors, Year.Built/ .Remod.Add -->> age/ age.Remod
transfin <- data %>% mutate(Overall.Qual= factor(Overall.Qual),
Overall.Cond= factor(Overall.Cond)) %>%
mutate(age= 2020-Year.Built,age.Remod= 2020-Year.Remod.Add) %>%
dplyr::select(-c(Year.Built, Year.Remod.Add, Foundation))
# "" -->> NA, NA -->> None/ 0
for (i in 1:ncol(transfin)) transfin[i][transfin[i] == ""] <- NA
astr0<- transfin %>% dplyr::select(where(is.factor))
astr01<- transfin %>% dplyr::select(-c(age, age.Remod, Garage.Yr.Blt,
Mo.Sold)) %>%
dplyr::select(!(where(is.factor)))
astr02 <- transfin %>% dplyr::select(age, age.Remod, Garage.Yr.Blt,
Mo.Sold)
for (i in 1:ncol(astr0)) levels(astr0[[i]]) <- c(levels(astr0[[i]]),
"None")
astr0[is.na(astr0)] <- "None"
astr01[is.na(astr01)] <- 0
transfin <- tibble(cbind(astr0, astr01, astr02))
# log-transform
transfin<- transfin %>% dplyr::select(all_of(corr.id)) %>%
mutate("log(1+price)"=log(1+price),"log(1+area)" = log(1+area),
"log(1+age)"= log(1+age),"log(1+age.Remod)"= log(1+age.Remod),
"log(1+X1st.Flr.SF)"= log(1+X1st.Flr.SF),
"log(1+Full.Bath)"= log(1+Full.Bath),
"log(1+TotRms.AbvGrd)"= log(1+TotRms.AbvGrd),
"log(1+Fireplaces)"= log(1+Fireplaces),
"log(1+Garage.Cars)"= log(1+Garage.Cars)) %>%
dplyr::select(-c(price, area, age, age.Remod, X1st.Flr.SF, Full.Bath,
TotRms.AbvGrd, Fireplaces, Garage.Cars))
transfin
}