Intro

This is the final part of the Capstone Project for Coursera’s Statistics with R Specialization.

The Capstone Project aims to develop a model to predict the selling price of a given house in Ames, Iowa. This information helps assess whether the asking price of a house is higher or lower than the true value of the house. If the home is undervalued, it may be a good investment.

The data for the project comes from the Ames Assessor’s Office information used in computing assessed values for individual residential properties sold in Ames, IA from 2006 to 2010. More information can be found in the Codebook. The data have been randomly divided into three separate data sets: a training data set, a test data set, and a validation data set.


  • Code chunks can be displayed by clicking the Code button

Load the data and necessary packages

load("./data/1220_SR-CS_RealEstate/ames_train.Rdata")
library(statsr); library(dplyr); library(BAS)
library(ggplot2); library(plotly); library(corrplot)
library(knitr); library(gridExtra); library(DT)
library(car); library(MASS); library(tidyverse)
library(pander)

1. Exploratory Data Analysis (EDA)

Explore the structure of the data and the relationships between the variables in the data set.


Data

Start with the ames_train data set:

datatable(ames_train, extensions = 'Responsive')
  • ames_train includes \(1 000\) observations on \(81\) variables.

  • two categorical variables are coded as type int (Overall.Qual/ Overall.Cond), so convert them to factors in a working data set train:

train <- ames_train %>% mutate(Overall.Qual= factor(Overall.Qual),
                               Overall.Cond= factor(Overall.Cond))
  • change variables Year.Built/ Year.Remod.Add to age/ age.Remod for convenience:
train <- ames_train %>% mutate(Overall.Qual= factor(Overall.Qual),
                               Overall.Cond= factor(Overall.Cond)) %>%
        mutate(age = 2020-Year.Built,age.Remod = 2020-Year.Remod.Add) %>%
        dplyr::select(-c(Year.Built, Year.Remod.Add))

Histograms

Look at the distributions and summary statistics of what seem to be the most important non-categorical variables in the train data set: price, area, age, and age.Remod:

# helper: extract the legend grob from a ggplot so a single shared legend can be drawn
get_legend<-function(myggplot){
  tmp <- ggplot_gtable(ggplot_build(myggplot))
  leg <- which(sapply(tmp$grobs, function(x) x$name) == "guide-box")
  legend <- tmp$grobs[[leg]]
  return(legend)
}

gprice <-ggplot(train, aes(x = price/1000, y = ..density..)) +
  geom_histogram(fill = "cornflowerblue", colour = "darkgray") + 
  geom_density(size = 1, colour = "brown")+
  geom_vline(aes(xintercept=mean(price/1000,na.rm=TRUE), color="mean"), size=1)+
        geom_vline(aes(
                xintercept=median(price/1000,na.rm=TRUE), color="median"),
                size=1) +
        scale_color_manual(name=NULL, values=c(mean="purple", median="violet"))+
        theme(legend.position = "top")+
        labs(x= "price ($, thsnd)",title = "Distribution of House Price")
garea <-ggplot(train, aes(x = area, y = ..density..)) +
  geom_histogram(fill = "cornflowerblue", colour = "darkgray") + 
  geom_density(size = 1, colour = "brown")+
  geom_vline(aes(xintercept=mean(area,na.rm=TRUE), color="mean"), size=1)+
        geom_vline(aes(
                xintercept=median(area,na.rm=TRUE), color="median"), size=1) +
        scale_color_manual(name=NULL, values=c(mean="purple", median="violet"))+
        theme(legend.position = "none")+
        labs(x= "area (SF)", y="", title = "Distribution of House Area")
gage <-ggplot(train, aes(x = age, y = ..density..)) +
  geom_histogram(fill = "cornflowerblue", colour = "darkgray") + 
  geom_density(size = 1, colour = "brown")+
  geom_vline(aes(xintercept=mean(age,na.rm=TRUE), color="mean"), size=1)+
        geom_vline(aes(
                xintercept=median(age,na.rm=TRUE), color="median"), size=1) +
        scale_color_manual(name=NULL, values=c(mean="purple", median="violet"))+
        theme(legend.position = "none")+
        labs(x= "age of house (years)", title = "Distribution of House Age")
gageR <-ggplot(train, aes(x = age.Remod, y = ..density..)) +
  geom_histogram(fill = "cornflowerblue", colour = "darkgray") + 
  geom_density(size = 1, colour = "brown")+
  geom_vline(aes(xintercept=mean(age.Remod,na.rm=TRUE), color="mean"), size=1)+
        geom_vline(aes(
                xintercept=median(age.Remod,na.rm=TRUE), color="median"), size=1) +
        scale_color_manual(name=NULL, values=c(mean="purple", median="violet"))+
        theme(legend.position = "none")+
        labs(x= "age of house remodel (years)", y="",
             title = "Distribution of House Remodel Age")
legend <- get_legend(gprice)
gprice <- gprice + theme(legend.position="none")
blankPlot <- ggplot()+geom_blank(aes(1,1)) + 
  cowplot::theme_nothing()
grid.arrange(legend, gprice, garea,  gage, gageR, ncol=2, nrow =3, 
             layout_matrix = rbind(c(1,1), c(2,3), c(4,5)),
             widths = c(2.7, 2.7), heights = c(0.2, 2.5, 2.5))

  • all the distributions of price/ area/ age/ age.Remod are right-skewed, so medians (not means) should be used for analysis
    • all these variables would be better log-transformed (a quick numeric check is sketched right after this list)
  • the age/ age.Remod distributions are also multimodal
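
As a quick numeric check of the skewness claim, a simple moment-based skewness estimate (a sketch; no dedicated skewness package is assumed) should move much closer to zero after the log-transformation:

mskew <- function(x) mean((x - mean(x, na.rm = TRUE))^3, na.rm = TRUE) / sd(x, na.rm = TRUE)^3
# compare raw versus logged skewness for price and area
round(c(price = mskew(train$price), log.price = mskew(log(train$price)),
        area = mskew(train$area), log.area = mskew(log(train$area))), 2)
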
skew <- function(x) {
  train %>%
    summarize(Q1 = quantile(x, 0.25), MEDIAN = median(x), MEAN = mean(x),
              Q3 = quantile(x, 0.75), IQR = IQR(x), STDEV = sd(x)) %>%
                mutate(SKEW = ifelse(MEAN > MEDIAN, "RIGHT", "LEFT"))
}
pr <- skew(train$price)
ar <- skew(train$area)
ag <- skew(train$age)
agR <- skew(train$age.Remod)
summ<- rbind(pr,ar,ag, agR)
summ<- cbind(variable=c('price','area','age', 'remodel age'), summ)
kable(summ,
      caption = "Summary statistics for `price`, `area`, `age`,
      `age.Remod` variables")
Summary statistics for price, area, age, age.Remod variables
variable Q1 MEDIAN MEAN Q3 IQR STDEV SKEW
price 129762.5 159467.0 181190.076 213000.00 83237.50 81909.78780 RIGHT
area 1092.0 1411.0 1476.615 1743.25 651.25 505.17419 RIGHT
age 19.0 45.0 47.797 65.00 46.00 29.63741 RIGHT
remodel age 16.0 27.5 35.662 54.00 38.00 20.55749 RIGHT
  • average house price in Ames is \(\$181\ 190\), which is higher than the median price
  • average house area/ age/ remodel age are also higher than their medians

Missing Values

There are 17 blank ("") entries and 4711 NA values in train. Convert the blanks to NA and examine all the missing values:

for (i in  1:ncol(train)) train[i][train[i] == ""] <- NA
nas <- colSums(is.na(train))
nas <- tibble(variable = names(nas), count.na = nas) %>%
  filter(count.na !=0) %>% arrange(desc(count.na))

astr<- train %>% dplyr::select(PID,c(nas$variable)) %>%
  dplyr::select(where(is.factor))
astr1<- train %>% dplyr::select(c(PID,nas$variable)) %>%
  dplyr::select(!(where(is.factor)))

gg<-ggplot(data = nas, aes(x = variable, y = count.na, fill = count.na)) +
  geom_bar(stat = "identity")+
  ggtitle("Number of NA's per variable (interactive)") +
  theme(axis.text.x = element_text(angle = 60, hjust = 1))
ggplotly(gg)

A total of 26 variables with missing values have been detected, 15 of them categorical and 12 integer.

Most problematic variables:

  • Pool.QC (997 NA values),
  • Misc.Feature (971 NA values),
  • Alley (933 NA values),
  • Fence (798 NA values),
  • Fireplace.Qu (491 NA values)

The Codebook reveals that NA values for categorical variables have been applied if an item doesn’t exist (e.g. Garage.Qual == NA means No garage).
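
As a quick sanity check of this coding convention (a sketch, run on train before imputation), missing Fireplace.Qu values should line up with houses that have zero fireplaces:

# cross-tabulate missing fireplace quality against the fireplace count
train %>% count(no.fireplace = (Fireplaces == 0), qu.missing = is.na(Fireplace.Qu))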

Then explore the NA values in the integer variables:

bsmt<- train %>%  filter(is.na(BsmtFin.SF.1)==TRUE |
                                is.na(BsmtFin.SF.2)==TRUE|
                                is.na(Bsmt.Unf.SF)==TRUE|
                                is.na(Total.Bsmt.SF)==TRUE|
                                is.na(Bsmt.Full.Bath)==TRUE|
                                is.na(Bsmt.Half.Bath)==TRUE) %>%
  dplyr::select(c(35,37:39,47:48,31))
kable(bsmt,caption = "NA's in `Basement.*` non-categorical variables")
NA’s in Basement.* non-categorical variables
BsmtFin.SF.1 BsmtFin.SF.2 Bsmt.Unf.SF Total.Bsmt.SF Bsmt.Full.Bath Bsmt.Half.Bath Bsmt.Qual
NA NA NA NA NA NA NA
garage <- train %>% filter(is.na(Garage.Yr.Blt)==TRUE |
                                  is.na(Garage.Finish)==TRUE) %>%
  dplyr::select(c(59:63))
frontAll <- train %>% filter(is.na(Lot.Frontage)==TRUE) %>%
  dplyr::select(c(6,5,13))
front<- frontAll %>% filter(!(Lot.Config == "Inside"))
datatable(garage, caption = "NA's in `Garage.*` variables",
          options = list(columnDefs = list(list(
  targets = 1:5,
  render = JS(
    "function(data, type, row, meta) {",
    "return data === null ? 'NA' : data;",
    "}")
))))
datatable(frontAll, caption = "NA's in `Lot.Frontage` variable",
          options = list(columnDefs = list(list(
  targets = 1:3,
  render = JS(
    "function(data, type, row, meta) {",
    "return data === null ? 'NA' : data;",
    "}")
))))

There is only one case where all six Basement.* non-categorical variables equal NA, and it is a house with no basement, since Bsmt.Qual == NA as well. So it makes sense to replace these missing (area-describing) values with zero in this case.

The same holds for the Garage.* variables: they equal NA only when there is no garage (Garage.Qual == NA). So replace all of them, except for the year of construction Garage.Yr.Blt, with zero as well.

The situation with the Lot.Frontage variable is slightly different. Most of its NA cases (89 of 167) are inside lots (Lot.Config == Inside); the rest are sales located in the Floating Village Residential/ Residential Low Density zones (MS.Zoning == RL, FV). In both cases, Lot.Frontage (linear feet of street connected to the property) can be taken as zero.

Missing values handling strategy

  • categorical variables
    • add level None for each variable,
    • assign level None to NA values
  • non-categorical variables
    • change NA values to zero (0) for all variables except those describing dates (age, age.Remod, Garage.Yr.Blt, Mo.Sold)
astr<- train %>% dplyr::select(where(is.factor))
astr1<- train %>% dplyr::select(-c(age, age.Remod, Garage.Yr.Blt,
                                 Mo.Sold)) %>%
  dplyr::select(!(where(is.factor)))
astr2 <- train %>% dplyr::select(age, age.Remod, Garage.Yr.Blt,
                                      Mo.Sold)

for (i in  1:ncol(astr)) levels(astr[[i]]) <- c(levels(astr[[i]]), "None")
astr[is.na(astr)] <- "None"
astr1[is.na(astr1)] <- 0
train <- tibble(cbind(astr, astr1, astr2))

Correlations

Construct a correlogram to investigate the relationships between variables and decide which of them to carry into further analysis. Consider only variables whose correlation with price exceeds \(0.51\) in absolute value as the most realistic candidates for a regression model:

data.corr <- as.data.frame(sapply(train, as.numeric))
correlations <- cor(data.corr, method = "s")
corr.price <- as.matrix(correlations[,'price'])
corr.id <- names(which(apply(corr.price, 1, function(x) (abs(x) > 0.51))))
col <- colorRampPalette(
  c("#BB4444", "#EE9988", "#FFFFFF", "#77AADD", "#4477AA"))
corrplot(as.matrix(correlations[corr.id,corr.id]), type = 'upper',
         method='color', col=col(200), order="FPC", number.font = 4,
         number.cex=.8, tl.col = "purple3", addCoef.col = 'black', tl.cex=.9,
         mar = c(0, 0, 1, 0))

  • strong correlation between Garage.Area and Garage.Cars (\(0.88\)) \(=>\) choose Garage.Cars as having higher correlation with price (\(0.72\))
  • high correlation between Full.Bath and area (\(0.68\)) \(=>\) choose area (corr with price: \(0.74\))
  • strong correlation between X1st.Flr.SF and Total.Bsmt.SF (\(0.84\)) \(=>\) choose Total.Bsmt.SF (corr with price: \(0.62\))
  • strong correlation between TotRms.AbvGrd and area (\(0.82\)) \(=>\) choose area (corr with price: \(0.74\))
  • high correlation between Garage.Finish and age (\(0.62\)) \(=>\) choose age (corr with price: \(-0.71\))
  • high correlation between Bsmt.Qual and age (\(0.67\)) \(=>\) choose age (corr with price: \(-0.71\))

2. Initial Model

Start by creating a simple, intuitive initial model: select 10 predictor variables from ames_train and create a linear model for price using those variables.


Justification for variables chosen

From the EDA above, choose for the initial model:

  • Garage.Cars, Total.Bsmt.SF, area, age variables

Also take important variables with noticeable correlation with price:

  • Overall.Qual (corr with price: \(0.81\)),
  • age.Remod (corr with price: \(-0.62\)),
  • Foundation (corr with price: \(0.56\)),
  • Heating.QC (corr with price: \(-0.53\)),
  • Kitchen.Qual (corr with price: \(-0.61\)),
  • Exter.Qual (corr with price: \(-0.64\))

R code and summary output table

Fit the price.all model with all these variables, log-transforming the price/ area/ age/ age.Remod variables due to their skewness (see the Histograms section):

price.all <- lm(log(price) ~ Garage.Cars+ Total.Bsmt.SF+ log(area)+
                  log(age)+
                  Overall.Qual + log(age.Remod) + Foundation+
                   Heating.QC + Kitchen.Qual+ Exter.Qual, data = train)
pander(summary(price.all))
  Estimate Std. Error t value Pr(>|t|)
(Intercept) 9.24 0.2268 40.74 2.89e-212
Garage.Cars 0.06621 0.009854 6.719 0.00000000003113
Total.Bsmt.SF 0.0001777 0.00001721 10.33 8.696e-24
log(area) 0.3631 0.0225 16.14 4.42e-52
log(age) -0.06708 0.01749 -3.835 0.0001335
Overall.Qual2 -0.322 0.1918 -1.679 0.0934
Overall.Qual3 0.06454 0.1918 0.3365 0.7366
Overall.Qual4 0.1107 0.1883 0.588 0.5566
Overall.Qual5 0.2708 0.1892 1.431 0.1527
Overall.Qual6 0.317 0.1903 1.665 0.09617
Overall.Qual7 0.3825 0.1912 2.001 0.04572
Overall.Qual8 0.4958 0.1922 2.58 0.01003
Overall.Qual9 0.5847 0.1958 2.986 0.002901
Overall.Qual10 0.519 0.2048 2.533 0.01145
log(age.Remod) -0.03493 0.01407 -2.482 0.01322
FoundationCBlock 0.07834 0.02074 3.777 0.0001685
FoundationPConc 0.02743 0.02613 1.05 0.2942
FoundationSlab 0.1293 0.05579 2.317 0.02069
FoundationStone 0.05593 0.09811 0.5701 0.5687
Heating.QCFa -0.09616 0.03879 -2.479 0.01334
Heating.QCGd -0.01124 0.01672 -0.6723 0.5016
Heating.QCPo -0.2923 0.1648 -1.774 0.07644
Heating.QCTA -0.07162 0.01573 -4.553 0.000005971
Kitchen.QualFa -0.111 0.05164 -2.149 0.03189
Kitchen.QualGd -0.04578 0.03098 -1.478 0.1398
Kitchen.QualPo -0.1597 0.1682 -0.9493 0.3427
Kitchen.QualTA -0.105 0.0342 -3.071 0.002196
Exter.QualFa -0.256 0.07035 -3.639 0.0002878
Exter.QualGd -0.07607 0.04146 -1.835 0.06687
Exter.QualTA -0.06559 0.04605 -1.424 0.1547
Fitting linear model: log(price) ~ Garage.Cars + Total.Bsmt.SF + log(area) + log(age) + Overall.Qual + log(age.Remod) + Foundation + Heating.QC + Kitchen.Qual + Exter.Qual
Observations Residual Std. Error \(R^2\) Adjusted \(R^2\)
1000 0.1634 0.8535 0.8492

Discussion of model results

Model as a whole

  • the model adjusted R-squared of 0.8492 is quite high
    • the model explains 84.92\(\%\) of the variance in log(price) around its mean
  • the F-statistic p-value \(< 2.2\cdot 10^{-16}\) is essentially zero
    • the probability of observing such a fit if the model were insignificant (\(H_0\)) is extremely small, so \(H_0\) is rejected

Predictor coefficients

  • all non-categorical variables except log(age.Remod) appear highly significant
    • their coefficient p-values are all far below \(0.001\)
    • log(age.Remod) is also significant at the \(0.05\) level (p-value = 0.0132)
  • the categorical variables have levels whose coefficients do not differ significantly from the reference level
    • Overall.Qual: levels \(2-6\) (reference level \(=1\))
    • Foundation: levels PConc (Poured Concrete)/ Stone (reference level: BrkTil - Brick & Tile)
    • Heating.QC: levels Gd/ Po (reference level: Ex - Excellent)
    • Kitchen.Qual: levels Gd/ Po (reference level: Ex - Excellent)
    • Exter.Qual: levels Gd/ TA (reference level: Ex - Excellent)

One can try to recode categorical variables.


Model Selection

Choose the best model, using the initial model as a starting point.


From the price.all results above, recode the categorical variables so that they only contain LOW/ HIGH levels (plus one more level, TA - Typical/ Average, for Overall.Qual/ Kitchen.Qual). Then rerun lm() to refit price.all with the recoded variables:

train$Overall.Qual <- fct_collapse(train$Overall.Qual,
                                   LOW = c("1", "2", "3", "4", "5"),
                                   TA= c("6","7"), HIGH = c("8", "9", "10"))
train$Foundation <- fct_collapse(train$Foundation,
                                 LOW = c("CBlock", "PConc", "Slab"),
                                 HIGH = c("BrkTil", "Stone", "Wood"))
train$Heating.QC <- fct_collapse(train$Heating.QC,
                                 LOW = c("Po", "Fa", "TA"),
                                 HIGH = c("Gd", "Ex"))
train$Kitchen.Qual <- fct_collapse(train$Kitchen.Qual,
                                 LOW = c("Po", "Fa"),
                                 TA=c("TA", "Gd"),
                                 HIGH = c("Ex"))
train$Exter.Qual <- fct_collapse(train$Exter.Qual,
                                 LOW = c("Fa"),
                                 HIGH = c("TA", "Gd", "Ex"))

price.all <- lm(log(price) ~ Garage.Cars+ Total.Bsmt.SF+
                          log(area)+ log(age)+
                  Overall.Qual + log(age.Remod) + Foundation+
                   Heating.QC + Kitchen.Qual+ Exter.Qual,
                  data = train)
pander(summary(price.all))
  Estimate Std. Error t value Pr(>|t|)
(Intercept) 8.975 0.1632 55 8.182e-303
Garage.Cars 0.05917 0.0104 5.69 0.00000001677
Total.Bsmt.SF 0.0002072 0.00001709 12.12 1.271e-31
log(area) 0.4276 0.02254 18.97 1.138e-68
log(age) -0.06167 0.01575 -3.915 0.00009649
Overall.QualTA 0.08911 0.01543 5.776 0.00000001024
Overall.QualHIGH 0.2211 0.02495 8.86 3.653e-18
log(age.Remod) -0.05758 0.01433 -4.017 0.00006329
FoundationLOW 0.1039 0.02136 4.863 0.000001347
Heating.QCLOW -0.07704 0.01415 -5.443 0.00000006619
Kitchen.QualLOW -0.1703 0.0482 -3.534 0.000428
Kitchen.QualTA -0.1031 0.02651 -3.89 0.0001071
Exter.QualLOW -0.2792 0.05413 -5.158 0.0000003019
Fitting linear model: log(price) ~ Garage.Cars + Total.Bsmt.SF + log(area) + log(age) + Overall.Qual + log(age.Remod) + Foundation + Heating.QC + Kitchen.Qual + Exter.Qual
Observations Residual Std. Error \(R^2\) Adjusted \(R^2\)
1000 0.1761 0.8267 0.8246

The model with the categorical variables recoded (price.all) is, as a whole, almost the same as the original fit except for the adjusted R-squared:

  • the model adjusted R-squared dropped to 0.8246
    • the variance in price around its mean explained by the model became 82.46\(\%\)
  • the F-statistic p-value \(< 2.2\cdot 10^{-16}\) is essentially zero
    • the probability of observing such a fit if the model were insignificant (\(H_0\)) is extremely small, so \(H_0\) is rejected

At the same time, all predictors now appear highly significant:

  • their coefficient p-values are all far below \(0.001\)

Model Selection Approaches

  • start with full price.all predicting log(price) based on all ten chosen variables (with categorical variables recoded)
  • backwards elimination with stepAIC() function
    • drop one predictor at a time until the parsimonious model is reached
    • dropping rules: AIC/ BIC (formulas given right after this list)
      • AIC (Akaike information criterion: balances overfitting and underfitting risk),
      • BIC (Bayesian information criterion: similar to AIC, but with a stronger penalty on the number of parameters)
  • OR Bayesian adaptive sampling (BAS) with the bas.lm function
    • samples models with replacement via an MCMC algorithm
    • Zellner-Siow Cauchy prior (ZS-null)
    • equal prior probabilities for all models (uniform())
      • BMA (Bayesian model averaging using posterior model probabilities)
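
For reference, the two criteria are \(AIC = 2k - 2\ln\hat{L}\) and \(BIC = k\ln n - 2\ln\hat{L}\), where \(\hat{L}\) is the maximized likelihood, \(k\) the number of estimated parameters and \(n\) the number of observations. Since \(\ln n > 2\) for \(n \ge 8\), BIC penalizes extra parameters more heavily, which is why the BIC run of stepAIC() below uses k = log(nrow(train)).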

Backwards elimination

price.aic <- stepAIC(price.all, direction = "backward", trace = 0, k=2)
price.bic <-stepAIC(price.all, direction = "backward", trace = 0,
                    k= log(nrow(train)))
step<- cbind(coef(price.all), coef(price.aic), coef(price.bic),
             summary(price.all)$coefficients[,4])
colnames(step) <- c("price.all", "price.aic", "price.bic", "Pr(>|t|)")
datatable(round(step,4), options = list(pageLength = 13),
          caption = "Backwards elimination coefficients")
  • both backwards elimination criteria (AIC/ BIC) arrive at the same result, keeping all price.all predictors
  • the p-value for the model F-statistic and the adjusted R-squared were discussed just above (price.all model characteristics)

Bayesian model averaging

price.bma <- bas.lm(log(price) ~ Garage.Cars+ Total.Bsmt.SF+
                            log(area)+ log(age)+
                    Overall.Qual + log(age.Remod) + Foundation+
                    Heating.QC + Kitchen.Qual+ Exter.Qual,
                    data = train,
                  modelprior = uniform(), prior = "ZS-null",
                  method = "MCMC")
datatable(data.frame(round(summary(price.bma)[1:13,1],6)),
          options = list(pageLength = 13),
          colnames = c("", "P(B != 0 | Y)"))
  • BAS method results in posterior probabilities almost \(=1\) for all predictors except for Kitchen.QualLOW/ Kitchen.QualTA
    • 0.9023 for Kitchen.QualLOW
    • 0.9308 for Kitchen.QualTA

Look at convergence plot for posterior inclusion probabilities (pip):

diagnostics(price.bma, type="pip", col = "purple", pch = 16, cex = 1.5)

  • the posterior inclusion probability of each variable from MCMC has converged to the theoretical posterior inclusion probability

Model Space Visualization:

image(price.bma,rotate=FALSE)

  • the highest-ranked model also includes all of the initial variables

Choosing the Bayesian approach, select the price.bma model as the initial model:

initial.model <- price.bma

Initial Model Residuals

One way to assess the performance of a model is to examine the model’s residuals. Create a residual plot for the preferred model from above and use it to assess whether the model appears to fit the data well.


plot(initial.model, which=1, sub.caption = "")

  • the Residuals vs Fitted plot reveals:
    • a wave pattern
      • the model tends to underestimate the high-price segment
    • outliers
      • observations #310, #428, #741 (all overestimated)
out<-cbind(row = c(310,428,741),
           train[c(310,428,741),c(48,68,56,47,78,14,79,23,30,33,21)])
datatable(out, extensions = 'Responsive')

Possible sources of problems

gg<-ggplot(data = train, aes(x=area, y=price))+
        geom_jitter(col="cornflowerblue")+
        stat_smooth(colour="brown")+
        ggtitle("Price vs area")
ggplotly(gg)
  • observation #310: unusually large area (well beyond three standard deviations)
    • no other points in the range \(>4\ 000\) SF \(=>\) the observation could affect the model as an influential point
gg<-ggplot(data = train, aes(x=age, y=price))+
        geom_jitter(col="cornflowerblue")+
        stat_smooth(method="lm", colour="brown")+
        ggtitle("Price vs age")
ggplotly(gg)
  • observations #428/ #741
    • an age of around 100 years is not very unusual: within two standard deviations
    • there are quite a few points near \(100\) years \(=>\) these observations are unlikely to act as influential points
    • some predictors not included in the model may be driving their prices

Possible solutions

  • remove houses with area of more than \(4\ 000\) square feet from the data set
    • and, accordingly, not predict prices for houses over \(4\ 000\) square feet
  • find other important variable(s) to include them in the model
  • apply some type of interaction/ transformation of the predictors

Initial Model RMSE

RMSE (root mean squared error) can be calculated directly based on the model output.
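
Concretely, for \(n\) houses with observed prices \(y_i\) and predicted prices \(\hat{y}_i\), \(RMSE = \sqrt{\frac{1}{n}\sum_{i=1}^{n}(y_i-\hat{y}_i)^2}\). Since the model predicts log(price), the fitted values are exponentiated back to the dollar scale before the residuals are taken, as in the chunk below.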


pred.train <- predict(initial.model, train, estimator = "BMA")
resid.train <- na.omit(train$price - exp(pred.train$fit))
rmse.train <- sqrt(mean(resid.train^2))

RMSE (root mean squared error) for the BMA initial.model is 33789.08 dollars.

train.no4 <- train %>% filter(area<4000)
initial.no4 <- bas.lm(log(price) ~ Garage.Cars+ Total.Bsmt.SF+
                                log(area)+ log(age)+
                    Overall.Qual + log(age.Remod) + Foundation+
                    Heating.QC + Kitchen.Qual+ Exter.Qual,
                    data = train.no4,
                  modelprior = uniform(), prior = "ZS-null", method = "MCMC")
pred.train.no4 <- predict(initial.no4, train.no4, estimator = "BMA")
resid.train.no4 <- na.omit(train.no4$price - exp(pred.train.no4$fit))
rmse.train.no4 <- sqrt(mean(resid.train.no4^2))
rmse<- cbind(with.4000=rmse.train, without.4000=rmse.train.no4)
rownames(rmse) <- "RMSE"
datatable(round(rmse,2),
      caption = "RMSE for BMA `initial.model` with and without area of more than 4 000 SF")

Removing observations with area over \(4\ 000SF\) results in a significantly improved RMSE value.


Overfitting

The model may do well on training data but perform poorly out-of-sample (meaning, on a data set other than the original training data) because the model is overly-tuned to specifically fit the training data. This is called “overfitting”. To determine whether overfitting is occurring on the model, compare the performance of it on both in-sample (ames_train) and out-of-sample (ames_test) data sets.

load("./data/1220_SR-CS_RealEstate/ames_test.Rdata")

Use the model from above to generate predictions for the housing prices in the test data set.


test <- trans(ames_test) # Appendix: trans() makes the same changes as with training data set
pred.test <- predict(initial.no4, test, estimator = "BMA") # predict with the refit that excludes the > 4 000 SF outlier
resid.test <- na.omit(test$price - exp(pred.test$fit))
rmse.test <- sqrt(mean(resid.test^2))
rmse<- cbind(train=rmse.train, test=rmse.test)
rownames(rmse) <- "RMSE"
datatable(round(rmse,2),
      caption = "RMSE for BMA `initial.model`: `train` and `test` data")

RMSE (root mean squared error) is used to compare models’ prediction accuracy and out-of-sample performance: the lower the RMSE, the better the prediction. In general, RMSE on a training data set is lower than on a test set, because the model was built on, and fits, the training data. Here the situation is reversed: initial.model shows a larger RMSE on the training data set, which suggests the model is not overfitting.


3. Development of a Final Model

Now create a final model with up to 20 variables to predict housing prices in Ames, IA, selecting from the full array of variables in the data set.

Final Model


The final model price.aic is obtained by applying stepAIC() backwards elimination:

  • drop one predictor at a time until a parsimonious model is reached
  • dropping rule: AIC (Akaike information criterion, balancing overfitting and underfitting risk)
load("./data/1220_SR-CS_RealEstate/ames_train.Rdata")
train <- ames_train %>% mutate(Overall.Qual= factor(Overall.Qual),
                         Overall.Cond= factor(Overall.Cond)) %>%
        mutate(age= 2020-Year.Built,age.Remod= 2020-Year.Remod.Add) %>%
        dplyr::select(-c(Year.Built, Year.Remod.Add, Foundation))

# "" -->> NA, NA -->> None/ 0
for (i in  1:ncol(train)) train[i][train[i] == ""] <- NA
astr0<- train %>% dplyr::select(where(is.factor))
astr01<- train %>% dplyr::select(-c(age, age.Remod, Garage.Yr.Blt,
                                    Mo.Sold)) %>%
        dplyr::select(!(where(is.factor)))
astr02 <- train %>% dplyr::select(age, age.Remod, Garage.Yr.Blt,
                                  Mo.Sold)
for (i in  1:ncol(astr0)) levels(astr0[[i]]) <- c(levels(astr0[[i]]),
                                                  "None")
astr0[is.na(astr0)] <- "None"
astr01[is.na(astr01)] <- 0
train <- tibble(cbind(astr0, astr01, astr02))

# corr>0.5
data.corr <- as.data.frame(sapply(train, as.numeric))
correlations <- cor(data.corr, method = "s")
corr.price <- as.matrix(correlations[,'price'])
corr.id <- names(which(apply(corr.price, 1, function(x) (abs(x) > 0.5))))

train<- train %>% filter(area<4000) # exclude outlier

# log-transform
train<- train %>% dplyr::select(all_of(corr.id)) %>%
        mutate("log(1+price)"=log(1+price),"log(1+area)" = log(1+area),
               "log(1+age)"= log(1+age),"log(1+age.Remod)"= log(1+age.Remod),
               "log(1+X1st.Flr.SF)"= log(1+X1st.Flr.SF),
               "log(1+Full.Bath)"= log(1+Full.Bath),
               "log(1+TotRms.AbvGrd)"= log(1+TotRms.AbvGrd),
               "log(1+Fireplaces)"= log(1+Fireplaces),
               "log(1+Garage.Cars)"= log(1+Garage.Cars)) %>%
        dplyr::select(-c(price, area, age, age.Remod, X1st.Flr.SF, Full.Bath,
                         TotRms.AbvGrd, Fireplaces, Garage.Cars))
# modelling
price.all <- lm(`log(1+price)` ~ ., data = train)
price.aic <- stepAIC(price.all, direction = "backward", trace = 0, k=2)
pander(summary(price.aic))
  Estimate Std. Error t value Pr(>|t|)
(Intercept) 9.216 0.2653 34.74 7.615e-172
Overall.Qual2 -0.4208 0.1784 -2.358 0.01856
Overall.Qual3 0.006049 0.1804 0.03353 0.9733
Overall.Qual4 0.04687 0.1773 0.2643 0.7916
Overall.Qual5 0.1779 0.179 0.9943 0.3203
Overall.Qual6 0.2137 0.1796 1.189 0.2346
Overall.Qual7 0.2664 0.1803 1.478 0.1399
Overall.Qual8 0.354 0.1812 1.954 0.05101
Overall.Qual9 0.4114 0.1849 2.224 0.02635
Overall.Qual10 0.4904 0.194 2.529 0.01161
Exter.QualFa -0.2191 0.06632 -3.304 0.00099
Exter.QualGd -0.06293 0.0393 -1.601 0.1096
Exter.QualTA -0.05482 0.04346 -1.261 0.2075
Bsmt.QualFa -0.1517 0.04382 -3.461 0.000561
Bsmt.QualGd -0.07183 0.0246 -2.92 0.003584
Bsmt.QualPo -0.207 0.1586 -1.306 0.192
Bsmt.QualTA -0.1025 0.02982 -3.437 0.0006141
Bsmt.QualNone -0.09752 0.05395 -1.807 0.071
Heating.QCFa -0.08755 0.03643 -2.404 0.01643
Heating.QCGd 0.00121 0.01561 0.07752 0.9382
Heating.QCPo -0.175 0.1539 -1.137 0.2558
Heating.QCTA -0.05185 0.0143 -3.627 0.0003022
Kitchen.QualFa -0.07635 0.04865 -1.569 0.1169
Kitchen.QualGd -0.03094 0.02943 -1.052 0.2932
Kitchen.QualPo -0.1717 0.1581 -1.086 0.2776
Kitchen.QualTA -0.08277 0.03223 -2.568 0.01038
Garage.TypeAttchd 0.05034 0.05125 0.9822 0.3262
Garage.TypeBasment 0.0125 0.06945 0.18 0.8572
Garage.TypeBuiltIn 0.06377 0.05621 1.134 0.2569
Garage.TypeCarPort 0.1451 0.1621 0.895 0.371
Garage.TypeDetchd 0.00534 0.05153 0.1036 0.9175
Garage.TypeNone -0.03951 0.06023 -0.6561 0.5119
Total.Bsmt.SF 0.0001365 0.00002861 4.772 0.000002103
Garage.Area 0.0001505 0.00003557 4.232 0.00002539
log(1+area) 0.3489 0.02695 12.95 1.854e-35
log(1+age) -0.0603 0.01774 -3.4 0.0007028
log(1+age.Remod) -0.04239 0.01375 -3.083 0.00211
log(1+X1st.Flr.SF) 0.05984 0.03457 1.731 0.08376
log(1+Full.Bath) -0.07591 0.03539 -2.145 0.0322
log(1+Fireplaces) 0.09028 0.01546 5.84 0.000000007157
Fitting linear model: log(1+price) ~ Overall.Qual + Exter.Qual + Bsmt.Qual + Heating.QC + Kitchen.Qual + Garage.Type + Total.Bsmt.SF + Garage.Area + log(1+area) + log(1+age) + log(1+age.Remod) + log(1+X1st.Flr.SF) + log(1+Full.Bath) + log(1+Fireplaces)
Observations Residual Std. Error \(R^2\) Adjusted \(R^2\)
999 0.1528 0.8733 0.8682

The model is statistically significant (p-value \(< 2.2\cdot 10^{-16}\)), contains 14 predictors and explains 86.82\(\%\) of the variance in price around its mean.


Transformation


All non-categorical variables except Total.Bsmt.SF and Garage.Area have been log-transformed after adding 1:

  • their distributions are right-skewed
  • some of them can be equal to zero, and \(\log 0\) is undefined (\(=>\) 1 is added first)
  • the log-transformation provides
    • an improved model fit
    • a more linear relationship between them and price, for example:
gg1 <- ggplot(ames_train, aes(x = area, y = price)) +
  geom_point(color= "cornflowerblue") +
  stat_smooth(method = 'lm', color="brown")+
  ggtitle("price ~ area")
gg2 <- ggplot(ames_train, aes(x = log(area), y = price)) +
  geom_point(color= "cornflowerblue") +
  stat_smooth(method = 'lm', color="brown")+
  ggtitle("price ~ log(area)")
gg3 <- ggplot(ames_train, aes(x = area, y = log(price))) +
  geom_point(color= "cornflowerblue") +
  stat_smooth(method = 'lm', color="brown")+
  ggtitle("log(price) ~ area")
gg4 <- ggplot(ames_train, aes(x = log(area), y = log(price))) +
  geom_point(color= "cornflowerblue") +
  stat_smooth(method = 'lm', color="brown")+
  ggtitle("log(price) ~ log(area)")
grid.arrange(gg1, gg2, gg3, gg4, ncol = 2)


Variable Interaction


A variable interaction (in the sense of, for instance, multiplying two predictors) would have been included if strong multicollinearity were present in the regression model. Use variance/ generalized variance inflation factors (VIF/ GVIF) to check for it:

  • VIF - for non-categorical variables: the square root of VIF indicates how much larger the standard error is, compared with what it would be if that variable were uncorrelated with the other predictor variables in the model
    • \(VIF \hat{(\beta_j)}<5 =>\) the variable is not collinear with the remaining explanatory variables
    • \(VIF \hat{(\beta_j)}>10 =>\) multicollinearity is strong
  • GVIF - for categorical variables:
    • requiring \((GVIF^{\frac{1}{2\cdot DF}})^2 <5\) is equivalent to a requirement that \(VIF<5\) for non-categorical variables
pander(vif(price.aic))
  GVIF Df GVIF^(1/(2*Df))
Overall.Qual 21.28 9 1.185
Exter.Qual 9.143 3 1.446
Bsmt.Qual 11.96 5 1.282
Heating.QC 2.068 4 1.095
Kitchen.Qual 6.618 4 1.266
Garage.Type 3.895 6 1.12
Total.Bsmt.SF 6.249 1 2.5
Garage.Area 2.592 1 1.61
log(1+area) 3.486 1 1.867
log(1+age) 6.062 1 2.462
log(1+age.Remod) 2.841 1 1.685
log(1+X1st.Flr.SF) 5.402 1 2.324
log(1+Full.Bath) 2.517 1 1.586
log(1+Fireplaces) 1.617 1 1.271
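
For example, Overall.Qual has \(GVIF = 21.28\) with \(Df = 9\), so \(GVIF^{\frac{1}{2\cdot 9}} = 21.28^{1/18} \approx 1.185\) and \(1.185^2 \approx 1.40 < 5\), well below the threshold.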

Explanatory variables are not strongly associated with each other \(=>\) no need to include any variable interactions.
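
For illustration only (a hypothetical sketch, not part of the final model), an interaction term could be added directly in the formula if, say, the effect of living area were suspected to depend on overall quality:

# hypothetical sketch: let the effect of log(1+area) vary with Overall.Qual
price.int <- lm(`log(1+price)` ~ . + `log(1+area)`:Overall.Qual, data = train)
# it could then be compared with the selected model, e.g. via AIC(price.aic, price.int)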


Variable Selection


Selection of variables includes several stages:

  • changing variables Year.Built and Year.Remod.Add to age and age.Remod, respectively, for convenience
  • excluding the Foundation variable, since ames_train has no observations at one of its levels (Wood) while ames_test does
  • a correlogram to find the variables having the strongest correlation with price (\(|corr| > 0.50\))
    • top-18 variables:

age Exter.Qual Garage.Finish age.Remod Bsmt.Qual Kitchen.Qual Heating.QC Fireplace.Qu Garage.Type Fireplaces TotRms.AbvGrd X1st.Flr.SF Total.Bsmt.SF Full.Bath Garage.Area area Garage.Cars Overall.Qual

  • backward selection based on AIC: remove variables one at a time until there is no further improvement in the model
    • it also gives better adjusted R-squared/ RMSE performance than BIC (see the comparison below)
price.bic <-stepAIC(price.all, direction = "backward", trace = 0,
                    k= log(nrow(train)))

aicrsq<- round(summary(price.aic)$adj.r.squared,4)
bicrsq<- round(summary(price.bic)$adj.r.squared,4)

pred.train.aic <- predict(price.aic, train)
resid.train.aic <- na.omit(exp(train$`log(1+price)`) - exp(pred.train.aic))
rmse.train.aic <- sqrt(mean(resid.train.aic^2))

pred.train.bic <- predict(price.bic, train)
resid.train.bic <- na.omit(exp(train$`log(1+price)`) - exp(pred.train.bic))
rmse.train.bic <- sqrt(mean(resid.train.bic^2))

aic<- round(c(AIC(price.aic), AIC(price.bic)),2)
bic<- round(c(BIC(price.aic), BIC(price.bic)),2)
fpvalue <- round(anova(price.aic, price.bic)$`Pr(>F)`,4)
model<- c("price.aic", "price.bic")
compar.mdls<- cbind(model= model, `Adj.R-sq`=c(aicrsq, bicrsq),
                    RMSE=round(c(rmse.train.aic, rmse.train.bic),2),
                    AIC=aic, BIC=bic)
kable(compar.mdls, caption = "Comparison of models: `price.aic` versus `price.bic`")
Comparison of models: price.aic versus price.bic
model Adj.R-sq RMSE AIC BIC
price.aic 0.8682 26223.04 -877.55 -676.37
price.bic 0.8591 27528.63 -833.69 -745.37

Model Testing


RMSE (root mean squared error) is used to compare models’ prediction accuracy and out-of-sample performance: the lower the RMSE, the better the prediction. In general, RMSE on a training data set is lower than on a test set, because the model was built on, and fits, the training data. Here, again, the situation is reversed: the final price.aic model shows a larger RMSE on the training data set:

test <- transfin(ames_test)
pred.test.aic <- predict(price.aic, test)
resid.test.aic <- na.omit(exp(test$`log(1+price)`) - exp(pred.test.aic))
rmse.test.aic <- sqrt(mean(resid.test.aic^2))
ds <- c("ames_train", "ames_test")
RMSE<- cbind(`data set` = ds, RMSE = round(c(rmse.train.aic,rmse.test.aic),2))
kable(RMSE, caption = "RMSE for final model on train and test data sets")
RMSE for final model on train and test data sets
data set RMSE
ames_train 26223.04
ames_test 24202.1

This implies the final price.aic model performs well on both the training and test sets, with no sign of overfitting: the model does not overreact to minor fluctuations in the data, so no further changes are needed.


4. Final Model Assessment

Final Model Residual


gg<-ggplot(data = price.aic, aes(x = .fitted, y = .resid))+
        geom_point(colour= "cornflowerblue") +
        geom_smooth(colour="brown") +
        xlab("predicted log(price)") +ylab("residuals")+
        ggtitle("Residuals vs Fitted (interactive)")
ggplotly(gg)
  • the Residuals vs Fitted plot reveals:
    • a left-tail wave: the model tends to underestimate the low-price segment
  • several outliers that do not affect the model, as they are not influential points

Final Model RMSE


ds <- c("ames_train", "ames_test")
RMSE<- cbind(`data set` = ds,
             RMSE.initial = round(c(rmse.train,rmse.test),2),
             RMSE.final = round(c(rmse.train.aic,rmse.test.aic),2))
kable(RMSE, caption = "RMSE: final model vs initial model")
RMSE: final model vs initial model
data set RMSE.initial RMSE.final
ames_train 33789.08 26223.04
ames_test 27144.8 24202.1

Compared with the initial model (Section 2.4), RMSE has improved substantially: from 33789.08 dollars on the training data with initial.model to 24202.10 dollars on the test data with the final price.aic model, i.e. by \((33789.08 - 24202.10)/33789.08 \approx 28.37\%\).

Testing the final model on the validation data set will show the model’s reliability.


Final Model Evaluation

Strengths and weaknesses of the model.


Strengths

  • outliers that could have affected the model have been excluded
  • multicollinearity has been avoided by checking the features via variance inflation factors
  • the log-transformations applied compensate for non-linearity in the relationships
  • proper reflection of uncertainty: the true price falls within the 95\(\%\) prediction interval in about 97\(\%\) of cases
  • RMSE has been improved by 28.37\(\%\)
  • the model explains 86.82\(\%\) of variance in price around its mean

Weaknesses

  • strategy of filling missing data: there may be a more sophisticated approach
  • variables could be clustered to be more informative
  • there could be some other relevant features/ combinations of features not included in the model
  • the model tends to underestimate the low-price segment

Final Model Validation

Testing the final model on a separate, validation data set is a great way to determine how it will perform in real-life practice. Use the ames_validation data set to do some additional assessment of the final model.

load("./data/1220_SR-CS_RealEstate/ames_validation.Rdata")

RMSE of final price.aic model on the validation data

valid <- transfin(ames_validation)
pred.valid.aic <- predict(price.aic, valid)
resid.valid.aic <- na.omit(exp(valid$`log(1+price)`) - exp(pred.valid.aic))
rmse.valid.aic <- sqrt(mean(resid.valid.aic^2))
ds <- c("ames_train", "ames_test", "ames_validation")
RMSE<- cbind(`data set` = ds,
             RMSE.final = round(c(rmse.train.aic,rmse.test.aic, rmse.valid.aic),2))
kable(RMSE, caption = "RMSE: final model")
RMSE: final model
data set RMSE.final
ames_train 26223.04
ames_test 24202.1
ames_validation 23649.92
  • Root Mean Squared Error (RMSE) on the validation ames_validation data set reaches 23649.92 dollars.

Comparison of RMSE on the training/ test/ validation data sets

  • RMSE on the training data set of 26223.04 dollars
    • drops to 24202.10 on the test data,
    • and drops further to 23649.92 dollars on the validation data.

In general, the lower the RMSE, the better the model fit. By this metric, the final price.aic model fits the validation data even better.

Percentage of the 95% predictive confidence intervals containing the true price of the house in the validation data set

pred.int <- predict(price.aic, valid, interval = "prediction",
                            level = 0.95)
coverage <- mean(valid$`log(1+price)` > pred.int[,"lwr"] &
  valid$`log(1+price)` < pred.int[,"upr"])
  • 97.12\(\%\) of the 95\(\%\) prediction intervals contain the true price of the house in the validation data set

Proper reflection of uncertainty

  • the true housing price is captured in 97.12\(\%\) of cases \(=>\) the final model properly reflects uncertainty

5. Conclusion


The final model explains 86.82% of variance in price around its mean and contains 14 predictors: Overall.Qual, Exter.Qual, Bsmt.Qual, Heating.QC, Kitchen.Qual, Garage.Type, Total.Bsmt.SF, Garage.Area, area, age, age.Remod, X1st.Flr.SF, Full.Bath, Fireplaces

Feature engineering, cleaning, and transforming the data turned out to be among the major tasks in choosing the best model.

Measuring improvements via adjusted R-squared, root mean squared error, and visual analysis of Residuals vs Fitted plots is vital to understanding the changes. Managing outliers/ high-leverage points has a great impact on model improvement. Following a stepwise approach (improving and measuring) helps to find a robust model.

In short, to have a good model one needs to obtain good data, explore the data to understand it well, build the model, diagnose it, test it, and validate it.


Appendix

trans

trans <- function(data) {
# Overall.Qual/ .Cond -->> factors, Year.Built/ .Remod.Add -->> age/ age.Remod
       trans <- data %>% mutate(Overall.Qual= factor(Overall.Qual),
                         Overall.Cond= factor(Overall.Cond)) %>%
        mutate(age= 2020-Year.Built,age.Remod= 2020-Year.Remod.Add) %>%
        dplyr::select(-c(Year.Built, Year.Remod.Add))
# "" -->> NA, NA -->> None/ 0
        for (i in  1:ncol(trans)) trans[i][trans[i] == ""] <- NA
        astr0<- trans %>% dplyr::select(where(is.factor))
        astr01<- trans %>% dplyr::select(-c(age, age.Remod, Garage.Yr.Blt,
                                           Mo.Sold)) %>%
                dplyr::select(!(where(is.factor)))
        astr02 <- trans %>% dplyr::select(age, age.Remod, Garage.Yr.Blt,
                                      Mo.Sold)
        for (i in  1:ncol(astr0)) levels(astr0[[i]]) <- c(levels(astr0[[i]]),
                                                        "None")
        astr0[is.na(astr0)] <- "None"
        astr01[is.na(astr01)] <- 0
        trans <- tibble(cbind(astr0, astr01, astr02))
        
        # categorical vars -->> recoded
        trans$Overall.Qual <- fct_collapse(trans$Overall.Qual,
                                   LOW = c("1", "2", "3", "4", "5"),
                                   TA= c("6","7"), HIGH = c("8", "9", "10"))
        trans$Foundation <- fct_collapse(trans$Foundation,
                                 LOW = c("CBlock", "PConc", "Slab"),
                                 HIGH = c("BrkTil", "Stone", "Wood"))
        trans$Heating.QC <- fct_collapse(trans$Heating.QC,
                                 LOW = c("Po", "Fa", "TA"),
                                 HIGH = c("Gd", "Ex"))
        trans$Kitchen.Qual <- fct_collapse(trans$Kitchen.Qual,
                                 LOW = c("Po", "Fa"),
                                 TA=c("TA", "Gd"),
                                 HIGH = c("Ex"))
        trans$Exter.Qual <- fct_collapse(trans$Exter.Qual,
                                 LOW = c("Fa"),
                                 HIGH = c("TA", "Gd", "Ex"))
        trans
}

transfin

transfin <- function(data) {
# Overall.Qual/ .Cond -->> factors, Year.Built/ .Remod.Add -->> age/ age.Remod
       transfin <- data %>% mutate(Overall.Qual= factor(Overall.Qual),
                         Overall.Cond= factor(Overall.Cond)) %>%
        mutate(age= 2020-Year.Built,age.Remod= 2020-Year.Remod.Add) %>%
        dplyr::select(-c(Year.Built, Year.Remod.Add, Foundation))
# "" -->> NA, NA -->> None/ 0
        for (i in  1:ncol(transfin)) transfin[i][transfin[i] == ""] <- NA
        astr0<- transfin %>% dplyr::select(where(is.factor))
        astr01<- transfin %>% dplyr::select(-c(age, age.Remod, Garage.Yr.Blt,
                                           Mo.Sold)) %>%
                dplyr::select(!(where(is.factor)))
        astr02 <- transfin %>% dplyr::select(age, age.Remod, Garage.Yr.Blt,
                                      Mo.Sold)
        for (i in  1:ncol(astr0)) levels(astr0[[i]]) <- c(levels(astr0[[i]]),
                                                        "None")
        astr0[is.na(astr0)] <- "None"
        astr01[is.na(astr01)] <- 0
        transfin <- tibble(cbind(astr0, astr01, astr02))
# log-transform
        transfin<- transfin %>% dplyr::select(all_of(corr.id)) %>%
        mutate("log(1+price)"=log(1+price),"log(1+area)" = log(1+area),
               "log(1+age)"= log(1+age),"log(1+age.Remod)"= log(1+age.Remod),
               "log(1+X1st.Flr.SF)"= log(1+X1st.Flr.SF),
               "log(1+Full.Bath)"= log(1+Full.Bath),
               "log(1+TotRms.AbvGrd)"= log(1+TotRms.AbvGrd),
               "log(1+Fireplaces)"= log(1+Fireplaces),
               "log(1+Garage.Cars)"= log(1+Garage.Cars)) %>%
        dplyr::select(-c(price, area, age, age.Remod, X1st.Flr.SF, Full.Bath,
                         TotRms.AbvGrd, Fireplaces, Garage.Cars)) 
       transfin
}