First, let us load the data and necessary packages:

# set defaults: cache chunks to speed compiling subsequent edits.
knitr::opts_chunk$set(cache=TRUE, echo = TRUE)
load("ames_train.Rdata")
library(MASS)
library(BAS)
library(dplyr)
library(tidyr)
library(ggplot2)
library(ggthemes)
library(kableExtra)
library(moments)
library(RColorBrewer)

1

Make a labelled histogram (with 30 bins) of the ages of the houses in the data set, and describe the distribution.

house_ages <- ames_train %>% 
    mutate(age_at_2010 = 2010 - Year.Built) %>%
    select(age_at_2010) %>%
    drop_na()

mean_age <- mean(house_ages$age_at_2010)
sd_age <- sd(house_ages$age_at_2010)
plot_age <- house_ages %>% 
    ggplot(aes(x=age_at_2010)) +
    geom_histogram(
        alpha = 0.8, 
        bins = 30, 
        aes(y = stat(density)),
        position="identity", 
        fill="orange", 
        col="darkgrey") +
    stat_function(
        fun = dnorm, 
        aes(colour = "Normal"), 
        size = 1.2, 
        alpha = 0.6,
        args = list(mean = mean_age, sd = sd_age)) +
    geom_line(
        aes(y = ..density.., colour = "Observed"), 
        stat = 'density', 
        size = 1.5, 
        alpha = 0.8) + 
    theme_minimal() +
    theme(plot.title = element_text(hjust = 0.5), legend.position="bottom") +
    scale_color_manual(name="Distribution", values=c(Normal="blue", Observed="red")) +
    labs(title = "Age of Houses at 2010", y = "Proportion of Values", x="Age (years)")

age_summary <- house_ages %>% 
    summarise(
        count=n(),
        min=min(age_at_2010),
        q25=quantile(age_at_2010, 0.25),
        median=median(age_at_2010), 
        q75=quantile(age_at_2010,0.75),
        max=max(age_at_2010),
        mean=round(mean(age_at_2010),2),
        sd=round(sd(age_at_2010),2),
        skew=round(skewness(age_at_2010),2)) %>%
    kbl() %>% 
    kable_styling(bootstrap_options = c("condensed"))

Distribution Summary

count min q25 median q75 max mean sd skew
1000 0 9 35 55 138 37.8 29.64 0.66
  • There are 1000 observations with a range of 0 to 138 years.
  • Distribution is multi-modal (2 prominent peaks with a smaller third).
  • The first peak will be related to a higher proportion of new builds.
  • Highest peak of values is around 5-9 years.
  • The H‑spread is 9 to 55 years old (inclusive).
  • There is a high degree of variance shown by a standard deviation of 29.64.
  • Despite a heavy right skew appearance, there is only a moderate skew of 0.66, possibly because of the large mass of observations in the central peak which is wider than the first and pulls both the mean and median to the right again.
  • There will be an element of right-skew due to the fact that homes cannot be younger than 0 years.

2

The mantra in real estate is “Location, Location, Location!” Make a graphical display that relates a home price to its neighborhood in Ames, Iowa. Which summary statistics are most appropriate to use for determining the most expensive, least expensive, and most heterogeneous (having the most variation in housing price) neighborhoods? Report which neighborhoods these are based on the summary statistics of your choice. Report the value of your chosen summary statistics for these neighborhoods.

nei_colours <- colorRampPalette(brewer.pal(4, "PuOr"))(27)

neighbourhood_price_variation <- ames_train %>% 
    group_by(Neighborhood) %>% 
    summarise(
        Mean = mean(price)/1e5, 
        Median = median(price)/1e5, 
        Min = min(price)/1e5, 
        Max = max(price)/1e5, 
        SD = sd(price)/1e5, 
        `H-Spread` = IQR(price)/1e5,
        ) %>%
    mutate(CV = SD/Mean) %>%
    arrange(desc(SD)) %>%
    mutate(Neighborhood = factor(Neighborhood, unique(Neighborhood))) %>%
    arrange(desc(Neighborhood))

nei_plot <- ames_train %>% 
    select(Neighborhood, price) %>%
    mutate(Neighborhood = factor(Neighborhood, neighbourhood_price_variation$Neighborhood)) %>% 
    ggplot(aes(as.factor(Neighborhood), price/1e5), fill=Neighborhood) +
    theme_minimal() +
    geom_boxplot(aes(fill=Neighborhood)) +
    geom_violin(aes(colour=Neighborhood), width=1.5, alpha=0.3) + 
    coord_flip() +
    theme(
        plot.title = element_text(hjust = 0.5), 
        legend.position="none", 
        plot.caption = element_text(face="italic")
        ) +
    scale_color_manual(values = nei_colours) +
    labs(
        title = "Distribution of House Prices by Neighbourhood", 
        caption = "Neighbourhoods listed in descending order of variability",
        x = NULL, 
        y="Price ($'00,000)"
        )

tbl <- neighbourhood_price_variation %>%
    pivot_longer(-Neighborhood,names_to = "Measure", values_to = "vals") %>%
    mutate(vals = round(vals,2)) %>%
    arrange(desc(Neighborhood)) %>%
    pivot_wider(names_from = "Neighborhood", values_from = "vals", names_sort = T)

tbl1 <- tbl %>% select(1:15) %>%
    kbl() %>% 
    kable_styling(bootstrap_options = c("condensed"), full_width = F, font_size = 10, position = "left")

tbl2 <- tbl %>% select(c(1,16:28)) %>%
    kbl() %>% 
    kable_styling(bootstrap_options = c("condensed"), full_width = F, font_size = 10, position = "left")

Distribution of Neighbourhood Prices in Order of Variability ($’00,000)

Measure StoneBr NridgHt Timber Veenker Crawfor GrnHill Somerst Edwards CollgCr SawyerW ClearCr NWAmes Gilbert Mitchel
Mean 3.39 3.34 2.65 2.34 2.04 2.80 2.35 1.36 1.97 1.83 1.93 1.94 1.93 1.65
Median 3.41 3.37 2.33 2.06 2.05 2.80 2.22 1.27 1.96 1.82 1.85 1.85 1.83 1.56
Min 1.80 1.73 1.75 1.50 0.96 2.30 1.39 0.62 1.10 0.68 1.07 0.82 1.33 0.94
Max 5.92 6.15 4.25 3.85 3.92 3.30 4.68 4.15 4.75 3.20 2.77 2.78 3.78 2.85
SD 1.23 1.05 0.84 0.73 0.71 0.71 0.65 0.55 0.53 0.48 0.48 0.41 0.41 0.40
H-Spread 1.51 1.49 1.51 0.68 0.80 0.50 0.73 0.36 0.59 0.67 0.76 0.59 0.28 0.33
CV 0.36 0.31 0.32 0.31 0.35 0.25 0.28 0.40 0.27 0.26 0.25 0.21 0.21 0.24
Measure BrkSide OldTown NoRidge IDOTRR Greens SWISU NAmes Blmngtn Sawyer MeadowV BrDale NPkVill Blueste
Mean 1.22 1.20 2.96 0.98 1.99 1.31 1.41 1.99 1.39 0.93 0.99 1.39 1.26
Median 1.24 1.20 2.90 1.00 2.13 1.34 1.40 1.91 1.36 0.86 0.99 1.42 1.24
Min 0.39 0.13 2.35 0.35 1.55 0.80 0.76 1.73 0.64 0.73 0.83 1.23 1.17
Max 2.07 2.66 4.05 1.51 2.14 1.69 2.78 2.47 2.19 1.29 1.25 1.50 1.37
SD 0.37 0.36 0.36 0.32 0.29 0.27 0.27 0.26 0.21 0.19 0.13 0.12 0.10
H-Spread 0.42 0.30 0.50 0.49 0.16 0.29 0.27 0.23 0.20 0.20 0.17 0.13 0.10
CV 0.30 0.30 0.12 0.32 0.15 0.21 0.19 0.13 0.15 0.20 0.13 0.09 0.08

Summary

Most Expensive:

Least Expensive:

Most Heterogeneous:

Note:


3

Which variable has the largest number of missing values? Explain why it makes sense that there are so many missing values for this variable.

ames_train %>% 
    summarise(across(everything(), ~ sum(is.na(.x)))) %>%
    pivot_longer(everything(), names_to = "Variable") %>%
    rename(NAs = value) %>%
    slice_max(order_by = NAs, n=5)
Variable NAs
Pool.QC 997
Misc.Feature 971
Alley 933
Fence 798
Fireplace.Qu 491

Pool.QC has the highest count of NA’s.

From the codebook:

Pool QC (Ordinal): Pool quality
        
       Ex   Excellent
       Gd   Good
       TA   Average/Typical
       Fa   Fair
       NA   No Pool

Each NA just represents a home without a pool.


4

We want to predict the natural log of the home prices. Candidate explanatory variables are lot size in square feet (Lot.Area), slope of property (Land.Slope), original construction date (Year.Built), remodel date (Year.Remod.Add), and the number of bedrooms above grade (Bedroom.AbvGr). Pick a model selection or model averaging method covered in the Specialization, and describe how this method works. Then, use this method to find the best multiple regression model for predicting the natural log of the home prices.

df_base <- ames_train %>%
    select(
        price,
        Lot.Area,
        Land.Slope,
        Year.Built,
        Year.Remod.Add,
        Bedroom.AbvGr
        ) %>%
    mutate(price = log(price)) %>%
    drop_na()

m_ames1 <- bas.lm(price ~ ., 
                   data = df_base,
                   prior = "ZS-null", 
                   modelprior = uniform()
)
m_ames1
## 
## Call:
## bas.lm(formula = price ~ ., data = df_base, prior = "ZS-null", 
##     modelprior = uniform())
## 
## 
##  Marginal Posterior Inclusion Probabilities: 
##      Intercept        Lot.Area   Land.SlopeMod   Land.SlopeSev      Year.Built  
##         1.0000          1.0000          0.7201          0.8466          1.0000  
## Year.Remod.Add   Bedroom.AbvGr  
##         1.0000          1.0000
ames.BPM1 = predict(m_ames1, estimator = "BPM", se.fit = TRUE)
ames.BPM1$best.vars
## [1] "Intercept"      "Lot.Area"       "Land.SlopeMod"  "Land.SlopeSev" 
## [5] "Year.Built"     "Year.Remod.Add" "Bedroom.AbvGr"
BPM1 = as.vector(
    which.matrix(
        m_ames1$which[ames.BPM1$best],
        m_ames1$n.vars)
    )
ames.BPM1 = bas.lm(price ~ ., data = df_base,
                    prior = "ZS-null",
                    modelprior = uniform(),
                    bestmodel = BPM1, n.models = 1)
ames.BPM1.coef <- coef(ames.BPM1)
ames.BPM1.coef
## 
##  Marginal Posterior Summaries of Coefficients: 
## 
##  Using  BMA 
## 
##  Based on the top  1 models 
##                 post mean   post SD     post p(B != 0)
## Intercept        1.202e+01   8.824e-03   1.000e+00    
## Lot.Area         1.025e-05   1.104e-06   1.000e+00    
## Land.SlopeMod    1.380e-01   4.984e-02   1.000e+00    
## Land.SlopeSev   -4.553e-01   1.512e-01   1.000e+00    
## Year.Built       6.031e-03   3.782e-04   1.000e+00    
## Year.Remod.Add   6.757e-03   5.460e-04   1.000e+00    
## Bedroom.AbvGr    8.660e-02   1.076e-02   1.000e+00
par(mfrow = c(2, 3))
plot(ames.BPM1.coef, subset = c(2:7))


Initial Linear Model

The bas.lm() function from the BAS package is used:

Model Selection - Best Predictive Model

Since the purpose of the model is to predict future audience movie scores, the approach used is Best Predictive Model (BPM). This is the option whose predictions are closest to those given by the Bayesian model averaging model.


5

Which home has the largest squared residual in the previous analysis (Question 4)? Looking at all the variables in the data set, can you explain why this home stands out from the rest (what factors contribute to the high squared residual and why are those factors relevant)?

par(mfrow = c(1, 1))
plot(
    m_ames1, 
    which = 1, 
    add.smooth = F, 
    ask = F, 
    pch = 16, 
    sub.caption="", 
    caption="")

n <- nrow(df_base)
ames_k <- qnorm(0.5 + 0.5 * 0.95 ^ (1 / n))
ames.lm <- lm(price ~ ., data = df_base)
outliers <- Bayes.outlier(ames.lm, k = ames_k)
max_outlier <- ames_train[which.max(outliers$prob.outlier),]
max_outlier %>%
    select(PID, price, Year.Built, Year.Remod.Add, area, Overall.Cond, Exterior.1st, 
           Exter.Cond, Bsmt.Cond, BsmtFin.Type.1, Central.Air, Garage.Cond, Sale.Condition) %>%
    kbl() %>% kable_styling(bootstrap_options = c("condensed"), font_size = 10)
PID price Year.Built Year.Remod.Add area Overall.Cond Exterior.1st Exter.Cond Bsmt.Cond BsmtFin.Type.1 Central.Air Garage.Cond Sale.Condition
902207130 12789 1923 1970 832 2 AsbShng Fa Fa Unf N Fa Abnorml

The property with the highest \(R^2\) value is observation 428 (Property ID 902207130).

Sale Price was $12,789, well below the median $120,000 for that neighbourhood (Old Town).

A number of factors might point to a cause (likely a combination of some or all of these):


6

Use the same model selection method you chose in Question 4 to again find the best multiple regression model to predict the natural log of home prices, but this time replacing Lot.Area with log(Lot.Area). Do you arrive at a model including the same set of predictors?

df_base <- ames_train %>%
    select(
        price,
        Lot.Area,
        Land.Slope,
        Year.Built,
        Year.Remod.Add,
        Bedroom.AbvGr
        ) %>%
    mutate(price = log(price), Lot.Area = log(Lot.Area)) %>%
    drop_na()

m_ames2 <- bas.lm(price ~ ., 
                   data = df_base,
                   prior = "ZS-null", 
                   modelprior = uniform()
)
m_ames2
## 
## Call:
## bas.lm(formula = price ~ ., data = df_base, prior = "ZS-null", 
##     modelprior = uniform())
## 
## 
##  Marginal Posterior Inclusion Probabilities: 
##      Intercept        Lot.Area   Land.SlopeMod   Land.SlopeSev      Year.Built  
##        1.00000         1.00000         0.47366         0.05238         1.00000  
## Year.Remod.Add   Bedroom.AbvGr  
##        1.00000         0.99999
ames.BPM2 = predict(m_ames2, estimator = "BPM", se.fit = TRUE)
ames.BPM2$best.vars
## [1] "Intercept"      "Lot.Area"       "Year.Built"     "Year.Remod.Add"
## [5] "Bedroom.AbvGr"
BPM2 = as.vector(
    which.matrix(
        m_ames2$which[ames.BPM2$best],
        m_ames2$n.vars)
    )
ames.BPM2 = bas.lm(price ~ ., data = df_base,
                    prior = "ZS-null",
                    modelprior = uniform(),
                    bestmodel = BPM2, n.models = 1)
ames.BPM2.coef <- coef(ames.BPM2)
ames.BPM2.coef
## 
##  Marginal Posterior Summaries of Coefficients: 
## 
##  Using  BMA 
## 
##  Based on the top  1 models 
##                 post mean  post SD    post p(B != 0)
## Intercept       1.202e+01  8.396e-03  1.000e+00     
## Lot.Area        2.466e-01  1.653e-02  1.000e+00     
## Land.SlopeMod   0.000e+00  0.000e+00  0.000e+00     
## Land.SlopeSev   0.000e+00  0.000e+00  0.000e+00     
## Year.Built      5.952e-03  3.600e-04  1.000e+00     
## Year.Remod.Add  6.752e-03  5.192e-04  1.000e+00     
## Bedroom.AbvGr   5.714e-02  1.053e-02  1.000e+00
par(mfrow = c(1, 4))
plot(ames.BPM2.coef, subset = c(2, 5:7))


The revised model no longer includes Land.Slope. This was the weakest variable in the previous model.


7

Do you think it is better to log transform Lot.Area, in terms of assumptions for linear regression? Make graphs of the predicted values of log home price versus the true values of log home price for the regression models selected for Lot.Area and log(Lot.Area). Referencing these two plots, provide a written support that includes a quantitative justification for your answer in the first part of question 7.

prices <- tibble(
        Observed = log(ames_train$price), 
        `Lot Area` = fitted(ames.BPM1), 
        `Log Lot Area` = fitted(ames.BPM2)
    ) %>% 
    pivot_longer(
        cols = c(`Lot Area`,`Log Lot Area`), 
        values_to = "Predicted", 
        names_to = "Model"
    ) %>%
    filter(Observed != min(Observed)) %>%
    mutate(Errors = Predicted - Observed) 

plot_fitted <- prices %>%
    ggplot(aes(x = Observed, y = Predicted, colour = Model)) +
    geom_point(alpha = 0.4) +
    geom_smooth(se = F, method = "lm", size = 1.2) +
    theme_minimal() +
    scale_color_manual(values = c("#80B1D3", "#B3DE69")) +
    geom_abline(intercept = 0, slope = 1, size = 1, colour = "#FB8072") +
    theme(plot.title = element_text(hjust = 0.5), plot.caption = element_text(face="italic")) +
    labs(
        title = "Observed vs Predicted House Prices for Lot Area & Log of Lot Area Models", 
        caption = "Red line represents actual values. Outlier 428 removed."
    )

plot_errors <- prices %>%
    ggplot(aes(x = Observed, y = Errors, colour = Model)) +
    geom_point(alpha = 0.4) +
    geom_smooth(se = F, size = 1.2) +
    theme_minimal() +
    scale_color_manual(values = c("#80B1D3", "#B3DE69")) +
    geom_hline(yintercept = 0) +
    theme(plot.title = element_text(hjust = 0.5), plot.caption = element_text(face="italic")) +
    labs(
        title = "Errors of Predicted Values for Lot Area & Log of Lot Area Models", 
        caption = "Outlier 428 removed."
    )

tbl_errors <- prices %>% 
    group_by(Model) %>%
    mutate(Errors = abs(Errors)) %>%
    summarise(
        mean=round(mean(Errors),2),
        median=median(Errors), 
        sd=round(sd(Errors),2),
        max=max(Errors),
        q25=quantile(Errors, 0.25),
        q75=quantile(Errors,0.75),
        skew=round(skewness(Errors),2)) %>%
    kbl() %>% 
    kable_styling(
        bootstrap_options = c("condensed"),
        full_width = F,
        position = "left"
        )


Model mean median sd max q25 q75 skew
Log Lot Area 0.20 0.1560065 0.16 1.076523 0.0717882 0.2705997 1.55
Lot Area 0.21 0.1647967 0.17 1.024407 0.0774881 0.2832801 1.44

A rough approximation is to look at the far end of the tail on the errors. A distance of 1.5 from the median of ~12 gives an error of around 1.0. Square of the distance is 2.25 so the correction factor would be 1/2.25:

prices_median <- median(prices$Observed)
prices %>% 
    mutate(
        correction = 
            (Predicted + 
                 ((1 - (!(Observed>prices_median))*2) *
                      (1/2.25)*(Observed - prices_median)^2)) - Observed
        )  %>%
    ggplot(aes(x = Observed, y = correction, colour = Model)) +
    geom_point(alpha = 0.4) +
    geom_smooth(se = F, size = 1.2) +
    theme_minimal() +
    scale_color_manual(values = c("#80B1D3", "#B3DE69")) +
    geom_hline(yintercept = 0) +
    theme(plot.title = element_text(hjust = 0.5), plot.caption = element_text(face="italic")) +
    labs(
        title = "Errors of Corrected Predicted Values for Lot Area & Log of Lot Area Models", 
        caption = "Outlier 428 removed.",
    ) +
    ylab("Errors for Corrected Predicted Values")

Already a much better performance on the tails.