First, let us load the data and necessary packages:

# set defaults: cache chunks to speed compiling subsequent edits.
knitr::opts_chunk$set(cache=TRUE, echo = TRUE)

load("ames_train.Rdata")
library(MASS)
library(BAS)
library(dplyr)
library(tidyr)
library(ggplot2)
library(ggthemes)
library(kableExtra)
library(moments)
library(RColorBrewer)

1

Make a labelled histogram (with 30 bins) of the ages of the houses in the data set, and describe the distribution.

house_ages <- ames_train %>% 
    mutate(age_at_2010 = 2010 - Year.Built) %>%
    select(age_at_2010) %>%
    drop_na()

mean_age <- mean(house_ages$age_at_2010)
sd_age <- sd(house_ages$age_at_2010)
plot_age <- house_ages %>% 
    ggplot(aes(x=age_at_2010)) +
    geom_histogram(
        alpha = 0.8, 
        bins = 30, 
        aes(y = stat(density)),
        position="identity", 
        fill="orange", 
        col="darkgrey") +
    stat_function(
        fun = dnorm, 
        aes(colour = "Normal"), 
        size = 1.2, 
        alpha = 0.6,
        args = list(mean = mean_age, sd = sd_age)) +
    geom_line(
        aes(y = ..density.., colour = "Observed"), 
        stat = 'density', 
        size = 1.5, 
        alpha = 0.8) + 
    theme_minimal() +
    theme(plot.title = element_text(hjust = 0.5), legend.position="bottom") +
    scale_color_manual(name="Distribution", values=c(Normal="blue", Observed="red")) +
    labs(title = "Age of Houses at 2010", y = "Proportion of Values", x="Age (years)")

age_summary <- house_ages %>% 
    summarise(
        count=n(),
        min=min(age_at_2010),
        q25=quantile(age_at_2010, 0.25),
        median=median(age_at_2010), 
        q75=quantile(age_at_2010,0.75),
        max=max(age_at_2010),
        mean=round(mean(age_at_2010),2),
        sd=round(sd(age_at_2010),2),
        skew=round(skewness(age_at_2010),2)) %>%
    kbl() %>% 
    kable_styling(bootstrap_options = c("condensed"))

Distribution Summary

count	min	q25	median	q75	max	mean	sd	skew
1000	0	9	35	55	138	37.8	29.64	0.66

There are 1000 observations with a range of 0 to 138 years.
Distribution is multi-modal (2 prominent peaks with a smaller third).
The first peak will be related to a higher proportion of new builds.
Highest peak of values is around 5-9 years.
The H‑spread is 9 to 55 years old (inclusive).
There is a high degree of variance shown by a standard deviation of 29.64.
Despite a heavy right skew appearance, there is only a moderate skew of 0.66, possibly because of the large mass of observations in the central peak which is wider than the first and pulls both the mean and median to the right again.
There will be an element of right-skew due to the fact that homes cannot be younger than 0 years.

2

The mantra in real estate is “Location, Location, Location!” Make a graphical display that relates a home price to its neighborhood in Ames, Iowa. Which summary statistics are most appropriate to use for determining the most expensive, least expensive, and most heterogeneous (having the most variation in housing price) neighborhoods? Report which neighborhoods these are based on the summary statistics of your choice. Report the value of your chosen summary statistics for these neighborhoods.

nei_colours <- colorRampPalette(brewer.pal(4, "PuOr"))(27)

neighbourhood_price_variation <- ames_train %>% 
    group_by(Neighborhood) %>% 
    summarise(
        Mean = mean(price)/1e5, 
        Median = median(price)/1e5, 
        Min = min(price)/1e5, 
        Max = max(price)/1e5, 
        SD = sd(price)/1e5, 
        `H-Spread` = IQR(price)/1e5,
        ) %>%
    mutate(CV = SD/Mean) %>%
    arrange(desc(SD)) %>%
    mutate(Neighborhood = factor(Neighborhood, unique(Neighborhood))) %>%
    arrange(desc(Neighborhood))

nei_plot <- ames_train %>% 
    select(Neighborhood, price) %>%
    mutate(Neighborhood = factor(Neighborhood, neighbourhood_price_variation$Neighborhood)) %>% 
    ggplot(aes(as.factor(Neighborhood), price/1e5), fill=Neighborhood) +
    theme_minimal() +
    geom_boxplot(aes(fill=Neighborhood)) +
    geom_violin(aes(colour=Neighborhood), width=1.5, alpha=0.3) + 
    coord_flip() +
    theme(
        plot.title = element_text(hjust = 0.5), 
        legend.position="none", 
        plot.caption = element_text(face="italic")
        ) +
    scale_color_manual(values = nei_colours) +
    labs(
        title = "Distribution of House Prices by Neighbourhood", 
        caption = "Neighbourhoods listed in descending order of variability",
        x = NULL, 
        y="Price ($'00,000)"
        )

tbl <- neighbourhood_price_variation %>%
    pivot_longer(-Neighborhood,names_to = "Measure", values_to = "vals") %>%
    mutate(vals = round(vals,2)) %>%
    arrange(desc(Neighborhood)) %>%
    pivot_wider(names_from = "Neighborhood", values_from = "vals", names_sort = T)

tbl1 <- tbl %>% select(1:15) %>%
    kbl() %>% 
    kable_styling(bootstrap_options = c("condensed"), full_width = F, font_size = 10, position = "left")

tbl2 <- tbl %>% select(c(1,16:28)) %>%
    kbl() %>% 
    kable_styling(bootstrap_options = c("condensed"), full_width = F, font_size = 10, position = "left")

Distribution of Neighbourhood Prices in Order of Variability ($’00,000)

Measure	StoneBr	NridgHt	Timber	Veenker	Crawfor	GrnHill	Somerst	Edwards	CollgCr	SawyerW	ClearCr	NWAmes	Gilbert	Mitchel
Mean	3.39	3.34	2.65	2.34	2.04	2.80	2.35	1.36	1.97	1.83	1.93	1.94	1.93	1.65
Median	3.41	3.37	2.33	2.06	2.05	2.80	2.22	1.27	1.96	1.82	1.85	1.85	1.83	1.56
Min	1.80	1.73	1.75	1.50	0.96	2.30	1.39	0.62	1.10	0.68	1.07	0.82	1.33	0.94
Max	5.92	6.15	4.25	3.85	3.92	3.30	4.68	4.15	4.75	3.20	2.77	2.78	3.78	2.85
SD	1.23	1.05	0.84	0.73	0.71	0.71	0.65	0.55	0.53	0.48	0.48	0.41	0.41	0.40
H-Spread	1.51	1.49	1.51	0.68	0.80	0.50	0.73	0.36	0.59	0.67	0.76	0.59	0.28	0.33
CV	0.36	0.31	0.32	0.31	0.35	0.25	0.28	0.40	0.27	0.26	0.25	0.21	0.21	0.24

Measure	BrkSide	OldTown	NoRidge	IDOTRR	Greens	SWISU	NAmes	Blmngtn	Sawyer	MeadowV	BrDale	NPkVill	Blueste
Mean	1.22	1.20	2.96	0.98	1.99	1.31	1.41	1.99	1.39	0.93	0.99	1.39	1.26
Median	1.24	1.20	2.90	1.00	2.13	1.34	1.40	1.91	1.36	0.86	0.99	1.42	1.24
Min	0.39	0.13	2.35	0.35	1.55	0.80	0.76	1.73	0.64	0.73	0.83	1.23	1.17
Max	2.07	2.66	4.05	1.51	2.14	1.69	2.78	2.47	2.19	1.29	1.25	1.50	1.37
SD	0.37	0.36	0.36	0.32	0.29	0.27	0.27	0.26	0.21	0.19	0.13	0.12	0.10
H-Spread	0.42	0.30	0.50	0.49	0.16	0.29	0.27	0.23	0.20	0.20	0.17	0.13	0.10
CV	0.30	0.30	0.12	0.32	0.15	0.21	0.19	0.13	0.15	0.20	0.13	0.09	0.08

Summary

Most Expensive:

Stone Brook (Median $341,000)
Northridge Heights (Maximum $615,000)

Least Expensive:

Meadow Village (Median $86,000)
Old Town (Minimum $13,000)

Most Heterogeneous:

Stone Brook (Standard Deviation $123,000, H-Spread $151,000)
Timberland (H-Spread $151,000)
Edwards (Coefficient of Variance 0.4)

Note:

Median has been used for most/least expensive rather than mean as it is more robust to outliers and skew.
Coefficient of variance is calculated as $\sigma / \mu$, a measure of variability in proportion to the mean. It’s useful for comparing the degree of variation from one data series to another, even if the means are drastically different from one another (ref).

3

Which variable has the largest number of missing values? Explain why it makes sense that there are so many missing values for this variable.

ames_train %>% 
    summarise(across(everything(), ~ sum(is.na(.x)))) %>%
    pivot_longer(everything(), names_to = "Variable") %>%
    rename(NAs = value) %>%
    slice_max(order_by = NAs, n=5)

Variable	NAs
Pool.QC	997
Misc.Feature	971
Alley	933
Fence	798
Fireplace.Qu	491

Pool.QC has the highest count of NA’s.

From the codebook:

Pool QC (Ordinal): Pool quality
        
       Ex   Excellent
       Gd   Good
       TA   Average/Typical
       Fa   Fair
       NA   No Pool

Each NA just represents a home without a pool.

4

We want to predict the natural log of the home prices. Candidate explanatory variables are lot size in square feet (Lot.Area), slope of property (Land.Slope), original construction date (Year.Built), remodel date (Year.Remod.Add), and the number of bedrooms above grade (Bedroom.AbvGr). Pick a model selection or model averaging method covered in the Specialization, and describe how this method works. Then, use this method to find the best multiple regression model for predicting the natural log of the home prices.

df_base <- ames_train %>%
    select(
        price,
        Lot.Area,
        Land.Slope,
        Year.Built,
        Year.Remod.Add,
        Bedroom.AbvGr
        ) %>%
    mutate(price = log(price)) %>%
    drop_na()

m_ames1 <- bas.lm(price ~ ., 
                   data = df_base,
                   prior = "ZS-null", 
                   modelprior = uniform()
)
m_ames1

## 
## Call:
## bas.lm(formula = price ~ ., data = df_base, prior = "ZS-null", 
##     modelprior = uniform())
## 
## 
##  Marginal Posterior Inclusion Probabilities: 
##      Intercept        Lot.Area   Land.SlopeMod   Land.SlopeSev      Year.Built  
##         1.0000          1.0000          0.7201          0.8466          1.0000  
## Year.Remod.Add   Bedroom.AbvGr  
##         1.0000          1.0000

ames.BPM1 = predict(m_ames1, estimator = "BPM", se.fit = TRUE)
ames.BPM1$best.vars

## [1] "Intercept"      "Lot.Area"       "Land.SlopeMod"  "Land.SlopeSev" 
## [5] "Year.Built"     "Year.Remod.Add" "Bedroom.AbvGr"

BPM1 = as.vector(
    which.matrix(
        m_ames1$which[ames.BPM1$best],
        m_ames1$n.vars)
    )
ames.BPM1 = bas.lm(price ~ ., data = df_base,
                    prior = "ZS-null",
                    modelprior = uniform(),
                    bestmodel = BPM1, n.models = 1)
ames.BPM1.coef <- coef(ames.BPM1)
ames.BPM1.coef

## 
##  Marginal Posterior Summaries of Coefficients: 
## 
##  Using  BMA 
## 
##  Based on the top  1 models 
##                 post mean   post SD     post p(B != 0)
## Intercept        1.202e+01   8.824e-03   1.000e+00    
## Lot.Area         1.025e-05   1.104e-06   1.000e+00    
## Land.SlopeMod    1.380e-01   4.984e-02   1.000e+00    
## Land.SlopeSev   -4.553e-01   1.512e-01   1.000e+00    
## Year.Built       6.031e-03   3.782e-04   1.000e+00    
## Year.Remod.Add   6.757e-03   5.460e-04   1.000e+00    
## Bedroom.AbvGr    8.660e-02   1.076e-02   1.000e+00

par(mfrow = c(2, 3))
plot(ames.BPM1.coef, subset = c(2:7))

Initial Linear Model

The bas.lm() function from the BAS package is used:

Zellner-Siow cauchy prior (prior = "ZS-null") for the prior distributions of the coefficients in this regression.
It’s assumed all variables have an equal likelihood of being included in the final model using the uniform distribution modelprior = uniform().
Marginal Posterior Inclusion Probabilities are high for all variables.

Model Selection - Best Predictive Model

Since the purpose of the model is to predict future audience movie scores, the approach used is Best Predictive Model (BPM). This is the option whose predictions are closest to those given by the Bayesian model averaging model.

BPM estimation with the `predict` function confirms that all variables are significant influencers for the model.
Using the `se.fit = TRUE` option with `predict` we can calculate standard deviations for the predictions or for the mean. Then we can use this as input for the `confint` function for the prediction object.
The option estimator = "BPM" is not yet available in coef(), so we need to extract a vector of zeros and ones representing which variables are included in the BPM model.
Next, we will refit the model with bas.lm using the optional argument bestmodel = BPM.
We will also specify that want to have 1 model by setting n.models = 1. In this way, bas.lm starts with the BPM and fits only that model.
Since we have only one model in our new object representing the BPM, we can use the coef function to obtain the summaries.

5

Which home has the largest squared residual in the previous analysis (Question 4)? Looking at all the variables in the data set, can you explain why this home stands out from the rest (what factors contribute to the high squared residual and why are those factors relevant)?

par(mfrow = c(1, 1))
plot(
    m_ames1, 
    which = 1, 
    add.smooth = F, 
    ask = F, 
    pch = 16, 
    sub.caption="", 
    caption="")

n <- nrow(df_base)
ames_k <- qnorm(0.5 + 0.5 * 0.95 ^ (1 / n))
ames.lm <- lm(price ~ ., data = df_base)
outliers <- Bayes.outlier(ames.lm, k = ames_k)
max_outlier <- ames_train[which.max(outliers$prob.outlier),]
max_outlier %>%
    select(PID, price, Year.Built, Year.Remod.Add, area, Overall.Cond, Exterior.1st, 
           Exter.Cond, Bsmt.Cond, BsmtFin.Type.1, Central.Air, Garage.Cond, Sale.Condition) %>%
    kbl() %>% kable_styling(bootstrap_options = c("condensed"), font_size = 10)

PID	price	Year.Built	Year.Remod.Add	area	Overall.Cond	Exterior.1st	Exter.Cond	Bsmt.Cond	BsmtFin.Type.1	Central.Air	Garage.Cond	Sale.Condition
902207130	12789	1923	1970	832	2	AsbShng	Fa	Fa	Unf	N	Fa	Abnorml

The property with the highest $R^2$ value is observation 428 (Property ID 902207130).

Sale Price was $12,789, well below the median $120,000 for that neighbourhood (Old Town).

A number of factors might point to a cause (likely a combination of some or all of these):

The house was built in 1923 and not refurbished since 1970 (40 years at the time of the data set). Both of these factors are shown to have strong impact on sale price in the model.
The house has small area, it lies at approximately the 10^th percentile.
Condition overall, and of the basement, garage and exterior, rate the second lowest possible score each.
Exterior cladding is asbestos, a recognised health hazard and very expensive to remove and dispose of.
Basement type is unfinished, i.e. suitable only for storage.
There is no central air conditioning.
Sale condition was abnormal (trade, foreclosure, short sale). It’s possible this is a foreclosure, death estate, or perhaps the building is only good for demolition and sold for land value minus clearance costs.
The last and simplest reason of course is that this could be a key entry error and a digit was missed from the sale price entry (perhaps the actual sale price was $127,890).

6

Use the same model selection method you chose in Question 4 to again find the best multiple regression model to predict the natural log of home prices, but this time replacing Lot.Area with log(Lot.Area). Do you arrive at a model including the same set of predictors?

df_base <- ames_train %>%
    select(
        price,
        Lot.Area,
        Land.Slope,
        Year.Built,
        Year.Remod.Add,
        Bedroom.AbvGr
        ) %>%
    mutate(price = log(price), Lot.Area = log(Lot.Area)) %>%
    drop_na()

m_ames2 <- bas.lm(price ~ ., 
                   data = df_base,
                   prior = "ZS-null", 
                   modelprior = uniform()
)
m_ames2

## 
## Call:
## bas.lm(formula = price ~ ., data = df_base, prior = "ZS-null", 
##     modelprior = uniform())
## 
## 
##  Marginal Posterior Inclusion Probabilities: 
##      Intercept        Lot.Area   Land.SlopeMod   Land.SlopeSev      Year.Built  
##        1.00000         1.00000         0.47366         0.05238         1.00000  
## Year.Remod.Add   Bedroom.AbvGr  
##        1.00000         0.99999

ames.BPM2 = predict(m_ames2, estimator = "BPM", se.fit = TRUE)
ames.BPM2$best.vars

## [1] "Intercept"      "Lot.Area"       "Year.Built"     "Year.Remod.Add"
## [5] "Bedroom.AbvGr"

BPM2 = as.vector(
    which.matrix(
        m_ames2$which[ames.BPM2$best],
        m_ames2$n.vars)
    )
ames.BPM2 = bas.lm(price ~ ., data = df_base,
                    prior = "ZS-null",
                    modelprior = uniform(),
                    bestmodel = BPM2, n.models = 1)
ames.BPM2.coef <- coef(ames.BPM2)
ames.BPM2.coef

## 
##  Marginal Posterior Summaries of Coefficients: 
## 
##  Using  BMA 
## 
##  Based on the top  1 models 
##                 post mean  post SD    post p(B != 0)
## Intercept       1.202e+01  8.396e-03  1.000e+00     
## Lot.Area        2.466e-01  1.653e-02  1.000e+00     
## Land.SlopeMod   0.000e+00  0.000e+00  0.000e+00     
## Land.SlopeSev   0.000e+00  0.000e+00  0.000e+00     
## Year.Built      5.952e-03  3.600e-04  1.000e+00     
## Year.Remod.Add  6.752e-03  5.192e-04  1.000e+00     
## Bedroom.AbvGr   5.714e-02  1.053e-02  1.000e+00

par(mfrow = c(1, 4))
plot(ames.BPM2.coef, subset = c(2, 5:7))

The revised model no longer includes Land.Slope. This was the weakest variable in the previous model.

7

Do you think it is better to log transform Lot.Area, in terms of assumptions for linear regression? Make graphs of the predicted values of log home price versus the true values of log home price for the regression models selected for Lot.Area and log(Lot.Area). Referencing these two plots, provide a written support that includes a quantitative justification for your answer in the first part of question 7.

prices <- tibble(
        Observed = log(ames_train$price), 
        `Lot Area` = fitted(ames.BPM1), 
        `Log Lot Area` = fitted(ames.BPM2)
    ) %>% 
    pivot_longer(
        cols = c(`Lot Area`,`Log Lot Area`), 
        values_to = "Predicted", 
        names_to = "Model"
    ) %>%
    filter(Observed != min(Observed)) %>%
    mutate(Errors = Predicted - Observed) 

plot_fitted <- prices %>%
    ggplot(aes(x = Observed, y = Predicted, colour = Model)) +
    geom_point(alpha = 0.4) +
    geom_smooth(se = F, method = "lm", size = 1.2) +
    theme_minimal() +
    scale_color_manual(values = c("#80B1D3", "#B3DE69")) +
    geom_abline(intercept = 0, slope = 1, size = 1, colour = "#FB8072") +
    theme(plot.title = element_text(hjust = 0.5), plot.caption = element_text(face="italic")) +
    labs(
        title = "Observed vs Predicted House Prices for Lot Area & Log of Lot Area Models", 
        caption = "Red line represents actual values. Outlier 428 removed."
    )

plot_errors <- prices %>%
    ggplot(aes(x = Observed, y = Errors, colour = Model)) +
    geom_point(alpha = 0.4) +
    geom_smooth(se = F, size = 1.2) +
    theme_minimal() +
    scale_color_manual(values = c("#80B1D3", "#B3DE69")) +
    geom_hline(yintercept = 0) +
    theme(plot.title = element_text(hjust = 0.5), plot.caption = element_text(face="italic")) +
    labs(
        title = "Errors of Predicted Values for Lot Area & Log of Lot Area Models", 
        caption = "Outlier 428 removed."
    )

tbl_errors <- prices %>% 
    group_by(Model) %>%
    mutate(Errors = abs(Errors)) %>%
    summarise(
        mean=round(mean(Errors),2),
        median=median(Errors), 
        sd=round(sd(Errors),2),
        max=max(Errors),
        q25=quantile(Errors, 0.25),
        q75=quantile(Errors,0.75),
        skew=round(skewness(Errors),2)) %>%
    kbl() %>% 
    kable_styling(
        bootstrap_options = c("condensed"),
        full_width = F,
        position = "left"
        )

Model	mean	median	sd	max	q25	q75	skew
Log Lot Area	0.20	0.1560065	0.16	1.076523	0.0717882	0.2705997	1.55
Lot Area	0.21	0.1647967	0.17	1.024407	0.0774881	0.2832801	1.44

The two models produce very similar results with log(Lot.Area) edging out Lot.Area in slope coefficient, and having a lower median and standard deviation of errors.
Taking these factors into consideration, log(Lot.Area) is the better performing of the two models.
Note however, log(Lot.Area) also produced the largest error and the distribution of errors is more heavily right-skewed.
Both models show good predictive success for the mid-range where the bulk of the mass is, however both perform poorly in the tails.
The model would need more work to remedy the tails, either by applying a correction where the predicted value is adjusted by some determined coefficient multiplied by the square of the distance between the observation and the median, or other variables need to be considered in the model.

A rough approximation is to look at the far end of the tail on the errors. A distance of 1.5 from the median of ~12 gives an error of around 1.0. Square of the distance is 2.25 so the correction factor would be 1/2.25:

prices_median <- median(prices$Observed)
prices %>% 
    mutate(
        correction = 
            (Predicted + 
                 ((1 - (!(Observed>prices_median))*2) *
                      (1/2.25)*(Observed - prices_median)^2)) - Observed
        )  %>%
    ggplot(aes(x = Observed, y = correction, colour = Model)) +
    geom_point(alpha = 0.4) +
    geom_smooth(se = F, size = 1.2) +
    theme_minimal() +
    scale_color_manual(values = c("#80B1D3", "#B3DE69")) +
    geom_hline(yintercept = 0) +
    theme(plot.title = element_text(hjust = 0.5), plot.caption = element_text(face="italic")) +
    labs(
        title = "Errors of Corrected Predicted Values for Lot Area & Log of Lot Area Models", 
        caption = "Outlier 428 removed.",
    ) +
    ylab("Errors for Corrected Predicted Values")

Already a much better performance on the tails.

Peer Assessment I

1

2

3

4

5

6

7