First, let us load the data and necessary packages:
# set defaults: cache chunks to speed compiling subsequent edits.
knitr::opts_chunk$set(cache=TRUE, echo = TRUE)load("ames_train.Rdata")
library(MASS)
library(BAS)
library(dplyr)
library(tidyr)
library(ggplot2)
library(ggthemes)
library(kableExtra)
library(moments)
library(RColorBrewer)Make a labelled histogram (with 30 bins) of the ages of the houses in the data set, and describe the distribution.
house_ages <- ames_train %>%
mutate(age_at_2010 = 2010 - Year.Built) %>%
select(age_at_2010) %>%
drop_na()
mean_age <- mean(house_ages$age_at_2010)
sd_age <- sd(house_ages$age_at_2010)
plot_age <- house_ages %>%
ggplot(aes(x=age_at_2010)) +
geom_histogram(
alpha = 0.8,
bins = 30,
aes(y = stat(density)),
position="identity",
fill="orange",
col="darkgrey") +
stat_function(
fun = dnorm,
aes(colour = "Normal"),
size = 1.2,
alpha = 0.6,
args = list(mean = mean_age, sd = sd_age)) +
geom_line(
aes(y = ..density.., colour = "Observed"),
stat = 'density',
size = 1.5,
alpha = 0.8) +
theme_minimal() +
theme(plot.title = element_text(hjust = 0.5), legend.position="bottom") +
scale_color_manual(name="Distribution", values=c(Normal="blue", Observed="red")) +
labs(title = "Age of Houses at 2010", y = "Proportion of Values", x="Age (years)")
age_summary <- house_ages %>%
summarise(
count=n(),
min=min(age_at_2010),
q25=quantile(age_at_2010, 0.25),
median=median(age_at_2010),
q75=quantile(age_at_2010,0.75),
max=max(age_at_2010),
mean=round(mean(age_at_2010),2),
sd=round(sd(age_at_2010),2),
skew=round(skewness(age_at_2010),2)) %>%
kbl() %>%
kable_styling(bootstrap_options = c("condensed"))Distribution Summary
| count | min | q25 | median | q75 | max | mean | sd | skew |
|---|---|---|---|---|---|---|---|---|
| 1000 | 0 | 9 | 35 | 55 | 138 | 37.8 | 29.64 | 0.66 |
The mantra in real estate is “Location, Location, Location!” Make a graphical display that relates a home price to its neighborhood in Ames, Iowa. Which summary statistics are most appropriate to use for determining the most expensive, least expensive, and most heterogeneous (having the most variation in housing price) neighborhoods? Report which neighborhoods these are based on the summary statistics of your choice. Report the value of your chosen summary statistics for these neighborhoods.
nei_colours <- colorRampPalette(brewer.pal(4, "PuOr"))(27)
neighbourhood_price_variation <- ames_train %>%
group_by(Neighborhood) %>%
summarise(
Mean = mean(price)/1e5,
Median = median(price)/1e5,
Min = min(price)/1e5,
Max = max(price)/1e5,
SD = sd(price)/1e5,
`H-Spread` = IQR(price)/1e5,
) %>%
mutate(CV = SD/Mean) %>%
arrange(desc(SD)) %>%
mutate(Neighborhood = factor(Neighborhood, unique(Neighborhood))) %>%
arrange(desc(Neighborhood))
nei_plot <- ames_train %>%
select(Neighborhood, price) %>%
mutate(Neighborhood = factor(Neighborhood, neighbourhood_price_variation$Neighborhood)) %>%
ggplot(aes(as.factor(Neighborhood), price/1e5), fill=Neighborhood) +
theme_minimal() +
geom_boxplot(aes(fill=Neighborhood)) +
geom_violin(aes(colour=Neighborhood), width=1.5, alpha=0.3) +
coord_flip() +
theme(
plot.title = element_text(hjust = 0.5),
legend.position="none",
plot.caption = element_text(face="italic")
) +
scale_color_manual(values = nei_colours) +
labs(
title = "Distribution of House Prices by Neighbourhood",
caption = "Neighbourhoods listed in descending order of variability",
x = NULL,
y="Price ($'00,000)"
)
tbl <- neighbourhood_price_variation %>%
pivot_longer(-Neighborhood,names_to = "Measure", values_to = "vals") %>%
mutate(vals = round(vals,2)) %>%
arrange(desc(Neighborhood)) %>%
pivot_wider(names_from = "Neighborhood", values_from = "vals", names_sort = T)
tbl1 <- tbl %>% select(1:15) %>%
kbl() %>%
kable_styling(bootstrap_options = c("condensed"), full_width = F, font_size = 10, position = "left")
tbl2 <- tbl %>% select(c(1,16:28)) %>%
kbl() %>%
kable_styling(bootstrap_options = c("condensed"), full_width = F, font_size = 10, position = "left")Distribution of Neighbourhood Prices in Order of Variability ($’00,000)
| Measure | StoneBr | NridgHt | Timber | Veenker | Crawfor | GrnHill | Somerst | Edwards | CollgCr | SawyerW | ClearCr | NWAmes | Gilbert | Mitchel |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| Mean | 3.39 | 3.34 | 2.65 | 2.34 | 2.04 | 2.80 | 2.35 | 1.36 | 1.97 | 1.83 | 1.93 | 1.94 | 1.93 | 1.65 |
| Median | 3.41 | 3.37 | 2.33 | 2.06 | 2.05 | 2.80 | 2.22 | 1.27 | 1.96 | 1.82 | 1.85 | 1.85 | 1.83 | 1.56 |
| Min | 1.80 | 1.73 | 1.75 | 1.50 | 0.96 | 2.30 | 1.39 | 0.62 | 1.10 | 0.68 | 1.07 | 0.82 | 1.33 | 0.94 |
| Max | 5.92 | 6.15 | 4.25 | 3.85 | 3.92 | 3.30 | 4.68 | 4.15 | 4.75 | 3.20 | 2.77 | 2.78 | 3.78 | 2.85 |
| SD | 1.23 | 1.05 | 0.84 | 0.73 | 0.71 | 0.71 | 0.65 | 0.55 | 0.53 | 0.48 | 0.48 | 0.41 | 0.41 | 0.40 |
| H-Spread | 1.51 | 1.49 | 1.51 | 0.68 | 0.80 | 0.50 | 0.73 | 0.36 | 0.59 | 0.67 | 0.76 | 0.59 | 0.28 | 0.33 |
| CV | 0.36 | 0.31 | 0.32 | 0.31 | 0.35 | 0.25 | 0.28 | 0.40 | 0.27 | 0.26 | 0.25 | 0.21 | 0.21 | 0.24 |
| Measure | BrkSide | OldTown | NoRidge | IDOTRR | Greens | SWISU | NAmes | Blmngtn | Sawyer | MeadowV | BrDale | NPkVill | Blueste |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| Mean | 1.22 | 1.20 | 2.96 | 0.98 | 1.99 | 1.31 | 1.41 | 1.99 | 1.39 | 0.93 | 0.99 | 1.39 | 1.26 |
| Median | 1.24 | 1.20 | 2.90 | 1.00 | 2.13 | 1.34 | 1.40 | 1.91 | 1.36 | 0.86 | 0.99 | 1.42 | 1.24 |
| Min | 0.39 | 0.13 | 2.35 | 0.35 | 1.55 | 0.80 | 0.76 | 1.73 | 0.64 | 0.73 | 0.83 | 1.23 | 1.17 |
| Max | 2.07 | 2.66 | 4.05 | 1.51 | 2.14 | 1.69 | 2.78 | 2.47 | 2.19 | 1.29 | 1.25 | 1.50 | 1.37 |
| SD | 0.37 | 0.36 | 0.36 | 0.32 | 0.29 | 0.27 | 0.27 | 0.26 | 0.21 | 0.19 | 0.13 | 0.12 | 0.10 |
| H-Spread | 0.42 | 0.30 | 0.50 | 0.49 | 0.16 | 0.29 | 0.27 | 0.23 | 0.20 | 0.20 | 0.17 | 0.13 | 0.10 |
| CV | 0.30 | 0.30 | 0.12 | 0.32 | 0.15 | 0.21 | 0.19 | 0.13 | 0.15 | 0.20 | 0.13 | 0.09 | 0.08 |
Summary
Most Expensive:
Least Expensive:
Most Heterogeneous:
Note:
Which variable has the largest number of missing values? Explain why it makes sense that there are so many missing values for this variable.
ames_train %>%
summarise(across(everything(), ~ sum(is.na(.x)))) %>%
pivot_longer(everything(), names_to = "Variable") %>%
rename(NAs = value) %>%
slice_max(order_by = NAs, n=5)| Variable | NAs |
|---|---|
| Pool.QC | 997 |
| Misc.Feature | 971 |
| Alley | 933 |
| Fence | 798 |
| Fireplace.Qu | 491 |
Pool.QC has the highest count of NA’s.
From the codebook:
Pool QC (Ordinal): Pool quality
Ex Excellent
Gd Good
TA Average/Typical
Fa Fair
NA No Pool
Each NA just represents a home without a pool.
We want to predict the natural log of the home prices. Candidate explanatory variables are lot size in square feet (Lot.Area), slope of property (Land.Slope), original construction date (Year.Built), remodel date (Year.Remod.Add), and the number of bedrooms above grade (Bedroom.AbvGr). Pick a model selection or model averaging method covered in the Specialization, and describe how this method works. Then, use this method to find the best multiple regression model for predicting the natural log of the home prices.
df_base <- ames_train %>%
select(
price,
Lot.Area,
Land.Slope,
Year.Built,
Year.Remod.Add,
Bedroom.AbvGr
) %>%
mutate(price = log(price)) %>%
drop_na()
m_ames1 <- bas.lm(price ~ .,
data = df_base,
prior = "ZS-null",
modelprior = uniform()
)
m_ames1##
## Call:
## bas.lm(formula = price ~ ., data = df_base, prior = "ZS-null",
## modelprior = uniform())
##
##
## Marginal Posterior Inclusion Probabilities:
## Intercept Lot.Area Land.SlopeMod Land.SlopeSev Year.Built
## 1.0000 1.0000 0.7201 0.8466 1.0000
## Year.Remod.Add Bedroom.AbvGr
## 1.0000 1.0000
ames.BPM1 = predict(m_ames1, estimator = "BPM", se.fit = TRUE)
ames.BPM1$best.vars## [1] "Intercept" "Lot.Area" "Land.SlopeMod" "Land.SlopeSev"
## [5] "Year.Built" "Year.Remod.Add" "Bedroom.AbvGr"
BPM1 = as.vector(
which.matrix(
m_ames1$which[ames.BPM1$best],
m_ames1$n.vars)
)
ames.BPM1 = bas.lm(price ~ ., data = df_base,
prior = "ZS-null",
modelprior = uniform(),
bestmodel = BPM1, n.models = 1)
ames.BPM1.coef <- coef(ames.BPM1)
ames.BPM1.coef##
## Marginal Posterior Summaries of Coefficients:
##
## Using BMA
##
## Based on the top 1 models
## post mean post SD post p(B != 0)
## Intercept 1.202e+01 8.824e-03 1.000e+00
## Lot.Area 1.025e-05 1.104e-06 1.000e+00
## Land.SlopeMod 1.380e-01 4.984e-02 1.000e+00
## Land.SlopeSev -4.553e-01 1.512e-01 1.000e+00
## Year.Built 6.031e-03 3.782e-04 1.000e+00
## Year.Remod.Add 6.757e-03 5.460e-04 1.000e+00
## Bedroom.AbvGr 8.660e-02 1.076e-02 1.000e+00
par(mfrow = c(2, 3))
plot(ames.BPM1.coef, subset = c(2:7))Initial Linear Model
The bas.lm() function from the BAS package
is used:
prior = "ZS-null") for the
prior distributions of the coefficients in this regression.modelprior = uniform().Model Selection - Best Predictive Model
Since the purpose of the model is to predict future audience movie scores, the approach used is Best Predictive Model (BPM). This is the option whose predictions are closest to those given by the Bayesian model averaging model.
estimator = "BPM" is not yet available in
coef(), so we need to extract a vector of zeros and ones
representing which variables are included in the BPM model.bas.lm using the
optional argument bestmodel = BPM.n.models = 1. In this way, bas.lm starts with
the BPM and fits only that model.coef function to obtain the summaries.Which home has the largest squared residual in the previous analysis (Question 4)? Looking at all the variables in the data set, can you explain why this home stands out from the rest (what factors contribute to the high squared residual and why are those factors relevant)?
par(mfrow = c(1, 1))
plot(
m_ames1,
which = 1,
add.smooth = F,
ask = F,
pch = 16,
sub.caption="",
caption="")n <- nrow(df_base)
ames_k <- qnorm(0.5 + 0.5 * 0.95 ^ (1 / n))
ames.lm <- lm(price ~ ., data = df_base)
outliers <- Bayes.outlier(ames.lm, k = ames_k)
max_outlier <- ames_train[which.max(outliers$prob.outlier),]
max_outlier %>%
select(PID, price, Year.Built, Year.Remod.Add, area, Overall.Cond, Exterior.1st,
Exter.Cond, Bsmt.Cond, BsmtFin.Type.1, Central.Air, Garage.Cond, Sale.Condition) %>%
kbl() %>% kable_styling(bootstrap_options = c("condensed"), font_size = 10)| PID | price | Year.Built | Year.Remod.Add | area | Overall.Cond | Exterior.1st | Exter.Cond | Bsmt.Cond | BsmtFin.Type.1 | Central.Air | Garage.Cond | Sale.Condition |
|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 902207130 | 12789 | 1923 | 1970 | 832 | 2 | AsbShng | Fa | Fa | Unf | N | Fa | Abnorml |
The property with the highest \(R^2\) value is observation 428 (Property ID 902207130).
Sale Price was $12,789, well below the median $120,000 for that neighbourhood (Old Town).
A number of factors might point to a cause (likely a combination of some or all of these):
Use the same model selection method you chose in Question 4 to again find the best multiple regression model to predict the natural log of home prices, but this time replacing Lot.Area with log(Lot.Area). Do you arrive at a model including the same set of predictors?
df_base <- ames_train %>%
select(
price,
Lot.Area,
Land.Slope,
Year.Built,
Year.Remod.Add,
Bedroom.AbvGr
) %>%
mutate(price = log(price), Lot.Area = log(Lot.Area)) %>%
drop_na()
m_ames2 <- bas.lm(price ~ .,
data = df_base,
prior = "ZS-null",
modelprior = uniform()
)
m_ames2##
## Call:
## bas.lm(formula = price ~ ., data = df_base, prior = "ZS-null",
## modelprior = uniform())
##
##
## Marginal Posterior Inclusion Probabilities:
## Intercept Lot.Area Land.SlopeMod Land.SlopeSev Year.Built
## 1.00000 1.00000 0.47366 0.05238 1.00000
## Year.Remod.Add Bedroom.AbvGr
## 1.00000 0.99999
ames.BPM2 = predict(m_ames2, estimator = "BPM", se.fit = TRUE)
ames.BPM2$best.vars## [1] "Intercept" "Lot.Area" "Year.Built" "Year.Remod.Add"
## [5] "Bedroom.AbvGr"
BPM2 = as.vector(
which.matrix(
m_ames2$which[ames.BPM2$best],
m_ames2$n.vars)
)
ames.BPM2 = bas.lm(price ~ ., data = df_base,
prior = "ZS-null",
modelprior = uniform(),
bestmodel = BPM2, n.models = 1)
ames.BPM2.coef <- coef(ames.BPM2)
ames.BPM2.coef##
## Marginal Posterior Summaries of Coefficients:
##
## Using BMA
##
## Based on the top 1 models
## post mean post SD post p(B != 0)
## Intercept 1.202e+01 8.396e-03 1.000e+00
## Lot.Area 2.466e-01 1.653e-02 1.000e+00
## Land.SlopeMod 0.000e+00 0.000e+00 0.000e+00
## Land.SlopeSev 0.000e+00 0.000e+00 0.000e+00
## Year.Built 5.952e-03 3.600e-04 1.000e+00
## Year.Remod.Add 6.752e-03 5.192e-04 1.000e+00
## Bedroom.AbvGr 5.714e-02 1.053e-02 1.000e+00
par(mfrow = c(1, 4))
plot(ames.BPM2.coef, subset = c(2, 5:7))The revised model no longer includes Land.Slope. This
was the weakest variable in the previous model.
Do you think it is better to log transform Lot.Area, in terms of assumptions for linear regression? Make graphs of the predicted values of log home price versus the true values of log home price for the regression models selected for Lot.Area and log(Lot.Area). Referencing these two plots, provide a written support that includes a quantitative justification for your answer in the first part of question 7.
prices <- tibble(
Observed = log(ames_train$price),
`Lot Area` = fitted(ames.BPM1),
`Log Lot Area` = fitted(ames.BPM2)
) %>%
pivot_longer(
cols = c(`Lot Area`,`Log Lot Area`),
values_to = "Predicted",
names_to = "Model"
) %>%
filter(Observed != min(Observed)) %>%
mutate(Errors = Predicted - Observed)
plot_fitted <- prices %>%
ggplot(aes(x = Observed, y = Predicted, colour = Model)) +
geom_point(alpha = 0.4) +
geom_smooth(se = F, method = "lm", size = 1.2) +
theme_minimal() +
scale_color_manual(values = c("#80B1D3", "#B3DE69")) +
geom_abline(intercept = 0, slope = 1, size = 1, colour = "#FB8072") +
theme(plot.title = element_text(hjust = 0.5), plot.caption = element_text(face="italic")) +
labs(
title = "Observed vs Predicted House Prices for Lot Area & Log of Lot Area Models",
caption = "Red line represents actual values. Outlier 428 removed."
)
plot_errors <- prices %>%
ggplot(aes(x = Observed, y = Errors, colour = Model)) +
geom_point(alpha = 0.4) +
geom_smooth(se = F, size = 1.2) +
theme_minimal() +
scale_color_manual(values = c("#80B1D3", "#B3DE69")) +
geom_hline(yintercept = 0) +
theme(plot.title = element_text(hjust = 0.5), plot.caption = element_text(face="italic")) +
labs(
title = "Errors of Predicted Values for Lot Area & Log of Lot Area Models",
caption = "Outlier 428 removed."
)
tbl_errors <- prices %>%
group_by(Model) %>%
mutate(Errors = abs(Errors)) %>%
summarise(
mean=round(mean(Errors),2),
median=median(Errors),
sd=round(sd(Errors),2),
max=max(Errors),
q25=quantile(Errors, 0.25),
q75=quantile(Errors,0.75),
skew=round(skewness(Errors),2)) %>%
kbl() %>%
kable_styling(
bootstrap_options = c("condensed"),
full_width = F,
position = "left"
)| Model | mean | median | sd | max | q25 | q75 | skew |
|---|---|---|---|---|---|---|---|
| Log Lot Area | 0.20 | 0.1560065 | 0.16 | 1.076523 | 0.0717882 | 0.2705997 | 1.55 |
| Lot Area | 0.21 | 0.1647967 | 0.17 | 1.024407 | 0.0774881 | 0.2832801 | 1.44 |
log(Lot.Area) edging out Lot.Area in slope
coefficient, and having a lower median and standard deviation of
errors.log(Lot.Area)
is the better performing of the two models.log(Lot.Area) also produced the largest
error and the distribution of errors is more heavily right-skewed.A rough approximation is to look at the far end of the tail on the errors. A distance of 1.5 from the median of ~12 gives an error of around 1.0. Square of the distance is 2.25 so the correction factor would be 1/2.25:
prices_median <- median(prices$Observed)
prices %>%
mutate(
correction =
(Predicted +
((1 - (!(Observed>prices_median))*2) *
(1/2.25)*(Observed - prices_median)^2)) - Observed
) %>%
ggplot(aes(x = Observed, y = correction, colour = Model)) +
geom_point(alpha = 0.4) +
geom_smooth(se = F, size = 1.2) +
theme_minimal() +
scale_color_manual(values = c("#80B1D3", "#B3DE69")) +
geom_hline(yintercept = 0) +
theme(plot.title = element_text(hjust = 0.5), plot.caption = element_text(face="italic")) +
labs(
title = "Errors of Corrected Predicted Values for Lot Area & Log of Lot Area Models",
caption = "Outlier 428 removed.",
) +
ylab("Errors for Corrected Predicted Values")Already a much better performance on the tails.