First, let us load the data and necessary packages:
# Set defaults: cache chunks to speed up compiling subsequent edits.
knitr::opts_chunk$set(cache = TRUE, echo = TRUE)
load("ames_train.Rdata")
library(MASS)
library(BAS)
library(dplyr)
library(tidyr)
library(ggplot2)
library(ggthemes)
library(kableExtra)
library(moments)
library(RColorBrewer)
Make a labelled histogram (with 30 bins) of the ages of the houses in the data set, and describe the distribution.
house_ages <- ames_train %>%
  mutate(age_at_2010 = 2010 - Year.Built) %>%
  select(age_at_2010) %>%
  drop_na()

mean_age <- mean(house_ages$age_at_2010)
sd_age <- sd(house_ages$age_at_2010)

plot_age <- house_ages %>%
  ggplot(aes(x = age_at_2010)) +
geom_histogram(
alpha = 0.8,
bins = 30,
aes(y = after_stat(density)),
position="identity",
fill="orange",
col="darkgrey") +
stat_function(
fun = dnorm,
aes(colour = "Normal"),
size = 1.2,
alpha = 0.6,
args = list(mean = mean_age, sd = sd_age)) +
geom_line(
aes(y = after_stat(density), colour = "Observed"),
stat = 'density',
size = 1.5,
alpha = 0.8) +
theme_minimal() +
theme(plot.title = element_text(hjust = 0.5), legend.position="bottom") +
scale_color_manual(name="Distribution", values=c(Normal="blue", Observed="red")) +
labs(title = "Age of Houses at 2010", y = "Proportion of Values", x="Age (years)")
age_summary <- house_ages %>%
  summarise(
count=n(),
min=min(age_at_2010),
q25=quantile(age_at_2010, 0.25),
median=median(age_at_2010),
q75=quantile(age_at_2010,0.75),
max=max(age_at_2010),
mean=round(mean(age_at_2010),2),
sd=round(sd(age_at_2010),2),
skew=round(skewness(age_at_2010),2)) %>%
kbl() %>%
kable_styling(bootstrap_options = c("condensed"))
Distribution Summary
count | min | q25 | median | q75 | max | mean | sd | skew |
---|---|---|---|---|---|---|---|---|
1000 | 0 | 9 | 35 | 55 | 138 | 37.8 | 29.64 | 0.66 |

The ages are right-skewed (skewness 0.66; the mean of 37.8 years exceeds the median of 35), ranging from newly built homes (0 years) to 138 years.
The mantra in real estate is “Location, Location, Location!” Make a graphical display that relates a home price to its neighborhood in Ames, Iowa. Which summary statistics are most appropriate to use for determining the most expensive, least expensive, and most heterogeneous (having the most variation in housing price) neighborhoods? Report which neighborhoods these are based on the summary statistics of your choice. Report the value of your chosen summary statistics for these neighborhoods.
nei_colours <- colorRampPalette(brewer.pal(4, "PuOr"))(27)
neighbourhood_price_variation <- ames_train %>%
  group_by(Neighborhood) %>%
  summarise(
    Mean = mean(price)/1e5,
    Median = median(price)/1e5,
    Min = min(price)/1e5,
    Max = max(price)/1e5,
    SD = sd(price)/1e5,
    `H-Spread` = IQR(price)/1e5
  ) %>%
  mutate(CV = SD/Mean) %>%
  arrange(desc(SD)) %>%
  mutate(Neighborhood = factor(Neighborhood, unique(Neighborhood))) %>%
  arrange(desc(Neighborhood))
nei_plot <- ames_train %>%
  select(Neighborhood, price) %>%
  mutate(Neighborhood = factor(Neighborhood, neighbourhood_price_variation$Neighborhood)) %>%
  ggplot(aes(Neighborhood, price/1e5)) +
  theme_minimal() +
  geom_boxplot(aes(fill = Neighborhood)) +
  geom_violin(aes(colour = Neighborhood), width = 1.5, alpha = 0.3) +
  coord_flip() +
  theme(
    plot.title = element_text(hjust = 0.5),
    legend.position = "none",
    plot.caption = element_text(face = "italic")
  ) +
  scale_color_manual(values = nei_colours) +
  labs(
    title = "Distribution of House Prices by Neighbourhood",
    caption = "Neighbourhoods listed in descending order of variability",
    x = NULL,
    y = "Price ($'00,000)"
  )
tbl <- neighbourhood_price_variation %>%
  pivot_longer(-Neighborhood, names_to = "Measure", values_to = "vals") %>%
  mutate(vals = round(vals, 2)) %>%
  arrange(desc(Neighborhood)) %>%
  pivot_wider(names_from = "Neighborhood", values_from = "vals", names_sort = T)

tbl1 <- tbl %>% select(1:15) %>%
  kbl() %>%
  kable_styling(bootstrap_options = c("condensed"), full_width = F, font_size = 10, position = "left")

tbl2 <- tbl %>% select(c(1, 16:28)) %>%
  kbl() %>%
  kable_styling(bootstrap_options = c("condensed"), full_width = F, font_size = 10, position = "left")
Distribution of Neighbourhood Prices in Order of Variability ($’00,000)
Measure | StoneBr | NridgHt | Timber | Veenker | Crawfor | GrnHill | Somerst | Edwards | CollgCr | SawyerW | ClearCr | NWAmes | Gilbert | Mitchel |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
Mean | 3.39 | 3.34 | 2.65 | 2.34 | 2.04 | 2.80 | 2.35 | 1.36 | 1.97 | 1.83 | 1.93 | 1.94 | 1.93 | 1.65 |
Median | 3.41 | 3.37 | 2.33 | 2.06 | 2.05 | 2.80 | 2.22 | 1.27 | 1.96 | 1.82 | 1.85 | 1.85 | 1.83 | 1.56 |
Min | 1.80 | 1.73 | 1.75 | 1.50 | 0.96 | 2.30 | 1.39 | 0.62 | 1.10 | 0.68 | 1.07 | 0.82 | 1.33 | 0.94 |
Max | 5.92 | 6.15 | 4.25 | 3.85 | 3.92 | 3.30 | 4.68 | 4.15 | 4.75 | 3.20 | 2.77 | 2.78 | 3.78 | 2.85 |
SD | 1.23 | 1.05 | 0.84 | 0.73 | 0.71 | 0.71 | 0.65 | 0.55 | 0.53 | 0.48 | 0.48 | 0.41 | 0.41 | 0.40 |
H-Spread | 1.51 | 1.49 | 1.51 | 0.68 | 0.80 | 0.50 | 0.73 | 0.36 | 0.59 | 0.67 | 0.76 | 0.59 | 0.28 | 0.33 |
CV | 0.36 | 0.31 | 0.32 | 0.31 | 0.35 | 0.25 | 0.28 | 0.40 | 0.27 | 0.26 | 0.25 | 0.21 | 0.21 | 0.24 |
Measure | BrkSide | OldTown | NoRidge | IDOTRR | Greens | SWISU | NAmes | Blmngtn | Sawyer | MeadowV | BrDale | NPkVill | Blueste |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|
Mean | 1.22 | 1.20 | 2.96 | 0.98 | 1.99 | 1.31 | 1.41 | 1.99 | 1.39 | 0.93 | 0.99 | 1.39 | 1.26 |
Median | 1.24 | 1.20 | 2.90 | 1.00 | 2.13 | 1.34 | 1.40 | 1.91 | 1.36 | 0.86 | 0.99 | 1.42 | 1.24 |
Min | 0.39 | 0.13 | 2.35 | 0.35 | 1.55 | 0.80 | 0.76 | 1.73 | 0.64 | 0.73 | 0.83 | 1.23 | 1.17 |
Max | 2.07 | 2.66 | 4.05 | 1.51 | 2.14 | 1.69 | 2.78 | 2.47 | 2.19 | 1.29 | 1.25 | 1.50 | 1.37 |
SD | 0.37 | 0.36 | 0.36 | 0.32 | 0.29 | 0.27 | 0.27 | 0.26 | 0.21 | 0.19 | 0.13 | 0.12 | 0.10 |
H-Spread | 0.42 | 0.30 | 0.50 | 0.49 | 0.16 | 0.29 | 0.27 | 0.23 | 0.20 | 0.20 | 0.17 | 0.13 | 0.10 |
CV | 0.30 | 0.30 | 0.12 | 0.32 | 0.15 | 0.21 | 0.19 | 0.13 | 0.15 | 0.20 | 0.13 | 0.09 | 0.08 |
Summary

Most Expensive: StoneBr, which has the highest median price (~$341,000). The median is the most appropriate statistic here, being robust to the extreme sales seen in several neighbourhoods.

Least Expensive: MeadowV, which has the lowest median price (~$86,000).

Most Heterogeneous: StoneBr, which has the largest standard deviation of prices (~$123,000).

Note: by the coefficient of variation, Edwards (CV = 0.40) would instead rank as most heterogeneous; the absolute standard deviation is used here. These picks can also be pulled programmatically from the summary table, as in the sketch below.
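A minimal sketch of that programmatic check, using dplyr's slice_max()/slice_min() on the summary table built above:

# most expensive: highest median price
neighbourhood_price_variation %>% slice_max(Median, n = 1)

# least expensive: lowest median price
neighbourhood_price_variation %>% slice_min(Median, n = 1)

# most heterogeneous: largest standard deviation of price
neighbourhood_price_variation %>% slice_max(SD, n = 1)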
Which variable has the largest number of missing values? Explain why it makes sense that there are so many missing values for this variable.
ames_train %>%
  summarise(across(everything(), ~ sum(is.na(.x)))) %>%
pivot_longer(everything(), names_to = "Variable") %>%
rename(NAs = value) %>%
slice_max(order_by = NAs, n=5)
Variable | NAs |
---|---|
Pool.QC | 997 |
Misc.Feature | 971 |
Alley | 933 |
Fence | 798 |
Fireplace.Qu | 491 |
Pool.QC has the highest count of NAs.
From the codebook:
Pool QC (Ordinal): Pool quality
Ex Excellent
Gd Good
TA Average/Typical
Fa Fair
NA No Pool
Each NA simply represents a home without a pool; with 997 NAs out of 1,000 homes, only 3 homes in the sample actually have one.
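A quick tally confirms this reading (a sketch; the non-NA levels follow the codebook above):

# count homes by pool quality; NA means no pool
ames_train %>% count(Pool.QC)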
We want to predict the natural log of the home prices. Candidate explanatory variables are lot size in square feet (Lot.Area), slope of property (Land.Slope), original construction date (Year.Built), remodel date (Year.Remod.Add), and the number of bedrooms above grade (Bedroom.AbvGr). Pick a model selection or model averaging method covered in the Specialization, and describe how this method works. Then, use this method to find the best multiple regression model for predicting the natural log of the home prices.
df_base <- ames_train %>%
  select(
    price,
    Lot.Area,
    Land.Slope,
    Year.Built,
    Year.Remod.Add,
    Bedroom.AbvGr
  ) %>%
  mutate(price = log(price)) %>%
  drop_na()

m_ames1 <- bas.lm(price ~ .,
  data = df_base,
  prior = "ZS-null",
  modelprior = uniform()
)
m_ames1
##
## Call:
## bas.lm(formula = price ~ ., data = df_base, prior = "ZS-null",
## modelprior = uniform())
##
##
## Marginal Posterior Inclusion Probabilities:
## Intercept Lot.Area Land.SlopeMod Land.SlopeSev Year.Built
## 1.0000 1.0000 0.7201 0.8466 1.0000
## Year.Remod.Add Bedroom.AbvGr
## 1.0000 1.0000
ames.BPM1 = predict(m_ames1, estimator = "BPM", se.fit = TRUE)
ames.BPM1$best.vars
## [1] "Intercept" "Lot.Area" "Land.SlopeMod" "Land.SlopeSev"
## [5] "Year.Built" "Year.Remod.Add" "Bedroom.AbvGr"
BPM1 = as.vector(
  which.matrix(
    m_ames1$which[ames.BPM1$best],
    m_ames1$n.vars)
)
ames.BPM1 = bas.lm(price ~ ., data = df_base,
  prior = "ZS-null",
  modelprior = uniform(),
  bestmodel = BPM1, n.models = 1)
ames.BPM1.coef <- coef(ames.BPM1)
ames.BPM1.coef
##
## Marginal Posterior Summaries of Coefficients:
##
## Using BMA
##
## Based on the top 1 models
## post mean post SD post p(B != 0)
## Intercept 1.202e+01 8.824e-03 1.000e+00
## Lot.Area 1.025e-05 1.104e-06 1.000e+00
## Land.SlopeMod 1.380e-01 4.984e-02 1.000e+00
## Land.SlopeSev -4.553e-01 1.512e-01 1.000e+00
## Year.Built 6.031e-03 3.782e-04 1.000e+00
## Year.Remod.Add 6.757e-03 5.460e-04 1.000e+00
## Bedroom.AbvGr 8.660e-02 1.076e-02 1.000e+00
par(mfrow = c(2, 3))
plot(ames.BPM1.coef, subset = c(2:7))
Initial Linear Model

The bas.lm() function from the BAS package is used with:

- The Zellner-Siow Cauchy prior (prior = "ZS-null") for the prior distributions of the coefficients in this regression.
- A uniform prior probability across all candidate models (modelprior = uniform()).

Model Selection - Best Predictive Model

Since the purpose of the model is to predict home prices, the approach used is the Best Predictive Model (BPM): the model whose predictions are closest to those given by the Bayesian model averaging (BMA) model.
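Formally (following the BAS documentation), the BPM is the model whose fitted values are closest, under squared-error loss, to the fitted values of the BMA ensemble; writing \(\hat{y}_i^{M}\) for the fitted value of observation \(i\) under model \(M\):

\[
M_{\text{BPM}} = \underset{M}{\arg\min} \sum_{i=1}^{n} \left( \hat{y}_i^{M} - \hat{y}_i^{\text{BMA}} \right)^2
\]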
estimator = "BPM"
is not yet available in
coef()
, so we need to extract a vector of zeros and ones
representing which variables are included in the BPM model.bas.lm
using the
optional argument bestmodel = BPM
.n.models = 1
. In this way, bas.lm
starts with
the BPM and fits only that model.coef
function to obtain the summaries.Which home has the largest squared residual in the previous analysis (Question 4)? Looking at all the variables in the data set, can you explain why this home stands out from the rest (what factors contribute to the high squared residual and why are those factors relevant)?
par(mfrow = c(1, 1))
plot(
m_ames1, which = 1,
add.smooth = F,
ask = F,
pch = 16,
sub.caption="",
caption="")
n <- nrow(df_base)
# choose k so that the prior probability of no outliers in n observations is 0.95
ames_k <- qnorm(0.5 + 0.5 * 0.95 ^ (1 / n))
ames.lm <- lm(price ~ ., data = df_base)
outliers <- Bayes.outlier(ames.lm, k = ames_k)
max_outlier <- ames_train[which.max(outliers$prob.outlier), ]
max_outlier %>%
  select(PID, price, Year.Built, Year.Remod.Add, area, Overall.Cond, Exterior.1st,
         Exter.Cond, Bsmt.Cond, BsmtFin.Type.1, Central.Air, Garage.Cond, Sale.Condition) %>%
  kbl() %>%
  kable_styling(bootstrap_options = c("condensed"), font_size = 10)
PID | price | Year.Built | Year.Remod.Add | area | Overall.Cond | Exterior.1st | Exter.Cond | Bsmt.Cond | BsmtFin.Type.1 | Central.Air | Garage.Cond | Sale.Condition |
---|---|---|---|---|---|---|---|---|---|---|---|---|
902207130 | 12789 | 1923 | 1970 | 832 | 2 | AsbShng | Fa | Fa | Unf | N | Fa | Abnorml |
The property with the largest squared residual is observation 428 (Property ID 902207130).

Its sale price was $12,789, well below the median of ~$120,000 for its neighbourhood (Old Town).

A number of factors might point to a cause (likely a combination of some or all of these):

- Sale.Condition is "Abnorml", which in the codebook covers trades, foreclosures, and short sales: transactions whose prices need not reflect market value, and which none of the model's predictors can capture.
- Overall.Cond is 2 on a 10-point scale, and the exterior, basement, and garage conditions are all rated "Fa" (fair).
- The home is old (built in 1923, last remodelled in 1970), small (832 sq ft of living area), clad in asbestos shingles, has an unfinished basement, and lacks central air.
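A small check of the neighbourhood comparison (a sketch against the training data):

# median sale price among Old Town homes in the sample
ames_train %>%
  filter(Neighborhood == "OldTown") %>%
  summarise(n = n(), median_price = median(price))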
Use the same model selection method you chose in Question 4 to again find the best multiple regression model to predict the natural log of home prices, but this time replacing Lot.Area with log(Lot.Area). Do you arrive at a model including the same set of predictors?
df_base <- ames_train %>%
  select(
    price,
    Lot.Area,
    Land.Slope,
    Year.Built,
    Year.Remod.Add,
    Bedroom.AbvGr
  ) %>%
  mutate(price = log(price), Lot.Area = log(Lot.Area)) %>%
  drop_na()

m_ames2 <- bas.lm(price ~ .,
  data = df_base,
  prior = "ZS-null",
  modelprior = uniform()
)
m_ames2
##
## Call:
## bas.lm(formula = price ~ ., data = df_base, prior = "ZS-null",
## modelprior = uniform())
##
##
## Marginal Posterior Inclusion Probabilities:
## Intercept Lot.Area Land.SlopeMod Land.SlopeSev Year.Built
## 1.00000 1.00000 0.47366 0.05238 1.00000
## Year.Remod.Add Bedroom.AbvGr
## 1.00000 0.99999
ames.BPM2 = predict(m_ames2, estimator = "BPM", se.fit = TRUE)
ames.BPM2$best.vars
## [1] "Intercept" "Lot.Area" "Year.Built" "Year.Remod.Add"
## [5] "Bedroom.AbvGr"
BPM2 = as.vector(
  which.matrix(
    m_ames2$which[ames.BPM2$best],
    m_ames2$n.vars)
)
ames.BPM2 = bas.lm(price ~ ., data = df_base,
  prior = "ZS-null",
  modelprior = uniform(),
  bestmodel = BPM2, n.models = 1)
ames.BPM2.coef <- coef(ames.BPM2)
ames.BPM2.coef
##
## Marginal Posterior Summaries of Coefficients:
##
## Using BMA
##
## Based on the top 1 models
## post mean post SD post p(B != 0)
## Intercept 1.202e+01 8.396e-03 1.000e+00
## Lot.Area 2.466e-01 1.653e-02 1.000e+00
## Land.SlopeMod 0.000e+00 0.000e+00 0.000e+00
## Land.SlopeSev 0.000e+00 0.000e+00 0.000e+00
## Year.Built 5.952e-03 3.600e-04 1.000e+00
## Year.Remod.Add 6.752e-03 5.192e-04 1.000e+00
## Bedroom.AbvGr 5.714e-02 1.053e-02 1.000e+00
par(mfrow = c(1, 4))
plot(ames.BPM2.coef, subset = c(2, 5:7))
No. The revised model no longer includes Land.Slope, which was the weakest predictor in the previous model: its marginal posterior inclusion probabilities drop from 0.72 (Land.SlopeMod) and 0.85 (Land.SlopeSev) to 0.47 and 0.05 once log(Lot.Area) is used.
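The drop is easy to see side by side (a sketch, assuming the probne0 and namesx components of a bas.lm fit as documented in BAS):

# compare marginal posterior inclusion probabilities across the two fits
tibble(
  variable = m_ames1$namesx,
  Lot.Area = round(m_ames1$probne0, 3),
  log.Lot.Area = round(m_ames2$probne0, 3)
)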
Do you think it is better to log transform Lot.Area, in terms of assumptions for linear regression? Make graphs of the predicted values of log home price versus the true values of log home price for the regression models selected for Lot.Area and log(Lot.Area). Referencing these two plots, provide a written support that includes a quantitative justification for your answer in the first part of question 7.
prices <- tibble(
  Observed = log(ames_train$price),
  `Lot Area` = fitted(ames.BPM1),
  `Log Lot Area` = fitted(ames.BPM2)
) %>%
  pivot_longer(
    cols = c(`Lot Area`, `Log Lot Area`),
    values_to = "Predicted",
    names_to = "Model"
  ) %>%
  # drop the extreme outlier (observation 428) identified in Question 5
  filter(Observed != min(Observed)) %>%
  mutate(Errors = Predicted - Observed)
plot_fitted <- prices %>%
  ggplot(aes(x = Observed, y = Predicted, colour = Model)) +
geom_point(alpha = 0.4) +
geom_smooth(se = F, method = "lm", size = 1.2) +
theme_minimal() +
scale_color_manual(values = c("#80B1D3", "#B3DE69")) +
geom_abline(intercept = 0, slope = 1, size = 1, colour = "#FB8072") +
theme(plot.title = element_text(hjust = 0.5), plot.caption = element_text(face="italic")) +
labs(
title = "Observed vs Predicted House Prices for Lot Area & Log of Lot Area Models",
caption = "Red line represents actual values. Outlier 428 removed."
)
plot_errors <- prices %>%
  ggplot(aes(x = Observed, y = Errors, colour = Model)) +
geom_point(alpha = 0.4) +
geom_smooth(se = F, size = 1.2) +
theme_minimal() +
scale_color_manual(values = c("#80B1D3", "#B3DE69")) +
geom_hline(yintercept = 0) +
theme(plot.title = element_text(hjust = 0.5), plot.caption = element_text(face="italic")) +
labs(
title = "Errors of Predicted Values for Lot Area & Log of Lot Area Models",
caption = "Outlier 428 removed."
)
tbl_errors <- prices %>%
  group_by(Model) %>%
mutate(Errors = abs(Errors)) %>%
summarise(
mean=round(mean(Errors),2),
median=median(Errors),
sd=round(sd(Errors),2),
max=max(Errors),
q25=quantile(Errors, 0.25),
q75=quantile(Errors,0.75),
skew=round(skewness(Errors),2)) %>%
kbl() %>%
kable_styling(
bootstrap_options = c("condensed"),
full_width = F,
position = "left"
)
Model | mean | median | sd | max | q25 | q75 | skew |
---|---|---|---|---|---|---|---|
Log Lot Area | 0.20 | 0.1560065 | 0.16 | 1.076523 | 0.0717882 | 0.2705997 | 1.55 |
Lot Area | 0.21 | 0.1647967 | 0.17 | 1.024407 | 0.0774881 | 0.2832801 | 1.44 |
- log(Lot.Area) edges out Lot.Area in slope coefficient, and has a lower mean, median, and standard deviation of absolute errors.
- On that basis, log(Lot.Area) is the better performing of the two models.
- However, log(Lot.Area) also produced the single largest error (1.08 vs 1.02), and its distribution of errors is more heavily right-skewed (skewness 1.55 vs 1.44).

A rough approximation is to look at the far end of the tail on the errors: a distance of 1.5 from the median observed log price of ~12 corresponds to an error of around 1.0. The square of that distance is 2.25, so the correction factor would be 1/2.25:
prices_median <- median(prices$Observed)

prices %>%
  mutate(
    # quadratic correction: added above the median, subtracted below it
    correction =
      (Predicted +
        (1 - (!(Observed > prices_median)) * 2) *
        ((1/2.25) * (Observed - prices_median)^2)) - Observed
  ) %>%
  ggplot(aes(x = Observed, y = correction, colour = Model)) +
  geom_point(alpha = 0.4) +
  geom_smooth(se = F, size = 1.2) +
  theme_minimal() +
  scale_color_manual(values = c("#80B1D3", "#B3DE69")) +
  geom_hline(yintercept = 0) +
  theme(plot.title = element_text(hjust = 0.5), plot.caption = element_text(face = "italic")) +
  labs(
    title = "Errors of Corrected Predicted Values for Lot Area & Log of Lot Area Models",
    caption = "Outlier 428 removed."
  ) +
  ylab("Errors for Corrected Predicted Values")
This simple correction already gives much better performance in the tails.
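To quantify the gain, the absolute-error summary from earlier can be recomputed on the corrected predictions; a sketch reusing the same 1/2.25 factor assumed above:

# summarise absolute errors after applying the quadratic correction
prices %>%
  mutate(
    Corrected = Predicted +
      (1 - (!(Observed > prices_median)) * 2) *
      ((1/2.25) * (Observed - prices_median)^2),
    Errors = abs(Corrected - Observed)
  ) %>%
  group_by(Model) %>%
  summarise(
    mean = round(mean(Errors), 2),
    median = round(median(Errors), 3),
    sd = round(sd(Errors), 2),
    max = round(max(Errors), 2)
  )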