knitr::opts_chunk$set(
comment = NA,
message = FALSE,
warning = FALSE)
# --- Setup: Libraries ----
# Core data wrangling and plotting
library(tidyverse)
library(ggplot2)
library(sf)
library(sp)
library(rmarkdown)
library(kableExtra)
library(viridis)
library(tidycensus)
library(mapview)
library(lubridate)
library(ggcorrplot)
library(raster)
library(stringr)
library(stargazer)
library(ggpubr)
library(caret)
library(ggspatial)
library(rgeos)
library(spdep)
library(geosphere)
library(ckanr)
library(FNN)
library(grid)
library(gridExtra)
library(ggstance)
library(jtools)
library(broom)
# library(tufte) # excluded for now due to odd errors
library(readr)
options(scipen = 999)
# --- Setup: Aesthetics ----
## Aesthetics
mapTheme <- function(base_size = 12) {
theme(
text = element_text( color = "black"),
plot.title = element_text(size = 14,colour = "black"),
plot.subtitle=element_text(face="italic"),
plot.caption=element_text(hjust=0),
axis.ticks = element_blank(),
panel.background = element_blank(),axis.title = element_blank(),
axis.text = element_blank(),
axis.title.x = element_blank(),
axis.title.y = element_blank(),
panel.grid.minor = element_blank(),
panel.border = element_rect(colour = "black", fill=NA, size=2)
)
}
plotTheme <- function(base_size = 12) {
theme(
text = element_text( color = "black"),
plot.title = element_text(size = 14,colour = "black"),
plot.subtitle = element_text(face="italic"),
plot.caption = element_text(hjust=0),
axis.ticks = element_blank(),
panel.background = element_blank(),
panel.grid.major = element_line("grey80", size = 0.1),
panel.grid.minor = element_blank(),
panel.border = element_rect(colour = "black", fill=NA, size=2),
strip.background = element_rect(fill = "grey80", color = "white"),
strip.text = element_text(size=12),
axis.title = element_text(size=12),
axis.text = element_text(size=10),
plot.background = element_blank(),
legend.background = element_blank(),
legend.title = element_text(colour = "black", face = "italic"),
legend.text = element_text(colour = "black", face = "italic"),
strip.text.x = element_text(size = 14)
)
}
palette5 <- c("#25CB10", "#5AB60C", "#8FA108", "#C48C04", "#FA7800")
qBr <- function(df, variable, rnd) {
  if (missing(rnd)) {
    as.character(quantile(round(df[[variable]], 0),
                          c(.01, .2, .4, .6, .8), na.rm = T))
  } else if (rnd == FALSE | rnd == F) {
    as.character(formatC(quantile(df[[variable]],
                                  c(.01, .2, .4, .6, .8), na.rm = T),
                         digits = 3))
  }
}
q5 <- function(variable) {as.factor(ntile(variable, 5))}
# --- Setup: Functions ----
nn_function <- function(measureFrom, measureTo, k) {
  # coerce coordinate inputs to matrices for FNN
  measureFrom_Matrix <- as.matrix(measureFrom)
  measureTo_Matrix <- as.matrix(measureTo)
  # distances from each origin point to its k nearest destination points
  nn <-
    get.knnx(measureTo_Matrix, measureFrom_Matrix, k)$nn.dist
  # average the k distances for each origin point
  output <-
    as.data.frame(nn) %>%
    rownames_to_column(var = "thisPoint") %>%
    gather(points, point_distance, V1:ncol(.)) %>%
    arrange(as.numeric(thisPoint)) %>%
    group_by(thisPoint) %>%
    dplyr::summarize(pointDistance = mean(point_distance)) %>%
    arrange(as.numeric(thisPoint)) %>%
    dplyr::select(-thisPoint) %>%
    pull()
  return(output)
}
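# Example usage (hypothetical layers, illustrative only): mean distance from
# each home to its three nearest parks, with both layers in a projected CRS:
# miamiHomes.sf$parks_nn3 <-
#   nn_function(st_coordinates(miamiHomes.sf), st_coordinates(parks.sf), 3)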
multipleRingBuffer <- function(inputPolygon, maxDistance, interval)
{
#create a list of distances that we'll iterate through to create each ring
distances <- seq(0, maxDistance, interval)
#we'll start with the second value in that list - the first is '0'
distancesCounter <- 2
#total number of rings we're going to create
numberOfRings <- floor(maxDistance / interval)
#a counter of number of rings
numberOfRingsCounter <- 1
#initialize an output data frame (that is not an sf)
allRings <- data.frame()
#while the number-of-rings counter is less than the specified number of rings
while (numberOfRingsCounter <= numberOfRings)
{
#if we're interested in a negative buffer and this is the first buffer
#(ie. not distance = '0' in the distances list)
if(distances[distancesCounter] < 0 & distancesCounter == 2)
{
#buffer the input by the first distance
buffer1 <- st_buffer(inputPolygon, distances[distancesCounter])
#difference that buffer from the input polygon to get the first ring
buffer1_ <- st_difference(inputPolygon, buffer1)
#cast this sf as a polygon geometry type
thisRing <- st_cast(buffer1_, "POLYGON")
#take the last column which is 'geometry'
thisRing <- as.data.frame(thisRing[,ncol(thisRing)])
#add a new field, 'distance', so we know the distance for a given ring
thisRing$distance <- distances[distancesCounter]
}
#otherwise, if this is the second or more ring (and a negative buffer)
else if(distances[distancesCounter] < 0 & distancesCounter > 2)
{
#buffer by a specific distance
buffer1 <- st_buffer(inputPolygon, distances[distancesCounter])
#create the next smallest buffer
buffer2 <- st_buffer(inputPolygon, distances[distancesCounter-1])
#This can then be used to difference out a buffer running from 660 to 1320
#This works because differencing 1320ft by 660ft = a buffer between 660 & 1320.
#bc the area after 660ft in buffer2 = NA.
thisRing <- st_difference(buffer2,buffer1)
#cast as a polygon
thisRing <- st_cast(thisRing, "POLYGON")
#get the last field
thisRing <- as.data.frame(thisRing$geometry)
#create the distance field
thisRing$distance <- distances[distancesCounter]
}
#Otherwise, if its a positive buffer
else
{
#Create a positive buffer
buffer1 <- st_buffer(inputPolygon, distances[distancesCounter])
#create a positive buffer that is one distance smaller. So if its the first buffer
#distance, buffer1_ will = 0.
buffer1_ <- st_buffer(inputPolygon, distances[distancesCounter-1])
#difference the two buffers
thisRing <- st_difference(buffer1,buffer1_)
#cast as a polygon
thisRing <- st_cast(thisRing, "POLYGON")
#geometry column as a data frame
thisRing <- as.data.frame(thisRing[,ncol(thisRing)])
#add the distance
thisRing$distance <- distances[distancesCounter]
}
#rbind this ring to the rest of the rings
allRings <- rbind(allRings, thisRing)
#iterate the distance counter
distancesCounter <- distancesCounter + 1
#iterate the number of rings counter
numberOfRingsCounter <- numberOfRingsCounter + 1
}
#convert the allRings data frame to an sf data frame and return it
allRings <- st_as_sf(allRings)
return(allRings)
}
An accurate home price prediction algorithm can reduce volatility in the housing market and account for factors that may not be reflected in a home’s previous selling prices (e.g., a new roof or a new shopping center). Predictive algorithms can also be exceedingly difficult to perfect, however. A falsely high average estimate in a neighborhood might lead sellers to list their homes at too high an asking price, dragging out the selling process and introducing friction into the housing market. A falsely low estimate may depress the value of what is often a homeowner’s most valuable asset.
This project attempts to predict housing prices in metropolitan Miami by taking into account each home’s unique features (e.g., fence, patio) as well as local amenities and external features such as schools, parks, and access to major roads. One interesting finding from this process is that a home’s location within certain middle school attendance zones shows a positive relationship with home prices, while elementary and high school zones do not.
To create our model, we converted our features of interest into variables that could be fed into an OLS regression model. We tested each feature for correlation with home sale prices and fine-tuned our model until we were able to minimize error.
After testing and rejecting several features that did not reduce our prediction errors (e.g., distances to the nearest park, the nearest major road, and elementary and high schools), we ultimately settled on the features (independent variables) listed below.
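For the school-zone features specifically, zone membership can be attached to each home with a spatial join. The sketch below is illustrative; it assumes the midschool polygons carry a school NAME column and share the homes’ coordinate reference system.

# Attach each home's middle school attendance zone (NAME column is an assumption)
miamiHomes.sf <- miamiHomes.sf %>%
  st_join(dplyr::select(midschool, MS.Zone = NAME))
# MS.Zone can then enter the regression as a factor, which lm() expands into
# the individual zone indicators reported in the summary statistics below.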
The map below shows the spatial distribution of home prices in Miami and Miami Beach. Darker points represent more expensive homes, with the deepest purple shade representing any home 1 million dollars or higher. Given the extreme range of home prices in Miami (max approx. 27 million dollars), we felt it necessary to collapse the outlier homes into the highest tier of home prices.
Here we see the spatial relationship between home sale prices and distance from the shore. Unsurprisingly, as we move farther inland, home prices decrease.
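The shoreline-distance feature itself can be computed with sf. A minimal sketch follows, assuming a shoreline layer shoreline.sf (a hypothetical name) in the same projected coordinate system, with units in feet.

# Distance from each home to the (unioned) shoreline, in feet and miles
miamiHomes.sf$Shore1 <- as.numeric(st_distance(miamiHomes.sf, st_union(shoreline.sf)))
miamiHomes.sf$Shore.mile <- miamiHomes.sf$Shore1 / 5280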
# Map of Distance to Shore
ggplot() +
annotation_map_tile("cartolight") +
geom_sf(data=miami.base, fill='transparent') +
geom_sf(data=miamiHomes.sf, aes(colour=Shore.mile),
show.legend = "line", size= 1) +
labs(title = "Distance from Shoreline",
caption = "Map tiles by Carto, under CC BY 3.0. Data by OpenStreetMap, under ODbL.") +
scale_colour_viridis(name = "Distance (miles)")
This map shows the relationship between middle school attendance zones and home sale prices.
### Middle School Areas
# Map of Middle School Areas
ggplot() +
annotation_map_tile(type = 'cartolight', progress = 'none') +
geom_sf(data = midschool, fill = "#d3e9ff", alpha =.4, color = "#498cd3", lwd = .8)+
geom_sf(data=trainprice.sf, aes(color=SalePrice),
show.legend = "line", size = 1) +
labs(title = "Middle School Districts & Home Prices",
caption = "Map tiles by Carto, under CC BY 3.0. Data by OpenStreetMap, under ODbL.") +
scale_color_viridis(option="C",
name = "Home Price\n($0-$1M+)",
limits=c(10000,1000000),
breaks=c(0, 250000, 500000, 750000, 1000000),
direction = -1,
                    na.value = "#0D0887") # darkest plasma colour, assigned to homes above the $1M limit
Below is a map of the percent of White residents in each Census tract in Miami and Miami Beach. Although a higher percentage of White residents does not appear to be perfectly correlated with home price, the absence of White residents is clearly tied to lower estimated home prices.
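Tract-level covariates like these can be pulled with tidycensus. The sketch below is illustrative; the ACS year and variable codes are assumptions rather than our exact pull.

# Pull tract-level population, race, income, and rent for Miami-Dade (ACS 5-year)
tracts18 <- get_acs(geography = "tract", state = "FL", county = "Miami-Dade",
                    variables = c(TotalPop = "B01003_001", White = "B02001_002",
                                  MedHHInc = "B19013_001", MedRent = "B25058_001"),
                    year = 2018, geometry = TRUE, output = "wide") %>%
  mutate(pctWhite = WhiteE / TotalPopE) %>%
  st_transform(st_crs(miamiHomes.sf))
# Tract attributes can then be joined to each home with st_join()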
Several other features such as distance from major roads, parks, and location within elementary and high school attendance zones were tested, but did not prove to be relevant. As shown below, there appears to be little relationship between distance from major roads and home price.
stargazer(sumstat.df, title="Summary Statistics", type='html',
summary.stat = c('n','mean','sd','min','max'))
| Statistic | N | Mean | St. Dev. | Min | Max |
|---|---|---|---|---|---|
| SalePrice | 2,066 | 405,476.400 | 199,741.700 | 12,500 | 1,000,000 |
| LotSize | 2,066 | 6,360.875 | 1,721.617 | 1,250 | 17,620 |
| Age | 2,066 | 70.954 | 18.186 | -1 | 115 |
| Stories | 2,066 | 1.073 | 0.265 | 0 | 3 |
| Bed | 2,066 | 2.692 | 0.794 | 0 | 8 |
| Bath | 2,066 | 1.611 | 0.700 | 0 | 6 |
| Pool | 2,066 | 0.108 | 0.310 | 0 | 1 |
| Fence | 2,066 | 0.738 | 0.440 | 0 | 1 |
| Patio | 2,066 | 0.499 | 0.500 | 0 | 1 |
| Shore1 | 2,066 | 7,047.549 | 5,248.614 | 88.597 | 26,528.540 |
| MedRent | 2,040 | 1,042.535 | 311.133 | 246.000 | 2,297.000 |
| pctWhite | 2,062 | 0.703 | 0.320 | 0.057 | 0.989 |
| pctPoverty | 2,062 | 0.217 | 0.108 | 0.052 | 0.556 |
| Brownsville.MS | 1,588 | 0.098 | 0.298 | 0.000 | 1.000 |
| CitrusGrove.MS | 1,588 | 0.115 | 0.319 | 0.000 | 1.000 |
| JosedeDiego.MS | 1,588 | 0.129 | 0.335 | 0.000 | 1.000 |
| GeorgiaJA.MS | 1,588 | 0.133 | 0.340 | 0.000 | 1.000 |
| KinlochPk.MS | 1,588 | 0.196 | 0.397 | 0.000 | 1.000 |
| Madison.MS | 1,588 | 0.001 | 0.035 | 0.000 | 1.000 |
| Nautilus.MS | 1,588 | 0.061 | 0.240 | 0.000 | 1.000 |
| Shenandoah.MS | 1,588 | 0.243 | 0.429 | 0.000 | 1.000 |
| WestMiami.MS | 1,588 | 0.024 | 0.153 | 0.000 | 1.000 |
Below is a correlation matrix, showing the relatedness of each numeric variable to every other. The red-bounded box shows each variable’s correlation with sale price, our dependent variable.
### Correlation Matrix
# Create a new subset of the training data with geometry dropped
numericVars <- miamiHomesClean.sf %>%
st_drop_geometry() %>%
filter(toPredict == 0) %>%
filter(SalePrice <= 1000000) %>%
dplyr::select(-ID, -Folio) %>%
select_if(is.numeric) %>%
na.omit() %>%
rename(Shore_Distance = Shore1)
# Plot correlation matrix
ggcorrplot(
round(cor(numericVars), 1),
p.mat = cor_pmat(numericVars),
colors = c("#FA7800", "white", "#25CB10"),
type="lower",
insig = "blank") +
labs(title = "Correlation across numeric variables") +
geom_rect(aes(xmin = 0, xmax = 24.5, ymin = 0.25, ymax = 1.75),
fill = "transparent", color = "red", size = 1)
The plots below show the linear relationship between four independent variables and home prices. Actual square footage is most strongly and positively correlated with home price. Median rent in a home’s tract also has a small positive relationship, while distance from the shore and home age are, as expected, negatively correlated (i.e., as the age of a home increases, its price decreases).
# Home price correlation scatterplots
a <- ggplot(trainprice.sf, aes(y=SalePrice, x = ActualSqFt)) +
geom_point() +
geom_smooth(method = "lm")+
labs(title = "Price vs. SqFt (Actual)")+
xlab(label = 'Square Footage')
b <- ggplot(trainprice.sf, aes(y=SalePrice, x = Shore1/5280)) +
geom_point() +
geom_smooth(method = "lm")+
labs(title = "Price vs. Shore Distance")+
xlab(label = 'Distance (miles)')
c <- ggplot(trainprice.sf, aes(y=SalePrice, x = MedRent)) +
geom_point() +
geom_smooth(method = "lm")+
labs(title = "Price vs. Avg Rent (Tract)")+
  xlab(label = 'Median Rent ($)')
d <- ggplot(trainprice.sf, aes(y=SalePrice, x = Age)) +
geom_point() +
geom_smooth(method = "lm")+
labs(title = "Price vs. Home Age")+
xlab(label='Age')
ggarrange(a,b,c,d, ncol = 2, nrow = 2)
Our goal was to build a model that most accurately predicted Miami home prices. This was not a black-and-white procedure. We had three main considerations: 1) explain the variance in home prices, 2) minimize errors in predictions, and 3) be generalizable. Our first models used a simple ordinary least squares (OLS) regression. The results allowed us to exclude some census variables and remove overlapping variables (like square feet). In the next step we split our dataset 60/40, trained a model on 60% of the data, and tested it on the remaining 40%. This process gave us some insight into the model’s generalizability; it also allowed us to remove some variables. For example, this stage revealed that census tracts were our most influential neighborhood categorization, instead of property or mailing zip codes.
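A sketch of that step follows, using caret’s createDataPartition on the cleaned sale data; the intermediate name saleHomes and the exact formulas are illustrative rather than our exact code.

# Homes with a known sale price, capped at $1M (as in the summary statistics)
saleHomes <- miamiHomesClean.sf %>%
  filter(toPredict == 0, SalePrice <= 1000000)

# 60/40 train/test split
set.seed(717)
inTrain <- createDataPartition(y = saleHomes$SalePrice, p = .60, list = FALSE)
train <- saleHomes[inTrain, ]
test  <- saleHomes[-inTrain, ]

# Model 1: location and census features only
Reg1 <- lm(SalePrice ~ Shore1 + MedHHInc + TotalPop + MedRent + pctWhite +
             pctPoverty + Brownsville.MS + CitrusGrove.MS + JosedeDiego.MS +
             GeorgiaJA.MS + KinlochPk.MS + Madison.MS + Nautilus.MS +
             Shenandoah.MS, data = st_drop_geometry(train))

# Model 2: add the household-level characteristics
Reg2 <- update(Reg1, . ~ . + Property.City + LotSize + Bed + Bath + Stories +
                 Pool + Fence + Patio + ActualSqFt + Age)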
The last stage used a resampling method known as “k-fold cross-validation”: we split our dataset into 100 folds, each serving in turn as a test set while the model trained on the remaining folds. We measured the average performance across the folds in order to gauge generalizability, and we tested different combinations of features until we settled on the model with the lowest errors relative to actual sale prices.
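The cross-validation itself can be run through caret. A minimal sketch follows, assuming the saleHomes frame built above, with the fold count matching the output further below; the column drops are assumptions about identifier fields that should not enter the model.

# 100-fold cross-validation of the final linear model
fitControl <- trainControl(method = "cv", number = 100)

set.seed(825)
regCV <- train(SalePrice ~ ., method = "lm", trControl = fitControl,
               data = saleHomes %>%
                 st_drop_geometry() %>%
                 dplyr::select(-ID, -Folio, -toPredict),
               na.action = na.omit)

regCV$results   # RMSE, R-squared, and MAE averaged across the folds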
stargazer(Reg1, Reg2, title="Training Set LM Results", align=TRUE, type = "html", out = "Regression_Results.htm")
| | Dependent variable: SalePrice | |
|---|---|---|
| | (1) | (2) |
| Folio | 0.00000 | |
| (0.00000) | ||
| Property.CityMiami Beach | 220,589.400** | |
| (102,729.500) | ||
| LotSize | 17.974*** | |
| (1.660) | ||
| Bed | 8,653.610* | |
| (4,483.327) | ||
| Bath | 4,613.343 | |
| (5,440.150) | ||
| Stories | 13,854.920 | |
| (11,214.380) | ||
| Pool | 77,281.650*** | |
| (9,820.892) | ||
| Fence | -149.050 | |
| (5,646.349) | ||
| Patio | 4,073.120 | |
| (5,115.939) | ||
| ActualSqFt | 67.033*** | |
| (6.231) | ||
| Age | -698.975*** | |
| (147.148) | ||
| Shore1 | -5.745*** | -3.369*** |
| (1.229) | (1.030) | |
| MedHHInc | 1.370*** | 1.091*** |
| (0.234) | (0.193) | |
| TotalPop | 4.453** | 3.400** |
| (1.797) | (1.481) | |
| MedRent | 8.011 | 10.111 |
| (16.927) | (13.970) | |
| pctWhite | 87,738.750*** | 72,584.590*** |
| (21,213.910) | (18,038.400) | |
| pctPoverty | -65,160.580 | -28,329.320 |
| (46,955.060) | (38,669.420) | |
| Brownsville.MS | -74,321.900** | -19,595.140 |
| (35,665.230) | (29,848.970) | |
| CitrusGrove.MS | -63,085.380* | -16,536.970 |
| (33,757.230) | (27,908.010) | |
| JosedeDiego.MS | -24,574.030 | 41,689.690 |
| (36,087.170) | (30,171.430) | |
| GeorgiaJA.MS | -83,659.540** | -21,745.790 |
| (33,968.440) | (28,569.970) | |
| KinlochPk.MS | -20,898.120 | 7,151.547 |
| (23,871.550) | (19,733.870) | |
| Madison.MS | -102,752.000 | -24,901.940 |
| (89,066.670) | (73,254.900) | |
| Nautilus.MS | 229,234.000*** | |
| (37,774.810) | ||
| Shenandoah.MS | 99,482.560*** | 122,292.100*** |
| (31,097.140) | (25,991.120) | |
| WestMiami.MS | ||
| Constant | 274,559.400*** | -5,246.435 |
| (50,967.110) | (142,697.300) | |
| Observations | 1,584 | 1,584 |
| R2 | 0.603 | 0.736 |
| Adjusted R2 | 0.599 | 0.732 |
| Residual Std. Error | 113,746.600 (df = 1569) | 93,070.580 (df = 1559) |
| F Statistic | 170.158*** (df = 14; 1569) | 180.949*** (df = 24; 1559) |
| Note: | \*p<0.1; \*\*p<0.05; \*\*\*p<0.01 | |
#GEOID R2 = .3, MailingZip =.4, PropertyZip =.9
Our first regression was based on our feature engineering. We included census variables, shoreline distance, and the middle school attendance zones, but excluded household details. This produced several statistically significant features, an adjusted R2 of 0.599, and an overall F-statistic indicating that the model is statistically significant. The intercept, $274,600, is the estimated mean sale price when all of our independent variables are zero. For every additional foot from the shore, the predicted sale price drops by $5.75. Our second model, on the right, adds the household variables to the census variables and the rest of our feature engineering. The adjusted R2 improves to 0.732.
This table shows our mean absolute error and mean absolute percent error for a single test set. Clearly more work needed to be done to improve our model.
Single Test MAE and MAPE
Test MAE: 195610
Test MAPE: 61
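Those single-split figures can be computed directly from the hold-out predictions. A sketch, assuming the test split and Reg2 fit from the sketches above:

# Absolute error and absolute percent error on the 40% test set
test <- test %>%
  mutate(SalePrice.Predict = predict(Reg2, newdata = test),
         SalePrice.AbsError = abs(SalePrice.Predict - SalePrice),
         SalePrice.APE = abs(SalePrice.Predict - SalePrice) / SalePrice)

mean(test$SalePrice.AbsError, na.rm = TRUE)   # MAE
mean(test$SalePrice.APE, na.rm = TRUE) * 100  # MAPE (%)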
These are the high-level results of our final model, based on the features discussed in the introduction. We were able to explain about 75% of the variance in sale prices, while our predictions deviated from actual sale prices by about +/- $66,000 on average.
Linear Regression
2066 samples
24 predictor
No pre-processing
Resampling: Cross-Validated (100 fold)
Summary of sample sizes: 1568, 1568, 1567, 1568, 1568, 1568, ...
Resampling results:
RMSE Rsquared MAE
88416.28 0.7556264 65775.12
Tuning parameter 'intercept' was held constant at a value of TRUE
| intercept | RMSE | Rsquared | MAE | RMSESD | RsquaredSD | MAESD |
|---|---|---|---|---|---|---|
| TRUE | 88416.28 | 0.7556264 | 65775.12 | 25971.98 | 0.1477249 | 16997.15 |
The histogram below examines our final cross validation model run on 100 folds. The MAE, or Mean Absolute Error, measures the difference in actual and predicted sales price across each test subset. Our errors clustered between $50k and $75k. From this plot alone, it is difficult to say whether our model is generalizable.
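A sketch of that histogram, assuming regCV is the caret fit from the cross-validation sketch above (fold-level metrics are stored in its resample element):

# Distribution of MAE across the 100 folds
ggplot(regCV$resample, aes(x = MAE)) +
  geom_histogram(bins = 25, fill = "#25CB10", colour = "white") +
  labs(title = "Distribution of MAE across 100 folds",
       x = "Mean Absolute Error ($)", y = "Count") +
  plotTheme()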
Our final model was much more accurate at predicting prices in the lower-middle range. Because we removed homes that cost more than $1 million from our training data, our model is particularly non-generalizable at the upper end of the price spectrum, where it vastly underestimates the price of expensive homes. Removing outliers improves our overall mean errors but hurts our predictive power at higher home values.
ggplot(regCV2plot,aes(obs,pred)) +
geom_point() +
stat_smooth(aes(obs, obs),
method = "lm", se = FALSE, size = 1, colour="#FA7800") +
stat_smooth(aes(obs, pred),
method = "lm", se = FALSE, size = 1, colour="#25CB10") +
labs(title="Predicted sale price as a function of observed price",
subtitle="Orange line represents a perfect prediction; Green line represents prediction",
x="Sale Price",
y="Predicted Price") +
plotTheme() + theme(plot.title = element_text(size = 18, colour = "black"))
The following two maps show our predicted home prices based on our final model in comparison to all the actual home prices used in the training dataset. Our model fails to capture higher-priced homes along the coast and lower-priced homes in the Northwest by Hialeah (missing variation in colors as seen in training data map).
According to our model, home prices are spatially autocorrelated with their neighbors. In addition, the bottom chart shows that errors are also spatially correlated with neighbors.
Our observed Moran’s I, denoted by the vertical orange line, is statistically higher than the randomly permuted I, thus there exists spatial autocorrelation in our final model.
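A sketch of that permutation test with spdep follows, assuming the test split from above and a signed prediction error; the choice of 5 nearest neighbors for the spatial weights is an assumption.

# Signed prediction error on the test set
test$SalePrice.Error <- test$SalePrice.Predict - test$SalePrice

# k-nearest-neighbor spatial weights and Moran's I permutation test
coords <- st_coordinates(test)
spatialWeights <- nb2listw(knn2nb(knearneigh(coords, k = 5)), style = "W")
moranTest <- moran.mc(test$SalePrice.Error, spatialWeights, nsim = 999)

# Observed Moran's I (orange line) against the permuted distribution
ggplot(data.frame(I = moranTest$res[1:999]), aes(I)) +
  geom_histogram(binwidth = 0.01) +
  geom_vline(xintercept = moranTest$statistic, colour = "#FA7800", size = 1) +
  labs(title = "Observed vs. permuted Moran's I on model errors",
       x = "Moran's I", y = "Count") +
  plotTheme()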
The map below covers all 3,503 observations in the dataset, including those where no sale price is known, and applies our final model’s predictions.
# Apply the final model to every home, including those with no recorded sale price
PredictMap <-
miamiHomesClean.sf %>%
mutate(SalePrice.Predict = predict(finaltraining, miamiHomesClean.sf))
ggplot() +
annotation_map_tile("cartolight") +
geom_sf(data=miami.base, fill='transparent') +
geom_sf(data=PredictMap, aes(color=SalePrice.Predict),
show.legend = "line", size = 1) +
labs(title = "Predicted Home Prices") +
scale_color_viridis(option="C",
name = "Price ($)",
limits=c(10000,1000000),
breaks=c(0, 250000, 500000, 750000, 1000000),
direction = -1,
begin = 0,
end = .95,
                    na.value = "#0D0887") # darkest plasma colour, assigned to homes above the $1M limit
This scatterplot looks at our errors in relation to price by neighborhood. Our errors grow larger for more expensive homes.
Finally, we used census data to test generalizability across average census-tract household income. These results show that our model differs drastically across income levels, and therefore struggles with generalizability.
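The income split can be constructed by dividing tracts at the median household income and refitting the cross-validated model on each group. A sketch, assuming saleHomes and fitControl from above (the column drops are again assumptions about identifier fields):

# Cross-validated models for above- and below-median-income tracts
miamiRichLM <- train(SalePrice ~ ., method = "lm", trControl = fitControl,
                     data = saleHomes %>%
                       filter(MedHHInc > median(MedHHInc, na.rm = TRUE)) %>%
                       st_drop_geometry() %>%
                       dplyr::select(-ID, -Folio, -toPredict),
                     na.action = na.omit)

miamiPoorLM <- train(SalePrice ~ ., method = "lm", trControl = fitControl,
                     data = saleHomes %>%
                       filter(MedHHInc <= median(MedHHInc, na.rm = TRUE)) %>%
                       st_drop_geometry() %>%
                       dplyr::select(-ID, -Folio, -toPredict),
                     na.action = na.omit)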
kable(miamiPoorLM$results)
| intercept | RMSE | Rsquared | MAE | RMSESD | RsquaredSD | MAESD |
|---|---|---|---|---|---|---|
| TRUE | 130147.8 | 0.7933946 | 93060.55 | 354594.9 | 0.2778949 | 210570.7 |
kable(miamiRichLM$results)
| intercept | RMSE | Rsquared | MAE | RMSESD | RsquaredSD | MAESD |
|---|---|---|---|---|---|---|
| TRUE | 98912.42 | 0.709759 | 82492.1 | 52113.41 | 0.271611 | 39852.27 |
In conclusion, the high mean absolute error (i.e., poor accuracy) and low generalizability of this model argue against its use. The average error is still approximately $67K, far too large for a use case as high-stakes as predicting housing prices, for the reasons mentioned in the Introduction. This model is clearly missing additional features relevant to home prices, and future iterations will need to account for the not-insignificant number of extremely high-value homes in Miami and Miami Beach, as well as the spatially clustered set of lower-priced homes in the northwest of Miami.