The sales in Texas and how the housing bubble was in Texas and its cities. The curiosity arised from after reading this post. How do the sales and the construction work together. Did house contruction meet the demand for the sales? Comparing the indices for population, sales, price, and new construction might throw light on these questions
library(plyr)
library(ggplot2)
library(MASS)
library(mgcv)
## Loading required package: nlme
## This is mgcv 1.7-29. For overview type 'help("mgcv-package")'.
options(stringsAsFactors = FALSE)
msa <- read.csv("data/msa-changes/msa-states.csv")
tx_msa <- subset(msa, state == "TX")
tx_msa$state <- NULL
tx_codes <- tx_msa$msa_code
msa_labels <- read.csv("data/msa-changes/msa-major.csv")
head(msa_labels)
## msa_code city state label
## 1 10020 Abbeville LA AbbvllLA
## 2 10100 Aberdeen SD AberdnSD
## 3 10140 Aberdeen WA AberdnWA
## 4 10180 Abilene TX AbilenTX
## 5 10220 Ada OK AdaOK
## 6 10260 Adjuntas PR AdjntsPR
# Load population data,construction data,multiple listing of service in
# Texas and msa-names data
pop <- read.csv("data/census-population/population-msa.csv")
pop <- subset(pop, msa_code %in% tx_codes)
head(pop)
## city year births deaths domesticmig internationalmig naturalinc
## 64 Amarillo TX 2000 906 472 19 124 434
## 65 Amarillo TX 2001 3575 1943 -765 549 1632
## 66 Amarillo TX 2002 3505 2081 165 519 1424
## 67 Amarillo TX 2003 3694 2108 783 444 1586
## 68 Amarillo TX 2004 3873 2050 287 430 1823
## 69 Amarillo TX 2005 3801 2074 639 438 1727
## netmig npopchg_ popestimate residual msa_code
## 64 143 576 227098 -1 11100
## 65 -216 1194 228292 -222 11100
## 66 684 1869 230161 -239 11100
## 67 1227 2546 232707 -267 11100
## 68 717 2308 235015 -232 11100
## 69 1077 2629 237644 -175 11100
popt <- pop[c("msa_code", "year", "npopchg_", "popestimate", "domesticmig")]
names(popt)[3:4] <- c("pop_change", "pop")
head(popt)
## msa_code year pop_change pop domesticmig
## 64 11100 2000 576 227098 19
## 65 11100 2001 1194 228292 -765
## 66 11100 2002 1869 230161 165
## 67 11100 2003 2546 232707 783
## 68 11100 2004 2308 235015 287
## 69 11100 2005 2629 237644 639
cons <- read.csv("data/construction/construction-housing-units.csv")
cons <- subset(cons, msa_code %in% tx_codes)
cons <- cons[with(cons, order(city, year, month, size)), ]
cons$city <- NULL
head(cons)
## year month size units valuation msa_code
## 358 2000 1 1 16 1739 10180
## 318 2000 1 2 0 0 10180
## 315 2000 1 3-4 0 0 10180
## 341 2000 1 5+ 0 0 10180
## 370 2000 2 1 16 2426 10180
## 346 2000 2 2 0 0 10180
listings <- read.csv("data/texas-msa-sales/texas-listings.csv")
listings <- subset(listings, year >= 2000)
recenter_msa <- read.csv("data/texas-msa-sales/msa-names.csv")
# merge the listing with msa and remove the city from the listing as msa is
# enough
listings_raw <- merge(listings, recenter_msa, by = "msa")
listings_raw$city <- NULL
head(listings_raw)
## msa sales volume price_avg price_med listings inventory month year
## 1 110 72 5380000 74700 71400 701 6.3 1 2000
## 2 110 98 6505000 66400 58700 746 6.6 2 2000
## 3 110 130 9285000 71400 58100 784 6.8 3 2000
## 4 110 98 9730000 99300 68600 785 6.9 4 2000
## 5 110 141 10590000 75100 67300 794 6.8 5 2000
## 6 110 156 13910000 89200 66900 780 6.6 6 2000
## msa_code
## 1 10180
## 2 10180
## 3 10180
## 4 10180
## 5 10180
## 6 10180
# Aggregated based on year code and month as i want only volumn listings and
# sales. this can also be done by setting those feilds to null but its a
# lengthy process i left
listings <- ddply(listings_raw, c("msa_code", "year", "month"), summarise, volume = sum(volume,
na.rm = T), listings = sum(listings, na.rm = T), sales = sum(sales, na.rm = T))
is.na(listings$sales) <- listings$sales == 0
# calculate the avg price the sales on listing and the date and transformed
# into dataframe and appended with listings df
listings <- transform(listings, avg_price = volume/sales, pct_sold = sales/listings,
date = year + month/12)
listings <- merge(listings, msa_labels, by = "msa_code", all.x = T) #merged the msa codes (city state and label) from msa major
listings <- listings[with(listings, order(msa_code, year, month)), ] #arrange the df on code year and month
head(listings)
## msa_code year month volume listings sales avg_price pct_sold date
## 10 10180 2000 1 5380000 701 72 74722 0.1027 2000
## 11 10180 2000 2 6505000 746 98 66378 0.1314 2000
## 12 10180 2000 3 9285000 784 130 71423 0.1658 2000
## 13 10180 2000 4 9730000 785 98 99286 0.1248 2000
## 14 10180 2000 5 10590000 794 141 75106 0.1776 2000
## 15 10180 2000 6 13910000 780 156 89167 0.2000 2000
## city state label
## 10 Abilene TX AbilenTX
## 11 Abilene TX AbilenTX
## 12 Abilene TX AbilenTX
## 13 Abilene TX AbilenTX
## 14 Abilene TX AbilenTX
## 15 Abilene TX AbilenTX
# Explore sales by pop
xdate <- scale_x_continuous("Date", breaks = c(2001, 2003, 2005, 2007, 2009),
labels = c("01", "03", "05", "07", "09"))
listpop <- merge(listings, popt, by = c("msa_code", "year")) #by merging listings with popt we r considering only for Texas
head(listpop)
## msa_code year month volume listings sales avg_price pct_sold date
## 1 11100 2000 3 17930000 995 201 89204 0.2020 2000
## 2 11100 2000 12 15800000 1092 158 100000 0.1447 2001
## 3 11100 2000 11 13955000 1174 133 104925 0.1133 2001
## 4 11100 2000 8 25995000 1208 242 107417 0.2003 2001
## 5 11100 2000 1 8860000 972 102 86863 0.1049 2000
## 6 11100 2000 2 13875000 937 147 94388 0.1569 2000
## city state label pop_change pop domesticmig
## 1 Amarillo TX AmarllTX 576 227098 19
## 2 Amarillo TX AmarllTX 576 227098 19
## 3 Amarillo TX AmarllTX 576 227098 19
## 4 Amarillo TX AmarllTX 576 227098 19
## 5 Amarillo TX AmarllTX 576 227098 19
## 6 Amarillo TX AmarllTX 576 227098 19
The plots generated are:
# simply plot the date against sales to see which cities have had sales and
# how have they been in these years
qplot(date, sales, data = listpop, geom = "line") + facet_wrap(~city) + xdate +
theme(panel.background = element_rect(fill = "yellow", colour = "red"))
qplot(date, sales/pop * 10000, data = listpop, geom = "line") + facet_wrap(~city) +
xdate + ylim(2, 22) + theme(panel.background = element_rect(fill = "orange",
colour = "red"))
# find the mean of the residuals from the linear regression model attained
dicease <- function(var, month) {
resid(rlm(var ~ factor(month), na.action = "na.exclude")) + mean(var, na.rm = TRUE)
}
listpop <- ddply(listpop, "msa_code", transform, sales_adj = dicease(sales/pop *
10000, month))
qplot(date, sales_adj, data = listpop, geom = "line") + facet_wrap(~city) +
xdate + ylim(2, 22) + theme(panel.background = element_rect(fill = "orange"))
list2008 <- ddply(subset(listpop, year == 2008), "city", summarise, pop = pop[length(pop)],
sales = sum(sales), avg_sales_per_10000 = sum(sales)/pop[length(pop)]/12 *
10000)
qplot(pop/1e+06, avg_sales_per_10000, data = list2008, xlab = "Population (millions)",
ylab = "Average monthly sales (per 10000 people)") + geom_text(aes(label = city,
color = "red"), size = 5, hjust = -0.1, angle = 45) + theme(panel.background = element_rect(fill = "yellow",
colour = "red"))
qplot(pop, avg_sales_per_10000, label = city, data = list2008, geom = c("point",
"text"), log = "x") + theme(panel.background = element_rect(fill = "orange",
colour = "black"))
# Compare indices for population, sales, price, and new construction
single <- subset(cons, size == 1) #consider only for 1
single$size <- NULL
all <- merge(listpop, single, by = c("msa_code", "year", "month"), all = T) #merge listings of texas pop and the construction of singles
all <- all[with(all, order(msa_code, year, month)), ]
all <- subset(all, !is.na(city)) #clean NA rows
index <- function(x) {
x/x[1]
}
# the smooth function gives the smoothness to the model and predict the fit
smooth <- function(var, data) {
try_default(predict(gam(var ~ s(data), na.action = na.exclude)), NA)
}
smoothes <- ddply(all, "city", summarise, date = date, construction_s = smooth(units/pop,
date) * 10000, population_s = smooth(pop, date), sales_s = smooth(sales/pop,
date) * 10000, sold_value_s = smooth(avg_price, date), new_value_s = smooth(valuation/units,
date))
ggplot(smoothes, aes(date)) + geom_line(aes(y = sales_s, colour = "sold")) +
geom_line(aes(y = construction_s, colour = "built")) + labs(colour = "Type",
y = "Houses (per 10000 people)") + facet_wrap(~city) + xdate + theme(panel.background = element_rect(fill = "white"))
ggplot(smoothes, aes(date)) + geom_line(aes(y = sold_value_s/1000, colour = "sold")) +
geom_line(aes(y = new_value_s, colour = "built")) + labs(colour = "Type",
y = "Average price ($000)") + facet_wrap(~city) + xdate + theme(panel.background = element_rect(fill = "white"))