Texas sales

The sales in Texas and how the housing bubble was in Texas and its cities. The curiosity arised from after reading this post. How do the sales and the construction work together. Did house contruction meet the demand for the sales? Comparing the indices for population, sales, price, and new construction might throw light on these questions

Follow the comments to understand the code

library(plyr)
library(ggplot2)
library(MASS)
library(mgcv)

## Loading required package: nlme
## This is mgcv 1.7-29. For overview type 'help("mgcv-package")'.

options(stringsAsFactors = FALSE)
msa <- read.csv("data/msa-changes/msa-states.csv")
tx_msa <- subset(msa, state == "TX")
tx_msa$state <- NULL
tx_codes <- tx_msa$msa_code
msa_labels <- read.csv("data/msa-changes/msa-major.csv")
head(msa_labels)

##   msa_code      city state    label
## 1    10020 Abbeville    LA AbbvllLA
## 2    10100  Aberdeen    SD AberdnSD
## 3    10140  Aberdeen    WA AberdnWA
## 4    10180   Abilene    TX AbilenTX
## 5    10220       Ada    OK    AdaOK
## 6    10260  Adjuntas    PR AdjntsPR

# Load population data,construction data,multiple listing of service in
# Texas and msa-names data
pop <- read.csv("data/census-population/population-msa.csv")
pop <- subset(pop, msa_code %in% tx_codes)
head(pop)

##           city year births deaths domesticmig internationalmig naturalinc
## 64 Amarillo TX 2000    906    472          19              124        434
## 65 Amarillo TX 2001   3575   1943        -765              549       1632
## 66 Amarillo TX 2002   3505   2081         165              519       1424
## 67 Amarillo TX 2003   3694   2108         783              444       1586
## 68 Amarillo TX 2004   3873   2050         287              430       1823
## 69 Amarillo TX 2005   3801   2074         639              438       1727
##    netmig npopchg_ popestimate residual msa_code
## 64    143      576      227098       -1    11100
## 65   -216     1194      228292     -222    11100
## 66    684     1869      230161     -239    11100
## 67   1227     2546      232707     -267    11100
## 68    717     2308      235015     -232    11100
## 69   1077     2629      237644     -175    11100

popt <- pop[c("msa_code", "year", "npopchg_", "popestimate", "domesticmig")]
names(popt)[3:4] <- c("pop_change", "pop")
head(popt)

##    msa_code year pop_change    pop domesticmig
## 64    11100 2000        576 227098          19
## 65    11100 2001       1194 228292        -765
## 66    11100 2002       1869 230161         165
## 67    11100 2003       2546 232707         783
## 68    11100 2004       2308 235015         287
## 69    11100 2005       2629 237644         639

cons <- read.csv("data/construction/construction-housing-units.csv")
cons <- subset(cons, msa_code %in% tx_codes)
cons <- cons[with(cons, order(city, year, month, size)), ]
cons$city <- NULL
head(cons)

##     year month size units valuation msa_code
## 358 2000     1    1    16      1739    10180
## 318 2000     1    2     0         0    10180
## 315 2000     1  3-4     0         0    10180
## 341 2000     1   5+     0         0    10180
## 370 2000     2    1    16      2426    10180
## 346 2000     2    2     0         0    10180

listings <- read.csv("data/texas-msa-sales/texas-listings.csv")
listings <- subset(listings, year >= 2000)
recenter_msa <- read.csv("data/texas-msa-sales/msa-names.csv")
# merge the listing with msa and remove the city from the listing as msa is
# enough
listings_raw <- merge(listings, recenter_msa, by = "msa")
listings_raw$city <- NULL
head(listings_raw)

##   msa sales   volume price_avg price_med listings inventory month year
## 1 110    72  5380000     74700     71400      701       6.3     1 2000
## 2 110    98  6505000     66400     58700      746       6.6     2 2000
## 3 110   130  9285000     71400     58100      784       6.8     3 2000
## 4 110    98  9730000     99300     68600      785       6.9     4 2000
## 5 110   141 10590000     75100     67300      794       6.8     5 2000
## 6 110   156 13910000     89200     66900      780       6.6     6 2000
##   msa_code
## 1    10180
## 2    10180
## 3    10180
## 4    10180
## 5    10180
## 6    10180

# Aggregated based on year code and month as i want only volumn listings and
# sales. this can also be done by setting those feilds to null but its a
# lengthy process i left
listings <- ddply(listings_raw, c("msa_code", "year", "month"), summarise, volume = sum(volume, 
    na.rm = T), listings = sum(listings, na.rm = T), sales = sum(sales, na.rm = T))
is.na(listings$sales) <- listings$sales == 0
# calculate the avg price the sales on listing and the date and transformed
# into dataframe and appended with listings df
listings <- transform(listings, avg_price = volume/sales, pct_sold = sales/listings, 
    date = year + month/12)
listings <- merge(listings, msa_labels, by = "msa_code", all.x = T)  #merged the msa codes (city state and label) from msa major
listings <- listings[with(listings, order(msa_code, year, month)), ]  #arrange the df on code year and month
head(listings)

##    msa_code year month   volume listings sales avg_price pct_sold date
## 10    10180 2000     1  5380000      701    72     74722   0.1027 2000
## 11    10180 2000     2  6505000      746    98     66378   0.1314 2000
## 12    10180 2000     3  9285000      784   130     71423   0.1658 2000
## 13    10180 2000     4  9730000      785    98     99286   0.1248 2000
## 14    10180 2000     5 10590000      794   141     75106   0.1776 2000
## 15    10180 2000     6 13910000      780   156     89167   0.2000 2000
##       city state    label
## 10 Abilene    TX AbilenTX
## 11 Abilene    TX AbilenTX
## 12 Abilene    TX AbilenTX
## 13 Abilene    TX AbilenTX
## 14 Abilene    TX AbilenTX
## 15 Abilene    TX AbilenTX

# Explore sales by pop
xdate <- scale_x_continuous("Date", breaks = c(2001, 2003, 2005, 2007, 2009), 
    labels = c("01", "03", "05", "07", "09"))
listpop <- merge(listings, popt, by = c("msa_code", "year"))  #by merging listings with popt we r considering only for Texas
head(listpop)

##   msa_code year month   volume listings sales avg_price pct_sold date
## 1    11100 2000     3 17930000      995   201     89204   0.2020 2000
## 2    11100 2000    12 15800000     1092   158    100000   0.1447 2001
## 3    11100 2000    11 13955000     1174   133    104925   0.1133 2001
## 4    11100 2000     8 25995000     1208   242    107417   0.2003 2001
## 5    11100 2000     1  8860000      972   102     86863   0.1049 2000
## 6    11100 2000     2 13875000      937   147     94388   0.1569 2000
##       city state    label pop_change    pop domesticmig
## 1 Amarillo    TX AmarllTX        576 227098          19
## 2 Amarillo    TX AmarllTX        576 227098          19
## 3 Amarillo    TX AmarllTX        576 227098          19
## 4 Amarillo    TX AmarllTX        576 227098          19
## 5 Amarillo    TX AmarllTX        576 227098          19
## 6 Amarillo    TX AmarllTX        576 227098          19

The plots generated are:

# simply plot the date against sales to see which cities have had sales and
# how have they been in these years
qplot(date, sales, data = listpop, geom = "line") + facet_wrap(~city) + xdate + 
    theme(panel.background = element_rect(fill = "yellow", colour = "red"))

plot of chunk unnamed-chunk-2


qplot(date, sales/pop * 10000, data = listpop, geom = "line") + facet_wrap(~city) + 
    xdate + ylim(2, 22) + theme(panel.background = element_rect(fill = "orange", 
    colour = "red"))

plot of chunk unnamed-chunk-2


# find the mean of the residuals from the linear regression model attained
dicease <- function(var, month) {
    resid(rlm(var ~ factor(month), na.action = "na.exclude")) + mean(var, na.rm = TRUE)
}

listpop <- ddply(listpop, "msa_code", transform, sales_adj = dicease(sales/pop * 
    10000, month))

qplot(date, sales_adj, data = listpop, geom = "line") + facet_wrap(~city) + 
    xdate + ylim(2, 22) + theme(panel.background = element_rect(fill = "orange"))

plot of chunk unnamed-chunk-2


list2008 <- ddply(subset(listpop, year == 2008), "city", summarise, pop = pop[length(pop)], 
    sales = sum(sales), avg_sales_per_10000 = sum(sales)/pop[length(pop)]/12 * 
        10000)
qplot(pop/1e+06, avg_sales_per_10000, data = list2008, xlab = "Population (millions)", 
    ylab = "Average monthly sales (per 10000 people)") + geom_text(aes(label = city, 
    color = "red"), size = 5, hjust = -0.1, angle = 45) + theme(panel.background = element_rect(fill = "yellow", 
    colour = "red"))

plot of chunk unnamed-chunk-2


qplot(pop, avg_sales_per_10000, label = city, data = list2008, geom = c("point", 
    "text"), log = "x") + theme(panel.background = element_rect(fill = "orange", 
    colour = "black"))

plot of chunk unnamed-chunk-2


# Compare indices for population, sales, price, and new construction
single <- subset(cons, size == 1)  #consider only for 1
single$size <- NULL
all <- merge(listpop, single, by = c("msa_code", "year", "month"), all = T)  #merge listings of texas pop and the construction of singles
all <- all[with(all, order(msa_code, year, month)), ]
all <- subset(all, !is.na(city))  #clean NA rows

index <- function(x) {
    x/x[1]
}
# the smooth function gives the smoothness to the model and predict the fit
smooth <- function(var, data) {
    try_default(predict(gam(var ~ s(data), na.action = na.exclude)), NA)
}
smoothes <- ddply(all, "city", summarise, date = date, construction_s = smooth(units/pop, 
    date) * 10000, population_s = smooth(pop, date), sales_s = smooth(sales/pop, 
    date) * 10000, sold_value_s = smooth(avg_price, date), new_value_s = smooth(valuation/units, 
    date))

ggplot(smoothes, aes(date)) + geom_line(aes(y = sales_s, colour = "sold")) + 
    geom_line(aes(y = construction_s, colour = "built")) + labs(colour = "Type", 
    y = "Houses (per 10000 people)") + facet_wrap(~city) + xdate + theme(panel.background = element_rect(fill = "white"))

plot of chunk unnamed-chunk-2


ggplot(smoothes, aes(date)) + geom_line(aes(y = sold_value_s/1000, colour = "sold")) + 
    geom_line(aes(y = new_value_s, colour = "built")) + labs(colour = "Type", 
    y = "Average price ($000)") + facet_wrap(~city) + xdate + theme(panel.background = element_rect(fill = "white"))

plot of chunk unnamed-chunk-2