library(plyr)
library(ggplot2)
library(R.oo)
## Loading required package: R.methodsS3
## R.methodsS3 v1.6.1 (2014-01-04) successfully loaded. See ?R.methodsS3 for help.
## R.oo v1.18.0 (2014-02-22) successfully loaded. See ?R.oo for help.
##
## Attaching package: 'R.oo'
##
## The following objects are masked from 'package:methods':
##
## getClasses, getMethods
##
## The following objects are masked from 'package:base':
##
## attach, detach, gc, load, save
options(stringsAsFactors = F)
hpi <- read.csv("data/fhfa-house-price-index/fhfa-house-price-index-msa.csv",
header = T)
hpi$time <- hpi$year + hpi$quarter/4
hpi <- na.omit(hpi)
# find the difference/change of hpi with 2009
head(hpi)
## city state fips_msa year quarter hpi error time
## 1 Abilene TX 10180 2000 1 112.1 2.63 2000
## 2 Abilene TX 10180 2000 2 112.5 2.44 2000
## 3 Abilene TX 10180 2000 3 114.1 2.47 2001
## 4 Abilene TX 10180 2000 4 116.7 2.70 2001
## 5 Abilene TX 10180 2001 1 116.8 2.64 2001
## 6 Abilene TX 10180 2001 2 117.7 2.55 2002
get_max <- function(df) {
hpi2009 <- df$hpi[df$time == 2009]
df <- df[df$hpi == max(df$hpi), ]
df$change <- hpi2009 - df$hpi
df
}
maxhpi <- ddply(hpi, .(fips_msa), get_max)
maxhpi <- maxhpi[, c(1, 2, 3, 6, 8, 9)]
names(maxhpi)[3] <- "cbsa" #renamed only then i was able to merge with pop cbsa
maxhpi$rate <- with(maxhpi, change/(2009.25 - time)) #calculate the rate of change with wrt time, we need this to calculate rate per hpi
maxhpi <- within(maxhpi, per_change <- change/hpi)
maxhpi <- within(maxhpi, per_rate <- rate/hpi)
maxhpi <- within(maxhpi, state <- as.character(state))
for (i in 1:nrow(maxhpi)) maxhpi$state[i] <- trim(gsub("MSAD", "", maxhpi$state[i]))
head(maxhpi)
## city state cbsa hpi time change rate per_change
## 1 Abilene TX 10180 174.2 2009 0.00 0.000 0.00000
## 2 Akron OH 10420 154.5 2006 -8.56 -2.853 -0.05540
## 3 Albany GA 10500 173.5 2008 -7.20 -7.200 -0.04151
## 4 Albany-Schenectady-Troy NY 10580 202.0 2008 -3.38 -3.380 -0.01673
## 5 Albuquerque NM 10740 184.6 2008 -4.45 -4.450 -0.02411
## 6 Alexandria LA 10780 195.8 2009 -4.58 -Inf -0.02340
## per_rate
## 1 0.00000
## 2 -0.01847
## 3 -0.04151
## 4 -0.01673
## 5 -0.02411
## 6 -Inf
The plots are:
overalloutcome <- qplot(data = maxhpi, hpi, per_rate, colour = state, geom = "text",
label = state, main = "Max HPI vs. outcome", ylab = "Rate of change per year (as percentage of max HPI)",
xlab = "Maximum HPI (2005-2009)") + theme(legend.position = "none")
overalloutcome
pop <- read.csv("exploration/fhfa-house-price-index/pop-cbsa.csv", header = T)
head(pop)
## cbsa cbsaname pop2k
## 1 cbsa CBSA Name Total Pop, 2000 census
## 2 19190644
## 3 10020 Abbeville, LA Micropolitan Statistical Area 53807
## 4 10100 Aberdeen, SD Micropolitan Statistical Area 39827
## 5 10140 Aberdeen, WA Micropolitan Statistical Area 67194
## 6 10180 Abilene, TX Metropolitan Statistical Area 160245
## afact
## 1 cbsa to cbsa alloc factor
## 2 1
## 3 1
## 4 1
## 5 1
## 6 1
pop <- pop[-c(1, 2), c(1, 3)]
pop$cbsa <- as.numeric(pop$cbsa)
pop$pop2k <- as.numeric(pop$pop2k)
head(pop)
## cbsa pop2k
## 3 10020 53807
## 4 10100 39827
## 5 10140 67194
## 6 10180 160245
## 7 10220 35143
## 8 10300 98890
maxhpi <- merge(maxhpi, pop, by = "cbsa", all.x = T)
FLoutcome <- qplot(data = maxhpi[maxhpi$state == "FL", ], hpi, per_rate, main = "Florida Max HPI vs. outcome",
ylab = "Rate of change per year (as percentage of max HPI)", xlab = "Maximum HPI (2005-2009)") +
geom_text(aes(label = city, color = "red"), size = 5, hjust = -0.1, angle = 45)
FLoutcome
CAoutcome <- qplot(data = maxhpi[maxhpi$state == "CA", ], hpi, per_rate, main = "California Max HPI vs. outcome",
ylab = "Rate of change per year (as percentage of max HPI)", xlab = "Maximum HPI (2005-2009)") +
geom_text(aes(label = city, color = "red"), size = 5, hjust = -0.1, angle = 5)
CAoutcome
# took only these states because they seem to have strong realtion with hpi
# and outcome
FCANoutcomes <- qplot(data = maxhpi[maxhpi$state %in% c("AZ", "CA", "FL", "NV",
"NJ", "OR"), ], hpi, per_rate, geom = c("point", "text"), label = state,
colour = state, main = "Max HPI vs. outcome", ylab = "Rate of change per year (as percentage of max HPI)",
xlab = "Maximum HPI (2005-2009)") + theme(legend.position = "none") + facet_wrap(~state)
FCANoutcomes
location <- read.csv("exploration/fhfa-house-price-index/loc-cbsa.csv", header = T)
head(location)
## cbsa cbsaname
## 1 cbsa CBSA Name
## 2
## 3 10020 Abbeville, LA Micropolitan Statistical Area
## 4 10100 Aberdeen, SD Micropolitan Statistical Area
## 5 10140 Aberdeen, WA Micropolitan Statistical Area
## 6 10180 Abilene, TX Metropolitan Statistical Area
## intptlon intptlat
## 1 Wtd centroid W longitude, degrees Wtd centroid latitude, degrees
## 2 -90.984558 38.711639
## 3 -92.174303 29.984629
## 4 -98.526062 45.477781
## 5 -123.763184 46.995579
## 6 -99.749932 32.458902
## pop2k afact
## 1 Total Pop, 2000 census cbsa to cbsa alloc factor
## 2 19190644 1
## 3 53807 1
## 4 39827 1
## 5 67194 1
## 6 160245 1
location <- location[-c(1, 2), c(1, 3, 4)]
names(location) <- c("cbsa", "longitude", "latitude")
location$cbsa <- as.numeric(location$cbsa)
location$longitude <- as.numeric(location$longitude)
location$latitude <- as.numeric(location$latitude)
head(location)
## cbsa longitude latitude
## 3 10020 -92.17 29.98
## 4 10100 -98.53 45.48
## 5 10140 -123.76 47.00
## 6 10180 -99.75 32.46
## 7 10220 -96.67 34.77
## 8 10300 -84.05 41.92
maxhpi <- merge(maxhpi, location, by = "cbsa", all.x = T)
head(maxhpi)
## cbsa city state hpi time change rate per_change
## 1 10180 Abilene TX 174.2 2009 0.00 0.000 0.00000
## 2 10420 Akron OH 154.5 2006 -8.56 -2.853 -0.05540
## 3 10500 Albany GA 173.5 2008 -7.20 -7.200 -0.04151
## 4 10580 Albany-Schenectady-Troy NY 202.0 2008 -3.38 -3.380 -0.01673
## 5 10740 Albuquerque NM 184.6 2008 -4.45 -4.450 -0.02411
## 6 10780 Alexandria LA 195.8 2009 -4.58 -Inf -0.02340
## per_rate pop2k longitude latitude
## 1 0.00000 160245 -99.75 32.46
## 2 -0.01847 694960 -81.46 41.12
## 3 -0.04151 157833 -84.15 31.59
## 4 -0.01673 825875 -73.82 42.78
## 5 -0.02411 729649 -106.61 35.10
## 6 -Inf 145035 -92.47 31.32
calhpi <- na.omit(maxhpi[maxhpi$state == "CA", ])
for (i in 1:nrow(calhpi)) {
if (calhpi$latitude[i] >= 36.14)
calhpi$n_or_s[i] <- "n" else calhpi$n_or_s[i] <- "s"
}
for (i in 1:nrow(calhpi)) {
if (calhpi$latitude[i] <= 36.14 & calhpi$longitude[i] <= -121.768)
calhpi$region[i] <- "bay_area" else if (calhpi$latitude[i] <= 35.38)
calhpi$region[i] <- "south_cal" else calhpi$region[i] <- "others"
}
LA <- maxhpi[210, ]
LA$latitude <- 33.93
LA$longitude <- -118.4
LA$n_or_s <- "s"
LA$region <- "south_cal"
SanFrancisco <- maxhpi[316, ]
SanFrancisco$latitude <- 37.75
SanFrancisco$longitude <- -122.68
SanFrancisco$n_or_s <- "n"
SanFrancisco$region <- "bay_area"
Oakland <- maxhpi[252, ]
Oakland$latitude <- 37.73
Oakland$longitude <- -122.22
Oakland$n_or_s <- "n"
Oakland$region <- "bay_area"
Irvine <- maxhpi[319, ]
Irvine$latitude <- 33.67
Irvine$longitude <- -117.88
Irvine$n_or_s <- "s"
Irvine$region <- "south_cal"
calhpi <- rbind(calhpi, LA, SanFrancisco, Oakland, Irvine)
calhpi[c(9, 14, 16, 19, 20, 26, 27), 14] <- "bay_area"
calhpi$region <- c("central valley", "central valley", "southern desert", "central valley",
"central valley", "central valley", "central valley", "central valley",
"bay area", "south cal", "central valley", "so cal", "central valley", "bay area",
"south cal", "bay area", "south cal", "south cal", "bay area", "bay area",
"central valley", "bay area", "central valley", "central valley", "south cal",
"bay area", "bay area", "south cal")
head(calhpi)
## cbsa city state hpi time change rate per_change
## 24 12540 Bakersfield CA 280.3 2007 -100.68 -44.75 -0.3591
## 71 17020 Chico CA 271.7 2006 -47.55 -17.29 -0.1750
## 106 20940 El Centro CA 258.7 2007 -102.20 -51.10 -0.3951
## 130 23420 Fresno CA 285.6 2007 -92.45 -36.98 -0.3236
## 148 25260 Hanford-Corcoran CA 249.7 2007 -61.81 -30.91 -0.2475
## 215 31460 Madera-Chowchilla CA 280.9 2007 -95.39 -42.40 -0.3395
## per_rate pop2k longitude latitude n_or_s region
## 24 -0.15962 661645 -118.9 35.37 s central valley
## 71 -0.06364 203171 -121.7 39.67 n central valley
## 106 -0.19753 142361 -115.5 32.85 s southern desert
## 130 -0.12946 799407 -119.8 36.74 n central valley
## 148 -0.12376 129461 -119.7 36.24 n central valley
## 215 -0.15091 123109 -120.0 37.05 n central valley
CalByRegion <- qplot(data = calhpi, hpi, per_rate, colour = region, geom = "text",
label = state, main = "California Max HPI by region vs. outcome", ylab = "Rate of change per year (as percentage of max HPI)",
xlab = "Maximum HPI (2005-2009)")
CalByRegion
calhpi2 <- hpi[hpi$state == "CA" | hpi$state == "CA MSAD", ]
names(calhpi2)[3] <- "cbsa"
toget <- calhpi[, c(1, 14)]
calhpi2 <- merge(calhpi2, toget, by = "cbsa")
head(calhpi2)
## cbsa city state year quarter hpi error time region
## 1 12540 Bakersfield CA 2000 1 104.2 0.96 2000 central valley
## 2 12540 Bakersfield CA 2000 2 104.9 0.94 2000 central valley
## 3 12540 Bakersfield CA 2000 3 107.2 0.94 2001 central valley
## 4 12540 Bakersfield CA 2000 4 109.4 0.96 2001 central valley
## 5 12540 Bakersfield CA 2001 1 111.8 0.95 2001 central valley
## 6 12540 Bakersfield CA 2001 2 113.9 0.95 2002 central valley
CalHpiByRegion <- qplot(data = calhpi2, time, hpi, geom = "line", group = cbsa,
colour = region, main = "California HPI by region on time")
CalHpiByRegion
calhpi2 <- merge(calhpi2, pop, by = "cbsa", all.x = T)
head(calhpi2)
## cbsa city state year quarter hpi error time region
## 1 12540 Bakersfield CA 2005 1 213.6 1.78 2005 central valley
## 2 12540 Bakersfield CA 2002 4 130.1 1.05 2003 central valley
## 3 12540 Bakersfield CA 2005 2 227.6 1.88 2006 central valley
## 4 12540 Bakersfield CA 2003 2 138.0 1.11 2004 central valley
## 5 12540 Bakersfield CA 2001 3 115.5 0.96 2002 central valley
## 6 12540 Bakersfield CA 2004 4 200.3 1.66 2005 central valley
## pop2k
## 1 661645
## 2 661645
## 3 661645
## 4 661645
## 5 661645
## 6 661645
qplot(data = calhpi2, time, hpi, geom = "line", group = cbsa, colour = pop2k,
main = "California HPI by region based on the population")