Exploring the HPIs based on the regions in USA during 2005-2009

library(plyr)
library(ggplot2)
library(R.oo)

## Loading required package: R.methodsS3
## R.methodsS3 v1.6.1 (2014-01-04) successfully loaded. See ?R.methodsS3 for help.
## R.oo v1.18.0 (2014-02-22) successfully loaded. See ?R.oo for help.
## 
## Attaching package: 'R.oo'
## 
## The following objects are masked from 'package:methods':
## 
##     getClasses, getMethods
## 
## The following objects are masked from 'package:base':
## 
##     attach, detach, gc, load, save

options(stringsAsFactors = F)
hpi <- read.csv("data/fhfa-house-price-index/fhfa-house-price-index-msa.csv", 
    header = T)
hpi$time <- hpi$year + hpi$quarter/4
hpi <- na.omit(hpi)
# find the difference/change of hpi with 2009
head(hpi)

##      city state fips_msa year quarter   hpi error time
## 1 Abilene    TX    10180 2000       1 112.1  2.63 2000
## 2 Abilene    TX    10180 2000       2 112.5  2.44 2000
## 3 Abilene    TX    10180 2000       3 114.1  2.47 2001
## 4 Abilene    TX    10180 2000       4 116.7  2.70 2001
## 5 Abilene    TX    10180 2001       1 116.8  2.64 2001
## 6 Abilene    TX    10180 2001       2 117.7  2.55 2002

get_max <- function(df) {
    hpi2009 <- df$hpi[df$time == 2009]
    df <- df[df$hpi == max(df$hpi), ]
    df$change <- hpi2009 - df$hpi
    df
}

maxhpi <- ddply(hpi, .(fips_msa), get_max)
maxhpi <- maxhpi[, c(1, 2, 3, 6, 8, 9)]
names(maxhpi)[3] <- "cbsa"  #renamed only then i was able to merge with pop cbsa

maxhpi$rate <- with(maxhpi, change/(2009.25 - time))  #calculate the rate of change with wrt time, we need this to calculate rate per hpi
maxhpi <- within(maxhpi, per_change <- change/hpi)
maxhpi <- within(maxhpi, per_rate <- rate/hpi)
maxhpi <- within(maxhpi, state <- as.character(state))
for (i in 1:nrow(maxhpi)) maxhpi$state[i] <- trim(gsub("MSAD", "", maxhpi$state[i]))
head(maxhpi)

##                      city state  cbsa   hpi time change   rate per_change
## 1                 Abilene    TX 10180 174.2 2009   0.00  0.000    0.00000
## 2                   Akron    OH 10420 154.5 2006  -8.56 -2.853   -0.05540
## 3                  Albany    GA 10500 173.5 2008  -7.20 -7.200   -0.04151
## 4 Albany-Schenectady-Troy    NY 10580 202.0 2008  -3.38 -3.380   -0.01673
## 5             Albuquerque    NM 10740 184.6 2008  -4.45 -4.450   -0.02411
## 6              Alexandria    LA 10780 195.8 2009  -4.58   -Inf   -0.02340
##   per_rate
## 1  0.00000
## 2 -0.01847
## 3 -0.04151
## 4 -0.01673
## 5 -0.02411
## 6     -Inf

The plots are:


overalloutcome <- qplot(data = maxhpi, hpi, per_rate, colour = state, geom = "text", 
    label = state, main = "Max HPI vs. outcome", ylab = "Rate of change per year (as percentage of max HPI)", 
    xlab = "Maximum HPI (2005-2009)") + theme(legend.position = "none")
overalloutcome

plot of chunk unnamed-chunk-2


pop <- read.csv("exploration/fhfa-house-price-index/pop-cbsa.csv", header = T)
head(pop)

##    cbsa                                    cbsaname                  pop2k
## 1  cbsa                                   CBSA Name Total Pop, 2000 census
## 2                                                                 19190644
## 3 10020 Abbeville, LA Micropolitan Statistical Area                  53807
## 4 10100  Aberdeen, SD Micropolitan Statistical Area                  39827
## 5 10140  Aberdeen, WA Micropolitan Statistical Area                  67194
## 6 10180   Abilene, TX Metropolitan Statistical Area                 160245
##                       afact
## 1 cbsa to cbsa alloc factor
## 2                        1 
## 3                        1 
## 4                        1 
## 5                        1 
## 6                        1

pop <- pop[-c(1, 2), c(1, 3)]
pop$cbsa <- as.numeric(pop$cbsa)
pop$pop2k <- as.numeric(pop$pop2k)
head(pop)

##    cbsa  pop2k
## 3 10020  53807
## 4 10100  39827
## 5 10140  67194
## 6 10180 160245
## 7 10220  35143
## 8 10300  98890

maxhpi <- merge(maxhpi, pop, by = "cbsa", all.x = T)

FLoutcome <- qplot(data = maxhpi[maxhpi$state == "FL", ], hpi, per_rate, main = "Florida Max HPI vs. outcome", 
    ylab = "Rate of change per year (as percentage of max HPI)", xlab = "Maximum HPI (2005-2009)") + 
    geom_text(aes(label = city, color = "red"), size = 5, hjust = -0.1, angle = 45)
FLoutcome


CAoutcome <- qplot(data = maxhpi[maxhpi$state == "CA", ], hpi, per_rate, main = "California Max HPI vs. outcome", 
    ylab = "Rate of change per year (as percentage of max HPI)", xlab = "Maximum HPI (2005-2009)") + 
    geom_text(aes(label = city, color = "red"), size = 5, hjust = -0.1, angle = 5)
CAoutcome

plot of chunk unnamed-chunk-2

# took only these states because they seem to have strong realtion with hpi
# and outcome
FCANoutcomes <- qplot(data = maxhpi[maxhpi$state %in% c("AZ", "CA", "FL", "NV", 
    "NJ", "OR"), ], hpi, per_rate, geom = c("point", "text"), label = state, 
    colour = state, main = "Max HPI vs. outcome", ylab = "Rate of change per year (as percentage of max HPI)", 
    xlab = "Maximum HPI (2005-2009)") + theme(legend.position = "none") + facet_wrap(~state)
FCANoutcomes

plot of chunk unnamed-chunk-2


location <- read.csv("exploration/fhfa-house-price-index/loc-cbsa.csv", header = T)
head(location)

##    cbsa                                    cbsaname
## 1  cbsa                                   CBSA Name
## 2                                                  
## 3 10020 Abbeville, LA Micropolitan Statistical Area
## 4 10100  Aberdeen, SD Micropolitan Statistical Area
## 5 10140  Aberdeen, WA Micropolitan Statistical Area
## 6 10180   Abilene, TX Metropolitan Statistical Area
##                            intptlon                       intptlat
## 1 Wtd centroid W longitude, degrees Wtd centroid latitude, degrees
## 2                        -90.984558                      38.711639
## 3                        -92.174303                      29.984629
## 4                        -98.526062                      45.477781
## 5                       -123.763184                      46.995579
## 6                        -99.749932                      32.458902
##                    pop2k                     afact
## 1 Total Pop, 2000 census cbsa to cbsa alloc factor
## 2               19190644                        1 
## 3                  53807                        1 
## 4                  39827                        1 
## 5                  67194                        1 
## 6                 160245                        1

location <- location[-c(1, 2), c(1, 3, 4)]
names(location) <- c("cbsa", "longitude", "latitude")
location$cbsa <- as.numeric(location$cbsa)
location$longitude <- as.numeric(location$longitude)
location$latitude <- as.numeric(location$latitude)
head(location)

##    cbsa longitude latitude
## 3 10020    -92.17    29.98
## 4 10100    -98.53    45.48
## 5 10140   -123.76    47.00
## 6 10180    -99.75    32.46
## 7 10220    -96.67    34.77
## 8 10300    -84.05    41.92

maxhpi <- merge(maxhpi, location, by = "cbsa", all.x = T)
head(maxhpi)

##    cbsa                    city state   hpi time change   rate per_change
## 1 10180                 Abilene    TX 174.2 2009   0.00  0.000    0.00000
## 2 10420                   Akron    OH 154.5 2006  -8.56 -2.853   -0.05540
## 3 10500                  Albany    GA 173.5 2008  -7.20 -7.200   -0.04151
## 4 10580 Albany-Schenectady-Troy    NY 202.0 2008  -3.38 -3.380   -0.01673
## 5 10740             Albuquerque    NM 184.6 2008  -4.45 -4.450   -0.02411
## 6 10780              Alexandria    LA 195.8 2009  -4.58   -Inf   -0.02340
##   per_rate  pop2k longitude latitude
## 1  0.00000 160245    -99.75    32.46
## 2 -0.01847 694960    -81.46    41.12
## 3 -0.04151 157833    -84.15    31.59
## 4 -0.01673 825875    -73.82    42.78
## 5 -0.02411 729649   -106.61    35.10
## 6     -Inf 145035    -92.47    31.32

calhpi <- na.omit(maxhpi[maxhpi$state == "CA", ])
for (i in 1:nrow(calhpi)) {
    if (calhpi$latitude[i] >= 36.14) 
        calhpi$n_or_s[i] <- "n" else calhpi$n_or_s[i] <- "s"
}



for (i in 1:nrow(calhpi)) {
    if (calhpi$latitude[i] <= 36.14 & calhpi$longitude[i] <= -121.768) 
        calhpi$region[i] <- "bay_area" else if (calhpi$latitude[i] <= 35.38) 
        calhpi$region[i] <- "south_cal" else calhpi$region[i] <- "others"
}

LA <- maxhpi[210, ]
LA$latitude <- 33.93
LA$longitude <- -118.4
LA$n_or_s <- "s"
LA$region <- "south_cal"

SanFrancisco <- maxhpi[316, ]
SanFrancisco$latitude <- 37.75
SanFrancisco$longitude <- -122.68
SanFrancisco$n_or_s <- "n"
SanFrancisco$region <- "bay_area"

Oakland <- maxhpi[252, ]
Oakland$latitude <- 37.73
Oakland$longitude <- -122.22
Oakland$n_or_s <- "n"
Oakland$region <- "bay_area"

Irvine <- maxhpi[319, ]
Irvine$latitude <- 33.67
Irvine$longitude <- -117.88
Irvine$n_or_s <- "s"
Irvine$region <- "south_cal"

calhpi <- rbind(calhpi, LA, SanFrancisco, Oakland, Irvine)
calhpi[c(9, 14, 16, 19, 20, 26, 27), 14] <- "bay_area"
calhpi$region <- c("central valley", "central valley", "southern desert", "central valley", 
    "central valley", "central valley", "central valley", "central valley", 
    "bay area", "south cal", "central valley", "so cal", "central valley", "bay area", 
    "south cal", "bay area", "south cal", "south cal", "bay area", "bay area", 
    "central valley", "bay area", "central valley", "central valley", "south cal", 
    "bay area", "bay area", "south cal")
head(calhpi)

##      cbsa              city state   hpi time  change   rate per_change
## 24  12540       Bakersfield    CA 280.3 2007 -100.68 -44.75    -0.3591
## 71  17020             Chico    CA 271.7 2006  -47.55 -17.29    -0.1750
## 106 20940         El Centro    CA 258.7 2007 -102.20 -51.10    -0.3951
## 130 23420            Fresno    CA 285.6 2007  -92.45 -36.98    -0.3236
## 148 25260  Hanford-Corcoran    CA 249.7 2007  -61.81 -30.91    -0.2475
## 215 31460 Madera-Chowchilla    CA 280.9 2007  -95.39 -42.40    -0.3395
##     per_rate  pop2k longitude latitude n_or_s          region
## 24  -0.15962 661645    -118.9    35.37      s  central valley
## 71  -0.06364 203171    -121.7    39.67      n  central valley
## 106 -0.19753 142361    -115.5    32.85      s southern desert
## 130 -0.12946 799407    -119.8    36.74      n  central valley
## 148 -0.12376 129461    -119.7    36.24      n  central valley
## 215 -0.15091 123109    -120.0    37.05      n  central valley

CalByRegion <- qplot(data = calhpi, hpi, per_rate, colour = region, geom = "text", 
    label = state, main = "California Max HPI by region vs. outcome", ylab = "Rate of change per year (as percentage of max HPI)", 
    xlab = "Maximum HPI (2005-2009)")
CalByRegion

plot of chunk unnamed-chunk-2

calhpi2 <- hpi[hpi$state == "CA" | hpi$state == "CA  MSAD", ]
names(calhpi2)[3] <- "cbsa"
toget <- calhpi[, c(1, 14)]
calhpi2 <- merge(calhpi2, toget, by = "cbsa")
head(calhpi2)

##    cbsa        city state year quarter   hpi error time         region
## 1 12540 Bakersfield    CA 2000       1 104.2  0.96 2000 central valley
## 2 12540 Bakersfield    CA 2000       2 104.9  0.94 2000 central valley
## 3 12540 Bakersfield    CA 2000       3 107.2  0.94 2001 central valley
## 4 12540 Bakersfield    CA 2000       4 109.4  0.96 2001 central valley
## 5 12540 Bakersfield    CA 2001       1 111.8  0.95 2001 central valley
## 6 12540 Bakersfield    CA 2001       2 113.9  0.95 2002 central valley

CalHpiByRegion <- qplot(data = calhpi2, time, hpi, geom = "line", group = cbsa, 
    colour = region, main = "California HPI by region on time")
CalHpiByRegion

plot of chunk unnamed-chunk-2

calhpi2 <- merge(calhpi2, pop, by = "cbsa", all.x = T)
head(calhpi2)

##    cbsa        city state year quarter   hpi error time         region
## 1 12540 Bakersfield    CA 2005       1 213.6  1.78 2005 central valley
## 2 12540 Bakersfield    CA 2002       4 130.1  1.05 2003 central valley
## 3 12540 Bakersfield    CA 2005       2 227.6  1.88 2006 central valley
## 4 12540 Bakersfield    CA 2003       2 138.0  1.11 2004 central valley
## 5 12540 Bakersfield    CA 2001       3 115.5  0.96 2002 central valley
## 6 12540 Bakersfield    CA 2004       4 200.3  1.66 2005 central valley
##    pop2k
## 1 661645
## 2 661645
## 3 661645
## 4 661645
## 5 661645
## 6 661645

qplot(data = calhpi2, time, hpi, geom = "line", group = cbsa, colour = pop2k, 
    main = "California HPI by region based on the population")

plot of chunk unnamed-chunk-2