Lab 04 - Predicting MHV Change

This lab is designed to help you build your baseline model of neighborhood change before adding the policy variables in the next lab.

set.seed( 1234 )
# load necessary functions and objects ----
# note: all of these are R objects that will be used throughout this .rmd file
import::here("S_TYPE",
             "panel.cor",
             "panel.smooth",
             "jplot",
             "d",
             "df",
             "cbsa_stats_df",
             # notice the use of here::here() that points to the .R file
             # where all these R objects are created
             .from = here::here("labs/wk04/lab_04_source.R"),
             .character_only = TRUE)

Part I

Similar to your previous lab, create a data set that includes the 2000 and 2010 census variables, and drop all rural census tracts.

Create a variable that measures the growth of median home value from 2000 to 2010.

Omit cases that have a median home value less than $10,000 in 2000.

Omit cases with growth rates above 200%.
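
A minimal sketch of these steps (assuming d holds the merged 2000/2010 tract data and the dplyr verbs loaded by the source file):

# drop rural tracts
d <- filter( d, urban == "urban" )

# growth in median home value, 2000 to 2010
# (2000 values are converted to 2010 dollars before comparing)
d$mhv.00 <- d$mhmval00 * 1.28855
d$mhv.10 <- d$mhmval12
d$mhv.growth <- 100 * ( d$mhv.10 - d$mhv.00 ) / d$mhv.00

# omit tracts with a 2000 median home value below $10,000
d$mhv.growth[ d$mhv.00 < 10000 ] <- NA

# omit growth rates above 200%
d$mhv.growth[ d$mhv.growth > 200 ] <- NA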

Data


# Load Crosswalk data

crosswalk <- read.csv( "https://raw.githubusercontent.com/DS4PS/cpp-529-master/master/data/cbsatocountycrosswalk.csv",  stringsAsFactors=F, colClasses="character" )



d1 <- readRDS( here::here( "data/rodeo/LTDB-2000.rds" ) )
d2 <- readRDS( here::here( "data/rodeo/LTDB-2010.rds" ) )
md <- readRDS( here::here( "data/rodeo/LTDB-META-DATA.rds" ) )

# check to make sure we are not losing 
# or gaining observations in the merge
nrow( d1 ) 
## [1] 72693


d1 <- select( d1, - year )
d2 <- select( d2, - year )

d <- merge( d1, d2, by="tractid" )
d <- merge( d, md, by="tractid" )

nrow( d )
## [1] 72693


Filter Rural Tracts

table( d$urban )
## 
## rural urban 
## 12971 59722
d <- filter( d, urban == "urban" )


Identify Common Variables

# find variables that are in both files
compare_dfs <- function( df1, df2 )
{
  # use regular expressions to remove numeric suffixes 
  var.names.1 <- names( df1 )
  var.names.1 <- gsub( "[.][xy]$", "", var.names.1 )
  var.names.1 <- gsub( "[0-9]{2}$", "", var.names.1 )
  
  var.names.2 <- names( df2 )
  var.names.2 <- gsub( "[.][xy]$", "", var.names.2 )
  var.names.2 <- gsub( "[0-9]{2}$", "", var.names.2 )
  
  shared <- intersect( var.names.1, var.names.2 ) %>% sort()
  print( "SHARED VARIABLES:")
  print( shared )
  
  not.shared <- c( setdiff( var.names.1, var.names.2 ),
                   setdiff( var.names.2, var.names.1 ) ) %>% sort()
  
  print( "NOT SHARED:" )
  print( not.shared )
  
  d.vars1 <- data.frame( type="shared", variables=shared, stringsAsFactors=F )
  d.vars2 <- data.frame( type="not shared", variables=not.shared, stringsAsFactors=F )
  dd <- rbind( d.vars1, d.vars2 )
  
  return( dd )
}

vars <- compare_dfs( df1=d1, df2=d2 )
## [1] "SHARED VARIABLES:"
##   [1] "a15asn"  "a15blk"  "a15hsp"  "a15ntv"  "a15wht"  "a18und"  "a60asn" 
##   [8] "a60blk"  "a60hsp"  "a60ntv"  "a60up"   "a60wht"  "a75up"   "ag15up" 
##  [15] "ag18cv"  "ag25up"  "ag5up"   "ageasn"  "ageblk"  "agehsp"  "agentv" 
##  [22] "agewht"  "asian"   "china"   "clf"     "col"     "cuban"   "dapov"  
##  [29] "dbpov"   "dflabf"  "dfmpov"  "dhpov"   "dmulti"  "dnapov"  "dpov"   
##  [36] "dwpov"   "empclf"  "family"  "fb"      "fhh"     "filip"   "flabf"  
##  [43] "geanc"   "gefb"    "h10yrs"  "h30old"  "haw"     "hh"      "hha"    
##  [50] "hhb"     "hhh"     "hhw"     "hinc"    "hinca"   "hincb"   "hinch"  
##  [57] "hincw"   "hisp"    "hs"      "hu"      "incpc"   "india"   "iranc"  
##  [64] "irfb"    "itanc"   "itfb"    "japan"   "korea"   "lep"     "manuf"  
##  [71] "mar"     "mex"     "mhmval"  "mrent"   "multi"   "n10imm"  "n65pov" 
##  [78] "napov"   "nat"     "nbpov"   "nfmpov"  "nhblk"   "nhpov"   "nhwht"  
##  [85] "nnapov"  "npov"    "ntv"     "nwpov"   "ohu"     "olang"   "own"    
##  [92] "pop"     "pr"      "prof"    "rent"    "ruanc"   "rufb"    "scanc"  
##  [99] "scfb"    "semp"    "tractid" "unemp"   "vac"     "vet"     "viet"   
## [106] "wds"    
## [1] "NOT SHARED:"
##  [1] "a65asn"  "a65blk"  "a65hsp"  "a65ntv"  "a65wht"  "cni16u"  "dis"    
##  [8] "hu00sp"  "ohu00sp" "p10imm"  "p10yrs"  "p15asn"  "p15blk"  "p15hsp" 
## [15] "p15ntv"  "p15wht"  "p18und"  "p30old"  "p60up"   "p65asn"  "p65blk" 
## [22] "p65hsp"  "p65ntv"  "p65pov"  "p65wht"  "p75up"   "papov"   "pasian" 
## [29] "pbpov"   "pchina"  "pcol"    "pcuban"  "pfb"     "pfhh"    "pfilip" 
## [36] "pflabf"  "pfmpov"  "pgeanc"  "pgefb"   "phaw"    "phisp"   "phpov"  
## [43] "phs"     "pindia"  "piranc"  "pirfb"   "pitanc"  "pitfb"   "pjapan" 
## [50] "pkorea"  "plep"    "pmanuf"  "pmar"    "pmex"    "pmulti"  "pnapov" 
## [57] "pnat"    "pnhblk"  "pnhwht"  "pntv"    "polang"  "pown"    "ppov"   
## [64] "ppr"     "pprof"   "pruanc"  "prufb"   "pscanc"  "pscfb"   "psemp"  
## [71] "punemp"  "pvac"    "pvet"    "pviet"   "pwds"    "pwpov"
head(vars) %>% pander()
type     variables
-------  ---------
shared   a15asn
shared   a15blk
shared   a15hsp
shared   a15ntv
shared   a15wht
shared   a18und


Create Data Set for Analysis

d.full <- d  # keep a full copy so you don't have to reload the data
d <- d.full  # restore from the full copy whenever you need to reset

d <- select( d, tractid, mhmval00, mhmval12, hinc00, 
             hu00, own00, rent00,  
             empclf00, clf00, unemp00, prof00,  
             dpov00, npov00,
             ag25up00, hs00, col00, 
             pop00.x, nhwht00, nhblk00, hisp00, asian00,
             cbsa, cbsaname )
d <- 
  d %>%
  mutate( p.white = 100 * nhwht00 / pop00.x,
          p.black = 100 * nhblk00 / pop00.x,
          p.hisp = 100 * hisp00 / pop00.x, 
          p.asian = 100 * asian00 / pop00.x,
          p.hs = 100 * (hs00+col00) / ag25up00,
          p.col = 100 * col00 / ag25up00,
          p.prof = 100 * prof00 / empclf00,
          p.unemp = 100 * unemp00 / clf00,
          pov.rate = 100 * npov00 / dpov00 )


stargazer( d, 
           type="text", 
           digits=0,
           summary.stat = c("min", "p25","median","mean","p75","max") )
## 
## ===========================================================
## Statistic  Min  Pctl(25) Median   Mean   Pctl(75)    Max   
## -----------------------------------------------------------
## mhmval00    0    81,600  119,900 144,738 173,894  1,000,001
## mhmval12  9,999 123,200  193,200 246,570 312,000  1,000,001
## hinc00    2,499  33,000  43,825  47,657   58,036   200,001 
## hu00        0    1,102    1,519   1,570   1,999    11,522  
## own00       0     542      902     939    1,289     4,911  
## rent00      0     195      398     516     712      8,544  
## empclf00    0    1,205    1,756   1,820   2,373    10,334  
## clf00       0    1,302    1,865   1,930   2,502    11,251  
## unemp00     0      51      87      110     140      6,405  
## prof00      0     299      539     637     873      6,610  
## dpov00      0    2,671    3,718   3,804   4,871    23,892  
## npov00      0     149      304     452     601      5,515  
## ag25up00    0    1,763    2,451   2,520   3,224    17,974  
## hs00        0     665     1,071   1,155   1,552     8,909  
## col00       0     243      492     665     923      9,313  
## pop00.x     0    2,751    3,802   3,901   4,976    36,206  
## nhwht00     0    1,308    2,514   2,591   3,713    20,619  
## nhblk00     0      41      141     522     527     14,039  
## hisp00      0      55      153     547     533     13,391  
## asian00     0      22      65      189     183      9,491  
## p.white     0      47      78      67       91       100   
## p.black     0      1        4      14       14       100   
## p.hisp      0      2        4      13       15       100   
## p.asian     0      1        2       5       5        95    
## p.hs        0      67      72      72       77       100   
## p.col       0      12      21      26       36       100   
## p.prof      0      23      31      34       43       100   
## p.unemp     0      3        5       6       8        100   
## pov.rate    0      4        9      12       17       100   
## -----------------------------------------------------------

Exploration of Median Home Values

Initial conditions in 2000:

# adjust 2000 home values for inflation:
# the 1.28855 factor converts nominal 2000 dollars to constant 2010 dollars
mhv.00 <- d$mhmval00 * 1.28855  
mhv.10 <- d$mhmval12

mhv.change <- mhv.10 - mhv.00

df <- data.frame( MedianHomeValue2000=mhv.00, 
                  MedianHomeValue2010=mhv.10, 
                  Change.00.to.10=mhv.change )

d$mhv.00 <- d$mhmval00 * 1.28855  
d$mhv.10 <- d$mhmval12

stargazer( df, 
           type="text", 
           digits=0, 
           summary.stat = c("min", "p25","median","mean","p75","max") )
## 
## ==========================================================================
## Statistic              Min     Pctl(25) Median   Mean   Pctl(75)    Max   
## --------------------------------------------------------------------------
## MedianHomeValue2000     0      105,146  154,497 186,502 224,071  1,288,551
## MedianHomeValue2010   9,999    123,200  193,200 246,570 312,000  1,000,001
## Change.00.to.10     -1,228,651  7,187   36,268  60,047   94,881  1,000,001
## --------------------------------------------------------------------------


Compare 2000 to 2010 distributions:

layout.matrix <- matrix( c( 1,3,
                            2,3 ), 
                nrow=2, ncol=2, byrow=T )

layout( mat = layout.matrix,
        heights = c(2,2), # Heights of the two rows
        widths =  c(3,4)) # Widths of the two columns

# layout.show(3)

par( mar=c(4,0,0,2) )

hist( mhv.00/1000, breaks=50, 
      xlim=c(-200,800), yaxt="n", xaxt="n",
      xlab="", cex.lab=1,
      ylab="", main="",
      col="darkslateblue", border="white" )

axis( side=1, at=seq( from=0, to=1000, by=100 ), 
      labels=paste0( "$", seq( from=0, to=1000, by=100 ), "k" ) )

abline( v=seq(0,1000,100), lty=2, col="gray80" )

text( 550, 4000, labels="Median Home \nValue in 2000", 
      col="darkslateblue", cex=1.8 )



hist( mhv.10/1000, breaks=50, 
      xlim=c(-200,800), yaxt="n", xaxt="n",
      xlab="", cex.lab=1,
      ylab="", main="",
      col="darkslateblue", border="white" )

abline( v=seq(0,1000, 100 ), lty=2, col="gray80" )

text( 550, 3500, labels="Median Home \nValue in 2010", 
      col="darkslateblue", cex=1.8 )

axis( side=1, at=seq( from=0, to=1000, by=100 ), 
      labels=paste0( "$", seq( from=0, to=1000, by=100 ), "k" ) )


# data reduction - sample 1,000 observations

df <- data.frame( v00=mhv.00/1000, v10=mhv.10/1000 )
df <- sample_n( df, 1000 )

par( mar=c(4,5,3,2) )

jplot( df$v00, df$v10, 
       lab1="MHV in 1990", lab2="MHV in 2000",
       xlim=c(0,1000), ylim=c(0,1000),
       axes=F )

abline( a=0, b=1, lty=2, col="gray" )
axis( side=1, at=seq( from=0, to=1000, by=200 ), 
      labels=paste0( "$", seq( from=0, to=1000, by=200 ), "k" ) )
axis( side=2, at=seq( from=0, to=1000, by=200 ), 
      labels=paste0( "$", seq( from=0, to=1000, by=200 ), "k" ) )


# small initial values are skewing percentages
#
# an average home value below $10k is really low -
# these must be mostly vacant lots?

# interpretation is hard if there were few or no homes in 2000,
# resulting in an artificially low MHV. I don't trust cases
# that go from homes worth under $10k to regular values
# because that is more likely data error or noise
# than meaningful variance
#
# quick filter to remove all of the problematic obs
# but need to go back and see which cases are problematic


mhv.00[ mhv.00 < 10000 ] <- NA
pct.change <- mhv.change / mhv.00
summary( pct.change )
##     Min.  1st Qu.   Median     Mean  3rd Qu.     Max.     NA's 
## -0.96891  0.05918  0.25402  0.33167  0.49556 60.59261      220
mhv.growth <- mhv.change / mhv.00 * 100
d$mhv.growth <- mhv.change / mhv.00 * 100

# how many cases had increases above 200% (pct.change > 2)?
sum( pct.change > 2, na.rm=T )
## [1] 546
# preview tracts with large increases in home values 
# to see if increases make sense 

d %>% 
  filter( pct.change > 2 ) %>% 
  head()


hist( d$mhmval00, breaks=200, xlim=c(0,500000), 
      col="gray20", border="white",
      axes=F, 
      xlab="MHV (median = $120k)",
      ylab="",
      main="Median Home Value in 2000 (nominal dollars)" )
axis( side=1, at=seq(0,500000,100000), 
      labels=c("$0","$100k","$200k","$300k","$400k","$500k") )
abline( v=median( d$mhmval00, na.rm=T ), col="orange", lwd=3 )


Descriptives

stargazer( d, 
           type="text", 
           digits=0, 
           summary.stat = c("min", "p25","median","mean","p75","max") )
## 
## ============================================================
## Statistic   Min  Pctl(25) Median   Mean   Pctl(75)    Max   
## ------------------------------------------------------------
## mhmval00     0    81,600  119,900 144,738 173,894  1,000,001
## mhmval12   9,999 123,200  193,200 246,570 312,000  1,000,001
## hinc00     2,499  33,000  43,825  47,657   58,036   200,001 
## hu00         0    1,102    1,519   1,570   1,999    11,522  
## own00        0     542      902     939    1,289     4,911  
## rent00       0     195      398     516     712      8,544  
## empclf00     0    1,205    1,756   1,820   2,373    10,334  
## clf00        0    1,302    1,865   1,930   2,502    11,251  
## unemp00      0      51      87      110     140      6,405  
## prof00       0     299      539     637     873      6,610  
## dpov00       0    2,671    3,718   3,804   4,871    23,892  
## npov00       0     149      304     452     601      5,515  
## ag25up00     0    1,763    2,451   2,520   3,224    17,974  
## hs00         0     665     1,071   1,155   1,552     8,909  
## col00        0     243      492     665     923      9,313  
## pop00.x      0    2,751    3,802   3,901   4,976    36,206  
## nhwht00      0    1,308    2,514   2,591   3,713    20,619  
## nhblk00      0      41      141     522     527     14,039  
## hisp00       0      55      153     547     533     13,391  
## asian00      0      22      65      189     183      9,491  
## p.white      0      47      78      67       91       100   
## p.black      0      1        4      14       14       100   
## p.hisp       0      2        4      13       15       100   
## p.asian      0      1        2       5       5        95    
## p.hs         0      67      72      72       77       100   
## p.col        0      12      21      26       36       100   
## p.prof       0      23      31      34       43       100   
## p.unemp      0      3        5       6       8        100   
## pov.rate     0      4        9      12       17       100   
## mhv.00       0   105,146  154,497 186,502 224,071  1,288,551
## mhv.10     9,999 123,200  193,200 246,570 312,000  1,000,001
## mhv.growth  -97     6       25      33       50      6,059  
## ------------------------------------------------------------


Change in MHV 2000 - 2010

If a home worth $10 million increased in value by $100k over ten years it would not be that surprising. If a home worth $50k increased by $100k over the same period that is a growth of 200% and is notable.

The change-in-value variable reports only absolute change; it does not convey whether that change is large or small relative to the starting value in the census tract.
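
The arithmetic behind that example:

100 * 100000 / 10000000   # $100k gain on a $10M home: 1% growth
100 * 100000 / 50000      # $100k gain on a $50k home: 200% growth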

hist( mhv.change/1000, breaks=500, 
      xlim=c(-100,500), yaxt="n", xaxt="n",
      xlab="Thousand of US Dollars (adjusted to 2010)", cex.lab=1.5,
      ylab="", main="Change in Median Home Value 2000 to 2010",
      col="gray20", border="white" )
axis( side=1, at=seq( from=-100, to=500, by=100 ), 
      labels=paste0( "$", seq( from=-100, to=500, by=100 ), "k" ) )
        
mean.x <- mean( mhv.change/1000, na.rm=T )
abline( v=mean.x, col="darkorange", lwd=2, lty=2 )
text( x=200, y=1500, 
      labels=paste0( "Mean = ", dollar( round(1000*mean.x,0)) ), 
      col="darkorange", cex=1.8, pos=3 )
median.x <- median( mhv.change/1000, na.rm=T )
abline( v=median.x, col="dodgerblue", lwd=2, lty=2 )
text( x=200, y=2000, 
      labels=paste0( "Median = ", dollar( round(1000*median.x,0)) ), 
      col="dodgerblue", cex=1.8, pos=3 )


Percent Change in MHV 2000 to 2010

The percent change variable provides some context for growth rates of value in census tracts.

Plot the percent change variable:

hg <-
hist( mhv.growth, breaks=5000, 
      xlim=c(-100,200), yaxt="n", xaxt="n",
      xlab="", cex.main=1.5,
      ylab="", main="Growth in Home Value by Census Tract 2000 to 2010",
      col="gray40", border="white" )
axis( side=1, at=seq( from=-100, to=200, by=50 ), 
      labels=paste0( seq( from=-100, to=200, by=50 ), "%" ) )
ymax <- max( hg$counts )
        
mean.x <- mean( mhv.growth, na.rm=T )
abline( v=mean.x, col="darkorange", lwd=2, lty=2 )
text( x=100, y=(0.5*ymax), 
      labels=paste0( "Mean = ", round(mean.x,0), "%"), 
      col="darkorange", cex=1.8, pos=4 )
median.x <- median( mhv.growth, na.rm=T )
abline( v=median.x, col="dodgerblue", lwd=2, lty=2 )
text( x=100, y=(0.6*ymax), 
      labels=paste0( "Median = ", round(median.x,0), "%"), 
      col="dodgerblue", cex=1.8, pos=4 )


Metro Level Statistics

Both changes within a census tract and changes within a city can affect the price of a home. Since our policy of interest focuses on changes made within a tract (new businesses and new housing created through NMTC and LIHTC), we want to control for things happening at the metro level so that we do not credit tract-level programs for outcomes that are actually driven by broad metro-level trends.

You might want to calculate several metro-level statistics for your model (growth in the city's population, changes in demographics, changes in industry structure and wealth, for example). You can calculate a metro-level statistic by aggregating tract counts up to the metro total (e.g., population) or by averaging rates across census tracts.

# view results
cbsa_stats_df %>% head()
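
As a sketch of how a table like this can be assembled (assuming d contains the cbsa codes and tract-level measures created above; the actual cbsa_stats_df is built in lab_04_source.R):

cbsa_sketch <- 
  d %>%
  group_by( cbsa ) %>%
  summarize( metro.pop00 = sum( pop00.x, na.rm=T ),      # aggregating up: total metro population
             metro.p.unemp = mean( p.unemp, na.rm=T ) )  # averaging across tracts: typical unemployment rate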


Part II

Modeling Change

We are going to build our policy inferences in two stages. This lab will start by modeling changes in median home values that are predicted by tract characteristics in 2000 and changes in the city between 2000 and 2010.

In the following lab we will then add our policy variables - measures of government programs.

Typically you will start with a simple model of the relationship between the policy and the outcome and add controls to improve the model. In some cases, especially when you have a large amount of data like this, it can be helpful to build your models from the ground up, starting with the controls. This ensures you understand all the specification considerations of your model before adding the one variable you care about. If your baseline model does not improve when you add the policy variable, chances are the policy is not generating much impact.
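
A ground-up workflow might look like the following sketch (the policy term is hypothetical and arrives in the next lab):

# baseline: controls only
m.controls <- lm( mhv.growth ~ p.unemp + pov.rate + p.col, data=d )

# next lab: add the (hypothetical) policy variable and compare fit
# m.full <- update( m.controls, . ~ . + policy )
# stargazer( m.controls, m.full, type="text" )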

Variable Selection

Variable selection is the process of picking the variables and their functional form that you will include in the model.

It is one of the most important steps in the process, but also it is an art, not a science.

The goal in evaluation models is to achieve strong model fit while accounting for competing hypotheses (alternative explanations to the assertion that the program or policy caused the change we see in the data).

Types of Measures

Note that we have two types of variables in these models: metrics from 2000 and metrics of change between 2000 and 2010. We have two units of analysis - census tract and metro area. And we have two ways to represent quantitative variables - as counts and as proportions.

Sometimes only one type is meaningful - the population of a census tract is not very meaningful because tracts are designed to hold 2000 to 4000 people, and they split when they grow.

Population density, on the other hand, might be meaningful. The growth of the city between 2000 and 2010 is likely meaningful.

The number of white people in a tract might not be meaningful, but the proportion of white people in a tract (a measure of diversity) likely is.

So pay attention to how you are constructing variables, and whether the way you are presenting a variable (unit of analysis, time period, and count versus proportion) makes sense.

Challenges in Variable Selection

Let’s say we have 10 independent variables to choose from, and we want a simple model with 3 independent variables.

Each variable can be represented at the community (tract) or the metro (cbsa) level (unit of analysis).

And each variable can be measured as counts or proportions.

So each variable turns into 4 potential variables:

  • X as count, tract level
  • X as proportion, tract level
  • X as count, metro level
  • X as proportion, metro level

So 10 variables x 2 units of analysis x 2 metrics translates into 40 distinct variables that we need to choose from.
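
For a single measure such as unemployment, the four versions can be constructed as follows (a sketch using the tract counts unemp00 and clf00 from above):

d.vars <- 
  d %>%
  group_by( cbsa ) %>%
  mutate( unemp.tract.n = unemp00,                        # count, tract level
          unemp.tract.p = 100 * unemp00 / clf00,          # proportion, tract level
          unemp.metro.n = sum( unemp00, na.rm=T ),        # count, metro level
          unemp.metro.p = 100 * sum( unemp00, na.rm=T ) /
                          sum( clf00, na.rm=T ) ) %>%     # proportion, metro level
  ungroup()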

So here is the real challenge: with 40 variables we can construct almost 10,000 unique 3-variable models!

choose( 40, 3 ) %>% format( big.mark="," )
## [1] "9,880"

We actually have close to 100 census variables in our dataset and at least 4 different ways to represent each, so we have more than 10 million possible models to explore.

choose( (4*100), 3 ) %>% format( big.mark="," )
## [1] "10,586,800"

Addressing Skew in Census Data

Returning to our census variables, when we look at the relationships in the correlation matrix we see that much of the data clusters in the lower left-hand corner of the scatterplots, a sign of skew.

# create subset to visualize in correlation matrix 
d2 <- select( d, mhv.growth,  p.prof,  pov.rate, p.unemp )

# reduce data density for visualization
set.seed( 1234 )
d3 <- sample_n( d2, 10000 ) %>% na.omit()

# correlation plots
pairs( d3, upper.panel=panel.cor, lower.panel=panel.smooth )


After applying log transformations, note that the bivariate correlations increase, except for the relationship between MHV growth (mhv.growth) and the poverty rate (pov.rate):

set.seed( 1234 )
d2 <- select( d, mhv.growth, p.prof,  pov.rate, p.unemp )
# recode some vars to remove outliers and skew
d2$mhv.growth[ d2$mhv.growth > 200 ] <- NA
d2$p.unemp <- log10( d2$p.unemp + 1 )
d2$p.prof <- log10( d2$p.prof + 1  )
d2$pov.rate <- log10( d2$pov.rate + 1 )
d4 <- sample_n( d2, 5000 ) %>% na.omit()
pairs( d4, upper.panel=panel.cor, lower.panel=panel.smooth )


The correlation between the transformed versions of median home value growth and poverty rate falls, but is that a bad thing? The non-transformed version shows a statistically significant correlation, but take a look at the scatterplot:

jplot( d3$pov.rate, d3$mhv.growth, ylim=c(-50,100),
      lab1="Poverty Rates", lab2="MHV Growth" )


Multicollinearity

Multicollinearity occurs when two variables in the model are highly correlated. If both are included in the model at the same time they can essentially cancel each other out: standard errors increase and slopes typically shift toward the null. As a result, even if the pair of variables are individually significant with large effect sizes, they will have small slopes and lack statistical significance when included together.

Take this example where we try to include both the poverty rate and unemployment. A sign of multicollinearity is that both coefficients get smaller and the standard errors get bigger when the two variables are included in the same model (model 3 here).

reg.data <- d
reg.data$mhv.growth[ reg.data$mhv.growth > 200 ] <- NA
reg.data$p.unemp <- log10( reg.data$p.unemp + 1 )
reg.data$pov.rate <- log10( reg.data$pov.rate + 1 )
m1 <- lm( mhv.growth ~  pov.rate, data=reg.data )
m2 <- lm( mhv.growth ~  p.unemp, data=reg.data )
m3 <- lm( mhv.growth ~  pov.rate + p.unemp, data=reg.data )
stargazer( m1, m2, m3, 
           type="text", digits=2,
           omit.stat = c("rsq","f") )
## 
## ============================================================================
##                                       Dependent variable:                   
##                     --------------------------------------------------------
##                                            mhv.growth                       
##                            (1)                (2)                (3)        
## ----------------------------------------------------------------------------
## pov.rate                 12.93***                              9.67***      
##                           (0.42)                                (0.61)      
##                                                                             
## p.unemp                                     15.57***           6.04***      
##                                              (0.56)             (0.83)      
##                                                                             
## Constant                 16.69***           17.43***           15.24***     
##                           (0.44)             (0.46)             (0.48)      
##                                                                             
## ----------------------------------------------------------------------------
## Observations              58,798             58,801             58,791      
## Adjusted R2                0.02               0.01               0.02       
## Residual Std. Error 34.89 (df = 58796) 34.94 (df = 58799) 34.87 (df = 58788)
## ============================================================================
## Note:                                            *p<0.1; **p<0.05; ***p<0.01
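
A standard diagnostic for multicollinearity is the variance inflation factor (VIF), which measures how much of each predictor's variance is explained by the other predictors. A quick check, assuming the car package is installed:

library( car )
vif( m3 )   # values well above 1 (rules of thumb: 5 or 10) flag problematic collinearity

# the underlying correlation driving the VIF
cor( reg.data$pov.rate, reg.data$p.unemp, use="pairwise.complete.obs" )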


Group Structure

When building models it is important to think about whether there are groups in the data that would influence the outcome. You will cover this topic in the lecture on Fixed Effect Models in CPP 525.

In this domain we know that neighborhoods belong to cities, and cities each have distinctive histories, political structures, economies, geographies and climates. Some cities were thriving over the study period, experiencing population growth and strong economic expansion (relatively speaking; all were impacted by the 2001 and 2008 economic crises). Others were in decline, losing population and experiencing shrinking economies.

These are broad city-level processes that unfold over decades. No individual census tract within a city will be isolated from these trends occurring metro-wide. As a result we would expect the overall health of the city to impact the home values in each tract.

d5 <- filter( d, cbsaname %in% 
                c("Atlanta-Sandy Springs-Marietta, GA",
                  "Minneapolis-St. Paul-Bloomington, MN-WI",
                  "San Francisco-San Mateo-Redwood City,CA") )
# note: factor labels must follow the alphabetical order of the levels
d5$cbsaname <- factor( d5$cbsaname, labels=c("ATL-GA","MSP-MN","SF-CA") )
par( mar=c(4,6,4,6), mfrow=c(1,2) )
plot( d5$cbsaname,  d5$mhv.00, las=1, frame.plot=F, outline=F,
      xlab="", ylab="", main="Home Values in 2000" )
abline( h=seq(0,1200000,100000), lty=3, col=gray(0.5,0.3) )
axis( side=4, las=1 )
plot( d5$cbsaname,  d5$p.unemp, las=1, frame.plot=F, outline=F,
      xlab="", ylab="", main="Unemployment Rates in 2000" )
abline( h=seq(0,15,1), lty=3, col=gray(0.5,0.3) )
axis( side=4, las=1 )


We know that the health of the city in 2000 and the changes that occur at the metro level between 2000 and 2010 will impact all census tracts in the city. We can see the metro-level clustering in a plot:

d5 <- filter( d, cbsaname %in%
                c("Atlanta-Sandy Springs-Marietta, GA",
                  "Youngstown-Warren-Boardman, OH-PA",
                  "Syracuse, NY") )
d5$mhv.growth[ d5$mhv.growth > 200 ] <- NA
d5$p.unemp <- log10( d5$p.unemp + 1 )
x <- rnorm( nrow(d5), 0, 0.1 ) +
     as.numeric( d5$cbsaname == "Atlanta-Sandy Springs-Marietta, GA" ) + 
     2 * as.numeric( d5$cbsaname == "Youngstown-Warren-Boardman, OH-PA" ) + 
     3* as.numeric( d5$cbsaname == "Syracuse, NY" ) 
par( mfrow=c(1,2) )
plot( x, d5$mhv.growth, 
      pch=19, cex=1.5, bty = "n",  
        col=factor(d5$cbsa),
      ylim=c(-50,50),
      xaxt="n", 
      ylab="", xlab="",
      main="MHV Growth")
axis( side=1, at=1:3, labels=c("Atlanta","Youngstown","Syracuse"), 
      tick=F, col.axis="gray60", cex.axis=1.3 )
plot( x, d5$p.unemp, 
      pch=19, cex=1.5, bty = "n",  
        col=factor(d5$cbsa),
      # ylim=c(0,40),
      xaxt="n", 
      ylab="", xlab="",
      main="Unemployment (logged)")
axis( side=1, at=1:3, labels=c("Atlanta","Youngstown","Syracuse"), 
      tick=F, col.axis="gray60", cex.axis=1.3 )


Giving each city its own intercept in a model then yields a different baseline metro-level home value growth rate for each:

# note: with these three metros, factor(cbsaname) levels sort
# alphabetically as Atlanta, Syracuse, Youngstown
m <- lm( mhv.growth ~ factor(cbsaname) + p.unemp - 1, data=d5 )
b0.atlanta    <- m$coefficients[1] 
b0.syracuse   <- m$coefficients[2] 
b0.youngstown <- m$coefficients[3] 
b1            <- m$coefficients[4] 
palette( c( "steelblue", "green3", "darkorange"  ) )
palette( adjustcolor( palette(), alpha.f = 0.3 ) )
plot( d5$p.unemp, d5$mhv.growth,
        pch=19, cex=1.5, bty = "n",  
        col=factor(d5$cbsa),
      ylim=c(-50,50),
      xlab="Unemployment Rate (logged)",
      ylab="Median Home Value Growth 2000-2010")
          
abline( b0.atlanta, b1, col="steelblue", lwd=3 )
abline( b0.syracuse, b1, col="green3", lwd=3 )
abline( b0.youngstown, b1, col="darkorange", lwd=3 )


We account for context by adding a metro-level fixed effect to the model. Alternatively, we can add the typical (median) home value growth of the metro area as a control variable:

d.reg <- d
d.reg$mhv.growth[ d.reg$mhv.growth > 200 ] <- NA
d.reg$p.unemp <- log10( d.reg$p.unemp + 1 )
# median MHV growth across the tracts in each metro (note the additional scaling by 100)
d.reg <- 
  d.reg %>%
  group_by( cbsaname ) %>%
  mutate( metro.mhv.growth = 100 * median( mhv.growth, na.rm=T ) ) %>%
  ungroup() 
m1 <- lm( mhv.growth ~ p.unemp, data=d.reg )
m2 <- lm( mhv.growth ~ p.unemp + metro.mhv.growth, data=d.reg )
m3 <- lm( mhv.growth ~ p.unemp + cbsa, data=d.reg )
stargazer( m1, m2, m3, 
           type="text", digits=2,
           omit.stat = c("rsq","f"),
           omit="cbsa",
           add.lines = list(c("Metro Fixed Effects:", "NO", "NO","YES")) )
## 
## =============================================================================
##                                        Dependent variable:                   
##                      --------------------------------------------------------
##                                             mhv.growth                       
##                             (1)                (2)                (3)        
## -----------------------------------------------------------------------------
## p.unemp                   15.57***           8.60***            10.02***     
##                            (0.56)             (0.44)             (0.47)      
##                                                                              
## metro.mhv.growth                             0.01***                         
##                                              (0.0000)                        
##                                                                              
## Constant                  17.43***           -2.73***           16.91***     
##                            (0.46)             (0.37)             (4.05)      
##                                                                              
## -----------------------------------------------------------------------------
## Metro Fixed Effects:         NO                 NO                YES        
## Observations               58,801             58,801             58,801      
## Adjusted R2                 0.01               0.41               0.41       
## Residual Std. Error  34.94 (df = 58799) 27.04 (df = 58798) 26.99 (df = 58421)
## =============================================================================
## Note:                                             *p<0.1; **p<0.05; ***p<0.01