This lab is designed to help you build your baseline model of neighborhood change before adding the policy variables in the next lab.
set.seed( 1234 )
# load necessary functions and objects ----
# note: all of these are R objects that will be used throughout this .rmd file
import::here("S_TYPE",
"panel.cor",
"panel.smooth",
"jplot",
"d",
"df",
"cbsa_stats_df",
# notice the use of here::here() that points to the .R file
# where all these R objects are created
.from = here::here("labs/wk04/lab_04_source.R"),
.character_only = TRUE)
Similar to your previous lab, create a data set that includes the 2000 and 2010 census variables and drop all rural census tracts.
Create a variable that measures the growth of median home value from 2000 to 2010.
Omit cases that have a median home value less than $10,000 in 2000.
Omit cases with growth rates above 200%.
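Concretely, those steps look something like this sketch, assuming dplyr is loaded and using the merged data frame d that is assembled below (the 1.28855 inflation multiplier matches the one used later in this lab):
# adjust 2000 values to 2010 dollars, then compute percent growth
d <- mutate( d,
             mhv.00 = 1.28855 * mhmval00,
             mhv.10 = mhmval12,
             mhv.growth = 100 * ( mhv.10 - mhv.00 ) / mhv.00 )
# drop unreliable and extreme cases
d$mhv.growth[ d$mhv.00 < 10000 ] <- NA   # median home value under $10k in 2000
d$mhv.growth[ d$mhv.growth > 200 ] <- NA # growth above 200%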
# Load Crosswalk data
crosswalk <- read.csv( "https://raw.githubusercontent.com/DS4PS/cpp-529-master/master/data/cbsatocountycrosswalk.csv", stringsAsFactors=F, colClasses="character" )
d1 <- readRDS( here::here( "data/rodeo/LTDB-2000.rds" ) )
d2 <- readRDS( here::here( "data/rodeo/LTDB-2010.rds" ) )
md <- readRDS( here::here( "data/rodeo/LTDB-META-DATA.rds" ) )
# check to make sure we are not losing
# or gaining observations in the merge
nrow( d1 )
## [1] 72693
d1 <- select( d1, - year )
d2 <- select( d2, - year )
d <- merge( d1, d2, by="tractid" )
d <- merge( d, md, by="tractid" )
nrow( d )
## [1] 72693
table( d$urban )
##
## rural urban
## 12971 59722
d <- filter( d, urban == "urban" )
# find variables that are in both files
compare_dfs <- function( df1, df2 )
{
# use regular expressions to remove numeric suffixes
var.names.1 <- names( df1 )
var.names.1 <- gsub( "[.][xy]$", "", var.names.1 )
var.names.1 <- gsub( "[0-9]{2}$", "", var.names.1 )
var.names.2 <- names( df2 )
var.names.2 <- gsub( "[.][xy]$", "", var.names.2 )
var.names.2 <- gsub( "[0-9]{2}$", "", var.names.2 )
shared <- intersect( var.names.1, var.names.2 ) %>% sort()
print( "SHARED VARIABLES:")
print( shared )
not.shared <- c( setdiff( var.names.1, var.names.2 ),
setdiff( var.names.2, var.names.1 ) ) %>% sort()
print( "NOT SHARED:" )
print( not.shared )
d.vars1 <- data.frame( type="shared", variables=shared, stringsAsFactors=F )
d.vars2 <- data.frame( type="not shared", variables=not.shared, stringsAsFactors=F )
dd <- rbind( d.vars1, d.vars2 )
return( dd )
}
vars <- compare_dfs( df1=d1, df2=d2 )
## [1] "SHARED VARIABLES:"
## [1] "a15asn" "a15blk" "a15hsp" "a15ntv" "a15wht" "a18und" "a60asn"
## [8] "a60blk" "a60hsp" "a60ntv" "a60up" "a60wht" "a75up" "ag15up"
## [15] "ag18cv" "ag25up" "ag5up" "ageasn" "ageblk" "agehsp" "agentv"
## [22] "agewht" "asian" "china" "clf" "col" "cuban" "dapov"
## [29] "dbpov" "dflabf" "dfmpov" "dhpov" "dmulti" "dnapov" "dpov"
## [36] "dwpov" "empclf" "family" "fb" "fhh" "filip" "flabf"
## [43] "geanc" "gefb" "h10yrs" "h30old" "haw" "hh" "hha"
## [50] "hhb" "hhh" "hhw" "hinc" "hinca" "hincb" "hinch"
## [57] "hincw" "hisp" "hs" "hu" "incpc" "india" "iranc"
## [64] "irfb" "itanc" "itfb" "japan" "korea" "lep" "manuf"
## [71] "mar" "mex" "mhmval" "mrent" "multi" "n10imm" "n65pov"
## [78] "napov" "nat" "nbpov" "nfmpov" "nhblk" "nhpov" "nhwht"
## [85] "nnapov" "npov" "ntv" "nwpov" "ohu" "olang" "own"
## [92] "pop" "pr" "prof" "rent" "ruanc" "rufb" "scanc"
## [99] "scfb" "semp" "tractid" "unemp" "vac" "vet" "viet"
## [106] "wds"
## [1] "NOT SHARED:"
## [1] "a65asn" "a65blk" "a65hsp" "a65ntv" "a65wht" "cni16u" "dis"
## [8] "hu00sp" "ohu00sp" "p10imm" "p10yrs" "p15asn" "p15blk" "p15hsp"
## [15] "p15ntv" "p15wht" "p18und" "p30old" "p60up" "p65asn" "p65blk"
## [22] "p65hsp" "p65ntv" "p65pov" "p65wht" "p75up" "papov" "pasian"
## [29] "pbpov" "pchina" "pcol" "pcuban" "pfb" "pfhh" "pfilip"
## [36] "pflabf" "pfmpov" "pgeanc" "pgefb" "phaw" "phisp" "phpov"
## [43] "phs" "pindia" "piranc" "pirfb" "pitanc" "pitfb" "pjapan"
## [50] "pkorea" "plep" "pmanuf" "pmar" "pmex" "pmulti" "pnapov"
## [57] "pnat" "pnhblk" "pnhwht" "pntv" "polang" "pown" "ppov"
## [64] "ppr" "pprof" "pruanc" "prufb" "pscanc" "pscfb" "psemp"
## [71] "punemp" "pvac" "pvet" "pviet" "pwds" "pwpov"
head(vars) %>% pander()
| type | variables |
|---|---|
| shared | a15asn |
| shared | a15blk |
| shared | a15hsp |
| shared | a15ntv |
| shared | a15wht |
| shared | a18und |
d.full <- d  # keep a full copy so we don't have to reload the data
d <- d.full  # restore the original copy if you need to reset anything
d <- select( d, tractid, mhmval00, mhmval12, hinc00,
hu00, own00, rent00,
empclf00, clf00, unemp00, prof00,
dpov00, npov00,
ag25up00, hs00, col00,
pop00.x, nhwht00, nhblk00, hisp00, asian00,
cbsa, cbsaname )
d <-
d %>%
mutate( p.white = 100 * nhwht00 / pop00.x,
p.black = 100 * nhblk00 / pop00.x,
p.hisp = 100 * hisp00 / pop00.x,
p.asian = 100 * asian00 / pop00.x,
p.hs = 100 * (hs00+col00) / ag25up00,
p.col = 100 * col00 / ag25up00,
p.prof = 100 * prof00 / empclf00,
p.unemp = 100 * unemp00 / clf00,
pov.rate = 100 * npov00 / dpov00 )
stargazer( d,
type="text",
digits=0,
summary.stat = c("min", "p25","median","mean","p75","max") )
##
## ===========================================================
## Statistic Min Pctl(25) Median Mean Pctl(75) Max
## -----------------------------------------------------------
## mhmval00 0 81,600 119,900 144,738 173,894 1,000,001
## mhmval12 9,999 123,200 193,200 246,570 312,000 1,000,001
## hinc00 2,499 33,000 43,825 47,657 58,036 200,001
## hu00 0 1,102 1,519 1,570 1,999 11,522
## own00 0 542 902 939 1,289 4,911
## rent00 0 195 398 516 712 8,544
## empclf00 0 1,205 1,756 1,820 2,373 10,334
## clf00 0 1,302 1,865 1,930 2,502 11,251
## unemp00 0 51 87 110 140 6,405
## prof00 0 299 539 637 873 6,610
## dpov00 0 2,671 3,718 3,804 4,871 23,892
## npov00 0 149 304 452 601 5,515
## ag25up00 0 1,763 2,451 2,520 3,224 17,974
## hs00 0 665 1,071 1,155 1,552 8,909
## col00 0 243 492 665 923 9,313
## pop00.x 0 2,751 3,802 3,901 4,976 36,206
## nhwht00 0 1,308 2,514 2,591 3,713 20,619
## nhblk00 0 41 141 522 527 14,039
## hisp00 0 55 153 547 533 13,391
## asian00 0 22 65 189 183 9,491
## p.white 0 47 78 67 91 100
## p.black 0 1 4 14 14 100
## p.hisp 0 2 4 13 15 100
## p.asian 0 1 2 5 5 95
## p.hs 0 67 72 72 77 100
## p.col 0 12 21 26 36 100
## p.prof 0 23 31 34 43 100
## p.unemp 0 3 5 6 8 100
## pov.rate 0 4 9 12 17 100
## -----------------------------------------------------------
Initial conditions in 2000:
# adjust 2000 home values for inflation
mhv.00 <- d$mhmval00 * 1.28855
mhv.10 <- d$mhmval12
mhv.change <- mhv.10 - mhv.00
df <- data.frame( MedianHomeValue2000=mhv.00,
MedianHomeValue2010=mhv.10,
Change.00.to.10=mhv.change )
d$mhv.00 <- d$mhmval00 * 1.28855
d$mhv.10 <- d$mhmval12
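# note: 1.28855 is the inflation multiplier used throughout this lab to
# express 2000 dollars in 2010 dollars (presumably derived from the CPI);
# verify against BLS CPI data before reusing it elsewhere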
stargazer( df,
type="text",
digits=0,
summary.stat = c("min", "p25","median","mean","p75","max") )
##
## ==========================================================================
## Statistic Min Pctl(25) Median Mean Pctl(75) Max
## --------------------------------------------------------------------------
## MedianHomeValue2000 0 105,146 154,497 186,502 224,071 1,288,551
## MedianHomeValue2010 9,999 123,200 193,200 246,570 312,000 1,000,001
## Change.00.to.10 -1,228,651 7,187 36,268 60,047 94,881 1,000,001
## --------------------------------------------------------------------------
Compare 2000 to 2010 distributions:
layout.matrix <- matrix( c( 1,3,
2,3 ),
nrow=2, ncol=2, byrow=T )
layout( mat = layout.matrix,
heights = c(2,2), # Heights of the two rows
widths = c(3,4)) # Widths of the two columns
# layout.show(3)
par( mar=c(4,0,0,2) )
hist( mhv.00/1000, breaks=50,
xlim=c(-200,800), yaxt="n", xaxt="n",
xlab="", cex.lab=1,
ylab="", main="",
col="darkslateblue", border="white" )
axis( side=1, at=seq( from=0, to=1000, by=100 ),
labels=paste0( "$", seq( from=0, to=1000, by=100 ), "k" ) )
abline( v=seq(0,1000,100), lty=2, col="gray80" )
text( 550, 4000, labels="Median Home \nValue in 2000",
col="darkslateblue", cex=1.8 )
hist( mhv.10/1000, breaks=50,
xlim=c(-200,800), yaxt="n", xaxt="n",
xlab="", cex.lab=1,
ylab="", main="",
col="darkslateblue", border="white" )
abline( v=seq(0,1000, 100 ), lty=2, col="gray80" )
text( 550, 3500, labels="Median Home \nValue in 2010",
col="darkslateblue", cex=1.8 )
axis( side=1, at=seq( from=0, to=1000, by=100 ),
labels=paste0( "$", seq( from=0, to=1000, by=100 ), "k" ) )
# data reduction - sample 1,000 observations for the scatterplot
df <- data.frame( v00=mhv.00/1000, v10=mhv.10/1000 )
df <- sample_n( df, 1000 )
par( mar=c(4,5,3,2) )
jplot( df$v00, df$v10,
lab1="MHV in 1990", lab2="MHV in 2000",
xlim=c(0,1000), ylim=c(0,1000),
axes=F )
abline( a=0, b=1, lty=2, col="gray" )
axis( side=1, at=seq( from=0, to=1000, by=200 ),
labels=paste0( "$", seq( from=0, to=1000, by=200 ), "k" ) )
axis( side=2, at=seq( from=0, to=1000, by=200 ),
labels=paste0( "$", seq( from=0, to=1000, by=200 ), "k" ) )
# small initial values are skewing percentages
#
# an average home value below $10k is really low -
# these must be mostly vacant lots?
# interpretation is hard if there were no homes in 2000
# and thus an artificially low MHV. i don't trust cases
# that go from homes worth $10k to regular value
# because it is more likely errors in data or noise
# than meaningful variance
#
# quick filter to remove all of the problematic obs
# but need to go back and see which cases are problematic
mhv.00[ mhv.00 < 10000 ] <- NA
pct.change <- mhv.change / mhv.00
summary( pct.change )
##     Min.  1st Qu.   Median     Mean  3rd Qu.     Max.     NA's
## -0.96891  0.05918  0.25402  0.33167  0.49556 60.59261      220
mhv.growth <- mhv.change / mhv.00 * 100
d$mhv.growth <- mhv.change / mhv.00 * 100
# how many cases had growth rates above 200%?
sum( pct.change > 2, na.rm=T )
## [1] 546
# preview tracts with large increases in home values
# to see if increases make sense
d %>%
filter( mhv.growth > 200 ) %>%
head()
hist( d$mhmval00, breaks=200, xlim=c(0,500000),
col="gray20", border="white",
axes=F,
xlab="MHV (median = $138k)",
ylab="",
main="Median Home Value in 2000 (2010 US dollars)" )
axis( side=1, at=seq(0,500000,100000),
labels=c("$0","$100k","$200k","$300k","$400k","$500k") )
abline( v=median( d$mhmval00, na.rm=T ), col="orange", lwd=3 )
Descriptives
stargazer( d,
type="text",
digits=0,
summary.stat = c("min", "p25","median","mean","p75","max") )
##
## ============================================================
## Statistic Min Pctl(25) Median Mean Pctl(75) Max
## ------------------------------------------------------------
## mhmval00 0 81,600 119,900 144,738 173,894 1,000,001
## mhmval12 9,999 123,200 193,200 246,570 312,000 1,000,001
## hinc00 2,499 33,000 43,825 47,657 58,036 200,001
## hu00 0 1,102 1,519 1,570 1,999 11,522
## own00 0 542 902 939 1,289 4,911
## rent00 0 195 398 516 712 8,544
## empclf00 0 1,205 1,756 1,820 2,373 10,334
## clf00 0 1,302 1,865 1,930 2,502 11,251
## unemp00 0 51 87 110 140 6,405
## prof00 0 299 539 637 873 6,610
## dpov00 0 2,671 3,718 3,804 4,871 23,892
## npov00 0 149 304 452 601 5,515
## ag25up00 0 1,763 2,451 2,520 3,224 17,974
## hs00 0 665 1,071 1,155 1,552 8,909
## col00 0 243 492 665 923 9,313
## pop00.x 0 2,751 3,802 3,901 4,976 36,206
## nhwht00 0 1,308 2,514 2,591 3,713 20,619
## nhblk00 0 41 141 522 527 14,039
## hisp00 0 55 153 547 533 13,391
## asian00 0 22 65 189 183 9,491
## p.white 0 47 78 67 91 100
## p.black 0 1 4 14 14 100
## p.hisp 0 2 4 13 15 100
## p.asian 0 1 2 5 5 95
## p.hs 0 67 72 72 77 100
## p.col 0 12 21 26 36 100
## p.prof 0 23 31 34 43 100
## p.unemp 0 3 5 6 8 100
## pov.rate 0 4 9 12 17 100
## mhv.00 0 105,146 154,497 186,502 224,071 1,288,551
## mhv.10 9,999 123,200 193,200 246,570 312,000 1,000,001
## mhv.growth -97 6 25 33 50 6,059
## ------------------------------------------------------------
If a home worth $10 million increased in value by $100k over ten years it would not be that surprising. If a home worth $50k increased by $100k over the same period, that is growth of 200% and is notable.
The change-in-value variable reports only absolute change; it gives no sense of whether that change is large or small relative to the census tract's starting point.
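To make that concrete, here is the arithmetic for the two hypothetical homes above:
# identical absolute change, very different relative change
100000 / 10000000 * 100  # $10M home: 1% growth
100000 / 50000 * 100     # $50k home: 200% growth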
hist( mhv.change/1000, breaks=500,
xlim=c(-100,500), yaxt="n", xaxt="n",
xlab="Thousand of US Dollars (adjusted to 2010)", cex.lab=1.5,
ylab="", main="Change in Median Home Value 2000 to 2010",
col="gray20", border="white" )
axis( side=1, at=seq( from=-100, to=500, by=100 ),
labels=paste0( "$", seq( from=-100, to=500, by=100 ), "k" ) )
mean.x <- mean( mhv.change/1000, na.rm=T )
abline( v=mean.x, col="darkorange", lwd=2, lty=2 )
text( x=200, y=1500,
labels=paste0( "Mean = ", dollar( round(1000*mean.x,0)) ),
col="darkorange", cex=1.8, pos=3 )
median.x <- median( mhv.change/1000, na.rm=T )
abline( v=median.x, col="dodgerblue", lwd=2, lty=2 )
text( x=200, y=2000,
labels=paste0( "Median = ", dollar( round(1000*median.x,0)) ),
col="dodgerblue", cex=1.8, pos=3 )
The percent change variable provides some context for growth rates of value in census tracts.
Plot the percent change variable:
hg <-
hist( mhv.growth, breaks=5000,
xlim=c(-100,200), yaxt="n", xaxt="n",
xlab="", cex.main=1.5,
ylab="", main="Growth in Home Value by Census Tract 2000 to 2010",
col="gray40", border="white" )
axis( side=1, at=seq( from=-100, to=200, by=50 ),
labels=paste0( seq( from=-100, to=200, by=50 ), "%" ) )
ymax <- max( hg$count )
mean.x <- mean( mhv.growth, na.rm=T )
abline( v=mean.x, col="darkorange", lwd=2, lty=2 )
text( x=100, y=(0.5*ymax),
labels=paste0( "Mean = ", round(mean.x,0), "%"),
col="darkorange", cex=1.8, pos=4 )
median.x <- median( mhv.growth, na.rm=T )
abline( v=median.x, col="dodgerblue", lwd=2, lty=2 )
text( x=100, y=(0.6*ymax),
labels=paste0( "Median = ", round(median.x,0), "%"),
col="dodgerblue", cex=1.8, pos=4 )
Both changes within a census tract and changes within a city can affect the price of a home. Since our policy of interest focuses on changes made within a tract (new business and new housing created through NMTC and LIHTC), we want to control for things happening at the metro level to ensure we are not attributing outcomes to programs that target the tract when broad metro-level trends are actually responsible.
You might want to calculate several metro-level statistics for your model (growth in population in the city, changes in demographics, changes in industry structure and wealth, for example). You can calculate a metro-level statistic by aggregating up (a population count, for instance) or by averaging across census tracts, as sketched below.
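Here is a minimal sketch of both approaches using dplyr; the metro identifier cbsa, tract population pop00.x, and mhv.growth come from the data steps above, while the new variable names are illustrative. The cbsa_stats_df object imported at the top of this file holds a precomputed set of metro-level statistics.
d.metro <-
  d %>%
  group_by( cbsa ) %>%
  summarize( metro.pop00 = sum( pop00.x, na.rm=TRUE ),            # aggregating up: total metro population
             metro.mhv.growth = mean( mhv.growth, na.rm=TRUE ) )  # averaging across tracts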
# view results
cbsa_stats_df %>% head()
We are going to build our policy inferences in two stages. This lab starts by modeling changes in median home values as predicted by tract characteristics in 2000 and changes in the city between 2000 and 2010.
In the following lab we will then add our policy variables - measures of government programs.
Typically you will start with a simple model of the relationship between the policy and the outcome and add controls to improve the model. In some cases, especially when you have a large amount of data like this, it can be helpful to build your models from the ground up, starting with the controls. This ensures you understand all of the specification considerations before adding the one variable you care about. If your baseline model does not improve when you add the policy variable, chances are the policy is not generating much impact.
Variable selection is the process of picking the variables and their functional form that you will include in the model.
It is one of the most important steps in the process, but it is also an art, not a science.
The goal in evaluation models is to achieve strong model fit while accounting for competing hypotheses (alternative explanations to the assertion that the program or policy caused the change we see in the data).
Note that we have two types of variables in these models: metrics from 2000 and metrics of change between 2000 and 2010. We have two units of analysis - census tract and metro area. And we have two ways to represent quantitative variables - as counts and as proportions.
Sometimes only one type is meaningful - the population of a census tract is not very meaningful because tracts are designed to hold 2000 to 4000 people, and they split when they grow.
Population density, on the other hand, might be meaningful. The growth of the city between 2000 and 2010 is likely meaningful.
The number of white people in a tract might not be meaningful, but the proportion of white people in a tract (a measure of diversity) likely is.
So pay attention to how you are constructing variables, and whether the way you are presenting a variable (unit of analysis, time period, and count versus proportion) makes sense.
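As a sketch of what those choices look like in code (assuming dplyr and the columns nhwht00, pop00.x, and cbsa from the steps above; the new variable names are illustrative), the same underlying measure can be expressed at either unit of analysis and as either a count or a proportion:
d.forms <-
  d %>%
  group_by( cbsa ) %>%
  mutate( metro.nhwht00 = sum( nhwht00, na.rm=TRUE ),    # metro-level count
          metro.pop00 = sum( pop00.x, na.rm=TRUE ) ) %>%
  ungroup() %>%
  mutate( p.white = 100 * nhwht00 / pop00.x,                   # tract-level proportion
          metro.p.white = 100 * metro.nhwht00 / metro.pop00 )  # metro-level proportion
# nhwht00 itself is the tract-level count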
Let’s say we have 10 independent variables to choose from, and we want a simple model with 3 independent variables.
Each variable can be represented at the community (tract) or the metro (cbsa) level (unit of analysis).
And each variable can be measured as counts or proportions.
So each variable turns into four potential variables: a tract-level count, a tract-level proportion, a metro-level count, and a metro-level proportion.
So 10 variables x 2 units of analysis x 2 metrics translates to 40 distinct variables that we need to choose from.
So here is the real challenge: with 40 variables we can construct almost 10,000 unique 3-variable models!
choose( 40, 3 ) %>% format( big.mark="," )
## [1] "9,880"
We actually have close to 100 census variables in our dataset and more than 4 different ways to represent each, so we have more than 10 million possible models to explore.
choose( (4*100), 3 ) %>% format( big.mark="," )
## [1] "10,586,800"
Returning to our census variables, when we look at the relationships between variables in the scatterplots we see that much of the data clusters in the lower left-hand corner, which is a sign of skew.
# create subset to visualize in correlation matrix
d2 <- select( d, mhv.growth, p.prof, pov.rate, p.unemp )
# reduce data density for visualization
set.seed( 1234 )
d3 <- sample_n( d2, 10000 ) %>% na.omit()
# correlation plots
pairs( d3, upper.panel=panel.cor, lower.panel=panel.smooth )
After applying log transformations, note that the bivariate correlations increase, except for the relationship between home value growth (mhv.growth) and vacancy rates (p.vacant):
set.seed( 1234 )
d2 <- select( d, mhv.growth, p.prof, pov.rate, p.unemp )
# recode some vars to remove outliers and skew
d2$mhv.growth[ d2$mhv.growth > 200 ] <- NA
d2$p.unemp <- log10( d2$p.unemp + 1 )
d2$p.prof <- log10( d2$p.prof + 1 )
d2$pov.rate <- log10( d2$pov.rate + 1 )
d4 <- sample_n( d2, 5000 ) %>% na.omit()
pairs( d4, upper.panel=panel.cor, lower.panel=panel.smooth )
The correlation between the transformed versions of median home value growth and the poverty rate falls, but is that a bad thing? The non-transformed version shows a statistically significant correlation, but take a look at the scatterplot:
jplot( d3$pov.rate, d3$mhv.growth, ylim=c(-50,100),
lab1="Poverty Rates", lab2="MHV Growth" )
Multicollinearity occurs when two variables in the model are highly correlated. If both are included in the model at the same time they can essentially cancel each other out: standard errors will increase and slopes typically shift toward the null. As a result, even if the two variables are individually significant and have large effect sizes, they can have small slopes and lack statistical significance when included together.
Take this example where we try to include both the poverty rate and unemployment. A sign of multicollinearity is when both coefficients get smaller and the standard errors get bigger once the two variables are included in the same model (model 3 here).
reg.data <- d
reg.data$mhv.growth[ reg.data$mhv.growth > 200 ] <- NA
reg.data$p.unemp <- log10( reg.data$p.unemp + 1 )
reg.data$pov.rate <- log10( reg.data$pov.rate + 1 )
m1 <- lm( mhv.growth ~ pov.rate, data=reg.data )
m2 <- lm( mhv.growth ~ p.unemp, data=reg.data )
m3 <- lm( mhv.growth ~ pov.rate + p.unemp, data=reg.data )
stargazer( m1, m2, m3,
type="text", digits=2,
omit.stat = c("rsq","f") )
##
## ============================================================================
## Dependent variable:
## --------------------------------------------------------
## mhv.growth
## (1) (2) (3)
## ----------------------------------------------------------------------------
## pov.rate 12.93*** 9.67***
## (0.42) (0.61)
##
## p.unemp 15.57*** 6.04***
## (0.56) (0.83)
##
## Constant 16.69*** 17.43*** 15.24***
## (0.44) (0.46) (0.48)
##
## ----------------------------------------------------------------------------
## Observations 58,798 58,801 58,791
## Adjusted R2 0.02 0.01 0.02
## Residual Std. Error 34.89 (df = 58796) 34.94 (df = 58799) 34.87 (df = 58788)
## ============================================================================
## Note: *p<0.1; **p<0.05; ***p<0.01
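A common numeric check for multicollinearity is the variance inflation factor (VIF), which measures how much a coefficient's variance is inflated by its correlation with the other predictors. A quick sketch, assuming the car package is installed (it is not loaded elsewhere in this lab); rules of thumb vary, but VIF values above roughly 5 to 10 are usually flagged:
library( car )
vif( m3 )  # VIF for pov.rate and p.unemp in the combined model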
When building models it is important to think about whether there are groups in the data that would influence the outcome. You will cover this topic in the lecture on Fixed Effect Models in CPP 525.
In this domain we know that neighborhoods belong to cities, and cities each have distinctive histories, political structures, economies, geographies, and climates. Some cities were thriving over the study period and experienced population growth and strong economic expansion (relatively speaking; all were impacted by the 2001 and 2008 economic crises). Others were in decline, losing population and experiencing shrinking economies.
These are broad city-level processes that unfold over decades. No individual census tract within a city will be isolated from these trends occurring metro-wide. As a result we would expect the overall health of the city to impact the home values in each tract.
d5 <- filter( d, cbsaname %in%
c("Atlanta-Sandy Springs-Marietta, GA",
"Minneapolis-St. Paul-Bloomington, MN-WI",
"San Francisco-San Mateo-Redwood City,CA") )
# factor levels sort alphabetically: Atlanta, Minneapolis, San Francisco
d5$cbsaname <- factor( d5$cbsaname, labels=c("ATL-GA","MSP-MN","SF-CA") )
par( mar=c(4,6,4,6), mfrow=c(1,2) )
plot( d5$cbsaname, d5$mhv.00, las=1, frame.plot=F, outline=F,
xlab="", ylab="", main="Home Values in 2000" )
abline( h=seq(0,1200000,100000), lty=3, col=gray(0.5,0.3) )
axis( side=4, las=1 )
plot( d5$cbsaname, d5$p.unemp, las=1, frame.plot=F, outline=F,
xlab="", ylab="", main="Unemployment Rates in 2000" )
abline( h=seq(0,15,1), lty=3, col=gray(0.5,0.3) )
axis( side=4, las=1 )
We know that the health of the city in 2000 and the changes that occur at the metro level between 2000 and 2010 will impact all census tracts in the city. We can see the metro-level clustering in a plot:
d5 <- filter( d, cbsaname %in%
c("Atlanta-Sandy Springs-Marietta, GA",
"Youngstown-Warren-Boardman, OH-PA",
"Syracuse, NY") )
d5$mhv.growth[ d5$mhv.growth > 200 ] <- NA
d5$p.unemp <- log10( d5$p.unemp + 1 )
x <- rnorm( nrow(d5), 0, 0.1 ) +
as.numeric( d5$cbsaname == "Atlanta-Sandy Springs-Marietta, GA" ) +
2 * as.numeric( d5$cbsaname == "Youngstown-Warren-Boardman, OH-PA" ) +
3 * as.numeric( d5$cbsaname == "Syracuse, NY" )
par( mfrow=c(1,2) )
plot( x, d5$mhv.growth,
pch=19, cex=1.5, bty = "n",
col=factor(d5$cbsa),
ylim=c(-50,50),
xaxt="n",
ylab="", xlab="",
main="MHV Growth")
axis( side=1, at=1:3, labels=c("Atlanta","Youngstown","Syracuse"),
tick=F, col.axis="gray60", cex.axis=1.3 )
plot( x, d5$p.unemp,
pch=19, cex=1.5, bty = "n",
col=factor(d5$cbsa),
# ylim=c(0,40),
xaxt="n",
ylab="", xlab="",
main="Unemployment (logged)")
axis( side=1, at=1:3, labels=c("Atlanta","Youngstown","Syracuse"),
tick=F, col.axis="gray60", cex.axis=1.3 )
This clustering results in different baseline metro-level home value growth rates when each city is given its own intercept in the model.
# d5 <- filter( d, cbsaname %in%
# c("Tyler, TX",
# "Youngstown-Warren-Boardman, OH-PA",
# "Syracuse, NY") )
#
# d5$mhv.growth[ d5$mhv.growth > 200 ] <- NA
# d5$p.unemp <- log10( d5$p.unemp + 1 )
m <- lm( mhv.growth ~ factor(cbsaname) + p.unemp - 1, data=d5 )
# factor levels are alphabetical: Atlanta, Syracuse, Youngstown
b0.atlanta <- m$coefficients[1]
b0.syracuse <- m$coefficients[2]
b0.youngstown <- m$coefficients[3]
b1 <- m$coefficients[4]
palette( c( "steelblue", "green3", "darkorange" ) )
palette( adjustcolor( palette(), alpha.f = 0.3 ) )
plot( d5$p.unemp, d5$mhv.growth,
pch=19, cex=1.5, bty = "n",
col=factor(d5$cbsa),
ylim=c(-50,50),
xlab="Unemployment Rate (logged)",
ylab="Median Home Value Growth 2000-2010")
abline( b0.atlanta, b1, col="steelblue", lwd=3 )
abline( b0.syracuse, b1, col="green3", lwd=3 )
abline( b0.youngstown, b1, col="darkorange", lwd=3 )
We account for context by adding a metro-level fixed effect to the model. Alternatively, we can add the typical (median) home value growth of the metro area as a control variable:
d.reg <- d
d.reg$mhv.growth[ d.reg$mhv.growth > 200 ] <- NA
d.reg$p.unemp <- log10( d.reg$p.unemp + 1 )
# median growth in home value for the metro area
d.reg <-
d.reg %>%
group_by( cbsaname ) %>%
mutate( metro.mhv.growth = median( mhv.growth, na.rm=T ) ) %>% # mhv.growth is already in percent
ungroup()
m1 <- lm( mhv.growth ~ p.unemp, data=d.reg )
m2 <- lm( mhv.growth ~ p.unemp + metro.mhv.growth, data=d.reg )
m3 <- lm( mhv.growth ~ p.unemp + cbsa, data=d.reg )
stargazer( m1, m2, m3,
type="text", digits=2,
omit.stat = c("rsq","f"),
omit="cbsa",
add.lines = list(c("Metro Fixed Effects:", "NO", "NO","YES")) )
##
## =============================================================================
## Dependent variable:
## --------------------------------------------------------
## mhv.growth
## (1) (2) (3)
## -----------------------------------------------------------------------------
## p.unemp 15.57*** 8.60*** 10.02***
## (0.56) (0.44) (0.47)
##
## metro.mhv.growth 0.01***
## (0.0000)
##
## Constant 17.43*** -2.73*** 16.91***
## (0.46) (0.37) (4.05)
##
## -----------------------------------------------------------------------------
## Metro Fixed Effects: NO NO YES
## Observations 58,801 58,801 58,801
## Adjusted R2 0.01 0.41 0.41
## Residual Std. Error 34.94 (df = 58799) 27.04 (df = 58798) 26.99 (df = 58421)
## =============================================================================
## Note: *p<0.1; **p<0.05; ***p<0.01