Lab 05 - Predicting MHV Change

This lab is designed to help you build a baseline model of neighborhood change.

Data Steps

Follow the steps from the tutorial to create a dataset that includes 2000 and 2010 census variables.

###################################
#
#     STARGAZER SETTINGS
#
###################################

# DO NOT RUN CHUNK UNLESS KNITTING:
# changes table formats to html
# before rendering RMD docs

s.type <- "html"

Helper functions for the pairs() correlation table:

panel.cor <- function(x, y, digits=2, prefix="", cex.cor)
{
    usr <- par("usr"); on.exit(par(usr))
    par(usr = c(0, 1, 0, 1))
    r <- abs(cor(x, y))
    txt <- format(c(r, 0.123456789), digits=digits)[1]
    txt <- paste(prefix, txt, sep="")
    if(missing(cex.cor)) cex <- 0.8/strwidth(txt)
    
    test <- cor.test(x,y)
    # borrowed from printCoefmat
    Signif <- symnum(test$p.value, corr = FALSE, na = FALSE,
                  cutpoints = c(0, 0.001, 0.01, 0.05, 0.1, 1),
                  symbols = c("***", "**", "*", ".", " "))
    
    text(0.5, 0.5, txt, cex = 1.5 )
    text(.7, .8, Signif, cex=cex, col=2)
}

panel.smooth <- function (x, y, col = par("col"), bg = NA, pch = par("pch"), 
    cex = 0.5, col.smooth = "red", span = 2/3, iter = 3, ...) 
{
    points(x, y, pch = 19, col = gray(0.7,0.2), bg = bg, cex = cex)
    ok <- is.finite(x) & is.finite(y)
    if (any(ok)) 
        lines(stats::lowess(x[ok], y[ok], f = span, iter = iter), 
            col = col.smooth, lwd=2, ...)
}

# custom plot
jplot <- function( x1, x2, lab1="", lab2="", draw.line=T, ... )
{

    plot( x1, x2,
          pch=19, 
          col=gray(0.6, alpha = 0.2), 
          cex=0.5,  
          bty = "n",
          xlab=lab1, 
          ylab=lab2, cex.lab=1.5,
        ... )

    if( draw.line==T ){ 
        ok <- is.finite(x1) & is.finite(x2)
        lines( lowess(x2[ok]~x1[ok]), col="red", lwd=3 ) }

}

Data

For this lab we will use data from the Longitudinal Tract Database (LTDB) Project, a project that has harmonized a set of variables from 1970 to 2010.

URL1 <- "https://github.com/DS4PS/cpp-529-fall-2020/raw/main/LABS/data/rodeo/LTDB-2000.rds"
d1 <- readRDS( gzcon( url( URL1 ) ) )

URL2 <- "https://github.com/DS4PS/cpp-529-fall-2020/raw/main/LABS/data/rodeo/LTDB-2010.rds"
d2 <- readRDS( gzcon( url( URL2 ) ) )

URLmd <- "https://github.com/DS4PS/cpp-529-fall-2020/raw/main/LABS/data/rodeo/LTDB-META-DATA.rds"
md <- readRDS( gzcon( url( URLmd ) ) )

d1 <- select( d1, - year )
d2 <- select( d2, - year )

d <- merge( d1, d2, by="tractid" )
d <- merge( d, md, by="tractid" )

Drop all rural census tracts.

table( d$urban )

## 
## rural urban 
## 12971 59722

## 
## rural urban 
## 12971 59722
d <- filter( d, urban == "urban" )

Create a variable that measures the growth of median home value from 2000 to 2010.

d <- select( d, tractid, 
             mhmval00, mhmval12, 
             hinc00, 
             hu00, vac00, own00, rent00, h30old00,
             empclf00, clf00, unemp00, prof00,  
             dpov00, npov00,
             ag25up00, hs00, col00, 
             pop00.x, nhwht00, nhblk00, hisp00, asian00,
             cbsa, cbsaname )
d <- 
  d %>%
  mutate( # percent white in 2000
          p.white = 100 * nhwht00 / pop00.x,
          # percent black in 2000
          p.black = 100 * nhblk00 / pop00.x,
          # percent hispanic in 2000
          p.hisp = 100 * hisp00 / pop00.x, 
          # percent asian in 2000
          p.asian = 100 * asian00 / pop00.x,
          # percent high school grads by age 25 in 2000 
          p.hs = 100 * (hs00+col00) / ag25up00,
          # percent pop with college degree in 2000
          p.col = 100 * col00 / ag25up00,
          # percent employed in professional fields in 2000
          p.prof = 100 * prof00 / empclf00,
          # percent unemployment  in 2000
          p.unemp = 100 * unemp00 / clf00,
          # percent of housing lots in tract that are vacant in 2000
          p.vacant = 100 * vac00 / hu00,
          # dollar change in median home value 2000 to 2010 
          pov.rate = 100 * npov00 / dpov00 )


# adjust 2000 home values for inflation 
mhv.00 <- d$mhmval00 * 1.28855  
mhv.10 <- d$mhmval12

# change in MHV in dollars
mhv.change <- mhv.10 - mhv.00

Omit cases that have a median home value less than $1,000 in 2000.

# drop low 2000 median home values
# to avoid unrealistic growth rates.
#
# tracts with homes that cost less than
# $1,000 are outliers
mhv.00[ mhv.00 < 1000 ] <- NA

Omit cases with growth rates above 200%.

# change in MHV in percent
mhv.growth <- 100 * ( mhv.change / mhv.00 )
mhv.growth[ mhv.growth > 200] <- NA 
#can't get the histogram to plot if I run this.... 

d$mhv.00 <- mhv.00
d$mhv.10 <- mhv.10
d$mhv.change <- mhv.change
d$mhv.growth <- mhv.growth 

#head(d) %>% pander()

Print summary statistics about median home values in 2000 and 2010.

df <- data.frame( MedianHomeValue2000=mhv.00, 
                  MedianHomeValue2010=mhv.10, 
                  MHV.Change.00.to.10=mhv.change,
                  MHV.Growth.00.to.12=mhv.growth )

stargazer( df, 
           type=s.type, 
           digits=0, 
           summary.stat = c("min", "p25","median","mean","p75","max") )


Statistic	Min	Pctl(25)	Median	Mean	Pctl(75)	Max

MedianHomeValue2000	1,280	105,661	154,884	187,119	224,337	1,288,551
MedianHomeValue2010	9,999	123,200	193,200	246,570	312,000	1,000,001
MHV.Change.00.to.10	-1,228,651	7,187	36,268	60,047	94,881	1,000,001
MHV.Growth.00.to.12	-97	6	25	29	49	200

Visualize the distribution of changes across all urban tracts between 2000 and 2010 (these are replications of steps in the tutorial as well).

Change in MHV 2000-2010:

If a home worth $10 million increased in value by $100k over ten years it would not be that surprising. If a home worth $50k increased by $100k over the same period that is a growth of 200% and is notable.

The change in value variable only reports absolute change, but does not provide a sense of whether that is a big or small amount for the census tract.

hist( mhv.change/1000, breaks=500, 
      xlim=c(-100,500), yaxt="n", xaxt="n",
      xlab="Thousand of US Dollars (adjusted to 2010)", cex.lab=1.5,
      ylab="", main="Change in Median Home Value 2000 to 2010",
      col="gray20", border="white" )

axis( side=1, at=seq( from=-100, to=500, by=100 ), 
      labels=paste0( "$", seq( from=-100, to=500, by=100 ), "k" ) )
        
mean.x <- mean( mhv.change/1000, na.rm=T )
abline( v=mean.x, col="darkorange", lwd=2, lty=2 )
text( x=200, y=1500, 
      labels=paste0( "Mean = ", dollar( round(1000*mean.x,0)) ), 
      col="darkorange", cex=1.8, pos=3 )

median.x <- median( mhv.change/1000, na.rm=T )
abline( v=median.x, col="dodgerblue", lwd=2, lty=2 )
text( x=200, y=2000, 
      labels=paste0( "Median = ", dollar( round(1000*median.x,0)) ), 
      col="dodgerblue", cex=1.8, pos=3 )

Percent Change in MHV 2000 to 2010:

The percent change variable provides some context for growth rates of value in census tracts.

Plot the percent change variable:

#summary(mhv.growth)
#mhv.growth[ mhv.growth > 200] <- NA 
#can't get the histogram to plot if I run this^ ....
hg <-
hist( mhv.growth, breaks=100, 
      xlim=c(-100,200), yaxt="n", xaxt="n",
      xlab="", cex.main=1.5,
      ylab="", main="Growth in Home Value by Census Tract 2000 to 2010",
      col="gray30", border="white" )

axis( side=1, at=seq( from=-100, to=200, by=50 ), 
      labels=paste0( seq( from=-100, to=200, by=50 ), "%" ) )

ymax <- max( hg$count )
        
mean.x <- mean( mhv.growth, na.rm=T )
abline( v=mean.x, col="darkorange", lwd=2, lty=2 )
text( x=100, y=(0.5*ymax), 
      labels=paste0( "Mean = ", round(mean.x,0), "%"), 
      col="darkorange", cex=1.8, pos=4 )

median.x <- median( mhv.growth, na.rm=T )
abline( v=median.x, col="dodgerblue", lwd=2, lty=2 )
text( x=100, y=(0.6*ymax), 
      labels=paste0( "Median = ", round(median.x,0), "%"), 
      col="dodgerblue", cex=1.8, pos=4 )

Part 01 - Select Three IVs

Select three neighborhood characteristics from 2000 that you feel will be good predictors of change in MHV between 2000 and 2010. Note these are static snapshots of the tracts in 2000.

Explain why you selected the three variables and your theory about how they should related to home value changes (the predicted sign of relationships in the regression model).

colnames(d)

##  [1] "tractid"    "mhmval00"   "mhmval12"   "hinc00"     "hu00"      
##  [6] "vac00"      "own00"      "rent00"     "h30old00"   "empclf00"  
## [11] "clf00"      "unemp00"    "prof00"     "dpov00"     "npov00"    
## [16] "ag25up00"   "hs00"       "col00"      "pop00.x"    "nhwht00"   
## [21] "nhblk00"    "hisp00"     "asian00"    "cbsa"       "cbsaname"  
## [26] "p.white"    "p.black"    "p.hisp"     "p.asian"    "p.hs"      
## [31] "p.col"      "p.prof"     "p.unemp"    "p.vacant"   "pov.rate"  
## [36] "mhv.00"     "mhv.10"     "mhv.change" "mhv.growth"

Selected Variables and Hypotheses:

1. p.black: higher percentage of black population in 2000 will predict a smaller increase in home value. 2. pov.rate: higher poverty rates with predict smaller increases in home value. 3. p.vacant: higher levels of vacant lots will predict a smaller increase in home value.

Part 02 - Variable Skew

Check for variable skew on each of the variables by generating a histogram and summary statistics.

Correct skew if necessary and explain how you did so (which transformation used our outliers suppressed).

p.prof

par( mfrow=c(1,2) )

hist( d$p.black, breaks=50, col="gray20", border="white",
      yaxt="n", xlab="", ylab="", main="Income")
library(LambertW)
hist( log(d$p.black), breaks=50, col="gray20", border="white",
      yaxt="n", xlab="", ylab="", main="Income (logged)")

log transformation for percent black population IS necessary

pov.rate

par( mfrow=c(1,2) )

hist( d$pov.rate, breaks=50, col="gray20", border="white",
      yaxt="n", xlab="", ylab="", main="Poverty Rate")
hist( log(d$pov.rate+1), breaks=50, col="gray20", border="white",
      yaxt="n", xlab="", ylab="", main="Poverty Rate (logged)")

log transformation for poverty rate IS necessary

p.vacant

par( mfrow=c(1,2) )

hist( d$p.unemp, breaks=50, col="gray20", border="white",
      yaxt="n", xlab="", ylab="", main="Unemployment Rate")
hist( log(d$p.unemp+1), breaks=50, col="gray20", border="white",
      yaxt="n", xlab="", ylab="", main="Unemployment Rate (logged)")

log transformation for unemployment rate IS necessary

Part 03 - Multicollinearity

Do you expect multicollinearity to be a problem?

ANSWER: Yes. I intuitively feel that all three of these variables have a correlation with each other, unrealated to home value.

Explain the problem of multicollinearity in plain language, and it’s effect on the regression model when present.

ANSWER: Multicollinearity is defined as an existing correlation between predictor variables that is unrelated to the outcome variable. If both are included in a model used to predict the outcome variable, there is potential for the two variables to cancel each other out.

Create a correlation plot for your three variables. Are any of them highly-correlated?

reg.data <- d

reg.data$mhv.growth[ reg.data$mhv.growth > 200 ] <- NA
reg.data$p.black <- log10( reg.data$p.black + 1 )
reg.data$p.unemp <- log10( reg.data$p.unemp + 1 )
reg.data$pov.rate <- log10( reg.data$pov.rate + 1 )

m1 <- lm( mhv.growth ~  pov.rate, data=reg.data )
m2 <- lm( mhv.growth ~  p.unemp, data=reg.data )
m3 <- lm( mhv.growth ~  p.black, data=reg.data )

m4 <- lm( mhv.growth ~  p.black + pov.rate, data=reg.data )
m5 <- lm( mhv.growth ~  p.black + p.unemp, data=reg.data )
m6 <- lm( mhv.growth ~  pov.rate + p.unemp, data=reg.data)
m7 <- lm( mhv.growth ~  pov.rate + p.unemp + p.black, data=reg.data )


stargazer( m1, m2, m3,
           type=s.type, digits=2,
           omit.stat = c("rsq","f") )


	Dependent variable:

	mhv.growth
	(1)	(2)	(3)

pov.rate	12.93^***
	(0.42)

p.unemp		15.57^***
		(0.56)

p.black			4.05^***
			(0.26)

Constant	16.69^***	17.43^***	26.20^***
	(0.44)	(0.46)	(0.26)


Observations	58,798	58,801	58,839
Adjusted R²	0.02	0.01	0.004
Residual Std. Error	34.89 (df = 58796)	34.94 (df = 58799)	35.10 (df = 58837)

Note:	p<0.1; p<0.05; p<0.01

stargazer( m4, m5, m6, m7,
           type=s.type, digits=2,
           omit.stat = c("rsq","f") )


	Dependent variable:

	mhv.growth
	(1)	(2)	(3)	(4)

p.black	0.30	1.02^***		-0.07
	(0.30)	(0.29)		(0.30)

pov.rate	12.71^***		9.67^***	9.70^***
	(0.47)		(0.61)	(0.63)

p.unemp		14.60^***	6.04^***	6.08^***
		(0.63)	(0.83)	(0.84)

Constant	16.67^***	17.36^***	15.24^***	15.24^***
	(0.44)	(0.46)	(0.48)	(0.48)


Observations	58,798	58,801	58,791	58,791
Adjusted R²	0.02	0.01	0.02	0.02
Residual Std. Error	34.89 (df = 58795)	34.94 (df = 58798)	34.87 (df = 58788)	34.87 (df = 58787)

Note:	p<0.1; p<0.05; p<0.01

It’s interesting to note here that percent of black population is significant alone. However, when included with the other two variables in the model, it appears insignificant and the error only increases slightly.

Part 04 - Is the Relationship Linear?

Do you think the relationship between X and Y is a linear relationship, or do you have evidence that the slope changes depending upon the level of X?

Create a scatterplot of each X and Y overplotted with a lowess regression line and report if you find evidence of non-linearity.

log.p.unemp <- log10( d$p.unemp + 1 )
log.pov.rate <- log10( d$pov.rate + 1 )
log.p.black <- log10( d$p.black + 1 )
these <- sample( 1:length(log.pov.rate), 5000 )

par( mfrow=c(1,2) )
jplot( d$pov.rate[these], d$p.unemp[these], 
       lab1="Poverty Rate", lab2="Unemployed (percent)",
       main="Raw Measures" )
jplot( log.pov.rate[these], log.p.unemp[these], 
       lab1="Poverty Rate", lab2="Unemployed (percent)",
       main="Log Transformed" )

par( mfrow=c(1,3))

jplot( log.p.black[these], reg.data$mhv.growth[these],
       lab1="Percent Black Pop", lab2="MHV growth" )
jplot( log.pov.rate[these], reg.data$mhv.growth[these],
       lab1="Poverty Rates", lab2="MHV growth" )
jplot( log.p.unemp[these], reg.data$mhv.growth[these],
       lab1="Unemployment Rates", lab2="MHV growth" )

If you think the relationship may be non-linear, create a quadratic term for X and include it in the model.

Part 04 - Descriptives

Report a table of descriptive statistics for home values (2000 values, 2010 values, 2000-2010 change in values, 2000-2010 growth rates) and your three covariates. Include:

min 25th percentile median mean 75th percentile max

df <- data.frame( MedianHomeValue2000=mhv.00, 
                  MedianHomeValue2010=mhv.10, 
                  MHV.Change.00.to.10=mhv.change,
                  MHV.Growth.00.to.12=mhv.growth,
                  PercentBlack=d$p.black,
                  PovertyRate=d$pov.rate,
                  UnemploymentRate=d$p.unemp)

stargazer( df, 
           type=s.type, 
           digits=0, 
           summary.stat = c("min", "p25","median","mean","p75","max") )


Statistic	Min	Pctl(25)	Median	Mean	Pctl(75)	Max

MedianHomeValue2000	1,280	105,661	154,884	187,119	224,337	1,288,551
MedianHomeValue2010	9,999	123,200	193,200	246,570	312,000	1,000,001
MHV.Change.00.to.10	-1,228,651	7,187	36,268	60,047	94,881	1,000,001
MHV.Growth.00.to.12	-97	6	25	29	49	200
PercentBlack	0	1	4	14	14	100
PovertyRate	0	4	9	12	17	100
UnemploymentRate	0	3	5	6	8	100

What’s the typical change in home value between 2000 and 2010? What’s the largest change in home value between 2000 and 2010?

ANSWER: On average, home values increased by $60,047. The largest change in home value between 2000 and 2010 was $1,000,000.

What’s the relationship between the change in home value 2000-2010 and the growth in home value 2000-2010?

ANSWER: Growth measures the change over time.

Create a scatter plot of the relationship.

reg.data$mhv.change[ reg.data$mhv.change < 1000 ] <- NA
fit=lm(mhv.growth ~ mhv.change, data=reg.data)

plot(reg.data$mhv.change[these], reg.data$mhv.growth[these], xlim=c(0, 800000), ylim=c(0, 200), col="grey30")
abline(fit, col="firebrick")

jplot( reg.data$mhv.change[these]/10000, reg.data$mhv.growth[these], 
       lab1="MHV Change", lab2="MHV Growth")

How strong is the correlation?

Do you think these two variables measure the same thing?

Part 05 - Models

Run two models - Include your three variables in both models, after any transformations.

One with change in median home value (dollar amount) as the DV

m <- lm( mhv.00 ~  pov.rate + p.unemp + p.black, data=reg.data )

stargazer( m,
           type=s.type, digits=2,
           omit.stat = c("rsq","f") )


	Dependent variable:

	mhv.00

pov.rate	-111,918.80^***
	(2,221.73)

p.unemp	-27,355.51^***
	(2,952.68)

p.black	-28,164.69^***
	(1,067.82)

Constant	341,723.50^***
	(1,685.99)


Observations	59,356
Adjusted R²	0.16
Residual Std. Error	123,963.90 (df = 59352)

Note:	p<0.1; p<0.05; p<0.01

One with median home value growth (percent change from 2000 to 2010) as the DV

m <- lm( mhv.growth ~  pov.rate + p.unemp + p.black, data=reg.data )

stargazer( m,
           type=s.type, digits=2,
           omit.stat = c("rsq","f") )


	Dependent variable:

	mhv.growth

pov.rate	9.70^***
	(0.63)

p.unemp	6.08^***
	(0.84)

p.black	-0.07
	(0.30)

Constant	15.24^***
	(0.48)


Observations	58,791
Adjusted R²	0.02
Residual Std. Error	34.87 (df = 58787)

Note:	p<0.1; p<0.05; p<0.01

What are the results?

Did any of the variables predict changes to home value in a meaningful way (relationship is statistically significant)?

Which variable had the largest impact?

Did the results match your predictions?

In a short paragraph explain your findings to a general audience.

ANSWER: All three variables have an impact on home value (model A). However, only poverty rate and unemployment rate have a statistically significant impact on CHANGE in home value (model B). Poverty rate has the greatest impact on home value and the greatest impact on change in home value.

Part 06 - Effect Sizes

Calculate the effect size associated with each variable.

Report the relative importance of each factor in predicting the outcome.

You can calculate effect size as follows:

y <- mhv.growth
m <- lm( y ~ p.black, data=reg.data )
x.75 <- quantile( reg.data$p.black, p=0.75, na.rm = T )
x.25 <- quantile( reg.data$p.black, p=0.25, na.rm = T )
beta.x <- m$coefficients[2]  # position of x in the model 

effect.size.x <- ( x.75 - x.25 ) * beta.x 
effect.size.x

##      75% 
## 3.375457

m <- lm( y ~ pov.rate, data=reg.data )
x.75 <- quantile( reg.data$pov.rate, p=0.75, na.rm = T )
x.25 <- quantile( reg.data$pov.rate, p=0.25, na.rm = T )
beta.x <- m$coefficients[2]  # position of x in the model 

effect.size.x <- ( x.75 - x.25 ) * beta.x
effect.size.x

##      75% 
## 6.587377

m <- lm( y ~ p.unemp, data=reg.data )
x.75 <- quantile( reg.data$p.unemp, p=0.75, na.rm = T )
x.25 <- quantile( reg.data$p.unemp, p=0.25, na.rm = T )
beta.x <- m$coefficients[2]  # position of x in the model 

effect.size.x <- ( x.75 - x.25 ) * beta.x
effect.size.x

##     75% 
## 5.23063

Coefficient sizes are often misleading because a one-unit change of X1 might be very different from a one-unit change in X2. For example if X1 is a proportion and X2 is measured in millions of dollars, then a one-unit change in X1 is the entire range of X1, whereas a one-unit change in X2 would represent a trivial amount.

As a result, B1 (the coefficient associated with X1) would be relatively large and B2 (the coefficient associated with X2) would be relatively small.

In order to compare the effects of X1 and X2 on the outcome we need to translate the results into a standardized format. Instead of using the arbitary one-unit scale, we ask how much does the outcome change when we observe at a large change in each IV.

In this case we are using an increase in each X from it’s 25th to 75th percentile to represent a large change.

Since the range of 25th to 75th percentiles are consistent across all IVs we can now compare their effect sizes since they are all in comparable scales.

INTERPRETATION OF B: A one-unit change in X results in a B change in Y. INTERPRETATION OF EFFECT SIZE: A large change in X results in an effect size change in Y. Where large is consistently operationalized as X.75 - X.25 for each variable.

It is really important to note that when trying to establish the relative importance of variables the B coefficients in a regression are NOT comparable. Effect sizes associated with each X are comparable.