This lab is designed to help you build a baseline model of neighborhood change.
Follow the steps from the tutorial to create a dataset that includes 2000 and 2010 census variables.
###################################
#
# STARGAZER SETTINGS
#
###################################
# DO NOT RUN CHUNK UNLESS KNITTING:
# changes table formats to html
# before rendering RMD docs
s.type <- "html"panel.cor <- function(x, y, digits=2, prefix="", cex.cor)
{
usr <- par("usr"); on.exit(par(usr))
par(usr = c(0, 1, 0, 1))
r <- abs(cor(x, y))
txt <- format(c(r, 0.123456789), digits=digits)[1]
txt <- paste(prefix, txt, sep="")
if(missing(cex.cor)) cex <- 0.8/strwidth(txt)
test <- cor.test(x,y)
# borrowed from printCoefmat
Signif <- symnum(test$p.value, corr = FALSE, na = FALSE,
cutpoints = c(0, 0.001, 0.01, 0.05, 0.1, 1),
symbols = c("***", "**", "*", ".", " "))
text(0.5, 0.5, txt, cex = 1.5 )
text(.7, .8, Signif, cex=cex, col=2)
}
panel.smooth <- function (x, y, col = par("col"), bg = NA, pch = par("pch"),
cex = 0.5, col.smooth = "red", span = 2/3, iter = 3, ...)
{
points(x, y, pch = 19, col = gray(0.7,0.2), bg = bg, cex = cex)
ok <- is.finite(x) & is.finite(y)
if (any(ok))
lines(stats::lowess(x[ok], y[ok], f = span, iter = iter),
col = col.smooth, lwd=2, ...)
}
# custom plot
jplot <- function( x1, x2, lab1="", lab2="", draw.line=T, ... )
{
plot( x1, x2,
pch=19,
col=gray(0.6, alpha = 0.2),
cex=0.5,
bty = "n",
xlab=lab1,
ylab=lab2, cex.lab=1.5,
... )
if( draw.line==T ){
ok <- is.finite(x1) & is.finite(x2)
lines( lowess(x2[ok]~x1[ok]), col="red", lwd=3 ) }
}For this lab we will use data from the Longitudinal Tract Database (LTDB) Project, a project that has harmonized a set of variables from 1970 to 2010.
URL1 <- "https://github.com/DS4PS/cpp-529-fall-2020/raw/main/LABS/data/rodeo/LTDB-2000.rds"
d1 <- readRDS( gzcon( url( URL1 ) ) )
URL2 <- "https://github.com/DS4PS/cpp-529-fall-2020/raw/main/LABS/data/rodeo/LTDB-2010.rds"
d2 <- readRDS( gzcon( url( URL2 ) ) )
URLmd <- "https://github.com/DS4PS/cpp-529-fall-2020/raw/main/LABS/data/rodeo/LTDB-META-DATA.rds"
md <- readRDS( gzcon( url( URLmd ) ) )
d1 <- select( d1, - year )
d2 <- select( d2, - year )
d <- merge( d1, d2, by="tractid" )
d <- merge( d, md, by="tractid" )table( d$urban )##
## rural urban
## 12971 59722
##
## rural urban
## 12971 59722
d <- filter( d, urban == "urban" )d <- select( d, tractid,
mhmval00, mhmval12,
hinc00,
hu00, vac00, own00, rent00, h30old00,
empclf00, clf00, unemp00, prof00,
dpov00, npov00,
ag25up00, hs00, col00,
pop00.x, nhwht00, nhblk00, hisp00, asian00,
cbsa, cbsaname )
d <-
d %>%
mutate( # percent white in 2000
p.white = 100 * nhwht00 / pop00.x,
# percent black in 2000
p.black = 100 * nhblk00 / pop00.x,
# percent hispanic in 2000
p.hisp = 100 * hisp00 / pop00.x,
# percent asian in 2000
p.asian = 100 * asian00 / pop00.x,
# percent high school grads by age 25 in 2000
p.hs = 100 * (hs00+col00) / ag25up00,
# percent pop with college degree in 2000
p.col = 100 * col00 / ag25up00,
# percent employed in professional fields in 2000
p.prof = 100 * prof00 / empclf00,
# percent unemployment in 2000
p.unemp = 100 * unemp00 / clf00,
# percent of housing lots in tract that are vacant in 2000
p.vacant = 100 * vac00 / hu00,
# dollar change in median home value 2000 to 2010
pov.rate = 100 * npov00 / dpov00 )
# adjust 2000 home values for inflation
mhv.00 <- d$mhmval00 * 1.28855
mhv.10 <- d$mhmval12
# change in MHV in dollars
mhv.change <- mhv.10 - mhv.00# drop low 2000 median home values
# to avoid unrealistic growth rates.
#
# tracts with homes that cost less than
# $1,000 are outliers
mhv.00[ mhv.00 < 1000 ] <- NA# change in MHV in percent
mhv.growth <- 100 * ( mhv.change / mhv.00 )
mhv.growth[ mhv.growth > 200] <- NA
#can't get the histogram to plot if I run this....
d$mhv.00 <- mhv.00
d$mhv.10 <- mhv.10
d$mhv.change <- mhv.change
d$mhv.growth <- mhv.growth
#head(d) %>% pander()df <- data.frame( MedianHomeValue2000=mhv.00,
MedianHomeValue2010=mhv.10,
MHV.Change.00.to.10=mhv.change,
MHV.Growth.00.to.12=mhv.growth )
stargazer( df,
type=s.type,
digits=0,
summary.stat = c("min", "p25","median","mean","p75","max") )| Statistic | Min | Pctl(25) | Median | Mean | Pctl(75) | Max |
| MedianHomeValue2000 | 1,280 | 105,661 | 154,884 | 187,119 | 224,337 | 1,288,551 |
| MedianHomeValue2010 | 9,999 | 123,200 | 193,200 | 246,570 | 312,000 | 1,000,001 |
| MHV.Change.00.to.10 | -1,228,651 | 7,187 | 36,268 | 60,047 | 94,881 | 1,000,001 |
| MHV.Growth.00.to.12 | -97 | 6 | 25 | 29 | 49 | 200 |
Change in MHV 2000-2010:
If a home worth $10 million increased in value by $100k over ten years it would not be that surprising. If a home worth $50k increased by $100k over the same period that is a growth of 200% and is notable.
The change in value variable only reports absolute change, but does not provide a sense of whether that is a big or small amount for the census tract.
hist( mhv.change/1000, breaks=500,
xlim=c(-100,500), yaxt="n", xaxt="n",
xlab="Thousand of US Dollars (adjusted to 2010)", cex.lab=1.5,
ylab="", main="Change in Median Home Value 2000 to 2010",
col="gray20", border="white" )
axis( side=1, at=seq( from=-100, to=500, by=100 ),
labels=paste0( "$", seq( from=-100, to=500, by=100 ), "k" ) )
mean.x <- mean( mhv.change/1000, na.rm=T )
abline( v=mean.x, col="darkorange", lwd=2, lty=2 )
text( x=200, y=1500,
labels=paste0( "Mean = ", dollar( round(1000*mean.x,0)) ),
col="darkorange", cex=1.8, pos=3 )
median.x <- median( mhv.change/1000, na.rm=T )
abline( v=median.x, col="dodgerblue", lwd=2, lty=2 )
text( x=200, y=2000,
labels=paste0( "Median = ", dollar( round(1000*median.x,0)) ),
col="dodgerblue", cex=1.8, pos=3 )
Percent Change in MHV 2000 to 2010:
The percent change variable provides some context for growth rates of value in census tracts.
Plot the percent change variable:
#summary(mhv.growth)
#mhv.growth[ mhv.growth > 200] <- NA
#can't get the histogram to plot if I run this^ ....
hg <-
hist( mhv.growth, breaks=100,
xlim=c(-100,200), yaxt="n", xaxt="n",
xlab="", cex.main=1.5,
ylab="", main="Growth in Home Value by Census Tract 2000 to 2010",
col="gray30", border="white" )
axis( side=1, at=seq( from=-100, to=200, by=50 ),
labels=paste0( seq( from=-100, to=200, by=50 ), "%" ) )
ymax <- max( hg$count )
mean.x <- mean( mhv.growth, na.rm=T )
abline( v=mean.x, col="darkorange", lwd=2, lty=2 )
text( x=100, y=(0.5*ymax),
labels=paste0( "Mean = ", round(mean.x,0), "%"),
col="darkorange", cex=1.8, pos=4 )
median.x <- median( mhv.growth, na.rm=T )
abline( v=median.x, col="dodgerblue", lwd=2, lty=2 )
text( x=100, y=(0.6*ymax),
labels=paste0( "Median = ", round(median.x,0), "%"),
col="dodgerblue", cex=1.8, pos=4 )
Select three neighborhood characteristics from 2000 that you feel will be good predictors of change in MHV between 2000 and 2010. Note these are static snapshots of the tracts in 2000.
Explain why you selected the three variables and your theory about how they should related to home value changes (the predicted sign of relationships in the regression model).
colnames(d)## [1] "tractid" "mhmval00" "mhmval12" "hinc00" "hu00"
## [6] "vac00" "own00" "rent00" "h30old00" "empclf00"
## [11] "clf00" "unemp00" "prof00" "dpov00" "npov00"
## [16] "ag25up00" "hs00" "col00" "pop00.x" "nhwht00"
## [21] "nhblk00" "hisp00" "asian00" "cbsa" "cbsaname"
## [26] "p.white" "p.black" "p.hisp" "p.asian" "p.hs"
## [31] "p.col" "p.prof" "p.unemp" "p.vacant" "pov.rate"
## [36] "mhv.00" "mhv.10" "mhv.change" "mhv.growth"
Selected Variables and Hypotheses:
1. p.black: higher percentage of black population in 2000 will predict a smaller increase in home value. 2. pov.rate: higher poverty rates with predict smaller increases in home value. 3. p.vacant: higher levels of vacant lots will predict a smaller increase in home value.
Check for variable skew on each of the variables by generating a histogram and summary statistics.
Correct skew if necessary and explain how you did so (which transformation used our outliers suppressed).
par( mfrow=c(1,2) )
hist( d$p.black, breaks=50, col="gray20", border="white",
yaxt="n", xlab="", ylab="", main="Income")
library(LambertW)
hist( log(d$p.black), breaks=50, col="gray20", border="white",
yaxt="n", xlab="", ylab="", main="Income (logged)")
log transformation for percent black population IS necessary
par( mfrow=c(1,2) )
hist( d$pov.rate, breaks=50, col="gray20", border="white",
yaxt="n", xlab="", ylab="", main="Poverty Rate")
hist( log(d$pov.rate+1), breaks=50, col="gray20", border="white",
yaxt="n", xlab="", ylab="", main="Poverty Rate (logged)")
log transformation for poverty rate IS necessary
par( mfrow=c(1,2) )
hist( d$p.unemp, breaks=50, col="gray20", border="white",
yaxt="n", xlab="", ylab="", main="Unemployment Rate")
hist( log(d$p.unemp+1), breaks=50, col="gray20", border="white",
yaxt="n", xlab="", ylab="", main="Unemployment Rate (logged)")
log transformation for unemployment rate IS necessary
Do you expect multicollinearity to be a problem?
ANSWER: Yes. I intuitively feel that all three of these variables have a correlation with each other, unrealated to home value.
Explain the problem of multicollinearity in plain language, and it’s effect on the regression model when present.
ANSWER: Multicollinearity is defined as an existing correlation between predictor variables that is unrelated to the outcome variable. If both are included in a model used to predict the outcome variable, there is potential for the two variables to cancel each other out.
Create a correlation plot for your three variables. Are any of them highly-correlated?
reg.data <- d
reg.data$mhv.growth[ reg.data$mhv.growth > 200 ] <- NA
reg.data$p.black <- log10( reg.data$p.black + 1 )
reg.data$p.unemp <- log10( reg.data$p.unemp + 1 )
reg.data$pov.rate <- log10( reg.data$pov.rate + 1 )
m1 <- lm( mhv.growth ~ pov.rate, data=reg.data )
m2 <- lm( mhv.growth ~ p.unemp, data=reg.data )
m3 <- lm( mhv.growth ~ p.black, data=reg.data )
m4 <- lm( mhv.growth ~ p.black + pov.rate, data=reg.data )
m5 <- lm( mhv.growth ~ p.black + p.unemp, data=reg.data )
m6 <- lm( mhv.growth ~ pov.rate + p.unemp, data=reg.data)
m7 <- lm( mhv.growth ~ pov.rate + p.unemp + p.black, data=reg.data )
stargazer( m1, m2, m3,
type=s.type, digits=2,
omit.stat = c("rsq","f") )| Dependent variable: | |||
| mhv.growth | |||
| (1) | (2) | (3) | |
| pov.rate | 12.93*** | ||
| (0.42) | |||
| p.unemp | 15.57*** | ||
| (0.56) | |||
| p.black | 4.05*** | ||
| (0.26) | |||
| Constant | 16.69*** | 17.43*** | 26.20*** |
| (0.44) | (0.46) | (0.26) | |
| Observations | 58,798 | 58,801 | 58,839 |
| Adjusted R2 | 0.02 | 0.01 | 0.004 |
| Residual Std. Error | 34.89 (df = 58796) | 34.94 (df = 58799) | 35.10 (df = 58837) |
| Note: | p<0.1; p<0.05; p<0.01 | ||
stargazer( m4, m5, m6, m7,
type=s.type, digits=2,
omit.stat = c("rsq","f") )| Dependent variable: | ||||
| mhv.growth | ||||
| (1) | (2) | (3) | (4) | |
| p.black | 0.30 | 1.02*** | -0.07 | |
| (0.30) | (0.29) | (0.30) | ||
| pov.rate | 12.71*** | 9.67*** | 9.70*** | |
| (0.47) | (0.61) | (0.63) | ||
| p.unemp | 14.60*** | 6.04*** | 6.08*** | |
| (0.63) | (0.83) | (0.84) | ||
| Constant | 16.67*** | 17.36*** | 15.24*** | 15.24*** |
| (0.44) | (0.46) | (0.48) | (0.48) | |
| Observations | 58,798 | 58,801 | 58,791 | 58,791 |
| Adjusted R2 | 0.02 | 0.01 | 0.02 | 0.02 |
| Residual Std. Error | 34.89 (df = 58795) | 34.94 (df = 58798) | 34.87 (df = 58788) | 34.87 (df = 58787) |
| Note: | p<0.1; p<0.05; p<0.01 | |||
It’s interesting to note here that percent of black population is significant alone. However, when included with the other two variables in the model, it appears insignificant and the error only increases slightly.
Do you think the relationship between X and Y is a linear relationship, or do you have evidence that the slope changes depending upon the level of X?
Create a scatterplot of each X and Y overplotted with a lowess regression line and report if you find evidence of non-linearity.
log.p.unemp <- log10( d$p.unemp + 1 )
log.pov.rate <- log10( d$pov.rate + 1 )
log.p.black <- log10( d$p.black + 1 )
these <- sample( 1:length(log.pov.rate), 5000 )
par( mfrow=c(1,2) )
jplot( d$pov.rate[these], d$p.unemp[these],
lab1="Poverty Rate", lab2="Unemployed (percent)",
main="Raw Measures" )
jplot( log.pov.rate[these], log.p.unemp[these],
lab1="Poverty Rate", lab2="Unemployed (percent)",
main="Log Transformed" )
par( mfrow=c(1,3))
jplot( log.p.black[these], reg.data$mhv.growth[these],
lab1="Percent Black Pop", lab2="MHV growth" )
jplot( log.pov.rate[these], reg.data$mhv.growth[these],
lab1="Poverty Rates", lab2="MHV growth" )
jplot( log.p.unemp[these], reg.data$mhv.growth[these],
lab1="Unemployment Rates", lab2="MHV growth" )
If you think the relationship may be non-linear, create a quadratic term for X and include it in the model.
Report a table of descriptive statistics for home values (2000 values, 2010 values, 2000-2010 change in values, 2000-2010 growth rates) and your three covariates. Include:
min 25th percentile median mean 75th percentile max
df <- data.frame( MedianHomeValue2000=mhv.00,
MedianHomeValue2010=mhv.10,
MHV.Change.00.to.10=mhv.change,
MHV.Growth.00.to.12=mhv.growth,
PercentBlack=d$p.black,
PovertyRate=d$pov.rate,
UnemploymentRate=d$p.unemp)
stargazer( df,
type=s.type,
digits=0,
summary.stat = c("min", "p25","median","mean","p75","max") )| Statistic | Min | Pctl(25) | Median | Mean | Pctl(75) | Max |
| MedianHomeValue2000 | 1,280 | 105,661 | 154,884 | 187,119 | 224,337 | 1,288,551 |
| MedianHomeValue2010 | 9,999 | 123,200 | 193,200 | 246,570 | 312,000 | 1,000,001 |
| MHV.Change.00.to.10 | -1,228,651 | 7,187 | 36,268 | 60,047 | 94,881 | 1,000,001 |
| MHV.Growth.00.to.12 | -97 | 6 | 25 | 29 | 49 | 200 |
| PercentBlack | 0 | 1 | 4 | 14 | 14 | 100 |
| PovertyRate | 0 | 4 | 9 | 12 | 17 | 100 |
| UnemploymentRate | 0 | 3 | 5 | 6 | 8 | 100 |
What’s the typical change in home value between 2000 and 2010? What’s the largest change in home value between 2000 and 2010?
ANSWER: On average, home values increased by $60,047. The largest change in home value between 2000 and 2010 was $1,000,000.
What’s the relationship between the change in home value 2000-2010 and the growth in home value 2000-2010?
ANSWER: Growth measures the change over time.
Create a scatter plot of the relationship.
reg.data$mhv.change[ reg.data$mhv.change < 1000 ] <- NA
fit=lm(mhv.growth ~ mhv.change, data=reg.data)
plot(reg.data$mhv.change[these], reg.data$mhv.growth[these], xlim=c(0, 800000), ylim=c(0, 200), col="grey30")
abline(fit, col="firebrick")jplot( reg.data$mhv.change[these]/10000, reg.data$mhv.growth[these],
lab1="MHV Change", lab2="MHV Growth")
How strong is the correlation?
Do you think these two variables measure the same thing?
Run two models - Include your three variables in both models, after any transformations.
m <- lm( mhv.00 ~ pov.rate + p.unemp + p.black, data=reg.data )
stargazer( m,
type=s.type, digits=2,
omit.stat = c("rsq","f") )| Dependent variable: | |
| mhv.00 | |
| pov.rate | -111,918.80*** |
| (2,221.73) | |
| p.unemp | -27,355.51*** |
| (2,952.68) | |
| p.black | -28,164.69*** |
| (1,067.82) | |
| Constant | 341,723.50*** |
| (1,685.99) | |
| Observations | 59,356 |
| Adjusted R2 | 0.16 |
| Residual Std. Error | 123,963.90 (df = 59352) |
| Note: | p<0.1; p<0.05; p<0.01 |
m <- lm( mhv.growth ~ pov.rate + p.unemp + p.black, data=reg.data )
stargazer( m,
type=s.type, digits=2,
omit.stat = c("rsq","f") )| Dependent variable: | |
| mhv.growth | |
| pov.rate | 9.70*** |
| (0.63) | |
| p.unemp | 6.08*** |
| (0.84) | |
| p.black | -0.07 |
| (0.30) | |
| Constant | 15.24*** |
| (0.48) | |
| Observations | 58,791 |
| Adjusted R2 | 0.02 |
| Residual Std. Error | 34.87 (df = 58787) |
| Note: | p<0.1; p<0.05; p<0.01 |
What are the results?
Did any of the variables predict changes to home value in a meaningful way (relationship is statistically significant)?
Which variable had the largest impact?
Did the results match your predictions?
In a short paragraph explain your findings to a general audience.
ANSWER: All three variables have an impact on home value (model A). However, only poverty rate and unemployment rate have a statistically significant impact on CHANGE in home value (model B). Poverty rate has the greatest impact on home value and the greatest impact on change in home value.
Calculate the effect size associated with each variable.
Report the relative importance of each factor in predicting the outcome.
You can calculate effect size as follows:
y <- mhv.growth
m <- lm( y ~ p.black, data=reg.data )
x.75 <- quantile( reg.data$p.black, p=0.75, na.rm = T )
x.25 <- quantile( reg.data$p.black, p=0.25, na.rm = T )
beta.x <- m$coefficients[2] # position of x in the model
effect.size.x <- ( x.75 - x.25 ) * beta.x
effect.size.x## 75%
## 3.375457
m <- lm( y ~ pov.rate, data=reg.data )
x.75 <- quantile( reg.data$pov.rate, p=0.75, na.rm = T )
x.25 <- quantile( reg.data$pov.rate, p=0.25, na.rm = T )
beta.x <- m$coefficients[2] # position of x in the model
effect.size.x <- ( x.75 - x.25 ) * beta.x
effect.size.x## 75%
## 6.587377
m <- lm( y ~ p.unemp, data=reg.data )
x.75 <- quantile( reg.data$p.unemp, p=0.75, na.rm = T )
x.25 <- quantile( reg.data$p.unemp, p=0.25, na.rm = T )
beta.x <- m$coefficients[2] # position of x in the model
effect.size.x <- ( x.75 - x.25 ) * beta.x
effect.size.x## 75%
## 5.23063
Coefficient sizes are often misleading because a one-unit change of X1 might be very different from a one-unit change in X2. For example if X1 is a proportion and X2 is measured in millions of dollars, then a one-unit change in X1 is the entire range of X1, whereas a one-unit change in X2 would represent a trivial amount.
As a result, B1 (the coefficient associated with X1) would be relatively large and B2 (the coefficient associated with X2) would be relatively small.
In order to compare the effects of X1 and X2 on the outcome we need to translate the results into a standardized format. Instead of using the arbitary one-unit scale, we ask how much does the outcome change when we observe at a large change in each IV.
In this case we are using an increase in each X from it’s 25th to 75th percentile to represent a large change.
Since the range of 25th to 75th percentiles are consistent across all IVs we can now compare their effect sizes since they are all in comparable scales.
INTERPRETATION OF B: A one-unit change in X results in a B change in Y. INTERPRETATION OF EFFECT SIZE: A large change in X results in an effect size change in Y. Where large is consistently operationalized as X.75 - X.25 for each variable.
It is really important to note that when trying to establish the relative importance of variables the B coefficients in a regression are NOT comparable. Effect sizes associated with each X are comparable.