Inferential analysis is never a completely linear process. It involves many iterations of cleaning data, describing data, visualizing data, and modeling data until you are confident about the process and the results.
For Part 01 of the lab, reproduce the descriptive analysis demonstrated in the tutorial but do it for periods 1990 to 2000 instead of the 2000 to 2010 period used in the tutorial. Note that we are only using urban tracts rather than all urban and rural tracts.
How do changes in home value differ between the 1990-2000 period and 2000-2010?
# Load Crosswalk data
crosswalk <- read.csv( "https://raw.githubusercontent.com/DS4PS/cpp-529-master/master/data/cbsatocountycrosswalk.csv", stringsAsFactors=F, colClasses="character" )# search for city names by strings, use the ^ anchor for "begins with"
grep( "^PHI", crosswalk$msaname, value=TRUE ) ## [1] "PHILADELPHIA, PA-NJ" "PHILADELPHIA, PA-NJ" "PHILADELPHIA, PA-NJ"
## [4] "PHILADELPHIA, PA-NJ" "PHILADELPHIA, PA-NJ" "PHILADELPHIA, PA-NJ"
## [7] "PHILADELPHIA, PA-NJ" "PHILADELPHIA, PA-NJ" "PHILADELPHIA, PA-NJ"
## [1] "PHILADELPHIA, PA-NJ" "PHILADELPHIA, PA-NJ" "PHILADELPHIA, PA-NJ"
## [4] "PHILADELPHIA, PA-NJ" "PHILADELPHIA, PA-NJ" "PHILADELPHIA, PA-NJ"
## [7] "PHILADELPHIA, PA-NJ" "PHILADELPHIA, PA-NJ" "PHILADELPHIA, PA-NJ"
# Select crosswalk data for Philadelphia, note that this MSA covers two
#states PA & NJ
these.msp <- crosswalk$msaname == "PHILADELPHIA, PA-NJ"
these.fips <- crosswalk$fipscounty[ these.msp ]
these.fips <- na.omit( these.fips )
state.fips <- substr( these.fips, 1, 2 )
county.fips <- substr( these.fips, 3, 5 )install=TRUE
key <- "da93b9c8b9f4cbfd1869025781a1c2693fe419dd"
census_api_key(key)## To install your API key for use in future sessions, run this function with `install = TRUE`.
# Select population data for Pennsylvania (state code 42), geometry = TRUE
#pulls shapefile for PA
options(tigris_use_cache = TRUE)
msp.pop1 <-
get_acs( geography = "tract", variables = "B01003_001",
state = "42", county = county.fips[state.fips=="42"], geometry =
TRUE ) %>%
select( GEOID, estimate ) %>%
rename( POP=estimate )## Getting data from the 2016-2020 5-year ACS
# Select population data for New Jersey (state code 34), geometry = TRUE
#pulls shapefile for NJ
msp.pop2 <-
get_acs( geography = "tract", variables = "B01003_001",
state = "34", county = county.fips[state.fips=="34"], geometry =
TRUE ) %>%
select( GEOID, estimate ) %>%
rename( POP=estimate )## Getting data from the 2016-2020 5-year ACS
# Join PA and NJ data to form a shapefile with census data from both states
msp.pop <- rbind( msp.pop1, msp.pop2 )d1 <- readRDS( here::here( "data/rodeo/LTDB-1990.rds" ) )
d2 <- readRDS( here::here( "data/rodeo/LTDB-2000.rds" ) )
md <- readRDS( here::here( "data/rodeo/LTDB-META-DATA.rds" ) )
# check to make sure we are not losing
# or gaining observations in the merge
nrow( d1 ) ## [1] 72693
d1 <- select( d1, - year )
d2 <- select( d2, - year )
d <- merge( d1, d2, by="tractid" )
d <- merge( d, md, by="tractid" )
nrow( d )## [1] 72693
table( d$urban )##
## rural urban
## 12971 59722
d <- filter( d, urban == "urban" )# find variables that are in both files
compare_dfs <- function( df1, df2 )
{
# use regular expressions to remove numeric suffixes
var.names.1 <- names( df1 )
var.names.1 <- gsub( "[.][xy]$", "", var.names.1 )
var.names.1 <- gsub( "[0-9]{2}$", "", var.names.1 )
var.names.2 <- names( df2 )
var.names.2 <- gsub( "[.][xy]$", "", var.names.2 )
var.names.2 <- gsub( "[0-9]{2}$", "", var.names.2 )
shared <- intersect( var.names.1, var.names.2 ) %>% sort()
print( "SHARED VARIABLES:")
print( shared )
not.shared <- c( setdiff( var.names.1, var.names.2 ),
setdiff( var.names.2, var.names.1 ) ) %>% sort()
print( "NOT SHARED:" )
print( not.shared )
d.vars1 <- data.frame( type="shared", variables=shared, stringsAsFactors=F )
d.vars2 <- data.frame( type="not shared", variables=not.shared, stringsAsFactors=F )
dd <- rbind( d.vars1, d.vars2 )
return( dd )
}
vars <- compare_dfs( df1=d1, df2=d2 )## [1] "SHARED VARIABLES:"
## [1] "a15asn" "a15blk" "a15hsp" "a15ntv" "a15wht" "a18und" "a60asn"
## [8] "a60blk" "a60hsp" "a60ntv" "a60up" "a60wht" "a75up" "ag15up"
## [15] "ag25up" "ag5up" "ageasn" "ageblk" "agehsp" "agentv" "agewht"
## [22] "asian" "china" "clf" "cni16u" "col" "cuban" "dapov"
## [29] "dbpov" "dflabf" "dfmpov" "dhpov" "dis" "dmulti" "dnapov"
## [36] "dpov" "dwpov" "empclf" "family" "fb" "fhh" "filip"
## [43] "flabf" "geanc" "gefb" "h10yrs" "h30old" "haw" "hh"
## [50] "hha" "hhb" "hhh" "hhw" "hinc" "hinca" "hincb"
## [57] "hinch" "hincw" "hisp" "hs" "hu" "incpc" "india"
## [64] "iranc" "irfb" "itanc" "itfb" "japan" "korea" "lep"
## [71] "manuf" "mar" "mex" "mhmval" "mrent" "multi" "n10imm"
## [78] "n65pov" "napov" "nat" "nbpov" "nfmpov" "nhblk" "nhpov"
## [85] "nhwht" "nnapov" "npov" "ntv" "nwpov" "ohu" "olang"
## [92] "own" "pop" "pr" "prof" "rent" "ruanc" "rufb"
## [99] "scanc" "scfb" "semp" "tractid" "unemp" "vac" "vet"
## [106] "viet" "wds"
## [1] "NOT SHARED:"
## [1] "ag16cv" "ag18cv" "hu00sp" "hu90sp" "ohu00sp" "ohu90sp" "pop90.1"
head(vars) %>% pander()| type | variables |
|---|---|
| shared | a15asn |
| shared | a15blk |
| shared | a15hsp |
| shared | a15ntv |
| shared | a15wht |
| shared | a18und |
d.full <- d # keep a copy so don't have to reload
d <- d.full # story original in case you need to reset anything
d <- select( d, tractid, mhmval00, mhmval90, hinc00,
hu00, own00, rent00,
empclf00, clf00, unemp00, prof00,
dpov00, npov00,
ag25up00, hs00, col00,
pop00.x, nhwht00, nhblk00, hisp00, asian00,
cbsa, cbsaname )
d <-
d %>%
mutate( p.white = 100 * nhwht00 / pop00.x,
p.black = 100 * nhblk00 / pop00.x,
p.hisp = 100 * hisp00 / pop00.x,
p.asian = 100 * asian00 / pop00.x,
p.hs = 100 * (hs00+col00) / ag25up00,
p.col = 100 * col00 / ag25up00,
p.prof = 100 * prof00 / empclf00,
p.unemp = 100 * unemp00 / clf00,
pov.rate = 100 * npov00 / dpov00 )stargazer( d,
type="text",
digits=0,
summary.stat = c("min", "p25","median","mean","p75","max") )##
## ===========================================================
## Statistic Min Pctl(25) Median Mean Pctl(75) Max
## -----------------------------------------------------------
## mhmval00 0 81,600 119,900 144,738 173,894 1,000,001
## mhmval90 0 58,800 86,500 112,399 141,800 500,001
## hinc00 2,499 33,000 43,825 47,657 58,036 200,001
## hu00 0 1,102 1,519 1,570 1,999 11,522
## own00 0 542 902 939 1,289 4,911
## rent00 0 195 398 516 712 8,544
## empclf00 0 1,205 1,756 1,820 2,373 10,334
## clf00 0 1,302 1,865 1,930 2,502 11,251
## unemp00 0 51 87 110 140 6,405
## prof00 0 299 539 637 873 6,610
## dpov00 0 2,671 3,718 3,804 4,871 23,892
## npov00 0 149 304 452 601 5,515
## ag25up00 0 1,763 2,451 2,520 3,224 17,974
## hs00 0 665 1,071 1,155 1,552 8,909
## col00 0 243 492 665 923 9,313
## pop00.x 0 2,751 3,802 3,901 4,976 36,206
## nhwht00 0 1,308 2,514 2,591 3,713 20,619
## nhblk00 0 41 141 522 527 14,039
## hisp00 0 55 153 547 533 13,391
## asian00 0 22 65 189 183 9,491
## p.white 0 47 78 67 91 100
## p.black 0 1 4 14 14 100
## p.hisp 0 2 4 13 15 100
## p.asian 0 1 2 5 5 95
## p.hs 0 67 72 72 77 100
## p.col 0 12 21 26 36 100
## p.prof 0 23 31 34 43 100
## p.unemp 0 3 5 6 8 100
## pov.rate 0 4 9 12 17 100
## -----------------------------------------------------------
Initial conditions in 1990:
# adjust 2000 home values for inflation
mhv.90 <- d$mhmval90 * 1.28855
mhv.00 <- d$mhmval00
mhv.change <- mhv.00 - mhv.90
df <- data.frame( MedianHomeValue1990=mhv.90,
MedianHomeValue2000=mhv.00,
Change.90.to.00=mhv.change )
stargazer( df,
type="text",
digits=0,
summary.stat = c("min", "p25","median","mean","p75","max") )##
## ========================================================================
## Statistic Min Pctl(25) Median Mean Pctl(75) Max
## ------------------------------------------------------------------------
## MedianHomeValue1990 0 75,767 111,460 144,832 182,716 644,276
## MedianHomeValue2000 0 81,600 119,900 144,738 173,894 1,000,001
## Change.90.to.00 -644,276 -19,539 2,673 -94 21,556 1,000,001
## ------------------------------------------------------------------------
hist( mhv.change/1000, breaks=500,
xlim=c(-100,500), yaxt="n", xaxt="n",
xlab="Thousand of US Dollars (adjusted to 2000)", cex.lab=1.5,
ylab="", main="Change in Median Home Value 1990 to 2000",
col="gray20", border="white" )
axis( side=1, at=seq( from=-100, to=500, by=100 ),
labels=paste0( "$", seq( from=-100, to=500, by=100 ), "k" ) )
mean.x <- mean( mhv.change/1000, na.rm=T )
abline( v=mean.x, col="darkorange", lwd=2, lty=2 )
text( x=200, y=1500,
labels=paste0( "Mean = ", scales::dollar( round(1000*mean.x,0)) ),
col="darkorange", cex=1.8, pos=3 )
median.x <- median( mhv.change/1000, na.rm=T )
abline( v=median.x, col="dodgerblue", lwd=2, lty=2 )
text( x=200, y=2000,
labels=paste0( "Median = ", scales::dollar( round(1000*median.x,0)) ),
col="dodgerblue", cex=1.8, pos=3 )
# function to control plot() formatting
jplot <- function( x1, x2, lab1="", lab2="", draw.line=T, ... )
{
plot( x1, x2,
pch=19,
col=gray(0.6, alpha = 0.2),
cex=2.5,
bty = "n",
xlab=lab1,
ylab=lab2, cex.lab=1.5,
... )
if( draw.line==T ){
ok <- is.finite(x1) & is.finite(x2)
lines( lowess(x2[ok]~x1[ok]), col="red", lwd=3 ) }
}Compare 1990 to 2000 distributions:
layout.matrix <- matrix( c( 1,3,
2,3 ),
nrow=2, ncol=2, byrow=T )
layout( mat = layout.matrix,
heights = c(2,2), # Heights of the two rows
widths = c(3,4)) # Widths of the two columns
# layout.show(3)
par( mar=c(4,0,0,2) )
hist( mhv.90/1000, breaks=50,
xlim=c(-200,800), yaxt="n", xaxt="n",
xlab="", cex.lab=1,
ylab="", main="",
col="darkslateblue", border="white" )
axis( side=1, at=seq( from=0, to=1000, by=100 ),
labels=paste0( "$", seq( from=0, to=1000, by=100 ), "k" ) )
abline( v=seq(0,1000,100), lty=2, col="gray80" )
text( 550, 4000, labels="Median Home \nValue in 1990",
col="darkslateblue", cex=1.8 )
hist( mhv.00/1000, breaks=50,
xlim=c(-200,800), yaxt="n", xaxt="n",
xlab="", cex.lab=1,
ylab="", main="",
col="darkslateblue", border="white" )
abline( v=seq(0,1000, 100 ), lty=2, col="gray80" )
text( 550, 3500, labels="Median Home \nValue in 2000",
col="darkslateblue", cex=1.8 )
axis( side=1, at=seq( from=0, to=1000, by=100 ),
labels=paste0( "$", seq( from=0, to=1000, by=100 ), "k" ) )
# data reduction - filter 1,000 observations
df <- data.frame( v90=mhv.90/1000, v00=mhv.00/1000 )
df <- sample_n( df, 1000 )
par( mar=c(4,5,3,2) )
jplot( df$v90, df$v00,
lab1="MHV in 1990", lab2="MHV in 2000",
xlim=c(0,1000), ylim=c(0,1000),
axes=F )
abline( a=0, b=1, lty=2, col="gray" )
axis( side=1, at=seq( from=0, to=1000, by=200 ),
labels=paste0( "$", seq( from=0, to=1000, by=200 ), "k" ) )
axis( side=2, at=seq( from=0, to=1000, by=200 ),
labels=paste0( "$", seq( from=0, to=1000, by=200 ), "k" ) )
If a home worth $10 million increased in value by $100k over ten years it would not be that surprising. If a home worth $50k increased by $100k over the same period that is a growth of 200% and is notable.
The change in value variable only reports absolute change, but does not provide a sense of whether that is a big or small amount for the census tract.
The percent change variable provides some context for growth rates of value in census tracts.
# small initial values are skewing percentages
#
# an average home value below $10k is really low -
# these must be mostly vacant lots?
# interpretation is hard if there were no homes in 2000
# and thus an artificially low MHV. i don't trust cases
# that go from homes worth $10k to regular value
# because it is more likely errors in data or noise
# than meaningful variance
#
# quick filter to remove all of the problematic obs
# but need to go back and see which cases are problematic
mhv.90[ mhv.90 < 10000 ] <- NA
pct.change <- mhv.change / mhv.90
summary( pct.change )## Min. 1st Qu. Median Mean 3rd Qu. Max. NA's
## -1.00000 -0.13890 0.02888 0.07583 0.23156 29.03826 145
sum( pct.change > 5, na.rm=T )## [1] 28
d %>%
filter( pct.change > 5 ) %>%
head()Plot the percent change variable:
hg <-
hist( pct.change, breaks=2000,
xlim=c(-1,2), yaxt="n", xaxt="n",
xlab="", cex.main=1.5,
ylab="", main="Growth in Home Value by Census Tract 1990 to 2000",
col="gray40", border="white" )
axis( side=1, at=seq( from=-1, to=2, by=0.5 ),
labels=paste0( seq( from=-100, to=200, by=50 ), "%" ) )
ymax <- max( hg$count )
mean.x <- mean( pct.change, na.rm=T )
abline( v=mean.x, col="darkorange", lwd=2, lty=2 )
text( x=1, y=(0.5*ymax),
labels=paste0( "Mean = ", round(100*mean.x,0), "%"),
col="darkorange", cex=1.8, pos=4 )
median.x <- median( pct.change, na.rm=T )
abline( v=median.x, col="dodgerblue", lwd=2, lty=2 )
text( x=1, y=(0.6*ymax),
labels=paste0( "Median = ", round(100*median.x,0), "%"),
col="dodgerblue", cex=1.8, pos=4 )
We often want to disagregate descriptives by some grouping in the data, such as metro areas.
dplyr makes this easy by grouping then summarizing the data.
d$mhv.change <- mhv.change
d$pct.change <- pct.change
d$mhv.00 <- mhv.00
d$mhv.90 <- mhv.90
d %>%
group_by( cbsaname ) %>%
summarize( ave.change = median( mhv.change, na.rm=T ),
ave.change.d = dollar( round(ave.change,0) ),
growth = 100 * median( pct.change, na.rm=T ) ) %>%
ungroup() %>%
arrange( - growth ) %>%
select( - ave.change ) %>%
head( 25 ) %>%
pander()Quitting from lines 423-440 (lab_03-anderson.Rmd) Error in
summarize(): ! Problem while computing
ave.change.d = dollar(round(ave.change, 0)). ℹ The error
occurred in group 1: cbsaname = “Abilene, TX”. Caused by error in
dollar(): ! could not find function “dollar” Backtrace: 1.
… %>% pander() 8. dplyr:::summarise.grouped_df(…) 9.
dplyr:::summarise_cols(.data, dplyr_quosures(…), caller_env =
caller_env()) 11. dplyr:::map(quosures, summarise_eval_one, mask = mask)
12. base::lapply(.x, .f, …) 13. dplyr FUN(X[[i]], …) 14.
mask$eval_all_summarise(quo)
The original merged dataset we saved as d.full so we don’t need to reload it:
Recall our data steps thus far:
# adjust 2000 home values for inflation
mhv.90 <- d.full$mhmval90 * 1.28855
mhv.00 <- d.full$mhmval00
mhv.change <- mhv.00 - mhv.90
# small initial values are skewing percentages
#
# an average home value below $10k is really low -
# these must be mostly vacant lots?
mhv.90[ mhv.90 < 10000 ] <- NA
pct.change <- 100 * ( mhv.change / mhv.90 )
summary( pct.change )## Min. 1st Qu. Median Mean 3rd Qu. Max. NA's
## -100.000 -13.890 2.888 7.583 23.156 2903.825 145
d.full$mhv.90 <- mhv.90
d.full$mhv.00 <- mhv.00
d.full$mhv.change <- mhv.change
d.full$pct.change <- pct.changeSelect variables for operationalizing a definition of gentrification.
We need to add some variables from 2000.
Recall we created a 1990 to 2000 variable list for reference:
head(vars)————————————–check variables on year————————————————————-
names(d1)## [1] "tractid" "pop90.x" "nhwht90" "nhblk90" "ntv90" "asian90"
## [7] "hisp90" "haw90" "india90" "china90" "filip90" "japan90"
## [13] "korea90" "viet90" "mex90" "pr90" "cuban90" "hu90"
## [19] "vac90" "ohu90" "a18und90" "a60up90" "a75up90" "agewht90"
## [25] "a15wht90" "a60wht90" "ageblk90" "a15blk90" "a60blk90" "agehsp90"
## [31] "a15hsp90" "a60hsp90" "agentv90" "a15ntv90" "a60ntv90" "ageasn90"
## [37] "a15asn90" "a60asn90" "ag15up90" "mar90" "wds90" "mhmval90"
## [43] "mrent90" "own90" "rent90" "family90" "fhh90" "pop90.y"
## [49] "pop90.1" "ruanc90" "itanc90" "geanc90" "iranc90" "scanc90"
## [55] "rufb90" "itfb90" "gefb90" "irfb90" "scfb90" "fb90"
## [61] "nat90" "n10imm90" "ag5up90" "olang90" "lep90" "ag25up90"
## [67] "hs90" "col90" "clf90" "unemp90" "dflabf90" "flabf90"
## [73] "empclf90" "prof90" "manuf90" "semp90" "ag16cv90" "vet90"
## [79] "cni16u90" "dis90" "dpov90" "npov90" "n65pov90" "dfmpov90"
## [85] "nfmpov90" "dwpov90" "nwpov90" "dbpov90" "nbpov90" "dnapov90"
## [91] "nnapov90" "dhpov90" "nhpov90" "dapov90" "napov90" "incpc90"
## [97] "hu90sp" "h30old90" "ohu90sp" "h10yrs90" "dmulti90" "multi90"
## [103] "hinc90" "hincw90" "hincb90" "hinch90" "hinca90" "hh90"
## [109] "hhw90" "hhb90" "hhh90" "hha90"
names(d2)## [1] "tractid" "pop00.x" "nhwht00" "nhblk00" "ntv00" "asian00"
## [7] "hisp00" "haw00" "india00" "china00" "filip00" "japan00"
## [13] "korea00" "viet00" "mex00" "pr00" "cuban00" "hu00"
## [19] "vac00" "ohu00" "a18und00" "a60up00" "a75up00" "agewht00"
## [25] "a15wht00" "a60wht00" "ageblk00" "a15blk00" "a60blk00" "agehsp00"
## [31] "a15hsp00" "a60hsp00" "agentv00" "a15ntv00" "a60ntv00" "ageasn00"
## [37] "a15asn00" "a60asn00" "family00" "fhh00" "own00" "rent00"
## [43] "pop00.y" "ruanc00" "itanc00" "geanc00" "iranc00" "scanc00"
## [49] "rufb00" "itfb00" "gefb00" "irfb00" "scfb00" "fb00"
## [55] "nat00" "n10imm00" "ag5up00" "olang00" "lep00" "ag25up00"
## [61] "hs00" "col00" "ag15up00" "mar00" "wds00" "clf00"
## [67] "unemp00" "dflabf00" "flabf00" "empclf00" "prof00" "manuf00"
## [73] "semp00" "ag18cv00" "vet00" "cni16u00" "dis00" "dpov00"
## [79] "npov00" "n65pov00" "dfmpov00" "nfmpov00" "dwpov00" "nwpov00"
## [85] "dbpov00" "nbpov00" "dnapov00" "nnapov00" "dhpov00" "nhpov00"
## [91] "dapov00" "napov00" "incpc00" "hu00sp" "h30old00" "ohu00sp"
## [97] "h10yrs00" "dmulti00" "multi00" "hinc00" "hincw00" "hincb00"
## [103] "hinch00" "hinca00" "mhmval00" "mrent00" "hh00" "hhw00"
## [109] "hhb00" "hhh00" "hha00"
d3 <- select( d.full,
tractid, cbsa, cbsaname, # ids / units of analysis
mhv.90, mhv.00, mhv.change, pct.change, # home value
hinc90, hu90, own90, rent90, # ses
hinc00, hu00, own00, rent00,
empclf90, clf90, unemp90, prof90, # employment
empclf00, clf00, unemp00, prof00,
dpov90, npov90, # poverty
dpov00, npov00,
ag25up90, hs90, col90, # education
ag25up00, hs00, col00,
pop90.x, nhwht90, nhblk90, hisp90, asian90, # race
pop00.x, nhwht00, nhblk00, hisp00, asian00
) # end select
d3 <-
d3 %>%
mutate(
# 1990 variables
p.white.90 = 100 * nhwht90 / pop90.x,
p.black.90 = 100 * nhblk90 / pop90.x,
p.hisp.90 = 100 * hisp90 / pop90.x,
p.asian.90 = 100 * asian90 / pop90.x,
p.hs.edu.90 = 100 * (hs90+col90) / ag25up90,
p.col.edu.90 = 100 * col90 / ag25up00,
p.prof.90 = 100 * prof90 / empclf90,
p.unemp.90 = 100 * unemp90 / clf90,
pov.rate.90 = 100 * npov90 / dpov90,
# 2000 variables
p.white.00 = 100 * nhwht00 / pop00.x,
p.black.00 = 100 * nhblk00 / pop00.x,
p.hisp.00 = 100 * hisp00 / pop00.x,
p.asian.00 = 100 * asian00 / pop00.x,
p.hs.edu.00 = 100 * (hs00+col00) / ag25up00,
p.col.edu.00 = 100 * col00 / ag25up00,
p.prof.00 = 100 * prof00 / empclf00,
p.unemp.00 = 100 * unemp00 / clf00,
pov.rate.00 = 100 * npov00 / dpov00 )
d3 <-
d3 %>%
group_by( cbsaname ) %>%
mutate( metro.mhv.pct.90 = ntile( mhv.90, 100 ),
metro.mhv.pct.00 = ntile( mhv.00, 100 ),
metro.median.pay.90 = median( hinc90, na.rm=T ),
metro.median.pay.00 = median( hinc00, na.rm=T ),
metro.race.rank.90 = ntile( (100-p.white.90), 100 ) ) %>%
ungroup() %>%
mutate( metro.mhv.pct.change = metro.mhv.pct.00 - metro.mhv.pct.90,
pay.change = metro.median.pay.00 - metro.median.pay.90,
race.change = p.white.00 - p.white.90,
mhv.change = mhv.00 - mhv.90 )Descriptive Statistics of Change Variables:
d3 <-
d3 %>%
select( c( "tractid", "cbsa", "cbsaname",
"mhv.90", "mhv.00", "mhv.change","pct.change",
"p.white.90", "p.black.90", "p.hisp.90", "p.asian.90",
"p.hs.edu.90", "p.col.edu.90", "p.prof.90", "p.unemp.90",
"pov.rate.90", "p.white.00", "p.black.00", "p.hisp.00",
"p.asian.00", "p.hs.edu.00", "p.col.edu.00", "p.prof.00",
"p.unemp.00", "pov.rate.00", "metro.mhv.pct.90",
"metro.mhv.pct.00", "metro.median.pay.90", "metro.median.pay.00",
"metro.mhv.pct.change", "pay.change", "race.change",
"metro.race.rank.90") ) # head( d3 ) %>% pander()
d3 <- data.frame(d3)
stargazer( d3,
type="text",
digits=0,
summary.stat = c("min", "p25","median","mean","p75","max") )##
## =========================================================================
## Statistic Min Pctl(25) Median Mean Pctl(75) Max
## -------------------------------------------------------------------------
## mhv.90 11,433 76,024 111,588 145,184 182,974 644,276
## mhv.00 0 81,600 119,900 144,738 173,894 1,000,001
## mhv.change -644,276 -19,626 2,610 -578 21,451 964,566
## pct.change -100 -14 3 8 23 2,904
## p.white.90 0 64 87 74 95 100
## p.black.90 0 1 3 12 10 100
## p.hisp.90 0 1 3 10 9 100
## p.asian.90 0 0 1 3 3 94
## p.hs.edu.90 0 69 74 74 80 100
## p.col.edu.90 0 8 14 Inf 25 Inf
## p.prof.90 0 17 25 27 34 100
## p.unemp.90 0 4 5 7 8 64
## pov.rate.90 0 4 8 12 16 100
## p.white.00 0 47 78 67 91 100
## p.black.00 0 1 4 14 14 100
## p.hisp.00 0 2 4 13 15 100
## p.asian.00 0 1 2 5 5 95
## p.hs.edu.00 0 67 72 72 77 100
## p.col.edu.00 0 12 21 26 36 100
## p.prof.00 0 23 31 34 43 100
## p.unemp.00 0 3 5 6 8 100
## pov.rate.00 0 4 9 12 17 100
## metro.mhv.pct.90 1 20 41 45 68 100
## metro.mhv.pct.00 1 20 41 45 68 100
## metro.median.pay.90 14,871 28,906 32,457 32,924 35,833 52,374
## metro.median.pay.00 23,012 39,457 43,139 45,054 49,522 73,701
## metro.mhv.pct.change -99 -5 0 0 6 99
## pay.change 4,930 9,775 11,441 12,130 14,001 26,211
## race.change -100 -12 -5 -8 -2 100
## metro.race.rank.90 1 20 41 45 68 100
## -------------------------------------------------------------------------
Which definition did you select for gentrification, and how would you operationalize it?
# income
# percent white
# home values absolute
# home value relative to metro
# education stats ?
# employment stats ?
# income stats ?
# growth of pop per tract (density) ?
# home value in lower than average home in a metro in 2000
poor.1990 <- d3$metro.mhv.pct.90 < 50
# above average diversity for metro
diverse.1990 <- d3$metro.race.rank.90 > 50
# home values increased more than overall city gains
# change in percentile rank within the metro
mhv.pct.increase <- d3$metro.mhv.pct.change > 0
# faster than average growth
# 25% growth in value is median for the country
home.val.rise <- d3$pct.change > 25
# proportion of whites increases by more than 3 percent
# measured by increase in white
loss.diversity <- d3$race.change > 3
g.flag <- poor.1990 & diverse.1990 & mhv.pct.increase & home.val.rise & loss.diversity
num.candidates <- sum( poor.1990 & diverse.1990, na.rm=T )
num.gentrified <- sum( g.flag, na.rm=T )
num.gentrified## [1] 394
num.candidates## [1] 17560
num.gentrified / num.candidates## [1] 0.02243736
By this definition only 2.24 percent of urban tracts experience gentrification between 1990 and 2000.
This might skew numbers?
# small initial values are skewing percentages
#
# an average home value below $10k is really low -
# these must be mostly vacant lots?
mhv.90[ mhv.90 < 1000 ] <- NA
pct.change <- 100 * ( mhv.change / mhv.90 )
summary( pct.change )## Min. 1st Qu. Median Mean 3rd Qu. Max. NA's
## -100.000 -13.890 2.888 7.583 23.156 2903.825 145
What do you think? Is that the right way to operationalize it? Do you care about relative diversity (more diverse neighborhood than rest of the city) or absolute (percentage of non-whites regardless of city diversity).
Do we care about absolute increase in value, or relative to the national average? The national average would include all of the gentrifying tracts, which could skew it upward and thus make it a poor benchmark? Maybe look at the average increase in value for non-gentrification candadidates?
You will need to pick one or more cities that you can use as examples in your report. It is strongly recommended that you create a dorling cartogram for reporting since Census tracts introduce visual bias by over-empasizing lower density tracts and hiding a lot of the data where the greatest number of people reside in the city. Recall that Dorling cartograms correct for this by re-sizing administrative units proportion to the size of the population they represent.
#devtools::install_github( "sjewo/cartogram" )
#install.packages( "tmap" )
library( geojsonio ) # read geoJSON map files from GitHub## Warning: package 'geojsonio' was built under R version 4.2.2
## Registered S3 method overwritten by 'geojsonsf':
## method from
## print.geojson geojson
##
## Attaching package: 'geojsonio'
## The following object is masked from 'package:base':
##
## pretty
library( sp ) # spatial data class sp for shapefiles
library( cartogram ) # spatial maps w/ tract size bias reduction## Warning: package 'cartogram' was built under R version 4.2.2
library( tmap ) # thematic maps## Warning: package 'tmap' was built under R version 4.2.2
library( maptools ) # spatial object manipulation ## Checking rgeos availability: TRUE
## Please note that 'maptools' will be retired by the end of 2023,
## plan transition at your earliest convenience;
## some functionality will be moved to 'sp'.
library( sf ) # 'simple features' flavor of shapefiles# we have phoenix already packaged and on GitHub for easy load:
github.url <- "https://raw.githubusercontent.com/DS4PS/cpp-529-master/master/data/phx_dorling.geojson"
phx <- geojson_read( x=github.url, what="sp" )
plot( phx )# create small dataframe for the merge
df <- data.frame( tractid=d$tractid,
mhv.90, mhv.00, mhv.change, pct.change )
# create GEOID that matches GIS format
# create a geoID for merging by tract
df$GEOID <- substr( df$tractid, 6, 18 ) # extract codes
df$GEOID <- gsub( "-", "", df$GEOID ) # remove hyphens
class( df$GEOID )## [1] "character"
head(df$GEOID)## [1] "01001020100" "01001020200" "01001020300" "01001020400" "01001020500"
## [6] "01001020600"
Note the current class of the PHX shapefile IDs:
GEOID2 is numeric, has leading zero dropped GEOID is a factor that retains leading zeros You can either convert the data frame ID to numeric to drop the leading zero and merge with GEOID2.
Or keep it as a character vector and merge with GEOID.
head( phx@data ) # sp class from GIS file, so data frame is located @data# merge census data with dorling map
nrow( phx ) # check dimensions## [1] 913
phx <- merge( phx, df, by.x="GEOID", by.y="GEOID" )
# make sure they have not changed or
# you are missing rows in your data frame
# or merging with the wrong ID
nrow( phx ) ## [1] 913
phx <- spTransform( phx, CRS("+init=epsg:3395") )
bb <- st_bbox( c( xmin = -12519146, xmax = -12421368,
ymax = 3965924, ymin = 3899074 ),
crs = st_crs("+init=epsg:3395")) ## Warning in CPL_crs_from_input(x): GDAL Message 1: +init=epsg:XXXX syntax is
## deprecated. It might return a CRS with a non-EPSG compliant axis order.
tm_shape( phx, bbox=bb ) +
tm_polygons( col="mhv.00", n=10, style="quantile", palette="Spectral" ) +
tm_layout( "Dorling Cartogram", title.position=c("right","top") )
tm_shape( phx, bbox=bb ) +
tm_polygons( col="mhv.change", n=10, style="quantile", palette="Spectral" ) +
tm_layout( "Dorling Cartogram", title.position=c("right","top") )## Variable(s) "mhv.change" contains positive and negative values, so midpoint is set to 0. Set midpoint = NA to show the full spectrum of the color palette.
tm_shape( phx, bbox=bb ) +
tm_polygons( col="pct.change", n=10, style="quantile", palette="Spectral" ) +
tm_layout( "Dorling Cartogram", title.position=c("right","top") )## Variable(s) "pct.change" contains positive and negative values, so midpoint is set to 0. Set midpoint = NA to show the full spectrum of the color palette.
You were asked to come up with a way to conceptualize gentrification.
This week you will create a new variable in your dataset that allows you to operationalize gentrification and examine its prevelance in the data. How many census tracts are candidates (start out at a low income level with high diversity)? And of those how many have transitioned into advanced stages of gentrification?
Provide an explanation and justification of the way you measure gentrification in the data.
Histograms and scatterplots help us understand the statistical properties and relationships between variables in our dataset. Due to the nature of cities, these relationships are all shaped by their location in the city. Geography matters a great deal in urban policy, so it should not be ignored.
Simple simple choropleth maps can be extremely helpful in understanding relationships in the data.
Revisit one of the labs from CPP 529 and prepare a Dorling Cartogram shapefile for your project. You can select one or more cities of your choice (you can use the same city if you like).
Follow the instructions for saving your cartogram as a GeoJSON file. These files are nice because they can be stored on GitHub and read directly into R. If you create it right the first time you can use it over and over after that.
Please create the following choropleth maps and report your main findings:
Home Values
Gentrification
Do you find any meaningful patterns in where gentrification occurs?
# Load packages
library( here ) # use relative file paths
library(tidyverse) # data wrangling
library( tidycensus ) # pull census data
library( sp ) # work with shapefiles
library( sf ) # work with shapefiles - simple features format
library(RColorBrewer) # color shapefile
library(zoom) # enlarge plot
library( geojsonio ) # read shapefiles
library( sp ) # work with shapefiles
library( sf ) # work with shapefiles - simple features format
library( mclust ) # cluster analysis ## Warning: package 'mclust' was built under R version 4.2.2
## Package 'mclust' version 6.0.0
## Type 'citation("mclust")' for citing this R package in publications.
##
## Attaching package: 'mclust'
## The following object is masked from 'package:purrr':
##
## map
library( tmap ) # theme maps
library( ggplot2 ) # graphing
library( ggthemes ) # nice formats for ggplots## Warning: package 'ggthemes' was built under R version 4.2.2
library( dplyr ) # data wrangling
library( pander ) # formatting RMD tables
library( tidycensus )
library( cartogram ) # spatial maps w/ tract size bias reduction
library( maptools ) # spatial object manipulation # Load Crosswalk data
crosswalk <- read.csv( "https://raw.githubusercontent.com/DS4PS/cpp-529-master/master/data/cbsatocountycrosswalk.csv", stringsAsFactors=F, colClasses="character" )
# search for city names by strings, use the ^ anchor for "begins with"
grep( "^ATL", crosswalk$msaname, value=TRUE ) ## [1] "ATLANTA, GA" "ATLANTA, GA" "ATLANTA, GA"
## [4] "ATLANTA, GA" "ATLANTA, GA" "ATLANTA, GA"
## [7] "ATLANTA, GA" "ATLANTA, GA" "ATLANTA, GA"
## [10] "ATLANTA, GA" "ATLANTA, GA" "ATLANTA, GA"
## [13] "ATLANTA, GA" "ATLANTA, GA" "ATLANTA, GA"
## [16] "ATLANTA, GA" "ATLANTA, GA" "ATLANTA, GA"
## [19] "ATLANTA, GA" "ATLANTA, GA" "ATLANTIC-CAPE MAY, NJ"
## [22] "ATLANTIC-CAPE MAY, NJ"
# Select crosswalk data for Atlanta
these.msp <- crosswalk$msaname == "ATLANTA, GA"
these.fips <- crosswalk$fipscounty[ these.msp ]
these.fips <- na.omit( these.fips )
state.fips <- substr( these.fips, 1, 2 )
county.fips <- substr( these.fips, 3, 5 )#census_api_key(key)
# Select population data for Atlanta (state code 42), geometry = TRUE
#pulls shapefile for GA
msp.pop <-
get_acs( geography = "tract", variables = "B01003_001",
state = "13", county = county.fips[state.fips=="13"], geometry =
TRUE ) %>%
select( GEOID, estimate ) %>%
rename( POP=estimate )## Getting data from the 2016-2020 5-year ACS
# Load rodeo data set
census.dat <- readRDS(here("data/rodeo/ltdb-2010.rds"))
# Extract state codes
census.dat$GEOID <- substr( census.dat$tractid, 6, 18 ) # extract codes
census.dat$GEOID <- gsub( "-", "", census.dat$GEOID ) # remove hyphens
#class( census.dat$GEOID )
# Merge shapefile and census data with ltdb rodeo dataframe
#msp <- merge( msp.pop, census.dat, by.x="GEOID", by.y="GEOID" )
msp <- merge( msp.pop, census.dat, by.x="GEOID", by.y="GEOID" )
# Filter out empty parts of shapefile
msp <- msp[ ! st_is_empty( msp ) , ]
class(msp)## [1] "sf" "data.frame"
# convert shapefile map object to a spatial version
msp.sp <- as_Spatial( msp )
# Check that it is a SpatialPolygonsDataFrame
class( msp.sp )## [1] "SpatialPolygonsDataFrame"
## attr(,"package")
## [1] "sp"
plot( msp.sp )# Use RColorBrewer to create color pallette to show median home value
#distribution
my_colors <- brewer.pal(9, "Blues")
my_colors <- colorRampPalette(my_colors)(10)
# Attribute the appropriate color to each tract of the Philadelphia area
#based on median home value in 2010
class_of_city <- cut(msp.sp@data$mhmval12, 10)
my_colors <- my_colors[as.numeric(class_of_city)]
# Plot to view, can change bg color code:
#https://txwes.libguides.com/c.php?g=978475&p=7075536
plot(msp.sp, col=my_colors , bg = "#FFFFFF")
# project map and remove empty tracts
msp.sp <- spTransform( msp.sp, CRS("+init=epsg:3395"))
msp.sp <- msp.sp[ msp.sp$POP != 0 & (! is.na( msp.sp$POP )) , ]
# convert census tract polygons to dorling cartogram
# no idea why k=0.03 works, but it does - default is k=5
msp.sp$pop.w <- msp.sp$POP / 9000 # max(msp.sp$POP) # standardizes it to max of 1.5
msp_dorling <- cartogram_dorling( x=msp.sp, weight="pop.w", k=0.05 )
plot( msp_dorling )
tm_shape( msp_dorling ) +
tm_polygons( size="POP", col="hinc12", n=7, style="quantile", palette="Spectral" )