Lab 03 - Discriptive Analysis

Inferential analysis is never a completely linear process. It involves many iterations of cleaning data, describing data, visualizing data, and modeling data until you are confident about the process and the results.

Part I

For Part 01 of the lab, reproduce the descriptive analysis demonstrated in the tutorial but do it for periods 1990 to 2000 instead of the 2000 to 2010 period used in the tutorial. Note that we are only using urban tracts rather than all urban and rural tracts.

How do changes in home value differ between the 1990-2000 period and 2000-2010?

Data

# Load Crosswalk data

crosswalk <- read.csv( "https://raw.githubusercontent.com/DS4PS/cpp-529-master/master/data/cbsatocountycrosswalk.csv",  stringsAsFactors=F, colClasses="character" )


# search for city names by strings, use the ^ anchor for "begins with" 
grep( "^PHI", crosswalk$msaname, value=TRUE ) 
## [1] "PHILADELPHIA, PA-NJ" "PHILADELPHIA, PA-NJ" "PHILADELPHIA, PA-NJ"
## [4] "PHILADELPHIA, PA-NJ" "PHILADELPHIA, PA-NJ" "PHILADELPHIA, PA-NJ"
## [7] "PHILADELPHIA, PA-NJ" "PHILADELPHIA, PA-NJ" "PHILADELPHIA, PA-NJ"
## [1] "PHILADELPHIA, PA-NJ" "PHILADELPHIA, PA-NJ" "PHILADELPHIA, PA-NJ"
## [4] "PHILADELPHIA, PA-NJ" "PHILADELPHIA, PA-NJ" "PHILADELPHIA, PA-NJ"
## [7] "PHILADELPHIA, PA-NJ" "PHILADELPHIA, PA-NJ" "PHILADELPHIA, PA-NJ"

# Select crosswalk data for Philadelphia, note that this MSA covers two 
#states PA & NJ

these.msp <- crosswalk$msaname == "PHILADELPHIA, PA-NJ"
these.fips <- crosswalk$fipscounty[ these.msp ]
these.fips <- na.omit( these.fips )
state.fips <- substr( these.fips, 1, 2 )
county.fips <- substr( these.fips, 3, 5 )


install=TRUE
 key <- "da93b9c8b9f4cbfd1869025781a1c2693fe419dd"
census_api_key(key)
## To install your API key for use in future sessions, run this function with `install = TRUE`.
# Select population data for Pennsylvania (state code 42), geometry = TRUE 
#pulls shapefile for PA
options(tigris_use_cache = TRUE)
msp.pop1 <-
get_acs( geography = "tract", variables = "B01003_001",
         state = "42", county = county.fips[state.fips=="42"], geometry =
TRUE ) %>%
         select( GEOID, estimate ) %>%
         rename( POP=estimate )
## Getting data from the 2016-2020 5-year ACS
# Select population data for New Jersey (state code 34), geometry = TRUE 
#pulls shapefile for NJ
msp.pop2 <-
get_acs( geography = "tract", variables = "B01003_001",
         state = "34", county = county.fips[state.fips=="34"], geometry =
TRUE ) %>%
         select( GEOID, estimate ) %>%
         rename( POP=estimate )
## Getting data from the 2016-2020 5-year ACS
# Join PA and NJ data to form a shapefile with census data from both states
msp.pop <- rbind( msp.pop1, msp.pop2 )


d1 <- readRDS( here::here( "data/rodeo/LTDB-1990.rds" ) )
d2 <- readRDS( here::here( "data/rodeo/LTDB-2000.rds" ) )
md <- readRDS( here::here( "data/rodeo/LTDB-META-DATA.rds" ) )

# check to make sure we are not losing 
# or gaining observations in the merge
nrow( d1 ) 
## [1] 72693


d1 <- select( d1, - year )
d2 <- select( d2, - year )

d <- merge( d1, d2, by="tractid" )
d <- merge( d, md, by="tractid" )

nrow( d )
## [1] 72693


Filter Rural Districts

table( d$urban )
## 
## rural urban 
## 12971 59722
d <- filter( d, urban == "urban" )


Identify Common Variables

# find variables that are in both files
compare_dfs <- function( df1, df2 )
{
  # use regular expressions to remove numeric suffixes 
  var.names.1 <- names( df1 )
  var.names.1 <- gsub( "[.][xy]$", "", var.names.1 )
  var.names.1 <- gsub( "[0-9]{2}$", "", var.names.1 )
  
  var.names.2 <- names( df2 )
  var.names.2 <- gsub( "[.][xy]$", "", var.names.2 )
  var.names.2 <- gsub( "[0-9]{2}$", "", var.names.2 )
  
  shared <- intersect( var.names.1, var.names.2 ) %>% sort()
  print( "SHARED VARIABLES:")
  print( shared )
  
  not.shared <- c( setdiff( var.names.1, var.names.2 ),
                   setdiff( var.names.2, var.names.1 ) ) %>% sort()
  
  print( "NOT SHARED:" )
  print( not.shared )
  
  d.vars1 <- data.frame( type="shared", variables=shared, stringsAsFactors=F )
  d.vars2 <- data.frame( type="not shared", variables=not.shared, stringsAsFactors=F )
  dd <- rbind( d.vars1, d.vars2 )
  
  return( dd )
}

vars <- compare_dfs( df1=d1, df2=d2 )
## [1] "SHARED VARIABLES:"
##   [1] "a15asn"  "a15blk"  "a15hsp"  "a15ntv"  "a15wht"  "a18und"  "a60asn" 
##   [8] "a60blk"  "a60hsp"  "a60ntv"  "a60up"   "a60wht"  "a75up"   "ag15up" 
##  [15] "ag25up"  "ag5up"   "ageasn"  "ageblk"  "agehsp"  "agentv"  "agewht" 
##  [22] "asian"   "china"   "clf"     "cni16u"  "col"     "cuban"   "dapov"  
##  [29] "dbpov"   "dflabf"  "dfmpov"  "dhpov"   "dis"     "dmulti"  "dnapov" 
##  [36] "dpov"    "dwpov"   "empclf"  "family"  "fb"      "fhh"     "filip"  
##  [43] "flabf"   "geanc"   "gefb"    "h10yrs"  "h30old"  "haw"     "hh"     
##  [50] "hha"     "hhb"     "hhh"     "hhw"     "hinc"    "hinca"   "hincb"  
##  [57] "hinch"   "hincw"   "hisp"    "hs"      "hu"      "incpc"   "india"  
##  [64] "iranc"   "irfb"    "itanc"   "itfb"    "japan"   "korea"   "lep"    
##  [71] "manuf"   "mar"     "mex"     "mhmval"  "mrent"   "multi"   "n10imm" 
##  [78] "n65pov"  "napov"   "nat"     "nbpov"   "nfmpov"  "nhblk"   "nhpov"  
##  [85] "nhwht"   "nnapov"  "npov"    "ntv"     "nwpov"   "ohu"     "olang"  
##  [92] "own"     "pop"     "pr"      "prof"    "rent"    "ruanc"   "rufb"   
##  [99] "scanc"   "scfb"    "semp"    "tractid" "unemp"   "vac"     "vet"    
## [106] "viet"    "wds"    
## [1] "NOT SHARED:"
## [1] "ag16cv"  "ag18cv"  "hu00sp"  "hu90sp"  "ohu00sp" "ohu90sp" "pop90.1"
head(vars) %>% pander()
type variables
shared a15asn
shared a15blk
shared a15hsp
shared a15ntv
shared a15wht
shared a18und


Create Data Set for Analysis

d.full <- d  # keep a copy so don't have to reload 
d <- d.full  # story original in case you need to reset anything

d <- select( d, tractid, mhmval00, mhmval90, hinc00, 
             hu00, own00, rent00,  
             empclf00, clf00, unemp00, prof00,  
             dpov00, npov00,
             ag25up00, hs00, col00, 
             pop00.x, nhwht00, nhblk00, hisp00, asian00,
             cbsa, cbsaname )
d <- 
  d %>%
  mutate( p.white = 100 * nhwht00 / pop00.x,
          p.black = 100 * nhblk00 / pop00.x,
          p.hisp = 100 * hisp00 / pop00.x, 
          p.asian = 100 * asian00 / pop00.x,
          p.hs = 100 * (hs00+col00) / ag25up00,
          p.col = 100 * col00 / ag25up00,
          p.prof = 100 * prof00 / empclf00,
          p.unemp = 100 * unemp00 / clf00,
          pov.rate = 100 * npov00 / dpov00 )


stargazer( d, 
           type="text", 
           digits=0,
           summary.stat = c("min", "p25","median","mean","p75","max") )
## 
## ===========================================================
## Statistic  Min  Pctl(25) Median   Mean   Pctl(75)    Max   
## -----------------------------------------------------------
## mhmval00    0    81,600  119,900 144,738 173,894  1,000,001
## mhmval90    0    58,800  86,500  112,399 141,800   500,001 
## hinc00    2,499  33,000  43,825  47,657   58,036   200,001 
## hu00        0    1,102    1,519   1,570   1,999    11,522  
## own00       0     542      902     939    1,289     4,911  
## rent00      0     195      398     516     712      8,544  
## empclf00    0    1,205    1,756   1,820   2,373    10,334  
## clf00       0    1,302    1,865   1,930   2,502    11,251  
## unemp00     0      51      87      110     140      6,405  
## prof00      0     299      539     637     873      6,610  
## dpov00      0    2,671    3,718   3,804   4,871    23,892  
## npov00      0     149      304     452     601      5,515  
## ag25up00    0    1,763    2,451   2,520   3,224    17,974  
## hs00        0     665     1,071   1,155   1,552     8,909  
## col00       0     243      492     665     923      9,313  
## pop00.x     0    2,751    3,802   3,901   4,976    36,206  
## nhwht00     0    1,308    2,514   2,591   3,713    20,619  
## nhblk00     0      41      141     522     527     14,039  
## hisp00      0      55      153     547     533     13,391  
## asian00     0      22      65      189     183      9,491  
## p.white     0      47      78      67       91       100   
## p.black     0      1        4      14       14       100   
## p.hisp      0      2        4      13       15       100   
## p.asian     0      1        2       5       5        95    
## p.hs        0      67      72      72       77       100   
## p.col       0      12      21      26       36       100   
## p.prof      0      23      31      34       43       100   
## p.unemp     0      3        5       6       8        100   
## pov.rate    0      4        9      12       17       100   
## -----------------------------------------------------------

Exploration of Median Home Values

Initial conditions in 1990:

# adjust 2000 home values for inflation 
mhv.90 <- d$mhmval90 * 1.28855  
mhv.00 <- d$mhmval00

mhv.change <- mhv.00 - mhv.90

df <- data.frame( MedianHomeValue1990=mhv.90, 
                  MedianHomeValue2000=mhv.00, 
                  Change.90.to.00=mhv.change )

stargazer( df, 
           type="text", 
           digits=0, 
           summary.stat = c("min", "p25","median","mean","p75","max") )
## 
## ========================================================================
## Statistic             Min    Pctl(25) Median   Mean   Pctl(75)    Max   
## ------------------------------------------------------------------------
## MedianHomeValue1990    0      75,767  111,460 144,832 182,716   644,276 
## MedianHomeValue2000    0      81,600  119,900 144,738 173,894  1,000,001
## Change.90.to.00     -644,276 -19,539   2,673    -94    21,556  1,000,001
## ------------------------------------------------------------------------


Histogram of MHV

hist( mhv.change/1000, breaks=500, 
      xlim=c(-100,500), yaxt="n", xaxt="n",
      xlab="Thousand of US Dollars (adjusted to 2000)", cex.lab=1.5,
      ylab="", main="Change in Median Home Value 1990 to 2000",
      col="gray20", border="white" )

axis( side=1, at=seq( from=-100, to=500, by=100 ), 
      labels=paste0( "$", seq( from=-100, to=500, by=100 ), "k" ) )
        
mean.x <- mean( mhv.change/1000, na.rm=T )
abline( v=mean.x, col="darkorange", lwd=2, lty=2 )
text( x=200, y=1500, 
      labels=paste0( "Mean = ", scales::dollar( round(1000*mean.x,0)) ), 
      col="darkorange", cex=1.8, pos=3 )

median.x <- median( mhv.change/1000, na.rm=T )
abline( v=median.x, col="dodgerblue", lwd=2, lty=2 )
text( x=200, y=2000, 
      labels=paste0( "Median = ", scales::dollar( round(1000*median.x,0)) ), 
      col="dodgerblue", cex=1.8, pos=3 )


# function to control plot() formatting 
jplot <- function( x1, x2, lab1="", lab2="", draw.line=T, ... )
{

    plot( x1, x2,
          pch=19, 
          col=gray(0.6, alpha = 0.2), 
          cex=2.5,  
          bty = "n",
          xlab=lab1, 
          ylab=lab2, cex.lab=1.5,
        ... )

    if( draw.line==T ){ 
        ok <- is.finite(x1) & is.finite(x2)
        lines( lowess(x2[ok]~x1[ok]), col="red", lwd=3 ) }

}

Compare 1990 to 2000 distributions:

layout.matrix <- matrix( c( 1,3,
                            2,3 ), 
                nrow=2, ncol=2, byrow=T )

layout( mat = layout.matrix,
        heights = c(2,2), # Heights of the two rows
        widths =  c(3,4)) # Widths of the two columns

# layout.show(3)

par( mar=c(4,0,0,2) )

hist( mhv.90/1000, breaks=50, 
      xlim=c(-200,800), yaxt="n", xaxt="n",
      xlab="", cex.lab=1,
      ylab="", main="",
      col="darkslateblue", border="white" )

axis( side=1, at=seq( from=0, to=1000, by=100 ), 
      labels=paste0( "$", seq( from=0, to=1000, by=100 ), "k" ) )

abline( v=seq(0,1000,100), lty=2, col="gray80" )

text( 550, 4000, labels="Median Home \nValue in 1990", 
      col="darkslateblue", cex=1.8 )



hist( mhv.00/1000, breaks=50, 
      xlim=c(-200,800), yaxt="n", xaxt="n",
      xlab="", cex.lab=1,
      ylab="", main="",
      col="darkslateblue", border="white" )

abline( v=seq(0,1000, 100 ), lty=2, col="gray80" )

text( 550, 3500, labels="Median Home \nValue in 2000", 
      col="darkslateblue", cex=1.8 )

axis( side=1, at=seq( from=0, to=1000, by=100 ), 
      labels=paste0( "$", seq( from=0, to=1000, by=100 ), "k" ) )


# data reduction - filter 1,000 observations

df <- data.frame( v90=mhv.90/1000, v00=mhv.00/1000 )
df <- sample_n( df, 1000 )

par( mar=c(4,5,3,2) )

jplot( df$v90, df$v00, 
       lab1="MHV in 1990", lab2="MHV in 2000",
       xlim=c(0,1000), ylim=c(0,1000),
       axes=F )

abline( a=0, b=1, lty=2, col="gray" )
axis( side=1, at=seq( from=0, to=1000, by=200 ), 
      labels=paste0( "$", seq( from=0, to=1000, by=200 ), "k" ) )
axis( side=2, at=seq( from=0, to=1000, by=200 ), 
      labels=paste0( "$", seq( from=0, to=1000, by=200 ), "k" ) )


Change in MHV from 1990 to 2000

If a home worth $10 million increased in value by $100k over ten years it would not be that surprising. If a home worth $50k increased by $100k over the same period that is a growth of 200% and is notable.

The change in value variable only reports absolute change, but does not provide a sense of whether that is a big or small amount for the census tract.

The percent change variable provides some context for growth rates of value in census tracts.

# small initial values are skewing percentages
#
# an average home value below $10k is really low -
# these must be mostly vacant lots?

# interpretation is hard if there were no homes in 2000
# and thus an artificially low MHV. i don't trust cases
# that go from homes worth $10k to regular value
# because it is more likely errors in data or noise
# than meaningful variance 
#
# quick filter to remove all of the problematic obs
# but need to go back and see which cases are problematic


mhv.90[ mhv.90 < 10000 ] <- NA
pct.change <- mhv.change / mhv.90
summary( pct.change )
##     Min.  1st Qu.   Median     Mean  3rd Qu.     Max.     NA's 
## -1.00000 -0.13890  0.02888  0.07583  0.23156 29.03826      145
sum( pct.change > 5, na.rm=T )
## [1] 28
d %>% 
  filter( pct.change > 5 ) %>% 
  head()


Plot the percent change variable:

hg <-
hist( pct.change, breaks=2000, 
      xlim=c(-1,2), yaxt="n", xaxt="n",
      xlab="", cex.main=1.5,
      ylab="", main="Growth in Home Value by Census Tract 1990 to 2000",
      col="gray40", border="white" )

axis( side=1, at=seq( from=-1, to=2, by=0.5 ), 
      labels=paste0( seq( from=-100, to=200, by=50 ), "%" ) )

ymax <- max( hg$count )
        
mean.x <- mean( pct.change, na.rm=T )
abline( v=mean.x, col="darkorange", lwd=2, lty=2 )
text( x=1, y=(0.5*ymax), 
      labels=paste0( "Mean = ", round(100*mean.x,0), "%"), 
      col="darkorange", cex=1.8, pos=4 )

median.x <- median( pct.change, na.rm=T )
abline( v=median.x, col="dodgerblue", lwd=2, lty=2 )
text( x=1, y=(0.6*ymax), 
      labels=paste0( "Median = ", round(100*median.x,0), "%"), 
      col="dodgerblue", cex=1.8, pos=4 )


Group Growth Rates By Metro Area

We often want to disagregate descriptives by some grouping in the data, such as metro areas.

dplyr makes this easy by grouping then summarizing the data.

d$mhv.change <- mhv.change 
d$pct.change <- pct.change
d$mhv.00 <- mhv.00
d$mhv.90 <- mhv.90

d %>%
  group_by( cbsaname ) %>%
  summarize( ave.change = median( mhv.change, na.rm=T ),
             ave.change.d = dollar( round(ave.change,0) ),
             growth = 100 * median( pct.change, na.rm=T ) ) %>%
  ungroup() %>%
  arrange( - growth ) %>%
  select( - ave.change ) %>% 
  head( 25 ) %>%
  pander()

Quitting from lines 423-440 (lab_03-anderson.Rmd) Error in summarize(): ! Problem while computing ave.change.d = dollar(round(ave.change, 0)). ℹ The error occurred in group 1: cbsaname = “Abilene, TX”. Caused by error in dollar(): ! could not find function “dollar” Backtrace: 1. … %>% pander() 8. dplyr:::summarise.grouped_df(…) 9. dplyr:::summarise_cols(.data, dplyr_quosures(…), caller_env = caller_env()) 11. dplyr:::map(quosures, summarise_eval_one, mask = mask) 12. base::lapply(.x, .f, …) 13. dplyr FUN(X[[i]], …) 14. mask$eval_all_summarise(quo)

Measuring Gentrification

The original merged dataset we saved as d.full so we don’t need to reload it:

Recall our data steps thus far:

# adjust 2000 home values for inflation 
mhv.90 <- d.full$mhmval90 * 1.28855  
mhv.00 <- d.full$mhmval00

mhv.change <- mhv.00 - mhv.90

# small initial values are skewing percentages
#
# an average home value below $10k is really low -
# these must be mostly vacant lots?

mhv.90[ mhv.90 < 10000 ] <- NA
pct.change <- 100 * ( mhv.change / mhv.90 )
summary( pct.change )
##     Min.  1st Qu.   Median     Mean  3rd Qu.     Max.     NA's 
## -100.000  -13.890    2.888    7.583   23.156 2903.825      145
d.full$mhv.90 <- mhv.90
d.full$mhv.00 <- mhv.00
d.full$mhv.change <- mhv.change
d.full$pct.change <- pct.change

Select Gentrification Variables

Select variables for operationalizing a definition of gentrification.

We need to add some variables from 2000.

Recall we created a 1990 to 2000 variable list for reference:

head(vars)

————————————–check variables on year————————————————————-

names(d1)
##   [1] "tractid"  "pop90.x"  "nhwht90"  "nhblk90"  "ntv90"    "asian90" 
##   [7] "hisp90"   "haw90"    "india90"  "china90"  "filip90"  "japan90" 
##  [13] "korea90"  "viet90"   "mex90"    "pr90"     "cuban90"  "hu90"    
##  [19] "vac90"    "ohu90"    "a18und90" "a60up90"  "a75up90"  "agewht90"
##  [25] "a15wht90" "a60wht90" "ageblk90" "a15blk90" "a60blk90" "agehsp90"
##  [31] "a15hsp90" "a60hsp90" "agentv90" "a15ntv90" "a60ntv90" "ageasn90"
##  [37] "a15asn90" "a60asn90" "ag15up90" "mar90"    "wds90"    "mhmval90"
##  [43] "mrent90"  "own90"    "rent90"   "family90" "fhh90"    "pop90.y" 
##  [49] "pop90.1"  "ruanc90"  "itanc90"  "geanc90"  "iranc90"  "scanc90" 
##  [55] "rufb90"   "itfb90"   "gefb90"   "irfb90"   "scfb90"   "fb90"    
##  [61] "nat90"    "n10imm90" "ag5up90"  "olang90"  "lep90"    "ag25up90"
##  [67] "hs90"     "col90"    "clf90"    "unemp90"  "dflabf90" "flabf90" 
##  [73] "empclf90" "prof90"   "manuf90"  "semp90"   "ag16cv90" "vet90"   
##  [79] "cni16u90" "dis90"    "dpov90"   "npov90"   "n65pov90" "dfmpov90"
##  [85] "nfmpov90" "dwpov90"  "nwpov90"  "dbpov90"  "nbpov90"  "dnapov90"
##  [91] "nnapov90" "dhpov90"  "nhpov90"  "dapov90"  "napov90"  "incpc90" 
##  [97] "hu90sp"   "h30old90" "ohu90sp"  "h10yrs90" "dmulti90" "multi90" 
## [103] "hinc90"   "hincw90"  "hincb90"  "hinch90"  "hinca90"  "hh90"    
## [109] "hhw90"    "hhb90"    "hhh90"    "hha90"
names(d2)
##   [1] "tractid"  "pop00.x"  "nhwht00"  "nhblk00"  "ntv00"    "asian00" 
##   [7] "hisp00"   "haw00"    "india00"  "china00"  "filip00"  "japan00" 
##  [13] "korea00"  "viet00"   "mex00"    "pr00"     "cuban00"  "hu00"    
##  [19] "vac00"    "ohu00"    "a18und00" "a60up00"  "a75up00"  "agewht00"
##  [25] "a15wht00" "a60wht00" "ageblk00" "a15blk00" "a60blk00" "agehsp00"
##  [31] "a15hsp00" "a60hsp00" "agentv00" "a15ntv00" "a60ntv00" "ageasn00"
##  [37] "a15asn00" "a60asn00" "family00" "fhh00"    "own00"    "rent00"  
##  [43] "pop00.y"  "ruanc00"  "itanc00"  "geanc00"  "iranc00"  "scanc00" 
##  [49] "rufb00"   "itfb00"   "gefb00"   "irfb00"   "scfb00"   "fb00"    
##  [55] "nat00"    "n10imm00" "ag5up00"  "olang00"  "lep00"    "ag25up00"
##  [61] "hs00"     "col00"    "ag15up00" "mar00"    "wds00"    "clf00"   
##  [67] "unemp00"  "dflabf00" "flabf00"  "empclf00" "prof00"   "manuf00" 
##  [73] "semp00"   "ag18cv00" "vet00"    "cni16u00" "dis00"    "dpov00"  
##  [79] "npov00"   "n65pov00" "dfmpov00" "nfmpov00" "dwpov00"  "nwpov00" 
##  [85] "dbpov00"  "nbpov00"  "dnapov00" "nnapov00" "dhpov00"  "nhpov00" 
##  [91] "dapov00"  "napov00"  "incpc00"  "hu00sp"   "h30old00" "ohu00sp" 
##  [97] "h10yrs00" "dmulti00" "multi00"  "hinc00"   "hincw00"  "hincb00" 
## [103] "hinch00"  "hinca00"  "mhmval00" "mrent00"  "hh00"     "hhw00"   
## [109] "hhb00"    "hhh00"    "hha00"
d3 <- select( d.full, 
             
             tractid, cbsa, cbsaname,            # ids / units of analysis
             
             mhv.90, mhv.00, mhv.change, pct.change,    # home value 
             
             hinc90, hu90, own90, rent90,        # ses
             hinc00, hu00, own00, rent00,
             
             empclf90, clf90, unemp90, prof90,   # employment 
             empclf00, clf00, unemp00, prof00,
             
             dpov90, npov90,                     # poverty
             dpov00, npov00,
             
             ag25up90, hs90, col90,              # education 
             ag25up00, hs00, col00,
             
             pop90.x, nhwht90, nhblk90, hisp90, asian90,   # race
             pop00.x, nhwht00, nhblk00, hisp00, asian00
             
          ) # end select


d3 <- 
  d3 %>%
  mutate( 
          # 1990 variables
          p.white.90 = 100 * nhwht90 / pop90.x,
          p.black.90 = 100 * nhblk90 / pop90.x,
          p.hisp.90 = 100 * hisp90 / pop90.x, 
          p.asian.90 = 100 * asian90 / pop90.x,
          p.hs.edu.90 = 100 * (hs90+col90) / ag25up90,
          p.col.edu.90 = 100 * col90 / ag25up00,
          p.prof.90 = 100 * prof90 / empclf90,
          p.unemp.90 = 100 * unemp90 / clf90,
          pov.rate.90 = 100 * npov90 / dpov90,
          
          # 2000 variables
          p.white.00 = 100 * nhwht00 / pop00.x,
          p.black.00 = 100 * nhblk00 / pop00.x,
          p.hisp.00 = 100 * hisp00 / pop00.x, 
          p.asian.00 = 100 * asian00 / pop00.x,
          p.hs.edu.00 = 100 * (hs00+col00) / ag25up00,
          p.col.edu.00 = 100 * col00 / ag25up00,
          p.prof.00 = 100 * prof00 / empclf00,
          p.unemp.00 = 100 * unemp00 / clf00,
          pov.rate.00 = 100 * npov00 / dpov00 )

d3 <-
  d3 %>%
  group_by( cbsaname ) %>%
  mutate( metro.mhv.pct.90 = ntile( mhv.90, 100 ),
          metro.mhv.pct.00 = ntile( mhv.00, 100 ),
          metro.median.pay.90 = median( hinc90, na.rm=T ),
          metro.median.pay.00 = median( hinc00, na.rm=T ),
          metro.race.rank.90 = ntile( (100-p.white.90), 100 ) ) %>%
  ungroup() %>%
  mutate( metro.mhv.pct.change = metro.mhv.pct.00 - metro.mhv.pct.90,
          pay.change = metro.median.pay.00 - metro.median.pay.90,
          race.change = p.white.00 - p.white.90,
          mhv.change = mhv.00 - mhv.90 )


Descriptive Statistics of Change Variables:

d3 <-           
  d3 %>%
  select( c( "tractid", "cbsa", "cbsaname",
             "mhv.90", "mhv.00", "mhv.change","pct.change",
          "p.white.90", "p.black.90", "p.hisp.90", "p.asian.90", 
          "p.hs.edu.90", "p.col.edu.90", "p.prof.90",  "p.unemp.90", 
          "pov.rate.90", "p.white.00", "p.black.00", "p.hisp.00", 
          "p.asian.00", "p.hs.edu.00", "p.col.edu.00", "p.prof.00", 
          "p.unemp.00", "pov.rate.00", "metro.mhv.pct.90", 
          "metro.mhv.pct.00", "metro.median.pay.90", "metro.median.pay.00", 
          "metro.mhv.pct.change", "pay.change", "race.change",
          "metro.race.rank.90") ) 
# head( d3 ) %>% pander()
d3 <- data.frame(d3)
stargazer( d3, 
           type="text", 
           digits=0, 
           summary.stat = c("min", "p25","median","mean","p75","max") )
## 
## =========================================================================
## Statistic              Min    Pctl(25) Median   Mean   Pctl(75)    Max   
## -------------------------------------------------------------------------
## mhv.90                11,433   76,024  111,588 145,184 182,974   644,276 
## mhv.00                  0      81,600  119,900 144,738 173,894  1,000,001
## mhv.change           -644,276 -19,626   2,610   -578    21,451   964,566 
## pct.change             -100     -14       3       8       23      2,904  
## p.white.90              0        64      87      74       95       100   
## p.black.90              0        1        3      12       10       100   
## p.hisp.90               0        1        3      10       9        100   
## p.asian.90              0        0        1       3       3        94    
## p.hs.edu.90             0        69      74      74       80       100   
## p.col.edu.90            0        8       14      Inf      25       Inf   
## p.prof.90               0        17      25      27       34       100   
## p.unemp.90              0        4        5       7       8        64    
## pov.rate.90             0        4        8      12       16       100   
## p.white.00              0        47      78      67       91       100   
## p.black.00              0        1        4      14       14       100   
## p.hisp.00               0        2        4      13       15       100   
## p.asian.00              0        1        2       5       5        95    
## p.hs.edu.00             0        67      72      72       77       100   
## p.col.edu.00            0        12      21      26       36       100   
## p.prof.00               0        23      31      34       43       100   
## p.unemp.00              0        3        5       6       8        100   
## pov.rate.00             0        4        9      12       17       100   
## metro.mhv.pct.90        1        20      41      45       68       100   
## metro.mhv.pct.00        1        20      41      45       68       100   
## metro.median.pay.90   14,871   28,906  32,457  32,924   35,833   52,374  
## metro.median.pay.00   23,012   39,457  43,139  45,054   49,522   73,701  
## metro.mhv.pct.change   -99       -5       0       0       6        99    
## pay.change            4,930    9,775   11,441  12,130   14,001   26,211  
## race.change            -100     -12      -5      -8       -2       100   
## metro.race.rank.90      1        20      41      45       68       100   
## -------------------------------------------------------------------------


Operationalizing Gentrification

Which definition did you select for gentrification, and how would you operationalize it?

# income
# percent white
# home values absolute
# home value relative to metro
# education stats ?
# employment stats ?
# income stats ?
# growth of pop per tract (density) ?


# home value in lower than average home in a metro in 2000
poor.1990 <- d3$metro.mhv.pct.90 < 50  

# above average diversity for metro
diverse.1990 <- d3$metro.race.rank.90 > 50 

# home values increased more than overall city gains 
# change in percentile rank within the metro
mhv.pct.increase <- d3$metro.mhv.pct.change > 0

# faster than average growth  
# 25% growth in value is median for the country
home.val.rise <- d3$pct.change > 25 

# proportion of whites increases by more than 3 percent 
# measured by increase in white
loss.diversity <- d3$race.change > 3 

g.flag <- poor.1990 & diverse.1990 & mhv.pct.increase & home.val.rise & loss.diversity

num.candidates <-  sum( poor.1990 & diverse.1990, na.rm=T )
num.gentrified <- sum( g.flag, na.rm=T )

num.gentrified
## [1] 394
num.candidates
## [1] 17560
num.gentrified / num.candidates
## [1] 0.02243736


By this definition only 2.24 percent of urban tracts experience gentrification between 1990 and 2000.

This might skew numbers?

# small initial values are skewing percentages
#
# an average home value below $10k is really low -
# these must be mostly vacant lots?

mhv.90[ mhv.90 < 1000 ] <- NA
pct.change <- 100 * ( mhv.change / mhv.90 )
summary( pct.change )
##     Min.  1st Qu.   Median     Mean  3rd Qu.     Max.     NA's 
## -100.000  -13.890    2.888    7.583   23.156 2903.825      145


Discussion

What do you think? Is that the right way to operationalize it? Do you care about relative diversity (more diverse neighborhood than rest of the city) or absolute (percentage of non-whites regardless of city diversity).

Do we care about absolute increase in value, or relative to the national average? The national average would include all of the gentrifying tracts, which could skew it upward and thus make it a poor benchmark? Maybe look at the average increase in value for non-gentrification candadidates?

Spatial Visualization

You will need to pick one or more cities that you can use as examples in your report. It is strongly recommended that you create a dorling cartogram for reporting since Census tracts introduce visual bias by over-empasizing lower density tracts and hiding a lot of the data where the greatest number of people reside in the city. Recall that Dorling cartograms correct for this by re-sizing administrative units proportion to the size of the population they represent.

#devtools::install_github( "sjewo/cartogram" )
#install.packages( "tmap" )

library( geojsonio )  # read geoJSON map files from GitHub
## Warning: package 'geojsonio' was built under R version 4.2.2
## Registered S3 method overwritten by 'geojsonsf':
##   method        from   
##   print.geojson geojson
## 
## Attaching package: 'geojsonio'
## The following object is masked from 'package:base':
## 
##     pretty
library( sp )         # spatial data class sp for shapefiles
library( cartogram )  # spatial maps w/ tract size bias reduction
## Warning: package 'cartogram' was built under R version 4.2.2
library( tmap )       # thematic maps
## Warning: package 'tmap' was built under R version 4.2.2
library( maptools )   # spatial object manipulation 
## Checking rgeos availability: TRUE
## Please note that 'maptools' will be retired by the end of 2023,
## plan transition at your earliest convenience;
## some functionality will be moved to 'sp'.
library( sf )         # 'simple features' flavor of shapefiles
# we have phoenix already packaged and on GitHub for easy load: 

github.url <- "https://raw.githubusercontent.com/DS4PS/cpp-529-master/master/data/phx_dorling.geojson"
phx <- geojson_read( x=github.url,  what="sp" )
plot( phx )

# create small dataframe for the merge
df <- data.frame(  tractid=d$tractid, 
        mhv.90,  mhv.00,  mhv.change,  pct.change  )

# create GEOID that matches GIS format

# create a geoID for merging by tract 
df$GEOID <- substr( df$tractid, 6, 18 )  # extract codes
df$GEOID <- gsub( "-", "", df$GEOID )    # remove hyphens
class( df$GEOID )
## [1] "character"
head(df$GEOID)
## [1] "01001020100" "01001020200" "01001020300" "01001020400" "01001020500"
## [6] "01001020600"


Note the current class of the PHX shapefile IDs:

GEOID2 is numeric, has leading zero dropped GEOID is a factor that retains leading zeros You can either convert the data frame ID to numeric to drop the leading zero and merge with GEOID2.

Or keep it as a character vector and merge with GEOID.

head( phx@data )  # sp class from GIS file, so data frame is located @data


# merge census data with dorling map

nrow( phx ) # check dimensions
## [1] 913
phx <- merge( phx, df, by.x="GEOID", by.y="GEOID" )

# make sure they have not changed or 
# you are missing rows in your data frame
# or merging with the wrong ID
nrow( phx ) 
## [1] 913
phx <- spTransform( phx, CRS("+init=epsg:3395") )

bb <- st_bbox( c( xmin = -12519146, xmax = -12421368, 
                 ymax = 3965924, ymin = 3899074 ), 
               crs = st_crs("+init=epsg:3395")) 
## Warning in CPL_crs_from_input(x): GDAL Message 1: +init=epsg:XXXX syntax is
## deprecated. It might return a CRS with a non-EPSG compliant axis order.
tm_shape( phx, bbox=bb ) + 
  tm_polygons( col="mhv.00", n=10, style="quantile", palette="Spectral" ) +
  tm_layout( "Dorling Cartogram", title.position=c("right","top") )


tm_shape( phx, bbox=bb ) + 
  tm_polygons( col="mhv.change", n=10, style="quantile", palette="Spectral" ) +
  tm_layout( "Dorling Cartogram", title.position=c("right","top") )
## Variable(s) "mhv.change" contains positive and negative values, so midpoint is set to 0. Set midpoint = NA to show the full spectrum of the color palette.


tm_shape( phx, bbox=bb ) + 
  tm_polygons( col="pct.change", n=10, style="quantile", palette="Spectral" ) +
  tm_layout( "Dorling Cartogram", title.position=c("right","top") )
## Variable(s) "pct.change" contains positive and negative values, so midpoint is set to 0. Set midpoint = NA to show the full spectrum of the color palette.


Part II

You were asked to come up with a way to conceptualize gentrification.

This week you will create a new variable in your dataset that allows you to operationalize gentrification and examine its prevelance in the data. How many census tracts are candidates (start out at a low income level with high diversity)? And of those how many have transitioned into advanced stages of gentrification?

Provide an explanation and justification of the way you measure gentrification in the data.

Part III

Histograms and scatterplots help us understand the statistical properties and relationships between variables in our dataset. Due to the nature of cities, these relationships are all shaped by their location in the city. Geography matters a great deal in urban policy, so it should not be ignored.

Simple simple choropleth maps can be extremely helpful in understanding relationships in the data.

Revisit one of the labs from CPP 529 and prepare a Dorling Cartogram shapefile for your project. You can select one or more cities of your choice (you can use the same city if you like).

Follow the instructions for saving your cartogram as a GeoJSON file. These files are nice because they can be stored on GitHub and read directly into R. If you create it right the first time you can use it over and over after that.

Please create the following choropleth maps and report your main findings:

Home Values

  • Describe the distribution of home values in 1990 - where are high and low-value tracts located in the city/cities?
  • Compare values in 2000 to changes in values from 1990-2000. Do the largest gains occur in tracts with above or below-average home prices in 2000?

Gentrification

  • Create a map that highlights tracts that are candidates for gentrification in 2000 and tracts that gentrify between 1990 and 2000.

Do you find any meaningful patterns in where gentrification occurs?

Choropleth Map

# Load packages
library( here )        # use relative file paths
library(tidyverse)     # data wrangling
library( tidycensus )  # pull census data
library( sp )          # work with shapefiles
library( sf )          # work with shapefiles - simple features format
library(RColorBrewer)  # color shapefile
library(zoom)          # enlarge plot

library( geojsonio )   # read shapefiles
library( sp )          # work with shapefiles
library( sf )          # work with shapefiles - simple features format
library( mclust )      # cluster analysis 
## Warning: package 'mclust' was built under R version 4.2.2
## Package 'mclust' version 6.0.0
## Type 'citation("mclust")' for citing this R package in publications.
## 
## Attaching package: 'mclust'
## The following object is masked from 'package:purrr':
## 
##     map
library( tmap )        # theme maps
library( ggplot2 )     # graphing 
library( ggthemes )    # nice formats for ggplots
## Warning: package 'ggthemes' was built under R version 4.2.2
library( dplyr )       # data wrangling 
library( pander )      # formatting RMD tables
library( tidycensus )

library( cartogram )  # spatial maps w/ tract size bias reduction
library( maptools )   # spatial object manipulation 


# Load Crosswalk data
crosswalk <- read.csv( "https://raw.githubusercontent.com/DS4PS/cpp-529-master/master/data/cbsatocountycrosswalk.csv", stringsAsFactors=F, colClasses="character" )
# search for city names by strings, use the ^ anchor for "begins with" 
grep( "^ATL", crosswalk$msaname, value=TRUE ) 
##  [1] "ATLANTA, GA"           "ATLANTA, GA"           "ATLANTA, GA"          
##  [4] "ATLANTA, GA"           "ATLANTA, GA"           "ATLANTA, GA"          
##  [7] "ATLANTA, GA"           "ATLANTA, GA"           "ATLANTA, GA"          
## [10] "ATLANTA, GA"           "ATLANTA, GA"           "ATLANTA, GA"          
## [13] "ATLANTA, GA"           "ATLANTA, GA"           "ATLANTA, GA"          
## [16] "ATLANTA, GA"           "ATLANTA, GA"           "ATLANTA, GA"          
## [19] "ATLANTA, GA"           "ATLANTA, GA"           "ATLANTIC-CAPE MAY, NJ"
## [22] "ATLANTIC-CAPE MAY, NJ"
# Select crosswalk data for Atlanta 

these.msp <- crosswalk$msaname == "ATLANTA, GA"
these.fips <- crosswalk$fipscounty[ these.msp ]
these.fips <- na.omit( these.fips )
state.fips <- substr( these.fips, 1, 2 )
county.fips <- substr( these.fips, 3, 5 )


#census_api_key(key)
# Select population data for Atlanta (state code 42), geometry = TRUE 
#pulls shapefile for GA
msp.pop <-
get_acs( geography = "tract", variables = "B01003_001",
         state = "13", county = county.fips[state.fips=="13"], geometry =
TRUE ) %>%
         select( GEOID, estimate ) %>%
         rename( POP=estimate )
## Getting data from the 2016-2020 5-year ACS
# Load rodeo data set
census.dat <- readRDS(here("data/rodeo/ltdb-2010.rds"))

# Extract state codes
census.dat$GEOID <- substr( census.dat$tractid, 6, 18 )  # extract codes

census.dat$GEOID <- gsub( "-", "", census.dat$GEOID )    # remove hyphens
#class( census.dat$GEOID )


# Merge shapefile and census data with ltdb rodeo dataframe
#msp <- merge( msp.pop, census.dat, by.x="GEOID", by.y="GEOID" )

msp <- merge( msp.pop, census.dat, by.x="GEOID", by.y="GEOID" )
# Filter out empty parts of shapefile
msp <- msp[ ! st_is_empty( msp ) , ]
class(msp)
## [1] "sf"         "data.frame"


# convert shapefile map object to a spatial version
msp.sp <- as_Spatial( msp )

# Check that it is a SpatialPolygonsDataFrame
class( msp.sp )
## [1] "SpatialPolygonsDataFrame"
## attr(,"package")
## [1] "sp"
plot( msp.sp )

# Use RColorBrewer to create color pallette to show median home value 
#distribution
my_colors <- brewer.pal(9, "Blues") 
my_colors <- colorRampPalette(my_colors)(10)
# Attribute the appropriate color to each tract of the Philadelphia area 
#based on median home value in 2010
class_of_city <- cut(msp.sp@data$mhmval12, 10)
my_colors <- my_colors[as.numeric(class_of_city)]

# Plot to view, can change bg color code: 
#https://txwes.libguides.com/c.php?g=978475&p=7075536
plot(msp.sp, col=my_colors ,  bg = "#FFFFFF")


# project map and remove empty tracts
msp.sp <- spTransform( msp.sp, CRS("+init=epsg:3395"))
msp.sp <- msp.sp[ msp.sp$POP != 0 & (! is.na( msp.sp$POP )) , ]

# convert census tract polygons to dorling cartogram
# no idea why k=0.03 works, but it does - default is k=5
msp.sp$pop.w <- msp.sp$POP / 9000 # max(msp.sp$POP)   # standardizes it to max of 1.5
msp_dorling <- cartogram_dorling( x=msp.sp, weight="pop.w", k=0.05 )
plot( msp_dorling )


tm_shape( msp_dorling ) + 
  tm_polygons( size="POP", col="hinc12", n=7, style="quantile", palette="Spectral" )