Final Project Part I - Create a Data Set

This lab will have you replicate the work you did for the previous lab, but instead of using the Phoenix data you will select a city of your choice to use for this lab, subsequent labs, and the final project.

You want to select a dense city that has a lot of census tracts, so I suggest you select from the 50 largest cities in the US. The Census defines Metropolitan Statistical Areas (MSAs) as “one or more adjacent counties or county equivalents that have at least one urban core area of at least 50,000 population, plus adjacent territory that has a high degree of social and economic integration with the core as measured by commuting ties.” There are currently 384 MSAs in the US.

Wikipedia provides a convenient list of MSAs sorted by size.

library( geojsonio )   # read shapefiles
library( sp )          # work with shapefiles
library( sf )          # work with shapefiles - simple features format
library( mclust )      # cluster analysis 
library( tmap )        # theme maps
library( ggplot2 )     # graphing 
library( ggthemes )    # nice formats for ggplots
library( dplyr )       # data wrangling 
library( pander )      # formatting RMD tables
library( tidycensus )

library( cartogram )  # spatial maps w/ tract size bias reduction
library( maptools )   # spatial object manipulation

Step 1: Select Your MSA

You can select a city from the list of large MSAs.

To get Census data on the city you will first need to identify all of the counties that comprise the MSA. You can look this information up through MSA to FIPS crosswalks provided by the National Bureau for Economic Research (NBER): https://www.nber.org/data/cbsa-fips-county-crosswalk.html

I have added the file to GitHub for ease of access.

crosswalk <- read.csv( "https://raw.githubusercontent.com/DS4PS/cpp-529-master/master/data/cbsatocountycrosswalk.csv",  stringsAsFactors=F, colClasses="character" )

# search for citie names by strings, use the ^ anchor for "begins with" 

grep( "^SAN D", crosswalk$msaname, value=TRUE )

## [1] "SAN DIEGO, CA"

grep( "^CALI", crosswalk$msaname, value=TRUE )

##  [1] "CALIFORNIA" "CALIFORNIA" "CALIFORNIA" "CALIFORNIA" "CALIFORNIA"
##  [6] "CALIFORNIA" "CALIFORNIA" "CALIFORNIA" "CALIFORNIA" "CALIFORNIA"
## [11] "CALIFORNIA" "CALIFORNIA" "CALIFORNIA" "CALIFORNIA" "CALIFORNIA"
## [16] "CALIFORNIA" "CALIFORNIA" "CALIFORNIA" "CALIFORNIA" "CALIFORNIA"
## [21] "CALIFORNIA" "CALIFORNIA" "CALIFORNIA" "CALIFORNIA" "CALIFORNIA"

Select all of your county fips. To use them in the TidyCenss package you will need to split the state and county:

these.sdp <-crosswalk$msaname == grep( "^SAN DIEGO", crosswalk$msaname,
value=TRUE )
these.fips <- crosswalk$fipscounty[ these.sdp ]
these.fips <- na.omit( these.fips )

Step 2: Download a Shapefile with Population Data

To create a Dorling cartogram we need a shapefile and a population count. We can get both through the Census download that includes simple features.

library( tidycensus )
#census_api_key("404739a4c1d1baf6067b937fe42425b977ef145d", install = TRUE)

state.fips <- substr( these.fips, 1, 2 )
county.fips <- substr( these.fips, 3, 5 )

cbind( these.fips, state.fips, county.fips ) %>% pander()

these.fips	state.fips	county.fips
06073	06	073

sd.pop <-
get_acs( geography = "tract", variables = "B01003_001",
         state = "06", county = county.fips[state.fips=="06"], geometry = TRUE ) %>% 
         dplyr::select( GEOID, estimate ) %>%
         rename( POP=estimate )

## 
  |                                                                            
  |                                                                      |   0%
  |                                                                            
  |=                                                                     |   1%
  |                                                                            
  |=                                                                     |   2%
  |                                                                            
  |==                                                                    |   3%
  |                                                                            
  |===                                                                   |   4%
  |                                                                            
  |===                                                                   |   5%
  |                                                                            
  |====                                                                  |   5%
  |                                                                            
  |====                                                                  |   6%
  |                                                                            
  |=====                                                                 |   7%
  |                                                                            
  |======                                                                |   8%
  |                                                                            
  |======                                                                |   9%
  |                                                                            
  |=======                                                               |   9%
  |                                                                            
  |=======                                                               |  10%
  |                                                                            
  |========                                                              |  11%
  |                                                                            
  |========                                                              |  12%
  |                                                                            
  |=========                                                             |  12%
  |                                                                            
  |=========                                                             |  13%
  |                                                                            
  |==========                                                            |  14%
  |                                                                            
  |===========                                                           |  15%
  |                                                                            
  |===========                                                           |  16%
  |                                                                            
  |============                                                          |  17%
  |                                                                            
  |=============                                                         |  18%
  |                                                                            
  |=============                                                         |  19%
  |                                                                            
  |==============                                                        |  19%
  |                                                                            
  |==============                                                        |  20%
  |                                                                            
  |==============                                                        |  21%
  |                                                                            
  |===============                                                       |  21%
  |                                                                            
  |===============                                                       |  22%
  |                                                                            
  |================                                                      |  22%
  |                                                                            
  |================                                                      |  23%
  |                                                                            
  |=================                                                     |  24%
  |                                                                            
  |=================                                                     |  25%
  |                                                                            
  |==================                                                    |  25%
  |                                                                            
  |==================                                                    |  26%
  |                                                                            
  |===================                                                   |  27%
  |                                                                            
  |====================                                                  |  28%
  |                                                                            
  |====================                                                  |  29%
  |                                                                            
  |=====================                                                 |  29%
  |                                                                            
  |=====================                                                 |  30%
  |                                                                            
  |======================                                                |  31%
  |                                                                            
  |======================                                                |  32%
  |                                                                            
  |=======================                                               |  33%
  |                                                                            
  |========================                                              |  34%
  |                                                                            
  |========================                                              |  35%
  |                                                                            
  |=========================                                             |  36%
  |                                                                            
  |==========================                                            |  37%
  |                                                                            
  |===========================                                           |  38%
  |                                                                            
  |===========================                                           |  39%
  |                                                                            
  |============================                                          |  40%
  |                                                                            
  |=============================                                         |  41%
  |                                                                            
  |=============================                                         |  42%
  |                                                                            
  |==============================                                        |  43%
  |                                                                            
  |===============================                                       |  44%
  |                                                                            
  |================================                                      |  45%
  |                                                                            
  |================================                                      |  46%
  |                                                                            
  |=================================                                     |  47%
  |                                                                            
  |==================================                                    |  48%
  |                                                                            
  |==================================                                    |  49%
  |                                                                            
  |===================================                                   |  50%
  |                                                                            
  |====================================                                  |  51%
  |                                                                            
  |====================================                                  |  52%
  |                                                                            
  |=====================================                                 |  53%
  |                                                                            
  |======================================                                |  54%
  |                                                                            
  |======================================                                |  55%
  |                                                                            
  |=======================================                               |  55%
  |                                                                            
  |=======================================                               |  56%
  |                                                                            
  |========================================                              |  57%
  |                                                                            
  |=========================================                             |  58%
  |                                                                            
  |=========================================                             |  59%
  |                                                                            
  |==========================================                            |  60%
  |                                                                            
  |==========================================                            |  61%
  |                                                                            
  |===========================================                           |  61%
  |                                                                            
  |===========================================                           |  62%
  |                                                                            
  |============================================                          |  63%
  |                                                                            
  |=============================================                         |  64%
  |                                                                            
  |==============================================                        |  65%
  |                                                                            
  |==============================================                        |  66%
  |                                                                            
  |===============================================                       |  67%
  |                                                                            
  |================================================                      |  68%
  |                                                                            
  |================================================                      |  69%
  |                                                                            
  |=================================================                     |  70%
  |                                                                            
  |==================================================                    |  71%
  |                                                                            
  |===================================================                   |  72%
  |                                                                            
  |===================================================                   |  73%
  |                                                                            
  |====================================================                  |  74%
  |                                                                            
  |=====================================================                 |  75%
  |                                                                            
  |=====================================================                 |  76%
  |                                                                            
  |======================================================                |  77%
  |                                                                            
  |=======================================================               |  78%
  |                                                                            
  |=======================================================               |  79%
  |                                                                            
  |========================================================              |  80%
  |                                                                            
  |=========================================================             |  81%
  |                                                                            
  |==========================================================            |  82%
  |                                                                            
  |==========================================================            |  83%
  |                                                                            
  |===========================================================           |  84%
  |                                                                            
  |============================================================          |  85%
  |                                                                            
  |============================================================          |  86%
  |                                                                            
  |=============================================================         |  87%
  |                                                                            
  |==============================================================        |  88%
  |                                                                            
  |==============================================================        |  89%
  |                                                                            
  |===============================================================       |  90%
  |                                                                            
  |================================================================      |  91%
  |                                                                            
  |=================================================================     |  92%
  |                                                                            
  |=================================================================     |  93%
  |                                                                            
  |==================================================================    |  94%
  |                                                                            
  |===================================================================   |  95%
  |                                                                            
  |===================================================================   |  96%
  |                                                                            
  |====================================================================  |  97%
  |                                                                            
  |===================================================================== |  98%
  |                                                                            
  |======================================================================|  99%
  |                                                                            
  |======================================================================| 100%

getwd()

## [1] "C:/Users/Owner/Documents/Homework/CPP 529"

Step 3: Add Census Data

URL <- "https://github.com/DS4PS/cpp-529-master/raw/master/data/ltdb_std_2010_sample.rds"
census.dat <- readRDS(gzcon(url( URL )))

# can merge an sf object and data.frame
sd.pop$GEOID<-sub('.', '', sd.pop$GEOID)
sdp <- merge( sd.pop, census.dat, by.x="GEOID", by.y="tractid" )

# make sure there are no empty polygons
sdp <- sdp[ ! st_is_empty( sdp ) , ]

#head(sdp)
#head(sd) %>% pander()

Data Dictionary

This exercise uses Census data from the 2012 American Communities Survey made available through the Diversity and Disparities Project.

dd.URL <- "https://raw.githubusercontent.com/DS4PS/cpp-529-master/master/data/data-dictionary.csv"
data.dictionary <- read.csv( dd.URL, stringsAsFactors=F ) 
data.dictionary %>% pander()

LABEL	VARIABLE
tractid	GEOID
pnhwht12	Percent white, non-Hispanic
pnhblk12	Percent black, non-Hispanic
phisp12	Percent Hispanic
pntv12	Percent Native American race
pfb12	Percent foreign born
polang12	Percent speaking other language at home, age 5 plus
phs12	Percent with high school degree or less
pcol12	Percent with 4-year college degree or more
punemp12	Percent unemployed
pflabf12	Percent female labor force participation
pprof12	Percent professional employees
pmanuf12	Percent manufacturing employees
pvet12	Percent veteran
psemp12	Percent self-employed
hinc12	Median HH income, total
incpc12	Per capita income
ppov12	Percent in poverty, total
pown12	Percent owner-occupied units
pvac12	Percent vacant units
pmulti12	Percent multi-family units
mrent12	Median rent
mhmval12	Median home value
p30old12	Percent structures more than 30 years old
p10yrs12	Percent HH in neighborhood 10 years or less
p18und12	Percent 17 and under, total
p60up12	Percent 60 and older, total
p75up12	Percent 75 and older, total
pmar12	Percent currently married, not separated
pwds12	Percent widowed, divorced and separated
pfhh12	Percent female-headed families with children

Step 4: Transform the Shapefile into A Dorling Cartogram

# convert sf map object to an sp version

sd.sp <- as_Spatial( sdp )

class( sd.sp )

## [1] "SpatialPolygonsDataFrame"
## attr(,"package")
## [1] "sp"

## [1] "SpatialPolygonsDataFrame"
## attr(,"package")
## [1] "sp"
plot( sd.sp )

# project map and remove empty tracts
sd.sp <- spTransform( sd.sp, CRS("+init=epsg:3395"))
sd.sp <- sd.sp[ sd.sp$POP != 0 & (! is.na( sd.sp$POP )) , ]

# convert census tract polygons to dorling cartogram
# no idea why k=0.03 works, but it does - default is k=5
sd.sp$pop.w <- sd.sp$POP / 9000 # max(msp.sp$POP)   # standardizes it to max of 1.5
sd_dorling <- cartogram_dorling( x=sd.sp, weight="pop.w", k=0.05 )
plot( sd_dorling )
axis(side=1)

tm_shape( sd_dorling ) + 
  tm_polygons( size="POP", col="hinc12", n=7, style="quantile", palette="Spectral" )

Use a bounding box: improve the aestetics.

# user-defined bounding box to move slocer to subjects 
bbx <- st_bbox( c( xmin =  -13061000, xmax = -13008000, 
                  ymax = 3863123, ymin = 3811123 ), 
                  crs = st_crs("+init=epsg:3395"))

tm_shape( sd_dorling, bbox = bbx) + 
  tm_polygons( col="hinc12", n=10, style="quantile", palette="Spectral" ) +
  tm_layout( "Dorling Cartogram", title.position=c("right","top") )

Clustering

We will use the same set of variables as last week. The data is transformed into z-score so that they are all on similar scales.

keep.these <- c("pnhwht12", "pnhblk12", "phisp12", "pntv12", "pfb12", "polang12", 
"phs12", "pcol12", "punemp12", "pflabf12", "pprof12", "pmanuf12", 
"pvet12", "psemp12", "hinc12", "incpc12", "ppov12", "pown12", 
"pvac12", "pmulti12", "mrent12", "mhmval12", "p30old12", "p10yrs12", 
"p18und12", "p60up12", "p75up12", "pmar12", "pwds12", "pfhh12")

d1 <- sd_dorling@data
d2 <- dplyr::select( d1, keep.these )
d3 <- apply( d2, 2, scale )
head( d3[,1:6] ) %>% pander()

pnhwht12	pnhblk12	phisp12	pntv12	pfb12	polang12
1.484	-0.8089	-0.8851	-0.3282	-0.8389	-0.6954
1.417	-0.8089	-0.956	-0.3282	-0.4502	-0.5958
1.385	-0.6709	-0.9795	-0.3282	-1.091	-0.8657
0.6613	0.04017	-0.883	-0.1466	-0.05741	-0.8179
0.843	-0.03702	-0.8381	0.5606	-0.8445	-0.8246
0.984	-0.3162	-0.8821	-0.3282	-0.9838	-0.8401

Prepare Data for Clustering

We transform all of the variables to z scorse so they are on the same scale while clustering. This ensures that each census variable has equal weight. Z-scores typically range from about -3 to +3 with a mean of zero.

keep.these <- c("pnhwht12", "pnhblk12", "phisp12", "pntv12", "pfb12", "polang12", 
"phs12", "pcol12", "punemp12", "pflabf12", "pprof12", "pmanuf12", 
"pvet12", "psemp12", "hinc12", "incpc12", "ppov12", "pown12", 
"pvac12", "pmulti12", "mrent12", "mhmval12", "p30old12", "p10yrs12", 
"p18und12", "p60up12", "p75up12", "pmar12", "pwds12", "pfhh12")

d2 <- dplyr::select( d1, keep.these )
d3 <- apply( d2, 2, scale )

head( d3[,1:6] ) %>% pander()

pnhwht12	pnhblk12	phisp12	pntv12	pfb12	polang12
1.484	-0.8089	-0.8851	-0.3282	-0.8389	-0.6954
1.417	-0.8089	-0.956	-0.3282	-0.4502	-0.5958
1.385	-0.6709	-0.9795	-0.3282	-1.091	-0.8657
0.6613	0.04017	-0.883	-0.1466	-0.05741	-0.8179
0.843	-0.03702	-0.8381	0.5606	-0.8445	-0.8246
0.984	-0.3162	-0.8821	-0.3282	-0.9838	-0.8401

Perform Cluster Analysis

For more details on cluster analysis visit the mclust tutorial.

# library( mclust )
set.seed( 1234 )
fit <- Mclust( d3 )
sd_dorling$cluster <- as.factor( fit$classification )
summary( fit )

## ---------------------------------------------------- 
## Gaussian finite mixture model fitted by EM algorithm 
## ---------------------------------------------------- 
## 
## Mclust VVE (ellipsoidal, equal orientation) model with 6 components: 
## 
##  log-likelihood   n  df       BIC       ICL
##       -11080.09 534 800 -27184.49 -27224.52
## 
## Clustering table:
##   1   2   3   4   5   6 
##  38  62 123 141 117  53

## ---------------------------------------------------- 
## Gaussian finite mixture model fitted by EM algorithm 
## ----------------------------------------------------

Identifying Neighborhood Clusters

Build the charts to compare census characteristics across the groups.

df.pct <- sapply( d2, ntile, 100 )
d4 <- as.data.frame( df.pct )
d4$cluster <- as.factor( paste0("GROUP-",fit$classification) )

num.groups <- length( unique( fit$classification ) )

stats <- 
d4 %>% 
  group_by( cluster ) %>% 
  summarise_each( funs(mean) )

t <- data.frame( t(stats), stringsAsFactors=F )
names(t) <- paste0( "GROUP.", 1:num.groups )
t <- t[-1,]



for( i in 1:num.groups )
{
  z <- t[,i]
  plot( rep(1,30), 1:30, bty="n", xlim=c(-75,100), 
        type="n", xaxt="n", yaxt="n",
        xlab="Percentile", ylab="",
        main=paste("GROUP",i) )
  abline( v=seq(0,100,25), lty=3, lwd=1.5, col="gray90" )
  segments( y0=1:30, x0=0, x1=100, col="gray70", lwd=2 )
  text( -0.2, 1:30, data.dictionary$VARIABLE[-1], cex=0.85, pos=2 )
  points( z, 1:30, pch=19, col="firebrick", cex=1.5 )
  axis( side=1, at=c(0,50,100), col.axis="gray", col="gray" )
}

# project to standard lat-lon coordinate system 
sd_dorling <- spTransform( sd_dorling, CRS("+proj=longlat +datum=WGS84") )

# write to file 
geojson_write( sd_dorling, file="sd_dorling.geojson", geometry="polygon" )

## <geojson-file>
##   Path:       sd_dorling.geojson
##   From class: SpatialPolygonsDataFrame

#load from github
#github.url <- "https://raw.githubusercontent.com/DS4PS/cpp-529-master/master/data/sd_dorling.geojson"
#sd <- geojson_read( x=github.url,  what="sp" )
# from local file path
sd <- geojson_read( "C:/Users/Owner/Documents/Homework/CPP 529/Final-Project_files/sd_dorling.geojson", what="sp" )

plot( sd )