This lab will have you replicate the work you did for the previous lab, but instead of using the Phoenix data you will select a city of your choice to use for this lab, subsequent labs, and the final project.

You want to select a dense city that has a lot of census tracts, so I suggest you select from the 50 largest cities in the US. The Census defines Metropolitan Statistical Areas (MSAs) as “one or more adjacent counties or county equivalents that have at least one urban core area of at least 50,000 population, plus adjacent territory that has a high degree of social and economic integration with the core as measured by commuting ties.” There are currently 384 MSAs in the US.

Wikipedia provides a convenient list of MSAs sorted by size.

library( geojsonio )   # read shapefiles
library( sp )          # work with shapefiles
library( sf )          # work with shapefiles - simple features format
library( mclust )      # cluster analysis 
library( tmap )        # theme maps
library( ggplot2 )     # graphing 
library( ggthemes )    # nice formats for ggplots
library( dplyr )       # data wrangling 
library( pander )      # formatting RMD tables
library( tidycensus )

library( cartogram )  # spatial maps w/ tract size bias reduction
library( maptools )   # spatial object manipulation 

Step 1: Select Your MSA

You can select a city from the list of large MSAs.

To get Census data on the city you will first need to identify all of the counties that comprise the MSA. You can look this information up through MSA to FIPS crosswalks provided by the National Bureau for Economic Research (NBER): https://www.nber.org/data/cbsa-fips-county-crosswalk.html

I have added the file to GitHub for ease of access.

crosswalk <- read.csv( "https://raw.githubusercontent.com/DS4PS/cpp-529-master/master/data/cbsatocountycrosswalk.csv",  stringsAsFactors=F, colClasses="character" )

# search for citie names by strings, use the ^ anchor for "begins with" 

grep( "^SAN D", crosswalk$msaname, value=TRUE )
## [1] "SAN DIEGO, CA"
grep( "^CALI", crosswalk$msaname, value=TRUE )
##  [1] "CALIFORNIA" "CALIFORNIA" "CALIFORNIA" "CALIFORNIA" "CALIFORNIA"
##  [6] "CALIFORNIA" "CALIFORNIA" "CALIFORNIA" "CALIFORNIA" "CALIFORNIA"
## [11] "CALIFORNIA" "CALIFORNIA" "CALIFORNIA" "CALIFORNIA" "CALIFORNIA"
## [16] "CALIFORNIA" "CALIFORNIA" "CALIFORNIA" "CALIFORNIA" "CALIFORNIA"
## [21] "CALIFORNIA" "CALIFORNIA" "CALIFORNIA" "CALIFORNIA" "CALIFORNIA"


Select all of your county fips. To use them in the TidyCenss package you will need to split the state and county:

these.sdp <-crosswalk$msaname == grep( "^SAN DIEGO", crosswalk$msaname,
value=TRUE )
these.fips <- crosswalk$fipscounty[ these.sdp ]
these.fips <- na.omit( these.fips )


Step 2: Download a Shapefile with Population Data

To create a Dorling cartogram we need a shapefile and a population count. We can get both through the Census download that includes simple features.

library( tidycensus )
#census_api_key("404739a4c1d1baf6067b937fe42425b977ef145d", install = TRUE)

state.fips <- substr( these.fips, 1, 2 )
county.fips <- substr( these.fips, 3, 5 )

cbind( these.fips, state.fips, county.fips ) %>% pander()
these.fips state.fips county.fips
06073 06 073


sd.pop <-
get_acs( geography = "tract", variables = "B01003_001",
         state = "06", county = county.fips[state.fips=="06"], geometry = TRUE ) %>% 
         dplyr::select( GEOID, estimate ) %>%
         rename( POP=estimate )
## 
  |                                                                            
  |                                                                      |   0%
  |                                                                            
  |=                                                                     |   1%
  |                                                                            
  |=                                                                     |   2%
  |                                                                            
  |==                                                                    |   3%
  |                                                                            
  |===                                                                   |   4%
  |                                                                            
  |===                                                                   |   5%
  |                                                                            
  |====                                                                  |   5%
  |                                                                            
  |====                                                                  |   6%
  |                                                                            
  |=====                                                                 |   7%
  |                                                                            
  |======                                                                |   8%
  |                                                                            
  |======                                                                |   9%
  |                                                                            
  |=======                                                               |   9%
  |                                                                            
  |=======                                                               |  10%
  |                                                                            
  |========                                                              |  11%
  |                                                                            
  |========                                                              |  12%
  |                                                                            
  |=========                                                             |  12%
  |                                                                            
  |=========                                                             |  13%
  |                                                                            
  |==========                                                            |  14%
  |                                                                            
  |===========                                                           |  15%
  |                                                                            
  |===========                                                           |  16%
  |                                                                            
  |============                                                          |  17%
  |                                                                            
  |=============                                                         |  18%
  |                                                                            
  |=============                                                         |  19%
  |                                                                            
  |==============                                                        |  19%
  |                                                                            
  |==============                                                        |  20%
  |                                                                            
  |==============                                                        |  21%
  |                                                                            
  |===============                                                       |  21%
  |                                                                            
  |===============                                                       |  22%
  |                                                                            
  |================                                                      |  22%
  |                                                                            
  |================                                                      |  23%
  |                                                                            
  |=================                                                     |  24%
  |                                                                            
  |=================                                                     |  25%
  |                                                                            
  |==================                                                    |  25%
  |                                                                            
  |==================                                                    |  26%
  |                                                                            
  |===================                                                   |  27%
  |                                                                            
  |====================                                                  |  28%
  |                                                                            
  |====================                                                  |  29%
  |                                                                            
  |=====================                                                 |  29%
  |                                                                            
  |=====================                                                 |  30%
  |                                                                            
  |======================                                                |  31%
  |                                                                            
  |======================                                                |  32%
  |                                                                            
  |=======================                                               |  33%
  |                                                                            
  |========================                                              |  34%
  |                                                                            
  |========================                                              |  35%
  |                                                                            
  |=========================                                             |  36%
  |                                                                            
  |==========================                                            |  37%
  |                                                                            
  |===========================                                           |  38%
  |                                                                            
  |===========================                                           |  39%
  |                                                                            
  |============================                                          |  40%
  |                                                                            
  |=============================                                         |  41%
  |                                                                            
  |=============================                                         |  42%
  |                                                                            
  |==============================                                        |  43%
  |                                                                            
  |===============================                                       |  44%
  |                                                                            
  |================================                                      |  45%
  |                                                                            
  |================================                                      |  46%
  |                                                                            
  |=================================                                     |  47%
  |                                                                            
  |==================================                                    |  48%
  |                                                                            
  |==================================                                    |  49%
  |                                                                            
  |===================================                                   |  50%
  |                                                                            
  |====================================                                  |  51%
  |                                                                            
  |====================================                                  |  52%
  |                                                                            
  |=====================================                                 |  53%
  |                                                                            
  |======================================                                |  54%
  |                                                                            
  |======================================                                |  55%
  |                                                                            
  |=======================================                               |  55%
  |                                                                            
  |=======================================                               |  56%
  |                                                                            
  |========================================                              |  57%
  |                                                                            
  |=========================================                             |  58%
  |                                                                            
  |=========================================                             |  59%
  |                                                                            
  |==========================================                            |  60%
  |                                                                            
  |==========================================                            |  61%
  |                                                                            
  |===========================================                           |  61%
  |                                                                            
  |===========================================                           |  62%
  |                                                                            
  |============================================                          |  63%
  |                                                                            
  |=============================================                         |  64%
  |                                                                            
  |==============================================                        |  65%
  |                                                                            
  |==============================================                        |  66%
  |                                                                            
  |===============================================                       |  67%
  |                                                                            
  |================================================                      |  68%
  |                                                                            
  |================================================                      |  69%
  |                                                                            
  |=================================================                     |  70%
  |                                                                            
  |==================================================                    |  71%
  |                                                                            
  |===================================================                   |  72%
  |                                                                            
  |===================================================                   |  73%
  |                                                                            
  |====================================================                  |  74%
  |                                                                            
  |=====================================================                 |  75%
  |                                                                            
  |=====================================================                 |  76%
  |                                                                            
  |======================================================                |  77%
  |                                                                            
  |=======================================================               |  78%
  |                                                                            
  |=======================================================               |  79%
  |                                                                            
  |========================================================              |  80%
  |                                                                            
  |=========================================================             |  81%
  |                                                                            
  |==========================================================            |  82%
  |                                                                            
  |==========================================================            |  83%
  |                                                                            
  |===========================================================           |  84%
  |                                                                            
  |============================================================          |  85%
  |                                                                            
  |============================================================          |  86%
  |                                                                            
  |=============================================================         |  87%
  |                                                                            
  |==============================================================        |  88%
  |                                                                            
  |==============================================================        |  89%
  |                                                                            
  |===============================================================       |  90%
  |                                                                            
  |================================================================      |  91%
  |                                                                            
  |=================================================================     |  92%
  |                                                                            
  |=================================================================     |  93%
  |                                                                            
  |==================================================================    |  94%
  |                                                                            
  |===================================================================   |  95%
  |                                                                            
  |===================================================================   |  96%
  |                                                                            
  |====================================================================  |  97%
  |                                                                            
  |===================================================================== |  98%
  |                                                                            
  |======================================================================|  99%
  |                                                                            
  |======================================================================| 100%
getwd()
## [1] "C:/Users/Owner/Documents/Homework/CPP 529"


Step 3: Add Census Data

URL <- "https://github.com/DS4PS/cpp-529-master/raw/master/data/ltdb_std_2010_sample.rds"
census.dat <- readRDS(gzcon(url( URL )))

# can merge an sf object and data.frame
sd.pop$GEOID<-sub('.', '', sd.pop$GEOID)
sdp <- merge( sd.pop, census.dat, by.x="GEOID", by.y="tractid" )

# make sure there are no empty polygons
sdp <- sdp[ ! st_is_empty( sdp ) , ]

#head(sdp)
#head(sd) %>% pander()

Data Dictionary

This exercise uses Census data from the 2012 American Communities Survey made available through the Diversity and Disparities Project.

dd.URL <- "https://raw.githubusercontent.com/DS4PS/cpp-529-master/master/data/data-dictionary.csv"
data.dictionary <- read.csv( dd.URL, stringsAsFactors=F ) 
data.dictionary %>% pander()
LABEL VARIABLE
tractid GEOID
pnhwht12 Percent white, non-Hispanic
pnhblk12 Percent black, non-Hispanic
phisp12 Percent Hispanic
pntv12 Percent Native American race
pfb12 Percent foreign born
polang12 Percent speaking other language at home, age 5 plus
phs12 Percent with high school degree or less
pcol12 Percent with 4-year college degree or more
punemp12 Percent unemployed
pflabf12 Percent female labor force participation
pprof12 Percent professional employees
pmanuf12 Percent manufacturing employees
pvet12 Percent veteran
psemp12 Percent self-employed
hinc12 Median HH income, total
incpc12 Per capita income
ppov12 Percent in poverty, total
pown12 Percent owner-occupied units
pvac12 Percent vacant units
pmulti12 Percent multi-family units
mrent12 Median rent
mhmval12 Median home value
p30old12 Percent structures more than 30 years old
p10yrs12 Percent HH in neighborhood 10 years or less
p18und12 Percent 17 and under, total
p60up12 Percent 60 and older, total
p75up12 Percent 75 and older, total
pmar12 Percent currently married, not separated
pwds12 Percent widowed, divorced and separated
pfhh12 Percent female-headed families with children


Step 4: Transform the Shapefile into A Dorling Cartogram

# convert sf map object to an sp version

sd.sp <- as_Spatial( sdp )

class( sd.sp )
## [1] "SpatialPolygonsDataFrame"
## attr(,"package")
## [1] "sp"
## [1] "SpatialPolygonsDataFrame"
## attr(,"package")
## [1] "sp"
plot( sd.sp )


# project map and remove empty tracts
sd.sp <- spTransform( sd.sp, CRS("+init=epsg:3395"))
sd.sp <- sd.sp[ sd.sp$POP != 0 & (! is.na( sd.sp$POP )) , ]

# convert census tract polygons to dorling cartogram
# no idea why k=0.03 works, but it does - default is k=5
sd.sp$pop.w <- sd.sp$POP / 9000 # max(msp.sp$POP)   # standardizes it to max of 1.5
sd_dorling <- cartogram_dorling( x=sd.sp, weight="pop.w", k=0.05 )
plot( sd_dorling )
axis(side=1)


tm_shape( sd_dorling ) + 
  tm_polygons( size="POP", col="hinc12", n=7, style="quantile", palette="Spectral" )


Use a bounding box: improve the aestetics.

# user-defined bounding box to move slocer to subjects 
bbx <- st_bbox( c( xmin =  -13061000, xmax = -13008000, 
                  ymax = 3863123, ymin = 3811123 ), 
                  crs = st_crs("+init=epsg:3395"))

tm_shape( sd_dorling, bbox = bbx) + 
  tm_polygons( col="hinc12", n=10, style="quantile", palette="Spectral" ) +
  tm_layout( "Dorling Cartogram", title.position=c("right","top") )


Clustering

We will use the same set of variables as last week. The data is transformed into z-score so that they are all on similar scales.

keep.these <- c("pnhwht12", "pnhblk12", "phisp12", "pntv12", "pfb12", "polang12", 
"phs12", "pcol12", "punemp12", "pflabf12", "pprof12", "pmanuf12", 
"pvet12", "psemp12", "hinc12", "incpc12", "ppov12", "pown12", 
"pvac12", "pmulti12", "mrent12", "mhmval12", "p30old12", "p10yrs12", 
"p18und12", "p60up12", "p75up12", "pmar12", "pwds12", "pfhh12")

d1 <- sd_dorling@data
d2 <- dplyr::select( d1, keep.these )
d3 <- apply( d2, 2, scale )
head( d3[,1:6] ) %>% pander()
pnhwht12 pnhblk12 phisp12 pntv12 pfb12 polang12
1.484 -0.8089 -0.8851 -0.3282 -0.8389 -0.6954
1.417 -0.8089 -0.956 -0.3282 -0.4502 -0.5958
1.385 -0.6709 -0.9795 -0.3282 -1.091 -0.8657
0.6613 0.04017 -0.883 -0.1466 -0.05741 -0.8179
0.843 -0.03702 -0.8381 0.5606 -0.8445 -0.8246
0.984 -0.3162 -0.8821 -0.3282 -0.9838 -0.8401


Prepare Data for Clustering

We transform all of the variables to z scorse so they are on the same scale while clustering. This ensures that each census variable has equal weight. Z-scores typically range from about -3 to +3 with a mean of zero.

keep.these <- c("pnhwht12", "pnhblk12", "phisp12", "pntv12", "pfb12", "polang12", 
"phs12", "pcol12", "punemp12", "pflabf12", "pprof12", "pmanuf12", 
"pvet12", "psemp12", "hinc12", "incpc12", "ppov12", "pown12", 
"pvac12", "pmulti12", "mrent12", "mhmval12", "p30old12", "p10yrs12", 
"p18und12", "p60up12", "p75up12", "pmar12", "pwds12", "pfhh12")

d2 <- dplyr::select( d1, keep.these )
d3 <- apply( d2, 2, scale )

head( d3[,1:6] ) %>% pander()
pnhwht12 pnhblk12 phisp12 pntv12 pfb12 polang12
1.484 -0.8089 -0.8851 -0.3282 -0.8389 -0.6954
1.417 -0.8089 -0.956 -0.3282 -0.4502 -0.5958
1.385 -0.6709 -0.9795 -0.3282 -1.091 -0.8657
0.6613 0.04017 -0.883 -0.1466 -0.05741 -0.8179
0.843 -0.03702 -0.8381 0.5606 -0.8445 -0.8246
0.984 -0.3162 -0.8821 -0.3282 -0.9838 -0.8401


Perform Cluster Analysis

For more details on cluster analysis visit the mclust tutorial.

# library( mclust )
set.seed( 1234 )
fit <- Mclust( d3 )
sd_dorling$cluster <- as.factor( fit$classification )
summary( fit )
## ---------------------------------------------------- 
## Gaussian finite mixture model fitted by EM algorithm 
## ---------------------------------------------------- 
## 
## Mclust VVE (ellipsoidal, equal orientation) model with 6 components: 
## 
##  log-likelihood   n  df       BIC       ICL
##       -11080.09 534 800 -27184.49 -27224.52
## 
## Clustering table:
##   1   2   3   4   5   6 
##  38  62 123 141 117  53
## ---------------------------------------------------- 
## Gaussian finite mixture model fitted by EM algorithm 
## ---------------------------------------------------- 

Identifying Neighborhood Clusters

Build the charts to compare census characteristics across the groups.

df.pct <- sapply( d2, ntile, 100 )
d4 <- as.data.frame( df.pct )
d4$cluster <- as.factor( paste0("GROUP-",fit$classification) )

num.groups <- length( unique( fit$classification ) )

stats <- 
d4 %>% 
  group_by( cluster ) %>% 
  summarise_each( funs(mean) )

t <- data.frame( t(stats), stringsAsFactors=F )
names(t) <- paste0( "GROUP.", 1:num.groups )
t <- t[-1,]



for( i in 1:num.groups )
{
  z <- t[,i]
  plot( rep(1,30), 1:30, bty="n", xlim=c(-75,100), 
        type="n", xaxt="n", yaxt="n",
        xlab="Percentile", ylab="",
        main=paste("GROUP",i) )
  abline( v=seq(0,100,25), lty=3, lwd=1.5, col="gray90" )
  segments( y0=1:30, x0=0, x1=100, col="gray70", lwd=2 )
  text( -0.2, 1:30, data.dictionary$VARIABLE[-1], cex=0.85, pos=2 )
  points( z, 1:30, pch=19, col="firebrick", cex=1.5 )
  axis( side=1, at=c(0,50,100), col.axis="gray", col="gray" )
}


# project to standard lat-lon coordinate system 
sd_dorling <- spTransform( sd_dorling, CRS("+proj=longlat +datum=WGS84") )

# write to file 
geojson_write( sd_dorling, file="sd_dorling.geojson", geometry="polygon" )
## <geojson-file>
##   Path:       sd_dorling.geojson
##   From class: SpatialPolygonsDataFrame
#load from github
#github.url <- "https://raw.githubusercontent.com/DS4PS/cpp-529-master/master/data/sd_dorling.geojson"
#sd <- geojson_read( x=github.url,  what="sp" )
# from local file path
sd <- geojson_read( "C:/Users/Owner/Documents/Homework/CPP 529/Final-Project_files/sd_dorling.geojson", what="sp" )

plot( sd )