This lab will have you replicate the work you did for the previous lab, but instead of using the Phoenix data you will select a city of your choice to use for this lab, subsequent labs, and the final project.
You want to select a dense city that has a lot of census tracts, so I suggest you select from the 50 largest cities in the US. The Census defines Metropolitan Statistical Areas (MSAs) as “one or more adjacent counties or county equivalents that have at least one urban core area of at least 50,000 population, plus adjacent territory that has a high degree of social and economic integration with the core as measured by commuting ties.” There are currently 384 MSAs in the US.
Wikipedia provides a convenient list of MSAs sorted by size.
library( geojsonio ) # read shapefiles
library( sp ) # work with shapefiles
library( sf ) # work with shapefiles - simple features format
library( mclust ) # cluster analysis
library( tmap ) # theme maps
library( ggplot2 ) # graphing
library( ggthemes ) # nice formats for ggplots
library( dplyr ) # data wrangling
library( pander ) # formatting RMD tables
library( tidycensus )
library( cartogram ) # spatial maps w/ tract size bias reduction
library( maptools ) # spatial object manipulation You can select a city from the list of large MSAs.
To get Census data on the city you will first need to identify all of the counties that comprise the MSA. You can look this information up through MSA to FIPS crosswalks provided by the National Bureau for Economic Research (NBER): https://www.nber.org/data/cbsa-fips-county-crosswalk.html
I have added the file to GitHub for ease of access.
crosswalk <- read.csv( "https://raw.githubusercontent.com/DS4PS/cpp-529-master/master/data/cbsatocountycrosswalk.csv", stringsAsFactors=F, colClasses="character" )
# search for citie names by strings, use the ^ anchor for "begins with"
grep( "^SAN D", crosswalk$msaname, value=TRUE )## [1] "SAN DIEGO, CA"
grep( "^CALI", crosswalk$msaname, value=TRUE )## [1] "CALIFORNIA" "CALIFORNIA" "CALIFORNIA" "CALIFORNIA" "CALIFORNIA"
## [6] "CALIFORNIA" "CALIFORNIA" "CALIFORNIA" "CALIFORNIA" "CALIFORNIA"
## [11] "CALIFORNIA" "CALIFORNIA" "CALIFORNIA" "CALIFORNIA" "CALIFORNIA"
## [16] "CALIFORNIA" "CALIFORNIA" "CALIFORNIA" "CALIFORNIA" "CALIFORNIA"
## [21] "CALIFORNIA" "CALIFORNIA" "CALIFORNIA" "CALIFORNIA" "CALIFORNIA"
Select all of your county fips. To use them in the TidyCenss package you will need to split the state and county:
these.sdp <-crosswalk$msaname == grep( "^SAN DIEGO", crosswalk$msaname,
value=TRUE )
these.fips <- crosswalk$fipscounty[ these.sdp ]
these.fips <- na.omit( these.fips )To create a Dorling cartogram we need a shapefile and a population count. We can get both through the Census download that includes simple features.
library( tidycensus )
#census_api_key("404739a4c1d1baf6067b937fe42425b977ef145d", install = TRUE)
state.fips <- substr( these.fips, 1, 2 )
county.fips <- substr( these.fips, 3, 5 )
cbind( these.fips, state.fips, county.fips ) %>% pander()| these.fips | state.fips | county.fips |
|---|---|---|
| 06073 | 06 | 073 |
sd.pop <-
get_acs( geography = "tract", variables = "B01003_001",
state = "06", county = county.fips[state.fips=="06"], geometry = TRUE ) %>%
dplyr::select( GEOID, estimate ) %>%
rename( POP=estimate )##
|
| | 0%
|
|= | 1%
|
|= | 2%
|
|== | 3%
|
|=== | 4%
|
|=== | 5%
|
|==== | 5%
|
|==== | 6%
|
|===== | 7%
|
|====== | 8%
|
|====== | 9%
|
|======= | 9%
|
|======= | 10%
|
|======== | 11%
|
|======== | 12%
|
|========= | 12%
|
|========= | 13%
|
|========== | 14%
|
|=========== | 15%
|
|=========== | 16%
|
|============ | 17%
|
|============= | 18%
|
|============= | 19%
|
|============== | 19%
|
|============== | 20%
|
|============== | 21%
|
|=============== | 21%
|
|=============== | 22%
|
|================ | 22%
|
|================ | 23%
|
|================= | 24%
|
|================= | 25%
|
|================== | 25%
|
|================== | 26%
|
|=================== | 27%
|
|==================== | 28%
|
|==================== | 29%
|
|===================== | 29%
|
|===================== | 30%
|
|====================== | 31%
|
|====================== | 32%
|
|======================= | 33%
|
|======================== | 34%
|
|======================== | 35%
|
|========================= | 36%
|
|========================== | 37%
|
|=========================== | 38%
|
|=========================== | 39%
|
|============================ | 40%
|
|============================= | 41%
|
|============================= | 42%
|
|============================== | 43%
|
|=============================== | 44%
|
|================================ | 45%
|
|================================ | 46%
|
|================================= | 47%
|
|================================== | 48%
|
|================================== | 49%
|
|=================================== | 50%
|
|==================================== | 51%
|
|==================================== | 52%
|
|===================================== | 53%
|
|====================================== | 54%
|
|====================================== | 55%
|
|======================================= | 55%
|
|======================================= | 56%
|
|======================================== | 57%
|
|========================================= | 58%
|
|========================================= | 59%
|
|========================================== | 60%
|
|========================================== | 61%
|
|=========================================== | 61%
|
|=========================================== | 62%
|
|============================================ | 63%
|
|============================================= | 64%
|
|============================================== | 65%
|
|============================================== | 66%
|
|=============================================== | 67%
|
|================================================ | 68%
|
|================================================ | 69%
|
|================================================= | 70%
|
|================================================== | 71%
|
|=================================================== | 72%
|
|=================================================== | 73%
|
|==================================================== | 74%
|
|===================================================== | 75%
|
|===================================================== | 76%
|
|====================================================== | 77%
|
|======================================================= | 78%
|
|======================================================= | 79%
|
|======================================================== | 80%
|
|========================================================= | 81%
|
|========================================================== | 82%
|
|========================================================== | 83%
|
|=========================================================== | 84%
|
|============================================================ | 85%
|
|============================================================ | 86%
|
|============================================================= | 87%
|
|============================================================== | 88%
|
|============================================================== | 89%
|
|=============================================================== | 90%
|
|================================================================ | 91%
|
|================================================================= | 92%
|
|================================================================= | 93%
|
|================================================================== | 94%
|
|=================================================================== | 95%
|
|=================================================================== | 96%
|
|==================================================================== | 97%
|
|===================================================================== | 98%
|
|======================================================================| 99%
|
|======================================================================| 100%
getwd()## [1] "C:/Users/Owner/Documents/Homework/CPP 529"
URL <- "https://github.com/DS4PS/cpp-529-master/raw/master/data/ltdb_std_2010_sample.rds"
census.dat <- readRDS(gzcon(url( URL )))
# can merge an sf object and data.frame
sd.pop$GEOID<-sub('.', '', sd.pop$GEOID)
sdp <- merge( sd.pop, census.dat, by.x="GEOID", by.y="tractid" )
# make sure there are no empty polygons
sdp <- sdp[ ! st_is_empty( sdp ) , ]
#head(sdp)
#head(sd) %>% pander()This exercise uses Census data from the 2012 American Communities Survey made available through the Diversity and Disparities Project.
dd.URL <- "https://raw.githubusercontent.com/DS4PS/cpp-529-master/master/data/data-dictionary.csv"
data.dictionary <- read.csv( dd.URL, stringsAsFactors=F )
data.dictionary %>% pander()| LABEL | VARIABLE |
|---|---|
| tractid | GEOID |
| pnhwht12 | Percent white, non-Hispanic |
| pnhblk12 | Percent black, non-Hispanic |
| phisp12 | Percent Hispanic |
| pntv12 | Percent Native American race |
| pfb12 | Percent foreign born |
| polang12 | Percent speaking other language at home, age 5 plus |
| phs12 | Percent with high school degree or less |
| pcol12 | Percent with 4-year college degree or more |
| punemp12 | Percent unemployed |
| pflabf12 | Percent female labor force participation |
| pprof12 | Percent professional employees |
| pmanuf12 | Percent manufacturing employees |
| pvet12 | Percent veteran |
| psemp12 | Percent self-employed |
| hinc12 | Median HH income, total |
| incpc12 | Per capita income |
| ppov12 | Percent in poverty, total |
| pown12 | Percent owner-occupied units |
| pvac12 | Percent vacant units |
| pmulti12 | Percent multi-family units |
| mrent12 | Median rent |
| mhmval12 | Median home value |
| p30old12 | Percent structures more than 30 years old |
| p10yrs12 | Percent HH in neighborhood 10 years or less |
| p18und12 | Percent 17 and under, total |
| p60up12 | Percent 60 and older, total |
| p75up12 | Percent 75 and older, total |
| pmar12 | Percent currently married, not separated |
| pwds12 | Percent widowed, divorced and separated |
| pfhh12 | Percent female-headed families with children |
# convert sf map object to an sp version
sd.sp <- as_Spatial( sdp )
class( sd.sp )## [1] "SpatialPolygonsDataFrame"
## attr(,"package")
## [1] "sp"
## [1] "SpatialPolygonsDataFrame"
## attr(,"package")
## [1] "sp"
plot( sd.sp )
# project map and remove empty tracts
sd.sp <- spTransform( sd.sp, CRS("+init=epsg:3395"))
sd.sp <- sd.sp[ sd.sp$POP != 0 & (! is.na( sd.sp$POP )) , ]
# convert census tract polygons to dorling cartogram
# no idea why k=0.03 works, but it does - default is k=5
sd.sp$pop.w <- sd.sp$POP / 9000 # max(msp.sp$POP) # standardizes it to max of 1.5
sd_dorling <- cartogram_dorling( x=sd.sp, weight="pop.w", k=0.05 )
plot( sd_dorling )
axis(side=1)
tm_shape( sd_dorling ) +
tm_polygons( size="POP", col="hinc12", n=7, style="quantile", palette="Spectral" )
Use a bounding box: improve the aestetics.
# user-defined bounding box to move slocer to subjects
bbx <- st_bbox( c( xmin = -13061000, xmax = -13008000,
ymax = 3863123, ymin = 3811123 ),
crs = st_crs("+init=epsg:3395"))
tm_shape( sd_dorling, bbox = bbx) +
tm_polygons( col="hinc12", n=10, style="quantile", palette="Spectral" ) +
tm_layout( "Dorling Cartogram", title.position=c("right","top") )
We will use the same set of variables as last week. The data is transformed into z-score so that they are all on similar scales.
keep.these <- c("pnhwht12", "pnhblk12", "phisp12", "pntv12", "pfb12", "polang12",
"phs12", "pcol12", "punemp12", "pflabf12", "pprof12", "pmanuf12",
"pvet12", "psemp12", "hinc12", "incpc12", "ppov12", "pown12",
"pvac12", "pmulti12", "mrent12", "mhmval12", "p30old12", "p10yrs12",
"p18und12", "p60up12", "p75up12", "pmar12", "pwds12", "pfhh12")
d1 <- sd_dorling@data
d2 <- dplyr::select( d1, keep.these )
d3 <- apply( d2, 2, scale )
head( d3[,1:6] ) %>% pander()| pnhwht12 | pnhblk12 | phisp12 | pntv12 | pfb12 | polang12 |
|---|---|---|---|---|---|
| 1.484 | -0.8089 | -0.8851 | -0.3282 | -0.8389 | -0.6954 |
| 1.417 | -0.8089 | -0.956 | -0.3282 | -0.4502 | -0.5958 |
| 1.385 | -0.6709 | -0.9795 | -0.3282 | -1.091 | -0.8657 |
| 0.6613 | 0.04017 | -0.883 | -0.1466 | -0.05741 | -0.8179 |
| 0.843 | -0.03702 | -0.8381 | 0.5606 | -0.8445 | -0.8246 |
| 0.984 | -0.3162 | -0.8821 | -0.3282 | -0.9838 | -0.8401 |
We transform all of the variables to z scorse so they are on the same scale while clustering. This ensures that each census variable has equal weight. Z-scores typically range from about -3 to +3 with a mean of zero.
keep.these <- c("pnhwht12", "pnhblk12", "phisp12", "pntv12", "pfb12", "polang12",
"phs12", "pcol12", "punemp12", "pflabf12", "pprof12", "pmanuf12",
"pvet12", "psemp12", "hinc12", "incpc12", "ppov12", "pown12",
"pvac12", "pmulti12", "mrent12", "mhmval12", "p30old12", "p10yrs12",
"p18und12", "p60up12", "p75up12", "pmar12", "pwds12", "pfhh12")
d2 <- dplyr::select( d1, keep.these )
d3 <- apply( d2, 2, scale )
head( d3[,1:6] ) %>% pander()| pnhwht12 | pnhblk12 | phisp12 | pntv12 | pfb12 | polang12 |
|---|---|---|---|---|---|
| 1.484 | -0.8089 | -0.8851 | -0.3282 | -0.8389 | -0.6954 |
| 1.417 | -0.8089 | -0.956 | -0.3282 | -0.4502 | -0.5958 |
| 1.385 | -0.6709 | -0.9795 | -0.3282 | -1.091 | -0.8657 |
| 0.6613 | 0.04017 | -0.883 | -0.1466 | -0.05741 | -0.8179 |
| 0.843 | -0.03702 | -0.8381 | 0.5606 | -0.8445 | -0.8246 |
| 0.984 | -0.3162 | -0.8821 | -0.3282 | -0.9838 | -0.8401 |
For more details on cluster analysis visit the mclust tutorial.
# library( mclust )
set.seed( 1234 )
fit <- Mclust( d3 )
sd_dorling$cluster <- as.factor( fit$classification )
summary( fit )## ----------------------------------------------------
## Gaussian finite mixture model fitted by EM algorithm
## ----------------------------------------------------
##
## Mclust VVE (ellipsoidal, equal orientation) model with 6 components:
##
## log-likelihood n df BIC ICL
## -11080.09 534 800 -27184.49 -27224.52
##
## Clustering table:
## 1 2 3 4 5 6
## 38 62 123 141 117 53
## ----------------------------------------------------
## Gaussian finite mixture model fitted by EM algorithm
## ---------------------------------------------------- Build the charts to compare census characteristics across the groups.
df.pct <- sapply( d2, ntile, 100 )
d4 <- as.data.frame( df.pct )
d4$cluster <- as.factor( paste0("GROUP-",fit$classification) )
num.groups <- length( unique( fit$classification ) )
stats <-
d4 %>%
group_by( cluster ) %>%
summarise_each( funs(mean) )
t <- data.frame( t(stats), stringsAsFactors=F )
names(t) <- paste0( "GROUP.", 1:num.groups )
t <- t[-1,]
for( i in 1:num.groups )
{
z <- t[,i]
plot( rep(1,30), 1:30, bty="n", xlim=c(-75,100),
type="n", xaxt="n", yaxt="n",
xlab="Percentile", ylab="",
main=paste("GROUP",i) )
abline( v=seq(0,100,25), lty=3, lwd=1.5, col="gray90" )
segments( y0=1:30, x0=0, x1=100, col="gray70", lwd=2 )
text( -0.2, 1:30, data.dictionary$VARIABLE[-1], cex=0.85, pos=2 )
points( z, 1:30, pch=19, col="firebrick", cex=1.5 )
axis( side=1, at=c(0,50,100), col.axis="gray", col="gray" )
}
# project to standard lat-lon coordinate system
sd_dorling <- spTransform( sd_dorling, CRS("+proj=longlat +datum=WGS84") )
# write to file
geojson_write( sd_dorling, file="sd_dorling.geojson", geometry="polygon" )## <geojson-file>
## Path: sd_dorling.geojson
## From class: SpatialPolygonsDataFrame
#load from github
#github.url <- "https://raw.githubusercontent.com/DS4PS/cpp-529-master/master/data/sd_dorling.geojson"
#sd <- geojson_read( x=github.url, what="sp" )
# from local file path
sd <- geojson_read( "C:/Users/Owner/Documents/Homework/CPP 529/Final-Project_files/sd_dorling.geojson", what="sp" )
plot( sd )