library( geojsonio ) # read shapefiles
library( sp ) # work with shapefiles
library( sf ) # work with shapefiles - simple features format
library( mclust ) # cluster analysis
library( tmap ) # theme maps
library( ggplot2 ) # graphing
library( ggthemes ) # nice formats for ggplots
library( dplyr ) # data wrangling
library( pander ) # formatting RMD tables
library( tidycensus )
library( cartogram ) # spatial maps w/ tract size bias reduction
library( maptools ) # spatial object manipulation
library( corrplot )
census_api_key("49d90e91867ec8c4c1eab210c0c0636ad21f5b6e")
crosswalk <- read.csv( "https://raw.githubusercontent.com/DS4PS/cpp-529-master/master/data/cbsatocountycrosswalk.csv", stringsAsFactors=F, colClasses="character" )
# search for cities names San Diego
# grep( "^SAN DIEGO, CA", crosswalk$msaname, value=TRUE )
these.msa <- crosswalk$msaname == "SAN DIEGO, CA"
# Get the FIPS codes for the selected MSA
these.fips <- crosswalk$fipscounty[these.msa]
these.fips <- na.omit(these.fips)
# Get San Diego
sd_data <- get_acs(geography = "tract",
variables = "B01003_001",
state = "06",
county = "073",
geometry = TRUE) %>%
select(GEOID, estimate) %>%
rename(POP = estimate)
##
|
| | 0%
|
|= | 1%
|
|= | 2%
|
|== | 2%
|
|== | 3%
|
|=== | 4%
|
|=== | 5%
|
|==== | 5%
|
|==== | 6%
|
|===== | 7%
|
|====== | 8%
|
|====== | 9%
|
|======= | 9%
|
|======= | 10%
|
|======== | 11%
|
|======== | 12%
|
|========= | 12%
|
|========= | 13%
|
|========= | 14%
|
|========== | 14%
|
|=========== | 15%
|
|=========== | 16%
|
|============ | 17%
|
|============= | 18%
|
|============= | 19%
|
|============== | 19%
|
|============== | 20%
|
|=============== | 21%
|
|=============== | 22%
|
|================ | 22%
|
|================ | 23%
|
|================= | 24%
|
|================= | 25%
|
|================== | 25%
|
|================== | 26%
|
|=================== | 27%
|
|==================== | 28%
|
|==================== | 29%
|
|===================== | 29%
|
|===================== | 30%
|
|====================== | 31%
|
|====================== | 32%
|
|======================= | 32%
|
|======================= | 33%
|
|======================== | 34%
|
|======================== | 35%
|
|========================= | 36%
|
|========================== | 36%
|
|========================== | 37%
|
|========================== | 38%
|
|=========================== | 39%
|
|============================ | 40%
|
|============================ | 41%
|
|============================= | 41%
|
|============================= | 42%
|
|============================== | 43%
|
|=============================== | 44%
|
|=============================== | 45%
|
|================================ | 45%
|
|================================ | 46%
|
|================================= | 47%
|
|================================= | 48%
|
|================================== | 48%
|
|================================== | 49%
|
|=================================== | 50%
|
|=================================== | 51%
|
|==================================== | 51%
|
|==================================== | 52%
|
|===================================== | 53%
|
|====================================== | 54%
|
|====================================== | 55%
|
|======================================= | 55%
|
|======================================= | 56%
|
|======================================== | 56%
|
|======================================== | 57%
|
|======================================== | 58%
|
|========================================= | 58%
|
|========================================= | 59%
|
|========================================== | 60%
|
|========================================== | 61%
|
|=========================================== | 61%
|
|=========================================== | 62%
|
|============================================ | 62%
|
|============================================ | 63%
|
|============================================= | 64%
|
|============================================= | 65%
|
|============================================== | 65%
|
|============================================== | 66%
|
|=============================================== | 67%
|
|=============================================== | 68%
|
|================================================ | 68%
|
|================================================ | 69%
|
|================================================= | 70%
|
|================================================== | 71%
|
|================================================== | 72%
|
|=================================================== | 72%
|
|=================================================== | 73%
|
|==================================================== | 74%
|
|==================================================== | 75%
|
|===================================================== | 75%
|
|===================================================== | 76%
|
|====================================================== | 77%
|
|====================================================== | 78%
|
|======================================================= | 78%
|
|======================================================= | 79%
|
|======================================================== | 80%
|
|========================================================= | 81%
|
|========================================================= | 82%
|
|========================================================== | 82%
|
|========================================================== | 83%
|
|=========================================================== | 84%
|
|=========================================================== | 85%
|
|============================================================ | 85%
|
|============================================================ | 86%
|
|============================================================= | 87%
|
|============================================================== | 88%
|
|============================================================== | 89%
|
|=============================================================== | 90%
|
|================================================================ | 91%
|
|================================================================ | 92%
|
|================================================================= | 93%
|
|================================================================== | 94%
|
|================================================================== | 95%
|
|=================================================================== | 96%
|
|==================================================================== | 97%
|
|===================================================================== | 98%
|
|===================================================================== | 99%
|
|======================================================================| 99%
|
|======================================================================| 100%
# Load Census data
URL <- "https://github.com/DS4PS/cpp-529-master/raw/master/data/ltdb_std_2010_sample.rds"
census_data <- readRDS(gzcon(url(URL)))
# remove leading 0s
sd_data$GEOID <- substr(sd_data$GEOID, 2, nchar(sd_data$GEOID))
# Merge sd_data (San Diego tracts) with census_data
sd_data <- merge(sd_data, census_data, by.x = "GEOID", by.y = "tractid")
# Remove any empty polygons
sd_merged <- sd_data[!st_is_empty(sd_data), ]
This exercise uses Census data from the 2012 American Communities Survey made available through the Diversity and Disparities Project.
DATA DICTIONARY
data.dictionary <-
structure(list(LABEL = c("tractid", "pnhwht12", "pnhblk12", "phisp12",
"pntv12", "pfb12", "polang12", "phs12", "pcol12", "punemp12",
"pflabf12", "pprof12", "pmanuf12", "pvet12", "psemp12", "hinc12",
"incpc12", "ppov12", "pown12", "pvac12", "pmulti12", "mrent12",
"mhmval12", "p30old12", "p10yrs12", "p18und12", "p60up12", "p75up12",
"pmar12", "pwds12", "pfhh12"), VARIABLE = c("GEOID", "Percent white, non-Hispanic",
"Percent black, non-Hispanic", "Percent Hispanic", "Percent Native American race",
"Percent foreign born", "Percent speaking other language at home, age 5 plus",
"Percent with high school degree or less", "Percent with 4-year college degree or more",
"Percent unemployed", "Percent female labor force participation",
"Percent professional employees", "Percent manufacturing employees",
"Percent veteran", "Percent self-employed", "Median HH income, total",
"Per capita income", "Percent in poverty, total", "Percent owner-occupied units",
"Percent vacant units", "Percent multi-family units", "Median rent",
"Median home value", "Percent structures more than 30 years old",
"Percent HH in neighborhood 10 years or less", "Percent 17 and under, total",
"Percent 60 and older, total", "Percent 75 and older, total",
"Percent currently married, not separated", "Percent widowed, divorced and separated",
"Percent female-headed families with children")), class = "data.frame", row.names = c(NA,
-31L))
# dorling cartogram of San Diego Census Tracts
# Convert the sf map object to an sp version
sd_sp <- as_Spatial(sd_merged)
# Project the map and remove empty tracts
sd_sp <- spTransform(sd_sp, CRS("+init=epsg:3395"))
sd_sp <- sd_sp[sd_sp$POP != 0 & (!is.na(sd_sp$POP)), ]
# Convert census tract polygons to a Dorling cartogram
sd_sp$pop_w <- sd_sp$POP / 9000 # Adjust the scaling factor as needed
sd_dorling <- cartogram_dorling(x = sd_sp, weight = "pop_w", k = 1)
# Plot the Dorling cartogram
plot(sd_dorling)
library(tmap)
# Transform the CRS of sd_dorling
sd_dorling <- spTransform(sd_dorling, CRS("+init=epsg:3395"))
# Define a bounding box for the San Diego area
bb <- st_bbox(c(xmin = -13100000, xmax = -12850000,
ymax = 4025000, ymin = 3775000),
crs = st_crs("+init=epsg:3395"))
# Create the tmap visualization for the Dorling cartogram
tm_shape(sd_dorling, bbox = bb) +
tm_polygons(col = "hinc12", n = 10, style = "quantile", palette = "Spectral") +
tm_layout("San Diego Dorling Cartogram", title.position = c("right", "top"))
tmap_mode("view")
tm_basemap( "HikeBike.HillShading" ) +
tm_shape( sd_dorling, bbox=bb ) +
tm_polygons( col="phisp12", n=7, style="quantile", palette="-inferno" )
tm_basemap( "Stamen.Watercolor" ) +
tm_shape( sd_dorling, bbox=bb ) +
tm_polygons( col="hinc12", n=7, style="quantile", palette="RdYlBu" ) +
tm_legend( show=FALSE )
We are going to extract the data from the shapefile and save it as a separate data frame so we can use it for analysis independent of the shapefile.
d1 <- sd_dorling@data
We transform all of the variables to z scores so they are on the same scale while clustering. This ensures that each census variable has equal weight. Z-scores typically range from about -3 to +3 with a mean of zero.
keep.these <- c("pnhwht12", "pnhblk12", "phisp12", "pntv12", "pfb12", "polang12",
"phs12", "pcol12", "punemp12", "pflabf12", "pprof12", "pmanuf12",
"pvet12", "psemp12", "hinc12", "incpc12", "ppov12", "pown12",
"pvac12", "pmulti12", "mrent12", "mhmval12", "p30old12", "p10yrs12",
"p18und12", "p60up12", "p75up12", "pmar12", "pwds12", "pfhh12")
d2 <- select( d1, keep.these )
d3 <- apply( d2, 2, scale )
set.seed( 1234 )
fit <- Mclust( d3 )
sd_dorling$cluster <- as.factor( fit$classification )
summary( fit )
## ----------------------------------------------------
## Gaussian finite mixture model fitted by EM algorithm
## ----------------------------------------------------
##
## Mclust VVE (ellipsoidal, equal orientation) model with 6 components:
##
## log-likelihood n df BIC ICL
## -11080.09 534 800 -27184.49 -27224.52
##
## Clustering table:
## 1 2 3 4 5 6
## 38 62 123 141 117 53
Some visuals of model fit statistics (doesn’t work well with a lot of variables):
plot( fit, what = "classification" )
d2$cluster <- d2$cluster <- as.factor( paste0("GROUP-",fit$classification) )
ggplot( d2, aes( x=phisp12 ) ) +
geom_density( alpha = 0.5, fill="blue" ) + # xlim( -3, 3 ) +
xlab( "Percent Hispanic" ) + facet_wrap( ~ cluster, nrow=2 ) + theme_minimal()
ggplot( d2, aes( x=pcol12) ) +
geom_density( alpha = 0.5, fill="blue" ) + # xlim( -3, 3 ) +
xlab( "Percent with a College Degree" ) + facet_wrap( ~ cluster, nrow=2 ) + theme_minimal()
df.pct <- sapply( d2, ntile, 100 )
d3 <- as.data.frame( df.pct )
d3$cluster <- as.factor( paste0("GROUP-",fit$classification) )
stats <-
d3 %>%
group_by( cluster ) %>%
summarise_each( funs(mean) )
t <- data.frame( t(stats), stringsAsFactors=F )
names(t) <- paste0( "GROUP.", 1:6 )
t <- t[-1,]
for( i in 1:6 )
{
z <- t[,i]
plot( rep(1,30), 1:30, bty="n", xlim=c(-75,100),
type="n", xaxt="n", yaxt="n",
xlab="Percentile", ylab="",
main=paste("GROUP",i) )
abline( v=seq(0,100,25), lty=3, lwd=1.5, col="gray90" )
segments( y0=1:30, x0=0, x1=100, col="gray70", lwd=2 )
text( -0.2, 1:30, data.dictionary$VARIABLE[-1], cex=0.85, pos=2 )
points( z, 1:30, pch=19, col="firebrick", cex=1.5 )
axis( side=1, at=c(0,50,100), col.axis="gray", col="gray" )
}
tmap_mode("plot")
tmap_style("cobalt")
tm_shape( sd_dorling, bbox=bb ) +
tm_polygons( col="cluster", palette="Accent" )
GROUP-1: Mixed-Ethnicity Older Neighborhoods with Moderate Socioeconomic Status GROUP-2: Urban Highly Educated Neighborhoods with Diverse Populations GROUP-3: Economically Disadvantaged and Diverse Urban Communities GROUP-4: Affluent and Educated Older Communities GROUP-5: Majority Hispanic, Low-Income Urban Communities GROUP-6: Affluent, Educated, and Diverse Urban Communities
d3 <- as.data.frame(d3)
df.dim1 <- dplyr::select( d3, pown12, pmulti12, p10yrs12, pwds12, pfhh12 )
corrplot( cor(df.dim1, use="complete.obs"),
order = "hclust", tl.col='black', tl.cex=.75 )
# flip the signs
df.dim1$pmulti12 <- - df.dim1$pmulti12
df.dim1$p10yrs12 <- - df.dim1$p10yrs12
df.dim1$pwds12 <- - df.dim1$pwds12
df.dim1$pfhh12 <- - df.dim1$pfhh12
corrplot( cor(df.dim1, use="complete.obs"),
order = "hclust", tl.col='black', tl.cex=.75 )
df.dim1 <- dplyr::select(d3, pnhwht12, pnhblk12, phisp12, pfb12,polang12 )
corrplot( cor(df.dim1, use="complete.obs"),
order = "hclust", tl.col='black', tl.cex=.75 )
df.dim2 <- d3[ c("pnhwht12", "pnhblk12", "phisp12", "pfb12", "polang12") ]
# Flip the sign of the percent white variable
df.dim2$pnhwht12 <- -df.dim2$pnhwht12
# Generate the correlation plot
corrplot(cor(df.dim2, use = "complete.obs"), order = "hclust", tl.col = 'black', tl.cex = .75)
# Flip the sign of phs12
df.dim3 <- select( d3, pcol12, phs12, pprof12, hinc12, mhmval12 )
df.dim3$phs12 <- -df.dim3$phs12
# Create the correlation plot with the updated sign
corrplot(cor(df.dim3, use="complete.obs"),
order = "hclust", tl.col='black', tl.cex=.75)
dim1 = d3$pown12 - d3$pmulti12 - d3$p10yrs12 - d3$pwds12 - d3$pfhh12
dim2 = -d3$pnhwht12 + d3$pnhblk12 + d3$phisp12 + d3$pfb12 + d3$polang12
dim3 = d3$pcol12 - d3$phs12 + d3$pprof12 + d3$hinc12 + d3$mhmval12
df.nhood.metrics <- data.frame(dim1,dim2,dim3)
summary( df.nhood.metrics )
## dim1 dim2 dim3
## Min. :-372.00 Min. :-95.00 Min. :-90.00
## 1st Qu.:-234.00 1st Qu.: 44.25 1st Qu.: 27.25
## Median :-149.00 Median :143.00 Median :144.50
## Mean :-145.20 Mean :145.20 Mean :145.20
## 3rd Qu.: -54.25 3rd Qu.:253.75 3rd Qu.:260.50
## Max. : 95.00 Max. :369.00 Max. :397.00
corrplot( cor( df.nhood.metrics, use="complete.obs" ),
order = "hclust", tl.col='black', tl.cex=.75 )
# Fit the Mclust model using the df.nhood.
fit2 <- Mclust(df.nhood.metrics)
summary(fit2)
## ----------------------------------------------------
## Gaussian finite mixture model fitted by EM algorithm
## ----------------------------------------------------
##
## Mclust VEE (ellipsoidal, equal shape and orientation) model with 5 components:
##
## log-likelihood n df BIC ICL
## -9451.964 534 29 -19086.06 -19226.04
##
## Clustering table:
## 1 2 3 4 5
## 35 271 120 58 50
# Update the sd_dorling data frame with the new classifications
sd_dorling$cluster2 <- as.factor(fit2$classification)
# cluster with dataset of three census variables
d33 <- data.frame( d3$p18und12, d3$pflabf12, d3$hinc12 )
fit3 <- Mclust( d33 )
summary( fit3 )
## ----------------------------------------------------
## Gaussian finite mixture model fitted by EM algorithm
## ----------------------------------------------------
##
## Mclust VVI (diagonal, varying volume and shape) model with 6 components:
##
## log-likelihood n df BIC ICL
## -7341.632 534 41 -14940.76 -15101.73
##
## Clustering table:
## 1 2 3 4 5 6
## 81 40 156 64 98 95
sd_dorling$cluster3 <- as.factor( fit3$classification )
tmap_mode("plot")
tmap_style("cobalt")
tm1 <-
tm_shape( sd_dorling, bbox=bb ) +
tm_polygons( col="cluster", palette="Accent" )
tm2 <-
tm_shape( sd_dorling, bbox=bb ) +
tm_polygons( col="cluster2", palette="Accent" )
tm3 <-
tm_shape( sd_dorling, bbox=bb ) +
tm_polygons( col="cluster3", palette="Accent" )
tmap_arrange( tm1, tm2, tm3 )
plot( fit2, what = "classification" )
plot( fit3, what = "classification" )
plot( fit2, what = "classification" )
plot( fit3, what = "classification" )
Compare that set of groups to the groups identified by the model using only the three indices above. Are they identifying the same groups? Which group is missing?
They do not perfectly align with the original six groups. There is some overlap, but the new groups capture different aspects of the data.
From the six original groups, Group 3 and Group 5 seems to be the ones least represented in the new groups formed. These group have unique combinations of characteristics that make it difficult to be accurately represented. For example, differences in foreign-born residents, language spoken at home, housing characteristics, percentage 18 and under, and female labor force participation.
# use dput( data.dictionary ) to create reproducible data frames for RMD files
data.dictionary1 <-
structure(list(LABEL = c("pnhwht12", "punemp12","p30old12", "p10yrs12"), VARIABLE = c("Percent white","Percent unemployed","Percent structures more than 30 years old")), class = "data.frame", row.names = c(NA,
-30L))
# list variables for clustering
use.these <- c("pnhwht12", "punemp12", "p30old12")
dd.cluster1 <- data.dictionary[ data.dictionary$LABEL %in% use.these , ]
# cluster 2
LABEL <- c("dim1","dim2","dim3")
VARIABLE <- c("Neighborhood transitivity","Neighborhood diversity","Human capital")
dd.cluster2 <- data.frame( LABEL, VARIABLE )
# cluster 3 - update with your variables
use.these <- c("pnhwht12", "punemp12", "p30old12")
dd.cluster3 <- data.dictionary[ data.dictionary$LABEL %in% use.these , ]
d5 <- select( d1, use.these )
d6 <- apply( d5, 2, scale )
set.seed( 1234 )
fit <- Mclust( d6 )
sd_dorling$cluster <- as.factor( fit$classification )
summary( fit )
## ----------------------------------------------------
## Gaussian finite mixture model fitted by EM algorithm
## ----------------------------------------------------
##
## Mclust VVI (diagonal, varying volume and shape) model with 4 components:
##
## log-likelihood n df BIC ICL
## -2025.49 534 27 -4220.551 -4391.385
##
## Clustering table:
## 1 2 3 4
## 126 189 110 109
Some visuals of model fit statistics (doesn’t work well with a lot of variables):
plot( fit, what = "classification" )
d5$cluster <- d5$cluster <- as.factor( paste0("GROUP-",fit$classification) )
ggplot( d5, aes( x=pnhwht12) ) +
geom_density( alpha = 0.5, fill="blue" ) + # xlim( -3, 3 ) +
xlab( "Percent White" ) + facet_wrap( ~ cluster, nrow=2 ) + theme_minimal()
ggplot( d5, aes( x=punemp12) ) +
geom_density( alpha = 0.5, fill="blue" ) + # xlim( -3, 3 ) +
xlab( "Percent Unemployed" ) + facet_wrap( ~ cluster, nrow=2 ) + theme_minimal()
df.pct <- sapply( d5, ntile, 100 )
d6 <- as.data.frame( df.pct )
d6$cluster <- as.factor( paste0("GROUP-",fit$classification) )
stats <-
d6 %>%
group_by( cluster ) %>%
summarise_each( funs(mean) )
t <- data.frame( t(stats), stringsAsFactors=F )
names(t) <- paste0( "GROUP.", 1:4 )
t <- t[-1,]
for (i in 1:4) {
z <- t[, i]
plot(rep(1, 3), 1:3, bty = "n", xlim = c(-75, 100),
type = "n", xaxt = "n", yaxt = "n",
xlab = "Percentile", ylab = "",
main = paste("GROUP", i))
abline(v = seq(0, 100, 25), lty = 3, lwd = 1.5, col = "gray90")
# Modify the 'segments' function call to display only 3 segments instead of 30
segments(y0 = 1:3, x0 = 0, x1 = 100, col = "gray70", lwd = 2)
# Use the filtered data dictionary for the labels
text(-0.2, 1:3, data.dictionary1$VARIABLE, cex = 0.85, pos = 2)
points(z, 1:3, pch = 19, col = "firebrick", cex = 1.5)
axis(side = 1, at = c(0, 50, 100), col.axis = "gray", col = "gray")
}
print(stats)
## # A tibble: 4 × 4
## cluster pnhwht12 punemp12 p30old12
## <fct> <dbl> <dbl> <dbl>
## 1 GROUP-1 82.9 26.8 62.1
## 2 GROUP-2 42.3 56.9 55.3
## 3 GROUP-3 9.76 72.5 58.9
## 4 GROUP-4 58.1 34.3 10.1