Load Packages

library(here)
library(mclust)
library(raster)
library(rgdal)
library(sp)
library(RColorBrewer)
library(PCAtools)
library(sf)
library(tmap)
library(terra)
library(rgeos)
library(spData)
library(knitr)
library(kableExtra)

Import Data

#import raw data
dat_raw <- read.csv(here("data/CommunityData-raw-2015-v2.csv"))
#select columns of interest, removed co2_hhs
dat  <- dat_raw[,c(4:12,14:21)]
#convert to dataframe
dat <- as.data.frame(unclass(dat))
#set rownames to city names
rownames(dat)=dat_raw$ME
# remove any records that have NAs
dat = na.omit(dat)
# convert to z-scores
dat <- as.matrix(scale(dat, center = TRUE, scale = TRUE))

Run PCA on dataset

dat_comps <- prcomp(dat, center = F, scale=F) #set center and scale to FALSE because done pre-PCA
#get summary
s <- summary(dat_comps)
kable(s$importance)
PC1 PC2 PC3 PC4 PC5 PC6 PC7 PC8 PC9 PC10 PC11 PC12 PC13 PC14 PC15 PC16 PC17
Standard deviation 1.97617 1.497713 1.156896 1.146363 1.105432 1.02912 0.9569311 0.9148077 0.8858084 0.8289295 0.8038107 0.7349555 0.6692265 0.641272 0.5542993 0.4588888 0.3611388
Proportion of Variance 0.22972 0.131950 0.078730 0.077300 0.071880 0.06230 0.0538700 0.0492300 0.0461600 0.0404200 0.0380100 0.0317700 0.0263400 0.024190 0.0180700 0.0123900 0.0076700
Cumulative Proportion 0.22972 0.361670 0.440400 0.517700 0.589580 0.65188 0.7057500 0.7549800 0.8011300 0.8415500 0.8795600 0.9113300 0.9376800 0.961870 0.9799400 0.9923300 1.0000000
#get eigenvalues
ev <- data.frame(Component=paste("PC",1:17, sep=""),eigenvalue=dat_comps$sdev^2)
kable(ev)
Component eigenvalue
PC1 3.9052476
PC2 2.2431433
PC3 1.3384091
PC4 1.3141490
PC5 1.2219791
PC6 1.0590881
PC7 0.9157172
PC8 0.8368731
PC9 0.7846564
PC10 0.6871241
PC11 0.6461116
PC12 0.5401596
PC13 0.4478641
PC14 0.4112297
PC15 0.3072477
PC16 0.2105789
PC17 0.1304213

Extract components with eigenvalues > 1

dat_pcs <- dat_comps$x[,1:5]

Plot loadings

#plot loadings of pc1 and 2
plot(dat_comps$x[,1],dat_comps$x[,2], 
     xlab=paste("PCA 1 (", round(s$importance[2,1]*100, 1), "%)", sep = ""), 
     ylab=paste("PCA 2 (", round(s$importance[2,2]*100, 1), "%)", sep = ""), 
     pch=16, col="blue", cex=0.5)
abline(v=0, lwd=2, lty=2)
abline(h=0, lwd=2, lty=2)
#get loadings
l.x <- dat_comps$rotation[,1]*10
l.y <- dat_comps$rotation[,2]*10
arrows(x0=mean(dat_comps$x[,1]), x1=l.x, y0=mean(dat_comps$x[,2]), y1=l.y, col="black", length=0.15, lwd=1.5)
# Label position
l.pos <- l.y # Create a vector of y axis coordinates
lo <- which(l.y < 0) # Get the variables on the bottom half of the plot
hi <- which(l.y > 0) # Get variables on the top half
# Replace values in the vector
l.pos <- replace(l.pos, lo, "1")
l.pos <- replace(l.pos, hi, "3")
text(l.x, l.y, labels=row.names(dat_comps$rotation), col="black", pos=l.pos, cex=1.25)

#plot loadings of pc 3 and 4
plot(dat_comps$x[,3],dat_comps$x[,4], 
     xlab=paste("PCA 3 (", round(s$importance[2,3]*100, 1), "%)", sep = ""), 
     ylab=paste("PCA 4 (", round(s$importance[2,4]*100, 1), "%)", sep = ""), 
     pch=16, col="blue", cex=0.5)
abline(v=0, lwd=2, lty=2)
abline(h=0, lwd=2, lty=2)

#get loadings
l.x <- dat_comps$rotation[,3]*10
l.y <- dat_comps$rotation[,4]*10
arrows(x0=mean(dat_comps$x[,3]), x1=l.x, y0=mean(dat_comps$x[,4]), y1=l.y, col="black", length=0.15, lwd=1.5)
# Label position
l.pos <- l.y # Create a vector of y axis coordinates
lo <- which(l.y < 0) # Get the variables on the bottom half of the plot
hi <- which(l.y > 0) # Get variables on the top half
# Replace values in the vector
l.pos <- replace(l.pos, lo, "1")
l.pos <- replace(l.pos, hi, "3")
text(l.x, l.y, labels=row.names(dat_comps$rotation), col="black", pos=l.pos, cex=1.25)

Run Cluster GMM

Assumptions: - MSAs form clusters characterized by a multivariate distribution - Model forms: shape, volume, orientation - GMM fits a series of models with different forms and numbers of clusters - Models with highest probability and fewest parameters selected as most optimal - Based on Bayesian Information Criterion (BIC)

#run GMM on 1-20 clusters
dat_pcs_mc <- Mclust(dat_pcs, G=c(1:20))
#print summary of model fit
summary(dat_pcs_mc)
## ---------------------------------------------------- 
## Gaussian finite mixture model fitted by EM algorithm 
## ---------------------------------------------------- 
## 
## Mclust VEI (diagonal, equal shape) model with 9 components: 
## 
##  log-likelihood   n df       BIC       ICL
##       -6806.744 845 66 -14058.28 -14568.29
## 
## Clustering table:
##   1   2   3   4   5   6   7   8   9 
## 107 208  73 213  25  43  16  49 111

Model comparison

#plot BIC scores of different models
plot(dat_pcs_mc, what='BIC')

#zoom in on best-fitting models
plot(dat_pcs_mc, what='BIC', ylim=c(-15300,-15150))

#model type VEI and VEE have similar BIC scores for 8-9 clusters
#do additional model comparisons
m_VEI_8 <- Mclust(dat_pcs, G=8, modelNames="VEI")
m_VEE_8 <- Mclust(dat_pcs, G=8, modelNames="VEE")
m_VEI_9 <- Mclust(dat_pcs, G=9, modelNames="VEI")
m_VEE_9 <- Mclust(dat_pcs, G=9, modelNames="VEE")

#extract BIC scores
BICs<-c(m_VEI_8$BIC[1],m_VEE_8$BIC[1],m_VEI_9$BIC[1],m_VEE_9$BIC[1])
#calculate change in BIC score, since in this method the goal to to maximize BIC
#we calculate change from highest scoring model
delta_BIC <- max(BICs) - BICs
#calculate BIC weights
w_BIC <- round(exp(-0.5*delta_BIC)/sum(exp(-0.5*delta_BIC)), digits=3)
#make table of results
results_table <- cbind.data.frame(clusters=c("VEI_8","VEE_8","VEI_9","VEE_9"),BIC=BICs,delta_BIC,weight=w_BIC)
#order by delta
results_table <- results_table[order(delta_BIC),] 
rownames(results_table)<-NULL
#print table
kable(results_table[1:4], caption = "Model Comparison")
Model Comparison
clusters BIC delta_BIC weight
VEI_9 -14058.28 0.00000 1
VEE_9 -14093.10 34.81417 0
VEI_8 -14096.51 38.23128 0
VEE_8 -14105.20 46.91507 0

Extract and map cluster classifications

#extract classifications, e.g., which city belongs to which cluster, export to csv and reload
write.csv(m_VEE_8$classification,file=here("results/cluster_assignment_pca.csv"))
data<- read.csv(here('results/cluster_assignment_pca.csv'))

#import shapefile
msa_Boundary <-readOGR(here("data/MSA"),"tl_2015_us_cbsa") 
## OGR data source with driver: ESRI Shapefile 
## Source: "/Users/clipo/PycharmProjects/sustainable_communities/data/MSA", layer: "tl_2015_us_cbsa"
## with 929 features
## It has 12 fields
## Integer64 fields read as strings:  ALAND AWATER
#merge them
merged <- merge(msa_Boundary,data,by.x="NAME",by.y="X")
#remove any cases MSAs with no cluster assignments
merged_clean <- merged[!is.na(merged$x),]
#convert x to factor
merged_clean$Cluster <- as.factor(merged_clean$x)
#convert to sf object
cluster_sf <- st_as_sf(merged_clean)
#make us map
us_map <- tm_shape(us_states) + tm_borders()
#combine them
cluster_map <- tm_shape(cluster_sf) + tm_fill(col="Cluster", palette="Set1")
#plot them
us_map + cluster_map

Examine how clusters score in different metrics

#new data frame
dat_clust <- as.data.frame(dat)
#add names
dat_clust$ME <- rownames(dat)
rownames(dat_clust) <- NULL
#merge in cluster assignments
dat_clust <- merge(dat_clust,data,by.x="ME",by.y="X")
#calculate cluster means for different variables
c1m <- data.frame(one=colMeans(subset(dat_clust, x=="1")[2:18]))
c2m <- data.frame(two=colMeans(subset(dat_clust, x=="2")[2:18]))
c3m <- data.frame(three=colMeans(subset(dat_clust, x=="3")[2:18]))
c4m <- data.frame(four=colMeans(subset(dat_clust, x=="4")[2:18]))
c5m <- data.frame(five=colMeans(subset(dat_clust, x=="5")[2:18]))
c6m <- data.frame(six=colMeans(subset(dat_clust, x=="6")[2:18]))
c7m <- data.frame(seven=colMeans(subset(dat_clust, x=="7")[2:18]))
c8m <- data.frame(eight=colMeans(subset(dat_clust, x=="8")[2:18]))
cluster_means <- cbind.data.frame(c1m,c2m,c3m,c4m,c5m,c6m,c7m,c8m)
cluster_means$variable <- row.names(cluster_means)
rownames(cluster_means) <- NULL
#boxplots of scores for clusters
par(mfrow=c(2,4), mar=c(2.5,5,2.5,3))
boxplot(subset(dat_clust, x=="1")[2:18], horizontal=T, las=1, ylim=c(-4,4), main="Cluster 1")
boxplot(subset(dat_clust, x=="2")[2:18], horizontal=T, las=1, ylim=c(-4,4), main="Cluster 2")
boxplot(subset(dat_clust, x=="3")[2:18], horizontal=T, las=1, ylim=c(-4,4), main="Cluster 3")
boxplot(subset(dat_clust, x=="4")[2:18], horizontal=T, las=1, ylim=c(-4,4), main="Cluster 4")
boxplot(subset(dat_clust, x=="5")[2:18], horizontal=T, las=1, ylim=c(-4,4), main="Cluster 5")
boxplot(subset(dat_clust, x=="6")[2:18], horizontal=T, las=1, ylim=c(-4,4), main="Cluster 6")
boxplot(subset(dat_clust, x=="7")[2:18], horizontal=T, las=1, ylim=c(-4,4), main="Cluster 7")
boxplot(subset(dat_clust, x=="8")[2:18], horizontal=T, las=1, ylim=c(-4,4), main="Cluster 8")

par(mfrow=c(1,1))
#dotplots of cluster mean values
par(mfrow=c(2,5))
dotchart(cluster_means$one, labels=cluster_means$variable,xlim=c(-2,2), main='Cluster 1', pch=16)
abline(v=0, lwd=2)
dotchart(cluster_means$two, labels=cluster_means$variable,xlim=c(-2,2), main='Cluster 2', pch=16)
abline(v=0, lwd=2)
dotchart(cluster_means$three, labels=cluster_means$variable,xlim=c(-2,2), main='Cluster 3', pch=16)
abline(v=0, lwd=2)
dotchart(cluster_means$four, labels=cluster_means$variable,xlim=c(-2,2), main='Cluster 4', pch=16)
abline(v=0, lwd=2)
dotchart(cluster_means$five, labels=cluster_means$variable,xlim=c(-2,2), main='Cluster 5', pch=16)
abline(v=0, lwd=2)
dotchart(cluster_means$six, labels=cluster_means$variable,xlim=c(-2,2), main='Cluster 6', pch=16)
abline(v=0, lwd=2)
dotchart(cluster_means$seven, labels=cluster_means$variable,xlim=c(-2,2), main='Cluster 7', pch=16)
abline(v=0, lwd=2)
dotchart(cluster_means$eight, labels=cluster_means$variable,xlim=c(-2,2), main='Cluster 8', pch=16)
abline(v=0, lwd=2)
par(mfrow=c(1,1))

Which MSAs best characterize their cluster?

Typical members of each cluster
Cluster Name
1 Green Bay, WI
2 Lumberton, NC
3 Canton-Massillon, OH
4 Florence-Muscle Shoals, AL
5 Elizabethtown-Fort Knox, KY
6 Athens-Clarke County, GA
7 Atlanta-Sandy Springs-Roswell, GA
8 North Port-Sarasota-Bradenton, FL

Which cluster scores the best on the different metrics?

Metric Cluster
AQI_Good one
Bachelor_Over_25 six
Per_Poverty one
Gini one
Per_Sev_Hous one
Xstreamlengthimpaired two
Per_Avg_Land_Cov seven
poor_health_percent six
GHG_Percap one
UNEMPLOY seven
FOOD_INS one
VIO_CRIME one
broadband_access one
Female_Inequality three
Black_equality seven
Hispanic_equality six

Table showing which clusters score best and worst for different metrics

cluster_means_t %>%
  kbl() %>%
  kable_paper(full_width=T) %>%
  #column_spec(1, color = ifelse(cluster_means_t$AQI_Good < max(cluster_means_t$AQI_Good), 'black','blue'))
  column_spec(1, color = ifelse(cluster_means_t[,1] == max(cluster_means_t[,1]), "blue",
                                   ifelse(cluster_means_t[,1] == min(cluster_means_t[,1]), "red", "black"))) %>%
  column_spec(2, color = ifelse(cluster_means_t[,2] == max(cluster_means_t[,2]), "blue",
                                   ifelse(cluster_means_t[,2] == min(cluster_means_t[,2]), "red", "black"))) %>%
  column_spec(8, color = ifelse(cluster_means_t[,8] == max(cluster_means_t[,8]), "blue",
                                   ifelse(cluster_means_t[,8] == min(cluster_means_t[,8]), "red", "black"))) %>%
  column_spec(3, color = ifelse(cluster_means_t[,3] == max(cluster_means_t[,3]), "red",
                                   ifelse(cluster_means_t[,3] == min(cluster_means_t[,3]), "blue", "black"))) %>%
  column_spec(4, color = ifelse(cluster_means_t[,4] == max(cluster_means_t[,4]), "red",
                                   ifelse(cluster_means_t[,4] == min(cluster_means_t[,4]), "blue", "black"))) %>%
  column_spec(5, color = ifelse(cluster_means_t[,5] == max(cluster_means_t[,5]), "red",
                                   ifelse(cluster_means_t[,5] == min(cluster_means_t[,5]), "blue", "black"))) %>%
  column_spec(6, color = ifelse(cluster_means_t[,6] == max(cluster_means_t[,6]), "red",
                                   ifelse(cluster_means_t[,6] == min(cluster_means_t[,6]), "blue", "black"))) %>%
  column_spec(7, color = ifelse(cluster_means_t[,7] == max(cluster_means_t[,7]), "red",
                                   ifelse(cluster_means_t[,7] == min(cluster_means_t[,7]), "blue", "black"))) %>%
  column_spec(9, color = ifelse(cluster_means_t[,9] == max(cluster_means_t[,9]), "red",
                                   ifelse(cluster_means_t[,9] == min(cluster_means_t[,9]), "blue", "black"))) %>%
  column_spec(10, color = ifelse(cluster_means_t[,10] == max(cluster_means_t[,10]), "red",
                                   ifelse(cluster_means_t[,10] == min(cluster_means_t[,10]), "blue", "black"))) %>%
  column_spec(11, color = ifelse(cluster_means_t[,11] == max(cluster_means_t[,11]), "red",
                                   ifelse(cluster_means_t[,11] == min(cluster_means_t[,11]), "blue", "black"))) %>%
  column_spec(12, color = ifelse(cluster_means_t[,12] == max(cluster_means_t[,12]), "red",
                                   ifelse(cluster_means_t[,12] == min(cluster_means_t[,12]), "blue", "black"))) %>%
  column_spec(13, color = ifelse(cluster_means_t[,13] == max(cluster_means_t[,13]), "red",
                                   ifelse(cluster_means_t[,13] == min(cluster_means_t[,13]), "blue", "black"))) %>%
  column_spec(14, color = ifelse(cluster_means_t[,14] == max(cluster_means_t[,14]), "red",
                                   ifelse(cluster_means_t[,14] == min(cluster_means_t[,14]), "blue", "black"))) %>%
  column_spec(15, color = ifelse(cluster_means_t[,15] == max(cluster_means_t[,15]), "red",
                                   ifelse(cluster_means_t[,15] == min(cluster_means_t[,15]), "blue", "black"))) %>%
  column_spec(16, color = ifelse(cluster_means_t[,16] == max(cluster_means_t[,16]), "red",
                                   ifelse(cluster_means_t[,16] == min(cluster_means_t[,16]), "blue", "black"))) %>%
  column_spec(17, color = ifelse(cluster_means_t[,17] == max(cluster_means_t[,17]), "red",
                                   ifelse(cluster_means_t[,17] == min(cluster_means_t[,17]), "blue", "black")))
AQI_Good Bachelor_Over_25 Per_Poverty Gini non_migration Per_Sev_Hous Xstreamlengthimpaired Per_Avg_Land_Cov poor_health_percent GHG_Percap UNEMPLOY FOOD_INS VIO_CRIME broadband_access Female_Inequality Black_equality Hispanic_equality cluster
0.5227677 -0.1548260 -0.7394925 -0.8007960 0.2833437 -0.7699925 -0.1596751 -0.8755457 -0.6616046 0.0995944 -0.6496025 -0.8121395 -0.5935864 -0.0726886 0.4767847 -0.9369008 -0.5638859 one
0.4023375 -0.9781268 2.0365238 1.1761342 0.2900523 0.6564276 -0.4754092 0.0944046 1.3269691 -0.1784631 1.1618210 2.0960565 1.3582874 0.8401411 1.7970294 -0.0458918 0.0247256 two
-0.0959478 -0.4019677 -0.3041514 -0.5230507 0.3973103 -0.5865422 2.4553991 -1.2152311 -0.1071342 -0.1154738 -0.0674007 -0.1866480 -0.4051062 -0.2489778 -0.2436760 -0.3822506 0.1204851 three
0.2160813 -0.3042771 0.0910227 -0.0632170 0.0385947 -0.2081119 -0.3130276 0.2379382 0.2733245 0.1837136 0.0067253 0.0922417 0.0149596 0.2086675 -0.2014455 0.1694413 0.0644684 four
0.1619162 -0.2645486 0.7608756 0.0139556 -0.1893895 0.1984677 0.3943266 0.2922282 0.2075924 0.8651459 0.4813617 0.5026922 -0.2400475 0.6288675 -0.2217439 1.9367016 2.1579208 five
0.3720010 1.9912709 0.8836366 1.2049951 -2.1792014 1.2095387 -0.1783095 0.0313130 -0.8636444 -0.1529746 -0.2939828 0.6723443 -0.2968025 0.0128457 0.3384290 0.2356916 0.1863681 six
-2.5106620 1.0630890 -0.3947348 0.4868429 -0.0050762 1.6436289 -0.3886443 0.6985673 -0.1189012 -0.6544651 0.3545440 -0.5576185 0.4501286 -0.6152487 -0.1931982 0.1814520 -0.3917566 seven
-0.3144578 0.6377410 -0.4030628 0.2571761 0.0144714 0.3539462 -0.1680067 0.3143117 -0.3614585 -0.4553723 0.0080698 -0.2426774 0.2517643 -0.5946423 -0.1165541 -0.2177116 -0.3533740 eight