Load Packages

library(here)
library(mclust)
library(raster)
library(rgdal)
library(sp)
library(RColorBrewer)
library(PCAtools)
library(sf)
library(tmap)
library(terra)
library(rgeos)
library(spData)
library(knitr)
library(kableExtra)

Import Data

#import raw data
dat_raw <- read.csv(here("data/CommunityData-raw-2015-v2.csv"))
#select columns of interest, removed co2_hhs
dat  <- dat_raw[,c(4:12,14:21)]
#convert to dataframe
dat <- as.data.frame(unclass(dat))
#set rownames to city names
rownames(dat)=dat_raw$ME
# remove any records that have NAs
dat = na.omit(dat)
# convert to z-scores
dat <- as.matrix(scale(dat, center = TRUE, scale = TRUE))

Run PCA on dataset

dat_comps <- prcomp(dat, center = F, scale=F) #set center and scale to FALSE because done pre-PCA
#get summary
s <- summary(dat_comps)
kable(s$importance)

	PC1	PC2	PC3	PC4	PC5	PC6	PC7	PC8	PC9	PC10	PC11	PC12	PC13	PC14	PC15	PC16	PC17
Standard deviation	1.97617	1.497713	1.156896	1.146363	1.105432	1.02912	0.9569311	0.9148077	0.8858084	0.8289295	0.8038107	0.7349555	0.6692265	0.641272	0.5542993	0.4588888	0.3611388
Proportion of Variance	0.22972	0.131950	0.078730	0.077300	0.071880	0.06230	0.0538700	0.0492300	0.0461600	0.0404200	0.0380100	0.0317700	0.0263400	0.024190	0.0180700	0.0123900	0.0076700
Cumulative Proportion	0.22972	0.361670	0.440400	0.517700	0.589580	0.65188	0.7057500	0.7549800	0.8011300	0.8415500	0.8795600	0.9113300	0.9376800	0.961870	0.9799400	0.9923300	1.0000000

#get eigenvalues
ev <- data.frame(Component=paste("PC",1:17, sep=""),eigenvalue=dat_comps$sdev^2)
kable(ev)

Component	eigenvalue
PC1	3.9052476
PC2	2.2431433
PC3	1.3384091
PC4	1.3141490
PC5	1.2219791
PC6	1.0590881
PC7	0.9157172
PC8	0.8368731
PC9	0.7846564
PC10	0.6871241
PC11	0.6461116
PC12	0.5401596
PC13	0.4478641
PC14	0.4112297
PC15	0.3072477
PC16	0.2105789
PC17	0.1304213

Extract components with eigenvalues > 1

dat_pcs <- dat_comps$x[,1:5]

Plot loadings

#plot loadings of pc1 and 2
plot(dat_comps$x[,1],dat_comps$x[,2], 
     xlab=paste("PCA 1 (", round(s$importance[2,1]*100, 1), "%)", sep = ""), 
     ylab=paste("PCA 2 (", round(s$importance[2,2]*100, 1), "%)", sep = ""), 
     pch=16, col="blue", cex=0.5)
abline(v=0, lwd=2, lty=2)
abline(h=0, lwd=2, lty=2)
#get loadings
l.x <- dat_comps$rotation[,1]*10
l.y <- dat_comps$rotation[,2]*10
arrows(x0=mean(dat_comps$x[,1]), x1=l.x, y0=mean(dat_comps$x[,2]), y1=l.y, col="black", length=0.15, lwd=1.5)
# Label position
l.pos <- l.y # Create a vector of y axis coordinates
lo <- which(l.y < 0) # Get the variables on the bottom half of the plot
hi <- which(l.y > 0) # Get variables on the top half
# Replace values in the vector
l.pos <- replace(l.pos, lo, "1")
l.pos <- replace(l.pos, hi, "3")
text(l.x, l.y, labels=row.names(dat_comps$rotation), col="black", pos=l.pos, cex=1.25)

#plot loadings of pc 3 and 4
plot(dat_comps$x[,3],dat_comps$x[,4], 
     xlab=paste("PCA 3 (", round(s$importance[2,3]*100, 1), "%)", sep = ""), 
     ylab=paste("PCA 4 (", round(s$importance[2,4]*100, 1), "%)", sep = ""), 
     pch=16, col="blue", cex=0.5)
abline(v=0, lwd=2, lty=2)
abline(h=0, lwd=2, lty=2)

#get loadings
l.x <- dat_comps$rotation[,3]*10
l.y <- dat_comps$rotation[,4]*10
arrows(x0=mean(dat_comps$x[,3]), x1=l.x, y0=mean(dat_comps$x[,4]), y1=l.y, col="black", length=0.15, lwd=1.5)
# Label position
l.pos <- l.y # Create a vector of y axis coordinates
lo <- which(l.y < 0) # Get the variables on the bottom half of the plot
hi <- which(l.y > 0) # Get variables on the top half
# Replace values in the vector
l.pos <- replace(l.pos, lo, "1")
l.pos <- replace(l.pos, hi, "3")
text(l.x, l.y, labels=row.names(dat_comps$rotation), col="black", pos=l.pos, cex=1.25)

Run Cluster GMM

Assumptions: - MSAs form clusters characterized by a multivariate distribution - Model forms: shape, volume, orientation - GMM fits a series of models with different forms and numbers of clusters - Models with highest probability and fewest parameters selected as most optimal - Based on Bayesian Information Criterion (BIC)

#run GMM on 1-20 clusters
dat_pcs_mc <- Mclust(dat_pcs, G=c(1:20))
#print summary of model fit
summary(dat_pcs_mc)

## ---------------------------------------------------- 
## Gaussian finite mixture model fitted by EM algorithm 
## ---------------------------------------------------- 
## 
## Mclust VEI (diagonal, equal shape) model with 9 components: 
## 
##  log-likelihood   n df       BIC       ICL
##       -6806.744 845 66 -14058.28 -14568.29
## 
## Clustering table:
##   1   2   3   4   5   6   7   8   9 
## 107 208  73 213  25  43  16  49 111

Model comparison

#plot BIC scores of different models
plot(dat_pcs_mc, what='BIC')

#zoom in on best-fitting models
plot(dat_pcs_mc, what='BIC', ylim=c(-15300,-15150))

#model type VEI and VEE have similar BIC scores for 8-9 clusters
#do additional model comparisons
m_VEI_8 <- Mclust(dat_pcs, G=8, modelNames="VEI")
m_VEE_8 <- Mclust(dat_pcs, G=8, modelNames="VEE")
m_VEI_9 <- Mclust(dat_pcs, G=9, modelNames="VEI")
m_VEE_9 <- Mclust(dat_pcs, G=9, modelNames="VEE")

#extract BIC scores
BICs<-c(m_VEI_8$BIC[1],m_VEE_8$BIC[1],m_VEI_9$BIC[1],m_VEE_9$BIC[1])
#calculate change in BIC score, since in this method the goal to to maximize BIC
#we calculate change from highest scoring model
delta_BIC <- max(BICs) - BICs
#calculate BIC weights
w_BIC <- round(exp(-0.5*delta_BIC)/sum(exp(-0.5*delta_BIC)), digits=3)
#make table of results
results_table <- cbind.data.frame(clusters=c("VEI_8","VEE_8","VEI_9","VEE_9"),BIC=BICs,delta_BIC,weight=w_BIC)
#order by delta
results_table <- results_table[order(delta_BIC),] 
rownames(results_table)<-NULL
#print table
kable(results_table[1:4], caption = "Model Comparison")

Model Comparison
clusters	BIC	delta_BIC	weight
VEI_9	-14058.28	0.00000	1
VEE_9	-14093.10	34.81417	0
VEI_8	-14096.51	38.23128	0
VEE_8	-14105.20	46.91507	0

Extract and map cluster classifications

#extract classifications, e.g., which city belongs to which cluster, export to csv and reload
write.csv(m_VEE_8$classification,file=here("results/cluster_assignment_pca.csv"))
data<- read.csv(here('results/cluster_assignment_pca.csv'))

#import shapefile
msa_Boundary <-readOGR(here("data/MSA"),"tl_2015_us_cbsa")

## OGR data source with driver: ESRI Shapefile 
## Source: "/Users/clipo/PycharmProjects/sustainable_communities/data/MSA", layer: "tl_2015_us_cbsa"
## with 929 features
## It has 12 fields
## Integer64 fields read as strings:  ALAND AWATER

#merge them
merged <- merge(msa_Boundary,data,by.x="NAME",by.y="X")
#remove any cases MSAs with no cluster assignments
merged_clean <- merged[!is.na(merged$x),]
#convert x to factor
merged_clean$Cluster <- as.factor(merged_clean$x)
#convert to sf object
cluster_sf <- st_as_sf(merged_clean)
#make us map
us_map <- tm_shape(us_states) + tm_borders()
#combine them
cluster_map <- tm_shape(cluster_sf) + tm_fill(col="Cluster", palette="Set1")
#plot them
us_map + cluster_map

Examine how clusters score in different metrics

#new data frame
dat_clust <- as.data.frame(dat)
#add names
dat_clust$ME <- rownames(dat)
rownames(dat_clust) <- NULL
#merge in cluster assignments
dat_clust <- merge(dat_clust,data,by.x="ME",by.y="X")
#calculate cluster means for different variables
c1m <- data.frame(one=colMeans(subset(dat_clust, x=="1")[2:18]))
c2m <- data.frame(two=colMeans(subset(dat_clust, x=="2")[2:18]))
c3m <- data.frame(three=colMeans(subset(dat_clust, x=="3")[2:18]))
c4m <- data.frame(four=colMeans(subset(dat_clust, x=="4")[2:18]))
c5m <- data.frame(five=colMeans(subset(dat_clust, x=="5")[2:18]))
c6m <- data.frame(six=colMeans(subset(dat_clust, x=="6")[2:18]))
c7m <- data.frame(seven=colMeans(subset(dat_clust, x=="7")[2:18]))
c8m <- data.frame(eight=colMeans(subset(dat_clust, x=="8")[2:18]))
cluster_means <- cbind.data.frame(c1m,c2m,c3m,c4m,c5m,c6m,c7m,c8m)
cluster_means$variable <- row.names(cluster_means)
rownames(cluster_means) <- NULL

#boxplots of scores for clusters
par(mfrow=c(2,4), mar=c(2.5,5,2.5,3))
boxplot(subset(dat_clust, x=="1")[2:18], horizontal=T, las=1, ylim=c(-4,4), main="Cluster 1")
boxplot(subset(dat_clust, x=="2")[2:18], horizontal=T, las=1, ylim=c(-4,4), main="Cluster 2")
boxplot(subset(dat_clust, x=="3")[2:18], horizontal=T, las=1, ylim=c(-4,4), main="Cluster 3")
boxplot(subset(dat_clust, x=="4")[2:18], horizontal=T, las=1, ylim=c(-4,4), main="Cluster 4")
boxplot(subset(dat_clust, x=="5")[2:18], horizontal=T, las=1, ylim=c(-4,4), main="Cluster 5")
boxplot(subset(dat_clust, x=="6")[2:18], horizontal=T, las=1, ylim=c(-4,4), main="Cluster 6")
boxplot(subset(dat_clust, x=="7")[2:18], horizontal=T, las=1, ylim=c(-4,4), main="Cluster 7")
boxplot(subset(dat_clust, x=="8")[2:18], horizontal=T, las=1, ylim=c(-4,4), main="Cluster 8")

par(mfrow=c(1,1))

#dotplots of cluster mean values
par(mfrow=c(2,5))
dotchart(cluster_means$one, labels=cluster_means$variable,xlim=c(-2,2), main='Cluster 1', pch=16)
abline(v=0, lwd=2)
dotchart(cluster_means$two, labels=cluster_means$variable,xlim=c(-2,2), main='Cluster 2', pch=16)
abline(v=0, lwd=2)
dotchart(cluster_means$three, labels=cluster_means$variable,xlim=c(-2,2), main='Cluster 3', pch=16)
abline(v=0, lwd=2)
dotchart(cluster_means$four, labels=cluster_means$variable,xlim=c(-2,2), main='Cluster 4', pch=16)
abline(v=0, lwd=2)
dotchart(cluster_means$five, labels=cluster_means$variable,xlim=c(-2,2), main='Cluster 5', pch=16)
abline(v=0, lwd=2)
dotchart(cluster_means$six, labels=cluster_means$variable,xlim=c(-2,2), main='Cluster 6', pch=16)
abline(v=0, lwd=2)
dotchart(cluster_means$seven, labels=cluster_means$variable,xlim=c(-2,2), main='Cluster 7', pch=16)
abline(v=0, lwd=2)
dotchart(cluster_means$eight, labels=cluster_means$variable,xlim=c(-2,2), main='Cluster 8', pch=16)
abline(v=0, lwd=2)
par(mfrow=c(1,1))

Which MSAs best characterize their cluster?

Typical members of each cluster
Cluster	Name
1	Green Bay, WI
2	Lumberton, NC
3	Canton-Massillon, OH
4	Florence-Muscle Shoals, AL
5	Elizabethtown-Fort Knox, KY
6	Athens-Clarke County, GA
7	Atlanta-Sandy Springs-Roswell, GA
8	North Port-Sarasota-Bradenton, FL

Which cluster scores the best on the different metrics?

Metric	Cluster
AQI_Good	one
Bachelor_Over_25	six
Per_Poverty	one
Gini	one
Per_Sev_Hous	one
Xstreamlengthimpaired	two
Per_Avg_Land_Cov	seven
poor_health_percent	six
GHG_Percap	one
UNEMPLOY	seven
FOOD_INS	one
VIO_CRIME	one
broadband_access	one
Female_Inequality	three
Black_equality	seven
Hispanic_equality	six

Table showing which clusters score best and worst for different metrics

cluster_means_t %>%
  kbl() %>%
  kable_paper(full_width=T) %>%
  #column_spec(1, color = ifelse(cluster_means_t$AQI_Good < max(cluster_means_t$AQI_Good), 'black','blue'))
  column_spec(1, color = ifelse(cluster_means_t[,1] == max(cluster_means_t[,1]), "blue",
                                   ifelse(cluster_means_t[,1] == min(cluster_means_t[,1]), "red", "black"))) %>%
  column_spec(2, color = ifelse(cluster_means_t[,2] == max(cluster_means_t[,2]), "blue",
                                   ifelse(cluster_means_t[,2] == min(cluster_means_t[,2]), "red", "black"))) %>%
  column_spec(8, color = ifelse(cluster_means_t[,8] == max(cluster_means_t[,8]), "blue",
                                   ifelse(cluster_means_t[,8] == min(cluster_means_t[,8]), "red", "black"))) %>%
  column_spec(3, color = ifelse(cluster_means_t[,3] == max(cluster_means_t[,3]), "red",
                                   ifelse(cluster_means_t[,3] == min(cluster_means_t[,3]), "blue", "black"))) %>%
  column_spec(4, color = ifelse(cluster_means_t[,4] == max(cluster_means_t[,4]), "red",
                                   ifelse(cluster_means_t[,4] == min(cluster_means_t[,4]), "blue", "black"))) %>%
  column_spec(5, color = ifelse(cluster_means_t[,5] == max(cluster_means_t[,5]), "red",
                                   ifelse(cluster_means_t[,5] == min(cluster_means_t[,5]), "blue", "black"))) %>%
  column_spec(6, color = ifelse(cluster_means_t[,6] == max(cluster_means_t[,6]), "red",
                                   ifelse(cluster_means_t[,6] == min(cluster_means_t[,6]), "blue", "black"))) %>%
  column_spec(7, color = ifelse(cluster_means_t[,7] == max(cluster_means_t[,7]), "red",
                                   ifelse(cluster_means_t[,7] == min(cluster_means_t[,7]), "blue", "black"))) %>%
  column_spec(9, color = ifelse(cluster_means_t[,9] == max(cluster_means_t[,9]), "red",
                                   ifelse(cluster_means_t[,9] == min(cluster_means_t[,9]), "blue", "black"))) %>%
  column_spec(10, color = ifelse(cluster_means_t[,10] == max(cluster_means_t[,10]), "red",
                                   ifelse(cluster_means_t[,10] == min(cluster_means_t[,10]), "blue", "black"))) %>%
  column_spec(11, color = ifelse(cluster_means_t[,11] == max(cluster_means_t[,11]), "red",
                                   ifelse(cluster_means_t[,11] == min(cluster_means_t[,11]), "blue", "black"))) %>%
  column_spec(12, color = ifelse(cluster_means_t[,12] == max(cluster_means_t[,12]), "red",
                                   ifelse(cluster_means_t[,12] == min(cluster_means_t[,12]), "blue", "black"))) %>%
  column_spec(13, color = ifelse(cluster_means_t[,13] == max(cluster_means_t[,13]), "red",
                                   ifelse(cluster_means_t[,13] == min(cluster_means_t[,13]), "blue", "black"))) %>%
  column_spec(14, color = ifelse(cluster_means_t[,14] == max(cluster_means_t[,14]), "red",
                                   ifelse(cluster_means_t[,14] == min(cluster_means_t[,14]), "blue", "black"))) %>%
  column_spec(15, color = ifelse(cluster_means_t[,15] == max(cluster_means_t[,15]), "red",
                                   ifelse(cluster_means_t[,15] == min(cluster_means_t[,15]), "blue", "black"))) %>%
  column_spec(16, color = ifelse(cluster_means_t[,16] == max(cluster_means_t[,16]), "red",
                                   ifelse(cluster_means_t[,16] == min(cluster_means_t[,16]), "blue", "black"))) %>%
  column_spec(17, color = ifelse(cluster_means_t[,17] == max(cluster_means_t[,17]), "red",
                                   ifelse(cluster_means_t[,17] == min(cluster_means_t[,17]), "blue", "black")))

AQI_Good	Bachelor_Over_25	Per_Poverty	Gini	non_migration	Per_Sev_Hous	Xstreamlengthimpaired	Per_Avg_Land_Cov	poor_health_percent	GHG_Percap	UNEMPLOY	FOOD_INS	VIO_CRIME	broadband_access	Female_Inequality	Black_equality	Hispanic_equality	cluster
0.5227677	-0.1548260	-0.7394925	-0.8007960	0.2833437	-0.7699925	-0.1596751	-0.8755457	-0.6616046	0.0995944	-0.6496025	-0.8121395	-0.5935864	-0.0726886	0.4767847	-0.9369008	-0.5638859	one
0.4023375	-0.9781268	2.0365238	1.1761342	0.2900523	0.6564276	-0.4754092	0.0944046	1.3269691	-0.1784631	1.1618210	2.0960565	1.3582874	0.8401411	1.7970294	-0.0458918	0.0247256	two
-0.0959478	-0.4019677	-0.3041514	-0.5230507	0.3973103	-0.5865422	2.4553991	-1.2152311	-0.1071342	-0.1154738	-0.0674007	-0.1866480	-0.4051062	-0.2489778	-0.2436760	-0.3822506	0.1204851	three
0.2160813	-0.3042771	0.0910227	-0.0632170	0.0385947	-0.2081119	-0.3130276	0.2379382	0.2733245	0.1837136	0.0067253	0.0922417	0.0149596	0.2086675	-0.2014455	0.1694413	0.0644684	four
0.1619162	-0.2645486	0.7608756	0.0139556	-0.1893895	0.1984677	0.3943266	0.2922282	0.2075924	0.8651459	0.4813617	0.5026922	-0.2400475	0.6288675	-0.2217439	1.9367016	2.1579208	five
0.3720010	1.9912709	0.8836366	1.2049951	-2.1792014	1.2095387	-0.1783095	0.0313130	-0.8636444	-0.1529746	-0.2939828	0.6723443	-0.2968025	0.0128457	0.3384290	0.2356916	0.1863681	six
-2.5106620	1.0630890	-0.3947348	0.4868429	-0.0050762	1.6436289	-0.3886443	0.6985673	-0.1189012	-0.6544651	0.3545440	-0.5576185	0.4501286	-0.6152487	-0.1931982	0.1814520	-0.3917566	seven
-0.3144578	0.6377410	-0.4030628	0.2571761	0.0144714	0.3539462	-0.1680067	0.3143117	-0.3614585	-0.4553723	0.0080698	-0.2426774	0.2517643	-0.5946423	-0.1165541	-0.2177116	-0.3533740	eight

MSA Cluster Modeling

Robert J. DiNapoli

9/21/2021