Load Packages

library(here)
library(mclust)
library(raster)
library(rgdal)
library(sp)
library(RColorBrewer)
library(PCAtools)
library(sf)
library(tmap)
library(terra)
library(rgeos)
library(spData)
library(knitr)
library(kableExtra)

## Warning in !is.null(rmarkdown::metadata$output) && rmarkdown::metadata$output
## %in% : 'length(x) = 3 > 1' in coercion to 'logical(1)'

Import Data

#import raw data
dat_raw <- read.csv(here("data/CommunityData-raw-2015-v3.csv"))
#select columns of interest, for this run remove asian_equality and female_equality
dat  <- dat_raw[,c(4:18,20:21)]
#convert to dataframe
dat <- as.data.frame(unclass(dat))
#set rownames to city names
rownames(dat)=dat_raw$ME
# remove any records that have NAs
dat = na.omit(dat)
# convert to z-scores
dat <- as.matrix(scale(dat, center = TRUE, scale = TRUE))

Run PCA on dataset

dat_comps <- prcomp(dat, center = F, scale=F) #set center and scale to FALSE because done pre-PCA
#get summary
s <- summary(dat_comps)
kable(s$importance)

	PC1	PC2	PC3	PC4	PC5	PC6	PC7	PC8	PC9	PC10	PC11	PC12	PC13	PC14	PC15	PC16	PC17
Standard deviation	1.930256	1.57564	1.238658	1.161608	1.051675	1.001707	0.937954	0.8924302	0.8844498	0.8199171	0.7855821	0.7447965	0.6612225	0.6399578	0.5471293	0.4593311	0.3725125
Proportion of Variance	0.219170	0.14604	0.090250	0.079370	0.065060	0.059020	0.051750	0.0468500	0.0460100	0.0395400	0.0363000	0.0326300	0.0257200	0.0240900	0.0176100	0.0124100	0.0081600
Cumulative Proportion	0.219170	0.36521	0.455460	0.534830	0.599890	0.658920	0.710670	0.7575200	0.8035300	0.8430800	0.8793800	0.9120100	0.9377300	0.9618200	0.9794300	0.9918400	1.0000000

#get eigenvalues
#### change this # of columns to match
ev <- data.frame(Component=paste("PC",1:17, sep=""),eigenvalue=dat_comps$sdev^2)
kable(ev)

Component	eigenvalue
PC1	3.7258878
PC2	2.4826401
PC3	1.5342730
PC4	1.3493324
PC5	1.1060209
PC6	1.0034176
PC7	0.8797577
PC8	0.7964316
PC9	0.7822515
PC10	0.6722641
PC11	0.6171392
PC12	0.5547218
PC13	0.4372152
PC14	0.4095460
PC15	0.2993505
PC16	0.2109850
PC17	0.1387656

#write to CSV
write.csv(dat_comps$x,file=here("results/community_loadings.csv"))

Extract components with eigenvalues > 1

dat_pcs <- dat_comps$x[,1:5]

Plot loadings

#plot loadings of pc1 and 2
plot(dat_comps$x[,1],dat_comps$x[,2], 
     xlab=paste("PCA 1 (", round(s$importance[2,1]*100, 1), "%)", sep = ""), 
     ylab=paste("PCA 2 (", round(s$importance[2,2]*100, 1), "%)", sep = ""), 
     pch=16, col="blue", cex=0.5)
abline(v=0, lwd=2, lty=2)
abline(h=0, lwd=2, lty=2)
#get loadings
l.x <- dat_comps$rotation[,1]*10
l.y <- dat_comps$rotation[,2]*10
arrows(x0=mean(dat_comps$x[,1]), x1=l.x, y0=mean(dat_comps$x[,2]), y1=l.y, col="black", length=0.15, lwd=1.5)
# Label position
l.pos <- l.y # Create a vector of y axis coordinates
lo <- which(l.y < 0) # Get the variables on the bottom half of the plot
hi <- which(l.y > 0) # Get variables on the top half
# Replace values in the vector
l.pos <- replace(l.pos, lo, "1")
l.pos <- replace(l.pos, hi, "3")
text(l.x, l.y, labels=row.names(dat_comps$rotation), col="black", pos=l.pos, cex=1.25)

#plot loadings of pc 3 and 4
plot(dat_comps$x[,3],dat_comps$x[,4], 
     xlab=paste("PCA 3 (", round(s$importance[2,3]*100, 1), "%)", sep = ""), 
     ylab=paste("PCA 4 (", round(s$importance[2,4]*100, 1), "%)", sep = ""), 
     pch=16, col="blue", cex=0.5)
abline(v=0, lwd=2, lty=2)
abline(h=0, lwd=2, lty=2)

#get loadings
l.x <- dat_comps$rotation[,3]*10
l.y <- dat_comps$rotation[,4]*10
arrows(x0=mean(dat_comps$x[,3]), x1=l.x, y0=mean(dat_comps$x[,4]), y1=l.y, col="black", length=0.15, lwd=1.5)
# Label position
l.pos <- l.y # Create a vector of y axis coordinates
lo <- which(l.y < 0) # Get the variables on the bottom half of the plot
hi <- which(l.y > 0) # Get variables on the top half
# Replace values in the vector
l.pos <- replace(l.pos, lo, "1")
l.pos <- replace(l.pos, hi, "3")
text(l.x, l.y, labels=row.names(dat_comps$rotation), col="black", pos=l.pos, cex=1.25)

## Summary of PC loadings

library(broom)
print(dat_comps)

## Standard deviations (1, .., p=17):
##  [1] 1.9302559 1.5756396 1.2386578 1.1616077 1.0516753 1.0017073 0.9379540
##  [8] 0.8924302 0.8844498 0.8199171 0.7855821 0.7447965 0.6612225 0.6399578
## [15] 0.5471293 0.4593311 0.3725125
## 
## Rotation (n x k) = (17 x 17):
##                                PC1         PC2         PC3         PC4
## AQI_Good              -0.062866767  0.18051196 -0.05245837  0.52089617
## Bachelor_Over_25      -0.155104384 -0.49978145  0.19800492  0.13648219
## Per_Poverty            0.446860133  0.08223058  0.13323024  0.19642620
## Gini                   0.342239993 -0.18630649  0.22744433  0.08291974
## non_migration         -0.039451114  0.32797305 -0.14279570 -0.44691749
## Per_Sev_Hous           0.290477489 -0.35489641  0.13570938 -0.14642252
## Xstreamlengthimpaired -0.150768007  0.20034069  0.49469903 -0.14593805
## Per_Avg_Land_Cov       0.199050643 -0.23359150 -0.38405605 -0.08772787
## poor_health_percent    0.345400154  0.29918893 -0.13558026 -0.17710772
## Z_Water_Index          0.089114628 -0.21417037 -0.51339880 -0.10248384
## GHG_Percap            -0.057559630  0.17289329 -0.22412476  0.26544463
## UNEMPLOY               0.348499784  0.06211658  0.10988728 -0.14813861
## FOOD_INS               0.404596427  0.06714552  0.17288182  0.28511761
## VIO_CRIME              0.281544821 -0.02500852 -0.05848821 -0.18430169
## broadband_access       0.097667318  0.20271579 -0.22647746  0.37023324
## Black_equality        -0.009441226 -0.04032470 -0.13725376  0.15597909
## Hispanic_equality      0.047862361  0.36187781  0.09909611 -0.05894649
##                               PC5         PC6          PC7          PC8
## AQI_Good              -0.17198617  0.47084965 -0.245408998  0.075952675
## Bachelor_Over_25       0.06317829 -0.14379264  0.006828514 -0.100615801
## Per_Poverty           -0.01117834  0.05591647 -0.043226995 -0.034108019
## Gini                   0.18211836 -0.08791491 -0.019313124 -0.187925720
## non_migration          0.04911408  0.07325898  0.181902454 -0.435429816
## Per_Sev_Hous          -0.08291192  0.02485665 -0.077857224 -0.089007519
## Xstreamlengthimpaired -0.16377164 -0.14024134  0.043948672  0.115291952
## Per_Avg_Land_Cov      -0.33426782  0.10134533 -0.043844200 -0.269248508
## poor_health_percent   -0.10633054 -0.00034835 -0.016614099 -0.004408456
## Z_Water_Index          0.07603166  0.05259835 -0.186814103  0.449609515
## GHG_Percap             0.39901479 -0.33882522 -0.420146662 -0.490613531
## UNEMPLOY              -0.18769732  0.18575024 -0.035549241 -0.092668435
## FOOD_INS               0.03800870 -0.01872166  0.044161993 -0.015578578
## VIO_CRIME              0.36690013 -0.33459868  0.011439418  0.341182493
## broadband_access       0.08437931 -0.15341761  0.692466509  0.100549260
## Black_equality        -0.64721099 -0.60813110  0.002651743 -0.058554005
## Hispanic_equality     -0.10817632 -0.23044695 -0.447125969  0.287533693
##                                 PC9         PC10         PC11        PC12
## AQI_Good              -0.2732818757  0.369833692 -0.009624339 -0.20331317
## Bachelor_Over_25      -0.2334242202 -0.078197015 -0.031481748 -0.16573561
## Per_Poverty           -0.0245636963  0.016947565  0.154246386  0.21982401
## Gini                  -0.4861096856  0.085832390  0.147741301  0.10811329
## non_migration         -0.4132019509  0.218477058  0.089364559 -0.06116509
## Per_Sev_Hous           0.0446048786 -0.115712777  0.231037130  0.17385569
## Xstreamlengthimpaired  0.0253390855  0.155672391  0.605550191 -0.34735193
## Per_Avg_Land_Cov      -0.1659868117 -0.261271007 -0.016345748 -0.49977794
## poor_health_percent   -0.0040994359  0.072377014 -0.044938235  0.33180728
## Z_Water_Index         -0.0377583050  0.120419062  0.540428303  0.05352150
## GHG_Percap             0.2681578313 -0.009279866  0.239808768 -0.09976052
## UNEMPLOY               0.4801011740 -0.029974932 -0.003317954 -0.34486606
## FOOD_INS               0.0560305175  0.046806400 -0.092761124 -0.12661455
## VIO_CRIME             -0.0572106296  0.383223273 -0.325866902 -0.41196138
## broadband_access      -0.0569698528 -0.312191714  0.218030047 -0.12299343
## Black_equality        -0.0007453668  0.324460848 -0.056475264  0.13079921
## Hispanic_equality     -0.3456850595 -0.568614566 -0.088614696 -0.08343529
##                               PC13        PC14         PC15         PC16
## AQI_Good               0.303872519  0.07309613  0.104304854 -0.093683961
## Bachelor_Over_25      -0.001954031 -0.15583734  0.256923869 -0.476352663
## Per_Poverty           -0.008240500  0.07231763 -0.275258258  0.311682555
## Gini                  -0.106885678 -0.24642899  0.320157613  0.359607861
## non_migration          0.136331220 -0.23832763 -0.275877069 -0.211797074
## Per_Sev_Hous           0.613823895  0.30058746 -0.209812289 -0.243800467
## Xstreamlengthimpaired -0.149611888  0.26222491  0.076977466  0.006579837
## Per_Avg_Land_Cov      -0.242586898  0.34390463 -0.001524303  0.176231700
## poor_health_percent   -0.149281323  0.31464822  0.589497336 -0.369660030
## Z_Water_Index         -0.146542371 -0.27380123 -0.056719257 -0.124572415
## GHG_Percap             0.066815986  0.07397619  0.063048282 -0.037571715
## UNEMPLOY               0.201109247 -0.54869803  0.258413340  0.017119678
## FOOD_INS              -0.445219653 -0.06995732 -0.421482336 -0.485922760
## VIO_CRIME              0.214208286  0.19152471 -0.005089669  0.073591852
## broadband_access       0.250148875 -0.01889712  0.111346665 -0.037870686
## Black_equality         0.079725844 -0.15759454 -0.063851166  0.034039957
## Hispanic_equality      0.133648201 -0.15148568 -0.056217176 -0.066700180
##                               PC17
## AQI_Good               0.043346545
## Bachelor_Over_25      -0.476673841
## Per_Poverty           -0.692618987
## Gini                   0.363061375
## non_migration         -0.117205154
## Per_Sev_Hous           0.242410796
## Xstreamlengthimpaired -0.007762626
## Per_Avg_Land_Cov      -0.012555352
## poor_health_percent   -0.083779334
## Z_Water_Index         -0.021951419
## GHG_Percap             0.009117904
## UNEMPLOY              -0.046479162
## FOOD_INS               0.264151308
## VIO_CRIME             -0.074616693
## broadband_access       0.024255223
## Black_equality         0.031215061
## Hispanic_equality      0.000125565

summary(dat_comps)

## Importance of components:
##                           PC1    PC2     PC3     PC4     PC5     PC6     PC7
## Standard deviation     1.9303 1.5756 1.23866 1.16161 1.05168 1.00171 0.93795
## Proportion of Variance 0.2192 0.1460 0.09025 0.07937 0.06506 0.05902 0.05175
## Cumulative Proportion  0.2192 0.3652 0.45546 0.53483 0.59989 0.65892 0.71067
##                            PC8     PC9    PC10   PC11    PC12    PC13    PC14
## Standard deviation     0.89243 0.88445 0.81992 0.7856 0.74480 0.66122 0.63996
## Proportion of Variance 0.04685 0.04601 0.03954 0.0363 0.03263 0.02572 0.02409
## Cumulative Proportion  0.75752 0.80353 0.84308 0.8794 0.91201 0.93773 0.96182
##                           PC15    PC16    PC17
## Standard deviation     0.54713 0.45933 0.37251
## Proportion of Variance 0.01761 0.01241 0.00816
## Cumulative Proportion  0.97943 0.99184 1.00000

# information about PCs
tidy(dat_comps, "pcs")

## # A tibble: 17 × 4
##       PC std.dev percent cumulative
##    <dbl>   <dbl>   <dbl>      <dbl>
##  1     1   1.93  0.219        0.219
##  2     2   1.58  0.146        0.365
##  3     3   1.24  0.0902       0.455
##  4     4   1.16  0.0794       0.535
##  5     5   1.05  0.0651       0.600
##  6     6   1.00  0.0590       0.659
##  7     7   0.938 0.0518       0.711
##  8     8   0.892 0.0468       0.758
##  9     9   0.884 0.0460       0.804
## 10    10   0.820 0.0395       0.843
## 11    11   0.786 0.0363       0.879
## 12    12   0.745 0.0326       0.912
## 13    13   0.661 0.0257       0.938
## 14    14   0.640 0.0241       0.962
## 15    15   0.547 0.0176       0.979
## 16    16   0.459 0.0124       0.992
## 17    17   0.373 0.00816      1

Run Cluster GMM

Assumptions: - MSAs form clusters characterized by a multivariate distribution - Model forms: shape, volume, orientation - GMM fits a series of models with different forms and numbers of clusters - Models with highest probability and fewest parameters selected as most optimal - Based on Bayesian Information Criterion (BIC)

#run GMM on 1-20 clusters
dat_pcs_mc <- Mclust(dat_pcs, G=c(1:20))
#print summary of model fit
summary(dat_pcs_mc)

## ---------------------------------------------------- 
## Gaussian finite mixture model fitted by EM algorithm 
## ---------------------------------------------------- 
## 
## Mclust VVE (ellipsoidal, equal orientation) model with 6 components: 
## 
##  log-likelihood   n df       BIC       ICL
##       -6961.305 852 75 -14428.68 -14911.71
## 
## Clustering table:
##   1   2   3   4   5   6 
##  62 295 105 196 143  51

Cluster assignments

plot(dat_pcs_mc, what = c("classification"))

Uncertainty in assignments

write.csv(dat_pcs_mc$uncertainty,file=here("results/community_group_uncertainty.csv"))
plot(dat_pcs_mc, what = "uncertainty")

uncerPlot(z = dat_pcs_mc$z)

Model comparison

#plot BIC scores of different models
plot(dat_pcs_mc, what='BIC')

#zoom in on best-fitting models
plot(dat_pcs_mc, what='BIC', ylim=c(-15200,-14600))

#model type VEI and VEE have similar BIC scores for 8-9 clusters
#do additional model comparisons
m_VEI_8 <- Mclust(dat_pcs, G=8, modelNames="VEI")
m_VEE_8 <- Mclust(dat_pcs, G=8, modelNames="VEE")
m_VEI_9 <- Mclust(dat_pcs, G=9, modelNames="VEI")
m_VEI_10 <- Mclust(dat_pcs, G=10, modelNames="VEI")
m_VEI_11 <- Mclust(dat_pcs, G=11, modelNames="VEI")
m_VEE_9 <- Mclust(dat_pcs, G=9, modelNames="VEE")
m_VEE_10 <- Mclust(dat_pcs, G=10, modelNames="VEE")
m_VEE_11 <- Mclust(dat_pcs, G=11, modelNames="VEE")
#extract BIC scores
BICs<-c(m_VEI_8$BIC[1],m_VEE_8$BIC[1],m_VEI_9$BIC[1],m_VEE_9$BIC[1],m_VEI_10$BIC[1],m_VEE_10$BIC[1],m_VEI_11$BIC[1],m_VEE_11$BIC[1])
#calculate change in BIC score, since in this method the goal to to maximize BIC
#we calculate change from highest scoring model
delta_BIC <- max(BICs) - BICs
#calculate BIC weights
w_BIC <- round(exp(-0.5*delta_BIC)/sum(exp(-0.5*delta_BIC)), digits=3)
#make table of results
results_table <- cbind.data.frame(clusters=c("VEI_8","VEE_8","VEI_9","VEE_9","VEI_10","VEE_10","VEI_11","VEE_11"),BIC=BICs,delta_BIC,weight=w_BIC)
#order by delta
results_table <- results_table[order(delta_BIC),] 
rownames(results_table)<-NULL
#print table
kable(results_table[1:4], caption = "Model Comparison")

Model Comparison
clusters	BIC	delta_BIC	weight
VEE_8	-14445.12	0.00000	0.993
VEI_10	-14455.47	10.34412	0.006
VEI_8	-14459.06	13.93913	0.001
VEE_9	-14475.68	30.55874	0.000
VEI_9	-14482.69	37.57235	0.000
VEI_11	-14491.84	46.71835	0.000
VEE_10	-14502.13	57.01166	0.000
VEE_11	-14539.98	94.85692	0.000

## providing some output about the model. It seems that VEI 11 is the best model. 
## MODEL CHOSEN FOR REST OF ANALYSES

output <- clustCombi(data = dat_pcs, modelName = "VEI", G = 11)
# plot the hierarchy of combined solutions
plot(output, what = "classification")

# plot some "entropy plots" which may help one to select the number of classes
plot(output, what = "entropy")

# plot the tree structure obtained from combining mixture components
plot(output, what = "tree")

head( output$combiz[[output$MclustOutput$G]] )

##                      [,1]         [,2]        [,3]         [,4]         [,5]
## Aberdeen, SD 1.229467e-01 4.273945e-04 0.049219750 3.947476e-05 1.853775e-07
## Aberdeen, WA 4.706026e-03 7.934584e-01 0.003429176 1.371212e-01 2.057163e-04
## Abilene, TX  7.701404e-01 5.006758e-07 0.153208714 1.477199e-05 4.902602e-12
## Ada, OK      1.101953e-01 1.227952e-01 0.012399875 2.618893e-01 5.521927e-06
## Adrian, MI   1.025823e-07 1.276254e-04 0.001540898 2.765506e-06 9.966006e-01
## Akron, OH    5.538328e-07 1.118966e-04 0.008356152 9.755845e-05 9.674297e-01
##                      [,6]         [,7]        [,8]         [,9]        [,10]
## Aberdeen, SD 3.131616e-02 2.129300e-02 0.773633869 1.011946e-06 2.374023e-06
## Aberdeen, WA 7.776788e-03 3.236991e-02 0.013716479 7.059625e-03 1.326322e-04
## Abilene, TX  4.535274e-08 9.136533e-05 0.074763537 1.266791e-04 7.006480e-07
## Ada, OK      3.576783e-02 1.756334e-01 0.274582045 5.813599e-03 1.365190e-04
## Adrian, MI   1.365361e-09 4.567665e-04 0.001252312 1.151469e-06 4.032494e-06
## Akron, OH    3.543681e-09 2.036351e-02 0.001293644 2.225669e-05 1.262125e-03
##                     [,11]
## Aberdeen, SD 1.120125e-03
## Aberdeen, WA 2.405733e-05
## Abilene, TX  1.653260e-03
## Ada, OK      7.812642e-04
## Adrian, MI   1.379307e-05
## Akron, OH    1.062631e-03

## output the probabilities by cluster.
write.csv(output$combiz[[output$MclustOutput$G]],file=here("results/cluster_probabilities.csv"))

Extract and map cluster classifications

library(here)
#extract classifications, e.g., which city belongs to which cluster, export to csv and reload

### Change this value to pick a different model
## currently the best solution is VEI with 11 clusters.

write.csv(m_VEI_11$classification,file=here("results/cluster_assignment_pca.csv"))
data<- read.csv(here('results/cluster_assignment_pca.csv'))

#import shapefile
msa_Boundary <-readOGR(here("data/MSA"),"tl_2015_us_cbsa")

## Warning: OGR support is provided by the sf and terra packages among others

## Warning: OGR support is provided by the sf and terra packages among others

## Warning: OGR support is provided by the sf and terra packages among others

## Warning: OGR support is provided by the sf and terra packages among others

## Warning: OGR support is provided by the sf and terra packages among others

## Warning: OGR support is provided by the sf and terra packages among others

## OGR data source with driver: ESRI Shapefile 
## Source: "/Users/clipo/PycharmProjects/sustainable_communities/data/MSA", layer: "tl_2015_us_cbsa"
## with 929 features
## It has 12 fields
## Integer64 fields read as strings:  ALAND AWATER

#merge them
merged <- merge(msa_Boundary,data,by.x="NAME",by.y="X")
#remove any cases MSAs with no cluster assignments
merged_clean <- merged[!is.na(merged$x),]
#convert x to factor
merged_clean$Cluster <- as.factor(merged_clean$x)
#convert to sf object
cluster_sf <- st_as_sf(merged_clean)
#make us map
us_map <- tm_shape(us_states) + tm_borders()
#combine them
cluster_map <- tm_shape(cluster_sf) + tm_fill(col="Cluster", palette="Set1")
#plot them
us_map + cluster_map

## now show the distribution of each of the clusters one at a time
## dat_pcs_mc$G is the number of clusters
par(mfrow = c(6, 2))  # Set up a 2 x 2 plotting space
output_map <- us_map
library(svMisc)

## 
## Attaching package: 'svMisc'

## The following object is masked from 'package:utils':
## 
##     ?

pb = txtProgressBar(min = 1, max = dat_pcs_mc$G, initial = 1)
number_of_clusters_chosen <- 11
for (g in 1:number_of_clusters_chosen) {
#for (g in 1:dat_pcs_mc$G) {
 setTxtProgressBar(pb,g)
 subset_dat <-subset(data,x==g)
  merged <- merge(msa_Boundary,subset_dat,by.x="NAME",by.y="X")
  #remove any cases MSAs with no cluster assignments
  merged_clean <- merged[!is.na(merged$x),]
  #convert x to factor
  merged_clean$Cluster <- as.factor(merged_clean$x)
  #convert to sf object
  cluster_sf <- st_as_sf(merged_clean)
  #make us map
  us_map <- tm_shape(us_states) + tm_borders()
  #combine them
  new_cluster_map <- tm_shape(cluster_sf)+tm_fill(col="Cluster", palette="Set1")

  #plot them
  print (us_map + new_cluster_map)
  #dev.off()#
}

## ================

## ================

## ================

## ================

## ================

Examine how clusters score in different metrics

#new data frame
dat_clust <- as.data.frame(dat)
#add names
dat_clust$ME <- rownames(dat)
rownames(dat_clust) <- NULL
#merge in cluster assignments
dat_clust <- merge(dat_clust,data,by.x="ME",by.y="X")
#calculate cluster means for different variables
c1m <- data.frame(one=colMeans(subset(dat_clust, x=="1")[2:18]))
c2m <- data.frame(two=colMeans(subset(dat_clust, x=="2")[2:18]))
c3m <- data.frame(three=colMeans(subset(dat_clust, x=="3")[2:18]))
c4m <- data.frame(four=colMeans(subset(dat_clust, x=="4")[2:18]))
c5m <- data.frame(five=colMeans(subset(dat_clust, x=="5")[2:18]))
c6m <- data.frame(six=colMeans(subset(dat_clust, x=="6")[2:18]))
c7m <- data.frame(seven=colMeans(subset(dat_clust, x=="7")[2:18]))
c8m <- data.frame(eight=colMeans(subset(dat_clust, x=="8")[2:18]))
c9m <- data.frame(nine=colMeans(subset(dat_clust, x=="9")[2:18]))
c10m <- data.frame(ten=colMeans(subset(dat_clust, x=="10")[2:18]))
c11m <- data.frame(eleven=colMeans(subset(dat_clust, x=="11")[2:18]))
cluster_means <- cbind.data.frame(c1m,c2m,c3m,c4m,c5m,c6m,c7m,c8m,c9m,c10m,c11m)
cluster_means$variable <- row.names(cluster_means)
rownames(cluster_means) <- NULL

#boxplots of scores for clusters
par(mfrow=c(2,4), mar=c(2.5,5,2.5,3))
boxplot(subset(dat_clust, x=="1")[2:18], horizontal=T, las=1, ylim=c(-4,4), main="Cluster 1")
boxplot(subset(dat_clust, x=="2")[2:18], horizontal=T, las=1, ylim=c(-4,4), main="Cluster 2")
boxplot(subset(dat_clust, x=="3")[2:18], horizontal=T, las=1, ylim=c(-4,4), main="Cluster 3")
boxplot(subset(dat_clust, x=="4")[2:18], horizontal=T, las=1, ylim=c(-4,4), main="Cluster 4")
boxplot(subset(dat_clust, x=="5")[2:18], horizontal=T, las=1, ylim=c(-4,4), main="Cluster 5")
boxplot(subset(dat_clust, x=="6")[2:18], horizontal=T, las=1, ylim=c(-4,4), main="Cluster 6")
boxplot(subset(dat_clust, x=="7")[2:18], horizontal=T, las=1, ylim=c(-4,4), main="Cluster 7")
boxplot(subset(dat_clust, x=="8")[2:18], horizontal=T, las=1, ylim=c(-4,4), main="Cluster 8")

boxplot(subset(dat_clust, x=="9")[2:18], horizontal=T, las=1, ylim=c(-4,4), main="Cluster 9")
boxplot(subset(dat_clust, x=="10")[2:18], horizontal=T, las=1, ylim=c(-4,4), main="Cluster 10")
boxplot(subset(dat_clust, x=="11")[2:18], horizontal=T, las=1, ylim=c(-4,4), main="Cluster 11")
par(mfrow=c(1,1))

#dotplots of cluster mean values
par(mfrow=c(2,5))
dotchart(cluster_means$one, labels=cluster_means$variable,xlim=c(-2,2), main='Cluster 1', pch=16)
abline(v=0, lwd=2)
dotchart(cluster_means$two, labels=cluster_means$variable,xlim=c(-2,2), main='Cluster 2', pch=16)
abline(v=0, lwd=2)
dotchart(cluster_means$three, labels=cluster_means$variable,xlim=c(-2,2), main='Cluster 3', pch=16)
abline(v=0, lwd=2)
dotchart(cluster_means$four, labels=cluster_means$variable,xlim=c(-2,2), main='Cluster 4', pch=16)
abline(v=0, lwd=2)
dotchart(cluster_means$five, labels=cluster_means$variable,xlim=c(-2,2), main='Cluster 5', pch=16)
abline(v=0, lwd=2)
dotchart(cluster_means$six, labels=cluster_means$variable,xlim=c(-2,2), main='Cluster 6', pch=16)
abline(v=0, lwd=2)
dotchart(cluster_means$seven, labels=cluster_means$variable,xlim=c(-2,2), main='Cluster 7', pch=16)
abline(v=0, lwd=2)
dotchart(cluster_means$eight, labels=cluster_means$variable,xlim=c(-2,2), main='Cluster 8', pch=16)
abline(v=0, lwd=2)
dotchart(cluster_means$nine, labels=cluster_means$variable,xlim=c(-2,2), main='Cluster 9', pch=16)
abline(v=0, lwd=2)
dotchart(cluster_means$ten, labels=cluster_means$variable,xlim=c(-2,2), main='Cluster 10', pch=16)
abline(v=0, lwd=2)

dotchart(cluster_means$eleven, labels=cluster_means$variable,xlim=c(-2,2), main='Cluster 11', pch=16)
abline(v=0, lwd=2)
par(mfrow=c(1,1))

Which MSAs best characterize their cluster?

Typical members of each cluster
Cluster	Name
1	Sierra Vista-Douglas, AZ
2	Danville, VA
3	Roseburg, OR
4	Russellville, AR
5	Canton-Massillon, OH
6	Lewiston, ID-WA
7	Richmond, VA
8	Grand Island, NE
9	Texarkana, TX-AR
10	Sacramento–Roseville–Arden-Arcade, CA
11	Lexington-Fayette, KY

Which cluster scores the best on the different metrics?

Metric	Cluster
AQI_Good	three
Bachelor_Over_25	eleven
Per_Poverty	eight
Gini	six
non_migration	eight
Per_Sev_Hous	one
Xstreamlengthimpaired	one
Per_Avg_Land_Cov	eleven
poor_health_percent	five
Z_Water_Index	eight
GHG_Percap	ten
UNEMPLOY	eight
FOOD_INS	eight
VIO_CRIME	six
broadband_access	ten
Black_equality	eleven

Table showing which clusters score best and worst for different metrics

cluster_means_t %>%
  kbl() %>%
  kable_paper(full_width=T) %>%
  #column_spec(1, color = ifelse(cluster_means_t$AQI_Good < max(cluster_means_t$AQI_Good), 'black','blue'))
  column_spec(1, color = ifelse(cluster_means_t[,1] == max(cluster_means_t[,1]), "blue",
                                   ifelse(cluster_means_t[,1] == min(cluster_means_t[,1]), "red", "black"))) %>%
  column_spec(2, color = ifelse(cluster_means_t[,2] == max(cluster_means_t[,2]), "blue",
                                   ifelse(cluster_means_t[,2] == min(cluster_means_t[,2]), "red", "black"))) %>%
  column_spec(8, color = ifelse(cluster_means_t[,8] == max(cluster_means_t[,8]), "blue",
                                   ifelse(cluster_means_t[,8] == min(cluster_means_t[,8]), "red", "black"))) %>%
  column_spec(3, color = ifelse(cluster_means_t[,3] == max(cluster_means_t[,3]), "red",
                                   ifelse(cluster_means_t[,3] == min(cluster_means_t[,3]), "blue", "black"))) %>%
  column_spec(4, color = ifelse(cluster_means_t[,4] == max(cluster_means_t[,4]), "red",
                                   ifelse(cluster_means_t[,4] == min(cluster_means_t[,4]), "blue", "black"))) %>%
  column_spec(5, color = ifelse(cluster_means_t[,5] == max(cluster_means_t[,5]), "red",
                                   ifelse(cluster_means_t[,5] == min(cluster_means_t[,5]), "blue", "black"))) %>%
  column_spec(6, color = ifelse(cluster_means_t[,6] == max(cluster_means_t[,6]), "red",
                                   ifelse(cluster_means_t[,6] == min(cluster_means_t[,6]), "blue", "black"))) %>%
  column_spec(7, color = ifelse(cluster_means_t[,7] == max(cluster_means_t[,7]), "red",
                                   ifelse(cluster_means_t[,7] == min(cluster_means_t[,7]), "blue", "black"))) %>%
  column_spec(9, color = ifelse(cluster_means_t[,9] == max(cluster_means_t[,9]), "red",
                                   ifelse(cluster_means_t[,9] == min(cluster_means_t[,9]), "blue", "black"))) %>%
  column_spec(10, color = ifelse(cluster_means_t[,10] == max(cluster_means_t[,10]), "red",
                                   ifelse(cluster_means_t[,10] == min(cluster_means_t[,10]), "blue", "black"))) %>%
  column_spec(11, color = ifelse(cluster_means_t[,11] == max(cluster_means_t[,11]), "red",
                                   ifelse(cluster_means_t[,11] == min(cluster_means_t[,11]), "blue", "black"))) %>%
  column_spec(12, color = ifelse(cluster_means_t[,12] == max(cluster_means_t[,12]), "red",
                                   ifelse(cluster_means_t[,12] == min(cluster_means_t[,12]), "blue", "black"))) %>%
  column_spec(13, color = ifelse(cluster_means_t[,13] == max(cluster_means_t[,13]), "red",
                                   ifelse(cluster_means_t[,13] == min(cluster_means_t[,13]), "blue", "black"))) %>%
  column_spec(14, color = ifelse(cluster_means_t[,14] == max(cluster_means_t[,14]), "red",
                                   ifelse(cluster_means_t[,14] == min(cluster_means_t[,14]), "blue", "black"))) %>%
  column_spec(15, color = ifelse(cluster_means_t[,15] == max(cluster_means_t[,15]), "red",
                                   ifelse(cluster_means_t[,15] == min(cluster_means_t[,15]), "blue", "black"))) %>%
  column_spec(16, color = ifelse(cluster_means_t[,16] == max(cluster_means_t[,16]), "red",
                                   ifelse(cluster_means_t[,16] == min(cluster_means_t[,16]), "blue", "black"))) %>%
  column_spec(17, color = ifelse(cluster_means_t[,17] == max(cluster_means_t[,17]), "red",
                                   ifelse(cluster_means_t[,17] == min(cluster_means_t[,17]), "blue", "black")))

AQI_Good	Bachelor_Over_25	Per_Poverty	Gini	non_migration	Per_Sev_Hous	Xstreamlengthimpaired	Per_Avg_Land_Cov	poor_health_percent	Z_Water_Index	GHG_Percap	UNEMPLOY	FOOD_INS	VIO_CRIME	broadband_access	Black_equality	Hispanic_equality	cluster
0.0360776	-0.3772906	0.1285585	-0.1417523	-0.2271069	-0.0060785	-0.8160342	1.1140696	0.3486497	2.5919687	0.3380479	0.1018823	-0.1285667	0.3617222	0.6673693	0.1242672	-0.4037226	one
0.2070975	-0.7300326	0.1300856	-0.3109608	0.5813250	-0.4313629	-0.0577310	0.2818725	0.7295881	-0.3904548	-0.1553898	0.2275428	-0.0736980	-0.2697039	0.0210235	0.1356737	0.5583812	two
0.4563573	-0.4305608	0.7510521	0.1327282	0.1806435	0.1833906	-0.0264739	0.1850960	0.5623121	0.0713802	1.1695585	1.3840682	0.9258692	0.1446759	0.7049975	1.6250379	0.8441835	three
0.2210572	-0.3058475	0.6537110	0.6712720	-0.1848036	0.1439168	-0.3057518	0.2683944	0.4639552	-0.2632204	-0.0714899	0.3540815	0.8208251	0.4348394	0.0494244	-0.2088244	-0.0697042	four
-0.0811630	-0.4332021	-0.3191460	-0.5784421	0.4023384	-0.6201188	2.2799306	-1.2272393	-0.1033147	-0.6353407	-0.0449609	-0.1033253	-0.2222390	-0.4492769	-0.2598925	-0.1819317	0.7232332	five
0.4022994	0.0672796	-0.4292573	-0.7393748	-0.5624809	0.0646084	-0.3205879	0.9030815	-0.3885820	-0.1435492	-0.3025483	-0.0067968	-0.1855458	-0.5951859	-0.2016881	1.5591468	-0.3096278	six
-0.3162789	0.7441927	-0.5847208	0.0712687	-0.0012234	0.3072409	-0.0979369	0.3631567	-0.5367045	-0.0280526	-0.4180048	-0.1529441	-0.3993503	0.0460079	-0.5137043	-0.1714813	-0.4249124	seven
0.3918498	-0.0796812	-0.6817636	-0.6533770	0.1062352	-0.8417880	-0.3681663	-0.9029930	-0.6491451	-0.0752663	0.5875915	-0.8113360	-0.6455292	-0.3546319	0.2757751	-0.3596472	-0.2134944	eight
0.2550985	-0.8978973	1.6865618	1.1141420	0.6385780	0.3993115	-0.4387603	0.0943710	1.4420051	-0.3248971	0.1971615	0.7316543	1.6786762	1.3221785	0.8053089	-0.2432397	0.4792626	nine
-2.8779699	0.6585421	-0.0343152	0.6485074	0.1089758	2.0889358	-0.2900334	0.3595380	0.2706320	0.6258660	-0.7102967	0.7269916	-0.5195354	0.7081738	-0.6620594	-0.0753673	-0.5652390	ten
0.4178735	1.6618611	0.9724359	1.0914319	-2.1513159	1.0749451	-0.0369365	-0.1582788	-0.8346335	-0.2521397	-0.0807870	-0.2685013	0.7280590	-0.3963026	-0.0582145	0.2022337	-0.3025342	eleven

Sustainable Communities Metrics - MSA Cluster Modeling

Robert J. DiNapoli, Carl P. Lipo, Pamela Mischen

1/25/2023