Load Packages

library(here)
library(mclust)
library(raster)
library(rgdal)
library(sp)
library(RColorBrewer)
library(PCAtools)
library(sf)
library(tmap)
library(terra)
library(rgeos)
library(spData)
library(knitr)
library(kableExtra)
## Warning in !is.null(rmarkdown::metadata$output) && rmarkdown::metadata$output
## %in% : 'length(x) = 3 > 1' in coercion to 'logical(1)'

Import Data

#import raw data
dat_raw <- read.csv(here("data/CommunityData-raw-2015-v3.csv"))
#select columns of interest, for this run remove asian_equality and female_equality
dat  <- dat_raw[,c(4:18,20:21)]
#convert to dataframe
dat <- as.data.frame(unclass(dat))
#set rownames to city names
rownames(dat)=dat_raw$ME
# remove any records that have NAs
dat = na.omit(dat)
# convert to z-scores
dat <- as.matrix(scale(dat, center = TRUE, scale = TRUE))

Run PCA on dataset

dat_comps <- prcomp(dat, center = F, scale=F) #set center and scale to FALSE because done pre-PCA
#get summary
s <- summary(dat_comps)
kable(s$importance)
PC1 PC2 PC3 PC4 PC5 PC6 PC7 PC8 PC9 PC10 PC11 PC12 PC13 PC14 PC15 PC16 PC17
Standard deviation 1.930256 1.57564 1.238658 1.161608 1.051675 1.001707 0.937954 0.8924302 0.8844498 0.8199171 0.7855821 0.7447965 0.6612225 0.6399578 0.5471293 0.4593311 0.3725125
Proportion of Variance 0.219170 0.14604 0.090250 0.079370 0.065060 0.059020 0.051750 0.0468500 0.0460100 0.0395400 0.0363000 0.0326300 0.0257200 0.0240900 0.0176100 0.0124100 0.0081600
Cumulative Proportion 0.219170 0.36521 0.455460 0.534830 0.599890 0.658920 0.710670 0.7575200 0.8035300 0.8430800 0.8793800 0.9120100 0.9377300 0.9618200 0.9794300 0.9918400 1.0000000
#get eigenvalues
#### change this # of columns to match
ev <- data.frame(Component=paste("PC",1:17, sep=""),eigenvalue=dat_comps$sdev^2)
kable(ev)
Component eigenvalue
PC1 3.7258878
PC2 2.4826401
PC3 1.5342730
PC4 1.3493324
PC5 1.1060209
PC6 1.0034176
PC7 0.8797577
PC8 0.7964316
PC9 0.7822515
PC10 0.6722641
PC11 0.6171392
PC12 0.5547218
PC13 0.4372152
PC14 0.4095460
PC15 0.2993505
PC16 0.2109850
PC17 0.1387656
#write to CSV
write.csv(dat_comps$x,file=here("results/community_loadings.csv"))

Extract components with eigenvalues > 1

dat_pcs <- dat_comps$x[,1:5]

Plot loadings

#plot loadings of pc1 and 2
plot(dat_comps$x[,1],dat_comps$x[,2], 
     xlab=paste("PCA 1 (", round(s$importance[2,1]*100, 1), "%)", sep = ""), 
     ylab=paste("PCA 2 (", round(s$importance[2,2]*100, 1), "%)", sep = ""), 
     pch=16, col="blue", cex=0.5)
abline(v=0, lwd=2, lty=2)
abline(h=0, lwd=2, lty=2)
#get loadings
l.x <- dat_comps$rotation[,1]*10
l.y <- dat_comps$rotation[,2]*10
arrows(x0=mean(dat_comps$x[,1]), x1=l.x, y0=mean(dat_comps$x[,2]), y1=l.y, col="black", length=0.15, lwd=1.5)
# Label position
l.pos <- l.y # Create a vector of y axis coordinates
lo <- which(l.y < 0) # Get the variables on the bottom half of the plot
hi <- which(l.y > 0) # Get variables on the top half
# Replace values in the vector
l.pos <- replace(l.pos, lo, "1")
l.pos <- replace(l.pos, hi, "3")
text(l.x, l.y, labels=row.names(dat_comps$rotation), col="black", pos=l.pos, cex=1.25)

#plot loadings of pc 3 and 4
plot(dat_comps$x[,3],dat_comps$x[,4], 
     xlab=paste("PCA 3 (", round(s$importance[2,3]*100, 1), "%)", sep = ""), 
     ylab=paste("PCA 4 (", round(s$importance[2,4]*100, 1), "%)", sep = ""), 
     pch=16, col="blue", cex=0.5)
abline(v=0, lwd=2, lty=2)
abline(h=0, lwd=2, lty=2)

#get loadings
l.x <- dat_comps$rotation[,3]*10
l.y <- dat_comps$rotation[,4]*10
arrows(x0=mean(dat_comps$x[,3]), x1=l.x, y0=mean(dat_comps$x[,4]), y1=l.y, col="black", length=0.15, lwd=1.5)
# Label position
l.pos <- l.y # Create a vector of y axis coordinates
lo <- which(l.y < 0) # Get the variables on the bottom half of the plot
hi <- which(l.y > 0) # Get variables on the top half
# Replace values in the vector
l.pos <- replace(l.pos, lo, "1")
l.pos <- replace(l.pos, hi, "3")
text(l.x, l.y, labels=row.names(dat_comps$rotation), col="black", pos=l.pos, cex=1.25)

## Summary of PC loadings

library(broom)
print(dat_comps)
## Standard deviations (1, .., p=17):
##  [1] 1.9302559 1.5756396 1.2386578 1.1616077 1.0516753 1.0017073 0.9379540
##  [8] 0.8924302 0.8844498 0.8199171 0.7855821 0.7447965 0.6612225 0.6399578
## [15] 0.5471293 0.4593311 0.3725125
## 
## Rotation (n x k) = (17 x 17):
##                                PC1         PC2         PC3         PC4
## AQI_Good              -0.062866767  0.18051196 -0.05245837  0.52089617
## Bachelor_Over_25      -0.155104384 -0.49978145  0.19800492  0.13648219
## Per_Poverty            0.446860133  0.08223058  0.13323024  0.19642620
## Gini                   0.342239993 -0.18630649  0.22744433  0.08291974
## non_migration         -0.039451114  0.32797305 -0.14279570 -0.44691749
## Per_Sev_Hous           0.290477489 -0.35489641  0.13570938 -0.14642252
## Xstreamlengthimpaired -0.150768007  0.20034069  0.49469903 -0.14593805
## Per_Avg_Land_Cov       0.199050643 -0.23359150 -0.38405605 -0.08772787
## poor_health_percent    0.345400154  0.29918893 -0.13558026 -0.17710772
## Z_Water_Index          0.089114628 -0.21417037 -0.51339880 -0.10248384
## GHG_Percap            -0.057559630  0.17289329 -0.22412476  0.26544463
## UNEMPLOY               0.348499784  0.06211658  0.10988728 -0.14813861
## FOOD_INS               0.404596427  0.06714552  0.17288182  0.28511761
## VIO_CRIME              0.281544821 -0.02500852 -0.05848821 -0.18430169
## broadband_access       0.097667318  0.20271579 -0.22647746  0.37023324
## Black_equality        -0.009441226 -0.04032470 -0.13725376  0.15597909
## Hispanic_equality      0.047862361  0.36187781  0.09909611 -0.05894649
##                               PC5         PC6          PC7          PC8
## AQI_Good              -0.17198617  0.47084965 -0.245408998  0.075952675
## Bachelor_Over_25       0.06317829 -0.14379264  0.006828514 -0.100615801
## Per_Poverty           -0.01117834  0.05591647 -0.043226995 -0.034108019
## Gini                   0.18211836 -0.08791491 -0.019313124 -0.187925720
## non_migration          0.04911408  0.07325898  0.181902454 -0.435429816
## Per_Sev_Hous          -0.08291192  0.02485665 -0.077857224 -0.089007519
## Xstreamlengthimpaired -0.16377164 -0.14024134  0.043948672  0.115291952
## Per_Avg_Land_Cov      -0.33426782  0.10134533 -0.043844200 -0.269248508
## poor_health_percent   -0.10633054 -0.00034835 -0.016614099 -0.004408456
## Z_Water_Index          0.07603166  0.05259835 -0.186814103  0.449609515
## GHG_Percap             0.39901479 -0.33882522 -0.420146662 -0.490613531
## UNEMPLOY              -0.18769732  0.18575024 -0.035549241 -0.092668435
## FOOD_INS               0.03800870 -0.01872166  0.044161993 -0.015578578
## VIO_CRIME              0.36690013 -0.33459868  0.011439418  0.341182493
## broadband_access       0.08437931 -0.15341761  0.692466509  0.100549260
## Black_equality        -0.64721099 -0.60813110  0.002651743 -0.058554005
## Hispanic_equality     -0.10817632 -0.23044695 -0.447125969  0.287533693
##                                 PC9         PC10         PC11        PC12
## AQI_Good              -0.2732818757  0.369833692 -0.009624339 -0.20331317
## Bachelor_Over_25      -0.2334242202 -0.078197015 -0.031481748 -0.16573561
## Per_Poverty           -0.0245636963  0.016947565  0.154246386  0.21982401
## Gini                  -0.4861096856  0.085832390  0.147741301  0.10811329
## non_migration         -0.4132019509  0.218477058  0.089364559 -0.06116509
## Per_Sev_Hous           0.0446048786 -0.115712777  0.231037130  0.17385569
## Xstreamlengthimpaired  0.0253390855  0.155672391  0.605550191 -0.34735193
## Per_Avg_Land_Cov      -0.1659868117 -0.261271007 -0.016345748 -0.49977794
## poor_health_percent   -0.0040994359  0.072377014 -0.044938235  0.33180728
## Z_Water_Index         -0.0377583050  0.120419062  0.540428303  0.05352150
## GHG_Percap             0.2681578313 -0.009279866  0.239808768 -0.09976052
## UNEMPLOY               0.4801011740 -0.029974932 -0.003317954 -0.34486606
## FOOD_INS               0.0560305175  0.046806400 -0.092761124 -0.12661455
## VIO_CRIME             -0.0572106296  0.383223273 -0.325866902 -0.41196138
## broadband_access      -0.0569698528 -0.312191714  0.218030047 -0.12299343
## Black_equality        -0.0007453668  0.324460848 -0.056475264  0.13079921
## Hispanic_equality     -0.3456850595 -0.568614566 -0.088614696 -0.08343529
##                               PC13        PC14         PC15         PC16
## AQI_Good               0.303872519  0.07309613  0.104304854 -0.093683961
## Bachelor_Over_25      -0.001954031 -0.15583734  0.256923869 -0.476352663
## Per_Poverty           -0.008240500  0.07231763 -0.275258258  0.311682555
## Gini                  -0.106885678 -0.24642899  0.320157613  0.359607861
## non_migration          0.136331220 -0.23832763 -0.275877069 -0.211797074
## Per_Sev_Hous           0.613823895  0.30058746 -0.209812289 -0.243800467
## Xstreamlengthimpaired -0.149611888  0.26222491  0.076977466  0.006579837
## Per_Avg_Land_Cov      -0.242586898  0.34390463 -0.001524303  0.176231700
## poor_health_percent   -0.149281323  0.31464822  0.589497336 -0.369660030
## Z_Water_Index         -0.146542371 -0.27380123 -0.056719257 -0.124572415
## GHG_Percap             0.066815986  0.07397619  0.063048282 -0.037571715
## UNEMPLOY               0.201109247 -0.54869803  0.258413340  0.017119678
## FOOD_INS              -0.445219653 -0.06995732 -0.421482336 -0.485922760
## VIO_CRIME              0.214208286  0.19152471 -0.005089669  0.073591852
## broadband_access       0.250148875 -0.01889712  0.111346665 -0.037870686
## Black_equality         0.079725844 -0.15759454 -0.063851166  0.034039957
## Hispanic_equality      0.133648201 -0.15148568 -0.056217176 -0.066700180
##                               PC17
## AQI_Good               0.043346545
## Bachelor_Over_25      -0.476673841
## Per_Poverty           -0.692618987
## Gini                   0.363061375
## non_migration         -0.117205154
## Per_Sev_Hous           0.242410796
## Xstreamlengthimpaired -0.007762626
## Per_Avg_Land_Cov      -0.012555352
## poor_health_percent   -0.083779334
## Z_Water_Index         -0.021951419
## GHG_Percap             0.009117904
## UNEMPLOY              -0.046479162
## FOOD_INS               0.264151308
## VIO_CRIME             -0.074616693
## broadband_access       0.024255223
## Black_equality         0.031215061
## Hispanic_equality      0.000125565
summary(dat_comps)
## Importance of components:
##                           PC1    PC2     PC3     PC4     PC5     PC6     PC7
## Standard deviation     1.9303 1.5756 1.23866 1.16161 1.05168 1.00171 0.93795
## Proportion of Variance 0.2192 0.1460 0.09025 0.07937 0.06506 0.05902 0.05175
## Cumulative Proportion  0.2192 0.3652 0.45546 0.53483 0.59989 0.65892 0.71067
##                            PC8     PC9    PC10   PC11    PC12    PC13    PC14
## Standard deviation     0.89243 0.88445 0.81992 0.7856 0.74480 0.66122 0.63996
## Proportion of Variance 0.04685 0.04601 0.03954 0.0363 0.03263 0.02572 0.02409
## Cumulative Proportion  0.75752 0.80353 0.84308 0.8794 0.91201 0.93773 0.96182
##                           PC15    PC16    PC17
## Standard deviation     0.54713 0.45933 0.37251
## Proportion of Variance 0.01761 0.01241 0.00816
## Cumulative Proportion  0.97943 0.99184 1.00000
# information about PCs
tidy(dat_comps, "pcs")
## # A tibble: 17 × 4
##       PC std.dev percent cumulative
##    <dbl>   <dbl>   <dbl>      <dbl>
##  1     1   1.93  0.219        0.219
##  2     2   1.58  0.146        0.365
##  3     3   1.24  0.0902       0.455
##  4     4   1.16  0.0794       0.535
##  5     5   1.05  0.0651       0.600
##  6     6   1.00  0.0590       0.659
##  7     7   0.938 0.0518       0.711
##  8     8   0.892 0.0468       0.758
##  9     9   0.884 0.0460       0.804
## 10    10   0.820 0.0395       0.843
## 11    11   0.786 0.0363       0.879
## 12    12   0.745 0.0326       0.912
## 13    13   0.661 0.0257       0.938
## 14    14   0.640 0.0241       0.962
## 15    15   0.547 0.0176       0.979
## 16    16   0.459 0.0124       0.992
## 17    17   0.373 0.00816      1

Run Cluster GMM

Assumptions: - MSAs form clusters characterized by a multivariate distribution - Model forms: shape, volume, orientation - GMM fits a series of models with different forms and numbers of clusters - Models with highest probability and fewest parameters selected as most optimal - Based on Bayesian Information Criterion (BIC)

#run GMM on 1-20 clusters
dat_pcs_mc <- Mclust(dat_pcs, G=c(1:20))
#print summary of model fit
summary(dat_pcs_mc)
## ---------------------------------------------------- 
## Gaussian finite mixture model fitted by EM algorithm 
## ---------------------------------------------------- 
## 
## Mclust VVE (ellipsoidal, equal orientation) model with 6 components: 
## 
##  log-likelihood   n df       BIC       ICL
##       -6961.305 852 75 -14428.68 -14911.71
## 
## Clustering table:
##   1   2   3   4   5   6 
##  62 295 105 196 143  51

Cluster assignments

plot(dat_pcs_mc, what = c("classification"))

Uncertainty in assignments

write.csv(dat_pcs_mc$uncertainty,file=here("results/community_group_uncertainty.csv"))
plot(dat_pcs_mc, what = "uncertainty")

uncerPlot(z = dat_pcs_mc$z)

Model comparison

#plot BIC scores of different models
plot(dat_pcs_mc, what='BIC')

#zoom in on best-fitting models
plot(dat_pcs_mc, what='BIC', ylim=c(-15200,-14600))

#model type VEI and VEE have similar BIC scores for 8-9 clusters
#do additional model comparisons
m_VEI_8 <- Mclust(dat_pcs, G=8, modelNames="VEI")
m_VEE_8 <- Mclust(dat_pcs, G=8, modelNames="VEE")
m_VEI_9 <- Mclust(dat_pcs, G=9, modelNames="VEI")
m_VEI_10 <- Mclust(dat_pcs, G=10, modelNames="VEI")
m_VEI_11 <- Mclust(dat_pcs, G=11, modelNames="VEI")
m_VEE_9 <- Mclust(dat_pcs, G=9, modelNames="VEE")
m_VEE_10 <- Mclust(dat_pcs, G=10, modelNames="VEE")
m_VEE_11 <- Mclust(dat_pcs, G=11, modelNames="VEE")
#extract BIC scores
BICs<-c(m_VEI_8$BIC[1],m_VEE_8$BIC[1],m_VEI_9$BIC[1],m_VEE_9$BIC[1],m_VEI_10$BIC[1],m_VEE_10$BIC[1],m_VEI_11$BIC[1],m_VEE_11$BIC[1])
#calculate change in BIC score, since in this method the goal to to maximize BIC
#we calculate change from highest scoring model
delta_BIC <- max(BICs) - BICs
#calculate BIC weights
w_BIC <- round(exp(-0.5*delta_BIC)/sum(exp(-0.5*delta_BIC)), digits=3)
#make table of results
results_table <- cbind.data.frame(clusters=c("VEI_8","VEE_8","VEI_9","VEE_9","VEI_10","VEE_10","VEI_11","VEE_11"),BIC=BICs,delta_BIC,weight=w_BIC)
#order by delta
results_table <- results_table[order(delta_BIC),] 
rownames(results_table)<-NULL
#print table
kable(results_table[1:4], caption = "Model Comparison")
Model Comparison
clusters BIC delta_BIC weight
VEE_8 -14445.12 0.00000 0.993
VEI_10 -14455.47 10.34412 0.006
VEI_8 -14459.06 13.93913 0.001
VEE_9 -14475.68 30.55874 0.000
VEI_9 -14482.69 37.57235 0.000
VEI_11 -14491.84 46.71835 0.000
VEE_10 -14502.13 57.01166 0.000
VEE_11 -14539.98 94.85692 0.000
## providing some output about the model. It seems that VEI 11 is the best model. 
## MODEL CHOSEN FOR REST OF ANALYSES

output <- clustCombi(data = dat_pcs, modelName = "VEI", G = 11)
# plot the hierarchy of combined solutions
plot(output, what = "classification")

# plot some "entropy plots" which may help one to select the number of classes
plot(output, what = "entropy")

# plot the tree structure obtained from combining mixture components
plot(output, what = "tree")

head( output$combiz[[output$MclustOutput$G]] )
##                      [,1]         [,2]        [,3]         [,4]         [,5]
## Aberdeen, SD 1.229467e-01 4.273945e-04 0.049219750 3.947476e-05 1.853775e-07
## Aberdeen, WA 4.706026e-03 7.934584e-01 0.003429176 1.371212e-01 2.057163e-04
## Abilene, TX  7.701404e-01 5.006758e-07 0.153208714 1.477199e-05 4.902602e-12
## Ada, OK      1.101953e-01 1.227952e-01 0.012399875 2.618893e-01 5.521927e-06
## Adrian, MI   1.025823e-07 1.276254e-04 0.001540898 2.765506e-06 9.966006e-01
## Akron, OH    5.538328e-07 1.118966e-04 0.008356152 9.755845e-05 9.674297e-01
##                      [,6]         [,7]        [,8]         [,9]        [,10]
## Aberdeen, SD 3.131616e-02 2.129300e-02 0.773633869 1.011946e-06 2.374023e-06
## Aberdeen, WA 7.776788e-03 3.236991e-02 0.013716479 7.059625e-03 1.326322e-04
## Abilene, TX  4.535274e-08 9.136533e-05 0.074763537 1.266791e-04 7.006480e-07
## Ada, OK      3.576783e-02 1.756334e-01 0.274582045 5.813599e-03 1.365190e-04
## Adrian, MI   1.365361e-09 4.567665e-04 0.001252312 1.151469e-06 4.032494e-06
## Akron, OH    3.543681e-09 2.036351e-02 0.001293644 2.225669e-05 1.262125e-03
##                     [,11]
## Aberdeen, SD 1.120125e-03
## Aberdeen, WA 2.405733e-05
## Abilene, TX  1.653260e-03
## Ada, OK      7.812642e-04
## Adrian, MI   1.379307e-05
## Akron, OH    1.062631e-03
## output the probabilities by cluster.
write.csv(output$combiz[[output$MclustOutput$G]],file=here("results/cluster_probabilities.csv"))

Extract and map cluster classifications

library(here)
#extract classifications, e.g., which city belongs to which cluster, export to csv and reload

### Change this value to pick a different model
## currently the best solution is VEI with 11 clusters.

write.csv(m_VEI_11$classification,file=here("results/cluster_assignment_pca.csv"))
data<- read.csv(here('results/cluster_assignment_pca.csv'))

#import shapefile
msa_Boundary <-readOGR(here("data/MSA"),"tl_2015_us_cbsa") 
## Warning: OGR support is provided by the sf and terra packages among others
## Warning: OGR support is provided by the sf and terra packages among others
## Warning: OGR support is provided by the sf and terra packages among others
## Warning: OGR support is provided by the sf and terra packages among others
## Warning: OGR support is provided by the sf and terra packages among others
## Warning: OGR support is provided by the sf and terra packages among others
## OGR data source with driver: ESRI Shapefile 
## Source: "/Users/clipo/PycharmProjects/sustainable_communities/data/MSA", layer: "tl_2015_us_cbsa"
## with 929 features
## It has 12 fields
## Integer64 fields read as strings:  ALAND AWATER
#merge them
merged <- merge(msa_Boundary,data,by.x="NAME",by.y="X")
#remove any cases MSAs with no cluster assignments
merged_clean <- merged[!is.na(merged$x),]
#convert x to factor
merged_clean$Cluster <- as.factor(merged_clean$x)
#convert to sf object
cluster_sf <- st_as_sf(merged_clean)
#make us map
us_map <- tm_shape(us_states) + tm_borders()
#combine them
cluster_map <- tm_shape(cluster_sf) + tm_fill(col="Cluster", palette="Set1")
#plot them
us_map + cluster_map

## now show the distribution of each of the clusters one at a time
## dat_pcs_mc$G is the number of clusters
par(mfrow = c(6, 2))  # Set up a 2 x 2 plotting space
output_map <- us_map
library(svMisc)
## 
## Attaching package: 'svMisc'
## The following object is masked from 'package:utils':
## 
##     ?
pb = txtProgressBar(min = 1, max = dat_pcs_mc$G, initial = 1)
number_of_clusters_chosen <- 11
for (g in 1:number_of_clusters_chosen) {
#for (g in 1:dat_pcs_mc$G) {
 setTxtProgressBar(pb,g)
 subset_dat <-subset(data,x==g)
  merged <- merge(msa_Boundary,subset_dat,by.x="NAME",by.y="X")
  #remove any cases MSAs with no cluster assignments
  merged_clean <- merged[!is.na(merged$x),]
  #convert x to factor
  merged_clean$Cluster <- as.factor(merged_clean$x)
  #convert to sf object
  cluster_sf <- st_as_sf(merged_clean)
  #make us map
  us_map <- tm_shape(us_states) + tm_borders()
  #combine them
  new_cluster_map <- tm_shape(cluster_sf)+tm_fill(col="Cluster", palette="Set1")

  #plot them
  print (us_map + new_cluster_map)
  #dev.off()#
}

## ================

## ================

## ================

## ================

## ================

Examine how clusters score in different metrics

#new data frame
dat_clust <- as.data.frame(dat)
#add names
dat_clust$ME <- rownames(dat)
rownames(dat_clust) <- NULL
#merge in cluster assignments
dat_clust <- merge(dat_clust,data,by.x="ME",by.y="X")
#calculate cluster means for different variables
c1m <- data.frame(one=colMeans(subset(dat_clust, x=="1")[2:18]))
c2m <- data.frame(two=colMeans(subset(dat_clust, x=="2")[2:18]))
c3m <- data.frame(three=colMeans(subset(dat_clust, x=="3")[2:18]))
c4m <- data.frame(four=colMeans(subset(dat_clust, x=="4")[2:18]))
c5m <- data.frame(five=colMeans(subset(dat_clust, x=="5")[2:18]))
c6m <- data.frame(six=colMeans(subset(dat_clust, x=="6")[2:18]))
c7m <- data.frame(seven=colMeans(subset(dat_clust, x=="7")[2:18]))
c8m <- data.frame(eight=colMeans(subset(dat_clust, x=="8")[2:18]))
c9m <- data.frame(nine=colMeans(subset(dat_clust, x=="9")[2:18]))
c10m <- data.frame(ten=colMeans(subset(dat_clust, x=="10")[2:18]))
c11m <- data.frame(eleven=colMeans(subset(dat_clust, x=="11")[2:18]))
cluster_means <- cbind.data.frame(c1m,c2m,c3m,c4m,c5m,c6m,c7m,c8m,c9m,c10m,c11m)
cluster_means$variable <- row.names(cluster_means)
rownames(cluster_means) <- NULL
#boxplots of scores for clusters
par(mfrow=c(2,4), mar=c(2.5,5,2.5,3))
boxplot(subset(dat_clust, x=="1")[2:18], horizontal=T, las=1, ylim=c(-4,4), main="Cluster 1")
boxplot(subset(dat_clust, x=="2")[2:18], horizontal=T, las=1, ylim=c(-4,4), main="Cluster 2")
boxplot(subset(dat_clust, x=="3")[2:18], horizontal=T, las=1, ylim=c(-4,4), main="Cluster 3")
boxplot(subset(dat_clust, x=="4")[2:18], horizontal=T, las=1, ylim=c(-4,4), main="Cluster 4")
boxplot(subset(dat_clust, x=="5")[2:18], horizontal=T, las=1, ylim=c(-4,4), main="Cluster 5")
boxplot(subset(dat_clust, x=="6")[2:18], horizontal=T, las=1, ylim=c(-4,4), main="Cluster 6")
boxplot(subset(dat_clust, x=="7")[2:18], horizontal=T, las=1, ylim=c(-4,4), main="Cluster 7")
boxplot(subset(dat_clust, x=="8")[2:18], horizontal=T, las=1, ylim=c(-4,4), main="Cluster 8")

boxplot(subset(dat_clust, x=="9")[2:18], horizontal=T, las=1, ylim=c(-4,4), main="Cluster 9")
boxplot(subset(dat_clust, x=="10")[2:18], horizontal=T, las=1, ylim=c(-4,4), main="Cluster 10")
boxplot(subset(dat_clust, x=="11")[2:18], horizontal=T, las=1, ylim=c(-4,4), main="Cluster 11")
par(mfrow=c(1,1))

#dotplots of cluster mean values
par(mfrow=c(2,5))
dotchart(cluster_means$one, labels=cluster_means$variable,xlim=c(-2,2), main='Cluster 1', pch=16)
abline(v=0, lwd=2)
dotchart(cluster_means$two, labels=cluster_means$variable,xlim=c(-2,2), main='Cluster 2', pch=16)
abline(v=0, lwd=2)
dotchart(cluster_means$three, labels=cluster_means$variable,xlim=c(-2,2), main='Cluster 3', pch=16)
abline(v=0, lwd=2)
dotchart(cluster_means$four, labels=cluster_means$variable,xlim=c(-2,2), main='Cluster 4', pch=16)
abline(v=0, lwd=2)
dotchart(cluster_means$five, labels=cluster_means$variable,xlim=c(-2,2), main='Cluster 5', pch=16)
abline(v=0, lwd=2)
dotchart(cluster_means$six, labels=cluster_means$variable,xlim=c(-2,2), main='Cluster 6', pch=16)
abline(v=0, lwd=2)
dotchart(cluster_means$seven, labels=cluster_means$variable,xlim=c(-2,2), main='Cluster 7', pch=16)
abline(v=0, lwd=2)
dotchart(cluster_means$eight, labels=cluster_means$variable,xlim=c(-2,2), main='Cluster 8', pch=16)
abline(v=0, lwd=2)
dotchart(cluster_means$nine, labels=cluster_means$variable,xlim=c(-2,2), main='Cluster 9', pch=16)
abline(v=0, lwd=2)
dotchart(cluster_means$ten, labels=cluster_means$variable,xlim=c(-2,2), main='Cluster 10', pch=16)
abline(v=0, lwd=2)

dotchart(cluster_means$eleven, labels=cluster_means$variable,xlim=c(-2,2), main='Cluster 11', pch=16)
abline(v=0, lwd=2)
par(mfrow=c(1,1))

Which MSAs best characterize their cluster?

Typical members of each cluster
Cluster Name
1 Sierra Vista-Douglas, AZ
2 Danville, VA
3 Roseburg, OR
4 Russellville, AR
5 Canton-Massillon, OH
6 Lewiston, ID-WA
7 Richmond, VA
8 Grand Island, NE
9 Texarkana, TX-AR
10 Sacramento–Roseville–Arden-Arcade, CA
11 Lexington-Fayette, KY

Which cluster scores the best on the different metrics?

Metric Cluster
AQI_Good three
Bachelor_Over_25 eleven
Per_Poverty eight
Gini six
non_migration eight
Per_Sev_Hous one
Xstreamlengthimpaired one
Per_Avg_Land_Cov eleven
poor_health_percent five
Z_Water_Index eight
GHG_Percap ten
UNEMPLOY eight
FOOD_INS eight
VIO_CRIME six
broadband_access ten
Black_equality eleven

Table showing which clusters score best and worst for different metrics

cluster_means_t %>%
  kbl() %>%
  kable_paper(full_width=T) %>%
  #column_spec(1, color = ifelse(cluster_means_t$AQI_Good < max(cluster_means_t$AQI_Good), 'black','blue'))
  column_spec(1, color = ifelse(cluster_means_t[,1] == max(cluster_means_t[,1]), "blue",
                                   ifelse(cluster_means_t[,1] == min(cluster_means_t[,1]), "red", "black"))) %>%
  column_spec(2, color = ifelse(cluster_means_t[,2] == max(cluster_means_t[,2]), "blue",
                                   ifelse(cluster_means_t[,2] == min(cluster_means_t[,2]), "red", "black"))) %>%
  column_spec(8, color = ifelse(cluster_means_t[,8] == max(cluster_means_t[,8]), "blue",
                                   ifelse(cluster_means_t[,8] == min(cluster_means_t[,8]), "red", "black"))) %>%
  column_spec(3, color = ifelse(cluster_means_t[,3] == max(cluster_means_t[,3]), "red",
                                   ifelse(cluster_means_t[,3] == min(cluster_means_t[,3]), "blue", "black"))) %>%
  column_spec(4, color = ifelse(cluster_means_t[,4] == max(cluster_means_t[,4]), "red",
                                   ifelse(cluster_means_t[,4] == min(cluster_means_t[,4]), "blue", "black"))) %>%
  column_spec(5, color = ifelse(cluster_means_t[,5] == max(cluster_means_t[,5]), "red",
                                   ifelse(cluster_means_t[,5] == min(cluster_means_t[,5]), "blue", "black"))) %>%
  column_spec(6, color = ifelse(cluster_means_t[,6] == max(cluster_means_t[,6]), "red",
                                   ifelse(cluster_means_t[,6] == min(cluster_means_t[,6]), "blue", "black"))) %>%
  column_spec(7, color = ifelse(cluster_means_t[,7] == max(cluster_means_t[,7]), "red",
                                   ifelse(cluster_means_t[,7] == min(cluster_means_t[,7]), "blue", "black"))) %>%
  column_spec(9, color = ifelse(cluster_means_t[,9] == max(cluster_means_t[,9]), "red",
                                   ifelse(cluster_means_t[,9] == min(cluster_means_t[,9]), "blue", "black"))) %>%
  column_spec(10, color = ifelse(cluster_means_t[,10] == max(cluster_means_t[,10]), "red",
                                   ifelse(cluster_means_t[,10] == min(cluster_means_t[,10]), "blue", "black"))) %>%
  column_spec(11, color = ifelse(cluster_means_t[,11] == max(cluster_means_t[,11]), "red",
                                   ifelse(cluster_means_t[,11] == min(cluster_means_t[,11]), "blue", "black"))) %>%
  column_spec(12, color = ifelse(cluster_means_t[,12] == max(cluster_means_t[,12]), "red",
                                   ifelse(cluster_means_t[,12] == min(cluster_means_t[,12]), "blue", "black"))) %>%
  column_spec(13, color = ifelse(cluster_means_t[,13] == max(cluster_means_t[,13]), "red",
                                   ifelse(cluster_means_t[,13] == min(cluster_means_t[,13]), "blue", "black"))) %>%
  column_spec(14, color = ifelse(cluster_means_t[,14] == max(cluster_means_t[,14]), "red",
                                   ifelse(cluster_means_t[,14] == min(cluster_means_t[,14]), "blue", "black"))) %>%
  column_spec(15, color = ifelse(cluster_means_t[,15] == max(cluster_means_t[,15]), "red",
                                   ifelse(cluster_means_t[,15] == min(cluster_means_t[,15]), "blue", "black"))) %>%
  column_spec(16, color = ifelse(cluster_means_t[,16] == max(cluster_means_t[,16]), "red",
                                   ifelse(cluster_means_t[,16] == min(cluster_means_t[,16]), "blue", "black"))) %>%
  column_spec(17, color = ifelse(cluster_means_t[,17] == max(cluster_means_t[,17]), "red",
                                   ifelse(cluster_means_t[,17] == min(cluster_means_t[,17]), "blue", "black")))
AQI_Good Bachelor_Over_25 Per_Poverty Gini non_migration Per_Sev_Hous Xstreamlengthimpaired Per_Avg_Land_Cov poor_health_percent Z_Water_Index GHG_Percap UNEMPLOY FOOD_INS VIO_CRIME broadband_access Black_equality Hispanic_equality cluster
0.0360776 -0.3772906 0.1285585 -0.1417523 -0.2271069 -0.0060785 -0.8160342 1.1140696 0.3486497 2.5919687 0.3380479 0.1018823 -0.1285667 0.3617222 0.6673693 0.1242672 -0.4037226 one
0.2070975 -0.7300326 0.1300856 -0.3109608 0.5813250 -0.4313629 -0.0577310 0.2818725 0.7295881 -0.3904548 -0.1553898 0.2275428 -0.0736980 -0.2697039 0.0210235 0.1356737 0.5583812 two
0.4563573 -0.4305608 0.7510521 0.1327282 0.1806435 0.1833906 -0.0264739 0.1850960 0.5623121 0.0713802 1.1695585 1.3840682 0.9258692 0.1446759 0.7049975 1.6250379 0.8441835 three
0.2210572 -0.3058475 0.6537110 0.6712720 -0.1848036 0.1439168 -0.3057518 0.2683944 0.4639552 -0.2632204 -0.0714899 0.3540815 0.8208251 0.4348394 0.0494244 -0.2088244 -0.0697042 four
-0.0811630 -0.4332021 -0.3191460 -0.5784421 0.4023384 -0.6201188 2.2799306 -1.2272393 -0.1033147 -0.6353407 -0.0449609 -0.1033253 -0.2222390 -0.4492769 -0.2598925 -0.1819317 0.7232332 five
0.4022994 0.0672796 -0.4292573 -0.7393748 -0.5624809 0.0646084 -0.3205879 0.9030815 -0.3885820 -0.1435492 -0.3025483 -0.0067968 -0.1855458 -0.5951859 -0.2016881 1.5591468 -0.3096278 six
-0.3162789 0.7441927 -0.5847208 0.0712687 -0.0012234 0.3072409 -0.0979369 0.3631567 -0.5367045 -0.0280526 -0.4180048 -0.1529441 -0.3993503 0.0460079 -0.5137043 -0.1714813 -0.4249124 seven
0.3918498 -0.0796812 -0.6817636 -0.6533770 0.1062352 -0.8417880 -0.3681663 -0.9029930 -0.6491451 -0.0752663 0.5875915 -0.8113360 -0.6455292 -0.3546319 0.2757751 -0.3596472 -0.2134944 eight
0.2550985 -0.8978973 1.6865618 1.1141420 0.6385780 0.3993115 -0.4387603 0.0943710 1.4420051 -0.3248971 0.1971615 0.7316543 1.6786762 1.3221785 0.8053089 -0.2432397 0.4792626 nine
-2.8779699 0.6585421 -0.0343152 0.6485074 0.1089758 2.0889358 -0.2900334 0.3595380 0.2706320 0.6258660 -0.7102967 0.7269916 -0.5195354 0.7081738 -0.6620594 -0.0753673 -0.5652390 ten
0.4178735 1.6618611 0.9724359 1.0914319 -2.1513159 1.0749451 -0.0369365 -0.1582788 -0.8346335 -0.2521397 -0.0807870 -0.2685013 0.7280590 -0.3963026 -0.0582145 0.2022337 -0.3025342 eleven