Load Packages
library(here)
library(mclust)
library(raster)
library(rgdal)
library(sp)
library(RColorBrewer)
library(PCAtools)
library(sf)
library(tmap)
library(terra)
library(rgeos)
library(spData)
library(knitr)
library(kableExtra)
## Warning in !is.null(rmarkdown::metadata$output) && rmarkdown::metadata$output
## %in% : 'length(x) = 3 > 1' in coercion to 'logical(1)'
Import Data
#import raw data
dat_raw <- read.csv(here("data/CommunityData-raw-2015-v3.csv"))
#select columns of interest, for this run remove asian_equality and female_equality
dat <- dat_raw[,c(4:18,20:21)]
#convert to dataframe
dat <- as.data.frame(unclass(dat))
#set rownames to city names
rownames(dat)=dat_raw$ME
# remove any records that have NAs
dat = na.omit(dat)
# convert to z-scores
dat <- as.matrix(scale(dat, center = TRUE, scale = TRUE))
Run PCA on dataset
dat_comps <- prcomp(dat, center = F, scale=F) #set center and scale to FALSE because done pre-PCA
#get summary
s <- summary(dat_comps)
kable(s$importance)
|
|
PC1
|
PC2
|
PC3
|
PC4
|
PC5
|
PC6
|
PC7
|
PC8
|
PC9
|
PC10
|
PC11
|
PC12
|
PC13
|
PC14
|
PC15
|
PC16
|
PC17
|
|
Standard deviation
|
1.930256
|
1.57564
|
1.238658
|
1.161608
|
1.051675
|
1.001707
|
0.937954
|
0.8924302
|
0.8844498
|
0.8199171
|
0.7855821
|
0.7447965
|
0.6612225
|
0.6399578
|
0.5471293
|
0.4593311
|
0.3725125
|
|
Proportion of Variance
|
0.219170
|
0.14604
|
0.090250
|
0.079370
|
0.065060
|
0.059020
|
0.051750
|
0.0468500
|
0.0460100
|
0.0395400
|
0.0363000
|
0.0326300
|
0.0257200
|
0.0240900
|
0.0176100
|
0.0124100
|
0.0081600
|
|
Cumulative Proportion
|
0.219170
|
0.36521
|
0.455460
|
0.534830
|
0.599890
|
0.658920
|
0.710670
|
0.7575200
|
0.8035300
|
0.8430800
|
0.8793800
|
0.9120100
|
0.9377300
|
0.9618200
|
0.9794300
|
0.9918400
|
1.0000000
|
#get eigenvalues
#### change this # of columns to match
ev <- data.frame(Component=paste("PC",1:17, sep=""),eigenvalue=dat_comps$sdev^2)
kable(ev)
|
Component
|
eigenvalue
|
|
PC1
|
3.7258878
|
|
PC2
|
2.4826401
|
|
PC3
|
1.5342730
|
|
PC4
|
1.3493324
|
|
PC5
|
1.1060209
|
|
PC6
|
1.0034176
|
|
PC7
|
0.8797577
|
|
PC8
|
0.7964316
|
|
PC9
|
0.7822515
|
|
PC10
|
0.6722641
|
|
PC11
|
0.6171392
|
|
PC12
|
0.5547218
|
|
PC13
|
0.4372152
|
|
PC14
|
0.4095460
|
|
PC15
|
0.2993505
|
|
PC16
|
0.2109850
|
|
PC17
|
0.1387656
|
#write to CSV
write.csv(dat_comps$x,file=here("results/community_loadings.csv"))
Plot loadings
#plot loadings of pc1 and 2
plot(dat_comps$x[,1],dat_comps$x[,2],
xlab=paste("PCA 1 (", round(s$importance[2,1]*100, 1), "%)", sep = ""),
ylab=paste("PCA 2 (", round(s$importance[2,2]*100, 1), "%)", sep = ""),
pch=16, col="blue", cex=0.5)
abline(v=0, lwd=2, lty=2)
abline(h=0, lwd=2, lty=2)
#get loadings
l.x <- dat_comps$rotation[,1]*10
l.y <- dat_comps$rotation[,2]*10
arrows(x0=mean(dat_comps$x[,1]), x1=l.x, y0=mean(dat_comps$x[,2]), y1=l.y, col="black", length=0.15, lwd=1.5)
# Label position
l.pos <- l.y # Create a vector of y axis coordinates
lo <- which(l.y < 0) # Get the variables on the bottom half of the plot
hi <- which(l.y > 0) # Get variables on the top half
# Replace values in the vector
l.pos <- replace(l.pos, lo, "1")
l.pos <- replace(l.pos, hi, "3")
text(l.x, l.y, labels=row.names(dat_comps$rotation), col="black", pos=l.pos, cex=1.25)

#plot loadings of pc 3 and 4
plot(dat_comps$x[,3],dat_comps$x[,4],
xlab=paste("PCA 3 (", round(s$importance[2,3]*100, 1), "%)", sep = ""),
ylab=paste("PCA 4 (", round(s$importance[2,4]*100, 1), "%)", sep = ""),
pch=16, col="blue", cex=0.5)
abline(v=0, lwd=2, lty=2)
abline(h=0, lwd=2, lty=2)
#get loadings
l.x <- dat_comps$rotation[,3]*10
l.y <- dat_comps$rotation[,4]*10
arrows(x0=mean(dat_comps$x[,3]), x1=l.x, y0=mean(dat_comps$x[,4]), y1=l.y, col="black", length=0.15, lwd=1.5)
# Label position
l.pos <- l.y # Create a vector of y axis coordinates
lo <- which(l.y < 0) # Get the variables on the bottom half of the plot
hi <- which(l.y > 0) # Get variables on the top half
# Replace values in the vector
l.pos <- replace(l.pos, lo, "1")
l.pos <- replace(l.pos, hi, "3")
text(l.x, l.y, labels=row.names(dat_comps$rotation), col="black", pos=l.pos, cex=1.25)
## Summary of PC loadings
library(broom)
print(dat_comps)
## Standard deviations (1, .., p=17):
## [1] 1.9302559 1.5756396 1.2386578 1.1616077 1.0516753 1.0017073 0.9379540
## [8] 0.8924302 0.8844498 0.8199171 0.7855821 0.7447965 0.6612225 0.6399578
## [15] 0.5471293 0.4593311 0.3725125
##
## Rotation (n x k) = (17 x 17):
## PC1 PC2 PC3 PC4
## AQI_Good -0.062866767 0.18051196 -0.05245837 0.52089617
## Bachelor_Over_25 -0.155104384 -0.49978145 0.19800492 0.13648219
## Per_Poverty 0.446860133 0.08223058 0.13323024 0.19642620
## Gini 0.342239993 -0.18630649 0.22744433 0.08291974
## non_migration -0.039451114 0.32797305 -0.14279570 -0.44691749
## Per_Sev_Hous 0.290477489 -0.35489641 0.13570938 -0.14642252
## Xstreamlengthimpaired -0.150768007 0.20034069 0.49469903 -0.14593805
## Per_Avg_Land_Cov 0.199050643 -0.23359150 -0.38405605 -0.08772787
## poor_health_percent 0.345400154 0.29918893 -0.13558026 -0.17710772
## Z_Water_Index 0.089114628 -0.21417037 -0.51339880 -0.10248384
## GHG_Percap -0.057559630 0.17289329 -0.22412476 0.26544463
## UNEMPLOY 0.348499784 0.06211658 0.10988728 -0.14813861
## FOOD_INS 0.404596427 0.06714552 0.17288182 0.28511761
## VIO_CRIME 0.281544821 -0.02500852 -0.05848821 -0.18430169
## broadband_access 0.097667318 0.20271579 -0.22647746 0.37023324
## Black_equality -0.009441226 -0.04032470 -0.13725376 0.15597909
## Hispanic_equality 0.047862361 0.36187781 0.09909611 -0.05894649
## PC5 PC6 PC7 PC8
## AQI_Good -0.17198617 0.47084965 -0.245408998 0.075952675
## Bachelor_Over_25 0.06317829 -0.14379264 0.006828514 -0.100615801
## Per_Poverty -0.01117834 0.05591647 -0.043226995 -0.034108019
## Gini 0.18211836 -0.08791491 -0.019313124 -0.187925720
## non_migration 0.04911408 0.07325898 0.181902454 -0.435429816
## Per_Sev_Hous -0.08291192 0.02485665 -0.077857224 -0.089007519
## Xstreamlengthimpaired -0.16377164 -0.14024134 0.043948672 0.115291952
## Per_Avg_Land_Cov -0.33426782 0.10134533 -0.043844200 -0.269248508
## poor_health_percent -0.10633054 -0.00034835 -0.016614099 -0.004408456
## Z_Water_Index 0.07603166 0.05259835 -0.186814103 0.449609515
## GHG_Percap 0.39901479 -0.33882522 -0.420146662 -0.490613531
## UNEMPLOY -0.18769732 0.18575024 -0.035549241 -0.092668435
## FOOD_INS 0.03800870 -0.01872166 0.044161993 -0.015578578
## VIO_CRIME 0.36690013 -0.33459868 0.011439418 0.341182493
## broadband_access 0.08437931 -0.15341761 0.692466509 0.100549260
## Black_equality -0.64721099 -0.60813110 0.002651743 -0.058554005
## Hispanic_equality -0.10817632 -0.23044695 -0.447125969 0.287533693
## PC9 PC10 PC11 PC12
## AQI_Good -0.2732818757 0.369833692 -0.009624339 -0.20331317
## Bachelor_Over_25 -0.2334242202 -0.078197015 -0.031481748 -0.16573561
## Per_Poverty -0.0245636963 0.016947565 0.154246386 0.21982401
## Gini -0.4861096856 0.085832390 0.147741301 0.10811329
## non_migration -0.4132019509 0.218477058 0.089364559 -0.06116509
## Per_Sev_Hous 0.0446048786 -0.115712777 0.231037130 0.17385569
## Xstreamlengthimpaired 0.0253390855 0.155672391 0.605550191 -0.34735193
## Per_Avg_Land_Cov -0.1659868117 -0.261271007 -0.016345748 -0.49977794
## poor_health_percent -0.0040994359 0.072377014 -0.044938235 0.33180728
## Z_Water_Index -0.0377583050 0.120419062 0.540428303 0.05352150
## GHG_Percap 0.2681578313 -0.009279866 0.239808768 -0.09976052
## UNEMPLOY 0.4801011740 -0.029974932 -0.003317954 -0.34486606
## FOOD_INS 0.0560305175 0.046806400 -0.092761124 -0.12661455
## VIO_CRIME -0.0572106296 0.383223273 -0.325866902 -0.41196138
## broadband_access -0.0569698528 -0.312191714 0.218030047 -0.12299343
## Black_equality -0.0007453668 0.324460848 -0.056475264 0.13079921
## Hispanic_equality -0.3456850595 -0.568614566 -0.088614696 -0.08343529
## PC13 PC14 PC15 PC16
## AQI_Good 0.303872519 0.07309613 0.104304854 -0.093683961
## Bachelor_Over_25 -0.001954031 -0.15583734 0.256923869 -0.476352663
## Per_Poverty -0.008240500 0.07231763 -0.275258258 0.311682555
## Gini -0.106885678 -0.24642899 0.320157613 0.359607861
## non_migration 0.136331220 -0.23832763 -0.275877069 -0.211797074
## Per_Sev_Hous 0.613823895 0.30058746 -0.209812289 -0.243800467
## Xstreamlengthimpaired -0.149611888 0.26222491 0.076977466 0.006579837
## Per_Avg_Land_Cov -0.242586898 0.34390463 -0.001524303 0.176231700
## poor_health_percent -0.149281323 0.31464822 0.589497336 -0.369660030
## Z_Water_Index -0.146542371 -0.27380123 -0.056719257 -0.124572415
## GHG_Percap 0.066815986 0.07397619 0.063048282 -0.037571715
## UNEMPLOY 0.201109247 -0.54869803 0.258413340 0.017119678
## FOOD_INS -0.445219653 -0.06995732 -0.421482336 -0.485922760
## VIO_CRIME 0.214208286 0.19152471 -0.005089669 0.073591852
## broadband_access 0.250148875 -0.01889712 0.111346665 -0.037870686
## Black_equality 0.079725844 -0.15759454 -0.063851166 0.034039957
## Hispanic_equality 0.133648201 -0.15148568 -0.056217176 -0.066700180
## PC17
## AQI_Good 0.043346545
## Bachelor_Over_25 -0.476673841
## Per_Poverty -0.692618987
## Gini 0.363061375
## non_migration -0.117205154
## Per_Sev_Hous 0.242410796
## Xstreamlengthimpaired -0.007762626
## Per_Avg_Land_Cov -0.012555352
## poor_health_percent -0.083779334
## Z_Water_Index -0.021951419
## GHG_Percap 0.009117904
## UNEMPLOY -0.046479162
## FOOD_INS 0.264151308
## VIO_CRIME -0.074616693
## broadband_access 0.024255223
## Black_equality 0.031215061
## Hispanic_equality 0.000125565
summary(dat_comps)
## Importance of components:
## PC1 PC2 PC3 PC4 PC5 PC6 PC7
## Standard deviation 1.9303 1.5756 1.23866 1.16161 1.05168 1.00171 0.93795
## Proportion of Variance 0.2192 0.1460 0.09025 0.07937 0.06506 0.05902 0.05175
## Cumulative Proportion 0.2192 0.3652 0.45546 0.53483 0.59989 0.65892 0.71067
## PC8 PC9 PC10 PC11 PC12 PC13 PC14
## Standard deviation 0.89243 0.88445 0.81992 0.7856 0.74480 0.66122 0.63996
## Proportion of Variance 0.04685 0.04601 0.03954 0.0363 0.03263 0.02572 0.02409
## Cumulative Proportion 0.75752 0.80353 0.84308 0.8794 0.91201 0.93773 0.96182
## PC15 PC16 PC17
## Standard deviation 0.54713 0.45933 0.37251
## Proportion of Variance 0.01761 0.01241 0.00816
## Cumulative Proportion 0.97943 0.99184 1.00000
# information about PCs
tidy(dat_comps, "pcs")
## # A tibble: 17 × 4
## PC std.dev percent cumulative
## <dbl> <dbl> <dbl> <dbl>
## 1 1 1.93 0.219 0.219
## 2 2 1.58 0.146 0.365
## 3 3 1.24 0.0902 0.455
## 4 4 1.16 0.0794 0.535
## 5 5 1.05 0.0651 0.600
## 6 6 1.00 0.0590 0.659
## 7 7 0.938 0.0518 0.711
## 8 8 0.892 0.0468 0.758
## 9 9 0.884 0.0460 0.804
## 10 10 0.820 0.0395 0.843
## 11 11 0.786 0.0363 0.879
## 12 12 0.745 0.0326 0.912
## 13 13 0.661 0.0257 0.938
## 14 14 0.640 0.0241 0.962
## 15 15 0.547 0.0176 0.979
## 16 16 0.459 0.0124 0.992
## 17 17 0.373 0.00816 1
Run Cluster GMM
Assumptions: - MSAs form clusters characterized by a multivariate
distribution - Model forms: shape, volume, orientation - GMM fits a
series of models with different forms and numbers of clusters - Models
with highest probability and fewest parameters selected as most optimal
- Based on Bayesian Information Criterion (BIC)
#run GMM on 1-20 clusters
dat_pcs_mc <- Mclust(dat_pcs, G=c(1:20))
#print summary of model fit
summary(dat_pcs_mc)
## ----------------------------------------------------
## Gaussian finite mixture model fitted by EM algorithm
## ----------------------------------------------------
##
## Mclust VVE (ellipsoidal, equal orientation) model with 6 components:
##
## log-likelihood n df BIC ICL
## -6961.305 852 75 -14428.68 -14911.71
##
## Clustering table:
## 1 2 3 4 5 6
## 62 295 105 196 143 51
Cluster assignments
plot(dat_pcs_mc, what = c("classification"))
Uncertainty in assignments
write.csv(dat_pcs_mc$uncertainty,file=here("results/community_group_uncertainty.csv"))
plot(dat_pcs_mc, what = "uncertainty")

uncerPlot(z = dat_pcs_mc$z)

Model comparison
#plot BIC scores of different models
plot(dat_pcs_mc, what='BIC')

#zoom in on best-fitting models
plot(dat_pcs_mc, what='BIC', ylim=c(-15200,-14600))

#model type VEI and VEE have similar BIC scores for 8-9 clusters
#do additional model comparisons
m_VEI_8 <- Mclust(dat_pcs, G=8, modelNames="VEI")
m_VEE_8 <- Mclust(dat_pcs, G=8, modelNames="VEE")
m_VEI_9 <- Mclust(dat_pcs, G=9, modelNames="VEI")
m_VEI_10 <- Mclust(dat_pcs, G=10, modelNames="VEI")
m_VEI_11 <- Mclust(dat_pcs, G=11, modelNames="VEI")
m_VEE_9 <- Mclust(dat_pcs, G=9, modelNames="VEE")
m_VEE_10 <- Mclust(dat_pcs, G=10, modelNames="VEE")
m_VEE_11 <- Mclust(dat_pcs, G=11, modelNames="VEE")
#extract BIC scores
BICs<-c(m_VEI_8$BIC[1],m_VEE_8$BIC[1],m_VEI_9$BIC[1],m_VEE_9$BIC[1],m_VEI_10$BIC[1],m_VEE_10$BIC[1],m_VEI_11$BIC[1],m_VEE_11$BIC[1])
#calculate change in BIC score, since in this method the goal to to maximize BIC
#we calculate change from highest scoring model
delta_BIC <- max(BICs) - BICs
#calculate BIC weights
w_BIC <- round(exp(-0.5*delta_BIC)/sum(exp(-0.5*delta_BIC)), digits=3)
#make table of results
results_table <- cbind.data.frame(clusters=c("VEI_8","VEE_8","VEI_9","VEE_9","VEI_10","VEE_10","VEI_11","VEE_11"),BIC=BICs,delta_BIC,weight=w_BIC)
#order by delta
results_table <- results_table[order(delta_BIC),]
rownames(results_table)<-NULL
#print table
kable(results_table[1:4], caption = "Model Comparison")
Model Comparison
|
clusters
|
BIC
|
delta_BIC
|
weight
|
|
VEE_8
|
-14445.12
|
0.00000
|
0.993
|
|
VEI_10
|
-14455.47
|
10.34412
|
0.006
|
|
VEI_8
|
-14459.06
|
13.93913
|
0.001
|
|
VEE_9
|
-14475.68
|
30.55874
|
0.000
|
|
VEI_9
|
-14482.69
|
37.57235
|
0.000
|
|
VEI_11
|
-14491.84
|
46.71835
|
0.000
|
|
VEE_10
|
-14502.13
|
57.01166
|
0.000
|
|
VEE_11
|
-14539.98
|
94.85692
|
0.000
|
## providing some output about the model. It seems that VEI 11 is the best model.
## MODEL CHOSEN FOR REST OF ANALYSES
output <- clustCombi(data = dat_pcs, modelName = "VEI", G = 11)
# plot the hierarchy of combined solutions
plot(output, what = "classification")











# plot some "entropy plots" which may help one to select the number of classes
plot(output, what = "entropy")


# plot the tree structure obtained from combining mixture components
plot(output, what = "tree")

head( output$combiz[[output$MclustOutput$G]] )
## [,1] [,2] [,3] [,4] [,5]
## Aberdeen, SD 1.229467e-01 4.273945e-04 0.049219750 3.947476e-05 1.853775e-07
## Aberdeen, WA 4.706026e-03 7.934584e-01 0.003429176 1.371212e-01 2.057163e-04
## Abilene, TX 7.701404e-01 5.006758e-07 0.153208714 1.477199e-05 4.902602e-12
## Ada, OK 1.101953e-01 1.227952e-01 0.012399875 2.618893e-01 5.521927e-06
## Adrian, MI 1.025823e-07 1.276254e-04 0.001540898 2.765506e-06 9.966006e-01
## Akron, OH 5.538328e-07 1.118966e-04 0.008356152 9.755845e-05 9.674297e-01
## [,6] [,7] [,8] [,9] [,10]
## Aberdeen, SD 3.131616e-02 2.129300e-02 0.773633869 1.011946e-06 2.374023e-06
## Aberdeen, WA 7.776788e-03 3.236991e-02 0.013716479 7.059625e-03 1.326322e-04
## Abilene, TX 4.535274e-08 9.136533e-05 0.074763537 1.266791e-04 7.006480e-07
## Ada, OK 3.576783e-02 1.756334e-01 0.274582045 5.813599e-03 1.365190e-04
## Adrian, MI 1.365361e-09 4.567665e-04 0.001252312 1.151469e-06 4.032494e-06
## Akron, OH 3.543681e-09 2.036351e-02 0.001293644 2.225669e-05 1.262125e-03
## [,11]
## Aberdeen, SD 1.120125e-03
## Aberdeen, WA 2.405733e-05
## Abilene, TX 1.653260e-03
## Ada, OK 7.812642e-04
## Adrian, MI 1.379307e-05
## Akron, OH 1.062631e-03
## output the probabilities by cluster.
write.csv(output$combiz[[output$MclustOutput$G]],file=here("results/cluster_probabilities.csv"))
Extract and map cluster classifications
library(here)
#extract classifications, e.g., which city belongs to which cluster, export to csv and reload
### Change this value to pick a different model
## currently the best solution is VEI with 11 clusters.
write.csv(m_VEI_11$classification,file=here("results/cluster_assignment_pca.csv"))
data<- read.csv(here('results/cluster_assignment_pca.csv'))
#import shapefile
msa_Boundary <-readOGR(here("data/MSA"),"tl_2015_us_cbsa")
## Warning: OGR support is provided by the sf and terra packages among others
## Warning: OGR support is provided by the sf and terra packages among others
## Warning: OGR support is provided by the sf and terra packages among others
## Warning: OGR support is provided by the sf and terra packages among others
## Warning: OGR support is provided by the sf and terra packages among others
## Warning: OGR support is provided by the sf and terra packages among others
## OGR data source with driver: ESRI Shapefile
## Source: "/Users/clipo/PycharmProjects/sustainable_communities/data/MSA", layer: "tl_2015_us_cbsa"
## with 929 features
## It has 12 fields
## Integer64 fields read as strings: ALAND AWATER
#merge them
merged <- merge(msa_Boundary,data,by.x="NAME",by.y="X")
#remove any cases MSAs with no cluster assignments
merged_clean <- merged[!is.na(merged$x),]
#convert x to factor
merged_clean$Cluster <- as.factor(merged_clean$x)
#convert to sf object
cluster_sf <- st_as_sf(merged_clean)
#make us map
us_map <- tm_shape(us_states) + tm_borders()
#combine them
cluster_map <- tm_shape(cluster_sf) + tm_fill(col="Cluster", palette="Set1")
#plot them
us_map + cluster_map

## now show the distribution of each of the clusters one at a time
## dat_pcs_mc$G is the number of clusters
par(mfrow = c(6, 2)) # Set up a 2 x 2 plotting space
output_map <- us_map
library(svMisc)
##
## Attaching package: 'svMisc'
## The following object is masked from 'package:utils':
##
## ?
pb = txtProgressBar(min = 1, max = dat_pcs_mc$G, initial = 1)
number_of_clusters_chosen <- 11
for (g in 1:number_of_clusters_chosen) {
#for (g in 1:dat_pcs_mc$G) {
setTxtProgressBar(pb,g)
subset_dat <-subset(data,x==g)
merged <- merge(msa_Boundary,subset_dat,by.x="NAME",by.y="X")
#remove any cases MSAs with no cluster assignments
merged_clean <- merged[!is.na(merged$x),]
#convert x to factor
merged_clean$Cluster <- as.factor(merged_clean$x)
#convert to sf object
cluster_sf <- st_as_sf(merged_clean)
#make us map
us_map <- tm_shape(us_states) + tm_borders()
#combine them
new_cluster_map <- tm_shape(cluster_sf)+tm_fill(col="Cluster", palette="Set1")
#plot them
print (us_map + new_cluster_map)
#dev.off()#
}

## ================

## ================

## ================

## ================

## ================






Examine how clusters score in different metrics
#new data frame
dat_clust <- as.data.frame(dat)
#add names
dat_clust$ME <- rownames(dat)
rownames(dat_clust) <- NULL
#merge in cluster assignments
dat_clust <- merge(dat_clust,data,by.x="ME",by.y="X")
#calculate cluster means for different variables
c1m <- data.frame(one=colMeans(subset(dat_clust, x=="1")[2:18]))
c2m <- data.frame(two=colMeans(subset(dat_clust, x=="2")[2:18]))
c3m <- data.frame(three=colMeans(subset(dat_clust, x=="3")[2:18]))
c4m <- data.frame(four=colMeans(subset(dat_clust, x=="4")[2:18]))
c5m <- data.frame(five=colMeans(subset(dat_clust, x=="5")[2:18]))
c6m <- data.frame(six=colMeans(subset(dat_clust, x=="6")[2:18]))
c7m <- data.frame(seven=colMeans(subset(dat_clust, x=="7")[2:18]))
c8m <- data.frame(eight=colMeans(subset(dat_clust, x=="8")[2:18]))
c9m <- data.frame(nine=colMeans(subset(dat_clust, x=="9")[2:18]))
c10m <- data.frame(ten=colMeans(subset(dat_clust, x=="10")[2:18]))
c11m <- data.frame(eleven=colMeans(subset(dat_clust, x=="11")[2:18]))
cluster_means <- cbind.data.frame(c1m,c2m,c3m,c4m,c5m,c6m,c7m,c8m,c9m,c10m,c11m)
cluster_means$variable <- row.names(cluster_means)
rownames(cluster_means) <- NULL
#boxplots of scores for clusters
par(mfrow=c(2,4), mar=c(2.5,5,2.5,3))
boxplot(subset(dat_clust, x=="1")[2:18], horizontal=T, las=1, ylim=c(-4,4), main="Cluster 1")
boxplot(subset(dat_clust, x=="2")[2:18], horizontal=T, las=1, ylim=c(-4,4), main="Cluster 2")
boxplot(subset(dat_clust, x=="3")[2:18], horizontal=T, las=1, ylim=c(-4,4), main="Cluster 3")
boxplot(subset(dat_clust, x=="4")[2:18], horizontal=T, las=1, ylim=c(-4,4), main="Cluster 4")
boxplot(subset(dat_clust, x=="5")[2:18], horizontal=T, las=1, ylim=c(-4,4), main="Cluster 5")
boxplot(subset(dat_clust, x=="6")[2:18], horizontal=T, las=1, ylim=c(-4,4), main="Cluster 6")
boxplot(subset(dat_clust, x=="7")[2:18], horizontal=T, las=1, ylim=c(-4,4), main="Cluster 7")
boxplot(subset(dat_clust, x=="8")[2:18], horizontal=T, las=1, ylim=c(-4,4), main="Cluster 8")

boxplot(subset(dat_clust, x=="9")[2:18], horizontal=T, las=1, ylim=c(-4,4), main="Cluster 9")
boxplot(subset(dat_clust, x=="10")[2:18], horizontal=T, las=1, ylim=c(-4,4), main="Cluster 10")
boxplot(subset(dat_clust, x=="11")[2:18], horizontal=T, las=1, ylim=c(-4,4), main="Cluster 11")
par(mfrow=c(1,1))

#dotplots of cluster mean values
par(mfrow=c(2,5))
dotchart(cluster_means$one, labels=cluster_means$variable,xlim=c(-2,2), main='Cluster 1', pch=16)
abline(v=0, lwd=2)
dotchart(cluster_means$two, labels=cluster_means$variable,xlim=c(-2,2), main='Cluster 2', pch=16)
abline(v=0, lwd=2)
dotchart(cluster_means$three, labels=cluster_means$variable,xlim=c(-2,2), main='Cluster 3', pch=16)
abline(v=0, lwd=2)
dotchart(cluster_means$four, labels=cluster_means$variable,xlim=c(-2,2), main='Cluster 4', pch=16)
abline(v=0, lwd=2)
dotchart(cluster_means$five, labels=cluster_means$variable,xlim=c(-2,2), main='Cluster 5', pch=16)
abline(v=0, lwd=2)
dotchart(cluster_means$six, labels=cluster_means$variable,xlim=c(-2,2), main='Cluster 6', pch=16)
abline(v=0, lwd=2)
dotchart(cluster_means$seven, labels=cluster_means$variable,xlim=c(-2,2), main='Cluster 7', pch=16)
abline(v=0, lwd=2)
dotchart(cluster_means$eight, labels=cluster_means$variable,xlim=c(-2,2), main='Cluster 8', pch=16)
abline(v=0, lwd=2)
dotchart(cluster_means$nine, labels=cluster_means$variable,xlim=c(-2,2), main='Cluster 9', pch=16)
abline(v=0, lwd=2)
dotchart(cluster_means$ten, labels=cluster_means$variable,xlim=c(-2,2), main='Cluster 10', pch=16)
abline(v=0, lwd=2)

dotchart(cluster_means$eleven, labels=cluster_means$variable,xlim=c(-2,2), main='Cluster 11', pch=16)
abline(v=0, lwd=2)
par(mfrow=c(1,1))

Which MSAs best characterize their cluster?
Typical members of each cluster
|
Cluster
|
Name
|
|
1
|
Sierra Vista-Douglas, AZ
|
|
2
|
Danville, VA
|
|
3
|
Roseburg, OR
|
|
4
|
Russellville, AR
|
|
5
|
Canton-Massillon, OH
|
|
6
|
Lewiston, ID-WA
|
|
7
|
Richmond, VA
|
|
8
|
Grand Island, NE
|
|
9
|
Texarkana, TX-AR
|
|
10
|
Sacramento–Roseville–Arden-Arcade, CA
|
|
11
|
Lexington-Fayette, KY
|
Which cluster scores the best on the different metrics?
|
Metric
|
Cluster
|
|
AQI_Good
|
three
|
|
Bachelor_Over_25
|
eleven
|
|
Per_Poverty
|
eight
|
|
Gini
|
six
|
|
non_migration
|
eight
|
|
Per_Sev_Hous
|
one
|
|
Xstreamlengthimpaired
|
one
|
|
Per_Avg_Land_Cov
|
eleven
|
|
poor_health_percent
|
five
|
|
Z_Water_Index
|
eight
|
|
GHG_Percap
|
ten
|
|
UNEMPLOY
|
eight
|
|
FOOD_INS
|
eight
|
|
VIO_CRIME
|
six
|
|
broadband_access
|
ten
|
|
Black_equality
|
eleven
|
Table showing which clusters score best and worst for different
metrics
cluster_means_t %>%
kbl() %>%
kable_paper(full_width=T) %>%
#column_spec(1, color = ifelse(cluster_means_t$AQI_Good < max(cluster_means_t$AQI_Good), 'black','blue'))
column_spec(1, color = ifelse(cluster_means_t[,1] == max(cluster_means_t[,1]), "blue",
ifelse(cluster_means_t[,1] == min(cluster_means_t[,1]), "red", "black"))) %>%
column_spec(2, color = ifelse(cluster_means_t[,2] == max(cluster_means_t[,2]), "blue",
ifelse(cluster_means_t[,2] == min(cluster_means_t[,2]), "red", "black"))) %>%
column_spec(8, color = ifelse(cluster_means_t[,8] == max(cluster_means_t[,8]), "blue",
ifelse(cluster_means_t[,8] == min(cluster_means_t[,8]), "red", "black"))) %>%
column_spec(3, color = ifelse(cluster_means_t[,3] == max(cluster_means_t[,3]), "red",
ifelse(cluster_means_t[,3] == min(cluster_means_t[,3]), "blue", "black"))) %>%
column_spec(4, color = ifelse(cluster_means_t[,4] == max(cluster_means_t[,4]), "red",
ifelse(cluster_means_t[,4] == min(cluster_means_t[,4]), "blue", "black"))) %>%
column_spec(5, color = ifelse(cluster_means_t[,5] == max(cluster_means_t[,5]), "red",
ifelse(cluster_means_t[,5] == min(cluster_means_t[,5]), "blue", "black"))) %>%
column_spec(6, color = ifelse(cluster_means_t[,6] == max(cluster_means_t[,6]), "red",
ifelse(cluster_means_t[,6] == min(cluster_means_t[,6]), "blue", "black"))) %>%
column_spec(7, color = ifelse(cluster_means_t[,7] == max(cluster_means_t[,7]), "red",
ifelse(cluster_means_t[,7] == min(cluster_means_t[,7]), "blue", "black"))) %>%
column_spec(9, color = ifelse(cluster_means_t[,9] == max(cluster_means_t[,9]), "red",
ifelse(cluster_means_t[,9] == min(cluster_means_t[,9]), "blue", "black"))) %>%
column_spec(10, color = ifelse(cluster_means_t[,10] == max(cluster_means_t[,10]), "red",
ifelse(cluster_means_t[,10] == min(cluster_means_t[,10]), "blue", "black"))) %>%
column_spec(11, color = ifelse(cluster_means_t[,11] == max(cluster_means_t[,11]), "red",
ifelse(cluster_means_t[,11] == min(cluster_means_t[,11]), "blue", "black"))) %>%
column_spec(12, color = ifelse(cluster_means_t[,12] == max(cluster_means_t[,12]), "red",
ifelse(cluster_means_t[,12] == min(cluster_means_t[,12]), "blue", "black"))) %>%
column_spec(13, color = ifelse(cluster_means_t[,13] == max(cluster_means_t[,13]), "red",
ifelse(cluster_means_t[,13] == min(cluster_means_t[,13]), "blue", "black"))) %>%
column_spec(14, color = ifelse(cluster_means_t[,14] == max(cluster_means_t[,14]), "red",
ifelse(cluster_means_t[,14] == min(cluster_means_t[,14]), "blue", "black"))) %>%
column_spec(15, color = ifelse(cluster_means_t[,15] == max(cluster_means_t[,15]), "red",
ifelse(cluster_means_t[,15] == min(cluster_means_t[,15]), "blue", "black"))) %>%
column_spec(16, color = ifelse(cluster_means_t[,16] == max(cluster_means_t[,16]), "red",
ifelse(cluster_means_t[,16] == min(cluster_means_t[,16]), "blue", "black"))) %>%
column_spec(17, color = ifelse(cluster_means_t[,17] == max(cluster_means_t[,17]), "red",
ifelse(cluster_means_t[,17] == min(cluster_means_t[,17]), "blue", "black")))
|
AQI_Good
|
Bachelor_Over_25
|
Per_Poverty
|
Gini
|
non_migration
|
Per_Sev_Hous
|
Xstreamlengthimpaired
|
Per_Avg_Land_Cov
|
poor_health_percent
|
Z_Water_Index
|
GHG_Percap
|
UNEMPLOY
|
FOOD_INS
|
VIO_CRIME
|
broadband_access
|
Black_equality
|
Hispanic_equality
|
cluster
|
|
0.0360776
|
-0.3772906
|
0.1285585
|
-0.1417523
|
-0.2271069
|
-0.0060785
|
-0.8160342
|
1.1140696
|
0.3486497
|
2.5919687
|
0.3380479
|
0.1018823
|
-0.1285667
|
0.3617222
|
0.6673693
|
0.1242672
|
-0.4037226
|
one
|
|
0.2070975
|
-0.7300326
|
0.1300856
|
-0.3109608
|
0.5813250
|
-0.4313629
|
-0.0577310
|
0.2818725
|
0.7295881
|
-0.3904548
|
-0.1553898
|
0.2275428
|
-0.0736980
|
-0.2697039
|
0.0210235
|
0.1356737
|
0.5583812
|
two
|
|
0.4563573
|
-0.4305608
|
0.7510521
|
0.1327282
|
0.1806435
|
0.1833906
|
-0.0264739
|
0.1850960
|
0.5623121
|
0.0713802
|
1.1695585
|
1.3840682
|
0.9258692
|
0.1446759
|
0.7049975
|
1.6250379
|
0.8441835
|
three
|
|
0.2210572
|
-0.3058475
|
0.6537110
|
0.6712720
|
-0.1848036
|
0.1439168
|
-0.3057518
|
0.2683944
|
0.4639552
|
-0.2632204
|
-0.0714899
|
0.3540815
|
0.8208251
|
0.4348394
|
0.0494244
|
-0.2088244
|
-0.0697042
|
four
|
|
-0.0811630
|
-0.4332021
|
-0.3191460
|
-0.5784421
|
0.4023384
|
-0.6201188
|
2.2799306
|
-1.2272393
|
-0.1033147
|
-0.6353407
|
-0.0449609
|
-0.1033253
|
-0.2222390
|
-0.4492769
|
-0.2598925
|
-0.1819317
|
0.7232332
|
five
|
|
0.4022994
|
0.0672796
|
-0.4292573
|
-0.7393748
|
-0.5624809
|
0.0646084
|
-0.3205879
|
0.9030815
|
-0.3885820
|
-0.1435492
|
-0.3025483
|
-0.0067968
|
-0.1855458
|
-0.5951859
|
-0.2016881
|
1.5591468
|
-0.3096278
|
six
|
|
-0.3162789
|
0.7441927
|
-0.5847208
|
0.0712687
|
-0.0012234
|
0.3072409
|
-0.0979369
|
0.3631567
|
-0.5367045
|
-0.0280526
|
-0.4180048
|
-0.1529441
|
-0.3993503
|
0.0460079
|
-0.5137043
|
-0.1714813
|
-0.4249124
|
seven
|
|
0.3918498
|
-0.0796812
|
-0.6817636
|
-0.6533770
|
0.1062352
|
-0.8417880
|
-0.3681663
|
-0.9029930
|
-0.6491451
|
-0.0752663
|
0.5875915
|
-0.8113360
|
-0.6455292
|
-0.3546319
|
0.2757751
|
-0.3596472
|
-0.2134944
|
eight
|
|
0.2550985
|
-0.8978973
|
1.6865618
|
1.1141420
|
0.6385780
|
0.3993115
|
-0.4387603
|
0.0943710
|
1.4420051
|
-0.3248971
|
0.1971615
|
0.7316543
|
1.6786762
|
1.3221785
|
0.8053089
|
-0.2432397
|
0.4792626
|
nine
|
|
-2.8779699
|
0.6585421
|
-0.0343152
|
0.6485074
|
0.1089758
|
2.0889358
|
-0.2900334
|
0.3595380
|
0.2706320
|
0.6258660
|
-0.7102967
|
0.7269916
|
-0.5195354
|
0.7081738
|
-0.6620594
|
-0.0753673
|
-0.5652390
|
ten
|
|
0.4178735
|
1.6618611
|
0.9724359
|
1.0914319
|
-2.1513159
|
1.0749451
|
-0.0369365
|
-0.1582788
|
-0.8346335
|
-0.2521397
|
-0.0807870
|
-0.2685013
|
0.7280590
|
-0.3963026
|
-0.0582145
|
0.2022337
|
-0.3025342
|
eleven
|