library(here)
library(mclust)
library(raster)
library(rgdal)
library(sp)
library(RColorBrewer)
library(PCAtools)
library(sf)
library(tmap)
library(terra)
library(rgeos)
library(spData)
library(knitr)
library(kableExtra)
#import raw data
dat_raw <- read.csv(here("data/CommunityData-raw-2015-v2.csv"))
#select columns of interest (co2_hhs dropped)
dat <- dat_raw[,c(4:12,14:21)]
#convert to dataframe
dat <- as.data.frame(unclass(dat))
#set rownames to city names
rownames(dat) <- dat_raw$ME
# remove any records that have NAs
dat <- na.omit(dat)
# convert to z-scores
dat <- as.matrix(scale(dat, center = TRUE, scale = TRUE))
dat_comps <- prcomp(dat, center = F, scale=F) #set center and scale to FALSE because done pre-PCA
#get summary
s <- summary(dat_comps)
kable(s$importance)
|  | PC1 | PC2 | PC3 | PC4 | PC5 | PC6 | PC7 | PC8 | PC9 | PC10 | PC11 | PC12 | PC13 | PC14 | PC15 | PC16 | PC17 |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| Standard deviation | 1.97617 | 1.497713 | 1.156896 | 1.146363 | 1.105432 | 1.02912 | 0.9569311 | 0.9148077 | 0.8858084 | 0.8289295 | 0.8038107 | 0.7349555 | 0.6692265 | 0.641272 | 0.5542993 | 0.4588888 | 0.3611388 |
| Proportion of Variance | 0.22972 | 0.131950 | 0.078730 | 0.077300 | 0.071880 | 0.06230 | 0.0538700 | 0.0492300 | 0.0461600 | 0.0404200 | 0.0380100 | 0.0317700 | 0.0263400 | 0.024190 | 0.0180700 | 0.0123900 | 0.0076700 |
| Cumulative Proportion | 0.22972 | 0.361670 | 0.440400 | 0.517700 | 0.589580 | 0.65188 | 0.7057500 | 0.7549800 | 0.8011300 | 0.8415500 | 0.8795600 | 0.9113300 | 0.9376800 | 0.961870 | 0.9799400 | 0.9923300 | 1.0000000 |
#get eigenvalues
ev <- data.frame(Component=paste("PC",1:17, sep=""),eigenvalue=dat_comps$sdev^2)
kable(ev)
| Component | eigenvalue |
|---|---|
| PC1 | 3.9052476 |
| PC2 | 2.2431433 |
| PC3 | 1.3384091 |
| PC4 | 1.3141490 |
| PC5 | 1.2219791 |
| PC6 | 1.0590881 |
| PC7 | 0.9157172 |
| PC8 | 0.8368731 |
| PC9 | 0.7846564 |
| PC10 | 0.6871241 |
| PC11 | 0.6461116 |
| PC12 | 0.5401596 |
| PC13 | 0.4478641 |
| PC14 | 0.4112297 |
| PC15 | 0.3072477 |
| PC16 | 0.2105789 |
| PC17 | 0.1304213 |
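The same information can be inspected visually with a scree plot; a minimal sketch using base R (the eigenvalue = 1 line is only a reference):
#scree plot of the component variances (eigenvalues)
screeplot(dat_comps, npcs = 17, type = "lines", main = "Scree plot")
abline(h = 1, lty = 2) #reference line at eigenvalue = 1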
#retain scores for the first five principal components
dat_pcs <- dat_comps$x[,1:5]
#plot scores on PC1 and PC2, with loading vectors overlaid
plot(dat_comps$x[,1],dat_comps$x[,2],
xlab=paste("PCA 1 (", round(s$importance[2,1]*100, 1), "%)", sep = ""),
ylab=paste("PCA 2 (", round(s$importance[2,2]*100, 1), "%)", sep = ""),
pch=16, col="blue", cex=0.5)
abline(v=0, lwd=2, lty=2)
abline(h=0, lwd=2, lty=2)
#get loadings, scaled by 10 so the vectors are visible on the score plot
l.x <- dat_comps$rotation[,1]*10
l.y <- dat_comps$rotation[,2]*10
arrows(x0=mean(dat_comps$x[,1]), x1=l.x, y0=mean(dat_comps$x[,2]), y1=l.y, col="black", length=0.15, lwd=1.5)
# Label position
l.pos <- l.y # Create a vector of y axis coordinates
lo <- which(l.y < 0) # Get the variables on the bottom half of the plot
hi <- which(l.y > 0) # Get variables on the top half
# Replace values in the vector
l.pos <- replace(l.pos, lo, "1")
l.pos <- replace(l.pos, hi, "3")
text(l.x, l.y, labels=row.names(dat_comps$rotation), col="black", pos=l.pos, cex=1.25)
#plot scores on PC3 and PC4, with loading vectors overlaid
plot(dat_comps$x[,3],dat_comps$x[,4],
xlab=paste("PCA 3 (", round(s$importance[2,3]*100, 1), "%)", sep = ""),
ylab=paste("PCA 4 (", round(s$importance[2,4]*100, 1), "%)", sep = ""),
pch=16, col="blue", cex=0.5)
abline(v=0, lwd=2, lty=2)
abline(h=0, lwd=2, lty=2)
#get loadings, scaled by 10 so the vectors are visible on the score plot
l.x <- dat_comps$rotation[,3]*10
l.y <- dat_comps$rotation[,4]*10
arrows(x0=mean(dat_comps$x[,3]), x1=l.x, y0=mean(dat_comps$x[,4]), y1=l.y, col="black", length=0.15, lwd=1.5)
# Label position
l.pos <- l.y # Create a vector of y axis coordinates
lo <- which(l.y < 0) # Get the variables on the bottom half of the plot
hi <- which(l.y > 0) # Get variables on the top half
# Replace values in the vector
l.pos <- replace(l.pos, lo, "1")
l.pos <- replace(l.pos, hi, "3")
text(l.x, l.y, labels=row.names(dat_comps$rotation), col="black", pos=l.pos, cex=1.25)
Assumptions:

- MSAs form clusters, each characterized by a multivariate Gaussian distribution
- Model forms differ in cluster shape, volume, and orientation
- GMM fits a series of models with different forms and numbers of clusters
- The model that best balances fit against the number of parameters is selected as optimal, based on the Bayesian Information Criterion (BIC)
#run GMM on 1-20 clusters
dat_pcs_mc <- Mclust(dat_pcs, G=c(1:20))
#print summary of model fit
summary(dat_pcs_mc)
## ----------------------------------------------------
## Gaussian finite mixture model fitted by EM algorithm
## ----------------------------------------------------
##
## Mclust VEI (diagonal, equal shape) model with 9 components:
##
## log-likelihood n df BIC ICL
## -6806.744 845 66 -14058.28 -14568.29
##
## Clustering table:
## 1 2 3 4 5 6 7 8 9
## 107 208 73 213 25 43 16 49 111
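As a quick check, the reported BIC can be reproduced from the log-likelihood, sample size, and parameter count above, using mclust's convention BIC = 2*logLik - df*log(n) (which mclust maximizes rather than minimizes):
#reproduce the reported BIC from the summary values above
2 * (-6806.744) - 66 * log(845)
## [1] -14058.28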
#plot BIC scores of different models
plot(dat_pcs_mc, what='BIC')
#zoom in on best-fitting models
plot(dat_pcs_mc, what='BIC', ylim=c(-15300,-15150))
#model types VEI and VEE have similar BIC scores for 8-9 clusters
#do additional model comparisons
m_VEI_8 <- Mclust(dat_pcs, G=8, modelNames="VEI")
m_VEE_8 <- Mclust(dat_pcs, G=8, modelNames="VEE")
m_VEI_9 <- Mclust(dat_pcs, G=9, modelNames="VEI")
m_VEE_9 <- Mclust(dat_pcs, G=9, modelNames="VEE")
#extract BIC scores
BICs<-c(m_VEI_8$BIC[1],m_VEE_8$BIC[1],m_VEI_9$BIC[1],m_VEE_9$BIC[1])
#calculate change in BIC score; because mclust maximizes BIC,
#we calculate each model's change from the highest-scoring model
delta_BIC <- max(BICs) - BICs
#calculate BIC weights: w_i = exp(-delta_i/2) / sum(exp(-delta_j/2))
w_BIC <- round(exp(-0.5*delta_BIC)/sum(exp(-0.5*delta_BIC)), digits=3)
#make table of results
results_table <- cbind.data.frame(clusters=c("VEI_8","VEE_8","VEI_9","VEE_9"),BIC=BICs,delta_BIC,weight=w_BIC)
#order by delta
results_table <- results_table[order(delta_BIC),]
rownames(results_table)<-NULL
#print table
kable(results_table[1:4], caption = "Model Comparison")
| clusters | BIC | delta_BIC | weight |
|---|---|---|---|
| VEI_9 | -14058.28 | 0.00000 | 1 |
| VEE_9 | -14093.10 | 34.81417 | 0 |
| VEI_8 | -14096.51 | 38.23128 | 0 |
| VEE_8 | -14105.20 | 46.91507 | 0 |
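With the runner-up delta at roughly 34.8, the weight formula puts essentially all of the weight on VEI_9, which is why the other weights round to 0:
#numerator of the VEE_9 weight; the denominator is ~1 because the other
#deltas are even larger
exp(-0.5 * 34.81417) #approximately 2.8e-08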
#extract classifications (i.e., which MSA belongs to which cluster) from the 8-cluster VEE model, export to csv, and reload
write.csv(m_VEE_8$classification,file=here("results/cluster_assignment_pca.csv"))
data<- read.csv(here('results/cluster_assignment_pca.csv'))
#import shapefile
msa_Boundary <-readOGR(here("data/MSA"),"tl_2015_us_cbsa")
## OGR data source with driver: ESRI Shapefile
## Source: "/Users/clipo/PycharmProjects/sustainable_communities/data/MSA", layer: "tl_2015_us_cbsa"
## with 929 features
## It has 12 fields
## Integer64 fields read as strings: ALAND AWATER
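rgdal (and with it readOGR) has since been retired from CRAN; an equivalent import with sf, which is already loaded above, would look like this (a sketch, assuming the same directory layout):
#equivalent shapefile import with sf; the result is already an sf object,
#so the st_as_sf() conversion below would not be needed
msa_Boundary_sf <- st_read(here("data/MSA"), layer = "tl_2015_us_cbsa")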
#merge them
merged <- merge(msa_Boundary,data,by.x="NAME",by.y="X")
#remove any MSAs with no cluster assignment
merged_clean <- merged[!is.na(merged$x),]
#convert x to factor
merged_clean$Cluster <- as.factor(merged_clean$x)
#convert to sf object
cluster_sf <- st_as_sf(merged_clean)
#make us map
us_map <- tm_shape(us_states) + tm_borders()
#combine them
cluster_map <- tm_shape(cluster_sf) + tm_fill(col="Cluster", palette="Set1")
#plot them
us_map + cluster_map
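To write the figure to disk, tmap_save() can export the combined map; a sketch with a hypothetical output path:
#save the combined map (hypothetical output path and size)
tmap_save(us_map + cluster_map, filename = here("results/cluster_map.png"),
          width = 10, height = 6)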
#new data frame of the z-scored variables for per-cluster summaries
dat_clust <- as.data.frame(dat)
#add names
dat_clust$ME <- rownames(dat)
rownames(dat_clust) <- NULL
#merge in cluster assignments
dat_clust <- merge(dat_clust,data,by.x="ME",by.y="X")
#calculate cluster means for different variables
c1m <- data.frame(one=colMeans(subset(dat_clust, x=="1")[2:18]))
c2m <- data.frame(two=colMeans(subset(dat_clust, x=="2")[2:18]))
c3m <- data.frame(three=colMeans(subset(dat_clust, x=="3")[2:18]))
c4m <- data.frame(four=colMeans(subset(dat_clust, x=="4")[2:18]))
c5m <- data.frame(five=colMeans(subset(dat_clust, x=="5")[2:18]))
c6m <- data.frame(six=colMeans(subset(dat_clust, x=="6")[2:18]))
c7m <- data.frame(seven=colMeans(subset(dat_clust, x=="7")[2:18]))
c8m <- data.frame(eight=colMeans(subset(dat_clust, x=="8")[2:18]))
cluster_means <- cbind.data.frame(c1m,c2m,c3m,c4m,c5m,c6m,c7m,c8m)
cluster_means$variable <- row.names(cluster_means)
rownames(cluster_means) <- NULL
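The same per-cluster means could be computed in a single call; a compact equivalent sketch (clusters end up as rows rather than columns):
#compact equivalent: mean of each variable within each cluster
cluster_means_alt <- aggregate(dat_clust[2:18], by = list(cluster = dat_clust$x), FUN = mean)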
#boxplots of scores for clusters
par(mfrow=c(2,4), mar=c(2.5,5,2.5,3))
boxplot(subset(dat_clust, x=="1")[2:18], horizontal=T, las=1, ylim=c(-4,4), main="Cluster 1")
boxplot(subset(dat_clust, x=="2")[2:18], horizontal=T, las=1, ylim=c(-4,4), main="Cluster 2")
boxplot(subset(dat_clust, x=="3")[2:18], horizontal=T, las=1, ylim=c(-4,4), main="Cluster 3")
boxplot(subset(dat_clust, x=="4")[2:18], horizontal=T, las=1, ylim=c(-4,4), main="Cluster 4")
boxplot(subset(dat_clust, x=="5")[2:18], horizontal=T, las=1, ylim=c(-4,4), main="Cluster 5")
boxplot(subset(dat_clust, x=="6")[2:18], horizontal=T, las=1, ylim=c(-4,4), main="Cluster 6")
boxplot(subset(dat_clust, x=="7")[2:18], horizontal=T, las=1, ylim=c(-4,4), main="Cluster 7")
boxplot(subset(dat_clust, x=="8")[2:18], horizontal=T, las=1, ylim=c(-4,4), main="Cluster 8")
par(mfrow=c(1,1))
#dotplots of cluster mean values
par(mfrow=c(2,5))
dotchart(cluster_means$one, labels=cluster_means$variable,xlim=c(-2,2), main='Cluster 1', pch=16)
abline(v=0, lwd=2)
dotchart(cluster_means$two, labels=cluster_means$variable,xlim=c(-2,2), main='Cluster 2', pch=16)
abline(v=0, lwd=2)
dotchart(cluster_means$three, labels=cluster_means$variable,xlim=c(-2,2), main='Cluster 3', pch=16)
abline(v=0, lwd=2)
dotchart(cluster_means$four, labels=cluster_means$variable,xlim=c(-2,2), main='Cluster 4', pch=16)
abline(v=0, lwd=2)
dotchart(cluster_means$five, labels=cluster_means$variable,xlim=c(-2,2), main='Cluster 5', pch=16)
abline(v=0, lwd=2)
dotchart(cluster_means$six, labels=cluster_means$variable,xlim=c(-2,2), main='Cluster 6', pch=16)
abline(v=0, lwd=2)
dotchart(cluster_means$seven, labels=cluster_means$variable,xlim=c(-2,2), main='Cluster 7', pch=16)
abline(v=0, lwd=2)
dotchart(cluster_means$eight, labels=cluster_means$variable,xlim=c(-2,2), main='Cluster 8', pch=16)
abline(v=0, lwd=2)
par(mfrow=c(1,1))
| Cluster | Name |
|---|---|
| 1 | Green Bay, WI |
| 2 | Lumberton, NC |
| 3 | Canton-Massillon, OH |
| 4 | Florence-Muscle Shoals, AL |
| 5 | Elizabethtown-Fort Knox, KY |
| 6 | Athens-Clarke County, GA |
| 7 | Atlanta-Sandy Springs-Roswell, GA |
| 8 | North Port-Sarasota-Bradenton, FL |
| Metric | Cluster |
|---|---|
| AQI_Good | one |
| Bachelor_Over_25 | six |
| Per_Poverty | one |
| Gini | one |
| Per_Sev_Hous | one |
| Xstreamlengthimpaired | two |
| Per_Avg_Land_Cov | seven |
| poor_health_percent | six |
| GHG_Percap | one |
| UNEMPLOY | seven |
| FOOD_INS | one |
| VIO_CRIME | one |
| broadband_access | one |
| Female_Inequality | three |
| Black_equality | seven |
| Hispanic_equality | six |
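The transposed table of cluster means (cluster_means_t) used below is not constructed anywhere above; a minimal reconstruction of that step, assuming rows are clusters and columns are variables as in the rendered table, is:
#assumed reconstruction: transpose cluster_means so each row is a cluster
#and each column is a variable, matching the table rendered below
cluster_means_t <- as.data.frame(t(cluster_means[, 1:8]))
colnames(cluster_means_t) <- cluster_means$variable
cluster_means_t$cluster <- rownames(cluster_means_t)
rownames(cluster_means_t) <- NULL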
cluster_means_t %>%
kbl() %>%
kable_paper(full_width=T) %>%
#column_spec(1, color = ifelse(cluster_means_t$AQI_Good < max(cluster_means_t$AQI_Good), 'black','blue'))
column_spec(1, color = ifelse(cluster_means_t[,1] == max(cluster_means_t[,1]), "blue",
ifelse(cluster_means_t[,1] == min(cluster_means_t[,1]), "red", "black"))) %>%
column_spec(2, color = ifelse(cluster_means_t[,2] == max(cluster_means_t[,2]), "blue",
ifelse(cluster_means_t[,2] == min(cluster_means_t[,2]), "red", "black"))) %>%
column_spec(8, color = ifelse(cluster_means_t[,8] == max(cluster_means_t[,8]), "blue",
ifelse(cluster_means_t[,8] == min(cluster_means_t[,8]), "red", "black"))) %>%
column_spec(3, color = ifelse(cluster_means_t[,3] == max(cluster_means_t[,3]), "red",
ifelse(cluster_means_t[,3] == min(cluster_means_t[,3]), "blue", "black"))) %>%
column_spec(4, color = ifelse(cluster_means_t[,4] == max(cluster_means_t[,4]), "red",
ifelse(cluster_means_t[,4] == min(cluster_means_t[,4]), "blue", "black"))) %>%
column_spec(5, color = ifelse(cluster_means_t[,5] == max(cluster_means_t[,5]), "red",
ifelse(cluster_means_t[,5] == min(cluster_means_t[,5]), "blue", "black"))) %>%
column_spec(6, color = ifelse(cluster_means_t[,6] == max(cluster_means_t[,6]), "red",
ifelse(cluster_means_t[,6] == min(cluster_means_t[,6]), "blue", "black"))) %>%
column_spec(7, color = ifelse(cluster_means_t[,7] == max(cluster_means_t[,7]), "red",
ifelse(cluster_means_t[,7] == min(cluster_means_t[,7]), "blue", "black"))) %>%
column_spec(9, color = ifelse(cluster_means_t[,9] == max(cluster_means_t[,9]), "red",
ifelse(cluster_means_t[,9] == min(cluster_means_t[,9]), "blue", "black"))) %>%
column_spec(10, color = ifelse(cluster_means_t[,10] == max(cluster_means_t[,10]), "red",
ifelse(cluster_means_t[,10] == min(cluster_means_t[,10]), "blue", "black"))) %>%
column_spec(11, color = ifelse(cluster_means_t[,11] == max(cluster_means_t[,11]), "red",
ifelse(cluster_means_t[,11] == min(cluster_means_t[,11]), "blue", "black"))) %>%
column_spec(12, color = ifelse(cluster_means_t[,12] == max(cluster_means_t[,12]), "red",
ifelse(cluster_means_t[,12] == min(cluster_means_t[,12]), "blue", "black"))) %>%
column_spec(13, color = ifelse(cluster_means_t[,13] == max(cluster_means_t[,13]), "red",
ifelse(cluster_means_t[,13] == min(cluster_means_t[,13]), "blue", "black"))) %>%
column_spec(14, color = ifelse(cluster_means_t[,14] == max(cluster_means_t[,14]), "red",
ifelse(cluster_means_t[,14] == min(cluster_means_t[,14]), "blue", "black"))) %>%
column_spec(15, color = ifelse(cluster_means_t[,15] == max(cluster_means_t[,15]), "red",
ifelse(cluster_means_t[,15] == min(cluster_means_t[,15]), "blue", "black"))) %>%
column_spec(16, color = ifelse(cluster_means_t[,16] == max(cluster_means_t[,16]), "red",
ifelse(cluster_means_t[,16] == min(cluster_means_t[,16]), "blue", "black"))) %>%
column_spec(17, color = ifelse(cluster_means_t[,17] == max(cluster_means_t[,17]), "red",
ifelse(cluster_means_t[,17] == min(cluster_means_t[,17]), "blue", "black")))
| AQI_Good | Bachelor_Over_25 | Per_Poverty | Gini | non_migration | Per_Sev_Hous | Xstreamlengthimpaired | Per_Avg_Land_Cov | poor_health_percent | GHG_Percap | UNEMPLOY | FOOD_INS | VIO_CRIME | broadband_access | Female_Inequality | Black_equality | Hispanic_equality | cluster |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0.5227677 | -0.1548260 | -0.7394925 | -0.8007960 | 0.2833437 | -0.7699925 | -0.1596751 | -0.8755457 | -0.6616046 | 0.0995944 | -0.6496025 | -0.8121395 | -0.5935864 | -0.0726886 | 0.4767847 | -0.9369008 | -0.5638859 | one |
| 0.4023375 | -0.9781268 | 2.0365238 | 1.1761342 | 0.2900523 | 0.6564276 | -0.4754092 | 0.0944046 | 1.3269691 | -0.1784631 | 1.1618210 | 2.0960565 | 1.3582874 | 0.8401411 | 1.7970294 | -0.0458918 | 0.0247256 | two |
| -0.0959478 | -0.4019677 | -0.3041514 | -0.5230507 | 0.3973103 | -0.5865422 | 2.4553991 | -1.2152311 | -0.1071342 | -0.1154738 | -0.0674007 | -0.1866480 | -0.4051062 | -0.2489778 | -0.2436760 | -0.3822506 | 0.1204851 | three |
| 0.2160813 | -0.3042771 | 0.0910227 | -0.0632170 | 0.0385947 | -0.2081119 | -0.3130276 | 0.2379382 | 0.2733245 | 0.1837136 | 0.0067253 | 0.0922417 | 0.0149596 | 0.2086675 | -0.2014455 | 0.1694413 | 0.0644684 | four |
| 0.1619162 | -0.2645486 | 0.7608756 | 0.0139556 | -0.1893895 | 0.1984677 | 0.3943266 | 0.2922282 | 0.2075924 | 0.8651459 | 0.4813617 | 0.5026922 | -0.2400475 | 0.6288675 | -0.2217439 | 1.9367016 | 2.1579208 | five |
| 0.3720010 | 1.9912709 | 0.8836366 | 1.2049951 | -2.1792014 | 1.2095387 | -0.1783095 | 0.0313130 | -0.8636444 | -0.1529746 | -0.2939828 | 0.6723443 | -0.2968025 | 0.0128457 | 0.3384290 | 0.2356916 | 0.1863681 | six |
| -2.5106620 | 1.0630890 | -0.3947348 | 0.4868429 | -0.0050762 | 1.6436289 | -0.3886443 | 0.6985673 | -0.1189012 | -0.6544651 | 0.3545440 | -0.5576185 | 0.4501286 | -0.6152487 | -0.1931982 | 0.1814520 | -0.3917566 | seven |
| -0.3144578 | 0.6377410 | -0.4030628 | 0.2571761 | 0.0144714 | 0.3539462 | -0.1680067 | 0.3143117 | -0.3614585 | -0.4553723 | 0.0080698 | -0.2426774 | 0.2517643 | -0.5946423 | -0.1165541 | -0.2177116 | -0.3533740 | eight |
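The per-column highlighting above could also be generated with a loop rather than one column_spec() call per column; a sketch (columns 1, 2, and 8, where higher values are desirable, keep max = blue / min = red, and the remaining columns are reversed):
#loop-based equivalent of the column_spec() chain above
tbl <- kbl(cluster_means_t) %>% kable_paper(full_width = TRUE)
good_high <- c(1, 2, 8) #columns where a high value is desirable
for (j in 1:17) {
  hi_col <- if (j %in% good_high) "blue" else "red"
  lo_col <- if (j %in% good_high) "red" else "blue"
  tbl <- column_spec(tbl, j,
                     color = ifelse(cluster_means_t[, j] == max(cluster_means_t[, j]), hi_col,
                              ifelse(cluster_means_t[, j] == min(cluster_means_t[, j]), lo_col, "black")))
}
tbl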