This file reads in data from our ACA visualization project database. The data here are mostly from the 2014 (5 year wave) of the American Communicty Survey. I used data from two sources: demographics (age, sex,race) and from the economics (income, employment, insurance etc). Economic varaibles have the prefix ec in front of them. Note: some code use from Quick R examples at http://www.statmethods.net/advstats/cluster.html
What I do here is look at the posbile clusters of counties–using ACS data. I then look at a number of kmeans methods and then do a hierchcial clsutering method to see waht it looks like with 16 or 5 clusters. I choose these two becasue it seems that 5 is a “good” number from the clsutering output and 16 is what the state has used for regional rating areas.
rm()
#cluster
#install.packages("cluster")
#install.packages("NbClust")
#get libraries
library(NbClust)
library(cluster)
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(ggplot2)
dev.off()
## null device
## 1
# Read in Data--date set has be slightly cleaned to some tex vars and those with NA in total column
data1 <- read.csv("C:/Users/mgrace/Dropbox/!classes/!acs_tableau/clsuterdemogsfilega3.csv", stringsAsFactors=FALSE)
# change missing information to 0 for beds and docs
# and get rid of character data feilds for state and statefips
data1$sum_emergency[is.na(data1$sum_emergency)]<-0
data1$sum_beds[is.na(data1$sum_beds)]<-0
data1$statecode<-NULL
data1$statefips<-NULL
#hold onto rating area for later use
xran<-data1$rating_area_number
#drop rating area so that it does not confound clustering
data1$rating_area_number<-NULL
#scale data to make it ieasier to run in clsuter
# scale is generic function whose default method centers and/or scales the columns of a numeric matrix.
#make sure data stays in a data frame
data2<-data.frame(scale(data1))
# Create soem variables in data1 for later
data1$pct_uninsured<-100*((data1$ECHC01_VC134)/(data1$ECHC01_VC130))
data1$pct_unemployed<-100*(data1$ECHC01_VC07/data1$ECHC01_VC03)
dplyr::tbl_df(data.frame(data2))
## Source: local data frame [159 x 210]
##
## HC01_VC03 HC01_VC04 HC01_VC05 HC01_VC08 HC01_VC09 HC01_VC10
## (dbl) (dbl) (dbl) (dbl) (dbl) (dbl)
## 1 -0.3321726 -0.33140176 -0.33283978 -0.3162455 -0.3250694 -0.3290945
## 2 -0.4087907 -0.41058896 -0.40702212 -0.3793349 -0.4007917 -0.3995694
## 3 -0.3868511 -0.38668310 -0.38694025 -0.3783643 -0.3790818 -0.3857738
## 4 -0.4462899 -0.44941845 -0.44326061 -0.4375713 -0.4477773 -0.4433087
## 5 -0.1245606 -0.11832299 -0.13041874 -0.1867234 -0.1705833 -0.2180886
## 6 -0.3329597 -0.32790866 -0.33766178 -0.3481677 -0.3337743 -0.3235335
## 7 0.0647823 0.07290835 0.05710997 0.1021938 0.1676500 0.1082987
## 8 0.2910493 0.30259491 0.28011271 0.2629910 0.2771432 0.3682750
## 9 -0.3387870 -0.34103896 -0.33660329 -0.3070787 -0.3641891 -0.3065297
## 10 -0.3271021 -0.32834530 -0.32587140 -0.3229319 -0.3448914 -0.3001132
## .. ... ... ... ... ... ...
## Variables not shown: HC01_VC11 (dbl), HC01_VC12 (dbl), HC01_VC13 (dbl),
## HC01_VC14 (dbl), HC01_VC15 (dbl), HC01_VC16 (dbl), HC01_VC17 (dbl),
## HC01_VC18 (dbl), HC01_VC19 (dbl), HC01_VC20 (dbl), HC01_VC23 (dbl),
## HC01_VC26 (dbl), HC01_VC27 (dbl), HC01_VC28 (dbl), HC01_VC29 (dbl),
## HC01_VC32 (dbl), HC01_VC33 (dbl), HC01_VC34 (dbl), HC01_VC37 (dbl),
## HC01_VC38 (dbl), HC01_VC39 (dbl), HC01_VC43 (dbl), HC01_VC44 (dbl),
## HC01_VC45 (dbl), HC01_VC48 (dbl), HC01_VC49 (dbl), HC01_VC50 (dbl),
## HC01_VC51 (dbl), HC01_VC52 (dbl), HC01_VC53 (dbl), HC01_VC54 (dbl),
## HC01_VC55 (dbl), HC01_VC56 (dbl), HC01_VC57 (dbl), HC01_VC58 (dbl),
## HC01_VC59 (dbl), HC01_VC60 (dbl), HC01_VC61 (dbl), HC01_VC62 (dbl),
## HC01_VC63 (dbl), HC01_VC64 (dbl), HC01_VC65 (dbl), HC01_VC66 (dbl),
## HC01_VC67 (dbl), HC01_VC68 (dbl), HC01_VC69 (dbl), HC01_VC70 (dbl),
## HC01_VC71 (dbl), HC01_VC72 (dbl), HC01_VC73 (dbl), HC01_VC74 (dbl),
## HC01_VC77 (dbl), HC01_VC78 (dbl), HC01_VC79 (dbl), HC01_VC80 (dbl),
## HC01_VC81 (dbl), HC01_VC82 (dbl), HC01_VC83 (dbl), HC01_VC87 (dbl),
## HC01_VC88 (dbl), HC01_VC89 (dbl), HC01_VC90 (dbl), HC01_VC91 (dbl),
## HC01_VC92 (dbl), HC01_VC93 (dbl), HC01_VC94 (dbl), HC01_VC95 (dbl),
## HC01_VC96 (dbl), HC01_VC97 (dbl), HC01_VC98 (dbl), HC01_VC99 (dbl),
## HC01_VC100 (dbl), HC01_VC101 (dbl), HC01_VC102 (dbl), HC01_VC104 (dbl),
## countyfips (dbl), UR_code_2013 (dbl), ur_code_2006 (dbl), ur_code_1990
## (dbl), ECHC01_VC03 (dbl), ECHC01_VC04 (dbl), ECHC01_VC05 (dbl),
## ECHC01_VC06 (dbl), ECHC01_VC07 (dbl), ECHC01_VC08 (dbl), ECHC01_VC09
## (dbl), ECHC01_VC11 (dbl), ECHC01_VC14 (dbl), ECHC01_VC15 (dbl),
## ECHC01_VC16 (dbl), ECHC01_VC17 (dbl), ECHC01_VC19 (dbl), ECHC01_VC20
## (dbl), ECHC01_VC22 (dbl), ECHC01_VC23 (dbl), ECHC01_VC27 (dbl),
## ECHC01_VC28 (dbl), ECHC01_VC29 (dbl), ECHC01_VC30 (dbl), ECHC01_VC31
## (dbl), ECHC01_VC32 (dbl), ECHC01_VC33 (dbl), ECHC01_VC36 (dbl),
## ECHC01_VC40 (dbl), ECHC01_VC41 (dbl), ECHC01_VC42 (dbl), ECHC01_VC43
## (dbl), ECHC01_VC44 (dbl), ECHC01_VC45 (dbl), ECHC01_VC49 (dbl),
## ECHC01_VC50 (dbl), ECHC01_VC51 (dbl), ECHC01_VC52 (dbl), ECHC01_VC53
## (dbl), ECHC01_VC54 (dbl), ECHC01_VC55 (dbl), ECHC01_VC56 (dbl),
## ECHC01_VC57 (dbl), ECHC01_VC58 (dbl), ECHC01_VC59 (dbl), ECHC01_VC60
## (dbl), ECHC01_VC61 (dbl), ECHC01_VC62 (dbl), ECHC01_VC66 (dbl),
## ECHC01_VC67 (dbl), ECHC01_VC68 (dbl), ECHC01_VC69 (dbl), ECHC01_VC70
## (dbl), ECHC01_VC74 (dbl), ECHC01_VC75 (dbl), ECHC01_VC76 (dbl),
## ECHC01_VC77 (dbl), ECHC01_VC78 (dbl), ECHC01_VC79 (dbl), ECHC01_VC80
## (dbl), ECHC01_VC81 (dbl), ECHC01_VC82 (dbl), ECHC01_VC83 (dbl),
## ECHC01_VC84 (dbl), ECHC01_VC85 (dbl), ECHC01_VC86 (dbl), ECHC01_VC89
## (dbl), ECHC01_VC90 (dbl), ECHC01_VC91 (dbl), ECHC01_VC92 (dbl),
## ECHC01_VC93 (dbl), ECHC01_VC94 (dbl), ECHC01_VC97 (dbl), ECHC01_VC98
## (dbl), ECHC01_VC99 (dbl), ECHC01_VC100 (dbl), ECHC01_VC101 (dbl),
## ECHC01_VC103 (dbl), ECHC01_VC104 (dbl), ECHC01_VC105 (dbl), ECHC01_VC106
## (dbl), ECHC01_VC107 (dbl), ECHC01_VC108 (dbl), ECHC01_VC109 (dbl),
## ECHC01_VC110 (dbl), ECHC01_VC111 (dbl), ECHC01_VC112 (dbl), ECHC01_VC113
## (dbl), ECHC01_VC114 (dbl), ECHC01_VC115 (dbl), ECHC01_VC118 (dbl),
## ECHC01_VC120 (dbl), ECHC01_VC121 (dbl), ECHC01_VC122 (dbl), ECHC01_VC124
## (dbl), ECHC01_VC125 (dbl), ECHC01_VC126 (dbl), ECHC01_VC130 (dbl),
## ECHC01_VC131 (dbl), ECHC01_VC132 (dbl), ECHC01_VC133 (dbl), ECHC01_VC134
## (dbl), ECHC01_VC137 (dbl), ECHC01_VC138 (dbl), ECHC01_VC141 (dbl),
## ECHC01_VC142 (dbl), ECHC01_VC143 (dbl), ECHC01_VC144 (dbl), ECHC01_VC145
## (dbl), ECHC01_VC146 (dbl), ECHC01_VC147 (dbl), ECHC01_VC148 (dbl),
## ECHC01_VC149 (dbl), ECHC01_VC150 (dbl), ECHC01_VC151 (dbl), ECHC01_VC152
## (dbl), ECHC01_VC153 (dbl), ECHC01_VC154 (dbl), ECHC01_VC155 (dbl),
## ECHC01_VC156 (dbl), ECHC01_VC157 (dbl), Median_Income (dbl),
## Median_Family_income (dbl), sum_beds (dbl), sum_emergency (dbl),
## FIPSCode (dbl), Grand_Total (dbl), population (dbl), docs1k (dbl)
#
wss <- (nrow(data2)-1)*sum(apply(data2,2,var))
for (i in 2:30) wss[i] <- sum(kmeans(data2, centers=i)$withinss)
This is a plot to help determine the number clusters
## Source: local data frame [30 x 2]
##
## cluster wss
## (chr) (dbl)
## 1 1 33180.000
## 2 2 11035.510
## 3 3 7392.569
## 4 4 6531.915
## 5 5 6076.613
## 6 6 5920.597
## 7 7 5322.927
## 8 8 5157.568
## 9 9 5056.626
## 10 10 4868.683
## .. ... ...
## logical(0)
output1<-NbClust(data2,min.nc=2,max.nc=20,method="kmeans",index="kl")
output2<-NbClust(data2,min.nc=2,max.nc=20,method="kmeans",index="ch")
output3<-NbClust(data2,min.nc=2,max.nc=20,method="kmeans",index="hartigan")
#output4<-NbClust(data2,min.nc=2,max.nc=20,method="kmeans",index="ccc")
#output5<-NbClust(data2,min.nc=2,max.nc=20,method="kmeans",index="scott")
#output6<-NbClust(data2,min.nc=2,max.nc=20,method="kmeans",index="marriot")
#output7<-NbClust(data2,min.nc=2,max.nc=20,method="kmeans",index="trcovw")
#output8<-NbClust(data2,min.nc=2,max.nc=20,method="kmeans",index="tracew")
#output9<-NbClust(data2,min.nc=2,max.nc=20,method="kmeans",index="friedman")
#output10<-NbClust(data2,min.nc=2,max.nc=20,method="kmeans",index="rubin")
output11<-NbClust(data2,min.nc=2,max.nc=20,method="kmeans",index="cindex")
output12<-NbClust(data2,min.nc=2,max.nc=20,method="kmeans",index="db")
output13<-NbClust(data2,min.nc=2,max.nc=20,method="kmeans",index="silhouette")
output14<-NbClust(data2,min.nc=2,max.nc=20,method="kmeans",index="duda")
output15<-NbClust(data2,min.nc=2,max.nc=20,method="kmeans",index="pseudot2")
output16<-NbClust(data2,min.nc=2,max.nc=20,method="kmeans",index="beale")
output17<-NbClust(data2,min.nc=2,max.nc=20,method="kmeans",index="ratkowsky")
output18<-NbClust(data2,min.nc=2,max.nc=20,method="kmeans",index="ball")
output19<-NbClust(data2,min.nc=2,max.nc=20,method="kmeans",index="ptbiserial")
output20<-NbClust(data2,min.nc=2,max.nc=20,method="kmeans",index="gap")
output21<-NbClust(data2,min.nc=2,max.nc=20,method="kmeans",index="frey")
output22<-NbClust(data2,min.nc=2,max.nc=20,method="kmeans",index="mcclain")
output23<-NbClust(data2,min.nc=2,max.nc=20,method="kmeans",index="gamma")
output24<-NbClust(data2,min.nc=2,max.nc=20,method="kmeans",index="gplus")
output25<-NbClust(data2,min.nc=2,max.nc=20,method="kmeans",index="tau")
output26<-NbClust(data2,min.nc=2,max.nc=20,method="kmeans",index="dunn")
output27<-NbClust(data2,min.nc=2,max.nc=20,method="kmeans",index="hubert")
## *** : The Hubert index is a graphical method of determining the number of clusters.
## In the plot of Hubert index, we seek a significant knee that corresponds to a
## significant increase of the value of the measure i.e the significant peak in Hubert
## index second differences plot.
##
output28<-NbClust(data2,min.nc=2,max.nc=20,method="kmeans",index="sdindex")
output29<-NbClust(data2,min.nc=2,max.nc=20,method="kmeans",index="dindex")
## *** : The D index is a graphical method of determining the number of clusters.
## In the plot of D index, we seek a significant knee (the significant peak in Dindex
## second differences plot) that corresponds to a significant increase of the value of
## the measure.
##
output30<-NbClust(data2,min.nc=2,max.nc=20,method="kmeans",index="sdbw")
x1<-data.frame(output1$Best.nc)
#get optional indices for kmeans method
list1<-c("kl","ch","hartigan","cindex","db","silhouette","duda","pseudot2","beale","ratkowsky","ball","ptbiserial","gap","frey","mcclain","gamma","gplus","tau","dunn","sdindex","sdbw")
l2<-data.frame(unlist(list1))
#put together into a table
table<-rbind(output1$Best.nc[1],
output2$Best.nc[1],
output3$Best.nc[1],
output11$Best.nc[1],
output12$Best.nc[1],
output13$Best.nc[1],
output14$Best.nc[1],
output15$Best.nc[1],
output16$Best.nc[1],
output17$Best.nc[1],
output18$Best.nc[1],
output19$Best.nc[1],
output20$Best.nc[1],
output21$Best.nc[1],
output22$Best.nc[1],
output23$Best.nc[1],
output24$Best.nc[1],
output25$Best.nc[1],
output26$Best.nc[1],
output28$Best.nc[1],
output30$Best.nc[1])
table2<-cbind(data.frame(table),data.frame(l2))
# bar plot of cluster results
barplot(table2$Number_clusters,xlab="Various Methods", ylab="Number of Clusters")
# Show Table 2
table2
## Number_clusters unlist.list1.
## 1 2 kl
## 2 2 ch
## 3 3 hartigan
## 4 19 cindex
## 5 2 db
## 6 2 silhouette
## 7 3 duda
## 8 3 pseudot2
## 9 3 beale
## 10 2 ratkowsky
## 11 3 ball
## 12 2 ptbiserial
## 13 2 gap
## 14 3 frey
## 15 2 mcclain
## 16 2 gamma
## 17 2 gplus
## 18 4 tau
## 19 2 dunn
## 20 11 sdindex
## 21 20 sdbw
# K-Means Cluster Analysis
fit <- kmeans(data2, 16) # 16 cluster solution
# get cluster means
data2a<-aggregate(data2,by=list(fit$cluster),FUN=mean)
# append cluster assignment
data3 <- data.frame(data2, data.frame(fit$cluster))
p<-data.frame(cbind(fit$cluster,xran))
#### Note the square graph–it is a xy projection of corrleation between cluster “proposed”" and actual rating area assuming we have 16 clusters.
fit\(cluster<-as.factor(fit\)cluster) ggplot(data2, aes(HC01_VC03, Median_Family_income,color=fit$cluster)) + geom_point()
dplyr::summarise_each(p, funs(mean))
## V1 xran
## 1 7.377358 8.320755
p<-dplyr::rename(p,fit=V1,rating_area=xran)
cor(p)
## fit rating_area
## fit 1.00000000 -0.04445346
## rating_area -0.04445346 1.00000000
p
## fit rating_area
## 1 7 14
## 2 7 11
## 3 7 6
## 4 16 1
## 5 10 16
## 6 7 10
## 7 2 2
## 8 2 3
## 9 7 15
## 10 7 15
## 11 4 12
## 12 7 12
## 13 7 6
## 14 10 15
## 15 5 14
## 16 10 14
## 17 7 5
## 18 16 3
## 19 7 1
## 20 15 6
## 21 7 14
## 22 2 4
## 23 2 7
## 24 7 6
## 25 14 14
## 26 15 8
## 27 7 13
## 28 11 3
## 29 2 2
## 30 1 1
## 31 14 3
## 32 7 15
## 33 6 3
## 34 10 11
## 35 10 15
## 36 11 5
## 37 7 15
## 38 4 3
## 39 16 12
## 40 7 1
## 41 16 7
## 42 5 10
## 43 10 15
## 44 6 3
## 45 7 12
## 46 1 12
## 47 2 1
## 48 4 3
## 49 7 15
## 50 7 15
## 51 5 14
## 52 7 2
## 53 7 5
## 54 7 14
## 55 13 9
## 56 11 3
## 57 2 13
## 58 11 3
## 59 7 10
## 60 6 3
## 61 13 13
## 62 3 5
## 63 2 6
## 64 9 13
## 65 10 15
## 66 13 2
## 67 6 3
## 68 9 10
## 69 4 10
## 70 1 16
## 71 16 4
## 72 5 8
## 73 7 10
## 74 16 4
## 75 4 3
## 76 4 12
## 77 3 15
## 78 5 2
## 79 16 3
## 80 7 11
## 81 1 5
## 82 1 5
## 83 3 11
## 84 5 12
## 85 16 3
## 86 8 15
## 87 9 11
## 88 5 1
## 89 12 14
## 90 3 5
## 91 16 14
## 92 2 15
## 93 9 10
## 94 8 5
## 95 16 6
## 96 1 8
## 97 16 2
## 98 16 8
## 99 16 8
## 100 3 15
## 101 1 1
## 102 16 12
## 103 3 11
## 104 16 2
## 105 9 9
## 106 14 8
## 107 2 3
## 108 5 2
## 109 16 2
## 110 4 3
## 111 9 12
## 112 5 13
## 113 8 6
## 114 16 3
## 115 9 13
## 116 8 12
## 117 13 12
## 118 1 8
## 119 13 10
## 120 3 1
## 121 14 5
## 122 12 3
## 123 8 1
## 124 3 14
## 125 3 15
## 126 2 3
## 127 8 10
## 128 1 8
## 129 9 1
## 130 3 8
## 131 1 5
## 132 3 14
## 133 3 8
## 134 1 11
## 135 3 1
## 136 9 15
## 137 9 15
## 138 9 11
## 139 13 10
## 140 3 11
## 141 9 8
## 142 3 15
## 143 8 12
## 144 13 10
## 145 8 8
## 146 2 7
## 147 2 3
## 148 9 6
## 149 3 5
## 150 3 16
## 151 9 6
## 152 1 8
## 153 3 11
## 154 13 10
## 155 2 9
## 156 3 12
## 157 3 5
## 158 3 16
## 159 8 1
f1<-ggplot(p,aes(fit,rating_area))+geom_point()
f1<-f1+labs(y="Rating Area", x="K Means Fit for 16 Clusters")
f1<-f1+ggtitle('Number of Clusters in GA Counties')
f1
fit$cluster<-as.factor(fit$cluster)
ggplot(data1, aes(HC01_VC03, Median_Family_income,color=fit$cluster)) + geom_point() +labs(x="Population in County",y="Med. Family Income")+ggtitle("Cluster Analysis: GA. County Demogs #1")
ggplot(data1, aes(UR_code_2013, Median_Family_income,color=fit$cluster)) + geom_point() +labs(x="<- Most Urban -- Urban Code -- Least Urban ->",y="Med. Family Income")+ggtitle("Cluster Analysis: GA. County Demogs #2")
ggplot(data1, aes(pct_uninsured, Median_Family_income,color=fit$cluster)) + geom_point() +labs(x="Percent Uninsured",y="Med. Family Income")+ggtitle("Cluster Analysis: GA. County Demogs #3")
ggplot(data1, aes(pct_unemployed, Median_Family_income,color=fit$cluster)) + geom_point() +labs(x="Percent Unemployed",y="Med. Family Income")+ggtitle("Cluster Analysis: GA. County Demogs #4")
d <- dist(data2, method = "euclidean") # distance matrix
fit <- hclust(d, method="ward.D")
plot(fit,hang=-1,labels=FALSE,xlab="",sub="", main="Hierarchical 16 Cluster") # display dendogram
groups <- cutree(fit, k=16) # cut tree into 16 clusters
plot(groups, main="Countys in each of 16 Groups")
# draw dendogram with red borders around the 16 clusters
rect.hclust(fit, k=16, border="red")
#counties in each cluster
t<-rect.hclust(fit, k=16, border="red")
# show realtionship between clsuter and rating area
#show t
t
## [[1]]
## [1] 60
##
## [[2]]
## [1] 67
##
## [[3]]
## [1] 33
##
## [[4]]
## [1] 44
##
## [[5]]
## [1] 28 36 38 48 56 58 75 76 110
##
## [[6]]
## [1] 31
##
## [[7]]
## [1] 25 106 121
##
## [[8]]
## [1] 1 2 3 9 10 12 13 17 19 21 24 27 32 34 37 40 43
## [18] 45 46 49 50 52 55 59 73 81 82 86 101
##
## [[9]]
## [1] 14 30 53 65 70 96 116 118 120 123 128 131 133 134 135 142 143
## [18] 149 152 153 156 157 158
##
## [[10]]
## [1] 87 105 115 127 129 132 136 137 138 145 148 150 151 159
##
## [[11]]
## [1] 39 54 61 62 66 77 80 83 90 93 100 103 117 119 124 125 130
## [18] 139 140 144 154
##
## [[12]]
## [1] 8 11 22 29 47 69 92 155
##
## [[13]]
## [1] 89 122
##
## [[14]]
## [1] 5 7 16 20 23 26 35 57 63 64 68 78 107 126 141 146 147
##
## [[15]]
## [1] 15 51 72 88 108
##
## [[16]]
## [1] 4 6 18 41 42 71 74 79 84 85 91 94 95 97 98 99 102
## [18] 104 109 111 112 113 114
group5<-cutree(fit,k=5)
plot(group5,main="Countys in each of 5 Groups")
plot(fit,hang=-1,labels=FALSE,xlab="",sub="", main="Hierarchical 5 Cluster") # display dendogram
rect.hclust(fit, k=5, border="blue")
Lots of ways to get at number of clusters–but it is not likley to be 16!