## -- Attaching packages --------------------------------------- tidyverse 1.3.1 --
## v ggplot2 3.3.5 v purrr 0.3.4
## v tibble 3.1.3 v dplyr 1.0.7
## v tidyr 1.1.3 v stringr 1.4.0
## v readr 2.0.1 v forcats 0.5.1
## -- Conflicts ------------------------------------------ tidyverse_conflicts() --
## x dplyr::filter() masks stats::filter()
## x dplyr::lag() masks stats::lag()
## Loading required package: sp
## Loading required package: spData
## To access larger datasets in this package, install the spDataLarge
## package with: `install.packages('spDataLarge',
## repos='https://nowosad.github.io/drat/', type='source')`
## Loading required package: sf
## Linking to GEOS 3.9.0, GDAL 3.2.1, PROJ 7.2.1
## Loading required package: Matrix
##
## Attaching package: 'Matrix'
## The following objects are masked from 'package:tidyr':
##
## expand, pack, unpack
##
## Attaching package: 'spatialreg'
## The following objects are masked from 'package:spdep':
##
## as.spam.listw, as_dgRMatrix_listw, as_dsCMatrix_I,
## as_dsCMatrix_IrW, as_dsTMatrix_listw, can.be.simmed, cheb_setup,
## create_WX, do_ldet, eigen_pre_setup, eigen_setup, eigenw,
## errorsarlm, get.ClusterOption, get.coresOption, get.mcOption,
## get.VerboseOption, get.ZeroPolicyOption, GMargminImage, GMerrorsar,
## griffith_sone, gstsls, Hausman.test, impacts, intImpacts,
## Jacobian_W, jacobianSetup, l_max, lagmess, lagsarlm, lextrB,
## lextrS, lextrW, lmSLX, LU_prepermutate_setup, LU_setup,
## Matrix_J_setup, Matrix_setup, mcdet_setup, MCMCsamp, ME, mom_calc,
## mom_calc_int2, moments_setup, powerWeights, sacsarlm,
## SE_classic_setup, SE_interp_setup, SE_whichMin_setup,
## set.ClusterOption, set.coresOption, set.mcOption,
## set.VerboseOption, set.ZeroPolicyOption, similar.listw, spam_setup,
## spam_update_setup, SpatialFiltering, spautolm, spBreg_err,
## spBreg_lag, spBreg_sac, stsls, subgraph_eigenw, trW
## Loading required package: gtools
dataurl<-"https://github.com/coreysparks/data/blob/master/uscounty_data.Rdata?raw=true"
load(url(dataurl))
summary(usco)
## pblack phisp prural rucc
## Min. : 0.0000 Min. : 0.000 Min. : 0.00 Length:3101
## 1st Qu.: 0.4603 1st Qu.: 1.154 1st Qu.: 33.60 Class :character
## Median : 2.1182 Median : 2.353 Median : 59.50 Mode :character
## Mean : 9.0072 Mean : 7.114 Mean : 58.62
## 3rd Qu.:10.3887 3rd Qu.: 6.478 3rd Qu.: 87.20
## Max. :85.9945 Max. :97.571 Max. :100.00
##
## ppov pfemhh unemp medhouseval
## Min. : 2.50 Min. : 2.7 Min. : 1.800 Min. : 31500
## 1st Qu.:10.80 1st Qu.:12.7 1st Qu.: 4.100 1st Qu.: 84500
## Median :14.20 Median :15.5 Median : 5.100 Median :108900
## Mean :15.33 Mean :16.7 Mean : 5.408 Mean :130283
## 3rd Qu.:18.60 3rd Qu.:19.3 3rd Qu.: 6.200 3rd Qu.:152900
## Max. :51.00 Max. :47.8 Max. :16.000 Max. :912600
## NA's :1
## popdensity gini County Deaths
## Min. :9.269e+04 Min. :0.2070 Length:3101 Min. : 12
## 1st Qu.:1.715e+07 1st Qu.:0.4070 Class :character 1st Qu.: 646
## Median :4.405e+07 Median :0.4290 Mode :character Median : 1402
## Mean :2.279e+08 Mean :0.4313 Mean : 4182
## 3rd Qu.:1.058e+08 3rd Qu.:0.4530 3rd Qu.: 3368
## Max. :6.979e+10 Max. :0.6450 Max. :298108
## NA's :2
## Population Age.Adjusted.Rate state geoid
## Min. : 481 Min. : 300.9 Length:3101 Length:3101
## 1st Qu.: 56089 1st Qu.: 721.6 Class :character Class :character
## Median : 131260 Median : 806.4 Mode :character Mode :character
## Mean : 511096 Mean : 815.3
## 3rd Qu.: 344108 3rd Qu.: 906.4
## Max. :50036337 Max. :1435.7
## NA's :7
## geometry metro_cut
## MULTIPOLYGON :3101 Metropolitan :2049
## epsg:9311 : 0 Non-Metropolitan:1052
## +proj=laea...: 0
##
##
##
##
head(usco)
## Simple feature collection with 6 features and 17 fields
## Geometry type: MULTIPOLYGON
## Dimension: XY
## Bounding box: xmin: 1259088 ymin: -1452583 xmax: 1390058 ymax: -1107765
## Projected CRS: NAD27 / US National Atlas Equal Area
## pblack phisp prural rucc ppov pfemhh unemp medhouseval popdensity
## 1 3.810512 1.8326418 100.0 08 15.5 14.5 3.5 101000 25816818
## 2 18.489258 3.2830777 47.2 04 15.1 18.5 3.5 137000 67111751
## 3 31.616198 1.5051066 100.0 08 18.7 20.5 4.6 73100 17147879
## 4 12.566549 0.9458693 69.7 06 18.4 19.2 4.0 90600 35909345
## 5 24.557442 0.7867706 100.0 08 23.5 22.7 4.3 68100 22546153
## 6 20.917781 3.3047510 50.9 04 16.0 21.4 4.0 103300 86871603
## gini County Deaths Population Age.Adjusted.Rate state geoid
## 1 0.424 Cleburne County, AL 926 75007 979.53 01 01029
## 2 0.452 Coffee County, AL 2484 254213 851.78 01 01031
## 3 0.443 Coosa County, AL 687 55109 954.81 01 01037
## 4 0.475 Covington County, AL 2578 189254 963.49 01 01039
## 5 0.466 Crenshaw County, AL 876 69471 989.55 01 01041
## 6 0.424 Dale County, AL 2392 248927 876.17 01 01045
## geometry metro_cut
## 1 MULTIPOLYGON (((1349148 -11... Non-Metropolitan
## 2 MULTIPOLYGON (((1327060 -13... Metropolitan
## 3 MULTIPOLYGON (((1305589 -12... Non-Metropolitan
## 4 MULTIPOLYGON (((1306630 -14... Metropolitan
## 5 MULTIPOLYGON (((1315155 -13... Non-Metropolitan
## 6 MULTIPOLYGON (((1354333 -14... Metropolitan
usco <- st_transform(usco, crs = 2163)
library(GGally)
## Registered S3 method overwritten by 'GGally':
## method from
## +.gg ggplot2
usd<-usco
st_geometry(usd)<-NULL
usd%>%
select(unemp, medhouseval,pblack, state, gini)%>%
filter(complete.cases(.), state=="06")%>%
select(unemp, medhouseval,pblack, state, gini)%>%
ggpairs()
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
usd_sub<-usd%>%
select(unemp, medhouseval,pblack, gini, state)%>%
filter(complete.cases(.), state =="06")
dmat<-dist(usd_sub[, -4],
method="euclidean") #-4 drops state from the calculations
hc1<-hclust(d= dmat,
method="single")
plot(hc1,
hang=-1,
main="Single linkage cluster analysis of California County data")
library(scorecard)
##
## Attaching package: 'scorecard'
## The following object is masked from 'package:tidyr':
##
## replace_na
library(factoextra)
## Welcome! Want to learn more? See two factoextra-related books at https://goo.gl/ve3WBa
library(class)
library(RColorBrewer)
fviz_dend(hc1, k=3,
k_colors =brewer.pal(n=3, name="Accent"),
color_labels_by_k = TRUE,
ggtheme = theme_minimal())
## Warning: `guides(<scale> = FALSE)` is deprecated. Please use `guides(<scale> =
## "none")` instead.
groups<-cutree(hc1, k=4)
table(groups)
## groups
## 1 2 3 4
## 51 3 3 1
hc2<-hclust(d= dmat, method="ward.D")
plot(hc2,
hang=-1,
main="Ward's cluster analysis of California county data")
fviz_dend(hc2, k=3,
k_colors = brewer.pal(n=3, name="Accent"),
color_labels_by_k = TRUE,
ggtheme = theme_minimal())
## Warning: `guides(<scale> = FALSE)` is deprecated. Please use `guides(<scale> =
## "none")` instead.
groups<-cutree(hc2, k=3)
table(groups)
## groups
## 1 2 3
## 34 4 20
usd_sub$group1<-factor(cutree(hc2, k=3))
usd_sub%>%
select(-state)%>%
ggpairs(aes(color=group1))
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
tx<- usco%>%
filter(state =="06", complete.cases(unemp, medhouseval,pblack, state, gini))
tx$group <- factor(cutree(hc2, k=3))
tx%>%
ggplot()+
geom_sf(aes(fill = factor(group)))
library(ClusterR)
km2<-KMeans_rcpp(data=usd_sub[, c(1:4)], cluster=3, num_init = 10)
km2
## $clusters
## [1] 3 3 2 1 3 3 3 3 1 3 3 3 2 1 1 3 3 3 3 1 3 1 3 3 3 3 3 3 3 1 3 2 3 1 3 1 1 1
## [39] 1 1 1 3 3 3 1 1 3 1 2 2 3 3 1 3 3 3 3 1
##
## $centroids
## [,1] [,2] [,3] [,4]
## [1,] 5.136842 394800.0 3.328688 0.4450526
## [2,] 5.020000 702240.0 3.549608 0.4778000
## [3,] 7.820588 203829.4 3.803258 0.4347353
##
## $total_SSE
## [1] 1.445539e+12
##
## $best_initialization
## [1] 1
##
## $WCSS_per_cluster
## [,1] [,2] [,3]
## [1,] 75654300277 33916712024 46846731184
##
## $obs_per_cluster
## [,1] [,2] [,3]
## [1,] 19 5 34
##
## $between.SS_DIV_total.SS
## [1] 0.8917928
##
## attr(,"class")
## [1] "k-means clustering"
usd_sub$cluster<-as.factor(km2$cluster)
usd_sub%>%
select(-state, -group1)%>%
ggpairs(aes(color=cluster))
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
tx$km_group <- as.factor(km2$cluster)
tx%>%
ggplot()+
geom_sf(aes(fill = factor(km_group)))+
ggtitle("K-means clusters for California Counties, k=3")
library("factoextra")
my_data <- scale(usd_sub[, c(1:4)])
fviz_nbclust(my_data, kmeans, method = "wss") +
geom_vline(xintercept = 3, linetype = 2)
fviz_nbclust(my_data, kmeans, method = "silhouette")
fviz_nbclust(my_data, kmeans, method = "gap_stat")
Selected Variables: unemp, medhouseval,pblack, state, gini. Three methods were used to decide what k is: elbow method, silhouette method, and gap statistic. Two (elbow method and silhouette method) out of three methods identified 3 clusters. Thus, I selected 3 as my k number of clusters.Three groups (clusters) vary across the variables: blue group, green group, and red group. The highest proportion of unemployment is among the red group compared to the other two groups (blue and green). The highest median house value was among the green group, followed by the blue group, and then the red group. The proportion of blacks were roughly equal among the 3 groups. The green group has the highest value on the Gini index, followed by the blue group, and then the red group. Across California, most of the north and southeast is composed of the red group. The blue group is primary in the west of California with a small portion in the east of California. The green group occupied a small portion on the west of California.