## -- Attaching packages --------------------------------------- tidyverse 1.3.1 --
## v ggplot2 3.3.5 v purrr 0.3.4
## v tibble 3.1.4 v dplyr 1.0.7
## v tidyr 1.1.3 v stringr 1.4.0
## v readr 2.0.1 v forcats 0.5.1
## -- Conflicts ------------------------------------------ tidyverse_conflicts() --
## x dplyr::filter() masks stats::filter()
## x dplyr::lag() masks stats::lag()
## Loading required package: sp
## Loading required package: spData
## To access larger datasets in this package, install the spDataLarge
## package with: `install.packages('spDataLarge',
## repos='https://nowosad.github.io/drat/', type='source')`
## Loading required package: sf
## Linking to GEOS 3.9.0, GDAL 3.2.1, PROJ 7.2.1
## Loading required package: Matrix
##
## Attaching package: 'Matrix'
## The following objects are masked from 'package:tidyr':
##
## expand, pack, unpack
##
## Attaching package: 'spatialreg'
## The following objects are masked from 'package:spdep':
##
## as.spam.listw, as_dgRMatrix_listw, as_dsCMatrix_I,
## as_dsCMatrix_IrW, as_dsTMatrix_listw, can.be.simmed, cheb_setup,
## create_WX, do_ldet, eigen_pre_setup, eigen_setup, eigenw,
## errorsarlm, get.ClusterOption, get.coresOption, get.mcOption,
## get.VerboseOption, get.ZeroPolicyOption, GMargminImage, GMerrorsar,
## griffith_sone, gstsls, Hausman.test, impacts, intImpacts,
## Jacobian_W, jacobianSetup, l_max, lagmess, lagsarlm, lextrB,
## lextrS, lextrW, lmSLX, LU_prepermutate_setup, LU_setup,
## Matrix_J_setup, Matrix_setup, mcdet_setup, MCMCsamp, ME, mom_calc,
## mom_calc_int2, moments_setup, powerWeights, sacsarlm,
## SE_classic_setup, SE_interp_setup, SE_whichMin_setup,
## set.ClusterOption, set.coresOption, set.mcOption,
## set.VerboseOption, set.ZeroPolicyOption, similar.listw, spam_setup,
## spam_update_setup, SpatialFiltering, spautolm, spBreg_err,
## spBreg_lag, spBreg_sac, stsls, subgraph_eigenw, trW
## Registered S3 method overwritten by 'GGally':
## method from
## +.gg ggplot2
load("C:/Users/maman/OneDrive/DEM Fall 2021/Spatial Demography/adi_tract_final.Rdata")
head(data_final)
## Simple feature collection with 6 features and 16 fields
## Geometry type: MULTIPOLYGON
## Dimension: XY
## Bounding box: xmin: -98.56606 ymin: 29.36662 xmax: -98.44832 ymax: 29.54044
## Geodetic CRS: NAD83
## GEOID NAME hispanic DP05_0071PM
## 1 48029190601 Census Tract 1906.01, Bexar County, Texas 86.2 6.7
## 2 48029181100 Census Tract 1811, Bexar County, Texas 47.5 5.8
## 3 48029140700 Census Tract 1407, Bexar County, Texas 79.9 6.6
## 4 48029180602 Census Tract 1806.02, Bexar County, Texas 67.6 7.0
## 5 48029140800 Census Tract 1408, Bexar County, Texas 90.4 3.2
## 6 48029170200 Census Tract 1702, Bexar County, Texas 98.9 1.0
## nh_black DP05_0078PM trfips adi_st_avg adi_nat_avg diabetes uninsured high_bp
## 1 0.7 1.1 48029 8.000000 84.66667 16.6 44.8 33.7
## 2 1.8 3.0 48029 4.000000 51.25000 13.7 21.5 37.0
## 3 4.7 3.2 48029 7.800000 83.00000 17.8 37.4 36.6
## 4 6.8 5.0 48029 6.000000 69.66667 19.0 35.0 43.2
## 5 1.7 1.3 48029 8.600000 87.60000 19.7 45.4 37.6
## 6 0.2 0.3 48029 9.666667 95.66667 24.7 53.5 41.9
## curr_smk high_chol obesity pop geometry
## 1 17.5 33.6 42.3 3768 MULTIPOLYGON (((-98.52602 2...
## 2 11.7 36.8 34.4 5665 MULTIPOLYGON (((-98.56225 2...
## 3 15.3 35.5 41.8 4734 MULTIPOLYGON (((-98.47348 2...
## 4 12.4 39.5 37.4 3155 MULTIPOLYGON (((-98.56606 2...
## 5 19.5 36.4 45.1 5028 MULTIPOLYGON (((-98.4778 29...
## 6 21.9 39.7 47.8 5382 MULTIPOLYGON (((-98.5283 29...
Selecting 4 variables: the population proportion that is Hispanic, area deprivation index, uninsured rate, and obesity rate.
k_clstr <- data_final %>%
select(hispanic, adi_st_avg, uninsured, obesity) %>%
filter(complete.cases(adi_st_avg))
Using K-means to create 3 cluster groups
library(ClusterR)
## Loading required package: gtools
test<-k_clstr
st_geometry(test)<-NULL # Getting rid of the geometry variable
km<-KMeans_rcpp(data=test[, c(1:4)], cluster=3, num_init =10)
km
## $clusters
## [1] 2 3 2 3 2 2 1 1 2 3 3 3 3 3 2 2 3 2 2 1 3 1 3 1 1 3 3 3 3 1 2 2 2 3 3 3 2
## [38] 1 2 2 3 1 3 3 1 3 2 1 2 3 1 2 2 2 3 3 2 2 3 2 1 2 3 3 1 3 3 1 3 3 2 3 2 3
## [75] 2 2 1 2 2 3 1 2 2 1 3 3 3 1 1 2 2 2 1 1 3 3 1 2 2 3 2 3 2 2 3 2 2 2 2 1 1
## [112] 3 3 2 2 2 2 1 3 2 2 2 1 2 1 1 3 3 3 3 1 2 1 1 1 2 1 2 2 2 1 1 3 3 2 3 3 1
## [149] 2 1 2 3 1 3 3 1 2 1 2 3 1 3 1 1 3 2 3 2 2 1 1 2 3 3 1 2 1 1 3 3 1 2 2 3 1
## [186] 1 3 1 1 2 3 3 3 2 2 2 3 2 2 2 2 3 1 3 2 3 3 2 3 3 2 1 2 1 2 2 1 1 1 1 2 1
## [223] 3 2 3 1 2 3 2 3 1 3 1 1 1 3 3 1 3 1 3 2 3 2 2 2 2 3 3 3 2 2 3 1 1 1 2 2 3
## [260] 3 2 3 2 2 3 3 1 3 1 3 2 3 1 2 1 1 1 3 2 2 3 2 3 1 2 3 2 3 3 3 2 1 2 1 1 2
## [297] 3 2 1 2 3 3 1 3 3 1 3 1 3 3 2 3 3 3 3 3 3 1 1 1 1 2 2 1 2 1 1 1 3 3 1 3 2
## [334] 3 2 1 1 3 3 3 1 3 2 3 3 1 3 1 1 3 1 1 2 3 1 3 1 1 3 1 2 1 1 1 1 1 3 2 2 3
## [371] 1 1 2 1 3 3 3 2 2 1 1 3 1 3 3 2 3 1 3 3 1 1 1 2 2 1 3 3 2 1 2 2 2 1 3 3 3
## [408] 3 3 3 1 1 2 3 1 1 1 2 2 1 2 3 3 3 1 3 3 1 3 3 1 2 1 2 3 2 3 2 3 1 2 3 1 1
## [445] 2 2 1 3 3 2 3 1 1 1 2 3 2 2 1 1 1 1 3 2 2 1 3 3 1 1 2 3 1
##
## $centroids
## [,1] [,2] [,3] [,4]
## [1,] 28.98733 2.836444 18.30067 33.13533
## [2,] 84.93133 8.058159 42.67867 42.88667
## [3,] 53.56358 5.036141 27.24798 36.86358
##
## $total_SSE
## [1] 343289.8
##
## $best_initialization
## [1] 2
##
## $WCSS_per_cluster
## [,1] [,2] [,3]
## [1,] 15086.5 17604.32 19539.72
##
## $obs_per_cluster
## [,1] [,2] [,3]
## [1,] 150 150 173
##
## $between.SS_DIV_total.SS
## [1] 0.8478529
##
## attr(,"class")
## [1] "k-means clustering"
Cluster descriptive statistics
test$cluster<-as.factor(km$cluster)
test %>%
ggpairs(aes(color=cluster))
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
The analysis resulted in three clusters. The first cluster contains tracts with lowest levels of area deprivation, obesity, uninsured rate, and proportion Hispanic population. Conversely, the tracts in cluster 2 contain the highest rates of all the previously mentioned variables. Lastly, the third cluster mainly represents tracts with intermediate values of each variable.
Mapping the clusters
k_clstr$km_group <- as.factor(km$cluster)
k_clstr %>%
ggplot()+
geom_sf(aes(fill = factor(km_group)))+
ggtitle("K-means deprivation clusters, San Antonio, k=3")
Finding the optimal number of clusters
library(factoextra)
## Welcome! Want to learn more? See two factoextra-related books at https://goo.gl/ve3WBa
fviz_nbclust(test, kmeans, method="wss")+
geom_vline(xintercept =3, linetype =2)
According to the elbow method, which plots within cluster variation against the number of clusters, the optimal number of clusters appears to be 3.