Here are the steps on how to K-Means and K-Medoids clustering:
setwd("~/Documents/CT 2023 prescription counts by town")
library(tidyverse) # data manipulation
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr 1.1.4 ✔ readr 2.1.4
## ✔ forcats 1.0.0 ✔ stringr 1.5.1
## ✔ ggplot2 3.5.0 ✔ tibble 3.2.1
## ✔ lubridate 1.9.3 ✔ tidyr 1.3.0
## ✔ purrr 1.0.2
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(cluster) # clustering algorithms
library(factoextra) # clustering algorithms & visualization
## Welcome! Want to learn more? See two factoextra-related books at https://goo.gl/ve3WBa
df_PMP<- read.csv("Prescription_Monitoring_Program_CT_2023.csv", row.names = 1)
head(df_PMP,5)
## Naloxone Opiate.Agonist Opiate.Partial.Agonist Benzodiazepine
## Andover 10 1256 131 1217
## Ansonia 185 9629 1645 6559
## Ashford 28 1678 182 1490
## Avon 45 4425 421 6723
## Barkhamsted 12 1490 80 1142
## Stimulant Gabapentin
## Andover 939 817
## Ansonia 5425 4459
## Ashford 1562 1185
## Avon 6803 3313
## Barkhamsted 1194 862
df <- scale(df_PMP)
head(df, 5)
## Naloxone Opiate.Agonist Opiate.Partial.Agonist Benzodiazepine
## Andover -0.4316241 -0.7170122 -0.6779584 -0.84984926
## Ansonia 0.1340374 0.2623889 0.1031724 -0.11800769
## Ashford -0.3734418 -0.6676502 -0.6516455 -0.81244890
## Avon -0.3184918 -0.3463300 -0.5283362 -0.09554007
## Barkhamsted -0.4251594 -0.6896409 -0.7042712 -0.86012409
## Stimulant Gabapentin
## Andover -0.90373498 -0.70622561
## Ansonia -0.17529756 -0.02757079
## Ashford -0.80257214 -0.63765203
## Avon 0.04846231 -0.24111786
## Barkhamsted -0.86232804 -0.69784025
set.seed(123)
fviz_nbclust(df, kmeans, method = "wss")
set.seed(123)
fviz_nbclust(df, kmeans, method = "silhouette")
set.seed(123)
gap_stat <- clusGap(df, FUN = kmeans, nstart = 25, K.max = 10, B = 50)
# Print the result
print(gap_stat, method = "firstmax")
## Clustering Gap statistic ["clusGap"] from call:
## clusGap(x = df, FUNcluster = kmeans, K.max = 10, B = 50, nstart = 25)
## B=50 simulated reference sets, k = 1..10; spaceH0="scaledPCA"
## --> Number of clusters (method 'firstmax'): 9
## logW E.logW gap SE.sim
## [1,] 4.624627 5.495884 0.8712566 0.02664885
## [2,] 4.163791 5.132332 0.9685413 0.02026650
## [3,] 4.024020 5.002492 0.9784720 0.02089624
## [4,] 3.731987 4.901349 1.1693619 0.01698707
## [5,] 3.628889 4.814277 1.1853876 0.01758481
## [6,] 3.544905 4.744819 1.1999139 0.01794194
## [7,] 3.410301 4.689321 1.2790199 0.01628376
## [8,] 3.352308 4.642488 1.2901800 0.01577213
## [9,] 3.274861 4.602972 1.3281108 0.01458306
## [10,] 3.246693 4.567509 1.3208152 0.01434512
fviz_gap_stat(gap_stat)
k9 <- kmeans(df, centers = 9, nstart = 25)
# print(k9)
fviz_cluster(k9, data = df, main = "K-Means clustering (k =9) prescription counts by town, 2023")
library(fpc)
library(cluster)
library(factoextra)
pam.res.9 <- pam(df, 9)
# print(pam.res.9)
fviz_cluster(pam.res.9, data = df, main = "K-Medoids (PAM) clustering (k =9) prescription counts by town, 2023")
# K-Means clusters
print(k9)
## K-means clustering with 9 clusters of sizes 25, 5, 8, 3, 1, 11, 73, 38, 5
##
## Cluster means:
## Naloxone Opiate.Agonist Opiate.Partial.Agonist Benzodiazepine Stimulant
## 1 -0.19967057 0.03174478 -0.1331067 0.2695039 0.4991452
## 2 1.82908242 1.98926011 3.1466932 1.9129828 1.5483679
## 3 0.85889217 1.60304122 1.2390754 1.7769099 1.4170024
## 4 4.37919213 4.13577773 3.1929557 2.7532809 1.8404898
## 5 7.86258942 4.37498412 4.4117742 2.9971824 3.2241312
## 6 0.36794208 0.79414056 1.1316253 0.8879289 0.6270081
## 7 -0.38566272 -0.65222605 -0.5858103 -0.7577268 -0.7583669
## 8 -0.22152126 -0.19371933 -0.1963977 -0.2068543 -0.2137517
## 9 0.09977446 1.17834462 0.2940696 2.3264966 3.2568346
## Gabapentin
## 1 0.07130933
## 2 2.12429794
## 3 1.34853603
## 4 4.04249474
## 5 4.89276985
## 6 0.84072861
## 7 -0.65124984
## 8 -0.23503234
## 9 1.40233741
##
## Clustering vector:
## Andover Ansonia Ashford Avon
## 7 8 7 8
## Barkhamsted Beacon Falls Berlin Bethany
## 7 7 8 7
## Bethel Bethlehem Bloomfield Bolton
## 1 7 8 7
## Bozrah Branford Bridgeport Bridgewater
## 7 6 4 7
## Bristol Brookfield Brooklyn Burlington
## 2 8 7 7
## Canaan Canterbury Canton Chaplin
## 7 7 7 7
## Cheshire Chester Clinton Colchester
## 1 7 8 8
## Colebrook Columbia Cornwall Coventry
## 7 7 7 8
## Cromwell Danbury Darien Deep River
## 8 3 1 7
## Derby Durham East Hampton East Granby
## 8 7 8 7
## East Haddam East Hartford East Haven East Lyme
## 7 6 6 1
## East Windsor Eastford Easton Ellington
## 7 7 8 8
## Enfield Essex Fairfield Farmington
## 6 7 9 1
## Franklin Glastonbury Goshen Granby
## 7 1 7 7
## Greenwich Griswold Groton Guilford
## 9 8 6 1
## Haddam Hamden Hampton Hartford
## 7 3 7 4
## Hartland Harwinton Hebron Kent
## 7 7 7 7
## Killingly Killingworth Lebanon Ledyard
## 1 7 7 8
## Lisbon Litchfield Lyme Madison
## 7 7 7 1
## Manchester Mansfield Marlborough Meriden
## 3 8 7 2
## Middlebury Middlefield Middletown Milford
## 7 7 2 3
## Monroe Montville Morris Naugatuck
## 8 1 7 6
## New Britain New Canaan New Fairfield New Hartford
## 2 1 8 7
## New Haven New London New Milford Newington
## 5 6 1 1
## Newtown Norfolk North Stonington North Branford
## 1 7 7 8
## North Canaan North Haven Norwalk Norwich
## 7 1 9 2
## Old Lyme Old Saybrook Orange Oxford
## 7 8 8 8
## Plainfield Plainville Plymouth Pomfret
## 8 8 8 7
## Portland Preston Prospect Putnam
## 8 7 7 8
## Redding Ridgefield Rocky Hill Roxbury
## 7 1 8 7
## Salem Salisbury Scotland Seymour
## 7 7 7 8
## Sharon Shelton Sherman Simsbury
## 7 6 7 1
## Somers South Windsor Southbury Southington
## 8 1 8 6
## Sprague Stafford Stamford Sterling
## 7 8 9 7
## Stonington Stratford Suffield Thomaston
## 1 3 8 7
## Thompson Tolland Torrington Trumbull
## 7 8 3 1
## Union Vernon Voluntown Wallingford
## 7 6 7 3
## Warren Washington Waterbury Waterford
## 7 7 4 1
## Watertown West Hartford West Haven Westbrook
## 1 9 3 7
## Weston Westport Wethersfield Willington
## 7 1 1 7
## Wilton Winchester Windham Windsor
## 8 8 6 1
## Windsor Locks Wolcott Woodbridge Woodbury
## 8 8 7 7
## Woodstock
## 7
##
## Within cluster sum of squares by cluster:
## [1] 8.366051 7.857811 6.466731 12.180094 0.000000 8.752020 7.572513
## [8] 7.266665 8.637783
## (between_SS / total_SS = 93.3 %)
##
## Available components:
##
## [1] "cluster" "centers" "totss" "withinss" "tot.withinss"
## [6] "betweenss" "size" "iter" "ifault"
# K-Medoids (PAM) clusters
print(pam.res.9)
## Medoids:
## ID Naloxone Opiate.Agonist Opiate.Partial.Agonist
## Chaplin 24 -0.42515939 -0.74052339 -0.6485499
## Farmington 52 -0.23121831 0.01323991 -0.1981356
## Ellington 48 -0.27970358 -0.25310383 -0.3271202
## New Hartford 92 -0.37667412 -0.56027056 -0.4875771
## Vernon 146 0.13726974 0.82911499 0.8321934
## Bridgeport 15 5.88115805 4.92568518 2.0756050
## New Britain 89 2.13163051 2.58321717 3.6394144
## Wallingford 148 0.54131365 1.35232778 1.5421247
## West Hartford 154 -0.06313605 0.71389822 0.2512468
## Benzodiazepine Stimulant Gabapentin
## Chaplin -0.8939625 -0.9142897 -0.74032606
## Farmington 0.1518777 0.2891097 0.06373642
## Ellington -0.1989733 -0.2901003 -0.28583976
## New Hartford -0.6402428 -0.6565924 -0.56535163
## Vernon 0.9913995 0.4862392 0.92761445
## Bridgeport 3.0536255 1.6925615 3.85429008
## New Britain 2.0243619 1.2604679 2.50536579
## Wallingford 1.7747522 1.4439575 1.49055136
## West Hartford 1.8921592 2.9326588 1.23321409
## Clustering vector:
## Andover Ansonia Ashford Avon
## 1 2 1 3
## Barkhamsted Beacon Falls Berlin Bethany
## 1 4 2 4
## Bethel Bethlehem Bloomfield Bolton
## 2 1 3 1
## Bozrah Branford Bridgeport Bridgewater
## 1 5 6 1
## Bristol Brookfield Brooklyn Burlington
## 7 3 4 4
## Canaan Canterbury Canton Chaplin
## 1 4 4 1
## Cheshire Chester Clinton Colchester
## 2 1 3 2
## Colebrook Columbia Cornwall Coventry
## 1 4 1 3
## Cromwell Danbury Darien Deep River
## 3 8 2 4
## Derby Durham East Hampton East Granby
## 3 4 3 1
## East Haddam East Hartford East Haven East Lyme
## 4 5 5 2
## East Windsor Eastford Easton Ellington
## 4 1 3 3
## Enfield Essex Fairfield Farmington
## 8 4 9 2
## Franklin Glastonbury Goshen Granby
## 1 2 1 4
## Greenwich Griswold Groton Guilford
## 9 3 5 2
## Haddam Hamden Hampton Hartford
## 4 8 1 7
## Hartland Harwinton Hebron Kent
## 1 4 4 1
## Killingly Killingworth Lebanon Ledyard
## 2 4 4 3
## Lisbon Litchfield Lyme Madison
## 1 4 1 2
## Manchester Mansfield Marlborough Meriden
## 8 3 4 7
## Middlebury Middlefield Middletown Milford
## 4 4 7 8
## Monroe Montville Morris Naugatuck
## 3 2 1 5
## New Britain New Canaan New Fairfield New Hartford
## 7 2 3 4
## New Haven New London New Milford Newington
## 6 5 2 2
## Newtown Norfolk North Stonington North Branford
## 2 1 1 3
## North Canaan North Haven Norwalk Norwich
## 1 2 9 8
## Old Lyme Old Saybrook Orange Oxford
## 4 3 3 3
## Plainfield Plainville Plymouth Pomfret
## 2 2 3 1
## Portland Preston Prospect Putnam
## 3 4 4 3
## Redding Ridgefield Rocky Hill Roxbury
## 4 2 3 1
## Salem Salisbury Scotland Seymour
## 1 1 1 2
## Sharon Shelton Sherman Simsbury
## 1 5 1 2
## Somers South Windsor Southbury Southington
## 4 2 2 8
## Sprague Stafford Stamford Sterling
## 1 3 9 1
## Stonington Stratford Suffield Thomaston
## 2 5 3 4
## Thompson Tolland Torrington Trumbull
## 4 3 8 2
## Union Vernon Voluntown Wallingford
## 1 5 1 8
## Warren Washington Waterbury Waterford
## 1 1 7 2
## Watertown West Hartford West Haven Westbrook
## 2 9 8 4
## Weston Westport Wethersfield Willington
## 4 2 2 4
## Wilton Winchester Windham Windsor
## 3 3 5 2
## Windsor Locks Wolcott Woodbridge Woodbury
## 3 3 4 4
## Woodstock
## 4
## Objective function:
## build swap
## 0.4761830 0.4746105
##
## Available components:
## [1] "medoids" "id.med" "clustering" "objective" "isolation"
## [6] "clusinfo" "silinfo" "diss" "call" "data"
df_PMP %>%
mutate(Cluster = k9$cluster) %>%
group_by(Cluster) %>%
summarise_all("mean")
## # A tibble: 9 × 7
## Cluster Naloxone Opiate.Agonist Opiate.Partial.Agonist Benzodiazepine
## <int> <dbl> <dbl> <dbl> <dbl>
## 1 1 81.8 7657. 1187. 9388.
## 2 2 709. 24392. 7544 21384
## 3 3 409. 21090. 3847. 20391.
## 4 4 1498. 42743 7634. 27518.
## 5 5 2576 44788 9996 29298
## 6 6 257. 14175 3638. 13902.
## 7 7 24.2 1810. 310. 1889.
## 8 8 75 5730. 1064. 5910.
## 9 9 174. 17460. 2015 24402.
## # ℹ 2 more variables: Stimulant <dbl>, Gabapentin <dbl>
Note. A disadvantage of K-means is that it’s sensitive to outliers and different results can occur if you change the ordering of your data. The Partitioning Around Medoids (PAM) clustering approach is less sensitive to outliers and provides a robust alternative to k-means to deal with these situations.