Cluster Final data

library(readr)
library(readr)
# Load the state-level data set into a tibble.
# NOTE(review): the path is absolute and machine-specific (OneDrive folder
# under the user's home directory); the script only runs where this file
# exists at exactly this location.
Harshitha_29_11_2022 <- read_csv(
  file = "~/Library/CloudStorage/OneDrive-Personal/2_Apple/Harshitha_analysis/20_11_2022/27_11_2022/Harshitha_29_11_2022.csv"
)

## Rows: 28 Columns: 8
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (1): State
## dbl (7): prevOfAlcoholCurrentUse, prevOfAlcoholDependence, RTAwithDUI, Compo...
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.

# Inspect the structure of the loaded data (column names and types) before
# any transformation is applied.
str(Harshitha_29_11_2022)

## spc_tbl_ [28 × 8] (S3: spec_tbl_df/tbl_df/tbl/data.frame)
##  $ State                  : chr [1:28] "Kerala" "Andra pradesh" "Maharashtra" "Punjab" ...
##  $ prevOfAlcoholCurrentUse: num [1:28] 12.4 13.7 5.7 28.5 8.9 3.5 6.4 14.2 16.8 16.7 ...
##  $ prevOfAlcoholDependence: num [1:28] 0.6 7.2 2.5 6 0.7 0.3 2.6 14.2 1.8 0.9 ...
##  $ RTAwithDUI             : num [1:28] 157 1345 188 112 322 ...
##  $ Composite score        : num [1:28] 26 28 20 21 18 20 19 28 24 21 ...
##  $ SDI                    : num [1:28] 0.66 0.54 0.62 0.62 0.63 0.59 0.57 0.62 0.58 0.54 ...
##  $ IPV                    : num [1:28] 16 45 23 21 7 14 24 45 46 35 ...
##  $ ratio                  : num [1:28] 0.0445 0.0825 0.0829 0.1646 0.2052 ...
##  - attr(*, "spec")=
##   .. cols(
##   ..   State = col_character(),
##   ..   prevOfAlcoholCurrentUse = col_double(),
##   ..   prevOfAlcoholDependence = col_double(),
##   ..   RTAwithDUI = col_double(),
##   ..   `Composite score` = col_double(),
##   ..   SDI = col_double(),
##   ..   IPV = col_double(),
##   ..   ratio = col_double()
##   .. )
##  - attr(*, "problems")=<externalptr>

# Treat the state name as a categorical label rather than free text.
Harshitha_29_11_2022[["State"]] <- as.factor(Harshitha_29_11_2022[["State"]])

# Standardise each indicator column in place: scale() centres and scales the
# values, and as.numeric() drops the matrix shape and attributes it returns
# so the column stays a plain numeric vector.
indicator_cols <- c(
  "prevOfAlcoholCurrentUse",
  "prevOfAlcoholDependence",
  "RTAwithDUI",
  "Composite score",
  "SDI",
  "IPV",
  "ratio"
)
for (indicator in indicator_cols) {
  Harshitha_29_11_2022[[indicator]] <-
    as.numeric(scale(Harshitha_29_11_2022[[indicator]]))
}

library(tidyverse)

## ── Attaching packages ─────────────────────────────────────── tidyverse 1.3.2 ──
## ✔ ggplot2 3.4.0      ✔ dplyr   1.0.10
## ✔ tibble  3.1.8      ✔ stringr 1.4.1 
## ✔ tidyr   1.2.1      ✔ forcats 0.5.2 
## ✔ purrr   0.3.5      
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()

# Preview the first rows of the data set.
head(Harshitha_29_11_2022)

library(cluster)
library(factoextra)

## Welcome! Want to learn more? See two factoextra-related books at https://goo.gl/ve3WBa

# Pairwise Gower dissimilarities between rows, computed on every column
# except the first one (State), then expanded to a full square matrix.
gowerDist <- daisy(Harshitha_29_11_2022[, -1], metric = "gower")
gowerMat <- as.matrix(gowerDist)

# Average silhouette width of a PAM partition for each k in 2..8.
# Position 1 is left as NA because no fit is made for a single cluster, so
# that position i of sil_width corresponds to k = i.
candidate_k <- 2:8
sil_width <- c(
  NA,
  vapply(
    candidate_k,
    function(k) pam(gowerMat, k = k, diss = TRUE)$silinfo$avg.width,
    numeric(1)
  )
)

# Silhouette width against number of clusters, as points joined by a line.
k_axis <- seq_along(sil_width)
plot(
  k_axis, sil_width,
  xlab = "Number of clusters",
  ylab = "Silhouette Width"
)
lines(k_axis, sil_width)

# Final PAM partition with four clusters, fitted on the Gower dissimilarity
# matrix (diss = TRUE tells pam() the input is already a dissimilarity).
pam_fit <- pam(x = gowerMat, k = 4, diss = TRUE)

# Print medoids, cluster membership and silhouette information.
summary(pam_fit)

## Medoids:
##      ID       
## [1,] "15" "15"
## [2,] "2"  "2" 
## [3,] "10" "10"
## [4,] "13" "13"
## Clustering vector:
##  1  2  3  4  5  6  7  8  9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 
##  1  2  1  1  1  1  1  2  3  3  3  3  4  1  1  1  2  1  3  3  1  4  3  1  3  3 
## 27 28 
##  1  4 
## Objective function:
##    build     swap 
## 0.140639 0.140639 
## 
## Numerical information per cluster:
##      size  max_diss   av_diss  diameter separation
## [1,]   13 0.2107602 0.1521802 0.4146999 0.09887301
## [2,]    3 0.2160501 0.1180801 0.3248355 0.13072109
## [3,]    9 0.2070331 0.1274466 0.3399221 0.09887301
## [4,]    3 0.2318496 0.1527632 0.4220424 0.17542892
## 
## Isolated clusters:
##  L-clusters: character(0)
##  L*-clusters: character(0)
## 
## Silhouette plot information:
##    cluster neighbor    sil_width
## 14       1        3  0.279313308
## 16       1        3  0.257892199
## 5        1        3  0.252330759
## 21       1        3  0.244288163
## 24       1        3  0.190214420
## 15       1        3  0.182606557
## 1        1        3  0.142762003
## 4        1        3  0.118574559
## 3        1        3  0.065467822
## 6        1        3  0.060531284
## 27       1        3  0.002237028
## 18       1        3 -0.065660742
## 7        1        3 -0.069573032
## 8        2        4  0.370100710
## 2        2        3  0.369303049
## 17       2        3 -0.193883364
## 25       3        1  0.373049180
## 10       3        1  0.352082543
## 9        3        1  0.288775331
## 26       3        2  0.271520632
## 23       3        1  0.268699401
## 11       3        2  0.234128241
## 19       3        1  0.205045998
## 12       3        1  0.185994916
## 20       3        4  0.066886602
## 13       4        3  0.182919096
## 28       4        3  0.010888472
## 22       4        3 -0.056313886
## Average silhouette width per cluster:
## [1] 0.12776803 0.18184013 0.24957587 0.04583123
## Average silhouette width of total data set:
## [1] 0.163935
## 
## Available components:
## [1] "medoids"    "id.med"     "clustering" "objective"  "isolation" 
## [6] "clusinfo"   "silinfo"    "diss"       "call"

# Draw the default plot(s) for the fitted PAM object.
plot(pam_fit)

library(Rtsne)

# t-SNE starts from a random initialisation, so without a fixed seed the
# embedding (and therefore the plot) differs on every run. Fix the RNG state
# to make the figure reproducible.
set.seed(2022)

# Embed the observations in 2-D from the precomputed Gower dissimilarities.
# NOTE(review): perplexity = 1 is unusually small; confirm this value is
# intended and not a placeholder.
tsne_obj <- Rtsne(gowerMat, is_distance = TRUE, perplexity = 1)

# Two embedding coordinates per observation, labelled with the PAM cluster.
tsne_data <- tsne_obj$Y %>%
  data.frame() %>%
  setNames(c("X", "Y")) %>%
  mutate(cluster = factor(pam_fit$clustering))

# Scatter plot of the embedding, coloured by cluster membership.
ggplot(tsne_data, aes(x = X, y = Y)) +
  geom_point(aes(color = cluster))

# Attach each row's PAM cluster label to the data.
Harshitha_29_11_2022$cluster <- pam_fit$clustering

# Per-cluster mean of every numeric indicator. Columns are selected by type
# rather than by position so the summary does not silently pick the wrong
# columns if the column order changes; the grouping column itself is never
# selected by across().
tab1 <- Harshitha_29_11_2022 %>%
  group_by(cluster) %>%
  summarise(across(where(is.numeric), mean))

# Per-cluster size and a comma-separated list of the member states.
tab2 <- Harshitha_29_11_2022 %>%
  group_by(cluster) %>%
  summarise(
    "NumOfStates" = n(),
    "States" = paste0(State, collapse = ", ")
  )

# Combine both summaries; the join key is stated explicitly instead of being
# guessed from the shared column names.
tableSummary <- left_join(tab1, tab2, by = "cluster")

## Joining, by = "cluster"

# Name of the medoid state of each cluster: pam_fit$id.med indexes the medoid
# rows, one per cluster, matching the cluster-ordered rows of tableSummary.
# NOTE(review): the column label is spelled "medioids" (sic); kept unchanged
# so the rendered row label stays the same.
tableSummary$medioids <- Harshitha_29_11_2022$State[pam_fit$id.med]

# Round the cluster means BEFORE transposing. t() coerces the mixed-type
# table to a character matrix, after which a later kable(..., digits = 2)
# can no longer round anything and the means print with 7+ decimals.
tableSummary <- tableSummary %>%
  mutate(across(where(is.double), ~ round(.x, 2)))

# One column per cluster, one row per summary statistic.
tableSummary <- t(tableSummary)
library(kableExtra)

## 
## Attaching package: 'kableExtra'
## 
## The following object is masked from 'package:dplyr':
## 
##     group_rows

# Render the transposed cluster summary as a formatted table.
kable(tableSummary,digits = 2)

cluster	1	2	3	4
prevOfAlcoholCurrentUse	-0.1037789	-0.5459735	-0.2051326	1.6110800
prevOfAlcoholDependence	-0.3077175	0.9496820	-0.2617974	1.1691529
RTAwithDUI	-0.43095350	0.79048993	-0.03339319	1.17715481
Composite score	-0.4875734	1.6261806	-0.1527501	0.9448880
SDI	0.8182033	-0.5234569	-0.6537395	-1.0608726
IPV	-0.7995955	1.2210100	0.5288395	0.6573854
ratio	0.3441939	-1.1012991	-0.1744223	0.1330588
NumOfStates	13	3	9	3
States	Kerala, Maharashtra, Punjab, Himachalpradesh, Jammu and Kashm, Karnataka, Sikkim, Chandigarh, Goa, Delhi, Andaman and Nic, Uttarakhand, Dadra and Nagar	Andra pradesh, Tamil nadu, Jharkand	Telangana, West bengal, Meghalaya, Haryana, Assam, Arunachal Prade, Rajasthan, Madhya Pradesh, Odisha	Chattisgarh, Tripura, Uttar Pradesh
medioids	Chandigarh	Andra pradesh	West bengal	Chattisgarh