library(readr)
library(readr)
Harshitha_29_11_2022 <- read_csv("~/Library/CloudStorage/OneDrive-Personal/2_Apple/Harshitha_analysis/20_11_2022/27_11_2022/Harshitha_29_11_2022.csv")
## Rows: 28 Columns: 8
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (1): State
## dbl (7): prevOfAlcoholCurrentUse, prevOfAlcoholDependence, RTAwithDUI, Compo...
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
str(Harshitha_29_11_2022)
## spc_tbl_ [28 × 8] (S3: spec_tbl_df/tbl_df/tbl/data.frame)
##  $ State                  : chr [1:28] "Kerala" "Andra pradesh" "Maharashtra" "Punjab" ...
##  $ prevOfAlcoholCurrentUse: num [1:28] 12.4 13.7 5.7 28.5 8.9 3.5 6.4 14.2 16.8 16.7 ...
##  $ prevOfAlcoholDependence: num [1:28] 0.6 7.2 2.5 6 0.7 0.3 2.6 14.2 1.8 0.9 ...
##  $ RTAwithDUI             : num [1:28] 157 1345 188 112 322 ...
##  $ Composite score        : num [1:28] 26 28 20 21 18 20 19 28 24 21 ...
##  $ SDI                    : num [1:28] 0.66 0.54 0.62 0.62 0.63 0.59 0.57 0.62 0.58 0.54 ...
##  $ IPV                    : num [1:28] 16 45 23 21 7 14 24 45 46 35 ...
##  $ ratio                  : num [1:28] 0.0445 0.0825 0.0829 0.1646 0.2052 ...
##  - attr(*, "spec")=
##   .. cols(
##   ..   State = col_character(),
##   ..   prevOfAlcoholCurrentUse = col_double(),
##   ..   prevOfAlcoholDependence = col_double(),
##   ..   RTAwithDUI = col_double(),
##   ..   `Composite score` = col_double(),
##   ..   SDI = col_double(),
##   ..   IPV = col_double(),
##   ..   ratio = col_double()
##   .. )
##  - attr(*, "problems")=<externalptr>
Harshitha_29_11_2022$State <- as.factor(Harshitha_29_11_2022$State)
Harshitha_29_11_2022$prevOfAlcoholCurrentUse <- as.numeric(scale(Harshitha_29_11_2022$prevOfAlcoholCurrentUse))
Harshitha_29_11_2022$prevOfAlcoholDependence <- as.numeric(scale(Harshitha_29_11_2022$prevOfAlcoholDependence))
Harshitha_29_11_2022$RTAwithDUI <- as.numeric(scale(Harshitha_29_11_2022$RTAwithDUI))
Harshitha_29_11_2022$`Composite score` <- as.numeric(scale(Harshitha_29_11_2022$`Composite score`))
Harshitha_29_11_2022$SDI <- as.numeric(scale(Harshitha_29_11_2022$SDI))
Harshitha_29_11_2022$IPV <- as.numeric(scale(Harshitha_29_11_2022$IPV))
Harshitha_29_11_2022$ratio <- as.numeric(scale(Harshitha_29_11_2022$ratio))
library(tidyverse)
## ── Attaching packages ─────────────────────────────────────── tidyverse 1.3.2 ──
## ✔ ggplot2 3.4.0      ✔ dplyr   1.0.10
## ✔ tibble  3.1.8      ✔ stringr 1.4.1 
## ✔ tidyr   1.2.1      ✔ forcats 0.5.2 
## ✔ purrr   0.3.5      
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
head(Harshitha_29_11_2022)
library(cluster)
library(factoextra)
## Welcome! Want to learn more? See two factoextra-related books at https://goo.gl/ve3WBa
gowerDist <- daisy(Harshitha_29_11_2022[,-1],"gower")
gowerMat <- as.matrix(gowerDist)
sil_width <- c(NA)
for(i in 2:8){
pam_fit <- pam(gowerMat, diss = TRUE, k = i)
sil_width[i] <- pam_fit$silinfo$avg.width
 }
plot(1:8, sil_width,
  xlab = "Number of clusters",
  ylab = "Silhouette Width")
 lines(1:8, sil_width)

pam_fit <- pam(gowerMat, diss = TRUE,4) 
summary(pam_fit)
## Medoids:
##      ID       
## [1,] "15" "15"
## [2,] "2"  "2" 
## [3,] "10" "10"
## [4,] "13" "13"
## Clustering vector:
##  1  2  3  4  5  6  7  8  9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 
##  1  2  1  1  1  1  1  2  3  3  3  3  4  1  1  1  2  1  3  3  1  4  3  1  3  3 
## 27 28 
##  1  4 
## Objective function:
##    build     swap 
## 0.140639 0.140639 
## 
## Numerical information per cluster:
##      size  max_diss   av_diss  diameter separation
## [1,]   13 0.2107602 0.1521802 0.4146999 0.09887301
## [2,]    3 0.2160501 0.1180801 0.3248355 0.13072109
## [3,]    9 0.2070331 0.1274466 0.3399221 0.09887301
## [4,]    3 0.2318496 0.1527632 0.4220424 0.17542892
## 
## Isolated clusters:
##  L-clusters: character(0)
##  L*-clusters: character(0)
## 
## Silhouette plot information:
##    cluster neighbor    sil_width
## 14       1        3  0.279313308
## 16       1        3  0.257892199
## 5        1        3  0.252330759
## 21       1        3  0.244288163
## 24       1        3  0.190214420
## 15       1        3  0.182606557
## 1        1        3  0.142762003
## 4        1        3  0.118574559
## 3        1        3  0.065467822
## 6        1        3  0.060531284
## 27       1        3  0.002237028
## 18       1        3 -0.065660742
## 7        1        3 -0.069573032
## 8        2        4  0.370100710
## 2        2        3  0.369303049
## 17       2        3 -0.193883364
## 25       3        1  0.373049180
## 10       3        1  0.352082543
## 9        3        1  0.288775331
## 26       3        2  0.271520632
## 23       3        1  0.268699401
## 11       3        2  0.234128241
## 19       3        1  0.205045998
## 12       3        1  0.185994916
## 20       3        4  0.066886602
## 13       4        3  0.182919096
## 28       4        3  0.010888472
## 22       4        3 -0.056313886
## Average silhouette width per cluster:
## [1] 0.12776803 0.18184013 0.24957587 0.04583123
## Average silhouette width of total data set:
## [1] 0.163935
## 
## Available components:
## [1] "medoids"    "id.med"     "clustering" "objective"  "isolation" 
## [6] "clusinfo"   "silinfo"    "diss"       "call"
plot(pam_fit)

library(Rtsne)
tsne_obj <- Rtsne(gowerMat, is_distance = TRUE,perplexity = 1)
 tsne_data <- tsne_obj$Y %>%
      data.frame() %>%
      setNames(c("X", "Y")) %>%
      mutate(cluster = factor(pam_fit$clustering))
  ggplot(aes(x = X, y = Y), data = tsne_data) +
      geom_point(aes(color = cluster))

Harshitha_29_11_2022$cluster <- pam_fit$clustering
 
tab1 <- Harshitha_29_11_2022 %>%
  group_by(cluster)%>%
  summarise(across(c(2:8),mean))
tab2 <- Harshitha_29_11_2022 %>%
  group_by(cluster)%>%
  summarise("NumOfStates"= n(),
            "States"= paste0(State,collapse = ", "))

tableSummary <- left_join(tab1,tab2)
## Joining, by = "cluster"
tableSummary$medioids <- Harshitha_29_11_2022$State[pam_fit$id.med]
tableSummary <- t(tableSummary)
library(kableExtra)
## 
## Attaching package: 'kableExtra'
## 
## The following object is masked from 'package:dplyr':
## 
##     group_rows
kable(tableSummary,digits = 2)
cluster 1 2 3 4
prevOfAlcoholCurrentUse -0.1037789 -0.5459735 -0.2051326 1.6110800
prevOfAlcoholDependence -0.3077175 0.9496820 -0.2617974 1.1691529
RTAwithDUI -0.43095350 0.79048993 -0.03339319 1.17715481
Composite score -0.4875734 1.6261806 -0.1527501 0.9448880
SDI 0.8182033 -0.5234569 -0.6537395 -1.0608726
IPV -0.7995955 1.2210100 0.5288395 0.6573854
ratio 0.3441939 -1.1012991 -0.1744223 0.1330588
NumOfStates 13 3 9 3
States Kerala, Maharashtra, Punjab, Himachalpradesh, Jammu and Kashm, Karnataka, Sikkim, Chandigarh, Goa, Delhi, Andaman and Nic, Uttarakhand, Dadra and Nagar Andra pradesh, Tamil nadu, Jharkand Telangana, West bengal, Meghalaya, Haryana, Assam, Arunachal Prade, Rajasthan, Madhya Pradesh, Odisha Chattisgarh, Tripura, Uttar Pradesh
medioids Chandigarh Andra pradesh West bengal Chattisgarh