library(readr)
library(readr)
Harshitha_29_11_2022 <- read_csv("~/Library/CloudStorage/OneDrive-Personal/2_Apple/Harshitha_analysis/20_11_2022/27_11_2022/Harshitha_29_11_2022.csv")
## Rows: 28 Columns: 8
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (1): State
## dbl (7): prevOfAlcoholCurrentUse, prevOfAlcoholDependence, RTAwithDUI, Compo...
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
str(Harshitha_29_11_2022)
## spc_tbl_ [28 × 8] (S3: spec_tbl_df/tbl_df/tbl/data.frame)
## $ State : chr [1:28] "Kerala" "Andra pradesh" "Maharashtra" "Punjab" ...
## $ prevOfAlcoholCurrentUse: num [1:28] 12.4 13.7 5.7 28.5 8.9 3.5 6.4 14.2 16.8 16.7 ...
## $ prevOfAlcoholDependence: num [1:28] 0.6 7.2 2.5 6 0.7 0.3 2.6 14.2 1.8 0.9 ...
## $ RTAwithDUI : num [1:28] 157 1345 188 112 322 ...
## $ Composite score : num [1:28] 26 28 20 21 18 20 19 28 24 21 ...
## $ SDI : num [1:28] 0.66 0.54 0.62 0.62 0.63 0.59 0.57 0.62 0.58 0.54 ...
## $ IPV : num [1:28] 16 45 23 21 7 14 24 45 46 35 ...
## $ ratio : num [1:28] 0.0445 0.0825 0.0829 0.1646 0.2052 ...
## - attr(*, "spec")=
## .. cols(
## .. State = col_character(),
## .. prevOfAlcoholCurrentUse = col_double(),
## .. prevOfAlcoholDependence = col_double(),
## .. RTAwithDUI = col_double(),
## .. `Composite score` = col_double(),
## .. SDI = col_double(),
## .. IPV = col_double(),
## .. ratio = col_double()
## .. )
## - attr(*, "problems")=<externalptr>
Harshitha_29_11_2022$State <- as.factor(Harshitha_29_11_2022$State)
Harshitha_29_11_2022$prevOfAlcoholCurrentUse <- as.numeric(scale(Harshitha_29_11_2022$prevOfAlcoholCurrentUse))
Harshitha_29_11_2022$prevOfAlcoholDependence <- as.numeric(scale(Harshitha_29_11_2022$prevOfAlcoholDependence))
Harshitha_29_11_2022$RTAwithDUI <- as.numeric(scale(Harshitha_29_11_2022$RTAwithDUI))
Harshitha_29_11_2022$`Composite score` <- as.numeric(scale(Harshitha_29_11_2022$`Composite score`))
Harshitha_29_11_2022$SDI <- as.numeric(scale(Harshitha_29_11_2022$SDI))
Harshitha_29_11_2022$IPV <- as.numeric(scale(Harshitha_29_11_2022$IPV))
Harshitha_29_11_2022$ratio <- as.numeric(scale(Harshitha_29_11_2022$ratio))
library(tidyverse)
## ── Attaching packages ─────────────────────────────────────── tidyverse 1.3.2 ──
## ✔ ggplot2 3.4.0 ✔ dplyr 1.0.10
## ✔ tibble 3.1.8 ✔ stringr 1.4.1
## ✔ tidyr 1.2.1 ✔ forcats 0.5.2
## ✔ purrr 0.3.5
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
head(Harshitha_29_11_2022)
library(cluster)
library(factoextra)
## Welcome! Want to learn more? See two factoextra-related books at https://goo.gl/ve3WBa
gowerDist <- daisy(Harshitha_29_11_2022[,-1],"gower")
gowerMat <- as.matrix(gowerDist)
sil_width <- c(NA)
for(i in 2:8){
pam_fit <- pam(gowerMat, diss = TRUE, k = i)
sil_width[i] <- pam_fit$silinfo$avg.width
}
plot(1:8, sil_width,
xlab = "Number of clusters",
ylab = "Silhouette Width")
lines(1:8, sil_width)

pam_fit <- pam(gowerMat, diss = TRUE,4)
summary(pam_fit)
## Medoids:
## ID
## [1,] "15" "15"
## [2,] "2" "2"
## [3,] "10" "10"
## [4,] "13" "13"
## Clustering vector:
## 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26
## 1 2 1 1 1 1 1 2 3 3 3 3 4 1 1 1 2 1 3 3 1 4 3 1 3 3
## 27 28
## 1 4
## Objective function:
## build swap
## 0.140639 0.140639
##
## Numerical information per cluster:
## size max_diss av_diss diameter separation
## [1,] 13 0.2107602 0.1521802 0.4146999 0.09887301
## [2,] 3 0.2160501 0.1180801 0.3248355 0.13072109
## [3,] 9 0.2070331 0.1274466 0.3399221 0.09887301
## [4,] 3 0.2318496 0.1527632 0.4220424 0.17542892
##
## Isolated clusters:
## L-clusters: character(0)
## L*-clusters: character(0)
##
## Silhouette plot information:
## cluster neighbor sil_width
## 14 1 3 0.279313308
## 16 1 3 0.257892199
## 5 1 3 0.252330759
## 21 1 3 0.244288163
## 24 1 3 0.190214420
## 15 1 3 0.182606557
## 1 1 3 0.142762003
## 4 1 3 0.118574559
## 3 1 3 0.065467822
## 6 1 3 0.060531284
## 27 1 3 0.002237028
## 18 1 3 -0.065660742
## 7 1 3 -0.069573032
## 8 2 4 0.370100710
## 2 2 3 0.369303049
## 17 2 3 -0.193883364
## 25 3 1 0.373049180
## 10 3 1 0.352082543
## 9 3 1 0.288775331
## 26 3 2 0.271520632
## 23 3 1 0.268699401
## 11 3 2 0.234128241
## 19 3 1 0.205045998
## 12 3 1 0.185994916
## 20 3 4 0.066886602
## 13 4 3 0.182919096
## 28 4 3 0.010888472
## 22 4 3 -0.056313886
## Average silhouette width per cluster:
## [1] 0.12776803 0.18184013 0.24957587 0.04583123
## Average silhouette width of total data set:
## [1] 0.163935
##
## Available components:
## [1] "medoids" "id.med" "clustering" "objective" "isolation"
## [6] "clusinfo" "silinfo" "diss" "call"
plot(pam_fit)

library(Rtsne)
tsne_obj <- Rtsne(gowerMat, is_distance = TRUE,perplexity = 1)
tsne_data <- tsne_obj$Y %>%
data.frame() %>%
setNames(c("X", "Y")) %>%
mutate(cluster = factor(pam_fit$clustering))
ggplot(aes(x = X, y = Y), data = tsne_data) +
geom_point(aes(color = cluster))

Harshitha_29_11_2022$cluster <- pam_fit$clustering
tab1 <- Harshitha_29_11_2022 %>%
group_by(cluster)%>%
summarise(across(c(2:8),mean))
tab2 <- Harshitha_29_11_2022 %>%
group_by(cluster)%>%
summarise("NumOfStates"= n(),
"States"= paste0(State,collapse = ", "))
tableSummary <- left_join(tab1,tab2)
## Joining, by = "cluster"
tableSummary$medioids <- Harshitha_29_11_2022$State[pam_fit$id.med]
tableSummary <- t(tableSummary)
library(kableExtra)
##
## Attaching package: 'kableExtra'
##
## The following object is masked from 'package:dplyr':
##
## group_rows
kable(tableSummary,digits = 2)
|
cluster
|
1
|
2
|
3
|
4
|
|
prevOfAlcoholCurrentUse
|
-0.1037789
|
-0.5459735
|
-0.2051326
|
1.6110800
|
|
prevOfAlcoholDependence
|
-0.3077175
|
0.9496820
|
-0.2617974
|
1.1691529
|
|
RTAwithDUI
|
-0.43095350
|
0.79048993
|
-0.03339319
|
1.17715481
|
|
Composite score
|
-0.4875734
|
1.6261806
|
-0.1527501
|
0.9448880
|
|
SDI
|
0.8182033
|
-0.5234569
|
-0.6537395
|
-1.0608726
|
|
IPV
|
-0.7995955
|
1.2210100
|
0.5288395
|
0.6573854
|
|
ratio
|
0.3441939
|
-1.1012991
|
-0.1744223
|
0.1330588
|
|
NumOfStates
|
13
|
3
|
9
|
3
|
|
States
|
Kerala, Maharashtra, Punjab, Himachalpradesh, Jammu and Kashm,
Karnataka, Sikkim, Chandigarh, Goa, Delhi, Andaman and Nic, Uttarakhand,
Dadra and Nagar
|
Andra pradesh, Tamil nadu, Jharkand
|
Telangana, West bengal, Meghalaya, Haryana, Assam, Arunachal Prade,
Rajasthan, Madhya Pradesh, Odisha
|
Chattisgarh, Tripura, Uttar Pradesh
|
|
medioids
|
Chandigarh
|
Andra pradesh
|
West bengal
|
Chattisgarh
|