library(readr)
Alcohol_policy <- read_csv("~/Library/CloudStorage/OneDrive-Personal/2_Apple/Harshitha_analysis/20_11_2022/Alcohol_policy.csv")
## Rows: 31 Columns: 20
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (1): state
## dbl (19): SaleTimings, MinLegalDrinkingAge, PercapitaIncome, HealthIndex, Pe...
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
attach(Alcohol_policy)
Alcohol_policy$state <- as.factor(Alcohol_policy$state)
Alcohol_policy$SaleTimings <- as.numeric(Alcohol_policy$SaleTimings)
Alcohol_policy$MinLegalDrinkingAge <- as.factor(Alcohol_policy$MinLegalDrinkingAge)
Alcohol_policy$PercapitaIncome <- as.numeric(Alcohol_policy$PercapitaIncome)
Alcohol_policy$HealthIndex <- as.numeric(Alcohol_policy$HealthIndex)
Alcohol_policy$PercentageBPL <- as.numeric(Alcohol_policy$PercentageBPL)
Alcohol_policy$DrinkDrive <- as.numeric(Alcohol_policy$DrinkDrive)
Alcohol_policy$SDI2017 <- as.numeric(Alcohol_policy$SDI2017)
Alcohol_policy$PercentageIPV <- as.numeric(Alcohol_policy$PercentageIPV)
Alcohol_policy$banofsalepublicplaces <- as.factor(Alcohol_policy$banofsalepublicplaces)
Alcohol_policy$minimumsaleprice <- as.factor(Alcohol_policy$minimumsaleprice)
Alcohol_policy$policycontroldensity <- as.factor(Alcohol_policy$policycontroldensity)
Alcohol_policy$quotaforretailsale <- as.factor(Alcohol_policy$quotaforretailsale)
Alcohol_policy$warninquality <- as.factor(Alcohol_policy$warninquality)
Alcohol_policy$distributionsystem <- as.factor(Alcohol_policy$distributionsystem)
Alcohol_policy$pointofsale <- as.factor(Alcohol_policy$pointofsale)
Alcohol_policy$CurrentAlcoholUse <- as.numeric(Alcohol_policy$CurrentAlcoholUse)
Alcohol_policy$AlcoholDependence <- as.numeric(Alcohol_policy$AlcoholDependence)
Alcohol_policy$OwnTaxRevenue <- as.numeric(Alcohol_policy$OwnTaxRevenue)
Alcohol_policy$StateExcise <- as.numeric(Alcohol_policy$StateExcise)
str(Alcohol_policy)
## spc_tbl_ [31 × 20] (S3: spec_tbl_df/tbl_df/tbl/data.frame)
## $ state : Factor w/ 31 levels "Andaman and Nic",..: 1 2 3 4 5 6 7 8 9 10 ...
## $ SaleTimings : num [1:31] 12 9 12 11 0 14 9 12 12 12 ...
## $ MinLegalDrinkingAge : Factor w/ 3 levels "1","2","3": 1 2 2 2 2 3 2 2 1 3 ...
## $ PercapitaIncome : num [1:31] 218649 168480 169742 86801 46292 ...
## $ HealthIndex : num [1:31] 45.4 65.1 46.1 48.5 32.1 ...
## $ PercentageBPL : num [1:31] 1 12.3 34.7 32 33.7 ...
## $ DrinkDrive : num [1:31] 20 1345 55 377 10 ...
## $ SDI2017 : num [1:31] 0.65 0.54 0.56 0.53 0.43 0.65 0.51 0.65 0.65 0.72 ...
## $ PercentageIPV : num [1:31] 20 45 35 27 45 23 38 36 29 30 ...
## $ banofsalepublicplaces: Factor w/ 2 levels "1","2": 1 1 1 1 1 2 1 1 1 1 ...
## $ minimumsaleprice : Factor w/ 2 levels "1","2": 1 1 2 2 1 1 1 1 1 2 ...
## $ policycontroldensity : Factor w/ 2 levels "1","2": 1 1 1 2 2 2 2 1 1 2 ...
## $ quotaforretailsale : Factor w/ 2 levels "1","2": 1 1 1 1 1 2 2 1 1 2 ...
## $ warninquality : Factor w/ 2 levels "1","2": 2 1 2 2 1 1 2 1 1 1 ...
## $ distributionsystem : Factor w/ 4 levels "1","2","3","4": 4 1 2 2 1 4 1 4 4 4 ...
## $ pointofsale : Factor w/ 3 levels "1","2","3": 2 2 2 2 1 2 2 2 2 2 ...
## $ CurrentAlcoholUse : num [1:31] 25.4 13.7 28 8.8 0.9 17.5 35.6 11.6 18.3 21.3 ...
## $ AlcoholDependence : num [1:31] 7.1 13.7 10.2 1.3 0.15 1.1 6.2 0.5 3.3 2.4 ...
## $ OwnTaxRevenue : num [1:31] 40305 7543770 144000 1799415 3617469 ...
## $ StateExcise : num [1:31] 14475 622020 20836 145000 0 ...
## - attr(*, "spec")=
## .. cols(
## .. state = col_character(),
## .. SaleTimings = col_double(),
## .. MinLegalDrinkingAge = col_double(),
## .. PercapitaIncome = col_double(),
## .. HealthIndex = col_double(),
## .. PercentageBPL = col_double(),
## .. DrinkDrive = col_double(),
## .. SDI2017 = col_double(),
## .. PercentageIPV = col_double(),
## .. banofsalepublicplaces = col_double(),
## .. minimumsaleprice = col_double(),
## .. policycontroldensity = col_double(),
## .. quotaforretailsale = col_double(),
## .. warninquality = col_double(),
## .. distributionsystem = col_double(),
## .. pointofsale = col_double(),
## .. CurrentAlcoholUse = col_double(),
## .. AlcoholDependence = col_double(),
## .. OwnTaxRevenue = col_double(),
## .. StateExcise = col_double()
## .. )
## - attr(*, "problems")=<externalptr>
summary(Alcohol_policy)
## state SaleTimings MinLegalDrinkingAge PercapitaIncome
## Andaman and Nic: 1 Min. : 0.00 1: 6 Min. : 46292
## Andra pradesh : 1 1st Qu.:10.00 2:20 1st Qu.:107360
## Arunachal Prade: 1 Median :12.00 3: 5 Median :190407
## Assam : 1 Mean :11.52 Mean :192662
## Bihar : 1 3rd Qu.:14.00 3rd Qu.:228250
## Chandigarh : 1 Max. :15.00 Max. :435959
## (Other) :25
## HealthIndex PercentageBPL DrinkDrive SDI2017
## Min. :28.61 Min. : 0.71 Min. : 1.0 Min. :0.4300
## 1st Qu.:43.30 1st Qu.: 9.80 1st Qu.: 20.0 1st Qu.:0.5350
## Median :51.90 Median :14.88 Median : 139.0 Median :0.5900
## Mean :50.67 Mean :18.79 Mean : 378.7 Mean :0.5848
## 3rd Qu.:59.70 3rd Qu.:31.80 3rd Qu.: 355.0 3rd Qu.:0.6400
## Max. :74.01 Max. :39.90 Max. :3595.0 Max. :0.7400
##
## PercentageIPV banofsalepublicplaces minimumsaleprice policycontroldensity
## Min. : 3.50 1:30 1:22 1:21
## 1st Qu.:22.00 2: 1 2: 9 2:10
## Median :30.00
## Mean :28.92
## 3rd Qu.:36.00
## Max. :46.00
##
## quotaforretailsale warninquality distributionsystem pointofsale
## 1:26 1:22 1:10 1: 5
## 2: 5 2: 9 2: 4 2:22
## 3: 3 3: 4
## 4:14
##
##
##
## CurrentAlcoholUse AlcoholDependence OwnTaxRevenue StateExcise
## Min. : 0.90 Min. : 0.150 Min. : 40305 Min. : 0
## 1st Qu.: 8.85 1st Qu.: 1.000 1st Qu.: 299413 1st Qu.: 45697
## Median :16.40 Median : 2.400 Median : 2375000 Median : 298374
## Mean :15.83 Mean : 3.776 Mean : 4259104 Mean : 563100
## 3rd Qu.:21.45 3rd Qu.: 4.500 3rd Qu.: 6715358 3rd Qu.: 713116
## Max. :35.60 Max. :14.200 Max. :21082429 Max. :3151741
##
Exploratory analysis
library(tidyverse)
## ── Attaching packages ─────────────────────────────────────── tidyverse 1.3.2 ──
## ✔ ggplot2 3.4.0 ✔ dplyr 1.0.10
## ✔ tibble 3.1.8 ✔ stringr 1.4.1
## ✔ tidyr 1.2.1 ✔ forcats 0.5.2
## ✔ purrr 0.3.5
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
dataSetForAnalysis <- Alcohol_policy %>%
filter(!Alcohol_policy$state %in% c("Bihar","Jammu and Kashm"))
head(dataSetForAnalysis)
scaling of data
dataSetForAnalysis$PercapitaIncome <- as.numeric(scale(
dataSetForAnalysis$PercapitaIncome))
dataSetForAnalysis$SaleTimings <- as.numeric(
scale(dataSetForAnalysis$SaleTimings)
)
dataSetForAnalysis$PercentageBPL <- as.numeric(scale(
dataSetForAnalysis$PercentageBPL))
dataSetForAnalysis$PercentageIPV <- as.numeric(
scale(dataSetForAnalysis$PercentageIPV)
)
dataSetForAnalysis$DrinkDrive <- as.numeric(scale(
dataSetForAnalysis$DrinkDrive))
dataSetForAnalysis$CurrentAlcoholUse <- as.numeric(
scale(dataSetForAnalysis$CurrentAlcoholUse)
)
dataSetForAnalysis$AlcoholDependence <- as.numeric(scale(
dataSetForAnalysis$AlcoholDependence))
dataSetForAnalysis$OwnTaxRevenue <- as.numeric(scale(
dataSetForAnalysis$OwnTaxRevenue))
dataSetForAnalysis$StateExcise <- as.numeric(scale(
dataSetForAnalysis$StateExcise))
dataSetForAnalysis$SDI2017 <- as.numeric(scale(
dataSetForAnalysis$SDI2017))
dataSetForAnalysis$HealthIndex <- as.numeric(scale(
dataSetForAnalysis$HealthIndex))
Cluster
head(dataSetForAnalysis)
library(cluster)
library(factoextra)
## Welcome! Want to learn more? See two factoextra-related books at https://goo.gl/ve3WBa
gowerDist <- daisy(dataSetForAnalysis[,-1],"gower")
gowerMat <- as.matrix(gowerDist)
sil_width <- c(NA)
for(i in 2:8){
pam_fit <- pam(gowerMat, diss = TRUE, k = i)
sil_width[i] <- pam_fit$silinfo$avg.width
}
plot(1:8, sil_width,
xlab = "Number of clusters",
ylab = "Silhouette Width")
lines(1:8, sil_width)
pam_fit <- pam(gowerMat, diss = TRUE,3)
summary(pam_fit)
## Medoids:
## ID
## [1,] "8" "8"
## [2,] "25" "25"
## [3,] "4" "4"
## Clustering vector:
## 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26
## 1 2 3 3 1 3 1 1 1 1 2 1 3 2 2 2 3 3 2 1 1 2 1 2 2 1
## 27 28 29
## 1 1 3
## Objective function:
## build swap
## 0.2036461 0.1942472
##
## Numerical information per cluster:
## size max_diss av_diss diameter separation
## [1,] 13 0.3359294 0.1819493 0.5303640 0.1400893
## [2,] 9 0.3442333 0.2031681 0.5547262 0.1400893
## [3,] 7 0.3982648 0.2056165 0.5806916 0.1545567
##
## Isolated clusters:
## L-clusters: character(0)
## L*-clusters: character(0)
##
## Silhouette plot information:
## cluster neighbor sil_width
## 8 1 2 0.41773482
## 20 1 2 0.35038648
## 23 1 3 0.31350985
## 1 1 3 0.30820377
## 10 1 2 0.26192950
## 7 1 2 0.25791630
## 12 1 2 0.21326986
## 26 1 2 0.20581068
## 28 1 2 0.19628238
## 5 1 3 0.18426024
## 21 1 2 0.16929580
## 9 1 3 0.08634998
## 27 1 2 -0.04193604
## 25 2 1 0.29301159
## 24 2 1 0.29279633
## 14 2 3 0.20662231
## 2 2 1 0.18460122
## 22 2 3 0.13023988
## 19 2 1 0.05289560
## 15 2 3 0.04486175
## 11 2 1 -0.07879906
## 16 2 3 -0.09373460
## 4 3 2 0.37067018
## 18 3 1 0.27391236
## 29 3 2 0.25580248
## 6 3 2 0.22282490
## 3 3 1 0.11299608
## 13 3 2 -0.01455115
## 17 3 2 -0.11959388
## Average silhouette width per cluster:
## [1] 0.2248472 0.1147217 0.1574373
## Average silhouette width of total data set:
## [1] 0.174399
##
## Available components:
## [1] "medoids" "id.med" "clustering" "objective" "isolation"
## [6] "clusinfo" "silinfo" "diss" "call"
plot(pam_fit)
library(Rtsne)
tsne_obj <- Rtsne(gowerMat, is_distance = TRUE,perplexity = 1)
tsne_data <- tsne_obj$Y %>%
data.frame() %>%
setNames(c("X", "Y")) %>%
mutate(cluster = factor(pam_fit$clustering))
ggplot(aes(x = X, y = Y), data = tsne_data) +
geom_point(aes(color = cluster))
dataSetForAnalysis$cluster <- pam_fit$clustering
giveProp <- function(x){
y <- mean(x=="1",na.rm=FALSE)
y <- ceiling(y*100)
return(y)}
tab1 <- dataSetForAnalysis %>%
group_by(cluster)%>%
summarise(across(c(10,11,12,13,14),giveProp))
tab2 <- dataSetForAnalysis %>%
group_by(cluster)%>%
summarise(across(c(2,4,5,6,7,8,9,17:20),mean))
tableSummary <- left_join(tab1,tab2)
## Joining, by = "cluster"
tab3 <- dataSetForAnalysis %>%
group_by(cluster)%>%
summarise("NumOfStates"= n(),
"States"= paste0(state,collapse = ", "))
tableSummary <- left_join(tableSummary,tab3)
## Joining, by = "cluster"
tableSummary$medioids <- dataSetForAnalysis$state[pam_fit$id.med]
tableSummary <- t(tableSummary)
library(kableExtra)
## Warning in !is.null(rmarkdown::metadata$output) && rmarkdown::metadata$output
## %in% : 'length(x) = 3 > 1' in coercion to 'logical(1)'
##
## Attaching package: 'kableExtra'
##
## The following object is masked from 'package:dplyr':
##
## group_rows
kable(tableSummary,digits = 2)
| cluster | 1 | 2 | 3 |
| banofsalepublicplaces | 93 | 100 | 100 |
| minimumsaleprice | 85 | 67 | 43 |
| policycontroldensity | 85 | 78 | 29 |
| quotaforretailsale | 85 | 89 | 72 |
| warninquality | 85 | 89 | 15 |
| SaleTimings | 0.3942232 | -0.1802729 | -0.5003494 |
| PercapitaIncome | 0.5606044 | -0.1921445 | -0.7940794 |
| HealthIndex | -0.3791615 | 0.3388837 | 0.2684495 |
| PercentageBPL | -0.3931195 | -0.1542900 | 0.9284520 |
| DrinkDrive | -0.08224523 | 0.31513739 | -0.25243551 |
| SDI2017 | 0.6240879 | -0.3863401 | -0.6622974 |
| PercentageIPV | -0.4857901 | 0.4793232 | 0.2859090 |
| CurrentAlcoholUse | 0.3693525 | -0.3734651 | -0.2057710 |
| AlcoholDependence | 0.002477435 | 0.153311190 | -0.201715338 |
| OwnTaxRevenue | -0.4602922 | 0.5789406 | 0.1104762 |
| StateExcise | -0.26715603 | 0.44447218 | -0.07531732 |
| NumOfStates | 13 | 9 | 7 |
| States | Andaman and Nic, Chandigarh, Dadra and Nagar, Daman and Diu, Delhi, Goa, Himachalpradesh, Puducherry, Punjab, Sikkim, Tripura, Uttar Pradesh, Uttarakhand | Andra pradesh, Haryana, Karnataka, Kerala, Madhya Pradesh, Odisha, Rajasthan, Tamil nadu, Telangana | Arunachal Prade, Assam, Chattisgarh, Jharkand, Maharashtra, Meghalaya, West bengal |
| medioids | Daman and Diu | Telangana | Assam |
dataSetForAnalysis %>% group_by(cluster,MinLegalDrinkingAge)%>% summarise("N"= n())
## `summarise()` has grouped output by 'cluster'. You can override using the
## `.groups` argument.
dataSetForAnalysis %>% group_by(cluster,distributionsystem)%>% summarise("N"= n())
## `summarise()` has grouped output by 'cluster'. You can override using the
## `.groups` argument.
dataSetForAnalysis %>% group_by(cluster,pointofsale)%>% summarise("N"= n())
## `summarise()` has grouped output by 'cluster'. You can override using the
## `.groups` argument.