Harshitha

library(readr)
Alcohol_policy <- read_csv("~/Library/CloudStorage/OneDrive-Personal/2_Apple/Harshitha_analysis/20_11_2022/Alcohol_policy.csv")

## Rows: 31 Columns: 20
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr  (1): state
## dbl (19): SaleTimings, MinLegalDrinkingAge, PercapitaIncome, HealthIndex, Pe...
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.

attach(Alcohol_policy)
Alcohol_policy$state <- as.factor(Alcohol_policy$state)
Alcohol_policy$SaleTimings <- as.numeric(Alcohol_policy$SaleTimings)
Alcohol_policy$MinLegalDrinkingAge <- as.factor(Alcohol_policy$MinLegalDrinkingAge)
Alcohol_policy$PercapitaIncome <- as.numeric(Alcohol_policy$PercapitaIncome)
Alcohol_policy$HealthIndex <- as.numeric(Alcohol_policy$HealthIndex)
Alcohol_policy$PercentageBPL <- as.numeric(Alcohol_policy$PercentageBPL)
Alcohol_policy$DrinkDrive <- as.numeric(Alcohol_policy$DrinkDrive)
Alcohol_policy$SDI2017 <- as.numeric(Alcohol_policy$SDI2017)
Alcohol_policy$PercentageIPV <- as.numeric(Alcohol_policy$PercentageIPV)
Alcohol_policy$banofsalepublicplaces <- as.factor(Alcohol_policy$banofsalepublicplaces)
Alcohol_policy$minimumsaleprice <- as.factor(Alcohol_policy$minimumsaleprice)
Alcohol_policy$policycontroldensity <- as.factor(Alcohol_policy$policycontroldensity)
Alcohol_policy$quotaforretailsale <- as.factor(Alcohol_policy$quotaforretailsale)
Alcohol_policy$warninquality <- as.factor(Alcohol_policy$warninquality)
Alcohol_policy$distributionsystem <- as.factor(Alcohol_policy$distributionsystem)
Alcohol_policy$pointofsale <- as.factor(Alcohol_policy$pointofsale)
Alcohol_policy$CurrentAlcoholUse <- as.numeric(Alcohol_policy$CurrentAlcoholUse)
Alcohol_policy$AlcoholDependence <- as.numeric(Alcohol_policy$AlcoholDependence)
Alcohol_policy$OwnTaxRevenue <- as.numeric(Alcohol_policy$OwnTaxRevenue)
Alcohol_policy$StateExcise <- as.numeric(Alcohol_policy$StateExcise)
str(Alcohol_policy)

## spc_tbl_ [31 × 20] (S3: spec_tbl_df/tbl_df/tbl/data.frame)
##  $ state                : Factor w/ 31 levels "Andaman and Nic",..: 1 2 3 4 5 6 7 8 9 10 ...
##  $ SaleTimings          : num [1:31] 12 9 12 11 0 14 9 12 12 12 ...
##  $ MinLegalDrinkingAge  : Factor w/ 3 levels "1","2","3": 1 2 2 2 2 3 2 2 1 3 ...
##  $ PercapitaIncome      : num [1:31] 218649 168480 169742 86801 46292 ...
##  $ HealthIndex          : num [1:31] 45.4 65.1 46.1 48.5 32.1 ...
##  $ PercentageBPL        : num [1:31] 1 12.3 34.7 32 33.7 ...
##  $ DrinkDrive           : num [1:31] 20 1345 55 377 10 ...
##  $ SDI2017              : num [1:31] 0.65 0.54 0.56 0.53 0.43 0.65 0.51 0.65 0.65 0.72 ...
##  $ PercentageIPV        : num [1:31] 20 45 35 27 45 23 38 36 29 30 ...
##  $ banofsalepublicplaces: Factor w/ 2 levels "1","2": 1 1 1 1 1 2 1 1 1 1 ...
##  $ minimumsaleprice     : Factor w/ 2 levels "1","2": 1 1 2 2 1 1 1 1 1 2 ...
##  $ policycontroldensity : Factor w/ 2 levels "1","2": 1 1 1 2 2 2 2 1 1 2 ...
##  $ quotaforretailsale   : Factor w/ 2 levels "1","2": 1 1 1 1 1 2 2 1 1 2 ...
##  $ warninquality        : Factor w/ 2 levels "1","2": 2 1 2 2 1 1 2 1 1 1 ...
##  $ distributionsystem   : Factor w/ 4 levels "1","2","3","4": 4 1 2 2 1 4 1 4 4 4 ...
##  $ pointofsale          : Factor w/ 3 levels "1","2","3": 2 2 2 2 1 2 2 2 2 2 ...
##  $ CurrentAlcoholUse    : num [1:31] 25.4 13.7 28 8.8 0.9 17.5 35.6 11.6 18.3 21.3 ...
##  $ AlcoholDependence    : num [1:31] 7.1 13.7 10.2 1.3 0.15 1.1 6.2 0.5 3.3 2.4 ...
##  $ OwnTaxRevenue        : num [1:31] 40305 7543770 144000 1799415 3617469 ...
##  $ StateExcise          : num [1:31] 14475 622020 20836 145000 0 ...
##  - attr(*, "spec")=
##   .. cols(
##   ..   state = col_character(),
##   ..   SaleTimings = col_double(),
##   ..   MinLegalDrinkingAge = col_double(),
##   ..   PercapitaIncome = col_double(),
##   ..   HealthIndex = col_double(),
##   ..   PercentageBPL = col_double(),
##   ..   DrinkDrive = col_double(),
##   ..   SDI2017 = col_double(),
##   ..   PercentageIPV = col_double(),
##   ..   banofsalepublicplaces = col_double(),
##   ..   minimumsaleprice = col_double(),
##   ..   policycontroldensity = col_double(),
##   ..   quotaforretailsale = col_double(),
##   ..   warninquality = col_double(),
##   ..   distributionsystem = col_double(),
##   ..   pointofsale = col_double(),
##   ..   CurrentAlcoholUse = col_double(),
##   ..   AlcoholDependence = col_double(),
##   ..   OwnTaxRevenue = col_double(),
##   ..   StateExcise = col_double()
##   .. )
##  - attr(*, "problems")=<externalptr>

summary(Alcohol_policy)

##              state     SaleTimings    MinLegalDrinkingAge PercapitaIncome 
##  Andaman and Nic: 1   Min.   : 0.00   1: 6                Min.   : 46292  
##  Andra pradesh  : 1   1st Qu.:10.00   2:20                1st Qu.:107360  
##  Arunachal Prade: 1   Median :12.00   3: 5                Median :190407  
##  Assam          : 1   Mean   :11.52                       Mean   :192662  
##  Bihar          : 1   3rd Qu.:14.00                       3rd Qu.:228250  
##  Chandigarh     : 1   Max.   :15.00                       Max.   :435959  
##  (Other)        :25                                                       
##   HealthIndex    PercentageBPL     DrinkDrive        SDI2017      
##  Min.   :28.61   Min.   : 0.71   Min.   :   1.0   Min.   :0.4300  
##  1st Qu.:43.30   1st Qu.: 9.80   1st Qu.:  20.0   1st Qu.:0.5350  
##  Median :51.90   Median :14.88   Median : 139.0   Median :0.5900  
##  Mean   :50.67   Mean   :18.79   Mean   : 378.7   Mean   :0.5848  
##  3rd Qu.:59.70   3rd Qu.:31.80   3rd Qu.: 355.0   3rd Qu.:0.6400  
##  Max.   :74.01   Max.   :39.90   Max.   :3595.0   Max.   :0.7400  
##                                                                   
##  PercentageIPV   banofsalepublicplaces minimumsaleprice policycontroldensity
##  Min.   : 3.50   1:30                  1:22             1:21                
##  1st Qu.:22.00   2: 1                  2: 9             2:10                
##  Median :30.00                                                              
##  Mean   :28.92                                                              
##  3rd Qu.:36.00                                                              
##  Max.   :46.00                                                              
##                                                                             
##  quotaforretailsale warninquality distributionsystem pointofsale
##  1:26               1:22          1:10               1: 5       
##  2: 5               2: 9          2: 4               2:22       
##                                   3: 3               3: 4       
##                                   4:14                          
##                                                                 
##                                                                 
##                                                                 
##  CurrentAlcoholUse AlcoholDependence OwnTaxRevenue       StateExcise     
##  Min.   : 0.90     Min.   : 0.150    Min.   :   40305   Min.   :      0  
##  1st Qu.: 8.85     1st Qu.: 1.000    1st Qu.:  299413   1st Qu.:  45697  
##  Median :16.40     Median : 2.400    Median : 2375000   Median : 298374  
##  Mean   :15.83     Mean   : 3.776    Mean   : 4259104   Mean   : 563100  
##  3rd Qu.:21.45     3rd Qu.: 4.500    3rd Qu.: 6715358   3rd Qu.: 713116  
##  Max.   :35.60     Max.   :14.200    Max.   :21082429   Max.   :3151741  
##

Exploratory analysis

library(tidyverse)

## ── Attaching packages ─────────────────────────────────────── tidyverse 1.3.2 ──
## ✔ ggplot2 3.4.0      ✔ dplyr   1.0.10
## ✔ tibble  3.1.8      ✔ stringr 1.4.1 
## ✔ tidyr   1.2.1      ✔ forcats 0.5.2 
## ✔ purrr   0.3.5      
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()

dataSetForAnalysis <- Alcohol_policy %>%
 filter(!Alcohol_policy$state %in% c("Bihar","Jammu and Kashm"))
head(dataSetForAnalysis)

scaling of data

dataSetForAnalysis$PercapitaIncome <- as.numeric(scale(
  dataSetForAnalysis$PercapitaIncome))
dataSetForAnalysis$SaleTimings <- as.numeric(
  scale(dataSetForAnalysis$SaleTimings)
)
dataSetForAnalysis$PercentageBPL <- as.numeric(scale(
  dataSetForAnalysis$PercentageBPL))
dataSetForAnalysis$PercentageIPV <- as.numeric(
  scale(dataSetForAnalysis$PercentageIPV)
)
dataSetForAnalysis$DrinkDrive <- as.numeric(scale(
  dataSetForAnalysis$DrinkDrive))
dataSetForAnalysis$CurrentAlcoholUse <- as.numeric(
  scale(dataSetForAnalysis$CurrentAlcoholUse)
)
dataSetForAnalysis$AlcoholDependence <- as.numeric(scale(
  dataSetForAnalysis$AlcoholDependence))
dataSetForAnalysis$OwnTaxRevenue <- as.numeric(scale(
  dataSetForAnalysis$OwnTaxRevenue))
dataSetForAnalysis$StateExcise <- as.numeric(scale(
  dataSetForAnalysis$StateExcise))
dataSetForAnalysis$SDI2017 <- as.numeric(scale(
  dataSetForAnalysis$SDI2017))
dataSetForAnalysis$HealthIndex <- as.numeric(scale(
  dataSetForAnalysis$HealthIndex))

Cluster

head(dataSetForAnalysis)

library(cluster)
library(factoextra)

## Welcome! Want to learn more? See two factoextra-related books at https://goo.gl/ve3WBa

gowerDist <- daisy(dataSetForAnalysis[,-1],"gower")
gowerMat <- as.matrix(gowerDist)
sil_width <- c(NA)
for(i in 2:8){
pam_fit <- pam(gowerMat, diss = TRUE, k = i)
sil_width[i] <- pam_fit$silinfo$avg.width
 }
plot(1:8, sil_width,
  xlab = "Number of clusters",
  ylab = "Silhouette Width")
 lines(1:8, sil_width)

pam_fit <- pam(gowerMat, diss = TRUE,3) 
summary(pam_fit)

## Medoids:
##      ID       
## [1,] "8"  "8" 
## [2,] "25" "25"
## [3,] "4"  "4" 
## Clustering vector:
##  1  2  3  4  5  6  7  8  9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 
##  1  2  3  3  1  3  1  1  1  1  2  1  3  2  2  2  3  3  2  1  1  2  1  2  2  1 
## 27 28 29 
##  1  1  3 
## Objective function:
##     build      swap 
## 0.2036461 0.1942472 
## 
## Numerical information per cluster:
##      size  max_diss   av_diss  diameter separation
## [1,]   13 0.3359294 0.1819493 0.5303640  0.1400893
## [2,]    9 0.3442333 0.2031681 0.5547262  0.1400893
## [3,]    7 0.3982648 0.2056165 0.5806916  0.1545567
## 
## Isolated clusters:
##  L-clusters: character(0)
##  L*-clusters: character(0)
## 
## Silhouette plot information:
##    cluster neighbor   sil_width
## 8        1        2  0.41773482
## 20       1        2  0.35038648
## 23       1        3  0.31350985
## 1        1        3  0.30820377
## 10       1        2  0.26192950
## 7        1        2  0.25791630
## 12       1        2  0.21326986
## 26       1        2  0.20581068
## 28       1        2  0.19628238
## 5        1        3  0.18426024
## 21       1        2  0.16929580
## 9        1        3  0.08634998
## 27       1        2 -0.04193604
## 25       2        1  0.29301159
## 24       2        1  0.29279633
## 14       2        3  0.20662231
## 2        2        1  0.18460122
## 22       2        3  0.13023988
## 19       2        1  0.05289560
## 15       2        3  0.04486175
## 11       2        1 -0.07879906
## 16       2        3 -0.09373460
## 4        3        2  0.37067018
## 18       3        1  0.27391236
## 29       3        2  0.25580248
## 6        3        2  0.22282490
## 3        3        1  0.11299608
## 13       3        2 -0.01455115
## 17       3        2 -0.11959388
## Average silhouette width per cluster:
## [1] 0.2248472 0.1147217 0.1574373
## Average silhouette width of total data set:
## [1] 0.174399
## 
## Available components:
## [1] "medoids"    "id.med"     "clustering" "objective"  "isolation" 
## [6] "clusinfo"   "silinfo"    "diss"       "call"

plot(pam_fit)

library(Rtsne)
tsne_obj <- Rtsne(gowerMat, is_distance = TRUE,perplexity = 1)
 tsne_data <- tsne_obj$Y %>%
      data.frame() %>%
      setNames(c("X", "Y")) %>%
      mutate(cluster = factor(pam_fit$clustering))
  ggplot(aes(x = X, y = Y), data = tsne_data) +
      geom_point(aes(color = cluster))

dataSetForAnalysis$cluster <- pam_fit$clustering
 giveProp <- function(x){
   y <- mean(x=="1",na.rm=FALSE)
   y <- ceiling(y*100)
   return(y)}
tab1 <- dataSetForAnalysis %>%
   group_by(cluster)%>%
   summarise(across(c(10,11,12,13,14),giveProp))
tab2 <- dataSetForAnalysis %>%
  group_by(cluster)%>%
  summarise(across(c(2,4,5,6,7,8,9,17:20),mean))
tableSummary <- left_join(tab1,tab2)

## Joining, by = "cluster"

tab3 <- dataSetForAnalysis %>%
  group_by(cluster)%>%
  summarise("NumOfStates"= n(),
            "States"= paste0(state,collapse = ", "))

tableSummary <- left_join(tableSummary,tab3)

## Joining, by = "cluster"

tableSummary$medioids <- dataSetForAnalysis$state[pam_fit$id.med]
tableSummary <- t(tableSummary)
library(kableExtra)

## Warning in !is.null(rmarkdown::metadata$output) && rmarkdown::metadata$output
## %in% : 'length(x) = 3 > 1' in coercion to 'logical(1)'

## 
## Attaching package: 'kableExtra'
## 
## The following object is masked from 'package:dplyr':
## 
##     group_rows

kable(tableSummary,digits = 2)

cluster	1	2	3
banofsalepublicplaces	93	100	100
minimumsaleprice	85	67	43
policycontroldensity	85	78	29
quotaforretailsale	85	89	72
warninquality	85	89	15
SaleTimings	0.3942232	-0.1802729	-0.5003494
PercapitaIncome	0.5606044	-0.1921445	-0.7940794
HealthIndex	-0.3791615	0.3388837	0.2684495
PercentageBPL	-0.3931195	-0.1542900	0.9284520
DrinkDrive	-0.08224523	0.31513739	-0.25243551
SDI2017	0.6240879	-0.3863401	-0.6622974
PercentageIPV	-0.4857901	0.4793232	0.2859090
CurrentAlcoholUse	0.3693525	-0.3734651	-0.2057710
AlcoholDependence	0.002477435	0.153311190	-0.201715338
OwnTaxRevenue	-0.4602922	0.5789406	0.1104762
StateExcise	-0.26715603	0.44447218	-0.07531732
NumOfStates	13	9	7
States	Andaman and Nic, Chandigarh, Dadra and Nagar, Daman and Diu, Delhi, Goa, Himachalpradesh, Puducherry, Punjab, Sikkim, Tripura, Uttar Pradesh, Uttarakhand	Andra pradesh, Haryana, Karnataka, Kerala, Madhya Pradesh, Odisha, Rajasthan, Tamil nadu, Telangana	Arunachal Prade, Assam, Chattisgarh, Jharkand, Maharashtra, Meghalaya, West bengal
medioids	Daman and Diu	Telangana	Assam

dataSetForAnalysis %>% group_by(cluster,MinLegalDrinkingAge)%>% summarise("N"= n())

## `summarise()` has grouped output by 'cluster'. You can override using the
## `.groups` argument.

dataSetForAnalysis %>% group_by(cluster,distributionsystem)%>% summarise("N"= n())

## `summarise()` has grouped output by 'cluster'. You can override using the
## `.groups` argument.

dataSetForAnalysis %>% group_by(cluster,pointofsale)%>% summarise("N"= n())

## `summarise()` has grouped output by 'cluster'. You can override using the
## `.groups` argument.

Harshitha_paper