Harshitha

Read about regression models: Regression models are used to describe relationships between variables by fitting a line to the observed data. Regression allows you to estimate how a dependent variable changes as the independent variable(s) change. - Multiple linear regression is used to estimate the relationship between two or more independent variables and one dependent variable. You can use multiple linear regression when you want to know: 1. How strong the relationship is between two or more independent variables and one dependent variable (e.g. how rainfall, temperature, and amount of fertilizer added affect crop growth).

The value of the dependent variable at a certain value of the independent variables (e.g. the expected yield of a crop at certain levels of rainfall, temperature, and fertilizer addition).

There are two types of multiple linear regression: 1. ordinary least squares (OLS) 2. generalized least squares (GLS) The main difference between the two is that OLS assumes there is not a strong correlation between any two independent variables. GLS deals with correlated independent variables by transforming the data and then using OLS to build the model with transformed data.

#Analysis involves influence of various continous variables and categorical variables on the current use and dependence of alcohol in different states 
# The data sheet contains the following variables 

library(readr)
Alcohol_policy <- read_csv("~/Library/CloudStorage/OneDrive-Personal/2_Apple/Harshitha_analysis/20_11_2022/Alcohol_policy.csv")

## Rows: 31 Columns: 18
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr  (1): state
## dbl (17): SaleTimings, MinLegalDrinkingAge, PercapitaIncome, HealthIndex, Pe...
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.

attach(Alcohol_policy)
Alcohol_policy$state <- as.factor(Alcohol_policy$state)
Alcohol_policy$SaleTimings <- as.numeric(Alcohol_policy$SaleTimings)
Alcohol_policy$MinLegalDrinkingAge <- as.factor(Alcohol_policy$MinLegalDrinkingAge)
Alcohol_policy$PercapitaIncome <- as.numeric(Alcohol_policy$PercapitaIncome)
Alcohol_policy$HealthIndex <- as.factor(Alcohol_policy$HealthIndex)
Alcohol_policy$PercentageBPL <- as.numeric(Alcohol_policy$PercentageBPL)
Alcohol_policy$DrinkDrive <- as.numeric(Alcohol_policy$DrinkDrive)
Alcohol_policy$SDI2017 <- as.factor(Alcohol_policy$SDI2017)
Alcohol_policy$PercentageIPV <- as.numeric(Alcohol_policy$PercentageIPV)
Alcohol_policy$banofsalepublicplaces <- as.factor(Alcohol_policy$banofsalepublicplaces)
Alcohol_policy$minimumsaleprice <- as.factor(Alcohol_policy$minimumsaleprice)
Alcohol_policy$policycontroldensity <- as.factor(Alcohol_policy$policycontroldensity)
Alcohol_policy$quotaforretailsale <- as.factor(Alcohol_policy$quotaforretailsale)
Alcohol_policy$warninquality <- as.factor(Alcohol_policy$warninquality)
Alcohol_policy$distributionsystem <- as.factor(Alcohol_policy$distributionsystem)
Alcohol_policy$pointofsale <- as.factor(Alcohol_policy$pointofsale)
Alcohol_policy$CurrentAlcoholUse <- as.numeric(Alcohol_policy$CurrentAlcoholUse)
Alcohol_policy$AlcoholDependence <- as.numeric(Alcohol_policy$AlcoholDependence)
str(Alcohol_policy)

## spc_tbl_ [31 × 18] (S3: spec_tbl_df/tbl_df/tbl/data.frame)
##  $ state                : Factor w/ 31 levels "Andaman and Nic",..: 1 2 3 4 5 6 7 8 9 10 ...
##  $ SaleTimings          : num [1:31] 12 9 12 11 0 14 9 12 12 12 ...
##  $ MinLegalDrinkingAge  : Factor w/ 3 levels "1","2","3": 1 2 2 2 2 3 2 2 1 3 ...
##  $ PercapitaIncome      : num [1:31] 218649 168480 169742 86801 46292 ...
##  $ HealthIndex          : Factor w/ 3 levels "1","2","3": 2 1 3 2 3 1 2 1 2 2 ...
##  $ PercentageBPL        : num [1:31] 1 12.3 34.7 32 33.7 ...
##  $ DrinkDrive           : num [1:31] 20 1345 55 377 10 ...
##  $ SDI2017              : Factor w/ 3 levels "1","2","3": 3 2 2 1 1 3 1 3 3 3 ...
##  $ PercentageIPV        : num [1:31] 20 45 35 27 45 23 38 36 29 30 ...
##  $ banofsalepublicplaces: Factor w/ 2 levels "1","2": 1 1 1 1 1 2 1 1 1 1 ...
##  $ minimumsaleprice     : Factor w/ 2 levels "1","2": 1 1 2 2 1 1 1 1 1 2 ...
##  $ policycontroldensity : Factor w/ 2 levels "1","2": 1 1 1 2 2 2 2 1 1 2 ...
##  $ quotaforretailsale   : Factor w/ 2 levels "1","2": 1 1 1 1 1 2 2 1 1 2 ...
##  $ warninquality        : Factor w/ 2 levels "1","2": 2 1 2 2 1 1 2 1 1 1 ...
##  $ distributionsystem   : Factor w/ 4 levels "1","2","3","4": 4 1 2 2 1 4 1 4 4 4 ...
##  $ pointofsale          : Factor w/ 3 levels "1","2","3": 2 2 2 2 1 2 2 2 2 2 ...
##  $ CurrentAlcoholUse    : num [1:31] 25.4 13.7 28 8.8 0.9 17.5 35.6 11.6 18.3 21.3 ...
##  $ AlcoholDependence    : num [1:31] 7.1 13.7 10.2 1.3 0.15 1.1 6.2 0.5 3.3 2.4 ...
##  - attr(*, "spec")=
##   .. cols(
##   ..   state = col_character(),
##   ..   SaleTimings = col_double(),
##   ..   MinLegalDrinkingAge = col_double(),
##   ..   PercapitaIncome = col_double(),
##   ..   HealthIndex = col_double(),
##   ..   PercentageBPL = col_double(),
##   ..   DrinkDrive = col_double(),
##   ..   SDI2017 = col_double(),
##   ..   PercentageIPV = col_double(),
##   ..   banofsalepublicplaces = col_double(),
##   ..   minimumsaleprice = col_double(),
##   ..   policycontroldensity = col_double(),
##   ..   quotaforretailsale = col_double(),
##   ..   warninquality = col_double(),
##   ..   distributionsystem = col_double(),
##   ..   pointofsale = col_double(),
##   ..   CurrentAlcoholUse = col_double(),
##   ..   AlcoholDependence = col_double()
##   .. )
##  - attr(*, "problems")=<externalptr>

summary(Alcohol_policy)

##              state     SaleTimings    MinLegalDrinkingAge PercapitaIncome 
##  Andaman and Nic: 1   Min.   : 0.00   1: 6                Min.   : 46292  
##  Andra pradesh  : 1   1st Qu.:10.00   2:20                1st Qu.:107360  
##  Arunachal Prade: 1   Median :12.00   3: 5                Median :190407  
##  Assam          : 1   Mean   :11.52                       Mean   :192662  
##  Bihar          : 1   3rd Qu.:14.00                       3rd Qu.:228250  
##  Chandigarh     : 1   Max.   :15.00                       Max.   :435959  
##  (Other)        :25                                                       
##  HealthIndex PercentageBPL     DrinkDrive     SDI2017 PercentageIPV  
##  1:11        Min.   : 0.71   Min.   :   1.0   1: 8    Min.   : 3.50  
##  2:11        1st Qu.: 9.80   1st Qu.:  20.0   2: 9    1st Qu.:22.00  
##  3: 9        Median :14.88   Median : 139.0   3:14    Median :30.00  
##              Mean   :18.79   Mean   : 378.7           Mean   :28.92  
##              3rd Qu.:31.80   3rd Qu.: 355.0           3rd Qu.:36.00  
##              Max.   :39.90   Max.   :3595.0           Max.   :46.00  
##                                                                      
##  banofsalepublicplaces minimumsaleprice policycontroldensity quotaforretailsale
##  1:30                  1:22             1:21                 1:26              
##  2: 1                  2: 9             2:10                 2: 5              
##                                                                                
##                                                                                
##                                                                                
##                                                                                
##                                                                                
##  warninquality distributionsystem pointofsale CurrentAlcoholUse
##  1:22          1:10               1: 5        Min.   : 0.90    
##  2: 9          2: 4               2:22        1st Qu.: 8.85    
##                3: 3               3: 4        Median :16.40    
##                4:14                           Mean   :15.83    
##                                               3rd Qu.:21.45    
##                                               Max.   :35.60    
##                                                                
##  AlcoholDependence
##  Min.   : 0.150   
##  1st Qu.: 1.000   
##  Median : 2.400   
##  Mean   : 3.776   
##  3rd Qu.: 4.500   
##  Max.   :14.200   
##

#considering there is only one variable for licensing places of consumption and licensing places of hrs 
# we have 4 continuous variables and 10 categorical variables
#The dependent variables are current use, dependence, drink and drive
# Overall there are 14 independent variables, hence, we need atleast 140 observations to run the analysis, we only have 30 observations. Therefore, the justification of running a multiple linear regression need to be reconsidered.

Multiple linear regression is a generalization of simple linear regression, in the sense that this approach makes it possible to relate one variable with several variables through a linear function in its parameters.

Multiple linear regression is used to assess the relationship between two variables while taking into account the effect of other variables. By taking into account the effect of other variables, we cancel out the effect of these other variables in order to isolate and measure the relationship between the two variables of interest. This point is the main difference with simple linear regression

We conduct a multiple linear regression below for the alcohol dependence

Dependence <- lm(Alcohol_policy$AlcoholDependence ~ Alcohol_policy$SaleTimings + Alcohol_policy$MinLegalDrinkingAge + Alcohol_policy$PercapitaIncome + Alcohol_policy$PercapitaIncome  + Alcohol_policy$HealthIndex +Alcohol_policy$PercentageBPL + Alcohol_policy$banofsalepublicplaces  +Alcohol_policy$SDI2017+ Alcohol_policy$PercentageIPV +Alcohol_policy$minimumsaleprice + Alcohol_policy$policycontroldensity +Alcohol_policy$quotaforretailsale +Alcohol_policy$warninquality +Alcohol_policy$distributionsystem +Alcohol_policy$pointofsale, 
  data = Alcohol_policy
)

summary(Dependence)

## 
## Call:
## lm(formula = Alcohol_policy$AlcoholDependence ~ Alcohol_policy$SaleTimings + 
##     Alcohol_policy$MinLegalDrinkingAge + Alcohol_policy$PercapitaIncome + 
##     Alcohol_policy$PercapitaIncome + Alcohol_policy$HealthIndex + 
##     Alcohol_policy$PercentageBPL + Alcohol_policy$banofsalepublicplaces + 
##     Alcohol_policy$SDI2017 + Alcohol_policy$PercentageIPV + Alcohol_policy$minimumsaleprice + 
##     Alcohol_policy$policycontroldensity + Alcohol_policy$quotaforretailsale + 
##     Alcohol_policy$warninquality + Alcohol_policy$distributionsystem + 
##     Alcohol_policy$pointofsale, data = Alcohol_policy)
## 
## Residuals:
##    Min     1Q Median     3Q    Max 
## -5.235 -1.448  0.188  1.342  6.684 
## 
## Coefficients:
##                                         Estimate Std. Error t value Pr(>|t|)  
## (Intercept)                           -5.842e+00  9.080e+00  -0.643    0.534  
## Alcohol_policy$SaleTimings             2.073e-01  4.797e-01   0.432    0.675  
## Alcohol_policy$MinLegalDrinkingAge2    1.503e-02  3.534e+00   0.004    0.997  
## Alcohol_policy$MinLegalDrinkingAge3   -1.026e-01  4.181e+00  -0.025    0.981  
## Alcohol_policy$PercapitaIncome        -7.765e-06  1.745e-05  -0.445    0.666  
## Alcohol_policy$HealthIndex2           -1.101e+00  3.460e+00  -0.318    0.757  
## Alcohol_policy$HealthIndex3            3.856e+00  3.812e+00   1.012    0.336  
## Alcohol_policy$PercentageBPL          -1.927e-01  1.331e-01  -1.448    0.178  
## Alcohol_policy$banofsalepublicplaces2 -5.141e-01  6.897e+00  -0.075    0.942  
## Alcohol_policy$SDI20172                2.891e+00  4.197e+00   0.689    0.507  
## Alcohol_policy$SDI20173                4.344e+00  6.728e+00   0.646    0.533  
## Alcohol_policy$PercentageIPV           2.919e-01  1.310e-01   2.229    0.050 *
## Alcohol_policy$minimumsaleprice2      -1.009e-01  2.617e+00  -0.039    0.970  
## Alcohol_policy$policycontroldensity2  -1.440e+00  3.429e+00  -0.420    0.683  
## Alcohol_policy$quotaforretailsale2     2.736e+00  3.449e+00   0.793    0.446  
## Alcohol_policy$warninquality2          2.287e+00  2.488e+00   0.920    0.379  
## Alcohol_policy$distributionsystem2    -6.706e-01  4.211e+00  -0.159    0.877  
## Alcohol_policy$distributionsystem3    -2.137e+00  5.516e+00  -0.388    0.706  
## Alcohol_policy$distributionsystem4    -3.760e+00  3.918e+00  -0.959    0.360  
## Alcohol_policy$pointofsale2            2.826e+00  3.160e+00   0.894    0.392  
## Alcohol_policy$pointofsale3           -1.762e+00  5.982e+00  -0.295    0.774  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 4.568 on 10 degrees of freedom
## Multiple R-squared:  0.5746, Adjusted R-squared:  -0.2761 
## F-statistic: 0.6755 on 20 and 10 DF,  p-value: 0.7819

We conduct a multiple linear regression below for the current alcohol use

Use_current <- lm(Alcohol_policy$CurrentAlcoholUse ~ Alcohol_policy$SaleTimings + Alcohol_policy$MinLegalDrinkingAge + Alcohol_policy$PercapitaIncome + Alcohol_policy$PercapitaIncome  + Alcohol_policy$HealthIndex +Alcohol_policy$PercentageBPL + Alcohol_policy$banofsalepublicplaces  +Alcohol_policy$SDI2017+ Alcohol_policy$PercentageIPV +Alcohol_policy$minimumsaleprice + Alcohol_policy$policycontroldensity +Alcohol_policy$quotaforretailsale +Alcohol_policy$warninquality +Alcohol_policy$distributionsystem +Alcohol_policy$pointofsale, 
  data = Alcohol_policy
)

summary(Use_current)

## 
## Call:
## lm(formula = Alcohol_policy$CurrentAlcoholUse ~ Alcohol_policy$SaleTimings + 
##     Alcohol_policy$MinLegalDrinkingAge + Alcohol_policy$PercapitaIncome + 
##     Alcohol_policy$PercapitaIncome + Alcohol_policy$HealthIndex + 
##     Alcohol_policy$PercentageBPL + Alcohol_policy$banofsalepublicplaces + 
##     Alcohol_policy$SDI2017 + Alcohol_policy$PercentageIPV + Alcohol_policy$minimumsaleprice + 
##     Alcohol_policy$policycontroldensity + Alcohol_policy$quotaforretailsale + 
##     Alcohol_policy$warninquality + Alcohol_policy$distributionsystem + 
##     Alcohol_policy$pointofsale, data = Alcohol_policy)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -11.3456  -1.9749  -0.2165   2.7526  11.7556 
## 
## Coefficients:
##                                         Estimate Std. Error t value Pr(>|t|)  
## (Intercept)                           -2.428e+01  1.742e+01  -1.394   0.1935  
## Alcohol_policy$SaleTimings             9.771e-01  9.201e-01   1.062   0.3132  
## Alcohol_policy$MinLegalDrinkingAge2    1.174e+01  6.778e+00   1.731   0.1140  
## Alcohol_policy$MinLegalDrinkingAge3    9.646e+00  8.018e+00   1.203   0.2567  
## Alcohol_policy$PercapitaIncome         1.853e-05  3.348e-05   0.553   0.5921  
## Alcohol_policy$HealthIndex2            7.249e+00  6.637e+00   1.092   0.3004  
## Alcohol_policy$HealthIndex3            1.451e+01  7.311e+00   1.985   0.0752 .
## Alcohol_policy$PercentageBPL          -3.663e-01  2.553e-01  -1.435   0.1819  
## Alcohol_policy$banofsalepublicplaces2  6.004e+00  1.323e+01   0.454   0.6596  
## Alcohol_policy$SDI20172               -5.399e+00  8.049e+00  -0.671   0.5175  
## Alcohol_policy$SDI20173               -2.405e+00  1.290e+01  -0.186   0.8559  
## Alcohol_policy$PercentageIPV           3.917e-01  2.512e-01   1.559   0.1500  
## Alcohol_policy$minimumsaleprice2      -2.557e+00  5.019e+00  -0.509   0.6215  
## Alcohol_policy$policycontroldensity2  -7.191e+00  6.577e+00  -1.093   0.2999  
## Alcohol_policy$quotaforretailsale2     7.961e+00  6.615e+00   1.204   0.2565  
## Alcohol_policy$warninquality2          7.992e+00  4.771e+00   1.675   0.1249  
## Alcohol_policy$distributionsystem2    -2.443e+00  8.076e+00  -0.303   0.7684  
## Alcohol_policy$distributionsystem3     9.794e+00  1.058e+01   0.926   0.3763  
## Alcohol_policy$distributionsystem4    -2.421e+00  7.515e+00  -0.322   0.7540  
## Alcohol_policy$pointofsale2            9.365e+00  6.060e+00   1.545   0.1533  
## Alcohol_policy$pointofsale3            5.113e+00  1.147e+01   0.446   0.6654  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 8.76 on 10 degrees of freedom
## Multiple R-squared:  0.7025, Adjusted R-squared:  0.1076 
## F-statistic: 1.181 on 20 and 10 DF,  p-value: 0.4075

Based on the above analysis, there is a 1. significant positive relationship between Plot the model below - Minimum legal age of drinking and current alcohol use Others to be related 2. Significant negative association between Health index and alcohol use. States with higher health index has lower proportion of alcohol use For one unit increase in health index there is -7.900e-01 (e to be considered while interpretation) units decrease in prevalence of alcohol use.

Drink_drive_outcome <- lm(Alcohol_policy$DrinkDrive ~ Alcohol_policy$SaleTimings + Alcohol_policy$MinLegalDrinkingAge + Alcohol_policy$PercapitaIncome + Alcohol_policy$PercapitaIncome  + Alcohol_policy$HealthIndex +Alcohol_policy$PercentageBPL + Alcohol_policy$banofsalepublicplaces  +Alcohol_policy$SDI2017+ Alcohol_policy$PercentageIPV +Alcohol_policy$minimumsaleprice + Alcohol_policy$policycontroldensity +Alcohol_policy$quotaforretailsale +Alcohol_policy$warninquality +Alcohol_policy$distributionsystem +Alcohol_policy$pointofsale, 
  data = Alcohol_policy
)

summary(Drink_drive_outcome)

## 
## Call:
## lm(formula = Alcohol_policy$DrinkDrive ~ Alcohol_policy$SaleTimings + 
##     Alcohol_policy$MinLegalDrinkingAge + Alcohol_policy$PercapitaIncome + 
##     Alcohol_policy$PercapitaIncome + Alcohol_policy$HealthIndex + 
##     Alcohol_policy$PercentageBPL + Alcohol_policy$banofsalepublicplaces + 
##     Alcohol_policy$SDI2017 + Alcohol_policy$PercentageIPV + Alcohol_policy$minimumsaleprice + 
##     Alcohol_policy$policycontroldensity + Alcohol_policy$quotaforretailsale + 
##     Alcohol_policy$warninquality + Alcohol_policy$distributionsystem + 
##     Alcohol_policy$pointofsale, data = Alcohol_policy)
## 
## Residuals:
##    Min     1Q Median     3Q    Max 
## -801.4 -216.9    0.0  288.4  993.8 
## 
## Coefficients:
##                                         Estimate Std. Error t value Pr(>|t|)   
## (Intercept)                            1.367e+03  1.347e+03   1.015  0.33401   
## Alcohol_policy$SaleTimings            -5.079e+01  7.117e+01  -0.714  0.49171   
## Alcohol_policy$MinLegalDrinkingAge2    4.521e+02  5.243e+02   0.862  0.40872   
## Alcohol_policy$MinLegalDrinkingAge3    3.507e+02  6.202e+02   0.565  0.58421   
## Alcohol_policy$PercapitaIncome         2.345e-03  2.589e-03   0.906  0.38648   
## Alcohol_policy$HealthIndex2           -5.972e+02  5.134e+02  -1.163  0.27169   
## Alcohol_policy$HealthIndex3           -6.313e+02  5.655e+02  -1.116  0.29040   
## Alcohol_policy$PercentageBPL          -2.933e+01  1.974e+01  -1.485  0.16826   
## Alcohol_policy$banofsalepublicplaces2  3.453e+01  1.023e+03   0.034  0.97374   
## Alcohol_policy$SDI20172               -2.080e+03  6.226e+02  -3.341  0.00748 **
## Alcohol_policy$SDI20173               -2.941e+03  9.981e+02  -2.947  0.01461 * 
## Alcohol_policy$PercentageIPV           1.647e+01  1.943e+01   0.848  0.41653   
## Alcohol_policy$minimumsaleprice2       2.750e+02  3.883e+02   0.708  0.49498   
## Alcohol_policy$policycontroldensity2  -8.689e+02  5.087e+02  -1.708  0.11843   
## Alcohol_policy$quotaforretailsale2     2.635e+02  5.116e+02   0.515  0.61770   
## Alcohol_policy$warninquality2         -4.944e+01  3.690e+02  -0.134  0.89609   
## Alcohol_policy$distributionsystem2     2.539e+02  6.247e+02   0.406  0.69297   
## Alcohol_policy$distributionsystem3     1.214e+03  8.182e+02   1.484  0.16874   
## Alcohol_policy$distributionsystem4     1.241e+03  5.813e+02   2.135  0.05855 . 
## Alcohol_policy$pointofsale2            7.630e+02  4.688e+02   1.628  0.13464   
## Alcohol_policy$pointofsale3            9.094e+02  8.875e+02   1.025  0.32968   
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 677.6 on 10 degrees of freedom
## Multiple R-squared:  0.6921, Adjusted R-squared:  0.07637 
## F-statistic: 1.124 on 20 and 10 DF,  p-value: 0.4416

Plotting the multiple linear regression Using ggPredict to plot, however, there are only three independent variables that are allowed Link to read: https://cran.r-project.org/web/packages/ggiraphExtra/vignettes/ggPredict.html The above link provides all the necessary codes to plot for both linear and logistic regression

library(ggplot2)
require(moonBook)

## Loading required package: moonBook

require(ggiraph)

## Loading required package: ggiraph

require(ggiraphExtra)

## Loading required package: ggiraphExtra

## 
## Attaching package: 'ggiraphExtra'

## The following objects are masked from 'package:moonBook':
## 
##     addLabelDf, getMapping

require(plyr)

## Loading required package: plyr

ggPredict(Drink_drive_outcome,interactive = TRUE)

## Warning in ggPredict(Drink_drive_outcome, interactive = TRUE): maximum three
## independent variables are allowed

## NULL

Exploratory analysis

library(tidyverse)

## ── Attaching packages ─────────────────────────────────────── tidyverse 1.3.2 ──
## ✔ tibble  3.1.8      ✔ dplyr   1.0.10
## ✔ tidyr   1.2.1      ✔ stringr 1.4.1 
## ✔ purrr   0.3.5      ✔ forcats 0.5.2 
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::arrange()   masks plyr::arrange()
## ✖ purrr::compact()   masks plyr::compact()
## ✖ dplyr::count()     masks plyr::count()
## ✖ dplyr::failwith()  masks plyr::failwith()
## ✖ dplyr::filter()    masks stats::filter()
## ✖ dplyr::id()        masks plyr::id()
## ✖ dplyr::lag()       masks stats::lag()
## ✖ dplyr::mutate()    masks plyr::mutate()
## ✖ dplyr::rename()    masks plyr::rename()
## ✖ dplyr::summarise() masks plyr::summarise()
## ✖ dplyr::summarize() masks plyr::summarize()

dataSetForAnalysis <- Alcohol_policy %>%
 filter(!Alcohol_policy$state %in% c("Bihar","Jammu and Kashm"))
head(dataSetForAnalysis)

scaling of data

dataSetForAnalysis$PercapitaIncome <- as.numeric(scale(
  dataSetForAnalysis$PercapitaIncome))
dataSetForAnalysis$SaleTimings <- as.numeric(
  scale(dataSetForAnalysis$SaleTimings)
)
dataSetForAnalysis$PercentageBPL <- as.numeric(scale(
  dataSetForAnalysis$PercentageBPL))
dataSetForAnalysis$PercentageIPV <- as.numeric(
  scale(dataSetForAnalysis$PercentageIPV)
)
dataSetForAnalysis$DrinkDrive <- as.numeric(scale(
  dataSetForAnalysis$DrinkDrive))
dataSetForAnalysis$CurrentAlcoholUse <- as.numeric(
  scale(dataSetForAnalysis$CurrentAlcoholUse)
)
dataSetForAnalysis$AlcoholDependence <- as.numeric(scale(
  dataSetForAnalysis$AlcoholDependence))

Cluster

head(dataSetForAnalysis)

library(cluster)
library(factoextra)

## Welcome! Want to learn more? See two factoextra-related books at https://goo.gl/ve3WBa

gowerDist <- daisy(dataSetForAnalysis,"gower")
gowerMat <- as.matrix(gowerDist)
sil_width <- c(NA)
for(i in 2:8){
pam_fit <- pam(gowerMat, diss = TRUE, k = i)
sil_width[i] <- pam_fit$silinfo$avg.width
 }
plot(1:8, sil_width,
  xlab = "Number of clusters",
  ylab = "Silhouette Width")
 lines(1:8, sil_width)

pam_fit <- pam(gowerMat, diss = TRUE,3) 
summary(pam_fit)

## Medoids:
##      ID       
## [1,] "8"  "8" 
## [2,] "25" "25"
## [3,] "13" "13"
## Clustering vector:
##  1  2  3  4  5  6  7  8  9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 
##  1  2  2  3  1  3  1  1  1  1  2  1  3  2  2  3  1  3  3  1  1  3  1  2  2  1 
## 27 28 29 
##  3  1  3 
## Objective function:
##     build      swap 
## 0.2764953 0.2612441 
## 
## Numerical information per cluster:
##      size  max_diss   av_diss  diameter separation
## [1,]   13 0.4106128 0.2529199 0.5307471  0.2758026
## [2,]    7 0.4433533 0.2533553 0.5565546  0.2772527
## [3,]    9 0.3897091 0.2794037 0.5927557  0.2758026
## 
## Isolated clusters:
##  L-clusters: character(0)
##  L*-clusters: character(0)
## 
## Silhouette plot information:
##    cluster neighbor   sil_width
## 8        1        2  0.36897905
## 20       1        2  0.32025263
## 23       1        2  0.31894789
## 1        1        2  0.31048562
## 5        1        3  0.21803917
## 12       1        2  0.19916845
## 10       1        2  0.19635027
## 28       1        2  0.18798819
## 21       1        2  0.17550809
## 7        1        2  0.15840872
## 9        1        3  0.15440953
## 17       1        2  0.07744852
## 26       1        2 -0.01617070
## 25       2        1  0.30174129
## 24       2        1  0.24907709
## 2        2        3  0.24092850
## 14       2        3  0.18653566
## 15       2        1  0.13907254
## 3        2        3 -0.08842137
## 11       2        1 -0.11487833
## 6        3        2  0.29418474
## 13       3        2  0.29109776
## 4        3        2  0.27546398
## 18       3        1  0.23629348
## 16       3        1  0.22295659
## 27       3        1  0.11367842
## 19       3        2  0.10905854
## 29       3        1  0.08048918
## 22       3        2  0.01447485
## Average silhouette width per cluster:
## [1] 0.2053704 0.1305793 0.1819664
## Average silhouette width of total data set:
## [1] 0.1800541
## 
## Available components:
## [1] "medoids"    "id.med"     "clustering" "objective"  "isolation" 
## [6] "clusinfo"   "silinfo"    "diss"       "call"

plot(pam_fit)

library(Rtsne)
tsne_obj <- Rtsne(gowerMat, is_distance = TRUE,perplexity = 1)
 tsne_data <- tsne_obj$Y %>%
      data.frame() %>%
      setNames(c("X", "Y")) %>%
      mutate(cluster = factor(pam_fit$clustering))
  ggplot(aes(x = X, y = Y), data = tsne_data) +
      geom_point(aes(color = cluster))

dataSetForAnalysis$cluster <- pam_fit$clustering
 giveProp <- function(x){
   y <- mean(x=="yes",na.rm=TRUE)
   y <- ceiling(y*100)
   return(y)
 }
tab1 <- dataSetForAnalysis %>%
   group_by(cluster)%>%
   summarise(across(c(2,4,5,6,8,10),giveProp))
tab1

tab2 <- dataSetForAnalysis %>%
  group_by(cluster)%>%
  summarise(across(where(is.numeric),mean))
tableSummary <- left_join(tab1,tab2)

## Joining, by = c("cluster", "SaleTimings", "PercapitaIncome", "PercentageBPL")

tab3 <- dataSetForAnalysis %>%
  group_by(cluster)%>%
  summarise("NumOfStates"= n(),
            "state"= paste0(state,collapse = ", "))

tableSummary <- left_join(tableSummary,tab3)

## Joining, by = "cluster"

tableSummary$medioids <- dataSetForAnalysis$State[pam_fit$id.med]

## Warning: Unknown or uninitialised column: `State`.

tableSummary <- t(tableSummary)
library(kableExtra)

## Warning in !is.null(rmarkdown::metadata$output) && rmarkdown::metadata$output
## %in% : 'length(x) = 3 > 1' in coercion to 'logical(1)'

## 
## Attaching package: 'kableExtra'
## 
## The following object is masked from 'package:dplyr':
## 
##     group_rows

kable(tableSummary,digits = 3)

cluster	1	2	3
SaleTimings	0	0	0
PercapitaIncome	0	0	0
HealthIndex	0	0	0
PercentageBPL	0	0	0
SDI2017	0	0	0
banofsalepublicplaces	0	0	0
DrinkDrive	NA	NA	NA
PercentageIPV	NA	NA	NA
CurrentAlcoholUse	NA	NA	NA
AlcoholDependence	NA	NA	NA
NumOfStates	13	7	9
state	Andaman and Nic, Chandigarh, Dadra and Nagar, Daman and Diu, Delhi, Goa, Himachalpradesh, Maharashtra, Puducherry, Punjab, Sikkim, Tripura, Uttarakhand	Andra pradesh, Arunachal Prade, Haryana, Karnataka, Kerala, Tamil nadu, Telangana	Assam, Chattisgarh, Jharkand, Madhya Pradesh, Meghalaya, Odisha, Rajasthan, Uttar Pradesh, West bengal

Harshitha_paper