Analyzing the Gujarat elections and understanding the factors that played a key role.

library(ggplot2)
library(data.table)
library(plyr)
library(DMwR)
library(car)
library(caret)
library(SDMTools)
package ‘SDMTools’ was built under R version 3.4.3
Attaching package: ‘SDMTools’

The following objects are masked from ‘package:ModelMetrics’:

    auc, sensitivity, specificity

The following objects are masked from ‘package:caret’:

    sensitivity, specificity
# Load the candidate-level data from a CSV chosen interactively.
# `TRUE` is spelled out because `T` is an ordinary variable that can be
# reassigned.
GUJanalysis <- read.csv(file.choose(), header = TRUE)
# Drop every row that has a missing value in any column.
GUJanalysis <- na.omit(GUJanalysis)
# Criminal-case count as a number. Converting through as.character() keeps the
# real values when read.csv() has turned the column into a factor; calling
# as.numeric() directly on a factor returns the internal level codes (1, 2, ...)
# and would make every candidate look like they have a criminal case.
GUJanalysis$Criminal <- as.numeric(as.character(GUJanalysis$Criminal.Case))
# 1 when the candidate has at least one criminal case, else 0.
GUJanalysis$Crime <- ifelse(GUJanalysis$Criminal > 0, 1, 0)
# 1 when declared total assets exceed 10,000,000 (one crore).
GUJanalysis$Crorepati <- ifelse(GUJanalysis$Total.Assets > 10000000, 1, 0)

# Party dummies.
GUJanalysis$INC <- ifelse(GUJanalysis$Party == "INC", 1, 0)
GUJanalysis$BJP <- ifelse(GUJanalysis$Party == "BJP", 1, 0)

# One dummy per degree-level education category ...
GUJanalysis$Graduate <- ifelse(GUJanalysis$Education == "Graduate", 1, 0)
GUJanalysis$Doctorate <- ifelse(GUJanalysis$Education == "Doctorate", 1, 0)
GUJanalysis$Graduate.Professional <- ifelse(
  GUJanalysis$Education == "Graduate Professional", 1, 0
)
GUJanalysis$Post.Graduate <- ifelse(
  GUJanalysis$Education == "Post Graduate", 1, 0
)
# ... and their sum: 1 for any degree holder, 0 otherwise (the four categories
# are mutually exclusive, so the sum never exceeds 1).
GUJanalysis$Grad <- GUJanalysis$Graduate + GUJanalysis$Doctorate +
  GUJanalysis$Graduate.Professional + GUJanalysis$Post.Graduate

# 1 when the `ser` column is positive.
# NOTE(review): `ser` presumably counts serious criminal cases -- confirm
# against the data dictionary.
GUJanalysis$Serious <- ifelse(GUJanalysis$ser > 0, 1, 0)
# List the distinct education categories present in the data.
levels(GUJanalysis$Education)
 [1] "10th Pass"             "12th Pass"             "5th Pass"             
 [4] "8th Pass"              "Doctorate"             "Graduate"             
 [7] "Graduate Professional" "Illiterate"            "Literate"             
[10] "Not Given"             "Others"                "Post Graduate"        
# Collapse the detailed education categories into broader qualification bands.
# The lookup replaces a chain of one-per-level assignments in which the
# "Illiterate" level was misspelled ("Iliterate") and therefore never recoded.
# Categories without an entry here (e.g. "Not Given") keep their original label.
GUJanalysis$Qual <- local({
  qual_by_education <- c(
    "10th Pass"             = "School.Complete",
    "12th Pass"             = "School.Complete",
    "Others"                = "School.Complete",
    "5th Pass"              = "Secondary",
    "8th Pass"              = "Secondary",
    "Graduate"              = "Graduate",
    "Graduate Professional" = "Graduate",
    "Post Graduate"         = "Graduate",
    "Doctorate"             = "Graduate",
    "Illiterate"            = "No.Schooling",
    "Literate"              = "No.Schooling"
  )
  education_chr <- as.character(GUJanalysis$Education)
  # Only touch rows whose category has a mapping; everything else is left as is.
  known <- education_chr %in% names(qual_by_education)
  education_chr[known] <- unname(qual_by_education[education_chr[known]])
  as.factor(education_chr)
})
# Approximate years of schooling implied by each education category.
# This single lookup replaces a deeply nested ifelse() chain whose first,
# truncated attempt failed to parse ("Incomplete expression").
GUJanalysis$years.studied <- local({
  years_by_education <- c(
    "Illiterate"            = 0,
    "Literate"              = 1,
    "5th Pass"              = 5,
    "8th Pass"              = 8,
    "10th Pass"             = 10,
    "12th Pass"             = 12,
    "Graduate"              = 15,
    "Graduate Professional" = 16,
    "Post Graduate"         = 17,
    "Doctorate"             = 22
  )
  education_chr <- as.character(GUJanalysis$Education)
  years <- unname(years_by_education[education_chr])
  # Categories with no entry above (e.g. "Not Given", "Others") default to 5,
  # as in the nested ifelse() version; a missing Education value stays NA.
  years[is.na(years) & !is.na(education_chr)] <- 5
  years
})
# Sanity check: add up the `win` flag across all candidate rows.
sum(GUJanalysis[["win"]])
[1] 182
# Total votes cast in each constituency, repeated on every candidate row.
GUJanalysis$totvot <- ave(GUJanalysis$Vote, GUJanalysis$const, FUN = sum)
# Each candidate's share of the votes cast in their constituency.
GUJanalysis$voteshare <- GUJanalysis$Vote / GUJanalysis$totvot

# Rank candidates by total assets within their constituency: 1 = largest
# assets, ties broken by order of appearance.
GUJanalysis$rank <- ave(
  GUJanalysis$Total.Assets,
  GUJanalysis$const,
  FUN = function(assets) rank(-assets, ties.method = "first")
)
GUJanalysis$Asset.Rank <- GUJanalysis$rank
# Min-max rescale a numeric vector to the [0, 1] interval.
#
# x: numeric vector.
# Returns a numeric vector of the same length as `x`. When every element of
# `x` has the same value (including a single-element vector) the range is zero
# and the result is all zeros instead of NaN from 0/0. A missing value in `x`
# makes min/max NA, so the whole result is NA.
normalize <- function(x) {
  if (length(x) == 0L) {
    return(x)
  }
  lowest <- min(x)
  span <- max(x) - lowest
  if (!is.na(span) && span == 0) {
    return(rep(0, length(x)))
  }
  (x - lowest) / span
}
# Rescale years of schooling and total assets within each constituency using
# normalize(), so candidates are compared with their direct rivals.
GUJanalysis$mean.studied <- ave(
  GUJanalysis$years.studied, GUJanalysis$const,
  FUN = normalize
)
GUJanalysis$mean.assets <- ave(
  GUJanalysis$Total.Assets, GUJanalysis$const,
  FUN = normalize
)

# Label the three richest candidates of each constituency R1..R3; everyone
# else is "NR". A missing rank stays missing.
rich_label <- c("R1", "R2", "R3")[match(GUJanalysis$Asset.Rank, 1:3)]
rich_label[is.na(rich_label) & !is.na(GUJanalysis$Asset.Rank)] <- "NR"
GUJanalysis$Rich <- as.factor(rich_label)

# Outcome as a two-level factor ("Win" / "No") for plotting.
GUJanalysis$Status <- as.factor(ifelse(GUJanalysis$win == 1, "Win", "No"))
# Normalised assets against within-constituency asset rank, coloured by outcome.
ggplot(GUJanalysis, aes(mean.assets, Asset.Rank, colour = Status)) +
  geom_point()

# Five-number summary (plus mean) of the vote share.
summary(GUJanalysis$voteshare)
    Min.  1st Qu.   Median     Mean  3rd Qu.     Max. 
0.000000 0.002069 0.004921 0.099615 0.017445 0.903633 
# Vote share against normalised assets, coloured by asset-rank group.
ggplot(GUJanalysis, aes(mean.assets, voteshare, colour = Rich)) +
  geom_point()

# Normalised schooling against normalised assets, coloured by outcome.
ggplot(GUJanalysis, aes(mean.assets, mean.studied, colour = Status)) +
  geom_point()

# Normalised schooling against asset rank. Coloured by the Status factor, like
# the other outcome plots: the numeric 0/1 `win` column would be drawn as a
# continuous colour gradient instead of two discrete groups.
ggplot(GUJanalysis, aes(Asset.Rank, mean.studied, colour = Status)) +
  geom_point()

# Distribution of vote share within each asset-rank group.
boxplot(GUJanalysis$voteshare ~ GUJanalysis$Rich)

# One-way ANOVA of vote share on the asset-rank group (Rich), followed by the
# ANOVA table.
aov.Rich<-aov(GUJanalysis$voteshare  ~GUJanalysis$Rich)
summary(aov.Rich)
                   Df Sum Sq Mean Sq F value Pr(>F)    
GUJanalysis$Rich    3  37.02  12.340   772.9 <2e-16 ***
Residuals        1813  28.95   0.016                   
---
Signif. codes:  0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1
# Tukey HSD pairwise comparisons between the Rich groups of the ANOVA above;
# evaluating the object prints the comparison table.
tk.1<-TukeyHSD(aov.Rich)
tk.1
  Tukey multiple comparisons of means
    95% family-wise confidence level

Fit: aov(formula = GUJanalysis$voteshare ~ GUJanalysis$Rich)

$`GUJanalysis$Rich`
             diff         lwr         upr    p adj
R1-NR  0.37709899  0.35129063  0.40290734 0.000000
R2-NR  0.34581726  0.32000890  0.37162561 0.000000
R3-NR  0.10405825  0.07818719  0.12992930 0.000000
R2-R1 -0.03128173 -0.06543637  0.00287291 0.086327
R3-R1 -0.27304074 -0.30724279 -0.23883870 0.000000
R3-R2 -0.24175901 -0.27596106 -0.20755697 0.000000
# Distribution of vote share within each qualification band (Qual).
boxplot(GUJanalysis$voteshare  ~GUJanalysis$Qual)

# One-way ANOVA of vote share on the qualification band, followed by the
# ANOVA table.
aov.Qualification<-aov(GUJanalysis$voteshare  ~GUJanalysis$Qual)
summary(aov.Qualification)
                   Df Sum Sq Mean Sq F value Pr(>F)    
GUJanalysis$Qual    5   3.32  0.6649   19.22 <2e-16 ***
Residuals        1811  62.64  0.0346                   
---
Signif. codes:  0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1
# Tukey HSD pairwise comparisons between the qualification bands; evaluating
# the object prints the comparison table.
tk.3<-TukeyHSD(aov.Qualification)
tk.3
  Tukey multiple comparisons of means
    95% family-wise confidence level

Fit: aov(formula = GUJanalysis$voteshare ~ GUJanalysis$Qual)

$`GUJanalysis$Qual`
                                     diff         lwr          upr     p adj
Illiterate-Graduate          -0.121404654 -0.23716139 -0.005647921 0.0334231
No.Schooling-Graduate        -0.098855060 -0.15196468 -0.045745442 0.0000018
Not Given-Graduate           -0.127568323 -0.24332506 -0.011811590 0.0209710
School.Complete-Graduate     -0.029607792 -0.06143924  0.002223655 0.0853726
Secondary-Graduate           -0.101968385 -0.13626473 -0.067672036 0.0000000
No.Schooling-Illiterate       0.022549594 -0.09997424  0.145073429 0.9951919
Not Given-Illiterate         -0.006163669 -0.16613579  0.153808454 0.9999978
School.Complete-Illiterate    0.091796863 -0.02311484  0.206708567 0.2031735
Secondary-Illiterate          0.019436269 -0.09618249  0.135055032 0.9968656
Not Given-No.Schooling       -0.028713263 -0.15123710  0.093810572 0.9853343
School.Complete-No.Schooling  0.069247269  0.01800559  0.120488950 0.0016667
Secondary-No.Schooling       -0.003113325 -0.05592155  0.049694899 0.9999814
School.Complete-Not Given     0.097960532 -0.01695117  0.212872236 0.1457994
Secondary-Not Given           0.025599938 -0.09001882  0.141218701 0.9886680
Secondary-School.Complete    -0.072360593 -0.10368659 -0.041034598 0.0000000
# Split by constituency rather than by candidate row, so that all candidates
# of one constituency end up on the same side of the train/validation split.

# Column 2 is taken as the constituency identifier (used as `const` below).
Gujconst <- GUJanalysis[, 2, drop = FALSE]
Gujconst$const <- as.factor(Gujconst$const)
Gujconst$flag <- 1
# One row per distinct constituency.
Gujconst1 <- unique(Gujconst)

# Draw group 1 or 2 for every constituency with probabilities 0.7 / 0.3.
set.seed(1234)
pd <- sample(2, nrow(Gujconst1), replace = TRUE, prob = c(0.7, 0.3))
trainconst <- Gujconst1[pd == 1, ]
valconst <- Gujconst1[pd == 2, ]

# Inner joins pull the candidate rows of the selected constituencies.
train <- merge(GUJanalysis, trainconst, by = "const", all = FALSE)
val <- merge(GUJanalysis, valconst, by = "const", all = FALSE)
head(train)
head(val)
# Model formulas sharing one predictor set: two with the win flag as response
# (lpm.1, logit.1) and one with vote share as response (Linear.1).
lpm.1 <- win ~ female + Crime + Crorepati + Grad + Serious + Asset.Rank +
  mean.studied + mean.assets
logit.1 <- win ~ female + Crime + Crorepati + Grad + Serious + Asset.Rank +
  mean.studied + mean.assets
Linear.1 <- voteshare ~ female + Crime + Crorepati + Grad + Serious +
  Asset.Rank + mean.studied + mean.assets

# Ordinary least squares fit of vote share on the training constituencies.
OLS1 <- lm(formula = Linear.1, data = train)
summary(OLS1)

Call:
lm(formula = Linear.1, data = train)

Residuals:
     Min       1Q   Median       3Q      Max 
-0.47019 -0.03859 -0.01014  0.01349  0.60113 

Coefficients:
               Estimate Std. Error t value Pr(>|t|)    
(Intercept)   0.0287916  0.0107858   2.669  0.00769 ** 
female        0.0356152  0.0141653   2.514  0.01204 *  
Crime         0.0701047  0.0161768   4.334 1.57e-05 ***
Crorepati     0.1363147  0.0124104  10.984  < 2e-16 ***
Grad         -0.0213060  0.0116562  -1.828  0.06779 .  
Serious      -0.0188298  0.0198634  -0.948  0.34331    
Asset.Rank   -0.0036601  0.0009141  -4.004 6.56e-05 ***
mean.studied  0.0290803  0.0155399   1.871  0.06151 .  
mean.assets   0.2292471  0.0164992  13.894  < 2e-16 ***
---
Signif. codes:  0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1

Residual standard error: 0.1309 on 1378 degrees of freedom
Multiple R-squared:  0.526, Adjusted R-squared:  0.5233 
F-statistic: 191.2 on 8 and 1378 DF,  p-value: < 2.2e-16
# Variance inflation factors of the full OLS model (collinearity check).
car::vif(OLS1)
      female        Crime    Crorepati         Grad      Serious   Asset.Rank mean.studied 
    1.015365     2.446866     2.177067     2.149750     2.407646     1.541390     2.127204 
 mean.assets 
    2.289479 

Retain only the predictors that are significant at the 10% level (this drops Serious).

# Reduced vote-share model: same predictors as the full model minus Serious.
Linear <- voteshare ~ female + Crime + Crorepati + Grad + Asset.Rank +
  mean.studied + mean.assets
OLS <- lm(formula = Linear, data = train)
summary(OLS)

Call:
lm(formula = Linear, data = train)

Residuals:
     Min       1Q   Median       3Q      Max 
-0.46074 -0.03875 -0.01024  0.01348  0.60131 

Coefficients:
              Estimate Std. Error t value Pr(>|t|)    
(Intercept)   0.029195   0.010777   2.709  0.00683 ** 
female        0.035704   0.014165   2.521  0.01183 *  
Crime         0.058509   0.010586   5.527 3.89e-08 ***
Crorepati     0.136673   0.012404  11.018  < 2e-16 ***
Grad         -0.020624   0.011634  -1.773  0.07648 .  
Asset.Rank   -0.003673   0.000914  -4.019 6.16e-05 ***
mean.studied  0.028235   0.015514   1.820  0.06898 .  
mean.assets   0.228480   0.016479  13.865  < 2e-16 ***
---
Signif. codes:  0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1

Residual standard error: 0.1309 on 1379 degrees of freedom
Multiple R-squared:  0.5257,    Adjusted R-squared:  0.5233 
F-statistic: 218.4 on 7 and 1379 DF,  p-value: < 2.2e-16
# Predicted vote share for every candidate in the validation constituencies.
val$Pred_OLS <- predict(OLS, newdata = val)

# Attach the highest predicted share of each constituency to its rows ...
Pred.winner <- ddply(
  val, .(const), transform,
  max.share = max(Pred_OLS)
)
# ... and flag the candidate(s) reaching that maximum as the predicted winner.
Pred.winner$winner <- as.numeric(Pred.winner$Pred_OLS == Pred.winner$max.share)

# Confusion table: rows = predicted winner flag, columns = actual win flag.
tab.LM <- table(Pred.winner$winner, Pred.winner$win)
tab.LM
   
      0   1
  0 351  34
  1  34  11
# Share of validation candidates on the diagonal of the confusion table,
# i.e. whose predicted flag equals the actual outcome.
accuracy.LM <- sum(diag(tab.LM)) / sum(tab.LM)
accuracy.LM
[1] 0.8418605
# Logistic regression of the win flag on a reduced predictor set
# (this replaces the earlier definition of logit.1).
logit.1 <- win ~ female + Grad + Serious + mean.studied + mean.assets
Logit.WOSM <- glm(formula = logit.1, family = binomial, data = train)
summary(Logit.WOSM)

Call:
glm(formula = logit.1, family = binomial, data = train)

Deviance Residuals: 
    Min       1Q   Median       3Q      Max  
-1.3642  -0.3230  -0.2454  -0.2380   2.6655  

Coefficients:
              Estimate Std. Error z value Pr(>|z|)    
(Intercept)  -3.591774   0.255183 -14.075   <2e-16 ***
female        0.913674   0.388978   2.349   0.0188 *  
Grad          0.007308   0.329816   0.022   0.9823    
Serious       0.557695   0.294087   1.896   0.0579 .  
mean.studied  0.090100   0.469803   0.192   0.8479    
mean.assets   3.365658   0.248473  13.545   <2e-16 ***
---
Signif. codes:  0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1

(Dispersion parameter for binomial family taken to be 1)

    Null deviance: 894.29  on 1386  degrees of freedom
Residual deviance: 664.77  on 1381  degrees of freedom
AIC: 676.77

Number of Fisher Scoring iterations: 6
# Variance inflation factors of the five-predictor logit model.
car::vif(Logit.WOSM)
      female         Grad      Serious mean.studied  mean.assets 
    1.055292     2.306362     1.020196     2.313092     1.090951 
# Refit the logit with only Serious and mean.assets; the fitted object
# overwrites the previous Logit.WOSM.
logit.2 <- win ~ Serious + mean.assets
Logit.WOSM <- glm(formula = logit.2, family = binomial, data = train)
summary(Logit.WOSM)

Call:
glm(formula = logit.2, family = binomial, data = train)

Deviance Residuals: 
    Min       1Q   Median       3Q      Max  
-1.3468  -0.3022  -0.2547  -0.2509   2.6253  

Coefficients:
            Estimate Std. Error z value Pr(>|z|)    
(Intercept)  -3.4441     0.1633 -21.095   <2e-16 ***
Serious       0.5279     0.2914   1.812   0.0701 .  
mean.assets   3.3060     0.2395  13.803   <2e-16 ***
---
Signif. codes:  0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1

(Dispersion parameter for binomial family taken to be 1)

    Null deviance: 894.29  on 1386  degrees of freedom
Residual deviance: 669.79  on 1384  degrees of freedom
AIC: 675.79

Number of Fisher Scoring iterations: 6
# Variance inflation factors of the two-predictor logit model.
car::vif(Logit.WOSM)
    Serious mean.assets 
     1.0159      1.0159 
# Predicted win probabilities for the validation candidates.
pred.logit.WOSM <- predict(Logit.WOSM, newdata = val, type = "response")

# Confusion matrix of observed win flag against predictions cut at 0.5.
tab.logit.WOSM <- confusion.matrix(
  val$win,
  pred.logit.WOSM,
  threshold = 0.5
)
tab.logit.WOSM
    obs
pred   0  1
   0 376 43
   1   9  2
attr(,"class")
[1] "confusion.matrix"
# Share of validation candidates on the diagonal of the logit confusion matrix.
accuracy.logit.WOSM <- sum(diag(tab.logit.WOSM)) / sum(tab.logit.WOSM)
accuracy.logit.WOSM
[1] 0.8790698
