Analyzing the Gujarat elections and understanding the factors that played a key role.
library(ggplot2)
library(data.table)
library(plyr)
library(DMwR)
library(car)
library(caret)
library(SDMTools)
package ‘SDMTools’ was built under R version 3.4.3
Attaching package: ‘SDMTools’
The following objects are masked from ‘package:ModelMetrics’:
auc, sensitivity, specificity
The following objects are masked from ‘package:caret’:
sensitivity, specificity
# Read the candidate-level data; the CSV file is picked interactively.
GUJanalysis <- read.csv(file.choose(), header = TRUE)
# Keep only rows that have no missing values.
GUJanalysis <- na.omit(GUJanalysis)
# Numeric copy of Criminal.Case.
# NOTE(review): if Criminal.Case is read as a factor this yields level codes,
# not the recorded counts - confirm the column is numeric.
GUJanalysis$Criminal <- as.numeric(GUJanalysis$Criminal.Case)
# 1/0 indicators (stored as numeric) derived from the raw columns.
GUJanalysis$Crime <- as.numeric(GUJanalysis$Criminal > 0)
# Total.Assets above 10,000,000 (one crore).
GUJanalysis$Crorepati <- as.numeric(GUJanalysis$Total.Assets > 1e7)
GUJanalysis$INC <- as.numeric(GUJanalysis$Party == "INC")
GUJanalysis$BJP <- as.numeric(GUJanalysis$Party == "BJP")
GUJanalysis$Graduate <- as.numeric(GUJanalysis$Education == "Graduate")
GUJanalysis$Doctorate <- as.numeric(GUJanalysis$Education == "Doctorate")
GUJanalysis$Graduate.Professional <- as.numeric(GUJanalysis$Education == "Graduate Professional")
GUJanalysis$Post.Graduate <- as.numeric(GUJanalysis$Education == "Post Graduate")
# Grad is 1 when any of the four degree-level indicators is 1 (they are
# mutually exclusive because each tests a different Education value).
GUJanalysis$Grad <- GUJanalysis$Graduate + GUJanalysis$Doctorate +
  GUJanalysis$Graduate.Professional + GUJanalysis$Post.Graduate
GUJanalysis$Serious <- as.numeric(GUJanalysis$ser > 0)
# List the distinct Education levels present in the data.
levels(GUJanalysis$Education)
[1] "10th Pass" "12th Pass" "5th Pass"
[4] "8th Pass" "Doctorate" "Graduate"
[7] "Graduate Professional" "Illiterate" "Literate"
[10] "Not Given" "Others" "Post Graduate"
# Collapse the detailed Education levels into broader qualification bands.
GUJanalysis$Qual <- as.character(GUJanalysis$Education)
GUJanalysis$Qual[GUJanalysis$Qual %in%
  c("Doctorate", "Graduate", "Graduate Professional", "Post Graduate")] <- "Graduate"
GUJanalysis$Qual[GUJanalysis$Qual %in%
  c("10th Pass", "12th Pass", "Others")] <- "School.Complete"
GUJanalysis$Qual[GUJanalysis$Qual %in% c("5th Pass", "8th Pass")] <- "Secondary"
# Fix: the level is spelled "Illiterate" in the data. The previous comparison
# against "Iliterate" never matched, so those rows were left as a separate
# band instead of being folded into "No.Schooling". Any ANOVA/Tukey output
# recorded before this fix still shows the extra "Illiterate" band.
GUJanalysis$Qual[GUJanalysis$Qual %in% c("Illiterate", "Literate")] <- "No.Schooling"
# "Not Given" has no recode rule and therefore remains its own band.
# NOTE(review): confirm that is intended.
GUJanalysis$Qual <- as.factor(GUJanalysis$Qual)
# Years of schooling assigned to each Education level.
# This replaces an incomplete copy of the statement (which failed with
# "Incomplete expression") plus the complete nested-ifelse version; the
# mapping and the fallback value are unchanged.
GUJanalysis$years.studied <- local({
  years_by_level <- c(
    "Illiterate" = 0, "Literate" = 1, "5th Pass" = 5, "8th Pass" = 8,
    "10th Pass" = 10, "12th Pass" = 12, "Graduate" = 15,
    "Graduate Professional" = 16, "Post Graduate" = 17, "Doctorate" = 22
  )
  level <- as.character(GUJanalysis$Education)
  years <- unname(years_by_level[level])
  # Levels without an entry above (e.g. "Not Given", "Others") fall back to 5,
  # as in the original nested ifelse; a missing Education stays NA.
  # NOTE(review): confirm 5 is the intended default for those levels.
  years[is.na(years) & !is.na(level)] <- 5
  years
})
# Total of the win column (presumably a 0/1 winner flag, so this counts winners).
sum(GUJanalysis$win)
[1] 182
# Total votes per constituency, repeated on every candidate row.
GUJanalysis$totvot <- ave(GUJanalysis$Vote, GUJanalysis$const, FUN = sum)
# Each candidate's fraction of the constituency total.
GUJanalysis$voteshare <- GUJanalysis$Vote / GUJanalysis$totvot
# Rank candidates within a constituency by Total.Assets, largest first
# (rank 1 = largest); ties are broken by order of appearance.
GUJanalysis$rank <- ave(
  GUJanalysis$Total.Assets, GUJanalysis$const,
  FUN = function(assets) rank(-assets, ties.method = "first")
)
GUJanalysis$Asset.Rank <- GUJanalysis$rank
# Min-max scale a numeric vector to the interval [0, 1].
#
# x: numeric vector.
# Returns a numeric vector of the same length as x: (x - min) / (max - min).
# When every value is identical the range is zero and the plain formula
# would give 0/0 = NaN; in that case a vector of zeros is returned instead.
# If x contains NA the result is all NA, as before.
normalize <- function(x) {
  lo <- min(x)
  span <- max(x) - lo
  if (!is.na(span) && span == 0) {
    return(rep(0, length(x)))
  }
  (x - lo) / span
}
# Min-max scale years studied and total assets within each constituency
# (despite the "mean." prefix these columns hold normalize() output).
GUJanalysis$mean.studied <- ave(GUJanalysis$years.studied, GUJanalysis$const,
                                FUN = normalize)
GUJanalysis$mean.assets <- ave(GUJanalysis$Total.Assets, GUJanalysis$const,
                               FUN = normalize)
# Label the three highest asset ranks per constituency R1-R3; everyone else is NR.
GUJanalysis$Rich <- factor(
  ifelse(GUJanalysis$Asset.Rank <= 3, paste0("R", GUJanalysis$Asset.Rank), "NR")
)
# Win / No label derived from the win column, used to colour the plots.
GUJanalysis$Status <- factor(ifelse(GUJanalysis$win == 1, "Win", "No"))
qplot(mean.assets, Asset.Rank, colour = Status, data = GUJanalysis)
summary(GUJanalysis$voteshare)
Min. 1st Qu. Median Mean 3rd Qu. Max.
0.000000 0.002069 0.004921 0.099615 0.017445 0.903633
# Scatter plots of the scaled features, coloured by asset band / outcome.
qplot(mean.assets, voteshare, colour = Rich, data=GUJanalysis)
qplot(mean.assets, mean.studied, colour = Status, data=GUJanalysis)
# Colour by the Status factor, like the other outcome plots; the numeric win
# column would be drawn with a continuous colour gradient.
qplot(Asset.Rank, mean.studied, colour = Status, data=GUJanalysis)
# Vote share by asset band, followed by a one-way ANOVA on the same grouping.
boxplot(GUJanalysis$voteshare ~GUJanalysis$Rich)
aov.Rich<-aov(GUJanalysis$voteshare ~GUJanalysis$Rich)
summary(aov.Rich)
Df Sum Sq Mean Sq F value Pr(>F)
GUJanalysis$Rich 3 37.02 12.340 772.9 <2e-16 ***
Residuals 1813 28.95 0.016
---
Signif. codes: 0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1
# Tukey HSD pairwise comparisons between the Rich bands, then print them.
tk.1<-TukeyHSD(aov.Rich)
tk.1
Tukey multiple comparisons of means
95% family-wise confidence level
Fit: aov(formula = GUJanalysis$voteshare ~ GUJanalysis$Rich)
$`GUJanalysis$Rich`
diff lwr upr p adj
R1-NR 0.37709899 0.35129063 0.40290734 0.000000
R2-NR 0.34581726 0.32000890 0.37162561 0.000000
R3-NR 0.10405825 0.07818719 0.12992930 0.000000
R2-R1 -0.03128173 -0.06543637 0.00287291 0.086327
R3-R1 -0.27304074 -0.30724279 -0.23883870 0.000000
R3-R2 -0.24175901 -0.27596106 -0.20755697 0.000000
# Vote share by qualification band (Qual), followed by a one-way ANOVA.
boxplot(GUJanalysis$voteshare ~GUJanalysis$Qual)
aov.Qualification<-aov(GUJanalysis$voteshare ~GUJanalysis$Qual)
summary(aov.Qualification)
Df Sum Sq Mean Sq F value Pr(>F)
GUJanalysis$Qual 5 3.32 0.6649 19.22 <2e-16 ***
Residuals 1811 62.64 0.0346
---
Signif. codes: 0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1
# Tukey HSD pairwise comparisons between the Qual bands, then print them.
tk.3<-TukeyHSD(aov.Qualification)
tk.3
Tukey multiple comparisons of means
95% family-wise confidence level
Fit: aov(formula = GUJanalysis$voteshare ~ GUJanalysis$Qual)
$`GUJanalysis$Qual`
diff lwr upr p adj
Illiterate-Graduate -0.121404654 -0.23716139 -0.005647921 0.0334231
No.Schooling-Graduate -0.098855060 -0.15196468 -0.045745442 0.0000018
Not Given-Graduate -0.127568323 -0.24332506 -0.011811590 0.0209710
School.Complete-Graduate -0.029607792 -0.06143924 0.002223655 0.0853726
Secondary-Graduate -0.101968385 -0.13626473 -0.067672036 0.0000000
No.Schooling-Illiterate 0.022549594 -0.09997424 0.145073429 0.9951919
Not Given-Illiterate -0.006163669 -0.16613579 0.153808454 0.9999978
School.Complete-Illiterate 0.091796863 -0.02311484 0.206708567 0.2031735
Secondary-Illiterate 0.019436269 -0.09618249 0.135055032 0.9968656
Not Given-No.Schooling -0.028713263 -0.15123710 0.093810572 0.9853343
School.Complete-No.Schooling 0.069247269 0.01800559 0.120488950 0.0016667
Secondary-No.Schooling -0.003113325 -0.05592155 0.049694899 0.9999814
School.Complete-Not Given 0.097960532 -0.01695117 0.212872236 0.1457994
Secondary-Not Given 0.025599938 -0.09001882 0.141218701 0.9886680
Secondary-School.Complete -0.072360593 -0.10368659 -0.041034598 0.0000000
# Split at constituency level, so all candidates of a constituency land in
# the same partition.
# Column 2 of GUJanalysis is taken by position; the next line expects it to
# be the constituency identifier `const`.
Gujconst <- GUJanalysis[, 2, drop = FALSE]
Gujconst$const <- as.factor(Gujconst$const)
Gujconst$flag <- 1
# One row per distinct constituency.
Gujconst1 <- unique(Gujconst)
# Reproducible assignment of constituencies to train (1) / validation (2),
# drawn with probabilities 0.7 / 0.3.
set.seed(1234)
pd <- sample(2, nrow(Gujconst1), replace = TRUE, prob = c(0.7, 0.3))
trainconst <- Gujconst1[pd == 1, ]
valconst <- Gujconst1[pd == 2, ]
# Inner joins pull the candidate rows belonging to each set of constituencies.
train <- merge(GUJanalysis, trainconst, by = "const", all = FALSE)
val <- merge(GUJanalysis, valconst, by = "const", all = FALSE)
head(train)
head(val)
# Model formulas. lpm.1 and logit.1 share the same specification with win as
# the response; Linear.1 uses the same predictors with voteshare as response.
# lpm.1 is not used in the visible part of this file, and logit.1 is
# reassigned further down before any logistic model is fitted.
lpm.1<-win ~female+ Crime +Crorepati+ Grad+ Serious+ Asset.Rank+ mean.studied+ mean.assets
logit.1<-win ~female+ Crime +Crorepati+ Grad+ Serious+ Asset.Rank+ mean.studied+ mean.assets
Linear.1<-voteshare ~female+ Crime +Crorepati+ Grad+ Serious+ Asset.Rank+ mean.studied+ mean.assets
# Fit the linear model for vote share on the training rows and print it.
OLS1<-lm(Linear.1, train)
summary(OLS1)
Call:
lm(formula = Linear.1, data = train)
Residuals:
Min 1Q Median 3Q Max
-0.47019 -0.03859 -0.01014 0.01349 0.60113
Coefficients:
Estimate Std. Error t value Pr(>|t|)
(Intercept) 0.0287916 0.0107858 2.669 0.00769 **
female 0.0356152 0.0141653 2.514 0.01204 *
Crime 0.0701047 0.0161768 4.334 1.57e-05 ***
Crorepati 0.1363147 0.0124104 10.984 < 2e-16 ***
Grad -0.0213060 0.0116562 -1.828 0.06779 .
Serious -0.0188298 0.0198634 -0.948 0.34331
Asset.Rank -0.0036601 0.0009141 -4.004 6.56e-05 ***
mean.studied 0.0290803 0.0155399 1.871 0.06151 .
mean.assets 0.2292471 0.0164992 13.894 < 2e-16 ***
---
Signif. codes: 0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1
Residual standard error: 0.1309 on 1378 degrees of freedom
Multiple R-squared: 0.526, Adjusted R-squared: 0.5233
F-statistic: 191.2 on 8 and 1378 DF, p-value: < 2.2e-16
# Variance inflation factors for the OLS1 predictors (multicollinearity check).
vif(OLS1)
female Crime Crorepati Grad Serious Asset.Rank mean.studied
1.015365 2.446866 2.177067 2.149750 2.407646 1.541390 2.127204
mean.assets
2.289479
# Same specification as Linear.1 with the Serious term removed, refitted on
# the training rows.
Linear<-voteshare ~female+ Crime +Crorepati+ Grad+ Asset.Rank+ mean.studied+ mean.assets
OLS<-lm(Linear, data=train)
summary (OLS)
Call:
lm(formula = Linear, data = train)
Residuals:
Min 1Q Median 3Q Max
-0.46074 -0.03875 -0.01024 0.01348 0.60131
Coefficients:
Estimate Std. Error t value Pr(>|t|)
(Intercept) 0.029195 0.010777 2.709 0.00683 **
female 0.035704 0.014165 2.521 0.01183 *
Crime 0.058509 0.010586 5.527 3.89e-08 ***
Crorepati 0.136673 0.012404 11.018 < 2e-16 ***
Grad -0.020624 0.011634 -1.773 0.07648 .
Asset.Rank -0.003673 0.000914 -4.019 6.16e-05 ***
mean.studied 0.028235 0.015514 1.820 0.06898 .
mean.assets 0.228480 0.016479 13.865 < 2e-16 ***
---
Signif. codes: 0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1
Residual standard error: 0.1309 on 1379 degrees of freedom
Multiple R-squared: 0.5257, Adjusted R-squared: 0.5233
F-statistic: 218.4 on 7 and 1379 DF, p-value: < 2.2e-16
# Predicted vote share for every validation candidate from the OLS fit.
val$Pred_OLS <- predict(OLS, newdata = val)
# Attach the highest predicted share within each constituency to its rows.
Pred.winner <- ddply(val, .(const), transform, max.share = max(Pred_OLS))
# The predicted winner is the candidate whose prediction equals that maximum.
Pred.winner$winner <- as.numeric(Pred.winner$Pred_OLS == Pred.winner$max.share)
# Rows: predicted winner flag; columns: actual win flag.
tab.LM <- table(Pred.winner$winner, Pred.winner$win)
tab.LM
0 1
0 351 34
1 34 11
# Share of validation candidates on the diagonal of tab.LM (predicted == actual).
accuracy.LM<-sum(diag(tab.LM))/sum(tab.LM)
accuracy.LM
[1] 0.8418605
# Redefine logit.1 with a smaller predictor set and fit a binomial GLM for
# win on the training rows.
logit.1<-win~female+Grad+Serious+mean.studied+mean.assets
Logit.WOSM<-glm(logit.1,data=train, family=binomial)
summary(Logit.WOSM)
Call:
glm(formula = logit.1, family = binomial, data = train)
Deviance Residuals:
Min 1Q Median 3Q Max
-1.3642 -0.3230 -0.2454 -0.2380 2.6655
Coefficients:
Estimate Std. Error z value Pr(>|z|)
(Intercept) -3.591774 0.255183 -14.075 <2e-16 ***
female 0.913674 0.388978 2.349 0.0188 *
Grad 0.007308 0.329816 0.022 0.9823
Serious 0.557695 0.294087 1.896 0.0579 .
mean.studied 0.090100 0.469803 0.192 0.8479
mean.assets 3.365658 0.248473 13.545 <2e-16 ***
---
Signif. codes: 0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1
(Dispersion parameter for binomial family taken to be 1)
Null deviance: 894.29 on 1386 degrees of freedom
Residual deviance: 664.77 on 1381 degrees of freedom
AIC: 676.77
Number of Fisher Scoring iterations: 6
# Variance inflation factors for the predictors of the logit.1 fit.
vif(Logit.WOSM)
female Grad Serious mean.studied mean.assets
1.055292 2.306362 1.020196 2.313092 1.090951
# Reduced logistic model with only Serious and mean.assets.
# Note: this overwrites the Logit.WOSM object fitted from logit.1.
logit.2<-win~Serious+mean.assets
Logit.WOSM<-glm(logit.2,data=train, family=binomial)
summary(Logit.WOSM)
Call:
glm(formula = logit.2, family = binomial, data = train)
Deviance Residuals:
Min 1Q Median 3Q Max
-1.3468 -0.3022 -0.2547 -0.2509 2.6253
Coefficients:
Estimate Std. Error z value Pr(>|z|)
(Intercept) -3.4441 0.1633 -21.095 <2e-16 ***
Serious 0.5279 0.2914 1.812 0.0701 .
mean.assets 3.3060 0.2395 13.803 <2e-16 ***
---
Signif. codes: 0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1
(Dispersion parameter for binomial family taken to be 1)
Null deviance: 894.29 on 1386 degrees of freedom
Residual deviance: 669.79 on 1384 degrees of freedom
AIC: 675.79
Number of Fisher Scoring iterations: 6
# Variance inflation factors for the two predictors of the logit.2 fit.
vif(Logit.WOSM)
Serious mean.assets
1.0159 1.0159
# Predicted probabilities (type="response") on the validation rows from the
# most recently fitted Logit.WOSM.
pred.logit.WOSM <- predict.glm(Logit.WOSM, newdata=val, type="response")
# Confusion matrix of observed win against predictions cut at threshold 0.5
# (confusion.matrix presumably comes from SDMTools - confirm).
tab.logit.WOSM<-confusion.matrix(val$win,pred.logit.WOSM,threshold = 0.5)
tab.logit.WOSM
obs
pred 0 1
0 376 43
1 9 2
attr(,"class")
[1] "confusion.matrix"
# Share of validation candidates on the diagonal of the logit confusion matrix.
accuracy.logit.WOSM<-sum(diag(tab.logit.WOSM))/sum(tab.logit.WOSM)
accuracy.logit.WOSM
[1] 0.8790698