Logistic Regression

Elections Data set

install.packages (“MASS”) install.packages (“caret”) install.packages (“car”) install.packages (“carData”)

elec <- read.csv("D:\\Users\\jayapate\\Downloads\\election_data.csv")
elec <- elec[-1,]
View(elec)
attach(elec)
summary(elec)

##   Election.id        Result         Year        Amount.Spent  
##  Min.   :122.0   Min.   :0.0   Min.   :32.00   Min.   :2.930  
##  1st Qu.:202.2   1st Qu.:0.0   1st Qu.:39.25   1st Qu.:3.618  
##  Median :362.5   Median :1.0   Median :43.00   Median :4.005  
##  Mean   :451.6   Mean   :0.6   Mean   :43.30   Mean   :4.229  
##  3rd Qu.:710.2   3rd Qu.:1.0   3rd Qu.:49.50   3rd Qu.:4.470  
##  Max.   :965.0   Max.   :1.0   Max.   :52.00   Max.   :6.320  
##  Popularity.Rank
##  Min.   :1.00   
##  1st Qu.:2.00   
##  Median :3.00   
##  Mean   :2.70   
##  3rd Qu.:3.75   
##  Max.   :4.00

elec1 <- elec

#Finding the Linear Regression
colnames(elec1)

## [1] "Election.id"     "Result"          "Year"            "Amount.Spent"   
## [5] "Popularity.Rank"

elec1 <- lm(Result~Year+Amount.Spent+Popularity.Rank)
summary(elec1)

## 
## Call:
## lm(formula = Result ~ Year + Amount.Spent + Popularity.Rank)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -0.36265 -0.15265 -0.09902  0.08992  0.55615 
## 
## Coefficients:
##                 Estimate Std. Error t value Pr(>|t|)  
## (Intercept)      0.65329    1.31682   0.496   0.6375  
## Year             0.01021    0.02151   0.475   0.6517  
## Amount.Spent     0.07523    0.12208   0.616   0.5604  
## Popularity.Rank -0.30137    0.13057  -2.308   0.0604 .
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.3784 on 6 degrees of freedom
## Multiple R-squared:  0.642,  Adjusted R-squared:  0.463 
## F-statistic: 3.586 on 3 and 6 DF,  p-value: 0.08576

#Linear Regression cannot be applied. So go for Logistic Regression
elec2 <- glm(Result~Year+Amount.Spent+Popularity.Rank)
summary(elec2)

## 
## Call:
## glm(formula = Result ~ Year + Amount.Spent + Popularity.Rank)
## 
## Deviance Residuals: 
##      Min        1Q    Median        3Q       Max  
## -0.36265  -0.15265  -0.09902   0.08992   0.55615  
## 
## Coefficients:
##                 Estimate Std. Error t value Pr(>|t|)  
## (Intercept)      0.65329    1.31682   0.496   0.6375  
## Year             0.01021    0.02151   0.475   0.6517  
## Amount.Spent     0.07523    0.12208   0.616   0.5604  
## Popularity.Rank -0.30137    0.13057  -2.308   0.0604 .
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for gaussian family taken to be 0.1432053)
## 
##     Null deviance: 2.40000  on 9  degrees of freedom
## Residual deviance: 0.85923  on 6  degrees of freedom
## AIC: 13.836
## 
## Number of Fisher Scoring iterations: 2

library ("car")

## Warning: package 'car' was built under R version 3.5.1

## Loading required package: carData

library ("caret")

## Warning: package 'caret' was built under R version 3.5.1

## Loading required package: lattice

## Loading required package: ggplot2

## Warning: package 'ggplot2' was built under R version 3.5.1

library ("MASS")

## Warning: package 'MASS' was built under R version 3.5.1

x<-stepAIC(elec2)

## Start:  AIC=13.84
## Result ~ Year + Amount.Spent + Popularity.Rank
## 
##                   Df Deviance    AIC
## - Year             1  0.89152 12.205
## - Amount.Spent     1  0.91361 12.449
## <none>                0.85923 13.836
## - Popularity.Rank  1  1.62217 18.191
## 
## Step:  AIC=12.2
## Result ~ Amount.Spent + Popularity.Rank
## 
##                   Df Deviance    AIC
## - Amount.Spent     1  0.94215 10.757
## <none>                0.89152 12.205
## - Popularity.Rank  1  2.18851 19.185
## 
## Step:  AIC=10.76
## Result ~ Popularity.Rank
## 
##                   Df Deviance    AIC
## <none>                0.94215 10.757
## - Popularity.Rank  1  2.40000 18.108

vif(elec2)

##            Year    Amount.Spent Popularity.Rank 
##        1.389879        1.043188        1.440479

coef(elec2)

##     (Intercept)            Year    Amount.Spent Popularity.Rank 
##      0.65329307      0.01021448      0.07522508     -0.30137290

#confusion Matrix Table
prob <- predict(elec2, type=c("response"),elec)
prob <- as.data.frame(prob)
final <- cbind(prob, elec)
confusion <- table(prob > 0.5, elec$Result)
table (prob > 0.5)

## 
## FALSE  TRUE 
##     5     5

#Now as only Popularity Rank is considered, build Logistic Regression on Popularity Rank
elec3 <- glm(Result~Popularity.Rank)
summary(elec3)

## 
## Call:
## glm(formula = Result ~ Popularity.Rank)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -0.4959  -0.1797  -0.1488   0.1570   0.5041  
## 
## Coefficients:
##                 Estimate Std. Error t value Pr(>|t|)    
## (Intercept)      1.53719    0.28763   5.344 0.000691 ***
## Popularity.Rank -0.34711    0.09866  -3.518 0.007865 ** 
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for gaussian family taken to be 0.1177686)
## 
##     Null deviance: 2.40000  on 9  degrees of freedom
## Residual deviance: 0.94215  on 8  degrees of freedom
## AIC: 10.757
## 
## Number of Fisher Scoring iterations: 2

x<-stepAIC(elec3)

## Start:  AIC=10.76
## Result ~ Popularity.Rank
## 
##                   Df Deviance    AIC
## <none>                0.94215 10.757
## - Popularity.Rank  1  2.40000 18.108

vif(elec1)

##            Year    Amount.Spent Popularity.Rank 
##        1.389879        1.043188        1.440479

coef(elec3)

##     (Intercept) Popularity.Rank 
##       1.5371901      -0.3471074

#confusion Matrix Table
prob1 <- predict(elec3, type=c("response"),elec)
prob1 <- as.data.frame(prob1)
final <- cbind(prob1, elec)
confusion <- table(prob1 > 0.5, elec$Result)
confusion

##        
##         0 1
##   FALSE 4 2
##   TRUE  0 4

table (prob1 > 0.5)

## 
## FALSE  TRUE 
##     6     4

accuracy <- sum(diag(confusion)/sum(confusion))
accuracy

## [1] 0.8