Classification Project: Creating a Predictive Model for the 13th Malaysian General Election

The goal of this project is to build a decision tree that classifies each parliamentary constituency according to whether Barisan Nasional (BN) will win the seat or not. The model predicts the outcome by learning simple decision rules from election data obtained from the Department of Econometrics and Business Statistics at Monash University Malaysia.

Data Description

CONSTITUENCY: Parliamentary seat number

CHINESE PERCENT: Percentage of Chinese voters in the parliamentary constituency

INDIAN PERCENT: Percentage of Indian voters in the parliamentary constituency

BUMIPUTERA PERCENT: Percentage of Bumiputera voters in the parliamentary constituency

OTHER PERCENT: Percentage of other ethnic voters in the parliamentary constituency

AREA: Area of constituency; Proxy for the level of urbanization; the smaller the value, the more urbanized the constituency.

BN: A dummy variable; 1 = BN won the seat; 0 = PR won the seat

Below is some basic data exploration, carried out before we begin building our models.

# Report the current working directory (for reference only; it is not changed)
getwd()

## [1] "C:/Users/Thinithi/Desktop/PORTFOLIO"

# Folder that holds the input data. The file is read through an explicit path
# below instead of calling setwd(), so the session's working directory is not
# modified as a side effect of running this script.
data_dir <- "C:/Users/Thinithi/Desktop/DATA"
# Load libraries
library(rpart) # Classification trees
library(RColorBrewer) # Colour palettes for the plots
library(ggplot2) # To plot the graphs
library(MLmetrics) # For the Accuracy and ConfusionMatrix functions

## 
## Attaching package: 'MLmetrics'

## The following object is masked from 'package:base':
## 
##     Recall

library(caret) # For sensitivity and specificity

## Loading required package: lattice

## 
## Attaching package: 'caret'

## The following objects are masked from 'package:MLmetrics':
## 
##     MAE, RMSE

library(rpart.plot) # Tree-plotting package loaded alongside rattle
library(rattle) # Provides fancyRpartPlot() for the classification tree plot

## Rattle: A free graphical interface for data science with R.
## Version 5.2.0 Copyright (c) 2006-2018 Togaware Pty Ltd.
## Type 'rattle()' to shake, rattle, and roll your data.

# Load the election data and preview the first rows
elect <- read.csv(file.path(data_dir, "GE13.csv"))
head(elect)

##   Constituency BN chinesepercent indianpercent bumiputerapercent
## 1          P26  1            1.8           0.1              97.7
## 2          P70  0           60.4           9.6              29.8
## 3         P139  1           18.1          10.2              71.6
## 4         P111  0           24.8          29.5              45.1
## 5          P67  1           23.9           7.0              68.9
## 6          P47  0           37.3          17.6              45.0
##   otherspercent      area     random
## 1           0.4 0.0134112 0.01384259
## 2           0.2 0.0365580 0.02204790
## 3           0.1 0.0410657 0.02384827
## 4           0.6 0.0106435 0.03033885
## 5           0.2 0.0475139 0.03539176
## 6           0.1 0.0141229 0.04382275

Data Exploration

# Histograms of each ethnic group's share per constituency (5-point bins).
# Dashed vertical line = mean, solid vertical line = median.
# The colours are constants, so they are set on the geom instead of inside
# aes(): a literal such as 'red' inside aes() is treated as a data value, which
# adds a meaningless legend and replaces the requested outline colour with the
# default palette. The mean/median lines are likewise passed as single values
# so that one line is drawn rather than one per row of `elect`.
ggplot(elect, aes(x = chinesepercent)) +
  geom_histogram(fill = "red", colour = "red", binwidth = 5, boundary = 0) +
  geom_vline(xintercept = mean(elect$chinesepercent), linetype = "dashed") +
  geom_vline(xintercept = median(elect$chinesepercent)) +
  scale_x_continuous(breaks = seq(0, 100, 5)) +
  ggtitle("Fig:1.1- Percentage of Chinese Votes for the 13th Malaysian General Election") +
  xlab("Percentage of Chinese votes") +
  ylab("Frequency")

ggplot(elect, aes(x = indianpercent)) +
  geom_histogram(fill = "blue", colour = "blue", binwidth = 5, boundary = 0) +
  geom_vline(xintercept = mean(elect$indianpercent), linetype = "dashed") +
  geom_vline(xintercept = median(elect$indianpercent)) +
  scale_x_continuous(breaks = seq(0, 100, 5)) +
  ggtitle("Fig:1.2-Percentage of Indian Votes for the 13th Malaysian General Election") +
  xlab("Percentage of Indian votes") +
  ylab("Frequency")

ggplot(elect, aes(x = bumiputerapercent)) +
  geom_histogram(fill = "purple", colour = "purple", binwidth = 5, boundary = 0) +
  geom_vline(xintercept = mean(elect$bumiputerapercent), linetype = "dashed") +
  geom_vline(xintercept = median(elect$bumiputerapercent)) +
  scale_x_continuous(breaks = seq(0, 100, 5)) +
  ggtitle("Fig:1.3-Percentage of Bumiputera Votes for the 13th Malaysian General Election") +
  xlab("Percentage of Bumiputera votes") +
  ylab("Frequency")

ggplot(elect, aes(x = otherspercent)) +
  geom_histogram(fill = "green", colour = "green", binwidth = 5, boundary = 0) +
  geom_vline(xintercept = mean(elect$otherspercent), linetype = "dashed") +
  geom_vline(xintercept = median(elect$otherspercent)) +
  scale_x_continuous(breaks = seq(0, 100, 5)) +
  ggtitle("Fig:1.4-Percentage of Other Votes for the 13th Malaysian General Election") +
  xlab("Percentage of other votes") +
  ylab("Frequency")

Figure 1.1 shows a positively skewed histogram of the percentage of Chinese votes in the 13th Malaysian General Election. In 82 constituencies, Chinese votes made up 25% or less of the total, while in the remaining constituencies the percentage ranged from 25% to 90%. Each bin above 50% contains 5 or fewer constituencies. Figure 1.2 is also positively skewed, with the maximum percentage of Indian votes lying at around 25% to 30%. The Bumiputera histogram (Figure 1.3), on the other hand, is negatively skewed, with 50% of the constituencies having a Bumiputera voting percentage greater than 65%. In addition, 16 constituencies have a Bumiputera voting percentage of 100%. Figure 1.4 makes it clear that the percentage of votes from other ethnicities is insignificant compared to that of Chinese, Indian and Bumiputera voters.

# Distribution of constituency area (bin width 0.04); dashed line = mean area.
# The colour is a constant, so it is set on the geom rather than mapped in
# aes(), which would add a spurious legend and ignore the requested colour.
ggplot(elect, aes(x = area)) +
  geom_histogram(fill = "orange", colour = "orange", binwidth = 0.04, boundary = 0) +
  geom_vline(xintercept = mean(elect$area), linetype = "dashed") +
  scale_x_continuous(breaks = seq(0, 1.6, 0.04)) +
  ggtitle("Fig:2.1-Histogram of Constituency Areas")

# BN is a 0/1 dummy, so it is plotted as a two-level factor with a discrete
# colour scale. A continuous gradient would draw a colour bar with intermediate
# values that cannot occur. Red = 0 and blue = 1, matching the ends of the
# gradient used previously.
bn_outcome_scale <- scale_colour_manual(
  name = "BN",
  values = c("0" = "red", "1" = "blue"),
  breaks = c("0", "1"),
  labels = c("0 = PR won", "1 = BN won")
)

ggplot(elect, aes(x = area, y = chinesepercent)) +
  geom_point(aes(colour = factor(BN))) +
  bn_outcome_scale +
  ggtitle("Fig:2.2-Percentage of Chinese Votes against the area of constituency") +
  ylab("Chinese votes(%)")

ggplot(elect, aes(x = area, y = indianpercent)) +
  geom_point(aes(colour = factor(BN))) +
  bn_outcome_scale +
  ggtitle("Fig:2.3-Percentage of Indian Votes against the area of constituency") +
  ylab("Indian votes(%)")

ggplot(elect, aes(x = area, y = bumiputerapercent)) +
  geom_point(aes(colour = factor(BN))) +
  bn_outcome_scale +
  ggtitle("Fig:2.4-Percentage of Bumiputera Votes against the area of constituency") +
  ylab("Bumiputera votes(%)")

ggplot(elect, aes(x = area, y = otherspercent)) +
  geom_point(aes(colour = factor(BN))) +
  bn_outcome_scale +
  ggtitle("Fig:2.5-Percentage of Other Votes against the area of constituency") +
  ylab("Other votes(%)")

Figure 2.1 shows that most of the constituencies are in highly urbanized areas. According to the 4 scatterplots, which display the percentage of votes from the various ethnicities against the level of urbanization of the constituency, BN appears to have won nearly all the constituencies that are less urbanized. Furthermore, the percentages of Chinese, Indian and other-ethnicity votes seem to be low in less urbanized regions, while the percentage of Bumiputera votes remains high regardless of whether the area is urbanized or not. Figures 2.2 and 2.4 are close to mirror images of each other, showing that Chinese and Bumiputera voters make up the majority of the votes; therefore, when one share increases the other decreases. When we take a closer look at Figure 2.2, BN appears to have lost all the constituencies with a high Chinese voting percentage. This supports the finding of Ng, Rangel, Vaithilingam and Pillay (2015, Journal of East Asian Studies) that urbanization is a stronger variable in explaining electoral outcomes, with the Chinese ethnicity factor making the impact worse.

# Partition the dataset: 75% of the rows for training, the remainder for testing
set.seed(1) # fixed seed so the split is reproducible
n_obs <- nrow(elect)
# seq_len() is safe for an empty data frame (1:0 would yield c(1, 0)), and
# floor() makes the truncation of the fractional sample size explicit.
inTrain <- sample(seq_len(n_obs), floor(0.75 * n_obs)) # row indices of the training set
train_set <- elect[inTrain, ] # training set
# setdiff() keeps the original row order and, unlike elect[-inTrain, ], still
# returns every row when inTrain is empty.
test_set <- elect[setdiff(seq_len(n_obs), inTrain), ] # test set

Classification Tree

# Train the classification tree: the BN outcome is explained by the four
# ethnic-share variables and the constituency area, using the training set only
fit <- rpart(
  BN ~ chinesepercent + indianpercent + bumiputerapercent + otherspercent + area,
  data = train_set,
  method = "class"
)
# Print the CP table, variable importance and the per-node split details
summary(fit)

## Call:
## rpart(formula = BN ~ chinesepercent + indianpercent + bumiputerapercent + 
##     otherspercent + area, data = train_set, method = "class")
##   n= 123 
## 
##           CP nsplit rel error    xerror       xstd
## 1 0.50000000      0 1.0000000 1.0000000 0.09545312
## 2 0.05172414      1 0.5000000 0.6724138 0.08897979
## 3 0.04310345      2 0.4482759 0.5862069 0.08551729
## 4 0.01000000      4 0.3620690 0.5517241 0.08389095
## 
## Variable importance
## bumiputerapercent    chinesepercent              area     indianpercent 
##                31                28                20                14 
##     otherspercent 
##                 7 
## 
## Node number 1: 123 observations,    complexity param=0.5
##   predicted class=1  expected loss=0.4715447  P(node) =1
##     class counts:    58    65
##    probabilities: 0.472 0.528 
##   left son=2 (43 obs) right son=3 (80 obs)
##   Primary splits:
##       bumiputerapercent < 56.05      to the left,  improve=17.679880, (0 missing)
##       area              < 0.0367542  to the left,  improve=17.052930, (0 missing)
##       chinesepercent    < 47.2       to the right, improve=14.143950, (0 missing)
##       otherspercent     < 0.35       to the right, improve= 7.515981, (0 missing)
##       indianpercent     < 7.2        to the right, improve= 6.699537, (0 missing)
##   Surrogate splits:
##       chinesepercent < 33.95      to the right, agree=0.951, adj=0.860, (0 split)
##       indianpercent  < 7.2        to the right, agree=0.789, adj=0.395, (0 split)
##       area           < 0.00663295 to the left,  agree=0.764, adj=0.326, (0 split)
##       otherspercent  < 0.35       to the right, agree=0.667, adj=0.047, (0 split)
## 
## Node number 2: 43 observations
##   predicted class=0  expected loss=0.1627907  P(node) =0.3495935
##     class counts:    36     7
##    probabilities: 0.837 0.163 
## 
## Node number 3: 80 observations,    complexity param=0.05172414
##   predicted class=1  expected loss=0.275  P(node) =0.6504065
##     class counts:    22    58
##    probabilities: 0.275 0.725 
##   left son=6 (29 obs) right son=7 (51 obs)
##   Primary splits:
##       area              < 0.0354039  to the left,  improve=6.966937, (0 missing)
##       bumiputerapercent < 97         to the right, improve=4.011111, (0 missing)
##       chinesepercent    < 2.25       to the left,  improve=2.414286, (0 missing)
##       indianpercent     < 0.15       to the left,  improve=2.177778, (0 missing)
##       otherspercent     < 0.35       to the right, improve=1.633333, (0 missing)
##   Surrogate splits:
##       otherspercent     < 0.25       to the right, agree=0.738, adj=0.276, (0 split)
##       chinesepercent    < 2          to the left,  agree=0.700, adj=0.172, (0 split)
##       bumiputerapercent < 97.65      to the right, agree=0.700, adj=0.172, (0 split)
##       indianpercent     < 0.15       to the left,  agree=0.688, adj=0.138, (0 split)
## 
## Node number 6: 29 observations,    complexity param=0.04310345
##   predicted class=0  expected loss=0.4482759  P(node) =0.2357724
##     class counts:    16    13
##    probabilities: 0.552 0.448 
##   left son=12 (9 obs) right son=13 (20 obs)
##   Primary splits:
##       indianpercent     < 1.15       to the left,  improve=1.3337160, (0 missing)
##       otherspercent     < 0.25       to the right, improve=0.7027223, (0 missing)
##       chinesepercent    < 4.65       to the left,  improve=0.4876847, (0 missing)
##       bumiputerapercent < 93.05      to the right, improve=0.4876847, (0 missing)
##       area              < 0.0179732  to the right, improve=0.1448276, (0 missing)
##   Surrogate splits:
##       bumiputerapercent < 82.65      to the right, agree=0.931, adj=0.778, (0 split)
##       chinesepercent    < 7.85       to the left,  agree=0.897, adj=0.667, (0 split)
##       otherspercent     < 0.7        to the right, agree=0.724, adj=0.111, (0 split)
##       area              < 0.00397845 to the left,  agree=0.724, adj=0.111, (0 split)
## 
## Node number 7: 51 observations
##   predicted class=1  expected loss=0.1176471  P(node) =0.4146341
##     class counts:     6    45
##    probabilities: 0.118 0.882 
## 
## Node number 12: 9 observations
##   predicted class=0  expected loss=0.2222222  P(node) =0.07317073
##     class counts:     7     2
##    probabilities: 0.778 0.222 
## 
## Node number 13: 20 observations,    complexity param=0.04310345
##   predicted class=1  expected loss=0.45  P(node) =0.1626016
##     class counts:     9    11
##    probabilities: 0.450 0.550 
##   left son=26 (13 obs) right son=27 (7 obs)
##   Primary splits:
##       otherspercent     < 0.25       to the right, improve=2.0318680, (0 missing)
##       chinesepercent    < 17.05      to the right, improve=1.0666670, (0 missing)
##       bumiputerapercent < 75.3       to the left,  improve=1.0666670, (0 missing)
##       indianpercent     < 4.4        to the right, improve=0.9000000, (0 missing)
##       area              < 0.0124586  to the right, improve=0.5813187, (0 missing)
##   Surrogate splits:
##       chinesepercent    < 31.25      to the left,  agree=0.8, adj=0.429, (0 split)
##       indianpercent     < 1.6        to the right, agree=0.7, adj=0.143, (0 split)
##       bumiputerapercent < 65         to the right, agree=0.7, adj=0.143, (0 split)
##       area              < 0.022348   to the left,  agree=0.7, adj=0.143, (0 split)
## 
## Node number 26: 13 observations
##   predicted class=0  expected loss=0.3846154  P(node) =0.1056911
##     class counts:     8     5
##    probabilities: 0.615 0.385 
## 
## Node number 27: 7 observations
##   predicted class=1  expected loss=0.1428571  P(node) =0.05691057
##     class counts:     1     6
##    probabilities: 0.143 0.857

# Draw the fitted tree with fancyRpartPlot()
fancyRpartPlot(
  fit,
  main = "Classification Tree for BN",
  type = 2,
  cex = 0.6
)

The training data set used to build the above classification model consists of data from 123 constituencies, of which BN won 53% and PR won 47%. The percentage displayed on each node signifies the proportion of the training data held at that node. According to the 5 terminal (leaf) nodes at the bottom of the tree, the model predicted that BN would win only 47% of the constituencies.

Model Predictions

# Classify every constituency in the test set with the fitted tree
predictions <- predict(fit, newdata = test_set, type = "class")
# Assemble constituency, predicted outcome and actual outcome side by side.
# The third column keeps the name data.frame() generated for it automatically
# (test_set.BN), because later code refers to it by that name.
my_solution <- data.frame(
  Constituency = test_set$Constituency,
  BN_Wins = predictions,
  test_set.BN = test_set$BN
)
my_solution

##     Constituency BN_Wins test_set.BN
## 4           P111       0           0
## 6            P47       0           0
## 7           P100       0           0
## 12           P54       1           1
## 15          P115       0           0
## 16          P132       0           0
## 20          P119       0           1
## 24           P64       0           0
## 29            P7       1           1
## 41           P14       1           1
## 42          P130       0           0
## 46           P45       0           0
## 49           P95       1           1
## 51          P106       0           0
## 58          P137       0           0
## 61           P57       1           0
## 63           P91       1           1
## 67           P58       1           1
## 69           P87       1           1
## 70           P20       0           0
## 78           P22       0           0
## 80          P103       0           0
## 82          P160       0           1
## 84          P104       0           0
## 88           P38       1           1
## 91            P2       0           1
## 93           P34       1           1
## 98          P150       0           0
## 100          P78       0           0
## 113         P129       1           1
## 119           P6       1           1
## 121          P74       0           0
## 123          P33       1           1
## 126         P109       0           0
## 127         P153       1           1
## 128          P19       0           0
## 132         P165       0           1
## 139          P46       0           0
## 141           P4       1           1
## 143          P41       1           1
## 146          P88       1           0
## 158         P120       0           0

Model Evaluation

# Prediction accuracy: share of test-set constituencies classified correctly
Accuracy(my_solution$BN_Wins, my_solution$test_set.BN)

## [1] 0.8571429

# Confusion matrix: rows are the actual outcome, columns the predicted outcome
CM <- ConfusionMatrix(my_solution$BN_Wins, my_solution$test_set.BN)
CM

##       y_pred
## y_true  0  1
##      0 21  2
##      1  4 15

# caret's table methods for sensitivity()/specificity() expect predictions in
# rows and the reference in columns, which is the transpose of CM. CM must
# therefore not be passed to them directly (doing so yields predictive values
# instead). The factor interface is used here, with "BN wins" (1) stated
# explicitly as the positive class.
predicted <- my_solution$BN_Wins
actual <- factor(my_solution$test_set.BN, levels = levels(predicted))

# Sensitivity: share of seats actually won by BN that the model predicted (15 / 19)
sensitivity(predicted, actual, positive = "1")

## [1] 0.7894737

# Specificity: share of seats actually won by PR that the model predicted (21 / 23)
specificity(predicted, actual, negative = "0")

## [1] 0.9130435