Introduction

What predicts voting outcomes? In this competition, you’ll be using data from Show of Hands, an informal polling platform for use on mobile devices and the web, to see what aspects and characteristics of people’s lives predict how they will be voting for the presidential election.

Show of Hands has been downloaded over 300,000 times across Apple and Android app stores, and users have cast more than 75 million votes. In this problem, we’ll use data from thousands of users and one hundred different questions to see which responses predict voting outcomes.

Initial Setup

setwd("C:\\Users\\apundhir\\Desktop\\Analytics Edge\\Unit Kaggle")
library(caret)
library(caTools)
library(randomForest)
library(rpart)
library(rpart.plot)
library(flexclust)
library(tm)
library(dplyr)
library(rattle)
library(e1071)
library(gbm)

Loading and Exploratory Analysis

polling <- read.csv("train2016.csv")
finaldata <- read.csv("test2016.csv")
questions <- read.csv("Questions.csv", stringsAsFactors = FALSE)
questions <- questions[,1:2]
questions$Question.ID <- as.integer(questions$Question.ID)
questions$Question.ID <- paste("Q", questions$Question.ID, sep = "")
nonyesno <- read.csv("nonyesnoquestions.csv", stringsAsFactors = FALSE)
questionidyesno <- filter(questions, !Question.ID %in% nonyesno$Question.ID)
quesidgeneral <- filter(questions, Question.ID %in% nonyesno$Question.ID)

Preprocessing Data

partypolling <- polling$Party
polling <- polling[, -7]
incomplete <- polling[is.na(polling$YOB),]
complete <- polling[!is.na(polling$YOB),]
modelage <- lm(YOB ~ .-USER_ID, data = complete)
predmissingage <- predict(modelage, newdata = incomplete)
incomplete$YOB <- predmissingage
polling <- rbind(complete, incomplete)
polling <- polling[order(polling$USER_ID),]
polling$Party <- partypolling


incompletefinal <- finaldata[is.na(finaldata$YOB),]
completefinal <- finaldata[!is.na(finaldata$YOB),]
predmissingage <- predict(modelage, newdata = incompletefinal)
incompletefinal$YOB <- predmissingage
finaldata <- rbind(completefinal, incompletefinal)
finaldata <- finaldata[order(finaldata$USER_ID),]

Cleaning Questions and Clustering Questions

#Overall Cluster
questioncorpus <- Corpus(VectorSource(questions$Question.Text))
questioncorpus <- tm_map(questioncorpus, tolower)
questioncorpus <- tm_map(questioncorpus, PlainTextDocument)
questioncorpus <- tm_map(questioncorpus, stripWhitespace)
questioncorpus <- tm_map(questioncorpus, removePunctuation)
questioncorpus <- tm_map(questioncorpus, removeWords, stopwords("english"))
questioncorpus <- tm_map(questioncorpus, stemDocument)
dtm <- DocumentTermMatrix(questioncorpus)
findFreqTerms(dtm, 5)
##  [1] "day"     "ever"    "feel"    "general" "get"     "life"    "like"   
##  [8] "parent"  "person"  "right"   "time"
questionmatrix <- as.data.frame(as.matrix(dtm))
colnames(questionmatrix) = make.names(colnames(questionmatrix))
rownames(questionmatrix) <- 1:nrow(questionmatrix)
distance <- dist(questionmatrix, method = "euclidean")
clusterquestions <- hclust(distance, method = "ward.D") 
plot(clusterquestions)
rect.hclust(clusterquestions, k=2, border="red")

clusterGroups = cutree(clusterquestions, k = 2)
table(clusterGroups)
## clusterGroups
##  1  2 
## 92  9

Cluser 1

# Cluster 1
questions$Question.Text[which(clusterGroups == 1)]
##  [1] "Are you good at math?"                                                                                                                
##  [2] "Do/did you have any siblings?"                                                                                                        
##  [3] "Do you have a go-to creative outlet?"                                                                                                 
##  [4] "Do you pray or meditate on a regular basis?"                                                                                          
##  [5] "Do you exercise 3 or more times per week?"                                                                                            
##  [6] "Does life have a purpose?"                                                                                                            
##  [7] "Did your parents spank you as a form of discipline/punishment?"                                                                       
##  [8] "Are you left-handed?"                                                                                                                 
##  [9] "Do you live alone?"                                                                                                                   
## [10] "Do you keep check-lists of tasks you need to accomplish?"                                                                             
## [11] "Do you watch some amount of TV most days?"                                                                                            
## [12] "Do you think your life will be better five years from now than it is today?"                                                          
## [13] "Have you cried in the past 60 days?"                                                                                                  
## [14] "Do you feel like you are currently overweight?"                                                                                       
## [15] "Are you generally more of an optimist or a pessimist?"                                                                                
## [16] "Which parent wore the pants in your household?"                                                                                       
## [17] "As a kid did you ever build (or help build) a tree-house?"                                                                            
## [18] "Do you rent or own your primary residence?"                                                                                           
## [19] "Does your life feel adventurous?"                                                                                                     
## [20] "Do you have any credit card debt that is more than one month old?"                                                                    
## [21] "Do you eat breakfast every day?"                                                                                                      
## [22] "Are you currently carrying a grudge against anyone in your personal life?"                                                            
## [23] "Do you have more than one pet?"                                                                                                       
## [24] "Do you brush your teeth two or more times every day?"                                                                                 
## [25] "Were you awakened by an alarm clock this morning?"                                                                                    
## [26] "Do you ever treat yourself to retail therapy?"                                                                                        
## [27] "Are you taking any prescription medications?"                                                                                         
## [28] "Do you own any power tools? power saws drills etc"                                                                                    
## [29] "Do you work 50+ hours per week?"                                                                                                      
## [30] "Are you a good/effective liar?"                                                                                                       
## [31] "Do you like your given first name?"                                                                                                   
## [32] "Do you generally like people or do most of them tend to get on your nerves pretty easily?"                                            
## [33] "Do you punctuate text messages?"                                                                                                      
## [34] "Do you feel like you're normal?"                                                                                                      
## [35] "Do you spend more time with friends online or in-person?"                                                                             
## [36] "Do you feel like you have too much personal financial debt?"                                                                          
## [37] "Do you live in a single-parent household?"                                                                                            
## [38] "Do both of your parents have college degrees?"                                                                                        
## [39] "Do you enjoy getting together with your extended family?"                                                                             
## [40] "Lots of people are around! Are you more likely to be right in the middle of things or looking for your own quieter space?"            
## [41] "Are you generally a cautious person or are you comfortable taking risks?"                                                             
## [42] "Are you a feminist?"                                                                                                                  
## [43] "Have you ever been poor (however you personally defined it at the time)?"                                                             
## [44] "Mac or PC?"                                                                                                                           
## [45] "Is your alarm clock intentionally set to be a few minutes fast?"                                                                      
## [46] "As a teenager do/did you have parents who were generally more supportive or demanding?"                                               
## [47] "Are you better looking than your best friend?"                                                                                        
## [48] "Do you have any phobias?"                                                                                                             
## [49] "Are you naturally skeptical?"                                                                                                         
## [50] "Do you meditate or pray on a regular basis?"                                                                                          
## [51] "While driving: music or talk/news radio?"                                                                                             
## [52] "During your average day do you spend more time interacting with people (face-to-face) or technology?"                                 
## [53] "Do you gamble?"                                                                                                                       
## [54] "Do you support a particular charitable cause with a lot of your time and/or money?"                                                   
## [55] "Are you more likely to over-share or under-share?"                                                                                    
## [56] "Do you turn a TV on in the morning while getting ready for your day?"                                                                 
## [57] "Do you drink the unfiltered tap water in your home?"                                                                                  
## [58] "Can money buy happiness?"                                                                                                             
## [59] "Do you live within 20 miles of a major metropolitan area?"                                                                            
## [60] "Has your personality changed much from what you were like as a child?"                                                                
## [61] "Were you an obedient child?"                                                                                                          
## [62] "Does the power of positive thinking actually work?"                                                                                   
## [63] "Do you personally own a gun?"                                                                                                         
## [64] "Are you a morning person or a night person?"                                                                                          
## [65] "Do you have a car payment?"                                                                                                           
## [66] "Have you ever traveled out of the U.S.?"                                                                                              
## [67] "Do you take a daily multi-vitamin?"                                                                                                   
## [68] "Would you rather be happy or right?"                                                                                                  
## [69] "Do you like rules?"                                                                                                                   
## [70] "Do you have a quick temper?"                                                                                                          
## [71] "Have you lived in the same state your whole life?"                                                                                    
## [72] "Are you more of an idealist or a pragmatist?"                                                                                         
## [73] "Have you ever had your life genuinely threatened by intentional violence (or the threat of it)?"                                      
## [74] "Do you feel like you are in over-your-head in any aspect of your life right now?"                                                     
## [75] "Do you wear glasses or contact lenses?"                                                                                               
## [76] "Which do you really enjoy more: giving or receiving?"                                                                                 
## [77] "Are you in the middle of reading a good book right now?"                                                                              
## [78] "Does the weather have a large effect on your mood?"                                                                                   
## [79] "Are you more successful than most of your high-school friends?"                                                                       
## [80] "Do you have (or plan to pursue) a Masters or Doctoral degree?"                                                                        
## [81] "Science or Art?"                                                                                                                      
## [82] "Were your parents married when you were born?"                                                                                        
## [83] "As a kid did you watch Sesame Street on a regular basis?"                                                                             
## [84] "2013: did you drink alcohol?"                                                                                                         
## [85] "2013: did you start a new romantic relationship?"                                                                                     
## [86] "Your significant other takes an extra long look at a very attractive person (of your gender) walking past both of you. Are you upset?"
## [87] "Do you collect anything (as a hobby)?"                                                                                                
## [88] "Do you have more than $20 cash in your wallet or purse right now?"                                                                    
## [89] "Do you currently have a job that pays minimum wage?"                                                                                  
## [90] "Are you currently employed in a full-time job?"                                                                                       
## [91] "Did your parents fight in front of you?"                                                                                              
## [92] "Do you have to personally interact with anyone that you really dislike on a daily basis?"

Cluster 2

# Cluster 2
questions$Question.Text[which(clusterGroups == 2)]
## [1] "Did you ever get a straight-A report card in high school or college?"                                                                                                                                                           
## [2] "Do you find it easier to start and maintain a new good habit or to permanently kick a bad habit?"                                                                                                                               
## [3] "Would you say most of the hardship in your life has been the result of circumstances beyond your own control or has it been mostly the result of your own decisions and actions?"                                               
## [4] "If you had to stop telling *any* lies for 6 months (even the smallest little-white-lie would immediately make you violently ill) would it change your life in any noticeable way?"                                              
## [5] "Do you work (or attend school) on a pretty standard 9-to-5ish daytime schedule or do you have to work unusual hours?"                                                                                                           
## [6] "Did you accomplish anything exciting or inspiring in 2013? (comments from the 2012 poll are linked for inspiration)"                                                                                                            
## [7] "Your generally preferred approach to starting a new task: read up on everything you can before trying it out or dive in with almost no knowledge and learn as you go?"                                                          
## [8] "Changing or losing a job getting married or divorced the death of a close relative moving a major health issue bankruptcy all are life events that can create high stress for people Have you experienced any of these in 2013?"
## [9] "Do/did you get most of your K-12 education in public school or private school?"

Feature Engineering

Splitting Data in Training and Testing Set

# Breaking the data in training and testing sets
set.seed(2712)
split <- sample.split(polling$Party, SplitRatio = 0.7)
trainsample <- subset(polling, split)
testsample <- subset(polling, !split)

Modelling and Accuracy Comparision

Adding Repeated K Fold Cross Validation using package caret

fitControl <- trainControl(method = "repeatedcv",
                           number = 10,
                           repeats = 3)

# Normal Cross Validation Declaration
numFolds <- trainControl( method = "cv", number = 10)

# Complexity Parameter Tuning for CART
cpGrid = expand.grid( .cp = seq(0.01,0.5,0.01))

Logistic Regresstion Based Modelling

# glmmodel <- train(Party ~ .-USER_ID, 
#                   data = trainsample, 
#                   method = "glm",
#                   trControl = fitCOntrol)
# model1pred <- predict(glmmodel, newdata = testsample)
# confusionMatrix(model1pred, testsample$Party)
#Sample code for predicting using glm
glmmodel <- glm(Party ~ .-USER_ID, data = trainsample, family = "binomial")
model1pred <- predict(glmmodel, newdata = testsample, type = "response")
threshold <- 0.5
predtestlabels1 <- as.factor(ifelse(model1pred < threshold, "Democrat", "Republican"))
confusionMatrix(predtestlabels1, testsample$Party)
## Confusion Matrix and Statistics
## 
##             Reference
## Prediction   Democrat Republican
##   Democrat        564        331
##   Republican      321        454
##                                           
##                Accuracy : 0.6096          
##                  95% CI : (0.5857, 0.6331)
##     No Information Rate : 0.5299          
##     P-Value [Acc > NIR] : 3.28e-11        
##                                           
##                   Kappa : 0.2158          
##  Mcnemar's Test P-Value : 0.7245          
##                                           
##             Sensitivity : 0.6373          
##             Specificity : 0.5783          
##          Pos Pred Value : 0.6302          
##          Neg Pred Value : 0.5858          
##              Prevalence : 0.5299          
##          Detection Rate : 0.3377          
##    Detection Prevalence : 0.5359          
##       Balanced Accuracy : 0.6078          
##                                           
##        'Positive' Class : Democrat        
## 
  • Based on running both the logistic models with/ without cross validation. We have identified there is no serious performance improvement with cross validation. Hence the for more efficient execution the normal GLM model is used for prediction.

CART Based Modelling

# Sample run using rpart package without cross validation
# cartmodel <- rpart(Party ~ .-USER_ID, data = trainsample, method = "class")
# cart1pred <- predict(cartmodel, newdata = testsample, type = "class")
# confusionMatrix(cart1pred, testsample$Party)


# Sample run with cross validation

# cpGrid <- expand.grid( .cp = seq(0.01,0.02,0.001))
# cartmodelcvtune <- train(Party ~ .-USER_ID,
#                      data = trainsample,
#                      method = "rpart",
#                      trControl = trainControl(method = "cv"),
#                      tuneGrid = cpGrid)
# cart1predcvtune <- predict(cartmodelcvtune, newdata = testsample)
# confusionMatrix(cart1predcvtune, testsample$Party)

# Tuning rpart model
# Explore minsplit
# Gives best split 10
# audit.rpart <- tune.rpart(Party ~ .-USER_ID, data=trainsample, minsplit=seq(10,100,10))
# plot(audit.rpart, main="Tune rpart on minsplit")

# cp: Gives best CP: 0.015
# audit.rpart <- tune.rpart(Party ~ .-USER_ID, 
#                           data=trainsample, 
#                           cp = c(0.002,0.005,0.01,0.015,0.02,0.03))
# plot(audit.rpart,main="Performance of rpart vs. cp")
# Best CP : 0.015

# maxdepth
# audit.rpart <- tune.rpart(Party ~ .-USER_ID, 
#                           data=trainsample, 
#                           maxdepth = 1:5)
# plot(audit.rpart,main="Performance of rpart vs. cp")
# Best Maxdepth : 4

# Running Tuned model
cartmodeltuned <- rpart(Party ~ .-USER_ID, 
                        data = trainsample, 
                        method = "class", 
                        control = rpart.control(cp = 0.015, 
                                                maxdepth = 4, 
                                                minsplit = 10))
cart1predtuned <- predict(cartmodeltuned, newdata = testsample, type = "class")
confusionMatrix(cart1predtuned, testsample$Party)
## Confusion Matrix and Statistics
## 
##             Reference
## Prediction   Democrat Republican
##   Democrat        679        421
##   Republican      206        364
##                                           
##                Accuracy : 0.6246          
##                  95% CI : (0.6008, 0.6478)
##     No Information Rate : 0.5299          
##     P-Value [Acc > NIR] : 3.789e-15       
##                                           
##                   Kappa : 0.2346          
##  Mcnemar's Test P-Value : < 2.2e-16       
##                                           
##             Sensitivity : 0.7672          
##             Specificity : 0.4637          
##          Pos Pred Value : 0.6173          
##          Neg Pred Value : 0.6386          
##              Prevalence : 0.5299          
##          Detection Rate : 0.4066          
##    Detection Prevalence : 0.6587          
##       Balanced Accuracy : 0.6155          
##                                           
##        'Positive' Class : Democrat        
## 
prp(cartmodeltuned)

  • Based on running both the CART models with/ without cross validation. We have identified there is no serious performance improvement with cross validation. Hence the for more efficient execution the normal CART model using rpart library for prediction.

  • So we can see the following variations of CART model produces the following accuracy improvement:

  • Normal CART without tuning accuracy: 0.6198
  • CART with Cross Validation and Tuned CP accuracy: 0.6246
  • CART model with optimized CP and maxdepth accuracy: 0.6246

Random Forest Based Modelling

# Sample run using randomforest package without cross validation and tuning
# rfmodel <- randomForest(Party ~ .-USER_ID, data = trainsample)
# rf1pred <- predict(rfmodel, newdata = testsample)
# confusionMatrix(rf1pred, testsample$Party)
# To print importance of variables in variable selection
# importance(rfmodel)


# Tuning Random Forest checking various values of ntree
# bestmtry1 <- tuneRF(trainsample[,2:107], trainsample[,108], stepFactor=1.5, improve=1e-5, ntree=100)
# bestmtry2 <- tuneRF(trainsample[,2:107], trainsample[,108], stepFactor=1.5, improve=1e-5, ntree=200)
# bestmtry3 <- tuneRF(trainsample[,2:107], trainsample[,108], stepFactor=1.5, improve=1e-5, ntree=300)
# bestmtry4 <- tuneRF(trainsample[,2:107], trainsample[,108], stepFactor=1.5, improve=1e-5, ntree=400)
bestmtry5 <- tuneRF(trainsample[,2:107], trainsample[,108], stepFactor=1.5, improve=1e-5, ntree=500)
## mtry = 10  OOB error = 38.17% 
## Searching left ...
## mtry = 7     OOB error = 38.64% 
## -0.01209677 1e-05 
## Searching right ...
## mtry = 15    OOB error = 38.12% 
## 0.001344086 1e-05 
## mtry = 22    OOB error = 38.05% 
## 0.002018843 1e-05 
## mtry = 33    OOB error = 38.1% 
## -0.001348618 1e-05

# This shows mtry optimal value for this problem is 10 

# Sample run using randomforest package with tuning without CV
rfmodeltune <- randomForest(Party ~ .-USER_ID,
                            data = trainsample,
                            mtry = 10)
rf1predtune <- predict(rfmodeltune, newdata = testsample)
confusionMatrix(rf1predtune, testsample$Party)
## Confusion Matrix and Statistics
## 
##             Reference
## Prediction   Democrat Republican
##   Democrat        597        354
##   Republican      288        431
##                                          
##                Accuracy : 0.6156         
##                  95% CI : (0.5918, 0.639)
##     No Information Rate : 0.5299         
##     P-Value [Acc > NIR] : 1.052e-12      
##                                          
##                   Kappa : 0.2247         
##  Mcnemar's Test P-Value : 0.01031        
##                                          
##             Sensitivity : 0.6746         
##             Specificity : 0.5490         
##          Pos Pred Value : 0.6278         
##          Neg Pred Value : 0.5994         
##              Prevalence : 0.5299         
##          Detection Rate : 0.3575         
##    Detection Prevalence : 0.5695         
##       Balanced Accuracy : 0.6118         
##                                          
##        'Positive' Class : Democrat       
## 
varImpPlot(rfmodeltune)

# Random forest using CV with caret package
load("rfmodelcv.RData")
# Commented to cut the processing time
# rfmodelcv <- train(Party ~ .-USER_ID,
#                   data = trainsample,
#                   method = "rf",
#                   trControl = numFolds)
# save(rfmodelcv,file = "rfmodelcv.RData")
rf1predcv <- predict(rfmodelcv, newdata = testsample)
confusionMatrix(rf1predcv, testsample$Party)
## Confusion Matrix and Statistics
## 
##             Reference
## Prediction   Democrat Republican
##   Democrat        580        295
##   Republican      305        490
##                                           
##                Accuracy : 0.6407          
##                  95% CI : (0.6172, 0.6638)
##     No Information Rate : 0.5299          
##     P-Value [Acc > NIR] : <2e-16          
##                                           
##                   Kappa : 0.2794          
##  Mcnemar's Test P-Value : 0.7133          
##                                           
##             Sensitivity : 0.6554          
##             Specificity : 0.6242          
##          Pos Pred Value : 0.6629          
##          Neg Pred Value : 0.6164          
##              Prevalence : 0.5299          
##          Detection Rate : 0.3473          
##    Detection Prevalence : 0.5240          
##       Balanced Accuracy : 0.6398          
##                                           
##        'Positive' Class : Democrat        
## 
# Random forest using CV with caret package and tuning
mtryGrid <- expand.grid(.mtry = seq(5,30,2))
load("rfmodelcvtune.RData")
# Commented to cut the processing time
# rfmodelcvtune <- train(Party ~ .-USER_ID,
#                   data = trainsample,
#                   method = "rf",
#                   trControl = numFolds,
#                   tunegrid = mtryGrid)
# save(rfmodelcvtune,file = "rfmodelcvtune.RData")
rf1predcvtune <- predict(rfmodelcvtune, newdata = testsample)
confusionMatrix(rf1predcvtune, testsample$Party)
## Confusion Matrix and Statistics
## 
##             Reference
## Prediction   Democrat Republican
##   Democrat        578        290
##   Republican      307        495
##                                          
##                Accuracy : 0.6425         
##                  95% CI : (0.619, 0.6655)
##     No Information Rate : 0.5299         
##     P-Value [Acc > NIR] : <2e-16         
##                                          
##                   Kappa : 0.2833         
##  Mcnemar's Test P-Value : 0.5126         
##                                          
##             Sensitivity : 0.6531         
##             Specificity : 0.6306         
##          Pos Pred Value : 0.6659         
##          Neg Pred Value : 0.6172         
##              Prevalence : 0.5299         
##          Detection Rate : 0.3461         
##    Detection Prevalence : 0.5198         
##       Balanced Accuracy : 0.6418         
##                                          
##        'Positive' Class : Democrat       
## 
plot(rfmodelcvtune)

  • Base model accuracy without cross validation and tuning: 0.6269
  • Accuracy of random forest with 10 fold cross validation: 0.6407
  • Accuracy of random forest with 10 fold cross validation and tune mtry: 0.6413

Support Vector Machine Model

load("svmtmodel1.RData")
svmmodel1
## 
## Call:
## svm(formula = Party ~ . - USER_ID, data = trainsample, kernel = "linear", 
##     cost = 10, scale = TRUE)
## 
## 
## Parameters:
##    SVM-Type:  C-classification 
##  SVM-Kernel:  linear 
##        cost:  10 
##       gamma:  0.004444444 
## 
## Number of Support Vectors:  2988
# svmmodel1 <- svm(Party ~ .-USER_ID,
#                 data = trainsample,
#                 kernel = "linear", cost = 10, scale = FALSE)
# In the above using scale = FALSE tells SVM to not to scale each feature to have mean 0 and SD # 1 This approach gave a lower accuracy than scalling approach
# svmmodel1 <- svm(Party ~ .-USER_ID,
#                 data = trainsample,
#                 kernel = "linear", cost = 10, scale = TRUE)
svm1pred <- predict(svmmodel1, newdata = testsample)
confusionMatrix(svm1pred, testsample$Party)
## Confusion Matrix and Statistics
## 
##             Reference
## Prediction   Democrat Republican
##   Democrat        571        337
##   Republican      314        448
##                                           
##                Accuracy : 0.6102          
##                  95% CI : (0.5863, 0.6337)
##     No Information Rate : 0.5299          
##     P-Value [Acc > NIR] : 2.351e-11       
##                                           
##                   Kappa : 0.2163          
##  Mcnemar's Test P-Value : 0.3886          
##                                           
##             Sensitivity : 0.6452          
##             Specificity : 0.5707          
##          Pos Pred Value : 0.6289          
##          Neg Pred Value : 0.5879          
##              Prevalence : 0.5299          
##          Detection Rate : 0.3419          
##    Detection Prevalence : 0.5437          
##       Balanced Accuracy : 0.6079          
##                                           
##        'Positive' Class : Democrat        
## 
# save(svmmodel1,file = "svmtmodel1.RData")

load("svmtuneparam.RData")

# svmtuneparam <- tune(svm,
#                      Party ~ .-USER_ID, 
#                      data = trainsample,
#                      kernel = "linear",
#                      ranges = list(cost = c(0.001,
#                                             0.01,
#                                             0.1,
#                                             1,
#                                             5,
#                                             10,
#                                             100)))
summary(svmtuneparam)
## 
## Parameter tuning of 'svm':
## 
## - sampling method: 10-fold cross validation 
## 
## - best parameters:
##  cost
##  0.01
## 
## - best performance: 0.3876521 
## 
## - Detailed performance results:
##    cost     error dispersion
## 1 1e-03 0.4045620 0.02723199
## 2 1e-02 0.3876521 0.02608725
## 3 1e-01 0.3876534 0.02874150
## 4 1e+00 0.4004726 0.02354187
## 5 5e+00 0.3984187 0.02420171
## 6 1e+01 0.3994450 0.02581215
## 7 1e+02 0.4030380 0.02624589
save(svmtuneparam,file = "svmtuneparam.RData")
bestsvmtunemodel<- svmtuneparam$best.model
summary(bestsvmtunemodel)
## 
## Call:
## best.tune(method = svm, train.x = Party ~ . - USER_ID, data = trainsample, 
##     ranges = list(cost = c(0.001, 0.01, 0.1, 1, 5, 10, 100)), 
##     kernel = "linear")
## 
## 
## Parameters:
##    SVM-Type:  C-classification 
##  SVM-Kernel:  linear 
##        cost:  0.01 
##       gamma:  0.004444444 
## 
## Number of Support Vectors:  3227
## 
##  ( 1623 1604 )
## 
## 
## Number of Classes:  2 
## 
## Levels: 
##  Democrat Republican
# save(bestsvmtunemodel,file = "bestsvmtunemodel.RData")
 
svm1tunecvpred <- predict(bestsvmtunemodel, newdata = testsample)
confusionMatrix(svm1tunecvpred, testsample$Party)
## Confusion Matrix and Statistics
## 
##             Reference
## Prediction   Democrat Republican
##   Democrat        642        377
##   Republican      243        408
##                                          
##                Accuracy : 0.6287         
##                  95% CI : (0.6051, 0.652)
##     No Information Rate : 0.5299         
##     P-Value [Acc > NIR] : 2.261e-16      
##                                          
##                   Kappa : 0.2476         
##  Mcnemar's Test P-Value : 9.223e-08      
##                                          
##             Sensitivity : 0.7254         
##             Specificity : 0.5197         
##          Pos Pred Value : 0.6300         
##          Neg Pred Value : 0.6267         
##              Prevalence : 0.5299         
##          Detection Rate : 0.3844         
##    Detection Prevalence : 0.6102         
##       Balanced Accuracy : 0.6226         
##                                          
##        'Positive' Class : Democrat       
## 
# Now we will attempt to use "radial" kernel
# The model provides a very weak accuracy almost random
# svmmodelradial <- svm(Party ~ .-USER_ID,
#                 data = trainsample,
#                 kernel = "radial", gamma = 1,cost = 1)
# svmradialpred <- predict(svmmodelradial, newdata = testsample)
# confusionMatrix(svmradialpred, testsample$Party)
  • Base model accuracy without tuning C = 10: 0.6102
  • Base model accuracy without tuning C = 0.01: 0.6287

Gradient Boosting Machine

load("gbmmodel.RData")
# gbmmodel <- gbm(Party ~ .-USER_ID,
#                 data = trainsample,
#                 distribution = "gaussian",
#                 n.tree = 5000,
#                 interaction.depth = 4)
# save(gbmmodel,file = "gbmmodel.RData")

head(summary(gbmmodel))

##                             var   rel.inf
## Q109244                 Q109244 23.686961
## Q115611                 Q115611  9.778289
## HouseholdStatus HouseholdStatus  5.009832
## Income                   Income  4.635784
## EducationLevel   EducationLevel  4.235445
## Q98197                   Q98197  3.666536
gbm1pred <- predict(gbmmodel, newdata = testsample, n.trees = 5000)
gbm1pred <- as.factor(ifelse(gbm1pred < 1.5, "Democrat", "Republican"))
confusionMatrix(gbm1pred, testsample$Party)
## Confusion Matrix and Statistics
## 
##             Reference
## Prediction   Democrat Republican
##   Democrat        598        319
##   Republican      287        466
##                                           
##                Accuracy : 0.6371          
##                  95% CI : (0.6135, 0.6602)
##     No Information Rate : 0.5299          
##     P-Value [Acc > NIR] : <2e-16          
##                                           
##                   Kappa : 0.27            
##  Mcnemar's Test P-Value : 0.2079          
##                                           
##             Sensitivity : 0.6757          
##             Specificity : 0.5936          
##          Pos Pred Value : 0.6521          
##          Neg Pred Value : 0.6189          
##              Prevalence : 0.5299          
##          Detection Rate : 0.3581          
##    Detection Prevalence : 0.5491          
##       Balanced Accuracy : 0.6347          
##                                           
##        'Positive' Class : Democrat        
## 
plot(gbmmodel)

load("gbmmodelcv.RData")
# gbmmodelcv <- train(Party ~ .-USER_ID, data = trainsample,
#                  method = "gbm",
#                  trControl = numFolds,
#                  ## This last option is actually one
#                  ## for gbm() that passes through
#                  verbose = FALSE)
# save(gbmmodelcv,file = "gbmmodelcv.RData")
head(summary(gbmmodelcv))

##                   var   rel.inf
## Q109244Yes Q109244Yes 16.733640
## Q115611Yes Q115611Yes  7.401716
## YOB               YOB  5.024044
## Q98197No     Q98197No  3.604363
## Q109244No   Q109244No  3.313708
## Q113181Yes Q113181Yes  3.009477
gbm1predcv <- predict(gbmmodelcv, newdata = testsample)
## Loading required package: plyr
## -------------------------------------------------------------------------
## You have loaded plyr after dplyr - this is likely to cause problems.
## If you need functions from both plyr and dplyr, please load plyr first, then dplyr:
## library(plyr); library(dplyr)
## -------------------------------------------------------------------------
## 
## Attaching package: 'plyr'
## 
## The following objects are masked from 'package:dplyr':
## 
##     arrange, count, desc, failwith, id, mutate, rename, summarise,
##     summarize
## 
## The following object is masked from 'package:modeltools':
## 
##     empty
confusionMatrix(gbm1predcv, testsample$Party)
## Confusion Matrix and Statistics
## 
##             Reference
## Prediction   Democrat Republican
##   Democrat        585        330
##   Republican      300        455
##                                          
##                Accuracy : 0.6228         
##                  95% CI : (0.599, 0.6461)
##     No Information Rate : 0.5299         
##     P-Value [Acc > NIR] : 1.221e-14      
##                                          
##                   Kappa : 0.2412         
##  Mcnemar's Test P-Value : 0.2479         
##                                          
##             Sensitivity : 0.6610         
##             Specificity : 0.5796         
##          Pos Pred Value : 0.6393         
##          Neg Pred Value : 0.6026         
##              Prevalence : 0.5299         
##          Detection Rate : 0.3503         
##    Detection Prevalence : 0.5479         
##       Balanced Accuracy : 0.6203         
##                                          
##        'Positive' Class : Democrat       
## 
gbmGrid <-  expand.grid(interaction.depth = c(1, 5, 9),
                        n.trees = c(4000, 5000, 6000),
                        shrinkage = 0.1,
                        n.minobsinnode = 20)

# load("gbmmodelcvtune.RData")

# gbmmodelcvtune <- train(Party ~ .-USER_ID, data = trainsample,
#                  method = "gbm",
#                  trControl = numFolds,
#                  ## This last option is actually one
#                  ## for gbm() that passes through
#                  verbose = FALSE,
#                  tuneGrid = gbmGrid)

# save(gbmmodelcvtune, "gbmmodelcvtune.RData")

# gbmmodel$bestTune
# gbm1predcvtune <- predict(gbmmodelcvtune, newdata = testsample)
# confusionMatrix(gbm1predcvtune, testsample$Party)
  • Base GBM model with n.trees = 5000 has accuracy: 0.6371
  • GBM model with cross validation has accuracy: 0.6228
  • GBM model with cross validation and tuning:

Stacking Various Models

# Best Logistic Regresstion
final1 <- predict(glmmodel, newdata = finaldata, type = "response")
final1 <- as.factor(ifelse(final1 < 0.5, "Democrat", "Republican"))
# Best CART Model
final2 <- predict(cartmodeltuned, newdata = finaldata, type = "class")
# Best Random Forest
final3 <- predict(rfmodelcvtune, newdata = finaldata)
# Best SVM 
final4 <- predict(bestsvmtunemodel, newdata = finaldata)
# Best GBM
final5 <- predict(gbmmodel, newdata = finaldata, n.trees = 5000)
final5 <- as.factor(ifelse(final5 < 1.5, "Democrat", "Republican"))



# Merging the files in a consolidated dataframe

finalmerg <- data.frame(final1, final2, final3, final4, final5)
finalmerg$final1 <- as.integer(finalmerg$final1)
finalmerg$final2 <- as.integer(finalmerg$final2)
finalmerg$final3 <- as.integer(finalmerg$final3)
finalmerg$final4 <- as.integer(finalmerg$final4)
finalmerg$final5 <- as.integer(finalmerg$final5)

finalprediction <- (as.matrix(finalmerg))
finalprediction <- rowSums(finalprediction)

finalprediction <- as.factor(ifelse(finalprediction <= 7, "Democrat", "Republican"))

MySubmission <- data.frame(USER_ID = finaldata$USER_ID, Predictions = finalprediction)
write.csv(MySubmission, "Submission3333.csv", row.names=FALSE)