Voter Dataset from the Democracy Fund Voter Study Group
“The Democracy Fund Voter Study Group is using a unique longitudinal data set that most recently surveyed 8,000 adults (age 18+) in December 2016 via YouGov. Participants were identified from a pool of respondents who participated in a similar survey in December 2011, as well as a second pre-election interview in 2012, and a third interview following the 2012 presidential election. For these 8,000 respondents, we have measures of their political attitudes, values, and affinities in 2011 as well as self-reports of their turnout and vote choice in November 2012.”
library(readr)
library(dplyr)
library(gmodels)
library(class)
library(scales)
voterdata<-read_csv("/Users/Brett/Library/Mobile Documents/com~apple~CloudDocs/All Files/Education/QC Teaching/SOC765_SocialMediaMarketingAnalytics/Assignment 3 Audience Investigation Excel/Data/VOTER_Survey_December16_Release1 copy.csv")
voterdata<- voterdata%>%
mutate(immigrants = as.numeric(ifelse(ft_immig_2016=="0 - Unfavorable feeling",0,
ifelse(ft_immig_2016=="100 - Favorable feeling",100,
ifelse(ft_immig_2016=="25 -Unfavorable feeling",25,
ifelse(ft_immig_2016=="50 - No feeling at all",50,
ifelse(ft_immig_2016=="75 - Favorable feeling",75,
ifelse(ft_immig_2016=="Don't know",NA,ft_immig_2016))))))),
blacklivesmatter = as.numeric(ifelse(ft_blm_2016=="0 - Unfavorable feeling",0,
ifelse(ft_blm_2016=="100 - Favorable feeling",100,
ifelse(ft_blm_2016=="25 -Unfavorable feeling",25,
ifelse(ft_blm_2016=="50 - No feeling at all",50,
ifelse(ft_blm_2016=="75 - Favorable feeling",75,
ifelse(ft_blm_2016=="Don't know",NA,ft_blm_2016))))))),
police = as.numeric(ifelse(ft_police_2016=="0 - Unfavorable feeling",0,
ifelse(ft_police_2016=="100 - Favorable feeling",100,
ifelse(ft_police_2016=="25 -Unfavorable feeling",25,
ifelse(ft_police_2016=="50 - No feeling at all",50,
ifelse(ft_police_2016=="75 - Favorable feeling",75,
ifelse(ft_police_2016=="Don't know",NA,ft_police_2016))))))),
muslims = as.numeric(ifelse(ft_muslim_2016=="0 - Unfavorable feeling",0,
ifelse(ft_muslim_2016=="100 - Favorable feeling",100,
ifelse(ft_muslim_2016=="25 -Unfavorable feeling",25,
ifelse(ft_muslim_2016=="50 - No feeling at all",50,
ifelse(ft_muslim_2016=="75 - Favorable feeling",75,
ifelse(ft_muslim_2016=="Don't know",NA,ft_muslim_2016))))))),
gays = as.numeric(ifelse(ft_gays_2016=="0 - Unfavorable feeling",0,
ifelse(ft_gays_2016=="100 - Favorable feeling",100,
ifelse(ft_gays_2016=="25 -Unfavorable feeling",25,
ifelse(ft_gays_2016=="50 - No feeling at all",50,
ifelse(ft_gays_2016=="75 - Favorable feeling",75,
ifelse(ft_gays_2016=="Don't know",NA,ft_gays_2016))))))),
altright = as.numeric(ifelse(ft_altright_2016=="0 - Unfavorable feeling",0,
ifelse(ft_altright_2016=="100 - Favorable feeling",100,
ifelse(ft_altright_2016=="25 -Unfavorable feeling",25,
ifelse(ft_altright_2016=="50 - No feeling at all",50,
ifelse(ft_altright_2016=="75 - Favorable feeling",75,
ifelse(ft_altright_2016=="Don't know",NA,ft_altright_2016))))))),
feminists = as.numeric(ifelse(ft_fem_2016=="0 - Unfavorable feeling",0,
ifelse(ft_fem_2016=="100 - Favorable feeling",100,
ifelse(ft_fem_2016=="25 -Unfavorable feeling",25,
ifelse(ft_fem_2016=="50 - No feeling at all",50,
ifelse(ft_fem_2016=="75 - Favorable feeling",75,
ifelse(ft_fem_2016=="Don't know",NA,ft_fem_2016))))))),
christians = as.numeric(ifelse(ft_christ_2016=="0 - Unfavorable feeling",0,
ifelse(ft_christ_2016=="100 - Favorable feeling",100,
ifelse(ft_christ_2016=="25 -Unfavorable feeling",25,
ifelse(ft_christ_2016=="50 - No feeling at all",50,
ifelse(ft_christ_2016=="75 - Favorable feeling",75,
ifelse(ft_christ_2016=="Don't know",NA,ft_christ_2016))))))),
jews = as.numeric(ifelse(ft_jew_2016=="0 - Unfavorable feeling",0,
ifelse(ft_jew_2016=="100 - Favorable feeling",100,
ifelse(ft_jew_2016=="25 -Unfavorable feeling",25,
ifelse(ft_jew_2016=="50 - No feeling at all",50,
ifelse(ft_jew_2016=="75 - Favorable feeling",75,
ifelse(ft_jew_2016=="Don't know",NA,ft_jew_2016))))))),
hillary = ifelse(presvote16post_2016=="Hillary Clinton",1,0),
trump = ifelse(presvote16post_2016=="Donald Trump",1,0),
votefor = factor(ifelse(hillary==1,"hillary",ifelse(trump==1,"trump","other"))))
voterdata2 <- voterdata%>%
select(votefor,immigrants,blacklivesmatter,police,muslims,gays,altright,
feminists, christians)%>%
filter(!is.na(blacklivesmatter),
!is.na(immigrants),
!is.na(muslims),
!is.na(police),
!is.na(gays),
!is.na(altright),
!is.na(feminists),
!is.na(christians),
votefor%in%c("hillary","trump"))
#Wrangling data into training & testing dataframes for a KNN algorithm
#install.packages("class")
library(class)
#Write a normalization function
normalize <- function(x) {
num <- x - min(x)
denom <- max(x) - min(x)
return (num/denom)
}
#Normalize
voterdata3 <- as.data.frame(lapply(voterdata2[2:9], normalize))
voterdata2 <-cbind(voterdata2[1],voterdata3)
unique(voterdata2$votefor)
[1] hillary trump
Levels: hillary other trump
#Create two randomly assigned groups
ind <- sample(2, nrow(voterdata2), replace=TRUE, prob=c(0.67, 0.33))
#Training
voterdata.training <- voterdata2[ind==1, 2:9]
voterdata.trainLabels <- voterdata2[ind==1,1]
#Test
voterdata.test <- voterdata2[ind==2, 2:9]
voterdata.testLabels <- voterdata2[ind==2, 1]
Comparing Predicted voting outcomes to observed voting outcomes
#Build knn prediction model
voterdata_pred <- knn(train = voterdata.training,
test = voterdata.test,
cl= voterdata.trainLabels,
k=3)
# Merge `Predicted` and `Observed`
merged <- data.frame(voterdata_pred, voterdata.testLabels)
names(merged) <- c("Predicted", "Observed")
print(merged)
merged%>%
mutate(Prediction=ifelse(Predicted==Observed,"Accurate","Not Accurate"))%>%
group_by(Prediction)%>%
summarize(n=n())%>%
mutate(freq=percent(n/sum(n)))
The model is able to predict hillary voters with slightly better accuracy than trump voters.
CrossTable(x = voterdata_pred, y = voterdata.testLabels, prop.chisq=FALSE)
Cell Contents
|-------------------------|
| N |
| N / Row Total |
| N / Col Total |
| N / Table Total |
|-------------------------|
Total Observations in Table: 1676
| voterdata.testLabels
voterdata_pred | hillary | trump | Row Total |
---------------|-----------|-----------|-----------|
hillary | 762 | 100 | 862 |
| 0.884 | 0.116 | 0.514 |
| 0.902 | 0.120 | |
| 0.455 | 0.060 | |
---------------|-----------|-----------|-----------|
trump | 83 | 731 | 814 |
| 0.102 | 0.898 | 0.486 |
| 0.098 | 0.880 | |
| 0.050 | 0.436 | |
---------------|-----------|-----------|-----------|
Column Total | 845 | 831 | 1676 |
| 0.504 | 0.496 | |
---------------|-----------|-----------|-----------|
A quick 3-dimensional plot to see what clusters we can spot when looking at feelings towards Black Lives Matter, Immigrants, and Feminists.
We see alot of stratification here.
voterdata3 <- voterdata%>%
select(votefor,immigrants,blacklivesmatter,feminists)%>%
filter(!is.na(blacklivesmatter),
!is.na(immigrants),
!is.na(feminists),
votefor%in%c("hillary","trump"))
#install.packages("plot3D")
library(plot3D)
scatter3D(voterdata3$blacklivesmatter, voterdata3$immigrants, voterdata3$feminists, phi = 30, bty ="g",main = "Voter Data", xlab = "Black Lives Matter",ylab ="immigrants", zlab = "feminists")
Democracy Fund Voter Study Group. VIEWS OF THE ELECTORATE RESEARCH SURVEY, December 2016. [Computer File] Release 1: August 28, 2017. Washington DC: Democracy Fund Voter Study Group [producer] https://www.voterstudygroup.org/.
“k-nearest neighbour classification for test set from training set. For each row of the test set, the k nearest (in Euclidean distance) training set vectors are found, and the classification is decided by majority vote, with ties broken at random. If there are ties for the kth nearest vector, all candidates are included in the vote.” source