Using a K-Nearest Neighbor Prediction Model


Data:

Voter Dataset from the Democracy Fund Voter Study Group

“The Democracy Fund Voter Study Group is using a unique longitudinal data set that most recently surveyed 8,000 adults (age 18+) in December 2016 via YouGov. Participants were identified from a pool of respondents who participated in a similar survey in December 2011, as well as a second pre-election interview in 2012, and a third interview following the 2012 presidential election. For these 8,000 respondents, we have measures of their political attitudes, values, and affinities in 2011 as well as self-reports of their turnout and vote choice in November 2012.”

library(readr)
library(dplyr)
library(gmodels)
library(class)
library(scales)

voterdata<-read_csv("/Users/Brett/Library/Mobile Documents/com~apple~CloudDocs/All Files/Education/QC Teaching/SOC765_SocialMediaMarketingAnalytics/Assignment 3 Audience Investigation Excel/Data/VOTER_Survey_December16_Release1 copy.csv")

voterdata<- voterdata%>%
  mutate(immigrants = as.numeric(ifelse(ft_immig_2016=="0 - Unfavorable feeling",0,
                      ifelse(ft_immig_2016=="100 - Favorable feeling",100,
                      ifelse(ft_immig_2016=="25 -Unfavorable feeling",25,
                      ifelse(ft_immig_2016=="50 - No feeling at all",50,
                      ifelse(ft_immig_2016=="75 - Favorable feeling",75,
                      ifelse(ft_immig_2016=="Don't know",NA,ft_immig_2016))))))),
         blacklivesmatter = as.numeric(ifelse(ft_blm_2016=="0 - Unfavorable feeling",0,
                            ifelse(ft_blm_2016=="100 - Favorable feeling",100,
                            ifelse(ft_blm_2016=="25 -Unfavorable feeling",25,
                            ifelse(ft_blm_2016=="50 - No feeling at all",50,
                            ifelse(ft_blm_2016=="75 - Favorable feeling",75,
                            ifelse(ft_blm_2016=="Don't know",NA,ft_blm_2016))))))),
          police = as.numeric(ifelse(ft_police_2016=="0 - Unfavorable feeling",0,
                            ifelse(ft_police_2016=="100 - Favorable feeling",100,
                            ifelse(ft_police_2016=="25 -Unfavorable feeling",25,
                            ifelse(ft_police_2016=="50 - No feeling at all",50,
                            ifelse(ft_police_2016=="75 - Favorable feeling",75,
                            ifelse(ft_police_2016=="Don't know",NA,ft_police_2016))))))),
         muslims = as.numeric(ifelse(ft_muslim_2016=="0 - Unfavorable feeling",0,
                            ifelse(ft_muslim_2016=="100 - Favorable feeling",100,
                            ifelse(ft_muslim_2016=="25 -Unfavorable feeling",25,
                            ifelse(ft_muslim_2016=="50 - No feeling at all",50,
                            ifelse(ft_muslim_2016=="75 - Favorable feeling",75,
                            ifelse(ft_muslim_2016=="Don't know",NA,ft_muslim_2016))))))),
         gays = as.numeric(ifelse(ft_gays_2016=="0 - Unfavorable feeling",0,
                            ifelse(ft_gays_2016=="100 - Favorable feeling",100,
                            ifelse(ft_gays_2016=="25 -Unfavorable feeling",25,
                            ifelse(ft_gays_2016=="50 - No feeling at all",50,
                            ifelse(ft_gays_2016=="75 - Favorable feeling",75,
                            ifelse(ft_gays_2016=="Don't know",NA,ft_gays_2016))))))),
         altright = as.numeric(ifelse(ft_altright_2016=="0 - Unfavorable feeling",0,
                            ifelse(ft_altright_2016=="100 - Favorable feeling",100,
                            ifelse(ft_altright_2016=="25 -Unfavorable feeling",25,
                            ifelse(ft_altright_2016=="50 - No feeling at all",50,
                            ifelse(ft_altright_2016=="75 - Favorable feeling",75,
                            ifelse(ft_altright_2016=="Don't know",NA,ft_altright_2016))))))),
         feminists = as.numeric(ifelse(ft_fem_2016=="0 - Unfavorable feeling",0,
                            ifelse(ft_fem_2016=="100 - Favorable feeling",100,
                            ifelse(ft_fem_2016=="25 -Unfavorable feeling",25,
                            ifelse(ft_fem_2016=="50 - No feeling at all",50,
                            ifelse(ft_fem_2016=="75 - Favorable feeling",75,
                            ifelse(ft_fem_2016=="Don't know",NA,ft_fem_2016))))))),
         christians = as.numeric(ifelse(ft_christ_2016=="0 - Unfavorable feeling",0,
                            ifelse(ft_christ_2016=="100 - Favorable feeling",100,
                            ifelse(ft_christ_2016=="25 -Unfavorable feeling",25,
                            ifelse(ft_christ_2016=="50 - No feeling at all",50,
                            ifelse(ft_christ_2016=="75 - Favorable feeling",75,
                            ifelse(ft_christ_2016=="Don't know",NA,ft_christ_2016))))))),
         jews = as.numeric(ifelse(ft_jew_2016=="0 - Unfavorable feeling",0,
                            ifelse(ft_jew_2016=="100 - Favorable feeling",100,
                            ifelse(ft_jew_2016=="25 -Unfavorable feeling",25,
                            ifelse(ft_jew_2016=="50 - No feeling at all",50,
                            ifelse(ft_jew_2016=="75 - Favorable feeling",75,
                            ifelse(ft_jew_2016=="Don't know",NA,ft_jew_2016))))))),
         hillary = ifelse(presvote16post_2016=="Hillary Clinton",1,0),
         trump = ifelse(presvote16post_2016=="Donald Trump",1,0),
         votefor = factor(ifelse(hillary==1,"hillary",ifelse(trump==1,"trump","other"))))

voterdata2 <- voterdata%>%
  select(votefor,immigrants,blacklivesmatter,police,muslims,gays,altright, 
         feminists, christians)%>%
  filter(!is.na(blacklivesmatter),
         !is.na(immigrants),
         !is.na(muslims),
         !is.na(police),
         !is.na(gays),
         !is.na(altright),
         !is.na(feminists),
         !is.na(christians),
         votefor%in%c("hillary","trump"))

How corelated are self reported 0-100 ratings of favorability of various groups/issues.

x<-cor(voterdata2%>%select(-votefor))
library(corrplot)
corrplot(x, type = "upper", order = "hclust", 
         tl.col = "black", tl.srt = 45)

Can we predice who someone will vote for based on their self-reported favorability of these groups/issues?

#Wrangling data into training & testing dataframes for a KNN algorithm
#install.packages("class")
library(class)
#Write a normalization function
normalize <- function(x) {
num <- x - min(x)
denom <- max(x) - min(x)
return (num/denom)
}
#Normalize 
voterdata3 <- as.data.frame(lapply(voterdata2[2:9], normalize))
voterdata2 <-cbind(voterdata2[1],voterdata3)
unique(voterdata2$votefor)
[1] hillary trump  
Levels: hillary other trump
#Create two randomly assigned groups
ind <- sample(2, nrow(voterdata2), replace=TRUE, prob=c(0.67, 0.33))
#Training
  voterdata.training <- voterdata2[ind==1, 2:9]
  voterdata.trainLabels <- voterdata2[ind==1,1]
  
#Test         
  voterdata.test <- voterdata2[ind==2, 2:9]         
  voterdata.testLabels <- voterdata2[ind==2, 1]

KNN Prediction Model

Comparing Predicted voting outcomes to observed voting outcomes

#Build knn prediction model  
voterdata_pred <- knn(train = voterdata.training, 
                      test = voterdata.test, 
                      cl= voterdata.trainLabels, 
                      k=3)
# Merge `Predicted` and `Observed` 
merged <- data.frame(voterdata_pred, voterdata.testLabels)
names(merged) <- c("Predicted", "Observed")
print(merged)

How accurate is the model?

merged%>%
  mutate(Prediction=ifelse(Predicted==Observed,"Accurate","Not Accurate"))%>%
  group_by(Prediction)%>%
  summarize(n=n())%>%
  mutate(freq=percent(n/sum(n)))

Does accuracy vary by candidate?

The model is able to predict hillary voters with slightly better accuracy than trump voters.

CrossTable(x = voterdata_pred, y = voterdata.testLabels, prop.chisq=FALSE)

 
   Cell Contents
|-------------------------|
|                       N |
|           N / Row Total |
|           N / Col Total |
|         N / Table Total |
|-------------------------|

 
Total Observations in Table:  1676 

 
               | voterdata.testLabels 
voterdata_pred |   hillary |     trump | Row Total | 
---------------|-----------|-----------|-----------|
       hillary |       762 |       100 |       862 | 
               |     0.884 |     0.116 |     0.514 | 
               |     0.902 |     0.120 |           | 
               |     0.455 |     0.060 |           | 
---------------|-----------|-----------|-----------|
         trump |        83 |       731 |       814 | 
               |     0.102 |     0.898 |     0.486 | 
               |     0.098 |     0.880 |           | 
               |     0.050 |     0.436 |           | 
---------------|-----------|-----------|-----------|
  Column Total |       845 |       831 |      1676 | 
               |     0.504 |     0.496 |           | 
---------------|-----------|-----------|-----------|

 

3 Dimensional Plot

A quick 3-dimensional plot to see what clusters we can spot when looking at feelings towards Black Lives Matter, Immigrants, and Feminists.

We see alot of stratification here.

voterdata3 <- voterdata%>%
  select(votefor,immigrants,blacklivesmatter,feminists)%>%
  filter(!is.na(blacklivesmatter),
         !is.na(immigrants),
         !is.na(feminists),
         votefor%in%c("hillary","trump"))
#install.packages("plot3D")
library(plot3D)
scatter3D(voterdata3$blacklivesmatter, voterdata3$immigrants, voterdata3$feminists, phi = 30, bty ="g",main = "Voter Data", xlab = "Black Lives Matter",ylab ="immigrants", zlab = "feminists")

References

*https://image.slidesharecdn.com/bnlintromlgenomicswwcjune2017public-170628225446/95/an-introduction-to-machine-learning-and-genomics-24-638.jpg?cb=1498690655*

Democracy Fund Voter Study Group. VIEWS OF THE ELECTORATE RESEARCH SURVEY, December 2016. [Computer File] Release 1: August 28, 2017. Washington DC: Democracy Fund Voter Study Group [producer] https://www.voterstudygroup.org/.

About KNN Procedure

“k-nearest neighbour classification for test set from training set. For each row of the test set, the k nearest (in Euclidean distance) training set vectors are found, and the classification is decided by majority vote, with ties broken at random. If there are ties for the kth nearest vector, all candidates are included in the vote.” source

---
title: ""
output: html_notebook
---

![](/Users/Brett/Library/Mobile Documents/com~apple~CloudDocs/All Files/Employers/Resume/MDRC/Predicting Voter Outcomes.png)


###Using a K-Nearest Neighbor Prediction Model

<br/>

#####Data:

[Voter Dataset](https://www.voterstudygroup.org/publications/2016-elections/data) from the Democracy Fund Voter Study Group 

*"The Democracy Fund Voter Study Group is using a unique longitudinal data set that most recently surveyed 8,000 adults (age 18+) in December 2016 via YouGov. Participants were identified from a pool of respondents who participated in a similar survey in December 2011, as well as a second pre-election interview in 2012, and a third interview following the 2012 presidential election. For these 8,000 respondents, we have measures of their political attitudes, values, and affinities in 2011 as well as self-reports of their turnout and vote choice in November 2012."*


```{r, fig.height=6, fig.width=6, message=FALSE, warning=FALSE}
library(readr)
library(dplyr)
library(gmodels)
library(class)
library(scales)

voterdata<-read_csv("/Users/Brett/Library/Mobile Documents/com~apple~CloudDocs/All Files/Education/QC Teaching/SOC765_SocialMediaMarketingAnalytics/Assignment 3 Audience Investigation Excel/Data/VOTER_Survey_December16_Release1 copy.csv")

voterdata<- voterdata%>%
  mutate(immigrants = as.numeric(ifelse(ft_immig_2016=="0 - Unfavorable feeling",0,
                      ifelse(ft_immig_2016=="100 - Favorable feeling",100,
                      ifelse(ft_immig_2016=="25 -Unfavorable feeling",25,
                      ifelse(ft_immig_2016=="50 - No feeling at all",50,
                      ifelse(ft_immig_2016=="75 - Favorable feeling",75,
                      ifelse(ft_immig_2016=="Don't know",NA,ft_immig_2016))))))),
         blacklivesmatter = as.numeric(ifelse(ft_blm_2016=="0 - Unfavorable feeling",0,
                            ifelse(ft_blm_2016=="100 - Favorable feeling",100,
                            ifelse(ft_blm_2016=="25 -Unfavorable feeling",25,
                            ifelse(ft_blm_2016=="50 - No feeling at all",50,
                            ifelse(ft_blm_2016=="75 - Favorable feeling",75,
                            ifelse(ft_blm_2016=="Don't know",NA,ft_blm_2016))))))),
          police = as.numeric(ifelse(ft_police_2016=="0 - Unfavorable feeling",0,
                            ifelse(ft_police_2016=="100 - Favorable feeling",100,
                            ifelse(ft_police_2016=="25 -Unfavorable feeling",25,
                            ifelse(ft_police_2016=="50 - No feeling at all",50,
                            ifelse(ft_police_2016=="75 - Favorable feeling",75,
                            ifelse(ft_police_2016=="Don't know",NA,ft_police_2016))))))),
         muslims = as.numeric(ifelse(ft_muslim_2016=="0 - Unfavorable feeling",0,
                            ifelse(ft_muslim_2016=="100 - Favorable feeling",100,
                            ifelse(ft_muslim_2016=="25 -Unfavorable feeling",25,
                            ifelse(ft_muslim_2016=="50 - No feeling at all",50,
                            ifelse(ft_muslim_2016=="75 - Favorable feeling",75,
                            ifelse(ft_muslim_2016=="Don't know",NA,ft_muslim_2016))))))),
         gays = as.numeric(ifelse(ft_gays_2016=="0 - Unfavorable feeling",0,
                            ifelse(ft_gays_2016=="100 - Favorable feeling",100,
                            ifelse(ft_gays_2016=="25 -Unfavorable feeling",25,
                            ifelse(ft_gays_2016=="50 - No feeling at all",50,
                            ifelse(ft_gays_2016=="75 - Favorable feeling",75,
                            ifelse(ft_gays_2016=="Don't know",NA,ft_gays_2016))))))),
         altright = as.numeric(ifelse(ft_altright_2016=="0 - Unfavorable feeling",0,
                            ifelse(ft_altright_2016=="100 - Favorable feeling",100,
                            ifelse(ft_altright_2016=="25 -Unfavorable feeling",25,
                            ifelse(ft_altright_2016=="50 - No feeling at all",50,
                            ifelse(ft_altright_2016=="75 - Favorable feeling",75,
                            ifelse(ft_altright_2016=="Don't know",NA,ft_altright_2016))))))),
         feminists = as.numeric(ifelse(ft_fem_2016=="0 - Unfavorable feeling",0,
                            ifelse(ft_fem_2016=="100 - Favorable feeling",100,
                            ifelse(ft_fem_2016=="25 -Unfavorable feeling",25,
                            ifelse(ft_fem_2016=="50 - No feeling at all",50,
                            ifelse(ft_fem_2016=="75 - Favorable feeling",75,
                            ifelse(ft_fem_2016=="Don't know",NA,ft_fem_2016))))))),
         christians = as.numeric(ifelse(ft_christ_2016=="0 - Unfavorable feeling",0,
                            ifelse(ft_christ_2016=="100 - Favorable feeling",100,
                            ifelse(ft_christ_2016=="25 -Unfavorable feeling",25,
                            ifelse(ft_christ_2016=="50 - No feeling at all",50,
                            ifelse(ft_christ_2016=="75 - Favorable feeling",75,
                            ifelse(ft_christ_2016=="Don't know",NA,ft_christ_2016))))))),
         jews = as.numeric(ifelse(ft_jew_2016=="0 - Unfavorable feeling",0,
                            ifelse(ft_jew_2016=="100 - Favorable feeling",100,
                            ifelse(ft_jew_2016=="25 -Unfavorable feeling",25,
                            ifelse(ft_jew_2016=="50 - No feeling at all",50,
                            ifelse(ft_jew_2016=="75 - Favorable feeling",75,
                            ifelse(ft_jew_2016=="Don't know",NA,ft_jew_2016))))))),
         hillary = ifelse(presvote16post_2016=="Hillary Clinton",1,0),
         trump = ifelse(presvote16post_2016=="Donald Trump",1,0),
         votefor = factor(ifelse(hillary==1,"hillary",ifelse(trump==1,"trump","other"))))

voterdata2 <- voterdata%>%
  select(votefor,immigrants,blacklivesmatter,police,muslims,gays,altright, 
         feminists, christians)%>%
  filter(!is.na(blacklivesmatter),
         !is.na(immigrants),
         !is.na(muslims),
         !is.na(police),
         !is.na(gays),
         !is.na(altright),
         !is.na(feminists),
         !is.na(christians),
         votefor%in%c("hillary","trump"))
```


###How corelated are self reported 0-100 ratings of favorability of various groups/issues.
```{r, fig.height=8, fig.width=8}
x<-cor(voterdata2%>%select(-votefor))
library(corrplot)
corrplot(x, type = "upper", order = "hclust", 
         tl.col = "black", tl.srt = 45)
```

###Can we predice who someone will vote for based on their self-reported favorability of these groups/issues?

```{r}
#Wrangling data into training & testing dataframes for a KNN algorithm

#install.packages("class")
library(class)

#Write a normalization function
normalize <- function(x) {
num <- x - min(x)
denom <- max(x) - min(x)
return (num/denom)
}

#Normalize 
voterdata3 <- as.data.frame(lapply(voterdata2[2:9], normalize))
voterdata2 <-cbind(voterdata2[1],voterdata3)

unique(voterdata2$votefor)

#Create two randomly assigned groups
ind <- sample(2, nrow(voterdata2), replace=TRUE, prob=c(0.67, 0.33))


#Training
  voterdata.training <- voterdata2[ind==1, 2:9]
  voterdata.trainLabels <- voterdata2[ind==1,1]

  
#Test         
  voterdata.test <- voterdata2[ind==2, 2:9]         
  voterdata.testLabels <- voterdata2[ind==2, 1]
```


####KNN Prediction Model

Comparing Predicted voting outcomes to observed voting outcomes

```{r, fig.height=6, fig.width=6}
#Build knn prediction model  
voterdata_pred <- knn(train = voterdata.training, 
                      test = voterdata.test, 
                      cl= voterdata.trainLabels, 
                      k=3)



# Merge `Predicted` and `Observed` 
merged <- data.frame(voterdata_pred, voterdata.testLabels)
names(merged) <- c("Predicted", "Observed")
print(merged)
```

####How accurate is the model?
```{r, fig.height=6, fig.width=6}
merged%>%
  mutate(Prediction=ifelse(Predicted==Observed,"Accurate","Not Accurate"))%>%
  group_by(Prediction)%>%
  summarize(n=n())%>%
  mutate(freq=percent(n/sum(n)))
```


####Does accuracy vary by candidate?

The model is able to predict hillary voters with slightly better accuracy than trump voters.
```{r}
CrossTable(x = voterdata_pred, y = voterdata.testLabels, prop.chisq=FALSE)
```


###3 Dimensional Plot

A quick 3-dimensional plot to see what clusters we can spot when looking at feelings towards Black Lives Matter, Immigrants, and Feminists.

We see alot of stratification here.

```{r}
voterdata3 <- voterdata%>%
  select(votefor,immigrants,blacklivesmatter,feminists)%>%
  filter(!is.na(blacklivesmatter),
         !is.na(immigrants),
         !is.na(feminists),
         votefor%in%c("hillary","trump"))


#install.packages("plot3D")
library(plot3D)

scatter3D(voterdata3$blacklivesmatter, voterdata3$immigrants, voterdata3$feminists, phi = 30, bty ="g",main = "Voter Data", xlab = "Black Lives Matter",ylab ="immigrants", zlab = "feminists")
```

##References

> *https://image.slidesharecdn.com/bnlintromlgenomicswwcjune2017public-170628225446/95/an-introduction-to-machine-learning-and-genomics-24-638.jpg?cb=1498690655*

> Democracy Fund Voter Study Group. VIEWS OF THE ELECTORATE RESEARCH SURVEY, December 2016. [Computer File] Release 1: August 28, 2017. Washington DC: Democracy Fund Voter Study Group [producer] https://www.voterstudygroup.org/.

##About KNN Procedure

> "k-nearest neighbour classification for test set from training set. For each row of the test set, the k nearest (in Euclidean distance) training set vectors are found, and the classification is decided by majority vote, with ties broken at random. If there are ties for the kth nearest vector, all candidates are included in the vote." [source](https://stat.ethz.ch/R-manual/R-devel/library/class/html/knn.html)


