library(haven)
library(foreign) 
library(readr)
library(dplyr)
library(ggplot2)
library(broom)
library(car)
library(MASS) 
library(lmtest)
library(zoo)
library(nortest)
library(plotrix)
library(scales)
library(tableone)
library(Weighted.Desc.Stat)
library(mitools)
library(survey)
library(VGAM)
library(stargazer)
library(sandwich)
library(pastecs)
library(muhaz)
library(ggpubr)
library(survminer)
library(eha)
library(reshape2)
library(data.table)
library(magrittr)
library(tidyverse)
library(sjmisc)
library(sjPlot)
library(sjmisc)
library(sjlabelled)
library(weights)
library(GGally)
library(tigris)
library(RColorBrewer)
library(patchwork)
library(tidycensus)
library(censusapi)
library(spdep)
library(classInt)
library(sf)
library(spatialreg)
library(naivebayes)
library(psych)
library(mlbench)
library(caret)
library(caretEnsemble)
library(Amelia)
library(mice)
library(rpart)
library(randomForest)
library(e1071)
library(klaR)

scratch DF

weight<-c(100,200,101,209,100,200,101,209,100,200,101,209,100,200,101,209,100,200,101,209)
height<-c(60,72,66,60,60,72,66,60,60,72,66,60,60,72,66,60,60,72,66,60)
age<-c(21,25,22,30,21,25,22,30,21,25,22,30,21,25,22,30,21,25,22,30)
sex<-c(1,0,0,1,1,0,0,1,1,0,0,1,1,0,0,1,1,0,0,1)
covidvac<-c(1,1,0,0,1,1,0,0,1,1,0,0,1,1,0,0,1,1,0,0)
attributes<-data.frame(weight,height,age,sex,covidvac)

frequency of covid vaccination by sex

xtabs(~covidvac+sex, data = attributes)
##         sex
## covidvac 0 1
##        0 5 5
##        1 5 5
str(attributes)
## 'data.frame':    20 obs. of  5 variables:
##  $ weight  : num  100 200 101 209 100 200 101 209 100 200 ...
##  $ height  : num  60 72 66 60 60 72 66 60 60 72 ...
##  $ age     : num  21 25 22 30 21 25 22 30 21 25 ...
##  $ sex     : num  1 0 0 1 1 0 0 1 1 0 ...
##  $ covidvac: num  1 1 0 0 1 1 0 0 1 1 ...

#convert factors

attributes$covidvac<-as.factor(attributes$covidvac)
attributes$sex<-as.factor(attributes$sex)

check for collinearity

pairs.panels(attributes[-1])

assess distributions

attributes %>%
         ggplot(aes(x=covidvac, y=weight, fill = covidvac)) +
         geom_boxplot() +theme_bw()+
         ggtitle("Box Plot")

attributes %>%
         ggplot(aes(x=covidvac, y=height, fill = covidvac)) +
         geom_boxplot() +theme_bw()+
         ggtitle("Box Plot")

attributes %>%
         ggplot(aes(x=covidvac, y=age, fill = covidvac)) +
         geom_boxplot() +theme_bw()+
         ggtitle("Box Plot")

create training and data set

set.seed(1)
ind <- sample(2, nrow(attributes), replace = T, prob = c(0.9, 0.1))
train <- attributes[ind == 1,]
test <- attributes[ind == 2,]

classify, Naive Bayes

model <- naive_bayes(covidvac ~ ., data = train, usekernel = T) 
plot(model)

predictions

p <- predict(model, train, type = 'prob')
## Warning: predict.naive_bayes(): more features in the newdata are provided as
## there are probability tables in the object. Calculation is performed based on
## features to be found in the tables.
head(cbind(p, train))
##             0          1 weight height age sex covidvac
## 1 0.309350103 0.69064990    100     60  21   1        1
## 2 0.008130883 0.99186912    200     72  25   0        1
## 3 0.631310106 0.36868989    101     66  22   0        0
## 5 0.309350103 0.69064990    100     60  21   1        1
## 6 0.008130883 0.99186912    200     72  25   0        1
## 8 0.986139206 0.01386079    209     60  30   1        0

training set performance, confusion matrix

p1 <- predict(model, train)
## Warning: predict.naive_bayes(): more features in the newdata are provided as
## there are probability tables in the object. Calculation is performed based on
## features to be found in the tables.
(tab1 <- table(p1, train$covidvac))
##    
## p1  0 1
##   0 8 0
##   1 0 9
1 - sum(diag(tab1)) / sum(tab1)
## [1] 0
p2 <- predict(model, test)
## Warning: predict.naive_bayes(): more features in the newdata are provided as
## there are probability tables in the object. Calculation is performed based on
## features to be found in the tables.
(tab2 <- table(p2, test$covidvac))
##    
## p2  0 1
##   0 2 0
##   1 0 1
1 - sum(diag(tab2)) / sum(tab2)
## [1] 0

real DF - American National Election Studies Survey (ANES), 2020

Prediction of Biden vote (2020) (post-election responses)

a20<-read_dta("C://Users//Jaire//OneDrive//Desktop//Research//Data//anes2020t.dta")
# sex 1= male, 0= female
a20$sex<-recode(a20$V202637, recodes = "2=0;-8:-1=NA", as.factor = T)
# 2020 presidential vote 1= Biden, 0= other
a20$biden20<-recode(a20$V202073, recodes = "2:12=0;-9:-1=NA", as.factor = T)
# income range, min= <9,999, max= 250k<=
a20$inc<-recode(a20$V202468x, recodes = "-9:-1=NA", as.numeric = T)
# voted in presidential election, 1= yes
a20$votd<-recode(a20$V201028, recodes = "2=0;-9:-1=NA", as.factor = T)
table(a20$biden20)
## 
##    0    1 
## 2632 3267

frequency of covid vaccination by sex

randsample <- a20[sample(1:nrow(a20), 1000,
   replace=FALSE),] 

xtabs(~biden20+sex, data = randsample)
##        sex
## biden20  0  1
##       0  6  5
##       1 10 12
sub <- subset(a20,select=c(sex, biden20,inc))
sub[complete.cases(sub),]
## # A tibble: 238 x 3
##    sex   biden20                       inc
##    <fct> <fct>                   <dbl+lbl>
##  1 1     0       20 [20. $150,000-174,999]
##  2 1     1        4 [4. $20,000-24,999]   
##  3 1     0       16 [16. $90,000-99,999]  
##  4 1     1       20 [20. $150,000-174,999]
##  5 1     1        3 [3. $15,000-19,999]   
##  6 0     1       10 [10. $50,000-59,999]  
##  7 0     1       19 [19. $125,000-149,999]
##  8 1     1       19 [19. $125,000-149,999]
##  9 0     1       17 [17. $100,000-109,999]
## 10 1     1        9 [9. $45,000-49,999]   
## # ... with 228 more rows
xtabs(~biden20+sex, data = sub)
##        sex
## biden20  0  1
##       0 44 39
##       1 87 71

assess distributions and check for collinearity

pairs.panels(sub[-2])

barchart(sub$sex)

barchart(sub$biden20)

hist(sub$inc)

a20 %>%
         ggplot(aes(x=biden20, y=inc, fill = biden20)) +
         geom_boxplot() +theme_bw()+
         ggtitle("Box Plot")
## Don't know how to automatically pick scale for object of type haven_labelled/vctrs_vctr/double. Defaulting to continuous.
## Warning: Removed 300 rows containing non-finite values (stat_boxplot).

set.seed(1)
ind <- sample(2, nrow(sub), replace = T, prob = c(0.9, 0.1))
train <- sub[ind == 1,]
test <- sub[ind == 2,]

classify, Naive Bayes

model <- naive_bayes(biden20 ~ ., data = train, usekernel = T) 
## Warning: naive_bayes(): y contains NAs. They are excluded from the estimation
## process.
summary(model)
## 
## ================================== Naive Bayes ================================== 
##  
## - Call: naive_bayes.formula(formula = biden20 ~ ., data = train, usekernel = T) 
## - Laplace: 0 
## - Classes: 2 
## - Samples: 7414 
## - Features: 2 
## - Conditional distributions: 
##     - Bernoulli: 1
##     - KDE: 1
## - Prior probabilities: 
##     - 0: 0.4489
##     - 1: 0.5511
## 
## ---------------------------------------------------------------------------------
plot(model)

predictions

p1 <- predict(model, train, type = 'prob')
## Warning: predict.naive_bayes(): more features in the newdata are provided as
## there are probability tables in the object. Calculation is performed based on
## features to be found in the tables.
head(na.omit(cbind(p1, train)))
##              0         1 sex biden20 inc
## 6732 0.4266728 0.5733272   1       0  20
## 6737 0.4644322 0.5355678   1       0  16
## 6739 0.4266728 0.5733272   1       1  20
## 6741 0.4444297 0.5555703   1       1   3
## 6743 0.4713807 0.5286193   0       1  10
## 6744 0.4425369 0.5574631   0       1  19

test set performance, confusion matrix

p1 <- predict(model, train)
## Warning: predict.naive_bayes(): more features in the newdata are provided as
## there are probability tables in the object. Calculation is performed based on
## features to be found in the tables.
(tab1 <- table(p1, train$biden20))
##    
## p1     0    1
##   0    0    0
##   1 2365 2904
1 - sum(diag(tab1)) / sum(tab1)
## [1] 0.4488518
p2 <- predict(model, test)
## Warning: predict.naive_bayes(): more features in the newdata are provided as
## there are probability tables in the object. Calculation is performed based on
## features to be found in the tables.
(tab2 <- table(p2, test$biden20))
##    
## p2    0   1
##   0   0   0
##   1 267 363
1 - sum(diag(tab2)) / sum(tab2)
## [1] 0.4238095

include sex and income

models <- naive_bayes(biden20 ~ sex+inc, data = sub)
## Warning: naive_bayes(): y contains NAs. They are excluded from the estimation
## process.
summary(models)
## 
## ================================== Naive Bayes ================================== 
##  
## - Call: naive_bayes.formula(formula = biden20 ~ sex + inc, data = sub) 
## - Laplace: 0 
## - Classes: 2 
## - Samples: 8280 
## - Features: 2 
## - Conditional distributions: 
##     - Bernoulli: 1
##     - Gaussian: 1
## - Prior probabilities: 
##     - 0: 0.4462
##     - 1: 0.5538
## 
## ---------------------------------------------------------------------------------
plot(models)

# results

p3 <-predict(models, train, type = "prob")
## Warning: predict.naive_bayes(): more features in the newdata are provided as
## there are probability tables in the object. Calculation is performed based on
## features to be found in the tables.
models %class% test
## Warning: predict.naive_bayes(): more features in the newdata are provided as
## there are probability tables in the object. Calculation is performed based on
## features to be found in the tables.
##   [1] 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
##  [38] 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
##  [75] 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
## [112] 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
## [149] 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
## [186] 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
## [223] 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
## [260] 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
## [297] 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
## [334] 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
## [371] 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
## [408] 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
## [445] 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
## [482] 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
## [519] 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
## [556] 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
## [593] 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
## [630] 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
## [667] 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
## [704] 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
## [741] 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
## [778] 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
## [815] 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
## [852] 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
## Levels: 0 1
head(na.omit(cbind(p3, train)))
##              0         1 sex biden20 inc
## 6732 0.4412751 0.5587249   1       0  20
## 6737 0.4552317 0.5447683   1       0  16
## 6739 0.4412751 0.5587249   1       1  20
## 6741 0.4675439 0.5324561   1       1   3
## 6743 0.4467413 0.5532587   0       1  10
## 6744 0.4249066 0.5750934   0       1  19
p3 <- predict(model, test)
## Warning: predict.naive_bayes(): more features in the newdata are provided as
## there are probability tables in the object. Calculation is performed based on
## features to be found in the tables.
(tab2 <- table(p3, test$biden20))
##    
## p3    0   1
##   0   0   0
##   1 267 363
1 - sum(diag(tab2)) / sum(tab2)
## [1] 0.4238095

alternative method and outcome

assess data

str(sub)
## tibble [8,280 x 3] (S3: tbl_df/tbl/data.frame)
##  $ sex    : Factor w/ 2 levels "0","1": NA NA NA NA NA NA NA NA NA NA ...
##  $ biden20: Factor w/ 2 levels "0","1": NA 1 2 2 1 2 1 NA NA 2 ...
##  $ inc    : dbl+lbl [1:8280] 21, 13, 17,  7, 22,  3,  4,  3, 10, 11,  9, 18,  1, 2...
##    ..@ label       : chr "PRE-POST: SUMMARY: Total (family) income"
##    ..@ format.stata: chr "%12.0g"
##    ..@ labels      : Named num [1:24] -9 -5 1 2 3 4 5 6 7 8 ...
##    .. ..- attr(*, "names")= chr [1:24] "-9. Refused" "-5. Breakoff, sufficient partial IW" "1. Under $9,999" "2. $10,000-14,999" ...
head(sub)
## # A tibble: 6 x 3
##   sex   biden20                       inc
##   <fct> <fct>                   <dbl+lbl>
## 1 <NA>  <NA>    21 [21. $175,000-249,999]
## 2 <NA>  0       13 [13. $70,000-74,999]  
## 3 <NA>  1       17 [17. $100,000-109,999]
## 4 <NA>  1        7 [7. $35,000-39,999]   
## 5 <NA>  0       22 [22. $250,000 or more]
## 6 <NA>  1        3 [3. $15,000-19,999]
describe(sub)
##          vars    n  mean   sd median trimmed mad min max range  skew kurtosis
## sex*        1  270  1.45 0.50      1    1.44 0.0   1   2     1  0.19    -1.97
## biden20*    2 5899  1.55 0.50      2    1.57 0.0   1   2     1 -0.22    -1.95
## inc         3 7980 11.73 6.74     12   11.84 8.9   1  22    21 -0.11    -1.27
##            se
## sex*     0.03
## biden20* 0.01
## inc      0.08
missmap(sub)
## Warning: Unknown or uninitialised column: `arguments`.

## Warning: Unknown or uninitialised column: `arguments`.
## Warning: Unknown or uninitialised column: `imputations`.

imputations

imp_mod <- mice(sub[, c("sex","biden20","inc")], method='rf')
## 
##  iter imp variable
##   1   1  sex  biden20  inc
##   1   2  sex  biden20  inc
##   1   3  sex  biden20  inc
##   1   4  sex  biden20  inc
##   1   5  sex  biden20  inc
##   2   1  sex  biden20  inc
##   2   2  sex  biden20  inc
##   2   3  sex  biden20  inc
##   2   4  sex  biden20  inc
##   2   5  sex  biden20  inc
##   3   1  sex  biden20  inc
##   3   2  sex  biden20  inc
##   3   3  sex  biden20  inc
##   3   4  sex  biden20  inc
##   3   5  sex  biden20  inc
##   4   1  sex  biden20  inc
##   4   2  sex  biden20  inc
##   4   3  sex  biden20  inc
##   4   4  sex  biden20  inc
##   4   5  sex  biden20  inc
##   5   1  sex  biden20  inc
##   5   2  sex  biden20  inc
##   5   3  sex  biden20  inc
##   5   4  sex  biden20  inc
##   5   5  sex  biden20  inc
imp_complete <- complete(imp_mod)

transfer missing values

sub$biden20 <- imp_complete$biden20
sub$trump20 <- imp_complete$trump20
sub$sex <- imp_complete$sex
sub$inc <- imp_complete$inc
missmap(sub)
## Warning: Unknown or uninitialised column: `arguments`.

## Warning: Unknown or uninitialised column: `arguments`.
## Warning: Unknown or uninitialised column: `imputations`.

check visuals

ggplot(sub, aes(inc, colour = biden20)) +
geom_freqpoly(binwidth = 1) + labs(title="inc range  by 2020 vote")
## Don't know how to automatically pick scale for object of type haven_labelled/vctrs_vctr/double. Defaulting to continuous.

c <- ggplot(sub, aes(x=inc, fill=biden20, color=biden20)) +
geom_histogram(binwidth = 1) + labs(title="inc Distribution by Biden vote 2020")
c + theme_bw()
## Don't know how to automatically pick scale for object of type haven_labelled/vctrs_vctr/double. Defaulting to continuous.

barchart(sub$sex) 

barchart(sub$biden20)

data splice

subxTrain <- createDataPartition(y = sub$biden20,p = 0.90,list = FALSE)
training <- sub[subxTrain,]
testing <- sub[-subxTrain,] 

prop.table(table(sub$biden20)) * 100
## 
##        0        1 
## 45.02415 54.97585
prop.table(table(training$biden20)) * 100
## 
##        0        1 
## 45.02885 54.97115
prop.table(table(testing$biden20)) * 100
## 
##        0        1 
## 44.98186 55.01814

comparing the outcome of the training and testing phase

x = training[,-3]
y = training$biden20

construct naive bayes training model

model = train(training,training$biden20,'nb',trControl=trainControl(method='cv',number=4))
## Warning: Setting row names on a tibble is deprecated.

## Warning: Setting row names on a tibble is deprecated.

## Warning: Setting row names on a tibble is deprecated.

## Warning: Setting row names on a tibble is deprecated.

## Warning: Setting row names on a tibble is deprecated.

## Warning: Setting row names on a tibble is deprecated.

## Warning: Setting row names on a tibble is deprecated.

## Warning: Setting row names on a tibble is deprecated.

## Warning: Setting row names on a tibble is deprecated.
model
## Naive Bayes 
## 
## 7453 samples
##    3 predictor
##    2 classes: '0', '1' 
## 
## No pre-processing
## Resampling: Cross-Validated (4 fold) 
## Summary of sample sizes: 5590, 5589, 5590, 5590 
## Resampling results across tuning parameters:
## 
##   usekernel  Accuracy  Kappa
##   FALSE      1         1    
##    TRUE      1         1    
## 
## Tuning parameter 'fL' was held constant at a value of 0
## Tuning
##  parameter 'adjust' was held constant at a value of 1
## Accuracy was used to select the optimal model using the largest value.
## The final values used for the model were fL = 0, usekernel = FALSE and adjust
##  = 1.

check training model with test data and confusion matrix

Predict <- predict(model,newdata = testing )  
confusionMatrix(Predict, testing$biden20 )
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction   0   1
##          0 372   0
##          1   0 455
##                                      
##                Accuracy : 1          
##                  95% CI : (0.9955, 1)
##     No Information Rate : 0.5502     
##     P-Value [Acc > NIR] : < 2.2e-16  
##                                      
##                   Kappa : 1          
##                                      
##  Mcnemar's Test P-Value : NA         
##                                      
##             Sensitivity : 1.0000     
##             Specificity : 1.0000     
##          Pos Pred Value : 1.0000     
##          Neg Pred Value : 1.0000     
##              Prevalence : 0.4498     
##          Detection Rate : 0.4498     
##    Detection Prevalence : 0.4498     
##       Balanced Accuracy : 1.0000     
##                                      
##        'Positive' Class : 0          
## 

Naive Bayes classifier that can predict whether a person identified as a Biden voter in 2020, with an accuracy of approximately 100% - as a caveat these data may not be the most appropriate.

check variable performance

roc_imp.training <- filterVarImp(x = training[, -ncol(training)], y = training$biden20)
head(roc_imp.training)
##               X0       X1
## sex     0.503603 0.503603
## biden20 1.000000 1.000000
roc_imp.test <- filterVarImp(x = testing[, -ncol(testing)], y = testing$biden20)
head(roc_imp.test)
##                X0        X1
## sex     0.5002865 0.5002865
## biden20 1.0000000 1.0000000