library(haven)
library(foreign) 
library(readr)
library(dplyr)
library(ggplot2)
library(broom)
library(car)
library(MASS) 
library(lmtest)
library(zoo)
library(nortest)
library(plotrix)
library(scales)
library(tableone)
library(Weighted.Desc.Stat)
library(mitools)
library(survey)
library(VGAM)
library(stargazer)
library(sandwich)
library(pastecs)
library(muhaz)
library(ggpubr)
library(survminer)
library(eha)
library(reshape2)
library(data.table)
library(magrittr)
library(tidyverse)
library(sjmisc)
library(sjPlot)
library(sjmisc)
library(sjlabelled)
library(weights)
library(GGally)
library(tigris)
library(RColorBrewer)
library(patchwork)
library(tidycensus)
library(censusapi)
library(spdep)
library(classInt)
library(sf)
library(spatialreg)
library(naivebayes)
library(psych)
library(mlbench)
library(caret)
library(caretEnsemble)
library(Amelia)
library(mice)
library(rpart)
library(randomForest)
library(e1071)
library(klaR)

scratch DF

weight<-c(100,200,101,209,100,200,101,209,100,200,101,209,100,200,101,209,100,200,101,209)
height<-c(60,72,66,60,60,72,66,60,60,72,66,60,60,72,66,60,60,72,66,60)
age<-c(21,25,22,30,21,25,22,30,21,25,22,30,21,25,22,30,21,25,22,30)
sex<-c(1,0,0,1,1,0,0,1,1,0,0,1,1,0,0,1,1,0,0,1)
covidvac<-c(1,1,0,0,1,1,0,0,1,1,0,0,1,1,0,0,1,1,0,0)

attributes<-data.frame(weight,height,age,sex,covidvac)

frequency of covid vaccination by sex

xtabs(~covidvac+sex, data = attributes)

##         sex
## covidvac 0 1
##        0 5 5
##        1 5 5

str(attributes)

## 'data.frame':    20 obs. of  5 variables:
##  $ weight  : num  100 200 101 209 100 200 101 209 100 200 ...
##  $ height  : num  60 72 66 60 60 72 66 60 60 72 ...
##  $ age     : num  21 25 22 30 21 25 22 30 21 25 ...
##  $ sex     : num  1 0 0 1 1 0 0 1 1 0 ...
##  $ covidvac: num  1 1 0 0 1 1 0 0 1 1 ...

#convert factors

attributes$covidvac<-as.factor(attributes$covidvac)
attributes$sex<-as.factor(attributes$sex)

check for collinearity

pairs.panels(attributes[-1])

assess distributions

attributes %>%
         ggplot(aes(x=covidvac, y=weight, fill = covidvac)) +
         geom_boxplot() +theme_bw()+
         ggtitle("Box Plot")

attributes %>%
         ggplot(aes(x=covidvac, y=height, fill = covidvac)) +
         geom_boxplot() +theme_bw()+
         ggtitle("Box Plot")

attributes %>%
         ggplot(aes(x=covidvac, y=age, fill = covidvac)) +
         geom_boxplot() +theme_bw()+
         ggtitle("Box Plot")

create training and data set

set.seed(1)
ind <- sample(2, nrow(attributes), replace = T, prob = c(0.9, 0.1))
train <- attributes[ind == 1,]
test <- attributes[ind == 2,]

classify, Naive Bayes

model <- naive_bayes(covidvac ~ ., data = train, usekernel = T) 
plot(model)

predictions

p <- predict(model, train, type = 'prob')

## Warning: predict.naive_bayes(): more features in the newdata are provided as
## there are probability tables in the object. Calculation is performed based on
## features to be found in the tables.

head(cbind(p, train))

##             0          1 weight height age sex covidvac
## 1 0.309350103 0.69064990    100     60  21   1        1
## 2 0.008130883 0.99186912    200     72  25   0        1
## 3 0.631310106 0.36868989    101     66  22   0        0
## 5 0.309350103 0.69064990    100     60  21   1        1
## 6 0.008130883 0.99186912    200     72  25   0        1
## 8 0.986139206 0.01386079    209     60  30   1        0

training set performance, confusion matrix

p1 <- predict(model, train)

## Warning: predict.naive_bayes(): more features in the newdata are provided as
## there are probability tables in the object. Calculation is performed based on
## features to be found in the tables.

(tab1 <- table(p1, train$covidvac))

##    
## p1  0 1
##   0 8 0
##   1 0 9

1 - sum(diag(tab1)) / sum(tab1)

## [1] 0

p2 <- predict(model, test)

## Warning: predict.naive_bayes(): more features in the newdata are provided as
## there are probability tables in the object. Calculation is performed based on
## features to be found in the tables.

(tab2 <- table(p2, test$covidvac))

##    
## p2  0 1
##   0 2 0
##   1 0 1

1 - sum(diag(tab2)) / sum(tab2)

## [1] 0

real DF - American National Election Studies Survey (ANES), 2020

Prediction of Biden vote (2020) (post-election responses)

a20<-read_dta("C://Users//Jaire//OneDrive//Desktop//Research//Data//anes2020t.dta")

# sex 1= male, 0= female
a20$sex<-recode(a20$V202637, recodes = "2=0;-8:-1=NA", as.factor = T)
# 2020 presidential vote 1= Biden, 0= other
a20$biden20<-recode(a20$V202073, recodes = "2:12=0;-9:-1=NA", as.factor = T)
# income range, min= <9,999, max= 250k<=
a20$inc<-recode(a20$V202468x, recodes = "-9:-1=NA", as.numeric = T)
# voted in presidential election, 1= yes
a20$votd<-recode(a20$V201028, recodes = "2=0;-9:-1=NA", as.factor = T)

table(a20$biden20)

## 
##    0    1 
## 2632 3267

frequency of covid vaccination by sex

randsample <- a20[sample(1:nrow(a20), 1000,
   replace=FALSE),] 

xtabs(~biden20+sex, data = randsample)

##        sex
## biden20  0  1
##       0  6  5
##       1 10 12

sub <- subset(a20,select=c(sex, biden20,inc))
sub[complete.cases(sub),]

## # A tibble: 238 x 3
##    sex   biden20                       inc
##    <fct> <fct>                   <dbl+lbl>
##  1 1     0       20 [20. $150,000-174,999]
##  2 1     1        4 [4. $20,000-24,999]   
##  3 1     0       16 [16. $90,000-99,999]  
##  4 1     1       20 [20. $150,000-174,999]
##  5 1     1        3 [3. $15,000-19,999]   
##  6 0     1       10 [10. $50,000-59,999]  
##  7 0     1       19 [19. $125,000-149,999]
##  8 1     1       19 [19. $125,000-149,999]
##  9 0     1       17 [17. $100,000-109,999]
## 10 1     1        9 [9. $45,000-49,999]   
## # ... with 228 more rows

xtabs(~biden20+sex, data = sub)

##        sex
## biden20  0  1
##       0 44 39
##       1 87 71

assess distributions and check for collinearity

pairs.panels(sub[-2])

barchart(sub$sex)

barchart(sub$biden20)

hist(sub$inc)

a20 %>%
         ggplot(aes(x=biden20, y=inc, fill = biden20)) +
         geom_boxplot() +theme_bw()+
         ggtitle("Box Plot")

## Don't know how to automatically pick scale for object of type haven_labelled/vctrs_vctr/double. Defaulting to continuous.

## Warning: Removed 300 rows containing non-finite values (stat_boxplot).

set.seed(1)
ind <- sample(2, nrow(sub), replace = T, prob = c(0.9, 0.1))
train <- sub[ind == 1,]
test <- sub[ind == 2,]

classify, Naive Bayes

model <- naive_bayes(biden20 ~ ., data = train, usekernel = T)

## Warning: naive_bayes(): y contains NAs. They are excluded from the estimation
## process.

summary(model)

## 
## ================================== Naive Bayes ================================== 
##  
## - Call: naive_bayes.formula(formula = biden20 ~ ., data = train, usekernel = T) 
## - Laplace: 0 
## - Classes: 2 
## - Samples: 7414 
## - Features: 2 
## - Conditional distributions: 
##     - Bernoulli: 1
##     - KDE: 1
## - Prior probabilities: 
##     - 0: 0.4489
##     - 1: 0.5511
## 
## ---------------------------------------------------------------------------------

plot(model)

predictions

p1 <- predict(model, train, type = 'prob')

## Warning: predict.naive_bayes(): more features in the newdata are provided as
## there are probability tables in the object. Calculation is performed based on
## features to be found in the tables.

head(na.omit(cbind(p1, train)))

##              0         1 sex biden20 inc
## 6732 0.4266728 0.5733272   1       0  20
## 6737 0.4644322 0.5355678   1       0  16
## 6739 0.4266728 0.5733272   1       1  20
## 6741 0.4444297 0.5555703   1       1   3
## 6743 0.4713807 0.5286193   0       1  10
## 6744 0.4425369 0.5574631   0       1  19

test set performance, confusion matrix

p1 <- predict(model, train)

## Warning: predict.naive_bayes(): more features in the newdata are provided as
## there are probability tables in the object. Calculation is performed based on
## features to be found in the tables.

(tab1 <- table(p1, train$biden20))

##    
## p1     0    1
##   0    0    0
##   1 2365 2904

1 - sum(diag(tab1)) / sum(tab1)

## [1] 0.4488518

p2 <- predict(model, test)

## Warning: predict.naive_bayes(): more features in the newdata are provided as
## there are probability tables in the object. Calculation is performed based on
## features to be found in the tables.

(tab2 <- table(p2, test$biden20))

##    
## p2    0   1
##   0   0   0
##   1 267 363

1 - sum(diag(tab2)) / sum(tab2)

## [1] 0.4238095

include sex and income

models <- naive_bayes(biden20 ~ sex+inc, data = sub)

## Warning: naive_bayes(): y contains NAs. They are excluded from the estimation
## process.

summary(models)

## 
## ================================== Naive Bayes ================================== 
##  
## - Call: naive_bayes.formula(formula = biden20 ~ sex + inc, data = sub) 
## - Laplace: 0 
## - Classes: 2 
## - Samples: 8280 
## - Features: 2 
## - Conditional distributions: 
##     - Bernoulli: 1
##     - Gaussian: 1
## - Prior probabilities: 
##     - 0: 0.4462
##     - 1: 0.5538
## 
## ---------------------------------------------------------------------------------

plot(models)

# results

p3 <-predict(models, train, type = "prob")

## Warning: predict.naive_bayes(): more features in the newdata are provided as
## there are probability tables in the object. Calculation is performed based on
## features to be found in the tables.

models %class% test

## Warning: predict.naive_bayes(): more features in the newdata are provided as
## there are probability tables in the object. Calculation is performed based on
## features to be found in the tables.

##   [1] 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
##  [38] 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
##  [75] 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
## [112] 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
## [149] 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
## [186] 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
## [223] 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
## [260] 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
## [297] 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
## [334] 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
## [371] 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
## [408] 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
## [445] 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
## [482] 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
## [519] 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
## [556] 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
## [593] 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
## [630] 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
## [667] 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
## [704] 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
## [741] 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
## [778] 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
## [815] 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
## [852] 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
## Levels: 0 1

head(na.omit(cbind(p3, train)))

##              0         1 sex biden20 inc
## 6732 0.4412751 0.5587249   1       0  20
## 6737 0.4552317 0.5447683   1       0  16
## 6739 0.4412751 0.5587249   1       1  20
## 6741 0.4675439 0.5324561   1       1   3
## 6743 0.4467413 0.5532587   0       1  10
## 6744 0.4249066 0.5750934   0       1  19

p3 <- predict(model, test)

## Warning: predict.naive_bayes(): more features in the newdata are provided as
## there are probability tables in the object. Calculation is performed based on
## features to be found in the tables.

(tab2 <- table(p3, test$biden20))

##    
## p3    0   1
##   0   0   0
##   1 267 363

1 - sum(diag(tab2)) / sum(tab2)

## [1] 0.4238095

alternative method and outcome

assess data

str(sub)

## tibble [8,280 x 3] (S3: tbl_df/tbl/data.frame)
##  $ sex    : Factor w/ 2 levels "0","1": NA NA NA NA NA NA NA NA NA NA ...
##  $ biden20: Factor w/ 2 levels "0","1": NA 1 2 2 1 2 1 NA NA 2 ...
##  $ inc    : dbl+lbl [1:8280] 21, 13, 17,  7, 22,  3,  4,  3, 10, 11,  9, 18,  1, 2...
##    ..@ label       : chr "PRE-POST: SUMMARY: Total (family) income"
##    ..@ format.stata: chr "%12.0g"
##    ..@ labels      : Named num [1:24] -9 -5 1 2 3 4 5 6 7 8 ...
##    .. ..- attr(*, "names")= chr [1:24] "-9. Refused" "-5. Breakoff, sufficient partial IW" "1. Under $9,999" "2. $10,000-14,999" ...

head(sub)

## # A tibble: 6 x 3
##   sex   biden20                       inc
##   <fct> <fct>                   <dbl+lbl>
## 1 <NA>  <NA>    21 [21. $175,000-249,999]
## 2 <NA>  0       13 [13. $70,000-74,999]  
## 3 <NA>  1       17 [17. $100,000-109,999]
## 4 <NA>  1        7 [7. $35,000-39,999]   
## 5 <NA>  0       22 [22. $250,000 or more]
## 6 <NA>  1        3 [3. $15,000-19,999]

describe(sub)

##          vars    n  mean   sd median trimmed mad min max range  skew kurtosis
## sex*        1  270  1.45 0.50      1    1.44 0.0   1   2     1  0.19    -1.97
## biden20*    2 5899  1.55 0.50      2    1.57 0.0   1   2     1 -0.22    -1.95
## inc         3 7980 11.73 6.74     12   11.84 8.9   1  22    21 -0.11    -1.27
##            se
## sex*     0.03
## biden20* 0.01
## inc      0.08

missmap(sub)

## Warning: Unknown or uninitialised column: `arguments`.

## Warning: Unknown or uninitialised column: `arguments`.

## Warning: Unknown or uninitialised column: `imputations`.

imputations

imp_mod <- mice(sub[, c("sex","biden20","inc")], method='rf')

## 
##  iter imp variable
##   1   1  sex  biden20  inc
##   1   2  sex  biden20  inc
##   1   3  sex  biden20  inc
##   1   4  sex  biden20  inc
##   1   5  sex  biden20  inc
##   2   1  sex  biden20  inc
##   2   2  sex  biden20  inc
##   2   3  sex  biden20  inc
##   2   4  sex  biden20  inc
##   2   5  sex  biden20  inc
##   3   1  sex  biden20  inc
##   3   2  sex  biden20  inc
##   3   3  sex  biden20  inc
##   3   4  sex  biden20  inc
##   3   5  sex  biden20  inc
##   4   1  sex  biden20  inc
##   4   2  sex  biden20  inc
##   4   3  sex  biden20  inc
##   4   4  sex  biden20  inc
##   4   5  sex  biden20  inc
##   5   1  sex  biden20  inc
##   5   2  sex  biden20  inc
##   5   3  sex  biden20  inc
##   5   4  sex  biden20  inc
##   5   5  sex  biden20  inc

imp_complete <- complete(imp_mod)

transfer missing values

sub$biden20 <- imp_complete$biden20
sub$trump20 <- imp_complete$trump20
sub$sex <- imp_complete$sex
sub$inc <- imp_complete$inc

missmap(sub)

## Warning: Unknown or uninitialised column: `arguments`.

## Warning: Unknown or uninitialised column: `arguments`.

## Warning: Unknown or uninitialised column: `imputations`.

check visuals

ggplot(sub, aes(inc, colour = biden20)) +
geom_freqpoly(binwidth = 1) + labs(title="inc range  by 2020 vote")

## Don't know how to automatically pick scale for object of type haven_labelled/vctrs_vctr/double. Defaulting to continuous.

c <- ggplot(sub, aes(x=inc, fill=biden20, color=biden20)) +
geom_histogram(binwidth = 1) + labs(title="inc Distribution by Biden vote 2020")
c + theme_bw()

## Don't know how to automatically pick scale for object of type haven_labelled/vctrs_vctr/double. Defaulting to continuous.

barchart(sub$sex)

barchart(sub$biden20)

data splice

subxTrain <- createDataPartition(y = sub$biden20,p = 0.90,list = FALSE)
training <- sub[subxTrain,]
testing <- sub[-subxTrain,] 

prop.table(table(sub$biden20)) * 100

## 
##        0        1 
## 45.02415 54.97585

prop.table(table(training$biden20)) * 100

## 
##        0        1 
## 45.02885 54.97115

prop.table(table(testing$biden20)) * 100

## 
##        0        1 
## 44.98186 55.01814

comparing the outcome of the training and testing phase

x = training[,-3]
y = training$biden20

construct naive bayes training model

model = train(training,training$biden20,'nb',trControl=trainControl(method='cv',number=4))

## Warning: Setting row names on a tibble is deprecated.

## Warning: Setting row names on a tibble is deprecated.

## Warning: Setting row names on a tibble is deprecated.

## Warning: Setting row names on a tibble is deprecated.

## Warning: Setting row names on a tibble is deprecated.

## Warning: Setting row names on a tibble is deprecated.

## Warning: Setting row names on a tibble is deprecated.

## Warning: Setting row names on a tibble is deprecated.

## Warning: Setting row names on a tibble is deprecated.

model

## Naive Bayes 
## 
## 7453 samples
##    3 predictor
##    2 classes: '0', '1' 
## 
## No pre-processing
## Resampling: Cross-Validated (4 fold) 
## Summary of sample sizes: 5590, 5589, 5590, 5590 
## Resampling results across tuning parameters:
## 
##   usekernel  Accuracy  Kappa
##   FALSE      1         1    
##    TRUE      1         1    
## 
## Tuning parameter 'fL' was held constant at a value of 0
## Tuning
##  parameter 'adjust' was held constant at a value of 1
## Accuracy was used to select the optimal model using the largest value.
## The final values used for the model were fL = 0, usekernel = FALSE and adjust
##  = 1.

check training model with test data and confusion matrix

Predict <- predict(model,newdata = testing )  
confusionMatrix(Predict, testing$biden20 )

## Confusion Matrix and Statistics
## 
##           Reference
## Prediction   0   1
##          0 372   0
##          1   0 455
##                                      
##                Accuracy : 1          
##                  95% CI : (0.9955, 1)
##     No Information Rate : 0.5502     
##     P-Value [Acc > NIR] : < 2.2e-16  
##                                      
##                   Kappa : 1          
##                                      
##  Mcnemar's Test P-Value : NA         
##                                      
##             Sensitivity : 1.0000     
##             Specificity : 1.0000     
##          Pos Pred Value : 1.0000     
##          Neg Pred Value : 1.0000     
##              Prevalence : 0.4498     
##          Detection Rate : 0.4498     
##    Detection Prevalence : 0.4498     
##       Balanced Accuracy : 1.0000     
##                                      
##        'Positive' Class : 0          
##

Naive Bayes classifier that can predict whether a person identified as a Biden voter in 2020, with an accuracy of approximately 100% - as a caveat these data may not be the most appropriate.

check variable performance

roc_imp.training <- filterVarImp(x = training[, -ncol(training)], y = training$biden20)
head(roc_imp.training)

##               X0       X1
## sex     0.503603 0.503603
## biden20 1.000000 1.000000

roc_imp.test <- filterVarImp(x = testing[, -ncol(testing)], y = testing$biden20)
head(roc_imp.test)

##                X0        X1
## sex     0.5002865 0.5002865
## biden20 1.0000000 1.0000000

Naive Bayes Predictive Analysis

Jairemy Edwards

11/16/2021

scratch DF

frequency of covid vaccination by sex

check for collinearity

assess distributions

create training and data set

classify, Naive Bayes

predictions

training set performance, confusion matrix

real DF - American National Election Studies Survey (ANES), 2020

frequency of covid vaccination by sex

assess distributions and check for collinearity

classify, Naive Bayes

predictions

test set performance, confusion matrix

include sex and income

alternative method and outcome

assess data

imputations

transfer missing values

check visuals

data splice

comparing the outcome of the training and testing phase

construct naive bayes training model

check training model with test data and confusion matrix

check variable performance