library(haven)
library(foreign)
library(readr)
library(dplyr)
library(ggplot2)
library(broom)
library(car)
library(MASS)
library(lmtest)
library(zoo)
library(nortest)
library(plotrix)
library(scales)
library(tableone)
library(Weighted.Desc.Stat)
library(mitools)
library(survey)
library(VGAM)
library(stargazer)
library(sandwich)
library(pastecs)
library(muhaz)
library(ggpubr)
library(survminer)
library(eha)
library(reshape2)
library(data.table)
library(magrittr)
library(tidyverse)
library(sjmisc)
library(sjPlot)
library(sjmisc)
library(sjlabelled)
library(weights)
library(GGally)
library(tigris)
library(RColorBrewer)
library(patchwork)
library(tidycensus)
library(censusapi)
library(spdep)
library(classInt)
library(sf)
library(spatialreg)
library(naivebayes)
library(psych)
library(mlbench)
library(caret)
library(caretEnsemble)
library(Amelia)
library(mice)
library(rpart)
library(randomForest)
library(e1071)
library(klaR)
weight<-c(100,200,101,209,100,200,101,209,100,200,101,209,100,200,101,209,100,200,101,209)
height<-c(60,72,66,60,60,72,66,60,60,72,66,60,60,72,66,60,60,72,66,60)
age<-c(21,25,22,30,21,25,22,30,21,25,22,30,21,25,22,30,21,25,22,30)
sex<-c(1,0,0,1,1,0,0,1,1,0,0,1,1,0,0,1,1,0,0,1)
covidvac<-c(1,1,0,0,1,1,0,0,1,1,0,0,1,1,0,0,1,1,0,0)
attributes<-data.frame(weight,height,age,sex,covidvac)
xtabs(~covidvac+sex, data = attributes)
## sex
## covidvac 0 1
## 0 5 5
## 1 5 5
str(attributes)
## 'data.frame': 20 obs. of 5 variables:
## $ weight : num 100 200 101 209 100 200 101 209 100 200 ...
## $ height : num 60 72 66 60 60 72 66 60 60 72 ...
## $ age : num 21 25 22 30 21 25 22 30 21 25 ...
## $ sex : num 1 0 0 1 1 0 0 1 1 0 ...
## $ covidvac: num 1 1 0 0 1 1 0 0 1 1 ...
#convert factors
attributes$covidvac<-as.factor(attributes$covidvac)
attributes$sex<-as.factor(attributes$sex)
pairs.panels(attributes[-1])
attributes %>%
ggplot(aes(x=covidvac, y=weight, fill = covidvac)) +
geom_boxplot() +theme_bw()+
ggtitle("Box Plot")
attributes %>%
ggplot(aes(x=covidvac, y=height, fill = covidvac)) +
geom_boxplot() +theme_bw()+
ggtitle("Box Plot")
attributes %>%
ggplot(aes(x=covidvac, y=age, fill = covidvac)) +
geom_boxplot() +theme_bw()+
ggtitle("Box Plot")
set.seed(1)
ind <- sample(2, nrow(attributes), replace = T, prob = c(0.9, 0.1))
train <- attributes[ind == 1,]
test <- attributes[ind == 2,]
model <- naive_bayes(covidvac ~ ., data = train, usekernel = T)
plot(model)
p <- predict(model, train, type = 'prob')
## Warning: predict.naive_bayes(): more features in the newdata are provided as
## there are probability tables in the object. Calculation is performed based on
## features to be found in the tables.
head(cbind(p, train))
## 0 1 weight height age sex covidvac
## 1 0.309350103 0.69064990 100 60 21 1 1
## 2 0.008130883 0.99186912 200 72 25 0 1
## 3 0.631310106 0.36868989 101 66 22 0 0
## 5 0.309350103 0.69064990 100 60 21 1 1
## 6 0.008130883 0.99186912 200 72 25 0 1
## 8 0.986139206 0.01386079 209 60 30 1 0
p1 <- predict(model, train)
## Warning: predict.naive_bayes(): more features in the newdata are provided as
## there are probability tables in the object. Calculation is performed based on
## features to be found in the tables.
(tab1 <- table(p1, train$covidvac))
##
## p1 0 1
## 0 8 0
## 1 0 9
1 - sum(diag(tab1)) / sum(tab1)
## [1] 0
p2 <- predict(model, test)
## Warning: predict.naive_bayes(): more features in the newdata are provided as
## there are probability tables in the object. Calculation is performed based on
## features to be found in the tables.
(tab2 <- table(p2, test$covidvac))
##
## p2 0 1
## 0 2 0
## 1 0 1
1 - sum(diag(tab2)) / sum(tab2)
## [1] 0
Prediction of Biden vote (2020) (post-election responses)
a20<-read_dta("C://Users//Jaire//OneDrive//Desktop//Research//Data//anes2020t.dta")
# sex 1= male, 0= female
a20$sex<-recode(a20$V202637, recodes = "2=0;-8:-1=NA", as.factor = T)
# 2020 presidential vote 1= Biden, 0= other
a20$biden20<-recode(a20$V202073, recodes = "2:12=0;-9:-1=NA", as.factor = T)
# income range, min= <9,999, max= 250k<=
a20$inc<-recode(a20$V202468x, recodes = "-9:-1=NA", as.numeric = T)
# voted in presidential election, 1= yes
a20$votd<-recode(a20$V201028, recodes = "2=0;-9:-1=NA", as.factor = T)
table(a20$biden20)
##
## 0 1
## 2632 3267
randsample <- a20[sample(1:nrow(a20), 1000,
replace=FALSE),]
xtabs(~biden20+sex, data = randsample)
## sex
## biden20 0 1
## 0 6 5
## 1 10 12
sub <- subset(a20,select=c(sex, biden20,inc))
sub[complete.cases(sub),]
## # A tibble: 238 x 3
## sex biden20 inc
## <fct> <fct> <dbl+lbl>
## 1 1 0 20 [20. $150,000-174,999]
## 2 1 1 4 [4. $20,000-24,999]
## 3 1 0 16 [16. $90,000-99,999]
## 4 1 1 20 [20. $150,000-174,999]
## 5 1 1 3 [3. $15,000-19,999]
## 6 0 1 10 [10. $50,000-59,999]
## 7 0 1 19 [19. $125,000-149,999]
## 8 1 1 19 [19. $125,000-149,999]
## 9 0 1 17 [17. $100,000-109,999]
## 10 1 1 9 [9. $45,000-49,999]
## # ... with 228 more rows
xtabs(~biden20+sex, data = sub)
## sex
## biden20 0 1
## 0 44 39
## 1 87 71
pairs.panels(sub[-2])
barchart(sub$sex)
barchart(sub$biden20)
hist(sub$inc)
a20 %>%
ggplot(aes(x=biden20, y=inc, fill = biden20)) +
geom_boxplot() +theme_bw()+
ggtitle("Box Plot")
## Don't know how to automatically pick scale for object of type haven_labelled/vctrs_vctr/double. Defaulting to continuous.
## Warning: Removed 300 rows containing non-finite values (stat_boxplot).
set.seed(1)
ind <- sample(2, nrow(sub), replace = T, prob = c(0.9, 0.1))
train <- sub[ind == 1,]
test <- sub[ind == 2,]
model <- naive_bayes(biden20 ~ ., data = train, usekernel = T)
## Warning: naive_bayes(): y contains NAs. They are excluded from the estimation
## process.
summary(model)
##
## ================================== Naive Bayes ==================================
##
## - Call: naive_bayes.formula(formula = biden20 ~ ., data = train, usekernel = T)
## - Laplace: 0
## - Classes: 2
## - Samples: 7414
## - Features: 2
## - Conditional distributions:
## - Bernoulli: 1
## - KDE: 1
## - Prior probabilities:
## - 0: 0.4489
## - 1: 0.5511
##
## ---------------------------------------------------------------------------------
plot(model)
p1 <- predict(model, train, type = 'prob')
## Warning: predict.naive_bayes(): more features in the newdata are provided as
## there are probability tables in the object. Calculation is performed based on
## features to be found in the tables.
head(na.omit(cbind(p1, train)))
## 0 1 sex biden20 inc
## 6732 0.4266728 0.5733272 1 0 20
## 6737 0.4644322 0.5355678 1 0 16
## 6739 0.4266728 0.5733272 1 1 20
## 6741 0.4444297 0.5555703 1 1 3
## 6743 0.4713807 0.5286193 0 1 10
## 6744 0.4425369 0.5574631 0 1 19
p1 <- predict(model, train)
## Warning: predict.naive_bayes(): more features in the newdata are provided as
## there are probability tables in the object. Calculation is performed based on
## features to be found in the tables.
(tab1 <- table(p1, train$biden20))
##
## p1 0 1
## 0 0 0
## 1 2365 2904
1 - sum(diag(tab1)) / sum(tab1)
## [1] 0.4488518
p2 <- predict(model, test)
## Warning: predict.naive_bayes(): more features in the newdata are provided as
## there are probability tables in the object. Calculation is performed based on
## features to be found in the tables.
(tab2 <- table(p2, test$biden20))
##
## p2 0 1
## 0 0 0
## 1 267 363
1 - sum(diag(tab2)) / sum(tab2)
## [1] 0.4238095
models <- naive_bayes(biden20 ~ sex+inc, data = sub)
## Warning: naive_bayes(): y contains NAs. They are excluded from the estimation
## process.
summary(models)
##
## ================================== Naive Bayes ==================================
##
## - Call: naive_bayes.formula(formula = biden20 ~ sex + inc, data = sub)
## - Laplace: 0
## - Classes: 2
## - Samples: 8280
## - Features: 2
## - Conditional distributions:
## - Bernoulli: 1
## - Gaussian: 1
## - Prior probabilities:
## - 0: 0.4462
## - 1: 0.5538
##
## ---------------------------------------------------------------------------------
plot(models)
# results
p3 <-predict(models, train, type = "prob")
## Warning: predict.naive_bayes(): more features in the newdata are provided as
## there are probability tables in the object. Calculation is performed based on
## features to be found in the tables.
models %class% test
## Warning: predict.naive_bayes(): more features in the newdata are provided as
## there are probability tables in the object. Calculation is performed based on
## features to be found in the tables.
## [1] 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
## [38] 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
## [75] 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
## [112] 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
## [149] 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
## [186] 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
## [223] 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
## [260] 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
## [297] 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
## [334] 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
## [371] 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
## [408] 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
## [445] 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
## [482] 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
## [519] 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
## [556] 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
## [593] 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
## [630] 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
## [667] 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
## [704] 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
## [741] 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
## [778] 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
## [815] 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
## [852] 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
## Levels: 0 1
head(na.omit(cbind(p3, train)))
## 0 1 sex biden20 inc
## 6732 0.4412751 0.5587249 1 0 20
## 6737 0.4552317 0.5447683 1 0 16
## 6739 0.4412751 0.5587249 1 1 20
## 6741 0.4675439 0.5324561 1 1 3
## 6743 0.4467413 0.5532587 0 1 10
## 6744 0.4249066 0.5750934 0 1 19
p3 <- predict(model, test)
## Warning: predict.naive_bayes(): more features in the newdata are provided as
## there are probability tables in the object. Calculation is performed based on
## features to be found in the tables.
(tab2 <- table(p3, test$biden20))
##
## p3 0 1
## 0 0 0
## 1 267 363
1 - sum(diag(tab2)) / sum(tab2)
## [1] 0.4238095
str(sub)
## tibble [8,280 x 3] (S3: tbl_df/tbl/data.frame)
## $ sex : Factor w/ 2 levels "0","1": NA NA NA NA NA NA NA NA NA NA ...
## $ biden20: Factor w/ 2 levels "0","1": NA 1 2 2 1 2 1 NA NA 2 ...
## $ inc : dbl+lbl [1:8280] 21, 13, 17, 7, 22, 3, 4, 3, 10, 11, 9, 18, 1, 2...
## ..@ label : chr "PRE-POST: SUMMARY: Total (family) income"
## ..@ format.stata: chr "%12.0g"
## ..@ labels : Named num [1:24] -9 -5 1 2 3 4 5 6 7 8 ...
## .. ..- attr(*, "names")= chr [1:24] "-9. Refused" "-5. Breakoff, sufficient partial IW" "1. Under $9,999" "2. $10,000-14,999" ...
head(sub)
## # A tibble: 6 x 3
## sex biden20 inc
## <fct> <fct> <dbl+lbl>
## 1 <NA> <NA> 21 [21. $175,000-249,999]
## 2 <NA> 0 13 [13. $70,000-74,999]
## 3 <NA> 1 17 [17. $100,000-109,999]
## 4 <NA> 1 7 [7. $35,000-39,999]
## 5 <NA> 0 22 [22. $250,000 or more]
## 6 <NA> 1 3 [3. $15,000-19,999]
describe(sub)
## vars n mean sd median trimmed mad min max range skew kurtosis
## sex* 1 270 1.45 0.50 1 1.44 0.0 1 2 1 0.19 -1.97
## biden20* 2 5899 1.55 0.50 2 1.57 0.0 1 2 1 -0.22 -1.95
## inc 3 7980 11.73 6.74 12 11.84 8.9 1 22 21 -0.11 -1.27
## se
## sex* 0.03
## biden20* 0.01
## inc 0.08
missmap(sub)
## Warning: Unknown or uninitialised column: `arguments`.
## Warning: Unknown or uninitialised column: `arguments`.
## Warning: Unknown or uninitialised column: `imputations`.
imp_mod <- mice(sub[, c("sex","biden20","inc")], method='rf')
##
## iter imp variable
## 1 1 sex biden20 inc
## 1 2 sex biden20 inc
## 1 3 sex biden20 inc
## 1 4 sex biden20 inc
## 1 5 sex biden20 inc
## 2 1 sex biden20 inc
## 2 2 sex biden20 inc
## 2 3 sex biden20 inc
## 2 4 sex biden20 inc
## 2 5 sex biden20 inc
## 3 1 sex biden20 inc
## 3 2 sex biden20 inc
## 3 3 sex biden20 inc
## 3 4 sex biden20 inc
## 3 5 sex biden20 inc
## 4 1 sex biden20 inc
## 4 2 sex biden20 inc
## 4 3 sex biden20 inc
## 4 4 sex biden20 inc
## 4 5 sex biden20 inc
## 5 1 sex biden20 inc
## 5 2 sex biden20 inc
## 5 3 sex biden20 inc
## 5 4 sex biden20 inc
## 5 5 sex biden20 inc
imp_complete <- complete(imp_mod)
sub$biden20 <- imp_complete$biden20
sub$trump20 <- imp_complete$trump20
sub$sex <- imp_complete$sex
sub$inc <- imp_complete$inc
missmap(sub)
## Warning: Unknown or uninitialised column: `arguments`.
## Warning: Unknown or uninitialised column: `arguments`.
## Warning: Unknown or uninitialised column: `imputations`.
ggplot(sub, aes(inc, colour = biden20)) +
geom_freqpoly(binwidth = 1) + labs(title="inc range by 2020 vote")
## Don't know how to automatically pick scale for object of type haven_labelled/vctrs_vctr/double. Defaulting to continuous.
c <- ggplot(sub, aes(x=inc, fill=biden20, color=biden20)) +
geom_histogram(binwidth = 1) + labs(title="inc Distribution by Biden vote 2020")
c + theme_bw()
## Don't know how to automatically pick scale for object of type haven_labelled/vctrs_vctr/double. Defaulting to continuous.
barchart(sub$sex)
barchart(sub$biden20)
subxTrain <- createDataPartition(y = sub$biden20,p = 0.90,list = FALSE)
training <- sub[subxTrain,]
testing <- sub[-subxTrain,]
prop.table(table(sub$biden20)) * 100
##
## 0 1
## 45.02415 54.97585
prop.table(table(training$biden20)) * 100
##
## 0 1
## 45.02885 54.97115
prop.table(table(testing$biden20)) * 100
##
## 0 1
## 44.98186 55.01814
x = training[,-3]
y = training$biden20
model = train(training,training$biden20,'nb',trControl=trainControl(method='cv',number=4))
## Warning: Setting row names on a tibble is deprecated.
## Warning: Setting row names on a tibble is deprecated.
## Warning: Setting row names on a tibble is deprecated.
## Warning: Setting row names on a tibble is deprecated.
## Warning: Setting row names on a tibble is deprecated.
## Warning: Setting row names on a tibble is deprecated.
## Warning: Setting row names on a tibble is deprecated.
## Warning: Setting row names on a tibble is deprecated.
## Warning: Setting row names on a tibble is deprecated.
model
## Naive Bayes
##
## 7453 samples
## 3 predictor
## 2 classes: '0', '1'
##
## No pre-processing
## Resampling: Cross-Validated (4 fold)
## Summary of sample sizes: 5590, 5589, 5590, 5590
## Resampling results across tuning parameters:
##
## usekernel Accuracy Kappa
## FALSE 1 1
## TRUE 1 1
##
## Tuning parameter 'fL' was held constant at a value of 0
## Tuning
## parameter 'adjust' was held constant at a value of 1
## Accuracy was used to select the optimal model using the largest value.
## The final values used for the model were fL = 0, usekernel = FALSE and adjust
## = 1.
Predict <- predict(model,newdata = testing )
confusionMatrix(Predict, testing$biden20 )
## Confusion Matrix and Statistics
##
## Reference
## Prediction 0 1
## 0 372 0
## 1 0 455
##
## Accuracy : 1
## 95% CI : (0.9955, 1)
## No Information Rate : 0.5502
## P-Value [Acc > NIR] : < 2.2e-16
##
## Kappa : 1
##
## Mcnemar's Test P-Value : NA
##
## Sensitivity : 1.0000
## Specificity : 1.0000
## Pos Pred Value : 1.0000
## Neg Pred Value : 1.0000
## Prevalence : 0.4498
## Detection Rate : 0.4498
## Detection Prevalence : 0.4498
## Balanced Accuracy : 1.0000
##
## 'Positive' Class : 0
##
Naive Bayes classifier that can predict whether a person identified as a Biden voter in 2020, with an accuracy of approximately 100% - as a caveat these data may not be the most appropriate.
roc_imp.training <- filterVarImp(x = training[, -ncol(training)], y = training$biden20)
head(roc_imp.training)
## X0 X1
## sex 0.503603 0.503603
## biden20 1.000000 1.000000
roc_imp.test <- filterVarImp(x = testing[, -ncol(testing)], y = testing$biden20)
head(roc_imp.test)
## X0 X1
## sex 0.5002865 0.5002865
## biden20 1.0000000 1.0000000