library(car)
## Loading required package: carData
library(stargazer)
##
## Please cite as:
## Hlavac, Marek (2018). stargazer: Well-Formatted Regression and Summary Statistics Tables.
## R package version 5.2.2. https://CRAN.R-project.org/package=stargazer
library(survey)
## Loading required package: grid
## Loading required package: Matrix
## Loading required package: survival
##
## Attaching package: 'survey'
## The following object is masked from 'package:graphics':
##
## dotchart
library(questionr)
library(dplyr)
##
## Attaching package: 'dplyr'
## The following object is masked from 'package:car':
##
## recode
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(tidyverse)
## -- Attaching packages --------------------------------------- tidyverse 1.3.0 --
## v ggplot2 3.3.3 v purrr 0.3.4
## v tibble 3.1.0 v stringr 1.4.0
## v tidyr 1.1.2 v forcats 0.5.1
## v readr 1.4.0
## -- Conflicts ------------------------------------------ tidyverse_conflicts() --
## x tidyr::expand() masks Matrix::expand()
## x dplyr::filter() masks stats::filter()
## x dplyr::lag() masks stats::lag()
## x tidyr::pack() masks Matrix::pack()
## x dplyr::recode() masks car::recode()
## x purrr::some() masks car::some()
## x tidyr::unpack() masks Matrix::unpack()
library(haven)
library(readr)
library(tableone, quietly = T)
library(ggplot2, quietly = T)
library(caret, quietly = T)
##
## Attaching package: 'caret'
## The following object is masked from 'package:purrr':
##
## lift
## The following object is masked from 'package:survival':
##
## cluster
GSS2018 <- read_csv("C:/Users/chris/OneDrive/Desktop/GSS2018/GSS2018.csv")
##
## -- Column specification --------------------------------------------------------
## cols(
## .default = col_double()
## )
## i Use `spec()` for the full column specifications.
# Work on names of variables
nams<- names(GSS2018)
newnames<-tolower(gsub(pattern = "_",replacement = "",x = nams))
names(GSS2018)<-newnames
head(GSS2018)
## # A tibble: 6 x 1,067
## year id wrkstat hrs1 hrs2 evwork wrkslf wrkgovt occ10 prestg10
## <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 2018 1 3 -1 41 0 2 2 630 47
## 2 2018 2 5 -1 -1 1 2 2 9640 22
## 3 2018 3 1 40 -1 0 2 2 1106 61
## 4 2018 4 1 40 -1 0 2 2 3320 59
## 5 2018 5 5 -1 -1 1 2 2 136 53
## 6 2018 6 5 -1 -1 1 2 2 120 53
## # ... with 1,057 more variables: prestg105plus <dbl>, indus10 <dbl>,
## # marital <dbl>, martype <dbl>, divorce <dbl>, widowed <dbl>, spwrksta <dbl>,
## # sphrs1 <dbl>, sphrs2 <dbl>, spevwork <dbl>, cowrksta <dbl>, cowrkslf <dbl>,
## # coevwork <dbl>, cohrs1 <dbl>, cohrs2 <dbl>, spwrkslf <dbl>, spocc10 <dbl>,
## # sppres10 <dbl>, sppres105plus <dbl>, spind10 <dbl>, coocc10 <dbl>,
## # coind10 <dbl>, pawrkslf <dbl>, paocc10 <dbl>, papres10 <dbl>,
## # papres105plus <dbl>, paind10 <dbl>, mawrkslf <dbl>, maocc10 <dbl>,
## # mapres10 <dbl>, mapres105plus <dbl>, maind10 <dbl>, sibs <dbl>,
## # childs <dbl>, age <dbl>, agekdbrn <dbl>, educ <dbl>, paeduc <dbl>,
## # maeduc <dbl>, speduc <dbl>, coeduc <dbl>, codeg <dbl>, degree <dbl>,
## # padeg <dbl>, madeg <dbl>, spdeg <dbl>, major1 <dbl>, major2 <dbl>,
## # dipged <dbl>, sex <dbl>, race <dbl>, res16 <dbl>, reg16 <dbl>,
## # mobile16 <dbl>, family16 <dbl>, famdif16 <dbl>, mawrkgrw <dbl>,
## # incom16 <dbl>, born <dbl>, parborn <dbl>, granborn <dbl>, hompop <dbl>,
## # babies <dbl>, preteen <dbl>, teens <dbl>, adults <dbl>, unrelat <dbl>,
## # earnrs <dbl>, income <dbl>, rincome <dbl>, income16 <dbl>, rincom16 <dbl>,
## # region <dbl>, xnorcsiz <dbl>, srcbelt <dbl>, size <dbl>, partyid <dbl>,
## # vote12 <dbl>, pres12 <dbl>, if12who <dbl>, vote16 <dbl>, pres16 <dbl>,
## # if16who <dbl>, polviews <dbl>, natspac <dbl>, natenvir <dbl>,
## # natheal <dbl>, natcity <dbl>, natcrime <dbl>, natdrug <dbl>, nateduc <dbl>,
## # natrace <dbl>, natarms <dbl>, nataid <dbl>, natfare <dbl>, natroad <dbl>,
## # natsoc <dbl>, natmass <dbl>, natpark <dbl>, natchld <dbl>, ...
View(GSS2018)
I will use US presidential candidate voted for in 2016 election (Clinton or Trump) as my outcome.
# Sex
GSS2018$male<-as.factor(ifelse(GSS2018$sex==1, "Male", "Female"))
#race/ethnicity
GSS2018$black<-Recode(GSS2018$race, recodes="2=1; else=0")
GSS2018$white<-Recode(GSS2018$race, recodes="1=1; else=0")
GSS2018$other<-Recode(GSS2018$race, recodes="3=1; else=0")
GSS2018$race_eth<-Recode(GSS2018$race, recodes="1='nhwhite'; 2='nhblack'; 3='nhother'", as.factor = T)
#education level
GSS2018$educ<-Recode(GSS2018$educ, recodes="1:4='0Prim'; 5:10='1somehs'; 11:12='2hsgrad'; 13:14='3somecol'; 15:20='4colgrad';else=NA", as.factor=T)
#marital status
GSS2018$marst<-Recode(GSS2018$marital, recodes="1='married'; 3='divorced'; 2='widowed'; 4='separated'; 5='nm'; else=NA", as.factor=T)
# Political affiliation
GSS2018$partyaff<- Recode(GSS2018$partyid, recode= "0=0; 6= 1; else=NA", as.factor=T)
# Political views
GSS2018$polviews<- Recode(GSS2018$polviews, recode= "1:3= 'Liberal'; 5:7= 'Conservative'; else=NA", as.factor=T)
# Presidential candidate voted for in 2016 election
GSS2018$vote<- Recode(GSS2018$pres16, recode= "1=0; 2=1; else=NA", as.factor=T)
options(survey.lonely.psu = "adjust")
des<-svydesign(ids= ~1, strata= ~vstrat, weights= ~wtssnr, data = GSS2018 )
fit.logit<-svyglm(vote ~ race_eth + polviews, design = des, family = binomial)
## Warning in eval(family$initialize): non-integer #successes in a binomial glm!
summary(fit.logit)
##
## Call:
## svyglm(formula = vote ~ race_eth + polviews, design = des, family = binomial)
##
## Survey design:
## svydesign(ids = ~1, strata = ~vstrat, weights = ~wtssnr, data = GSS2018)
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -2.4218 0.5622 -4.307 1.85e-05 ***
## race_ethnhother 3.3188 0.7596 4.369 1.41e-05 ***
## race_ethnhwhite 4.5409 0.6017 7.546 1.20e-13 ***
## polviewsLiberal -3.9015 0.2709 -14.400 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1.695266)
##
## Number of Fisher Scoring iterations: 6
set.seed(1115)
train<-createDataPartition(y = GSS2018$pres16, p = .80, list=F)
GSS2018train<- GSS2018[train,]
GSS2018test<- GSS2018[-train,]
table(GSS2018train$pres16)
##
## 0 1 2 3 4 8 9
## 651 609 462 70 17 27 43
wat<- GSS2018 %>%
select(vote, educ, male, black, white, other, marst, partyid, polviews, pres16, partyid) %>%
mutate(educ=as.numeric(educ), vote= as.factor(ifelse(vote=="Clinton", 1, ifelse(vote=="Trump", 0, NA))))
library(caret)
set.seed(1115)
train<-createDataPartition(wat$pres16, p = .80, list=F)
wattrain<-wat[train,]
wattest<-wat[-train,]
table(wattrain$pres16)
##
## 0 1 2 3 4 8 9
## 651 609 462 70 17 27 43
prop.table(table(wattrain$pres16))
##
## 0 1 2 3 4 8
## 0.346460883 0.324108568 0.245875466 0.037253858 0.009047366 0.014369345
## 9
## 0.022884513
summary(wattrain)
## vote educ male black white
## NA's:1879 Min. :1.000 Female:1035 Min. :0.0000 Min. :0.0000
## 1st Qu.:3.000 Male : 844 1st Qu.:0.0000 1st Qu.:0.0000
## Median :4.000 Median :0.0000 Median :1.0000
## Mean :3.879 Mean :0.1676 Mean :0.7174
## 3rd Qu.:5.000 3rd Qu.:0.0000 3rd Qu.:1.0000
## Max. :5.000 Max. :1.0000 Max. :1.0000
## NA's :7
## other marst partyid polviews
## Min. :0.000 divorced :319 Min. :0.000 Conservative:589
## 1st Qu.:0.000 married :806 1st Qu.:1.000 Liberal :538
## Median :0.000 nm :531 Median :3.000 NA's :752
## Mean :0.115 separated: 63 Mean :2.982
## 3rd Qu.:0.000 widowed :159 3rd Qu.:5.000
## Max. :1.000 NA's : 1 Max. :9.000
##
## pres16
## Min. :0.000
## 1st Qu.:0.000
## Median :1.000
## Mean :1.285
## 3rd Qu.:2.000
## Max. :9.000
##
glm1<-glm(as.factor(pres16)~factor(educ)+factor(marst)+factor(polviews)+factor(male)+factor(partyid),
data=wattrain[,-1],
family = binomial)
summary(glm1)
##
## Call:
## glm(formula = as.factor(pres16) ~ factor(educ) + factor(marst) +
## factor(polviews) + factor(male) + factor(partyid), family = binomial,
## data = wattrain[, -1])
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -2.6184 -0.6776 0.5011 0.7563 2.1479
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) 0.00202 0.90025 0.002 0.998210
## factor(educ)2 0.96579 0.88413 1.092 0.274674
## factor(educ)3 1.94323 0.85288 2.278 0.022701 *
## factor(educ)4 2.18965 0.85653 2.556 0.010575 *
## factor(educ)5 3.07658 0.85036 3.618 0.000297 ***
## factor(marst)married -0.32922 0.23195 -1.419 0.155783
## factor(marst)nm -1.11780 0.23976 -4.662 3.13e-06 ***
## factor(marst)separated -0.25940 0.45989 -0.564 0.572724
## factor(marst)widowed 0.11368 0.34600 0.329 0.742496
## factor(polviews)Liberal -0.16616 0.18787 -0.884 0.376461
## factor(male)Male 0.24992 0.15374 1.626 0.104041
## factor(partyid)1 -1.12222 0.27060 -4.147 3.37e-05 ***
## factor(partyid)2 -1.28817 0.27892 -4.618 3.87e-06 ***
## factor(partyid)3 -2.67415 0.31502 -8.489 < 2e-16 ***
## factor(partyid)4 -1.37196 0.31221 -4.394 1.11e-05 ***
## factor(partyid)5 -0.86037 0.31599 -2.723 0.006473 **
## factor(partyid)6 -0.04729 0.33646 -0.141 0.888227
## factor(partyid)7 -0.94516 0.44900 -2.105 0.035287 *
## factor(partyid)9 -1.71126 1.02109 -1.676 0.093756 .
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 1353.0 on 1122 degrees of freedom
## Residual deviance: 1085.4 on 1104 degrees of freedom
## (756 observations deleted due to missingness)
## AIC: 1123.4
##
## Number of Fisher Scoring iterations: 4
tr_pred<- predict(glm1,
newdata = wattrain,
type = "response")
head(tr_pred)
## 1 2 3 4 5 6
## 0.6138245 0.8358041 NA 0.9637792 0.8504264 0.5540377
tr_predcl<- as.factor(ifelse(tr_pred>.5, 1, 0))
library(ggplot2)
pred1<-data.frame(pr=tr_pred, gr=tr_predcl, modcon=wattrain$pres16)
pred1%>%
ggplot()+
geom_histogram(aes(x=pr, color=gr, fill=gr))+
ggtitle(label = "Probability of voting for a candidate", subtitle = "Threshold = .5")+
geom_vline(xintercept=.5)
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## Warning: Removed 756 rows containing non-finite values (stat_bin).
pred1%>%
ggplot()+
geom_histogram(aes(x=pr, color=modcon, fill=modcon))+
ggtitle(label = "Probability of voting for a candidate", subtitle = "Truth")+
geom_vline(xintercept=.5)
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## Warning: Removed 756 rows containing non-finite values (stat_bin).
table( tr_predcl,wattrain$pres16)
##
## tr_predcl 0 1 2 3 4 8 9
## 0 140 35 12 4 0 1 2
## 1 186 343 330 35 8 13 14
wat$pres16=as.factor(wat$pres16)
cm1<-confusionMatrix(tr_predcl, as.factor(wattrain$pres16) )
## Warning in levels(reference) != levels(data): longer object length is not a
## multiple of shorter object length
## Warning in confusionMatrix.default(tr_predcl, as.factor(wattrain$pres16)):
## Levels are not in the same order for reference and data. Refactoring data to
## match.
cm1
## Confusion Matrix and Statistics
##
## Reference
## Prediction 0 1 2 3 4 8 9
## 0 140 35 12 4 0 1 2
## 1 186 343 330 35 8 13 14
## 2 0 0 0 0 0 0 0
## 3 0 0 0 0 0 0 0
## 4 0 0 0 0 0 0 0
## 8 0 0 0 0 0 0 0
## 9 0 0 0 0 0 0 0
##
## Overall Statistics
##
## Accuracy : 0.4301
## 95% CI : (0.4009, 0.4597)
## No Information Rate : 0.3366
## P-Value [Acc > NIR] : 4.617e-11
##
## Kappa : 0.1512
##
## Mcnemar's Test P-Value : NA
##
## Statistics by Class:
##
## Class: 0 Class: 1 Class: 2 Class: 3 Class: 4 Class: 8
## Sensitivity 0.4294 0.9074 0.0000 0.00000 0.000000 0.00000
## Specificity 0.9322 0.2134 1.0000 1.00000 1.000000 1.00000
## Pos Pred Value 0.7216 0.3692 NaN NaN NaN NaN
## Neg Pred Value 0.7998 0.8196 0.6955 0.96527 0.992876 0.98753
## Prevalence 0.2903 0.3366 0.3045 0.03473 0.007124 0.01247
## Detection Rate 0.1247 0.3054 0.0000 0.00000 0.000000 0.00000
## Detection Prevalence 0.1728 0.8272 0.0000 0.00000 0.000000 0.00000
## Balanced Accuracy 0.6808 0.5604 0.5000 0.50000 0.500000 0.50000
## Class: 9
## Sensitivity 0.00000
## Specificity 1.00000
## Pos Pred Value NaN
## Neg Pred Value 0.98575
## Prevalence 0.01425
## Detection Rate 0.00000
## Detection Prevalence 0.00000
## Balanced Accuracy 0.50000
pred_test<-predict(glm1,
newdata=wattest,
type="response")
pred_cl<-factor(ifelse(pred_test > mean( I(wattest$pres16==1)), 1, 0))
table(wattest$pres16,pred_cl)
## pred_cl
## 0 1
## 0 18 59
## 1 2 89
## 2 2 79
## 3 1 7
## 4 0 1
## 8 1 3
## 9 2 1
confusionMatrix(data = pred_cl,as.factor(wattest$pres16) )
## Warning in levels(reference) != levels(data): longer object length is not a
## multiple of shorter object length
## Warning in confusionMatrix.default(data = pred_cl, as.factor(wattest$pres16)):
## Levels are not in the same order for reference and data. Refactoring data to
## match.
## Confusion Matrix and Statistics
##
## Reference
## Prediction 0 1 2 3 4 8 9
## 0 18 2 2 1 0 1 2
## 1 59 89 79 7 1 3 1
## 2 0 0 0 0 0 0 0
## 3 0 0 0 0 0 0 0
## 4 0 0 0 0 0 0 0
## 8 0 0 0 0 0 0 0
## 9 0 0 0 0 0 0 0
##
## Overall Statistics
##
## Accuracy : 0.4038
## 95% CI : (0.3442, 0.4655)
## No Information Rate : 0.3434
## P-Value [Acc > NIR] : 0.02344
##
## Kappa : 0.0991
##
## Mcnemar's Test P-Value : NA
##
## Statistics by Class:
##
## Class: 0 Class: 1 Class: 2 Class: 3 Class: 4 Class: 8
## Sensitivity 0.23377 0.9780 0.0000 0.00000 0.000000 0.00000
## Specificity 0.95745 0.1379 1.0000 1.00000 1.000000 1.00000
## Pos Pred Value 0.69231 0.3724 NaN NaN NaN NaN
## Neg Pred Value 0.75314 0.9231 0.6943 0.96981 0.996226 0.98491
## Prevalence 0.29057 0.3434 0.3057 0.03019 0.003774 0.01509
## Detection Rate 0.06792 0.3358 0.0000 0.00000 0.000000 0.00000
## Detection Prevalence 0.09811 0.9019 0.0000 0.00000 0.000000 0.00000
## Balanced Accuracy 0.59561 0.5580 0.5000 0.50000 0.500000 0.50000
## Class: 9
## Sensitivity 0.00000
## Specificity 1.00000
## Pos Pred Value NaN
## Neg Pred Value 0.98868
## Prevalence 0.01132
## Detection Rate 0.00000
## Detection Prevalence 0.00000
## Balanced Accuracy 0.50000
tr_predcl<-factor(ifelse(tr_pred>mean(I(wattrain$pres16==1)), 1, 0)) #mean of response
pred2<-data.frame(pr=tr_pred, gr=tr_predcl, modcon=wattrain$pres16)
pred2%>%
ggplot(aes(x=pr, fill=gr))+
geom_histogram(position="identity", alpha=.2)+
ggtitle(label = "Probability of Voting", subtitle = "Threshold = Mean")+
geom_vline(xintercept=mean(I(wattrain$pres16==1)))
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## Warning: Removed 756 rows containing non-finite values (stat_bin).
pred2%>%
ggplot(aes(x=pr, fill=modcon))+
geom_histogram(position="identity", alpha=.2)+
ggtitle(label = "Probability of Voting", subtitle = "Truth")+
geom_vline(xintercept=mean(I(wattrain$pres16==1)))
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## Warning: Removed 756 rows containing non-finite values (stat_bin).
confusionMatrix(tr_predcl,
as.factor(wattrain$pres16),
positive = "1" )
## Warning in levels(reference) != levels(data): longer object length is not a
## multiple of shorter object length
## Warning in confusionMatrix.default(tr_predcl, as.factor(wattrain$pres16), :
## Levels are not in the same order for reference and data. Refactoring data to
## match.
## Confusion Matrix and Statistics
##
## Reference
## Prediction 0 1 2 3 4 8 9
## 0 72 11 1 2 0 0 2
## 1 254 367 341 37 8 14 14
## 2 0 0 0 0 0 0 0
## 3 0 0 0 0 0 0 0
## 4 0 0 0 0 0 0 0
## 8 0 0 0 0 0 0 0
## 9 0 0 0 0 0 0 0
##
## Overall Statistics
##
## Accuracy : 0.3909
## 95% CI : (0.3623, 0.4202)
## No Information Rate : 0.3366
## P-Value [Acc > NIR] : 7.878e-05
##
## Kappa : 0.0869
##
## Mcnemar's Test P-Value : NA
##
## Statistics by Class:
##
## Class: 0 Class: 1 Class: 2 Class: 3 Class: 4 Class: 8
## Sensitivity 0.22086 0.9709 0.0000 0.00000 0.000000 0.00000
## Specificity 0.97992 0.1034 1.0000 1.00000 1.000000 1.00000
## Pos Pred Value 0.81818 0.3546 NaN NaN NaN NaN
## Neg Pred Value 0.75459 0.8750 0.6955 0.96527 0.992876 0.98753
## Prevalence 0.29029 0.3366 0.3045 0.03473 0.007124 0.01247
## Detection Rate 0.06411 0.3268 0.0000 0.00000 0.000000 0.00000
## Detection Prevalence 0.07836 0.9216 0.0000 0.00000 0.000000 0.00000
## Balanced Accuracy 0.60039 0.5371 0.5000 0.50000 0.500000 0.50000
## Class: 9
## Sensitivity 0.00000
## Specificity 1.00000
## Pos Pred Value NaN
## Neg Pred Value 0.98575
## Prevalence 0.01425
## Detection Rate 0.00000
## Detection Prevalence 0.00000
## Balanced Accuracy 0.50000
pred_test<-predict(glm1,
newdata=wattest,
type="response")
pred_cl<-factor(ifelse(pred_test > mean( I(wattest$pres16==1)), 1, 0))
table(wattest$pres16,pred_cl)
## pred_cl
## 0 1
## 0 18 59
## 1 2 89
## 2 2 79
## 3 1 7
## 4 0 1
## 8 1 3
## 9 2 1
confusionMatrix(data = pred_cl,as.factor(wattest$pres16))
## Warning in levels(reference) != levels(data): longer object length is not a
## multiple of shorter object length
## Warning in confusionMatrix.default(data = pred_cl, as.factor(wattest$pres16)):
## Levels are not in the same order for reference and data. Refactoring data to
## match.
## Confusion Matrix and Statistics
##
## Reference
## Prediction 0 1 2 3 4 8 9
## 0 18 2 2 1 0 1 2
## 1 59 89 79 7 1 3 1
## 2 0 0 0 0 0 0 0
## 3 0 0 0 0 0 0 0
## 4 0 0 0 0 0 0 0
## 8 0 0 0 0 0 0 0
## 9 0 0 0 0 0 0 0
##
## Overall Statistics
##
## Accuracy : 0.4038
## 95% CI : (0.3442, 0.4655)
## No Information Rate : 0.3434
## P-Value [Acc > NIR] : 0.02344
##
## Kappa : 0.0991
##
## Mcnemar's Test P-Value : NA
##
## Statistics by Class:
##
## Class: 0 Class: 1 Class: 2 Class: 3 Class: 4 Class: 8
## Sensitivity 0.23377 0.9780 0.0000 0.00000 0.000000 0.00000
## Specificity 0.95745 0.1379 1.0000 1.00000 1.000000 1.00000
## Pos Pred Value 0.69231 0.3724 NaN NaN NaN NaN
## Neg Pred Value 0.75314 0.9231 0.6943 0.96981 0.996226 0.98491
## Prevalence 0.29057 0.3434 0.3057 0.03019 0.003774 0.01509
## Detection Rate 0.06792 0.3358 0.0000 0.00000 0.000000 0.00000
## Detection Prevalence 0.09811 0.9019 0.0000 0.00000 0.000000 0.00000
## Balanced Accuracy 0.59561 0.5580 0.5000 0.50000 0.500000 0.50000
## Class: 9
## Sensitivity 0.00000
## Specificity 1.00000
## Pos Pred Value NaN
## Neg Pred Value 0.98868
## Prevalence 0.01132
## Detection Rate 0.00000
## Detection Prevalence 0.00000
## Balanced Accuracy 0.50000