Load libraries

library(car)

## Loading required package: carData

library(stargazer)

## 
## Please cite as:

##  Hlavac, Marek (2018). stargazer: Well-Formatted Regression and Summary Statistics Tables.

##  R package version 5.2.2. https://CRAN.R-project.org/package=stargazer

library(survey)

## Loading required package: grid

## Loading required package: Matrix

## Loading required package: survival

## 
## Attaching package: 'survey'

## The following object is masked from 'package:graphics':
## 
##     dotchart

library(questionr)
library(dplyr)

## 
## Attaching package: 'dplyr'

## The following object is masked from 'package:car':
## 
##     recode

## The following objects are masked from 'package:stats':
## 
##     filter, lag

## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union

library(tidyverse)

## -- Attaching packages --------------------------------------- tidyverse 1.3.0 --

## v ggplot2 3.3.3     v purrr   0.3.4
## v tibble  3.1.0     v stringr 1.4.0
## v tidyr   1.1.2     v forcats 0.5.1
## v readr   1.4.0

## -- Conflicts ------------------------------------------ tidyverse_conflicts() --
## x tidyr::expand() masks Matrix::expand()
## x dplyr::filter() masks stats::filter()
## x dplyr::lag()    masks stats::lag()
## x tidyr::pack()   masks Matrix::pack()
## x dplyr::recode() masks car::recode()
## x purrr::some()   masks car::some()
## x tidyr::unpack() masks Matrix::unpack()

library(haven)
library(readr)
library(tableone, quietly = T)
library(ggplot2, quietly = T)
library(caret, quietly = T)

## 
## Attaching package: 'caret'

## The following object is masked from 'package:purrr':
## 
##     lift

## The following object is masked from 'package:survival':
## 
##     cluster

Load data

GSS2018 <- read_csv("C:/Users/chris/OneDrive/Desktop/GSS2018/GSS2018.csv")

## 
## -- Column specification --------------------------------------------------------
## cols(
##   .default = col_double()
## )
## i Use `spec()` for the full column specifications.

# Work on names of variables
nams<- names(GSS2018)
newnames<-tolower(gsub(pattern = "_",replacement =  "",x =  nams))
names(GSS2018)<-newnames
head(GSS2018)

## # A tibble: 6 x 1,067
##    year    id wrkstat  hrs1  hrs2 evwork wrkslf wrkgovt occ10 prestg10
##   <dbl> <dbl>   <dbl> <dbl> <dbl>  <dbl>  <dbl>   <dbl> <dbl>    <dbl>
## 1  2018     1       3    -1    41      0      2       2   630       47
## 2  2018     2       5    -1    -1      1      2       2  9640       22
## 3  2018     3       1    40    -1      0      2       2  1106       61
## 4  2018     4       1    40    -1      0      2       2  3320       59
## 5  2018     5       5    -1    -1      1      2       2   136       53
## 6  2018     6       5    -1    -1      1      2       2   120       53
## # ... with 1,057 more variables: prestg105plus <dbl>, indus10 <dbl>,
## #   marital <dbl>, martype <dbl>, divorce <dbl>, widowed <dbl>, spwrksta <dbl>,
## #   sphrs1 <dbl>, sphrs2 <dbl>, spevwork <dbl>, cowrksta <dbl>, cowrkslf <dbl>,
## #   coevwork <dbl>, cohrs1 <dbl>, cohrs2 <dbl>, spwrkslf <dbl>, spocc10 <dbl>,
## #   sppres10 <dbl>, sppres105plus <dbl>, spind10 <dbl>, coocc10 <dbl>,
## #   coind10 <dbl>, pawrkslf <dbl>, paocc10 <dbl>, papres10 <dbl>,
## #   papres105plus <dbl>, paind10 <dbl>, mawrkslf <dbl>, maocc10 <dbl>,
## #   mapres10 <dbl>, mapres105plus <dbl>, maind10 <dbl>, sibs <dbl>,
## #   childs <dbl>, age <dbl>, agekdbrn <dbl>, educ <dbl>, paeduc <dbl>,
## #   maeduc <dbl>, speduc <dbl>, coeduc <dbl>, codeg <dbl>, degree <dbl>,
## #   padeg <dbl>, madeg <dbl>, spdeg <dbl>, major1 <dbl>, major2 <dbl>,
## #   dipged <dbl>, sex <dbl>, race <dbl>, res16 <dbl>, reg16 <dbl>,
## #   mobile16 <dbl>, family16 <dbl>, famdif16 <dbl>, mawrkgrw <dbl>,
## #   incom16 <dbl>, born <dbl>, parborn <dbl>, granborn <dbl>, hompop <dbl>,
## #   babies <dbl>, preteen <dbl>, teens <dbl>, adults <dbl>, unrelat <dbl>,
## #   earnrs <dbl>, income <dbl>, rincome <dbl>, income16 <dbl>, rincom16 <dbl>,
## #   region <dbl>, xnorcsiz <dbl>, srcbelt <dbl>, size <dbl>, partyid <dbl>,
## #   vote12 <dbl>, pres12 <dbl>, if12who <dbl>, vote16 <dbl>, pres16 <dbl>,
## #   if16who <dbl>, polviews <dbl>, natspac <dbl>, natenvir <dbl>,
## #   natheal <dbl>, natcity <dbl>, natcrime <dbl>, natdrug <dbl>, nateduc <dbl>,
## #   natrace <dbl>, natarms <dbl>, nataid <dbl>, natfare <dbl>, natroad <dbl>,
## #   natsoc <dbl>, natmass <dbl>, natpark <dbl>, natchld <dbl>, ...

View(GSS2018)

1) Define a binary outcome of your choosing

I will use US presidential candidate voted for in 2016 election (Clinton or Trump) as my outcome.

2) Fit a predictive logistic regression model using as many predictor variables as you think you need.

Recode Variables

# Sex
GSS2018$male<-as.factor(ifelse(GSS2018$sex==1, "Male", "Female"))


#race/ethnicity
GSS2018$black<-Recode(GSS2018$race, recodes="2=1; else=0")
GSS2018$white<-Recode(GSS2018$race, recodes="1=1; else=0")
GSS2018$other<-Recode(GSS2018$race, recodes="3=1; else=0")


GSS2018$race_eth<-Recode(GSS2018$race, recodes="1='nhwhite'; 2='nhblack'; 3='nhother'", as.factor = T)


#education level
GSS2018$educ<-Recode(GSS2018$educ, recodes="1:4='0Prim'; 5:10='1somehs'; 11:12='2hsgrad'; 13:14='3somecol'; 15:20='4colgrad';else=NA", as.factor=T)


#marital status
GSS2018$marst<-Recode(GSS2018$marital, recodes="1='married'; 3='divorced'; 2='widowed'; 4='separated'; 5='nm'; else=NA", as.factor=T)

# Political affiliation
GSS2018$partyaff<- Recode(GSS2018$partyid, recode= "0=0; 6= 1; else=NA", as.factor=T)

# Political views
GSS2018$polviews<- Recode(GSS2018$polviews, recode= "1:3= 'Liberal'; 5:7= 'Conservative'; else=NA", as.factor=T)

# Presidential candidate voted for in 2016 election
GSS2018$vote<- Recode(GSS2018$pres16, recode= "1=0; 2=1; else=NA", as.factor=T)

Survey Design

options(survey.lonely.psu = "adjust")
des<-svydesign(ids= ~1, strata= ~vstrat, weights= ~wtssnr, data = GSS2018 )

Logit model

fit.logit<-svyglm(vote ~ race_eth + polviews, design = des, family = binomial)

## Warning in eval(family$initialize): non-integer #successes in a binomial glm!

summary(fit.logit)

## 
## Call:
## svyglm(formula = vote ~ race_eth + polviews, design = des, family = binomial)
## 
## Survey design:
## svydesign(ids = ~1, strata = ~vstrat, weights = ~wtssnr, data = GSS2018)
## 
## Coefficients:
##                 Estimate Std. Error t value Pr(>|t|)    
## (Intercept)      -2.4218     0.5622  -4.307 1.85e-05 ***
## race_ethnhother   3.3188     0.7596   4.369 1.41e-05 ***
## race_ethnhwhite   4.5409     0.6017   7.546 1.20e-13 ***
## polviewsLiberal  -3.9015     0.2709 -14.400  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1.695266)
## 
## Number of Fisher Scoring iterations: 6

Creating training and test data set

set.seed(1115)

train<-createDataPartition(y = GSS2018$pres16, p = .80, list=F)
                           

GSS2018train<- GSS2018[train,]
GSS2018test<- GSS2018[-train,]

table(GSS2018train$pres16)

## 
##   0   1   2   3   4   8   9 
## 651 609 462  70  17  27  43

wat<- GSS2018 %>% 
  select(vote, educ, male, black, white, other, marst, partyid, polviews, pres16, partyid) %>% 
  mutate(educ=as.numeric(educ), vote= as.factor(ifelse(vote=="Clinton", 1, ifelse(vote=="Trump", 0, NA))))

library(caret)

set.seed(1115)

train<-createDataPartition(wat$pres16, p = .80, list=F)
                           

wattrain<-wat[train,]
wattest<-wat[-train,]

table(wattrain$pres16)

## 
##   0   1   2   3   4   8   9 
## 651 609 462  70  17  27  43

prop.table(table(wattrain$pres16))

## 
##           0           1           2           3           4           8 
## 0.346460883 0.324108568 0.245875466 0.037253858 0.009047366 0.014369345 
##           9 
## 0.022884513

summary(wattrain)

##    vote           educ           male          black            white       
##  NA's:1879   Min.   :1.000   Female:1035   Min.   :0.0000   Min.   :0.0000  
##              1st Qu.:3.000   Male  : 844   1st Qu.:0.0000   1st Qu.:0.0000  
##              Median :4.000                 Median :0.0000   Median :1.0000  
##              Mean   :3.879                 Mean   :0.1676   Mean   :0.7174  
##              3rd Qu.:5.000                 3rd Qu.:0.0000   3rd Qu.:1.0000  
##              Max.   :5.000                 Max.   :1.0000   Max.   :1.0000  
##              NA's   :7                                                      
##      other             marst        partyid              polviews  
##  Min.   :0.000   divorced :319   Min.   :0.000   Conservative:589  
##  1st Qu.:0.000   married  :806   1st Qu.:1.000   Liberal     :538  
##  Median :0.000   nm       :531   Median :3.000   NA's        :752  
##  Mean   :0.115   separated: 63   Mean   :2.982                     
##  3rd Qu.:0.000   widowed  :159   3rd Qu.:5.000                     
##  Max.   :1.000   NA's     :  1   Max.   :9.000                     
##                                                                    
##      pres16     
##  Min.   :0.000  
##  1st Qu.:0.000  
##  Median :1.000  
##  Mean   :1.285  
##  3rd Qu.:2.000  
##  Max.   :9.000  
##

3) Report the % correct classification from the training data using the .5 decision rule and again useing the mean of the outcome decision rule

glm1<-glm(as.factor(pres16)~factor(educ)+factor(marst)+factor(polviews)+factor(male)+factor(partyid),
          data=wattrain[,-1],
          family = binomial)
summary(glm1)

## 
## Call:
## glm(formula = as.factor(pres16) ~ factor(educ) + factor(marst) + 
##     factor(polviews) + factor(male) + factor(partyid), family = binomial, 
##     data = wattrain[, -1])
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -2.6184  -0.6776   0.5011   0.7563   2.1479  
## 
## Coefficients:
##                         Estimate Std. Error z value Pr(>|z|)    
## (Intercept)              0.00202    0.90025   0.002 0.998210    
## factor(educ)2            0.96579    0.88413   1.092 0.274674    
## factor(educ)3            1.94323    0.85288   2.278 0.022701 *  
## factor(educ)4            2.18965    0.85653   2.556 0.010575 *  
## factor(educ)5            3.07658    0.85036   3.618 0.000297 ***
## factor(marst)married    -0.32922    0.23195  -1.419 0.155783    
## factor(marst)nm         -1.11780    0.23976  -4.662 3.13e-06 ***
## factor(marst)separated  -0.25940    0.45989  -0.564 0.572724    
## factor(marst)widowed     0.11368    0.34600   0.329 0.742496    
## factor(polviews)Liberal -0.16616    0.18787  -0.884 0.376461    
## factor(male)Male         0.24992    0.15374   1.626 0.104041    
## factor(partyid)1        -1.12222    0.27060  -4.147 3.37e-05 ***
## factor(partyid)2        -1.28817    0.27892  -4.618 3.87e-06 ***
## factor(partyid)3        -2.67415    0.31502  -8.489  < 2e-16 ***
## factor(partyid)4        -1.37196    0.31221  -4.394 1.11e-05 ***
## factor(partyid)5        -0.86037    0.31599  -2.723 0.006473 ** 
## factor(partyid)6        -0.04729    0.33646  -0.141 0.888227    
## factor(partyid)7        -0.94516    0.44900  -2.105 0.035287 *  
## factor(partyid)9        -1.71126    1.02109  -1.676 0.093756 .  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 1353.0  on 1122  degrees of freedom
## Residual deviance: 1085.4  on 1104  degrees of freedom
##   (756 observations deleted due to missingness)
## AIC: 1123.4
## 
## Number of Fisher Scoring iterations: 4

3a) Does changing the decision rule threshold affect your classification accuracy?

tr_pred<- predict(glm1,
                  newdata = wattrain,
                  type = "response")

head(tr_pred)

##         1         2         3         4         5         6 
## 0.6138245 0.8358041        NA 0.9637792 0.8504264 0.5540377

Using decision rule

tr_predcl<- as.factor(ifelse(tr_pred>.5, 1, 0))

library(ggplot2)

pred1<-data.frame(pr=tr_pred, gr=tr_predcl, modcon=wattrain$pres16)

pred1%>%
  ggplot()+
  geom_histogram(aes(x=pr, color=gr, fill=gr))+
  ggtitle(label = "Probability of voting for a candidate", subtitle = "Threshold = .5")+
  geom_vline(xintercept=.5)

## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

## Warning: Removed 756 rows containing non-finite values (stat_bin).

pred1%>%
  ggplot()+
  geom_histogram(aes(x=pr, color=modcon, fill=modcon))+
  ggtitle(label = "Probability of voting for a candidate", subtitle = "Truth")+
  geom_vline(xintercept=.5)

## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

## Warning: Removed 756 rows containing non-finite values (stat_bin).

table( tr_predcl,wattrain$pres16)

##          
## tr_predcl   0   1   2   3   4   8   9
##         0 140  35  12   4   0   1   2
##         1 186 343 330  35   8  13  14

wat$pres16=as.factor(wat$pres16)

cm1<-confusionMatrix(tr_predcl, as.factor(wattrain$pres16) )

## Warning in levels(reference) != levels(data): longer object length is not a
## multiple of shorter object length

## Warning in confusionMatrix.default(tr_predcl, as.factor(wattrain$pres16)):
## Levels are not in the same order for reference and data. Refactoring data to
## match.

cm1

## Confusion Matrix and Statistics
## 
##           Reference
## Prediction   0   1   2   3   4   8   9
##          0 140  35  12   4   0   1   2
##          1 186 343 330  35   8  13  14
##          2   0   0   0   0   0   0   0
##          3   0   0   0   0   0   0   0
##          4   0   0   0   0   0   0   0
##          8   0   0   0   0   0   0   0
##          9   0   0   0   0   0   0   0
## 
## Overall Statistics
##                                           
##                Accuracy : 0.4301          
##                  95% CI : (0.4009, 0.4597)
##     No Information Rate : 0.3366          
##     P-Value [Acc > NIR] : 4.617e-11       
##                                           
##                   Kappa : 0.1512          
##                                           
##  Mcnemar's Test P-Value : NA              
## 
## Statistics by Class:
## 
##                      Class: 0 Class: 1 Class: 2 Class: 3 Class: 4 Class: 8
## Sensitivity            0.4294   0.9074   0.0000  0.00000 0.000000  0.00000
## Specificity            0.9322   0.2134   1.0000  1.00000 1.000000  1.00000
## Pos Pred Value         0.7216   0.3692      NaN      NaN      NaN      NaN
## Neg Pred Value         0.7998   0.8196   0.6955  0.96527 0.992876  0.98753
## Prevalence             0.2903   0.3366   0.3045  0.03473 0.007124  0.01247
## Detection Rate         0.1247   0.3054   0.0000  0.00000 0.000000  0.00000
## Detection Prevalence   0.1728   0.8272   0.0000  0.00000 0.000000  0.00000
## Balanced Accuracy      0.6808   0.5604   0.5000  0.50000 0.500000  0.50000
##                      Class: 9
## Sensitivity           0.00000
## Specificity           1.00000
## Pos Pred Value            NaN
## Neg Pred Value        0.98575
## Prevalence            0.01425
## Detection Rate        0.00000
## Detection Prevalence  0.00000
## Balanced Accuracy     0.50000

The accuracy in the train data is 43% accurate which is not too good for prediction of an election winner, 43% sensitivity with a specificity of 93% who vote.

Data prediction useing the mean outcome decision rule

pred_test<-predict(glm1,
                   newdata=wattest,
                   type="response")

pred_cl<-factor(ifelse(pred_test > mean( I(wattest$pres16==1)), 1, 0))

table(wattest$pres16,pred_cl)

##    pred_cl
##      0  1
##   0 18 59
##   1  2 89
##   2  2 79
##   3  1  7
##   4  0  1
##   8  1  3
##   9  2  1

confusionMatrix(data = pred_cl,as.factor(wattest$pres16) )

## Warning in levels(reference) != levels(data): longer object length is not a
## multiple of shorter object length

## Warning in confusionMatrix.default(data = pred_cl, as.factor(wattest$pres16)):
## Levels are not in the same order for reference and data. Refactoring data to
## match.

## Confusion Matrix and Statistics
## 
##           Reference
## Prediction  0  1  2  3  4  8  9
##          0 18  2  2  1  0  1  2
##          1 59 89 79  7  1  3  1
##          2  0  0  0  0  0  0  0
##          3  0  0  0  0  0  0  0
##          4  0  0  0  0  0  0  0
##          8  0  0  0  0  0  0  0
##          9  0  0  0  0  0  0  0
## 
## Overall Statistics
##                                           
##                Accuracy : 0.4038          
##                  95% CI : (0.3442, 0.4655)
##     No Information Rate : 0.3434          
##     P-Value [Acc > NIR] : 0.02344         
##                                           
##                   Kappa : 0.0991          
##                                           
##  Mcnemar's Test P-Value : NA              
## 
## Statistics by Class:
## 
##                      Class: 0 Class: 1 Class: 2 Class: 3 Class: 4 Class: 8
## Sensitivity           0.23377   0.9780   0.0000  0.00000 0.000000  0.00000
## Specificity           0.95745   0.1379   1.0000  1.00000 1.000000  1.00000
## Pos Pred Value        0.69231   0.3724      NaN      NaN      NaN      NaN
## Neg Pred Value        0.75314   0.9231   0.6943  0.96981 0.996226  0.98491
## Prevalence            0.29057   0.3434   0.3057  0.03019 0.003774  0.01509
## Detection Rate        0.06792   0.3358   0.0000  0.00000 0.000000  0.00000
## Detection Prevalence  0.09811   0.9019   0.0000  0.00000 0.000000  0.00000
## Balanced Accuracy     0.59561   0.5580   0.5000  0.50000 0.500000  0.50000
##                      Class: 9
## Sensitivity           0.00000
## Specificity           1.00000
## Pos Pred Value            NaN
## Neg Pred Value        0.98868
## Prevalence            0.01132
## Detection Rate        0.00000
## Detection Prevalence  0.00000
## Balanced Accuracy     0.50000

tr_predcl<-factor(ifelse(tr_pred>mean(I(wattrain$pres16==1)), 1, 0)) #mean of response

pred2<-data.frame(pr=tr_pred, gr=tr_predcl, modcon=wattrain$pres16)

pred2%>%
  ggplot(aes(x=pr, fill=gr))+
  geom_histogram(position="identity", alpha=.2)+
  ggtitle(label = "Probability of Voting", subtitle = "Threshold = Mean")+
  geom_vline(xintercept=mean(I(wattrain$pres16==1)))

## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

## Warning: Removed 756 rows containing non-finite values (stat_bin).

pred2%>%
  ggplot(aes(x=pr, fill=modcon))+
  geom_histogram(position="identity", alpha=.2)+
  ggtitle(label = "Probability of Voting", subtitle = "Truth")+
  geom_vline(xintercept=mean(I(wattrain$pres16==1)))

## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

## Warning: Removed 756 rows containing non-finite values (stat_bin).

confusionMatrix(tr_predcl,
                as.factor(wattrain$pres16),
                positive = "1" )

## Warning in levels(reference) != levels(data): longer object length is not a
## multiple of shorter object length

## Warning in confusionMatrix.default(tr_predcl, as.factor(wattrain$pres16), :
## Levels are not in the same order for reference and data. Refactoring data to
## match.

## Confusion Matrix and Statistics
## 
##           Reference
## Prediction   0   1   2   3   4   8   9
##          0  72  11   1   2   0   0   2
##          1 254 367 341  37   8  14  14
##          2   0   0   0   0   0   0   0
##          3   0   0   0   0   0   0   0
##          4   0   0   0   0   0   0   0
##          8   0   0   0   0   0   0   0
##          9   0   0   0   0   0   0   0
## 
## Overall Statistics
##                                           
##                Accuracy : 0.3909          
##                  95% CI : (0.3623, 0.4202)
##     No Information Rate : 0.3366          
##     P-Value [Acc > NIR] : 7.878e-05       
##                                           
##                   Kappa : 0.0869          
##                                           
##  Mcnemar's Test P-Value : NA              
## 
## Statistics by Class:
## 
##                      Class: 0 Class: 1 Class: 2 Class: 3 Class: 4 Class: 8
## Sensitivity           0.22086   0.9709   0.0000  0.00000 0.000000  0.00000
## Specificity           0.97992   0.1034   1.0000  1.00000 1.000000  1.00000
## Pos Pred Value        0.81818   0.3546      NaN      NaN      NaN      NaN
## Neg Pred Value        0.75459   0.8750   0.6955  0.96527 0.992876  0.98753
## Prevalence            0.29029   0.3366   0.3045  0.03473 0.007124  0.01247
## Detection Rate        0.06411   0.3268   0.0000  0.00000 0.000000  0.00000
## Detection Prevalence  0.07836   0.9216   0.0000  0.00000 0.000000  0.00000
## Balanced Accuracy     0.60039   0.5371   0.5000  0.50000 0.500000  0.50000
##                      Class: 9
## Sensitivity           0.00000
## Specificity           1.00000
## Pos Pred Value            NaN
## Neg Pred Value        0.98575
## Prevalence            0.01425
## Detection Rate        0.00000
## Detection Prevalence  0.00000
## Balanced Accuracy     0.50000

Chaanging the rule from 0.5 outcome to decision to mean outcome decision resulted in lower percent correct classification but an increase in specificity and lower percentage in the sensitivity.

pred_test<-predict(glm1,
                   newdata=wattest,
                   type="response")

pred_cl<-factor(ifelse(pred_test > mean( I(wattest$pres16==1)), 1, 0))

table(wattest$pres16,pred_cl)

##    pred_cl
##      0  1
##   0 18 59
##   1  2 89
##   2  2 79
##   3  1  7
##   4  0  1
##   8  1  3
##   9  2  1

confusionMatrix(data = pred_cl,as.factor(wattest$pres16))

## Warning in levels(reference) != levels(data): longer object length is not a
## multiple of shorter object length

## Warning in confusionMatrix.default(data = pred_cl, as.factor(wattest$pres16)):
## Levels are not in the same order for reference and data. Refactoring data to
## match.

## Confusion Matrix and Statistics
## 
##           Reference
## Prediction  0  1  2  3  4  8  9
##          0 18  2  2  1  0  1  2
##          1 59 89 79  7  1  3  1
##          2  0  0  0  0  0  0  0
##          3  0  0  0  0  0  0  0
##          4  0  0  0  0  0  0  0
##          8  0  0  0  0  0  0  0
##          9  0  0  0  0  0  0  0
## 
## Overall Statistics
##                                           
##                Accuracy : 0.4038          
##                  95% CI : (0.3442, 0.4655)
##     No Information Rate : 0.3434          
##     P-Value [Acc > NIR] : 0.02344         
##                                           
##                   Kappa : 0.0991          
##                                           
##  Mcnemar's Test P-Value : NA              
## 
## Statistics by Class:
## 
##                      Class: 0 Class: 1 Class: 2 Class: 3 Class: 4 Class: 8
## Sensitivity           0.23377   0.9780   0.0000  0.00000 0.000000  0.00000
## Specificity           0.95745   0.1379   1.0000  1.00000 1.000000  1.00000
## Pos Pred Value        0.69231   0.3724      NaN      NaN      NaN      NaN
## Neg Pred Value        0.75314   0.9231   0.6943  0.96981 0.996226  0.98491
## Prevalence            0.29057   0.3434   0.3057  0.03019 0.003774  0.01509
## Detection Rate        0.06792   0.3358   0.0000  0.00000 0.000000  0.00000
## Detection Prevalence  0.09811   0.9019   0.0000  0.00000 0.000000  0.00000
## Balanced Accuracy     0.59561   0.5580   0.5000  0.50000 0.500000  0.50000
##                      Class: 9
## Sensitivity           0.00000
## Specificity           1.00000
## Pos Pred Value            NaN
## Neg Pred Value        0.98868
## Prevalence            0.01132
## Detection Rate        0.00000
## Detection Prevalence  0.00000
## Balanced Accuracy     0.50000

In the test data, the model does about as well as it did on the training data, which is ideal.

DEM 7283 Stats II Homework 4

Assogba Albert

3/2/2021

Load libraries

Load data

1) Define a binary outcome of your choosing

2) Fit a predictive logistic regression model using as many predictor variables as you think you need.

Recode Variables

Survey Design

Logit model

Creating training and test data set

3) Report the % correct classification from the training data using the .5 decision rule and again useing the mean of the outcome decision rule

3a) Does changing the decision rule threshold affect your classification accuracy?

Using decision rule

The accuracy in the train data is 43% accurate which is not too good for prediction of an election winner, 43% sensitivity with a specificity of 93% who vote.

Data prediction useing the mean outcome decision rule

Chaanging the rule from 0.5 outcome to decision to mean outcome decision resulted in lower percent correct classification but an increase in specificity and lower percentage in the sensitivity.

In the test data, the model does about as well as it did on the training data, which is ideal.