1) Define a binary outcome of your choosing

Unable to work due to a pandemic (COVID-19) shutdown

2) Fit a predictive logistic regression model using as many predictor variables as you think you need

2a) Use an 80% training/20% test split for your data.

3) Report the % correct classification from the training data using the .5 decision rule and again using the mean of the outcome decision rule.

## 3a) Does changing the decision rule threshold affect your classification accuracy? Yes. With 0 (able to work) treated as the positive class, accuracy decreases, sensitivity decreases, and specificity improves.

4) Report the % correct classification from the test data using the .5 decision rule and again using the mean of the outcome decision rule.

#load libraries
library(car, quietly = T)
library(stargazer, quietly = T)
## 
## Please cite as:
##  Hlavac, Marek (2018). stargazer: Well-Formatted Regression and Summary Statistics Tables.
##  R package version 5.2.2. https://CRAN.R-project.org/package=stargazer
library(survey, quietly = T)
## 
## Attaching package: 'survey'
## The following object is masked from 'package:graphics':
## 
##     dotchart
library(questionr, quietly = T)
library(dplyr, quietly = T)
## 
## Attaching package: 'dplyr'
## The following object is masked from 'package:car':
## 
##     recode
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
library(ggplot2, quietly = T)

## Logistic Regression as a Predictive Model

In this example, we will use being unable to work due to the COVID-19 shutdown as our outcome.

library(readr)
ds1 <- read_csv("cps_00009.csv")
## 
## -- Column specification --------------------------------------------------------
## cols(
##   .default = col_double()
## )
## i Use `spec()` for the full column specifications.
ds1<-ds1[sample(nrow(ds1), 50000), ]

Here we recode some of our variables and limit our data to respondents who are currently in the labor force.

library(dplyr)

#The names in the data are very ugly, so I make them less ugly
nams<-names(ds1)
head(nams, n=10)
##  [1] "YEAR"    "SERIAL"  "MONTH"   "HWTFINL" "CPSID"   "PERNUM"  "WTFINL" 
##  [8] "CPSIDP"  "AGE"     "SEX"
#some names are upper case, some are lower, and some contain underscores. This is a nightmare, so we lower-case everything and strip the underscores.
newnames<-tolower(gsub(pattern = "_",replacement =  "",x =  nams))
names(ds1)<-newnames
head(newnames, n=23)
##  [1] "year"      "serial"    "month"     "hwtfinl"   "cpsid"     "pernum"   
##  [7] "wtfinl"    "cpsidp"    "age"       "sex"       "race"      "marst"    
## [13] "popstat"   "vetstat"   "citizen"   "hispan"    "labforce"  "ind"      
## [19] "classwkr"  "wkstat"    "multjob"   "educ99"    "covidunaw"
model.dat2<-ds1%>%
  mutate(
         sex= as.factor(ifelse(sex ==9,NA, sex)),
         educ = as.factor(ifelse(educ99 ==00,NA, educ99)),
         race = as.factor(ifelse(race ==999,NA, race)),
         marst = as.factor(ifelse(marst ==9,NA, marst)),
         vetstat = Recode(vetstat,
                       recodes="2 ='1'; 1 ='0'; else = NA",
                       as.factor=T),
         popstat = as.factor(popstat),
         citizen = as.factor(ifelse(citizen ==9,NA, citizen)),
         hispan = Recode(hispan,
                       recodes="901:902 =NA",
                       as.factor=T),
         classwkr = Recode(classwkr,
                       recodes="00 =NA;99=NA",
                       as.factor=T),
         wkstat= as.factor(ifelse(wkstat ==99,NA, wkstat)),
         multjob = as.factor(ifelse(multjob ==0,NA, multjob)),
         covidunaw = Recode(covidunaw,
                       recodes="02 ='1'; 01 ='0'; else = NA",
                       as.factor=T),
         ind= as.factor(ifelse(ind==0,NA, ind))
         )%>%
  filter(labforce==2,complete.cases( . ))%>% #in labor force
  dplyr::select(covidunaw, sex, educ,race, age,marst, popstat, vetstat,citizen,hispan,classwkr,wkstat,multjob,ind)
knitr::kable(head(model.dat2))
| covidunaw | sex | educ | race | age | marst | popstat | vetstat | citizen | hispan | classwkr | wkstat | multjob | ind  |
|-----------|-----|------|------|-----|-------|---------|---------|---------|--------|----------|--------|---------|------|
| 0         | 1   | 11   | 100  | 26  | 6     | 1       | 0       | 1       | 0      | 27       | 41     | 1       | 7870 |
| 0         | 2   | 15   | 651  | 64  | 1     | 1       | 0       | 4       | 0      | 22       | 11     | 1       | 8191 |
| 0         | 2   | 15   | 100  | 29  | 6     | 1       | 0       | 1       | 0      | 22       | 11     | 1       | 3895 |
| 0         | 2   | 15   | 100  | 55  | 1     | 1       | 0       | 1       | 0      | 22       | 11     | 1       | 7870 |
| 0         | 1   | 15   | 100  | 50  | 1     | 1       | 0       | 1       | 0      | 23       | 13     | 1       | 8191 |
| 0         | 1   | 15   | 100  | 48  | 1     | 1       | 0       | 1       | 0      | 22       | 11     | 1       | 4880 |

## Using caret to create training and test sets

In predictive models, we split the data into two sets, a training set and a test set. The training set will be used to estimate the model parameters, and the test set will be used to validate the model’s predictive ability.

We use an 80% training fraction, which is standard.
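Note that createDataPartition(), used below, does a stratified random split on the outcome, so the training and test sets keep roughly the same proportion of 1s. For comparison, here is a minimal sketch (not used in the analysis) of a plain, unstratified 80/20 split in base R:

#a plain random 80/20 split; unlike caret::createDataPartition(), this does
#not stratify on the outcome, so the share of 1s can drift between the sets
set.seed(721)
idx <- sample(seq_len(nrow(model.dat2)), size = floor(0.8 * nrow(model.dat2)))
alt_train <- model.dat2[idx, ]
alt_test  <- model.dat2[-idx, ]
prop.table(table(alt_train$covidunaw))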

library(caret)
## Loading required package: lattice
## 
## Attaching package: 'caret'
## The following object is masked from 'package:survival':
## 
##     cluster
set.seed(721)
train<- createDataPartition(y = model.dat2$covidunaw,
                            p = .80,
                            list=F)

model.dat2train<-model.dat2[train,]
## Warning: The `i` argument of ``[`()` can't be a matrix as of tibble 3.0.0.
## Convert to a vector.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_warnings()` to see where this warning was generated.
model.dat2test<-model.dat2[-train,]

table(model.dat2train$covidunaw)
## 
##     0     1 
## 21399  2197
prop.table(table(model.dat2train$covidunaw))
## 
##        0        1 
## 0.906891 0.093109
summary(model.dat2train)
##  covidunaw sex            educ           race            age        marst    
##  0:21399   1:12382   15     :6130   100    :19300   Min.   :17.00   1:13037  
##  1: 2197   2:11214   10     :6021   200    : 2191   1st Qu.:31.00   2:  320  
##                      11     :3778   651    : 1351   Median :42.00   3:  362  
##                      16     :2538   300    :  264   Mean   :41.64   4: 2415  
##                      14     :1518   802    :  114   3rd Qu.:53.00   5:  286  
##                      13     :1084   801    :  103   Max.   :64.00   6: 7176  
##                      (Other):2527   (Other):  273                            
##  popstat   vetstat   citizen       hispan         classwkr         wkstat     
##  1:23596   0:22571   1:19975   0      :20400   22     :16122   11     :17003  
##  2:    0   1: 1025   2:   95   100    : 1928   23     : 1611   41     : 2569  
##                      3:  231   200    :  269   28     : 1536   12     : 1752  
##                      4: 1689   612    :  214   13     : 1434   13     :  801  
##                      5: 1606   611    :  203   27     : 1227   22     :  565  
##                                600    :  194   14     :  953   21     :  545  
##                                (Other):  388   (Other):  713   (Other):  361  
##  multjob        ind       
##  1:22421   770    : 1761  
##  2: 1175   7860   : 1409  
##            8191   : 1201  
##            8680   : 1175  
##            7870   :  606  
##            7380   :  602  
##            (Other):16842

## Logistic regression for classification

Here we use a basic binomial GLM to estimate the probability of being unable to work due to the COVID shutdown. We use information on age, level of education, sex, marital status, veteran status, citizenship, Hispanic origin, class of worker, full- or part-time work status, holding multiple jobs, and industry.

This model can be written: \[\ln \left( \frac{Pr(\text{No work COVID})}{1-Pr(\text{No work COVID})} \right) = X' \beta\]

Which can be converted to the probability scale via the inverse logit transform:

\[Pr(\text{No work COVID}) = \frac{1}{1+\exp(-X' \beta)}\]
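As a quick illustration (a sketch with made-up values of the linear predictor, not part of the model fit), the inverse logit can be computed directly or with plogis(), the standard logistic CDF:

#inverse logit: turn hypothetical log-odds into probabilities
xb <- c(-2, 0, 2)   #made-up values of X'beta
1/(1 + exp(-xb))
plogis(xb)          #same thing, using the standard logistic CDF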

glm1<-glm(covidunaw ~factor(marst)+scale(age)+factor(sex)+factor(race)+factor(educ)+factor(vetstat)+factor(citizen)+factor(hispan)+factor(classwkr)+factor(wkstat)+factor(multjob)+factor(ind),
          data=model.dat2train,
          family = binomial)
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
summary(glm1)
## 
## Call:
## glm(formula = covidunaw ~ factor(marst) + scale(age) + factor(sex) + 
##     factor(race) + factor(educ) + factor(vetstat) + factor(citizen) + 
##     factor(hispan) + factor(classwkr) + factor(wkstat) + factor(multjob) + 
##     factor(ind), family = binomial, data = model.dat2train)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -2.3350  -0.4109  -0.3043  -0.2268   3.1111  
## 
## Coefficients:
##                      Estimate Std. Error z value Pr(>|z|)    
## (Intercept)        -4.547e+00  1.138e+00  -3.996 6.43e-05 ***
## factor(marst)2      2.248e-01  2.011e-01   1.118 0.263574    
## factor(marst)3     -1.053e-01  2.071e-01  -0.508 0.611201    
## factor(marst)4      1.401e-01  8.245e-02   1.699 0.089364 .  
## factor(marst)5     -2.698e-01  2.380e-01  -1.134 0.256857    
## factor(marst)6      5.858e-02  6.753e-02   0.867 0.385702    
## scale(age)          1.043e-01  3.002e-02   3.475 0.000510 ***
## factor(sex)2       -3.698e-02  5.814e-02  -0.636 0.524765    
## factor(race)200     2.399e-01  8.763e-02   2.737 0.006197 ** 
## factor(race)300     3.435e-01  2.200e-01   1.561 0.118503    
## factor(race)651     2.714e-01  1.174e-01   2.312 0.020764 *  
## factor(race)652     8.001e-01  3.114e-01   2.569 0.010189 *  
## factor(race)801     8.544e-02  3.960e-01   0.216 0.829190    
## factor(race)802     4.291e-02  3.341e-01   0.128 0.897827    
## factor(race)803    -5.420e-02  3.864e-01  -0.140 0.888458    
## factor(race)804     3.884e-01  7.843e-01   0.495 0.620433    
## factor(race)805    -1.138e-01  1.082e+00  -0.105 0.916286    
## factor(race)806    -1.527e+01  2.236e+03  -0.007 0.994550    
## factor(race)807    -1.418e+01  3.956e+03  -0.004 0.997140    
## factor(race)808    -1.647e+01  3.956e+03  -0.004 0.996678    
## factor(race)809    -1.446e+01  1.130e+03  -0.013 0.989790    
## factor(race)810    -1.496e+01  1.399e+03  -0.011 0.991469    
## factor(race)811     2.952e+00  1.360e+00   2.171 0.029941 *  
## factor(race)812    -1.400e+01  3.956e+03  -0.004 0.997177    
## factor(race)813     9.362e-01  8.605e-01   1.088 0.276589    
## factor(race)816    -1.453e+01  2.707e+03  -0.005 0.995719    
## factor(race)817     6.926e-01  1.165e+00   0.594 0.552320    
## factor(educ)4       1.841e+00  1.138e+00   1.619 0.105507    
## factor(educ)5       1.689e+00  1.104e+00   1.529 0.126242    
## factor(educ)6       1.614e+00  1.119e+00   1.443 0.149043    
## factor(educ)7       1.375e+00  1.119e+00   1.228 0.219336    
## factor(educ)8       1.370e+00  1.106e+00   1.239 0.215523    
## factor(educ)9       1.717e+00  1.114e+00   1.542 0.123180    
## factor(educ)10      1.523e+00  1.094e+00   1.392 0.163952    
## factor(educ)11      1.571e+00  1.095e+00   1.435 0.151375    
## factor(educ)13      1.596e+00  1.100e+00   1.452 0.146604    
## factor(educ)14      1.487e+00  1.098e+00   1.354 0.175735    
## factor(educ)15      1.559e+00  1.095e+00   1.423 0.154789    
## factor(educ)16      1.311e+00  1.098e+00   1.194 0.232659    
## factor(educ)17      1.569e+00  1.115e+00   1.407 0.159368    
## factor(educ)18      1.484e+00  1.109e+00   1.338 0.180956    
## factor(vetstat)1   -1.041e-01  1.397e-01  -0.745 0.456107    
## factor(citizen)2   -4.295e-01  4.630e-01  -0.928 0.353549    
## factor(citizen)3    9.662e-02  2.404e-01   0.402 0.687754    
## factor(citizen)4    1.071e-01  1.031e-01   1.039 0.298678    
## factor(citizen)5    1.220e-01  1.079e-01   1.131 0.258138    
## factor(hispan)100   2.602e-01  9.386e-02   2.772 0.005577 ** 
## factor(hispan)200   3.283e-01  2.502e-01   1.312 0.189414    
## factor(hispan)300   2.689e-01  2.904e-01   0.926 0.354420    
## factor(hispan)400  -4.992e-01  4.683e-01  -1.066 0.286395    
## factor(hispan)500   7.855e-01  2.447e-01   3.210 0.001327 ** 
## factor(hispan)600   2.138e-01  2.627e-01   0.814 0.415734    
## factor(hispan)611   2.119e-01  2.422e-01   0.875 0.381715    
## factor(hispan)612   3.500e-01  2.374e-01   1.475 0.140300    
## factor(classwkr)14 -3.267e-02  1.184e-01  -0.276 0.782561    
## factor(classwkr)22 -8.720e-01  8.594e-02 -10.147  < 2e-16 ***
## factor(classwkr)23 -1.022e+00  1.572e-01  -6.499 8.07e-11 ***
## factor(classwkr)25 -1.054e+00  2.858e-01  -3.686 0.000228 ***
## factor(classwkr)27 -7.237e-01  1.856e-01  -3.900 9.61e-05 ***
## factor(classwkr)28 -9.615e-01  1.807e-01  -5.321 1.03e-07 ***
## factor(classwkr)29 -1.740e+00  1.081e+00  -1.610 0.107446    
## factor(wkstat)12    7.767e-01  8.882e-02   8.745  < 2e-16 ***
## factor(wkstat)13    1.720e+00  9.558e-02  17.991  < 2e-16 ***
## factor(wkstat)14    1.727e+00  5.022e-01   3.439 0.000584 ***
## factor(wkstat)15    1.148e+00  2.983e-01   3.849 0.000119 ***
## factor(wkstat)21    2.949e+00  1.002e-01  29.438  < 2e-16 ***
## factor(wkstat)22    1.952e+00  1.056e-01  18.477  < 2e-16 ***
## factor(wkstat)41    7.535e-01  7.700e-02   9.786  < 2e-16 ***
## factor(wkstat)42    1.940e+00  1.567e-01  12.384  < 2e-16 ***
## factor(multjob)2    5.004e-01  1.015e-01   4.932 8.14e-07 ***
## factor(ind)180      2.957e-01  4.242e-01   0.697 0.485757    
## factor(ind)190      1.606e+00  7.660e-01   2.097 0.036020 *  
## factor(ind)270      1.094e+00  8.697e-01   1.258 0.208537    
## factor(ind)280      1.543e+00  9.739e-01   1.585 0.113050    
## factor(ind)290     -9.065e-02  1.122e+00  -0.081 0.935594    
## factor(ind)370      1.667e+00  8.428e-01   1.978 0.047976 *  
## factor(ind)380     -1.430e+01  9.049e+02  -0.016 0.987395    
## factor(ind)390     -1.381e+01  1.247e+03  -0.011 0.991169    
## factor(ind)470     -1.422e+01  1.201e+03  -0.012 0.990549    
## factor(ind)490      9.468e-01  4.971e-01   1.904 0.056846 .  
## factor(ind)570     -1.632e-01  6.080e-01  -0.268 0.788316    
## factor(ind)580      1.410e+00  6.982e-01   2.019 0.043518 *  
## factor(ind)590      3.043e-01  1.102e+00   0.276 0.782447    
## factor(ind)670      5.238e-01  7.951e-01   0.659 0.510017    
## factor(ind)680     -1.789e-01  1.135e+00  -0.158 0.874754    
## factor(ind)690     -1.401e+01  1.761e+03  -0.008 0.993652    
## factor(ind)770      8.916e-01  3.175e-01   2.808 0.004979 ** 
## factor(ind)1070     3.543e-01  1.064e+00   0.333 0.739177    
## factor(ind)1080    -1.377e+01  1.096e+03  -0.013 0.989968    
## factor(ind)1090     7.979e-01  8.530e-01   0.935 0.349587    
## factor(ind)1170     1.278e-01  1.075e+00   0.119 0.905386    
## factor(ind)1180     3.911e-01  6.055e-01   0.646 0.518270    
## factor(ind)1190     1.027e+00  5.560e-01   1.848 0.064588 .  
## factor(ind)1270     1.445e+00  6.837e-01   2.113 0.034608 *  
## factor(ind)1280    -1.418e+01  7.746e+02  -0.018 0.985393    
## factor(ind)1290     7.283e-01  1.086e+00   0.670 0.502564    
## factor(ind)1370     1.018e+00  8.022e-01   1.269 0.204515    
## factor(ind)1390    -1.359e+01  3.956e+03  -0.003 0.997259    
## factor(ind)1480    -1.408e+01  1.185e+03  -0.012 0.990516    
## factor(ind)1490     3.423e+00  1.478e+00   2.315 0.020595 *  
## factor(ind)1570     1.962e+00  1.220e+00   1.608 0.107869    
## factor(ind)1590     2.580e+00  7.005e-01   3.683 0.000231 ***
## factor(ind)1670    -1.403e+01  2.251e+03  -0.006 0.995028    
## factor(ind)1691     6.645e-01  8.061e-01   0.824 0.409727    
## factor(ind)1770    -1.384e+01  2.796e+03  -0.005 0.996049    
## factor(ind)1790     1.984e+00  1.286e+00   1.543 0.122792    
## factor(ind)1870     8.955e-01  8.026e-01   1.116 0.264539    
## factor(ind)1880     9.310e-01  1.087e+00   0.856 0.391877    
## factor(ind)1890     1.165e+00  8.610e-01   1.353 0.175912    
## factor(ind)1990     1.132e+00  5.143e-01   2.202 0.027687 *  
## factor(ind)2070    -1.381e+01  7.305e+02  -0.019 0.984912    
## factor(ind)2090    -1.374e+01  2.797e+03  -0.005 0.996081    
## factor(ind)2170    -7.393e-02  1.079e+00  -0.069 0.945374    
## factor(ind)2180    -1.396e+01  1.955e+03  -0.007 0.994303    
## factor(ind)2190     1.023e-01  6.711e-01   0.152 0.878813    
## factor(ind)2270     2.006e+00  8.867e-01   2.262 0.023686 *  
## factor(ind)2280     6.494e-01  1.122e+00   0.579 0.562631    
## factor(ind)2290     3.193e-01  7.990e-01   0.400 0.689429    
## factor(ind)2370     2.776e-01  6.888e-01   0.403 0.686959    
## factor(ind)2380     2.052e+00  1.188e+00   1.727 0.084147 .  
## factor(ind)2390     1.694e+00  1.110e+00   1.527 0.126851    
## factor(ind)2470    -1.405e+01  2.740e+03  -0.005 0.995910    
## factor(ind)2480     2.027e+00  1.150e+00   1.763 0.077846 .  
## factor(ind)2490     2.978e-01  1.129e+00   0.264 0.791906    
## factor(ind)2570     6.156e-01  8.324e-01   0.739 0.459627    
## factor(ind)2590    -1.407e+01  1.219e+03  -0.012 0.990791    
## factor(ind)2670     5.310e-01  7.018e-01   0.757 0.449221    
## factor(ind)2680     2.185e+00  1.167e+00   1.872 0.061206 .  
## factor(ind)2690     1.104e+00  1.181e+00   0.935 0.349640    
## factor(ind)2770     1.588e+00  8.221e-01   1.931 0.053457 .  
## factor(ind)2780    -1.442e+01  1.878e+03  -0.008 0.993874    
## factor(ind)2790     2.276e+00  8.427e-01   2.701 0.006913 ** 
## factor(ind)2870    -1.412e+01  5.575e+02  -0.025 0.979788    
## factor(ind)2880    -7.725e-01  1.107e+00  -0.698 0.485300    
## factor(ind)2890    -1.425e+01  1.755e+03  -0.008 0.993520    
## factor(ind)2970    -1.422e+01  1.387e+03  -0.010 0.991816    
## factor(ind)2980     1.308e+00  5.409e-01   2.418 0.015592 *  
## factor(ind)2990    -1.388e+01  1.974e+03  -0.007 0.994392    
## factor(ind)3070    -1.412e+01  8.287e+02  -0.017 0.986408    
## factor(ind)3080     1.761e+00  5.876e-01   2.997 0.002730 ** 
## factor(ind)3095     1.087e+00  1.097e+00   0.991 0.321880    
## factor(ind)3170     5.942e-01  1.146e+00   0.518 0.604165    
## factor(ind)3180     1.697e+00  1.122e+00   1.513 0.130348    
## factor(ind)3291     1.147e+00  4.517e-01   2.540 0.011097 *  
## factor(ind)3365    -1.437e+01  7.918e+02  -0.018 0.985523    
## factor(ind)3370    -1.431e+01  1.054e+03  -0.014 0.989171    
## factor(ind)3380    -1.413e+01  8.623e+02  -0.016 0.986925    
## factor(ind)3390    -2.384e-02  6.654e-01  -0.036 0.971415    
## factor(ind)3470    -1.372e+01  1.494e+03  -0.009 0.992670    
## factor(ind)3490     1.097e+00  6.866e-01   1.597 0.110215    
## factor(ind)3570     7.539e-01  4.361e-01   1.729 0.083860 .  
## factor(ind)3580     1.246e+00  4.447e-01   2.801 0.005097 ** 
## factor(ind)3590     1.877e+00  1.133e+00   1.657 0.097548 .  
## factor(ind)3670    -1.377e+01  1.977e+03  -0.007 0.994442    
## factor(ind)3680     5.125e-01  1.068e+00   0.480 0.631227    
## factor(ind)3690     1.987e+00  1.207e+00   1.646 0.099750 .  
## factor(ind)3770     5.456e-01  1.071e+00   0.509 0.610480    
## factor(ind)3780    -1.375e+01  1.768e+03  -0.008 0.993794    
## factor(ind)3790    -1.495e+01  2.610e+03  -0.006 0.995430    
## factor(ind)3875    -1.426e+01  6.381e+02  -0.022 0.982167    
## factor(ind)3895     1.442e+00  4.989e-01   2.890 0.003856 ** 
## factor(ind)3960     3.075e-01  5.392e-01   0.570 0.568503    
## factor(ind)3970     1.562e+00  8.578e-01   1.821 0.068542 .  
## factor(ind)3980     5.906e-01  5.098e-01   1.158 0.246714    
## factor(ind)3990     5.914e-01  7.071e-01   0.836 0.402924    
## factor(ind)4070     9.364e-01  7.699e-01   1.216 0.223884    
## factor(ind)4080    -1.428e+01  1.275e+03  -0.011 0.991065    
## factor(ind)4090     1.064e-01  1.093e+00   0.097 0.922488    
## factor(ind)4170     2.904e-01  8.114e-01   0.358 0.720364    
## factor(ind)4180     7.425e-01  1.299e+00   0.572 0.567498    
## factor(ind)4195     8.471e-01  8.070e-01   1.050 0.293816    
## factor(ind)4265    -1.430e+01  7.917e+02  -0.018 0.985593    
## factor(ind)4270     7.006e-01  5.990e-01   1.170 0.242129    
## factor(ind)4280     1.484e+00  8.408e-01   1.765 0.077613 .  
## factor(ind)4290     2.104e+00  8.309e-01   2.533 0.011320 *  
## factor(ind)4370    -1.373e+01  1.978e+03  -0.007 0.994463    
## factor(ind)4380     8.569e-01  6.794e-01   1.261 0.207215    
## factor(ind)4390     1.698e+00  1.716e+00   0.990 0.322378    
## factor(ind)4470    -4.566e-02  5.646e-01  -0.081 0.935549    
## factor(ind)4480     4.326e-01  1.226e+00   0.353 0.724275    
## factor(ind)4490    -1.392e+01  1.005e+03  -0.014 0.988951    
## factor(ind)4560     8.699e-01  1.128e+00   0.771 0.440524    
## factor(ind)4570    -1.428e+01  1.270e+03  -0.011 0.991028    
## factor(ind)4580    -1.406e+01  6.819e+02  -0.021 0.983551    
## factor(ind)4585     1.345e+00  8.953e-01   1.502 0.133100    
## factor(ind)4590    -1.388e+01  1.244e+03  -0.011 0.991099    
## factor(ind)4670     1.297e+00  3.882e-01   3.342 0.000832 ***
## factor(ind)4680     6.935e-01  8.238e-01   0.842 0.399891    
## factor(ind)4690     5.307e-01  6.095e-01   0.871 0.383863    
## factor(ind)4770     8.188e-01  4.952e-01   1.653 0.098264 .  
## factor(ind)4780    -1.396e+01  1.378e+03  -0.010 0.991916    
## factor(ind)4795     7.043e-02  6.751e-01   0.104 0.916907    
## factor(ind)4870     7.031e-01  4.389e-01   1.602 0.109117    
## factor(ind)4880    -2.151e-01  1.064e+00  -0.202 0.839753    
## factor(ind)4890    -3.250e-01  1.067e+00  -0.304 0.760781    
## factor(ind)4971     1.474e-01  3.966e-01   0.372 0.710151    
## factor(ind)4972    -1.059e+00  1.090e+00  -0.971 0.331308    
## factor(ind)4980     7.096e-01  7.007e-01   1.013 0.311244    
## factor(ind)4990     5.818e-01  8.213e-01   0.708 0.478654    
## factor(ind)5070     4.701e-01  4.869e-01   0.965 0.334339    
## factor(ind)5080     1.338e+00  5.240e-01   2.554 0.010643 *  
## factor(ind)5090     2.451e-01  6.140e-01   0.399 0.689775    
## factor(ind)5170     1.274e+00  4.101e-01   3.106 0.001895 ** 
## factor(ind)5180     2.032e-01  1.148e+00   0.177 0.859526    
## factor(ind)5190     1.294e+00  6.788e-01   1.907 0.056566 .  
## factor(ind)5275     1.088e+00  4.888e-01   2.225 0.026089 *  
## factor(ind)5280    -1.457e+01  2.743e+03  -0.005 0.995763    
## factor(ind)5295    -1.468e+01  3.956e+03  -0.004 0.997039    
## factor(ind)5370     1.514e+00  7.492e-01   2.020 0.043345 *  
## factor(ind)5381     1.014e+00  5.609e-01   1.808 0.070613 .  
## factor(ind)5391    -5.736e-01  4.798e-01  -1.195 0.231895    
## factor(ind)5470     2.231e+00  7.053e-01   3.163 0.001563 ** 
## factor(ind)5480    -1.396e-01  1.108e+00  -0.126 0.899796    
## factor(ind)5490     1.551e+00  5.708e-01   2.718 0.006567 ** 
## factor(ind)5570     1.646e+00  5.907e-01   2.786 0.005328 ** 
## factor(ind)5580     1.160e+00  5.069e-01   2.288 0.022156 *  
## factor(ind)5593     2.669e-01  4.958e-01   0.538 0.590276    
## factor(ind)5670     1.875e+01  3.956e+03   0.005 0.996219    
## factor(ind)5680    -1.422e+01  8.174e+02  -0.017 0.986123    
## factor(ind)5690    -5.254e-01  1.108e+00  -0.474 0.635462    
## factor(ind)5790     1.020e+00  5.475e-01   1.862 0.062557 .  
## factor(ind)6070     1.108e+00  4.765e-01   2.324 0.020114 *  
## factor(ind)6080     1.003e+00  6.806e-01   1.474 0.140546    
## factor(ind)6090     1.444e+00  1.106e+00   1.306 0.191543    
## factor(ind)6170     9.866e-01  3.625e-01   2.722 0.006492 ** 
## factor(ind)6180     1.447e+00  5.280e-01   2.740 0.006149 ** 
## factor(ind)6190     1.511e+00  4.820e-01   3.135 0.001719 ** 
## factor(ind)6270    -1.399e+01  1.003e+03  -0.014 0.988865    
## factor(ind)6280     3.405e+00  1.287e+00   2.645 0.008170 ** 
## factor(ind)6290     9.732e-01  4.305e-01   2.261 0.023790 *  
## factor(ind)6370    -1.030e+00  1.092e+00  -0.943 0.345566    
## factor(ind)6380     1.526e-01  4.416e-01   0.346 0.729679    
## factor(ind)6390    -3.958e-01  6.887e-01  -0.575 0.565471    
## factor(ind)6470     6.884e-01  1.080e+00   0.637 0.524013    
## factor(ind)6480     3.942e-02  1.085e+00   0.036 0.971018    
## factor(ind)6490     1.340e+00  1.107e+00   1.210 0.226116    
## factor(ind)6570     1.328e+00  4.776e-01   2.781 0.005416 ** 
## factor(ind)6590     2.347e+00  1.205e+00   1.949 0.051327 .  
## factor(ind)6670     2.888e-01  6.853e-01   0.421 0.673480    
## factor(ind)6672     1.141e+00  6.007e-01   1.899 0.057549 .  
## factor(ind)6680     1.496e-01  6.951e-01   0.215 0.829648    
## factor(ind)6690    -1.412e+01  4.768e+02  -0.030 0.976369    
## factor(ind)6695     1.039e+00  1.106e+00   0.939 0.347620    
## factor(ind)6770     1.144e+00  7.162e-01   1.598 0.110072    
## factor(ind)6780    -1.400e+01  1.966e+03  -0.007 0.994318    
## factor(ind)6870     5.338e-01  3.999e-01   1.335 0.182001    
## factor(ind)6880    -8.373e-01  1.070e+00  -0.782 0.433934    
## factor(ind)6890     2.742e-01  4.835e-01   0.567 0.570626    
## factor(ind)6970    -3.571e-01  5.219e-01  -0.684 0.493842    
## factor(ind)6991    -4.179e-01  5.502e-01  -0.760 0.447552    
## factor(ind)6992     3.541e-01  4.847e-01   0.731 0.464959    
## factor(ind)7071     1.062e+00  3.532e-01   3.006 0.002644 ** 
## factor(ind)7072     5.362e-01  5.208e-01   1.029 0.303259    
## factor(ind)7080     6.587e-01  1.097e+00   0.600 0.548387    
## factor(ind)7181    -1.430e+01  1.262e+03  -0.011 0.990959    
## factor(ind)7190     1.250e+00  8.184e-01   1.528 0.126562    
## factor(ind)7270     4.840e-01  4.128e-01   1.172 0.241019    
## factor(ind)7280     6.839e-01  4.353e-01   1.571 0.116133    
## factor(ind)7290     6.945e-01  3.896e-01   1.782 0.074693 .  
## factor(ind)7370     6.413e-01  4.981e-01   1.287 0.197947    
## factor(ind)7380     4.596e-01  3.609e-01   1.273 0.202844    
## factor(ind)7390     5.786e-01  3.883e-01   1.490 0.136231    
## factor(ind)7460     2.679e-01  5.686e-01   0.471 0.637597    
## factor(ind)7470     3.575e-01  5.973e-01   0.599 0.549447    
## factor(ind)7480     1.151e-01  6.802e-01   0.169 0.865602    
## factor(ind)7490     1.318e+00  4.546e-01   2.900 0.003732 ** 
## factor(ind)7570    -1.481e+01  1.083e+03  -0.014 0.989083    
## factor(ind)7580     5.423e-01  5.134e-01   1.056 0.290817    
## factor(ind)7590     1.027e+00  4.803e-01   2.138 0.032516 *  
## factor(ind)7670     2.621e-01  7.652e-01   0.342 0.731982    
## factor(ind)7680     1.191e+00  4.329e-01   2.752 0.005920 ** 
## factor(ind)7690     1.111e+00  3.582e-01   3.103 0.001918 ** 
## factor(ind)7770     8.616e-01  3.866e-01   2.229 0.025838 *  
## factor(ind)7780     8.419e-01  5.244e-01   1.605 0.108409    
## factor(ind)7790    -4.824e-01  7.952e-01  -0.607 0.544130    
## factor(ind)7860     7.782e-01  3.456e-01   2.252 0.024348 *  
## factor(ind)7870     2.364e-01  3.856e-01   0.613 0.539823    
## factor(ind)7880     1.727e+00  7.564e-01   2.284 0.022398 *  
## factor(ind)7890     1.432e+00  3.811e-01   3.756 0.000172 ***
## factor(ind)7970     1.237e+00  3.804e-01   3.252 0.001146 ** 
## factor(ind)7980     1.346e+00  3.932e-01   3.422 0.000621 ***
## factor(ind)7990     1.414e+00  6.743e-01   2.096 0.036046 *  
## factor(ind)8070     1.496e+00  6.008e-01   2.490 0.012767 *  
## factor(ind)8080     1.349e-02  6.394e-01   0.021 0.983164    
## factor(ind)8090     1.221e+00  3.599e-01   3.393 0.000692 ***
## factor(ind)8170     6.401e-01  4.030e-01   1.589 0.112169    
## factor(ind)8180     2.013e-01  4.243e-01   0.474 0.635183    
## factor(ind)8191     4.892e-01  3.393e-01   1.441 0.149447    
## factor(ind)8192    -1.406e+01  1.233e+03  -0.011 0.990896    
## factor(ind)8270     6.580e-01  3.924e-01   1.677 0.093575 .  
## factor(ind)8290     1.816e-01  4.897e-01   0.371 0.710827    
## factor(ind)8370     8.265e-02  4.288e-01   0.193 0.847140    
## factor(ind)8380     1.937e-01  1.087e+00   0.178 0.858620    
## factor(ind)8390     1.892e+00  7.593e-01   2.492 0.012708 *  
## factor(ind)8470     1.488e+00  3.643e-01   4.085 4.40e-05 ***
## factor(ind)8561     2.879e+00  7.142e-01   4.031 5.56e-05 ***
## factor(ind)8562     1.457e+00  7.536e-01   1.933 0.053188 .  
## factor(ind)8563     1.919e+00  7.167e-01   2.678 0.007406 ** 
## factor(ind)8564     2.078e+00  4.588e-01   4.528 5.95e-06 ***
## factor(ind)8570     1.234e+00  5.974e-01   2.066 0.038848 *  
## factor(ind)8580     3.302e+00  1.266e+00   2.608 0.009115 ** 
## factor(ind)8590     1.741e+00  3.524e-01   4.941 7.76e-07 ***
## factor(ind)8660     1.879e+00  3.779e-01   4.972 6.62e-07 ***
## factor(ind)8670     1.896e+00  6.921e-01   2.739 0.006154 ** 
## factor(ind)8680     1.297e+00  3.202e-01   4.050 5.13e-05 ***
## factor(ind)8690     1.572e+00  5.955e-01   2.640 0.008285 ** 
## factor(ind)8770     8.635e-01  3.910e-01   2.209 0.027209 *  
## factor(ind)8780     4.476e-01  7.769e-01   0.576 0.564528    
## factor(ind)8790     1.249e+00  9.752e-01   1.281 0.200231    
## factor(ind)8870     9.549e-01  5.709e-01   1.673 0.094373 .  
## factor(ind)8891     1.007e+00  5.918e-01   1.702 0.088805 .  
## factor(ind)8970     1.949e+00  6.548e-01   2.976 0.002922 ** 
## factor(ind)8980     1.834e+00  3.707e-01   4.947 7.53e-07 ***
## factor(ind)8990     1.846e+00  4.114e-01   4.486 7.25e-06 ***
## factor(ind)9070     1.340e+00  6.138e-01   2.183 0.029063 *  
## factor(ind)9080    -5.871e-03  1.135e+00  -0.005 0.995873    
## factor(ind)9090     1.196e+00  4.630e-01   2.583 0.009785 ** 
## factor(ind)9160     1.240e+00  4.343e-01   2.855 0.004299 ** 
## factor(ind)9170     8.726e-01  4.990e-01   1.749 0.080361 .  
## factor(ind)9180     1.385e+00  8.820e-01   1.570 0.116321    
## factor(ind)9190     1.361e+00  7.184e-01   1.895 0.058125 .  
## factor(ind)9290     1.627e+00  4.137e-01   3.932 8.41e-05 ***
## factor(ind)9370     8.612e-02  5.033e-01   0.171 0.864134    
## factor(ind)9380     4.396e-01  8.022e-01   0.548 0.583700    
## factor(ind)9390     1.036e+00  8.326e-01   1.245 0.213264    
## factor(ind)9470     5.080e-02  4.309e-01   0.118 0.906143    
## factor(ind)9480    -4.339e-01  5.749e-01  -0.755 0.450411    
## factor(ind)9490     7.896e-01  6.299e-01   1.253 0.210032    
## factor(ind)9570     1.123e-01  5.922e-01   0.190 0.849665    
## factor(ind)9590     3.937e-01  5.507e-01   0.715 0.474750    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 14614  on 23595  degrees of freedom
## Residual deviance: 11927  on 23266  degrees of freedom
## AIC: 12587
## 
## Number of Fisher Scoring iterations: 16

We see that some of the predictors, notably worker class, work status, holding multiple jobs, and several industries, are significantly related to our outcome. The warning about fitted probabilities numerically 0 or 1, and the enormous coefficients and standard errors for some race and industry categories, reflect sparse categories with no (or all) "unable to work" cases, i.e. quasi-complete separation.
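To see where that warning comes from, one quick check (a sketch using the training data) is to look for industry categories with no "unable to work" cases at all:

#industries with zero 1s in the training data are quasi-completely separated;
#these are the categories with the extreme negative coefficients and huge
#standard errors above
ind_tab <- table(model.dat2train$ind, model.dat2train$covidunaw)
rownames(ind_tab)[ind_tab[, "1"] == 0 & rowSums(ind_tab) > 0]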

Next we see how the model performs in terms of predictive accuracy. This is different from how we typically use logistic regression, where the focus is on interpreting the coefficients.

We use the predict() function to get the estimated class probabilities for each case.

tr_pred<- predict(glm1,
                  newdata = model.dat2train,
                  type = "response")

head(tr_pred)
##          1          2          3          4          5          6 
## 0.05854062 0.05476191 0.07586854 0.02784787 0.15011180 0.01756699

These are the estimated probabilities that each of these people is unable to work due to the pandemic shutdown, based on the model.

In order to create classes (unable to work vs. able to work) we have to use a decision rule. A decision rule is a cutoff, or threshold, value of the probability that we use to classify each observation as belonging to one class or the other.

A basic decision rule is: if \(Pr(y=\text{Unable to work} |X) > .5\), classify the observation as unable to work; otherwise classify it as able to work. This is what we will use here.

tr_predcl<-factor(ifelse(tr_pred>.5, 1, 0))

library(ggplot2)

pred1<-data.frame(pr=tr_pred, gr=tr_predcl, coviduna2w=model.dat2train$covidunaw)

pred1%>%
  ggplot()+
  geom_histogram(aes(x=pr, color=gr, fill=gr))+
  ggtitle(label = "Probability of Unable to Work due to a Pandemic Shutdown", subtitle = "Threshold = .5")+
  geom_vline(xintercept=.5)
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

pred1%>%
  ggplot()+
  geom_histogram(aes(x=pr, color=coviduna2w, fill=coviduna2w))+
  ggtitle(label = "Probability of Unable to Work due to a Pandemic Shutdown", subtitle = "Truth")+
  geom_vline(xintercept=.5)
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

Next we need to see how we did. A simple cross-tabulation of the predicted classes versus the observed classes is called the confusion matrix.

table( tr_predcl,model.dat2train$covidunaw)
##          
## tr_predcl     0     1
##         0 21186  1848
##         1   213   349

This is useful, but typically it's easier to understand the model's predictive ability by converting these counts to proportions. The confusionMatrix() function in caret can do this, and it reports several other summary measures as well.
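For instance, the raw counts above can be turned into row proportions with prop.table() before going to confusionMatrix() (a quick sketch using the objects created above):

#row proportions of the confusion matrix: each row of predictions sums to 1
prop.table(table(tr_predcl, model.dat2train$covidunaw), margin = 1)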

cm1<-confusionMatrix(data = tr_predcl,
                     reference = model.dat2train$covidunaw )
cm1
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction     0     1
##          0 21186  1848
##          1   213   349
##                                          
##                Accuracy : 0.9127         
##                  95% CI : (0.909, 0.9162)
##     No Information Rate : 0.9069         
##     P-Value [Acc > NIR] : 0.001102       
##                                          
##                   Kappa : 0.2235         
##                                          
##  Mcnemar's Test P-Value : < 2.2e-16      
##                                          
##             Sensitivity : 0.9900         
##             Specificity : 0.1589         
##          Pos Pred Value : 0.9198         
##          Neg Pred Value : 0.6210         
##              Prevalence : 0.9069         
##          Detection Rate : 0.8979         
##    Detection Prevalence : 0.9762         
##       Balanced Accuracy : 0.5744         
##                                          
##        'Positive' Class : 0              
## 

Overall the model has 91.265% accuracy, which isn't bad! What is bad are some of the other measures: the sensitivity is 99%, but the specificity is only about 16%. Note that confusionMatrix() has treated 0 (able to work) as the positive class here, so in other words the model is very good at predicting who is able to work, but poor at predicting who is unable to work.
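To make clear what those measures are, here is a by-hand check (a sketch, treating 0, able to work, as the positive class, exactly as confusionMatrix() did above):

tab <- table(tr_predcl, model.dat2train$covidunaw) #rows = predicted, cols = observed
acc  <- sum(diag(tab))/sum(tab)        #(21186 + 349)/23596, about 0.913
sens <- tab["0", "0"]/sum(tab[, "0"])  #21186/21399, about 0.990
spec <- tab["1", "1"]/sum(tab[, "1"])  #349/2197, about 0.159
c(accuracy = acc, sensitivity = sens, specificity = spec)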

We could try a different decision rule; in this case, I use the mean of the outcome (the observed proportion unable to work, about .093) as the cutoff value.

tr_predcl<-factor(ifelse(tr_pred>mean(I(model.dat2train$covidunaw==1)), 1, 0)) #mean of response

pred2<-data.frame(pr=tr_pred, gr=tr_predcl, coviduna2w=model.dat2train$covidunaw)

pred2%>%
  ggplot(aes(x=pr, fill=gr))+
  geom_histogram(position="identity", alpha=.2)+
  ggtitle(label = "Probability of Unable to Work due to a Pandemic Shutdown", subtitle = "Threshold = Mean")+
  geom_vline(xintercept=mean(I(model.dat2train$covidunaw==1)))
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

pred2%>%
  ggplot(aes(x=pr, fill=coviduna2w))+
  geom_histogram(position="identity", alpha=.2)+
  ggtitle(label = "Probability of Unable to Work due to a Pandemic Shutdown*", subtitle = "Truth")+
  geom_vline(xintercept=mean(I(model.dat2train$covidunaw==1)))
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

confusionMatrix(data = tr_predcl,
                model.dat2train$covidunaw,
                positive = "1" )
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction     0     1
##          0 16407   748
##          1  4992  1449
##                                           
##                Accuracy : 0.7567          
##                  95% CI : (0.7512, 0.7622)
##     No Information Rate : 0.9069          
##     P-Value [Acc > NIR] : 1               
##                                           
##                   Kappa : 0.2283          
##                                           
##  Mcnemar's Test P-Value : <2e-16          
##                                           
##             Sensitivity : 0.65954         
##             Specificity : 0.76672         
##          Pos Pred Value : 0.22497         
##          Neg Pred Value : 0.95640         
##              Prevalence : 0.09311         
##          Detection Rate : 0.06141         
##    Detection Prevalence : 0.27297         
##       Balanced Accuracy : 0.71313         
##                                           
##        'Positive' Class : 1               
## 

This lowers the overall accuracy (from about 91% to about 76%), but it substantially improves our ability to detect the "unable to work" class: with 1 now treated as the positive class, sensitivity for that group is about 66%, compared with about 16% under the .5 rule, at the cost of misclassifying more people who are able to work.
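To see this trade-off more systematically, one option (a sketch using the training-set probabilities created above) is to sweep the threshold and track all three measures; here sensitivity and specificity are reported for the "unable to work" (1) class:

thresholds <- seq(0.05, 0.50, by = 0.05)
obs <- as.character(model.dat2train$covidunaw)
sweep_tab <- sapply(thresholds, function(t){
  cl <- ifelse(tr_pred > t, "1", "0")
  c(threshold   = t,
    accuracy    = mean(cl == obs),
    sensitivity = mean(cl[obs == "1"] == "1"),  #detecting the 1s
    specificity = mean(cl[obs == "0"] == "0"))  #detecting the 0s
})
round(t(sweep_tab), 3)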

Next we do this on the test set to evaluate model performance outside of the training data.

pred_test<-predict(glm1,
                   newdata=model.dat2test,
                   type="response")

pred_cl<-factor(ifelse(pred_test>.5, 1, 0))

table(model.dat2test$covidunaw,pred_cl)
##    pred_cl
##        0    1
##   0 5296   53
##   1  458   91
confusionMatrix(data = pred_cl,model.dat2test$covidunaw )
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction    0    1
##          0 5296  458
##          1   53   91
##                                           
##                Accuracy : 0.9134          
##                  95% CI : (0.9059, 0.9204)
##     No Information Rate : 0.9069          
##     P-Value [Acc > NIR] : 0.04531         
##                                           
##                   Kappa : 0.233           
##                                           
##  Mcnemar's Test P-Value : < 2e-16         
##                                           
##             Sensitivity : 0.9901          
##             Specificity : 0.1658          
##          Pos Pred Value : 0.9204          
##          Neg Pred Value : 0.6319          
##              Prevalence : 0.9069          
##          Detection Rate : 0.8979          
##    Detection Prevalence : 0.9756          
##       Balanced Accuracy : 0.5779          
##                                           
##        'Positive' Class : 0               
## 
pred_cl.mean<-factor(ifelse(pred_test > mean( I(model.dat2test$covidunaw==1)), 1, 0))

table(model.dat2test$covidunaw,pred_cl.mean)
##    pred_cl.mean
##        0    1
##   0 4080 1269
##   1  184  365
confusionMatrix(data = pred_cl.mean,model.dat2test$covidunaw )
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction    0    1
##          0 4080  184
##          1 1269  365
##                                           
##                Accuracy : 0.7536          
##                  95% CI : (0.7424, 0.7646)
##     No Information Rate : 0.9069          
##     P-Value [Acc > NIR] : 1               
##                                           
##                   Kappa : 0.2266          
##                                           
##  Mcnemar's Test P-Value : <2e-16          
##                                           
##             Sensitivity : 0.7628          
##             Specificity : 0.6648          
##          Pos Pred Value : 0.9568          
##          Neg Pred Value : 0.2234          
##              Prevalence : 0.9069          
##          Detection Rate : 0.6918          
##    Detection Prevalence : 0.7230          
##       Balanced Accuracy : 0.7138          
##                                           
##        'Positive' Class : 0               
## 

In the test data, the model performs about as well as it did on the training data, which is what we want: it suggests the model is not overfitting to the training sample.
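As a final check (a sketch, recomputing the .5-rule classes so we are comparing like with like), we can put the overall training and test accuracies side by side; values this close suggest the model generalizes well:

train_cl5 <- ifelse(tr_pred   > .5, "1", "0")
test_cl5  <- ifelse(pred_test > .5, "1", "0")
c(train = mean(train_cl5 == as.character(model.dat2train$covidunaw)),
  test  = mean(test_cl5  == as.character(model.dat2test$covidunaw)))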