Research Question

What is the likelihood of experiencing joblessness due to the COVID-19 pandemic by sex and educational attainment?

library(readr)
library(car)     # Recode()
library(dplyr)   # pipes and data manipulation
library(survey)  # survey-weighted estimation

cps1 <- read_csv("C:/Users/drayr/OneDrive/Desktop/DEM Spring 2021/Stats II 7183/HW/HW3/cps_00008.csv")
## 
## -- Column specification --------------------------------------------------------
## cols(
##   YEAR = col_double(),
##   SERIAL = col_double(),
##   MONTH = col_double(),
##   CPSID = col_double(),
##   PERNUM = col_double(),
##   WTFINL = col_double(),
##   CPSIDP = col_double(),
##   SEX = col_double(),
##   LABFORCE = col_double(),
##   EDUC99 = col_double(),
##   EDCYC = col_double(),
##   COVIDUNAW = col_double()
## )
# Draw a random 10,000-record subsample to keep run time manageable
# (for fully reproducible results, call set.seed() first)
cps2 <- cps1[sample(nrow(cps1), 10000), ]

Fix variable names

# The names in the raw data are unwieldy, so we clean them up
nams <- names(cps2)
head(nams, n=10)
##  [1] "YEAR"     "SERIAL"   "MONTH"    "CPSID"    "PERNUM"   "WTFINL"  
##  [7] "CPSIDP"   "SEX"      "LABFORCE" "EDUC99"
# The raw names are all upper case and some contain underscores, so we standardize:
# lower case, no underscores
newnames <- tolower(gsub(pattern = "_", replacement = "", x = nams))
names(cps2) <- newnames
# Unable to work due to COVID: recode so 1 = yes, 0 = no
cps2$covdunaw <- Recode(cps2$covidunaw, recodes = "2=1; 1=0; else=NA")

# Sex: 1 = female, 0 = male
cps2$female <- Recode(cps2$sex, recodes = "2=1; 1=0; else=NA")

# Education level, collapsed into six ordered categories
cps2$edatt <- Recode(cps2$educ99,
                     recodes = "1:5='0to8'; 6:9='1somehs'; 10='2hsgrad'; 11='3somecol'; 12:15='4undgrad'; 16:18='5advgrad'; else=NA",
                     as.factor = TRUE)
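Before moving on, it is worth spot-checking the recodes; cross-tabulating each raw variable against its recoded version will expose any mapping errors (a quick check added here, not part of the original workflow):

# Each raw code should map to exactly one recoded value
table(cps2$educ99, cps2$edatt, useNA = "ifany")
table(cps2$covidunaw, cps2$covdunaw, useNA = "ifany")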

Analysis

First, we will do some descriptive analysis, such as weighted means and cross-tabulations.

sub <- cps2 %>%
  select(covdunaw, female, edatt, wtfinl) %>%
  filter(complete.cases(.))

# First we tell R our survey design
options(survey.lonely.psu = "adjust")
des <- svydesign(ids = ~1,
                 weights = ~wtfinl,
                 data = sub)
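As a quick sanity check on the design object, we can estimate the overall weighted proportion not working due to COVID with svymean():

# Overall weighted estimate of COVID-related non-work
svymean(~covdunaw, design = des)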

First, we examine the % of US adults with COVID-related unemployment by education level, and do a survey-corrected chi-square test for independence.

cat<-svyby(formula = ~covdunaw,
           by = ~edatt,
           design = des,
           FUN = svymean,
           na.rm=T)

svychisq(~covdunaw+edatt,
         design = des)
## 
##  Pearson's X^2: Rao & Scott adjustment
## 
## data:  svychisq(~covdunaw + edatt, design = des)
## F = 3.6511, ndf = 4.9545, ddf = 39933.3413, p-value = 0.002738

Plot of estimates with standard errors

library(ggplot2)
cat%>%
  ggplot()+
  geom_point(aes(x=edatt,y=covdunaw))+
  geom_errorbar(aes(x=edatt, ymin = covdunaw-1.96*se, 
                    ymax= covdunaw+1.96*se),
                width=.25)+
   labs(title = "Percent % of US Adults not working due to COVID by Education", 
        caption = "Source: IPUMS CPS, 2020 \n Calculations by David R. Rodriguez, MA.",
       x = "Education",
       y = "%  Not working due to COVID shutdown")+
  theme_minimal()

Calculate the sex by covdunaw cross-tabulation, and plot it

dog<-svyby(formula = ~covdunaw,
           by = ~female, 
           design = des, 
           FUN = svymean,
           na.rm=T)

svychisq(~covdunaw+female,
         design = des)
## 
##  Pearson's X^2: Rao & Scott adjustment
## 
## data:  svychisq(~covdunaw + female, design = des)
## F = 0.63052, ndf = 1, ddf = 8060, p-value = 0.4272
dog%>%
  ggplot()+
  geom_point(aes(x=female,y=covdunaw))+
  geom_errorbar(aes(x=female, ymin = covdunaw-1.96*se, 
                    ymax= covdunaw+1.96*se),
                width=.25)+
   labs(title = "Percent % of US Adults not working due to COVID by Sex", 
        caption = "Source: IPUMS CPS, 2020 \n Calculations by David R. Rodriguez, MA.",
       x = "Sex",
       y = "%  Not working due to COVID shutdown")+
  theme_minimal()

Calculate the covdunaw cross-tabulation by sex and education, and plot it

catdog<-svyby(formula = ~covdunaw,
              by = ~female+edatt,
              design = des,
              FUN = svymean,
              na.rm=T)

# This plot is a little more complicated: facet_wrap() draws a separate panel for each group

catdog%>%
  ggplot()+
  geom_point(aes(x=edatt, y = covdunaw))+ 
  geom_errorbar(aes(x=edatt, ymin = covdunaw-1.96*se, 
                    ymax= covdunaw+1.96*se),
                width=.25)+
  facet_wrap(~ female, nrow = 3)+
  labs(title = "Percent % of US Adults not working due to COVID by Sex and education", 
        caption = "Source: IPUMS CPS, 2020 \n Calculations by David R. Rodriguez, MA.",
       x = "Sex",
       x = "Education",
       y = "%  Not working due to COVID shutdown")+
  theme_minimal()

Fitting the logistic regression model

To fit the model, we use svyglm() from the survey package.

# Logit model
fit.logit <- svyglm(covdunaw ~ female + edatt,
                    design = des,
                    family = binomial)
## Warning in eval(family$initialize): non-integer #successes in a binomial glm!
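This warning appears because the survey weights turn the 0/1 outcome into non-integer weighted counts; it does not signal a problem with the fit. If you prefer to silence it, a common approach is to refit with family = quasibinomial, which yields identical point estimates (a sketch; fit.logit.q is just an illustrative name):

# Same point estimates as the binomial fit, without the warning
fit.logit.q <- svyglm(covdunaw ~ female + edatt,
                      design = des,
                      family = quasibinomial)
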
library(broom)
fit.logit%>%
  tidy()%>%
  knitr::kable(digits = 3)
|term          | estimate| std.error| statistic| p.value|
|:-------------|--------:|---------:|---------:|-------:|
|(Intercept)   |   -2.216|     0.288|    -7.688|   0.000|
|female        |    0.068|     0.092|     0.742|   0.458|
|edatt1somehs  |   -0.596|     0.344|    -1.730|   0.084|
|edatt2hsgrad  |    0.067|     0.294|     0.227|   0.821|
|edatt3somecol |   -0.039|     0.303|    -0.130|   0.897|
|edatt4undgrad |    0.059|     0.292|     0.201|   0.841|
|edatt5advgrad |   -0.446|     0.318|    -1.403|   0.161|

Get odds ratios and confidence intervals for the estimates

The most straightforward way to get odds ratios is to exponentiate the \(\beta\) parameters. These are stored in coefficients(fit.logit).
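In base R this is a one-liner: coef() extracts the coefficient vector and exp() converts it to odds ratios.

# Odds ratios from the fitted model coefficients
exp(coef(fit.logit))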

fit.logit%>%
  tidy()%>%
  mutate(OR = exp(estimate))%>%
  knitr::kable(digits = 3)
|term          | estimate| std.error| statistic| p.value|    OR|
|:-------------|--------:|---------:|---------:|-------:|-----:|
|(Intercept)   |   -2.216|     0.288|    -7.688|   0.000| 0.109|
|female        |    0.068|     0.092|     0.742|   0.458| 1.070|
|edatt1somehs  |   -0.596|     0.344|    -1.730|   0.084| 0.551|
|edatt2hsgrad  |    0.067|     0.294|     0.227|   0.821| 1.069|
|edatt3somecol |   -0.039|     0.303|    -0.130|   0.897| 0.961|
|edatt4undgrad |    0.059|     0.292|     0.201|   0.841| 1.060|
|edatt5advgrad |   -0.446|     0.318|    -1.403|   0.161| 0.640|

And we can get simple confidence intervals for these estimates using confint(), which works for most models.
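For example, exponentiating the confint() output gives odds-ratio intervals directly; these are Wald intervals based on the model's standard errors, so they should closely match the hand-computed intervals below:

# Odds-ratio confidence intervals
exp(confint(fit.logit))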

fit.logit%>%
  tidy()%>%
  mutate(OR = exp(estimate),
         LowerOR_Ci = exp(estimate - 1.96*std.error),
         UpperOR_Ci = exp(estimate + 1.96*std.error))%>%
  knitr::kable(digits = 3)
|term          | estimate| std.error| statistic| p.value|    OR| LowerOR_Ci| UpperOR_Ci|
|:-------------|--------:|---------:|---------:|-------:|-----:|----------:|----------:|
|(Intercept)   |   -2.216|     0.288|    -7.688|   0.000| 0.109|      0.062|      0.192|
|female        |    0.068|     0.092|     0.742|   0.458| 1.070|      0.894|      1.281|
|edatt1somehs  |   -0.596|     0.344|    -1.730|   0.084| 0.551|      0.281|      1.082|
|edatt2hsgrad  |    0.067|     0.294|     0.227|   0.821| 1.069|      0.601|      1.902|
|edatt3somecol |   -0.039|     0.303|    -0.130|   0.897| 0.961|      0.531|      1.742|
|edatt4undgrad |    0.059|     0.292|     0.201|   0.841| 1.060|      0.598|      1.880|
|edatt5advgrad |   -0.446|     0.318|    -1.403|   0.161| 0.640|      0.343|      1.194|

A slightly more digestible form can be obtained from the sjPlot library. In this plot, if an error bar crosses 1, that effect is not statistically significant.

library(sjPlot)
## Registered S3 methods overwritten by 'lme4':
##   method                          from
##   cooks.distance.influence.merMod car 
##   influence.merMod                car 
##   dfbeta.influence.merMod         car 
##   dfbetas.influence.merMod        car
## Learn more about sjPlot with 'browseVignettes("sjPlot")'.
plot_model(fit.logit,
           title = "Odds ratios for Not Working due to COVID shut down")

# Get predicted probabilities for different "types" of people from the model
# ref_grid() generates all possible combinations of the model's predictors

library(emmeans)
rg <- ref_grid(fit.logit)

marg_logit <- emmeans(object = rg,
                      specs = c("female", "edatt"),
                      type = "response")

knitr::kable(marg_logit,  digits = 4)
| female|edatt    |   prob|     SE|  df| asymp.LCL| asymp.UCL|
|------:|:--------|------:|------:|---:|---------:|---------:|
|      0|0to8     | 0.0983| 0.0256| Inf|    0.0583|    0.1609|
|      1|0to8     | 0.1045| 0.0265| Inf|    0.0628|    0.1689|
|      0|1somehs  | 0.0567| 0.0108| Inf|    0.0389|    0.0820|
|      1|1somehs  | 0.0604| 0.0115| Inf|    0.0414|    0.0874|
|      0|2hsgrad  | 0.1044| 0.0089| Inf|    0.0882|    0.1231|
|      1|2hsgrad  | 0.1109| 0.0093| Inf|    0.0939|    0.1305|
|      0|3somecol | 0.0949| 0.0104| Inf|    0.0764|    0.1173|
|      1|3somecol | 0.1009| 0.0111| Inf|    0.0810|    0.1249|
|      0|4undgrad | 0.1036| 0.0086| Inf|    0.0880|    0.1216|
|      1|4undgrad | 0.1101| 0.0086| Inf|    0.0944|    0.1281|
|      0|5advgrad | 0.0652| 0.0096| Inf|    0.0487|    0.0868|
|      1|5advgrad | 0.0695| 0.0096| Inf|    0.0529|    0.0909|
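emmeans can also test whether these probabilities differ across groups: pairs() computes pairwise contrasts, and by = "female" compares education levels within each sex. Because the model is a logit and type = "response" was used, the contrasts come back on the odds-ratio scale (a sketch):

# Pairwise education contrasts within each sex, as odds ratios
pairs(marg_logit, by = "female")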

Fitted Values

An interesting case is that of men with only some high school education having the lowest predicted probability (0.057) of losing work due to a COVID shutdown. This might be explained by this group being overrepresented in occupations considered “essential” work.

comps <- as.data.frame(marg_logit)

# Fitted probabilities for men (female == 0) at each education level
comps[comps$female == "0" & comps$edatt == "1somehs", ]
comps[comps$female == "0" & comps$edatt == "2hsgrad", ]
comps[comps$female == "0" & comps$edatt == "3somecol", ]
comps[comps$female == "0" & comps$edatt == "4undgrad", ]
comps[comps$female == "0" & comps$edatt == "5advgrad", ]
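Equivalently, a dplyr version of the same lookup is easier to scan: filter to men and sort by prob so the lowest fitted probability comes first (a minimal sketch using the comps data frame built above):

comps %>%
  filter(female == 0) %>%  # men only
  arrange(prob)            # lowest predicted probability first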