library(tidyverse)
library(kableExtra)
library(broom)
library(texreg)
library(patchwork)
library(readr)
library(lubridate)
library(boot)
library(stargazer)
library(fixest)
library(pander)
library(MatchIt)
df_2019 <- read.csv("nov19pub.csv")
df_2021 <- read.csv("nov21pub.csv")
names(df_2019) <- tolower(names(df_2019))
names(df_2021) <- tolower(names(df_2021))
n_raw_2019 <- as.character(nrow(df_2019))
n_raw_2021 <- as.character(nrow(df_2021))
n_unique_2019 <- as.character(length(unique(df_2019$hrhhid)))
n_unique_2021 <- as.character(length(unique(df_2021$hrhhid)))
The raw data contains 138850 and 127375 observations for 2019 and 2021, respectively.
The unique household IDs are 55107 and 54184 for 2019 and 2021, respectively.
Variable prtage
indicates the age of members of a
household, which is uniquely identified by variable
hrhhid
Filter to people between 18 to 40.
df_2019 <- df_2019 %>% filter(prtage > 21 & prtage < 35)
n_row_agerestrict_2019 <- as.character(nrow(df_2019))
df_2021 <- df_2021 %>% filter(prtage> 21 & prtage < 35)
n_row_agerestrict_2021 <- as.character(nrow(df_2021))
df_2019_s <- df_2019 %>% select(hrhhid, hryear4, pemlr, pruntype, gereg, pemaritl, peeduca, peedtrai, pesex, ptdtrace, gtmetsta, prtage)
df_2021_s <- df_2021 %>% select(hrhhid, hryear4, pemlr, pruntype, gereg, pemaritl, peeduca, peedtrai, pesex, ptdtrace, gtmetsta, prtage)
df <- rbind(df_2019_s, df_2021_s)
df %>% names()
## [1] "hrhhid" "hryear4" "pemlr" "pruntype" "gereg" "pemaritl"
## [7] "peeduca" "peedtrai" "pesex" "ptdtrace" "gtmetsta" "prtage"
df %>% count(hryear4)
## hryear4 n
## 1 2019 18594
## 2 2021 15925
df %>% group_by(hryear4) %>% count(pemlr)
## # A tibble: 16 × 3
## # Groups: hryear4 [2]
## hryear4 pemlr n
## <int> <int> <int>
## 1 2019 -1 205
## 2 2019 1 14357
## 3 2019 2 318
## 4 2019 3 54
## 5 2019 4 514
## 6 2019 5 108
## 7 2019 6 458
## 8 2019 7 2580
## 9 2021 -1 152
## 10 2021 1 12009
## 11 2021 2 342
## 12 2021 3 46
## 13 2021 4 479
## 14 2021 5 137
## 15 2021 6 407
## 16 2021 7 2353
df %>% group_by(pemlr) %>% count(pruntype)
## # A tibble: 12 × 3
## # Groups: pemlr [8]
## pemlr pruntype n
## <int> <int> <int>
## 1 -1 -1 357
## 2 1 -1 26366
## 3 2 -1 660
## 4 3 1 100
## 5 4 2 273
## 6 4 3 124
## 7 4 4 180
## 8 4 5 355
## 9 4 6 61
## 10 5 -1 245
## 11 6 -1 865
## 12 7 -1 4933
labor
= employed,
unemployed_layoff, and unemployed_lookingdf$arm <-df$pruntype
df$arm[df$pruntype==-1]<-"employed_control"
df$arm[df$pruntype==1 | df$pruntype==2] <-"involuntary exit"
df$arm[df$pruntype==3 | df$pruntype==4] <-"voluntary exit"
df$arm[df$pruntype==5] <-"re-entrants"
df$arm[df$pruntype==6] <-"new-entrants"
df$labor <-df$pemlr
df$labor[df$labor==1 | df$labor==2] <- "employed"
df$labor[df$labor==5| df$labor==6 | df$labor==7] <-"not in LB"
df$labor[df$labor==3] <-"unemployed_layoff"
df$labor[df$labor==4] <-"unemployed_looking"
df %>% group_by(arm) %>% count(labor) %>%
kable("html", col.names=(c("Treatment Arm", "Labor Force Status", "Count"))) %>%
kableExtra::kable_styling(bootstrap_options = c("striped", "hover")) %>% scroll_box(width = "100%")
Treatment Arm | Labor Force Status | Count |
---|---|---|
employed_control | -1 | 357 |
employed_control | employed | 27026 |
employed_control | not in LB | 6043 |
involuntary exit | unemployed_layoff | 100 |
involuntary exit | unemployed_looking | 273 |
new-entrants | unemployed_looking | 61 |
re-entrants | unemployed_looking | 355 |
voluntary exit | unemployed_looking | 304 |
# 1. Region (assuming gereg is already present in df)
df$region <- df$gereg
# 2. Marital Status
df$married <- 0
df$married[df$pemaritl == 1 | df$pemaritl == 2] <- 1
# 3. Gender: Male binary
df$male <- ifelse(df$pesex == 1, 1, 0)
# 4. Education Status
df$educ <- 1
df$educ[df$peeduca %in% 31:38] <- 1
df$educ[df$peeduca %in% 39:40] <- 2
df$educ[df$peeduca %in% 41:42] <- 3
df$educ[df$peeduca == 43] <- 4
df$educ[df$peeduca %in% 44:46] <- 5
table(df$educ) # Equivalent to tab educ in Stata
##
## 1 2 3 4 5
## 2053 15831 3513 9519 3603
# 5. Race
df$race <- 0
df$race[df$ptdtrace == 1] <- 1
df$race[df$ptdtrace == 2] <- 2
df$race[df$ptdtrace == 4] <- 3
df$race[df$race == 0] <- 4
table(df$race) # Equivalent to tab race in Stata
##
## 1 2 3 4
## 26805 3748 2340 1626
df$white <- ifelse(df$race == 1, 1, 0)
df$black <- ifelse(df$race == 2, 1, 0)
# 6. Metropolitan Status
df$metro <- ifelse(df$gtmetsta == 1, 1, 0)
df1 <- df %>%
filter(labor=="employed" | labor == "unemployed_layoff" | labor=="unemployed_looking")
df1 %>%
group_by(arm) %>%
count(labor)
## # A tibble: 6 × 3
## # Groups: arm [5]
## arm labor n
## <chr> <chr> <int>
## 1 employed_control employed 27026
## 2 involuntary exit unemployed_layoff 100
## 3 involuntary exit unemployed_looking 273
## 4 new-entrants unemployed_looking 61
## 5 re-entrants unemployed_looking 355
## 6 voluntary exit unemployed_looking 304
df1 %>%
group_by(hryear4) %>%
count(arm)
## # A tibble: 10 × 3
## # Groups: hryear4 [2]
## hryear4 arm n
## <int> <chr> <int>
## 1 2019 employed_control 14675
## 2 2019 involuntary exit 197
## 3 2019 new-entrants 36
## 4 2019 re-entrants 177
## 5 2019 voluntary exit 158
## 6 2021 employed_control 12351
## 7 2021 involuntary exit 176
## 8 2021 new-entrants 25
## 9 2021 re-entrants 178
## 10 2021 voluntary exit 146
#Proportion of involuntary unemployed is 1.27% in 2019 and 1.52% in 2021.
df1 %>%
count(peedtrai)
## peedtrai n
## 1 -1 15217
## 2 1 4047
## 3 2 8855
df1$onlineeduc[df1$peedtrai==1] = 1
df1$onlineeduc[df1$peedtrai==2] = 0
df1 %>%
count(onlineeduc)
## onlineeduc n
## 1 0 8855
## 2 1 4047
## 3 NA 15217
data_regression <-df1 %>%
select(onlineeduc, labor, arm, hryear4, region, educ, married, male, black, prtage) %>% filter(arm=="involuntary exit" | arm=="voluntary exit" | arm=="employed_control")
regression1 <- feols(onlineeduc ~ i(arm) + i(hryear4) + i(educ) + prtage, data = data_regression)
etable(
regression1)
## regression1
## Dependent Var.: onlineeduc
##
## Constant 0.4133*** (0.0396)
## arm = involuntaryexit -0.0830* (0.0353)
## arm = voluntaryexit 0.0078 (0.0382)
## hryear4 = 2021 0.0487*** (0.0081)
## educ = 2 0.0950*** (0.0222)
## educ = 3 0.1669*** (0.0246)
## educ = 4 0.2298*** (0.0225)
## educ = 5 0.2997*** (0.0240)
## prtage -0.0101*** (0.0012)
## _____________________ ___________________
## S.E. type IID
## Observations 12,733
## R2 0.03716
## Adj. R2 0.03655
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
probit <- glm(onlineeduc ~i(region) + i(educ) + i(arm) , family = binomial(link = "probit"), data = data_regression)
probit
##
## Call: glm(formula = onlineeduc ~ i(region) + i(educ) + i(arm), family = binomial(link = "probit"),
## data = data_regression)
##
## Coefficients:
## (Intercept) i(region)1 i(region)2
## -0.03477 -0.17154 -0.05077
## i(region)3 i(region)4 i(educ)1
## -0.12135 NA -0.89579
## i(educ)2 i(educ)3 i(educ)4
## -0.52994 -0.33558 -0.14902
## i(educ)5 i(arm)employed_control i(arm)involuntary exit
## NA -0.04860 -0.31327
## i(arm)voluntary exit
## NA
##
## Degrees of Freedom: 12732 Total (i.e. Null); 12723 Residual
## (14970 observations deleted due to missingness)
## Null Deviance: 15850
## Residual Deviance: 15460 AIC: 15480
# Estimate propensity scores
data_regression2 <- df1 %>%
filter(arm == "involuntary exit" | arm == "involuntary exit" | arm == "voluntary exit" )
# create binary variable for treatment
data_regression2$inv <- ifelse(data_regression2$arm == "involuntary exit", 1, 0)
propensity_model <- glm(inv ~i(region) + i(educ), family = binomial(link = "probit"), data = data_regression2)
# Extract propensity scores
data_regression2$pscore <- predict(propensity_model, type = "response")
outcome_model <- lm(onlineeduc ~ inv + pscore, data = data_regression2)
summary(outcome_model)
##
## Call:
## lm(formula = onlineeduc ~ inv + pscore, data = data_regression2)
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.4121 -0.2577 -0.1835 -0.1104 0.8896
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 0.59856 0.13839 4.325 2.06e-05 ***
## inv -0.09818 0.04815 -2.039 0.0423 *
## pscore -0.56578 0.25260 -2.240 0.0258 *
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.4191 on 311 degrees of freedom
## (363 observations deleted due to missingness)
## Multiple R-squared: 0.03421, Adjusted R-squared: 0.028
## F-statistic: 5.508 on 2 and 311 DF, p-value: 0.004459