library(tidyverse)
library(stargazer)
library(MatchIt)
library(glmnet)
data_url <- "http://www.mfilipski.com/files/ecls.csv"
ecls_full <- read.csv(data_url)
ecls <- select(ecls_full, c5r2mtsc_std, catholic, w3income_1k, p5numpla, w3momed_hsb)
#mean difference
ecls %>%
group_by(catholic) %>%
summarise(n_students = n(),
mean_math = mean(c5r2mtsc_std),
std_error = sd(c5r2mtsc_std) / sqrt(n_students))
## # A tibble: 2 x 4
## catholic n_students mean_math std_error
## <int> <int> <dbl> <dbl>
## 1 0 4499 0.163 0.0145
## 2 1 930 0.220 0.0281
Yes, their average of math scores are different. However, it is important to consider the potential self-selection issue that may arise, where the characteristics of the parents who choose Catholic schools for their children may differ from those who opt for other types of schools.Therefore, this variation does not necessarily indicate any significant disparity.
reg1 <- lm(c5r2mtsc_std ~ catholic, data = ecls)
reg1
##
## Call:
## lm(formula = c5r2mtsc_std ~ catholic, data = ecls)
##
## Coefficients:
## (Intercept) catholic
## 0.16313 0.05656
reg2 <- lm(c5r2mtsc_std ~ catholic +
w3income_1k + p5numpla + w3momed_hsb, data = ecls)
reg2
##
## Call:
## lm(formula = c5r2mtsc_std ~ catholic + w3income_1k + p5numpla +
## w3momed_hsb, data = ecls)
##
## Coefficients:
## (Intercept) catholic w3income_1k p5numpla w3momed_hsb
## 0.073329 -0.127390 0.005325 -0.100943 -0.374035
comments: On average, individuals who attend Catholic schools demonstrate a standardized math score increase of 0.5656 in comparison to those who do not attend Catholic schools. Once I include the control variables, the coefficients are flipped. On average, individuals who attend Catholic schools demonstrate a standardized math score decrease of 0.1274 in comparison to those who do not attend Catholic schools. However, there are other variables that may be affecting the outcome of our study, which we have not accounted for and therefore may introduce bias in our results.
ecls %>%
group_by(catholic) %>%
summarise_all(mean)
## # A tibble: 2 x 5
## catholic c5r2mtsc_std w3income_1k p5numpla w3momed_hsb
## <int> <dbl> <dbl> <dbl> <dbl>
## 1 0 0.163 65.4 1.11 0.392
## 2 1 0.220 86.2 1.07 0.205
t1 <- t.test(ecls[, "w3income_1k"] ~ ecls[, 'catholic'])
t2 <- t.test(ecls[, "p5numpla"] ~ ecls[, 'catholic'])
t3 <- t.test(ecls[, "w3momed_hsb"] ~ ecls[, 'catholic'])
t1$p.value; t2$p.value; t3$p.value
## [1] 1.212805e-37
## [1] 0.001791557
## [1] 1.530072e-33
Discuss what would be the problems with this identification strategy: the t-test indicates that there are notable discrepancies between the characteristics of the treated and control groups at the start of the study, which need further consideration when interpreting the findings.
#Run a logit regression to predict catholic based on the covariates we chose.
cath_logit <- glm(catholic ~ w3income_1k + p5numpla + w3momed_hsb,
family = binomial(), data = ecls)
cath_logit
##
## Call: glm(formula = catholic ~ w3income_1k + p5numpla + w3momed_hsb,
## family = binomial(), data = ecls)
##
## Coefficients:
## (Intercept) w3income_1k p5numpla w3momed_hsb
## -1.670295 0.007792 -0.277435 -0.650476
##
## Degrees of Freedom: 5428 Total (i.e. Null); 5425 Residual
## Null Deviance: 4972
## Residual Deviance: 4751 AIC: 4759
#Make a prediction for who is likely to go to catholic school
prs_df <- data.frame(pr_score = predict(cath_logit, type = "response"),
catholic = cath_logit$model$catholic)
head(prs_df)
## pr_score catholic
## 1 0.1883576 0
## 2 0.1683901 0
## 3 0.1883576 0
## 4 0.2199574 1
## 5 0.3145556 0
## 6 0.1883576 0
#Visually check income versus your predicted value
ecls$prs_df = prs_df$pr_score
ggplot(ecls, aes(x = prs_df, y = w3income_1k, color = as.factor(catholic))) +
geom_point(alpha = 0.2, size = 1.3) +
geom_smooth(method = "loess", se = F) +
xlab("Logit prediction")
#Check that there is common support.
labs <- paste("Actual school type attended:", c("Catholic", "Public"))
prs_df %>%
mutate(catholic = ifelse(catholic == 1, labs[1], labs[2])) %>%
ggplot(aes(x = pr_score)) +
geom_histogram(color = "white") +
facet_wrap(~catholic) +
xlab("Probability of going to Catholic school") +
theme_bw()
bal <- ggplot(prs_df, aes(pr_score, fill = as.factor(catholic))) +
stat_density(aes(y = ..density..),position = "identity"
, color = "black", alpha=0.5)
bal
comments: It appears that there is a considerable amount of common support.
psm <- matchit(catholic ~ w3income_1k + p5numpla + w3momed_hsb,
method = "nearest", data = ecls)
# This is the new dataframe with matched data
psm_data <- match.data(psm)
dim(psm_data)
## [1] 1860 9
A total of 2704 observations were made, comprising 1352 pairs.
head(arrange(psm_data, distance), 10)
## c5r2mtsc_std catholic w3income_1k p5numpla w3momed_hsb prs_df distance
## 1 -1.9516170 0 17.5005 2 1 0.06069529 0.06069529
## 2 -2.5922340 1 17.5005 2 1 0.06069529 0.06069529
## 3 0.2015457 0 22.5005 2 1 0.06295488 0.06295488
## 4 0.5271555 1 22.5005 2 1 0.06295488 0.06295488
## 5 -0.1217994 0 27.5005 2 1 0.06529274 0.06529274
## 6 -1.6878765 1 27.5005 2 1 0.06529274 0.06529274
## 7 -0.3333479 0 32.5005 2 1 0.06771114 0.06771114
## 8 -1.2722942 1 32.5005 2 1 0.06771114 0.06771114
## 9 -0.7403859 0 37.5005 2 1 0.07021240 0.07021240
## 10 -0.7841369 1 37.5005 2 1 0.07021240 0.07021240
## weights subclass
## 1 1 598
## 2 1 598
## 3 1 222
## 4 1 222
## 5 1 272
## 6 1 272
## 7 1 859
## 8 1 859
## 9 1 728
## 10 1 728
psm_data %>%
group_by(catholic) %>%
summarise_all(mean)
## # A tibble: 2 x 9
## catholic c5r2mtsc_std w3income_1k p5numpla w3momed_hsb prs_df distance weights
## <int> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 0 0.321 86.2 1.07 0.205 0.203 0.203 1
## 2 1 0.220 86.2 1.07 0.205 0.203 0.203 1
## # i 1 more variable: subclass <dbl>
t1 <- t.test(psm_data[, "w3income_1k"] ~ psm_data[, 'catholic'])
t2 <- t.test(psm_data[, "p5numpla"] ~ psm_data[, 'catholic'])
t3 <- t.test(psm_data[, "w3momed_hsb"] ~ psm_data[, 'catholic'])
t1$p.value; t2$p.value; t3$p.value
## [1] 1
## [1] 1
## [1] 1
comments: The new dataset has been matched with regards to the covariates, indicating that the treated and control groups have similar covariate values on average.
#Plot the income variable against the propensity score (by catholic) and plot the lowess smooth densities
ggplot(psm_data, aes(x = distance, y = w3momed_hsb, color = as.factor(catholic))) +
geom_point(alpha = 0.2, size = 1.3) +
geom_smooth(method = "loess", se = F) +
xlab("Propensity score")
g1 <- ggplot(psm_data, aes(x = distance, y = w3momed_hsb, color = as.factor(catholic), shape = as.factor(catholic))) +
geom_point(data = subset(psm_data, catholic == 0), size = 1.9, color = "red") +
geom_point(alpha = 0.2, size = 1.3) +
geom_smooth(method = "loess", se = F) +
xlab("Propensity score")
g1
The matching is perfect.
psm_data %>%
group_by(catholic) %>%
summarise(n_students = n(),
mean_math = mean(c5r2mtsc_std),
std_error = sd(c5r2mtsc_std) / sqrt(n_students))
## # A tibble: 2 x 4
## catholic n_students mean_math std_error
## <int> <int> <dbl> <dbl>
## 1 0 930 0.321 0.0301
## 2 1 930 0.220 0.0281
lm_psm_naive <- lm(c5r2mtsc_std ~ catholic, data = psm_data)
lm_psm_naive
##
## Call:
## lm(formula = c5r2mtsc_std ~ catholic, data = psm_data)
##
## Coefficients:
## (Intercept) catholic
## 0.3208 -0.1012
lm_psm_controls <- lm(c5r2mtsc_std ~ catholic +
w3income_1k + p5numpla + w3momed_hsb, data = psm_data)
lm_psm_controls
##
## Call:
## lm(formula = c5r2mtsc_std ~ catholic + w3income_1k + p5numpla +
## w3momed_hsb, data = psm_data)
##
## Coefficients:
## (Intercept) catholic w3income_1k p5numpla w3momed_hsb
## 0.15105 -0.10116 0.00411 -0.11500 -0.29723
comments: On average, individuals who attend Catholic schools demonstrate a standardized math score decrease of 0.115 in comparison to those who do not attend Catholic schools regardless of including the covariates.
numeric <- unlist(lapply(ecls_full, is.numeric))
ymat <- as.matrix(ecls_full$catholic)
xall <- as.matrix(ecls_full[ , numeric])
xall <- xall[,2:dim(xall)[2]]
head(xall)
## race_white race_black race_hispanic race_asian p5numpla p5hmage p5hdage
## [1,] 1 0 0 0 1 47 45
## [2,] 1 0 0 0 1 41 48
## [3,] 1 0 0 0 1 43 55
## [4,] 1 0 0 0 1 38 39
## [5,] 1 0 0 0 1 47 57
## [6,] 1 0 0 0 1 41 41
## w3daded_hsb w3momed_hsb w3momscr w3dadscr w3income w3povrty p5fstamp
## [1,] 0 0 53.50 77.5 62500.5 0 0
## [2,] 0 0 34.95 53.5 45000.5 0 0
## [3,] 0 0 63.43 53.5 62500.5 0 0
## [4,] 0 0 53.50 53.5 87500.5 0 0
## [5,] 0 0 61.56 77.5 150000.5 0 0
## [6,] 0 0 38.18 53.5 62500.5 0 0
## c5r2mtsc c5r2mtsc_std w3income_1k
## [1,] 60.043 0.9817533 62.5005
## [2,] 56.280 0.5943775 45.0005
## [3,] 55.272 0.4906106 62.5005
## [4,] 64.604 1.4512779 87.5005
## [5,] 75.721 2.5956991 150.0005
## [6,] 54.248 0.3851966 62.5005
pick_lasso <- cv.glmnet(y=ymat, x=xall, alpha=1, family="binomial")
plot(pick_lasso)
best_lasso <- glmnet(y=ymat, x=xall, alpha=1, lambda=pick_lasso$lambda.1se)
coef(best_lasso)
## 18 x 1 sparse Matrix of class "dgCMatrix"
## s0
## (Intercept) 2.443081e-02
## race_white 1.140340e-02
## race_black .
## race_hispanic .
## race_asian .
## p5numpla .
## p5hmage 3.275488e-03
## p5hdage 2.128876e-05
## w3daded_hsb -3.375192e-02
## w3momed_hsb -3.319165e-02
## w3momscr 3.788223e-04
## w3dadscr .
## w3income 7.825630e-07
## w3povrty -2.274717e-02
## p5fstamp .
## c5r2mtsc -5.563925e-04
## c5r2mtsc_std -1.816243e-09
## w3income_1k 2.691617e-06
As evidenced in the plot, Lasso has chosen seven variables in addition to an intercept.
pr_lasso = predict(best_lasso, newx = xall)
psm_lasso <- matchit(catholic ~ w3income_1k + p5numpla + w3momed_hsb,
distance = as.numeric(pr_lasso), method= "nearest", data = ecls)
psm_data_lasso <- match.data(psm_lasso)
lm_psm_lasso <- lm(c5r2mtsc_std ~ catholic +
w3income_1k + p5numpla + w3momed_hsb, data = psm_data_lasso)
lm_psm_lasso
##
## Call:
## lm(formula = c5r2mtsc_std ~ catholic + w3income_1k + p5numpla +
## w3momed_hsb, data = psm_data_lasso)
##
## Coefficients:
## (Intercept) catholic w3income_1k p5numpla w3momed_hsb
## 0.239193 -0.168722 0.003913 -0.106540 -0.358850