library(tidyverse)
library(stargazer)
library(MatchIt)
library(glmnet)

1. Download the data

data_url <- "http://www.mfilipski.com/files/ecls.csv"
ecls_full <- read.csv(data_url)
ecls <- select(ecls_full, c5r2mtsc_std, catholic, w3income_1k, p5numpla, w3momed_hsb)

2. See the problem

#mean difference
ecls %>%
  group_by(catholic) %>%
  summarise(n_students = n(),
            mean_math = mean(c5r2mtsc_std),
            std_error = sd(c5r2mtsc_std) / sqrt(n_students))
## # A tibble: 2 x 4
##   catholic n_students mean_math std_error
##      <int>      <int>     <dbl>     <dbl>
## 1        0       4499     0.163    0.0145
## 2        1        930     0.220    0.0281

Yes, their average of math scores are different. However, it is important to consider the potential self-selection issue that may arise, where the characteristics of the parents who choose Catholic schools for their children may differ from those who opt for other types of schools.Therefore, this variation does not necessarily indicate any significant disparity.

reg1 <- lm(c5r2mtsc_std ~ catholic, data = ecls)
reg1 
## 
## Call:
## lm(formula = c5r2mtsc_std ~ catholic, data = ecls)
## 
## Coefficients:
## (Intercept)     catholic  
##     0.16313      0.05656
reg2 <- lm(c5r2mtsc_std ~ catholic  +
                          w3income_1k + p5numpla + w3momed_hsb, data = ecls)
reg2
## 
## Call:
## lm(formula = c5r2mtsc_std ~ catholic + w3income_1k + p5numpla + 
##     w3momed_hsb, data = ecls)
## 
## Coefficients:
## (Intercept)     catholic  w3income_1k     p5numpla  w3momed_hsb  
##    0.073329    -0.127390     0.005325    -0.100943    -0.374035

comments: On average, individuals who attend Catholic schools demonstrate a standardized math score increase of 0.5656 in comparison to those who do not attend Catholic schools. Once I include the control variables, the coefficients are flipped. On average, individuals who attend Catholic schools demonstrate a standardized math score decrease of 0.1274 in comparison to those who do not attend Catholic schools. However, there are other variables that may be affecting the outcome of our study, which we have not accounted for and therefore may introduce bias in our results.

ecls %>%
  group_by(catholic) %>%
  summarise_all(mean)
## # A tibble: 2 x 5
##   catholic c5r2mtsc_std w3income_1k p5numpla w3momed_hsb
##      <int>        <dbl>       <dbl>    <dbl>       <dbl>
## 1        0        0.163        65.4     1.11       0.392
## 2        1        0.220        86.2     1.07       0.205
t1 <- t.test(ecls[, "w3income_1k"] ~ ecls[, 'catholic'])
t2 <- t.test(ecls[, "p5numpla"] ~ ecls[, 'catholic'])
t3 <- t.test(ecls[, "w3momed_hsb"] ~ ecls[, 'catholic'])
t1$p.value; t2$p.value; t3$p.value
## [1] 1.212805e-37
## [1] 0.001791557
## [1] 1.530072e-33

Discuss what would be the problems with this identification strategy: the t-test indicates that there are notable discrepancies between the characteristics of the treated and control groups at the start of the study, which need further consideration when interpreting the findings.

3. Estimate the propensity to go to catholic school, by hand

#Run a logit regression to predict catholic based on the covariates we chose.
cath_logit <- glm(catholic ~  w3income_1k  + p5numpla + w3momed_hsb,
            family = binomial(), data = ecls)
cath_logit
## 
## Call:  glm(formula = catholic ~ w3income_1k + p5numpla + w3momed_hsb, 
##     family = binomial(), data = ecls)
## 
## Coefficients:
## (Intercept)  w3income_1k     p5numpla  w3momed_hsb  
##   -1.670295     0.007792    -0.277435    -0.650476  
## 
## Degrees of Freedom: 5428 Total (i.e. Null);  5425 Residual
## Null Deviance:       4972 
## Residual Deviance: 4751  AIC: 4759
#Make a prediction for who is likely to go to catholic school
prs_df <- data.frame(pr_score = predict(cath_logit, type = "response"),
                     catholic = cath_logit$model$catholic)
head(prs_df)
##    pr_score catholic
## 1 0.1883576        0
## 2 0.1683901        0
## 3 0.1883576        0
## 4 0.2199574        1
## 5 0.3145556        0
## 6 0.1883576        0
#Visually check income versus your predicted value
ecls$prs_df = prs_df$pr_score
ggplot(ecls, aes(x = prs_df, y = w3income_1k, color = as.factor(catholic))) +
    geom_point(alpha = 0.2, size = 1.3) +
    geom_smooth(method = "loess", se = F) +
    xlab("Logit prediction")

#Check that there is common support.
labs <- paste("Actual school type attended:", c("Catholic", "Public"))
prs_df %>%
  mutate(catholic = ifelse(catholic == 1, labs[1], labs[2])) %>%
  ggplot(aes(x = pr_score)) +
  geom_histogram(color = "white") +
  facet_wrap(~catholic) +
  xlab("Probability of going to Catholic school") +
  theme_bw()

bal <- ggplot(prs_df, aes(pr_score, fill = as.factor(catholic))) + 
  stat_density(aes(y = ..density..),position = "identity" 
               , color = "black", alpha=0.5)
bal

comments: It appears that there is a considerable amount of common support.

4. Run a Matching algorithm to get the impact of catholic schools by PSM

psm <- matchit(catholic ~  w3income_1k  + p5numpla + w3momed_hsb,
                     method = "nearest",  data = ecls)
# This is the new dataframe with matched data
psm_data <- match.data(psm)
dim(psm_data)
## [1] 1860    9

A total of 2704 observations were made, comprising 1352 pairs.

head(arrange(psm_data, distance), 10)
##    c5r2mtsc_std catholic w3income_1k p5numpla w3momed_hsb     prs_df   distance
## 1    -1.9516170        0     17.5005        2           1 0.06069529 0.06069529
## 2    -2.5922340        1     17.5005        2           1 0.06069529 0.06069529
## 3     0.2015457        0     22.5005        2           1 0.06295488 0.06295488
## 4     0.5271555        1     22.5005        2           1 0.06295488 0.06295488
## 5    -0.1217994        0     27.5005        2           1 0.06529274 0.06529274
## 6    -1.6878765        1     27.5005        2           1 0.06529274 0.06529274
## 7    -0.3333479        0     32.5005        2           1 0.06771114 0.06771114
## 8    -1.2722942        1     32.5005        2           1 0.06771114 0.06771114
## 9    -0.7403859        0     37.5005        2           1 0.07021240 0.07021240
## 10   -0.7841369        1     37.5005        2           1 0.07021240 0.07021240
##    weights subclass
## 1        1      598
## 2        1      598
## 3        1      222
## 4        1      222
## 5        1      272
## 6        1      272
## 7        1      859
## 8        1      859
## 9        1      728
## 10       1      728
psm_data %>%
  group_by(catholic) %>%
  summarise_all(mean)
## # A tibble: 2 x 9
##   catholic c5r2mtsc_std w3income_1k p5numpla w3momed_hsb prs_df distance weights
##      <int>        <dbl>       <dbl>    <dbl>       <dbl>  <dbl>    <dbl>   <dbl>
## 1        0        0.321        86.2     1.07       0.205  0.203    0.203       1
## 2        1        0.220        86.2     1.07       0.205  0.203    0.203       1
## # i 1 more variable: subclass <dbl>
t1 <- t.test(psm_data[, "w3income_1k"] ~ psm_data[, 'catholic'])
t2 <- t.test(psm_data[, "p5numpla"] ~ psm_data[, 'catholic'])
t3 <- t.test(psm_data[, "w3momed_hsb"] ~ psm_data[, 'catholic'])
t1$p.value; t2$p.value; t3$p.value
## [1] 1
## [1] 1
## [1] 1

comments: The new dataset has been matched with regards to the covariates, indicating that the treated and control groups have similar covariate values on average.

5. Visually check the balance in the new dataset:

#Plot the income variable against the propensity score (by catholic) and plot the lowess smooth densities
ggplot(psm_data, aes(x = distance, y = w3momed_hsb, color = as.factor(catholic))) +
    geom_point(alpha = 0.2, size = 1.3) +
    geom_smooth(method = "loess", se = F) +
    xlab("Propensity score") 

g1 <- ggplot(psm_data, aes(x = distance, y = w3momed_hsb, color = as.factor(catholic), shape = as.factor(catholic))) +
    geom_point(data = subset(psm_data, catholic == 0), size = 1.9, color = "red") +
    geom_point(alpha = 0.2, size = 1.3) +
    geom_smooth(method = "loess", se = F) +
    xlab("Propensity score") 
g1

The matching is perfect.

6. Compute the average treatment effect in your new sample

psm_data %>%
  group_by(catholic) %>%
  summarise(n_students = n(),
            mean_math = mean(c5r2mtsc_std),
            std_error = sd(c5r2mtsc_std) / sqrt(n_students))
## # A tibble: 2 x 4
##   catholic n_students mean_math std_error
##      <int>      <int>     <dbl>     <dbl>
## 1        0        930     0.321    0.0301
## 2        1        930     0.220    0.0281
lm_psm_naive <- lm(c5r2mtsc_std ~ catholic, data = psm_data)
lm_psm_naive
## 
## Call:
## lm(formula = c5r2mtsc_std ~ catholic, data = psm_data)
## 
## Coefficients:
## (Intercept)     catholic  
##      0.3208      -0.1012
lm_psm_controls <- lm(c5r2mtsc_std ~ catholic +
                  w3income_1k + p5numpla + w3momed_hsb, data = psm_data)
lm_psm_controls
## 
## Call:
## lm(formula = c5r2mtsc_std ~ catholic + w3income_1k + p5numpla + 
##     w3momed_hsb, data = psm_data)
## 
## Coefficients:
## (Intercept)     catholic  w3income_1k     p5numpla  w3momed_hsb  
##     0.15105     -0.10116      0.00411     -0.11500     -0.29723

comments: On average, individuals who attend Catholic schools demonstrate a standardized math score decrease of 0.115 in comparison to those who do not attend Catholic schools regardless of including the covariates.

8. Run a LASSO procedure on the ORIGINAL dataset

numeric <- unlist(lapply(ecls_full, is.numeric))  
ymat <- as.matrix(ecls_full$catholic)
xall <- as.matrix(ecls_full[ , numeric])
xall <- xall[,2:dim(xall)[2]]
head(xall)
##      race_white race_black race_hispanic race_asian p5numpla p5hmage p5hdage
## [1,]          1          0             0          0        1      47      45
## [2,]          1          0             0          0        1      41      48
## [3,]          1          0             0          0        1      43      55
## [4,]          1          0             0          0        1      38      39
## [5,]          1          0             0          0        1      47      57
## [6,]          1          0             0          0        1      41      41
##      w3daded_hsb w3momed_hsb w3momscr w3dadscr w3income w3povrty p5fstamp
## [1,]           0           0    53.50     77.5  62500.5        0        0
## [2,]           0           0    34.95     53.5  45000.5        0        0
## [3,]           0           0    63.43     53.5  62500.5        0        0
## [4,]           0           0    53.50     53.5  87500.5        0        0
## [5,]           0           0    61.56     77.5 150000.5        0        0
## [6,]           0           0    38.18     53.5  62500.5        0        0
##      c5r2mtsc c5r2mtsc_std w3income_1k
## [1,]   60.043    0.9817533     62.5005
## [2,]   56.280    0.5943775     45.0005
## [3,]   55.272    0.4906106     62.5005
## [4,]   64.604    1.4512779     87.5005
## [5,]   75.721    2.5956991    150.0005
## [6,]   54.248    0.3851966     62.5005
pick_lasso <- cv.glmnet(y=ymat, x=xall, alpha=1, family="binomial")
plot(pick_lasso)

best_lasso <- glmnet(y=ymat, x=xall, alpha=1, lambda=pick_lasso$lambda.1se)
coef(best_lasso)
## 18 x 1 sparse Matrix of class "dgCMatrix"
##                          s0
## (Intercept)    2.443081e-02
## race_white     1.140340e-02
## race_black     .           
## race_hispanic  .           
## race_asian     .           
## p5numpla       .           
## p5hmage        3.275488e-03
## p5hdage        2.128876e-05
## w3daded_hsb   -3.375192e-02
## w3momed_hsb   -3.319165e-02
## w3momscr       3.788223e-04
## w3dadscr       .           
## w3income       7.825630e-07
## w3povrty      -2.274717e-02
## p5fstamp       .           
## c5r2mtsc      -5.563925e-04
## c5r2mtsc_std  -1.816243e-09
## w3income_1k    2.691617e-06

As evidenced in the plot, Lasso has chosen seven variables in addition to an intercept.

9. Use Lasso predictions for matching

pr_lasso = predict(best_lasso, newx = xall)
psm_lasso <- matchit(catholic ~   w3income_1k  + p5numpla + w3momed_hsb, 
                     distance = as.numeric(pr_lasso), method= "nearest", data = ecls)
psm_data_lasso <- match.data(psm_lasso)
lm_psm_lasso <- lm(c5r2mtsc_std ~ catholic +
                  w3income_1k + p5numpla + w3momed_hsb, data = psm_data_lasso)
lm_psm_lasso
## 
## Call:
## lm(formula = c5r2mtsc_std ~ catholic + w3income_1k + p5numpla + 
##     w3momed_hsb, data = psm_data_lasso)
## 
## Coefficients:
## (Intercept)     catholic  w3income_1k     p5numpla  w3momed_hsb  
##    0.239193    -0.168722     0.003913    -0.106540    -0.358850