1. Download the data

data_url <- "http://www.mfilipski.com/files/ecls.csv"
ecls_full <- read.csv(data_url)
ecls <- select(ecls_full, c5r2mtsc_std, catholic, w3income_1k, p5numpla, w3momed_hsb)

2. See the problem

ecls %>%
  group_by(catholic) %>%
  summarise(n_students = n(),
            mean_math = mean(c5r2mtsc_std),
            std_error = sd(c5r2mtsc_std) / sqrt(n_students))
catholic n_students mean_math std_error
0 4499 0.1631279 0.0145166
1 930 0.2196851 0.0281317

Key idea: Yes, they differ, but that means nothing. There’s an obvious “self-selection” problem, because people choosing to send their kids to catholic school are likely different than people who do not.

lm_naive <- lm(c5r2mtsc_std ~ catholic, data = ecls)
stargazer(lm_naive, keep.stat = c("n", "adj.rsq"), type='html', single.row = TRUE)
Dependent variable:
c5r2mtsc_std
catholic 0.057 (0.034)
Constant 0.163*** (0.014)
Observations 5,429
Adjusted R2 0.0003
Note: p<0.1; p<0.05; p<0.01
lm_controls <- lm(c5r2mtsc_std ~ catholic  +
                          w3income_1k + p5numpla + w3momed_hsb, data = ecls)
stargazer(lm_controls,  keep.stat = c("n", "adj.rsq"), type='html', single.row = TRUE)
Dependent variable:
c5r2mtsc_std
catholic -0.127*** (0.033)
w3income_1k 0.005*** (0.0003)
p5numpla -0.101*** (0.036)
w3momed_hsb -0.374*** (0.027)
Constant 0.073 (0.049)
Observations 5,429
Adjusted R2 0.124
Note: p<0.1; p<0.05; p<0.01
ecls %>%
  group_by(catholic) %>%
  summarise_all(mean)
catholic c5r2mtsc_std w3income_1k p5numpla w3momed_hsb
0 0.1631279 65.39393 1.106246 0.3923094
1 0.2196851 86.18063 1.073118 0.2053763

The significance tests give the following p-values. All are significantly different between groups.

t1 <- t.test(ecls[, "w3income_1k"] ~ ecls[, 'catholic'])
t2 <- t.test(ecls[, "p5numpla"] ~ ecls[, 'catholic'])
t3 <- t.test(ecls[, "w3momed_hsb"] ~ ecls[, 'catholic'])
t1$p.value; t2$p.value; t3$p.value
## [1] 1.212805e-37
## [1] 0.001791557
## [1] 1.530072e-33

Estimate the propensity to go to catholic school, by hand

cath_logit <- glm(catholic ~  w3income_1k  + p5numpla + w3momed_hsb,
            family = binomial(), data = ecls)
summary(cath_logit)
## 
## Call:
## glm(formula = catholic ~ w3income_1k + p5numpla + w3momed_hsb, 
##     family = binomial(), data = ecls)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -1.0172  -0.6461  -0.5240  -0.4122   2.3672  
## 
## Coefficients:
##               Estimate Std. Error z value Pr(>|z|)    
## (Intercept) -1.6702950  0.1576164 -10.597  < 2e-16 ***
## w3income_1k  0.0077921  0.0008115   9.602  < 2e-16 ***
## p5numpla    -0.2774346  0.1232930  -2.250   0.0244 *  
## w3momed_hsb -0.6504757  0.0915710  -7.104 1.22e-12 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 4972.4  on 5428  degrees of freedom
## Residual deviance: 4750.6  on 5425  degrees of freedom
## AIC: 4758.6
## 
## Number of Fisher Scoring iterations: 4
prs_df <- data.frame(pr_score = predict(cath_logit, type = "response"),
                     catholic = cath_logit$model$catholic)

Here’s the head of the predicted data:

head(prs_df)
pr_score catholic
0.1883576 0
0.1683901 0
0.1883576 0
0.2199574 1
0.3145556 0
0.1883576 0
ecls$prs_df = prs_df$pr_score
ggplot(ecls, aes(x = prs_df, y = w3income_1k, color = as.factor(catholic))) +
    geom_point(alpha = 0.2, size = 1.3) +
    geom_smooth(method = "loess", se = F) +
    xlab("Logit prediction") 

labs <- paste("Actual school type attended:", c("Catholic", "Public"))
prs_df %>%
  mutate(catholic = ifelse(catholic == 1, labs[1], labs[2])) %>%
  ggplot(aes(x = pr_score)) +
  geom_histogram(color = "white") +
  facet_wrap(~catholic) +
  xlab("Probability of going to Catholic school") +
  theme_bw()

Or you can do it with densities:

bal <- ggplot(prs_df, aes(pr_score, fill = as.factor(catholic))) + 
  stat_density(aes(y = ..density..),position = "identity" 
               , color = "black", alpha=0.5)
bal

Key idea: there seems to be decent support.

Run a Matching algorithm to get the impact of catholic schools by PSM

psm <- matchit(catholic ~  w3income_1k  + p5numpla + w3momed_hsb,
                     method = "nearest",  data = ecls)
# This is the new dataframe with matched data
psm_data <- match.data(psm)
dim(psm_data) # 2704 = 1352 pairs were matched
## [1] 1860    9
head(arrange(psm_data, distance), 10)
c5r2mtsc_std catholic w3income_1k p5numpla w3momed_hsb prs_df distance weights subclass
-0.3835843 0 17.5005 2 1 0.0606953 0.0606953 1 598
-2.5922340 1 17.5005 2 1 0.0606953 0.0606953 1 598
-2.3690526 0 22.5005 2 1 0.0629549 0.0629549 1 222
0.5271555 1 22.5005 2 1 0.0629549 0.0629549 1 222
-0.2195955 0 27.5005 2 1 0.0652927 0.0652927 1 272
-1.6878765 1 27.5005 2 1 0.0652927 0.0652927 1 272
-0.2550080 0 32.5005 2 1 0.0677111 0.0677111 1 859
-1.2722942 1 32.5005 2 1 0.0677111 0.0677111 1 859
-0.7403859 0 37.5005 2 1 0.0702124 0.0702124 1 728
-0.7841369 1 37.5005 2 1 0.0702124 0.0702124 1 728
psm_data %>%
  group_by(catholic) %>%
  summarise_all(mean)
catholic c5r2mtsc_std w3income_1k p5numpla w3momed_hsb prs_df distance weights subclass
0 0.3345634 86.18063 1.073118 0.2053763 0.2033602 0.2033602 1 NA
1 0.2196851 86.18063 1.073118 0.2053763 0.2033602 0.2033602 1 NA

Significance tests:

t1 <- t.test(psm_data[, "w3income_1k"] ~ psm_data[, 'catholic'])
t2 <- t.test(psm_data[, "p5numpla"] ~ psm_data[, 'catholic'])
t3 <- t.test(psm_data[, "w3momed_hsb"] ~ psm_data[, 'catholic'])
t1$p.value; t2$p.value; t3$p.value
## [1] 1
## [1] 1
## [1] 1

Key idea: The new dataset is “matched” in terms of the covariates. On average, the treated and control have the same covariate values.

Visually check the balance in the new dataset:

ggplot(psm_data, aes(x = distance, y = w3momed_hsb, color = as.factor(catholic))) +
    geom_point(alpha = 0.2, size = 1.3) +
    geom_smooth(method = "loess", se = F) +
    xlab("Propensity score") 

* The matching is so perfect that you can’t really see the two subsamples, but they are there:

g1 <- ggplot(psm_data, aes(x = distance, y = w3momed_hsb, color = as.factor(catholic), shape = as.factor(catholic))) +
    geom_point(data = subset(psm_data, catholic == 0), size = 1.9, color = "red") +
    geom_point(alpha = 0.2, size = 1.3) +
    geom_smooth(method = "loess", se = F) +
    xlab("Propensity score") 
g1

You can repeat this for all covariates if you want.

Compute the average treatment effect in your new sample

psm_data %>%
  group_by(catholic) %>%
  summarise(n_students = n(),
            mean_math = mean(c5r2mtsc_std),
            std_error = sd(c5r2mtsc_std) / sqrt(n_students))
catholic n_students mean_math std_error
0 930 0.3345634 0.0297682
1 930 0.2196851 0.0281317
lm_psm_naive <- lm(c5r2mtsc_std ~ catholic, data = psm_data)
stargazer(lm_psm_naive,  keep.stat = c("n", "adj.rsq"), type='html', single.row = TRUE)
Dependent variable:
c5r2mtsc_std
catholic -0.115*** (0.041)
Constant 0.335*** (0.029)
Observations 1,860
Adjusted R2 0.004
Note: p<0.1; p<0.05; p<0.01
lm_psm_controls <- lm(c5r2mtsc_std ~ catholic +
                  w3income_1k + p5numpla + w3momed_hsb, data = psm_data)
stargazer(lm_psm_controls,  keep.stat = c("n", "adj.rsq"), type='html', single.row = TRUE)
Dependent variable:
c5r2mtsc_std
catholic -0.115*** (0.039)
w3income_1k 0.004*** (0.0005)
p5numpla -0.091 (0.070)
w3momed_hsb -0.366*** (0.050)
Constant 0.151* (0.091)
Observations 1,860
Adjusted R2 0.086
Note: p<0.1; p<0.05; p<0.01

Run a LASSO procedure on the ORIGINAL dataset (the one with all the variables)

Hints: You can use cv.glmnet followed by glmnet with the best lambda (use lambda.1se, not lambda.min) (lambda.1se the “most regularized model such that error is within one standard error of the minimum”) This command likes to have x in matrix form, so make sure you make x into a matrix and only keep the numeric variables.

numeric <- unlist(lapply(ecls_full, is.numeric))  
ymat <- as.matrix(ecls_full$catholic)
xall <- as.matrix(ecls_full[ , numeric])
xall <- xall[,2:dim(xall)[2]]
head(xall)
##      race_white race_black race_hispanic race_asian p5numpla p5hmage p5hdage
## [1,]          1          0             0          0        1      47      45
## [2,]          1          0             0          0        1      41      48
## [3,]          1          0             0          0        1      43      55
## [4,]          1          0             0          0        1      38      39
## [5,]          1          0             0          0        1      47      57
## [6,]          1          0             0          0        1      41      41
##      w3daded_hsb w3momed_hsb w3momscr w3dadscr w3income w3povrty p5fstamp
## [1,]           0           0    53.50     77.5  62500.5        0        0
## [2,]           0           0    34.95     53.5  45000.5        0        0
## [3,]           0           0    63.43     53.5  62500.5        0        0
## [4,]           0           0    53.50     53.5  87500.5        0        0
## [5,]           0           0    61.56     77.5 150000.5        0        0
## [6,]           0           0    38.18     53.5  62500.5        0        0
##      c5r2mtsc c5r2mtsc_std w3income_1k
## [1,]   60.043    0.9817533     62.5005
## [2,]   56.280    0.5943775     45.0005
## [3,]   55.272    0.4906106     62.5005
## [4,]   64.604    1.4512779     87.5005
## [5,]   75.721    2.5956991    150.0005
## [6,]   54.248    0.3851966     62.5005
pick_lasso <- cv.glmnet(y=ymat, x=xall, alpha=1, family="binomial")
plot(pick_lasso)

best_lasso <- glmnet(y=ymat, x=xall, alpha=1, lambda=pick_lasso$lambda.1se)

If you chose according to lambda.1se, how many variables would you have chosen to include? Which variables would you have chosen to run the PSM command? Do they differ from those we actually used? (hint: use coef() command)

coef(best_lasso)
## 18 x 1 sparse Matrix of class "dgCMatrix"
##                          s0
## (Intercept)    2.471876e-02
## race_white     4.366097e-03
## race_black     .           
## race_hispanic  .           
## race_asian     .           
## p5numpla       .           
## p5hmage        2.800406e-03
## p5hdage        .           
## w3daded_hsb   -2.936129e-02
## w3momed_hsb   -2.968947e-02
## w3momscr       2.045521e-04
## w3dadscr       .           
## w3income       7.606707e-07
## w3povrty      -1.487361e-02
## p5fstamp       .           
## c5r2mtsc       .           
## c5r2mtsc_std   .           
## w3income_1k    1.161918e-08

Key idea: Lasso selected 7 variables (as can be seen in the plot) plus an intercept. Hence “shrinkage and selection

Use Lasso predictions for matching

pr_lasso = predict(best_lasso, newx = xall)
psm_lasso <- matchit(catholic ~   w3income_1k  + p5numpla + w3momed_hsb, 
                     distance = as.numeric(pr_lasso), method= "nearest", data = ecls)
psm_data_lasso <- match.data(psm_lasso)
lm_psm_lasso <- lm(c5r2mtsc_std ~ catholic +
                  w3income_1k + p5numpla + w3momed_hsb, data = psm_data_lasso)
stargazer(lm_psm_lasso,  keep.stat = c("n", "adj.rsq"), type='html', single.row = TRUE)
Dependent variable:
c5r2mtsc_std
catholic -0.195*** (0.040)
w3income_1k 0.004*** (0.0005)
p5numpla -0.018 (0.066)
w3momed_hsb -0.312*** (0.052)
Constant 0.162* (0.088)
Observations 1,860
Adjusted R2 0.077
Note: p<0.1; p<0.05; p<0.01

Key idea: The Lasso procedure can be used as a substitute for the logit in our “matchit”. All you need to do is use lasso prediction to redefine the distance parameter. In this case it does not lead to better matching, but we can imagine cases where it might be beneficial to change the substitute ML into the first stage of a PSM.