Part A: Understanding and running PSM

1. Download the data

library(dplyr)  # for select(), group_by(), summarise() and the pipe used throughout

ecls_full <- read.csv("ecls.csv")
ecls <- select(ecls_full, c5r2mtsc_std, catholic, w3income_1k, p5numpla, w3momed_hsb)

2. Identify the problem

• Do average math test scores (c5r2mtsc_std) differ between pupils of Catholic and non-Catholic schools?

ecls %>%
  group_by(catholic) %>%
  summarise(n_students = n(),
            mean_math = mean(c5r2mtsc_std),
            std_error = sd(c5r2mtsc_std) / sqrt(n_students))
## # A tibble: 2 × 4
##   catholic n_students mean_math std_error
##      <int>      <int>     <dbl>     <dbl>
## 1        0       4499     0.163    0.0145
## 2        1        930     0.220    0.0281

The raw group means do differ: about 0.22 for Catholic-school pupils versus 0.16 for the rest. This difference cannot be read as the effect of attending a Catholic school, however, because no other factors are controlled for.
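
As a quick cross-check, the same unadjusted comparison can be run as a two-sample t-test (a minimal sketch; output not shown):

t.test(c5r2mtsc_std ~ catholic, data = ecls)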

library(stargazer)
lm_naive <- lm(c5r2mtsc_std ~ catholic, data = ecls)
stargazer(lm_naive, keep.stat = c("n", "adj.rsq"), type = "text", single.row = TRUE)
## 
## ========================================
##                  Dependent variable:    
##              ---------------------------
##                     c5r2mtsc_std        
## ----------------------------------------
## catholic            0.057 (0.034)       
## Constant          0.163*** (0.014)      
## ----------------------------------------
## Observations            5,429           
## Adjusted R2            0.0003           
## ========================================
## Note:        *p<0.1; **p<0.05; ***p<0.01
lm_controls <- lm(c5r2mtsc_std ~ catholic  +
                          w3income_1k + p5numpla + w3momed_hsb, data = ecls)
stargazer(lm_controls,  keep.stat = c("n", "adj.rsq"), type = "text", single.row = TRUE)
## 
## ========================================
##                  Dependent variable:    
##              ---------------------------
##                     c5r2mtsc_std        
## ----------------------------------------
## catholic          -0.127*** (0.033)     
## w3income_1k       0.005*** (0.0003)     
## p5numpla          -0.101*** (0.036)     
## w3momed_hsb       -0.374*** (0.027)     
## Constant            0.073 (0.049)       
## ----------------------------------------
## Observations            5,429           
## Adjusted R2             0.124           
## ========================================
## Note:        *p<0.1; **p<0.05; ***p<0.01

After adding the controls, the coefficient on catholic is statistically significant but negative: attending a Catholic school is associated with lower math scores, the opposite of what the naive comparison suggested.
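
To see the sign flip directly, the two catholic coefficients can be pulled out side by side (a small illustrative sketch; output not shown):

c(naive = coef(lm_naive)["catholic"],
  with_controls = coef(lm_controls)["catholic"])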

ecls %>%
  group_by(catholic) %>%
  summarise_all(mean)
## # A tibble: 2 × 5
##   catholic c5r2mtsc_std w3income_1k p5numpla w3momed_hsb
##      <int>        <dbl>       <dbl>    <dbl>       <dbl>
## 1        0        0.163        65.4     1.11       0.392
## 2        1        0.220        86.2     1.07       0.205
t1 <- t.test(w3income_1k ~ catholic, data = ecls)
t2 <- t.test(p5numpla ~ catholic, data = ecls)
t3 <- t.test(w3momed_hsb ~ catholic, data = ecls)
t1$p.value; t2$p.value; t3$p.value
## [1] 1.212805e-37
## [1] 0.001791557
## [1] 1.530072e-33

The treatment indicator catholic is significantly associated with all three covariates: family income, the number of places the child has lived (p5numpla), and mother's education (w3momed_hsb). The treated and control groups are therefore not comparable on observables, so we have a covariate imbalance problem.
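
A standard way to quantify this imbalance is the standardized mean difference; the sketch below computes it by hand for the three covariates (the smd helper is illustrative and not part of the original analysis):

# standardized mean difference: gap in group means divided by the pooled SD
smd <- function(x, treat) {
  (mean(x[treat == 1]) - mean(x[treat == 0])) /
    sqrt((var(x[treat == 1]) + var(x[treat == 0])) / 2)
}
sapply(ecls[, c("w3income_1k", "p5numpla", "w3momed_hsb")], smd, treat = ecls$catholic)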

3. Estimate the propensity to go to catholic school, by hand

cath_logit <- glm(catholic ~  w3income_1k  + p5numpla + w3momed_hsb,
            family = binomial(), data = ecls)
stargazer(cath_logit, type = "text")
## 
## =============================================
##                       Dependent variable:    
##                   ---------------------------
##                            catholic          
## ---------------------------------------------
## w3income_1k                0.008***          
##                             (0.001)          
##                                              
## p5numpla                   -0.277**          
##                             (0.123)          
##                                              
## w3momed_hsb                -0.650***         
##                             (0.092)          
##                                              
## Constant                   -1.670***         
##                             (0.158)          
##                                              
## ---------------------------------------------
## Observations                 5,429           
## Log Likelihood            -2,375.301         
## Akaike Inf. Crit.          4,758.603         
## =============================================
## Note:             *p<0.1; **p<0.05; ***p<0.01
# make a prediction: fitted propensity scores on the probability (response) scale
prs_df <- data.frame(pr_score = predict(cath_logit, type = "response"),
                     catholic = cath_logit$model$catholic)
# keep the scores in the analysis data (the prs_df column is reused later on)
ecls$prs_df <- prs_df$pr_score

library(ggplot2)
ggplot(ecls, aes(x = prs_df, y = w3income_1k, color = as.factor(catholic))) +
    geom_point(alpha = 0.2, size = 1.3) +
    geom_smooth(method = "loess", se = F) +
    xlab("Propensity score")

labs <- paste("Actual school type attended:", c("Catholic", "Public"))
prs_df %>%
  mutate(catholic = ifelse(catholic == 1, labs[1], labs[2])) %>%
  ggplot(aes(x = pr_score)) +
  geom_histogram(color = "white") +
  facet_wrap(~catholic) +
  xlab("Probability of going to Catholic school") +
  theme_bw()

bal <- ggplot(prs_df, aes(pr_score, fill = as.factor(catholic))) +
  stat_density(aes(y = after_stat(density)), position = "identity",
               color = "black", alpha = 0.5)
bal
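
These plots are a visual check of common support. As a rough numerical complement (a minimal sketch; output not shown), the range of estimated scores can be compared by group:

prs_df %>%
  group_by(catholic) %>%
  summarise(min_score = min(pr_score), max_score = max(pr_score))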

4. Run a matching algorithm to get a balanced dataset

library(MatchIt)
psm <- matchit(catholic ~ w3income_1k + p5numpla + w3momed_hsb,
               method = "nearest", data = ecls)
# match.data() keeps only the matched observations
psm_data <- match.data(psm)
dim(psm_data) # 1,860 rows = 930 matched pairs
## [1] 1860    9
head(arrange(psm_data, distance), 10)
##    c5r2mtsc_std catholic w3income_1k p5numpla w3momed_hsb     prs_df   distance
## 1    -1.9516170        0     17.5005        2           1 0.06069529 0.06069529
## 2    -2.5922340        1     17.5005        2           1 0.06069529 0.06069529
## 3     0.2015457        0     22.5005        2           1 0.06295488 0.06295488
## 4     0.5271555        1     22.5005        2           1 0.06295488 0.06295488
## 5    -0.1217994        0     27.5005        2           1 0.06529274 0.06529274
## 6    -1.6878765        1     27.5005        2           1 0.06529274 0.06529274
## 7    -0.3333479        0     32.5005        2           1 0.06771114 0.06771114
## 8    -1.2722942        1     32.5005        2           1 0.06771114 0.06771114
## 9    -0.7403859        0     37.5005        2           1 0.07021240 0.07021240
## 10   -0.7841369        1     37.5005        2           1 0.07021240 0.07021240
##    weights subclass
## 1        1      598
## 2        1      598
## 3        1      222
## 4        1      222
## 5        1      272
## 6        1      272
## 7        1      859
## 8        1      859
## 9        1      728
## 10       1      728

Within each matched pair, the propensity scores (the distance column) are very close.
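
Using the subclass column that identifies the matched pairs, the claim can be checked for every pair, not just the ten closest (a hedged sketch; output not shown):

psm_data %>%
  group_by(subclass) %>%
  summarise(gap = abs(diff(distance))) %>%
  summarise(max_gap = max(gap))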

psm_data %>%
  group_by(catholic) %>%
  summarise_all(mean)
## # A tibble: 2 × 9
##   catholic c5r2mtsc_std w3income_1k p5numpla w3momed_hsb prs_df distance weights
##      <int>        <dbl>       <dbl>    <dbl>       <dbl>  <dbl>    <dbl>   <dbl>
## 1        0        0.321        86.2     1.07       0.205  0.203    0.203       1
## 2        1        0.220        86.2     1.07       0.205  0.203    0.203       1
## # ℹ 1 more variable: subclass <dbl>
t1 <- t.test(w3income_1k ~ catholic, data = psm_data)
t2 <- t.test(p5numpla ~ catholic, data = psm_data)
t3 <- t.test(w3momed_hsb ~ catholic, data = psm_data)
t1$p.value; t2$p.value; t3$p.value
## [1] 1
## [1] 1
## [1] 1

This time the covariate means are identical across groups and none of the t-tests rejects equality: the covariate imbalance problem is gone in the matched sample.
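
MatchIt can also report balance statistics for the matched and unmatched samples directly (output not shown):

summary(psm)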

5. Visually check the balance in the new dataset:

ggplot(psm_data, aes(x = distance, y = w3momed_hsb, color = as.factor(catholic))) +
    geom_point(alpha = 0.2, size = 1.3) +
    geom_smooth(method = "loess", se = F) +
    xlab("Propensity score")

6. Compute the average treatment effect in your new sample

psm_data %>%
  group_by(catholic) %>%
  summarise(n_students = n(),
            mean_math = mean(c5r2mtsc_std),
            std_error = sd(c5r2mtsc_std) / sqrt(n_students))
## # A tibble: 2 × 4
##   catholic n_students mean_math std_error
##      <int>      <int>     <dbl>     <dbl>
## 1        0        930     0.321    0.0301
## 2        1        930     0.220    0.0281
lm_psm_naive <- lm(c5r2mtsc_std ~ catholic, data = psm_data)
stargazer(lm_psm_naive,  keep.stat = c("n", "adj.rsq"), type='text')
## 
## ========================================
##                  Dependent variable:    
##              ---------------------------
##                     c5r2mtsc_std        
## ----------------------------------------
## catholic              -0.101**          
##                        (0.041)          
##                                         
## Constant              0.321***          
##                        (0.029)          
##                                         
## ----------------------------------------
## Observations            1,860           
## Adjusted R2             0.003           
## ========================================
## Note:        *p<0.1; **p<0.05; ***p<0.01
lm_psm_controls <- lm(c5r2mtsc_std ~ catholic +
                  w3income_1k + p5numpla + w3momed_hsb, data = psm_data)
stargazer(lm_psm_controls,  keep.stat = c("n", "adj.rsq"), type='text')
## 
## ========================================
##                  Dependent variable:    
##              ---------------------------
##                     c5r2mtsc_std        
## ----------------------------------------
## catholic              -0.101**          
##                        (0.040)          
##                                         
## w3income_1k           0.004***          
##                       (0.0005)          
##                                         
## p5numpla               -0.115           
##                        (0.071)          
##                                         
## w3momed_hsb           -0.297***         
##                        (0.050)          
##                                         
## Constant                0.151           
##                        (0.092)          
##                                         
## ----------------------------------------
## Observations            1,860           
## Adjusted R2             0.072           
## ========================================
## Note:        *p<0.1; **p<0.05; ***p<0.01
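
As a cross-check (an illustrative sketch, not part of the original output), the same matched-sample difference can be recovered as the mean of within-pair outcome differences using the subclass pairing:

pair_diffs <- psm_data %>%
  arrange(subclass, desc(catholic)) %>%   # treated observation first within each pair
  group_by(subclass) %>%
  summarise(diff = first(c5r2mtsc_std) - last(c5r2mtsc_std))
mean(pair_diffs$diff)  # should match the catholic coefficient in lm_psm_naive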

Part B: Deconstructing PSM

7. Split the PSM procedure

# the hand-computed propensity scores should coincide with matchit's internal distance measure
plot(ecls$prs_df, psm$distance)
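
If the two sets of scores coincide, the points fall on the 45-degree line; a one-line numerical check (output not shown):

all.equal(as.numeric(ecls$prs_df), as.numeric(psm$distance))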

8. Use Machine Learning for matching

library(glmnet)
# keep only the numeric columns of the full data as candidate predictors
numeric <- unlist(lapply(ecls_full, is.numeric))
ymat <- as.matrix(ecls_full$catholic)
xall <- as.matrix(ecls_full[ , numeric])
# drop the first numeric column (catholic itself) so the treatment is not among the predictors
xall <- xall[, 2:dim(xall)[2]]
head(xall)
##      race_white race_black race_hispanic race_asian p5numpla p5hmage p5hdage
## [1,]          1          0             0          0        1      47      45
## [2,]          1          0             0          0        1      41      48
## [3,]          1          0             0          0        1      43      55
## [4,]          1          0             0          0        1      38      39
## [5,]          1          0             0          0        1      47      57
## [6,]          1          0             0          0        1      41      41
##      w3daded_hsb w3momed_hsb w3momscr w3dadscr w3income w3povrty p5fstamp
## [1,]           0           0    53.50     77.5  62500.5        0        0
## [2,]           0           0    34.95     53.5  45000.5        0        0
## [3,]           0           0    63.43     53.5  62500.5        0        0
## [4,]           0           0    53.50     53.5  87500.5        0        0
## [5,]           0           0    61.56     77.5 150000.5        0        0
## [6,]           0           0    38.18     53.5  62500.5        0        0
##      c5r2mtsc c5r2mtsc_std w3income_1k
## [1,]   60.043    0.9817533     62.5005
## [2,]   56.280    0.5943775     45.0005
## [3,]   55.272    0.4906106     62.5005
## [4,]   64.604    1.4512779     87.5005
## [5,]   75.721    2.5956991    150.0005
## [6,]   54.248    0.3851966     62.5005
pick_lasso <- cv.glmnet(y=ymat, x=xall, alpha=1, family="binomial")
plot(pick_lasso)

best_lasso <- glmnet(y=ymat, x=xall, alpha=1, lambda=pick_lasso$lambda.1se)
coef(best_lasso)
## 18 x 1 sparse Matrix of class "dgCMatrix"
##                          s0
## (Intercept)    4.528656e-02
## race_white     .           
## race_black     .           
## race_hispanic  .           
## race_asian     .           
## p5numpla       .           
## p5hmage        2.436445e-03
## p5hdage        .           
## w3daded_hsb   -2.681065e-02
## w3momed_hsb   -2.758450e-02
## w3momscr       8.008080e-05
## w3dadscr       .           
## w3income       7.521171e-07
## w3povrty      -8.906012e-03
## p5fstamp       .           
## c5r2mtsc       .           
## c5r2mtsc_std   .           
## w3income_1k    4.438951e-08
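
The dots in the table are coefficients shrunk exactly to zero; the covariates lasso keeps can be listed programmatically (a small sketch):

sel <- as.matrix(coef(best_lasso))
setdiff(rownames(sel)[sel[, 1] != 0], "(Intercept)")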

9. Use Lasso predictions for matching.

# lasso predictions are on the link (log-odds) scale by default;
# matchit() uses the supplied vector directly as the distance measure
pr_lasso <- predict(best_lasso, newx = xall)
psm_lasso <- matchit(catholic ~ w3income_1k + p5numpla + w3momed_hsb,
                     distance = as.numeric(pr_lasso), method = "nearest", data = ecls)
psm_data_lasso <- match.data(psm_lasso)
lm_psm_lasso <- lm(c5r2mtsc_std ~ catholic +
                  w3income_1k + p5numpla + w3momed_hsb, data = psm_data_lasso)
stargazer(lm_psm_lasso,  keep.stat = c("n", "adj.rsq"), type='text')
## 
## ========================================
##                  Dependent variable:    
##              ---------------------------
##                     c5r2mtsc_std        
## ----------------------------------------
## catholic              -0.183***         
##                        (0.040)          
##                                         
## w3income_1k           0.004***          
##                       (0.0005)          
##                                         
## p5numpla                0.040           
##                        (0.060)          
##                                         
## w3momed_hsb           -0.350***         
##                        (0.052)          
##                                         
## Constant                0.097           
##                        (0.085)          
##                                         
## ----------------------------------------
## Observations            1,860           
## Adjusted R2             0.080           
## ========================================
## Note:        *p<0.1; **p<0.05; ***p<0.01
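
As with the logit-based match, balance in the lasso-matched sample is worth inspecting before interpreting the estimate (output not shown):

summary(psm_lasso)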