Part A: Understanding and running PSM

1. Download the data

library(dplyr)
library(stargazer)
library(ggplot2)
library(MatchIt)
library(glmnet)
data <- read.csv("ecls.csv")

# make smaller datset which only contains the variables:
mydata <- data %>% select(c5r2mtsc_std, catholic, w3income_1k, p5numpla, w3momed_hsb)

2. Identify the problem

  • Do test scores on c5r2mtsc_std differ for pupils of catholic and non-catholic schools on average?
# Performing t-test
t.test(c5r2mtsc_std ~ catholic, data = mydata)
## 
##  Welch Two Sample t-test
## 
## data:  c5r2mtsc_std by catholic
## t = -1.7866, df = 1468.1, p-value = 0.07421
## alternative hypothesis: true difference in means between group 0 and group 1 is not equal to 0
## 95 percent confidence interval:
##  -0.118653665  0.005539377
## sample estimates:
## mean in group 0 mean in group 1 
##       0.1631279       0.2196851

Mean for test scores from non-catholic schools is 0.1631 as against catholic schools which have a mean of 0.2197.

  • Can you conclude that going to a catholic school leads to higher math scores? Why or why not?

No. As there may be an endogeneity problem in the relationship between math scores and going to catholic schools.

  • Run regressions:
# Regression of math scores on the catholic dummy
y1 <- lm(c5r2mtsc_std ~ catholic, mydata)
stargazer(y1, type = "text")
## 
## ===============================================
##                         Dependent variable:    
##                     ---------------------------
##                            c5r2mtsc_std        
## -----------------------------------------------
## catholic                       0.057           
##                               (0.034)          
##                                                
## Constant                     0.163***          
##                               (0.014)          
##                                                
## -----------------------------------------------
## Observations                   5,429           
## R2                            0.0005           
## Adjusted R2                   0.0003           
## Residual Std. Error      0.955 (df = 5427)     
## F Statistic            2.704 (df = 1; 5427)    
## ===============================================
## Note:               *p<0.1; **p<0.05; ***p<0.01
# Regression with all variables
y2 <- lm(c5r2mtsc_std ~ ., mydata)
stargazer(y2, type = "text")
## 
## ===============================================
##                         Dependent variable:    
##                     ---------------------------
##                            c5r2mtsc_std        
## -----------------------------------------------
## catholic                     -0.127***         
##                               (0.033)          
##                                                
## w3income_1k                  0.005***          
##                              (0.0003)          
##                                                
## p5numpla                     -0.101***         
##                               (0.036)          
##                                                
## w3momed_hsb                  -0.374***         
##                               (0.027)          
##                                                
## Constant                       0.073           
##                               (0.049)          
##                                                
## -----------------------------------------------
## Observations                   5,429           
## R2                             0.124           
## Adjusted R2                    0.124           
## Residual Std. Error      0.894 (df = 5424)     
## F Statistic          192.272*** (df = 4; 5424) 
## ===============================================
## Note:               *p<0.1; **p<0.05; ***p<0.01

Regression of just math scores on the catholic dummy gives a positive but insignificant effect on math scores. However, when all the variables are included the effect is negative and significant at the 1% level. Also, all the other coefficients in the model are significant.

  • Do the means of the covariates (income, mother’s education etc) differ between catholic vs. non-catholic schools?
# Compare the means of the covariates
means <- mydata %>%    
  group_by(catholic) %>%      
  summarise(across(c(c5r2mtsc_std, w3income_1k, p5numpla, w3momed_hsb), list(mean = mean)))
means
## # A tibble: 2 × 5
##   catholic c5r2mtsc_std_mean w3income_1k_mean p5numpla_mean w3momed_hsb_mean
##      <int>             <dbl>            <dbl>         <dbl>            <dbl>
## 1        0             0.163             65.4          1.11            0.392
## 2        1             0.220             86.2          1.07            0.205
# Test the difference in means using t-tests
t.test(w3income_1k ~ catholic, data = mydata)
## 
##  Welch Two Sample t-test
## 
## data:  w3income_1k by catholic
## t = -13.238, df = 1314.5, p-value < 2.2e-16
## alternative hypothesis: true difference in means between group 0 and group 1 is not equal to 0
## 95 percent confidence interval:
##  -23.86719 -17.70620
## sample estimates:
## mean in group 0 mean in group 1 
##        65.39393        86.18063
t.test(p5numpla ~ catholic, data = mydata)
## 
##  Welch Two Sample t-test
## 
## data:  p5numpla by catholic
## t = 3.128, df = 1600.4, p-value = 0.001792
## alternative hypothesis: true difference in means between group 0 and group 1 is not equal to 0
## 95 percent confidence interval:
##  0.01235472 0.05390038
## sample estimates:
## mean in group 0 mean in group 1 
##        1.106246        1.073118
t.test(w3momed_hsb ~ catholic, data = mydata)
## 
##  Welch Two Sample t-test
## 
## data:  w3momed_hsb by catholic
## t = 12.362, df = 1545.1, p-value < 2.2e-16
## alternative hypothesis: true difference in means between group 0 and group 1 is not equal to 0
## 95 percent confidence interval:
##  0.1572715 0.2165946
## sample estimates:
## mean in group 0 mean in group 1 
##       0.3923094       0.2053763

For each of the t-tests the means are significantly different for catholic vs non-catholic schools. This means that the treatment and control groups are different from one another and any estimate will be biased.

3. Estimate the propensity to go to catholic school, by hand

  • Run a logit regression to predict catholic based on the covariates we kept
# Run logit regression
y3 <- glm(catholic ~ w3income_1k + p5numpla + w3momed_hsb, family = binomial(), data = mydata)
summary(y3)
## 
## Call:
## glm(formula = catholic ~ w3income_1k + p5numpla + w3momed_hsb, 
##     family = binomial(), data = mydata)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -1.0172  -0.6461  -0.5240  -0.4122   2.3672  
## 
## Coefficients:
##               Estimate Std. Error z value Pr(>|z|)    
## (Intercept) -1.6702950  0.1576164 -10.597  < 2e-16 ***
## w3income_1k  0.0077921  0.0008115   9.602  < 2e-16 ***
## p5numpla    -0.2774346  0.1232930  -2.250   0.0244 *  
## w3momed_hsb -0.6504757  0.0915710  -7.104 1.22e-12 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 4972.4  on 5428  degrees of freedom
## Residual deviance: 4750.6  on 5425  degrees of freedom
## AIC: 4758.6
## 
## Number of Fisher Scoring iterations: 4
  • Make a prediction for who is likely to go to catholic school, using this model
# Predict likeliness of going to catholic school
mydata$prediction <- predict(y3, type = "response")
head(mydata)
##   c5r2mtsc_std catholic w3income_1k p5numpla w3momed_hsb prediction
## 1    0.9817533        0     62.5005        1           0  0.1883576
## 2    0.5943775        0     45.0005        1           0  0.1683901
## 3    0.4906106        0     62.5005        1           0  0.1883576
## 4    1.4512779        1     87.5005        1           0  0.2199574
## 5    2.5956991        0    150.0005        1           0  0.3145556
## 6    0.3851966        0     62.5005        1           0  0.1883576
  • Check that there is common support
# Visually check income versus predicted value
g1 <- ggplot(data = mydata, aes(x = prediction, y= w3income_1k, color = as.factor(catholic)))+
  geom_point(aes(prediction, w3income_1k))+
  geom_smooth(aes(x = prediction, y= w3income_1k, color = as.factor(catholic), group = catholic), method = "loess")
g1

# Visual check with plot densities
g2 <- ggplot(data = mydata, aes(x = prediction, fill = as.factor(catholic)))+
  geom_density(alpha=0.5)
g2

4. Run a Matching algorithm to get a balanced dataset

  • Create a matched dataset:
psm <- matchit(catholic ~ w3income_1k + p5numpla + w3momed_hsb, family = binomial(), data = mydata)
psm_data <- match.data(psm)
dim(psm_data)
## [1] 1860    9

Number of observations reduced from 5429 to 1860.

  • Sort the data by distance, and show the 10 first observations.
# Sort the data by distance
sorted_psm_data <- arrange(psm_data, distance)
head(sorted_psm_data, n= 10)
##    c5r2mtsc_std catholic w3income_1k p5numpla w3momed_hsb prediction   distance
## 1    -0.3835843        0     17.5005        2           1 0.06069529 0.06069529
## 2    -2.5922340        1     17.5005        2           1 0.06069529 0.06069529
## 3    -2.3690526        0     22.5005        2           1 0.06295488 0.06295488
## 4     0.5271555        1     22.5005        2           1 0.06295488 0.06295488
## 5    -0.2195955        0     27.5005        2           1 0.06529274 0.06529274
## 6    -1.6878765        1     27.5005        2           1 0.06529274 0.06529274
## 7    -0.2550080        0     32.5005        2           1 0.06771114 0.06771114
## 8    -1.2722942        1     32.5005        2           1 0.06771114 0.06771114
## 9    -0.7403859        0     37.5005        2           1 0.07021240 0.07021240
## 10   -0.7841369        1     37.5005        2           1 0.07021240 0.07021240
##    weights subclass
## 1        1      598
## 2        1      598
## 3        1      222
## 4        1      222
## 5        1      272
## 6        1      272
## 7        1      859
## 8        1      859
## 9        1      728
## 10       1      728
  • Compare the means:
na.exclude(sorted_psm_data) %>% group_by(catholic) %>% summarise_all(mean)
## # A tibble: 2 × 9
##   catholic c5r2mtsc_std w3income_1k p5numpla w3momed_hsb prediction distance
##      <int>        <dbl>       <dbl>    <dbl>       <dbl>      <dbl>    <dbl>
## 1        0        0.335        86.2     1.07       0.205      0.203    0.203
## 2        1        0.220        86.2     1.07       0.205      0.203    0.203
## # … with 2 more variables: weights <dbl>, subclass <dbl>

Test the difference in means for statistical significance:

# Test using t-tests
t.test(w3income_1k ~ catholic, sorted_psm_data)
## 
##  Welch Two Sample t-test
## 
## data:  w3income_1k by catholic
## t = 0, df = 1858, p-value = 1
## alternative hypothesis: true difference in means between group 0 and group 1 is not equal to 0
## 95 percent confidence interval:
##  -3.985565  3.985565
## sample estimates:
## mean in group 0 mean in group 1 
##        86.18063        86.18063
t.test(p5numpla ~ catholic, sorted_psm_data)
## 
##  Welch Two Sample t-test
## 
## data:  p5numpla by catholic
## t = 0, df = 1858, p-value = 1
## alternative hypothesis: true difference in means between group 0 and group 1 is not equal to 0
## 95 percent confidence interval:
##  -0.02550007  0.02550007
## sample estimates:
## mean in group 0 mean in group 1 
##        1.073118        1.073118
t.test(w3momed_hsb ~ catholic, sorted_psm_data)
## 
##  Welch Two Sample t-test
## 
## data:  w3momed_hsb by catholic
## t = 0, df = 1858, p-value = 1
## alternative hypothesis: true difference in means between group 0 and group 1 is not equal to 0
## 95 percent confidence interval:
##  -0.03676158  0.03676158
## sample estimates:
## mean in group 0 mean in group 1 
##       0.2053763       0.2053763

The means are same for each t-test. PSM has now made both the treatment and control groups comparable.

5. Visually check the balance in the new dataset:

# Plot the income variable against the propensity score (by catholic)
g3 <- ggplot(data = sorted_psm_data, aes(x = prediction, y= w3income_1k, color = as.factor(catholic)))+
      geom_point(aes(prediction, w3income_1k))+
      geom_smooth(aes(x = prediction, y= w3income_1k, color = as.factor(catholic), group = catholic), method = "loess")
g3

The means are identical.

6. Compute the average treatment effect in your new sample

  • Compare means of the two groups
sorted_psm_data %>% group_by(catholic) %>% summarise_all(mean)
## # A tibble: 2 × 9
##   catholic c5r2mtsc_std w3income_1k p5numpla w3momed_hsb prediction distance
##      <int>        <dbl>       <dbl>    <dbl>       <dbl>      <dbl>    <dbl>
## 1        0        0.335        86.2     1.07       0.205      0.203    0.203
## 2        1        0.220        86.2     1.07       0.205      0.203    0.203
## # … with 2 more variables: weights <dbl>, subclass <dbl>
  • Run regressions:
# Regression of just scores on the catholic dummy
y4 <- lm(c5r2mtsc_std ~ catholic, sorted_psm_data)
stargazer(y4, type = "text")
## 
## ===============================================
##                         Dependent variable:    
##                     ---------------------------
##                            c5r2mtsc_std        
## -----------------------------------------------
## catholic                     -0.115***         
##                               (0.041)          
##                                                
## Constant                     0.335***          
##                               (0.029)          
##                                                
## -----------------------------------------------
## Observations                   1,860           
## R2                             0.004           
## Adjusted R2                    0.004           
## Residual Std. Error      0.883 (df = 1858)     
## F Statistic           7.867*** (df = 1; 1858)  
## ===============================================
## Note:               *p<0.1; **p<0.05; ***p<0.01
# Regression that includes all the variables
y5 <- lm(c5r2mtsc_std ~ catholic + w3income_1k + p5numpla + w3momed_hsb, sorted_psm_data)
stargazer(y5, type = "text")
## 
## ===============================================
##                         Dependent variable:    
##                     ---------------------------
##                            c5r2mtsc_std        
## -----------------------------------------------
## catholic                     -0.115***         
##                               (0.039)          
##                                                
## w3income_1k                  0.004***          
##                              (0.0005)          
##                                                
## p5numpla                      -0.091           
##                               (0.070)          
##                                                
## w3momed_hsb                  -0.366***         
##                               (0.050)          
##                                                
## Constant                      0.151*           
##                               (0.091)          
##                                                
## -----------------------------------------------
## Observations                   1,860           
## R2                             0.088           
## Adjusted R2                    0.086           
## Residual Std. Error      0.846 (df = 1855)     
## F Statistic          44.707*** (df = 4; 1855)  
## ===============================================
## Note:               *p<0.1; **p<0.05; ***p<0.01

Now, both the regressions show a negative effect of catholic schools on math scores that is significant at the 1% level.

Part B: Deconstructing PSM

7. Split the PSM procedure

  • Reproduce a PSM ‘by hand’
#Predictions from the logit regression
prediction_logit <- fitted(y3)
plot(psm$distance, prediction_logit)

8. Use Machine Learning for matching

  • Run a LASSO procedure on the ORIGINAL dataset
#Keep only numeric variables from original dataset
lassodata <- data %>% select(c5r2mtsc_std, catholic, race_white, race_black, race_hispanic, race_asian, p5numpla, p5hmage, p5hdage, w3daded_hsb, w3momed_hsb, w3momscr, w3dadscr, w3income, w3povrty, p5fstamp, c5r2mtsc, w3income_1k)
head(lassodata)
##   c5r2mtsc_std catholic race_white race_black race_hispanic race_asian p5numpla
## 1    0.9817533        0          1          0             0          0        1
## 2    0.5943775        0          1          0             0          0        1
## 3    0.4906106        0          1          0             0          0        1
## 4    1.4512779        1          1          0             0          0        1
## 5    2.5956991        0          1          0             0          0        1
## 6    0.3851966        0          1          0             0          0        1
##   p5hmage p5hdage w3daded_hsb w3momed_hsb w3momscr w3dadscr w3income w3povrty
## 1      47      45           0           0    53.50     77.5  62500.5        0
## 2      41      48           0           0    34.95     53.5  45000.5        0
## 3      43      55           0           0    63.43     53.5  62500.5        0
## 4      38      39           0           0    53.50     53.5  87500.5        0
## 5      47      57           0           0    61.56     77.5 150000.5        0
## 6      41      41           0           0    38.18     53.5  62500.5        0
##   p5fstamp c5r2mtsc w3income_1k
## 1        0   60.043     62.5005
## 2        0   56.280     45.0005
## 3        0   55.272     62.5005
## 4        0   64.604     87.5005
## 5        0   75.721    150.0005
## 6        0   54.248     62.5005
# Run lasso
y <- data.matrix(lassodata$catholic)
x <- data.matrix(lassodata[,c(1, 3:18)])

lasso1 <- cv.glmnet(x, y, family = "gaussian", alpha = 1 )
lasso <- glmnet(x, y, family = "gaussian", alpha = 1, lambda = lasso1$lambda.1se)
coef(lasso)
## 18 x 1 sparse Matrix of class "dgCMatrix"
##                          s0
## (Intercept)    7.516499e-02
## c5r2mtsc_std   .           
## race_white     .           
## race_black     .           
## race_hispanic  .           
## race_asian     .           
## p5numpla       .           
## p5hmage        1.700277e-03
## p5hdage        .           
## w3daded_hsb   -2.147176e-02
## w3momed_hsb   -2.195889e-02
## w3momscr       .           
## w3dadscr       .           
## w3income       7.035449e-07
## w3povrty       .           
## p5fstamp       .           
## c5r2mtsc       .           
## w3income_1k    2.168253e-16

Using LASSO, the variables to be included are p5hmage, w3daded_hsb, w3momed_hsb, w3income and w3income_1k. These variables differ from the ones used for PSM

plot(lasso1)

9. Use Lasso predictions for matching.

# Generate a prediction from the "best lasso" model
lasso_prediction <- predict(lasso, newx = x, type = "response")

# Use that prediction as your distance in the matchit command
match <- matchit(catholic ~ p5hmage  + w3daded_hsb + w3momed_hsb + w3income + w3income_1k, data = lassodata, family = binomial(), distance = as.numeric(lasso_prediction))

# Create the match.data dataset based on that matchit
matchdata <- match.data(match)

# Create some plots to compare: w3income_1k vs prediction

g5 <- ggplot(data = matchdata, aes(x = distance, y = w3income_1k, col = as.factor(catholic))) + geom_smooth(method = "loess")
g5

# w3daded_hsb vs prediction
g6 <- ggplot(data = matchdata, aes(x = distance, y = w3daded_hsb, col = as.factor(catholic))) + geom_smooth(method = "loess")
g6