library(MatchIt)
library(glmnet)
library(rvest)
library(ggplot2)
library(tidyverse)
library(dplyr)
library(XML)
library(xml2)
library(jsonlite)
library(stargazer)
df<-read.csv('ecls.CSV')
#create a smaller dataset
df.reduced<-as.data.frame(cbind(df[,'c5r2mtsc_std'],df[,'catholic'],df[,'w3income_1k'],df[,'p5numpla'],df[,'w3momed_hsb']))
colnames(df.reduced)<-c('c5r2mtsc_std','catholic','w3income_1k','p5numpla','w3momed_hsb')

math.vs.catholic.reg<-lm(c5r2mtsc_std~catholic,data=df.reduced)
summary(math.vs.catholic.reg)
## 
## Call:
## lm(formula = c5r2mtsc_std ~ catholic, data = df.reduced)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -3.2667 -0.6082  0.0538  0.6292  3.2705 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  0.16313    0.01424  11.459   <2e-16 ***
## catholic     0.05656    0.03440   1.644      0.1    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.9549 on 5427 degrees of freedom
## Multiple R-squared:  0.000498,   Adjusted R-squared:  0.0003138 
## F-statistic: 2.704 on 1 and 5427 DF,  p-value: 0.1002

The regression shows that the coefficient of catholic on grade is not statistically significant.

math.with.contr.reg<-lm(c5r2mtsc_std~catholic+w3income_1k+p5numpla+w3momed_hsb,data=df.reduced)
summary(math.with.contr.reg)
## 
## Call:
## lm(formula = c5r2mtsc_std ~ catholic + w3income_1k + p5numpla + 
##     w3momed_hsb, data = df.reduced)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -3.3922 -0.5696  0.0372  0.5986  3.2572 
## 
## Coefficients:
##               Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  0.0733290  0.0491661   1.491 0.135900    
## catholic    -0.1273899  0.0328888  -3.873 0.000109 ***
## w3income_1k  0.0053247  0.0003026  17.595  < 2e-16 ***
## p5numpla    -0.1009432  0.0355779  -2.837 0.004567 ** 
## w3momed_hsb -0.3740355  0.0271962 -13.753  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.8941 on 5424 degrees of freedom
## Multiple R-squared:  0.1242, Adjusted R-squared:  0.1235 
## F-statistic: 192.3 on 4 and 5424 DF,  p-value: < 2.2e-16

We now add controls. We see statistically significant and negative effect of being Catholic on grades. But there are many omitted variables that affect both being Catholic and the grades. So, these associations aren’t causal.

meanbycatholic <- df.reduced %>%
  group_by(catholic) %>%
  summarise_all(funs(mean))
meanbycatholic
## # A tibble: 2 x 5
##   catholic c5r2mtsc_std w3income_1k p5numpla w3momed_hsb
##      <dbl>        <dbl>       <dbl>    <dbl>       <dbl>
## 1        0        0.163        65.4     1.11       0.392
## 2        1        0.220        86.2     1.07       0.205
by_catholic <- df.reduced %>%
  group_by(catholic) %>%
  summarise_all(funs(mean))
by_catholic
## # A tibble: 2 x 5
##   catholic c5r2mtsc_std w3income_1k p5numpla w3momed_hsb
##      <dbl>        <dbl>       <dbl>    <dbl>       <dbl>
## 1        0        0.163        65.4     1.11       0.392
## 2        1        0.220        86.2     1.07       0.205
t.test(w3income_1k ~ catholic, data = df.reduced)
## 
##  Welch Two Sample t-test
## 
## data:  w3income_1k by catholic
## t = -13.238, df = 1314.5, p-value < 2.2e-16
## alternative hypothesis: true difference in means between group 0 and group 1 is not equal to 0
## 95 percent confidence interval:
##  -23.86719 -17.70620
## sample estimates:
## mean in group 0 mean in group 1 
##        65.39393        86.18063
t.test(p5numpla ~ catholic, data = df.reduced)
## 
##  Welch Two Sample t-test
## 
## data:  p5numpla by catholic
## t = 3.128, df = 1600.4, p-value = 0.001792
## alternative hypothesis: true difference in means between group 0 and group 1 is not equal to 0
## 95 percent confidence interval:
##  0.01235472 0.05390038
## sample estimates:
## mean in group 0 mean in group 1 
##        1.106246        1.073118
t.test(w3momed_hsb ~ catholic, data = df.reduced)
## 
##  Welch Two Sample t-test
## 
## data:  w3momed_hsb by catholic
## t = 12.362, df = 1545.1, p-value < 2.2e-16
## alternative hypothesis: true difference in means between group 0 and group 1 is not equal to 0
## 95 percent confidence interval:
##  0.1572715 0.2165946
## sample estimates:
## mean in group 0 mean in group 1 
##       0.3923094       0.2053763

These comparisons show that the characteristics of Catholic and non-Catholic are statistically different.

#first run a logit regression
propens.logit<-glm(catholic~w3income_1k+p5numpla+w3momed_hsb,family=binomial(),data=df.reduced)
summary(propens.logit)
## 
## Call:
## glm(formula = catholic ~ w3income_1k + p5numpla + w3momed_hsb, 
##     family = binomial(), data = df.reduced)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -1.0172  -0.6461  -0.5240  -0.4122   2.3672  
## 
## Coefficients:
##               Estimate Std. Error z value Pr(>|z|)    
## (Intercept) -1.6702950  0.1576164 -10.597  < 2e-16 ***
## w3income_1k  0.0077921  0.0008115   9.602  < 2e-16 ***
## p5numpla    -0.2774346  0.1232930  -2.250   0.0244 *  
## w3momed_hsb -0.6504757  0.0915710  -7.104 1.22e-12 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 4972.4  on 5428  degrees of freedom
## Residual deviance: 4750.6  on 5425  degrees of freedom
## AIC: 4758.6
## 
## Number of Fisher Scoring iterations: 4
#now get the predicted values and add them to the dataset
predict.catholic<-as.data.frame(predict(propens.logit))
colnames(predict.catholic)<-'predict_catholic'
df.reduced<-cbind(df.reduced,predict.catholic)

#finally, plot the support by those at and not at Catholic school
plot(density(exp(subset(df.reduced,catholic==1)[,'predict_catholic'])), main="Propensity support")
lines(density(exp(subset(df.reduced,catholic==0)[,'predict_catholic'])),col='red')
legend('topright',legend=c('At Catholic school','Not at Catholic school'),fill=c('black','red'))

So, we can see that there are individuals in Catholic schools who have lower probability of being in a Catholic school, and the other way around for non-Catholics.

df.reduced$prediction <- predict(propens.logit, type = "response")

ggplot(data = df.reduced, aes(x = prediction, y= w3income_1k, color = as.factor(catholic)))+
  geom_point(data = df.reduced, aes(prediction, w3income_1k))+
  geom_smooth(data = df.reduced, aes(x = prediction, y= w3income_1k, color = as.factor(catholic), group = catholic), method = "loess")

#Use the canned function
psm <- matchit(catholic ~ w3income_1k + p5numpla + w3momed_hsb, family = binomial(), data = df.reduced)
psmdata <- match.data(psm)
dim(psmdata)
## [1] 1860   10

Low-distance matched pairs are giving us those at Catholic schools who you might predict were not, and those not at Catholic schools who you might predict were.

distpsmdata <- arrange(psmdata, distance)
head(distpsmdata, n = 10)
##      c5r2mtsc_std catholic w3income_1k p5numpla w3momed_hsb predict_catholic
## 176    -0.3835843        0     17.5005        2           1        -2.739274
## 4274   -2.5922340        1     17.5005        2           1        -2.739274
## 41     -2.3690526        0     22.5005        2           1        -2.700313
## 2083    0.5271555        1     22.5005        2           1        -2.700313
## 561    -0.2195955        0     27.5005        2           1        -2.661353
## 2304   -1.6878765        1     27.5005        2           1        -2.661353
## 178    -0.2550080        0     32.5005        2           1        -2.622392
## 716    -1.2722942        1     32.5005        2           1        -2.622392
## 288    -0.7403859        0     37.5005        2           1        -2.583431
## 485    -0.7841369        1     37.5005        2           1        -2.583431
##      prediction   distance weights subclass
## 176  0.06069529 0.06069529       1      598
## 4274 0.06069529 0.06069529       1      598
## 41   0.06295488 0.06295488       1      222
## 2083 0.06295488 0.06295488       1      222
## 561  0.06529274 0.06529274       1      272
## 2304 0.06529274 0.06529274       1      272
## 178  0.06771114 0.06771114       1      859
## 716  0.06771114 0.06771114       1      859
## 288  0.07021240 0.07021240       1      728
## 485  0.07021240 0.07021240       1      728
distpsmdata %>% 
  group_by(catholic) %>% 
  dplyr::summarise(c5r2mtsc_std = mean(c5r2mtsc_std),
                   w3income_1k = mean(w3income_1k),
                   p5numpla = mean(p5numpla),
                   w3momed_hsb = mean(w3momed_hsb),
                   prediction = mean(prediction),
                   distance = mean(distance),
                   Weights = mean(weights))
## # A tibble: 2 x 8
##   catholic c5r2mtsc_std w3income_1k p5numpla w3momed_hsb prediction distance
##      <dbl>        <dbl>       <dbl>    <dbl>       <dbl>      <dbl>    <dbl>
## 1        0        0.335        86.2     1.07       0.205      0.203    0.203
## 2        1        0.220        86.2     1.07       0.205      0.203    0.203
## # ... with 1 more variable: Weights <dbl>
t.test(w3income_1k ~ catholic, data = distpsmdata)
## 
##  Welch Two Sample t-test
## 
## data:  w3income_1k by catholic
## t = 0, df = 1858, p-value = 1
## alternative hypothesis: true difference in means between group 0 and group 1 is not equal to 0
## 95 percent confidence interval:
##  -3.985565  3.985565
## sample estimates:
## mean in group 0 mean in group 1 
##        86.18063        86.18063
t.test(p5numpla ~ catholic, data = distpsmdata)
## 
##  Welch Two Sample t-test
## 
## data:  p5numpla by catholic
## t = 0, df = 1858, p-value = 1
## alternative hypothesis: true difference in means between group 0 and group 1 is not equal to 0
## 95 percent confidence interval:
##  -0.02550007  0.02550007
## sample estimates:
## mean in group 0 mean in group 1 
##        1.073118        1.073118
t.test(w3momed_hsb ~ catholic, data = distpsmdata)
## 
##  Welch Two Sample t-test
## 
## data:  w3momed_hsb by catholic
## t = 0, df = 1858, p-value = 1
## alternative hypothesis: true difference in means between group 0 and group 1 is not equal to 0
## 95 percent confidence interval:
##  -0.03676158  0.03676158
## sample estimates:
## mean in group 0 mean in group 1 
##       0.2053763       0.2053763

We find that covariates are not different between Catholic and non-Catholic schools.

ggplot(data = distpsmdata, aes(x = prediction, y= w3income_1k, color = as.factor(catholic)))+
  geom_point(data = distpsmdata, aes(prediction, w3income_1k))+
  geom_smooth(data = distpsmdata, aes(x = prediction, y= w3income_1k, color = as.factor(catholic), group = catholic), method = "loess")

This figure shows that the Catholic and non-Catholic students are comparable.

# Compare means of the two groups.
# A regression of just scores on the catholic dummy
reg3 <- lm(c5r2mtsc_std ~ catholic, distpsmdata)
# Another regression that includes all the variables
reg4 <- lm(c5r2mtsc_std ~ catholic + w3income_1k + p5numpla + w3momed_hsb, distpsmdata)
stargazer(reg3, reg4, type = "text")
## 
## ====================================================================
##                                   Dependent variable:               
##                     ------------------------------------------------
##                                       c5r2mtsc_std                  
##                               (1)                     (2)           
## --------------------------------------------------------------------
## catholic                   -0.115***               -0.115***        
##                             (0.041)                 (0.039)         
##                                                                     
## w3income_1k                                         0.004***        
##                                                     (0.0005)        
##                                                                     
## p5numpla                                             -0.091         
##                                                     (0.070)         
##                                                                     
## w3momed_hsb                                        -0.366***        
##                                                     (0.050)         
##                                                                     
## Constant                   0.335***                  0.151*         
##                             (0.029)                 (0.091)         
##                                                                     
## --------------------------------------------------------------------
## Observations                 1,860                   1,860          
## R2                           0.004                   0.088          
## Adjusted R2                  0.004                   0.086          
## Residual Std. Error    0.883 (df = 1858)       0.846 (df = 1855)    
## F Statistic         7.867*** (df = 1; 1858) 44.707*** (df = 4; 1855)
## ====================================================================
## Note:                                    *p<0.1; **p<0.05; ***p<0.01

Coefficients on the variable Catholic for the two regressions are comparable since PSM selected observations for which the Catholic variable is not vorrelated with other independent variables in the latter regression.

# Reproduce a PSM ‘by hand’:
predictedpropens.logit <- fitted(propens.logit)
plot(psm$distance, predictedpropens.logit)

Distance and logit prediction are identical.

NOTE: Done with help of Pit.