library(MatchIt)
library(glmnet)
library(rvest)
library(ggplot2)
library(tidyverse)
library(dplyr)
library(XML)
library(xml2)
library(jsonlite)
library(stargazer)
df<-read.csv('ecls.CSV')
#create a smaller dataset
df.reduced<-as.data.frame(cbind(df[,'c5r2mtsc_std'],df[,'catholic'],df[,'w3income_1k'],df[,'p5numpla'],df[,'w3momed_hsb']))
colnames(df.reduced)<-c('c5r2mtsc_std','catholic','w3income_1k','p5numpla','w3momed_hsb')
math.vs.catholic.reg<-lm(c5r2mtsc_std~catholic,data=df.reduced)
summary(math.vs.catholic.reg)
##
## Call:
## lm(formula = c5r2mtsc_std ~ catholic, data = df.reduced)
##
## Residuals:
## Min 1Q Median 3Q Max
## -3.2667 -0.6082 0.0538 0.6292 3.2705
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 0.16313 0.01424 11.459 <2e-16 ***
## catholic 0.05656 0.03440 1.644 0.1
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.9549 on 5427 degrees of freedom
## Multiple R-squared: 0.000498, Adjusted R-squared: 0.0003138
## F-statistic: 2.704 on 1 and 5427 DF, p-value: 0.1002
The regression shows that the coefficient of catholic on grade is not statistically significant.
math.with.contr.reg<-lm(c5r2mtsc_std~catholic+w3income_1k+p5numpla+w3momed_hsb,data=df.reduced)
summary(math.with.contr.reg)
##
## Call:
## lm(formula = c5r2mtsc_std ~ catholic + w3income_1k + p5numpla +
## w3momed_hsb, data = df.reduced)
##
## Residuals:
## Min 1Q Median 3Q Max
## -3.3922 -0.5696 0.0372 0.5986 3.2572
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 0.0733290 0.0491661 1.491 0.135900
## catholic -0.1273899 0.0328888 -3.873 0.000109 ***
## w3income_1k 0.0053247 0.0003026 17.595 < 2e-16 ***
## p5numpla -0.1009432 0.0355779 -2.837 0.004567 **
## w3momed_hsb -0.3740355 0.0271962 -13.753 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.8941 on 5424 degrees of freedom
## Multiple R-squared: 0.1242, Adjusted R-squared: 0.1235
## F-statistic: 192.3 on 4 and 5424 DF, p-value: < 2.2e-16
We now add controls. We see statistically significant and negative effect of being Catholic on grades. But there are many omitted variables that affect both being Catholic and the grades. So, these associations aren’t causal.
meanbycatholic <- df.reduced %>%
group_by(catholic) %>%
summarise_all(funs(mean))
meanbycatholic
## # A tibble: 2 x 5
## catholic c5r2mtsc_std w3income_1k p5numpla w3momed_hsb
## <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 0 0.163 65.4 1.11 0.392
## 2 1 0.220 86.2 1.07 0.205
by_catholic <- df.reduced %>%
group_by(catholic) %>%
summarise_all(funs(mean))
by_catholic
## # A tibble: 2 x 5
## catholic c5r2mtsc_std w3income_1k p5numpla w3momed_hsb
## <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 0 0.163 65.4 1.11 0.392
## 2 1 0.220 86.2 1.07 0.205
t.test(w3income_1k ~ catholic, data = df.reduced)
##
## Welch Two Sample t-test
##
## data: w3income_1k by catholic
## t = -13.238, df = 1314.5, p-value < 2.2e-16
## alternative hypothesis: true difference in means between group 0 and group 1 is not equal to 0
## 95 percent confidence interval:
## -23.86719 -17.70620
## sample estimates:
## mean in group 0 mean in group 1
## 65.39393 86.18063
t.test(p5numpla ~ catholic, data = df.reduced)
##
## Welch Two Sample t-test
##
## data: p5numpla by catholic
## t = 3.128, df = 1600.4, p-value = 0.001792
## alternative hypothesis: true difference in means between group 0 and group 1 is not equal to 0
## 95 percent confidence interval:
## 0.01235472 0.05390038
## sample estimates:
## mean in group 0 mean in group 1
## 1.106246 1.073118
t.test(w3momed_hsb ~ catholic, data = df.reduced)
##
## Welch Two Sample t-test
##
## data: w3momed_hsb by catholic
## t = 12.362, df = 1545.1, p-value < 2.2e-16
## alternative hypothesis: true difference in means between group 0 and group 1 is not equal to 0
## 95 percent confidence interval:
## 0.1572715 0.2165946
## sample estimates:
## mean in group 0 mean in group 1
## 0.3923094 0.2053763
These comparisons show that the characteristics of Catholic and non-Catholic are statistically different.
#first run a logit regression
propens.logit<-glm(catholic~w3income_1k+p5numpla+w3momed_hsb,family=binomial(),data=df.reduced)
summary(propens.logit)
##
## Call:
## glm(formula = catholic ~ w3income_1k + p5numpla + w3momed_hsb,
## family = binomial(), data = df.reduced)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -1.0172 -0.6461 -0.5240 -0.4122 2.3672
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -1.6702950 0.1576164 -10.597 < 2e-16 ***
## w3income_1k 0.0077921 0.0008115 9.602 < 2e-16 ***
## p5numpla -0.2774346 0.1232930 -2.250 0.0244 *
## w3momed_hsb -0.6504757 0.0915710 -7.104 1.22e-12 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 4972.4 on 5428 degrees of freedom
## Residual deviance: 4750.6 on 5425 degrees of freedom
## AIC: 4758.6
##
## Number of Fisher Scoring iterations: 4
#now get the predicted values and add them to the dataset
predict.catholic<-as.data.frame(predict(propens.logit))
colnames(predict.catholic)<-'predict_catholic'
df.reduced<-cbind(df.reduced,predict.catholic)
#finally, plot the support by those at and not at Catholic school
plot(density(exp(subset(df.reduced,catholic==1)[,'predict_catholic'])), main="Propensity support")
lines(density(exp(subset(df.reduced,catholic==0)[,'predict_catholic'])),col='red')
legend('topright',legend=c('At Catholic school','Not at Catholic school'),fill=c('black','red'))
So, we can see that there are individuals in Catholic schools who have lower probability of being in a Catholic school, and the other way around for non-Catholics.
df.reduced$prediction <- predict(propens.logit, type = "response")
ggplot(data = df.reduced, aes(x = prediction, y= w3income_1k, color = as.factor(catholic)))+
geom_point(data = df.reduced, aes(prediction, w3income_1k))+
geom_smooth(data = df.reduced, aes(x = prediction, y= w3income_1k, color = as.factor(catholic), group = catholic), method = "loess")
#Use the canned function
psm <- matchit(catholic ~ w3income_1k + p5numpla + w3momed_hsb, family = binomial(), data = df.reduced)
psmdata <- match.data(psm)
dim(psmdata)
## [1] 1860 10
Low-distance matched pairs are giving us those at Catholic schools who you might predict were not, and those not at Catholic schools who you might predict were.
distpsmdata <- arrange(psmdata, distance)
head(distpsmdata, n = 10)
## c5r2mtsc_std catholic w3income_1k p5numpla w3momed_hsb predict_catholic
## 176 -0.3835843 0 17.5005 2 1 -2.739274
## 4274 -2.5922340 1 17.5005 2 1 -2.739274
## 41 -2.3690526 0 22.5005 2 1 -2.700313
## 2083 0.5271555 1 22.5005 2 1 -2.700313
## 561 -0.2195955 0 27.5005 2 1 -2.661353
## 2304 -1.6878765 1 27.5005 2 1 -2.661353
## 178 -0.2550080 0 32.5005 2 1 -2.622392
## 716 -1.2722942 1 32.5005 2 1 -2.622392
## 288 -0.7403859 0 37.5005 2 1 -2.583431
## 485 -0.7841369 1 37.5005 2 1 -2.583431
## prediction distance weights subclass
## 176 0.06069529 0.06069529 1 598
## 4274 0.06069529 0.06069529 1 598
## 41 0.06295488 0.06295488 1 222
## 2083 0.06295488 0.06295488 1 222
## 561 0.06529274 0.06529274 1 272
## 2304 0.06529274 0.06529274 1 272
## 178 0.06771114 0.06771114 1 859
## 716 0.06771114 0.06771114 1 859
## 288 0.07021240 0.07021240 1 728
## 485 0.07021240 0.07021240 1 728
distpsmdata %>%
group_by(catholic) %>%
dplyr::summarise(c5r2mtsc_std = mean(c5r2mtsc_std),
w3income_1k = mean(w3income_1k),
p5numpla = mean(p5numpla),
w3momed_hsb = mean(w3momed_hsb),
prediction = mean(prediction),
distance = mean(distance),
Weights = mean(weights))
## # A tibble: 2 x 8
## catholic c5r2mtsc_std w3income_1k p5numpla w3momed_hsb prediction distance
## <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 0 0.335 86.2 1.07 0.205 0.203 0.203
## 2 1 0.220 86.2 1.07 0.205 0.203 0.203
## # ... with 1 more variable: Weights <dbl>
t.test(w3income_1k ~ catholic, data = distpsmdata)
##
## Welch Two Sample t-test
##
## data: w3income_1k by catholic
## t = 0, df = 1858, p-value = 1
## alternative hypothesis: true difference in means between group 0 and group 1 is not equal to 0
## 95 percent confidence interval:
## -3.985565 3.985565
## sample estimates:
## mean in group 0 mean in group 1
## 86.18063 86.18063
t.test(p5numpla ~ catholic, data = distpsmdata)
##
## Welch Two Sample t-test
##
## data: p5numpla by catholic
## t = 0, df = 1858, p-value = 1
## alternative hypothesis: true difference in means between group 0 and group 1 is not equal to 0
## 95 percent confidence interval:
## -0.02550007 0.02550007
## sample estimates:
## mean in group 0 mean in group 1
## 1.073118 1.073118
t.test(w3momed_hsb ~ catholic, data = distpsmdata)
##
## Welch Two Sample t-test
##
## data: w3momed_hsb by catholic
## t = 0, df = 1858, p-value = 1
## alternative hypothesis: true difference in means between group 0 and group 1 is not equal to 0
## 95 percent confidence interval:
## -0.03676158 0.03676158
## sample estimates:
## mean in group 0 mean in group 1
## 0.2053763 0.2053763
We find that covariates are not different between Catholic and non-Catholic schools.
ggplot(data = distpsmdata, aes(x = prediction, y= w3income_1k, color = as.factor(catholic)))+
geom_point(data = distpsmdata, aes(prediction, w3income_1k))+
geom_smooth(data = distpsmdata, aes(x = prediction, y= w3income_1k, color = as.factor(catholic), group = catholic), method = "loess")
This figure shows that the Catholic and non-Catholic students are comparable.
# Compare means of the two groups.
# A regression of just scores on the catholic dummy
reg3 <- lm(c5r2mtsc_std ~ catholic, distpsmdata)
# Another regression that includes all the variables
reg4 <- lm(c5r2mtsc_std ~ catholic + w3income_1k + p5numpla + w3momed_hsb, distpsmdata)
stargazer(reg3, reg4, type = "text")
##
## ====================================================================
## Dependent variable:
## ------------------------------------------------
## c5r2mtsc_std
## (1) (2)
## --------------------------------------------------------------------
## catholic -0.115*** -0.115***
## (0.041) (0.039)
##
## w3income_1k 0.004***
## (0.0005)
##
## p5numpla -0.091
## (0.070)
##
## w3momed_hsb -0.366***
## (0.050)
##
## Constant 0.335*** 0.151*
## (0.029) (0.091)
##
## --------------------------------------------------------------------
## Observations 1,860 1,860
## R2 0.004 0.088
## Adjusted R2 0.004 0.086
## Residual Std. Error 0.883 (df = 1858) 0.846 (df = 1855)
## F Statistic 7.867*** (df = 1; 1858) 44.707*** (df = 4; 1855)
## ====================================================================
## Note: *p<0.1; **p<0.05; ***p<0.01
Coefficients on the variable Catholic for the two regressions are comparable since PSM selected observations for which the Catholic variable is not vorrelated with other independent variables in the latter regression.
# Reproduce a PSM ‘by hand’:
predictedpropens.logit <- fitted(propens.logit)
plot(psm$distance, predictedpropens.logit)
Distance and logit prediction are identical.
NOTE: Done with help of Pit.