Music Credit: Royalty Free Music from Bensound
Research Question: Do catholic schools have an effect on student’s performance?
The key issue is, of course, that students in catholic schools (“Treated”) might be different from those in non-catholic schools (“Control”). The exercise will show how propensity score matching attempts to reduce this issue (Note: it does not solve it completely).
<- read.csv("./data/ecls.csv")
df names(df)
## [1] "childid" "catholic" "race" "race_white"
## [5] "race_black" "race_hispanic" "race_asian" "p5numpla"
## [9] "p5hmage" "p5hdage" "w3daded" "w3momed"
## [13] "w3daded_hsb" "w3momed_hsb" "w3momscr" "w3dadscr"
## [17] "w3inccat" "w3income" "w3povrty" "p5fstamp"
## [21] "c5r2mtsc" "c5r2mtsc_std" "w3income_1k"
<- df %>% dplyr::select(c(2,8,14,23,22))
small_df ::stargazer(small_df, type = "text",
stargazertitle = "Table 1: Summary Statistics")
##
## Table 1: Summary Statistics
## =================================================
## Statistic N Mean St. Dev. Min Max
## -------------------------------------------------
## catholic 5,429 0.171 0.377 0 1
## p5numpla 5,429 1.101 0.342 1 5
## w3momed_hsb 5,429 0.360 0.480 0 1
## w3income_1k 5,429 68.955 43.411 5.000 200.001
## c5r2mtsc_std 5,429 0.173 0.955 -3.104 3.434
## -------------------------------------------------
$catholic <- as.factor(small_df$catholic)
small_df<- small_df %>% group_by(catholic) %>% summarise(n=n(), min=min(c5r2mtsc_std), max=max(c5r2mtsc_std), mean=mean(c5r2mtsc_std), median=median(c5r2mtsc_std))
tbl2 ::kable(tbl2, caption = "Table 2: Standardize Math Score") knitr
catholic | n | min | max | mean | median |
---|---|---|---|---|---|
0 | 4499 | -3.103553 | 3.433658 | 0.1631279 | 0.2206932 |
1 | 930 | -2.592234 | 3.055238 | 0.2196851 | 0.2509585 |
Students from Catholic school have higher mean and median standardize math score compared to those from non-catholic schools.
t.test(c5r2mtsc_std ~ catholic, data = small_df)
##
## Welch Two Sample t-test
##
## data: c5r2mtsc_std by catholic
## t = -1.7866, df = 1468.1, p-value = 0.07421
## alternative hypothesis: true difference in means between group 0 and group 1 is not equal to 0
## 95 percent confidence interval:
## -0.118653665 0.005539377
## sample estimates:
## mean in group 0 mean in group 1
## 0.1631279 0.2196851
t-test shows the score are statistically different and those from Catholic schools are higher.
Looking at the mean and t-test, we may be tempted to conclude that going to a catholic school leads to higher math scores but that might not be the case. Here, we see that the data is not a balanced one.
= lm(c5r2mtsc_std ~ catholic, data = small_df)
model1 = lm(c5r2mtsc_std ~ ., data = small_df)
model2
= list("Bivariate Model \n (1)" = model1,"Multivariate Model \n (2)" = model2)
model_tbl1
<- names(coef(model_tbl1[[1]]))[str_detect(names(coef(model_tbl1[[1]])), "vdc")]
coefs
<- huxreg(model_tbl1,number_format = 3, omit_coefs =coefs,
huxtable statistics = c("Sample size:" = "nobs", "R2" = "r.squared"))%>%
set_caption("Table 3: Regression output of small subset of original data")
huxtable
Bivariate Model (1) | Multivariate Model (2) | |
---|---|---|
(Intercept) | 0.163 *** | 0.073 |
(0.014) | (0.049) | |
catholic1 | 0.057 | -0.127 *** |
(0.034) | (0.033) | |
p5numpla | -0.101 ** | |
(0.036) | ||
w3momed_hsb | -0.374 *** | |
(0.027) | ||
w3income_1k | 0.005 *** | |
(0.000) | ||
Sample size: | 5429 | 5429 |
R2 | 0.000 | 0.124 |
*** p < 0.001; ** p < 0.01; * p < 0.05. |
Bivariate model shows that test score is positively related to being in catholic school but is not statistically significant. However, multivariate model shows a completely different result. It says that going to catholic school is more likely to reduce the test score and the resut is statistically significant.
<- small_df %>% group_by(catholic) %>% summarise_all(mean) %>% t()
a <- a[2:5,]
aa <- matrix(0, nrow = 4, ncol = 1)
ttest # Assign each p values
for (i in 1:4){
<- t.test(small_df[,i+1] ~ small_df$catholic)$p.value
ttest[i]
}# Create data frames
<- data.frame(ttest)
ttest <- cbind(aa,ttest)
tab colnames(tab) <- c("Non-catholic","Catholic","ttest")
$Variables <- c("p5numpla","w3momed_hsb","w3income_1k","c5r2mtsc_std")
tabrownames(tab) <- NULL
<- tab[,c(4,1:3)]
tab <- huxtable(tab) %>% set_caption("Table 4: Means and p-values of Covariates")
hux add_footnote(hux, "p-value are less than 0.001 for all the cases.\n Hence, the difference in means are statistically different")
Variables | Non-catholic | Catholic | ttest |
---|---|---|---|
p5numpla | 1.106246 | 1.073118 | 0.00179 |
w3momed_hsb | 0.3923094 | 0.2053763 | 1.53e-33 |
w3income_1k | 65.39393 | 86.18063 | 1.21e-37 |
c5r2mtsc_std | 0.1631279 | 0.2196851 | 0.0742 |
p-value are less than 0.001 for all the cases. Hence, the difference in means are statistically different |
Problems with an identification strategy based on simple regressions: Parallel trends assumption fails and the students from non-catholic school are not good counterfactual.
<- glm(catholic ~. -c5r2mtsc_std, family = binomial, data=small_df)
model_glm <- huxreg(model_glm,number_format = 3, omit_coefs =coefs,
huxtable statistics = c("Sample size:" = "nobs", "R2" = "r.squared"))%>%
set_caption("Table 5: Regression - propensity to go to catholic school")
huxtable
(1) | |
---|---|
(Intercept) | -1.670 *** |
(0.158) | |
p5numpla | -0.277 * |
(0.123) | |
w3momed_hsb | -0.650 *** |
(0.092) | |
w3income_1k | 0.008 *** |
(0.001) | |
Sample size: | 5429 |
*** p < 0.001; ** p < 0.01; * p < 0.05. |
$predict <- predict(model_glm, newdata = small_df, type = "response")
small_dfsummary(small_df$predict)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.04333 0.10801 0.16839 0.17130 0.21996 0.40389
<- ggplot(small_df, aes(predict, group=factor(catholic),fill=factor(catholic)))
g + geom_histogram(alpha=0.5) +
g labs(title="Histogram", subtitle="Predicted output grouped by Catholic",x="Logit prediction",
fill="Catholic",position = "dodge")
<- ggplot(small_df, aes(x=predict, y=w3income_1k))
g + geom_point()+
g geom_smooth(aes(group=factor(catholic),fill=factor(catholic)))+
labs(title="Scatter plot", subtitle="Predicted output grouped by Catholic",x="Logit prediction", y= "Income(1K)", fill="Catholic", position = "dodge")
<- ggplot(small_df, aes(predict,group=factor(catholic),fill=factor(catholic)))
g + geom_density(alpha=0.5) +
g labs(title="Density plot", subtitle="Predicted output grouped by Catholic",x="Logit prediction", fill="Catholic",position = "dodge")
<- matchit(catholic ~ w3income_1k + p5numpla + w3momed_hsb, data=small_df)
match_op <- match.data(match_op)
match_df summary(match_df)
## catholic p5numpla w3momed_hsb w3income_1k c5r2mtsc_std
## 0:930 Min. :1.000 Min. :0.0000 Min. : 7.50 Min. :-2.5922
## 1:930 1st Qu.:1.000 1st Qu.:0.0000 1st Qu.: 62.50 1st Qu.:-0.2749
## Median :1.000 Median :0.0000 Median : 87.50 Median : 0.2908
## Mean :1.073 Mean :0.2054 Mean : 86.18 Mean : 0.2703
## 3rd Qu.:1.000 3rd Qu.:0.0000 3rd Qu.: 87.50 3rd Qu.: 0.8469
## Max. :3.000 Max. :1.0000 Max. :200.00 Max. : 3.0552
##
## predict distance weights subclass
## Min. :0.0607 Min. :0.0607 Min. :1 1 : 2
## 1st Qu.:0.1604 1st Qu.:0.1604 1st Qu.:1 2 : 2
## Median :0.1884 Median :0.1884 Median :1 3 : 2
## Mean :0.2034 Mean :0.2034 Mean :1 4 : 2
## 3rd Qu.:0.2200 3rd Qu.:0.2200 3rd Qu.:1 5 : 2
## Max. :0.4039 Max. :0.4039 Max. :1 6 : 2
## (Other):1848
Original data was not a balanced dataset with 4490 students from non-catholic and 930 from catholic schools. PSM provided us with balanced dataset with 930 in each catagory. In doing so, it reduced the observation from 5429 to 1860 which is by 65%. Overall mean and median of standardised score have increased but the same statistics have decreased for income and mother’s education.
head(match_df[order(match_df$distance),],10)
catholic | p5numpla | w3momed_hsb | w3income_1k | c5r2mtsc_std | predict | distance | weights | subclass |
---|---|---|---|---|---|---|---|---|
0 | 2 | 1 | 17.5 | -1.95 | 0.0607 | 0.0607 | 1 | 598 |
1 | 2 | 1 | 17.5 | -2.59 | 0.0607 | 0.0607 | 1 | 598 |
0 | 2 | 1 | 22.5 | 0.202 | 0.063 | 0.063 | 1 | 222 |
1 | 2 | 1 | 22.5 | 0.527 | 0.063 | 0.063 | 1 | 222 |
0 | 2 | 1 | 27.5 | -0.122 | 0.0653 | 0.0653 | 1 | 272 |
1 | 2 | 1 | 27.5 | -1.69 | 0.0653 | 0.0653 | 1 | 272 |
0 | 2 | 1 | 32.5 | -0.333 | 0.0677 | 0.0677 | 1 | 859 |
1 | 2 | 1 | 32.5 | -1.27 | 0.0677 | 0.0677 | 1 | 859 |
0 | 2 | 1 | 37.5 | -0.74 | 0.0702 | 0.0702 | 1 | 728 |
1 | 2 | 1 | 37.5 | -0.784 | 0.0702 | 0.0702 | 1 | 728 |
We see that the observations are matched on distance, whish means there are 930 pairs of matched observation.
<- match_df %>% group_by(catholic) %>% summarise_all(mean) %>% t()
a <- a[2:5,]
aa <- matrix(0, nrow = 4, ncol = 1)
ttest # Assign each p values
for (i in 1:4){
<- t.test(match_df[,i+1] ~ match_df$catholic)$p.value
ttest[i]
}# Create data frames
<- data.frame(ttest)
ttest <- cbind(aa,ttest)
tab colnames(tab) <- c("Non-catholic","Catholic","ttest")
$Variables <- c("p5numpla","w3momed_hsb","w3income_1k","c5r2mtsc_std")
tabrownames(tab) <- NULL
<- tab[,c(4,1:3)]
tab <- huxtable(tab) %>% set_caption("Table 6: Means and p-values of Covariates for balanced data")
hux add_footnote(hux, "p-value have increased for all but c5r2mtsc_std.")
Variables | Non-catholic | Catholic | ttest |
---|---|---|---|
p5numpla | 1.073118 | 1.073118 | 1 |
w3momed_hsb | 0.2053763 | 0.2053763 | 1 |
w3income_1k | 86.18063 | 86.18063 | 1 |
c5r2mtsc_std | 0.3208431 | 0.2196851 | 0.0142 |
p-value have increased for all but c5r2mtsc_std. |
PSM matched the dataset, whereby reducing the differences in covariates between the two groups of students. Means are nearer after PSM and ttest suggests that difference in means is not statistically significant as p-values are greater than 0.1. PSM cannot eliminate the omitted variable bias if there exist.
<- ggplot(match_df, aes(x=distance, y=w3income_1k))
g + geom_point(aes(col=factor(catholic)))+
g geom_smooth(aes(col=factor(catholic)))+
labs(title="Income vs propensity score segregated by group", x="Propensity score",y= "Income(1K)",col="Catholic",position = "dodge")
<- ggplot(match_df, aes(x=distance, y=w3momed_hsb))
g + geom_point(aes(col=factor(catholic)))+
g geom_smooth(aes(col=factor(catholic)))+
labs(title="Mother's education vs propensity score segregated by group", x="Propensity score", y= "Mother's education", col="Catholic",position = "dodge")
%>% select(c(1:4)) %>% group_by(catholic) %>% summarise_all(mean) match_df
catholic | p5numpla | w3momed_hsb | w3income_1k |
---|---|---|---|
0 | 1.07 | 0.205 | 86.2 |
1 | 1.07 | 0.205 | 86.2 |
= lm(c5r2mtsc_std ~ catholic, data = match_df)
model1 = lm(c5r2mtsc_std ~ . -predict -distance - weights - subclass, data = match_df)
model2
= list("Bivariate Model \n (1)" = model1,"Multivariate Model \n (2)" = model2)
model_tbl1
<- names(coef(model_tbl1[[1]]))[str_detect(names(coef(model_tbl1[[1]])), "vdc")]
coefs
<- huxreg(model_tbl1,number_format = 3, omit_coefs =coefs,
huxtable statistics = c("Sample size:" = "nobs", "R2" = "r.squared"))%>%
set_caption("Table 7: Regression using matched dataset")
huxtable
Bivariate Model (1) | Multivariate Model (2) | |
---|---|---|
(Intercept) | 0.321 *** | 0.151 |
(0.029) | (0.092) | |
catholic1 | -0.101 * | -0.101 * |
(0.041) | (0.040) | |
p5numpla | -0.115 | |
(0.071) | ||
w3momed_hsb | -0.297 *** | |
(0.050) | ||
w3income_1k | 0.004 *** | |
(0.000) | ||
Sample size: | 1860 | 1860 |
R2 | 0.003 | 0.074 |
*** p < 0.001; ** p < 0.01; * p < 0.05. |
<- matchit(catholic ~ w3income_1k + p5numpla + w3momed_hsb, data=small_df, distance = small_df$predict)
match_op2 <- match.data(match_op2)
match_df2
<- ggplot()
g + geom_line(aes(x=match_df2$predict, y=match_df$distance))+
g labs(title="Line plot", subtitle="Shows linear relationship", x="Logit Prediction",y="Propensity Score")
Data generated using logit prediction is almost identical to one using matchit(). Hence, we can use Lasso prediction as distance to replace the logit stage.
<-select_if(df, is.numeric)
df_num
<- data.matrix(df_num %>% select(!catholic))
x_matrix <- data.matrix(df_num%>% select(catholic))
y_matrix <- cv.glmnet(x_matrix, y_matrix, alpha = 1, family="binomial")
model_cv_df plot(model_cv_df,cex = 0.8, cex.axis = 0.8, cex.lab = 0.8)
<- glmnet(x_matrix, y_matrix, alpha = 1, family="binomial",lambda =model_cv_df$lambda.1se )
model_best_df
coef(model_best_df)
## 18 x 1 sparse Matrix of class "dgCMatrix"
## s0
## (Intercept) -2.687873229520
## race_white 0.058380454937
## race_black .
## race_hispanic .
## race_asian .
## p5numpla .
## p5hmage 0.022451652327
## p5hdage .
## w3daded_hsb -0.234827100310
## w3momed_hsb -0.258454212219
## w3momscr 0.001366463599
## w3dadscr .
## w3income 0.000004518323
## w3povrty -0.325226850877
## p5fstamp .
## c5r2mtsc .
## c5r2mtsc_std .
## w3income_1k .
Best model is the one with optimal lambda that we got using cross validation. For further analysis I would use eight of those variables with non-zero coef. There are new addition to the orinial model: race_white, p5hmage, w3daded_hsb,w3momed_hsb, w3momscr, w3income, w3povrty and w3income_1k.
$predict <- predict(model_best_df, newx = x_matrix, type = "response")
df_numsummary(df_num$predict)
## s0
## Min. :0.04771
## 1st Qu.:0.12259
## Median :0.16679
## Mean :0.17130
## 3rd Qu.:0.21124
## Max. :0.39966
<- matchit(catholic ~ race_white + p5hmage + w3daded_hsb + w3momed_hsb + w3momscr + w3income + w3povrty + w3income_1k, data=df_num, distance = as.numeric(df_num$predict), family="binomial") match_lasso_op
<- match.data(match_lasso_op)
match_lasso_df %>% select(1,2,7,9:14,18) %>% group_by(catholic) %>% summarise_all(mean) match_lasso_df
catholic | race_white | p5hmage | w3daded_hsb | w3momed_hsb | w3momscr | w3dadscr | w3income | w3povrty | w3income_1k |
---|---|---|---|---|---|---|---|---|---|
0 | 0.762 | 39.8 | 0.245 | 0.196 | 47.2 | 45.7 | 8.42e+04 | 0.014 | 84.2 |
1 | 0.767 | 39.8 | 0.27 | 0.205 | 47.6 | 45.9 | 8.62e+04 | 0.0161 | 86.2 |
$catholic <- as.factor(match_lasso_df$catholic)
match_lasso_df<- ggplot(match_lasso_df, aes(x=distance, y=w3income_1k))
g + geom_point(aes(col=catholic))+
g geom_smooth(aes(col=catholic), method="loess")+
labs(title="Income vs Lasso prediction segregated by group",x="Lasso prediction", y= "Income(1K)", col="Catholic",position = "dodge")
<- ggplot(match_lasso_df, aes(x=distance, y=w3momed_hsb))
g + geom_point(aes(col=catholic))+
g geom_smooth(aes(col=catholic))+
labs(title="Mother's education vs Lasso prediction segregated by group", x="Lasso prediction", y= "Mother's education",col="Catholic",position = "dodge")
PSM attempts to approximate a completely randomized experiment, at the cost of efficiency and may be biased. I am yet to understand PSM paradox to comment on the method. One thing that is clear by now is that predictions from Machine Learning can be used as distance that is easily interpretable.
INTERPRETATION MATTERS!!!
6.3 Comment on the results of those regressions after the PSM procedure-
Both bivariate model and multivariate regression results shows that test score is negatively related to being in catholic school. Treatment effect doesn’t change after including covariates and are statistically significant.