library(dplyr)
library(stargazer)
library(ggplot2)
library(MatchIt)
library(glmnet)
data <- read.csv("ecls.csv")
# make smaller datset which only contains the variables:
mydata <- data %>% select(c5r2mtsc_std, catholic, w3income_1k, p5numpla, w3momed_hsb)
# Performing t-test
t.test(c5r2mtsc_std ~ catholic, data = mydata)
##
## Welch Two Sample t-test
##
## data: c5r2mtsc_std by catholic
## t = -1.7866, df = 1468.1, p-value = 0.07421
## alternative hypothesis: true difference in means between group 0 and group 1 is not equal to 0
## 95 percent confidence interval:
## -0.118653665 0.005539377
## sample estimates:
## mean in group 0 mean in group 1
## 0.1631279 0.2196851
Mean for test scores from non-catholic schools is 0.1631 as against catholic schools which have a mean of 0.2197.
No. As there may be an endogeneity problem in the relationship between math scores and going to catholic schools.
# Regression of math scores on the catholic dummy
y1 <- lm(c5r2mtsc_std ~ catholic, mydata)
stargazer(y1, type = "text")
##
## ===============================================
## Dependent variable:
## ---------------------------
## c5r2mtsc_std
## -----------------------------------------------
## catholic 0.057
## (0.034)
##
## Constant 0.163***
## (0.014)
##
## -----------------------------------------------
## Observations 5,429
## R2 0.0005
## Adjusted R2 0.0003
## Residual Std. Error 0.955 (df = 5427)
## F Statistic 2.704 (df = 1; 5427)
## ===============================================
## Note: *p<0.1; **p<0.05; ***p<0.01
# Regression with all variables
y2 <- lm(c5r2mtsc_std ~ ., mydata)
stargazer(y2, type = "text")
##
## ===============================================
## Dependent variable:
## ---------------------------
## c5r2mtsc_std
## -----------------------------------------------
## catholic -0.127***
## (0.033)
##
## w3income_1k 0.005***
## (0.0003)
##
## p5numpla -0.101***
## (0.036)
##
## w3momed_hsb -0.374***
## (0.027)
##
## Constant 0.073
## (0.049)
##
## -----------------------------------------------
## Observations 5,429
## R2 0.124
## Adjusted R2 0.124
## Residual Std. Error 0.894 (df = 5424)
## F Statistic 192.272*** (df = 4; 5424)
## ===============================================
## Note: *p<0.1; **p<0.05; ***p<0.01
Regression of just math scores on the catholic dummy gives a positive but insignificant effect on math scores. However, when all the variables are included the effect is negative and significant at the 1% level. Also, all the other coefficients in the model are significant.
# Compare the means of the covariates
means <- mydata %>%
group_by(catholic) %>%
summarise(across(c(c5r2mtsc_std, w3income_1k, p5numpla, w3momed_hsb), list(mean = mean)))
means
## # A tibble: 2 × 5
## catholic c5r2mtsc_std_mean w3income_1k_mean p5numpla_mean w3momed_hsb_mean
## <int> <dbl> <dbl> <dbl> <dbl>
## 1 0 0.163 65.4 1.11 0.392
## 2 1 0.220 86.2 1.07 0.205
# Test the difference in means using t-tests
t.test(w3income_1k ~ catholic, data = mydata)
##
## Welch Two Sample t-test
##
## data: w3income_1k by catholic
## t = -13.238, df = 1314.5, p-value < 2.2e-16
## alternative hypothesis: true difference in means between group 0 and group 1 is not equal to 0
## 95 percent confidence interval:
## -23.86719 -17.70620
## sample estimates:
## mean in group 0 mean in group 1
## 65.39393 86.18063
t.test(p5numpla ~ catholic, data = mydata)
##
## Welch Two Sample t-test
##
## data: p5numpla by catholic
## t = 3.128, df = 1600.4, p-value = 0.001792
## alternative hypothesis: true difference in means between group 0 and group 1 is not equal to 0
## 95 percent confidence interval:
## 0.01235472 0.05390038
## sample estimates:
## mean in group 0 mean in group 1
## 1.106246 1.073118
t.test(w3momed_hsb ~ catholic, data = mydata)
##
## Welch Two Sample t-test
##
## data: w3momed_hsb by catholic
## t = 12.362, df = 1545.1, p-value < 2.2e-16
## alternative hypothesis: true difference in means between group 0 and group 1 is not equal to 0
## 95 percent confidence interval:
## 0.1572715 0.2165946
## sample estimates:
## mean in group 0 mean in group 1
## 0.3923094 0.2053763
For each of the t-tests the means are significantly different for catholic vs non-catholic schools. This means that the treatment and control groups are different from one another and any estimate will be biased.
# Run logit regression
y3 <- glm(catholic ~ w3income_1k + p5numpla + w3momed_hsb, family = binomial(), data = mydata)
summary(y3)
##
## Call:
## glm(formula = catholic ~ w3income_1k + p5numpla + w3momed_hsb,
## family = binomial(), data = mydata)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -1.0172 -0.6461 -0.5240 -0.4122 2.3672
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -1.6702950 0.1576164 -10.597 < 2e-16 ***
## w3income_1k 0.0077921 0.0008115 9.602 < 2e-16 ***
## p5numpla -0.2774346 0.1232930 -2.250 0.0244 *
## w3momed_hsb -0.6504757 0.0915710 -7.104 1.22e-12 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 4972.4 on 5428 degrees of freedom
## Residual deviance: 4750.6 on 5425 degrees of freedom
## AIC: 4758.6
##
## Number of Fisher Scoring iterations: 4
# Predict likeliness of going to catholic school
mydata$prediction <- predict(y3, type = "response")
head(mydata)
## c5r2mtsc_std catholic w3income_1k p5numpla w3momed_hsb prediction
## 1 0.9817533 0 62.5005 1 0 0.1883576
## 2 0.5943775 0 45.0005 1 0 0.1683901
## 3 0.4906106 0 62.5005 1 0 0.1883576
## 4 1.4512779 1 87.5005 1 0 0.2199574
## 5 2.5956991 0 150.0005 1 0 0.3145556
## 6 0.3851966 0 62.5005 1 0 0.1883576
# Visually check income versus predicted value
g1 <- ggplot(data = mydata, aes(x = prediction, y= w3income_1k, color = as.factor(catholic)))+
geom_point(aes(prediction, w3income_1k))+
geom_smooth(aes(x = prediction, y= w3income_1k, color = as.factor(catholic), group = catholic), method = "loess")
g1
# Visual check with plot densities
g2 <- ggplot(data = mydata, aes(x = prediction, fill = as.factor(catholic)))+
geom_density(alpha=0.5)
g2
psm <- matchit(catholic ~ w3income_1k + p5numpla + w3momed_hsb, family = binomial(), data = mydata)
psm_data <- match.data(psm)
dim(psm_data)
## [1] 1860 9
Number of observations reduced from 5429 to 1860.
# Sort the data by distance
sorted_psm_data <- arrange(psm_data, distance)
head(sorted_psm_data, n= 10)
## c5r2mtsc_std catholic w3income_1k p5numpla w3momed_hsb prediction distance
## 1 -0.3835843 0 17.5005 2 1 0.06069529 0.06069529
## 2 -2.5922340 1 17.5005 2 1 0.06069529 0.06069529
## 3 -2.3690526 0 22.5005 2 1 0.06295488 0.06295488
## 4 0.5271555 1 22.5005 2 1 0.06295488 0.06295488
## 5 -0.2195955 0 27.5005 2 1 0.06529274 0.06529274
## 6 -1.6878765 1 27.5005 2 1 0.06529274 0.06529274
## 7 -0.2550080 0 32.5005 2 1 0.06771114 0.06771114
## 8 -1.2722942 1 32.5005 2 1 0.06771114 0.06771114
## 9 -0.7403859 0 37.5005 2 1 0.07021240 0.07021240
## 10 -0.7841369 1 37.5005 2 1 0.07021240 0.07021240
## weights subclass
## 1 1 598
## 2 1 598
## 3 1 222
## 4 1 222
## 5 1 272
## 6 1 272
## 7 1 859
## 8 1 859
## 9 1 728
## 10 1 728
na.exclude(sorted_psm_data) %>% group_by(catholic) %>% summarise_all(mean)
## # A tibble: 2 × 9
## catholic c5r2mtsc_std w3income_1k p5numpla w3momed_hsb prediction distance
## <int> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 0 0.335 86.2 1.07 0.205 0.203 0.203
## 2 1 0.220 86.2 1.07 0.205 0.203 0.203
## # … with 2 more variables: weights <dbl>, subclass <dbl>
Test the difference in means for statistical significance:
# Test using t-tests
t.test(w3income_1k ~ catholic, sorted_psm_data)
##
## Welch Two Sample t-test
##
## data: w3income_1k by catholic
## t = 0, df = 1858, p-value = 1
## alternative hypothesis: true difference in means between group 0 and group 1 is not equal to 0
## 95 percent confidence interval:
## -3.985565 3.985565
## sample estimates:
## mean in group 0 mean in group 1
## 86.18063 86.18063
t.test(p5numpla ~ catholic, sorted_psm_data)
##
## Welch Two Sample t-test
##
## data: p5numpla by catholic
## t = 0, df = 1858, p-value = 1
## alternative hypothesis: true difference in means between group 0 and group 1 is not equal to 0
## 95 percent confidence interval:
## -0.02550007 0.02550007
## sample estimates:
## mean in group 0 mean in group 1
## 1.073118 1.073118
t.test(w3momed_hsb ~ catholic, sorted_psm_data)
##
## Welch Two Sample t-test
##
## data: w3momed_hsb by catholic
## t = 0, df = 1858, p-value = 1
## alternative hypothesis: true difference in means between group 0 and group 1 is not equal to 0
## 95 percent confidence interval:
## -0.03676158 0.03676158
## sample estimates:
## mean in group 0 mean in group 1
## 0.2053763 0.2053763
The means are same for each t-test. PSM has now made both the treatment and control groups comparable.
# Plot the income variable against the propensity score (by catholic)
g3 <- ggplot(data = sorted_psm_data, aes(x = prediction, y= w3income_1k, color = as.factor(catholic)))+
geom_point(aes(prediction, w3income_1k))+
geom_smooth(aes(x = prediction, y= w3income_1k, color = as.factor(catholic), group = catholic), method = "loess")
g3
The means are identical.
sorted_psm_data %>% group_by(catholic) %>% summarise_all(mean)
## # A tibble: 2 × 9
## catholic c5r2mtsc_std w3income_1k p5numpla w3momed_hsb prediction distance
## <int> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 0 0.335 86.2 1.07 0.205 0.203 0.203
## 2 1 0.220 86.2 1.07 0.205 0.203 0.203
## # … with 2 more variables: weights <dbl>, subclass <dbl>
# Regression of just scores on the catholic dummy
y4 <- lm(c5r2mtsc_std ~ catholic, sorted_psm_data)
stargazer(y4, type = "text")
##
## ===============================================
## Dependent variable:
## ---------------------------
## c5r2mtsc_std
## -----------------------------------------------
## catholic -0.115***
## (0.041)
##
## Constant 0.335***
## (0.029)
##
## -----------------------------------------------
## Observations 1,860
## R2 0.004
## Adjusted R2 0.004
## Residual Std. Error 0.883 (df = 1858)
## F Statistic 7.867*** (df = 1; 1858)
## ===============================================
## Note: *p<0.1; **p<0.05; ***p<0.01
# Regression that includes all the variables
y5 <- lm(c5r2mtsc_std ~ catholic + w3income_1k + p5numpla + w3momed_hsb, sorted_psm_data)
stargazer(y5, type = "text")
##
## ===============================================
## Dependent variable:
## ---------------------------
## c5r2mtsc_std
## -----------------------------------------------
## catholic -0.115***
## (0.039)
##
## w3income_1k 0.004***
## (0.0005)
##
## p5numpla -0.091
## (0.070)
##
## w3momed_hsb -0.366***
## (0.050)
##
## Constant 0.151*
## (0.091)
##
## -----------------------------------------------
## Observations 1,860
## R2 0.088
## Adjusted R2 0.086
## Residual Std. Error 0.846 (df = 1855)
## F Statistic 44.707*** (df = 4; 1855)
## ===============================================
## Note: *p<0.1; **p<0.05; ***p<0.01
Now, both the regressions show a negative effect of catholic schools on math scores that is significant at the 1% level.
#Predictions from the logit regression
prediction_logit <- fitted(y3)
plot(psm$distance, prediction_logit)
#Keep only numeric variables from original dataset
lassodata <- data %>% select(c5r2mtsc_std, catholic, race_white, race_black, race_hispanic, race_asian, p5numpla, p5hmage, p5hdage, w3daded_hsb, w3momed_hsb, w3momscr, w3dadscr, w3income, w3povrty, p5fstamp, c5r2mtsc, w3income_1k)
head(lassodata)
## c5r2mtsc_std catholic race_white race_black race_hispanic race_asian p5numpla
## 1 0.9817533 0 1 0 0 0 1
## 2 0.5943775 0 1 0 0 0 1
## 3 0.4906106 0 1 0 0 0 1
## 4 1.4512779 1 1 0 0 0 1
## 5 2.5956991 0 1 0 0 0 1
## 6 0.3851966 0 1 0 0 0 1
## p5hmage p5hdage w3daded_hsb w3momed_hsb w3momscr w3dadscr w3income w3povrty
## 1 47 45 0 0 53.50 77.5 62500.5 0
## 2 41 48 0 0 34.95 53.5 45000.5 0
## 3 43 55 0 0 63.43 53.5 62500.5 0
## 4 38 39 0 0 53.50 53.5 87500.5 0
## 5 47 57 0 0 61.56 77.5 150000.5 0
## 6 41 41 0 0 38.18 53.5 62500.5 0
## p5fstamp c5r2mtsc w3income_1k
## 1 0 60.043 62.5005
## 2 0 56.280 45.0005
## 3 0 55.272 62.5005
## 4 0 64.604 87.5005
## 5 0 75.721 150.0005
## 6 0 54.248 62.5005
# Run lasso
y <- data.matrix(lassodata$catholic)
x <- data.matrix(lassodata[,c(1, 3:18)])
lasso1 <- cv.glmnet(x, y, family = "gaussian", alpha = 1 )
lasso <- glmnet(x, y, family = "gaussian", alpha = 1, lambda = lasso1$lambda.1se)
coef(lasso)
## 18 x 1 sparse Matrix of class "dgCMatrix"
## s0
## (Intercept) 7.516499e-02
## c5r2mtsc_std .
## race_white .
## race_black .
## race_hispanic .
## race_asian .
## p5numpla .
## p5hmage 1.700277e-03
## p5hdage .
## w3daded_hsb -2.147176e-02
## w3momed_hsb -2.195889e-02
## w3momscr .
## w3dadscr .
## w3income 7.035449e-07
## w3povrty .
## p5fstamp .
## c5r2mtsc .
## w3income_1k 2.168253e-16
Using LASSO, the variables to be included are p5hmage, w3daded_hsb, w3momed_hsb, w3income and w3income_1k. These variables differ from the ones used for PSM
plot(lasso1)
# Generate a prediction from the "best lasso" model
lasso_prediction <- predict(lasso, newx = x, type = "response")
# Use that prediction as your distance in the matchit command
match <- matchit(catholic ~ p5hmage + w3daded_hsb + w3momed_hsb + w3income + w3income_1k, data = lassodata, family = binomial(), distance = as.numeric(lasso_prediction))
# Create the match.data dataset based on that matchit
matchdata <- match.data(match)
# Create some plots to compare: w3income_1k vs prediction
g5 <- ggplot(data = matchdata, aes(x = distance, y = w3income_1k, col = as.factor(catholic))) + geom_smooth(method = "loess")
g5
# w3daded_hsb vs prediction
g6 <- ggplot(data = matchdata, aes(x = distance, y = w3daded_hsb, col = as.factor(catholic))) + geom_smooth(method = "loess")
g6