# necessary packages
library(readr)
library(dplyr)
library(broom)
library(ggplot2)
library(MatchIt)
#1. Download the data
data <- read_csv("http://www.mfilipski.com/files/ecls.csv")
# selecting the variables I want to keep
data_select <- select(data, c5r2mtsc_std, catholic, w3income_1k, p5numpla,
w3momed_hsb)
# Group by Catholic/non-Catholic and summarize mean math scores
df_summary <- data_select %>%
group_by(catholic) %>%
summarize(mean_Math_scores = mean(c5r2mtsc_std))
# Print summary
df_summary
## # A tibble: 2 × 2
## catholic mean_Math_scores
## <dbl> <dbl>
## 1 0 0.163
## 2 1 0.220
# Mean Math score catholic schools is 0.22 and 0.163 for non catholic schools
Yes test scores differ on average between catholic and non catholic schools. In particular,mean Math score for catholic schools is 0.22 and 0.163 for non catholic schools
Can you conclude that going to a catholic school leads to higher math scores? Why or why not? No, we cannot make any valid conclusion that going to catholic school leads to a higher math scores. This is because students in these schools are systematically different and people select into either school based on valid reasons.
Run a regression
# A regression of just math scores on the catholic dummy
model <- lm(c5r2mtsc_std~catholic , data = data_select)
summary(model)
##
## Call:
## lm(formula = c5r2mtsc_std ~ catholic, data = data_select)
##
## Residuals:
## Min 1Q Median 3Q Max
## -3.2667 -0.6082 0.0538 0.6292 3.2705
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 0.16313 0.01424 11.459 <2e-16 ***
## catholic 0.05656 0.03440 1.644 0.1
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.9549 on 5427 degrees of freedom
## Multiple R-squared: 0.000498, Adjusted R-squared: 0.0003138
## F-statistic: 2.704 on 1 and 5427 DF, p-value: 0.1002
# Another regression that includes all the variables listed above
model1 <- lm(c5r2mtsc_std~. , data = data_select)
summary(model1)
##
## Call:
## lm(formula = c5r2mtsc_std ~ ., data = data_select)
##
## Residuals:
## Min 1Q Median 3Q Max
## -3.3922 -0.5696 0.0372 0.5986 3.2572
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 0.0733290 0.0491661 1.491 0.135900
## catholic -0.1273899 0.0328888 -3.873 0.000109 ***
## w3income_1k 0.0053247 0.0003026 17.595 < 2e-16 ***
## p5numpla -0.1009432 0.0355779 -2.837 0.004567 **
## w3momed_hsb -0.3740355 0.0271962 -13.753 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.8941 on 5424 degrees of freedom
## Multiple R-squared: 0.1242, Adjusted R-squared: 0.1235
## F-statistic: 192.3 on 4 and 5424 DF, p-value: < 2.2e-16
The first regression results show that there is a positive coefficient for the Catholic variable, indicating that Catholic schools have slightly higher average test scores than non-Catholic schools. However, the coefficient is not statistically significant at the conventional level of 0.05, with a p-value of 0.1. The R-squared value is very low, indicating that the variation in test scores is not well explained by the Catholic variable alone.
The second regression results suggest that there are significant associations between Math scores and the predictor variables (catholic, w3income_1k, p5numpla, and w3momed_hsb). The intercept is not significantly different from zero. Among the predictor variables, being in a catholic school is found to have a significant negative relationship with maths score. This indicates that Catholic schools tend to have lower test scores than non-Catholic schools. The coefficients for the other predictor variables are all positive and significant, indicating that they have a positive association with test scores. The multiple R-squared value indicates that the model explains about 12.42% of the variation in test scores, and the adjusted R-squared suggests that the model fits the data reasonably well. The F-statistic is large and the associated p-value is very small, indicating that the overall model is statistically significant.
-Do the means of the covariates (income, mother’s education etc) differ between catholic vs. non-catholic schools?
# Compare the means [Hint: summarise_all()]
data_select %>%
group_by(catholic) %>%
summarise_all(mean, na.rm = TRUE)
## # A tibble: 2 × 5
## catholic c5r2mtsc_std w3income_1k p5numpla w3momed_hsb
## <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 0 0.163 65.4 1.11 0.392
## 2 1 0.220 86.2 1.07 0.205
# Test the difference in means for statistical significance [Hint: t.test]
t_tests <- lapply(c("w3income_1k", "p5numpla", "w3momed_hsb"), function(x) t.test(data_select[[x]] ~ data_select$catholic))
tidy_do <- function(x) {
if (inherits(x, "htest")) {
tidy(x)
} else {
data.frame()
}
}
results <- do.call(rbind, lapply(t_tests, tidy_do))
results[, c("estimate", "p.value")]
## # A tibble: 3 × 2
## estimate p.value
## <dbl> <dbl>
## 1 -20.8 1.21e-37
## 2 0.0331 1.79e- 3
## 3 0.187 1.53e-33
From the t-test, we can see that the difference between the covariates among catholic and non catholic schools are not comparable at baseline. So we cannot compare apples to oranges and hence non catholic schools cannot be a good counter-factual for catholic schools.
-Run a logit regression to predict catholic based on the covariates we kept (but not the math scores, of course).
# Run logistic regression
model_logit <- glm(catholic ~ w3income_1k + p5numpla + w3momed_hsb, data = data_select, family = binomial())
summary(model_logit)
##
## Call:
## glm(formula = catholic ~ w3income_1k + p5numpla + w3momed_hsb,
## family = binomial(), data = data_select)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -1.0172 -0.6461 -0.5240 -0.4122 2.3672
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -1.6702950 0.1576164 -10.597 < 2e-16 ***
## w3income_1k 0.0077921 0.0008115 9.602 < 2e-16 ***
## p5numpla -0.2774346 0.1232930 -2.250 0.0244 *
## w3momed_hsb -0.6504757 0.0915710 -7.104 1.22e-12 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 4972.4 on 5428 degrees of freedom
## Residual deviance: 4750.6 on 5425 degrees of freedom
## AIC: 4758.6
##
## Number of Fisher Scoring iterations: 4
-Make a prediction for who is likely to go to catholic school, using this model
# Make a prediction for who is likely to go to catholic school, using this model
prob_catholic_df <- data.frame(pr_score = predict(model_logit, type = "response"),
catholic = model_logit$model$catholic)
head(prob_catholic_df)
## pr_score catholic
## 1 0.1883576 0
## 2 0.1683901 0
## 3 0.1883576 0
## 4 0.2199574 1
## 5 0.3145556 0
## 6 0.1883576 0
-Check for common support
# Checking for common support using overlapping densities
# Add predicted probabilities to data_select
data_select$pred_prob <- prob_catholic_df$pr_score
# common support densities
ggplot(prob_catholic_df, aes(x = pr_score, color = as.factor(catholic))) +
geom_density() +
xlab("Logit prediction")
# Plot predicted probabilities vs w3income_1k, colored by catholic status
ggplot(data_select, aes(x = pred_prob, y = w3income_1k, color = as.factor(catholic))) +
geom_point(alpha = 0.2, size = 1.3) +
geom_smooth(method = "loess", se = FALSE) +
xlab("Predicted probability of Catholic school") +
ylab("W3income_1k")
-Create a matched dataset:
## 4. Run a Matching algorithm to get a balanced dataset
# Create formula for matching
formula <- catholic ~ w3income_1k + p5numpla + w3momed_hsb
# Create matched dataset using MatchIt
match_output <- matchit(formula, data = data_select, method = "nearest", ratio = 1, caliper = 0.1)
# Create matched dataset using match.data
matched_data <- match.data(match_output)
# Examine dimensions of matched dataset
dim(matched_data)
## [1] 1860 9
Out of the 5429 observations from the original data set, only 1860 was matched
-Sort the data by distance, and show the 10 first observations.
# Sort the matched dataset by distance
matched_data_sorted <- matched_data[order(matched_data$distance), ]
# Show the first 10 observations
head(matched_data_sorted, 10)
## # A tibble: 10 × 9
## c5r2mtsc_std catholic w3inc…¹ p5num…² w3mom…³ pred_…⁴ dista…⁵ weights subcl…⁶
## <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <fct>
## 1 -1.95 0 17.5 2 1 0.0607 0.0607 1 598
## 2 -2.59 1 17.5 2 1 0.0607 0.0607 1 598
## 3 0.202 0 22.5 2 1 0.0630 0.0630 1 222
## 4 0.527 1 22.5 2 1 0.0630 0.0630 1 222
## 5 -0.122 0 27.5 2 1 0.0653 0.0653 1 272
## 6 -1.69 1 27.5 2 1 0.0653 0.0653 1 272
## 7 -0.333 0 32.5 2 1 0.0677 0.0677 1 859
## 8 -1.27 1 32.5 2 1 0.0677 0.0677 1 859
## 9 -0.740 0 37.5 2 1 0.0702 0.0702 1 728
## 10 -0.784 1 37.5 2 1 0.0702 0.0702 1 728
## # … with abbreviated variable names ¹w3income_1k, ²p5numpla, ³w3momed_hsb,
## # ⁴pred_prob, ⁵distance, ⁶subclass
By looking at the output, you can observe that the matched pairs have similar values of the propensity score and distance, indicating that the matching was successful.
-In this new dataset, do the means of the covariates (income, etc) differ between catholic vs. non-catholic schools?
# Summarize means by catholic vs non-catholic schools
matched_data_sorted %>%
select_if(is.numeric) %>%
group_by(catholic) %>%
summarise_all(mean, na.rm = TRUE)
## # A tibble: 2 × 8
## catholic c5r2mtsc_std w3income_1k p5numpla w3momed_hsb pred_…¹ dista…² weights
## <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 0 0.321 86.2 1.07 0.205 0.203 0.203 1
## 2 1 0.220 86.2 1.07 0.205 0.203 0.203 1
## # … with abbreviated variable names ¹pred_prob, ²distance
# Test for differences in means between catholic vs non-catholic schools
# Test the difference in means for statistical significance [Hint: t.test]
t_tests2 <- lapply(c("w3income_1k", "p5numpla", "w3momed_hsb"), function(x) t.test(matched_data_sorted[[x]] ~ matched_data_sorted$catholic))
tidy_do2 <- function(x) {
if (inherits(x, "htest")) {
tidy(x)
} else {
data.frame()
}
}
results2 <- do.call(rbind, lapply(t_tests2, tidy_do2))
results2[, c("estimate", "p.value")]
## # A tibble: 3 × 2
## estimate p.value
## <dbl> <dbl>
## 1 0 1
## 2 0 1
## 3 0 1
##6. Compute the ATE in your new sample - Compare the means if the two groups
# Compare the means if the two groups
# Group by Catholic/non-Catholic and summarize mean math scores
df_summary2 <- matched_data_sorted %>%
group_by(catholic) %>%
summarize(mean_Math_scores2 = mean(c5r2mtsc_std))
# Print summary
df_summary2
## # A tibble: 2 × 2
## catholic mean_Math_scores2
## <dbl> <dbl>
## 1 0 0.321
## 2 1 0.220
-Run regressions:
# A regression of just math scores on the catholic dummy
model2 <- lm(c5r2mtsc_std~catholic , data = matched_data_sorted)
summary(model2)
##
## Call:
## lm(formula = c5r2mtsc_std ~ catholic, data = matched_data_sorted)
##
## Residuals:
## Min 1Q Median 3Q Max
## -2.85183 -0.54554 0.02954 0.57814 2.83555
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 0.32084 0.02914 11.010 <2e-16 ***
## catholic -0.10116 0.04121 -2.455 0.0142 *
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.8887 on 1858 degrees of freedom
## Multiple R-squared: 0.003232, Adjusted R-squared: 0.002696
## F-statistic: 6.025 on 1 and 1858 DF, p-value: 0.01419
# Another regression that includes all the variables listed above
model3 <- lm(c5r2mtsc_std~catholic + w3income_1k + p5numpla + w3momed_hsb ,
data = matched_data_sorted)
summary(model3)
##
## Call:
## lm(formula = c5r2mtsc_std ~ catholic + w3income_1k + p5numpla +
## w3momed_hsb, data = matched_data_sorted)
##
## Residuals:
## Min 1Q Median 3Q Max
## -2.92670 -0.52933 0.02618 0.57364 2.86344
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 0.1510520 0.0922538 1.637 0.102
## catholic -0.1011580 0.0397520 -2.545 0.011 *
## w3income_1k 0.0041105 0.0004627 8.884 < 2e-16 ***
## p5numpla -0.1150015 0.0709580 -1.621 0.105
## w3momed_hsb -0.2972267 0.0501805 -5.923 3.76e-09 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.8572 on 1855 degrees of freedom
## Multiple R-squared: 0.07404, Adjusted R-squared: 0.07205
## F-statistic: 37.08 on 4 and 1855 DF, p-value: < 2.2e-16
##5 Visually check the balance in the new dataset: For some reason the codes for Question 5 was not running in the markdown even though it was running in the R script
ggplot(matched_data_sorted, aes(x = prob_catholic_df, y = w3momed_hsb, color = as.factor(catholic))) +
geom_point(alpha = 0.2, size = 1.3) +
geom_smooth(method = "loess", se = F) +
xlab("Logit prediction") +
ylab("Income")
From the plot we can see that the lines overlap to the extent that you cannot see the non-catholic line. This means that the matching is great