ecls_full <- read.csv("ecls.csv")
ecls <- select(ecls_full, c5r2mtsc_std, catholic, w3income_1k, p5numpla, w3momed_hsb)
• Do test scores on c5r2mtsc_std differ for pupils of catholic and non-catholic schools on average?
ecls %>%
group_by(catholic) %>%
summarise(n_students = n(),
mean_math = mean(c5r2mtsc_std),
std_error = sd(c5r2mtsc_std) / sqrt(n_students))
## # A tibble: 2 × 4
## catholic n_students mean_math std_error
## <int> <int> <dbl> <dbl>
## 1 0 4499 0.163 0.0145
## 2 1 930 0.220 0.0281
To answer the first question, they are different. But it doesn’t mean that it differs for pupils of catholic and non-catholic schools because we don’t control for other factors.
library(stargazer)
lm_naive <- lm(c5r2mtsc_std ~ catholic, data = ecls)
stargazer(lm_naive, keep.stat = c("n", "adj.rsq"), type = "text", single.row = TRUE)
##
## ========================================
## Dependent variable:
## ---------------------------
## c5r2mtsc_std
## ----------------------------------------
## catholic 0.057 (0.034)
## Constant 0.163*** (0.014)
## ----------------------------------------
## Observations 5,429
## Adjusted R2 0.0003
## ========================================
## Note: *p<0.1; **p<0.05; ***p<0.01
lm_controls <- lm(c5r2mtsc_std ~ catholic +
w3income_1k + p5numpla + w3momed_hsb, data = ecls)
stargazer(lm_controls, keep.stat = c("n", "adj.rsq"), type = "text", single.row = TRUE)
##
## ========================================
## Dependent variable:
## ---------------------------
## c5r2mtsc_std
## ----------------------------------------
## catholic -0.127*** (0.033)
## w3income_1k 0.005*** (0.0003)
## p5numpla -0.101*** (0.036)
## w3momed_hsb -0.374*** (0.027)
## Constant 0.073 (0.049)
## ----------------------------------------
## Observations 5,429
## Adjusted R2 0.124
## ========================================
## Note: *p<0.1; **p<0.05; ***p<0.01
After controlling other factors, we can see that catholic schools have a significant effect on the math scores but it’s negative, which is different than we assume.
ecls %>%
group_by(catholic) %>%
summarise_all(mean)
## # A tibble: 2 × 5
## catholic c5r2mtsc_std w3income_1k p5numpla w3momed_hsb
## <int> <dbl> <dbl> <dbl> <dbl>
## 1 0 0.163 65.4 1.11 0.392
## 2 1 0.220 86.2 1.07 0.205
t1 <- t.test(ecls[, "w3income_1k"] ~ ecls[, 'catholic'])
t2 <- t.test(ecls[, "p5numpla"] ~ ecls[, 'catholic'])
t3 <- t.test(ecls[, "w3momed_hsb"] ~ ecls[, 'catholic'])
t1$p.value; t2$p.value; t3$p.value
## [1] 1.212805e-37
## [1] 0.001791557
## [1] 1.530072e-33
The catholic is correlatted to p5numpla. Therefore we have covariates issue.
cath_logit <- glm(catholic ~ w3income_1k + p5numpla + w3momed_hsb,
family = binomial(), data = ecls)
stargazer(cath_logit, type = "text")
##
## =============================================
## Dependent variable:
## ---------------------------
## catholic
## ---------------------------------------------
## w3income_1k 0.008***
## (0.001)
##
## p5numpla -0.277**
## (0.123)
##
## w3momed_hsb -0.650***
## (0.092)
##
## Constant -1.670***
## (0.158)
##
## ---------------------------------------------
## Observations 5,429
## Log Likelihood -2,375.301
## Akaike Inf. Crit. 4,758.603
## =============================================
## Note: *p<0.1; **p<0.05; ***p<0.01
# make a prediction
prs_df <- data.frame(pr_score = predict(cath_logit, type = "response"),
catholic = cath_logit$model$catholic)
ecls$prs_df = prs_df$pr_score
ggplot(ecls, aes(x = prs_df, y = w3income_1k, color = as.factor(catholic))) +
geom_point(alpha = 0.2, size = 1.3) +
geom_smooth(method = "loess", se = F) +
xlab("Logit prediction")
labs <- paste("Actual school type attended:", c("Catholic", "Public"))
prs_df %>%
mutate(catholic = ifelse(catholic == 1, labs[1], labs[2])) %>%
ggplot(aes(x = pr_score)) +
geom_histogram(color = "white") +
facet_wrap(~catholic) +
xlab("Probability of going to Catholic school") +
theme_bw()
bal <- ggplot(prs_df, aes(pr_score, fill = as.factor(catholic))) +
stat_density(aes(y = ..density..),position = "identity"
, color = "black", alpha=0.5)
bal
### 4. Run a Matching algorithm to get a balanced dataset
psm <- matchit(catholic ~ w3income_1k + p5numpla + w3momed_hsb,
method = "nearest", data = ecls)
# This is the new dataframe with matched data
psm_data <- match.data(psm)
dim(psm_data) # 2704 = 1352 pairs were matched
## [1] 1860 9
head(arrange(psm_data, distance), 10)
## c5r2mtsc_std catholic w3income_1k p5numpla w3momed_hsb prs_df distance
## 1 -1.9516170 0 17.5005 2 1 0.06069529 0.06069529
## 2 -2.5922340 1 17.5005 2 1 0.06069529 0.06069529
## 3 0.2015457 0 22.5005 2 1 0.06295488 0.06295488
## 4 0.5271555 1 22.5005 2 1 0.06295488 0.06295488
## 5 -0.1217994 0 27.5005 2 1 0.06529274 0.06529274
## 6 -1.6878765 1 27.5005 2 1 0.06529274 0.06529274
## 7 -0.3333479 0 32.5005 2 1 0.06771114 0.06771114
## 8 -1.2722942 1 32.5005 2 1 0.06771114 0.06771114
## 9 -0.7403859 0 37.5005 2 1 0.07021240 0.07021240
## 10 -0.7841369 1 37.5005 2 1 0.07021240 0.07021240
## weights subclass
## 1 1 598
## 2 1 598
## 3 1 222
## 4 1 222
## 5 1 272
## 6 1 272
## 7 1 859
## 8 1 859
## 9 1 728
## 10 1 728
The distance is very close.
psm_data %>%
group_by(catholic) %>%
summarise_all(mean)
## # A tibble: 2 × 9
## catholic c5r2mtsc_std w3income_1k p5numpla w3momed_hsb prs_df distance weights
## <int> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 0 0.321 86.2 1.07 0.205 0.203 0.203 1
## 2 1 0.220 86.2 1.07 0.205 0.203 0.203 1
## # ℹ 1 more variable: subclass <dbl>
t1 <- t.test(psm_data[, "w3income_1k"] ~ psm_data[, 'catholic'])
t2 <- t.test(psm_data[, "p5numpla"] ~ psm_data[, 'catholic'])
t3 <- t.test(psm_data[, "w3momed_hsb"] ~ psm_data[, 'catholic'])
t1$p.value; t2$p.value; t3$p.value
## [1] 1
## [1] 1
## [1] 1
This time we don’t have covariates issue.
ggplot(psm_data, aes(x = distance, y = w3momed_hsb, color = as.factor(catholic))) +
geom_point(alpha = 0.2, size = 1.3) +
geom_smooth(method = "loess", se = F) +
xlab("Propensity score")
psm_data %>%
group_by(catholic) %>%
summarise(n_students = n(),
mean_math = mean(c5r2mtsc_std),
std_error = sd(c5r2mtsc_std) / sqrt(n_students))
## # A tibble: 2 × 4
## catholic n_students mean_math std_error
## <int> <int> <dbl> <dbl>
## 1 0 930 0.321 0.0301
## 2 1 930 0.220 0.0281
lm_psm_naive <- lm(c5r2mtsc_std ~ catholic, data = psm_data)
stargazer(lm_psm_naive, keep.stat = c("n", "adj.rsq"), type='text')
##
## ========================================
## Dependent variable:
## ---------------------------
## c5r2mtsc_std
## ----------------------------------------
## catholic -0.101**
## (0.041)
##
## Constant 0.321***
## (0.029)
##
## ----------------------------------------
## Observations 1,860
## Adjusted R2 0.003
## ========================================
## Note: *p<0.1; **p<0.05; ***p<0.01
lm_psm_controls <- lm(c5r2mtsc_std ~ catholic +
w3income_1k + p5numpla + w3momed_hsb, data = psm_data)
stargazer(lm_psm_controls, keep.stat = c("n", "adj.rsq"), type='text')
##
## ========================================
## Dependent variable:
## ---------------------------
## c5r2mtsc_std
## ----------------------------------------
## catholic -0.101**
## (0.040)
##
## w3income_1k 0.004***
## (0.0005)
##
## p5numpla -0.115
## (0.071)
##
## w3momed_hsb -0.297***
## (0.050)
##
## Constant 0.151
## (0.092)
##
## ----------------------------------------
## Observations 1,860
## Adjusted R2 0.072
## ========================================
## Note: *p<0.1; **p<0.05; ***p<0.01
plot(ecls$prs_df,psm$distance)
numeric <- unlist(lapply(ecls_full, is.numeric))
ymat <- as.matrix(ecls_full$catholic)
xall <- as.matrix(ecls_full[ , numeric])
xall <- xall[,2:dim(xall)[2]]
head(xall)
## race_white race_black race_hispanic race_asian p5numpla p5hmage p5hdage
## [1,] 1 0 0 0 1 47 45
## [2,] 1 0 0 0 1 41 48
## [3,] 1 0 0 0 1 43 55
## [4,] 1 0 0 0 1 38 39
## [5,] 1 0 0 0 1 47 57
## [6,] 1 0 0 0 1 41 41
## w3daded_hsb w3momed_hsb w3momscr w3dadscr w3income w3povrty p5fstamp
## [1,] 0 0 53.50 77.5 62500.5 0 0
## [2,] 0 0 34.95 53.5 45000.5 0 0
## [3,] 0 0 63.43 53.5 62500.5 0 0
## [4,] 0 0 53.50 53.5 87500.5 0 0
## [5,] 0 0 61.56 77.5 150000.5 0 0
## [6,] 0 0 38.18 53.5 62500.5 0 0
## c5r2mtsc c5r2mtsc_std w3income_1k
## [1,] 60.043 0.9817533 62.5005
## [2,] 56.280 0.5943775 45.0005
## [3,] 55.272 0.4906106 62.5005
## [4,] 64.604 1.4512779 87.5005
## [5,] 75.721 2.5956991 150.0005
## [6,] 54.248 0.3851966 62.5005
pick_lasso <- cv.glmnet(y=ymat, x=xall, alpha=1, family="binomial")
plot(pick_lasso)
best_lasso <- glmnet(y=ymat, x=xall, alpha=1, lambda=pick_lasso$lambda.1se)
coef(best_lasso)
## 18 x 1 sparse Matrix of class "dgCMatrix"
## s0
## (Intercept) 4.528656e-02
## race_white .
## race_black .
## race_hispanic .
## race_asian .
## p5numpla .
## p5hmage 2.436445e-03
## p5hdage .
## w3daded_hsb -2.681065e-02
## w3momed_hsb -2.758450e-02
## w3momscr 8.008080e-05
## w3dadscr .
## w3income 7.521171e-07
## w3povrty -8.906012e-03
## p5fstamp .
## c5r2mtsc .
## c5r2mtsc_std .
## w3income_1k 4.438951e-08
pr_lasso = predict(best_lasso, newx = xall)
psm_lasso <- matchit(catholic ~ w3income_1k + p5numpla + w3momed_hsb,
distance = as.numeric(pr_lasso), method= "nearest", data = ecls)
psm_data_lasso <- match.data(psm_lasso)
lm_psm_lasso <- lm(c5r2mtsc_std ~ catholic +
w3income_1k + p5numpla + w3momed_hsb, data = psm_data_lasso)
stargazer(lm_psm_lasso, keep.stat = c("n", "adj.rsq"), type='text')
##
## ========================================
## Dependent variable:
## ---------------------------
## c5r2mtsc_std
## ----------------------------------------
## catholic -0.183***
## (0.040)
##
## w3income_1k 0.004***
## (0.0005)
##
## p5numpla 0.040
## (0.060)
##
## w3momed_hsb -0.350***
## (0.052)
##
## Constant 0.097
## (0.085)
##
## ----------------------------------------
## Observations 1,860
## Adjusted R2 0.080
## ========================================
## Note: *p<0.1; **p<0.05; ***p<0.01