rm(list = ls(all.names = TRUE))
# necessaryPackages <-c("lattice", "ggplot2", "stargazer", "did", "tree", "raster", "devtools", "purr", "dplyr", "tidyverse", "magrittr", "hrbrthemes", "MatchIt", "glmnet")
# new.packages <- necessaryPackages[!(necessaryPackages%in% installed.packages()[,"Package"])]
# if(length(new.packages))
# install.packages(new.packages, repos = "http://cran.us.r-project.org")
# lapply(necessaryPackages, require, character.only = TRUE)
library(wbstats)
library(ggplot2)
library(stargazer)
#library(tidyverse)
library(dplyr)
library(magrittr)
library(hrbrthemes) # For density plots
library(tidyr) #density plots
library(viridis) # density plots
library(MatchIt) #matchit
catholic, w3income_1k, p5numpla, w3momed_hsb, c5r2mtsc_std, w3momed_hsburl <-'http://www.mfilipski.com/files/ecls.csv'
# ecls <- read.csv("ecls.csv", header=TRUE)
ecls <- read.csv(url, header=TRUE)
#View(ecls)
qn1_6 <- ecls %>% select( catholic, w3income_1k, p5numpla, w3momed_hsb, c5r2mtsc_std, w3momed_hsb)
# Change to factor.
qn1_6$catholic =as.factor(qn1_6$catholic)
Do test scores on c5r2mtsc_std differ for pupils of catholic and non-catholic schools on average?
Can you conclude that going to a catholic school leads to higher math scores? Why or why not?
Run regressions:
– A regression of just math scores on the catholic dummy
– Another regression that includes all the variables listed above
– Comment on the results
Do the means of the covariates (income, mother’s education etc) differ between catholic vs. non-catholic schools? You can
– Compare the means [Hint: summarise_all()]
– Test the difference in means for statistical significance [Hint: t.test]
– Discuss what would be the problems with an identification strategy based on simple regressions.
model1 <- lm(c5r2mtsc_std~catholic, qn1_6)
summary(model1)
##
## Call:
## lm(formula = c5r2mtsc_std ~ catholic, data = qn1_6)
##
## Residuals:
## Min 1Q Median 3Q Max
## -3.2667 -0.6082 0.0538 0.6292 3.2705
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 0.16313 0.01424 11.459 <2e-16 ***
## catholic1 0.05656 0.03440 1.644 0.1
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.9549 on 5427 degrees of freedom
## Multiple R-squared: 0.000498, Adjusted R-squared: 0.0003138
## F-statistic: 2.704 on 1 and 5427 DF, p-value: 0.1002
model2 <-lm(c5r2mtsc_std~., qn1_6)
summary(model2)
##
## Call:
## lm(formula = c5r2mtsc_std ~ ., data = qn1_6)
##
## Residuals:
## Min 1Q Median 3Q Max
## -3.3922 -0.5696 0.0372 0.5986 3.2572
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 0.0733290 0.0491661 1.491 0.135900
## catholic1 -0.1273899 0.0328888 -3.873 0.000109 ***
## w3income_1k 0.0053247 0.0003026 17.595 < 2e-16 ***
## p5numpla -0.1009432 0.0355779 -2.837 0.004567 **
## w3momed_hsb -0.3740355 0.0271962 -13.753 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.8941 on 5424 degrees of freedom
## Multiple R-squared: 0.1242, Adjusted R-squared: 0.1235
## F-statistic: 192.3 on 4 and 5424 DF, p-value: < 2.2e-16
stargazer(model1, model2, title="Regression of Test Scores on Catholic Dummy",
covariate.labels = "Catholic",
type="text",
keep.stat=c("n", "rsq"),
column_labels=c("OLS"))
##
## Regression of Test Scores on Catholic Dummy
## =========================================
## Dependent variable:
## ----------------------------
## c5r2mtsc_std
## (1) (2)
## -----------------------------------------
## Catholic 0.057 -0.127***
## (0.034) (0.033)
##
## w3income_1k 0.005***
## (0.0003)
##
## p5numpla -0.101***
## (0.036)
##
## w3momed_hsb -0.374***
## (0.027)
##
## Constant 0.163*** 0.073
## (0.014) (0.049)
##
## -----------------------------------------
## Observations 5,429 5,429
## R2 0.0005 0.124
## =========================================
## Note: *p<0.1; **p<0.05; ***p<0.01
##
## Regression of Test Scores on Catholic Dummy
## ===
## OLS
## ---
# Do they differ between catholic and non catholic?
by_catholic<-qn1_6 %>%
group_by(catholic)
sum_cov <- by_catholic %>%
summarise_all(list(mean))
sum_cov
## # A tibble: 2 x 5
## catholic w3income_1k p5numpla w3momed_hsb c5r2mtsc_std
## <fct> <dbl> <dbl> <dbl> <dbl>
## 1 0 65.4 1.11 0.392 0.163
## 2 1 86.2 1.07 0.205 0.220
attach(qn1_6)
names(qn1_6)
## [1] "catholic" "w3income_1k" "p5numpla" "w3momed_hsb" "c5r2mtsc_std"
t.test(c5r2mtsc_std~catholic)
##
## Welch Two Sample t-test
##
## data: c5r2mtsc_std by catholic
## t = -1.7866, df = 1468.1, p-value = 0.07421
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
## -0.118653665 0.005539377
## sample estimates:
## mean in group 0 mean in group 1
## 0.1631279 0.2196851
t.test(w3income_1k~catholic)
##
## Welch Two Sample t-test
##
## data: w3income_1k by catholic
## t = -13.238, df = 1314.5, p-value < 2.2e-16
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
## -23.86719 -17.70620
## sample estimates:
## mean in group 0 mean in group 1
## 65.39393 86.18063
t.test(p5numpla~catholic)
##
## Welch Two Sample t-test
##
## data: p5numpla by catholic
## t = 3.128, df = 1600.4, p-value = 0.001792
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
## 0.01235472 0.05390038
## sample estimates:
## mean in group 0 mean in group 1
## 1.106246 1.073118
t.test(w3momed_hsb~catholic)
##
## Welch Two Sample t-test
##
## data: w3momed_hsb by catholic
## t = 12.362, df = 1545.1, p-value < 2.2e-16
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
## 0.1572715 0.2165946
## sample estimates:
## mean in group 0 mean in group 1
## 0.3923094 0.2053763
3. Estimate the propensity to go to catholic school, by hand
* Run a logit regression to predict catholic based on the co-variates we kept (but not the math scores, of course).
* Make a prediction for who is likely to go to catholic school, using this mode
* Check that there is common support. * Plot the income variable against the logit prediction (by catholic), and add the lowess smooth densities to the plot. They should look similar, but not perfectly aligned.
logit <- glm(catholic~w3income_1k+p5numpla+ w3momed_hsb, data=qn1_6, family= binomial())
summary(logit)
##
## Call:
## glm(formula = catholic ~ w3income_1k + p5numpla + w3momed_hsb,
## family = binomial(), data = qn1_6)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -1.0172 -0.6461 -0.5240 -0.4122 2.3672
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -1.6702950 0.1576164 -10.597 < 2e-16 ***
## w3income_1k 0.0077921 0.0008115 9.602 < 2e-16 ***
## p5numpla -0.2774346 0.1232930 -2.250 0.0244 *
## w3momed_hsb -0.6504757 0.0915710 -7.104 1.22e-12 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 4972.4 on 5428 degrees of freedom
## Residual deviance: 4750.6 on 5425 degrees of freedom
## AIC: 4758.6
##
## Number of Fisher Scoring iterations: 4
#prop_to_cath <- predict.glm(logit,qn1_6)
qn1_6$prop_to_cath <- predict(logit, type="response", newdata=qn1_6)
#graph <- cbind(prop_to_cath, qn1_6)
common_support <- ggplot(data= qn1_6, title="Probability to go to a Catholic School ",aes(x=prop_to_cath, group=as.factor(catholic), fill=as.factor(catholic))) +
geom_density(adjust=1.5, alpha=.4) +
theme_ipsum()
common_support
# Plot against logit prediction and add lowess.
lowess_plot <- ggplot( data =qn1_6) +
geom_smooth(aes(x= w3income_1k, y=predict(logit, type="response"), color=as.factor(catholic), ), method="loess",)
lowess_plot
4. Run a Matching algorithm to get a balanced dataset.
* Create a matched dataset:
* Sort the data by distance, and show the 10 first observations.
match_prop <- matchit(catholic~w3income_1k+p5numpla+ w3momed_hsb, data=qn1_6)
# Create your matched data set by using match.data
summary(match_prop)
##
## Call:
## matchit(formula = catholic ~ w3income_1k + p5numpla + w3momed_hsb,
## data = qn1_6)
##
## Summary of Balance for All Data:
## Means Treated Means Control Std. Mean Diff. Var. Ratio eCDF Mean
## distance 0.2034 0.1647 0.5198 0.9539 0.1217
## w3income_1k 86.1806 65.3939 0.4744 1.0647 0.1011
## p5numpla 1.0731 1.1062 -0.1182 0.6323 0.0066
## w3momed_hsb 0.2054 0.3923 -0.4627 . 0.1869
## eCDF Max
## distance 0.2544
## w3income_1k 0.2478
## p5numpla 0.0265
## w3momed_hsb 0.1869
##
##
## Summary of Balance for Matched Data:
## Means Treated Means Control Std. Mean Diff. Var. Ratio eCDF Mean
## distance 0.2034 0.2034 0 1 0
## w3income_1k 86.1806 86.1806 0 1 0
## p5numpla 1.0731 1.0731 0 1 0
## w3momed_hsb 0.2054 0.2054 0 . 0
## eCDF Max Std. Pair Dist.
## distance 0 0
## w3income_1k 0 0
## p5numpla 0 0
## w3momed_hsb 0 0
##
## Percent Balance Improvement:
## Std. Mean Diff. Var. Ratio eCDF Mean eCDF Max
## distance 100 100 100 100
## w3income_1k 100 100 100 100
## p5numpla 100 100 100 100
## w3momed_hsb 100 . 100 100
##
## Sample Sizes:
## Control Treated
## All 4499 930
## Matched 930 930
## Unmatched 3569 0
## Discarded 0 0
match_psm <-match.data(match_prop)
summary(match_psm)
## catholic w3income_1k p5numpla w3momed_hsb c5r2mtsc_std
## 0:930 Min. : 7.50 Min. :1.000 Min. :0.0000 Min. :-2.9539
## 1:930 1st Qu.: 62.50 1st Qu.:1.000 1st Qu.:0.0000 1st Qu.:-0.2794
## Median : 87.50 Median :1.000 Median :0.0000 Median : 0.3035
## Mean : 86.18 Mean :1.073 Mean :0.2054 Mean : 0.2771
## 3rd Qu.: 87.50 3rd Qu.:1.000 3rd Qu.:0.0000 3rd Qu.: 0.8766
## Max. :200.00 Max. :3.000 Max. :1.0000 Max. : 3.1066
##
## prop_to_cath distance weights subclass
## Min. :0.0607 Min. :0.0607 Min. :1 1 : 2
## 1st Qu.:0.1604 1st Qu.:0.1604 1st Qu.:1 2 : 2
## Median :0.1884 Median :0.1884 Median :1 3 : 2
## Mean :0.2034 Mean :0.2034 Mean :1 4 : 2
## 3rd Qu.:0.2200 3rd Qu.:0.2200 3rd Qu.:1 5 : 2
## Max. :0.4039 Max. :0.4039 Max. :1 6 : 2
## (Other):1848
dim(match_psm)
## [1] 1860 9
# sort by mpg
# newdata <- mtcars[order(mpg),]
newdata= match_psm[order(match_psm$distance),]
head(newdata, 10)
## catholic w3income_1k p5numpla w3momed_hsb c5r2mtsc_std prop_to_cath
## 176 0 17.5005 2 1 -0.3835843 0.06069529
## 4274 1 17.5005 2 1 -2.5922340 0.06069529
## 41 0 22.5005 2 1 -2.3690526 0.06295488
## 2083 1 22.5005 2 1 0.5271555 0.06295488
## 561 0 27.5005 2 1 -0.2195955 0.06529274
## 2304 1 27.5005 2 1 -1.6878765 0.06529274
## 178 0 32.5005 2 1 -0.2550080 0.06771114
## 716 1 32.5005 2 1 -1.2722942 0.06771114
## 288 0 37.5005 2 1 -0.7403859 0.07021240
## 485 1 37.5005 2 1 -0.7841369 0.07021240
## distance weights subclass
## 176 0.06069529 1 598
## 4274 0.06069529 1 598
## 41 0.06295488 1 222
## 2083 0.06295488 1 222
## 561 0.06529274 1 272
## 2304 0.06529274 1 272
## 178 0.06771114 1 859
## 716 0.06771114 1 859
## 288 0.07021240 1 728
## 485 0.07021240 1 728
# summarise_all(match_psm)
attach(match_psm)
sum_Cov_matched = match_psm %>%
group_by(catholic) %>%
summarise(Income = mean(w3income_1k),Places= mean(p5numpla),Moth_educ= mean(w3momed_hsb)
)
sum_Cov_matched
## # A tibble: 2 x 4
## catholic Income Places Moth_educ
## <fct> <dbl> <dbl> <dbl>
## 1 0 86.2 1.07 0.205
## 2 1 86.2 1.07 0.205
t.test(c5r2mtsc_std~catholic)
##
## Welch Two Sample t-test
##
## data: c5r2mtsc_std by catholic
## t = 2.8048, df = 1852.1, p-value = 0.005087
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
## 0.03455008 0.19520664
## sample estimates:
## mean in group 0 mean in group 1
## 0.3345634 0.2196851
t.test(w3income_1k~catholic)
##
## Welch Two Sample t-test
##
## data: w3income_1k by catholic
## t = 0, df = 1858, p-value = 1
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
## -3.985565 3.985565
## sample estimates:
## mean in group 0 mean in group 1
## 86.18063 86.18063
t.test(p5numpla~catholic)
##
## Welch Two Sample t-test
##
## data: p5numpla by catholic
## t = 0, df = 1858, p-value = 1
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
## -0.02550007 0.02550007
## sample estimates:
## mean in group 0 mean in group 1
## 1.073118 1.073118
t.test(w3momed_hsb~catholic)
##
## Welch Two Sample t-test
##
## data: w3momed_hsb by catholic
## t = 0, df = 1858, p-value = 1
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
## -0.03676158 0.03676158
## sample estimates:
## mean in group 0 mean in group 1
## 0.2053763 0.2053763
lowess_plot1 <- ggplot( data =qn1_6) +
geom_smooth(aes(x= w3income_1k, y=predict(logit, type="response"), color=as.factor(catholic), ), method="loess",)
lowess_plot1
* Compute the average treatment effects in your new sample.
Compare means of the two groups.
Run regression with the catholic dummy.
## Compare means
match_psm%>%
group_by(catholic) %>%
summarise(score=mean(c5r2mtsc_std))
## # A tibble: 2 x 2
## catholic score
## <fct> <dbl>
## 1 0 0.335
## 2 1 0.220
## Do a t-test to compare
attach(match_psm)
t.test(c5r2mtsc_std~catholic, match_psm
)
##
## Welch Two Sample t-test
##
## data: c5r2mtsc_std by catholic
## t = 2.8048, df = 1852.1, p-value = 0.005087
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
## 0.03455008 0.19520664
## sample estimates:
## mean in group 0 mean in group 1
## 0.3345634 0.2196851
model_psm <- lm(c5r2mtsc_std~catholic, match_psm)
summary(model_psm)
##
## Call:
## lm(formula = c5r2mtsc_std ~ catholic, data = match_psm)
##
## Residuals:
## Min 1Q Median 3Q Max
## -3.2884 -0.5547 0.0443 0.5931 2.8356
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 0.33456 0.02896 11.552 < 2e-16 ***
## catholic1 -0.11488 0.04096 -2.805 0.00509 **
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.8832 on 1858 degrees of freedom
## Multiple R-squared: 0.004216, Adjusted R-squared: 0.00368
## F-statistic: 7.867 on 1 and 1858 DF, p-value: 0.005087
names(match_psm)
## [1] "catholic" "w3income_1k" "p5numpla" "w3momed_hsb" "c5r2mtsc_std"
## [6] "prop_to_cath" "distance" "weights" "subclass"
model_psm1 <-lm(c5r2mtsc_std~catholic+w3income_1k+p5numpla+w3momed_hsb, match_psm)
summary(model2)
##
## Call:
## lm(formula = c5r2mtsc_std ~ ., data = qn1_6)
##
## Residuals:
## Min 1Q Median 3Q Max
## -3.3922 -0.5696 0.0372 0.5986 3.2572
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 0.0733290 0.0491661 1.491 0.135900
## catholic1 -0.1273899 0.0328888 -3.873 0.000109 ***
## w3income_1k 0.0053247 0.0003026 17.595 < 2e-16 ***
## p5numpla -0.1009432 0.0355779 -2.837 0.004567 **
## w3momed_hsb -0.3740355 0.0271962 -13.753 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.8941 on 5424 degrees of freedom
## Multiple R-squared: 0.1242, Adjusted R-squared: 0.1235
## F-statistic: 192.3 on 4 and 5424 DF, p-value: < 2.2e-16
stargazer(model_psm, model_psm1, title="Regression of Test Scores on Catholic Dummy Using PSM",
covariate.labels = "Catholic",
omit ="subclass",
type="text",
keep.stat=c("n", "rsq"),
column_labels=c("OLS"))
##
## Regression of Test Scores on Catholic Dummy Using PSM
## =========================================
## Dependent variable:
## ----------------------------
## c5r2mtsc_std
## (1) (2)
## -----------------------------------------
## Catholic -0.115*** -0.115***
## (0.041) (0.039)
##
## w3income_1k 0.004***
## (0.0005)
##
## p5numpla -0.091
## (0.070)
##
## w3momed_hsb -0.366***
## (0.050)
##
## Constant 0.335*** 0.151*
## (0.029) (0.091)
##
## -----------------------------------------
## Observations 1,860 1,860
## R2 0.004 0.088
## =========================================
## Note: *p<0.1; **p<0.05; ***p<0.01
##
## Regression of Test Scores on Catholic Dummy Using PSM
## ===
## OLS
## ---
plot(match_psm$distance, match_psm$prop_to_cath)
# #Run the same matchit command with distance = your logit predictions.
p_score <- fitted(glm(catholic~w3income_1k+p5numpla+w3momed_hsb, data=qn1_6, family = binomial))
match_prop <- matchit(catholic~w3income_1k+p5numpla+w3momed_hsb, data=qn1_6, distance = p_score)
# Create your matched dat aset by using match.data
summary(match_prop)
##
## Call:
## matchit(formula = catholic ~ w3income_1k + p5numpla + w3momed_hsb,
## data = qn1_6, distance = p_score)
##
## Summary of Balance for All Data:
## Means Treated Means Control Std. Mean Diff. Var. Ratio eCDF Mean
## distance 0.2034 0.1647 0.5198 0.9539 0.1217
## w3income_1k 86.1806 65.3939 0.4744 1.0647 0.1011
## p5numpla 1.0731 1.1062 -0.1182 0.6323 0.0066
## w3momed_hsb 0.2054 0.3923 -0.4627 . 0.1869
## eCDF Max
## distance 0.2544
## w3income_1k 0.2478
## p5numpla 0.0265
## w3momed_hsb 0.1869
##
##
## Summary of Balance for Matched Data:
## Means Treated Means Control Std. Mean Diff. Var. Ratio eCDF Mean
## distance 0.2034 0.2034 0 1 0
## w3income_1k 86.1806 86.1806 0 1 0
## p5numpla 1.0731 1.0731 0 1 0
## w3momed_hsb 0.2054 0.2054 0 . 0
## eCDF Max Std. Pair Dist.
## distance 0 0
## w3income_1k 0 0
## p5numpla 0 0
## w3momed_hsb 0 0
##
## Percent Balance Improvement:
## Std. Mean Diff. Var. Ratio eCDF Mean eCDF Max
## distance 100 100 100 100
## w3income_1k 100 100 100 100
## p5numpla 100 100 100 100
## w3momed_hsb 100 . 100 100
##
## Sample Sizes:
## Control Treated
## All 4499 930
## Matched 930 930
## Unmatched 3569 0
## Discarded 0 0
match_psm <-match.data(match_prop)
summary(match_psm)
## catholic w3income_1k p5numpla w3momed_hsb c5r2mtsc_std
## 0:930 Min. : 7.50 Min. :1.000 Min. :0.0000 Min. :-2.9539
## 1:930 1st Qu.: 62.50 1st Qu.:1.000 1st Qu.:0.0000 1st Qu.:-0.2794
## Median : 87.50 Median :1.000 Median :0.0000 Median : 0.3035
## Mean : 86.18 Mean :1.073 Mean :0.2054 Mean : 0.2771
## 3rd Qu.: 87.50 3rd Qu.:1.000 3rd Qu.:0.0000 3rd Qu.: 0.8766
## Max. :200.00 Max. :3.000 Max. :1.0000 Max. : 3.1066
##
## prop_to_cath distance weights subclass
## Min. :0.0607 Min. :0.0607 Min. :1 1 : 2
## 1st Qu.:0.1604 1st Qu.:0.1604 1st Qu.:1 2 : 2
## Median :0.1884 Median :0.1884 Median :1 3 : 2
## Mean :0.2034 Mean :0.2034 Mean :1 4 : 2
## 3rd Qu.:0.2200 3rd Qu.:0.2200 3rd Qu.:1 5 : 2
## Max. :0.4039 Max. :0.4039 Max. :1 6 : 2
## (Other):1848
dim(match_psm)
## [1] 1860 9
p5hmage, w3daded_hsb, w3momed_hsb, w3incomeone race variable, w3daded_hsb, w3momed_hsb, income w3dadscr, p5numplap5numpla is not there in the lasso model.library(glmnet)
rm(list = ls(all.names = TRUE))
url <-'http://www.mfilipski.com/files/ecls.csv'
# ecls <- read.csv("ecls.csv", header=TRUE)
ecls <- read.csv(url, header=TRUE)
## Select only numeric
qn7_9 <- Filter(is.numeric, ecls)
qn7_9_x <-qn7_9 %>% select(!c(catholic))
qn7_9_y <-qn7_9 %>% select(catholic)
head(qn7_9)
## catholic race_white race_black race_hispanic race_asian p5numpla p5hmage
## 1 0 1 0 0 0 1 47
## 2 0 1 0 0 0 1 41
## 3 0 1 0 0 0 1 43
## 4 1 1 0 0 0 1 38
## 5 0 1 0 0 0 1 47
## 6 0 1 0 0 0 1 41
## p5hdage w3daded_hsb w3momed_hsb w3momscr w3dadscr w3income w3povrty p5fstamp
## 1 45 0 0 53.50 77.5 62500.5 0 0
## 2 48 0 0 34.95 53.5 45000.5 0 0
## 3 55 0 0 63.43 53.5 62500.5 0 0
## 4 39 0 0 53.50 53.5 87500.5 0 0
## 5 57 0 0 61.56 77.5 150000.5 0 0
## 6 41 0 0 38.18 53.5 62500.5 0 0
## c5r2mtsc c5r2mtsc_std w3income_1k
## 1 60.043 0.9817533 62.5005
## 2 56.280 0.5943775 45.0005
## 3 55.272 0.4906106 62.5005
## 4 64.604 1.4512779 87.5005
## 5 75.721 2.5956991 150.0005
## 6 54.248 0.3851966 62.5005
#summary(qn7_9)
## Make sure the data is in the form of matrix.
y= data.matrix(qn7_9_y)
x = data.matrix(qn7_9_x)
dim(x)
## [1] 5429 17
fit= glmnet(x, y)
plot(fit, label =TRUE)
#perform a validation to get the best lambda
cv_model <- cv.glmnet(x, y, alpha =1)
# find the best lambda thaat minimises the MSE
# get best lambda
best_lambda <- cv_model$lambda.1se
plot(cv_model)
best_lambda
## [1] 0.03544492
lasso_model <- glmnet(x = x, y = y, family = "gaussian", alpha = 1, lambda=best_lambda)
coef(lasso_model)
## 18 x 1 sparse Matrix of class "dgCMatrix"
## s0
## (Intercept) 1.132406e-01
## race_white .
## race_black .
## race_hispanic .
## race_asian .
## p5numpla .
## p5hmage 7.016091e-04
## p5hdage .
## w3daded_hsb -1.376823e-02
## w3momed_hsb -1.325551e-02
## w3momscr .
## w3dadscr .
## w3income 6.097789e-07
## w3povrty .
## p5fstamp .
## c5r2mtsc .
## c5r2mtsc_std .
## w3income_1k .
# p_score using lasso
p_score_lasso1 <- predict(lasso_model, s=best_lambda, newx=x)
qn7_9 <-cbind(qn7_9, p_score_lasso1)
qn7_9$prop_to_cath <- predict(lasso_model, type="response", newx=x)
#match using the predicted lasso score
# change to vec
pscore_vec <- as.vector(p_score_lasso1)
match_prop_lasso1 <- matchit(catholic~p5hmage+w3daded_hsb+w3momed_hsb+w3income, data=qn7_9, distance = pscore_vec)
summary(match_prop_lasso1)
##
## Call:
## matchit(formula = catholic ~ p5hmage + w3daded_hsb + w3momed_hsb +
## w3income, data = qn7_9, distance = pscore_vec)
##
## Summary of Balance for All Data:
## Means Treated Means Control Std. Mean Diff. Var. Ratio eCDF Mean
## distance 0.1873 0.1680 0.6092 0.8589 0.1737
## p5hmage 39.7753 37.7946 0.4206 0.6679 0.0499
## w3daded_hsb 0.2699 0.4670 -0.4440 . 0.1971
## w3momed_hsb 0.2054 0.3923 -0.4627 . 0.1869
## w3income 86180.6253 65393.9285 0.4744 1.0647 0.1011
## eCDF Max
## distance 0.2772
## p5hmage 0.1734
## w3daded_hsb 0.1971
## w3momed_hsb 0.1869
## w3income 0.2478
##
##
## Summary of Balance for Matched Data:
## Means Treated Means Control Std. Mean Diff. Var. Ratio eCDF Mean
## distance 0.1873 0.1873 0.0002 1.0007 0.0001
## p5hmage 39.7753 39.6441 0.0279 1.0148 0.0033
## w3daded_hsb 0.2699 0.2785 -0.0194 . 0.0086
## w3momed_hsb 0.2054 0.1978 0.0186 . 0.0075
## w3income 86180.6253 86352.6672 -0.0039 1.0261 0.0031
## eCDF Max Std. Pair Dist.
## distance 0.0022 0.0005
## p5hmage 0.0108 0.0950
## w3daded_hsb 0.0086 0.0484
## w3momed_hsb 0.0075 0.0665
## w3income 0.0075 0.0166
##
## Percent Balance Improvement:
## Std. Mean Diff. Var. Ratio eCDF Mean eCDF Max
## distance 100.0 99.6 100.0 99.2
## p5hmage 93.4 96.4 93.4 93.8
## w3daded_hsb 95.6 . 95.6 95.6
## w3momed_hsb 96.0 . 96.0 96.0
## w3income 99.2 59.0 96.9 97.0
##
## Sample Sizes:
## Control Treated
## All 4499 930
## Matched 930 930
## Unmatched 3569 0
## Discarded 0 0
match_psm_lasso1 <-match.data(match_prop_lasso1)
summary(match_psm_lasso1)
## catholic race_white race_black race_hispanic
## Min. :0.0 Min. :0.0000 Min. :0.0000 Min. :0.00000
## 1st Qu.:0.0 1st Qu.:1.0000 1st Qu.:0.0000 1st Qu.:0.00000
## Median :0.5 Median :1.0000 Median :0.0000 Median :0.00000
## Mean :0.5 Mean :0.7903 Mean :0.0414 Mean :0.08871
## 3rd Qu.:1.0 3rd Qu.:1.0000 3rd Qu.:0.0000 3rd Qu.:0.00000
## Max. :1.0 Max. :1.0000 Max. :1.0000 Max. :1.00000
##
## race_asian p5numpla p5hmage p5hdage
## Min. :0.00000 Min. :1.000 Min. :25.00 Min. :26.00
## 1st Qu.:0.00000 1st Qu.:1.000 1st Qu.:37.00 1st Qu.:38.00
## Median :0.00000 Median :1.000 Median :40.00 Median :41.00
## Mean :0.04892 Mean :1.073 Mean :39.71 Mean :41.91
## 3rd Qu.:0.00000 3rd Qu.:1.000 3rd Qu.:43.00 3rd Qu.:45.00
## Max. :1.00000 Max. :4.000 Max. :55.00 Max. :69.00
##
## w3daded_hsb w3momed_hsb w3momscr w3dadscr
## Min. :0.0000 Min. :0.0000 Min. :29.60 Min. :29.60
## 1st Qu.:0.0000 1st Qu.:0.0000 1st Qu.:38.18 1st Qu.:35.78
## Median :0.0000 Median :0.0000 Median :38.18 Median :39.20
## Mean :0.2742 Mean :0.2016 Mean :47.58 Mean :45.77
## 3rd Qu.:1.0000 3rd Qu.:0.0000 3rd Qu.:59.00 3rd Qu.:53.50
## Max. :1.0000 Max. :1.0000 Max. :77.50 Max. :77.50
##
## w3income w3povrty p5fstamp c5r2mtsc
## Min. : 7500 Min. :0.00000 Min. :0.000000 Min. :21.81
## 1st Qu.: 62501 1st Qu.:0.00000 1st Qu.:0.000000 1st Qu.:48.37
## Median : 87501 Median :0.00000 Median :0.000000 Median :54.05
## Mean : 86267 Mean :0.01667 Mean :0.006989 Mean :53.74
## 3rd Qu.: 87501 3rd Qu.:0.00000 3rd Qu.:0.000000 3rd Qu.:59.54
## Max. :200001 Max. :1.00000 Max. :1.000000 Max. :83.86
##
## c5r2mtsc_std w3income_1k 1 prop_to_cath.s0
## Min. :-2.9539 Min. : 7.50 Min. :0.1191 Min. :0.11909734
## 1st Qu.:-0.2202 1st Qu.: 62.50 1st Qu.:0.1649 1st Qu.:0.16494662
## Median : 0.3651 Median : 87.50 Median :0.1819 Median :0.18190861
## Mean : 0.3333 Mean : 86.27 Mean :0.1873 Mean :0.18725722
## 3rd Qu.: 0.9301 3rd Qu.: 87.50 3rd Qu.:0.1989 3rd Qu.:0.19887059
## Max. : 3.4337 Max. :200.00 Max. :0.2689 Max. :0.26887424
##
## distance weights subclass
## Min. :0.1191 Min. :1 1 : 2
## 1st Qu.:0.1649 1st Qu.:1 2 : 2
## Median :0.1819 Median :1 3 : 2
## Mean :0.1873 Mean :1 4 : 2
## 3rd Qu.:0.1989 3rd Qu.:1 5 : 2
## Max. :0.2689 Max. :1 6 : 2
## (Other):1848
dim(match_psm_lasso1)
## [1] 1860 23
model_psm_lasso11 <-lm(c5r2mtsc_std~catholic, match_psm_lasso1)
model_psm_lasso12 <-lm(c5r2mtsc_std~catholic+p5hmage+w3daded_hsb+w3momed_hsb+w3income, match_psm_lasso1)
summary(model_psm_lasso11)
##
## Call:
## lm(formula = c5r2mtsc_std ~ catholic, data = match_psm_lasso1)
##
## Residuals:
## Min 1Q Median 3Q Max
## -3.4008 -0.5562 0.0366 0.5649 2.9867
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 0.44696 0.02886 15.487 < 2e-16 ***
## catholic -0.22727 0.04082 -5.568 2.95e-08 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.8801 on 1858 degrees of freedom
## Multiple R-squared: 0.01641, Adjusted R-squared: 0.01588
## F-statistic: 31.01 on 1 and 1858 DF, p-value: 2.948e-08
stargazer(model_psm_lasso11, model_psm_lasso12, title="Regression of Test Scores on Catholic Dummy",
covariate.labels = "Catholic",
type="text",
keep.stat=c("n", "rsq"),
column_labels=c("OLS"))
##
## Regression of Test Scores on Catholic Dummy
## =========================================
## Dependent variable:
## ----------------------------
## c5r2mtsc_std
## (1) (2)
## -----------------------------------------
## Catholic -0.227*** -0.229***
## (0.041) (0.039)
##
## p5hmage 0.015***
## (0.004)
##
## w3daded_hsb -0.249***
## (0.048)
##
## w3momed_hsb -0.276***
## (0.052)
##
## w3income 0.00000***
## (0.00000)
##
## Constant 0.447*** -0.290*
## (0.029) (0.172)
##
## -----------------------------------------
## Observations 1,860 1,860
## R2 0.016 0.113
## =========================================
## Note: *p<0.1; **p<0.05; ***p<0.01
##
## Regression of Test Scores on Catholic Dummy
## ===
## OLS
## ---
#distance_lasso <- predict(lasso_model, newx =x)
logit <- glm(catholic~p5hmage+w3daded_hsb+w3momed_hsb+w3momscr+w3income+w3povrty+w3income_1k, data=qn7_9, family= binomial())
summary(logit)
##
## Call:
## glm(formula = catholic ~ p5hmage + w3daded_hsb + w3momed_hsb +
## w3momscr + w3income + w3povrty + w3income_1k, family = binomial(),
## data = qn7_9)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -1.1601 -0.6789 -0.5232 -0.3310 2.7118
##
## Coefficients: (1 not defined because of singularities)
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -3.274e+00 3.157e-01 -10.373 < 2e-16 ***
## p5hmage 3.622e-02 7.150e-03 5.066 4.05e-07 ***
## w3daded_hsb -3.311e-01 9.180e-02 -3.607 0.000310 ***
## w3momed_hsb -3.793e-01 1.004e-01 -3.777 0.000159 ***
## w3momscr 4.789e-03 3.402e-03 1.408 0.159183
## w3income 4.731e-06 9.007e-07 5.253 1.50e-07 ***
## w3povrty -1.213e+00 2.724e-01 -4.452 8.50e-06 ***
## w3income_1k NA NA NA NA
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 4972.4 on 5428 degrees of freedom
## Residual deviance: 4675.6 on 5422 degrees of freedom
## AIC: 4689.6
##
## Number of Fisher Scoring iterations: 6
qn7_9$prop_to_cath_lasso <- predict(logit, type="response", newdata=qn7_9)
common_support <- ggplot(data= qn7_9, title="Probability to go to a Catholic School ",aes(x=prop_to_cath_lasso, group=as.factor(catholic), fill=as.factor(catholic))) +
geom_density(adjust=1.5, alpha=.4) +
theme_ipsum()
common_support
p_score_lasso <- fitted(glm(catholic~p5hmage+w3daded_hsb+w3momed_hsb+w3momscr+w3income+w3povrty, data=qn7_9, family = binomial))
match_prop_lasso <- matchit(catholic~p5hmage+w3daded_hsb+w3momed_hsb+w3momscr+w3income+w3povrty+w3income_1k, data=qn7_9, distance = p_score_lasso)
# # Create your matched dataset by using match.data
summary(match_prop_lasso)
##
## Call:
## matchit(formula = catholic ~ p5hmage + w3daded_hsb + w3momed_hsb +
## w3momscr + w3income + w3povrty + w3income_1k, data = qn7_9,
## distance = p_score_lasso)
##
## Summary of Balance for All Data:
## Means Treated Means Control Std. Mean Diff. Var. Ratio eCDF Mean
## distance 0.2117 0.1630 0.6601 0.7428 0.1736
## p5hmage 39.7753 37.7946 0.4206 0.6679 0.0499
## w3daded_hsb 0.2699 0.4670 -0.4440 . 0.1971
## w3momed_hsb 0.2054 0.3923 -0.4627 . 0.1869
## w3momscr 47.6209 43.9095 0.3159 1.0742 0.0924
## w3income 86180.6253 65393.9285 0.4744 1.0647 0.1011
## w3povrty 0.0161 0.1016 -0.6783 . 0.0854
## w3income_1k 86.1806 65.3939 0.4744 1.0647 0.1011
## eCDF Max
## distance 0.2781
## p5hmage 0.1734
## w3daded_hsb 0.1971
## w3momed_hsb 0.1869
## w3momscr 0.1812
## w3income 0.2478
## w3povrty 0.0854
## w3income_1k 0.2478
##
##
## Summary of Balance for Matched Data:
## Means Treated Means Control Std. Mean Diff. Var. Ratio eCDF Mean
## distance 0.2117 0.2117 0.0003 1.0008 0.0002
## p5hmage 39.7753 39.7548 0.0043 0.9396 0.0043
## w3daded_hsb 0.2699 0.2613 0.0194 . 0.0086
## w3momed_hsb 0.2054 0.1785 0.0665 . 0.0269
## w3momscr 47.6209 46.7763 0.0719 1.0249 0.0206
## w3income 86180.6253 84142.9909 0.0465 0.9969 0.0079
## w3povrty 0.0161 0.0151 0.0085 . 0.0011
## w3income_1k 86.1806 84.1430 0.0465 0.9969 0.0079
## eCDF Max Std. Pair Dist.
## distance 0.0043 0.0009
## p5hmage 0.0118 0.4664
## w3daded_hsb 0.0086 0.2374
## w3momed_hsb 0.0269 0.2529
## w3momscr 0.0430 0.4011
## w3income 0.0301 0.3014
## w3povrty 0.0011 0.0085
## w3income_1k 0.0301 0.3014
##
## Percent Balance Improvement:
## Std. Mean Diff. Var. Ratio eCDF Mean eCDF Max
## distance 100.0 99.7 99.9 98.5
## p5hmage 99.0 84.6 91.4 93.2
## w3daded_hsb 95.6 . 95.6 95.6
## w3momed_hsb 85.6 . 85.6 85.6
## w3momscr 77.2 65.7 77.7 76.3
## w3income 90.2 95.1 92.2 87.8
## w3povrty 98.7 . 98.7 98.7
## w3income_1k 90.2 95.1 92.2 87.8
##
## Sample Sizes:
## Control Treated
## All 4499 930
## Matched 930 930
## Unmatched 3569 0
## Discarded 0 0
match_psm_lasso <-match.data(match_prop_lasso)
summary(match_psm_lasso)
## catholic race_white race_black race_hispanic
## Min. :0.0 Min. :0.0000 Min. :0.00000 Min. :0.0000
## 1st Qu.:0.0 1st Qu.:1.0000 1st Qu.:0.00000 1st Qu.:0.0000
## Median :0.5 Median :1.0000 Median :0.00000 Median :0.0000
## Mean :0.5 Mean :0.7591 Mean :0.04731 Mean :0.1027
## 3rd Qu.:1.0 3rd Qu.:1.0000 3rd Qu.:0.00000 3rd Qu.:0.0000
## Max. :1.0 Max. :1.0000 Max. :1.00000 Max. :1.0000
##
## race_asian p5numpla p5hmage p5hdage
## Min. :0.00000 Min. :1.000 Min. :25.00 Min. :24.00
## 1st Qu.:0.00000 1st Qu.:1.000 1st Qu.:37.00 1st Qu.:38.00
## Median :0.00000 Median :1.000 Median :40.00 Median :42.00
## Mean :0.05108 Mean :1.082 Mean :39.77 Mean :41.83
## 3rd Qu.:0.00000 3rd Qu.:1.000 3rd Qu.:43.00 3rd Qu.:45.00
## Max. :1.00000 Max. :3.000 Max. :62.00 Max. :66.00
##
## w3daded_hsb w3momed_hsb w3momscr w3dadscr
## Min. :0.0000 Min. :0.0000 Min. :29.60 Min. :29.60
## 1st Qu.:0.0000 1st Qu.:0.0000 1st Qu.:38.18 1st Qu.:35.78
## Median :0.0000 Median :0.0000 Median :38.18 Median :39.20
## Mean :0.2656 Mean :0.1919 Mean :47.20 Mean :45.86
## 3rd Qu.:1.0000 3rd Qu.:0.0000 3rd Qu.:59.00 3rd Qu.:53.50
## Max. :1.0000 Max. :1.0000 Max. :77.50 Max. :77.50
##
## w3income w3povrty p5fstamp c5r2mtsc
## Min. : 7500 Min. :0.00000 Min. :0.00000 Min. :20.47
## 1st Qu.: 62501 1st Qu.:0.00000 1st Qu.:0.00000 1st Qu.:48.25
## Median : 87501 Median :0.00000 Median :0.00000 Median :53.94
## Mean : 85162 Mean :0.01559 Mean :0.00914 Mean :53.55
## 3rd Qu.: 87501 3rd Qu.:0.00000 3rd Qu.:0.00000 3rd Qu.:59.54
## Max. :200001 Max. :1.00000 Max. :1.00000 Max. :83.86
##
## c5r2mtsc_std w3income_1k 1 prop_to_cath.s0
## Min. :-3.0919 Min. : 7.50 Min. :0.1168 Min. :0.11675005
## 1st Qu.:-0.2324 1st Qu.: 62.50 1st Qu.:0.1648 1st Qu.:0.16484463
## Median : 0.3536 Median : 87.50 Median :0.1815 Median :0.18152129
## Mean : 0.3138 Mean : 85.16 Mean :0.1869 Mean :0.18686908
## 3rd Qu.: 0.9300 3rd Qu.: 87.50 3rd Qu.:0.1982 3rd Qu.:0.19816898
## Max. : 3.4337 Max. :200.00 Max. :0.2703 Max. :0.27027746
##
## prop_to_cath_lasso distance weights subclass
## Min. :0.02523 Min. :0.02523 Min. :1 1 : 2
## 1st Qu.:0.16063 1st Qu.:0.16063 1st Qu.:1 2 : 2
## Median :0.21078 Median :0.21078 Median :1 3 : 2
## Mean :0.21168 Mean :0.21168 Mean :1 4 : 2
## 3rd Qu.:0.25695 3rd Qu.:0.25695 3rd Qu.:1 5 : 2
## Max. :0.42393 Max. :0.42393 Max. :1 6 : 2
## (Other):1848
dim(match_psm_lasso)
## [1] 1860 24
model_psm_lasso <-lm(c5r2mtsc_std~catholic, match_psm_lasso)
model_psm_lasso1 <-lm(c5r2mtsc_std~catholic+p5hmage+w3daded_hsb+w3momed_hsb+w3momscr+w3income+w3povrty+w3income_1k, match_psm_lasso)
summary(model_psm_lasso1)
##
## Call:
## lm(formula = c5r2mtsc_std ~ catholic + p5hmage + w3daded_hsb +
## w3momed_hsb + w3momscr + w3income + w3povrty + w3income_1k,
## data = match_psm_lasso)
##
## Residuals:
## Min 1Q Median 3Q Max
## -3.5140 -0.5086 0.0407 0.5589 2.8233
##
## Coefficients: (1 not defined because of singularities)
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -3.908e-01 1.838e-01 -2.126 0.03360 *
## catholic -1.896e-01 3.928e-02 -4.826 1.50e-06 ***
## p5hmage 1.115e-02 4.207e-03 2.649 0.00814 **
## w3daded_hsb -2.797e-01 4.890e-02 -5.721 1.23e-08 ***
## w3momed_hsb -2.403e-01 5.525e-02 -4.349 1.45e-05 ***
## w3momscr 4.349e-03 1.832e-03 2.373 0.01773 *
## w3income 3.208e-06 4.879e-07 6.576 6.27e-11 ***
## w3povrty -1.179e-01 1.624e-01 -0.726 0.46785
## w3income_1k NA NA NA NA
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.8453 on 1852 degrees of freedom
## Multiple R-squared: 0.1226, Adjusted R-squared: 0.1193
## F-statistic: 36.98 on 7 and 1852 DF, p-value: < 2.2e-16
summary(model_psm_lasso)
##
## Call:
## lm(formula = c5r2mtsc_std ~ catholic, data = match_psm_lasso)
##
## Residuals:
## Min 1Q Median 3Q Max
## -3.4998 -0.5416 0.0501 0.5952 3.0258
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 0.40784 0.02938 13.880 < 2e-16 ***
## catholic -0.18815 0.04156 -4.528 6.34e-06 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.8961 on 1858 degrees of freedom
## Multiple R-squared: 0.01091, Adjusted R-squared: 0.01038
## F-statistic: 20.5 on 1 and 1858 DF, p-value: 6.337e-06
stargazer(model_psm_lasso, model_psm_lasso1, title="Regression of Test Scores on Catholic Dummy",
covariate.labels = "Catholic",
type="text",
keep.stat=c("n", "rsq"),
column_labels=c("OLS"))
##
## Regression of Test Scores on Catholic Dummy
## =========================================
## Dependent variable:
## ----------------------------
## c5r2mtsc_std
## (1) (2)
## -----------------------------------------
## Catholic -0.188*** -0.190***
## (0.042) (0.039)
##
## p5hmage 0.011***
## (0.004)
##
## w3daded_hsb -0.280***
## (0.049)
##
## w3momed_hsb -0.240***
## (0.055)
##
## w3momscr 0.004**
## (0.002)
##
## w3income 0.00000***
## (0.00000)
##
## w3povrty -0.118
## (0.162)
##
## w3income_1k
##
##
## Constant 0.408*** -0.391**
## (0.029) (0.184)
##
## -----------------------------------------
## Observations 1,860 1,860
## R2 0.011 0.123
## =========================================
## Note: *p<0.1; **p<0.05; ***p<0.01
##
## Regression of Test Scores on Catholic Dummy
## ===
## OLS
## ---