#loading necessary packages
#tinytex::install_tinytex()
library(MatchIt)
library(rvest)
library(ggplot2)
library(tidyverse)
library(XML)
library(xml2)
library(plotrix)
library(haven)
library(stargazer)
library(glmnet)
dataset <- read.csv("ecls.csv", header=T)
##For the purpose of questions 1 - 6, make a smaller dataset which only contains the variables
newdata <- dataset %>% select(c5r2mtsc_std, catholic, w3income_1k, p5numpla, w3momed_hsb)
Do test scores on c5r2mtsc_std differ for pupils of catholic and non-catholic schools on average?
table1 <- newdata %>% # Summary by group using dplyr
group_by(catholic) %>%
summarize(n_students=table(catholic), mean_math = mean(c5r2mtsc_std), STD_E= std.error(c5r2mtsc_std))
table1
## # A tibble: 2 x 4
## catholic n_students mean_math STD_E
## <int> <table> <dbl> <dbl>
## 1 0 4499 0.163 0.0145
## 2 1 930 0.220 0.0281
From the above results, we can see that going to a catholic school lead to a higher math score. It could be possible that catholic schools had less students compared to public schools so they got better educational facilities and had higher math scores
#A regression of just math scores on the catholic dummy
model1 <- lm(c5r2mtsc_std~catholic, data=newdata)
stargazer(model1, type="text", title="Table 1 : Math Scores for Catholic Students",
align=TRUE, dep.var.labels = "Standard Math Score", keep.stat = c("n", "adj.rsq"))
##
## Table 1 : Math Scores for Catholic Students
## ========================================
## Dependent variable:
## ---------------------------
## Standard Math Score
## ----------------------------------------
## catholic 0.057
## (0.034)
##
## Constant 0.163***
## (0.014)
##
## ----------------------------------------
## Observations 5,429
## Adjusted R2 0.0003
## ========================================
## Note: *p<0.1; **p<0.05; ***p<0.01
Students going to catholic schools has 0.057 higher math scores than students going to non-catholic schools but the coefficient is not statistically significant.
#Another regression that includes all the variables listed above
Model2 <- lm(c5r2mtsc_std~catholic+w3income_1k+p5numpla+w3momed_hsb, data=newdata)
stargazer(Model2, type="text", title="Table 2",
align=TRUE, dep.var.labels = "Standard Math Score", covariate.labels=c("Catholic", "Family Income", "Number of places the student has lived for at least 4 months", "Mother's Education"),
keep.stat = c("n", "adj.rsq"))
##
## Table 2
## ========================================================================================
## Dependent variable:
## ---------------------------
## Standard Math Score
## ----------------------------------------------------------------------------------------
## Catholic -0.127***
## (0.033)
##
## Family Income 0.005***
## (0.0003)
##
## Number of places the student has lived for at least 4 months -0.101***
## (0.036)
##
## Mother's Education -0.374***
## (0.027)
##
## Constant 0.073
## (0.049)
##
## ----------------------------------------------------------------------------------------
## Observations 5,429
## Adjusted R2 0.124
## ========================================================================================
## Note: *p<0.1; **p<0.05; ***p<0.01
However, while regressing math score with family income, mother’s education etc, we get different results. Students going to catholic schools has 0.127 lower math scores than students going to non-catholic schools and the coefficient is statistically significant at 1% level of significance.
#Do the means of the covariates (income, mother's education etc) differ between catholic vs. non-catholic schools?
by_catholic <- newdata %>%
group_by(catholic)
by_catholic %>%
summarise_all(list(mean))
## # A tibble: 2 x 5
## catholic c5r2mtsc_std w3income_1k p5numpla w3momed_hsb
## <int> <dbl> <dbl> <dbl> <dbl>
## 1 0 0.163 65.4 1.11 0.392
## 2 1 0.220 86.2 1.07 0.205
#Test the difference in means for statistical significance
t.test(w3income_1k~catholic, data=newdata)
##
## Welch Two Sample t-test
##
## data: w3income_1k by catholic
## t = -13.238, df = 1314.5, p-value < 2.2e-16
## alternative hypothesis: true difference in means between group 0 and group 1 is not equal to 0
## 95 percent confidence interval:
## -23.86719 -17.70620
## sample estimates:
## mean in group 0 mean in group 1
## 65.39393 86.18063
t.test(p5numpla~catholic, data=newdata)
##
## Welch Two Sample t-test
##
## data: p5numpla by catholic
## t = 3.128, df = 1600.4, p-value = 0.001792
## alternative hypothesis: true difference in means between group 0 and group 1 is not equal to 0
## 95 percent confidence interval:
## 0.01235472 0.05390038
## sample estimates:
## mean in group 0 mean in group 1
## 1.106246 1.073118
t.test(w3momed_hsb~catholic, data=newdata)
##
## Welch Two Sample t-test
##
## data: w3momed_hsb by catholic
## t = 12.362, df = 1545.1, p-value < 2.2e-16
## alternative hypothesis: true difference in means between group 0 and group 1 is not equal to 0
## 95 percent confidence interval:
## 0.1572715 0.2165946
## sample estimates:
## mean in group 0 mean in group 1
## 0.3923094 0.2053763
The means of the covariates (income, mother’s education etc) differ between catholic vs. non-catholic schools significantly.
#Run a logit regression to predict catholic based on the covariates we kept (but not the math scores, of course)
model3 <- glm(formula = catholic ~ w3income_1k + p5numpla + w3momed_hsb,
family = binomial(), data = newdata)
summary(model3)
##
## Call:
## glm(formula = catholic ~ w3income_1k + p5numpla + w3momed_hsb,
## family = binomial(), data = newdata)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -1.0172 -0.6461 -0.5240 -0.4122 2.3672
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -1.6702950 0.1576164 -10.597 < 2e-16 ***
## w3income_1k 0.0077921 0.0008115 9.602 < 2e-16 ***
## p5numpla -0.2774346 0.1232930 -2.250 0.0244 *
## w3momed_hsb -0.6504757 0.0915710 -7.104 1.22e-12 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 4972.4 on 5428 degrees of freedom
## Residual deviance: 4750.6 on 5425 degrees of freedom
## AIC: 4758.6
##
## Number of Fisher Scoring iterations: 4
#Make a prediction for who is likely to go to catholic school, using this model
precicted_score = data.frame(pr_score = predict(model3, type="response"))
predict_data <- cbind(newdata, precicted_score)
head(predict_data)
## c5r2mtsc_std catholic w3income_1k p5numpla w3momed_hsb pr_score
## 1 0.9817533 0 62.5005 1 0 0.1883576
## 2 0.5943775 0 45.0005 1 0 0.1683901
## 3 0.4906106 0 62.5005 1 0 0.1883576
## 4 1.4512779 1 87.5005 1 0 0.2199574
## 5 2.5956991 0 150.0005 1 0 0.3145556
## 6 0.3851966 0 62.5005 1 0 0.1883576
#Check that there is common support
#plot 1: fitted plot
g <- ggplot(predict_data, aes(x=pr_score, y=w3income_1k, color=as.factor(catholic))) + geom_point()+
geom_smooth(aes(y = w3income_1k), size = 1)
g
#plot2: histogram
names <- c(`1`="Actual school type attended: Catholic", `0`="Actual school type attended: Public")
predict_data %>%
ggplot(aes(x = pr_score)) +
geom_histogram(colour = "white", fill = "black") +
facet_wrap(~catholic, ncol = 2, labeller = as_labeller(names))+xlab("Probability of going to catholic school")+
theme(strip.background = element_rect(fill = "gray80", colour = "black",
size = 0.5, linetype = "solid"),
strip.text = element_text(face = "bold"))
#plot 3: density graph
ggplot(predict_data, aes(pr_score, fill = as.factor(catholic), color =as.factor(catholic))) +
geom_density(alpha = 0.4)
From all the above graph, we can see that both catholic and non-catholic students have groups have predicted values in overlapping ranges.
psm <- matchit(catholic ~ w3income_1k + p5numpla + w3momed_hsb, data = predict_data)
psm_data <- match.data(psm)
dim(psm_data)
## [1] 1860 9
#Sort the data by distance, and show the 10 first observations
sort_psm_data <- psm_data[order(psm_data$distance), ]
head(sort_psm_data, 10)
## c5r2mtsc_std catholic w3income_1k p5numpla w3momed_hsb pr_score
## 176 -0.3835843 0 17.5005 2 1 0.06069529
## 4274 -2.5922340 1 17.5005 2 1 0.06069529
## 41 -2.3690526 0 22.5005 2 1 0.06295488
## 2083 0.5271555 1 22.5005 2 1 0.06295488
## 561 -0.2195955 0 27.5005 2 1 0.06529274
## 2304 -1.6878765 1 27.5005 2 1 0.06529274
## 178 -0.2550080 0 32.5005 2 1 0.06771114
## 716 -1.2722942 1 32.5005 2 1 0.06771114
## 288 -0.7403859 0 37.5005 2 1 0.07021240
## 485 -0.7841369 1 37.5005 2 1 0.07021240
## distance weights subclass
## 176 0.06069529 1 598
## 4274 0.06069529 1 598
## 41 0.06295488 1 222
## 2083 0.06295488 1 222
## 561 0.06529274 1 272
## 2304 0.06529274 1 272
## 178 0.06771114 1 859
## 716 0.06771114 1 859
## 288 0.07021240 1 728
## 485 0.07021240 1 728
The number of observations that matched are the students who are likely to go catholic school.
#In this new dataset, do the means of the covariates (income, etc) differ between catholic vs. non-catholic schools?
by_catholic_psm <- sort_psm_data %>%
group_by(catholic)
by_catholic_psm %>%
summarise_all(list(mean))
## # A tibble: 2 x 9
## catholic c5r2mtsc_std w3income_1k p5numpla w3momed_hsb pr_score distance
## <int> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 0 0.335 86.2 1.07 0.205 0.203 0.203
## 2 1 0.220 86.2 1.07 0.205 0.203 0.203
## # ... with 2 more variables: weights <dbl>, subclass <dbl>
t.test(w3income_1k~catholic, data=sort_psm_data)
##
## Welch Two Sample t-test
##
## data: w3income_1k by catholic
## t = 0, df = 1858, p-value = 1
## alternative hypothesis: true difference in means between group 0 and group 1 is not equal to 0
## 95 percent confidence interval:
## -3.985565 3.985565
## sample estimates:
## mean in group 0 mean in group 1
## 86.18063 86.18063
t.test(p5numpla~catholic, data=sort_psm_data)
##
## Welch Two Sample t-test
##
## data: p5numpla by catholic
## t = 0, df = 1858, p-value = 1
## alternative hypothesis: true difference in means between group 0 and group 1 is not equal to 0
## 95 percent confidence interval:
## -0.02550007 0.02550007
## sample estimates:
## mean in group 0 mean in group 1
## 1.073118 1.073118
t.test(w3momed_hsb~catholic, data=sort_psm_data)
##
## Welch Two Sample t-test
##
## data: w3momed_hsb by catholic
## t = 0, df = 1858, p-value = 1
## alternative hypothesis: true difference in means between group 0 and group 1 is not equal to 0
## 95 percent confidence interval:
## -0.03676158 0.03676158
## sample estimates:
## mean in group 0 mean in group 1
## 0.2053763 0.2053763
#Visually check the balance in the new dataset:
g1 <- ggplot(sort_psm_data, aes(x=pr_score, y=w3income_1k, color=as.factor(catholic))) + geom_point()+
geom_smooth(aes(y = w3income_1k), size = 1)
g1
From the above graph, the means are nearly identical at each level of the pscore distribution.
#Compare means of the two groups
table2 <- sort_psm_data %>% # Summary by group using dplyr
group_by(catholic) %>%
summarize(n_students=table(catholic), mean_math = mean(c5r2mtsc_std), STD_E= std.error(c5r2mtsc_std))
table2
## # A tibble: 2 x 4
## catholic n_students mean_math STD_E
## <int> <table> <dbl> <dbl>
## 1 0 930 0.335 0.0298
## 2 1 930 0.220 0.0281
The mean math score is different between two groups
#Run regressions:
#- A regression of just scores on the catholic dummy
model_psm1 <- lm(c5r2mtsc_std~catholic, data=sort_psm_data)
stargazer(model_psm1, type="text", title="Table 3 : Math Scores for Catholic Students",
align=TRUE, dep.var.labels = "Standard Math Score", keep.stat = c("n", "adj.rsq"))
##
## Table 3 : Math Scores for Catholic Students
## ========================================
## Dependent variable:
## ---------------------------
## Standard Math Score
## ----------------------------------------
## catholic -0.115***
## (0.041)
##
## Constant 0.335***
## (0.029)
##
## ----------------------------------------
## Observations 1,860
## Adjusted R2 0.004
## ========================================
## Note: *p<0.1; **p<0.05; ***p<0.01
#- Another regression that includes all the variables
Model_psm2 <- lm(c5r2mtsc_std~catholic+w3income_1k+p5numpla+w3momed_hsb, data=sort_psm_data)
stargazer(Model_psm2, type="text", title="Table 4",
align=TRUE, dep.var.labels = "Standard Math Score", covariate.labels=c("Catholic", "Family Income", "Number of places the student has lived for at least 4 months", "Mother's Education"),
keep.stat = c("n", "adj.rsq"))
##
## Table 4
## ========================================================================================
## Dependent variable:
## ---------------------------
## Standard Math Score
## ----------------------------------------------------------------------------------------
## Catholic -0.115***
## (0.039)
##
## Family Income 0.004***
## (0.0005)
##
## Number of places the student has lived for at least 4 months -0.091
## (0.070)
##
## Mother's Education -0.366***
## (0.050)
##
## Constant 0.151*
## (0.091)
##
## ----------------------------------------------------------------------------------------
## Observations 1,860
## Adjusted R2 0.086
## ========================================================================================
## Note: *p<0.1; **p<0.05; ***p<0.01
The math score for students going to catholic school is 0.115 point lower than students from non-catholic students which is significant at 1% level of significance level in both simple and multiple regressions.
plot(sort_psm_data$distance, sort_psm_data$pr_score, xlab = "Distance from matchit command", ylab = "Predicted math score")
lm(pr_score~distance, data=sort_psm_data)
##
## Call:
## lm(formula = pr_score ~ distance, data = sort_psm_data)
##
## Coefficients:
## (Intercept) distance
## 2.718e-15 1.000e+00
From the graph and the slope= 1, we can verify that Verify that the logit predicted values are the same as the distance used by the matchit command.
# Run your same matchit command but with distance= your logit predictions. See that this is the same.
logit_pred <- fitted(model3)
psm2 <- matchit(catholic ~ w3income_1k + p5numpla + w3momed_hsb, data=predict_data, family=binomial(), distance= logit_pred)
psm2_data <- match.data(psm2)
head(psm2_data)
## c5r2mtsc_std catholic w3income_1k p5numpla w3momed_hsb pr_score distance
## 1 0.9817533 0 62.5005 1 0 0.1883576 0.1883576
## 2 0.5943775 0 45.0005 1 0 0.1683901 0.1683901
## 3 0.4906106 0 62.5005 1 0 0.1883576 0.1883576
## 4 1.4512779 1 87.5005 1 0 0.2199574 0.2199574
## 5 2.5956991 0 150.0005 1 0 0.3145556 0.3145556
## 6 0.3851966 0 62.5005 1 0 0.1883576 0.1883576
## weights subclass
## 1 1 348
## 2 1 319
## 3 1 24
## 4 1 550
## 5 1 338
## 6 1 25
The logit prediction values matched with the distance variable in matchit command.
#only the numeric variable
lassodata <- dataset %>% select(race_white,race_black, race_hispanic, race_asian, p5numpla, p5hmage,
p5hdage, w3daded_hsb, w3momed_hsb,
w3momscr, w3dadscr, w3income, w3povrty, p5fstamp, c5r2mtsc, w3income_1k, c5r2mtsc_std)
#converting to a matrix
x <- data.matrix(lassodata)
# standard math score as response
dfy <- dataset %>% select (catholic)
y <- data.matrix(dfy)
dim(lassodata)
## [1] 5429 17
dim(dfy)
## [1] 5429 1
n = 5429 #observation number
cvlasso <- cv.glmnet(x, y,
family="binomial",
alpha=1,
lambda= (seq(0,100, 1))/n)
plot(cvlasso, cex = 0.8, cex.axis=0.8, cex.lab=0.8)
coef(cvlasso)
## 18 x 1 sparse Matrix of class "dgCMatrix"
## s1
## (Intercept) -2.550723e+00
## race_white 2.589038e-02
## race_black .
## race_hispanic .
## race_asian .
## p5numpla .
## p5hmage 1.986092e-02
## p5hdage .
## w3daded_hsb -2.161572e-01
## w3momed_hsb -2.387878e-01
## w3momscr 7.178855e-04
## w3dadscr .
## w3income 4.513822e-06
## w3povrty -2.352962e-01
## p5fstamp .
## c5r2mtsc .
## w3income_1k 1.384174e-05
## c5r2mtsc_std .
Comment on the results – If you used lambda.1se as your regularization parameter, how many variables would you have chosen to include? answer: 8 variables – Which variables would you have chosen to run the PSM command? answer: the selected variables are:race_white, p5hmage, w3daded_hsb, w3momed_hsb, w3income, w3momscr, w3povrty, w3income_1k – Do these variables differ from those you used before? yes, some of the variables are not used before to do the psm analysis (p5hmage, w3daded_hsb, w3income etc.) and p5numpla and w3income_1k were used in previous analysis which is not in the list of selected variables by lasso analysis
# Generate a prediction from the “best lasso” model
#Run a logit regression to predict catholic based on the covariates we kept (but not the math scores, of course)
model4 <- glm(formula = catholic ~ race_white+p5hmage+w3daded_hsb+w3momed_hsb+w3momscr+w3income+w3povrty+w3income_1k,
family = binomial(), data = dataset)
summary(model4)
##
## Call:
## glm(formula = catholic ~ race_white + p5hmage + w3daded_hsb +
## w3momed_hsb + w3momscr + w3income + w3povrty + w3income_1k,
## family = binomial(), data = dataset)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -1.1679 -0.6811 -0.5229 -0.3306 2.7350
##
## Coefficients: (1 not defined because of singularities)
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -3.407e+00 3.209e-01 -10.617 < 2e-16 ***
## race_white 2.332e-01 8.767e-02 2.660 0.007815 **
## p5hmage 3.620e-02 7.180e-03 5.042 4.60e-07 ***
## w3daded_hsb -3.338e-01 9.187e-02 -3.634 0.000279 ***
## w3momed_hsb -3.702e-01 1.005e-01 -3.682 0.000231 ***
## w3momscr 4.395e-03 3.406e-03 1.290 0.196939
## w3income 4.447e-06 9.093e-07 4.890 1.01e-06 ***
## w3povrty -1.132e+00 2.741e-01 -4.130 3.63e-05 ***
## w3income_1k NA NA NA NA
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 4972.4 on 5428 degrees of freedom
## Residual deviance: 4668.4 on 5421 degrees of freedom
## AIC: 4684.4
##
## Number of Fisher Scoring iterations: 6
pred_score1 = data.frame(pr_score1 = predict(model4, type="response"))
predict_data1 <- cbind(dataset, pred_score1)
head(predict_data1)
## childid catholic race race_white race_black race_hispanic
## 1 0001002C 0 WHITE, NON-HISPANIC 1 0 0
## 2 0001004C 0 WHITE, NON-HISPANIC 1 0 0
## 3 0001010C 0 WHITE, NON-HISPANIC 1 0 0
## 4 0001011C 1 WHITE, NON-HISPANIC 1 0 0
## 5 0001012C 0 WHITE, NON-HISPANIC 1 0 0
## 6 0002004C 0 WHITE, NON-HISPANIC 1 0 0
## race_asian p5numpla p5hmage p5hdage w3daded
## 1 0 1 47 45 DOCTORATE OR PROFESSIONAL DEGREE
## 2 0 1 41 48 BACHELOR'S DEGREE
## 3 0 1 43 55 MASTER'S DEGREE (MA, MS)
## 4 0 1 38 39 BACHELOR'S DEGREE
## 5 0 1 47 57 DOCTORATE OR PROFESSIONAL DEGREE
## 6 0 1 41 41 BACHELOR'S DEGREE
## w3momed w3daded_hsb w3momed_hsb w3momscr
## 1 SOME COLLEGE 0 0 53.50
## 2 GRADUATE/PROFESSIONAL SCHOOL-NO DEGREE 0 0 34.95
## 3 GRADUATE/PROFESSIONAL SCHOOL-NO DEGREE 0 0 63.43
## 4 GRADUATE/PROFESSIONAL SCHOOL-NO DEGREE 0 0 53.50
## 5 MASTER'S DEGREE (MA, MS) 0 0 61.56
## 6 BACHELOR'S DEGREE 0 0 38.18
## w3dadscr w3inccat w3income w3povrty p5fstamp c5r2mtsc
## 1 77.5 $50,001 TO $75,000 62500.5 0 0 60.043
## 2 53.5 $40,001 TO $50,000 45000.5 0 0 56.280
## 3 53.5 $50,001 TO $75,000 62500.5 0 0 55.272
## 4 53.5 $75,001 TO $100,000 87500.5 0 0 64.604
## 5 77.5 $100,001 TO $200,000 150000.5 0 0 75.721
## 6 53.5 $50,001 TO $75,000 62500.5 0 0 54.248
## c5r2mtsc_std w3income_1k pr_score1
## 1 0.9817533 62.5005 0.2770983
## 2 0.5943775 45.0005 0.2082585
## 3 0.4906106 62.5005 0.2572963
## 4 1.4512779 87.5005 0.2362127
## 5 2.5956991 150.0005 0.3694909
## 6 0.3851966 62.5005 0.2238370
new_predict <- fitted(model4)
psm3 <- matchit(catholic ~w3income_1k + w3momed_hsb + p5hmage + w3daded_hsb+w3income, data=predict_data1, family=binomial(), distance= new_predict)
psm3_data <- match.data(psm3)
head(psm3_data)
## childid catholic race race_white race_black race_hispanic
## 1 0001002C 0 WHITE, NON-HISPANIC 1 0 0
## 2 0001004C 0 WHITE, NON-HISPANIC 1 0 0
## 3 0001010C 0 WHITE, NON-HISPANIC 1 0 0
## 4 0001011C 1 WHITE, NON-HISPANIC 1 0 0
## 6 0002004C 0 WHITE, NON-HISPANIC 1 0 0
## 17 0006003C 0 WHITE, NON-HISPANIC 1 0 0
## race_asian p5numpla p5hmage p5hdage w3daded
## 1 0 1 47 45 DOCTORATE OR PROFESSIONAL DEGREE
## 2 0 1 41 48 BACHELOR'S DEGREE
## 3 0 1 43 55 MASTER'S DEGREE (MA, MS)
## 4 0 1 38 39 BACHELOR'S DEGREE
## 6 0 1 41 41 BACHELOR'S DEGREE
## 17 0 1 39 39 BACHELOR'S DEGREE
## w3momed w3daded_hsb w3momed_hsb w3momscr
## 1 SOME COLLEGE 0 0 53.50
## 2 GRADUATE/PROFESSIONAL SCHOOL-NO DEGREE 0 0 34.95
## 3 GRADUATE/PROFESSIONAL SCHOOL-NO DEGREE 0 0 63.43
## 4 GRADUATE/PROFESSIONAL SCHOOL-NO DEGREE 0 0 53.50
## 6 BACHELOR'S DEGREE 0 0 38.18
## 17 BACHELOR'S DEGREE 0 0 53.50
## w3dadscr w3inccat w3income w3povrty p5fstamp c5r2mtsc
## 1 77.5 $50,001 TO $75,000 62500.5 0 0 60.043
## 2 53.5 $40,001 TO $50,000 45000.5 0 0 56.280
## 3 53.5 $50,001 TO $75,000 62500.5 0 0 55.272
## 4 53.5 $75,001 TO $100,000 87500.5 0 0 64.604
## 6 53.5 $50,001 TO $75,000 62500.5 0 0 54.248
## 17 53.5 $75,001 TO $100,000 87500.5 0 0 54.404
## c5r2mtsc_std w3income_1k pr_score1 distance weights subclass
## 1 0.9817533 62.5005 0.2770983 0.2770983 1 408
## 2 0.5943775 45.0005 0.2082585 0.2082585 1 111
## 3 0.4906106 62.5005 0.2572963 0.2572963 1 355
## 4 1.4512779 87.5005 0.2362127 0.2362127 1 550
## 6 0.3851966 62.5005 0.2238370 0.2238370 1 923
## 17 0.4012558 87.5005 0.2428065 0.2428065 1 554
The predicted score from best lasso model matched with the distance variable from matchit psm3 dataset
Model_psm3 <- lm(c5r2mtsc_std~catholic+w3income_1k+p5numpla+w3momed_hsb, data=psm3_data)
stargazer(Model_psm3, type="text", title="Table 5",
align=TRUE, dep.var.labels = "Standard Math Score", covariate.labels=c("Catholic", "Family Income", "Number of places the student has lived for at least 4 months", "Mother's Education"),
keep.stat = c("n", "adj.rsq"))
##
## Table 5
## ========================================================================================
## Dependent variable:
## ---------------------------
## Standard Math Score
## ----------------------------------------------------------------------------------------
## Catholic -0.217***
## (0.040)
##
## Family Income 0.004***
## (0.0005)
##
## Number of places the student has lived for at least 4 months -0.041
## (0.069)
##
## Mother's Education -0.315***
## (0.051)
##
## Constant 0.198**
## (0.091)
##
## ----------------------------------------------------------------------------------------
## Observations 1,860
## Adjusted R2 0.081
## ========================================================================================
## Note: *p<0.1; **p<0.05; ***p<0.01
After Lasso Matching, the math core for students going to catholic school is lower by 0.217 points than non-catholic school students at 1% level of significance
# Original Matching
plot(sort_psm_data$distance, sort_psm_data$pr_score, type = "b", frame = FALSE, pch = 19,
col = "red", xlab = "x", ylab = "y")
# Lasso Matching
lines(psm3_data$distance, psm3_data$pr_score1, pch = 18, col = "blue", type = "b", lty = 2)
# Add a legend to the plot
legend("topleft", legend=c("Original Matching", "Lasso Matching"),
col=c("red", "blue"), lty = 1:2, cex=0.8)
From the above graph, we can see that lasso matching is not different than original matching.