#loading necessary packages 
#tinytex::install_tinytex()

library(MatchIt)
library(rvest)
library(ggplot2)
library(tidyverse)
library(XML)
library(xml2)
library(plotrix)
library(haven)
library(stargazer)
library(glmnet)

Part A: Understanding and running PSM

1. Download the data

dataset <- read.csv("ecls.csv", header=T)

##For the purpose of questions 1 - 6, make a smaller dataset which only contains the variables

newdata <- dataset %>% select(c5r2mtsc_std, catholic, w3income_1k, p5numpla, w3momed_hsb)

2. Identify the problem

Do test scores on c5r2mtsc_std differ for pupils of catholic and non-catholic schools on average?

table1 <- newdata %>%                               # Summary by group using dplyr
  group_by(catholic) %>% 
  summarize(n_students=table(catholic), mean_math = mean(c5r2mtsc_std), STD_E= std.error(c5r2mtsc_std))

table1
## # A tibble: 2 x 4
##   catholic n_students mean_math  STD_E
##      <int> <table>        <dbl>  <dbl>
## 1        0 4499           0.163 0.0145
## 2        1  930           0.220 0.0281

From the above results, we can see that going to a catholic school lead to a higher math score. It could be possible that catholic schools had less students compared to public schools so they got better educational facilities and had higher math scores

#A regression of just math scores on the catholic dummy
model1 <- lm(c5r2mtsc_std~catholic, data=newdata)

stargazer(model1, type="text", title="Table 1 : Math Scores for Catholic Students", 
          align=TRUE, dep.var.labels = "Standard Math Score", keep.stat = c("n", "adj.rsq"))
## 
## Table 1 : Math Scores for Catholic Students
## ========================================
##                  Dependent variable:    
##              ---------------------------
##                  Standard Math Score    
## ----------------------------------------
## catholic                0.057           
##                        (0.034)          
##                                         
## Constant              0.163***          
##                        (0.014)          
##                                         
## ----------------------------------------
## Observations            5,429           
## Adjusted R2            0.0003           
## ========================================
## Note:        *p<0.1; **p<0.05; ***p<0.01

Students going to catholic schools has 0.057 higher math scores than students going to non-catholic schools but the coefficient is not statistically significant.

#Another regression that includes all the variables listed above
Model2 <- lm(c5r2mtsc_std~catholic+w3income_1k+p5numpla+w3momed_hsb, data=newdata)

stargazer(Model2, type="text", title="Table 2", 
          align=TRUE, dep.var.labels = "Standard Math Score",  covariate.labels=c("Catholic", "Family Income", "Number of places the student has lived for at least 4 months", "Mother's Education"),
          keep.stat = c("n", "adj.rsq"))
## 
## Table 2
## ========================================================================================
##                                                                  Dependent variable:    
##                                                              ---------------------------
##                                                                  Standard Math Score    
## ----------------------------------------------------------------------------------------
## Catholic                                                              -0.127***         
##                                                                        (0.033)          
##                                                                                         
## Family Income                                                         0.005***          
##                                                                       (0.0003)          
##                                                                                         
## Number of places the student has lived for at least 4 months          -0.101***         
##                                                                        (0.036)          
##                                                                                         
## Mother's Education                                                    -0.374***         
##                                                                        (0.027)          
##                                                                                         
## Constant                                                                0.073           
##                                                                        (0.049)          
##                                                                                         
## ----------------------------------------------------------------------------------------
## Observations                                                            5,429           
## Adjusted R2                                                             0.124           
## ========================================================================================
## Note:                                                        *p<0.1; **p<0.05; ***p<0.01

However, while regressing math score with family income, mother’s education etc, we get different results. Students going to catholic schools has 0.127 lower math scores than students going to non-catholic schools and the coefficient is statistically significant at 1% level of significance.

#Do the means of the covariates (income, mother's education etc) differ between catholic vs. non-catholic schools?

by_catholic <- newdata %>% 
               group_by(catholic)

by_catholic %>%
  summarise_all(list(mean))
## # A tibble: 2 x 5
##   catholic c5r2mtsc_std w3income_1k p5numpla w3momed_hsb
##      <int>        <dbl>       <dbl>    <dbl>       <dbl>
## 1        0        0.163        65.4     1.11       0.392
## 2        1        0.220        86.2     1.07       0.205
#Test the difference in means for statistical significance
t.test(w3income_1k~catholic, data=newdata)
## 
##  Welch Two Sample t-test
## 
## data:  w3income_1k by catholic
## t = -13.238, df = 1314.5, p-value < 2.2e-16
## alternative hypothesis: true difference in means between group 0 and group 1 is not equal to 0
## 95 percent confidence interval:
##  -23.86719 -17.70620
## sample estimates:
## mean in group 0 mean in group 1 
##        65.39393        86.18063
t.test(p5numpla~catholic, data=newdata)
## 
##  Welch Two Sample t-test
## 
## data:  p5numpla by catholic
## t = 3.128, df = 1600.4, p-value = 0.001792
## alternative hypothesis: true difference in means between group 0 and group 1 is not equal to 0
## 95 percent confidence interval:
##  0.01235472 0.05390038
## sample estimates:
## mean in group 0 mean in group 1 
##        1.106246        1.073118
t.test(w3momed_hsb~catholic, data=newdata)
## 
##  Welch Two Sample t-test
## 
## data:  w3momed_hsb by catholic
## t = 12.362, df = 1545.1, p-value < 2.2e-16
## alternative hypothesis: true difference in means between group 0 and group 1 is not equal to 0
## 95 percent confidence interval:
##  0.1572715 0.2165946
## sample estimates:
## mean in group 0 mean in group 1 
##       0.3923094       0.2053763

The means of the covariates (income, mother’s education etc) differ between catholic vs. non-catholic schools significantly.

3. Estimate the propensity to go to catholic school, by hand

#Run a logit regression to predict catholic based on the covariates we kept (but not the math scores, of course)
model3 <- glm(formula = catholic ~ w3income_1k + p5numpla + w3momed_hsb, 
    family = binomial(), data = newdata)

summary(model3)
## 
## Call:
## glm(formula = catholic ~ w3income_1k + p5numpla + w3momed_hsb, 
##     family = binomial(), data = newdata)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -1.0172  -0.6461  -0.5240  -0.4122   2.3672  
## 
## Coefficients:
##               Estimate Std. Error z value Pr(>|z|)    
## (Intercept) -1.6702950  0.1576164 -10.597  < 2e-16 ***
## w3income_1k  0.0077921  0.0008115   9.602  < 2e-16 ***
## p5numpla    -0.2774346  0.1232930  -2.250   0.0244 *  
## w3momed_hsb -0.6504757  0.0915710  -7.104 1.22e-12 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 4972.4  on 5428  degrees of freedom
## Residual deviance: 4750.6  on 5425  degrees of freedom
## AIC: 4758.6
## 
## Number of Fisher Scoring iterations: 4
#Make a prediction for who is likely to go to catholic school, using this model
precicted_score = data.frame(pr_score = predict(model3, type="response"))

predict_data <- cbind(newdata, precicted_score)

head(predict_data)
##   c5r2mtsc_std catholic w3income_1k p5numpla w3momed_hsb  pr_score
## 1    0.9817533        0     62.5005        1           0 0.1883576
## 2    0.5943775        0     45.0005        1           0 0.1683901
## 3    0.4906106        0     62.5005        1           0 0.1883576
## 4    1.4512779        1     87.5005        1           0 0.2199574
## 5    2.5956991        0    150.0005        1           0 0.3145556
## 6    0.3851966        0     62.5005        1           0 0.1883576
#Check that there is common support
#plot 1: fitted plot 
g <- ggplot(predict_data, aes(x=pr_score,  y=w3income_1k, color=as.factor(catholic))) + geom_point()+
  geom_smooth(aes(y = w3income_1k), size = 1)
g

#plot2: histogram

names <- c(`1`="Actual school type attended: Catholic", `0`="Actual school type attended: Public")


predict_data %>%
ggplot(aes(x = pr_score)) +
  geom_histogram(colour = "white", fill = "black") +
  facet_wrap(~catholic, ncol = 2, labeller = as_labeller(names))+xlab("Probability of going to catholic school")+
  
  
  theme(strip.background = element_rect(fill = "gray80", colour = "black",
                                        size = 0.5, linetype = "solid"),
        strip.text = element_text(face = "bold"))

#plot 3: density graph

ggplot(predict_data, aes(pr_score, fill = as.factor(catholic), color =as.factor(catholic))) +
  geom_density(alpha = 0.4)

From all the above graph, we can see that both catholic and non-catholic students have groups have predicted values in overlapping ranges.

4. Run a Matching algorithm to get a balanced dataset

psm <-  matchit(catholic ~ w3income_1k + p5numpla + w3momed_hsb,  data = predict_data)

psm_data <- match.data(psm)

dim(psm_data)
## [1] 1860    9
#Sort the data by distance, and show the 10 first observations

sort_psm_data <- psm_data[order(psm_data$distance), ]

head(sort_psm_data, 10)
##      c5r2mtsc_std catholic w3income_1k p5numpla w3momed_hsb   pr_score
## 176    -0.3835843        0     17.5005        2           1 0.06069529
## 4274   -2.5922340        1     17.5005        2           1 0.06069529
## 41     -2.3690526        0     22.5005        2           1 0.06295488
## 2083    0.5271555        1     22.5005        2           1 0.06295488
## 561    -0.2195955        0     27.5005        2           1 0.06529274
## 2304   -1.6878765        1     27.5005        2           1 0.06529274
## 178    -0.2550080        0     32.5005        2           1 0.06771114
## 716    -1.2722942        1     32.5005        2           1 0.06771114
## 288    -0.7403859        0     37.5005        2           1 0.07021240
## 485    -0.7841369        1     37.5005        2           1 0.07021240
##        distance weights subclass
## 176  0.06069529       1      598
## 4274 0.06069529       1      598
## 41   0.06295488       1      222
## 2083 0.06295488       1      222
## 561  0.06529274       1      272
## 2304 0.06529274       1      272
## 178  0.06771114       1      859
## 716  0.06771114       1      859
## 288  0.07021240       1      728
## 485  0.07021240       1      728

The number of observations that matched are the students who are likely to go catholic school.

#In this new dataset, do the means of the covariates (income, etc) differ between catholic vs. non-catholic schools?

by_catholic_psm <- sort_psm_data %>% 
  group_by(catholic)

by_catholic_psm %>%
  summarise_all(list(mean))
## # A tibble: 2 x 9
##   catholic c5r2mtsc_std w3income_1k p5numpla w3momed_hsb pr_score distance
##      <int>        <dbl>       <dbl>    <dbl>       <dbl>    <dbl>    <dbl>
## 1        0        0.335        86.2     1.07       0.205    0.203    0.203
## 2        1        0.220        86.2     1.07       0.205    0.203    0.203
## # ... with 2 more variables: weights <dbl>, subclass <dbl>
t.test(w3income_1k~catholic, data=sort_psm_data)
## 
##  Welch Two Sample t-test
## 
## data:  w3income_1k by catholic
## t = 0, df = 1858, p-value = 1
## alternative hypothesis: true difference in means between group 0 and group 1 is not equal to 0
## 95 percent confidence interval:
##  -3.985565  3.985565
## sample estimates:
## mean in group 0 mean in group 1 
##        86.18063        86.18063
t.test(p5numpla~catholic, data=sort_psm_data)
## 
##  Welch Two Sample t-test
## 
## data:  p5numpla by catholic
## t = 0, df = 1858, p-value = 1
## alternative hypothesis: true difference in means between group 0 and group 1 is not equal to 0
## 95 percent confidence interval:
##  -0.02550007  0.02550007
## sample estimates:
## mean in group 0 mean in group 1 
##        1.073118        1.073118
t.test(w3momed_hsb~catholic, data=sort_psm_data)
## 
##  Welch Two Sample t-test
## 
## data:  w3momed_hsb by catholic
## t = 0, df = 1858, p-value = 1
## alternative hypothesis: true difference in means between group 0 and group 1 is not equal to 0
## 95 percent confidence interval:
##  -0.03676158  0.03676158
## sample estimates:
## mean in group 0 mean in group 1 
##       0.2053763       0.2053763
#Visually check the balance in the new dataset:

g1 <- ggplot(sort_psm_data, aes(x=pr_score,  y=w3income_1k, color=as.factor(catholic))) + geom_point()+
  geom_smooth(aes(y = w3income_1k), size = 1)
g1

From the above graph, the means are nearly identical at each level of the pscore distribution.

6. Compute the average treatment effect in your new sample

#Compare means of the two groups

table2 <- sort_psm_data %>%                               # Summary by group using dplyr
  group_by(catholic) %>% 
  summarize(n_students=table(catholic), mean_math = mean(c5r2mtsc_std), STD_E= std.error(c5r2mtsc_std)) 
table2
## # A tibble: 2 x 4
##   catholic n_students mean_math  STD_E
##      <int> <table>        <dbl>  <dbl>
## 1        0 930            0.335 0.0298
## 2        1 930            0.220 0.0281

The mean math score is different between two groups

#Run regressions:
#- A regression of just scores on the catholic dummy

model_psm1 <- lm(c5r2mtsc_std~catholic, data=sort_psm_data)

stargazer(model_psm1, type="text", title="Table 3 : Math Scores for Catholic Students", 
          align=TRUE, dep.var.labels = "Standard Math Score", keep.stat = c("n", "adj.rsq"))
## 
## Table 3 : Math Scores for Catholic Students
## ========================================
##                  Dependent variable:    
##              ---------------------------
##                  Standard Math Score    
## ----------------------------------------
## catholic              -0.115***         
##                        (0.041)          
##                                         
## Constant              0.335***          
##                        (0.029)          
##                                         
## ----------------------------------------
## Observations            1,860           
## Adjusted R2             0.004           
## ========================================
## Note:        *p<0.1; **p<0.05; ***p<0.01
#- Another regression that includes all the variables

Model_psm2 <- lm(c5r2mtsc_std~catholic+w3income_1k+p5numpla+w3momed_hsb, data=sort_psm_data)

stargazer(Model_psm2, type="text", title="Table 4", 
          align=TRUE, dep.var.labels = "Standard Math Score",  covariate.labels=c("Catholic", "Family Income", "Number of places the student has lived for at least 4 months", "Mother's Education"),
          keep.stat = c("n", "adj.rsq"))
## 
## Table 4
## ========================================================================================
##                                                                  Dependent variable:    
##                                                              ---------------------------
##                                                                  Standard Math Score    
## ----------------------------------------------------------------------------------------
## Catholic                                                              -0.115***         
##                                                                        (0.039)          
##                                                                                         
## Family Income                                                         0.004***          
##                                                                       (0.0005)          
##                                                                                         
## Number of places the student has lived for at least 4 months           -0.091           
##                                                                        (0.070)          
##                                                                                         
## Mother's Education                                                    -0.366***         
##                                                                        (0.050)          
##                                                                                         
## Constant                                                               0.151*           
##                                                                        (0.091)          
##                                                                                         
## ----------------------------------------------------------------------------------------
## Observations                                                            1,860           
## Adjusted R2                                                             0.086           
## ========================================================================================
## Note:                                                        *p<0.1; **p<0.05; ***p<0.01

The math score for students going to catholic school is 0.115 point lower than students from non-catholic students which is significant at 1% level of significance level in both simple and multiple regressions.

Part B: Deconstructing PSM

7. Split the PSM procedure

plot(sort_psm_data$distance, sort_psm_data$pr_score, xlab = "Distance from matchit command", ylab = "Predicted math score")

lm(pr_score~distance, data=sort_psm_data)
## 
## Call:
## lm(formula = pr_score ~ distance, data = sort_psm_data)
## 
## Coefficients:
## (Intercept)     distance  
##   2.718e-15    1.000e+00

From the graph and the slope= 1, we can verify that Verify that the logit predicted values are the same as the distance used by the matchit command.

# Run your same matchit command but with distance= your logit predictions. See that this is the same.
logit_pred <- fitted(model3)

psm2 <- matchit(catholic ~ w3income_1k + p5numpla + w3momed_hsb, data=predict_data, family=binomial(), distance= logit_pred)

psm2_data <- match.data(psm2)

head(psm2_data)
##   c5r2mtsc_std catholic w3income_1k p5numpla w3momed_hsb  pr_score  distance
## 1    0.9817533        0     62.5005        1           0 0.1883576 0.1883576
## 2    0.5943775        0     45.0005        1           0 0.1683901 0.1683901
## 3    0.4906106        0     62.5005        1           0 0.1883576 0.1883576
## 4    1.4512779        1     87.5005        1           0 0.2199574 0.2199574
## 5    2.5956991        0    150.0005        1           0 0.3145556 0.3145556
## 6    0.3851966        0     62.5005        1           0 0.1883576 0.1883576
##   weights subclass
## 1       1      348
## 2       1      319
## 3       1       24
## 4       1      550
## 5       1      338
## 6       1       25

The logit prediction values matched with the distance variable in matchit command.

8. Use Machine Learning for matching

#only the numeric variable 

lassodata <- dataset %>% select(race_white,race_black, race_hispanic, race_asian, p5numpla, p5hmage, 
                                p5hdage, w3daded_hsb, w3momed_hsb,
                                w3momscr, w3dadscr, w3income, w3povrty, p5fstamp, c5r2mtsc,  w3income_1k, c5r2mtsc_std)


#converting to a matrix
x <- data.matrix(lassodata)

# standard math score as response 

dfy <- dataset %>% select (catholic)

y <- data.matrix(dfy)
dim(lassodata)
## [1] 5429   17
dim(dfy)
## [1] 5429    1
n = 5429 #observation number
cvlasso <- cv.glmnet(x, y, 
                     family="binomial",
                     alpha=1,
                     lambda= (seq(0,100, 1))/n)


plot(cvlasso, cex = 0.8, cex.axis=0.8, cex.lab=0.8)

coef(cvlasso)
## 18 x 1 sparse Matrix of class "dgCMatrix"
##                          s1
## (Intercept)   -2.550723e+00
## race_white     2.589038e-02
## race_black     .           
## race_hispanic  .           
## race_asian     .           
## p5numpla       .           
## p5hmage        1.986092e-02
## p5hdage        .           
## w3daded_hsb   -2.161572e-01
## w3momed_hsb   -2.387878e-01
## w3momscr       7.178855e-04
## w3dadscr       .           
## w3income       4.513822e-06
## w3povrty      -2.352962e-01
## p5fstamp       .           
## c5r2mtsc       .           
## w3income_1k    1.384174e-05
## c5r2mtsc_std   .

Comment on the results – If you used lambda.1se as your regularization parameter, how many variables would you have chosen to include? answer: 8 variables – Which variables would you have chosen to run the PSM command? answer: the selected variables are:race_white, p5hmage, w3daded_hsb, w3momed_hsb, w3income, w3momscr, w3povrty, w3income_1k – Do these variables differ from those you used before? yes, some of the variables are not used before to do the psm analysis (p5hmage, w3daded_hsb, w3income etc.) and p5numpla and w3income_1k were used in previous analysis which is not in the list of selected variables by lasso analysis

9. Use Lasso predictions for matching

# Generate a prediction from the “best lasso” model
#Run a logit regression to predict catholic based on the covariates we kept (but not the math scores, of course)
model4 <- glm(formula = catholic ~ race_white+p5hmage+w3daded_hsb+w3momed_hsb+w3momscr+w3income+w3povrty+w3income_1k,
    family = binomial(), data = dataset)

summary(model4)
## 
## Call:
## glm(formula = catholic ~ race_white + p5hmage + w3daded_hsb + 
##     w3momed_hsb + w3momscr + w3income + w3povrty + w3income_1k, 
##     family = binomial(), data = dataset)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -1.1679  -0.6811  -0.5229  -0.3306   2.7350  
## 
## Coefficients: (1 not defined because of singularities)
##               Estimate Std. Error z value Pr(>|z|)    
## (Intercept) -3.407e+00  3.209e-01 -10.617  < 2e-16 ***
## race_white   2.332e-01  8.767e-02   2.660 0.007815 ** 
## p5hmage      3.620e-02  7.180e-03   5.042 4.60e-07 ***
## w3daded_hsb -3.338e-01  9.187e-02  -3.634 0.000279 ***
## w3momed_hsb -3.702e-01  1.005e-01  -3.682 0.000231 ***
## w3momscr     4.395e-03  3.406e-03   1.290 0.196939    
## w3income     4.447e-06  9.093e-07   4.890 1.01e-06 ***
## w3povrty    -1.132e+00  2.741e-01  -4.130 3.63e-05 ***
## w3income_1k         NA         NA      NA       NA    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 4972.4  on 5428  degrees of freedom
## Residual deviance: 4668.4  on 5421  degrees of freedom
## AIC: 4684.4
## 
## Number of Fisher Scoring iterations: 6
pred_score1 = data.frame(pr_score1 = predict(model4, type="response"))

predict_data1 <- cbind(dataset, pred_score1)

head(predict_data1)
##    childid catholic                race race_white race_black race_hispanic
## 1 0001002C        0 WHITE, NON-HISPANIC          1          0             0
## 2 0001004C        0 WHITE, NON-HISPANIC          1          0             0
## 3 0001010C        0 WHITE, NON-HISPANIC          1          0             0
## 4 0001011C        1 WHITE, NON-HISPANIC          1          0             0
## 5 0001012C        0 WHITE, NON-HISPANIC          1          0             0
## 6 0002004C        0 WHITE, NON-HISPANIC          1          0             0
##   race_asian p5numpla p5hmage p5hdage                          w3daded
## 1          0        1      47      45 DOCTORATE OR PROFESSIONAL DEGREE
## 2          0        1      41      48                BACHELOR'S DEGREE
## 3          0        1      43      55         MASTER'S DEGREE (MA, MS)
## 4          0        1      38      39                BACHELOR'S DEGREE
## 5          0        1      47      57 DOCTORATE OR PROFESSIONAL DEGREE
## 6          0        1      41      41                BACHELOR'S DEGREE
##                                  w3momed w3daded_hsb w3momed_hsb w3momscr
## 1                           SOME COLLEGE           0           0    53.50
## 2 GRADUATE/PROFESSIONAL SCHOOL-NO DEGREE           0           0    34.95
## 3 GRADUATE/PROFESSIONAL SCHOOL-NO DEGREE           0           0    63.43
## 4 GRADUATE/PROFESSIONAL SCHOOL-NO DEGREE           0           0    53.50
## 5               MASTER'S DEGREE (MA, MS)           0           0    61.56
## 6                      BACHELOR'S DEGREE           0           0    38.18
##   w3dadscr             w3inccat w3income w3povrty p5fstamp c5r2mtsc
## 1     77.5   $50,001 TO $75,000  62500.5        0        0   60.043
## 2     53.5   $40,001 TO $50,000  45000.5        0        0   56.280
## 3     53.5   $50,001 TO $75,000  62500.5        0        0   55.272
## 4     53.5  $75,001 TO $100,000  87500.5        0        0   64.604
## 5     77.5 $100,001 TO $200,000 150000.5        0        0   75.721
## 6     53.5   $50,001 TO $75,000  62500.5        0        0   54.248
##   c5r2mtsc_std w3income_1k pr_score1
## 1    0.9817533     62.5005 0.2770983
## 2    0.5943775     45.0005 0.2082585
## 3    0.4906106     62.5005 0.2572963
## 4    1.4512779     87.5005 0.2362127
## 5    2.5956991    150.0005 0.3694909
## 6    0.3851966     62.5005 0.2238370
new_predict <- fitted(model4)

psm3 <- matchit(catholic ~w3income_1k + w3momed_hsb + p5hmage + w3daded_hsb+w3income, data=predict_data1, family=binomial(), distance= new_predict)

psm3_data <- match.data(psm3)

head(psm3_data)
##     childid catholic                race race_white race_black race_hispanic
## 1  0001002C        0 WHITE, NON-HISPANIC          1          0             0
## 2  0001004C        0 WHITE, NON-HISPANIC          1          0             0
## 3  0001010C        0 WHITE, NON-HISPANIC          1          0             0
## 4  0001011C        1 WHITE, NON-HISPANIC          1          0             0
## 6  0002004C        0 WHITE, NON-HISPANIC          1          0             0
## 17 0006003C        0 WHITE, NON-HISPANIC          1          0             0
##    race_asian p5numpla p5hmage p5hdage                          w3daded
## 1           0        1      47      45 DOCTORATE OR PROFESSIONAL DEGREE
## 2           0        1      41      48                BACHELOR'S DEGREE
## 3           0        1      43      55         MASTER'S DEGREE (MA, MS)
## 4           0        1      38      39                BACHELOR'S DEGREE
## 6           0        1      41      41                BACHELOR'S DEGREE
## 17          0        1      39      39                BACHELOR'S DEGREE
##                                   w3momed w3daded_hsb w3momed_hsb w3momscr
## 1                            SOME COLLEGE           0           0    53.50
## 2  GRADUATE/PROFESSIONAL SCHOOL-NO DEGREE           0           0    34.95
## 3  GRADUATE/PROFESSIONAL SCHOOL-NO DEGREE           0           0    63.43
## 4  GRADUATE/PROFESSIONAL SCHOOL-NO DEGREE           0           0    53.50
## 6                       BACHELOR'S DEGREE           0           0    38.18
## 17                      BACHELOR'S DEGREE           0           0    53.50
##    w3dadscr            w3inccat w3income w3povrty p5fstamp c5r2mtsc
## 1      77.5  $50,001 TO $75,000  62500.5        0        0   60.043
## 2      53.5  $40,001 TO $50,000  45000.5        0        0   56.280
## 3      53.5  $50,001 TO $75,000  62500.5        0        0   55.272
## 4      53.5 $75,001 TO $100,000  87500.5        0        0   64.604
## 6      53.5  $50,001 TO $75,000  62500.5        0        0   54.248
## 17     53.5 $75,001 TO $100,000  87500.5        0        0   54.404
##    c5r2mtsc_std w3income_1k pr_score1  distance weights subclass
## 1     0.9817533     62.5005 0.2770983 0.2770983       1      408
## 2     0.5943775     45.0005 0.2082585 0.2082585       1      111
## 3     0.4906106     62.5005 0.2572963 0.2572963       1      355
## 4     1.4512779     87.5005 0.2362127 0.2362127       1      550
## 6     0.3851966     62.5005 0.2238370 0.2238370       1      923
## 17    0.4012558     87.5005 0.2428065 0.2428065       1      554

The predicted score from best lasso model matched with the distance variable from matchit psm3 dataset

Model_psm3 <- lm(c5r2mtsc_std~catholic+w3income_1k+p5numpla+w3momed_hsb, data=psm3_data)

stargazer(Model_psm3, type="text", title="Table 5", 
          align=TRUE, dep.var.labels = "Standard Math Score",  covariate.labels=c("Catholic", "Family Income", "Number of places the student has lived for at least 4 months", "Mother's Education"),
          keep.stat = c("n", "adj.rsq"))
## 
## Table 5
## ========================================================================================
##                                                                  Dependent variable:    
##                                                              ---------------------------
##                                                                  Standard Math Score    
## ----------------------------------------------------------------------------------------
## Catholic                                                              -0.217***         
##                                                                        (0.040)          
##                                                                                         
## Family Income                                                         0.004***          
##                                                                       (0.0005)          
##                                                                                         
## Number of places the student has lived for at least 4 months           -0.041           
##                                                                        (0.069)          
##                                                                                         
## Mother's Education                                                    -0.315***         
##                                                                        (0.051)          
##                                                                                         
## Constant                                                               0.198**          
##                                                                        (0.091)          
##                                                                                         
## ----------------------------------------------------------------------------------------
## Observations                                                            1,860           
## Adjusted R2                                                             0.081           
## ========================================================================================
## Note:                                                        *p<0.1; **p<0.05; ***p<0.01

After Lasso Matching, the math core for students going to catholic school is lower by 0.217 points than non-catholic school students at 1% level of significance

# Original Matching 
plot(sort_psm_data$distance, sort_psm_data$pr_score, type = "b", frame = FALSE, pch = 19, 
     col = "red", xlab = "x", ylab = "y")
# Lasso Matching 
lines(psm3_data$distance, psm3_data$pr_score1, pch = 18, col = "blue", type = "b", lty = 2)
# Add a legend to the plot
legend("topleft", legend=c("Original Matching", "Lasso Matching"),
       col=c("red", "blue"), lty = 1:2, cex=0.8)

From the above graph, we can see that lasso matching is not different than original matching.