1. Download the data

library(readr)
ecls <- read_csv("C:/Users/jy80530/OneDrive - University of Georgia/Course/07 Spring 2020/AAEC8610/HW/HW11/ecls.csv")
df <- subset(ecls, select = c(c5r2mtsc_std, catholic, w3income_1k, p5numpla, w3momed_hsb))

2. See the problem

library(dplyr)
df %>%
  group_by(catholic) %>%
  summarise(
    N=n(),
    Mean=mean(c5r2mtsc_std),
    SE=sd(c5r2mtsc_std)/sqrt(length(c5r2mtsc_std)))
## # A tibble: 2 x 4
##   catholic     N  Mean     SE
##      <dbl> <int> <dbl>  <dbl>
## 1        0  4499 0.163 0.0145
## 2        1   930 0.220 0.0281
catholic n_students mean_math std_error
0 4499 0.1631279 0.0145155
1 930 0.2196851 0.0281317
df <- data_frame(df) 
model <- lm(c5r2mtsc_std ~ catholic, df)
summary(model)
## 
## Call:
## lm(formula = c5r2mtsc_std ~ catholic, data = df)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -3.2667 -0.6082  0.0538  0.6292  3.2705 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  0.16313    0.01424  11.459   <2e-16 ***
## catholic     0.05656    0.03440   1.644      0.1    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.9549 on 5427 degrees of freedom
## Multiple R-squared:  0.000498,   Adjusted R-squared:  0.0003138 
## F-statistic: 2.704 on 1 and 5427 DF,  p-value: 0.1002

Dependent variable:

c5r2mtsc_std

catholic

Constant

0.057(0.034)

0.163***(0.014)

Observations

Adjusted R2

5,429

0.0003

note: p<0.1; p<0.05; p<0.01
model <- lm(c5r2mtsc_std ~ catholic+w3income_1k+p5numpla+w3momed_hsb, df)
summary(model)
## 
## Call:
## lm(formula = c5r2mtsc_std ~ catholic + w3income_1k + p5numpla + 
##     w3momed_hsb, data = df)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -3.3922 -0.5696  0.0372  0.5986  3.2572 
## 
## Coefficients:
##               Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  0.0733290  0.0491661   1.491 0.135900    
## catholic    -0.1273899  0.0328888  -3.873 0.000109 ***
## w3income_1k  0.0053247  0.0003026  17.595  < 2e-16 ***
## p5numpla    -0.1009432  0.0355779  -2.837 0.004567 ** 
## w3momed_hsb -0.3740355  0.0271962 -13.753  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.8941 on 5424 degrees of freedom
## Multiple R-squared:  0.1242, Adjusted R-squared:  0.1235 
## F-statistic: 192.3 on 4 and 5424 DF,  p-value: < 2.2e-16

Dependent variable:

c5r2mtsc_std

catholic

w3income_1k

p5numpla

w3momed_hsb

Constant

-0.127***(0.033)

0.005***(0.0003)

-0.101***(0.036)

-0.374***(0.027)

0.073(0.049)

Observations

Adjusted R2

5,429

0.124

note: p<0.1; p<0.05; p<0.01
library(dplyr)
by_catholic <- df %>%
  group_by(catholic)


by_catholic %>%
  summarise_all(list(mean))
## # A tibble: 2 x 5
##   catholic c5r2mtsc_std w3income_1k p5numpla w3momed_hsb
##      <dbl>        <dbl>       <dbl>    <dbl>       <dbl>
## 1        0        0.163        65.4     1.11       0.392
## 2        1        0.220        86.2     1.07       0.205
catholic c5r2mtsc_std w3income_1k p5numpla w3momed_hsb
0 0.1631279 65.39393 1.106246 0.3923094
1 0.2196851 86.18063 1.073118 0.2053763

The significance tests give the following p-values. All are significantly different between groups.

oneway.test(w3income_1k ~ catholic, data = df, var.equal = T)
## 
##  One-way analysis of means
## 
## data:  w3income_1k and catholic
## F = 182.62, num df = 1, denom df = 5427, p-value < 2.2e-16
oneway.test(p5numpla ~ catholic, data = df, var.equal = T)
## 
##  One-way analysis of means
## 
## data:  p5numpla and catholic
## F = 7.26, num df = 1, denom df = 5427, p-value = 0.007073
oneway.test(w3momed_hsb ~ catholic, data = df, var.equal = T)
## 
##  One-way analysis of means
## 
## data:  w3momed_hsb and catholic
## F = 119.37, num df = 1, denom df = 5427, p-value < 2.2e-16

3. Estimate the propensity to go to catholic school, by hand

glm_fit<-glm(formula = catholic ~ w3income_1k + p5numpla + w3momed_hsb, family = binomial(), data = df)
summary(glm_fit)
## 
## Call:
## glm(formula = catholic ~ w3income_1k + p5numpla + w3momed_hsb, 
##     family = binomial(), data = df)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -1.0172  -0.6461  -0.5240  -0.4122   2.3672  
## 
## Coefficients:
##               Estimate Std. Error z value Pr(>|z|)    
## (Intercept) -1.6702950  0.1576164 -10.597  < 2e-16 ***
## w3income_1k  0.0077921  0.0008115   9.602  < 2e-16 ***
## p5numpla    -0.2774346  0.1232930  -2.250   0.0244 *  
## w3momed_hsb -0.6504757  0.0915710  -7.104 1.22e-12 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 4972.4  on 5428  degrees of freedom
## Residual deviance: 4750.6  on 5425  degrees of freedom
## AIC: 4758.6
## 
## Number of Fisher Scoring iterations: 4

Here’s the head of the predicted data:

glm_probs = data.frame(probs = predict(glm_fit, type="response"))
pr_score = data.frame(glm_probs, df$catholic)
head(pr_score)
##       probs df.catholic
## 1 0.1883576           0
## 2 0.1683901           0
## 3 0.1883576           0
## 4 0.2199574           1
## 5 0.3145556           0
## 6 0.1883576           0
library(ggplot2)
newdata = data.frame(glm_probs, df$catholic, df$w3income_1k)
g<- ggplot(data =newdata, mapping= aes(x=probs, y=df.w3income_1k, color=factor(df.catholic)))+
  geom_point(alpha = 0.1)+
  geom_smooth(method=NULL)
  labs(title="Income vs. predicted value",
       x="Logit prediction",
       y="w3income_1k")
## $x
## [1] "Logit prediction"
## 
## $y
## [1] "w3income_1k"
## 
## $title
## [1] "Income vs. predicted value"
## 
## attr(,"class")
## [1] "labels"
g

library(ggplot2)
g<- ggplot(data =newdata, mapping= aes(x=probs))+
  geom_histogram()+
  facet_grid(~df.catholic)+
  theme_bw()+
  labs(title="Histogram",
       x="Probability of going to Catholic school",
       y="count")
g

4. Run a Matching algorithm to get the impact of catholic schools by PSM

library(MatchIt)
psm <- matchit(catholic ~ w3income_1k + p5numpla + w3momed_hsb, link="logit", data = df)
psm_data<-match.data(psm) 
dim(psm_data)
## [1] 1860    8
psm_data1 <- psm_data[order(psm_data$distance),]
head(psm_data1)
## # A tibble: 6 x 8
##   c5r2mtsc_std catholic w3income_1k p5numpla w3momed_hsb distance weights
##          <dbl>    <dbl>       <dbl>    <dbl>       <dbl>    <dbl>   <dbl>
## 1       -0.384        0        17.5        2           1   0.0607       1
## 2       -2.59         1        17.5        2           1   0.0607       1
## 3       -2.37         0        22.5        2           1   0.0630       1
## 4        0.527        1        22.5        2           1   0.0630       1
## 5       -0.220        0        27.5        2           1   0.0653       1
## 6       -1.69         1        27.5        2           1   0.0653       1
## # ... with 1 more variable: subclass <fct>
by_catholic <- psm_data %>%
  group_by(catholic)


by_catholic %>%
  summarise_all(list(mean))
## # A tibble: 2 x 8
##   catholic c5r2mtsc_std w3income_1k p5numpla w3momed_hsb distance weights
##      <dbl>        <dbl>       <dbl>    <dbl>       <dbl>    <dbl>   <dbl>
## 1        0        0.335        86.2     1.07       0.205    0.203       1
## 2        1        0.220        86.2     1.07       0.205    0.203       1
## # ... with 1 more variable: subclass <dbl>

Significance tests:

oneway.test(w3income_1k ~ catholic, data = psm_data1, var.equal = T)
## 
##  One-way analysis of means
## 
## data:  w3income_1k and catholic
## F = 0, num df = 1, denom df = 1858, p-value = 1
oneway.test(p5numpla ~ catholic, data = psm_data1, var.equal = T)
## 
##  One-way analysis of means
## 
## data:  p5numpla and catholic
## F = 0, num df = 1, denom df = 1858, p-value = 1
oneway.test(w3momed_hsb ~ catholic, data = psm_data1, var.equal = T)
## 
##  One-way analysis of means
## 
## data:  w3momed_hsb and catholic
## F = 0, num df = 1, denom df = 1858, p-value = 1

5. Visually check the balance in the new dataset:

library(Hmisc)
library(plyr)
g<- ggplot(data =newdata, mapping= aes(x=probs, y=df.w3income_1k, color=factor(df.catholic)))+
  geom_point(alpha = 0.1)+
  stat_plsmo(aes(color="lowess"))+
  labs(title="Income vs. predicted value",
     x="Propensity score",
     y="w3income_1k")
g

6. Compute the average treatment effect in your new sample

psm_data %>%
  group_by(catholic) %>%
  dplyr::summarise(
    N=n(),
    Mean=mean(c5r2mtsc_std),  
    SE=sd(c5r2mtsc_std)/sqrt(length(c5r2mtsc_std))
    )
## # A tibble: 2 x 4
##   catholic     N  Mean     SE
##      <dbl> <int> <dbl>  <dbl>
## 1        0   930 0.335 0.0298
## 2        1   930 0.220 0.0281
model <- lm(c5r2mtsc_std ~ catholic, psm_data1)
summary(model)
## 
## Call:
## lm(formula = c5r2mtsc_std ~ catholic, data = psm_data1)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -3.2884 -0.5547  0.0443  0.5931  2.8356 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  0.33456    0.02896  11.552  < 2e-16 ***
## catholic    -0.11488    0.04096  -2.805  0.00509 ** 
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.8832 on 1858 degrees of freedom
## Multiple R-squared:  0.004216,   Adjusted R-squared:  0.00368 
## F-statistic: 7.867 on 1 and 1858 DF,  p-value: 0.005087
model <- lm(c5r2mtsc_std ~ catholic+w3income_1k+p5numpla+w3momed_hsb, psm_data1)
summary(model)
## 
## Call:
## lm(formula = c5r2mtsc_std ~ catholic + w3income_1k + p5numpla + 
##     w3momed_hsb, data = psm_data1)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -3.3758 -0.5347  0.0265  0.5692  2.8515 
## 
## Coefficients:
##               Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  0.1513954  0.0910427   1.663  0.09650 .  
## catholic    -0.1148784  0.0392301  -2.928  0.00345 ** 
## w3income_1k  0.0041331  0.0004566   9.052  < 2e-16 ***
## p5numpla    -0.0911405  0.0700264  -1.302  0.19324    
## w3momed_hsb -0.3662377  0.0495217  -7.396 2.12e-13 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.846 on 1855 degrees of freedom
## Multiple R-squared:  0.08793,    Adjusted R-squared:  0.08596 
## F-statistic: 44.71 on 4 and 1855 DF,  p-value: < 2.2e-16

7. Run a LASSO procedure on the ORIGINAL dataset (the one with all the variables)

x_vars <- as.matrix(ecls[-c(1,2,3,11,12,17)])
y_var <- ecls$catholic
head(x_vars)
##      race_white race_black race_hispanic race_asian p5numpla p5hmage p5hdage
## [1,]          1          0             0          0        1      47      45
## [2,]          1          0             0          0        1      41      48
## [3,]          1          0             0          0        1      43      55
## [4,]          1          0             0          0        1      38      39
## [5,]          1          0             0          0        1      47      57
## [6,]          1          0             0          0        1      41      41
##      w3daded_hsb w3momed_hsb w3momscr w3dadscr w3income w3povrty p5fstamp
## [1,]           0           0    53.50     77.5  62500.5        0        0
## [2,]           0           0    34.95     53.5  45000.5        0        0
## [3,]           0           0    63.43     53.5  62500.5        0        0
## [4,]           0           0    53.50     53.5  87500.5        0        0
## [5,]           0           0    61.56     77.5 150000.5        0        0
## [6,]           0           0    38.18     53.5  62500.5        0        0
##      c5r2mtsc c5r2mtsc_std w3income_1k
## [1,]   60.043    0.9817533     62.5005
## [2,]   56.280    0.5943775     45.0005
## [3,]   55.272    0.4906106     62.5005
## [4,]   64.604    1.4512779     87.5005
## [5,]   75.721    2.5956991    150.0005
## [6,]   54.248    0.3851966     62.5005
#install.packages("glmnet")
library(glmnet)

lasso_reg <-cv.glmnet(x_vars,y_var)
plot(lasso_reg)

coef(lasso_reg, s = "lambda.1se")
## 18 x 1 sparse Matrix of class "dgCMatrix"
##                          s1
## (Intercept)    1.132599e-01
## race_white     .           
## race_black     .           
## race_hispanic  .           
## race_asian     .           
## p5numpla       .           
## p5hmage        7.018037e-04
## p5hdage        .           
## w3daded_hsb   -1.378565e-02
## w3momed_hsb   -1.327652e-02
## w3momscr       .           
## w3dadscr       .           
## w3income       6.096101e-07
## w3povrty       .           
## p5fstamp       .           
## c5r2mtsc       .           
## c5r2mtsc_std   .           
## w3income_1k    .

8. Use Lasso predictions for matching

predict(lasso_reg, newx = x_vars[1:5,], s = "lambda.1se")
##      lambda.1se
## [1,]  0.1843457
## [2,]  0.1694667
## [3,]  0.1815384
## [4,]  0.1932697
## [5,]  0.2376865

Lasso might improve matching in creating a proper distance metric to match. We can use selected variable and run the PSM command to get a stronger distance metric based on features that matter.