data<-read.csv("G:/My Drive/Seans Drive/PhD/Classes/AQME/HW/PSM/ecls.csv")
t.test(c5r2mtsc_std ~ catholic, data = data)
diff_cath<-lm(c5r2mtsc_std ~ catholic, data = data) #pvalue of .1
stargazer(diff_cath, type='html')
model1<-lm(c5r2mtsc_std ~ catholic + w3income_1k+p5numpla+w3momed_hsb, data = data) #pvalue of .1
stargazer(model1,type='html')

As seen above, in the simple linear model, individuals who are catholic has a decrease in their scores of 0.1273899. However, being catholic is an endogenous choice and therefore the coefficient is likely to be bias

dt<-as.data.table(data)
sum1<-tapply(dt$w3income_1k, data$catholic, summary) 
sum2<-tapply(dt$p5numpla, data$catholic, summary) 
sum3<-tapply(dt$w3momed_hsb, data$catholic, summary) 
sumMat<-matrix(0:0,ncol=3,nrow=4)
sumMat[1,1]=sum1$`0`[4]
sumMat[2,1]=sum1$`1`[4]
sumMat[1,2]=sum2$`0`[4]
sumMat[2,2]=sum2$`1`[4]
sumMat[1,3]=sum3$`0`[4]
sumMat[2,3]=sum3$`1`[4]
test1<-t.test(w3income_1k ~ catholic, data = data)
test2<-t.test(p5numpla ~ catholic, data = data)
test3<-t.test(w3momed_hsb ~ catholic, data = data)
sumMat[4,1]<-test1$p.value
sumMat[4,2]<-test2$p.value
sumMat[4,3]<-test3$p.value

for(i in 1:3){
  sumMat[3,i]=sumMat[1,i]-sumMat[2,i]
}
colnames(sumMat)<-c("w3income_1k",'p5numpla','w3momed_hsb')
rownames(sumMat)<-c('Catholic mean','non-Catholic mean', 'difference in means', 'p-value')
sumMat
w3income_1k p5numpla w3momed_hsb
Catholic mean 65.39393 1.1062458 0.3923094
non-Catholic mean 86.18063 1.0731183 0.2053763
difference in means -20.78670 0.0331276 0.1869331
p-value 0.00000 0.0017916 0.0000000

From the table above we see that the covariates are all statistically significant between the two groups, catholic and non-catholic. This would indicate that the treated group (catholic) is not similar to the control group (non-catholic) based upon observables that impact the outcome variable. To overcome this, we can use propensity score matching to synthetically balance our two groups.

#all of them are statistically different
PSM_logit <- glm(catholic ~ w3income_1k + p5numpla + w3momed_hsb + race_white + race_black + race_hispanic + race_asian + p5hmage + p5hdage + w3daded_hsb + w3momscr + w3dadscr + w3income + w3povrty + p5fstamp, data = data, family = "binomial")
data$PSM_hand=predict(PSM_logit,type="response")
p1 <- ggplot(data=data, aes(x=PSM_hand, group=as.factor(catholic), fill=catholic)) +
  geom_density(adjust=1.5, alpha=.4) 
p1

p2 <- ggplot(data=data, aes(x=w3income_1k, group=as.factor(catholic), fill=catholic)) +
  geom_density(adjust=1.5, alpha=.4) + ggtitle(label='By Hand PSM')
p2

Match_PSM <- matchit(catholic ~ w3income_1k + p5numpla + w3momed_hsb + race_white + race_black + race_hispanic + race_asian + p5hmage + p5hdage + w3daded_hsb + w3momscr + w3dadscr + w3income + w3povrty + p5fstamp, data = data, family = "binomial")
data_PSM<-match.data(Match_PSM)
data_PSM<-data_PSM[order(-data_PSM$distance),]
head(data_PSM, 10)
childid catholic race race_white race_black race_hispanic race_asian p5numpla p5hmage p5hdage w3daded w3momed w3daded_hsb w3momed_hsb w3momscr w3dadscr w3inccat w3income w3povrty p5fstamp c5r2mtsc c5r2mtsc_std w3income_1k PSM_hand distance weights subclass
4301 1119022C 1 WHITE, NON-HISPANIC 1 0 0 0 1 48 52 DOCTORATE OR PROFESSIONAL DEGREE DOCTORATE OR PROFESSIONAL DEGREE 0 0 59.00 59.00 $200,001 TO $200,001 200001.0 0 0 57.484 0.7183213 200.0010 0.4435402 0.4435402 1 617
255 0052015C 0 WHITE, NON-HISPANIC 1 0 0 0 1 53 55 DOCTORATE OR PROFESSIONAL DEGREE MASTER’S DEGREE (MA, MS) 0 0 61.56 72.10 $100,001 TO $200,000 150000.5 0 0 54.770 0.4389331 150.0005 0.4423514 0.4423514 1 617
1248 0314016C 1 WHITE, NON-HISPANIC 1 0 0 0 1 47 48 DOCTORATE OR PROFESSIONAL DEGREE BACHELOR’S DEGREE 0 0 61.56 77.50 $200,001 TO $200,001 200001.0 0 0 63.738 1.3621290 200.0010 0.4388899 0.4388899 1 56
375 0081019C 0 WHITE, NON-HISPANIC 1 0 0 0 1 47 52 MASTER’S DEGREE (MA, MS) MASTER’S DEGREE (MA, MS) 0 0 63.43 53.50 $200,001 TO $200,001 200001.0 0 0 45.248 -0.5412935 200.0010 0.4386094 0.4386094 1 56
329 0073012C 0 WHITE, NON-HISPANIC 1 0 0 0 1 51 51 BACHELOR’S DEGREE BACHELOR’S DEGREE 0 0 35.78 35.78 $200,001 TO $200,001 200001.0 0 0 63.241 1.3109661 200.0010 0.4241175 0.4241175 1 399
601 0147016C 0 WHITE, NON-HISPANIC 1 0 0 0 1 50 62 SOME COLLEGE SOME COLLEGE 0 0 61.56 39.18 $100,001 TO $200,000 150000.5 0 0 39.653 -1.1172615 150.0005 0.4231485 0.4231485 1 55
1247 0314015C 1 WHITE, NON-HISPANIC 1 0 0 0 1 50 58 MASTER’S DEGREE (MA, MS) DOCTORATE OR PROFESSIONAL DEGREE 0 0 59.00 63.43 $100,001 TO $200,000 150000.5 0 0 57.246 0.6938208 150.0005 0.4228320 0.4228320 1 55
3249 0812012C 1 WHITE, NON-HISPANIC 1 0 0 0 1 44 64 BACHELOR’S DEGREE SOME COLLEGE 0 0 35.78 53.50 $200,001 TO $200,001 200001.0 0 0 55.364 0.5000814 200.0010 0.4206592 0.4206592 1 399
4289 1119006C 1 WHITE, NON-HISPANIC 1 0 0 0 1 45 50 MASTER’S DEGREE (MA, MS) MASTER’S DEGREE (MA, MS) 0 0 61.56 53.50 $200,001 TO $200,001 200001.0 0 0 55.767 0.5415676 200.0010 0.4180896 0.4180896 1 605
3934 1006020C 1 WHITE, NON-HISPANIC 1 0 0 0 1 47 54 DOCTORATE OR PROFESSIONAL DEGREE BACHELOR’S DEGREE 0 0 35.78 59.00 $200,001 TO $200,001 200001.0 0 0 68.129 1.8141530 200.0010 0.4175431 0.4175431 1 541
dt<-as.data.table(data_PSM)
sum1<-tapply(dt$w3income_1k, dt$catholic, summary) 
sum2<-tapply(dt$p5numpla, dt$catholic, summary) 
sum3<-tapply(dt$w3momed_hsb, dt$catholic, summary) 
sumMat2<-matrix(0:0,ncol=3,nrow=4)
sumMat2[1,1]=sum1$`0`[4]
sumMat2[2,1]=sum1$`1`[4]
sumMat2[1,2]=sum2$`0`[4]
sumMat2[2,2]=sum2$`1`[4]
sumMat2[1,3]=sum3$`0`[4]
sumMat2[2,3]=sum3$`1`[4]
test1<-t.test(w3income_1k ~ catholic, data = data_PSM)
test2<-t.test(p5numpla ~ catholic, data = data_PSM)
test3<-t.test(w3momed_hsb ~ catholic, data = data_PSM)
sumMat2[4,1]<-test1$p.value
sumMat2[4,2]<-test2$p.value
sumMat2[4,3]<-test3$p.value

for(i in 1:3){
  sumMat2[3,i]=sumMat[1,i]-sumMat[2,i]
}
colnames(sumMat2)<-c("w3income_1k",'p5numpla','w3momed_hsb')
rownames(sumMat2)<-c('Catholic mean','non-Catholic mean', 'difference in means', 'p-value')
sumMat2
w3income_1k p5numpla w3momed_hsb
Catholic mean 82.6134172 1.0827957 0.1946237
non-Catholic mean 86.1806253 1.0731183 0.2053763
difference in means -20.7866967 0.0331276 0.1869331
p-value 0.0797688 0.4899029 0.5623800

In the new data set, the covariates are not significantly different meaning we were able to balance on observables. However, this does not guarantee that the unobservables are balanced as well. PSM allowed us to control for unobservables by predicting based on observable characteristics the likelihood of someone being in the treatment group. This attempts to overcome the counterfactual problem needed in the Ruben Causal Model. If it is unlikely that the control group would ever be in the treatment group than they are not ideal counterfactuals and your results will remain biased

p3 <- ggplot(data=data_PSM, aes(x=w3income_1k, group=as.factor(catholic), fill=catholic)) +
  geom_density(adjust=1.5, alpha=.4) + ggtitle(label='PSM')
p3

test_PSM<-t.test(c5r2mtsc_std ~ catholic, data = data_PSM)
stargazer(test_PSM,type='html')

% Error: Unrecognized object type.

modelPSM<-lm(c5r2mtsc_std ~ catholic + w3income_1k+p5numpla+w3momed_hsb, data = data_PSM) #pvalue of 
stargazer(modelPSM,type='html')
Dependent variable:
c5r2mtsc_std
catholic -0.164***
(0.039)
w3income_1k 0.004***
(0.0005)
p5numpla -0.154**
(0.065)
w3momed_hsb -0.341***
(0.050)
Constant 0.263***
(0.086)
Observations 1,860
R2 0.090
Adjusted R2 0.088
Residual Std. Error 0.844 (df = 1855)
F Statistic 46.005*** (df = 4; 1855)
Note: p<0.1; p<0.05; p<0.01

.

data$distancePSM<-Match_PSM$distance
p4 <- ggplot() + geom_density(data=data,aes(x=PSM_hand)) + geom_density(data=data,aes(x=distancePSM))
p4

#LASSO
library(glmnet)
dfx = subset(data, select = -c(catholic,childid, race, w3daded, w3momed, w3inccat, c5r2mtsc, c5r2mtsc_std, PSM_hand, distancePSM ) )
dfy= subset(data, select = catholic)
x=model.matrix(race_white~.,dfx)[,-1] #without the intercept column
y=dfy$catholic
nfolds=10
cvfit <- cv.glmnet(
  x, y, family = "binomial", 
  nfolds = nfolds,
  # foldid = foldid,
  keep = TRUE  # returns foldid 
)
out=glmnet(x,y,alpha=1,lambda=cvfit$lambda.1se)
lasso.coef=predict(out,type="coefficients",s=cvfit$lambda.1se)
lasso.coef # Some are zero

15 x 1 sparse Matrix of class “dgCMatrix” s1 (Intercept) 5.594121e-02 race_black .
race_hispanic .
race_asian .
p5numpla .
p5hmage 2.223032e-03 p5hdage .
w3daded_hsb -2.536899e-02 w3momed_hsb -2.627530e-02 w3momscr 4.277209e-06 w3dadscr .
w3income 7.433612e-07 w3povrty -4.659162e-03 p5fstamp .
w3income_1k 2.393500e-08

#p5hmage, w3dataed_hsb, w3momed_hsb, w3income, w3poverty, w3momscr, w3income_1k

Using LASSO for variable selection, it greatly reduced the number of variables I used in the original logit model and the PSM model previously.

MatchML_PSM <- matchit(catholic ~ w3income_1k + w3momed_hsb + p5hmage + w3daded_hsb + w3momscr  + w3income + w3povrty, data = data, family = "binomial")

data$distanceML<-MatchML_PSM$distance
p5 <- ggplot() + geom_density(data=data,aes(x=PSM_hand, colour='By Hand')) + geom_density(data=data ,aes(x=distancePSM, colour='PSM Match')) + geom_density(data=data,aes(x=distanceML, colour='LASSO')) +
  scale_color_manual(name = "names", values = c("By Hand" = "turquoise", "PSM Match" = "orange",'LASSO'='blue'))
p5

data_lasso<-match.data(MatchML_PSM)

p6 <- ggplot(data=data_lasso, aes(x=w3income_1k, group=as.factor(catholic), fill=catholic)) +
  geom_density(adjust=1.5, alpha=.4) + ggtitle(label='Lasso PSM')
p6

figure1 <- ggarrange(p1, p3, p6,
                     ncol = 1, nrow = 3)
figure1

As clearly highlighted in figure1, PSM using LASSO creates a treatment and control group that are nearly identical. This means that the results from the ML assisted PSM are closer to the ideal setting in the Ruben Causal Model than the previous methods.