data<-read.csv("G:/My Drive/Seans Drive/PhD/Classes/AQME/HW/PSM/ecls.csv")
t.test(c5r2mtsc_std ~ catholic, data = data)
diff_cath<-lm(c5r2mtsc_std ~ catholic, data = data) #pvalue of .1
stargazer(diff_cath, type='html')
model1<-lm(c5r2mtsc_std ~ catholic + w3income_1k+p5numpla+w3momed_hsb, data = data) #pvalue of .1
stargazer(model1,type='html')
As seen above, in the simple linear model, individuals who are catholic has a decrease in their scores of 0.1273899. However, being catholic is an endogenous choice and therefore the coefficient is likely to be bias
dt<-as.data.table(data)
sum1<-tapply(dt$w3income_1k, data$catholic, summary)
sum2<-tapply(dt$p5numpla, data$catholic, summary)
sum3<-tapply(dt$w3momed_hsb, data$catholic, summary)
sumMat<-matrix(0:0,ncol=3,nrow=4)
sumMat[1,1]=sum1$`0`[4]
sumMat[2,1]=sum1$`1`[4]
sumMat[1,2]=sum2$`0`[4]
sumMat[2,2]=sum2$`1`[4]
sumMat[1,3]=sum3$`0`[4]
sumMat[2,3]=sum3$`1`[4]
test1<-t.test(w3income_1k ~ catholic, data = data)
test2<-t.test(p5numpla ~ catholic, data = data)
test3<-t.test(w3momed_hsb ~ catholic, data = data)
sumMat[4,1]<-test1$p.value
sumMat[4,2]<-test2$p.value
sumMat[4,3]<-test3$p.value
for(i in 1:3){
sumMat[3,i]=sumMat[1,i]-sumMat[2,i]
}
colnames(sumMat)<-c("w3income_1k",'p5numpla','w3momed_hsb')
rownames(sumMat)<-c('Catholic mean','non-Catholic mean', 'difference in means', 'p-value')
sumMat
| w3income_1k | p5numpla | w3momed_hsb | |
|---|---|---|---|
| Catholic mean | 65.39393 | 1.1062458 | 0.3923094 |
| non-Catholic mean | 86.18063 | 1.0731183 | 0.2053763 |
| difference in means | -20.78670 | 0.0331276 | 0.1869331 |
| p-value | 0.00000 | 0.0017916 | 0.0000000 |
From the table above we see that the covariates are all statistically significant between the two groups, catholic and non-catholic. This would indicate that the treated group (catholic) is not similar to the control group (non-catholic) based upon observables that impact the outcome variable. To overcome this, we can use propensity score matching to synthetically balance our two groups.
#all of them are statistically different
PSM_logit <- glm(catholic ~ w3income_1k + p5numpla + w3momed_hsb + race_white + race_black + race_hispanic + race_asian + p5hmage + p5hdage + w3daded_hsb + w3momscr + w3dadscr + w3income + w3povrty + p5fstamp, data = data, family = "binomial")
data$PSM_hand=predict(PSM_logit,type="response")
p1 <- ggplot(data=data, aes(x=PSM_hand, group=as.factor(catholic), fill=catholic)) +
geom_density(adjust=1.5, alpha=.4)
p1
p2 <- ggplot(data=data, aes(x=w3income_1k, group=as.factor(catholic), fill=catholic)) +
geom_density(adjust=1.5, alpha=.4) + ggtitle(label='By Hand PSM')
p2
Match_PSM <- matchit(catholic ~ w3income_1k + p5numpla + w3momed_hsb + race_white + race_black + race_hispanic + race_asian + p5hmage + p5hdage + w3daded_hsb + w3momscr + w3dadscr + w3income + w3povrty + p5fstamp, data = data, family = "binomial")
data_PSM<-match.data(Match_PSM)
data_PSM<-data_PSM[order(-data_PSM$distance),]
head(data_PSM, 10)
| childid | catholic | race | race_white | race_black | race_hispanic | race_asian | p5numpla | p5hmage | p5hdage | w3daded | w3momed | w3daded_hsb | w3momed_hsb | w3momscr | w3dadscr | w3inccat | w3income | w3povrty | p5fstamp | c5r2mtsc | c5r2mtsc_std | w3income_1k | PSM_hand | distance | weights | subclass | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 4301 | 1119022C | 1 | WHITE, NON-HISPANIC | 1 | 0 | 0 | 0 | 1 | 48 | 52 | DOCTORATE OR PROFESSIONAL DEGREE | DOCTORATE OR PROFESSIONAL DEGREE | 0 | 0 | 59.00 | 59.00 | $200,001 TO $200,001 | 200001.0 | 0 | 0 | 57.484 | 0.7183213 | 200.0010 | 0.4435402 | 0.4435402 | 1 | 617 |
| 255 | 0052015C | 0 | WHITE, NON-HISPANIC | 1 | 0 | 0 | 0 | 1 | 53 | 55 | DOCTORATE OR PROFESSIONAL DEGREE | MASTER’S DEGREE (MA, MS) | 0 | 0 | 61.56 | 72.10 | $100,001 TO $200,000 | 150000.5 | 0 | 0 | 54.770 | 0.4389331 | 150.0005 | 0.4423514 | 0.4423514 | 1 | 617 |
| 1248 | 0314016C | 1 | WHITE, NON-HISPANIC | 1 | 0 | 0 | 0 | 1 | 47 | 48 | DOCTORATE OR PROFESSIONAL DEGREE | BACHELOR’S DEGREE | 0 | 0 | 61.56 | 77.50 | $200,001 TO $200,001 | 200001.0 | 0 | 0 | 63.738 | 1.3621290 | 200.0010 | 0.4388899 | 0.4388899 | 1 | 56 |
| 375 | 0081019C | 0 | WHITE, NON-HISPANIC | 1 | 0 | 0 | 0 | 1 | 47 | 52 | MASTER’S DEGREE (MA, MS) | MASTER’S DEGREE (MA, MS) | 0 | 0 | 63.43 | 53.50 | $200,001 TO $200,001 | 200001.0 | 0 | 0 | 45.248 | -0.5412935 | 200.0010 | 0.4386094 | 0.4386094 | 1 | 56 |
| 329 | 0073012C | 0 | WHITE, NON-HISPANIC | 1 | 0 | 0 | 0 | 1 | 51 | 51 | BACHELOR’S DEGREE | BACHELOR’S DEGREE | 0 | 0 | 35.78 | 35.78 | $200,001 TO $200,001 | 200001.0 | 0 | 0 | 63.241 | 1.3109661 | 200.0010 | 0.4241175 | 0.4241175 | 1 | 399 |
| 601 | 0147016C | 0 | WHITE, NON-HISPANIC | 1 | 0 | 0 | 0 | 1 | 50 | 62 | SOME COLLEGE | SOME COLLEGE | 0 | 0 | 61.56 | 39.18 | $100,001 TO $200,000 | 150000.5 | 0 | 0 | 39.653 | -1.1172615 | 150.0005 | 0.4231485 | 0.4231485 | 1 | 55 |
| 1247 | 0314015C | 1 | WHITE, NON-HISPANIC | 1 | 0 | 0 | 0 | 1 | 50 | 58 | MASTER’S DEGREE (MA, MS) | DOCTORATE OR PROFESSIONAL DEGREE | 0 | 0 | 59.00 | 63.43 | $100,001 TO $200,000 | 150000.5 | 0 | 0 | 57.246 | 0.6938208 | 150.0005 | 0.4228320 | 0.4228320 | 1 | 55 |
| 3249 | 0812012C | 1 | WHITE, NON-HISPANIC | 1 | 0 | 0 | 0 | 1 | 44 | 64 | BACHELOR’S DEGREE | SOME COLLEGE | 0 | 0 | 35.78 | 53.50 | $200,001 TO $200,001 | 200001.0 | 0 | 0 | 55.364 | 0.5000814 | 200.0010 | 0.4206592 | 0.4206592 | 1 | 399 |
| 4289 | 1119006C | 1 | WHITE, NON-HISPANIC | 1 | 0 | 0 | 0 | 1 | 45 | 50 | MASTER’S DEGREE (MA, MS) | MASTER’S DEGREE (MA, MS) | 0 | 0 | 61.56 | 53.50 | $200,001 TO $200,001 | 200001.0 | 0 | 0 | 55.767 | 0.5415676 | 200.0010 | 0.4180896 | 0.4180896 | 1 | 605 |
| 3934 | 1006020C | 1 | WHITE, NON-HISPANIC | 1 | 0 | 0 | 0 | 1 | 47 | 54 | DOCTORATE OR PROFESSIONAL DEGREE | BACHELOR’S DEGREE | 0 | 0 | 35.78 | 59.00 | $200,001 TO $200,001 | 200001.0 | 0 | 0 | 68.129 | 1.8141530 | 200.0010 | 0.4175431 | 0.4175431 | 1 | 541 |
dt<-as.data.table(data_PSM)
sum1<-tapply(dt$w3income_1k, dt$catholic, summary)
sum2<-tapply(dt$p5numpla, dt$catholic, summary)
sum3<-tapply(dt$w3momed_hsb, dt$catholic, summary)
sumMat2<-matrix(0:0,ncol=3,nrow=4)
sumMat2[1,1]=sum1$`0`[4]
sumMat2[2,1]=sum1$`1`[4]
sumMat2[1,2]=sum2$`0`[4]
sumMat2[2,2]=sum2$`1`[4]
sumMat2[1,3]=sum3$`0`[4]
sumMat2[2,3]=sum3$`1`[4]
test1<-t.test(w3income_1k ~ catholic, data = data_PSM)
test2<-t.test(p5numpla ~ catholic, data = data_PSM)
test3<-t.test(w3momed_hsb ~ catholic, data = data_PSM)
sumMat2[4,1]<-test1$p.value
sumMat2[4,2]<-test2$p.value
sumMat2[4,3]<-test3$p.value
for(i in 1:3){
sumMat2[3,i]=sumMat[1,i]-sumMat[2,i]
}
colnames(sumMat2)<-c("w3income_1k",'p5numpla','w3momed_hsb')
rownames(sumMat2)<-c('Catholic mean','non-Catholic mean', 'difference in means', 'p-value')
sumMat2
| w3income_1k | p5numpla | w3momed_hsb | |
|---|---|---|---|
| Catholic mean | 82.6134172 | 1.0827957 | 0.1946237 |
| non-Catholic mean | 86.1806253 | 1.0731183 | 0.2053763 |
| difference in means | -20.7866967 | 0.0331276 | 0.1869331 |
| p-value | 0.0797688 | 0.4899029 | 0.5623800 |
In the new data set, the covariates are not significantly different meaning we were able to balance on observables. However, this does not guarantee that the unobservables are balanced as well. PSM allowed us to control for unobservables by predicting based on observable characteristics the likelihood of someone being in the treatment group. This attempts to overcome the counterfactual problem needed in the Ruben Causal Model. If it is unlikely that the control group would ever be in the treatment group than they are not ideal counterfactuals and your results will remain biased
p3 <- ggplot(data=data_PSM, aes(x=w3income_1k, group=as.factor(catholic), fill=catholic)) +
geom_density(adjust=1.5, alpha=.4) + ggtitle(label='PSM')
p3
test_PSM<-t.test(c5r2mtsc_std ~ catholic, data = data_PSM)
stargazer(test_PSM,type='html')
% Error: Unrecognized object type.
modelPSM<-lm(c5r2mtsc_std ~ catholic + w3income_1k+p5numpla+w3momed_hsb, data = data_PSM) #pvalue of
stargazer(modelPSM,type='html')
| Dependent variable: | |
| c5r2mtsc_std | |
| catholic | -0.164*** |
| (0.039) | |
| w3income_1k | 0.004*** |
| (0.0005) | |
| p5numpla | -0.154** |
| (0.065) | |
| w3momed_hsb | -0.341*** |
| (0.050) | |
| Constant | 0.263*** |
| (0.086) | |
| Observations | 1,860 |
| R2 | 0.090 |
| Adjusted R2 | 0.088 |
| Residual Std. Error | 0.844 (df = 1855) |
| F Statistic | 46.005*** (df = 4; 1855) |
| Note: | p<0.1; p<0.05; p<0.01 |
.
data$distancePSM<-Match_PSM$distance
p4 <- ggplot() + geom_density(data=data,aes(x=PSM_hand)) + geom_density(data=data,aes(x=distancePSM))
p4
#LASSO
library(glmnet)
dfx = subset(data, select = -c(catholic,childid, race, w3daded, w3momed, w3inccat, c5r2mtsc, c5r2mtsc_std, PSM_hand, distancePSM ) )
dfy= subset(data, select = catholic)
x=model.matrix(race_white~.,dfx)[,-1] #without the intercept column
y=dfy$catholic
nfolds=10
cvfit <- cv.glmnet(
x, y, family = "binomial",
nfolds = nfolds,
# foldid = foldid,
keep = TRUE # returns foldid
)
out=glmnet(x,y,alpha=1,lambda=cvfit$lambda.1se)
lasso.coef=predict(out,type="coefficients",s=cvfit$lambda.1se)
lasso.coef # Some are zero
15 x 1 sparse Matrix of class “dgCMatrix” s1 (Intercept) 5.594121e-02
race_black .
race_hispanic .
race_asian .
p5numpla .
p5hmage 2.223032e-03 p5hdage .
w3daded_hsb -2.536899e-02 w3momed_hsb -2.627530e-02 w3momscr
4.277209e-06 w3dadscr .
w3income 7.433612e-07 w3povrty -4.659162e-03 p5fstamp .
w3income_1k 2.393500e-08
#p5hmage, w3dataed_hsb, w3momed_hsb, w3income, w3poverty, w3momscr, w3income_1k
Using LASSO for variable selection, it greatly reduced the number of variables I used in the original logit model and the PSM model previously.
MatchML_PSM <- matchit(catholic ~ w3income_1k + w3momed_hsb + p5hmage + w3daded_hsb + w3momscr + w3income + w3povrty, data = data, family = "binomial")
data$distanceML<-MatchML_PSM$distance
p5 <- ggplot() + geom_density(data=data,aes(x=PSM_hand, colour='By Hand')) + geom_density(data=data ,aes(x=distancePSM, colour='PSM Match')) + geom_density(data=data,aes(x=distanceML, colour='LASSO')) +
scale_color_manual(name = "names", values = c("By Hand" = "turquoise", "PSM Match" = "orange",'LASSO'='blue'))
p5
data_lasso<-match.data(MatchML_PSM)
p6 <- ggplot(data=data_lasso, aes(x=w3income_1k, group=as.factor(catholic), fill=catholic)) +
geom_density(adjust=1.5, alpha=.4) + ggtitle(label='Lasso PSM')
p6
figure1 <- ggarrange(p1, p3, p6,
ncol = 1, nrow = 3)
figure1
As
clearly highlighted in figure1, PSM using LASSO creates a treatment and
control group that are nearly identical. This means that the results
from the ML assisted PSM are closer to the ideal setting in the Ruben
Causal Model than the previous methods.