Hw10

data<-read.csv("G:/My Drive/Seans Drive/PhD/Classes/AQME/HW/PSM/ecls.csv")
t.test(c5r2mtsc_std ~ catholic, data = data)
diff_cath<-lm(c5r2mtsc_std ~ catholic, data = data) #pvalue of .1
stargazer(diff_cath, type='html')
model1<-lm(c5r2mtsc_std ~ catholic + w3income_1k+p5numpla+w3momed_hsb, data = data) #pvalue of .1
stargazer(model1,type='html')

As seen above, in the simple linear model, individuals who are catholic has a decrease in their scores of 0.1273899. However, being catholic is an endogenous choice and therefore the coefficient is likely to be bias

dt<-as.data.table(data)
sum1<-tapply(dt$w3income_1k, data$catholic, summary) 
sum2<-tapply(dt$p5numpla, data$catholic, summary) 
sum3<-tapply(dt$w3momed_hsb, data$catholic, summary) 
sumMat<-matrix(0:0,ncol=3,nrow=4)
sumMat[1,1]=sum1$`0`[4]
sumMat[2,1]=sum1$`1`[4]
sumMat[1,2]=sum2$`0`[4]
sumMat[2,2]=sum2$`1`[4]
sumMat[1,3]=sum3$`0`[4]
sumMat[2,3]=sum3$`1`[4]
test1<-t.test(w3income_1k ~ catholic, data = data)
test2<-t.test(p5numpla ~ catholic, data = data)
test3<-t.test(w3momed_hsb ~ catholic, data = data)
sumMat[4,1]<-test1$p.value
sumMat[4,2]<-test2$p.value
sumMat[4,3]<-test3$p.value

for(i in 1:3){
  sumMat[3,i]=sumMat[1,i]-sumMat[2,i]
}
colnames(sumMat)<-c("w3income_1k",'p5numpla','w3momed_hsb')
rownames(sumMat)<-c('Catholic mean','non-Catholic mean', 'difference in means', 'p-value')

sumMat

	w3income_1k	p5numpla	w3momed_hsb
Catholic mean	65.39393	1.1062458	0.3923094
non-Catholic mean	86.18063	1.0731183	0.2053763
difference in means	-20.78670	0.0331276	0.1869331
p-value	0.00000	0.0017916	0.0000000

From the table above we see that the covariates are all statistically significant between the two groups, catholic and non-catholic. This would indicate that the treated group (catholic) is not similar to the control group (non-catholic) based upon observables that impact the outcome variable. To overcome this, we can use propensity score matching to synthetically balance our two groups.

#all of them are statistically different
PSM_logit <- glm(catholic ~ w3income_1k + p5numpla + w3momed_hsb + race_white + race_black + race_hispanic + race_asian + p5hmage + p5hdage + w3daded_hsb + w3momscr + w3dadscr + w3income + w3povrty + p5fstamp, data = data, family = "binomial")
data$PSM_hand=predict(PSM_logit,type="response")
p1 <- ggplot(data=data, aes(x=PSM_hand, group=as.factor(catholic), fill=catholic)) +
  geom_density(adjust=1.5, alpha=.4) 
p1

p2 <- ggplot(data=data, aes(x=w3income_1k, group=as.factor(catholic), fill=catholic)) +
  geom_density(adjust=1.5, alpha=.4) + ggtitle(label='By Hand PSM')
p2

Match_PSM <- matchit(catholic ~ w3income_1k + p5numpla + w3momed_hsb + race_white + race_black + race_hispanic + race_asian + p5hmage + p5hdage + w3daded_hsb + w3momscr + w3dadscr + w3income + w3povrty + p5fstamp, data = data, family = "binomial")
data_PSM<-match.data(Match_PSM)
data_PSM<-data_PSM[order(-data_PSM$distance),]
head(data_PSM, 10)

	childid	catholic	race	race_white	p5numpla	p5hmage	p5hdage	w3daded	w3momed	w3momscr	w3dadscr	w3inccat	w3income	c5r2mtsc	c5r2mtsc_std	w3income_1k	PSM_hand	distance	weights	subclass
4301	1119022C	1	WHITE, NON-HISPANIC	1	1	48	52	DOCTORATE OR PROFESSIONAL DEGREE	DOCTORATE OR PROFESSIONAL DEGREE	59.00	59.00	$200,001 TO $200,001	200001.0	57.484	0.7183213	200.0010	0.4435402	0.4435402	1	617
255	0052015C	0	WHITE, NON-HISPANIC	1	1	53	55	DOCTORATE OR PROFESSIONAL DEGREE	MASTER’S DEGREE (MA, MS)	61.56	72.10	$100,001 TO $200,000	150000.5	54.770	0.4389331	150.0005	0.4423514	0.4423514	1	617
1248	0314016C	1	WHITE, NON-HISPANIC	1	1	47	48	DOCTORATE OR PROFESSIONAL DEGREE	BACHELOR’S DEGREE	61.56	77.50	$200,001 TO $200,001	200001.0	63.738	1.3621290	200.0010	0.4388899	0.4388899	1	56
375	0081019C	0	WHITE, NON-HISPANIC	1	1	47	52	MASTER’S DEGREE (MA, MS)	MASTER’S DEGREE (MA, MS)	63.43	53.50	$200,001 TO $200,001	200001.0	45.248	-0.5412935	200.0010	0.4386094	0.4386094	1	56
329	0073012C	0	WHITE, NON-HISPANIC	1	1	51	51	BACHELOR’S DEGREE	BACHELOR’S DEGREE	35.78	35.78	$200,001 TO $200,001	200001.0	63.241	1.3109661	200.0010	0.4241175	0.4241175	1	399
601	0147016C	0	WHITE, NON-HISPANIC	1	1	50	62	SOME COLLEGE	SOME COLLEGE	61.56	39.18	$100,001 TO $200,000	150000.5	39.653	-1.1172615	150.0005	0.4231485	0.4231485	1	55
1247	0314015C	1	WHITE, NON-HISPANIC	1	1	50	58	MASTER’S DEGREE (MA, MS)	DOCTORATE OR PROFESSIONAL DEGREE	59.00	63.43	$100,001 TO $200,000	150000.5	57.246	0.6938208	150.0005	0.4228320	0.4228320	1	55
3249	0812012C	1	WHITE, NON-HISPANIC	1	1	44	64	BACHELOR’S DEGREE	SOME COLLEGE	35.78	53.50	$200,001 TO $200,001	200001.0	55.364	0.5000814	200.0010	0.4206592	0.4206592	1	399
4289	1119006C	1	WHITE, NON-HISPANIC	1	1	45	50	MASTER’S DEGREE (MA, MS)	MASTER’S DEGREE (MA, MS)	61.56	53.50	$200,001 TO $200,001	200001.0	55.767	0.5415676	200.0010	0.4180896	0.4180896	1	605
3934	1006020C	1	WHITE, NON-HISPANIC	1	1	47	54	DOCTORATE OR PROFESSIONAL DEGREE	BACHELOR’S DEGREE	35.78	59.00	$200,001 TO $200,001	200001.0	68.129	1.8141530	200.0010	0.4175431	0.4175431	1	541

dt<-as.data.table(data_PSM)
sum1<-tapply(dt$w3income_1k, dt$catholic, summary) 
sum2<-tapply(dt$p5numpla, dt$catholic, summary) 
sum3<-tapply(dt$w3momed_hsb, dt$catholic, summary) 
sumMat2<-matrix(0:0,ncol=3,nrow=4)
sumMat2[1,1]=sum1$`0`[4]
sumMat2[2,1]=sum1$`1`[4]
sumMat2[1,2]=sum2$`0`[4]
sumMat2[2,2]=sum2$`1`[4]
sumMat2[1,3]=sum3$`0`[4]
sumMat2[2,3]=sum3$`1`[4]
test1<-t.test(w3income_1k ~ catholic, data = data_PSM)
test2<-t.test(p5numpla ~ catholic, data = data_PSM)
test3<-t.test(w3momed_hsb ~ catholic, data = data_PSM)
sumMat2[4,1]<-test1$p.value
sumMat2[4,2]<-test2$p.value
sumMat2[4,3]<-test3$p.value

for(i in 1:3){
  sumMat2[3,i]=sumMat[1,i]-sumMat[2,i]
}
colnames(sumMat2)<-c("w3income_1k",'p5numpla','w3momed_hsb')
rownames(sumMat2)<-c('Catholic mean','non-Catholic mean', 'difference in means', 'p-value')

sumMat2

	w3income_1k	p5numpla	w3momed_hsb
Catholic mean	82.6134172	1.0827957	0.1946237
non-Catholic mean	86.1806253	1.0731183	0.2053763
difference in means	-20.7866967	0.0331276	0.1869331
p-value	0.0797688	0.4899029	0.5623800

In the new data set, the covariates are not significantly different meaning we were able to balance on observables. However, this does not guarantee that the unobservables are balanced as well. PSM allowed us to control for unobservables by predicting based on observable characteristics the likelihood of someone being in the treatment group. This attempts to overcome the counterfactual problem needed in the Ruben Causal Model. If it is unlikely that the control group would ever be in the treatment group than they are not ideal counterfactuals and your results will remain biased

p3 <- ggplot(data=data_PSM, aes(x=w3income_1k, group=as.factor(catholic), fill=catholic)) +
  geom_density(adjust=1.5, alpha=.4) + ggtitle(label='PSM')
p3

test_PSM<-t.test(c5r2mtsc_std ~ catholic, data = data_PSM)
stargazer(test_PSM,type='html')

% Error: Unrecognized object type.

modelPSM<-lm(c5r2mtsc_std ~ catholic + w3income_1k+p5numpla+w3momed_hsb, data = data_PSM) #pvalue of 
stargazer(modelPSM,type='html')


	Dependent variable:

	c5r2mtsc_std

catholic	-0.164^***
	(0.039)

w3income_1k	0.004^***
	(0.0005)

p5numpla	-0.154^**
	(0.065)

w3momed_hsb	-0.341^***
	(0.050)

Constant	0.263^***
	(0.086)


Observations	1,860
R²	0.090
Adjusted R²	0.088
Residual Std. Error	0.844 (df = 1855)
F Statistic	46.005^*** (df = 4; 1855)

Note:	p<0.1; p<0.05; p<0.01

data$distancePSM<-Match_PSM$distance
p4 <- ggplot() + geom_density(data=data,aes(x=PSM_hand)) + geom_density(data=data,aes(x=distancePSM))
p4

#LASSO
library(glmnet)
dfx = subset(data, select = -c(catholic,childid, race, w3daded, w3momed, w3inccat, c5r2mtsc, c5r2mtsc_std, PSM_hand, distancePSM ) )
dfy= subset(data, select = catholic)
x=model.matrix(race_white~.,dfx)[,-1] #without the intercept column
y=dfy$catholic
nfolds=10
cvfit <- cv.glmnet(
  x, y, family = "binomial", 
  nfolds = nfolds,
  # foldid = foldid,
  keep = TRUE  # returns foldid 
)
out=glmnet(x,y,alpha=1,lambda=cvfit$lambda.1se)
lasso.coef=predict(out,type="coefficients",s=cvfit$lambda.1se)
lasso.coef # Some are zero

15 x 1 sparse Matrix of class “dgCMatrix” s1 (Intercept) 5.594121e-02 race_black .
race_hispanic .
race_asian .
p5numpla .
p5hmage 2.223032e-03 p5hdage .
w3daded_hsb -2.536899e-02 w3momed_hsb -2.627530e-02 w3momscr 4.277209e-06 w3dadscr .
w3income 7.433612e-07 w3povrty -4.659162e-03 p5fstamp .
w3income_1k 2.393500e-08

#p5hmage, w3dataed_hsb, w3momed_hsb, w3income, w3poverty, w3momscr, w3income_1k

Using LASSO for variable selection, it greatly reduced the number of variables I used in the original logit model and the PSM model previously.

MatchML_PSM <- matchit(catholic ~ w3income_1k + w3momed_hsb + p5hmage + w3daded_hsb + w3momscr  + w3income + w3povrty, data = data, family = "binomial")

data$distanceML<-MatchML_PSM$distance
p5 <- ggplot() + geom_density(data=data,aes(x=PSM_hand, colour='By Hand')) + geom_density(data=data ,aes(x=distancePSM, colour='PSM Match')) + geom_density(data=data,aes(x=distanceML, colour='LASSO')) +
  scale_color_manual(name = "names", values = c("By Hand" = "turquoise", "PSM Match" = "orange",'LASSO'='blue'))
p5

data_lasso<-match.data(MatchML_PSM)

p6 <- ggplot(data=data_lasso, aes(x=w3income_1k, group=as.factor(catholic), fill=catholic)) +
  geom_density(adjust=1.5, alpha=.4) + ggtitle(label='Lasso PSM')
p6

figure1 <- ggarrange(p1, p3, p6,
                     ncol = 1, nrow = 3)
figure1

As clearly highlighted in figure1, PSM using LASSO creates a treatment and control group that are nearly identical. This means that the results from the ML assisted PSM are closer to the ideal setting in the Ruben Causal Model than the previous methods.

Hw10

Sean

2022-04-17