library(dplyr)
library(stargazer)
library(ggplot2)
# read data
data <- read.csv("PSM.csv")
# make smaller datset with necessary variables only
data <- data %>% select(c5r2mtsc_std, catholic, w3income_1k, p5numpla, w3momed_hsb)
sum1 <- summary(data$c5r2mtsc_std[data$catholic == 1])
sum1
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## -2.5922 -0.3232 0.2510 0.2197 0.7729 3.0552
sum2 <- summary(data$c5r2mtsc_std[data$catholic == 0])
sum2
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## -3.1036 -0.4577 0.2207 0.1631 0.8084 3.4337
This shows that on avreage, catholics tend to have higher score than non-catholics.
# Running regression: math scores on the catholic dummy
reg1 <- lm(c5r2mtsc_std ~ catholic, data)
stargazer(reg1, type = "text")
##
## ===============================================
## Dependent variable:
## ---------------------------
## c5r2mtsc_std
## -----------------------------------------------
## catholic 0.057
## (0.034)
##
## Constant 0.163***
## (0.014)
##
## -----------------------------------------------
## Observations 5,429
## R2 0.0005
## Adjusted R2 0.0003
## Residual Std. Error 0.955 (df = 5427)
## F Statistic 2.704 (df = 1; 5427)
## ===============================================
## Note: *p<0.1; **p<0.05; ***p<0.01
Results show that the effect of variable catholic on standardized math score is positive but insignificant.
# Another regression with all variables
reg2 <- lm(c5r2mtsc_std ~ ., data)
stargazer(reg2, type = "text")
##
## ===============================================
## Dependent variable:
## ---------------------------
## c5r2mtsc_std
## -----------------------------------------------
## catholic -0.127***
## (0.033)
##
## w3income_1k 0.005***
## (0.0003)
##
## p5numpla -0.101***
## (0.036)
##
## w3momed_hsb -0.374***
## (0.027)
##
## Constant 0.073
## (0.049)
##
## -----------------------------------------------
## Observations 5,429
## R2 0.124
## Adjusted R2 0.124
## Residual Std. Error 0.894 (df = 5424)
## F Statistic 192.272*** (df = 4; 5424)
## ===============================================
## Note: *p<0.1; **p<0.05; ***p<0.01
However, when other variables were included in the model, we find negative effect of catholic variable on math score and it is statistically significant at 1% alpha. Results also show significant negative effect of higher education level of mother and students staying in many places over last 4 months. Additionally, there is significant positive effect of higher family income on math score.
# Do the means of the covariates (income, mother’s education etc) differ between catholic vs. non-catholic schools?
mean <- data %>%
group_by(catholic) %>% # grouping by catholic variable
summarise(across(c(c5r2mtsc_std, w3income_1k, p5numpla, w3momed_hsb), list(mean = mean)))
colnames(mean) <- c("Catholic", "Scores", "FamIncome", "NumPlaces", "MomEducation")
mean
## # A tibble: 2 x 5
## Catholic Scores FamIncome NumPlaces MomEducation
## <int> <dbl> <dbl> <dbl> <dbl>
## 1 0 0.163 65.4 1.11 0.392
## 2 1 0.220 86.2 1.07 0.205
# testing using t test
# Family income
t.test(w3income_1k ~ catholic, data = data)$p.value
## [1] 1.212805e-37
# Number of places
t.test(p5numpla ~ catholic, data = data)$p.value
## [1] 0.001791557
# mother education
t.test(w3momed_hsb ~ catholic, data = data)$p.value
## [1] 1.530072e-33
Results show that the p-value is less than 0.05, thus difference in means are signfiicantly different. Problem with identification strategy: endogeneity and omitted variable problem can be of major concern in the simple regressions.
#3. Run a logit regression to predict catholic based on the covariates we kept
logit <- glm(catholic ~ w3income_1k + p5numpla + w3momed_hsb, family = binomial(), data = data)
summary(logit)
##
## Call:
## glm(formula = catholic ~ w3income_1k + p5numpla + w3momed_hsb,
## family = binomial(), data = data)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -1.0172 -0.6461 -0.5240 -0.4122 2.3672
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -1.6702950 0.1576164 -10.597 < 2e-16 ***
## w3income_1k 0.0077921 0.0008115 9.602 < 2e-16 ***
## p5numpla -0.2774346 0.1232930 -2.250 0.0244 *
## w3momed_hsb -0.6504757 0.0915710 -7.104 1.22e-12 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 4972.4 on 5428 degrees of freedom
## Residual deviance: 4750.6 on 5425 degrees of freedom
## AIC: 4758.6
##
## Number of Fisher Scoring iterations: 4
# Make a prediction for who is likely to go to catholic school, using this model
data$prediction <- predict(logit, type = "response")
# head of the predicted data
head(data)
## c5r2mtsc_std catholic w3income_1k p5numpla w3momed_hsb prediction
## 1 0.9817533 0 62.5005 1 0 0.1883576
## 2 0.5943775 0 45.0005 1 0 0.1683901
## 3 0.4906106 0 62.5005 1 0 0.1883576
## 4 1.4512779 1 87.5005 1 0 0.2199574
## 5 2.5956991 0 150.0005 1 0 0.3145556
## 6 0.3851966 0 62.5005 1 0 0.1883576
# Check that there is common support
# visual check with plot densities
ggplot(data = data, aes(x = prediction, fill = as.factor(catholic)))+
geom_density(alpha=0.5)
# plot income against prediction
library(ggplot2)
ggplot(data = data, aes(x = prediction, y= w3income_1k, color = as.factor(catholic)))+
geom_point(data = data, aes(prediction, w3income_1k))+
geom_smooth(data = data, aes(x = prediction, y= w3income_1k, color = as.factor(catholic), group = catholic), method = "loess")
library(MatchIt)
# running same as we did up in logit reg
psm <- matchit(catholic ~ w3income_1k + p5numpla + w3momed_hsb, family = binomial(), data = data)
psm_data <- match.data(psm)
dim(psm_data)
## [1] 1860 9
There are 1860 observations now, 930 students each in the catholic category. The orginal dataset had a total of 5429 observations.
# Now, sort the data by distance, and show the 10 first observations
newpsm_data <- arrange(psm_data, distance)
head(newpsm_data, n= 10)
## c5r2mtsc_std catholic w3income_1k p5numpla w3momed_hsb prediction distance
## 1 -0.3835843 0 17.5005 2 1 0.06069529 0.06069529
## 2 -2.5922340 1 17.5005 2 1 0.06069529 0.06069529
## 3 -2.3690526 0 22.5005 2 1 0.06295488 0.06295488
## 4 0.5271555 1 22.5005 2 1 0.06295488 0.06295488
## 5 -0.2195955 0 27.5005 2 1 0.06529274 0.06529274
## 6 -1.6878765 1 27.5005 2 1 0.06529274 0.06529274
## 7 -0.2550080 0 32.5005 2 1 0.06771114 0.06771114
## 8 -1.2722942 1 32.5005 2 1 0.06771114 0.06771114
## 9 -0.7403859 0 37.5005 2 1 0.07021240 0.07021240
## 10 -0.7841369 1 37.5005 2 1 0.07021240 0.07021240
## weights subclass
## 1 1 598
## 2 1 598
## 3 1 222
## 4 1 222
## 5 1 272
## 6 1 272
## 7 1 859
## 8 1 859
## 9 1 728
## 10 1 728
# Again, means of the covariates differ between catholic vs. non-catholic schools?
newpsm_data %>%
group_by(catholic) %>%
dplyr::summarise(Score = mean(c5r2mtsc_std),
FamIncome = mean(w3income_1k),
Places = mean(p5numpla),
MomEdu = mean(w3momed_hsb),
Predict = mean(prediction),
Distance = mean(distance),
Weights = mean(weights))
## # A tibble: 2 x 8
## catholic Score FamIncome Places MomEdu Predict Distance Weights
## <int> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 0 0.335 86.2 1.07 0.205 0.203 0.203 1
## 2 1 0.220 86.2 1.07 0.205 0.203 0.203 1
# testing using t test
# Family income
t.test(w3income_1k ~ catholic, data = newpsm_data)$p.value
## [1] 1
# Number of places
t.test(p5numpla ~ catholic, data = newpsm_data)$p.value
## [1] 1
# mother education
t.test(w3momed_hsb ~ catholic, data = newpsm_data)$p.value
## [1] 1
Results show that differences in mean are statistically insignificant. Identification strategy: May still have endogeneity and omitted variable issue
ggplot(data = newpsm_data, aes(x = prediction, y= w3income_1k, color = as.factor(catholic)))+
geom_point(data = newpsm_data, aes(prediction, w3income_1k))+
geom_smooth(data = newpsm_data, aes(x = prediction, y= w3income_1k, color = as.factor(catholic), group = catholic), method = "loess")
# Yes, they are identical
# Compare means of the two groups
# Running regression: math scores on the catholic dummy
reg3 <- lm(c5r2mtsc_std ~ catholic, newpsm_data)
stargazer(reg3, type = "text")
##
## ===============================================
## Dependent variable:
## ---------------------------
## c5r2mtsc_std
## -----------------------------------------------
## catholic -0.115***
## (0.041)
##
## Constant 0.335***
## (0.029)
##
## -----------------------------------------------
## Observations 1,860
## R2 0.004
## Adjusted R2 0.004
## Residual Std. Error 0.883 (df = 1858)
## F Statistic 7.867*** (df = 1; 1858)
## ===============================================
## Note: *p<0.1; **p<0.05; ***p<0.01
Results show that now the effect of variable catholic on standardized math score is negative and significant.
# Another regression with all variables
reg4 <- lm(c5r2mtsc_std ~ catholic + w3income_1k + p5numpla + w3momed_hsb, newpsm_data)
stargazer(reg4, type = "text")
##
## ===============================================
## Dependent variable:
## ---------------------------
## c5r2mtsc_std
## -----------------------------------------------
## catholic -0.115***
## (0.039)
##
## w3income_1k 0.004***
## (0.0005)
##
## p5numpla -0.091
## (0.070)
##
## w3momed_hsb -0.366***
## (0.050)
##
## Constant 0.151*
## (0.091)
##
## -----------------------------------------------
## Observations 1,860
## R2 0.088
## Adjusted R2 0.086
## Residual Std. Error 0.846 (df = 1855)
## F Statistic 44.707*** (df = 4; 1855)
## ===============================================
## Note: *p<0.1; **p<0.05; ***p<0.01
When other variables were included in the model, we still find significant negative effect of catholic variable on math score 1% alpha. Results also show significant negative effect of higher education level of mother and significant positive effect of higher family income on math score. The effect of students staying in many places over last 4 months was negative and insignificant.
# predictions from logit reg
red_log <- fitted(logit)
plot(psm$distance, red_log)
# Run your same matchit command but with distance= your logit predictions. See that this is the same.
RunPsm <- matchit(catholic ~ w3income_1k + p5numpla + w3momed_hsb, family = binomial(), data = data, distance = red_log)
RunPsm_data <- match.data(RunPsm)
dim(RunPsm_data)
## [1] 1860 9
# We got the same dimension as in logit matchit
# Now, use all data
newdata <- read.csv("PSM.csv")
head(newdata)
## childid catholic race race_white race_black race_hispanic
## 1 0001002C 0 WHITE, NON-HISPANIC 1 0 0
## 2 0001004C 0 WHITE, NON-HISPANIC 1 0 0
## 3 0001010C 0 WHITE, NON-HISPANIC 1 0 0
## 4 0001011C 1 WHITE, NON-HISPANIC 1 0 0
## 5 0001012C 0 WHITE, NON-HISPANIC 1 0 0
## 6 0002004C 0 WHITE, NON-HISPANIC 1 0 0
## race_asian p5numpla p5hmage p5hdage w3daded
## 1 0 1 47 45 DOCTORATE OR PROFESSIONAL DEGREE
## 2 0 1 41 48 BACHELOR'S DEGREE
## 3 0 1 43 55 MASTER'S DEGREE (MA, MS)
## 4 0 1 38 39 BACHELOR'S DEGREE
## 5 0 1 47 57 DOCTORATE OR PROFESSIONAL DEGREE
## 6 0 1 41 41 BACHELOR'S DEGREE
## w3momed w3daded_hsb w3momed_hsb w3momscr
## 1 SOME COLLEGE 0 0 53.50
## 2 GRADUATE/PROFESSIONAL SCHOOL-NO DEGREE 0 0 34.95
## 3 GRADUATE/PROFESSIONAL SCHOOL-NO DEGREE 0 0 63.43
## 4 GRADUATE/PROFESSIONAL SCHOOL-NO DEGREE 0 0 53.50
## 5 MASTER'S DEGREE (MA, MS) 0 0 61.56
## 6 BACHELOR'S DEGREE 0 0 38.18
## w3dadscr w3inccat w3income w3povrty p5fstamp c5r2mtsc
## 1 77.5 $50,001 TO $75,000 62500.5 0 0 60.043
## 2 53.5 $40,001 TO $50,000 45000.5 0 0 56.280
## 3 53.5 $50,001 TO $75,000 62500.5 0 0 55.272
## 4 53.5 $75,001 TO $100,000 87500.5 0 0 64.604
## 5 77.5 $100,001 TO $200,000 150000.5 0 0 75.721
## 6 53.5 $50,001 TO $75,000 62500.5 0 0 54.248
## c5r2mtsc_std w3income_1k
## 1 0.9817533 62.5005
## 2 0.5943775 45.0005
## 3 0.4906106 62.5005
## 4 1.4512779 87.5005
## 5 2.5956991 150.0005
## 6 0.3851966 62.5005
# keep only numeric variables
newdata <- newdata %>% select(c5r2mtsc_std, catholic, race_white, race_black, race_hispanic, race_asian, p5numpla, p5hmage, p5hdage, w3daded_hsb, w3momed_hsb, w3momscr, w3dadscr, w3income, w3povrty, p5fstamp, c5r2mtsc, w3income_1k)
head(newdata)
## c5r2mtsc_std catholic race_white race_black race_hispanic race_asian p5numpla
## 1 0.9817533 0 1 0 0 0 1
## 2 0.5943775 0 1 0 0 0 1
## 3 0.4906106 0 1 0 0 0 1
## 4 1.4512779 1 1 0 0 0 1
## 5 2.5956991 0 1 0 0 0 1
## 6 0.3851966 0 1 0 0 0 1
## p5hmage p5hdage w3daded_hsb w3momed_hsb w3momscr w3dadscr w3income w3povrty
## 1 47 45 0 0 53.50 77.5 62500.5 0
## 2 41 48 0 0 34.95 53.5 45000.5 0
## 3 43 55 0 0 63.43 53.5 62500.5 0
## 4 38 39 0 0 53.50 53.5 87500.5 0
## 5 47 57 0 0 61.56 77.5 150000.5 0
## 6 41 41 0 0 38.18 53.5 62500.5 0
## p5fstamp c5r2mtsc w3income_1k
## 1 0 60.043 62.5005
## 2 0 56.280 45.0005
## 3 0 55.272 62.5005
## 4 0 64.604 87.5005
## 5 0 75.721 150.0005
## 6 0 54.248 62.5005
# run lasso
library(glmnet)
y <- data.matrix(newdata$catholic)
x <- data.matrix(newdata[,c(1, 3:17)])
fit <- cv.glmnet(x, y, family = "gaussian", alpha = 1)
lasso <- glmnet(x, y, family = "gaussian", alpha=1, lambda = fit$lambda.1se)
coef(lasso, s= "lambda.lse")
## 17 x 1 sparse Matrix of class "dgCMatrix"
## 1
## (Intercept) 7.516499e-02
## c5r2mtsc_std .
## race_white .
## race_black .
## race_hispanic .
## race_asian .
## p5numpla .
## p5hmage 1.700277e-03
## p5hdage .
## w3daded_hsb -2.147176e-02
## w3momed_hsb -2.195889e-02
## w3momscr .
## w3dadscr .
## w3income 7.035449e-07
## w3povrty .
## p5fstamp .
## c5r2mtsc .
plot(fit)
Using lambda.1se as the regularization parameter, we would choose only 4 out of 16 covariates. For the PSM command, I would choose education level and income. Yes it differ.
# Generate a prediction from the “best lasso” model
pred <- predict(lasso, newx = x, type = "response")
# use matchit
match <- matchit(catholic ~ p5hmage + w3daded_hsb + w3momed_hsb + w3income, data = newdata, family = binomial(), distance = as.numeric(pred))
dat.match <- match.data(match)
# make a plot
ggplot(data = dat.match, aes(x = distance, y = w3income_1k, col = as.factor(catholic))) +
geom_point(data = dat.match, aes(distance, w3income_1k)) +
geom_smooth(data = dat.match, aes(distance, w3income_1k))
# run reg
reg5 <- lm(c5r2mtsc_std ~ catholic + w3income_1k + w3momed_hsb,
data = dat.match)
summary(reg5)
##
## Call:
## lm(formula = c5r2mtsc_std ~ catholic + w3income_1k + w3momed_hsb,
## data = dat.match)
##
## Residuals:
## Min 1Q Median 3Q Max
## -3.4722 -0.5200 0.0344 0.5639 2.8543
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 0.170945 0.050639 3.376 0.000751 ***
## catholic -0.218140 0.039071 -5.583 2.71e-08 ***
## w3income_1k 0.003970 0.000458 8.669 < 2e-16 ***
## w3momed_hsb -0.366480 0.049857 -7.351 2.94e-13 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.8424 on 1856 degrees of freedom
## Multiple R-squared: 0.09442, Adjusted R-squared: 0.09296
## F-statistic: 64.51 on 3 and 1856 DF, p-value: < 2.2e-16