#install.packages("MatchIt")
library(MatchIt)
## Warning: package 'MatchIt' was built under R version 3.5.3
library(dplyr)
## Warning: package 'dplyr' was built under R version 3.5.2
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(ggplot2)
## Warning: package 'ggplot2' was built under R version 3.5.1
library(knitr)
## Warning: package 'knitr' was built under R version 3.5.2
library(readxl)
## Warning: package 'readxl' was built under R version 3.5.2
minw <- read_excel("C://Users/Uriel/Downloads/Minimum Wage.xlsx")
names(minw)
## [1] "NJ.PA" "EmploymentPost" "EmploymentPre" "WagePre"
## [5] "BurgerKing" "KFC" "Roys" "Wendys"
# Boxplots
boxplot(minw$EmploymentPre~minw$NJ.PA, main="Employment Pre-treatment by State", xlab="NJ = 1, PA=0", ylab="Employment")

boxplot(minw$WagePre~minw$NJ.PA, main="Wage Pre-treatment by State", xlab="NJ = 1, PA=0", ylab="Hourly wage")

minw%>%
filter(NJ.PA==1)%>%
summary()
## NJ.PA EmploymentPost EmploymentPre WagePre
## Min. :1 Min. : 0.00 Min. : 3.00 Min. :4.250
## 1st Qu.:1 1st Qu.:11.00 1st Qu.:11.00 1st Qu.:4.250
## Median :1 Median :17.00 Median :15.75 Median :4.500
## Mean :1 Mean :17.22 Mean :17.05 Mean :4.606
## 3rd Qu.:1 3rd Qu.:22.50 3rd Qu.:20.38 3rd Qu.:4.870
## Max. :1 Max. :55.50 Max. :80.00 Max. :5.750
## BurgerKing KFC Roys Wendys
## Min. :0.0000 Min. :0.0000 Min. :0.0000 Min. :0.0000
## 1st Qu.:0.0000 1st Qu.:0.0000 1st Qu.:0.0000 1st Qu.:0.0000
## Median :0.0000 Median :0.0000 Median :0.0000 Median :0.0000
## Mean :0.4047 Mean :0.2241 Mean :0.2508 Mean :0.1204
## 3rd Qu.:1.0000 3rd Qu.:0.0000 3rd Qu.:0.5000 3rd Qu.:0.0000
## Max. :1.0000 Max. :1.0000 Max. :1.0000 Max. :1.0000
minw%>%
filter(NJ.PA==0)%>%
summary()
## NJ.PA EmploymentPost EmploymentPre WagePre
## Min. :0 Min. : 0.00 Min. : 4.5 Min. :4.250
## 1st Qu.:0 1st Qu.:11.50 1st Qu.:12.5 1st Qu.:4.250
## Median :0 Median :16.00 Median :17.0 Median :4.500
## Mean :0 Mean :17.76 Mean :20.1 Mean :4.629
## 3rd Qu.:0 3rd Qu.:22.50 3rd Qu.:25.0 3rd Qu.:5.000
## Max. :0 Max. :38.25 Max. :67.5 Max. :5.500
## BurgerKing KFC Roys Wendys
## Min. :0.0000 Min. :0.000 Min. :0.0000 Min. :0.0000
## 1st Qu.:0.0000 1st Qu.:0.000 1st Qu.:0.0000 1st Qu.:0.0000
## Median :0.0000 Median :0.000 Median :0.0000 Median :0.0000
## Mean :0.4521 Mean :0.137 Mean :0.2329 Mean :0.1781
## 3rd Qu.:1.0000 3rd Qu.:0.000 3rd Qu.:0.0000 3rd Qu.:0.0000
## Max. :1.0000 Max. :1.000 Max. :1.0000 Max. :1.0000
# Difference average employment between NJ and PA
t.test(minw$EmploymentPre~minw$NJ.PA)
##
## Welch Two Sample t-test
##
## data: minw$EmploymentPre by minw$NJ.PA
## t = 2.0835, df = 93.052, p-value = 0.03995
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
## 0.1430664 5.9585737
## sample estimates:
## mean in group 0 mean in group 1
## 20.09932 17.04849
# Logit regression to calculate the propensity score for each fast food joint, i.e. the predicted probability of being in NJ given the estimates of this model.
# Covariables chosen for the model: EmploymentPre, WagePre, BurgerKing, KFC, Roys, Wendys
psm<-glm(NJ.PA~EmploymentPre+WagePre+BurgerKing+KFC+Roys+Wendys, data=minw, family = binomial())
summary(psm)
##
## Call:
## glm(formula = NJ.PA ~ EmploymentPre + WagePre + BurgerKing +
## KFC + Roys + Wendys, family = binomial(), data = minw)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -2.0915 0.5108 0.6161 0.6866 1.1899
##
## Coefficients: (1 not defined because of singularities)
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) 1.96086 1.85955 1.054 0.2917
## EmploymentPre -0.02514 0.01381 -1.820 0.0687 .
## WagePre -0.09622 0.38992 -0.247 0.8051
## BurgerKing 0.30101 0.38847 0.775 0.4384
## KFC 0.62948 0.48992 1.285 0.1988
## Roys 0.47837 0.42316 1.130 0.2583
## Wendys NA NA NA NA
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 368.38 on 371 degrees of freedom
## Residual deviance: 360.83 on 366 degrees of freedom
## AIC: 372.83
##
## Number of Fisher Scoring iterations: 4
pred.prob<-data.frame(pscore=predict(psm, type="response"), NJ.PA=psm$model$NJ.PA) # Propensity score
kable(head(pred.prob, 10)) # First ten propensity scores and the fast food joint actual treatment status
| 0.6737966 |
0 |
| 0.7218960 |
0 |
| 0.5208633 |
0 |
| 0.7884461 |
0 |
| 0.8737248 |
0 |
| 0.8736046 |
0 |
| 0.8380529 |
0 |
| 0.5981610 |
0 |
| 0.7755902 |
0 |
| 0.7486568 |
0 |
# Region of common support by visual inspection of histograms
labs <- paste("State of fast food joint:", c("NJ", "PA"))
pred.prob %>%
mutate(NJ.PA = ifelse(NJ.PA == 1, labs[1], labs[2])) %>%
ggplot(aes(x = pscore)) + geom_histogram(color = "white") + facet_wrap(~NJ.PA) + xlab("Probability of being a NJ fast food joint") + theme_bw()
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

# Matching algorithm
# match1 is using ratio 1; match2 is using a caliper of 0.2
match1<-matchit(NJ.PA ~ EmploymentPre+WagePre+BurgerKing+KFC+Roys+Wendys, method = "nearest", ratio=1, data = minw)
## Warning in matchit2nearest(c(`1` = 0, `2` = 0, `3` = 0, `4` = 0, `5` = 0, :
## Fewer control than treated units and matching without replacement. Not all
## treated units will receive a match. Treated units will be matched in the
## order specified by m.order: largest
match2<-matchit(NJ.PA ~ EmploymentPre+WagePre+BurgerKing+KFC+Roys+Wendys, method = "nearest", caliper=0.2, data = minw)
## Warning in matchit2nearest(c(`1` = 0, `2` = 0, `3` = 0, `4` = 0, `5` = 0, :
## Fewer control than treated units and matching without replacement. Not all
## treated units will receive a match. Treated units will be matched in the
## order specified by m.order: largest
summary(match1)
##
## Call:
## matchit(formula = NJ.PA ~ EmploymentPre + WagePre + BurgerKing +
## KFC + Roys + Wendys, data = minw, method = "nearest", ratio = 1)
##
## Summary of balance for all data:
## Means Treated Means Control SD Control Mean Diff eQQ Med
## distance 0.8081 0.7859 0.0699 0.0223 0.0168
## EmploymentPre 17.0485 20.0993 11.7200 -3.0508 2.0000
## WagePre 4.6061 4.6286 0.3556 -0.0225 0.0000
## BurgerKing 0.4047 0.4521 0.5011 -0.0474 0.0000
## KFC 0.2241 0.1370 0.3462 0.0871 0.0000
## Roys 0.2508 0.2329 0.4256 0.0180 0.0000
## Wendys 0.1204 0.1781 0.3852 -0.0577 0.0000
## eQQ Mean eQQ Max
## distance 0.0209 0.0702
## EmploymentPre 3.0514 14.5000
## WagePre 0.0412 0.2500
## BurgerKing 0.0548 1.0000
## KFC 0.0822 1.0000
## Roys 0.0137 1.0000
## Wendys 0.0548 1.0000
##
##
## Summary of balance for matched data:
## Means Treated Means Control SD Control Mean Diff eQQ Med
## distance 0.8694 0.7859 0.0699 0.0835 0.0728
## EmploymentPre 9.1096 20.0993 11.7200 -10.9897 8.2500
## WagePre 4.5655 4.6286 0.3556 -0.0632 0.0000
## BurgerKing 0.0137 0.4521 0.5011 -0.4384 0.0000
## KFC 0.8630 0.1370 0.3462 0.7260 1.0000
## Roys 0.1233 0.2329 0.4256 -0.1096 0.0000
## Wendys 0.0000 0.1781 0.3852 -0.1781 0.0000
## eQQ Mean eQQ Max
## distance 0.0835 0.3305
## EmploymentPre 10.9897 50.5000
## WagePre 0.0659 0.2500
## BurgerKing 0.4384 1.0000
## KFC 0.7260 1.0000
## Roys 0.1096 1.0000
## Wendys 0.1781 1.0000
##
## Percent Balance Improvement:
## Mean Diff. eQQ Med eQQ Mean eQQ Max
## distance -274.9166 -333.0761 -300.0914 -371.1723
## EmploymentPre -260.2220 -312.5000 -260.1571 -248.2759
## WagePre -180.1321 0.0000 -59.8007 0.0000
## BurgerKing -825.3385 0.0000 -700.0000 0.0000
## KFC -733.6139 -Inf -783.3333 0.0000
## Roys -510.2041 0.0000 -700.0000 0.0000
## Wendys -208.7371 0.0000 -225.0000 0.0000
##
## Sample sizes:
## Control Treated
## All 73 299
## Matched 73 73
## Unmatched 0 226
## Discarded 0 0
summary(match2)
##
## Call:
## matchit(formula = NJ.PA ~ EmploymentPre + WagePre + BurgerKing +
## KFC + Roys + Wendys, data = minw, method = "nearest", caliper = 0.2)
##
## Summary of balance for all data:
## Means Treated Means Control SD Control Mean Diff eQQ Med
## distance 0.8081 0.7859 0.0699 0.0223 0.0168
## EmploymentPre 17.0485 20.0993 11.7200 -3.0508 2.0000
## WagePre 4.6061 4.6286 0.3556 -0.0225 0.0000
## BurgerKing 0.4047 0.4521 0.5011 -0.0474 0.0000
## KFC 0.2241 0.1370 0.3462 0.0871 0.0000
## Roys 0.2508 0.2329 0.4256 0.0180 0.0000
## Wendys 0.1204 0.1781 0.3852 -0.0577 0.0000
## eQQ Mean eQQ Max
## distance 0.0209 0.0702
## EmploymentPre 3.0514 14.5000
## WagePre 0.0412 0.2500
## BurgerKing 0.0548 1.0000
## KFC 0.0822 1.0000
## Roys 0.0137 1.0000
## Wendys 0.0548 1.0000
##
##
## Summary of balance for matched data:
## Means Treated Means Control SD Control Mean Diff eQQ Med
## distance 0.8064 0.7963 0.0547 0.0101 0.0108
## EmploymentPre 16.8587 18.2935 8.7675 -1.4348 1.5000
## WagePre 4.6788 4.6191 0.3598 0.0597 0.0000
## BurgerKing 0.2899 0.4348 0.4994 -0.1449 0.0000
## KFC 0.1739 0.1449 0.3546 0.0290 0.0000
## Roys 0.3768 0.2464 0.4341 0.1304 0.0000
## Wendys 0.1594 0.1739 0.3818 -0.0145 0.0000
## eQQ Mean eQQ Max
## distance 0.0101 0.0115
## EmploymentPre 1.4710 5.0000
## WagePre 0.0641 0.2500
## BurgerKing 0.1449 1.0000
## KFC 0.0290 1.0000
## Roys 0.1304 1.0000
## Wendys 0.0145 1.0000
##
## Percent Balance Improvement:
## Mean Diff. eQQ Med eQQ Mean eQQ Max
## distance 54.4959 36.0713 51.4404 83.5386
## EmploymentPre 52.9706 25.0000 51.7917 65.5172
## WagePre -164.8701 0.0000 -55.3565 0.0000
## BurgerKing -205.9317 0.0000 -164.4928 0.0000
## KFC 66.7193 0.0000 64.7343 0.0000
## Roys -626.2755 0.0000 -852.1739 0.0000
## Wendys 74.8742 0.0000 73.5507 0.0000
##
## Sample sizes:
## Control Treated
## All 73 299
## Matched 69 69
## Unmatched 4 230
## Discarded 0 0
plot(match1)


plot(match2)


matchdata<-match.data(match1)
dim(matchdata)
## [1] 146 10
matchdata2<-match.data(match2)
dim(matchdata2)
## [1] 138 10
# Visual inspection
fn_bal <- function(dta, variable) {
dta$variable <- dta[, variable]
if (variable == 'EmploymentPre') dta$variable <- dta$variable
dta$NJ.PA <- as.factor(dta$NJ.PA)
support <- c(min(dta$variable), max(dta$variable))
ggplot(dta, aes(x = distance, y = variable, color = NJ.PA)) +
geom_point(alpha = 0.2, size = 1.3) +
geom_smooth(method = "loess", se = F) +
xlab("Propensity score") +
ylab(variable) +
theme_bw() +
ylim(support)
}
library(gridExtra)
## Warning: package 'gridExtra' was built under R version 3.5.1
##
## Attaching package: 'gridExtra'
## The following object is masked from 'package:dplyr':
##
## combine
grid.arrange(
fn_bal(matchdata, "EmploymentPre"),
fn_bal(matchdata, "WagePre") + theme(legend.position = "none"),
fn_bal(matchdata, "BurgerKing"),
fn_bal(matchdata, "KFC") + theme(legend.position = "none"),
fn_bal(matchdata, "Roys"),
fn_bal(matchdata, "Wendys") + theme(legend.position = "none"),
nrow = 3, widths = c(1.3, 1.3)
)
## Warning: Removed 2 rows containing missing values (geom_smooth).
## Warning: Removed 19 rows containing missing values (geom_smooth).
## Warning: Removed 39 rows containing missing values (geom_smooth).
## Warning: Removed 53 rows containing missing values (geom_smooth).
## Warning: Removed 16 rows containing missing values (geom_smooth).

#install.packages("cobalt")
library(cobalt)
## Warning: package 'cobalt' was built under R version 3.5.3
##
## Attaching package: 'cobalt'
## The following object is masked from 'package:MatchIt':
##
## lalonde
# Checking balance before and after matching:
bal.tab(match1, m.threshold = 0.1, un = TRUE)
## Call
## matchit(formula = NJ.PA ~ EmploymentPre + WagePre + BurgerKing +
## KFC + Roys + Wendys, data = minw, method = "nearest", ratio = 1)
##
## Balance Measures
## Type Diff.Un Diff.Adj M.Threshold
## distance Distance 0.4121 1.5451
## EmploymentPre Contin. -0.3444 -1.2405 Not Balanced, >0.1
## WagePre Contin. -0.0658 -0.1842 Not Balanced, >0.1
## BurgerKing Binary -0.0474 -0.4384 Not Balanced, >0.1
## KFC Binary 0.0871 0.7260 Not Balanced, >0.1
## Roys Binary 0.0180 -0.1096 Not Balanced, >0.1
## Wendys Binary -0.0577 -0.1781 Not Balanced, >0.1
##
## Balance tally for mean differences
## count
## Balanced, <0.1 0
## Not Balanced, >0.1 6
##
## Variable with the greatest mean difference
## Variable Diff.Adj M.Threshold
## EmploymentPre -1.2405 Not Balanced, >0.1
##
## Sample sizes
## Control Treated
## All 73 299
## Matched 73 73
## Unmatched 0 226
bal.plot(match1, var.name = "distance")

bal.plot(match1, var.name = "distance", mirror = TRUE, type = "histogram")

minw_cov<-c("EmploymentPre", "WagePre", "BurgerKing", "KFC", "Roys", "Wendys")
matchdata %>%
group_by(NJ.PA) %>%
select(one_of(minw_cov)) %>%
summarise_all(funs(mean))
## Adding missing grouping variables: `NJ.PA`
## Warning: funs() is soft deprecated as of dplyr 0.8.0
## please use list() instead
##
## # Before:
## funs(name = f(.)
##
## # After:
## list(name = ~f(.))
## This warning is displayed once per session.
## # A tibble: 2 x 7
## NJ.PA EmploymentPre WagePre BurgerKing KFC Roys Wendys
## <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 0 20.1 4.63 0.452 0.137 0.233 0.178
## 2 1 9.11 4.57 0.0137 0.863 0.123 0
# Test for balance among covariates
lapply(minw_cov, function(v) {
t.test(matchdata[, v] ~ matchdata$NJ.PA)
})
## [[1]]
##
## Welch Two Sample t-test
##
## data: matchdata[, v] by matchdata$NJ.PA
## t = 7.7419, df = 82.158, p-value = 2.239e-11
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
## 8.165939 13.813513
## sample estimates:
## mean in group 0 mean in group 1
## 20.099315 9.109589
##
##
## [[2]]
##
## Welch Two Sample t-test
##
## data: matchdata[, v] by matchdata$NJ.PA
## t = 1.1296, df = 142.32, p-value = 0.2606
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
## -0.04736391 0.17366528
## sample estimates:
## mean in group 0 mean in group 1
## 4.628630 4.565479
##
##
## [[3]]
##
## Welch Two Sample t-test
##
## data: matchdata[, v] by matchdata$NJ.PA
## t = 7.2777, df = 79.831, p-value = 2.087e-10
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
## 0.3184859 0.5582264
## sample estimates:
## mean in group 0 mean in group 1
## 0.45205479 0.01369863
##
##
## [[4]]
##
## Welch Two Sample t-test
##
## data: matchdata[, v] by matchdata$NJ.PA
## t = -12.669, df = 144, p-value < 2.2e-16
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
## -0.8392960 -0.6127588
## sample estimates:
## mean in group 0 mean in group 1
## 0.1369863 0.8630137
##
##
## [[5]]
##
## Welch Two Sample t-test
##
## data: matchdata[, v] by matchdata$NJ.PA
## t = 1.7366, df = 135.78, p-value = 0.08473
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
## -0.01520929 0.23438737
## sample estimates:
## mean in group 0 mean in group 1
## 0.2328767 0.1232877
##
##
## [[6]]
##
## Welch Two Sample t-test
##
## data: matchdata[, v] by matchdata$NJ.PA
## t = 3.9497, df = 72, p-value = 0.0001805
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
## 0.08820148 0.26796290
## sample estimates:
## mean in group 0 mean in group 1
## 0.1780822 0.0000000
# Estimating Treatment Effects
with(matchdata, t.test(EmploymentPre ~ NJ.PA))
##
## Welch Two Sample t-test
##
## data: EmploymentPre by NJ.PA
## t = 7.7419, df = 82.158, p-value = 2.239e-11
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
## 8.165939 13.813513
## sample estimates:
## mean in group 0 mean in group 1
## 20.099315 9.109589
# Estimating Treatment Effects via reggresion
lm_treat1 <- lm(EmploymentPre ~ NJ.PA, data = matchdata)
summary(lm_treat1)
##
## Call:
## lm(formula = EmploymentPre ~ NJ.PA, data = matchdata)
##
## Residuals:
## Min 1Q Median 3Q Max
## -15.599 -4.037 -0.979 1.901 47.401
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 20.099 1.004 20.024 < 2e-16 ***
## NJ.PA -10.990 1.420 -7.742 1.58e-12 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 8.576 on 144 degrees of freedom
## Multiple R-squared: 0.2939, Adjusted R-squared: 0.289
## F-statistic: 59.94 on 1 and 144 DF, p-value: 1.583e-12