PSM_Assignment 2

#install.packages("MatchIt")
library(MatchIt)

## Warning: package 'MatchIt' was built under R version 3.5.3

library(dplyr)

## Warning: package 'dplyr' was built under R version 3.5.2

## 
## Attaching package: 'dplyr'

## The following objects are masked from 'package:stats':
## 
##     filter, lag

## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union

library(ggplot2)

## Warning: package 'ggplot2' was built under R version 3.5.1

library(knitr)

## Warning: package 'knitr' was built under R version 3.5.2

library(readxl)

## Warning: package 'readxl' was built under R version 3.5.2

minw <- read_excel("C://Users/Uriel/Downloads/Minimum Wage.xlsx")
names(minw)

## [1] "NJ.PA"          "EmploymentPost" "EmploymentPre"  "WagePre"       
## [5] "BurgerKing"     "KFC"            "Roys"           "Wendys"

# Boxplots 
boxplot(minw$EmploymentPre~minw$NJ.PA, main="Employment Pre-treatment by State", xlab="NJ = 1, PA=0", ylab="Employment")

boxplot(minw$WagePre~minw$NJ.PA, main="Wage Pre-treatment by State", xlab="NJ = 1, PA=0", ylab="Hourly wage")

minw%>%
  filter(NJ.PA==1)%>%
  summary()

##      NJ.PA   EmploymentPost  EmploymentPre      WagePre     
##  Min.   :1   Min.   : 0.00   Min.   : 3.00   Min.   :4.250  
##  1st Qu.:1   1st Qu.:11.00   1st Qu.:11.00   1st Qu.:4.250  
##  Median :1   Median :17.00   Median :15.75   Median :4.500  
##  Mean   :1   Mean   :17.22   Mean   :17.05   Mean   :4.606  
##  3rd Qu.:1   3rd Qu.:22.50   3rd Qu.:20.38   3rd Qu.:4.870  
##  Max.   :1   Max.   :55.50   Max.   :80.00   Max.   :5.750  
##    BurgerKing          KFC              Roys            Wendys      
##  Min.   :0.0000   Min.   :0.0000   Min.   :0.0000   Min.   :0.0000  
##  1st Qu.:0.0000   1st Qu.:0.0000   1st Qu.:0.0000   1st Qu.:0.0000  
##  Median :0.0000   Median :0.0000   Median :0.0000   Median :0.0000  
##  Mean   :0.4047   Mean   :0.2241   Mean   :0.2508   Mean   :0.1204  
##  3rd Qu.:1.0000   3rd Qu.:0.0000   3rd Qu.:0.5000   3rd Qu.:0.0000  
##  Max.   :1.0000   Max.   :1.0000   Max.   :1.0000   Max.   :1.0000

minw%>%
  filter(NJ.PA==0)%>%
  summary()

##      NJ.PA   EmploymentPost  EmploymentPre     WagePre     
##  Min.   :0   Min.   : 0.00   Min.   : 4.5   Min.   :4.250  
##  1st Qu.:0   1st Qu.:11.50   1st Qu.:12.5   1st Qu.:4.250  
##  Median :0   Median :16.00   Median :17.0   Median :4.500  
##  Mean   :0   Mean   :17.76   Mean   :20.1   Mean   :4.629  
##  3rd Qu.:0   3rd Qu.:22.50   3rd Qu.:25.0   3rd Qu.:5.000  
##  Max.   :0   Max.   :38.25   Max.   :67.5   Max.   :5.500  
##    BurgerKing          KFC             Roys            Wendys      
##  Min.   :0.0000   Min.   :0.000   Min.   :0.0000   Min.   :0.0000  
##  1st Qu.:0.0000   1st Qu.:0.000   1st Qu.:0.0000   1st Qu.:0.0000  
##  Median :0.0000   Median :0.000   Median :0.0000   Median :0.0000  
##  Mean   :0.4521   Mean   :0.137   Mean   :0.2329   Mean   :0.1781  
##  3rd Qu.:1.0000   3rd Qu.:0.000   3rd Qu.:0.0000   3rd Qu.:0.0000  
##  Max.   :1.0000   Max.   :1.000   Max.   :1.0000   Max.   :1.0000

# Difference average employment between NJ and PA
t.test(minw$EmploymentPre~minw$NJ.PA)

## 
##  Welch Two Sample t-test
## 
## data:  minw$EmploymentPre by minw$NJ.PA
## t = 2.0835, df = 93.052, p-value = 0.03995
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
##  0.1430664 5.9585737
## sample estimates:
## mean in group 0 mean in group 1 
##        20.09932        17.04849

# Logit regression to calculate the propensity score for each fast food joint, i.e. the predicted probability of being in NJ given the estimates of this model. 
# Covariables chosen for the model: EmploymentPre, WagePre, BurgerKing, KFC, Roys, Wendys
psm<-glm(NJ.PA~EmploymentPre+WagePre+BurgerKing+KFC+Roys+Wendys, data=minw, family = binomial())
summary(psm)

## 
## Call:
## glm(formula = NJ.PA ~ EmploymentPre + WagePre + BurgerKing + 
##     KFC + Roys + Wendys, family = binomial(), data = minw)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -2.0915   0.5108   0.6161   0.6866   1.1899  
## 
## Coefficients: (1 not defined because of singularities)
##               Estimate Std. Error z value Pr(>|z|)  
## (Intercept)    1.96086    1.85955   1.054   0.2917  
## EmploymentPre -0.02514    0.01381  -1.820   0.0687 .
## WagePre       -0.09622    0.38992  -0.247   0.8051  
## BurgerKing     0.30101    0.38847   0.775   0.4384  
## KFC            0.62948    0.48992   1.285   0.1988  
## Roys           0.47837    0.42316   1.130   0.2583  
## Wendys              NA         NA      NA       NA  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 368.38  on 371  degrees of freedom
## Residual deviance: 360.83  on 366  degrees of freedom
## AIC: 372.83
## 
## Number of Fisher Scoring iterations: 4

pred.prob<-data.frame(pscore=predict(psm, type="response"), NJ.PA=psm$model$NJ.PA) # Propensity score
kable(head(pred.prob, 10)) # First ten propensity scores and the fast food joint actual treatment status

pscore	NJ.PA
0.6737966	0
0.7218960	0
0.5208633	0
0.7884461	0
0.8737248	0
0.8736046	0
0.8380529	0
0.5981610	0
0.7755902	0
0.7486568	0

# Region of common support by visual inspection of histograms
labs <- paste("State of fast food joint:", c("NJ", "PA"))
pred.prob %>%
  mutate(NJ.PA = ifelse(NJ.PA == 1, labs[1], labs[2])) %>%
  ggplot(aes(x = pscore)) + geom_histogram(color = "white") + facet_wrap(~NJ.PA) + xlab("Probability of being a NJ fast food joint") + theme_bw()

## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

# Matching algorithm
# match1 is using ratio 1; match2 is using a caliper of 0.2
match1<-matchit(NJ.PA ~ EmploymentPre+WagePre+BurgerKing+KFC+Roys+Wendys, method = "nearest", ratio=1, data = minw)

## Warning in matchit2nearest(c(`1` = 0, `2` = 0, `3` = 0, `4` = 0, `5` = 0, :
## Fewer control than treated units and matching without replacement. Not all
## treated units will receive a match. Treated units will be matched in the
## order specified by m.order: largest

match2<-matchit(NJ.PA ~ EmploymentPre+WagePre+BurgerKing+KFC+Roys+Wendys, method = "nearest", caliper=0.2, data = minw)

## Warning in matchit2nearest(c(`1` = 0, `2` = 0, `3` = 0, `4` = 0, `5` = 0, :
## Fewer control than treated units and matching without replacement. Not all
## treated units will receive a match. Treated units will be matched in the
## order specified by m.order: largest

summary(match1)

## 
## Call:
## matchit(formula = NJ.PA ~ EmploymentPre + WagePre + BurgerKing + 
##     KFC + Roys + Wendys, data = minw, method = "nearest", ratio = 1)
## 
## Summary of balance for all data:
##               Means Treated Means Control SD Control Mean Diff eQQ Med
## distance             0.8081        0.7859     0.0699    0.0223  0.0168
## EmploymentPre       17.0485       20.0993    11.7200   -3.0508  2.0000
## WagePre              4.6061        4.6286     0.3556   -0.0225  0.0000
## BurgerKing           0.4047        0.4521     0.5011   -0.0474  0.0000
## KFC                  0.2241        0.1370     0.3462    0.0871  0.0000
## Roys                 0.2508        0.2329     0.4256    0.0180  0.0000
## Wendys               0.1204        0.1781     0.3852   -0.0577  0.0000
##               eQQ Mean eQQ Max
## distance        0.0209  0.0702
## EmploymentPre   3.0514 14.5000
## WagePre         0.0412  0.2500
## BurgerKing      0.0548  1.0000
## KFC             0.0822  1.0000
## Roys            0.0137  1.0000
## Wendys          0.0548  1.0000
## 
## 
## Summary of balance for matched data:
##               Means Treated Means Control SD Control Mean Diff eQQ Med
## distance             0.8694        0.7859     0.0699    0.0835  0.0728
## EmploymentPre        9.1096       20.0993    11.7200  -10.9897  8.2500
## WagePre              4.5655        4.6286     0.3556   -0.0632  0.0000
## BurgerKing           0.0137        0.4521     0.5011   -0.4384  0.0000
## KFC                  0.8630        0.1370     0.3462    0.7260  1.0000
## Roys                 0.1233        0.2329     0.4256   -0.1096  0.0000
## Wendys               0.0000        0.1781     0.3852   -0.1781  0.0000
##               eQQ Mean eQQ Max
## distance        0.0835  0.3305
## EmploymentPre  10.9897 50.5000
## WagePre         0.0659  0.2500
## BurgerKing      0.4384  1.0000
## KFC             0.7260  1.0000
## Roys            0.1096  1.0000
## Wendys          0.1781  1.0000
## 
## Percent Balance Improvement:
##               Mean Diff.   eQQ Med  eQQ Mean   eQQ Max
## distance       -274.9166 -333.0761 -300.0914 -371.1723
## EmploymentPre  -260.2220 -312.5000 -260.1571 -248.2759
## WagePre        -180.1321    0.0000  -59.8007    0.0000
## BurgerKing     -825.3385    0.0000 -700.0000    0.0000
## KFC            -733.6139      -Inf -783.3333    0.0000
## Roys           -510.2041    0.0000 -700.0000    0.0000
## Wendys         -208.7371    0.0000 -225.0000    0.0000
## 
## Sample sizes:
##           Control Treated
## All            73     299
## Matched        73      73
## Unmatched       0     226
## Discarded       0       0

summary(match2)

## 
## Call:
## matchit(formula = NJ.PA ~ EmploymentPre + WagePre + BurgerKing + 
##     KFC + Roys + Wendys, data = minw, method = "nearest", caliper = 0.2)
## 
## Summary of balance for all data:
##               Means Treated Means Control SD Control Mean Diff eQQ Med
## distance             0.8081        0.7859     0.0699    0.0223  0.0168
## EmploymentPre       17.0485       20.0993    11.7200   -3.0508  2.0000
## WagePre              4.6061        4.6286     0.3556   -0.0225  0.0000
## BurgerKing           0.4047        0.4521     0.5011   -0.0474  0.0000
## KFC                  0.2241        0.1370     0.3462    0.0871  0.0000
## Roys                 0.2508        0.2329     0.4256    0.0180  0.0000
## Wendys               0.1204        0.1781     0.3852   -0.0577  0.0000
##               eQQ Mean eQQ Max
## distance        0.0209  0.0702
## EmploymentPre   3.0514 14.5000
## WagePre         0.0412  0.2500
## BurgerKing      0.0548  1.0000
## KFC             0.0822  1.0000
## Roys            0.0137  1.0000
## Wendys          0.0548  1.0000
## 
## 
## Summary of balance for matched data:
##               Means Treated Means Control SD Control Mean Diff eQQ Med
## distance             0.8064        0.7963     0.0547    0.0101  0.0108
## EmploymentPre       16.8587       18.2935     8.7675   -1.4348  1.5000
## WagePre              4.6788        4.6191     0.3598    0.0597  0.0000
## BurgerKing           0.2899        0.4348     0.4994   -0.1449  0.0000
## KFC                  0.1739        0.1449     0.3546    0.0290  0.0000
## Roys                 0.3768        0.2464     0.4341    0.1304  0.0000
## Wendys               0.1594        0.1739     0.3818   -0.0145  0.0000
##               eQQ Mean eQQ Max
## distance        0.0101  0.0115
## EmploymentPre   1.4710  5.0000
## WagePre         0.0641  0.2500
## BurgerKing      0.1449  1.0000
## KFC             0.0290  1.0000
## Roys            0.1304  1.0000
## Wendys          0.0145  1.0000
## 
## Percent Balance Improvement:
##               Mean Diff. eQQ Med  eQQ Mean eQQ Max
## distance         54.4959 36.0713   51.4404 83.5386
## EmploymentPre    52.9706 25.0000   51.7917 65.5172
## WagePre        -164.8701  0.0000  -55.3565  0.0000
## BurgerKing     -205.9317  0.0000 -164.4928  0.0000
## KFC              66.7193  0.0000   64.7343  0.0000
## Roys           -626.2755  0.0000 -852.1739  0.0000
## Wendys           74.8742  0.0000   73.5507  0.0000
## 
## Sample sizes:
##           Control Treated
## All            73     299
## Matched        69      69
## Unmatched       4     230
## Discarded       0       0

plot(match1)

plot(match2)

matchdata<-match.data(match1)
dim(matchdata)

## [1] 146  10

matchdata2<-match.data(match2)
dim(matchdata2)

## [1] 138  10

# Visual inspection
fn_bal <- function(dta, variable) {
  dta$variable <- dta[, variable]
  if (variable == 'EmploymentPre') dta$variable <- dta$variable
  dta$NJ.PA <- as.factor(dta$NJ.PA)
  support <- c(min(dta$variable), max(dta$variable))
  ggplot(dta, aes(x = distance, y = variable, color = NJ.PA)) +
    geom_point(alpha = 0.2, size = 1.3) +
    geom_smooth(method = "loess", se = F) +
    xlab("Propensity score") +
    ylab(variable) +
    theme_bw() +
    ylim(support)
}

library(gridExtra)

## Warning: package 'gridExtra' was built under R version 3.5.1

## 
## Attaching package: 'gridExtra'

## The following object is masked from 'package:dplyr':
## 
##     combine

grid.arrange(
   fn_bal(matchdata, "EmploymentPre"),
   fn_bal(matchdata, "WagePre") + theme(legend.position = "none"),
   fn_bal(matchdata, "BurgerKing"),
   fn_bal(matchdata, "KFC") + theme(legend.position = "none"),
   fn_bal(matchdata, "Roys"),
   fn_bal(matchdata, "Wendys") + theme(legend.position = "none"),
   nrow = 3, widths = c(1.3, 1.3)
)

## Warning: Removed 2 rows containing missing values (geom_smooth).

## Warning: Removed 19 rows containing missing values (geom_smooth).

## Warning: Removed 39 rows containing missing values (geom_smooth).

## Warning: Removed 53 rows containing missing values (geom_smooth).

## Warning: Removed 16 rows containing missing values (geom_smooth).

#install.packages("cobalt")
library(cobalt)

## Warning: package 'cobalt' was built under R version 3.5.3

## 
## Attaching package: 'cobalt'

## The following object is masked from 'package:MatchIt':
## 
##     lalonde

# Checking balance before and after matching:
bal.tab(match1, m.threshold = 0.1, un = TRUE)

## Call
##  matchit(formula = NJ.PA ~ EmploymentPre + WagePre + BurgerKing + 
##     KFC + Roys + Wendys, data = minw, method = "nearest", ratio = 1)
## 
## Balance Measures
##                   Type Diff.Un Diff.Adj        M.Threshold
## distance      Distance  0.4121   1.5451                   
## EmploymentPre  Contin. -0.3444  -1.2405 Not Balanced, >0.1
## WagePre        Contin. -0.0658  -0.1842 Not Balanced, >0.1
## BurgerKing      Binary -0.0474  -0.4384 Not Balanced, >0.1
## KFC             Binary  0.0871   0.7260 Not Balanced, >0.1
## Roys            Binary  0.0180  -0.1096 Not Balanced, >0.1
## Wendys          Binary -0.0577  -0.1781 Not Balanced, >0.1
## 
## Balance tally for mean differences
##                    count
## Balanced, <0.1         0
## Not Balanced, >0.1     6
## 
## Variable with the greatest mean difference
##       Variable Diff.Adj        M.Threshold
##  EmploymentPre  -1.2405 Not Balanced, >0.1
## 
## Sample sizes
##           Control Treated
## All            73     299
## Matched        73      73
## Unmatched       0     226

bal.plot(match1, var.name = "distance")

bal.plot(match1, var.name = "distance", mirror = TRUE, type = "histogram")

minw_cov<-c("EmploymentPre", "WagePre", "BurgerKing", "KFC", "Roys", "Wendys")
matchdata %>%
  group_by(NJ.PA) %>%
  select(one_of(minw_cov)) %>%
  summarise_all(funs(mean))

## Adding missing grouping variables: `NJ.PA`

## Warning: funs() is soft deprecated as of dplyr 0.8.0
## please use list() instead
## 
## # Before:
## funs(name = f(.)
## 
## # After: 
## list(name = ~f(.))
## This warning is displayed once per session.

## # A tibble: 2 x 7
##   NJ.PA EmploymentPre WagePre BurgerKing   KFC  Roys Wendys
##   <dbl>         <dbl>   <dbl>      <dbl> <dbl> <dbl>  <dbl>
## 1     0         20.1     4.63     0.452  0.137 0.233  0.178
## 2     1          9.11    4.57     0.0137 0.863 0.123  0

# Test for balance among covariates
lapply(minw_cov, function(v) {
    t.test(matchdata[, v] ~ matchdata$NJ.PA)
})

## [[1]]
## 
##  Welch Two Sample t-test
## 
## data:  matchdata[, v] by matchdata$NJ.PA
## t = 7.7419, df = 82.158, p-value = 2.239e-11
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
##   8.165939 13.813513
## sample estimates:
## mean in group 0 mean in group 1 
##       20.099315        9.109589 
## 
## 
## [[2]]
## 
##  Welch Two Sample t-test
## 
## data:  matchdata[, v] by matchdata$NJ.PA
## t = 1.1296, df = 142.32, p-value = 0.2606
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
##  -0.04736391  0.17366528
## sample estimates:
## mean in group 0 mean in group 1 
##        4.628630        4.565479 
## 
## 
## [[3]]
## 
##  Welch Two Sample t-test
## 
## data:  matchdata[, v] by matchdata$NJ.PA
## t = 7.2777, df = 79.831, p-value = 2.087e-10
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
##  0.3184859 0.5582264
## sample estimates:
## mean in group 0 mean in group 1 
##      0.45205479      0.01369863 
## 
## 
## [[4]]
## 
##  Welch Two Sample t-test
## 
## data:  matchdata[, v] by matchdata$NJ.PA
## t = -12.669, df = 144, p-value < 2.2e-16
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
##  -0.8392960 -0.6127588
## sample estimates:
## mean in group 0 mean in group 1 
##       0.1369863       0.8630137 
## 
## 
## [[5]]
## 
##  Welch Two Sample t-test
## 
## data:  matchdata[, v] by matchdata$NJ.PA
## t = 1.7366, df = 135.78, p-value = 0.08473
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
##  -0.01520929  0.23438737
## sample estimates:
## mean in group 0 mean in group 1 
##       0.2328767       0.1232877 
## 
## 
## [[6]]
## 
##  Welch Two Sample t-test
## 
## data:  matchdata[, v] by matchdata$NJ.PA
## t = 3.9497, df = 72, p-value = 0.0001805
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
##  0.08820148 0.26796290
## sample estimates:
## mean in group 0 mean in group 1 
##       0.1780822       0.0000000

# Estimating Treatment Effects
with(matchdata, t.test(EmploymentPre ~ NJ.PA))

## 
##  Welch Two Sample t-test
## 
## data:  EmploymentPre by NJ.PA
## t = 7.7419, df = 82.158, p-value = 2.239e-11
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
##   8.165939 13.813513
## sample estimates:
## mean in group 0 mean in group 1 
##       20.099315        9.109589

# Estimating Treatment Effects via reggresion 
lm_treat1 <- lm(EmploymentPre ~ NJ.PA, data = matchdata)
summary(lm_treat1)

## 
## Call:
## lm(formula = EmploymentPre ~ NJ.PA, data = matchdata)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -15.599  -4.037  -0.979   1.901  47.401 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)   20.099      1.004  20.024  < 2e-16 ***
## NJ.PA        -10.990      1.420  -7.742 1.58e-12 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 8.576 on 144 degrees of freedom
## Multiple R-squared:  0.2939, Adjusted R-squared:  0.289 
## F-statistic: 59.94 on 1 and 144 DF,  p-value: 1.583e-12

PSM_Assignment 2

Uriel Lomelí

1 de abril de 2019