library(tidyverse)
library(StatMatch)
library(broom)
child <- read.csv("./data/child_soldiering.csv")
head(child)
abd C.ach C.akw C.ata C.kma C.oro C.pad C.paj C.pal age fthr.ed mthr.ed
1 1 0 0 0 1 0 0 0 0 21 7 4
2 1 0 0 0 1 0 0 0 0 29 7 4
3 1 0 0 0 1 0 0 0 0 23 7 0
4 1 0 0 0 1 0 0 0 0 18 0 0
5 1 0 0 0 1 0 0 0 0 18 4 4
6 1 0 0 0 1 0 0 0 0 25 7 4
orphan96 fthr.frm hh.size96 educ
1 0 1 9 6
2 0 1 9 3
3 0 1 2 4
4 0 1 6 7
5 0 1 5 7
6 0 0 9 7
child <- child %>% select(-C.ach)
## Means in treated / control group
## Calculate normalized difference in covariates
## mean in treated/control group
cov_mean <- child %>%
group_by(abd) %>%
summarise(across(-educ, mean)) %>% # get mean for each variable except for educ.
t() %>%
as_tibble(rownames = 'var') %>%
slice(-1) %>% # remove abd row.
rename(mean_control = V1,
mean_treated = V2)
## Variance in treated / control group
cov_var <- child %>%
group_by(abd) %>%
summarise(across(-educ, var)) %>%
t() %>%
as_tibble(rownames = 'var') %>%
slice(-1) %>%
rename(var_control = V1,
var_treated = V2)
bal_pre <- cov_mean %>%
left_join(cov_var) %>%
mutate(mean.diff = mean_treated-mean_control,
SMD = mean.diff/sqrt((var_treated+var_control)/2), # standardized mean difference
type ="before matching")
## Plot normalized mean difference for all covariates
ggplot(bal_pre, aes(x=var, y=SMD))+
geom_col(color="black", fill="steelblue")+
coord_flip() +
labs(y = 'Normalized difference',
x = 'Variable') +
theme_bw()
#
# ggplot(bal_pre, aes(x = var, y = SMD)) +
# geom_bar(stat = 'identity',
# col = 'black')
##MAHALANOBIS DISTANCE MATCHING
## 1. Get a covariate data matrix
big_X <- child %>%
select(-c('abd', 'educ')) %>%
as.matrix()
## 2. Extract treated units and control units
X_treated <- child %>%
filter(abd == 1) %>%
select(-c("abd", "educ")) %>%
as.matrix()
X_control <- child %>%
filter(abd==0) %>%
select(-c("abd", "educ")) %>%
as.matrix()
## 3. Calculate Mahalanobis Distance between all observation-pairs
## using mahalanobis.dist function from the StatMatch package
M <- mahalanobis.dist(data.x = X_treated,
data.y = X_control,
vc = var(big_X))
## For each treated unit, find closest control unit
## In each row, find the minimum value index; if multiple, get the first one.
match_ids <- unlist(apply(M, 1, function(i){which(i == min(i))[1]}))
## 4. Create a matched dataset
matched_data <- rbind(subset(child, abd == 1),
subset(child, abd == 0)[match_ids,])
## 5. Check balance after matching
## mean in treated/control group
cov_mean_matched <- matched_data %>%
group_by(abd) %>%
summarise(across(-educ, mean)) %>% # get mean for each variable except for educ.
t() %>%
as_tibble(rownames = 'var') %>%
slice(-1) %>% # remove abd row.
rename(mean_control = V1,
mean_treated = V2)
## Variance in treated / control group
cov_var_matched <- matched_data %>%
group_by(abd) %>%
summarise(across(-educ, var)) %>%
t() %>%
as_tibble(rownames = 'var') %>%
slice(-1) %>%
rename(var_control = V1,
var_treated = V2)
bal_post <- cov_mean_matched %>%
left_join(cov_var_matched) %>%
mutate(mean.diff = mean_treated-mean_control,
SMD = mean.diff/sqrt((var_treated+var_control)/2), # standardized mean difference
type ="After matching")
## Plot normalized mean difference for all covariates
ggplot(bal_post, aes(x=var, y=SMD))+
geom_col(color="black", fill="steelblue")+
coord_flip() +
labs(y = 'Normalized difference',
x = 'Variable') +
theme_bw()
## Compare balance before and after matching visually
ggplot(rbind(bal_pre, bal_post), aes(x = var, y = SMD,
col = type, group = type, shape = type)) +
geom_point() +
coord_flip() +
labs(y = 'Normalized difference',
x = 'Variable') +
theme_bw() +
geom_hline(yintercept = 0,
linetype = 'dotted') +
theme(legend.position = 'bottom',
legend.title = element_blank())
##Post-matching estimation
## Full Dataset
lm(educ ~ abd, data = child) %>% tidy() %>% filter(term == 'abd')
# A tibble: 1 x 5
term estimate std.error statistic p.value
<chr> <dbl> <dbl> <dbl> <dbl>
1 abd -0.595 0.218 -2.73 0.00647
lm(educ ~ ., data = child) %>% tidy() %>% filter(term == 'abd')
# A tibble: 1 x 5
term estimate std.error statistic p.value
<chr> <dbl> <dbl> <dbl> <dbl>
1 abd -0.704 0.217 -3.24 0.00125
## Matched Dataset
lm(educ ~ abd, data = matched_data) %>% tidy() %>% filter(term == 'abd')
# A tibble: 1 x 5
term estimate std.error statistic p.value
<chr> <dbl> <dbl> <dbl> <dbl>
1 abd -0.671 0.185 -3.62 0.000311
lm(educ ~ ., data = matched_data) %>% tidy() %>% filter(term == 'abd')
# A tibble: 1 x 5
term estimate std.error statistic p.value
<chr> <dbl> <dbl> <dbl> <dbl>
1 abd -0.683 0.178 -3.84 0.000132