1. Load data
library(tidyverse)
library(StatMatch)
library(broom)

child <- read.csv("./data/child_soldiering.csv")
head(child)
  abd C.ach C.akw C.ata C.kma C.oro C.pad C.paj C.pal age fthr.ed mthr.ed
1   1     0     0     0     1     0     0     0     0  21       7       4
2   1     0     0     0     1     0     0     0     0  29       7       4
3   1     0     0     0     1     0     0     0     0  23       7       0
4   1     0     0     0     1     0     0     0     0  18       0       0
5   1     0     0     0     1     0     0     0     0  18       4       4
6   1     0     0     0     1     0     0     0     0  25       7       4
  orphan96 fthr.frm hh.size96 educ
1        0        1         9    6
2        0        1         9    3
3        0        1         2    4
4        0        1         6    7
5        0        1         5    7
6        0        0         9    7
child <- child %>% select(-C.ach)
  1. Check balance before matching
## Means in treated / control group 
## Calculate normalized difference in covariates

## mean in treated/control group
cov_mean <- child %>% 
  group_by(abd) %>% 
  summarise(across(-educ, mean)) %>% # get mean for each variable except for educ.
  t() %>%
  as_tibble(rownames = 'var') %>%
  slice(-1) %>% # remove abd row. 
  rename(mean_control = V1,
         mean_treated = V2)
  
## Variance in treated / control group 

cov_var <- child %>% 
  group_by(abd) %>% 
  summarise(across(-educ, var)) %>%
  t() %>%
  as_tibble(rownames = 'var') %>%
  slice(-1) %>%
  rename(var_control = V1,
         var_treated = V2)

bal_pre <- cov_mean %>% 
  left_join(cov_var) %>%
  mutate(mean.diff = mean_treated-mean_control, 
         SMD = mean.diff/sqrt((var_treated+var_control)/2), # standardized mean difference
         type ="before matching")

## Plot normalized mean difference for all covariates 
ggplot(bal_pre, aes(x=var, y=SMD))+
  geom_col(color="black", fill="steelblue")+
  coord_flip() + 
  labs(y = 'Normalized difference',
       x = 'Variable') + 
  theme_bw()

# 
# ggplot(bal_pre, aes(x = var, y = SMD)) + 
#   geom_bar(stat = 'identity',
#            col = 'black')

##MAHALANOBIS DISTANCE MATCHING

## 1. Get a covariate data matrix 

big_X <- child %>% 
  select(-c('abd', 'educ')) %>% 
  as.matrix()

## 2. Extract treated units and control units 

X_treated <- child %>%
  filter(abd == 1) %>%
  select(-c("abd", "educ")) %>%
  as.matrix()

X_control <- child %>%
  filter(abd==0) %>%
  select(-c("abd", "educ")) %>%
  as.matrix()


## 3. Calculate Mahalanobis Distance between all observation-pairs 
## using mahalanobis.dist function from the StatMatch package 

M <- mahalanobis.dist(data.x = X_treated, 
                      data.y = X_control,
                      vc = var(big_X))

## For each treated unit, find closest control unit 
## In each row, find the minimum value index; if multiple, get the first one.
match_ids <- unlist(apply(M, 1, function(i){which(i == min(i))[1]}))


## 4. Create a matched dataset 

matched_data <- rbind(subset(child, abd == 1),
                      subset(child, abd == 0)[match_ids,])

## 5. Check balance after matching 

## mean in treated/control group
cov_mean_matched <- matched_data %>% 
  group_by(abd) %>% 
  summarise(across(-educ, mean)) %>% # get mean for each variable except for educ.
  t() %>%
  as_tibble(rownames = 'var') %>%
  slice(-1) %>% # remove abd row. 
  rename(mean_control = V1,
         mean_treated = V2)
  
## Variance in treated / control group 

cov_var_matched <- matched_data %>% 
  group_by(abd) %>% 
  summarise(across(-educ, var)) %>%
  t() %>%
  as_tibble(rownames = 'var') %>%
  slice(-1) %>%
  rename(var_control = V1,
         var_treated = V2)

bal_post <- cov_mean_matched %>% 
  left_join(cov_var_matched) %>%
  mutate(mean.diff = mean_treated-mean_control, 
         SMD = mean.diff/sqrt((var_treated+var_control)/2), # standardized mean difference
         type ="After matching")

## Plot normalized mean difference for all covariates 
ggplot(bal_post, aes(x=var, y=SMD))+
  geom_col(color="black", fill="steelblue")+
  coord_flip() + 
  labs(y = 'Normalized difference',
       x = 'Variable') + 
  theme_bw()

## Compare balance before and after matching visually 

ggplot(rbind(bal_pre, bal_post), aes(x = var, y = SMD, 
                                     col = type, group = type, shape = type)) +
  geom_point() + 
  coord_flip() + 
  labs(y = 'Normalized difference',
       x = 'Variable') + 
  theme_bw() + 
  geom_hline(yintercept = 0,
             linetype = 'dotted') + 
  theme(legend.position = 'bottom',
        legend.title = element_blank())

##Post-matching estimation

## Full Dataset 

lm(educ ~ abd, data = child) %>% tidy() %>% filter(term == 'abd')
# A tibble: 1 x 5
  term  estimate std.error statistic p.value
  <chr>    <dbl>     <dbl>     <dbl>   <dbl>
1 abd     -0.595     0.218     -2.73 0.00647
lm(educ ~ ., data = child) %>% tidy() %>% filter(term == 'abd')
# A tibble: 1 x 5
  term  estimate std.error statistic p.value
  <chr>    <dbl>     <dbl>     <dbl>   <dbl>
1 abd     -0.704     0.217     -3.24 0.00125
## Matched Dataset 

lm(educ ~ abd, data = matched_data) %>% tidy() %>% filter(term == 'abd')
# A tibble: 1 x 5
  term  estimate std.error statistic  p.value
  <chr>    <dbl>     <dbl>     <dbl>    <dbl>
1 abd     -0.671     0.185     -3.62 0.000311
lm(educ ~ ., data = matched_data) %>% tidy() %>% filter(term == 'abd')
# A tibble: 1 x 5
  term  estimate std.error statistic  p.value
  <chr>    <dbl>     <dbl>     <dbl>    <dbl>
1 abd     -0.683     0.178     -3.84 0.000132