Introduction

This is an attempted R analytic replication of analyses reported in Carl (2017) subbmited to OpenPsych.

Setup

Load packages and data.

library(pacman)
p_load(kirkegaard, readxl, dplyr, grid, readr)
options(digits = 2, scipen = 2)

#data
d = read_excel("data/Global Terrorism Data.xlsx")
mega = read_csv("data/Megadataset_v2.0p.csv")

## Parsed with column specification:
## cols(
##   .default = col_double(),
##   X1 = col_character(),
##   EthnicHeterogenityVanhanen2012 = col_integer(),
##   EthnicConflictVanhanen2012 = col_integer(),
##   SlowTimePrefWangetal2011 = col_integer(),
##   Math00Mean = col_integer(),
##   Math00SD = col_integer(),
##   Read00Mean = col_integer(),
##   Read00SD = col_integer(),
##   Sci00Mean = col_integer(),
##   Sci00SD = col_integer(),
##   Math03Mean = col_integer(),
##   Math03SD = col_integer(),
##   Read03Mean = col_integer(),
##   Read03SD = col_integer(),
##   Sci03Mean = col_integer(),
##   Sci03SD = col_integer(),
##   PS03Mean = col_integer(),
##   PS03SD = col_integer(),
##   Read09.Native = col_integer(),
##   Read09.1g = col_integer()
##   # ... with 101 more columns
## )

## See spec(...) for full column specifications.

#merge
d = dplyr::left_join(d, mega[c("X1", "lat", "lon")], by = c("code" = "X1"))

Transform variables

Calculate the population residualized outcome measures.

#redoce regions
d$region = d$reglab %>% fct_recode(Asia = "asi", Eurasia = "eur", "MENA" = "mid", Africa = "afr", "Latin_Carib" = "lat", West = "wes")

#resid func
resider = function(var) {
  .model = sprintf("%s ~ log_pop", var)
  lm(as.formula(.model), data = d) %>% 
    resid() %>% 
    standardize()
}

#some derived vars
d %<>% mutate(
  #basic
  pop = pop10,
  anti_isis = part %>% as.logical,
  casualties = deaths + injuries,
  pct_muslim = per10,
  #this variable is given as 6 columns but the options are mutually exclusive, so we recode as a factor
  #there are probably better ways to do this...
  legal = df_subset_by_pattern(d, "legor") %>% apply(MARGIN = 1, function(row) {
    names(row)[row %>% as.logical %>% which] %>% str_replace("legor_", "")
  }) %>% factor,
  
  #log versions
  log_pop = log(pop),
  log_pct_muslim = log(pct_muslim + 1),
  log_attacks = log(attacks + 1),
  log_casualties = log(casualties + 1),
  log_land = log(area),
  log_gdp = ln_gdppc2000,
  log_milldeaths = log(mildeaths + 1)
)

#model derived vars
d %<>% mutate(
  log_attacks_resid = resider("log_attacks"),
  log_casualties_resid = resider("log_casualties"),
  log_pct_muslim_resid = resider("log_pct_muslim")
)

Figure 1-2

#figs 1
ggplot(d, aes(log_attacks_resid, log_casualties_resid)) +
  geom_smooth(method = lm, se = F, color = "black", linetype = "dotted") +
  geom_point(aes(size = pop, color = region), shape = 1) +
  scale_size_continuous(guide = F) +
  theme_bw()

ggsave("figures/figure 1.png")

## Saving 7 x 5 in image

#figs 2
fig2a = ggplot(d, aes(log_pct_muslim_resid, log_attacks_resid)) +
  geom_smooth(method = lm, se = F) +
  geom_point(aes(size = pop), shape = 1) +
  scale_size_continuous(guide = F) +
  theme_bw()

fig2b = ggplot(d, aes(log_pct_muslim_resid, log_casualties_resid)) +
  geom_smooth(method = lm, se = F) +
  geom_point(aes(size = pop), shape = 1) +
  scale_size_continuous(guide = F) +
  theme_bw() 

#combine
grid.newpage()
grid.draw(cbind(ggplotGrob(fig2a), ggplotGrob(fig2a), size = "last"))

ggsave("figures/figure 2.png")

## Saving 7 x 5 in image

The plots replicated. I added colors for improved interpretation.

The subplots show that there are probably modeling problems with the errors for the left cluster of cases.

Section 3.2 - Within-region associations

#combinations of region and outcome
model_combos = expand.grid(region = levels(d$region),
                           y = c("log_attacks", "log_casualties"),
                           stringsAsFactors = F)

#model and plot combinations
reg_models = map(seq_along_rows(model_combos), function(row_) {
  row_ = model_combos[row_, ]

  #subdata
  dsub = d %>% filter(region == row_$region)
  
  #build model
  model_ = sprintf("%s ~ log_pct_muslim + log_pop", row_$y)
  
  #fit
  fit = lm(model_, data = dsub) %>% MOD_summary(kfold = F)
  
  #plot
  gg = ggplot(dsub, aes_string("log_pct_muslim_resid", row_$y + "_resid")) + 
    geom_point(aes(size = pop), shape = 1) +
    geom_smooth(method = lm, se = F) +
    scale_size_continuous(guide = F) +
    theme_bw() 
  
  list(
    fit = fit,
    plot = gg
  )
})

#extract model info
reg_models_fits = map(reg_models, "fit")

data_frame(beta = map_dbl(reg_models_fits, ~.$coefs[1, 1]),
           region = model_combos$region,
           n = map_int(reg_models_fits, ~.$meta$N)
           )

## # A tibble: 12 × 3
##      beta      region     n
##     <dbl>       <chr> <int>
## 1   0.347      Africa    46
## 2   0.495        Asia    21
## 3   0.307     Eurasia    20
## 4   0.062 Latin_Carib    30
## 5  -0.215        MENA    20
## 6   0.430        West    31
## 7   0.380      Africa    46
## 8   0.491        Asia    21
## 9   0.365     Eurasia    20
## 10  0.131 Latin_Carib    30
## 11 -0.180        MENA    20
## 12  0.329        West    31

#plots
#not sure how to easily add these together to 2 compound 2x3 figures
walk(reg_models, ~.$plot %>% print)

#without israel
lm("log_attacks ~ log_pct_muslim + log_pop", data = d, subset = region == "MENA" & code != "ISR") %>% MOD_summary(kfold = F)

## Model coefficients
##                 Beta   SE CI_lower CI_upper
## log_pct_muslim 0.044 0.28   -0.558     0.65
## log_pop        0.621 0.28    0.019     1.22
## 
## 
## Model meta-data
##       outcome  N df   R2 R2-adj. R2-cv
## 1 log_attacks 19 16 0.43    0.36    NA
## 
## 
## Etas from analysis of variance
##                  Eta Eta_partial
## log_pct_muslim 0.029       0.039
## log_pop        0.413       0.480

lm("log_casualties ~ log_pct_muslim + log_pop", data = d, subset = region == "MENA" & code != "ISR") %>% MOD_summary(kfold = F)

## Model coefficients
##                Beta   SE CI_lower CI_upper
## log_pct_muslim 0.09 0.27   -0.485     0.66
## log_pop        0.62 0.27    0.048     1.20
## 
## 
## Model meta-data
##          outcome  N df   R2 R2-adj. R2-cv
## 1 log_casualties 19 16 0.48    0.41    NA
## 
## 
## Etas from analysis of variance
##                 Eta Eta_partial
## log_pct_muslim 0.06       0.082
## log_pop        0.41       0.498

Replicated main model results.

Could not replicate sub-plot D. The others plots replicated.

Could not replicate Israel extra models.

Section 3.3 - Robustness to geographic, economic and institutional controls

#6 more models
basic_controls = c("log_land", "abslat", "elevavg", "rough", "legal")
additional_controls = c("log_gdp", "democ", "efrac")

#models
world_models = list(
  #attacks
  lm("log_attacks ~ log_pct_muslim + log_pop + region", data = d),
  lm("log_attacks ~ log_pct_muslim + log_pop + region + " + str_c(basic_controls, collapse = " + "), data = d),
  lm("log_attacks ~ log_pct_muslim + log_pop + region + " + str_c(c(basic_controls, additional_controls), collapse = " + "), data = d, na.action = na.exclude),
  
  #casualties
  lm("log_casualties ~ log_pct_muslim + log_pop + region", data = d),
  lm("log_casualties ~ log_pct_muslim + log_pop + region + " + str_c(basic_controls, collapse = " + "), data = d),
  lm("log_casualties ~ log_pct_muslim + log_pop + region + " + str_c(c(basic_controls, additional_controls), collapse = " + "), data = d)
) %>% map(MOD_summary, kfold = F)

#extract model info
data_frame(beta = map_dbl(world_models, ~.$coefs[1, 1]),
           n = map_int(world_models, ~.$meta$N),
           r2 = map_dbl(world_models, ~.$meta$R2)
           )

## # A tibble: 6 × 3
##    beta     n    r2
##   <dbl> <int> <dbl>
## 1  0.38   168  0.56
## 2  0.35   168  0.61
## 3  0.32   156  0.63
## 4  0.39   168  0.60
## 5  0.36   168  0.64
## 6  0.33   156  0.66

World modeling results replicated.

Section 3.4 - Percentage Muslim versus military intervention in the Middle East

west_models2 = list(
  #attacks
  lm("log_attacks ~ log_milldeaths + log_pop", data = d, subset = region == "West"),
  lm("log_attacks ~ anti_isis + log_pop", data = d, subset = region == "West"),
  lm("log_attacks ~ log_milldeaths + log_pct_muslim + log_pop", data = d, subset = region == "West"),
  lm("log_attacks ~ anti_isis + log_pct_muslim + log_pop", data = d, subset = region == "West"),
  
  #casualties
  lm("log_casualties ~ log_milldeaths + log_pop", data = d, subset = region == "West"),
  lm("log_casualties ~ anti_isis + log_pop", data = d, subset = region == "West"),
  lm("log_casualties ~ log_milldeaths + log_pct_muslim + log_pop", data = d, subset = region == "West"),
  lm("log_casualties ~ anti_isis + log_pct_muslim + log_pop", data = d, subset = region == "West")
) %>% map(MOD_summary, kfold = F)

#model info
west_models2 %>% map("coefs") %>% ldf_to_df(by_name = "model", rownames_to_var = T, rownames_name = "predictor")

##     Beta    SE CI_lower CI_upper      predictor model
## 1  0.278 0.206  -0.1436     0.70 log_milldeaths     1
## 2  0.513 0.206   0.0923     0.93        log_pop     1
## 3  1.231 0.247   0.7254     1.74      anti_isis     2
## 4  0.446 0.110   0.2208     0.67        log_pop     2
## 5  0.579 0.150   0.2707     0.89 log_milldeaths     3
## 6  0.562 0.098   0.3609     0.76 log_pct_muslim     3
## 7  0.062 0.161  -0.2688     0.39        log_pop     3
## 8  0.950 0.254   0.4283     1.47      anti_isis     4
## 9  0.256 0.104   0.0429     0.47 log_pct_muslim     4
## 10 0.415 0.102   0.2063     0.62        log_pop     4
## 11 0.174 0.211  -0.2587     0.61 log_milldeaths     5
## 12 0.591 0.211   0.1578     1.02        log_pop     5
## 13 0.864 0.300   0.2490     1.48      anti_isis     6
## 14 0.527 0.134   0.2530     0.80        log_pop     6
## 15 0.399 0.193   0.0022     0.80 log_milldeaths     7
## 16 0.420 0.126   0.1608     0.68 log_pct_muslim     7
## 17 0.253 0.208  -0.1732     0.68        log_pop     7
## 18 0.630 0.327  -0.0417     1.30      anti_isis     8
## 19 0.213 0.134  -0.0605     0.49 log_pct_muslim     8
## 20 0.501 0.131   0.2324     0.77        log_pop     8

west_models2 %>% map_df(~data.frame(n = .$meta$N, r2 = .$meta$R2))

##    n   r2
## 1 31 0.57
## 2 31 0.76
## 3 31 0.81
## 4 31 0.80
## 5 31 0.54
## 6 31 0.64
## 7 31 0.68
## 8 31 0.67

Results for continuous predictors replicated. Numbers did not match for anti-ISIS, presumably because the author coded it incorrectly as a continuous variable when it is a categorical (2 level) variable.

Per capita outcome vs. population as predictor

Which approach is best? Using per capita as the dependent or using population as a predictor?

#per capita terrorism attacks
d %<>% mutate(
  attacks_per_capita = attacks / pop,
  attacks_per_capita_log = log(attacks_per_capita + 1)
)

#compare
GG_scatter(d, "attacks_per_capita", "log_attacks_resid", case_names_vector = "country")

GG_scatter(d, "attacks_per_capita_log", "log_attacks_resid", case_names_vector = "country")

#distributions
GG_denhist(d, "log_attacks_resid", vline = NULL)

## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

GG_denhist(d, "attacks_per_capita", vline = NULL)

## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

As can be seen, taking the log has no effect at all. The distribution of the residualization approach is fairly normal, while that for the per capita is extremely skewed and not useful for analysis.

Spatial autocorrelation

Carl handled the issue of spatial autocorrelation indirectly by using a region fixed effects approach. Probably a better approach is to include a ‘spatially lagged’ predictor, as commonly done in economics. We can examine whether there is a need to do this by examining the residuals for SAC. We can also include the KNSN predictor in the model to see how this affects results.

#examine dependent for SAC
SAC_measures(d, "attacks")

## Factors were converted to characters.

##         Morans_I knsn_3
## attacks    0.024   0.22

#examine residuals
d$world_model_3_resid = world_models[[3]]$model_obj %>% resid
d[c("lat", "lon", "world_model_3_resid")] %>% miss_filter() %>% SAC_measures("world_model_3_resid")

##                     Morans_I knsn_3
## world_model_3_resid    0.027   0.28

#include KNSN prediction in model
d$attacks_knsn = SAC_knsnr(d, "attacks")

## Factors were converted to characters.

spatial_model = lm("log_attacks ~ log_pct_muslim + log_pop + region + attacks_knsn + " + str_c(c(basic_controls, additional_controls), collapse = " + "), data = d, na.action = na.exclude) %>% MOD_summary(kfold = F) %T>% print

## Model coefficients
##                       Beta    SE CI_lower CI_upper
## log_pct_muslim       0.311 0.079    0.154    0.468
## log_pop              0.381 0.084    0.215    0.546
## region: Africa       0.000    NA       NA       NA
## region: Asia         0.488 0.242    0.010    0.965
## region: Eurasia      0.448 0.353   -0.250    1.146
## region: Latin_Carib -0.097 0.210   -0.511    0.318
## region: MENA         1.064 0.293    0.485    1.644
## region: West         0.468 0.357   -0.239    1.175
## attacks_knsn         0.045 0.060   -0.073    0.163
## log_land             0.107 0.086   -0.063    0.276
## abslat               0.105 0.127   -0.147    0.356
## elevavg             -0.047 0.080   -0.205    0.111
## rough                0.032 0.082   -0.131    0.194
## legal: fr            0.000    NA       NA       NA
## legal: ge           -0.373 0.344   -1.054    0.307
## legal: sc            0.059 0.395   -0.723    0.840
## legal: so           -0.504 0.236   -0.970   -0.038
## legal: uk            0.242 0.144   -0.042    0.526
## log_gdp             -0.248 0.096   -0.438   -0.058
## democ                0.128 0.094   -0.058    0.314
## efrac                0.070 0.075   -0.079    0.219
## 
## 
## Model meta-data
##       outcome   N  df   R2 R2-adj. R2-cv
## 1 log_attacks 156 136 0.64    0.58    NA
## 
## 
## Etas from analysis of variance
##                  Eta Eta_partial
## log_pct_muslim 0.203       0.319
## log_pop        0.235       0.363
## region         0.218       0.339
## attacks_knsn   0.039       0.064
## log_land       0.065       0.106
## abslat         0.043       0.070
## elevavg        0.031       0.051
## rough          0.020       0.033
## legal          0.178       0.283
## log_gdp        0.134       0.216
## democ          0.070       0.116
## efrac          0.048       0.079

d$world_model_spatial_resid = spatial_model$model_obj %>% resid
d[c("world_model_spatial_resid", "lat", "lon")] %>% miss_filter %>% SAC_measures("world_model_spatial_resid")

##                           Morans_I knsn_3
## world_model_spatial_resid    0.025   0.24

There was only a small amount of spatial autocorrelation in the residuals. Including a spatial predictor had little effect and did not even reduce the spatial autocorrelation in the residauls.

Conclusion

Some discrepancies. Most importantly with the MENA Israel extra analyses and the D subplots.

Analytic replication report for: A Global Analysis of Islamist Terrorism (Carl 2017)

Emil O. W. Kirkegaard