This is an attempted R analytic replication of analyses reported in Carl (2017) subbmited to OpenPsych.
Load packages and data.
library(pacman)
p_load(kirkegaard, readxl, dplyr, grid, readr)
options(digits = 2, scipen = 2)
#data
d = read_excel("data/Global Terrorism Data.xlsx")
mega = read_csv("data/Megadataset_v2.0p.csv")
## Parsed with column specification:
## cols(
## .default = col_double(),
## X1 = col_character(),
## EthnicHeterogenityVanhanen2012 = col_integer(),
## EthnicConflictVanhanen2012 = col_integer(),
## SlowTimePrefWangetal2011 = col_integer(),
## Math00Mean = col_integer(),
## Math00SD = col_integer(),
## Read00Mean = col_integer(),
## Read00SD = col_integer(),
## Sci00Mean = col_integer(),
## Sci00SD = col_integer(),
## Math03Mean = col_integer(),
## Math03SD = col_integer(),
## Read03Mean = col_integer(),
## Read03SD = col_integer(),
## Sci03Mean = col_integer(),
## Sci03SD = col_integer(),
## PS03Mean = col_integer(),
## PS03SD = col_integer(),
## Read09.Native = col_integer(),
## Read09.1g = col_integer()
## # ... with 101 more columns
## )
## See spec(...) for full column specifications.
#merge
d = dplyr::left_join(d, mega[c("X1", "lat", "lon")], by = c("code" = "X1"))
Calculate the population residualized outcome measures.
#redoce regions
d$region = d$reglab %>% fct_recode(Asia = "asi", Eurasia = "eur", "MENA" = "mid", Africa = "afr", "Latin_Carib" = "lat", West = "wes")
#resid func
resider = function(var) {
.model = sprintf("%s ~ log_pop", var)
lm(as.formula(.model), data = d) %>%
resid() %>%
standardize()
}
#some derived vars
d %<>% mutate(
#basic
pop = pop10,
anti_isis = part %>% as.logical,
casualties = deaths + injuries,
pct_muslim = per10,
#this variable is given as 6 columns but the options are mutually exclusive, so we recode as a factor
#there are probably better ways to do this...
legal = df_subset_by_pattern(d, "legor") %>% apply(MARGIN = 1, function(row) {
names(row)[row %>% as.logical %>% which] %>% str_replace("legor_", "")
}) %>% factor,
#log versions
log_pop = log(pop),
log_pct_muslim = log(pct_muslim + 1),
log_attacks = log(attacks + 1),
log_casualties = log(casualties + 1),
log_land = log(area),
log_gdp = ln_gdppc2000,
log_milldeaths = log(mildeaths + 1)
)
#model derived vars
d %<>% mutate(
log_attacks_resid = resider("log_attacks"),
log_casualties_resid = resider("log_casualties"),
log_pct_muslim_resid = resider("log_pct_muslim")
)
#figs 1
ggplot(d, aes(log_attacks_resid, log_casualties_resid)) +
geom_smooth(method = lm, se = F, color = "black", linetype = "dotted") +
geom_point(aes(size = pop, color = region), shape = 1) +
scale_size_continuous(guide = F) +
theme_bw()
ggsave("figures/figure 1.png")
## Saving 7 x 5 in image
#figs 2
fig2a = ggplot(d, aes(log_pct_muslim_resid, log_attacks_resid)) +
geom_smooth(method = lm, se = F) +
geom_point(aes(size = pop), shape = 1) +
scale_size_continuous(guide = F) +
theme_bw()
fig2b = ggplot(d, aes(log_pct_muslim_resid, log_casualties_resid)) +
geom_smooth(method = lm, se = F) +
geom_point(aes(size = pop), shape = 1) +
scale_size_continuous(guide = F) +
theme_bw()
#combine
grid.newpage()
grid.draw(cbind(ggplotGrob(fig2a), ggplotGrob(fig2a), size = "last"))
ggsave("figures/figure 2.png")
## Saving 7 x 5 in image
The plots replicated. I added colors for improved interpretation.
The subplots show that there are probably modeling problems with the errors for the left cluster of cases.
#combinations of region and outcome
model_combos = expand.grid(region = levels(d$region),
y = c("log_attacks", "log_casualties"),
stringsAsFactors = F)
#model and plot combinations
reg_models = map(seq_along_rows(model_combos), function(row_) {
row_ = model_combos[row_, ]
#subdata
dsub = d %>% filter(region == row_$region)
#build model
model_ = sprintf("%s ~ log_pct_muslim + log_pop", row_$y)
#fit
fit = lm(model_, data = dsub) %>% MOD_summary(kfold = F)
#plot
gg = ggplot(dsub, aes_string("log_pct_muslim_resid", row_$y + "_resid")) +
geom_point(aes(size = pop), shape = 1) +
geom_smooth(method = lm, se = F) +
scale_size_continuous(guide = F) +
theme_bw()
list(
fit = fit,
plot = gg
)
})
#extract model info
reg_models_fits = map(reg_models, "fit")
data_frame(beta = map_dbl(reg_models_fits, ~.$coefs[1, 1]),
region = model_combos$region,
n = map_int(reg_models_fits, ~.$meta$N)
)
## # A tibble: 12 × 3
## beta region n
## <dbl> <chr> <int>
## 1 0.347 Africa 46
## 2 0.495 Asia 21
## 3 0.307 Eurasia 20
## 4 0.062 Latin_Carib 30
## 5 -0.215 MENA 20
## 6 0.430 West 31
## 7 0.380 Africa 46
## 8 0.491 Asia 21
## 9 0.365 Eurasia 20
## 10 0.131 Latin_Carib 30
## 11 -0.180 MENA 20
## 12 0.329 West 31
#plots
#not sure how to easily add these together to 2 compound 2x3 figures
walk(reg_models, ~.$plot %>% print)
#without israel
lm("log_attacks ~ log_pct_muslim + log_pop", data = d, subset = region == "MENA" & code != "ISR") %>% MOD_summary(kfold = F)
## Model coefficients
## Beta SE CI_lower CI_upper
## log_pct_muslim 0.044 0.28 -0.558 0.65
## log_pop 0.621 0.28 0.019 1.22
##
##
## Model meta-data
## outcome N df R2 R2-adj. R2-cv
## 1 log_attacks 19 16 0.43 0.36 NA
##
##
## Etas from analysis of variance
## Eta Eta_partial
## log_pct_muslim 0.029 0.039
## log_pop 0.413 0.480
lm("log_casualties ~ log_pct_muslim + log_pop", data = d, subset = region == "MENA" & code != "ISR") %>% MOD_summary(kfold = F)
## Model coefficients
## Beta SE CI_lower CI_upper
## log_pct_muslim 0.09 0.27 -0.485 0.66
## log_pop 0.62 0.27 0.048 1.20
##
##
## Model meta-data
## outcome N df R2 R2-adj. R2-cv
## 1 log_casualties 19 16 0.48 0.41 NA
##
##
## Etas from analysis of variance
## Eta Eta_partial
## log_pct_muslim 0.06 0.082
## log_pop 0.41 0.498
Replicated main model results.
Could not replicate sub-plot D. The others plots replicated.
Could not replicate Israel extra models.
#6 more models
basic_controls = c("log_land", "abslat", "elevavg", "rough", "legal")
additional_controls = c("log_gdp", "democ", "efrac")
#models
world_models = list(
#attacks
lm("log_attacks ~ log_pct_muslim + log_pop + region", data = d),
lm("log_attacks ~ log_pct_muslim + log_pop + region + " + str_c(basic_controls, collapse = " + "), data = d),
lm("log_attacks ~ log_pct_muslim + log_pop + region + " + str_c(c(basic_controls, additional_controls), collapse = " + "), data = d, na.action = na.exclude),
#casualties
lm("log_casualties ~ log_pct_muslim + log_pop + region", data = d),
lm("log_casualties ~ log_pct_muslim + log_pop + region + " + str_c(basic_controls, collapse = " + "), data = d),
lm("log_casualties ~ log_pct_muslim + log_pop + region + " + str_c(c(basic_controls, additional_controls), collapse = " + "), data = d)
) %>% map(MOD_summary, kfold = F)
#extract model info
data_frame(beta = map_dbl(world_models, ~.$coefs[1, 1]),
n = map_int(world_models, ~.$meta$N),
r2 = map_dbl(world_models, ~.$meta$R2)
)
## # A tibble: 6 × 3
## beta n r2
## <dbl> <int> <dbl>
## 1 0.38 168 0.56
## 2 0.35 168 0.61
## 3 0.32 156 0.63
## 4 0.39 168 0.60
## 5 0.36 168 0.64
## 6 0.33 156 0.66
World modeling results replicated.
west_models2 = list(
#attacks
lm("log_attacks ~ log_milldeaths + log_pop", data = d, subset = region == "West"),
lm("log_attacks ~ anti_isis + log_pop", data = d, subset = region == "West"),
lm("log_attacks ~ log_milldeaths + log_pct_muslim + log_pop", data = d, subset = region == "West"),
lm("log_attacks ~ anti_isis + log_pct_muslim + log_pop", data = d, subset = region == "West"),
#casualties
lm("log_casualties ~ log_milldeaths + log_pop", data = d, subset = region == "West"),
lm("log_casualties ~ anti_isis + log_pop", data = d, subset = region == "West"),
lm("log_casualties ~ log_milldeaths + log_pct_muslim + log_pop", data = d, subset = region == "West"),
lm("log_casualties ~ anti_isis + log_pct_muslim + log_pop", data = d, subset = region == "West")
) %>% map(MOD_summary, kfold = F)
#model info
west_models2 %>% map("coefs") %>% ldf_to_df(by_name = "model", rownames_to_var = T, rownames_name = "predictor")
## Beta SE CI_lower CI_upper predictor model
## 1 0.278 0.206 -0.1436 0.70 log_milldeaths 1
## 2 0.513 0.206 0.0923 0.93 log_pop 1
## 3 1.231 0.247 0.7254 1.74 anti_isis 2
## 4 0.446 0.110 0.2208 0.67 log_pop 2
## 5 0.579 0.150 0.2707 0.89 log_milldeaths 3
## 6 0.562 0.098 0.3609 0.76 log_pct_muslim 3
## 7 0.062 0.161 -0.2688 0.39 log_pop 3
## 8 0.950 0.254 0.4283 1.47 anti_isis 4
## 9 0.256 0.104 0.0429 0.47 log_pct_muslim 4
## 10 0.415 0.102 0.2063 0.62 log_pop 4
## 11 0.174 0.211 -0.2587 0.61 log_milldeaths 5
## 12 0.591 0.211 0.1578 1.02 log_pop 5
## 13 0.864 0.300 0.2490 1.48 anti_isis 6
## 14 0.527 0.134 0.2530 0.80 log_pop 6
## 15 0.399 0.193 0.0022 0.80 log_milldeaths 7
## 16 0.420 0.126 0.1608 0.68 log_pct_muslim 7
## 17 0.253 0.208 -0.1732 0.68 log_pop 7
## 18 0.630 0.327 -0.0417 1.30 anti_isis 8
## 19 0.213 0.134 -0.0605 0.49 log_pct_muslim 8
## 20 0.501 0.131 0.2324 0.77 log_pop 8
west_models2 %>% map_df(~data.frame(n = .$meta$N, r2 = .$meta$R2))
## n r2
## 1 31 0.57
## 2 31 0.76
## 3 31 0.81
## 4 31 0.80
## 5 31 0.54
## 6 31 0.64
## 7 31 0.68
## 8 31 0.67
Results for continuous predictors replicated. Numbers did not match for anti-ISIS, presumably because the author coded it incorrectly as a continuous variable when it is a categorical (2 level) variable.
Which approach is best? Using per capita as the dependent or using population as a predictor?
#per capita terrorism attacks
d %<>% mutate(
attacks_per_capita = attacks / pop,
attacks_per_capita_log = log(attacks_per_capita + 1)
)
#compare
GG_scatter(d, "attacks_per_capita", "log_attacks_resid", case_names_vector = "country")
GG_scatter(d, "attacks_per_capita_log", "log_attacks_resid", case_names_vector = "country")
#distributions
GG_denhist(d, "log_attacks_resid", vline = NULL)
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
GG_denhist(d, "attacks_per_capita", vline = NULL)
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
As can be seen, taking the log has no effect at all. The distribution of the residualization approach is fairly normal, while that for the per capita is extremely skewed and not useful for analysis.
Carl handled the issue of spatial autocorrelation indirectly by using a region fixed effects approach. Probably a better approach is to include a ‘spatially lagged’ predictor, as commonly done in economics. We can examine whether there is a need to do this by examining the residuals for SAC. We can also include the KNSN predictor in the model to see how this affects results.
#examine dependent for SAC
SAC_measures(d, "attacks")
## Factors were converted to characters.
## Morans_I knsn_3
## attacks 0.024 0.22
#examine residuals
d$world_model_3_resid = world_models[[3]]$model_obj %>% resid
d[c("lat", "lon", "world_model_3_resid")] %>% miss_filter() %>% SAC_measures("world_model_3_resid")
## Morans_I knsn_3
## world_model_3_resid 0.027 0.28
#include KNSN prediction in model
d$attacks_knsn = SAC_knsnr(d, "attacks")
## Factors were converted to characters.
spatial_model = lm("log_attacks ~ log_pct_muslim + log_pop + region + attacks_knsn + " + str_c(c(basic_controls, additional_controls), collapse = " + "), data = d, na.action = na.exclude) %>% MOD_summary(kfold = F) %T>% print
## Model coefficients
## Beta SE CI_lower CI_upper
## log_pct_muslim 0.311 0.079 0.154 0.468
## log_pop 0.381 0.084 0.215 0.546
## region: Africa 0.000 NA NA NA
## region: Asia 0.488 0.242 0.010 0.965
## region: Eurasia 0.448 0.353 -0.250 1.146
## region: Latin_Carib -0.097 0.210 -0.511 0.318
## region: MENA 1.064 0.293 0.485 1.644
## region: West 0.468 0.357 -0.239 1.175
## attacks_knsn 0.045 0.060 -0.073 0.163
## log_land 0.107 0.086 -0.063 0.276
## abslat 0.105 0.127 -0.147 0.356
## elevavg -0.047 0.080 -0.205 0.111
## rough 0.032 0.082 -0.131 0.194
## legal: fr 0.000 NA NA NA
## legal: ge -0.373 0.344 -1.054 0.307
## legal: sc 0.059 0.395 -0.723 0.840
## legal: so -0.504 0.236 -0.970 -0.038
## legal: uk 0.242 0.144 -0.042 0.526
## log_gdp -0.248 0.096 -0.438 -0.058
## democ 0.128 0.094 -0.058 0.314
## efrac 0.070 0.075 -0.079 0.219
##
##
## Model meta-data
## outcome N df R2 R2-adj. R2-cv
## 1 log_attacks 156 136 0.64 0.58 NA
##
##
## Etas from analysis of variance
## Eta Eta_partial
## log_pct_muslim 0.203 0.319
## log_pop 0.235 0.363
## region 0.218 0.339
## attacks_knsn 0.039 0.064
## log_land 0.065 0.106
## abslat 0.043 0.070
## elevavg 0.031 0.051
## rough 0.020 0.033
## legal 0.178 0.283
## log_gdp 0.134 0.216
## democ 0.070 0.116
## efrac 0.048 0.079
d$world_model_spatial_resid = spatial_model$model_obj %>% resid
d[c("world_model_spatial_resid", "lat", "lon")] %>% miss_filter %>% SAC_measures("world_model_spatial_resid")
## Morans_I knsn_3
## world_model_spatial_resid 0.025 0.24
There was only a small amount of spatial autocorrelation in the residuals. Including a spatial predictor had little effect and did not even reduce the spatial autocorrelation in the residauls.
Some discrepancies. Most importantly with the MENA Israel extra analyses and the D subplots.