library(tidyverse)
library(tidymodels)
library(glmnet)
library(leaps)
library(caret)
raw <- read.csv("C:/Users/Seoyun1009/Downloads/sambanis_imputed.csv", stringsAsFactors = FALSE)
#Sambanis 2006 88-variable specification
sambanis_88 <- c(
"warstds", "ager", "agexp", "anoc", "army85", "autch98", "auto4",
"autonomy", "avgnabo", "centpol3", "coldwar", "decade1", "decade2",
"decade3", "decade4", "dem", "dem4", "demch98", "dlang", "drel",
"durable", "ef", "ef2", "ehet", "elfo", "elfo2", "etdo4590",
"expgdp", "exrec", "fedpol3", "fuelexp", "gdpgrowth", "geo1",
"geo2", "geo34", "geo57", "geo69", "geo8", "illiteracy", "incumb",
"infant", "inst", "inst3", "life", "lmtnest", "ln_gdpen", "lpopns",
"major", "manuexp", "milper", "mirps0", "mirps1", "mirps2",
"mirps3", "nat_war", "ncontig", "nmgdp", "nmdp4_alt", "numlang",
"nwstate", "oil", "p4mchg", "parcomp", "parreg", "part",
"partfree", "plural", "plurrel", "pol4", "pol4m", "pol4sq",
"polch98", "polcomp", "popdense", "presi", "pri", "proxregc",
"ptime", "reg", "regd4_alt", "relfrac", "seceduc", "second",
"semipol3", "sip2", "sxpnew", "sxpsq", "tnatwar", "trade",
"warhist", "xconst"
)
df <- raw %>%
select(all_of(intersect(sambanis_88, names(raw)))) %>%
mutate(warstds = factor(warstds,
levels = c(0, 1),
labels = c("peace", "war")
))
dim(df)
## [1] 7140 91
table(df$warstds)
##
## peace war
## 7024 116
# Class imbalance
prop.table(table(df$warstds))
##
## peace war
## 0.9837535 0.0162465
# Only ~1.6% of country-years see a war onset. Always-predict-peace would give 98.4% accuracy. Accuracy is useless here. Use ROC-AUC instead; stratified splitting is essential.
# Train/test split (stratified)
set.seed(7)
split <- initial_split(df, prop = 0.8, strata = warstds)
tr <- training(split)
te <- testing(split)
c(train_wars = sum(tr$warstds == "war"), test_wars =sum(te$warstds == "war"))
## train_wars test_wars
## 102 14
# Caret setup: folds +trainControl objects
set.seed(7)
fold_indices <- createFolds(tr$warstds, k = 5,
list = TRUE, returnTrain = TRUE)
# Forward/backward: regsubsets is regression-only, tune on RMSE
ctrl_reg <- trainControl(method = "cv", number = 5,
index = fold_indices)
# LASSO/Ridge: classification, tune on ROC AUC
ctrl_cls <- trainControl(method = "cv", number = 5,
index = fold_indices,
classProbs = TRUE,
summaryFunction = twoClassSummary)
# Model 1: Unregularized logit ("OLS")
ols_fit <- glm(warstds ~ ., data = tr, family = binomial)
ols_prob <- predict(ols_fit, te, type = "response")
# Model 2: Forward selection (caret + CV)
y_tr_num <- as.numeric(tr$warstds == "war")
X_tr_mat <- model.matrix(~ . - warstds, tr)[, -1]
set.seed(7)
fwd_cv <- train(
x = X_tr_mat, y = y_tr_num,
method = "leapForward",
trControl = ctrl_reg,
tuneGrid = data.frame(nvmax = 1:15))
## Reordering variables and trying again:
## Reordering variables and trying again:
## Reordering variables and trying again:
## Reordering variables and trying again:
## Reordering variables and trying again:
## Reordering variables and trying again:
# Variables selected at the CV-best size
fwd_vars <- names(coef(fwd_cv$finalModel, id = fwd_cv$bestTune$nvmax))[-1]
# Refit as logit; predict on test
fwd_logit <- glm(reformulate(fwd_vars, "warstds"),
data = tr, family = binomial)
fwd_prob <- predict(fwd_logit, te, type = "response")
# Model 3: Backward selection (caret + CV)
set.seed(7)
bwd_cv <- train(
x = X_tr_mat, y = y_tr_num,
method = "leapBackward",
trControl = ctrl_reg,
tuneGrid = data.frame(nvmax = 1:15))
## Reordering variables and trying again:
## Reordering variables and trying again:
## Reordering variables and trying again:
## Reordering variables and trying again:
## Reordering variables and trying again:
## Reordering variables and trying again:
bwd_vars <- names(coef(bwd_cv$finalModel, id = bwd_cv$bestTune$nvmax))[-1]
bwd_logit <- glm(reformulate(bwd_vars, "warstds"),
data = tr, family = binomial)
bwd_prob <- predict(bwd_logit, te, type = "response")
# Model 4: LASSO logit
set.seed(7)
las_cv <- train(
warstds ~ .,
data = tr,
method = "glmnet",
trControl = ctrl_cls,
tuneGrid = expand.grid(alpha = 1, lambda = 10^seq(-4, 0, length = 50)),
metric = "ROC")
las_prob <- predict(las_cv, te, type = "prob")[, "war"]
# How many variables did LASSO keep?
sum(coef(las_cv$finalModel, s = las_cv$bestTune$lambda)[-1] != 0)
## [1] 45
# Model 5: Ridge logit
set.seed(7)
ridge_cv <- train(
warstds ~ .,
data = tr,
method = "glmnet",
trControl = ctrl_cls,
tuneGrid = expand.grid(alpha = 0, lambda = 10^seq(-4, 0, length = 50)),
metric = "ROC")
ridge_prob <- predict(ridge_cv, te, type = "prob")[, "war"]
#Compare all five — test-set AUC
res <- bind_rows(
tibble(model = "1. OLS (unregularized)", warstds = te$warstds, prob = ols_prob),
tibble(model = "2. Forward selection (CV)", warstds = te$warstds, prob = fwd_prob),
tibble(model = "3. Backward selection (CV)", warstds = te$warstds, prob = bwd_prob),
tibble(model = "4. LASSO (CV)", warstds = te$warstds, prob = las_prob),
tibble(model = "5. Ridge (CV)", warstds = te$warstds, prob = ridge_prob))
res %>%
group_by(model) %>%
roc_auc(warstds, prob, event_level = "second") %>%
select(model, .estimate) %>%
arrange(desc(.estimate))
# ROC curves
res %>%
group_by(model) %>%
roc_curve(warstds, prob, event_level = "second") %>%
autoplot()
interpret
The findings of this analysis demonstrate that civil war is not merely a spontaneous political event, but rather the culmination of economic and social structural vulnerabilities. The ability of machine learning algorithms to predict civil war onset at a significant level (AUC 0.869)—even after controlling for multiple variables from Sambanis’s (2006) 88-variable specification—suggests that structural factors such as primary commodity export dependence (sxpsq), low economic growth (gdpgrowth), and new state formation (nwstate) act as robust leading indicators of civil conflict
reference
Three papers, one debate Fearon & Laitin (2003),APSR Sambanis (2006),Understanding Civil War Muchlinski, Siroky, He & Kocher (2016),Political Analysis