Predicting Civil War Onset_(sup

library(tidyverse)
library(tidymodels)
library(glmnet)
library(leaps)
library(caret)

raw <- read.csv("C:/Users/Seoyun1009/Downloads/sambanis_imputed.csv", stringsAsFactors = FALSE)

#Sambanis 2006 88-variable specification
sambanis_88 <- c(
  "warstds", "ager", "agexp", "anoc", "army85", "autch98", "auto4",
  "autonomy", "avgnabo", "centpol3", "coldwar", "decade1", "decade2",
  "decade3", "decade4", "dem", "dem4", "demch98", "dlang", "drel",
  "durable", "ef", "ef2", "ehet", "elfo", "elfo2", "etdo4590",
  "expgdp", "exrec", "fedpol3", "fuelexp", "gdpgrowth", "geo1",
  "geo2", "geo34", "geo57", "geo69", "geo8", "illiteracy", "incumb",
  "infant", "inst", "inst3", "life", "lmtnest", "ln_gdpen", "lpopns",
  "major", "manuexp", "milper", "mirps0", "mirps1", "mirps2",
  "mirps3", "nat_war", "ncontig", "nmgdp", "nmdp4_alt", "numlang",
  "nwstate", "oil", "p4mchg", "parcomp", "parreg", "part",
  "partfree", "plural", "plurrel", "pol4", "pol4m", "pol4sq",
  "polch98", "polcomp", "popdense", "presi", "pri", "proxregc",
  "ptime", "reg", "regd4_alt", "relfrac", "seceduc", "second",
  "semipol3", "sip2", "sxpnew", "sxpsq", "tnatwar", "trade",
  "warhist", "xconst"
)

df <- raw %>%
  select(all_of(intersect(sambanis_88, names(raw)))) %>%
  mutate(warstds = factor(warstds,
    levels = c(0, 1),
    labels = c("peace", "war")
  ))

dim(df)

## [1] 7140   91

table(df$warstds)

## 
## peace   war 
##  7024   116

# Class imbalance 

prop.table(table(df$warstds))

## 
##     peace       war 
## 0.9837535 0.0162465

# Only ~1.6% of country-years see a war onset. Always-predict-peace would give 98.4% accuracy. Accuracy is useless here. Use ROC-AUC instead; stratified splitting is essential.

# Train/test split (stratified) 
set.seed(7)
split <- initial_split(df, prop = 0.8, strata = warstds)
tr <- training(split)
te <- testing(split)
c(train_wars = sum(tr$warstds == "war"), test_wars  =sum(te$warstds == "war"))

## train_wars  test_wars 
##        102         14

# Caret setup: folds +trainControl objects
set.seed(7)
fold_indices <- createFolds(tr$warstds, k = 5,
                            list = TRUE, returnTrain = TRUE)


# Forward/backward: regsubsets is regression-only, tune on RMSE
ctrl_reg <- trainControl(method = "cv", number = 5,
                         index = fold_indices)


# LASSO/Ridge: classification, tune on ROC AUC
ctrl_cls <- trainControl(method = "cv", number = 5,
                         index = fold_indices,
                         classProbs = TRUE,
                         summaryFunction = twoClassSummary)

# Model 1: Unregularized logit ("OLS")
ols_fit <- glm(warstds ~ ., data = tr, family = binomial)
ols_prob <- predict(ols_fit, te, type = "response")

# Model 2: Forward selection (caret + CV)
y_tr_num <- as.numeric(tr$warstds == "war")
X_tr_mat <- model.matrix(~ . - warstds, tr)[, -1]
set.seed(7)
fwd_cv <- train(
  x = X_tr_mat, y = y_tr_num,
  method = "leapForward",
  trControl = ctrl_reg,
  tuneGrid = data.frame(nvmax = 1:15))

## Reordering variables and trying again:

## Reordering variables and trying again:

## Reordering variables and trying again:

## Reordering variables and trying again:

## Reordering variables and trying again:

## Reordering variables and trying again:

# Variables selected at the CV-best size
fwd_vars <- names(coef(fwd_cv$finalModel, id = fwd_cv$bestTune$nvmax))[-1]

# Refit as logit; predict on test
fwd_logit <- glm(reformulate(fwd_vars, "warstds"),
  data = tr, family = binomial)
fwd_prob <- predict(fwd_logit, te, type = "response")

# Model 3: Backward selection (caret + CV) 
set.seed(7)
bwd_cv <- train(
  x = X_tr_mat, y = y_tr_num,
  method = "leapBackward",
  trControl = ctrl_reg,
  tuneGrid = data.frame(nvmax = 1:15))

## Reordering variables and trying again:

## Reordering variables and trying again:

## Reordering variables and trying again:

## Reordering variables and trying again:

## Reordering variables and trying again:

## Reordering variables and trying again:

bwd_vars <- names(coef(bwd_cv$finalModel, id = bwd_cv$bestTune$nvmax))[-1]

bwd_logit <- glm(reformulate(bwd_vars, "warstds"),
  data = tr, family = binomial)
bwd_prob <- predict(bwd_logit, te, type = "response")

# Model 4: LASSO logit 
set.seed(7)
las_cv <- train(
  warstds ~ .,
  data = tr,
  method = "glmnet",
  trControl = ctrl_cls,
  tuneGrid = expand.grid(alpha = 1, lambda = 10^seq(-4, 0, length = 50)),
  metric = "ROC")
las_prob <- predict(las_cv, te, type = "prob")[, "war"]


# How many variables did LASSO keep?
sum(coef(las_cv$finalModel, s = las_cv$bestTune$lambda)[-1] != 0)

## [1] 45

# Model 5: Ridge logit
set.seed(7)
ridge_cv <- train(
  warstds ~ .,
  data = tr,
  method = "glmnet",
  trControl = ctrl_cls,
  tuneGrid = expand.grid(alpha = 0, lambda = 10^seq(-4, 0, length = 50)),
  metric = "ROC")


ridge_prob <- predict(ridge_cv, te, type = "prob")[, "war"]

#Compare all five — test-set AUC
res <- bind_rows(
  tibble(model = "1. OLS (unregularized)",     warstds = te$warstds, prob = ols_prob),
  tibble(model = "2. Forward selection (CV)",  warstds = te$warstds, prob = fwd_prob),
  tibble(model = "3. Backward selection (CV)", warstds = te$warstds, prob = bwd_prob),
  tibble(model = "4. LASSO (CV)",              warstds = te$warstds, prob = las_prob),
  tibble(model = "5. Ridge (CV)",              warstds = te$warstds, prob = ridge_prob))
res %>%
  group_by(model) %>%
  roc_auc(warstds, prob, event_level = "second") %>%
  select(model, .estimate) %>%
  arrange(desc(.estimate))

# ROC curves
res %>%
  group_by(model) %>%
  roc_curve(warstds, prob, event_level = "second") %>%
  autoplot()

interpret

The findings of this analysis demonstrate that civil war is not merely a spontaneous political event, but rather the culmination of economic and social structural vulnerabilities. The ability of machine learning algorithms to predict civil war onset at a significant level (AUC 0.869)—even after controlling for multiple variables from Sambanis’s (2006) 88-variable specification—suggests that structural factors such as primary commodity export dependence (sxpsq), low economic growth (gdpgrowth), and new state formation (nwstate) act as robust leading indicators of civil conflict

reference

Three papers, one debate Fearon & Laitin (2003),APSR Sambanis (2006),Understanding Civil War Muchlinski, Siroky, He & Kocher (2016),Political Analysis

Predicting Civil War Onset_(sup_learning)

Chae seoyun

2026-06-01