Large-scale atrocities

Sara Altman

library(tidyverse)
library(googlesheets)
library(ggrepel)
library(stringr)
library(ggrepel)

#params
data_url <- "https://docs.google.com/spreadsheets/d/1N9mMdh2X47AfmDJ4XEw8CDe5CL5O6mfI3KPnzERaplk/edit#gid=1335230579"
pop_url <- "https://docs.google.com/spreadsheets/d/1N9mMdh2X47AfmDJ4XEw8CDe5CL5O6mfI3KPnzERaplk/edit#gid=798338106"

column_names <- as.character(seq(from = -500, to = 1900, by = 100))

international_wars <- c("Second Persian War", "Alexander the Great", "First Punic War", "Second Punic War", "Third Mithridatic War", "Justinian", "Goguryeo-Sui Wars", "Crusades", "Chinggis Khan", "Hulagu’s Invasion", "Bahmani-Vijayanagara War", "Timur", "Russo-Tatar War", "Burma-Siam Wars", "Great Turkish War", "War of the Spanish Succession", "Great Northern War", "War of the Austrian Succession", "Sino-Dzungar War", "Seven Years War", "Napoleonic Wars", "Crimean War", "War of the Triple Alliance", "Franco-Prussian War", "Russo-Turkish War", "First World War", "Greco-Turkish War", "Second World War", "Korean War", "Iran-Iraq War")
despots <- c("Qin Shi Huang Di", "Aurangzeb", "Peter the Great", "Shaka", "Joseph Stalin", "Mao Zedong", "North Korea", "Idi Amin", "Democratic Kampuchea", "Saddam Hussein")
failed_states <- c("Age of Warring States", "Xin Dynasty", "The Three Kingdoms of China", "Fall of the Western Roman Empire", "Mayan Collapse", "Fall of the Yuan Dynasty", "Hundred Years War", "The Time of Troubles", "Thirty Years' War", "Fall of the Ming Dynasty", "Mexican Revolution", "Russian Civil War", "Chinese Civil War", "Soviet-Afghan War", "Somalian Chaos", "Second Congo War")
oppression <- c("Gladiatorial Games", "Mideast Slave Trade", "Atlantic Slave Trade",  "Aztec Human Sacrifice", "Conquest of the Americas", "Congo Free State", "Famines in British India", "Expulsion of Germans from Eastern Europe", "Postwar Vietnam", "Sanctions against Iraq")
colonial_wars <- c("Gallic War", "Roman-Jewish Wars", "Chinese Conquest of Vietnam", "Cromwell's Invasion of Ireland", "Haitian Slave Revolt", "Mexican War of Independence", "French Conquest of Algeria", "Mahdi Revolt", "Cuban Revolution", "Italo-Ethiopian War", "French Indochina War", "Algerian War of Independence", "Bengali Genocide")
civil_wars <- c("Roman Slave Wars", "War of the Allies", "An Lushan Rebellion", "Fang La Rebellion", "Albigensian Crusade", "French Wars of Religion", "Taiping Rebellion", "American Civil War", "Panthay Rebellion", "Hui Rebellion", "Spanish Civil War", "Partition of India", "Indonesian Purge", "Vietnam War", "Biafran War", "War in the Sudan", "Ugandan Bush War", "Mengistu Haile", "Mozambican Civil War", "Angolan Civil War", "Rwandan Genocide")

#answer params--don't include in questions
cols_deaths <- "R:AP"
cols_atrocities <- "A:N" 

europe <- c("Second Persian War", "First Punic War", "Second Punic War", "Third Mithridatic War", "Justinian", "Russo-Tatar War","Great Turkish War", "War of the Spanish Succession", "Great Northern War", "War of the Austrian Succession", "Seven Years War", "Napoleonic Wars", "Crimean War", "Franco-Prussian War", "Russo-Turkish War", "First World War", "The Time of Troubles", "Gladiatorial Games", "Expulsion of Germans from Eastern Europe", "Fall of the Western Roman Empire", "Hundred Years War", "Russian Civil War", "Peter the Great", "Joseph Stalin", "Gallic War", "Cromwell's Invasion of Ireland", "Spanish Civil War", "French Wars of Religion", "Albigensian Crusade", "Roman Slave Wars", "War of the Allies")
asia <- c("Alexander the Great", "Goguryeo-Sui Wars", "Crusades", "Chinggis Khan", "Hulagu’s Invasion", "Bahmani-Vijayanagara War", "Timur", "Burma-Siam Wars", "Sino-Dzungar War", "Greco-Turkish War", "Korean War", "Iran-Iraq War", "Famines in British India", "Postwar Vietnam", "Sanctions against Iraq", "Soviet-Afghan War", "Chinese Civil War", "Fall of the Ming Dynasty", "Fall of the Yuan Dynasty", "The Three Kingdoms of China", "Xin Dynasty", "Age of Warring States", "Qin Shi Huang Di", "Aurangzeb", "Mao Zedong", "North Korea", "Democratic Kampuchea", "Saddam Hussein", "Bengali Genocide", "French Indochina War", "Chinese Conquest of Vietnam", "Roman-Jewish Wars", "An Lushan Rebellion", "Fang La Rebellion", "Taiping Rebellion", "Panthay Rebellion", "Hui Rebellion", "Partition of India", "Indonesian Purge", "Vietnam War")
americas <- c("War of the Triple Alliance", "Conquest of the Americas", "Aztec Human Sacrifice", "Mayan Collapse", "Mexican Revolution", "Haitian Slave Revolt", "Mexican War of Independence", "Cuban Revolution", "American Civil War")
africa <- c("Congo Free State", "Atlantic Slave Trade", "Mideast Slave Trade", "Somalian Chaos", "Second Congo War", "Shaka", "Idi Amin", "French Conquest of Algeria", "Mahdi Revolt", "Italo-Ethiopian War", "Algerian War of Independence", "Rwandan Genocide", "Angolan Civil War", "Mozambican Civil War", "Mengistu Haile", "Ugandan Bush War", "War in the Sudan", "Biafran War")
multiple <- c("Second World War")

fill_continent_colors <- c("#BF9488", 
                 "#C1BAA9",
                 "#E5CAA2",
                 "#B3C0C5",
                 "#C6C5C3")

This challenge looks at large-scale atrocities over time. The data set is from The Great Book of Horrible Things by Matthew White. The Google Sheets data was compiled by Holden Karnofsky as part of his analysis for this blog post: http://blog.givewell.org/2015/07/08/has-violence-declined-when-large-scale-atrocities-are-systematically-included/#Sec2, which I recommend reading. Bill Marsh used this same data set to create a visualization for the New York Times (http://www.nytimes.com/imagepages/2011/11/06/opinion/06atrocities_timeline.html). Part of this challenge involves recreating this visualization.

Read in data from googlesheets

q1.1 Create agooglesheet object from the Google Sheets data set found at data_url.

sheet <- gs_url(x = data_url)
## Sheet-identifying info appears to be a browser URL.
## googlesheets will attempt to extract sheet key from the URL.

## Putative key: 1N9mMdh2X47AfmDJ4XEw8CDe5CL5O6mfI3KPnzERaplk

## Sheet successfully identified: "Atrocities analysis"

q1.2 The original data is far from tidy. Read in the names, start dates, and end dates of all the atrocities. These can be found in the third quadrant. We only want the first, second-to-last, and last columns. Call the resulting tibble atrocities.

Hint: look at gs_read.

atrocities <- 
  sheet %>% 
  gs_read(range = cell_cols(cols_atrocities), skip = 18, col_names = FALSE) %>% 
  select(atrocity = 1, 
         start_year = 13,
         end_year = 14)
## Accessing worksheet titled 'Main sheet'.

## Parsed with column specification:
## cols(
##   X1 = col_character(),
##   X2 = col_character(),
##   X3 = col_number(),
##   X4 = col_character(),
##   X5 = col_character(),
##   X6 = col_character(),
##   X7 = col_character(),
##   X8 = col_integer(),
##   X9 = col_character(),
##   X10 = col_character(),
##   X11 = col_integer(),
##   X12 = col_character(),
##   X13 = col_integer(),
##   X14 = col_integer()
## )

q1.3 Now, read in the fourth quadrant up until (and including) column AP. Each column represents a 100-year period of time. The values represent the number of deaths due to the atrocity listed in column A. Set col_names equal to columns and keep verbose equal to TRUE (Don't worry--the column names won't stay numbers for long). Call this tibble deaths.

deaths <- gs_read(sheet, range = cell_cols(cols_deaths), 
                  skip = 18, col_names = column_names)
## Accessing worksheet titled 'Main sheet'.

## Parsed with column specification:
## cols(
##   .default = col_number(),
##   `1600` = col_character()
## )

## See spec(...) for full column specifications.

q1.4 Inspect each tibble and look for anything unusual. Fix any problems you see.

summary(deaths)
##       -500             -400             -300              -200       
##  Min.   :     0   Min.   :     0   Min.   :      0   Min.   :     0  
##  1st Qu.:     0   1st Qu.:     0   1st Qu.:      0   1st Qu.:     0  
##  Median :     0   Median :     0   Median :      0   Median :     0  
##  Mean   :  7412   Mean   : 10882   Mean   :  29606   Mean   : 10312  
##  3rd Qu.:     0   3rd Qu.:     0   3rd Qu.:      0   3rd Qu.:     0  
##  Max.   :441176   Max.   :588235   Max.   :1000000   Max.   :531250  
##       -100              0                 100              200         
##  Min.   :     0   Min.   :       0   Min.   :     0   Min.   :      0  
##  1st Qu.:     0   1st Qu.:       0   1st Qu.:     0   1st Qu.:      0  
##  Median :     0   Median :       0   Median :     0   Median :      0  
##  Mean   : 23688   Mean   :  106700   Mean   : 11702   Mean   :  41098  
##  3rd Qu.:     0   3rd Qu.:       0   3rd Qu.:     0   3rd Qu.:      0  
##  Max.   :700000   Max.   :10000000   Max.   :500000   Max.   :3609783  
##       300              400               500              600         
##  Min.   :     0   Min.   :      0   Min.   :     0   Min.   :      0  
##  1st Qu.:     0   1st Qu.:      0   1st Qu.:     0   1st Qu.:      0  
##  Median :     0   Median :      0   Median :     0   Median :      0  
##  Mean   : 10738   Mean   :  66062   Mean   :  8300   Mean   :  19299  
##  3rd Qu.:     0   3rd Qu.:      0   3rd Qu.:     0   3rd Qu.:      0  
##  Max.   :573770   Max.   :6426230   Max.   :750000   Max.   :1409931  
##       700                800               900               1000        
##  Min.   :       0   Min.   :      0   Min.   :      0   Min.   :      0  
##  1st Qu.:       0   1st Qu.:      0   1st Qu.:      0   1st Qu.:      0  
##  Median :       0   Median :      0   Median :      0   Median :      0  
##  Mean   :  145075   Mean   :  22575   Mean   :  15075   Mean   :  15003  
##  3rd Qu.:       0   3rd Qu.:      0   3rd Qu.:      0   3rd Qu.:      0  
##  Max.   :13000000   Max.   :1424172   Max.   :1424172   Max.   :1424172  
##       1100              1200               1300               1400        
##  Min.   :      0   Min.   :       0   Min.   :       0   Min.   :      0  
##  1st Qu.:      0   1st Qu.:       0   1st Qu.:       0   1st Qu.:      0  
##  Median :      0   Median :       0   Median :       0   Median :      0  
##  Mean   :  49470   Mean   :  446252   Mean   :  254755   Mean   :  99024  
##  3rd Qu.:      0   3rd Qu.:       0   3rd Qu.:       0   3rd Qu.:      0  
##  Max.   :2000000   Max.   :40000000   Max.   :14166667   Max.   :2833333  
##       1500             1600                1700              1800         
##  Min.   :      0   Length:100         Min.   :      0   Min.   :       0  
##  1st Qu.:      0   Class :character   1st Qu.:      0   1st Qu.:       0  
##  Median :      0   Mode  :character   Median :      0   Median :       0  
##  Mean   : 148455                      Mean   : 263238   Mean   :  561150  
##  3rd Qu.:      0                      3rd Qu.:      0   3rd Qu.:       0  
##  Max.   :4494382                      Max.   :8939799   Max.   :20000000  
##       1900         
##  Min.   :       0  
##  1st Qu.:       0  
##  Median :       0  
##  Mean   : 2022960  
##  3rd Qu.:  506250  
##  Max.   :66000000
deaths <-
  deaths %>% 
  mutate(`1600` = ifelse(typeof(`1600`) == "character", 0, `1600`))

q1.5 Bind atrocities and deaths. Tidy the data and name the resulting tibble atrocities_tidy. Create a variable type that represents the type of an atrocity (you can use the vectors of types given in the parameters section).

Note: You're probably going to end up with a lot of rows where the number of deaths is 0. I filtered them out so that there would be fewer rows and the tibble would be more interpretable, but it's not strictly necessary.

find_type <- function(atrocity) {
  if (atrocity %in% colonial_wars) {return("colonial war")}
  if (atrocity %in% international_wars) {return("international war")} 
  if (atrocity %in% oppression) {return("institutional oppression")}
  if (atrocity %in% failed_states) {return("failed states")}
  if (atrocity %in% civil_wars) {return("civil war")}
  if (atrocity %in% despots) {return("despots")}
  else {return("NA")}
}

atrocities_tidy <- 
  atrocities %>% 
  bind_cols(deaths) %>% 
  gather(key = "century", value = "deaths", 4:28, convert = TRUE) %>% 
  filter(!(near(deaths, 0))) %>% 
  mutate(type = map_chr(atrocity, find_type))

Atrocities

q2 Let's look at atrocities_tidy. Construct a couple plots that visualize the information. Look at the outliers. What can you conclude?

atrocities_tidy <-
  atrocities_tidy %>% 
  mutate(length = end_year - start_year) 

to_label <-
  atrocities_tidy %>% 
  group_by(type) %>% 
  distinct(atrocity, start_year, length) %>% 
  top_n(length, n = 2)
  
  
atrocities_tidy %>% 
  group_by(atrocity) %>% 
  ggplot(aes(start_year, length)) +
  geom_point() +
  geom_smooth(method = loess, se = FALSE) +
  geom_label_repel(aes(label = atrocity), size = 2, color = "black", nudge_y = 25, data = to_label) +
  labs(x = "Starting year",
       y = "Length in years",
       title = "Atrocity length in years by starting year") +
  facet_wrap(~type)

- Most of the major outliers in terms of length are not wars. - The length of non-war atrocities appears to have decreased over time after around 500, with a bump around 1500. - The longest atrocity was the Mideast Slave Trade, followed by the gladiatorial games. - Civil war lengths haven't changed much across 2000 years. Neither have international war lengths, besides the bump around the crusades. - The decline in length of atrocities due to instutional oppression is the most dramatic.

atrocities_tidy %>% 
  group_by(atrocity, start_year, end_year, length, type, century) %>% 
  summarise(total = sum(deaths)) %>% 
  group_by(century) %>% 
  filter(near(total, max(total, na.rm = TRUE))) %>% 
  ggplot(aes(century, total)) +
  geom_point(aes(color = type, size = length)) +
  geom_text_repel(aes(label = atrocity), size = 2) +
  scale_y_log10() +
  scale_color_discrete(name = "Atrocity type") +
  labs(title = "Deaths from the worst atrocity by century",
       y = "Total deaths",
       x = "Century")

Population data

q3.1 There's a limit to the conclusions we can draw without population data. You can find population data on the population sheet. Read in the data from the population sheet and store in a tibble. You only need the columns for year and population.

population <- 
  gs_read(sheet, ws = 3 , range = cell_cols("F:K"), 
                  skip = 11, col_names = FALSE, verbose = FALSE) %>% 
  select(year = X1,
         population = X6)
## Parsed with column specification:
## cols(
##   X1 = col_integer(),
##   X2 = col_integer(),
##   X3 = col_integer(),
##   X4 = col_number(),
##   X5 = col_double(),
##   X6 = col_double()
## )

q3.2 Make a quick plot of the population data to check that it makes sense.

population %>% 
  ggplot(aes(year, population)) +
  geom_line() 

q3.3 Now, join the population data with atrocities_tidy. Store the result in a new tibble. Think about how you want to deal with the differences in population across the time span of an atrocity.

atrocities_pop <-
  atrocities_tidy %>% 
  left_join(population, by = c("start_year" = "year")) %>% 
  rename(population_start = population) %>% 
  left_join(population, by = c("end_year" = "year")) %>% 
  rename(population_end = population)

q3.4 To recreate the plot, we're going to need a variable total that represents the total number of deaths due to an atrocity, a variable percent_killed that represents the percent of the world's total population killed in the atrocity, a variable mean_year that represents the median year of the atrocity, and a variable percent_killed that represents the percentage of the population killed by the atrocity. Create these variables.

find_continent <- function(atrocity) {
  if (atrocity %in% europe) {return("europe")}
  if (atrocity %in% africa) {return("africa")}
  if (atrocity %in% asia) {return("asia")}
  if (atrocity %in% americas) {return("americas")}
  if (atrocity %in% multiple) {return("multiple")}
  else {return("NA")}
}

by_atrocity <-
  atrocities_pop %>% 
  group_by(atrocity, start_year, end_year, length, 
           population_start, population_end, type) %>%
  summarise(total_deaths = sum(deaths, na.rm = TRUE)) %>% 
  mutate(mean_year = mean(c(start_year, end_year), na.rm = TRUE),
         mean_population = mean(c(population_start, population_end), na.rm = TRUE),
         percent_killed = (total_deaths / mean_population)*100,
         continent = map_chr(atrocity, find_continent))

Deaths from atrocities over time

For this question, recreate (or improve upon) this plot from the New York Times: http://www.nytimes.com/imagepages/2011/11/06/opinion/06atrocities_timeline.html.

q3.4 Recreate the New York Times plot. This doesn't have to be exact--the original author didn't use R to create the visualization.

by_atrocity %>% 
  mutate(type = factor(type, level = c("international war", 
                                       "institutional oppression", 
                                       "failed states", 
                                       "despots", 
                                       "colonial war", 
                                       "civil war")),
         continent_new = factor(continent, level = 
                                  c("europe", "asia", "americas", 
                                    "africa", "multiple"))) %>% 
  ggplot(aes(mean_year, (end_year %% 250)*3, 
             size = total_deaths, color = continent)) +
  geom_point(alpha = .6) +
  geom_text(aes(label = atrocity), size = 3, color = "black",
            hjust = "left", vjust = "bottom", nudge_x = 30) +
  geom_segment(aes(x = start_year, xend = end_year, 
                   y = (end_year %% 250)*3, yend = (end_year %% 250)*3), 
               size = .1, inherit.aes = FALSE) +
  geom_vline(xintercept = 0, linetype = "dotted") +
  geom_vline(xintercept = 1000, linetype = "dotted") +
  facet_grid(type~., scales = "free_y") +
  scale_x_continuous(breaks = seq(-400, 2000, 200), position = "top") +
  scale_y_continuous(trans = "reverse", labels = NULL) +
  scale_color_manual(values = fill_continent_colors) +
  scale_radius(range = c(4, 24)) +
  coord_cartesian(ylim = c(-100, 800)) +
  theme_minimal() +
  guides(size = FALSE) +
  theme_minimal() +
  theme(
   # aspect.ratio = .15,
    panel.grid.major.x = element_line(linetype = "dotted"),
    panel.grid.minor.x = element_line(linetype = "dotted"),
    panel.grid.major.y = element_blank(),
    panel.grid.minor.y = element_blank(),
    strip.text = element_blank(),
    axis.text.x = element_text(size = 7)
  ) +
  labs(x = NULL,
       y = NULL)