library(tidyverse)
library(googlesheets)
library(ggrepel)
library(stringr)
library(ggrepel)

#params
data_url <- "https://docs.google.com/spreadsheets/d/1N9mMdh2X47AfmDJ4XEw8CDe5CL5O6mfI3KPnzERaplk/edit#gid=1335230579"
pop_url <- "https://docs.google.com/spreadsheets/d/1N9mMdh2X47AfmDJ4XEw8CDe5CL5O6mfI3KPnzERaplk/edit#gid=798338106"

column_names <- as.character(seq(from = -500, to = 1900, by = 100))

international_wars <- c("Second Persian War", "Alexander the Great", "First Punic War", "Second Punic War", "Third Mithridatic War", "Justinian", "Goguryeo-Sui Wars", "Crusades", "Chinggis Khan", "Hulagu’s Invasion", "Bahmani-Vijayanagara War", "Timur", "Russo-Tatar War", "Burma-Siam Wars", "Great Turkish War", "War of the Spanish Succession", "Great Northern War", "War of the Austrian Succession", "Sino-Dzungar War", "Seven Years War", "Napoleonic Wars", "Crimean War", "War of the Triple Alliance", "Franco-Prussian War", "Russo-Turkish War", "First World War", "Greco-Turkish War", "Second World War", "Korean War", "Iran-Iraq War")
despots <- c("Qin Shi Huang Di", "Aurangzeb", "Peter the Great", "Shaka", "Joseph Stalin", "Mao Zedong", "North Korea", "Idi Amin", "Democratic Kampuchea", "Saddam Hussein")
failed_states <- c("Age of Warring States", "Xin Dynasty", "The Three Kingdoms of China", "Fall of the Western Roman Empire", "Mayan Collapse", "Fall of the Yuan Dynasty", "Hundred Years War", "The Time of Troubles", "Thirty Years' War", "Fall of the Ming Dynasty", "Mexican Revolution", "Russian Civil War", "Chinese Civil War", "Soviet-Afghan War", "Somalian Chaos", "Second Congo War")
oppression <- c("Gladiatorial Games", "Mideast Slave Trade", "Atlantic Slave Trade",  "Aztec Human Sacrifice", "Conquest of the Americas", "Congo Free State", "Famines in British India", "Expulsion of Germans from Eastern Europe", "Postwar Vietnam", "Sanctions against Iraq")
colonial_wars <- c("Gallic War", "Roman-Jewish Wars", "Chinese Conquest of Vietnam", "Cromwell's Invasion of Ireland", "Haitian Slave Revolt", "Mexican War of Independence", "French Conquest of Algeria", "Mahdi Revolt", "Cuban Revolution", "Italo-Ethiopian War", "French Indochina War", "Algerian War of Independence", "Bengali Genocide")
civil_wars <- c("Roman Slave Wars", "War of the Allies", "An Lushan Rebellion", "Fang La Rebellion", "Albigensian Crusade", "French Wars of Religion", "Taiping Rebellion", "American Civil War", "Panthay Rebellion", "Hui Rebellion", "Spanish Civil War", "Partition of India", "Indonesian Purge", "Vietnam War", "Biafran War", "War in the Sudan", "Ugandan Bush War", "Mengistu Haile", "Mozambican Civil War", "Angolan Civil War", "Rwandan Genocide")

#answer params--don't include in questions
cols_deaths <- "R:AP"
cols_atrocities <- "A:N" 

europe <- c("Second Persian War", "First Punic War", "Second Punic War", "Third Mithridatic War", "Justinian", "Russo-Tatar War","Great Turkish War", "War of the Spanish Succession", "Great Northern War", "War of the Austrian Succession", "Seven Years War", "Napoleonic Wars", "Crimean War", "Franco-Prussian War", "Russo-Turkish War", "First World War", "The Time of Troubles", "Gladiatorial Games", "Expulsion of Germans from Eastern Europe", "Fall of the Western Roman Empire", "Hundred Years War", "Russian Civil War", "Peter the Great", "Joseph Stalin", "Gallic War", "Cromwell's Invasion of Ireland", "Spanish Civil War", "French Wars of Religion", "Albigensian Crusade", "Roman Slave Wars", "War of the Allies")
asia <- c("Alexander the Great", "Goguryeo-Sui Wars", "Crusades", "Chinggis Khan", "Hulagu’s Invasion", "Bahmani-Vijayanagara War", "Timur", "Burma-Siam Wars", "Sino-Dzungar War", "Greco-Turkish War", "Korean War", "Iran-Iraq War", "Famines in British India", "Postwar Vietnam", "Sanctions against Iraq", "Soviet-Afghan War", "Chinese Civil War", "Fall of the Ming Dynasty", "Fall of the Yuan Dynasty", "The Three Kingdoms of China", "Xin Dynasty", "Age of Warring States", "Qin Shi Huang Di", "Aurangzeb", "Mao Zedong", "North Korea", "Democratic Kampuchea", "Saddam Hussein", "Bengali Genocide", "French Indochina War", "Chinese Conquest of Vietnam", "Roman-Jewish Wars", "An Lushan Rebellion", "Fang La Rebellion", "Taiping Rebellion", "Panthay Rebellion", "Hui Rebellion", "Partition of India", "Indonesian Purge", "Vietnam War")
americas <- c("War of the Triple Alliance", "Conquest of the Americas", "Aztec Human Sacrifice", "Mayan Collapse", "Mexican Revolution", "Haitian Slave Revolt", "Mexican War of Independence", "Cuban Revolution", "American Civil War")
africa <- c("Congo Free State", "Atlantic Slave Trade", "Mideast Slave Trade", "Somalian Chaos", "Second Congo War", "Shaka", "Idi Amin", "French Conquest of Algeria", "Mahdi Revolt", "Italo-Ethiopian War", "Algerian War of Independence", "Rwandan Genocide", "Angolan Civil War", "Mozambican Civil War", "Mengistu Haile", "Ugandan Bush War", "War in the Sudan", "Biafran War")
multiple <- c("Second World War")

fill_continent_colors <- c("#BF9488", 
                 "#C1BAA9",
                 "#E5CAA2",
                 "#B3C0C5",
                 "#C6C5C3")

Read in data from googlesheets

q1.1 Create agooglesheet object from the Google Sheets data set found at data_url.

sheet <- gs_url(x = data_url)

q1.2 The original data is far from tidy. Read in the names, start dates, and end dates of all the atrocities. These can be found in the third quadrant. We only want the first, second-to-last, and last columns. Call the resulting tibble atrocities.

atrocities <- 
  sheet %>% 
  gs_read(range = cell_cols(cols_atrocities), skip = 18, col_names = FALSE) %>% 
  select(atrocity = 1, 
         start_year = 13,
         end_year = 14)

q1.3 Now, read in the fourth quadrant up until (and including) column AP. Each column represents a 100-year period of time. The values represent the number of deaths due to the atrocity listed in column A. Set col_names equal to columns and keep verbose equal to TRUE (Don't worry--the column names won't stay numbers for long). Call this tibble deaths.

deaths <- gs_read(sheet, range = cell_cols(cols_deaths), 
                  skip = 18, col_names = column_names)

q1.4 Inspect each tibble and look for anything unusual. Fix any problems you see.

deaths <-
  deaths %>% 
  mutate(`1600` = ifelse(typeof(`1600`) == "character", 0, `1600`))

q1.5 Bind atrocities and deaths. Tidy the data and name the resulting tibble atrocities_tidy. Create a variable type that represents the type of an atrocity (you can use the vectors of types given in the parameters section).

Note: You're probably going to end up with a lot of rows where the number of deaths is 0. I filtered them out so that there would be fewer rows and the tibble would be more interpretable, but it's not strictly necessary.

find_type <- function(atrocity) {
  if (atrocity %in% colonial_wars) {return("colonial war")}
  if (atrocity %in% international_wars) {return("international war")} 
  if (atrocity %in% oppression) {return("institutional oppression")}
  if (atrocity %in% failed_states) {return("failed states")}
  if (atrocity %in% civil_wars) {return("civil war")}
  if (atrocity %in% despots) {return("despots")}
  else {return("NA")}
}

atrocities_tidy <- 
  atrocities %>% 
  bind_cols(deaths) %>% 
  gather(key = "century", value = "deaths", 4:28, convert = TRUE) %>% 
  filter(!(near(deaths, 0))) %>% 
  mutate(type = map_chr(atrocity, find_type))

Atrocities

q2 Let's look at atrocities_tidy. Construct a couple plots that visualize the information. Look at the outliers. What can you conclude?

atrocities_tidy <-
  atrocities_tidy %>% 
  mutate(length = end_year - start_year) 

to_label <-
  atrocities_tidy %>% 
  group_by(type) %>% 
  distinct(atrocity, start_year, length) %>% 
  top_n(length, n = 2)
  
  
atrocities_tidy %>% 
  group_by(atrocity) %>% 
  ggplot(aes(start_year, length)) +
  geom_point() +
  geom_smooth(method = loess, se = FALSE) +
  geom_label_repel(aes(label = atrocity), size = 2, color = "black", nudge_y = 25, data = to_label) +
  labs(x = "Starting year",
       y = "Length in years",
       title = "Atrocity length in years by starting year") +
  facet_wrap(~type)

- Most of the major outliers in terms of length are not wars. - The length of non-war atrocities appears to have decreased over time after around 500, with a bump around 1500. - The longest atrocity was the Mideast Slave Trade, followed by the gladiatorial games. - Civil war lengths haven't changed much across 2000 years. Neither have international war lengths, besides the bump around the crusades. - The decline in length of atrocities due to instutional oppression is the most dramatic.

atrocities_tidy %>% 
  group_by(atrocity, start_year, end_year, length, type, century) %>% 
  summarise(total = sum(deaths)) %>% 
  group_by(century) %>% 
  filter(near(total, max(total, na.rm = TRUE))) %>% 
  ggplot(aes(century, total)) +
  geom_point(aes(color = type, size = length)) +
  geom_text_repel(aes(label = atrocity), size = 2) +
  scale_y_log10() +
  scale_color_discrete(name = "Atrocity type") +
  labs(title = "Deaths from the worst atrocity by century",
       y = "Total deaths",
       x = "Century")

Population data

q3.1 There's a limit to the conclusions we can draw without population data. You can find population data on the population sheet. Read in the data from the population sheet and store in a tibble. You only need the columns for year and population.

population <- 
  gs_read(sheet, ws = 3 , range = cell_cols("F:K"), 
                  skip = 11, col_names = FALSE, verbose = FALSE) %>% 
  select(year = X1,
         population = X6)

population %>% 
  ggplot(aes(year, population)) +
  geom_line()

q3.3 Now, join the population data with atrocities_tidy. Store the result in a new tibble. Think about how you want to deal with the differences in population across the time span of an atrocity.

atrocities_pop <-
  atrocities_tidy %>% 
  left_join(population, by = c("start_year" = "year")) %>% 
  rename(population_start = population) %>% 
  left_join(population, by = c("end_year" = "year")) %>% 
  rename(population_end = population)

q3.4 To recreate the plot, we're going to need a variable total that represents the total number of deaths due to an atrocity, a variable percent_killed that represents the percent of the world's total population killed in the atrocity, a variable mean_year that represents the median year of the atrocity, and a variable percent_killed that represents the percentage of the population killed by the atrocity. Create these variables.

find_continent <- function(atrocity) {
  if (atrocity %in% europe) {return("europe")}
  if (atrocity %in% africa) {return("africa")}
  if (atrocity %in% asia) {return("asia")}
  if (atrocity %in% americas) {return("americas")}
  if (atrocity %in% multiple) {return("multiple")}
  else {return("NA")}
}

by_atrocity <-
  atrocities_pop %>% 
  group_by(atrocity, start_year, end_year, length, 
           population_start, population_end, type) %>%
  summarise(total_deaths = sum(deaths, na.rm = TRUE)) %>% 
  mutate(mean_year = mean(c(start_year, end_year), na.rm = TRUE),
         mean_population = mean(c(population_start, population_end), na.rm = TRUE),
         percent_killed = (total_deaths / mean_population)*100,
         continent = map_chr(atrocity, find_continent))

Deaths from atrocities over time

q3.4 Recreate the New York Times plot. This doesn't have to be exact--the original author didn't use R to create the visualization.

by_atrocity %>% 
  mutate(type = factor(type, level = c("international war", 
                                       "institutional oppression", 
                                       "failed states", 
                                       "despots", 
                                       "colonial war", 
                                       "civil war")),
         continent_new = factor(continent, level = 
                                  c("europe", "asia", "americas", 
                                    "africa", "multiple"))) %>% 
  ggplot(aes(mean_year, (end_year %% 250)*3, 
             size = total_deaths, color = continent)) +
  geom_point(alpha = .6) +
  geom_text(aes(label = atrocity), size = 3, color = "black",
            hjust = "left", vjust = "bottom", nudge_x = 30) +
  geom_segment(aes(x = start_year, xend = end_year, 
                   y = (end_year %% 250)*3, yend = (end_year %% 250)*3), 
               size = .1, inherit.aes = FALSE) +
  geom_vline(xintercept = 0, linetype = "dotted") +
  geom_vline(xintercept = 1000, linetype = "dotted") +
  facet_grid(type~., scales = "free_y") +
  scale_x_continuous(breaks = seq(-400, 2000, 200), position = "top") +
  scale_y_continuous(trans = "reverse", labels = NULL) +
  scale_color_manual(values = fill_continent_colors) +
  scale_radius(range = c(4, 24)) +
  coord_cartesian(ylim = c(-100, 800)) +
  theme_minimal() +
  guides(size = FALSE) +
  theme_minimal() +
  theme(
   # aspect.ratio = .15,
    panel.grid.major.x = element_line(linetype = "dotted"),
    panel.grid.minor.x = element_line(linetype = "dotted"),
    panel.grid.major.y = element_blank(),
    panel.grid.minor.y = element_blank(),
    strip.text = element_blank(),
    axis.text.x = element_text(size = 7)
  ) +
  labs(x = NULL,
       y = NULL)

Large-scale atrocities

Read in data from googlesheets

Atrocities

Population data

Deaths from atrocities over time