Sara Altman
library(tidyverse)
library(googlesheets)
library(ggrepel)
library(stringr)
library(ggrepel)
#params
data_url <- "https://docs.google.com/spreadsheets/d/1N9mMdh2X47AfmDJ4XEw8CDe5CL5O6mfI3KPnzERaplk/edit#gid=1335230579"
pop_url <- "https://docs.google.com/spreadsheets/d/1N9mMdh2X47AfmDJ4XEw8CDe5CL5O6mfI3KPnzERaplk/edit#gid=798338106"
column_names <- as.character(seq(from = -500, to = 1900, by = 100))
international_wars <- c("Second Persian War", "Alexander the Great", "First Punic War", "Second Punic War", "Third Mithridatic War", "Justinian", "Goguryeo-Sui Wars", "Crusades", "Chinggis Khan", "Hulagu’s Invasion", "Bahmani-Vijayanagara War", "Timur", "Russo-Tatar War", "Burma-Siam Wars", "Great Turkish War", "War of the Spanish Succession", "Great Northern War", "War of the Austrian Succession", "Sino-Dzungar War", "Seven Years War", "Napoleonic Wars", "Crimean War", "War of the Triple Alliance", "Franco-Prussian War", "Russo-Turkish War", "First World War", "Greco-Turkish War", "Second World War", "Korean War", "Iran-Iraq War")
despots <- c("Qin Shi Huang Di", "Aurangzeb", "Peter the Great", "Shaka", "Joseph Stalin", "Mao Zedong", "North Korea", "Idi Amin", "Democratic Kampuchea", "Saddam Hussein")
failed_states <- c("Age of Warring States", "Xin Dynasty", "The Three Kingdoms of China", "Fall of the Western Roman Empire", "Mayan Collapse", "Fall of the Yuan Dynasty", "Hundred Years War", "The Time of Troubles", "Thirty Years' War", "Fall of the Ming Dynasty", "Mexican Revolution", "Russian Civil War", "Chinese Civil War", "Soviet-Afghan War", "Somalian Chaos", "Second Congo War")
oppression <- c("Gladiatorial Games", "Mideast Slave Trade", "Atlantic Slave Trade", "Aztec Human Sacrifice", "Conquest of the Americas", "Congo Free State", "Famines in British India", "Expulsion of Germans from Eastern Europe", "Postwar Vietnam", "Sanctions against Iraq")
colonial_wars <- c("Gallic War", "Roman-Jewish Wars", "Chinese Conquest of Vietnam", "Cromwell's Invasion of Ireland", "Haitian Slave Revolt", "Mexican War of Independence", "French Conquest of Algeria", "Mahdi Revolt", "Cuban Revolution", "Italo-Ethiopian War", "French Indochina War", "Algerian War of Independence", "Bengali Genocide")
civil_wars <- c("Roman Slave Wars", "War of the Allies", "An Lushan Rebellion", "Fang La Rebellion", "Albigensian Crusade", "French Wars of Religion", "Taiping Rebellion", "American Civil War", "Panthay Rebellion", "Hui Rebellion", "Spanish Civil War", "Partition of India", "Indonesian Purge", "Vietnam War", "Biafran War", "War in the Sudan", "Ugandan Bush War", "Mengistu Haile", "Mozambican Civil War", "Angolan Civil War", "Rwandan Genocide")
#answer params--don't include in questions
cols_deaths <- "R:AP"
cols_atrocities <- "A:N"
europe <- c("Second Persian War", "First Punic War", "Second Punic War", "Third Mithridatic War", "Justinian", "Russo-Tatar War","Great Turkish War", "War of the Spanish Succession", "Great Northern War", "War of the Austrian Succession", "Seven Years War", "Napoleonic Wars", "Crimean War", "Franco-Prussian War", "Russo-Turkish War", "First World War", "The Time of Troubles", "Gladiatorial Games", "Expulsion of Germans from Eastern Europe", "Fall of the Western Roman Empire", "Hundred Years War", "Russian Civil War", "Peter the Great", "Joseph Stalin", "Gallic War", "Cromwell's Invasion of Ireland", "Spanish Civil War", "French Wars of Religion", "Albigensian Crusade", "Roman Slave Wars", "War of the Allies")
asia <- c("Alexander the Great", "Goguryeo-Sui Wars", "Crusades", "Chinggis Khan", "Hulagu’s Invasion", "Bahmani-Vijayanagara War", "Timur", "Burma-Siam Wars", "Sino-Dzungar War", "Greco-Turkish War", "Korean War", "Iran-Iraq War", "Famines in British India", "Postwar Vietnam", "Sanctions against Iraq", "Soviet-Afghan War", "Chinese Civil War", "Fall of the Ming Dynasty", "Fall of the Yuan Dynasty", "The Three Kingdoms of China", "Xin Dynasty", "Age of Warring States", "Qin Shi Huang Di", "Aurangzeb", "Mao Zedong", "North Korea", "Democratic Kampuchea", "Saddam Hussein", "Bengali Genocide", "French Indochina War", "Chinese Conquest of Vietnam", "Roman-Jewish Wars", "An Lushan Rebellion", "Fang La Rebellion", "Taiping Rebellion", "Panthay Rebellion", "Hui Rebellion", "Partition of India", "Indonesian Purge", "Vietnam War")
americas <- c("War of the Triple Alliance", "Conquest of the Americas", "Aztec Human Sacrifice", "Mayan Collapse", "Mexican Revolution", "Haitian Slave Revolt", "Mexican War of Independence", "Cuban Revolution", "American Civil War")
africa <- c("Congo Free State", "Atlantic Slave Trade", "Mideast Slave Trade", "Somalian Chaos", "Second Congo War", "Shaka", "Idi Amin", "French Conquest of Algeria", "Mahdi Revolt", "Italo-Ethiopian War", "Algerian War of Independence", "Rwandan Genocide", "Angolan Civil War", "Mozambican Civil War", "Mengistu Haile", "Ugandan Bush War", "War in the Sudan", "Biafran War")
multiple <- c("Second World War")
fill_continent_colors <- c("#BF9488",
"#C1BAA9",
"#E5CAA2",
"#B3C0C5",
"#C6C5C3")This challenge looks at large-scale atrocities over time. The data set is from The Great Book of Horrible Things by Matthew White. The Google Sheets data was compiled by Holden Karnofsky as part of his analysis for this blog post: http://blog.givewell.org/2015/07/08/has-violence-declined-when-large-scale-atrocities-are-systematically-included/#Sec2, which I recommend reading. Bill Marsh used this same data set to create a visualization for the New York Times (http://www.nytimes.com/imagepages/2011/11/06/opinion/06atrocities_timeline.html). Part of this challenge involves recreating this visualization.
q1.1 Create agooglesheet object from the Google Sheets data set found at data_url.
sheet <- gs_url(x = data_url)## Sheet-identifying info appears to be a browser URL.
## googlesheets will attempt to extract sheet key from the URL.
## Putative key: 1N9mMdh2X47AfmDJ4XEw8CDe5CL5O6mfI3KPnzERaplk
## Sheet successfully identified: "Atrocities analysis"
q1.2 The original data is far from tidy. Read in the names, start dates, and end dates of all the atrocities. These can be found in the third quadrant. We only want the first, second-to-last, and last columns. Call the resulting tibble atrocities.
Hint: look at gs_read.
atrocities <-
sheet %>%
gs_read(range = cell_cols(cols_atrocities), skip = 18, col_names = FALSE) %>%
select(atrocity = 1,
start_year = 13,
end_year = 14)## Accessing worksheet titled 'Main sheet'.
## Parsed with column specification:
## cols(
## X1 = col_character(),
## X2 = col_character(),
## X3 = col_number(),
## X4 = col_character(),
## X5 = col_character(),
## X6 = col_character(),
## X7 = col_character(),
## X8 = col_integer(),
## X9 = col_character(),
## X10 = col_character(),
## X11 = col_integer(),
## X12 = col_character(),
## X13 = col_integer(),
## X14 = col_integer()
## )
q1.3 Now, read in the fourth quadrant up until (and including) column AP. Each column represents a 100-year period of time. The values represent the number of deaths due to the atrocity listed in column A. Set col_names equal to columns and keep verbose equal to TRUE (Don't worry--the column names won't stay numbers for long). Call this tibble deaths.
deaths <- gs_read(sheet, range = cell_cols(cols_deaths),
skip = 18, col_names = column_names)## Accessing worksheet titled 'Main sheet'.
## Parsed with column specification:
## cols(
## .default = col_number(),
## `1600` = col_character()
## )
## See spec(...) for full column specifications.
q1.4 Inspect each tibble and look for anything unusual. Fix any problems you see.
summary(deaths)## -500 -400 -300 -200
## Min. : 0 Min. : 0 Min. : 0 Min. : 0
## 1st Qu.: 0 1st Qu.: 0 1st Qu.: 0 1st Qu.: 0
## Median : 0 Median : 0 Median : 0 Median : 0
## Mean : 7412 Mean : 10882 Mean : 29606 Mean : 10312
## 3rd Qu.: 0 3rd Qu.: 0 3rd Qu.: 0 3rd Qu.: 0
## Max. :441176 Max. :588235 Max. :1000000 Max. :531250
## -100 0 100 200
## Min. : 0 Min. : 0 Min. : 0 Min. : 0
## 1st Qu.: 0 1st Qu.: 0 1st Qu.: 0 1st Qu.: 0
## Median : 0 Median : 0 Median : 0 Median : 0
## Mean : 23688 Mean : 106700 Mean : 11702 Mean : 41098
## 3rd Qu.: 0 3rd Qu.: 0 3rd Qu.: 0 3rd Qu.: 0
## Max. :700000 Max. :10000000 Max. :500000 Max. :3609783
## 300 400 500 600
## Min. : 0 Min. : 0 Min. : 0 Min. : 0
## 1st Qu.: 0 1st Qu.: 0 1st Qu.: 0 1st Qu.: 0
## Median : 0 Median : 0 Median : 0 Median : 0
## Mean : 10738 Mean : 66062 Mean : 8300 Mean : 19299
## 3rd Qu.: 0 3rd Qu.: 0 3rd Qu.: 0 3rd Qu.: 0
## Max. :573770 Max. :6426230 Max. :750000 Max. :1409931
## 700 800 900 1000
## Min. : 0 Min. : 0 Min. : 0 Min. : 0
## 1st Qu.: 0 1st Qu.: 0 1st Qu.: 0 1st Qu.: 0
## Median : 0 Median : 0 Median : 0 Median : 0
## Mean : 145075 Mean : 22575 Mean : 15075 Mean : 15003
## 3rd Qu.: 0 3rd Qu.: 0 3rd Qu.: 0 3rd Qu.: 0
## Max. :13000000 Max. :1424172 Max. :1424172 Max. :1424172
## 1100 1200 1300 1400
## Min. : 0 Min. : 0 Min. : 0 Min. : 0
## 1st Qu.: 0 1st Qu.: 0 1st Qu.: 0 1st Qu.: 0
## Median : 0 Median : 0 Median : 0 Median : 0
## Mean : 49470 Mean : 446252 Mean : 254755 Mean : 99024
## 3rd Qu.: 0 3rd Qu.: 0 3rd Qu.: 0 3rd Qu.: 0
## Max. :2000000 Max. :40000000 Max. :14166667 Max. :2833333
## 1500 1600 1700 1800
## Min. : 0 Length:100 Min. : 0 Min. : 0
## 1st Qu.: 0 Class :character 1st Qu.: 0 1st Qu.: 0
## Median : 0 Mode :character Median : 0 Median : 0
## Mean : 148455 Mean : 263238 Mean : 561150
## 3rd Qu.: 0 3rd Qu.: 0 3rd Qu.: 0
## Max. :4494382 Max. :8939799 Max. :20000000
## 1900
## Min. : 0
## 1st Qu.: 0
## Median : 0
## Mean : 2022960
## 3rd Qu.: 506250
## Max. :66000000
deaths <-
deaths %>%
mutate(`1600` = ifelse(typeof(`1600`) == "character", 0, `1600`))q1.5 Bind atrocities and deaths. Tidy the data and name the resulting tibble atrocities_tidy. Create a variable type that represents the type of an atrocity (you can use the vectors of types given in the parameters section).
Note: You're probably going to end up with a lot of rows where the number of deaths is 0. I filtered them out so that there would be fewer rows and the tibble would be more interpretable, but it's not strictly necessary.
find_type <- function(atrocity) {
if (atrocity %in% colonial_wars) {return("colonial war")}
if (atrocity %in% international_wars) {return("international war")}
if (atrocity %in% oppression) {return("institutional oppression")}
if (atrocity %in% failed_states) {return("failed states")}
if (atrocity %in% civil_wars) {return("civil war")}
if (atrocity %in% despots) {return("despots")}
else {return("NA")}
}
atrocities_tidy <-
atrocities %>%
bind_cols(deaths) %>%
gather(key = "century", value = "deaths", 4:28, convert = TRUE) %>%
filter(!(near(deaths, 0))) %>%
mutate(type = map_chr(atrocity, find_type))q2 Let's look at atrocities_tidy. Construct a couple plots that visualize the information. Look at the outliers. What can you conclude?
atrocities_tidy <-
atrocities_tidy %>%
mutate(length = end_year - start_year)
to_label <-
atrocities_tidy %>%
group_by(type) %>%
distinct(atrocity, start_year, length) %>%
top_n(length, n = 2)
atrocities_tidy %>%
group_by(atrocity) %>%
ggplot(aes(start_year, length)) +
geom_point() +
geom_smooth(method = loess, se = FALSE) +
geom_label_repel(aes(label = atrocity), size = 2, color = "black", nudge_y = 25, data = to_label) +
labs(x = "Starting year",
y = "Length in years",
title = "Atrocity length in years by starting year") +
facet_wrap(~type) - Most of the major outliers in terms of length are not wars. - The length of non-war atrocities appears to have decreased over time after around 500, with a bump around 1500. - The longest atrocity was the Mideast Slave Trade, followed by the gladiatorial games. - Civil war lengths haven't changed much across 2000 years. Neither have international war lengths, besides the bump around the crusades. - The decline in length of atrocities due to instutional oppression is the most dramatic.
atrocities_tidy %>%
group_by(atrocity, start_year, end_year, length, type, century) %>%
summarise(total = sum(deaths)) %>%
group_by(century) %>%
filter(near(total, max(total, na.rm = TRUE))) %>%
ggplot(aes(century, total)) +
geom_point(aes(color = type, size = length)) +
geom_text_repel(aes(label = atrocity), size = 2) +
scale_y_log10() +
scale_color_discrete(name = "Atrocity type") +
labs(title = "Deaths from the worst atrocity by century",
y = "Total deaths",
x = "Century")q3.1 There's a limit to the conclusions we can draw without population data. You can find population data on the population sheet. Read in the data from the population sheet and store in a tibble. You only need the columns for year and population.
population <-
gs_read(sheet, ws = 3 , range = cell_cols("F:K"),
skip = 11, col_names = FALSE, verbose = FALSE) %>%
select(year = X1,
population = X6)## Parsed with column specification:
## cols(
## X1 = col_integer(),
## X2 = col_integer(),
## X3 = col_integer(),
## X4 = col_number(),
## X5 = col_double(),
## X6 = col_double()
## )
q3.2 Make a quick plot of the population data to check that it makes sense.
population %>%
ggplot(aes(year, population)) +
geom_line() q3.3 Now, join the population data with atrocities_tidy. Store the result in a new tibble. Think about how you want to deal with the differences in population across the time span of an atrocity.
atrocities_pop <-
atrocities_tidy %>%
left_join(population, by = c("start_year" = "year")) %>%
rename(population_start = population) %>%
left_join(population, by = c("end_year" = "year")) %>%
rename(population_end = population)q3.4 To recreate the plot, we're going to need a variable total that represents the total number of deaths due to an atrocity, a variable percent_killed that represents the percent of the world's total population killed in the atrocity, a variable mean_year that represents the median year of the atrocity, and a variable percent_killed that represents the percentage of the population killed by the atrocity. Create these variables.
find_continent <- function(atrocity) {
if (atrocity %in% europe) {return("europe")}
if (atrocity %in% africa) {return("africa")}
if (atrocity %in% asia) {return("asia")}
if (atrocity %in% americas) {return("americas")}
if (atrocity %in% multiple) {return("multiple")}
else {return("NA")}
}
by_atrocity <-
atrocities_pop %>%
group_by(atrocity, start_year, end_year, length,
population_start, population_end, type) %>%
summarise(total_deaths = sum(deaths, na.rm = TRUE)) %>%
mutate(mean_year = mean(c(start_year, end_year), na.rm = TRUE),
mean_population = mean(c(population_start, population_end), na.rm = TRUE),
percent_killed = (total_deaths / mean_population)*100,
continent = map_chr(atrocity, find_continent))For this question, recreate (or improve upon) this plot from the New York Times: http://www.nytimes.com/imagepages/2011/11/06/opinion/06atrocities_timeline.html.
q3.4 Recreate the New York Times plot. This doesn't have to be exact--the original author didn't use R to create the visualization.
by_atrocity %>%
mutate(type = factor(type, level = c("international war",
"institutional oppression",
"failed states",
"despots",
"colonial war",
"civil war")),
continent_new = factor(continent, level =
c("europe", "asia", "americas",
"africa", "multiple"))) %>%
ggplot(aes(mean_year, (end_year %% 250)*3,
size = total_deaths, color = continent)) +
geom_point(alpha = .6) +
geom_text(aes(label = atrocity), size = 3, color = "black",
hjust = "left", vjust = "bottom", nudge_x = 30) +
geom_segment(aes(x = start_year, xend = end_year,
y = (end_year %% 250)*3, yend = (end_year %% 250)*3),
size = .1, inherit.aes = FALSE) +
geom_vline(xintercept = 0, linetype = "dotted") +
geom_vline(xintercept = 1000, linetype = "dotted") +
facet_grid(type~., scales = "free_y") +
scale_x_continuous(breaks = seq(-400, 2000, 200), position = "top") +
scale_y_continuous(trans = "reverse", labels = NULL) +
scale_color_manual(values = fill_continent_colors) +
scale_radius(range = c(4, 24)) +
coord_cartesian(ylim = c(-100, 800)) +
theme_minimal() +
guides(size = FALSE) +
theme_minimal() +
theme(
# aspect.ratio = .15,
panel.grid.major.x = element_line(linetype = "dotted"),
panel.grid.minor.x = element_line(linetype = "dotted"),
panel.grid.major.y = element_blank(),
panel.grid.minor.y = element_blank(),
strip.text = element_blank(),
axis.text.x = element_text(size = 7)
) +
labs(x = NULL,
y = NULL)