This is quick look at how we can get data from http://killedbypolice.net/ and from the US Census website, and combine them to explore. The US Census data is relatively straightforward, but the http://killedbypolice.net/ data is more challenging to obtain and tidy because of the inconsistant use of HTML table formatting.
Code for this document is online at https://gist.github.com/benmarwick/a8888f96728f32e4191f81e7e62d2736
knitr::opts_chunk$set(echo=TRUE, warning=FALSE, message=FALSE)
library(tidyverse)
## Loading tidyverse: ggplot2
## Loading tidyverse: tibble
## Loading tidyverse: tidyr
## Loading tidyverse: readr
## Loading tidyverse: purrr
## Loading tidyverse: dplyr
## Warning: package 'purrr' was built under R version 3.4.2
## Warning: package 'dplyr' was built under R version 3.4.2
## Conflicts with tidy packages ----------------------------------------------
## filter(): dplyr, stats
## lag(): dplyr, stats
library(stringr)
Here’s an easy way to get the census data (I haven’t done anything further with this): check to get latest data set?
tmp <- rio::import("https://www2.census.gov/programs-surveys/popest/tables/2010-2016/state/totals/nst-est2016-01.xlsx",
skip = 3)
check the structure of the data
dplyr::glimpse(tmp)
## Observations: 63
## Variables: 10
## $ X__1 <chr> "United States", "Northeast", "Midwest", "Sou...
## $ Census <dbl> 308745538, 55317240, 66927001, 114555744, 719...
## $ `Estimates Base` <dbl> 308758105, 55318353, 66929825, 114563005, 719...
## $ `2010` <dbl> 309348193, 55388056, 66978602, 114863114, 721...
## $ `2011` <dbl> 311663358, 55632766, 67153331, 116061801, 728...
## $ `2012` <dbl> 313998379, 55829059, 67332320, 117299171, 735...
## $ `2013` <dbl> 316204908, 55988771, 67543948, 118424320, 742...
## $ `2014` <dbl> 318563456, 56116791, 67726368, 119696311, 750...
## $ `2015` <dbl> 320896618, 56184737, 67838387, 121039206, 758...
## $ `2016` <dbl> 323127513, 56209510, 67941429, 122319574, 766...
clean up state names to abbreviations
state_abb <- data_frame(state_name = state.name,
state_abb = state.abb)
state_abb <- rbind(state_abb,
data_frame(
state_name = "District of Columbia",
state_abb = "DC"))
state_pops <-
tmp %>%
mutate(state_name = gsub("\\.", "", X__1)) %>%
left_join(state_abb)
Here’s how we can get all years of the data from http://killedbypolice.net/
library(glue)
library(readr)
url <- "http://killedbypolice.net/"
For all KBP pages… make a vector of URLs for all years available
years <- 2016:2013
all_urls <- c(url, glue('{url}kbp{years}'))
all_pages <- map(all_urls, ~read_table(.x))
x1 <- map(all_pages, ~setNames(.x, "V1"))
filter to keep only columns that have data in them, get rid of some HTML
x2 <- map(x1, ~.x %>%
mutate(V99 = gsub('<center>|<CENTER>|<font size=2>|target=new', "", V1)) %>%
filter(grepl("<TR><TD>\\(", V99)) %>%
select(-V1))
x3 <- map(x2, ~.x %>%
# split variables into cols
separate(V99,
into = glue('V{1:9}'),
sep = "<td>|<TD>",
remove = FALSE) %>% # keep it so we can check on things later
separate(V2,
into = c('number', 'date',
'number2', 'date2'),
sep = "\\)|<br>|\\)|<BR>"))
format some col classes, numeric, date, get rid of white space
x4 <- map(x3, ~.x %>%
mutate_all(funs(trimws)) %>%
mutate(number = as.numeric(gsub("\\(", "", number))) %>%
mutate(date_format = as.Date(date, "%B %d, %Y")))
remove URLs in name col, split mulitple desceased into multiple cols
x5 <- map(x4, ~.x %>%
mutate(deceased = gsub('http\\S+\\s*', "", V5)) %>%
mutate(deceased = gsub('<a href=">|</a>', "", deceased)) %>%
# make separate row where there are two deceased in one row
separate_rows(deceased, sep = "<br>|<BR>") %>%
# get the name and age in their own cols
separate(deceased,
into = c('deceased_name', 'deceased_age'),
sep = ",(?=[^,]+$)", # split on the last comma
remove = FALSE) %>%
mutate(deceased_age = as.numeric(trimws(deceased_age))) %>%
separate(V4,
into = c('gender', 'race', 'other_gender_race', 'other1_gender_race', 'other2_gender_race'),
sep = "/") %>%
mutate(gender = substr(gender, 1, 1)) %>%
separate(V6,
into = c('method', 'method1', 'method2', 'method3', 'method4'),
sep = "<br>|<BR>") %>%
mutate(method = trimws(method)))
convert list of data frames to one big data frame
x6 <- bind_rows(x5)
We have 5307 rows.
What does it look like? Here’s a snippet of just a few columns:
x6 %>%
mutate(state = V3) %>%
select(number, date, state, deceased_name, deceased_age) %>%
head(n = 10) %>%
knitr::kable()
| number | date | state | deceased_name | deceased_age |
|---|---|---|---|---|
| 1015 | November 6, 2017 | VA | NA | |
| 1014 | November 5, 2017 | IL | Eddie Patterson | 49 |
| 1013 | November 5, 2017 | NM | Marlysa Sanchez | 31 |
| 1012 | November 5, 2017 | MS | NA | |
| 1011 | November 4, 2017 | IN | NA | |
| 1010 | November 4, 2017 | AZ | Humberto Edwards | 27 |
| 1009 | November 4, 2017 | CA | Augustus Crawford | 20 |
| 1008 | November 4, 2017 | CO | Michael Marin | 35 |
| 1007 | November 4, 2017 | AR | Nyung Kyee | 56 |
| 1006 | November 4, 2017 | MT | Frank Joey Half Jr. | 30 |
ggplot(x6,
aes(deceased_age)) +
geom_histogram() +
theme_bw()
age and gender
age_and_gender <-
x6 %>%
select(deceased_age, gender) %>%
filter(!is.na(deceased_age)) %>%
mutate(age_category=cut(deceased_age,
breaks = seq(0,max(deceased_age), 5))) %>%
filter(!is.na(age_category)) %>%
group_by(age_category) %>%
summarise(prop_female = sum(gender == 'F') / n())
ggplot(age_and_gender,
aes(age_category,
prop_female)) +
geom_point(size = 3) +
ylab("Proportion of deaths that are Female") +
theme_bw() +
theme(axis.text.x = element_text(angle = 90, hjust = 1, vjust = 0.5)) +
ggtitle("Female deaths by police, by age group",
subtitle = glue("Data from {url}"))
race
race <-
x6 %>%
select(deceased_age, race) %>%
filter(!is.na(race)) %>%
filter(!is.na(deceased_age)) %>%
mutate(race = str_sub(race, 1, 1)) %>%
filter(race %in% c("I", "B", "W", "L", "A", "P", "O"))
# get n to use in tick labels
race_n <-
race %>%
group_by(race) %>%
tally() %>%
mutate(race_label = glue('{race}\n(n = {n})')) %>%
right_join(race)
library(ggbeeswarm)
ggplot(race_n,
aes(reorder(race_label,
-deceased_age),
deceased_age)) +
geom_quasirandom(alpha = 0.1) +
xlab("Race (as classified by killedbypolice.net)") +
theme_bw() +
ggtitle("Deaths by police, by age and race of deceased",
subtitle = glue("Data from {url}"))
method of death
method_of_death <-
x6 %>%
mutate(method_long = case_when(
method == "G" ~ "Gun",
method == "T" ~ "Taser",
method == "R" ~ "Restraint/\nPhysical Force",
method == "C" ~ "Chemical",
method == "V" ~ "Vehicle",
method == "O" ~ "Other"))
method_of_death_df <-
method_of_death %>%
group_by(method_long) %>%
tally() %>%
mutate(prop = n / sum(n) ) %>%
filter(!is.na(method_long))
ggplot(method_of_death_df,
aes(reorder(method_long, -prop),
prop)) +
geom_col() +
xlab("") +
ylab("Proportion of all deaths by police") +
theme_bw() +
theme(axis.text.x = element_text(angle = 90, hjust = 1, vjust = 0.5))
Let’s convert n to the proportion of people in each state age distribution by state https://hafen.github.io/geofacet/
library("geofacet")
ggplot(x6,
aes(deceased_age)) +
geom_histogram() +
theme_bw() +
facet_geo(~ V3)
hexbin state map based on https://rud.is/b/2015/05/15/u-s-drought-monitoring-with-hexbin-state-maps-in-r/
x7 <-
x6 %>%
group_by(V3) %>%
tally() %>%
left_join(state_pops, by = c("V3" = "state_abb")) %>%
mutate(prop = n / Census * 100000) # per 100k people
library(rgdal)
library(rgeos)
# get map from
download.file("https://gist.githubusercontent.com/hrbrmstr/51f961198f65509ad863/raw/219173f69979f663aa9192fbe3e115ebd357ca9f/us_states_hexgrid.geojson", "us_states_hexgrid.geojson")
us <- readOGR("us_states_hexgrid.geojson", "OGRGeoJSON")
## OGR data source with driver: GeoJSON
## Source: "us_states_hexgrid.geojson", layer: "OGRGeoJSON"
## with 51 features
## It has 6 fields
centers <- cbind.data.frame(data.frame(gCentroid(us, byid=TRUE), id=us@data$iso3166_2))
us_map <- fortify(us, region="iso3166_2")
gg <- ggplot()
gg <- gg + geom_map(
data = us_map,
map = us_map,
aes(x = long,
y = lat,
map_id = id),
color = "white",
size = 0.5
)
gg <- gg + geom_map(data = x7,
map = us_map,
aes(fill = prop,
map_id = V3))
gg <- gg + geom_map(
data = x7,
map = us_map,
aes(map_id = V3),
fill = "#ffffff",
alpha = 0,
color = "white",
show.legend = FALSE
)
gg <- gg + geom_text(data=centers, aes(label=id, x=x, y=y), color="white", size=4)
gg <- gg + coord_map() +
scale_fill_viridis_c() +
theme_bw() +
xlab("") +
ylab("")
gg <- gg + theme(panel.border=element_blank())
gg <- gg + theme(panel.spacing=unit(3, "lines"))
gg <- gg + theme(panel.grid=element_blank())
gg <- gg + theme(axis.ticks=element_blank())
gg <- gg + theme(axis.text=element_blank())
gg <- gg + theme(strip.background=element_blank())
gg <- gg + theme(strip.text=element_text(face="bold", hjust=0, size=14))
gg <- gg + ggtitle(glue('Killed by police, per 100k people, {min(years)}-2017'),
subtitle = "Data from http://killedbypolice.net/")
gg
This seems right, cf. https://mappingpoliceviolence.org/states/
interactive map of raw counts by state over time
state_pops_long <-
state_pops %>%
select(state_abb, one_of(as.character(years))) %>%
gather(year, population, -state_abb) %>%
filter(!is.na(state_abb))
x8 <-
x6 %>%
mutate(month = format(date_format, "%m"),
year = format(date_format, "%Y"),
month_year = glue('{month}/{year}'),
state_abb = V3) %>%
group_by(month, year, month_year, state_abb) %>%
tally() %>%
arrange(year, month) %>%
left_join(state_pops_long)
get an ordered factor to make the x-axis plot in order
x8$month_year <- factor(x8$month_year, levels = unique(x8$month_year))
gg1 <-
ggplot(x8,
aes(month_year,
n)) +
geom_line(aes(group = state_abb,
colour = state_abb)) +
scale_color_viridis_d() +
# scale_y_log10() +
theme_bw() +
theme(axis.text.x = element_text(angle = 90, vjust = 0.5, hjust=1)) +
ggtitle(glue('Raw counts of people killed by police, {min(years)}-2017'),
subtitle = "Data from http://killedbypolice.net/")
gg1
# get interactive plot
plotly::ggplotly(gg1)
This is just drafts of the code that I used above
# No need to look here, it's just what I did to get the 2017 data only
# for one webpage, the KBP 2017 data
# get HTML page as raw HTML
tmp1 <- readr::read_table(url)
# rename the column for ease of use
names(tmp1) <- "V1"
# filter to keep only columns that have data in them, get rid of some HTML
tmp2 <- tmp1 %>%
mutate(V99 = gsub('<center>|<CENTER>|<font size=2>|target=new', "", V1)) %>%
filter(grepl("<TR><TD>\\(", V99)) %>%
select(-V1)
tmp3 <- tmp2 %>%
# split variables into cols
separate(V99,
into = glue('V{1:9}'),
sep = "<td>|<TD>",
remove = FALSE) %>% # keep it so we can check on things later
separate(V2,
into = c('number', 'date',
'number2', 'date2'),
sep = "\\)|<br>|\\)|<BR>")
# format some col classes, numeric, date, get rid of white space
tmp4 <-
tmp3 %>%
mutate_all(funs(trimws)) %>%
mutate(number = as.numeric(gsub("\\(", "", number))) %>%
mutate(date_format = as.Date(date, "%B %d, %Y"))
# remove URLs in name col, split mulitple desceased into multiple cols
tmp5 <-
tmp4 %>%
mutate(deceased = gsub('http\\S+\\s*', "", V5)) %>%
mutate(deceased = gsub('<a href=">|</a>', "", deceased)) %>%
# make separate row where there are two deceased in one row
separate_rows(deceased, sep = "<br>") %>%
# get the name and age in their own cols
separate(deceased,
into = c('deceased_name', 'deceased_age'),
sep = ",(?=[^,]+$)", # split on the last comma
remove = FALSE) %>%
mutate(deceased_age = as.numeric(trimws(deceased_age))) %>%
separate(V4,
into = c('gender', 'race'),
sep = "/")
# How many are we missing?
max_number_on_website <- max(tmp5$number)
number_that_we_have <- nrow(tmp5)
difference <- max_number_on_website - number_that_we_have
difference
## [1] -2
# is this from your group? https://rpubs.com/johnbradford/policeShootDataCreation