This is quick look at how we can get data from http://killedbypolice.net/ and from the US Census website, and combine them to explore. The US Census data is relatively straightforward, but the http://killedbypolice.net/ data is more challenging to obtain and tidy because of the inconsistant use of HTML table formatting.

Code for this document is online at https://gist.github.com/benmarwick/a8888f96728f32e4191f81e7e62d2736

knitr::opts_chunk$set(echo=TRUE, warning=FALSE, message=FALSE)
library(tidyverse)
## Loading tidyverse: ggplot2
## Loading tidyverse: tibble
## Loading tidyverse: tidyr
## Loading tidyverse: readr
## Loading tidyverse: purrr
## Loading tidyverse: dplyr
## Warning: package 'purrr' was built under R version 3.4.2
## Warning: package 'dplyr' was built under R version 3.4.2
## Conflicts with tidy packages ----------------------------------------------
## filter(): dplyr, stats
## lag():    dplyr, stats
library(stringr)

Census data

Here’s an easy way to get the census data (I haven’t done anything further with this): check to get latest data set?

tmp <- rio::import("https://www2.census.gov/programs-surveys/popest/tables/2010-2016/state/totals/nst-est2016-01.xlsx",
                   skip = 3)

check the structure of the data

dplyr::glimpse(tmp)
## Observations: 63
## Variables: 10
## $ X__1             <chr> "United States", "Northeast", "Midwest", "Sou...
## $ Census           <dbl> 308745538, 55317240, 66927001, 114555744, 719...
## $ `Estimates Base` <dbl> 308758105, 55318353, 66929825, 114563005, 719...
## $ `2010`           <dbl> 309348193, 55388056, 66978602, 114863114, 721...
## $ `2011`           <dbl> 311663358, 55632766, 67153331, 116061801, 728...
## $ `2012`           <dbl> 313998379, 55829059, 67332320, 117299171, 735...
## $ `2013`           <dbl> 316204908, 55988771, 67543948, 118424320, 742...
## $ `2014`           <dbl> 318563456, 56116791, 67726368, 119696311, 750...
## $ `2015`           <dbl> 320896618, 56184737, 67838387, 121039206, 758...
## $ `2016`           <dbl> 323127513, 56209510, 67941429, 122319574, 766...

clean up state names to abbreviations

state_abb <- data_frame(state_name = state.name,
                        state_abb = state.abb)
state_abb <- rbind(state_abb,
                   data_frame(
                     state_name = "District of Columbia",
                     state_abb = "DC"))
state_pops <- 
tmp %>% 
  mutate(state_name = gsub("\\.", "", X__1)) %>% 
  left_join(state_abb)

Killed by Police Data

Here’s how we can get all years of the data from http://killedbypolice.net/

library(glue)
library(readr)

url <- "http://killedbypolice.net/"

For all KBP pages… make a vector of URLs for all years available

years <- 2016:2013
all_urls <- c(url, glue('{url}kbp{years}'))
all_pages <- map(all_urls, ~read_table(.x))

x1 <- map(all_pages, ~setNames(.x, "V1"))

filter to keep only columns that have data in them, get rid of some HTML

x2 <- map(x1, ~.x %>%   
            mutate(V99 = gsub('<center>|<CENTER>|<font size=2>|target=new', "", V1)) %>% 
            filter(grepl("<TR><TD>\\(", V99)) %>% 
            select(-V1))

x3 <- map(x2, ~.x %>% 
            # split variables into cols
            separate(V99, 
                     into = glue('V{1:9}'),
                     sep = "<td>|<TD>",
                     remove = FALSE) %>% # keep it so we can check on things later
            separate(V2,
                     into = c('number', 'date', 
                              'number2', 'date2'),
                     sep = "\\)|<br>|\\)|<BR>"))

format some col classes, numeric, date, get rid of white space

x4 <- map(x3, ~.x %>% 
            mutate_all(funs(trimws)) %>% 
            mutate(number = as.numeric(gsub("\\(", "", number))) %>% 
            mutate(date_format =  as.Date(date, "%B %d, %Y")))

remove URLs in name col, split mulitple desceased into multiple cols

x5 <- map(x4, ~.x %>% 
            mutate(deceased = gsub('http\\S+\\s*', "", V5)) %>%
            mutate(deceased = gsub('<a href=">|</a>', "", deceased)) %>% 
            # make separate row where there are two deceased in one row
            separate_rows(deceased, sep = "<br>|<BR>") %>% 
            # get the name and age in their own cols
            separate(deceased, 
                     into = c('deceased_name', 'deceased_age'),
                     sep = ",(?=[^,]+$)",  # split on the last comma
                     remove = FALSE)  %>% 
            mutate(deceased_age = as.numeric(trimws(deceased_age))) %>% 
            separate(V4, 
                     into = c('gender', 'race', 'other_gender_race', 'other1_gender_race', 'other2_gender_race'),
                     sep = "/") %>% 
            mutate(gender = substr(gender, 1, 1)) %>% 
            separate(V6, 
                     into = c('method', 'method1', 'method2', 'method3', 'method4'),
                     sep = "<br>|<BR>") %>% 
            mutate(method = trimws(method)))

convert list of data frames to one big data frame

x6 <- bind_rows(x5)

We have 5307 rows.

What does it look like? Here’s a snippet of just a few columns:

x6 %>% 
  mutate(state = V3) %>% 
  select(number, date, state, deceased_name, deceased_age) %>% 
  head(n = 10) %>% 
  knitr::kable()
number date state deceased_name deceased_age
1015 November 6, 2017 VA NA
1014 November 5, 2017 IL Eddie Patterson 49
1013 November 5, 2017 NM Marlysa Sanchez 31
1012 November 5, 2017 MS NA
1011 November 4, 2017 IN NA
1010 November 4, 2017 AZ Humberto Edwards 27
1009 November 4, 2017 CA Augustus Crawford 20
1008 November 4, 2017 CO Michael Marin 35
1007 November 4, 2017 AR Nyung Kyee 56
1006 November 4, 2017 MT Frank Joey Half Jr. 30

Some quick plots

ggplot(x6,
       aes(deceased_age)) +
  geom_histogram() +
  theme_bw()

age and gender

age_and_gender <- 
x6 %>% 
  select(deceased_age, gender) %>% 
  filter(!is.na(deceased_age)) %>% 
  mutate(age_category=cut(deceased_age, 
                          breaks = seq(0,max(deceased_age), 5))) %>% 
  filter(!is.na(age_category)) %>% 
  group_by(age_category) %>% 
  summarise(prop_female = sum(gender == 'F') / n())

ggplot(age_and_gender, 
       aes(age_category, 
           prop_female)) +
  geom_point(size = 3) + 
  ylab("Proportion of deaths that are Female") +
  theme_bw() +
  theme(axis.text.x = element_text(angle = 90, hjust = 1, vjust = 0.5)) +
  ggtitle("Female deaths by police, by age group",
          subtitle = glue("Data from {url}"))

race

race <- 
x6 %>% 
select(deceased_age, race) %>% 
  filter(!is.na(race)) %>% 
  filter(!is.na(deceased_age)) %>% 
  mutate(race = str_sub(race, 1, 1)) %>% 
  filter(race %in% c("I", "B", "W", "L", "A", "P", "O"))

# get n to use in tick labels
race_n <- 
  race %>% 
  group_by(race) %>% 
  tally() %>% 
  mutate(race_label = glue('{race}\n(n = {n})')) %>% 
  right_join(race)

library(ggbeeswarm)
ggplot(race_n, 
       aes(reorder(race_label, 
                   -deceased_age),
           deceased_age)) +
  geom_quasirandom(alpha = 0.1) +
  xlab("Race (as classified by killedbypolice.net)") +
  theme_bw()  +
  ggtitle("Deaths by police, by age and race of deceased",
          subtitle = glue("Data from {url}"))

method of death

method_of_death <- 
x6 %>% 
  mutate(method_long = case_when(
    method == "G" ~ "Gun",
    method == "T" ~ "Taser",
    method == "R" ~ "Restraint/\nPhysical Force",
    method == "C" ~ "Chemical",
    method == "V" ~ "Vehicle",
    method == "O" ~ "Other")) 
  
method_of_death_df <- 
  method_of_death %>% 
  group_by(method_long) %>% 
  tally() %>% 
  mutate(prop = n / sum(n) ) %>% 
  filter(!is.na(method_long))


ggplot(method_of_death_df, 
       aes(reorder(method_long, -prop),
           prop)) +
  geom_col() +
  xlab("") +
  ylab("Proportion of all deaths by police") +
  theme_bw() +
  theme(axis.text.x = element_text(angle = 90, hjust = 1, vjust = 0.5))

Let’s convert n to the proportion of people in each state age distribution by state https://hafen.github.io/geofacet/

library("geofacet")

ggplot(x6, 
       aes(deceased_age)) +
  geom_histogram() +
  theme_bw() +
  facet_geo(~ V3)

hexbin state map based on https://rud.is/b/2015/05/15/u-s-drought-monitoring-with-hexbin-state-maps-in-r/

x7 <- 
  x6 %>% 
  group_by(V3) %>% 
  tally() %>% 
  left_join(state_pops, by = c("V3" = "state_abb")) %>% 
  mutate(prop =  n / Census * 100000) # per 100k people

library(rgdal)
library(rgeos)

# get map from 
download.file("https://gist.githubusercontent.com/hrbrmstr/51f961198f65509ad863/raw/219173f69979f663aa9192fbe3e115ebd357ca9f/us_states_hexgrid.geojson", "us_states_hexgrid.geojson")
us <- readOGR("us_states_hexgrid.geojson", "OGRGeoJSON")
## OGR data source with driver: GeoJSON 
## Source: "us_states_hexgrid.geojson", layer: "OGRGeoJSON"
## with 51 features
## It has 6 fields
centers <- cbind.data.frame(data.frame(gCentroid(us, byid=TRUE), id=us@data$iso3166_2))
us_map <- fortify(us, region="iso3166_2")

gg <- ggplot()

gg <- gg + geom_map(
  data = us_map,
  map = us_map,
  aes(x = long,
      y = lat,
      map_id = id),
  color = "white",
  size = 0.5
)

gg <- gg + geom_map(data = x7,
                    map = us_map,
                    aes(fill = prop,
                        map_id = V3))

gg <- gg + geom_map(
  data = x7,
  map = us_map,
  aes(map_id = V3),
  fill = "#ffffff",
  alpha = 0,
  color = "white",
  show.legend = FALSE
) 

gg <- gg + geom_text(data=centers, aes(label=id, x=x, y=y), color="white", size=4)

gg <- gg + coord_map() +
  scale_fill_viridis_c() +
  theme_bw()  +
  xlab("") +
  ylab("")

gg <- gg + theme(panel.border=element_blank())
gg <- gg + theme(panel.spacing=unit(3, "lines"))
gg <- gg + theme(panel.grid=element_blank())
gg <- gg + theme(axis.ticks=element_blank())
gg <- gg + theme(axis.text=element_blank())
gg <- gg + theme(strip.background=element_blank())
gg <- gg + theme(strip.text=element_text(face="bold", hjust=0, size=14))
gg <- gg + ggtitle(glue('Killed by police, per 100k people, {min(years)}-2017'),
                   subtitle = "Data from http://killedbypolice.net/")

gg 

This seems right, cf. https://mappingpoliceviolence.org/states/

interactive map of raw counts by state over time

state_pops_long <-  
  state_pops %>% 
  select(state_abb, one_of(as.character(years))) %>% 
  gather(year, population, -state_abb) %>% 
  filter(!is.na(state_abb)) 


x8 <- 
  x6 %>%
  mutate(month = format(date_format, "%m"), 
         year = format(date_format, "%Y"),
         month_year = glue('{month}/{year}'),
         state_abb = V3) %>%
  group_by(month, year, month_year, state_abb) %>%
  tally() %>% 
  arrange(year, month) %>% 
  left_join(state_pops_long) 

get an ordered factor to make the x-axis plot in order

x8$month_year <- factor(x8$month_year, levels = unique(x8$month_year))

gg1 <- 
  ggplot(x8,
         aes(month_year,
             n)) +
  geom_line(aes(group = state_abb,
                colour = state_abb)) +
  scale_color_viridis_d() +
  # scale_y_log10() +
  theme_bw()  +
  theme(axis.text.x = element_text(angle = 90, vjust = 0.5, hjust=1))  +
  ggtitle(glue('Raw counts of people killed by police, {min(years)}-2017'),
          subtitle = "Data from http://killedbypolice.net/")

gg1

# get interactive plot
plotly::ggplotly(gg1)

No need to read below here

This is just drafts of the code that I used above

# No need to look here, it's just what I did to get the 2017 data only
# for one webpage, the KBP 2017 data
# get HTML page as raw HTML
tmp1 <- readr::read_table(url)
# rename the column for ease of use
names(tmp1) <- "V1"


# filter to keep only columns that have data in them, get rid of some HTML 
tmp2 <- tmp1 %>% 
  mutate(V99 = gsub('<center>|<CENTER>|<font size=2>|target=new', "", V1)) %>% 
  filter(grepl("<TR><TD>\\(", V99)) %>% 
  select(-V1)


tmp3 <- tmp2 %>% 
  # split variables into cols
  separate(V99, 
           into = glue('V{1:9}'),
           sep = "<td>|<TD>",
           remove = FALSE) %>% # keep it so we can check on things later
  separate(V2,
           into = c('number', 'date', 
                    'number2', 'date2'),
           sep = "\\)|<br>|\\)|<BR>")

# format some col classes, numeric, date, get rid of white space
tmp4 <- 
tmp3 %>% 
  mutate_all(funs(trimws)) %>% 
  mutate(number = as.numeric(gsub("\\(", "", number))) %>% 
  mutate(date_format =  as.Date(date, "%B %d, %Y"))

# remove URLs in name col, split mulitple desceased into multiple cols
tmp5 <- 
  tmp4 %>% 
  mutate(deceased = gsub('http\\S+\\s*', "", V5)) %>%
  mutate(deceased = gsub('<a href=">|</a>', "", deceased)) %>% 
  # make separate row where there are two deceased in one row
  separate_rows(deceased, sep = "<br>") %>% 
  # get the name and age in their own cols
  separate(deceased, 
           into = c('deceased_name', 'deceased_age'),
           sep = ",(?=[^,]+$)",  # split on the last comma
           remove = FALSE)  %>% 
  mutate(deceased_age = as.numeric(trimws(deceased_age))) %>% 
  separate(V4, 
           into = c('gender', 'race'),
           sep = "/")


# How many are we missing?
max_number_on_website <- max(tmp5$number)
number_that_we_have <- nrow(tmp5)
difference <- max_number_on_website - number_that_we_have
difference
## [1] -2
# is this from your group? https://rpubs.com/johnbradford/policeShootDataCreation