Scraping data from killedbypolice.net

This is quick look at how we can get data from http://killedbypolice.net/ and from the US Census website, and combine them to explore. The US Census data is relatively straightforward, but the http://killedbypolice.net/ data is more challenging to obtain and tidy because of the inconsistant use of HTML table formatting.

Code for this document is online at https://gist.github.com/benmarwick/a8888f96728f32e4191f81e7e62d2736

knitr::opts_chunk$set(echo=TRUE, warning=FALSE, message=FALSE)
library(tidyverse)

## Loading tidyverse: ggplot2
## Loading tidyverse: tibble
## Loading tidyverse: tidyr
## Loading tidyverse: readr
## Loading tidyverse: purrr
## Loading tidyverse: dplyr

## Warning: package 'purrr' was built under R version 3.4.2

## Warning: package 'dplyr' was built under R version 3.4.2

## Conflicts with tidy packages ----------------------------------------------

## filter(): dplyr, stats
## lag():    dplyr, stats

library(stringr)

Census data

Here’s an easy way to get the census data (I haven’t done anything further with this): check to get latest data set?

tmp <- rio::import("https://www2.census.gov/programs-surveys/popest/tables/2010-2016/state/totals/nst-est2016-01.xlsx",
                   skip = 3)

check the structure of the data

dplyr::glimpse(tmp)

## Observations: 63
## Variables: 10
## $ X__1             <chr> "United States", "Northeast", "Midwest", "Sou...
## $ Census           <dbl> 308745538, 55317240, 66927001, 114555744, 719...
## $ `Estimates Base` <dbl> 308758105, 55318353, 66929825, 114563005, 719...
## $ `2010`           <dbl> 309348193, 55388056, 66978602, 114863114, 721...
## $ `2011`           <dbl> 311663358, 55632766, 67153331, 116061801, 728...
## $ `2012`           <dbl> 313998379, 55829059, 67332320, 117299171, 735...
## $ `2013`           <dbl> 316204908, 55988771, 67543948, 118424320, 742...
## $ `2014`           <dbl> 318563456, 56116791, 67726368, 119696311, 750...
## $ `2015`           <dbl> 320896618, 56184737, 67838387, 121039206, 758...
## $ `2016`           <dbl> 323127513, 56209510, 67941429, 122319574, 766...

clean up state names to abbreviations

state_abb <- data_frame(state_name = state.name,
                        state_abb = state.abb)
state_abb <- rbind(state_abb,
                   data_frame(
                     state_name = "District of Columbia",
                     state_abb = "DC"))
state_pops <- 
tmp %>% 
  mutate(state_name = gsub("\\.", "", X__1)) %>% 
  left_join(state_abb)

Killed by Police Data

Here’s how we can get all years of the data from http://killedbypolice.net/

library(glue)
library(readr)

url <- "http://killedbypolice.net/"

For all KBP pages… make a vector of URLs for all years available

years <- 2016:2013
all_urls <- c(url, glue('{url}kbp{years}'))
all_pages <- map(all_urls, ~read_table(.x))

x1 <- map(all_pages, ~setNames(.x, "V1"))

filter to keep only columns that have data in them, get rid of some HTML

x2 <- map(x1, ~.x %>%   
            mutate(V99 = gsub('<center>|<CENTER>|<font size=2>|target=new', "", V1)) %>% 
            filter(grepl("<TR><TD>\\(", V99)) %>% 
            select(-V1))

x3 <- map(x2, ~.x %>% 
            # split variables into cols
            separate(V99, 
                     into = glue('V{1:9}'),
                     sep = "<td>|<TD>",
                     remove = FALSE) %>% # keep it so we can check on things later
            separate(V2,
                     into = c('number', 'date', 
                              'number2', 'date2'),
                     sep = "\\)|<br>|\\)|<BR>"))

format some col classes, numeric, date, get rid of white space

x4 <- map(x3, ~.x %>% 
            mutate_all(funs(trimws)) %>% 
            mutate(number = as.numeric(gsub("\\(", "", number))) %>% 
            mutate(date_format =  as.Date(date, "%B %d, %Y")))

remove URLs in name col, split mulitple desceased into multiple cols

x5 <- map(x4, ~.x %>% 
            mutate(deceased = gsub('http\\S+\\s*', "", V5)) %>%
            mutate(deceased = gsub('<a href=">|</a>', "", deceased)) %>% 
            # make separate row where there are two deceased in one row
            separate_rows(deceased, sep = "<br>|<BR>") %>% 
            # get the name and age in their own cols
            separate(deceased, 
                     into = c('deceased_name', 'deceased_age'),
                     sep = ",(?=[^,]+$)",  # split on the last comma
                     remove = FALSE)  %>% 
            mutate(deceased_age = as.numeric(trimws(deceased_age))) %>% 
            separate(V4, 
                     into = c('gender', 'race', 'other_gender_race', 'other1_gender_race', 'other2_gender_race'),
                     sep = "/") %>% 
            mutate(gender = substr(gender, 1, 1)) %>% 
            separate(V6, 
                     into = c('method', 'method1', 'method2', 'method3', 'method4'),
                     sep = "<br>|<BR>") %>% 
            mutate(method = trimws(method)))

convert list of data frames to one big data frame

x6 <- bind_rows(x5)

We have 5307 rows.

What does it look like? Here’s a snippet of just a few columns:

x6 %>% 
  mutate(state = V3) %>% 
  select(number, date, state, deceased_name, deceased_age) %>% 
  head(n = 10) %>% 
  knitr::kable()

number	date	state	deceased_name	deceased_age
1015	November 6, 2017	VA		NA
1014	November 5, 2017	IL	Eddie Patterson	49
1013	November 5, 2017	NM	Marlysa Sanchez	31
1012	November 5, 2017	MS		NA
1011	November 4, 2017	IN		NA
1010	November 4, 2017	AZ	Humberto Edwards	27
1009	November 4, 2017	CA	Augustus Crawford	20
1008	November 4, 2017	CO	Michael Marin	35
1007	November 4, 2017	AR	Nyung Kyee	56
1006	November 4, 2017	MT	Frank Joey Half Jr.	30

Some quick plots

ggplot(x6,
       aes(deceased_age)) +
  geom_histogram() +
  theme_bw()

age and gender

age_and_gender <- 
x6 %>% 
  select(deceased_age, gender) %>% 
  filter(!is.na(deceased_age)) %>% 
  mutate(age_category=cut(deceased_age, 
                          breaks = seq(0,max(deceased_age), 5))) %>% 
  filter(!is.na(age_category)) %>% 
  group_by(age_category) %>% 
  summarise(prop_female = sum(gender == 'F') / n())

ggplot(age_and_gender, 
       aes(age_category, 
           prop_female)) +
  geom_point(size = 3) + 
  ylab("Proportion of deaths that are Female") +
  theme_bw() +
  theme(axis.text.x = element_text(angle = 90, hjust = 1, vjust = 0.5)) +
  ggtitle("Female deaths by police, by age group",
          subtitle = glue("Data from {url}"))

race

race <- 
x6 %>% 
select(deceased_age, race) %>% 
  filter(!is.na(race)) %>% 
  filter(!is.na(deceased_age)) %>% 
  mutate(race = str_sub(race, 1, 1)) %>% 
  filter(race %in% c("I", "B", "W", "L", "A", "P", "O"))

# get n to use in tick labels
race_n <- 
  race %>% 
  group_by(race) %>% 
  tally() %>% 
  mutate(race_label = glue('{race}\n(n = {n})')) %>% 
  right_join(race)

library(ggbeeswarm)
ggplot(race_n, 
       aes(reorder(race_label, 
                   -deceased_age),
           deceased_age)) +
  geom_quasirandom(alpha = 0.1) +
  xlab("Race (as classified by killedbypolice.net)") +
  theme_bw()  +
  ggtitle("Deaths by police, by age and race of deceased",
          subtitle = glue("Data from {url}"))

method of death

method_of_death <- 
x6 %>% 
  mutate(method_long = case_when(
    method == "G" ~ "Gun",
    method == "T" ~ "Taser",
    method == "R" ~ "Restraint/\nPhysical Force",
    method == "C" ~ "Chemical",
    method == "V" ~ "Vehicle",
    method == "O" ~ "Other")) 
  
method_of_death_df <- 
  method_of_death %>% 
  group_by(method_long) %>% 
  tally() %>% 
  mutate(prop = n / sum(n) ) %>% 
  filter(!is.na(method_long))


ggplot(method_of_death_df, 
       aes(reorder(method_long, -prop),
           prop)) +
  geom_col() +
  xlab("") +
  ylab("Proportion of all deaths by police") +
  theme_bw() +
  theme(axis.text.x = element_text(angle = 90, hjust = 1, vjust = 0.5))

Let’s convert n to the proportion of people in each state age distribution by state https://hafen.github.io/geofacet/

library("geofacet")

ggplot(x6, 
       aes(deceased_age)) +
  geom_histogram() +
  theme_bw() +
  facet_geo(~ V3)

hexbin state map based on https://rud.is/b/2015/05/15/u-s-drought-monitoring-with-hexbin-state-maps-in-r/

x7 <- 
  x6 %>% 
  group_by(V3) %>% 
  tally() %>% 
  left_join(state_pops, by = c("V3" = "state_abb")) %>% 
  mutate(prop =  n / Census * 100000) # per 100k people

library(rgdal)
library(rgeos)

# get map from 
download.file("https://gist.githubusercontent.com/hrbrmstr/51f961198f65509ad863/raw/219173f69979f663aa9192fbe3e115ebd357ca9f/us_states_hexgrid.geojson", "us_states_hexgrid.geojson")
us <- readOGR("us_states_hexgrid.geojson", "OGRGeoJSON")

## OGR data source with driver: GeoJSON 
## Source: "us_states_hexgrid.geojson", layer: "OGRGeoJSON"
## with 51 features
## It has 6 fields

centers <- cbind.data.frame(data.frame(gCentroid(us, byid=TRUE), id=us@data$iso3166_2))
us_map <- fortify(us, region="iso3166_2")

gg <- ggplot()

gg <- gg + geom_map(
  data = us_map,
  map = us_map,
  aes(x = long,
      y = lat,
      map_id = id),
  color = "white",
  size = 0.5
)

gg <- gg + geom_map(data = x7,
                    map = us_map,
                    aes(fill = prop,
                        map_id = V3))

gg <- gg + geom_map(
  data = x7,
  map = us_map,
  aes(map_id = V3),
  fill = "#ffffff",
  alpha = 0,
  color = "white",
  show.legend = FALSE
) 

gg <- gg + geom_text(data=centers, aes(label=id, x=x, y=y), color="white", size=4)

gg <- gg + coord_map() +
  scale_fill_viridis_c() +
  theme_bw()  +
  xlab("") +
  ylab("")

gg <- gg + theme(panel.border=element_blank())
gg <- gg + theme(panel.spacing=unit(3, "lines"))
gg <- gg + theme(panel.grid=element_blank())
gg <- gg + theme(axis.ticks=element_blank())
gg <- gg + theme(axis.text=element_blank())
gg <- gg + theme(strip.background=element_blank())
gg <- gg + theme(strip.text=element_text(face="bold", hjust=0, size=14))
gg <- gg + ggtitle(glue('Killed by police, per 100k people, {min(years)}-2017'),
                   subtitle = "Data from http://killedbypolice.net/")

gg

This seems right, cf. https://mappingpoliceviolence.org/states/

interactive map of raw counts by state over time

state_pops_long <-  
  state_pops %>% 
  select(state_abb, one_of(as.character(years))) %>% 
  gather(year, population, -state_abb) %>% 
  filter(!is.na(state_abb)) 


x8 <- 
  x6 %>%
  mutate(month = format(date_format, "%m"), 
         year = format(date_format, "%Y"),
         month_year = glue('{month}/{year}'),
         state_abb = V3) %>%
  group_by(month, year, month_year, state_abb) %>%
  tally() %>% 
  arrange(year, month) %>% 
  left_join(state_pops_long)

get an ordered factor to make the x-axis plot in order

x8$month_year <- factor(x8$month_year, levels = unique(x8$month_year))

gg1 <- 
  ggplot(x8,
         aes(month_year,
             n)) +
  geom_line(aes(group = state_abb,
                colour = state_abb)) +
  scale_color_viridis_d() +
  # scale_y_log10() +
  theme_bw()  +
  theme(axis.text.x = element_text(angle = 90, vjust = 0.5, hjust=1))  +
  ggtitle(glue('Raw counts of people killed by police, {min(years)}-2017'),
          subtitle = "Data from http://killedbypolice.net/")

gg1

# get interactive plot
plotly::ggplotly(gg1)

No need to read below here

This is just drafts of the code that I used above

# No need to look here, it's just what I did to get the 2017 data only
# for one webpage, the KBP 2017 data
# get HTML page as raw HTML
tmp1 <- readr::read_table(url)
# rename the column for ease of use
names(tmp1) <- "V1"


# filter to keep only columns that have data in them, get rid of some HTML 
tmp2 <- tmp1 %>% 
  mutate(V99 = gsub('<center>|<CENTER>|<font size=2>|target=new', "", V1)) %>% 
  filter(grepl("<TR><TD>\\(", V99)) %>% 
  select(-V1)


tmp3 <- tmp2 %>% 
  # split variables into cols
  separate(V99, 
           into = glue('V{1:9}'),
           sep = "<td>|<TD>",
           remove = FALSE) %>% # keep it so we can check on things later
  separate(V2,
           into = c('number', 'date', 
                    'number2', 'date2'),
           sep = "\\)|<br>|\\)|<BR>")

# format some col classes, numeric, date, get rid of white space
tmp4 <- 
tmp3 %>% 
  mutate_all(funs(trimws)) %>% 
  mutate(number = as.numeric(gsub("\\(", "", number))) %>% 
  mutate(date_format =  as.Date(date, "%B %d, %Y"))

# remove URLs in name col, split mulitple desceased into multiple cols
tmp5 <- 
  tmp4 %>% 
  mutate(deceased = gsub('http\\S+\\s*', "", V5)) %>%
  mutate(deceased = gsub('<a href=">|</a>', "", deceased)) %>% 
  # make separate row where there are two deceased in one row
  separate_rows(deceased, sep = "<br>") %>% 
  # get the name and age in their own cols
  separate(deceased, 
           into = c('deceased_name', 'deceased_age'),
           sep = ",(?=[^,]+$)",  # split on the last comma
           remove = FALSE)  %>% 
  mutate(deceased_age = as.numeric(trimws(deceased_age))) %>% 
  separate(V4, 
           into = c('gender', 'race'),
           sep = "/")


# How many are we missing?
max_number_on_website <- max(tmp5$number)
number_that_we_have <- nrow(tmp5)
difference <- max_number_on_website - number_that_we_have
difference

## [1] -2

# is this from your group? https://rpubs.com/johnbradford/policeShootDataCreation

Scraping data from killedbypolice.net

Ben Marwick

07 November 2017

Census data

Killed by Police Data

Some quick plots

No need to read below here