knitr::opts_chunk$set(echo = TRUE)

#import libraries to be used
library(tidyverse)
library(dplyr)
library(ggplot2)
library(usmap)
## Warning: package 'usmap' was built under R version 4.0.3
library(maps)
## Warning: package 'maps' was built under R version 4.0.3
library(cowplot)
## Warning: package 'cowplot' was built under R version 4.0.3

1 Import Data

#read in data using tidyverse package to create an initial dataframe called df_initial
setwd("C:/Users/MCuser/Dropbox/Rachel/MontColl/grant proposal")
df_initial<-read_csv("mappingPoliceViolenceFull.csv",col_names=TRUE)
## 
## -- Column specification --------------------------------------------------------
## cols(
##   .default = col_logical(),
##   `Victim's name` = col_character(),
##   `Victim's age` = col_character(),
##   `Victim's gender` = col_character(),
##   `Victim's race` = col_character(),
##   `URL of image of victim` = col_character(),
##   `Date of Incident (month/day/year)` = col_character(),
##   `Street Address of Incident` = col_character(),
##   City = col_character(),
##   State = col_character(),
##   Zipcode = col_character(),
##   County = col_character(),
##   `Agency responsible for death` = col_character(),
##   `ORI Agency Identifier (if available)` = col_character(),
##   `Cause of death` = col_character(),
##   `A brief description of the circumstances surrounding the death` = col_character(),
##   `Official disposition of death (justified or other)` = col_character(),
##   `Criminal Charges?` = col_character(),
##   `Link to news article or photo of official document` = col_character(),
##   `Symptoms of mental illness?` = col_character(),
##   `Unarmed/Did Not Have an Actual Weapon` = col_character()
##   # ... with 9 more columns
## )
## i Use `spec()` for the full column specifications.

2 Clean the data

2.1 Select columns of interest

Select columns from original dataset that I would like to include in my analysis

# select columns of interest from original dataset
df<- df_initial %>%
     select(c(2,3,4,6,8,9,10,11,13,19,20,24))

2.2 Rename Columns

#I have reformatted and renamed columns to make it easier to use the data. 

names(df) <- tolower(names(df))
names(df) <- gsub("'","",names(df))
names(df)<-gsub(" ","_",names(df))
str(df)
## tibble [8,427 x 12] (S3: tbl_df/tbl/data.frame)
##  $ victims_age                          : chr [1:8427] "Unknown" "Unknown" "61" "22" ...
##  $ victims_gender                       : chr [1:8427] "Male" "Male" "Male" "Male" ...
##  $ victims_race                         : chr [1:8427] "Unknown race" "Unknown race" "Unknown race" "Unknown race" ...
##  $ date_of_incident_(month/day/year)    : chr [1:8427] "9/6/2020" "9/5/2020" "9/5/2020" "9/5/2020" ...
##  $ city                                 : chr [1:8427] "Virginia Beach" "Chicago" "San Antonio" "Battle Creek" ...
##  $ state                                : chr [1:8427] "VA" "IL" "TX" "MI" ...
##  $ zipcode                              : chr [1:8427] "23452" "60638" "78201" "49037" ...
##  $ county                               : chr [1:8427] "Virginia Beach City" "Cook" "Bexar" "Calhoun" ...
##  $ ori_agency_identifier_(if_available) : chr [1:8427] "VA1280000" "ILCPD0000" "TXSPD0000" "MI1323700;MI1311300" ...
##  $ symptoms_of_mental_illness?          : chr [1:8427] "No" "No" "No" "Drug or alcohol use" ...
##  $ unarmed/did_not_have_an_actual_weapon: chr [1:8427] "Allegedly Armed" "Allegedly Armed" "Allegedly Armed" "Vehicle" ...
##  $ body_camera_(source:_wapo)           : chr [1:8427] NA NA NA NA ...
df<- df%>%
  rename("age"= "victims_age","gender"= "victims_gender","race"= "victims_race","ori" ="ori_agency_identifier_(if_available)","mental_illness"= "symptoms_of_mental_illness?","armed"= "unarmed/did_not_have_an_actual_weapon", "date"= "date_of_incident_(month/day/year)", "bodycam" ="body_camera_(source:_wapo)")

3 Check data

3.1 Overview of structure of data

str(df)
## tibble [8,427 x 12] (S3: tbl_df/tbl/data.frame)
##  $ age           : chr [1:8427] "Unknown" "Unknown" "61" "22" ...
##  $ gender        : chr [1:8427] "Male" "Male" "Male" "Male" ...
##  $ race          : chr [1:8427] "Unknown race" "Unknown race" "Unknown race" "Unknown race" ...
##  $ date          : chr [1:8427] "9/6/2020" "9/5/2020" "9/5/2020" "9/5/2020" ...
##  $ city          : chr [1:8427] "Virginia Beach" "Chicago" "San Antonio" "Battle Creek" ...
##  $ state         : chr [1:8427] "VA" "IL" "TX" "MI" ...
##  $ zipcode       : chr [1:8427] "23452" "60638" "78201" "49037" ...
##  $ county        : chr [1:8427] "Virginia Beach City" "Cook" "Bexar" "Calhoun" ...
##  $ ori           : chr [1:8427] "VA1280000" "ILCPD0000" "TXSPD0000" "MI1323700;MI1311300" ...
##  $ mental_illness: chr [1:8427] "No" "No" "No" "Drug or alcohol use" ...
##  $ armed         : chr [1:8427] "Allegedly Armed" "Allegedly Armed" "Allegedly Armed" "Vehicle" ...
##  $ bodycam       : chr [1:8427] NA NA NA NA ...
dim(df)
## [1] 8427   12

There are now 8427 rows in the dat and 12 columns which I have selected.

3.2 Check for Missing values

# check for missing values
sum(is.na(df))
## [1] 3072

There are 3072 missing values, however this does not tell us how many rows have missing values. I will also keep data with race “unknown” and age “unknown” for the time being.

Count number of rows with missing values ( “unknown” will not be counted in this step)

#create dummy data frame to hold rows with missing values
dfX<-df[rowSums(is.na(df)) > 0, ]  
dim(dfX)
## [1] 3029   12

There are 3029 rows with missing values A quick over view of the data shows that most of the missing data values are for the bodycam column

#count missing values in bodycam column
sum(is.na(df$bodycam))
## [1] 2994

There are 2994 missing entries in the bodycam column. Rechecking the number of missing values after excluding bodycam column

#count missing values in bodycam column
dfX2<- df%>%
  select(bodycam)

dfX2<-df[rowSums(is.na(dfX2)) > 0, ]  
dim(dfX2)
## [1] 2994   12

There are now only 53 rows with missing data if the boycam column is excluded. Hence for the intially EDA questions will ignore missing values in bodycam data and will remove the 53 rows with missing data.

3.3 Checking columns for “unknown” classifications.

As part of my EDA I wish to check the racial breakdown of victims, hence will count number of “unknown” racial classification. I will not drop this data from my dataset but in my analysis also classify race as unknown.

#count number of cases were race is unknown.
sum(df$race == "Unknown race" )
## [1] 845

There are 845 victims whose race is unknown.

##Format dates to extract year

#slice original date column to obtain year
df$date<-df$date%>%
        str_sub(., start = -4L, end = -1L)

4 EDA Questions

For my EDA I am interested in

1. Mapping the number of deaths over the states.
2. Racial breakdown of victims.
3. Gender breakdown of victims.

4.1 Map deaths by state:

# create dataframe df2: deaths per a year per a state
df2<-df%>% count(date,state)
#I have included the data for 2020 although this year is not complete.
year= c('2020','2019','2018','2017','2016','2015','2014','2013')

df2<-df2 %>% 
  filter( date %in% year)
#change state abbreviation to state name for maps function. State names need to be lower case for the maps library
df2$region<-state.name[match(df2$state,state.abb)]
df2$region<-tolower(df2$region)

#check missing values in df2 dataframe
sum(is.na(df2$region))
## [1] 8
# region names for DC are missing. Add these manually.
df2$region[is.na(df2$region)]="district of columbia"


# prepare maps of USA to add data
us_states<-map_data("state")
#join our dataset to the map data from the library(maps)

us_states_police <- left_join(us_states, df2)
## Joining, by = "region"
p <- ggplot(data = us_states_police,
            aes(x = long, y = lat,
                group = group, fill = n))

p2<-p + geom_polygon(color = "gray90", size = 0.1)+ scale_fill_viridis_c(option = "plasma") + facet_wrap(~date, nrow=3,drop=TRUE)

p2+ theme(strip.background = element_blank(),panel.grid = element_blank(), axis.title = element_blank(),axis.text = element_blank(),axis.ticks.y = element_blank(),axis.ticks.x = element_blank()) +
  xlab("")+
  ylab("")+ 
  labs(fill="Deaths",title=("Total Deaths caused by Police by State 2013-2020"))

Only total deaths have been shown and does not take into consideration the population of the individual states.

4.2 Deaths per 10 million inhabitants per state for 2019

I have obtained population estimates from www.census.gov for 2019. link: https://www2.census.gov/programs-surveys/popest/datasets/2010-2019/state/detail/SCPRC-EST2019-18+POP-RES.csv retrieved: 9 October 2020

I will use this population data to estimate deaths caused by police per 10 million inhabitants in a state and see what effect this has on the heatmap.

#import census data
setwd("C:/Users/MCuser/Dropbox/Rachel/MontColl/grant proposal")
census_initial<-read_csv("census2019.csv",col_names=TRUE)
## 
## -- Column specification --------------------------------------------------------
## cols(
##   SUMLEV = col_double(),
##   REGION = col_character(),
##   DIVISION = col_character(),
##   STATE = col_double(),
##   NAME = col_character(),
##   POPESTIMATE2019 = col_double(),
##   POPEST18PLUS2019 = col_double(),
##   PCNT_POPEST18PLUS = col_double()
## )
#select columns of interest in census data
census_df<- census_initial%>%
          select(NAME,POPESTIMATE2019,POPEST18PLUS2019,PCNT_POPEST18PLUS)
#Clean column names
names(census_df) <- tolower(names(census_df))
#lower name to join with map dataset
census_df$name<-tolower(census_df$name)
census_df<-rename(census_df,region=name)
#merge census data for population with us_states_police mapping dataframe. 
us_states_police2019<- left_join(us_states_police, census_df)
## Joining, by = "region"
#only include 2019 data, census data for other years in inaccurate
us_states_police2019<-us_states_police2019%>%
  filter(.,date=="2019")
#calculate deaths per 10 million people for 2019 population estimates
us_states_police2019<-us_states_police2019%>%
  mutate(., deathsHundredThousand= (n/popestimate2019)*10000000)

Create heatmaps

p3 <- ggplot(data = us_states_police2019,
            aes(x = long, y = lat,
                group = group, fill = deathsHundredThousand))
p4<-p3 + geom_polygon(color = "gray90", size = 0.1)+ scale_fill_viridis_c(option = "plasma")

p5<-p4+ theme(strip.background = element_blank(),panel.grid = element_blank(), axis.title = element_blank(),axis.text = element_blank(),axis.ticks.y = element_blank(),axis.ticks.x = element_blank()) +
  xlab("")+
  ylab("")+ 
  labs(fill="Deaths",title=("Deaths per 10million people per State caused by Police in 2019"))

x3 <- ggplot(data = us_states_police2019,
            aes(x = long, y = lat,
                group = group, fill = n))

x4<-x3 + geom_polygon(color = "gray90", size = 0.1)+ scale_fill_viridis_c(option = "plasma")

x5<-x4+ theme(strip.background = element_blank(),panel.grid = element_blank(), axis.title = element_blank(),axis.text = element_blank(),axis.ticks.y = element_blank(),axis.ticks.x = element_blank()) +
  xlab("")+
  ylab("")+ 
  labs(fill="Deaths",title=("Total Deaths per State caused by Police in 2019"))
x5

p5

4.3 Racial breakdown of victims

Race of victims across all states per a year.

df3<-df%>% count(date,race)
ggplot(data=df3, mapping=aes(x=race,y=n,fill=race)) +
   geom_bar(position="dodge", stat="identity")+
   facet_wrap(~date)+
   ylab("Number of Deaths")+
   xlab("")+
   labs(fill="Race", title="Racial Demographics of Total Deaths caused by Police in the USA")+
   theme(strip.background = element_blank(),panel.grid = element_blank(),axis.ticks.x = element_blank(),axis.text.x=element_blank())

Would be interesting to add population data per demographic and check deaths by percent of race in population.

4.4 Age distribution of victims.

# There are no missing values but some of the ages are unknown. Calculate number of unknown ages and replace with the mean
# Compute a histogram of age in ggplot
sum(df$age=='Unknown')
## [1] 271

There are 271 victims with age ’Unknown". I will ignore these unknown values.

#Replace age value "Unknown" with "NA"
df <- df %>%
  mutate(age= replace(age, age == "Unknown", NA)) #Replace age value "Unknown" with "NA"

# change datatype of age from character to integer
df<-df %>% mutate(age = as.integer(age))
## Warning in mask$eval_all_mutate(dots[[i]]): NAs introduced by coercion
#plot histogram of age.

ggplot(data=df, aes(x=age,na.rm=TRUE)) +  #ignore NA values in plot
  geom_histogram(aes(fill=..count..),binwidth = 1) + 
  facet_wrap(~date)+
  labs(title="Age Distribution of Victims", x="Age", y="Count")+
  ylab("Number of Deaths")+
  xlab("Age")+
  scale_x_continuous(name = "Age",
                           breaks = seq(0, 80, 20),
                           limits=c(0, 80)) +
  scale_fill_gradient("Count", low = "blue", high = "yellow")+
  theme(strip.background = element_blank(),panel.grid = element_blank())
## Warning: Removed 300 rows containing non-finite values (stat_bin).
## Warning: Removed 16 rows containing missing values (geom_bar).

4.5 Male versus female victims

#summary of gender information
df %>% group_by(gender) %>% 
   summarise(counted = n())
## `summarise()` ungrouping output (override with `.groups` argument)
## Warning: `...` is not empty.
## 
## We detected these problematic arguments:
## * `needs_dots`
## 
## These dots only exist to allow future extensions and should be empty.
## Did you misspecify an argument?
## # A tibble: 5 x 2
##   gender      counted
##   <chr>         <int>
## 1 Female          421
## 2 Male           7981
## 3 Transgender       9
## 4 Unknown           4
## 5 <NA>             12

There are 12 NA values in gender and 4 Unknown values and 9 transgender. Change ‘Unknown’ age to NA. Will ignore these values when plotting.

df %>% 
  filter(!(gender == 'Unknown')) %>% # ignore entries where gender is Unknown
  filter(!(gender == 'NA')) %>%     # ignore entries where gender is NA
  filter(!(gender == 'Transgender')) %>%   #ignore entries where gender is transgender
   ggplot(mapping=aes(x=gender,na.rm=TRUE,fill=race)) +
   geom_bar(stat="count")+
   facet_wrap(~date)+
   ylab("Number of Deaths")+
   xlab("")+
   labs( title="Gender Demographics of Total Deaths caused by Police in the USA")+
   theme(strip.background = element_blank(),panel.grid = element_blank(),axis.ticks.x = element_blank())

5 Essay

5.1 Overview of data and cleaning

The main dataset on which this analysis was based was obtained from the Montgomery College Data Science 110 class datasets and was entitled “mapping Police Violence”. I have also utilised population data from www.census.gov for 2019. link: https://www2.census.gov/programs-surveys/popest/datasets/2010-2019/state/detail/SCPRC-EST2019-18+POP-RES.csv retrieved: 9 October 2020

I have selected 12 variables of interest from the main dataset. Although it was my initial idea to include all these variables in my EDA, I have now limited the actual number of variables in my EDA. In order to make my analysis easier, I have renamed the variables. All variable names are in lower case. Below is a list of the new variable names in italics and the original variable name in the main dataset.

age: Victim’s age
gender: Victim’s gender
race: Victim’s Race
ori: ORI Agency Identifier (if available)
mental_illness: Symptoms of mental illness
armed: Unarmed/Did Not Have an Actual Weapon
date: Date of Incident (month/day/year)
bodycam: Body Camera (source: wapo)
city: City
state: State
zipcode: Zipcode
county: County

There are 10 catergorical variables and 2 quantitative variable i.e age and date.

I have cleaned the main dataset by selecting variables of interest to me. Changing the names to make the data more user friendly (shorter names in lower case, without spaces.) I have also checked for missing values and values catergorised as "unknown’. There were 8427 rows in the data and 12 columns. There were also 3072 missing values in the dataset. This corresponded to 3029 rows with missing data. There were 2994 rows of missing values in the bodycam data alone. This left a further 53 rows with other missing values. Initially I tried leaving out these 53 rows with missing values other than bodycam, however some states only had 1 death and this removed a critical data point in the maps. I have also included the data for year 2020 although the year has not concluded. It would be interesting to see how the year 2020 ( to date) compares to previous years.

For the census data, I performed a similar cleaning process. I selected the census estimate for the population for 2019. I was not sure whether to use the census data for over 18s in the population or the total population estimate. A small percentage of the data ( 304) victims were under 18yrs of age. I have used the total population estimate although I would like to repeat the graphs using the over 18s population estimate.

5.2 Visualisations

Note **: The year 2020 has not yet concluded, however I have opted to graph this year.

Heat maps of total deaths across states
I have plotted heat maps of total deaths due to police violence across all states per year. The year 2020 has not concluded, however I have opted to include this data. Interestly enough, 3 states constantly present as those having the highest number of deaths due to police violence. These states are: California, Texas and Florida. California constantly has the highest total number of deaths from 2013 to 2020 **

Heat map of deaths per 10 million inhabitants of states in 2019
As 3 states California, Texas and Florida constantly presented as those having the highest total number of police deaths, I was intrigued by what effect including population in the data would have on the heat maps. I then opted to calculate the deaths per 10 million people in each state. I chose deaths per 10million, as the numbers for police violence is very low and wanted to make the deaths per a state relative to each other easier to visualise.

Once the population was taken into consideration for 2019, the deaths due to police violence per 10million people where very low in California, Texas and Florida. New Mexico, Oklahoma and Mississippi had the highest deaths due to police violence per 10million people.

It would be interesting to repeat these graphs limiting the population estimates to people over 18yrs of age as the majority of the victims are older than 18 and those victims under 18yrs of age can be considered outliers.

Racial Demographics of Victims
Victims with ‘Unknown Race’ have also been catergorised in these plots. I was surprised to discover that ‘white’ people are the largest racial demographic of victims. It would be interesting to also add racial population demographics to these plots. I have accessed the population estimates by race from the census bureau, however I was confused by the racial classifications. For example: black non-hispanic, white hispanic, white non- hispanic and hispanic.

Age Distribution of Victims
There were 271 victims with age unknown.These datapoints have been ignored in the plots. From the visualisation is it evident that the vast majority of victims are between the ages of 20 and 40yrs of age.

Gender of Victims
In the dataset there were 12 victims where the age was NA, 4 unknown genders and 9 transgender. These data points have been ignored from the gender plot. I did not delete these datapoints from the main dataset as some states have very low deaths number or only 1 death and removing this from the dataset would skew the heatmaps. From the visualisation is it obvious that the vast majority of victims are white males. The number of female victims are extremely low. Of the female victims, the vast majority are white females.