#{r setup, include=FALSE} # knitr::opts_chunk$set(echo = TRUE) # options(repos="https://cran.rstudio.com") #
library(tidyverse)
library(ggplot2)
library(psych)
library(RColorBrewer)
library(lubridate)
# install.packages("devtools")
# devtools::install_github("thomasp85/patchwork") # it may be outdated
# package "patchwork" is used to combine ggplots into same graph
# information at https://patchwork.data-imaginist.com)
# install.packages("patchwork", repos = "https://cran.rstudio.com/bin/macosx/contrib/4.0/patchwork_1.0.1.tgz")
# devtools::install_github("pdil/usmap")
library(patchwork)
library(usmap)
#setwd("/Users/raulmiranda/Desktop/DATA 110 Fall 2020/Databases")
setwd("C:/Users/MCuser/Dropbox/Rachel/MontColl/grant proposal")
mpv <- read.csv("mappingPoliceViolenceFull.csv")
# mpv_PD <- read.csv("MPVDataset_by PD.csv") # not used in this project
# mpv_State <- read.csv("MPVDataset_by State.csv") # not used in this project
# str(mpv)
# mpv
Select and rename: Age, Gender, Race, Date, City, State, Zipcode, County, Agency, Cause, Disposition, Charges, Mental, Unarmed, Weapon, Threat, Fleeing, Camera
Clean the gender and race columns
mpv_clean <- select(mpv, "age"="Victim.s.age", "gender"="Victim.s.gender", "race"="Victim.s.race", "date"="Date.of.Incident..month.day.year.", "city"="City", "state"="State","zip"= "Zipcode", "agency"="Agency.responsible.for.death", "cause" = "Cause.of.death", "disposition" = "Official.disposition.of.death..justified.or.other.", "charges" = "Criminal.Charges.", "mental" = "Symptoms.of.mental.illness.", "unarmed"="Unarmed.Did.Not.Have.an.Actual.Weapon", "weapon" = "Alleged.Weapon..Source..WaPo.and.Review.of.Cases.Not.Included.in.WaPo.Database.", "threat"="Alleged.Threat.Level..Source..WaPo.", "fleeing"="Fleeing..Source..WaPo.", "camera"="Body.Camera..Source..WaPo.")
mpv_clean$gender <- trimws(mpv_clean$gender) # remove trailing spaces in gender column
mpv_clean$gender [mpv_clean$gender == ""] <- "Unknown"
mpv_clean$race [mpv_clean$race == "Native American"] <- "Native"
mpv_clean$race [mpv_clean$race == "Pacific Islander"] <- "Pacific"
mpv_clean$race [mpv_clean$race == "Unknown race"] <- "Unknown"
# mpv_clean
mpv_gender <- group_by (mpv_clean, gender) %>% summarise(shootings=n())
mpv_gender
## # A tibble: 4 x 2
## gender shootings
## <chr> <int>
## 1 Female 421
## 2 Male 7981
## 3 Transgender 9
## 4 Unknown 16
ggplot(mpv_gender, aes(gender, shootings, color=gender, fill=gender, na.rm=TRUE)) +
geom_bar(stat="identity") + labs (title = "1. Shootings by Gender")
mpv_racegender1 <- filter(mpv_clean, race==c("White", "Black", "Hispanic")) %>%
group_by (race, gender) %>% summarise(shootings=n())
mpv_racegender1
## # A tibble: 9 x 3
## # Groups: race [3]
## race gender shootings
## <chr> <chr> <int>
## 1 Black Female 25
## 2 Black Male 700
## 3 Black Transgender 2
## 4 Hispanic Female 17
## 5 Hispanic Male 456
## 6 Hispanic Unknown 1
## 7 White Female 77
## 8 White Male 1168
## 9 White Transgender 1
mpv_racegender2 <- filter(mpv_clean, race==c("Asian", "Pacific", "Native", "Unknown")) %>%
group_by (race, gender) %>% summarise(shootings=n())
mpv_racegender2
## # A tibble: 8 x 3
## # Groups: race [4]
## race gender shootings
## <chr> <chr> <int>
## 1 Asian Male 31
## 2 Native Female 2
## 3 Native Male 25
## 4 Pacific Female 2
## 5 Pacific Male 11
## 6 Unknown Female 3
## 7 Unknown Male 181
## 8 Unknown Unknown 1
p1 <- ggplot(mpv_racegender1, aes(race, shootings, color=gender, fill=gender)) +
geom_bar(stat="identity", position="dodge") + labs (title = "2. Shootings by race and gender")
p2 <- ggplot(mpv_racegender2, aes(race, shootings, color=gender, fill=gender)) +
geom_bar(stat="identity", position="dodge")
p1 + p2 # patchwork command to combine plots
mpv_date <- filter (mpv_clean) %>%
group_by (gender, date=mdy(date)) %>% # group by gender and convert dates to date format
summarise(shootings=n()) %>% # add up shootings by gender
mutate(total=cumsum(shootings)) # add cumulative total
mpv_date # data frame of shootings per day and cumulative
## # A tibble: 3,031 x 4
## # Groups: gender [4]
## gender date shootings total
## <chr> <date> <int> <int>
## 1 Female 2013-01-04 1 1
## 2 Female 2013-01-08 1 2
## 3 Female 2013-01-09 2 4
## 4 Female 2013-01-22 1 5
## 5 Female 2013-01-24 1 6
## 6 Female 2013-01-29 1 7
## 7 Female 2013-01-30 1 8
## 8 Female 2013-02-08 1 9
## 9 Female 2013-02-19 1 10
## 10 Female 2013-03-16 1 11
## # ... with 3,021 more rows
mpv_date1 <- filter (mpv_clean, gender=="Male") %>%
group_by (gender, date=mdy(date)) %>% # as above for males
summarise(shootings=n()) %>%
mutate(total=cumsum(shootings))
mpv_date1 # shootings per day and cumulative for males
## # A tibble: 2,623 x 4
## # Groups: gender [1]
## gender date shootings total
## <chr> <date> <int> <int>
## 1 Male 2013-01-01 6 6
## 2 Male 2013-01-02 1 7
## 3 Male 2013-01-03 3 10
## 4 Male 2013-01-04 5 15
## 5 Male 2013-01-05 6 21
## 6 Male 2013-01-06 4 25
## 7 Male 2013-01-07 3 28
## 8 Male 2013-01-08 1 29
## 9 Male 2013-01-09 1 30
## 10 Male 2013-01-10 3 33
## # ... with 2,613 more rows
mpv_date2 <- filter (mpv_clean, gender==c("Female" , "Transgender", "Unknown")) %>%
group_by (gender, date=mdy(date)) %>% # as above for females and other genders
summarise(shootings=n()) %>%
mutate(total=cumsum(shootings))
mpv_date2 # shootings per day and cumulative for females and others
## # A tibble: 149 x 4
## # Groups: gender [3]
## gender date shootings total
## <chr> <date> <int> <int>
## 1 Female 2013-01-09 1 1
## 2 Female 2013-01-29 1 2
## 3 Female 2013-03-31 1 3
## 4 Female 2013-04-01 1 4
## 5 Female 2013-04-09 1 5
## 6 Female 2013-04-18 1 6
## 7 Female 2013-04-27 1 7
## 8 Female 2013-05-12 1 8
## 9 Female 2013-05-17 1 9
## 10 Female 2013-06-02 1 10
## # ... with 139 more rows
p3<- ggplot(mpv_date1) +
geom_point(aes(date,shootings, colour = gender)) + scale_color_manual(values = "blue") +
labs (title = "3. Shootings per day by gender")
p4<- ggplot(mpv_date2) +
geom_point(aes(date,shootings, colour = gender)) + scale_color_manual(values = c("red", "purple", "yellow"))
p5<- ggplot(mpv_date1)+
geom_point(aes(date,total, colour = gender)) + scale_color_manual(values = "blue") +
labs (title = "4. Accumulated shootings over time by gender")
p6<- ggplot(mpv_date2)+
geom_point(aes(date,total, colour = gender)) + scale_color_manual(values = c("red", "purple", "yellow"))
p7<- ggplot(mpv_date) +
geom_point(aes(date,total, colour = gender)) +
facet_wrap(~gender) + scale_color_manual(values = c("red", "blue", "purple", "yellow")) +
labs (title = "5. All together: facet-wrap of accumulated shootings over time by gender")
p3 / p4 # patchwork command to combine plots
p5 / p6
p7
mpv_age <- mpv_clean %>% group_by (age=as.numeric(age)) %>%
summarise(shootings=n())
mpv_age
## # A tibble: 86 x 2
## age shootings
## <dbl> <int>
## 1 1 3
## 2 5 3
## 3 6 3
## 4 7 1
## 5 10 1
## 6 12 2
## 7 13 2
## 8 14 6
## 9 15 20
## 10 16 46
## # ... with 76 more rows
# plot(mpv_age) # simple plot
p7 <- ggplot(mpv_age) + # try out the same with ggplot
geom_point(aes(age,shootings)) + labs (title = "6. Shootings by age of victim")
p7
p7 + geom_smooth(aes(age,shootings)) + # explore loess smoothing
labs (title = "7. Try loess smoothing")
p7 + geom_smooth(aes(age,shootings), method="lm", formula=y~poly(x,3)) + # explore cubic smooting
labs (title = "8. Try cubic function")
p7 + geom_smooth(aes(age,shootings), method="gam", formula=y~s(x)) + # explore a Gen. Additive Model
labs (title = "9. Try Generative Additive Model")
mpv_age_race <- filter(mpv_clean, race==c("White", "Black", "Hispanic")) %>% # filter race
group_by (race, age=as.numeric(age)) %>% # group by age
summarise(shootings=n()) # count shootings
mpv_age_race
## # A tibble: 188 x 3
## # Groups: race [3]
## race age shootings
## <chr> <dbl> <int>
## 1 Black 12 1
## 2 Black 14 1
## 3 Black 15 3
## 4 Black 16 4
## 5 Black 17 16
## 6 Black 18 11
## 7 Black 19 16
## 8 Black 20 33
## 9 Black 21 21
## 10 Black 22 33
## # ... with 178 more rows
p8 <- ggplot(mpv_age_race) +
geom_point(aes(age,shootings, color= race)) + labs (title = "10. Shootings by age and race")
p8
p8 + geom_smooth(aes(age,shootings), method="gam", formula=y~s(x)) + # use a GAM for global fit
labs (title = "11. Global fit with a GAM")
p8 + geom_smooth(aes(age,shootings, color=race), method="gam", formula=y~s(x), se=FALSE) + # without SE
labs (title = "12. Shootings Target the Young", x="Age", y="Shootings")
mpv_state <- mpv_clean %>% group_by(state) %>% summarise(deaths=n())
mpv_state
## # A tibble: 51 x 2
## state deaths
## <chr> <int>
## 1 AK 47
## 2 AL 153
## 3 AR 117
## 4 AZ 373
## 5 CA 1294
## 6 CO 263
## 7 CT 40
## 8 DC 27
## 9 DE 21
## 10 FL 607
## # ... with 41 more rows
mpv_state_cause <- filter(mpv_clean, grepl("Gunshot",cause) | grepl("^Taser",cause) | cause=="Asphyxiated" | grepl("Beaten", cause) | cause=="Physical Restraint" | cause =="Vehicle" ) %>%
group_by (state, cause) %>% summarise(deaths=n())
mpv_state_cause
## # A tibble: 225 x 3
## # Groups: state [51]
## state cause deaths
## <chr> <chr> <int>
## 1 AK Gunshot 46
## 2 AK Gunshot, Taser 1
## 3 AL Beaten 1
## 4 AL Gunshot 139
## 5 AL Gunshot, Taser 5
## 6 AL Taser 7
## 7 AL Tasered 1
## 8 AR Beaten 1
## 9 AR Gunshot 109
## 10 AR Gunshot, Police Dog 1
## # ... with 215 more rows
#head(us_map())
#head(statepop)
#plot_usmap("states")
p9 <- plot_usmap(data =statepop, values="pop_2015") + #add labels=T if state Abbrev are desired
scale_fill_continuous(low="white", high="blue", name="Population (2015)", label=scales::comma,
guide=guide_colorbar(barwidth=15, barheight=0.5, title.position = "top")) +
labs(title = "13. State Population",
subtitle = "in 2015") +
theme(panel.background = element_rect(color = "black", fill = "lightblue"), legend.position="bottom")
mpv_state_pop <- left_join(mpv_state, statepop[,c("abbr","pop_2015")], by=c("state"="abbr")) %>%
mutate(deathrate = deaths/pop_2015*1000000)
mpv_state_pop
## # A tibble: 51 x 4
## state deaths pop_2015 deathrate
## <chr> <int> <dbl> <dbl>
## 1 AK 47 738432 63.6
## 2 AL 153 4858979 31.5
## 3 AR 117 2978204 39.3
## 4 AZ 373 6828065 54.6
## 5 CA 1294 39144818 33.1
## 6 CO 263 5456574 48.2
## 7 CT 40 3590886 11.1
## 8 DC 27 672228 40.2
## 9 DE 21 945934 22.2
## 10 FL 607 20271272 29.9
## # ... with 41 more rows
mpv_state_cause_pop <- left_join(mpv_state_cause, statepop[,c("abbr","pop_2015")], by=c("state"="abbr")) %>%
mutate(deathrate = deaths/pop_2015*1000000)
mpv_state_cause_pop
## # A tibble: 225 x 5
## # Groups: state [51]
## state cause deaths pop_2015 deathrate
## <chr> <chr> <int> <dbl> <dbl>
## 1 AK Gunshot 46 738432 62.3
## 2 AK Gunshot, Taser 1 738432 1.35
## 3 AL Beaten 1 4858979 0.206
## 4 AL Gunshot 139 4858979 28.6
## 5 AL Gunshot, Taser 5 4858979 1.03
## 6 AL Taser 7 4858979 1.44
## 7 AL Tasered 1 4858979 0.206
## 8 AR Beaten 1 2978204 0.336
## 9 AR Gunshot 109 2978204 36.6
## 10 AR Gunshot, Police Dog 1 2978204 0.336
## # ... with 215 more rows
p10 <- plot_usmap(data= mpv_state_pop, values="deathrate") + #add labels=T for state abbr
scale_fill_continuous(low="white", high="red", name="Death Rate per Million Inhabitants", label=scales::comma,
guide=guide_colorbar(barwidth=15, barheight=0.5, title.position = "top")) +
labs(title = "14. All Police-Caused Deaths",
subtitle = "in the 2013-2020 period") +
theme(panel.background = element_rect(color = "black", fill = "lightblue"), legend.position="bottom")
p11 <- plot_usmap(data= filter(mpv_state_cause_pop, grepl("Gunshot",cause)), values="deathrate") +
#add labels=T for state abbr
scale_fill_continuous(low="white", high="purple", name="Death Rate per Million Inhabitants",
label=scales::comma,
guide=guide_colorbar(barwidth=15, barheight=0.5, title.position = "top")) +
labs(title = "15. Leading Cause: Gunshots",
subtitle = "in the 2013-2020 period") +
theme(panel.background = element_rect(color = "black", fill = "lightblue"), legend.position="bottom")
p12 <- plot_usmap(data= filter(mpv_state_cause_pop, cause=="Asphyxiated"), values="deathrate") +
#add labels=T for state abbr
scale_fill_continuous(low="white", high="yellow", name="Death Rate per Million Inhabitants",
label=scales::comma,
guide=guide_colorbar(barwidth=15, barheight=0.5, title.position = "top")) +
labs(title = "16. Leading Cause: Asphyxiation",
subtitle = "in the 2013-2020 period") +
theme(panel.background = element_rect(color = "black", fill = "lightblue"), legend.position="bottom")
p13 <- plot_usmap(data= filter(mpv_state_cause_pop, cause=="Physical Restraint"), values="deathrate") +
#add labels=T for state abbr
scale_fill_continuous(low="white", high="chocolate1", name="Death Rate per Million Inhabitants",
label=scales::comma,
guide=guide_colorbar(barwidth=15, barheight=0.5, title.position = "top")) +
labs(title = "17. Leading Cause: Physical Restraint",
subtitle = "in the 2013-2020 period") +
theme(panel.background = element_rect(color = "black", fill = "lightblue"), legend.position="bottom")
p14 <- plot_usmap(data= filter(mpv_state_cause_pop, grepl("Beaten", cause)), values="deathrate") +
#add labels=T for state abbr
scale_fill_continuous(low="white", high="magenta", name="Death Rate per Million Inhabitants",
label=scales::comma,
guide=guide_colorbar(barwidth=15, barheight=0.5, title.position = "top")) +
labs(title = "18. Leading Cause: Beaten",
subtitle = "in the 2013-2020 period") +
theme(panel.background = element_rect(color = "black", fill = "lightblue"), legend.position="bottom")
p15 <- plot_usmap(data= filter(mpv_state_cause_pop, cause=="Vehicle"), values="deathrate") +
#add labels=T for state abbr
scale_fill_continuous(low="white", high="aquamarine", name="Death Rate per Million Inhabitants",
label=scales::comma,
guide=guide_colorbar(barwidth=15, barheight=0.5, title.position = "top")) +
labs(title = "19. Leading Cause: Vehicle",
subtitle = "in the 2013-2020 period") +
theme(panel.background = element_rect(color = "black", fill = "lightblue"), legend.position="bottom")
p16 <- plot_usmap(data= filter(mpv_state_cause_pop, grepl("^Taser",cause)), values="deathrate") +
#add labels=T for state abbr
scale_fill_continuous(low="white", high="chartreuse", name="Death Rate per Million Inhabitants",
label=scales::comma,
guide=guide_colorbar(barwidth=15, barheight=0.5, title.position = "top")) +
labs(title = "20. Leading Cause: Taser",
subtitle = "in the 2013-2020 period") +
theme(panel.background = element_rect(color = "black", fill = "lightblue"), legend.position="bottom")
p9+p10
p10+p11
p12+p13
p14+p15
p10+p16
R. Miranda
October 13, 2020
https://mappingpoliceviolence.org/s/MPVDatasetDownload.xlsx with the 2013-2020 record of Police killings.
The MPV project defines Police Killing as a case where a person dies as a result of being shot, beaten, restrained, intentionally hit by a police vehicle, pepper sprayed, tasered, or otherwise harmed by police officers, whether on-duty or off-duty.
A person was coded as unarmed when not holding any weapon or object when killed; when household/personal objects were not used to attach; when holding toy weapons; when bystanders or hostages were killed by the police; and when persons were hit by police cars. A person was coded as allegedly armed under other circumstances.
A person was coded as using a vehicle as a weapon when killed by police while driving to attack; when killed while escaping by car at high speeds. Persons who crash without being hit by the police, or are hit and killed by a civilian driver are not included in the database.
The full dataset, originally in Excel, contains 29 variables and 8428 records as of 10/04/2020. It was downloaded and save as .csv in my desktop drive.
The variables include the victim’s name, gender, race, face image if available at fatalencounters.org, date of incident, address, city, state, zip, county, agency or police department responsible for incident and the ORI identifier, the cause of death, a brief description of the encounter, the officiall disposition of the case at present, whether criminal charges exist, link to news article, symptoms of mental illness, whether victim was unarmed, alleged weapon from the Washington Post database, alleged level of threat from the Washington Post, fleeing circumstance from the Washington Post, body camera (from the Washington Post), Washington Post identifier, off-duty police status, geography (suburban, rural, etc., from Trulia population density map), MPV identifier and Fatal Encounters identifier.
The dataset is rich in details. Additional information is also available from the linked records, such as the background of the police officers (previous killings committed) and of the victims (e.g., on parole) when available. That information was not obtained for the current project.
The dataset was reasonably structured and clean, requiring some but not extensive manipulation. It was examined using Excel filtering to check for obvious problems. The continuous variables were converted to numeric (when used), particularly the dates and zips.
Only a subset of 18 variables were selected for further analysis: Age, Gender, Race, Date, City, State, Zipcode, County, Agency, Cause, Disposition, Charges, Mental, Unarmed, Weapon, Threat, Fleeing, Camera. In the end, because of lack of time, the only variables employed in the analysis were Age, Gender, Race, Date, and State.
Variables with long names were shortened as shown for convenient use in R statements. A complication was that variable names with spaces, special characters, etc., are converted to periods when saving Excel as .csv. Line 48 of the code shows the necessary cleaning. Some of the gender entries had trailing spaces that needed to be eliminated for efficient sorting. Race values were also shortened, as shown in line 50-55.
The initial exploration was the distribution of dead victims according to gender. Code line 60 shows grouping by gender and summarizing all shootings by gender (shooting is the generic term used here for a dead victim.) Ggplot bargraph was used.
As depicted in Fig. 1 bar chart, the vast majority are males, 5% are females, and a small minority are transgender or unknown.
A second step was to examine the interaction between gender and race on the number of shootings. As shown in Fig. 2 bar charts, white females are about 8% of the white victims, while hispanic females are about 5% of the hispanic victims. For black females, the percentage is about 2.5% of black victims, and for Asian Pacific, Native Americans, and unknown race, it’s smaller than 2% of corresponding victims.
In terms of racial distribution, black victims amount to 26% of shootings, hispanic are 17%, white are 45%, asian and native groups are 2.5% and the remainder are of unknown race. It is important to notice that black plus hispanic victims amount to about 95% of white victims, more than the make-up of the population.
In order to place figures side to side or special arrangements, we used “patchwork”, a package downloaded from cran.rstudio.com and documented at the top of the code. This was used for Fig. 2 and subsequent ones.
Detail about Fig. 2: we divided up the plot into two bar charts: the y-axis range of the left chart is 4 times that of the right one, in order to visualize better the differences between majority and minority victims. “Patchwork” allows for placing the two plots side by side.
We explored the daily and cumulative number of shootings over the 2013-2020 period, in code lines 86-122. The shootings were separated for males, females, transgender and other. Fig. 3 shows an uncomfortable steady level of daily shootings over the 7.7 year period, at about 2.9 deaths per day for males and about 0.15 deaths per day for females, showing no sign of improvement over the period. Figs. 4 and 5 show the cumulative increase over time for all genders, and the constant slope conveys the same conclusion.
In code lines 126-140 we explored shootings according to age of victim. Fig. 6 shows a negatively-skewed bell curve with a mean of about 30 years of age. We tried various smoothing functions, as shown in Figs. 7-9, a loess smoothing, cubic function and a generative additive model. Certainly the GAM is the better model, although the default GAM model fails slightly between ages of 0 and 10.
In code 144-157 we looked at how race affects the age distribution of victims. Figs. 10-12 show that the mean for black victims is lower, at about 27 years of age, than for hispanic at about 28 and white victims at 30 years of age.
In code 160-196 we explored the usmap library and the plot_usmap package to visualize shootings by state and cause of death.
We joined the MPV dataset and the StatePopulation dataset, and in line 195 defined the death rate as deaths/state-population per million inhabitants.
We filtered out the following causes of death: Gunshot, Asphyxiation, Physical Restraint, Beating, Vehicle and Taser, and plotted the sums by state on the USA map, with color intensity scaled according to death rate.
Figs. 13 and 14 compare the state population with the total death rate by all causes, showing the predominance of Oklahoma, Alabama, Alaska, and the relatively high death rate for some mid western states. Figs. 14 and 15 show that the leading cause is gunshots for those same states. Figs. 16 to 20 show the state distribution of the lesser causes of death, asphyxiation, physical restraint, beating, and taser. The dark grey color indicates states that didn’t have any occurrence.
The analysis of these factors corroborate results presented by others and are not surprising at this stage. With more effort, other aspects of the same MPV dataset could analyzed in depth, such as relations between unarmed victims and race, age, and state. Likewise, correlations of those variables with mental state could be important. With even more work, there is important information in the description of events, which could show revealing trends. The news articles, if they could be mined automatically, would be invaluable.
The Town-Hall event revealed personal experiences that started in all cases at an early age with confrontations against police personnel who were not well trained to understand actions of young individuals. Many of those confrontations could lead to unjustified police violence. This project actually confirms that sadly death rises dramatically between 15 and 30 years of age.
In terms of graphics, this project gave me the opportunity to start exploring some visualization techniques. The mapping packages are of course very nice and I could learn a lot more. The next step is to include more quantitative information in them, as well as interactive features.