Import relevant libraries
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(plotly)
## Loading required package: ggplot2
##
## Attaching package: 'plotly'
## The following object is masked from 'package:ggplot2':
##
## last_plot
## The following object is masked from 'package:stats':
##
## filter
## The following object is masked from 'package:graphics':
##
## layout
library(lubridate)
##
## Attaching package: 'lubridate'
## The following objects are masked from 'package:base':
##
## date, intersect, setdiff, union
library(ggplot2)
library(RColorBrewer)
library(streamgraph)
library(alluvial)
library(stringr)
library(ggthemes)
library(ggiraph)
library(ggiraphExtra)
##
## Attaching package: 'ggiraphExtra'
## The following object is masked from 'package:ggthemes':
##
## theme_clean
Load the data
setwd("~/Desktop/Python Datasets")
pf <- read.csv("police_fatalities.csv",stringsAsFactors = TRUE)
Analyze the structure
head(pf)
## Date NumberOfSubjects Fatal SubjectArmed SubjectRace SubjectGender
## 1 3/4/2010 1 N N L U
## 2 1/9/2010 1 F Y W M
## 3 6/10/2010 1 F Y W M
## 4 11/10/2010 1 N Y U M
## 5 11/11/2010 1 N Y L M
## 6 11/11/2010 1 N N L F
## SubjectAge NatureOfStop NumberOfShots NumberOfOfficers OfficerRace
## 1 U <NA> <NA> 1 W
## 2 U <NA> <NA> 1 W
## 3 U <NA> <NA> 2 W;W
## 4 U <NA> <NA> 1 W
## 5 U <NA> <NA> 1 W
## 6 U <NA> <NA> 1 W
## OfficerGender Department FullNarrative City
## 1 M Albuquerque Police Department <NA> Albuquerque
## 2 M Albuquerque Police Department <NA> Albuquerque
## 3 M;M Albuquerque Police Department <NA> Albuquerque
## 4 M Albuquerque Police Department <NA> Albuquerque
## 5 M Albuquerque Police Department <NA> Albuquerque
## 6 M Albuquerque Police Department <NA> Albuquerque
## Notes
## 1 motorized vehicle; no hits
## 2
## 3
## 4
## 5
## 6 motorized vehicle
str(pf)
## 'data.frame': 4400 obs. of 16 variables:
## $ Date : Factor w/ 2328 levels "01/01/2010","01/01/2011",..: 1376 568 1720 764 769 769 1836 2315 1207 1571 ...
## $ NumberOfSubjects: int 1 1 1 1 1 1 1 1 1 1 ...
## $ Fatal : Factor w/ 5 levels " F"," N","F",..: 4 3 3 4 4 4 3 4 3 3 ...
## $ SubjectArmed : Factor w/ 4 levels "N","U","Y","Y ": 1 3 3 3 3 1 3 3 3 3 ...
## $ SubjectRace : Factor w/ 6 levels "A","B","L","O",..: 3 6 6 5 3 3 3 5 6 3 ...
## $ SubjectGender : Factor w/ 5 levels "F","M","M;U",..: 5 2 2 2 2 1 2 2 2 2 ...
## $ SubjectAge : Factor w/ 76 levels "0-19","12","13",..: 75 75 75 75 75 75 75 75 75 75 ...
## $ NatureOfStop : Factor w/ 436 levels "\"suspicious person looking into vehicles in the area\"",..: NA NA NA NA NA NA NA NA NA NA ...
## $ NumberOfShots : Factor w/ 103 levels ">/=1",">/=12",..: NA NA NA NA NA NA NA NA NA NA ...
## $ NumberOfOfficers: Factor w/ 25 levels ">1",">2",">3",..: 8 8 15 8 8 8 18 8 8 8 ...
## $ OfficerRace : Factor w/ 217 levels "","A","A;A","A;B;W",..: 138 138 167 138 138 138 88 138 138 138 ...
## $ OfficerGender : Factor w/ 96 levels "F","F;F","F;F;M;F",..: 19 19 29 19 19 19 36 19 19 19 ...
## $ Department : Factor w/ 49 levels "Albuquerque Police Department",..: 1 1 1 1 1 1 1 1 1 1 ...
## $ FullNarrative : Factor w/ 1770 levels ""," \"shot during violent physical confrontation following foot chase (admitted to facts sufficient for guilty fin"| __truncated__,..: NA NA NA NA NA NA NA NA NA NA ...
## $ City : Factor w/ 47 levels "Albuquerque",..: 1 1 1 1 1 1 1 1 1 1 ...
## $ Notes : Factor w/ 869 levels ""," Shot in chest",..: 510 1 1 1 1 509 1 1 1 1 ...
Remove Unnecessary Punctuation from NumberOfShots, NumberOfOfficers, and SubjectAge
pf1 <- pf %>% mutate(NumberOfShots = gsub(">/=","",NumberOfShots), NumberOfOfficers = gsub(">","",NumberOfOfficers))
pf2 <- pf1 %>% filter(!SubjectAge %in% c("U","Juvenile","UNKNOWN","N/A"))
pf2 <- pf2 %>% mutate(SubjectAge = as.character(SubjectAge))
for (i in 1:nrow(pf2)){
if (pf2[i,7] == "0-19"){
pf2[i,"SubjectAge2"] <- sample(10:19,1)
} else if (pf2[i,7] == "21-23"){
pf2[i,"SubjectAge2"] <- sample(21:23,1)
} else if (pf2[i,7] == "20-29"){
pf2[i,"SubjectAge2"] <- sample(20:29,1)
} else if (pf2[i,7] == "30-39"){
pf2[i,"SubjectAge2"] <- sample(30:39,1)
} else if (pf2[i,7] == "40-49"){
pf2[i,"SubjectAge2"] <- sample(40:49,1)
}else if (pf2[i,7] == "50-59"){
pf2[i,"SubjectAge2"] <- sample(50:59,1)
}else if (pf2[i,7] == "60-69"){
pf2[i,"SubjectAge2"] <- sample(60:69,1)
}else{
pf2[i,"SubjectAge2"] <- as.numeric(pf2[i,7])
}
}
pf2 <- pf2 %>% mutate(NumberOfOfficers = as.numeric(NumberOfOfficers), NumberOfShots = as.numeric(NumberOfShots))
## Warning: Problem with `mutate()` input `NumberOfOfficers`.
## x NAs introduced by coercion
## ℹ Input `NumberOfOfficers` is `as.numeric(NumberOfOfficers)`.
## Warning in mask$eval_all_mutate(dots[[i]]): NAs introduced by coercion
## Warning: Problem with `mutate()` input `NumberOfShots`.
## x NAs introduced by coercion
## ℹ Input `NumberOfShots` is `as.numeric(NumberOfShots)`.
## Warning in mask$eval_all_mutate(dots[[i]]): NAs introduced by coercion
Create Date Column
pf2 <- pf2 %>% mutate(Date = mdy(as.character(Date)))
## Warning: Problem with `mutate()` input `Date`.
## x 176 failed to parse.
## ℹ Input `Date` is `mdy(as.character(Date))`.
## Warning: 176 failed to parse.
Create New Variables to Separate Officer Race and Gender
for (i in 1:nrow(pf2)){
pf2[i,"AsianOfficers"] = str_count(pf2[i,"OfficerRace"], pattern = "A|A/W|A/PI|W/A|ASIAN")
pf2[i,"BlackOfficers"] = str_count(pf2[i,"OfficerRace"], pattern = "B|BLACK")
pf2[i,"LatinoOfficers"] = str_count(pf2[i,"OfficerRace"], pattern = "L|H|H/LW/ H|W/H")
pf2[i,"WhiteOfficers"] = str_count(pf2[i,"OfficerRace"], pattern = "W|WHITE|NA/W|W/")
pf2[i,"NativeAmericanOfficers"] = str_count(pf2[i,"OfficerRace"], pattern = "AI/AN")
pf2[i,"OtherOfficers"] = str_count(pf2[i,"OfficerRace"], pattern = "O|Other|Multi-Racial")
pf2[i,"UnknownOfficers"] = str_count(pf2[i,"OfficerRace"], pattern = "U|Unknown|NA")
}
for (i in 1:nrow(pf2)){
pf2[i,"MaleOfficers"] = str_count(pf2[i,"OfficerGender"], pattern = "M")
pf2[i,"FemaleOfficers"] = str_count(pf2[i,"OfficerGender"], pattern = "F")
}
pf3 <- pf2 %>% filter(!is.na(SubjectRace),SubjectGender != "U",Fatal != "U")
Create binary category for Officer Race (White vs. NonWhite)
pf_off_race <- pf3
for (i in 1:nrow(pf_off_race)){
if (pf_off_race[i,"WhiteOfficers"] > pf_off_race[i,"BlackOfficers"] & pf_off_race[i,"WhiteOfficers"] > pf_off_race[i,"LatinoOfficers"]){
pf_off_race[i,"OfficerRace2"] = "White"
} else{
pf_off_race[i,"OfficerRace2"] = "Non-White"
}
}
pf_off_race2 <- pf_off_race %>% filter(SubjectRace %in% c("W","L","B"), Fatal %in% c("F","N")) %>% mutate(Status = case_when(Fatal == "F"~0, Fatal == "N"~1))
Boxplots
# Victim Age by Race
ggplot(pf3) + geom_boxplot(mapping = aes(SubjectAge2)) + facet_wrap(~SubjectRace)
# Victim Age by Gender
ggplot(pf3) + geom_boxplot(mapping = aes(SubjectAge2)) + facet_wrap(~SubjectGender)
# Number of Officers by Victim Race
ggplot(pf3) + geom_boxplot(mapping = aes(NumberOfOfficers)) + facet_wrap(~SubjectRace)
## Warning: Removed 46 rows containing non-finite values (stat_boxplot).
# Number of Officers by Victim Fatality
pf3 %>% ggplot() + geom_boxplot(mapping = aes(NumberOfOfficers)) + facet_wrap(~Fatal)
## Warning: Removed 46 rows containing non-finite values (stat_boxplot).
# Number of White Officers by Victim Race
ggplot(pf3) + geom_boxplot(mapping = aes(WhiteOfficers)) + facet_wrap(~SubjectRace)
pf4 <- pf_off_race %>% filter(!is.na(NumberOfShots))
# Number of Shots by Victim Race
ggplot(pf4) + geom_boxplot(mapping = aes(NumberOfShots)) + facet_wrap(~SubjectRace)
# Number of Shots by Fatality
ggplot(pf4) + geom_boxplot(mapping = aes(NumberOfShots)) + facet_wrap(~Fatal)
Barcharts
fatal_race <- pf3 %>% filter(Fatal == "F") %>% group_by(SubjectRace,Fatal) %>% count()
fatal_race
## # A tibble: 6 x 3
## # Groups: SubjectRace, Fatal [6]
## SubjectRace Fatal n
## <fct> <fct> <int>
## 1 A F 11
## 2 B F 327
## 3 L F 146
## 4 O F 5
## 5 U F 63
## 6 W F 261
# Fatalities by Race
ggplot(fatal_race) + geom_col(mapping = aes(SubjectRace,n, fill = SubjectRace))
fatal_race2 <- pf3 %>% group_by(SubjectRace) %>% count() %>% summarise(total = sum(n))
## `summarise()` ungrouping output (override with `.groups` argument)
fatal_race2
## # A tibble: 6 x 2
## SubjectRace total
## <fct> <int>
## 1 A 22
## 2 B 994
## 3 L 368
## 4 O 8
## 5 U 121
## 6 W 484
# Total Shootings by Race
ggplot(fatal_race2) + geom_col(mapping = aes(SubjectRace,total, fill = SubjectRace))
fatal_race3 <- pf3 %>% group_by(SubjectRace,Fatal) %>% count()
fatal_race3
## # A tibble: 12 x 3
## # Groups: SubjectRace, Fatal [12]
## SubjectRace Fatal n
## <fct> <fct> <int>
## 1 A F 11
## 2 A N 11
## 3 B F 327
## 4 B N 667
## 5 L F 146
## 6 L N 222
## 7 O F 5
## 8 O N 3
## 9 U F 63
## 10 U N 58
## 11 W F 261
## 12 W N 223
# Fatal and Non-Fatal of Shootings by Race
ggplot(fatal_race3) + geom_col(mapping = aes(SubjectRace,n, fill = Fatal),position = "dodge")
Multiple Linear Regression Models
pf5 <- pf_off_race %>% mutate(Status = case_when(Fatal == "F"~0, Fatal == "N"~1))
pf6 <- pf5 %>% filter(SubjectRace %in% c("W","L","B","A"))
pf7 <- pf5 %>% filter(SubjectRace %in% c("W","L","B"))
# Number of Shots as Linear Function of Age, Number of Officers, and Officer Race
num_off <- lm(NumberOfShots~SubjectAge2 + NumberOfOfficers + OfficerRace2,data=pf5)
summary(num_off)
##
## Call:
## lm(formula = NumberOfShots ~ SubjectAge2 + NumberOfOfficers +
## OfficerRace2, data = pf5)
##
## Residuals:
## Min 1Q Median 3Q Max
## -31.077 -3.106 -1.648 1.751 42.335
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 0.78198 0.81339 0.961 0.33667
## SubjectAge2 -0.05095 0.02333 -2.184 0.02929 *
## NumberOfOfficers 3.89143 0.21883 17.783 < 2e-16 ***
## OfficerRace2White 1.74352 0.53260 3.274 0.00111 **
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 7.15 on 747 degrees of freedom
## (1246 observations deleted due to missingness)
## Multiple R-squared: 0.3025, Adjusted R-squared: 0.2997
## F-statistic: 108 on 3 and 747 DF, p-value: < 2.2e-16
# Plot of Number of Shots as Linear Function of Age, Number of Officers, and Officer Race
ggPredict(num_off,se=TRUE,interactive = TRUE)
# Number of Shots as Linear Function of Age, Victim Race, and Number of Officers
num_shot <- lm(NumberOfShots~SubjectAge2+ NumberOfOfficers + SubjectRace,data=pf7)
summary(num_shot)
##
## Call:
## lm(formula = NumberOfShots ~ SubjectAge2 + NumberOfOfficers +
## SubjectRace, data = pf7)
##
## Residuals:
## Min 1Q Median 3Q Max
## -32.082 -3.027 -1.621 1.561 41.783
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 0.57823 0.83251 0.695 0.488
## SubjectAge2 -0.03388 0.02530 -1.339 0.181
## NumberOfOfficers 4.19422 0.23047 18.198 <2e-16 ***
## SubjectRaceL 0.81007 0.70808 1.144 0.253
## SubjectRaceW -0.09915 0.70765 -0.140 0.889
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 7.077 on 681 degrees of freedom
## (1160 observations deleted due to missingness)
## Multiple R-squared: 0.3283, Adjusted R-squared: 0.3244
## F-statistic: 83.21 on 4 and 681 DF, p-value: < 2.2e-16
# Plot of Number of Shots as Linear Function of Age, Officer Race, and Status of Shooting
ggPredict(num_shot,se=TRUE,interactive = TRUE)
Primary Multiple Linear Model and Scatterplot
# Model Number of Shots as a Function of Number of Officers, Victim Age, Victim Race, and Officer Race
pf_fit1 <- lm(NumberOfShots ~ NumberOfOfficers + SubjectRace + SubjectAge2 + OfficerRace2, data = pf6)
summary(pf_fit1)
##
## Call:
## lm(formula = NumberOfShots ~ NumberOfOfficers + SubjectRace +
## SubjectAge2 + OfficerRace2, data = pf6)
##
## Residuals:
## Min 1Q Median 3Q Max
## -32.939 -3.125 -1.620 1.526 40.967
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -2.34248 3.22998 -0.725 0.46856
## NumberOfOfficers 4.18850 0.22833 18.344 < 2e-16 ***
## SubjectRaceB 2.53958 3.16679 0.802 0.42287
## SubjectRaceL 3.26869 3.20666 1.019 0.30840
## SubjectRaceW 2.17507 3.21029 0.678 0.49830
## SubjectAge2 -0.03955 0.02517 -1.571 0.11660
## OfficerRace2White 1.46030 0.54935 2.658 0.00804 **
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 7.034 on 684 degrees of freedom
## (1177 observations deleted due to missingness)
## Multiple R-squared: 0.3349, Adjusted R-squared: 0.329
## F-statistic: 57.39 on 6 and 684 DF, p-value: < 2.2e-16
# Plot Number of Shots as a Function of Number of Officers, Victim Age, Victim Race, and Fatality
pf_plot1 <-ggplot(pf6, mapping = aes(SubjectAge2,NumberOfShots,color = SubjectRace)) + geom_point(mapping = aes(color = SubjectRace, size = NumberOfOfficers, shape = OfficerRace2, alpha = 0.5)) + xlab("Victim Age") + ylab("Number of Shots") + ggtitle("Number of Shots by Age and Race of Victim and Number of Officers") + theme(plot.title = element_text(hjust = 0.5)) + theme_economist() + geom_smooth(method = "lm",se=FALSE)
pf_plot1
## `geom_smooth()` using formula 'y ~ x'
## Warning: Removed 1175 rows containing non-finite values (stat_smooth).
## Warning: Removed 1177 rows containing missing values (geom_point).
pf_plot2 <- ggplotly(pf_plot1)
## `geom_smooth()` using formula 'y ~ x'
## Warning: Removed 1175 rows containing non-finite values (stat_smooth).
pf_plot2
Multiple Logistic Regression Models
# Survival as Function of Age and Officer Race (White or NonWhite)
off_race <- glm(Status ~ SubjectAge2 + OfficerRace2, data = pf_off_race2, family = binomial)
summary(off_race)
##
## Call:
## glm(formula = Status ~ SubjectAge2 + OfficerRace2, family = binomial,
## data = pf_off_race2)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -1.7018 -1.2772 0.8200 0.9758 1.7255
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) 1.705832 0.148232 11.508 <2e-16 ***
## SubjectAge2 -0.037556 0.004272 -8.791 <2e-16 ***
## OfficerRace2White -0.272358 0.098114 -2.776 0.0055 **
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 2481.1 on 1845 degrees of freedom
## Residual deviance: 2389.4 on 1843 degrees of freedom
## AIC: 2395.4
##
## Number of Fisher Scoring iterations: 4
# Plot of Survival as Function of Age and Officer Race (White or NonWhite)
ggPredict(off_race,se=TRUE,interactive = TRUE)
## Warning in eval(family$initialize): non-integer #successes in a binomial glm!
## Warning in eval(family$initialize): non-integer #successes in a binomial glm!
x_age <- seq(0,150,0.01)
log_off_race <- with(pf_off_race2,expand.grid(SubjectAge2 = x_age, OfficerRace2 = unique(OfficerRace2)))
log_off_race$y <-predict(off_race,log_off_race,type = "response")
# Plot of Survival as Function of Age and Officer Race (White or NonWhite) using GGplot
pf_odds<-ggplot(log_off_race, mapping = aes(SubjectAge2,y,colour = OfficerRace2, group = OfficerRace2)) + geom_point()+ geom_line() +ylim(c(0,1)) + xlab("Victim Age") + ylab("Probability of Survival") + ggtitle("Police Shooting Survival Odds by Officer Race") + theme(plot.title = element_text(hjust = 0.5)) + theme_dark() + theme(plot.title = element_text(hjust = 0.5))
pf_odds
# Survival as Function of Age, Officer Race (White or NonWhite), and Victim Race
off_race2 <- glm(Status ~ SubjectAge2 + OfficerRace2 + SubjectRace, data = pf_off_race2, family = binomial)
summary(off_race2)
##
## Call:
## glm(formula = Status ~ SubjectAge2 + OfficerRace2 + SubjectRace,
## family = binomial, data = pf_off_race2)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -1.7415 -1.2246 0.7964 0.9596 1.7807
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) 1.709093 0.152082 11.238 < 2e-16 ***
## SubjectAge2 -0.031456 0.004449 -7.071 1.54e-12 ***
## OfficerRace2White -0.228866 0.099348 -2.304 0.0212 *
## SubjectRaceL -0.237533 0.128289 -1.852 0.0641 .
## SubjectRaceW -0.603187 0.119993 -5.027 4.99e-07 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 2481.1 on 1845 degrees of freedom
## Residual deviance: 2364.1 on 1841 degrees of freedom
## AIC: 2374.1
##
## Number of Fisher Scoring iterations: 4
# Plot of Survival as Function of Age, Officer Race (White or NonWhite), and Victim Race
ggPredict(off_race2,se=TRUE,interactive = TRUE)
## Warning in eval(family$initialize): non-integer #successes in a binomial glm!
## Warning in eval(family$initialize): non-integer #successes in a binomial glm!
## Warning in eval(family$initialize): non-integer #successes in a binomial glm!
## Warning in eval(family$initialize): non-integer #successes in a binomial glm!
## Warning in eval(family$initialize): non-integer #successes in a binomial glm!
## Warning in eval(family$initialize): non-integer #successes in a binomial glm!
log_off_race2 <- with(pf_off_race2,expand.grid(SubjectAge2 = x_age, OfficerRace2 = unique(OfficerRace2),SubjectRace = unique(SubjectRace)))
log_off_race2$y <-predict(off_race2,log_off_race2,type = "response")
pf_odds2<-ggplot(log_off_race2, mapping = aes(SubjectAge2,y,colour = OfficerRace2, group = OfficerRace2)) + geom_point()+ geom_line() +ylim(c(0,1)) + xlab("Victim Age") + ylab("Probability of Survival") + ggtitle("Police Shooting Survival Odds by Officer Race and by Victim Race") + theme(plot.title = element_text(hjust = 0.5)) + theme_dark() + theme(plot.title = element_text(hjust = 0.5)) + facet_wrap(~SubjectRace)
pf_odds2
Logistic Models and Plots
# Survival as a Logistic Function of Age and Victim Race (W,L,B)
race <- glm(Status ~ SubjectAge2 + SubjectRace,data = pf7, family = binomial)
summary(race)
##
## Call:
## glm(formula = Status ~ SubjectAge2 + SubjectRace, family = binomial,
## data = pf7)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -1.7562 -1.2334 0.8073 0.9569 1.7345
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) 1.616985 0.145944 11.079 < 2e-16 ***
## SubjectAge2 -0.031568 0.004442 -7.107 1.19e-12 ***
## SubjectRaceL -0.230927 0.128009 -1.804 0.0712 .
## SubjectRaceW -0.628657 0.119378 -5.266 1.39e-07 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 2481.1 on 1845 degrees of freedom
## Residual deviance: 2369.4 on 1842 degrees of freedom
## AIC: 2377.4
##
## Number of Fisher Scoring iterations: 4
# Plot of Survival as a Logistic Function of Age and Victim Race
ggPredict(race,se=TRUE,interactive = TRUE)
## Warning in eval(family$initialize): non-integer #successes in a binomial glm!
## Warning in eval(family$initialize): non-integer #successes in a binomial glm!
## Warning in eval(family$initialize): non-integer #successes in a binomial glm!
# Survival as a Logistic Function of Age and Victim Race (All races)
pf_fit3 <- glm(Status ~ SubjectAge2 + SubjectRace, family = binomial(link = "logit"),data = pf5)
summary(pf_fit3)
##
## Call:
## glm(formula = Status ~ SubjectAge2 + SubjectRace, family = binomial(link = "logit"),
## data = pf5)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -1.7643 -1.2132 0.8046 0.9712 1.7495
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) 0.95749 0.44892 2.133 0.0329 *
## SubjectAge2 -0.03252 0.00428 -7.598 3e-14 ***
## SubjectRaceB 0.68710 0.43650 1.574 0.1155
## SubjectRaceL 0.45788 0.44448 1.030 0.3029
## SubjectRaceO -0.57757 0.85210 -0.678 0.4979
## SubjectRaceU 0.07289 0.46957 0.155 0.8766
## SubjectRaceW 0.06496 0.44195 0.147 0.8831
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 2699.1 on 1996 degrees of freedom
## Residual deviance: 2570.4 on 1990 degrees of freedom
## AIC: 2584.4
##
## Number of Fisher Scoring iterations: 4
log_pf <- with(pf5,expand.grid(SubjectAge2 = x_age, SubjectRace = unique(SubjectRace)))
log_pf$y <-predict(pf_fit3,log_pf,type = "response")
# Plot of Survival as a Logistic Function of Age and Victim Race (All races)
pf_odds<-ggplot(log_pf, mapping = aes(SubjectAge2,y,colour = SubjectRace, group = SubjectRace)) + geom_point(pf5,mapping = aes(SubjectAge2,Status,colour = SubjectRace))+ geom_line() +ylim(c(0,1)) + xlab("Victim Age") + ylab("Probability of Survival") + ggtitle("Police Shooting Survival Odds by Victim Race") + theme(plot.title = element_text(hjust = 0.5)) + theme_dark() + theme(plot.title = element_text(hjust = 0.5))
pf_odds1 <- ggplotly(pf_odds)
## Warning: `group_by_()` is deprecated as of dplyr 0.7.0.
## Please use `group_by()` instead.
## See vignette('programming') for more help
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_warnings()` to see where this warning was generated.
pf_odds1
# Survival as a Logistic Function of Age and Number of White Officers
fat_shoot <- glm(Status ~ SubjectAge2 + WhiteOfficers,data = pf5, family = binomial)
summary(fat_shoot)
##
## Call:
## glm(formula = Status ~ SubjectAge2 + WhiteOfficers, family = binomial,
## data = pf5)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -1.7272 -1.2368 0.8120 0.9806 1.8519
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) 1.771172 0.142795 12.404 <2e-16 ***
## SubjectAge2 -0.038175 0.004138 -9.225 <2e-16 ***
## WhiteOfficers -0.296398 0.052812 -5.612 2e-08 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 2699.1 on 1996 degrees of freedom
## Residual deviance: 2569.7 on 1994 degrees of freedom
## AIC: 2575.7
##
## Number of Fisher Scoring iterations: 4
# Plot of Survival as a Logistic Function of Age and Number of White Officers
ggPredict(fat_shoot,se=TRUE,interactive = TRUE)
log_num_white <- with(pf5,expand.grid(SubjectAge2 = x_age, WhiteOfficers = unique(WhiteOfficers)))
log_num_white$y <-predict(fat_shoot,log_num_white,type = "response")
odds_num_white<-ggplot(log_num_white, mapping = aes(SubjectAge2,y,colour = WhiteOfficers, group = WhiteOfficers)) + geom_point()+ geom_line() +ylim(c(0,1)) + xlab("Victim Age") + ylab("Probability of Survival") + ggtitle("Police Shooting Survival Odds by Number of White Officers") + theme(plot.title = element_text(hjust = 0.5)) + theme_dark() + theme(plot.title = element_text(hjust = 0.5))
odds_num_white
Analysis Summary
The data set used in the analysis above was sourced from Vice News. The data set contains data from 2010 to 2016 on police shootings in the US, both fatal and non-fatal. The variables in this data set were primarily categorical, but some were quantitative or numerical as well. The variables in the data set were race and gender of the victims as well as officers, the number of officers, the number of shots fired by officers, the status of the shooting (i.e. whether it was fatal or non-fatal), the status of the suspect (i.e. whether the suspect was armed or not), the age of the suspect, the police department, city, date, and the nature of the stop. I chose this topic, firstly, because I’m interested in the topic of racism in the US and, secondly, because my preliminary research on the subject seemed to suggest that while racism does certainly exist, it is not manifested in police shootings. That is, while I did find research that showed that black people are stopped/targeted by the police more than white people (Cooley et al, 2020, p. 764-766) and even that they are killed more often (Menifield et al, 2018, p.64-68), to my surprise, the majority of the research I came across relating to police shootings suggested that white police officers are no more likely to kill or shoot black people than they are to shoot or kill white people (R. Lott Jr., 2016, p.14-17). Consequently, I wanted to undertake this study to answer questions such as: Do white police officers shoot/kill more black/hispanic people than white people? Do police officers in general tend to fire more shots at black and hispanic people? I chose this data set in particular because, in addition to containing data on the race of the victims of shootings, it contains data on the race and gender of the officers, and I felt this was pertinent to attempting to answer the aforementioned questions.
The results of my analysis were mixed. For instance, the median number of black, latino, and white people shot was quite similar based on the boxplot showing the number of shots by victim race. The distributions themselves are quite similar too as they are heavily right-skewed and the boxplots generally conveyed that police shootings are fairly uniform across victim race. And yet, I did find that more black people were both shot in general and fatally by the police than any other race by far as can be observed in the barcharts that demonstrate shootings by race and fatalities by race. Since black people make up only 13% of the population, this, in my opinion, clearly demonstrates that there is racism in policing and confirms the literature that I came across by Menifeld et al. The model of the number of gun shots as a linear function of victim race and age, along with the number of officers and white officers, however, didn’t seem to corroborate this finding. Specifically, for every additional black person shot, the model predicts an additional 2.52 gun shots and for every additional latino person shot, the model predicts an additional 3.24 gun shots, and for every additional white person shot, the model predicts an additional 2.12 gunshots. This seems to be in line with the pre-existing literature on the subject. On the other hand,for every additional white officer, the model predicts an additional 1.45 gun shots and for every additional officer in general, the model predicts an additional 4.1 gun shots. And, interestingly, for every one unit increase in age, there is 0.035 decrease in gunshots, so the number of guneshots tends to go down with increases in age. However, the model turned out to be a poor fit for the data, as can be observed from the Adjusted R-Squared value of just above 0.33. Nonetheless, the visualization entitled, “Number of Shots by Race and Age of Victim and Number of Officers”, seems to capture the findings of the model to some degree as it is clear from the plot that black people were shot more than people of other races and that as the number of shots increased, so too did the number of officers.
The model of fatalities as a logistic function of victim race and age also corroborates the bar chart findings. From the bar chart that displays both fatal and non-fatal shootings, one can observe that the only races for which non-fatalaties exceeded fatalities were latino and black people. And, as can be seen in the plot of the logistic model, black people and latino people are far more likely than any other race to survive a police shooting. Specifically, the slope coefficient for black victims indicates that for every additional black victim, there will be a log 0.68 increase in odds of survival and the slope coefficient for latino victims indicates that for every additional latino victim, there will be a log 0.46 increase in the odds of survival. In terms of odds, this means that the odds of survival are roughly 2 times as high for each additional black person and 1.5 times as high for every additional latino person. The logistic plot of survival as a function of officer race and victim race is highly interesting as well as it corroborates the literature that, indeed, black and latino people are no more likely to be shot by white police officers than white people are, however, they also highlighted something I had not seen in the literature I came across:regardless of race, victims of police shootings have a greater probability of dying when shot by white police officers compared to non-white police officers. Another finding that I came across that I wasn’t able to find in pre-existing literature was that the odds of dying increase as the number of white police officers increase as can be seen in the final logistic plot.
Thus, in sum, I found evidence that suggests that black and latino people that get shot by the police are less likely to die in comparison to other races. And yet, I also found that black people experience a higher number of gunshots and that they accounted for the majority of police shootings in general, as well as fatalities, compared to white people, despite constituting a minority of the population. As far as any issues I wish I could have overcome are concerned, it would’ve been nice to be able to use the officer race variable as a multivariate factor rather than having to manipulate into a binary (i.e. non-white officers vs white officers) factor, but other than that, I don’t think there was much more outside of hypothesis testing that I could’ve done.
Bibliography
Lott Jr., John R., Moody, Carlisle E. (2016). Do White Police Officers Unfairly Target Black Suspects?. College of William and Mary.
Cooley, Erin., Hester, Neil., Cipolli, William., Rivera, Laura I., Abrams, Kaitlin., Pagan, Jeremy., Sommers, Samuel R., Payne, Keith. (2020). Racial Biases in Officers’ Decisions to Frisk Are Amplified for Black People Stopped Among Groups Leading to Similar Biases in Searches, Arrests, and Use of Force. Social Pscyhological and Personality Science. Vol. 11(6) 761-769.
Menifield, Charles E., Shin, Geiguen., Strother, Logan. (2018). Do White Law Enforcement Officers Target Minority Suspects? Vol. 79(1) 56-68.