The dataset shows the number of traffic stops by law enforcement agencies in Montana between 2009 and 2016 reported by the Stanford Open Policing Project. The report focuses mainly on data cleansing and data visualization using the packages tidyverse, highcharter, and dslabs.
library(tidyverse)
## Warning: package 'tidyverse' was built under R version 3.6.3
## -- Attaching packages ----------------------------------------------------------------------------- tidyverse 1.3.0 --
## v ggplot2 3.3.1 v purrr 0.3.4
## v tibble 3.0.1 v dplyr 1.0.0
## v tidyr 1.1.0 v stringr 1.4.0
## v readr 1.3.1 v forcats 0.5.0
## Warning: package 'ggplot2' was built under R version 3.6.3
## Warning: package 'tibble' was built under R version 3.6.3
## Warning: package 'tidyr' was built under R version 3.6.3
## Warning: package 'purrr' was built under R version 3.6.3
## Warning: package 'dplyr' was built under R version 3.6.3
## Warning: package 'forcats' was built under R version 3.6.3
## -- Conflicts -------------------------------------------------------------------------------- tidyverse_conflicts() --
## x dplyr::filter() masks stats::filter()
## x dplyr::lag() masks stats::lag()
library(dplyr)
library(lubridate)
## Warning: package 'lubridate' was built under R version 3.6.3
##
## Attaching package: 'lubridate'
## The following objects are masked from 'package:base':
##
## date, intersect, setdiff, union
library(scales)
## Warning: package 'scales' was built under R version 3.6.3
##
## Attaching package: 'scales'
## The following object is masked from 'package:purrr':
##
## discard
## The following object is masked from 'package:readr':
##
## col_factor
library(highcharter)
## Warning: package 'highcharter' was built under R version 3.6.3
## Registered S3 method overwritten by 'quantmod':
## method from
## as.zoo.data.frame zoo
## Highcharts (www.highcharts.com) is a Highsoft software product which is
## not free for commercial and Governmental use
data(package="dslabs")
list.files(system.file("script", package = "dslabs"))
## [1] "make-admissions.R"
## [2] "make-brca.R"
## [3] "make-brexit_polls.R"
## [4] "make-death_prob.R"
## [5] "make-divorce_margarine.R"
## [6] "make-gapminder-rdas.R"
## [7] "make-greenhouse_gases.R"
## [8] "make-historic_co2.R"
## [9] "make-mnist_27.R"
## [10] "make-movielens.R"
## [11] "make-murders-rda.R"
## [12] "make-na_example-rda.R"
## [13] "make-nyc_regents_scores.R"
## [14] "make-olive.R"
## [15] "make-outlier_example.R"
## [16] "make-polls_2008.R"
## [17] "make-polls_us_election_2016.R"
## [18] "make-reported_heights-rda.R"
## [19] "make-research_funding_rates.R"
## [20] "make-stars.R"
## [21] "make-temp_carbon.R"
## [22] "make-tissue-gene-expression.R"
## [23] "make-trump_tweets.R"
## [24] "make-weekly_us_contagious_diseases.R"
## [25] "save-gapminder-example-csv.R"
library(ggthemes)
## Warning: package 'ggthemes' was built under R version 3.6.3
library(ggrepel)
## Warning: package 'ggrepel' was built under R version 3.6.3
library(RColorBrewer)
library(plotly)
## Warning: package 'plotly' was built under R version 3.6.3
##
## Attaching package: 'plotly'
## The following object is masked from 'package:ggplot2':
##
## last_plot
## The following object is masked from 'package:stats':
##
## filter
## The following object is masked from 'package:graphics':
##
## layout
The dataset consists of 825,118 observations that represents the number of traffice stops and 29 variables. For this report, I will be analyzing five mains variables including date, county_name, subject_age, subject_race, and subject_sex.
stops1 <- read_csv("mt_statewide_2020_04_01.csv")
## Parsed with column specification:
## cols(
## .default = col_character(),
## raw_row_number = col_double(),
## time = col_time(format = ""),
## lat = col_double(),
## lng = col_double(),
## subject_age = col_double(),
## arrest_made = col_logical(),
## citation_issued = col_logical(),
## warning_issued = col_logical(),
## frisk_performed = col_logical(),
## search_conducted = col_logical(),
## vehicle_year = col_double()
## )
## See spec(...) for full column specifications.
dim(stops1) #Dimensions of dataset
## [1] 825118 29
colnames(stops1)# Variable names of dataset
## [1] "raw_row_number" "date"
## [3] "time" "location"
## [5] "lat" "lng"
## [7] "county_name" "subject_age"
## [9] "subject_race" "subject_sex"
## [11] "department_name" "type"
## [13] "violation" "arrest_made"
## [15] "citation_issued" "warning_issued"
## [17] "outcome" "frisk_performed"
## [19] "search_conducted" "search_basis"
## [21] "reason_for_stop" "vehicle_make"
## [23] "vehicle_model" "vehicle_type"
## [25] "vehicle_registration_state" "vehicle_year"
## [27] "raw_Race" "raw_Ethnicity"
## [29] "raw_SearchType"
str(stops1)
## tibble [825,118 x 29] (S3: spec_tbl_df/tbl_df/tbl/data.frame)
## $ raw_row_number : num [1:825118] 1 2 3 4 5 6 7 8 9 10 ...
## $ date : chr [1:825118] "1/1/2009" "1/2/2009" "1/3/2009" "1/4/2009" ...
## $ time : 'hms' num [1:825118] 02:10:53 11:34:19 11:36:42 10:33:11 ...
## ..- attr(*, "units")= chr "secs"
## $ location : chr [1:825118] "US 89 N MM10 (SB)" "HWY 93 SO AND ANNS LANE S/B" "P007 HWY 93 MM 77 N/B" "P007 HWY 93 MM 81 S/B" ...
## $ lat : num [1:825118] 47.6 46.8 46.7 46.7 46.7 ...
## $ lng : num [1:825118] -112 -114 -114 -114 -114 ...
## $ county_name : chr [1:825118] "Cascade County" "Missoula County" "Missoula County" "Missoula County" ...
## $ subject_age : num [1:825118] 16 19 17 17 31 20 30 34 21 18 ...
## $ subject_race : chr [1:825118] "white" "white" "white" NA ...
## $ subject_sex : chr [1:825118] "female" "male" "male" "female" ...
## $ department_name : chr [1:825118] "Montana Highway Patrol" "Montana Highway Patrol" "Montana Highway Patrol" "Montana Highway Patrol" ...
## $ type : chr [1:825118] "vehicular" "vehicular" "vehicular" "vehicular" ...
## $ violation : chr [1:825118] "240 - INSURANCE|150 - HIT AND RUN|245 - OTHER NON-HAZARDOUS" "EXPIRED TAG ( 4 MONTHS OR LESS )|SEATBELT ( DRIVER )|FAULTY EQUIPMENT" "SPEED" "SPEED" ...
## $ arrest_made : logi [1:825118] FALSE TRUE TRUE TRUE TRUE TRUE ...
## $ citation_issued : logi [1:825118] TRUE FALSE FALSE FALSE FALSE FALSE ...
## $ warning_issued : logi [1:825118] TRUE TRUE FALSE FALSE FALSE TRUE ...
## $ outcome : chr [1:825118] "citation" "arrest" "arrest" "arrest" ...
## $ frisk_performed : logi [1:825118] FALSE FALSE FALSE NA NA NA ...
## $ search_conducted : logi [1:825118] FALSE FALSE FALSE TRUE TRUE TRUE ...
## $ search_basis : chr [1:825118] NA NA NA NA ...
## $ reason_for_stop : chr [1:825118] "#NAME?" "EXPIRED TAG ( - MONTHS OR LESS )" "SPEED" "SPEED" ...
## $ vehicle_make : chr [1:825118] "FORD" "GMC" "GMC" "HOND" ...
## $ vehicle_model : chr [1:825118] "EXPLORER" "TK" "YUKON" "CR-V" ...
## $ vehicle_type : chr [1:825118] "SPORT UTILITY" "TRUCK" "SPORT UTILITY" "SPORT UTILITY" ...
## $ vehicle_registration_state: chr [1:825118] "MT" "MT" "MT" "MT" ...
## $ vehicle_year : num [1:825118] 1994 1996 1999 2002 1992 ...
## $ raw_Race : chr [1:825118] "W" "W" "W" "W" ...
## $ raw_Ethnicity : chr [1:825118] "N" "N" "N" NA ...
## $ raw_SearchType : chr [1:825118] "NO SEARCH REQUESTED" "NO SEARCH REQUESTED" "NO SEARCH REQUESTED" NA ...
## - attr(*, "spec")=
## .. cols(
## .. raw_row_number = col_double(),
## .. date = col_character(),
## .. time = col_time(format = ""),
## .. location = col_character(),
## .. lat = col_double(),
## .. lng = col_double(),
## .. county_name = col_character(),
## .. subject_age = col_double(),
## .. subject_race = col_character(),
## .. subject_sex = col_character(),
## .. department_name = col_character(),
## .. type = col_character(),
## .. violation = col_character(),
## .. arrest_made = col_logical(),
## .. citation_issued = col_logical(),
## .. warning_issued = col_logical(),
## .. outcome = col_character(),
## .. frisk_performed = col_logical(),
## .. search_conducted = col_logical(),
## .. search_basis = col_character(),
## .. reason_for_stop = col_character(),
## .. vehicle_make = col_character(),
## .. vehicle_model = col_character(),
## .. vehicle_type = col_character(),
## .. vehicle_registration_state = col_character(),
## .. vehicle_year = col_double(),
## .. raw_Race = col_character(),
## .. raw_Ethnicity = col_character(),
## .. raw_SearchType = col_character()
## .. )
The date variable is defined as character. Therefore, I will change it to the date format.
stops1$date <- as.Date(stops1$date, format = "%m/%d/%Y")
Add another column that shows the years when each traffic stop occured and remove the missing values.
stops1 <- stops1 %>% mutate(year = year(date)) %>%
filter(!is.na(year))
stops1 %>% count(year) #number of stops in each year
## # A tibble: 8 x 2
## year n
## <dbl> <int>
## 1 2009 18434
## 2 2010 124285
## 3 2011 122839
## 4 2012 117487
## 5 2013 114283
## 6 2014 109747
## 7 2015 115935
## 8 2016 102097
Combine the number of unknow races with other races and remove the missing values.
stops1$subject_race <- gsub("unknown","other", stops1$subject_race)
stops1 <- stops1 %>% filter(!is.na(subject_race))
stops1 %>% count(subject_race) #number of stops of each race
## # A tibble: 5 x 2
## subject_race n
## <chr> <int>
## 1 asian/pacific islander 6700
## 2 black 8805
## 3 hispanic 16055
## 4 other 41417
## 5 white 752024
Remove the missing values and the ages under 16 that is the legal driving age in Montana.
stops1 <- stops1 %>% filter(!is.na(subject_age) & subject_age >= 16)
Remove the missing values.
stops1 <- stops1 %>% filter(!is.na(subject_sex))
stops1 %>% count(subject_sex) #number of stops per gender
## # A tibble: 2 x 2
## subject_sex n
## <chr> <int>
## 1 female 266126
## 2 male 552795
Remove the missing values and filter five counties that have the highest number of traffic stops.
stops1 <- stops1 %>% filter(!is.na(county_name))
# Number of stops in each county sorted by descending order
stops1 %>% count(county_name) %>%
arrange(desc(n))
## # A tibble: 60 x 2
## county_name n
## <chr> <int>
## 1 Flathead County 72712
## 2 Gallatin County 67113
## 3 Yellowstone County 59582
## 4 Cascade County 57534
## 5 Missoula County 55120
## 6 Lewis And Clark County 38094
## 7 Custer County 27957
## 8 Ravalli County 26219
## 9 Lake County 24474
## 10 Powell County 23420
## # ... with 50 more rows
# Choose the first five counties
stops1 <- stops1 %>% filter(county_name == "Flathead County" | county_name == "Gallatin County" | county_name == "Yellowstone County" | county_name == "Cascade County" | county_name == "Missoula County")
# Remove "County" from the county names
stops1$county_name <- gsub(" County","",stops1$county_name)
Use boxplots to describe the distribution of age of the people involved in the traffic stops in each county.
fig1 <- ggplot(stops1, aes(x = county_name, y = subject_age)) +
geom_boxplot(aes(fill = county_name)) +
theme_economist() +
scale_fill_brewer(palette = "Dark2") +
theme(legend.position = "none") +
labs(title = "Distribution of Age in Five Counties",
x = "",
y = "Age")
ggplotly(fig1)
According to the boxplots, Flathead County and Cascade County have higher distribution than the other three counties. The median records of Flathead and Cascade are 38 and 37 relatively while the median of age of the others is 35. The variability of the age is similar in all five counties with the IQR of 25 - 26. The surpring insight is that all five counties have incidents of people over 93 years old stopped by the police. However, these are considered outliers of the dataset.
Using geom_tile to describe the age of people involved in traffic stops in five states from 2009 and 2016
fig2 <- ggplot(stops1, aes(year, county_name, fill = subject_age)) +
geom_tile(color = "grey50") +
scale_x_continuous(expand=c(0,0)) +
scale_fill_viridis_c(option = "B", direction = -1) +
theme_classic() + theme(panel.grid = element_blank(),
legend.direction = "horizontal",
legend.position = "top") +
labs(title = "Stop Counts in Five Counties",
subtitle = "(2009 - 2016)",
x = "Year",
y = "County",
fill = "Age")
fig2
The tile graph shows that Gallatin and Cascade have more young people between 20 and 40 years involved in the traffic stops. Yellowstone and Missoula seem to have the similar pattern of the age of the peole stopped by the police, mostly between 20 and 60. Flathead has the widest range of age from 20 to 80. Furthermore, the years 2011 and 2012 have more young people in the traffic stops compared to the other years.
Using facet_wrap to describe the number of traffic stops of each race during 2009-2016.
fig3 <- stops1 %>%
group_by(county_name)%>%
count(year = year(date), subject_race) %>% # count the number of incidents of each year and each race
ggplot(aes(x = year, y = n, color = subject_race)) +
theme_wsj()+
geom_point() +
geom_line() +
facet_wrap(~ county_name, nrow = 2)+
labs(title = "Stop Counts per Race in Five Counties",
x = "Year",
y = "Stop Counts",
color = "")
ggplotly(fig3)
## Warning: `group_by_()` is deprecated as of dplyr 0.7.0.
## Please use `group_by()` instead.
## See vignette('programming') for more help
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_warnings()` to see where this warning was generated.
## Warning: plotly.js does not (yet) support horizontal legend items
## You can track progress here:
## https://github.com/plotly/plotly.js/issues/53
In general, the white people are involved in the most of incidents in all five counties. Flathead and Galattin counties have the number of white people over 10,000. Futhermore, in these two counties, the number of white people peaks in 2010 and 2011. In the other counties, the number of white people fluctuates between 5,000 and 10,000 during these seven years.
Using highchart to describe the number incidents happening to each gender from 2009 to 2016.
library(zoo)
## Warning: package 'zoo' was built under R version 3.6.3
##
## Attaching package: 'zoo'
## The following objects are masked from 'package:base':
##
## as.Date, as.Date.numeric
# Add the month-year column showing year and month when the incident happened
stops1 <- stops1 %>% mutate(month_year = format(date, "%Y-%m"))
# Count the number of incidents per gender in each month of each year
stopcounts_gender <- stops1 %>% group_by(month_year) %>%
summarize(subject_sex) %>%
count(month_year, subject_sex)
## `summarise()` regrouping output by 'month_year' (override with `.groups` argument)
fig4 <- highchart() %>%
hc_add_series(data = stopcounts_gender,
type = "line",
hcaes(x = month_year,
y = n,
group = subject_sex)) %>%
hc_add_theme(hc_theme_db()) %>%
hc_title(text = "Stop Counts by Genders") %>%
hc_subtitle(text = "2009 - 2016") %>%
hc_xAxis(title = list(text="Year")) %>%
hc_yAxis(title = list(text="Stop Counts")) %>%
hc_tooltip(shared = TRUE)
## Warning: `parse_quosure()` is deprecated as of rlang 0.2.0.
## Please use `parse_quo()` instead.
## This warning is displayed once per session.
## Warning: `select_()` is deprecated as of dplyr 0.7.0.
## Please use `select()` instead.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_warnings()` to see where this warning was generated.
## Warning: `as_data_frame()` is deprecated as of tibble 2.0.0.
## Please use `as_tibble()` instead.
## The signature and semantics have changed, see `?as_tibble`.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_warnings()` to see where this warning was generated.
## Warning: `rename_()` is deprecated as of dplyr 0.7.0.
## Please use `rename()` instead.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_warnings()` to see where this warning was generated.
fig4
In general, the number of males stopped by the police is higher that the number of females. The number of males fluctuates between 2,000 and 3,000 incidents while the number of females goes up and down between 1,000 and 1,600. The record of incidents of both groups peaks during July and August 2010 with 3,472 males and 1,886 females.
Citation E. Pierson, C. Simoiu, J. Overgoor, S. Corbett-Davies, D. Jenson, A. Shoemaker, V. Ramachandran, P. Barghouty, C. Phillips, R. Shroff, and S. Goel. “A large-scale analysis of racial disparities in police stops across the United States”. Nature Human Behaviour, Vol. 4, 2020.