The dataset covers hate crimes and includes columns with information about incidents, agencies, locations, offenders, victims, and the nature of the crimes.
The goal of this project is to build visualizations that show how some of the variables are distributed and how they relate to one another.
if(!require(tidyverse))install.packages("tidyverse")
## Loading required package: tidyverse
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr 1.1.4 ✔ readr 2.1.5
## ✔ forcats 1.0.0 ✔ stringr 1.5.1
## ✔ ggplot2 3.5.1 ✔ tibble 3.2.1
## ✔ lubridate 1.9.3 ✔ tidyr 1.3.1
## ✔ purrr 1.0.2
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(tidyverse)
if(!require(gridExtra))install.packages("gridExtra")
## Loading required package: gridExtra
##
## Attaching package: 'gridExtra'
## The following object is masked from 'package:dplyr':
##
## combine
library(gridExtra)
# Read the raw hate-crime records from the CSV file.
data <- read.csv("hate_crime.csv")

# Fix the RNG seed so the random subsample below is reproducible.
set.seed(123)

# Work on a random subsample of 240000 rows and print per-column summaries.
sampled_data <- sample_n(data, 240000)
summary(sampled_data)
## incident_id data_year ori pug_agency_name
## Min. : 2 Min. :1991 Length:240000 Length:240000
## 1st Qu.: 60445 1st Qu.:1999 Class :character Class :character
## Median : 120863 Median :2006 Mode :character Mode :character
## Mean : 348916 Mean :2007
## 3rd Qu.: 181280 3rd Qu.:2016
## Max. :1494167 Max. :2022
##
## pub_agency_unit agency_type_name state_abbr state_name
## Length:240000 Length:240000 Length:240000 Length:240000
## Class :character Class :character Class :character Class :character
## Mode :character Mode :character Mode :character Mode :character
##
##
##
##
## division_name region_name population_group_code
## Length:240000 Length:240000 Length:240000
## Class :character Class :character Class :character
## Mode :character Mode :character Mode :character
##
##
##
##
## population_group_description incident_date adult_victim_count
## Length:240000 Length:240000 Min. : 0.00
## Class :character Class :character 1st Qu.: 0.00
## Mode :character Mode :character Median : 1.00
## Mean : 0.73
## 3rd Qu.: 1.00
## Max. :146.00
## NA's :169388
## juvenile_victim_count total_offender_count adult_offender_count
## Min. : 0.0 Min. : 0.0000 Min. : 0.00
## 1st Qu.: 0.0 1st Qu.: 0.0000 1st Qu.: 0.00
## Median : 0.0 Median : 1.0000 Median : 0.00
## Mean : 0.1 Mean : 0.9562 Mean : 0.61
## 3rd Qu.: 0.0 3rd Qu.: 1.0000 3rd Qu.: 1.00
## Max. :60.0 Max. :99.0000 Max. :60.00
## NA's :171814 NA's :175958
## juvenile_offender_count offender_race offender_ethnicity
## Min. : 0.00 Length:240000 Length:240000
## 1st Qu.: 0.00 Class :character Class :character
## Median : 0.00 Mode :character Mode :character
## Mean : 0.12
## 3rd Qu.: 0.00
## Max. :20.00
## NA's :175965
## victim_count offense_name total_individual_victims
## Min. : 1.000 Length:240000 Min. : 0.000
## 1st Qu.: 1.000 Class :character 1st Qu.: 1.000
## Median : 1.000 Mode :character Median : 1.000
## Mean : 1.238 Mean : 0.989
## 3rd Qu.: 1.000 3rd Qu.: 1.000
## Max. :200.000 Max. :147.000
## NA's :4816
## location_name bias_desc victim_types multiple_offense
## Length:240000 Length:240000 Length:240000 Length:240000
## Class :character Class :character Class :character Class :character
## Mode :character Mode :character Mode :character Mode :character
##
##
##
##
## multiple_bias
## Length:240000
## Class :character
## Mode :character
##
##
##
##
# Number of distinct offense labels present in the sample.
n_distinct(sampled_data$offense_name)
## [1] 398
# Count the sampled incidents per offense label, most frequent first.
offense_frequency <- arrange(
  summarise(group_by(sampled_data, offense_name), frequency = n()),
  desc(frequency)
)
offense_frequency
## # A tibble: 398 × 2
## offense_name frequency
## <chr> <int>
## 1 Destruction/Damage/Vandalism of Property 71550
## 2 Intimidation 69908
## 3 Simple Assault 46312
## 4 Aggravated Assault 25246
## 5 Robbery 3980
## 6 Burglary/Breaking & Entering 3071
## 7 All Other Larceny 2423
## 8 Destruction/Damage/Vandalism of Property;Intimidation 1990
## 9 Arson 1281
## 10 Drug/Narcotic Violations 1216
## # ℹ 388 more rows
# Shorten the two longest top-offense labels so they fit on plot axes;
# every other label (including missing values) is left unchanged.
sampled_data <- sampled_data %>%
  mutate(
    offense_name = case_when(
      offense_name == "Destruction/Damage/Vandalism of Property" ~ "Damage",
      offense_name == "Simple Assault" ~ "Sim.Assault",
      .default = offense_name
    )
  )
# Recompute the per-offense counts so the table reflects the current
# offense labels, again sorted from most to least frequent.
offense_frequency <- arrange(
  summarise(group_by(sampled_data, offense_name), frequency = n()),
  desc(frequency)
)
offense_frequency
## # A tibble: 398 × 2
## offense_name frequency
## <chr> <int>
## 1 Damage 71550
## 2 Intimidation 69908
## 3 Sim.Assault 46312
## 4 Aggravated Assault 25246
## 5 Robbery 3980
## 6 Burglary/Breaking & Entering 3071
## 7 All Other Larceny 2423
## 8 Destruction/Damage/Vandalism of Property;Intimidation 1990
## 9 Arson 1281
## 10 Drug/Narcotic Violations 1216
## # ℹ 388 more rows
The three most frequent offenses are Damage (Destruction/Damage/Vandalism of Property), Intimidation, and Simple Assault (labelled "Sim.Assault" in the code).
# Per-column summary restricted to incidents labelled "Damage".
summ_dam <- filter(sampled_data, offense_name == "Damage")
summary(summ_dam)
## incident_id data_year ori pug_agency_name
## Min. : 6 Min. :1991 Length:71550 Length:71550
## 1st Qu.: 62594 1st Qu.:1999 Class :character Class :character
## Median : 118000 Median :2006 Mode :character Mode :character
## Mean : 300800 Mean :2007
## 3rd Qu.: 172303 3rd Qu.:2014
## Max. :1494037 Max. :2022
##
## pub_agency_unit agency_type_name state_abbr state_name
## Length:71550 Length:71550 Length:71550 Length:71550
## Class :character Class :character Class :character Class :character
## Mode :character Mode :character Mode :character Mode :character
##
##
##
##
## division_name region_name population_group_code
## Length:71550 Length:71550 Length:71550
## Class :character Class :character Class :character
## Mode :character Mode :character Mode :character
##
##
##
##
## population_group_description incident_date adult_victim_count
## Length:71550 Length:71550 Min. : 0.00
## Class :character Class :character 1st Qu.: 0.00
## Mode :character Mode :character Median : 0.00
## Mean : 0.44
## 3rd Qu.: 1.00
## Max. :26.00
## NA's :55202
## juvenile_victim_count total_offender_count adult_offender_count
## Min. :0.00 Min. : 0.0000 Min. : 0.00
## 1st Qu.:0.00 1st Qu.: 0.0000 1st Qu.: 0.00
## Median :0.00 Median : 0.0000 Median : 0.00
## Mean :0.01 Mean : 0.3997 Mean : 0.25
## 3rd Qu.:0.00 3rd Qu.: 1.0000 3rd Qu.: 0.00
## Max. :3.00 Max. :99.0000 Max. :20.00
## NA's :55391 NA's :57517
## juvenile_offender_count offender_race offender_ethnicity
## Min. : 0.0 Length:71550 Length:71550
## 1st Qu.: 0.0 Class :character Class :character
## Median : 0.0 Mode :character Mode :character
## Mean : 0.1
## 3rd Qu.: 0.0
## Max. :13.0
## NA's :57517
## victim_count offense_name total_individual_victims
## Min. : 1.000 Length:71550 Min. : 0.00
## 1st Qu.: 1.000 Class :character 1st Qu.: 0.00
## Median : 1.000 Mode :character Median : 0.00
## Mean : 1.126 Mean : 0.54
## 3rd Qu.: 1.000 3rd Qu.: 1.00
## Max. :100.000 Max. :52.00
## NA's :4498
## location_name bias_desc victim_types multiple_offense
## Length:71550 Length:71550 Length:71550 Length:71550
## Class :character Class :character Class :character Class :character
## Mode :character Mode :character Mode :character Mode :character
##
##
##
##
## multiple_bias
## Length:71550
## Class :character
## Mode :character
##
##
##
##
# Per-column summary restricted to incidents labelled "Intimidation".
summ_int <- filter(sampled_data, offense_name == "Intimidation")
summary(summ_int)
## incident_id data_year ori pug_agency_name
## Min. : 2 Min. :1991 Length:69908 Length:69908
## 1st Qu.: 52111 1st Qu.:1998 Class :character Class :character
## Median : 106931 Median :2005 Mode :character Mode :character
## Mean : 329107 Mean :2006
## 3rd Qu.: 177476 3rd Qu.:2015
## Max. :1494167 Max. :2022
##
## pub_agency_unit agency_type_name state_abbr state_name
## Length:69908 Length:69908 Length:69908 Length:69908
## Class :character Class :character Class :character Class :character
## Mode :character Mode :character Mode :character Mode :character
##
##
##
##
## division_name region_name population_group_code
## Length:69908 Length:69908 Length:69908
## Class :character Class :character Class :character
## Mode :character Mode :character Mode :character
##
##
##
##
## population_group_description incident_date adult_victim_count
## Length:69908 Length:69908 Min. : 0.00
## Class :character Class :character 1st Qu.: 0.00
## Mode :character Mode :character Median : 1.00
## Mean : 0.88
## 3rd Qu.: 1.00
## Max. :146.00
## NA's :49938
## juvenile_victim_count total_offender_count adult_offender_count
## Min. : 0.00 Min. : 0.0000 Min. : 0.00
## 1st Qu.: 0.00 1st Qu.: 0.0000 1st Qu.: 0.00
## Median : 0.00 Median : 1.0000 Median : 1.00
## Mean : 0.17 Mean : 0.8979 Mean : 0.57
## 3rd Qu.: 0.00 3rd Qu.: 1.0000 3rd Qu.: 1.00
## Max. :60.00 Max. :99.0000 Max. :60.00
## NA's :51821 NA's :51154
## juvenile_offender_count offender_race offender_ethnicity
## Min. :0.00 Length:69908 Length:69908
## 1st Qu.:0.00 Class :character Class :character
## Median :0.00 Mode :character Mode :character
## Mean :0.12
## 3rd Qu.:0.00
## Max. :9.00
## NA's :51157
## victim_count offense_name total_individual_victims
## Min. : 1.000 Length:69908 Min. : 0.000
## 1st Qu.: 1.000 Class :character 1st Qu.: 1.000
## Median : 1.000 Mode :character Median : 1.000
## Mean : 1.226 Mean : 1.202
## 3rd Qu.: 1.000 3rd Qu.: 1.000
## Max. :146.000 Max. :146.000
##
## location_name bias_desc victim_types multiple_offense
## Length:69908 Length:69908 Length:69908 Length:69908
## Class :character Class :character Class :character Class :character
## Mode :character Mode :character Mode :character Mode :character
##
##
##
##
## multiple_bias
## Length:69908
## Class :character
## Mode :character
##
##
##
##
# Per-column summary restricted to incidents labelled "Sim.Assault".
summ_simp <- filter(sampled_data, offense_name == "Sim.Assault")
summary(summ_simp)
## incident_id data_year ori pug_agency_name
## Min. : 8 Min. :1991 Length:46312 Length:46312
## 1st Qu.: 67600 1st Qu.:2000 Class :character Class :character
## Median : 132274 Median :2008 Mode :character Mode :character
## Mean : 377579 Mean :2008
## 3rd Qu.: 186354 3rd Qu.:2016
## Max. :1494040 Max. :2022
##
## pub_agency_unit agency_type_name state_abbr state_name
## Length:46312 Length:46312 Length:46312 Length:46312
## Class :character Class :character Class :character Class :character
## Mode :character Mode :character Mode :character Mode :character
##
##
##
##
## division_name region_name population_group_code
## Length:46312 Length:46312 Length:46312
## Class :character Class :character Class :character
## Mode :character Mode :character Mode :character
##
##
##
##
## population_group_description incident_date adult_victim_count
## Length:46312 Length:46312 Min. : 0.000
## Class :character Class :character 1st Qu.: 0.000
## Mode :character Mode :character Median : 1.000
## Mean : 0.758
## 3rd Qu.: 1.000
## Max. :10.000
## NA's :30850
## juvenile_victim_count total_offender_count adult_offender_count
## Min. :0.000 Min. : 0.000 Min. : 0.000
## 1st Qu.:0.000 1st Qu.: 1.000 1st Qu.: 0.000
## Median :0.000 Median : 1.000 Median : 1.000
## Mean :0.139 Mean : 1.404 Mean : 0.748
## 3rd Qu.:0.000 3rd Qu.: 1.000 3rd Qu.: 1.000
## Max. :8.000 Max. :50.000 Max. :30.000
## NA's :31064 NA's :31589
## juvenile_offender_count offender_race offender_ethnicity victim_count
## Min. : 0.000 Length:46312 Length:46312 Min. : 1.000
## 1st Qu.: 0.000 Class :character Class :character 1st Qu.: 1.000
## Median : 0.000 Mode :character Mode :character Median : 1.000
## Mean : 0.151 Mean : 1.178
## 3rd Qu.: 0.000 3rd Qu.: 1.000
## Max. :15.000 Max. :26.000
## NA's :31592
## offense_name total_individual_victims location_name
## Length:46312 Min. : 0.000 Length:46312
## Class :character 1st Qu.: 1.000 Class :character
## Mode :character Median : 1.000 Mode :character
## Mean : 1.134
## 3rd Qu.: 1.000
## Max. :26.000
##
## bias_desc victim_types multiple_offense multiple_bias
## Length:46312 Length:46312 Length:46312 Length:46312
## Class :character Class :character Class :character Class :character
## Mode :character Mode :character Mode :character Mode :character
##
##
##
##
# Offenses shown in every yearly chart: the three most frequent ones overall.
top_offenses <- c("Damage", "Intimidation", "Sim.Assault")

# Return the rows of `records` recorded in `target_year` whose offense is one
# of `offenses`.
# filter() drops rows for which the condition is NA, whereas logical `[`
# indexing would turn them into all-NA rows.
top_offenses_in_year <- function(records, target_year,
                                 offenses = top_offenses) {
  filter(records, data_year == target_year, offense_name %in% offenses)
}

# Build a bar chart of incident counts per offense for one year's rows.
# `offense_rows` is the already-filtered data; `target_year` is only used
# for the title and the x-axis label.
plot_top_offenses <- function(offense_rows, target_year) {
  ggplot(offense_rows, aes(offense_name)) +
    geom_bar() +
    labs(
      title = paste("Top 3 Offences in", target_year),
      x = paste("Year:", target_year),
      y = "Frequency"
    ) +
    theme_minimal()
}

# One filtered data set and one chart per snapshot year; each chart is also
# printed on its own before the combined grid.
var1991 <- top_offenses_in_year(sampled_data, 1991)
bar_var1991 <- plot_top_offenses(var1991, 1991)
bar_var1991

var2000 <- top_offenses_in_year(sampled_data, 2000)
bar_var2000 <- plot_top_offenses(var2000, 2000)
bar_var2000

var2010 <- top_offenses_in_year(sampled_data, 2010)
bar_var2010 <- plot_top_offenses(var2010, 2010)
bar_var2010

var2020 <- top_offenses_in_year(sampled_data, 2020)
bar_var2020 <- plot_top_offenses(var2020, 2020)
bar_var2020

# Show the four yearly charts side by side in a 2-column grid.
grid.arrange(bar_var1991, bar_var2000, bar_var2010, bar_var2020, ncol = 2)
# Yearly number of sampled incidents labelled "Damage".
ann_dam <- summarise(
  group_by(filter(sampled_data, offense_name == "Damage"), data_year),
  damage_count = n()
)
ann_dam
## # A tibble: 32 × 2
## data_year damage_count
## <int> <int>
## 1 1991 1226
## 2 1992 1751
## 3 1993 2163
## 4 1994 1634
## 5 1995 2136
## 6 1996 2648
## 7 1997 2378
## 8 1998 2432
## 9 1999 2494
## 10 2000 2612
## # ℹ 22 more rows
# Inspect the yearly Damage counts with view().
view(ann_dam)

# Fit a straight-line trend of the yearly Damage count against the year
# and print the fitted coefficients.
dam_linearmodel <- lm(formula = damage_count ~ data_year, data = ann_dam)
print(dam_linearmodel)
##
## Call:
## lm(formula = damage_count ~ data_year, data = ann_dam)
##
## Coefficients:
## (Intercept) data_year
## -450.919 1.339
# Extrapolate the fitted Damage trend to 2023.
dam_newdata <- data.frame(data_year = 2023)
dam_prediction <- predict(object = dam_linearmodel, newdata = dam_newdata)
dam_prediction
## 1
## 2258.032
# Yearly number of sampled incidents labelled "Intimidation".
ann_int <- summarise(
  group_by(filter(sampled_data, offense_name == "Intimidation"), data_year),
  intimidation_count = n()
)
ann_int
## # A tibble: 32 × 2
## data_year intimidation_count
## <int> <int>
## 1 1991 1513
## 2 1992 2270
## 3 1993 2403
## 4 1994 2075
## 5 1995 2820
## 6 1996 2938
## 7 1997 2729
## 8 1998 2693
## 9 1999 2582
## 10 2000 2623
## # ℹ 22 more rows
# Inspect the yearly Intimidation counts with view().
view(ann_int)

# Fit a straight-line trend of the yearly Intimidation count against the
# year and print the fitted coefficients.
int_linearmodel <- lm(formula = intimidation_count ~ data_year, data = ann_int)
print(int_linearmodel)
##
## Call:
## lm(formula = intimidation_count ~ data_year, data = ann_int)
##
## Coefficients:
## (Intercept) data_year
## 24847.48 -11.29
# Extrapolate the fitted Intimidation trend to 2023.
int_newdata <- data.frame(data_year = 2023)
int_prediction <- predict(object = int_linearmodel, newdata = int_newdata)
int_prediction
## 1
## 1998.262
# Yearly number of sampled incidents labelled "Sim.Assault".
ann_sim <- summarise(
  group_by(filter(sampled_data, offense_name == "Sim.Assault"), data_year),
  sim_count = n()
)
ann_sim
## # A tibble: 32 × 2
## data_year sim_count
## <int> <int>
## 1 1991 755
## 2 1992 1231
## 3 1993 1417
## 4 1994 1018
## 5 1995 1388
## 6 1996 1267
## 7 1997 1350
## 8 1998 1357
## 9 1999 1426
## 10 2000 1377
## # ℹ 22 more rows
# Inspect the yearly Sim.Assault counts with view().
view(ann_sim)

# Fit a straight-line trend of the yearly Sim.Assault count against the
# year and print the fitted coefficients.
sim_linearmodel <- lm(formula = sim_count ~ data_year, data = ann_sim)
print(sim_linearmodel)
##
## Call:
## lm(formula = sim_count ~ data_year, data = ann_sim)
##
## Coefficients:
## (Intercept) data_year
## -42354.47 21.83
# Extrapolate the fitted Sim.Assault trend to 2023.
sim_newdata <- data.frame(data_year = 2023)
sim_prediction <- predict(object = sim_linearmodel, newdata = sim_newdata)
sim_prediction
## 1
## 1807.444