The dataset relates to hate crimes and includes various columns with information about incidents, agencies, locations, offenders, victims, and the nature of the crimes.

The goal of this project is to build visualizations that communicate how some of the variables relate to one another and how individual variables are distributed.

if(!require(tidyverse))install.packages("tidyverse")
## Loading required package: tidyverse
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr     1.1.4     ✔ readr     2.1.5
## ✔ forcats   1.0.0     ✔ stringr   1.5.1
## ✔ ggplot2   3.5.1     ✔ tibble    3.2.1
## ✔ lubridate 1.9.3     ✔ tidyr     1.3.1
## ✔ purrr     1.0.2     
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(tidyverse)
if(!require(gridExtra))install.packages("gridExtra")
## Loading required package: gridExtra
## 
## Attaching package: 'gridExtra'
## The following object is masked from 'package:dplyr':
## 
##     combine
library(gridExtra)
# Load the hate-crime records from the CSV file in the working directory.
data <- read.csv(file = "hate_crime.csv")

# Fix the RNG seed so the same 240,000-row subset is drawn on every run.
set.seed(seed = 123)
sampled_data <- sample_n(data, size = 240000)

# Per-column overview of the sampled rows.
summary(sampled_data)
##   incident_id        data_year        ori            pug_agency_name   
##  Min.   :      2   Min.   :1991   Length:240000      Length:240000     
##  1st Qu.:  60445   1st Qu.:1999   Class :character   Class :character  
##  Median : 120863   Median :2006   Mode  :character   Mode  :character  
##  Mean   : 348916   Mean   :2007                                        
##  3rd Qu.: 181280   3rd Qu.:2016                                        
##  Max.   :1494167   Max.   :2022                                        
##                                                                        
##  pub_agency_unit    agency_type_name    state_abbr         state_name       
##  Length:240000      Length:240000      Length:240000      Length:240000     
##  Class :character   Class :character   Class :character   Class :character  
##  Mode  :character   Mode  :character   Mode  :character   Mode  :character  
##                                                                             
##                                                                             
##                                                                             
##                                                                             
##  division_name      region_name        population_group_code
##  Length:240000      Length:240000      Length:240000        
##  Class :character   Class :character   Class :character     
##  Mode  :character   Mode  :character   Mode  :character     
##                                                             
##                                                             
##                                                             
##                                                             
##  population_group_description incident_date      adult_victim_count
##  Length:240000                Length:240000      Min.   :  0.00    
##  Class :character             Class :character   1st Qu.:  0.00    
##  Mode  :character             Mode  :character   Median :  1.00    
##                                                  Mean   :  0.73    
##                                                  3rd Qu.:  1.00    
##                                                  Max.   :146.00    
##                                                  NA's   :169388    
##  juvenile_victim_count total_offender_count adult_offender_count
##  Min.   : 0.0          Min.   : 0.0000      Min.   : 0.00       
##  1st Qu.: 0.0          1st Qu.: 0.0000      1st Qu.: 0.00       
##  Median : 0.0          Median : 1.0000      Median : 0.00       
##  Mean   : 0.1          Mean   : 0.9562      Mean   : 0.61       
##  3rd Qu.: 0.0          3rd Qu.: 1.0000      3rd Qu.: 1.00       
##  Max.   :60.0          Max.   :99.0000      Max.   :60.00       
##  NA's   :171814                             NA's   :175958      
##  juvenile_offender_count offender_race      offender_ethnicity
##  Min.   : 0.00           Length:240000      Length:240000     
##  1st Qu.: 0.00           Class :character   Class :character  
##  Median : 0.00           Mode  :character   Mode  :character  
##  Mean   : 0.12                                                
##  3rd Qu.: 0.00                                                
##  Max.   :20.00                                                
##  NA's   :175965                                               
##   victim_count     offense_name       total_individual_victims
##  Min.   :  1.000   Length:240000      Min.   :  0.000         
##  1st Qu.:  1.000   Class :character   1st Qu.:  1.000         
##  Median :  1.000   Mode  :character   Median :  1.000         
##  Mean   :  1.238                      Mean   :  0.989         
##  3rd Qu.:  1.000                      3rd Qu.:  1.000         
##  Max.   :200.000                      Max.   :147.000         
##                                       NA's   :4816            
##  location_name       bias_desc         victim_types       multiple_offense  
##  Length:240000      Length:240000      Length:240000      Length:240000     
##  Class :character   Class :character   Class :character   Class :character  
##  Mode  :character   Mode  :character   Mode  :character   Mode  :character  
##                                                                             
##                                                                             
##                                                                             
##                                                                             
##  multiple_bias     
##  Length:240000     
##  Class :character  
##  Mode  :character  
##                    
##                    
##                    
## 

Checking the Offense with the Highest Frequency

# Number of distinct offense labels present in the sample.
length(unique(sampled_data[["offense_name"]]))
## [1] 398
# Count the sampled incidents per offense label, most frequent label first.
offense_frequency <- arrange(
  summarise(group_by(sampled_data, offense_name), frequency = n()),
  desc(frequency)
)

offense_frequency
## # A tibble: 398 × 2
##    offense_name                                          frequency
##    <chr>                                                     <int>
##  1 Destruction/Damage/Vandalism of Property                  71550
##  2 Intimidation                                              69908
##  3 Simple Assault                                            46312
##  4 Aggravated Assault                                        25246
##  5 Robbery                                                    3980
##  6 Burglary/Breaking & Entering                               3071
##  7 All Other Larceny                                          2423
##  8 Destruction/Damage/Vandalism of Property;Intimidation      1990
##  9 Arson                                                      1281
## 10 Drug/Narcotic Violations                                   1216
## # ℹ 388 more rows

Shortening Some of the Offense Names

# Replace two offense labels with short forms; every other label, including
# combined labels that merely contain these names, is left unchanged.
sampled_data <- mutate(
  sampled_data,
  offense_name = case_when(
    offense_name == "Destruction/Damage/Vandalism of Property" ~ "Damage",
    offense_name == "Simple Assault" ~ "Sim.Assault",
    TRUE ~ offense_name
  )
)

# Rebuild the frequency table so it shows the shortened labels.
offense_frequency <- arrange(
  summarise(group_by(sampled_data, offense_name), frequency = n()),
  desc(frequency)
)

offense_frequency
## # A tibble: 398 × 2
##    offense_name                                          frequency
##    <chr>                                                     <int>
##  1 Damage                                                    71550
##  2 Intimidation                                              69908
##  3 Sim.Assault                                               46312
##  4 Aggravated Assault                                        25246
##  5 Robbery                                                    3980
##  6 Burglary/Breaking & Entering                               3071
##  7 All Other Larceny                                          2423
##  8 Destruction/Damage/Vandalism of Property;Intimidation      1990
##  9 Arson                                                      1281
## 10 Drug/Narcotic Violations                                   1216
## # ℹ 388 more rows

The three offenses with the highest frequency are Damage, Intimidation, and Simple Assault.

Summary Statistics of the Offenses

# Column summaries restricted to incidents labelled "Damage".
summ_dam <- filter(sampled_data, offense_name == "Damage")
summary(object = summ_dam)
##   incident_id        data_year        ori            pug_agency_name   
##  Min.   :      6   Min.   :1991   Length:71550       Length:71550      
##  1st Qu.:  62594   1st Qu.:1999   Class :character   Class :character  
##  Median : 118000   Median :2006   Mode  :character   Mode  :character  
##  Mean   : 300800   Mean   :2007                                        
##  3rd Qu.: 172303   3rd Qu.:2014                                        
##  Max.   :1494037   Max.   :2022                                        
##                                                                        
##  pub_agency_unit    agency_type_name    state_abbr         state_name       
##  Length:71550       Length:71550       Length:71550       Length:71550      
##  Class :character   Class :character   Class :character   Class :character  
##  Mode  :character   Mode  :character   Mode  :character   Mode  :character  
##                                                                             
##                                                                             
##                                                                             
##                                                                             
##  division_name      region_name        population_group_code
##  Length:71550       Length:71550       Length:71550         
##  Class :character   Class :character   Class :character     
##  Mode  :character   Mode  :character   Mode  :character     
##                                                             
##                                                             
##                                                             
##                                                             
##  population_group_description incident_date      adult_victim_count
##  Length:71550                 Length:71550       Min.   : 0.00     
##  Class :character             Class :character   1st Qu.: 0.00     
##  Mode  :character             Mode  :character   Median : 0.00     
##                                                  Mean   : 0.44     
##                                                  3rd Qu.: 1.00     
##                                                  Max.   :26.00     
##                                                  NA's   :55202     
##  juvenile_victim_count total_offender_count adult_offender_count
##  Min.   :0.00          Min.   : 0.0000      Min.   : 0.00       
##  1st Qu.:0.00          1st Qu.: 0.0000      1st Qu.: 0.00       
##  Median :0.00          Median : 0.0000      Median : 0.00       
##  Mean   :0.01          Mean   : 0.3997      Mean   : 0.25       
##  3rd Qu.:0.00          3rd Qu.: 1.0000      3rd Qu.: 0.00       
##  Max.   :3.00          Max.   :99.0000      Max.   :20.00       
##  NA's   :55391                              NA's   :57517       
##  juvenile_offender_count offender_race      offender_ethnicity
##  Min.   : 0.0            Length:71550       Length:71550      
##  1st Qu.: 0.0            Class :character   Class :character  
##  Median : 0.0            Mode  :character   Mode  :character  
##  Mean   : 0.1                                                 
##  3rd Qu.: 0.0                                                 
##  Max.   :13.0                                                 
##  NA's   :57517                                                
##   victim_count     offense_name       total_individual_victims
##  Min.   :  1.000   Length:71550       Min.   : 0.00           
##  1st Qu.:  1.000   Class :character   1st Qu.: 0.00           
##  Median :  1.000   Mode  :character   Median : 0.00           
##  Mean   :  1.126                      Mean   : 0.54           
##  3rd Qu.:  1.000                      3rd Qu.: 1.00           
##  Max.   :100.000                      Max.   :52.00           
##                                       NA's   :4498            
##  location_name       bias_desc         victim_types       multiple_offense  
##  Length:71550       Length:71550       Length:71550       Length:71550      
##  Class :character   Class :character   Class :character   Class :character  
##  Mode  :character   Mode  :character   Mode  :character   Mode  :character  
##                                                                             
##                                                                             
##                                                                             
##                                                                             
##  multiple_bias     
##  Length:71550      
##  Class :character  
##  Mode  :character  
##                    
##                    
##                    
## 
# Column summaries restricted to incidents labelled "Intimidation".
summ_int <- filter(sampled_data, offense_name == "Intimidation")
summary(object = summ_int)
##   incident_id        data_year        ori            pug_agency_name   
##  Min.   :      2   Min.   :1991   Length:69908       Length:69908      
##  1st Qu.:  52111   1st Qu.:1998   Class :character   Class :character  
##  Median : 106931   Median :2005   Mode  :character   Mode  :character  
##  Mean   : 329107   Mean   :2006                                        
##  3rd Qu.: 177476   3rd Qu.:2015                                        
##  Max.   :1494167   Max.   :2022                                        
##                                                                        
##  pub_agency_unit    agency_type_name    state_abbr         state_name       
##  Length:69908       Length:69908       Length:69908       Length:69908      
##  Class :character   Class :character   Class :character   Class :character  
##  Mode  :character   Mode  :character   Mode  :character   Mode  :character  
##                                                                             
##                                                                             
##                                                                             
##                                                                             
##  division_name      region_name        population_group_code
##  Length:69908       Length:69908       Length:69908         
##  Class :character   Class :character   Class :character     
##  Mode  :character   Mode  :character   Mode  :character     
##                                                             
##                                                             
##                                                             
##                                                             
##  population_group_description incident_date      adult_victim_count
##  Length:69908                 Length:69908       Min.   :  0.00    
##  Class :character             Class :character   1st Qu.:  0.00    
##  Mode  :character             Mode  :character   Median :  1.00    
##                                                  Mean   :  0.88    
##                                                  3rd Qu.:  1.00    
##                                                  Max.   :146.00    
##                                                  NA's   :49938     
##  juvenile_victim_count total_offender_count adult_offender_count
##  Min.   : 0.00         Min.   : 0.0000      Min.   : 0.00       
##  1st Qu.: 0.00         1st Qu.: 0.0000      1st Qu.: 0.00       
##  Median : 0.00         Median : 1.0000      Median : 1.00       
##  Mean   : 0.17         Mean   : 0.8979      Mean   : 0.57       
##  3rd Qu.: 0.00         3rd Qu.: 1.0000      3rd Qu.: 1.00       
##  Max.   :60.00         Max.   :99.0000      Max.   :60.00       
##  NA's   :51821                              NA's   :51154       
##  juvenile_offender_count offender_race      offender_ethnicity
##  Min.   :0.00            Length:69908       Length:69908      
##  1st Qu.:0.00            Class :character   Class :character  
##  Median :0.00            Mode  :character   Mode  :character  
##  Mean   :0.12                                                 
##  3rd Qu.:0.00                                                 
##  Max.   :9.00                                                 
##  NA's   :51157                                                
##   victim_count     offense_name       total_individual_victims
##  Min.   :  1.000   Length:69908       Min.   :  0.000         
##  1st Qu.:  1.000   Class :character   1st Qu.:  1.000         
##  Median :  1.000   Mode  :character   Median :  1.000         
##  Mean   :  1.226                      Mean   :  1.202         
##  3rd Qu.:  1.000                      3rd Qu.:  1.000         
##  Max.   :146.000                      Max.   :146.000         
##                                                               
##  location_name       bias_desc         victim_types       multiple_offense  
##  Length:69908       Length:69908       Length:69908       Length:69908      
##  Class :character   Class :character   Class :character   Class :character  
##  Mode  :character   Mode  :character   Mode  :character   Mode  :character  
##                                                                             
##                                                                             
##                                                                             
##                                                                             
##  multiple_bias     
##  Length:69908      
##  Class :character  
##  Mode  :character  
##                    
##                    
##                    
## 
# Column summaries restricted to incidents labelled "Sim.Assault".
summ_simp <- filter(sampled_data, offense_name == "Sim.Assault")
summary(object = summ_simp)
##   incident_id        data_year        ori            pug_agency_name   
##  Min.   :      8   Min.   :1991   Length:46312       Length:46312      
##  1st Qu.:  67600   1st Qu.:2000   Class :character   Class :character  
##  Median : 132274   Median :2008   Mode  :character   Mode  :character  
##  Mean   : 377579   Mean   :2008                                        
##  3rd Qu.: 186354   3rd Qu.:2016                                        
##  Max.   :1494040   Max.   :2022                                        
##                                                                        
##  pub_agency_unit    agency_type_name    state_abbr         state_name       
##  Length:46312       Length:46312       Length:46312       Length:46312      
##  Class :character   Class :character   Class :character   Class :character  
##  Mode  :character   Mode  :character   Mode  :character   Mode  :character  
##                                                                             
##                                                                             
##                                                                             
##                                                                             
##  division_name      region_name        population_group_code
##  Length:46312       Length:46312       Length:46312         
##  Class :character   Class :character   Class :character     
##  Mode  :character   Mode  :character   Mode  :character     
##                                                             
##                                                             
##                                                             
##                                                             
##  population_group_description incident_date      adult_victim_count
##  Length:46312                 Length:46312       Min.   : 0.000    
##  Class :character             Class :character   1st Qu.: 0.000    
##  Mode  :character             Mode  :character   Median : 1.000    
##                                                  Mean   : 0.758    
##                                                  3rd Qu.: 1.000    
##                                                  Max.   :10.000    
##                                                  NA's   :30850     
##  juvenile_victim_count total_offender_count adult_offender_count
##  Min.   :0.000         Min.   : 0.000       Min.   : 0.000      
##  1st Qu.:0.000         1st Qu.: 1.000       1st Qu.: 0.000      
##  Median :0.000         Median : 1.000       Median : 1.000      
##  Mean   :0.139         Mean   : 1.404       Mean   : 0.748      
##  3rd Qu.:0.000         3rd Qu.: 1.000       3rd Qu.: 1.000      
##  Max.   :8.000         Max.   :50.000       Max.   :30.000      
##  NA's   :31064                              NA's   :31589       
##  juvenile_offender_count offender_race      offender_ethnicity  victim_count   
##  Min.   : 0.000          Length:46312       Length:46312       Min.   : 1.000  
##  1st Qu.: 0.000          Class :character   Class :character   1st Qu.: 1.000  
##  Median : 0.000          Mode  :character   Mode  :character   Median : 1.000  
##  Mean   : 0.151                                                Mean   : 1.178  
##  3rd Qu.: 0.000                                                3rd Qu.: 1.000  
##  Max.   :15.000                                                Max.   :26.000  
##  NA's   :31592                                                                 
##  offense_name       total_individual_victims location_name     
##  Length:46312       Min.   : 0.000           Length:46312      
##  Class :character   1st Qu.: 1.000           Class :character  
##  Mode  :character   Median : 1.000           Mode  :character  
##                     Mean   : 1.134                             
##                     3rd Qu.: 1.000                             
##                     Max.   :26.000                             
##                                                                
##   bias_desc         victim_types       multiple_offense   multiple_bias     
##  Length:46312       Length:46312       Length:46312       Length:46312      
##  Class :character   Class :character   Class :character   Class :character  
##  Mode  :character   Mode  :character   Mode  :character   Mode  :character  
##                                                                             
##                                                                             
##                                                                             
## 

Data Visualizations

# Offense labels compared in the per-year bar charts below.
top_offenses <- c("Damage", "Intimidation", "Sim.Assault")

# Keep the rows of `data` whose `data_year` equals `year` and whose
# `offense_name` is one of `offenses`.
select_year_offenses <- function(data, year, offenses = top_offenses) {
  data[data$data_year == year & data$offense_name %in% offenses, ]
}

# Bar chart of the number of incidents per offense label in `year_data`.
# `year` is only used to build the title and the x-axis label.
plot_year_offenses <- function(year_data, year) {
  ggplot(year_data, aes(offense_name)) +
    geom_bar() +
    labs(
      title = paste("Top 3 Offences in", year),
      x = paste("Year:", year),
      y = "Frequency"
    ) +
    theme_minimal()
}

var1991 <- select_year_offenses(sampled_data, 1991)
bar_var1991 <- plot_year_offenses(var1991, 1991)
bar_var1991

var2000 <- select_year_offenses(sampled_data, 2000)
bar_var2000 <- plot_year_offenses(var2000, 2000)
bar_var2000

var2010 <- select_year_offenses(sampled_data, 2010)
bar_var2010 <- plot_year_offenses(var2010, 2010)
bar_var2010

var2020 <- select_year_offenses(sampled_data, 2020)
bar_var2020 <- plot_year_offenses(var2020, 2020)
bar_var2020

# Show the four yearly charts together in a 2 x 2 grid.
grid.arrange(bar_var1991, bar_var2000, bar_var2010, bar_var2020, ncol = 2)

# Number of sampled "Damage" incidents in each year.
ann_dam <- summarise(
  group_by(filter(sampled_data, offense_name == "Damage"), data_year),
  damage_count = n()
)

ann_dam
## # A tibble: 32 × 2
##    data_year damage_count
##        <int>        <int>
##  1      1991         1226
##  2      1992         1751
##  3      1993         2163
##  4      1994         1634
##  5      1995         2136
##  6      1996         2648
##  7      1997         2378
##  8      1998         2432
##  9      1999         2494
## 10      2000         2612
## # ℹ 22 more rows
# Inspect the yearly counts, then fit a straight-line trend of the
# "Damage" count against the year.
view(ann_dam)
dam_linearmodel <- lm(formula = damage_count ~ data_year, data = ann_dam)

dam_linearmodel
## 
## Call:
## lm(formula = damage_count ~ data_year, data = ann_dam)
## 
## Coefficients:
## (Intercept)    data_year  
##    -450.919        1.339
# Evaluate the fitted "Damage" trend at the year 2023.
dam_newdata <- data.frame(data_year = 2023)

dam_prediction <- predict(object = dam_linearmodel, newdata = dam_newdata)

dam_prediction
##        1 
## 2258.032
# Number of sampled "Intimidation" incidents in each year.
ann_int <- summarise(
  group_by(filter(sampled_data, offense_name == "Intimidation"), data_year),
  intimidation_count = n()
)

ann_int
## # A tibble: 32 × 2
##    data_year intimidation_count
##        <int>              <int>
##  1      1991               1513
##  2      1992               2270
##  3      1993               2403
##  4      1994               2075
##  5      1995               2820
##  6      1996               2938
##  7      1997               2729
##  8      1998               2693
##  9      1999               2582
## 10      2000               2623
## # ℹ 22 more rows
# Inspect the yearly counts, then fit a straight-line trend of the
# "Intimidation" count against the year.
view(ann_int)
int_linearmodel <- lm(formula = intimidation_count ~ data_year, data = ann_int)

int_linearmodel
## 
## Call:
## lm(formula = intimidation_count ~ data_year, data = ann_int)
## 
## Coefficients:
## (Intercept)    data_year  
##    24847.48       -11.29
# Evaluate the fitted "Intimidation" trend at the year 2023.
int_newdata <- data.frame(data_year = 2023)

int_prediction <- predict(object = int_linearmodel, newdata = int_newdata)

int_prediction
##        1 
## 1998.262
# Number of sampled "Sim.Assault" incidents in each year.
ann_sim <- summarise(
  group_by(filter(sampled_data, offense_name == "Sim.Assault"), data_year),
  sim_count = n()
)

ann_sim
## # A tibble: 32 × 2
##    data_year sim_count
##        <int>     <int>
##  1      1991       755
##  2      1992      1231
##  3      1993      1417
##  4      1994      1018
##  5      1995      1388
##  6      1996      1267
##  7      1997      1350
##  8      1998      1357
##  9      1999      1426
## 10      2000      1377
## # ℹ 22 more rows
# Inspect the yearly counts, then fit a straight-line trend of the
# "Sim.Assault" count against the year.
view(ann_sim)
sim_linearmodel <- lm(formula = sim_count ~ data_year, data = ann_sim)

sim_linearmodel
## 
## Call:
## lm(formula = sim_count ~ data_year, data = ann_sim)
## 
## Coefficients:
## (Intercept)    data_year  
##   -42354.47        21.83
# Evaluate the fitted "Sim.Assault" trend at the year 2023.
sim_newdata <- data.frame(data_year = 2023)

sim_prediction <- predict(object = sim_linearmodel, newdata = sim_newdata)

sim_prediction
##        1 
## 1807.444