This project uses a crime dataset from 1991 to 2022. The most common crimes during this time are Aggravated Assault, Intimidation, and Simple Assault. By using the Random Forest method, we aim to predict how often these crimes will happen in 2023. This analysis will help understand crime trends and support decisions for law enforcement and policy making.

Loading Libraries

if(!require(tidyverse))install.packages("tidyverse")
## Loading required package: tidyverse
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr     1.1.4     ✔ readr     2.1.5
## ✔ forcats   1.0.0     ✔ stringr   1.5.1
## ✔ ggplot2   3.5.1     ✔ tibble    3.2.1
## ✔ lubridate 1.9.3     ✔ tidyr     1.3.1
## ✔ purrr     1.0.2     
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(tidyverse)
library(randomForest)
## randomForest 4.7-1.1
## Type rfNews() to see new features/changes/bug fixes.
## 
## Attaching package: 'randomForest'
## The following object is masked from 'package:dplyr':
## 
##     combine
## The following object is masked from 'package:ggplot2':
## 
##     margin
library(caret)
## Loading required package: lattice
## 
## Attaching package: 'caret'
## The following object is masked from 'package:purrr':
## 
##     lift
data <- read.csv("hate_crime.csv")

head(data)
##   incident_id data_year       ori   pug_agency_name pub_agency_unit
## 1          43      1991 AR0350100        Pine Bluff                
## 2          44      1991 AR0350100        Pine Bluff                
## 3          45      1991 AR0600300 North Little Rock                
## 4          46      1991 AR0600300 North Little Rock                
## 5          47      1991 AR0670000            Sevier                
## 6        3015      1991 AR0040200            Rogers                
##   agency_type_name state_abbr state_name      division_name region_name
## 1             City         AR   Arkansas West South Central       South
## 2             City         AR   Arkansas West South Central       South
## 3             City         AR   Arkansas West South Central       South
## 4             City         AR   Arkansas West South Central       South
## 5           County         AR   Arkansas West South Central       South
## 6             City         AR   Arkansas West South Central       South
##   population_group_code   population_group_description incident_date
## 1                     3 Cities from 50,000 thru 99,999    1991-07-04
## 2                     3 Cities from 50,000 thru 99,999    1991-12-24
## 3                     3 Cities from 50,000 thru 99,999    1991-07-10
## 4                     3 Cities from 50,000 thru 99,999    1991-10-06
## 5                    8D  Non-MSA counties under 10,000    1991-10-14
## 6                     5 Cities from 10,000 thru 24,999    1991-08-31
##   adult_victim_count juvenile_victim_count total_offender_count
## 1                 NA                    NA                    1
## 2                 NA                    NA                    1
## 3                 NA                    NA                    1
## 4                 NA                    NA                    2
## 5                 NA                    NA                    1
## 6                 NA                    NA                    1
##   adult_offender_count juvenile_offender_count             offender_race
## 1                   NA                      NA Black or African American
## 2                   NA                      NA Black or African American
## 3                   NA                      NA Black or African American
## 4                   NA                      NA Black or African American
## 5                   NA                      NA                     White
## 6                   NA                      NA                     White
##   offender_ethnicity victim_count
## 1      Not Specified            1
## 2      Not Specified            2
## 3      Not Specified            2
## 4      Not Specified            1
## 5      Not Specified            1
## 6      Not Specified            1
##                                                  offense_name
## 1                                          Aggravated Assault
## 2 Aggravated Assault;Destruction/Damage/Vandalism of Property
## 3     Aggravated Assault;Murder and Nonnegligent Manslaughter
## 4                                                Intimidation
## 5                                                Intimidation
## 6                                                Intimidation
##   total_individual_victims                      location_name
## 1                        1                     Residence/Home
## 2                        1 Highway/Road/Alley/Street/Sidewalk
## 3                        2                     Residence/Home
## 4                        1                     Residence/Home
## 5                        1                     School/College
## 6                        1 Highway/Road/Alley/Street/Sidewalk
##                        bias_desc victim_types multiple_offense multiple_bias
## 1 Anti-Black or African American   Individual                S             S
## 2                     Anti-White   Individual                M             S
## 3                     Anti-White   Individual                M             S
## 4                     Anti-White   Individual                S             S
## 5 Anti-Black or African American   Individual                S             S
## 6 Anti-Black or African American   Individual                S             S

Getting the top 3 offenses based on total offender count

top_offenses <- data %>%
  group_by(offense_name) %>%
  summarize(total_offender_count = sum(total_offender_count, na.rm = TRUE)) %>%
  arrange(desc(total_offender_count)) %>%
  slice_head(n = 3)
 
top_offenses
## # A tibble: 3 × 2
##   offense_name       total_offender_count
##   <chr>                             <int>
## 1 Simple Assault                    65440
## 2 Intimidation                      63191
## 3 Aggravated Assault                42291

Extracting the names of the top 3 offenses

top_offenses_names <- top_offenses$offense_name
top_offenses$offense_name
## [1] "Simple Assault"     "Intimidation"       "Aggravated Assault"

Filtering the dataset to include only the top 3 offenses

filtered_data <- data %>%
  filter(offense_name %in% top_offenses_names)
 
head(filtered_data)
##   incident_id data_year       ori   pug_agency_name pub_agency_unit
## 1          43      1991 AR0350100        Pine Bluff                
## 2          46      1991 AR0600300 North Little Rock                
## 3          47      1991 AR0670000            Sevier                
## 4        3015      1991 AR0040200            Rogers                
## 5        3016      1991 AR0290100              Hope                
## 6        3017      1991 AR0350100        Pine Bluff                
##   agency_type_name state_abbr state_name      division_name region_name
## 1             City         AR   Arkansas West South Central       South
## 2             City         AR   Arkansas West South Central       South
## 3           County         AR   Arkansas West South Central       South
## 4             City         AR   Arkansas West South Central       South
## 5             City         AR   Arkansas West South Central       South
## 6             City         AR   Arkansas West South Central       South
##   population_group_code   population_group_description incident_date
## 1                     3 Cities from 50,000 thru 99,999    1991-07-04
## 2                     3 Cities from 50,000 thru 99,999    1991-10-06
## 3                    8D  Non-MSA counties under 10,000    1991-10-14
## 4                     5 Cities from 10,000 thru 24,999    1991-08-31
## 5                     6   Cities from 2,500 thru 9,999    1991-09-19
## 6                     3 Cities from 50,000 thru 99,999    1991-12-23
##   adult_victim_count juvenile_victim_count total_offender_count
## 1                 NA                    NA                    1
## 2                 NA                    NA                    2
## 3                 NA                    NA                    1
## 4                 NA                    NA                    1
## 5                 NA                    NA                    1
## 6                 NA                    NA                    1
##   adult_offender_count juvenile_offender_count             offender_race
## 1                   NA                      NA Black or African American
## 2                   NA                      NA Black or African American
## 3                   NA                      NA                     White
## 4                   NA                      NA                     White
## 5                   NA                      NA Black or African American
## 6                   NA                      NA Black or African American
##   offender_ethnicity victim_count       offense_name total_individual_victims
## 1      Not Specified            1 Aggravated Assault                        1
## 2      Not Specified            1       Intimidation                        1
## 3      Not Specified            1       Intimidation                        1
## 4      Not Specified            1       Intimidation                        1
## 5      Not Specified            1     Simple Assault                        1
## 6      Not Specified            1 Aggravated Assault                        1
##                        location_name                      bias_desc
## 1                     Residence/Home Anti-Black or African American
## 2                     Residence/Home                     Anti-White
## 3                     School/College Anti-Black or African American
## 4 Highway/Road/Alley/Street/Sidewalk Anti-Black or African American
## 5 Highway/Road/Alley/Street/Sidewalk                     Anti-White
## 6                Service/Gas Station                     Anti-White
##   victim_types multiple_offense multiple_bias
## 1   Individual                S             S
## 2   Individual                S             S
## 3   Individual                S             S
## 4   Individual                S             S
## 5   Individual                S             S
## 6   Individual                S             S

Yearly counts for each of the top 3 offenses

yearly_counts <- filtered_data %>%
  group_by(offense_name, data_year) %>%
  summarize(yearly_offender_count = sum(total_offender_count, na.rm = TRUE)) %>%
  arrange(offense_name, data_year)
## `summarise()` has grouped output by 'offense_name'. You can override using the
## `.groups` argument.
yearly_counts
## # A tibble: 96 × 3
## # Groups:   offense_name [3]
##    offense_name       data_year yearly_offender_count
##    <chr>                  <int>                 <int>
##  1 Aggravated Assault      1991                  1490
##  2 Aggravated Assault      1992                  2053
##  3 Aggravated Assault      1993                  2299
##  4 Aggravated Assault      1994                  1452
##  5 Aggravated Assault      1995                  1726
##  6 Aggravated Assault      1996                  1788
##  7 Aggravated Assault      1997                  1644
##  8 Aggravated Assault      1998                  1466
##  9 Aggravated Assault      1999                  1393
## 10 Aggravated Assault      2000                  1530
## # ℹ 86 more rows

Frequency of Simple Assault

SimAssault_count <- yearly_counts %>% filter(offense_name == "Simple Assault")
SimAssault_count
## # A tibble: 32 × 3
## # Groups:   offense_name [1]
##    offense_name   data_year yearly_offender_count
##    <chr>              <int>                 <int>
##  1 Simple Assault      1991                  1561
##  2 Simple Assault      1992                  2430
##  3 Simple Assault      1993                  2407
##  4 Simple Assault      1994                  1713
##  5 Simple Assault      1995                  2200
##  6 Simple Assault      1996                  2109
##  7 Simple Assault      1997                  2074
##  8 Simple Assault      1998                  2164
##  9 Simple Assault      1999                  2181
## 10 Simple Assault      2000                  1996
## # ℹ 22 more rows

Prepare the features (X) and labels (y)

X <- SimAssault_count$data_year
y <- SimAssault_count$yearly_offender_count

Convert to data frame for modeling

data_model <- data.frame(X, y)

Split the data into training and testing sets

set.seed(42)
train_index <- createDataPartition(y, p = 0.8, list = FALSE)
train_data <- data_model[train_index, ]
test_data <- data_model[-train_index, ]

The first line of code sets a random seed using set.seed(42) to ensure reproducibility. This ensures that the random processes in the code (like splitting data into training and testing sets) produce the same results every time you run the code. It is just like using the same starting point for a random number generator so that the sequence of random numbers it produces is always the same.

Splits the data into training (80%) and testing (20%) sets using createDataPartition. The indices of the training data are stored in train_index. The training and testing data are stored in train_data and test_data, respectively.

Separate features and labels for training and testing sets

X_train <- train_data$X
y_train <- train_data$y
X_test <- test_data$X
y_test <- test_data$y

These lines of code separate the features (X_train and X_test) and labels (y_train and y_test) for both training and testing sets.

1 X_train <- train_data$X: Extracts the feature (year) from the training data and assigns it to X_train.

2 y_train <- train_data$y: Extracts the label (number of “Damage” crimes) from the training data and assigns it to y_train.

3 X_test <- test_data$X: Extracts the feature (year) from the testing data and assigns it to X_test.

4 y_test <- test_data$y: Extracts the label (number of “Damage” crimes) from the testing data and assigns it to y_test.

These lines of code separate the input data (features) from the output data (labels) for both the training and testing datasets. This separation is essential for training a machine learning model and then evaluating its performance on new data.

Train the Random Forest model

model <- randomForest(y ~ X, data = train_data, ntree = 100, set.seed = 42)

This line of code trains a Random Forest model using randomForest. The formula y ~ X specifies that y is the dependent variable and X is the independent variable. The model is trained on train_data with 100 trees (ntree = 100). The random seed ensures reproducibility.

Predict on the test set

y_pred <- predict(model, newdata = data.frame(X = X_test))

This line of code uses the trained model to predict the y values for the X_test data. The predictions are stored in y_pred.

Evaluate the model

mse <- mean((y_test - y_pred)^2)
print(paste("Mean Squared Error:", mse))
## [1] "Mean Squared Error: 132544.287086563"

This line of code calculates the Mean Squared Error (MSE) between the actual y_test values and the predicted y_pred values to evaluate the model’s performance. The MSE is printed.

Predict the frequency of occurrences for the year 2023

year_2023 <- data.frame(X = 2023)
prediction_2023 <- predict(model, year_2023)

print(paste("Predicted frequency of Simple Assault in 2023:", round(prediction_2023)))
## [1] "Predicted frequency of Simple Assault in 2023: 2717"

Creates a data frame year_2023 with the year 2023. It uses the trained model to predict the frequency of “Aggravated Assault” occurrences for the year 2023.

The last line of code outputs the predicted frequency of Simple Assault occurrences for the year 2023. The prediction of occurrences of Simple Assault in 2023 is 2717.

This final output provides insight into future trends based on historical data.

intimidation_count <- yearly_counts %>% filter(offense_name == "Intimidation")
intimidation_count
## # A tibble: 32 × 3
## # Groups:   offense_name [1]
##    offense_name data_year yearly_offender_count
##    <chr>            <int>                 <int>
##  1 Intimidation      1991                  1286
##  2 Intimidation      1992                  2005
##  3 Intimidation      1993                  2080
##  4 Intimidation      1994                  1894
##  5 Intimidation      1995                  2574
##  6 Intimidation      1996                  2704
##  7 Intimidation      1997                  2777
##  8 Intimidation      1998                  2382
##  9 Intimidation      1999                  2187
## 10 Intimidation      2000                  2349
## # ℹ 22 more rows

Prepare the features (X) and labels (y)

X <- intimidation_count$data_year
y <- intimidation_count$yearly_offender_count

Convert to data frame for modeling

data_model <- data.frame(X, y)

Split the data into training and testing sets

set.seed(42)
train_index <- createDataPartition(y, p = 0.8, list = FALSE)
train_data <- data_model[train_index, ]
test_data <- data_model[-train_index, ]

Separate features and labels for training and testing sets

X_train <- train_data$X
y_train <- train_data$y
X_test <- test_data$X
y_test <- test_data$y

Train the Random Forest model

model <- randomForest(y ~ X, data = train_data, ntree = 100, set.seed = 42)

Predict on the test set

y_pred <- predict(model, newdata = data.frame(X = X_test))

Evaluate the model

mse <- mean((y_test - y_pred)^2)
print(paste("Mean Squared Error:", mse))
## [1] "Mean Squared Error: 47856.542985236"

Predict the frequency of occurrences for the year 2023

year_2023 <- data.frame(X = 2023)
prediction_2023 <- predict(model, year_2023)

print(paste("Predicted frequency of Intimidation in 2023:", round(prediction_2023)))
## [1] "Predicted frequency of Intimidation in 2023: 2534"
aggravated_count <- yearly_counts %>% filter(offense_name == "Aggravated Assault")
aggravated_count
## # A tibble: 32 × 3
## # Groups:   offense_name [1]
##    offense_name       data_year yearly_offender_count
##    <chr>                  <int>                 <int>
##  1 Aggravated Assault      1991                  1490
##  2 Aggravated Assault      1992                  2053
##  3 Aggravated Assault      1993                  2299
##  4 Aggravated Assault      1994                  1452
##  5 Aggravated Assault      1995                  1726
##  6 Aggravated Assault      1996                  1788
##  7 Aggravated Assault      1997                  1644
##  8 Aggravated Assault      1998                  1466
##  9 Aggravated Assault      1999                  1393
## 10 Aggravated Assault      2000                  1530
## # ℹ 22 more rows

Prepare the features (X) and labels (y)

X <- aggravated_count$data_year
y <- aggravated_count$yearly_offender_count

Convert to data frame for modeling

data_model <- data.frame(X, y)

Split the data into training and testing sets

set.seed(42)
train_index <- createDataPartition(y, p = 0.8, list = FALSE)
train_data <- data_model[train_index, ]
test_data <- data_model[-train_index, ]

Separate features and labels for training and testing sets

X_train <- train_data$X
y_train <- train_data$y
X_test <- test_data$X
y_test <- test_data$y

Train the Random Forest model

model <- randomForest(y ~ X, data = train_data, ntree = 100, set.seed = 42)

Predict on the test set

y_pred <- predict(model, newdata = data.frame(X = X_test))

Evaluate the model

mse <- mean((y_test - y_pred)^2)
print(paste("Mean Squared Error:", mse))
## [1] "Mean Squared Error: 29063.5534727083"

Predict the frequency of occurrences for the year 2023

year_2023 <- data.frame(X = 2023)
prediction_2023 <- predict(model, year_2023)

print(paste("Predicted frequency of Aggravated Assault in 2023:", round(prediction_2023)))
## [1] "Predicted frequency of Aggravated Assault in 2023: 1368"
SimAssault_new <- data.frame(offense_name = "Simple Assault",
                      data_year = 2023,
                      yearly_offender_count = 2717)

SimAssault_count <- rbind(SimAssault_count, SimAssault_new)
SimAssault_count
## # A tibble: 33 × 3
## # Groups:   offense_name [1]
##    offense_name   data_year yearly_offender_count
##    <chr>              <dbl>                 <dbl>
##  1 Simple Assault      1991                  1561
##  2 Simple Assault      1992                  2430
##  3 Simple Assault      1993                  2407
##  4 Simple Assault      1994                  1713
##  5 Simple Assault      1995                  2200
##  6 Simple Assault      1996                  2109
##  7 Simple Assault      1997                  2074
##  8 Simple Assault      1998                  2164
##  9 Simple Assault      1999                  2181
## 10 Simple Assault      2000                  1996
## # ℹ 23 more rows
int_new <- data.frame(offense_name = "Intimidation",
                      data_year = 2023,
                      yearly_offender_count = 2534)

intimidation_count <- rbind(intimidation_count, int_new)
agg_new <- data.frame(offense_name = "Aggravated Assault",
                      data_year = 2023,
                      yearly_offender_count = 1368)

aggravated_count <- rbind(aggravated_count, agg_new)
view(aggravated_count)
pred_new <- rbind(SimAssault_new, int_new ,agg_new )
pred_new
##         offense_name data_year yearly_offender_count
## 1     Simple Assault      2023                  2717
## 2       Intimidation      2023                  2534
## 3 Aggravated Assault      2023                  1368
yearly_counts2023 <- rbind(aggravated_count,intimidation_count, SimAssault_count)
yearly_counts2023
## # A tibble: 99 × 3
## # Groups:   offense_name [3]
##    offense_name       data_year yearly_offender_count
##    <chr>                  <dbl>                 <dbl>
##  1 Aggravated Assault      1991                  1490
##  2 Aggravated Assault      1992                  2053
##  3 Aggravated Assault      1993                  2299
##  4 Aggravated Assault      1994                  1452
##  5 Aggravated Assault      1995                  1726
##  6 Aggravated Assault      1996                  1788
##  7 Aggravated Assault      1997                  1644
##  8 Aggravated Assault      1998                  1466
##  9 Aggravated Assault      1999                  1393
## 10 Aggravated Assault      2000                  1530
## # ℹ 89 more rows
# Step 5: Visualize each offense's counts per year and the prediction
ggplot(yearly_counts2023, aes(x = data_year, y = yearly_offender_count, color = offense_name, group = offense_name)) +
  geom_line(size = 1) +
  geom_point(size = 2) +
  geom_point(data = pred_new, aes(x = data_year, y = yearly_offender_count), shape = 17, size = 3) +
  geom_text(data = pred_new, aes(x = data_year, y = yearly_offender_count, label = round(yearly_offender_count, 0)), vjust = -0.5) +
  labs(title = "Yearly Counts of Top 3 Offenses with Predictions",
       x = "Year",
       y = "Offender Count",
       color = "Offense Name") +
  theme_minimal() +
  theme(plot.title = element_text(hjust = 0.5))
## Warning: Using `size` aesthetic for lines was deprecated in ggplot2 3.4.0.
## ℹ Please use `linewidth` instead.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.