This project uses a crime dataset from 1991 to 2022. The most common crimes during this time are Aggravated Assault, Intimidation, and Simple Assault. By using the Random Forest method, we aim to predict how often these crimes will happen in 2023. This analysis will help understand crime trends and support decisions for law enforcement and policy making.
if(!require(tidyverse))install.packages("tidyverse")
## Loading required package: tidyverse
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr 1.1.4 ✔ readr 2.1.5
## ✔ forcats 1.0.0 ✔ stringr 1.5.1
## ✔ ggplot2 3.5.1 ✔ tibble 3.2.1
## ✔ lubridate 1.9.3 ✔ tidyr 1.3.1
## ✔ purrr 1.0.2
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(tidyverse)
library(randomForest)
## randomForest 4.7-1.1
## Type rfNews() to see new features/changes/bug fixes.
##
## Attaching package: 'randomForest'
## The following object is masked from 'package:dplyr':
##
## combine
## The following object is masked from 'package:ggplot2':
##
## margin
library(caret)
## Loading required package: lattice
##
## Attaching package: 'caret'
## The following object is masked from 'package:purrr':
##
## lift
data <- read.csv("hate_crime.csv")
head(data)
## incident_id data_year ori pug_agency_name pub_agency_unit
## 1 43 1991 AR0350100 Pine Bluff
## 2 44 1991 AR0350100 Pine Bluff
## 3 45 1991 AR0600300 North Little Rock
## 4 46 1991 AR0600300 North Little Rock
## 5 47 1991 AR0670000 Sevier
## 6 3015 1991 AR0040200 Rogers
## agency_type_name state_abbr state_name division_name region_name
## 1 City AR Arkansas West South Central South
## 2 City AR Arkansas West South Central South
## 3 City AR Arkansas West South Central South
## 4 City AR Arkansas West South Central South
## 5 County AR Arkansas West South Central South
## 6 City AR Arkansas West South Central South
## population_group_code population_group_description incident_date
## 1 3 Cities from 50,000 thru 99,999 1991-07-04
## 2 3 Cities from 50,000 thru 99,999 1991-12-24
## 3 3 Cities from 50,000 thru 99,999 1991-07-10
## 4 3 Cities from 50,000 thru 99,999 1991-10-06
## 5 8D Non-MSA counties under 10,000 1991-10-14
## 6 5 Cities from 10,000 thru 24,999 1991-08-31
## adult_victim_count juvenile_victim_count total_offender_count
## 1 NA NA 1
## 2 NA NA 1
## 3 NA NA 1
## 4 NA NA 2
## 5 NA NA 1
## 6 NA NA 1
## adult_offender_count juvenile_offender_count offender_race
## 1 NA NA Black or African American
## 2 NA NA Black or African American
## 3 NA NA Black or African American
## 4 NA NA Black or African American
## 5 NA NA White
## 6 NA NA White
## offender_ethnicity victim_count
## 1 Not Specified 1
## 2 Not Specified 2
## 3 Not Specified 2
## 4 Not Specified 1
## 5 Not Specified 1
## 6 Not Specified 1
## offense_name
## 1 Aggravated Assault
## 2 Aggravated Assault;Destruction/Damage/Vandalism of Property
## 3 Aggravated Assault;Murder and Nonnegligent Manslaughter
## 4 Intimidation
## 5 Intimidation
## 6 Intimidation
## total_individual_victims location_name
## 1 1 Residence/Home
## 2 1 Highway/Road/Alley/Street/Sidewalk
## 3 2 Residence/Home
## 4 1 Residence/Home
## 5 1 School/College
## 6 1 Highway/Road/Alley/Street/Sidewalk
## bias_desc victim_types multiple_offense multiple_bias
## 1 Anti-Black or African American Individual S S
## 2 Anti-White Individual M S
## 3 Anti-White Individual M S
## 4 Anti-White Individual S S
## 5 Anti-Black or African American Individual S S
## 6 Anti-Black or African American Individual S S
top_offenses <- data %>%
group_by(offense_name) %>%
summarize(total_offender_count = sum(total_offender_count, na.rm = TRUE)) %>%
arrange(desc(total_offender_count)) %>%
slice_head(n = 3)
top_offenses
## # A tibble: 3 × 2
## offense_name total_offender_count
## <chr> <int>
## 1 Simple Assault 65440
## 2 Intimidation 63191
## 3 Aggravated Assault 42291
top_offenses_names <- top_offenses$offense_name
top_offenses$offense_name
## [1] "Simple Assault" "Intimidation" "Aggravated Assault"
filtered_data <- data %>%
filter(offense_name %in% top_offenses_names)
head(filtered_data)
## incident_id data_year ori pug_agency_name pub_agency_unit
## 1 43 1991 AR0350100 Pine Bluff
## 2 46 1991 AR0600300 North Little Rock
## 3 47 1991 AR0670000 Sevier
## 4 3015 1991 AR0040200 Rogers
## 5 3016 1991 AR0290100 Hope
## 6 3017 1991 AR0350100 Pine Bluff
## agency_type_name state_abbr state_name division_name region_name
## 1 City AR Arkansas West South Central South
## 2 City AR Arkansas West South Central South
## 3 County AR Arkansas West South Central South
## 4 City AR Arkansas West South Central South
## 5 City AR Arkansas West South Central South
## 6 City AR Arkansas West South Central South
## population_group_code population_group_description incident_date
## 1 3 Cities from 50,000 thru 99,999 1991-07-04
## 2 3 Cities from 50,000 thru 99,999 1991-10-06
## 3 8D Non-MSA counties under 10,000 1991-10-14
## 4 5 Cities from 10,000 thru 24,999 1991-08-31
## 5 6 Cities from 2,500 thru 9,999 1991-09-19
## 6 3 Cities from 50,000 thru 99,999 1991-12-23
## adult_victim_count juvenile_victim_count total_offender_count
## 1 NA NA 1
## 2 NA NA 2
## 3 NA NA 1
## 4 NA NA 1
## 5 NA NA 1
## 6 NA NA 1
## adult_offender_count juvenile_offender_count offender_race
## 1 NA NA Black or African American
## 2 NA NA Black or African American
## 3 NA NA White
## 4 NA NA White
## 5 NA NA Black or African American
## 6 NA NA Black or African American
## offender_ethnicity victim_count offense_name total_individual_victims
## 1 Not Specified 1 Aggravated Assault 1
## 2 Not Specified 1 Intimidation 1
## 3 Not Specified 1 Intimidation 1
## 4 Not Specified 1 Intimidation 1
## 5 Not Specified 1 Simple Assault 1
## 6 Not Specified 1 Aggravated Assault 1
## location_name bias_desc
## 1 Residence/Home Anti-Black or African American
## 2 Residence/Home Anti-White
## 3 School/College Anti-Black or African American
## 4 Highway/Road/Alley/Street/Sidewalk Anti-Black or African American
## 5 Highway/Road/Alley/Street/Sidewalk Anti-White
## 6 Service/Gas Station Anti-White
## victim_types multiple_offense multiple_bias
## 1 Individual S S
## 2 Individual S S
## 3 Individual S S
## 4 Individual S S
## 5 Individual S S
## 6 Individual S S
yearly_counts <- filtered_data %>%
group_by(offense_name, data_year) %>%
summarize(yearly_offender_count = sum(total_offender_count, na.rm = TRUE)) %>%
arrange(offense_name, data_year)
## `summarise()` has grouped output by 'offense_name'. You can override using the
## `.groups` argument.
yearly_counts
## # A tibble: 96 × 3
## # Groups: offense_name [3]
## offense_name data_year yearly_offender_count
## <chr> <int> <int>
## 1 Aggravated Assault 1991 1490
## 2 Aggravated Assault 1992 2053
## 3 Aggravated Assault 1993 2299
## 4 Aggravated Assault 1994 1452
## 5 Aggravated Assault 1995 1726
## 6 Aggravated Assault 1996 1788
## 7 Aggravated Assault 1997 1644
## 8 Aggravated Assault 1998 1466
## 9 Aggravated Assault 1999 1393
## 10 Aggravated Assault 2000 1530
## # ℹ 86 more rows
SimAssault_count <- yearly_counts %>% filter(offense_name == "Simple Assault")
SimAssault_count
## # A tibble: 32 × 3
## # Groups: offense_name [1]
## offense_name data_year yearly_offender_count
## <chr> <int> <int>
## 1 Simple Assault 1991 1561
## 2 Simple Assault 1992 2430
## 3 Simple Assault 1993 2407
## 4 Simple Assault 1994 1713
## 5 Simple Assault 1995 2200
## 6 Simple Assault 1996 2109
## 7 Simple Assault 1997 2074
## 8 Simple Assault 1998 2164
## 9 Simple Assault 1999 2181
## 10 Simple Assault 2000 1996
## # ℹ 22 more rows
X <- SimAssault_count$data_year
y <- SimAssault_count$yearly_offender_count
data_model <- data.frame(X, y)
set.seed(42)
train_index <- createDataPartition(y, p = 0.8, list = FALSE)
train_data <- data_model[train_index, ]
test_data <- data_model[-train_index, ]
The first line of code sets a random seed using set.seed(42) to ensure reproducibility. This ensures that the random processes in the code (like splitting data into training and testing sets) produce the same results every time you run the code. It is just like using the same starting point for a random number generator so that the sequence of random numbers it produces is always the same.
Splits the data into training (80%) and testing (20%) sets using createDataPartition. The indices of the training data are stored in train_index. The training and testing data are stored in train_data and test_data, respectively.
X_train <- train_data$X
y_train <- train_data$y
X_test <- test_data$X
y_test <- test_data$y
These lines of code separate the features (X_train and X_test) and labels (y_train and y_test) for both training and testing sets.
1 X_train <- train_data$X: Extracts the feature (year) from the training data and assigns it to X_train.
2 y_train <- train_data$y: Extracts the label (number of “Damage” crimes) from the training data and assigns it to y_train.
3 X_test <- test_data$X: Extracts the feature (year) from the testing data and assigns it to X_test.
4 y_test <- test_data$y: Extracts the label (number of “Damage” crimes) from the testing data and assigns it to y_test.
These lines of code separate the input data (features) from the output data (labels) for both the training and testing datasets. This separation is essential for training a machine learning model and then evaluating its performance on new data.
model <- randomForest(y ~ X, data = train_data, ntree = 100, set.seed = 42)
This line of code trains a Random Forest model using randomForest. The formula y ~ X specifies that y is the dependent variable and X is the independent variable. The model is trained on train_data with 100 trees (ntree = 100). The random seed ensures reproducibility.
y_pred <- predict(model, newdata = data.frame(X = X_test))
This line of code uses the trained model to predict the y values for the X_test data. The predictions are stored in y_pred.
mse <- mean((y_test - y_pred)^2)
print(paste("Mean Squared Error:", mse))
## [1] "Mean Squared Error: 132544.287086563"
This line of code calculates the Mean Squared Error (MSE) between the actual y_test values and the predicted y_pred values to evaluate the model’s performance. The MSE is printed.
year_2023 <- data.frame(X = 2023)
prediction_2023 <- predict(model, year_2023)
print(paste("Predicted frequency of Simple Assault in 2023:", round(prediction_2023)))
## [1] "Predicted frequency of Simple Assault in 2023: 2717"
Creates a data frame year_2023 with the year 2023. It uses the trained model to predict the frequency of “Aggravated Assault” occurrences for the year 2023.
The last line of code outputs the predicted frequency of Simple Assault occurrences for the year 2023. The prediction of occurrences of Simple Assault in 2023 is 2717.
This final output provides insight into future trends based on historical data.
intimidation_count <- yearly_counts %>% filter(offense_name == "Intimidation")
intimidation_count
## # A tibble: 32 × 3
## # Groups: offense_name [1]
## offense_name data_year yearly_offender_count
## <chr> <int> <int>
## 1 Intimidation 1991 1286
## 2 Intimidation 1992 2005
## 3 Intimidation 1993 2080
## 4 Intimidation 1994 1894
## 5 Intimidation 1995 2574
## 6 Intimidation 1996 2704
## 7 Intimidation 1997 2777
## 8 Intimidation 1998 2382
## 9 Intimidation 1999 2187
## 10 Intimidation 2000 2349
## # ℹ 22 more rows
X <- intimidation_count$data_year
y <- intimidation_count$yearly_offender_count
data_model <- data.frame(X, y)
set.seed(42)
train_index <- createDataPartition(y, p = 0.8, list = FALSE)
train_data <- data_model[train_index, ]
test_data <- data_model[-train_index, ]
X_train <- train_data$X
y_train <- train_data$y
X_test <- test_data$X
y_test <- test_data$y
model <- randomForest(y ~ X, data = train_data, ntree = 100, set.seed = 42)
y_pred <- predict(model, newdata = data.frame(X = X_test))
mse <- mean((y_test - y_pred)^2)
print(paste("Mean Squared Error:", mse))
## [1] "Mean Squared Error: 47856.542985236"
year_2023 <- data.frame(X = 2023)
prediction_2023 <- predict(model, year_2023)
print(paste("Predicted frequency of Intimidation in 2023:", round(prediction_2023)))
## [1] "Predicted frequency of Intimidation in 2023: 2534"
aggravated_count <- yearly_counts %>% filter(offense_name == "Aggravated Assault")
aggravated_count
## # A tibble: 32 × 3
## # Groups: offense_name [1]
## offense_name data_year yearly_offender_count
## <chr> <int> <int>
## 1 Aggravated Assault 1991 1490
## 2 Aggravated Assault 1992 2053
## 3 Aggravated Assault 1993 2299
## 4 Aggravated Assault 1994 1452
## 5 Aggravated Assault 1995 1726
## 6 Aggravated Assault 1996 1788
## 7 Aggravated Assault 1997 1644
## 8 Aggravated Assault 1998 1466
## 9 Aggravated Assault 1999 1393
## 10 Aggravated Assault 2000 1530
## # ℹ 22 more rows
X <- aggravated_count$data_year
y <- aggravated_count$yearly_offender_count
data_model <- data.frame(X, y)
set.seed(42)
train_index <- createDataPartition(y, p = 0.8, list = FALSE)
train_data <- data_model[train_index, ]
test_data <- data_model[-train_index, ]
X_train <- train_data$X
y_train <- train_data$y
X_test <- test_data$X
y_test <- test_data$y
model <- randomForest(y ~ X, data = train_data, ntree = 100, set.seed = 42)
y_pred <- predict(model, newdata = data.frame(X = X_test))
mse <- mean((y_test - y_pred)^2)
print(paste("Mean Squared Error:", mse))
## [1] "Mean Squared Error: 29063.5534727083"
year_2023 <- data.frame(X = 2023)
prediction_2023 <- predict(model, year_2023)
print(paste("Predicted frequency of Aggravated Assault in 2023:", round(prediction_2023)))
## [1] "Predicted frequency of Aggravated Assault in 2023: 1368"
SimAssault_new <- data.frame(offense_name = "Simple Assault",
data_year = 2023,
yearly_offender_count = 2717)
SimAssault_count <- rbind(SimAssault_count, SimAssault_new)
SimAssault_count
## # A tibble: 33 × 3
## # Groups: offense_name [1]
## offense_name data_year yearly_offender_count
## <chr> <dbl> <dbl>
## 1 Simple Assault 1991 1561
## 2 Simple Assault 1992 2430
## 3 Simple Assault 1993 2407
## 4 Simple Assault 1994 1713
## 5 Simple Assault 1995 2200
## 6 Simple Assault 1996 2109
## 7 Simple Assault 1997 2074
## 8 Simple Assault 1998 2164
## 9 Simple Assault 1999 2181
## 10 Simple Assault 2000 1996
## # ℹ 23 more rows
int_new <- data.frame(offense_name = "Intimidation",
data_year = 2023,
yearly_offender_count = 2534)
intimidation_count <- rbind(intimidation_count, int_new)
agg_new <- data.frame(offense_name = "Aggravated Assault",
data_year = 2023,
yearly_offender_count = 1368)
aggravated_count <- rbind(aggravated_count, agg_new)
view(aggravated_count)
pred_new <- rbind(SimAssault_new, int_new ,agg_new )
pred_new
## offense_name data_year yearly_offender_count
## 1 Simple Assault 2023 2717
## 2 Intimidation 2023 2534
## 3 Aggravated Assault 2023 1368
yearly_counts2023 <- rbind(aggravated_count,intimidation_count, SimAssault_count)
yearly_counts2023
## # A tibble: 99 × 3
## # Groups: offense_name [3]
## offense_name data_year yearly_offender_count
## <chr> <dbl> <dbl>
## 1 Aggravated Assault 1991 1490
## 2 Aggravated Assault 1992 2053
## 3 Aggravated Assault 1993 2299
## 4 Aggravated Assault 1994 1452
## 5 Aggravated Assault 1995 1726
## 6 Aggravated Assault 1996 1788
## 7 Aggravated Assault 1997 1644
## 8 Aggravated Assault 1998 1466
## 9 Aggravated Assault 1999 1393
## 10 Aggravated Assault 2000 1530
## # ℹ 89 more rows
# Step 5: Visualize each offense's counts per year and the prediction
ggplot(yearly_counts2023, aes(x = data_year, y = yearly_offender_count, color = offense_name, group = offense_name)) +
geom_line(size = 1) +
geom_point(size = 2) +
geom_point(data = pred_new, aes(x = data_year, y = yearly_offender_count), shape = 17, size = 3) +
geom_text(data = pred_new, aes(x = data_year, y = yearly_offender_count, label = round(yearly_offender_count, 0)), vjust = -0.5) +
labs(title = "Yearly Counts of Top 3 Offenses with Predictions",
x = "Year",
y = "Offender Count",
color = "Offense Name") +
theme_minimal() +
theme(plot.title = element_text(hjust = 0.5))
## Warning: Using `size` aesthetic for lines was deprecated in ggplot2 3.4.0.
## ℹ Please use `linewidth` instead.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.