1. Project Overview

This document represents Part 1 of our Statistical Learning final project, which focuses on: - Cleaning and preparing FBI hate-crime datasets (2015–2025) - Ensuring consistent bias categories across time and across National vs. New Jersey data - Constructing a unified monthly time series dataset - Aggregating annual totals - Conducting exploratory data analysis (EDA), including: 1. Annual bias-category trends 2. Overall annual totals 3. Pre/Post COVID comparisons 4. Correlation heatmaps

The data comes from the FBI Crime Data Explorer, downloaded and separated into: - Bias category totals (4 files) - Monthly hate-crime totals (4 files) We combine all eight datasets into a single analytical structure.

2. Load Data Files

# Bias-category total files

bias_files <- c(
"feb2015-feb2020_national_biascategories.csv",
"march2020-march2025_national_biascategories.csv",
"feb2015-feb2020_NJ_biascategories.csv",
"march2020-march2025_NJ_biascategories.csv"
)

# Monthly total hate-crime files

monthly_files <- c(
"feb2015-feb2020_national_hatecrimes.csv",
"march2020-march2025_national_hatecrimes.csv",
"feb2015-feb2020_NJ_hatecrimes.csv",
"march2020-march2025_NJ_hatecrimes.csv"
)

3. Wrangling Monthly Hate-Crime Totals

We reshape each dataset from wide format (Month-Year as column names) to long format and convert to actual dates.

process_monthly_totals <- function(file_path) {

location <- ifelse(str_detect(file_path, "national"), "National", "New Jersey")
period <- ifelse(str_detect(file_path, "feb2015"),
"Pre-COVID (2015–2020)",
"Post-COVID (2020–2025)")

df <- read_csv(file_path)

df_clean <- df %>%
select(-1) %>%
pivot_longer(
cols = everything(),
names_to = "Month_Year",
values_to = "Monthly_Total_Count"
) %>%
mutate(
Location = location,
Period = period,
Date = my(Month_Year)
) %>%
filter(!is.na(Date)) %>%
group_by(Location, Period) %>%
mutate(Period_Total_Count_Monthly = sum(Monthly_Total_Count, na.rm = TRUE)) %>%
ungroup()

df_clean
}

df_monthly_totals <- map_dfr(monthly_files, process_monthly_totals)

4. Wrangling Bias-Category Totals

We pivot all bias categories long and compute proportions within each period (needed for proportional allocation into monthly estimates).

process_bias_totals <- function(file_path) {

location <- ifelse(str_detect(file_path, "national"), "National", "New Jersey")
period <- ifelse(str_detect(file_path, "feb2015"),
"Pre-COVID (2015–2020)",
"Post-COVID (2020–2025)")

df <- read_csv(file_path)

df_long <- df %>%
pivot_longer(
cols = everything(),
names_to = "Bias_Category",
values_to = "Bias_Category_Total"
) %>%
mutate(
Bias_Category = str_trim(str_remove_all(Bias_Category, '"')),
Location = location,
Period = period,
Period_Total_Count_Bias = sum(Bias_Category_Total, na.rm = TRUE),
Bias_Proportion = Bias_Category_Total / Period_Total_Count_Bias
)

df_long
}

df_bias_totals <- map_dfr(bias_files, process_bias_totals)

5. Merge Data & Construct Monthly Bias-Type Estimates

We merge monthly totals with proportional bias data, generating a monthly time series for all bias categories.

df_merged <- inner_join(
df_monthly_totals,
df_bias_totals,
by = c("Location", "Period")
)

df_final_time_series <- df_merged %>%
mutate(
Estimated_Monthly_Count = Monthly_Total_Count * Bias_Proportion,
Year = year(Date),
Month = month(Date, label = TRUE)
) %>%
select(Date, Year, Month, Location, Period, Bias_Category, Estimated_Monthly_Count)

write_csv(df_final_time_series, "full_monthly_bias_time_series.csv")

6. Aggregating to Annual Totals

This dataset is used for trend lines and pre/post comparison.

df_annual_bias <- df_final_time_series %>%
group_by(Location, Year, Bias_Category) %>%
summarise(
Annual_Count = sum(Estimated_Monthly_Count, na.rm = TRUE),
.groups = "drop"
)

write_csv(df_annual_bias, "annual_bias_trends.csv")

7. Exploratory Data Analysis (EDA)

7.1 Annual Trend plots by Bias Category

7.2 Overall Annual Hate Crime Totals - All Biases Combined

df_overall_trends <- df_final_time_series %>%
group_by(Location, Year) %>%
summarise(
Annual_Total = sum(Estimated_Monthly_Count, na.rm = TRUE),
.groups = "drop"
)

a) National Total Hate Crimes

df_overall_trends %>%
filter(Location == "National") %>%
ggplot(aes(Year, Annual_Total)) +
geom_line(size = 1, color = "#1f78b4") +
geom_point(size = 2, color = "#1f78b4") +
geom_vline(xintercept = 2020, linetype = "dashed") +
labs(
title = "National Total Hate Crimes per Year (2015–2025)",
x = "Year", y = "Total Estimated Hate Crimes"
) +
scale_y_continuous(labels = comma) +
theme_minimal()

b) New Jersey Total Hate Crimes

df_overall_trends %>%
filter(Location == "New Jersey") %>%
ggplot(aes(Year, Annual_Total)) +
geom_line(size = 1, color = "#e31a1c") +
geom_point(size = 2, color = "#e31a1c") +
geom_vline(xintercept = 2020, linetype = "dashed") +
labs(
title = "New Jersey Total Hate Crimes per Year (2015–2025)",
x = "Year", y = "Total Estimated Hate Crimes"
) +
scale_y_continuous(labels = comma) +
theme_minimal()

c) National vs. New Jersey Total Hate Crimes — Direct Comparison

df_overall_trends %>%
ggplot(aes(x = Year, y = Annual_Total, color = Location)) +
geom_line(size = 1.2) +
geom_point(size = 2) +
geom_vline(xintercept = 2020, linetype = "dashed") +
scale_color_manual(values = c("National" = "#1f78b4", "New Jersey" = "#e31a1c")) +
labs(
title = "Comparison of National vs. New Jersey Total Hate Crimes (2015–2025)",
x = "Year",
y = "Total Estimated Hate Crimes",
color = "Location"
) +
scale_y_continuous(labels = comma) +
theme_minimal() +
theme(legend.position = "bottom")

7.3 Pre/Post COVID Percentage Changes

df_change <- df_annual_bias %>%
mutate(Period_Group = ifelse(Year < 2020, "Pre_COVID", "Post_COVID")) %>%
group_by(Location, Bias_Category, Period_Group) %>%
summarise(Total = sum(Annual_Count), .groups = "drop") %>%
pivot_wider(names_from = Period_Group, values_from = Total) %>%
mutate(
Absolute_Change = Post_COVID - Pre_COVID,
Percent_Change = (Absolute_Change / Pre_COVID) * 100
) %>%
arrange(Location, desc(Percent_Change))

df_change
## # A tibble: 12 × 6
##    Location   Bias_Category  Post_COVID Pre_COVID Absolute_Change Percent_Change
##    <chr>      <chr>               <dbl>     <dbl>           <dbl>          <dbl>
##  1 National   Gender Identi…      5099.    1658.           3441.           208. 
##  2 National   Gender              1379.     624.            756.           121. 
##  3 National   Race/Ethnicit…     74304.   41988.          32315.            77.0
##  4 National   Sexual Orient…     21862.   12555.           9307.            74.1
##  5 National   Religion           25734.   16047.           9687.            60.4
##  6 National   Disability          2022.    1396.            626.            44.9
##  7 New Jersey Gender Identi…       401.      46.4           354.           763. 
##  8 New Jersey Disability           123.      37.8            84.8          224. 
##  9 New Jersey Sexual Orient…      1543.     573.            971.           169. 
## 10 New Jersey Race/Ethnicit…      7104.    2764.           4340.           157. 
## 11 New Jersey Gender               101.      39.6            61.4          155. 
## 12 New Jersey Religion            3062.    1887.           1175.            62.3

7.4 Correlation Heatmaps of Bias Types

calculate_correlation_data <- function(location_name, df) {
df %>%
filter(Location == location_name) %>%
pivot_wider(id_cols = Date, names_from = Bias_Category, values_from = Estimated_Monthly_Count, values_fn = sum) %>%
select(-Date) %>%
cor(use = "complete.obs") %>%
melt(value.name = "Correlation") %>%
rename(Bias_Category_1 = Var1, Bias_Category_2 = Var2) %>%
mutate(Location = location_name)
}

df_cor_all <- bind_rows(
calculate_correlation_data("National", df_final_time_series),
calculate_correlation_data("New Jersey", df_final_time_series)
)

ggplot(df_cor_all, aes(x = Bias_Category_1, y = Bias_Category_2, fill = Correlation)) +
geom_tile() +
geom_text(aes(label = round(Correlation, 2)), size = 3) +
facet_wrap(~ Location) +
scale_fill_gradient2(low = "blue", high = "red", mid = "white", midpoint = 0) +
labs(
title = "Correlation Heatmaps of Monthly Hate Crime Counts by Bias Type",
subtitle = "National vs. New Jersey (2015–2025)"
) +
theme_minimal() +
theme(axis.text.x = element_text(angle = 45, hjust = 1)) +
coord_fixed()

8. Time Series Trend Models National Monthly Hate Crimes 2015 - 2025

Modeling: Monthly Total Hate Crimes=β0 + β1 (time) + ϵ

library(forecast)
library(broom)

# Prepare total monthly counts
df_monthly_total <- df_final_time_series %>%
  group_by(Location, Date) %>%
  summarise(Monthly_Total = sum(Estimated_Monthly_Count), .groups = "drop")

#Linear Trend Model -- National level
national_ts <- df_monthly_total %>%
  filter(Location == "National") %>%
  arrange(Date)

national_ts$time_index <- 1:nrow(national_ts)

model_linear <- lm(Monthly_Total ~ time_index, data = national_ts)
summary(model_linear)
## 
## Call:
## lm(formula = Monthly_Total ~ time_index, data = national_ts)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -726.28 -205.88  -35.64  124.96 2354.42 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept) 975.0710    65.1540   14.97   <2e-16 ***
## time_index   11.4232     0.9193   12.43   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 357.6 on 120 degrees of freedom
## Multiple R-squared:  0.5627, Adjusted R-squared:  0.559 
## F-statistic: 154.4 on 1 and 120 DF,  p-value: < 2.2e-16

slope = time_index = 11.42 - This means that hate crimes are increasing by about 11.4 incidents per month on average across the entire 10 year period. - This is a steady upward trend, not flat or random - This corresponds to 11.4 * 12 = 137 additional incidents per year

The p-value of the slope is < 2e-16 - this means it is extremely statistical significant

R^2 = 0.563 - This means about 56% of the variation in monthly hate crime totals is explained by time alone - This makes sense for social and crime data because it contains seasonality, event-driven noise (such as COVID, BLM, etc.) and reporting inconsistencies

A simple linear trend explaining > 50% of variation is meaningful

#for new jersey:
nj_ts <- df_monthly_total %>%
  filter(Location == "New Jersey") %>%
  arrange(Date)

nj_ts$time_index <- 1:nrow(nj_ts)

model_linear_nj <- lm(Monthly_Total ~ time_index, data = nj_ts)
summary(model_linear_nj)
## 
## Call:
## lm(formula = Monthly_Total ~ time_index, data = nj_ts)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -157.026  -30.351   -7.418   25.847  305.707 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  50.7813    10.2852   4.937 2.59e-06 ***
## time_index    1.5309     0.1451  10.549  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 56.45 on 120 degrees of freedom
## Multiple R-squared:  0.4811, Adjusted R-squared:  0.4768 
## F-statistic: 111.3 on 1 and 120 DF,  p-value: < 2.2e-16

time_index = 1.53 (p<2e-16) - For each passing month, New Jersey’s hate crime count increases by about 1.5 incidents per month on average - this effect is highly statistically significant - this means that NH crimes show a clear and steady upward trend over the decade

intercept = 50.78 - the estimated starting point of the trend; in the first month the model estimates NJ had about 51 hate crime incidents

R^2 = 0.48 - 48% of the month to month variation in NJ hate crimes data is explained by time alone - this is moderately strong for noisy crime data - the Residual Standard Error is 56.45 which means predictions are typically off by about 56 incidents from the true monthly count - given that NJ’s monthly totals are much smaller than the national totals, this is expected

9. Pre/Post Covid Significance Testing ## a) National: The goal is to see if something changed beginning in 2020 above and beyond the existing time trend.

national_ts <- national_ts %>%
  mutate(Post2020 = ifelse(year(Date) >= 2020, 1, 0))

model_break <- lm(Monthly_Total ~ time_index + Post2020, data = national_ts)
summary(model_break)
## 
## Call:
## lm(formula = Monthly_Total ~ time_index + Post2020, data = national_ts)
## 
## Residuals:
##    Min     1Q Median     3Q    Max 
## -814.7 -174.5    7.7  145.4 2153.6 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept) 1083.973     68.576  15.807  < 2e-16 ***
## time_index     5.826      1.746   3.337 0.001129 ** 
## Post2020     455.673    123.026   3.704 0.000323 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 340 on 119 degrees of freedom
## Multiple R-squared:  0.6079, Adjusted R-squared:  0.6013 
## F-statistic: 92.24 on 2 and 119 DF,  p-value: < 2.2e-16

Intercept = 1083.91 - the baseline monthly hate crime count at time_index = 0 (the first month in data)

Time_Index (slope) = 5.823 (p = 0.011) - This means that before accounting for 2020, the monthly total was already increasing over time and on average, hate crimes increased by about 5.8 incidents per month per year - Upward trend exists before 2020

Post2020 = 455.67 (p = 0.0003) - This positive, significant coefficient means that starting in 2020, monthly hate crimes jumped by an additional ~456 incidents independent of the underlying time trend - So there is a statistically significant level shift upward in 2020 - This supports the idea of a structural break due to COVID and the political climate

library(dplyr)
library(ggplot2)

# Prepare national data
nat <- df_monthly_total %>%
  filter(Location == "National") %>%
  arrange(Date) %>%
  mutate(
    time_index = 1:n(),
    Post2020 = ifelse(lubridate::year(Date) >= 2020, 1, 0)
  )

# Fit break model
nat$Fitted <- predict(model_break)

# Plot structural break
ggplot(nat, aes(x = Date)) +
  geom_line(aes(y = Monthly_Total), color = "black", alpha = 0.7) +
  geom_line(aes(y = Fitted), color = "red", size = 1) +
  geom_vline(xintercept = as.Date("2020-01-01"), linetype = "dashed") +
  labs(
    title = "Structural Break in National Hate Crimes (2020)",
    y = "Monthly Hate Crime Count",
    x = "Date"
  ) +
  theme_minimal()

- ANOVA (f test)

This shows that there is indeed a spike around COVID, which we knew about but I want to test if there is a signficant increase in trend before and after COVID. To do this, I need to create an interaction term and do a F-test.

national_ts <- national_ts %>%
  mutate(
    Post2020 = ifelse(year(Date) >= 2020, 1, 0),
    time_post2020 = time_index * Post2020   # interaction term
  )

model_slope_change <- lm(Monthly_Total ~ time_index + Post2020 + time_post2020,
                         data = national_ts)

summary(model_slope_change)
## 
## Call:
## lm(formula = Monthly_Total ~ time_index + Post2020 + time_post2020, 
##     data = national_ts)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -853.60 -152.05    7.57  147.75 2117.53 
## 
## Coefficients:
##               Estimate Std. Error t value Pr(>|t|)    
## (Intercept)   1033.235     89.766  11.510  < 2e-16 ***
## time_index       7.518      2.602   2.889  0.00460 ** 
## Post2020       632.820    236.541   2.675  0.00853 ** 
## time_post2020   -3.080      3.512  -0.877  0.38219    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 340.4 on 118 degrees of freedom
## Multiple R-squared:  0.6104, Adjusted R-squared:  0.6005 
## F-statistic: 61.63 on 3 and 118 DF,  p-value: < 2.2e-16
model_level_only <- lm(Monthly_Total ~ time_index + Post2020,
                       data = national_ts)

anova(model_level_only, model_slope_change)
## Analysis of Variance Table
## 
## Model 1: Monthly_Total ~ time_index + Post2020
## Model 2: Monthly_Total ~ time_index + Post2020 + time_post2020
##   Res.Df      RSS Df Sum of Sq      F Pr(>F)
## 1    119 13760272                           
## 2    118 13671133  1     89139 0.7694 0.3822

There is no statistical evidence that the slope (trend) of hate crimes changed after 2020.

Even though 2020 had a jump (level shift), the rate of increase per month after 2020 is not significantly different from the rate before 2020. There is a clear jump in hate crime totals in 2020.

However, when testing whether the trend changed — that is, whether hate crimes began increasing faster after COVID — the statistical evidence does not support a significant change in slope (F-test p = 0.38). This means the spike reflects a one-time level shift, not an ongoing acceleration.

b) New Jersey:

#for new jersey:
model_break_nj <- lm(Monthly_Total ~ time_index + I(year(Date) >= 2020), data = nj_ts)
summary(model_break_nj)
## 
## Call:
## lm(formula = Monthly_Total ~ time_index + I(year(Date) >= 2020), 
##     data = nj_ts)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -145.445  -28.344   -6.399   25.922  285.068 
## 
## Coefficients:
##                           Estimate Std. Error t value Pr(>|t|)    
## (Intercept)                61.9764    11.1791   5.544 1.81e-07 ***
## time_index                  0.9556     0.2846   3.358  0.00106 ** 
## I(year(Date) >= 2020)TRUE  46.8427    20.0553   2.336  0.02118 *  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 55.43 on 119 degrees of freedom
## Multiple R-squared:  0.5039, Adjusted R-squared:  0.4956 
## F-statistic: 60.43 on 2 and 119 DF,  p-value: < 2.2e-16

time_index = 0.96 (p = 0.001) - before and after 2020, NJ hate crimes increased by about 1 incident per month per year - NJ has a slow by steady long term upward trend - this is statistically significant

post2020 indicator = 46.84 (p = 0.021) - beginning in 2020, NJ saw an average jump of about 47 additional hate crime incidents per month, above what the trend alone would predict - this is statistically significant, so it is very unlikely to be random - clear evidence of a structural break due to COVID and political climat

intercept - 61.98 - the baseline before trend and break are applied

Model Fit - R^2 = 0.504 –> model explains 50% of variation - Residual SE = 55.43 –> typical error is about 55 incidents - This is an improved fit compared to the trend only model (r^2 was 0.48 before adding the break) - Thus Adding the post 2020 dummy improves the model

# Prepare NJ data
nj <- df_monthly_total %>%
  filter(Location == "New Jersey") %>%
  arrange(Date) %>%
  mutate(
    time_index = 1:n(),
    Post2020 = ifelse(lubridate::year(Date) >= 2020, 1, 0)
  )

# Fit break model
nj$Fitted <- predict(model_break_nj)

# Plot structural break
ggplot(nj, aes(x = Date)) +
  geom_line(aes(y = Monthly_Total), color = "brown", alpha = 0.7) +
  geom_line(aes(y = Fitted), color = "red", size = 1) +
  geom_vline(xintercept = as.Date("2020-01-01"), linetype = "dashed") +
  labs(
    title = "Structural Break in New Jersey Hate Crimes (2020)",
    y = "Monthly Hate Crime Count",
    x = "Date"
  ) +
  theme_minimal()

- ANOVA (f test)

nj_ts <- nj_ts %>%
  mutate(
    Post2020 = ifelse(year(Date) >= 2020, 1, 0),
    time_post2020 = time_index * Post2020   # interaction term
  )

model_slope_change_nj <- lm(Monthly_Total ~ time_index + Post2020 + time_post2020,
                         data = nj_ts)

summary(model_slope_change_nj)
## 
## Call:
## lm(formula = Monthly_Total ~ time_index + Post2020 + time_post2020, 
##     data = nj_ts)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -120.572  -30.439   -1.667   24.946  257.630 
## 
## Coefficients:
##               Estimate Std. Error t value Pr(>|t|)    
## (Intercept)    23.4307    13.6090   1.722   0.0877 .  
## time_index      2.2404     0.3945   5.679 9.90e-08 ***
## Post2020      181.4200    35.8611   5.059 1.56e-06 ***
## time_post2020  -2.3401     0.5324  -4.395 2.43e-05 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 51.6 on 118 degrees of freedom
## Multiple R-squared:  0.5737, Adjusted R-squared:  0.5628 
## F-statistic: 52.93 on 3 and 118 DF,  p-value: < 2.2e-16
model_level_only_nj <- lm(Monthly_Total ~ time_index + Post2020,
                       data = nj_ts)

anova(model_level_only_nj, model_slope_change_nj)
## Analysis of Variance Table
## 
## Model 1: Monthly_Total ~ time_index + Post2020
## Model 2: Monthly_Total ~ time_index + Post2020 + time_post2020
##   Res.Df    RSS Df Sum of Sq      F    Pr(>F)    
## 1    119 365670                                  
## 2    118 314224  1     51445 19.319 2.432e-05 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

New Jersey experienced a statistically significant change in its hate-crime trend after 2020. Although there was a sharp increase in 2020, the month-to-month slope decreased significantly afterward. The F-test shows that including a post-2020 slope term substantially improves the model (p = 2.4e−05), indicating that the long-term trend shifted direction, not just the level.

. NJ shows a significant change in slope after 2020

The coefficient for time_post2020 is −2.34 and highly significant (p = 2.4e−05).

This means: After 2020, the month-to-month trend decreased relative to the pre-2020 trend, even though the overall post-2020 level jumped (Post2020 = +181 and significant).

  1. The F-test confirms this is a statistically meaningful change

ANOVA F = 19.3, p = 2.4e−05 → very strong evidence that adding the slope-change term improves the model.

In simple English:

“There is strong evidence that the trajectory of NJ hate crime reports changed after 2020, not just the level.”

  1. The story for NJ is: a spike in 2020 + a trend reversal

Pre-2020 trend: +2.24 incidents per month (significant)

Level change at 2020: +181 (significant)

Post-2020 trend change: −2.34 per month

So the post-2020 trend is:

2.24 – 2.34 ≈ −0.1 incidents/month (flat/slightly downward)

This means NJ saw:

• A large spike in 2020 • But afterward, the slope flattened or turned slightly downward

This is a different story from the national results.

10. ARIMA Regression on Annual Counts

#ARIMA Time series model
ts_data <- ts(national_ts$Monthly_Total, start = c(2015, 2), frequency = 12)

arima_model <- auto.arima(ts_data)
summary(arima_model)
## Series: ts_data 
## ARIMA(0,1,2)(0,0,2)[12] with drift 
## 
## Coefficients:
##           ma1      ma2    sma1    sma2   drift
##       -0.5308  -0.3713  0.2046  0.1385  9.5747
## s.e.   0.0821   0.0846  0.0912  0.0850  4.1479
## 
## sigma^2 = 94333:  log likelihood = -863.24
## AIC=1738.48   AICc=1739.21   BIC=1755.25
## 
## Training set error measures:
##                    ME     RMSE      MAE       MPE    MAPE      MASE       ACF1
## Training set 2.067934 299.4891 171.7147 -1.746076 10.0339 0.6509726 0.07035696
forecast_2026 <- forecast(arima_model, h = 12)
autoplot(forecast_2026) + labs(title = "ARIMA Forecast for National Hate Crimes")

auto.arima() selected ARIMA(0,1,2)(0,0,2)[12] with drift. This translates to: non-seasonal: - 0 AR terms - 1 difference –> data has strong trend - 2 MA terms –> short term shock smoothing seasonal (period = 12 months) - 0 AR - 0 Differencing - 2 MA seasonal terms –> addresses repeating annual patterns With drift - The model includes a constant trend upward

This is a common model for data with persistent upward movement, seasonal fluctuations, no clear auto-regressive structure and noise best handled by MA terms.

Drift = 9.57 (SE = 4.15) - the average monthly increase in hate crimes after differencing - hate crimes are increasing by about 9-10 incidents per month on average over time (consistent with earlier models)

2026 Forecast - shows upward trend of about 10 additional hate crimes per month - growing uncertainty shown by wider prediction intervals

For new jersey:

nj_ts <- df_monthly_total %>%
  filter(Location == "New Jersey") %>%
  arrange(Date)

nj_ts$time_index <- 1:nrow(nj_ts)

model_linear_nj <- lm(Monthly_Total ~ time_index, data = nj_ts)
summary(model_linear_nj)
## 
## Call:
## lm(formula = Monthly_Total ~ time_index, data = nj_ts)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -157.026  -30.351   -7.418   25.847  305.707 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  50.7813    10.2852   4.937 2.59e-06 ***
## time_index    1.5309     0.1451  10.549  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 56.45 on 120 degrees of freedom
## Multiple R-squared:  0.4811, Adjusted R-squared:  0.4768 
## F-statistic: 111.3 on 1 and 120 DF,  p-value: < 2.2e-16
model_break_nj <- lm(Monthly_Total ~ time_index + I(year(Date) >= 2020), data = nj_ts)
summary(model_break_nj)
## 
## Call:
## lm(formula = Monthly_Total ~ time_index + I(year(Date) >= 2020), 
##     data = nj_ts)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -145.445  -28.344   -6.399   25.922  285.068 
## 
## Coefficients:
##                           Estimate Std. Error t value Pr(>|t|)    
## (Intercept)                61.9764    11.1791   5.544 1.81e-07 ***
## time_index                  0.9556     0.2846   3.358  0.00106 ** 
## I(year(Date) >= 2020)TRUE  46.8427    20.0553   2.336  0.02118 *  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 55.43 on 119 degrees of freedom
## Multiple R-squared:  0.5039, Adjusted R-squared:  0.4956 
## F-statistic: 60.43 on 2 and 119 DF,  p-value: < 2.2e-16

11. Per-Bias Category Time Series

race

race_ts <- df_final_time_series %>%
  filter(Location == "National", Bias_Category == "Race/Ethnicity/Ancestry") %>%
  arrange(Date)

race_ts$time_index <- 1:nrow(race_ts)

lm_race <- lm(Estimated_Monthly_Count ~ time_index, data = race_ts)
summary(lm_race)
## 
## Call:
## lm(formula = Estimated_Monthly_Count ~ time_index, data = race_ts)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -239.45  -63.24  -11.16   43.96 1010.44 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept) 275.7165    14.6742   18.79   <2e-16 ***
## time_index    1.6399     0.1038   15.79   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 114.3 on 242 degrees of freedom
## Multiple R-squared:  0.5075, Adjusted R-squared:  0.5055 
## F-statistic: 249.4 on 1 and 242 DF,  p-value: < 2.2e-16

religion

religon_ts <- df_final_time_series %>%
  filter(Location == "National", Bias_Category == "Religion") %>%
  arrange(Date)

religon_ts$time_index <- 1:nrow(religon_ts)

lm_religion <- lm(Estimated_Monthly_Count ~ time_index, data = religon_ts)
summary(lm_religion)
## 
## Call:
## lm(formula = Estimated_Monthly_Count ~ time_index, data = religon_ts)
## 
## Residuals:
##    Min     1Q Median     3Q    Max 
## -80.23 -21.35  -3.47  17.18 343.19 
## 
## Coefficients:
##              Estimate Std. Error t value Pr(>|t|)    
## (Intercept) 111.06904    4.99384   22.24   <2e-16 ***
## time_index    0.49112    0.03534   13.90   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 38.88 on 242 degrees of freedom
## Multiple R-squared:  0.4438, Adjusted R-squared:  0.4415 
## F-statistic: 193.1 on 1 and 242 DF,  p-value: < 2.2e-16

gender

gender_ts <- df_final_time_series %>%
  filter(Location == "National", Bias_Category == "Gender") %>%
  arrange(Date)

gender_ts$time_index <- 1:nrow(gender_ts)

lm_gender <- lm(Estimated_Monthly_Count ~ time_index, data = gender_ts)
summary(lm_gender)
## 
## Call:
## lm(formula = Estimated_Monthly_Count ~ time_index, data = gender_ts)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -4.7252 -1.2937 -0.2704  0.7910 19.4592 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept) 3.504391   0.289403   12.11   <2e-16 ***
## time_index  0.038407   0.002048   18.75   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 2.253 on 242 degrees of freedom
## Multiple R-squared:  0.5924, Adjusted R-squared:  0.5907 
## F-statistic: 351.7 on 1 and 242 DF,  p-value: < 2.2e-16

disability

disability_ts <- df_final_time_series %>%
  filter(Location == "National", Bias_Category == "Disability") %>%
  arrange(Date)

disability_ts$time_index <- 1:nrow(disability_ts)

lm_disability <- lm(Estimated_Monthly_Count ~ time_index, data = disability_ts)
summary(lm_disability)
## 
## Call:
## lm(formula = Estimated_Monthly_Count ~ time_index, data = disability_ts)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -6.4456 -1.7546 -0.2852  1.5975 26.3590 
## 
## Coefficients:
##              Estimate Std. Error t value Pr(>|t|)    
## (Intercept) 10.125544   0.389801   25.98   <2e-16 ***
## time_index   0.031691   0.002759   11.49   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 3.035 on 242 degrees of freedom
## Multiple R-squared:  0.3529, Adjusted R-squared:  0.3502 
## F-statistic:   132 on 1 and 242 DF,  p-value: < 2.2e-16

12. Bias Category Overview

a) Bias Category Comparison - National Level

bias_nat <- df_final_time_series %>%
  filter(Location == "National") %>%
  group_by(Date, Bias_Category) %>%
  summarise(Monthly_Total = sum(Estimated_Monthly_Count), .groups = "drop")

ggplot(bias_nat, aes(x = Date, y = Monthly_Total)) +
  geom_line(color = "steelblue") +
  facet_wrap(~ Bias_Category, scales = "free_y") +
  labs(
    title = "National Hate Crime Trends by Bias Category (2015–2025)",
    x = "Date",
    y = "Monthly Count"
  ) +
  theme_minimal() +
  theme(strip.text = element_text(size = 10))

b) Bias Category Comparison - New Jersey Level

bias_nj <- df_final_time_series %>%
  filter(Location == "New Jersey") %>%
  group_by(Date, Bias_Category) %>%
  summarise(Monthly_Total = sum(Estimated_Monthly_Count), .groups = "drop")

ggplot(bias_nj, aes(x = Date, y = Monthly_Total)) +
  geom_line(color = "steelblue") +
  facet_wrap(~ Bias_Category, scales = "free_y") +
  labs(
    title = "New Jersey Hate Crime Trends by Bias Category (2015–2025)",
    x = "Date",
    y = "Monthly Count"
  ) +
  theme_minimal() +
  theme(strip.text = element_text(size = 10))

c) Bias Category Regression Model (Annual Data)

reg_model <- lm(Annual_Count ~ Year + Location + Bias_Category, 
                data = df_annual_bias)
summary(reg_model)
## 
## Call:
## lm(formula = Annual_Count ~ Year + Location + Bias_Category, 
##     data = df_annual_bias)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -4467.6 -1183.0  -115.6  1148.2  7729.0 
## 
## Coefficients:
##                                        Estimate Std. Error t value Pr(>|t|)    
## (Intercept)                          -132424.84  116851.06  -1.133  0.25928    
## Year                                      66.34      57.85   1.147  0.25367    
## LocationNew Jersey                     -2833.11     365.85  -7.744 2.95e-12 ***
## Bias_CategoryGender                      -65.22     633.68  -0.103  0.91820    
## Bias_CategoryGender Identity             164.84     633.68   0.260  0.79519    
## Bias_CategoryRace/Ethnicity/Ancestry    5571.89     633.68   8.793 1.02e-14 ***
## Bias_CategoryReligion                   1961.43     633.68   3.095  0.00243 ** 
## Bias_CategorySexual Orientation         1497.93     633.68   2.364  0.01964 *  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 2102 on 124 degrees of freedom
## Multiple R-squared:  0.5883, Adjusted R-squared:  0.5651 
## F-statistic: 25.31 on 7 and 124 DF,  p-value: < 2.2e-16
  • interpret the model results
library(dplyr)
library(ggplot2)

# Filter dataset for one location and bias category
plot_data <- df_annual_bias %>%
  filter(Location == "National",
         Bias_Category == "Race/Ethnicity/Ancestry")

# Fit regression model
reg_model_simple <- lm(Annual_Count ~ Year, data = plot_data)

# Add fitted values + confidence intervals
plot_data <- plot_data %>%
  mutate(
    Fitted = predict(reg_model_simple, interval = "confidence")[, "fit"],
    CI_Lower = predict(reg_model_simple, interval = "confidence")[, "lwr"],
    CI_Upper = predict(reg_model_simple, interval = "confidence")[, "upr"]
  )

# Plot
ggplot(plot_data, aes(x = Year)) +
  geom_point(aes(y = Annual_Count), size = 3, color = "black") +      # actual values
  geom_line(aes(y = Fitted), color = "steelblue", size = 1.2) +       # regression line
  geom_ribbon(aes(ymin = CI_Lower, ymax = CI_Upper), alpha = 0.2, fill = "steelblue") +  # CI
  labs(
    title = "Annual Hate Crimes with Linear Trend and 95% Confidence Interval\n(National, Race/Ethnicity/Ancestry)",
    x = "Year",
    y = "Annual Count"
  ) +
  theme_minimal(base_size = 14)

Plot Fitted Values + 95% confidence interval

future_years <- data.frame(
  Year = max(df_annual_bias$Year) + 1:5,
  Location = "National",
  Bias_Category = "Race/Ethnicity/Ancestry"
)

future_preds <- cbind(
  future_years,
  predict(reg_model, newdata = future_years, interval = "prediction")
)

future_preds
##   Year Location           Bias_Category      fit      lwr      upr
## 1 2026 National Race/Ethnicity/Ancestry 7549.123 3225.532 11872.71
## 2 2027 National Race/Ethnicity/Ancestry 7615.462 3272.208 11958.72
## 3 2028 National Race/Ethnicity/Ancestry 7681.800 3315.968 12047.63
## 4 2029 National Race/Ethnicity/Ancestry 7748.139 3356.859 12139.42
## 5 2030 National Race/Ethnicity/Ancestry 7814.478 3394.929 12234.03

13. Pre/Post Covid T-Tests with FDR Correction

df_prepost <- df_annual_bias %>%
  mutate(Period = ifelse(Year < 2020, "Pre", "Post"))

results <- df_prepost %>%
  group_by(Location, Bias_Category) %>%
  summarise(
    t_test = list(t.test(Annual_Count ~ Period)),
    p_value = t_test[[1]]$p.value
  ) %>%
  ungroup() %>%
  mutate(
    p_adjusted = p.adjust(p_value, method = "fdr")
  )

results
## # A tibble: 12 × 5
##    Location   Bias_Category           t_test  p_value p_adjusted
##    <chr>      <chr>                   <list>    <dbl>      <dbl>
##  1 National   Disability              <htest> 0.328       0.344 
##  2 National   Gender                  <htest> 0.0301      0.0727
##  3 National   Gender Identity         <htest> 0.0103      0.0620
##  4 National   Race/Ethnicity/Ancestry <htest> 0.0913      0.134 
##  5 National   Religion                <htest> 0.167       0.201 
##  6 National   Sexual Orientation      <htest> 0.100       0.134 
##  7 New Jersey Disability              <htest> 0.0187      0.0727
##  8 New Jersey Gender                  <htest> 0.0424      0.0727
##  9 New Jersey Gender Identity         <htest> 0.00512     0.0614
## 10 New Jersey Race/Ethnicity/Ancestry <htest> 0.0412      0.0727
## 11 New Jersey Religion                <htest> 0.344       0.344 
## 12 New Jersey Sexual Orientation      <htest> 0.0342      0.0727
# Prepare results for plotting
plot_results <- results %>%
  mutate(
    Significance = ifelse(p_adjusted < 0.05, "Significant", "Not Significant")
  )

# Bar plot of adjusted p-values
ggplot(plot_results, aes(x = Bias_Category, y = p_adjusted, fill = Significance)) +
  geom_col(width = 0.7) +
  geom_hline(yintercept = 0.05, linetype = "dashed", color = "red", size = 1) +
  facet_wrap(~ Location) +
  scale_fill_manual(values = c("Significant" = "steelblue", "Not Significant" = "grey70")) +
  labs(
    title = "Adjusted P-Values for Pre/Post-COVID T-Tests by Bias Category",
    y = "Adjusted P-Value (FDR)",
    x = "Bias Category",
    fill = "Significance"
  ) +
  theme_minimal(base_size = 14) +
  theme(axis.text.x = element_text(angle = 45, hjust = 1))

14. Population Analysis

NJ_population <- read_csv("NJ Population.csv")
NJ_pop <- NJ_population[16:nrow(NJ_population), ]

library(scales)

# Create a scaling factor
pop_scale <- max(df_overall_trends$Annual_Total, na.rm = TRUE) /
             max(NJ_pop$Population, na.rm = TRUE)

df_overall_trends %>%
  filter(Location == "New Jersey") %>%
  ggplot(aes(x = Year)) +
  
  # Hate crimes line
  geom_line(aes(y = Annual_Total), size = 1, color = "#e31a1c") +
  geom_point(aes(y = Annual_Total), size = 2, color = "#e31a1c") +
  
  # Population line (scaled)
  geom_line(
    data = NJ_pop,
    aes(y = Population * pop_scale),
    color = "#1f78b4",
    size = 1
  ) +
  
  geom_vline(xintercept = 2020, linetype = "dashed") +
  
  labs(
    title = "New Jersey Hate Crimes and Population Trends (2015–2025)",
    x = "Year",
    y = "Total Estimated Hate Crimes"
  ) +
  
  scale_y_continuous(
    labels = comma,
    sec.axis = sec_axis(
      ~ . / pop_scale,
      name = "New Jersey Population",
      labels = comma
    )
  ) +
  
  theme_minimal()

nj_rates <- df_overall_trends %>%
  filter(Location == "New Jersey") %>%
  left_join(NJ_pop, by = "Year") %>%
  mutate(
    hate_crime_rate = (Annual_Total / Population) * 100000
  )


ggplot(nj_rates, aes(x = Year, y = hate_crime_rate)) +
  geom_line(size = 1, color = "#6a3d9a") +
  geom_point(size = 2, color = "#6a3d9a") +
  geom_vline(xintercept = 2020, linetype = "dashed") +
  labs(
    title = "Hate Crimes per 100,000 Residents in New Jersey (2015–2025)",
    x = "Year",
    y = "Hate Crimes per 100,000 Residents"
  ) +
  scale_y_continuous(labels = comma) +
  theme_minimal()

US_population <- read_csv("US Population.csv")
US_pop <- US_population[66:nrow(US_population), ]

names(US_pop)[1] <- "Year"


us_rates <- df_overall_trends %>%
  filter(Location == "National") %>%
  left_join(US_pop, by = "Year") %>%
  mutate(
    hate_crime_rate = (Annual_Total / Population) * 100000
  )

ggplot(us_rates, aes(x = Year, y = hate_crime_rate)) +
  geom_line(size = 1, color = "#1f78b4") +
  geom_point(size = 2, color = "#1f78b4") +
  geom_vline(xintercept = 2020, linetype = "dashed") +
  labs(
    title = "Hate Crimes per 100,000 Residents in the United States (2015–2025)",
    x = "Year",
    y = "Hate Crimes per 100,000 Residents"
  ) +
  scale_y_continuous(labels = comma) +
  theme_minimal()

summary(us_rates$hate_crime_rate)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##   1.542   4.528   5.187   5.614   7.405   7.900
combined_rates <- bind_rows(
  nj_rates %>% mutate(Location = "New Jersey"),
  us_rates %>% mutate(Location = "United States")
)

ggplot(combined_rates, aes(Year, hate_crime_rate, color = Location)) +
  geom_line(size = 1) +
  geom_point(size = 2) +
  geom_vline(xintercept = 2020, linetype = "dashed") +
  labs(
    title = "Hate Crimes per 100,000 Residents: New Jersey vs United States",
    x = "Year",
    y = "Hate Crimes per 100,000 Residents",
    color = "Location"
  ) +
  scale_y_continuous(labels = comma) +
  theme_minimal()