This document represents Part 1 of our Statistical Learning final project, which focuses on: - Cleaning and preparing FBI hate-crime datasets (2015–2025) - Ensuring consistent bias categories across time and across National vs. New Jersey data - Constructing a unified monthly time series dataset - Aggregating annual totals - Conducting exploratory data analysis (EDA), including: 1. Annual bias-category trends 2. Overall annual totals 3. Pre/Post COVID comparisons 4. Correlation heatmaps
The data comes from the FBI Crime Data Explorer, downloaded and separated into: - Bias category totals (4 files) - Monthly hate-crime totals (4 files) We combine all eight datasets into a single analytical structure.
# Bias-category total files
bias_files <- c(
"feb2015-feb2020_national_biascategories.csv",
"march2020-march2025_national_biascategories.csv",
"feb2015-feb2020_NJ_biascategories.csv",
"march2020-march2025_NJ_biascategories.csv"
)
# Monthly total hate-crime files
monthly_files <- c(
"feb2015-feb2020_national_hatecrimes.csv",
"march2020-march2025_national_hatecrimes.csv",
"feb2015-feb2020_NJ_hatecrimes.csv",
"march2020-march2025_NJ_hatecrimes.csv"
)
We reshape each dataset from wide format (Month-Year as column names) to long format and convert to actual dates.
process_monthly_totals <- function(file_path) {
location <- ifelse(str_detect(file_path, "national"), "National", "New Jersey")
period <- ifelse(str_detect(file_path, "feb2015"),
"Pre-COVID (2015–2020)",
"Post-COVID (2020–2025)")
df <- read_csv(file_path)
df_clean <- df %>%
select(-1) %>%
pivot_longer(
cols = everything(),
names_to = "Month_Year",
values_to = "Monthly_Total_Count"
) %>%
mutate(
Location = location,
Period = period,
Date = my(Month_Year)
) %>%
filter(!is.na(Date)) %>%
group_by(Location, Period) %>%
mutate(Period_Total_Count_Monthly = sum(Monthly_Total_Count, na.rm = TRUE)) %>%
ungroup()
df_clean
}
df_monthly_totals <- map_dfr(monthly_files, process_monthly_totals)
We pivot all bias categories long and compute proportions within each period (needed for proportional allocation into monthly estimates).
process_bias_totals <- function(file_path) {
location <- ifelse(str_detect(file_path, "national"), "National", "New Jersey")
period <- ifelse(str_detect(file_path, "feb2015"),
"Pre-COVID (2015–2020)",
"Post-COVID (2020–2025)")
df <- read_csv(file_path)
df_long <- df %>%
pivot_longer(
cols = everything(),
names_to = "Bias_Category",
values_to = "Bias_Category_Total"
) %>%
mutate(
Bias_Category = str_trim(str_remove_all(Bias_Category, '"')),
Location = location,
Period = period,
Period_Total_Count_Bias = sum(Bias_Category_Total, na.rm = TRUE),
Bias_Proportion = Bias_Category_Total / Period_Total_Count_Bias
)
df_long
}
df_bias_totals <- map_dfr(bias_files, process_bias_totals)
We merge monthly totals with proportional bias data, generating a monthly time series for all bias categories.
df_merged <- inner_join(
df_monthly_totals,
df_bias_totals,
by = c("Location", "Period")
)
df_final_time_series <- df_merged %>%
mutate(
Estimated_Monthly_Count = Monthly_Total_Count * Bias_Proportion,
Year = year(Date),
Month = month(Date, label = TRUE)
) %>%
select(Date, Year, Month, Location, Period, Bias_Category, Estimated_Monthly_Count)
write_csv(df_final_time_series, "full_monthly_bias_time_series.csv")
This dataset is used for trend lines and pre/post comparison.
df_annual_bias <- df_final_time_series %>%
group_by(Location, Year, Bias_Category) %>%
summarise(
Annual_Count = sum(Estimated_Monthly_Count, na.rm = TRUE),
.groups = "drop"
)
write_csv(df_annual_bias, "annual_bias_trends.csv")
df_annual_bias %>%
filter(Location == "National") %>%
ggplot(aes(Year, Annual_Count, color = Bias_Category)) +
geom_line(linewidth = 1) +
geom_point(size = 2) +
geom_vline(xintercept = 2020, linetype = "dashed") +
labs(
title = "National Annual Hate Crime Trends by Bias Category (2015–2025)",
x = "Year", y = "Estimated Annual Count"
) +
theme_minimal() +
theme(legend.position = "bottom")
df_annual_bias %>%
filter(Location == "New Jersey") %>%
ggplot(aes(Year, Annual_Count, color = Bias_Category)) +
geom_line(linewidth = 1) +
geom_point(size = 2) +
geom_vline(xintercept = 2020, linetype = "dashed") +
labs(
title = "New Jersey Annual Hate Crime Trends by Bias Category (2015–2025)",
x = "Year", y = "Estimated Annual Count"
) +
theme_minimal() +
theme(legend.position = "bottom")
df_overall_trends <- df_final_time_series %>%
group_by(Location, Year) %>%
summarise(
Annual_Total = sum(Estimated_Monthly_Count, na.rm = TRUE),
.groups = "drop"
)
df_overall_trends %>%
filter(Location == "National") %>%
ggplot(aes(Year, Annual_Total)) +
geom_line(size = 1, color = "#1f78b4") +
geom_point(size = 2, color = "#1f78b4") +
geom_vline(xintercept = 2020, linetype = "dashed") +
labs(
title = "National Total Hate Crimes per Year (2015–2025)",
x = "Year", y = "Total Estimated Hate Crimes"
) +
scale_y_continuous(labels = comma) +
theme_minimal()
df_overall_trends %>%
filter(Location == "New Jersey") %>%
ggplot(aes(Year, Annual_Total)) +
geom_line(size = 1, color = "#e31a1c") +
geom_point(size = 2, color = "#e31a1c") +
geom_vline(xintercept = 2020, linetype = "dashed") +
labs(
title = "New Jersey Total Hate Crimes per Year (2015–2025)",
x = "Year", y = "Total Estimated Hate Crimes"
) +
scale_y_continuous(labels = comma) +
theme_minimal()
df_overall_trends %>%
ggplot(aes(x = Year, y = Annual_Total, color = Location)) +
geom_line(size = 1.2) +
geom_point(size = 2) +
geom_vline(xintercept = 2020, linetype = "dashed") +
scale_color_manual(values = c("National" = "#1f78b4", "New Jersey" = "#e31a1c")) +
labs(
title = "Comparison of National vs. New Jersey Total Hate Crimes (2015–2025)",
x = "Year",
y = "Total Estimated Hate Crimes",
color = "Location"
) +
scale_y_continuous(labels = comma) +
theme_minimal() +
theme(legend.position = "bottom")
df_change <- df_annual_bias %>%
mutate(Period_Group = ifelse(Year < 2020, "Pre_COVID", "Post_COVID")) %>%
group_by(Location, Bias_Category, Period_Group) %>%
summarise(Total = sum(Annual_Count), .groups = "drop") %>%
pivot_wider(names_from = Period_Group, values_from = Total) %>%
mutate(
Absolute_Change = Post_COVID - Pre_COVID,
Percent_Change = (Absolute_Change / Pre_COVID) * 100
) %>%
arrange(Location, desc(Percent_Change))
df_change
## # A tibble: 12 × 6
## Location Bias_Category Post_COVID Pre_COVID Absolute_Change Percent_Change
## <chr> <chr> <dbl> <dbl> <dbl> <dbl>
## 1 National Gender Identi… 5099. 1658. 3441. 208.
## 2 National Gender 1379. 624. 756. 121.
## 3 National Race/Ethnicit… 74304. 41988. 32315. 77.0
## 4 National Sexual Orient… 21862. 12555. 9307. 74.1
## 5 National Religion 25734. 16047. 9687. 60.4
## 6 National Disability 2022. 1396. 626. 44.9
## 7 New Jersey Gender Identi… 401. 46.4 354. 763.
## 8 New Jersey Disability 123. 37.8 84.8 224.
## 9 New Jersey Sexual Orient… 1543. 573. 971. 169.
## 10 New Jersey Race/Ethnicit… 7104. 2764. 4340. 157.
## 11 New Jersey Gender 101. 39.6 61.4 155.
## 12 New Jersey Religion 3062. 1887. 1175. 62.3
calculate_correlation_data <- function(location_name, df) {
df %>%
filter(Location == location_name) %>%
pivot_wider(id_cols = Date, names_from = Bias_Category, values_from = Estimated_Monthly_Count, values_fn = sum) %>%
select(-Date) %>%
cor(use = "complete.obs") %>%
melt(value.name = "Correlation") %>%
rename(Bias_Category_1 = Var1, Bias_Category_2 = Var2) %>%
mutate(Location = location_name)
}
df_cor_all <- bind_rows(
calculate_correlation_data("National", df_final_time_series),
calculate_correlation_data("New Jersey", df_final_time_series)
)
ggplot(df_cor_all, aes(x = Bias_Category_1, y = Bias_Category_2, fill = Correlation)) +
geom_tile() +
geom_text(aes(label = round(Correlation, 2)), size = 3) +
facet_wrap(~ Location) +
scale_fill_gradient2(low = "blue", high = "red", mid = "white", midpoint = 0) +
labs(
title = "Correlation Heatmaps of Monthly Hate Crime Counts by Bias Type",
subtitle = "National vs. New Jersey (2015–2025)"
) +
theme_minimal() +
theme(axis.text.x = element_text(angle = 45, hjust = 1)) +
coord_fixed()
Modeling: Monthly Total Hate Crimes=β0 + β1 (time) + ϵ
library(forecast)
library(broom)
# Prepare total monthly counts
df_monthly_total <- df_final_time_series %>%
group_by(Location, Date) %>%
summarise(Monthly_Total = sum(Estimated_Monthly_Count), .groups = "drop")
#Linear Trend Model -- National level
national_ts <- df_monthly_total %>%
filter(Location == "National") %>%
arrange(Date)
national_ts$time_index <- 1:nrow(national_ts)
model_linear <- lm(Monthly_Total ~ time_index, data = national_ts)
summary(model_linear)
##
## Call:
## lm(formula = Monthly_Total ~ time_index, data = national_ts)
##
## Residuals:
## Min 1Q Median 3Q Max
## -726.28 -205.88 -35.64 124.96 2354.42
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 975.0710 65.1540 14.97 <2e-16 ***
## time_index 11.4232 0.9193 12.43 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 357.6 on 120 degrees of freedom
## Multiple R-squared: 0.5627, Adjusted R-squared: 0.559
## F-statistic: 154.4 on 1 and 120 DF, p-value: < 2.2e-16
slope = time_index = 11.42 - This means that hate crimes are increasing by about 11.4 incidents per month on average across the entire 10 year period. - This is a steady upward trend, not flat or random - This corresponds to 11.4 * 12 = 137 additional incidents per year
The p-value of the slope is < 2e-16 - this means it is extremely statistical significant
R^2 = 0.563 - This means about 56% of the variation in monthly hate crime totals is explained by time alone - This makes sense for social and crime data because it contains seasonality, event-driven noise (such as COVID, BLM, etc.) and reporting inconsistencies
A simple linear trend explaining > 50% of variation is meaningful
#for new jersey:
nj_ts <- df_monthly_total %>%
filter(Location == "New Jersey") %>%
arrange(Date)
nj_ts$time_index <- 1:nrow(nj_ts)
model_linear_nj <- lm(Monthly_Total ~ time_index, data = nj_ts)
summary(model_linear_nj)
##
## Call:
## lm(formula = Monthly_Total ~ time_index, data = nj_ts)
##
## Residuals:
## Min 1Q Median 3Q Max
## -157.026 -30.351 -7.418 25.847 305.707
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 50.7813 10.2852 4.937 2.59e-06 ***
## time_index 1.5309 0.1451 10.549 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 56.45 on 120 degrees of freedom
## Multiple R-squared: 0.4811, Adjusted R-squared: 0.4768
## F-statistic: 111.3 on 1 and 120 DF, p-value: < 2.2e-16
time_index = 1.53 (p<2e-16) - For each passing month, New Jersey’s hate crime count increases by about 1.5 incidents per month on average - this effect is highly statistically significant - this means that NH crimes show a clear and steady upward trend over the decade
intercept = 50.78 - the estimated starting point of the trend; in the first month the model estimates NJ had about 51 hate crime incidents
R^2 = 0.48 - 48% of the month to month variation in NJ hate crimes data is explained by time alone - this is moderately strong for noisy crime data - the Residual Standard Error is 56.45 which means predictions are typically off by about 56 incidents from the true monthly count - given that NJ’s monthly totals are much smaller than the national totals, this is expected
national_ts <- national_ts %>%
mutate(Post2020 = ifelse(year(Date) >= 2020, 1, 0))
model_break <- lm(Monthly_Total ~ time_index + Post2020, data = national_ts)
summary(model_break)
##
## Call:
## lm(formula = Monthly_Total ~ time_index + Post2020, data = national_ts)
##
## Residuals:
## Min 1Q Median 3Q Max
## -814.7 -174.5 7.7 145.4 2153.6
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 1083.973 68.576 15.807 < 2e-16 ***
## time_index 5.826 1.746 3.337 0.001129 **
## Post2020 455.673 123.026 3.704 0.000323 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 340 on 119 degrees of freedom
## Multiple R-squared: 0.6079, Adjusted R-squared: 0.6013
## F-statistic: 92.24 on 2 and 119 DF, p-value: < 2.2e-16
Intercept = 1083.91 - the baseline monthly hate crime count at time_index = 0 (the first month in data)
Time_Index (slope) = 5.823 (p = 0.011) - This means that before accounting for 2020, the monthly total was already increasing over time and on average, hate crimes increased by about 5.8 incidents per month per year - Upward trend exists before 2020
Post2020 = 455.67 (p = 0.0003) - This positive, significant coefficient means that starting in 2020, monthly hate crimes jumped by an additional ~456 incidents independent of the underlying time trend - So there is a statistically significant level shift upward in 2020 - This supports the idea of a structural break due to COVID and the political climate
library(dplyr)
library(ggplot2)
# Prepare national data
nat <- df_monthly_total %>%
filter(Location == "National") %>%
arrange(Date) %>%
mutate(
time_index = 1:n(),
Post2020 = ifelse(lubridate::year(Date) >= 2020, 1, 0)
)
# Fit break model
nat$Fitted <- predict(model_break)
# Plot structural break
ggplot(nat, aes(x = Date)) +
geom_line(aes(y = Monthly_Total), color = "black", alpha = 0.7) +
geom_line(aes(y = Fitted), color = "red", size = 1) +
geom_vline(xintercept = as.Date("2020-01-01"), linetype = "dashed") +
labs(
title = "Structural Break in National Hate Crimes (2020)",
y = "Monthly Hate Crime Count",
x = "Date"
) +
theme_minimal()
This shows that there is indeed a spike around COVID, which we knew about but I want to test if there is a signficant increase in trend before and after COVID. To do this, I need to create an interaction term and do a F-test.
national_ts <- national_ts %>%
mutate(
Post2020 = ifelse(year(Date) >= 2020, 1, 0),
time_post2020 = time_index * Post2020 # interaction term
)
model_slope_change <- lm(Monthly_Total ~ time_index + Post2020 + time_post2020,
data = national_ts)
summary(model_slope_change)
##
## Call:
## lm(formula = Monthly_Total ~ time_index + Post2020 + time_post2020,
## data = national_ts)
##
## Residuals:
## Min 1Q Median 3Q Max
## -853.60 -152.05 7.57 147.75 2117.53
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 1033.235 89.766 11.510 < 2e-16 ***
## time_index 7.518 2.602 2.889 0.00460 **
## Post2020 632.820 236.541 2.675 0.00853 **
## time_post2020 -3.080 3.512 -0.877 0.38219
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 340.4 on 118 degrees of freedom
## Multiple R-squared: 0.6104, Adjusted R-squared: 0.6005
## F-statistic: 61.63 on 3 and 118 DF, p-value: < 2.2e-16
model_level_only <- lm(Monthly_Total ~ time_index + Post2020,
data = national_ts)
anova(model_level_only, model_slope_change)
## Analysis of Variance Table
##
## Model 1: Monthly_Total ~ time_index + Post2020
## Model 2: Monthly_Total ~ time_index + Post2020 + time_post2020
## Res.Df RSS Df Sum of Sq F Pr(>F)
## 1 119 13760272
## 2 118 13671133 1 89139 0.7694 0.3822
There is no statistical evidence that the slope (trend) of hate crimes changed after 2020.
Even though 2020 had a jump (level shift), the rate of increase per month after 2020 is not significantly different from the rate before 2020. There is a clear jump in hate crime totals in 2020.
However, when testing whether the trend changed — that is, whether hate crimes began increasing faster after COVID — the statistical evidence does not support a significant change in slope (F-test p = 0.38). This means the spike reflects a one-time level shift, not an ongoing acceleration.
#for new jersey:
model_break_nj <- lm(Monthly_Total ~ time_index + I(year(Date) >= 2020), data = nj_ts)
summary(model_break_nj)
##
## Call:
## lm(formula = Monthly_Total ~ time_index + I(year(Date) >= 2020),
## data = nj_ts)
##
## Residuals:
## Min 1Q Median 3Q Max
## -145.445 -28.344 -6.399 25.922 285.068
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 61.9764 11.1791 5.544 1.81e-07 ***
## time_index 0.9556 0.2846 3.358 0.00106 **
## I(year(Date) >= 2020)TRUE 46.8427 20.0553 2.336 0.02118 *
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 55.43 on 119 degrees of freedom
## Multiple R-squared: 0.5039, Adjusted R-squared: 0.4956
## F-statistic: 60.43 on 2 and 119 DF, p-value: < 2.2e-16
time_index = 0.96 (p = 0.001) - before and after 2020, NJ hate crimes increased by about 1 incident per month per year - NJ has a slow by steady long term upward trend - this is statistically significant
post2020 indicator = 46.84 (p = 0.021) - beginning in 2020, NJ saw an average jump of about 47 additional hate crime incidents per month, above what the trend alone would predict - this is statistically significant, so it is very unlikely to be random - clear evidence of a structural break due to COVID and political climat
intercept - 61.98 - the baseline before trend and break are applied
Model Fit - R^2 = 0.504 –> model explains 50% of variation - Residual SE = 55.43 –> typical error is about 55 incidents - This is an improved fit compared to the trend only model (r^2 was 0.48 before adding the break) - Thus Adding the post 2020 dummy improves the model
# Prepare NJ data
nj <- df_monthly_total %>%
filter(Location == "New Jersey") %>%
arrange(Date) %>%
mutate(
time_index = 1:n(),
Post2020 = ifelse(lubridate::year(Date) >= 2020, 1, 0)
)
# Fit break model
nj$Fitted <- predict(model_break_nj)
# Plot structural break
ggplot(nj, aes(x = Date)) +
geom_line(aes(y = Monthly_Total), color = "brown", alpha = 0.7) +
geom_line(aes(y = Fitted), color = "red", size = 1) +
geom_vline(xintercept = as.Date("2020-01-01"), linetype = "dashed") +
labs(
title = "Structural Break in New Jersey Hate Crimes (2020)",
y = "Monthly Hate Crime Count",
x = "Date"
) +
theme_minimal()
nj_ts <- nj_ts %>%
mutate(
Post2020 = ifelse(year(Date) >= 2020, 1, 0),
time_post2020 = time_index * Post2020 # interaction term
)
model_slope_change_nj <- lm(Monthly_Total ~ time_index + Post2020 + time_post2020,
data = nj_ts)
summary(model_slope_change_nj)
##
## Call:
## lm(formula = Monthly_Total ~ time_index + Post2020 + time_post2020,
## data = nj_ts)
##
## Residuals:
## Min 1Q Median 3Q Max
## -120.572 -30.439 -1.667 24.946 257.630
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 23.4307 13.6090 1.722 0.0877 .
## time_index 2.2404 0.3945 5.679 9.90e-08 ***
## Post2020 181.4200 35.8611 5.059 1.56e-06 ***
## time_post2020 -2.3401 0.5324 -4.395 2.43e-05 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 51.6 on 118 degrees of freedom
## Multiple R-squared: 0.5737, Adjusted R-squared: 0.5628
## F-statistic: 52.93 on 3 and 118 DF, p-value: < 2.2e-16
model_level_only_nj <- lm(Monthly_Total ~ time_index + Post2020,
data = nj_ts)
anova(model_level_only_nj, model_slope_change_nj)
## Analysis of Variance Table
##
## Model 1: Monthly_Total ~ time_index + Post2020
## Model 2: Monthly_Total ~ time_index + Post2020 + time_post2020
## Res.Df RSS Df Sum of Sq F Pr(>F)
## 1 119 365670
## 2 118 314224 1 51445 19.319 2.432e-05 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
New Jersey experienced a statistically significant change in its hate-crime trend after 2020. Although there was a sharp increase in 2020, the month-to-month slope decreased significantly afterward. The F-test shows that including a post-2020 slope term substantially improves the model (p = 2.4e−05), indicating that the long-term trend shifted direction, not just the level.
. NJ shows a significant change in slope after 2020
The coefficient for time_post2020 is −2.34 and highly significant (p = 2.4e−05).
This means: After 2020, the month-to-month trend decreased relative to the pre-2020 trend, even though the overall post-2020 level jumped (Post2020 = +181 and significant).
ANOVA F = 19.3, p = 2.4e−05 → very strong evidence that adding the slope-change term improves the model.
In simple English:
“There is strong evidence that the trajectory of NJ hate crime reports changed after 2020, not just the level.”
Pre-2020 trend: +2.24 incidents per month (significant)
Level change at 2020: +181 (significant)
Post-2020 trend change: −2.34 per month
So the post-2020 trend is:
2.24 – 2.34 ≈ −0.1 incidents/month (flat/slightly downward)
This means NJ saw:
• A large spike in 2020 • But afterward, the slope flattened or turned slightly downward
This is a different story from the national results.
#ARIMA Time series model
ts_data <- ts(national_ts$Monthly_Total, start = c(2015, 2), frequency = 12)
arima_model <- auto.arima(ts_data)
summary(arima_model)
## Series: ts_data
## ARIMA(0,1,2)(0,0,2)[12] with drift
##
## Coefficients:
## ma1 ma2 sma1 sma2 drift
## -0.5308 -0.3713 0.2046 0.1385 9.5747
## s.e. 0.0821 0.0846 0.0912 0.0850 4.1479
##
## sigma^2 = 94333: log likelihood = -863.24
## AIC=1738.48 AICc=1739.21 BIC=1755.25
##
## Training set error measures:
## ME RMSE MAE MPE MAPE MASE ACF1
## Training set 2.067934 299.4891 171.7147 -1.746076 10.0339 0.6509726 0.07035696
forecast_2026 <- forecast(arima_model, h = 12)
autoplot(forecast_2026) + labs(title = "ARIMA Forecast for National Hate Crimes")
auto.arima() selected ARIMA(0,1,2)(0,0,2)[12] with drift. This translates to: non-seasonal: - 0 AR terms - 1 difference –> data has strong trend - 2 MA terms –> short term shock smoothing seasonal (period = 12 months) - 0 AR - 0 Differencing - 2 MA seasonal terms –> addresses repeating annual patterns With drift - The model includes a constant trend upward
This is a common model for data with persistent upward movement, seasonal fluctuations, no clear auto-regressive structure and noise best handled by MA terms.
Drift = 9.57 (SE = 4.15) - the average monthly increase in hate crimes after differencing - hate crimes are increasing by about 9-10 incidents per month on average over time (consistent with earlier models)
2026 Forecast - shows upward trend of about 10 additional hate crimes per month - growing uncertainty shown by wider prediction intervals
For new jersey:
nj_ts <- df_monthly_total %>%
filter(Location == "New Jersey") %>%
arrange(Date)
nj_ts$time_index <- 1:nrow(nj_ts)
model_linear_nj <- lm(Monthly_Total ~ time_index, data = nj_ts)
summary(model_linear_nj)
##
## Call:
## lm(formula = Monthly_Total ~ time_index, data = nj_ts)
##
## Residuals:
## Min 1Q Median 3Q Max
## -157.026 -30.351 -7.418 25.847 305.707
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 50.7813 10.2852 4.937 2.59e-06 ***
## time_index 1.5309 0.1451 10.549 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 56.45 on 120 degrees of freedom
## Multiple R-squared: 0.4811, Adjusted R-squared: 0.4768
## F-statistic: 111.3 on 1 and 120 DF, p-value: < 2.2e-16
model_break_nj <- lm(Monthly_Total ~ time_index + I(year(Date) >= 2020), data = nj_ts)
summary(model_break_nj)
##
## Call:
## lm(formula = Monthly_Total ~ time_index + I(year(Date) >= 2020),
## data = nj_ts)
##
## Residuals:
## Min 1Q Median 3Q Max
## -145.445 -28.344 -6.399 25.922 285.068
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 61.9764 11.1791 5.544 1.81e-07 ***
## time_index 0.9556 0.2846 3.358 0.00106 **
## I(year(Date) >= 2020)TRUE 46.8427 20.0553 2.336 0.02118 *
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 55.43 on 119 degrees of freedom
## Multiple R-squared: 0.5039, Adjusted R-squared: 0.4956
## F-statistic: 60.43 on 2 and 119 DF, p-value: < 2.2e-16
race_ts <- df_final_time_series %>%
filter(Location == "National", Bias_Category == "Race/Ethnicity/Ancestry") %>%
arrange(Date)
race_ts$time_index <- 1:nrow(race_ts)
lm_race <- lm(Estimated_Monthly_Count ~ time_index, data = race_ts)
summary(lm_race)
##
## Call:
## lm(formula = Estimated_Monthly_Count ~ time_index, data = race_ts)
##
## Residuals:
## Min 1Q Median 3Q Max
## -239.45 -63.24 -11.16 43.96 1010.44
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 275.7165 14.6742 18.79 <2e-16 ***
## time_index 1.6399 0.1038 15.79 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 114.3 on 242 degrees of freedom
## Multiple R-squared: 0.5075, Adjusted R-squared: 0.5055
## F-statistic: 249.4 on 1 and 242 DF, p-value: < 2.2e-16
religon_ts <- df_final_time_series %>%
filter(Location == "National", Bias_Category == "Religion") %>%
arrange(Date)
religon_ts$time_index <- 1:nrow(religon_ts)
lm_religion <- lm(Estimated_Monthly_Count ~ time_index, data = religon_ts)
summary(lm_religion)
##
## Call:
## lm(formula = Estimated_Monthly_Count ~ time_index, data = religon_ts)
##
## Residuals:
## Min 1Q Median 3Q Max
## -80.23 -21.35 -3.47 17.18 343.19
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 111.06904 4.99384 22.24 <2e-16 ***
## time_index 0.49112 0.03534 13.90 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 38.88 on 242 degrees of freedom
## Multiple R-squared: 0.4438, Adjusted R-squared: 0.4415
## F-statistic: 193.1 on 1 and 242 DF, p-value: < 2.2e-16
gender_ts <- df_final_time_series %>%
filter(Location == "National", Bias_Category == "Gender") %>%
arrange(Date)
gender_ts$time_index <- 1:nrow(gender_ts)
lm_gender <- lm(Estimated_Monthly_Count ~ time_index, data = gender_ts)
summary(lm_gender)
##
## Call:
## lm(formula = Estimated_Monthly_Count ~ time_index, data = gender_ts)
##
## Residuals:
## Min 1Q Median 3Q Max
## -4.7252 -1.2937 -0.2704 0.7910 19.4592
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 3.504391 0.289403 12.11 <2e-16 ***
## time_index 0.038407 0.002048 18.75 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 2.253 on 242 degrees of freedom
## Multiple R-squared: 0.5924, Adjusted R-squared: 0.5907
## F-statistic: 351.7 on 1 and 242 DF, p-value: < 2.2e-16
disability_ts <- df_final_time_series %>%
filter(Location == "National", Bias_Category == "Disability") %>%
arrange(Date)
disability_ts$time_index <- 1:nrow(disability_ts)
lm_disability <- lm(Estimated_Monthly_Count ~ time_index, data = disability_ts)
summary(lm_disability)
##
## Call:
## lm(formula = Estimated_Monthly_Count ~ time_index, data = disability_ts)
##
## Residuals:
## Min 1Q Median 3Q Max
## -6.4456 -1.7546 -0.2852 1.5975 26.3590
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 10.125544 0.389801 25.98 <2e-16 ***
## time_index 0.031691 0.002759 11.49 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 3.035 on 242 degrees of freedom
## Multiple R-squared: 0.3529, Adjusted R-squared: 0.3502
## F-statistic: 132 on 1 and 242 DF, p-value: < 2.2e-16
bias_nat <- df_final_time_series %>%
filter(Location == "National") %>%
group_by(Date, Bias_Category) %>%
summarise(Monthly_Total = sum(Estimated_Monthly_Count), .groups = "drop")
ggplot(bias_nat, aes(x = Date, y = Monthly_Total)) +
geom_line(color = "steelblue") +
facet_wrap(~ Bias_Category, scales = "free_y") +
labs(
title = "National Hate Crime Trends by Bias Category (2015–2025)",
x = "Date",
y = "Monthly Count"
) +
theme_minimal() +
theme(strip.text = element_text(size = 10))
bias_nj <- df_final_time_series %>%
filter(Location == "New Jersey") %>%
group_by(Date, Bias_Category) %>%
summarise(Monthly_Total = sum(Estimated_Monthly_Count), .groups = "drop")
ggplot(bias_nj, aes(x = Date, y = Monthly_Total)) +
geom_line(color = "steelblue") +
facet_wrap(~ Bias_Category, scales = "free_y") +
labs(
title = "New Jersey Hate Crime Trends by Bias Category (2015–2025)",
x = "Date",
y = "Monthly Count"
) +
theme_minimal() +
theme(strip.text = element_text(size = 10))
reg_model <- lm(Annual_Count ~ Year + Location + Bias_Category,
data = df_annual_bias)
summary(reg_model)
##
## Call:
## lm(formula = Annual_Count ~ Year + Location + Bias_Category,
## data = df_annual_bias)
##
## Residuals:
## Min 1Q Median 3Q Max
## -4467.6 -1183.0 -115.6 1148.2 7729.0
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -132424.84 116851.06 -1.133 0.25928
## Year 66.34 57.85 1.147 0.25367
## LocationNew Jersey -2833.11 365.85 -7.744 2.95e-12 ***
## Bias_CategoryGender -65.22 633.68 -0.103 0.91820
## Bias_CategoryGender Identity 164.84 633.68 0.260 0.79519
## Bias_CategoryRace/Ethnicity/Ancestry 5571.89 633.68 8.793 1.02e-14 ***
## Bias_CategoryReligion 1961.43 633.68 3.095 0.00243 **
## Bias_CategorySexual Orientation 1497.93 633.68 2.364 0.01964 *
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 2102 on 124 degrees of freedom
## Multiple R-squared: 0.5883, Adjusted R-squared: 0.5651
## F-statistic: 25.31 on 7 and 124 DF, p-value: < 2.2e-16
library(dplyr)
library(ggplot2)
# Filter dataset for one location and bias category
plot_data <- df_annual_bias %>%
filter(Location == "National",
Bias_Category == "Race/Ethnicity/Ancestry")
# Fit regression model
reg_model_simple <- lm(Annual_Count ~ Year, data = plot_data)
# Add fitted values + confidence intervals
plot_data <- plot_data %>%
mutate(
Fitted = predict(reg_model_simple, interval = "confidence")[, "fit"],
CI_Lower = predict(reg_model_simple, interval = "confidence")[, "lwr"],
CI_Upper = predict(reg_model_simple, interval = "confidence")[, "upr"]
)
# Plot
ggplot(plot_data, aes(x = Year)) +
geom_point(aes(y = Annual_Count), size = 3, color = "black") + # actual values
geom_line(aes(y = Fitted), color = "steelblue", size = 1.2) + # regression line
geom_ribbon(aes(ymin = CI_Lower, ymax = CI_Upper), alpha = 0.2, fill = "steelblue") + # CI
labs(
title = "Annual Hate Crimes with Linear Trend and 95% Confidence Interval\n(National, Race/Ethnicity/Ancestry)",
x = "Year",
y = "Annual Count"
) +
theme_minimal(base_size = 14)
Plot Fitted Values + 95% confidence interval
future_years <- data.frame(
Year = max(df_annual_bias$Year) + 1:5,
Location = "National",
Bias_Category = "Race/Ethnicity/Ancestry"
)
future_preds <- cbind(
future_years,
predict(reg_model, newdata = future_years, interval = "prediction")
)
future_preds
## Year Location Bias_Category fit lwr upr
## 1 2026 National Race/Ethnicity/Ancestry 7549.123 3225.532 11872.71
## 2 2027 National Race/Ethnicity/Ancestry 7615.462 3272.208 11958.72
## 3 2028 National Race/Ethnicity/Ancestry 7681.800 3315.968 12047.63
## 4 2029 National Race/Ethnicity/Ancestry 7748.139 3356.859 12139.42
## 5 2030 National Race/Ethnicity/Ancestry 7814.478 3394.929 12234.03
df_prepost <- df_annual_bias %>%
mutate(Period = ifelse(Year < 2020, "Pre", "Post"))
results <- df_prepost %>%
group_by(Location, Bias_Category) %>%
summarise(
t_test = list(t.test(Annual_Count ~ Period)),
p_value = t_test[[1]]$p.value
) %>%
ungroup() %>%
mutate(
p_adjusted = p.adjust(p_value, method = "fdr")
)
results
## # A tibble: 12 × 5
## Location Bias_Category t_test p_value p_adjusted
## <chr> <chr> <list> <dbl> <dbl>
## 1 National Disability <htest> 0.328 0.344
## 2 National Gender <htest> 0.0301 0.0727
## 3 National Gender Identity <htest> 0.0103 0.0620
## 4 National Race/Ethnicity/Ancestry <htest> 0.0913 0.134
## 5 National Religion <htest> 0.167 0.201
## 6 National Sexual Orientation <htest> 0.100 0.134
## 7 New Jersey Disability <htest> 0.0187 0.0727
## 8 New Jersey Gender <htest> 0.0424 0.0727
## 9 New Jersey Gender Identity <htest> 0.00512 0.0614
## 10 New Jersey Race/Ethnicity/Ancestry <htest> 0.0412 0.0727
## 11 New Jersey Religion <htest> 0.344 0.344
## 12 New Jersey Sexual Orientation <htest> 0.0342 0.0727
# Prepare results for plotting
plot_results <- results %>%
mutate(
Significance = ifelse(p_adjusted < 0.05, "Significant", "Not Significant")
)
# Bar plot of adjusted p-values
ggplot(plot_results, aes(x = Bias_Category, y = p_adjusted, fill = Significance)) +
geom_col(width = 0.7) +
geom_hline(yintercept = 0.05, linetype = "dashed", color = "red", size = 1) +
facet_wrap(~ Location) +
scale_fill_manual(values = c("Significant" = "steelblue", "Not Significant" = "grey70")) +
labs(
title = "Adjusted P-Values for Pre/Post-COVID T-Tests by Bias Category",
y = "Adjusted P-Value (FDR)",
x = "Bias Category",
fill = "Significance"
) +
theme_minimal(base_size = 14) +
theme(axis.text.x = element_text(angle = 45, hjust = 1))
NJ_population <- read_csv("NJ Population.csv")
NJ_pop <- NJ_population[16:nrow(NJ_population), ]
library(scales)
# Create a scaling factor
pop_scale <- max(df_overall_trends$Annual_Total, na.rm = TRUE) /
max(NJ_pop$Population, na.rm = TRUE)
df_overall_trends %>%
filter(Location == "New Jersey") %>%
ggplot(aes(x = Year)) +
# Hate crimes line
geom_line(aes(y = Annual_Total), size = 1, color = "#e31a1c") +
geom_point(aes(y = Annual_Total), size = 2, color = "#e31a1c") +
# Population line (scaled)
geom_line(
data = NJ_pop,
aes(y = Population * pop_scale),
color = "#1f78b4",
size = 1
) +
geom_vline(xintercept = 2020, linetype = "dashed") +
labs(
title = "New Jersey Hate Crimes and Population Trends (2015–2025)",
x = "Year",
y = "Total Estimated Hate Crimes"
) +
scale_y_continuous(
labels = comma,
sec.axis = sec_axis(
~ . / pop_scale,
name = "New Jersey Population",
labels = comma
)
) +
theme_minimal()
nj_rates <- df_overall_trends %>%
filter(Location == "New Jersey") %>%
left_join(NJ_pop, by = "Year") %>%
mutate(
hate_crime_rate = (Annual_Total / Population) * 100000
)
ggplot(nj_rates, aes(x = Year, y = hate_crime_rate)) +
geom_line(size = 1, color = "#6a3d9a") +
geom_point(size = 2, color = "#6a3d9a") +
geom_vline(xintercept = 2020, linetype = "dashed") +
labs(
title = "Hate Crimes per 100,000 Residents in New Jersey (2015–2025)",
x = "Year",
y = "Hate Crimes per 100,000 Residents"
) +
scale_y_continuous(labels = comma) +
theme_minimal()
US_population <- read_csv("US Population.csv")
US_pop <- US_population[66:nrow(US_population), ]
names(US_pop)[1] <- "Year"
us_rates <- df_overall_trends %>%
filter(Location == "National") %>%
left_join(US_pop, by = "Year") %>%
mutate(
hate_crime_rate = (Annual_Total / Population) * 100000
)
ggplot(us_rates, aes(x = Year, y = hate_crime_rate)) +
geom_line(size = 1, color = "#1f78b4") +
geom_point(size = 2, color = "#1f78b4") +
geom_vline(xintercept = 2020, linetype = "dashed") +
labs(
title = "Hate Crimes per 100,000 Residents in the United States (2015–2025)",
x = "Year",
y = "Hate Crimes per 100,000 Residents"
) +
scale_y_continuous(labels = comma) +
theme_minimal()
summary(us_rates$hate_crime_rate)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 1.542 4.528 5.187 5.614 7.405 7.900
combined_rates <- bind_rows(
nj_rates %>% mutate(Location = "New Jersey"),
us_rates %>% mutate(Location = "United States")
)
ggplot(combined_rates, aes(Year, hate_crime_rate, color = Location)) +
geom_line(size = 1) +
geom_point(size = 2) +
geom_vline(xintercept = 2020, linetype = "dashed") +
labs(
title = "Hate Crimes per 100,000 Residents: New Jersey vs United States",
x = "Year",
y = "Hate Crimes per 100,000 Residents",
color = "Location"
) +
scale_y_continuous(labels = comma) +
theme_minimal()