# Creating the FBI data histogram with a mean line
fbi_histogram <- ggplot(hate_crimes %>% 
         #Remove missing or non-finite values to prevent plotting errors
                               filter(!is.na(hatecrimes_fbi) & is.finite(hatecrimes_fbi)),
                               aes(x = hatecrimes_fbi)) +
  geom_histogram(fill = "skyblue", bins = 30, alpha = 1) +
  # Add a vertical dashed line representing the national mean
  # This helps visually assess states fall above or below average
  geom_vline(aes(xintercept = mean(hatecrimes_fbi, na.rm = TRUE)), #
             color = "red", linetype = "dashed", size = 1) +
  # Label axes for interpretability in the final report
  labs(x = "Average Annual Hate Crimes per 100k", y = "Frequency") +
 # Apply a clean theme to improve readability and presentation quality
  theme_minimal()
## Warning: Using `size` aesthetic for lines was deprecated in ggplot2 3.4.0.
## ℹ Please use `linewidth` instead.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.
# Overlaying data on map of United States
# Load geographic boundary data for U.S. states
# This dataset contains the coordinates needed to draw each state polygon
states_map <- map_data("state")

# Prepare the hate crimes data for spatial visualization
hate_crimes_map_data <- hate_crimes %>%
  
  # Convert state names to lowercase so they match the format
  # used in the map_data dataset (necessary for a successful join)
  mutate(state = tolower(state)) %>%
  
  # Merge hate crime data with geographic coordinates
  # inner_join keeps only states present in BOTH datasets,
  # preventing blank or unmatched regions on the map
  inner_join(states_map, by = c("state" = "region"))

# Create a choropleth map (color-coded map) of hate crime rates
hate_crimes_map <- ggplot() +
  
  # Draw each state as a polygon and fill based on hate crime rate
  geom_polygon(
    data = hate_crimes_map_data,
    aes(
      x = long, 
      y = lat, 
      group = group,     # Ensures each state boundary is drawn correctly
      fill = hatecrimes_fbi
    ),
    color = "white"      # White borders improve state separation
  ) +
  
  # Fix the aspect ratio so the U.S. map is not distorted
  coord_fixed(1.3) +
  
  # Use the viridis color scale:
  # - perceptually uniform (differences reflect real data differences)
  # - colorblind-friendly
  # - prints well in grayscale
  scale_fill_viridis(
    option = "magma",
    name = "Hate Crimes\nper 100k",
    guide = guide_colorbar(direction = "vertical")
  ) +
  
  # Remove axes, ticks, and background elements
  # This keeps attention on the spatial pattern
  theme_void() +
  
  # Position legend and center the title for a cleaner layout
  theme(
    legend.position = "right",
    plot.title = element_text(hjust = 0.5)
  )

# Combine the histogram and map into a single figure
combined_plot <- fbi_histogram + hate_crimes_map +
  
  # Arrange plots in two columns for easy comparison
  plot_layout(ncol = 2) +
  
  # Add a caption describing the time period
  labs(caption = "Notes: Years 2010 through 2015") +
  
  # Style the caption for readability
  theme(plot.caption = element_text(hjust = 0, face = "italic"))

print(combined_plot)

# Reorder the columns so that the DV is first
hate_crimes_reordered <- hate_crimes %>% 
  select(hatecrimes_fbi, everything())

# Use datasummary_skim() on the reordered data
datasummary_skim(hate_crimes_reordered, type="numeric", fmt=2, histogram=FALSE)
## Warning: The `histogram` argument is deprecated. Use `fun_numeric` instead.
Unique Missing Pct. Mean SD Min Median Max
hatecrimes_fbi 51 2 2.37 1.71 0.27 1.99 10.95
median_house_inc 51 0 55223.61 9208.48 35521.00 54916.00 76165.00
share_unemp_seas 32 0 4.96 1.07 2.80 5.10 7.30
share_pop_metro 31 0 75.02 18.16 31.00 79.00 100.00
share_pop_hs 40 0 86.91 3.41 79.90 87.40 91.80
share_non_citizen 13 6 5.46 3.11 1.00 4.50 13.00
share_white_poverty 12 0 9.18 2.47 4.00 9.00 17.00
gini_index 39 0 45.38 2.09 41.90 45.40 53.20
share_non_white 34 0 31.57 16.49 6.00 28.00 81.00
share_vote_trump 33 0 49.00 11.87 4.00 49.00 70.00
hate_crimes_per_100k_splc 48 8 0.30 0.25 0.07 0.23 1.52
avg_hatecrimes_per_100k_fbi 51 2 2.37 1.71 0.27 1.99 10.95
hatecrimes_splc 48 8 0.30 0.25 0.07 0.23 1.52
var_table <- matrix(
    c(  
      "median_house_inc",    "Negative", "Higher income states are associated with lower hate crime rates.",
      "share_unemp_seas",    "Positive", "Higher unemployment is associated to higher hate crime rates.",
      "share_pop_metro",     "Negative", "Metropolitan areas will reflect fewer hate crimes",
      "share_pop_hs",        "Negative", "A higher share of the population with a high school diploma is associated with lower hate crime rates.",
      "share_non_citizen",   "Positive", "Areas with higher non-citizen populations are associated with more hate crimes.",
      "share_white_poverty", "Positive", "A greater poor white population will be associated with more hate crimes.",
      "gini_index",          "Positive", "Higher income inequality is associated with higher hate crime rates.",
      "share_non_white",     "Positive", "Areas with diverse populations will experience more hate crimes.",
      "share_vote_trump",    "Positive", "States with higher support for Trump are associated with higher hate crime rates."
    ), 
    ncol = 3, 
    byrow = TRUE
  )

# Convert the matrix to a data frame for kable
table_df <- as.data.frame(var_table)

# Create the table with knitr::kable and shade the first row
kable(table_df, "html", booktabs = T, col.names = c("Variable Name", "Hypothesized Direction", "Hypothesis Formulation")) %>%
  kable_styling(latex_options = "striped", stripe_color = "gray!30") %>%
  row_spec(0, background = "gray!30")
Variable Name Hypothesized Direction Hypothesis Formulation
median_house_inc Negative Higher income states are associated with lower hate crime rates.
share_unemp_seas Positive Higher unemployment is associated to higher hate crime rates.
share_pop_metro Negative Metropolitan areas will reflect fewer hate crimes
share_pop_hs Negative A higher share of the population with a high school diploma is associated with lower hate crime rates.
share_non_citizen Positive Areas with higher non-citizen populations are associated with more hate crimes.
share_white_poverty Positive A greater poor white population will be associated with more hate crimes.
gini_index Positive Higher income inequality is associated with higher hate crime rates.
share_non_white Positive Areas with diverse populations will experience more hate crimes.
share_vote_trump Positive States with higher support for Trump are associated with higher hate crime rates.
# Education 
model_edu <- lm(hatecrimes_fbi ~ share_pop_hs, data = hate_crimes)

# Diversity
model_div <- lm(hatecrimes_fbi ~ share_non_white, data = hate_crimes)
model_div2 <- lm(hatecrimes_fbi ~ share_non_citizen, data = hate_crimes)

# Geographic Distribution 
model_geo <- lm(hatecrimes_fbi ~ share_pop_metro, data = hate_crimes)

# Economic 
model_eco <- lm(hatecrimes_fbi ~ median_house_inc, data = hate_crimes)
model_eco2 <- lm(hatecrimes_fbi ~ share_unemp_seas, data = hate_crimes)
model_eco3 <- lm(hatecrimes_fbi ~ share_white_poverty, data = hate_crimes)

# Income Inequality 
model_ine <- lm(hatecrimes_fbi ~ gini_index, data = hate_crimes)

# Political Leanings 
model_pol <- lm(hatecrimes_fbi ~ share_vote_trump, data = hate_crimes)

model_list <- list(
  "Education" = model_edu,
  "Diversity" = model_div,
  "Geographic Distribution" = model_geo,
  "Economic Indicators" = model_eco,
  "Income Inequality" = model_ine,
  "Political Leanings" = model_pol
)

modelsummary(model_list,
             estimate = "{estimate}{stars}",
             statistic = NULL,
             gof_omit = "IC|Log|alg|pss|F|RMSE",
             notes = "Notes: + p < 0.1, * p < 0.05, ** p < 0.01, *** p < 0.001",
             output="huxtable")
EducationDiversityGeographic DistributionEconomic IndicatorsIncome InequalityPolitical Leanings
(Intercept)-4.8092.077***0.897-0.956-13.334**6.026***
share_pop_hs0.083
share_non_white0.010
share_pop_metro0.020
median_house_inc0.000*
gini_index0.346**
share_vote_trump-0.074***
Num.Obs.505050505050
R20.0270.0070.0440.1010.1770.255
R2 Adj.0.007-0.0140.0240.0830.1600.239
Notes: + p < 0.1, * p < 0.05, ** p < 0.01, *** p < 0.001
selected_vars <- hate_crimes %>% 
  select(hatecrimes_fbi,
         gini_index, 
         median_house_inc, 
         share_unemp_seas, 
         share_pop_metro, 
         share_pop_hs, 
         share_non_white, 
         share_non_citizen, 
         share_white_poverty,
         share_vote_trump)

cor_matrix <- cor(selected_vars, use = "complete.obs")

corrplot(cor_matrix, method = "circle", diag = FALSE, type = "lower",
                       order = "hclust",      # Orders variables based on hierarchical clustering, or change it to "original" to keep your custom order 
                       tl.srt = 45,           # Set text label angle for the top labels
                       tl.offset = 0.5,       # Increase offset to move text labels closer to the grid
                       tl.cex = 0.6,          # Font size of text labels
                       tl.col = "black",      # Label color
                       number.cex = 0.8,      # Adjust size of numbers inside circles
                       addCoef.col = "black", # Color of the correlation coefficients
                       cl.pos = 'n',          # Removes legend
                       is.corr = TRUE)        # Ensure it interprets values as correlations for formatting

model_fbi <- lm(hatecrimes_fbi ~ median_house_inc + 
                                             share_unemp_seas + 
                                             share_pop_metro + 
                                             share_pop_hs + 
                                             share_non_citizen + 
                                             share_white_poverty + 
                                             gini_index + 
                                             share_non_white + 
                                             share_vote_trump, 
                data = hate_crimes)

modelsummary(list("avg annual per 100,000" = model_fbi),
             estimate = "{estimate}{stars}",
             statistic = NULL,
             gof_omit = "IC|Log|alg|pss|F|RMSE",
             notes = "Notes: + p < 0.1, * p < 0.05, ** p < 0.01, *** p < 0.001",
             output="huxtable")
avg annual per 100,000
(Intercept)-42.347*
median_house_inc0.000
share_unemp_seas0.285
share_pop_metro-0.026
share_pop_hs0.212
share_non_citizen0.198
share_white_poverty0.063
gini_index0.570**
share_non_white-0.037
share_vote_trump-0.022
Num.Obs.47
R20.498
R2 Adj.0.376
Notes: + p < 0.1, * p < 0.05, ** p < 0.01, *** p < 0.001
# ----------------------------------------
# Step 1: Define model labels
# 'Multivariate' refers to the full model including all independent variables.
# 'Bivariate' refers to models with a single predictor (one DV vs. one IV).
# This vector is for labeling purposes in the plot legend.
model_names <- c("Multivariate", rep("Bivariate", 9))

# ----------------------------------------
# Step 2: Define colors for the plot
# Green for the multivariate model, pink for bivariate models
colors <- c("#1B9E77", rep("#E7298A", 9))

# ----------------------------------------
# Step 3: Extract coefficients and confidence intervals
# 'tidy()' from the broom package converts model objects to a tidy data frame
tidy_fbi <- tidy(model_fbi, conf.int = TRUE) %>% mutate(Model = "Multivariate")
tidy_edu <- tidy(model_edu, conf.int = TRUE) %>% mutate(Model = "Bivariate")
tidy_div <- tidy(model_div, conf.int = TRUE) %>% mutate(Model = "Bivariate")
tidy_div2 <- tidy(model_div2, conf.int = TRUE) %>% mutate(Model = "Bivariate")
tidy_geo <- tidy(model_geo, conf.int = TRUE) %>% mutate(Model = "Bivariate")
tidy_eco <- tidy(model_eco, conf.int = TRUE) %>% mutate(Model = "Bivariate")
tidy_eco2 <- tidy(model_eco2, conf.int = TRUE) %>% mutate(Model = "Bivariate")
tidy_eco3 <- tidy(model_eco3, conf.int = TRUE) %>% mutate(Model = "Bivariate")
tidy_ine <- tidy(model_ine, conf.int = TRUE) %>% mutate(Model = "Bivariate")
tidy_pol <- tidy(model_pol, conf.int = TRUE) %>% mutate(Model = "Bivariate")

# ----------------------------------------
# Step 4: Combine all tidied models into a single dataframe
all_models_df <- bind_rows(
  tidy_fbi, tidy_edu, tidy_div, tidy_div2, tidy_geo, 
  tidy_eco, tidy_eco2, tidy_eco3, tidy_ine, tidy_pol
) %>%
  filter(term != "(Intercept)") %>%  # Remove intercepts from plot
  mutate(
    adjustment = if_else(Model == "Multivariate", 0, -0.02 * estimate),
    estimate_adj = estimate + adjustment,  # Slightly offset bivariate estimates for visibility
    ymin = conf.low + adjustment,          # Adjust lower bound
    ymax = conf.high + adjustment          # Adjust upper bound
  )

# ----------------------------------------
# Step 5: Plot coefficients with confidence intervals
ggplot(all_models_df, aes(x = term, y = estimate_adj, color = Model)) +
  geom_point(position = position_dodge(width = 0.5)) +  # dodge points to avoid overlap
  geom_errorbar(aes(ymin = ymin, ymax = ymax), width = 0.2, position = position_dodge(width = 0.5)) +
  geom_hline(yintercept = 0, linetype = "dashed", color = "black", size = 0.5) +  # Reference line at 0
  scale_color_manual(values = c("Multivariate" = "#1B9E77", "Bivariate" = "#E7298A")) +
  theme_minimal() +
  theme(legend.position = "right") +
  labs(
    x = "Variable", 
    y = "Coefficient Estimate", 
    color = "Model Type",
    title = "Comparison of Bivariate vs. Multivariate Coefficient Estimates"
  ) +
  coord_flip()  # Flip coordinates so variables appear on the y-axis for readability

# ----------------------------------------
# Annotations:
# - Green dots = coefficients from the full multivariate model
# - Pink dots = coefficients from each bivariate model
# - Error bars = 95% confidence intervals
# - Dashed horizontal line = no effect (0)
# - Positive estimate: variable is associated with higher hate crime rate
# - Negative estimate: variable is associated with lower hate crime rate
# Multivariate regression model using SPLC data
model_splc <- lm(hatecrimes_splc ~ median_house_inc + 
                                         share_unemp_seas + 
                                         share_pop_metro + 
                                         share_pop_hs + 
                                         share_non_citizen + 
                                         share_white_poverty + 
                                         gini_index + 
                                         share_non_white + 
                                         share_vote_trump, 
                 data = hate_crimes)


modelsummary(list("FBI\n(avg annual per 100,000)" = model_fbi, "SPLC\n(hate incidents per 100,000 in 10 day period )"= model_splc),
             estimate = "{estimate}{stars}",
             statistic = NULL,
             gof_omit = "IC|Log|alg|pss|F|RMSE",
             notes = "Notes: + p < 0.1, * p < 0.05, ** p < 0.01, *** p < 0.001",
             output="huxtable")
FBI
(avg annual per 100,000)
SPLC
(hate incidents per 100,000 in 10 day period )
(Intercept)-42.347*-4.305
median_house_inc0.000-0.000
share_unemp_seas0.2850.049
share_pop_metro-0.026-0.003
share_pop_hs0.2120.036+
share_non_citizen0.1980.009
share_white_poverty0.0630.005
gini_index0.570**0.050*
share_non_white-0.037-0.003
share_vote_trump-0.022-0.013*
Num.Obs.4745
R20.4980.596
R2 Adj.0.3760.492
Notes: + p < 0.1, * p < 0.05, ** p < 0.01, *** p < 0.001
splc_map_data <- hate_crimes %>%
  mutate(state = tolower(state)) %>%
  inner_join(states_map, by = c("state" = "region"))

splc_map <- ggplot() +
  geom_polygon(
    data = splc_map_data,
    aes(
      x = long,
      y = lat,
      group = group,
      fill = hatecrimes_splc
    ),
    color = "white"
  ) +
  coord_fixed(1.3) +
  scale_fill_viridis(option = "plasma",
                     name = "SPLC Hate Incidents\nper 100k (10 days)",
                     guide = guide_colorbar(direction = "vertical")) +
  theme_void() +
  theme(
    legend.position = "right",
    plot.title = element_text(hjust = 0.5)
  ) +
  labs(title = "SPLC-Reported Hate Incidents Across States")

print(splc_map)