library(readxl)
library(dplyr)
library(tidyr)
library(ggplot2)
library(haven)
library(nnet)    
library(clarify) 
library(texreg)  
# Load the hate-crime incident extract from the Excel workbook.
DATA <- read_xlsx("hate_crime copy 2.xlsx")

print(colnames(DATA))
## [1] "data_year"          "pug_agency_name"    "state_abbr"        
## [4] "state_name"         "incident_date"      "offender_race"     
## [7] "offender_ethnicity" "offense_name"       "bias_desc"
# Rename by source column name instead of by position, so a reordered or
# extended spreadsheet fails loudly rather than silently mislabelling columns.
DATA <- DATA %>%
  rename(
    Year = data_year,
    `Pug Agency Name` = pug_agency_name,
    State = state_abbr,
    `State Name` = state_name,
    `Incident Date` = incident_date,
    `Offender Race` = offender_race,
    `Offender Ethnicity` = offender_ethnicity,
    Offense = offense_name,
    `Reason/Bias` = bias_desc
  )

# Offenses counted as violent for the binary outcome.
violent_crimes <- c("Aggravated Assault", "Murder and Nonnegligent Manslaughter",
                    "Rape", "Robbery")

# Substring matching is kept (presumably so rows listing several offenses still
# match -- TODO confirm the delimiter used in `Offense`), but the negative
# lookbehind stops "Rape" from also matching "Statutory Rape", which is not one
# of the four violent offenses above. Lookbehind requires perl = TRUE.
violent_pattern <- paste0(
  "(?<!Statutory )(?:", paste(violent_crimes, collapse = "|"), ")"
)

DATA <- DATA %>%
  mutate(
    Year = as.numeric(Year),
    # 1 = violent, 0 = everything else; a missing Offense yields 0 because
    # grepl() returns FALSE for NA input.
    violent_crime = as.numeric(grepl(violent_pattern, Offense, perl = TRUE)),
    `Offender Race` = as.factor(`Offender Race`),
    `Offender Ethnicity` = as.factor(`Offender Ethnicity`)
  )

# Nested logistic regressions for the violent/non-violent outcome:
# year only, then adding offender race, then adding offender ethnicity.
model_1 <- glm(
  violent_crime ~ Year,
  family = binomial(link = "logit"),
  data = DATA
)
model_2 <- glm(
  violent_crime ~ Year + `Offender Race`,
  family = binomial(link = "logit"),
  data = DATA
)
model_3 <- glm(
  violent_crime ~ Year + `Offender Race` + `Offender Ethnicity`,
  family = binomial(link = "logit"),
  data = DATA
)

# One row per model with its information criteria and log-likelihood.
# AIC()/BIC() called on several fits return a data frame; only the criterion
# column is kept.
model_comparison <- data.frame(
  Model = paste("Model", 1:3),
  AIC = AIC(model_1, model_2, model_3)$AIC,
  BIC = BIC(model_1, model_2, model_3)$BIC,
  LogLikelihood = vapply(
    list(model_1, model_2, model_3),
    function(fit) as.numeric(logLik(fit)),
    numeric(1)
  )
)

print(model_comparison)
##     Model      AIC      BIC LogLikelihood
## 1 Model 1 199453.1 199473.9     -99724.53
## 2 Model 2 179915.4 180009.4     -89948.68
## 3 Model 3 179412.7 179548.5     -89693.34
# Likelihood-ratio (chi-square) test: does adding offender race improve on the
# year-only model? Printed explicitly, like the other results in this script,
# so the table also appears when the file is run via source().
print(anova(model_1, model_2, test = "Chisq"))
## Analysis of Deviance Table
## 
## Model 1: violent_crime ~ Year
## Model 2: violent_crime ~ Year + `Offender Race`
##   Resid. Df Resid. Dev Df Deviance  Pr(>Chi)    
## 1    253774     199449                          
## 2    253767     179897  7    19552 < 2.2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
# Same test for adding offender ethnicity on top of year and race.
print(anova(model_2, model_3, test = "Chisq"))
## Analysis of Deviance Table
## 
## Model 1: violent_crime ~ Year + `Offender Race`
## Model 2: violent_crime ~ Year + `Offender Race` + `Offender Ethnicity`
##   Resid. Df Resid. Dev Df Deviance  Pr(>Chi)    
## 1    253767     179897                          
## 2    253763     179387  4   510.69 < 2.2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
# Fix the RNG seed, then draw 1000 simulated coefficient sets from model_3,
# requesting the "robust" covariance estimator.
set.seed(123)
sim_model_3 <- sim(model_3, n = 1000, vcov = "robust")

# Sanity check: the simulation object should be a `clarify_sim`.
print(class(sim_model_3))
## [1] "clarify_sim"
# Run sim_ame() once per predictor. As the recorded output shows, Year yields
# an average marginal effect, while the factor predictors yield average
# adjusted predictions for each level.
variables <- c("Year", "Offender Race", "Offender Ethnicity")

ame_results <- setNames(
  lapply(variables, function(focal) sim_ame(sim_model_3, var = focal)),
  variables
)
print(ame_results)
## $Year
## A `clarify_est` object (from `sim_ame()`)
##  - Average marginal effect of `Year`
##  - 1000 simulated values
##  - 1 quantity estimated:                          
##  E[dY/d(Year)] -0.00145698
## 
## $`Offender Race`
## A `clarify_est` object (from `sim_ame()`)
##  - Average adjusted predictions for `Offender Race`
##  - 1000 simulated values
##  - 8 quantities estimated:                                                           
##  E[Y(American Indian or Alaska Native)]          0.22218936
##  E[Y(Asian)]                                     0.20205824
##  E[Y(Black or African American)]                 0.27298622
##  E[Y(Multiple)]                                  0.26860282
##  E[Y(Native Hawaiian or Other Pacific Islander)] 0.19032922
##  E[Y(Not Specified)]                             0.05842007
## # ... and 2 more
## 
## $`Offender Ethnicity`
## A `clarify_est` object (from `sim_ame()`)
##  - Average adjusted predictions for `Offender Ethnicity`
##  - 1000 simulated values
##  - 5 quantities estimated:                                       
##  E[Y(Hispanic or Latino)]     0.2550358
##  E[Y(Multiple)]               0.1829382
##  E[Y(Not Hispanic or Latino)] 0.1398939
##  E[Y(Not Specified)]          0.1291668
##  E[Y(Unknown)]                0.1421222
# List the factor levels so the reference categories below can be chosen by name.
print(levels(DATA$`Offender Race`))
## [1] "American Indian or Alaska Native"         
## [2] "Asian"                                    
## [3] "Black or African American"                
## [4] "Multiple"                                 
## [5] "Native Hawaiian or Other Pacific Islander"
## [6] "Not Specified"                            
## [7] "Unknown"                                  
## [8] "White"
print(levels(DATA$`Offender Ethnicity`))
## [1] "Hispanic or Latino"     "Multiple"               "Not Hispanic or Latino"
## [4] "Not Specified"          "Unknown"
# Set reference values for prediction: mean year, White, Not Hispanic or Latino.
set_values <- sim_setx(sim_model_3, x = list(
  Year = mean(DATA$Year, na.rm = TRUE),
  `Offender Race` = "White",
  `Offender Ethnicity` = "Not Hispanic or Latino"
))

# sim_setx() already evaluates the model at the profile above for every
# simulated coefficient draw, so its result IS the prediction of interest
# (presumably on the response/probability scale by default -- confirm in the
# clarify documentation).
# The earlier sim_apply(FUN = predict, setx = set_values) call only forwarded
# `setx` to predict(), which ignored it and returned link-scale fitted values
# for all 253,776 rows; the recorded output following this block reflects that
# earlier call and should be regenerated.
predictions <- set_values
print(predictions)
# Point estimate with simulation-based confidence interval.
print(summary(predictions))
## A `clarify_est` object (from `sim_apply()`)
##  - 1000 simulated values
##  - 253776 quantities estimated:             
##  1 -0.7904012
##  2 -0.7904012
##  3 -0.7904012
##  4 -0.7904012
##  5 -1.3124653
##  6 -1.3124653
## # ... and 253770 more
# Compute first differences
# sim_ame() applies `contrast` only when the focal variable is restricted to
# two values; with the full multi-level factors it was ignored, which is why
# the recorded output following this block is identical to `ame_results`.
# Each non-baseline level is therefore contrasted with the baseline (first
# factor level, the same baseline the glm coefficients use), one pair at a time.
# Year is continuous, so its average marginal effect is reported unchanged.
fd_results <- lapply(variables, function(v) {
  focal_levels <- levels(DATA[[v]])

  # Non-factor predictor: no levels to contrast.
  if (is.null(focal_levels)) {
    return(sim_ame(sim_model_3, var = v))
  }

  baseline <- focal_levels[1]
  comparisons <- focal_levels[-1]

  pairwise <- lapply(comparisons, function(lvl) {
    # A named list restricts the focal variable to exactly these two values.
    # "diff" is expected to be E[Y(lvl)] - E[Y(baseline)]; confirm the
    # direction against the printed labels.
    sim_ame(
      sim_model_3,
      var = setNames(list(c(baseline, lvl)), v),
      contrast = "diff"
    )
  })
  names(pairwise) <- paste(comparisons, "vs", baseline)
  pairwise
})

names(fd_results) <- variables
print(fd_results)
## $Year
## A `clarify_est` object (from `sim_ame()`)
##  - Average marginal effect of `Year`
##  - 1000 simulated values
##  - 1 quantity estimated:                          
##  E[dY/d(Year)] -0.00145698
## 
## $`Offender Race`
## A `clarify_est` object (from `sim_ame()`)
##  - Average adjusted predictions for `Offender Race`
##  - 1000 simulated values
##  - 8 quantities estimated:                                                           
##  E[Y(American Indian or Alaska Native)]          0.22218936
##  E[Y(Asian)]                                     0.20205824
##  E[Y(Black or African American)]                 0.27298622
##  E[Y(Multiple)]                                  0.26860282
##  E[Y(Native Hawaiian or Other Pacific Islander)] 0.19032922
##  E[Y(Not Specified)]                             0.05842007
## # ... and 2 more
## 
## $`Offender Ethnicity`
## A `clarify_est` object (from `sim_ame()`)
##  - Average adjusted predictions for `Offender Ethnicity`
##  - 1000 simulated values
##  - 5 quantities estimated:                                       
##  E[Y(Hispanic or Latino)]     0.2550358
##  E[Y(Multiple)]               0.1829382
##  E[Y(Not Hispanic or Latino)] 0.1398939
##  E[Y(Not Specified)]          0.1291668
##  E[Y(Unknown)]                0.1421222

Analysis of Hate Crimes: Predicting Violence through Logistic Regression

In this analysis, using the FBI Hate Crime Dataset, I examined factors that influence whether a hate crime is violent or non-violent by using logistic regression models. The dependent variable, violent_crime, was coded as 1 for violent crimes (aggravated assault, murder and nonnegligent manslaughter, rape, and robbery) and 0 for non-violent crimes. Each model added more variables to see how well they explained the likelihood of a hate crime being violent.

The first model included only the year of the incident, while the second model added the offender’s race. The third model expanded on this by including offender ethnicity. To compare the models, I used the Akaike Information Criterion (AIC) and Bayesian Information Criterion (BIC), where lower values indicate better model fit. The results showed that adding race substantially improved the model, with the AIC dropping from 199,453.06 in Model 1 to 179,915.36 in Model 2. A likelihood ratio test confirmed that this change was statistically significant. Adding offender ethnicity in Model 3 improved the model by a much smaller margin, reducing the AIC to 179,412.68. The likelihood ratio test showed this change was also statistically significant, and Model 3 has the lowest AIC and BIC of the three models, but the smaller improvement suggests that offender ethnicity contributes less than race in predicting violent hate crimes. Because Model 3 fits best on every criterion used here, I carried it forward for the rest of the analysis, while noting that most of its gain over Model 1 comes from offender race, which first enters in Model 2.

Having selected Model 3, I used the clarify package to gain a deeper understanding of how offender characteristics impact the likelihood of a hate crime being violent. The clarify package allowed me to compute average marginal effects (AMEs) and predicted probabilities for offender characteristics, to help interpret the practical significance of these factors beyond just statistical fit. The average marginal effect of year was estimated at -0.00146, meaning that each additional year is associated with a very slight decrease in the probability of a violent hate crime. While statistically significant, the effect size is very small, suggesting that time alone is not a strong predictor of violent hate crimes. Looking at offender race, the model predicted different probabilities of a hate crime being violent depending on the racial category. Offenders identified as Black or African American had the highest estimated probability at 0.273, followed closely by those categorized as multiple races at 0.269. White offenders had a lower estimated probability, while the lowest probability was found among cases where the race of the offender was “Not Specified” at 0.058, which may reflect missing or incomplete data. These findings do not necessarily indicate that certain racial groups commit more violent hate crimes but could instead reflect disparities in crime reporting, law enforcement practices, and data collection methods. Systematic biases in policing, legal definitions of hate crime severity, or broader social factors may influence these numbers. A similar pattern emerged when examining offender ethnicity. The results showed that offenders categorized as Hispanic or Latino had the highest estimated probability of committing a violent hate crime at 0.255, while those categorized as Not Hispanic or Latino had a lower probability at 0.140. Cases labeled “Not Specified” had the lowest probability at 0.129, while “Unknown” cases (0.142) were close to the Not Hispanic or Latino estimate. 
Like race, these findings may be influenced by factors beyond offender behavior, such as differences in reporting, law enforcement classification methods, and social dynamics surrounding crime categorization.

These findings suggest that offender race has a strong impact on predicting whether a hate crime is violent, while offender ethnicity has a smaller but still significant effect. The impact of year is minimal, suggesting that other factors, such as social dynamics, law enforcement focus, and crime classification policies, play a larger role. Given the possibility of reporting and classification biases, these results should be interpreted with caution, as they do not necessarily reflect inherent differences in criminal behavior across racial or ethnic groups. Instead, they may be shaped by structural factors that influence how hate crimes are reported, investigated, and recorded. This analysis highlights how different offender characteristics correlate with the likelihood of a hate crime being violent. While statistical models suggest that race is a key predictor, it is essential to consider how systematic biases in crime data and law enforcement practices may shape these results. Further research is needed to explore the social and systemic factors driving these differences and to account for potential biases in how hate crimes are reported and recorded.