Understanding crime patterns is critical for law enforcement and policymakers. This analysis aims to predict whether a crime results in a fatal or non-fatal outcome using logistic regression and other modeling techniques.
# Import Library
library(ggplot2)
library(dplyr)
library(car)
library(scales)
library(ggthemes)
library(broom)# Data Overview
# Read the crime records from the CSV file in the working directory.
crime_data <- utils::read.csv(file = "crime_data.csv")
# Print a compact overview of the columns: names, types and leading values.
utils::str(object = crime_data)
## 'data.frame': 6638 obs. of 12 variables:
## $ Disposition : chr "CLOSED" "CLOSED" "CLOSED" "CLOSED" ...
## $ OffenderStatus : chr "ARRESTED" "ARRESTED" "ARRESTED" "ARRESTED" ...
## $ Offender_Race : chr "BLACK" "BLACK" "BLACK" "BLACK" ...
## $ Offender_Gender : chr "MALE" "MALE" "MALE" "MALE" ...
## $ Offender_Age : num 30 30 30 30 30 27 27 27 27 22 ...
## $ PersonType : chr "VICTIM" "VICTIM" "VICTIM" "VICTIM" ...
## $ Victim_Race : chr "BLACK" "BLACK" "BLACK" "BLACK" ...
## $ Victim_Gender : chr "FEMALE" "FEMALE" "FEMALE" "FEMALE" ...
## $ Victim_Age : num 29 29 29 29 29 62 39 50 50 27 ...
## $ Victim_Fatal_Status: chr "Non-fatal" "Non-fatal" "Non-fatal" "Non-fatal" ...
## $ Report.Type : chr "Supplemental Report" "Supplemental Report" "Supplemental Report" "Supplemental Report" ...
## $ Category : chr "Theft" "Theft" "Theft" "Theft" ...
## Disposition OffenderStatus Offender_Race Offender_Gender
## Length:6638 Length:6638 Length:6638 Length:6638
## Class :character Class :character Class :character Class :character
## Mode :character Mode :character Mode :character Mode :character
##
##
##
## Offender_Age PersonType Victim_Race Victim_Gender
## Min. :11.00 Length:6638 Length:6638 Length:6638
## 1st Qu.:25.00 Class :character Class :character Class :character
## Median :33.00 Mode :character Mode :character Mode :character
## Mean :34.63
## 3rd Qu.:42.00
## Max. :83.00
## Victim_Age Victim_Fatal_Status Report.Type Category
## Min. : 0.00 Length:6638 Length:6638 Length:6638
## 1st Qu.: 27.00 Class :character Class :character Class :character
## Median : 36.00 Mode :character Mode :character Mode :character
## Mean : 38.59
## 3rd Qu.: 48.00
## Max. :124.00
# Data Cleaning & Preprocessing
# We categorize offender age groups and handle missing values if necessary.
# Create age groups
# Bucket offenders into three age bands: under 30, 30 to 49, and 50 or over.
# Every band has an explicit condition, so a missing Offender_Age stays NA
# instead of silently landing in "Older" through a catch-all TRUE branch.
crime_data <- crime_data %>%
  mutate(Offender_Age_Group = case_when(
    Offender_Age < 30 ~ "Young",
    Offender_Age >= 30 & Offender_Age < 50 ~ "Middle-aged",
    Offender_Age >= 50 ~ "Older"
  ))
# Tabulate the bands (this call produces the counts shown below).
# useNA = "ifany" adds an NA column only when unclassified rows exist.
table(crime_data$Offender_Age_Group, useNA = "ifany")
##
## Middle-aged Older Young
## 3133 951 2554
# Crime Category Distribution
# Bar chart counting the records that fall in each crime category.
ggplot(data = crime_data, mapping = aes(x = Category)) +
  geom_bar(fill = "steelblue") +
  ggtitle("Crime Category Distribution") +
  xlab("Crime Type") +
  ylab("Count") +
  theme_minimal() +
  # Tilt the category labels so long names do not overlap.
  theme(axis.text.x = element_text(angle = 45, hjust = 1))
# Offender Race Distribution
# Bar chart of record counts per offender race; each race gets its own
# colour from the ColorBrewer "Set1" palette.
ggplot(data = crime_data) +
  geom_bar(mapping = aes(x = Offender_Race, fill = Offender_Race)) +
  scale_fill_brewer(palette = "Set1") +
  labs(title = "Offender Race Distribution", x = "Race", y = "Count") +
  theme_minimal()
# Victim Age Distribution
# Histogram of victim ages in 5-year bins.
ggplot(data = crime_data, mapping = aes(x = Victim_Age)) +
  geom_histogram(
    binwidth = 5,
    fill = "coral",
    colour = "black",
    alpha = 0.7
  ) +
  labs(title = "Victim Age Distribution", x = "Age", y = "Count") +
  theme_minimal()
# Offender Gender vs. Victim Gender
# Side-by-side bars: for each offender gender, one bar per victim gender.
ggplot(
  data = crime_data,
  mapping = aes(x = Offender_Gender, fill = Victim_Gender)
) +
  geom_bar(position = position_dodge()) +
  # Fixed colours keyed by the gender labels used in the data.
  scale_fill_manual(values = c("FEMALE" = "pink", "MALE" = "blue")) +
  labs(
    title = "Offender Gender vs Victim Gender",
    x = "Offender Gender",
    y = "Count"
  ) +
  theme_minimal()
# Fatal vs Non-Fatal Victims by Crime Type
# Stacked bars rescaled to a common height of 1, so each bar shows how the
# Victim_Fatal_Status values split within one crime category.
ggplot(
  data = crime_data,
  mapping = aes(x = Category, fill = Victim_Fatal_Status)
) +
  geom_bar(position = position_fill()) +
  labs(
    title = "Proportion of Fatal vs Non-Fatal Cases by Crime Type",
    x = "Crime Category",
    y = "Proportion"
  ) +
  theme_minimal() +
  # Tilt the category labels so long names do not overlap.
  theme(axis.text.x = element_text(angle = 45, hjust = 1))
# Age Distribution of Offenders and Victims
# Compare the age distributions of offenders and victims.
# Each record carries both an Offender_Age and a Victim_Age, so the two
# columns are stacked into one long table with a role label. Plotting
# Victim_Age alone (as before) never showed any offender ages, despite the
# title and the "OFFENDER" colour key.
# NOTE(review): assumes the plot is meant to contrast the two age columns,
# as its title says -- confirm against the intended use of PersonType.
age_by_role <- bind_rows(
  transmute(crime_data, PersonType = "OFFENDER", Age = Offender_Age),
  transmute(crime_data, PersonType = "VICTIM", Age = Victim_Age)
)
ggplot(age_by_role, aes(x = PersonType, y = Age, fill = PersonType)) +
  geom_boxplot() +
  theme_minimal() +
  labs(
    title = "Age Distribution of Offenders and Victims",
    x = "Person Type",
    y = "Age"
  ) +
  scale_fill_manual(values = c("VICTIM" = "red", "OFFENDER" = "blue"))
# Advanced Visualizations
# Heatmap of Offender vs. Victim Race
# Tile map of how often each offender-race / victim-race pair occurs.
# The fill uses after_stat(count), i.e. the per-cell count computed by
# geom_bin2d(). This replaces the `..count..` dot-dot notation, which was
# deprecated in ggplot2 3.4.0 and raised a lifecycle warning.
ggplot(
  crime_data,
  aes(x = Offender_Race, y = Victim_Race, fill = after_stat(count))
) +
  geom_bin2d() +
  scale_fill_gradient(low = "lightblue", high = "darkred") +
  theme_minimal() +
  labs(
    title = "Heatmap of Offender vs Victim Race",
    x = "Offender Race",
    y = "Victim Race"
  )
# Boxplot of Victim Age by Crime Category
# One box per crime category, coloured from the ColorBrewer "Set3" palette.
ggplot(
  data = crime_data,
  mapping = aes(x = Category, y = Victim_Age, fill = Category)
) +
  geom_boxplot() +
  scale_fill_brewer(palette = "Set3") +
  labs(
    title = "Age Distribution of Victims by Crime Category",
    x = "Crime Category",
    y = "Victim Age"
  ) +
  theme_minimal() +
  # Tilt the category labels so long names do not overlap.
  theme(axis.text.x = element_text(angle = 45, hjust = 1))
# Pie Chart: Proportion of Offender Gender
# Count records per offender gender, then express each count as a share of
# the total (in percent).
crime_gender <- count(crime_data, Offender_Gender)
crime_gender <- mutate(crime_gender, percentage = n / sum(n) * 100)
# A single full-width stacked bar wrapped onto polar coordinates gives a pie.
ggplot(
  data = crime_gender,
  mapping = aes(x = "", y = percentage, fill = Offender_Gender)
) +
  geom_col(width = 1) +
  coord_polar(theta = "y", start = 0) +
  theme_void() +
  scale_fill_manual(values = c("MALE" = "blue", "FEMALE" = "pink")) +
  labs(title = "Proportion of Offender Gender")
# Dot Plot: Victim Age vs. Offender Age
# Scatter of victim age against offender age; points are semi-transparent
# so overlapping records remain visible.
ggplot(
  data = crime_data,
  mapping = aes(x = Offender_Age, y = Victim_Age)
) +
  geom_point(colour = "darkred", alpha = 0.5) +
  labs(
    title = "Victim Age vs. Offender Age",
    x = "Offender Age",
    y = "Victim Age"
  ) +
  theme_minimal()
# Density Plot: Age Distribution of Offenders and Victims
# Overlay the two age densities on one axis. The constant strings inside
# aes() create the legend keys that scale_fill_manual() colours.
ggplot(data = crime_data) +
  geom_density(
    mapping = aes(x = Offender_Age, fill = "Offender"),
    alpha = 0.5
  ) +
  geom_density(
    mapping = aes(x = Victim_Age, fill = "Victim"),
    alpha = 0.5
  ) +
  scale_fill_manual(values = c("Offender" = "blue", "Victim" = "red")) +
  labs(
    title = "Density Plot of Offender and Victim Ages",
    x = "Age",
    y = "Density"
  ) +
  theme_minimal()
# Regression Models for Crime Data Analysis
# Ordinary least squares fit: victim age regressed on offender age.
lm_model <- lm(
  formula = Victim_Age ~ Offender_Age,
  data = crime_data
)
# Print the coefficient table, residual summary and fit statistics.
summary(object = lm_model)
##
## Call:
## lm(formula = Victim_Age ~ Offender_Age, data = crime_data)
##
## Residuals:
## Min 1Q Median 3Q Max
## -49.574 -10.070 -2.681 8.690 81.897
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 26.85519 0.51957 51.69 <2e-16 ***
## Offender_Age 0.33884 0.01407 24.08 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 14.67 on 6636 degrees of freedom
## Multiple R-squared: 0.08035, Adjusted R-squared: 0.08021
## F-statistic: 579.8 on 1 and 6636 DF, p-value: < 2.2e-16
# Visualizing the Regression Line
# Scatter of the raw ages with the fitted straight line on top; se = TRUE
# draws the confidence band around the line.
ggplot(
  data = crime_data,
  mapping = aes(x = Offender_Age, y = Victim_Age)
) +
  geom_point(colour = "darkblue", alpha = 0.5) +
  geom_smooth(method = "lm", colour = "red", se = TRUE) +
  labs(
    title = "Linear Regression: Victim Age vs. Offender Age",
    x = "Offender Age",
    y = "Victim Age"
  ) +
  theme_minimal()
## `geom_smooth()` using formula = 'y ~ x'
# Logistic Regression: Predicting Fatal vs Non-Fatal Cases
# Encode the outcome as a 0/1 indicator: 1 when Victim_Fatal_Status is
# "Fatal", 0 for any other value.
crime_data$Fatal_Binary <- ifelse(
  crime_data$Victim_Fatal_Status == "Fatal",
  1,
  0
)
# Binomial GLM of the fatal indicator on offender age, victim age,
# offender gender and offender race.
# NOTE(review): the printed summary shows standard errors near 1955 on the
# intercept and several Offender_Race terms, plus 16 Fisher scoring
# iterations. That pattern usually signals (quasi-)complete separation, so
# those race coefficients should not be interpreted as they stand.
logit_model <- glm(
  formula = Fatal_Binary ~ Offender_Age + Victim_Age + Offender_Gender +
    Offender_Race,
  family = binomial,
  data = crime_data
)
# Print the coefficient table and deviance statistics.
summary(object = logit_model)
##
## Call:
## glm(formula = Fatal_Binary ~ Offender_Age + Victim_Age + Offender_Gender +
## Offender_Race, family = binomial, data = crime_data)
##
## Coefficients:
## Estimate Std. Error
## (Intercept) -15.65135 1955.60077
## Offender_Age -0.05166 0.02149
## Victim_Age -0.01274 0.01456
## Offender_GenderMALE 0.53292 0.54943
## Offender_RaceASIAN 0.46265 2162.86833
## Offender_RaceBLACK 11.46446 1955.60054
## Offender_RaceNATIVE HAWAIIAN OR OTHER PACIFIC ISLANDER -0.55109 2534.04113
## Offender_RaceUNKNOWN 13.34083 1955.60065
## Offender_RaceWHITE 12.20693 1955.60057
## z value Pr(>|z|)
## (Intercept) -0.008 0.9936
## Offender_Age -2.404 0.0162 *
## Victim_Age -0.875 0.3817
## Offender_GenderMALE 0.970 0.3321
## Offender_RaceASIAN 0.000 0.9998
## Offender_RaceBLACK 0.006 0.9953
## Offender_RaceNATIVE HAWAIIAN OR OTHER PACIFIC ISLANDER 0.000 0.9998
## Offender_RaceUNKNOWN 0.007 0.9946
## Offender_RaceWHITE 0.006 0.9950
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 317.79 on 6637 degrees of freedom
## Residual deviance: 303.34 on 6629 degrees of freedom
## AIC: 321.34
##
## Number of Fisher Scoring iterations: 16
Key Takeaways
Note: the estimates quoted below (an intercept of -5.85, Offender Age Group terms, and a "Senior" group) do not match the model summary printed above, which uses continuous Offender_Age together with Offender_Race. They appear to come from a different specification built on Offender_Age_Group and should be re-derived before being relied on.
1. Significant Predictors
- Intercept (-5.85, p < 0.001) → Significant. The base category (likely “Young” offenders) has a very low probability of committing fatal crimes.
- Offender Age Group: Middle-aged (p = 0.389) → Not significant. The Older and Senior groups have extreme negative coefficients (-15.75, p > 0.98). These groups might have very few fatal cases, causing unreliable estimates; the high standard errors suggest sparse data in these age groups.
- Victim Age (-0.01004, p = 0.499) → Not significant. There is no strong relationship between victim age and crime fatality.
- Offender Gender (Male: 0.59749, p = 0.276) → Not significant. No strong effect was detected for gender.
What This Means: The extreme values for the Older and Senior age groups suggest data sparsity issues.
This project provided valuable insights into crime data, specifically examining the factors influencing fatal and non-fatal incidents. Through exploratory data analysis and regression modeling, we identified key trends and relationships between offender and victim characteristics.
Key Findings:
Offender & Victim Age:
Younger offenders were more likely to be involved in fatal incidents. Victim age had a minor, statistically insignificant effect on the fatality outcome.
Gender Influence:
Male offenders showed a slightly higher likelihood of involvement in fatal incidents, though the effect was not statistically strong.
Race & Crime Outcomes:
Due to data limitations, race-based conclusions were inconclusive, as certain categories had extreme coefficient values and large standard errors, indicating possible data sparsity issues.
Model Performance:
The logistic regression models indicated that offender age was the most significant predictor of fatal incidents.