This R Markdown report performs a complete analysis of the Crime Dataset (India), including data cleaning, descriptive statistics, exploratory analysis, modeling, clustering, association rule mining, and forecasting.
# Libraries used throughout the report
library(tidyverse)   # dplyr, ggplot2, and friends
library(lubridate)   # date-time parsing
library(janitor)     # clean_names()
file_path <- 'C:/Users/abidm/OneDrive/Desktop/crime_dataset_india.csv'
crime <- read.csv(file_path, stringsAsFactors = FALSE)
crime <- clean_names(crime)  # standardise column names to snake_case
cat("Rows:", nrow(crime), " Columns:", ncol(crime))
## Rows: 40160 Columns: 14
head(crime)
## report_number date_reported date_occurred time_of_occurrence city
## 1 1 02-01-2020 00:00 01-01-2020 00:00 01-01-2020 01:11 Ahmedabad
## 2 2 01-01-2020 19:00 01-01-2020 01:00 01-01-2020 06:26 Chennai
## 3 3 02-01-2020 05:00 01-01-2020 02:00 01-01-2020 14:30 Ludhiana
## 4 4 01-01-2020 05:00 01-01-2020 03:00 01-01-2020 14:46 Pune
## 5 5 01-01-2020 21:00 01-01-2020 04:00 01-01-2020 16:51 Pune
## 6 6 02-01-2020 03:00 01-01-2020 05:00 01-01-2020 17:09 Delhi
## crime_code crime_type victim_age victim_gender weapon crime_domain
## 1 576 IDENTITY THEFT 16 M Blunt Object Violent Crime
## 2 128 HOMICIDE 37 M Poison Other Crime
## 3 271 KIDNAPPING 48 F Blunt Object Other Crime
## 4 170 BURGLARY 49 F Firearm Other Crime
## 5 421 VANDALISM 30 F Other Other Crime
## 6 442 ASSAULT 16 M Firearm Violent Crime
## police_deployed case_closed date_case_closed
## 1 13 No
## 2 9 No
## 3 15 No
## 4 1 Yes 29-04-2020 05:00
## 5 18 Yes 08-01-2020 21:00
## 6 18 Yes 30-03-2020 03:00
# Derive date features; this dataset's occurrence column is 'date_occurred'
date_cols <- intersect(c('date_occurred','date_reported','date','incident_date','crime_date','datetime'), names(crime))
if(length(date_cols) > 0){
dcol <- date_cols[1]
crime <- crime %>% mutate(raw_date = .data[[dcol]])
crime <- crime %>% mutate(parsed_date = parse_date_time(raw_date, orders=c('d-m-Y H:M','Y-m-d','d/m/Y','m/d/Y','Y/m/d','Ymd HMS','Ymd')))
crime <- crime %>% mutate(
year = year(parsed_date),
month = month(parsed_date),
day = day(parsed_date),
wday = wday(parsed_date, label=TRUE),
hour = hour(parsed_date)
)
}
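A quick sanity check (a minimal sketch, assuming the block above matched a date column) reports how many rows parsed and the span of dates covered:
# How many dates parsed, and the range they cover
if ("parsed_date" %in% names(crime)) {
cat("Parsed:", sum(!is.na(crime$parsed_date)), "of", nrow(crime), "rows\n")
print(range(crime$parsed_date, na.rm = TRUE))  # earliest and latest occurrence
}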
# Time of day category
if("hour" %in% names(crime)){
crime <- crime %>% mutate(time_of_day = case_when(
hour >= 5 & hour < 12 ~ "Morning",
hour >= 12 & hour < 17 ~ "Afternoon",
hour >= 17 & hour < 21 ~ "Evening",
TRUE ~ "Night"
))
}
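As a quick check (assuming `hour` and `time_of_day` were derived above), the split across the four buckets can be tabulated:
# Count incidents per time-of-day bucket
if ("time_of_day" %in% names(crime)) {
print(table(crime$time_of_day))
}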
head(crime)
## report_number date_reported date_occurred time_of_occurrence city
## 1 1 02-01-2020 00:00 01-01-2020 00:00 01-01-2020 01:11 Ahmedabad
## 2 2 01-01-2020 19:00 01-01-2020 01:00 01-01-2020 06:26 Chennai
## 3 3 02-01-2020 05:00 01-01-2020 02:00 01-01-2020 14:30 Ludhiana
## 4 4 01-01-2020 05:00 01-01-2020 03:00 01-01-2020 14:46 Pune
## 5 5 01-01-2020 21:00 01-01-2020 04:00 01-01-2020 16:51 Pune
## 6 6 02-01-2020 03:00 01-01-2020 05:00 01-01-2020 17:09 Delhi
## crime_code crime_type victim_age victim_gender weapon crime_domain
## 1 576 IDENTITY THEFT 16 M Blunt Object Violent Crime
## 2 128 HOMICIDE 37 M Poison Other Crime
## 3 271 KIDNAPPING 48 F Blunt Object Other Crime
## 4 170 BURGLARY 49 F Firearm Other Crime
## 5 421 VANDALISM 30 F Other Other Crime
## 6 442 ASSAULT 16 M Firearm Violent Crime
## police_deployed case_closed date_case_closed
## 1 13 No
## 2 9 No
## 3 15 No
## 4 1 Yes 29-04-2020 05:00
## 5 18 Yes 08-01-2020 21:00
## 6 18 Yes 30-03-2020 03:00
num_cols <- crime %>% select(where(is.numeric)) %>% names()
cat("Numeric Columns:\n")
## Numeric Columns:
print(num_cols)
## [1] "report_number" "crime_code" "victim_age" "police_deployed"
# Exploratory Data Analysis
ggplot(crime, aes(x = crime_type)) +
geom_bar(fill="lightgreen") +
labs(title="Crime Count by Type") +
theme(axis.text.x = element_text(angle = 45, hjust = 1))
## Crime Hotspots (Location-Based Analysis)
crime %>%
group_by(city) %>%
summarise(total = n()) %>%
ggplot(aes(x = reorder(city, total), y = total)) +
geom_bar(stat="identity", fill="orange") +
coord_flip() +
labs(title="Crime Count by State")
## City-wise Crime Frequency (Top 10 Cities)
top_cities <- crime %>% count(city, sort=TRUE) %>% slice(1:10) %>% pull(city)
crime %>%
filter(city %in% top_cities) %>%
ggplot(aes(x = city)) +
geom_bar(fill="darkorange") +
labs(title="Top 10 Cities with Highest Crime Frequency") +
theme(axis.text.x = element_text(angle=45, hjust=1))
## Top 10 Crime Types
crime %>%
count(crime_type, sort=TRUE) %>%
slice(1:10) %>%
ggplot(aes(x = reorder(crime_type, n), y = n)) +
geom_bar(stat="identity", fill="skyblue") +
coord_flip() +
labs(title="Top 10 Crime Types", x="Crime Type", y="Count")
crime %>%
count(crime_type) %>% # count each crime type
mutate(prop = n / sum(n)) %>% # calculate proportions
ggplot(aes(x = "", y = prop, fill = crime_type)) +
geom_col(width = 1) + # bar to be converted to pie
coord_polar(theta = "y") + # convert to pie
labs(title = "Crime Proportions by Type",
fill = "Crime Type") +
theme_void() # remove axes
ggplot(crime, aes(x = crime_type)) +
geom_bar(fill = "steelblue") +
labs(
title = "Count of Crimes by Type",
x = "Crime Type",
y = "Count"
) +
theme_minimal() +
theme(axis.text.x = element_text(angle = 45, hjust = 1))
ggplot(crime, aes(x = victim_gender)) +
geom_bar(fill = "tomato") +
labs(
title = "Count of Victim Gender",
x = "Gender",
y = "Count"
) +
theme_minimal()
# Reload the dataset for the modeling sections, this time with factor columns
library(tidyverse)
df <- read.csv("crime_dataset_india.csv", stringsAsFactors = TRUE)
df <- janitor::clean_names(df)
str(df)
## 'data.frame': 40160 obs. of 14 variables:
## $ report_number : int 1 2 3 4 5 6 7 8 9 10 ...
## $ date_reported : Factor w/ 25546 levels "01-01-2020 05:00",..: 837 3 839 1 4 838 2 842 2518 1671 ...
## $ date_occurred : Factor w/ 40160 levels "01-01-2020 00:00",..: 1 2 3 4 5 6 7 8 9 10 ...
## $ time_of_occurrence: Factor w/ 39886 levels "01-01-2020 01:11",..: 1 2 4 5 6 7 3 1306 1307 9 ...
## $ city : Factor w/ 29 levels "Agra","Ahmedabad",..: 2 5 16 22 22 6 5 5 18 5 ...
## $ crime_code : int 576 128 271 170 421 442 172 169 338 497 ...
## $ crime_type : Factor w/ 21 levels "ARSON","ASSAULT",..: 12 11 14 3 20 2 21 4 8 15 ...
## $ victim_age : int 16 37 48 49 30 16 64 78 41 29 ...
## $ victim_gender : Factor w/ 3 levels "F","M","X": 2 2 1 1 1 2 1 3 3 2 ...
## $ weapon : Factor w/ 7 levels "Blunt Object",..: 1 7 1 3 6 3 4 4 1 4 ...
## $ crime_domain : Factor w/ 4 levels "Fire Accident",..: 4 2 2 2 2 4 4 2 2 2 ...
## $ police_deployed : int 13 9 15 1 18 18 13 8 1 4 ...
## $ case_closed : Factor w/ 2 levels "No","Yes": 1 1 1 2 2 2 2 1 1 1 ...
## $ date_case_closed : Factor w/ 16012 levels "","01-01-2021 00:00",..: 1 1 1 14810 3647 15249 12152 1 1 1 ...
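With stringsAsFactors = TRUE the date columns load as factors. A minimal sketch (assuming the dd-mm-YYYY HH:MM format visible in the str() output) converts them to POSIXct date-times for any time-based analysis:
library(lubridate)
# Parse the dd-mm-YYYY HH:MM factors into proper date-times
df$date_occurred <- dmy_hm(as.character(df$date_occurred))
df$date_reported <- dmy_hm(as.character(df$date_reported))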
# Select useful predictor variables
reg_data <- df %>%
select(police_deployed, victim_age, victim_gender, crime_domain)
# Remove rows with missing values
reg_data <- reg_data %>% na.omit()
# Convert categorical variables to factors
reg_data$victim_gender <- as.factor(reg_data$victim_gender)
reg_data$crime_domain <- as.factor(reg_data$crime_domain)
# ------------------------------
# Build Linear Regression Model
# ------------------------------
model <- lm(police_deployed ~ victim_age + victim_gender + crime_domain,
data = reg_data)
summary(model)
##
## Call:
## lm(formula = police_deployed ~ victim_age + victim_gender + crime_domain,
## data = reg_data)
##
## Residuals:
## Min 1Q Median 3Q Max
## -9.1102 -4.9760 -0.0073 4.9657 9.0967
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 9.9419455 0.1093082 90.953 <2e-16 ***
## victim_age 0.0008555 0.0013494 0.634 0.526
## victim_genderM -0.0488855 0.0597065 -0.819 0.413
## victim_genderX 0.0220883 0.0907735 0.243 0.808
## crime_domainOther Crime 0.0500698 0.0955031 0.524 0.600
## crime_domainTraffic Fatality 0.0785998 0.1530763 0.513 0.608
## crime_domainViolent Crime 0.0273697 0.1021017 0.268 0.789
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 5.468 on 40153 degrees of freedom
## Multiple R-squared: 4.278e-05, Adjusted R-squared: -0.0001066
## F-statistic: 0.2863 on 6 and 40153 DF, p-value: 0.9437
A linear regression model was fitted to predict the number of police deployed from victim age, victim gender, and crime domain. None of the coefficients is statistically significant, the adjusted R-squared is effectively zero, and the overall F-test (p = 0.94) cannot reject the intercept-only model: in this dataset, victim demographics and crime domain carry no detectable signal about how many police are deployed.
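Before accepting that conclusion it is worth checking the residuals; a minimal diagnostic sketch using base R's plot method for fitted lm objects:
# Four standard diagnostics: residuals vs fitted, normal Q-Q,
# scale-location, and residuals vs leverage
par(mfrow = c(2, 2))
plot(model)
par(mfrow = c(1, 1))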
# --- K-Nearest Neighbors (KNN) Regression ---
# Load essential libraries
library(caret)
library(dplyr)
library(janitor)
# 1. Load and Prepare Data
df <- read.csv("crime_dataset_india.csv", stringsAsFactors = TRUE)
df <- clean_names(df)
# Create the full dataset with only the required columns
# Target variable is renamed to PoliceDeployed for convenience
modeling_data <- df %>%
select(PoliceDeployed = police_deployed, victim_age, victim_gender, crime_domain)
# 2. Data Splitting (70% Train, 30% Test)
# Splitting the raw data frame (caret handles scaling/encoding)
set.seed(42)
train_index <- createDataPartition(modeling_data$PoliceDeployed, p = 0.7, list = FALSE)
train_data <- modeling_data[train_index, ]
test_data <- modeling_data[-train_index, ]
# 3. Train KNN Regression Model
# With the formula interface, caret automatically creates dummy variables
# for factor predictors, so preProcess needs no explicit encoding step.
knn_model <- train(
PoliceDeployed ~ victim_age + victim_gender + crime_domain,
data = train_data,
method = "knn",
tuneGrid = data.frame(k = 5),
trControl = trainControl(method = "none"),
# Only specify 'center' and 'scale' for normalization (essential for KNN)
preProcess = c("center", "scale")
)
# 4. Evaluate the Model
# predict() applies the stored center/scale preprocessing to the raw test data
predictions <- predict(knn_model, test_data)
# Extract the true test values
y_test <- test_data$PoliceDeployed
# Calculate Root Mean Squared Error (RMSE)
rmse <- sqrt(mean((y_test - predictions)^2))
# Print the final evaluation metric
print(paste("KNN Regression (k=5) RMSE:", round(rmse, 4)))
## [1] "KNN Regression (k=5) RMSE: 5.536"
# Install required packages if you haven't already
# install.packages(c("ggplot2", "dplyr"))
library(ggplot2)
library(dplyr)
# 1. Load the data. Note that clean_names() is not applied in this section,
# so columns keep their original headers (e.g. Police_Deployed).
df <- read.csv("crime_dataset_india.csv", stringsAsFactors = TRUE)
# Select the numeric variable for analysis: Police_Deployed
numeric_data <- df$Police_Deployed
# --- Method A: Z-score Detection ---
print("--- 1. Outlier Detection using Z-score ---")
## [1] "--- 1. Outlier Detection using Z-score ---"
# Calculate Z-scores for Police_Deployed
z_scores <- as.vector(scale(numeric_data))
# Identify indices of outliers (absolute Z-score > 3)
z_outlier_indices <- which(abs(z_scores) > 3)
# Print the count and the corresponding Police_Deployed values
print(paste("Total number of Z-score outliers (> 3 SD):", length(z_outlier_indices)))
## [1] "Total number of Z-score outliers (> 3 SD): 0"
if(length(z_outlier_indices) > 0) {
print("Example Police_Deployed values corresponding to Z-score outliers:")
print(head(numeric_data[z_outlier_indices]))
} else {
print("No Z-score outliers found (i.e., no values are more than 3 standard deviations from the mean).")
}
## [1] "No Z-score outliers found (i.e., no values are more than 3 standard deviations from the mean)."
# --- Method B: IQR Detection (Tukey's Method) ---
print("\n--- 2. Outlier Detection using IQR (1.5 * IQR Rule) ---")
## [1] "\n--- 2. Outlier Detection using IQR (1.5 * IQR Rule) ---"
# Calculate the quartiles and IQR
Q1 <- quantile(numeric_data, 0.25)
Q3 <- quantile(numeric_data, 0.75)
IQR_val <- Q3 - Q1
# Define the boundaries for non-outlier data
lower_bound <- Q1 - 1.5 * IQR_val
upper_bound <- Q3 + 1.5 * IQR_val
# Identify indices of outliers
iqr_outlier_indices <- which(numeric_data < lower_bound | numeric_data > upper_bound)
# Print the count and the corresponding Police_Deployed values
print(paste("Lower bound:", round(lower_bound, 2)))
## [1] "Lower bound: -10"
print(paste("Upper bound:", round(upper_bound, 2)))
## [1] "Upper bound: 30"
print(paste("Total number of IQR outliers:", length(iqr_outlier_indices)))
## [1] "Total number of IQR outliers: 0"
if(length(iqr_outlier_indices) > 0) {
print("Example Police_Deployed values corresponding to IQR outliers:")
print(head(numeric_data[iqr_outlier_indices]))
} else {
print("No IQR outliers found.")
}
## [1] "No IQR outliers found."
# --- Method C: Boxplot Visualization (Individual Events) ---
print("\n--- 3. Boxplot Visualization (Individual Outliers) ---")
## [1] "\n--- 3. Boxplot Visualization (Individual Outliers) ---"
# Create a boxplot for Police_Deployed
boxplot_plot <- ggplot(df, aes(x = factor(0), y = Police_Deployed)) +
geom_boxplot(fill = "skyblue", color = "darkblue") +
labs(
title = "Boxplot of Police Deployed (Outlier Detection)",
x = "",
y = "Police Deployed Count"
) +
theme_minimal() +
scale_x_discrete(breaks = NULL) # remove x-axis ticks for the single boxplot
# Display the plot
print(boxplot_plot)
# --- Method D: Boxplot Detection (Regional Spikes) ---
print("\n--- 4. Boxplot Visualization (Regional Abnormal Spikes) ---")
## [1] "\n--- 4. Boxplot Visualization (Regional Abnormal Spikes) ---"
# To detect regional spikes, we analyze Police_Deployed by City.
# We'll limit to the top 10 cities by number of crimes for a clearer plot.
top_cities <- df %>%
group_by(City) %>%
summarise(Count = n()) %>%
arrange(desc(Count)) %>%
head(10)
df_top_cities <- df %>%
filter(City %in% top_cities$City)
# Create a boxplot of Police_Deployed grouped by the top 10 cities
regional_boxplot_plot <- ggplot(df_top_cities, aes(x = reorder(City, Police_Deployed, FUN = median), y = Police_Deployed, fill = City)) +
geom_boxplot() +
labs(
title = "Police Deployed Outliers by City (Top 10 Cities)",
x = "City (Ordered by Median Police Deployed)",
y = "Police Deployed Count"
) +
theme_minimal() +
theme(axis.text.x = element_text(angle = 45, hjust = 1), legend.position = "none")
# Print the plot object
print(regional_boxplot_plot)
# 1. Load the data
df <- read.csv("crime_dataset_india.csv", stringsAsFactors = TRUE)
# 2. Data Preparation
# Police_Deployed already loads as an integer, so this coercion is a no-op;
# note that as.numeric() on a factor would return level codes, not values.
df$Police_Deployed <- as.numeric(df$Police_Deployed)
# Ensure Crime_Domain is treated as a factor (grouping variable)
df$Crime_Domain <- as.factor(df$Crime_Domain)
# 3. Perform the One-Way ANOVA
# Formula: Response variable (Police_Deployed) ~ Factor variable (Crime_Domain)
anova_model <- aov(Police_Deployed ~ Crime_Domain, data = df)
# 4. Display the ANOVA Summary Table
print("--- ANOVA Summary Table (Testing Mean Police Deployed across Crime Domains) ---")
## [1] "--- ANOVA Summary Table (Testing Mean Police Deployed across Crime Domains) ---"
summary(anova_model)
## Df Sum Sq Mean Sq F value Pr(>F)
## Crime_Domain 3 13 4.3 0.144 0.934
## Residuals 40156 1200681 29.9
# 5. Post-hoc Test (Tukey's Honestly Significant Difference - HSD)
# This test is used to determine which specific pairs of groups (Crime Domains)
# are significantly different if the overall ANOVA test is significant (p < 0.05).
print("\n--- Tukey's HSD Post-hoc Test (Pairwise Comparisons) ---")
## [1] "\n--- Tukey's HSD Post-hoc Test (Pairwise Comparisons) ---"
tukey_hsd <- TukeyHSD(anova_model)
print(tukey_hsd)
## Tukey multiple comparisons of means
## 95% family-wise confidence level
##
## Fit: aov(formula = Police_Deployed ~ Crime_Domain, data = df)
##
## $Crime_Domain
## diff lwr upr p adj
## Other Crime-Fire Accident 0.05015389 -0.1951861 0.2954939 0.9530319
## Traffic Fatality-Fire Accident 0.07863479 -0.3146108 0.4718803 0.9558486
## Violent Crime-Fire Accident 0.02832089 -0.2339655 0.2906073 0.9925507
## Traffic Fatality-Other Crime 0.02848090 -0.3056587 0.3626205 0.9962958
## Violent Crime-Other Crime -0.02183300 -0.1824611 0.1387951 0.9853873
## Violent Crime-Traffic Fatality -0.05031390 -0.3970872 0.2964594 0.9823380
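Consistent with the non-significant ANOVA (p = 0.934) and Tukey comparisons, the group means are nearly identical, which a quick sketch makes visible:
# Mean police deployed per crime domain; near-identical means explain the null result
aggregate(Police_Deployed ~ Crime_Domain, data = df, FUN = mean)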
# Install required package if you haven't already
# install.packages("arules")
library(arules)
# 1. Load the data
df <- read.csv("crime_dataset_india.csv", stringsAsFactors = TRUE)
# 2. Select Relevant Categorical Columns
# We select key categorical features to define our 'transaction' items.
data_for_apriori <- df[, c("Crime_Type", "Weapon", "Crime_Domain", "Victim_Gender", "Case_Closed")]
# 3. Convert DataFrame to Transactions Format
# as(<data.frame>, "transactions") labels every item as "Column=Value",
# so the columns only need to be factors; no manual prefixing is required.
data_for_apriori[] <- lapply(data_for_apriori, as.factor)
# Convert the data frame to the 'transactions' object required by arules
crime_transactions <- as(data_for_apriori, "transactions")
# 4. Run the Apriori Algorithm
# We set minimum support (minSup) and minimum confidence (minConf)
# minSup=0.01 means the itemset must appear in at least 1% of all records.
# minConf=0.6 means 60% of records containing the LHS must also contain the RHS.
apriori_rules <- apriori(
crime_transactions,
parameter = list(supp = 0.01, conf = 0.6, maxlen = 5)
)
## Apriori
##
## Parameter specification:
## confidence minval smax arem aval originalSupport maxtime support minlen
## 0.6 0.1 1 none FALSE TRUE 5 0.01 1
## maxlen target ext
## 5 rules TRUE
##
## Algorithmic control:
## filter tree heap memopt load sort verbose
## 0.1 TRUE TRUE FALSE TRUE 2 TRUE
##
## Absolute minimum support count: 401
##
## set item appearances ...[0 item(s)] done [0.00s].
## set transactions ...[37 item(s), 40160 transaction(s)] done [0.01s].
## sorting and recoding items ... [37 item(s)] done [0.00s].
## creating transaction tree ... done [0.01s].
## checking subsets of size 1 2 3 4 done [0.00s].
## writing ... [157 rule(s)] done [0.00s].
## creating S4 object ... done [0.00s].
# 5. Inspect and Sort the Rules
# Filter out redundant rules and sort by lift for best interpretation.
sorted_rules <- sort(apriori_rules, by = "lift", decreasing = TRUE)
# Remove redundant rules (e.g., shorter rules that are subsets of longer rules)
subset_rules <- is.subset(sorted_rules, sorted_rules, sparse = FALSE)
subset_rules[lower.tri(subset_rules, diag = TRUE)] <- FALSE
clean_rules <- sorted_rules[!apply(subset_rules, 2, any)]
print("--- Top 10 Strongest Association Rules (Sorted by Lift) ---")
## [1] "--- Top 10 Strongest Association Rules (Sorted by Lift) ---"
inspect(head(clean_rules, 10))
## lhs rhs support confidence coverage lift count
## [1] {Crime_Domain=Traffic Fatality} => {Crime_Type=TRAFFIC VIOLATION} 0.04768426 1 0.04768426 20.971279 1915
## [2] {Crime_Type=ARSON} => {Crime_Domain=Fire Accident} 0.04716135 1 0.04716135 10.499346 1894
## [3] {Crime_Type=FIREARM OFFENSE} => {Crime_Domain=Fire Accident} 0.04808267 1 0.04808267 10.499346 1931
## [4] {Crime_Type=VEHICLE - STOLEN} => {Crime_Domain=Violent Crime} 0.04636454 1 0.04636454 3.500697 1862
## [5] {Crime_Type=ASSAULT} => {Crime_Domain=Violent Crime} 0.04768426 1 0.04768426 3.500697 1915
## [6] {Crime_Type=SEXUAL ASSAULT} => {Crime_Domain=Violent Crime} 0.04773406 1 0.04773406 3.500697 1917
## [7] {Crime_Type=IDENTITY THEFT} => {Crime_Domain=Violent Crime} 0.04775896 1 0.04775896 3.500697 1918
## [8] {Crime_Type=ROBBERY} => {Crime_Domain=Violent Crime} 0.04800797 1 0.04800797 3.500697 1928
## [9] {Crime_Type=DOMESTIC VIOLENCE} => {Crime_Domain=Violent Crime} 0.04810757 1 0.04810757 3.500697 1932
## [10] {Crime_Type=SHOPLIFTING} => {Crime_Domain=Other Crime} 0.04628984 1 0.04628984 1.750044 1859
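All of the top rules are definitional: a crime type fully determines its domain, hence confidence 1. To look for more actionable patterns, the rule set can be filtered by consequent; a minimal sketch using arules' subset() with the partial-match operator %pin% (targeting Case_Closed here is an illustrative assumption, not a result from the run above):
# Keep only rules whose right-hand side involves case closure
closure_rules <- subset(clean_rules, subset = rhs %pin% "Case_Closed")
if (length(closure_rules) > 0) {
inspect(head(sort(closure_rules, by = "confidence"), 5))
}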