1 Introduction

This R Markdown report performs a complete analysis of the Crime Dataset (India), including data cleaning, descriptive statistics, exploratory analysis, modeling, clustering, association rule mining, and forecasting.

2 Load & Clean Data

# Packages used throughout this report
library(tidyverse)   # dplyr, ggplot2, %>% pipelines
library(lubridate)   # date-time parsing and extraction
library(janitor)     # clean_names()

file_path <- 'C:/Users/abidm/OneDrive/Desktop/crime_dataset_india.csv'
crime <- read.csv(file_path, stringsAsFactors = FALSE)
crime <- clean_names(crime)   # standardize column names to snake_case


cat("Rows:", nrow(crime), " Columns:", ncol(crime))
## Rows: 40160  Columns: 14
head(crime)
##   report_number    date_reported    date_occurred time_of_occurrence      city
## 1             1 02-01-2020 00:00 01-01-2020 00:00   01-01-2020 01:11 Ahmedabad
## 2             2 01-01-2020 19:00 01-01-2020 01:00   01-01-2020 06:26   Chennai
## 3             3 02-01-2020 05:00 01-01-2020 02:00   01-01-2020 14:30  Ludhiana
## 4             4 01-01-2020 05:00 01-01-2020 03:00   01-01-2020 14:46      Pune
## 5             5 01-01-2020 21:00 01-01-2020 04:00   01-01-2020 16:51      Pune
## 6             6 02-01-2020 03:00 01-01-2020 05:00   01-01-2020 17:09     Delhi
##   crime_code     crime_type victim_age victim_gender       weapon  crime_domain
## 1        576 IDENTITY THEFT         16             M Blunt Object Violent Crime
## 2        128       HOMICIDE         37             M       Poison   Other Crime
## 3        271     KIDNAPPING         48             F Blunt Object   Other Crime
## 4        170       BURGLARY         49             F      Firearm   Other Crime
## 5        421      VANDALISM         30             F        Other   Other Crime
## 6        442        ASSAULT         16             M      Firearm Violent Crime
##   police_deployed case_closed date_case_closed
## 1              13          No                 
## 2               9          No                 
## 3              15          No                 
## 4               1         Yes 29-04-2020 05:00
## 5              18         Yes 08-01-2020 21:00
## 6              18         Yes 30-03-2020 03:00
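Before engineering any features, it is worth confirming which columns contain missing or blank values; a minimal sketch on the cleaned frame:

# Count NAs and empty strings per column (date_case_closed is blank for open cases)
sapply(crime, function(x) sum(is.na(x) | x == ""))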

2.1 Handle Dates & Feature Engineering

# Candidate date columns; 'date_occurred' is the one present in this dataset
date_cols <- intersect(c('date_occurred','date','incident_date','crime_date','reported_date','occurred_on','datetime'), names(crime))


if (length(date_cols) > 0) {
  dcol <- date_cols[1]
  crime <- crime %>% mutate(raw_date = .data[[dcol]])
  # 'dmy HM' matches this dataset's '01-01-2020 00:00' timestamp format
  crime <- crime %>% mutate(parsed_date = parse_date_time(
    raw_date,
    orders = c('dmy HM', 'Y-m-d', 'd/m/Y', 'm/d/Y', 'Y/m/d', 'd-b-Y', 'Ymd HMS', 'Ymd')
  ))
  crime <- crime %>% mutate(
    year  = year(parsed_date),
    month = month(parsed_date),
    day   = day(parsed_date),
    wday  = wday(parsed_date, label = TRUE),
    hour  = hour(parsed_date)
  )
}
# Time-of-day category derived from the hour of occurrence
if ("hour" %in% names(crime)) {
  crime <- crime %>% mutate(time_of_day = case_when(
    hour >= 5  & hour < 12 ~ "Morning",
    hour >= 12 & hour < 17 ~ "Afternoon",
    hour >= 17 & hour < 21 ~ "Evening",
    TRUE ~ "Night"
  ))
}
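# Sanity check (a sketch): how many timestamps failed to parse, assuming
# parsed_date was created in the block above
if ("parsed_date" %in% names(crime)) {
  cat("Unparsed dates:", sum(is.na(crime$parsed_date)), "of", nrow(crime), "\n")
}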


head(crime)

3 Descriptive Statistics

num_cols <- crime %>% select(where(is.numeric)) %>% names()
cat("Numeric Columns:\n")
## Numeric Columns:
print(num_cols)
## [1] "report_number"   "crime_code"      "victim_age"      "police_deployed"
# Exploratory Data Analysis
ggplot(crime, aes(x = crime_type)) +
  geom_bar(fill="lightgreen") +
  labs(title="Crime Count by Type") +
  theme(axis.text.x = element_text(angle = 45, hjust = 1))

## Crime Hotspots (Location-Based Analysis)

crime %>%
  group_by(city) %>%
  summarise(total = n()) %>%
  ggplot(aes(x = reorder(city, total), y = total)) +
  geom_bar(stat="identity", fill="orange") +
  coord_flip() +
  labs(title="Crime Count by City")

## City-wise Crime Trend (Top 10 Cities)

top_cities <- crime %>% count(city, sort=TRUE) %>% slice(1:10) %>% pull(city)

crime %>%
  filter(city %in% top_cities) %>%
  ggplot(aes(x = city)) +
  geom_bar(fill="darkorange") +
  labs(title="Top 10 Cities with Highest Crime Frequency") +
  theme(axis.text.x = element_text(angle=45, hjust=1))

## Top 10 Crime Types

crime %>%
  count(crime_type, sort=TRUE) %>%
  slice(1:10) %>%
  ggplot(aes(x = reorder(crime_type, n), y = n)) +
  geom_bar(stat="identity", fill="skyblue") +
  coord_flip() +
  labs(title="Top 10 Crime Types", x="Crime Type", y="Count")

3.1 Pie Chart - Crime Proportions

crime %>%
  count(crime_type) %>%                           # count each crime type
  mutate(prop = n / sum(n)) %>%                   # calculate proportions
  ggplot(aes(x = "", y = prop, fill = crime_type)) +
  geom_col(width = 1) +                           # bar to be converted to pie
  coord_polar(theta = "y") +                      # convert to pie
  labs(title = "Crime Proportions by Type",
       fill = "Crime Type") +
  theme_void()                                    # remove axes
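With 21 crime types, pie slices are hard to compare by eye; the same proportions read more clearly as an ordered horizontal bar chart. A sketch of that alternative:

crime %>%
  count(crime_type) %>%
  mutate(prop = n / sum(n)) %>%
  ggplot(aes(x = reorder(crime_type, prop), y = prop)) +   # order bars by proportion
  geom_col(fill = "steelblue") +
  coord_flip() +
  labs(title = "Crime Proportions by Type", x = "Crime Type", y = "Proportion")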

4 Visualization

ggplot(crime, aes(x = crime_type)) +
  geom_bar(fill = "steelblue") +
  labs(
    title = "Count of Crimes by Type",
    x = "Crime Type",
    y = "Count"
  ) +
  theme_minimal() +
  theme(axis.text.x = element_text(angle = 45, hjust = 1))

ggplot(crime, aes(x = victim_gender)) +
  geom_bar(fill = "tomato") +
  labs(
    title = "Count of Victim Gender",
    x = "Gender",
    y = "Count"
  ) +
  theme_minimal()
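The two charts above can also be combined: faceting the crime-type counts by victim gender shows whether the mix of crime types differs across genders. A sketch:

# One panel of crime-type counts per victim gender
ggplot(crime, aes(x = crime_type)) +
  geom_bar(fill = "steelblue") +
  facet_wrap(~ victim_gender) +
  labs(title = "Crime Type by Victim Gender", x = "Crime Type", y = "Count") +
  theme_minimal() +
  theme(axis.text.x = element_text(angle = 45, hjust = 1))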

5 Regression Analysis

library(tidyverse)
df <- read.csv("crime_dataset_india.csv", stringsAsFactors = TRUE)
df <- janitor::clean_names(df)
str(df)
## 'data.frame':    40160 obs. of  14 variables:
##  $ report_number     : int  1 2 3 4 5 6 7 8 9 10 ...
##  $ date_reported     : Factor w/ 25546 levels "01-01-2020 05:00",..: 837 3 839 1 4 838 2 842 2518 1671 ...
##  $ date_occurred     : Factor w/ 40160 levels "01-01-2020 00:00",..: 1 2 3 4 5 6 7 8 9 10 ...
##  $ time_of_occurrence: Factor w/ 39886 levels "01-01-2020 01:11",..: 1 2 4 5 6 7 3 1306 1307 9 ...
##  $ city              : Factor w/ 29 levels "Agra","Ahmedabad",..: 2 5 16 22 22 6 5 5 18 5 ...
##  $ crime_code        : int  576 128 271 170 421 442 172 169 338 497 ...
##  $ crime_type        : Factor w/ 21 levels "ARSON","ASSAULT",..: 12 11 14 3 20 2 21 4 8 15 ...
##  $ victim_age        : int  16 37 48 49 30 16 64 78 41 29 ...
##  $ victim_gender     : Factor w/ 3 levels "F","M","X": 2 2 1 1 1 2 1 3 3 2 ...
##  $ weapon            : Factor w/ 7 levels "Blunt Object",..: 1 7 1 3 6 3 4 4 1 4 ...
##  $ crime_domain      : Factor w/ 4 levels "Fire Accident",..: 4 2 2 2 2 4 4 2 2 2 ...
##  $ police_deployed   : int  13 9 15 1 18 18 13 8 1 4 ...
##  $ case_closed       : Factor w/ 2 levels "No","Yes": 1 1 1 2 2 2 2 1 1 1 ...
##  $ date_case_closed  : Factor w/ 16012 levels "","01-01-2021 00:00",..: 1 1 1 14810 3647 15249 12152 1 1 1 ...
# Select useful predictor variables
reg_data <- df %>%
  select(police_deployed, victim_age, victim_gender, crime_domain)

# Remove rows with missing values
reg_data <- reg_data %>% na.omit()

# Convert categorical variables to factors
reg_data$victim_gender <- as.factor(reg_data$victim_gender)
reg_data$crime_domain  <- as.factor(reg_data$crime_domain)

# ------------------------------
# Build Linear Regression Model
# ------------------------------

model <- lm(police_deployed ~ victim_age + victim_gender + crime_domain,
            data = reg_data)

summary(model)
## 
## Call:
## lm(formula = police_deployed ~ victim_age + victim_gender + crime_domain, 
##     data = reg_data)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -9.1102 -4.9760 -0.0073  4.9657  9.0967 
## 
## Coefficients:
##                                Estimate Std. Error t value Pr(>|t|)    
## (Intercept)                   9.9419455  0.1093082  90.953   <2e-16 ***
## victim_age                    0.0008555  0.0013494   0.634    0.526    
## victim_genderM               -0.0488855  0.0597065  -0.819    0.413    
## victim_genderX                0.0220883  0.0907735   0.243    0.808    
## crime_domainOther Crime       0.0500698  0.0955031   0.524    0.600    
## crime_domainTraffic Fatality  0.0785998  0.1530763   0.513    0.608    
## crime_domainViolent Crime     0.0273697  0.1021017   0.268    0.789    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 5.468 on 40153 degrees of freedom
## Multiple R-squared:  4.278e-05,  Adjusted R-squared:  -0.0001066 
## F-statistic: 0.2863 on 6 and 40153 DF,  p-value: 0.9437

5.1 Note:

A linear regression model was fitted to predict the number of police deployed from victim age, victim gender, and crime domain. None of the coefficients is statistically significant, and the R-squared is essentially zero (overall F-test p = 0.94), so these victim- and crime-level factors show no detectable linear relationship with police allocation in this dataset.
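Given the near-zero R-squared, residual diagnostics are a sensible check before interpreting any coefficient; base R's plot() method on an lm object draws the standard four panels:

par(mfrow = c(2, 2))   # 2x2 grid for the four diagnostic panels
plot(model)            # residuals vs fitted, Q-Q, scale-location, leverage
par(mfrow = c(1, 1))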

6 KNN Regression in R (Predicting Police Deployed)

# --- K-Nearest Neighbors (KNN) Regression ---

# Load essential libraries
library(caret)
library(dplyr)
library(janitor)

# 1. Load and Prepare Data
df <- read.csv("crime_dataset_india.csv", stringsAsFactors = TRUE)
df <- clean_names(df)

# Create the full dataset with only the required columns
# Target variable is renamed to PoliceDeployed for convenience
modeling_data <- df %>%
  select(PoliceDeployed = police_deployed, victim_age, victim_gender, crime_domain)

# 2. Data Splitting (70% Train, 30% Test)
# Splitting the raw data frame (caret handles scaling/encoding)
set.seed(42)
train_index <- createDataPartition(modeling_data$PoliceDeployed, p = 0.7, list = FALSE)

train_data <- modeling_data[train_index, ]
test_data <- modeling_data[-train_index, ]

# 3. Train KNN Regression Model
# 'caret' automatically creates dummy variables for factors when using the
# formula interface, so preProcess only needs 'center' and 'scale' below.
knn_model <- train(
  PoliceDeployed ~ victim_age + victim_gender + crime_domain,
  data = train_data,
  method = "knn",
  tuneGrid = data.frame(k = 5),
  trControl = trainControl(method = "none"),
  # Only specify 'center' and 'scale' for normalization (essential for KNN)
  preProcess = c("center", "scale") 
)

# 4. Evaluate the Model
# Pass the raw test data frame (not the manually scaled matrix) to predict
predictions <- predict(knn_model, test_data)

# Extract the true test values
y_test <- test_data$PoliceDeployed

# Calculate Root Mean Squared Error (RMSE)
rmse <- sqrt(mean((y_test - predictions)^2))

# Print the final evaluation metric
print(paste("KNN Regression (k=5) RMSE:", round(rmse, 4)))
## [1] "KNN Regression (k=5) RMSE: 5.536"

7 Outlier Detection in R

# Install required packages if you haven't already
# install.packages(c("ggplot2", "dplyr")) 

library(ggplot2)
library(dplyr)

# 1. Load the data
df <- read.csv("crime_dataset_india.csv", stringsAsFactors = TRUE)

# Select the numeric variable for analysis
# We will use 'Police_Deployed'
numeric_data <- df$Police_Deployed

# --- Method A: Z-score Detection ---
print("--- 1. Outlier Detection using Z-score ---")
## [1] "--- 1. Outlier Detection using Z-score ---"
# Calculate Z-scores for Police_Deployed
z_scores <- as.vector(scale(numeric_data))

# Identify indices of outliers (absolute Z-score > 3)
z_outlier_indices <- which(abs(z_scores) > 3)

# Print the count and the corresponding Police_Deployed values
print(paste("Total number of Z-score outliers (> 3 SD):", length(z_outlier_indices)))
## [1] "Total number of Z-score outliers (> 3 SD): 0"
if(length(z_outlier_indices) > 0) {
    print("Example Police_Deployed values corresponding to Z-score outliers:")
    print(head(numeric_data[z_outlier_indices]))
} else {
    print("No Z-score outliers found (i.e., no values are more than 3 standard deviations from the mean).")
}
## [1] "No Z-score outliers found (i.e., no values are more than 3 standard deviations from the mean)."
# --- Method B: IQR Detection (Tukey's Method) ---
print("\n--- 2. Outlier Detection using IQR (1.5 * IQR Rule) ---")
## [1] "\n--- 2. Outlier Detection using IQR (1.5 * IQR Rule) ---"
# Calculate the quartiles and IQR
Q1 <- quantile(numeric_data, 0.25)
Q3 <- quantile(numeric_data, 0.75)
IQR_val <- Q3 - Q1

# Define the boundaries for non-outlier data
lower_bound <- Q1 - 1.5 * IQR_val
upper_bound <- Q3 + 1.5 * IQR_val

# Identify indices of outliers
iqr_outlier_indices <- which(numeric_data < lower_bound | numeric_data > upper_bound)

# Print the count and the corresponding Police_Deployed values
print(paste("Lower bound:", round(lower_bound, 2)))
## [1] "Lower bound: -10"
print(paste("Upper bound:", round(upper_bound, 2)))
## [1] "Upper bound: 30"
print(paste("Total number of IQR outliers:", length(iqr_outlier_indices)))
## [1] "Total number of IQR outliers: 0"
if(length(iqr_outlier_indices) > 0) {
    print("Example Police_Deployed values corresponding to IQR outliers:")
    print(head(numeric_data[iqr_outlier_indices]))
} else {
    print("No IQR outliers found.")
}
## [1] "No IQR outliers found."
# --- Method C: Boxplot Visualization (Individual Events) ---
print("\n--- 3. Boxplot Visualization (Individual Outliers) ---")
## [1] "\n--- 3. Boxplot Visualization (Individual Outliers) ---"
# Create a boxplot for Police_Deployed
boxplot_plot <- ggplot(df, aes(x = factor(0), y = Police_Deployed)) +
  geom_boxplot(fill = "skyblue", color = "darkblue") +
  labs(
    title = "Boxplot of Police Deployed (Outlier Detection)",
    x = "",
    y = "Police Deployed Count"
  ) +
  theme_minimal() +
  scale_x_discrete(breaks = NULL) # Remove x-axis label for a single boxplot

# Print the plot object (assuming R environment can display it)
print(boxplot_plot)

# --- Method D: Boxplot Detection (Regional Spikes) ---
print("\n--- 4. Boxplot Visualization (Regional Abnormal Spikes) ---")
## [1] "\n--- 4. Boxplot Visualization (Regional Abnormal Spikes) ---"
# To detect regional spikes, we analyze Police_Deployed by City.
# We'll limit to the top 10 cities by number of crimes for a clearer plot.
top_cities <- df %>%
    group_by(City) %>%
    summarise(Count = n()) %>%
    arrange(desc(Count)) %>%
    head(10)

df_top_cities <- df %>%
    filter(City %in% top_cities$City)

# Create a boxplot of Police_Deployed grouped by the top 10 cities
regional_boxplot_plot <- ggplot(df_top_cities, aes(x = reorder(City, Police_Deployed, FUN = median), y = Police_Deployed, fill = City)) +
  geom_boxplot() +
  labs(
    title = "Police Deployed Outliers by City (Top 10 Cities)",
    x = "City (Ordered by Median Police Deployed)",
    y = "Police Deployed Count"
  ) +
  theme_minimal() +
  theme(axis.text.x = element_text(angle = 45, hjust = 1), legend.position = "none")

# Print the plot object
print(regional_boxplot_plot)

8 One-Way ANOVA in R

# 1. Load the data
df <- read.csv("crime_dataset_india.csv", stringsAsFactors = TRUE)

# 2. Data Preparation
# Convert Police_Deployed to numeric (in case it wasn't read correctly)
df$Police_Deployed <- as.numeric(df$Police_Deployed)
# Ensure Crime_Domain is treated as a factor (grouping variable)
df$Crime_Domain <- as.factor(df$Crime_Domain)

# 3. Perform the One-Way ANOVA
# Formula: Response variable (Police_Deployed) ~ Factor variable (Crime_Domain)
anova_model <- aov(Police_Deployed ~ Crime_Domain, data = df)

# 4. Display the ANOVA Summary Table
print("--- ANOVA Summary Table (Testing Mean Police Deployed across Crime Domains) ---")
## [1] "--- ANOVA Summary Table (Testing Mean Police Deployed across Crime Domains) ---"
summary(anova_model)
##                 Df  Sum Sq Mean Sq F value Pr(>F)
## Crime_Domain     3      13     4.3   0.144  0.934
## Residuals    40156 1200681    29.9
# 5. Post-hoc Test (Tukey's Honestly Significant Difference - HSD)
# This test is used to determine which specific pairs of groups (Crime Domains) 
# are significantly different if the overall ANOVA test is significant (p < 0.05).

print("\n--- Tukey's HSD Post-hoc Test (Pairwise Comparisons) ---")
## [1] "\n--- Tukey's HSD Post-hoc Test (Pairwise Comparisons) ---"
tukey_hsd <- TukeyHSD(anova_model)
print(tukey_hsd)
##   Tukey multiple comparisons of means
##     95% family-wise confidence level
## 
## Fit: aov(formula = Police_Deployed ~ Crime_Domain, data = df)
## 
## $Crime_Domain
##                                       diff        lwr       upr     p adj
## Other Crime-Fire Accident       0.05015389 -0.1951861 0.2954939 0.9530319
## Traffic Fatality-Fire Accident  0.07863479 -0.3146108 0.4718803 0.9558486
## Violent Crime-Fire Accident     0.02832089 -0.2339655 0.2906073 0.9925507
## Traffic Fatality-Other Crime    0.02848090 -0.3056587 0.3626205 0.9962958
## Violent Crime-Other Crime      -0.02183300 -0.1824611 0.1387951 0.9853873
## Violent Crime-Traffic Fatality -0.05031390 -0.3970872 0.2964594 0.9823380
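Consistent with the non-significant F-test (p = 0.934), none of the pairwise comparisons is significant. As a quick assumption check, a sketch using base R only: bartlett.test() tests homogeneity of variance across domains, and the residuals-vs-fitted plot gives a visual check:

# Equal-variance assumption across crime domains
bartlett.test(Police_Deployed ~ Crime_Domain, data = df)

# Visual check: residuals vs fitted values from the ANOVA model
plot(anova_model, which = 1)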

9 Apriori Association Rule Mining in R

# Install required package if you haven't already
# install.packages("arules") 

library(arules)

# 1. Load the data
df <- read.csv("crime_dataset_india.csv", stringsAsFactors = TRUE)

# 2. Select Relevant Categorical Columns
# We select key categorical features to define our 'transaction' items.
data_for_apriori <- df[, c("Crime_Type", "Weapon", "Crime_Domain", "Victim_Gender", "Case_Closed")]

# 3. Convert DataFrame to Transactions Format
# When every column is a factor, arules labels each item as "column=level"
# (e.g., "Weapon=Firearm"), so no manual prefixing of level values is needed.
# Inspection showed no missing values in these columns.
data_for_apriori[] <- lapply(data_for_apriori, as.factor)

# Convert the data frame to the 'transactions' object required by arules
crime_transactions <- as(data_for_apriori, "transactions")
# 4. Run the Apriori Algorithm
# We set minimum support (minSup) and minimum confidence (minConf)
# minSup=0.01 means the itemset must appear in at least 1% of all records.
# minConf=0.6 means 60% of records containing the LHS must also contain the RHS.

apriori_rules <- apriori(
  crime_transactions, 
  parameter = list(supp = 0.01, conf = 0.6, maxlen = 5)
)
## Apriori
## 
## Parameter specification:
##  confidence minval smax arem  aval originalSupport maxtime support minlen
##         0.6    0.1    1 none FALSE            TRUE       5    0.01      1
##  maxlen target  ext
##       5  rules TRUE
## 
## Algorithmic control:
##  filter tree heap memopt load sort verbose
##     0.1 TRUE TRUE  FALSE TRUE    2    TRUE
## 
## Absolute minimum support count: 401 
## 
## set item appearances ...[0 item(s)] done [0.00s].
## set transactions ...[37 item(s), 40160 transaction(s)] done [0.01s].
## sorting and recoding items ... [37 item(s)] done [0.00s].
## creating transaction tree ... done [0.01s].
## checking subsets of size 1 2 3 4 done [0.00s].
## writing ... [157 rule(s)] done [0.00s].
## creating S4 object  ... done [0.00s].
# 5. Inspect and Sort the Rules
# Filter out redundant rules and sort by lift for best interpretation.
sorted_rules <- sort(apriori_rules, by = "lift", decreasing = TRUE)

# Remove redundant rules (e.g., shorter rules that are subsets of longer rules)
subset_rules <- is.subset(sorted_rules, sorted_rules, sparse = FALSE)
subset_rules[lower.tri(subset_rules, diag = TRUE)] <- FALSE
clean_rules <- sorted_rules[!apply(subset_rules, 2, any)]


print("--- Top 10 Strongest Association Rules (Sorted by Lift) ---")
## [1] "--- Top 10 Strongest Association Rules (Sorted by Lift) ---"
inspect(head(clean_rules, 10))
##      lhs                                rhs                               support confidence   coverage      lift count
## [1]  {Crime_Domain=Traffic Fatality} => {Crime_Type=TRAFFIC VIOLATION} 0.04768426          1 0.04768426 20.971279  1915
## [2]  {Crime_Type=ARSON}              => {Crime_Domain=Fire Accident}   0.04716135          1 0.04716135 10.499346  1894
## [3]  {Crime_Type=FIREARM OFFENSE}    => {Crime_Domain=Fire Accident}   0.04808267          1 0.04808267 10.499346  1931
## [4]  {Crime_Type=VEHICLE - STOLEN}   => {Crime_Domain=Violent Crime}   0.04636454          1 0.04636454  3.500697  1862
## [5]  {Crime_Type=ASSAULT}            => {Crime_Domain=Violent Crime}   0.04768426          1 0.04768426  3.500697  1915
## [6]  {Crime_Type=SEXUAL ASSAULT}     => {Crime_Domain=Violent Crime}   0.04773406          1 0.04773406  3.500697  1917
## [7]  {Crime_Type=IDENTITY THEFT}     => {Crime_Domain=Violent Crime}   0.04775896          1 0.04775896  3.500697  1918
## [8]  {Crime_Type=ROBBERY}            => {Crime_Domain=Violent Crime}   0.04800797          1 0.04800797  3.500697  1928
## [9]  {Crime_Type=DOMESTIC VIOLENCE}  => {Crime_Domain=Violent Crime}   0.04810757          1 0.04810757  3.500697  1932
## [10] {Crime_Type=SHOPLIFTING}        => {Crime_Domain=Other Crime}     0.04628984          1 0.04628984  1.750044  1859
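Most of the top rules map a crime type to its domain, which is close to definitional. To look for less tautological structure, the search can be restricted so that only the case outcome appears on the right-hand side; a sketch using arules' appearance argument (item labels assume the column=level encoding produced above):

# Mine only rules that predict whether a case was closed
closed_rules <- apriori(
  crime_transactions,
  parameter  = list(supp = 0.01, conf = 0.5, maxlen = 4),
  appearance = list(rhs = c("Case_Closed=Yes", "Case_Closed=No"),
                    default = "lhs")   # anything else may appear on the LHS
)
inspect(head(sort(closed_rules, by = "lift"), 5))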