# install.packages(c("dplyr", "ggplot2", "readr", "lubridate"))
# install.packages("plotly")
library(plotly)
## Warning: package 'plotly' was built under R version 4.5.3
## Loading required package: ggplot2
## Warning: package 'ggplot2' was built under R version 4.5.3
##
## Attaching package: 'plotly'
## The following object is masked from 'package:ggplot2':
##
## last_plot
## The following object is masked from 'package:stats':
##
## filter
## The following object is masked from 'package:graphics':
##
## layout
library(readr)
## Warning: package 'readr' was built under R version 4.5.3
library(dplyr)
## Warning: package 'dplyr' was built under R version 4.5.3
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(ggplot2)
library(tidyr)
## Warning: package 'tidyr' was built under R version 4.5.3
library(lubridate)
## Warning: package 'lubridate' was built under R version 4.5.3
##
## Attaching package: 'lubridate'
## The following objects are masked from 'package:base':
##
## date, intersect, setdiff, union
# Load the raw crime records into a tibble.
# NOTE(review): the path is absolute and specific to one machine.
Crime <- readr::read_csv(
  file = "C:/Users/NOITIK BHATTACHARYA/OneDrive/Documents/R Project/crime.csv"
)
## Rows: 15001 Columns: 19
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (15): Month, State, City, District, Crime_Type, Sub_Crime_Type, Victim_G...
## dbl (4): Year, Case_ID, Victim_Age, Crime_Score
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
# Open the spreadsheet viewer only in an interactive session; View() has no
# display to attach to when the script is knitted or run non-interactively.
if (interactive()) {
  View(Crime)
}
# Column names of the loaded data.
colnames(Crime)
## [1] "Year" "Case_ID" "Month" "State"
## [5] "City" "District" "Crime_Type" "Sub_Crime_Type"
## [9] "Victim_Gender" "Victim_Age" "Accused_Gender" "Weapon_Used"
## [13] "Relationship" "Location_Type" "Arrest_Made" "Case_Status"
## [17] "Crime_Severity" "Source" "Crime_Score"
# Compact structural summary: type and first values of every column.
str(Crime)
## spc_tbl_ [15,001 × 19] (S3: spec_tbl_df/tbl_df/tbl/data.frame)
## $ Year : num [1:15001] 2026 2026 2025 2025 2025 ...
## $ Case_ID : num [1:15001] 6123 9210 8014 7795 11534 ...
## $ Month : chr [1:15001] "March" "February" "July" "November" ...
## $ State : chr [1:15001] "Delhi" "Rajasthan" "Delhi" "Chhattisgarh" ...
## $ City : chr [1:15001] "New Delhi" "Sriganganagar" "Delhi" "Raipur" ...
## $ District : chr [1:15001] "West Delhi" "Sriganganagar" "Nihal Vihar" "Raipur" ...
## $ Crime_Type : chr [1:15001] "Assault" "Murder" "Murder" "Murder" ...
## $ Sub_Crime_Type: chr [1:15001] "Physical Assault" "Contract Killing" "Domestic Murder" "Domestic Murder" ...
## $ Victim_Gender : chr [1:15001] "Male" "Male" "Male" "Male" ...
## $ Victim_Age : num [1:15001] 29 25 35 40 37 23 50 36 61 17 ...
## $ Accused_Gender: chr [1:15001] "Male" "Female" "Female" "Female" ...
## $ Weapon_Used : chr [1:15001] "Iron Rod" "Poison" "Knife" "Hammer" ...
## $ Relationship : chr [1:15001] "Neighbor" "Spouse" "Spouse" "Spouse" ...
## $ Location_Type : chr [1:15001] "Urban" "Urban" "Urban" "Urban" ...
## $ Arrest_Made : chr [1:15001] "Yes" "Yes" "Yes" "Yes" ...
## $ Case_Status : chr [1:15001] "Under Investigation" "Under Investigation" "Under Investigation" "Closed" ...
## $ Crime_Severity: chr [1:15001] "High" "High" "High" "High" ...
## $ Source : chr [1:15001] "News" "News" "News" "News" ...
## $ Crime_Score : num [1:15001] 7.58 2.79 8.35 8.39 2.48 ...
## - attr(*, "spec")=
## .. cols(
## .. Year = col_double(),
## .. Case_ID = col_double(),
## .. Month = col_character(),
## .. State = col_character(),
## .. City = col_character(),
## .. District = col_character(),
## .. Crime_Type = col_character(),
## .. Sub_Crime_Type = col_character(),
## .. Victim_Gender = col_character(),
## .. Victim_Age = col_double(),
## .. Accused_Gender = col_character(),
## .. Weapon_Used = col_character(),
## .. Relationship = col_character(),
## .. Location_Type = col_character(),
## .. Arrest_Made = col_character(),
## .. Case_Status = col_character(),
## .. Crime_Severity = col_character(),
## .. Source = col_character(),
## .. Crime_Score = col_double()
## .. )
## - attr(*, "problems")=<externalptr>
Interpretation: The dataset contains 15,001 rows, each representing an individual crime record, and 19 columns such as State, District, Crime_Type, Victim_Age, Crime_Severity, and Case_Status. The structure shows a mix of data types: 4 numeric columns (Year, Case_ID, Victim_Age, Crime_Score) and 15 character columns (e.g., Crime_Type, State). This size and diversity make the dataset suitable for both statistical and categorical analysis. Understanding the structure helps in selecting appropriate methods for analysis and ensures proper handling of variables. It also provides a foundation for further data cleaning and transformation.
# Number of missing (NA) values in each column.
Crime |>
  is.na() |>
  colSums()
## Year Case_ID Month State City
## 0 0 0 0 0
## District Crime_Type Sub_Crime_Type Victim_Gender Victim_Age
## 0 0 0 0 0
## Accused_Gender Weapon_Used Relationship Location_Type Arrest_Made
## 0 2457 0 0 0
## Case_Status Crime_Severity Source Crime_Score
## 0 0 0 0
Interpretation: Only one column contains missing values: Weapon_Used has 2,457 NAs (about 16% of the 15,001 records); every other column, including Relationship and Accused_Gender, is complete. Note that is.na() detects only true NA values, so blanks or placeholder text such as “Unknown” would not be counted here. Missing values can affect analysis results, especially during grouping and summarization. Identifying them helps decide whether to remove or replace them. Handling missing values properly ensures accurate and reliable results.
# --- Data cleaning ----

# 1. Drop records in which half or more of the fields are missing.
na_per_row <- rowSums(is.na(Crime))
Crime <- Crime[na_per_row < ncol(Crime) * 0.5, ]

# 2. Drop records that lack a key identifying field.
#    Victim_Age is deliberately NOT filtered here: filtering it made the
#    median imputation in step 3 unreachable.
Crime <- Crime %>%
  filter(
    !is.na(Year),
    !is.na(State),
    !is.na(Crime_Type)
  )

# 3. Fill missing ages with the median, which is robust to extreme values.
Crime$Victim_Age[is.na(Crime$Victim_Age)] <-
  median(Crime$Victim_Age, na.rm = TRUE)

# 4. Label missing categorical values as "Unknown" so that they form their
#    own category in later groupings instead of being dropped.
unknown_cols <- c(
  "City", "District", "Sub_Crime_Type", "Accused_Gender", "Weapon_Used",
  "Relationship", "Location_Type", "Case_Status", "Source"
)
Crime <- Crime %>%
  mutate(across(all_of(unknown_cols), ~ replace_na(.x, "Unknown")))
Interpretation: The dataset was cleaned by removing rows with too many missing values. Records missing important fields like Year, State, and Crime_Type were also removed. Missing categorical values were replaced with “Unknown” to keep the data consistent. For numerical values like Victim_Age, missing values were filled using the median to avoid affecting results. This ensures the dataset is clean and ready for accurate analysis.
# Mean Crime_Score for each crime type (missing scores ignored).
Crime |>
  group_by(Crime_Type) |>
  summarise(avg_score = mean(Crime_Score, na.rm = TRUE))
## # A tibble: 7 × 2
## Crime_Type avg_score
## <chr> <dbl>
## 1 Assault 6.02
## 2 Cybercrime 6.04
## 3 Fraud 6.01
## 4 Kidnapping 5.93
## 5 Murder 6.05
## 6 Robbery 5.91
## 7 Theft 6.04
Interpretation: The average Crime_Score is almost identical across crime types, ranging only from 5.91 (Robbery) to 6.05 (Murder). Such small differences do not indicate that any crime type is markedly more severe than another on this score. The table still allows an easy comparison of crime seriousness across categories, but a formal statistical test would be needed before concluding that any crime type needs more attention from law enforcement.
library(dplyr)
# Number of cases for every State / District / Crime_Type / Crime_Severity
# combination, returned as an ungrouped tibble with a `Count` column.
result <- count(
  Crime,
  State, District, Crime_Type, Crime_Severity,
  name = "Count"
)
result
## # A tibble: 212 × 5
## State District Crime_Type Crime_Severity Count
## <chr> <chr> <chr> <chr> <int>
## 1 Bihar Patna Assault High 49
## 2 Bihar Patna Assault Low 66
## 3 Bihar Patna Assault Medium 58
## 4 Bihar Patna Cybercrime High 68
## 5 Bihar Patna Cybercrime Low 75
## 6 Bihar Patna Cybercrime Medium 59
## 7 Bihar Patna Fraud High 72
## 8 Bihar Patna Fraud Low 75
## 9 Bihar Patna Fraud Medium 69
## 10 Bihar Patna Kidnapping High 89
## # ℹ 202 more rows
Interpretation: Crime severity is different across states, districts, and crime types. Some areas show more high-severity crimes, making them potential hotspots. This shows regional differences in crime patterns. The analysis helps identify areas that need more attention and supports better decision-making and resource allocation.
# Convert the categorical text columns to factors in one pass.
# Crime_Severity is not part of this list and stays a character column.
Crime <- Crime %>%
  mutate(
    across(
      c(
        Month, State, City, District, Crime_Type, Sub_Crime_Type,
        Victim_Gender, Accused_Gender, Weapon_Used, Relationship,
        Location_Type, Arrest_Made, Case_Status, Source
      ),
      as.factor
    )
  )
Interpretation: Converting categorical variables into factors helps R understand that the data is in categories. It ensures correct analysis, better visualization, and avoids errors. Factors also improve performance and make results more accurate and easy to interpret.
# Count, per state, the high-severity cases that have not been closed.
# The Case_Status values in this data are "Closed", "Open" and
# "Under Investigation", so "Open" must be included; "Pending" is kept for
# data files that use that label. %in% also treats a missing status as
# "no match" rather than propagating NA into the subset.
# Re-run the chunk to refresh the printed counts after this change.
unresolved_status <- c("Open", "Pending", "Under Investigation")
is_unresolved_high <- Crime$Crime_Severity == "High" &
  Crime$Case_Status %in% unresolved_status
result <- table(Crime$State[is_unresolved_high])
print(result)
##
## Bihar Chhattisgarh Delhi Karnataka Maharashtra
## 189 177 172 152 162
## Punjab Rajasthan Tamil Nadu Telangana Uttar Pradesh
## 164 138 177 160 167
Interpretation: The filtered data shows that many high-severity crimes are still unresolved, especially in some states. This suggests delays in the justice system or lack of resources. These cases are concerning because they involve serious crimes. Identifying such states helps highlight areas where legal processes need improvement.
# Crime-type frequencies among victims younger than 18.
is_minor <- Crime$Victim_Age < 18
minor_data <- Crime[is_minor, ]
result <- table(minor_data[["Crime_Type"]])
print(result)
##
## Assault Cybercrime Fraud Kidnapping Murder Robbery Theft
## 104 98 105 111 123 134 102
Interpretation: The analysis shows that minors (below 18) are affected by different types of crimes, with some crimes occurring more frequently. This indicates that younger individuals are more vulnerable to certain risks. It highlights the need for better protection, awareness, and stricter laws to ensure their safety.
# Victims outside the 18-60 age range: minors and people older than 60.
vulnerable_data <- Crime[
  !(Crime$Victim_Age >= 18 & Crime$Victim_Age <= 60),
]
print(vulnerable_data)
## # A tibble: 3,460 × 19
## Year Case_ID Month State City District Crime_Type Sub_Crime_Type
## <dbl> <dbl> <fct> <fct> <fct> <fct> <fct> <fct>
## 1 2026 10422 May Punjab Ludhi… Ludhiana Robbery Street Robbery
## 2 2026 911 April Tamil Nadu Chenn… Chennai Assault Domestic Viol…
## 3 2022 4334 November Rajasthan Sriga… Srigang… Assault Physical Assa…
## 4 2024 10405 March Telangana Hyder… Hyderab… Murder Domestic Murd…
## 5 2026 5546 April Chhattisgarh Raipur Raipur Murder Contract Kill…
## 6 2026 271 June Punjab Ludhi… Ludhiana Fraud Insurance Fra…
## 7 2025 4951 October Chhattisgarh Raipur Raipur Kidnapping Ransom Kidnap…
## 8 2025 1363 July Karnataka Banga… Bangalo… Murder Domestic Murd…
## 9 2023 3191 November Chhattisgarh Raipur Raipur Murder Contract Kill…
## 10 2026 7662 May Bihar Patna Patna Cybercrime Phishing
## # ℹ 3,450 more rows
## # ℹ 11 more variables: Victim_Gender <fct>, Victim_Age <dbl>,
## # Accused_Gender <fct>, Weapon_Used <fct>, Relationship <fct>,
## # Location_Type <fct>, Arrest_Made <fct>, Case_Status <fct>,
## # Crime_Severity <chr>, Source <fct>, Crime_Score <dbl>
# Case-status frequencies for the vulnerable (under 18 or over 60) victims.
result <- table(vulnerable_data[["Case_Status"]])
print(result)
##
## Closed Open Under Investigation
## 1183 1112 1165
Interpretation: The filtered data shows patterns where both minors and elderly victims are involved in specific crime types and severity levels. For example, vulnerable groups may experience higher proportions of certain crimes and sometimes higher severity. The case status distribution may also show delays in resolving such cases. This indicates that vulnerable populations face unique risks and challenges. The findings highlight the need for targeted safety policies and faster legal response. It also emphasizes the importance of protecting both young and elderly individuals.
# Cases per year (base R), with each year's total placed beside the total of
# the preceding year in the table.
yearly_data <- aggregate(
  list(Total_Crimes = Crime$Crime_Type),
  by = list(Year = Crime$Year),
  FUN = length
)
yearly_data <- yearly_data[order(yearly_data$Year), ]
yearly_data$Previous_Year_Crimes <- c(
  NA,
  yearly_data$Total_Crimes[-nrow(yearly_data)]
)

# Keep only the years whose total exceeds the previous year's total; the
# first year has no predecessor and is excluded.
has_previous <- !is.na(yearly_data$Previous_Year_Crimes)
rose <- yearly_data$Total_Crimes > yearly_data$Previous_Year_Crimes
increased_crime <- yearly_data[has_previous & rose, ]
print(increased_crime)
## Year Total_Crimes Previous_Year_Crimes
## 4 2025 3111 2855
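Interpretation: The analysis shows that the total number of crime cases increased over the previous year only in 2025 (3,111 cases, up from 2,855 in 2024). Note that the code counts all records and does not filter by victim gender, so this result describes overall crime rather than crimes against women specifically. The rise could be due to factors such as increased reporting, population growth, or an actual rise in crime incidents.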
# Ten most frequent Crime_Type / Location_Type / Relationship combinations.
# `.groups = "drop"` returns an ungrouped tibble, so the ranking is not left
# grouped by Crime_Type and Location_Type and no regrouping message is shown.
top_result <- Crime %>%
  group_by(Crime_Type, Location_Type, Relationship) %>%
  summarise(Count = n(), .groups = "drop") %>%
  arrange(desc(Count)) %>%
  head(10)
## `summarise()` has regrouped the output.
## ℹ Summaries were computed grouped by Crime_Type, Location_Type, and
## Relationship.
## ℹ Output is grouped by Crime_Type and Location_Type.
## ℹ Use `summarise(.groups = "drop_last")` to silence this message.
## ℹ Use `summarise(.by = c(Crime_Type, Location_Type, Relationship))` for
## per-operation grouping (`?dplyr::dplyr_by`) instead.
print(top_result)
## # A tibble: 10 × 4
## # Groups: Crime_Type, Location_Type [10]
## Crime_Type Location_Type Relationship Count
## <fct> <fct> <fct> <int>
## 1 Fraud Rural Neighbor 172
## 2 Kidnapping Urban Neighbor 172
## 3 Assault Online Spouse 164
## 4 Robbery Urban Friend 164
## 5 Murder Rural Stranger 162
## 6 Murder Urban Spouse 162
## 7 Kidnapping Rural Spouse 161
## 8 Assault Rural Stranger 160
## 9 Murder Online Friend 160
## 10 Theft Rural Spouse 159
Interpretation: I grouped the data by crime type, location type, and relationship to identify the most frequent combinations, which helps in understanding common patterns of crimes.
# Average year-over-year growth (in percent) of case counts per state.
# Each state's yearly total is compared with the preceding row for that
# state; the first year has no predecessor and is ignored in the mean.
state_growth <- Crime %>%
  count(State, Year, name = "total_crimes") %>%
  arrange(State, Year) %>%
  group_by(State) %>%
  mutate(
    prev_year = lag(total_crimes),
    growth_rate = ((total_crimes - prev_year) / prev_year) * 100
  ) %>%
  summarise(avg_growth = mean(growth_rate, na.rm = TRUE)) %>%
  arrange(desc(avg_growth))
head(state_growth, n = 10)
## # A tibble: 10 × 2
## State avg_growth
## <fct> <dbl>
## 1 Delhi 1.83
## 2 Telangana 1.17
## 3 Chhattisgarh 1.02
## 4 Punjab 0.670
## 5 Bihar 0.256
## 6 Uttar Pradesh -1.08
## 7 Karnataka -1.10
## 8 Maharashtra -1.72
## 9 Tamil Nadu -1.75
## 10 Rajasthan -4.29
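Interpretation: The analysis shows the average year-over-year growth rate of crime counts for each state (districts are not analysed here). Delhi has the highest average growth (about 1.8%), while Rajasthan shows the largest average decline (about −4.3%); five of the ten states have a negative average. A high growth rate indicates an emerging crime hotspot, even if the total crime count is not the highest.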
# Outlier detection for Victim_Age with the 1.5 * IQR rule.
age_quartiles <- quantile(
  Crime$Victim_Age,
  probs = c(0.25, 0.75),
  na.rm = TRUE
)
Q1 <- age_quartiles[1]
Q3 <- age_quartiles[2]
IQR_value <- IQR(Crime$Victim_Age, na.rm = TRUE)
lower_bound <- Q1 - 1.5 * IQR_value
upper_bound <- Q3 + 1.5 * IQR_value

# Rows whose age lies outside [lower_bound, upper_bound].
is_outlier <- Crime$Victim_Age < lower_bound | Crime$Victim_Age > upper_bound
outliers <- Crime[is_outlier, ]
outliers
## # A tibble: 0 × 19
## # ℹ 19 variables: Year <dbl>, Case_ID <dbl>, Month <fct>, State <fct>,
## # City <fct>, District <fct>, Crime_Type <fct>, Sub_Crime_Type <fct>,
## # Victim_Gender <fct>, Victim_Age <dbl>, Accused_Gender <fct>,
## # Weapon_Used <fct>, Relationship <fct>, Location_Type <fct>,
## # Arrest_Made <fct>, Case_Status <fct>, Crime_Severity <chr>, Source <fct>,
## # Crime_Score <dbl>
# Complement of the outlier set: rows whose age lies within the bounds.
clean_data <- Crime[!is_outlier, ]
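Interpretation: Using the 1.5 × IQR rule, no Victim_Age values fall outside the lower and upper bounds (the outlier table has 0 rows), so clean_data contains the same records as Crime. Checking for outliers confirms that the analysis reflects realistic patterns rather than extreme or incorrect values.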
# Cases per state, largest first; show the six states with the most cases.
state_counts <- count(Crime, State, sort = TRUE)
head(state_counts, n = 6)
## # A tibble: 6 × 2
## State n
## <fct> <int>
## 1 Chhattisgarh 1561
## 2 Maharashtra 1555
## 3 Tamil Nadu 1552
## 4 Telangana 1527
## 5 Delhi 1483
## 6 Punjab 1479
Interpretation: This analysis identifies the states with the highest crime counts, helping to highlight regions with greater crime concentration.
# Frequency of each Arrest_Made value.
table(Crime[["Arrest_Made"]])
##
## No Yes
## 7578 7423
Interpretation: This shows how many cases resulted in arrests compared to those that did not, helping to evaluate law enforcement effectiveness.
# Mean Crime_Score for every State / Crime_Type pair, highest first.
# `.groups = "drop"` returns an ungrouped tibble, so later verbs applied to
# avg_score are not silently evaluated per state and no regrouping message
# is shown.
avg_score <- Crime %>%
  group_by(State, Crime_Type) %>%
  summarise(
    avg_crime_score = mean(Crime_Score, na.rm = TRUE),
    .groups = "drop"
  ) %>%
  arrange(desc(avg_crime_score))
## `summarise()` has regrouped the output.
## ℹ Summaries were computed grouped by State and Crime_Type.
## ℹ Output is grouped by State.
## ℹ Use `summarise(.groups = "drop_last")` to silence this message.
## ℹ Use `summarise(.by = c(State, Crime_Type))` for per-operation grouping
## (`?dplyr::dplyr_by`) instead.
avg_score
## # A tibble: 70 × 3
## # Groups: State [10]
## State Crime_Type avg_crime_score
## <fct> <fct> <dbl>
## 1 Maharashtra Kidnapping 6.42
## 2 Maharashtra Assault 6.34
## 3 Delhi Murder 6.32
## 4 Tamil Nadu Murder 6.25
## 5 Bihar Cybercrime 6.25
## 6 Bihar Fraud 6.23
## 7 Delhi Theft 6.22
## 8 Chhattisgarh Theft 6.20
## 9 Telangana Cybercrime 6.19
## 10 Telangana Assault 6.19
## # ℹ 60 more rows
Interpretation: The analysis shows that the average crime score varies only modestly across combinations of state and crime type. The highest averages are Maharashtra–Kidnapping (6.42), Maharashtra–Assault (6.34), and Delhi–Murder (6.32), and the ten highest combinations all lie between about 6.2 and 6.4. By crime type alone the averages are even closer (5.91 for Robbery to 6.05 for Murder), so no crime type is clearly more severe than the others. Maharashtra, Delhi, Bihar, and Telangana each appear twice in the top ten, which hints at some regional variation in crime intensity. Overall, this analysis helps identify state and crime-type combinations that may warrant stronger law enforcement and preventive measures.
# Cross-tabulate State against Case_Status and flatten to long format.
status_by_state <- table(Crime$State, Crime$Case_Status)
result <- as.data.frame(status_by_state)
colnames(result) <- c("State", "Case_Status", "Count")
# Share of each status within its state, in percent (each state's rows sum
# to 100). The flattened table and the data frame use the same cell order.
result$Percentage <- as.vector(prop.table(status_by_state, margin = 1)) * 100
result
## State Case_Status Count Percentage
## 1 Bihar Closed 526 36.02740
## 2 Chhattisgarh Closed 499 31.96669
## 3 Delhi Closed 505 34.05260
## 4 Karnataka Closed 487 33.35616
## 5 Maharashtra Closed 544 34.98392
## 6 Punjab Closed 493 33.33333
## 7 Rajasthan Closed 516 35.41524
## 8 Tamil Nadu Closed 538 34.66495
## 9 Telangana Closed 530 34.70858
## 10 Uttar Pradesh Closed 519 35.37832
## 11 Bihar Open 453 31.02740
## 12 Chhattisgarh Open 541 34.65727
## 13 Delhi Open 483 32.56912
## 14 Karnataka Open 521 35.68493
## 15 Maharashtra Open 503 32.34727
## 16 Punjab Open 497 33.60379
## 17 Rajasthan Open 478 32.80714
## 18 Tamil Nadu Open 495 31.89433
## 19 Telangana Open 509 33.33333
## 20 Uttar Pradesh Open 488 33.26517
## 21 Bihar Under Investigation 481 32.94521
## 22 Chhattisgarh Under Investigation 521 33.37604
## 23 Delhi Under Investigation 495 33.37829
## 24 Karnataka Under Investigation 452 30.95890
## 25 Maharashtra Under Investigation 508 32.66881
## 26 Punjab Under Investigation 489 33.06288
## 27 Rajasthan Under Investigation 463 31.77763
## 28 Tamil Nadu Under Investigation 519 33.44072
## 29 Telangana Under Investigation 488 31.95809
## 30 Uttar Pradesh Under Investigation 460 31.35651
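Interpretation: The analysis shows the percentage distribution of case statuses across different states. In every state the three statuses each account for roughly one third of cases: the Closed share ranges from 32.0% (Chhattisgarh) to 36.0% (Bihar), the Open share from 31.0% (Bihar) to 35.7% (Karnataka), and the Under Investigation share from 31.0% (Karnataka) to 33.4% (Tamil Nadu). No state has more than 50% of its cases under investigation, which would have indicated a heavy ongoing workload and slower closure rates. Overall, this analysis helps evaluate the efficiency of law enforcement across regions and identifies areas that require improvement.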
# Age bands: under 18 = "Minor", 18 to 60 = "Adult", over 60 = "Senior".
# A missing age stays missing rather than falling into the default band.
Crime$Age_Group <- case_when(
  is.na(Crime$Victim_Age) ~ NA_character_,
  Crime$Victim_Age < 18 ~ "Minor",
  Crime$Victim_Age <= 60 ~ "Adult",
  .default = "Senior"
)
# Case counts for every age band / crime type pair.
result <- as.data.frame(table(Crime$Age_Group, Crime$Crime_Type))
colnames(result) <- c("Age_Group", "Crime_Type", "Count")
# Within each age band, list the most frequent crime types first.
row_order <- order(result$Age_Group, -result$Count)
result <- result[row_order, ]
result
## Age_Group Crime_Type Count
## 13 Adult Murder 1760
## 1 Adult Assault 1654
## 19 Adult Theft 1653
## 4 Adult Cybercrime 1649
## 10 Adult Kidnapping 1634
## 7 Adult Fraud 1607
## 16 Adult Robbery 1584
## 17 Minor Robbery 134
## 14 Minor Murder 123
## 11 Minor Kidnapping 111
## 8 Minor Fraud 105
## 2 Minor Assault 104
## 20 Minor Theft 102
## 5 Minor Cybercrime 98
## 21 Senior Theft 402
## 18 Senior Robbery 396
## 12 Senior Kidnapping 395
## 3 Senior Assault 376
## 9 Senior Fraud 373
## 15 Senior Murder 371
## 6 Senior Cybercrime 370
Interpretation: The analysis shows how crime types are distributed within each age group. Among minors, the most frequent crime types are Robbery (134), Murder (123), and Kidnapping (111); among adults, Murder (1,760), Assault (1,654), and Theft (1,653); and among seniors, Theft (402), Robbery (396), and Kidnapping (395). Within each group the counts are fairly close across crime types, so the differences between age groups are modest. Adults account for the most cases because the 18–60 range covers most victims in the dataset. This comparison helps identify which populations may require targeted protection and preventive measures.
# Total cases per state as a two-column data frame.
state_counts <- setNames(
  as.data.frame(table(Crime$State)),
  c("State", "Total_Cases")
)
# Largest totals first, with row names renumbered from 1.
state_counts <- state_counts[order(-state_counts$Total_Cases), ]
rownames(state_counts) <- NULL
# Ten states with the most cases.
head(state_counts, n = 10)
## State Total_Cases
## 1 Chhattisgarh 1561
## 2 Maharashtra 1555
## 3 Tamil Nadu 1552
## 4 Telangana 1527
## 5 Delhi 1483
## 6 Punjab 1479
## 7 Uttar Pradesh 1467
## 8 Bihar 1460
## 9 Karnataka 1460
## 10 Rajasthan 1457
Interpretation: The analysis ranks states by total crime count. Chhattisgarh (1,561), Maharashtra (1,555), and Tamil Nadu (1,552) have the most cases, while Rajasthan (1,457) has the fewest. The counts are close — the gap between the highest and lowest state is only about 100 cases — so no state has a dramatically higher crime count than the others. Differences may reflect factors such as population density or urbanization, which this dataset does not measure. Overall, this ranking helps identify states where stronger law enforcement and preventive measures may be required.
# Rank districts by their number of high-severity cases.
# Keep only the high-severity records.
high_data <- Crime[Crime$Crime_Severity == "High", ]
# Count those records per district.
district_counts <- as.data.frame(table(high_data$District))
colnames(district_counts) <- c("District", "High_Severity_Cases")
# Largest counts first, with row names renumbered from 1.
district_counts <- district_counts[order(-district_counts$High_Severity_Cases), ]
rownames(district_counts) <- NULL
# seq_len() yields an empty rank for an empty table, whereas 1:nrow() would
# produce c(1, 0) and make the assignment fail.
district_counts$Rank <- seq_len(nrow(district_counts))
# Put the rank first.
district_counts <- district_counts[, c("Rank", "District", "High_Severity_Cases")]
head(district_counts, 10)
## Rank District High_Severity_Cases
## 1 1 Mumbai 555
## 2 2 Raipur 540
## 3 3 Chennai 528
## 4 4 Hyderabad 512
## 5 5 Meerut 506
## 6 6 Delhi 498
## 7 7 Ludhiana 491
## 8 8 Patna 491
## 9 9 Bangalore 467
## 10 10 Sriganganagar 462
Interpretation: The analysis ranks districts by their number of high-severity crimes. Mumbai (555), Raipur (540), and Chennai (528) have the most high-severity cases, followed by Hyderabad (512), Meerut (506), and Delhi (498); all of the top ten districts record more than 460 such cases. This indicates that these districts are the main hotspots for serious criminal activity in the dataset, although the differences between them are moderate.
library(dplyr)
# Year-over-year percentage change in case counts, ranked from the largest
# increase to the largest decrease. The first year has no predecessor, so
# its growth rate is missing and the row is dropped.
year_growth <- Crime |>
  group_by(Year) |>
  summarise(n = n()) |>
  arrange(Year) |>
  mutate(
    Previous = lag(n),
    Growth_Rate = ((n - Previous) / Previous) * 100
  ) |>
  filter(!is.na(Growth_Rate)) |>
  arrange(desc(Growth_Rate)) |>
  mutate(Rank = row_number())
year_growth
## # A tibble: 4 × 5
## Year n Previous Growth_Rate Rank
## <dbl> <int> <int> <dbl> <int>
## 1 2025 3111 2855 8.97 1
## 2 2023 3047 3052 -0.164 2
## 3 2026 2936 3111 -5.63 3
## 4 2024 2855 3047 -6.30 4
Interpretation: The analysis ranks years based on their crime growth rates, showing how crime trends change over time. For example, a year with a growth rate of around 8–10% indicates a significant increase in crime compared to the previous year, while lower or negative values indicate stability or decline.
# Rank victim-accused relationship categories by number of cases.
relationship_counts <- as.data.frame(table(Crime$Relationship))
colnames(relationship_counts) <- c("Relationship", "Total_Cases")
# Largest counts first, with row names renumbered from 1.
relationship_counts <- relationship_counts[order(-relationship_counts$Total_Cases), ]
rownames(relationship_counts) <- NULL
# seq_len() yields an empty rank for an empty table, whereas 1:nrow() would
# produce c(1, 0) and make the assignment fail.
relationship_counts$Rank <- seq_len(nrow(relationship_counts))
relationship_counts <- relationship_counts[, c("Rank", "Relationship", "Total_Cases")]
relationship_counts
## Rank Relationship Total_Cases
## 1 1 Neighbor 3036
## 2 2 Spouse 3022
## 3 3 Stranger 3006
## 4 4 Relative 2984
## 5 5 Friend 2953
Interpretation: The analysis ranks the relationship between victim and accused by number of cases. The five categories are almost evenly represented: Neighbor (3,036), Spouse (3,022), Stranger (3,006), Relative (2,984), and Friend (2,953). Strangers therefore account for only about one fifth of cases, while the remaining four categories are people known to the victim. This highlights that crimes are not only committed by strangers but, far more often, by individuals known to the victim.
# Rank crime types by number of cases among minors (age < 18).
minor_data <- Crime[Crime$Victim_Age < 18, ]
crime_counts <- as.data.frame(table(minor_data$Crime_Type))
colnames(crime_counts) <- c("Crime_Type", "Total_Cases")
# Largest counts first, with row names renumbered from 1.
crime_counts <- crime_counts[order(-crime_counts$Total_Cases), ]
rownames(crime_counts) <- NULL
# seq_len() yields an empty rank for an empty table, whereas 1:nrow() would
# produce c(1, 0) and make the assignment fail.
crime_counts$Rank <- seq_len(nrow(crime_counts))
crime_counts <- crime_counts[, c("Rank", "Crime_Type", "Total_Cases")]
crime_counts
## Rank Crime_Type Total_Cases
## 1 1 Robbery 134
## 2 2 Murder 123
## 3 3 Kidnapping 111
## 4 4 Fraud 105
## 5 5 Assault 104
## 6 6 Theft 102
## 7 7 Cybercrime 98
Interpretation: The analysis ranks crime types by the number of cases involving minors. Robbery (134), Murder (123), and Kidnapping (111) are the most frequent, while Cybercrime (98) is the least frequent; every crime type has roughly 100–135 cases. The differences are therefore moderate, but they suggest that minors in this dataset are somewhat more often victims of robbery and murder than of the other crime types.
# Age bands: under 18 = "Child", 18 to 60 = "Adult", over 60 = "Senior".
# This overwrites any existing Age_Group column.
Crime$Age_Group <- if_else(
  Crime$Victim_Age < 18,
  "Child",
  if_else(Crime$Victim_Age <= 60, "Adult", "Senior")
)
# Case counts per age band and crime type; within each band the most
# frequent crime types come first, and row names are renumbered from 1.
result <- as.data.frame(table(Crime$Age_Group, Crime$Crime_Type))
colnames(result) <- c("Age_Group", "Crime_Type", "Total_Cases")
row_order <- order(result$Age_Group, -result$Total_Cases)
result <- result[row_order, ]
rownames(result) <- NULL
result
## Age_Group Crime_Type Total_Cases
## 1 Adult Murder 1760
## 2 Adult Assault 1654
## 3 Adult Theft 1653
## 4 Adult Cybercrime 1649
## 5 Adult Kidnapping 1634
## 6 Adult Fraud 1607
## 7 Adult Robbery 1584
## 8 Child Robbery 134
## 9 Child Murder 123
## 10 Child Kidnapping 111
## 11 Child Fraud 105
## 12 Child Assault 104
## 13 Child Theft 102
## 14 Child Cybercrime 98
## 15 Senior Theft 402
## 16 Senior Robbery 396
## 17 Senior Kidnapping 395
## 18 Senior Assault 376
## 19 Senior Fraud 373
## 20 Senior Murder 371
## 21 Senior Cybercrime 370
Interpretation: The analysis compares crime types across age groups. Adults account for the highest number of cases for every crime type, with about 1,580–1,760 cases each, making them the most affected group. Children have the fewest cases (98–134 per crime type), led by Robbery, Murder, and Kidnapping, while seniors have 370–402 cases per crime type, led by Theft, Robbery, and Kidnapping. Within each age group the counts are similar across crime types, so the main difference between groups is the overall number of cases rather than the mix of crimes. These figures can still inform age-specific safety measures and awareness programs.
library(dplyr)
# Crime Risk Index = Crime_Score weighted by severity (High = 3, Medium = 2,
# anything else = 1; a missing severity stays missing), averaged per state
# and listed from highest to lowest.
result <- Crime |>
  mutate(
    Severity_Score = case_when(
      is.na(Crime_Severity) ~ NA_real_,
      Crime_Severity == "High" ~ 3,
      Crime_Severity == "Medium" ~ 2,
      .default = 1
    ),
    Crime_Risk_Index = Crime_Score * Severity_Score
  ) |>
  group_by(State) |>
  summarise(Average_Risk = mean(Crime_Risk_Index, na.rm = TRUE)) |>
  arrange(desc(Average_Risk))
result
## # A tibble: 10 × 2
## State Average_Risk
## <fct> <dbl>
## 1 Maharashtra 12.4
## 2 Tamil Nadu 12.2
## 3 Chhattisgarh 12.1
## 4 Telangana 12.1
## 5 Uttar Pradesh 12.0
## 6 Bihar 12.0
## 7 Delhi 11.9
## 8 Karnataka 11.7
## 9 Rajasthan 11.7
## 10 Punjab 11.6
Interpretation: The analysis shows that the Crime Risk Index, which combines crime severity and score, varies only slightly across states. Average risk ranges from about 11.6 (Punjab) to 12.4 (Maharashtra): states at the upper end (around 12.2–12.4, e.g., Maharashtra and Tamil Nadu) have somewhat more severe and impactful crimes, while states at the lower end (around 11.6–11.7, e.g., Punjab, Rajasthan, and Karnataka) indicate slightly lower risk levels.
library(dplyr)
# Yearly totals with the change against the preceding year. The first year
# has no predecessor, so its growth rate and indicator are missing.
result <- Crime |>
  count(Year, name = "Total_Crimes") |>
  arrange(Year) |>
  mutate(
    Previous_Year = lag(Total_Crimes),
    Growth_Rate = ((Total_Crimes - Previous_Year) / Previous_Year) * 100,
    Growth_Indicator = if_else(
      Growth_Rate > 0,
      "Increasing",
      "Decreasing/Stable"
    )
  )
result
## # A tibble: 5 × 5
## Year Total_Crimes Previous_Year Growth_Rate Growth_Indicator
## <dbl> <int> <int> <dbl> <chr>
## 1 2022 3052 NA NA <NA>
## 2 2023 3047 3052 -0.164 Decreasing/Stable
## 3 2024 2855 3047 -6.30 Decreasing/Stable
## 4 2025 3111 2855 8.97 Increasing
## 5 2026 2936 3111 -5.63 Decreasing/Stable
Interpretation: The analysis shows yearly crime trends and identifies whether crime is increasing or decreasing over time. For example, years with growth rates around 6–10% indicate a noticeable rise in crime compared to the previous year, while lower or negative values suggest stability or decline. The growth indicator clearly labels these trends, helping to quickly identify periods of increasing crime.
# Flag as hotspots the states whose case total exceeds the mean of all
# state totals; list the states from most to fewest cases.
hotspots <- Crime |>
  count(State, name = "Total_Crimes") |>
  mutate(
    Average_Crime = mean(Total_Crimes),
    Hotspot = if_else(Total_Crimes > Average_Crime, "Yes", "No")
  ) |>
  arrange(desc(Total_Crimes))
hotspots
## # A tibble: 10 × 4
## State Total_Crimes Average_Crime Hotspot
## <fct> <int> <dbl> <chr>
## 1 Chhattisgarh 1561 1500. Yes
## 2 Maharashtra 1555 1500. Yes
## 3 Tamil Nadu 1552 1500. Yes
## 4 Telangana 1527 1500. Yes
## 5 Delhi 1483 1500. No
## 6 Punjab 1479 1500. No
## 7 Uttar Pradesh 1467 1500. No
## 8 Bihar 1460 1500. No
## 9 Karnataka 1460 1500. No
## 10 Rajasthan 1457 1500. No
Interpretation: The analysis identifies high-crime regions by comparing total crime counts with the overall average. States with crime counts above the average, typically exceeding 1500 cases in the dataset, are marked as hotspots, while those below the average are considered lower-risk regions. This shows that only a few states contribute disproportionately to total crime occurrences. Identifying these hotspots helps in prioritizing resource allocation and implementing targeted crime prevention strategies.
# Band states by total cases: more than 1500 = "High", more than 1200 =
# "Medium", otherwise "Low"; list the states from most to fewest cases.
crime_frequency <- Crime |>
  count(State, name = "Total_Crimes") |>
  mutate(
    Crime_Level = case_when(
      Total_Crimes > 1500 ~ "High",
      Total_Crimes > 1200 ~ "Medium",
      .default = "Low"
    )
  ) |>
  arrange(desc(Total_Crimes))
crime_frequency
## # A tibble: 10 × 3
## State Total_Crimes Crime_Level
## <fct> <int> <chr>
## 1 Chhattisgarh 1561 High
## 2 Maharashtra 1555 High
## 3 Tamil Nadu 1552 High
## 4 Telangana 1527 High
## 5 Delhi 1483 Medium
## 6 Punjab 1479 Medium
## 7 Uttar Pradesh 1467 Medium
## 8 Bihar 1460 Medium
## 9 Karnataka 1460 Medium
## 10 Rajasthan 1457 Medium
Interpretation: The analysis categorizes states into High, Medium, and Low crime frequency groups based on total occurrences. States with more than 1500 cases fall under the High category, while those between 1200–1500 are Medium, and below 1200 are Low. In this dataset, Chhattisgarh, Maharashtra, Tamil Nadu, and Telangana fall into the High category, indicating higher crime concentration, while the remaining six states are all Medium; no state falls into the Low group. This classification helps in simplifying complex data and identifying priority regions for intervention.
library(ggplot2)
# Bar chart of the number of cases per weapon. The x-axis levels are
# reordered by the negated per-weapon count, so the most frequent weapon
# comes first. Each bar is filled by weapon.
bar <- ggplot(Crime, aes(x = reorder(Weapon_Used, -table(Weapon_Used)[Weapon_Used]),
fill = Weapon_Used)) +
# geom_bar() counts the rows in each x category.
geom_bar() +
theme_minimal() +
labs(title = "Distribution of Weapons Used in Crimes",
x = "Weapon Used",
y = "Number of Cases") +
# Slant the x labels by 45 degrees and hide the legend.
theme(axis.text.x = element_text(angle = 45, hjust = 1),
legend.position = "none")
# Convert the ggplot object to a plotly widget.
plotly::ggplotly(bar)
Interpretation: The bar chart shows the distribution of weapons used in crimes, highlighting which types are most common. Weapons such as knives, firearms, and blunt objects appear most frequently, often with counts exceeding 1000 cases in the dataset, indicating their dominant use in criminal activities. In contrast, other weapon types show relatively lower usage. This analysis helps identify commonly used weapons and supports better policy decisions regarding weapon control and public safety measures.
library(dplyr)
library(ggplot2)
# Yearly case totals (columns: Year, n).
year_data <- count(Crime, Year)
# Blue trend line with a red marker for each year.
line <- ggplot(data = year_data, mapping = aes(x = Year, y = n)) +
  geom_line(linewidth = 1, color = "blue") +
  geom_point(size = 2, color = "red") +
  theme_minimal() +
  labs(
    title = "Crime Trend Over Years",
    x = "Year",
    y = "Number of Crimes"
  )
# Convert the ggplot object to a plotly widget.
plotly::ggplotly(line)
Interpretation: The line chart shows the trend of crime occurrences over the years, highlighting how crime levels have changed over time. The totals fluctuate rather than rise steadily: 3,052 cases in 2022, 3,047 in 2023, a drop to 2,855 in 2024, a peak of 3,111 in 2025, and 2,936 in 2026. There is therefore no consistent upward or downward trend; yearly totals stay within a band of roughly 2,850–3,110 cases. This analysis helps in understanding long-term trends and supports better planning for crime prevention strategies.
library(dplyr)
library(ggplot2)
# Count crimes by State and Crime Type.
# `.groups = "drop"` returns an ungrouped tibble, so crime_data is not left
# grouped by State and no regrouping message is shown.
crime_data <- Crime %>%
  group_by(State, Crime_Type) %>%
  summarise(Total_Cases = n(), .groups = "drop")
## `summarise()` has regrouped the output.
## ℹ Summaries were computed grouped by State and Crime_Type.
## ℹ Output is grouped by State.
## ℹ Use `summarise(.groups = "drop_last")` to silence this message.
## ℹ Use `summarise(.by = c(State, Crime_Type))` for per-operation grouping
## (`?dplyr::dplyr_by`) instead.
# Side-by-side bars: one bar per crime type within each state, with bar
# height taken directly from Total_Cases.
bar2 <- ggplot(
  data = crime_data,
  mapping = aes(x = State, y = Total_Cases, fill = Crime_Type)
) +
  geom_col(position = "dodge") +
  theme_minimal() +
  theme(axis.text.x = element_text(angle = 45, hjust = 1)) +
  labs(
    title = "Comparison of Crime Types across States",
    x = "State",
    y = "Number of Cases"
  )
# Convert the ggplot object to a plotly widget.
plotly::ggplotly(bar2)
Interpretation: The bar chart compares the number of different crime types across states, highlighting variations in crime patterns. Each state has about 1,460–1,560 cases in total, split across seven crime types, so individual bars are on the order of 200 cases each. Chhattisgarh, Maharashtra, and Tamil Nadu have the highest overall totals. No single crime type dominates: across the whole dataset, Murder is the most frequent (2,254 cases) and Fraud the least (2,085). This comparison helps identify both region-specific and crime-specific trends, supporting better law enforcement strategies.
library(dplyr)
library(ggplot2)
# Cases per crime type with each type's share of the total, in percent.
crime_counts <- count(Crime, Crime_Type)
crime_counts <- mutate(
  crime_counts,
  Percentage = round((n / sum(n)) * 100, 1)
)
# One slice colour per crime type.
pie_colors <- c(
  "hotpink", "steelblue", "palegreen", "orange",
  "indianred", "midnightblue", "khaki"
)
# Pie chart labelled with crime type and percentage.
plot_ly(
  data = crime_counts,
  type = "pie",
  labels = ~Crime_Type,
  values = ~n,
  textinfo = "label+percent",
  marker = list(colors = pie_colors)
) %>%
  layout(title = "Crime Type Distribution")
Interpretation: The pie chart shows the distribution of different crime types, highlighting their proportion in total cases. Crime types such as theft, assault, and cybercrime occupy larger portions, each contributing around 12–15% of total cases, indicating their higher occurrence. Other crime types have smaller shares, showing relatively lower frequency. This analysis helps understand which crimes are most common and where preventive measures should be focused.
library(ggplot2)
# Histogram of victim ages using 5-year-wide bins
hist <- ggplot(Crime, aes(x = Victim_Age)) +
  geom_histogram(binwidth = 5, fill = "yellow", color = "black") +
  labs(
    title = "Distribution of Victim Age",
    x = "Victim Age",
    y = "Frequency"
  ) +
  theme_minimal()
# Convert the static ggplot into an interactive plotly widget
plotly::ggplotly(hist)
Interpretation: This histogram shows the distribution of victim age, helping identify the most affected age groups.
library(ggplot2)
# One box of victim ages per crime type, coloured by crime type
box <- ggplot(Crime, aes(x = Crime_Type, y = Victim_Age, fill = Crime_Type)) +
  geom_boxplot() +
  labs(
    title = "Distribution of Victim Age across Crime Types",
    x = "Crime Type",
    y = "Victim Age",
    fill = "Crime Type"
  ) +
  theme_minimal() +
  # Applied after theme_minimal() so these settings are not overwritten
  theme(
    legend.position = "right",
    axis.text.x = element_text(angle = 45, hjust = 1)
  )
plotly::ggplotly(box)
Interpretation: The box plot shows how victim age varies across different crime types, highlighting median values and spread. Most crime types have median victim ages around 30–40 years, indicating that adults are the most affected group. However, some crimes show wider ranges, extending from below 20 to above 60 years, indicating impact across multiple age groups. The presence of outliers suggests that certain cases involve unusually young or older victims.
library(dplyr)
library(ggplot2)
# Count crimes by Year and Crime Type.
# count() returns an ungrouped tibble, so no leftover Year grouping leaks
# into later steps and there is no summarise() regrouping message to silence.
trend_data <- Crime %>%
  count(Year, Crime_Type, name = "Total_Cases")
# Plot: one line (with a marker per year) for each crime type
line2 <- ggplot(trend_data, aes(x = Year, y = Total_Cases, color = Crime_Type)) +
  geom_line(linewidth = 1) +
  geom_point() +
  theme_minimal() +
  labs(
    title = "Crime Trends Over Time by Crime Type",
    x = "Year",
    y = "Number of Cases",
    color = "Crime Type"
  )
plotly::ggplotly(line2)
Interpretation: The multi-line graph shows how different crime types have changed over time, highlighting variations in trends. Crime types such as theft, assault, and cybercrime show noticeable fluctuations, with some increasing from around 250–300 cases to over 350–400 cases in recent years, indicating rising trends.
# Count the cases recorded per year for each weapon category
weapon_data <- Crime %>%
  count(Year, Weapon_Used)
# Area chart of yearly case counts, one filled band per weapon category.
# Axis and legend titles are set explicitly so the chart does not expose the
# raw column names ("n", "Weapon_Used"), matching the other charts in this file.
area <- ggplot(weapon_data, aes(x = Year, y = n, fill = Weapon_Used)) +
  geom_area(alpha = 0.6) +
  theme_minimal() +
  labs(
    title = "Weapon Usage Trend",
    x = "Year",
    y = "Number of Cases",
    fill = "Weapon Used"
  )
plotly::ggplotly(area)
Interpretation: The area chart illustrates how crime cases vary over time across different weapon categories. Each colored area represents a specific weapon type, and the height of the area indicates the number of cases involving that weapon in a given year. The stacked structure not only shows the overall trend in total crime but also highlights the contribution of each weapon type to the total. An increase in the total area suggests a rise in overall crime, while dominance of a particular color indicates that the corresponding weapon is used more often. Variations in the size of individual areas reflect increases or decreases in the use of specific weapons over time. Overall, the chart effectively presents both the trend and composition of weapon usage across years.
# Keep only the numeric columns of Crime.
# vapply() guarantees a logical vector with one entry per column, whereas
# sapply() can silently change its return type (e.g. on an empty input).
num_data <- Crime[vapply(Crime, is.numeric, logical(1))]
# Pair plot: scatter-plot matrix of every pair of numeric columns
pairs(
  num_data,
  main = "Pair Plot (Base R)",
  pch = 19,
  col = "blue"
)
Interpretation: The pair plot shows the relationships between
numerical variables such as Year, Case_ID, Victim_Age, and Crime_Score.
Most scatter plots do not show a strong linear relationship, indicating
weak correlation between variables. The vertical patterns in Year
suggest it is a categorical-type variable rather than
continuous.
# Treat severity as a categorical predictor. The summary below reports the
# Low and Medium levels relative to the remaining level (High).
Crime[["Crime_Severity"]] <- as.factor(Crime[["Crime_Severity"]])
# Regression model: crime score explained by victim age and severity level
model <- lm(
  formula = Crime_Score ~ Victim_Age + Crime_Severity,
  data = Crime
)
# Summary: coefficients, residual spread and goodness of fit
summary(model)
##
## Call:
## lm(formula = Crime_Score ~ Victim_Age + Crime_Severity, data = Crime)
##
## Residuals:
## Min 1Q Median 3Q Max
## -5.0380 -1.9813 0.0025 1.9858 5.0555
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 5.9919834 0.0691562 86.644 <2e-16 ***
## Victim_Age -0.0007919 0.0013656 -0.580 0.562
## Crime_SeverityLow 0.0715964 0.0537973 1.331 0.183
## Crime_SeverityMedium 0.0530885 0.0540409 0.982 0.326
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 2.699 on 14997 degrees of freedom
## Multiple R-squared: 0.0001484, Adjusted R-squared: -5.166e-05
## F-statistic: 0.7417 on 3 and 14997 DF, p-value: 0.527
# Degrees of freedom of the fitted model:
# residual DF as stored on the fit, model DF = number of coefficients
# excluding the intercept.
df_residual <- df.residual(model)
df_model <- length(coefficients(model)) - 1
print(df_residual)
## [1] 14997
print(df_model)
## [1] 3
# Plot the regression line
library(ggplot2)
# Fixed seed so the same 500 rows are drawn on every run
set.seed(1)
sample_data <- Crime[sample.int(nrow(Crime), 500), ]
# Jittered scatter of the sample with one linear fit per severity level
reg <- ggplot(
  sample_data,
  aes(x = Victim_Age, y = Crime_Score, color = Crime_Severity)
) +
  geom_jitter(alpha = 0.5, width = 0.5, height = 0.2) +
  geom_smooth(method = "lm", se = FALSE) +
  labs(
    title = "Victim Age vs Crime Score by Severity",
    x = "Victim Age",
    y = "Crime Score",
    color = "Crime Severity"
  ) +
  theme_minimal()
plotly::ggplotly(reg)
## `geom_smooth()` using formula = 'y ~ x'
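Interpretation: The scatter plot shows how crime score varies with victim age across different severity levels, using a random sample of 500 cases. It should be read together with the regression output above: the Victim_Age coefficient (about −0.0008, p = 0.56) and the severity coefficients (Low: +0.07, p = 0.18; Medium: +0.05, p = 0.33, both relative to High) are not statistically significant, and R-squared is only about 0.0001. In this dataset, therefore, crime scores are not separated by severity level, and neither victim age nor severity explains the variation in crime score.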
# Numeric severity score: High = 3, Medium = 2, every other level = 1.
# Each comparison contributes 0 or 1, so the sum reproduces that mapping
# (and stays NA where the severity is missing).
Crime$Severity_Score <- with(
  Crime,
  1 + (Crime_Severity == "Medium") + 2 * (Crime_Severity == "High")
)
# Crime type and state enter the model as categorical predictors
Crime[c("Crime_Type", "State")] <- lapply(
  Crime[c("Crime_Type", "State")],
  as.factor
)
# Regression model
model <- lm(
  formula = Crime_Score ~ Victim_Age + Severity_Score + Crime_Type + State,
  data = Crime
)
# Summary
summary(model)
##
## Call:
## lm(formula = Crime_Score ~ Victim_Age + Severity_Score + Crime_Type +
## State, data = Crime)
##
## Residuals:
## Min 1Q Median 3Q Max
## -5.1563 -1.9828 -0.0025 2.0013 5.1242
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 6.138928 0.120294 51.032 <2e-16 ***
## Victim_Age -0.000784 0.001366 -0.574 0.566
## Severity_Score -0.036821 0.026920 -1.368 0.171
## Crime_TypeCybercrime 0.017429 0.082833 0.210 0.833
## Crime_TypeFraud -0.014241 0.083166 -0.171 0.864
## Crime_TypeKidnapping -0.090093 0.082632 -1.090 0.276
## Crime_TypeMurder 0.033524 0.081545 0.411 0.681
## Crime_TypeRobbery -0.115655 0.082877 -1.396 0.163
## Crime_TypeTheft 0.020277 0.082449 0.246 0.806
## StateChhattisgarh 0.020478 0.098334 0.208 0.835
## StateDelhi -0.032306 0.099568 -0.324 0.746
## StateKarnataka -0.083034 0.099938 -0.831 0.406
## StateMaharashtra 0.038197 0.098397 0.388 0.698
## StatePunjab -0.106013 0.099634 -1.064 0.287
## StateRajasthan -0.056564 0.099995 -0.566 0.572
## StateTamil Nadu 0.083862 0.098454 0.852 0.394
## StateTelangana 0.007924 0.098845 0.080 0.936
## StateUttar Pradesh 0.002123 0.099810 0.021 0.983
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 2.699 on 14983 degrees of freedom
## Multiple R-squared: 0.0009395, Adjusted R-squared: -0.0001941
## F-statistic: 0.8288 on 17 and 14983 DF, p-value: 0.6608
# Degrees of Freedom
# Residual DF as stored on the fit; model DF = coefficients minus intercept
df_residual <- df.residual(model)
df_model <- length(coefficients(model)) - 1
cat("Model DF:", df_model, "\nResidual DF:", df_residual)
## Model DF: 17
## Residual DF: 14983
library(ggplot2)
library(dplyr)
# Select top 3 states for better comparison:
# rank the states by case count and keep the names of the first three
top_states <- Crime %>%
  count(State) %>%
  arrange(desc(n)) %>%
  slice_head(n = 3) %>%
  pull(State)
# Restrict the data to cases from those states
filtered_data <- filter(Crime, State %in% top_states)
# Plot: crime score boxes per state, split by severity level
multi <- ggplot(
  filtered_data,
  aes(x = State, y = Crime_Score, fill = Crime_Severity)
) +
  geom_boxplot(alpha = 0.7) +
  labs(
    title = "Crime Score Distribution by State and Severity",
    x = "State",
    y = "Crime Score",
    fill = "Crime Severity"
  ) +
  theme_minimal()
plotly::ggplotly(multi)
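Interpretation: The box plot compares the distribution of crime scores across severity levels for the three states with the most cases. The multiple regression model above shows that none of the predictors is statistically significant: Victim_Age, Severity_Score, and every Crime_Type and State coefficient have p-values above 0.16, the overall F-test is not significant (p = 0.66), and R-squared is about 0.001. Crime score is therefore not meaningfully explained by crime severity, crime type, state, or victim age in this dataset, and no clear separation of scores between high and low severity crimes is supported by the model.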