# Run this in R Console (the bottom panel of RStudio)
options(repos = c(CRAN = "https://cran.rstudio.com/"))
install.packages(c("dplyr", "tidyr", "tibble", "ggplot2", "readr", "e1071"))
## Installing packages into 'C:/Users/Welcome/AppData/Local/R/win-library/4.5'
## (as 'lib' is unspecified)
## package 'dplyr' successfully unpacked and MD5 sums checked
## package 'tidyr' successfully unpacked and MD5 sums checked
## package 'tibble' successfully unpacked and MD5 sums checked
## package 'ggplot2' successfully unpacked and MD5 sums checked
## package 'readr' successfully unpacked and MD5 sums checked
## package 'e1071' successfully unpacked and MD5 sums checked
##
## The downloaded binary packages are in
## C:\Users\Welcome\AppData\Local\Temp\RtmpcdhFTZ\downloaded_packages
# Install with all dependencies
install.packages("dplyr")
## Installing package into 'C:/Users/Welcome/AppData/Local/R/win-library/4.5'
## (as 'lib' is unspecified)
## package 'dplyr' successfully unpacked and MD5 sums checked
##
## The downloaded binary packages are in
## C:\Users\Welcome\AppData\Local\Temp\RtmpcdhFTZ\downloaded_packages
install.packages("tidyr")
## Installing package into 'C:/Users/Welcome/AppData/Local/R/win-library/4.5'
## (as 'lib' is unspecified)
## package 'tidyr' successfully unpacked and MD5 sums checked
##
## The downloaded binary packages are in
## C:\Users\Welcome\AppData\Local\Temp\RtmpcdhFTZ\downloaded_packages
install.packages("tibble")
## Installing package into 'C:/Users/Welcome/AppData/Local/R/win-library/4.5'
## (as 'lib' is unspecified)
## package 'tibble' successfully unpacked and MD5 sums checked
##
## The downloaded binary packages are in
## C:\Users\Welcome\AppData\Local\Temp\RtmpcdhFTZ\downloaded_packages
install.packages("ggplot2")
## Installing package into 'C:/Users/Welcome/AppData/Local/R/win-library/4.5'
## (as 'lib' is unspecified)
## package 'ggplot2' successfully unpacked and MD5 sums checked
##
## The downloaded binary packages are in
## C:\Users\Welcome\AppData\Local\Temp\RtmpcdhFTZ\downloaded_packages
install.packages("readr")
## Installing package into 'C:/Users/Welcome/AppData/Local/R/win-library/4.5'
## (as 'lib' is unspecified)
## package 'readr' successfully unpacked and MD5 sums checked
##
## The downloaded binary packages are in
## C:\Users\Welcome\AppData\Local\Temp\RtmpcdhFTZ\downloaded_packages
install.packages("e1071")
## Installing package into 'C:/Users/Welcome/AppData/Local/R/win-library/4.5'
## (as 'lib' is unspecified)
## package 'e1071' successfully unpacked and MD5 sums checked
##
## The downloaded binary packages are in
## C:\Users\Welcome\AppData\Local\Temp\RtmpcdhFTZ\downloaded_packages
library(dplyr)
## Warning: package 'dplyr' was built under R version 4.5.3
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(tidyr)
## Warning: package 'tidyr' was built under R version 4.5.3
library(tibble)
## Warning: package 'tibble' was built under R version 4.5.3
library(ggplot2)
## Warning: package 'ggplot2' was built under R version 4.5.3
library(readr)
## Warning: package 'readr' was built under R version 4.5.3
library(e1071)
## Warning: package 'e1071' was built under R version 4.5.3
##
## Attaching package: 'e1071'
## The following object is masked from 'package:ggplot2':
##
## element
data <- read.csv("C:/Users/Welcome/Downloads/CrimesOnWomenData(5).csv")
data <- data[, -1]
# Rename columns
colnames(data) <- c("State", "Year", "Rape", "K.A", "DD", "AoW", "AoM", "DV", "WT")
# Display first few rows
head(data, 10)
## State Year Rape K.A DD AoW AoM DV WT
## 1 ANDHRA PRADESH 2001 871 765 420 3544 2271 5791 7
## 2 ARUNACHAL PRADESH 2001 33 55 0 78 3 11 0
## 3 ASSAM 2001 817 1070 59 850 4 1248 0
## 4 BIHAR 2001 888 518 859 562 21 1558 83
## 5 CHHATTISGARH 2001 959 171 70 1763 161 840 0
## 6 GOA 2001 12 6 2 17 7 11 0
## 7 GUJARAT 2001 286 857 67 756 111 3667 0
## 8 HARYANA 2001 398 297 285 478 401 1513 0
## 9 HIMACHAL PRADESH 2001 124 105 10 310 14 317 0
## 10 JAMMU & KASHMIR 2001 169 504 13 622 288 50 0
cat("\nDataset dimensions:", dim(data)[1], "rows x", dim(data)[2], "columns\n")
##
## Dataset dimensions: 736 rows x 9 columns
str(data)
## 'data.frame': 736 obs. of 9 variables:
## $ State: chr "ANDHRA PRADESH" "ARUNACHAL PRADESH" "ASSAM" "BIHAR" ...
## $ Year : int 2001 2001 2001 2001 2001 2001 2001 2001 2001 2001 ...
## $ Rape : int 871 33 817 888 959 12 286 398 124 169 ...
## $ K.A : int 765 55 1070 518 171 6 857 297 105 504 ...
## $ DD : int 420 0 59 859 70 2 67 285 10 13 ...
## $ AoW : int 3544 78 850 562 1763 17 756 478 310 622 ...
## $ AoM : int 2271 3 4 21 161 7 111 401 14 288 ...
## $ DV : int 5791 11 1248 1558 840 11 3667 1513 317 50 ...
## $ WT : int 7 0 0 83 0 0 0 0 0 0 ...
summary(data)
## State Year Rape K.A
## Length:736 Min. :2001 Min. : 0.0 Min. : 0.00
## Class :character 1st Qu.:2006 1st Qu.: 35.0 1st Qu.: 24.75
## Mode :character Median :2011 Median : 348.5 Median : 290.00
## Mean :2011 Mean : 727.9 Mean : 1134.54
## 3rd Qu.:2016 3rd Qu.:1069.0 3rd Qu.: 1216.00
## Max. :2021 Max. :6337.0 Max. :15381.00
## DD AoW AoM DV
## Min. : 0.0 Min. : 0.0 Min. : 0.0 Min. : 0.0
## 1st Qu.: 1.0 1st Qu.: 34.0 1st Qu.: 3.0 1st Qu.: 13.0
## Median : 29.0 Median : 387.5 Median : 31.0 Median : 678.5
## Mean : 215.7 Mean : 1579.1 Mean : 332.7 Mean : 2595.1
## 3rd Qu.: 259.0 3rd Qu.: 2122.2 3rd Qu.: 277.5 3rd Qu.: 3545.0
## Max. :2524.0 Max. :14853.0 Max. :9422.0 Max. :23278.0
## WT
## Min. : 0.00
## 1st Qu.: 0.00
## Median : 0.00
## Mean : 28.74
## 3rd Qu.: 15.00
## Max. :549.00
cat("\nMissing values per column:\n")
##
## Missing values per column:
colSums(is.na(data))
## State Year Rape K.A DD AoW AoM DV WT
## 0 0 0 0 0 0 0 0 0
cat("Number of duplicate rows:", sum(duplicated(data)), "\n")
## Number of duplicate rows: 0
sapply(data[, sapply(data, is.numeric)], function(x) sum(x < 0, na.rm = TRUE))
## Year Rape K.A DD AoW AoM DV WT
## 0 0 0 0 0 0 0 0
crime_totals <- colSums(data[, c("Rape", "K.A", "DD", "AoW", "AoM", "DV", "WT")], na.rm = TRUE)
cat("========== TOTAL CRIMES (2001-2021) ==========\n")
## ========== TOTAL CRIMES (2001-2021) ==========
print(crime_totals)
## Rape K.A DD AoW AoM DV WT
## 535702 835023 158750 1162229 244884 1909978 21156
grand_total <- sum(crime_totals)
cat("\nGRAND TOTAL:", format(grand_total, big.mark = ","), "\n")
##
## GRAND TOTAL: 4,867,722
means <- colMeans(data[, c("Rape", "K.A", "DD", "AoW", "AoM", "DV", "WT")], na.rm = TRUE)
cat("========== MEAN VALUES ==========\n")
## ========== MEAN VALUES ==========
print(round(means, 2))
## Rape K.A DD AoW AoM DV WT
## 727.86 1134.54 215.69 1579.12 332.72 2595.08 28.74
cat("\nHighest mean:", names(which.max(means)), "(", round(max(means), 2), ")\n")
##
## Highest mean: DV ( 2595.08 )
cat("Lowest mean:", names(which.min(means)), "(", round(min(means), 2), ")\n")
## Lowest mean: WT ( 28.74 )
medians <- apply(data[, c("Rape", "K.A", "DD", "AoW", "AoM", "DV", "WT")], 2, median, na.rm = TRUE)
cat("========== MEDIAN VALUES ==========\n")
## ========== MEDIAN VALUES ==========
print(round(medians, 2))
## Rape K.A DD AoW AoM DV WT
## 348.5 290.0 29.0 387.5 31.0 678.5 0.0
comparison <- data.frame(
Crime = names(means),
Mean = round(means, 2),
Median = round(medians, 2),
Difference = round(means - medians, 2)
)
print(comparison)
## Crime Mean Median Difference
## Rape Rape 727.86 348.5 379.36
## K.A K.A 1134.54 290.0 844.54
## DD DD 215.69 29.0 186.69
## AoW AoW 1579.12 387.5 1191.62
## AoM AoM 332.72 31.0 301.72
## DV DV 2595.08 678.5 1916.58
## WT WT 28.74 0.0 28.74
sds <- apply(data[, c("Rape", "K.A", "DD", "AoW", "AoM", "DV", "WT")], 2, sd, na.rm = TRUE)
cat("========== STANDARD DEVIATION ==========\n")
## ========== STANDARD DEVIATION ==========
print(round(sds, 2))
## Rape K.A DD AoW AoM DV WT
## 977.02 1993.54 424.93 2463.96 806.02 4042.00 80.00
cat("\nHighest variability:", names(which.max(sds)), "(SD =", round(max(sds), 2), ")\n")
##
## Highest variability: DV (SD = 4042 )
cat("Lowest variability:", names(which.min(sds)), "(SD =", round(min(sds), 2), ")\n")
## Lowest variability: WT (SD = 80 )
range_data <- apply(data[, c("Rape", "K.A", "DD", "AoW", "AoM", "DV", "WT")], 2, range, na.rm = TRUE)
cat("========== MINIMUM VALUES ==========\n")
## ========== MINIMUM VALUES ==========
print(round(range_data[1, ], 2))
## Rape K.A DD AoW AoM DV WT
## 0 0 0 0 0 0 0
cat("\n========== MAXIMUM VALUES ==========\n")
##
## ========== MAXIMUM VALUES ==========
print(round(range_data[2, ], 2))
## Rape K.A DD AoW AoM DV WT
## 6337 15381 2524 14853 9422 23278 549
for(crime in names(range_data[2, ])) {
max_val <- range_data[2, crime]
max_row <- data[which(data[[crime]] == max_val), c("State", "Year", crime)]
if(nrow(max_row) > 0) {
cat(crime, ":", max_val, "-", max_row$State[1], "(", max_row$Year[1], ")\n")
}
}
## Rape : 6337 - Punjab ( 2021 )
## K.A : 15381 - Uttar Pradesh ( 2018 )
## DD : 2524 - Uttar Pradesh ( 2017 )
## AoW : 14853 - Nagaland ( 2021 )
## AoM : 9422 - Uttar Pradesh ( 2016 )
## DV : 23278 - West Bengal ( 2014 )
## WT : 549 - Tamil Nadu ( 2013 )
cat("========== PERCENTILE ANALYSIS ==========\n\n")
## ========== PERCENTILE ANALYSIS ==========
for(crime in c("Rape", "K.A", "DV", "AoW")) {
percentiles <- quantile(data[[crime]], probs = c(0.25, 0.50, 0.75, 0.90, 0.95, 0.99), na.rm = TRUE)
cat(crime, ":\n")
cat(" 25th percentile:", round(percentiles[1], 2), "\n")
cat(" 50th percentile (Median):", round(percentiles[2], 2), "\n")
cat(" 75th percentile:", round(percentiles[3], 2), "\n")
cat(" 90th percentile:", round(percentiles[4], 2), "\n")
cat(" 95th percentile:", round(percentiles[5], 2), "\n")
cat(" 99th percentile:", round(percentiles[6], 2), "\n\n")
}
## Rape :
## 25th percentile: 35
## 50th percentile (Median): 348.5
## 75th percentile: 1069
## 90th percentile: 1835.5
## 95th percentile: 2857
## 99th percentile: 4667.25
##
## K.A :
## 25th percentile: 24.75
## 50th percentile (Median): 290
## 75th percentile: 1216
## 90th percentile: 3779
## 95th percentile: 5275.25
## 99th percentile: 9517.2
##
## DV :
## 25th percentile: 13
## 50th percentile (Median): 678.5
## 75th percentile: 3545
## 90th percentile: 7671
## 95th percentile: 11353.25
## 99th percentile: 18412.05
##
## AoW :
## 25th percentile: 34
## 50th percentile (Median): 387.5
## 75th percentile: 2122.25
## 90th percentile: 4782
## 95th percentile: 6791
## 99th percentile: 11325.55
detect_outliers <- function(x, crime_name) {
Q1 <- quantile(x, 0.25, na.rm = TRUE)
Q3 <- quantile(x, 0.75, na.rm = TRUE)
IQR_val <- Q3 - Q1
lower_bound <- Q1 - 1.5 * IQR_val
upper_bound <- Q3 + 1.5 * IQR_val
outliers_count <- sum(x < lower_bound | x > upper_bound, na.rm = TRUE)
return(outliers_count)
}
cat("========== OUTLIER DETECTION ==========\n\n")
## ========== OUTLIER DETECTION ==========
for(crime in c("Rape", "K.A", "DV", "AoW")) {
outlier_count <- detect_outliers(data[[crime]], crime)
cat(crime, ":", outlier_count, "outliers (",
round(outlier_count/nrow(data)*100, 2), "%)\n")
}
## Rape : 41 outliers ( 5.57 %)
## K.A : 84 outliers ( 11.41 %)
## DV : 53 outliers ( 7.2 %)
## AoW : 56 outliers ( 7.61 %)
par(mfrow = c(2, 2))
boxplot(data$Rape, main = "Rape Cases", ylab = "Number of Cases", col = "lightblue")
boxplot(data$K.A, main = "Kidnapping & Abduction", ylab = "Number of Cases", col = "lightgreen")
boxplot(data$DV, main = "Domestic Violence", ylab = "Number of Cases", col = "lightcoral")
boxplot(data$AoW, main = "Assault on Women", ylab = "Number of Cases", col = "lightyellow")

state_total <- data %>%
group_by(State) %>%
summarise(
Rape = sum(Rape, na.rm = TRUE),
Kidnapping = sum(K.A, na.rm = TRUE),
DD = sum(DD, na.rm = TRUE),
AoW = sum(AoW, na.rm = TRUE),
AoM = sum(AoM, na.rm = TRUE),
DV = sum(DV, na.rm = TRUE),
WT = sum(WT, na.rm = TRUE),
Total_Crimes = sum(Rape + K.A + DD + AoW + AoM + DV + WT, na.rm = TRUE)
) %>%
arrange(desc(Total_Crimes))
cat("========== TOP 10 STATES ==========\n")
## ========== TOP 10 STATES ==========
print(head(state_total[, c("State", "Total_Crimes")], 10))
## # A tibble: 10 × 2
## State Total_Crimes
## <chr> <int>
## 1 Uttar Pradesh 2843264
## 2 West Bengal 2545397
## 3 Madhya Pradesh 2386164
## 4 Rajasthan 2312652
## 5 Maharashtra 2172435
## 6 Andhra Pradesh 2102602
## 7 Assam 1906570
## 8 ANDHRA PRADESH 1803521
## 9 UTTAR PRADESH 1498605
## 10 MADHYA PRADESH 1440378
cat("\n========== BOTTOM 10 STATES ==========\n")
##
## ========== BOTTOM 10 STATES ==========
print(tail(state_total[, c("State", "Total_Crimes")], 10))
## # A tibble: 10 × 2
## State Total_Crimes
## <chr> <int>
## 1 MANIPUR 9088
## 2 GOA 7911
## 3 Puducherry 6551
## 4 A & N ISLANDS 4228
## 5 SIKKIM 3872
## 6 NAGALAND 2718
## 7 D & N HAVELI 1427
## 8 DAMAN & DIU 653
## 9 Lakshadweep 594
## 10 LAKSHADWEEP 171
highest_state <- state_total[1, ]
cat("\nHIGHEST CRIME STATE:", highest_state$State,
"-", format(highest_state$Total_Crimes, big.mark = ","), "cases\n")
##
## HIGHEST CRIME STATE: Uttar Pradesh - 2,843,264 cases
lowest_state <- state_total[nrow(state_total), ]
cat("LOWEST CRIME STATE:", lowest_state$State,
"-", format(lowest_state$Total_Crimes, big.mark = ","), "cases\n")
## LOWEST CRIME STATE: LAKSHADWEEP - 171 cases
top5_total <- sum(head(state_total$Total_Crimes, 5))
cat("\nTop 5 states account for:", round(top5_total / sum(state_total$Total_Crimes) * 100, 1), "%\n")
##
## Top 5 states account for: 28 %
crime_df <- data.frame(
Crime_Type = names(crime_totals),
Total_Cases = crime_totals,
Percentage = round(crime_totals / sum(crime_totals) * 100, 2)
) %>% arrange(desc(Total_Cases))
cat("========== CRIME DISTRIBUTION ==========\n")
## ========== CRIME DISTRIBUTION ==========
print(crime_df)
## Crime_Type Total_Cases Percentage
## DV DV 1909978 39.24
## AoW AoW 1162229 23.88
## K.A K.A 835023 17.15
## Rape Rape 535702 11.01
## AoM AoM 244884 5.03
## DD DD 158750 3.26
## WT WT 21156 0.43
cat("\nMOST COMMON:", crime_df$Crime_Type[1],
"(", format(crime_df$Total_Cases[1], big.mark = ","), "cases,",
crime_df$Percentage[1], "%)\n")
##
## MOST COMMON: DV ( 1,909,978 cases, 39.24 %)
cat("LEAST COMMON:", crime_df$Crime_Type[nrow(crime_df)],
"(", format(crime_df$Total_Cases[nrow(crime_df)], big.mark = ","), "cases,",
crime_df$Percentage[nrow(crime_df)], "%)\n")
## LEAST COMMON: WT ( 21,156 cases, 0.43 %)
yearly <- data %>%
group_by(Year) %>%
summarise(
Rape = sum(Rape, na.rm = TRUE),
Kidnapping = sum(K.A, na.rm = TRUE),
DD = sum(DD, na.rm = TRUE),
AoW = sum(AoW, na.rm = TRUE),
AoM = sum(AoM, na.rm = TRUE),
DV = sum(DV, na.rm = TRUE),
WT = sum(WT, na.rm = TRUE),
Total = sum(Rape + K.A + DD + AoW + AoM + DV + WT, na.rm = TRUE)
)
highest_year <- yearly %>% arrange(desc(Total)) %>% slice(1)
cat("========== YEAR WITH HIGHEST CRIMES ==========\n")
## ========== YEAR WITH HIGHEST CRIMES ==========
print(highest_year)
## # A tibble: 1 × 9
## Year Rape Kidnapping DD AoW AoM DV WT Total
## <int> <int> <int> <int> <int> <int> <int> <int> <int>
## 1 2014 36735 57311 10050 82235 21938 122877 2070 9989891
lowest_year <- yearly %>% arrange(Total) %>% slice(1)
cat("\n========== YEAR WITH LOWEST CRIMES ==========\n")
##
## ========== YEAR WITH LOWEST CRIMES ==========
print(lowest_year)
## # A tibble: 1 × 9
## Year Rape Kidnapping DD AoW AoM DV WT Total
## <int> <int> <int> <int> <int> <int> <int> <int> <int>
## 1 2001 15694 13681 6738 33622 9656 49032 114 3918785
first_year_total <- yearly$Total[yearly$Year == min(yearly$Year)]
last_year_total <- yearly$Total[yearly$Year == max(yearly$Year)]
change_percent <- ((last_year_total - first_year_total) / first_year_total) * 100
cat("\n========== OVERALL CHANGE ==========\n")
##
## ========== OVERALL CHANGE ==========
cat("2001 Total:", format(first_year_total, big.mark = ","), "\n")
## 2001 Total: 3,918,785
cat("2021 Total:", format(last_year_total, big.mark = ","), "\n")
## 2021 Total: 9,893,397
cat("Change:", round(change_percent, 1), "% increase\n")
## Change: 152.5 % increase
top15_states <- head(state_total, 15)
ggplot(top15_states, aes(x = reorder(State, Total_Crimes), y = Total_Crimes)) +
geom_bar(stat = "identity", fill = "steelblue") +
geom_text(aes(label = format(Total_Crimes, big.mark = ",")),
hjust = -0.1, size = 3) +
coord_flip() +
theme_minimal() +
labs(
title = "Top 15 States by Total Crimes Against Women (2001-2021)",
x = "State",
y = "Total Reported Cases"
) +
scale_y_continuous(labels = scales::comma)

major_crimes <- crime_df %>% filter(Crime_Type != "WT")
ggplot(major_crimes, aes(x = "", y = Percentage, fill = Crime_Type)) +
geom_bar(stat = "identity", width = 1) +
coord_polar(theta = "y") +
theme_void() +
labs(title = "Distribution of Major Crimes Against Women") +
geom_text(aes(label = paste0(round(Percentage, 1), "%")),
position = position_stack(vjust = 0.5), size = 4) +
scale_fill_brewer(palette = "Set2")

ggplot(yearly, aes(x = Year, y = Total)) +
geom_line(color = "darkred", size = 1.2) +
geom_point(color = "steelblue", size = 2) +
geom_smooth(method = "lm", se = TRUE, color = "gray30", linetype = "dashed") +
theme_minimal() +
labs(
title = "Yearly Trend of Crimes Against Women (2001-2021)",
x = "Year",
y = "Total Reported Cases"
) +
scale_y_continuous(labels = scales::comma) +
scale_x_continuous(breaks = seq(2001, 2021, 2))
## Warning: Using `size` aesthetic for lines was deprecated in ggplot2 3.4.0.
## ℹ Please use `linewidth` instead.
## This warning is displayed once per session.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.
## `geom_smooth()` using formula = 'y ~ x'

yearly_long <- yearly %>%
select(Year, Rape, Kidnapping, DD, AoW, AoM, DV) %>%
pivot_longer(cols = -Year, names_to = "Crime_Type", values_to = "Count")
ggplot(yearly_long, aes(x = Year, y = Count, color = Crime_Type)) +
geom_line(size = 1) +
geom_point(size = 1.5) +
facet_wrap(~Crime_Type, scales = "free_y", ncol = 2) +
theme_minimal() +
labs(
title = "Crime Trends by Category Over Time",
x = "Year",
y = "Reported Cases"
) +
theme(legend.position = "bottom")

ggplot(data, aes(x = Rape)) +
geom_histogram(bins = 50, fill = "steelblue", color = "white", alpha = 0.7) +
theme_minimal() +
labs(
title = "Distribution of Rape Cases Across State-Years",
x = "Rape Cases",
y = "Frequency"
) +
geom_vline(xintercept = mean(data$Rape), color = "red", linetype = "dashed", size = 1) +
geom_vline(xintercept = median(data$Rape), color = "blue", linetype = "dashed", size = 1) +
annotate("text", x = 1500, y = 100, label = "Mean", color = "red") +
annotate("text", x = 1500, y = 90, label = "Median", color = "blue")

boxplot_data <- data %>%
select(Rape, K.A, DD, AoW, DV) %>%
pivot_longer(everything(), names_to = "Crime", values_to = "Count")
ggplot(boxplot_data, aes(x = Crime, y = Count, fill = Crime)) +
geom_boxplot() +
scale_y_log10() +
theme_minimal() +
labs(
title = "Distribution of Crime Cases by Type (Log Scale)",
x = "Crime Type",
y = "Number of Cases (Log10 Scale)"
) +
theme(legend.position = "none")
## Warning in scale_y_log10(): log-10 transformation introduced infinite values.
## Warning: Removed 329 rows containing non-finite outside the scale range
## (`stat_boxplot()`).

cor_matrix <- cor(data[, c("Rape", "K.A", "DD", "AoW", "AoM", "DV")], use = "complete.obs")
cor_long <- as.data.frame(as.table(cor_matrix))
names(cor_long) <- c("Crime1", "Crime2", "Correlation")
ggplot(cor_long, aes(x = Crime1, y = Crime2, fill = Correlation)) +
geom_tile() +
scale_fill_gradient2(low = "blue", high = "red", mid = "white",
midpoint = 0, limit = c(-1, 1)) +
theme_minimal() +
labs(title = "Correlation Heatmap of Crime Types") +
theme(axis.text.x = element_text(angle = 45, hjust = 1)) +
geom_text(aes(label = round(Correlation, 2)), size = 3)

anova_rape <- aov(Rape ~ State, data = data)
cat("========== ANOVA: RAPE BY STATE ==========\n")
## ========== ANOVA: RAPE BY STATE ==========
summary(anova_rape)
## Df Sum Sq Mean Sq F value Pr(>F)
## State 69 516185347 7480947 26.87 <2e-16 ***
## Residuals 666 185429293 278422
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
anova_dv <- aov(DV ~ State, data = data)
cat("\n========== ANOVA: DOMESTIC VIOLENCE BY STATE ==========\n")
##
## ========== ANOVA: DOMESTIC VIOLENCE BY STATE ==========
summary(anova_dv)
## Df Sum Sq Mean Sq F value Pr(>F)
## State 69 8.295e+09 120220089 21.56 <2e-16 ***
## Residuals 666 3.713e+09 5575225
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
rape_p_value <- summary(anova_rape)[[1]][["Pr(>F)"]][1]
dv_p_value <- summary(anova_dv)[[1]][["Pr(>F)"]][1]
cat("\n========== INTERPRETATION ==========\n")
##
## ========== INTERPRETATION ==========
if(rape_p_value < 0.05) {
cat("✓ Rape cases differ significantly across states (p < 0.05)\n")
}
## ✓ Rape cases differ significantly across states (p < 0.05)
if(dv_p_value < 0.05) {
cat("✓ Domestic Violence cases differ significantly across states (p < 0.05)\n")
}
## ✓ Domestic Violence cases differ significantly across states (p < 0.05)
cat("========== CORRELATION MATRIX ==========\n")
## ========== CORRELATION MATRIX ==========
print(round(cor_matrix, 3))
## Rape K.A DD AoW AoM DV
## Rape 1.000 0.701 0.553 0.804 0.458 0.680
## K.A 0.701 1.000 0.692 0.671 0.356 0.686
## DD 0.553 0.692 1.000 0.456 0.423 0.487
## AoW 0.804 0.671 0.456 1.000 0.521 0.632
## AoM 0.458 0.356 0.423 0.521 1.000 0.419
## DV 0.680 0.686 0.487 0.632 0.419 1.000
cat("\n========== STRONGEST CORRELATIONS ==========\n")
##
## ========== STRONGEST CORRELATIONS ==========
cat("Rape - K.A:", round(cor(data$Rape, data$K.A), 3), "\n")
## Rape - K.A: 0.701
cat("AoW - DV:", round(cor(data$AoW, data$DV), 3), "\n")
## AoW - DV: 0.632
cat("DD - AoM:", round(cor(data$DD, data$AoM), 3), "\n")
## DD - AoM: 0.423
cat("Rape - AoW:", round(cor(data$Rape, data$AoW), 3), "\n")
## Rape - AoW: 0.804
data$Total_Crimes <- data$Rape + data$K.A + data$DD + data$AoW + data$AoM + data$DV + data$WT
regression_model <- lm(Total_Crimes ~ Year, data = data)
cat("========== REGRESSION: TOTAL CRIMES ~ YEAR ==========\n")
## ========== REGRESSION: TOTAL CRIMES ~ YEAR ==========
summary(regression_model)
##
## Call:
## lm(formula = Total_Crimes ~ Year, data = data)
##
## Residuals:
## Min 1Q Median 3Q Max
## -9950 -6045 -3149 2890 43987
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -675531.68 108168.70 -6.245 7.16e-10 ***
## Year 339.18 53.78 6.306 4.93e-10 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 8827 on 734 degrees of freedom
## Multiple R-squared: 0.0514, Adjusted R-squared: 0.05011
## F-statistic: 39.77 on 1 and 734 DF, p-value: 4.933e-10
future_years <- data.frame(Year = c(2022, 2023, 2024, 2025))
predictions <- predict(regression_model, newdata = future_years, interval = "confidence")
cat("\n========== PREDICTIONS FOR 2022-2025 ==========\n")
##
## ========== PREDICTIONS FOR 2022-2025 ==========
prediction_df <- data.frame(
Year = future_years$Year,
Predicted = round(predictions[, "fit"], 0),
Lower_CI = round(predictions[, "lwr"], 0),
Upper_CI = round(predictions[, "upr"], 0)
)
print(prediction_df)
## Year Predicted Lower_CI Upper_CI
## 1 2022 10294 8982 11606
## 2 2023 10633 9228 12038
## 3 2024 10972 9473 12472
## 4 2025 11312 9716 12907
r_squared <- summary(regression_model)$r.squared
cat("\nR-squared =", round(r_squared, 4),
"→", round(r_squared * 100, 1), "% of variation explained by Year\n")
##
## R-squared = 0.0514 → 5.1 % of variation explained by Year
skewness_values <- sapply(data[, c("Rape", "K.A", "DD", "AoW", "AoM", "DV")],
skewness, na.rm = TRUE)
cat("========== SKEWNESS ANALYSIS ==========\n")
## ========== SKEWNESS ANALYSIS ==========
skew_df <- data.frame(
Crime = names(skewness_values),
Skewness = round(skewness_values, 3),
Interpretation = ifelse(skewness_values > 1, "Highly Right-Skewed",
ifelse(skewness_values > 0.5, "Moderately Right-Skewed",
"Approximately Symmetric"))
)
print(skew_df)
## Crime Skewness Interpretation
## Rape Rape 2.275 Highly Right-Skewed
## K.A K.A 3.115 Highly Right-Skewed
## DD DD 3.359 Highly Right-Skewed
## AoW AoW 2.251 Highly Right-Skewed
## AoM AoM 4.858 Highly Right-Skewed
## DV DV 2.229 Highly Right-Skewed
ggplot(data, aes(x = Rape)) +
geom_histogram(aes(y = after_stat(density)), bins = 50, fill = "steelblue", alpha = 0.7) +
geom_density(color = "red", size = 1) +
theme_minimal() +
labs(
title = "Distribution of Rape Cases (Skewness Check)",
subtitle = paste("Skewness =", round(skewness_values["Rape"], 3), "- Highly Right-Skewed"),
x = "Rape Cases",
y = "Density"
)
