# Run this in R Console (the bottom panel of RStudio)
options(repos = c(CRAN = "https://cran.rstudio.com/"))
install.packages(c("dplyr", "tidyr", "tibble", "ggplot2", "readr", "e1071"))
## Installing packages into 'C:/Users/Welcome/AppData/Local/R/win-library/4.5'
## (as 'lib' is unspecified)
## package 'dplyr' successfully unpacked and MD5 sums checked
## package 'tidyr' successfully unpacked and MD5 sums checked
## package 'tibble' successfully unpacked and MD5 sums checked
## package 'ggplot2' successfully unpacked and MD5 sums checked
## package 'readr' successfully unpacked and MD5 sums checked
## package 'e1071' successfully unpacked and MD5 sums checked
## 
## The downloaded binary packages are in
##  C:\Users\Welcome\AppData\Local\Temp\RtmpcdhFTZ\downloaded_packages
# Install with all dependencies
install.packages("dplyr")
## Installing package into 'C:/Users/Welcome/AppData/Local/R/win-library/4.5'
## (as 'lib' is unspecified)
## package 'dplyr' successfully unpacked and MD5 sums checked
## 
## The downloaded binary packages are in
##  C:\Users\Welcome\AppData\Local\Temp\RtmpcdhFTZ\downloaded_packages
install.packages("tidyr")
## Installing package into 'C:/Users/Welcome/AppData/Local/R/win-library/4.5'
## (as 'lib' is unspecified)
## package 'tidyr' successfully unpacked and MD5 sums checked
## 
## The downloaded binary packages are in
##  C:\Users\Welcome\AppData\Local\Temp\RtmpcdhFTZ\downloaded_packages
install.packages("tibble")
## Installing package into 'C:/Users/Welcome/AppData/Local/R/win-library/4.5'
## (as 'lib' is unspecified)
## package 'tibble' successfully unpacked and MD5 sums checked
## 
## The downloaded binary packages are in
##  C:\Users\Welcome\AppData\Local\Temp\RtmpcdhFTZ\downloaded_packages
install.packages("ggplot2")
## Installing package into 'C:/Users/Welcome/AppData/Local/R/win-library/4.5'
## (as 'lib' is unspecified)
## package 'ggplot2' successfully unpacked and MD5 sums checked
## 
## The downloaded binary packages are in
##  C:\Users\Welcome\AppData\Local\Temp\RtmpcdhFTZ\downloaded_packages
install.packages("readr")
## Installing package into 'C:/Users/Welcome/AppData/Local/R/win-library/4.5'
## (as 'lib' is unspecified)
## package 'readr' successfully unpacked and MD5 sums checked
## 
## The downloaded binary packages are in
##  C:\Users\Welcome\AppData\Local\Temp\RtmpcdhFTZ\downloaded_packages
install.packages("e1071")
## Installing package into 'C:/Users/Welcome/AppData/Local/R/win-library/4.5'
## (as 'lib' is unspecified)
## package 'e1071' successfully unpacked and MD5 sums checked
## 
## The downloaded binary packages are in
##  C:\Users\Welcome\AppData\Local\Temp\RtmpcdhFTZ\downloaded_packages
library(dplyr)
## Warning: package 'dplyr' was built under R version 4.5.3
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
library(tidyr)
## Warning: package 'tidyr' was built under R version 4.5.3
library(tibble)
## Warning: package 'tibble' was built under R version 4.5.3
library(ggplot2)
## Warning: package 'ggplot2' was built under R version 4.5.3
library(readr)
## Warning: package 'readr' was built under R version 4.5.3
library(e1071)
## Warning: package 'e1071' was built under R version 4.5.3
## 
## Attaching package: 'e1071'
## The following object is masked from 'package:ggplot2':
## 
##     element
data <- read.csv("C:/Users/Welcome/Downloads/CrimesOnWomenData(5).csv")


data <- data[, -1]

# Rename columns
colnames(data) <- c("State", "Year", "Rape", "K.A", "DD", "AoW", "AoM", "DV", "WT")

# Display first few rows
head(data, 10)
##                State Year Rape  K.A  DD  AoW  AoM   DV WT
## 1     ANDHRA PRADESH 2001  871  765 420 3544 2271 5791  7
## 2  ARUNACHAL PRADESH 2001   33   55   0   78    3   11  0
## 3              ASSAM 2001  817 1070  59  850    4 1248  0
## 4              BIHAR 2001  888  518 859  562   21 1558 83
## 5       CHHATTISGARH 2001  959  171  70 1763  161  840  0
## 6                GOA 2001   12    6   2   17    7   11  0
## 7            GUJARAT 2001  286  857  67  756  111 3667  0
## 8            HARYANA 2001  398  297 285  478  401 1513  0
## 9   HIMACHAL PRADESH 2001  124  105  10  310   14  317  0
## 10   JAMMU & KASHMIR 2001  169  504  13  622  288   50  0
cat("\nDataset dimensions:", dim(data)[1], "rows x", dim(data)[2], "columns\n")
## 
## Dataset dimensions: 736 rows x 9 columns
str(data)
## 'data.frame':    736 obs. of  9 variables:
##  $ State: chr  "ANDHRA PRADESH" "ARUNACHAL PRADESH" "ASSAM" "BIHAR" ...
##  $ Year : int  2001 2001 2001 2001 2001 2001 2001 2001 2001 2001 ...
##  $ Rape : int  871 33 817 888 959 12 286 398 124 169 ...
##  $ K.A  : int  765 55 1070 518 171 6 857 297 105 504 ...
##  $ DD   : int  420 0 59 859 70 2 67 285 10 13 ...
##  $ AoW  : int  3544 78 850 562 1763 17 756 478 310 622 ...
##  $ AoM  : int  2271 3 4 21 161 7 111 401 14 288 ...
##  $ DV   : int  5791 11 1248 1558 840 11 3667 1513 317 50 ...
##  $ WT   : int  7 0 0 83 0 0 0 0 0 0 ...
summary(data)
##     State                Year           Rape             K.A          
##  Length:736         Min.   :2001   Min.   :   0.0   Min.   :    0.00  
##  Class :character   1st Qu.:2006   1st Qu.:  35.0   1st Qu.:   24.75  
##  Mode  :character   Median :2011   Median : 348.5   Median :  290.00  
##                     Mean   :2011   Mean   : 727.9   Mean   : 1134.54  
##                     3rd Qu.:2016   3rd Qu.:1069.0   3rd Qu.: 1216.00  
##                     Max.   :2021   Max.   :6337.0   Max.   :15381.00  
##        DD              AoW               AoM               DV         
##  Min.   :   0.0   Min.   :    0.0   Min.   :   0.0   Min.   :    0.0  
##  1st Qu.:   1.0   1st Qu.:   34.0   1st Qu.:   3.0   1st Qu.:   13.0  
##  Median :  29.0   Median :  387.5   Median :  31.0   Median :  678.5  
##  Mean   : 215.7   Mean   : 1579.1   Mean   : 332.7   Mean   : 2595.1  
##  3rd Qu.: 259.0   3rd Qu.: 2122.2   3rd Qu.: 277.5   3rd Qu.: 3545.0  
##  Max.   :2524.0   Max.   :14853.0   Max.   :9422.0   Max.   :23278.0  
##        WT        
##  Min.   :  0.00  
##  1st Qu.:  0.00  
##  Median :  0.00  
##  Mean   : 28.74  
##  3rd Qu.: 15.00  
##  Max.   :549.00
cat("\nMissing values per column:\n")
## 
## Missing values per column:
colSums(is.na(data))
## State  Year  Rape   K.A    DD   AoW   AoM    DV    WT 
##     0     0     0     0     0     0     0     0     0
cat("Number of duplicate rows:", sum(duplicated(data)), "\n")
## Number of duplicate rows: 0
sapply(data[, sapply(data, is.numeric)], function(x) sum(x < 0, na.rm = TRUE))
## Year Rape  K.A   DD  AoW  AoM   DV   WT 
##    0    0    0    0    0    0    0    0
crime_totals <- colSums(data[, c("Rape", "K.A", "DD", "AoW", "AoM", "DV", "WT")], na.rm = TRUE)

cat("========== TOTAL CRIMES (2001-2021) ==========\n")
## ========== TOTAL CRIMES (2001-2021) ==========
print(crime_totals)
##    Rape     K.A      DD     AoW     AoM      DV      WT 
##  535702  835023  158750 1162229  244884 1909978   21156
grand_total <- sum(crime_totals)
cat("\nGRAND TOTAL:", format(grand_total, big.mark = ","), "\n")
## 
## GRAND TOTAL: 4,867,722
means <- colMeans(data[, c("Rape", "K.A", "DD", "AoW", "AoM", "DV", "WT")], na.rm = TRUE)

cat("========== MEAN VALUES ==========\n")
## ========== MEAN VALUES ==========
print(round(means, 2))
##    Rape     K.A      DD     AoW     AoM      DV      WT 
##  727.86 1134.54  215.69 1579.12  332.72 2595.08   28.74
cat("\nHighest mean:", names(which.max(means)), "(", round(max(means), 2), ")\n")
## 
## Highest mean: DV ( 2595.08 )
cat("Lowest mean:", names(which.min(means)), "(", round(min(means), 2), ")\n")
## Lowest mean: WT ( 28.74 )
medians <- apply(data[, c("Rape", "K.A", "DD", "AoW", "AoM", "DV", "WT")], 2, median, na.rm = TRUE)

cat("========== MEDIAN VALUES ==========\n")
## ========== MEDIAN VALUES ==========
print(round(medians, 2))
##  Rape   K.A    DD   AoW   AoM    DV    WT 
## 348.5 290.0  29.0 387.5  31.0 678.5   0.0
comparison <- data.frame(
  Crime = names(means),
  Mean = round(means, 2),
  Median = round(medians, 2),
  Difference = round(means - medians, 2)
)
print(comparison)
##      Crime    Mean Median Difference
## Rape  Rape  727.86  348.5     379.36
## K.A    K.A 1134.54  290.0     844.54
## DD      DD  215.69   29.0     186.69
## AoW    AoW 1579.12  387.5    1191.62
## AoM    AoM  332.72   31.0     301.72
## DV      DV 2595.08  678.5    1916.58
## WT      WT   28.74    0.0      28.74
sds <- apply(data[, c("Rape", "K.A", "DD", "AoW", "AoM", "DV", "WT")], 2, sd, na.rm = TRUE)

cat("========== STANDARD DEVIATION ==========\n")
## ========== STANDARD DEVIATION ==========
print(round(sds, 2))
##    Rape     K.A      DD     AoW     AoM      DV      WT 
##  977.02 1993.54  424.93 2463.96  806.02 4042.00   80.00
cat("\nHighest variability:", names(which.max(sds)), "(SD =", round(max(sds), 2), ")\n")
## 
## Highest variability: DV (SD = 4042 )
cat("Lowest variability:", names(which.min(sds)), "(SD =", round(min(sds), 2), ")\n")
## Lowest variability: WT (SD = 80 )
range_data <- apply(data[, c("Rape", "K.A", "DD", "AoW", "AoM", "DV", "WT")], 2, range, na.rm = TRUE)

cat("========== MINIMUM VALUES ==========\n")
## ========== MINIMUM VALUES ==========
print(round(range_data[1, ], 2))
## Rape  K.A   DD  AoW  AoM   DV   WT 
##    0    0    0    0    0    0    0
cat("\n========== MAXIMUM VALUES ==========\n")
## 
## ========== MAXIMUM VALUES ==========
print(round(range_data[2, ], 2))
##  Rape   K.A    DD   AoW   AoM    DV    WT 
##  6337 15381  2524 14853  9422 23278   549
for(crime in names(range_data[2, ])) {
  max_val <- range_data[2, crime]
  max_row <- data[which(data[[crime]] == max_val), c("State", "Year", crime)]
  if(nrow(max_row) > 0) {
    cat(crime, ":", max_val, "-", max_row$State[1], "(", max_row$Year[1], ")\n")
  }
}
## Rape : 6337 - Punjab ( 2021 )
## K.A : 15381 - Uttar Pradesh ( 2018 )
## DD : 2524 - Uttar Pradesh ( 2017 )
## AoW : 14853 - Nagaland ( 2021 )
## AoM : 9422 - Uttar Pradesh ( 2016 )
## DV : 23278 - West Bengal ( 2014 )
## WT : 549 - Tamil Nadu ( 2013 )
cat("========== PERCENTILE ANALYSIS ==========\n\n")
## ========== PERCENTILE ANALYSIS ==========
for(crime in c("Rape", "K.A", "DV", "AoW")) {
  percentiles <- quantile(data[[crime]], probs = c(0.25, 0.50, 0.75, 0.90, 0.95, 0.99), na.rm = TRUE)
  cat(crime, ":\n")
  cat("  25th percentile:", round(percentiles[1], 2), "\n")
  cat("  50th percentile (Median):", round(percentiles[2], 2), "\n")
  cat("  75th percentile:", round(percentiles[3], 2), "\n")
  cat("  90th percentile:", round(percentiles[4], 2), "\n")
  cat("  95th percentile:", round(percentiles[5], 2), "\n")
  cat("  99th percentile:", round(percentiles[6], 2), "\n\n")
}
## Rape :
##   25th percentile: 35 
##   50th percentile (Median): 348.5 
##   75th percentile: 1069 
##   90th percentile: 1835.5 
##   95th percentile: 2857 
##   99th percentile: 4667.25 
## 
## K.A :
##   25th percentile: 24.75 
##   50th percentile (Median): 290 
##   75th percentile: 1216 
##   90th percentile: 3779 
##   95th percentile: 5275.25 
##   99th percentile: 9517.2 
## 
## DV :
##   25th percentile: 13 
##   50th percentile (Median): 678.5 
##   75th percentile: 3545 
##   90th percentile: 7671 
##   95th percentile: 11353.25 
##   99th percentile: 18412.05 
## 
## AoW :
##   25th percentile: 34 
##   50th percentile (Median): 387.5 
##   75th percentile: 2122.25 
##   90th percentile: 4782 
##   95th percentile: 6791 
##   99th percentile: 11325.55
detect_outliers <- function(x, crime_name) {
  Q1 <- quantile(x, 0.25, na.rm = TRUE)
  Q3 <- quantile(x, 0.75, na.rm = TRUE)
  IQR_val <- Q3 - Q1
  lower_bound <- Q1 - 1.5 * IQR_val
  upper_bound <- Q3 + 1.5 * IQR_val
  outliers_count <- sum(x < lower_bound | x > upper_bound, na.rm = TRUE)
  return(outliers_count)
}

cat("========== OUTLIER DETECTION ==========\n\n")
## ========== OUTLIER DETECTION ==========
for(crime in c("Rape", "K.A", "DV", "AoW")) {
  outlier_count <- detect_outliers(data[[crime]], crime)
  cat(crime, ":", outlier_count, "outliers (", 
      round(outlier_count/nrow(data)*100, 2), "%)\n")
}
## Rape : 41 outliers ( 5.57 %)
## K.A : 84 outliers ( 11.41 %)
## DV : 53 outliers ( 7.2 %)
## AoW : 56 outliers ( 7.61 %)
par(mfrow = c(2, 2))
boxplot(data$Rape, main = "Rape Cases", ylab = "Number of Cases", col = "lightblue")
boxplot(data$K.A, main = "Kidnapping & Abduction", ylab = "Number of Cases", col = "lightgreen")
boxplot(data$DV, main = "Domestic Violence", ylab = "Number of Cases", col = "lightcoral")
boxplot(data$AoW, main = "Assault on Women", ylab = "Number of Cases", col = "lightyellow")

state_total <- data %>%
  group_by(State) %>%
  summarise(
    Rape = sum(Rape, na.rm = TRUE),
    Kidnapping = sum(K.A, na.rm = TRUE),
    DD = sum(DD, na.rm = TRUE),
    AoW = sum(AoW, na.rm = TRUE),
    AoM = sum(AoM, na.rm = TRUE),
    DV = sum(DV, na.rm = TRUE),
    WT = sum(WT, na.rm = TRUE),
    Total_Crimes = sum(Rape + K.A + DD + AoW + AoM + DV + WT, na.rm = TRUE)
  ) %>%
  arrange(desc(Total_Crimes))

cat("========== TOP 10 STATES ==========\n")
## ========== TOP 10 STATES ==========
print(head(state_total[, c("State", "Total_Crimes")], 10))
## # A tibble: 10 × 2
##    State          Total_Crimes
##    <chr>                 <int>
##  1 Uttar Pradesh       2843264
##  2 West Bengal         2545397
##  3 Madhya Pradesh      2386164
##  4 Rajasthan           2312652
##  5 Maharashtra         2172435
##  6 Andhra Pradesh      2102602
##  7 Assam               1906570
##  8 ANDHRA PRADESH      1803521
##  9 UTTAR PRADESH       1498605
## 10 MADHYA PRADESH      1440378
cat("\n========== BOTTOM 10 STATES ==========\n")
## 
## ========== BOTTOM 10 STATES ==========
print(tail(state_total[, c("State", "Total_Crimes")], 10))
## # A tibble: 10 × 2
##    State         Total_Crimes
##    <chr>                <int>
##  1 MANIPUR               9088
##  2 GOA                   7911
##  3 Puducherry            6551
##  4 A & N ISLANDS         4228
##  5 SIKKIM                3872
##  6 NAGALAND              2718
##  7 D & N HAVELI          1427
##  8 DAMAN & DIU            653
##  9 Lakshadweep            594
## 10 LAKSHADWEEP            171
highest_state <- state_total[1, ]
cat("\nHIGHEST CRIME STATE:", highest_state$State, 
    "-", format(highest_state$Total_Crimes, big.mark = ","), "cases\n")
## 
## HIGHEST CRIME STATE: Uttar Pradesh - 2,843,264 cases
lowest_state <- state_total[nrow(state_total), ]
cat("LOWEST CRIME STATE:", lowest_state$State, 
    "-", format(lowest_state$Total_Crimes, big.mark = ","), "cases\n")
## LOWEST CRIME STATE: LAKSHADWEEP - 171 cases
top5_total <- sum(head(state_total$Total_Crimes, 5))
cat("\nTop 5 states account for:", round(top5_total / sum(state_total$Total_Crimes) * 100, 1), "%\n")
## 
## Top 5 states account for: 28 %
crime_df <- data.frame(
  Crime_Type = names(crime_totals),
  Total_Cases = crime_totals,
  Percentage = round(crime_totals / sum(crime_totals) * 100, 2)
) %>% arrange(desc(Total_Cases))

cat("========== CRIME DISTRIBUTION ==========\n")
## ========== CRIME DISTRIBUTION ==========
print(crime_df)
##      Crime_Type Total_Cases Percentage
## DV           DV     1909978      39.24
## AoW         AoW     1162229      23.88
## K.A         K.A      835023      17.15
## Rape       Rape      535702      11.01
## AoM         AoM      244884       5.03
## DD           DD      158750       3.26
## WT           WT       21156       0.43
cat("\nMOST COMMON:", crime_df$Crime_Type[1], 
    "(", format(crime_df$Total_Cases[1], big.mark = ","), "cases,", 
    crime_df$Percentage[1], "%)\n")
## 
## MOST COMMON: DV ( 1,909,978 cases, 39.24 %)
cat("LEAST COMMON:", crime_df$Crime_Type[nrow(crime_df)], 
    "(", format(crime_df$Total_Cases[nrow(crime_df)], big.mark = ","), "cases,", 
    crime_df$Percentage[nrow(crime_df)], "%)\n")
## LEAST COMMON: WT ( 21,156 cases, 0.43 %)
yearly <- data %>%
  group_by(Year) %>%
  summarise(
    Rape = sum(Rape, na.rm = TRUE),
    Kidnapping = sum(K.A, na.rm = TRUE),
    DD = sum(DD, na.rm = TRUE),
    AoW = sum(AoW, na.rm = TRUE),
    AoM = sum(AoM, na.rm = TRUE),
    DV = sum(DV, na.rm = TRUE),
    WT = sum(WT, na.rm = TRUE),
    Total = sum(Rape + K.A + DD + AoW + AoM + DV + WT, na.rm = TRUE)
  )

highest_year <- yearly %>% arrange(desc(Total)) %>% slice(1)
cat("========== YEAR WITH HIGHEST CRIMES ==========\n")
## ========== YEAR WITH HIGHEST CRIMES ==========
print(highest_year)
## # A tibble: 1 × 9
##    Year  Rape Kidnapping    DD   AoW   AoM     DV    WT   Total
##   <int> <int>      <int> <int> <int> <int>  <int> <int>   <int>
## 1  2014 36735      57311 10050 82235 21938 122877  2070 9989891
lowest_year <- yearly %>% arrange(Total) %>% slice(1)
cat("\n========== YEAR WITH LOWEST CRIMES ==========\n")
## 
## ========== YEAR WITH LOWEST CRIMES ==========
print(lowest_year)
## # A tibble: 1 × 9
##    Year  Rape Kidnapping    DD   AoW   AoM    DV    WT   Total
##   <int> <int>      <int> <int> <int> <int> <int> <int>   <int>
## 1  2001 15694      13681  6738 33622  9656 49032   114 3918785
first_year_total <- yearly$Total[yearly$Year == min(yearly$Year)]
last_year_total <- yearly$Total[yearly$Year == max(yearly$Year)]
change_percent <- ((last_year_total - first_year_total) / first_year_total) * 100

cat("\n========== OVERALL CHANGE ==========\n")
## 
## ========== OVERALL CHANGE ==========
cat("2001 Total:", format(first_year_total, big.mark = ","), "\n")
## 2001 Total: 3,918,785
cat("2021 Total:", format(last_year_total, big.mark = ","), "\n")
## 2021 Total: 9,893,397
cat("Change:", round(change_percent, 1), "% increase\n")
## Change: 152.5 % increase
top15_states <- head(state_total, 15)

ggplot(top15_states, aes(x = reorder(State, Total_Crimes), y = Total_Crimes)) +
  geom_bar(stat = "identity", fill = "steelblue") +
  geom_text(aes(label = format(Total_Crimes, big.mark = ",")), 
            hjust = -0.1, size = 3) +
  coord_flip() +
  theme_minimal() +
  labs(
    title = "Top 15 States by Total Crimes Against Women (2001-2021)",
    x = "State",
    y = "Total Reported Cases"
  ) +
  scale_y_continuous(labels = scales::comma)

major_crimes <- crime_df %>% filter(Crime_Type != "WT")

ggplot(major_crimes, aes(x = "", y = Percentage, fill = Crime_Type)) +
  geom_bar(stat = "identity", width = 1) +
  coord_polar(theta = "y") +
  theme_void() +
  labs(title = "Distribution of Major Crimes Against Women") +
  geom_text(aes(label = paste0(round(Percentage, 1), "%")), 
            position = position_stack(vjust = 0.5), size = 4) +
  scale_fill_brewer(palette = "Set2")

ggplot(yearly, aes(x = Year, y = Total)) +
  geom_line(color = "darkred", size = 1.2) +
  geom_point(color = "steelblue", size = 2) +
  geom_smooth(method = "lm", se = TRUE, color = "gray30", linetype = "dashed") +
  theme_minimal() +
  labs(
    title = "Yearly Trend of Crimes Against Women (2001-2021)",
    x = "Year",
    y = "Total Reported Cases"
  ) +
  scale_y_continuous(labels = scales::comma) +
  scale_x_continuous(breaks = seq(2001, 2021, 2))
## Warning: Using `size` aesthetic for lines was deprecated in ggplot2 3.4.0.
## ℹ Please use `linewidth` instead.
## This warning is displayed once per session.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.
## `geom_smooth()` using formula = 'y ~ x'

yearly_long <- yearly %>%
  select(Year, Rape, Kidnapping, DD, AoW, AoM, DV) %>%
  pivot_longer(cols = -Year, names_to = "Crime_Type", values_to = "Count")

ggplot(yearly_long, aes(x = Year, y = Count, color = Crime_Type)) +
  geom_line(size = 1) +
  geom_point(size = 1.5) +
  facet_wrap(~Crime_Type, scales = "free_y", ncol = 2) +
  theme_minimal() +
  labs(
    title = "Crime Trends by Category Over Time",
    x = "Year",
    y = "Reported Cases"
  ) +
  theme(legend.position = "bottom")

ggplot(data, aes(x = Rape)) +
  geom_histogram(bins = 50, fill = "steelblue", color = "white", alpha = 0.7) +
  theme_minimal() +
  labs(
    title = "Distribution of Rape Cases Across State-Years",
    x = "Rape Cases",
    y = "Frequency"
  ) +
  geom_vline(xintercept = mean(data$Rape), color = "red", linetype = "dashed", size = 1) +
  geom_vline(xintercept = median(data$Rape), color = "blue", linetype = "dashed", size = 1) +
  annotate("text", x = 1500, y = 100, label = "Mean", color = "red") +
  annotate("text", x = 1500, y = 90, label = "Median", color = "blue")

boxplot_data <- data %>%
  select(Rape, K.A, DD, AoW, DV) %>%
  pivot_longer(everything(), names_to = "Crime", values_to = "Count")

ggplot(boxplot_data, aes(x = Crime, y = Count, fill = Crime)) +
  geom_boxplot() +
  scale_y_log10() +
  theme_minimal() +
  labs(
    title = "Distribution of Crime Cases by Type (Log Scale)",
    x = "Crime Type",
    y = "Number of Cases (Log10 Scale)"
  ) +
  theme(legend.position = "none")
## Warning in scale_y_log10(): log-10 transformation introduced infinite values.
## Warning: Removed 329 rows containing non-finite outside the scale range
## (`stat_boxplot()`).

cor_matrix <- cor(data[, c("Rape", "K.A", "DD", "AoW", "AoM", "DV")], use = "complete.obs")

cor_long <- as.data.frame(as.table(cor_matrix))
names(cor_long) <- c("Crime1", "Crime2", "Correlation")

ggplot(cor_long, aes(x = Crime1, y = Crime2, fill = Correlation)) +
  geom_tile() +
  scale_fill_gradient2(low = "blue", high = "red", mid = "white", 
                       midpoint = 0, limit = c(-1, 1)) +
  theme_minimal() +
  labs(title = "Correlation Heatmap of Crime Types") +
  theme(axis.text.x = element_text(angle = 45, hjust = 1)) +
  geom_text(aes(label = round(Correlation, 2)), size = 3)

anova_rape <- aov(Rape ~ State, data = data)
cat("========== ANOVA: RAPE BY STATE ==========\n")
## ========== ANOVA: RAPE BY STATE ==========
summary(anova_rape)
##              Df    Sum Sq Mean Sq F value Pr(>F)    
## State        69 516185347 7480947   26.87 <2e-16 ***
## Residuals   666 185429293  278422                   
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
anova_dv <- aov(DV ~ State, data = data)
cat("\n========== ANOVA: DOMESTIC VIOLENCE BY STATE ==========\n")
## 
## ========== ANOVA: DOMESTIC VIOLENCE BY STATE ==========
summary(anova_dv)
##              Df    Sum Sq   Mean Sq F value Pr(>F)    
## State        69 8.295e+09 120220089   21.56 <2e-16 ***
## Residuals   666 3.713e+09   5575225                   
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
rape_p_value <- summary(anova_rape)[[1]][["Pr(>F)"]][1]
dv_p_value <- summary(anova_dv)[[1]][["Pr(>F)"]][1]

cat("\n========== INTERPRETATION ==========\n")
## 
## ========== INTERPRETATION ==========
if(rape_p_value < 0.05) {
  cat("✓ Rape cases differ significantly across states (p < 0.05)\n")
}
## ✓ Rape cases differ significantly across states (p < 0.05)
if(dv_p_value < 0.05) {
  cat("✓ Domestic Violence cases differ significantly across states (p < 0.05)\n")
}
## ✓ Domestic Violence cases differ significantly across states (p < 0.05)
cat("========== CORRELATION MATRIX ==========\n")
## ========== CORRELATION MATRIX ==========
print(round(cor_matrix, 3))
##       Rape   K.A    DD   AoW   AoM    DV
## Rape 1.000 0.701 0.553 0.804 0.458 0.680
## K.A  0.701 1.000 0.692 0.671 0.356 0.686
## DD   0.553 0.692 1.000 0.456 0.423 0.487
## AoW  0.804 0.671 0.456 1.000 0.521 0.632
## AoM  0.458 0.356 0.423 0.521 1.000 0.419
## DV   0.680 0.686 0.487 0.632 0.419 1.000
cat("\n========== STRONGEST CORRELATIONS ==========\n")
## 
## ========== STRONGEST CORRELATIONS ==========
cat("Rape - K.A:", round(cor(data$Rape, data$K.A), 3), "\n")
## Rape - K.A: 0.701
cat("AoW - DV:", round(cor(data$AoW, data$DV), 3), "\n")
## AoW - DV: 0.632
cat("DD - AoM:", round(cor(data$DD, data$AoM), 3), "\n")
## DD - AoM: 0.423
cat("Rape - AoW:", round(cor(data$Rape, data$AoW), 3), "\n")
## Rape - AoW: 0.804
data$Total_Crimes <- data$Rape + data$K.A + data$DD + data$AoW + data$AoM + data$DV + data$WT

regression_model <- lm(Total_Crimes ~ Year, data = data)
cat("========== REGRESSION: TOTAL CRIMES ~ YEAR ==========\n")
## ========== REGRESSION: TOTAL CRIMES ~ YEAR ==========
summary(regression_model)
## 
## Call:
## lm(formula = Total_Crimes ~ Year, data = data)
## 
## Residuals:
##    Min     1Q Median     3Q    Max 
##  -9950  -6045  -3149   2890  43987 
## 
## Coefficients:
##               Estimate Std. Error t value Pr(>|t|)    
## (Intercept) -675531.68  108168.70  -6.245 7.16e-10 ***
## Year            339.18      53.78   6.306 4.93e-10 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 8827 on 734 degrees of freedom
## Multiple R-squared:  0.0514, Adjusted R-squared:  0.05011 
## F-statistic: 39.77 on 1 and 734 DF,  p-value: 4.933e-10
future_years <- data.frame(Year = c(2022, 2023, 2024, 2025))
predictions <- predict(regression_model, newdata = future_years, interval = "confidence")

cat("\n========== PREDICTIONS FOR 2022-2025 ==========\n")
## 
## ========== PREDICTIONS FOR 2022-2025 ==========
prediction_df <- data.frame(
  Year = future_years$Year,
  Predicted = round(predictions[, "fit"], 0),
  Lower_CI = round(predictions[, "lwr"], 0),
  Upper_CI = round(predictions[, "upr"], 0)
)
print(prediction_df)
##   Year Predicted Lower_CI Upper_CI
## 1 2022     10294     8982    11606
## 2 2023     10633     9228    12038
## 3 2024     10972     9473    12472
## 4 2025     11312     9716    12907
r_squared <- summary(regression_model)$r.squared
cat("\nR-squared =", round(r_squared, 4), 
    "→", round(r_squared * 100, 1), "% of variation explained by Year\n")
## 
## R-squared = 0.0514 → 5.1 % of variation explained by Year
skewness_values <- sapply(data[, c("Rape", "K.A", "DD", "AoW", "AoM", "DV")], 
                          skewness, na.rm = TRUE)

cat("========== SKEWNESS ANALYSIS ==========\n")
## ========== SKEWNESS ANALYSIS ==========
skew_df <- data.frame(
  Crime = names(skewness_values),
  Skewness = round(skewness_values, 3),
  Interpretation = ifelse(skewness_values > 1, "Highly Right-Skewed",
                         ifelse(skewness_values > 0.5, "Moderately Right-Skewed", 
                                "Approximately Symmetric"))
)
print(skew_df)
##      Crime Skewness      Interpretation
## Rape  Rape    2.275 Highly Right-Skewed
## K.A    K.A    3.115 Highly Right-Skewed
## DD      DD    3.359 Highly Right-Skewed
## AoW    AoW    2.251 Highly Right-Skewed
## AoM    AoM    4.858 Highly Right-Skewed
## DV      DV    2.229 Highly Right-Skewed
ggplot(data, aes(x = Rape)) +
  geom_histogram(aes(y = after_stat(density)), bins = 50, fill = "steelblue", alpha = 0.7) +
  geom_density(color = "red", size = 1) +
  theme_minimal() +
  labs(
    title = "Distribution of Rape Cases (Skewness Check)",
    subtitle = paste("Skewness =", round(skewness_values["Rape"], 3), "- Highly Right-Skewed"),
    x = "Rape Cases",
    y = "Density"
  )