library(dplyr)
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
library(readr)
library(GGally)
## Warning: package 'GGally' was built under R version 4.4.3
## Loading required package: ggplot2
## Registered S3 method overwritten by 'GGally':
##   method from   
##   +.gg   ggplot2
library(ggplot2)
library(lubridate)
## 
## Attaching package: 'lubridate'
## The following objects are masked from 'package:base':
## 
##     date, intersect, setdiff, union
# Read the crime records CSV into a tibble (column types are guessed by readr).
crime_dataset_india <- read_csv(
  file = "D:/CAP482_2025/PROJECT DS/crime_ds.csv"
)
## Rows: 12748 Columns: 15
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr  (9): Date of Occurrence, Time of Occurrence, City, Crime Description, V...
## dbl  (5): Report Number, Crime Code, Victim Age, Police Deployed, year
## dttm (1): Date Reported
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
# View() opens a GUI data viewer. That is only meaningful in an interactive
# session, so the calls are skipped when the script runs non-interactively
# (e.g. via Rscript), where a viewer is useless and may not be available.
if (interactive()) {
  View(crime_dataset_india)
}

# Working alias for the loaded data; the original object is left in place.
crime <- crime_dataset_india

if (interactive()) {
  View(crime)
}
# Cities to keep for the analysis.
selected_cities <- c(
  "Delhi", "Mumbai", "Bangalore", "Hyderabad", "Chennai",
  "Kolkata", "Ahmedabad", "Pune", "Lucknow", "Jaipur",
  "Patna", "Kanpur", "Surat", "Indore"
)

# Keep only the rows whose City is one of the selected cities.
# `%in%` never returns NA, so the logical mask can index rows directly.
in_selected_city <- crime$City %in% selected_cities
crime_subset <- crime[in_selected_city, , drop = FALSE]

# Character-to-datetime (dttm) conversion, plus a 40% sample per report year

# read_csv() may already have parsed `Date Reported` as a datetime (dttm). In
# that case it must not be sent through dmy_hm() again: dmy_hm() expects
# day-month-year text, fails on an already-parsed datetime ("All formats
# failed to parse") and replaces the whole column with NA. The column is
# therefore parsed only when it is still character.
crime_a <- crime_subset %>%
  mutate(
    `Date Reported` = if (is.character(`Date Reported`)) {
      dmy_hm(`Date Reported`)
    } else {
      `Date Reported`
    },
    # Take the year from the datetime itself rather than from fixed character
    # positions 7-10, which only line up with the year for "dd-mm-yyyy" text.
    # The resulting `year` column is numeric.
    year = lubridate::year(`Date Reported`)
  ) %>%
  group_by(year) %>%
  # Random 40% of the rows within each year. No seed is set in this block, so
  # the rows drawn can differ between runs.
  slice_sample(prop = 0.40) %>%
  ungroup() %>%
  mutate(
    # NOTE(review): parsed month-first, unlike the day-first columns below --
    # confirm against the raw data.
    `Date of Occurrence` = mdy_hm(`Date of Occurrence`),
    `Date Case Closed` = dmy_hm(`Date Case Closed`),
    `Time of Occurrence` = dmy_hm(`Time of Occurrence`)
  )