library(readr)
library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr     1.1.4     ✔ purrr     1.0.2
## ✔ forcats   1.0.0     ✔ stringr   1.5.1
## ✔ ggplot2   3.5.1     ✔ tibble    3.2.1
## ✔ lubridate 1.9.4     ✔ tidyr     1.3.1
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(dplyr)
library(lubridate)
library(writexl)


file_path <- "../data/Police_Department_Incident_Reports__Historical_2003_to_May_2018_20250331.csv"
crimes_03_to_18 <- read_csv(file_path)
## Rows: 2129525 Columns: 35
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr  (10): IncidntNum, Incident Code, Category, Descript, DayOfWeek, Date, P...
## dbl  (24): PdId, X, Y, SF Find Neighborhoods 2 2, Current Police Districts 2...
## time  (1): Time
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
unique(crimes_03_to_18$Category) #listing unique values in Category column
##  [1] "ROBBERY"                     "VEHICLE THEFT"              
##  [3] "ARSON"                       "ASSAULT"                    
##  [5] "TRESPASS"                    "BURGLARY"                   
##  [7] "LARCENY/THEFT"               "WARRANTS"                   
##  [9] "OTHER OFFENSES"              "DRUG/NARCOTIC"              
## [11] "SUSPICIOUS OCC"              "LIQUOR LAWS"                
## [13] "VANDALISM"                   "WEAPON LAWS"                
## [15] "NON-CRIMINAL"                "MISSING PERSON"             
## [17] "FRAUD"                       "SEX OFFENSES, FORCIBLE"     
## [19] "SECONDARY CODES"             "DISORDERLY CONDUCT"         
## [21] "RECOVERED VEHICLE"           "KIDNAPPING"                 
## [23] "FORGERY/COUNTERFEITING"      "PROSTITUTION"               
## [25] "DRUNKENNESS"                 "BAD CHECKS"                 
## [27] "DRIVING UNDER THE INFLUENCE" "LOITERING"                  
## [29] "STOLEN PROPERTY"             "SUICIDE"                    
## [31] "BRIBERY"                     "EXTORTION"                  
## [33] "EMBEZZLEMENT"                "GAMBLING"                   
## [35] "PORNOGRAPHY/OBSCENE MAT"     "SEX OFFENSES, NON FORCIBLE" 
## [37] "TREA"
#filter for vehicle thefts
vehicle_theft_03_to_17 <- crimes_03_to_18 %>% 
  mutate(Date = mdy(Date)) %>%  # Convert "MM/DD/YYYY" to Date format
  filter(Category == "VEHICLE THEFT" & Date < as.Date("2018-01-01")) #filter for reported crimes before 2018

#group by year and count reported vehicle thefts
vehicle_theft_by_year <- vehicle_theft_03_to_17 %>%
  mutate(Year = as.numeric(substr(Date, 1, 4))) %>%  # Extract first 4 characters as Year
  group_by(Year) %>%
  summarise(count = n()) %>%
  arrange(Year)  # Sort by Year

#checking in Excel why it doesn't match Chronicle story
vehicle_theft_03_to_17_export <- crimes_03_to_18 %>% 
  filter(Category == "VEHICLE THEFT")

write_xlsx(vehicle_theft_03_to_17_export, "../data/vehicle_theft_03_to_17_export.xlsx")