# =========================
# Crime data cleaning script
# =========================

# --- attach packages, then read & clean names ---
# Packages are attached first: the pipe below (`%>%`) comes from dplyr, so it
# must be available before the first pipeline runs.
suppressPackageStartupMessages({
  library(dplyr)
  library(lubridate)
})

# read_csv() and clean_names() are namespace-qualified because readr and
# janitor are not attached by this script.
dat <- readr::read_csv(
  "/home/rstudio/data/crime_data_cleaned.csv",
  guess_max = 200000,
  show_col_types = FALSE
) %>%
  janitor::clean_names()

# Derive every date/time field from offense_start_date only; no other column
# of `dat` is modified.
dat <- dat %>%
  # Step 1: parse the raw string. RFC-822 style layouts are tried first,
  # followed by two common fallbacks; parse warnings are silenced.
  mutate(
    datetime_offense = suppressWarnings(
      parse_date_time(
        offense_start_date,
        orders = c(
          "a, d b Y H:M:S z",
          "a, d b Y H:M:S",
          "Y-m-d H:M:S",
          "m/d/Y H:M:S"
        ),
        tz = "UTC"
      )
    )
  ) %>%
  # Step 2: split the timestamp into its parts. format(), hour() and minute()
  # already yield NA for a missing timestamp, so no explicit NA branch is
  # needed.
  mutate(
    date_offense      = as.Date(datetime_offense),
    time_offense_hhmm = format(datetime_offense, "%H:%M"),
    Hour              = hour(datetime_offense),
    Minutes           = minute(datetime_offense)
  )

# Sanity check: print the first 10 rows of the raw string next to the derived
# fields, which should be filled in wherever offense_start_date parsed.
head(
  select(
    dat,
    offense_start_date, date_offense, time_offense_hhmm,
    datetime_offense, Hour, Minutes
  ),
  10
)
## # A tibble: 10 × 6
##    offense_start_date   date_offense time_offense_hhmm datetime_offense     Hour
##    <chr>                <date>       <chr>             <dttm>              <int>
##  1 Sun, 15 Jan 2023 01… 2023-01-15   01:20             2023-01-15 01:20:00     1
##  2 Sun, 15 Jan 2023 08… 2023-01-15   08:15             2023-01-15 08:15:00     8
##  3 Tue, 17 Jan 2023 23… 2023-01-17   23:31             2023-01-17 23:31:00    23
##  4 Sun, 15 Jan 2023 14… 2023-01-15   14:34             2023-01-15 14:34:00    14
##  5 Sat, 07 Jan 2023 01… 2023-01-07   01:36             2023-01-07 01:36:00     1
##  6 Fri, 06 Jan 2023 18… 2023-01-06   18:45             2023-01-06 18:45:00    18
##  7 Mon, 12 Sep 2022 18… 2022-09-12   18:20             2022-09-12 18:20:00    18
##  8 Wed, 04 Jan 2023 18… 2023-01-04   18:00             2023-01-04 18:00:00    18
##  9 Sat, 07 Jan 2023 02… 2023-01-07   02:52             2023-01-07 02:52:00     2
## 10 Sat, 07 Jan 2023 04… 2023-01-07   04:00             2023-01-07 04:00:00     4
## # ℹ 1 more variable: Minutes <int>
# ===============================
# Exploratory Data Analysis (EDA)
# ===============================

library(dplyr)
library(ggplot2)
library(lubridate)
library(tmap)
library(sf)

# 1) Incidents per calendar month, keyed on the offense date (falling back to
#    the date part of datetime_offense). Rows without a usable date are dropped.
dat_month <- dat %>%
  transmute(
    offense_day = coalesce(date_offense, as.Date(datetime_offense))
  ) %>%
  filter(!is.na(offense_day)) %>%
  count(month = month(offense_day, label = TRUE, abbr = TRUE))

# One bar per month label; the fill repeats the x variable, so the legend is
# switched off.
p1 <- dat_month %>%
  ggplot(aes(x = month, y = n, fill = month)) +
  geom_col(show.legend = FALSE) +
  labs(
    title = "Crimes by Month (by Offense Date)",
    x = NULL,
    y = "Incidents"
  ) +
  theme_minimal()
print(p1)

# 2) Incidents per hour of day. The hour comes from the Hour column when it is
#    present, otherwise from datetime_offense; rows with no hour are excluded.
dat_hour <- dat %>%
  count(.hr = coalesce(Hour, hour(datetime_offense)), name = "n") %>%
  filter(!is.na(.hr))

# Line + points across the 24 hours, with a tick at every hour.
p2 <- dat_hour %>%
  ggplot(aes(x = .hr, y = n)) +
  geom_line(linewidth = 1) +
  geom_point() +
  scale_x_continuous(breaks = 0:23) +
  labs(
    title = "Crimes by Hour of Day (Offense Time)",
    x = "Hour",
    y = "Incidents"
  ) +
  theme_minimal()
print(p2)

# 3) Ten most frequent offense types. The offense column is looked up by name
#    so the chart is skipped (with a message) when it is absent from `dat`.
off_candidates <- c("nibrs_code_name")
off_col <- intersect(off_candidates, names(dat))[1]
if (is.na(off_col)) {
  message("No offense-like column found. Available names:\n", paste(names(dat), collapse = ", "))
} else {
  # Drop missing/blank labels, tally, and keep the 10 largest groups.
  top_off <- dat %>%
    filter(!(is.na(.data[[off_col]]) | .data[[off_col]] == "")) %>%
    count(across(all_of(off_col)), sort = TRUE, name = "n") %>%
    slice_head(n = 10)

  # Horizontal bars ordered by count.
  p3 <- top_off %>%
    ggplot(aes(x = reorder(.data[[off_col]], n), y = n)) +
    geom_col() +
    coord_flip() +
    labs(
      title = "Top 10 Reported Offenses",
      x = "Offense Type",
      y = "Incidents"
    ) +
    theme_minimal()
  print(p3)
}

# Insight: reveals the most common crime categories (e.g., Theft, Assault, Burglary), guiding targeted prevention.

# Ten neighborhoods with the most incidents; skipped when `dat` has no
# nhood_name column.
# NOTE(review): this reassigns `p3`, replacing the offense chart object
# created earlier in the script.
if (!("nhood_name" %in% names(dat))) {
  message("Column 'nhood_name' not found—skipping neighborhood chart.")
} else {
  p3 <- dat %>%
    filter(!(is.na(nhood_name) | nhood_name == "")) %>%
    count(nhood_name) %>%
    arrange(desc(n)) %>%
    slice_head(n = 10) %>%
    ggplot(aes(x = reorder(nhood_name, n), y = n)) +
    geom_col() +
    coord_flip() +
    labs(
      title = "Top 10 Neighborhoods by Incident Count",
      x = "Neighborhood",
      y = "Incidents"
    ) +
    theme_minimal()
  print(p3)
}

# packages
library(dplyr)
library(sf)
library(leaflet)
library(leaflet.extras)   # heatmap, search, measure
library(leaflet.extras2)  # timeline slider
library(viridisLite)

# ---- Prep: point layer ----
# Keep only rows that have both coordinates, then promote to an sf object in
# EPSG:4326. remove = FALSE keeps the long/lat columns alongside the geometry.
pts <- st_as_sf(
  filter(dat, !(is.na(long) | is.na(lat))),
  coords = c("long", "lat"),
  crs = 4326,
  remove = FALSE
)

# Heatmap of all geocoded incidents on a light basemap, with a
# reset-view button.
addResetMapButton(
  addHeatmap(
    addProviderTiles(leaflet(pts), providers$CartoDB.Positron),
    lng = ~long,
    lat = ~lat,
    intensity = 1,
    blur = 20,
    radius = 12,
    max = 0.6
  )
)
# ---- Hex-bin choropleth ----
# Project points to EPSG:3857 so the grid cell size is given in projected
# units rather than degrees.
pts_m <- st_transform(pts, 3857)

# 1) make the hex grid (square = FALSE -> hexagons) covering the points
hex_raw <- st_make_grid(pts_m, cellsize = 800, square = FALSE)
hex <- st_sf(id = seq_along(hex_raw), geometry = hex_raw)

# 2) count points falling in each hex.
#    left = FALSE is essential: st_join() defaults to a left join, which keeps
#    one all-NA row for every hex with no points, so count() would report
#    n = 1 for empty cells. The inner join keeps only real hex/point matches.
join_counts <- st_join(hex, pts_m, join = st_intersects, left = FALSE) %>%
  st_drop_geometry() %>%
  count(id, name = "n")

# 3) attach counts back to hexes; hexes absent from join_counts are empty -> 0
hex_count <- hex %>%
  left_join(join_counts, by = "id") %>%
  mutate(n = coalesce(n, 0L)) %>%
  st_transform(4326)

# Left-closed bins starting at 1: [1,2), [2,5), ..., [80,Inf)
pal <- colorBin(viridis(7), hex_count$n,
                bins = c(1, 2, 5, 10, 20, 40, 80, Inf), right = FALSE)

# Draw only occupied hexes: a count of 0 lies below the lowest bin edge, so it
# has no palette colour. hex_count itself still holds the zero cells.
hex_occupied <- filter(hex_count, n > 0L)

leaflet(hex_occupied) %>%
  addProviderTiles(providers$CartoDB.DarkMatter) %>%
  addPolygons(
    fillColor = ~pal(n), color = "#222", weight = 0.5,
    fillOpacity = 0.8, label = ~paste0("Incidents: ", n)
  ) %>%
  addLegend(pal = pal, values = ~n, title = "Incidents / ~0.8 km hex",
            opacity = 0.9)
# Bucket each incident into a time-of-day group.
# `hour` is the hour component of datetime_offense (NA when the timestamp is
# missing; format() already returns NA in that case).
# right = FALSE makes the intervals left-closed -- [-Inf,6), [6,12), [12,18),
# [18,Inf) -- so hours 6, 12 and 18 fall into the bucket whose label names
# them. With cut()'s default (right-closed) they would land one bucket early.
pts_td <- pts %>%
  mutate(hour = as.integer(format(datetime_offense, "%H")),
         tod = cut(hour,
                   breaks = c(-Inf, 6, 12, 18, Inf),
                   right = FALSE,
                   labels = c("Late Night (0–5)","Morning (6–11)",
                              "Afternoon (12–17)","Evening (18–23)")))

# Empty base map that the per-bucket heat layers are added to.
m <- leaflet() %>% addProviderTiles(providers$CartoDB.Positron)

# Add one heatmap layer for a time-of-day subset.
#   map        - leaflet map to extend
#   df         - rows to plot; must carry `long` and `lat` columns
#   group_name - layer group the heatmap is registered under
# Returns the map with the layer added, or the map untouched when `df` has no
# rows.
add_tod_heat <- function(map, df, group_name) {
  if (nrow(df) == 0) {
    return(map)
  }
  addHeatmap(
    map,
    data = df,
    lng = ~long,
    lat = ~lat,
    radius = 12,
    blur = 20,
    max = 0.6,
    group = group_name
  )
}

# Time-of-day map: one toggleable heat layer per bucket, added in label order
# on top of the base map `m`, plus an expanded layer control.
tod_groups <- c("Late Night (0–5)", "Morning (6–11)",
                "Afternoon (12–17)", "Evening (18–23)")

Reduce(
  function(map_so_far, grp) {
    # .env$grp pins the comparison to this function's argument rather than
    # any column that might share the name.
    add_tod_heat(map_so_far, dplyr::filter(pts_td, tod == .env$grp), grp)
  },
  tod_groups,
  m
) %>%
  addLayersControl(
    baseGroups = NULL,
    overlayGroups = tod_groups,
    options = layersControlOptions(collapsed = FALSE)
  )
# Clustered point map: every geocoded incident is a small circle marker whose
# popup shows neighborhood, offense time, UCR grouping and location type, with
# a placeholder wherever a field is missing.
addCircleMarkers(
  addProviderTiles(leaflet(pts), providers$Esri.WorldTopoMap),
  lng = ~long,
  lat = ~lat,
  radius = 3,
  stroke = FALSE,
  fillOpacity = 0.85,
  clusterOptions = markerClusterOptions(),
  popup = ~paste0(
    "<b>", coalesce(nhood_name, "Unknown neighborhood"), "</b><br/>",
    # format() gives NA for a missing timestamp, which coalesce() then
    # replaces with the literal "NA"
    "When: ", coalesce(format(datetime_offense, "%Y-%m-%d %H:%M"), "NA"),
    "<br/>",
    "Type: ", coalesce(ucr_grouping, "NA"), "<br/>",
    "Location: ", coalesce(location_type, "NA")
  )
)