# =========================
# Crime data cleaning script
# =========================
# --- read & clean names ---
# Attach packages before anything uses them: the pipe (`%>%`) comes from
# dplyr, and the date-time helpers used further down come from lubridate.
suppressPackageStartupMessages({
  library(dplyr)
  library(lubridate)
})

# Read the cleaned crime extract and normalise the column names.
# read_csv() and clean_names() are namespace-qualified because readr and
# janitor are not attached in the visible part of this script.
# guess_max = 200000 makes readr inspect up to that many rows before settling
# on column types.
dat <- readr::read_csv(
  "/home/rstudio/data/crime_data_cleaned.csv",
  guess_max = 200000,
  show_col_types = FALSE
) %>%
  janitor::clean_names()
# Build everything strictly from offense_start_date; leave all other columns unchanged
# Step 1: parse the raw offense_start_date text into a UTC date-time.
# Step 2: derive the date, "HH:MM" label, hour and minute from that value.
dat <- dat %>%
  mutate(
    # Orders are tried in sequence: RFC-822 style with and without a zone
    # token, then ISO-like and US-style fallbacks. Parse warnings are
    # silenced; values that match no order come back as NA.
    datetime_offense = suppressWarnings(
      parse_date_time(
        offense_start_date,
        orders = c(
          "a, d b Y H:M:S z",
          "a, d b Y H:M:S",
          "Y-m-d H:M:S",
          "m/d/Y H:M:S"
        ),
        tz = "UTC"
      )
    )
  ) %>%
  mutate(
    date_offense = as.Date(datetime_offense),
    # format(), hour() and minute() all propagate NA, so rows that failed to
    # parse stay NA in every derived column without an explicit guard.
    time_offense_hhmm = format(datetime_offense, "%H:%M"),
    Hour = hour(datetime_offense),
    Minutes = minute(datetime_offense)
  )
# Quick sanity check: should now be populated where offense_start_date is non-missing
# Print the first ten rows of the source column next to every derived column
# so the parse can be checked by eye.
head(
  select(
    dat,
    offense_start_date, date_offense, time_offense_hhmm,
    datetime_offense, Hour, Minutes
  ),
  10
)
## # A tibble: 10 × 6
## offense_start_date date_offense time_offense_hhmm datetime_offense Hour
## <chr> <date> <chr> <dttm> <int>
## 1 Sun, 15 Jan 2023 01… 2023-01-15 01:20 2023-01-15 01:20:00 1
## 2 Sun, 15 Jan 2023 08… 2023-01-15 08:15 2023-01-15 08:15:00 8
## 3 Tue, 17 Jan 2023 23… 2023-01-17 23:31 2023-01-17 23:31:00 23
## 4 Sun, 15 Jan 2023 14… 2023-01-15 14:34 2023-01-15 14:34:00 14
## 5 Sat, 07 Jan 2023 01… 2023-01-07 01:36 2023-01-07 01:36:00 1
## 6 Fri, 06 Jan 2023 18… 2023-01-06 18:45 2023-01-06 18:45:00 18
## 7 Mon, 12 Sep 2022 18… 2022-09-12 18:20 2022-09-12 18:20:00 18
## 8 Wed, 04 Jan 2023 18… 2023-01-04 18:00 2023-01-04 18:00:00 18
## 9 Sat, 07 Jan 2023 02… 2023-01-07 02:52 2023-01-07 02:52:00 2
## 10 Sat, 07 Jan 2023 04… 2023-01-07 04:00 2023-01-07 04:00:00 4
## # ℹ 1 more variable: Minutes <int>
# ===============================
# Exploratory Data Analysis (EDA)
# ===============================
library(dplyr)
library(ggplot2)
library(lubridate)
library(tmap)
library(sf)
# 1) Crimes by month (by offense date/datetime)
# Incident counts per calendar month (all years pooled together).
dat_month <- dat %>%
  # Prefer the derived date; fall back to the date part of the timestamp.
  transmute(
    .month_date = coalesce(date_offense, as.Date(datetime_offense))
  ) %>%
  filter(!is.na(.month_date)) %>%
  # Abbreviated month labels as an ordered factor, counted in one step.
  count(month = month(.month_date, label = TRUE, abbr = TRUE))

# Bar chart: one bar per month, coloured by month, legend suppressed.
p1 <- ggplot(
  data = dat_month,
  mapping = aes(x = month, y = n, fill = month)
) +
  geom_col(show.legend = FALSE) +
  labs(
    title = "Crimes by Month (by Offense Date)",
    x = NULL,
    y = "Incidents"
  ) +
  theme_minimal()
print(p1)
# 2) Hourly pattern (from offense time if available; else from datetime_offense)
# Incident counts per hour of day (0-23).
dat_hour <- dat %>%
  # Use the precomputed Hour column, falling back to the timestamp's hour.
  transmute(.hr = coalesce(Hour, hour(datetime_offense))) %>%
  filter(!is.na(.hr)) %>%
  count(.hr, name = "n")

# Line-and-point chart with a tick for every hour.
p2 <- ggplot(data = dat_hour, mapping = aes(x = .hr, y = n)) +
  geom_line(linewidth = 1) +
  geom_point() +
  scale_x_continuous(breaks = 0:23) +
  labs(
    title = "Crimes by Hour of Day (Offense Time)",
    x = "Hour",
    y = "Incidents"
  ) +
  theme_minimal()
print(p2)
# Top-10 offense types. The offense column is looked up by name so the chart
# is skipped with a message (instead of failing) when it is absent.
off_candidates <- c("nibrs_code_name")
off_col <- intersect(off_candidates, names(dat))[1]

if (is.na(off_col)) {
  # No candidate column exists: list what is available to help debugging.
  message("No offense-like column found. Available names:\n", paste(names(dat), collapse = ", "))
} else {
  # Drop missing/blank offense names, then keep the ten most frequent.
  top_off <- dat %>%
    filter(!is.na(.data[[off_col]]), .data[[off_col]] != "") %>%
    count(across(all_of(off_col)), sort = TRUE, name = "n") %>%
    slice_head(n = 10)

  # Horizontal bars ordered by count.
  p3 <- ggplot(
    data = top_off,
    mapping = aes(x = reorder(.data[[off_col]], n), y = n)
  ) +
    geom_col() +
    coord_flip() +
    labs(
      title = "Top 10 Reported Offenses",
      x = "Offense Type",
      y = "Incidents"
    ) +
    theme_minimal()
  print(p3)
}
# Insight: reveals the most common crime categories (e.g., Theft,
# Assault, Burglary), guiding targeted prevention.
# Top-10 neighborhoods by incident count; skipped with a message when the
# nhood_name column is absent.
# NOTE(review): the plot is stored in `p3`, which replaces any plot already
# held under that name.
if (!("nhood_name" %in% names(dat))) {
  message("Column 'nhood_name' not found—skipping neighborhood chart.")
} else {
  p3 <- dat %>%
    # Ignore rows whose neighborhood is missing or blank.
    filter(!is.na(nhood_name), nhood_name != "") %>%
    count(nhood_name) %>%
    arrange(desc(n)) %>%
    slice_head(n = 10) %>%
    ggplot(mapping = aes(x = reorder(nhood_name, n), y = n)) +
    geom_col() +
    coord_flip() +
    labs(
      title = "Top 10 Neighborhoods by Incident Count",
      x = "Neighborhood",
      y = "Incidents"
    ) +
    theme_minimal()
  print(p3)
}
# packages
library(dplyr)
library(sf)
library(leaflet)
library(leaflet.extras) # heatmap, search, measure
library(leaflet.extras2) # timeline slider
library(viridisLite)
# ---- Prep: keep valid coords & make sf ----
# Keep only rows that have both coordinates and turn them into an sf point
# layer in EPSG:4326; remove = FALSE keeps the long/lat columns for leaflet.
pts <- st_as_sf(
  dplyr::filter(dat, !is.na(long), !is.na(lat)),
  coords = c("long", "lat"),
  crs = 4326,
  remove = FALSE
)

# Overall incident heatmap on a light basemap, with a reset-view button.
leaflet(data = pts) %>%
  addProviderTiles(providers$CartoDB.Positron) %>%
  addHeatmap(
    lng = ~long,
    lat = ~lat,
    intensity = 1,
    blur = 20,
    radius = 12,
    max = 0.6
  ) %>%
  addResetMapButton()
# project points to meters for binning
# Hex-bin choropleth of incident counts.
# Points are projected to EPSG:3857 so the grid cell size is given in the
# projection's metre units (800 => nominally ~0.8 km across).
pts_m <- st_transform(pts, 3857)

# 1) Build a hexagonal grid covering the points and give each cell an id.
hex_raw <- st_make_grid(pts_m, cellsize = 800, square = FALSE)
hex <- st_sf(id = seq_along(hex_raw), geometry = hex_raw)

# 2) Count the points intersecting each hex. st_intersects() returns one
#    index vector per hex, so lengths() is the per-hex count and is a true 0
#    for empty cells. (A left st_join() followed by count() keeps one all-NA
#    row per empty hex and would therefore report 1 instead of 0.)
join_counts <- tibble(
  id = hex$id,
  n = lengths(st_intersects(hex, pts_m))
)

# 3) Attach the counts to the hexes and return to lon/lat for leaflet.
hex_count <- hex %>%
  left_join(join_counts, by = "id") %>%
  st_transform(4326)

# 4) Draw only hexes that contain at least one incident: the colour bins
#    start at 1, so empty cells have no bin to fall into.
hex_nonempty <- dplyr::filter(hex_count, n > 0)

pal <- colorBin(
  viridis(7),
  hex_nonempty$n,
  bins = c(1, 2, 5, 10, 20, 40, 80, Inf),
  right = FALSE
)

leaflet(hex_nonempty) %>%
  addProviderTiles(providers$CartoDB.DarkMatter) %>%
  addPolygons(
    fillColor = ~pal(n), color = "#222", weight = 0.5,
    fillOpacity = 0.8, label = ~paste0("Incidents: ", n)
  ) %>%
  addLegend(
    pal = pal, values = ~n, title = "Incidents / ~0.8 km hex",
    opacity = 0.9
  )
# bucket into time-of-day groups
# Add the offense hour and a time-of-day bucket to every point.
pts_td <- pts %>%
  mutate(
    # Hour of day (0-23) from the timestamp; NA when the timestamp is NA.
    hour = lubridate::hour(datetime_offense),
    # right = FALSE makes the intervals [0, 6), [6, 12), [12, 18), [18, Inf)
    # so that hours 6, 12 and 18 open their own bucket, matching the labels.
    # (cut()'s default right = TRUE would place them in the previous bucket.)
    tod = cut(
      hour,
      breaks = c(-Inf, 6, 12, 18, Inf),
      labels = c("Late Night (0–5)", "Morning (6–11)",
                 "Afternoon (12–17)", "Evening (18–23)"),
      right = FALSE
    )
  )
# Empty base map (light CartoDB tiles) that the time-of-day layers are added to.
m <- addProviderTiles(leaflet(), providers$CartoDB.Positron)
# Add one heatmap layer to a leaflet map, tagged with a layer-group name.
#
# map        leaflet map to extend.
# df         data with `long` and `lat` columns to plot.
# group_name group label assigned to the heatmap layer.
#
# Returns the map with the layer added, or the map unchanged when `df` has
# no rows.
add_tod_heat <- function(map, df, group_name) {
  # Nothing to draw: hand the map back untouched.
  if (nrow(df) == 0) {
    return(map)
  }
  addHeatmap(
    map,
    data = df,
    lng = ~long,
    lat = ~lat,
    radius = 12,
    blur = 20,
    max = 0.6,
    group = group_name
  )
}
# Build the time-of-day map: fold the four buckets onto the base map, one
# heatmap layer per bucket, then add an always-expanded layer switcher.
Reduce(
  function(map, label) {
    add_tod_heat(map, dplyr::filter(pts_td, tod == .env$label), label)
  },
  c("Late Night (0–5)", "Morning (6–11)",
    "Afternoon (12–17)", "Evening (18–23)"),
  m
) %>%
  addLayersControl(
    baseGroups = NULL,
    # All four groups are listed even if a bucket had no rows and was skipped.
    overlayGroups = c("Late Night (0–5)", "Morning (6–11)",
                      "Afternoon (12–17)", "Evening (18–23)"),
    options = layersControlOptions(collapsed = FALSE)
  )
# Clustered point map on a topographic basemap. Each marker's popup shows the
# neighborhood, offense time, UCR grouping and location type, with a
# placeholder wherever a value is missing.
leaflet(data = pts) %>%
  addProviderTiles(providers$Esri.WorldTopoMap) %>%
  addCircleMarkers(
    lng = ~long,
    lat = ~lat,
    radius = 3,
    stroke = FALSE,
    fillOpacity = 0.85,
    clusterOptions = markerClusterOptions(),
    popup = ~paste0(
      "<b>", coalesce(nhood_name, "Unknown neighborhood"), "</b><br/>",
      "When: ",
      ifelse(
        is.na(datetime_offense),
        "NA",
        format(datetime_offense, "%Y-%m-%d %H:%M")
      ),
      "<br/>",
      "Type: ", coalesce(ucr_grouping, "NA"), "<br/>",
      "Location: ", coalesce(location_type, "NA")
    )
  )