library(tidyverse)
library(lubridate)
library(tsibble)
library(feasts)
library(forecast)
library(plotly)
# Load the raw incident export (path is relative to the working directory)
crimes_data <- read.csv(file = "Crimes_2001_to_present.csv")
# Preview the first six rows to see which columns are available
head(crimes_data, n = 6L)
## ID Case.Number Date Block IUCR
## 1 13689808 JH539692 12/11/2024 11:40:00 PM 022XX N LEAVITT ST 031A
## 2 13689797 JH539688 12/11/2024 11:38:00 PM 087XX S BURLEY AVE 0326
## 3 13690325 JH539659 12/11/2024 10:50:00 PM 014XX S SPRINGFIELD AVE 0325
## 4 13689749 JH539636 12/11/2024 10:36:00 PM 006XX N ASHLAND AVE 031A
## 5 13689743 JH539668 12/11/2024 09:41:00 PM 049XX S CICERO AVE 031A
## 6 13689599 JH539522 12/11/2024 07:49:00 PM 064XX S RICHMOND ST 0320
## Primary.Type Description Location.Description Arrest
## 1 ROBBERY ARMED - HANDGUN SIDEWALK false
## 2 ROBBERY AGGRAVATED VEHICULAR HIJACKING APARTMENT false
## 3 ROBBERY VEHICULAR HIJACKING STREET false
## 4 ROBBERY ARMED - HANDGUN GAS STATION false
## 5 ROBBERY ARMED - HANDGUN GAS STATION false
## 6 ROBBERY STRONG ARM - NO WEAPON SIDEWALK false
## Domestic Beat District Ward Community.Area FBI.Code X.Coordinate Y.Coordinate
## 1 false 1432 14 32 22 3 1161323 1914920
## 2 false 424 4 10 46 3 1199184 1847735
## 3 false 1011 10 24 29 3 1150611 1892853
## 4 false 1215 12 1 24 3 1165609 1904556
## 5 false 814 8 22 56 3 1145191 1871544
## 6 false 823 8 15 66 3 1157824 1861716
## Year Updated.On Latitude Longitude Location
## 1 2024 12/19/2024 03:41:35 PM 41.92222 -87.68267 (41.922223823, -87.682674421)
## 2 2024 12/19/2024 03:41:35 PM 41.73699 -87.54583 (41.736993626, -87.545827764)
## 3 2024 12/19/2024 03:41:35 PM 41.86189 -87.72261 (41.861885976, -87.722610798)
## 4 2024 12/19/2024 03:41:35 PM 41.89369 -87.66722 (41.893693999, -87.667222147)
## 5 2024 12/19/2024 03:41:35 PM 41.80352 -87.74304 (41.803515121, -87.743044463)
## 6 2024 12/19/2024 03:41:35 PM 41.77630 -87.69698 (41.776298295, -87.696979416)
# Compact overview of each column's type and leading values
str(object = crimes_data)
## 'data.frame': 309431 obs. of 22 variables:
## $ ID : int 13689808 13689797 13690325 13689749 13689743 13689599 13690171 13689477 13689503 13689479 ...
## $ Case.Number : chr "JH539692" "JH539688" "JH539659" "JH539636" ...
## $ Date : chr "12/11/2024 11:40:00 PM" "12/11/2024 11:38:00 PM" "12/11/2024 10:50:00 PM" "12/11/2024 10:36:00 PM" ...
## $ Block : chr "022XX N LEAVITT ST" "087XX S BURLEY AVE" "014XX S SPRINGFIELD AVE" "006XX N ASHLAND AVE" ...
## $ IUCR : chr "031A" "0326" "0325" "031A" ...
## $ Primary.Type : chr "ROBBERY" "ROBBERY" "ROBBERY" "ROBBERY" ...
## $ Description : chr "ARMED - HANDGUN" "AGGRAVATED VEHICULAR HIJACKING" "VEHICULAR HIJACKING" "ARMED - HANDGUN" ...
## $ Location.Description: chr "SIDEWALK" "APARTMENT" "STREET" "GAS STATION" ...
## $ Arrest : chr "false" "false" "false" "false" ...
## $ Domestic : chr "false" "false" "false" "false" ...
## $ Beat : int 1432 424 1011 1215 814 823 1011 422 914 825 ...
## $ District : int 14 4 10 12 8 8 10 4 9 8 ...
## $ Ward : int 32 10 24 1 22 15 24 7 11 16 ...
## $ Community.Area : int 22 46 29 24 56 66 29 46 34 66 ...
## $ FBI.Code : int 3 3 3 3 3 3 3 3 3 3 ...
## $ X.Coordinate : int 1161323 1199184 1150611 1165609 1145191 1157824 1147575 1195224 1172636 1160426 ...
## $ Y.Coordinate : int 1914920 1847735 1892853 1904556 1871544 1861716 1894183 1852934 1888149 1863809 ...
## $ Year : int 2024 2024 2024 2024 2024 2024 2024 2024 2024 2024 ...
## $ Updated.On : chr "12/19/2024 03:41:35 PM" "12/19/2024 03:41:35 PM" "12/19/2024 03:41:35 PM" "12/19/2024 03:41:35 PM" ...
## $ Latitude : num 41.9 41.7 41.9 41.9 41.8 ...
## $ Longitude : num -87.7 -87.5 -87.7 -87.7 -87.7 ...
## $ Location : chr "(41.922223823, -87.682674421)" "(41.736993626, -87.545827764)" "(41.861885976, -87.722610798)" "(41.893693999, -87.667222147)" ...
# Parse the timestamp strings into calendar dates (the format string only
# consumes the leading month/day/year part), derive calendar features from
# the parsed date, and keep just the columns used by the analysis below
crimes_clean <- crimes_data %>%
  mutate(Date = as.Date(Date, format = "%m/%d/%Y")) %>%
  mutate(
    Year = year(Date),
    Month = month(Date),
    Day = day(Date),
    Weekday = wday(Date, label = TRUE)
  ) %>%
  select(Date, Primary.Type, Year, Month, Day, Weekday)
# Count missing values per column. vapply() pins the result to exactly one
# integer per column, whereas sapply() chooses its return type from the
# input and can silently change shape.
vapply(crimes_clean, function(x) sum(is.na(x)), integer(1))
## Date Primary.Type Year Month Day Weekday
## 0 0 0 0 0 0
# Tally incidents per crime type, most frequent first, and keep the top ten
crime_freq <- head(
  count(crimes_clean, Primary.Type, sort = TRUE),
  n = 10L
)
# Horizontal bar chart of the top crime types, bars ordered by frequency
ggplot(crime_freq, aes(x = reorder(Primary.Type, n), y = n)) +
  geom_col(fill = "steelblue") +
  coord_flip() +
  labs(
    x = "Crime Type",
    y = "Frequency",
    title = "Top 10 Most Frequent Crime Types"
  ) +
  theme_minimal()
# crime_freq is sorted by descending count, so its first entry is the most
# frequent crime type
most_common_crime <- crime_freq[["Primary.Type"]][[1]]
# Build the daily series for the selected crime type: count incidents per
# date, then add every calendar day missing from the observed range with a
# count of zero so the series has no gaps
daily_crimes <- crimes_clean %>%
  filter(Primary.Type == most_common_crime) %>%
  count(Date, name = "count") %>%
  complete(
    Date = seq(min(Date), max(Date), by = "day"),
    fill = list(count = 0)
  )
# Promote the gap-free daily counts to a tsibble indexed by Date
ts_crimes <- as_tsibble(daily_crimes, index = Date)
# Line chart of the daily incident counts
ggplot(ts_crimes, aes(x = Date, y = count)) +
  geom_line(color = "steelblue") +
  labs(
    x = "Date",
    y = "Number of Incidents",
    title = paste("Daily", most_common_crime, "Incidents")
  ) +
  theme_minimal()
# Aggregate the selected crime type to one row per (Year, Month) and attach
# the first day of that month as a date for plotting
monthly_crimes <- crimes_clean %>%
  filter(Primary.Type == most_common_crime) %>%
  count(Year, Month) %>%
  mutate(Date = make_date(year = Year, month = Month, day = 1))
# Monthly counts as a line, overlaid with a loess smoother (no confidence
# band) to show the long-run trend
ggplot(monthly_crimes, aes(x = Date, y = n)) +
  geom_line(color = "steelblue") +
  geom_smooth(method = "loess", se = FALSE, color = "red") +
  labs(
    x = "Date",
    y = "Number of Incidents",
    title = paste("Monthly", most_common_crime, "Incidents with Trend")
  ) +
  theme_minimal()
# Total incidents per weekday for the selected crime type; the Weekday
# levels are then reordered by count, so bars plot from the least to the
# most frequent day rather than in calendar order
weekly_pattern <- crimes_clean %>%
  filter(Primary.Type == most_common_crime) %>%
  count(Weekday) %>%
  mutate(Weekday = fct_reorder(.f = Weekday, .x = n))
# Bar chart of total incidents for each day of the week
ggplot(weekly_pattern, aes(x = Weekday, y = n)) +
  geom_col(fill = "steelblue") +
  labs(
    x = "Day of Week",
    y = "Total Incidents",
    title = paste("Weekly Pattern of", most_common_crime)
  ) +
  theme_minimal()
# Average monthly incident count across years: count per (Month, Year),
# then take the mean of those counts within each calendar month
monthly_pattern <- crimes_clean %>%
  filter(Primary.Type == most_common_crime) %>%
  count(Month, Year, name = "n") %>%
  group_by(Month) %>%
  summarise(avg_crimes = mean(n))
# Line-and-point chart of the average count for each calendar month;
# the x axis is labelled with every month number 1 through 12
ggplot(monthly_pattern, aes(x = Month, y = avg_crimes)) +
  geom_line(group = 1, color = "steelblue") +
  geom_point(color = "steelblue") +
  scale_x_continuous(breaks = 1:12) +
  labs(
    x = "Month",
    y = "Average Incidents",
    title = paste("Average Monthly Pattern of", most_common_crime)
  ) +
  theme_minimal()
# Treat the daily counts as a series with a 7-observation seasonal period
# (weekly seasonality)
crimes_ts <- ts(data = ts_crimes$count, frequency = 7)
# Classical decomposition; "additive" is decompose()'s default type
decomp <- decompose(crimes_ts, type = "additive")
# Show the observed, trend, seasonal and random components
plot(decomp)
# Basic statistics over the whole daily series.
# summarise() on a tsibble keeps the index, so it returns one row per Date
# instead of a single overall row. The output pasted after this block
# predates this fix: it lists 8,746 rows, each with an NA standard
# deviation. Dropping to a plain tibble first collapses the series into one
# row of overall statistics.
summary_stats <- ts_crimes %>%
  as_tibble() %>%
  summarise(
    mean_daily = mean(count),
    median_daily = median(count),
    sd_daily = sd(count),
    min_daily = min(count),
    max_daily = max(count)
  )
print(summary_stats)
## # A tsibble: 8,746 x 6 [1D]
## Date mean_daily median_daily sd_daily min_daily max_daily
## <date> <dbl> <int> <dbl> <int> <int>
## 1 2001-01-01 41 41 NA 41 41
## 2 2001-01-02 35 35 NA 35 35
## 3 2001-01-03 51 51 NA 51 51
## 4 2001-01-04 55 55 NA 55 55
## 5 2001-01-05 55 55 NA 55 55
## 6 2001-01-06 54 54 NA 54 54
## 7 2001-01-07 36 36 NA 36 36
## 8 2001-01-08 44 44 NA 44 44
## 9 2001-01-09 38 38 NA 38 38
## 10 2001-01-10 41 41 NA 41 41
## # ℹ 8,736 more rows
# Autocorrelation of the daily counts across lags
acf(x = ts_crimes$count, main = "Autocorrelation Function")
The findings from the exploratory data analysis above will inform our model selection and feature engineering in the next phase.