1. Loading Required Libraries

library(tidyverse)
library(lubridate)
library(tsibble)
library(feasts)
library(forecast)
library(plotly)

2. Data Loading and Initial Inspection

# Load the raw incident export from the working directory into a data.frame
crimes_data <- read.csv(file = "Crimes_2001_to_present.csv")

# Preview the first six records to confirm the column layout
head(crimes_data, n = 6L)
##         ID Case.Number                   Date                   Block IUCR
## 1 13689808    JH539692 12/11/2024 11:40:00 PM      022XX N LEAVITT ST 031A
## 2 13689797    JH539688 12/11/2024 11:38:00 PM      087XX S BURLEY AVE 0326
## 3 13690325    JH539659 12/11/2024 10:50:00 PM 014XX S SPRINGFIELD AVE 0325
## 4 13689749    JH539636 12/11/2024 10:36:00 PM     006XX N ASHLAND AVE 031A
## 5 13689743    JH539668 12/11/2024 09:41:00 PM      049XX S CICERO AVE 031A
## 6 13689599    JH539522 12/11/2024 07:49:00 PM     064XX S RICHMOND ST 0320
##   Primary.Type                    Description Location.Description Arrest
## 1      ROBBERY                ARMED - HANDGUN             SIDEWALK  false
## 2      ROBBERY AGGRAVATED VEHICULAR HIJACKING            APARTMENT  false
## 3      ROBBERY            VEHICULAR HIJACKING               STREET  false
## 4      ROBBERY                ARMED - HANDGUN          GAS STATION  false
## 5      ROBBERY                ARMED - HANDGUN          GAS STATION  false
## 6      ROBBERY         STRONG ARM - NO WEAPON             SIDEWALK  false
##   Domestic Beat District Ward Community.Area FBI.Code X.Coordinate Y.Coordinate
## 1    false 1432       14   32             22        3      1161323      1914920
## 2    false  424        4   10             46        3      1199184      1847735
## 3    false 1011       10   24             29        3      1150611      1892853
## 4    false 1215       12    1             24        3      1165609      1904556
## 5    false  814        8   22             56        3      1145191      1871544
## 6    false  823        8   15             66        3      1157824      1861716
##   Year             Updated.On Latitude Longitude                      Location
## 1 2024 12/19/2024 03:41:35 PM 41.92222 -87.68267 (41.922223823, -87.682674421)
## 2 2024 12/19/2024 03:41:35 PM 41.73699 -87.54583 (41.736993626, -87.545827764)
## 3 2024 12/19/2024 03:41:35 PM 41.86189 -87.72261 (41.861885976, -87.722610798)
## 4 2024 12/19/2024 03:41:35 PM 41.89369 -87.66722 (41.893693999, -87.667222147)
## 5 2024 12/19/2024 03:41:35 PM 41.80352 -87.74304 (41.803515121, -87.743044463)
## 6 2024 12/19/2024 03:41:35 PM 41.77630 -87.69698 (41.776298295, -87.696979416)
# Inspect the dimensions and the type each column was read as
str(crimes_data)
## 'data.frame':    309431 obs. of  22 variables:
##  $ ID                  : int  13689808 13689797 13690325 13689749 13689743 13689599 13690171 13689477 13689503 13689479 ...
##  $ Case.Number         : chr  "JH539692" "JH539688" "JH539659" "JH539636" ...
##  $ Date                : chr  "12/11/2024 11:40:00 PM" "12/11/2024 11:38:00 PM" "12/11/2024 10:50:00 PM" "12/11/2024 10:36:00 PM" ...
##  $ Block               : chr  "022XX N LEAVITT ST" "087XX S BURLEY AVE" "014XX S SPRINGFIELD AVE" "006XX N ASHLAND AVE" ...
##  $ IUCR                : chr  "031A" "0326" "0325" "031A" ...
##  $ Primary.Type        : chr  "ROBBERY" "ROBBERY" "ROBBERY" "ROBBERY" ...
##  $ Description         : chr  "ARMED - HANDGUN" "AGGRAVATED VEHICULAR HIJACKING" "VEHICULAR HIJACKING" "ARMED - HANDGUN" ...
##  $ Location.Description: chr  "SIDEWALK" "APARTMENT" "STREET" "GAS STATION" ...
##  $ Arrest              : chr  "false" "false" "false" "false" ...
##  $ Domestic            : chr  "false" "false" "false" "false" ...
##  $ Beat                : int  1432 424 1011 1215 814 823 1011 422 914 825 ...
##  $ District            : int  14 4 10 12 8 8 10 4 9 8 ...
##  $ Ward                : int  32 10 24 1 22 15 24 7 11 16 ...
##  $ Community.Area      : int  22 46 29 24 56 66 29 46 34 66 ...
##  $ FBI.Code            : int  3 3 3 3 3 3 3 3 3 3 ...
##  $ X.Coordinate        : int  1161323 1199184 1150611 1165609 1145191 1157824 1147575 1195224 1172636 1160426 ...
##  $ Y.Coordinate        : int  1914920 1847735 1892853 1904556 1871544 1861716 1894183 1852934 1888149 1863809 ...
##  $ Year                : int  2024 2024 2024 2024 2024 2024 2024 2024 2024 2024 ...
##  $ Updated.On          : chr  "12/19/2024 03:41:35 PM" "12/19/2024 03:41:35 PM" "12/19/2024 03:41:35 PM" "12/19/2024 03:41:35 PM" ...
##  $ Latitude            : num  41.9 41.7 41.9 41.9 41.8 ...
##  $ Longitude           : num  -87.7 -87.5 -87.7 -87.7 -87.7 ...
##  $ Location            : chr  "(41.922223823, -87.682674421)" "(41.736993626, -87.545827764)" "(41.861885976, -87.722610798)" "(41.893693999, -87.667222147)" ...

3. Data Preprocessing

# Keep only the timestamp and the offence category, then derive calendar
# fields from the parsed date. as.Date() with this format reads the leading
# month/day/year and ignores the time of day, so each incident is reduced to
# its calendar date.
crimes_clean <- crimes_data %>%
  select(Date, Primary.Type) %>%
  mutate(
    Date = as.Date(Date, format = "%m/%d/%Y"),
    Year = year(Date),
    Month = month(Date),
    Day = day(Date),
    Weekday = wday(Date, label = TRUE)
  )

# Check for missing values: number of NA entries in each column.
# vapply() pins the result to exactly one integer per column, so the return
# type cannot silently change with the input the way sapply()'s can.
vapply(crimes_clean, function(x) sum(is.na(x)), integer(1))
##         Date Primary.Type         Year        Month          Day      Weekday 
##            0            0            0            0            0            0

4. Basic Time Series Analysis

4.1 Crime Frequency by Type

# Tally incidents per offence category (largest first) and keep the top ten
crime_freq <- crimes_clean %>%
  count(Primary.Type, sort = TRUE) %>%
  slice_head(n = 10)

# Horizontal bar chart; categories are ordered on the axis by their count
ggplot(crime_freq, aes(x = reorder(Primary.Type, n), y = n)) +
  geom_col(fill = "steelblue") +
  coord_flip() +
  labs(
    title = "Top 10 Most Frequent Crime Types",
    x = "Crime Type",
    y = "Frequency"
  ) +
  theme_minimal()

4.2 Time Series Creation for Most Common Crime

# crime_freq is sorted by descending count, so its first row holds the most
# frequent category
most_common_crime <- crime_freq[["Primary.Type"]][1]

# Daily incident counts for that category. complete() inserts every calendar
# day between the first and last observed date, and days with no recorded
# incident get a count of 0 instead of being absent.
daily_crimes <- crimes_clean %>%
  filter(Primary.Type == most_common_crime) %>%
  count(Date, name = "count") %>%
  complete(
    Date = seq(min(Date), max(Date), by = "day"),
    fill = list(count = 0)
  )

# Re-express the gap-free daily table as a tsibble indexed by Date
ts_crimes <- as_tsibble(daily_crimes, index = Date)

5. Temporal Patterns Analysis

5.1 Daily Trend

# Line chart of the daily incident count over the whole observed period
ggplot(ts_crimes, aes(x = Date, y = count)) +
  geom_line(colour = "steelblue") +
  labs(
    title = paste("Daily", most_common_crime, "Incidents"),
    x = "Date",
    y = "Number of Incidents"
  ) +
  theme_minimal()

5.2 Monthly Pattern

# Incidents per calendar month; each month is stamped with the date of its
# first day so it can be placed on a continuous date axis
monthly_crimes <- crimes_clean %>%
  filter(Primary.Type == most_common_crime) %>%
  count(Year, Month) %>%
  mutate(Date = make_date(year = Year, month = Month, day = 1))

# Monthly totals (blue) with a loess smoother (red) to show the trend
# NOTE(review): the last month in the data may be incomplete, which would pull
# the final point down - confirm before interpreting the end of the series.
ggplot(monthly_crimes, aes(x = Date, y = n)) +
  geom_line(colour = "steelblue") +
  geom_smooth(method = "loess", se = FALSE, colour = "red") +
  labs(
    title = paste("Monthly", most_common_crime, "Incidents with Trend"),
    x = "Date",
    y = "Number of Incidents"
  ) +
  theme_minimal()

5.3 Seasonal Patterns

# Total incidents per day of the week.
# fct_reorder() re-levels Weekday by its count, so the bars appear in
# ascending order of incidents rather than in calendar order.
weekly_pattern <- crimes_clean %>%
  filter(Primary.Type == most_common_crime) %>%
  count(Weekday) %>%
  mutate(Weekday = fct_reorder(Weekday, n))

ggplot(weekly_pattern, aes(x = Weekday, y = n)) +
  geom_col(fill = "steelblue") +
  labs(
    title = paste("Weekly Pattern of", most_common_crime),
    x = "Day of Week",
    y = "Total Incidents"
  ) +
  theme_minimal()

# Seasonal profile: count incidents per (month, year) pair, then average those
# counts over the years for each calendar month
monthly_pattern <- crimes_clean %>%
  filter(Primary.Type == most_common_crime) %>%
  count(Month, Year) %>%
  group_by(Month) %>%
  summarise(avg_crimes = mean(n))

# Month is numeric, so the axis is continuous with one break per month;
# group = 1 joins all twelve points into a single line
ggplot(monthly_pattern, aes(x = Month, y = avg_crimes)) +
  geom_line(group = 1, colour = "steelblue") +
  geom_point(colour = "steelblue") +
  scale_x_continuous(breaks = 1:12) +
  labs(
    title = paste("Average Monthly Pattern of", most_common_crime),
    x = "Month",
    y = "Average Incidents"
  ) +
  theme_minimal()

6. Decomposition Analysis

# Wrap the daily counts in a base ts object; frequency = 7 declares a period of
# seven observations, i.e. a weekly seasonal cycle for daily data
crimes_ts <- ts(data = ts_crimes$count, frequency = 7)

# Classical additive decomposition into trend, seasonal and random components
decomp <- decompose(crimes_ts, type = "additive")

# One panel each for the observed series and its three components
plot(decomp)

7. Statistical Summary

# Basic statistics of the daily incident count over the whole series.
# A tsibble keeps its index when summarised, so calling summarise() directly on
# ts_crimes aggregates within each Date and returns one row per day (every
# statistic equals that day's count and sd is NA). Converting to a plain tibble
# first collapses the series into a single overall row.
summary_stats <- ts_crimes %>%
  as_tibble() %>%
  summarise(
    mean_daily = mean(count),
    median_daily = median(count),
    sd_daily = sd(count),
    min_daily = min(count),
    max_daily = max(count)
  )

print(summary_stats)
## # A tsibble: 8,746 x 6 [1D]
##    Date       mean_daily median_daily sd_daily min_daily max_daily
##    <date>          <dbl>        <int>    <dbl>     <int>     <int>
##  1 2001-01-01         41           41       NA        41        41
##  2 2001-01-02         35           35       NA        35        35
##  3 2001-01-03         51           51       NA        51        51
##  4 2001-01-04         55           55       NA        55        55
##  5 2001-01-05         55           55       NA        55        55
##  6 2001-01-06         54           54       NA        54        54
##  7 2001-01-07         36           36       NA        36        36
##  8 2001-01-08         44           44       NA        44        44
##  9 2001-01-09         38           38       NA        38        38
## 10 2001-01-10         41           41       NA        41        41
## # ℹ 8,736 more rows
# Autocorrelation of the daily counts, plotted by lag
acf(x = ts_crimes$count, main = "Autocorrelation Function")

8. Initial Findings

Based on the exploratory data analysis, the findings are organized into three areas:

  1. Data Structure:
    • Time period covered
    • Frequency of observations
    • Quality of data (missing values, outliers)
  2. Temporal Patterns:
    • Overall trend direction
    • Seasonal patterns (weekly, monthly, yearly)
    • Notable anomalies or changes in patterns
  3. Key Statistics:
    • Average daily incidents
    • Variation in incidents
    • Peak periods

These findings will inform our model selection and feature engineering in the next phase.