# Load required libraries
library(tidyverse) # Data wrangling & visualization
## Warning: package 'ggplot2' was built under R version 4.4.2
## Warning: package 'readr' was built under R version 4.4.2
## Warning: package 'lubridate' was built under R version 4.4.2
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr 1.1.4 ✔ readr 2.1.5
## ✔ forcats 1.0.0 ✔ stringr 1.5.1
## ✔ ggplot2 3.5.1 ✔ tibble 3.2.1
## ✔ lubridate 1.9.4 ✔ tidyr 1.3.1
## ✔ purrr 1.0.2
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(lubridate) # Date parsing
library(janitor) # Cleaning column names
## Warning: package 'janitor' was built under R version 4.4.2
##
## Attaching package: 'janitor'
##
## The following objects are masked from 'package:stats':
##
## chisq.test, fisher.test
library(sf) # GIS mapping
## Warning: package 'sf' was built under R version 4.4.2
## Linking to GEOS 3.12.2, GDAL 3.9.3, PROJ 9.4.1; sf_use_s2() is TRUE
library(leaflet) # Interactive maps
## Warning: package 'leaflet' was built under R version 4.4.2
library(shiny) # Dashboard development
## Warning: package 'shiny' was built under R version 4.4.2
library(plotly) # Interactive visualizations
##
## Attaching package: 'plotly'
##
## The following object is masked from 'package:ggplot2':
##
## last_plot
##
## The following object is masked from 'package:stats':
##
## filter
##
## The following object is masked from 'package:graphics':
##
## layout
library(ggthemes) # Themes for ggplot2
## Warning: package 'ggthemes' was built under R version 4.4.2
library(DT) # Interactive tables
## Warning: package 'DT' was built under R version 4.4.2
##
## Attaching package: 'DT'
##
## The following objects are masked from 'package:shiny':
##
## dataTableOutput, renderDataTable
# Load datasets
# All five CSV exports are read from a single directory. It defaults to the
# original location and can be overridden with the CRIME_DATA_DIR environment
# variable, so the path is written once instead of once per file.
data_dir <- Sys.getenv(
  "CRIME_DATA_DIR",
  unset = "C:/Users/meera/OneDrive/Desktop/crime"
)

# Read one CSV from `data_dir` and pass it through janitor::clean_names().
#   file_name: name of the CSV file inside `data_dir`.
#   Returns the data frame produced by read.csv() with cleaned column names.
read_crime_csv <- function(file_name) {
  read.csv(file.path(data_dir, file_name)) %>% clean_names()
}

chicago_data <- read_crime_csv("Chicago_Crimes_-_2001_to_Present.csv")
nyc_data <- read_crime_csv("NYPD_Arrests_Data__Historic_.csv")
la_data <- read_crime_csv("Los_Angeles_Crime_Data_from_2010_to_2019.csv")
philly_data <- read_crime_csv("Philadelphia_Crime.csv")
seattle_data <- read_crime_csv("Seattle_Crime_Data__2008-Present.csv")
# 🔍 Inspect column names for each dataset (already passed through
# clean_names()). The listings show which column in each source holds the
# date, the crime type, and the coordinates; the rename steps that follow
# rely on these names.
colnames(chicago_data)
## [1] "id" "case_number" "date"
## [4] "block" "iucr" "primary_type"
## [7] "description" "location_description" "arrest"
## [10] "domestic" "beat" "district"
## [13] "ward" "community_area" "fbi_code"
## [16] "x_coordinate" "y_coordinate" "year"
## [19] "updated_on" "latitude" "longitude"
## [22] "location"
colnames(nyc_data)
## [1] "arrest_key" "arrest_date" "pd_cd"
## [4] "pd_desc" "ky_cd" "ofns_desc"
## [7] "law_code" "law_cat_cd" "arrest_boro"
## [10] "arrest_precinct" "jurisdiction_code" "age_group"
## [13] "perp_sex" "perp_race" "x_coord_cd"
## [16] "y_coord_cd" "latitude" "longitude"
## [19] "lon_lat"
colnames(la_data)
## [1] "dr_no" "date_rptd" "date_occ" "time_occ"
## [5] "area" "area_name" "rpt_dist_no" "part_1_2"
## [9] "crm_cd" "crm_cd_desc" "mocodes" "vict_age"
## [13] "vict_sex" "vict_descent" "premis_cd" "premis_desc"
## [17] "weapon_used_cd" "weapon_desc" "status" "status_desc"
## [21] "crm_cd_1" "crm_cd_2" "crm_cd_3" "crm_cd_4"
## [25] "location" "cross_street" "lat" "lon"
colnames(philly_data)
## [1] "dc_dist" "psa" "dispatch_date_time"
## [4] "dispatch_date" "dispatch_time" "hour"
## [7] "dc_key" "location_block" "ucr_general"
## [10] "text_general_code" "police_districts" "month"
## [13] "lon" "lat"
colnames(seattle_data)
## [1] "report_number" "offense_id"
## [3] "offense_start_date_time" "offense_end_date_time"
## [5] "report_date_time" "group_a_b"
## [7] "crime_against_category" "offense_parent_group"
## [9] "offense" "offense_code"
## [11] "precinct" "sector"
## [13] "beat" "mcpp"
## [15] "x100_block_address" "longitude"
## [17] "latitude"
# 🔧 Standardize latitude and longitude column names
# Chicago, NYC and Seattle already expose `latitude` / `longitude`, so no
# rename is needed there; just fail fast if either column is missing.
stopifnot(
  all(c("latitude", "longitude") %in% colnames(chicago_data)),
  all(c("latitude", "longitude") %in% colnames(nyc_data)),
  all(c("latitude", "longitude") %in% colnames(seattle_data))
)

# LA and Philadelphia use the short names `lat` / `lon`.
la_data <- la_data %>% rename(latitude = lat, longitude = lon)
philly_data <- philly_data %>% rename(latitude = lat, longitude = lon)
# 🔧 Standardize column names for crime type and date
# Each lookup vector maps new_name = "source_column"; after this step every
# table has a `date` and a `crime_type` column.
chicago_data <- rename(
  chicago_data,
  all_of(c(date = "date", crime_type = "primary_type"))
)
nyc_data <- rename(
  nyc_data,
  all_of(c(date = "arrest_date", crime_type = "ofns_desc"))
)
la_data <- rename(
  la_data,
  all_of(c(date = "date_occ", crime_type = "crm_cd_desc"))
)
philly_data <- rename(
  philly_data,
  all_of(c(date = "dispatch_date", crime_type = "text_general_code"))
)
seattle_data <- rename(
  seattle_data,
  all_of(c(date = "report_date_time", crime_type = "offense_parent_group"))
)
# 🔧 Convert date columns (each source is parsed with a different lubridate
# parser)

# Parse `x` with the lubridate parser `parse_fun`.
#   x:           vector of raw date values read from the CSV.
#   parse_fun:   lubridate parser such as mdy_hms, mdy, ymd or ymd_hms.
#   source_name: label used in the warning message.
#   Returns the parsed vector; values that fail to parse are NA.
# Per-value parser warnings are silenced with `quiet = TRUE`, but the number
# of non-empty values that failed to parse is reported once per source,
# because those rows are removed by the later drop_na() step and would
# otherwise disappear without any trace.
parse_crime_dates <- function(x, parse_fun, source_name) {
  parsed <- parse_fun(x, quiet = TRUE)
  had_value <- !is.na(x) & trimws(as.character(x)) != ""
  n_failed <- sum(is.na(parsed) & had_value)
  if (n_failed > 0) {
    warning(
      source_name, ": ", n_failed, " of ", length(x),
      " date values could not be parsed and were set to NA",
      call. = FALSE
    )
  }
  parsed
}

chicago_data <- chicago_data %>%
  mutate(date = parse_crime_dates(date, mdy_hms, "Chicago"))
nyc_data <- nyc_data %>%
  mutate(date = parse_crime_dates(date, mdy, "New York"))
la_data <- la_data %>%
  mutate(date = parse_crime_dates(date, mdy_hms, "Los Angeles"))
philly_data <- philly_data %>%
  mutate(date = parse_crime_dates(date, ymd, "Philadelphia"))
seattle_data <- seattle_data %>%
  mutate(date = parse_crime_dates(date, ymd_hms, "Seattle"))
# 🔧 Convert latitude and longitude to numeric
# across() applies as.numeric() to both coordinate columns in one step.
chicago_data <- chicago_data %>%
  mutate(across(c(latitude, longitude), as.numeric))
nyc_data <- nyc_data %>%
  mutate(across(c(latitude, longitude), as.numeric))
la_data <- la_data %>%
  mutate(across(c(latitude, longitude), as.numeric))
philly_data <- philly_data %>%
  mutate(across(c(latitude, longitude), as.numeric))
seattle_data <- seattle_data %>%
  mutate(across(c(latitude, longitude), as.numeric))
# 🔧 Remove rows with missing values in key columns
# A row is kept only if date, coordinates and crime type are all present.
chicago_data <- drop_na(chicago_data, date, latitude, longitude, crime_type)
nyc_data <- drop_na(nyc_data, date, latitude, longitude, crime_type)
la_data <- drop_na(la_data, date, latitude, longitude, crime_type)
philly_data <- drop_na(philly_data, date, latitude, longitude, crime_type)
seattle_data <- drop_na(seattle_data, date, latitude, longitude, crime_type)
# ✅ Merge datasets after cleaning
# Column names present in all five tables, found by intersecting the name
# vectors one table at a time
common_cols <- colnames(chicago_data) %>%
  intersect(colnames(nyc_data)) %>%
  intersect(colnames(la_data)) %>%
  intersect(colnames(philly_data)) %>%
  intersect(colnames(seattle_data))

# Reduce every table to the shared columns
chicago_data <- select(chicago_data, all_of(common_cols))
nyc_data <- select(nyc_data, all_of(common_cols))
la_data <- select(la_data, all_of(common_cols))
philly_data <- select(philly_data, all_of(common_cols))
seattle_data <- select(seattle_data, all_of(common_cols))

# Stack the tables, tagging each row with the city it came from
all_data <- bind_rows(
  mutate(chicago_data, city = "Chicago"),
  mutate(nyc_data, city = "New York"),
  mutate(la_data, city = "Los Angeles"),
  mutate(philly_data, city = "Philadelphia"),
  mutate(seattle_data, city = "Seattle")
)
# 🔍 Verify merged dataset: row count, column types and a preview of each
# column
glimpse(all_data)
## Rows: 17,974,641
## Columns: 5
## $ date <dttm> 2021-05-24 15:06:00, 2021-06-26 09:24:00, 2023-11-09 07:30…
## $ crime_type <chr> "HOMICIDE", "HOMICIDE", "BURGLARY", "BATTERY", "CRIMINAL DA…
## $ latitude <dbl> 41.91784, 41.99522, 41.95235, 41.73775, 41.88602, 41.99491,…
## $ longitude <dbl> -87.75597, -87.71335, -87.67798, -87.60486, -87.63394, -87.…
## $ city <chr> "Chicago", "Chicago", "Chicago", "Chicago", "Chicago", "Chi…
# Check the span and quartiles of the parsed dates across all cities
summary(all_data$date)
## Min. 1st Qu.
## "2001-01-01 00:00:00.0000" "2008-01-14 00:00:00.0000"
## Median Mean
## "2012-03-20 00:00:00.0000" "2012-06-08 14:59:32.5429"
## 3rd Qu. Max.
## "2016-08-31 00:00:00.0000" "2025-01-13 00:00:00.0000"
2. Exploratory Data Analysis (EDA)
2.1 Crime Type Distribution Across Cities
# Count records for every city / crime-type pair
crime_distribution <- all_data %>%
  group_by(city, crime_type) %>%
  summarise(total_crimes = n(), .groups = "drop")

# Rank crime types by their combined count across cities and keep the
# 20 largest so the chart stays legible
top_crimes <- crime_distribution %>%
  group_by(crime_type) %>%
  summarise(total_count = sum(total_crimes)) %>%
  slice_max(total_count, n = 20)

# Horizontal dodged bars: one bar per city within each crime type
crime_distribution %>%
  filter(crime_type %in% top_crimes$crime_type) %>%
  ggplot(aes(
    x = reorder(crime_type, total_crimes),
    y = total_crimes,
    fill = city
  )) +
  geom_col(position = "dodge") +
  coord_flip() +
  # Stagger the crime-type labels over two columns so they do not overlap
  scale_x_discrete(guide = guide_axis(n.dodge = 2)) +
  scale_fill_brewer(palette = "Set2") +
  theme_minimal() +
  labs(
    title = "Top 20 Crime Types Across Cities",
    x = "Crime Type",
    y = "Number of Crimes",
    fill = "City"
  )

2.2 Trends Over Time
# Count records per city and calendar year
crime_trends <- all_data %>%
  mutate(year = year(date)) %>%
  group_by(city, year) %>%
  summarize(crime_count = n(), .groups = "drop")

# Line plot showing crime trends over years, one line per city.
# Line thickness is set with `linewidth`; using `size` for lines has been
# deprecated since ggplot2 3.4.0 and triggers a lifecycle warning.
ggplot(crime_trends, aes(x = year, y = crime_count, color = city)) +
  geom_line(linewidth = 1) +
  geom_point() +
  labs(
    title = "Crime Trends Over Time by City",
    x = "Year",
    y = "Number of Crimes",
    color = "City"
  ) +
  theme_minimal() +
  scale_color_brewer(palette = "Set1")

2.3 Top Crime Types in Each City
library(tidytext)
## Warning: package 'tidytext' was built under R version 4.4.2
library(forcats) # Ensure forcats is loaded for fct_reorder()
# Get the top 5 crime types for each city.
# slice_max() runs per city because of group_by(city); ungroup() afterwards so
# `top_crime_types` does not stay grouped for whatever uses it next.
top_crime_types <- all_data %>%
  group_by(city, crime_type) %>%
  summarize(total_crimes = n(), .groups = "drop") %>%
  group_by(city) %>%
  slice_max(total_crimes, n = 5) %>%
  ungroup()

# Bar chart for top crime types by city: bars for the same crime type are
# stacked and coloured by city
ggplot(
  top_crime_types,
  aes(
    x = fct_reorder(crime_type, total_crimes),
    y = total_crimes,
    fill = city
  )
) +
  geom_bar(stat = "identity") +
  coord_flip() +
  labs(
    title = "Top 5 Crime Types by City",
    x = "Crime Type",
    y = "Number of Crimes",
    fill = "City"
  ) +
  theme_minimal() +
  scale_fill_brewer(palette = "Set3")

2.4 Total Crimes Across Cities
# One row per city with its overall record count
city_crime_totals <- all_data %>%
  group_by(city) %>%
  summarise(total_crimes = n(), .groups = "drop")

# Vertical bars, cities ordered from the largest total to the smallest
city_crime_totals %>%
  ggplot(aes(
    x = reorder(city, -total_crimes),
    y = total_crimes,
    fill = city
  )) +
  geom_col() +
  scale_fill_brewer(palette = "Set1") +
  theme_minimal() +
  labs(
    title = "Total Crimes Across Cities",
    x = "City",
    y = "Total Crimes",
    fill = "City"
  )

2.5 Interactive Table of Crime Data
# Interactive table of the 1000 rows with the latest dates, 10 rows per page
# with horizontal scrolling enabled
all_data %>%
  select(city, date, crime_type, latitude, longitude) %>%
  arrange(desc(date)) %>%
  head(1000) %>%
  datatable(
    options = list(pageLength = 10, scrollX = TRUE),
    rownames = FALSE
  )
2.6 Crime Totals by City (2013 Onward)
# Count records per city, keeping only rows dated 2013 or later
crime_summary <- all_data %>%
  filter(year(date) >= 2013) %>%
  group_by(city) %>%
  summarise(total_crimes = n())

# Load U.S. state-level map polygons
# NOTE(review): `us_states` is not referenced again in the visible code --
# confirm whether a map for this section is still planned.
us_states <- map_data("state")

# Lower-case the city names, intended to match the map data
# NOTE(review): the map data is state-level while these values are city
# names, so a direct join presumably would not match -- verify the intended
# key. `crime_summary` is also reassigned in section 3.2.
crime_summary <- mutate(crime_summary, city = tolower(city))
3. GIS Mapping: Interactive Crime Hotspots
3.1 Use a Random Sample of Crimes
# Draw a random sample of 10,000 rows (adjust as needed) so the map stays
# responsive. slice_sample() replaces the superseded sample_n().
crime_sample <- all_data %>% slice_sample(n = 10000)

# One red circle per sampled record; the popup shows city, crime type and
# date. The initial view is centered on the U.S.
leaflet(crime_sample) %>%
  addTiles() %>%
  addCircleMarkers(
    lng = ~longitude, lat = ~latitude,
    radius = 3, color = "red", stroke = FALSE, fillOpacity = 0.5,
    popup = ~paste0("<strong>City:</strong> ", city, "<br>",
                    "<strong>Crime Type:</strong> ", crime_type, "<br>",
                    "<strong>Date:</strong> ", as.character(date))
  ) %>%
  setView(lng = -95, lat = 37, zoom = 4)
3.2 Aggregate Crime Data by City
# Per-city summary: mean coordinates (used as the marker position) and the
# number of records
crime_summary <- summarise(
  group_by(all_data, city),
  avg_lat = mean(latitude, na.rm = TRUE),
  avg_lon = mean(longitude, na.rm = TRUE),
  total_crimes = n()
)

# One blue circle per city; the radius grows with the square root of the
# record count
crime_summary %>%
  leaflet() %>%
  addTiles() %>%
  addCircleMarkers(
    lat = ~avg_lat,
    lng = ~avg_lon,
    radius = ~sqrt(total_crimes) / 10,
    color = "blue",
    fillOpacity = 0.6,
    popup = ~paste0("<strong>City:</strong> ", city, "<br>",
                    "<strong>Total Crimes:</strong> ", total_crimes)
  ) %>%
  setView(lng = -95, lat = 37, zoom = 4)
4. Interactive Dashboard (Shiny App)
library(shiny)
library(tidyverse)
library(plotly)
library(leaflet)
library(DT)
# Define UI: a sidebar with three filters (city, crime type, year range) and
# a main panel with one tab per output
ui <- fluidPage(
  titlePanel(title = "Crime Trends Dashboard"),
  sidebarLayout(
    sidebarPanel = sidebarPanel(
      selectInput(
        inputId = "city",
        label = "Select City",
        choices = unique(all_data$city),
        selected = "Chicago"
      ),
      selectInput(
        inputId = "crime_type",
        label = "Select Crime Type",
        choices = unique(all_data$crime_type)
      ),
      # Slider limits come from the years present in the merged data
      sliderInput(
        inputId = "year_range",
        label = "Select Year Range",
        min = min(year(all_data$date)),
        max = max(year(all_data$date)),
        value = c(2010, 2023),
        step = 1,
        sep = ""
      )
    ),
    mainPanel = mainPanel(
      tabsetPanel(
        tabPanel(
          title = "Crime Trends Over Time",
          plotlyOutput(outputId = "crime_trend_plot")
        ),
        tabPanel(
          title = "Crime Type Distribution",
          plotlyOutput(outputId = "crime_type_plot")
        ),
        tabPanel(
          title = "Top Crime Types by City",
          plotlyOutput(outputId = "top_crime_types_plot")
        ),
        tabPanel(
          title = "Total Crimes Across Cities",
          plotlyOutput(outputId = "total_crimes_plot")
        ),
        tabPanel(
          title = "Crime Data Table",
          DTOutput(outputId = "crime_table")
        ),
        tabPanel(
          title = "Crime Map",
          leafletOutput(outputId = "crime_map")
        )
      )
    )
  )
)
# Define Server Logic
# Shiny server function for the dashboard.
#   input:   reads `city`, `crime_type` and `year_range`.
#   output:  fills `crime_trend_plot`, `crime_type_plot`,
#            `top_crime_types_plot`, `total_crimes_plot`, `crime_table`
#            and `crime_map`.
#   session: used to update the crime-type dropdown.
server <- function(input, output, session) {
  # Offer only the crime types recorded for the selected city. The merged
  # data takes `crime_type` from a different source column per city, so the
  # full list can contain labels that yield an empty selection for a city.
  observeEvent(input$city, {
    city_crime_types <- all_data %>%
      filter(city == input$city) %>%
      distinct(crime_type) %>%
      pull(crime_type) %>%
      sort()
    updateSelectInput(session, "crime_type", choices = city_crime_types)
  })

  # Filtered data based on input (city, crime type and inclusive year range)
  filtered_data <- reactive({
    # Wait until both dropdowns hold a value
    req(input$city, input$crime_type)
    all_data %>%
      filter(
        city == input$city,
        crime_type == input$crime_type,
        between(year(date), input$year_range[1], input$year_range[2])
      )
  })

  # 1️⃣ Crime Trends Over Time: yearly counts for the current selection
  output$crime_trend_plot <- renderPlotly({
    trend_data <- filtered_data() %>%
      group_by(year = year(date)) %>%
      summarize(crime_count = n())
    # `linewidth` replaces `size`, which is deprecated for lines since
    # ggplot2 3.4.0
    ggplot(trend_data, aes(x = year, y = crime_count)) +
      geom_line(linewidth = 1, color = "blue") +
      geom_point() +
      labs(title = "Crime Trends Over Time", x = "Year", y = "Crime Count") +
      theme_minimal()
  })

  # 2️⃣ Crime Type Distribution: all cities, restricted to the selected
  # year range, limited to the 20 crime types with the highest totals
  output$crime_type_plot <- renderPlotly({
    crime_distribution <- all_data %>%
      filter(between(year(date), input$year_range[1], input$year_range[2])) %>%
      group_by(city, crime_type) %>%
      summarize(total_crimes = n(), .groups = "drop")
    top_crimes <- crime_distribution %>%
      group_by(crime_type) %>%
      summarize(total_count = sum(total_crimes)) %>%
      slice_max(total_count, n = 20)
    ggplot(
      crime_distribution %>% filter(crime_type %in% top_crimes$crime_type),
      aes(x = reorder(crime_type, total_crimes), y = total_crimes, fill = city)
    ) +
      geom_bar(stat = "identity", position = "dodge") +
      coord_flip() +
      labs(title = "Top 20 Crime Types", x = "Crime Type", y = "Number of Crimes") +
      theme_minimal()
  })

  # 3️⃣ Top Crime Types in Each City (does not depend on the sidebar inputs)
  output$top_crime_types_plot <- renderPlotly({
    top_crime_types <- all_data %>%
      group_by(city, crime_type) %>%
      summarize(total_crimes = n(), .groups = "drop") %>%
      group_by(city) %>%
      slice_max(total_crimes, n = 5)
    ggplot(
      top_crime_types,
      aes(x = fct_reorder(crime_type, total_crimes), y = total_crimes, fill = city)
    ) +
      geom_bar(stat = "identity") +
      coord_flip() +
      labs(title = "Top 5 Crime Types by City", x = "Crime Type", y = "Total Crimes") +
      theme_minimal()
  })

  # 4️⃣ Total Crimes Across Cities (does not depend on the sidebar inputs)
  output$total_crimes_plot <- renderPlotly({
    city_crime_totals <- all_data %>%
      group_by(city) %>%
      summarize(total_crimes = n(), .groups = "drop")
    ggplot(
      city_crime_totals,
      aes(x = reorder(city, -total_crimes), y = total_crimes, fill = city)
    ) +
      geom_bar(stat = "identity") +
      labs(title = "Total Crimes Across Cities", x = "City", y = "Total Crimes") +
      theme_minimal()
  })

  # 5️⃣ Interactive Data Table: the 1000 latest rows of the current selection
  output$crime_table <- renderDT({
    datatable(
      filtered_data() %>%
        select(city, date, crime_type, latitude, longitude) %>%
        arrange(desc(date)) %>%
        head(1000),
      options = list(pageLength = 10, scrollX = TRUE),
      rownames = FALSE
    )
  })

  # 6️⃣ Interactive Crime Map: at most 5000 randomly chosen rows of the
  # current selection
  output$crime_map <- renderLeaflet({
    # slice_sample() replaces the superseded sample_n() and returns every row
    # when fewer than `n` are available, so no explicit min() with the row
    # count is needed
    crime_sample <- filtered_data() %>% slice_sample(n = 5000)
    leaflet(crime_sample) %>%
      addTiles() %>%
      addCircleMarkers(
        lng = ~longitude, lat = ~latitude,
        radius = 3, color = "red", stroke = FALSE, fillOpacity = 0.5,
        popup = ~paste0("<strong>City:</strong> ", city, "<br>",
                        "<strong>Crime Type:</strong> ", crime_type, "<br>",
                        "<strong>Date:</strong> ", as.character(date))
      ) %>%
      setView(lng = -95, lat = 37, zoom = 4)
  })
}
# Launch the dashboard built from the `ui` and `server` objects
shinyApp(ui = ui, server = server)
Shiny applications not supported in static R Markdown documents