# Load required libraries
library(tidyverse)     # Data wrangling & visualization

## Warning: package 'ggplot2' was built under R version 4.4.2

## Warning: package 'readr' was built under R version 4.4.2

## Warning: package 'lubridate' was built under R version 4.4.2

## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr     1.1.4     ✔ readr     2.1.5
## ✔ forcats   1.0.0     ✔ stringr   1.5.1
## ✔ ggplot2   3.5.1     ✔ tibble    3.2.1
## ✔ lubridate 1.9.4     ✔ tidyr     1.3.1
## ✔ purrr     1.0.2     
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors

library(lubridate)     # Date parsing
library(janitor)       # Cleaning column names

## Warning: package 'janitor' was built under R version 4.4.2

## 
## Attaching package: 'janitor'
## 
## The following objects are masked from 'package:stats':
## 
##     chisq.test, fisher.test

library(sf)            # GIS mapping

## Warning: package 'sf' was built under R version 4.4.2

## Linking to GEOS 3.12.2, GDAL 3.9.3, PROJ 9.4.1; sf_use_s2() is TRUE

library(leaflet)       # Interactive maps

## Warning: package 'leaflet' was built under R version 4.4.2

library(shiny)         # Dashboard development

## Warning: package 'shiny' was built under R version 4.4.2

library(plotly)        # Interactive visualizations

## 
## Attaching package: 'plotly'
## 
## The following object is masked from 'package:ggplot2':
## 
##     last_plot
## 
## The following object is masked from 'package:stats':
## 
##     filter
## 
## The following object is masked from 'package:graphics':
## 
##     layout

library(ggthemes)      # Themes for ggplot2

## Warning: package 'ggthemes' was built under R version 4.4.2

library(DT)           # Interactive tables

## Warning: package 'DT' was built under R version 4.4.2

## 
## Attaching package: 'DT'
## 
## The following objects are masked from 'package:shiny':
## 
##     dataTableOutput, renderDataTable

# Load datasets
# The folder holding the raw CSV exports can be overridden through the
# CRIME_DATA_DIR environment variable. When it is unset, the original
# hard-coded location is used, so existing runs behave exactly as before.
data_dir <- Sys.getenv(
  "CRIME_DATA_DIR",
  unset = "C:/Users/meera/OneDrive/Desktop/crime"
)

# Read one CSV export from `data_dir` and pass it through clean_names().
# Stops early with the full path when the file does not exist.
read_crime_csv <- function(file_name) {
  csv_path <- file.path(data_dir, file_name)
  if (!file.exists(csv_path)) {
    stop("Crime data file not found: ", csv_path, call. = FALSE)
  }
  read.csv(csv_path) %>% clean_names()
}

chicago_data <- read_crime_csv("Chicago_Crimes_-_2001_to_Present.csv")
nyc_data <- read_crime_csv("NYPD_Arrests_Data__Historic_.csv")
la_data <- read_crime_csv("Los_Angeles_Crime_Data_from_2010_to_2019.csv")
philly_data <- read_crime_csv("Philadelphia_Crime.csv")
seattle_data <- read_crime_csv("Seattle_Crime_Data__2008-Present.csv")

# 🔍 Inspect column names for each dataset
# Chicago: `date`, `primary_type`, `latitude` and `longitude` are the fields
# carried into the merged data further down.
colnames(chicago_data)

##  [1] "id"                   "case_number"          "date"                
##  [4] "block"                "iucr"                 "primary_type"        
##  [7] "description"          "location_description" "arrest"              
## [10] "domestic"             "beat"                 "district"            
## [13] "ward"                 "community_area"       "fbi_code"            
## [16] "x_coordinate"         "y_coordinate"         "year"                
## [19] "updated_on"           "latitude"             "longitude"           
## [22] "location"

# NYC arrests: `arrest_date` and `ofns_desc` are renamed to `date` and
# `crime_type` further down; coordinates are already `latitude`/`longitude`.
colnames(nyc_data)

##  [1] "arrest_key"        "arrest_date"       "pd_cd"            
##  [4] "pd_desc"           "ky_cd"             "ofns_desc"        
##  [7] "law_code"          "law_cat_cd"        "arrest_boro"      
## [10] "arrest_precinct"   "jurisdiction_code" "age_group"        
## [13] "perp_sex"          "perp_race"         "x_coord_cd"       
## [16] "y_coord_cd"        "latitude"          "longitude"        
## [19] "lon_lat"

# Los Angeles: `date_occ`, `crm_cd_desc`, `lat` and `lon` are the fields that
# get renamed to the shared column names further down.
colnames(la_data)

##  [1] "dr_no"          "date_rptd"      "date_occ"       "time_occ"      
##  [5] "area"           "area_name"      "rpt_dist_no"    "part_1_2"      
##  [9] "crm_cd"         "crm_cd_desc"    "mocodes"        "vict_age"      
## [13] "vict_sex"       "vict_descent"   "premis_cd"      "premis_desc"   
## [17] "weapon_used_cd" "weapon_desc"    "status"         "status_desc"   
## [21] "crm_cd_1"       "crm_cd_2"       "crm_cd_3"       "crm_cd_4"      
## [25] "location"       "cross_street"   "lat"            "lon"

# Philadelphia: `dispatch_date`, `text_general_code`, `lat` and `lon` are the
# fields that get renamed to the shared column names further down.
colnames(philly_data)

##  [1] "dc_dist"            "psa"                "dispatch_date_time"
##  [4] "dispatch_date"      "dispatch_time"      "hour"              
##  [7] "dc_key"             "location_block"     "ucr_general"       
## [10] "text_general_code"  "police_districts"   "month"             
## [13] "lon"                "lat"

# Seattle: `report_date_time` and `offense_parent_group` are renamed to `date`
# and `crime_type` further down; coordinates are already named as needed.
colnames(seattle_data)

##  [1] "report_number"           "offense_id"             
##  [3] "offense_start_date_time" "offense_end_date_time"  
##  [5] "report_date_time"        "group_a_b"              
##  [7] "crime_against_category"  "offense_parent_group"   
##  [9] "offense"                 "offense_code"           
## [11] "precinct"                "sector"                 
## [13] "beat"                    "mcpp"                   
## [15] "x100_block_address"      "longitude"              
## [17] "latitude"

# 🔧 Standardize latitude and longitude column names
# Chicago, NYC and Seattle already expose `latitude` / `longitude` (see the
# colnames() output above), so renaming them to themselves was a no-op and has
# been dropped. Only Los Angeles and Philadelphia use `lat` / `lon`.
la_data <- la_data %>% rename(latitude = lat, longitude = lon)
philly_data <- philly_data %>% rename(latitude = lat, longitude = lon)

# 🔧 Standardize column names for crime type and date
# After this step every dataset has a `date` and a `crime_type` column.
# Chicago's date column already carries the target name, so only its crime
# type column is renamed.
chicago_data <- rename(chicago_data, crime_type = primary_type)
nyc_data <- rename(
  nyc_data,
  date = arrest_date,
  crime_type = ofns_desc
)
la_data <- rename(
  la_data,
  date = date_occ,
  crime_type = crm_cd_desc
)
philly_data <- rename(
  philly_data,
  date = dispatch_date,
  crime_type = text_general_code
)
seattle_data <- rename(
  seattle_data,
  date = report_date_time,
  crime_type = offense_parent_group
)

# 🔧 Convert date columns (each source stores dates in a different text format)

# Parse `x` with the given lubridate parser (`mdy`, `ymd`, `mdy_hms`, ...).
# Values that cannot be parsed become NA, exactly as before, but the number of
# failures is now reported instead of being hidden by suppressWarnings():
# those rows are removed by the drop_na() step further down, so a wrong format
# would otherwise shrink or empty a dataset without any trace.
#   x           - vector of raw date values
#   parser      - lubridate parsing function accepting a `quiet` argument
#   source_name - label used in the diagnostic message
# Returns the parsed vector, the same length as `x`.
parse_date_column <- function(x, parser, source_name) {
  parsed <- parser(x, quiet = TRUE)
  had_value <- !is.na(x) & nzchar(as.character(x))
  n_failed <- sum(is.na(parsed) & had_value)
  if (n_failed > 0) {
    message(
      source_name, ": ", n_failed, " of ", length(x),
      " date values could not be parsed and were set to NA"
    )
  }
  parsed
}

chicago_data <- chicago_data %>%
  mutate(date = parse_date_column(date, mdy_hms, "Chicago"))
nyc_data <- nyc_data %>%
  mutate(date = parse_date_column(date, mdy, "New York"))
la_data <- la_data %>%
  mutate(date = parse_date_column(date, mdy_hms, "Los Angeles"))
philly_data <- philly_data %>%
  mutate(date = parse_date_column(date, ymd, "Philadelphia"))
seattle_data <- seattle_data %>%
  mutate(date = parse_date_column(date, ymd_hms, "Seattle"))

# 🔧 Convert latitude and longitude to numeric
# Both coordinate columns of every dataset are coerced with as.numeric().
chicago_data <- mutate(chicago_data, across(c(latitude, longitude), as.numeric))
nyc_data <- mutate(nyc_data, across(c(latitude, longitude), as.numeric))
la_data <- mutate(la_data, across(c(latitude, longitude), as.numeric))
philly_data <- mutate(philly_data, across(c(latitude, longitude), as.numeric))
seattle_data <- mutate(seattle_data, across(c(latitude, longitude), as.numeric))

# 🔧 Remove rows with missing values in key columns
# A row is kept only when date, coordinates and crime type are all non-NA.
chicago_data <- drop_na(
  chicago_data,
  date, latitude, longitude, crime_type
)
nyc_data <- drop_na(
  nyc_data,
  date, latitude, longitude, crime_type
)
la_data <- drop_na(
  la_data,
  date, latitude, longitude, crime_type
)
philly_data <- drop_na(
  philly_data,
  date, latitude, longitude, crime_type
)
seattle_data <- drop_na(
  seattle_data,
  date, latitude, longitude, crime_type
)

# ✅ Merge datasets after cleaning (selecting only common columns)
# Column names present in all five datasets, in Chicago's column order.
common_cols <- colnames(chicago_data) %>%
  intersect(colnames(nyc_data)) %>%
  intersect(colnames(la_data)) %>%
  intersect(colnames(philly_data)) %>%
  intersect(colnames(seattle_data))

# Trim every dataset down to the shared columns.
chicago_data <- select(chicago_data, all_of(common_cols))
nyc_data <- select(nyc_data, all_of(common_cols))
la_data <- select(la_data, all_of(common_cols))
philly_data <- select(philly_data, all_of(common_cols))
seattle_data <- select(seattle_data, all_of(common_cols))

# Stack the datasets, tagging each row with the city it came from.
all_data <- bind_rows(
  mutate(chicago_data, city = "Chicago"),
  mutate(nyc_data, city = "New York"),
  mutate(la_data, city = "Los Angeles"),
  mutate(philly_data, city = "Philadelphia"),
  mutate(seattle_data, city = "Seattle")
)

# 🔍 Verify merged dataset
# glimpse() prints the row count plus each column's type and first values;
# the merged data should hold the shared columns and the added `city` column.
glimpse(all_data)

## Rows: 17,974,641
## Columns: 5
## $ date       <dttm> 2021-05-24 15:06:00, 2021-06-26 09:24:00, 2023-11-09 07:30…
## $ crime_type <chr> "HOMICIDE", "HOMICIDE", "BURGLARY", "BATTERY", "CRIMINAL DA…
## $ latitude   <dbl> 41.91784, 41.99522, 41.95235, 41.73775, 41.88602, 41.99491,…
## $ longitude  <dbl> -87.75597, -87.71335, -87.67798, -87.60486, -87.63394, -87.…
## $ city       <chr> "Chicago", "Chicago", "Chicago", "Chicago", "Chicago", "Chi…

# Range and quartiles of the parsed dates, as a quick plausibility check.
summary(all_data$date)

##                       Min.                    1st Qu. 
## "2001-01-01 00:00:00.0000" "2008-01-14 00:00:00.0000" 
##                     Median                       Mean 
## "2012-03-20 00:00:00.0000" "2012-06-08 14:59:32.5429" 
##                    3rd Qu.                       Max. 
## "2016-08-31 00:00:00.0000" "2025-01-13 00:00:00.0000"

2. Exploratory Data Analysis (EDA)

2.1 Crime Type Distribution Across Cities

# Number of records for every city / crime type combination
crime_distribution <- all_data %>%
  group_by(city, crime_type) %>%
  summarize(total_crimes = n(), .groups = "drop")

# The 20 crime types with the highest count summed over all cities
# (keeps the chart from becoming cluttered)
top_crimes <- crime_distribution %>%
  group_by(crime_type) %>%
  summarize(total_count = sum(total_crimes)) %>%
  slice_max(total_count, n = 20)

# Horizontal dodged bars: one bar per city within each crime type
crime_distribution %>%
  filter(crime_type %in% top_crimes$crime_type) %>%
  ggplot(aes(
    x = reorder(crime_type, total_crimes),
    y = total_crimes,
    fill = city
  )) +
  geom_col(position = "dodge") +
  coord_flip() +
  labs(
    title = "Top 20 Crime Types Across Cities",
    x = "Crime Type",
    y = "Number of Crimes",
    fill = "City"
  ) +
  theme_minimal() +
  scale_fill_brewer(palette = "Set2") +
  # Stagger the crime-type labels over two columns so they stay readable
  scale_x_discrete(guide = guide_axis(n.dodge = 2))

2.2 Trends Over Time

# Group data by year and city: one row per city and calendar year with the
# number of records in that year
crime_trends <- all_data %>%
  mutate(year = year(date)) %>%
  group_by(city, year) %>%
  summarize(crime_count = n(), .groups = "drop")

# Line plot showing crime trends over years
# `linewidth` replaces `size` for line geoms; `size` has been deprecated for
# lines since ggplot2 3.4.0 and triggered a warning when this chunk was run.
ggplot(crime_trends, aes(x = year, y = crime_count, color = city)) +
  geom_line(linewidth = 1) +
  geom_point() +
  labs(
    title = "Crime Trends Over Time by City",
    x = "Year",
    y = "Number of Crimes",
    color = "City"
  ) +
  theme_minimal() +
  scale_color_brewer(palette = "Set1")

## Warning: Using `size` aesthetic for lines was deprecated in ggplot2 3.4.0.
## ℹ Please use `linewidth` instead.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.

2.3 Top Crime Types in Each City

library(tidytext)

## Warning: package 'tidytext' was built under R version 4.4.2

library(forcats)  # Ensure forcats is loaded for fct_reorder()

# Get the top 5 crime types for each city
# slice_max() keeps ties, so a city can contribute more than five rows.
# The result is ungrouped so later operations on it are not silently per-city.
top_crime_types <- all_data %>%
  group_by(city, crime_type) %>%
  summarize(total_crimes = n(), .groups = "drop") %>%
  group_by(city) %>%
  slice_max(total_crimes, n = 5) %>%
  ungroup()

# Bar chart for top crime types by city
# One panel per city. reorder_within() / scale_y_reordered() (tidytext) order
# the bars inside each panel by that city's own counts; a single shared axis
# ordered across all cities stacked unrelated cities on the same bar and hid
# the per-city ranking. Crime types sit on the y axis directly, so no
# coord_flip() is needed.
ggplot(
  top_crime_types,
  aes(
    x = total_crimes,
    y = reorder_within(crime_type, total_crimes, city),
    fill = city
  )
) +
  geom_col(show.legend = FALSE) +  # panel titles already name the city
  scale_y_reordered() +
  facet_wrap(~city, scales = "free_y") +
  labs(
    title = "Top 5 Crime Types by City",
    x = "Number of Crimes",
    y = "Crime Type",
    fill = "City"
  ) +
  theme_minimal() +
  scale_fill_brewer(palette = "Set3")

2.4 Total Crimes Across Cities

# Number of records per city
city_crime_totals <- all_data %>%
  group_by(city) %>%
  summarize(total_crimes = n(), .groups = "drop")

# Bars ordered from the city with the most records to the one with the fewest
city_crime_totals %>%
  ggplot(aes(
    x = reorder(city, -total_crimes),
    y = total_crimes,
    fill = city
  )) +
  geom_col() +
  labs(
    title = "Total Crimes Across Cities",
    x = "City",
    y = "Total Crimes",
    fill = "City"
  ) +
  theme_minimal() +
  scale_fill_brewer(palette = "Set1")

2.5 Interactive Table of Crime Data

# Interactive table of the 1000 most recent records (sorted by date,
# newest first), 10 rows per page with horizontal scrolling.
all_data %>%
  select(city, date, crime_type, latitude, longitude) %>%
  arrange(desc(date)) %>%
  slice_head(n = 1000) %>%
  datatable(
    options = list(pageLength = 10, scrollX = TRUE),
    rownames = FALSE
  )

2.6 Crime Totals by City Since 2013

# Record counts per city for 2013 onwards, with the city name lower-cased
# (intended to match the map data loaded below).
# NOTE(review): nothing visible in this document uses this `crime_summary` or
# `us_states`; `crime_summary` is reassigned in section 3.2. The city names are
# presumably not going to match the state-level regions of the map data --
# confirm before building a plot on top of this.
crime_summary <- all_data %>%
  filter(year(date) >= 2013) %>%
  group_by(city) %>%
  summarize(total_crimes = n()) %>%
  mutate(city = tolower(city))

# U.S. state-level map outlines
us_states <- map_data("state")

3. GIS Mapping: Interactive Crime Hotspots

3.1 Use a Random Sample of Crimes

# Reduce dataset size to at most 10,000 randomly chosen rows (adjust as needed)
# slice_sample() replaces the superseded sample_n(). Unlike sample_n(), it
# returns all rows instead of failing when fewer than 10,000 are available.
crime_sample <- all_data %>% slice_sample(n = 10000)

# One small red dot per sampled record; clicking a dot shows city, crime type
# and date.
leaflet(crime_sample) %>%
  addTiles() %>%
  addCircleMarkers(
    lng = ~longitude, lat = ~latitude,
    radius = 3, color = "red", stroke = FALSE, fillOpacity = 0.5,
    popup = ~paste0("<strong>City:</strong> ", city, "<br>",
                    "<strong>Crime Type:</strong> ", crime_type, "<br>",
                    "<strong>Date:</strong> ", as.character(date))
  ) %>%
  setView(lng = -95, lat = 37, zoom = 4)  # Centered on the U.S.

3.2 Aggregate Crime Data by City

# One row per city: mean latitude / longitude of its records (used as the
# marker position) and the number of records.
crime_summary <- summarize(
  group_by(all_data, city),
  avg_lat = mean(latitude, na.rm = TRUE),
  avg_lon = mean(longitude, na.rm = TRUE),
  total_crimes = n()
)

# One blue circle per city; the radius grows with the square root of the
# record count and the popup shows the city name and its total.
leaflet(crime_summary) %>%
  addTiles() %>%
  addCircleMarkers(
    lng = ~avg_lon,
    lat = ~avg_lat,
    radius = ~sqrt(total_crimes) / 10,
    color = "blue",
    fillOpacity = 0.6,
    popup = ~paste0(
      "<strong>City:</strong> ", city, "<br>",
      "<strong>Total Crimes:</strong> ", total_crimes
    )
  ) %>%
  setView(lng = -95, lat = 37, zoom = 4)

4. Interactive Dashboard (Shiny App)

library(shiny)
library(tidyverse)
library(plotly)
library(leaflet)
library(DT)

# Define UI
# Page layout: a sidebar with the three filters (city, crime type, year range)
# next to a tab set with one tab per output rendered by `server`.
ui <- local({
  # Earliest and latest year present in the data bound the year slider
  year_limits <- range(year(all_data$date))

  filter_panel <- sidebarPanel(
    selectInput(
      "city", "Select City",
      choices = unique(all_data$city),
      selected = "Chicago"
    ),
    selectInput(
      "crime_type", "Select Crime Type",
      choices = unique(all_data$crime_type)
    ),
    sliderInput(
      "year_range", "Select Year Range",
      min = year_limits[1],
      max = year_limits[2],
      value = c(2010, 2023),
      step = 1, sep = ""
    )
  )

  results_panel <- mainPanel(
    tabsetPanel(
      tabPanel("Crime Trends Over Time", plotlyOutput("crime_trend_plot")),
      tabPanel("Crime Type Distribution", plotlyOutput("crime_type_plot")),
      tabPanel("Top Crime Types by City", plotlyOutput("top_crime_types_plot")),
      tabPanel("Total Crimes Across Cities", plotlyOutput("total_crimes_plot")),
      tabPanel("Crime Data Table", DTOutput("crime_table")),
      tabPanel("Crime Map", leafletOutput("crime_map"))
    )
  )

  fluidPage(
    titlePanel("Crime Trends Dashboard"),
    sidebarLayout(filter_panel, results_panel)
  )
})

# Define Server Logic
# Shiny server function. Reads the sidebar inputs `city`, `crime_type` and
# `year_range` and fills the six outputs declared in `ui`. Everything is
# computed from the global `all_data`.
server <- function(input, output, session) {

  # Filtered data based on input: rows of `all_data` for the selected city,
  # crime type and year range.
  # The crime-type choices are pooled across all cities while the filter is
  # per city, so a selection can legitimately match nothing. validate() then
  # shows a readable message in every output that depends on this reactive
  # instead of rendering from an empty data frame.
  filtered_data <- reactive({
    selection <- all_data %>%
      filter(city == input$city, crime_type == input$crime_type,
             between(year(date), input$year_range[1], input$year_range[2]))

    validate(need(
      nrow(selection) > 0,
      "No records match the selected city, crime type and year range."
    ))
    selection
  })

  # 1️⃣ Crime Trends Over Time: yearly record count for the current selection
  output$crime_trend_plot <- renderPlotly({
    trend_data <- filtered_data() %>%
      group_by(year = year(date)) %>%
      summarize(crime_count = n())

    # `linewidth` replaces `size`, which is deprecated for line geoms
    # since ggplot2 3.4.0.
    ggplot(trend_data, aes(x = year, y = crime_count)) +
      geom_line(linewidth = 1, color = "blue") +
      geom_point() +
      labs(title = "Crime Trends Over Time", x = "Year", y = "Crime Count") +
      theme_minimal()
  })

  # 2️⃣ Crime Type Distribution: the 20 most frequent crime types over all
  # cities within the selected year range (city and crime type inputs are
  # not applied here)
  output$crime_type_plot <- renderPlotly({
    crime_distribution <- all_data %>%
      filter(between(year(date), input$year_range[1], input$year_range[2])) %>%
      group_by(city, crime_type) %>%
      summarize(total_crimes = n(), .groups = "drop")

    top_crimes <- crime_distribution %>%
      group_by(crime_type) %>%
      summarize(total_count = sum(total_crimes)) %>%
      slice_max(total_count, n = 20)

    ggplot(crime_distribution %>% filter(crime_type %in% top_crimes$crime_type),
           aes(x = reorder(crime_type, total_crimes), y = total_crimes, fill = city)) +
      geom_col(position = "dodge") +
      coord_flip() +
      labs(title = "Top 20 Crime Types", x = "Crime Type", y = "Number of Crimes") +
      theme_minimal()
  })

  # 3️⃣ Top Crime Types in Each City: computed over the whole dataset; none
  # of the sidebar inputs are applied
  output$top_crime_types_plot <- renderPlotly({
    top_crime_types <- all_data %>%
      group_by(city, crime_type) %>%
      summarize(total_crimes = n(), .groups = "drop") %>%
      group_by(city) %>%
      slice_max(total_crimes, n = 5)

    ggplot(top_crime_types, aes(x = fct_reorder(crime_type, total_crimes), y = total_crimes, fill = city)) +
      geom_col() +
      coord_flip() +
      labs(title = "Top 5 Crime Types by City", x = "Crime Type", y = "Total Crimes") +
      theme_minimal()
  })

  # 4️⃣ Total Crimes Across Cities: computed over the whole dataset; none of
  # the sidebar inputs are applied
  output$total_crimes_plot <- renderPlotly({
    city_crime_totals <- all_data %>%
      group_by(city) %>%
      summarize(total_crimes = n(), .groups = "drop")

    ggplot(city_crime_totals, aes(x = reorder(city, -total_crimes), y = total_crimes, fill = city)) +
      geom_col() +
      labs(title = "Total Crimes Across Cities", x = "City", y = "Total Crimes") +
      theme_minimal()
  })

  # 5️⃣ Interactive Data Table: the 1000 most recent rows of the selection
  output$crime_table <- renderDT({
    datatable(
      filtered_data() %>%
        select(city, date, crime_type, latitude, longitude) %>%
        arrange(desc(date)) %>%
        head(1000),
      options = list(pageLength = 10, scrollX = TRUE),
      rownames = FALSE
    )
  })

  # 6️⃣ Interactive Crime Map: up to 5000 randomly chosen rows of the selection
  output$crime_map <- renderLeaflet({
    # slice_sample() caps the sample at the number of available rows itself,
    # replacing the superseded sample_n(min(5000, nrow(.))) construct.
    crime_sample <- filtered_data() %>% slice_sample(n = 5000)

    leaflet(crime_sample) %>%
      addTiles() %>%
      addCircleMarkers(
        lng = ~longitude, lat = ~latitude,
        radius = 3, color = "red", stroke = FALSE, fillOpacity = 0.5,
        popup = ~paste0("<strong>City:</strong> ", city, "<br>",
                        "<strong>Crime Type:</strong> ", crime_type, "<br>",
                        "<strong>Date:</strong> ", as.character(date))
      ) %>%
      setView(lng = -95, lat = 37, zoom = 4)
  })
}

# Run the App
shinyApp(ui, server)

Shiny applications not supported in static R Markdown documents

Stat 317: Final Project

Meerab Sulman 251802146

2025-01-28

2. Exploratory Data Analysis (EDA)

2.1 Crime Type Distribution Across Cities

2.2 Trends Over Time

2.3 Top Crime Types in Each City

2.4 Total Crimes Across Cities

2.5 Interactive Table of Crime Data

2.6 Crime Totals by City Since 2013

3. GIS Mapping: Interactive Crime Hotspots

3.1 Use a Random Sample of Crimes

3.2 Aggregate Crime Data by City

4. Interactive Dashboard (Shiny App)