# Load the libraries
library(tidyverse)
## Warning: package 'ggplot2' was built under R version 4.4.2
## Warning: package 'readr' was built under R version 4.4.3
## Warning: package 'dplyr' was built under R version 4.4.3
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr 1.1.4 ✔ readr 2.1.5
## ✔ forcats 1.0.0 ✔ stringr 1.5.1
## ✔ ggplot2 3.5.1 ✔ tibble 3.2.1
## ✔ lubridate 1.9.4 ✔ tidyr 1.3.1
## ✔ purrr 1.0.2
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(readxl)
library(janitor)
## Warning: package 'janitor' was built under R version 4.4.3
##
## Attaching package: 'janitor'
##
## The following objects are masked from 'package:stats':
##
## chisq.test, fisher.test
library(sf)
## Warning: package 'sf' was built under R version 4.4.2
## Linking to GEOS 3.12.2, GDAL 3.9.3, PROJ 9.4.1; sf_use_s2() is TRUE
library(tigris)
## Warning: package 'tigris' was built under R version 4.4.3
## To enable caching of data, set `options(tigris_use_cache = TRUE)`
## in your R script or .Rprofile.
library(leaflet)
## Warning: package 'leaflet' was built under R version 4.4.3
library(leaflet.extras)
## Warning: package 'leaflet.extras' was built under R version 4.4.3
library(ggplot2)
library(broom)
## Warning: package 'broom' was built under R version 4.4.2
library(gt)
## Warning: package 'gt' was built under R version 4.4.3
library(lubridate)
library(scales)
##
## Attaching package: 'scales'
##
## The following object is masked from 'package:purrr':
##
## discard
##
## The following object is masked from 'package:readr':
##
## col_factor
library(caret)
## Warning: package 'caret' was built under R version 4.4.3
## Loading required package: lattice
##
## Attaching package: 'caret'
##
## The following object is masked from 'package:purrr':
##
## lift
library(randomForest)
## Warning: package 'randomForest' was built under R version 4.4.3
## randomForest 4.7-1.2
## Type rfNews() to see new features/changes/bug fixes.
##
## Attaching package: 'randomForest'
##
## The following object is masked from 'package:dplyr':
##
## combine
##
## The following object is masked from 'package:ggplot2':
##
## margin
library(xgboost)
## Warning: package 'xgboost' was built under R version 4.4.3
##
## Attaching package: 'xgboost'
##
## The following object is masked from 'package:dplyr':
##
## slice
library(spdep)
## Warning: package 'spdep' was built under R version 4.4.3
## Loading required package: spData
## Warning: package 'spData' was built under R version 4.4.3
## To access larger datasets in this package, install the spDataLarge
## package with: `install.packages('spDataLarge',
## repos='https://nowosad.github.io/drat/', type='source')`
library(viridis)
## Warning: package 'viridis' was built under R version 4.4.3
## Loading required package: viridisLite
##
## Attaching package: 'viridis'
##
## The following object is masked from 'package:scales':
##
## viridis_pal
library(stargazer)
##
## Please cite as:
##
## Hlavac, Marek (2022). stargazer: Well-Formatted Regression and Summary Statistics Tables.
## R package version 5.2.3. https://CRAN.R-project.org/package=stargazer
library(patchwork)
## Warning: package 'patchwork' was built under R version 4.4.2
library(quantreg)
## Warning: package 'quantreg' was built under R version 4.4.2
## Loading required package: SparseM
##
## Attaching package: 'quantreg'
##
## The following object is masked from 'package:gt':
##
## latex
library(dplyr)
library(SHAPforxgboost)
## Warning: package 'SHAPforxgboost' was built under R version 4.4.3
library(shiny)
library(DT)
## Warning: package 'DT' was built under R version 4.4.3
##
## Attaching package: 'DT'
##
## The following objects are masked from 'package:shiny':
##
## dataTableOutput, renderDataTable
library(plotly)
##
## Attaching package: 'plotly'
##
## The following object is masked from 'package:xgboost':
##
## slice
##
## The following object is masked from 'package:ggplot2':
##
## last_plot
##
## The following object is masked from 'package:stats':
##
## filter
##
## The following object is masked from 'package:graphics':
##
## layout
library(bslib)
## Warning: package 'bslib' was built under R version 4.4.2
##
## Attaching package: 'bslib'
##
## The following object is masked from 'package:spdep':
##
## card
##
## The following object is masked from 'package:broom':
##
## bootstrap
##
## The following object is masked from 'package:utils':
##
## page
library(marginaleffects)
## Warning: package 'marginaleffects' was built under R version 4.4.3
Read and Clean the Data
# Read the Data from folder
crime <- read_csv("D:/SPS/3rd Semester/Data 608_Master's Research Project/Data/NYPD_Complaint_Data/NYPD_Complaint_Data_Historic_2012_2023.csv")
## Warning: One or more parsing issues, call `problems()` on your data frame for details,
## e.g.:
## dat <- vroom(...)
## problems(dat)
## Rows: 5796339 Columns: 10
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (7): RPT_DT, OFNS_DESC, PD_DESC, LAW_CAT_CD, BORO_NM, PREM_TYP_DESC, SUS...
## dbl (3): CMPLNT_NUM, Latitude, Longitude
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
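readr flagged parsing issues on the crime file; a quick hedged check (using only the crime object read above) shows which columns and rows tripped the parser before moving on:
# Inspect the parsing problems readr reported for the crime file
crime_problems <- problems(crime)
crime_problems %>% count(col, expected, actual, sort = TRUE)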
all_sales <- read_csv("D:/SPS/3rd Semester/Data 608_Master's Research Project/Data/Sales/all_sales_clean.csv")
## Rows: 719099 Columns: 18
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (6): neighborhood, building_class_category, address, building_class_at_...
## dbl (12): block, lot, zip_code, residential_units, commercial_units, total_u...
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
cpi_raw <- read_excel("D:/SPS/3rd Semester/Data 608_Master's Research Project/Data/CPI/cpi.xlsx", skip = 11)
# Clean Crime & Sales
crime_clean <- crime %>%
mutate(
RPT_DT = mdy(RPT_DT),
year = year(RPT_DT),
borough = str_to_title(BORO_NM)
) %>%
filter(!is.na(borough), year >= 2012)
sales_clean <- all_sales %>%
clean_names() %>%
mutate(
borough = dplyr::recode(str_to_title(borough),
"Staten" = "Staten Island",
"Staten Island" = "Staten Island",
"Statenisland" = "Staten Island"),
zip_code = as.character(zip_code)
)
# CPI & Real Price Adjustment
cpi_annual <- cpi_raw %>%
select(-contains("HALF")) %>%
pivot_longer(cols = c(Jan:Dec), names_to = "month", values_to = "cpi_value") %>%
group_by(Year) %>%
summarize(cpi_value = mean(cpi_value, na.rm = TRUE), .groups = "drop") %>%
rename(year = Year)
cpi_base <- cpi_annual %>% filter(year == 2023) %>% pull(cpi_value)
# Spatial Geometry & Area Calculation
nyc_zips <- zctas(cb = TRUE, starts_with = "1", year = 2020, class = "sf") %>%
st_transform(2263) # Long Island ft
## ZCTAs can take several minutes to download. To cache the data and avoid re-downloading in future R sessions, set `options(tigris_use_cache = TRUE)`
# Calculate Area in Sq Miles
zip_area <- nyc_zips %>%
mutate(area_sq_miles = as.numeric(st_area(.)) / 27878400) %>%  # 27,878,400 sq ft = 1 sq mile (EPSG:2263 units are feet)
st_drop_geometry() %>%
select(ZCTA5CE20, area_sq_miles)
# Spatial Join (Crime to Zip)
crime_sf <- crime_clean %>%
filter(!is.na(Longitude), !is.na(Latitude)) %>%
st_as_sf(coords = c("Longitude", "Latitude"), crs = 4326) %>%
st_transform(st_crs(nyc_zips))
crime_by_zip <- st_join(crime_sf, nyc_zips, join = st_within) %>%
st_drop_geometry() %>%
group_by(year, ZCTA5CE20) %>%
summarize(total_crimes_zip = n(), .groups = "drop") %>%
rename(zip_code = ZCTA5CE20)
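Complaint points that fall outside every ZCTA end up in a group with a missing ZIP; a small hedged check (objects as built above) of how many complaints that affects:
# Complaints that did not land inside any ZCTA (NA zip after the spatial join)
crime_by_zip %>%
  filter(is.na(zip_code)) %>%
  summarize(unmatched_complaints = sum(total_crimes_zip))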
# Load ACS Zip Population
folder <- "D:/SPS/3rd Semester/Data 608_Master's Research Project/Data/ACS population by ZIP (B01003)"
files <- list.files(folder, pattern = "B01003-Data\\.csv$", full.names = TRUE)
read_pop <- function(file) {
year_val <- str_extract(file, "20\\d{2}")
read_csv(file, show_col_types = FALSE) %>%
filter(GEO_ID != "Geography") %>%
mutate(
year = as.numeric(year_val),
zip_code = str_sub(GEO_ID, -5),
population = as.numeric(B01003_001E)
) %>%
select(year, zip_code, population)
}
zip_population_all <- map_dfr(files, read_pop)
## New names:
## • `` -> `...5`
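A quick hedged sanity check that one ACS population file was picked up per year (using the table just built):
# One row per ZIP per year; count of ZIPs loaded for each ACS year
zip_population_all %>% count(year)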
# Merge all datasets
final_df <- sales_clean %>%
# Join CPI (standard join on 'year')
left_join(cpi_annual, by = "year") %>%
# Join Crime (standard join on 'year' and 'zip_code')
left_join(crime_by_zip, by = c("year", "zip_code")) %>%
# Join Population (standard join on 'year' and 'zip_code')
left_join(zip_population_all, by = c("year", "zip_code")) %>%
# Join Area
left_join(zip_area, by = c("zip_code" = "ZCTA5CE20")) %>%
mutate(
# Handle missing crime counts (NA -> 0)
total_crimes_zip = replace_na(total_crimes_zip, 0),
# Crime Density (Crimes per Sq Mile)
crime_density = total_crimes_zip / area_sq_miles,
# Crime Rate (Crimes per 100,000 people)
population = replace(population, population < 100, NA),
crime_rate_per_100k = (total_crimes_zip / population) * 100000,
# Real Price Calculation
real_price = sale_price * (cpi_base / cpi_value),
log_real_price = log(real_price),
# Pandemic Indicator
pandemic = if_else(year >= 2020, 1, 0)
) %>%
filter(
!is.na(log_real_price),
!is.na(crime_rate_per_100k),
is.finite(crime_rate_per_100k),
!is.na(area_sq_miles)
)
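To make the deflation step concrete, an illustrative calculation (the CPI values here are made up, not taken from the series loaded above): a $500,000 nominal sale in a year with CPI 250, against a base-year CPI of 300, becomes
# Illustrative only: real_price = sale_price * (cpi_base / cpi_value)
500000 * (300 / 250)   # = 600000 in base-year (2023) dollars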
# View the final dataset
print(paste("Rows in final dataset:", nrow(final_df)))
## [1] "Rows in final dataset: 660046"
head(final_df)
## # A tibble: 6 × 27
## neighborhood building_class_category block lot address zip_code
## <chr> <chr> <dbl> <dbl> <chr> <chr>
## 1 BATHGATE 01 ONE FAMILY DWELLINGS 3028 25 412 EAST 179TH STR… 10457
## 2 BATHGATE 01 ONE FAMILY DWELLINGS 3039 28 2329 WASHINGTON AV… 10458
## 3 BATHGATE 01 ONE FAMILY DWELLINGS 3039 28 2329 WASHINGTON AV… 10458
## 4 BATHGATE 01 ONE FAMILY DWELLINGS 3046 39 2075 BATHGATE AVEN… 10457
## 5 BATHGATE 01 ONE FAMILY DWELLINGS 3046 52 2047 BATHGATE AVEN… 10457
## 6 BATHGATE 01 ONE FAMILY DWELLINGS 3048 53 4411 3 AVENUE 10457
## # ℹ 21 more variables: residential_units <dbl>, commercial_units <dbl>,
## # total_units <dbl>, land_square_feet <dbl>, gross_square_feet <dbl>,
## # year_built <dbl>, tax_class_at_time_of_sale <dbl>,
## # building_class_at_time_of_sale <chr>, sale_price <dbl>, source_file <chr>,
## # year <dbl>, borough <chr>, cpi_value <dbl>, total_crimes_zip <int>,
## # population <dbl>, area_sq_miles <dbl>, crime_density <dbl>,
## # crime_rate_per_100k <dbl>, real_price <dbl>, log_real_price <dbl>, …
Crime Rate Trends (2012–2023)
crime_summary <- crime_clean %>%
group_by(year, borough) %>%
summarize(total_crimes = n(), .groups = "drop")
# Map ZIPs to boroughs (keep the most common borough per ZIP so population is not double-counted)
zip_to_borough <- sales_clean %>%
  count(zip_code, borough) %>%
  group_by(zip_code) %>%
  slice_max(n, n = 1, with_ties = FALSE) %>%
  ungroup() %>%
  select(zip_code, borough)
population_long <- zip_population_all %>%
  inner_join(zip_to_borough, by = "zip_code") %>%
  group_by(year, borough) %>%
  summarize(population = sum(population, na.rm = TRUE), .groups = "drop")
# PLOT
crime_trend_data <- crime_summary %>%
left_join(population_long, by = c("year", "borough")) %>%
mutate(crime_rate_per_100k = (total_crimes / population) * 100000) %>%
filter(!is.na(crime_rate_per_100k))
ggplot(crime_trend_data, aes(x = year, y = crime_rate_per_100k, color = borough)) +
geom_line(linewidth = 1.2) +
geom_point(size = 2) +
geom_vline(xintercept = 2020, linetype = "dashed", color = "black", linewidth = 0.8) +
annotate("text", x = 2020.2, y = max(crime_trend_data$crime_rate_per_100k),
label = "Pandemic Start", hjust = 0, vjust = 1, fontface = "italic") +
scale_y_continuous(labels = comma) +
scale_x_continuous(breaks = 2012:2023) +
scale_color_brewer(palette = "Set1") +
labs(title = "Crime Rate Trends (2012–2023)",
y = "Crime Rate (per 100k)", x = "Year", color = "Borough") +
theme_minimal() +
theme(plot.title = element_text(face = "bold", size = 16))

The “Table 1”: Descriptive Statistics
# Table 1: Descriptive Statistics
table1_clean <- final_df %>%
select(borough, sale_price, real_price, crime_rate_per_100k, gross_square_feet) %>%
mutate(
sale_price = sale_price / 1000,
real_price = real_price / 1000
) %>%
group_by(borough) %>%
summarize(
Obs = n(),
`Nominal Price ($K)` = mean(sale_price, na.rm = TRUE),
`Real Price ($K)` = mean(real_price, na.rm = TRUE),
`Crime Rate` = mean(crime_rate_per_100k, na.rm = TRUE), # Shortened name
`Sq Ft` = mean(gross_square_feet, na.rm = TRUE)
) %>%
ungroup() %>%
# Pass to gt for formatting
gt() %>%
tab_header(
title = "Table 1: Descriptive Statistics by Borough",
subtitle = "Mean values (2012–2023)"
) %>%
# Format the integer columns (Price, Obs, Sq Ft) with commas and 0 decimals
fmt_number(
columns = c(Obs, `Nominal Price ($K)`, `Real Price ($K)`, `Sq Ft`),
decimals = 0,
use_seps = TRUE
) %>%
# Format Crime Rate
fmt_number(
columns = c(`Crime Rate`),
decimals = 2
) %>%
tab_options(
table.border.top.color = "black",
table.border.bottom.color = "black",
heading.align = "left"
)
# Display the table
table1_clean
| Table 1: Descriptive Statistics by Borough |
| Mean values (2012–2023) |
| borough       | Obs     | Nominal Price ($K) | Real Price ($K) | Crime Rate | Sq Ft |
| Bronx         | 54,160  | 1,076              | 1,235           | 6,419.29   | 6,885 |
| Brooklyn      | 170,637 | 1,399              | 1,598           | 5,319.05   | 3,975 |
| Manhattan     | 176,997 | 3,600              | 4,178           | 7,324.37   | 7,969 |
| Queens        | 195,663 | 970                | 1,101           | 4,185.83   | 2,003 |
| Staten Island | 62,589  | 603                | 690             | 4,265.28   | 1,814 |
The “Decoupling” Plot (Crime vs. Price Trends)
# Calculate Index (2012 = 100)
trend_index <- final_df %>%
group_by(year) %>%
summarize(
median_real_price = median(real_price, na.rm = TRUE),
mean_crime_rate = mean(crime_rate_per_100k, na.rm = TRUE),
.groups = "drop"
) %>%
arrange(year) %>%
mutate(
base_price = first(median_real_price),
base_crime = first(mean_crime_rate),
Price_Index = (median_real_price / base_price) * 100,
Crime_Index = (mean_crime_rate / base_crime) * 100
) %>%
pivot_longer(cols = c("Price_Index", "Crime_Index"),
names_to = "Metric",
values_to = "Value")
# Plot
ggplot(trend_index, aes(x = year, y = Value, color = Metric)) +
geom_hline(yintercept = 100, linetype = "dashed", color = "grey") +
geom_line(linewidth = 1.2) +
geom_point(size = 2) +
annotate("rect", xmin = 2020, xmax = 2023, ymin = -Inf, ymax = Inf, alpha = 0.1, fill = "red") +
annotate("text", x = 2021.5, y = 140, label = "Pandemic Era", fontface = "italic") +
scale_color_manual(values = c("Price_Index" = "#27AE60", "Crime_Index" = "#C0392B"),
                   labels = c("Price_Index" = "Real Housing Price", "Crime_Index" = "Crime Rate")) +
scale_x_continuous(breaks = min(trend_index$year):max(trend_index$year)) +
theme_minimal() +
labs(
title = "The Decoupling: Trends Indexed to Base Year",
y = "Growth Index", x = NULL, color = NULL
) +
theme(legend.position = "bottom")

The “Hotspot” Evolution (Crime Density)
# Prepare Density Data
lisa_prep_data <- crime_by_zip %>%
inner_join(zip_population_all, by = c("year", "zip_code")) %>%
mutate(
population = if_else(population < 100, NA_real_, population),
crime_rate = (total_crimes_zip / population) * 100000
) %>%
filter(!is.na(crime_rate), is.finite(crime_rate))
map_data_all <- crime_by_zip %>%
# Join with Area to calculate density
inner_join(zip_area, by = c("zip_code" = "ZCTA5CE20")) %>%
mutate(crime_density = total_crimes_zip / area_sq_miles) %>%
# Filter the year
filter(year %in% c(2012, 2016, 2023))
# Define Global Scale
global_limits <- range(map_data_all$crime_density, na.rm = TRUE)
# Mapping
plot_density_map_robust <- function(target_year) {
# Filter for the specific year
plot_sf <- map_data_all %>%
filter(year == target_year) %>%
inner_join(nyc_zips, by = c("zip_code" = "ZCTA5CE20")) %>%
st_as_sf()
ggplot(plot_sf) +
geom_sf(aes(fill = crime_density), color = NA) +
scale_fill_viridis_c(
option = "inferno",
direction = -1,
trans = "log10",
name = "Crimes/Sq Mi",
labels = comma,
# FORCE the scale to be consistent across all maps
limits = global_limits
) +
labs(title = paste(target_year)) +
theme_void() +
theme(
plot.title = element_text(hjust = 0.5, face = "bold", size = 14),
legend.position = "right"
)
}
# 3 Maps
m12 <- plot_density_map_robust(2012)
m16 <- plot_density_map_robust(2016)
m23 <- plot_density_map_robust(2023)
# Combine the maps
combined_plot <- (m12 + m16 + m23) +
plot_layout(guides = "collect") +
plot_annotation(
title = "Evolution of Crime Density in NYC (2012, 2016, 2023)",
theme = theme(plot.title = element_text(size = 18, face = "bold"))
)
combined_plot

Evolution of Spatial Crime Clusters (LISA Analysis)
# LISA map helper: compute Local Moran's I and classify clusters for one year
get_lisa_map <- function(target_year, show_legend = FALSE) {
# Filter & Join Geometry
data_yr <- lisa_prep_data %>%
filter(year == target_year) %>%
inner_join(nyc_zips, by = c("zip_code" = "ZCTA5CE20")) %>%
st_as_sf() %>%
st_transform(2263) %>%
drop_na(crime_rate)
# Calculate Neighbors
coords <- st_coordinates(st_centroid(st_geometry(data_yr)))
knn <- knearneigh(coords, k = 4)
nb <- knn2nb(knn)
lw <- nb2listw(nb, style = "W", zero.policy = TRUE)
local_moran <- localmoran(data_yr$crime_rate, lw, zero.policy = TRUE)
# Classify Clusters
mean_val <- mean(data_yr$crime_rate, na.rm = TRUE)
data_yr <- data_yr %>%
mutate(
Ii = local_moran[, 1],
p_value = local_moran[, 5],
lag_val = lag.listw(lw, crime_rate, zero.policy = TRUE),
cluster = case_when(
p_value >= 0.05 ~ "Not Significant",
crime_rate > mean_val & lag_val > mean(lag_val) ~ "High-High",
crime_rate < mean_val & lag_val < mean(lag_val) ~ "Low-Low",
crime_rate > mean_val & lag_val < mean(lag_val) ~ "High-Low",
crime_rate < mean_val & lag_val > mean(lag_val) ~ "Low-High"
)
) %>%
mutate(cluster = factor(cluster, levels = c("High-High", "Low-Low", "Low-High", "High-Low", "Not Significant")))
p <- ggplot(data_yr) +
geom_sf(aes(fill = cluster), color = "white", size = 0.1) +
scale_fill_manual(
  values = c(
    "High-High" = "#D32F2F",        # Red
    "Low-Low" = "#1976D2",          # Blue
    "Low-High" = "#388E3C",         # Green
    "High-Low" = "#F57C00",         # Orange (needed because drop = FALSE keeps all five levels)
    "Not Significant" = "grey90"
  ),
  drop = FALSE,
  name = "Cluster Type"
) +
labs(title = paste(target_year)) +
theme_void() +
theme(
plot.title = element_text(hjust = 0.5, face = "bold")
)
if (show_legend == TRUE) {
p <- p + theme(legend.position = "right")
} else {
p <- p + theme(legend.position = "none")
}
return(p)
}
# Generate Maps
lisa_12 <- get_lisa_map(2012, show_legend = FALSE)
lisa_16 <- get_lisa_map(2016, show_legend = FALSE)
lisa_23 <- get_lisa_map(2023, show_legend = TRUE)
# Combine all the maps
(lisa_12 + lisa_16 + lisa_23) +
plot_annotation(
title = "Evolution of Spatial Crime Clusters (LISA Analysis)",
theme = theme(plot.title = element_text(size = 16, face = "bold"))
)
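As a hedged companion check (not part of the original analysis), the same kNN weights can feed a global Moran's I test for a single year, summarizing overall clustering in one statistic; the sketch below mirrors the steps inside get_lisa_map() for 2023:
# Global Moran's I for 2023 crime rates, reusing the LISA preparation above
data_23 <- lisa_prep_data %>%
  filter(year == 2023) %>%
  inner_join(nyc_zips, by = c("zip_code" = "ZCTA5CE20")) %>%
  st_as_sf() %>%
  st_transform(2263) %>%
  drop_na(crime_rate)
coords_23 <- st_coordinates(st_centroid(st_geometry(data_23)))
lw_23 <- nb2listw(knn2nb(knearneigh(coords_23, k = 4)), style = "W", zero.policy = TRUE)
moran.test(data_23$crime_rate, lw_23, zero.policy = TRUE)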

Model 1: Baseline OLS
# Model 1: Baseline OLS
options(scipen = 999)
model_data <- final_df %>%
mutate(
borough = as.factor(borough),
year = as.factor(year)
)
model_h1 <- lm(
log_real_price ~ crime_rate_per_100k +
gross_square_feet +
year_built +
residential_units +
borough +
year,
data = model_data
)
summary(model_h1)
##
## Call:
## lm(formula = log_real_price ~ crime_rate_per_100k + gross_square_feet +
## year_built + residential_units + borough + year, data = model_data)
##
## Residuals:
## Min 1Q Median 3Q Max
## -26.0106 -0.2280 0.2824 0.7201 8.6364
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 12.15609555168 0.01804680998 673.587 < 0.0000000000000002
## crime_rate_per_100k -0.00001958727 0.00000096175 -20.366 < 0.0000000000000002
## gross_square_feet 0.00000061722 0.00000009964 6.194 0.000000000586
## year_built 0.00026089545 0.00000645467 40.420 < 0.0000000000000002
## residential_units 0.00225447813 0.00014324529 15.739 < 0.0000000000000002
## boroughBrooklyn 0.48744968792 0.01083163948 45.002 < 0.0000000000000002
## boroughManhattan 1.10879200787 0.01118918449 99.095 < 0.0000000000000002
## boroughQueens 0.16686318947 0.01081868178 15.424 < 0.0000000000000002
## boroughStaten Island 0.03463129772 0.01262720561 2.743 0.0061
## year2014 0.13419605249 0.01137090646 11.802 < 0.0000000000000002
## year2015 0.20115412608 0.01126401058 17.858 < 0.0000000000000002
## year2016 0.29977836543 0.01140061410 26.295 < 0.0000000000000002
## year2017 0.40831714070 0.01141265384 35.778 < 0.0000000000000002
## year2018 0.43444095788 0.01159497034 37.468 < 0.0000000000000002
## year2019 0.54261703170 0.01299985325 41.740 < 0.0000000000000002
## year2020 0.49656551724 0.01407137721 35.289 < 0.0000000000000002
## year2021 0.60221719428 0.01399993140 43.016 < 0.0000000000000002
## year2022 0.60053238357 0.01428131167 42.050 < 0.0000000000000002
## year2023 0.40001275002 0.01560075305 25.641 < 0.0000000000000002
##
## (Intercept) ***
## crime_rate_per_100k ***
## gross_square_feet ***
## year_built ***
## residential_units ***
## boroughBrooklyn ***
## boroughManhattan ***
## boroughQueens ***
## boroughStaten Island **
## year2014 ***
## year2015 ***
## year2016 ***
## year2017 ***
## year2018 ***
## year2019 ***
## year2020 ***
## year2021 ***
## year2022 ***
## year2023 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 1.996 on 521488 degrees of freedom
## (138539 observations deleted due to missingness)
## Multiple R-squared: 0.04108, Adjusted R-squared: 0.04105
## F-statistic: 1241 on 18 and 521488 DF, p-value: < 0.00000000000000022
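To read the crime coefficient on the log scale, a small hedged helper using broom (already loaded); it simply re-expresses the estimate above as an approximate percent change in real price per 1,000-point increase in the crime rate, with no new estimation:
# Percent change in real price implied by a 1,000-unit rise in crimes per 100k (Model 1)
broom::tidy(model_h1) %>%
  filter(term == "crime_rate_per_100k") %>%
  mutate(pct_change_per_1k = (exp(estimate * 1000) - 1) * 100)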
# Model 2: DiD (The Pandemic Structural Break)
model_h2 <- lm(
log_real_price ~ crime_rate_per_100k * pandemic +
gross_square_feet +
year_built +
residential_units +
borough,
data = model_data
)
summary(model_h2)
##
## Call:
## lm(formula = log_real_price ~ crime_rate_per_100k * pandemic +
## gross_square_feet + year_built + residential_units + borough,
## data = model_data)
##
## Residuals:
## Min 1Q Median 3Q Max
## -25.8224 -0.2339 0.2897 0.7273 8.6984
##
## Coefficients:
## Estimate Std. Error t value
## (Intercept) 12.4362981074 0.0165056618 753.456
## crime_rate_per_100k -0.0000249441 0.0000010095 -24.709
## pandemic 0.1516218900 0.0150886632 10.049
## gross_square_feet 0.0000005036 0.0000001001 5.030
## year_built 0.0002828456 0.0000064450 43.886
## residential_units 0.0023470944 0.0001436596 16.338
## boroughBrooklyn 0.4737427706 0.0108831690 43.530
## boroughManhattan 1.0714170642 0.0112320121 95.390
## boroughQueens 0.1511611952 0.0108679184 13.909
## boroughStaten Island 0.0394348526 0.0127305841 3.098
## crime_rate_per_100k:pandemic 0.0000198132 0.0000027249 7.271
## Pr(>|t|)
## (Intercept) < 0.0000000000000002 ***
## crime_rate_per_100k < 0.0000000000000002 ***
## pandemic < 0.0000000000000002 ***
## gross_square_feet 0.000000491122439 ***
## year_built < 0.0000000000000002 ***
## residential_units < 0.0000000000000002 ***
## boroughBrooklyn < 0.0000000000000002 ***
## boroughManhattan < 0.0000000000000002 ***
## boroughQueens < 0.0000000000000002 ***
## boroughStaten Island 0.00195 **
## crime_rate_per_100k:pandemic 0.000000000000357 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 2.001 on 521496 degrees of freedom
## (138539 observations deleted due to missingness)
## Multiple R-squared: 0.03558, Adjusted R-squared: 0.03556
## F-statistic: 1924 on 10 and 521496 DF, p-value: < 0.00000000000000022
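The positive interaction term says the crime slope attenuates after 2020; a hedged way to see the two slopes side by side is the marginaleffects package loaded above (a sketch, not output from the original analysis):
# Average slope of crime_rate_per_100k on log real price, before vs. during the pandemic
avg_slopes(model_h2, variables = "crime_rate_per_100k", by = "pandemic")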
# Define one shared dataset for the ML models
ml_data <- final_df %>%
select(log_real_price, crime_rate_per_100k, gross_square_feet,
year_built, residential_units, pandemic, borough) %>%
drop_na() %>%
mutate(borough = as.factor(borough))
# Train/Test Split
set.seed(123)
train_index <- createDataPartition(ml_data$log_real_price, p = 0.8, list = FALSE)
train_data <- ml_data[train_index, ]
test_data <- ml_data[-train_index, ]
Model 3: XGBoost (Tuned)
# Model 3: XGBoost (Tuned)
X_train_mat <- model.matrix(log_real_price ~ . - 1, data = train_data)
y_train_vec <- train_data$log_real_price
X_test_mat <- model.matrix(log_real_price ~ . - 1, data = test_data)
y_test_vec <- test_data$log_real_price
xgb_params <- list(
objective = "reg:squarederror",
eta = 0.05,
max_depth = 6,
subsample = 0.7,
colsample_bytree = 0.7
)
set.seed(123)
xgb_model <- xgb.train(
params = xgb_params,
data = xgb.DMatrix(data = X_train_mat, label = y_train_vec),
nrounds = 500,
watchlist = list(train = xgb.DMatrix(data = X_train_mat, label = y_train_vec)),
print_every_n = 100,
early_stopping_rounds = 20,
verbose = 0
)
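The 20% hold-out built earlier is never scored in the code above; a hedged evaluation sketch using only the objects already defined:
# Hold-out performance of the tuned XGBoost model
pred_test <- predict(xgb_model, xgb.DMatrix(data = X_test_mat))
rmse_test <- sqrt(mean((y_test_vec - pred_test)^2))
r2_test   <- 1 - sum((y_test_vec - pred_test)^2) / sum((y_test_vec - mean(y_test_vec))^2)
c(RMSE = rmse_test, R_squared = r2_test)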
SHAP
# Prepare Matrix
X_matrix <- model.matrix(log_real_price ~ . - 1, data = ml_data)
# Calculate SHAP
shap_values <- shap.values(xgb_model = xgb_model, X_train = X_matrix)
shap_long <- shap.prep(shap_contrib = shap_values$shap_score, X_train = X_matrix)
# Plot
shap.plot.summary(shap_long) +
ggtitle("What Drives NYC Housing Prices?",
subtitle = "XGBoost Feature Importance (SHAP)") +
theme_minimal(base_size = 14) +
scale_color_gradientn(
colors = c("yellow", "purple"), # yellow = low, purple = high
name = "Feature Value\nLow → High"
) +
theme(
legend.position = "right",
legend.title = element_text(size = 14, face = "bold"),
legend.text = element_text(size = 12),
plot.title = element_text(size = 18, face = "bold"),
plot.subtitle = element_text(size = 14)
)
## Scale for colour is already present.
## Adding another scale for colour, which will replace the existing scale.
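Beyond the summary plot, a single-feature view can show how the crime signal varies with its own value; a hedged sketch with SHAPforxgboost's dependence plot (colouring by gross_square_feet is an arbitrary illustrative choice):
# SHAP dependence for the crime-rate feature, from the matrices prepared above
shap.plot.dependence(data_long = shap_long,
                     x = "crime_rate_per_100k",
                     color_feature = "gross_square_feet")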

Shiny App
# Shiny App
# UI: The Visual Layout
ui <- fluidPage(
theme = bs_theme(version = 4, bootswatch = "minty"),
# Title
titlePanel("NYC Housing & Crime: A Thesis Dashboard"),
sidebarLayout(
sidebarPanel(
  width = 3,
  h4("Data Controls"),
  hr(),
  # Inputs referenced by the server's filtered_data() reactive
  checkboxGroupInput("boro_select", "Boroughs:",
                     choices = sort(unique(final_df$borough)),
                     selected = sort(unique(final_df$borough))),
  sliderInput("year_range", "Sale Years:",
              min = 2012, max = 2023, value = c(2012, 2023), step = 1, sep = ""),
  helpText("Note: The 'Models' tabs display the static results from the thesis analysis."),
  div(style = "text-align:center;",
      actionButton("update", "Update View", class = "btn-primary"))
),
mainPanel(
tabsetPanel(
tabPanel("Geospatial Map", icon = icon("map"),
br(),
p("Visualizing Crime Density (Crimes per Sq. Mile) by Zip Code"),
sliderInput("map_year", "Select Map Year:",
min = 2012, max = 2023, value = 2023,
step = 1, animate = TRUE),
leafletOutput("crime_map", height = "600px")
),
tabPanel("Data Table", icon = icon("table"),
br(),
DTOutput("raw_table")
)
)
)
)
)
server <- function(input, output, session) {
filtered_data <- reactive({
req(input$update)
isolate({
final_df %>%
filter(borough %in% input$boro_select,
year >= input$year_range[1],
year <= input$year_range[2])
})
})
output$plot_boro_trends <- renderPlotly({
p <- ggplot(filtered_data(), aes(x = year, y = crime_rate_per_100k, color = borough)) +
  geom_smooth(se = FALSE) +
  labs(title = "Smoothed Crime Trends by Borough", y = "Crime Rate (per 100k)") +
theme_minimal()
ggplotly(p)
})
output$crime_map <- renderLeaflet({
target_year <- input$map_year
map_df <- crime_by_zip %>%
filter(year == target_year) %>%
inner_join(zip_area, by=c("zip_code"="ZCTA5CE20")) %>%
mutate(density = total_crimes_zip / area_sq_miles)
# Join with Geometry
map_sf <- nyc_zips %>%
inner_join(map_df, by=c("ZCTA5CE20"="zip_code")) %>%
st_transform(4326)
# Palette
pal <- colorNumeric(palette = "YlOrRd", domain = map_sf$density)
leaflet(map_sf) %>%
addProviderTiles(providers$CartoDB.Positron) %>%
addPolygons(
fillColor = ~pal(density),
weight = 1, color = "white", fillOpacity = 0.7,
popup = ~paste0("<b>Zip:</b> ", ZCTA5CE20, "<br>",
"<b>Crimes/SqMi:</b> ", round(density, 1))
) %>%
addLegend("bottomright", pal = pal, values = ~density,
title = paste("Crime Density", target_year))
})
output$plot_structural_break <- renderPlot({
ggplot(final_df, aes(x = crime_rate_per_100k, y = log_real_price, color = (year >= 2020))) +
  geom_point(alpha = 0.1) +
  geom_smooth(method = "lm", linewidth = 2) +
scale_color_manual(values = c("TRUE"="#D35400", "FALSE"="#2980B9"),
labels = c("Pre-Pandemic", "Post-Pandemic"), name="Era") +
labs(title = "Visualizing the Structural Break (Interaction)",
subtitle = "Steeper slope = Higher sensitivity to crime",
x = "Crime Rate", y = "Log Real Price") +
theme_minimal(base_size = 14)
})
output$plot_quantile <- renderPlot({
taus <- c(0.1, 0.25, 0.5, 0.75, 0.9)
# Fast re-run for the plot
mq_app <- rq(log_real_price ~ crime_rate_per_100k + gross_square_feet + borough,
             data = final_df, tau = taus)
plot(summary(mq_app), parm = "crime_rate_per_100k",
main="Impact of Crime across Price Quantiles")
})
output$plot_shap <- renderPlot({
req(exists("shap_long"))
shap.plot.summary(shap_long) +
theme_minimal() +
ggtitle("XGBoost Feature Importance")
})
output$raw_table <- renderDT({
datatable(head(filtered_data(), 100), options = list(scrollX = TRUE))
})
}
# Run the app
shinyApp(ui = ui, server = server)
Shiny applications not supported in static R Markdown documents