# Load the libraries
library(tidyverse)
## Warning: package 'ggplot2' was built under R version 4.4.2
## Warning: package 'readr' was built under R version 4.4.3
## Warning: package 'dplyr' was built under R version 4.4.3
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr 1.1.4 ✔ readr 2.1.5
## ✔ forcats 1.0.0 ✔ stringr 1.5.1
## ✔ ggplot2 3.5.1 ✔ tibble 3.2.1
## ✔ lubridate 1.9.4 ✔ tidyr 1.3.1
## ✔ purrr 1.0.2
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(readxl)
library(janitor)
## Warning: package 'janitor' was built under R version 4.4.3
##
## Attaching package: 'janitor'
##
## The following objects are masked from 'package:stats':
##
## chisq.test, fisher.test
library(sf)
## Warning: package 'sf' was built under R version 4.4.2
## Linking to GEOS 3.12.2, GDAL 3.9.3, PROJ 9.4.1; sf_use_s2() is TRUE
library(tigris)
## Warning: package 'tigris' was built under R version 4.4.3
## To enable caching of data, set `options(tigris_use_cache = TRUE)`
## in your R script or .Rprofile.
library(leaflet)
## Warning: package 'leaflet' was built under R version 4.4.3
library(leaflet.extras)
## Warning: package 'leaflet.extras' was built under R version 4.4.3
library(ggplot2)
library(broom)
## Warning: package 'broom' was built under R version 4.4.2
library(gt)
## Warning: package 'gt' was built under R version 4.4.3
library(lubridate)
library(scales)
##
## Attaching package: 'scales'
##
## The following object is masked from 'package:purrr':
##
## discard
##
## The following object is masked from 'package:readr':
##
## col_factor
library(caret)
## Warning: package 'caret' was built under R version 4.4.3
## Loading required package: lattice
##
## Attaching package: 'caret'
##
## The following object is masked from 'package:purrr':
##
## lift
library(randomForest)
## Warning: package 'randomForest' was built under R version 4.4.3
## randomForest 4.7-1.2
## Type rfNews() to see new features/changes/bug fixes.
##
## Attaching package: 'randomForest'
##
## The following object is masked from 'package:dplyr':
##
## combine
##
## The following object is masked from 'package:ggplot2':
##
## margin
library(xgboost)
## Warning: package 'xgboost' was built under R version 4.4.3
##
## Attaching package: 'xgboost'
##
## The following object is masked from 'package:dplyr':
##
## slice
library(spdep)
## Warning: package 'spdep' was built under R version 4.4.3
## Loading required package: spData
## Warning: package 'spData' was built under R version 4.4.3
## To access larger datasets in this package, install the spDataLarge
## package with: `install.packages('spDataLarge',
## repos='https://nowosad.github.io/drat/', type='source')`
library(viridis)
## Warning: package 'viridis' was built under R version 4.4.3
## Loading required package: viridisLite
##
## Attaching package: 'viridis'
##
## The following object is masked from 'package:scales':
##
## viridis_pal
library(stargazer)
##
## Please cite as:
##
## Hlavac, Marek (2022). stargazer: Well-Formatted Regression and Summary Statistics Tables.
## R package version 5.2.3. https://CRAN.R-project.org/package=stargazer
library(patchwork)
## Warning: package 'patchwork' was built under R version 4.4.2
library(quantreg)
## Warning: package 'quantreg' was built under R version 4.4.2
## Loading required package: SparseM
##
## Attaching package: 'quantreg'
##
## The following object is masked from 'package:gt':
##
## latex
library(dplyr)
library(SHAPforxgboost)
## Warning: package 'SHAPforxgboost' was built under R version 4.4.3
library(shiny)
library(DT)
## Warning: package 'DT' was built under R version 4.4.3
##
## Attaching package: 'DT'
##
## The following objects are masked from 'package:shiny':
##
## dataTableOutput, renderDataTable
library(plotly)
##
## Attaching package: 'plotly'
##
## The following object is masked from 'package:xgboost':
##
## slice
##
## The following object is masked from 'package:ggplot2':
##
## last_plot
##
## The following object is masked from 'package:stats':
##
## filter
##
## The following object is masked from 'package:graphics':
##
## layout
library(bslib)
## Warning: package 'bslib' was built under R version 4.4.2
##
## Attaching package: 'bslib'
##
## The following object is masked from 'package:spdep':
##
## card
##
## The following object is masked from 'package:broom':
##
## bootstrap
##
## The following object is masked from 'package:utils':
##
## page
library(marginaleffects)
## Warning: package 'marginaleffects' was built under R version 4.4.3
Read and Clean the Data
# Read the Data from folder
crime <- read_csv("D:/SPS/3rd Semester/Data 608_Master's Research Project/Data/NYPD_Complaint_Data/NYPD_Complaint_Data_Historic_2012_2023.csv")
## Warning: One or more parsing issues, call `problems()` on your data frame for details,
## e.g.:
## dat <- vroom(...)
## problems(dat)
## Rows: 5796339 Columns: 10
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (7): RPT_DT, OFNS_DESC, PD_DESC, LAW_CAT_CD, BORO_NM, PREM_TYP_DESC, SUS...
## dbl (3): CMPLNT_NUM, Latitude, Longitude
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
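readr flagged parsing issues on the crime file; a quick hedged check (using only the crime object read above) shows which columns and rows tripped the parser before moving on:
# Inspect the parsing problems readr reported for the crime file
crime_problems <- problems(crime)
crime_problems %>% count(col, expected, actual, sort = TRUE)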
all_sales <- read_csv("D:/SPS/3rd Semester/Data 608_Master's Research Project/Data/Sales/all_sales_clean.csv")
## Rows: 719099 Columns: 18
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (6): neighborhood, building_class_category, address, building_class_at_...
## dbl (12): block, lot, zip_code, residential_units, commercial_units, total_u...
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
cpi_raw <- read_excel("D:/SPS/3rd Semester/Data 608_Master's Research Project/Data/CPI/cpi.xlsx", skip = 11)
# Clean Crime & Sales
crime_clean <- crime %>%
mutate(
RPT_DT = mdy(RPT_DT),
year = year(RPT_DT),
borough = str_to_title(BORO_NM)
) %>%
filter(!is.na(borough), year >= 2012)
sales_clean <- all_sales %>%
clean_names() %>%
mutate(
borough = dplyr::recode(str_to_title(borough),
"Staten" = "Staten Island",
"Staten Island" = "Staten Island",
"Statenisland" = "Staten Island"),
zip_code = as.character(zip_code)
)
# CPI & Real Price Adjustment
cpi_annual <- cpi_raw %>%
select(-contains("HALF")) %>%
pivot_longer(cols = c(Jan:Dec), names_to = "month", values_to = "cpi_value") %>%
group_by(Year) %>%
summarize(cpi_value = mean(cpi_value, na.rm = TRUE), .groups = "drop") %>%
rename(year = Year)
cpi_base <- cpi_annual %>% filter(year == 2023) %>% pull(cpi_value)
# Spatial Geometry & Area Calculation
nyc_zips <- zctas(cb = TRUE, starts_with = "1", year = 2020, class = "sf") %>%
st_transform(2263) # Long Island ft
## ZCTAs can take several minutes to download. To cache the data and avoid re-downloading in future R sessions, set `options(tigris_use_cache = TRUE)`
# Calculate Area in Sq Miles
zip_area <- nyc_zips %>%
mutate(area_sq_miles = as.numeric(st_area(.)) / 27878400) %>%  # 27,878,400 sq ft = 1 sq mile (EPSG:2263 units are feet)
st_drop_geometry() %>%
select(ZCTA5CE20, area_sq_miles)
# Spatial Join (Crime to Zip)
crime_sf <- crime_clean %>%
filter(!is.na(Longitude), !is.na(Latitude)) %>%
st_as_sf(coords = c("Longitude", "Latitude"), crs = 4326) %>%
st_transform(st_crs(nyc_zips))
crime_by_zip <- st_join(crime_sf, nyc_zips, join = st_within) %>%
st_drop_geometry() %>%
group_by(year, ZCTA5CE20) %>%
summarize(total_crimes_zip = n(), .groups = "drop") %>%
rename(zip_code = ZCTA5CE20)
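Complaint points that fall outside every ZCTA end up in a group with a missing ZIP; a small hedged check (objects as built above) of how many complaints that affects:
# Complaints that did not land inside any ZCTA (NA zip after the spatial join)
crime_by_zip %>%
  filter(is.na(zip_code)) %>%
  summarize(unmatched_complaints = sum(total_crimes_zip))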
# Load ACS Zip Population
folder <- "D:/SPS/3rd Semester/Data 608_Master's Research Project/Data/ACS population by ZIP (B01003)"
files <- list.files(folder, pattern = "B01003-Data\\.csv$", full.names = TRUE)
read_pop <- function(file) {
year_val <- str_extract(file, "20\\d{2}")
read_csv(file, show_col_types = FALSE) %>%
filter(GEO_ID != "Geography") %>%
mutate(
year = as.numeric(year_val),
zip_code = str_sub(GEO_ID, -5),
population = as.numeric(B01003_001E)
) %>%
select(year, zip_code, population)
}
zip_population_all <- map_dfr(files, read_pop)
## New names:
## • `` -> `...5`
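A quick hedged sanity check that one ACS population file was picked up per year (using the table just built):
# One row per ZIP per year; count of ZIPs loaded for each ACS year
zip_population_all %>% count(year)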
# Merge all datasets
final_df <- sales_clean %>%
# Join CPI (standard join on 'year')
left_join(cpi_annual, by = "year") %>%
# Join Crime (standard join on 'year' and 'zip_code')
left_join(crime_by_zip, by = c("year", "zip_code")) %>%
# Join Population (standard join on 'year' and 'zip_code')
left_join(zip_population_all, by = c("year", "zip_code")) %>%
# Join Area
left_join(zip_area, by = c("zip_code" = "ZCTA5CE20")) %>%
mutate(
# Handle missing crime counts (NA -> 0)
total_crimes_zip = replace_na(total_crimes_zip, 0),
# Crime Density (Crimes per Sq Mile)
crime_density = total_crimes_zip / area_sq_miles,
# Crime Rate (Crimes per 100,000 people)
population = replace(population, population < 100, NA),
crime_rate_per_100k = (total_crimes_zip / population) * 100000,
# Real Price Calculation
real_price = sale_price * (cpi_base / cpi_value),
log_real_price = log(real_price),
# Pandemic Indicator
pandemic = if_else(year >= 2020, 1, 0)
) %>%
filter(
!is.na(log_real_price),
!is.na(crime_rate_per_100k),
is.finite(crime_rate_per_100k),
!is.na(area_sq_miles)
)
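To make the deflation step concrete, an illustrative calculation (the CPI values here are made up, not taken from the series loaded above): a $500,000 nominal sale in a year with CPI 250, against a base-year CPI of 300, becomes
# Illustrative only: real_price = sale_price * (cpi_base / cpi_value)
500000 * (300 / 250)   # = 600000 in base-year (2023) dollars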
# View the final dataset
print(paste("Rows in final dataset:", nrow(final_df)))
## [1] "Rows in final dataset: 660046"
head(final_df)
## # A tibble: 6 × 27
## neighborhood building_class_category block lot address zip_code
## <chr> <chr> <dbl> <dbl> <chr> <chr>
## 1 BATHGATE 01 ONE FAMILY DWELLINGS 3028 25 412 EAST 179TH STR… 10457
## 2 BATHGATE 01 ONE FAMILY DWELLINGS 3039 28 2329 WASHINGTON AV… 10458
## 3 BATHGATE 01 ONE FAMILY DWELLINGS 3039 28 2329 WASHINGTON AV… 10458
## 4 BATHGATE 01 ONE FAMILY DWELLINGS 3046 39 2075 BATHGATE AVEN… 10457
## 5 BATHGATE 01 ONE FAMILY DWELLINGS 3046 52 2047 BATHGATE AVEN… 10457
## 6 BATHGATE 01 ONE FAMILY DWELLINGS 3048 53 4411 3 AVENUE 10457
## # ℹ 21 more variables: residential_units <dbl>, commercial_units <dbl>,
## # total_units <dbl>, land_square_feet <dbl>, gross_square_feet <dbl>,
## # year_built <dbl>, tax_class_at_time_of_sale <dbl>,
## # building_class_at_time_of_sale <chr>, sale_price <dbl>, source_file <chr>,
## # year <dbl>, borough <chr>, cpi_value <dbl>, total_crimes_zip <int>,
## # population <dbl>, area_sq_miles <dbl>, crime_density <dbl>,
## # crime_rate_per_100k <dbl>, real_price <dbl>, log_real_price <dbl>, …
Crime Rate Trends (2012–2023)
crime_summary <- crime_clean %>%
group_by(year, borough) %>%
summarize(total_crimes = n(), .groups = "drop")
# Map ZIPs to boroughs (keep the most common borough per ZIP so population is not double-counted)
zip_to_borough <- sales_clean %>%
  count(zip_code, borough) %>%
  group_by(zip_code) %>%
  slice_max(n, n = 1, with_ties = FALSE) %>%
  ungroup() %>%
  select(zip_code, borough)
population_long <- zip_population_all %>%
  inner_join(zip_to_borough, by = "zip_code") %>%
  group_by(year, borough) %>%
  summarize(population = sum(population, na.rm = TRUE), .groups = "drop")
# PLOT
crime_trend_data <- crime_summary %>%
left_join(population_long, by = c("year", "borough")) %>%
mutate(crime_rate_per_100k = (total_crimes / population) * 100000) %>%
filter(!is.na(crime_rate_per_100k))
ggplot(crime_trend_data, aes(x = year, y = crime_rate_per_100k, color = borough)) +
geom_line(linewidth = 1.2) +
geom_point(size = 2) +
geom_vline(xintercept = 2020, linetype = "dashed", color = "black", linewidth = 0.8) +
annotate("text", x = 2020.2, y = max(crime_trend_data$crime_rate_per_100k),
label = "Pandemic Start", hjust = 0, vjust = 1, fontface = "italic") +
scale_y_continuous(labels = comma) +
scale_x_continuous(breaks = 2012:2023) +
scale_color_brewer(palette = "Set1") +
labs(title = "Crime Rate Trends (2012–2023)",
y = "Crime Rate (per 100k)", x = "Year", color = "Borough") +
theme_minimal() +
theme(plot.title = element_text(face = "bold", size = 16))

The “Table 1”: Descriptive Statistics
# Table 1: Descriptive Statistics
table1_clean <- final_df %>%
select(borough, sale_price, real_price, crime_rate_per_100k, gross_square_feet) %>%
mutate(
sale_price = sale_price / 1000,
real_price = real_price / 1000
) %>%
group_by(borough) %>%
summarize(
Obs = n(),
`Nominal Price ($K)` = mean(sale_price, na.rm = TRUE),
`Real Price ($K)` = mean(real_price, na.rm = TRUE),
`Crime Rate` = mean(crime_rate_per_100k, na.rm = TRUE), # Shortened name
`Sq Ft` = mean(gross_square_feet, na.rm = TRUE)
) %>%
ungroup() %>%
# Pass to gt for formatting
gt() %>%
tab_header(
title = "Table 1: Descriptive Statistics by Borough",
subtitle = "Mean values (2012–2023)"
) %>%
# Format the integer columns (Price, Obs, Sq Ft) with commas and 0 decimals
fmt_number(
columns = c(Obs, `Nominal Price ($K)`, `Real Price ($K)`, `Sq Ft`),
decimals = 0,
use_seps = TRUE
) %>%
# Format Crime Rate
fmt_number(
columns = c(`Crime Rate`),
decimals = 2
) %>%
tab_options(
table.border.top.color = "black",
table.border.bottom.color = "black",
heading.align = "left"
)
# Display the table
table1_clean
| Table 1: Descriptive Statistics by Borough |
| Mean values (2012–2023) |
| borough       | Obs     | Nominal Price ($K) | Real Price ($K) | Crime Rate | Sq Ft |
| Bronx         | 54,160  | 1,076              | 1,235           | 6,419.29   | 6,885 |
| Brooklyn      | 170,637 | 1,399              | 1,598           | 5,319.05   | 3,975 |
| Manhattan     | 176,997 | 3,600              | 4,178           | 7,324.37   | 7,969 |
| Queens        | 195,663 | 970                | 1,101           | 4,185.83   | 2,003 |
| Staten Island | 62,589  | 603                | 690             | 4,265.28   | 1,814 |
The “Decoupling” Plot (Crime vs. Price Trends)
# Calculate Index (2012 = 100)
trend_index <- final_df %>%
group_by(year) %>%
summarize(
median_real_price = median(real_price, na.rm = TRUE),
mean_crime_rate = mean(crime_rate_per_100k, na.rm = TRUE),
.groups = "drop"
) %>%
arrange(year) %>%
mutate(
base_price = first(median_real_price),
base_crime = first(mean_crime_rate),
Price_Index = (median_real_price / base_price) * 100,
Crime_Index = (mean_crime_rate / base_crime) * 100
) %>%
pivot_longer(cols = c("Price_Index", "Crime_Index"),
names_to = "Metric",
values_to = "Value")
# Plot
ggplot(trend_index, aes(x = year, y = Value, color = Metric)) +
geom_hline(yintercept = 100, linetype = "dashed", color = "grey") +
geom_line(linewidth = 1.2) +
geom_point(size = 2) +
annotate("rect", xmin = 2020, xmax = 2023, ymin = -Inf, ymax = Inf, alpha = 0.1, fill = "red") +
annotate("text", x = 2021.5, y = 140, label = "Pandemic Era", fontface = "italic") +
scale_color_manual(values = c("Price_Index" = "#27AE60", "Crime_Index" = "#C0392B"),
                   labels = c("Price_Index" = "Real Housing Price", "Crime_Index" = "Crime Rate")) +
scale_x_continuous(breaks = min(trend_index$year):max(trend_index$year)) +
theme_minimal() +
labs(
title = "The Decoupling: Trends Indexed to Base Year",
y = "Growth Index", x = NULL, color = NULL
) +
theme(legend.position = "bottom")

The “Hotspot” Evolution (Crime Density)
# Prepare Density Data
lisa_prep_data <- crime_by_zip %>%
inner_join(zip_population_all, by = c("year", "zip_code")) %>%
mutate(
population = if_else(population < 100, NA_real_, population),
crime_rate = (total_crimes_zip / population) * 100000
) %>%
filter(!is.na(crime_rate), is.finite(crime_rate))
map_data_all <- crime_by_zip %>%
# Join with Area to calculate density
inner_join(zip_area, by = c("zip_code" = "ZCTA5CE20")) %>%
mutate(crime_density = total_crimes_zip / area_sq_miles) %>%
# Filter the year
filter(year %in% c(2012, 2016, 2023))
# Define Global Scale
global_limits <- range(map_data_all$crime_density, na.rm = TRUE)
# Mapping
plot_density_map_robust <- function(target_year) {
# Filter for the specific year
plot_sf <- map_data_all %>%
filter(year == target_year) %>%
inner_join(nyc_zips, by = c("zip_code" = "ZCTA5CE20")) %>%
st_as_sf()
ggplot(plot_sf) +
geom_sf(aes(fill = crime_density), color = NA) +
scale_fill_viridis_c(
option = "inferno",
direction = -1,
trans = "log10",
name = "Crimes/Sq Mi",
labels = comma,
# FORCE the scale to be consistent across all maps
limits = global_limits
) +
labs(title = paste(target_year)) +
theme_void() +
theme(
plot.title = element_text(hjust = 0.5, face = "bold", size = 14),
legend.position = "right"
)
}
# 3 Maps
m12 <- plot_density_map_robust(2012)
m16 <- plot_density_map_robust(2016)
m23 <- plot_density_map_robust(2023)
# Combine the maps
combined_plot <- (m12 + m16 + m23) +
plot_layout(guides = "collect") +
plot_annotation(
title = "Evolution of Crime Density in NYC (2012, 2016, 2023)",
theme = theme(plot.title = element_text(size = 18, face = "bold"))
)
combined_plot

Evolution of Spatial Crime Clusters (LISA Analysis)
# LISA map helper: compute Local Moran's I and classify clusters for one year
get_lisa_map <- function(target_year, show_legend = FALSE) {
# Filter & Join Geometry
data_yr <- lisa_prep_data %>%
filter(year == target_year) %>%
inner_join(nyc_zips, by = c("zip_code" = "ZCTA5CE20")) %>%
st_as_sf() %>%
st_transform(2263) %>%
drop_na(crime_rate)
# Calculate Neighbors
coords <- st_coordinates(st_centroid(st_geometry(data_yr)))
knn <- knearneigh(coords, k = 4)
nb <- knn2nb(knn)
lw <- nb2listw(nb, style = "W", zero.policy = TRUE)
local_moran <- localmoran(data_yr$crime_rate, lw, zero.policy = TRUE)
# Classify Clusters
mean_val <- mean(data_yr$crime_rate, na.rm = TRUE)
data_yr <- data_yr %>%
mutate(
Ii = local_moran[, 1],
p_value = local_moran[, 5],
lag_val = lag.listw(lw, crime_rate, zero.policy = TRUE),
cluster = case_when(
p_value >= 0.05 ~ "Not Significant",
crime_rate > mean_val & lag_val > mean(lag_val) ~ "High-High",
crime_rate < mean_val & lag_val < mean(lag_val) ~ "Low-Low",
crime_rate > mean_val & lag_val < mean(lag_val) ~ "High-Low",
crime_rate < mean_val & lag_val > mean(lag_val) ~ "Low-High"
)
) %>%
mutate(cluster = factor(cluster, levels = c("High-High", "Low-Low", "Low-High", "High-Low", "Not Significant")))
p <- ggplot(data_yr) +
geom_sf(aes(fill = cluster), color = "white", size = 0.1) +
scale_fill_manual(
  values = c(
    "High-High" = "#D32F2F",        # Red
    "Low-Low" = "#1976D2",          # Blue
    "Low-High" = "#388E3C",         # Green
    "High-Low" = "#F57C00",         # Orange (needed because drop = FALSE keeps all five levels)
    "Not Significant" = "grey90"
  ),
  drop = FALSE,
  name = "Cluster Type"
) +
labs(title = paste(target_year)) +
theme_void() +
theme(
plot.title = element_text(hjust = 0.5, face = "bold")
)
if (show_legend == TRUE) {
p <- p + theme(legend.position = "right")
} else {
p <- p + theme(legend.position = "none")
}
return(p)
}
# Generate Maps
lisa_12 <- get_lisa_map(2012, show_legend = FALSE)
lisa_16 <- get_lisa_map(2016, show_legend = FALSE)
lisa_23 <- get_lisa_map(2023, show_legend = TRUE)
# Combine all the maps
(lisa_12 + lisa_16 + lisa_23) +
plot_annotation(
title = "Evolution of Spatial Crime Clusters (LISA Analysis)",
theme = theme(plot.title = element_text(size = 16, face = "bold"))
)
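As a hedged companion check (not part of the original analysis), the same kNN weights can feed a global Moran's I test for a single year, summarizing overall clustering in one statistic; the sketch below mirrors the steps inside get_lisa_map() for 2023:
# Global Moran's I for 2023 crime rates, reusing the LISA preparation above
data_23 <- lisa_prep_data %>%
  filter(year == 2023) %>%
  inner_join(nyc_zips, by = c("zip_code" = "ZCTA5CE20")) %>%
  st_as_sf() %>%
  st_transform(2263) %>%
  drop_na(crime_rate)
coords_23 <- st_coordinates(st_centroid(st_geometry(data_23)))
lw_23 <- nb2listw(knn2nb(knearneigh(coords_23, k = 4)), style = "W", zero.policy = TRUE)
moran.test(data_23$crime_rate, lw_23, zero.policy = TRUE)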

Model 1: Baseline OLS
# Model 1: Baseline OLS
options(scipen = 999)
model_data <- final_df %>%
mutate(
borough = as.factor(borough),
year = as.factor(year)
)
model_h1 <- lm(
log_real_price ~ crime_rate_per_100k +
gross_square_feet +
year_built +
residential_units +
borough +
year,
data = model_data
)
summary(model_h1)
##
## Call:
## lm(formula = log_real_price ~ crime_rate_per_100k + gross_square_feet +
## year_built + residential_units + borough + year, data = model_data)
##
## Residuals:
## Min 1Q Median 3Q Max
## -26.0106 -0.2280 0.2824 0.7201 8.6364
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 12.15609555168 0.01804680998 673.587 < 0.0000000000000002
## crime_rate_per_100k -0.00001958727 0.00000096175 -20.366 < 0.0000000000000002
## gross_square_feet 0.00000061722 0.00000009964 6.194 0.000000000586
## year_built 0.00026089545 0.00000645467 40.420 < 0.0000000000000002
## residential_units 0.00225447813 0.00014324529 15.739 < 0.0000000000000002
## boroughBrooklyn 0.48744968792 0.01083163948 45.002 < 0.0000000000000002
## boroughManhattan 1.10879200787 0.01118918449 99.095 < 0.0000000000000002
## boroughQueens 0.16686318947 0.01081868178 15.424 < 0.0000000000000002
## boroughStaten Island 0.03463129772 0.01262720561 2.743 0.0061
## year2014 0.13419605249 0.01137090646 11.802 < 0.0000000000000002
## year2015 0.20115412608 0.01126401058 17.858 < 0.0000000000000002
## year2016 0.29977836543 0.01140061410 26.295 < 0.0000000000000002
## year2017 0.40831714070 0.01141265384 35.778 < 0.0000000000000002
## year2018 0.43444095788 0.01159497034 37.468 < 0.0000000000000002
## year2019 0.54261703170 0.01299985325 41.740 < 0.0000000000000002
## year2020 0.49656551724 0.01407137721 35.289 < 0.0000000000000002
## year2021 0.60221719428 0.01399993140 43.016 < 0.0000000000000002
## year2022 0.60053238357 0.01428131167 42.050 < 0.0000000000000002
## year2023 0.40001275002 0.01560075305 25.641 < 0.0000000000000002
##
## (Intercept) ***
## crime_rate_per_100k ***
## gross_square_feet ***
## year_built ***
## residential_units ***
## boroughBrooklyn ***
## boroughManhattan ***
## boroughQueens ***
## boroughStaten Island **
## year2014 ***
## year2015 ***
## year2016 ***
## year2017 ***
## year2018 ***
## year2019 ***
## year2020 ***
## year2021 ***
## year2022 ***
## year2023 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 1.996 on 521488 degrees of freedom
## (138539 observations deleted due to missingness)
## Multiple R-squared: 0.04108, Adjusted R-squared: 0.04105
## F-statistic: 1241 on 18 and 521488 DF, p-value: < 0.00000000000000022
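To read the crime coefficient on the log scale, a small hedged helper using broom (already loaded); it simply re-expresses the estimate above as an approximate percent change in real price per 1,000-point increase in the crime rate, with no new estimation:
# Percent change in real price implied by a 1,000-unit rise in crimes per 100k (Model 1)
broom::tidy(model_h1) %>%
  filter(term == "crime_rate_per_100k") %>%
  mutate(pct_change_per_1k = (exp(estimate * 1000) - 1) * 100)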
# Model 2: DiD (The Pandemic Structural Break)
model_h2 <- lm(
log_real_price ~ crime_rate_per_100k * pandemic +
gross_square_feet +
year_built +
residential_units +
borough,
data = model_data
)
summary(model_h2)
##
## Call:
## lm(formula = log_real_price ~ crime_rate_per_100k * pandemic +
## gross_square_feet + year_built + residential_units + borough,
## data = model_data)
##
## Residuals:
## Min 1Q Median 3Q Max
## -25.8224 -0.2339 0.2897 0.7273 8.6984
##
## Coefficients:
## Estimate Std. Error t value
## (Intercept) 12.4362981074 0.0165056618 753.456
## crime_rate_per_100k -0.0000249441 0.0000010095 -24.709
## pandemic 0.1516218900 0.0150886632 10.049
## gross_square_feet 0.0000005036 0.0000001001 5.030
## year_built 0.0002828456 0.0000064450 43.886
## residential_units 0.0023470944 0.0001436596 16.338
## boroughBrooklyn 0.4737427706 0.0108831690 43.530
## boroughManhattan 1.0714170642 0.0112320121 95.390
## boroughQueens 0.1511611952 0.0108679184 13.909
## boroughStaten Island 0.0394348526 0.0127305841 3.098
## crime_rate_per_100k:pandemic 0.0000198132 0.0000027249 7.271
## Pr(>|t|)
## (Intercept) < 0.0000000000000002 ***
## crime_rate_per_100k < 0.0000000000000002 ***
## pandemic < 0.0000000000000002 ***
## gross_square_feet 0.000000491122439 ***
## year_built < 0.0000000000000002 ***
## residential_units < 0.0000000000000002 ***
## boroughBrooklyn < 0.0000000000000002 ***
## boroughManhattan < 0.0000000000000002 ***
## boroughQueens < 0.0000000000000002 ***
## boroughStaten Island 0.00195 **
## crime_rate_per_100k:pandemic 0.000000000000357 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 2.001 on 521496 degrees of freedom
## (138539 observations deleted due to missingness)
## Multiple R-squared: 0.03558, Adjusted R-squared: 0.03556
## F-statistic: 1924 on 10 and 521496 DF, p-value: < 0.00000000000000022
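The positive interaction term says the crime slope attenuates after 2020; a hedged way to see the two slopes side by side is the marginaleffects package loaded above (a sketch, not output from the original analysis):
# Average slope of crime_rate_per_100k on log real price, before vs. during the pandemic
avg_slopes(model_h2, variables = "crime_rate_per_100k", by = "pandemic")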
# Define one shared dataset for the ML models
ml_data <- final_df %>%
select(log_real_price, crime_rate_per_100k, gross_square_feet,
year_built, residential_units, pandemic, borough) %>%
drop_na() %>%
mutate(borough = as.factor(borough))
# Train/Test Split
set.seed(123)
train_index <- createDataPartition(ml_data$log_real_price, p = 0.8, list = FALSE)
train_data <- ml_data[train_index, ]
test_data <- ml_data[-train_index, ]
Model 3: XGBoost (Tuned)
# Model 3: XGBoost (Tuned)
X_train_mat <- model.matrix(log_real_price ~ . - 1, data = train_data)
y_train_vec <- train_data$log_real_price
X_test_mat <- model.matrix(log_real_price ~ . - 1, data = test_data)
y_test_vec <- test_data$log_real_price
xgb_params <- list(
objective = "reg:squarederror",
eta = 0.05,
max_depth = 6,
subsample = 0.7,
colsample_bytree = 0.7
)
set.seed(123)
xgb_model <- xgb.train(
params = xgb_params,
data = xgb.DMatrix(data = X_train_mat, label = y_train_vec),
nrounds = 500,
watchlist = list(train = xgb.DMatrix(data = X_train_mat, label = y_train_vec)),
print_every_n = 100,
early_stopping_rounds = 20,
verbose = 0
)
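The 20% hold-out built earlier is never scored in the code above; a hedged evaluation sketch using only the objects already defined:
# Hold-out performance of the tuned XGBoost model
pred_test <- predict(xgb_model, xgb.DMatrix(data = X_test_mat))
rmse_test <- sqrt(mean((y_test_vec - pred_test)^2))
r2_test   <- 1 - sum((y_test_vec - pred_test)^2) / sum((y_test_vec - mean(y_test_vec))^2)
c(RMSE = rmse_test, R_squared = r2_test)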
SHAP
# Prepare Matrix
X_matrix <- model.matrix(log_real_price ~ . - 1, data = ml_data)
# Calculate SHAP
shap_values <- shap.values(xgb_model = xgb_model, X_train = X_matrix)
shap_long <- shap.prep(shap_contrib = shap_values$shap_score, X_train = X_matrix)
# Plot
shap.plot.summary(shap_long) +
ggtitle("What Drives NYC Housing Prices?",
subtitle = "XGBoost Feature Importance (SHAP)") +
theme_minimal(base_size = 14) +
scale_color_gradientn(
colors = c("yellow", "purple"), # yellow = low, purple = high
name = "Feature Value\nLow → High"
) +
theme(
legend.position = "right",
legend.title = element_text(size = 14, face = "bold"),
legend.text = element_text(size = 12),
plot.title = element_text(size = 18, face = "bold"),
plot.subtitle = element_text(size = 14)
)
## Scale for colour is already present.
## Adding another scale for colour, which will replace the existing scale.
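Beyond the summary plot, a single-feature view can show how the crime signal varies with its own value; a hedged sketch with SHAPforxgboost's dependence plot (colouring by gross_square_feet is an arbitrary illustrative choice):
# SHAP dependence for the crime-rate feature, from the matrices prepared above
shap.plot.dependence(data_long = shap_long,
                     x = "crime_rate_per_100k",
                     color_feature = "gross_square_feet")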

Shiny App
# Shiny App
# UI: The Visual Layout
ui <- fluidPage(
theme = bs_theme(version = 4, bootswatch = "minty"),
# Title
titlePanel("NYC Housing & Crime: A Thesis Dashboard"),
sidebarLayout(
sidebarPanel(
  width = 3,
  h4("Data Controls"),
  hr(),
  # Inputs referenced by the server's filtered_data() reactive
  checkboxGroupInput("boro_select", "Boroughs:",
                     choices = sort(unique(final_df$borough)),
                     selected = sort(unique(final_df$borough))),
  sliderInput("year_range", "Sale Years:",
              min = 2012, max = 2023, value = c(2012, 2023), step = 1, sep = ""),
  helpText("Note: The 'Models' tabs display the static results from the thesis analysis."),
  div(style = "text-align:center;",
      actionButton("update", "Update View", class = "btn-primary"))
),
mainPanel(
tabsetPanel(
tabPanel("Geospatial Map", icon = icon("map"),
br(),
p("Visualizing Crime Density (Crimes per Sq. Mile) by Zip Code"),
sliderInput("map_year", "Select Map Year:",
min = 2012, max = 2023, value = 2023,
step = 1, animate = TRUE),
leafletOutput("crime_map", height = "600px")
),
tabPanel("Data Table", icon = icon("table"),
br(),
DTOutput("raw_table")
)
)
)
)
)
server <- function(input, output, session) {
filtered_data <- reactive({
req(input$update)
isolate({
final_df %>%
filter(borough %in% input$boro_select,
year >= input$year_range[1],
year <= input$year_range[2])
})
})
output$plot_boro_trends <- renderPlotly({
p <- ggplot(filtered_data(), aes(x = year, y = crime_rate_per_100k, color = borough)) +
  geom_smooth(se = FALSE) +
  labs(title = "Smoothed Crime Trends by Borough", y = "Crime Rate (per 100k)") +
theme_minimal()
ggplotly(p)
})
output$crime_map <- renderLeaflet({
target_year <- input$map_year
map_df <- crime_by_zip %>%
filter(year == target_year) %>%
inner_join(zip_area, by=c("zip_code"="ZCTA5CE20")) %>%
mutate(density = total_crimes_zip / area_sq_miles)
# Join with Geometry
map_sf <- nyc_zips %>%
inner_join(map_df, by=c("ZCTA5CE20"="zip_code")) %>%
st_transform(4326)
# Palette
pal <- colorNumeric(palette = "YlOrRd", domain = map_sf$density)
leaflet(map_sf) %>%
addProviderTiles(providers$CartoDB.Positron) %>%
addPolygons(
fillColor = ~pal(density),
weight = 1, color = "white", fillOpacity = 0.7,
popup = ~paste0("<b>Zip:</b> ", ZCTA5CE20, "<br>",
"<b>Crimes/SqMi:</b> ", round(density, 1))
) %>%
addLegend("bottomright", pal = pal, values = ~density,
title = paste("Crime Density", target_year))
})
output$plot_structural_break <- renderPlot({
ggplot(final_df, aes(x = crime_rate_per_100k, y = log_real_price, color = (year >= 2020))) +
  geom_point(alpha = 0.1) +
  geom_smooth(method = "lm", linewidth = 2) +
scale_color_manual(values = c("TRUE"="#D35400", "FALSE"="#2980B9"),
labels = c("Pre-Pandemic", "Post-Pandemic"), name="Era") +
labs(title = "Visualizing the Structural Break (Interaction)",
subtitle = "Steeper slope = Higher sensitivity to crime",
x = "Crime Rate", y = "Log Real Price") +
theme_minimal(base_size = 14)
})
output$plot_quantile <- renderPlot({
taus <- c(0.1, 0.25, 0.5, 0.75, 0.9)
# Fast re-run for the plot
mq_app <- rq(log_real_price ~ crime_rate_per_100k + gross_square_feet + borough,
             data = final_df, tau = taus)
plot(summary(mq_app), parm = "crime_rate_per_100k",
main="Impact of Crime across Price Quantiles")
})
output$plot_shap <- renderPlot({
req(exists("shap_long"))
shap.plot.summary(shap_long) +
theme_minimal() +
ggtitle("XGBoost Feature Importance")
})
output$raw_table <- renderDT({
datatable(head(filtered_data(), 100), options = list(scrollX = TRUE))
})
}
# Run the app
shinyApp(ui = ui, server = server)
Shiny applications not supported in static R Markdown documents