Libraries
library(readr)
library(rmdformats)
library(prettydoc)
library(shiny)
library(FNN)
library(bslib)
library(formattable)
library(ggplot2)
library(tidyverse)
library(shiny)
library(leaflet)
library(sf)
library(DT)
library(RColorBrewer)
library(viridis)
library(shiny)
library(kableExtra)
library(ggplot2)
library(dplyr)
library(plotly)
library(sf)
library(scales)
library(lubridate)
library(purrr)
library(rsample)
library(xgboost)
library(scales)
library(sf)
library(janitor)
library(readxl)
library(dplyr)
library(tidyr)
library(patchwork)
library(Metrics)
library(xgboost)
library(dplyr)
library(caret)
library(corrplot)
library(shiny)
library(rsconnect)
# sales_2017 <- read_excel("2017_statenisland.xls", skip = 4, col_types = "text") %
# clean_names() %>%
# mutate(year = 2017)
#sales_2018 <- read_excel("2018_statenisland.xlsx", skip = 4, col_types = "text")
# %>%
# clean_names()%>%
# mutate(year = 2018)
# sales_2019 <- read_excel("2019_statenisland.xlsx", skip = 4, col_types = "text") # %>%
# clean_names()%>%
# mutate(year = 2019)
# sales_2020 <- read_excel("2020_staten_island.xlsx", skip = 6, col_types = "text") # %>%
# clean_names()%>%
# mutate(year = 2020)
# sales_2021 <- read_excel("2021_staten_island.xlsx", skip = 6, col_types = "text") %>%
# clean_names()%>%
# mutate(year = 2021)
# sales_2022 <- read_excel("2022_staten_island.xlsx", skip = 6, col_types = "text") # %>%
# clean_names()%>%
# mutate(year = 2022)
# sales_2023 <- read_excel("2023_staten_island.xlsx", skip = 6, col_types = "text") # %>%
# clean_names()%>%
# mutate(year = 2023)
# sales_2024 <- read_excel("2024_staten_island.xlsx", skip = 6, col_types = "text") %>%
# clean_names()%>%
# mutate(year = 2024)
# sales_2025 <- read_excel("sales_2025.xlsx", skip = 4, col_types = "text") %>%
# clean_names()%>%
# mutate(year = 2025)
# ------------------------------------------------------------------
# SALES DATA – REPRODUCIBILITY NOTE
#
# Original Staten Island sales files were provided as Excel workbooks
# with metadata rows and formatting issues.
#
# These files were cleaned, standardized and exported to CSV format
# to ensure consistency, lighter storage and reproducibility.
#
# The cleaned CSV files are hosted on GitHub and loaded directly below.
# The original Excel-to-CSV cleaning process is documented directly above
# ------------------------------------------------------------------
sales_2017 <- read_csv(
"https://raw.githubusercontent.com/NikoletaEm/Capstone/main/DataUsed/2017_statenisland.csv")
sales_2018 <- read_csv(
"https://raw.githubusercontent.com/NikoletaEm/Capstone/main/DataUsed/2018_statenisland.csv")
sales_2019 <- read_csv(
"https://raw.githubusercontent.com/NikoletaEm/Capstone/main/DataUsed/2019_statenisland.csv")
sales_2020 <- read_csv(
"https://raw.githubusercontent.com/NikoletaEm/Capstone/main/DataUsed/2020_staten_island.csv")
sales_2021 <- read_csv(
"https://raw.githubusercontent.com/NikoletaEm/Capstone/main/DataUsed/2021_staten_island.csv")
sales_2022 <- read_csv(
"https://raw.githubusercontent.com/NikoletaEm/Capstone/main/DataUsed/2022_staten_island.csv")
sales_2023 <- read_csv(
"https://raw.githubusercontent.com/NikoletaEm/Capstone/main/DataUsed/2023_staten_island.csv")
sales_2024 <- read_csv(
"https://raw.githubusercontent.com/NikoletaEm/Capstone/main/DataUsed/2024_staten_island.csv")
sales_2025 <- read_csv(
"https://raw.githubusercontent.com/NikoletaEm/Capstone/main/DataUsed/sales_2025.csv")
si_sales_raw <- bind_rows(
sales_2017, sales_2018, sales_2019, sales_2020,
sales_2021, sales_2022, sales_2023, sales_2024, sales_2025)
head(si_sales_raw)
## # A tibble: 6 × 27
## borough neighborhood building_class_categ…¹ tax_class_as_of_fina…² block lot
## <dbl> <chr> <chr> <chr> <dbl> <dbl>
## 1 5 ANNADALE 01 ONE FAMILY DWELLIN… 1 5395 19
## 2 5 ANNADALE 01 ONE FAMILY DWELLIN… 1 5407 10
## 3 5 ANNADALE 01 ONE FAMILY DWELLIN… 1 5407 39
## 4 5 ANNADALE 01 ONE FAMILY DWELLIN… 1 5426 32
## 5 5 ANNADALE 01 ONE FAMILY DWELLIN… 1 6205 15
## 6 5 ANNADALE 01 ONE FAMILY DWELLIN… 1 6205 22
## # ℹ abbreviated names: ¹building_class_category,
## # ²tax_class_as_of_final_roll_17_18
## # ℹ 21 more variables: ease_ment <lgl>,
## # building_class_as_of_final_roll_17_18 <chr>, address <chr>,
## # apartment_number <chr>, zip_code <dbl>, residential_units <dbl>,
## # commercial_units <dbl>, total_units <dbl>, land_square_feet <dbl>,
## # gross_square_feet <dbl>, year_built <dbl>, …
## spc_tbl_ [74,985 × 27] (S3: spec_tbl_df/tbl_df/tbl/data.frame)
## $ borough : num [1:74985] 5 5 5 5 5 5 5 5 5 5 ...
## $ neighborhood : chr [1:74985] "ANNADALE" "ANNADALE" "ANNADALE" "ANNADALE" ...
## $ building_class_category : chr [1:74985] "01 ONE FAMILY DWELLINGS" "01 ONE FAMILY DWELLINGS" "01 ONE FAMILY DWELLINGS" "01 ONE FAMILY DWELLINGS" ...
## $ tax_class_as_of_final_roll_17_18 : chr [1:74985] "1" "1" "1" "1" ...
## $ block : num [1:74985] 5395 5407 5407 5426 6205 ...
## $ lot : num [1:74985] 19 10 39 32 15 22 58 71 6 14 ...
## $ ease_ment : logi [1:74985] NA NA NA NA NA NA ...
## $ building_class_as_of_final_roll_17_18: chr [1:74985] "A1" "A2" "A2" "A6" ...
## $ address : chr [1:74985] "4 EDWIN STREET" "112 ELMBANK STREET" "193 BATHGATE STREET" "3 OCEAN DRIVEWAY" ...
## $ apartment_number : chr [1:74985] NA NA NA NA ...
## $ zip_code : num [1:74985] 10312 10312 10312 10312 10312 ...
## $ residential_units : num [1:74985] 1 1 1 1 1 1 1 1 1 1 ...
## $ commercial_units : num [1:74985] 0 0 0 0 0 0 0 0 0 0 ...
## $ total_units : num [1:74985] 1 1 1 1 1 1 1 1 1 1 ...
## $ land_square_feet : num [1:74985] 7258 6242 5000 2500 1546 ...
## $ gross_square_feet : num [1:74985] 2230 1768 808 540 1579 ...
## $ year_built : num [1:74985] 1980 1975 1920 1910 1986 ...
## $ tax_class_at_time_of_sale : num [1:74985] 1 1 1 1 1 1 1 1 1 1 ...
## $ building_class_at_time_of_sale : chr [1:74985] "A1" "A2" "A2" "A6" ...
## $ sale_price : num [1:74985] 866000 735000 350000 0 475000 ...
## $ sale_date : num [1:74985] 42950 43046 42933 42889 42985 ...
## $ year : num [1:74985] 2017 2017 2017 2017 2017 ...
## $ tax_class_as_of_final_roll_18_19 : chr [1:74985] NA NA NA NA ...
## $ building_class_as_of_final_roll_18_19: chr [1:74985] NA NA NA NA ...
## $ tax_class_at_present : chr [1:74985] NA NA NA NA ...
## $ building_class_at_present : chr [1:74985] NA NA NA NA ...
## $ easement : logi [1:74985] NA NA NA NA NA NA ...
## - attr(*, "spec")=
## .. cols(
## .. borough = col_double(),
## .. neighborhood = col_character(),
## .. building_class_category = col_character(),
## .. tax_class_as_of_final_roll_17_18 = col_character(),
## .. block = col_double(),
## .. lot = col_double(),
## .. ease_ment = col_logical(),
## .. building_class_as_of_final_roll_17_18 = col_character(),
## .. address = col_character(),
## .. apartment_number = col_character(),
## .. zip_code = col_double(),
## .. residential_units = col_double(),
## .. commercial_units = col_double(),
## .. total_units = col_double(),
## .. land_square_feet = col_double(),
## .. gross_square_feet = col_double(),
## .. year_built = col_double(),
## .. tax_class_at_time_of_sale = col_double(),
## .. building_class_at_time_of_sale = col_character(),
## .. sale_price = col_double(),
## .. sale_date = col_double(),
## .. year = col_double()
## .. )
## - attr(*, "problems")=<externalptr>
# initial dataset clean
si_sales_clean <- si_sales_raw %>%
mutate(
sale_price = as.numeric(sale_price),
residential_units = as.numeric(residential_units),
year_built = as.numeric(year_built)
) %>%
filter(!is.na(sale_price) & sale_price > 10000) %>%
filter(grepl("FAMILY DWELLING", building_class_category)) %>%
select(
year,
neighborhood,
zip_code,
residential_units,
year_built,
sale_price,
land_square_feet
)
head(si_sales_clean)
## # A tibble: 6 × 7
## year neighborhood zip_code residential_units year_built sale_price
## <dbl> <chr> <dbl> <dbl> <dbl> <dbl>
## 1 2017 ANNADALE 10312 1 1980 866000
## 2 2017 ANNADALE 10312 1 1975 735000
## 3 2017 ANNADALE 10312 1 1920 350000
## 4 2017 ANNADALE 10312 1 1986 475000
## 5 2017 ANNADALE 10312 1 1986 437500
## 6 2017 ANNADALE 10312 1 1986 354900
## # ℹ 1 more variable: land_square_feet <dbl>
## tibble [40,690 × 7] (S3: tbl_df/tbl/data.frame)
## $ year : num [1:40690] 2017 2017 2017 2017 2017 ...
## $ neighborhood : chr [1:40690] "ANNADALE" "ANNADALE" "ANNADALE" "ANNADALE" ...
## $ zip_code : num [1:40690] 10312 10312 10312 10312 10312 ...
## $ residential_units: num [1:40690] 1 1 1 1 1 1 1 1 1 1 ...
## $ year_built : num [1:40690] 1980 1975 1920 1986 1986 ...
## $ sale_price : num [1:40690] 866000 735000 350000 475000 437500 ...
## $ land_square_feet : num [1:40690] 7258 6242 5000 1546 1546 ...
## year neighborhood zip_code residential_units
## Min. :2017 Length:40690 Min. : 0 Min. :0.000
## 1st Qu.:2018 Class :character 1st Qu.:10305 1st Qu.:1.000
## Median :2021 Mode :character Median :10308 Median :1.000
## Mean :2021 Mean :10306 Mean :1.285
## 3rd Qu.:2023 3rd Qu.:10312 3rd Qu.:2.000
## Max. :2025 Max. :10314 Max. :4.000
## NA's :2
## year_built sale_price land_square_feet
## Min. : 0 Min. : 11500 Min. : 0
## 1st Qu.:1950 1st Qu.: 480000 1st Qu.: 2400
## Median :1975 Median : 610000 Median : 3500
## Mean :1965 Mean : 651616 Mean : 3977
## 3rd Qu.:1995 3rd Qu.: 765000 3rd Qu.: 4700
## Max. :2025 Max. :17000000 Max. :88945
## NA's :42 NA's :2
# summary table of home sales by neighborhood and year
si_neighborhood_summary <- si_sales_clean %>%
group_by(year, neighborhood) %>%
summarise(
avg_sale_price = mean(sale_price, na.rm = TRUE),
min_sale_price = min(sale_price, na.rm = TRUE),
max_sale_price = max(sale_price, na.rm = TRUE),
number_of_sales = n(),
.groups = 'drop'
)
head(si_neighborhood_summary)
## # A tibble: 6 × 6
## year neighborhood avg_sale_price min_sale_price max_sale_price
## <dbl> <chr> <dbl> <dbl> <dbl>
## 1 2017 ANNADALE 686084. 42000 1320000
## 2 2017 ARDEN HEIGHTS 423725. 157170 1075000
## 3 2017 ARROCHAR 564824. 200000 1150000
## 4 2017 ARROCHAR-SHORE ACRES 495400 150000 755000
## 5 2017 BULLS HEAD 500105. 70000 895000
## 6 2017 CASTLETON CORNERS 534650. 110000 1150000
## # ℹ 1 more variable: number_of_sales <int>
## some eda on that
cat("--- Summary of the si_sales_clean dataset ---\n")
## --- Summary of the si_sales_clean dataset ---
## year neighborhood zip_code residential_units
## Min. :2017 Length:40690 Min. : 0 Min. :0.000
## 1st Qu.:2018 Class :character 1st Qu.:10305 1st Qu.:1.000
## Median :2021 Mode :character Median :10308 Median :1.000
## Mean :2021 Mean :10306 Mean :1.285
## 3rd Qu.:2023 3rd Qu.:10312 3rd Qu.:2.000
## Max. :2025 Max. :10314 Max. :4.000
## NA's :2
## year_built sale_price land_square_feet
## Min. : 0 Min. : 11500 Min. : 0
## 1st Qu.:1950 1st Qu.: 480000 1st Qu.: 2400
## Median :1975 Median : 610000 Median : 3500
## Mean :1965 Mean : 651616 Mean : 3977
## 3rd Qu.:1995 3rd Qu.: 765000 3rd Qu.: 4700
## Max. :2025 Max. :17000000 Max. :88945
## NA's :42 NA's :2
# Get the structure of the dataset to check data types.
cat("\n--- Structure of the si_sales_clean dataset ---\n")
##
## --- Structure of the si_sales_clean dataset ---
## tibble [40,690 × 7] (S3: tbl_df/tbl/data.frame)
## $ year : num [1:40690] 2017 2017 2017 2017 2017 ...
## $ neighborhood : chr [1:40690] "ANNADALE" "ANNADALE" "ANNADALE" "ANNADALE" ...
## $ zip_code : num [1:40690] 10312 10312 10312 10312 10312 ...
## $ residential_units: num [1:40690] 1 1 1 1 1 1 1 1 1 1 ...
## $ year_built : num [1:40690] 1980 1975 1920 1986 1986 ...
## $ sale_price : num [1:40690] 866000 735000 350000 475000 437500 ...
## $ land_square_feet : num [1:40690] 7258 6242 5000 1546 1546 ...
## Note for later: we got 2 NAs at residential_units and 27 at year_built
# Plot 1: Distribution of Sale Prices
# This histogram shows the frequency of sales at different price points.
ggplot(si_sales_clean, aes(x = sale_price)) +
geom_histogram(bins = 50, fill = "purple", color = "black") +
scale_x_log10(labels = scales::dollar_format()) +
labs(
title = "Distribution of Staten Island Home Sale Prices (2017-2025)",
x = "Sale Price (Log Scale)",
y = "Number of Sales"
) +
theme_minimal()

# Plot 2: Median Sale Price Over Time
# This line chart shows how the overall market has changed year over year.
si_yearly_median <- si_sales_clean %>%
group_by(year) %>%
summarise(median_sale_price = median(sale_price, na.rm = TRUE))
ggplot(si_yearly_median, aes(x = year, y = median_sale_price, group = 1)) +
geom_line(color = "navy", size = 1.5) +
geom_point(color = "navy", size = 3) +
scale_y_continuous(labels = scales::dollar_format()) +
labs(
title = "Median Home Sale Price in Staten Island (2017-2025)",
x = "Year",
y = "Median Sale Price"
) +
theme_minimal()
## Warning: Using `size` aesthetic for lines was deprecated in ggplot2 3.4.0.
## ℹ Please use `linewidth` instead.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.

# Plot 3: Number of Sales Per Year
# This bar chart shows the volume of sales activity over time.
ggplot(si_neighborhood_summary, aes(x = year, y = number_of_sales)) +
stat_summary(fun = "sum", geom = "bar", fill = "steelblue") +
labs(
title = "Total Number of Residential Sales in Staten Island (2017-2025)",
x = "Year",
y = "Total Sales"
) +
theme_minimal()

Mapping of Staten Island sales
nta_map <- st_read("nynta2020_25c/nynta2020.shp")
## Reading layer `nynta2020' from data source
## `C:\Users\nicol\OneDrive\Έγγραφα\nynta2020_25c\nynta2020.shp'
## using driver `ESRI Shapefile'
## Simple feature collection with 262 features and 11 fields
## Geometry type: MULTIPOLYGON
## Dimension: XY
## Bounding box: xmin: 913175.1 ymin: 120128.4 xmax: 1067383 ymax: 272844.3
## Projected CRS: NAD83 / New York Long Island (ftUS)
si_map <- nta_map %>%
filter(BoroName == "Staten Island")
si_sales_clean <- si_sales_clean %>%
mutate(neighborhood_upper = toupper(trimws(neighborhood)))
si_map <- si_map %>%
mutate(NTAName = if_else(NTAName == "Snug Harbor", "St. George-New Brighton", NTAName)) %>%
mutate(NTAName = if_else(NTAName == "Great Kills Park", "Great Kills-Eltingville", NTAName))%>%
mutate(NTAName = if_else(NTAName == "Miller Field", "New Dorp-Midland Beach", NTAName))
# df same as si_sales_clean but with the ntas
si_sales_mapped <- si_sales_clean %>%
mutate(
nta_name = case_when(
grepl("ARDEN HEIGHTS|ROSSVILLE|ROSSVILLE-CHARLESTON|ROSSVILLE-RICHMOND VALLEY", neighborhood_upper) ~ "Arden Heights-Rossville",
grepl("ARROCHAR|ARROCHAR-SHORE ACRES|DONGAN HILLS|DONGAN HILLS-COLONY|DONGAN HILLS-OLD TOWN|GRASMERE|SOUTH BEACH", neighborhood_upper) ~ "Grasmere-Arrochar-South Beach-Dongan Hills",
grepl("ANNADALE|HUGUENOT|PRINCES BAY|WOODROW|PLEASANT PLAINS", neighborhood_upper) ~ "Annadale-Huguenot-Prince's Bay-Woodrow",
grepl("BULLS HEAD|NEW SPRINGVILLE|TRAVIS|WILLOWBROOK|BLOOMFIELD", neighborhood_upper) ~ "New Springville-Willowbrook-Bulls Head-Travis",
grepl("CASTLETON CORNERS|WESTERLEIGH|CLOVE LAKES|SUNNYSIDE", neighborhood_upper) ~ "Westerleigh-Castleton Corners",
grepl("CONCORD|CONCORD-FOX HILLS|STAPLETON|STAPLETON-CLIFTON|TOMPKINSVILLE", neighborhood_upper) ~ "Tompkinsville-Stapleton-Clifton-Fox Hills",
grepl("ELTINGVILLE|GREAT KILLS|GREAT KILLS-BAY TERRACE", neighborhood_upper) ~ "Great Kills-Eltingville",
grepl("GRYMES HILL|SILVER LAKE|WEST NEW BRIGHTON|LIVINGSTON", neighborhood_upper) ~ "West New Brighton-Silver Lake-Grymes Hill",
grepl("EMERSON HILL|MANOR HEIGHTS|TODT HILL", neighborhood_upper) ~ "Todt Hill-Emerson Hill-Lighthouse Hill-Manor Heights",
grepl("PORT RICHMOND", neighborhood_upper) ~ "Port Richmond",
grepl("TOTTENVILLE", neighborhood_upper) ~ "Tottenville-Charleston",
grepl("OAKWOOD|OAKWOOD-BEACH|RICHMONDTOWN|RICHMONDTOWN-LIGHTHS HILL", neighborhood_upper) ~ "Oakwood-Richmondtown",
grepl("NEW DORP|MIDLAND BEACH|NEW DORP-BEACH|NEW DORP-HEIGHTS|GRANT CITY", neighborhood_upper) ~ "New Dorp-Midland Beach",
grepl("ROSEBANK", neighborhood_upper) ~ "Rosebank-Shore Acres-Park Hill",
grepl("MARINERS HARBOR|PORT IVORY", neighborhood_upper) ~ "Mariner's Harbor-Arlington-Graniteville",
grepl("NEW BRIGHTON", neighborhood_upper) ~ "St. George-New Brighton",
grepl("FRESH KILLS", neighborhood_upper) ~ "Freshkills Park (South)",
TRUE ~ "Other" # Catches any names that don't fit the rules above
)
)
other_rows <- si_sales_mapped[si_sales_mapped$nta_name == "Other", ]
nrow(other_rows)
## [1] 0
si_sales_mapped$land_square_feet <- as.numeric(si_sales_mapped$land_square_feet)
si_sales_mapped <- si_sales_mapped %>%
mutate(year_built = na_if(year_built, 0))
summary(si_sales_mapped)
## year neighborhood zip_code residential_units
## Min. :2017 Length:40690 Min. : 0 Min. :0.000
## 1st Qu.:2018 Class :character 1st Qu.:10305 1st Qu.:1.000
## Median :2021 Mode :character Median :10308 Median :1.000
## Mean :2021 Mean :10306 Mean :1.285
## 3rd Qu.:2023 3rd Qu.:10312 3rd Qu.:2.000
## Max. :2025 Max. :10314 Max. :4.000
## NA's :2
## year_built sale_price land_square_feet neighborhood_upper
## Min. :1835 Min. : 11500 Min. : 0 Length:40690
## 1st Qu.:1950 1st Qu.: 480000 1st Qu.: 2400 Class :character
## Median :1975 Median : 610000 Median : 3500 Mode :character
## Mean :1969 Mean : 651616 Mean : 3977
## 3rd Qu.:1995 3rd Qu.: 765000 3rd Qu.: 4700
## Max. :2025 Max. :17000000 Max. :88945
## NA's :134 NA's :2
## nta_name
## Length:40690
## Class :character
## Mode :character
##
##
##
##
## # A tibble: 6 × 9
## year neighborhood zip_code residential_units year_built sale_price
## <dbl> <chr> <dbl> <dbl> <dbl> <dbl>
## 1 2017 ANNADALE 10312 1 1980 866000
## 2 2017 ANNADALE 10312 1 1975 735000
## 3 2017 ANNADALE 10312 1 1920 350000
## 4 2017 ANNADALE 10312 1 1986 475000
## 5 2017 ANNADALE 10312 1 1986 437500
## 6 2017 ANNADALE 10312 1 1986 354900
## # ℹ 3 more variables: land_square_feet <dbl>, neighborhood_upper <chr>,
## # nta_name <chr>
## tibble [40,690 × 9] (S3: tbl_df/tbl/data.frame)
## $ year : num [1:40690] 2017 2017 2017 2017 2017 ...
## $ neighborhood : chr [1:40690] "ANNADALE" "ANNADALE" "ANNADALE" "ANNADALE" ...
## $ zip_code : num [1:40690] 10312 10312 10312 10312 10312 ...
## $ residential_units : num [1:40690] 1 1 1 1 1 1 1 1 1 1 ...
## $ year_built : num [1:40690] 1980 1975 1920 1986 1986 ...
## $ sale_price : num [1:40690] 866000 735000 350000 475000 437500 ...
## $ land_square_feet : num [1:40690] 7258 6242 5000 1546 1546 ...
## $ neighborhood_upper: chr [1:40690] "ANNADALE" "ANNADALE" "ANNADALE" "ANNADALE" ...
## $ nta_name : chr [1:40690] "Annadale-Huguenot-Prince's Bay-Woodrow" "Annadale-Huguenot-Prince's Bay-Woodrow" "Annadale-Huguenot-Prince's Bay-Woodrow" "Annadale-Huguenot-Prince's Bay-Woodrow" ...
neighborhood_prices <- si_sales_mapped %>%
filter(nta_name != "Other") %>%
group_by(nta_name) %>%
summarise(overall_avg_price = mean(sale_price, na.rm = TRUE))
si_map_prices <- left_join(si_map, neighborhood_prices, by = c("NTAName" = "nta_name"))
# overall avg price per neighborhood
get_clean_label <- function(name_vec) {
sapply(name_vec, function(name) {
if (is.na(name)) return(NA)
strsplit(name, "-")[[1]][1] |> trimws()
})
}
si_map_prices <- si_map_prices %>%
mutate(
label_name = get_clean_label(NTAName),
centroid = st_centroid(geometry),
label_color = if_else(overall_avg_price > 900000, "white", "black")
)
heatmap<- ggplot(data = si_map_prices) +
geom_sf(aes(fill = overall_avg_price), color = "white", size = 0.3) +
geom_sf_text(
aes(geometry = centroid, label = label_name, color = label_color),
size = 2,
fontface = "bold",
alpha = 0.9
) +
scale_color_identity() +
scale_fill_viridis_c(
option = "mako",
direction = -1,
labels = scales::dollar_format(),
name = "Avg. Sale Price"
) +
labs(
title = "Where Are Homes Most Expensive in Staten Island?",
subtitle = "Southern neighborhoods consistently show higher prices (2017–2025)",
caption = "Labels shortened for readability"
) +
theme_void(base_size = 14) +
theme(
plot.title = element_text(face = "bold", size = 20, hjust = 0.5),
plot.subtitle = element_text(size = 14, hjust = 0.5),
plot.caption = element_text(size = 10, color = "gray40", hjust = 0.5),
legend.position = "right"
)
heatmap

num_vars <- sapply(si_sales_mapped, is.numeric)
num_data <- si_sales_mapped[ , num_vars]
n_num <- ncol(num_data)
par(mfrow = c(2, 3))
for (colname in names(num_data)) {
hist(num_data[[colname]],
main = paste("Histogram of", colname),
xlab = colname,
col = "purple",
breaks = 30)
}

# boxplots
par(mfrow = c(2, 3))
for (colname in names(num_data)) {
boxplot(num_data[[colname]],
horizontal = TRUE,
main = paste("Boxplot of", colname),
xlab = colname,
col = "orange")
}

hist(num_data$sale_price)
boxplot(num_data$sale_price)

heat_data <- si_sales_mapped %>%
mutate(
year_bin = cut(
year_built,
breaks = seq(1830, 2030, by = 10),
labels = paste0(seq(1830, 2020, by = 10), "-", seq(1840, 2030, by = 10))
)
) %>%
group_by(nta_name, year_bin) %>%
summarise(avg_price = mean(sale_price, na.rm = TRUE), .groups = "drop")%>%
drop_na(year_bin)
nta_order <- heat_data %>%
group_by(nta_name) %>%
summarise(avg = mean(avg_price, na.rm = TRUE)) %>%
arrange(desc(avg)) %>%
pull(nta_name)
ggplot(heat_data, aes(x = year_bin, y = factor(nta_name, levels = nta_order), fill = avg_price)) +
geom_tile(color = "white") +
scale_fill_viridis_c(option = "plasma", labels = scales::dollar) +
labs(
x = "Decade Built",
y = "Neighborhood (NTA)",
fill = "Average Price",
title = "Average Sale Price by NTA and Decade Built"
) +
theme_minimal() +
theme(
axis.text.x = element_text(angle = 45,size = 8, hjust = 1),
axis.text.y = element_text(size = 6)
)

CPI
cpi_all_items_raw <- read_csv("https://raw.githubusercontent.com/NikoletaEm/Capstone/refs/heads/main/DataUsed/CPI_allcostumer.csv")
colnames(cpi_all_items_raw) <- c("date", "cpi_value")
cpi2025 <- read_csv("https://raw.githubusercontent.com/NikoletaEm/Capstone/refs/heads/main/DataUsed/cpi_2025.csv")
colnames(cpi2025) <- c("date", "cpi_value")
cpi_all_items_raw <- bind_rows(cpi_all_items_raw, cpi2025)
head(cpi_all_items_raw)
## # A tibble: 6 × 2
## date cpi_value
## <date> <dbl>
## 1 2017-01-01 267.
## 2 2017-02-01 268.
## 3 2017-03-01 268.
## 4 2017-04-01 268.
## 5 2017-05-01 268.
## 6 2017-06-01 269.
cpi_all_items <- cpi_all_items_raw %>%
mutate(
date = as.Date(date),
year = as.numeric(format(date, "%Y"))
) %>%
group_by(year) %>%
summarise(all_items_annual_avg_cpi = mean(cpi_value, na.rm = TRUE)) %>%
mutate(
previous_year_cpi = lag(all_items_annual_avg_cpi),
all_items_inflation_pct = ((all_items_annual_avg_cpi - previous_year_cpi) / previous_year_cpi) * 100,
all_items_inflation_pct = ifelse(year == 2017, 2.1, all_items_inflation_pct)
) %>%
select(year, all_items_annual_avg_cpi, all_items_inflation_pct)
cpi_vs_home <- si_yearly_median %>%
left_join(cpi_all_items, by = "year")
p_home <- ggplot(cpi_vs_home, aes(x = year, y = median_sale_price)) +
geom_line(color = "darkblue", linewidth = 1.2) +
geom_point(color = "darkblue", size = 3) +
scale_y_continuous(labels = scales::dollar_format()) +
labs(
title = "Staten Island Median Home Prices",
x = "Year",
y = "Median Sale Price (USD)"
) +
theme_minimal(base_size = 15) +
theme(
plot.title = element_text(face = "bold", hjust = 0.5),
axis.title = element_text(face = "bold")
)
p_cpi <- ggplot(cpi_vs_home, aes(x = year, y = all_items_annual_avg_cpi)) +
geom_line(color = "firebrick", linewidth = 1.2, linetype = "dashed") +
geom_point(color = "firebrick", size = 3) +
labs(
title = "Consumer Price Index (All Items)",
x = "Year",
y = "CPI Index"
) +
theme_minimal(base_size = 15) +
theme(
plot.title = element_text(face = "bold", hjust = 0.5),
axis.title = element_text(face = "bold")
)
p_home + p_cpi +
plot_annotation(
title = "Housing Prices and Inflation Trends (2017–2025)",
theme = theme(
plot.title = element_text(size = 18, face = "bold", hjust = 0.5)
)
)
## Warning: annotation$theme is not a valid theme.
## Please use `theme()` to construct themes.

mortgage_rates_raw <- read.csv("https://raw.githubusercontent.com/NikoletaEm/Capstone/refs/heads/main/DataUsed/mortgage_rt.csv")
colnames(mortgage_rates_raw) <- c("date", "mortgage_rt")
mort2025<- read.csv("https://raw.githubusercontent.com/NikoletaEm/Capstone/refs/heads/main/DataUsed/mortgage_rt_2025.csv")
colnames(mort2025) <- c("date", "mortgage_rt")
mortgage_rates_raw <- bind_rows(mortgage_rates_raw, mort2025)
head(mortgage_rates_raw)
## date mortgage_rt
## 1 2017-01-05 4.20
## 2 2017-01-12 4.12
## 3 2017-01-19 4.09
## 4 2017-01-26 4.19
## 5 2017-02-02 4.19
## 6 2017-02-09 4.17
Mortgage rate
mortgage_rates_yearly <- mortgage_rates_raw %>%
mutate(
date = as.Date(date),
year = year(date)
) %>%
group_by(year) %>%
summarise(
avg_mortgage_rate = mean(mortgage_rt, na.rm = TRUE))
print(mortgage_rates_yearly)
## # A tibble: 9 × 2
## year avg_mortgage_rate
## <dbl> <dbl>
## 1 2017 3.99
## 2 2018 4.54
## 3 2019 3.94
## 4 2020 3.11
## 5 2021 2.96
## 6 2022 5.34
## 7 2023 6.81
## 8 2024 6.72
## 9 2025 5.87
economic_factors <- full_join(cpi_all_items, mortgage_rates_yearly, by = "year")
## Normalization to index for cpi
economic_factors <- economic_factors %>%
arrange(year) %>%
mutate(
all_items_annual_avg_cpi =
(all_items_annual_avg_cpi / first(all_items_annual_avg_cpi)) * 100 ) %>%
select(-all_items_inflation_pct)
print(economic_factors)
## # A tibble: 9 × 3
## year all_items_annual_avg_cpi avg_mortgage_rate
## <dbl> <dbl> <dbl>
## 1 2017 100 3.99
## 2 2018 102. 4.54
## 3 2019 104. 3.94
## 4 2020 105. 3.11
## 5 2021 109. 2.96
## 6 2022 116. 5.34
## 7 2023 120. 6.81
## 8 2024 124. 6.72
## 9 2025 128. 5.87
comparison_data <- full_join(si_yearly_median, cpi_all_items, by = "year") %>%
left_join(mortgage_rates_yearly, by = "year") %>%
mutate(price_bar = 0.7)
ggplot(comparison_data, aes(x = as.factor(year))) +
geom_col(aes(y = price_bar, fill = "Median Sale Price"),
alpha = 0.6, width = 0.6) +
geom_text(
aes(y = price_bar,
label = scales::dollar(median_sale_price)),
vjust = -0.6,
size = 3.5,
fontface = "bold"
) +
geom_line(aes(y = all_items_inflation_pct,
color = "Overall Inflation (CPI)"),
linewidth = 1.3, group = 1) +
geom_point(aes(y = all_items_inflation_pct,
color = "Overall Inflation (CPI)"),
size = 3) +
geom_line(aes(y = avg_mortgage_rate,
color = "30-Year Mortgage Rate"),
linewidth = 1.3, linetype = "dashed", group = 1) +
geom_point(aes(y = avg_mortgage_rate,
color = "30-Year Mortgage Rate"),
size = 3) +
scale_y_continuous(
name = "Inflation & Mortgage Rates (%)",
expand = expansion(mult = c(0.05, 0.25))
) +
scale_fill_manual(
values = c("Median Sale Price" = "#FF8C66"),
name = ""
) +
scale_color_manual(
values = c(
"Overall Inflation (CPI)" = "#1F78B4",
"30-Year Mortgage Rate" = "#33A02C"
),
name = ""
) +
labs(
title = "Housing Prices (Labeled) with Inflation and Mortgage Rate Trends",
subtitle = "Bars show annual median prices as labels-lines show macroeconomic rates",
x = "Year"
) +
theme_minimal(base_size = 14) +
theme(
legend.position = "bottom",
plot.title = element_text(face = "bold")
)

Population growth/decay of NYC and SI
ny_population <- read_csv("https://raw.githubusercontent.com/NikoletaEm/Capstone/refs/heads/main/DataUsed/nypopulation_fred.csv")
## Rows: 8 Columns: 2
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (1): observation_date
## dbl (1): NYPOP
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
si_population <- read_csv("https://raw.githubusercontent.com/NikoletaEm/Capstone/refs/heads/main/DataUsed/si_population.csv")
## Rows: 9 Columns: 2
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## dbl (1): year
## num (1): population
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
new_row <- tibble(
year = 2025,
population = 489345
)
si_population <- bind_rows(si_population, new_row)
## population in 2025 is estimated
print(ny_population)
## # A tibble: 8 × 2
## observation_date NYPOP
## <chr> <dbl>
## 1 1/1/2017 19594.
## 2 1/1/2018 19544.
## 3 1/1/2019 19463.
## 4 1/1/2020 20105.
## 5 1/1/2021 19848.
## 6 1/1/2022 19704.
## 7 1/1/2023 19737.
## 8 1/1/2024 19867.
## # A tibble: 10 × 2
## year population
## <dbl> <dbl>
## 1 2016 473324
## 2 2017 475948
## 3 2018 474101
## 4 2019 474893
## 5 2020 475596
## 6 2021 493194
## 7 2022 492925
## 8 2023 491734
## 9 2024 498212
## 10 2025 489345
Population of ntas
nta_pop <- data.frame(
nta_name = c(
"St. George-New Brighton",
"Tompkinsville-Stapleton-Clifton-Fox Hills",
"Rosebank-Shore Acres-Park Hill",
"West New Brighton-Silver Lake-Grymes Hill",
"Westerleigh-Castleton Corners",
"Port Richmond",
"Mariner's Harbor-Arlington-Graniteville",
"Grasmere-Arrochar-South Beach-Dongan Hills",
"New Dorp-Midland Beach",
"Todt Hill-Emerson Hill-Lighthouse Hill-Manor Heights",
"New Springville-Willowbrook-Bulls Head-Travis",
"Oakwood-Richmondtown",
"Great Kills-Eltingville",
"Arden Heights-Rossville",
"Annadale-Huguenot-Prince's Bay-Woodrow",
"Tottenville-Charleston"
),
population = c(
20225,
17596,
22705,
35491,
31582,
20896,
32812,
36672,
29303,
33699,
42458,
20418,
57572,
31491,
42360,
16845
)
)
## SI
si_population <- si_population %>%
rename(
si_population = population
)
si_population <- si_population %>%
mutate(
yoy_growth_percentage_si = ((si_population - lag(si_population)) / lag(si_population)) * 100
) %>%
filter(year > 2016)
print(si_population)
## # A tibble: 9 × 3
## year si_population yoy_growth_percentage_si
## <dbl> <dbl> <dbl>
## 1 2017 475948 0.554
## 2 2018 474101 -0.388
## 3 2019 474893 0.167
## 4 2020 475596 0.148
## 5 2021 493194 3.70
## 6 2022 492925 -0.0545
## 7 2023 491734 -0.242
## 8 2024 498212 1.32
## 9 2025 489345 -1.78
Crime data
crime_data <- read.csv("https://raw.githubusercontent.com/NikoletaEm/Capstone/refs/heads/main/DataUsed/NYPD_Arrests_Data__Historic__20250927.csv")
crime_points <- st_as_sf(crime_data,
coords = c("Longitude", "Latitude"),
crs = 4326)
crime_points <- st_transform(crime_points, st_crs(si_map))
crime_with_nta <- st_join(crime_points, si_map, join = st_within)
crime_clean <- crime_with_nta %>%
mutate(
year = year(mdy(ARREST_DATE))
) %>%
select(
year,
OFNS_DESC,
NTAName
)
head(crime_clean)
## Simple feature collection with 6 features and 3 fields
## Geometry type: POINT
## Dimension: XY
## Bounding box: xmin: 937288 ymin: 151474.4 xmax: 966060 ymax: 172421.8
## Projected CRS: NAD83 / New York Long Island (ftUS)
## year OFNS_DESC NTAName
## 1 2024 VEHICLE AND TRAFFIC LAWS Rosebank-Shore Acres-Park Hill
## 2 2024 PETIT LARCENY Freshkills Park (North)
## 3 2024 ASSAULT 3 & RELATED OFFENSES Port Richmond
## 4 2024 FORGERY St. George-New Brighton
## 5 2024 MISCELLANEOUS PENAL LAW Mariner's Harbor-Arlington-Graniteville
## 6 2024 MISCELLANEOUS PENAL LAW Mariner's Harbor-Arlington-Graniteville
## geometry
## 1 POINT (966060 158870)
## 2 POINT (937288 151474.4)
## 3 POINT (947631.5 172421.8)
## 4 POINT (961010 170904)
## 5 POINT (943691.2 170168.5)
## 6 POINT (943691.2 170168.5)
## Classes 'sf' and 'data.frame': 73637 obs. of 4 variables:
## $ year : num 2024 2024 2024 2024 2024 ...
## $ OFNS_DESC: chr "VEHICLE AND TRAFFIC LAWS" "PETIT LARCENY" "ASSAULT 3 & RELATED OFFENSES" "FORGERY" ...
## $ NTAName : chr "Rosebank-Shore Acres-Park Hill" "Freshkills Park (North)" "Port Richmond" "St. George-New Brighton" ...
## $ geometry :sfc_POINT of length 73637; first list element: 'XY' num 966060 158870
## - attr(*, "sf_column")= chr "geometry"
## - attr(*, "agr")= Factor w/ 3 levels "constant","aggregate",..: NA NA NA
## ..- attr(*, "names")= chr [1:3] "year" "OFNS_DESC" "NTAName"
crime_data_2025 <- read.csv("https://raw.githubusercontent.com/NikoletaEm/Capstone/refs/heads/main/DataUsed/crimedata_2025.csv")
crime_points_25 <- st_as_sf(crime_data_2025,
coords = c("Longitude", "Latitude"),
crs = 4326)
crime_points_25 <- st_transform(crime_points_25, st_crs(si_map))
crime_with_nta_25 <- st_join(crime_points_25, si_map, join = st_within)
crime_clean_25 <- crime_with_nta_25 %>%
mutate(
year = year(mdy(CMPLNT_FR_DT))
) %>%
select(
year,
OFNS_DESC,
NTAName
)
head(crime_clean_25)
## Simple feature collection with 6 features and 3 fields
## Geometry type: POINT
## Dimension: XY
## Bounding box: xmin: 923772 ymin: -2280005 xmax: 31130730 ymax: 170765.5
## Projected CRS: NAD83 / New York Long Island (ftUS)
## year OFNS_DESC
## 1 2025 THEFT-FRAUD
## 2 2025 DANGEROUS WEAPONS
## 3 2025 CRIMINAL MISCHIEF & RELATED OF
## 4 2025 ROBBERY
## 5 2025 MISCELLANEOUS PENAL LAW
## 6 2025 HARRASSMENT 2
## NTAName
## 1 <NA>
## 2 Annadale-Huguenot-Prince's Bay-Woodrow
## 3 Mariner's Harbor-Arlington-Graniteville
## 4 Todt Hill-Emerson Hill-Lighthouse Hill-Manor Heights
## 5 Arden Heights-Rossville
## 6 Annadale-Huguenot-Prince's Bay-Woodrow
## geometry
## 1 POINT (31130727 -2280005)
## 2 POINT (923772 129133)
## 3 POINT (939222.3 170765.5)
## 4 POINT (951842.8 159589)
## 5 POINT (927979 137152.2)
## 6 POINT (932667.8 129951.1)
crime_clean_25<-crime_clean_25 %>%
drop_na()
crime_clean <- bind_rows(crime_clean, crime_clean_25)
crime_clean
## Simple feature collection with 85521 features and 3 fields
## Geometry type: POINT
## Dimension: XY
## Bounding box: xmin: 913511.3 ymin: 121134 xmax: 968884 ymax: 175494
## Projected CRS: NAD83 / New York Long Island (ftUS)
## First 10 features:
## year OFNS_DESC NTAName
## 1 2024 VEHICLE AND TRAFFIC LAWS Rosebank-Shore Acres-Park Hill
## 2 2024 PETIT LARCENY Freshkills Park (North)
## 3 2024 ASSAULT 3 & RELATED OFFENSES Port Richmond
## 4 2024 FORGERY St. George-New Brighton
## 5 2024 MISCELLANEOUS PENAL LAW Mariner's Harbor-Arlington-Graniteville
## 6 2024 MISCELLANEOUS PENAL LAW Mariner's Harbor-Arlington-Graniteville
## 7 2024 GRAND LARCENY OF MOTOR VEHICLE Rosebank-Shore Acres-Park Hill
## 8 2024 ROBBERY Mariner's Harbor-Arlington-Graniteville
## 9 2024 FELONY ASSAULT St. George-New Brighton
## 10 2024 MISCELLANEOUS PENAL LAW St. George-New Brighton
## geometry
## 1 POINT (966060 158870)
## 2 POINT (937288 151474.4)
## 3 POINT (947631.5 172421.8)
## 4 POINT (961010 170904)
## 5 POINT (943691.2 170168.5)
## 6 POINT (943691.2 170168.5)
## 7 POINT (964889 164799)
## 8 POINT (940598.5 167435.6)
## 9 POINT (962808.2 174278.5)
## 10 POINT (962808.2 174278.5)
Crime rate per nta
crime_counts_by_nta_year <- crime_clean %>%
group_by(year, NTAName) %>%
summarise(total_incidents = n(), .groups = "drop") %>%
as.data.frame()
si_map_counts <- si_map %>%
left_join(crime_counts_by_nta_year, by = c("NTAName" = "NTAName"))%>%
filter(!is.na(total_incidents))
## Warning in sf_column %in% names(g): Detected an unexpected many-to-many relationship between `x` and `y`.
## ℹ Row 1 of `x` matches multiple rows in `y`.
## ℹ Row 14 of `y` matches multiple rows in `x`.
## ℹ If a many-to-many relationship is expected, set `relationship =
## "many-to-many"` to silence this warning.
crime_totals_by_nta <- crime_counts_by_nta_year %>%
group_by(NTAName) %>%
summarise(total_incidents_all_years = sum(total_incidents, na.rm = TRUE), .groups = "drop")
# crime rate per 1,000 residents
nta_pop <- nta_pop %>%
rename(NTAName = nta_name)
nta_pop <- nta_pop %>%
mutate(population = as.numeric(gsub(",", "", population)))
crime_with_pop <- crime_totals_by_nta %>%
left_join(nta_pop, by = "NTAName") %>%
mutate(crime_rate_per_1000 = (total_incidents_all_years / population) * 1000)
crime_with_pop <- crime_with_pop %>%
drop_na()
crime_with_pop <- crime_with_pop %>%
rename(nta_name = NTAName)
head(crime_with_pop)
## # A tibble: 6 × 4
## nta_name total_incidents_all_…¹ population crime_rate_per_1000
## <chr> <int> <dbl> <dbl>
## 1 Annadale-Huguenot-Princ… 2537 42360 59.9
## 2 Arden Heights-Rossville 1494 31491 47.4
## 3 Grasmere-Arrochar-South… 3909 36672 107.
## 4 Great Kills-Eltingville 2717 57572 47.2
## 5 Mariner's Harbor-Arling… 12250 32812 373.
## 6 New Dorp-Midland Beach 6719 29303 229.
## # ℹ abbreviated name: ¹total_incidents_all_years
#### Higher numbers = relatively more crime incidents per resident (higher crime density).
#### Lower numbers = relatively fewer incidents per resident (lower crime density).
ggplot(si_map_counts) +
geom_sf(aes(fill = total_incidents), color = "white", size = 0.2) +
scale_fill_viridis_c(option = "viridis") +
facet_wrap(~year, ncol = 3) +
labs(
title = "Crime Incidents per NTA",
fill = "Incidents"
) +
theme_void(base_size = 19) +
theme(
legend.position = "bottom",
plot.title = element_text(hjust = 0.5, face = "bold", size = 12),
strip.text = element_text(size = 10),
legend.text = element_text(size = 6),
legend.title = element_text(size = 8, face = "bold")
)

top_10_nta <- crime_counts_by_nta_year %>%
group_by(NTAName) %>%
summarise(total_incidents_all_years = sum(total_incidents)) %>%
slice_max(order_by = total_incidents_all_years, n = 10)
top_10_data <- crime_counts_by_nta_year %>%
filter(NTAName %in% top_10_nta$NTAName)
ggplot(top_10_data, aes(x = reorder(NTAName, total_incidents), y = total_incidents, fill = NTAName)) +
geom_bar(stat = "identity") +
coord_flip() +
facet_wrap(~year, ncol = 3) +
labs(
title = "Top 10 Neighborhoods by Crime Incidents",
x = "Neighborhood (NTA)",
y = "Total Incidents"
) +
theme(
legend.position = "none",
axis.text.y = element_text(size = 7.2)
)

Average house price per year and nta
# Average house price per year and nta
avg_nta_house_prices <- si_sales_mapped %>%
dplyr::group_by(year, nta_name) %>%
summarise(
avg_sale_price = mean(sale_price, na.rm = TRUE),
.groups = "drop" )
print(avg_nta_house_prices)
## # A tibble: 146 × 3
## year nta_name avg_sale_price
## <dbl> <chr> <dbl>
## 1 2017 Annadale-Huguenot-Prince's Bay-Woodrow 672129.
## 2 2017 Arden Heights-Rossville 480176.
## 3 2017 Freshkills Park (South) 328536
## 4 2017 Grasmere-Arrochar-South Beach-Dongan Hills 584653.
## 5 2017 Great Kills-Eltingville 534381.
## 6 2017 Mariner's Harbor-Arlington-Graniteville 349458.
## 7 2017 New Dorp-Midland Beach 498886.
## 8 2017 New Springville-Willowbrook-Bulls Head-Travis 548967.
## 9 2017 Oakwood-Richmondtown 625325.
## 10 2017 Port Richmond 364958.
## # ℹ 136 more rows
si_map_prices2 <- si_map %>%
left_join(avg_nta_house_prices, by = c("NTAName" = "nta_name"))
## Warning in sf_column %in% names(g): Detected an unexpected many-to-many relationship between `x` and `y`.
## ℹ Row 1 of `x` matches multiple rows in `y`.
## ℹ Row 12 of `y` matches multiple rows in `x`.
## ℹ If a many-to-many relationship is expected, set `relationship =
## "many-to-many"` to silence this warning.
years <- 2017:2025
all_year_plots <- list()
for (yr in years) {
map_prices_year <- si_map_prices2 %>% filter(year == yr)
map_incidents_year <- si_map_counts %>% filter(year == yr)
p1 <- ggplot(map_prices_year) +
geom_sf(aes(fill = avg_sale_price), color = "white", size = 0.3) +
scale_fill_viridis_c(
option = "magma",
na.value = "grey80",
labels = scales::label_dollar(scale = 1e-3, suffix = "K"),
breaks = c(400000, 600000, 800000, 1000000)
) +
labs(title = paste("Average House Price by NTA (", yr, ")", sep = ""),
fill = "Avg Price") +
theme_void() +
theme(
legend.position = "bottom",
legend.key.height = unit(0.3, "cm"),
legend.key.width = unit(1.2, "cm"),
legend.title = element_text(size = 9),
legend.text = element_text(size = 8)
)
p2 <- ggplot(map_incidents_year) +
geom_sf(aes(fill = total_incidents), color = "white", size = 0.3) +
scale_fill_viridis_c(
option = "viridis",
trans = "log",
na.value = "grey80",
labels = scales::label_comma(),
breaks = c(1, 10, 50, 200, 500, 1000)
) +
labs(title = paste("Incidents by NTA (", yr, ")", sep = ""),
fill = "Incidents") +
theme_void() +
theme(
legend.position = "bottom",
legend.key.height = unit(0.3, "cm"),
legend.key.width = unit(1.2, "cm"),
legend.title = element_text(size = 9),
legend.text = element_text(size = 8)
)
combined <- p1 + p2
all_year_plots[[as.character(yr)]] <- combined
}
big_combined_plot <- wrap_plots(all_year_plots, ncol = 2)
Schools
schools <- read_csv("https://raw.githubusercontent.com/NikoletaEm/Capstone/refs/heads/main/DataUsed/staten_island_schoolss.csv")
## Rows: 79 Columns: 3
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (2): School Name, Nta_name
## dbl (1): Ranking
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
merged_data <- schools %>%
left_join(neighborhood_prices, by = c("Nta_name" = "nta_name"))
ggplot(merged_data, aes(x = overall_avg_price, y = Ranking)) +
geom_point() +
geom_smooth(method = "lm", se = FALSE) +
labs(x = "Average Housing Price", y = "School Ranking (1–10)",
title = "Relationship between Housing Prices and School Rankings")
## `geom_smooth()` using formula = 'y ~ x'

Average school ranking per nta
avg_school_rankings_by_nta <- schools %>%
group_by(Nta_name) %>%
summarise(avg_school_ranking = mean(Ranking, na.rm = TRUE), .groups = "drop")
avg_school_rankings_by_nta <- avg_school_rankings_by_nta %>%
rename(nta_name = Nta_name)
avg_school_rankings_by_nta
## # A tibble: 16 × 2
## nta_name avg_school_ranking
## <chr> <dbl>
## 1 Annadale-Huguenot-Prince's Bay-Woodrow 9.17
## 2 Arden Heights-Rossville 8.33
## 3 Grasmere-Arrochar-South Beach-Dongan Hills 6
## 4 Great Kills-Eltingville 9.17
## 5 Mariner's Harbor-Arlington-Graniteville 3.33
## 6 New Dorp-Midland Beach 6.5
## 7 New Springville-Willowbrook-Bulls Head-Travis 6.54
## 8 Oakwood-Richmondtown 10
## 9 Port Richmond 3.5
## 10 Rosebank-Shore Acres-Park Hill 4.33
## 11 St. George-New Brighton 2.43
## 12 Todt Hill-Emerson Hill-Lighthouse Hill-Manor Heights 6
## 13 Tompkinsville-Stapleton-Clifton-Fox Hills 3.25
## 14 Tottenville-Charleston 8.67
## 15 West New Brighton-Silver Lake-Grymes Hill 4.6
## 16 Westerleigh-Castleton Corners 7
schools <- schools %>%
rename(nta_name = Nta_name)
totalincidents_per_nta <- crime_clean %>%
group_by(NTAName) %>%
summarise(total_incidents = n())
crime_housingprice_schools <- merged_data %>%
left_join(totalincidents_per_nta, by = c("Nta_name" = "NTAName"))
cor(crime_housingprice_schools$overall_avg_price, crime_housingprice_schools$Ranking, use = "complete.obs")
## [1] 0.5065811
cor(crime_housingprice_schools$overall_avg_price, crime_housingprice_schools$total_incidents, use = "complete.obs")
## [1] -0.5747773
crime2 <- ggplot(
crime_housingprice_schools,
aes(
x = Ranking,
y = overall_avg_price,
color = total_incidents
)
) +
geom_point(size = 4, alpha = 0.85) +
scale_color_viridis_c(option = "plasma") +
labs(
x = "School Ranking",
y = "Average Housing Price",
color = "Crime Incidents",
title = "Joint Influence of School Quality and Crime on Housing Prices",
subtitle = "Average housing prices by school ranking, colored by crime incidents"
) +
theme_minimal(base_size = 16) +
theme(
plot.title = element_text(
size = 15,
face = "bold",
hjust = 0.5
),
plot.subtitle = element_text(
size = 10,
hjust = 0.5
),
axis.title = element_text(
size = 10,
face = "bold"
),
axis.text = element_text(
size = 10
),
legend.title = element_text(
size = 10,
face = "bold"
),
legend.text = element_text(
size = 10
),
legend.position = "right"
)
crime2

## Do neighborhoods with higher-ranked schools tend to have higher housing prices?
crime_housingprice_schools <- crime_housingprice_schools %>%
sf::st_drop_geometry() %>%
mutate(
rank_group = ifelse(Ranking >= 8, "High-Rank Schools", "Low-Rank Schools")
)
price_by_rank <- crime_housingprice_schools %>%
group_by(rank_group) %>%
summarise(
overall_avg_price = mean(overall_avg_price, na.rm = TRUE),
n = n()
)
# Plot 1: Boxplot
p1 <- ggplot(crime_housingprice_schools,
aes(x = rank_group, y = overall_avg_price, fill = rank_group)) +
geom_boxplot(alpha = 0.75) +
scale_fill_brewer(palette = "Set2") +
labs(
x = "School Rank Group",
y = "Average Housing Price (USD)",
title = "Do Higher-Ranked Schools Correspond to Higher Housing Prices?",
subtitle = "Distribution of Housing Prices by School Rank"
) +
theme_minimal() +
theme(legend.position = "none")
# Plot 2: Bar Chart
p2 <- ggplot(price_by_rank,
aes(x = rank_group, y = overall_avg_price, fill = rank_group)) +
geom_col(alpha = 0.75) +
geom_text(aes(label = scales::dollar(overall_avg_price)),
vjust = -0.5, size = 4.2) +
scale_fill_brewer(palette = "Set2") +
labs(
x = "School Rank Group",
y = "Mean Housing Price (USD)",
title = "Do Higher-Ranked Schools Correspond to Higher Housing Prices?",
subtitle ="Mean Housing Price by School Rank Group"
) +
theme_minimal() +
theme(legend.position = "none")
plt <- p1 + p2
plt

DOB Permits
# The original Department of Buildings (DOB) permits file is really large and
# unnecessary for reproducing the results of this project.
# To keep the repository lightweight and fully reproducible, the
# analysis directly loads a cleaned permits dataset from GitHub.
#
# The commented code below documents the full cleaning process used
# to generate the cleaned permits file, including variable selection,
# date parsing and filtering to the 2017–2025 study period.
# permits_raw <- read.csv("DOB_Permits_Staten_Island_20251006 (1).csv")
# permits_clean <- permits_raw %>%
# rename(
# permit_issued_date = Issuance.Date,
# lon = LONGITUDE,
# lat = LATITUDE
# ) %>%
# select(permit_issued_date, lon, lat) %>%
# filter(!is.na(lon) & !is.na(lat))
# permits_clean <- permits_clean %>%
# mutate(
# permit_issued_date = mdy(permit_issued_date),
# year = year(permit_issued_date)
# ) %>%
# filter(year >= 2017, year <= 2025)
# write.csv(
# permits_clean,
# "permits_clean_staten_island_2017_2025.csv",
# row.names = FALSE
# )
# - - - - - - - - - - - - - - - - - - - - -
permits_clean <- read.csv(
"https://raw.githubusercontent.com/NikoletaEm/Capstone/refs/heads/main/DataUsed/permits_clean_staten_island_2017_2025.csv"
)
permits_sf <- st_as_sf(
permits_clean,
coords = c("lon", "lat"),
crs = 4326, # WGS84 (latitude/longitude)
remove = FALSE
)
si_map <- st_transform(si_map, 4326)
permits_with_nta <- st_join(permits_sf, si_map, join = st_within)
permits_summary <- permits_with_nta %>%
st_drop_geometry() %>%
group_by(year, NTAName) %>%
rename(nta_name=NTAName)%>%
summarise(number_of_permits = n(), .groups = "drop")
head(permits_summary)
## # A tibble: 6 × 3
## year nta_name number_of_permits
## <int> <chr> <int>
## 1 2017 Annadale-Huguenot-Prince's Bay-Woodrow 789
## 2 2017 Arden Heights-Rossville 175
## 3 2017 Freshkills Park (South) 2
## 4 2017 Grasmere-Arrochar-South Beach-Dongan Hills 409
## 5 2017 Great Kills-Eltingville 481
## 6 2017 Mariner's Harbor-Arlington-Graniteville 237
## year nta_name number_of_permits
## Min. :2017 Length:145 Min. : 1.0
## 1st Qu.:2019 Class :character 1st Qu.: 68.0
## Median :2021 Mode :character Median :152.0
## Mean :2021 Mean :180.2
## 3rd Qu.:2023 3rd Qu.:249.0
## Max. :2025 Max. :945.0
## tibble [145 × 3] (S3: tbl_df/tbl/data.frame)
## $ year : int [1:145] 2017 2017 2017 2017 2017 2017 2017 2017 2017 2017 ...
## $ nta_name : chr [1:145] "Annadale-Huguenot-Prince's Bay-Woodrow" "Arden Heights-Rossville" "Freshkills Park (South)" "Grasmere-Arrochar-South Beach-Dongan Hills" ...
## $ number_of_permits: int [1:145] 789 175 2 409 481 237 945 311 217 253 ...
Preprocessing
si_sales_mapped <- si_sales_mapped %>%
drop_na()
character_cols <- names(si_sales_mapped)[sapply(si_sales_mapped, is.character)]
si_sales_mapped <- si_sales_mapped %>%
mutate(across(all_of(character_cols), as.factor))
si_sales_with_features <- si_sales_mapped %>%
left_join(crime_with_pop %>% select(nta_name, crime_rate_per_1000), by = "nta_name") %>%
left_join(avg_school_rankings_by_nta, by = "nta_name") %>%
left_join(permits_summary, by = c("nta_name", "year"))
si_sales_with_features$building_age <- si_sales_with_features$year - si_sales_with_features$year_built
si_sales_with_features <- si_sales_with_features %>%
mutate(
land_square_feet_log = log1p(land_square_feet)
) %>%
select(-neighborhood_upper) %>%
select(-land_square_feet)
si_sales_with_features <- si_sales_with_features %>%
left_join(si_population, by = "year")
si_sales_with_features <- si_sales_with_features %>%
select(-si_population)
si_sales_with_features <- si_sales_with_features %>%
select(-year_built)
si_sales_with_features <- si_sales_with_features %>%
select(-starts_with("zip_code"))
character_cols <- names(si_sales_with_features)[sapply(si_sales_with_features, is.character)]
cap_values <- quantile(si_sales_with_features$sale_price, c(0.01, 0.99))
si_sales_with_features <- si_sales_with_features %>%
filter(sale_price >= cap_values[1], sale_price <= cap_values[2]) # removes only clearly bad records
si_sales_with_features <- si_sales_with_features %>%
mutate(across(all_of(character_cols), as.factor))
si_sales_with_features<-si_sales_with_features %>%
drop_na()
head(si_sales_with_features)
## # A tibble: 6 × 11
## year neighborhood residential_units sale_price nta_name crime_rate_per_1000
## <dbl> <fct> <dbl> <dbl> <fct> <dbl>
## 1 2017 ANNADALE 1 866000 Annadale-… 59.9
## 2 2017 ANNADALE 1 735000 Annadale-… 59.9
## 3 2017 ANNADALE 1 350000 Annadale-… 59.9
## 4 2017 ANNADALE 1 475000 Annadale-… 59.9
## 5 2017 ANNADALE 1 437500 Annadale-… 59.9
## 6 2017 ANNADALE 1 354900 Annadale-… 59.9
## # ℹ 5 more variables: avg_school_ranking <dbl>, number_of_permits <int>,
## # building_age <dbl>, land_square_feet_log <dbl>,
## # yoy_growth_percentage_si <dbl>
num_data <- si_sales_with_features %>%
select(where(is.numeric))
cat("Number of numeric columns:", ncol(num_data), "\n")
## Number of numeric columns: 9
corr_matrix <- cor(num_data, use = "pairwise.complete.obs")
max_cols <- min(10, ncol(corr_matrix))
round(corr_matrix[1:max_cols, 1:max_cols], 2)
## year residential_units sale_price crime_rate_per_1000
## year 1.00 0.01 0.30 -0.01
## residential_units 0.01 1.00 0.26 0.12
## sale_price 0.30 0.26 1.00 -0.21
## crime_rate_per_1000 -0.01 0.12 -0.21 1.00
## avg_school_ranking 0.01 -0.08 0.27 -0.70
## number_of_permits -0.70 -0.01 -0.09 -0.18
## building_age 0.09 0.00 -0.18 0.22
## land_square_feet_log 0.03 0.19 0.47 -0.05
## yoy_growth_percentage_si -0.09 -0.01 -0.03 -0.01
## avg_school_ranking number_of_permits building_age
## year 0.01 -0.70 0.09
## residential_units -0.08 -0.01 0.00
## sale_price 0.27 -0.09 -0.18
## crime_rate_per_1000 -0.70 -0.18 0.22
## avg_school_ranking 1.00 0.27 -0.30
## number_of_permits 0.27 1.00 -0.15
## building_age -0.30 -0.15 1.00
## land_square_feet_log 0.06 0.06 0.29
## yoy_growth_percentage_si 0.02 0.02 -0.02
## land_square_feet_log yoy_growth_percentage_si
## year 0.03 -0.09
## residential_units 0.19 -0.01
## sale_price 0.47 -0.03
## crime_rate_per_1000 -0.05 -0.01
## avg_school_ranking 0.06 0.02
## number_of_permits 0.06 0.02
## building_age 0.29 -0.02
## land_square_feet_log 1.00 0.00
## yoy_growth_percentage_si 0.00 1.00
## There’s no multicollinearity problem — all correlations are below ±0.9.
Models
set.seed(123)
idx <- sample(seq_len(nrow(si_sales_with_features)), size = 0.8 * nrow(si_sales_with_features))
train_data_wf <- si_sales_with_features[idx, ]
test_data_wf <- si_sales_with_features[-idx, ]
# model matrices
train_features_wf <- model.matrix(~ . - sale_price - 1, data = train_data_wf)
test_features_wf <- model.matrix(~ . - sale_price - 1, data = test_data_wf)
train_target_wf <- train_data_wf$sale_price
test_target_wf <- test_data_wf$sale_price
dtrain_wf <- xgb.DMatrix(data = as.matrix(train_features_wf), label = train_target_wf)
dtest_wf <- xgb.DMatrix(data = as.matrix(test_features_wf), label = test_target_wf)
train_df <- data.frame(train_target_wf, train_features_wf)
lm_model <- lm(train_target_wf ~ ., data = train_df)
summary(lm_model)
##
## Call:
## lm(formula = train_target_wf ~ ., data = train_df)
##
## Residuals:
## Min 1Q Median 3Q Max
## -866770 -80932 -1936 75374 1644934
##
## Coefficients: (21 not defined because of singularities)
## Estimate
## (Intercept) -5.795e+07
## year 2.839e+04
## neighborhoodANNADALE 9.008e+04
## neighborhoodARDEN.HEIGHTS -2.542e+04
## neighborhoodARROCHAR 5.515e+04
## neighborhoodARROCHAR.SHORE.ACRES 2.380e+04
## neighborhoodBLOOMFIELD 9.526e+04
## neighborhoodBULLS.HEAD -5.395e+04
## neighborhoodCASTLETON.CORNERS 1.957e+04
## neighborhoodCLOVE.LAKES 4.651e+04
## neighborhoodCONCORD -9.979e+04
## neighborhoodCONCORD.FOX.HILLS -9.283e+04
## neighborhoodDONGAN.HILLS 6.633e+04
## neighborhoodDONGAN.HILLS.COLONY 1.683e+05
## neighborhoodDONGAN.HILLS.OLD.TOWN -9.819e+04
## neighborhoodELTINGVILLE -6.988e+03
## neighborhoodEMERSON.HILL 2.108e+05
## neighborhoodFRESH.KILLS NA
## neighborhoodGRANT.CITY 2.829e+04
## neighborhoodGRASMERE 5.955e+04
## neighborhoodGREAT.KILLS -2.648e+03
## neighborhoodGREAT.KILLS.BAY.TERRACE 1.044e+04
## neighborhoodGRYMES.HILL 2.179e+04
## neighborhoodHUGUENOT 1.019e+05
## neighborhoodLIVINGSTON -3.344e+04
## neighborhoodMANOR.HEIGHTS 2.693e+04
## neighborhoodMARINERS.HARBOR -1.635e+05
## neighborhoodMIDLAND.BEACH -4.028e+04
## neighborhoodNEW.BRIGHTON -9.029e+04
## neighborhoodNEW.BRIGHTON.ST..GEORGE 2.548e+05
## neighborhoodNEW.DORP 5.092e+04
## neighborhoodNEW.DORP.BEACH -8.538e+04
## neighborhoodNEW.DORP.HEIGHTS 5.618e+04
## neighborhoodNEW.SPRINGVILLE 2.087e+04
## neighborhoodOAKWOOD 1.936e+04
## neighborhoodOAKWOOD.BEACH -5.770e+03
## neighborhoodPLEASANT.PLAINS 8.839e+04
## neighborhoodPORT.IVORY -1.987e+05
## neighborhoodPORT.RICHMOND -1.143e+05
## neighborhoodPRINCES.BAY 5.975e+04
## neighborhoodRICHMONDTOWN 1.295e+05
## neighborhoodRICHMONDTOWN.LIGHTHS.HILL 1.962e+05
## neighborhoodROSEBANK -9.302e+03
## neighborhoodROSSVILLE 1.737e+04
## neighborhoodROSSVILLE.CHARLESTON 5.084e+04
## neighborhoodROSSVILLE.PORT.MOBIL NA
## neighborhoodROSSVILLE.RICHMOND.VALLEY 1.532e+05
## neighborhoodSILVER.LAKE 9.783e+04
## neighborhoodSOUTH.BEACH -1.053e+04
## neighborhoodSTAPLETON -8.632e+04
## neighborhoodSTAPLETON.CLIFTON -9.103e+04
## neighborhoodSUNNYSIDE 1.811e+04
## neighborhoodTODT.HILL 3.672e+05
## neighborhoodTOMPKINSVILLE -5.432e+04
## neighborhoodTOTTENVILLE 4.954e+04
## neighborhoodTRAVIS -5.661e+04
## neighborhoodWEST.NEW.BRIGHTON -6.901e+04
## neighborhoodWESTERLEIGH 1.465e+04
## neighborhoodWILLOWBROOK 2.919e+04
## neighborhoodWOODROW NA
## residential_units 9.852e+04
## nta_nameArden.Heights.Rossville NA
## nta_nameFreshkills.Park..South. NA
## nta_nameGrasmere.Arrochar.South.Beach.Dongan.Hills NA
## nta_nameGreat.Kills.Eltingville NA
## nta_nameMariner.s.Harbor.Arlington.Graniteville NA
## nta_nameNew.Dorp.Midland.Beach NA
## nta_nameNew.Springville.Willowbrook.Bulls.Head.Travis NA
## nta_nameOakwood.Richmondtown NA
## nta_namePort.Richmond NA
## nta_nameRosebank.Shore.Acres.Park.Hill NA
## nta_nameSt..George.New.Brighton NA
## nta_nameTodt.Hill.Emerson.Hill.Lighthouse.Hill.Manor.Heights NA
## nta_nameTompkinsville.Stapleton.Clifton.Fox.Hills NA
## nta_nameTottenville.Charleston NA
## nta_nameWest.New.Brighton.Silver.Lake.Grymes.Hill NA
## nta_nameWesterleigh.Castleton.Corners NA
## crime_rate_per_1000 NA
## avg_school_ranking NA
## number_of_permits 1.644e+00
## building_age -2.131e+03
## land_square_feet_log 1.489e+05
## yoy_growth_percentage_si -1.960e+03
## Std. Error t value
## (Intercept) 1.342e+06 -43.167
## year 6.625e+02 42.855
## neighborhoodANNADALE 8.334e+03 10.809
## neighborhoodARDEN.HEIGHTS 8.072e+03 -3.149
## neighborhoodARROCHAR 1.523e+04 3.622
## neighborhoodARROCHAR.SHORE.ACRES 1.821e+04 1.307
## neighborhoodBLOOMFIELD 8.175e+04 1.165
## neighborhoodBULLS.HEAD 8.076e+03 -6.681
## neighborhoodCASTLETON.CORNERS 8.892e+03 2.200
## neighborhoodCLOVE.LAKES 1.063e+04 4.377
## neighborhoodCONCORD 1.035e+04 -9.640
## neighborhoodCONCORD.FOX.HILLS 1.424e+04 -6.521
## neighborhoodDONGAN.HILLS 1.015e+04 6.537
## neighborhoodDONGAN.HILLS.COLONY 1.162e+04 14.486
## neighborhoodDONGAN.HILLS.OLD.TOWN 3.889e+04 -2.525
## neighborhoodELTINGVILLE 7.229e+03 -0.967
## neighborhoodEMERSON.HILL 1.686e+04 12.507
## neighborhoodFRESH.KILLS NA NA
## neighborhoodGRANT.CITY 9.843e+03 2.874
## neighborhoodGRASMERE 1.072e+04 5.556
## neighborhoodGREAT.KILLS 6.706e+03 -0.395
## neighborhoodGREAT.KILLS.BAY.TERRACE 1.192e+04 0.876
## neighborhoodGRYMES.HILL 1.325e+04 1.645
## neighborhoodHUGUENOT 8.729e+03 11.670
## neighborhoodLIVINGSTON 1.420e+04 -2.356
## neighborhoodMANOR.HEIGHTS 8.973e+03 3.001
## neighborhoodMARINERS.HARBOR 8.268e+03 -19.778
## neighborhoodMIDLAND.BEACH 7.668e+03 -5.254
## neighborhoodNEW.BRIGHTON 9.342e+03 -9.665
## neighborhoodNEW.BRIGHTON.ST..GEORGE 5.475e+04 4.655
## neighborhoodNEW.DORP 8.470e+03 6.011
## neighborhoodNEW.DORP.BEACH 9.809e+03 -8.705
## neighborhoodNEW.DORP.HEIGHTS 9.858e+03 5.699
## neighborhoodNEW.SPRINGVILLE 8.054e+03 2.591
## neighborhoodOAKWOOD 1.197e+04 1.617
## neighborhoodOAKWOOD.BEACH 1.131e+04 -0.510
## neighborhoodPLEASANT.PLAINS 1.123e+04 7.874
## neighborhoodPORT.IVORY 1.417e+04 -14.021
## neighborhoodPORT.RICHMOND 8.700e+03 -13.133
## neighborhoodPRINCES.BAY 8.602e+03 6.946
## neighborhoodRICHMONDTOWN 1.074e+04 12.057
## neighborhoodRICHMONDTOWN.LIGHTHS.HILL 2.498e+04 7.853
## neighborhoodROSEBANK 8.855e+03 -1.050
## neighborhoodROSSVILLE 9.580e+03 1.813
## neighborhoodROSSVILLE.CHARLESTON 1.428e+04 3.561
## neighborhoodROSSVILLE.PORT.MOBIL NA NA
## neighborhoodROSSVILLE.RICHMOND.VALLEY 2.061e+04 7.432
## neighborhoodSILVER.LAKE 1.267e+04 7.723
## neighborhoodSOUTH.BEACH 8.115e+03 -1.298
## neighborhoodSTAPLETON 1.029e+04 -8.387
## neighborhoodSTAPLETON.CLIFTON 1.508e+04 -6.039
## neighborhoodSUNNYSIDE 1.239e+04 1.462
## neighborhoodTODT.HILL 1.607e+04 22.845
## neighborhoodTOMPKINSVILLE 1.353e+04 -4.015
## neighborhoodTOTTENVILLE 8.036e+03 6.164
## neighborhoodTRAVIS 1.128e+04 -5.017
## neighborhoodWEST.NEW.BRIGHTON 8.148e+03 -8.470
## neighborhoodWESTERLEIGH 8.468e+03 1.730
## neighborhoodWILLOWBROOK 8.805e+03 3.316
## neighborhoodWOODROW NA NA
## residential_units 2.023e+03 48.700
## nta_nameArden.Heights.Rossville NA NA
## nta_nameFreshkills.Park..South. NA NA
## nta_nameGrasmere.Arrochar.South.Beach.Dongan.Hills NA NA
## nta_nameGreat.Kills.Eltingville NA NA
## nta_nameMariner.s.Harbor.Arlington.Graniteville NA NA
## nta_nameNew.Dorp.Midland.Beach NA NA
## nta_nameNew.Springville.Willowbrook.Bulls.Head.Travis NA NA
## nta_nameOakwood.Richmondtown NA NA
## nta_namePort.Richmond NA NA
## nta_nameRosebank.Shore.Acres.Park.Hill NA NA
## nta_nameSt..George.New.Brighton NA NA
## nta_nameTodt.Hill.Emerson.Hill.Lighthouse.Hill.Manor.Heights NA NA
## nta_nameTompkinsville.Stapleton.Clifton.Fox.Hills NA NA
## nta_nameTottenville.Charleston NA NA
## nta_nameWest.New.Brighton.Silver.Lake.Grymes.Hill NA NA
## nta_nameWesterleigh.Castleton.Corners NA NA
## crime_rate_per_1000 NA NA
## avg_school_ranking NA NA
## number_of_permits 1.079e+01 0.152
## building_age 3.342e+01 -63.743
## land_square_feet_log 1.619e+03 91.931
## yoy_growth_percentage_si 6.284e+02 -3.120
## Pr(>|t|)
## (Intercept) < 2e-16 ***
## year < 2e-16 ***
## neighborhoodANNADALE < 2e-16 ***
## neighborhoodARDEN.HEIGHTS 0.001641 **
## neighborhoodARROCHAR 0.000293 ***
## neighborhoodARROCHAR.SHORE.ACRES 0.191128
## neighborhoodBLOOMFIELD 0.243898
## neighborhoodBULLS.HEAD 2.41e-11 ***
## neighborhoodCASTLETON.CORNERS 0.027784 *
## neighborhoodCLOVE.LAKES 1.21e-05 ***
## neighborhoodCONCORD < 2e-16 ***
## neighborhoodCONCORD.FOX.HILLS 7.08e-11 ***
## neighborhoodDONGAN.HILLS 6.37e-11 ***
## neighborhoodDONGAN.HILLS.COLONY < 2e-16 ***
## neighborhoodDONGAN.HILLS.OLD.TOWN 0.011582 *
## neighborhoodELTINGVILLE 0.333675
## neighborhoodEMERSON.HILL < 2e-16 ***
## neighborhoodFRESH.KILLS NA
## neighborhoodGRANT.CITY 0.004056 **
## neighborhoodGRASMERE 2.78e-08 ***
## neighborhoodGREAT.KILLS 0.692945
## neighborhoodGREAT.KILLS.BAY.TERRACE 0.381215
## neighborhoodGRYMES.HILL 0.099995 .
## neighborhoodHUGUENOT < 2e-16 ***
## neighborhoodLIVINGSTON 0.018496 *
## neighborhoodMANOR.HEIGHTS 0.002693 **
## neighborhoodMARINERS.HARBOR < 2e-16 ***
## neighborhoodMIDLAND.BEACH 1.50e-07 ***
## neighborhoodNEW.BRIGHTON < 2e-16 ***
## neighborhoodNEW.BRIGHTON.ST..GEORGE 3.26e-06 ***
## neighborhoodNEW.DORP 1.86e-09 ***
## neighborhoodNEW.DORP.BEACH < 2e-16 ***
## neighborhoodNEW.DORP.HEIGHTS 1.21e-08 ***
## neighborhoodNEW.SPRINGVILLE 0.009578 **
## neighborhoodOAKWOOD 0.105843
## neighborhoodOAKWOOD.BEACH 0.610070
## neighborhoodPLEASANT.PLAINS 3.55e-15 ***
## neighborhoodPORT.IVORY < 2e-16 ***
## neighborhoodPORT.RICHMOND < 2e-16 ***
## neighborhoodPRINCES.BAY 3.82e-12 ***
## neighborhoodRICHMONDTOWN < 2e-16 ***
## neighborhoodRICHMONDTOWN.LIGHTHS.HILL 4.18e-15 ***
## neighborhoodROSEBANK 0.293547
## neighborhoodROSSVILLE 0.069815 .
## neighborhoodROSSVILLE.CHARLESTON 0.000370 ***
## neighborhoodROSSVILLE.PORT.MOBIL NA
## neighborhoodROSSVILLE.RICHMOND.VALLEY 1.10e-13 ***
## neighborhoodSILVER.LAKE 1.17e-14 ***
## neighborhoodSOUTH.BEACH 0.194407
## neighborhoodSTAPLETON < 2e-16 ***
## neighborhoodSTAPLETON.CLIFTON 1.57e-09 ***
## neighborhoodSUNNYSIDE 0.143728
## neighborhoodTODT.HILL < 2e-16 ***
## neighborhoodTOMPKINSVILLE 5.96e-05 ***
## neighborhoodTOTTENVILLE 7.17e-10 ***
## neighborhoodTRAVIS 5.27e-07 ***
## neighborhoodWEST.NEW.BRIGHTON < 2e-16 ***
## neighborhoodWESTERLEIGH 0.083680 .
## neighborhoodWILLOWBROOK 0.000915 ***
## neighborhoodWOODROW NA
## residential_units < 2e-16 ***
## nta_nameArden.Heights.Rossville NA
## nta_nameFreshkills.Park..South. NA
## nta_nameGrasmere.Arrochar.South.Beach.Dongan.Hills NA
## nta_nameGreat.Kills.Eltingville NA
## nta_nameMariner.s.Harbor.Arlington.Graniteville NA
## nta_nameNew.Dorp.Midland.Beach NA
## nta_nameNew.Springville.Willowbrook.Bulls.Head.Travis NA
## nta_nameOakwood.Richmondtown NA
## nta_namePort.Richmond NA
## nta_nameRosebank.Shore.Acres.Park.Hill NA
## nta_nameSt..George.New.Brighton NA
## nta_nameTodt.Hill.Emerson.Hill.Lighthouse.Hill.Manor.Heights NA
## nta_nameTompkinsville.Stapleton.Clifton.Fox.Hills NA
## nta_nameTottenville.Charleston NA
## nta_nameWest.New.Brighton.Silver.Lake.Grymes.Hill NA
## nta_nameWesterleigh.Castleton.Corners NA
## crime_rate_per_1000 NA
## avg_school_ranking NA
## number_of_permits 0.878934
## building_age < 2e-16 ***
## land_square_feet_log < 2e-16 ***
## yoy_growth_percentage_si 0.001813 **
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 162900 on 31730 degrees of freedom
## Multiple R-squared: 0.5233, Adjusted R-squared: 0.5224
## F-statistic: 571 on 61 and 31730 DF, p-value: < 2.2e-16
test_df <- data.frame(test_features_wf)
predsln <- predict(lm_model, newdata = test_df)
## Warning in predict.lm(lm_model, newdata = test_df): prediction from
## rank-deficient fit; attr(*, "non-estim") has doubtful cases
rmse_val_lin <- sqrt(mean((test_target_wf - predsln)^2))
mae_val_lin <- mean(abs(test_target_wf - predsln))
r2_val_lin <- 1 - sum((test_target_wf - predsln)^2) / sum((test_target_wf - mean(test_target_wf))^2)
lr_model <- data.frame(
Model = "Linear_Regression",
RMSE = rmse_val_lin,
MAE = mae_val_lin,
R2 = r2_val_lin
)
mean_price_lin <- mean(test_target_wf)
# percentage error
pct_error_lin <- (test_target_wf - predsln) / test_target_wf
MAPE_lin <- mean(abs(pct_error_lin), na.rm = TRUE) * 100
# normalized RMSE and MAE
RMSE_pct_lin <- rmse_val_lin / mean_price_lin
MAE_pct_lin <- mae_val_lin / mean_price_lin
normalized_lin <- data.frame(
RMSE_as_pct_of_avg_price = RMSE_pct_lin,
MAE_as_pct_of_avg_price = MAE_pct_lin,
MAPE_percent = MAPE_lin
)
print(lr_model)
## Model RMSE MAE R2
## 1 Linear_Regression 161079.3 112535 0.5404037
## RMSE_as_pct_of_avg_price MAE_as_pct_of_avg_price MAPE_percent
## 1 0.2512699 0.1755449 21.2104
params <- list(
objective = "reg:squarederror",
eta = 0.02,
max_depth = 7,
subsample = 0.9,
colsample_bytree = 0.85,
min_child_weight = 2,
gamma = 0.1,
lambda = 1.2,
alpha = 0.2
)
model_wf <- xgb.train(
params = params,
data = dtrain_wf,
nrounds = 6000,
watchlist = list(train = dtrain_wf, test = dtest_wf),
early_stopping_rounds = 120,
print_every_n = 50,
verbose = 1
)
## [1] train-rmse:668039.921953 test-rmse:670772.637998
## Multiple eval metrics are present. Will use test_rmse for early stopping.
## Will train until test_rmse hasn't improved in 120 rounds.
##
## [51] train-rmse:282212.813243 test-rmse:284739.851548
## [101] train-rmse:167432.250826 test-rmse:171806.694432
## [151] train-rmse:141833.423118 test-rmse:148321.012310
## [201] train-rmse:135835.331342 test-rmse:144018.196184
## [251] train-rmse:133653.192904 test-rmse:143038.662974
## [301] train-rmse:132247.342682 test-rmse:142665.030593
## [351] train-rmse:131074.183760 test-rmse:142421.555985
## [401] train-rmse:130122.670667 test-rmse:142275.827925
## [451] train-rmse:129201.682523 test-rmse:142184.150786
## [501] train-rmse:128280.948434 test-rmse:142068.929157
## [551] train-rmse:127334.459796 test-rmse:141950.369875
## [601] train-rmse:126406.723868 test-rmse:141860.721966
## [651] train-rmse:125536.840913 test-rmse:141807.047152
## [701] train-rmse:124644.164614 test-rmse:141782.136912
## [751] train-rmse:123880.619362 test-rmse:141756.845628
## [801] train-rmse:123045.399304 test-rmse:141754.578410
## [851] train-rmse:122231.052345 test-rmse:141706.063893
## [901] train-rmse:121400.287204 test-rmse:141661.174172
## [951] train-rmse:120707.192075 test-rmse:141698.676292
## [1001] train-rmse:119924.615465 test-rmse:141658.236588
## [1051] train-rmse:119206.479108 test-rmse:141655.300503
## [1101] train-rmse:118536.855677 test-rmse:141661.072831
## Stopping. Best iteration:
## [1026] train-rmse:119543.596830 test-rmse:141652.272101
preds_wf <- predict(model_wf, newdata = test_features_wf)
rmse_val_wf <- rmse(test_target_wf, preds_wf)
mae_val_wf <- mae(test_target_wf, preds_wf)
r2_val_wf <- 1 - sum((test_target_wf - preds_wf)^2) / sum((test_target_wf - mean(test_target_wf))^2)
xgboost_metrics <- data.frame(
Model = "XGBoost",
RMSE = rmse_val_wf,
MAE = mae_val_wf,
R2 = r2_val_wf
)
mean_price_xgb <- mean(test_target_wf)
pct_error_xgb <- (test_target_wf - preds_wf) / test_target_wf
MAPE_xgb <- mean(abs(pct_error_xgb), na.rm = TRUE) * 100
RMSE_pct_xgb <- rmse_val_wf / mean_price_xgb
MAE_pct_xgb <- mae_val_wf / mean_price_xgb
normalized_xgb <- data.frame(
RMSE_as_pct_of_avg_price = RMSE_pct_xgb,
MAE_as_pct_of_avg_price = MAE_pct_xgb,
MAPE_percent = MAPE_xgb
)
print(normalized_xgb)
## RMSE_as_pct_of_avg_price MAE_as_pct_of_avg_price MAPE_percent
## 1 0.2209653 0.1522951 18.84078
## Model RMSE MAE R2
## 1 XGBoost 141652.3 97630.48 0.6445782
train_scaled <- scale(train_features_wf)
test_scaled <- scale(test_features_wf, center = attr(train_scaled, "scaled:center"),
scale = attr(train_scaled, "scaled:scale"))
train_scaled[is.na(train_scaled)] <- median(train_scaled, na.rm = TRUE)
test_scaled[is.na(test_scaled)] <- median(test_scaled, na.rm = TRUE)
set.seed(123)
preds_wf1 <- knn.reg(
train = train_scaled,
test = test_scaled,
y = train_target_wf,
k = 20
)$pred
rmse_val_knn <- sqrt(mean((test_target_wf - preds_wf1)^2))
mae_val_knn <- mean(abs(test_target_wf - preds_wf1))
r2_val_knn <- 1 - sum((test_target_wf - preds_wf1)^2) /
sum((test_target_wf - mean(test_target_wf))^2)
knn_metrics <- data.frame(
Model = "kNN",
RMSE = rmse_val_knn,
MAE = mae_val_knn,
R2 = r2_val_knn
)
print(knn_metrics)
## Model RMSE MAE R2
## 1 kNN 156144.6 109437.1 0.568132
mean_price_knn <- mean(test_target_wf)
pct_error_knn <- (test_target_wf - preds_wf1) / test_target_wf
MAPE_knn <- mean(abs(pct_error_knn), na.rm = TRUE) * 100
RMSE_pct_knn <- rmse_val_knn / mean_price_knn
MAE_pct_knn <- mae_val_knn / mean_price_knn
normalized_knn <- data.frame(
RMSE_as_pct_of_avg_price = RMSE_pct_knn,
MAE_as_pct_of_avg_price = MAE_pct_knn,
MAPE_percent = MAPE_knn
)
print(normalized_knn)
## RMSE_as_pct_of_avg_price MAE_as_pct_of_avg_price MAPE_percent
## 1 0.2435722 0.1707124 20.48197
metrics_table <- bind_rows(
lr_model,
knn_metrics,
xgboost_metrics
)
metrics_table %>%
kbl(
caption = "Table 2: Predictive Performance Metrics Across Models",
digits = 3,
align = "c",
booktabs = TRUE
) %>%
kable_styling(
latex_options = c("striped", "hold_position"),
full_width = FALSE,
position = "center"
) %>%
row_spec(
which(metrics_table$Model == "XGBoost_with_features .2"),
bold = TRUE
)
Table 2: Predictive Performance Metrics Across Models
|
Model
|
RMSE
|
MAE
|
R2
|
|
Linear_Regression
|
161079.3
|
112535.03
|
0.540
|
|
kNN
|
156144.6
|
109437.07
|
0.568
|
|
XGBoost
|
141652.3
|
97630.48
|
0.645
|
normalized_all <- bind_rows(
cbind(Model = "Linear Regression", normalized_lin),
cbind(Model = "kNN Regression", normalized_knn),
cbind(Model = "XGBoost", normalized_xgb)
)
normalized_all %>%
kbl(
caption = "Table 3: Normalised Error Metrics Across Models",
digits = 3,
align = "c",
booktabs = TRUE
) %>%
kable_styling(
latex_options = c("striped", "hold_position"),
full_width = FALSE,
position = "center"
) %>%
row_spec(
which(normalized_all$Model == "XGBoost"),
bold = TRUE
)
Table 3: Normalised Error Metrics Across Models
|
Model
|
RMSE_as_pct_of_avg_price
|
MAE_as_pct_of_avg_price
|
MAPE_percent
|
|
Linear Regression
|
0.251
|
0.176
|
21.210
|
|
kNN Regression
|
0.244
|
0.171
|
20.482
|
|
XGBoost
|
0.221
|
0.152
|
18.841
|
residuals_df <- bind_rows(
data.frame(
Model = "Linear Regression",
Actual = test_target_wf,
Predicted = predsln,
Residual = test_target_wf - predsln,
Index = seq_along(test_target_wf)
),
data.frame(
Model = "kNN",
Actual = test_target_wf,
Predicted = preds_wf1,
Residual = test_target_wf - preds_wf1,
Index = seq_along(test_target_wf)
),
data.frame(
Model = "XGBoost",
Actual = test_target_wf,
Predicted = preds_wf,
Residual = test_target_wf - preds_wf,
Index = seq_along(test_target_wf)
)
)
Residuals vs Fitted
ggplot(residuals_df, aes(x = Predicted, y = Residual)) +
geom_point(alpha = 0.5) +
geom_hline(yintercept = 0, linetype = "dashed") +
facet_wrap(~Model, scales = "free") +
labs(
title = "Residuals vs Fitted Values",
x = "Predicted Sale Price",
y = "Residuals"
) +
theme_minimal()

Histogram of Residuals
ggplot(residuals_df, aes(x = Residual)) +
geom_histogram(bins = 40, alpha = 0.7) +
facet_wrap(~Model, scales = "free") +
labs(
title = "Distribution of Residuals",
x = "Residual",
y = "Frequency"
) +
theme_minimal()

Q–Q Plot of Residuals
ggplot(residuals_df, aes(sample = Residual)) +
stat_qq() +
stat_qq_line() +
facet_wrap(~Model, scales = "free") +
labs(
title = "Q–Q Plots of Residuals",
x = "Theoretical Quantiles",
y = "Sample Quantiles"
) +
theme_minimal()

Actual vs Predicted
ggplot(residuals_df, aes(x = Actual, y = Predicted)) +
geom_point(alpha = 0.6) +
geom_abline(slope = 1, intercept = 0, linetype = "dashed") +
facet_wrap(~Model, scales = "free") +
labs(
title = "Actual vs Predicted Sale Prices",
x = "Actual Sale Price",
y = "Predicted Sale Price"
) +
theme_minimal()

Pilot plan:Forecasting
si_sales_with_features_econ <- si_sales_with_features %>%
left_join(economic_factors, by = "year")
set.seed(123)
idx <- sample(seq_len(nrow(si_sales_with_features_econ)), size = 0.8 * nrow(si_sales_with_features_econ))
train_data_wf <- si_sales_with_features_econ[idx, ]
test_data_wf <- si_sales_with_features_econ[-idx, ]
train_features_wf <- model.matrix(~ . - sale_price - 1, data = train_data_wf)
test_features_wf <- model.matrix(~ . - sale_price - 1, data = test_data_wf)
train_target_wf <- train_data_wf$sale_price
test_target_wf <- test_data_wf$sale_price
dtrain_wf <- xgb.DMatrix(data = as.matrix(train_features_wf), label = train_target_wf)
dtest_wf <- xgb.DMatrix(data = as.matrix(test_features_wf), label = test_target_wf)
params <- list(
objective = "reg:squarederror",
eta = 0.02,
max_depth = 7,
subsample = 0.9,
colsample_bytree = 0.85,
min_child_weight = 2,
gamma = 0.1,
lambda = 1.2,
alpha = 0.2
)
model_econ <- xgb.train(
params = params,
data = dtrain_wf,
nrounds = 6000,
watchlist = list(train = dtrain_wf, test = dtest_wf),
early_stopping_rounds = 120,
print_every_n = 50,
verbose = 1
)
## [1] train-rmse:668039.921953 test-rmse:670772.637998
## Multiple eval metrics are present. Will use test_rmse for early stopping.
## Will train until test_rmse hasn't improved in 120 rounds.
##
## [51] train-rmse:281523.310598 test-rmse:284047.781693
## [101] train-rmse:167316.643948 test-rmse:171832.538989
## [151] train-rmse:141866.303479 test-rmse:148407.532598
## [201] train-rmse:135998.588114 test-rmse:144097.716280
## [251] train-rmse:133876.547191 test-rmse:143091.479699
## [301] train-rmse:132446.799127 test-rmse:142682.447261
## [351] train-rmse:131227.990787 test-rmse:142410.340236
## [401] train-rmse:130185.187325 test-rmse:142261.349864
## [451] train-rmse:129198.104811 test-rmse:142105.480675
## [501] train-rmse:128218.938997 test-rmse:142005.130696
## [551] train-rmse:127292.590057 test-rmse:141914.348203
## [601] train-rmse:126401.781493 test-rmse:141848.464033
## [651] train-rmse:125553.466553 test-rmse:141762.158493
## [701] train-rmse:124701.897431 test-rmse:141717.997321
## [751] train-rmse:123850.279845 test-rmse:141653.966366
## [801] train-rmse:123045.249232 test-rmse:141582.146880
## [851] train-rmse:122243.799860 test-rmse:141564.444292
## [901] train-rmse:121357.777711 test-rmse:141530.880372
## [951] train-rmse:120568.095962 test-rmse:141528.707121
## [1001] train-rmse:119882.551178 test-rmse:141510.443205
## [1051] train-rmse:119068.445800 test-rmse:141567.719868
## Stopping. Best iteration:
## [941] train-rmse:120737.167013 test-rmse:141494.720605
preds_wf <- predict(model_econ, newdata = test_features_wf)
rmse_val_wf <- rmse(test_target_wf, preds_wf)
mae_val_wf <- mae(test_target_wf, preds_wf)
r2_val_wf <- 1 - sum((test_target_wf - preds_wf)^2) / sum((test_target_wf - mean(test_target_wf))^2)
econ <- data.frame(
Model = "XGBoost_with_features_Econ",
RMSE = rmse_val_wf,
MAE = mae_val_wf,
R2 = r2_val_wf
)
print(econ)
## Model RMSE MAE R2
## 1 XGBoost_with_features_Econ 141494.7 97640.86 0.6453684
Importance
importance_df <- xgb.importance(
feature_names = model_econ$feature_names,
model = model_econ
)
importance_df_clean <- importance_df %>%
select(Feature, Gain, Cover, Frequency) %>%
arrange(desc(Gain)) %>%
mutate(
Rank = row_number(),
Gain_Percent = round(100 * Gain / sum(Gain), 2)
)
# top drivers
head(importance_df_clean, 15)
## Feature Gain
## <char> <num>
## 1: land_square_feet_log 0.425993622
## 2: building_age 0.230422127
## 3: year 0.106835234
## 4: residential_units 0.042525457
## 5: crime_rate_per_1000 0.039539506
## 6: avg_school_ranking 0.033437261
## 7: all_items_annual_avg_cpi 0.020154534
## 8: number_of_permits 0.015654791
## 9: yoy_growth_percentage_si 0.006382437
## 10: nta_nameTodt Hill-Emerson Hill-Lighthouse Hill-Manor Heights 0.005355159
## 11: neighborhoodTODT HILL 0.004661509
## 12: avg_mortgage_rate 0.004121506
## 13: neighborhoodDONGAN HILLS-COLONY 0.003349855
## 14: neighborhoodARDEN HEIGHTS 0.002972110
## 15: neighborhoodWEST NEW BRIGHTON 0.002396801
## Cover Frequency Rank Gain_Percent
## <num> <num> <int> <num>
## 1: 0.342785259 0.301562917 1 42.60
## 2: 0.125050356 0.227606871 2 23.04
## 3: 0.044291870 0.082792682 3 10.68
## 4: 0.034306910 0.051089979 4 4.25
## 5: 0.016519567 0.025989360 5 3.95
## 6: 0.009377774 0.017482828 6 3.34
## 7: 0.007392482 0.006462425 7 2.02
## 8: 0.025366703 0.053375316 8 1.57
## 9: 0.005677096 0.043446796 9 0.64
## 10: 0.005371268 0.003262953 10 0.54
## 11: 0.007761550 0.002805886 11 0.47
## 12: 0.004118376 0.027855719 12 0.41
## 13: 0.012140370 0.003326435 13 0.33
## 14: 0.005224645 0.001840966 14 0.30
## 15: 0.009030806 0.003770806 15 0.24
importance_top15 <- importance_df_clean[1:15, ]
ggplot(
importance_top15,
aes(x = reorder(Feature, Gain_Percent), y = Gain_Percent)
) +
geom_col(
fill = "#1a9850",
width = 0.75
) +
coord_flip() +
labs(
title = "Top 15 Feature Importances",
subtitle = "Gain-based relative contribution to predictive accuracy",
x = "Feature",
y = "Relative Importance (%)"
) +
theme_minimal(base_size = 13) +
theme(
plot.title = element_text(face = "bold", size = 16, hjust = 0.5),
plot.subtitle = element_text(size = 12, hjust = 0.5),
axis.title.x = element_text(face = "bold", size = 13),
axis.title.y = element_text(face = "bold", size = 13),
axis.text = element_text(size = 11),
panel.grid.major.y = element_blank(),
panel.grid.minor = element_blank(),
plot.margin = margin(15, 20, 15, 20)
)

ggsave(
filename = "Figure_5_5_Feature_Importance_XGBoost_Thesis.png",
width = 10,
height = 7,
dpi = 300,
bg = "white"
)
crime_school_importance <- importance_df_clean %>%
filter(grepl("crime|school|education", Feature, ignore.case = TRUE))
crime_school_importance
## Feature Gain Cover Frequency Rank Gain_Percent
## <char> <num> <num> <num> <int> <num>
## 1: crime_rate_per_1000 0.03953951 0.016519567 0.02598936 5 3.95
## 2: avg_school_ranking 0.03343726 0.009377774 0.01748283 6 3.34
kable(
crime_school_importance,
caption = "Importance of Crime and School Quality Variables",
align = "c"
) %>%
kable_styling(full_width = FALSE)
Importance of Crime and School Quality Variables
|
Feature
|
Gain
|
Cover
|
Frequency
|
Rank
|
Gain_Percent
|
|
crime_rate_per_1000
|
0.0395395
|
0.0165196
|
0.0259894
|
5
|
3.95
|
|
avg_school_ranking
|
0.0334373
|
0.0093778
|
0.0174828
|
6
|
3.34
|
## Now its time to prepare the data we’ll feed into the prediction model
feature_template <- si_sales_with_features %>%
select(-sale_price)
future_features <- feature_template %>%
filter(year == 2025) %>%
select(-year) %>%
distinct() %>%
crossing(year = 2026:2030)
# Blueprint: 2017-2021 historical data
blueprint <- economic_factors %>%
filter(year >= 2017 & year <= 2021) %>%
select(year, avg_mortgage_rate, all_items_annual_avg_cpi)
# Calculate year-to-year percentage changes
blueprint_trends <- blueprint %>%
arrange(year) %>%
mutate(
cpi_change = c(0, diff(all_items_annual_avg_cpi) / head(all_items_annual_avg_cpi, -1)),
avg_mortgage_rate_change = c(0, diff(avg_mortgage_rate) / head(avg_mortgage_rate, -1))
)
# Compute average annual changes
avg_changes <- blueprint_trends %>%
summarise(
mortgage_rate_change = mean(avg_mortgage_rate_change[-1], na.rm = TRUE),
cpi_change = mean(cpi_change[-1], na.rm = TRUE)
)
# These averages will help us project the future.
latest_values <- tail(economic_factors, 1) %>%
select(avg_mortgage_rate, all_items_annual_avg_cpi)
# Project future years (CPI relative to 2017 = 100)
future_years <- data.frame(year = 2026:2030)
future_years$avg_mortgage_rate[1] <- latest_values$avg_mortgage_rate * (1 + avg_changes$mortgage_rate_change)
future_years$all_items_annual_avg_cpi[1] <- latest_values$all_items_annual_avg_cpi * (1 + avg_changes$cpi_change)
for (i in 2:nrow(future_years)) {
future_years$avg_mortgage_rate[i] <- future_years$avg_mortgage_rate[i-1] * (1 + avg_changes$mortgage_rate_change)
future_years$all_items_annual_avg_cpi[i] <- future_years$all_items_annual_avg_cpi[i-1] * (1 + avg_changes$cpi_change)
}
econ_full <- bind_rows(blueprint, future_years)
feature_template <- si_sales_with_features %>% select(-sale_price)
future_features <- feature_template %>%
filter(year == 2025) %>%
select(-year) %>%
distinct() %>%
crossing(year = 2026:2030) %>%
left_join(future_years, by = "year")
econ_full <- bind_rows(blueprint, future_years)
feature_template <- si_sales_with_features %>%
select(-sale_price)
future_features <- feature_template %>%
filter(year == 2025) %>%
select(-year) %>%
distinct() %>%
crossing(year = 2026:2030)
future_features <- future_features %>%
left_join(future_years, by = "year")
view(future_features)
model_features <- model_econ$feature_names
future_features_matrix <- model.matrix(~ . - 1, data = future_features)
extra_cols <- setdiff(colnames(future_features_matrix), model_features)
if (length(extra_cols) > 0) {
future_features_matrix <- future_features_matrix[, !(colnames(future_features_matrix) %in% extra_cols), drop = FALSE]
}
missing_cols <- setdiff(model_features, colnames(future_features_matrix))
for (col in missing_cols) {
future_features_matrix <- cbind(future_features_matrix, setNames(data.frame(0), col))
}
future_features_matrix <- future_features_matrix[, model_features, drop = FALSE]
future_matrix_clean <- as.matrix(future_features_matrix)
predictions <- predict(model_econ, newdata = future_matrix_clean)
future_forecast <- future_features %>%
mutate(predicted_sale_price = predictions)
future_forecast_summary_nta <- future_forecast %>%
group_by(year, nta_name) %>%
summarise(mean_price = mean(predicted_sale_price, na.rm = TRUE)) %>%
arrange(nta_name, year)
## `summarise()` has grouped output by 'year'. You can override using the
## `.groups` argument.
future_forecast_summary_neighborhood <- future_forecast %>%
group_by(year, neighborhood) %>%
summarise(mean_price = mean(predicted_sale_price, na.rm = TRUE)) %>%
arrange(neighborhood, year)
## `summarise()` has grouped output by 'year'. You can override using the
## `.groups` argument.
future_forecast_summary_nta
## # A tibble: 80 × 3
## # Groups: year [5]
## year nta_name mean_price
## <int> <fct> <dbl>
## 1 2026 Annadale-Huguenot-Prince's Bay-Woodrow 914210.
## 2 2027 Annadale-Huguenot-Prince's Bay-Woodrow 914210.
## 3 2028 Annadale-Huguenot-Prince's Bay-Woodrow 913703.
## 4 2029 Annadale-Huguenot-Prince's Bay-Woodrow 913703.
## 5 2030 Annadale-Huguenot-Prince's Bay-Woodrow 912697.
## 6 2026 Arden Heights-Rossville 777333.
## 7 2027 Arden Heights-Rossville 777333.
## 8 2028 Arden Heights-Rossville 776933.
## 9 2029 Arden Heights-Rossville 776933.
## 10 2030 Arden Heights-Rossville 776482.
## # ℹ 70 more rows
future_forecast_summary_neighborhood
## # A tibble: 275 × 3
## # Groups: year [5]
## year neighborhood mean_price
## <int> <fct> <dbl>
## 1 2026 ANNADALE 962672.
## 2 2027 ANNADALE 962672.
## 3 2028 ANNADALE 962307.
## 4 2029 ANNADALE 962307.
## 5 2030 ANNADALE 960634.
## 6 2026 ARDEN HEIGHTS 655246.
## 7 2027 ARDEN HEIGHTS 655246.
## 8 2028 ARDEN HEIGHTS 654850.
## 9 2029 ARDEN HEIGHTS 654850.
## 10 2030 ARDEN HEIGHTS 654606.
## # ℹ 265 more rows
NTA Increase %
nta_baseline_2025 <- si_sales_with_features_econ %>%
filter(year == 2025) %>%
group_by(nta_name) %>%
summarise(
baseline_price_2025 = mean(sale_price, na.rm = TRUE),
.groups = "drop"
)
nta_forecast_with_baseline <- future_forecast_summary_nta %>%
left_join(nta_baseline_2025, by = "nta_name") %>%
mutate(
pct_change_from_2025 =
(mean_price - baseline_price_2025) / baseline_price_2025 * 100
)
nta_growth_2030 <- nta_forecast_with_baseline %>%
filter(year == 2030) %>%
select(nta_name, pct_change_from_2025)
## Adding missing grouping variables: `year`
nta_growth_bands <- nta_growth_2030 %>%
mutate(
trajectory_band = case_when(
pct_change_from_2025 >= quantile(pct_change_from_2025, 0.75, na.rm = TRUE) ~
"High Relative Growth",
pct_change_from_2025 >= quantile(pct_change_from_2025, 0.50, na.rm = TRUE) ~
"Moderate Growth",
pct_change_from_2025 >= quantile(pct_change_from_2025, 0.25, na.rm = TRUE) ~
"Low Growth",
TRUE ~
"Relative Deceleration"
)
)
nta_positive_growth <- nta_growth_2030 %>%
filter(pct_change_from_2025 > 0) %>%
arrange(desc(pct_change_from_2025))
ggplot(
nta_positive_growth,
aes(
x = reorder(nta_name, pct_change_from_2025),
y = pct_change_from_2025
)
) +
geom_col(width = 0.7) +
coord_flip() +
labs(
title = "NTAs with Forecasted Positive Relative Price Growth (2025–2030)",
subtitle = "Percentage increase relative to observed 2025 prices",
x = "Neighborhood Tabulation Area (NTA)",
y = "Percent Change from 2025"
) +
theme_minimal(base_size = 12)

ggsave(
filename = "nta_groeth.png",
width = 10,
height = 6,
units = "in",
dpi = 300
)
hist_units <- si_sales_with_features %>%
filter(residential_units %in% c(1, 2)) %>%
group_by(year, neighborhood, residential_units) %>%
summarise(
mean_price = mean(sale_price, na.rm = TRUE),
.groups = "drop"
) %>%
mutate(source = "Observed")
future_units <- future_forecast %>%
filter(residential_units %in% c(1, 2)) %>%
group_by(year, neighborhood, residential_units) %>%
summarise(
mean_price = mean(predicted_sale_price, na.rm = TRUE),
.groups = "drop"
) %>%
mutate(source = "Forecast")
units_panel <- bind_rows(hist_units, future_units) %>%
filter(!is.na(neighborhood)) %>%
mutate(
unit_group = ifelse(residential_units == 1,
"1 unit (single-family)",
"2 units")
)
set.seed(261)
random_neighborhoods <- units_panel %>%
distinct(neighborhood) %>%
slice_sample(n = 6) %>%
pull(neighborhood)
random_neighborhoods
## [1] GRYMES HILL ARROCHAR ROSSVILLE
## [4] NEW BRIGHTON-ST. GEORGE WEST NEW BRIGHTON WOODROW
## 58 Levels: ANNADALE ARDEN HEIGHTS ARROCHAR ARROCHAR-SHORE ACRES ... WOODROW
set.seed(261)
neighborhood_units_trends <- units_panel %>%
filter(neighborhood %in% random_neighborhoods)
combined_plot <- ggplot(
neighborhood_units_trends,
aes(x = year,
y = mean_price,
color = unit_group,
group = interaction(unit_group, neighborhood))
) +
geom_line(size = 1.1) +
facet_wrap(~ neighborhood, scales = "free_y") +
scale_y_continuous(labels = scales::dollar_format()) +
scale_color_manual(
name = "Home Type",
values = c(
"1 unit (single-family)" = "#1B9E77",
"2 units" = "#D95F02"
)
) +
labs(
title = "Observed and Forecasted Price Trends for 1-Unit and 2-Unit Homes",
subtitle = "Observed: 2017–2025 | Forecast: 2026–2030",
x = "Year",
y = "Average Sale Price"
) +
theme_minimal(base_size = 13) +
theme(
plot.title = element_text(face = "bold", hjust = 0.5, size = 15),
plot.subtitle = element_text(hjust = 0.5, size = 11),
strip.text = element_text(face = "bold"),
legend.position = "bottom",
panel.grid.minor = element_blank(),
panel.spacing = unit(1, "lines")
)
combined_plot

## Idea-Experimenting
# Average house prices per year per NTA
avg_price_per_nta <- si_sales_with_features %>%
group_by(year, nta_name) %>%
summarise(
mean_price = mean(sale_price, na.rm = TRUE),
n_sales = n(),
.groups = "drop"
)
# Average house prices per year per Neighborhood
avg_price_per_neighborhood <- si_sales_with_features %>%
group_by(year, neighborhood) %>%
summarise(
mean_price = mean(sale_price, na.rm = TRUE),
n_sales = n(),
.groups = "drop"
)
combined_nta <- bind_rows(future_forecast_summary_nta, avg_price_per_nta) %>%
arrange(nta_name, year, mean_price)%>%
select(-n_sales)
combined_neighborhood <- bind_rows(future_forecast_summary_neighborhood, avg_price_per_neighborhood) %>%
arrange(neighborhood, year, mean_price)%>%
select(-n_sales)
trend_table_simple <- neighborhood_units_trends %>%
filter(year >= 2025) %>% # keep only 2025+
arrange(neighborhood, unit_group, year) %>%
mutate(mean_price = round(mean_price, 0)) %>%
select(neighborhood, year, unit_group, mean_price) %>%
tidyr::pivot_wider(
names_from = unit_group,
values_from = mean_price
) %>%
arrange(neighborhood, year)
trend_table_simple
## # A tibble: 36 × 4
## neighborhood year `1 unit (single-family)` `2 units`
## <fct> <dbl> <dbl> <dbl>
## 1 ARROCHAR 2025 901500 940000
## 2 ARROCHAR 2026 880934 878298
## 3 ARROCHAR 2027 880934 878298
## 4 ARROCHAR 2028 880744 878150
## 5 ARROCHAR 2029 880744 878150
## 6 ARROCHAR 2030 882387 880149
## 7 GRYMES HILL 2025 877743 931756
## 8 GRYMES HILL 2026 853299 869962
## 9 GRYMES HILL 2027 853299 869962
## 10 GRYMES HILL 2028 853367 870041
## # ℹ 26 more rows
nta_map <- st_read("nynta2020_25c/nynta2020.shp")
## Reading layer `nynta2020' from data source
## `C:\Users\nicol\OneDrive\Έγγραφα\nynta2020_25c\nynta2020.shp'
## using driver `ESRI Shapefile'
## Simple feature collection with 262 features and 11 fields
## Geometry type: MULTIPOLYGON
## Dimension: XY
## Bounding box: xmin: 913175.1 ymin: 120128.4 xmax: 1067383 ymax: 272844.3
## Projected CRS: NAD83 / New York Long Island (ftUS)
si_map <- nta_map %>%
filter(BoroName == "Staten Island") %>%
mutate(
NTAName = if_else(NTAName == "Snug Harbor", "St. George-New Brighton", NTAName),
NTAName = if_else(NTAName == "Great Kills Park", "Great Kills-Eltingville", NTAName),
NTAName = if_else(NTAName == "Miller Field", "New Dorp-Midland Beach", NTAName)
)
nta_shapes <- si_map %>%
select(nta_name = NTAName, geometry)
nta_map <- st_read("nynta2020_25c/nynta2020.shp")
## Reading layer `nynta2020' from data source
## `C:\Users\nicol\OneDrive\Έγγραφα\nynta2020_25c\nynta2020.shp'
## using driver `ESRI Shapefile'
## Simple feature collection with 262 features and 11 fields
## Geometry type: MULTIPOLYGON
## Dimension: XY
## Bounding box: xmin: 913175.1 ymin: 120128.4 xmax: 1067383 ymax: 272844.3
## Projected CRS: NAD83 / New York Long Island (ftUS)
si_map <- nta_map %>%
filter(BoroName == "Staten Island") %>%
mutate(
NTAName = if_else(NTAName == "Snug Harbor", "St. George-New Brighton", NTAName),
NTAName = if_else(NTAName == "Great Kills Park", "Great Kills-Eltingville", NTAName),
NTAName = if_else(NTAName == "Miller Field", "New Dorp-Midland Beach", NTAName)
)
nta_shapes <- si_map %>%
select(nta_name = NTAName, geometry)
schools_coords <- nta_shapes %>%
st_centroid() %>%
st_coordinates() %>%
as.data.frame() %>%
bind_cols(nta_shapes %>% st_drop_geometry()) %>%
rename(lon = X, lat = Y)
## Warning: st_centroid assumes attributes are constant over geometries
schools <- schools %>%
as.data.frame()
schools_mapped <- schools %>%
mutate(
lon = schools_coords$lon[match(nta_name, schools_coords$nta_name)],
lat = schools_coords$lat[match(nta_name, schools_coords$nta_name)]
)
schools_by_nta <- schools_mapped %>%
filter(!is.na(nta_name)) %>%
group_by(nta_name) %>%
summarise(
total_schools = n(),
avg_school_rank = round(mean(Ranking, na.rm = TRUE), 1),
top_school = avg_school_rank > 8,
.groups = "drop"
)
nta_across_nj <- c(
"Tottenville-Charleston",
"New Springville-Willowbrook-Bulls Head-Travis",
"Mariner's Harbor-Arlington-Graniteville",
"Port Richmond",
"West New Brighton-Silver Lake-Grymes Hill",
"St. George-New Brighton"
)
nta_across_brooklyn <- c(
"Tompkinsville-Stapleton-Clifton-Fox Hills",
"Rosebank-Shore Acres-Park Hill",
"Grasmere-Arrochar-South Beach-Dongan Hills"
)
nta_near_nj <- data.frame(
nta_name = nta_across_nj,
near_nj = TRUE
)
nta_near_brooklyn <- data.frame(
nta_name = nta_across_brooklyn,
near_brooklyn = TRUE
)
nta_with_train <- c(
"St. George-New Brighton",
"Tompkinsville-Stapleton-Clifton-Fox Hills",
"Rosebank-Shore Acres-Park Hill",
"Grasmere-Arrochar-South Beach-Dongan Hills",
"New Dorp-Midland Beach",
"Oakwood-Richmondtown",
"Great Kills-Eltingville",
"Annadale-Huguenot-Prince's Bay-Woodrow",
"Tottenville-Charleston"
)
nta_train_access <- data.frame(
nta_name = nta_with_train,
train_access = TRUE
)
combined_nta <- combined_nta %>%
ungroup() %>%
dplyr::mutate(
near_nj = FALSE,
near_brooklyn = FALSE,
train_access = FALSE
)
combined_nta <- combined_nta %>%
left_join(nta_near_nj, by = "nta_name", suffix = c("", "_new")) %>%
left_join(nta_near_brooklyn, by = "nta_name", suffix = c("", "_new")) %>%
left_join(nta_train_access, by = "nta_name", suffix = c("", "_new")) %>%
mutate(
near_nj = if_else(!is.na(near_nj_new), TRUE, near_nj),
near_brooklyn = if_else(!is.na(near_brooklyn_new), TRUE, near_brooklyn),
train_access = if_else(!is.na(train_access_new), TRUE, train_access)
) %>%
select(-ends_with("_new"))
avg_house_age_by_nta <- si_sales_mapped %>%
filter(!is.na(year_built)) %>%
mutate(house_age = 2026 - year_built) %>%
group_by(nta_name) %>%
summarise(
avg_house_age = round(mean(house_age, na.rm = TRUE), 1),
.groups = "drop"
)
combined_nta <- combined_nta %>%
left_join(avg_house_age_by_nta, by = "nta_name")
combined_nta <- combined_nta %>%
left_join(schools_by_nta, by = "nta_name")
price_stats_2025 <- avg_price_per_nta %>%
filter(year == 2025) %>%
summarise(
min_price = min(mean_price, na.rm = TRUE),
q1_price = quantile(mean_price, 0.25, na.rm = TRUE),
median_price = median(mean_price, na.rm = TRUE),
q3_price = quantile(mean_price, 0.75, na.rm = TRUE),
max_price = max(mean_price, na.rm = TRUE)
)
price_stats_2025
## # A tibble: 1 × 5
## min_price q1_price median_price q3_price max_price
## <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 573283 650934. 750683. 836370. 915486.
Final App
A working R Shiny application is available and can
be accessed here
forecast_year <- 2026
# UI
ui <- fluidPage(
titlePanel("Staten Island Housing Price Dashboard"),
sidebarLayout(
sidebarPanel(
radioButtons(
"mode",
"What would you like to do?",
choices = c(
"📝 Take the NTA Preference Quiz" = "quiz",
"📊 View All NTAs (Forecasted Prices)" = "all"
),
selected = "quiz"
),
conditionalPanel(
condition = "input.mode == 'quiz'",
hr(),
h4("💸 Budget (Forecasted)"),
radioButtons("budget", NULL, choices = c(
"Lower-Priced: Under $650K" = "low",
"Mid-Priced: $650K – $835K" = "mid",
"Upper-Priced: Over $835K" = "high",
"I don't have a budget" = "none"
)),
hr(),
h4("🥇 Interested in Top-Ranked Schools?"),
radioButtons("schools", NULL, choices = c(
"Yes" = "yes",
"No" = "no",
"I don't care" = "na"
)),
hr(),
h4("🏠 Home Style Preference"),
radioButtons("house_age_pref", NULL, choices = c(
"More modern neighborhoods (mostly built after 1970)" = "new",
"More historic neighborhoods (mostly built before 1970)" = "old",
"I don't have a preference" = "na"
)),
hr(),
h4("🌇 Preferred Location"),
radioButtons("location", NULL, choices = c(
"Across from New Jersey" = "nj",
"Across from Brooklyn" = "bk",
"I don't mind" = "na"
)),
hr(),
h4("🚂 Train Access"),
radioButtons("train", NULL, choices = c(
"Yes" = "yes",
"No" = "no",
"I don't care" = "na"
)),
hr(),
actionButton("run_quiz", "🔍 Show My NTA Matches", class = "btn-primary")
),
hr(),
selectInput(
"neighborhood",
"Select Neighborhood (Tab 2):",
choices = sort(unique(combined_neighborhood$neighborhood))
)
),
mainPanel(
tabsetPanel(
tabPanel(
"NTA Finder",
DTOutput("quiz_results"),
hr(),
h4(textOutput("nta_detail_title")),
plotlyOutput("nta_detail_trend", height = "350px"),
DTOutput("nta_detail_table")
),
tabPanel(
"🏘️ Neighborhood Price Trends",
plotlyOutput("price_trend_plot", height = "400px"),
DTOutput("price_table")
)
)
)
)
)
# SERVER
server <- function(input, output, session) {
quiz_filtered <- eventReactive(input$run_quiz, {
df <- combined_nta %>% filter(year == forecast_year)
if (input$budget == "low") df <- df %>% filter(mean_price < 650000)
if (input$budget == "mid") df <- df %>% filter(mean_price >= 650000 & mean_price <= 835000)
if (input$budget == "high") df <- df %>% filter(mean_price > 835000)
if ("top_school" %in% colnames(df)) {
if (input$schools == "yes") df <- df %>% filter(top_school)
if (input$schools == "no") df <- df %>% filter(!top_school)
}
if ("avg_house_age" %in% colnames(df)) {
if (input$house_age_pref == "new") df <- df %>% filter(avg_house_age < 55)
if (input$house_age_pref == "old") df <- df %>% filter(avg_house_age >= 55)
}
if (input$location == "nj") df <- df %>% filter(near_nj)
if (input$location == "bk") df <- df %>% filter(near_brooklyn)
if (input$train == "yes") df <- df %>% filter(train_access)
if (input$train == "no") df <- df %>% filter(!train_access)
df
})
nta_display_data <- reactive({
if (input$mode == "all") {
return(
combined_nta %>% filter(year == forecast_year)
)
}
req(input$run_quiz)
quiz_filtered()
})
output$quiz_results <- renderDT({
df <- nta_display_data()
if (nrow(df) == 0) {
return(
datatable(
data.frame(Message = "❌ No NTAs matched your preferences."),
options = list(dom = "t"),
rownames = FALSE
)
)
}
df %>%
mutate(`Forecasted Avg Price` = dollar(round(mean_price))) %>%
select(NTA = nta_name, `Forecasted Avg Price`) %>%
datatable(
selection = "single",
options = list(pageLength = 10),
rownames = FALSE
)
})
selected_nta <- reactive({
s <- input$quiz_results_rows_selected
req(length(s) == 1)
nta_display_data()$nta_name[s]
})
# DETAIL OUTPUTS
output$nta_detail_title <- renderText({
req(selected_nta())
paste("📈 Price History for:", selected_nta())
})
output$nta_detail_trend <- renderPlotly({
req(selected_nta())
df <- combined_nta %>% filter(nta_name == selected_nta())
plot_ly(
df,
x = ~year,
y = ~mean_price,
type = "scatter",
mode = "lines+markers"
)
})
output$nta_detail_table <- renderDT({
req(selected_nta())
combined_nta %>%
filter(nta_name == selected_nta()) %>%
arrange(year) %>%
mutate(mean_price = dollar(round(mean_price))) %>%
datatable(options = list(pageLength = 10), rownames = FALSE)
})
# NEIGHBORHOOD TAB
output$price_trend_plot <- renderPlotly({
df <- combined_neighborhood %>%
filter(neighborhood == input$neighborhood)
plot_ly(
df,
x = ~year,
y = ~mean_price,
type = "scatter",
mode = "markers"
)
})
output$price_table <- renderDT({
combined_neighborhood %>%
filter(neighborhood == input$neighborhood) %>%
datatable(options = list(pageLength = 10), rownames = FALSE)
})
}
shinyApp(ui, server)
##
## Listening on http://127.0.0.1:5388
