Beyond the ‘Forgotten Borough’

Libraries

library(readr)
library(rmdformats)
library(prettydoc)
library(shiny)
library(FNN)
library(bslib)
library(formattable)
library(ggplot2)
library(tidyverse)
library(shiny)
library(leaflet)
library(sf)
library(DT)
library(RColorBrewer)
library(viridis)
library(shiny)
library(kableExtra)
library(ggplot2)
library(dplyr)
library(plotly)
library(sf)
library(scales)
library(lubridate)
library(purrr)
library(rsample)
library(xgboost)
library(scales)
library(sf) 
library(janitor)
library(readxl)
library(dplyr)
library(tidyr)
library(patchwork)
library(Metrics)
library(xgboost)
library(dplyr)
library(caret)
library(corrplot)
library(shiny)
library(rsconnect)

# sales_2017 <- read_excel("2017_statenisland.xls", skip = 4, col_types = "text") %
#  clean_names() %>%
#  mutate(year = 2017)

#sales_2018 <- read_excel("2018_statenisland.xlsx", skip = 4, col_types = "text")
# %>%
#  clean_names()%>%
#  mutate(year = 2018)

# sales_2019 <- read_excel("2019_statenisland.xlsx", skip = 4, col_types = "text") # %>%
#  clean_names()%>%
#  mutate(year = 2019)

# sales_2020 <- read_excel("2020_staten_island.xlsx", skip = 6, col_types = "text") # %>%
#  clean_names()%>%
#  mutate(year = 2020)

# sales_2021 <- read_excel("2021_staten_island.xlsx", skip = 6, col_types = "text") %>%
# clean_names()%>%
#  mutate(year = 2021)

# sales_2022 <- read_excel("2022_staten_island.xlsx", skip = 6, col_types = "text") # %>%
#  clean_names()%>%
#  mutate(year = 2022)

# sales_2023 <- read_excel("2023_staten_island.xlsx", skip = 6, col_types = "text") # %>%
#  clean_names()%>%
#  mutate(year = 2023)

# sales_2024 <- read_excel("2024_staten_island.xlsx", skip = 6, col_types = "text") %>%
#  clean_names()%>%
#  mutate(year = 2024)

# sales_2025 <- read_excel("sales_2025.xlsx", skip = 4, col_types = "text") %>%
#  clean_names()%>%
#  mutate(year = 2025)
# ------------------------------------------------------------------
# SALES DATA – REPRODUCIBILITY NOTE
#
# Original Staten Island sales files were provided as Excel workbooks
# with metadata rows and formatting issues.
#
# These files were cleaned, standardized and exported to CSV format
# to ensure consistency, lighter storage and reproducibility.
#
# The cleaned CSV files are hosted on GitHub and loaded directly below.
# The original Excel-to-CSV cleaning process is documented directly above
# ------------------------------------------------------------------

sales_2017 <- read_csv(
  "https://raw.githubusercontent.com/NikoletaEm/Capstone/main/DataUsed/2017_statenisland.csv")

sales_2018 <- read_csv(
  "https://raw.githubusercontent.com/NikoletaEm/Capstone/main/DataUsed/2018_statenisland.csv")

sales_2019 <- read_csv(
  "https://raw.githubusercontent.com/NikoletaEm/Capstone/main/DataUsed/2019_statenisland.csv")

sales_2020 <- read_csv(
  "https://raw.githubusercontent.com/NikoletaEm/Capstone/main/DataUsed/2020_staten_island.csv")

sales_2021 <- read_csv(
  "https://raw.githubusercontent.com/NikoletaEm/Capstone/main/DataUsed/2021_staten_island.csv")

sales_2022 <- read_csv(
  "https://raw.githubusercontent.com/NikoletaEm/Capstone/main/DataUsed/2022_staten_island.csv")

sales_2023 <- read_csv(
  "https://raw.githubusercontent.com/NikoletaEm/Capstone/main/DataUsed/2023_staten_island.csv")

sales_2024 <- read_csv(
  "https://raw.githubusercontent.com/NikoletaEm/Capstone/main/DataUsed/2024_staten_island.csv")

sales_2025 <- read_csv(
  "https://raw.githubusercontent.com/NikoletaEm/Capstone/main/DataUsed/sales_2025.csv")

si_sales_raw <- bind_rows(
  sales_2017, sales_2018, sales_2019, sales_2020,
  sales_2021, sales_2022, sales_2023, sales_2024, sales_2025)


head(si_sales_raw)

## # A tibble: 6 × 27
##   borough neighborhood building_class_categ…¹ tax_class_as_of_fina…² block   lot
##     <dbl> <chr>        <chr>                  <chr>                  <dbl> <dbl>
## 1       5 ANNADALE     01 ONE FAMILY DWELLIN… 1                       5395    19
## 2       5 ANNADALE     01 ONE FAMILY DWELLIN… 1                       5407    10
## 3       5 ANNADALE     01 ONE FAMILY DWELLIN… 1                       5407    39
## 4       5 ANNADALE     01 ONE FAMILY DWELLIN… 1                       5426    32
## 5       5 ANNADALE     01 ONE FAMILY DWELLIN… 1                       6205    15
## 6       5 ANNADALE     01 ONE FAMILY DWELLIN… 1                       6205    22
## # ℹ abbreviated names: ¹building_class_category,
## #   ²tax_class_as_of_final_roll_17_18
## # ℹ 21 more variables: ease_ment <lgl>,
## #   building_class_as_of_final_roll_17_18 <chr>, address <chr>,
## #   apartment_number <chr>, zip_code <dbl>, residential_units <dbl>,
## #   commercial_units <dbl>, total_units <dbl>, land_square_feet <dbl>,
## #   gross_square_feet <dbl>, year_built <dbl>, …

str(si_sales_raw)

## spc_tbl_ [74,985 × 27] (S3: spec_tbl_df/tbl_df/tbl/data.frame)
##  $ borough                              : num [1:74985] 5 5 5 5 5 5 5 5 5 5 ...
##  $ neighborhood                         : chr [1:74985] "ANNADALE" "ANNADALE" "ANNADALE" "ANNADALE" ...
##  $ building_class_category              : chr [1:74985] "01 ONE FAMILY DWELLINGS" "01 ONE FAMILY DWELLINGS" "01 ONE FAMILY DWELLINGS" "01 ONE FAMILY DWELLINGS" ...
##  $ tax_class_as_of_final_roll_17_18     : chr [1:74985] "1" "1" "1" "1" ...
##  $ block                                : num [1:74985] 5395 5407 5407 5426 6205 ...
##  $ lot                                  : num [1:74985] 19 10 39 32 15 22 58 71 6 14 ...
##  $ ease_ment                            : logi [1:74985] NA NA NA NA NA NA ...
##  $ building_class_as_of_final_roll_17_18: chr [1:74985] "A1" "A2" "A2" "A6" ...
##  $ address                              : chr [1:74985] "4 EDWIN STREET" "112 ELMBANK STREET" "193 BATHGATE STREET" "3 OCEAN DRIVEWAY" ...
##  $ apartment_number                     : chr [1:74985] NA NA NA NA ...
##  $ zip_code                             : num [1:74985] 10312 10312 10312 10312 10312 ...
##  $ residential_units                    : num [1:74985] 1 1 1 1 1 1 1 1 1 1 ...
##  $ commercial_units                     : num [1:74985] 0 0 0 0 0 0 0 0 0 0 ...
##  $ total_units                          : num [1:74985] 1 1 1 1 1 1 1 1 1 1 ...
##  $ land_square_feet                     : num [1:74985] 7258 6242 5000 2500 1546 ...
##  $ gross_square_feet                    : num [1:74985] 2230 1768 808 540 1579 ...
##  $ year_built                           : num [1:74985] 1980 1975 1920 1910 1986 ...
##  $ tax_class_at_time_of_sale            : num [1:74985] 1 1 1 1 1 1 1 1 1 1 ...
##  $ building_class_at_time_of_sale       : chr [1:74985] "A1" "A2" "A2" "A6" ...
##  $ sale_price                           : num [1:74985] 866000 735000 350000 0 475000 ...
##  $ sale_date                            : num [1:74985] 42950 43046 42933 42889 42985 ...
##  $ year                                 : num [1:74985] 2017 2017 2017 2017 2017 ...
##  $ tax_class_as_of_final_roll_18_19     : chr [1:74985] NA NA NA NA ...
##  $ building_class_as_of_final_roll_18_19: chr [1:74985] NA NA NA NA ...
##  $ tax_class_at_present                 : chr [1:74985] NA NA NA NA ...
##  $ building_class_at_present            : chr [1:74985] NA NA NA NA ...
##  $ easement                             : logi [1:74985] NA NA NA NA NA NA ...
##  - attr(*, "spec")=
##   .. cols(
##   ..   borough = col_double(),
##   ..   neighborhood = col_character(),
##   ..   building_class_category = col_character(),
##   ..   tax_class_as_of_final_roll_17_18 = col_character(),
##   ..   block = col_double(),
##   ..   lot = col_double(),
##   ..   ease_ment = col_logical(),
##   ..   building_class_as_of_final_roll_17_18 = col_character(),
##   ..   address = col_character(),
##   ..   apartment_number = col_character(),
##   ..   zip_code = col_double(),
##   ..   residential_units = col_double(),
##   ..   commercial_units = col_double(),
##   ..   total_units = col_double(),
##   ..   land_square_feet = col_double(),
##   ..   gross_square_feet = col_double(),
##   ..   year_built = col_double(),
##   ..   tax_class_at_time_of_sale = col_double(),
##   ..   building_class_at_time_of_sale = col_character(),
##   ..   sale_price = col_double(),
##   ..   sale_date = col_double(),
##   ..   year = col_double()
##   .. )
##  - attr(*, "problems")=<externalptr>

# initial dataset clean
si_sales_clean <- si_sales_raw %>%

  mutate(
    sale_price = as.numeric(sale_price),
    residential_units = as.numeric(residential_units),
    year_built = as.numeric(year_built)
  ) %>%
  
  filter(!is.na(sale_price) & sale_price > 10000) %>%

  filter(grepl("FAMILY DWELLING", building_class_category)) %>%
  
  select(
    year,
    neighborhood,
    zip_code,
    residential_units,
    year_built,
    sale_price,
    land_square_feet
  )

head(si_sales_clean)

## # A tibble: 6 × 7
##    year neighborhood zip_code residential_units year_built sale_price
##   <dbl> <chr>           <dbl>             <dbl>      <dbl>      <dbl>
## 1  2017 ANNADALE        10312                 1       1980     866000
## 2  2017 ANNADALE        10312                 1       1975     735000
## 3  2017 ANNADALE        10312                 1       1920     350000
## 4  2017 ANNADALE        10312                 1       1986     475000
## 5  2017 ANNADALE        10312                 1       1986     437500
## 6  2017 ANNADALE        10312                 1       1986     354900
## # ℹ 1 more variable: land_square_feet <dbl>

str(si_sales_clean)

## tibble [40,690 × 7] (S3: tbl_df/tbl/data.frame)
##  $ year             : num [1:40690] 2017 2017 2017 2017 2017 ...
##  $ neighborhood     : chr [1:40690] "ANNADALE" "ANNADALE" "ANNADALE" "ANNADALE" ...
##  $ zip_code         : num [1:40690] 10312 10312 10312 10312 10312 ...
##  $ residential_units: num [1:40690] 1 1 1 1 1 1 1 1 1 1 ...
##  $ year_built       : num [1:40690] 1980 1975 1920 1986 1986 ...
##  $ sale_price       : num [1:40690] 866000 735000 350000 475000 437500 ...
##  $ land_square_feet : num [1:40690] 7258 6242 5000 1546 1546 ...

summary(si_sales_clean)

##       year      neighborhood          zip_code     residential_units
##  Min.   :2017   Length:40690       Min.   :    0   Min.   :0.000    
##  1st Qu.:2018   Class :character   1st Qu.:10305   1st Qu.:1.000    
##  Median :2021   Mode  :character   Median :10308   Median :1.000    
##  Mean   :2021                      Mean   :10306   Mean   :1.285    
##  3rd Qu.:2023                      3rd Qu.:10312   3rd Qu.:2.000    
##  Max.   :2025                      Max.   :10314   Max.   :4.000    
##                                                    NA's   :2        
##    year_built     sale_price       land_square_feet
##  Min.   :   0   Min.   :   11500   Min.   :    0   
##  1st Qu.:1950   1st Qu.:  480000   1st Qu.: 2400   
##  Median :1975   Median :  610000   Median : 3500   
##  Mean   :1965   Mean   :  651616   Mean   : 3977   
##  3rd Qu.:1995   3rd Qu.:  765000   3rd Qu.: 4700   
##  Max.   :2025   Max.   :17000000   Max.   :88945   
##  NA's   :42                        NA's   :2

# summary table of home sales by neighborhood and year
si_neighborhood_summary <- si_sales_clean %>%
  group_by(year, neighborhood) %>%
  summarise(
    avg_sale_price = mean(sale_price, na.rm = TRUE),
    min_sale_price = min(sale_price, na.rm = TRUE),
    max_sale_price = max(sale_price, na.rm = TRUE),
    number_of_sales = n(),
    .groups = 'drop' 
  )
head(si_neighborhood_summary)

## # A tibble: 6 × 6
##    year neighborhood         avg_sale_price min_sale_price max_sale_price
##   <dbl> <chr>                         <dbl>          <dbl>          <dbl>
## 1  2017 ANNADALE                    686084.          42000        1320000
## 2  2017 ARDEN HEIGHTS               423725.         157170        1075000
## 3  2017 ARROCHAR                    564824.         200000        1150000
## 4  2017 ARROCHAR-SHORE ACRES        495400          150000         755000
## 5  2017 BULLS HEAD                  500105.          70000         895000
## 6  2017 CASTLETON CORNERS           534650.         110000        1150000
## # ℹ 1 more variable: number_of_sales <int>

## some eda on that 
cat("--- Summary of the si_sales_clean dataset ---\n")

## --- Summary of the si_sales_clean dataset ---

summary(si_sales_clean)

##       year      neighborhood          zip_code     residential_units
##  Min.   :2017   Length:40690       Min.   :    0   Min.   :0.000    
##  1st Qu.:2018   Class :character   1st Qu.:10305   1st Qu.:1.000    
##  Median :2021   Mode  :character   Median :10308   Median :1.000    
##  Mean   :2021                      Mean   :10306   Mean   :1.285    
##  3rd Qu.:2023                      3rd Qu.:10312   3rd Qu.:2.000    
##  Max.   :2025                      Max.   :10314   Max.   :4.000    
##                                                    NA's   :2        
##    year_built     sale_price       land_square_feet
##  Min.   :   0   Min.   :   11500   Min.   :    0   
##  1st Qu.:1950   1st Qu.:  480000   1st Qu.: 2400   
##  Median :1975   Median :  610000   Median : 3500   
##  Mean   :1965   Mean   :  651616   Mean   : 3977   
##  3rd Qu.:1995   3rd Qu.:  765000   3rd Qu.: 4700   
##  Max.   :2025   Max.   :17000000   Max.   :88945   
##  NA's   :42                        NA's   :2

# Get the structure of the dataset to check data types.
cat("\n--- Structure of the si_sales_clean dataset ---\n")

## 
## --- Structure of the si_sales_clean dataset ---

str(si_sales_clean)

## tibble [40,690 × 7] (S3: tbl_df/tbl/data.frame)
##  $ year             : num [1:40690] 2017 2017 2017 2017 2017 ...
##  $ neighborhood     : chr [1:40690] "ANNADALE" "ANNADALE" "ANNADALE" "ANNADALE" ...
##  $ zip_code         : num [1:40690] 10312 10312 10312 10312 10312 ...
##  $ residential_units: num [1:40690] 1 1 1 1 1 1 1 1 1 1 ...
##  $ year_built       : num [1:40690] 1980 1975 1920 1986 1986 ...
##  $ sale_price       : num [1:40690] 866000 735000 350000 475000 437500 ...
##  $ land_square_feet : num [1:40690] 7258 6242 5000 1546 1546 ...

## Note for later: we got 2 NAs at residential_units and 27 at year_built

# Plot 1: Distribution of Sale Prices
# This histogram shows the frequency of sales at different price points.
ggplot(si_sales_clean, aes(x = sale_price)) +
  geom_histogram(bins = 50, fill = "purple", color = "black") +
  scale_x_log10(labels = scales::dollar_format()) +
  labs(
    title = "Distribution of Staten Island Home Sale Prices (2017-2025)",
    x = "Sale Price (Log Scale)",
    y = "Number of Sales"
  ) +
  theme_minimal()

# Plot 2: Median Sale Price Over Time
# This line chart shows how the overall market has changed year over year.
si_yearly_median <- si_sales_clean %>%
  group_by(year) %>%
  summarise(median_sale_price = median(sale_price, na.rm = TRUE))

ggplot(si_yearly_median, aes(x = year, y = median_sale_price, group = 1)) +
  geom_line(color = "navy", size = 1.5) +
  geom_point(color = "navy", size = 3) +
  scale_y_continuous(labels = scales::dollar_format()) +
  labs(
    title = "Median Home Sale Price in Staten Island (2017-2025)",
    x = "Year",
    y = "Median Sale Price"
  ) +
  theme_minimal()

## Warning: Using `size` aesthetic for lines was deprecated in ggplot2 3.4.0.
## ℹ Please use `linewidth` instead.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.

# Plot 3: Number of Sales Per Year
# This bar chart shows the volume of sales activity over time.
ggplot(si_neighborhood_summary, aes(x = year, y = number_of_sales)) +
  stat_summary(fun = "sum", geom = "bar", fill = "steelblue") +
  labs(
    title = "Total Number of Residential Sales in Staten Island (2017-2025)",
    x = "Year",
    y = "Total Sales"
  ) +
  theme_minimal()

Mapping of Staten Island sales

nta_map <- st_read("nynta2020_25c/nynta2020.shp")

## Reading layer `nynta2020' from data source 
##   `C:\Users\nicol\OneDrive\Έγγραφα\nynta2020_25c\nynta2020.shp' 
##   using driver `ESRI Shapefile'
## Simple feature collection with 262 features and 11 fields
## Geometry type: MULTIPOLYGON
## Dimension:     XY
## Bounding box:  xmin: 913175.1 ymin: 120128.4 xmax: 1067383 ymax: 272844.3
## Projected CRS: NAD83 / New York Long Island (ftUS)

si_map <- nta_map %>%
  filter(BoroName == "Staten Island")

si_sales_clean <- si_sales_clean %>%
  mutate(neighborhood_upper = toupper(trimws(neighborhood)))

si_map <- si_map %>%
  mutate(NTAName = if_else(NTAName == "Snug Harbor", "St. George-New Brighton", NTAName)) %>%
    mutate(NTAName = if_else(NTAName == "Great Kills Park", "Great Kills-Eltingville", NTAName))%>%
      mutate(NTAName = if_else(NTAName == "Miller Field", "New Dorp-Midland Beach", NTAName))
    

# df same as si_sales_clean but with the ntas 
si_sales_mapped <- si_sales_clean %>%
  mutate(
    nta_name = case_when(
      grepl("ARDEN HEIGHTS|ROSSVILLE|ROSSVILLE-CHARLESTON|ROSSVILLE-RICHMOND VALLEY", neighborhood_upper) ~ "Arden Heights-Rossville",
      grepl("ARROCHAR|ARROCHAR-SHORE ACRES|DONGAN HILLS|DONGAN HILLS-COLONY|DONGAN HILLS-OLD TOWN|GRASMERE|SOUTH BEACH", neighborhood_upper) ~ "Grasmere-Arrochar-South Beach-Dongan Hills",
      grepl("ANNADALE|HUGUENOT|PRINCES BAY|WOODROW|PLEASANT PLAINS", neighborhood_upper) ~ "Annadale-Huguenot-Prince's Bay-Woodrow",
      grepl("BULLS HEAD|NEW SPRINGVILLE|TRAVIS|WILLOWBROOK|BLOOMFIELD", neighborhood_upper) ~ "New Springville-Willowbrook-Bulls Head-Travis",
      grepl("CASTLETON CORNERS|WESTERLEIGH|CLOVE LAKES|SUNNYSIDE", neighborhood_upper) ~ "Westerleigh-Castleton Corners",
      grepl("CONCORD|CONCORD-FOX HILLS|STAPLETON|STAPLETON-CLIFTON|TOMPKINSVILLE", neighborhood_upper) ~ "Tompkinsville-Stapleton-Clifton-Fox Hills",
      grepl("ELTINGVILLE|GREAT KILLS|GREAT KILLS-BAY TERRACE", neighborhood_upper) ~ "Great Kills-Eltingville",
      grepl("GRYMES HILL|SILVER LAKE|WEST NEW BRIGHTON|LIVINGSTON", neighborhood_upper) ~ "West New Brighton-Silver Lake-Grymes Hill",
      grepl("EMERSON HILL|MANOR HEIGHTS|TODT HILL", neighborhood_upper) ~ "Todt Hill-Emerson Hill-Lighthouse Hill-Manor Heights",
      grepl("PORT RICHMOND", neighborhood_upper) ~ "Port Richmond",
      grepl("TOTTENVILLE", neighborhood_upper) ~ "Tottenville-Charleston",
      grepl("OAKWOOD|OAKWOOD-BEACH|RICHMONDTOWN|RICHMONDTOWN-LIGHTHS HILL", neighborhood_upper) ~ "Oakwood-Richmondtown",
      grepl("NEW DORP|MIDLAND BEACH|NEW DORP-BEACH|NEW DORP-HEIGHTS|GRANT CITY", neighborhood_upper) ~ "New Dorp-Midland Beach",
      grepl("ROSEBANK", neighborhood_upper) ~ "Rosebank-Shore Acres-Park Hill",
      grepl("MARINERS HARBOR|PORT IVORY", neighborhood_upper) ~ "Mariner's Harbor-Arlington-Graniteville",
      grepl("NEW BRIGHTON", neighborhood_upper) ~ "St. George-New Brighton",
      grepl("FRESH KILLS", neighborhood_upper) ~ "Freshkills Park (South)",
      TRUE ~ "Other" # Catches any names that don't fit the rules above
    )
  )

other_rows <- si_sales_mapped[si_sales_mapped$nta_name == "Other", ]
nrow(other_rows)

## [1] 0

si_sales_mapped$land_square_feet <- as.numeric(si_sales_mapped$land_square_feet)

si_sales_mapped <- si_sales_mapped %>%
  mutate(year_built = na_if(year_built, 0))
summary(si_sales_mapped)

##       year      neighborhood          zip_code     residential_units
##  Min.   :2017   Length:40690       Min.   :    0   Min.   :0.000    
##  1st Qu.:2018   Class :character   1st Qu.:10305   1st Qu.:1.000    
##  Median :2021   Mode  :character   Median :10308   Median :1.000    
##  Mean   :2021                      Mean   :10306   Mean   :1.285    
##  3rd Qu.:2023                      3rd Qu.:10312   3rd Qu.:2.000    
##  Max.   :2025                      Max.   :10314   Max.   :4.000    
##                                                    NA's   :2        
##    year_built     sale_price       land_square_feet neighborhood_upper
##  Min.   :1835   Min.   :   11500   Min.   :    0    Length:40690      
##  1st Qu.:1950   1st Qu.:  480000   1st Qu.: 2400    Class :character  
##  Median :1975   Median :  610000   Median : 3500    Mode  :character  
##  Mean   :1969   Mean   :  651616   Mean   : 3977                      
##  3rd Qu.:1995   3rd Qu.:  765000   3rd Qu.: 4700                      
##  Max.   :2025   Max.   :17000000   Max.   :88945                      
##  NA's   :134                       NA's   :2                          
##    nta_name        
##  Length:40690      
##  Class :character  
##  Mode  :character  
##                    
##                    
##                    
##

head(si_sales_mapped)

## # A tibble: 6 × 9
##    year neighborhood zip_code residential_units year_built sale_price
##   <dbl> <chr>           <dbl>             <dbl>      <dbl>      <dbl>
## 1  2017 ANNADALE        10312                 1       1980     866000
## 2  2017 ANNADALE        10312                 1       1975     735000
## 3  2017 ANNADALE        10312                 1       1920     350000
## 4  2017 ANNADALE        10312                 1       1986     475000
## 5  2017 ANNADALE        10312                 1       1986     437500
## 6  2017 ANNADALE        10312                 1       1986     354900
## # ℹ 3 more variables: land_square_feet <dbl>, neighborhood_upper <chr>,
## #   nta_name <chr>

str(si_sales_mapped)

## tibble [40,690 × 9] (S3: tbl_df/tbl/data.frame)
##  $ year              : num [1:40690] 2017 2017 2017 2017 2017 ...
##  $ neighborhood      : chr [1:40690] "ANNADALE" "ANNADALE" "ANNADALE" "ANNADALE" ...
##  $ zip_code          : num [1:40690] 10312 10312 10312 10312 10312 ...
##  $ residential_units : num [1:40690] 1 1 1 1 1 1 1 1 1 1 ...
##  $ year_built        : num [1:40690] 1980 1975 1920 1986 1986 ...
##  $ sale_price        : num [1:40690] 866000 735000 350000 475000 437500 ...
##  $ land_square_feet  : num [1:40690] 7258 6242 5000 1546 1546 ...
##  $ neighborhood_upper: chr [1:40690] "ANNADALE" "ANNADALE" "ANNADALE" "ANNADALE" ...
##  $ nta_name          : chr [1:40690] "Annadale-Huguenot-Prince's Bay-Woodrow" "Annadale-Huguenot-Prince's Bay-Woodrow" "Annadale-Huguenot-Prince's Bay-Woodrow" "Annadale-Huguenot-Prince's Bay-Woodrow" ...

neighborhood_prices <- si_sales_mapped %>% 
  filter(nta_name != "Other") %>% 
  group_by(nta_name) %>% 
  summarise(overall_avg_price = mean(sale_price, na.rm = TRUE)) 

si_map_prices <- left_join(si_map, neighborhood_prices, by = c("NTAName" = "nta_name"))

# overall avg price per neighborhood
get_clean_label <- function(name_vec) {
  sapply(name_vec, function(name) {
    if (is.na(name)) return(NA)
    strsplit(name, "-")[[1]][1] |> trimws()
  })
}

si_map_prices <- si_map_prices %>%
  mutate(
    label_name = get_clean_label(NTAName),
    centroid = st_centroid(geometry),
    label_color = if_else(overall_avg_price > 900000, "white", "black")
  )

 
heatmap<- ggplot(data = si_map_prices) +
    geom_sf(aes(fill = overall_avg_price), color = "white", size = 0.3) +
    
    geom_sf_text(
        aes(geometry = centroid, label = label_name, color = label_color),
        size = 2,         
        fontface = "bold",
        alpha = 0.9
    ) +
    
    scale_color_identity() +
    scale_fill_viridis_c(
        option = "mako",
        direction = -1,
        labels = scales::dollar_format(),
        name = "Avg. Sale Price"
    ) +
    
    labs(
        title = "Where Are Homes Most Expensive in Staten Island?",
        subtitle = "Southern neighborhoods consistently show higher prices (2017–2025)",
        caption = "Labels shortened for readability"
    ) +
    
    theme_void(base_size = 14) +
    theme(
        plot.title = element_text(face = "bold", size = 20, hjust = 0.5),
        plot.subtitle = element_text(size = 14, hjust = 0.5),
        plot.caption = element_text(size = 10, color = "gray40", hjust = 0.5),
        legend.position = "right"
    )

heatmap

num_vars <- sapply(si_sales_mapped, is.numeric)
num_data <- si_sales_mapped[ , num_vars]

n_num <- ncol(num_data)

par(mfrow = c(2, 3))  

for (colname in names(num_data)) {
  hist(num_data[[colname]],
       main = paste("Histogram of", colname),
       xlab = colname,
       col = "purple",
       breaks = 30)
}

# boxplots
par(mfrow = c(2, 3))  

for (colname in names(num_data)) {
  boxplot(num_data[[colname]],
          horizontal = TRUE,
          main = paste("Boxplot of", colname),
          xlab = colname,
          col = "orange")
}

hist(num_data$sale_price)
boxplot(num_data$sale_price)

heat_data <- si_sales_mapped %>%
  mutate(
    year_bin = cut(
      year_built,
      breaks = seq(1830, 2030, by = 10),
      labels = paste0(seq(1830, 2020, by = 10), "-", seq(1840, 2030, by = 10)) 
    )
  ) %>%
  group_by(nta_name, year_bin) %>%
  summarise(avg_price = mean(sale_price, na.rm = TRUE), .groups = "drop")%>%
  drop_na(year_bin)

nta_order <- heat_data %>%
  group_by(nta_name) %>%
  summarise(avg = mean(avg_price, na.rm = TRUE)) %>%
  arrange(desc(avg)) %>%
  pull(nta_name)

ggplot(heat_data, aes(x = year_bin, y = factor(nta_name, levels = nta_order), fill = avg_price)) +
  geom_tile(color = "white") +
  scale_fill_viridis_c(option = "plasma", labels = scales::dollar) +
  labs(
    x = "Decade Built",
    y = "Neighborhood (NTA)",
    fill = "Average Price",
    title = "Average Sale Price by NTA and Decade Built"
  ) +
  theme_minimal() +
  theme(
    axis.text.x = element_text(angle = 45,size = 8, hjust = 1),
    axis.text.y = element_text(size = 6)
  )

CPI

cpi_all_items_raw <- read_csv("https://raw.githubusercontent.com/NikoletaEm/Capstone/refs/heads/main/DataUsed/CPI_allcostumer.csv")

colnames(cpi_all_items_raw) <- c("date", "cpi_value")

cpi2025 <- read_csv("https://raw.githubusercontent.com/NikoletaEm/Capstone/refs/heads/main/DataUsed/cpi_2025.csv")
colnames(cpi2025) <- c("date", "cpi_value")

cpi_all_items_raw <- bind_rows(cpi_all_items_raw, cpi2025)

head(cpi_all_items_raw)

## # A tibble: 6 × 2
##   date       cpi_value
##   <date>         <dbl>
## 1 2017-01-01      267.
## 2 2017-02-01      268.
## 3 2017-03-01      268.
## 4 2017-04-01      268.
## 5 2017-05-01      268.
## 6 2017-06-01      269.

cpi_all_items <- cpi_all_items_raw %>%
  mutate(
    date = as.Date(date),
    year = as.numeric(format(date, "%Y"))
  ) %>%
  group_by(year) %>%
  summarise(all_items_annual_avg_cpi = mean(cpi_value, na.rm = TRUE)) %>%
  mutate(
    previous_year_cpi = lag(all_items_annual_avg_cpi),
    all_items_inflation_pct = ((all_items_annual_avg_cpi - previous_year_cpi) / previous_year_cpi) * 100,
    all_items_inflation_pct = ifelse(year == 2017, 2.1, all_items_inflation_pct)
  ) %>%
  select(year, all_items_annual_avg_cpi, all_items_inflation_pct)

cpi_vs_home <- si_yearly_median %>%
  left_join(cpi_all_items, by = "year")

p_home <- ggplot(cpi_vs_home, aes(x = year, y = median_sale_price)) +
  geom_line(color = "darkblue", linewidth = 1.2) +
  geom_point(color = "darkblue", size = 3) +
  scale_y_continuous(labels = scales::dollar_format()) +
  labs(
    title = "Staten Island Median Home Prices",
    x = "Year",
    y = "Median Sale Price (USD)"
  ) +
  theme_minimal(base_size = 15) +
  theme(
    plot.title = element_text(face = "bold", hjust = 0.5),
    axis.title = element_text(face = "bold")
  )
p_cpi <- ggplot(cpi_vs_home, aes(x = year, y = all_items_annual_avg_cpi)) +
  geom_line(color = "firebrick", linewidth = 1.2, linetype = "dashed") +
  geom_point(color = "firebrick", size = 3) +
  labs(
    title = "Consumer Price Index (All Items)",
    x = "Year",
    y = "CPI Index"
  ) +
  theme_minimal(base_size = 15) +
  theme(
    plot.title = element_text(face = "bold", hjust = 0.5),
    axis.title = element_text(face = "bold")
  )
p_home + p_cpi +
  plot_annotation(
    title = "Housing Prices and Inflation Trends (2017–2025)",
    theme = theme(
      plot.title = element_text(size = 18, face = "bold", hjust = 0.5)
    )
  )

## Warning: annotation$theme is not a valid theme.
## Please use `theme()` to construct themes.

mortgage_rates_raw <- read.csv("https://raw.githubusercontent.com/NikoletaEm/Capstone/refs/heads/main/DataUsed/mortgage_rt.csv")


colnames(mortgage_rates_raw) <- c("date", "mortgage_rt")



mort2025<- read.csv("https://raw.githubusercontent.com/NikoletaEm/Capstone/refs/heads/main/DataUsed/mortgage_rt_2025.csv")
colnames(mort2025) <- c("date", "mortgage_rt")
mortgage_rates_raw <- bind_rows(mortgage_rates_raw, mort2025)

head(mortgage_rates_raw)

##         date mortgage_rt
## 1 2017-01-05        4.20
## 2 2017-01-12        4.12
## 3 2017-01-19        4.09
## 4 2017-01-26        4.19
## 5 2017-02-02        4.19
## 6 2017-02-09        4.17

Mortgage rate

mortgage_rates_yearly <- mortgage_rates_raw %>%
  mutate(
    date = as.Date(date),
    year = year(date) 
  ) %>%
  group_by(year) %>%
  summarise(
    avg_mortgage_rate = mean(mortgage_rt, na.rm = TRUE))

print(mortgage_rates_yearly)

## # A tibble: 9 × 2
##    year avg_mortgage_rate
##   <dbl>             <dbl>
## 1  2017              3.99
## 2  2018              4.54
## 3  2019              3.94
## 4  2020              3.11
## 5  2021              2.96
## 6  2022              5.34
## 7  2023              6.81
## 8  2024              6.72
## 9  2025              5.87

economic_factors <- full_join(cpi_all_items, mortgage_rates_yearly, by = "year")


## Normalization to index for cpi
economic_factors <- economic_factors %>%
  arrange(year) %>%
  mutate(
    all_items_annual_avg_cpi =
      (all_items_annual_avg_cpi / first(all_items_annual_avg_cpi)) * 100 ) %>%
  select(-all_items_inflation_pct)



print(economic_factors)

## # A tibble: 9 × 3
##    year all_items_annual_avg_cpi avg_mortgage_rate
##   <dbl>                    <dbl>             <dbl>
## 1  2017                     100               3.99
## 2  2018                     102.              4.54
## 3  2019                     104.              3.94
## 4  2020                     105.              3.11
## 5  2021                     109.              2.96
## 6  2022                     116.              5.34
## 7  2023                     120.              6.81
## 8  2024                     124.              6.72
## 9  2025                     128.              5.87

comparison_data <- full_join(si_yearly_median, cpi_all_items, by = "year") %>%
  left_join(mortgage_rates_yearly, by = "year") %>%
  mutate(price_bar = 0.7)


ggplot(comparison_data, aes(x = as.factor(year))) +

  
  geom_col(aes(y = price_bar, fill = "Median Sale Price"),
           alpha = 0.6, width = 0.6) +

 
  geom_text(
    aes(y = price_bar,
        label = scales::dollar(median_sale_price)),
    vjust = -0.6,
    size = 3.5,
    fontface = "bold"
  ) +


  geom_line(aes(y = all_items_inflation_pct,
                color = "Overall Inflation (CPI)"),
            linewidth = 1.3, group = 1) +
  geom_point(aes(y = all_items_inflation_pct,
                 color = "Overall Inflation (CPI)"),
             size = 3) +


  geom_line(aes(y = avg_mortgage_rate,
                color = "30-Year Mortgage Rate"),
            linewidth = 1.3, linetype = "dashed", group = 1) +
  geom_point(aes(y = avg_mortgage_rate,
                 color = "30-Year Mortgage Rate"),
             size = 3) +

  scale_y_continuous(
    name = "Inflation & Mortgage Rates (%)",
    expand = expansion(mult = c(0.05, 0.25))
  ) +

  scale_fill_manual(
    values = c("Median Sale Price" = "#FF8C66"),
    name = ""
  ) +
  scale_color_manual(
    values = c(
      "Overall Inflation (CPI)" = "#1F78B4",
      "30-Year Mortgage Rate"  = "#33A02C"
    ),
    name = ""
  ) +

  labs(
    title = "Housing Prices (Labeled) with Inflation and Mortgage Rate Trends",
    subtitle = "Bars show annual median prices as labels-lines show macroeconomic rates",
    x = "Year"
  ) +

  theme_minimal(base_size = 14) +
  theme(
    legend.position = "bottom",
    plot.title = element_text(face = "bold")
  )

Population growth/decay of NYC and SI

ny_population <- read_csv("https://raw.githubusercontent.com/NikoletaEm/Capstone/refs/heads/main/DataUsed/nypopulation_fred.csv")

## Rows: 8 Columns: 2
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (1): observation_date
## dbl (1): NYPOP
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.

si_population <- read_csv("https://raw.githubusercontent.com/NikoletaEm/Capstone/refs/heads/main/DataUsed/si_population.csv")

## Rows: 9 Columns: 2
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## dbl (1): year
## num (1): population
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.

new_row <- tibble(
  year = 2025,
  population = 489345
)


si_population <- bind_rows(si_population, new_row)
## population in 2025 is estimated

print(ny_population)

## # A tibble: 8 × 2
##   observation_date  NYPOP
##   <chr>             <dbl>
## 1 1/1/2017         19594.
## 2 1/1/2018         19544.
## 3 1/1/2019         19463.
## 4 1/1/2020         20105.
## 5 1/1/2021         19848.
## 6 1/1/2022         19704.
## 7 1/1/2023         19737.
## 8 1/1/2024         19867.

print(si_population)

## # A tibble: 10 × 2
##     year population
##    <dbl>      <dbl>
##  1  2016     473324
##  2  2017     475948
##  3  2018     474101
##  4  2019     474893
##  5  2020     475596
##  6  2021     493194
##  7  2022     492925
##  8  2023     491734
##  9  2024     498212
## 10  2025     489345

Population of ntas

nta_pop <- data.frame(
  nta_name = c(
    "St. George-New Brighton",
    "Tompkinsville-Stapleton-Clifton-Fox Hills",
    "Rosebank-Shore Acres-Park Hill",
    "West New Brighton-Silver Lake-Grymes Hill",
    "Westerleigh-Castleton Corners",
    "Port Richmond",
    "Mariner's Harbor-Arlington-Graniteville",
    "Grasmere-Arrochar-South Beach-Dongan Hills",
    "New Dorp-Midland Beach",
    "Todt Hill-Emerson Hill-Lighthouse Hill-Manor Heights",
    "New Springville-Willowbrook-Bulls Head-Travis",
    "Oakwood-Richmondtown",
    "Great Kills-Eltingville",
    "Arden Heights-Rossville",
    "Annadale-Huguenot-Prince's Bay-Woodrow",
    "Tottenville-Charleston"
  ),
  population = c(
    20225,
    17596,
    22705,
    35491,
    31582,
    20896,
    32812,
    36672,
    29303,
    33699,
    42458,
    20418,
    57572,
    31491,
    42360,
    16845
  )
)

## SI
si_population <- si_population %>%
  rename(
    si_population = population
  )


si_population <- si_population %>%
  mutate(
    yoy_growth_percentage_si = ((si_population - lag(si_population)) / lag(si_population)) * 100
  ) %>%
  filter(year > 2016) 
print(si_population)

## # A tibble: 9 × 3
##    year si_population yoy_growth_percentage_si
##   <dbl>         <dbl>                    <dbl>
## 1  2017        475948                   0.554 
## 2  2018        474101                  -0.388 
## 3  2019        474893                   0.167 
## 4  2020        475596                   0.148 
## 5  2021        493194                   3.70  
## 6  2022        492925                  -0.0545
## 7  2023        491734                  -0.242 
## 8  2024        498212                   1.32  
## 9  2025        489345                  -1.78

Crime data

crime_data <- read.csv("https://raw.githubusercontent.com/NikoletaEm/Capstone/refs/heads/main/DataUsed/NYPD_Arrests_Data__Historic__20250927.csv")
crime_points <- st_as_sf(crime_data, 
                         coords = c("Longitude", "Latitude"), 
                         crs = 4326)  

crime_points <- st_transform(crime_points, st_crs(si_map))

crime_with_nta <- st_join(crime_points, si_map, join = st_within)

crime_clean <- crime_with_nta %>%
  mutate(
    year = year(mdy(ARREST_DATE))   
  ) %>%
  select(
    year,
    OFNS_DESC,    
    NTAName       
  )

head(crime_clean)

## Simple feature collection with 6 features and 3 fields
## Geometry type: POINT
## Dimension:     XY
## Bounding box:  xmin: 937288 ymin: 151474.4 xmax: 966060 ymax: 172421.8
## Projected CRS: NAD83 / New York Long Island (ftUS)
##   year                    OFNS_DESC                                 NTAName
## 1 2024     VEHICLE AND TRAFFIC LAWS          Rosebank-Shore Acres-Park Hill
## 2 2024                PETIT LARCENY                 Freshkills Park (North)
## 3 2024 ASSAULT 3 & RELATED OFFENSES                           Port Richmond
## 4 2024                      FORGERY                 St. George-New Brighton
## 5 2024      MISCELLANEOUS PENAL LAW Mariner's Harbor-Arlington-Graniteville
## 6 2024      MISCELLANEOUS PENAL LAW Mariner's Harbor-Arlington-Graniteville
##                    geometry
## 1     POINT (966060 158870)
## 2   POINT (937288 151474.4)
## 3 POINT (947631.5 172421.8)
## 4     POINT (961010 170904)
## 5 POINT (943691.2 170168.5)
## 6 POINT (943691.2 170168.5)

str(crime_clean)

## Classes 'sf' and 'data.frame':   73637 obs. of  4 variables:
##  $ year     : num  2024 2024 2024 2024 2024 ...
##  $ OFNS_DESC: chr  "VEHICLE AND TRAFFIC LAWS" "PETIT LARCENY" "ASSAULT 3 & RELATED OFFENSES" "FORGERY" ...
##  $ NTAName  : chr  "Rosebank-Shore Acres-Park Hill" "Freshkills Park (North)" "Port Richmond" "St. George-New Brighton" ...
##  $ geometry :sfc_POINT of length 73637; first list element:  'XY' num  966060 158870
##  - attr(*, "sf_column")= chr "geometry"
##  - attr(*, "agr")= Factor w/ 3 levels "constant","aggregate",..: NA NA NA
##   ..- attr(*, "names")= chr [1:3] "year" "OFNS_DESC" "NTAName"

crime_data_2025 <- read.csv("https://raw.githubusercontent.com/NikoletaEm/Capstone/refs/heads/main/DataUsed/crimedata_2025.csv")
crime_points_25 <- st_as_sf(crime_data_2025, 
                         coords = c("Longitude", "Latitude"), 
                         crs = 4326)  

crime_points_25 <- st_transform(crime_points_25, st_crs(si_map))

crime_with_nta_25 <- st_join(crime_points_25, si_map, join = st_within)

crime_clean_25 <- crime_with_nta_25 %>%
  mutate(
    year = year(mdy(CMPLNT_FR_DT))   
  ) %>%
  select(
    year,
    OFNS_DESC,    
    NTAName       
  )

head(crime_clean_25)

## Simple feature collection with 6 features and 3 fields
## Geometry type: POINT
## Dimension:     XY
## Bounding box:  xmin: 923772 ymin: -2280005 xmax: 31130730 ymax: 170765.5
## Projected CRS: NAD83 / New York Long Island (ftUS)
##   year                      OFNS_DESC
## 1 2025                    THEFT-FRAUD
## 2 2025              DANGEROUS WEAPONS
## 3 2025 CRIMINAL MISCHIEF & RELATED OF
## 4 2025                        ROBBERY
## 5 2025        MISCELLANEOUS PENAL LAW
## 6 2025                  HARRASSMENT 2
##                                                NTAName
## 1                                                 <NA>
## 2               Annadale-Huguenot-Prince's Bay-Woodrow
## 3              Mariner's Harbor-Arlington-Graniteville
## 4 Todt Hill-Emerson Hill-Lighthouse Hill-Manor Heights
## 5                              Arden Heights-Rossville
## 6               Annadale-Huguenot-Prince's Bay-Woodrow
##                    geometry
## 1 POINT (31130727 -2280005)
## 2     POINT (923772 129133)
## 3 POINT (939222.3 170765.5)
## 4   POINT (951842.8 159589)
## 5   POINT (927979 137152.2)
## 6 POINT (932667.8 129951.1)

crime_clean_25<-crime_clean_25 %>%
  drop_na()
crime_clean <- bind_rows(crime_clean, crime_clean_25)
crime_clean

## Simple feature collection with 85521 features and 3 fields
## Geometry type: POINT
## Dimension:     XY
## Bounding box:  xmin: 913511.3 ymin: 121134 xmax: 968884 ymax: 175494
## Projected CRS: NAD83 / New York Long Island (ftUS)
## First 10 features:
##    year                      OFNS_DESC                                 NTAName
## 1  2024       VEHICLE AND TRAFFIC LAWS          Rosebank-Shore Acres-Park Hill
## 2  2024                  PETIT LARCENY                 Freshkills Park (North)
## 3  2024   ASSAULT 3 & RELATED OFFENSES                           Port Richmond
## 4  2024                        FORGERY                 St. George-New Brighton
## 5  2024        MISCELLANEOUS PENAL LAW Mariner's Harbor-Arlington-Graniteville
## 6  2024        MISCELLANEOUS PENAL LAW Mariner's Harbor-Arlington-Graniteville
## 7  2024 GRAND LARCENY OF MOTOR VEHICLE          Rosebank-Shore Acres-Park Hill
## 8  2024                        ROBBERY Mariner's Harbor-Arlington-Graniteville
## 9  2024                 FELONY ASSAULT                 St. George-New Brighton
## 10 2024        MISCELLANEOUS PENAL LAW                 St. George-New Brighton
##                     geometry
## 1      POINT (966060 158870)
## 2    POINT (937288 151474.4)
## 3  POINT (947631.5 172421.8)
## 4      POINT (961010 170904)
## 5  POINT (943691.2 170168.5)
## 6  POINT (943691.2 170168.5)
## 7      POINT (964889 164799)
## 8  POINT (940598.5 167435.6)
## 9  POINT (962808.2 174278.5)
## 10 POINT (962808.2 174278.5)

Crime rate per nta

crime_counts_by_nta_year <- crime_clean %>%
  group_by(year, NTAName) %>%
  summarise(total_incidents = n(), .groups = "drop") %>%
  as.data.frame()  

si_map_counts <- si_map %>%
  left_join(crime_counts_by_nta_year, by = c("NTAName" = "NTAName"))%>%
  filter(!is.na(total_incidents))

## Warning in sf_column %in% names(g): Detected an unexpected many-to-many relationship between `x` and `y`.
## ℹ Row 1 of `x` matches multiple rows in `y`.
## ℹ Row 14 of `y` matches multiple rows in `x`.
## ℹ If a many-to-many relationship is expected, set `relationship =
##   "many-to-many"` to silence this warning.

crime_totals_by_nta <- crime_counts_by_nta_year %>%
  group_by(NTAName) %>%
  summarise(total_incidents_all_years = sum(total_incidents, na.rm = TRUE), .groups = "drop")

# crime rate per 1,000 residents
nta_pop <- nta_pop %>%
  rename(NTAName = nta_name)
nta_pop <- nta_pop %>%
  mutate(population = as.numeric(gsub(",", "", population)))
crime_with_pop <- crime_totals_by_nta %>%
  left_join(nta_pop, by = "NTAName") %>%
  mutate(crime_rate_per_1000 = (total_incidents_all_years / population) * 1000)


crime_with_pop <- crime_with_pop %>%
    drop_na()


crime_with_pop <- crime_with_pop %>%
  rename(nta_name = NTAName)

head(crime_with_pop)

## # A tibble: 6 × 4
##   nta_name                 total_incidents_all_…¹ population crime_rate_per_1000
##   <chr>                                     <int>      <dbl>               <dbl>
## 1 Annadale-Huguenot-Princ…                   2537      42360                59.9
## 2 Arden Heights-Rossville                    1494      31491                47.4
## 3 Grasmere-Arrochar-South…                   3909      36672               107. 
## 4 Great Kills-Eltingville                    2717      57572                47.2
## 5 Mariner's Harbor-Arling…                  12250      32812               373. 
## 6 New Dorp-Midland Beach                     6719      29303               229. 
## # ℹ abbreviated name: ¹total_incidents_all_years

#### Higher numbers = relatively more crime incidents per resident (higher crime density).
#### Lower numbers = relatively fewer incidents per resident (lower crime density).

ggplot(si_map_counts) +
  geom_sf(aes(fill = total_incidents), color = "white", size = 0.2) +
  
  scale_fill_viridis_c(option = "viridis") +
  facet_wrap(~year, ncol = 3) +
  labs(
    title = "Crime Incidents per NTA",
    fill = "Incidents"
  ) +
  theme_void(base_size = 19) +  
  theme(
     legend.position = "bottom",
    plot.title = element_text(hjust = 0.5, face = "bold", size = 12), 
    strip.text = element_text(size = 10),
    legend.text = element_text(size = 6),
    legend.title = element_text(size = 8, face = "bold")
  )

top_10_nta <- crime_counts_by_nta_year %>%
  group_by(NTAName) %>%
  summarise(total_incidents_all_years = sum(total_incidents)) %>%
  slice_max(order_by = total_incidents_all_years, n = 10)


top_10_data <- crime_counts_by_nta_year %>%
  filter(NTAName %in% top_10_nta$NTAName)

ggplot(top_10_data, aes(x = reorder(NTAName, total_incidents), y = total_incidents, fill = NTAName)) +
  geom_bar(stat = "identity") +
  coord_flip() +
  facet_wrap(~year, ncol = 3) +
  labs(
    title = "Top 10 Neighborhoods by Crime Incidents",
    x = "Neighborhood (NTA)",
    y = "Total Incidents"
  ) +
  theme(
    legend.position = "none",
    axis.text.y = element_text(size = 7.2) 
  )

Average house price per year and nta

# Average house price per year and nta
avg_nta_house_prices <- si_sales_mapped %>%
  dplyr::group_by(year, nta_name)  %>%
  summarise(
    avg_sale_price = mean(sale_price, na.rm = TRUE),
    .groups = "drop" )


print(avg_nta_house_prices)

## # A tibble: 146 × 3
##     year nta_name                                      avg_sale_price
##    <dbl> <chr>                                                  <dbl>
##  1  2017 Annadale-Huguenot-Prince's Bay-Woodrow               672129.
##  2  2017 Arden Heights-Rossville                              480176.
##  3  2017 Freshkills Park (South)                              328536 
##  4  2017 Grasmere-Arrochar-South Beach-Dongan Hills           584653.
##  5  2017 Great Kills-Eltingville                              534381.
##  6  2017 Mariner's Harbor-Arlington-Graniteville              349458.
##  7  2017 New Dorp-Midland Beach                               498886.
##  8  2017 New Springville-Willowbrook-Bulls Head-Travis        548967.
##  9  2017 Oakwood-Richmondtown                                 625325.
## 10  2017 Port Richmond                                        364958.
## # ℹ 136 more rows

si_map_prices2 <- si_map %>%
  left_join(avg_nta_house_prices, by = c("NTAName" = "nta_name"))

## Warning in sf_column %in% names(g): Detected an unexpected many-to-many relationship between `x` and `y`.
## ℹ Row 1 of `x` matches multiple rows in `y`.
## ℹ Row 12 of `y` matches multiple rows in `x`.
## ℹ If a many-to-many relationship is expected, set `relationship =
##   "many-to-many"` to silence this warning.

years <- 2017:2025


all_year_plots <- list()

for (yr in years) {
  
  map_prices_year <- si_map_prices2 %>% filter(year == yr)
  map_incidents_year <- si_map_counts %>% filter(year == yr)


  p1 <- ggplot(map_prices_year) +
    geom_sf(aes(fill = avg_sale_price), color = "white", size = 0.3) +
    scale_fill_viridis_c(
      option = "magma", 
      na.value = "grey80",
      labels = scales::label_dollar(scale = 1e-3, suffix = "K"),  
      breaks = c(400000, 600000, 800000, 1000000)         
    ) +
    labs(title = paste("Average House Price by NTA (", yr, ")", sep = ""),
         fill = "Avg Price") +
    theme_void() +
    theme(
      legend.position = "bottom",
      legend.key.height = unit(0.3, "cm"),
      legend.key.width = unit(1.2, "cm"),
      legend.title = element_text(size = 9),
      legend.text = element_text(size = 8)
    )


  p2 <- ggplot(map_incidents_year) +
    geom_sf(aes(fill = total_incidents), color = "white", size = 0.3) +
    scale_fill_viridis_c(
      option = "viridis",
      trans = "log",
      na.value = "grey80",
      labels = scales::label_comma(),                        
      breaks = c(1, 10, 50, 200, 500, 1000)          
    ) +
    labs(title = paste("Incidents by NTA (", yr, ")", sep = ""),
         fill = "Incidents") +
    theme_void() +
    theme(
      legend.position = "bottom",
      legend.key.height = unit(0.3, "cm"),
      legend.key.width = unit(1.2, "cm"),
      legend.title = element_text(size = 9),
      legend.text = element_text(size = 8)
    )


  combined <- p1 + p2


  all_year_plots[[as.character(yr)]] <- combined
}

big_combined_plot <- wrap_plots(all_year_plots, ncol = 2)

Schools

schools <- read_csv("https://raw.githubusercontent.com/NikoletaEm/Capstone/refs/heads/main/DataUsed/staten_island_schoolss.csv")

## Rows: 79 Columns: 3
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (2): School Name, Nta_name
## dbl (1): Ranking
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.

merged_data <- schools %>%
  left_join(neighborhood_prices, by = c("Nta_name" = "nta_name"))

ggplot(merged_data, aes(x = overall_avg_price, y = Ranking)) +
  geom_point() +
  geom_smooth(method = "lm", se = FALSE) +
  labs(x = "Average Housing Price", y = "School Ranking (1–10)",
       title = "Relationship between Housing Prices and School Rankings")

## `geom_smooth()` using formula = 'y ~ x'

Average school ranking per nta

avg_school_rankings_by_nta <- schools %>%
  group_by(Nta_name) %>%
  summarise(avg_school_ranking = mean(Ranking, na.rm = TRUE), .groups = "drop")

avg_school_rankings_by_nta <- avg_school_rankings_by_nta %>% 
  rename(nta_name = Nta_name)

avg_school_rankings_by_nta

## # A tibble: 16 × 2
##    nta_name                                             avg_school_ranking
##    <chr>                                                             <dbl>
##  1 Annadale-Huguenot-Prince's Bay-Woodrow                             9.17
##  2 Arden Heights-Rossville                                            8.33
##  3 Grasmere-Arrochar-South Beach-Dongan Hills                         6   
##  4 Great Kills-Eltingville                                            9.17
##  5 Mariner's Harbor-Arlington-Graniteville                            3.33
##  6 New Dorp-Midland Beach                                             6.5 
##  7 New Springville-Willowbrook-Bulls Head-Travis                      6.54
##  8 Oakwood-Richmondtown                                              10   
##  9 Port Richmond                                                      3.5 
## 10 Rosebank-Shore Acres-Park Hill                                     4.33
## 11 St. George-New Brighton                                            2.43
## 12 Todt Hill-Emerson Hill-Lighthouse Hill-Manor Heights               6   
## 13 Tompkinsville-Stapleton-Clifton-Fox Hills                          3.25
## 14 Tottenville-Charleston                                             8.67
## 15 West New Brighton-Silver Lake-Grymes Hill                          4.6 
## 16 Westerleigh-Castleton Corners                                      7

schools <- schools %>%
  rename(nta_name = Nta_name)

totalincidents_per_nta <- crime_clean %>%
  group_by(NTAName) %>%
  summarise(total_incidents = n())

crime_housingprice_schools <- merged_data %>%
  left_join(totalincidents_per_nta, by = c("Nta_name" = "NTAName"))

cor(crime_housingprice_schools$overall_avg_price, crime_housingprice_schools$Ranking, use = "complete.obs")

## [1] 0.5065811

cor(crime_housingprice_schools$overall_avg_price, crime_housingprice_schools$total_incidents, use = "complete.obs")

## [1] -0.5747773

crime2 <- ggplot(
  crime_housingprice_schools,
  aes(
    x = Ranking,
    y = overall_avg_price,
    color = total_incidents
  )
) +
  geom_point(size = 4, alpha = 0.85) +
  scale_color_viridis_c(option = "plasma") +
  labs(
    x = "School Ranking",
    y = "Average Housing Price",
    color = "Crime Incidents",
    title = "Joint Influence of School Quality and Crime on Housing Prices",
    subtitle = "Average housing prices by school ranking, colored by crime incidents"
  ) +
  theme_minimal(base_size = 16) +
  theme(
    plot.title = element_text(
      size = 15,
      face = "bold",
      hjust = 0.5
    ),
    plot.subtitle = element_text(
      size = 10,
      hjust = 0.5
    ),
    axis.title = element_text(
      size = 10,
      face = "bold"
    ),
    axis.text = element_text(
      size = 10
    ),
    legend.title = element_text(
      size = 10,
      face = "bold"
    ),
    legend.text = element_text(
      size = 10
    ),
    legend.position = "right"
  )

crime2

## Do neighborhoods with higher-ranked schools tend to have higher housing prices?

crime_housingprice_schools <- crime_housingprice_schools %>%
  sf::st_drop_geometry() %>%   
  mutate(
    rank_group = ifelse(Ranking >= 8, "High-Rank Schools", "Low-Rank Schools")
  )


price_by_rank <- crime_housingprice_schools %>%
  group_by(rank_group) %>%
  summarise(
    overall_avg_price = mean(overall_avg_price, na.rm = TRUE),
    n = n()
  )

# Plot 1: Boxplot
p1 <- ggplot(crime_housingprice_schools, 
             aes(x = rank_group, y = overall_avg_price, fill = rank_group)) +
  geom_boxplot(alpha = 0.75) +
  scale_fill_brewer(palette = "Set2") +
  labs(
    x = "School Rank Group",
    y = "Average Housing Price (USD)",
    title = "Do Higher-Ranked Schools Correspond to Higher Housing Prices?",
    subtitle = "Distribution of Housing Prices by School Rank"
  ) +
  theme_minimal() +
  theme(legend.position = "none")

# Plot 2: Bar Chart
p2 <- ggplot(price_by_rank, 
             aes(x = rank_group, y = overall_avg_price, fill = rank_group)) +
  geom_col(alpha = 0.75) +
  geom_text(aes(label = scales::dollar(overall_avg_price)),
            vjust = -0.5, size = 4.2) +
  scale_fill_brewer(palette = "Set2") +
  labs(
    x = "School Rank Group",
    y = "Mean Housing Price (USD)",
    title = "Do Higher-Ranked Schools Correspond to Higher Housing Prices?",
    subtitle ="Mean Housing Price by School Rank Group"
  ) +
  theme_minimal() +
  theme(legend.position = "none")


plt <- p1 + p2



plt

DOB Permits

# The original Department of Buildings (DOB) permits file is really large and
# unnecessary for reproducing the results of this project.
# To keep the repository lightweight and fully reproducible, the
# analysis directly loads a cleaned permits dataset from GitHub.
#
# The commented code below documents the full cleaning process used
# to generate the cleaned permits file, including variable selection,
# date parsing and filtering to the 2017–2025 study period.

# permits_raw <- read.csv("DOB_Permits_Staten_Island_20251006 (1).csv")

# permits_clean <- permits_raw %>%
#  rename(
#    permit_issued_date = Issuance.Date,  
#    lon = LONGITUDE,                   
#    lat = LATITUDE
#  ) %>%
#  select(permit_issued_date, lon, lat) %>%
#  filter(!is.na(lon) & !is.na(lat)) 

# permits_clean <- permits_clean %>%
#  mutate(
#    permit_issued_date = mdy(permit_issued_date),
#    year = year(permit_issued_date)
#  ) %>%
#  filter(year >= 2017, year <= 2025)

# write.csv(
#  permits_clean,
#  "permits_clean_staten_island_2017_2025.csv",
#  row.names = FALSE
# )
# - - - - - - - - - - - - - - - - - - - - - 

permits_clean <- read.csv(
  "https://raw.githubusercontent.com/NikoletaEm/Capstone/refs/heads/main/DataUsed/permits_clean_staten_island_2017_2025.csv"
)


permits_sf <- st_as_sf(
  permits_clean,
  coords = c("lon", "lat"),
  crs = 4326,       # WGS84 (latitude/longitude)
  remove = FALSE
)


si_map <- st_transform(si_map, 4326)


permits_with_nta <- st_join(permits_sf, si_map, join = st_within)

permits_summary <- permits_with_nta %>%
  st_drop_geometry() %>%
  group_by(year, NTAName) %>%
  rename(nta_name=NTAName)%>%
  summarise(number_of_permits = n(), .groups = "drop")

head(permits_summary)

## # A tibble: 6 × 3
##    year nta_name                                   number_of_permits
##   <int> <chr>                                                  <int>
## 1  2017 Annadale-Huguenot-Prince's Bay-Woodrow                   789
## 2  2017 Arden Heights-Rossville                                  175
## 3  2017 Freshkills Park (South)                                    2
## 4  2017 Grasmere-Arrochar-South Beach-Dongan Hills               409
## 5  2017 Great Kills-Eltingville                                  481
## 6  2017 Mariner's Harbor-Arlington-Graniteville                  237

summary(permits_summary)

##       year        nta_name         number_of_permits
##  Min.   :2017   Length:145         Min.   :  1.0    
##  1st Qu.:2019   Class :character   1st Qu.: 68.0    
##  Median :2021   Mode  :character   Median :152.0    
##  Mean   :2021                      Mean   :180.2    
##  3rd Qu.:2023                      3rd Qu.:249.0    
##  Max.   :2025                      Max.   :945.0

str(permits_summary)

## tibble [145 × 3] (S3: tbl_df/tbl/data.frame)
##  $ year             : int [1:145] 2017 2017 2017 2017 2017 2017 2017 2017 2017 2017 ...
##  $ nta_name         : chr [1:145] "Annadale-Huguenot-Prince's Bay-Woodrow" "Arden Heights-Rossville" "Freshkills Park (South)" "Grasmere-Arrochar-South Beach-Dongan Hills" ...
##  $ number_of_permits: int [1:145] 789 175 2 409 481 237 945 311 217 253 ...

Preprocessing

si_sales_mapped <- si_sales_mapped %>%
  drop_na()

character_cols <- names(si_sales_mapped)[sapply(si_sales_mapped, is.character)]

si_sales_mapped <- si_sales_mapped %>%
  mutate(across(all_of(character_cols), as.factor))

si_sales_with_features <- si_sales_mapped %>%
  left_join(crime_with_pop %>% select(nta_name, crime_rate_per_1000), by = "nta_name") %>%
  left_join(avg_school_rankings_by_nta, by = "nta_name") %>%
  left_join(permits_summary, by = c("nta_name", "year")) 

si_sales_with_features$building_age <- si_sales_with_features$year - si_sales_with_features$year_built

si_sales_with_features <- si_sales_with_features %>%
  mutate(
    land_square_feet_log = log1p(land_square_feet)
  ) %>%
  select(-neighborhood_upper) %>%
  select(-land_square_feet)  

si_sales_with_features <- si_sales_with_features %>%
  left_join(si_population, by = "year")

si_sales_with_features <- si_sales_with_features %>%
  select(-si_population)
  
si_sales_with_features <- si_sales_with_features %>%
  select(-year_built)

si_sales_with_features <- si_sales_with_features %>%
  select(-starts_with("zip_code"))
character_cols <- names(si_sales_with_features)[sapply(si_sales_with_features, is.character)]

cap_values <- quantile(si_sales_with_features$sale_price, c(0.01, 0.99))
si_sales_with_features <- si_sales_with_features %>%
  filter(sale_price >= cap_values[1], sale_price <= cap_values[2]) # removes only clearly bad records

si_sales_with_features <- si_sales_with_features %>%
  mutate(across(all_of(character_cols), as.factor))

si_sales_with_features<-si_sales_with_features %>%
  drop_na()

head(si_sales_with_features)

## # A tibble: 6 × 11
##    year neighborhood residential_units sale_price nta_name   crime_rate_per_1000
##   <dbl> <fct>                    <dbl>      <dbl> <fct>                    <dbl>
## 1  2017 ANNADALE                     1     866000 Annadale-…                59.9
## 2  2017 ANNADALE                     1     735000 Annadale-…                59.9
## 3  2017 ANNADALE                     1     350000 Annadale-…                59.9
## 4  2017 ANNADALE                     1     475000 Annadale-…                59.9
## 5  2017 ANNADALE                     1     437500 Annadale-…                59.9
## 6  2017 ANNADALE                     1     354900 Annadale-…                59.9
## # ℹ 5 more variables: avg_school_ranking <dbl>, number_of_permits <int>,
## #   building_age <dbl>, land_square_feet_log <dbl>,
## #   yoy_growth_percentage_si <dbl>

num_data <- si_sales_with_features %>% 
  select(where(is.numeric))


cat("Number of numeric columns:", ncol(num_data), "\n")

## Number of numeric columns: 9

corr_matrix <- cor(num_data, use = "pairwise.complete.obs")

max_cols <- min(10, ncol(corr_matrix))
round(corr_matrix[1:max_cols, 1:max_cols], 2)

##                           year residential_units sale_price crime_rate_per_1000
## year                      1.00              0.01       0.30               -0.01
## residential_units         0.01              1.00       0.26                0.12
## sale_price                0.30              0.26       1.00               -0.21
## crime_rate_per_1000      -0.01              0.12      -0.21                1.00
## avg_school_ranking        0.01             -0.08       0.27               -0.70
## number_of_permits        -0.70             -0.01      -0.09               -0.18
## building_age              0.09              0.00      -0.18                0.22
## land_square_feet_log      0.03              0.19       0.47               -0.05
## yoy_growth_percentage_si -0.09             -0.01      -0.03               -0.01
##                          avg_school_ranking number_of_permits building_age
## year                                   0.01             -0.70         0.09
## residential_units                     -0.08             -0.01         0.00
## sale_price                             0.27             -0.09        -0.18
## crime_rate_per_1000                   -0.70             -0.18         0.22
## avg_school_ranking                     1.00              0.27        -0.30
## number_of_permits                      0.27              1.00        -0.15
## building_age                          -0.30             -0.15         1.00
## land_square_feet_log                   0.06              0.06         0.29
## yoy_growth_percentage_si               0.02              0.02        -0.02
##                          land_square_feet_log yoy_growth_percentage_si
## year                                     0.03                    -0.09
## residential_units                        0.19                    -0.01
## sale_price                               0.47                    -0.03
## crime_rate_per_1000                     -0.05                    -0.01
## avg_school_ranking                       0.06                     0.02
## number_of_permits                        0.06                     0.02
## building_age                             0.29                    -0.02
## land_square_feet_log                     1.00                     0.00
## yoy_growth_percentage_si                 0.00                     1.00

## There’s no multicollinearity problem — all correlations are below ±0.9.

Models

set.seed(123)
idx <- sample(seq_len(nrow(si_sales_with_features)), size = 0.8 * nrow(si_sales_with_features))
train_data_wf <- si_sales_with_features[idx, ]
test_data_wf  <- si_sales_with_features[-idx, ]

# model matrices
train_features_wf <- model.matrix(~ . - sale_price - 1, data = train_data_wf)
test_features_wf  <- model.matrix(~ . - sale_price - 1, data = test_data_wf)

train_target_wf <- train_data_wf$sale_price
test_target_wf  <- test_data_wf$sale_price

dtrain_wf <- xgb.DMatrix(data = as.matrix(train_features_wf), label = train_target_wf)
dtest_wf  <- xgb.DMatrix(data = as.matrix(test_features_wf),  label = test_target_wf)

train_df <- data.frame(train_target_wf, train_features_wf)


lm_model <- lm(train_target_wf ~ ., data = train_df)

summary(lm_model)

## 
## Call:
## lm(formula = train_target_wf ~ ., data = train_df)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -866770  -80932   -1936   75374 1644934 
## 
## Coefficients: (21 not defined because of singularities)
##                                                                Estimate
## (Intercept)                                                  -5.795e+07
## year                                                          2.839e+04
## neighborhoodANNADALE                                          9.008e+04
## neighborhoodARDEN.HEIGHTS                                    -2.542e+04
## neighborhoodARROCHAR                                          5.515e+04
## neighborhoodARROCHAR.SHORE.ACRES                              2.380e+04
## neighborhoodBLOOMFIELD                                        9.526e+04
## neighborhoodBULLS.HEAD                                       -5.395e+04
## neighborhoodCASTLETON.CORNERS                                 1.957e+04
## neighborhoodCLOVE.LAKES                                       4.651e+04
## neighborhoodCONCORD                                          -9.979e+04
## neighborhoodCONCORD.FOX.HILLS                                -9.283e+04
## neighborhoodDONGAN.HILLS                                      6.633e+04
## neighborhoodDONGAN.HILLS.COLONY                               1.683e+05
## neighborhoodDONGAN.HILLS.OLD.TOWN                            -9.819e+04
## neighborhoodELTINGVILLE                                      -6.988e+03
## neighborhoodEMERSON.HILL                                      2.108e+05
## neighborhoodFRESH.KILLS                                              NA
## neighborhoodGRANT.CITY                                        2.829e+04
## neighborhoodGRASMERE                                          5.955e+04
## neighborhoodGREAT.KILLS                                      -2.648e+03
## neighborhoodGREAT.KILLS.BAY.TERRACE                           1.044e+04
## neighborhoodGRYMES.HILL                                       2.179e+04
## neighborhoodHUGUENOT                                          1.019e+05
## neighborhoodLIVINGSTON                                       -3.344e+04
## neighborhoodMANOR.HEIGHTS                                     2.693e+04
## neighborhoodMARINERS.HARBOR                                  -1.635e+05
## neighborhoodMIDLAND.BEACH                                    -4.028e+04
## neighborhoodNEW.BRIGHTON                                     -9.029e+04
## neighborhoodNEW.BRIGHTON.ST..GEORGE                           2.548e+05
## neighborhoodNEW.DORP                                          5.092e+04
## neighborhoodNEW.DORP.BEACH                                   -8.538e+04
## neighborhoodNEW.DORP.HEIGHTS                                  5.618e+04
## neighborhoodNEW.SPRINGVILLE                                   2.087e+04
## neighborhoodOAKWOOD                                           1.936e+04
## neighborhoodOAKWOOD.BEACH                                    -5.770e+03
## neighborhoodPLEASANT.PLAINS                                   8.839e+04
## neighborhoodPORT.IVORY                                       -1.987e+05
## neighborhoodPORT.RICHMOND                                    -1.143e+05
## neighborhoodPRINCES.BAY                                       5.975e+04
## neighborhoodRICHMONDTOWN                                      1.295e+05
## neighborhoodRICHMONDTOWN.LIGHTHS.HILL                         1.962e+05
## neighborhoodROSEBANK                                         -9.302e+03
## neighborhoodROSSVILLE                                         1.737e+04
## neighborhoodROSSVILLE.CHARLESTON                              5.084e+04
## neighborhoodROSSVILLE.PORT.MOBIL                                     NA
## neighborhoodROSSVILLE.RICHMOND.VALLEY                         1.532e+05
## neighborhoodSILVER.LAKE                                       9.783e+04
## neighborhoodSOUTH.BEACH                                      -1.053e+04
## neighborhoodSTAPLETON                                        -8.632e+04
## neighborhoodSTAPLETON.CLIFTON                                -9.103e+04
## neighborhoodSUNNYSIDE                                         1.811e+04
## neighborhoodTODT.HILL                                         3.672e+05
## neighborhoodTOMPKINSVILLE                                    -5.432e+04
## neighborhoodTOTTENVILLE                                       4.954e+04
## neighborhoodTRAVIS                                           -5.661e+04
## neighborhoodWEST.NEW.BRIGHTON                                -6.901e+04
## neighborhoodWESTERLEIGH                                       1.465e+04
## neighborhoodWILLOWBROOK                                       2.919e+04
## neighborhoodWOODROW                                                  NA
## residential_units                                             9.852e+04
## nta_nameArden.Heights.Rossville                                      NA
## nta_nameFreshkills.Park..South.                                      NA
## nta_nameGrasmere.Arrochar.South.Beach.Dongan.Hills                   NA
## nta_nameGreat.Kills.Eltingville                                      NA
## nta_nameMariner.s.Harbor.Arlington.Graniteville                      NA
## nta_nameNew.Dorp.Midland.Beach                                       NA
## nta_nameNew.Springville.Willowbrook.Bulls.Head.Travis                NA
## nta_nameOakwood.Richmondtown                                         NA
## nta_namePort.Richmond                                                NA
## nta_nameRosebank.Shore.Acres.Park.Hill                               NA
## nta_nameSt..George.New.Brighton                                      NA
## nta_nameTodt.Hill.Emerson.Hill.Lighthouse.Hill.Manor.Heights         NA
## nta_nameTompkinsville.Stapleton.Clifton.Fox.Hills                    NA
## nta_nameTottenville.Charleston                                       NA
## nta_nameWest.New.Brighton.Silver.Lake.Grymes.Hill                    NA
## nta_nameWesterleigh.Castleton.Corners                                NA
## crime_rate_per_1000                                                  NA
## avg_school_ranking                                                   NA
## number_of_permits                                             1.644e+00
## building_age                                                 -2.131e+03
## land_square_feet_log                                          1.489e+05
## yoy_growth_percentage_si                                     -1.960e+03
##                                                              Std. Error t value
## (Intercept)                                                   1.342e+06 -43.167
## year                                                          6.625e+02  42.855
## neighborhoodANNADALE                                          8.334e+03  10.809
## neighborhoodARDEN.HEIGHTS                                     8.072e+03  -3.149
## neighborhoodARROCHAR                                          1.523e+04   3.622
## neighborhoodARROCHAR.SHORE.ACRES                              1.821e+04   1.307
## neighborhoodBLOOMFIELD                                        8.175e+04   1.165
## neighborhoodBULLS.HEAD                                        8.076e+03  -6.681
## neighborhoodCASTLETON.CORNERS                                 8.892e+03   2.200
## neighborhoodCLOVE.LAKES                                       1.063e+04   4.377
## neighborhoodCONCORD                                           1.035e+04  -9.640
## neighborhoodCONCORD.FOX.HILLS                                 1.424e+04  -6.521
## neighborhoodDONGAN.HILLS                                      1.015e+04   6.537
## neighborhoodDONGAN.HILLS.COLONY                               1.162e+04  14.486
## neighborhoodDONGAN.HILLS.OLD.TOWN                             3.889e+04  -2.525
## neighborhoodELTINGVILLE                                       7.229e+03  -0.967
## neighborhoodEMERSON.HILL                                      1.686e+04  12.507
## neighborhoodFRESH.KILLS                                              NA      NA
## neighborhoodGRANT.CITY                                        9.843e+03   2.874
## neighborhoodGRASMERE                                          1.072e+04   5.556
## neighborhoodGREAT.KILLS                                       6.706e+03  -0.395
## neighborhoodGREAT.KILLS.BAY.TERRACE                           1.192e+04   0.876
## neighborhoodGRYMES.HILL                                       1.325e+04   1.645
## neighborhoodHUGUENOT                                          8.729e+03  11.670
## neighborhoodLIVINGSTON                                        1.420e+04  -2.356
## neighborhoodMANOR.HEIGHTS                                     8.973e+03   3.001
## neighborhoodMARINERS.HARBOR                                   8.268e+03 -19.778
## neighborhoodMIDLAND.BEACH                                     7.668e+03  -5.254
## neighborhoodNEW.BRIGHTON                                      9.342e+03  -9.665
## neighborhoodNEW.BRIGHTON.ST..GEORGE                           5.475e+04   4.655
## neighborhoodNEW.DORP                                          8.470e+03   6.011
## neighborhoodNEW.DORP.BEACH                                    9.809e+03  -8.705
## neighborhoodNEW.DORP.HEIGHTS                                  9.858e+03   5.699
## neighborhoodNEW.SPRINGVILLE                                   8.054e+03   2.591
## neighborhoodOAKWOOD                                           1.197e+04   1.617
## neighborhoodOAKWOOD.BEACH                                     1.131e+04  -0.510
## neighborhoodPLEASANT.PLAINS                                   1.123e+04   7.874
## neighborhoodPORT.IVORY                                        1.417e+04 -14.021
## neighborhoodPORT.RICHMOND                                     8.700e+03 -13.133
## neighborhoodPRINCES.BAY                                       8.602e+03   6.946
## neighborhoodRICHMONDTOWN                                      1.074e+04  12.057
## neighborhoodRICHMONDTOWN.LIGHTHS.HILL                         2.498e+04   7.853
## neighborhoodROSEBANK                                          8.855e+03  -1.050
## neighborhoodROSSVILLE                                         9.580e+03   1.813
## neighborhoodROSSVILLE.CHARLESTON                              1.428e+04   3.561
## neighborhoodROSSVILLE.PORT.MOBIL                                     NA      NA
## neighborhoodROSSVILLE.RICHMOND.VALLEY                         2.061e+04   7.432
## neighborhoodSILVER.LAKE                                       1.267e+04   7.723
## neighborhoodSOUTH.BEACH                                       8.115e+03  -1.298
## neighborhoodSTAPLETON                                         1.029e+04  -8.387
## neighborhoodSTAPLETON.CLIFTON                                 1.508e+04  -6.039
## neighborhoodSUNNYSIDE                                         1.239e+04   1.462
## neighborhoodTODT.HILL                                         1.607e+04  22.845
## neighborhoodTOMPKINSVILLE                                     1.353e+04  -4.015
## neighborhoodTOTTENVILLE                                       8.036e+03   6.164
## neighborhoodTRAVIS                                            1.128e+04  -5.017
## neighborhoodWEST.NEW.BRIGHTON                                 8.148e+03  -8.470
## neighborhoodWESTERLEIGH                                       8.468e+03   1.730
## neighborhoodWILLOWBROOK                                       8.805e+03   3.316
## neighborhoodWOODROW                                                  NA      NA
## residential_units                                             2.023e+03  48.700
## nta_nameArden.Heights.Rossville                                      NA      NA
## nta_nameFreshkills.Park..South.                                      NA      NA
## nta_nameGrasmere.Arrochar.South.Beach.Dongan.Hills                   NA      NA
## nta_nameGreat.Kills.Eltingville                                      NA      NA
## nta_nameMariner.s.Harbor.Arlington.Graniteville                      NA      NA
## nta_nameNew.Dorp.Midland.Beach                                       NA      NA
## nta_nameNew.Springville.Willowbrook.Bulls.Head.Travis                NA      NA
## nta_nameOakwood.Richmondtown                                         NA      NA
## nta_namePort.Richmond                                                NA      NA
## nta_nameRosebank.Shore.Acres.Park.Hill                               NA      NA
## nta_nameSt..George.New.Brighton                                      NA      NA
## nta_nameTodt.Hill.Emerson.Hill.Lighthouse.Hill.Manor.Heights         NA      NA
## nta_nameTompkinsville.Stapleton.Clifton.Fox.Hills                    NA      NA
## nta_nameTottenville.Charleston                                       NA      NA
## nta_nameWest.New.Brighton.Silver.Lake.Grymes.Hill                    NA      NA
## nta_nameWesterleigh.Castleton.Corners                                NA      NA
## crime_rate_per_1000                                                  NA      NA
## avg_school_ranking                                                   NA      NA
## number_of_permits                                             1.079e+01   0.152
## building_age                                                  3.342e+01 -63.743
## land_square_feet_log                                          1.619e+03  91.931
## yoy_growth_percentage_si                                      6.284e+02  -3.120
##                                                              Pr(>|t|)    
## (Intercept)                                                   < 2e-16 ***
## year                                                          < 2e-16 ***
## neighborhoodANNADALE                                          < 2e-16 ***
## neighborhoodARDEN.HEIGHTS                                    0.001641 ** 
## neighborhoodARROCHAR                                         0.000293 ***
## neighborhoodARROCHAR.SHORE.ACRES                             0.191128    
## neighborhoodBLOOMFIELD                                       0.243898    
## neighborhoodBULLS.HEAD                                       2.41e-11 ***
## neighborhoodCASTLETON.CORNERS                                0.027784 *  
## neighborhoodCLOVE.LAKES                                      1.21e-05 ***
## neighborhoodCONCORD                                           < 2e-16 ***
## neighborhoodCONCORD.FOX.HILLS                                7.08e-11 ***
## neighborhoodDONGAN.HILLS                                     6.37e-11 ***
## neighborhoodDONGAN.HILLS.COLONY                               < 2e-16 ***
## neighborhoodDONGAN.HILLS.OLD.TOWN                            0.011582 *  
## neighborhoodELTINGVILLE                                      0.333675    
## neighborhoodEMERSON.HILL                                      < 2e-16 ***
## neighborhoodFRESH.KILLS                                            NA    
## neighborhoodGRANT.CITY                                       0.004056 ** 
## neighborhoodGRASMERE                                         2.78e-08 ***
## neighborhoodGREAT.KILLS                                      0.692945    
## neighborhoodGREAT.KILLS.BAY.TERRACE                          0.381215    
## neighborhoodGRYMES.HILL                                      0.099995 .  
## neighborhoodHUGUENOT                                          < 2e-16 ***
## neighborhoodLIVINGSTON                                       0.018496 *  
## neighborhoodMANOR.HEIGHTS                                    0.002693 ** 
## neighborhoodMARINERS.HARBOR                                   < 2e-16 ***
## neighborhoodMIDLAND.BEACH                                    1.50e-07 ***
## neighborhoodNEW.BRIGHTON                                      < 2e-16 ***
## neighborhoodNEW.BRIGHTON.ST..GEORGE                          3.26e-06 ***
## neighborhoodNEW.DORP                                         1.86e-09 ***
## neighborhoodNEW.DORP.BEACH                                    < 2e-16 ***
## neighborhoodNEW.DORP.HEIGHTS                                 1.21e-08 ***
## neighborhoodNEW.SPRINGVILLE                                  0.009578 ** 
## neighborhoodOAKWOOD                                          0.105843    
## neighborhoodOAKWOOD.BEACH                                    0.610070    
## neighborhoodPLEASANT.PLAINS                                  3.55e-15 ***
## neighborhoodPORT.IVORY                                        < 2e-16 ***
## neighborhoodPORT.RICHMOND                                     < 2e-16 ***
## neighborhoodPRINCES.BAY                                      3.82e-12 ***
## neighborhoodRICHMONDTOWN                                      < 2e-16 ***
## neighborhoodRICHMONDTOWN.LIGHTHS.HILL                        4.18e-15 ***
## neighborhoodROSEBANK                                         0.293547    
## neighborhoodROSSVILLE                                        0.069815 .  
## neighborhoodROSSVILLE.CHARLESTON                             0.000370 ***
## neighborhoodROSSVILLE.PORT.MOBIL                                   NA    
## neighborhoodROSSVILLE.RICHMOND.VALLEY                        1.10e-13 ***
## neighborhoodSILVER.LAKE                                      1.17e-14 ***
## neighborhoodSOUTH.BEACH                                      0.194407    
## neighborhoodSTAPLETON                                         < 2e-16 ***
## neighborhoodSTAPLETON.CLIFTON                                1.57e-09 ***
## neighborhoodSUNNYSIDE                                        0.143728    
## neighborhoodTODT.HILL                                         < 2e-16 ***
## neighborhoodTOMPKINSVILLE                                    5.96e-05 ***
## neighborhoodTOTTENVILLE                                      7.17e-10 ***
## neighborhoodTRAVIS                                           5.27e-07 ***
## neighborhoodWEST.NEW.BRIGHTON                                 < 2e-16 ***
## neighborhoodWESTERLEIGH                                      0.083680 .  
## neighborhoodWILLOWBROOK                                      0.000915 ***
## neighborhoodWOODROW                                                NA    
## residential_units                                             < 2e-16 ***
## nta_nameArden.Heights.Rossville                                    NA    
## nta_nameFreshkills.Park..South.                                    NA    
## nta_nameGrasmere.Arrochar.South.Beach.Dongan.Hills                 NA    
## nta_nameGreat.Kills.Eltingville                                    NA    
## nta_nameMariner.s.Harbor.Arlington.Graniteville                    NA    
## nta_nameNew.Dorp.Midland.Beach                                     NA    
## nta_nameNew.Springville.Willowbrook.Bulls.Head.Travis              NA    
## nta_nameOakwood.Richmondtown                                       NA    
## nta_namePort.Richmond                                              NA    
## nta_nameRosebank.Shore.Acres.Park.Hill                             NA    
## nta_nameSt..George.New.Brighton                                    NA    
## nta_nameTodt.Hill.Emerson.Hill.Lighthouse.Hill.Manor.Heights       NA    
## nta_nameTompkinsville.Stapleton.Clifton.Fox.Hills                  NA    
## nta_nameTottenville.Charleston                                     NA    
## nta_nameWest.New.Brighton.Silver.Lake.Grymes.Hill                  NA    
## nta_nameWesterleigh.Castleton.Corners                              NA    
## crime_rate_per_1000                                                NA    
## avg_school_ranking                                                 NA    
## number_of_permits                                            0.878934    
## building_age                                                  < 2e-16 ***
## land_square_feet_log                                          < 2e-16 ***
## yoy_growth_percentage_si                                     0.001813 ** 
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 162900 on 31730 degrees of freedom
## Multiple R-squared:  0.5233, Adjusted R-squared:  0.5224 
## F-statistic:   571 on 61 and 31730 DF,  p-value: < 2.2e-16

test_df <- data.frame(test_features_wf)


predsln <- predict(lm_model, newdata = test_df)

## Warning in predict.lm(lm_model, newdata = test_df): prediction from
## rank-deficient fit; attr(*, "non-estim") has doubtful cases

rmse_val_lin <- sqrt(mean((test_target_wf - predsln)^2))
mae_val_lin  <- mean(abs(test_target_wf - predsln))
r2_val_lin   <- 1 - sum((test_target_wf - predsln)^2) / sum((test_target_wf - mean(test_target_wf))^2)

lr_model <- data.frame(
  Model = "Linear_Regression",
  RMSE = rmse_val_lin,
  MAE  = mae_val_lin,
  R2   = r2_val_lin
)


mean_price_lin <- mean(test_target_wf)

# percentage error
pct_error_lin <- (test_target_wf - predsln) / test_target_wf
MAPE_lin <- mean(abs(pct_error_lin), na.rm = TRUE) * 100

# normalized RMSE and MAE 
RMSE_pct_lin <- rmse_val_lin / mean_price_lin
MAE_pct_lin  <- mae_val_lin / mean_price_lin

normalized_lin <- data.frame(
  RMSE_as_pct_of_avg_price = RMSE_pct_lin,
  MAE_as_pct_of_avg_price  = MAE_pct_lin,
  MAPE_percent             = MAPE_lin
)

print(lr_model)

##               Model     RMSE    MAE        R2
## 1 Linear_Regression 161079.3 112535 0.5404037

print(normalized_lin)

##   RMSE_as_pct_of_avg_price MAE_as_pct_of_avg_price MAPE_percent
## 1                0.2512699               0.1755449      21.2104

params <- list(
  objective = "reg:squarederror",
  eta = 0.02,             
  max_depth = 7,           
  subsample = 0.9,
  colsample_bytree = 0.85,
  min_child_weight = 2,    
  gamma = 0.1,
  lambda = 1.2,
  alpha = 0.2
)

model_wf <- xgb.train(
  params = params,
  data = dtrain_wf,
  nrounds = 6000,
  watchlist = list(train = dtrain_wf, test = dtest_wf),
  early_stopping_rounds = 120,
  print_every_n = 50,
  verbose = 1
)

## [1]  train-rmse:668039.921953    test-rmse:670772.637998 
## Multiple eval metrics are present. Will use test_rmse for early stopping.
## Will train until test_rmse hasn't improved in 120 rounds.
## 
## [51] train-rmse:282212.813243    test-rmse:284739.851548 
## [101]    train-rmse:167432.250826    test-rmse:171806.694432 
## [151]    train-rmse:141833.423118    test-rmse:148321.012310 
## [201]    train-rmse:135835.331342    test-rmse:144018.196184 
## [251]    train-rmse:133653.192904    test-rmse:143038.662974 
## [301]    train-rmse:132247.342682    test-rmse:142665.030593 
## [351]    train-rmse:131074.183760    test-rmse:142421.555985 
## [401]    train-rmse:130122.670667    test-rmse:142275.827925 
## [451]    train-rmse:129201.682523    test-rmse:142184.150786 
## [501]    train-rmse:128280.948434    test-rmse:142068.929157 
## [551]    train-rmse:127334.459796    test-rmse:141950.369875 
## [601]    train-rmse:126406.723868    test-rmse:141860.721966 
## [651]    train-rmse:125536.840913    test-rmse:141807.047152 
## [701]    train-rmse:124644.164614    test-rmse:141782.136912 
## [751]    train-rmse:123880.619362    test-rmse:141756.845628 
## [801]    train-rmse:123045.399304    test-rmse:141754.578410 
## [851]    train-rmse:122231.052345    test-rmse:141706.063893 
## [901]    train-rmse:121400.287204    test-rmse:141661.174172 
## [951]    train-rmse:120707.192075    test-rmse:141698.676292 
## [1001]   train-rmse:119924.615465    test-rmse:141658.236588 
## [1051]   train-rmse:119206.479108    test-rmse:141655.300503 
## [1101]   train-rmse:118536.855677    test-rmse:141661.072831 
## Stopping. Best iteration:
## [1026]   train-rmse:119543.596830    test-rmse:141652.272101

preds_wf <- predict(model_wf, newdata = test_features_wf)

rmse_val_wf <- rmse(test_target_wf, preds_wf)
mae_val_wf  <- mae(test_target_wf, preds_wf)
r2_val_wf   <- 1 - sum((test_target_wf - preds_wf)^2) / sum((test_target_wf - mean(test_target_wf))^2)

xgboost_metrics <- data.frame(
  Model = "XGBoost",
  RMSE = rmse_val_wf,
  MAE  = mae_val_wf,
  R2   = r2_val_wf
)




mean_price_xgb <- mean(test_target_wf)


pct_error_xgb <- (test_target_wf - preds_wf) / test_target_wf
MAPE_xgb <- mean(abs(pct_error_xgb), na.rm = TRUE) * 100


RMSE_pct_xgb <- rmse_val_wf / mean_price_xgb
MAE_pct_xgb  <- mae_val_wf / mean_price_xgb

normalized_xgb <- data.frame(
  RMSE_as_pct_of_avg_price = RMSE_pct_xgb,
  MAE_as_pct_of_avg_price  = MAE_pct_xgb,
  MAPE_percent             = MAPE_xgb
)

print(normalized_xgb)

##   RMSE_as_pct_of_avg_price MAE_as_pct_of_avg_price MAPE_percent
## 1                0.2209653               0.1522951     18.84078

print(xgboost_metrics)

##     Model     RMSE      MAE        R2
## 1 XGBoost 141652.3 97630.48 0.6445782

train_scaled <- scale(train_features_wf)
test_scaled  <- scale(test_features_wf, center = attr(train_scaled, "scaled:center"),
                                       scale  = attr(train_scaled, "scaled:scale"))

train_scaled[is.na(train_scaled)] <- median(train_scaled, na.rm = TRUE)
test_scaled[is.na(test_scaled)]   <- median(test_scaled, na.rm = TRUE)

set.seed(123)
preds_wf1 <- knn.reg(
  train = train_scaled,
  test  = test_scaled,
  y     = train_target_wf,
  k     = 20   
)$pred


rmse_val_knn <- sqrt(mean((test_target_wf - preds_wf1)^2))
mae_val_knn <- mean(abs(test_target_wf - preds_wf1))
r2_val_knn   <- 1 - sum((test_target_wf - preds_wf1)^2) /
                     sum((test_target_wf - mean(test_target_wf))^2)


knn_metrics <- data.frame(
  Model = "kNN",
  RMSE = rmse_val_knn,
  MAE  = mae_val_knn,
  R2   = r2_val_knn
)

print(knn_metrics)

##   Model     RMSE      MAE       R2
## 1   kNN 156144.6 109437.1 0.568132

mean_price_knn <- mean(test_target_wf)

pct_error_knn <- (test_target_wf - preds_wf1) / test_target_wf
MAPE_knn <- mean(abs(pct_error_knn), na.rm = TRUE) * 100


RMSE_pct_knn <- rmse_val_knn / mean_price_knn
MAE_pct_knn  <- mae_val_knn / mean_price_knn

normalized_knn <- data.frame(
  RMSE_as_pct_of_avg_price = RMSE_pct_knn,
  MAE_as_pct_of_avg_price  = MAE_pct_knn,
  MAPE_percent             = MAPE_knn
)

print(normalized_knn)

##   RMSE_as_pct_of_avg_price MAE_as_pct_of_avg_price MAPE_percent
## 1                0.2435722               0.1707124     20.48197

metrics_table <- bind_rows(
  lr_model,
  knn_metrics,
  xgboost_metrics
)

metrics_table %>%
  kbl(
    caption = "Table 2: Predictive Performance Metrics Across Models",
    digits = 3,
    align = "c",
    booktabs = TRUE
  ) %>%
  kable_styling(
    latex_options = c("striped", "hold_position"),
    full_width = FALSE,
    position = "center"
  ) %>%
  row_spec(
    which(metrics_table$Model == "XGBoost_with_features .2"),
    bold = TRUE
  )

Table 2: Predictive Performance Metrics Across Models
Model	RMSE	MAE	R2
Linear_Regression	161079.3	112535.03	0.540
kNN	156144.6	109437.07	0.568
XGBoost	141652.3	97630.48	0.645

normalized_all <- bind_rows(
  cbind(Model = "Linear Regression", normalized_lin),
  cbind(Model = "kNN Regression", normalized_knn),
  cbind(Model = "XGBoost", normalized_xgb)
)

normalized_all %>%
  kbl(
    caption = "Table 3: Normalised Error Metrics Across Models",
    digits = 3,
    align = "c",
    booktabs = TRUE
  ) %>%
  kable_styling(
    latex_options = c("striped", "hold_position"),
    full_width = FALSE,
    position = "center"
  ) %>%
  row_spec(
    which(normalized_all$Model == "XGBoost"),
    bold = TRUE
  )

Table 3: Normalised Error Metrics Across Models
Model	RMSE_as_pct_of_avg_price	MAE_as_pct_of_avg_price	MAPE_percent
Linear Regression	0.251	0.176	21.210
kNN Regression	0.244	0.171	20.482
XGBoost	0.221	0.152	18.841

residuals_df <- bind_rows(
  data.frame(
    Model = "Linear Regression",
    Actual = test_target_wf,
    Predicted = predsln,
    Residual = test_target_wf - predsln,
    Index = seq_along(test_target_wf)
  ),
  data.frame(
    Model = "kNN",
    Actual = test_target_wf,
    Predicted = preds_wf1,
    Residual = test_target_wf - preds_wf1,
    Index = seq_along(test_target_wf)
  ),
  data.frame(
    Model = "XGBoost",
    Actual = test_target_wf,
    Predicted = preds_wf,
    Residual = test_target_wf - preds_wf,
    Index = seq_along(test_target_wf)
  )
)

Residuals vs Fitted

ggplot(residuals_df, aes(x = Predicted, y = Residual)) +
  geom_point(alpha = 0.5) +
  geom_hline(yintercept = 0, linetype = "dashed") +
  facet_wrap(~Model, scales = "free") +
  labs(
    title = "Residuals vs Fitted Values",
    x = "Predicted Sale Price",
    y = "Residuals"
  ) +
  theme_minimal()

Histogram of Residuals

ggplot(residuals_df, aes(x = Residual)) +
  geom_histogram(bins = 40, alpha = 0.7) +
  facet_wrap(~Model, scales = "free") +
  labs(
    title = "Distribution of Residuals",
    x = "Residual",
    y = "Frequency"
  ) +
  theme_minimal()

Q–Q Plot of Residuals

ggplot(residuals_df, aes(sample = Residual)) +
  stat_qq() +
  stat_qq_line() +
  facet_wrap(~Model, scales = "free") +
  labs(
    title = "Q–Q Plots of Residuals",
    x = "Theoretical Quantiles",
    y = "Sample Quantiles"
  ) +
  theme_minimal()

Actual vs Predicted

ggplot(residuals_df, aes(x = Actual, y = Predicted)) +
  geom_point(alpha = 0.6) +
  geom_abline(slope = 1, intercept = 0, linetype = "dashed") +
  facet_wrap(~Model, scales = "free") +
  labs(
    title = "Actual vs Predicted Sale Prices",
    x = "Actual Sale Price",
    y = "Predicted Sale Price"
  ) +
  theme_minimal()

Pilot plan:Forecasting

si_sales_with_features_econ <- si_sales_with_features %>%
  left_join(economic_factors, by = "year")

set.seed(123)
idx <- sample(seq_len(nrow(si_sales_with_features_econ)), size = 0.8 * nrow(si_sales_with_features_econ))
train_data_wf <- si_sales_with_features_econ[idx, ]
test_data_wf  <- si_sales_with_features_econ[-idx, ]


train_features_wf <- model.matrix(~ . - sale_price - 1, data = train_data_wf)
test_features_wf  <- model.matrix(~ . - sale_price - 1, data = test_data_wf)

train_target_wf <- train_data_wf$sale_price
test_target_wf  <- test_data_wf$sale_price

dtrain_wf <- xgb.DMatrix(data = as.matrix(train_features_wf), label = train_target_wf)
dtest_wf  <- xgb.DMatrix(data = as.matrix(test_features_wf),  label = test_target_wf)


params <- list(
  objective = "reg:squarederror",
  eta = 0.02,             
  max_depth = 7,           
  subsample = 0.9,
  colsample_bytree = 0.85,
  min_child_weight = 2,    
  gamma = 0.1,
  lambda = 1.2,
  alpha = 0.2
)

model_econ <- xgb.train(
  params = params,
  data = dtrain_wf,
  nrounds = 6000,
  watchlist = list(train = dtrain_wf, test = dtest_wf),
  early_stopping_rounds = 120,
  print_every_n = 50,
  verbose = 1
)

## [1]  train-rmse:668039.921953    test-rmse:670772.637998 
## Multiple eval metrics are present. Will use test_rmse for early stopping.
## Will train until test_rmse hasn't improved in 120 rounds.
## 
## [51] train-rmse:281523.310598    test-rmse:284047.781693 
## [101]    train-rmse:167316.643948    test-rmse:171832.538989 
## [151]    train-rmse:141866.303479    test-rmse:148407.532598 
## [201]    train-rmse:135998.588114    test-rmse:144097.716280 
## [251]    train-rmse:133876.547191    test-rmse:143091.479699 
## [301]    train-rmse:132446.799127    test-rmse:142682.447261 
## [351]    train-rmse:131227.990787    test-rmse:142410.340236 
## [401]    train-rmse:130185.187325    test-rmse:142261.349864 
## [451]    train-rmse:129198.104811    test-rmse:142105.480675 
## [501]    train-rmse:128218.938997    test-rmse:142005.130696 
## [551]    train-rmse:127292.590057    test-rmse:141914.348203 
## [601]    train-rmse:126401.781493    test-rmse:141848.464033 
## [651]    train-rmse:125553.466553    test-rmse:141762.158493 
## [701]    train-rmse:124701.897431    test-rmse:141717.997321 
## [751]    train-rmse:123850.279845    test-rmse:141653.966366 
## [801]    train-rmse:123045.249232    test-rmse:141582.146880 
## [851]    train-rmse:122243.799860    test-rmse:141564.444292 
## [901]    train-rmse:121357.777711    test-rmse:141530.880372 
## [951]    train-rmse:120568.095962    test-rmse:141528.707121 
## [1001]   train-rmse:119882.551178    test-rmse:141510.443205 
## [1051]   train-rmse:119068.445800    test-rmse:141567.719868 
## Stopping. Best iteration:
## [941]    train-rmse:120737.167013    test-rmse:141494.720605

preds_wf <- predict(model_econ, newdata = test_features_wf)

rmse_val_wf <- rmse(test_target_wf, preds_wf)
mae_val_wf  <- mae(test_target_wf, preds_wf)
r2_val_wf   <- 1 - sum((test_target_wf - preds_wf)^2) / sum((test_target_wf - mean(test_target_wf))^2)

econ <- data.frame(
  Model = "XGBoost_with_features_Econ",
  RMSE = rmse_val_wf,
  MAE  = mae_val_wf,
  R2   = r2_val_wf
)

print(econ)

##                        Model     RMSE      MAE        R2
## 1 XGBoost_with_features_Econ 141494.7 97640.86 0.6453684

Importance

importance_df <- xgb.importance(
  feature_names = model_econ$feature_names,
  model = model_econ
)


importance_df_clean <- importance_df %>%
  select(Feature, Gain, Cover, Frequency) %>%
  arrange(desc(Gain)) %>%
  mutate(
    Rank = row_number(),
    Gain_Percent = round(100 * Gain / sum(Gain), 2)
  )

#  top drivers
head(importance_df_clean, 15)

##                                                          Feature        Gain
##                                                           <char>       <num>
##  1:                                         land_square_feet_log 0.425993622
##  2:                                                 building_age 0.230422127
##  3:                                                         year 0.106835234
##  4:                                            residential_units 0.042525457
##  5:                                          crime_rate_per_1000 0.039539506
##  6:                                           avg_school_ranking 0.033437261
##  7:                                     all_items_annual_avg_cpi 0.020154534
##  8:                                            number_of_permits 0.015654791
##  9:                                     yoy_growth_percentage_si 0.006382437
## 10: nta_nameTodt Hill-Emerson Hill-Lighthouse Hill-Manor Heights 0.005355159
## 11:                                        neighborhoodTODT HILL 0.004661509
## 12:                                            avg_mortgage_rate 0.004121506
## 13:                              neighborhoodDONGAN HILLS-COLONY 0.003349855
## 14:                                    neighborhoodARDEN HEIGHTS 0.002972110
## 15:                                neighborhoodWEST NEW BRIGHTON 0.002396801
##           Cover   Frequency  Rank Gain_Percent
##           <num>       <num> <int>        <num>
##  1: 0.342785259 0.301562917     1        42.60
##  2: 0.125050356 0.227606871     2        23.04
##  3: 0.044291870 0.082792682     3        10.68
##  4: 0.034306910 0.051089979     4         4.25
##  5: 0.016519567 0.025989360     5         3.95
##  6: 0.009377774 0.017482828     6         3.34
##  7: 0.007392482 0.006462425     7         2.02
##  8: 0.025366703 0.053375316     8         1.57
##  9: 0.005677096 0.043446796     9         0.64
## 10: 0.005371268 0.003262953    10         0.54
## 11: 0.007761550 0.002805886    11         0.47
## 12: 0.004118376 0.027855719    12         0.41
## 13: 0.012140370 0.003326435    13         0.33
## 14: 0.005224645 0.001840966    14         0.30
## 15: 0.009030806 0.003770806    15         0.24

importance_top15 <- importance_df_clean[1:15, ]
ggplot(
  importance_top15,
  aes(x = reorder(Feature, Gain_Percent), y = Gain_Percent)
) +
  geom_col(
    fill = "#1a9850",   
    width = 0.75
  ) +
  coord_flip() +
  labs(
    title = "Top 15 Feature Importances",
    subtitle = "Gain-based relative contribution to predictive accuracy",
    x = "Feature",
    y = "Relative Importance (%)"
  ) +
  theme_minimal(base_size = 13) +
  theme(
    plot.title = element_text(face = "bold", size = 16, hjust = 0.5),
    plot.subtitle = element_text(size = 12, hjust = 0.5),
    axis.title.x = element_text(face = "bold", size = 13),
    axis.title.y = element_text(face = "bold", size = 13),
    axis.text = element_text(size = 11),
    panel.grid.major.y = element_blank(),
    panel.grid.minor = element_blank(),
    plot.margin = margin(15, 20, 15, 20)
  )

ggsave(
  filename = "Figure_5_5_Feature_Importance_XGBoost_Thesis.png",
  width = 10,
  height = 7,
  dpi = 300,
  bg = "white"
)

crime_school_importance <- importance_df_clean %>%
  filter(grepl("crime|school|education", Feature, ignore.case = TRUE))

crime_school_importance

##                Feature       Gain       Cover  Frequency  Rank Gain_Percent
##                 <char>      <num>       <num>      <num> <int>        <num>
## 1: crime_rate_per_1000 0.03953951 0.016519567 0.02598936     5         3.95
## 2:  avg_school_ranking 0.03343726 0.009377774 0.01748283     6         3.34

kable(
  crime_school_importance,
  caption = "Importance of Crime and School Quality Variables",
  align = "c"
) %>%
  kable_styling(full_width = FALSE)

Importance of Crime and School Quality Variables
Feature	Gain	Cover	Frequency	Rank	Gain_Percent
crime_rate_per_1000	0.0395395	0.0165196	0.0259894	5	3.95
avg_school_ranking	0.0334373	0.0093778	0.0174828	6	3.34

## Now its time to  prepare the data we’ll feed into the prediction model
feature_template <- si_sales_with_features %>%
  select(-sale_price)


future_features <- feature_template %>%
  filter(year == 2025) %>%          
  select(-year) %>%
  distinct() %>%
  crossing(year = 2026:2030)

# Blueprint: 2017-2021 historical data 
blueprint <- economic_factors %>%
  filter(year >= 2017 & year <= 2021) %>%
  select(year, avg_mortgage_rate, all_items_annual_avg_cpi)

# Calculate year-to-year percentage changes 
blueprint_trends <- blueprint %>%
  arrange(year) %>%
  mutate(
    cpi_change = c(0, diff(all_items_annual_avg_cpi) / head(all_items_annual_avg_cpi, -1)),
    avg_mortgage_rate_change = c(0, diff(avg_mortgage_rate) / head(avg_mortgage_rate, -1))
  )

#  Compute average annual changes 
avg_changes <- blueprint_trends %>%
  summarise(
    mortgage_rate_change = mean(avg_mortgage_rate_change[-1], na.rm = TRUE),
    cpi_change = mean(cpi_change[-1], na.rm = TRUE)
  )
# These averages will help us project the future.

latest_values <- tail(economic_factors, 1) %>%
  select(avg_mortgage_rate, all_items_annual_avg_cpi)

# Project future years (CPI relative to 2017 = 100) 
future_years <- data.frame(year = 2026:2030)


future_years$avg_mortgage_rate[1] <- latest_values$avg_mortgage_rate * (1 + avg_changes$mortgage_rate_change)
future_years$all_items_annual_avg_cpi[1] <- latest_values$all_items_annual_avg_cpi * (1 + avg_changes$cpi_change)


for (i in 2:nrow(future_years)) {
  future_years$avg_mortgage_rate[i] <- future_years$avg_mortgage_rate[i-1] * (1 + avg_changes$mortgage_rate_change)
  future_years$all_items_annual_avg_cpi[i] <- future_years$all_items_annual_avg_cpi[i-1] * (1 + avg_changes$cpi_change)
}

econ_full <- bind_rows(blueprint, future_years)


feature_template <- si_sales_with_features %>% select(-sale_price)

future_features <- feature_template %>%
  filter(year == 2025) %>%    
  select(-year) %>%
  distinct() %>%
  crossing(year = 2026:2030) %>%
  left_join(future_years, by = "year")

econ_full <- bind_rows(blueprint, future_years)


feature_template <- si_sales_with_features %>%
  select(-sale_price)

future_features <- feature_template %>%
  filter(year == 2025) %>%
  select(-year) %>%
  distinct() %>%
  crossing(year = 2026:2030)

future_features <- future_features %>%
  left_join(future_years, by = "year")
view(future_features)

model_features <- model_econ$feature_names

future_features_matrix <- model.matrix(~ . - 1, data = future_features)


extra_cols <- setdiff(colnames(future_features_matrix), model_features)
if (length(extra_cols) > 0) {
  future_features_matrix <- future_features_matrix[, !(colnames(future_features_matrix) %in% extra_cols), drop = FALSE]
}


missing_cols <- setdiff(model_features, colnames(future_features_matrix))
for (col in missing_cols) {
  future_features_matrix <- cbind(future_features_matrix, setNames(data.frame(0), col))
}

future_features_matrix <- future_features_matrix[, model_features, drop = FALSE]

future_matrix_clean <- as.matrix(future_features_matrix)

predictions <- predict(model_econ, newdata = future_matrix_clean)
future_forecast <- future_features %>%
  mutate(predicted_sale_price = predictions)


future_forecast_summary_nta <- future_forecast %>%
  group_by(year, nta_name) %>%
  summarise(mean_price = mean(predicted_sale_price, na.rm = TRUE)) %>%
  arrange(nta_name, year)

## `summarise()` has grouped output by 'year'. You can override using the
## `.groups` argument.

future_forecast_summary_neighborhood <- future_forecast %>%
  group_by(year, neighborhood) %>%
  summarise(mean_price = mean(predicted_sale_price, na.rm = TRUE)) %>%
  arrange(neighborhood, year)

## `summarise()` has grouped output by 'year'. You can override using the
## `.groups` argument.

future_forecast_summary_nta

## # A tibble: 80 × 3
## # Groups:   year [5]
##     year nta_name                               mean_price
##    <int> <fct>                                       <dbl>
##  1  2026 Annadale-Huguenot-Prince's Bay-Woodrow    914210.
##  2  2027 Annadale-Huguenot-Prince's Bay-Woodrow    914210.
##  3  2028 Annadale-Huguenot-Prince's Bay-Woodrow    913703.
##  4  2029 Annadale-Huguenot-Prince's Bay-Woodrow    913703.
##  5  2030 Annadale-Huguenot-Prince's Bay-Woodrow    912697.
##  6  2026 Arden Heights-Rossville                   777333.
##  7  2027 Arden Heights-Rossville                   777333.
##  8  2028 Arden Heights-Rossville                   776933.
##  9  2029 Arden Heights-Rossville                   776933.
## 10  2030 Arden Heights-Rossville                   776482.
## # ℹ 70 more rows

future_forecast_summary_neighborhood

## # A tibble: 275 × 3
## # Groups:   year [5]
##     year neighborhood  mean_price
##    <int> <fct>              <dbl>
##  1  2026 ANNADALE         962672.
##  2  2027 ANNADALE         962672.
##  3  2028 ANNADALE         962307.
##  4  2029 ANNADALE         962307.
##  5  2030 ANNADALE         960634.
##  6  2026 ARDEN HEIGHTS    655246.
##  7  2027 ARDEN HEIGHTS    655246.
##  8  2028 ARDEN HEIGHTS    654850.
##  9  2029 ARDEN HEIGHTS    654850.
## 10  2030 ARDEN HEIGHTS    654606.
## # ℹ 265 more rows

NTA Increase %

nta_baseline_2025 <- si_sales_with_features_econ %>%
  filter(year == 2025) %>%
  group_by(nta_name) %>%
  summarise(
    baseline_price_2025 = mean(sale_price, na.rm = TRUE),
    .groups = "drop"
  )

nta_forecast_with_baseline <- future_forecast_summary_nta %>%
  left_join(nta_baseline_2025, by = "nta_name") %>%
  mutate(
    pct_change_from_2025 =
      (mean_price - baseline_price_2025) / baseline_price_2025 * 100
  )

nta_growth_2030 <- nta_forecast_with_baseline %>%
  filter(year == 2030) %>%
  select(nta_name, pct_change_from_2025)

## Adding missing grouping variables: `year`

nta_growth_bands <- nta_growth_2030 %>%
  mutate(
    trajectory_band = case_when(
      pct_change_from_2025 >= quantile(pct_change_from_2025, 0.75, na.rm = TRUE) ~
        "High Relative Growth",
      pct_change_from_2025 >= quantile(pct_change_from_2025, 0.50, na.rm = TRUE) ~
        "Moderate Growth",
      pct_change_from_2025 >= quantile(pct_change_from_2025, 0.25, na.rm = TRUE) ~
        "Low Growth",
      TRUE ~
        "Relative Deceleration"
    )
  )

nta_positive_growth <- nta_growth_2030 %>%
  filter(pct_change_from_2025 > 0) %>%
  arrange(desc(pct_change_from_2025))

ggplot(
  nta_positive_growth,
  aes(
    x = reorder(nta_name, pct_change_from_2025),
    y = pct_change_from_2025
  )
) +
  geom_col(width = 0.7) +
  coord_flip() +
  labs(
    title = "NTAs with Forecasted Positive Relative Price Growth (2025–2030)",
    subtitle = "Percentage increase relative to observed 2025 prices",
    x = "Neighborhood Tabulation Area (NTA)",
    y = "Percent Change from 2025"
  ) +
  theme_minimal(base_size = 12)

ggsave(
    filename = "nta_groeth.png",
    width = 10,
    height = 6,
    units = "in",
    dpi = 300
)

hist_units <- si_sales_with_features %>%
  filter(residential_units %in% c(1, 2)) %>%   
  group_by(year, neighborhood, residential_units) %>%
  summarise(
    mean_price = mean(sale_price, na.rm = TRUE),
    .groups = "drop"
  ) %>%
  mutate(source = "Observed")


future_units <- future_forecast %>%
  filter(residential_units %in% c(1, 2)) %>%   
  group_by(year, neighborhood, residential_units) %>%
  summarise(
    mean_price = mean(predicted_sale_price, na.rm = TRUE),
    .groups = "drop"
  ) %>%
  mutate(source = "Forecast")


units_panel <- bind_rows(hist_units, future_units) %>%
  filter(!is.na(neighborhood)) %>%
  mutate(
    unit_group = ifelse(residential_units == 1,
                        "1 unit (single-family)",
                        "2 units")
  )

set.seed(261)

random_neighborhoods <- units_panel %>%
  distinct(neighborhood) %>%
  slice_sample(n = 6) %>%  
  pull(neighborhood)

random_neighborhoods

## [1] GRYMES HILL             ARROCHAR                ROSSVILLE              
## [4] NEW BRIGHTON-ST. GEORGE WEST NEW BRIGHTON       WOODROW                
## 58 Levels: ANNADALE ARDEN HEIGHTS ARROCHAR ARROCHAR-SHORE ACRES ... WOODROW

set.seed(261) 
neighborhood_units_trends <- units_panel %>%
  filter(neighborhood %in% random_neighborhoods)


combined_plot <- ggplot(
  neighborhood_units_trends,
  aes(x = year,
      y = mean_price,
      color = unit_group,
      group = interaction(unit_group, neighborhood))
) +
  geom_line(size = 1.1) +   
  facet_wrap(~ neighborhood, scales = "free_y") +
  
  scale_y_continuous(labels = scales::dollar_format()) +
  
  scale_color_manual(
    name = "Home Type",
    values = c(
      "1 unit (single-family)" = "#1B9E77",  
      "2 units"                = "#D95F02"    
    )
  ) +
  
  labs(
    title = "Observed and Forecasted Price Trends for 1-Unit and 2-Unit Homes",
    subtitle = "Observed: 2017–2025   |   Forecast: 2026–2030",
    x = "Year",
    y = "Average Sale Price"
  ) +
  
  theme_minimal(base_size = 13) +
  theme(
    plot.title = element_text(face = "bold", hjust = 0.5, size = 15),
    plot.subtitle = element_text(hjust = 0.5, size = 11),
    strip.text = element_text(face = "bold"),
    legend.position = "bottom",
    panel.grid.minor = element_blank(),
    panel.spacing = unit(1, "lines")
  )
combined_plot

## Idea-Experimenting 
# Average house prices per year per NTA
avg_price_per_nta <- si_sales_with_features %>%
  group_by(year, nta_name) %>%
  summarise(
    mean_price = mean(sale_price, na.rm = TRUE),
    n_sales = n(),
    .groups = "drop"
  )

# Average house prices per year per Neighborhood
avg_price_per_neighborhood <- si_sales_with_features %>%
  group_by(year, neighborhood) %>%
  summarise(
    mean_price = mean(sale_price, na.rm = TRUE),
    n_sales = n(),
    .groups = "drop"
  )

combined_nta <- bind_rows(future_forecast_summary_nta, avg_price_per_nta) %>%
  arrange(nta_name, year, mean_price)%>%
  select(-n_sales)

combined_neighborhood <- bind_rows(future_forecast_summary_neighborhood, avg_price_per_neighborhood) %>%
  arrange(neighborhood, year, mean_price)%>%
  select(-n_sales)

trend_table_simple <- neighborhood_units_trends %>%
  filter(year >= 2025) %>%                       # keep only 2025+
  arrange(neighborhood, unit_group, year) %>%
  mutate(mean_price = round(mean_price, 0)) %>%
  select(neighborhood, year, unit_group, mean_price) %>%
  tidyr::pivot_wider(
    names_from = unit_group,
    values_from = mean_price
  ) %>%
  arrange(neighborhood, year)

trend_table_simple

## # A tibble: 36 × 4
##    neighborhood  year `1 unit (single-family)` `2 units`
##    <fct>        <dbl>                    <dbl>     <dbl>
##  1 ARROCHAR      2025                   901500    940000
##  2 ARROCHAR      2026                   880934    878298
##  3 ARROCHAR      2027                   880934    878298
##  4 ARROCHAR      2028                   880744    878150
##  5 ARROCHAR      2029                   880744    878150
##  6 ARROCHAR      2030                   882387    880149
##  7 GRYMES HILL   2025                   877743    931756
##  8 GRYMES HILL   2026                   853299    869962
##  9 GRYMES HILL   2027                   853299    869962
## 10 GRYMES HILL   2028                   853367    870041
## # ℹ 26 more rows

nta_map <- st_read("nynta2020_25c/nynta2020.shp")

## Reading layer `nynta2020' from data source 
##   `C:\Users\nicol\OneDrive\Έγγραφα\nynta2020_25c\nynta2020.shp' 
##   using driver `ESRI Shapefile'
## Simple feature collection with 262 features and 11 fields
## Geometry type: MULTIPOLYGON
## Dimension:     XY
## Bounding box:  xmin: 913175.1 ymin: 120128.4 xmax: 1067383 ymax: 272844.3
## Projected CRS: NAD83 / New York Long Island (ftUS)

si_map <- nta_map %>%
  filter(BoroName == "Staten Island") %>%
  mutate(
    NTAName = if_else(NTAName == "Snug Harbor", "St. George-New Brighton", NTAName),
    NTAName = if_else(NTAName == "Great Kills Park", "Great Kills-Eltingville", NTAName),
    NTAName = if_else(NTAName == "Miller Field", "New Dorp-Midland Beach", NTAName)
  )


nta_shapes <- si_map %>%
  select(nta_name = NTAName, geometry)

nta_map <- st_read("nynta2020_25c/nynta2020.shp")

## Reading layer `nynta2020' from data source 
##   `C:\Users\nicol\OneDrive\Έγγραφα\nynta2020_25c\nynta2020.shp' 
##   using driver `ESRI Shapefile'
## Simple feature collection with 262 features and 11 fields
## Geometry type: MULTIPOLYGON
## Dimension:     XY
## Bounding box:  xmin: 913175.1 ymin: 120128.4 xmax: 1067383 ymax: 272844.3
## Projected CRS: NAD83 / New York Long Island (ftUS)

si_map <- nta_map %>%
  filter(BoroName == "Staten Island") %>%
  mutate(
    NTAName = if_else(NTAName == "Snug Harbor", "St. George-New Brighton", NTAName),
    NTAName = if_else(NTAName == "Great Kills Park", "Great Kills-Eltingville", NTAName),
    NTAName = if_else(NTAName == "Miller Field", "New Dorp-Midland Beach", NTAName)
  )

nta_shapes <- si_map %>%
  select(nta_name = NTAName, geometry)

schools_coords <- nta_shapes %>%
  st_centroid() %>%
  st_coordinates() %>%
  as.data.frame() %>%
  bind_cols(nta_shapes %>% st_drop_geometry()) %>%
  rename(lon = X, lat = Y)

## Warning: st_centroid assumes attributes are constant over geometries

schools <- schools %>%
  as.data.frame()
    
schools_mapped <- schools %>%
  mutate(
    lon = schools_coords$lon[match(nta_name, schools_coords$nta_name)],
    lat = schools_coords$lat[match(nta_name, schools_coords$nta_name)]
  )

schools_by_nta <- schools_mapped %>%
  filter(!is.na(nta_name)) %>%
  group_by(nta_name) %>%
  summarise(
    total_schools   = n(),
    avg_school_rank = round(mean(Ranking, na.rm = TRUE), 1),
    top_school      = avg_school_rank > 8,   
    .groups = "drop"
  )

nta_across_nj <- c(
  "Tottenville-Charleston",
  "New Springville-Willowbrook-Bulls Head-Travis",
  "Mariner's Harbor-Arlington-Graniteville",
  "Port Richmond",
  "West New Brighton-Silver Lake-Grymes Hill",
  "St. George-New Brighton"
)

nta_across_brooklyn <- c(
  "Tompkinsville-Stapleton-Clifton-Fox Hills",
  "Rosebank-Shore Acres-Park Hill",
  "Grasmere-Arrochar-South Beach-Dongan Hills"
)

nta_near_nj <- data.frame(
  nta_name = nta_across_nj,
  near_nj = TRUE
)
nta_near_brooklyn <- data.frame(
  nta_name = nta_across_brooklyn,
  near_brooklyn = TRUE
)
nta_with_train <- c(
  "St. George-New Brighton",
  "Tompkinsville-Stapleton-Clifton-Fox Hills",
  "Rosebank-Shore Acres-Park Hill",
  "Grasmere-Arrochar-South Beach-Dongan Hills",
  "New Dorp-Midland Beach",
  "Oakwood-Richmondtown",
  "Great Kills-Eltingville",
  "Annadale-Huguenot-Prince's Bay-Woodrow",
  "Tottenville-Charleston"
)

nta_train_access <- data.frame(
  nta_name = nta_with_train,
  train_access = TRUE
)

combined_nta <- combined_nta %>%
  ungroup() %>%
  dplyr::mutate(
    near_nj = FALSE,
    near_brooklyn = FALSE,
    train_access = FALSE
  )
combined_nta <- combined_nta %>%
  left_join(nta_near_nj, by = "nta_name", suffix = c("", "_new")) %>%
  left_join(nta_near_brooklyn, by = "nta_name", suffix = c("", "_new")) %>%
  left_join(nta_train_access, by = "nta_name", suffix = c("", "_new")) %>%
  mutate(
    near_nj = if_else(!is.na(near_nj_new), TRUE, near_nj),
    near_brooklyn = if_else(!is.na(near_brooklyn_new), TRUE, near_brooklyn),
    train_access = if_else(!is.na(train_access_new), TRUE, train_access)
  ) %>%
  select(-ends_with("_new"))

avg_house_age_by_nta <- si_sales_mapped %>%
  filter(!is.na(year_built)) %>%
  mutate(house_age = 2026 - year_built) %>%   
  group_by(nta_name) %>%
  summarise(
    avg_house_age = round(mean(house_age, na.rm = TRUE), 1),
    .groups = "drop"
  )

combined_nta <- combined_nta %>%
  left_join(avg_house_age_by_nta, by = "nta_name")


combined_nta <- combined_nta %>%
  left_join(schools_by_nta, by = "nta_name") 



price_stats_2025 <- avg_price_per_nta %>%
  filter(year == 2025) %>%
  summarise(
    min_price    = min(mean_price, na.rm = TRUE),
    q1_price     = quantile(mean_price, 0.25, na.rm = TRUE),
    median_price = median(mean_price, na.rm = TRUE),
    q3_price     = quantile(mean_price, 0.75, na.rm = TRUE),
    max_price    = max(mean_price, na.rm = TRUE)
  )

price_stats_2025

## # A tibble: 1 × 5
##   min_price q1_price median_price q3_price max_price
##       <dbl>    <dbl>        <dbl>    <dbl>     <dbl>
## 1    573283  650934.      750683.  836370.   915486.

Final App

A working R Shiny application is available and can be accessed here

forecast_year <- 2026


# UI

ui <- fluidPage(
  
  titlePanel("Staten Island Housing Price Dashboard"),
  
  sidebarLayout(
    sidebarPanel(
      
      radioButtons(
        "mode",
        "What would you like to do?",
        choices = c(
          "📝 Take the NTA Preference Quiz" = "quiz",
          "📊 View All NTAs (Forecasted Prices)" = "all"
        ),
        selected = "quiz"
      ),
      
      conditionalPanel(
        condition = "input.mode == 'quiz'",
        
        hr(),
        h4("💸 Budget (Forecasted)"),
        radioButtons("budget", NULL, choices = c(
          "Lower-Priced: Under $650K" = "low",
          "Mid-Priced: $650K – $835K" = "mid",
          "Upper-Priced: Over $835K" = "high",
          "I don't have a budget" = "none"
        )),
        
        hr(),
        h4("🥇 Interested in Top-Ranked Schools?"),
        radioButtons("schools", NULL, choices = c(
          "Yes" = "yes",
          "No" = "no",
          "I don't care" = "na"
        )),
        
        hr(),
        h4("🏠 Home Style Preference"),
        radioButtons("house_age_pref", NULL, choices = c(
          "More modern neighborhoods (mostly built after 1970)" = "new",
          "More historic neighborhoods (mostly built before 1970)" = "old",
          "I don't have a preference" = "na"
        )),
        
        hr(),
        h4("🌇 Preferred Location"),
        radioButtons("location", NULL, choices = c(
          "Across from New Jersey" = "nj",
          "Across from Brooklyn" = "bk",
          "I don't mind" = "na"
        )),
        
        hr(),
        h4("🚂 Train Access"),
        radioButtons("train", NULL, choices = c(
          "Yes" = "yes",
          "No" = "no",
          "I don't care" = "na"
        )),
        
        hr(),
        actionButton("run_quiz", "🔍 Show My NTA Matches", class = "btn-primary")
      ),
      
      hr(),
      selectInput(
        "neighborhood",
        "Select Neighborhood (Tab 2):",
        choices = sort(unique(combined_neighborhood$neighborhood))
      )
    ),
    
    mainPanel(
      tabsetPanel(
        
        tabPanel(
          "NTA Finder",
          DTOutput("quiz_results"),
          hr(),
          h4(textOutput("nta_detail_title")),
          plotlyOutput("nta_detail_trend", height = "350px"),
          DTOutput("nta_detail_table")
        ),
        
        tabPanel(
          "🏘️ Neighborhood Price Trends",
          plotlyOutput("price_trend_plot", height = "400px"),
          DTOutput("price_table")
        )
      )
    )
  )
)

# SERVER

server <- function(input, output, session) {
  

  quiz_filtered <- eventReactive(input$run_quiz, {
    
    df <- combined_nta %>% filter(year == forecast_year)
    
    if (input$budget == "low")  df <- df %>% filter(mean_price < 650000)
    if (input$budget == "mid")  df <- df %>% filter(mean_price >= 650000 & mean_price <= 835000)
    if (input$budget == "high") df <- df %>% filter(mean_price > 835000)
    
    if ("top_school" %in% colnames(df)) {
      if (input$schools == "yes") df <- df %>% filter(top_school)
      if (input$schools == "no")  df <- df %>% filter(!top_school)
    }
    
    if ("avg_house_age" %in% colnames(df)) {
      if (input$house_age_pref == "new") df <- df %>% filter(avg_house_age < 55)
      if (input$house_age_pref == "old") df <- df %>% filter(avg_house_age >= 55)
    }
    
    if (input$location == "nj") df <- df %>% filter(near_nj)
    if (input$location == "bk") df <- df %>% filter(near_brooklyn)
    
    if (input$train == "yes") df <- df %>% filter(train_access)
    if (input$train == "no")  df <- df %>% filter(!train_access)
    
    df
  })
  

  nta_display_data <- reactive({
    
    if (input$mode == "all") {
      return(
        combined_nta %>% filter(year == forecast_year)
      )
    }
    
    req(input$run_quiz)
    quiz_filtered()
  })
  

  output$quiz_results <- renderDT({
    
    df <- nta_display_data()
    
    if (nrow(df) == 0) {
      return(
        datatable(
          data.frame(Message = "❌ No NTAs matched your preferences."),
          options = list(dom = "t"),
          rownames = FALSE
        )
      )
    }
    
    df %>%
      mutate(`Forecasted Avg Price` = dollar(round(mean_price))) %>%
      select(NTA = nta_name, `Forecasted Avg Price`) %>%
      datatable(
        selection = "single",
        options = list(pageLength = 10),
        rownames = FALSE
      )
  })
  

  selected_nta <- reactive({
    s <- input$quiz_results_rows_selected
    req(length(s) == 1)
    nta_display_data()$nta_name[s]
  })

  # DETAIL OUTPUTS

  output$nta_detail_title <- renderText({
    req(selected_nta())
    paste("📈 Price History for:", selected_nta())
  })
  
  output$nta_detail_trend <- renderPlotly({
    req(selected_nta())
    
    df <- combined_nta %>% filter(nta_name == selected_nta())
    
    plot_ly(
      df,
      x = ~year,
      y = ~mean_price,
      type = "scatter",
      mode = "lines+markers"
    )
  })
  
  output$nta_detail_table <- renderDT({
    req(selected_nta())
    
    combined_nta %>%
      filter(nta_name == selected_nta()) %>%
      arrange(year) %>%
      mutate(mean_price = dollar(round(mean_price))) %>%
      datatable(options = list(pageLength = 10), rownames = FALSE)
  })
  
  
  # NEIGHBORHOOD TAB
 
  output$price_trend_plot <- renderPlotly({
    
    df <- combined_neighborhood %>%
      filter(neighborhood == input$neighborhood)
    
    plot_ly(
      df,
      x = ~year,
      y = ~mean_price,
      type = "scatter",
      mode = "markers"
    )
  })
  
  output$price_table <- renderDT({
    
    combined_neighborhood %>%
      filter(neighborhood == input$neighborhood) %>%
      datatable(options = list(pageLength = 10), rownames = FALSE)
  })
}

shinyApp(ui, server)

## 
## Listening on http://127.0.0.1:5388

Beyond the ‘Forgotten Borough’

Forecasting Staten Island Housing Prices with Machine Learning

Nikoleta Emanouilidi

2025-12-14

Libraries

Mapping of Staten Island sales

CPI

Mortgage rate

Population growth/decay of NYC and SI

Population of ntas

Crime data

Crime rate per nta

Average house price per year and nta

Schools

Average school ranking per nta

DOB Permits

Preprocessing

Models

Residuals vs Fitted

Histogram of Residuals

Q–Q Plot of Residuals

Actual vs Predicted

Pilot plan:Forecasting

Importance

NTA Increase %

Final App

A working R Shiny application is available and can be accessed here