# ----------------------------------------------------------------------
# I. DATA & LIBRARY LOADING
# ----------------------------------------------------------------------
setwd("~/Desktop/Statistical Model/assignment_3")
library(tidyverse)
── Attaching core tidyverse packages ────────────────────────────────────────────────────────────────────────── tidyverse 2.0.0 ──
✔ forcats 1.0.0 ✔ readr 2.1.5
✔ ggplot2 3.5.2 ✔ stringr 1.5.1
✔ lubridate 1.9.3 ✔ tibble 3.2.1
✔ purrr 1.2.0 ✔ tidyr 1.3.1── Conflicts ──────────────────────────────────────────────────────────────────────────────────────────── tidyverse_conflicts() ──
✖ dplyr::filter() masks stats::filter()
✖ dplyr::lag() masks stats::lag()
✖ MASS::select() masks dplyr::select()
ℹ Use the ]8;;http://conflicted.r-lib.org/conflicted package]8;; to force all conflicts to become errors
library(lubridate)
library(scales)
Attaching package: ‘scales’
The following object is masked from ‘package:purrr’:
discard
The following object is masked from ‘package:readr’:
col_factor
library(dplyr)
library(ggplot2)
# Load Data
raw_real_estate_2016 <- read.csv("2016_brooklyn.csv")
raw_real_estate_2017 <- read.csv("2017_brooklyn.csv")
raw_real_estate_2018 <- read.csv("2018_brooklyn.csv")
raw_real_estate_2019 <- read.csv("2019_brooklyn.csv")
raw_real_estate_2020 <- read.csv("2020_brooklyn.csv")
# Check Column Names
colnames(raw_real_estate_2016)
[1] "BROOKLYN.ANNUALIZE.SALE.FOR.2016....All.Sales.From..January.1..2016...December.31..2016."
[2] "X"
[3] "X.1"
[4] "X.2"
[5] "X.3"
[6] "X.4"
[7] "X.5"
[8] "X.6"
[9] "X.7"
[10] "X.8"
[11] "X.9"
[12] "X.10"
[13] "X.11"
[14] "X.12"
[15] "X.13"
[16] "X.14"
[17] "X.15"
[18] "X.16"
[19] "X.17"
[20] "X.18"
[21] "X.19"
colnames(raw_real_estate_2017)
[1] "BROOKLYN.ANNUALIZE.SALE.FOR.2017....All.Sales.From..January.1..2017...December.31..2017."
[2] "X"
[3] "X.1"
[4] "X.2"
[5] "X.3"
[6] "X.4"
[7] "X.5"
[8] "X.6"
[9] "X.7"
[10] "X.8"
[11] "X.9"
[12] "X.10"
[13] "X.11"
[14] "X.12"
[15] "X.13"
[16] "X.14"
[17] "X.15"
[18] "X.16"
[19] "X.17"
[20] "X.18"
[21] "X.19"
colnames(raw_real_estate_2018)
[1] "BROOKLYN.ANNUALIZE.SALE.FOR.2018....All.Sales.From..January.1..2018...December.31..2018."
[2] "X"
[3] "X.1"
[4] "X.2"
[5] "X.3"
[6] "X.4"
[7] "X.5"
[8] "X.6"
[9] "X.7"
[10] "X.8"
[11] "X.9"
[12] "X.10"
[13] "X.11"
[14] "X.12"
[15] "X.13"
[16] "X.14"
[17] "X.15"
[18] "X.16"
[19] "X.17"
[20] "X.18"
[21] "X.19"
colnames(raw_real_estate_2019)
[1] "BROOKLYN.ANNUAL.SALES.FOR.CALENDAR.YEAR.2019...All.Sales.From.January.2019...December.2019..PTS.Sales.as.of.04.01.2020."
[2] "X"
[3] "X.1"
[4] "X.2"
[5] "X.3"
[6] "X.4"
[7] "X.5"
[8] "X.6"
[9] "X.7"
[10] "X.8"
[11] "X.9"
[12] "X.10"
[13] "X.11"
[14] "X.12"
[15] "X.13"
[16] "X.14"
[17] "X.15"
[18] "X.16"
[19] "X.17"
[20] "X.18"
[21] "X.19"
colnames(raw_real_estate_2020)
[1] "BROOKLYN.ANNUAL.SALES.FOR.CALENDAR.YEAR.2020" "X"
[3] "X.1" "X.2"
[5] "X.3" "X.4"
[7] "X.5" "X.6"
[9] "X.7" "X.8"
[11] "X.9" "X.10"
[13] "X.11" "X.12"
[15] "X.13" "X.14"
[17] "X.15" "X.16"
[19] "X.17" "X.18"
[21] "X.19"
# Check Raw Data
head(raw_real_estate_2016)
head(raw_real_estate_2017)
head(raw_real_estate_2018)
head(raw_real_estate_2019)
head(raw_real_estate_2020)
# ----------------------------------------------------------------------
# II. CLEAN & COMBINE DATA
# ----------------------------------------------------------------------
# Remove Metadata Rows
real_estate_2016 <- raw_real_estate_2016[-(1:4), ]
real_estate_2017 <- raw_real_estate_2017[-(1:4), ]
real_estate_2018 <- raw_real_estate_2018[-(1:4), ]
real_estate_2019 <- raw_real_estate_2019[-(1:4), ]
real_estate_2020 <- raw_real_estate_2020[-(1:7), ]
head(real_estate_2016)
head(real_estate_2017)
head(real_estate_2018)
head(real_estate_2019)
head(real_estate_2020)
# Standardize Column Names
standardized_column_names <- c("BOROUGH",
"NEIGHBORHOOD",
"BUILDING CLASS CATEGORY",
"TAX CLASS AT PRESENT",
"BLOCK",
"LOT",
"EASE-MENT",
"BUILDING CLASS AT PRESENT",
"ADDRESS",
"APARTMENT NUMBER",
"ZIP CODE",
"RESIDENTIAL UNITS",
"COMMERCIAL UNITS",
"TOTAL UNITS",
"LAND SQUARE FEET",
"GROSS SQUARE FEET",
"YEAR BUILT",
"TAX CLASS AT TIME OF SALE",
"BUILDING CLASS AT TIME OF SALE",
"SALE PRICE",
"SALE DATE")
colnames(real_estate_2016) <- standardized_column_names
colnames(real_estate_2017) <- standardized_column_names
colnames(real_estate_2018) <- standardized_column_names
colnames(real_estate_2019) <- standardized_column_names
colnames(real_estate_2020) <- standardized_column_names
head(real_estate_2016)
head(real_estate_2017)
head(real_estate_2018)
head(real_estate_2019)
head(real_estate_2020)
# Set Column Data Types
set_column_types <- function(df) {
string_cols <- c(
"BOROUGH", "NEIGHBORHOOD", "BUILDING CLASS CATEGORY",
"TAX CLASS AT PRESENT", "BLOCK", "EASE-MENT",
"BUILDING CLASS AT PRESENT", "ADDRESS", "APARTMENT NUMBER",
"ZIP CODE", "TAX CLASS AT TIME OF SALE",
"BUILDING CLASS AT TIME OF SALE"
)
numeric_cols <- c(
"RESIDENTIAL UNITS", "COMMERCIAL UNITS", "TOTAL UNITS",
"LAND SQUARE FEET", "GROSS SQUARE FEET", "YEAR BUILT", "SALE PRICE"
)
# Clean and convert string/numeric columns first
df <- df %>%
mutate(across(where(is.character), str_squish)) %>%
mutate(across(all_of(string_cols), as.character)) %>%
mutate(across(all_of(numeric_cols),
~ suppressWarnings(as.numeric(gsub("[^0-9.]", "", str_squish(.))))))
# Handle SALE DATE format flexibly (2-digit vs 4-digit year)
if (any(grepl("^\\d{1,2}/\\d{1,2}/\\d{2}$", df$`SALE DATE`))) {
df$`SALE DATE` <- as.Date(df$`SALE DATE`, format = "%m/%d/%y") # ✅ 2-digit year
} else {
df$`SALE DATE` <- as.Date(df$`SALE DATE`, format = "%m/%d/%Y") # ✅ 4-digit year
}
return(df)
}
real_estate_2016 <- set_column_types(real_estate_2016)
real_estate_2017 <- set_column_types(real_estate_2017)
real_estate_2018 <- set_column_types(real_estate_2018)
real_estate_2019 <- set_column_types(real_estate_2019)
real_estate_2020 <- set_column_types(real_estate_2020)
head(real_estate_2016)
head(real_estate_2017)
head(real_estate_2018)
head(real_estate_2019)
head(real_estate_2020)
# Filter data
# first letter of BUILDING CLASS AT TIME OF SALE in c ("A", "R)
# AND number of TOTAL UNITS == 1
# AND number of RESIDENTIAL UNITS == 1
# AND GROSS SQUARE FOOT > 0
# AND SALE PRICE is not N/A
filter_data <- function(df) {
df %>%
filter(
substr(`BUILDING CLASS AT TIME OF SALE`, 1, 1) %in% c("A", "R"),
`TOTAL UNITS` == 1,
`RESIDENTIAL UNITS` == 1,
`GROSS SQUARE FEET` > 0,
!is.na(`SALE PRICE`)
)
}
real_estate_2016 <- filter_data(real_estate_2016)
real_estate_2017 <- filter_data(real_estate_2017)
real_estate_2018 <- filter_data(real_estate_2018)
real_estate_2019 <- filter_data(real_estate_2019)
real_estate_2020 <- filter_data(real_estate_2020)
head(real_estate_2016)
head(real_estate_2017)
head(real_estate_2018)
head(real_estate_2019)
head(real_estate_2020)
# Create New "Age" column
datasets <- list(
real_estate_2016 = real_estate_2016,
real_estate_2017 = real_estate_2017,
real_estate_2018 = real_estate_2018,
real_estate_2019 = real_estate_2019,
real_estate_2020 = real_estate_2020
)
for (year in 2016:2020) {
df_name <- paste0("real_estate_", year)
df <- get(df_name)
df$`SALE DATE` <- as.Date(df$`SALE DATE`)
df$`SALE YEAR` <- as.numeric(format(df$`SALE DATE`, "%Y"))
df$`BUILDING AGE AT SALE` <- ifelse(
is.na(df$`YEAR BUILT`) | df$`YEAR BUILT` == 0,
NA,
df$`SALE YEAR` - df$`YEAR BUILT`
)
assign(df_name, df)
}
# Combine All Data
real_estate_df <- rbind(real_estate_2016,
real_estate_2017,
real_estate_2018,
real_estate_2019,
real_estate_2020)
head(real_estate_df)
# Drop Irrelevant columns
real_estate_df <- real_estate_df[ , !(names(real_estate_2016) %in% c(
"BOROUGH",
"EASE-MENT",
"APARTMENT NUMBER",
"TOTAL UNITS", # Always 1
"RESIDENTIAL UNITS", # Always 1
"COMMERCIAL UNITS",
"ADDRESS NUMBER",
"SALE YEAR"
))]
# Check Row
head(real_estate_df)
nrow(real_estate_df)
[1] 19640
## Build cleaned data ready for one-line regressions
## Input: real_estate_df
# 1) Copy and make names R-safe (handles spaces)
df <- real_estate_df
names(df) <- make.names(names(df))
# 2) Identify the price column robustly and coerce to numeric if needed
price_col <- if ("SALE.PRICE" %in% names(df)) "SALE.PRICE" else {
cand <- grep("^SALE[._ ]?PRICE$", names(df), ignore.case = TRUE, value = TRUE)
stopifnot(length(cand) > 0); cand[1]
}
if (is.character(df[[price_col]])) {
df[[price_col]] <- as.numeric(gsub("[^0-9.]", "", df[[price_col]]))
}
# 3) Drop obvious IDs/dates (optional but typical)
drop_cols <- intersect(c("ADDRESS","BLOCK","LOT","SALE.DATE"), names(df))
if (length(drop_cols)) df[drop_cols] <- list(NULL)
# 4) Convert ALL character cols to factors (so lm() makes dummies)
char_cols <- names(df)[sapply(df, is.character)]
if (length(char_cols)) df[char_cols] <- lapply(df[char_cols], as.factor)
# 5) Drop single-level columns (avoids contrasts errors)
one_level <- names(df)[sapply(df, function(x) length(unique(na.omit(x))) <= 1)]
if (length(one_level)) df[one_level] <- list(NULL)
# 6) Keep rows with finite prices; also make a log-ready version (>0)
df_ready <- subset(df, is.finite(df[[price_col]]))
df_ready_log <- subset(df_ready, df_ready[[price_col]] > 0)
cat("df_ready:", nrow(df_ready), "rows,", ncol(df_ready), "cols\n")
df_ready: 19640 rows, 12 cols
cat("df_ready_log:", nrow(df_ready_log), "rows\n")
df_ready_log: 13985 rows
# ============ LAST-MILE++ (≤40 df, ≥13k rows): ZIP-relative size, timing×neigh, age hinge ============
suppressPackageStartupMessages({ library(dplyr); library(splines); library(MASS) })
wins <- function(x, qlo=.01, qhi=.99){ q <- quantile(x,c(qlo,qhi),na.rm=TRUE); pmin(q[2],pmax(q[1],x)) }
rmse <- function(a,b) sqrt(mean((a-b)^2,na.rm=TRUE))
mae <- function(a,b) mean(abs(a-b),na.rm=TRUE)
count_df <- function(f,d) ncol(model.matrix(f,d))-1L
# ---- data & core numerics ----
df <- real_estate_df
names(df) <- make.names(names(df))
if (is.character(df$SALE.PRICE)) df$SALE.PRICE <- as.numeric(gsub("[^0-9.]","",df$SALE.PRICE))
df <- df %>% filter(is.finite(SALE.PRICE), SALE.PRICE>0)
df <- df %>%
mutate(
GSF = pmax(1, suppressWarnings(as.numeric(GROSS.SQUARE.FEET))),
LSF = pmax(1, suppressWarnings(as.numeric(LAND.SQUARE.FEET))),
LOG_GSF = log(GSF), LOG_LSF = log(LSF),
AGE = dplyr::case_when(
!is.na(BUILDING.AGE.AT.SALE) ~ suppressWarnings(as.numeric(BUILDING.AGE.AT.SALE)),
!is.na(YEAR.BUILT) ~ as.numeric(format(Sys.Date(),"%Y")) - suppressWarnings(as.numeric(YEAR.BUILT)),
TRUE ~ NA_real_
),
AGE_LOG = log1p(pmax(0, AGE)),
FAR = pmin(5, pmax(0.05, GSF / pmax(1, LSF))),
SALE.DATE = suppressWarnings(as.Date(if ("SALE.DATE" %in% names(df)) SALE.DATE else NA)),
TIME_INDEX = as.numeric(difftime(SALE.DATE, min(SALE.DATE, na.rm=TRUE), units="days"))/365,
ZIP = if ("ZIP.CODE" %in% names(df)) as.character(ZIP.CODE) else NA_character_
) %>%
mutate(across(c(SALE.PRICE,GSF,LSF,FAR), wins))
# ---- simple target encodings (in-bag shrinked; we already validated oof earlier) ----
ylog <- log(df$SALE.PRICE)
target_enc <- function(k,y,m=70){
mu<-mean(y,na.rm=TRUE); key<-as.character(k)
t<-tapply(y,key,function(v){n<-sum(is.finite(v));(sum(v,na.rm=TRUE)+m*mu)/(pmax(1,n)+m)})
out<-as.numeric(t[key]); out[!is.finite(out)]<-mu; out
}
df$NEIGH_TE <- if ("NEIGHBORHOOD"%in%names(df)) target_enc(df$NEIGHBORHOOD,ylog,70) else NA
df$BCLASS_TE <- if ("BUILDING.CLASS.AT.PRESENT"%in%names(df)) target_enc(substr(df$BUILDING.CLASS.AT.PRESENT,1,1),ylog,60) else NA
df$TAX_TE <- if ("TAX.CLASS.AT.PRESENT"%in%names(df)) target_enc(df$TAX.CLASS.AT.PRESENT,ylog,50) else NA
df$BCLASS_R <- if("BUILDING.CLASS.AT.PRESENT"%in%names(df))
as.numeric(substr(df$BUILDING.CLASS.AT.PRESENT,1,1)=="R") else 0
df$TAX_2C <- if("TAX.CLASS.AT.PRESENT"%in%names(df))
as.numeric(df$TAX.CLASS.AT.PRESENT%in%c("2C","02C")) else 0
df$TAX_2 <- if("TAX.CLASS.AT.PRESENT"%in%names(df))
as.numeric(df$TAX.CLASS.AT.PRESENT%in%c("2","02")) else 0
# ---- NEW 1: relative size within ZIP (1 df) ----
if (!all(is.na(df$ZIP))) {
zip_med <- tapply(df$LOG_GSF, df$ZIP, function(v) median(v, na.rm=TRUE))
df$REL_SIZE_ZIP <- df$LOG_GSF - as.numeric(zip_med[df$ZIP])
df$REL_SIZE_ZIP[!is.finite(df$REL_SIZE_ZIP)] <- 0
} else df$REL_SIZE_ZIP <- 0
# ---- NEW 2: timing × neighborhood TE (1 df) ----
df$TIxNE <- with(df, TIME_INDEX * NEIGH_TE)
# ---- NEW 3: very-old stock hinge (1 df; hinge ~50y on original scale) ----
cut_age <- log1p(50)
df$AGE_HINGE <- pmax(0, df$AGE_LOG - cut_age)
# model data (keep ≥13k)
feats <- c("LOG_GSF","LOG_LSF","AGE_LOG","AGE_HINGE","FAR","TIME_INDEX",
"NEIGH_TE","BCLASS_TE","TAX_TE","BCLASS_R","TAX_2","TAX_2C",
"REL_SIZE_ZIP","TIxNE")
dfm <- df[complete.cases(df[,c("SALE.PRICE",feats)]), c("SALE.PRICE",feats)]
stopifnot(nrow(dfm) >= 13000)
# ---- formula: keep your spine + previous interactions + 3 new terms ----
terms <- c(
"ns(LOG_GSF,4)","ns(LOG_LSF,2)","ns(AGE_LOG,2)","AGE_HINGE","ns(TIME_INDEX,3)","FAR",
"NEIGH_TE","BCLASS_TE","TAX_TE","BCLASS_R","TAX_2","TAX_2C",
"REL_SIZE_ZIP","TIxNE",
"LOG_GSF:TAX_2","LOG_GSF:BCLASS_R","FAR:LOG_GSF" # from your strong variant
)
form_full <- as.formula(paste("log(SALE.PRICE) ~", paste(terms, collapse=" + ")))
# ---- keep ≤40 df (shrink splines first, then drop weakest new terms if needed) ----
reduce_to40 <- function(f,d){
cur <- f
drop_order <- c("TIxNE","REL_SIZE_ZIP","AGE_HINGE","FAR:LOG_GSF","LOG_GSF:TAX_2","LOG_GSF:BCLASS_R")
for (iter in 1:120){
k <- try(count_df(cur,d), silent=TRUE)
if (!inherits(k,"try-error") && k <= 40) return(cur)
# step 1: reduce LOG_GSF spline 4->3->2
fstr <- deparse(cur)
if (grepl("ns\\(LOG_GSF,4\\)", fstr)) { fstr <- gsub("ns\\(LOG_GSF,4\\)","ns(LOG_GSF,3)", fstr)
} else if (grepl("ns\\(LOG_GSF,3\\)", fstr)) { fstr <- gsub("ns\\(LOG_GSF,3\\)","ns(LOG_GSF,2)", fstr)
} else {
# step 2: drop lowest-priority single-df terms one by one
for (t in drop_order) {
if (grepl(paste0("(^|\\+|\\s)", gsub("\\+","\\\\+",t), "($|\\+|\\s)"), fstr)) {
fstr <- sub(paste0("\\+\\s*", t), "", fstr); break
}
}
}
cur <- as.formula(fstr)
}
cur
}
form <- reduce_to40(form_full, dfm)
# ---- fit two variants: robust vs. OLS, pick lower RMSE (tie -> higher R^2) ----
fit_and_eval <- function(dat, form, robust = FALSE, k = 1.5){
# base fit (unweighted) to get residual scale
m0 <- lm(form, data = dat)
if (robust){
# standardized residuals
sig <- summary(m0)$sigma
u <- residuals(m0) / sig
# Huber weights: w = min(1, k/|u|)
w <- pmin(1, k / pmax(1e-8, abs(u)))
w[!is.finite(w)] <- 1
# attach to data so lm can see it
dat$.__w <- w
m <- lm(form, data = dat, weights = .__w, na.action = na.exclude)
} else {
m <- m0
}
# predictions on log-scale, then Duan smearing to get dollars
pred_log <- predict(m, newdata = dat, na.action = na.exclude)
smear <- mean(exp(residuals(m)), na.rm = TRUE)
yhat <- exp(pred_log) * smear
y <- dat$SALE.PRICE
rmse <- sqrt(mean((y - yhat)^2, na.rm = TRUE))
mae <- mean(abs(y - yhat), na.rm = TRUE)
r2 <- 1 - sum((y - yhat)^2, na.rm = TRUE) / sum((y - mean(y, na.rm = TRUE))^2, na.rm = TRUE)
dfmod <- ncol(model.matrix(form, dat)) - 1L
list(m = m, smear = smear, rmse = rmse, mae = mae, r2 = r2,
adjr2 = summary(m)$adj.r.squared, df = dfmod)
}
res_ols <- fit_and_eval(dfm, form, robust=FALSE)
res_rob <- fit_and_eval(dfm, form, robust=TRUE)
best <- if (res_rob$rmse + 1e-9 < res_ols$rmse) res_rob else if (abs(res_rob$rmse-res_ols$rmse)<1e-9 && res_rob$r2>res_ols$r2) res_rob else res_ols
cat("\n=========== LAST-MILE++ RESULTS ===========\n")
=========== LAST-MILE++ RESULTS ===========
cat("Formula:\n ", deparse(form), "\n\n")
Formula:
log(SALE.PRICE) ~ ns(LOG_GSF, 4) + ns(LOG_LSF, 2) + ns(AGE_LOG, 2) + AGE_HINGE + ns(TIME_INDEX, 3) + FAR + NEIGH_TE + BCLASS_TE + TAX_TE + BCLASS_R + TAX_2 + TAX_2C + REL_SIZE_ZIP + TIxNE + LOG_GSF:TAX_2 + LOG_GSF:BCLASS_R + FAR:LOG_GSF
cat(sprintf("Rows: %d (≥13k) | df: %d (≤40)\n", nrow(dfm), best$df))
Rows: 13985 (≥13k) | df: 24 (≤40)
cat(sprintf("Adj R^2 (log): %.3f | Duan smearing: %.4f | Fit: %s\n",
best$adjr2, best$smear, if (identical(best,res_rob)) "robust" else "OLS"))
Adj R^2 (log): 0.215 | Duan smearing: 1.1423 | Fit: robust
cat(sprintf("$-scale RMSE: $%s | MAE: $%s | R^2: %.3f\n",
format(round(best$rmse), big.mark=","), format(round(best$mae), big.mark=","), best$r2))
$-scale RMSE: $464,722 | MAE: $275,386 | R^2: 0.601
# ==== COEFFICIENT TABLE & INFERENCE (for the chosen 'best$m') ====
pretty_stars <- function(p){
cut(p, breaks=c(-Inf, .001, .01, .05, .1, Inf),
labels=c("***","**","*","."," "), right=FALSE)
}
summ <- summary(best$m)
coeft <- as.data.frame(summ$coefficients)
names(coeft) <- c("Estimate","Std.Error","t.value","Pr(>|t|)")
coeft$Signif <- pretty_stars(coeft[["Pr(>|t|)"]])
# 95% CIs (base-R; works for weighted lm as well)
ci <- try(confint(best$m), silent=TRUE)
if (!inherits(ci, "try-error")) {
coeft$CI_2.5 <- ci[rownames(coeft), 1]
coeft$CI_97.5 <- ci[rownames(coeft), 2]
}
# Order by absolute t
coeft$Abs_t <- abs(coeft$`t.value`)
coeft <- coeft[order(-coeft$Abs_t), ]
cat("\n=========== COEFFICIENT TABLE (sorted by |t|) ===========\n")
=========== COEFFICIENT TABLE (sorted by |t|) ===========
printCoefmat(as.matrix(coeft[, c("Estimate","Std.Error","t.value","Pr(>|t|)")]),
P.values=TRUE, has.Pvalue=TRUE, signif.stars=FALSE, digits=4)
Estimate Std.Error t.value Pr(>|t|)
NEIGH_TE 1.10981 0.04735 23.440 < 2e-16
ns(AGE_LOG, 2)1 -1.66316 0.26285 -6.328 2.57e-10
REL_SIZE_ZIP 0.49319 0.07989 6.174 6.86e-10
ns(AGE_LOG, 2)2 -3.01867 0.49800 -6.062 1.38e-09
AGE_HINGE 0.75361 0.12746 5.913 3.45e-09
ns(TIME_INDEX, 3)2 5.19921 1.26457 4.111 3.95e-05
ns(TIME_INDEX, 3)1 2.41292 0.59423 4.061 4.92e-05
ns(TIME_INDEX, 3)3 3.69154 0.92786 3.979 6.97e-05
TIxNE -0.06384 0.01687 -3.784 0.000155
ns(LOG_LSF, 2)2 0.51394 0.20014 2.568 0.010242
BCLASS_TE -3.24065 1.64281 -1.973 0.048558
ns(LOG_GSF, 4)3 1.20838 0.64211 1.882 0.059872
TAX_TE 0.50637 0.27325 1.853 0.063881
TAX_2:LOG_GSF 0.14987 0.09172 1.634 0.102265
TAX_2 -1.05791 0.65800 -1.608 0.107910
ns(LOG_GSF, 4)1 0.45456 0.29230 1.555 0.119943
BCLASS_R:LOG_GSF 0.14079 0.09200 1.530 0.125962
(Intercept) 34.15769 22.34039 1.529 0.126296
ns(LOG_GSF, 4)4 0.53788 0.38967 1.380 0.167499
ns(LOG_GSF, 4)2 0.31910 0.23838 1.339 0.180719
FAR:LOG_GSF -0.02041 0.01632 -1.250 0.211282
FAR 0.14036 0.13534 1.037 0.299697
TAX_2C -0.03878 0.11326 -0.342 0.732043
ns(LOG_LSF, 2)1 -0.09459 0.41667 -0.227 0.820415
cat("\nSignif. codes: 0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1\n")
Signif. codes: 0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1
# Add stars & CIs in a tidy view
cat("\nTop 25 by |t| (with stars & 95% CI):\n")
Top 25 by |t| (with stars & 95% CI):
print(utils::head(coeft[, c("Estimate","Std.Error","t.value","Pr(>|t|)","Signif","CI_2.5","CI_97.5")], 25), digits=4)
# Model overview again for convenience
cat("\n=========== MODEL OVERVIEW ===========\n")
=========== MODEL OVERVIEW ===========
cat(sprintf("Observations: %d\n", nobs(best$m)))
Observations: 13985
cat(sprintf("Model df (excl. intercept): %d\n", best$df))
Model df (excl. intercept): 24
cat(sprintf("Adj R^2 (log): %.3f | Duan smearing: %.4f | Fit: %s\n",
best$adjr2, best$smear, if (identical(best$rmse, res_rob$rmse) && identical(best$r2, res_rob$r2)) "robust" else "OLS"))
Adj R^2 (log): 0.215 | Duan smearing: 1.1423 | Fit: robust
cat(sprintf("$-scale RMSE: $%s | MAE: $%s | R^2: %.3f\n",
format(round(best$rmse), big.mark=","), format(round(best$mae), big.mark=","), best$r2))
$-scale RMSE: $464,722 | MAE: $275,386 | R^2: 0.601
# Optional: write to CSV for the writeup
out_tab <- coeft[, c("Estimate","Std.Error","t.value","Pr(>|t|)","Signif","CI_2.5","CI_97.5","Abs_t")]
utils::write.csv(out_tab, "last_mile_coefficients.csv", row.names=TRUE)
cat("\nSaved coefficients to: last_mile_coefficients.csv\n")
Saved coefficients to: last_mile_coefficients.csv
---
title: "assignment_3_section1&2"
output: html_notebook
---


```{r}
# ----------------------------------------------------------------------
# I. DATA & LIBRARY LOADING
# ----------------------------------------------------------------------

setwd("~/Desktop/Statistical Model/assignment_3")

library(tidyverse)
library(lubridate)
library(scales)
library(dplyr)
library(ggplot2)

# Load Data
raw_real_estate_2016 <- read.csv("2016_brooklyn.csv")
raw_real_estate_2017 <- read.csv("2017_brooklyn.csv")
raw_real_estate_2018 <- read.csv("2018_brooklyn.csv")
raw_real_estate_2019 <- read.csv("2019_brooklyn.csv")
raw_real_estate_2020 <- read.csv("2020_brooklyn.csv")

# Check Column Names
colnames(raw_real_estate_2016)
colnames(raw_real_estate_2017)
colnames(raw_real_estate_2018)
colnames(raw_real_estate_2019)
colnames(raw_real_estate_2020)

# Check Raw Data
head(raw_real_estate_2016)
head(raw_real_estate_2017)
head(raw_real_estate_2018)
head(raw_real_estate_2019)
head(raw_real_estate_2020)

# ----------------------------------------------------------------------
# II. CLEAN & COMBINE DATA
# ----------------------------------------------------------------------

# Remove Metadata Rows
real_estate_2016 <- raw_real_estate_2016[-(1:4), ]
real_estate_2017 <- raw_real_estate_2017[-(1:4), ]
real_estate_2018 <- raw_real_estate_2018[-(1:4), ]
real_estate_2019 <- raw_real_estate_2019[-(1:4), ]
real_estate_2020 <- raw_real_estate_2020[-(1:7), ]

head(real_estate_2016)
head(real_estate_2017)
head(real_estate_2018)
head(real_estate_2019)
head(real_estate_2020)

# Standardize Column Names
standardized_column_names <- c("BOROUGH", 
                           "NEIGHBORHOOD",
                           "BUILDING CLASS CATEGORY", 
                           "TAX CLASS AT PRESENT", 
                           "BLOCK", 
                           "LOT",
                           "EASE-MENT",
                           "BUILDING CLASS AT PRESENT",
                           "ADDRESS",
                           "APARTMENT NUMBER",
                           "ZIP CODE",
                           "RESIDENTIAL UNITS",
                           "COMMERCIAL UNITS",
                           "TOTAL UNITS",
                           "LAND SQUARE FEET",
                           "GROSS SQUARE FEET",
                           "YEAR BUILT",
                           "TAX CLASS AT TIME OF SALE",
                           "BUILDING CLASS AT TIME OF SALE",
                           "SALE PRICE",
                           "SALE DATE")

colnames(real_estate_2016) <- standardized_column_names
colnames(real_estate_2017) <- standardized_column_names
colnames(real_estate_2018) <- standardized_column_names
colnames(real_estate_2019) <- standardized_column_names
colnames(real_estate_2020) <- standardized_column_names

head(real_estate_2016)
head(real_estate_2017)
head(real_estate_2018)
head(real_estate_2019)
head(real_estate_2020)

# Set Column Data Types
set_column_types <- function(df) {
  string_cols <- c(
    "BOROUGH", "NEIGHBORHOOD", "BUILDING CLASS CATEGORY", 
    "TAX CLASS AT PRESENT", "BLOCK", "EASE-MENT", 
    "BUILDING CLASS AT PRESENT", "ADDRESS", "APARTMENT NUMBER", 
    "ZIP CODE", "TAX CLASS AT TIME OF SALE", 
    "BUILDING CLASS AT TIME OF SALE"
  )
  
  numeric_cols <- c(
    "RESIDENTIAL UNITS", "COMMERCIAL UNITS", "TOTAL UNITS",
    "LAND SQUARE FEET", "GROSS SQUARE FEET", "YEAR BUILT", "SALE PRICE"
  )
  
  # Clean and convert string/numeric columns first
  df <- df %>%
    mutate(across(where(is.character), str_squish)) %>%
    mutate(across(all_of(string_cols), as.character)) %>%
    mutate(across(all_of(numeric_cols),
                  ~ suppressWarnings(as.numeric(gsub("[^0-9.]", "", str_squish(.))))))
  
  # Handle SALE DATE format flexibly (2-digit vs 4-digit year)
  if (any(grepl("^\\d{1,2}/\\d{1,2}/\\d{2}$", df$`SALE DATE`))) {
    df$`SALE DATE` <- as.Date(df$`SALE DATE`, format = "%m/%d/%y")  # ✅ 2-digit year
  } else {
    df$`SALE DATE` <- as.Date(df$`SALE DATE`, format = "%m/%d/%Y")  # ✅ 4-digit year
  }
  
  return(df)
}




real_estate_2016 <- set_column_types(real_estate_2016)
real_estate_2017 <- set_column_types(real_estate_2017)
real_estate_2018 <- set_column_types(real_estate_2018)
real_estate_2019 <- set_column_types(real_estate_2019)
real_estate_2020 <- set_column_types(real_estate_2020)

head(real_estate_2016)
head(real_estate_2017)
head(real_estate_2018)
head(real_estate_2019)
head(real_estate_2020)

# Filter data 
# first letter of BUILDING CLASS AT TIME OF SALE in c ("A", "R)
# AND number of TOTAL UNITS == 1
# AND number of RESIDENTIAL UNITS == 1
# AND GROSS SQUARE FOOT > 0
# AND SALE PRICE is not N/A

filter_data <- function(df) {
  df %>%
    filter(
      substr(`BUILDING CLASS AT TIME OF SALE`, 1, 1) %in% c("A", "R"),
      `TOTAL UNITS` == 1,
      `RESIDENTIAL UNITS` == 1,
      `GROSS SQUARE FEET` > 0,
      !is.na(`SALE PRICE`)
    )
}

real_estate_2016 <- filter_data(real_estate_2016)
real_estate_2017 <- filter_data(real_estate_2017)
real_estate_2018 <- filter_data(real_estate_2018)
real_estate_2019 <- filter_data(real_estate_2019)
real_estate_2020 <- filter_data(real_estate_2020)

head(real_estate_2016)
head(real_estate_2017)
head(real_estate_2018)
head(real_estate_2019)
head(real_estate_2020)

# Create New "Age" column
datasets <- list(
  real_estate_2016 = real_estate_2016,
  real_estate_2017 = real_estate_2017,
  real_estate_2018 = real_estate_2018,
  real_estate_2019 = real_estate_2019,
  real_estate_2020 = real_estate_2020
)

for (year in 2016:2020) {
  df_name <- paste0("real_estate_", year)
  df <- get(df_name)
  
  df$`SALE DATE` <- as.Date(df$`SALE DATE`)
  df$`SALE YEAR` <- as.numeric(format(df$`SALE DATE`, "%Y"))
  
  df$`BUILDING AGE AT SALE` <- ifelse(
    is.na(df$`YEAR BUILT`) | df$`YEAR BUILT` == 0,
    NA,
    df$`SALE YEAR` - df$`YEAR BUILT`
  )
  assign(df_name, df)
}


# Combine All Data
real_estate_df <- rbind(real_estate_2016, 
                        real_estate_2017, 
                        real_estate_2018, 
                        real_estate_2019,
                        real_estate_2020)

head(real_estate_df)

# Drop Irrelevant columns
real_estate_df <- real_estate_df[ , !(names(real_estate_2016) %in% c(
  "BOROUGH", 
  "EASE-MENT", 
  "APARTMENT NUMBER",
  "TOTAL UNITS", # Always 1
  "RESIDENTIAL UNITS", # Always 1
  "COMMERCIAL UNITS", 
  "ADDRESS NUMBER",
  "SALE YEAR"
))]

# Check Row 
head(real_estate_df)
nrow(real_estate_df)

```

```{r}
## Build cleaned data ready for one-line regressions
## Input: real_estate_df

# 1) Copy and make names R-safe (handles spaces)
df <- real_estate_df
names(df) <- make.names(names(df))

# 2) Identify the price column robustly and coerce to numeric if needed
price_col <- if ("SALE.PRICE" %in% names(df)) "SALE.PRICE" else {
  cand <- grep("^SALE[._ ]?PRICE$", names(df), ignore.case = TRUE, value = TRUE)
  stopifnot(length(cand) > 0); cand[1]
}
if (is.character(df[[price_col]])) {
  df[[price_col]] <- as.numeric(gsub("[^0-9.]", "", df[[price_col]]))
}

# 3) Drop obvious IDs/dates (optional but typical)
drop_cols <- intersect(c("ADDRESS","BLOCK","LOT","SALE.DATE"), names(df))
if (length(drop_cols)) df[drop_cols] <- list(NULL)

# 4) Convert ALL character cols to factors (so lm() makes dummies)
char_cols <- names(df)[sapply(df, is.character)]
if (length(char_cols)) df[char_cols] <- lapply(df[char_cols], as.factor)

# 5) Drop single-level columns (avoids contrasts errors)
one_level <- names(df)[sapply(df, function(x) length(unique(na.omit(x))) <= 1)]
if (length(one_level)) df[one_level] <- list(NULL)

# 6) Keep rows with finite prices; also make a log-ready version (>0)
df_ready     <- subset(df, is.finite(df[[price_col]]))
df_ready_log <- subset(df_ready, df_ready[[price_col]] > 0)

cat("df_ready:", nrow(df_ready), "rows,", ncol(df_ready), "cols\n")
cat("df_ready_log:", nrow(df_ready_log), "rows\n")
```

```{r}
# ============ LAST-MILE++ (≤40 df, ≥13k rows): ZIP-relative size, timing×neigh, age hinge ============
suppressPackageStartupMessages({ library(dplyr); library(splines); library(MASS) })

wins  <- function(x, qlo=.01, qhi=.99){ q <- quantile(x,c(qlo,qhi),na.rm=TRUE); pmin(q[2],pmax(q[1],x)) }
rmse  <- function(a,b) sqrt(mean((a-b)^2,na.rm=TRUE))
mae   <- function(a,b) mean(abs(a-b),na.rm=TRUE)
count_df <- function(f,d) ncol(model.matrix(f,d))-1L

# ---- data & core numerics ----
df <- real_estate_df
names(df) <- make.names(names(df))
if (is.character(df$SALE.PRICE)) df$SALE.PRICE <- as.numeric(gsub("[^0-9.]","",df$SALE.PRICE))
df <- df %>% filter(is.finite(SALE.PRICE), SALE.PRICE>0)

df <- df %>%
  mutate(
    GSF = pmax(1, suppressWarnings(as.numeric(GROSS.SQUARE.FEET))),
    LSF = pmax(1, suppressWarnings(as.numeric(LAND.SQUARE.FEET))),
    LOG_GSF = log(GSF), LOG_LSF = log(LSF),
    AGE = dplyr::case_when(
      !is.na(BUILDING.AGE.AT.SALE) ~ suppressWarnings(as.numeric(BUILDING.AGE.AT.SALE)),
      !is.na(YEAR.BUILT) ~ as.numeric(format(Sys.Date(),"%Y")) - suppressWarnings(as.numeric(YEAR.BUILT)),
      TRUE ~ NA_real_
    ),
    AGE_LOG = log1p(pmax(0, AGE)),
    FAR = pmin(5, pmax(0.05, GSF / pmax(1, LSF))),
    SALE.DATE = suppressWarnings(as.Date(if ("SALE.DATE" %in% names(df)) SALE.DATE else NA)),
    TIME_INDEX = as.numeric(difftime(SALE.DATE, min(SALE.DATE, na.rm=TRUE), units="days"))/365,
    ZIP = if ("ZIP.CODE" %in% names(df)) as.character(ZIP.CODE) else NA_character_
  ) %>%
  mutate(across(c(SALE.PRICE,GSF,LSF,FAR), wins))

# ---- simple target encodings (in-bag shrinked; we already validated oof earlier) ----
ylog <- log(df$SALE.PRICE)
target_enc <- function(k,y,m=70){
  mu<-mean(y,na.rm=TRUE); key<-as.character(k)
  t<-tapply(y,key,function(v){n<-sum(is.finite(v));(sum(v,na.rm=TRUE)+m*mu)/(pmax(1,n)+m)})
  out<-as.numeric(t[key]); out[!is.finite(out)]<-mu; out
}
df$NEIGH_TE  <- if ("NEIGHBORHOOD"%in%names(df)) target_enc(df$NEIGHBORHOOD,ylog,70) else NA
df$BCLASS_TE <- if ("BUILDING.CLASS.AT.PRESENT"%in%names(df)) target_enc(substr(df$BUILDING.CLASS.AT.PRESENT,1,1),ylog,60) else NA
df$TAX_TE    <- if ("TAX.CLASS.AT.PRESENT"%in%names(df)) target_enc(df$TAX.CLASS.AT.PRESENT,ylog,50) else NA

df$BCLASS_R <- if("BUILDING.CLASS.AT.PRESENT"%in%names(df))
                 as.numeric(substr(df$BUILDING.CLASS.AT.PRESENT,1,1)=="R") else 0
df$TAX_2C   <- if("TAX.CLASS.AT.PRESENT"%in%names(df))
                 as.numeric(df$TAX.CLASS.AT.PRESENT%in%c("2C","02C")) else 0
df$TAX_2    <- if("TAX.CLASS.AT.PRESENT"%in%names(df))
                 as.numeric(df$TAX.CLASS.AT.PRESENT%in%c("2","02")) else 0

# ---- NEW 1: relative size within ZIP (1 df) ----
if (!all(is.na(df$ZIP))) {
  zip_med <- tapply(df$LOG_GSF, df$ZIP, function(v) median(v, na.rm=TRUE))
  df$REL_SIZE_ZIP <- df$LOG_GSF - as.numeric(zip_med[df$ZIP])
  df$REL_SIZE_ZIP[!is.finite(df$REL_SIZE_ZIP)] <- 0
} else df$REL_SIZE_ZIP <- 0

# ---- NEW 2: timing × neighborhood TE (1 df) ----
df$TIxNE <- with(df, TIME_INDEX * NEIGH_TE)

# ---- NEW 3: very-old stock hinge (1 df; hinge ~50y on original scale) ----
cut_age <- log1p(50)
df$AGE_HINGE <- pmax(0, df$AGE_LOG - cut_age)

# model data (keep ≥13k)
feats <- c("LOG_GSF","LOG_LSF","AGE_LOG","AGE_HINGE","FAR","TIME_INDEX",
           "NEIGH_TE","BCLASS_TE","TAX_TE","BCLASS_R","TAX_2","TAX_2C",
           "REL_SIZE_ZIP","TIxNE")
dfm <- df[complete.cases(df[,c("SALE.PRICE",feats)]), c("SALE.PRICE",feats)]
stopifnot(nrow(dfm) >= 13000)

# ---- formula: keep your spine + previous interactions + 3 new terms ----
terms <- c(
  "ns(LOG_GSF,4)","ns(LOG_LSF,2)","ns(AGE_LOG,2)","AGE_HINGE","ns(TIME_INDEX,3)","FAR",
  "NEIGH_TE","BCLASS_TE","TAX_TE","BCLASS_R","TAX_2","TAX_2C",
  "REL_SIZE_ZIP","TIxNE",
  "LOG_GSF:TAX_2","LOG_GSF:BCLASS_R","FAR:LOG_GSF"   # from your strong variant
)
form_full <- as.formula(paste("log(SALE.PRICE) ~", paste(terms, collapse=" + ")))

# ---- keep ≤40 df (shrink splines first, then drop weakest new terms if needed) ----
reduce_to40 <- function(f,d){
  cur <- f
  drop_order <- c("TIxNE","REL_SIZE_ZIP","AGE_HINGE","FAR:LOG_GSF","LOG_GSF:TAX_2","LOG_GSF:BCLASS_R")
  for (iter in 1:120){
    k <- try(count_df(cur,d), silent=TRUE)
    if (!inherits(k,"try-error") && k <= 40) return(cur)
    # step 1: reduce LOG_GSF spline 4->3->2
    fstr <- deparse(cur)
    if (grepl("ns\\(LOG_GSF,4\\)", fstr)) { fstr <- gsub("ns\\(LOG_GSF,4\\)","ns(LOG_GSF,3)", fstr)
    } else if (grepl("ns\\(LOG_GSF,3\\)", fstr)) { fstr <- gsub("ns\\(LOG_GSF,3\\)","ns(LOG_GSF,2)", fstr)
    } else {
      # step 2: drop lowest-priority single-df terms one by one
      for (t in drop_order) {
        if (grepl(paste0("(^|\\+|\\s)", gsub("\\+","\\\\+",t), "($|\\+|\\s)"), fstr)) {
          fstr <- sub(paste0("\\+\\s*", t), "", fstr); break
        }
      }
    }
    cur <- as.formula(fstr)
  }
  cur
}
form <- reduce_to40(form_full, dfm)

# ---- fit two variants: robust vs. OLS, pick lower RMSE (tie -> higher R^2) ----
fit_and_eval <- function(dat, form, robust = FALSE, k = 1.5){
  # base fit (unweighted) to get residual scale
  m0 <- lm(form, data = dat)
  if (robust){
    # standardized residuals
    sig <- summary(m0)$sigma
    u   <- residuals(m0) / sig

    # Huber weights: w = min(1, k/|u|)
    w    <- pmin(1, k / pmax(1e-8, abs(u)))
    w[!is.finite(w)] <- 1

    # attach to data so lm can see it
    dat$.__w <- w
    m <- lm(form, data = dat, weights = .__w, na.action = na.exclude)
  } else {
    m <- m0
  }

  # predictions on log-scale, then Duan smearing to get dollars
  pred_log <- predict(m, newdata = dat, na.action = na.exclude)
  smear    <- mean(exp(residuals(m)), na.rm = TRUE)
  yhat     <- exp(pred_log) * smear
  y        <- dat$SALE.PRICE

  rmse  <- sqrt(mean((y - yhat)^2, na.rm = TRUE))
  mae   <- mean(abs(y - yhat), na.rm = TRUE)
  r2    <- 1 - sum((y - yhat)^2, na.rm = TRUE) / sum((y - mean(y, na.rm = TRUE))^2, na.rm = TRUE)
  dfmod <- ncol(model.matrix(form, dat)) - 1L

  list(m = m, smear = smear, rmse = rmse, mae = mae, r2 = r2,
       adjr2 = summary(m)$adj.r.squared, df = dfmod)
}
res_ols  <- fit_and_eval(dfm, form, robust=FALSE)
res_rob  <- fit_and_eval(dfm, form, robust=TRUE)
best <- if (res_rob$rmse + 1e-9 < res_ols$rmse) res_rob else if (abs(res_rob$rmse-res_ols$rmse)<1e-9 && res_rob$r2>res_ols$r2) res_rob else res_ols

cat("\n=========== LAST-MILE++ RESULTS ===========\n")
cat("Formula:\n  ", deparse(form), "\n\n")
cat(sprintf("Rows: %d (≥13k) | df: %d (≤40)\n", nrow(dfm), best$df))
cat(sprintf("Adj R^2 (log): %.3f | Duan smearing: %.4f | Fit: %s\n",
            best$adjr2, best$smear, if (identical(best,res_rob)) "robust" else "OLS"))
cat(sprintf("$-scale RMSE: $%s | MAE: $%s | R^2: %.3f\n",
            format(round(best$rmse), big.mark=","), format(round(best$mae), big.mark=","), best$r2))

```

```{r}
# ==== COEFFICIENT TABLE & INFERENCE (for the chosen 'best$m') ====

pretty_stars <- function(p){
  cut(p, breaks=c(-Inf, .001, .01, .05, .1, Inf),
      labels=c("***","**","*","."," "), right=FALSE)
}

summ   <- summary(best$m)
coeft  <- as.data.frame(summ$coefficients)
names(coeft) <- c("Estimate","Std.Error","t.value","Pr(>|t|)")
coeft$Signif <- pretty_stars(coeft[["Pr(>|t|)"]])

# 95% CIs (base-R; works for weighted lm as well)
ci <- try(confint(best$m), silent=TRUE)
if (!inherits(ci, "try-error")) {
  coeft$CI_2.5  <- ci[rownames(coeft), 1]
  coeft$CI_97.5 <- ci[rownames(coeft), 2]
}

# Order by absolute t
coeft$Abs_t <- abs(coeft$`t.value`)
coeft <- coeft[order(-coeft$Abs_t), ]

cat("\n=========== COEFFICIENT TABLE (sorted by |t|) ===========\n")
printCoefmat(as.matrix(coeft[, c("Estimate","Std.Error","t.value","Pr(>|t|)")]),
             P.values=TRUE, has.Pvalue=TRUE, signif.stars=FALSE, digits=4)
cat("\nSignif. codes:  0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1\n")

# Add stars & CIs in a tidy view
cat("\nTop 25 by |t| (with stars & 95% CI):\n")
print(utils::head(coeft[, c("Estimate","Std.Error","t.value","Pr(>|t|)","Signif","CI_2.5","CI_97.5")], 25), digits=4)

# Model overview again for convenience
cat("\n=========== MODEL OVERVIEW ===========\n")
cat(sprintf("Observations: %d\n", nobs(best$m)))
cat(sprintf("Model df (excl. intercept): %d\n", best$df))
cat(sprintf("Adj R^2 (log): %.3f | Duan smearing: %.4f | Fit: %s\n",
            best$adjr2, best$smear, if (identical(best$rmse, res_rob$rmse) && identical(best$r2, res_rob$r2)) "robust" else "OLS"))
cat(sprintf("$-scale RMSE: $%s | MAE: $%s | R^2: %.3f\n",
            format(round(best$rmse), big.mark=","), format(round(best$mae), big.mark=","), best$r2))

# Optional: write to CSV for the writeup
out_tab <- coeft[, c("Estimate","Std.Error","t.value","Pr(>|t|)","Signif","CI_2.5","CI_97.5","Abs_t")]
utils::write.csv(out_tab, "last_mile_coefficients.csv", row.names=TRUE)
cat("\nSaved coefficients to: last_mile_coefficients.csv\n")
```