# install.packages("openxlsx")

suppressPackageStartupMessages({
  library(readxl)     # Opens and reads Excel files directly into R
  library(dplyr)      # Cleans, filters, and transforms data with simple commands
  library(ggplot2)    # Builds professional and publication-quality charts
  library(car)        # Tests for multicollinearity using Variance Inflation Factors
  library(lmtest)     # Runs formal statistical tests on regression model residuals
  library(gridExtra)  # Arranges multiple charts side by side on one page
  library(scales)     # Formats numbers on chart axes as dollar amounts
  library(openxlsx)    # Saves the final transformed dataset back to Excel
})

# Ensures results are reproducible — same output every run
set.seed(42)

# All charts will be saved here automatically
dir.create("output_plots", showWarnings = FALSE)
# ══════════════════════════════════════════════════════════════════
# PART 2: Load the Dataset
# We read the Excel file into R and store it as "raw".
# Make sure the Excel file is in the same folder as this R script.
# ══════════════════════════════════════════════════════════════════

raw <- read_excel("C:/Users/ravit/Downloads/SAMPLE_DATA Final.xlsx")

# Print a quick summary so we can confirm the data loaded correctly
cat("Total number of sales      :", nrow(raw), "\n")
## Total number of sales      : 121
cat("Total number of columns    :", ncol(raw), "\n")
## Total number of columns    : 26
cat("Earliest sale date         :", format(min(raw$SALE_DATE),  "%m/%d/%Y"), "\n")
## Earliest sale date         : 06/01/2020
cat("Latest sale date           :", format(max(raw$SALE_DATE),  "%m/%d/%Y"), "\n")
## Latest sale date           : 06/26/2021
cat("Lowest adjusted sale price :", dollar(min(raw$ADJ_SALE_PRICE)),  "\n")
## Lowest adjusted sale price : $152,600
cat("Highest adjusted sale price:", dollar(max(raw$ADJ_SALE_PRICE)), "\n")
## Highest adjusted sale price: $655,000
cat("Median adjusted sale price :", dollar(median(raw$ADJ_SALE_PRICE)), "\n")
## Median adjusted sale price : $375,000
# Check if any columns have missing values
cat("\nMissing values per column:\n")
## 
## Missing values per column:
print(colSums(is.na(raw)))
##              PARCEL_ID              SALE_DATE             SALE_PRICE 
##                      0                      0                      0 
##         ADJ_SALE_PRICE          PROPERTY_TYPE             STYLE_BLDG 
##                      0                      0                      0 
##              NBHD_TYPE         MAIN_BLDG_SQFT        UPPER_BLDG_SQFT 
##                      0                      0                      0 
##             YRBLT_BLDG     OVERALL_GRADE_BLDG OVERALL_CONDITION_BLDG 
##                      0                      0                      0 
##              BSMT_SQFT           FN_BSMT_SQFT          FN_BSMT_GRADE 
##                      0                      0                      0 
##           NUM_BEDROOMS             FULL_BATHS        THREE_QTR_BATHS 
##                      0                      0                      0 
##             HALF_BATHS           PRI_BTH_QUAL           NUM_KITCHENS 
##                      0                      0                      0 
##           PRI_KIT_QUAL          EXT_WALL_TYPE                ROOFING 
##                      0                      0                      0 
##               LANDSQFT               CAR_SQFT 
##                      0                      0
# ══════════════════════════════════════════════════════════════════════════════
#  PART 3 — Ordinal Encoding
#
#  Regression requires numeric inputs. Quality and condition fields
#  stored as text (e.g. "Fair", "Average") are converted to an
#  ordered numeric scale that preserves their inherent ranking.
#
#  Building grade and condition  →  Low Cost / Poor = 1  through  Excellent = 6
#  Bathroom and kitchen quality  →  Basic = 1, Standard = 2, Modern = 3
#  Basement finish grade         →  No basement = 0  through  Excellent = 6
# ══════════════════════════════════════════════════════════════════════════════

grade_map <- c(
  "Low Cost"  = 1L, "Poor"      = 1L,
  "Fair"      = 2L, "Average"   = 3L,
  "Good"      = 4L, "Very Good" = 5L, "Excellent" = 6L
)

cond_map      <- grade_map

bth_qual_map  <- c("BASIC" = 1L, "STANDARD" = 2L, "MODERN" = 3L)
kit_qual_map  <- c("BASIC" = 1L, "STANDARD" = 2L, "MODERN" = 3L)

bsmt_grade_map <- c(
  "None"      = 0L,
  "Low Cost"  = 1L, "Poor"      = 1L,
  "Fair"      = 2L, "Average"   = 3L,
  "Good"      = 4L, "Very Good" = 5L, "Excellent" = 6L
)

cat("Encoding confirmed — grade and quality scales defined.\n")
## Encoding confirmed — grade and quality scales defined.
# ══════════════════════════════════════════════════════════════════════════════
#  PART 4 — Feature Engineering
#
#  We construct the analysis-ready dataset by deriving variables
#  that are more informative for regression than the raw fields alone.
#
#  TOTAL_LIVING_SQFT   — Combines main and upper floor area into the
#                         single above-grade living area figure that
#                         appraisers and buyers use as the primary
#                         size metric.
#
#  TOTAL_BATH_EQUIV    — Converts mixed bath types into one comparable
#                         number using appraisal convention:
#                         Full = 1.0,  Three-Quarter = 0.75,  Half = 0.5
#
#  UNFIN_BSMT_SQFT     — Separating total basement into finished and
#                         unfinished components eliminates the severe
#                         multicollinearity (VIF > 11) that results
#                         from including both BSMT_SQFT and FN_BSMT_SQFT
#                         together. Each component is now an independent
#                         value driver.
#
#  AGE                 — Chronological age at time of sale captures
#                         physical depreciation not reflected in
#                         grade or condition alone.
#
#  LOG_LANDSQFT        — Land area is right-skewed due to a small
#                         number of large lots. Log transformation
#                         linearises its relationship with price.
#
#  BSMT_GRADE_NUM = 0  — Properties with no basement receive a grade
#                         of 0, distinguishing them from the lowest
#                         finished basement grade of 1.
# ══════════════════════════════════════════════════════════════════════════════

df <- raw %>%
  mutate(
    TOTAL_LIVING_SQFT = MAIN_BLDG_SQFT + UPPER_BLDG_SQFT,
    TOTAL_BATH_EQUIV  = FULL_BATHS + (0.75 * THREE_QTR_BATHS) + (0.5 * HALF_BATHS),
    SALE_YEAR         = as.integer(format(SALE_DATE, "%Y")),
    AGE               = SALE_YEAR - YRBLT_BLDG,
    UNFIN_BSMT_SQFT   = BSMT_SQFT - FN_BSMT_SQFT,
    LOG_LANDSQFT      = log1p(LANDSQFT),
    GRADE_NUM         = grade_map[OVERALL_GRADE_BLDG],
    COND_NUM          = cond_map[OVERALL_CONDITION_BLDG],
    BTH_QUAL_NUM      = bth_qual_map[PRI_BTH_QUAL],
    KIT_QUAL_NUM      = kit_qual_map[PRI_KIT_QUAL],
    BSMT_GRADE_NUM    = ifelse(is.na(FN_BSMT_GRADE), 0L,
                               bsmt_grade_map[FN_BSMT_GRADE]),
    HAS_CONCESSION    = as.integer(SALE_PRICE != ADJ_SALE_PRICE)
  )

cat(sprintf("Analysis dataset ready: %d rows × %d columns\n\n", nrow(df), ncol(df)))
## Analysis dataset ready: 121 rows × 38 columns
print(summary(df[, c("ADJ_SALE_PRICE", "TOTAL_LIVING_SQFT",
                      "UNFIN_BSMT_SQFT", "FN_BSMT_SQFT", "AGE",
                      "GRADE_NUM", "COND_NUM", "TOTAL_BATH_EQUIV",
                      "CAR_SQFT", "LOG_LANDSQFT",
                      "BTH_QUAL_NUM", "KIT_QUAL_NUM")]))
##  ADJ_SALE_PRICE   TOTAL_LIVING_SQFT UNFIN_BSMT_SQFT    FN_BSMT_SQFT   
##  Min.   :152600   Min.   : 676      Min.   :   0.00   Min.   :   0.0  
##  1st Qu.:327000   1st Qu.: 980      1st Qu.:   0.00   1st Qu.:   0.0  
##  Median :375000   Median :1408      Median :   0.00   Median :   0.0  
##  Mean   :384772   Mean   :1417      Mean   :  95.67   Mean   : 330.8  
##  3rd Qu.:439000   3rd Qu.:1673      3rd Qu.:  50.00   3rd Qu.: 820.0  
##  Max.   :655000   Max.   :2480      Max.   :1394.00   Max.   :1440.0  
##       AGE           GRADE_NUM        COND_NUM    TOTAL_BATH_EQUIV
##  Min.   :  3.00   Min.   :1.000   Min.   :2.00   Min.   :1.000   
##  1st Qu.:  7.00   1st Qu.:2.000   1st Qu.:4.00   1st Qu.:1.750   
##  Median :  8.00   Median :3.000   Median :6.00   Median :2.250   
##  Mean   : 30.67   Mean   :2.554   Mean   :4.95   Mean   :2.262   
##  3rd Qu.: 47.00   3rd Qu.:3.000   3rd Qu.:6.00   3rd Qu.:2.750   
##  Max.   :122.00   Max.   :3.000   Max.   :6.00   Max.   :4.500   
##     CAR_SQFT      LOG_LANDSQFT    BTH_QUAL_NUM   KIT_QUAL_NUM 
##  Min.   :  0.0   Min.   :6.080   Min.   :1.00   Min.   :1.00  
##  1st Qu.:336.0   1st Qu.:6.771   1st Qu.:1.00   1st Qu.:1.00  
##  Median :422.0   Median :8.274   Median :2.00   Median :2.00  
##  Mean   :393.7   Mean   :7.802   Mean   :1.57   Mean   :1.57  
##  3rd Qu.:480.0   3rd Qu.:8.785   3rd Qu.:2.00   3rd Qu.:2.00  
##  Max.   :784.0   Max.   :9.511   Max.   :2.00   Max.   :2.00
# ══════════════════════════════════════════════════════════════════════════════
#  PART 5 — Exploratory Chart: Sale Price Distribution
#
#  Before modelling, we examine whether sale prices are roughly
#  normally distributed. Heavy skew would suggest a log-transformation
#  of the dependent variable may improve model fit. The dashed
#  vertical line marks the mean; the shape of the bars shows spread.
# ══════════════════════════════════════════════════════════════════════════════

pal_main  <- "#2C5F8A"
pal_acc   <- "#E05C2C"
pal_light <- "#A8C8E8"

p1 <- ggplot(df, aes(x = ADJ_SALE_PRICE)) +
  geom_histogram(bins = 25, fill = pal_main,
                 colour = "white", alpha = 0.85) +
  geom_vline(aes(xintercept = mean(ADJ_SALE_PRICE)),
             colour = pal_acc, linewidth = 1, linetype = "dashed") +
  scale_x_continuous(labels = dollar_format(scale = 1e-3, suffix = "K")) +
  labs(
    title    = "Distribution of Adjusted Sale Price",
    subtitle = "Dashed line = mean  |  n = 121 sales over 18 months",
    x        = "Adjusted Sale Price",
    y        = "Number of Sales"
  ) +
  theme_minimal(base_size = 12) +
  theme(plot.title = element_text(face = "bold", colour = pal_main))

print(p1)

# ══════════════════════════════════════════════════════════════════════════════
#  PART 6 — Exploratory Chart: Sale Price vs. Total Living Area
#
#  The scatter plot reveals the relationship between above-grade
#  living area and sale price before modelling. A strong upward
#  trend confirms this variable belongs in the model. Points are
#  coloured by property type to identify any clustering effects.
# ══════════════════════════════════════════════════════════════════════════════

p2 <- ggplot(df, aes(x = TOTAL_LIVING_SQFT, y = ADJ_SALE_PRICE,
                      colour = PROPERTY_TYPE)) +
  geom_point(alpha = 0.7, size = 2.5) +
  geom_smooth(
    method      = "lm",
    se          = TRUE,
    colour      = pal_acc,
    fill        = pal_light,
    linewidth   = 1.1,
    inherit.aes = FALSE,
    aes(x = TOTAL_LIVING_SQFT, y = ADJ_SALE_PRICE)
  ) +
  scale_y_continuous(labels = dollar_format(scale = 1e-3, suffix = "K")) +
  labs(
    title    = "Adjusted Sale Price vs. Total Above-Grade Living Area",
    subtitle = "OLS trend line with 95% confidence interval",
    x        = "Total Living Area (sq ft)",
    y        = "Adjusted Sale Price",
    colour   = "Property Type"
  ) +
  theme_minimal(base_size = 12) +
  theme(plot.title = element_text(face = "bold", colour = pal_main))

print(p2)
## `geom_smooth()` using formula = 'y ~ x'

# ══════════════════════════════════════════════════════════════════════════════
#  PART 7 — Exploratory Charts: Price by Grade and Condition
#
#  Box plots confirm whether median sale price rises consistently
#  across each grade and condition level as expected.
#  NA values in condition are removed before plotting.
# ══════════════════════════════════════════════════════════════════════════════

grade_labels <- c("1" = "Low Cost", "2" = "Fair",   "3" = "Average",
                  "4" = "Good",     "5" = "Very Good", "6" = "Excellent")

p3 <- df %>%
  mutate(Grade_Label = factor(grade_labels[as.character(GRADE_NUM)],
                               levels = grade_labels)) %>%
  filter(!is.na(Grade_Label)) %>%
  ggplot(aes(x = Grade_Label, y = ADJ_SALE_PRICE, fill = Grade_Label)) +
  geom_boxplot(outlier.colour = pal_acc, outlier.shape = 16,
               outlier.size = 2, alpha = 0.75) +
  scale_y_continuous(labels = dollar_format(scale = 1e-3, suffix = "K")) +
  scale_fill_brewer(palette = "Blues") +
  labs(
    title    = "Sale Price by Building Grade",
    subtitle = "Box spans the middle 50% of prices  |  Centre line = median",
    x        = "Building Grade",
    y        = "Adjusted Sale Price"
  ) +
  theme_minimal(base_size = 11) +
  theme(
    plot.title      = element_text(face = "bold", colour = pal_main,
                                   size = 11),
    plot.subtitle   = element_text(size = 8),
    legend.position = "none",
    axis.text.x     = element_text(angle = 25, hjust = 1, size = 9),
    plot.margin     = margin(10, 15, 10, 10)
  )

cond_labels <- c("1" = "Fair",  "2" = "Average", "3" = "Good",
                 "4" = "Very Good", "5" = "Excellent")

p4 <- df %>%
  mutate(Cond_Label = factor(cond_labels[as.character(COND_NUM)],
                              levels = cond_labels)) %>%
  filter(!is.na(Cond_Label)) %>%
  ggplot(aes(x = Cond_Label, y = ADJ_SALE_PRICE, fill = Cond_Label)) +
  geom_boxplot(outlier.colour = pal_acc, outlier.shape = 16,
               outlier.size = 2, alpha = 0.75) +
  scale_y_continuous(labels = dollar_format(scale = 1e-3, suffix = "K")) +
  scale_fill_brewer(palette = "Greens") +
  labs(
    title    = "Sale Price by Building Condition",
    subtitle = "Box spans the middle 50% of prices  |  Centre line = median",
    x        = "Building Condition",
    y        = "Adjusted Sale Price"
  ) +
  theme_minimal(base_size = 11) +
  theme(
    plot.title      = element_text(face = "bold", colour = pal_main,
                                   size = 11),
    plot.subtitle   = element_text(size = 8),
    legend.position = "none",
    axis.text.x     = element_text(angle = 25, hjust = 1, size = 9),
    plot.margin     = margin(10, 15, 10, 10)
  )

grid.arrange(p3, p4, ncol = 2,
             top = grid::textGrob(
               "Sale Price Distribution by Grade and Condition",
               gp = grid::gpar(fontface = "bold",
                               fontsize = 13,
                               col = pal_main)
             ))

# ══════════════════════════════════════════════════════════════════════════════
#  PART 8 — Correlation Matrix
#
#  Pearson correlations are calculated between all numeric model
#  candidates and displayed as a colour-coded heatmap. Variables with
#  high correlation to ADJ_SALE_PRICE are strong model candidates.
#  High correlations between two predictors signal potential
#  multicollinearity — addressed in Part 10 via VIF diagnostics.
# ══════════════════════════════════════════════════════════════════════════════

num_vars <- df %>%
  select(ADJ_SALE_PRICE, TOTAL_LIVING_SQFT, UNFIN_BSMT_SQFT, FN_BSMT_SQFT,
         AGE, GRADE_NUM, COND_NUM, BTH_QUAL_NUM, KIT_QUAL_NUM,
         BSMT_GRADE_NUM, CAR_SQFT, LOG_LANDSQFT, TOTAL_BATH_EQUIV)

cor_mat <- cor(num_vars, use = "complete.obs")

# Print ranked correlations with sale price for quick reference
cat("Correlations with ADJ_SALE_PRICE — ranked highest to lowest:\n")
## Correlations with ADJ_SALE_PRICE — ranked highest to lowest:
price_cors <- sort(cor_mat["ADJ_SALE_PRICE", -1], decreasing = TRUE)
print(round(price_cors, 3))
##  TOTAL_BATH_EQUIV         GRADE_NUM TOTAL_LIVING_SQFT    BSMT_GRADE_NUM 
##             0.565             0.455             0.453             0.410 
##      BTH_QUAL_NUM      FN_BSMT_SQFT      KIT_QUAL_NUM          COND_NUM 
##             0.410             0.407             0.396             0.385 
##   UNFIN_BSMT_SQFT          CAR_SQFT      LOG_LANDSQFT               AGE 
##             0.345             0.271             0.067            -0.465
# Build correlation heatmap using ggplot2 (no corrplot package needed)
cor_melted        <- expand.grid(Var1 = colnames(cor_mat),
                                  Var2 = colnames(cor_mat))
cor_melted$value  <- as.vector(cor_mat)

p_cor <- ggplot(cor_melted, aes(x = Var1, y = Var2, fill = value)) +
  geom_tile(colour = "white") +
  geom_text(aes(label = round(value, 2)), size = 2.8, colour = "black") +
  scale_fill_gradient2(
    low      = "#C00000",
    mid      = "white",
    high     = "#2C5F8A",
    midpoint = 0,
    limits   = c(-1, 1),
    name     = "Correlation"
  ) +
  labs(
    title = "Pairwise Correlation Matrix — All Candidate Variables",
    x     = NULL,
    y     = NULL
  ) +
  theme_minimal(base_size = 10) +
  theme(
    plot.title  = element_text(face = "bold", colour = pal_main),
    axis.text.x = element_text(angle = 45, hjust = 1)
  )

print(p_cor)

# ══════════════════════════════════════════════════════════════════════════════
#  PART 9 — Multiple Linear Regression Model
#
#  We fit an OLS (Ordinary Least Squares) regression model with
#  ADJ_SALE_PRICE as the dependent variable. The model finds the
#  combination of predictor weights that minimises the total
#  squared difference between predicted and actual sale prices.
#
#  All 11 predictors are entered simultaneously (Enter method).
#  Variable selection rationale is documented in Part 4.
# ══════════════════════════════════════════════════════════════════════════════

model <- lm(
  ADJ_SALE_PRICE ~
    TOTAL_LIVING_SQFT +   # Above-grade living area (sq ft)
    UNFIN_BSMT_SQFT   +   # Unfinished basement area (sq ft)
    FN_BSMT_SQFT      +   # Finished basement area (sq ft)
    GRADE_NUM         +   # Overall building quality (1–6 scale)
    COND_NUM          +   # Overall building condition (1–6 scale)
    AGE               +   # Structure age at time of sale (years)
    TOTAL_BATH_EQUIV  +   # Total bath equivalents
    CAR_SQFT          +   # Garage area (sq ft)
    LOG_LANDSQFT      +   # Log-transformed land area
    BTH_QUAL_NUM      +   # Primary bathroom finish quality (1–3)
    KIT_QUAL_NUM,         # Primary kitchen finish quality (1–3)
  data = df
)

print(summary(model))
## 
## Call:
## lm(formula = ADJ_SALE_PRICE ~ TOTAL_LIVING_SQFT + UNFIN_BSMT_SQFT + 
##     FN_BSMT_SQFT + GRADE_NUM + COND_NUM + AGE + TOTAL_BATH_EQUIV + 
##     CAR_SQFT + LOG_LANDSQFT + BTH_QUAL_NUM + KIT_QUAL_NUM, data = df)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -101427  -37556   -5808   37799  135283 
## 
## Coefficients:
##                     Estimate Std. Error t value Pr(>|t|)    
## (Intercept)        34453.760 133524.206   0.258 0.796867    
## TOTAL_LIVING_SQFT     56.156     15.913   3.529 0.000612 ***
## UNFIN_BSMT_SQFT       80.936     25.680   3.152 0.002096 ** 
## FN_BSMT_SQFT          84.300     21.297   3.958 0.000135 ***
## GRADE_NUM           8774.503  15167.192   0.579 0.564107    
## COND_NUM           10962.632  11317.143   0.969 0.334852    
## AGE                 -432.101    294.968  -1.465 0.145824    
## TOTAL_BATH_EQUIV    1273.198  12499.960   0.102 0.919058    
## CAR_SQFT              -1.874     34.459  -0.054 0.956730    
## LOG_LANDSQFT       18486.237  12704.402   1.455 0.148514    
## BTH_QUAL_NUM       11394.507  23091.275   0.493 0.622684    
## KIT_QUAL_NUM        4729.111  17983.359   0.263 0.793069    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 51150 on 109 degrees of freedom
## Multiple R-squared:  0.6495, Adjusted R-squared:  0.6141 
## F-statistic: 18.36 on 11 and 109 DF,  p-value: < 2.2e-16
# ══════════════════════════════════════════════════════════════════════════════
#  PART 10 — Variance Inflation Factor (VIF) Diagnostics
#  VIF measures how much the variance of a coefficient is inflated
#  because of its correlation with other predictors. Elevated VIF
#  makes individual coefficients unstable and harder to interpret.
#  Accepted thresholds:
#    VIF < 5   — no concern
#    VIF 5–10  — moderate; review but acceptable in practice
#    VIF > 10  — high; consider removing or combining the variable
# ══════════════════════════════════════════════════════════════════════════════

vif_vals <- vif(model)

cat("═══════════════════════════════════════════════\n")
## ═══════════════════════════════════════════════
cat("  VARIANCE INFLATION FACTORS\n")
##   VARIANCE INFLATION FACTORS
cat("═══════════════════════════════════════════════\n")
## ═══════════════════════════════════════════════
print(round(vif_vals, 2))
## TOTAL_LIVING_SQFT   UNFIN_BSMT_SQFT      FN_BSMT_SQFT         GRADE_NUM 
##              2.45              1.77              3.92              4.56 
##          COND_NUM               AGE  TOTAL_BATH_EQUIV          CAR_SQFT 
##             10.27              4.79              3.73              1.33 
##      LOG_LANDSQFT      BTH_QUAL_NUM      KIT_QUAL_NUM 
##              8.40              6.04              3.67
cat("\nVariables with VIF above 5:\n")
## 
## Variables with VIF above 5:
high_vif <- vif_vals[vif_vals > 5]
if (length(high_vif) > 0) print(round(high_vif, 2)) else cat("None.\n")
##     COND_NUM LOG_LANDSQFT BTH_QUAL_NUM 
##        10.27         8.40         6.04
# ══════════════════════════════════════════════════════════════════════════════
#  PART 11 — Breusch-Pagan Test for Heteroskedasticity
#  A core OLS assumption is that residuals have constant variance
#  across all fitted values. If errors grow larger as price increases,
#  the model is heteroskedastic and standard errors may be understated.
#  H0 (null hypothesis) : residual variance is constant
#  p > 0.05  →  assumption is satisfied
#  p < 0.05  →  heteroskedasticity present — note in report
# ══════════════════════════════════════════════════════════════════════════════

bp_test <- bptest(model)

cat("═══════════════════════════════════════════════\n")
## ═══════════════════════════════════════════════
cat("  BREUSCH-PAGAN TEST RESULT\n")
##   BREUSCH-PAGAN TEST RESULT
cat("═══════════════════════════════════════════════\n")
## ═══════════════════════════════════════════════
print(bp_test)
## 
##  studentized Breusch-Pagan test
## 
## data:  model
## BP = 24.283, df = 11, p-value = 0.01159
if (bp_test$p.value < 0.05) {
  cat("\nInterpretation: Heteroskedasticity detected (p =",
      round(bp_test$p.value, 4), ").\n")
  cat("Prediction errors tend to grow with price — a common\n")
  cat("characteristic of mixed residential portfolios. Results\n")
  cat("remain valid but should be noted in the final report.\n")
} else {
  cat("\nInterpretation: Residual variance is consistent across\n")
  cat("the price range. Core OLS assumption satisfied.\n")
}
## 
## Interpretation: Heteroskedasticity detected (p = 0.0116 ).
## Prediction errors tend to grow with price — a common
## characteristic of mixed residential portfolios. Results
## remain valid but should be noted in the final report.
# ══════════════════════════════════════════════════════════════════════════════
#  PART 12 — IAAO Mass Appraisal Accuracy Statistics
#  Median Assessment Ratio (MAR) — predicted divided by actual price.
#  Target is 1.00. Values above 1 indicate systematic over-prediction.
#  Coefficient of Dispersion (COD) — measures uniformity of predictions.
#  IAAO residential standard: COD below 15%. Below 10% is excellent.
#  Price-Related Differential (PRD) — detects regressivity.
#  IAAO standard: 0.98 to 1.03.
#  RMSE — average dollar prediction error across all properties.
# ══════════════════════════════════════════════════════════════════════════════

fitted_prices <- fitted(model)
ratios        <- fitted_prices / df$ADJ_SALE_PRICE
median_ratio  <- median(ratios)
cod           <- (mean(abs(ratios - median_ratio)) / median_ratio) * 100
prd           <- mean(ratios) / weighted.mean(ratios, df$ADJ_SALE_PRICE)
rmse          <- sqrt(mean(residuals(model)^2))

cat("═══════════════════════════════════════════════════════════\n")
## ═══════════════════════════════════════════════════════════
cat("  MASS APPRAISAL ACCURACY STATISTICS\n")
##   MASS APPRAISAL ACCURACY STATISTICS
cat("═══════════════════════════════════════════════════════════\n")
## ═══════════════════════════════════════════════════════════
cat(sprintf("  R-Squared (R²)           : %.4f  — explains %.1f%% of price variation\n",
            summary(model)$r.squared, summary(model)$r.squared * 100))
##   R-Squared (R²)           : 0.6495  — explains 64.9% of price variation
cat(sprintf("  Adjusted R-Squared       : %.4f\n",
            summary(model)$adj.r.squared))
##   Adjusted R-Squared       : 0.6141
cat(sprintf("  RMSE                     : %s  — average prediction error\n",
            dollar(rmse)))
##   RMSE                     : $48,546.54  — average prediction error
cat(sprintf("  Median Assessment Ratio  : %.4f  — target ≈ 1.00\n",
            median_ratio))
##   Median Assessment Ratio  : 1.0153  — target ≈ 1.00
cat(sprintf("  COD                      : %.2f%%  — IAAO standard: below 15%%\n",
            cod))
##   COD                      : 10.22%  — IAAO standard: below 15%
cat(sprintf("  PRD                      : %.4f  — IAAO standard: 0.98 to 1.03\n",
            prd))
##   PRD                      : 1.0160  — IAAO standard: 0.98 to 1.03
cat("═══════════════════════════════════════════════════════════\n")
## ═══════════════════════════════════════════════════════════
# ══════════════════════════════════════════════════════════════════════════════
#  PART 13 — Diagnostic Chart: Actual vs. Predicted Prices
#  Each point represents one sale. The dashed diagonal is the line
#  of perfect prediction where predicted equals actual. Points
#  clustered tightly along this line indicate a well-calibrated model.
# ══════════════════════════════════════════════════════════════════════════════

df_diag <- df %>%
  mutate(
    Fitted    = fitted(model),
    Residuals = residuals(model),
    Std_Resid = rstandard(model),
    Ratio     = fitted(model) / ADJ_SALE_PRICE
  )

pA <- ggplot(df_diag, aes(x = Fitted, y = ADJ_SALE_PRICE)) +
  geom_point(colour = pal_main, alpha = 0.7, size = 2.5) +
  geom_abline(intercept = 0, slope = 1,
              colour = pal_acc, linewidth = 1.1, linetype = "dashed") +
  scale_x_continuous(labels = dollar_format(scale = 1e-3, suffix = "K")) +
  scale_y_continuous(labels = dollar_format(scale = 1e-3, suffix = "K")) +
  labs(
    title    = "Actual vs. Model-Predicted Sale Price",
    subtitle = "Points near the diagonal = accurate predictions",
    x        = "Model-Predicted Price",
    y        = "Actual Adjusted Sale Price"
  ) +
  theme_minimal(base_size = 11) +
  theme(plot.title = element_text(face = "bold", colour = pal_main))

print(pA)

# ══════════════════════════════════════════════════════════════════════════════
#  PART 14 — Diagnostic Chart: Residuals vs. Fitted Values
#  Residuals are the difference between actual and predicted prices.
#  A well-specified model produces residuals scattered randomly around
#  zero with no discernible pattern. A curved smooth line suggests
#  a non-linear relationship the model has not fully captured.
# ══════════════════════════════════════════════════════════════════════════════

pB <- ggplot(df_diag, aes(x = Fitted, y = Residuals)) +
  geom_hline(yintercept = 0, colour = pal_acc,
             linewidth = 1, linetype = "dashed") +
  geom_point(colour = pal_main, alpha = 0.7, size = 2.5) +
  geom_smooth(method = "loess", se = FALSE,
              colour = "firebrick", linewidth = 0.9) +
  scale_x_continuous(labels = dollar_format(scale = 1e-3, suffix = "K")) +
  scale_y_continuous(labels = dollar_format(scale = 1e-3, suffix = "K")) +
  labs(
    title    = "Residuals vs. Fitted Values",
    subtitle = "Random scatter around zero = well-specified model",
    x        = "Model-Predicted Price",
    y        = "Residual  (Actual minus Predicted)"
  ) +
  theme_minimal(base_size = 11) +
  theme(plot.title = element_text(face = "bold", colour = pal_main))

print(pB)
## `geom_smooth()` using formula = 'y ~ x'

# ══════════════════════════════════════════════════════════════════════════════
#  PART 15 — Diagnostic Charts: Q-Q Plot and Scale-Location
#  Normal Q-Q Plot — points tracking the diagonal confirm residuals
#  are approximately normally distributed.
#  Scale-Location Plot — a flat loess trend confirms homoskedasticity;
#  prediction error magnitude is consistent across all price levels.
# ══════════════════════════════════════════════════════════════════════════════

pC <- ggplot(df_diag, aes(sample = Std_Resid)) +
  stat_qq(colour = pal_main, alpha = 0.8, size = 2.5) +
  stat_qq_line(colour = pal_acc, linewidth = 1.1, linetype = "dashed") +
  labs(
    title    = "Normal Q-Q Plot of Standardised Residuals",
    subtitle = "Points tracking the diagonal = normally distributed errors",
    x        = "Theoretical Quantiles",
    y        = "Standardised Residuals"
  ) +
  theme_minimal(base_size = 11) +
  theme(plot.title = element_text(face = "bold", colour = pal_main))

pD <- ggplot(df_diag, aes(x = Fitted, y = sqrt(abs(Std_Resid)))) +
  geom_point(colour = pal_main, alpha = 0.7, size = 2.5) +
  geom_smooth(method = "loess", se = FALSE,
              colour = pal_acc, linewidth = 1) +
  scale_x_continuous(labels = dollar_format(scale = 1e-3, suffix = "K")) +
  labs(
    title    = "Scale-Location Plot",
    subtitle = "Flat trend = consistent prediction error across the price range",
    x        = "Model-Predicted Price",
    y        = "√|Standardised Residual|"
  ) +
  theme_minimal(base_size = 11) +
  theme(plot.title = element_text(face = "bold", colour = pal_main))

grid.arrange(pC, pD, ncol = 2)
## `geom_smooth()` using formula = 'y ~ x'

# ══════════════════════════════════════════════════════════════════════════════
#  PART 16 — Ratio Study Chart
#  The ratio study is the standard IAAO diagnostic for mass appraisal
#  models. Each point is one sale's predicted-to-actual price ratio.
#  Green line  = 1.00 (perfect prediction)
#  Orange band = ±10% acceptable tolerance
#  A flat loess trend across the price range confirms the model
#  has no regressivity.
# ══════════════════════════════════════════════════════════════════════════════

pE <- ggplot(df_diag, aes(x = ADJ_SALE_PRICE, y = Ratio)) +
  geom_hline(yintercept = 1.00, colour = "darkgreen", linewidth = 1) +
  geom_hline(yintercept = c(0.90, 1.10), colour = "orange",
             linewidth = 0.8, linetype = "dashed") +
  geom_point(colour = pal_main, alpha = 0.7, size = 2.5) +
  geom_smooth(method = "loess", se = FALSE,
              colour = pal_acc, linewidth = 1) +
  scale_x_continuous(labels = dollar_format(scale = 1e-3, suffix = "K")) +
  scale_y_continuous(limits = c(0.70, 1.35),
                     breaks = seq(0.70, 1.35, 0.10)) +
  labs(
    title    = "Ratio Study — Predicted-to-Actual Price Ratio",
    subtitle = "Green = 1.00 (perfect)  |  Orange = ±10% acceptable band",
    x        = "Actual Adjusted Sale Price",
    y        = "Predicted ÷ Actual  (Assessment Ratio)"
  ) +
  theme_minimal(base_size = 11) +
  theme(plot.title = element_text(face = "bold", colour = pal_main))

print(pE)
## `geom_smooth()` using formula = 'y ~ x'
## Warning: Removed 2 rows containing non-finite outside the scale range
## (`stat_smooth()`).
## Warning: Removed 2 rows containing missing values or values outside the scale range
## (`geom_point()`).

# ══════════════════════════════════════════════════════════════════════════════
#  PART 17 — Model Coefficient Bar Chart
#  Visualises the dollar contribution of each predictor.
#  Each bar represents the change in estimated sale price
#  associated with a one-unit increase in that variable,
#  holding all other variables constant.
# ══════════════════════════════════════════════════════════════════════════════

coef_df <- data.frame(
  Variable    = names(coef(model))[-1],
  Coefficient = coef(model)[-1]
) %>%
  mutate(
    Direction = ifelse(Coefficient > 0,
                       "Adds to Value", "Reduces Value"),
    Variable  = reorder(Variable, abs(Coefficient))
  )

pF <- ggplot(coef_df,
             aes(x = Variable, y = Coefficient, fill = Direction)) +
  geom_col(width = 0.65) +
  geom_hline(yintercept = 0, colour = "grey30") +
  coord_flip() +
  scale_fill_manual(values = c(
    "Adds to Value"  = pal_main,
    "Reduces Value"  = pal_acc
  )) +
  labs(
    title    = "Regression Coefficients — Dollar Impact per One-Unit Increase",
    subtitle = "Holding all other variables constant",
    x        = NULL,
    y        = "Coefficient  ($ change in estimated sale price)"
  ) +
  theme_minimal(base_size = 11) +
  theme(plot.title   = element_text(face = "bold", colour = pal_main),
        legend.title = element_blank())

print(pF)

# ══════════════════════════════════════════════════════════════════════════════
#  PART 18 — Export All Charts to PNG
#  All charts saved at high resolution to the output_plots folder.
#  Ready to insert into Word, PowerPoint, or PDF reports.
# ══════════════════════════════════════════════════════════════════════════════

# EDA overview — four charts on one page
png("output_plots/01_EDA_Overview.png", width = 1400, height = 1000, res = 120)
grid.arrange(p1, p2, p3, p4, ncol = 2)
## `geom_smooth()` using formula = 'y ~ x'
dev.off()
## png 
##   2
# Correlation heatmap
png("output_plots/02_Correlation_Matrix.png", width = 1100, height = 1000, res = 110)
print(
  ggplot(cor_melted, aes(x = Var1, y = Var2, fill = value)) +
    geom_tile(colour = "white") +
    geom_text(aes(label = round(value, 2)), size = 2.8, colour = "black") +
    scale_fill_gradient2(
      low = "#C00000", mid = "white", high = "#2C5F8A",
      midpoint = 0, limits = c(-1, 1), name = "Correlation"
    ) +
    labs(title = "Pairwise Correlation Matrix — All Candidate Variables",
         x = NULL, y = NULL) +
    theme_minimal(base_size = 10) +
    theme(plot.title  = element_text(face = "bold", colour = pal_main),
          axis.text.x = element_text(angle = 45, hjust = 1))
)
dev.off()
## png 
##   2
# Four diagnostic plots on one page
png("output_plots/03_Diagnostic_Plots.png", width = 1400, height = 1000, res = 120)
grid.arrange(pA, pB, pC, pD, ncol = 2)
## `geom_smooth()` using formula = 'y ~ x'
## `geom_smooth()` using formula = 'y ~ x'
dev.off()
## png 
##   2
# Ratio study and coefficient chart side by side
png("output_plots/04_Ratio_and_Coefficients.png", width = 1400, height = 600, res = 120)
grid.arrange(pE, pF, ncol = 2)
## `geom_smooth()` using formula = 'y ~ x'
## Warning: Removed 2 rows containing non-finite outside the scale range
## (`stat_smooth()`).
## Warning: Removed 2 rows containing missing values or values outside the scale range
## (`geom_point()`).
dev.off()
## png 
##   2
cat("All four charts exported to ./output_plots/\n")
## All four charts exported to ./output_plots/
# ══════════════════════════════════════════════════════════════════════════════
#  PART 19 — Export the Analysis-Ready Dataset to Excel
#  Includes all original columns plus the 11 engineered variables
#  created in Part 4. Serves as the complete audit trail of every
#  transformation applied to the raw data.
# ══════════════════════════════════════════════════════════════════════════════

write.xlsx(df, "SAMPLE_DATA_Transformed.xlsx")

cat(sprintf("Transformed dataset exported: %d rows × %d columns\n",
            nrow(df), ncol(df)))
## Transformed dataset exported: 121 rows × 38 columns
cat("File: SAMPLE_DATA_Transformed.xlsx\n")
## File: SAMPLE_DATA_Transformed.xlsx
# ══════════════════════════════════════════════════════════════════════════════
#  PART 20 — Final Model Results Summary
#  Consolidated output of all key statistics for inclusion in the
#  written report. Run this after all prior parts are complete.
# ══════════════════════════════════════════════════════════════════════════════

s <- summary(model)

cat("
╔══════════════════════════════════════════════════════════════════════╗
║                  FINAL MODEL RESULTS SUMMARY                        ║
╚══════════════════════════════════════════════════════════════════════╝

  GOODNESS OF FIT
  ─────────────────────────────────────────────────────────────────────\n")
## 
## ╔══════════════════════════════════════════════════════════════════════╗
## ║                  FINAL MODEL RESULTS SUMMARY                        ║
## ╚══════════════════════════════════════════════════════════════════════╝
## 
##   GOODNESS OF FIT
##   ─────────────────────────────────────────────────────────────────────
cat(sprintf("  R-Squared (R²)       : %.4f   — model accounts for %.1f%% of price variance\n",
            s$r.squared, s$r.squared * 100))
##   R-Squared (R²)       : 0.6495   — model accounts for 64.9% of price variance
cat(sprintf("  Adjusted R-Squared   : %.4f\n", s$adj.r.squared))
##   Adjusted R-Squared   : 0.6141
cat(sprintf("  RMSE                 : %s   — average dollar prediction error\n",
            dollar(rmse)))
##   RMSE                 : $48,546.54   — average dollar prediction error
cat(sprintf("  F-Statistic          : %.2f   (p-value < 0.001) — model is significant\n",
            s$fstatistic[1]))
##   F-Statistic          : 18.36   (p-value < 0.001) — model is significant
cat(sprintf("  Observations         : %d\n", nrow(df)))
##   Observations         : 121
cat("\n  IAAO MASS APPRAISAL STANDARDS\n")
## 
##   IAAO MASS APPRAISAL STANDARDS
cat("  ─────────────────────────────────────────────────────────────────────\n")
##   ─────────────────────────────────────────────────────────────────────
cat(sprintf("  Median Assessment Ratio : %.4f   (target ≈ 1.00)\n",      median_ratio))
##   Median Assessment Ratio : 1.0153   (target ≈ 1.00)
cat(sprintf("  COD                     : %.2f%%   (IAAO standard: < 15%%)\n", cod))
##   COD                     : 10.22%   (IAAO standard: < 15%)
cat(sprintf("  PRD                     : %.4f   (IAAO standard: 0.98–1.03)\n", prd))
##   PRD                     : 1.0160   (IAAO standard: 0.98–1.03)
cat("\n  REGRESSION COEFFICIENTS\n")
## 
##   REGRESSION COEFFICIENTS
cat("  ─────────────────────────────────────────────────────────────────────\n")
##   ─────────────────────────────────────────────────────────────────────
coef_out           <- as.data.frame(coef(s))
colnames(coef_out) <- c("Estimate ($)", "Std. Error", "t-value", "p-value")
print(round(coef_out, 3))
##                   Estimate ($) Std. Error t-value p-value
## (Intercept)          34453.760 133524.206   0.258   0.797
## TOTAL_LIVING_SQFT       56.156     15.913   3.529   0.001
## UNFIN_BSMT_SQFT         80.936     25.680   3.152   0.002
## FN_BSMT_SQFT            84.300     21.297   3.958   0.000
## GRADE_NUM             8774.503  15167.192   0.579   0.564
## COND_NUM             10962.632  11317.143   0.969   0.335
## AGE                   -432.101    294.968  -1.465   0.146
## TOTAL_BATH_EQUIV      1273.198  12499.960   0.102   0.919
## CAR_SQFT                -1.874     34.459  -0.054   0.957
## LOG_LANDSQFT         18486.237  12704.402   1.455   0.149
## BTH_QUAL_NUM         11394.507  23091.275   0.493   0.623
## KIT_QUAL_NUM          4729.111  17983.359   0.263   0.793
cat("\n  Significance: *** p < 0.001   ** p < 0.01   * p < 0.05\n")
## 
##   Significance: *** p < 0.001   ** p < 0.01   * p < 0.05
cat("\n  COLLINEARITY SUMMARY (VIF)\n")
## 
##   COLLINEARITY SUMMARY (VIF)
cat("  ─────────────────────────────────────────────────────────────────────\n")
##   ─────────────────────────────────────────────────────────────────────
print(round(vif_vals, 2))
## TOTAL_LIVING_SQFT   UNFIN_BSMT_SQFT      FN_BSMT_SQFT         GRADE_NUM 
##              2.45              1.77              3.92              4.56 
##          COND_NUM               AGE  TOTAL_BATH_EQUIV          CAR_SQFT 
##             10.27              4.79              3.73              1.33 
##      LOG_LANDSQFT      BTH_QUAL_NUM      KIT_QUAL_NUM 
##              8.40              6.04              3.67
cat("\n")