# install.packages("openxlsx")
suppressPackageStartupMessages({
library(readxl) # Opens and reads Excel files directly into R
library(dplyr) # Cleans, filters, and transforms data with simple commands
library(ggplot2) # Builds professional and publication-quality charts
library(car) # Tests for multicollinearity using Variance Inflation Factors
library(lmtest) # Runs formal statistical tests on regression model residuals
library(gridExtra) # Arranges multiple charts side by side on one page
library(scales) # Formats numbers on chart axes as dollar amounts
library(openxlsx) # Saves the final transformed dataset back to Excel
})
# Ensures results are reproducible — same output every run
set.seed(42)
# All charts will be saved here automatically
dir.create("output_plots", showWarnings = FALSE)
# ══════════════════════════════════════════════════════════════════
# PART 2: Load the Dataset
# We read the Excel file into R and store it as "raw".
# Make sure the Excel file is in the same folder as this R script.
# ══════════════════════════════════════════════════════════════════
raw <- read_excel("C:/Users/ravit/Downloads/SAMPLE_DATA Final.xlsx")
# Print a quick summary so we can confirm the data loaded correctly
cat("Total number of sales :", nrow(raw), "\n")
## Total number of sales : 121
cat("Total number of columns :", ncol(raw), "\n")
## Total number of columns : 26
cat("Earliest sale date :", format(min(raw$SALE_DATE), "%m/%d/%Y"), "\n")
## Earliest sale date : 06/01/2020
cat("Latest sale date :", format(max(raw$SALE_DATE), "%m/%d/%Y"), "\n")
## Latest sale date : 06/26/2021
cat("Lowest adjusted sale price :", dollar(min(raw$ADJ_SALE_PRICE)), "\n")
## Lowest adjusted sale price : $152,600
cat("Highest adjusted sale price:", dollar(max(raw$ADJ_SALE_PRICE)), "\n")
## Highest adjusted sale price: $655,000
cat("Median adjusted sale price :", dollar(median(raw$ADJ_SALE_PRICE)), "\n")
## Median adjusted sale price : $375,000
# Check if any columns have missing values
cat("\nMissing values per column:\n")
##
## Missing values per column:
print(colSums(is.na(raw)))
## PARCEL_ID SALE_DATE SALE_PRICE
## 0 0 0
## ADJ_SALE_PRICE PROPERTY_TYPE STYLE_BLDG
## 0 0 0
## NBHD_TYPE MAIN_BLDG_SQFT UPPER_BLDG_SQFT
## 0 0 0
## YRBLT_BLDG OVERALL_GRADE_BLDG OVERALL_CONDITION_BLDG
## 0 0 0
## BSMT_SQFT FN_BSMT_SQFT FN_BSMT_GRADE
## 0 0 0
## NUM_BEDROOMS FULL_BATHS THREE_QTR_BATHS
## 0 0 0
## HALF_BATHS PRI_BTH_QUAL NUM_KITCHENS
## 0 0 0
## PRI_KIT_QUAL EXT_WALL_TYPE ROOFING
## 0 0 0
## LANDSQFT CAR_SQFT
## 0 0
# ══════════════════════════════════════════════════════════════════════════════
# PART 3 — Ordinal Encoding
#
# Regression requires numeric inputs. Quality and condition fields
# stored as text (e.g. "Fair", "Average") are converted to an
# ordered numeric scale that preserves their inherent ranking.
#
# Building grade and condition → Low Cost / Poor = 1 through Excellent = 6
# Bathroom and kitchen quality → Basic = 1, Standard = 2, Modern = 3
# Basement finish grade → No basement = 0 through Excellent = 6
# ══════════════════════════════════════════════════════════════════════════════
grade_map <- c(
"Low Cost" = 1L, "Poor" = 1L,
"Fair" = 2L, "Average" = 3L,
"Good" = 4L, "Very Good" = 5L, "Excellent" = 6L
)
cond_map <- grade_map
bth_qual_map <- c("BASIC" = 1L, "STANDARD" = 2L, "MODERN" = 3L)
kit_qual_map <- c("BASIC" = 1L, "STANDARD" = 2L, "MODERN" = 3L)
bsmt_grade_map <- c(
"None" = 0L,
"Low Cost" = 1L, "Poor" = 1L,
"Fair" = 2L, "Average" = 3L,
"Good" = 4L, "Very Good" = 5L, "Excellent" = 6L
)
cat("Encoding confirmed — grade and quality scales defined.\n")
## Encoding confirmed — grade and quality scales defined.
# ══════════════════════════════════════════════════════════════════════════════
# PART 4 — Feature Engineering
#
# We construct the analysis-ready dataset by deriving variables
# that are more informative for regression than the raw fields alone.
#
# TOTAL_LIVING_SQFT — Combines main and upper floor area into the
# single above-grade living area figure that
# appraisers and buyers use as the primary
# size metric.
#
# TOTAL_BATH_EQUIV — Converts mixed bath types into one comparable
# number using appraisal convention:
# Full = 1.0, Three-Quarter = 0.75, Half = 0.5
#
# UNFIN_BSMT_SQFT — Separating total basement into finished and
# unfinished components eliminates the severe
# multicollinearity (VIF > 11) that results
# from including both BSMT_SQFT and FN_BSMT_SQFT
# together. Each component is now an independent
# value driver.
#
# AGE — Chronological age at time of sale captures
# physical depreciation not reflected in
# grade or condition alone.
#
# LOG_LANDSQFT — Land area is right-skewed due to a small
# number of large lots. Log transformation
# linearises its relationship with price.
#
# BSMT_GRADE_NUM = 0 — Properties with no basement receive a grade
# of 0, distinguishing them from the lowest
# finished basement grade of 1.
# ══════════════════════════════════════════════════════════════════════════════
df <- raw %>%
mutate(
TOTAL_LIVING_SQFT = MAIN_BLDG_SQFT + UPPER_BLDG_SQFT,
TOTAL_BATH_EQUIV = FULL_BATHS + (0.75 * THREE_QTR_BATHS) + (0.5 * HALF_BATHS),
SALE_YEAR = as.integer(format(SALE_DATE, "%Y")),
AGE = SALE_YEAR - YRBLT_BLDG,
UNFIN_BSMT_SQFT = BSMT_SQFT - FN_BSMT_SQFT,
LOG_LANDSQFT = log1p(LANDSQFT),
GRADE_NUM = grade_map[OVERALL_GRADE_BLDG],
COND_NUM = cond_map[OVERALL_CONDITION_BLDG],
BTH_QUAL_NUM = bth_qual_map[PRI_BTH_QUAL],
KIT_QUAL_NUM = kit_qual_map[PRI_KIT_QUAL],
BSMT_GRADE_NUM = ifelse(is.na(FN_BSMT_GRADE), 0L,
bsmt_grade_map[FN_BSMT_GRADE]),
HAS_CONCESSION = as.integer(SALE_PRICE != ADJ_SALE_PRICE)
)
cat(sprintf("Analysis dataset ready: %d rows × %d columns\n\n", nrow(df), ncol(df)))
## Analysis dataset ready: 121 rows × 38 columns
print(summary(df[, c("ADJ_SALE_PRICE", "TOTAL_LIVING_SQFT",
"UNFIN_BSMT_SQFT", "FN_BSMT_SQFT", "AGE",
"GRADE_NUM", "COND_NUM", "TOTAL_BATH_EQUIV",
"CAR_SQFT", "LOG_LANDSQFT",
"BTH_QUAL_NUM", "KIT_QUAL_NUM")]))
## ADJ_SALE_PRICE TOTAL_LIVING_SQFT UNFIN_BSMT_SQFT FN_BSMT_SQFT
## Min. :152600 Min. : 676 Min. : 0.00 Min. : 0.0
## 1st Qu.:327000 1st Qu.: 980 1st Qu.: 0.00 1st Qu.: 0.0
## Median :375000 Median :1408 Median : 0.00 Median : 0.0
## Mean :384772 Mean :1417 Mean : 95.67 Mean : 330.8
## 3rd Qu.:439000 3rd Qu.:1673 3rd Qu.: 50.00 3rd Qu.: 820.0
## Max. :655000 Max. :2480 Max. :1394.00 Max. :1440.0
## AGE GRADE_NUM COND_NUM TOTAL_BATH_EQUIV
## Min. : 3.00 Min. :1.000 Min. :2.00 Min. :1.000
## 1st Qu.: 7.00 1st Qu.:2.000 1st Qu.:4.00 1st Qu.:1.750
## Median : 8.00 Median :3.000 Median :6.00 Median :2.250
## Mean : 30.67 Mean :2.554 Mean :4.95 Mean :2.262
## 3rd Qu.: 47.00 3rd Qu.:3.000 3rd Qu.:6.00 3rd Qu.:2.750
## Max. :122.00 Max. :3.000 Max. :6.00 Max. :4.500
## CAR_SQFT LOG_LANDSQFT BTH_QUAL_NUM KIT_QUAL_NUM
## Min. : 0.0 Min. :6.080 Min. :1.00 Min. :1.00
## 1st Qu.:336.0 1st Qu.:6.771 1st Qu.:1.00 1st Qu.:1.00
## Median :422.0 Median :8.274 Median :2.00 Median :2.00
## Mean :393.7 Mean :7.802 Mean :1.57 Mean :1.57
## 3rd Qu.:480.0 3rd Qu.:8.785 3rd Qu.:2.00 3rd Qu.:2.00
## Max. :784.0 Max. :9.511 Max. :2.00 Max. :2.00
# ══════════════════════════════════════════════════════════════════════════════
# PART 5 — Exploratory Chart: Sale Price Distribution
#
# Before modelling, we examine whether sale prices are roughly
# normally distributed. Heavy skew would suggest a log-transformation
# of the dependent variable may improve model fit. The dashed
# vertical line marks the mean; the shape of the bars shows spread.
# ══════════════════════════════════════════════════════════════════════════════
pal_main <- "#2C5F8A"
pal_acc <- "#E05C2C"
pal_light <- "#A8C8E8"
p1 <- ggplot(df, aes(x = ADJ_SALE_PRICE)) +
geom_histogram(bins = 25, fill = pal_main,
colour = "white", alpha = 0.85) +
geom_vline(aes(xintercept = mean(ADJ_SALE_PRICE)),
colour = pal_acc, linewidth = 1, linetype = "dashed") +
scale_x_continuous(labels = dollar_format(scale = 1e-3, suffix = "K")) +
labs(
title = "Distribution of Adjusted Sale Price",
subtitle = "Dashed line = mean | n = 121 sales over 18 months",
x = "Adjusted Sale Price",
y = "Number of Sales"
) +
theme_minimal(base_size = 12) +
theme(plot.title = element_text(face = "bold", colour = pal_main))
print(p1)

# ══════════════════════════════════════════════════════════════════════════════
# PART 6 — Exploratory Chart: Sale Price vs. Total Living Area
#
# The scatter plot reveals the relationship between above-grade
# living area and sale price before modelling. A strong upward
# trend confirms this variable belongs in the model. Points are
# coloured by property type to identify any clustering effects.
# ══════════════════════════════════════════════════════════════════════════════
p2 <- ggplot(df, aes(x = TOTAL_LIVING_SQFT, y = ADJ_SALE_PRICE,
colour = PROPERTY_TYPE)) +
geom_point(alpha = 0.7, size = 2.5) +
geom_smooth(
method = "lm",
se = TRUE,
colour = pal_acc,
fill = pal_light,
linewidth = 1.1,
inherit.aes = FALSE,
aes(x = TOTAL_LIVING_SQFT, y = ADJ_SALE_PRICE)
) +
scale_y_continuous(labels = dollar_format(scale = 1e-3, suffix = "K")) +
labs(
title = "Adjusted Sale Price vs. Total Above-Grade Living Area",
subtitle = "OLS trend line with 95% confidence interval",
x = "Total Living Area (sq ft)",
y = "Adjusted Sale Price",
colour = "Property Type"
) +
theme_minimal(base_size = 12) +
theme(plot.title = element_text(face = "bold", colour = pal_main))
print(p2)
## `geom_smooth()` using formula = 'y ~ x'

# ══════════════════════════════════════════════════════════════════════════════
# PART 7 — Exploratory Charts: Price by Grade and Condition
#
# Box plots confirm whether median sale price rises consistently
# across each grade and condition level as expected.
# NA values in condition are removed before plotting.
# ══════════════════════════════════════════════════════════════════════════════
grade_labels <- c("1" = "Low Cost", "2" = "Fair", "3" = "Average",
"4" = "Good", "5" = "Very Good", "6" = "Excellent")
p3 <- df %>%
mutate(Grade_Label = factor(grade_labels[as.character(GRADE_NUM)],
levels = grade_labels)) %>%
filter(!is.na(Grade_Label)) %>%
ggplot(aes(x = Grade_Label, y = ADJ_SALE_PRICE, fill = Grade_Label)) +
geom_boxplot(outlier.colour = pal_acc, outlier.shape = 16,
outlier.size = 2, alpha = 0.75) +
scale_y_continuous(labels = dollar_format(scale = 1e-3, suffix = "K")) +
scale_fill_brewer(palette = "Blues") +
labs(
title = "Sale Price by Building Grade",
subtitle = "Box spans the middle 50% of prices | Centre line = median",
x = "Building Grade",
y = "Adjusted Sale Price"
) +
theme_minimal(base_size = 11) +
theme(
plot.title = element_text(face = "bold", colour = pal_main,
size = 11),
plot.subtitle = element_text(size = 8),
legend.position = "none",
axis.text.x = element_text(angle = 25, hjust = 1, size = 9),
plot.margin = margin(10, 15, 10, 10)
)
cond_labels <- c("1" = "Fair", "2" = "Average", "3" = "Good",
"4" = "Very Good", "5" = "Excellent")
p4 <- df %>%
mutate(Cond_Label = factor(cond_labels[as.character(COND_NUM)],
levels = cond_labels)) %>%
filter(!is.na(Cond_Label)) %>%
ggplot(aes(x = Cond_Label, y = ADJ_SALE_PRICE, fill = Cond_Label)) +
geom_boxplot(outlier.colour = pal_acc, outlier.shape = 16,
outlier.size = 2, alpha = 0.75) +
scale_y_continuous(labels = dollar_format(scale = 1e-3, suffix = "K")) +
scale_fill_brewer(palette = "Greens") +
labs(
title = "Sale Price by Building Condition",
subtitle = "Box spans the middle 50% of prices | Centre line = median",
x = "Building Condition",
y = "Adjusted Sale Price"
) +
theme_minimal(base_size = 11) +
theme(
plot.title = element_text(face = "bold", colour = pal_main,
size = 11),
plot.subtitle = element_text(size = 8),
legend.position = "none",
axis.text.x = element_text(angle = 25, hjust = 1, size = 9),
plot.margin = margin(10, 15, 10, 10)
)
grid.arrange(p3, p4, ncol = 2,
top = grid::textGrob(
"Sale Price Distribution by Grade and Condition",
gp = grid::gpar(fontface = "bold",
fontsize = 13,
col = pal_main)
))

# ══════════════════════════════════════════════════════════════════════════════
# PART 8 — Correlation Matrix
#
# Pearson correlations are calculated between all numeric model
# candidates and displayed as a colour-coded heatmap. Variables with
# high correlation to ADJ_SALE_PRICE are strong model candidates.
# High correlations between two predictors signal potential
# multicollinearity — addressed in Part 10 via VIF diagnostics.
# ══════════════════════════════════════════════════════════════════════════════
num_vars <- df %>%
select(ADJ_SALE_PRICE, TOTAL_LIVING_SQFT, UNFIN_BSMT_SQFT, FN_BSMT_SQFT,
AGE, GRADE_NUM, COND_NUM, BTH_QUAL_NUM, KIT_QUAL_NUM,
BSMT_GRADE_NUM, CAR_SQFT, LOG_LANDSQFT, TOTAL_BATH_EQUIV)
cor_mat <- cor(num_vars, use = "complete.obs")
# Print ranked correlations with sale price for quick reference
cat("Correlations with ADJ_SALE_PRICE — ranked highest to lowest:\n")
## Correlations with ADJ_SALE_PRICE — ranked highest to lowest:
price_cors <- sort(cor_mat["ADJ_SALE_PRICE", -1], decreasing = TRUE)
print(round(price_cors, 3))
## TOTAL_BATH_EQUIV GRADE_NUM TOTAL_LIVING_SQFT BSMT_GRADE_NUM
## 0.565 0.455 0.453 0.410
## BTH_QUAL_NUM FN_BSMT_SQFT KIT_QUAL_NUM COND_NUM
## 0.410 0.407 0.396 0.385
## UNFIN_BSMT_SQFT CAR_SQFT LOG_LANDSQFT AGE
## 0.345 0.271 0.067 -0.465
# Build correlation heatmap using ggplot2 (no corrplot package needed)
cor_melted <- expand.grid(Var1 = colnames(cor_mat),
Var2 = colnames(cor_mat))
cor_melted$value <- as.vector(cor_mat)
p_cor <- ggplot(cor_melted, aes(x = Var1, y = Var2, fill = value)) +
geom_tile(colour = "white") +
geom_text(aes(label = round(value, 2)), size = 2.8, colour = "black") +
scale_fill_gradient2(
low = "#C00000",
mid = "white",
high = "#2C5F8A",
midpoint = 0,
limits = c(-1, 1),
name = "Correlation"
) +
labs(
title = "Pairwise Correlation Matrix — All Candidate Variables",
x = NULL,
y = NULL
) +
theme_minimal(base_size = 10) +
theme(
plot.title = element_text(face = "bold", colour = pal_main),
axis.text.x = element_text(angle = 45, hjust = 1)
)
print(p_cor)

# ══════════════════════════════════════════════════════════════════════════════
# PART 9 — Multiple Linear Regression Model
#
# We fit an OLS (Ordinary Least Squares) regression model with
# ADJ_SALE_PRICE as the dependent variable. The model finds the
# combination of predictor weights that minimises the total
# squared difference between predicted and actual sale prices.
#
# All 11 predictors are entered simultaneously (Enter method).
# Variable selection rationale is documented in Part 4.
# ══════════════════════════════════════════════════════════════════════════════
model <- lm(
ADJ_SALE_PRICE ~
TOTAL_LIVING_SQFT + # Above-grade living area (sq ft)
UNFIN_BSMT_SQFT + # Unfinished basement area (sq ft)
FN_BSMT_SQFT + # Finished basement area (sq ft)
GRADE_NUM + # Overall building quality (1–6 scale)
COND_NUM + # Overall building condition (1–6 scale)
AGE + # Structure age at time of sale (years)
TOTAL_BATH_EQUIV + # Total bath equivalents
CAR_SQFT + # Garage area (sq ft)
LOG_LANDSQFT + # Log-transformed land area
BTH_QUAL_NUM + # Primary bathroom finish quality (1–3)
KIT_QUAL_NUM, # Primary kitchen finish quality (1–3)
data = df
)
print(summary(model))
##
## Call:
## lm(formula = ADJ_SALE_PRICE ~ TOTAL_LIVING_SQFT + UNFIN_BSMT_SQFT +
## FN_BSMT_SQFT + GRADE_NUM + COND_NUM + AGE + TOTAL_BATH_EQUIV +
## CAR_SQFT + LOG_LANDSQFT + BTH_QUAL_NUM + KIT_QUAL_NUM, data = df)
##
## Residuals:
## Min 1Q Median 3Q Max
## -101427 -37556 -5808 37799 135283
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 34453.760 133524.206 0.258 0.796867
## TOTAL_LIVING_SQFT 56.156 15.913 3.529 0.000612 ***
## UNFIN_BSMT_SQFT 80.936 25.680 3.152 0.002096 **
## FN_BSMT_SQFT 84.300 21.297 3.958 0.000135 ***
## GRADE_NUM 8774.503 15167.192 0.579 0.564107
## COND_NUM 10962.632 11317.143 0.969 0.334852
## AGE -432.101 294.968 -1.465 0.145824
## TOTAL_BATH_EQUIV 1273.198 12499.960 0.102 0.919058
## CAR_SQFT -1.874 34.459 -0.054 0.956730
## LOG_LANDSQFT 18486.237 12704.402 1.455 0.148514
## BTH_QUAL_NUM 11394.507 23091.275 0.493 0.622684
## KIT_QUAL_NUM 4729.111 17983.359 0.263 0.793069
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 51150 on 109 degrees of freedom
## Multiple R-squared: 0.6495, Adjusted R-squared: 0.6141
## F-statistic: 18.36 on 11 and 109 DF, p-value: < 2.2e-16
# ══════════════════════════════════════════════════════════════════════════════
# PART 10 — Variance Inflation Factor (VIF) Diagnostics
# VIF measures how much the variance of a coefficient is inflated
# because of its correlation with other predictors. Elevated VIF
# makes individual coefficients unstable and harder to interpret.
# Accepted thresholds:
# VIF < 5 — no concern
# VIF 5–10 — moderate; review but acceptable in practice
# VIF > 10 — high; consider removing or combining the variable
# ══════════════════════════════════════════════════════════════════════════════
vif_vals <- vif(model)
cat("═══════════════════════════════════════════════\n")
## ═══════════════════════════════════════════════
cat(" VARIANCE INFLATION FACTORS\n")
## VARIANCE INFLATION FACTORS
cat("═══════════════════════════════════════════════\n")
## ═══════════════════════════════════════════════
print(round(vif_vals, 2))
## TOTAL_LIVING_SQFT UNFIN_BSMT_SQFT FN_BSMT_SQFT GRADE_NUM
## 2.45 1.77 3.92 4.56
## COND_NUM AGE TOTAL_BATH_EQUIV CAR_SQFT
## 10.27 4.79 3.73 1.33
## LOG_LANDSQFT BTH_QUAL_NUM KIT_QUAL_NUM
## 8.40 6.04 3.67
cat("\nVariables with VIF above 5:\n")
##
## Variables with VIF above 5:
high_vif <- vif_vals[vif_vals > 5]
if (length(high_vif) > 0) print(round(high_vif, 2)) else cat("None.\n")
## COND_NUM LOG_LANDSQFT BTH_QUAL_NUM
## 10.27 8.40 6.04
# ══════════════════════════════════════════════════════════════════════════════
# PART 11 — Breusch-Pagan Test for Heteroskedasticity
# A core OLS assumption is that residuals have constant variance
# across all fitted values. If errors grow larger as price increases,
# the model is heteroskedastic and standard errors may be understated.
# H0 (null hypothesis) : residual variance is constant
# p > 0.05 → assumption is satisfied
# p < 0.05 → heteroskedasticity present — note in report
# ══════════════════════════════════════════════════════════════════════════════
bp_test <- bptest(model)
cat("═══════════════════════════════════════════════\n")
## ═══════════════════════════════════════════════
cat(" BREUSCH-PAGAN TEST RESULT\n")
## BREUSCH-PAGAN TEST RESULT
cat("═══════════════════════════════════════════════\n")
## ═══════════════════════════════════════════════
print(bp_test)
##
## studentized Breusch-Pagan test
##
## data: model
## BP = 24.283, df = 11, p-value = 0.01159
if (bp_test$p.value < 0.05) {
cat("\nInterpretation: Heteroskedasticity detected (p =",
round(bp_test$p.value, 4), ").\n")
cat("Prediction errors tend to grow with price — a common\n")
cat("characteristic of mixed residential portfolios. Results\n")
cat("remain valid but should be noted in the final report.\n")
} else {
cat("\nInterpretation: Residual variance is consistent across\n")
cat("the price range. Core OLS assumption satisfied.\n")
}
##
## Interpretation: Heteroskedasticity detected (p = 0.0116 ).
## Prediction errors tend to grow with price — a common
## characteristic of mixed residential portfolios. Results
## remain valid but should be noted in the final report.
# ══════════════════════════════════════════════════════════════════════════════
# PART 12 — IAAO Mass Appraisal Accuracy Statistics
# Median Assessment Ratio (MAR) — predicted divided by actual price.
# Target is 1.00. Values above 1 indicate systematic over-prediction.
# Coefficient of Dispersion (COD) — measures uniformity of predictions.
# IAAO residential standard: COD below 15%. Below 10% is excellent.
# Price-Related Differential (PRD) — detects regressivity.
# IAAO standard: 0.98 to 1.03.
# RMSE — average dollar prediction error across all properties.
# ══════════════════════════════════════════════════════════════════════════════
fitted_prices <- fitted(model)
ratios <- fitted_prices / df$ADJ_SALE_PRICE
median_ratio <- median(ratios)
cod <- (mean(abs(ratios - median_ratio)) / median_ratio) * 100
prd <- mean(ratios) / weighted.mean(ratios, df$ADJ_SALE_PRICE)
rmse <- sqrt(mean(residuals(model)^2))
cat("═══════════════════════════════════════════════════════════\n")
## ═══════════════════════════════════════════════════════════
cat(" MASS APPRAISAL ACCURACY STATISTICS\n")
## MASS APPRAISAL ACCURACY STATISTICS
cat("═══════════════════════════════════════════════════════════\n")
## ═══════════════════════════════════════════════════════════
cat(sprintf(" R-Squared (R²) : %.4f — explains %.1f%% of price variation\n",
summary(model)$r.squared, summary(model)$r.squared * 100))
## R-Squared (R²) : 0.6495 — explains 64.9% of price variation
cat(sprintf(" Adjusted R-Squared : %.4f\n",
summary(model)$adj.r.squared))
## Adjusted R-Squared : 0.6141
cat(sprintf(" RMSE : %s — average prediction error\n",
dollar(rmse)))
## RMSE : $48,546.54 — average prediction error
cat(sprintf(" Median Assessment Ratio : %.4f — target ≈ 1.00\n",
median_ratio))
## Median Assessment Ratio : 1.0153 — target ≈ 1.00
cat(sprintf(" COD : %.2f%% — IAAO standard: below 15%%\n",
cod))
## COD : 10.22% — IAAO standard: below 15%
cat(sprintf(" PRD : %.4f — IAAO standard: 0.98 to 1.03\n",
prd))
## PRD : 1.0160 — IAAO standard: 0.98 to 1.03
cat("═══════════════════════════════════════════════════════════\n")
## ═══════════════════════════════════════════════════════════
# ══════════════════════════════════════════════════════════════════════════════
# PART 13 — Diagnostic Chart: Actual vs. Predicted Prices
# Each point represents one sale. The dashed diagonal is the line
# of perfect prediction where predicted equals actual. Points
# clustered tightly along this line indicate a well-calibrated model.
# ══════════════════════════════════════════════════════════════════════════════
df_diag <- df %>%
mutate(
Fitted = fitted(model),
Residuals = residuals(model),
Std_Resid = rstandard(model),
Ratio = fitted(model) / ADJ_SALE_PRICE
)
pA <- ggplot(df_diag, aes(x = Fitted, y = ADJ_SALE_PRICE)) +
geom_point(colour = pal_main, alpha = 0.7, size = 2.5) +
geom_abline(intercept = 0, slope = 1,
colour = pal_acc, linewidth = 1.1, linetype = "dashed") +
scale_x_continuous(labels = dollar_format(scale = 1e-3, suffix = "K")) +
scale_y_continuous(labels = dollar_format(scale = 1e-3, suffix = "K")) +
labs(
title = "Actual vs. Model-Predicted Sale Price",
subtitle = "Points near the diagonal = accurate predictions",
x = "Model-Predicted Price",
y = "Actual Adjusted Sale Price"
) +
theme_minimal(base_size = 11) +
theme(plot.title = element_text(face = "bold", colour = pal_main))
print(pA)

# ══════════════════════════════════════════════════════════════════════════════
# PART 14 — Diagnostic Chart: Residuals vs. Fitted Values
# Residuals are the difference between actual and predicted prices.
# A well-specified model produces residuals scattered randomly around
# zero with no discernible pattern. A curved smooth line suggests
# a non-linear relationship the model has not fully captured.
# ══════════════════════════════════════════════════════════════════════════════
pB <- ggplot(df_diag, aes(x = Fitted, y = Residuals)) +
geom_hline(yintercept = 0, colour = pal_acc,
linewidth = 1, linetype = "dashed") +
geom_point(colour = pal_main, alpha = 0.7, size = 2.5) +
geom_smooth(method = "loess", se = FALSE,
colour = "firebrick", linewidth = 0.9) +
scale_x_continuous(labels = dollar_format(scale = 1e-3, suffix = "K")) +
scale_y_continuous(labels = dollar_format(scale = 1e-3, suffix = "K")) +
labs(
title = "Residuals vs. Fitted Values",
subtitle = "Random scatter around zero = well-specified model",
x = "Model-Predicted Price",
y = "Residual (Actual minus Predicted)"
) +
theme_minimal(base_size = 11) +
theme(plot.title = element_text(face = "bold", colour = pal_main))
print(pB)
## `geom_smooth()` using formula = 'y ~ x'

# ══════════════════════════════════════════════════════════════════════════════
# PART 15 — Diagnostic Charts: Q-Q Plot and Scale-Location
# Normal Q-Q Plot — points tracking the diagonal confirm residuals
# are approximately normally distributed.
# Scale-Location Plot — a flat loess trend confirms homoskedasticity;
# prediction error magnitude is consistent across all price levels.
# ══════════════════════════════════════════════════════════════════════════════
pC <- ggplot(df_diag, aes(sample = Std_Resid)) +
stat_qq(colour = pal_main, alpha = 0.8, size = 2.5) +
stat_qq_line(colour = pal_acc, linewidth = 1.1, linetype = "dashed") +
labs(
title = "Normal Q-Q Plot of Standardised Residuals",
subtitle = "Points tracking the diagonal = normally distributed errors",
x = "Theoretical Quantiles",
y = "Standardised Residuals"
) +
theme_minimal(base_size = 11) +
theme(plot.title = element_text(face = "bold", colour = pal_main))
pD <- ggplot(df_diag, aes(x = Fitted, y = sqrt(abs(Std_Resid)))) +
geom_point(colour = pal_main, alpha = 0.7, size = 2.5) +
geom_smooth(method = "loess", se = FALSE,
colour = pal_acc, linewidth = 1) +
scale_x_continuous(labels = dollar_format(scale = 1e-3, suffix = "K")) +
labs(
title = "Scale-Location Plot",
subtitle = "Flat trend = consistent prediction error across the price range",
x = "Model-Predicted Price",
y = "√|Standardised Residual|"
) +
theme_minimal(base_size = 11) +
theme(plot.title = element_text(face = "bold", colour = pal_main))
grid.arrange(pC, pD, ncol = 2)
## `geom_smooth()` using formula = 'y ~ x'

# ══════════════════════════════════════════════════════════════════════════════
# PART 16 — Ratio Study Chart
# The ratio study is the standard IAAO diagnostic for mass appraisal
# models. Each point is one sale's predicted-to-actual price ratio.
# Green line = 1.00 (perfect prediction)
# Orange band = ±10% acceptable tolerance
# A flat loess trend across the price range confirms the model
# has no regressivity.
# ══════════════════════════════════════════════════════════════════════════════
pE <- ggplot(df_diag, aes(x = ADJ_SALE_PRICE, y = Ratio)) +
geom_hline(yintercept = 1.00, colour = "darkgreen", linewidth = 1) +
geom_hline(yintercept = c(0.90, 1.10), colour = "orange",
linewidth = 0.8, linetype = "dashed") +
geom_point(colour = pal_main, alpha = 0.7, size = 2.5) +
geom_smooth(method = "loess", se = FALSE,
colour = pal_acc, linewidth = 1) +
scale_x_continuous(labels = dollar_format(scale = 1e-3, suffix = "K")) +
scale_y_continuous(limits = c(0.70, 1.35),
breaks = seq(0.70, 1.35, 0.10)) +
labs(
title = "Ratio Study — Predicted-to-Actual Price Ratio",
subtitle = "Green = 1.00 (perfect) | Orange = ±10% acceptable band",
x = "Actual Adjusted Sale Price",
y = "Predicted ÷ Actual (Assessment Ratio)"
) +
theme_minimal(base_size = 11) +
theme(plot.title = element_text(face = "bold", colour = pal_main))
print(pE)
## `geom_smooth()` using formula = 'y ~ x'
## Warning: Removed 2 rows containing non-finite outside the scale range
## (`stat_smooth()`).
## Warning: Removed 2 rows containing missing values or values outside the scale range
## (`geom_point()`).

# ══════════════════════════════════════════════════════════════════════════════
# PART 17 — Model Coefficient Bar Chart
# Visualises the dollar contribution of each predictor.
# Each bar represents the change in estimated sale price
# associated with a one-unit increase in that variable,
# holding all other variables constant.
# ══════════════════════════════════════════════════════════════════════════════
coef_df <- data.frame(
Variable = names(coef(model))[-1],
Coefficient = coef(model)[-1]
) %>%
mutate(
Direction = ifelse(Coefficient > 0,
"Adds to Value", "Reduces Value"),
Variable = reorder(Variable, abs(Coefficient))
)
pF <- ggplot(coef_df,
aes(x = Variable, y = Coefficient, fill = Direction)) +
geom_col(width = 0.65) +
geom_hline(yintercept = 0, colour = "grey30") +
coord_flip() +
scale_fill_manual(values = c(
"Adds to Value" = pal_main,
"Reduces Value" = pal_acc
)) +
labs(
title = "Regression Coefficients — Dollar Impact per One-Unit Increase",
subtitle = "Holding all other variables constant",
x = NULL,
y = "Coefficient ($ change in estimated sale price)"
) +
theme_minimal(base_size = 11) +
theme(plot.title = element_text(face = "bold", colour = pal_main),
legend.title = element_blank())
print(pF)

# ══════════════════════════════════════════════════════════════════════════════
# PART 18 — Export All Charts to PNG
# All charts saved at high resolution to the output_plots folder.
# Ready to insert into Word, PowerPoint, or PDF reports.
# ══════════════════════════════════════════════════════════════════════════════
# EDA overview — four charts on one page
png("output_plots/01_EDA_Overview.png", width = 1400, height = 1000, res = 120)
grid.arrange(p1, p2, p3, p4, ncol = 2)
## `geom_smooth()` using formula = 'y ~ x'
dev.off()
## png
## 2
# Correlation heatmap
png("output_plots/02_Correlation_Matrix.png", width = 1100, height = 1000, res = 110)
print(
ggplot(cor_melted, aes(x = Var1, y = Var2, fill = value)) +
geom_tile(colour = "white") +
geom_text(aes(label = round(value, 2)), size = 2.8, colour = "black") +
scale_fill_gradient2(
low = "#C00000", mid = "white", high = "#2C5F8A",
midpoint = 0, limits = c(-1, 1), name = "Correlation"
) +
labs(title = "Pairwise Correlation Matrix — All Candidate Variables",
x = NULL, y = NULL) +
theme_minimal(base_size = 10) +
theme(plot.title = element_text(face = "bold", colour = pal_main),
axis.text.x = element_text(angle = 45, hjust = 1))
)
dev.off()
## png
## 2
# Four diagnostic plots on one page
png("output_plots/03_Diagnostic_Plots.png", width = 1400, height = 1000, res = 120)
grid.arrange(pA, pB, pC, pD, ncol = 2)
## `geom_smooth()` using formula = 'y ~ x'
## `geom_smooth()` using formula = 'y ~ x'
dev.off()
## png
## 2
# Ratio study and coefficient chart side by side
png("output_plots/04_Ratio_and_Coefficients.png", width = 1400, height = 600, res = 120)
grid.arrange(pE, pF, ncol = 2)
## `geom_smooth()` using formula = 'y ~ x'
## Warning: Removed 2 rows containing non-finite outside the scale range
## (`stat_smooth()`).
## Warning: Removed 2 rows containing missing values or values outside the scale range
## (`geom_point()`).
dev.off()
## png
## 2
cat("All four charts exported to ./output_plots/\n")
## All four charts exported to ./output_plots/
# ══════════════════════════════════════════════════════════════════════════════
# PART 19 — Export the Analysis-Ready Dataset to Excel
# Includes all original columns plus the 11 engineered variables
# created in Part 4. Serves as the complete audit trail of every
# transformation applied to the raw data.
# ══════════════════════════════════════════════════════════════════════════════
write.xlsx(df, "SAMPLE_DATA_Transformed.xlsx")
cat(sprintf("Transformed dataset exported: %d rows × %d columns\n",
nrow(df), ncol(df)))
## Transformed dataset exported: 121 rows × 38 columns
cat("File: SAMPLE_DATA_Transformed.xlsx\n")
## File: SAMPLE_DATA_Transformed.xlsx
# ══════════════════════════════════════════════════════════════════════════════
# PART 20 — Final Model Results Summary
# Consolidated output of all key statistics for inclusion in the
# written report. Run this after all prior parts are complete.
# ══════════════════════════════════════════════════════════════════════════════
s <- summary(model)
cat("
╔══════════════════════════════════════════════════════════════════════╗
║ FINAL MODEL RESULTS SUMMARY ║
╚══════════════════════════════════════════════════════════════════════╝
GOODNESS OF FIT
─────────────────────────────────────────────────────────────────────\n")
##
## ╔══════════════════════════════════════════════════════════════════════╗
## ║ FINAL MODEL RESULTS SUMMARY ║
## ╚══════════════════════════════════════════════════════════════════════╝
##
## GOODNESS OF FIT
## ─────────────────────────────────────────────────────────────────────
cat(sprintf(" R-Squared (R²) : %.4f — model accounts for %.1f%% of price variance\n",
s$r.squared, s$r.squared * 100))
## R-Squared (R²) : 0.6495 — model accounts for 64.9% of price variance
cat(sprintf(" Adjusted R-Squared : %.4f\n", s$adj.r.squared))
## Adjusted R-Squared : 0.6141
cat(sprintf(" RMSE : %s — average dollar prediction error\n",
dollar(rmse)))
## RMSE : $48,546.54 — average dollar prediction error
cat(sprintf(" F-Statistic : %.2f (p-value < 0.001) — model is significant\n",
s$fstatistic[1]))
## F-Statistic : 18.36 (p-value < 0.001) — model is significant
cat(sprintf(" Observations : %d\n", nrow(df)))
## Observations : 121
cat("\n IAAO MASS APPRAISAL STANDARDS\n")
##
## IAAO MASS APPRAISAL STANDARDS
cat(" ─────────────────────────────────────────────────────────────────────\n")
## ─────────────────────────────────────────────────────────────────────
cat(sprintf(" Median Assessment Ratio : %.4f (target ≈ 1.00)\n", median_ratio))
## Median Assessment Ratio : 1.0153 (target ≈ 1.00)
cat(sprintf(" COD : %.2f%% (IAAO standard: < 15%%)\n", cod))
## COD : 10.22% (IAAO standard: < 15%)
cat(sprintf(" PRD : %.4f (IAAO standard: 0.98–1.03)\n", prd))
## PRD : 1.0160 (IAAO standard: 0.98–1.03)
cat("\n REGRESSION COEFFICIENTS\n")
##
## REGRESSION COEFFICIENTS
cat(" ─────────────────────────────────────────────────────────────────────\n")
## ─────────────────────────────────────────────────────────────────────
coef_out <- as.data.frame(coef(s))
colnames(coef_out) <- c("Estimate ($)", "Std. Error", "t-value", "p-value")
print(round(coef_out, 3))
## Estimate ($) Std. Error t-value p-value
## (Intercept) 34453.760 133524.206 0.258 0.797
## TOTAL_LIVING_SQFT 56.156 15.913 3.529 0.001
## UNFIN_BSMT_SQFT 80.936 25.680 3.152 0.002
## FN_BSMT_SQFT 84.300 21.297 3.958 0.000
## GRADE_NUM 8774.503 15167.192 0.579 0.564
## COND_NUM 10962.632 11317.143 0.969 0.335
## AGE -432.101 294.968 -1.465 0.146
## TOTAL_BATH_EQUIV 1273.198 12499.960 0.102 0.919
## CAR_SQFT -1.874 34.459 -0.054 0.957
## LOG_LANDSQFT 18486.237 12704.402 1.455 0.149
## BTH_QUAL_NUM 11394.507 23091.275 0.493 0.623
## KIT_QUAL_NUM 4729.111 17983.359 0.263 0.793
cat("\n Significance: *** p < 0.001 ** p < 0.01 * p < 0.05\n")
##
## Significance: *** p < 0.001 ** p < 0.01 * p < 0.05
cat("\n COLLINEARITY SUMMARY (VIF)\n")
##
## COLLINEARITY SUMMARY (VIF)
cat(" ─────────────────────────────────────────────────────────────────────\n")
## ─────────────────────────────────────────────────────────────────────
print(round(vif_vals, 2))
## TOTAL_LIVING_SQFT UNFIN_BSMT_SQFT FN_BSMT_SQFT GRADE_NUM
## 2.45 1.77 3.92 4.56
## COND_NUM AGE TOTAL_BATH_EQUIV CAR_SQFT
## 10.27 4.79 3.73 1.33
## LOG_LANDSQFT BTH_QUAL_NUM KIT_QUAL_NUM
## 8.40 6.04 3.67
cat("\n")