The Ames Housing dataset is a well-known dataset for predictive modeling and regression tasks, particularly in machine learning and data science.
It contains detailed information about residential homes in Ames, Iowa, collected by the Ames Assessor's Office, and includes 2,930 observations (rows) and 82 variables (columns) describing aspects of the homes, such as:

- Lot and location: LotArea, Neighborhood, YearBuilt, etc.
- Rooms: BedroomAbvGr, KitchenAbvGr, TotRmsAbvGrd, etc.
- Basement and garage: BsmtQual, GarageCars, GarageArea, etc.
- Quality ratings: OverallQual, ExterQual, KitchenQual, etc.
- Sale information: SaleType, SaleCondition, and SalePrice (the target variable for regression)

This dataset is available on Kaggle.
# Load necessary libraries
library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr 1.1.4 ✔ readr 2.1.5
## ✔ forcats 1.0.0 ✔ stringr 1.5.1
## ✔ ggplot2 3.5.1 ✔ tibble 3.2.1
## ✔ lubridate 1.9.3 ✔ tidyr 1.3.1
## ✔ purrr 1.0.2
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(caret)
## Loading required package: lattice
##
## Attaching package: 'caret'
##
## The following object is masked from 'package:purrr':
##
## lift
library(car)
## Loading required package: carData
##
## Attaching package: 'car'
##
## The following object is masked from 'package:dplyr':
##
## recode
##
## The following object is masked from 'package:purrr':
##
## some
library(corrplot)
## corrplot 0.94 loaded
# Step 1: Load the dataset
ames_data <- read.csv("AmesHousing.csv")
# Step 2: Explore the dataset
str(ames_data)
## 'data.frame': 2930 obs. of 82 variables:
## $ Order : int 1 2 3 4 5 6 7 8 9 10 ...
## $ PID : int 526301100 526350040 526351010 526353030 527105010 527105030 527127150 527145080 527146030 527162130 ...
## $ MS.SubClass : int 20 20 20 20 60 60 120 120 120 60 ...
## $ MS.Zoning : chr "RL" "RH" "RL" "RL" ...
## $ Lot.Frontage : int 141 80 81 93 74 78 41 43 39 60 ...
## $ Lot.Area : int 31770 11622 14267 11160 13830 9978 4920 5005 5389 7500 ...
## $ Street : chr "Pave" "Pave" "Pave" "Pave" ...
## $ Alley : chr NA NA NA NA ...
## $ Lot.Shape : chr "IR1" "Reg" "IR1" "Reg" ...
## $ Land.Contour : chr "Lvl" "Lvl" "Lvl" "Lvl" ...
## $ Utilities : chr "AllPub" "AllPub" "AllPub" "AllPub" ...
## $ Lot.Config : chr "Corner" "Inside" "Corner" "Corner" ...
## $ Land.Slope : chr "Gtl" "Gtl" "Gtl" "Gtl" ...
## $ Neighborhood : chr "NAmes" "NAmes" "NAmes" "NAmes" ...
## $ Condition.1 : chr "Norm" "Feedr" "Norm" "Norm" ...
## $ Condition.2 : chr "Norm" "Norm" "Norm" "Norm" ...
## $ Bldg.Type : chr "1Fam" "1Fam" "1Fam" "1Fam" ...
## $ House.Style : chr "1Story" "1Story" "1Story" "1Story" ...
## $ Overall.Qual : int 6 5 6 7 5 6 8 8 8 7 ...
## $ Overall.Cond : int 5 6 6 5 5 6 5 5 5 5 ...
## $ Year.Built : int 1960 1961 1958 1968 1997 1998 2001 1992 1995 1999 ...
## $ Year.Remod.Add : int 1960 1961 1958 1968 1998 1998 2001 1992 1996 1999 ...
## $ Roof.Style : chr "Hip" "Gable" "Hip" "Hip" ...
## $ Roof.Matl : chr "CompShg" "CompShg" "CompShg" "CompShg" ...
## $ Exterior.1st : chr "BrkFace" "VinylSd" "Wd Sdng" "BrkFace" ...
## $ Exterior.2nd : chr "Plywood" "VinylSd" "Wd Sdng" "BrkFace" ...
## $ Mas.Vnr.Type : chr "Stone" "None" "BrkFace" "None" ...
## $ Mas.Vnr.Area : int 112 0 108 0 0 20 0 0 0 0 ...
## $ Exter.Qual : chr "TA" "TA" "TA" "Gd" ...
## $ Exter.Cond : chr "TA" "TA" "TA" "TA" ...
## $ Foundation : chr "CBlock" "CBlock" "CBlock" "CBlock" ...
## $ Bsmt.Qual : chr "TA" "TA" "TA" "TA" ...
## $ Bsmt.Cond : chr "Gd" "TA" "TA" "TA" ...
## $ Bsmt.Exposure : chr "Gd" "No" "No" "No" ...
## $ BsmtFin.Type.1 : chr "BLQ" "Rec" "ALQ" "ALQ" ...
## $ BsmtFin.SF.1 : int 639 468 923 1065 791 602 616 263 1180 0 ...
## $ BsmtFin.Type.2 : chr "Unf" "LwQ" "Unf" "Unf" ...
## $ BsmtFin.SF.2 : int 0 144 0 0 0 0 0 0 0 0 ...
## $ Bsmt.Unf.SF : int 441 270 406 1045 137 324 722 1017 415 994 ...
## $ Total.Bsmt.SF : int 1080 882 1329 2110 928 926 1338 1280 1595 994 ...
## $ Heating : chr "GasA" "GasA" "GasA" "GasA" ...
## $ Heating.QC : chr "Fa" "TA" "TA" "Ex" ...
## $ Central.Air : chr "Y" "Y" "Y" "Y" ...
## $ Electrical : chr "SBrkr" "SBrkr" "SBrkr" "SBrkr" ...
## $ X1st.Flr.SF : int 1656 896 1329 2110 928 926 1338 1280 1616 1028 ...
## $ X2nd.Flr.SF : int 0 0 0 0 701 678 0 0 0 776 ...
## $ Low.Qual.Fin.SF: int 0 0 0 0 0 0 0 0 0 0 ...
## $ Gr.Liv.Area : int 1656 896 1329 2110 1629 1604 1338 1280 1616 1804 ...
## $ Bsmt.Full.Bath : int 1 0 0 1 0 0 1 0 1 0 ...
## $ Bsmt.Half.Bath : int 0 0 0 0 0 0 0 0 0 0 ...
## $ Full.Bath : int 1 1 1 2 2 2 2 2 2 2 ...
## $ Half.Bath : int 0 0 1 1 1 1 0 0 0 1 ...
## $ Bedroom.AbvGr : int 3 2 3 3 3 3 2 2 2 3 ...
## $ Kitchen.AbvGr : int 1 1 1 1 1 1 1 1 1 1 ...
## $ Kitchen.Qual : chr "TA" "TA" "Gd" "Ex" ...
## $ TotRms.AbvGrd : int 7 5 6 8 6 7 6 5 5 7 ...
## $ Functional : chr "Typ" "Typ" "Typ" "Typ" ...
## $ Fireplaces : int 2 0 0 2 1 1 0 0 1 1 ...
## $ Fireplace.Qu : chr "Gd" NA NA "TA" ...
## $ Garage.Type : chr "Attchd" "Attchd" "Attchd" "Attchd" ...
## $ Garage.Yr.Blt : int 1960 1961 1958 1968 1997 1998 2001 1992 1995 1999 ...
## $ Garage.Finish : chr "Fin" "Unf" "Unf" "Fin" ...
## $ Garage.Cars : int 2 1 1 2 2 2 2 2 2 2 ...
## $ Garage.Area : int 528 730 312 522 482 470 582 506 608 442 ...
## $ Garage.Qual : chr "TA" "TA" "TA" "TA" ...
## $ Garage.Cond : chr "TA" "TA" "TA" "TA" ...
## $ Paved.Drive : chr "P" "Y" "Y" "Y" ...
## $ Wood.Deck.SF : int 210 140 393 0 212 360 0 0 237 140 ...
## $ Open.Porch.SF : int 62 0 36 0 34 36 0 82 152 60 ...
## $ Enclosed.Porch : int 0 0 0 0 0 0 170 0 0 0 ...
## $ X3Ssn.Porch : int 0 0 0 0 0 0 0 0 0 0 ...
## $ Screen.Porch : int 0 120 0 0 0 0 0 144 0 0 ...
## $ Pool.Area : int 0 0 0 0 0 0 0 0 0 0 ...
## $ Pool.QC : chr NA NA NA NA ...
## $ Fence : chr NA "MnPrv" NA NA ...
## $ Misc.Feature : chr NA NA "Gar2" NA ...
## $ Misc.Val : int 0 0 12500 0 0 0 0 0 0 0 ...
## $ Mo.Sold : int 5 6 6 4 3 6 4 1 3 6 ...
## $ Yr.Sold : int 2010 2010 2010 2010 2010 2010 2010 2010 2010 2010 ...
## $ Sale.Type : chr "WD " "WD " "WD " "WD " ...
## $ Sale.Condition : chr "Normal" "Normal" "Normal" "Normal" ...
## $ SalePrice : int 215000 105000 172000 244000 189900 195500 213500 191500 236500 189000 ...
# ---- Install required packages only if not already installed ----
required_packages <- c("naniar", "VIM", "ggplot2")
for (pkg in required_packages) {
if (!requireNamespace(pkg, quietly = TRUE)) {
install.packages(pkg)
}
}
# ---- Load libraries ----
library(naniar)
library(VIM)
## Loading required package: colorspace
## Loading required package: grid
## VIM is ready to use.
## Suggestions and bug-reports can be submitted at: https://github.com/statistikat/VIM/issues
##
## Attaching package: 'VIM'
## The following object is masked from 'package:datasets':
##
## sleep
library(ggplot2)
The bar plot titled "Missing Values per Variable in Ames Dataset" (generated below) shows that while most variables in the dataset have few to no missing values, a small subset of variables exhibits a high concentration of missing data, with some approaching or exceeding 2,500 missing entries. Key observations follow the plots below.
# A. Barplot of Missing Values per Variable
gg_miss_var(ames_data) +
labs(title = "Missing Values per Variable in Ames Dataset")
# B. Missingness Matrix
invisible(VIM::aggr(ames_data,
numbers = TRUE,
sortVars = TRUE,
labels = names(ames_data),
cex.axis = 0.7,
gap = 3,
ylab = c("Missing Data Pattern", "Count")))
## Warning in plot.aggr(res, ...): not enough vertical space to display
## frequencies (too many combinations)
##
## Variables sorted by number of missings:
## Variable Count
## Pool.QC 0.9955631399
## Misc.Feature 0.9638225256
## Alley 0.9324232082
## Fence 0.8047781570
## Fireplace.Qu 0.4853242321
## Lot.Frontage 0.1672354949
## Garage.Yr.Blt 0.0542662116
## Garage.Qual 0.0539249147
## Garage.Cond 0.0539249147
## Garage.Type 0.0535836177
## Garage.Finish 0.0535836177
## Bsmt.Qual 0.0269624573
## Bsmt.Cond 0.0269624573
## Bsmt.Exposure 0.0269624573
## BsmtFin.Type.1 0.0269624573
## BsmtFin.Type.2 0.0269624573
## Mas.Vnr.Area 0.0078498294
## Bsmt.Full.Bath 0.0006825939
## Bsmt.Half.Bath 0.0006825939
## BsmtFin.SF.1 0.0003412969
## BsmtFin.SF.2 0.0003412969
## Bsmt.Unf.SF 0.0003412969
## Total.Bsmt.SF 0.0003412969
## Garage.Cars 0.0003412969
## Garage.Area 0.0003412969
## Order 0.0000000000
## PID 0.0000000000
## MS.SubClass 0.0000000000
## MS.Zoning 0.0000000000
## Lot.Area 0.0000000000
## Street 0.0000000000
## Lot.Shape 0.0000000000
## Land.Contour 0.0000000000
## Utilities 0.0000000000
## Lot.Config 0.0000000000
## Land.Slope 0.0000000000
## Neighborhood 0.0000000000
## Condition.1 0.0000000000
## Condition.2 0.0000000000
## Bldg.Type 0.0000000000
## House.Style 0.0000000000
## Overall.Qual 0.0000000000
## Overall.Cond 0.0000000000
## Year.Built 0.0000000000
## Year.Remod.Add 0.0000000000
## Roof.Style 0.0000000000
## Roof.Matl 0.0000000000
## Exterior.1st 0.0000000000
## Exterior.2nd 0.0000000000
## Mas.Vnr.Type 0.0000000000
## Exter.Qual 0.0000000000
## Exter.Cond 0.0000000000
## Foundation 0.0000000000
## Heating 0.0000000000
## Heating.QC 0.0000000000
## Central.Air 0.0000000000
## Electrical 0.0000000000
## X1st.Flr.SF 0.0000000000
## X2nd.Flr.SF 0.0000000000
## Low.Qual.Fin.SF 0.0000000000
## Gr.Liv.Area 0.0000000000
## Full.Bath 0.0000000000
## Half.Bath 0.0000000000
## Bedroom.AbvGr 0.0000000000
## Kitchen.AbvGr 0.0000000000
## Kitchen.Qual 0.0000000000
## TotRms.AbvGrd 0.0000000000
## Functional 0.0000000000
## Fireplaces 0.0000000000
## Paved.Drive 0.0000000000
## Wood.Deck.SF 0.0000000000
## Open.Porch.SF 0.0000000000
## Enclosed.Porch 0.0000000000
## X3Ssn.Porch 0.0000000000
## Screen.Porch 0.0000000000
## Pool.Area 0.0000000000
## Misc.Val 0.0000000000
## Mo.Sold 0.0000000000
## Yr.Sold 0.0000000000
## Sale.Type 0.0000000000
## Sale.Condition 0.0000000000
## SalePrice 0.0000000000
This section interprets the missingness bar plot and the missingness matrix (e.g., for variables such as Garage.Qual). The visualization shows the missing data pattern for selected variables in the Ames Housing dataset using two panels:

Left Panel: missing data proportion per variable
Right Panel: missingness pattern heatmap (combinations of missing values across observations)
Interpretation:
library(naniar)
library(dplyr)
library(ggplot2)
# Retain Neighborhood + only variables with missingness
ames_missing_only <- ames_data %>%
dplyr::select(Neighborhood, where(~ any(is.na(.))))
# Plot missingness by Neighborhood
gg_miss_fct(ames_missing_only, fct = Neighborhood) +
labs(title = "Missing Values by Neighborhood (Only Variables with Missingness)")
Interpretation: Missing Values by Neighborhood

- Top missing variables (Pool.QC, Misc.Feature, Alley, Fence, Fireplace.Qu) show near-universal or clustered missingness across all neighborhoods, confirming the decision to discard them.
- Lot.Frontage shows variable missingness by neighborhood, supporting group-wise imputation with a fallback (e.g., 0).
- Garage and basement variables (Garage.Type, Bsmt.Qual, etc.) show patchy missingness, often clustered by neighborhood; this is likely structural (e.g., homes without garages or basements).
- Some neighborhoods (e.g., Edwards, NAmes, OldTown) show higher missingness in multiple variables, which may reflect data entry gaps or distinct housing types.

Takeaway: this validates the planned imputation strategy.
ggplot(ames_data, aes(x = is.na(Garage.Yr.Blt), y = Garage.Area)) +
geom_boxplot() + labs(title = "Garage Area vs Missing Garage Year")
## Warning: Removed 1 row containing non-finite outside the scale range
## (`stat_boxplot()`).
Interpretation: Garage Area vs Missing Garage Year

The boxplot shows that when Garage.Yr.Blt is missing (TRUE), the corresponding Garage.Area is near zero. In contrast, when Garage.Yr.Blt is present (FALSE), Garage.Area values are substantial and widely distributed.

Conclusion: missing Garage.Yr.Blt likely indicates that the house has no garage. This supports imputing missing Garage.Yr.Blt values with 0, rather than the median, to reflect absence.
library(naniar)
ames_data %>%
dplyr::select(Garage.Qual, Garage.Cond, Garage.Type) %>%
gg_miss_upset()
Interpretation: Missingness Overlap in Garage Variables

The upset plot shows that missing values co-occur across Garage.Type, Garage.Qual, and Garage.Cond, with only a small number of rows missing Garage.Qual and Garage.Cond but not Garage.Type.

Conclusion: garage-related missingness is tightly linked, suggesting these NAs are structural, likely representing homes without garages. This supports imputing all related missing garage variables consistently, e.g., with "None" for the categorical fields and 0 for numeric fields such as Garage.Yr.Blt and Garage.Area.

ggplot(ames_data, aes(x = is.na(Bsmt.Exposure), y = SalePrice)) + geom_boxplot()
Interpretation: SalePrice vs Missingness in Bsmt.Exposure

Homes with observed Bsmt.Exposure (FALSE) have a higher median SalePrice and a wider price range, while homes with missing Bsmt.Exposure (TRUE) show a lower median SalePrice and fewer high-value outliers.

Conclusion: missing Bsmt.Exposure likely indicates absence of a basement, which is associated with lower property value. This supports imputing missing values with "None" to reflect no exposure rather than random omission.
ggplot(ames_data, aes(x = Mas.Vnr.Area)) + geom_histogram(na.rm = TRUE)
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
Interpretation: Distribution of Mas.Vnr.Area (Masonry Veneer Area)

The distribution is dominated by zeros (no veneer) with a long right tail of larger veneer areas.

Conclusion: missing Mas.Vnr.Area values likely indicate absence of masonry veneer. Imputing with 0 is appropriate, but if a numeric fill is required for modeling purposes, median imputation is also acceptable due to skewness.
Basement square-footage variables with isolated missing values are imputed with 0 (no basement likely).

The histogram of Lot.Frontage reveals a highly right-skewed distribution, with most values clustered between 50 and 100 feet. This supports the use of median imputation, particularly within neighborhoods where housing layouts are likely similar. The few extreme values further justify avoiding mean imputation to reduce the impact of outliers.
# Histogram of Lot.Frontage (ignoring NAs)
ggplot(ames_data, aes(x = Lot.Frontage)) +
geom_histogram(binwidth = 5, fill = "steelblue", color = "black", na.rm = TRUE) +
labs(title = "Distribution of Lot Frontage", x = "Lot Frontage", y = "Count")
###### Sale Price by Basement Exposure
The boxplot shows a clear difference in SalePrice across categories of Bsmt.Exposure. Notably, homes with missing values (NA) for basement exposure have significantly lower median sale prices. This suggests the missingness is not random, but likely reflects the absence of a basement. Therefore, imputing missing Bsmt.Exposure values with "None" accurately preserves the structural meaning and avoids inflating property valuations.
# Boxplot of SalePrice by Bsmt.Exposure (ignoring rows with NA in Bsmt.Exposure)
ggplot(ames_data, aes(x = Bsmt.Exposure, y = SalePrice)) +
geom_boxplot(na.rm = TRUE) +
labs(title = "Sale Price by Basement Exposure", x = "Bsmt Exposure", y = "Sale Price")
# Load necessary libraries
library(kableExtra)
##
## Attaching package: 'kableExtra'
## The following object is masked from 'package:dplyr':
##
## group_rows
library(dplyr)
library(tidyr)
# Compute missing value summary
missing_summary <- ames_data %>%
summarise(across(
everything(),
list(
Missing = ~sum(is.na(.)),
PercentMissing = ~round(mean(is.na(.)) * 100, 2)
),
.names = "{.col}_{.fn}"
))
# Pivot, filter, sort, and annotate with usability and imputation method
missing_summary_long <- missing_summary %>%
pivot_longer(
cols = everything(),
names_to = c("Variable", ".value"),
names_pattern = "^(.*)_(.*)$"
) %>%
filter(Missing > 0) %>%
arrange(desc(PercentMissing)) %>%
mutate(
Usability = ifelse(PercentMissing > 40, "discard", "keep"),
ImputationMethod = case_when(
Usability == "discard" ~ NA_character_,
Variable == "LotFrontage" ~ "Median by Neighborhood (fallback = 0)",
Variable == "Garage.Yr.Blt" ~ "0 (No Garage)",
Variable %in% c("Garage.Type", "Garage.Qual", "Garage.Cond") ~ '"None"',
Variable == "Garage.Finish" ~ "Mode",
Variable %in% c("Bsmt.Qual", "Bsmt.Cond", "Bsmt.Exposure", "BsmtFin.Type.1", "BsmtFin.Type.2") ~ '"None"',
Variable == "Mas.Vnr.Area" ~ "0 or Median (if needed)",
Variable %in% c("Bsmt.Full.Bath", "Bsmt.Half.Bath") ~ "Mode",
Variable %in% c("BsmtFin.SF.1", "BsmtFin.SF.2", "Bsmt.Unf.SF", "Total.Bsmt.SF") ~ "0",
Variable == "Garage.Cars" ~ "Mode",
Variable == "Garage.Area" ~ "Median",
TRUE ~ "Check manually"
)
)
# Display final table
missing_summary_long %>%
kable(caption = "Summary of Variables with Missing Counts, Percentages, Usability, and Imputation Method") %>%
kable_styling(bootstrap_options = c("striped", "hover", "condensed"), full_width = FALSE)
Variable | Missing | PercentMissing | Usability | ImputationMethod |
---|---|---|---|---|
Pool.QC | 2917 | 99.56 | discard | NA |
Misc.Feature | 2824 | 96.38 | discard | NA |
Alley | 2732 | 93.24 | discard | NA |
Fence | 2358 | 80.48 | discard | NA |
Fireplace.Qu | 1422 | 48.53 | discard | NA |
Lot.Frontage | 490 | 16.72 | keep | Median by Neighborhood (fallback = 0) |
Garage.Yr.Blt | 159 | 5.43 | keep | 0 (No Garage) |
Garage.Qual | 158 | 5.39 | keep | “None” |
Garage.Cond | 158 | 5.39 | keep | “None” |
Garage.Type | 157 | 5.36 | keep | “None” |
Garage.Finish | 157 | 5.36 | keep | Mode |
Bsmt.Qual | 79 | 2.70 | keep | “None” |
Bsmt.Cond | 79 | 2.70 | keep | “None” |
Bsmt.Exposure | 79 | 2.70 | keep | “None” |
BsmtFin.Type.1 | 79 | 2.70 | keep | “None” |
BsmtFin.Type.2 | 79 | 2.70 | keep | “None” |
Mas.Vnr.Area | 23 | 0.78 | keep | 0 or Median (if needed) |
Bsmt.Full.Bath | 2 | 0.07 | keep | Mode |
Bsmt.Half.Bath | 2 | 0.07 | keep | Mode |
BsmtFin.SF.1 | 1 | 0.03 | keep | 0 |
BsmtFin.SF.2 | 1 | 0.03 | keep | 0 |
Bsmt.Unf.SF | 1 | 0.03 | keep | 0 |
Total.Bsmt.SF | 1 | 0.03 | keep | 0 |
Garage.Cars | 1 | 0.03 | keep | Mode |
Garage.Area | 1 | 0.03 | keep | Median |
# Load required libraries
library(dplyr)
# Preserve original data
ames_data_imputed <- ames_data
# 0. Drop variables with excessive missingness
ames_data_imputed <- ames_data_imputed %>%
dplyr::select(-c(Pool.QC, Misc.Feature, Alley, Fence, Fireplace.Qu))
# 1. Impute Lot.Frontage using median by Neighborhood, fallback = 0
ames_data_imputed <- ames_data_imputed %>%
group_by(Neighborhood) %>%
mutate(Lot.Frontage = ifelse(
is.na(Lot.Frontage),
median(Lot.Frontage, na.rm = TRUE),
Lot.Frontage
)) %>%
ungroup()
ames_data_imputed$Lot.Frontage[is.na(ames_data_imputed$Lot.Frontage)] <- 0 # Fallback
# 2. Impute Garage.Yr.Blt with 0 (indicates no garage)
ames_data_imputed$Garage.Yr.Blt[is.na(ames_data_imputed$Garage.Yr.Blt)] <- 0
# 3. Mode function
get_mode <- function(x) {
ux <- na.omit(unique(x))
ux[which.max(tabulate(match(x, ux)))]
}
# 4. Impute Garage.* categorical vars with "None"
garage_none_vars <- c("Garage.Qual", "Garage.Cond", "Garage.Type")
for (var in garage_none_vars) {
ames_data_imputed[[var]][is.na(ames_data_imputed[[var]])] <- "None"
}
# 5. Impute Garage.Finish with mode
ames_data_imputed$Garage.Finish[is.na(ames_data_imputed$Garage.Finish)] <- get_mode(ames_data_imputed$Garage.Finish)
# 6. Impute Basement categorical vars with "None"
bsmt_none_vars <- c("Bsmt.Qual", "Bsmt.Cond", "Bsmt.Exposure", "BsmtFin.Type.1", "BsmtFin.Type.2")
for (var in bsmt_none_vars) {
ames_data_imputed[[var]][is.na(ames_data_imputed[[var]])] <- "None"
}
# 7. Impute Mas.Vnr.Area with 0
ames_data_imputed$Mas.Vnr.Area[is.na(ames_data_imputed$Mas.Vnr.Area)] <- 0
# 8. Impute Bsmt.Full.Bath and Bsmt.Half.Bath with mode
ames_data_imputed$Bsmt.Full.Bath[is.na(ames_data_imputed$Bsmt.Full.Bath)] <- get_mode(ames_data_imputed$Bsmt.Full.Bath)
ames_data_imputed$Bsmt.Half.Bath[is.na(ames_data_imputed$Bsmt.Half.Bath)] <- get_mode(ames_data_imputed$Bsmt.Half.Bath)
# 9. Impute Basement SF vars with 0
bsmt_sf_vars <- c("BsmtFin.SF.1", "BsmtFin.SF.2", "Bsmt.Unf.SF", "Total.Bsmt.SF")
for (var in bsmt_sf_vars) {
ames_data_imputed[[var]][is.na(ames_data_imputed[[var]])] <- 0
}
# 10. Impute Garage.Cars with mode
ames_data_imputed$Garage.Cars[is.na(ames_data_imputed$Garage.Cars)] <- get_mode(ames_data_imputed$Garage.Cars)
# 11. Impute Garage.Area with median
ames_data_imputed$Garage.Area[is.na(ames_data_imputed$Garage.Area)] <- median(ames_data_imputed$Garage.Area, na.rm = TRUE)
total_missing <- sum(is.na(ames_data_imputed))
if (total_missing == 0) {
message("No missing values remain in the dataset.")
} else {
message("Remaining missing values: ", total_missing)
}
## No missing values remain in the dataset.
Next, we explore the relationships between the numeric predictors and the target (scatterplots of SalePrice vs Gr.Liv.Area, Total.Bsmt.SF, etc.) and rank the numeric variables by their correlation with SalePrice.

library(dplyr)
library(corrplot)
library(stats)
# 1. Select numeric columns
numeric_data <- ames_data_imputed %>%
dplyr::select(where(is.numeric))
# 2. Compute correlation matrix
cor_matrix <- cor(numeric_data, use = "pairwise.complete.obs")
# 3. Plot correlation heatmap with smaller tick mark labels
corrplot(cor_matrix,
method = "circle",
type = "upper",
tl.cex = 0.6, # Smaller font for tick labels
tl.col = "black", # Text color
col = colorRampPalette(c("red", "white", "blue"))(200),
title = "Correlation Matrix of Numeric Variables",
mar = c(0, 0, 1, 0))
Interpretation:

The correlation matrix reveals that several numeric variables are strongly associated with SalePrice, including Overall.Qual, Gr.Liv.Area, Garage.Cars, and Total.Bsmt.SF. These features show positive correlations above 0.6, indicating that higher values in these variables tend to align with higher home prices. Meanwhile, many other variables show weak or negligible correlations, underscoring the importance of feature selection in model development.
library(dplyr)
library(ggplot2)
# Step 1: Select only numeric columns
numeric_data <- ames_data_imputed %>%
dplyr::select(where(is.numeric))
# Step 2: Compute correlation matrix
cor_matrix <- cor(numeric_data, use = "pairwise.complete.obs")
# Step 3: Extract correlations with SalePrice
saleprice_corr <- cor_matrix[, "SalePrice"] %>%
sort(decreasing = TRUE) %>%
.[names(.) != "SalePrice"] # Exclude self-correlation
# Step 4: Take top N most correlated variables (e.g., top 15)
top_corr_df <- data.frame(
Variable = names(saleprice_corr)[1:15],
Correlation = as.numeric(saleprice_corr[1:15])
)
# Step 5: Reorder for plotting
top_corr_df <- top_corr_df %>%
mutate(Variable = factor(Variable, levels = rev(Variable)))
# Step 6: Plot the flipped bar chart
ggplot(top_corr_df, aes(x = Variable, y = Correlation)) +
geom_col(fill = "black", alpha = 0.7) +
coord_flip() +
labs(
title = "Top 15 Variables Most Correlated with SalePrice",
x = "Variable",
y = "Correlation with SalePrice"
) +
theme_minimal()
To make the correlation matrix easier to interpret (especially with
many variables), let’s convert it to a data frame and display the top
correlations with SalePrice
.
# 1. Select numeric columns
numeric_data <- ames_data_imputed %>%
dplyr::select(where(is.numeric))
# 2. Compute correlation matrix
cor_matrix <- cor(numeric_data, use = "pairwise.complete.obs")
# 3. Convert to data frame (correlation with SalePrice only)
cor_df <- as.data.frame(cor_matrix) %>%
dplyr::select(SalePrice) %>%
rownames_to_column(var = "Variable") %>%
filter(Variable != "SalePrice") %>%
arrange(desc(abs(SalePrice)))
# 4. Print top 15 correlations with SalePrice
head(cor_df, 15)
## Variable SalePrice
## 1 Overall.Qual 0.7992618
## 2 Gr.Liv.Area 0.7067799
## 3 Garage.Cars 0.6478115
## 4 Garage.Area 0.6403811
## 5 Total.Bsmt.SF 0.6325288
## 6 X1st.Flr.SF 0.6216761
## 7 Year.Built 0.5584261
## 8 Full.Bath 0.5456039
## 9 Year.Remod.Add 0.5329738
## 10 Mas.Vnr.Area 0.5021960
## 11 TotRms.AbvGrd 0.4954744
## 12 Fireplaces 0.4745581
## 13 BsmtFin.SF.1 0.4331473
## 14 Lot.Frontage 0.3494629
## 15 Wood.Deck.SF 0.3271432
# Load required libraries
library(ggplot2)
library(dplyr)
library(gridExtra)
##
## Attaching package: 'gridExtra'
## The following object is masked from 'package:dplyr':
##
## combine
# Define 12 top variables (already sorted by correlation)
top_vars_extended <- c("Overall.Qual", "Gr.Liv.Area", "Garage.Cars",
"Garage.Area", "Total.Bsmt.SF", "X1st.Flr.SF",
"Year.Built", "Full.Bath", "Year.Remod.Add",
"Garage.Yr.Blt", "Mas.Vnr.Area", "TotRms.AbvGrd")
# Split into two groups of 6
vars_group1 <- top_vars_extended[1:6]
vars_group2 <- top_vars_extended[7:12]
# Function to create scatterplots with warning suppression
create_plots <- function(vars) {
lapply(vars, function(var) {
suppressWarnings(
ggplot(ames_data_imputed, aes_string(x = var, y = "SalePrice")) +
geom_point(alpha = 0.4, color = "steelblue", na.rm = TRUE) +
geom_smooth(method = "lm", se = FALSE, color = "darkred", na.rm = TRUE) +
labs(title = paste("SalePrice vs", var), x = var, y = "SalePrice")
)
})
}
# Create and display first group
plot_list1 <- create_plots(vars_group1)
gridExtra::grid.arrange(grobs = plot_list1, ncol = 3, top = "Top Correlations with SalePrice (1 of 2)")
## `geom_smooth()` using formula = 'y ~ x'
## `geom_smooth()` using formula = 'y ~ x'
## `geom_smooth()` using formula = 'y ~ x'
## `geom_smooth()` using formula = 'y ~ x'
## `geom_smooth()` using formula = 'y ~ x'
## `geom_smooth()` using formula = 'y ~ x'
# Create and display second group
plot_list2 <- create_plots(vars_group2)
gridExtra::grid.arrange(grobs = plot_list2, ncol = 3, top = "Top Correlations with SalePrice (2 of 2)")
## `geom_smooth()` using formula = 'y ~ x'
## `geom_smooth()` using formula = 'y ~ x'
## `geom_smooth()` using formula = 'y ~ x'
## `geom_smooth()` using formula = 'y ~ x'
## `geom_smooth()` using formula = 'y ~ x'
## `geom_smooth()` using formula = 'y ~ x'
Correlation Analysis Summary with Visuals
Based on the correlation matrix and the top
15 variables most correlated with SalePrice
, we
explore both their correlation coefficients and
scatterplot patterns to understand the relationships
more clearly.
Strongest Predictors of SalePrice
Overall.Qual (0.80) The strongest predictor. The scatterplot shows a clear, linear upward trend, confirming that better-rated homes consistently command higher prices.
Gr.Liv.Area (0.71) Strong linear relationship. The scatterplot reveals increased spread at higher values, but the trend remains positive.
Garage.Cars (0.65) and Garage.Area (0.64) Both show positive steps and linearity, indicating value increase with more garage capacity and space.
Total.Bsmt.SF (0.63) and 1st.Flr.SF (0.62) Scatterplots show tight positive trends — more square footage on the main or basement levels correlates strongly with price.
Moderate Predictors
(See Figure 2)
Year.Built (0.56) & Year.Remod.Add (0.53) Recent builds and renovations are linked with higher prices. The scatterplots show a positive slope, especially post-2000.
Full.Bath (0.55) & Garage.Yr.Blt
(0.53) More bathrooms and newer garages boost sale prices,
though the scatterplot for Full.Bath
shows some
plateauing after 2 baths.
Mas.Vnr.Area (0.51) Scatterplot shows a right-skewed pattern with positive slope; houses with significant veneer finish tend to sell for more.
TotRms.AbvGrd (0.50) & Fireplaces (0.47) Scatterplots reflect incremental increases in price with more rooms/fireplaces, though with more variance.
BsmtFin.SF.1 (0.43) & Lot.Frontage (0.36) Moderate influence, with some scatter in their relationships — still relevant for capturing square footage and curb appeal.
Conclusion
The combined correlation data and scatterplot visuals confirm that
home quality, size, and modern
features drive sale prices in the Ames dataset. Top predictors
like Overall.Qual
, Gr.Liv.Area
, and
Garage.Cars
should be prioritized in model building.
Variables with lower correlation (e.g., Lot.Frontage
) may
still be useful when interacted with other features or used in
tree-based models.
Next, we examine how SalePrice varies across categories of the categorical predictors (boxplots of SalePrice vs Exter.Qual, Kitchen.Qual, etc.).

# Required packages
library(dplyr)
library(vcd)
library(reshape2)
##
## Attaching package: 'reshape2'
## The following object is masked from 'package:tidyr':
##
## smiths
library(ggplot2)
# Function definition
plot_cramers_v_heatmap <- function(data) {
# Select only categorical variables (character or factor)
cat_data <- data %>% dplyr::select(where(is.character) | where(is.factor))
cat_names <- names(cat_data)
# Initialize matrix to store Cramér's V values
cramers_mat <- matrix(NA, nrow = length(cat_names), ncol = length(cat_names),
dimnames = list(cat_names, cat_names))
# Compute Cramér's V for each pair
for (i in seq_along(cat_names)) {
for (j in seq_along(cat_names)) {
var1 <- cat_data[[i]]
var2 <- cat_data[[j]]
# Skip if only one level in either variable
if (length(unique(var1)) < 2 || length(unique(var2)) < 2) next
tbl <- table(var1, var2)
suppressWarnings({
assoc <- assocstats(tbl)
cramers_mat[i, j] <- assoc$cramer
})
}
}
# Melt matrix for plotting
cramers_df <- melt(cramers_mat, na.rm = TRUE)
# Plot heatmap
ggplot(cramers_df, aes(x = Var1, y = Var2, fill = value)) +
geom_tile(color = "white") +
scale_fill_gradient2(low = "white", high = "darkblue", midpoint = 0.3,
limit = c(0,1), name = "Cramér’s V") +
theme_minimal() +
theme(axis.text.x = element_text(angle = 90, hjust = 1),
axis.text.y = element_text(size = 9)) +
labs(title = "Cramér’s V Heatmap of Categorical Variables",
x = "", y = "")
}
plot_cramers_v_heatmap(ames_data_imputed)
The Cramér’s V heatmap reveals that most categorical variables in the
Ames Housing dataset exhibit weak to moderate
associations, as indicated by the light shading across the
matrix. Strong associations (Cramér’s V > 0.5) appear in a few
expected clusters—particularly among related variables such as
Garage.Type
, Garage.Finish
, and
Garage.Qual
, and among
BsmtFin.Type.1
, BsmtFin.Type.2
, and
Bsmt.Exposure
—which likely reflect shared
structural characteristics of the properties. These patterns
validate the presence of logical groupings and may inform
feature reduction or interaction modeling in subsequent
analysis steps.
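As one possible follow-up, here is a hedged sketch of how the pairwise Cramér's V values could be used to flag near-redundant categorical pairs; the 0.8 threshold is an arbitrary illustration, not a value used elsewhere in this analysis.

library(dplyr)
library(vcd)

# Sketch only: list categorical pairs with very strong association.
cat_vars <- ames_data_imputed %>%
  dplyr::select(where(is.character) | where(is.factor)) %>%
  names()

high_assoc <- lapply(combn(cat_vars, 2, simplify = FALSE), function(p) {
  x <- ames_data_imputed[[p[1]]]
  y <- ames_data_imputed[[p[2]]]
  if (n_distinct(x) < 2 || n_distinct(y) < 2) return(NULL)   # skip degenerate variables
  v <- suppressWarnings(assocstats(table(x, y))$cramer)
  if (!is.na(v) && v > 0.8) data.frame(Pair = paste(p, collapse = " ~ "),
                                       CramersV = round(v, 2))
}) %>%
  bind_rows()

high_assoc  # candidate pairs where one variable may be redundant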
# Load required libraries
library(ggplot2)
library(dplyr)
library(gridExtra) # for arranging multiple plots
# Select categorical variables with reasonable cardinality (to avoid overcrowding)
cat_vars <- ames_data_imputed %>%
dplyr::select(where(is.character) | where(is.factor)) %>%
dplyr::select_if(~ n_distinct(.) <= 10) %>% # adjust threshold as needed
names()
# Create individual boxplots of SalePrice vs categorical variables
boxplots <- lapply(cat_vars, function(var) {
ggplot(ames_data_imputed, aes_string(x = var, y = "SalePrice")) +
geom_boxplot(outlier.shape = 1, fill = "lightblue", color = "black") +
theme_minimal() +
labs(title = paste("SalePrice vs", var), x = var, y = "SalePrice") +
theme(axis.text.x = element_text(angle = 45, hjust = 1))
})
# Display in grids of 6 per page (2 rows x 3 cols)
n <- length(boxplots)
pages <- split(boxplots, ceiling(seq_along(boxplots) / 6))
# Plot all pages
for (i in seq_along(pages)) {
grid.arrange(grobs = pages[[i]], nrow = 2, ncol = 3,
top = paste("Top Categorical Effects on SalePrice (Page", i, "of", length(pages), ")"))
}
Below is a summary of the top-performing categorical features, based on the boxplots showing their impact on SalePrice:

- Exter.Qual (Exterior Quality): strong positive effect on SalePrice; homes rated Excellent (Ex) show significantly higher median prices.
- Kitchen.Qual (Kitchen Quality): a clear gradient in sale price from Po up to Ex.
- Garage.Finish (Interior Finish of Garage): finished garages (Fin, RFn) tend to command higher prices than Unf or None.
- Bsmt.Exposure (Basement Exposure to Walkout/Daylight): Gd and Av levels are associated with higher prices compared to No or None.
- Fireplace.Qu (Fireplace Quality): higher-quality fireplaces align with higher prices, while NA and Po lag behind.
- Central.Air: homes with central air (Y) show consistently higher sale prices.
- Neighborhood (implied in the full boxplot set): large differences in SalePrice across neighborhoods indicate strong location effects.
- House.Style and Bldg.Type: certain styles (e.g., 2Story, SFoyer) show higher value, suggesting architectural influence on pricing.

library(dplyr)
library(ggplot2)
library(forcats)
# Identify top categorical features (by overall variance in mean SalePrice)
top_cat_vars <- c("Neighborhood", "Exter.Qual", "Kitchen.Qual", "Garage.Finish", "Bsmt.Qual", "Heating.QC")
# Melt the data and calculate mean SalePrice per level
cat_summary <- lapply(top_cat_vars, function(var) {
ames_data_imputed %>%
group_by_at(var) %>%
summarise(mean_price = mean(SalePrice, na.rm = TRUE)) %>%
mutate(Variable = var, Level = .[[var]]) %>%
dplyr::select(Variable, Level, mean_price)
}) %>%
bind_rows()
# Plot
ggplot(cat_summary, aes(x = fct_reorder(Level, mean_price), y = mean_price, fill = Variable)) +
geom_col(show.legend = FALSE) +
facet_wrap(~ Variable, scales = "free") +
coord_flip() +
scale_y_continuous(labels = scales::label_dollar(scale = 1/1000, suffix = "K")) +
labs(
title = "Average SalePrice by Top Categorical Feature Levels",
x = "Category Level",
y = "Mean SalePrice"
) +
theme_minimal()
Interpretation:
Categorical features like Exter.Qual
,
Kitchen.Qual
, and Bsmt.Qual
show strong
positive relationships with SalePrice
, with premium quality
levels (e.g., “Ex” and “Gd”) consistently associated with significantly
higher average home prices. Neighborhood also shows clear
stratification, reflecting location-based pricing trends.
# Load necessary libraries
library(kableExtra)
library(dplyr)
library(tidyr)
library(moments) # For skewness and kurtosis
# Identify numeric variables
numeric_vars <- names(ames_data_imputed)[sapply(ames_data_imputed, is.numeric)]
# Select numeric columns
numerical_vars <- ames_data_imputed %>%
dplyr::select(all_of(numeric_vars))
# Compute summary statistics without missingness
statistical_summary <- numerical_vars %>%
summarise(across(
everything(),
list(
Min = ~round(min(.), 2),
Q1 = ~round(quantile(., 0.25), 2),
Mean = ~round(mean(.), 2),
Median = ~round(median(.), 2),
Q3 = ~round(quantile(., 0.75), 2),
Max = ~round(max(.), 2),
SD = ~round(sd(.), 2),
Variance = ~round(var(.), 2),
CV = ~round(sd(.) / mean(.), 2),
Skewness = ~round(skewness(.), 2),
Kurtosis = ~round(kurtosis(.), 2),
ZeroCount = ~sum(. == 0),
ZeroProportion = ~round(mean(. == 0) * 100, 2),
Overdispersion = ~round(var(.) / mean(.), 2),
Outliers = ~sum(. > (quantile(., 0.75) + 1.5 * IQR(.)) |
. < (quantile(., 0.25) - 1.5 * IQR(.)))
),
.names = "{.col}_{.fn}"
)) %>%
pivot_longer(
cols = everything(),
names_to = c("Variable", ".value"),
names_pattern = "^(.*)_(.*)$"
)
# Final rounding (ensures all values are 2 decimal places)
statistical_summary <- statistical_summary %>%
mutate(across(where(is.numeric), ~ round(., 2)))
# Display the summary table
statistical_summary %>%
kable(caption = "Descriptive Summary of Numeric Variables (Post-Imputation)") %>%
kable_styling(bootstrap_options = c("striped", "hover", "condensed"), full_width = FALSE)
Variable | Min | Q1 | Mean | Median | Q3 | Max | SD | Variance | CV | Skewness | Kurtosis | ZeroCount | ZeroProportion | Overdispersion | Outliers |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
Order | 1 | 7.33250e+02 | 1465.50 | 1465.5 | 2197.75 | 2930 | 845.96 | 7.156525e+05 | 0.58 | 0.00 | 1.80 | 0 | 0.00 | 488.33 | 0 |
PID | 526301100 | 5.28477e+08 | 714464496.99 | 535453620.0 | 907181097.50 | 1007100110 | 188730844.65 | 3.561933e+16 | 0.26 | 0.06 | 1.01 | 0 | 0.00 | 49854586.02 | 0 |
MS.SubClass | 20 | 2.00000e+01 | 57.39 | 50.0 | 70.00 | 190 | 42.64 | 1.818000e+03 | 0.74 | 1.36 | 4.38 | 0 | 0.00 | 31.68 | 208 |
Lot.Frontage | 0 | 6.00000e+01 | 69.39 | 70.0 | 80.00 | 313 | 21.83 | 4.766300e+02 | 0.31 | 1.45 | 15.64 | 3 | 0.10 | 6.87 | 209 |
Lot.Area | 1300 | 7.44025e+03 | 10147.92 | 9436.5 | 11555.25 | 215245 | 7880.02 | 6.209468e+07 | 0.78 | 12.81 | 267.57 | 0 | 0.00 | 6118.96 | 127 |
Overall.Qual | 1 | 5.00000e+00 | 6.09 | 6.0 | 7.00 | 10 | 1.41 | 1.990000e+00 | 0.23 | 0.19 | 3.05 | 0 | 0.00 | 0.33 | 4 |
Overall.Cond | 1 | 5.00000e+00 | 5.56 | 5.0 | 6.00 | 9 | 1.11 | 1.240000e+00 | 0.20 | 0.57 | 4.49 | 0 | 0.00 | 0.22 | 252 |
Year.Built | 1872 | 1.95400e+03 | 1971.36 | 1973.0 | 2001.00 | 2010 | 30.25 | 9.147800e+02 | 0.02 | -0.60 | 2.50 | 0 | 0.00 | 0.46 | 9 |
Year.Remod.Add | 1950 | 1.96500e+03 | 1984.27 | 1993.0 | 2004.00 | 2010 | 20.86 | 4.351500e+02 | 0.01 | -0.45 | 1.66 | 0 | 0.00 | 0.22 | 0 |
Mas.Vnr.Area | 0 | 0.00000e+00 | 101.10 | 0.0 | 162.75 | 1600 | 178.63 | 3.191030e+04 | 1.77 | 2.62 | 12.35 | 1771 | 60.44 | 315.64 | 203 |
BsmtFin.SF.1 | 0 | 0.00000e+00 | 442.48 | 370.0 | 734.00 | 5644 | 455.59 | 2.075590e+05 | 1.03 | 1.42 | 9.84 | 931 | 31.77 | 469.08 | 15 |
BsmtFin.SF.2 | 0 | 0.00000e+00 | 49.71 | 0.0 | 0.00 | 1526 | 169.14 | 2.860905e+04 | 3.40 | 4.14 | 21.76 | 2579 | 88.02 | 575.57 | 351 |
Bsmt.Unf.SF | 0 | 2.19000e+02 | 559.07 | 465.5 | 801.75 | 2336 | 439.54 | 1.931959e+05 | 0.79 | 0.92 | 3.41 | 245 | 8.36 | 345.57 | 56 |
Total.Bsmt.SF | 0 | 7.93000e+02 | 1051.26 | 990.0 | 1301.50 | 6110 | 440.97 | 1.944528e+05 | 0.42 | 1.15 | 12.09 | 80 | 2.73 | 184.97 | 124 |
X1st.Flr.SF | 334 | 8.76250e+02 | 1159.56 | 1084.0 | 1384.00 | 5095 | 391.89 | 1.535785e+05 | 0.34 | 1.47 | 9.95 | 0 | 0.00 | 132.45 | 43 |
X2nd.Flr.SF | 0 | 0.00000e+00 | 335.46 | 0.0 | 703.75 | 2065 | 428.40 | 1.835229e+05 | 1.28 | 0.87 | 2.58 | 1678 | 57.27 | 547.08 | 8 |
Low.Qual.Fin.SF | 0 | 0.00000e+00 | 4.68 | 0.0 | 0.00 | 1064 | 46.31 | 2.144660e+03 | 9.90 | 12.11 | 178.31 | 2890 | 98.63 | 458.58 | 40 |
Gr.Liv.Area | 334 | 1.12600e+03 | 1499.69 | 1442.0 | 1742.75 | 5642 | 505.51 | 2.555392e+05 | 0.34 | 1.27 | 7.13 | 0 | 0.00 | 170.39 | 75 |
Bsmt.Full.Bath | 0 | 0.00000e+00 | 0.43 | 0.0 | 1.00 | 3 | 0.52 | 2.800000e-01 | 1.22 | 0.62 | 2.25 | 1709 | 58.33 | 0.64 | 2 |
Bsmt.Half.Bath | 0 | 0.00000e+00 | 0.06 | 0.0 | 0.00 | 2 | 0.25 | 6.000000e-02 | 4.01 | 3.94 | 17.91 | 2755 | 94.03 | 0.98 | 175 |
Full.Bath | 0 | 1.00000e+00 | 1.57 | 2.0 | 2.00 | 4 | 0.55 | 3.100000e-01 | 0.35 | 0.17 | 2.46 | 12 | 0.41 | 0.20 | 4 |
Half.Bath | 0 | 0.00000e+00 | 0.38 | 0.0 | 1.00 | 2 | 0.50 | 2.500000e-01 | 1.32 | 0.70 | 1.97 | 1843 | 62.90 | 0.67 | 0 |
Bedroom.AbvGr | 0 | 2.00000e+00 | 2.85 | 3.0 | 3.00 | 8 | 0.83 | 6.900000e-01 | 0.29 | 0.31 | 4.89 | 8 | 0.27 | 0.24 | 78 |
Kitchen.AbvGr | 0 | 1.00000e+00 | 1.04 | 1.0 | 1.00 | 3 | 0.21 | 5.000000e-02 | 0.20 | 4.31 | 22.83 | 3 | 0.10 | 0.04 | 134 |
TotRms.AbvGrd | 2 | 5.00000e+00 | 6.44 | 6.0 | 7.00 | 15 | 1.57 | 2.470000e+00 | 0.24 | 0.75 | 4.15 | 0 | 0.00 | 0.38 | 51 |
Fireplaces | 0 | 0.00000e+00 | 0.60 | 1.0 | 1.00 | 4 | 0.65 | 4.200000e-01 | 1.08 | 0.74 | 3.10 | 1422 | 48.53 | 0.70 | 13 |
Garage.Yr.Blt | 0 | 1.95700e+03 | 1870.79 | 1977.0 | 2001.00 | 2207 | 448.89 | 2.015059e+05 | 0.24 | -3.91 | 16.39 | 159 | 5.43 | 107.71 | 160 |
Garage.Cars | 0 | 1.00000e+00 | 1.77 | 2.0 | 2.00 | 5 | 0.76 | 5.800000e-01 | 0.43 | -0.22 | 3.24 | 157 | 5.36 | 0.33 | 17 |
Garage.Area | 0 | 3.20000e+02 | 472.82 | 480.0 | 576.00 | 1488 | 215.01 | 4.622925e+04 | 0.45 | 0.24 | 3.95 | 157 | 5.36 | 97.77 | 42 |
Wood.Deck.SF | 0 | 0.00000e+00 | 93.75 | 0.0 | 168.00 | 1424 | 126.36 | 1.596724e+04 | 1.35 | 1.84 | 9.74 | 1526 | 52.08 | 170.31 | 67 |
Open.Porch.SF | 0 | 0.00000e+00 | 47.53 | 27.0 | 70.00 | 742 | 67.48 | 4.554010e+03 | 1.42 | 2.53 | 13.93 | 1300 | 44.37 | 95.81 | 159 |
Enclosed.Porch | 0 | 0.00000e+00 | 23.01 | 0.0 | 0.00 | 1012 | 64.14 | 4.113820e+03 | 2.79 | 4.01 | 31.44 | 2471 | 84.33 | 178.77 | 459 |
X3Ssn.Porch | 0 | 0.00000e+00 | 2.59 | 0.0 | 0.00 | 508 | 25.14 | 6.320900e+02 | 9.70 | 11.40 | 152.73 | 2893 | 98.74 | 243.81 | 37 |
Screen.Porch | 0 | 0.00000e+00 | 16.00 | 0.0 | 0.00 | 576 | 56.09 | 3.145790e+03 | 3.51 | 3.96 | 20.83 | 2674 | 91.26 | 196.59 | 256 |
Pool.Area | 0 | 0.00000e+00 | 2.24 | 0.0 | 0.00 | 800 | 35.60 | 1.267160e+03 | 15.87 | 16.93 | 302.26 | 2917 | 99.56 | 564.85 | 13 |
Misc.Val | 0 | 0.00000e+00 | 50.64 | 0.0 | 0.00 | 17000 | 566.34 | 3.207458e+05 | 11.18 | 21.99 | 568.24 | 2827 | 96.48 | 6334.45 | 103 |
Mo.Sold | 1 | 4.00000e+00 | 6.22 | 6.0 | 8.00 | 12 | 2.71 | 7.370000e+00 | 0.44 | 0.19 | 2.54 | 0 | 0.00 | 1.19 | 0 |
Yr.Sold | 2006 | 2.00700e+03 | 2007.79 | 2008.0 | 2009.00 | 2010 | 1.32 | 1.730000e+00 | 0.00 | 0.13 | 1.84 | 0 | 0.00 | 0.00 | 0 |
SalePrice | 12789 | 1.29500e+05 | 180796.06 | 160000.0 | 213500.00 | 755000 | 79886.69 | 6.381884e+09 | 0.44 | 1.74 | 8.11 | 0 | 0.00 | 35298.80 | 137 |
Interpretation of Sale Price Distribution
The distribution of the original SalePrice
variable is
strongly right-skewed, with most home prices
concentrated between $100,000 and $250,000. This skewness can distort
model assumptions that rely on normality and homoscedasticity.
In contrast, the log-transformed
SalePrice
is approximately symmetric and
bell-shaped, closely resembling a normal distribution. This
transformation helps stabilize variance, reduces the impact of extreme
outliers, and supports improved performance and interpretability in
regression models such as SVR and ANN.
Conclusion: Log-transforming SalePrice
is a
justified preprocessing step for models sensitive to non-normal
distributions or heteroscedasticity.
library(ggplot2)
library(dplyr)
# Create tidy dataset for faceting
plot_data <- bind_rows(
ames_data %>% dplyr::select(Price = SalePrice) %>% mutate(Scale = "SalePrice"),
ames_data %>% mutate(Price = log(SalePrice)) %>% dplyr::select(Price) %>% mutate(Scale = "Log(SalePrice)")
) %>%
mutate(Scale = factor(Scale, levels = c("SalePrice", "Log(SalePrice)")))
# Plot without black outlines
ggplot(plot_data, aes(x = Price)) +
geom_histogram(data = subset(plot_data, Scale == "SalePrice"),
binwidth = 10000, fill = "black", alpha = 0.7) +
geom_histogram(data = subset(plot_data, Scale == "Log(SalePrice)"),
binwidth = 0.05, fill = "black", alpha = 0.7) +
facet_wrap(~ Scale, scales = "free") +
labs(
title = "Log Transformation Normalizes SalePrice Distribution",
x = "Price Scale",
y = "Count"
) +
theme_minimal()
library(ggplot2)
library(dplyr)
library(scales)
##
## Attaching package: 'scales'
## The following object is masked from 'package:purrr':
##
## discard
## The following object is masked from 'package:readr':
##
## col_factor
# Define bin breaks and labels for Gr.Liv.Area
max_grliv <- max(ames_data$`Gr.Liv.Area`, na.rm = TRUE)
breaks <- seq(0, ceiling(max_grliv / 500) * 500, by = 500)
labels <- paste0(head(breaks, -1) / 1000, "K–", tail(breaks, -1) / 1000, "K")
# Bin and label Gr.Liv.Area
ames_data_clean <- ames_data %>%
mutate(GrLivArea_bin = cut(`Gr.Liv.Area`, breaks = breaks, labels = labels, include.lowest = TRUE)) %>%
filter(!is.na(GrLivArea_bin))
# Boxplot with formatted axes
ggplot(ames_data_clean, aes(x = GrLivArea_bin, y = SalePrice)) +
geom_boxplot(fill = "black", alpha = 0.7, outlier.shape = NA) +
coord_flip() +
scale_y_continuous(labels = label_number(scale = 1/1000, suffix = "K")) +
labs(
title = "SalePrice Across Gross Living Area Ranges",
x = "Gross Living Area (Binned, K sq ft)",
y = "Sale Price (USD, K)"
) +
theme_minimal()
Interpretation:
Larger Homes Command Higher and More Variable Sale Prices
As gross living area increases, median sale price rises steadily, indicating a strong positive relationship between home size and market value. The boxplot also reveals increasing variability in sale price for larger homes, suggesting that additional features (like quality, location, or amenities) may further differentiate prices within higher size categories.
skewed_vars <- c(
"Lot.Area", "Mas.Vnr.Area", "BsmtFin.SF.2", "Low.Qual.Fin.SF",
"X3Ssn.Porch", "Screen.Porch", "Pool.Area", "Misc.Val", "SalePrice"
)
# Load required libraries
library(ggplot2)
library(gridExtra)
# Plot histograms with density overlay for each skewed variable
plot_skewed_vars <- function(var_names, data) {
lapply(var_names, function(var) {
ggplot(data, aes_string(x = var)) +
geom_histogram(aes(y = ..density..), fill = "steelblue", bins = 30, alpha = 0.6) +
geom_density(color = "red", size = 0.5) +
labs(title = paste("Distribution of", var), x = var, y = "Density")
})
}
# Generate plots
skewed_plots <- plot_skewed_vars(skewed_vars, ames_data_imputed)
## Warning: Using `size` aesthetic for lines was deprecated in ggplot2 3.4.0.
## ℹ Please use `linewidth` instead.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.
# Display first 6 (page 1)
gridExtra::grid.arrange(grobs = skewed_plots[1:6], ncol = 3)
## Warning: The dot-dot notation (`..density..`) was deprecated in ggplot2 3.4.0.
## ℹ Please use `after_stat(density)` instead.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.
# Display remaining (page 2)
gridExtra::grid.arrange(grobs = skewed_plots[7:9], ncol = 3)
The histograms show that most variables, including
Lot.Area
, Mas.Vnr.Area
,
BsmtFin.SF.2
, Pool.Area
, and
Misc.Val
, are highly right-skewed, with a
long tail of large values. To address this skewness and stabilize
variance, Box-Cox transformations will be applied. This
helps normalize distributions and improve model performance, especially
for algorithms sensitive to non-normality like SVR and ANN.
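As a hedged sketch of this step, the transformation could be estimated with caret's preProcess using method = "BoxCox"; a +1 shift is applied because Box-Cox requires strictly positive values and these variables are zero-inflated.

library(caret)

# Sketch only: Box-Cox transform of the skewed variables listed above.
# skewed_vars is the vector defined in the previous chunk; the +1 shift keeps
# all values strictly positive so that a lambda can be estimated.
bc_input <- ames_data_imputed[skewed_vars] + 1

bc_pp <- preProcess(bc_input, method = "BoxCox")   # estimate lambda per variable
bc_transformed <- predict(bc_pp, bc_input)         # apply the transformations

bc_pp  # variables for which no suitable lambda is found are left unchanged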
# Load required libraries
library(ggplot2)
library(gridExtra)
# Outlier variables
outlier_vars <- c("Misc.Val", "Pool.Area", "Enclosed.Porch",
"Lot.Area", "Total.Bsmt.SF", "Gr.Liv.Area", "SalePrice")
# Create boxplots
boxplot_list <- lapply(outlier_vars, function(var) {
ggplot(ames_data_imputed, aes_string(y = var)) +
geom_boxplot(fill = "skyblue") +
labs(title = paste("Boxplot of", var), y = var)
})
# Display boxplots (2 pages if needed)
grid.arrange(grobs = boxplot_list[1:4], ncol = 2)
grid.arrange(grobs = boxplot_list[5:7], ncol = 2)
The boxplots reveal substantial outliers in variables such as
Misc.Val
, Pool.Area
, Lot.Area
,
Gr.Liv.Area
, Total.Bsmt.SF
, and
SalePrice
. These extreme values could distort model
training. Winsorizing (IQR-based capping) will be applied as a
corrective measure to limit the influence of these outliers and improve
model robustness.
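A minimal sketch of the IQR-based capping follows, using a hypothetical helper cap_iqr() (not from any package); the target SalePrice is left uncapped here to avoid distorting the quantity being predicted.

library(dplyr)

# Sketch only: clamp extreme values to [Q1 - 1.5*IQR, Q3 + 1.5*IQR].
cap_iqr <- function(x, k = 1.5) {
  q1 <- quantile(x, 0.25, na.rm = TRUE)
  q3 <- quantile(x, 0.75, na.rm = TRUE)
  pmin(pmax(x, q1 - k * (q3 - q1)), q3 + k * (q3 - q1))
}

winsor_vars <- c("Misc.Val", "Pool.Area", "Enclosed.Porch",
                 "Lot.Area", "Total.Bsmt.SF", "Gr.Liv.Area")

ames_data_capped <- ames_data_imputed %>%
  mutate(across(all_of(winsor_vars), cap_iqr))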
# Load libraries
library(ggplot2)
library(dplyr)
# List of variables with many zeros
zero_vars <- c(
"Low.Qual.Fin.SF", "BsmtFin.SF.2", "X2nd.Flr.SF", "Fireplaces", "Pool.Area",
"Misc.Val", "Bsmt.Half.Bath", "Screen.Porch", "X3Ssn.Porch", "Enclosed.Porch"
)
# Compute zero proportions
zero_stats <- ames_data_imputed %>%
summarise(across(all_of(zero_vars), ~mean(. == 0))) %>%
pivot_longer(cols = everything(), names_to = "Variable", values_to = "ZeroProportion")
# Plot
ggplot(zero_stats, aes(x = reorder(Variable, ZeroProportion), y = ZeroProportion)) +
geom_col(fill = "tomato") +
coord_flip() +
labs(title = "Proportion of Zeros in Variables", x = "Variable", y = "Zero Proportion") +
scale_y_continuous(labels = scales::percent_format())
Several variables, including Pool.Area
,
X3Ssn.Porch
, and Low.Qual.Fin.SF
, have over
90% zero values, indicating extreme sparsity. Binary flags will be
created to capture the presence or absence of these features, allowing
models to learn from their occurrence without distortion from zero
inflation.
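A short sketch of how such presence/absence flags could be created; the Has.* column names are illustrative and not part of the original dataset.

library(dplyr)

# Sketch only: binary indicators for zero-inflated features.
ames_data_flagged <- ames_data_imputed %>%
  mutate(
    Has.Pool       = as.integer(Pool.Area > 0),
    Has.3Ssn.Porch = as.integer(X3Ssn.Porch > 0),
    Has.LowQualFin = as.integer(Low.Qual.Fin.SF > 0),
    Has.MiscVal    = as.integer(Misc.Val > 0)
  )

# Proportion of homes with each feature present
colMeans(ames_data_flagged[c("Has.Pool", "Has.3Ssn.Porch",
                             "Has.LowQualFin", "Has.MiscVal")])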
Candidate variables based on summary statistics: Misc.Val, Pool.Area, BsmtFin.SF.2, Screen.Porch, Low.Qual.Fin.SF, Open.Porch.SF.
# List of overdispersed features
over_vars <- c("Misc.Val", "Pool.Area", "BsmtFin.SF.2", "Screen.Porch", "Low.Qual.Fin.SF", "Open.Porch.SF")
# Create mean-variance data
overdisp_df <- data.frame(
Variable = over_vars,
Mean = sapply(over_vars, function(var) mean(ames_data_imputed[[var]])),
Variance = sapply(over_vars, function(var) var(ames_data_imputed[[var]]))
)
# Scatterplot
library(ggplot2)
ggplot(overdisp_df, aes(x = Mean, y = Variance, label = Variable)) +
geom_point(color = "firebrick", size = 3) +
geom_text(vjust = -0.5) +
labs(title = "Mean vs. Variance (Overdispersion Check)", x = "Mean", y = "Variance") +
geom_abline(slope = 1, intercept = 0, linetype = "dashed", color = "gray")
Interpretation: Overdispersion Check

For each of these variables, the variance far exceeds the mean (all points sit well above the dashed variance = mean reference line), confirming strong overdispersion driven by zero inflation and extreme values.

Action: Apply Box-Cox transformation, create binary flags, or use distribution-aware models like Poisson or tree-based methods.
For this project, I plan to tailor my data preprocessing approach based on the specific needs of each model type to optimize performance and model compatibility. Here's how I intend to proceed (a brief preprocessing sketch follows this list):
1. Decision Tree and Random Forest: These models are robust to most preprocessing issues. They do not require feature scaling or dummy encoding. I will retain all features, including categorical variables in their original form. No normalization is necessary, and I will allow the models to handle nonlinearity and interactions natively.
2. Support Vector Regression (SVR): SVR is highly sensitive to feature scales and outliers. I will center and scale all numeric features, dummy encode all categorical variables, and consider applying Box-Cox transformations to both predictors and the target variable. Feature selection is also critical here to reduce noise and improve margin-based optimization.
3. K-Nearest Neighbors (KNN Regression): For KNN, I will normalize all numeric predictors and the target variable using Min-Max scaling, as KNN relies heavily on distance calculations. Dummy encoding is essential for categorical variables, and I will include a dimensionality reduction step if needed, such as PCA, to mitigate the curse of dimensionality.
4. Artificial Neural Networks (ANN Regression): I will apply feature scaling to all numeric inputs and dummy encode categorical variables. Since ANNs are also sensitive to outliers and varying scales, I will ensure all inputs and outputs are scaled, and potentially use dropout or regularization to minimize overfitting. Feature selection will also be considered to simplify the network.
5. XGBoost Regression: XGBoost handles nonlinearity and interactions well, but dummy encoding is required for categorical variables. Although it is not sensitive to scaling, I will still ensure features are clean, free of multicollinearity, and filtered through optional feature selection to enhance interpretability and efficiency.
6. Unsupervised Learning (PCA and K-Means): For PCA and K-Means clustering, I will scale all numeric variables and dummy encode categorical features. Outlier removal or capping will be done prior to clustering. PCA will be used as a dimensionality reduction step either for exploration or preprocessing before clustering.
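As a rough, hedged sketch of how the scale-sensitive steps above could be set up with caret, the code below dummy encodes the categorical predictors and builds the two preprocessing recipes; object names (model_df, pp_svr, pp_knn) are illustrative, and in practice the transformations should be estimated on the training split only.

library(caret)
library(dplyr)

# Sketch only: ensure categorical columns are factors, then one-hot (dummy)
# encode them, as required for SVR, KNN, ANN, and XGBoost.
# (ID columns such as Order and PID would be dropped first, as done below.)
model_df <- ames_data_imputed %>%
  mutate(across(where(is.character), as.factor))

dmy <- dummyVars(SalePrice ~ ., data = model_df)
x_encoded <- as.data.frame(predict(dmy, newdata = model_df))

# SVR / ANN: center, scale, and Box-Cox the numeric predictors
pp_svr <- preProcess(x_encoded, method = c("center", "scale", "BoxCox"))
x_svr  <- predict(pp_svr, x_encoded)

# KNN: Min-Max normalization to [0, 1]
pp_knn <- preProcess(x_encoded, method = "range")
x_knn  <- predict(pp_knn, x_encoded)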
The 80/20 train/test split created below will be reused for all subsequent models to ensure consistency in evaluation.
# --- Load Required Libraries ---
library(caret)
## Variables to Exclude from Modeling
drop_vars <- c("Order", "PID")
# Drop raw versions of skewed variables if using boxcox_ versions instead
# skewed_vars <- c("Misc.Val", "Pool.Area", "BsmtFin.SF.2", "Screen.Porch", "Low.Qual.Fin.SF", "Open.Porch.SF")
# drop_vars <- c(drop_vars, skewed_vars)
# Now remove them
ames_data_imputed <- ames_data_imputed %>%
dplyr::select(-all_of(drop_vars))
# Convert character columns to factors
ames_data_imputed <- ames_data_imputed %>%
dplyr::mutate(across(where(is.character), as.factor))
# Split data
set.seed(42)
split_index <- createDataPartition(ames_data_imputed$SalePrice, p = 0.8, list = FALSE)
train_data <- ames_data_imputed[split_index, ]
test_data <- ames_data_imputed[-split_index, ]
# Align factor levels in test to training
factor_columns <- names(train_data)[sapply(train_data, is.factor)]
for (col in factor_columns) {
test_data[[col]] <- factor(test_data[[col]], levels = levels(train_data[[col]]))
}
glimpse(ames_data_imputed)
## Rows: 2,930
## Columns: 75
## $ MS.SubClass <int> 20, 20, 20, 20, 60, 60, 120, 120, 120, 60, 60, 20, 60,…
## $ MS.Zoning <fct> RL, RH, RL, RL, RL, RL, RL, RL, RL, RL, RL, RL, RL, RL…
## $ Lot.Frontage <dbl> 141.0, 80.0, 81.0, 93.0, 74.0, 78.0, 41.0, 43.0, 39.0,…
## $ Lot.Area <int> 31770, 11622, 14267, 11160, 13830, 9978, 4920, 5005, 5…
## $ Street <fct> Pave, Pave, Pave, Pave, Pave, Pave, Pave, Pave, Pave, …
## $ Lot.Shape <fct> IR1, Reg, IR1, Reg, IR1, IR1, Reg, IR1, IR1, Reg, IR1,…
## $ Land.Contour <fct> Lvl, Lvl, Lvl, Lvl, Lvl, Lvl, Lvl, HLS, Lvl, Lvl, Lvl,…
## $ Utilities <fct> AllPub, AllPub, AllPub, AllPub, AllPub, AllPub, AllPub…
## $ Lot.Config <fct> Corner, Inside, Corner, Corner, Inside, Inside, Inside…
## $ Land.Slope <fct> Gtl, Gtl, Gtl, Gtl, Gtl, Gtl, Gtl, Gtl, Gtl, Gtl, Gtl,…
## $ Neighborhood <fct> NAmes, NAmes, NAmes, NAmes, Gilbert, Gilbert, StoneBr,…
## $ Condition.1 <fct> Norm, Feedr, Norm, Norm, Norm, Norm, Norm, Norm, Norm,…
## $ Condition.2 <fct> Norm, Norm, Norm, Norm, Norm, Norm, Norm, Norm, Norm, …
## $ Bldg.Type <fct> 1Fam, 1Fam, 1Fam, 1Fam, 1Fam, 1Fam, TwnhsE, TwnhsE, Tw…
## $ House.Style <fct> 1Story, 1Story, 1Story, 1Story, 2Story, 2Story, 1Story…
## $ Overall.Qual <int> 6, 5, 6, 7, 5, 6, 8, 8, 8, 7, 6, 6, 6, 7, 8, 8, 8, 9, …
## $ Overall.Cond <int> 5, 6, 6, 5, 5, 6, 5, 5, 5, 5, 5, 7, 5, 5, 5, 5, 7, 2, …
## $ Year.Built <int> 1960, 1961, 1958, 1968, 1997, 1998, 2001, 1992, 1995, …
## $ Year.Remod.Add <int> 1960, 1961, 1958, 1968, 1998, 1998, 2001, 1992, 1996, …
## $ Roof.Style <fct> Hip, Gable, Hip, Hip, Gable, Gable, Gable, Gable, Gabl…
## $ Roof.Matl <fct> CompShg, CompShg, CompShg, CompShg, CompShg, CompShg, …
## $ Exterior.1st <fct> BrkFace, VinylSd, Wd Sdng, BrkFace, VinylSd, VinylSd, …
## $ Exterior.2nd <fct> Plywood, VinylSd, Wd Sdng, BrkFace, VinylSd, VinylSd, …
## $ Mas.Vnr.Type <fct> Stone, None, BrkFace, None, None, BrkFace, None, None,…
## $ Mas.Vnr.Area <dbl> 112, 0, 108, 0, 0, 20, 0, 0, 0, 0, 0, 0, 0, 0, 0, 603,…
## $ Exter.Qual <fct> TA, TA, TA, Gd, TA, TA, Gd, Gd, Gd, TA, TA, TA, TA, TA…
## $ Exter.Cond <fct> TA, TA, TA, TA, TA, TA, TA, TA, TA, TA, TA, Gd, TA, TA…
## $ Foundation <fct> CBlock, CBlock, CBlock, CBlock, PConc, PConc, PConc, P…
## $ Bsmt.Qual <fct> TA, TA, TA, TA, Gd, TA, Gd, Gd, Gd, TA, Gd, Gd, Gd, Gd…
## $ Bsmt.Cond <fct> Gd, TA, TA, TA, TA, TA, TA, TA, TA, TA, TA, TA, TA, TA…
## $ Bsmt.Exposure <fct> Gd, No, No, No, No, No, Mn, No, No, No, No, No, No, Gd…
## $ BsmtFin.Type.1 <fct> BLQ, Rec, ALQ, ALQ, GLQ, GLQ, GLQ, ALQ, GLQ, Unf, Unf,…
## $ BsmtFin.SF.1 <dbl> 639, 468, 923, 1065, 791, 602, 616, 263, 1180, 0, 0, 9…
## $ BsmtFin.Type.2 <fct> Unf, LwQ, Unf, Unf, Unf, Unf, Unf, Unf, Unf, Unf, Unf,…
## $ BsmtFin.SF.2 <dbl> 0, 144, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1120, 0, 0…
## $ Bsmt.Unf.SF <dbl> 441, 270, 406, 1045, 137, 324, 722, 1017, 415, 994, 76…
## $ Total.Bsmt.SF <dbl> 1080, 882, 1329, 2110, 928, 926, 1338, 1280, 1595, 994…
## $ Heating <fct> GasA, GasA, GasA, GasA, GasA, GasA, GasA, GasA, GasA, …
## $ Heating.QC <fct> Fa, TA, TA, Ex, Gd, Ex, Ex, Ex, Ex, Gd, Gd, Ex, Gd, Gd…
## $ Central.Air <fct> Y, Y, Y, Y, Y, Y, Y, Y, Y, Y, Y, Y, Y, Y, Y, Y, Y, Y, …
## $ Electrical <fct> SBrkr, SBrkr, SBrkr, SBrkr, SBrkr, SBrkr, SBrkr, SBrkr…
## $ X1st.Flr.SF <int> 1656, 896, 1329, 2110, 928, 926, 1338, 1280, 1616, 102…
## $ X2nd.Flr.SF <int> 0, 0, 0, 0, 701, 678, 0, 0, 0, 776, 892, 0, 676, 0, 0,…
## $ Low.Qual.Fin.SF <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ Gr.Liv.Area <int> 1656, 896, 1329, 2110, 1629, 1604, 1338, 1280, 1616, 1…
## $ Bsmt.Full.Bath <int> 1, 0, 0, 1, 0, 0, 1, 0, 1, 0, 0, 1, 0, 1, 1, 1, 0, 1, …
## $ Bsmt.Half.Bath <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ Full.Bath <int> 1, 1, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 1, 1, 3, 2, 1, …
## $ Half.Bath <int> 0, 0, 1, 1, 1, 1, 0, 0, 0, 1, 1, 0, 1, 1, 1, 1, 0, 1, …
## $ Bedroom.AbvGr <int> 3, 2, 3, 3, 3, 3, 2, 2, 2, 3, 3, 3, 3, 2, 1, 4, 4, 1, …
## $ Kitchen.AbvGr <int> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, …
## $ Kitchen.Qual <fct> TA, TA, Gd, Ex, TA, Gd, Gd, Gd, Gd, Gd, TA, TA, TA, Gd…
## $ TotRms.AbvGrd <int> 7, 5, 6, 8, 6, 7, 6, 5, 5, 7, 7, 6, 7, 5, 4, 12, 8, 8,…
## $ Functional <fct> Typ, Typ, Typ, Typ, Typ, Typ, Typ, Typ, Typ, Typ, Typ,…
## $ Fireplaces <int> 2, 0, 0, 2, 1, 1, 0, 0, 1, 1, 1, 0, 1, 1, 0, 1, 0, 1, …
## $ Garage.Type <fct> Attchd, Attchd, Attchd, Attchd, Attchd, Attchd, Attchd…
## $ Garage.Yr.Blt <dbl> 1960, 1961, 1958, 1968, 1997, 1998, 2001, 1992, 1995, …
## $ Garage.Finish <fct> Fin, Unf, Unf, Fin, Fin, Fin, Fin, RFn, RFn, Fin, Fin,…
## $ Garage.Cars <int> 2, 1, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 3, 2, 3, …
## $ Garage.Area <int> 528, 730, 312, 522, 482, 470, 582, 506, 608, 442, 440,…
## $ Garage.Qual <fct> TA, TA, TA, TA, TA, TA, TA, TA, TA, TA, TA, TA, TA, TA…
## $ Garage.Cond <fct> TA, TA, TA, TA, TA, TA, TA, TA, TA, TA, TA, TA, TA, TA…
## $ Paved.Drive <fct> P, Y, Y, Y, Y, Y, Y, Y, Y, Y, Y, Y, Y, Y, Y, Y, Y, Y, …
## $ Wood.Deck.SF <int> 210, 140, 393, 0, 212, 360, 0, 0, 237, 140, 157, 483, …
## $ Open.Porch.SF <int> 62, 0, 36, 0, 34, 36, 0, 82, 152, 60, 84, 21, 75, 0, 5…
## $ Enclosed.Porch <int> 0, 0, 0, 0, 0, 0, 170, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ X3Ssn.Porch <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ Screen.Porch <int> 0, 120, 0, 0, 0, 0, 0, 144, 0, 0, 0, 0, 0, 0, 140, 210…
## $ Pool.Area <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ Misc.Val <int> 0, 0, 12500, 0, 0, 0, 0, 0, 0, 0, 0, 500, 0, 0, 0, 0, …
## $ Mo.Sold <int> 5, 6, 6, 4, 3, 6, 4, 1, 3, 6, 4, 3, 5, 2, 6, 6, 6, 6, …
## $ Yr.Sold <int> 2010, 2010, 2010, 2010, 2010, 2010, 2010, 2010, 2010, …
## $ Sale.Type <fct> WD , WD , WD , WD , WD , WD , WD , WD , WD , WD , WD ,…
## $ Sale.Condition <fct> Normal, Normal, Normal, Normal, Normal, Normal, Normal…
## $ SalePrice <int> 215000, 105000, 172000, 244000, 189900, 195500, 213500…
Dataset: ames_data_imputed
Plan: build a decision tree regressor (using the rpart and randomForest packages) on ames_data_imputed, and evaluate how pruning the tree with the complexity parameter (cp) affects performance. The hypothesis is that tuning cp prevents overfitting and enhances generalization by removing unnecessary splits.
Changes: cp = 0.01, 0.005, 0.001, tuned using 5-fold cross-validation.
Controls: the same train/test split (train_data and test_data) and a fixed feature set.
Metrics: Root Mean Squared Error (RMSE), R-squared (R²).
# --- Load Libraries (caret should be loaded last to avoid conflicts) ---
library(rpart)
library(rpart.plot)
library(caret)
# --- Align factor levels ---
factor_columns <- names(train_data)[sapply(train_data, is.factor)]
for (col in factor_columns) {
test_data[[col]] <- factor(test_data[[col]], levels = levels(train_data[[col]]))
}
# --- Set Training Control ---
train_control_tree <- trainControl(method = "cv", number = 5)
# --- Grid for cp ---
tune_grid_tree <- expand.grid(cp = c(0.01, 0.005, 0.001))
# --- Train Decision Tree Regressor ---
set.seed(123) # For reproducibility
tree_model <- caret::train(
SalePrice ~ .,
data = train_data,
method = "rpart",
trControl = train_control_tree,
metric = "RMSE",
tuneGrid = tune_grid_tree
)
# --- Predict on Test Set ---
pred_tree <- predict(tree_model, newdata = test_data)
# --- Evaluate ---
results_tree <- postResample(pred = pred_tree, obs = test_data$SalePrice)
print(results_tree)
## RMSE Rsquared MAE
## 3.133066e+04 8.442167e-01 2.123098e+04
The pruned Decision Tree model achieved strong performance with an R² of 0.84, RMSE of 31,330, and MAE of 21,230, indicating good predictive accuracy with moderately low error.
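As a quick sanity check, these metrics can be reproduced directly from the predictions. This is a minimal sketch, assuming pred_tree and test_data from the chunk above are still in memory; note that postResample() reports R² as the squared correlation between observed and predicted values.
# --- Recompute RMSE, R-squared, and MAE by hand (sanity check) ---
err_tree  <- test_data$SalePrice - pred_tree
rmse_tree <- sqrt(mean(err_tree^2))                  # root mean squared error
r2_tree   <- cor(test_data$SalePrice, pred_tree)^2   # squared correlation, as in postResample()
mae_tree  <- mean(abs(err_tree))                     # mean absolute error
c(RMSE = rmse_tree, Rsquared = r2_tree, MAE = mae_tree)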
Decision Tree Regression – Evaluation and Interpretation
Shows how well predictions align with actual values. Ideal points fall along the 45° line.
ggplot(data = NULL, aes(x = test_data$SalePrice, y = pred_tree)) +
geom_point(color = "black", alpha = 0.6) +
geom_abline(slope = 1, intercept = 0, color = "red", linetype = "dashed") +
geom_vline(xintercept = 300000, color = "blue", linetype = "dotted") +
annotate("text", x = 280000, y = 100000, label = "Underprediction above $300K",
color = "blue", hjust = 0, size = 3.5) +
labs(
title = "Actual vs Predicted SalePrice – Decision Tree Regressor",
x = "Actual SalePrice",
y = "Predicted SalePrice"
) +
theme_minimal()
The scatterplot shows a strong linear relationship between actual and predicted SalePrice values, with most points clustering near the diagonal red line. This indicates that the Decision Tree model is capturing the overall pricing trend well, though some underestimation is visible at higher price ranges.
library(ggplot2)
library(gridExtra)
# --- Step 1: Compute residuals and standardize ---
residuals_tree <- test_data$SalePrice - pred_tree
standardized_resid_tree <- scale(residuals_tree)[, 1]
sqrt_abs_resid_tree <- sqrt(abs(standardized_resid_tree))
# --- Plot 1: Residuals vs Predicted ---
p1_tree <- ggplot(data.frame(pred_tree, residuals_tree), aes(x = pred_tree, y = residuals_tree)) +
geom_point(alpha = 0.6, color = "black") +
geom_hline(yintercept = 0, color = "red", linetype = "dashed") +
geom_rug(sides = "b", alpha = 0.2) +
labs(title = "Residuals vs Predicted (Tree)",
x = "Predicted SalePrice",
y = "Residuals") +
theme_bw(base_size = 11)
# --- Plot 2: Q-Q Plot of Standardized Residuals ---
qq_data_tree <- qqnorm(standardized_resid_tree, plot.it = FALSE)
p2_tree <- ggplot(data.frame(x = qq_data_tree$x, y = qq_data_tree$y), aes(x = x, y = y)) +
geom_point(alpha = 0.6) +
geom_abline(slope = 1, intercept = 0, color = "red", linetype = "dashed") +
labs(title = "Normal Q-Q Plot (Tree Residuals)",
x = "Theoretical Quantiles",
y = "Standardized Residuals") +
theme_bw(base_size = 11)
# --- Plot 3: Scale-Location Plot ---
p3_tree <- ggplot(data.frame(pred_tree, sqrt_abs_resid_tree), aes(x = pred_tree, y = sqrt_abs_resid_tree)) +
geom_point(alpha = 0.6) +
geom_smooth(method = "loess", se = FALSE, color = "blue") +
geom_rug(sides = "b", alpha = 0.2) +
labs(title = "Scale-Location Plot (Tree)",
x = "Predicted SalePrice",
y = "Sqrt(Abs(Standardized Residuals))") +
theme_bw(base_size = 11)
# --- Plot 4: Histogram of Residuals ---
p4_tree <- ggplot(data.frame(residuals_tree), aes(x = residuals_tree)) +
geom_histogram(fill = "black", bins = 30, alpha = 0.6) +
geom_vline(xintercept = 0, color = "red", linetype = "dashed") +
labs(title = "Distribution of Residuals (Tree)",
x = "Residual (Actual - Predicted)",
y = "Frequency") +
theme_bw(base_size = 11)
# --- Combine Tree plots ---
gridExtra::grid.arrange(p1_tree, p2_tree, p3_tree, p4_tree, ncol = 2)
## `geom_smooth()` using formula = 'y ~ x'
Residuals vs. Predicted Interpretation
The residuals vs. predicted plot shows a fairly random scatter around the horizontal zero line, suggesting that the Decision Tree model exhibits no major systematic bias across most price ranges. However, there is noticeable heteroscedasticity — the spread of residuals increases as the predicted home price rises.
Model Performance Interpretation with the $300K Threshold
The Residuals vs. Predicted plot confirms that the Decision Tree model maintains a fairly consistent and balanced residual distribution for predicted sale prices up to $300,000, with errors scattered evenly around zero. Beyond $300K, the residual spread becomes wider and less stable, indicating increased variance and diminished reliability in predictions for higher-priced homes.
Conclusion:
The model performs best and is most trustworthy for homes priced up to roughly $300,000. While small over- or underestimations occur throughout, the increased variability beyond this threshold suggests the model’s generalization weakens for high-end properties. For this segment, more complex models or additional features may improve accuracy.
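To make the threshold claim concrete, the residual spread can be compared on either side of the $300K prediction mark. This is an illustrative check, assuming pred_tree and test_data from the chunks above are still in memory.
# --- Residual spread below vs. above the $300K prediction threshold (Decision Tree) ---
resid_tree_chk <- test_data$SalePrice - pred_tree
band_tree <- ifelse(pred_tree <= 300000, "<= $300K", "> $300K")
aggregate(resid_tree_chk, by = list(PredictedBand = band_tree),
          FUN = function(r) c(n = length(r), SD = sd(r), MAE = mean(abs(r))))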
Interpretation of Residual Distribution
The histogram of residuals displays a roughly symmetric, bell-shaped distribution centered around zero, suggesting that the Decision Tree model’s prediction errors are fairly normally distributed. This reflects a good overall fit with no major bias in over- or under-predicting.
Combined with the earlier plots, this supports the finding that most homes in the dataset are priced between $100,000 and $300,000, the range in which the model achieves the highest predictive stability.
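The claim about where most homes fall can be checked by tabulating the test-set prices by band (illustrative, assuming test_data is available).
# --- Distribution of test-set sale prices by band ---
price_band <- cut(test_data$SalePrice,
                  breaks = c(0, 100000, 300000, Inf),
                  labels = c("< $100K", "$100K-$300K", "> $300K"))
table(price_band)
prop.table(table(price_band))   # share of test homes in each band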
To show which variables influenced predictions the most:
library(rpart.plot)
# --- Extract the final fitted rpart model from caret ---
best_tree_model <- tree_model$finalModel  # the final rpart object from the caret-trained model
# --- Plot using rpart.plot ---
rpart.plot(
best_tree_model,
type = 2, # Label all nodes
extra = 101, # Show predicted value and % of observations
fallen.leaves = TRUE,
main = "Pruned Decision Tree (Final Model from caret)"
)
Interpretation of Pruned Decision Tree (Best cp)
The visualized tree represents the final pruned model selected through cross-validation using the optimal complexity parameter (cp). Pruning has removed overly specific branches, leaving a structure that balances model complexity with generalizability. The primary splits rely on key predictors such as Gr.Liv.Area, Overall.Qual, and Year.Built.
Conclusion:
The pruned decision tree reflects a well-regularized model that preserves interpretability while still capturing meaningful variation in home sale prices. This makes it a valuable tool for understanding how different features contribute to price estimation.
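For completeness, the cp value selected by cross-validation and the pruning table of the final model can be inspected directly (assuming tree_model from the tuning chunk above is available).
# --- Inspect the cross-validated cp choice and the pruning table ---
tree_model$bestTune                    # cp selected by 5-fold CV
tree_model$results                     # CV RMSE / R-squared / MAE for each candidate cp
head(tree_model$finalModel$cptable)    # rpart complexity table of the final pruned tree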
# --- Load required libraries ---
library(rpart)
library(dplyr)
library(ggplot2)
# --- Extract split variables from final pruned tree ---
split_vars_tree <- tree_model$finalModel$frame$var
split_vars_tree <- split_vars_tree[split_vars_tree != "<leaf>"]
# --- Count and sort frequency of splitting variables ---
top_splits_tree <- as.data.frame(table(split_vars_tree)) %>%
arrange(desc(Freq)) %>%
rename(Feature = split_vars_tree, Frequency = Freq)
# --- Plot with annotations centered inside each bar ---
ggplot(top_splits_tree, aes(x = reorder(Feature, Frequency), y = Frequency)) +
geom_bar(stat = "identity", fill = "black", alpha = 0.7) +
geom_text(aes(y = Frequency / 2, label = Frequency), color = "white", size = 4) +
coord_flip() +
labs(
title = "Top Splitting Variables in Pruned Decision Tree",
x = "Feature",
y = "Frequency Used"
) +
theme_minimal()
Interpretation: Most Influential Features in Tree Splitting
The results reveal that the most frequently used variables in the pruned decision tree are:
Gr.Liv.Area (above-ground living area in square feet – 8 splits)
X1st.Flr.SF (first-floor square footage – 6 splits)
Overall.Qual (overall material and finish quality, rated 1–10 – 5 splits) and Year.Built (year the house was built – 5 splits)
Other notable contributors include:
BsmtFin.SF.1 (finished basement area)
Lot.Area (total lot size in square feet)
Total.Bsmt.SF (total basement square footage)
These variables form the backbone of the model’s decision-making logic, aligning with domain knowledge that home price is primarily driven by size, quality, and age.
Tuning mtry and ntree for the Random Forest Regressor
Objective: Optimize model performance by first tuning the number of predictors sampled at each split (mtry) via cross-validation, and then assessing the interaction with the number of trees (ntree). Hypothesis: a well-tuned mtry combined with a higher ntree improves model generalization.
Changes: mtry = c(2, 4, 6, 8), tuned with 5-fold cross-validation; ntree = c(100, 300, 500), evaluated with the best mtry.
Controls: the same train/test split (train_data and test_data) and the same feature set as the previous models.
Metrics: RMSE, R-squared, MAE.
The code below trains and evaluates the Random Forest model, using descriptive object names (rf_model, results_rf, rf_model_combo_results) consistent with the other models in this analysis.
# --- Install caret if missing ---
if (!requireNamespace("caret", quietly = TRUE)) {
install.packages("caret")
}
# --- Load Required Libraries ---
library(caret)
library(randomForest)
## randomForest 4.7-1.2
## Type rfNews() to see new features/changes/bug fixes.
##
## Attaching package: 'randomForest'
## The following object is masked from 'package:gridExtra':
##
## combine
## The following object is masked from 'package:dplyr':
##
## combine
## The following object is masked from 'package:ggplot2':
##
## margin
library(doParallel)
## Loading required package: foreach
##
## Attaching package: 'foreach'
## The following objects are masked from 'package:purrr':
##
## accumulate, when
## Loading required package: iterators
## Loading required package: parallel
# --- Set up Parallel Backend ---
cl <- makeCluster(parallel::detectCores() - 1)
registerDoParallel(cl)
# --- Step 1: Tune mtry with caret ---
set.seed(123)
mtry_grid_rf <- expand.grid(mtry = c(2, 4, 6, 8))
rf_model <- train(
SalePrice ~ ., data = train_data,
method = "rf",
trControl = trainControl(method = "cv", number = 5),
metric = "RMSE",
tuneGrid = mtry_grid_rf,
ntree = 500
)
# --- Predict and Evaluate on test data ---
pred_rf <- predict(rf_model, newdata = test_data)
results_rf <- postResample(pred = pred_rf, obs = test_data$SalePrice)
cat("Best mtry:", rf_model$bestTune$mtry, "\n")
## Best mtry: 8
print(results_rf)
## RMSE Rsquared MAE
## 2.571367e+04 9.144091e-01 1.627715e+04
# --- Step 2: Tune ntree manually using best mtry ---
ntree_values_rf <- c(100, 300, 500)
rf_model_combo_results <- data.frame()
for (nt in ntree_values_rf) {
rf_model_combo <- randomForest(SalePrice ~ ., data = train_data,
ntree = nt,
mtry = rf_model$bestTune$mtry)
pred_combo <- predict(rf_model_combo, newdata = test_data)
perf_combo <- postResample(pred = pred_combo, obs = test_data$SalePrice)
rf_model_combo_results <- rbind(rf_model_combo_results, data.frame(
ntree = nt,
mtry = rf_model$bestTune$mtry,
RMSE = perf_combo["RMSE"],
Rsquared = perf_combo["Rsquared"],
MAE = perf_combo["MAE"]
))
}
# --- Stop Parallel Backend ---
suppressWarnings(stopCluster(cl))
registerDoSEQ()
# --- Output Summary ---
print(rf_model_combo_results)
## ntree mtry RMSE Rsquared MAE
## RMSE 100 8 21571.13 0.9325483 14061.37
## RMSE1 300 8 21938.02 0.9329585 14026.71
## RMSE2 500 8 21679.48 0.9338925 13935.65
The Random Forest regression model performed well, with mtry = 8 yielding strong predictive accuracy. Across different values of ntree, performance remained consistent: test RMSE ranged from roughly 21,571 to 21,938, R² stabilized around 0.933, and MAE decreased slightly from about 14,061 to 13,936. This suggests the model is robust to the number of trees, and increasing ntree beyond 300 yields only marginal gains.
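The stability across ntree values can also be seen visually; the sketch below plots the test RMSE from the rf_model_combo_results table computed above.
# --- Visualize test RMSE against ntree at the tuned mtry ---
library(ggplot2)
ggplot(rf_model_combo_results, aes(x = ntree, y = RMSE)) +
  geom_line() +
  geom_point(size = 2) +
  labs(title = "Test RMSE vs. Number of Trees (tuned mtry)",
       x = "ntree", y = "Test RMSE") +
  theme_minimal()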
# --- Predict with the final Random Forest model ---
pred_rf <- predict(rf_model, newdata = test_data)
# --- Plot Actual vs Predicted ---
ggplot(data = NULL, aes(x = test_data$SalePrice, y = pred_rf)) +
geom_point(color = "black", alpha = 0.6) +
geom_abline(slope = 1, intercept = 0, color = "red", linetype = "dashed") + # 45° line
geom_vline(xintercept = 400000, color = "blue", linetype = "dashed") + # reference line
annotate("text", x = 360000, y = 100000,
label = "Model underpredicts\nabove $400K",
color = "red", hjust = 0, size = 3.5) +
labs(
title = "Actual vs Predicted SalePrice – Random Forest Regressor",
x = "Actual SalePrice",
y = "Predicted SalePrice"
) +
theme_minimal()
Interpretation: The Actual vs Predicted plot for the Random Forest model shows a strong alignment along the 45° red dashed line, indicating high predictive accuracy. Most predictions cluster closely around the line, confirming that the model captures the relationship between input features and SalePrice effectively. Slight underestimation appears at higher price ranges, but overall, the fit is excellent.
# --- Step 1: Compute residuals and standardize ---
residuals_rf <- test_data$SalePrice - pred_rf
standardized_resid_rf <- scale(residuals_rf)[, 1]
sqrt_abs_resid_rf <- sqrt(abs(standardized_resid_rf))
# --- Plot 1: Residuals vs Predicted ---
p1_rf <- ggplot(data.frame(pred_rf, residuals_rf), aes(x = pred_rf, y = residuals_rf)) +
geom_point(alpha = 0.6, color = "black") +
geom_hline(yintercept = 0, color = "red", linetype = "dashed") +
geom_rug(sides = "b", alpha = 0.2) +
labs(title = "Residuals vs Predicted (Random Forest)",
x = "Predicted SalePrice",
y = "Residuals") +
theme_bw(base_size = 11)
# --- Plot 2: Q-Q Plot of Standardized Residuals ---
qq_data_rf <- qqnorm(standardized_resid_rf, plot.it = FALSE)
p2_rf <- ggplot(data.frame(x = qq_data_rf$x, y = qq_data_rf$y), aes(x = x, y = y)) +
geom_point(alpha = 0.6) +
geom_abline(slope = 1, intercept = 0, color = "red", linetype = "dashed") +
labs(title = "Normal Q-Q Plot (RF Residuals)",
x = "Theoretical Quantiles",
y = "Standardized Residuals") +
theme_bw(base_size = 11)
# --- Plot 3: Scale-Location Plot ---
p3_rf <- ggplot(data.frame(pred_rf, sqrt_abs_resid_rf), aes(x = pred_rf, y = sqrt_abs_resid_rf)) +
geom_point(alpha = 0.6) +
geom_smooth(method = "loess", se = FALSE, color = "blue") +
geom_rug(sides = "b", alpha = 0.2) +
labs(title = "Scale-Location Plot (Random Forest)",
x = "Predicted SalePrice",
y = "Sqrt(Abs(Standardized Residuals))") +
theme_bw(base_size = 11)
# --- Plot 4: Histogram of Residuals ---
p4_rf <- ggplot(data.frame(residuals_rf), aes(x = residuals_rf)) +
geom_histogram(fill = "black", bins = 30, alpha = 0.6) +
geom_vline(xintercept = 0, color = "red", linetype = "dashed") +
labs(title = "Distribution of Residuals (Random Forest)",
x = "Residual (Actual - Predicted)",
y = "Frequency") +
theme_bw(base_size = 11)
# --- Combine RF plots ---
gridExtra::grid.arrange(p1_rf, p2_rf, p3_rf, p4_rf, ncol = 2)
## `geom_smooth()` using formula = 'y ~ x'
Residuals vs. Predicted Interpretation (Random Forest)
The residuals vs. predicted plot for the Random Forest model shows a tighter, more compact distribution around the horizontal zero line compared to the Decision Tree model. This indicates that Random Forest reduces overall prediction error and improves model stability across most price ranges.
However, some heteroscedasticity remains—particularly for homes with predicted sale prices above $400,000, where the residuals spread out more widely. This suggests that while the Random Forest model outperforms the Decision Tree overall, its predictive accuracy diminishes slightly for higher-end homes.
Model Performance Interpretation with $400K Threshold
The residuals plot suggests the Random Forest model performs best for homes priced up to $400,000, where residuals remain tightly clustered around zero with no systematic bias.
Beyond $400,000, the model shows greater variance and error volatility, meaning predictions for luxury homes are less dependable.
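To quantify this, the residual spread can be compared on either side of the $400K prediction mark (illustrative, assuming pred_rf and test_data are still in memory).
# --- Residual spread below vs. above the $400K prediction threshold (Random Forest) ---
resid_rf_chk <- test_data$SalePrice - pred_rf
band_rf <- ifelse(pred_rf <= 400000, "<= $400K", "> $400K")
aggregate(resid_rf_chk, by = list(PredictedBand = band_rf),
          FUN = function(r) c(n = length(r), SD = sd(r), MAE = mean(abs(r))))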
Conclusion
The Random Forest model represents a notable improvement over the Decision Tree, especially for homes priced between $275,000 and $400,000—a segment where the Decision Tree started to struggle. Still, as prices move into the high-end market (above $400K), the Random Forest’s residuals widen, signaling diminished precision.
For highly-priced homes, enhancing model performance may require additional features, more flexible models, or price-segment-specific adjustments.
Interpretation of Residual Distribution (Random Forest)
The histogram of residuals from the Random Forest model reveals a nearly symmetric, bell-shaped curve centered around zero, indicating that the model’s prediction errors are approximately normally distributed. This supports the model’s strong general fit, with minimal consistent over- or underestimation.
Combined with the residual and scatter plots, this reinforces the insight that the Random Forest model performs most reliably for homes priced between $100,000 and $400,000, delivering stable and trustworthy predictions within that range.
# --- Plot variable importance from final RF model ---
varImpPlot(rf_model$finalModel, main = "Random Forest Feature Importance")
# --- Extract variable importance values ---
rf_importance <- importance(rf_model$finalModel)
# --- Convert to data frame for display ---
rf_importance_df <- data.frame(
Feature = rownames(rf_importance),
IncreaseInNodePurity = rf_importance[, 1]
)
# --- Sort by descending importance ---
rf_importance_df <- rf_importance_df[order(-rf_importance_df$IncreaseInNodePurity), ]
# --- Print the table ---
print(rf_importance_df)
## Feature IncreaseInNodePurity
## Overall.Qual Overall.Qual 816554014722
## Garage.Cars Garage.Cars 707956995452
## Gr.Liv.Area Gr.Liv.Area 691141881606
## Total.Bsmt.SF Total.Bsmt.SF 628043945140
## X1st.Flr.SF X1st.Flr.SF 545587411729
## Garage.Area Garage.Area 533450737596
## Year.Built Year.Built 488038526910
## Exter.QualTA Exter.QualTA 438486031490
## Garage.Yr.Blt Garage.Yr.Blt 432653017495
## Bsmt.QualEx Bsmt.QualEx 422373635543
## Mas.Vnr.Area Mas.Vnr.Area 340325833837
## BsmtFin.SF.1 BsmtFin.SF.1 327564445858
## Full.Bath Full.Bath 322431421025
## Year.Remod.Add Year.Remod.Add 309862473963
## Lot.Area Lot.Area 290572606118
## Fireplaces Fireplaces 281052108168
## TotRms.AbvGrd TotRms.AbvGrd 278535109650
## X2nd.Flr.SF X2nd.Flr.SF 275788771535
## FoundationPConc FoundationPConc 238163459865
## Kitchen.QualTA Kitchen.QualTA 235427084252
## NeighborhoodNridgHt NeighborhoodNridgHt 233521971341
## Lot.Frontage Lot.Frontage 233520036965
## Garage.FinishUnf Garage.FinishUnf 226893362647
## Exter.QualGd Exter.QualGd 186152926692
## Open.Porch.SF Open.Porch.SF 181777629070
## BsmtFin.Type.1GLQ BsmtFin.Type.1GLQ 169807029354
## Bsmt.QualTA Bsmt.QualTA 154823119613
## Garage.FinishFin Garage.FinishFin 129100504876
## Bsmt.QualGd Bsmt.QualGd 128307550335
## Wood.Deck.SF Wood.Deck.SF 122854451157
## Mas.Vnr.TypeNone Mas.Vnr.TypeNone 122498599702
## Kitchen.QualGd Kitchen.QualGd 114999873300
## Bsmt.ExposureGd Bsmt.ExposureGd 107279737919
## Bsmt.Unf.SF Bsmt.Unf.SF 105090167521
## Half.Bath Half.Bath 95647571320
## MS.SubClass MS.SubClass 89591029488
## Bedroom.AbvGr Bedroom.AbvGr 88784153661
## Roof.StyleHip Roof.StyleHip 85107117325
## Garage.TypeAttchd Garage.TypeAttchd 82465124837
## Garage.TypeDetchd Garage.TypeDetchd 73737950403
## NeighborhoodNoRidge NeighborhoodNoRidge 73675707742
## Roof.StyleGable Roof.StyleGable 71855491614
## Bsmt.Full.Bath Bsmt.Full.Bath 71845609198
## Overall.Cond Overall.Cond 66977311831
## Exterior.1stVinylSd Exterior.1stVinylSd 64546875602
## Exterior.2ndVinylSd Exterior.2ndVinylSd 63448725916
## Bsmt.ExposureNo Bsmt.ExposureNo 62061194495
## Heating.QCTA Heating.QCTA 60403151475
## Sale.TypeNew Sale.TypeNew 55855012304
## FoundationCBlock FoundationCBlock 55143194989
## Mo.Sold Mo.Sold 54179575758
## House.Style2Story House.Style2Story 51743055080
## Sale.ConditionPartial Sale.ConditionPartial 49933907275
## MS.ZoningRM MS.ZoningRM 46976821154
## Garage.CondTA Garage.CondTA 46429107167
## Central.AirY Central.AirY 45271024806
## Screen.Porch Screen.Porch 44542255769
## Lot.ShapeReg Lot.ShapeReg 44429758554
## Mas.Vnr.TypeStone Mas.Vnr.TypeStone 43657013053
## Paved.DriveY Paved.DriveY 43553400399
## MS.ZoningRL MS.ZoningRL 37644664830
## Garage.TypeBuiltIn Garage.TypeBuiltIn 36876800115
## Garage.QualTA Garage.QualTA 36199578318
## Mas.Vnr.TypeBrkFace Mas.Vnr.TypeBrkFace 35645899444
## Yr.Sold Yr.Sold 34700555411
## NeighborhoodStoneBr NeighborhoodStoneBr 34610843187
## House.Style1Story House.Style1Story 34147068695
## Land.ContourHLS Land.ContourHLS 32180733579
## Garage.QualNone Garage.QualNone 30975486284
## Roof.MatlWdShngl Roof.MatlWdShngl 29098534462
## Sale.TypeWD Sale.TypeWD 28903687265
## Enclosed.Porch Enclosed.Porch 27703880081
## BsmtFin.Type.1Unf BsmtFin.Type.1Unf 26876558610
## Exterior.1stCemntBd Exterior.1stCemntBd 26827218793
## Sale.ConditionNormal Sale.ConditionNormal 26678530614
## Garage.TypeNone Garage.TypeNone 26254142305
## Garage.FinishRFn Garage.FinishRFn 26072235971
## Exterior.2ndCmentBd Exterior.2ndCmentBd 23517805791
## Pool.Area Pool.Area 23135106103
## NeighborhoodCrawfor NeighborhoodCrawfor 23085849400
## ElectricalSBrkr ElectricalSBrkr 21237157051
## Land.ContourLvl Land.ContourLvl 20909465163
## BsmtFin.SF.2 BsmtFin.SF.2 20460290327
## Roof.MatlCompShg Roof.MatlCompShg 19828947460
## NeighborhoodEdwards NeighborhoodEdwards 19546594900
## Lot.ConfigCulDSac Lot.ConfigCulDSac 18096652737
## Exterior.1stWd Sdng Exterior.1stWd Sdng 17951486307
## Garage.CondNone Garage.CondNone 17403765915
## NeighborhoodSomerst NeighborhoodSomerst 17109285350
## Exterior.2ndMetalSd Exterior.2ndMetalSd 15692283880
## Exterior.2ndHdBoard Exterior.2ndHdBoard 15529453342
## Lot.ConfigInside Lot.ConfigInside 14377469665
## NeighborhoodOldTown NeighborhoodOldTown 14118974347
## Exterior.1stMetalSd Exterior.1stMetalSd 14105881784
## Exterior.1stHdBoard Exterior.1stHdBoard 13862467486
## Bldg.TypeTwnhsE Bldg.TypeTwnhsE 13831112320
## Bsmt.ExposureAv Bsmt.ExposureAv 13779154969
## NeighborhoodNAmes NeighborhoodNAmes 13691314085
## Land.SlopeMod Land.SlopeMod 13294739703
## Exter.CondGd Exter.CondGd 12839288831
## Exter.CondTA Exter.CondTA 12774029197
## Bldg.TypeDuplex Bldg.TypeDuplex 12308030642
## Exterior.1stBrkFace Exterior.1stBrkFace 12171148953
## Exterior.2ndWd Sdng Exterior.2ndWd Sdng 11718269253
## Heating.QCGd Heating.QCGd 11667580026
## BsmtFin.Type.1ALQ BsmtFin.Type.1ALQ 11225137271
## Bsmt.CondTA Bsmt.CondTA 11152615047
## Condition.1Norm Condition.1Norm 11149548200
## Bsmt.Half.Bath Bsmt.Half.Bath 11073394365
## FunctionalTyp FunctionalTyp 11053102743
## Exterior.2ndImStucc Exterior.2ndImStucc 11008432189
## Bsmt.CondNone Bsmt.CondNone 10509575989
## Lot.ShapeIR2 Lot.ShapeIR2 10504214491
## Kitchen.AbvGr Kitchen.AbvGr 10261951997
## Garage.QualGd Garage.QualGd 9805923910
## BsmtFin.Type.2Unf BsmtFin.Type.2Unf 9584229324
## ElectricalFuseA ElectricalFuseA 9578925434
## NeighborhoodGilbert NeighborhoodGilbert 9486740290
## NeighborhoodIDOTRR NeighborhoodIDOTRR 9413236855
## NeighborhoodCollgCr NeighborhoodCollgCr 9392333151
## Bsmt.CondGd Bsmt.CondGd 9200950990
## BsmtFin.Type.1Rec BsmtFin.Type.1Rec 9183194436
## Bsmt.ExposureNone Bsmt.ExposureNone 9180199633
## BsmtFin.Type.1None BsmtFin.Type.1None 9011016923
## Bsmt.QualNone Bsmt.QualNone 8846073183
## BsmtFin.Type.2None BsmtFin.Type.2None 8733854152
## Bsmt.ExposureMn Bsmt.ExposureMn 8618057240
## MS.ZoningFV MS.ZoningFV 8472521059
## Condition.1PosA Condition.1PosA 8351151715
## Exter.CondFa Exter.CondFa 8303475774
## Bsmt.QualFa Bsmt.QualFa 8171123597
## Condition.1PosN Condition.1PosN 8011502135
## Kitchen.QualFa Kitchen.QualFa 7726924661
## NeighborhoodNWAmes NeighborhoodNWAmes 7549422433
## Exterior.2ndPlywood Exterior.2ndPlywood 7534914644
## Misc.Val Misc.Val 7471796410
## Condition.1Feedr Condition.1Feedr 7381937977
## NeighborhoodTimber NeighborhoodTimber 7333419577
## Bldg.TypeTwnhs Bldg.TypeTwnhs 7284696425
## Exterior.2ndBrkFace Exterior.2ndBrkFace 7180847390
## Exterior.1stPlywood Exterior.1stPlywood 7051373369
## Land.ContourLow Land.ContourLow 6753195110
## NeighborhoodSawyerW NeighborhoodSawyerW 6121643796
## MS.ZoningC (all) MS.ZoningC (all) 5968154197
## BsmtFin.Type.1BLQ BsmtFin.Type.1BLQ 5873631310
## Bsmt.CondFa Bsmt.CondFa 5825772356
## NeighborhoodMitchel NeighborhoodMitchel 5773488701
## Condition.2PosA Condition.2PosA 5723017739
## NeighborhoodMeadowV NeighborhoodMeadowV 5661208472
## NeighborhoodClearCr NeighborhoodClearCr 5439889939
## Exter.QualFa Exter.QualFa 5399755047
## House.StyleSLvl House.StyleSLvl 5301707088
## Lot.ShapeIR3 Lot.ShapeIR3 5246681346
## Heating.QCFa Heating.QCFa 5221251937
## Exterior.2ndStucco Exterior.2ndStucco 5110981662
## Garage.QualFa Garage.QualFa 4942730097
## Exterior.1stStucco Exterior.1stStucco 4744850746
## Low.Qual.Fin.SF Low.Qual.Fin.SF 4610503374
## BsmtFin.Type.1LwQ BsmtFin.Type.1LwQ 4569863979
## Condition.2Norm Condition.2Norm 4497927697
## Garage.CondFa Garage.CondFa 4468882111
## NeighborhoodSawyer NeighborhoodSawyer 4466905916
## NeighborhoodBrkSide NeighborhoodBrkSide 4368349013
## Sale.ConditionFamily Sale.ConditionFamily 4294718510
## Exterior.2ndWd Shng Exterior.2ndWd Shng 4205114105
## FunctionalMod FunctionalMod 4106961455
## House.Style2.5Unf House.Style2.5Unf 4067287276
## NeighborhoodVeenker NeighborhoodVeenker 4003693516
## HeatingGasA HeatingGasA 3916646314
## BsmtFin.Type.2Rec BsmtFin.Type.2Rec 3732282120
## Lot.ConfigFR2 Lot.ConfigFR2 3611282595
## Sale.ConditionAlloca Sale.ConditionAlloca 3606281701
## Land.SlopeSev Land.SlopeSev 3533199531
## HeatingGasW HeatingGasW 3445511160
## Garage.QualEx Garage.QualEx 3438558790
## FunctionalMin1 FunctionalMin1 3261431428
## Condition.2PosN Condition.2PosN 3245586648
## BsmtFin.Type.2LwQ BsmtFin.Type.2LwQ 3226082664
## X3Ssn.Porch X3Ssn.Porch 3137156685
## House.StyleSFoyer House.StyleSFoyer 3125021002
## Garage.TypeBasment Garage.TypeBasment 3038799613
## BsmtFin.Type.2GLQ BsmtFin.Type.2GLQ 2864433610
## NeighborhoodBrDale NeighborhoodBrDale 2800979006
## ElectricalFuseF ElectricalFuseF 2781586388
## Bldg.Type2fmCon Bldg.Type2fmCon 2722663121
## FunctionalMin2 FunctionalMin2 2707703490
## Roof.MatlTar&Grv Roof.MatlTar&Grv 2691780552
## House.Style2.5Fin House.Style2.5Fin 2686560343
## FoundationSlab FoundationSlab 2645196116
## BsmtFin.Type.2ALQ BsmtFin.Type.2ALQ 2527834198
## Paved.DriveP Paved.DriveP 2198846516
## Condition.1RRAn Condition.1RRAn 2192291941
## Exterior.1stWdShing Exterior.1stWdShing 2044655425
## BsmtFin.Type.2BLQ BsmtFin.Type.2BLQ 1818574632
## Sale.TypeConLD Sale.TypeConLD 1786274296
## FunctionalSal FunctionalSal 1700570035
## Roof.StyleMansard Roof.StyleMansard 1664762457
## NeighborhoodSWISU NeighborhoodSWISU 1661472929
## StreetPave StreetPave 1660651166
## Roof.StyleGambrel Roof.StyleGambrel 1517457437
## FoundationStone FoundationStone 1426256408
## Exterior.1stPreCast Exterior.1stPreCast 1397165789
## Roof.MatlWdShake Roof.MatlWdShake 1387949053
## Sale.TypeCon Sale.TypeCon 1240807290
## NeighborhoodGrnHill NeighborhoodGrnHill 1149410050
## MS.ZoningRH MS.ZoningRH 1126716626
## Exterior.2ndBrk Cmn Exterior.2ndBrk Cmn 1115235755
## Condition.1RRAe Condition.1RRAe 1104662341
## Roof.StyleShed Roof.StyleShed 1057290821
## HeatingGrav HeatingGrav 1020810707
## Sale.TypeCWD Sale.TypeCWD 945897251
## FunctionalMaj2 FunctionalMaj2 939366713
## Lot.ConfigFR3 Lot.ConfigFR3 935394143
## Mas.Vnr.TypeBrkCmn Mas.Vnr.TypeBrkCmn 928265571
## Sale.TypeConLI Sale.TypeConLI 864871374
## Garage.TypeCarPort Garage.TypeCarPort 838036492
## Exterior.2ndPreCast Exterior.2ndPreCast 825219126
## NeighborhoodNPkVill NeighborhoodNPkVill 724951198
## Garage.CondPo Garage.CondPo 719127735
## NeighborhoodGreens NeighborhoodGreens 714678040
## Exterior.1stBrkComm Exterior.1stBrkComm 713781911
## Sale.TypeOth Sale.TypeOth 659195631
## Garage.CondGd Garage.CondGd 648692554
## Exter.CondPo Exter.CondPo 594321034
## Condition.2Feedr Condition.2Feedr 576786281
## Exterior.1stStone Exterior.1stStone 537692247
## HeatingWall HeatingWall 504871817
## Condition.1RRNn Condition.1RRNn 476238414
## NeighborhoodBlueste NeighborhoodBlueste 469858412
## Sale.TypeConLw Sale.TypeConLw 454477451
## Exterior.1stCBlock Exterior.1stCBlock 388527228
## Exterior.2ndAsphShn Exterior.2ndAsphShn 357927329
## Exterior.2ndStone Exterior.2ndStone 335409400
## Mas.Vnr.TypeCBlock Mas.Vnr.TypeCBlock 331799973
## ElectricalFuseP ElectricalFuseP 328744326
## FunctionalSev FunctionalSev 324171847
## Heating.QCPo Heating.QCPo 306937692
## Bsmt.CondPo Bsmt.CondPo 281865854
## House.Style1.5Unf House.Style1.5Unf 245594886
## Garage.QualPo Garage.QualPo 227049778
## Exterior.2ndCBlock Exterior.2ndCBlock 210723620
## Sale.ConditionAdjLand Sale.ConditionAdjLand 175171210
## Exterior.2ndOther Exterior.2ndOther 173495368
## UtilitiesNoSewr UtilitiesNoSewr 168210293
## Garage.CondEx Garage.CondEx 142802241
## NeighborhoodLandmrk NeighborhoodLandmrk 125798268
## MS.ZoningI (all) MS.ZoningI (all) 103929347
## Roof.MatlMetal Roof.MatlMetal 98064192
## Condition.1RRNe Condition.1RRNe 92391143
## FoundationWood FoundationWood 89949997
## Condition.2RRNn Condition.2RRNn 88566812
## Condition.2RRAn Condition.2RRAn 81321450
## ElectricalMix ElectricalMix 73696940
## Exterior.1stImStucc Exterior.1stImStucc 59312671
## Exterior.1stAsphShn Exterior.1stAsphShn 54001814
## Bsmt.CondEx Bsmt.CondEx 49374838
## Roof.MatlRoll Roof.MatlRoll 42259651
## Condition.2RRAe Condition.2RRAe 36403800
## UtilitiesNoSeWa UtilitiesNoSeWa 32145638
## Bsmt.QualPo Bsmt.QualPo 31173710
## HeatingOthW HeatingOthW 25725626
## Sale.TypeVWD Sale.TypeVWD 10417084
## Roof.MatlMembran Roof.MatlMembran 0
## Kitchen.QualPo Kitchen.QualPo 0
Interpretation: Most Influential Features in Random Forest vs. Decision Tree
Both the Random Forest and Decision Tree models prioritize similar predictors of home sale prices—particularly those tied to size, quality, and construction year—but they differ in how they evaluate and rank these variables.
Top Predictors in Random Forest (Based on Increase in Node Purity)
The Random Forest model identifies Overall.Qual, Garage.Cars, and Gr.Liv.Area as the most impactful variables. These features are followed closely by Total.Bsmt.SF, X1st.Flr.SF, Garage.Area, and Year.Built. Notably, garage-related variables such as the number of car spaces, garage size, and garage year built are weighted more heavily in Random Forest than in the Decision Tree.
These rankings reflect the model’s strength in capturing non-linear interactions and subtleties across multiple trees. Random Forest excels in considering variables that may not be selected early in a single tree but still hold predictive power when averaged over many models.
Top Predictors in the Pruned Decision Tree (Based on Split Frequency)
The pruned decision tree relies most on Gr.Liv.Area, X1st.Flr.SF, Overall.Qual, and Year.Built, which appear near the root and are used in multiple splits. These core variables serve as the foundation for the tree’s decisions, emphasizing above-ground space, construction date, and material/finish quality.
Other notable contributors include BsmtFin.SF.1, Lot.Area, and Total.Bsmt.SF, highlighting the influence of both usable interior space and property size.
Conclusion
The overlap in feature importance across models reinforces the reliability of these predictors, even though the two models weight and rank them somewhat differently.
Together, these insights provide strong evidence that living space, overall quality, and build year are the primary drivers of home prices in this dataset, with garage and basement features adding meaningful refinement to more advanced models like Random Forest.
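As an optional summary, the two rankings can be placed side by side. This is an illustrative sketch, assuming rf_importance_df (already sorted by node purity) and top_splits_tree from the chunks above are available.
# --- Side-by-side look at the top predictors from both models ---
n_top  <- 10
top_rf <- head(as.character(rf_importance_df$Feature), n_top)
top_dt <- head(as.character(top_splits_tree$Feature), n_top)
length(top_dt) <- n_top   # pad with NA if the tree used fewer than 10 distinct split variables
data.frame(Rank = seq_len(n_top), RandomForest = top_rf, DecisionTree = top_dt)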
This block loads the necessary R libraries for data manipulation, transformation, modeling, and visualization. These packages will be used throughout the preprocessing pipeline.
knitr::opts_chunk$set(echo = TRUE)
library(MASS)
##
## Attaching package: 'MASS'
## The following object is masked from 'package:dplyr':
##
## select
library(dplyr)
library(car)
library(caret)
library(kableExtra)
library(fastDummies)
skewed_vars <- c("Lot.Area", "Mas.Vnr.Area", "BsmtFin.SF.2", "Low.Qual.Fin.SF",
"X3Ssn.Porch", "Screen.Porch", "Pool.Area", "Misc.Val", "SalePrice")
ames_data_transformed <- ames_data_imputed
for (var in setdiff(skewed_vars, "SalePrice")) {
x <- ames_data_imputed[[var]]
shift <- ifelse(any(x <= 0), abs(min(x)) + 1, 0)
x <- x + shift
bc <- boxcox(lm(x ~ 1), lambda = seq(-5, 5, 0.1), plotit = FALSE)
lambda <- bc$x[which.max(bc$y)]
x_transformed <- if (abs(lambda) < 1e-4) log(x) else (x^lambda - 1) / lambda
ames_data_transformed[[paste0("boxcox_", var)]] <- x_transformed
}
ames_data_transformed <- ames_data_transformed %>%
dplyr::select(-all_of(setdiff(skewed_vars, "SalePrice")))
The next block applies the Box-Cox transformation to additional over-dispersed variables in ames_data_imputed and drops the original columns.
over_vars <- c("Misc.Val", "Pool.Area", "BsmtFin.SF.2",
"Screen.Porch", "Low.Qual.Fin.SF", "Open.Porch.SF")
for (var in over_vars) {
if (var %in% names(ames_data_imputed)) {
x <- ames_data_imputed[[var]]
shift <- ifelse(any(x <= 0), abs(min(x)) + 1, 0)
x_shifted <- x + shift
bc <- boxcox(lm(x_shifted ~ 1), lambda = seq(-5, 5, 0.1), plotit = FALSE)
lambda <- bc$x[which.max(bc$y)]
x_transformed <- if (abs(lambda) < 1e-4) log(x_shifted) else (x_shifted^lambda - 1) / lambda
ames_data_imputed[[paste0("boxcox_", var)]] <- x_transformed
}
}
ames_data_imputed <- ames_data_imputed %>% dplyr::select(-all_of(over_vars))
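To verify that the transformation behaves as intended, skewness can be compared before and after for any transformed variable. The sketch below uses Lot.Area as an example and assumes the e1071 package is installed for skewness(); moments::skewness() would work equally well.
# --- Illustrative check: skewness before vs. after Box-Cox (Lot.Area example) ---
library(e1071)
c(before_boxcox = skewness(ames_data_imputed$Lot.Area),
  after_boxcox  = skewness(ames_data_transformed$boxcox_Lot.Area))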
Flags outliers using the 1.5*IQR rule and adds binary indicators.
outlier_vars <- c("Misc.Val", "Pool.Area", "Enclosed.Porch",
"Lot.Area", "Total.Bsmt.SF", "Gr.Liv.Area", "SalePrice")
for (var in outlier_vars) {
if (var %in% names(ames_data_imputed)) {
Q1 <- quantile(ames_data_imputed[[var]], 0.25, na.rm = TRUE)
Q3 <- quantile(ames_data_imputed[[var]], 0.75, na.rm = TRUE)
IQR <- Q3 - Q1
lower <- Q1 - 1.5 * IQR
upper <- Q3 + 1.5 * IQR
ames_data_imputed[[paste0("is_outlier_", var)]] <- ifelse(
ames_data_imputed[[var]] < lower | ames_data_imputed[[var]] > upper,
1, 0)
}
}
Caps outlier values at the 1.5*IQR bounds.
ames_data_capped <- ames_data_imputed
cap_outliers <- function(x) {
Q1 <- quantile(x, 0.25)
Q3 <- quantile(x, 0.75)
IQR <- Q3 - Q1
lower <- Q1 - 1.5 * IQR
upper <- Q3 + 1.5 * IQR
x[x < lower] <- lower
x[x > upper] <- upper
return(x)
}
all_outlier_vars <- c("Misc.Val", "Pool.Area", "Enclosed.Porch",
"Lot.Area", "Total.Bsmt.SF", "Gr.Liv.Area", "SalePrice")
for (var in intersect(all_outlier_vars, names(ames_data_capped))) {
ames_data_capped[[paste0("capped_", var)]] <- cap_outliers(ames_data_capped[[var]])
}
Adds binary indicators for zero/non-zero values.
ames_data_flagged <- ames_data_imputed
zero_vars <- intersect(c("Low.Qual.Fin.SF", "BsmtFin.SF.2", "X2nd.Flr.SF", "Fireplaces",
"Pool.Area", "Misc.Val", "Bsmt.Half.Bath", "Screen.Porch",
"X3Ssn.Porch", "Enclosed.Porch"), names(ames_data_flagged))
for (var in zero_vars) {
ames_data_flagged[[paste0("has_", var)]] <- ifelse(ames_data_flagged[[var]] > 0, 1, 0)
}
Encodes ordered factors numerically, dummy-encodes nominal factors, and converts year-based columns into age features.
library(fastDummies)
library(dplyr)
df <- ames_data_imputed
# --- Step 1: Standardize Column Names and Create Age Features ---
names(df) <- gsub("\\.", "", names(df)) # just in case
# Create Age Features
if (all(c("YrSold", "YearBuilt") %in% names(df))) df$HouseAge <- df$YrSold - df$YearBuilt
if (all(c("YrSold", "YearRemodAdd") %in% names(df))) df$RemodAge <- df$YrSold - df$YearRemodAdd
if (all(c("YrSold", "GarageYrBlt") %in% names(df))) df$GarageAge <- df$YrSold - df$GarageYrBlt
# Drop original year columns
df <- df %>% dplyr::select(-any_of(c("YearBuilt", "YearRemodAdd", "GarageYrBlt")))
# --- Step 2: Ordinal Encoding (corrected names) ---
ordinal_mapping <- list(
ExterQual = c("Po", "Fa", "TA", "Gd", "Ex"),
ExterCond = c("Po", "Fa", "TA", "Gd", "Ex"),
BsmtQual = c("", "None", "Po", "Fa", "TA", "Gd", "Ex"),
BsmtCond = c("", "None", "Po", "Fa", "TA", "Gd", "Ex"),
BsmtExposure = c("", "No", "Mn", "Av", "Gd", "None"),
BsmtFinType1 = c("", "None", "Unf", "LwQ", "Rec", "BLQ", "ALQ", "GLQ"),
BsmtFinType2 = c("", "None", "Unf", "LwQ", "Rec", "BLQ", "ALQ", "GLQ"),
HeatingQC = c("Po", "Fa", "TA", "Gd", "Ex"),
KitchenQual = c("Po", "Fa", "TA", "Gd", "Ex"),
GarageFinish = c("", "None", "Unf", "RFn", "Fin"),
GarageQual = c("", "None", "Po", "Fa", "TA", "Gd", "Ex"),
GarageCond = c("", "None", "Po", "Fa", "TA", "Gd", "Ex"),
Functional = c("Sal", "Sev", "Maj2", "Maj1", "Mod", "Min2", "Min1", "Typ"),
LandSlope = c("Sev", "Mod", "Gtl"),
LotShape = c("IR3", "IR2", "IR1", "Reg"),
PavedDrive = c("N", "P", "Y"),
Utilities = c("NoSewr", "NoSeWa", "AllPub")
)
for (var in names(ordinal_mapping)) {
if (var %in% names(df)) {
unique_vals <- unique(df[[var]])
expected_vals <- ordinal_mapping[[var]]
extra_vals <- setdiff(unique_vals, expected_vals)
df[[var]] <- factor(df[[var]], levels = expected_vals, ordered = TRUE)
df[[var]] <- as.numeric(df[[var]])
}
}
# --- Step 3: Dummy encode nominal variables (corrected names) ---
nominal_vars <- c(
"MSSubClass", "MSZoning", "Street", "LandContour", "LotConfig", "Neighborhood",
"Condition1", "Condition2", "BldgType", "HouseStyle", "RoofStyle", "RoofMatl",
"Exterior1st", "Exterior2nd", "MasVnrType", "Foundation", "Heating", "CentralAir",
"Electrical", "GarageType", "SaleType", "SaleCondition"
)
nominal_present <- intersect(nominal_vars, colnames(df))
df[nominal_present] <- lapply(df[nominal_present], as.factor)
# Keep numeric and ordinal columns
df_non_nominal <- df %>% dplyr::select(-any_of(nominal_present))
# Dummy encode
df_dummies <- fastDummies::dummy_cols(
df,
select_columns = nominal_present,
remove_first_dummy = TRUE,
remove_selected_columns = TRUE
)
# Combine everything
df_encoded <- bind_cols(df_non_nominal, df_dummies %>% dplyr::select(-any_of(names(df_non_nominal))))
# --- Step 4: Combine with other feature blocks ---
ames_data_svr <- df_encoded %>%
bind_cols(ames_data_transformed %>% dplyr::select(starts_with("boxcox_"))) %>%
bind_cols(ames_data_capped %>% dplyr::select(starts_with("capped_"), starts_with("is_outlier_"))) %>%
bind_cols(ames_data_flagged %>% dplyr::select(starts_with("has_")))
## New names:
## • `is_outlier_SalePrice` -> `is_outlier_SalePrice...55`
## • `is_outlier_SalePrice` -> `is_outlier_SalePrice...242`
# --- Step 5: Drop multicollinear variable ---
ames_data_svr <- ames_data_svr %>% dplyr::select(-any_of("GarageArea"))
zero_var_cols <- nearZeroVar(ames_data_svr, saveMetrics = TRUE)
zero_var_names <- rownames(zero_var_cols)[zero_var_cols$zeroVar == TRUE]
ames_data_svr <- ames_data_svr %>% dplyr::select(-all_of(zero_var_names))
library(caret)
library(dplyr)
# --- Step 1: Identify Columns to Scale ---
all_numeric <- names(ames_data_svr)[sapply(ames_data_svr, is.numeric)]
# Identify dummy columns (0/1 only)
dummy_cols <- names(ames_data_svr)[
sapply(ames_data_svr, function(x) is.numeric(x) && all(x %in% c(0, 1)))
]
# Exclude SalePrice, flags, dummy vars, IDs, and engineered columns
scale_cols <- setdiff(all_numeric, c(dummy_cols, "SalePrice"))
scale_cols <- scale_cols[!grepl("^(PID|Order|has_|is_outlier_)", scale_cols)]
# --- Step 2a: Standard Scaling ---
preproc_standard <- preProcess(ames_data_svr[, scale_cols], method = c("center", "scale"))
ames_data_scaled_standard <- ames_data_svr
ames_data_scaled_standard[, scale_cols] <- predict(preproc_standard, ames_data_svr[, scale_cols])
# Save the preProcess object for reverse scaling later
saveRDS(preproc_standard, "preproc_standard.rds")
# --- Step 2b: Min-Max Scaling ---
preproc_minmax <- preProcess(ames_data_svr[, scale_cols], method = c("range"))
ames_data_scaled_minmax <- ames_data_svr
ames_data_scaled_minmax[, scale_cols] <- predict(preproc_minmax, ames_data_svr[, scale_cols])
# Save the min-max preProcess object
saveRDS(preproc_minmax, "preproc_minmax.rds")
# Select top predictors based on variable importance
top_rf_vars <- c("Overall.Qual", "Total.Bsmt.SF", "Gr.Liv.Area", "X1st.Flr.SF", "Garage.Cars",
"HouseAge", "Garage.Area", "Full.Bath", "TotRms.AbvGrd", "RemodAge",
"Mas.Vnr.Area", "Fireplaces", "BsmtFin.SF.1", "Kitchen.QualTA", "Kitchen.QualGd",
"GarageAge", "Bsmt.QualTA", "Exter.QualTA", "Open.Porch.SF", "Wood.Deck.SF",
"SalePrice")
# Ensure the selected variables exist
top_rf_vars_present <- top_rf_vars[top_rf_vars %in% names(ames_data_scaled_standard)]
# Subset for selected features
ames_data_svr_selected <- ames_data_scaled_standard[, top_rf_vars_present]
ames_data_minmax_selected <- ames_data_scaled_minmax[, top_rf_vars_present]
# Full sets
ames_data_svr_full_final <- ames_data_scaled_standard # for SVR, KNN, ANN (caret)
ames_data_minmax_full_final <- ames_data_scaled_minmax # for ANN (tensorflow/keras)
# Selected sets
ames_data_svr_final <- ames_data_svr_selected
ames_data_minmax_final <- ames_data_minmax_selected
# Use the same selected for XGB and clustering
ames_data_xgb_final <- ames_data_svr_full_final
ames_data_kmeans <- ames_data_svr_selected
ames_data_dbscan <- ames_data_svr_selected
ames_data_hclust <- ames_data_svr_selected
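The saved preProcess objects make it possible to map scaled values back to their original units later (for example, when reporting predictions). The sketch below assumes the caret preProcess object exposes $mean and $std for centered/scaled columns, which should be verified before relying on it.
# --- Sketch: reversing the standard scaling with the saved preProcess object ---
unscale_standard <- function(x_scaled, varname, pp = preproc_standard) {
  # assumed structure: pp$std and pp$mean are named vectors keyed by column name
  x_scaled * pp$std[varname] + pp$mean[varname]
}
example_col <- scale_cols[1]   # any centered/scaled column works for the illustration
head(unscale_standard(ames_data_scaled_standard[[example_col]], example_col))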
Train an optimized Support Vector Regression model using the Radial (RBF) kernel, with hyperparameter tuning via grid search and performance evaluation through 5-fold cross-validation.
Model: svmRadial via caret::train()
Grid Search:
sigma: 0.0001, 0.0005, 0.001, 0.005, 0.01
C: 0.1, 1, 10, 50, 100, 200, 500
epsilon: 0.1, 1, 5
Cross-Validation: 5-fold
Parallelization: Yes (via doParallel)
The model below is trained on the original (non-log-transformed) SalePrice; the relevant objects carry the suffix _svr_orig to indicate the original-scale SVR model.
# --- Load Required Libraries ---
library(caret)
library(e1071)
##
## Attaching package: 'e1071'
## The following objects are masked from 'package:moments':
##
## kurtosis, moment, skewness
library(doParallel)
library(ggplot2)
# --- Set up parallel processing ---
cl <- makeCluster(parallel::detectCores() - 1)
registerDoParallel(cl)
# --- Dataset Reference (full scaled data) ---
df_full_svr_orig <- ames_data_svr_full_final
# --- Split data (on original SalePrice) ---
set.seed(123)
split_index_svr_orig <- createDataPartition(df_full_svr_orig$SalePrice, p = 0.8, list = FALSE)
train_data_svr_orig <- df_full_svr_orig[split_index_svr_orig, ]
test_data_svr_orig <- df_full_svr_orig[-split_index_svr_orig, ]
# --- Remove near-zero variance predictors ---
zero_var_cols_svr_orig <- nearZeroVar(train_data_svr_orig, saveMetrics = TRUE)
zero_var_names_svr_orig <- rownames(zero_var_cols_svr_orig)[zero_var_cols_svr_orig$zeroVar == TRUE]
if (length(zero_var_names_svr_orig) > 0) {
message("⚠️ Dropping near-zero variance predictors: ", paste(zero_var_names_svr_orig, collapse = ", "))
}
## ⚠️ Dropping near-zero variance predictors: RoofMatl_Roll, Exterior1st_CBlock, Exterior1st_ImStucc, Exterior1st_PreCast, Exterior2nd_Other, Exterior2nd_PreCast
train_data_svr_orig <- train_data_svr_orig[, !names(train_data_svr_orig) %in% zero_var_names_svr_orig]
test_data_svr_orig <- test_data_svr_orig[, !names(test_data_svr_orig) %in% zero_var_names_svr_orig]
# --- Remove constant columns ---
constant_cols_svr_orig <- names(train_data_svr_orig)[sapply(train_data_svr_orig, function(x) length(unique(x)) == 1)]
if (length(constant_cols_svr_orig) > 0) {
message("⚠️ Dropping constant columns: ", paste(constant_cols_svr_orig, collapse = ", "))
}
train_data_svr_orig <- train_data_svr_orig[, !names(train_data_svr_orig) %in% constant_cols_svr_orig]
test_data_svr_orig <- test_data_svr_orig[, !names(test_data_svr_orig) %in% constant_cols_svr_orig]
# --- Align columns ---
common_cols_svr_orig <- intersect(names(train_data_svr_orig), names(test_data_svr_orig))
train_data_svr_orig <- train_data_svr_orig[, common_cols_svr_orig]
test_data_svr_orig <- test_data_svr_orig[, common_cols_svr_orig]
# --- Add back the original SalePrice target ---
train_data_svr_orig$SalePrice <- df_full_svr_orig$SalePrice[split_index_svr_orig]
test_data_svr_orig$SalePrice <- df_full_svr_orig$SalePrice[-split_index_svr_orig]
# --- Set training control ---
train_control_svr_orig <- trainControl(method = "cv", number = 5)
# --- Expanded tuning grid ---
tune_grid_svr_orig <- expand.grid(
sigma = c(0.0001, 0.0005, 0.001, 0.005, 0.01),
C = c(0.1, 1, 10, 50, 100, 200, 500)
)
# --- Train SVR Model on Original SalePrice ---
set.seed(42)
svr_model_orig <- train(
SalePrice ~ ., data = train_data_svr_orig,
method = "svmRadial",
trControl = train_control_svr_orig,
tuneGrid = tune_grid_svr_orig,
metric = "RMSE"
)
# --- Predict on test set ---
pred_svr_orig <- predict(svr_model_orig, newdata = test_data_svr_orig)
# --- Evaluate Model on original scale ---
results_svr_orig <- postResample(pred = pred_svr_orig, obs = test_data_svr_orig$SalePrice)
print(results_svr_orig)
## RMSE Rsquared MAE
## 2.758001e+04 8.838909e-01 1.423527e+04
# --- Stop parallel processing ---
stopCluster(cl)
registerDoSEQ()
# --- Step 1: Align test data for original SalePrice SVR model ---
missing_vars_svr_orig <- setdiff(names(train_data_svr_orig), names(test_data_svr_orig))
test_data_aligned_svr_orig <- test_data_svr_orig
for (var in missing_vars_svr_orig) {
test_data_aligned_svr_orig[[var]] <- NA
}
test_data_aligned_svr_orig <- test_data_aligned_svr_orig[, names(train_data_svr_orig)]
test_data_aligned_svr_orig <- test_data_aligned_svr_orig %>% dplyr::select(-SalePrice) # <- ensure target is excluded
# --- Step 2: Predict using SVR model (original scale) ---
pred_svr_orig <- predict(svr_model_orig, newdata = test_data_aligned_svr_orig)
# --- Step 3: Plot Actual vs Predicted (original SalePrice scale) ---
ggplot(data = NULL, aes(x = test_data_svr_orig$SalePrice, y = pred_svr_orig)) +
geom_point(color = "black", alpha = 0.6) +
geom_abline(slope = 1, intercept = 0, color = "red", linetype = "dashed") +
labs(
title = "Actual vs Predicted SalePrice (SVR - Original Scale)",
x = "Actual SalePrice",
y = "Predicted SalePrice"
) +
theme_minimal()
# --- Load Required Libraries ---
library(ggplot2)
library(gridExtra)
# --- Step 1: Align test data for original scale model ---
test_data_aligned_svr_orig <- test_data_svr_orig
test_data_aligned_svr_orig$SalePrice <- NULL # remove target column
missing_vars_svr_orig <- setdiff(names(train_data_svr_orig), names(test_data_aligned_svr_orig))
for (var in missing_vars_svr_orig) {
test_data_aligned_svr_orig[[var]] <- NA
}
test_data_aligned_svr_orig <- test_data_aligned_svr_orig[, names(train_data_svr_orig)]
test_data_aligned_svr_orig <- test_data_aligned_svr_orig %>% dplyr::select(-SalePrice)
# --- Step 2: Predict and compute residuals (original scale) ---
predicted_svr_orig <- predict(svr_model_orig, newdata = test_data_aligned_svr_orig)
actual_svr_orig <- test_data_svr_orig$SalePrice
residuals_svr_orig <- actual_svr_orig - predicted_svr_orig
standardized_resid_svr_orig <- scale(residuals_svr_orig)[, 1]
# --- Plot 1: Residuals vs Predicted ---
p1 <- ggplot(data.frame(predicted_svr_orig, residuals_svr_orig), aes(x = predicted_svr_orig, y = residuals_svr_orig)) +
geom_point(alpha = 0.6, color = "black") +
geom_hline(yintercept = 0, color = "red", linetype = "dashed") +
geom_rug(sides = "b", alpha = 0.2) +
labs(title = "Residuals vs Predicted (SalePrice)",
x = "Predicted SalePrice",
y = "Residuals") +
theme_bw(base_size = 11)
# --- Plot 2: Q-Q Plot of Standardized Residuals ---
qq_data_svr_orig <- qqnorm(standardized_resid_svr_orig, plot.it = FALSE)
p2 <- ggplot(data.frame(x = qq_data_svr_orig$x, y = qq_data_svr_orig$y), aes(x = x, y = y)) +
geom_point(alpha = 0.6) +
geom_abline(slope = 1, intercept = 0, color = "red", linetype = "dashed") +
labs(title = "Normal Q-Q Plot (Residuals)",
x = "Theoretical Quantiles",
y = "Standardized Residuals") +
theme_bw(base_size = 11)
# --- Plot 3: Scale-Location Plot ---
sqrt_abs_resid_svr_orig <- sqrt(abs(standardized_resid_svr_orig))
p3 <- ggplot(data.frame(predicted_svr_orig, sqrt_abs_resid_svr_orig), aes(x = predicted_svr_orig, y = sqrt_abs_resid_svr_orig)) +
geom_point(alpha = 0.6) +
geom_smooth(method = "loess", se = FALSE, color = "blue") +
geom_rug(sides = "b", alpha = 0.2) +
labs(title = "Scale-Location Plot",
x = "Predicted SalePrice",
y = "Sqrt(Abs(Standardized Residuals))") +
theme_bw(base_size = 11)
# --- Plot 4: Histogram of Residuals ---
p4 <- ggplot(data.frame(residuals_svr_orig), aes(x = residuals_svr_orig)) +
geom_histogram(fill = "black", bins = 30, alpha = 0.6) +
geom_vline(xintercept = 0, color = "red", linetype = "dashed") +
labs(title = "Distribution of Residuals (SalePrice)",
x = "Residual (Actual - Predicted)",
y = "Frequency") +
theme_bw(base_size = 11)
# --- Combine and display all plots ---
gridExtra::grid.arrange(p1, p2, p3, p4, ncol = 2)
## `geom_smooth()` using formula = 'y ~ x'
# --- Step 1: Prepare diagnostics data frame ---
diag_data_svr_orig <- data.frame(
actual = test_data_svr_orig$SalePrice,
predicted = predicted_svr_orig
)
diag_data_svr_orig$residual <- diag_data_svr_orig$actual - diag_data_svr_orig$predicted
diag_data_svr_orig$standardized_resid <- scale(diag_data_svr_orig$residual)[, 1]
# --- Step 2: Identify Influential Points (|Standardized Residual| > 2) ---
diag_data_svr_orig$Influential <- abs(diag_data_svr_orig$standardized_resid) > 2
# --- Step 3: Annotate with K-format SalePrice labels ---
diag_data_svr_orig$Label <- ifelse(
diag_data_svr_orig$Influential,
paste0("$", round(diag_data_svr_orig$actual / 1000), "K"),
NA
)
# --- Step 4: Plot Influential Points with Labels ---
p5_svr_orig <- ggplot(diag_data_svr_orig, aes(x = predicted, y = standardized_resid)) +
geom_point(aes(color = Influential), alpha = 0.7) +
geom_text(
data = subset(diag_data_svr_orig, Influential),
aes(label = Label),
hjust = -0.1, vjust = 0,
size = 3, color = "red"
) +
scale_color_manual(values = c("FALSE" = "black", "TRUE" = "red")) +
geom_hline(yintercept = c(-2, 2), linetype = "dashed", color = "blue") +
labs(title = "Influential Points (|Std. Residual| > 2) – SVR Original Scale",
x = "Predicted SalePrice",
y = "Standardized Residuals") +
theme_bw(base_size = 11) +
theme(legend.position = "bottom")
p5_svr_orig
# --- Load Required Libraries ---
library(caret)
library(e1071)
library(doParallel)
library(ggplot2)
# --- Set up parallel processing ---
cl <- makeCluster(parallel::detectCores() - 1)
registerDoParallel(cl)
# --- Dataset Reference (full scaled data) ---
df_full_svr_log <- ames_data_svr_full_final
df_full_svr_log$LogSalePrice <- log(df_full_svr_log$SalePrice)
# --- Split data ---
set.seed(123)
split_index_svr_log <- createDataPartition(df_full_svr_log$LogSalePrice, p = 0.8, list = FALSE)
# Assign the train/test splits
train_data_svr_log <- df_full_svr_log[split_index_svr_log, ]
test_data_svr_log <- df_full_svr_log[-split_index_svr_log, ]
# --- Drop original SalePrice to prevent leakage ---
train_data_svr_log <- train_data_svr_log %>% dplyr::select(-SalePrice)
test_data_svr_log <- test_data_svr_log %>% dplyr::select(-SalePrice)
# --- Remove near-zero variance predictors ---
zero_var_cols_svr_log <- nearZeroVar(train_data_svr_log, saveMetrics = TRUE)
zero_var_names_svr_log <- rownames(zero_var_cols_svr_log)[zero_var_cols_svr_log$zeroVar == TRUE]
if (length(zero_var_names_svr_log) > 0) {
message("⚠️ Dropping near-zero variance predictors: ", paste(zero_var_names_svr_log, collapse = ", "))
}
## ⚠️ Dropping near-zero variance predictors: RoofMatl_Roll, Exterior1st_CBlock, Exterior1st_ImStucc, Exterior1st_PreCast, Exterior2nd_Other, Exterior2nd_PreCast
train_data_svr_log <- train_data_svr_log[, !names(train_data_svr_log) %in% zero_var_names_svr_log]
test_data_svr_log <- test_data_svr_log[, !names(test_data_svr_log) %in% zero_var_names_svr_log]
# --- Remove constant columns ---
constant_cols_svr_log <- names(train_data_svr_log)[sapply(train_data_svr_log, function(x) length(unique(x)) == 1)]
if (length(constant_cols_svr_log) > 0) {
message("⚠️ Dropping constant columns: ", paste(constant_cols_svr_log, collapse = ", "))
}
train_data_svr_log <- train_data_svr_log[, !names(train_data_svr_log) %in% constant_cols_svr_log]
test_data_svr_log <- test_data_svr_log[, !names(test_data_svr_log) %in% constant_cols_svr_log]
# --- Align columns ---
common_cols_svr_log <- intersect(names(train_data_svr_log), names(test_data_svr_log))
train_data_svr_log <- train_data_svr_log[, common_cols_svr_log]
test_data_svr_log <- test_data_svr_log[, common_cols_svr_log]
# --- Add back LogSalePrice target ---
train_data_svr_log$LogSalePrice <- df_full_svr_log$LogSalePrice[split_index_svr_log]
test_data_svr_log$LogSalePrice <- df_full_svr_log$LogSalePrice[-split_index_svr_log]
# --- Set training control ---
train_control_svr_log <- trainControl(method = "cv", number = 5)
# --- Expanded tuning grid ---
tune_grid_svr_log <- expand.grid(
sigma = c(0.0001, 0.0005, 0.001, 0.005, 0.01),
C = c(0.1, 1, 10, 50, 100, 200, 500)
)
# --- Train SVR Model on Log-Transformed Target ---
set.seed(42)
svr_model_log <- train(
LogSalePrice ~ ., data = train_data_svr_log,
method = "svmRadial",
trControl = train_control_svr_log,
tuneGrid = tune_grid_svr_log,
metric = "RMSE"
)
# --- Predict and back-transform to original scale ---
log_pred_svr_log <- predict(svr_model_log, newdata = test_data_svr_log)
pred_svr_log <- exp(log_pred_svr_log)
# --- Evaluate Model ---
results_svr_log <- postResample(pred = pred_svr_log, obs = exp(test_data_svr_log$LogSalePrice))
print(results_svr_log)
## RMSE Rsquared MAE
## 1.536412e+04 9.638954e-01 7.489817e+03
# --- Stop parallel processing ---
stopCluster(cl)
registerDoSEQ()
svr_model_log
# --- Align test data for log model ---
missing_vars_svr_log <- setdiff(names(train_data_svr_log), names(test_data_svr_log))
test_data_aligned_svr_log <- test_data_svr_log
for (var in missing_vars_svr_log) {
test_data_aligned_svr_log[[var]] <- NA
}
test_data_aligned_svr_log <- test_data_aligned_svr_log[, names(train_data_svr_log)]
test_data_aligned_svr_log <- test_data_aligned_svr_log %>% dplyr::select(-LogSalePrice)
# --- Predict and back-transform ---
pred_svr_log <- exp(predict(svr_model_log, newdata = test_data_aligned_svr_log))
# --- Plot Actual vs Predicted (Original Scale) ---
ggplot(data = NULL, aes(x = exp(test_data_svr_log$LogSalePrice), y = pred_svr_log)) +
geom_point(color = "black", alpha = 0.6) +
geom_abline(slope = 1, intercept = 0, color = "red", linetype = "dashed") +
labs(
title = "Actual vs Predicted SalePrice (SVR - Log Model, Original Scale)",
x = "Actual SalePrice",
y = "Predicted SalePrice"
) +
theme_minimal()
# --- Align test data ---
test_data_aligned_svr_log <- test_data_svr_log
test_data_aligned_svr_log$LogSalePrice <- NULL
missing_vars_svr_log <- setdiff(names(train_data_svr_log), names(test_data_aligned_svr_log))
for (var in missing_vars_svr_log) {
test_data_aligned_svr_log[[var]] <- NA
}
test_data_aligned_svr_log <- test_data_aligned_svr_log[, names(train_data_svr_log)]
test_data_aligned_svr_log <- test_data_aligned_svr_log %>% dplyr::select(-LogSalePrice)
# --- Predict and compute residuals ---
log_predicted_svr_log <- predict(svr_model_log, newdata = test_data_aligned_svr_log)
predicted_svr_log <- exp(log_predicted_svr_log)
actual_svr_log <- exp(test_data_svr_log$LogSalePrice)
residuals_svr_log <- actual_svr_log - predicted_svr_log
standardized_resid_svr_log <- scale(residuals_svr_log)[, 1]
# --- Plot 1: Residuals vs Predicted ---
p1 <- ggplot(data.frame(predicted_svr_log, residuals_svr_log), aes(x = predicted_svr_log, y = residuals_svr_log)) +
geom_point(alpha = 0.6, color = "black") +
geom_hline(yintercept = 0, color = "red", linetype = "dashed") +
geom_rug(sides = "b", alpha = 0.2) +
labs(title = "Residuals vs Predicted (SalePrice) – Log Model",
x = "Predicted SalePrice",
y = "Residuals") +
theme_bw(base_size = 11)
# --- Plot 2: Q-Q Plot of Standardized Residuals ---
qq_data_svr_log <- qqnorm(standardized_resid_svr_log, plot.it = FALSE)
p2 <- ggplot(data.frame(x = qq_data_svr_log$x, y = qq_data_svr_log$y), aes(x = x, y = y)) +
geom_point(alpha = 0.6) +
geom_abline(slope = 1, intercept = 0, color = "red", linetype = "dashed") +
labs(title = "Normal Q-Q Plot (Residuals) – Log Model",
x = "Theoretical Quantiles",
y = "Standardized Residuals") +
theme_bw(base_size = 11)
# --- Plot 3: Scale-Location Plot ---
sqrt_abs_resid_svr_log <- sqrt(abs(standardized_resid_svr_log))
p3 <- ggplot(data.frame(predicted_svr_log, sqrt_abs_resid_svr_log), aes(x = predicted_svr_log, y = sqrt_abs_resid_svr_log)) +
geom_point(alpha = 0.6) +
geom_smooth(method = "loess", se = FALSE, color = "blue") +
geom_rug(sides = "b", alpha = 0.2) +
labs(title = "Scale-Location Plot – Log Model",
x = "Predicted SalePrice",
y = "Sqrt(Abs(Standardized Residuals))") +
theme_bw(base_size = 11)
# --- Plot 4: Histogram of Residuals ---
p4 <- ggplot(data.frame(residuals_svr_log), aes(x = residuals_svr_log)) +
geom_histogram(fill = "black", bins = 30, alpha = 0.6) +
geom_vline(xintercept = 0, color = "red", linetype = "dashed") +
labs(title = "Distribution of Residuals (SalePrice) – Log Model",
x = "Residual (Actual - Predicted)",
y = "Frequency") +
theme_bw(base_size = 11)
# --- Combine all plots ---
gridExtra::grid.arrange(p1, p2, p3, p4, ncol = 2)
## `geom_smooth()` using formula = 'y ~ x'
# --- Prepare diagnostics data ---
diag_data_svr_log <- data.frame(
actual = exp(test_data_svr_log$LogSalePrice),
predicted = predicted_svr_log
)
diag_data_svr_log$residual <- diag_data_svr_log$actual - diag_data_svr_log$predicted
diag_data_svr_log$standardized_resid <- scale(diag_data_svr_log$residual)[, 1]
# --- Identify Influential Points ---
diag_data_svr_log$Influential <- abs(diag_data_svr_log$standardized_resid) > 2
# --- Annotate with labels ---
diag_data_svr_log$Label <- ifelse(
diag_data_svr_log$Influential,
paste0("$", round(diag_data_svr_log$actual / 1000), "K"),
NA
)
# --- Plot Influential Points ---
p5_svr_log <- ggplot(diag_data_svr_log, aes(x = predicted, y = standardized_resid)) +
geom_point(aes(color = Influential), alpha = 0.7) +
geom_text(
data = subset(diag_data_svr_log, Influential),
aes(label = Label),
hjust = -0.1, vjust = 0,
size = 3, color = "red"
) +
scale_color_manual(values = c("FALSE" = "black", "TRUE" = "red")) +
geom_hline(yintercept = c(-2, 2), linetype = "dashed", color = "blue") +
labs(title = "Influential Points (|Std. Residual| > 2) – SVR Log Model, Original Scale",
x = "Predicted SalePrice",
y = "Standardized Residuals") +
theme_bw(base_size = 11) +
theme(legend.position = "bottom")
p5_svr_log
SVR Model Evaluation: Predicting Log-Transformed SalePrice
The Support Vector Regression (SVR) model, trained on log-transformed SalePrice, achieved strong overall predictive performance on the held-out test set: RMSE ≈ $15,364 and MAE ≈ $7,490 on the original dollar scale, with R² ≈ 0.964.
Prior to model training, the following near-zero variance predictors were removed to improve generalizability and reduce noise: RoofMatl_Roll, Exterior1st_CBlock, Exterior1st_ImStucc, Exterior1st_PreCast, Exterior2nd_Other, Exterior2nd_PreCast.
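The filtering step itself is not reproduced here; a minimal sketch of how such near-zero variance dummies can be flagged with caret, assuming the same train_data_svr_log frame used to fit the model above:
# --- Sketch: flag near-zero variance predictors on the training frame ---
nzv_info <- caret::nearZeroVar(train_data_svr_log, saveMetrics = TRUE)
nzv_names <- rownames(nzv_info)[nzv_info$nzv]
# Drop the flagged columns before training
train_data_svr_log_nzv <- train_data_svr_log[, !names(train_data_svr_log) %in% nzv_names]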
The Actual vs Predicted plot illustrates a tight alignment along the 45-degree line, indicating excellent calibration across most values. Residual diagnostics confirm that residuals are centered, approximately normally distributed, and display limited heteroscedasticity. However, the Scale-Location and Residuals vs Predicted plots reveal slightly increased variance at both extremes of the predicted range. The Q-Q plot shows mild tail deviation, and the Influential Points plot identifies several observations with standardized residuals exceeding ±2 — particularly for properties priced at the very high and low ends.
From a business perspective, the model performs most accurately for mid-market homes, particularly those priced between $150K and $350K (log-transformed range ≈ 11.9–12.7). Predictions within this range exhibit minimal error. In contrast, the model tends to overestimate prices for lower-end homes (under $100K) and underestimate higher-end homes (above $450K). Influential cases, including properties as low as $35K and as high as $592K, highlight these edge-case deviations.
While the SVR model provides reliable estimates for the core housing market, business users should apply caution when interpreting predictions for luxury or budget properties. Supplementary models or price-segment-specific adjustments may be required to enhance accuracy in those segments.
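One way to quantify this segment effect is to compute the error by price band on the held-out set; a minimal sketch using the objects created above (the cut-points mirror the $100K and $450K thresholds discussed here):
# --- Sketch: MAE of the log-SVR model by price segment ---
actual_dollars <- exp(test_data_svr_log$LogSalePrice)
seg <- cut(actual_dollars,
           breaks = c(0, 100000, 450000, Inf),
           labels = c("under $100K", "$100K-$450K", "over $450K"))
tapply(abs(actual_dollars - pred_svr_log), seg, mean)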
Train a high-performance XGBoost regression model using gradient-boosted decision trees, with hyperparameter tuning through grid search and model evaluation via 5-fold cross-validation.
Model: xgbTree via caret::train()
Grid Search: nrounds: 100, 200; max_depth: 3, 6; eta: 0.01, 0.1; gamma: 0; colsample_bytree: 1; min_child_weight: 1; subsample: 1
Cross-Validation: 5-fold
Parallelization: Enabled (doParallel)
#### XGBoost: Grid Search with Cross-Validation
# --- Load Required Libraries ---
library(caret)
library(xgboost)
##
## Attaching package: 'xgboost'
## The following object is masked from 'package:dplyr':
##
## slice
library(doParallel)
library(ggplot2)
# --- Set up parallel processing ---
cl <- makeCluster(parallel::detectCores() - 1)
registerDoParallel(cl)
# --- Dataset Reference ---
df_full_xgb <- ames_data_svr_full_final
# --- Drop capped_SalePrice from predictors (if applicable) ---
drop_predictors_xgb <- c("capped_SalePrice")
# --- Split data using same index as other models ---
train_data_xgb <- df_full_xgb[split_index, ]
test_data_xgb <- df_full_xgb[-split_index, ]
# --- Drop unnecessary predictors ---
train_data_xgb <- train_data_xgb[, !names(train_data_xgb) %in% drop_predictors_xgb]
test_data_xgb <- test_data_xgb[, !names(test_data_xgb) %in% drop_predictors_xgb]
# --- Set training control ---
train_control_xgb <- trainControl(
method = "cv", number = 5,
verboseIter = TRUE,
allowParallel = TRUE
)
# --- Define tuning grid ---
tune_grid_xgb <- expand.grid(
nrounds = c(100, 200),
max_depth = c(3, 6),
eta = c(0.01, 0.1),
gamma = 0,
colsample_bytree = 1,
min_child_weight = 1,
subsample = 1
)
# --- Train XGBoost Model ---
set.seed(42)
xgb_model <- train(
SalePrice ~ ., data = train_data_xgb,
method = "xgbTree",
trControl = train_control_xgb,
tuneGrid = tune_grid_xgb,
metric = "RMSE"
)
## Aggregating results
## Selecting tuning parameters
## Fitting nrounds = 200, max_depth = 3, eta = 0.1, gamma = 0, colsample_bytree = 1, min_child_weight = 1, subsample = 1 on full training set
# --- Predict and Evaluate ---
pred_xgb <- predict(xgb_model, newdata = test_data_xgb)
results_xgb <- postResample(pred = pred_xgb, obs = test_data_xgb$SalePrice)
cat("Best parameters:\n")
## Best parameters:
print(xgb_model$bestTune)
## nrounds max_depth eta gamma colsample_bytree min_child_weight subsample
## 6 200 3 0.1 0 1 1 1
cat("\nPerformance:\n")
##
## Performance:
print(results_xgb)
## RMSE Rsquared MAE
## 1.922593e+04 9.435035e-01 1.283358e+04
# --- Stop parallel processing ---
stopCluster(cl)
registerDoSEQ()
xgb_model
# --- Plot Actual vs Predicted (XGBoost Model) ---
ggplot(data = NULL, aes(x = test_data_xgb$SalePrice, y = pred_xgb)) +
geom_point(alpha = 0.6, color = "black") +
geom_abline(slope = 1, intercept = 0, linetype = "dashed", color = "red") +
annotate("text", x = 360000, y = 100000,
label = "$450K",
color = "red", hjust = 0) +
labs(
title = "Actual vs Predicted SalePrice (XGBoost Model)",
x = "Actual SalePrice",
y = "Predicted SalePrice"
) +
theme_minimal()
Model Comparison: XGBoost vs. SVR vs. Random Forest (Actual vs. Predicted)
The Actual vs Predicted plots below compare the predictive accuracy and behavior of the XGBoost, Support Vector Regression (SVR), and Random Forest (RF) models on the Ames Housing dataset. Here are the key comparative insights:
XGBoost
Best overall performer, especially in high-price segments. Indicates excellent generalization.
SVR (Support Vector Regression)
Solid performer, with less variance than RF but slightly weaker than XGBoost at the very high end.
Random Forest
Performs well in mid-range homes, but struggles with systematic bias in high-price segments.
Conclusion
For homes priced below $400K, all models perform reliably, with RF acceptable, SVR stable, and XGBoost slightly more accurate.
For luxury homes above $400K, XGBoost is the strongest of the three, offering the most balanced and precise performance across the full price range.
xgb_model
# --- Load Required Libraries ---
library(ggplot2)
library(gridExtra)
# --- Step 1: Predict and compute residuals ---
predicted_xgb <- predict(xgb_model, newdata = test_data_xgb)
actual_xgb <- test_data_xgb$SalePrice
residuals_xgb <- actual_xgb - predicted_xgb
standardized_resid_xgb <- scale(residuals_xgb)[, 1]
# --- Plot 1: Residuals vs Predicted ---
p1 <- ggplot(data.frame(predicted_xgb, residuals_xgb), aes(x = predicted_xgb, y = residuals_xgb)) +
geom_point(alpha = 0.6, color = "black") +
geom_hline(yintercept = 0, color = "red", linetype = "dashed") +
geom_rug(sides = "b", alpha = 0.2) +
labs(title = "Residuals vs Predicted (SalePrice) – XGBoost",
x = "Predicted SalePrice",
y = "Residuals") +
theme_bw(base_size = 11)
# --- Plot 2: Q-Q Plot of Standardized Residuals ---
qq_data_xgb <- qqnorm(standardized_resid_xgb, plot.it = FALSE)
p2 <- ggplot(data.frame(x = qq_data_xgb$x, y = qq_data_xgb$y), aes(x = x, y = y)) +
geom_point(alpha = 0.6) +
geom_abline(slope = 1, intercept = 0, color = "red", linetype = "dashed") +
labs(title = "Normal Q-Q Plot (Residuals) – XGBoost",
x = "Theoretical Quantiles",
y = "Standardized Residuals") +
theme_bw(base_size = 11)
# --- Plot 3: Scale-Location Plot ---
sqrt_abs_resid_xgb <- sqrt(abs(standardized_resid_xgb))
p3 <- ggplot(data.frame(predicted_xgb, sqrt_abs_resid_xgb), aes(x = predicted_xgb, y = sqrt_abs_resid_xgb)) +
geom_point(alpha = 0.6) +
geom_smooth(method = "loess", se = FALSE, color = "blue") +
geom_rug(sides = "b", alpha = 0.2) +
labs(title = "Scale-Location Plot – XGBoost",
x = "Predicted SalePrice",
y = "Sqrt(Abs(Standardized Residuals))") +
theme_bw(base_size = 11)
# --- Plot 4: Histogram of Residuals ---
p4 <- ggplot(data.frame(residuals_xgb), aes(x = residuals_xgb)) +
geom_histogram(fill = "black", bins = 30, alpha = 0.6) +
geom_vline(xintercept = 0, color = "red", linetype = "dashed") +
labs(title = "Distribution of Residuals (SalePrice) – XGBoost",
x = "Residual (Actual - Predicted)",
y = "Frequency") +
theme_bw(base_size = 11)
# --- Combine all plots ---
gridExtra::grid.arrange(p1, p2, p3, p4, ncol = 2)
## `geom_smooth()` using formula = 'y ~ x'
# --- Step 1: Create diagnostics data frame ---
diag_data_xgb <- data.frame(
actual = actual_xgb,
predicted = predicted_xgb
)
# --- Step 2: Compute residuals and standardized residuals ---
diag_data_xgb$residual <- diag_data_xgb$actual - diag_data_xgb$predicted
diag_data_xgb$standardized_resid <- scale(diag_data_xgb$residual)[, 1]
# --- Step 3: Flag and label influential points ---
diag_data_xgb$Influential <- abs(diag_data_xgb$standardized_resid) > 2
diag_data_xgb$Label <- ifelse(
diag_data_xgb$Influential,
paste0("$", round(diag_data_xgb$actual / 1000), "K"),
NA
)
# --- Step 4: Plot Influential Points ---
p_influential_xgb <- ggplot(diag_data_xgb, aes(x = predicted, y = standardized_resid)) +
geom_point(aes(color = Influential), alpha = 0.7) +
geom_text(
data = subset(diag_data_xgb, Influential),
aes(label = Label),
hjust = -0.1, vjust = 0,
size = 3, color = "red"
) +
scale_color_manual(values = c("FALSE" = "black", "TRUE" = "red")) +
geom_hline(yintercept = c(-2, 2), linetype = "dashed", color = "blue") +
labs(
title = "Influential Points (|Std. Residual| > 2) – XGBoost",
x = "Predicted SalePrice",
y = "Standardized Residuals"
) +
theme_bw(base_size = 11) +
theme(legend.position = "bottom")
# --- Display the plot ---
p_influential_xgb
Residuals vs. Predicted: Comparative Analysis of Model Error Patterns
XGBoost
Support Vector Regression (SVR)
Random Forest
Conclusion
Among the three, if high accuracy in the luxury housing segment (>$400K) is critical, XGBoost or SVR are better candidates than Random Forest.
Distribution of Residuals – XGBoost vs SVR vs Random Forest
XGBoost shows a bell-shaped distribution centered around zero, but with a noticeably wider spread than the other models. Most residuals fall within approximately ±125,000, and the tails extend farther than with SVR or Random Forest. This suggests that while XGBoost captures the general trend well and handles variability across price ranges, it allows larger errors at the extremes. It is more tolerant of outliers, especially on both ends of the sale price spectrum.
SVR (Support Vector Regression) produces a much narrower, peaked distribution of residuals. Most of its prediction errors lie within a tight ±25,000 range, indicating that the model is highly precise in the mid-range of sale prices. However, the residuals also reveal some sharp positive spikes, particularly for high-priced homes. This means SVR tends to perform exceptionally well for homes below $400,000 but may underperform for more expensive properties.
Random Forest has a moderately symmetric distribution of residuals, centered around zero like the others, but with a spread wider than SVR and narrower than XGBoost. The majority of residuals fall within ±75,000. The distribution suggests that Random Forest offers a good balance between precision and flexibility, though it shows more variability than SVR.
Conclusion
Among the three, SVR is the most consistent and reliable model for homes priced up to around $400,000, offering minimal prediction error. Random Forest is slightly less consistent but performs solidly across all ranges. XGBoost, while more prone to larger errors, is the most adaptable and suitable for capturing complex patterns and predicting homes at both lower and higher ends of the market.
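The spreads described above can be checked numerically. A minimal sketch comparing the residual quantiles of the two models whose residual vectors were computed in this section (the Random Forest residuals come from an earlier chunk and are not recomputed here):
# --- Sketch: central 95% residual range for XGBoost vs the log-SVR model, in dollars ---
sapply(list(XGBoost = residuals_xgb, SVR_log = residuals_svr_log),
       quantile, probs = c(0.025, 0.5, 0.975))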
# --- Load Required Library ---
library(xgboost)
# --- Extract final XGBoost model ---
xgb_final <- xgb_model$finalModel # assuming your caret model is named xgb_model
# --- Get feature importance matrix ---
importance_matrix <- xgb.importance(model = xgb_final)
# --- Plot variable importance ---
xgb.plot.importance(importance_matrix,
top_n = 20,
main = "XGBoost Feature Importance")
Across all three tree-based models—Pruned Decision Tree, Random Forest, and XGBoost—Overall Quality (Overall.Qual), Above-ground Living Area (Gr.Liv.Area), and First Floor SF (X1st.Flr.SF) consistently rank among the most important predictors of house prices. However, each model weighs and interprets feature influence differently:
Pruned Decision Tree (Split Frequency): Gr.Liv.Area, X1st.Flr.SF, Year.Built, Overall.Qual
Random Forest (Increase in Node Purity): Overall.Qual, Garage.Cars, Gr.Liv.Area, Total.Bsmt.SF, X1st.Flr.SF
XGBoost (Gain Importance): Overall.Qual, followed by is_outlier_SalePrice_80, Gr.Liv.Area, Total.Bsmt.SF, and Garage.Cars; XGBoost leans heavily on Overall.Qual and engineered flags like the outlier indicators.
Key Observations:
Overall.Qual is the most dominant feature in every model.
Gr.Liv.Area and Total.Bsmt.SF consistently rank high.
Conclusion: While all models agree on the core importance of house size and quality, each model's feature ranking reflects its architecture.
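To line the rankings up side by side, a rough sketch is shown below. It assumes the Random Forest caret object fitted earlier (rf_model) is still in memory and uses the importance_matrix produced by xgb.importance() above; the pruned tree object is not referenced here.
# --- Sketch: top-10 features by XGBoost gain vs Random Forest node purity ---
library(randomForest)
top_xgb <- head(importance_matrix$Feature, 10)
rf_imp  <- importance(rf_model$finalModel)
top_rf  <- head(rownames(rf_imp)[order(-rf_imp[, "IncNodePurity"])], 10)
data.frame(rank = 1:10, XGBoost = top_xgb, RandomForest = top_rf)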
Train an accurate KNN regression model with optimized number of neighbors (k) using grid search and evaluate performance through 5-fold cross-validation.
Model: knn via caret::train()
Grid Search: k: 3, 5, 7, 9
Cross-Validation: 5-fold
Parallelization: Enabled (doParallel)
# --- Load Required Libraries ---
library(caret)
library(doParallel)
library(ggplot2)
# --- Set up parallel processing ---
cl_knn_model <- makeCluster(parallel::detectCores() - 1)
registerDoParallel(cl_knn_model)
# --- Drop capped_SalePrice before modeling ---
drop_predictors_knn_model <- c("capped_SalePrice")
# --- Split data ---
set.seed(123)
split_index_knn_model <- createDataPartition(ames_data_minmax_full_final$SalePrice, p = 0.8, list = FALSE)
train_data_knn_model <- ames_data_minmax_full_final[split_index_knn_model, ]
test_data_knn_model <- ames_data_minmax_full_final[-split_index_knn_model, ]
# --- Exclude dropped predictors if present ---
train_data_knn_model <- train_data_knn_model[, !names(train_data_knn_model) %in% drop_predictors_knn_model]
test_data_knn_model <- test_data_knn_model[, !names(test_data_knn_model) %in% drop_predictors_knn_model]
# --- Set training control ---
train_control_knn_model <- trainControl(
method = "cv", number = 5,
verboseIter = TRUE,
allowParallel = TRUE
)
# --- Define tuning grid ---
tune_grid_knn_model <- expand.grid(k = c(3, 5, 7, 9))
# --- Train KNN Model ---
set.seed(42)
knn_model <- train(
SalePrice ~ ., data = train_data_knn_model,
method = "knn",
trControl = train_control_knn_model,
tuneGrid = tune_grid_knn_model,
metric = "RMSE"
)
## Aggregating results
## Selecting tuning parameters
## Fitting k = 7 on full training set
# --- Predict and Evaluate ---
pred_knn_model <- predict(knn_model, newdata = test_data_knn_model)
results_knn_model <- postResample(pred = pred_knn_model, obs = test_data_knn_model$SalePrice)
print(results_knn_model)
## RMSE Rsquared MAE
## 3.447383e+04 8.304116e-01 2.274321e+04
# --- Stop parallel processing ---
stopCluster(cl_knn_model)
registerDoSEQ()
# --- Plot 1: Actual vs Predicted ---
ggplot(data = NULL, aes(x = test_data_knn_model$SalePrice, y = pred_knn_model)) +
geom_point(alpha = 0.6, color = "black") +
geom_abline(slope = 1, intercept = 0, linetype = "dashed", color = "red") +
labs(title = "KNN: Actual vs Predicted", x = "Actual SalePrice", y = "Predicted SalePrice") +
theme_minimal()
# --- Plot 2: Elbow Curve (k vs RMSE) ---
rmse_df_knn_model <- knn_model$results[, c("k", "RMSE")]
ggplot(rmse_df_knn_model, aes(x = k, y = RMSE)) +
geom_line(color = "orange", linewidth = 1) +
geom_point(color = "orange", size = 2) +
labs(
title = "KNN: Elbow Curve (k vs RMSE)",
x = "Number of Neighbors (k)",
y = "RMSE"
) +
theme_minimal()
# --- Load Required Libraries ---
library(ggplot2)
library(gridExtra)
# --- Predict and compute residuals ---
pred_knn_model <- predict(knn_model, newdata = test_data_knn_model)
actual_knn_model <- test_data_knn_model$SalePrice
residuals_knn_model <- actual_knn_model - pred_knn_model
standardized_resid_knn_model <- scale(residuals_knn_model)[, 1]
# --- Plot 1: Residuals vs Predicted ---
p1_full <- ggplot(data.frame(pred_knn_model, residuals_knn_model), aes(x = pred_knn_model, y = residuals_knn_model)) +
geom_point(alpha = 0.6, color = "black") +
geom_hline(yintercept = 0, color = "red", linetype = "dashed") +
labs(title = "Residuals vs Predicted – KNN (Full Data)",
x = "Predicted SalePrice",
y = "Residuals") +
theme_bw(base_size = 11)
# --- Plot 2: Q-Q Plot ---
qq_data_full <- qqnorm(standardized_resid_knn_model, plot.it = FALSE)
p2_full <- ggplot(data.frame(x = qq_data_full$x, y = qq_data_full$y), aes(x = x, y = y)) +
geom_point(alpha = 0.6) +
geom_abline(slope = 1, intercept = 0, color = "red", linetype = "dashed") +
labs(title = "Q-Q Plot – KNN (Full Data)",
x = "Theoretical Quantiles",
y = "Standardized Residuals") +
theme_bw(base_size = 11)
# --- Plot 3: Scale-Location Plot ---
sqrt_abs_resid_full <- sqrt(abs(standardized_resid_knn_model))
p3_full <- ggplot(data.frame(pred_knn_model, sqrt_abs_resid_full), aes(x = pred_knn_model, y = sqrt_abs_resid_full)) +
geom_point(alpha = 0.6) +
geom_smooth(method = "loess", se = FALSE, color = "blue") +
labs(title = "Scale-Location Plot – KNN (Full Data)",
x = "Predicted SalePrice",
y = "√|Standardized Residuals|") +
theme_bw(base_size = 11)
# --- Plot 4: Histogram of Residuals ---
p4_full <- ggplot(data.frame(residuals_knn_model), aes(x = residuals_knn_model)) +
geom_histogram(fill = "black", bins = 30, alpha = 0.6) +
geom_vline(xintercept = 0, color = "red", linetype = "dashed") +
labs(title = "Residual Distribution – KNN (Full Data)",
x = "Residual",
y = "Frequency") +
theme_bw(base_size = 11)
# --- Combine plots ---
gridExtra::grid.arrange(p1_full, p2_full, p3_full, p4_full, ncol = 2)
## `geom_smooth()` using formula = 'y ~ x'
#### KNN: Grid Search with Cross-Validation (XGBoost Top Variables, Min-Max Scaled)
# --- Load Required Libraries ---
library(caret)
library(doParallel)
library(ggplot2)
library(dplyr)
# --- Set up parallel processing ---
cl_knn_model_selected <- makeCluster(parallel::detectCores() - 1)
registerDoParallel(cl_knn_model_selected)
# --- Use Min-Max Scaled Full Data ---
ames_data_knn_model_selected <- ames_data_minmax_full_final
# --- Define XGBoost-important variables ---
xgb_vars_knn_model_selected <- c(
"OverallQual", "is_outlier_SalePrice", "GrLivArea", "GarageCars", "TotalBsmtSF",
"KitchenQual", "BsmtQual", "FullBath", "BsmtFinSF1", "LotArea", "Fireplaces",
"RemodAge", "X2ndFlrSF", "HouseAge", "GarageFinish", "OverallCond",
"X1stFlrSF", "CentralAir_Y", "BsmtFinType1", "GarageType_Attchd", "SalePrice"
)
# --- Subset the data ---
xgb_vars_knn_model_selected <- intersect(xgb_vars_knn_model_selected, names(ames_data_knn_model_selected))
ames_data_knn_model_selected <- ames_data_knn_model_selected[, xgb_vars_knn_model_selected]
# --- Split data ---
set.seed(123)
split_index_knn_model_selected <- createDataPartition(ames_data_knn_model_selected$SalePrice, p = 0.8, list = FALSE)
train_data_knn_model_selected <- ames_data_knn_model_selected[split_index_knn_model_selected, ]
test_data_knn_model_selected <- ames_data_knn_model_selected[-split_index_knn_model_selected, ]
# --- Set training control ---
train_control_knn_model_selected <- trainControl(
method = "cv", number = 5,
verboseIter = TRUE,
allowParallel = TRUE
)
# --- Define tuning grid ---
tune_grid_knn_model_selected <- expand.grid(k = c(3, 5, 7, 9))
# --- Train KNN Model ---
set.seed(42)
knn_model_selected <- train(
SalePrice ~ ., data = train_data_knn_model_selected,
method = "knn",
trControl = train_control_knn_model_selected,
tuneGrid = tune_grid_knn_model_selected,
metric = "RMSE"
)
## Aggregating results
## Selecting tuning parameters
## Fitting k = 5 on full training set
# --- Predict and Evaluate ---
pred_knn_model_selected <- predict(knn_model_selected, newdata = test_data_knn_model_selected)
results_knn_model_selected <- postResample(pred = pred_knn_model_selected, obs = test_data_knn_model_selected$SalePrice)
print(results_knn_model_selected)
## RMSE Rsquared MAE
## 3.080953e+04 8.580536e-01 2.012279e+04
# --- Stop parallel processing ---
stopCluster(cl_knn_model_selected)
registerDoSEQ()
# --- Plot 1: Actual vs Predicted for XGBoost-Selected KNN ---
ggplot(data = NULL, aes(x = test_data_knn_model_selected$SalePrice, y = pred_knn_model_selected)) +
geom_point(alpha = 0.6, color = "black") +
geom_abline(slope = 1, intercept = 0, linetype = "dashed", color = "red") +
labs(
title = "KNN: Actual vs Predicted (XGBoost Variables)",
x = "Actual SalePrice",
y = "Predicted SalePrice"
) +
theme_minimal()
# --- Plot 2: Elbow Curve (k vs RMSE) for XGBoost-Selected KNN ---
rmse_df_knn_model_selected <- knn_model_selected$results[, c("k", "RMSE")]
ggplot(rmse_df_knn_model_selected, aes(x = k, y = RMSE)) +
geom_line(color = "orange", linewidth = 1) +
geom_point(color = "orange", size = 2) +
labs(
title = "KNN: Elbow Curve (k vs RMSE) – XGBoost Variables",
x = "Number of Neighbors (k)",
y = "RMSE"
) +
theme_minimal()
# --- Predict and compute residuals ---
pred_knn_selected <- predict(knn_model_selected, newdata = test_data_knn_model_selected)
actual_knn_selected <- test_data_knn_model_selected$SalePrice
residuals_knn_selected <- actual_knn_selected - pred_knn_selected
standardized_resid_knn_selected <- scale(residuals_knn_selected)[, 1]
# --- Plot 1: Residuals vs Predicted ---
p1_sel <- ggplot(data.frame(pred_knn_selected, residuals_knn_selected), aes(x = pred_knn_selected, y = residuals_knn_selected)) +
geom_point(alpha = 0.6, color = "black") +
geom_hline(yintercept = 0, color = "red", linetype = "dashed") +
labs(title = "Residuals vs Predicted – KNN (XGBoost Vars)",
x = "Predicted SalePrice",
y = "Residuals") +
theme_bw(base_size = 11)
# --- Plot 2: Q-Q Plot ---
qq_data_sel <- qqnorm(standardized_resid_knn_selected, plot.it = FALSE)
p2_sel <- ggplot(data.frame(x = qq_data_sel$x, y = qq_data_sel$y), aes(x = x, y = y)) +
geom_point(alpha = 0.6) +
geom_abline(slope = 1, intercept = 0, color = "red", linetype = "dashed") +
labs(title = "Q-Q Plot – KNN (XGBoost Vars)",
x = "Theoretical Quantiles",
y = "Standardized Residuals") +
theme_bw(base_size = 11)
# --- Plot 3: Scale-Location Plot ---
sqrt_abs_resid_sel <- sqrt(abs(standardized_resid_knn_selected))
p3_sel <- ggplot(data.frame(pred_knn_selected, sqrt_abs_resid_sel), aes(x = pred_knn_selected, y = sqrt_abs_resid_sel)) +
geom_point(alpha = 0.6) +
geom_smooth(method = "loess", se = FALSE, color = "blue") +
labs(title = "Scale-Location Plot – KNN (XGBoost Vars)",
x = "Predicted SalePrice",
y = "√|Standardized Residuals|") +
theme_bw(base_size = 11)
# --- Plot 4: Histogram of Residuals ---
p4_sel <- ggplot(data.frame(residuals_knn_selected), aes(x = residuals_knn_selected)) +
geom_histogram(fill = "black", bins = 30, alpha = 0.6) +
geom_vline(xintercept = 0, color = "red", linetype = "dashed") +
labs(title = "Residual Distribution – KNN (XGBoost Vars)",
x = "Residual",
y = "Frequency") +
theme_bw(base_size = 11)
# --- Combine plots ---
gridExtra::grid.arrange(p1_sel, p2_sel, p3_sel, p4_sel, ncol = 2)
## `geom_smooth()` using formula = 'y ~ x'
KNN Model Comparison: Full Dataset vs. XGBoost-Selected Features
Evaluation Plots and Interpretation
Using the top features from XGBoost improves KNN performance: test RMSE drops from roughly $34,474 to $30,810, MAE from roughly $22,743 to $20,123, and R² rises from 0.830 to 0.858.
Compared to the other models, however, even the feature-selected KNN remains the weakest performer, trailing SVR, XGBoost, and Random Forest on this dataset.
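For reference, the held-out metrics collected so far in this section can be stacked into one table; a minimal sketch that includes only the models whose result objects were created above:
# --- Sketch: side-by-side test-set metrics (RMSE, Rsquared, MAE) ---
round(rbind(
  SVR_log      = results_svr_log,
  XGBoost      = results_xgb,
  KNN_full     = results_knn_model,
  KNN_selected = results_knn_model_selected
), 3)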
Develop and evaluate a predictive artificial neural network (ANN) regression model using grid search and 5-fold cross-validation to identify the best-performing hyperparameters.
Modeling Framework: nnet (via caret::train())
Hyperparameter Grid Search: size (number of hidden units): 1, 3, 5; decay (weight regularization): 0, 0.01, 0.1
Resampling Strategy: 5-fold cross-validation
Parallel Processing: Enabled using the doParallel package for faster model training
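The caret call for this specification is not shown in this excerpt; a minimal sketch under the stated grid, assuming a numeric training frame train_data with SalePrice as the target (as prepared in the Keras chunk below):
# --- Sketch: nnet grid search as described above ---
library(caret)
set.seed(42)
nnet_grid <- expand.grid(size = c(1, 3, 5), decay = c(0, 0.01, 0.1))
nnet_model <- train(
  SalePrice ~ ., data = train_data,
  method = "nnet",
  trControl = trainControl(method = "cv", number = 5, allowParallel = TRUE),
  tuneGrid = nnet_grid,
  metric = "RMSE",
  linout = TRUE,    # linear output units for regression
  trace = FALSE,    # suppress nnet's iteration log
  MaxNWts = 5000    # allow enough weights for the wide dummy-coded predictor set
)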
This analysis compares three Keras artificial neural network (ANN) architectures across both their design and predictive performance. It combines quantitative evaluation with visual diagnostics to provide a holistic view of each model’s effectiveness.
# Install if not yet installed
# install.packages("keras")
# keras::install_keras()
library(keras)
library(dplyr)
library(caret)
library(tibble)
# --- Load Required Libraries ---
library(dplyr)
library(caret)
# --- Step 1: Assign min-max scaled dataset ---
ames_data_keras <- ames_data_minmax_full_final # Best for ANN
# --- Step 2: Split data ---
set.seed(123)
split_index <- createDataPartition(ames_data_keras$SalePrice, p = 0.8, list = FALSE)
train_data <- ames_data_keras[split_index, ]
test_data <- ames_data_keras[-split_index, ]
# --- Step 3: Remove near-zero variance predictors ---
zero_var_cols <- nearZeroVar(train_data, saveMetrics = TRUE)
zero_var_names <- rownames(zero_var_cols)[zero_var_cols$zeroVar == TRUE]
train_data <- train_data[, !names(train_data) %in% zero_var_names]
test_data <- test_data[, !names(test_data) %in% zero_var_names]
# --- Step 4: Remove constant columns ---
constant_cols <- names(train_data)[sapply(train_data, function(x) length(unique(x)) == 1)]
train_data <- train_data[, !names(train_data) %in% constant_cols]
test_data <- test_data[, !names(test_data) %in% constant_cols]
# --- Step 5: Align factor levels (precautionary, rarely needed after dummying) ---
factor_cols <- names(train_data)[sapply(train_data, is.factor)]
for (col in factor_cols) {
test_data[[col]] <- factor(test_data[[col]], levels = levels(train_data[[col]]))
}
# --- Step 6: Separate predictors and target ---
x_train <- train_data %>% dplyr::select(-SalePrice) %>% as.matrix()
y_train <- train_data$SalePrice
x_test <- test_data %>% dplyr::select(-SalePrice) %>% as.matrix()
y_test <- test_data$SalePrice
# --- Step 7: Rescale predictors (if not already min-max) ---
# SKIPPED here since `ames_data_minmax_full_final` is already scaled
# library(reticulate)
# Create a conda environment with Python 3.10
# conda_create("r-tensorflow", packages = "python=3.10")
# Activate it
# use_condaenv("r-tensorflow", required = TRUE)
# Install TensorFlow and dependencies using pip
# py_install(c("pip", "wheel", "setuptools", "tensorflow"), method = "pip")
# Load libraries
library(keras)
library(tensorflow)
##
## Attaching package: 'tensorflow'
## The following object is masked from 'package:caret':
##
## train
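The Keras architecture definitions themselves are not included in this excerpt. As a rough sketch only (layer sizes and training settings are assumptions, not the exact ANN_Simple or ANN_2_Layers models evaluated below), a two-hidden-layer regression network on the prepared matrices could look like this:
# --- Sketch: assumed two-hidden-layer Keras regression network ---
model_2layer <- keras_model_sequential() %>%
  layer_dense(units = 64, activation = "relu", input_shape = ncol(x_train)) %>%
  layer_dense(units = 32, activation = "relu") %>%
  layer_dense(units = 1)                      # single linear output for SalePrice
model_2layer %>% compile(
  optimizer = "adam",
  loss = "mse",
  metrics = list("mean_absolute_error")
)
history <- model_2layer %>% fit(
  x_train, y_train,
  epochs = 50, batch_size = 32,
  validation_split = 0.2, verbose = 0
)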
The Ames housing dataset simulates a real-world residential property valuation scenario, where accurate price predictions can directly impact real estate agencies, buyers, sellers, mortgage lenders, and local governments. In this context, model choice must balance predictive accuracy, interpretability, and operational usability.
High-Accuracy Models
SVR_Log produced the best predictive performance, with the lowest RMSE and MAE. This makes it highly suitable for automated appraisal tools or bulk property assessments where accuracy is paramount. However, its use of log transformation complicates interpretation. For client-facing applications (e.g., pricing reports for home sellers or buyers), additional steps would be needed to communicate predictions clearly.
XGBoost offers similarly strong accuracy with greater flexibility. Its ability to handle nonlinearities and interactions makes it a strong candidate for real estate investment platforms or pricing engines used by agencies. With proper use of tools like SHAP values, it can provide localized feature explanations, making it more business-friendly than SVR.
Balanced Performance and Interpretability
ANN_2_Layers performed well but lacks transparency. While it may be used in internal decision-support systems, it is less suited for settings where users need to understand the basis for price estimates. In regulated markets or client advisory settings, its black-box nature may pose challenges.
Random Forest offers a solid tradeoff between accuracy and interpretability. Real estate professionals can benefit from its feature importance scores to explain price drivers, such as the impact of square footage or neighborhood. It’s a dependable model for client consultations or agent training tools.
Simpler or Specialized Use Cases
Decision Tree, while less accurate, excels in interpretability. It can serve as a transparent valuation walkthrough tool in educational or onboarding contexts where stakeholders want to see step-by-step price reasoning.
ANN_Simple, though not top-performing, is lightweight and easy to deploy. It could be used in mobile valuation apps or first-pass price filters where speed and simplicity outweigh precision.
KNN models, with the lowest performance, mimic how a real estate agent might reason: “find similar houses nearby.” Their poor scalability and lower precision limit their use to exploratory data analysis or niche applications where simplicity is more valuable than accuracy.
Business Recommendations
This business-aligned evaluation helps ensure that model selection is not only statistically sound but also operationally appropriate for different real estate stakeholders.
K-Means Clustering Analysis with Elbow Comparison
As part of my unsupervised learning exploration, I applied K-Means clustering to the Ames Housing dataset using two input variations: (1) all scaled numeric features (ames_data_scaled_standard) and (2) a reduced set of SVR-selected features (ames_data_svr_selected).
What I Did:
1. Prepared the Data: Both datasets were scaled to ensure comparability, which is essential for distance-based algorithms like K-Means.
2. Identified Optimal Clusters Using the Elbow Method: I plotted the within-cluster sum of squares (WCSS) for k = 1 to 10. As shown in the elbow plots, both datasets yielded nearly identical curves with a clear elbow at k = 3, indicating this is the optimal number of clusters.
3. Trained the K-Means Model with k = 3: I fit the final model and added cluster labels for visualization and downstream analysis.
Why This Matters: The goal was to uncover natural groupings in the housing data without supervision. These clusters can inform pricing strategies, marketing segmentation, or be used as additional features in predictive models. The near-identical elbow curves suggest that SVR-based feature selection retains the clustering structure, offering a more efficient alternative to the full input with minimal loss.
# --- Load Required Libraries ---
library(ggplot2)
library(dplyr)
# --- Prepare Data (remove SalePrice, keep numeric only) ---
input_full <- ames_data_scaled_standard
input_selected <- ames_data_svr_selected
# --- Compute WSS for Elbow Comparison ---
set.seed(123)
wss_full <- sapply(1:10, function(k) {
kmeans(input_full, centers = k, nstart = 10)$tot.withinss
})
wss_selected <- sapply(1:10, function(k) {
kmeans(input_selected, centers = k, nstart = 10)$tot.withinss
})
# --- Combine into tidy format for plotting ---
elbow_df <- data.frame(
k = rep(1:10, 2),
WSS = c(wss_full, wss_selected),
Dataset = rep(c("Full Scaled Data", "SVR Selected Data"), each = 10)
)
# --- Load Required Libraries ---
library(ggplot2)
library(factoextra)
## Welcome! Want to learn more? See two factoextra-related books at https://goo.gl/ve3WBa
library(dplyr)
library(tidyr)
# --- Prepare Data (remove SalePrice, keep numeric only) ---
input_full <- ames_data_scaled_standard %>%
dplyr::select(where(is.numeric))
input_selected <- ames_data_svr_selected %>%
dplyr::select(where(is.numeric))
# --- Compute WSS for Elbow Comparison ---
set.seed(123)
wss_selected <- sapply(1:10, function(k) {
kmeans(input_selected, centers = k, nstart = 10)$tot.withinss
})
wss_full <- sapply(1:10, function(k) {
kmeans(input_full, centers = k, nstart = 10)$tot.withinss
})
# --- Combine into tidy format ---
elbow_df <- data.frame(
k = rep(1:10, 2),
WSS = c(wss_full, wss_selected),
Dataset = rep(c("Full Scaled Data", "SVR Selected Data"), each = 10)
)
# --- Filter individual datasets from elbow_df ---
elbow_full <- elbow_df %>% filter(Dataset == "Full Scaled Data")
elbow_selected <- elbow_df %>% filter(Dataset == "SVR Selected Data")
# --- Plot 1: Elbow Method – Full Scaled Data ---
ggplot(elbow_full, aes(x = k, y = WSS)) +
geom_line(color = "red") +
geom_point(color = "red", size = 2) +
labs(
title = "Elbow Method (Full Scaled Data, w/o SalePrice)",
x = "Number of Clusters (k)",
y = "Within-Cluster Sum of Squares"
) +
theme_minimal() +
theme(
plot.title = element_text(size = 14, face = "bold")
)
# --- Plot 2: Elbow Method – SVR Selected Data ---
ggplot(elbow_selected, aes(x = k, y = WSS)) +
geom_line(color = "blue") +
geom_point(color = "blue", size = 2) +
labs(
title = "Elbow Method (SVR Selected Data, w/o SalePrice)",
x = "Number of Clusters (k)",
y = "Within-Cluster Sum of Squares"
) +
theme_minimal() +
theme(
plot.title = element_text(size = 14, face = "bold")
)
I performed K-means clustering on both the full scaled dataset and a version with SVR-selected features, using k = 3 based on the Elbow Method.
# --- Final K-Means Clustering with chosen k ---
optimal_k <- 3 # based on the elbow plot inspection
set.seed(123)
kmeans_model_full <- kmeans(input_full, centers = optimal_k, nstart = 25)
kmeans_model_selected <- kmeans(input_selected, centers = optimal_k, nstart = 25)
# --- Cluster Visualization ---
library(factoextra)
fviz_cluster(
object = list(data = input_full, cluster = kmeans_model_full$cluster),
geom = "point",
main = "K-Means Cluster Plot (Full Scaled Data)",
palette = "jco", labelsize = 0
)
fviz_cluster(
object = list(data = input_selected, cluster = kmeans_model_selected$cluster),
geom = "point",
main = "K-Means Cluster Plot (SVR Selected Data)",
palette = "jco", labelsize = 0
)
# --- Step 1: Get top contributing variables ---
get_top_contributors <- function(kmeans_model, top_n = 10) {
centers_df <- as.data.frame(kmeans_model$centers)
std_devs <- apply(centers_df, 2, sd)
names(sort(std_devs, decreasing = TRUE))[1:top_n]
}
top_vars_full <- get_top_contributors(kmeans_model_full)
top_vars_selected <- get_top_contributors(kmeans_model_selected)
# --- Step 2: Add cluster labels ---
ames_data_clustered_full <- input_full %>%
mutate(cluster = factor(kmeans_model_full$cluster))
ames_data_clustered_selected <- input_selected %>%
mutate(cluster = factor(kmeans_model_selected$cluster))
# --- Step 3: Filter top vars that exist
top_vars_full <- top_vars_full[top_vars_full %in% colnames(ames_data_clustered_full)]
top_vars_selected <- top_vars_selected[top_vars_selected %in% colnames(ames_data_clustered_selected)]
# --- Step 4: Create profiling summaries (still scaled)
profile_summary_full <- ames_data_clustered_full %>%
group_by(cluster) %>%
summarise(across(all_of(top_vars_full), ~ round(mean(.x, na.rm = TRUE), 2))) %>%
ungroup()
profile_summary_selected <- ames_data_clustered_selected %>%
group_by(cluster) %>%
summarise(across(all_of(top_vars_selected), ~ round(mean(.x, na.rm = TRUE), 2))) %>%
ungroup()
# --- Step 5: Reverse Z-score scaling
reverse_z_score <- function(scaled, original) {
orig_mean <- mean(original, na.rm = TRUE)
orig_sd <- sd(original, na.rm = TRUE)
return((scaled * orig_sd) + orig_mean)
}
for (var in top_vars_full) {
if (var %in% names(profile_summary_full) && var %in% names(ames_data_svr)) {
profile_summary_full[[var]] <- reverse_z_score(profile_summary_full[[var]], ames_data_svr[[var]])
}
}
for (var in top_vars_selected) {
if (var %in% names(profile_summary_selected) && var %in% names(ames_data_svr)) {
profile_summary_selected[[var]] <- reverse_z_score(profile_summary_selected[[var]], ames_data_svr[[var]])
}
}
# --- Step 6: Display
print(profile_summary_full)
## # A tibble: 3 × 11
## cluster SalePrice capped_SalePrice OverallQual ExterQual capped_Gr.Liv.Area
## <fct> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 1 29913694432. 329873. 8.51 4.33 2206.
## 2 2 10419775718. 130227. 5.28 3.09 1246.
## 3 3 17478973673. 218505. 6.90 3.70 1728.
## # ℹ 5 more variables: KitchenQual <dbl>, GrLivArea <dbl>,
## # capped_Total.Bsmt.SF <dbl>, X1stFlrSF <dbl>, TotalBsmtSF <dbl>
print(profile_summary_selected)
## # A tibble: 3 × 6
## cluster SalePrice HouseAge RemodAge Fireplaces GarageAge
## <fct> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 1 29913694432. 7.96 4.33 1.20 6.82
## 2 2 17478973673. 18.3 11.6 0.839 24.8
## 3 3 10419775718. 51.3 33.3 0.373 222.
I applied hierarchical clustering using the top features identified from a Random Forest model. The resulting dendrogram reveals a clear hierarchical structure in the data, with a few large clusters and several smaller subgroups nested within them. The large vertical jump toward the top of the dendrogram suggests that the dataset can be meaningfully segmented into approximately 2 to 3 main clusters, based on the largest linkage distances. This structure supports earlier K-means findings and offers an alternative, interpretable view of cluster relationships.
# Compute distance matrix
dist_matrix <- dist(ames_data_hclust)
# Hierarchical clustering using complete linkage
hc <- hclust(dist_matrix, method = "complete")
# --- Step 4: Plot Dendrogram without x-axis labels ---
plot(hc,
main = "Dendrogram - Hierarchical Clustering (Top RF Features)",
xlab = "", sub = "", xaxt = "n", labels = FALSE)
# --- Programmatically determine cut heights ---
# Step 1: Get heights where merges occurred
merge_heights <- sort(hc$height, decreasing = TRUE)
# Step 2: To form k clusters, cut at the (k - 1)th highest height
height_k3 <- merge_heights[2] # for k = 3
height_k4 <- merge_heights[3] # for k = 4
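To turn these cut heights into actual cluster assignments, cutree() can be applied to the fitted tree; a minimal sketch:
# --- Sketch: assign observations to k = 3 and k = 4 hierarchical clusters ---
hc_clusters_k3 <- cutree(hc, k = 3)
hc_clusters_k4 <- cutree(hc, k = 4)
table(hc_clusters_k3)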
# --- Load Required Library ---
library(randomForest)
# --- Step 1: Extract variable importance from trained RF model ---
rf_importance <- importance(rf_model$finalModel)
# --- Step 2: Convert to data frame and sort ---
rf_importance_df <- data.frame(
Variable = rownames(rf_importance),
Importance = rf_importance[, "IncNodePurity"]
)
rf_importance_df <- rf_importance_df[order(-rf_importance_df$Importance), ]
# --- Step 3: Select top N important variables ---
top_n <- 20
top_rf_vars <- rf_importance_df$Variable[1:top_n]
# --- Step 4: Keep only variables that exist in ames_data_imputed ---
top_rf_vars_filtered <- intersect(top_rf_vars, colnames(ames_data_imputed))
# --- Step 5: Subset the data safely ---
ames_data_rf_top <- ames_data_imputed[, top_rf_vars_filtered]
# --- Step 6: Confirm structure ---
# str(ames_data_rf_top)
# --- Step 1: Filter numeric columns ---
numeric_ann <- dplyr::select(ames_data_rf_top, where(is.numeric))
# --- Step 2: Remove invalid rows ---
valid_rows <- complete.cases(numeric_ann) & apply(numeric_ann, 1, function(x) all(is.finite(x)))
numeric_ann <- numeric_ann[valid_rows, ]
# --- Step 3: Save column means and sds for inverse transformation ---
pca_means <- colMeans(numeric_ann)
pca_sds <- apply(numeric_ann, 2, sd)
# --- Step 4: Scale the data ---
numeric_scaled <- scale(numeric_ann, center = pca_means, scale = pca_sds)
# --- Step 5: Run PCA on scaled data ---
pca_result <- prcomp(numeric_scaled, center = FALSE, scale. = FALSE)
# --- Step 6: Scree plot ---
plot(pca_result$sdev^2, type = "b", pch = 1,
main = "PCA Scree Plot - Top 20 RF important variables",
xlab = "Principal Components",
ylab = "Variances")
# --- Step 7: Add PCA scores to scaled data ---
pca_scores <- as.data.frame(pca_result$x[, 1:5])
colnames(pca_scores) <- paste0("PC", 1:5)
# --- Step 8: Merge PCA scores with scaled data ---
ames_data_rf_pca <- cbind(as.data.frame(numeric_scaled), pca_scores)
# --- Save pca_means and pca_sds for later inverse transformation ---
# Save to RDS if needed:
saveRDS(list(means = pca_means, sds = pca_sds), "pca_scaling_params.rds")
Interpretation
The PCA scree plot shows a steep drop in variance after the first two components, with the first principal component alone explaining a large portion of the total variance. This suggests that the top 20 features identified by the Random Forest model contain substantial redundancy, and dimensionality can likely be reduced to 2–3 components without significant information loss—supporting the use of PCA for visualization and clustering.
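The claim about two to three components can be checked directly from the PCA object; a minimal sketch:
# --- Sketch: proportion and cumulative proportion of variance explained ---
var_explained <- pca_result$sdev^2 / sum(pca_result$sdev^2)
round(cumsum(var_explained)[1:5], 3)
# summary(pca_result) reports the same proportions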
# Load required library
library(ggplot2)
# Apply K-means with k = 3 (based on elbow method)
set.seed(123)
kmeans_model <- kmeans(pca_scores[, 1:2], centers = 3, nstart = 25)
# Add cluster labels to PCA scores
pca_scores$cluster <- as.factor(kmeans_model$cluster)
# Calculate percent variance explained
pca_var <- pca_result$sdev^2
pca_var_exp <- round(100 * pca_var / sum(pca_var), 1)
# Plot using PC1 and PC2 with variance explained
ggplot(pca_scores, aes(x = PC1, y = PC2, color = cluster)) +
geom_point(alpha = 0.7, size = 2) +
labs(title = "K-means Clustering on PCA (2D Projection) - Top 20 RF important variables",
x = paste0("Principal Component 1 (", pca_var_exp[1], "%)"),
y = paste0("Principal Component 2 (", pca_var_exp[2], "%)")) +
theme_minimal() +
scale_color_brewer(palette = "Set1")
This 2D scatter plot illustrates the K-means clustering results using the first two principal components derived from the top 20 features ranked by Random Forest importance. The clusters show clear separation with minimal overlap, indicating that the principal components effectively capture underlying patterns in the data. However, because PC1 and PC2 are linear combinations of multiple variables, this view lacks direct interpretability—we cannot immediately tell which features define each cluster.
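To recover some of that interpretability, the PCA loadings show which original features drive PC1 and PC2; a minimal sketch:
# --- Sketch: top contributors to PC1 (with their PC2 loadings) ---
loadings <- pca_result$rotation[, 1:2]
round(head(loadings[order(abs(loadings[, "PC1"]), decreasing = TRUE), ], 10), 3)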
# --- Load Required Libraries ---
library(dplyr)
# --- Step 1: Remove target columns if present ---
remove_cols <- c("SalePrice", "capped_SalePrice")
existing_cols <- intersect(remove_cols, colnames(ames_data_scaled_standard))
ames_data_pca_input <- ames_data_scaled_standard %>%
dplyr::select(-all_of(existing_cols))
# --- Step 2: Filter numeric columns only ---
numeric_full <- dplyr::select(ames_data_pca_input, where(is.numeric))
# --- Step 3: Remove rows with missing or infinite values ---
valid_rows <- complete.cases(numeric_full) & apply(numeric_full, 1, function(x) all(is.finite(x)))
numeric_full <- numeric_full[valid_rows, ]
# --- Step 4: Save column means and standard deviations for inverse transform ---
pca_means_full <- colMeans(numeric_full)
pca_sds_full <- apply(numeric_full, 2, sd)
# --- Step 5: Standardize the data ---
numeric_scaled_full <- scale(numeric_full, center = pca_means_full, scale = pca_sds_full)
# --- Step 6: Perform PCA ---
pca_result_full <- prcomp(numeric_scaled_full, center = FALSE, scale. = FALSE)
# --- Step 7: Scree Plot ---
plot(pca_result_full$sdev^2, type = "b", pch = 1, col = "orange",
main = "PCA Scree Plot - Full Standardized Dataset",
xlab = "Principal Component",
ylab = "Variance")
# --- Step 8: Add PCA scores to dataset ---
pca_scores_full <- as.data.frame(pca_result_full$x[, 1:5])
colnames(pca_scores_full) <- paste0("PC", 1:5)
# --- Step 9: Merge scores back into scaled data ---
ames_data_full_pca <- cbind(as.data.frame(numeric_scaled_full), pca_scores_full)
# --- Step 10: Save scaling parameters ---
saveRDS(list(means = pca_means_full, sds = pca_sds_full), "pca_scaling_params_full.rds")
# --- Load Required Libraries ---
library(ggplot2)
# --- Step 1: Apply K-means on First Two PCs ---
set.seed(123)
kmeans_model_full <- kmeans(ames_data_full_pca[, c("PC1", "PC2")], centers = 3, nstart = 25)
# --- Step 2: Add cluster labels ---
ames_data_full_pca$cluster <- as.factor(kmeans_model_full$cluster)
# --- Step 3: Compute Variance Explained by PCs ---
pca_var_full <- pca_result_full$sdev^2
pca_var_exp_full <- round(100 * pca_var_full / sum(pca_var_full), 1)
# --- Step 4: Plot PCA Clusters with Variance Explained ---
ggplot(ames_data_full_pca, aes(x = PC1, y = PC2, color = cluster)) +
geom_point(alpha = 0.7, size = 2) +
labs(title = "K-means Clustering on PCA (2D Projection) - Full Standardized Dataset",
x = paste0("Principal Component 1 (", pca_var_exp_full[1], "%)"),
y = paste0("Principal Component 2 (", pca_var_exp_full[2], "%)")) +
theme_minimal() +
scale_color_brewer(palette = "Set1")
PCA on the top 20 RF important variables shows that the first 2 components explain over 50% of the variance, enabling clear cluster separation in K-means. In contrast, PCA on the full standardized dataset has a more gradual variance decline, with PC1 and PC2 explaining only about 10.6% combined, resulting in less distinct cluster boundaries. This highlights the benefit of variable selection prior to dimensionality reduction and clustering.
# --- Load required libraries ---
library(ggplot2)
library(patchwork)
##
## Attaching package: 'patchwork'
## The following object is masked from 'package:MASS':
##
## area
library(dplyr)
# --- Ensure cluster columns exist ---
# For top RF features
if (!"cluster" %in% colnames(pca_scores)) {
set.seed(123)
kmeans_model <- kmeans(pca_scores[, 1:2], centers = 3, nstart = 25)
pca_scores$cluster <- as.factor(kmeans_model$cluster)
}
# For full standardized dataset
if (!"cluster" %in% colnames(ames_data_full_pca)) {
set.seed(123)
kmeans_model_full <- kmeans(ames_data_full_pca[, c("PC1", "PC2")], centers = 3, nstart = 25)
ames_data_full_pca$cluster <- as.factor(kmeans_model_full$cluster)
}
# --- Scree Plot 1: Top RF Variables ---
pca_var_rf <- pca_result$sdev^2
p1 <- qplot(1:length(pca_var_rf), pca_var_rf, geom = c("point", "line")) +
labs(title = "Scree Plot - Top RF Features",
x = "Principal Components", y = "Variance") +
theme_minimal()
## Warning: `qplot()` was deprecated in ggplot2 3.4.0.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.
# --- Cluster Plot 1: Top RF Variables ---
pca_var_exp_rf <- round(100 * pca_var_rf / sum(pca_var_rf), 1)
p2 <- ggplot(pca_scores, aes(x = PC1, y = PC2, color = cluster)) +
geom_point(alpha = 0.7, size = 1.5) +
labs(title = "K-means on Top RF Features",
x = paste0("PC1 (", pca_var_exp_rf[1], "%)"),
y = paste0("PC2 (", pca_var_exp_rf[2], "%)")) +
theme_minimal() +
scale_color_brewer(palette = "Set1")
# --- Scree Plot 2: Full Standardized Dataset ---
pca_var_full <- pca_result_full$sdev^2
p3 <- qplot(1:length(pca_var_full), pca_var_full, geom = c("point", "line")) +
labs(title = "Scree Plot - Full Dataset",
x = "Principal Components", y = "Variance") +
theme_minimal()
# --- Cluster Plot 2: Full Dataset ---
pca_var_exp_full <- round(100 * pca_var_full / sum(pca_var_full), 1)
p4 <- ggplot(ames_data_full_pca, aes(x = PC1, y = PC2, color = cluster)) +
geom_point(alpha = 0.7, size = 1.5) +
labs(title = "K-means on Full Dataset",
x = paste0("PC1 (", pca_var_exp_full[1], "%)"),
y = paste0("PC2 (", pca_var_exp_full[2], "%)")) +
theme_minimal() +
scale_color_brewer(palette = "Set1")
# --- Combine all plots in a 2x2 layout ---
(p1 | p2) / (p3 | p4)
To interpret the clusters meaningfully, we must map them back to the original top contributing variables and compute summary statistics (e.g., mean or median) for each cluster across those variables.
# --- Step 1: Load saved scaling parameters ---
scaling_params <- readRDS("pca_scaling_params.rds")
pca_means <- scaling_params$means
pca_sds <- scaling_params$sds
# --- Step 2: Extract scaled data used for PCA (with matching variables) ---
matched_vars <- intersect(names(pca_means), colnames(ames_data_rf_pca))
pca_means <- pca_means[matched_vars]
pca_sds <- pca_sds[matched_vars]
numeric_scaled <- ames_data_rf_pca[, matched_vars]
# --- Step 3: Reverse scale (unscale and uncenter) ---
numeric_original <- sweep(numeric_scaled, 2, pca_sds, FUN = "*")
numeric_original <- sweep(numeric_original, 2, pca_means, FUN = "+")
# --- Step 4: Add cluster labels only for rows used in PCA ---
# Use the same valid rows used during PCA creation
valid_rows <- complete.cases(ames_data_rf_top) & apply(ames_data_rf_top, 1, function(x) all(is.finite(x)))
# Match cluster labels to valid rows
cluster_labels <- rep(NA, nrow(ames_data_rf_top))
cluster_labels[valid_rows] <- kmeans_model$cluster
# Final unscaled data with cluster column
numeric_original_df <- as.data.frame(numeric_original)
numeric_original_df$cluster <- as.factor(cluster_labels[valid_rows])
# --- Step 5: Remove rows with missing cluster assignment (if any) ---
numeric_original_df <- numeric_original_df[!is.na(numeric_original_df$cluster), ]
# --- Step 6: Compute summary stats per cluster ---
library(dplyr)
cluster_summary <- numeric_original_df %>%
group_by(cluster) %>%
summarise(across(where(is.numeric), ~ round(mean(.x, na.rm = TRUE), 2)))
# --- Step 7: View result ---
print(cluster_summary)
## # A tibble: 3 × 17
## cluster Overall.Qual Garage.Cars Gr.Liv.Area Total.Bsmt.SF X1st.Flr.SF
## <fct> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 1 6.74 2.12 1980. 937. 1067.
## 2 2 5.1 1.24 1144. 830. 956.
## 3 3 7.11 2.29 1622. 1528. 1586.
## # ℹ 11 more variables: Garage.Area <dbl>, Year.Built <dbl>,
## # Garage.Yr.Blt <dbl>, Mas.Vnr.Area <dbl>, BsmtFin.SF.1 <dbl>,
## # Full.Bath <dbl>, Year.Remod.Add <dbl>, Lot.Area <dbl>, Fireplaces <dbl>,
## # TotRms.AbvGrd <dbl>, X2nd.Flr.SF <dbl>
The PCA 2D scatter plot and the cluster-level attribute summary complement each other in revealing meaningful segmentation:
Cluster 3 (red) lies prominently in the top-right quadrant of the PCA plot and spans a wide range—this matches the attribute summary showing high averages across all key features. These represent large, high-end homes with superior quality, more garage space, multiple baths, and higher total area.
Cluster 2 (blue) is densely packed toward the center of the PCA projection. Its profile reflects mid-sized, moderately valued homes, with average build years, fewer features than Cluster 3 but more than Cluster 1. These appear to be typical or mid-market properties.
Cluster 1 (green) is located on the lower-left and is more compact. According to the attribute table, this group has the lowest values across most features, including smaller living areas, older construction, fewer baths, and lower overall quality—likely representing budget or entry-level homes.
Conclusion: The PCA projection visually validates the differences across clusters identified through feature averages. The spatial separation observed is not arbitrary—it reflects genuine, interpretable differences in home characteristics. Cluster positions on the PCA plot correspond to a meaningful hierarchy of housing quality, size, and amenities.
Purpose: Visualize how clusters differ across top RF features using parallel coordinates.
library(dplyr)
library(tidyr)
library(ggplot2)
# --- Ensure cluster column is added to ames_data_rf_pca ---
ames_data_rf_pca$cluster <- as.factor(kmeans_model$cluster)
# --- Step 1: Prepare Scaled Data ---
top_features_scaled <- intersect(top_rf_vars, colnames(ames_data_rf_pca))
scaled_summary <- ames_data_rf_pca %>%
group_by(cluster) %>%
summarise(across(all_of(top_features_scaled), ~ mean(.x, na.rm = TRUE))) %>%
pivot_longer(cols = -cluster, names_to = "feature", values_to = "value") %>%
mutate(source = "Scaled")
# --- Step 2: Prepare Original Data ---
top_features_orig <- intersect(top_rf_vars, colnames(numeric_original_df))
original_summary <- numeric_original_df %>%
group_by(cluster) %>%
summarise(across(all_of(top_features_orig), ~ mean(.x, na.rm = TRUE))) %>%
pivot_longer(cols = -cluster, names_to = "feature", values_to = "value") %>%
mutate(source = "Original Units")
# --- Step 3: Combine and Plot ---
combined_plot_data <- bind_rows(scaled_summary, original_summary)
ggplot(combined_plot_data, aes(x = feature, y = value, group = cluster, color = as.factor(cluster))) +
geom_line(linewidth = 1) +
geom_point() +
facet_wrap(~ source, ncol = 1, scales = "free_y") +
scale_color_brewer(palette = "Set1", name = "Cluster") +
labs(title = "Parallel Coordinates Plot (Cluster Means)",
x = "Top RF Features", y = "Value") +
theme_minimal() +
theme(axis.text.x = element_text(angle = 45, hjust = 1))
This faceted parallel coordinates plot compares cluster-level means of the top 20 Random Forest features in both original units (top panel) and scaled values (bottom panel):
One cluster shows consistently high Gr.Liv.Area, Garage.Area, and Total.Bsmt.SF, indicating higher-end properties.
Features such as X2nd.Flr.SF, Mas.Vnr.Area, and TotRms.AbvGrd show clear divergence in direction and magnitude across clusters.
The agreement between scaled and unscaled views strengthens interpretability while retaining contextual clarity.
Purpose: Display distribution of top features across clusters.
library(ggplot2)
# Plot Overall Quality in original units by cluster
ggplot(numeric_original_df, aes(x = cluster, y = Overall.Qual, fill = cluster)) +
geom_boxplot() +
theme_minimal() +
labs(
title = "Distribution of Overall Quality by Cluster (Original Units)",
x = "Cluster",
y = "Overall Quality"
) +
scale_fill_brewer(palette = "Set1")
This boxplot displays the distribution of Overall Quality across the three K-means clusters using the original scale (1–10) of the Ames dataset.
This visualization reinforces earlier findings: the clustering aligns meaningfully with housing quality, capturing a spectrum from budget to luxury homes.
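The same pattern can be confirmed numerically from the unscaled profiling frame; a quick sketch:
# --- Sketch: median Overall Quality per cluster (original 1-10 scale) ---
aggregate(Overall.Qual ~ cluster, data = numeric_original_df, FUN = median)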
Purpose: Show how original variables contribute to PC1 and PC2 — helpful for understanding cluster separation.
library(FactoMineR)
library(factoextra)
library(ggrepel)
# Perform PCA
pca_model <- prcomp(ames_data_rf_top, center = FALSE, scale. = FALSE)
# Create biplot with jittered labels
fviz_pca_biplot(
pca_model,
habillage = ames_data_rf_pca$cluster,
addEllipses = TRUE,
repel = TRUE, # Enable smart label repelling
label = "var", # Show variable labels
title = "PCA Biplot with Cluster Separation (Jittered Labels)"
)
Interpretation of PCA Biplot with Cluster Separation (Jittered Labels)
Dominance of Dim1 (96.4% Variance)
Key Variables Contributing to Dim1
Multicollinearity: many arrows point in the same general direction, suggesting high positive correlation among these variables. This agrees with the earlier findings from the scree plot, where PC1 alone explains the majority of variance—a hallmark of multicollinearity.
Vertical Axis (Dim2)
Cluster Separation: driven largely by variables such as Lot.Area and Gr.Liv.Area.
Conclusion: This biplot confirms and supports previous interpretations, in particular the influence of Lot.Area, Overall.Qual, and Garage.Area.
Purpose: Visualize distances or feature averages between clusters and variables.
# --- Load required libraries ---
library(pheatmap)
library(dplyr)
library(tibble)
# --- Step 1: Compute cluster-wise means using original (unscaled) data ---
top_features <- intersect(top_rf_vars, colnames(numeric_original_df))
cluster_summary_original <- numeric_original_df %>%
group_by(cluster) %>%
summarise(across(all_of(top_features), ~ mean(.x, na.rm = TRUE))) %>%
column_to_rownames("cluster")
# --- Step 2: Min-Max scale each column for heatmap ---
min_max_scaled <- as.data.frame(lapply(cluster_summary_original, function(x) {
(x - min(x)) / (max(x) - min(x))
}))
rownames(min_max_scaled) <- rownames(cluster_summary_original)
# --- Step 3: Draw heatmap using Min-Max scaled values ---
pheatmap(min_max_scaled,
main = "Heatmap of Cluster Feature Averages (Min-Max Scaled)",
cluster_rows = FALSE,
cluster_cols = TRUE,
angle_col = 90)
Interpretation: Min-Max Scaled Heatmap of Cluster Feature Averages
This heatmap compares cluster-wise means across the top Random Forest features, scaled from 0 to 1 for comparability.
Cluster 3 (Row 3) stands out on Garage.Area, Gr.Liv.Area, Overall.Qual, Lot.Area, TotRms.AbvGrd, and X2nd.Flr.SF.
Cluster 1 (Row 1) is differentiated mainly by Year.Built, Full.Bath, and Fireplaces.
Cluster 2 (Row 2) is most clearly separated from the others on Total.Bsmt.SF, Garage.Cars, and Mas.Vnr.Area.
Conclusion: Cluster 3 stands out as the premium segment, Cluster 2 as the entry-level group, and Cluster 1 as the middle ground. The feature distribution reinforces PCA and clustering results.
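One optional refinement, sketched here under the assumption that cluster_summary_original and min_max_scaled from the steps above are still available: keep the min-max colors for comparability but print the original-unit means in each cell through pheatmap's display_numbers argument.
# Sketch: colors use min-max scaled values, cell labels show original units
label_mat <- matrix(sprintf("%.1f", as.matrix(cluster_summary_original)),
                    nrow = nrow(cluster_summary_original),
                    dimnames = dimnames(as.matrix(cluster_summary_original)))
pheatmap(min_max_scaled,
         display_numbers = label_mat,
         main = "Cluster Feature Averages (colors min-max scaled, labels in original units)",
         cluster_rows = FALSE,
         cluster_cols = TRUE,
         angle_col = 90)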
# --- Load libraries ---
library(cluster) # for daisy()
library(dbscan) # for DBSCAN
##
## Attaching package: 'dbscan'
## The following object is masked from 'package:VIM':
##
## kNN
## The following object is masked from 'package:stats':
##
## as.dendrogram
library(FNN) # for kNN distance plot
library(dplyr) # for data manipulation
library(ggplot2) # for optional plotting
# --- Step 1: Convert binary numeric columns to factors ---
ames_data_imputed[] <- lapply(ames_data_imputed, function(x) {
if (is.numeric(x) && length(unique(x)) == 2) as.factor(x) else x
})
# --- Step 2: Compute Gower distance ---
gower_dist <- daisy(ames_data_imputed, metric = "gower")
gower_matrix <- as.matrix(gower_dist)
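# Sketch (not part of the original pipeline): a kNN distance plot is the usual
# way to pick eps; look for the "elbow" in the sorted k-nearest-neighbor
# distances. The reference line value below is hypothetical, not read from this
# dataset.
kNNdistplot(gower_dist, k = 30)
abline(h = 0.15, lty = 2)  # hypothetical elbow; adjust after inspecting the plot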
# --- Step 3: Run DBSCAN with adjusted eps ---
# Note: gower_matrix is passed as a plain data matrix (rows of distances), not as
# a dist object, which is why eps = 1.5 exceeds the Gower range of [0, 1]; pass
# gower_dist instead to cluster on the precomputed distances directly.
dbscan_result <- dbscan(gower_matrix, eps = 1.5, minPts = 30)
# --- Step 4 (optional): Attach cluster labels back to dataset ---
ames_data_imputed$cluster <- as.factor(dbscan_result$cluster)
This approach clusters the homes with DBSCAN on a Gower distance matrix, and the profiling that follows focuses on the top predictive features identified by Random Forest, improving interpretability and reducing noise. The sections below add a PCA projection, profiling summaries, and a heatmap of these clusters.
table(dbscan_result$cluster)
##
## 0 1 2
## 956 1064 910
This tells you how many points are in each cluster (including noise, which is labeled as 0).
ames_data_imputed$cluster <- as.factor(dbscan_result$cluster)
library(ggplot2)
# Run PCA just for visualization
pca_vis <- prcomp(ames_data_imputed[ , sapply(ames_data_imputed, is.numeric)], scale. = TRUE)
pca_df <- as.data.frame(pca_vis$x[, 1:2])
pca_df$cluster <- ames_data_imputed$cluster
# Calculate variance explained
pca_var <- pca_vis$sdev^2
pca_var_exp <- round(100 * pca_var / sum(pca_var), 1)
# Plot clusters with % variance in axis labels
ggplot(pca_df, aes(x = PC1, y = PC2, color = cluster)) +
geom_point(alpha = 0.5, size = 2) +
labs(
title = "DBSCAN Clusters with Gower Distance (PCA Projection)",
x = paste0("PC1 (", pca_var_exp[1], "%)"),
y = paste0("PC2 (", pca_var_exp[2], "%)")
) +
theme_minimal()
The DBSCAN clustering using Gower distance (projected onto PCA space) reveals three main groupings:
Cluster 0 (Red): Contains a large number of dispersed points across the PCA space. This cluster likely includes outliers or noise, as is typical in DBSCAN when points don’t belong to any dense region.
Cluster 1 (Green): Represents a concentrated group toward the right of the plot, indicating a coherent, high-density segment of homes that share similar characteristics.
Cluster 2 (Blue): A slightly more diffuse cluster overlapping with Cluster 1 but still denser than the red points. It captures a second distinct segment with partial overlap.
Summary: DBSCAN successfully identified two dense housing clusters and flagged the rest (Cluster 0) as noise or outliers. This method is useful for isolating well-defined home groups while avoiding forced assignments of outliers.
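For a rough comparison with the earlier K-means segmentation (a sketch that assumes the rows of numeric_original_df line up with the observations clustered by DBSCAN), the two label sets can be cross-tabulated:
# Sketch: cross-tabulate K-means labels against DBSCAN labels (row-order
# alignment between the two objects is assumed)
table(KMeans = numeric_original_df$cluster, DBSCAN = dbscan_result$cluster)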
sum(dbscan_result$cluster == 0)
## [1] 956
These are the outliers or points not assigned to any cluster.
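Expressed as a share of all observations (a one-line sketch using the same dbscan_result), roughly a third of the homes are flagged as noise:
# Percentage of points labeled as noise (cluster 0) by DBSCAN
round(100 * mean(dbscan_result$cluster == 0), 1)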
# --- Load Required Libraries ---
library(dplyr)
# --- Step 1: Assign DBSCAN cluster labels to ames_data_imputed ---
ames_data_imputed$cluster <- as.factor(dbscan_result$cluster)
# --- Step 2: (Optional) Copy to a new object for analysis/presentation ---
dbscan_numeric_original_df <- ames_data_imputed # already contains 'cluster'
# --- Step 3: Identify top Random Forest features available in data ---
top_features <- intersect(top_rf_vars, colnames(dbscan_numeric_original_df))
# --- Step 4: Compute cluster-wise summary statistics (including noise cluster 0) ---
cluster_summary <- dbscan_numeric_original_df %>%
group_by(cluster) %>%
summarise(across(all_of(top_features), ~ round(mean(.x, na.rm = TRUE), 2)))
# --- Step 5: View summary ---
print(cluster_summary)
## # A tibble: 3 × 17
## cluster Overall.Qual Garage.Cars Gr.Liv.Area Total.Bsmt.SF X1st.Flr.SF
## <fct> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 0 5.92 1.64 1627. 1021. 1235.
## 2 1 5.31 1.45 1237. 946. 1032.
## 3 2 7.2 2.27 1673. 1205. 1230.
## # ℹ 11 more variables: Garage.Area <dbl>, Year.Built <dbl>,
## # Garage.Yr.Blt <dbl>, Mas.Vnr.Area <dbl>, BsmtFin.SF.1 <dbl>,
## # Full.Bath <dbl>, Year.Remod.Add <dbl>, Lot.Area <dbl>, Fireplaces <dbl>,
## # TotRms.AbvGrd <dbl>, X2nd.Flr.SF <dbl>
Cluster 0 (noise points): mid-range homes on average (mean Overall.Qual ≈ 5.9, Gr.Liv.Area ≈ 1,627 sq ft), sitting between the two dense clusters.
Cluster 1: entry-level homes (mean Overall.Qual ≈ 5.3, Gr.Liv.Area ≈ 1,237 sq ft) with the smallest basements and garages.
Cluster 2: premium homes (mean Overall.Qual ≈ 7.2, Gr.Liv.Area ≈ 1,673 sq ft) with the largest garages and basements.
These clusters effectively segment the Ames housing market into older entry-level homes, average mid-range homes, and newer premium properties.
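If the sale price column was retained in ames_data_imputed (an assumption; this is only a sketch), the segment reading can be validated directly against average prices per cluster:
library(dplyr)
# Sketch: mean SalePrice per DBSCAN cluster; guarded so it is a no-op if the
# column is absent from ames_data_imputed
if ("SalePrice" %in% colnames(ames_data_imputed)) {
  ames_data_imputed %>%
    group_by(cluster) %>%
    summarise(mean_price = round(mean(SalePrice, na.rm = TRUE)), n = n())
}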
library(dplyr)
library(tidyr)
library(ggplot2)
# --- Step 1: Attach DBSCAN cluster labels to scaled PCA data ---
# ONLY if row order in ames_data_rf_pca matches dbscan_numeric_original_df
ames_data_rf_pca$cluster <- dbscan_numeric_original_df$cluster
# --- Step 2: Prepare Scaled Data ---
top_features_scaled <- intersect(top_rf_vars, colnames(ames_data_rf_pca))
scaled_summary <- ames_data_rf_pca %>%
group_by(cluster) %>%
summarise(across(all_of(top_features_scaled), ~ mean(.x, na.rm = TRUE))) %>%
pivot_longer(cols = -cluster, names_to = "feature", values_to = "value") %>%
mutate(source = "Scaled")
# --- Step 3: Prepare Original Data ---
top_features_orig <- intersect(top_rf_vars, colnames(dbscan_numeric_original_df))
original_summary <- dbscan_numeric_original_df %>%
group_by(cluster) %>%
summarise(across(all_of(top_features_orig), ~ mean(.x, na.rm = TRUE))) %>%
pivot_longer(cols = -cluster, names_to = "feature", values_to = "value") %>%
mutate(source = "Original Units")
# --- Step 4: Ensure cluster type consistency ---
scaled_summary$cluster <- as.factor(scaled_summary$cluster)
original_summary$cluster <- as.factor(original_summary$cluster)
# --- Step 5: Combine and Plot ---
combined_plot_data <- bind_rows(scaled_summary, original_summary)
ggplot(combined_plot_data, aes(x = feature, y = value, group = cluster, color = cluster)) +
geom_line(linewidth = 1) +
geom_point() +
facet_wrap(~ source, ncol = 1, scales = "free_y") +
scale_color_brewer(palette = "Set1", name = "Cluster") +
labs(title = "Parallel Coordinates Plot (Cluster Means by DBSCAN)",
x = "Top RF Features", y = "Value") +
theme_minimal() +
theme(axis.text.x = element_text(angle = 45, hjust = 1))
This parallel coordinates plot shows mean values of the top RF features across the DBSCAN clusters, using both original and scaled units.
Overall, the clustering cleanly distinguishes between high-end, mid-tier, and lower-value housing segments.
library(ggplot2)
ggplot(dbscan_numeric_original_df, aes(x = cluster, y = Overall.Qual, fill = cluster)) +
geom_boxplot() +
theme_minimal() +
labs(
title = "Distribution of Overall Quality by Cluster (Original Units)",
x = "Cluster",
y = "Overall Quality"
) +
scale_fill_brewer(palette = "Set1")
This boxplot shows that Overall Quality is highest in Cluster 2, lowest in Cluster 1, and intermediate in Cluster 0 (the noise points). In short, Cluster 2 captures premium homes, Cluster 1 lower-quality ones, and Cluster 0 is the middle tier.
library(FactoMineR)
library(factoextra)
library(ggrepel)
# --- Step 1: Perform PCA (no need to re-scale if already scaled before) ---
pca_model <- prcomp(ames_data_rf_top, center = FALSE, scale. = FALSE)
# --- Step 2: Assign DBSCAN cluster labels to the PCA projection data ---
ames_data_rf_pca$cluster <- dbscan_result$cluster # Ensure alignment
# --- Step 3: Create PCA biplot with DBSCAN clusters ---
fviz_pca_biplot(
pca_model,
habillage = as.factor(ames_data_rf_pca$cluster), # Convert to factor for coloring
addEllipses = TRUE,
repel = TRUE,
label = "var",
title = "PCA Biplot with DBSCAN Cluster Separation (Jittered Labels)"
)
This PCA biplot shows how the DBSCAN clusters separate along the principal components based on the top housing features. In short, Lot.Area and overall home size are the dominant drivers of the clustering, especially for distinguishing the high-end properties in Cluster 2.
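To back up the claim about dominant drivers (a short sketch reusing the same pca_model), factoextra's contribution plot ranks variables by their contribution to Dim1:
library(factoextra)
# Top 10 variable contributions to the first principal component
fviz_contrib(pca_model, choice = "var", axes = 1, top = 10)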
library(pheatmap)
library(dplyr)
library(tibble)
# --- Step 1: Cluster-wise means on original data ---
top_features <- intersect(top_rf_vars, colnames(dbscan_numeric_original_df))
cluster_summary_original <- dbscan_numeric_original_df %>%
group_by(cluster) %>%
summarise(across(all_of(top_features), ~ mean(.x, na.rm = TRUE))) %>%
column_to_rownames("cluster")
# --- Step 2: Min-Max Scale for Visualization ---
min_max_scaled <- as.data.frame(lapply(cluster_summary_original, function(x) {
(x - min(x)) / (max(x) - min(x))
}))
rownames(min_max_scaled) <- rownames(cluster_summary_original)
# --- Step 3: Heatmap ---
pheatmap(min_max_scaled,
main = "Heatmap of Cluster Feature Averages (Min-Max Scaled)",
cluster_rows = FALSE,
cluster_cols = TRUE,
angle_col = 90)
This heatmap shows min-max scaled feature averages across the DBSCAN clusters. Overall, the clusters capture a clear stratification of housing value and quality in the Ames dataset.