# ==============================================================================
# BACKEND DATA PROCESSING
# Logic imported from DataCleaning.R
# ==============================================================================
# 0. Load Required Packages
library(dplyr)  # pipeline verbs: filter(), mutate(), select()
library(tidyr)  # replace_na()

# 1. Read Raw Data
# Please ensure "HousePrices.csv" is in the same directory
if (file.exists("HousePrices.csv")) {
  raw_data <- read.csv("HousePrices.csv", stringsAsFactors = FALSE)
} else {
  stop("Error: 'HousePrices.csv' not found. Please upload the raw dataset.")
}
# 2. Ordinal Mapping Definition
quality_map <- c("None" = 0, "Po" = 1, "Fa" = 2, "TA" = 3, "Gd" = 4, "Ex" = 5)
# 3. Cleaning Pipeline
df_clean <- raw_data %>%
  # --- Outlier handling ---
  filter(!(GrLivArea > 4000 & SalePrice < 300000)) %>%
  # --- Fill categorical gaps (NAs that mean "None") ---
  mutate(
    PoolQC       = replace_na(PoolQC, "None"),
    MiscFeature  = replace_na(MiscFeature, "None"),
    Alley        = replace_na(Alley, "None"),
    Fence        = replace_na(Fence, "None"),
    FireplaceQu  = replace_na(FireplaceQu, "None"),
    GarageType   = replace_na(GarageType, "None"),
    GarageFinish = replace_na(GarageFinish, "None"),
    GarageQual   = replace_na(GarageQual, "None"),
    GarageCond   = replace_na(GarageCond, "None"),
    BsmtQual     = replace_na(BsmtQual, "None"),
    BsmtCond     = replace_na(BsmtCond, "None"),
    BsmtExposure = replace_na(BsmtExposure, "None"),
    BsmtFinType1 = replace_na(BsmtFinType1, "None"),
    BsmtFinType2 = replace_na(BsmtFinType2, "None"),
    MasVnrType   = replace_na(MasVnrType, "None")
  ) %>%
  # --- Fill missing numeric values (median for LotFrontage) ---
  mutate(
    LotFrontage = ifelse(is.na(LotFrontage), median(LotFrontage, na.rm = TRUE), LotFrontage),
    MasVnrArea  = replace_na(MasVnrArea, 0),
    GarageYrBlt = replace_na(GarageYrBlt, 0),
    Electrical  = replace_na(Electrical, "SBrkr")
  ) %>%
  # --- Ordinal Encoding ---
  mutate(
    ExterQual   = quality_map[ExterQual],
    ExterCond   = quality_map[ExterCond],
    BsmtQual    = quality_map[BsmtQual],
    BsmtCond    = quality_map[BsmtCond],
    HeatingQC   = quality_map[HeatingQC],
    KitchenQual = quality_map[KitchenQual],
    FireplaceQu = quality_map[FireplaceQu],
    GarageQual  = quality_map[GarageQual],
    GarageCond  = quality_map[GarageCond],
    PoolQC      = quality_map[PoolQC]
  ) %>%
  # --- Construct new features ---
  mutate(
    HouseAge = pmax(0, YrSold - YearBuilt),
    TotalSF  = GrLivArea + TotalBsmtSF
  ) %>%
  # --- Type conversion ---
  mutate(
    MSSubClass = as.factor(MSSubClass),
    MoSold     = as.factor(MoSold),
    YrSold     = as.factor(YrSold)
  ) %>%
  mutate_if(is.character, as.factor) %>%
  # --- Drop unneeded columns (identifier and near-constant) ---
  select(-any_of(c("Id", "Utilities")))
# df_clean is now ready for analysis in subsequent chunks
Our project leverages the Ames Housing Dataset (1,460 observations, 81 variables) to address two core business questions:

1. Regression: Can we predict SalePrice based on house attributes?
2. Classification: Can we identify whether a house will sell above the median price (HighPrice)?

Methodology: We followed the CRISP-DM framework: Data Understanding \(\rightarrow\) Cleaning \(\rightarrow\) Feature Engineering \(\rightarrow\) Modeling \(\rightarrow\) Evaluation.
# Code adapted from report chart.R
# Visualization: Before vs After Cleaning
library(ggplot2)    # plotting
library(gridExtra)  # grid.arrange() for side-by-side panels

df_raw <- read.csv("HousePrices.csv", stringsAsFactors = FALSE)
# Mark outliers for visualization
df_raw$Type <- ifelse(df_raw$GrLivArea > 4000 & df_raw$SalePrice < 300000,
"Outlier", "Normal")
# Plot 1: Before Cleaning
p1 <- ggplot(df_raw, aes(x = GrLivArea, y = SalePrice)) +
geom_point(aes(color = Type), alpha = 0.6, size = 1.5) +
scale_color_manual(values = c("Normal" = "gray50", "Outlier" = "red")) +
geom_smooth(method = "lm", se = FALSE, color = "blue", linetype = "dashed") +
labs(title = "Before Cleaning",
subtitle = "High-Leverage Outliers (Red) vs Normal (Gray)",
x = "GrLivArea (sq ft)", y = "Sale Price ($)") +
theme_bw() +
theme(legend.position = "top")
# Plot 2: After Cleaning (Filter out outliers)
df_clean_plot <- df_raw %>% filter(Type == "Normal")
p2 <- ggplot(df_clean_plot, aes(x = GrLivArea, y = SalePrice)) +
geom_point(color = "steelblue", alpha = 0.6, size = 1.5) +
geom_smooth(method = "lm", se = FALSE, color = "red", size = 1) +
labs(title = "After Cleaning",
subtitle = "Outliers Removed (Clean Blue Data)",
x = "GrLivArea (sq ft)", y = "Sale Price ($)") +
theme_bw()
# Combine plots
grid.arrange(p1, p2, ncol = 2)
We identified and removed high-leverage outliers to prevent model distortion, specifically observations with GrLivArea > 4000 sq. ft. but abnormally low prices.

Missing values (LotFrontage): imputed with the median (69.0) due to the right-skewed distribution (\(p < 0.05\) in the Shapiro-Wilk test).

We constructed new features to capture real-world market logic:

1. HouseAge: YrSold - YearBuilt. Quantifies depreciation.
2. TotalSF: GrLivArea + TotalBsmtSF. Captures total usable space.

Feature Validity Check (Correlation Heatmap): the heatmap below confirms that our engineered feature TotalSF and the quality metrics have the strongest correlations with SalePrice.
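Before turning to correlations, the skewness rationale behind the median imputation can be verified directly. The following is a minimal sketch run on raw_data (before imputation); the histogram call is illustrative only.

# --- Sanity check: LotFrontage skewness before imputation (illustrative sketch) ---
shapiro.test(raw_data$LotFrontage)           # small p-value -> reject normality
median(raw_data$LotFrontage, na.rm = TRUE)   # imputation value used above (~69)
hist(raw_data$LotFrontage, breaks = 40,
     main = "LotFrontage Distribution (Raw)", xlab = "LotFrontage (ft)")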
# Adapted from Houseprices.R to visualize correlations on Cleaned Data
library(reshape2)  # melt() to reshape the correlation matrix for ggplot

# Select numeric columns from the cleaned dataset
num_df <- df_clean[, sapply(df_clean, is.numeric)]
# Compute correlation matrix (ignoring missing values pairwise)
cor_matrix <- cor(num_df, use = "pairwise.complete.obs")
# Get correlation with SalePrice, sort by absolute value descending
cor_sorted <- sort(abs(cor_matrix[, "SalePrice"]), decreasing = TRUE)
# Select Top 10 features + SalePrice
top_vars <- names(cor_sorted)[1:11]
top_cor_matrix <- cor_matrix[top_vars, top_vars]
# Data Transformation for ggplot
melted_cor <- melt(top_cor_matrix)
# Plotting
ggplot(data = melted_cor, aes(x = Var1, y = Var2, fill = value)) +
geom_tile(color = "white") +
geom_text(aes(label = sprintf("%.2f", value)), color = "black", size = 3) +
scale_fill_gradient2(low = "#6D9EC1", mid = "white", high = "#E46726",
midpoint = 0, limit = c(-1,1), space = "Lab",
name = "Correlation") +
theme_minimal() +
theme(
axis.text.x = element_text(angle = 45, vjust = 1, size = 10, hjust = 1),
axis.title.x = element_blank(),
axis.title.y = element_blank(),
panel.grid.major = element_blank(),
panel.border = element_blank(),
panel.background = element_blank(),
axis.ticks = element_blank()
) +
coord_fixed() +
labs(title = "Top 10 Features Correlated with SalePrice (Post-Cleaning)")
We trained three models on a 70/30 split to predict continuous SalePrice (a training sketch follows this list):

1. Multiple Linear Regression: baseline model.
2. Lasso Regression: L1 regularization to handle multicollinearity (\(\lambda \approx 29.57\)).
3. Random Forest: ensemble method (500 trees) to capture non-linearities.
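The sketch below indicates how such a split and the three fits could be set up. It assumes df_clean from the processing chunk; the object names (y_test, pred_lm, pred_lasso, pred_rf, rf_fit) match those reused by the visualization chunks further down, and the seed and tuning choices are illustrative rather than the exact settings behind the reported metrics.

# --- Regression training sketch (illustrative; assumes df_clean exists) ---
library(glmnet)        # Lasso (L1) regression
library(randomForest)  # Random Forest

# One-hot encode once so train/test design matrices share the same columns
X <- model.matrix(SalePrice ~ ., data = df_clean)[, -1]
y <- df_clean$SalePrice

set.seed(123)  # illustrative seed
train_idx <- sample(seq_len(nrow(X)), size = floor(0.7 * nrow(X)))
x_train <- X[train_idx, ];  y_train <- y[train_idx]
x_test  <- X[-train_idx, ]; y_test  <- y[-train_idx]

# 1. Multiple Linear Regression (baseline); rank-deficiency warnings are possible
train_df <- data.frame(SalePrice = y_train, x_train)
test_df  <- data.frame(x_test)
lm_fit   <- lm(SalePrice ~ ., data = train_df)
pred_lm  <- predict(lm_fit, newdata = test_df)

# 2. Lasso Regression: lambda chosen by cross-validation
lasso_cv   <- cv.glmnet(x_train, y_train, alpha = 1)
pred_lasso <- as.numeric(predict(lasso_cv, newx = x_test, s = "lambda.min"))

# 3. Random Forest: 500 trees, permutation importance kept for the driver plot
rf_fit  <- randomForest(x = x_train, y = y_train, ntree = 500, importance = TRUE)
pred_rf <- predict(rf_fit, newdata = x_test)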
# Visualization: Predicted vs Actual Sale Price (Linear Regression)
# Note: We use Linear Regression predictions here for visualization clarity
pred_vs_actual_df <- data.frame(
Actual = y_test,
Predicted = pred_lm
)
ggplot(pred_vs_actual_df, aes(x = Actual, y = Predicted)) +
geom_point(alpha = 0.5, color = "steelblue") +
geom_abline(slope = 1, intercept = 0, linetype = "dashed", color = "red", size = 1) +
labs(
title = "Prediction Accuracy: Actual vs Predicted Prices",
subtitle = "Points closer to the red dashed line indicate better predictions",
x = "Actual Sale Price ($)",
y = "Predicted Sale Price ($)"
) +
theme_minimal()
Random Forest significantly outperformed the linear models, explaining ~90% of the variance on unseen data.
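The figures in the comparison below come from the held-out 30% test set; a minimal way to compute them, assuming the prediction vectors from the training sketch above, is:

# --- Test-set evaluation (illustrative; assumes y_test, pred_lm, pred_lasso, pred_rf) ---
rmse <- function(actual, pred) sqrt(mean((actual - pred)^2))
rsq  <- function(actual, pred) 1 - sum((actual - pred)^2) / sum((actual - mean(actual))^2)

data.frame(
  Model     = c("Linear Regression", "Lasso Regression", "Random Forest"),
  RMSE      = c(rmse(y_test, pred_lm), rmse(y_test, pred_lasso), rmse(y_test, pred_rf)),
  R_Squared = c(rsq(y_test, pred_lm), rsq(y_test, pred_lasso), rsq(y_test, pred_rf))
)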
# Recreating the results table summary for visualization
reg_results <- data.frame(
Model = c("Linear Regression", "Lasso Regression", "Random Forest"),
RMSE = c(30532.57, 30239.72, 24655.49),
R_Squared = c(0.8477, 0.8506, 0.9007)
)
# Visualization
p1 <- ggplot(reg_results, aes(x = reorder(Model, -R_Squared), y = R_Squared, fill = Model)) +
geom_col(width = 0.6, show.legend = FALSE) +
geom_text(aes(label = round(R_Squared, 3)), vjust = -0.5) +
scale_y_continuous(limits = c(0, 1)) +
labs(title = "Model R-Squared Comparison", x = NULL, y = "R²") +
theme_minimal() + scale_fill_brewer(palette = "Blues")
p2 <- ggplot(reg_results, aes(x = reorder(Model, RMSE), y = RMSE, fill = Model)) +
geom_col(width = 0.6, show.legend = FALSE) +
geom_text(aes(label = round(RMSE, 0)), vjust = -0.5) +
labs(title = "Model RMSE Comparison (Lower is Better)", x = NULL, y = "RMSE ($)") +
theme_minimal() + scale_fill_brewer(palette = "Greens")
grid.arrange(p1, p2, ncol = 2)
# Visualization: Random Forest Feature Importance
# Extract importance from the trained RF model
# (rf_fit must have been fitted with importance = TRUE for %IncMSE to be available)
importance_df <- as.data.frame(importance(rf_fit))
importance_df$Feature <- rownames(importance_df)
# Select Top 15 features
top_importance <- importance_df %>%
arrange(desc(`%IncMSE`)) %>%
head(15)
ggplot(top_importance, aes(x = reorder(Feature, `%IncMSE`), y = `%IncMSE`)) +
geom_col(fill = "darkgreen", alpha = 0.7) +
coord_flip() +
labs(
title = "Top 15 Drivers of House Prices (Random Forest)",
subtitle = "%IncMSE: How much error increases if this feature is removed",
x = NULL,
y = "Importance (% Increase in MSE)"
) +
theme_minimal()
The Random Forest model identified the key drivers of house prices:

1. TotalSF (Total Square Footage) is the dominant predictor: permuting it increases MSE by ~34%.
2. OverallQual and GrLivArea follow closely.

We then converted SalePrice into a binary target HighPrice (1 if > median, 0 otherwise) and trained classification models on an 80/20 split using 5-fold cross-validation (a setup sketch follows).
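A minimal sketch of this setup, assuming df_clean and the caret package, is shown below. The object names (trainData, testData, model_logreg, model_rf, model_knn) match those used in the ROC chunk that follows; the High/Low factor labels stand in for the 1/0 coding, and tuning is left to caret's defaults.

# --- Classification setup sketch (illustrative; assumes df_clean exists) ---
library(caret)

# Binary target: "High" if SalePrice is above the median, "Low" otherwise
class_df <- df_clean
class_df$HighPrice <- factor(ifelse(class_df$SalePrice > median(class_df$SalePrice),
                                    "High", "Low"),
                             levels = c("Low", "High"))
class_df$SalePrice <- NULL  # drop the raw price to avoid target leakage

set.seed(123)  # illustrative seed
split_idx <- createDataPartition(class_df$HighPrice, p = 0.8, list = FALSE)
trainData <- class_df[split_idx, ]
testData  <- class_df[-split_idx, ]

# 5-fold cross-validation, optimizing AUC (ROC)
ctrl <- trainControl(method = "cv", number = 5,
                     classProbs = TRUE, summaryFunction = twoClassSummary)

model_logreg <- train(HighPrice ~ ., data = trainData, method = "glmnet",
                      metric = "ROC", trControl = ctrl)
model_rf     <- train(HighPrice ~ ., data = trainData, method = "rf",
                      metric = "ROC", trControl = ctrl)
model_knn    <- train(HighPrice ~ ., data = trainData, method = "knn",
                      preProcess = c("center", "scale"),
                      metric = "ROC", trControl = ctrl)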
# Visualization: ROC Curves for Classification Models
library(pROC)

# Calculate ROC for Logistic Regression
roc_log <- roc(testData$HighPrice, predict(model_logreg, testData, type = "prob")[, "High"], levels = c("Low", "High"), direction = "<")
# Calculate ROC for Random Forest
roc_rf <- roc(testData$HighPrice, predict(model_rf, testData, type = "prob")[, "High"], levels = c("Low", "High"), direction = "<")
# Calculate ROC for KNN
roc_knn <- roc(testData$HighPrice, predict(model_knn, testData, type = "prob")[, "High"], levels = c("Low", "High"), direction = "<")
plot(roc_log, col = "blue", main = "ROC Curves Comparison", legacy.axes = TRUE, lwd = 2)
plot(roc_rf, col = "red", add = TRUE, lwd = 2)
plot(roc_knn, col = "green", add = TRUE, lwd = 2)
legend("bottomright",
legend = c(paste("Logistic (AUC =", round(auc(roc_log),3),")"),
paste("Random Forest (AUC =", round(auc(roc_rf),3),")"),
paste("KNN (AUC =", round(auc(roc_knn),3),")")),
col = c("blue", "red", "green"), lwd = 2, bty = "n")
In contrast to the regression task, Logistic Regression (Elastic Net) achieved the best classification performance, slightly outperforming the non-linear Random Forest.
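The accuracy and F1 values in the table below are computed on the 20% test set; one way to reproduce them for a single model, assuming the objects from the classification sketch above, is via caret's confusionMatrix:

# --- Test-set metrics for one model (illustrative; assumes model_logreg, testData) ---
pred_class <- predict(model_logreg, testData)   # hard class predictions
cm <- confusionMatrix(pred_class, testData$HighPrice, positive = "High")
cm$overall["Accuracy"]   # accuracy
cm$byClass["F1"]         # F1 score for the "High" class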
# Summary Data from Classification Report
library(knitr)       # kable()
library(kableExtra)  # kable_styling()

class_results <- data.frame(
Model = c("Logistic Regression", "Random Forest", "KNN"),
Accuracy = c(0.9416, 0.9315, 0.9041),
F1_Score = c(0.9412, 0.9320, 0.9030),
AUC = c(0.988, 0.985, 0.963)
)
kable(class_results, caption = "Classification Model Performance (Test Set)") %>%
  kable_styling(bootstrap_options = c("striped", "hover"), full_width = FALSE)
In the logistic model, OverallQual, TotalSF, and GrLivArea have the strongest coefficients.

Model Strategy: Random Forest is preferred for predicting continuous prices (~90% of variance explained on unseen data), while the regularized Logistic Regression is the better choice for the high/low classification task.

Key Determinants: Across all models, Total Area (TotalSF) and Overall Quality (OverallQual) are the most critical factors.
Data Quality: The implementation of rigorous
cleaning (Outlier removal) and domain-specific feature engineering
(HouseAge, TotalSF) significantly improved
model stability and performance compared to raw baselines.