# ==============================================================================
# BACKEND DATA PROCESSING
# Logic imported from DataCleaning.R
# ==============================================================================

# Required packages for this pipeline (dplyr verbs and tidyr::replace_na are used below)
library(dplyr)
library(tidyr)

# 1. Read Raw Data
# Please ensure "HousePrices.csv" is in the same directory
if(file.exists("HousePrices.csv")) {
  raw_data <- read.csv("HousePrices.csv", stringsAsFactors = FALSE)
} else {
  stop("Error: 'HousePrices.csv' not found. Please upload the raw dataset.")
}

# 2. Ordinal Mapping Definition
quality_map <- c("None" = 0, "Po" = 1, "Fa" = 2, "TA" = 3, "Gd" = 4, "Ex" = 5)

# 3. Cleaning Pipeline
df_clean <- raw_data %>%
  # --- Outlier handling ---
  filter(!(GrLivArea > 4000 & SalePrice < 300000)) %>%
  
  # --- Fill categorical NAs that mean "None" (feature absent) ---
  mutate(
    PoolQC = replace_na(PoolQC, "None"),
    MiscFeature = replace_na(MiscFeature, "None"),
    Alley = replace_na(Alley, "None"),
    Fence = replace_na(Fence, "None"),
    FireplaceQu = replace_na(FireplaceQu, "None"),
    GarageType = replace_na(GarageType, "None"),
    GarageFinish = replace_na(GarageFinish, "None"),
    GarageQual = replace_na(GarageQual, "None"),
    GarageCond = replace_na(GarageCond, "None"),
    BsmtQual = replace_na(BsmtQual, "None"),
    BsmtCond = replace_na(BsmtCond, "None"),
    BsmtExposure = replace_na(BsmtExposure, "None"),
    BsmtFinType1 = replace_na(BsmtFinType1, "None"),
    BsmtFinType2 = replace_na(BsmtFinType2, "None"),
    MasVnrType = replace_na(MasVnrType, "None")
  ) %>%
  
  # --- Fill in the missing values (Median for LotFrontage) ---
  mutate(
    LotFrontage = ifelse(is.na(LotFrontage), median(LotFrontage, na.rm = TRUE), LotFrontage),
    MasVnrArea = replace_na(MasVnrArea, 0),
    GarageYrBlt = replace_na(GarageYrBlt, 0),
    Electrical = replace_na(Electrical, "SBrkr")
  ) %>%
  
  # --- Ordinal Encoding ---
  mutate(
    ExterQual   = quality_map[ExterQual],
    ExterCond   = quality_map[ExterCond],
    BsmtQual    = quality_map[BsmtQual],
    BsmtCond    = quality_map[BsmtCond],
    HeatingQC   = quality_map[HeatingQC],
    KitchenQual = quality_map[KitchenQual],
    FireplaceQu = quality_map[FireplaceQu],
    GarageQual  = quality_map[GarageQual],
    GarageCond  = quality_map[GarageCond],
    PoolQC      = quality_map[PoolQC]
  ) %>%
  
  # --- Construct new features ---
  mutate(
    HouseAge = pmax(0, YrSold - YearBuilt),
    TotalSF = GrLivArea + TotalBsmtSF
  ) %>%
  
  # --- Type conversion ---
  mutate(
    MSSubClass = as.factor(MSSubClass),
    MoSold = as.factor(MoSold),
    YrSold = as.factor(YrSold)
  ) %>%
  mutate_if(is.character, as.factor) %>%
  
  # --- Drop uninformative columns (Id, Utilities) ---
  select(-any_of(c("Id", "Utilities")))

# df_clean is now ready for analysis in subsequent chunks

1. Project Overview & Objectives

Our project leverages the Ames Housing Dataset (1460 observations, 81 variables) to address two core business questions:

  1. Regression Task: Can we accurately predict the final SalePrice based on house attributes?
  2. Classification Task: Can we identify “High Value” properties (price > median) versus “Low Value” properties?

Methodology: We followed the CRISP-DM framework: Data Understanding \(\rightarrow\) Cleaning \(\rightarrow\) Feature Engineering \(\rightarrow\) Modeling \(\rightarrow\) Evaluation.


2. Data Engineering & Cleaning

2.1 Outlier Treatment & Missing Values

# Code adapted from report chart.R
# Visualization: Before vs After Cleaning
df_raw <- read.csv("HousePrices.csv", stringsAsFactors = FALSE)

# Mark outliers for visualization
df_raw$Type <- ifelse(df_raw$GrLivArea > 4000 & df_raw$SalePrice < 300000, 
                      "Outlier", "Normal")

# Plot 1: Before Cleaning
p1 <- ggplot(df_raw, aes(x = GrLivArea, y = SalePrice)) +
  geom_point(aes(color = Type), alpha = 0.6, size = 1.5) +
  scale_color_manual(values = c("Normal" = "gray50", "Outlier" = "red")) +
  geom_smooth(method = "lm", se = FALSE, color = "blue", linetype = "dashed") +
  labs(title = "Before Cleaning", 
       subtitle = "High-Leverage Outliers (Red) vs Normal (Gray)",
       x = "GrLivArea (sq ft)", y = "Sale Price ($)") +
  theme_bw() + 
  theme(legend.position = "top")

# Plot 2: After Cleaning (Filter out outliers)
df_clean_plot <- df_raw %>% filter(Type == "Normal")

p2 <- ggplot(df_clean_plot, aes(x = GrLivArea, y = SalePrice)) +
  geom_point(color = "steelblue", alpha = 0.6, size = 1.5) +
  geom_smooth(method = "lm", se = FALSE, color = "red", size = 1) +
  labs(title = "After Cleaning", 
       subtitle = "Outliers Removed (Clean Blue Data)",
       x = "GrLivArea (sq ft)", y = "Sale Price ($)") +
  theme_bw()

# Combine plots
grid.arrange(p1, p2, ncol = 2)

We identified and removed high-leverage outliers to prevent model distortion: specifically, observations with GrLivArea > 4000 sq ft but abnormally low sale prices (below $300,000).

  • Missing Values:
    • Categorical (PoolQC, Alley, garage/basement fields, etc.): Imputed as “None”, since NA here indicates the feature is absent rather than unrecorded.
    • Numerical (LotFrontage): Imputed with the median (69.0); the distribution is right-skewed and non-normal (Shapiro-Wilk \(p < 0.05\)), so the median is more robust than the mean.
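
A minimal check of this choice on the raw data, assuming raw_data from the backend chunk above (the quoted median of 69.0 and the \(p < 0.05\) result come from the report, not from this sketch):

# Sanity check for the LotFrontage imputation (sketch)
median(raw_data$LotFrontage, na.rm = TRUE)   # median used for imputation (reported as 69.0)
shapiro.test(raw_data$LotFrontage)           # small p-value -> non-normal, so prefer the median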

2.2 Feature Engineering & Validation

We constructed new features to capture real-world market logic:

  1. HouseAge: YrSold - YearBuilt (floored at 0). Quantifies depreciation.
  2. TotalSF: GrLivArea + TotalBsmtSF. Captures total usable space.
  3. Ordinal Encoding: Mapped Quality variables (Ex, Gd, TA…) to numeric scales (5, 4, 3…).

Feature Validity Check (Correlation Heatmap): The heatmap below confirms that our engineered feature TotalSF and quality metrics have the strongest correlation with SalePrice.

# Adapted from Houseprices.R to visualize correlations on Cleaned Data
# Select numeric columns from the cleaned dataset
num_df <- df_clean[, sapply(df_clean, is.numeric)]

# Compute correlation matrix (ignoring missing values pairwise)
cor_matrix <- cor(num_df, use = "pairwise.complete.obs")

# Get correlation with SalePrice, sort by absolute value descending
cor_sorted <- sort(abs(cor_matrix[, "SalePrice"]), decreasing = TRUE)

# Select Top 10 features + SalePrice
top_vars <- names(cor_sorted)[1:11] 
top_cor_matrix <- cor_matrix[top_vars, top_vars]

# Data Transformation for ggplot
melted_cor <- melt(top_cor_matrix)

# Plotting
ggplot(data = melted_cor, aes(x = Var1, y = Var2, fill = value)) +
  geom_tile(color = "white") + 
  geom_text(aes(label = sprintf("%.2f", value)), color = "black", size = 3) +
  scale_fill_gradient2(low = "#6D9EC1", mid = "white", high = "#E46726", 
                       midpoint = 0, limit = c(-1,1), space = "Lab", 
                       name = "Correlation") + 
  theme_minimal() +
  theme(
    axis.text.x = element_text(angle = 45, vjust = 1, size = 10, hjust = 1), 
    axis.title.x = element_blank(), 
    axis.title.y = element_blank(), 
    panel.grid.major = element_blank(), 
    panel.border = element_blank(),
    panel.background = element_blank(),
    axis.ticks = element_blank()
  ) +
  coord_fixed() + 
  labs(title = "Top 10 Features Correlated with SalePrice (Post-Cleaning)")


3. Regression Analysis (Price Prediction)

3.1 Model Selection

We trained three models on a 70/30 split to predict the continuous SalePrice:

  1. Multiple Linear Regression: Baseline model.
  2. Lasso Regression: L1 regularization to handle multicollinearity (\(\lambda \approx 29.57\)).
  3. Random Forest: Ensemble method (500 trees) to capture non-linearities.
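
The training code is not reproduced in this section, so the sketch below shows one way the split and the three models could be set up, and it defines the objects used in later chunks (y_test, pred_lm, rf_fit, pred_rf). Apart from the 70/30 split and the 500 trees stated above, the object names, seed, and settings are assumptions rather than the report's exact code.

# --- Sketch: 70/30 split and the three regression models (assumed setup) ---
library(glmnet)        # Lasso
library(randomForest)  # Random Forest

set.seed(123)                                   # seed is illustrative
train_idx <- sample(nrow(df_clean), size = floor(0.7 * nrow(df_clean)))
train_df  <- df_clean[train_idx, ]
test_df   <- df_clean[-train_idx, ]
y_test    <- test_df$SalePrice

# 1. Multiple Linear Regression (baseline)
# Note: rare factor levels that appear only in the test split may need extra handling
lm_fit  <- lm(SalePrice ~ ., data = train_df)
pred_lm <- predict(lm_fit, newdata = test_df)

# 2. Lasso Regression (lambda selected by cross-validation)
x_all      <- model.matrix(SalePrice ~ ., data = df_clean)[, -1]  # one matrix avoids level mismatches
cv_lasso   <- cv.glmnet(x_all[train_idx, ], train_df$SalePrice, alpha = 1)
pred_lasso <- predict(cv_lasso, newx = x_all[-train_idx, ], s = "lambda.min")

# 3. Random Forest (500 trees, permutation importance for %IncMSE)
rf_fit  <- randomForest(SalePrice ~ ., data = train_df, ntree = 500, importance = TRUE)
pred_rf <- predict(rf_fit, newdata = test_df)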

3.2 Performance Comparison

# Visualization: Predicted vs Actual Sale Price (Linear Regression)
# Note: We use Linear Regression predictions here for visualization clarity
pred_vs_actual_df <- data.frame(
  Actual = y_test,
  Predicted = pred_lm
)

ggplot(pred_vs_actual_df, aes(x = Actual, y = Predicted)) +
  geom_point(alpha = 0.5, color = "steelblue") +
  geom_abline(slope = 1, intercept = 0, linetype = "dashed", color = "red", size = 1) +
  labs(
    title = "Prediction Accuracy: Actual vs Predicted Prices",
    subtitle = "Points closer to the red dashed line indicate better predictions",
    x = "Actual Sale Price ($)",
    y = "Predicted Sale Price ($)"
  ) +
  theme_minimal()

Random Forest outperformed linear models significantly, explaining ~90% of the variance on unseen data.
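
The table values below can be reproduced with standard hold-out metrics; a minimal sketch, assuming y_test and the pred_* vectors from the sketch in 3.1:

# Test-set RMSE and R-squared for each model (sketch)
rmse <- function(actual, pred) sqrt(mean((actual - pred)^2))
rsq  <- function(actual, pred) 1 - sum((actual - pred)^2) / sum((actual - mean(actual))^2)

sapply(list(lm = pred_lm, lasso = pred_lasso, rf = pred_rf),
       function(p) c(RMSE = rmse(y_test, p), R2 = rsq(y_test, p)))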

# Recreating the results table summary for visualization
reg_results <- data.frame(
  Model = c("Linear Regression", "Lasso Regression", "Random Forest"),
  RMSE = c(30532.57, 30239.72, 24655.49),
  R_Squared = c(0.8477, 0.8506, 0.9007)
)

# Visualization
p1 <- ggplot(reg_results, aes(x = reorder(Model, -R_Squared), y = R_Squared, fill = Model)) +
  geom_col(width = 0.6, show.legend = FALSE) +
  geom_text(aes(label = round(R_Squared, 3)), vjust = -0.5) +
  scale_y_continuous(limits = c(0, 1)) +
  labs(title = "Model R-Squared Comparison", x = NULL, y = "R²") +
  theme_minimal() + scale_fill_brewer(palette = "Blues")

p2 <- ggplot(reg_results, aes(x = reorder(Model, RMSE), y = RMSE, fill = Model)) +
  geom_col(width = 0.6, show.legend = FALSE) +
  geom_text(aes(label = round(RMSE, 0)), vjust = -0.5) +
  labs(title = "Model RMSE Comparison (Lower is Better)", x = NULL, y = "RMSE ($)") +
  theme_minimal() + scale_fill_brewer(palette = "Greens")

grid.arrange(p1, p2, ncol = 2)

3.3 Feature Importance

# Visualization: Random Forest Feature Importance
# Extract importance from the trained RF model
importance_df <- as.data.frame(importance(rf_fit))
importance_df$Feature <- rownames(importance_df)

# Select Top 15 features
top_importance <- importance_df %>%
  arrange(desc(`%IncMSE`)) %>%
  head(15)

ggplot(top_importance, aes(x = reorder(Feature, `%IncMSE`), y = `%IncMSE`)) +
  geom_col(fill = "darkgreen", alpha = 0.7) +
  coord_flip() +
  labs(
    title = "Top 15 Drivers of House Prices (Random Forest)",
    subtitle = "%IncMSE: How much MSE increases when this feature's values are permuted",
    x = NULL,
    y = "Importance (% Increase in MSE)"
  ) +
  theme_minimal()

The Random Forest model identified the key drivers of house prices.

  • TotalSF (Total Square Footage) is the dominant predictor (Permuting it increases MSE by ~34%).
  • OverallQual and GrLivArea follow closely.
  • This confirms that Space and Quality are the primary determinants of value.

4. Classification Analysis (High vs. Low Price)

4.1 Approach

We converted SalePrice into a binary target HighPrice (“High” if the price is above the median, “Low” otherwise) and trained three models (Logistic Regression with elastic net, Random Forest, and KNN) on an 80/20 split using 5-fold cross-validation.
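
The chunk below is a minimal sketch of that setup using caret, and it defines the objects referenced in the ROC code (testData, model_logreg, model_rf, model_knn); the method names, seed, and tuning defaults are assumptions rather than the report's exact code.

# --- Sketch: binary target, 80/20 split and 5-fold CV training (assumed setup) ---
library(caret)

df_class <- df_clean %>%
  mutate(HighPrice = factor(ifelse(SalePrice > median(SalePrice), "High", "Low"),
                            levels = c("Low", "High"))) %>%
  select(-SalePrice)                        # drop the raw price to avoid leakage

set.seed(123)                               # seed is illustrative
idx       <- createDataPartition(df_class$HighPrice, p = 0.8, list = FALSE)
trainData <- df_class[idx, ]
testData  <- df_class[-idx, ]

ctrl <- trainControl(method = "cv", number = 5,
                     classProbs = TRUE, summaryFunction = twoClassSummary)

model_logreg <- train(HighPrice ~ ., data = trainData, method = "glmnet",   # elastic net
                      trControl = ctrl, metric = "ROC")
model_rf     <- train(HighPrice ~ ., data = trainData, method = "rf",
                      trControl = ctrl, metric = "ROC")
model_knn    <- train(HighPrice ~ ., data = trainData, method = "knn",
                      preProcess = c("center", "scale"),
                      trControl = ctrl, metric = "ROC")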

4.2 Results Summary

# Visualization: ROC Curves for Classification Models
# Calculate ROC for Logistic Regression
roc_log <- roc(testData$HighPrice,
               predict(model_logreg, testData, type = "prob")[, "High"],
               levels = c("Low", "High"), direction = "<")
# Calculate ROC for Random Forest
roc_rf <- roc(testData$HighPrice,
              predict(model_rf, testData, type = "prob")[, "High"],
              levels = c("Low", "High"), direction = "<")
# Calculate ROC for KNN
roc_knn <- roc(testData$HighPrice,
               predict(model_knn, testData, type = "prob")[, "High"],
               levels = c("Low", "High"), direction = "<")

plot(roc_log, col = "blue", main = "ROC Curves Comparison", legacy.axes = TRUE, lwd = 2)
plot(roc_rf, col = "red", add = TRUE, lwd = 2)
plot(roc_knn, col = "green", add = TRUE, lwd = 2)
legend("bottomright", 
       legend = c(paste("Logistic (AUC =", round(auc(roc_log),3),")"), 
                  paste("Random Forest (AUC =", round(auc(roc_rf),3),")"),
                  paste("KNN (AUC =", round(auc(roc_knn),3),")")),
       col = c("blue", "red", "green"), lwd = 2, bty = "n")

Contrary to the regression task, Logistic Regression (Elastic Net) achieved the best performance, slightly outperforming the non-linear Random Forest.
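
The accuracy and F1 figures in the table below follow from the hold-out confusion matrix; a minimal sketch for one model, assuming the objects from the sketch in 4.1:

# Hold-out confusion matrix and derived metrics for the elastic-net model (sketch)
pred_class <- predict(model_logreg, testData)
confusionMatrix(pred_class, testData$HighPrice, positive = "High", mode = "everything")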

# Summary Data from Classification Report
class_results <- data.frame(
  Model = c("Logistic Regression", "Random Forest", "KNN"),
  Accuracy = c(0.9416, 0.9315, 0.9041),
  F1_Score = c(0.9412, 0.9320, 0.9030),
  AUC = c(0.988, 0.985, 0.963)
)

kable(class_results, caption = "Classification Model Performance (Test Set)") %>%
  kable_styling(bootstrap_options = c("striped", "hover"), full_width = F)
Classification Model Performance (Test Set)

  Model                  Accuracy   F1_Score   AUC
  Logistic Regression    0.9416     0.9412     0.988
  Random Forest          0.9315     0.9320     0.985
  KNN                    0.9041     0.9030     0.963

4.3 Interpretation

  • Linearity Dominates Classification: The success of Logistic Regression indicates that the boundary between “High” and “Low” price houses is largely linear and additive.
  • Key Drivers: Consistent with regression, OverallQual, TotalSF, and GrLivArea carry the strongest coefficients (see the sketch after this list).
  • Business Insight: We can tell stakeholders that specific upgrades (e.g., raising the quality score) have a direct, measurable impact on the likelihood of a property falling into the “High Price” segment.
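
The key-driver claim can be checked directly on the fitted elastic-net model; a minimal sketch, assuming model_logreg from the sketch in 4.1:

# Non-zero elastic-net coefficients at the selected lambda (sketch)
coef(model_logreg$finalModel, s = model_logreg$bestTune$lambda)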

5. Conclusion

  1. Model Strategy:

    • For exact price prediction, use Random Forest (\(R^2 \approx 0.90\)) to capture complex market fluctuations and non-linear interactions.
    • For market segmentation (High/Low tier), use Logistic Regression (Accuracy \(\approx 94\%\)) for its interpretability and stability.
  2. Key Determinants: Across all models, Total Area (TotalSF) and Overall Quality (OverallQual) are the most critical factors.

  3. Data Quality: The implementation of rigorous cleaning (Outlier removal) and domain-specific feature engineering (HouseAge, TotalSF) significantly improved model stability and performance compared to raw baselines.