4.1 Data Integrity and Anomaly Inspection
4.1.1 Outlier Detection and Data Validity Check
This section investigates the distributions of key variables
Price_Lac,
Area_sqft,
Parking_num to detect potential outliers
and anomalies.
library(dplyr)
house_df <- read.csv("house_price_cleaned.csv",stringsAsFactors = FALSE)
# a. Distribution visualization
op <- par(mfrow = c(1, 3))
boxplot(house_df$Price_Lac, main = "Price_Lac",
ylab = "Price (Lac)", col = "salmon", border = "brown")
boxplot(house_df$Area_sqft, main = "Area_sqft",
ylab = "Area (sqft)", col = "lightblue", border = "blue")
boxplot(house_df$Parking_num, main = "Parking_num",
ylab = "Parking count", col = "lightgreen", border = "darkgreen")

par(op)
# b. Parking Anomalies detection (Top 0.5%)
park_thr <- quantile(house_df$Parking_num, 0.995, na.rm = TRUE)
anomalous_parking <- subset(house_df, !is.na(Parking_num) & Parking_num > park_thr)
cat("Parking threshold (99.5th percentile):", park_thr, "\n")
Parking threshold (99.5th percentile): 34
cat("Rows with Parking_num above threshold:", nrow(anomalous_parking), "\n")
Rows with Parking_num above threshold: 589
anomalous_parking %>%
dplyr::select(location, Parking_num, Price_Lac, Area_sqft) %>%
head() %>%
knitr::kable(
caption = "Anomalous Parking Records",
digits = 0,
format.args = list(big.mark = ","),
row.names = FALSE)
Anomalous Parking Records
| thane |
66 |
90 |
600 |
| thane |
701 |
260 |
1,200 |
| thane |
35 |
24 |
654 |
| thane |
323 |
245 |
1,250 |
| thane |
103 |
81 |
600 |
| thane |
203 |
80 |
600 |
# c. Area outliers detection (Top 0.1%)
area_thr <- quantile(house_df$Area_sqft, 0.999, na.rm = TRUE)
extreme_area <- subset(house_df, !is.na(Area_sqft) & Area_sqft > area_thr)
cat("Area threshold (99.9th percentile):", area_thr, "\n")
Area threshold (99.9th percentile): 6481.1
cat("Number of extreme area outliers:", nrow(extreme_area), "\n")
Number of extreme area outliers: 178
extreme_area %>%
dplyr::arrange(desc(Area_sqft)) %>%
dplyr::select(location, Area_sqft, Price_Lac, Parking_num) %>%
head(10) %>%
knitr::kable(
caption = "Top 10 Listings by Area (Extreme High)",
digits = 0,
format.args = list(big.mark = ","))
Top 10 Listings by Area (Extreme High)
| guwahati |
709,222 |
60 |
1 |
| visakhapatnam |
530,040 |
105 |
1 |
| bhiwadi |
495,970 |
19 |
1 |
| agra |
282,004 |
20 |
1 |
| gurgaon |
194,936 |
260 |
2 |
| agra |
113,134 |
56 |
1 |
| jaipur |
107,806 |
68 |
1 |
| thrissur |
81,845 |
45 |
1 |
| siliguri |
81,675 |
46 |
1 |
| siliguri |
71,775 |
40 |
1 |
# d. Price outliers detection (Top 0.1%)
price_thr <- quantile(house_df$Price_Lac, 0.999, na.rm = TRUE)
extreme_prices <- subset(house_df, !is.na(Price_Lac) & Price_Lac > price_thr)
cat("Price threshold (99.9th percentile):", price_thr, "\n")
Price threshold (99.9th percentile): 1250
cat("Number of extreme price outliers:", nrow(extreme_prices), "\n")
Number of extreme price outliers: 174
extreme_prices %>%
dplyr::select(location, Price_Lac, Area_sqft, Parking_num) %>%
head() %>%
knitr::kable(
caption = "Extreme Price Listings",
digits = 0,
format.args = list(big.mark = ","),
row.names = FALSE)
Extreme Price Listings
| mumbai |
1,475 |
2,800 |
2 |
| mumbai |
2,200 |
2,300 |
2 |
| mumbai |
3,600 |
10,000 |
4 |
| mumbai |
3,680 |
15,000 |
6 |
| mumbai |
3,600 |
12,000 |
6 |
| mumbai |
4,000 |
10,000 |
3 |
For Parking_num, the 99.5th-percentile cutoff is
34 and 589 rows are above it. Values like 66, 103, 203, 323, and
701 look unrealistic for a place of residence, so they are more likely
data-entry issues, placeholders, or a definition
mismatch.
For Area_sqft, the 99.9th-percentile cutoff is
6481.1 and 178 rows are above it. The maximum area reaches
709,222 sqft, which is not plausible for typical
residential listings and strongly suggests a unit or data-entry
problem.
For Price_Lac, the 99.9th-percentile cutoff is
1250 and 174 extreme-price rows are flagged. These rows often
have large Area_sqft values, such as 10,000 to 15,000. They
are likely genuine luxury listings.
4.1.2 Missing and “Unknown” Values Analysis
This section evaluates data completeness by examining
missing values across variables and the prevalence of the
“Unknown” placeholder in categorical fields.
library(tidyverse)
# a. Missing rate by variable
missing_rate <- sapply(house_df, function(x) mean(is.na(x)))
missing_df <- data.frame(
variable = names(missing_rate),
missing_pct = as.numeric(missing_rate))
missing_df <- missing_df[missing_df$missing_pct > 0,]
missing_df <- missing_df[order(missing_df$missing_pct, decreasing = TRUE),]
ggplot(missing_df, aes(x = reorder(variable, missing_pct), y = missing_pct)) +
geom_col(fill = "tomato") +
geom_text(aes(label = scales::percent(missing_pct, accuracy = 0.1)),
hjust = -0.05, size = 3) +
coord_flip() +
scale_y_continuous(labels = scales::percent_format(accuracy = 1),expand = expansion(mult = c(0, 0.15))) +
labs(
title = "Missing value rate by variable",
x = NULL,
y = "Missing rate (NA)") +
theme_minimal() +
theme(plot.title = element_text(face = "bold", size = 11, hjust = 0.5))

# b. "Unknown" proportion in categorical fields
unknown_cols <- c("facing", "Ownership", "Furnishing", "Transaction")
unknown_df <- data.frame(
variable = unknown_cols,
unknown_pct = sapply(unknown_cols, function(col) {
mean(tolower(house_df[[col]]) == "unknown", na.rm = TRUE)}))
unknown_df <- unknown_df[order(unknown_df$unknown_pct, decreasing = TRUE),]
ggplot(unknown_df, aes(x = reorder(variable, unknown_pct), y = unknown_pct)) +
geom_col(fill = "steelblue") +
geom_text(aes(label = scales::percent(unknown_pct, accuracy = 0.1)), hjust = -0.05, size = 3) +
coord_flip() +
scale_y_continuous(labels = scales::percent_format(accuracy = 1), expand = expansion(mult = c(0, 0.15))) +
labs(
title = "\"Unknown\" proportion by categorical variable",
x = NULL,
y = "Proportion of \"Unknown\"") +
theme_minimal() +
theme(plot.title = element_text(face = "bold", size = 11, hjust = 0.5))

Super_Area_sqft (57.13%) and
Carpet_Area_sqft (42.92%) show significant
missing values. They are the main sources of missing data in the
dataset. The missing rate for Price_per_sqft is relatively
low at 4.49%. These data gaps remain in area-related fields. This
indicates lower data quality for these variables. Other
fields are largely complete.
Among categorical variables, Facing and
Ownership have high ‘Unknown’ rates, standing
at 36.96% and 34.82% respectively. This limits
the effective information despite the absence of missing values.
‘Unknown’ values are rare in Furnishing (1.16%) and
Transaction (0.04%). The data quality concern is
concentrated in only a few categorical fields.
4.2 Univariate Analysis: Distributions
4.2.1 Target Variable Analysis: Price
This section examines the distribution of the target variable
Price_Lac. The goal is to assess its
skewness and tail behavior.
library(scales)
library(ggplot2)
library(patchwork)
library(kableExtra)
# a. Price distribution
summary(house_df$Price_Lac) %>%
broom::tidy() %>%
knitr::kable(
caption = "Summary Statistics: Price (Lac)",
digits = 2,
align = "c",
format.args = list(big.mark = ",")
) %>%
kable_styling(full_width = TRUE)
Summary Statistics: Price (Lac)
|
minimum
|
q1
|
median
|
mean
|
q3
|
maximum
|
|
1
|
48.4
|
78
|
119.81
|
145
|
140,030
|
house_df$Price_Lac <- as.numeric(as.character(house_df$Price_Lac))
house_df <- house_df %>%
mutate(log_price = log1p(Price_Lac))
median_raw <- median(house_df$Price_Lac, na.rm = TRUE)
median_log <- median(house_df$log_price, na.rm = TRUE)
# Raw scale
p99 <- quantile(house_df$Price_Lac, 0.99, na.rm = TRUE)
p1 <- ggplot(house_df, aes(x = Price_Lac)) +
geom_histogram(aes(y = after_stat(density)), bins = 50,
fill = "black", color = "white", alpha = 0.8, na.rm = TRUE) +
geom_density(color = "darkblue", linewidth = 1, na.rm = TRUE) +
geom_vline(xintercept = median_raw, linetype = "dashed", color = "tomato", linewidth = 0.8) +
annotate("text", x = median_raw, y = 0,
label = paste("Median:", format(round(median_raw, 1), big.mark=",")),
vjust = -1, hjust = -0.1, color = "tomato", fontface = "bold", size = 3.5) +
coord_cartesian(xlim = c(0, p99)) +
labs(title = "Original Scale (Zoomed top 99%)",
x = "Price (Lac)", y = "Density") +
theme_minimal() +
theme(plot.title = element_text(face = "bold", size = 11, hjust = 0.5))
# Log transformed scale
p2 <- ggplot(house_df, aes(x = log_price)) +
geom_histogram(aes(y = after_stat(density)), bins = 50,
fill = "lightgreen", color = "white", alpha = 0.8, na.rm = TRUE) +
geom_density(color = "steelblue", linewidth = 1, na.rm = TRUE) +
geom_vline(xintercept = median_log, linetype = "dashed", color = "tomato", linewidth = 0.8) +
annotate("text", x = median_log, y = 0,
label = paste("Median:", round(median_log, 2)),
vjust = -1, hjust = -0.1, color = "tomato", fontface = "bold", size = 3.5) +
labs(title = "Log Transformed Scale",
x = "log1p(Price)", y = "Density") +
theme_minimal() +
theme(plot.title = element_text(face = "bold", size = 11, hjust = 0.5))
p1 + p2

Given the extremely long right tail on the raw scale
(Price_Lac max = 140,030), applying
log1p(Price_Lac) compresses extreme values.
This results in a much more symmetric distribution. The log
view effectively mitigates the skewness caused by very expensive
listings.
4.2.2 Key Numerical Features
This section analyzes Area_sqft and
Price_per_sqft. The focus is on identifying skewness, heavy
tails, and potential anomalies to reveal the data’s spread and extreme
behaviors.
# a. Area_sqft distribution
summary(house_df$Area_sqft) %>%
broom::tidy() %>%
knitr::kable(
caption = "Summary Statistics: Area_sqft",
digits = 0,
align = "c",
format.args = list(big.mark = ","))%>%
kable_styling(full_width = TRUE)
Summary Statistics: Area_sqft
|
minimum
|
q1
|
median
|
mean
|
q3
|
maximum
|
na
|
|
1
|
833
|
1,150
|
1,269
|
1,570
|
709,222
|
90
|
quantile(house_df$Area_sqft, probs = c(0.5, 0.95, 0.99, 0.999), na.rm = TRUE) %>%
t() %>%
as.data.frame() %>%
knitr::kable(
caption = "Quantiles: Area_sqft",
digits = 0,
align = "c",
format.args = list(big.mark = ","))%>%
kable_styling(full_width = TRUE)
Quantiles: Area_sqft
|
50%
|
95%
|
99%
|
99.9%
|
|
1,150
|
2,500
|
3,784
|
6,481
|
area_p99 <- quantile(house_df$Area_sqft, 0.99, na.rm = TRUE)
median_area <- median(house_df$Area_sqft, na.rm = TRUE)
# Distribution
p1 <- ggplot(house_df, aes(x = Area_sqft)) +
geom_histogram(aes(y = after_stat(density)), bins = 60,
fill = "black", color = "white", alpha = 0.7, na.rm = TRUE) +
geom_density(color = "navy", linewidth = 1, na.rm = TRUE) +
geom_vline(xintercept = median_area, linetype = "dashed", color = "red") +
coord_cartesian(xlim = c(0, area_p99)) +
labs(title = "Distribution",
x = "Area (sqft)", y = "Density") +
theme_minimal() +
theme(plot.title = element_text(face = "bold", size = 11, hjust = 0.5))
# Density & Spread
p2 <- ggplot(house_df, aes(x = "", y = Area_sqft)) +
geom_violin(fill = "lightblue", color = "steelblue", alpha = 0.5, trim = TRUE) +
geom_boxplot(width = 0.1, fill = "white", outlier.shape = NA) +
coord_cartesian(ylim = c(0, area_p99)) +
labs(title = "Density & Spread",
x = NULL, y = "Area (sqft)") +
scale_y_continuous(labels = comma) +
theme_minimal() +
theme(
plot.title = element_text(face = "bold", size = 11, hjust = 0.5),
axis.text.x = element_blank())
p1 + p2

Area_sqft is right-skewed, with a
median around 1,150 and a very long upper tail (99.9th
percentile ≈ 6,481, max ≈ 709,222), which suggests a mix of genuine
large properties and potential unit or data-entry anomalies.
# b. Price_per_sqft distribution
summary(house_df$Price_per_sqft) %>%
broom::tidy() %>%
knitr::kable(
caption = "Summary Statistics: Price_per_sqft",
digits = 0,
align = "c",
format.args = list(big.mark = ","))%>%
kable_styling(full_width = TRUE)
Summary Statistics: Price_per_sqft
|
minimum
|
q1
|
median
|
mean
|
q3
|
maximum
|
na
|
|
0
|
4,297
|
6,034
|
7,584
|
9,450
|
6,700,000
|
7,981
|
quantile(house_df$Price_per_sqft, probs = c(0.5, 0.95, 0.99, 0.999), na.rm = TRUE) %>%
t() %>%
knitr::kable(
caption = "Quantiles: Price_per_sqft",
digits = 0,
align = "c",
format.args = list(big.mark = ","))%>%
kable_styling(full_width = TRUE)
Quantiles: Price_per_sqft
|
50%
|
95%
|
99%
|
99.9%
|
|
6,034
|
16,111
|
23,810
|
46,429
|
pps_p99 <- quantile(house_df$Price_per_sqft, 0.99, na.rm = TRUE)
median_pps <- median(house_df$Price_per_sqft, na.rm = TRUE)
# Distribution
p1 <- ggplot(house_df, aes(x = Price_per_sqft)) +
geom_histogram(aes(y = after_stat(density)), bins = 60, fill = "seagreen", color = "white", alpha = 0.7, na.rm = TRUE) +
geom_density(color = "darkgreen", linewidth = 1, na.rm = TRUE) +
geom_vline(xintercept = median_pps, linetype = "dashed", color = "darkorange") +
annotate("text", x = median_pps, y = 0, label = paste("Median:", comma(median_pps)), vjust = -1, hjust = -0.1, color = "darkorange", fontface = "bold", size = 3) +
coord_cartesian(xlim = c(0, pps_p99)) +
labs(title = "Distribution",
x = "Price per Sqft", y = "Density") +
scale_x_continuous(labels = comma) +
scale_y_continuous(labels = scales::label_number(accuracy = 0.0001)) +
theme_minimal() +
theme(plot.title = element_text(face = "bold", size = 11, hjust = 0.5))
# Density & Spread
p2 <- ggplot(house_df, aes(x = "", y = Price_per_sqft)) +
geom_violin(fill = "lightgreen", color = "darkgreen", alpha = 0.6, trim = TRUE, na.rm = TRUE) +
geom_boxplot(width = 0.1, fill = "white", outlier.shape = NA, na.rm = TRUE) +
coord_cartesian(ylim = c(0, pps_p99)) +
labs(title = "Density & Spread",
x = NULL, y = "Price per Sqft") +
scale_y_continuous(labels = comma) +
theme_minimal() +
theme(
plot.title = element_text(face = "bold", size = 11, hjust = 0.5),
axis.text.x = element_blank())
p1 + p2

Price_per_sqft shows heavy tails with extreme
upper values. The median is 6,034, while the maximum reaches
6,700,000. Additionally, there is non-trivial missingness with 7,981
missing entries. These factors indicate a highly skewed
distribution with significant data gaps.
4.2.3 Categorical Features
This section analyzes frequency distributions of key categorical
variables includinglocation, Transaction,
Furnishing, facing and Ownership.
The goal is to identify class imbalance and high-cardinality issues.
# a. Frequency overview
cat_vars <- c("location", "Transaction", "Furnishing", "facing", "Ownership")
cat_summary <- purrr::map_dfr(cat_vars, function(v) {
x <- house_df[[v]]
n_total <- length(x)
n_unique <- dplyr::n_distinct(x)
top_counts <- sort(table(x), decreasing = TRUE)
top1_pct <- as.numeric(top_counts[1]) / n_total
top3_pct <- sum(top_counts[1:min(3, length(top_counts))]) / n_total
unknown_pct <- mean(tolower(as.character(x)) == "unknown", na.rm = TRUE)
tibble::tibble(
variable = v,
n_unique = n_unique,
top1_pct = top1_pct,
top3_pct = top3_pct,
unknown_pct = unknown_pct)})
cat_summary %>%
mutate(across(ends_with("_pct"), ~ scales::percent(.x, accuracy = 0.1))) %>%
knitr::kable(
caption = "Summary of Categorical Variables",
align = "lcccc")
Summary of Categorical Variables
| location |
81 |
14.0% |
39.3% |
0.0% |
| Transaction |
5 |
76.3% |
100.0% |
0.0% |
| Furnishing |
4 |
46.6% |
98.8% |
1.2% |
| facing |
9 |
37.0% |
79.5% |
37.0% |
| Ownership |
5 |
59.8% |
97.5% |
34.8% |
location shows an uneven distribution.
While there are 81 unique cities, the top three dominate with
39.3% coverage.
Transaction and Ownership show strong
class concentration. A single category dominates each variable
(76.3% and 59.8% respectively).
facing is notable for its high “Unknown” rate of
37.0%. This makes the placeholder the most frequent category.
Ownership also contains significant placeholders
(34.8%).
# b. Location (Top-N) distribution
house_df_base <- house_df %>%
mutate(
Price_Lac = as.numeric(Price_Lac),
log_price = log1p(Price_Lac)
) %>%
filter(!is.na(Price_Lac), Price_Lac >= 0)
# Identify Top 20 Locations
top_n <- 20
loc_top20_levels <- house_df_base %>%
count(location, sort = TRUE) %>%
slice_head(n = top_n) %>%
pull(location)
# Calculate Distribution
loc_counts <- house_df_base %>%
mutate(location_top20 = if_else(location %in% loc_top20_levels, location, "Other")) %>%
count(location_top20, sort = TRUE) %>%
mutate(pct = n / sum(n))
loc_plot_df <- loc_counts %>%
mutate(location_top20 = reorder(location_top20, pct))
# Visualization
ggplot(loc_plot_df, aes(x = location_top20, y = pct)) +
geom_col(fill = "steelblue") +
geom_text(
aes(label = percent(pct, accuracy = 0.1)),
hjust = -0.05,
size = 3) +
coord_flip() +
scale_y_continuous(
labels = percent_format(accuracy = 1),
expand = expansion(mult = c(0, 0.15))) +
labs(
title = paste0("Top ", top_n, " locations (others grouped)"),
x = NULL,
y = "Share of listings") +
theme_minimal() +
theme(plot.title = element_text(face = "bold", size = 11, hjust = 0.5))

The top five locations make up roughly 56.9% of the
total, showing that the data is heavily concentrated rather than evenly
distributed.
# c. Low-cardinality categorical variables
plot_cat_pct <- function(df, col) {
tmp <- df %>%
count(.data[[col]], sort = TRUE) %>%
mutate(pct = n / sum(n)) %>%
rename(level = 1)
ggplot(tmp, aes(x = reorder(level, pct), y = pct)) +
geom_col(fill = "tomato") +
geom_text(aes(label = percent(pct, accuracy = 0.1)), hjust = -0.05, size = 3) +
coord_flip() +
scale_y_continuous(labels = percent_format(accuracy = 1),
expand = expansion(mult = c(0, 0.3))) +
labs(
title = paste0(col, " distribution"),
x = NULL,
y = "Share") +
theme_minimal() +
theme(plot.title = element_text(face = "bold", size = 11, hjust = 0.5))}
p1 <- plot_cat_pct(house_df, "Transaction")
p2 <- plot_cat_pct(house_df, "Furnishing")
p3 <- plot_cat_pct(house_df, "facing")
p4 <- plot_cat_pct(house_df, "Ownership")
(p1 + p2) / (p3 + p4)

Among low-cardinality variables, Transaction is
highly imbalanced. “Resale” dominates with
76.3%, and the top three categories cover the entire
dataset. Furnishing is similarly concentrated but contains
rare “Unknown” entries accounting for 1.2%. facing and
Ownership show high “Unknown” rates of 37.0% and
34.8% respectively. While these variables appear complete, they
actually contain substantial placeholder information.
4.3 Bivariate Analysis: Determinants of Price
4.3.1 Price vs Area-related Features
This section explores whether Area_sqft has a
linear relationship with Price_Lac. The analysis
focuses on three aspects: the existence of a positive
correlation, the linearity of the trend, and the
price variance among properties of similar sizes.
# a. Raw scale density
house_df %>%
dplyr::mutate(
Price_Lac = as.numeric(Price_Lac),
Area_sqft = as.numeric(Area_sqft)) %>%
dplyr::filter(!is.na(Price_Lac), !is.na(Area_sqft)) %>%
dplyr::filter(Price_Lac >= 0, Area_sqft > 0) %>%
ggplot(aes(x = Area_sqft, y = Price_Lac)) +
geom_bin2d(bins = 60) +
scale_x_continuous(labels = scales::label_number(big.mark = ",")) +
scale_y_continuous(labels = scales::label_number(big.mark = ",")) +
labs(title = "Price vs Area - Raw Scale") +
theme_minimal() +
theme(plot.title = element_text(face = "bold", size = 11, hjust = 0.5))

Transactions cluster heavily in the low area and low price region. A
positive correlation is evident, yet significant
vertical spread suggests area alone cannot fully explain price
variation. This highlights the influence of other factors like
location.
# b.Log scale comparison
ggplot(house_df, aes(x = log1p(Area_sqft), y = log1p(Price_Lac))) +
geom_bin2d(alpha = 0.08, size = 0.8, show.legend = FALSE) +
geom_smooth(aes(color = "Linear"), method = "lm", se = FALSE, linewidth = 0.9) +
geom_smooth(aes(color = "LOESS"), method = "loess", se = FALSE, linewidth = 0.9) +
scale_color_manual(
name = "Fit",
values = c("Linear" = "steelblue", "LOESS" = "tomato")) +
labs(
title = "log1p(Price) vs log1p(Area) - Log Scale",
x = "log1p(Area_sqft)",
y = "log1p(Price_Lac)") +
theme_minimal() +
theme(plot.title = element_text(face = "bold", size = 11, hjust = 0.5))

The log scale reveals a distinct upward diagonal
pattern, indicating a more stable linear structure. However, the
visible vertical thickness of the density band implies that price
variation persists even among properties of similar size. This
suggests area is a strong but incomplete predictor,
pointing to the influence of additional factors like
location and furnishing.
# c. Correlation tests
# Raw scale
pearson_raw <- cor.test(house_df$Area_sqft, house_df$Price_Lac, method = "pearson")
spearman_raw <- cor.test(house_df$Area_sqft, house_df$Price_Lac, method = "spearman")
# Log scale
pearson_log <- cor.test(log1p(house_df$Area_sqft), log1p(house_df$Price_Lac), method = "pearson")
spearman_log <- cor.test(log1p(house_df$Area_sqft), log1p(house_df$Price_Lac), method = "spearman")
knitr::kable(
tibble(
Scale = c("Raw", "Raw", "Log1p", "Log1p"),
Method = c("Pearson", "Spearman", "Pearson", "Spearman"),
Correlation = c(unname(pearson_raw$estimate),
unname(spearman_raw$estimate),
unname(pearson_log$estimate),
unname(spearman_log$estimate)),
P_value = c(pearson_raw$p.value,
spearman_raw$p.value,
pearson_log$p.value,
spearman_log$p.value)) %>%
mutate(
Correlation = round(Correlation, 4),
P_value = signif(P_value, 3)),
col.names = c("Scale", "Method", "Correlation", "$P$"),
escape = FALSE,
align = "c")
| Raw |
Pearson |
0.0609 |
0 |
| Raw |
Spearman |
0.6892 |
0 |
| Log1p |
Pearson |
0.5428 |
0 |
| Log1p |
Spearman |
0.6892 |
0 |
The correlation tests show a very low Pearson correlation on the raw
scale (r≈0.0609) but a high Spearman correlation (ρ≈0.689).
The extremely small p-values confirm these results are statistically
significant. This indicates a strong positive monotonic
relationship between area and price. Meanwhile, the raw-scale
association is not well-described by a simple linear pattern, which is
likely due to heavy tails, outliers, and heteroscedasticity. After
log1p transformation, Pearson increases substantially
(r≈0.543), suggesting the log scale yields a more linear and
stable relationship.
# d. Linear explanatory power via R²
lm_raw <- lm(Price_Lac ~ Area_sqft, data = house_df)
lm_log <- lm(log1p(Price_Lac) ~ log1p(Area_sqft), data = house_df)
knitr::kable(
tibble(
Model = c("Price_Lac ~ Area_sqft",
"log1p(Price_Lac) ~ log1p(Area_sqft)"),
R_squared = c(summary(lm_raw)$r.squared, summary(lm_log)$r.squared),
Adj_R_squared = c(summary(lm_raw)$adj.r.squared, summary(lm_log)$adj.r.squared)) %>%
mutate(across(c(R_squared, Adj_R_squared), ~ round(.x, 4))),
col.names = c("Model", "$R^2$", "Adjusted $R^2$"),
escape = FALSE,
align = "c")
| Price_Lac ~ Area_sqft |
0.0037 |
0.0037 |
| log1p(Price_Lac) ~ log1p(Area_sqft) |
0.2946 |
0.2946 |
The raw-scale regression yields a negligible R² of
0.0037, meaning area alone provides almost no linear explanatory
power for price variation. In contrast, the log-log specification
achieves an R² of 0.295. This substantial improvement
demonstrates a stronger association on the log scale, indicating the
log transform makes the relationship much more linearly
explainable.
4.3.2 Price vs Discrete Features
This section investigates Current_Floor,
Total_Floors,
Bathroom_num,
Balcony_num, and
Parking_num to see if these discrete
features lead to systematic price variations. The goal is to
identify potential premiums for higher counts and
understand the price consistency within each category.
# a. Grouping strategy
house_df <- house_df %>%
mutate(
Price_Lac = as.numeric(Price_Lac),
Area_sqft = as.numeric(Area_sqft),
Current_Floor = as.numeric(Current_Floor),
Total_Floors = as.numeric(Total_Floors),
Bathroom_num = as.numeric(Bathroom_num),
Balcony_num = as.numeric(Balcony_num),
Parking_num = as.numeric(Parking_num)
) %>%
filter(!is.na(Price_Lac), !is.na(Area_sqft), Price_Lac >= 0, Area_sqft > 0) %>%
mutate(log_price = log1p(Price_Lac)) %>%
mutate(
Bathroom_grp = case_when(
is.na(Bathroom_num) ~ NA_character_,
Bathroom_num >= 5 ~ ">=5",
TRUE ~ as.character(as.integer(Bathroom_num))),
Balcony_grp = case_when(
is.na(Balcony_num) ~ NA_character_,
Balcony_num >= 4 ~ ">=4",
TRUE ~ as.character(as.integer(Balcony_num))),
Parking_capped = case_when(
is.na(Parking_num) ~ NA_real_,
Parking_num > 10 ~ 10,
TRUE ~ Parking_num),
Parking_grp = case_when(
is.na(Parking_capped) ~ NA_character_,
Parking_capped >= 3 ~ ">=3",
TRUE ~ as.character(as.integer(Parking_capped))),
CurrentFloor_grp = case_when(
is.na(Current_Floor) ~ NA_character_,
Current_Floor == 0 ~ "0",
Current_Floor <= 2 ~ "1-2",
Current_Floor <= 5 ~ "3-5",
Current_Floor <= 10 ~ "6-10",
Current_Floor <= 20 ~ "11-20",
TRUE ~ ">20"),
TotalFloors_grp = case_when(
is.na(Total_Floors) ~ NA_character_,
Total_Floors <= 5 ~ "<=5",
Total_Floors <= 10 ~ "6-10",
Total_Floors <= 20 ~ "11-20",
TRUE ~ ">20"),
Floor_Ratio = if_else(!is.na(Current_Floor) & !is.na(Total_Floors) & Total_Floors > 0,
pmin(pmax(Current_Floor / Total_Floors, 0), 1), NA_real_),
FloorPos_grp = case_when(
is.na(Floor_Ratio) ~ NA_character_,
Floor_Ratio <= 0.30 ~ "Bottom",
Floor_Ratio <= 0.75 ~ "Middle",
TRUE ~ "Top")
) %>%
mutate(
Bathroom_grp = factor(Bathroom_grp, levels = c("1", "2", "3", "4", ">=5")),
Balcony_grp = factor(Balcony_grp, levels = c("1", "2", "3", ">=4")),
Parking_grp = factor(Parking_grp, levels = c("1", "2", ">=3")),
CurrentFloor_grp = factor(CurrentFloor_grp, levels = c("0","1-2","3-5","6-10","11-20",">20")),
TotalFloors_grp = factor(TotalFloors_grp, levels = c("<=5","6-10","11-20",">20")),
FloorPos_grp = factor(FloorPos_grp, levels = c("Bottom", "Middle", "Top")))
plot_group_dist <- function(col_name, plot_title, x_label) {
house_df %>%
filter(!is.na(.data[[col_name]])) %>%
count(.data[[col_name]]) %>%
ggplot(aes(x = .data[[col_name]], y = n)) +
geom_col(fill = "steelblue") +
geom_text(aes(label = comma(n)), vjust = -0.5, size = 3) +
labs(title = plot_title, x = x_label, y = "Count") +
scale_y_continuous(labels = comma, expand = expansion(mult = c(0, 0.2))) +
theme_minimal() +
theme(
plot.title = element_text(size = 11, face = "bold", hjust = 0.5),
axis.text.x = element_text(angle = 0))}
p1 <- plot_group_dist("Bathroom_grp", "Bathrooms", "Number of Bathrooms")
p2 <- plot_group_dist("Balcony_grp", "Balconies", "Number of Balconies")
p3 <- plot_group_dist("Parking_grp", "Parking Spaces", "Number of Parking Spots")
p4 <- plot_group_dist("FloorPos_grp", "Floor Position", "Relative Floor Level")
(p1 + p2) / (p3 + p4)

Key price, area, floor, and amenity fields were coerced to numeric
types, and observations containing missing or invalid values were
excluded.Bathrooms were grouped into categories of 1–4 and \(\ge 5\), and balconies into 1–3 and \(\ge 4\). Parking spaces were capped and
categorized into 1, 2, and \(\ge
3\).Current and total floors were grouped into 0, 1–2, 3–5, 6–10,
11–20, and >20, while the ratio of current to total floors was
classified into Bottom, Middle, and Top categories. Finally, all grouped
variables were cast as ordered factors.
# b. Price vs Bathrooms
price_p99 <- quantile(house_df$Price_Lac, 0.99, na.rm = TRUE)
# Raw Scale
p_raw <- ggplot(house_df %>% filter(!is.na(Bathroom_grp)),
aes(x = Bathroom_grp, y = Price_Lac, fill = Bathroom_grp)) +
geom_boxplot(outlier.shape = NA, alpha = 0.7) +
stat_summary(fun = mean, geom = "point", shape = 18, size = 3, color = "white") +
coord_cartesian(ylim = c(0, price_p99)) +
scale_fill_brewer(palette = "Blues") +
labs(title = "Price by Bathrooms (Raw Scale)",
x = "Number of Bathrooms",
y = "Price (Lac)") +
theme_minimal() +
theme(
plot.title = element_text(face = "bold", size = 11, hjust = 0.5),
legend.position = "none")
# Log Scale
p_log <- ggplot(house_df %>% filter(!is.na(Bathroom_grp)),
aes(x = Bathroom_grp, y = log_price, fill = Bathroom_grp)) +
geom_boxplot(outlier.alpha = 0.3, alpha = 0.7) +
scale_fill_brewer(palette = "Blues") +
labs(title = "Log Price by Bathrooms",
x = "Number of Bathrooms",
y = "log1p(Price)") +
theme_minimal() +
theme(
plot.title = element_text(face = "bold", size = 11, hjust = 0.5),
legend.position = "none")
p_raw + p_log

# Kruskal-Wallis Test
kruskal.test(Price_Lac ~ Bathroom_grp, data = house_df) %>%
broom::tidy() %>%
dplyr::select(statistic, parameter, p.value) %>%
knitr::kable(
caption = "Kruskal-Wallis Test: Price by Bathroom Group",
col.names = c("$\\chi^2$", "DF", "$P$"),
escape = FALSE,
digits = 3,
align = "c",
format.args = list(big.mark = ",")) %>%
kable_styling(full_width = TRUE)
Kruskal-Wallis Test: Price by Bathroom Group
|
\(\chi^2\)
|
DF
|
\(P\)
|
|
86,635.5
|
4
|
0
|
# Summary Table
house_df %>%
filter(!is.na(Bathroom_grp)) %>%
group_by(Bathroom_grp) %>%
summarise(
n = n(),
median_price = median(Price_Lac, na.rm = TRUE),
median_log_price = median(log_price, na.rm = TRUE),
.groups = "drop") %>%
knitr::kable(
caption = "Price Distribution by Bathroom Group",
col.names = c("Bathroom Group", "Count", "Median Price (Lac)", "Median Log Price"),
digits = c(0, 0, 1, 3),
format.args = list(big.mark = ","))
Price Distribution by Bathroom Group
| 1 |
15,494 |
25 |
3.258 |
| 2 |
89,709 |
60 |
4.111 |
| 3 |
54,166 |
121 |
4.804 |
| 4 |
14,823 |
235 |
5.464 |
| >=5 |
3,565 |
500 |
6.217 |
The raw-scale boxplot (truncated at P99) reveals a distinct
stepwise price increase, with medians rising from 25 Lac (1
bath) to 500 Lac (≥5 baths). This positive trend
remains robust on the log scale, indicating it is not driven by
outliers.The Kruskal–Wallis test is highly significant (χ² ≈ 86635.50,P
≪ 0.001), confirming statistically meaningful distribution differences
and supporting bathrooms as an important discrete determinant of
price.
# c. Price vs Balconies
price_p99 <- quantile(house_df$Price_Lac, 0.99, na.rm = TRUE)
# Raw Scale
p_raw <- ggplot(house_df %>% filter(!is.na(Balcony_grp)),
aes(x = Balcony_grp, y = Price_Lac, fill = Balcony_grp)) +
geom_boxplot(outlier.shape = NA, alpha = 0.7) +
stat_summary(fun = mean, geom = "point", shape = 18, size = 3, color = "white") +
coord_cartesian(ylim = c(0, price_p99)) +
scale_fill_brewer(palette = "Greens") +
labs(title = "Price by Balconies (Raw Scale)",
x = "Number of Balconies",
y = "Price (Lac)") +
theme_minimal() +
theme(
plot.title = element_text(face = "bold", size = 11, hjust = 0.5),
legend.position = "none")
# Log Scale
p_log <- ggplot(house_df %>% filter(!is.na(Balcony_grp)),
aes(x = Balcony_grp, y = log_price, fill = Balcony_grp)) +
geom_boxplot(outlier.alpha = 0.3, alpha = 0.7) +
scale_fill_brewer(palette = "Greens") +
labs(title = "Log Price by Balconies",
x = "Number of Balconies",
y = "log1p(Price)") +
theme_minimal() +
theme(
plot.title = element_text(face = "bold", size = 11, hjust = 0.5),
legend.position = "none")
p_raw + p_log

# Kruskal-Wallis Test
kruskal.test(Price_Lac ~ Balcony_grp, data = house_df) %>%
broom::tidy() %>%
dplyr::select(statistic, parameter, p.value) %>%
knitr::kable(
caption = "Kruskal-Wallis Test: Price by Balcony Group",
col.names = c("$\\chi^2$", "DF", "$P$"),
escape = FALSE,
digits = 3,
align = "c",
format.args = list(big.mark = ",")) %>%
kable_styling(full_width = TRUE)
Kruskal-Wallis Test: Price by Balcony Group
|
\(\chi^2\)
|
DF
|
\(P\)
|
|
23,750.15
|
3
|
0
|
# Summary Table
house_df %>%
filter(!is.na(Balcony_grp)) %>%
group_by(Balcony_grp) %>%
summarise(
n = n(),
median_price = median(Price_Lac, na.rm = TRUE),
median_log_price = median(log_price, na.rm = TRUE),
.groups = "drop") %>%
knitr::kable(
caption = "Price Distribution by Balcony Group",
col.names = c("Balcony Group", "Count", "Median Price (Lac)", "Median Log Price"),
digits = c(0, 0, 1, 3),
format.args = list(big.mark = ","))
Price Distribution by Balcony Group
| 1 |
45,125 |
55 |
4.025 |
| 2 |
97,410 |
78 |
4.369 |
| 3 |
24,953 |
135 |
4.913 |
| >=4 |
10,269 |
210 |
5.352 |
The raw-scale boxplot (truncated at P99) shows price distributions
shifting upward with more balconies, with medians rising from 55
Lac (1 balcony) to 210 Lac (≥4). This positive
association remains robust on the log scale, unaffected by outliers. A
significant Kruskal–Wallis test (χ² ≈ 23750.15, P ≪ 0.001) confirms
statistically significant group differences, supporting balcony
count as an informative indicator of price levels.
# d. Price vs Parking
price_p99 <- quantile(house_df$Price_Lac, 0.99, na.rm = TRUE)
# Raw Scale
p_raw <- ggplot(house_df %>% filter(!is.na(Parking_grp)),
aes(x = Parking_grp, y = Price_Lac, fill = Parking_grp)) +
geom_boxplot(outlier.shape = NA, alpha = 0.7) +
stat_summary(fun = mean, geom = "point", shape = 18, size = 3, color = "white") +
coord_cartesian(ylim = c(0, price_p99)) +
scale_fill_brewer(palette = "Purples") +
labs(title = "Price by Parking Spaces (Raw Scale)",
x = "Number of Parking Spots",
y = "Price (lac)") +
theme_minimal() +
theme(
plot.title = element_text(face = "bold", size = 11, hjust = 0.5),
legend.position = "none")
# Log Scale
p_log <- ggplot(house_df %>% filter(!is.na(Parking_grp)),
aes(x = Parking_grp, y = log_price, fill = Parking_grp)) +
geom_boxplot(outlier.alpha = 0.3, alpha = 0.7) +
scale_fill_brewer(palette = "Purples") +
labs(title = "Log Price by Parking Spaces",
x = "Number of Parking Spots",
y = "log1p(Price)") +
theme_minimal() +
theme(
plot.title = element_text(face = "bold", size = 11, hjust = 0.5),
legend.position = "none")
p_raw + p_log

# Kruskal-Wallis Test
kruskal.test(Price_Lac ~ Parking_grp, data = house_df) %>%
broom::tidy() %>%
dplyr::select(statistic, parameter, p.value) %>%
knitr::kable(
caption = "Kruskal-Wallis Test: Price by Parking Group",
col.names = c("$\\chi^2$", "DF", "$P$"),
escape = FALSE,
digits = 3,
align = "c",
format.args = list(big.mark = ",")) %>%
kable_styling(full_width = TRUE)
Kruskal-Wallis Test: Price by Parking Group
|
\(\chi^2\)
|
DF
|
\(P\)
|
|
23,466.89
|
2
|
0
|
# Summary Table
house_df %>%
filter(!is.na(Parking_grp)) %>%
group_by(Parking_grp) %>%
summarise(
n = n(),
median_price = median(Price_Lac, na.rm = TRUE),
median_log_price = median(log_price, na.rm = TRUE),
.groups = "drop"
) %>%
knitr::kable(
caption = "Price Distribution by Parking Group",
col.names = c("Parking Group", "Count", "Median Price (Lac)", "Median Log Price"),
digits = c(0, 0, 1, 3),
format.args = list(big.mark = ","))
Price Distribution by Parking Group
| 1 |
158,178 |
70 |
4.263 |
| 2 |
16,546 |
270 |
5.602 |
| >=3 |
3,033 |
58 |
4.078 |
The raw-scale boxplot (truncated at P99) reveals pronounced but
non-monotonic price differences where medians peak at 2 spots (270 Lac),
significantly exceeding both the 1 spot (70 Lac) and \(\ge 3\) spots (58 Lac) groups. This pattern
remains robust on the log scale, although the lower median for the ≥3
group suggests potential sample heterogeneity requiring cautious
interpretation. A significant Kruskal–Wallis test (\(\chi^2 \approx 23466.89\), \(P \ll 0.001\)) confirms that parking
availability is a statistically significant indicator of price
distribution.
# e. Price vs Floors
# Current Floor
p_curr <- ggplot(house_df %>% filter(!is.na(CurrentFloor_grp)),
aes(x = CurrentFloor_grp, y = log_price, fill = CurrentFloor_grp)) +
geom_boxplot(outlier.alpha = 0.3, alpha = 0.7) +
scale_fill_brewer(palette = "Oranges") +
labs(title = "Log Price by Current Floor",
x = "Current Floor Level",
y = "log1p(Price)") +
theme_minimal() +
theme(
plot.title = element_text(face = "bold", size = 11, hjust = 0.5),
legend.position = "none")
# Total Floors
p_tot <- ggplot(house_df %>% filter(!is.na(TotalFloors_grp)),
aes(x = TotalFloors_grp, y = log_price, fill = TotalFloors_grp)) +
geom_boxplot(outlier.alpha = 0.3, alpha = 0.7) +
scale_fill_brewer(palette = "Oranges") +
labs(title = "Log Price by Total Floors",
x = "Total Floors Category",
y = NULL) +
theme_minimal() +
theme(
plot.title = element_text(face = "bold", size = 11, hjust = 0.5),
legend.position = "none")
# Floor Position
p_pos <- ggplot(house_df %>% filter(!is.na(FloorPos_grp)),
aes(x = FloorPos_grp, y = log_price, fill = FloorPos_grp)) +
geom_boxplot(outlier.alpha = 0.3, alpha = 0.7) +
scale_fill_brewer(palette = "Oranges") +
labs(title = "Log Price by Position",
x = "Relative Position",
y = NULL) +
theme_minimal() +
theme(
plot.title = element_text(face = "bold", size = 11, hjust = 0.5),
legend.position = "none")
p_curr + p_tot + p_pos

# Kruskal-Wallis Test (Combined)
kw_curr <- kruskal.test(Price_Lac ~ CurrentFloor_grp, data = house_df)
kw_tot <- kruskal.test(Price_Lac ~ TotalFloors_grp, data = house_df)
kw_pos <- kruskal.test(Price_Lac ~ FloorPos_grp, data = house_df)
dplyr::bind_rows(
broom::tidy(kw_curr) %>% mutate(Variable = "Current Floor"),
broom::tidy(kw_tot) %>% mutate(Variable = "Total Floors"),
broom::tidy(kw_pos) %>% mutate(Variable = "Floor Position")) %>%
dplyr::select(Variable, statistic, parameter, p.value) %>%
knitr::kable(
caption = "Kruskal-Wallis Test: Floor Variables",
col.names = c("Variable", "$\\chi^2$", "DF", "$P$"),
escape = FALSE,
digits = 3,
align = "c",
format.args = list(big.mark = ",")) %>%
kable_styling(full_width = TRUE)
Kruskal-Wallis Test: Floor Variables
|
Variable
|
\(\chi^2\)
|
DF
|
\(P\)
|
|
Current Floor
|
17,372.029
|
5
|
0
|
|
Total Floors
|
17,642.590
|
3
|
0
|
|
Floor Position
|
556.269
|
2
|
0
|
# Current Floor Summary
house_df %>%
filter(!is.na(CurrentFloor_grp)) %>%
group_by(CurrentFloor_grp) %>%
summarise(
n = n(),
median_price = median(Price_Lac, na.rm = TRUE),
median_log_price = median(log_price, na.rm = TRUE),
.groups = "drop") %>%
knitr::kable(
caption = "Price Distribution by Current Floor Group",
col.names = c("Current Floor Grp", "Count", "Median Price (Lac)", "Median Log Price"),
digits = c(0, 0, 1, 3),
format.args = list(big.mark = ","))
Price Distribution by Current Floor Group
| 0 |
10,577 |
71 |
4.277 |
| 1-2 |
61,950 |
65 |
4.190 |
| 3-5 |
53,971 |
69 |
4.248 |
| 6-10 |
26,624 |
115 |
4.754 |
| 11-20 |
15,633 |
120 |
4.796 |
| >20 |
1,704 |
310 |
5.740 |
# Total Floors Summary
house_df %>%
filter(!is.na(TotalFloors_grp)) %>%
group_by(TotalFloors_grp) %>%
summarise(
n = n(),
median_price = median(Price_Lac, na.rm = TRUE),
median_log_price = median(log_price, na.rm = TRUE),
.groups = "drop") %>%
knitr::kable(
caption = "Price Distribution by Total Floors Group",
col.names = c("Total Floors Grp", "Count", "Median Price (Lac)", "Median Log Price"),
digits = c(0, 0, 1, 3),
format.args = list(big.mark = ","))
Price Distribution by Total Floors Group
| <=5 |
85,888 |
61 |
4.127 |
| 6-10 |
38,640 |
85 |
4.454 |
| 11-20 |
33,472 |
90 |
4.511 |
| >20 |
12,459 |
170 |
5.142 |
# Floor Position Summary
house_df %>%
filter(!is.na(FloorPos_grp)) %>%
group_by(FloorPos_grp) %>%
summarise(
n = n(),
median_price = median(Price_Lac, na.rm = TRUE),
median_log_price = median(log_price, na.rm = TRUE),
.groups = "drop") %>%
knitr::kable(
caption = "Price Distribution by Floor Position",
col.names = c("Floor Position Grp", "Count", "Median Price (Lac)", "Median Log Price"),
digits = c(0, 0, 1, 3),
format.args = list(big.mark = ","))
Price Distribution by Floor Position
| Bottom |
45,000 |
75 |
4.331 |
| Middle |
86,355 |
80 |
4.394 |
| Top |
39,104 |
72 |
4.290 |
Log-scale boxplots reveal clear price stratification
where median prices rise from 65–71 Lac for low floors to 115–120 Lac
for mid-high floors, jumping to 310 Lac for the >20
group. Total building height likewise exhibits a consistent
high-rise premium with medians increasing from 61 Lac (≤5) to 170
Lac (>20). Highly significant Kruskal–Wallis tests for Current Floor
(χ² ≈ 17372.03) and Total Floors (χ² ≈ 17642.59) confirm the
statistical impact of height on price. In contrast, Floor
Position shows significant but narrower median gaps (72–80 Lac, χ² ≈
556.27), suggesting the premium is driven more by absolute height and
scarcity than by relative position.
4.3.3 Price vs Categorical Variables
This section investigates systematic price differences across
categorical variables. Using log1p(Price_Lac)
distributions, Kruskal–Wallis tests, and
median comparisons, price levels associated with market
segmentation factors are identified. location_top20 serves
as the primary baseline, while Transaction and
Furnishing are utilized to assess specific premiums. Due to
high “Unknown” rates in facing (36.96%) and
Ownership (34.82%), results for these variables are
reported as supplementary, interpreting “Unknown” as missing data rather
than a valid business category.
# a. Location effect: Top 20 + Other
df_loc <- house_df_base %>%
mutate(location_top20 = if_else(location %in% loc_top20_levels, location, "Other"))
loc_order <- df_loc %>%
group_by(location_top20) %>%
summarise(med_log = median(log_price, na.rm = TRUE), .groups = "drop") %>%
arrange(med_log) %>%
pull(location_top20)
# Visualization
df_loc %>%
mutate(location_top20 = factor(location_top20, levels = loc_order)) %>%
ggplot(aes(x = location_top20, y = log_price, fill = location_top20)) +
geom_boxplot(outlier.alpha = 0.2, alpha = 0.75) +
coord_flip() +
labs(
title = "log1p(Price) by Location (Top 20 + Other)",
x = NULL,
y = "log1p(Price_Lac)") +
theme_minimal() +
theme(
plot.title = element_text(face = "bold", size = 11, hjust = 0.5),
legend.position = "none")

# Kruskal-Wallis Test
kruskal.test(Price_Lac ~ location_top20, data = df_loc) %>%
broom::tidy() %>%
dplyr::select(statistic, parameter, p.value) %>%
knitr::kable(
caption = "Kruskal-Wallis Test: Price by Location (Top20+Other)",
col.names = c("$\\chi^2$", "DF", "$P$"),
escape = FALSE,
digits = 3,
align = "c",
format.args = list(big.mark = ",")) %>%
kable_styling(full_width = TRUE)
Kruskal-Wallis Test: Price by Location (Top20+Other)
|
\(\chi^2\)
|
DF
|
\(P\)
|
|
35,054.12
|
20
|
0
|
# Summary Table
df_loc %>%
group_by(location_top20) %>%
summarise(
n = n(),
median_price = median(Price_Lac, na.rm = TRUE),
median_log_price = median(log_price, na.rm = TRUE),
.groups = "drop") %>%
mutate(pct = n / sum(n)) %>%
arrange(desc(n)) %>%
mutate(pct = scales::percent(pct, accuracy = 0.1)) %>%
knitr::kable(
caption = "Location (Top20+Other): Share and Price Summary",
col.names = c("Location Group", "Count", "Median Price (Lac)", "Median Log Price", "Share"),
digits = c(0, 0, 1, 3, 0),
format.args = list(big.mark = ",")) %>%
kable_styling(full_width = TRUE)
Location (Top20+Other): Share and Price Summary
|
Location Group
|
Count
|
Median Price (Lac)
|
Median Log Price
|
Share
|
|
new-delhi
|
24,945
|
160.0
|
5.081
|
14.0%
|
|
bangalore
|
23,262
|
85.0
|
4.454
|
13.1%
|
|
kolkata
|
21,605
|
51.0
|
3.951
|
12.1%
|
|
Other
|
21,439
|
53.0
|
3.989
|
12.1%
|
|
gurgaon
|
18,846
|
157.0
|
5.063
|
10.6%
|
|
ahmedabad
|
12,614
|
70.0
|
4.263
|
7.1%
|
|
hyderabad
|
11,147
|
81.8
|
4.416
|
6.3%
|
|
chennai
|
10,163
|
69.0
|
4.248
|
5.7%
|
|
jaipur
|
7,867
|
51.0
|
3.951
|
4.4%
|
|
greater-noida
|
4,490
|
64.0
|
4.174
|
2.5%
|
|
faridabad
|
3,733
|
75.0
|
4.331
|
2.1%
|
|
vadodara
|
2,361
|
39.0
|
3.689
|
1.3%
|
|
surat
|
2,180
|
56.9
|
4.058
|
1.2%
|
|
pune
|
2,177
|
70.0
|
4.263
|
1.2%
|
|
thane
|
1,869
|
70.0
|
4.263
|
1.1%
|
|
mumbai
|
1,814
|
210.0
|
5.352
|
1.0%
|
|
visakhapatnam
|
1,729
|
55.0
|
4.025
|
1.0%
|
|
mohali
|
1,479
|
65.0
|
4.190
|
0.8%
|
|
zirakpur
|
1,478
|
63.0
|
4.159
|
0.8%
|
|
chandigarh
|
1,402
|
72.0
|
4.290
|
0.8%
|
|
noida
|
1,247
|
85.0
|
4.454
|
0.7%
|
location_top20 (Top20+Other) strongly
differentiates prices as confirmed by a significant
Kruskal–Wallis test (χ² ≈ 35054.12, P ≪ 0.001). Within the Top 20,
medians range widely from 210 Lac in Mumbai to 39 Lac
in Vadodara, reflecting a substantial city-tier
premium. The “Other” category, with a 12.1% share and 53 Lac
median, aggregates smaller cities to represent a group average rather
than specific local market dynamics.
# b. Transaction effect
tx_order <- house_df %>%
filter(!is.na(Transaction)) %>%
group_by(Transaction) %>%
summarise(med_log = median(log_price, na.rm = TRUE), .groups = "drop") %>%
arrange(med_log) %>%
pull(Transaction)
# Visualization
house_df %>%
filter(!is.na(Transaction)) %>%
mutate(Transaction = factor(Transaction, levels = tx_order)) %>%
ggplot(aes(x = Transaction, y = log_price, fill = Transaction)) +
geom_boxplot(outlier.alpha = 0.2, alpha = 0.75) +
labs(
title = "log1p(Price) by Transaction",
x = "Transaction",
y = "log1p(Price_Lac)") +
theme_minimal() +
theme(
plot.title = element_text(face = "bold", size = 11, hjust = 0.5),
legend.position = "none")

# Kruskal-Wallis Test
kruskal.test(Price_Lac ~ Transaction, data = house_df) %>%
broom::tidy() %>%
dplyr::select(statistic, parameter, p.value) %>%
knitr::kable(
caption = "Kruskal-Wallis Test: Price by Transaction",
col.names = c("$\\chi^2$", "DF", "$P$"),
escape = FALSE,
digits = 3,
align = "c",
format.args = list(big.mark = ",")) %>%
kableExtra::kable_styling(full_width = TRUE)
Kruskal-Wallis Test: Price by Transaction
|
\(\chi^2\)
|
DF
|
\(P\)
|
|
2,941.07
|
4
|
0
|
# Summary Table
house_df %>%
filter(!is.na(Transaction)) %>%
group_by(Transaction) %>%
summarise(
n = n(),
median_price = median(Price_Lac, na.rm = TRUE),
median_log_price = median(log_price, na.rm = TRUE),
.groups = "drop") %>%
arrange(desc(n)) %>%
knitr::kable(
caption = "Price Summary by Transaction",
col.names = c("Transaction", "Count", "Median Price (Lac)", "Median Log Price"),
digits = c(0, 0, 1, 3),
format.args = list(big.mark = ","))
Price Summary by Transaction
| Resale |
135,562 |
75.0 |
4.331 |
| New Property |
41,423 |
93.0 |
4.543 |
| Other |
703 |
45.0 |
3.829 |
| Unknown |
67 |
56.0 |
4.043 |
| Rent/Lease |
2 |
52.5 |
3.826 |
Transaction type significantly influences house prices as confirmed
by a Kruskal–Wallis test showing statistically distinct distributions
(\(\chi^2 \approx 2941.07\), \(P \ll 0.001\)). Median comparisons reveal a
clear premium for new properties where New Property listings (93 Lac)
generally exceed Resale (75 Lac). However, results for categories such
as Other and Rent-Lease should be viewed as exploratory due to
negligible sample sizes.
# c. Furnishing effect
fur_order <- house_df %>%
filter(!is.na(Furnishing)) %>%
group_by(Furnishing) %>%
summarise(med_log = median(log_price, na.rm = TRUE), .groups = "drop") %>%
arrange(med_log) %>%
pull(Furnishing)
# Visualization
house_df %>%
filter(!is.na(Furnishing)) %>%
mutate(Furnishing = factor(Furnishing, levels = fur_order)) %>%
ggplot(aes(x = Furnishing, y = log_price, fill = Furnishing)) +
geom_boxplot(outlier.alpha = 0.2, alpha = 0.75) +
labs(
title = "log1p(Price) by Furnishing",
x = "Furnishing",
y = "log1p(Price_Lac)") +
theme_minimal() +
theme(
plot.title = element_text(face = "bold", size = 11, hjust = 0.5),
legend.position = "none")

# Kruskal-Wallis Test
kruskal.test(Price_Lac ~ Furnishing, data = house_df) %>%
broom::tidy() %>%
dplyr::select(statistic, parameter, p.value) %>%
knitr::kable(
caption = "Kruskal-Wallis Test: Price by Furnishing",
col.names = c("$\\chi^2$", "DF", "$P$"),
escape = FALSE,
digits = 3,
align = "c",
format.args = list(big.mark = ",")) %>%
kable_styling(full_width = TRUE)
Kruskal-Wallis Test: Price by Furnishing
|
\(\chi^2\)
|
DF
|
\(P\)
|
|
2,984.756
|
3
|
0
|
# Summary Table
house_df %>%
filter(!is.na(Furnishing)) %>%
group_by(Furnishing) %>%
summarise(
n = n(),
median_price = median(Price_Lac, na.rm = TRUE),
median_log_price = median(log_price, na.rm = TRUE),
.groups = "drop") %>%
arrange(desc(n)) %>%
knitr::kable(
caption = "Price Summary by Furnishing",
col.names = c("Furnishing", "Count", "Median Price (Lac)", "Median Log Price"),
digits = c(0, 0, 1, 3),
format.args = list(big.mark = ","))
Price Summary by Furnishing
| Semi-Furnished |
82,832 |
87.6 |
4.484 |
| Unfurnished |
73,458 |
70.0 |
4.263 |
| Furnished |
19,404 |
75.0 |
4.331 |
| Unknown |
2,063 |
52.0 |
3.970 |
The Kruskal-Wallis test confirms a statistically significant
difference in house price distributions across furnishing statuses
(\(\chi^2 \approx 2984.756\), \(P \ll 0.001\)). Regarding median prices,
semi-furnished properties rank highest at approximately 87.5 Lac,
followed by furnished units at 75 Lac and unfurnished ones at 70 Lac,
indicating a clear premium associated with furnishing configurations. In
contrast, the unknown category presents a notably lower median of 52
Lac. Given its smaller sample size (n=2067), this likely reflects
heterogeneity due to missing information.
# d. Facing effect
facing_order <- house_df %>%
filter(!is.na(facing)) %>%
group_by(facing) %>%
summarise(med_log = median(log_price, na.rm = TRUE), .groups = "drop") %>%
arrange(med_log) %>%
pull(facing)
# Visualization
house_df %>%
filter(!is.na(facing)) %>%
mutate(facing = factor(facing, levels = facing_order)) %>%
ggplot(aes(x = facing, y = log_price, fill = facing)) +
geom_boxplot(outlier.alpha = 0.2, alpha = 0.75) +
labs(
title = "log1p(Price) by Facing",
x = "Facing",
y = "log1p(Price_Lac)") +
theme_minimal() +
theme(
plot.title = element_text(face = "bold", size = 11, hjust = 0.5),
legend.position = "none")

# Kruskal-Wallis Test
kruskal.test(Price_Lac ~ facing, data = house_df) %>%
broom::tidy() %>%
dplyr::select(statistic, parameter, p.value) %>%
knitr::kable(
caption = "Kruskal-Wallis Test: Price by Facing",
col.names = c("$\\chi^2$", "DF", "$P$"),
escape = FALSE,
digits = 3,
align = "c",
format.args = list(big.mark = ",")) %>%
kable_styling(full_width = TRUE)
Kruskal-Wallis Test: Price by Facing
|
\(\chi^2\)
|
DF
|
\(P\)
|
|
14,554.94
|
8
|
0
|
# Summary Table
house_df %>%
filter(!is.na(facing)) %>%
group_by(facing) %>%
summarise(
n = n(),
median_price = median(Price_Lac, na.rm = TRUE),
median_log_price = median(log_price, na.rm = TRUE),
.groups = "drop") %>%
arrange(desc(n)) %>%
knitr::kable(
caption = "Price Summary by Facing",
col.names = c("Facing", "Count", "Median Price (Lac)", "Median Log Price"),
digits = c(0, 0, 1, 3),
format.args = list(big.mark = ",")) %>%
kable_styling(full_width = TRUE)
Price Summary by Facing
|
Facing
|
Count
|
Median Price (Lac)
|
Median Log Price
|
|
Unknown
|
65,659
|
62.0
|
4.143
|
|
East
|
52,238
|
85.0
|
4.454
|
|
North - East
|
23,348
|
121.0
|
4.804
|
|
North
|
15,301
|
83.0
|
4.431
|
|
West
|
8,471
|
75.0
|
4.331
|
|
South
|
4,315
|
136.0
|
4.920
|
|
North - West
|
3,803
|
160.0
|
5.081
|
|
South - East
|
2,570
|
65.5
|
4.197
|
|
South -West
|
2,052
|
78.0
|
4.369
|
# Kruskal-Wallis Test (excluding 'Unknown')
kruskal.test(Price_Lac ~ facing,
data = house_df %>% filter(!is.na(facing), facing != "Unknown")) %>%
broom::tidy() %>%
dplyr::select(statistic, parameter, p.value) %>%
knitr::kable(
caption = "Kruskal-Wallis Test: Price by Facing (Excluding Unknown)",
col.names = c("$\\chi^2$", "DF", "$P$"),
escape = FALSE,
digits = 3,
align = "c",
format.args = list(big.mark = ",")) %>%
kable_styling(full_width = TRUE)
Kruskal-Wallis Test: Price by Facing (Excluding Unknown)
|
\(\chi^2\)
|
DF
|
\(P\)
|
|
3,677.443
|
7
|
0
|
Due to the high Unknown rate of 36.96% in facing, this
category is treated as missing information and the results are reported
as supplementary. The Kruskal-Wallis test reveals significant
differences in price distributions across orientations (\(\chi^2 \approx 14554.94\), \(P \ll 0.001\)), and these differences
remain significant even after excluding the Unknown category (\(\chi^2 \approx 3677.443\), \(P \ll 0.001\)). Median prices are higher
for North-West and South facing properties at approximately 160 Lac and
136 Lac respectively, while the Unknown group sits considerably lower at
62 Lac.
# e.Ownership effect
own_order <- house_df %>%
filter(!is.na(Ownership)) %>%
group_by(Ownership) %>%
summarise(med_log = median(log_price, na.rm = TRUE), .groups = "drop") %>%
arrange(med_log) %>%
pull(Ownership)
# Visualization
house_df %>%
filter(!is.na(Ownership)) %>%
mutate(Ownership = factor(Ownership, levels = own_order)) %>%
ggplot(aes(x = Ownership, y = log_price, fill = Ownership)) +
geom_boxplot(outlier.alpha = 0.2, alpha = 0.75) +
coord_flip() +
labs(
title = "log1p(Price) by Ownership",
x = "Ownership",
y = "log1p(Price_Lac)") +
theme_minimal() +
theme(
plot.title = element_text(face = "bold", size = 11, hjust = 0.5),
legend.position = "none")

# Kruskal-Wallis Test
kruskal.test(Price_Lac ~ Ownership, data = house_df) %>%
broom::tidy() %>%
dplyr::select(statistic, parameter, p.value) %>%
knitr::kable(
caption = "Kruskal-Wallis Test: Price by Ownership",
col.names = c("$\\chi^2$", "DF", "$P$"),
escape = FALSE,
digits = 3,
align = "c",
format.args = list(big.mark = ",")) %>%
kableExtra::kable_styling(full_width = TRUE)
Kruskal-Wallis Test: Price by Ownership
|
\(\chi^2\)
|
DF
|
\(P\)
|
|
12,582.87
|
4
|
0
|
# Summary Table
house_df %>%
filter(!is.na(Ownership)) %>%
group_by(Ownership) %>%
summarise(
n = n(),
median_price = median(Price_Lac, na.rm = TRUE),
median_log_price = median(log_price, na.rm = TRUE),
.groups = "drop"
) %>%
arrange(desc(n)) %>%
knitr::kable(
caption = "Price Summary by Ownership",
col.names = c("Ownership", "Count", "Median Price (Lac)", "Median Log Price"),
digits = c(0, 0, 1, 3),
format.args = list(big.mark = ",")
) %>%
kableExtra::kable_styling(full_width = TRUE)
Price Summary by Ownership
|
Ownership
|
Count
|
Median Price (Lac)
|
Median Log Price
|
|
Freehold
|
106,397
|
88.0
|
4.489
|
|
Unknown
|
61,855
|
64.0
|
4.174
|
|
Leasehold
|
5,149
|
91.8
|
4.530
|
|
Co-operative Society
|
3,344
|
55.0
|
4.025
|
|
Power Of Attorney
|
1,012
|
163.0
|
5.100
|
# Kruskal-Wallis Test (Excluding 'Unknown')
kruskal.test(Price_Lac ~ Ownership,
data = house_df %>% filter(!is.na(Ownership), Ownership != "Unknown")) %>%
broom::tidy() %>%
dplyr::select(statistic, parameter, p.value) %>%
knitr::kable(
caption = "Kruskal-Wallis Test: Price by Ownership",
col.names = c("$\\chi^2$", "DF", "$P$"),
escape = FALSE,
digits = 3,
align = "c",
format.args = list(big.mark = ",")) %>%
kable_styling(full_width = TRUE)
Kruskal-Wallis Test: Price by Ownership
|
\(\chi^2\)
|
DF
|
\(P\)
|
|
961.755
|
3
|
0
|
Due to the high Unknown rate of 34.82% in Ownership,
this category is treated as missing information and the results are
reported as supplementary. The Kruskal-Wallis test reveals significant
differences in price distributions across ownership types (\(\chi^2 \approx 12582.87\), \(P \ll 0.001\)), and these differences
remain significant even after excluding the Unknown category (\(\chi^2 \approx 961.755\), \(P \ll 0.001\)). Median prices are higher
for Freehold and Leasehold properties, at 88 Lac and 91.8 Lac
respectively, while Co-operative Society units are lower at
approximately 55 Lac.
4.4 Multivariate Relationships
4.4.1 Correlation Matrix
This section employs a correlation matrix heatmap to
summarize linear relationships among key numerical
variables such as Price_Lac, Price_per_sqft,
and various area or floor measures. The goal is to detect highly
correlated feature groups that indicate redundancy or
multicollinearity risks and to inform subsequent feature
engineering decisions.
library(ggcorrplot)
key_numeric_vars <- c(
"Price_Lac", "Price_per_sqft",
"Area_sqft", "Carpet_Area_sqft", "Super_Area_sqft",
"Current_Floor", "Total_Floors",
"Bathroom_num", "Balcony_num", "Parking_num")
num_df_key <- house_df %>%
dplyr::select(where(is.numeric)) %>%
dplyr::select(any_of(key_numeric_vars), everything())
# a. Clustered correlation heatmap
corr_mat <- cor(num_df_key, use = "pairwise.complete.obs", method = "pearson")
corr_mat[is.na(corr_mat)] <- 0
ggcorrplot::ggcorrplot(
corr_mat,
type = "lower",
hc.order = TRUE,
lab = TRUE,
lab_size = 3,
outline.col = "white",
colors = c("steelblue", "white", "tomato"),
show.diag = FALSE) +
labs(title = "Clustered Correlation Heatmap") +
theme_minimal() +
theme(
plot.title = element_text(face = "bold", size = 11, hjust = 0.5),
axis.text.x = element_text(
angle = 45,
hjust = 1, vjust = 1,
margin = margin(t = 2)))

# b. Feature correlations table
corr_long <- as.data.frame(as.table(corr_mat)) %>%
dplyr::rename(var1 = Var1, var2 = Var2, corr = Freq) %>%
dplyr::filter(var1 != var2) %>%
dplyr::mutate(abs_corr = abs(corr)) %>%
dplyr::arrange(desc(abs_corr))
top_pairs <- corr_long %>%
dplyr::rowwise() %>%
dplyr::mutate(pair_id = paste(sort(c(as.character(var1), as.character(var2))), collapse = " | ")) %>%
dplyr::ungroup() %>%
dplyr::distinct(pair_id, .keep_all = TRUE) %>%
dplyr::select(var1, var2, corr, abs_corr) %>%
dplyr::slice_head(n = 10)
top_pairs %>%
dplyr::mutate(Interpretation = dplyr::case_when(
abs_corr >= 0.9 ~ "Redundant (Consider Dropping)",
abs_corr >= 0.7 ~ "Strong Linkage",
abs_corr >= 0.4 ~ "Moderate Correlation",
TRUE ~ "Weak Correlation")) %>%
knitr::kable(
caption = "Top 10 Strongest Feature Correlations",
col.names = c("Variable A", "Variable B", "$r$", "$\\vert r \\vert$", "Interpretation"),
escape = FALSE,
digits = 3,
align = "llrrr") %>%
kableExtra::kable_styling(full_width = TRUE)
Top 10 Strongest Feature Correlations
|
Variable A
|
Variable B
|
\(r\)
|
\(\vert r \vert\)
|
Interpretation
|
|
Carpet_Area_sqft
|
Area_sqft
|
1.000
|
1.000
|
Redundant (Consider Dropping)
|
|
Super_Area_sqft
|
Area_sqft
|
1.000
|
1.000
|
Redundant (Consider Dropping)
|
|
Price_per_sqft
|
Price_Lac
|
0.763
|
0.763
|
Strong Linkage
|
|
Total_Floors
|
Current_Floor
|
0.734
|
0.734
|
Strong Linkage
|
|
log_price
|
Bathroom_num
|
0.718
|
0.718
|
Strong Linkage
|
|
Parking_capped
|
Parking_num
|
0.509
|
0.509
|
Moderate Correlation
|
|
Floor_Ratio
|
Current_Floor
|
0.452
|
0.452
|
Moderate Correlation
|
|
log_price
|
Balcony_num
|
0.362
|
0.362
|
Weak Correlation
|
|
log_price
|
Total_Floors
|
0.356
|
0.356
|
Weak Correlation
|
|
Balcony_num
|
Bathroom_num
|
0.341
|
0.341
|
Weak Correlation
|
The heatmap and the Top-10 table jointly demonstrate that the three
area measures are effectively duplicates, as Area_sqft
correlates perfectly with both Super_Area_sqft and
Carpet_Area_sqft (r=1.000). Price signals are also strongly
related as Price_per_sqft correlates robustly with
Price_Lac (r=0.763), suggesting that unit price is a key
driver of total price variation. Floor and amenity features provide
additional signals with moderate-to-strong relationships linking
Total_Floors with Current_Floor (r=0.734),
Parking_capped with Parking_num (r=0.509), and
Bathroom_num with Balcony_num (r=0.341),
suggesting they complement the core price and area information rather
than dominate it. Additionally, “zero correlations” resulting from
NA-to-zero replacement should be interpreted as data sparsity rather
than statistical independence.
4.4.2 Interaction Effects
This section investigates interaction effects between property
size and location by integrating raw-scale faceted scatter
plots with a log-linear regression forest plot.
While the facets visualize the actual price-area distributions across
the Top-20 locations, the forest plot provides a standardized
comparison of pricing sensitivity by extracting regression
slopes.
library(broom)
# a. Raw scale (facet)
area_cap_99 <- quantile(house_df_base$Area_sqft, probs = 0.99, na.rm = TRUE)
house_df_cap <- house_df_base %>%
filter(Area_sqft <= area_cap_99)
top_n <- 20
loc_top20_levels <- house_df_cap %>%
count(location, sort = TRUE) %>%
slice_head(n = top_n) %>%
pull(location)
plot_df_top20 <- house_df_cap %>%
filter(location %in% loc_top20_levels) %>%
mutate(location = forcats::fct_reorder(location, Price_Lac, .fun = median, .desc = TRUE))
ggplot(plot_df_top20, aes(x = Area_sqft, y = Price_Lac)) +
geom_point(alpha = 0.15, size = 0.8, color = "steelblue") +
geom_smooth(method = "lm", se = FALSE, linewidth = 0.8, color = "firebrick") +
scale_y_continuous(labels = label_number(big.mark = ",")) +
facet_wrap(~ location, scales = "free_y", ncol = 4) +
labs(
title = "Price vs Area across Top-20 Locations (Raw Scale)",
x = "Area (sqft)",
y = "Price (lac)") +
theme_minimal() +
theme(
plot.title = element_text(face = "bold", size = 11, hjust = 0.5),
strip.text = element_text(size = 8, face = "bold"),
panel.spacing = unit(1, "lines"))

# b. Log scale (forest)
slopes_log <- plot_df_top20 %>%
group_by(location) %>%
do(tidy(lm(log_price ~ Area_sqft, data = .), conf.int = TRUE)) %>%
filter(term == "Area_sqft") %>%
ungroup() %>%
mutate(location = reorder(location, estimate))
ggplot(slopes_log, aes(x = estimate, y = location)) +
geom_errorbarh(aes(xmin = conf.low, xmax = conf.high), height = 0.3, color = "gray60") +
geom_point(size = 2.5, color = "darkred") +
geom_vline(xintercept = mean(slopes_log$estimate), linetype = "dashed", color = "blue", alpha = 0.5) +
labs(
title = "Pricing Sensitivity by Location (Log-Linear Slopes)",
x = "Slope Coefficient (Area's Marginal Effect on Log Price)",
y = NULL) +
theme_minimal() +
theme(
plot.title = element_text(face = "bold", size = 11, hjust = 0.5),
axis.text.y = element_text(size = 9))

# c. Top-5 vs Bottom-5 summary table
slopes_tbl <- plot_df_top20 %>%
group_by(location) %>%
do(tidy(lm(log_price ~ Area_sqft, data = .), conf.int = TRUE)) %>%
ungroup() %>%
filter(term == "Area_sqft") %>%
left_join(
plot_df_top20 %>% count(location, name = "n"),
by = "location"
) %>%
mutate(
pct_uplift_100 = (exp(estimate * 100) - 1) * 100
) %>%
select(location, n, estimate, conf.low, conf.high, pct_uplift_100)
top5 <- slopes_tbl %>% arrange(desc(estimate)) %>% slice_head(n = 5) %>%
mutate(group = "Top-5")
bot5 <- slopes_tbl %>% arrange(estimate) %>% slice_head(n = 5) %>%
mutate(group = "Bottom-5")
top_bot_10 <- bind_rows(top5, bot5)
avg_stats <- slopes_tbl %>%
summarise(
n_locations = n(),
slope_mean = mean(estimate, na.rm = TRUE),
slope_min = min(estimate, na.rm = TRUE),
slope_max = max(estimate, na.rm = TRUE)
)
avg_row <- tibble(
group = "Average",
location = "NA",
n = avg_stats$n_locations, # number of locations contributing to the mean
estimate = avg_stats$slope_mean,
conf.low = NA_real_,
conf.high = NA_real_,
pct_uplift_100 = (exp(avg_stats$slope_mean * 100) - 1) * 100
)
top_bot_11 <- bind_rows(avg_row, top_bot_10) %>%
mutate(
estimate = round(estimate, 6),
conf.low = ifelse(is.na(conf.low), NA, round(conf.low, 6)),
conf.high = ifelse(is.na(conf.high), NA, round(conf.high, 6)),
`Uplift per +100 sqft` = scales::percent(pct_uplift_100 / 100, accuracy = 0.01)
) %>%
select(
Group = group,
Location = location,
N = n,
Slope = estimate,
`CI (low)` = conf.low,
`CI (high)` = conf.high,
`Uplift per +100 sqft`
)
top_bot_11 %>%
knitr::kable(
caption = "Top-5 vs Bottom-5 locations by pricing sensitivity (log_price ~ Area_sqft)",
align = "llrrrrr"
) %>%
kableExtra::kable_styling(full_width = TRUE)
Top-5 vs Bottom-5 locations by pricing sensitivity (log_price ~
Area_sqft)
|
Group
|
Location
|
N
|
Slope
|
CI (low)
|
CI (high)
|
Uplift per +100 sqft
|
|
Average
|
NA
|
20
|
0.000870
|
NA
|
NA
|
9.09%
|
|
Top-5
|
thane
|
1865
|
0.001759
|
0.001676
|
0.001841
|
19.23%
|
|
Top-5
|
kolkata
|
21581
|
0.001348
|
0.001337
|
0.001360
|
14.43%
|
|
Top-5
|
mumbai
|
1792
|
0.001214
|
0.001170
|
0.001259
|
12.91%
|
|
Top-5
|
new-delhi
|
24880
|
0.001110
|
0.001099
|
0.001121
|
11.74%
|
|
Top-5
|
pune
|
2143
|
0.001097
|
0.001063
|
0.001130
|
11.59%
|
|
Bottom-5
|
ahmedabad
|
12194
|
0.000297
|
0.000284
|
0.000310
|
3.02%
|
|
Bottom-5
|
bangalore
|
23185
|
0.000447
|
0.000437
|
0.000456
|
4.57%
|
|
Bottom-5
|
chandigarh
|
1399
|
0.000462
|
0.000416
|
0.000509
|
4.73%
|
|
Bottom-5
|
faridabad
|
3693
|
0.000486
|
0.000465
|
0.000506
|
4.98%
|
|
Bottom-5
|
zirakpur
|
1475
|
0.000496
|
0.000455
|
0.000536
|
5.08%
|
After capping Area_sqft at the 99th percentile (3,783.88
sqft), the analysis of 154,721 observations from the Top-20 locations
reveals a pronounced interaction between property size and
location. While all location-specific slopes from the log_price ~
Area_sqft regression are positive, they exhibit substantial variation,
ranging from 0.000297 to 0.001759 with a mean of 0.000870.
This indicates that the marginal impact of area on price differs
materially across geographic segments. On average, a 100 sqft
increase in area yields a 9.09% price uplift. Sensitivities
peak in Thane, Kolkata, and Mumbai, reaching approximately
19.23%, 14.43%, and 12.91% respectively, whereas Ahmedabad
and Bangalore exhibit much flatter responses at 3.02% and 4.57%. The 95%
confidence intervals in the forest plot confirm that 8 locations reside
significantly above the global mean while 10 fall significantly below,
reinforcing that this market heterogeneity is systematic rather
than mere noise.
4.5 Data Quality and Feature Engineering Strategy
4.5.1 Outlier and Skewness Treatment for Modeling
This section implements modeling-oriented preprocessing
to mitigate the impact of long-tailed distributions and
extreme observations. A logarithmic
transformation is applied to the target variable
Price_Lac to reduce skewness. Extreme values are identified
using 99th-percentile thresholds for both price and area,
which are used for outlier flagging and selective filtering
rather than immediate value modification. Price outliers are optionally
handled through quantile-based capping during model
training, while anomalous coded values in auxiliary features such as
Parking_num = 999 are recoded as missing.
library(tidyr)
library(knitr)
library(e1071)
### 3.5.1 Outlier Flagging & Skewness Treatment
q_hi <- 0.99
model_df <- house_df %>%
mutate(
Price_Lac = as.numeric(Price_Lac),
Area_sqft = as.numeric(Area_sqft),
Parking_num = as.numeric(Parking_num)) %>%
filter(
!is.na(Price_Lac), Price_Lac >= 0,
!is.na(Area_sqft), Area_sqft > 0) %>%
mutate(
log_price = log1p(Price_Lac),
Parking_num_fix = if_else(Parking_num == 999, NA_real_, Parking_num))
# a. Compute 99th percentile thresholds
area_q99 <- quantile(model_df$Area_sqft, probs = q_hi, na.rm = TRUE)
price_q99 <- quantile(model_df$Price_Lac, probs = q_hi, na.rm = TRUE)
threshold_tbl <- tibble::tibble(
Variable = c("Area_sqft", "Price_Lac"),
Quantile = q_hi,
Threshold_99 = c(as.numeric(area_q99), as.numeric(price_q99)))
threshold_tbl %>%
knitr::kable(
caption = "99th Percentile Thresholds (Flagging Only)",
digits = 3,
align = "lrr") %>%
kableExtra::kable_styling(full_width = TRUE)
99th Percentile Thresholds (Flagging Only)
|
Variable
|
Quantile
|
Threshold_99
|
|
Area_sqft
|
0.99
|
3783.88
|
|
Price_Lac
|
0.99
|
700.00
|
# b. Create outlier flags
house_df_outlier_flag <- model_df %>%
mutate(
area_outlier_99 = Area_sqft > area_q99,
price_outlier_99 = Price_Lac > price_q99,
any_outlier_99 = area_outlier_99 | price_outlier_99)
audit_outlier_flag <- tibble::tibble(
Metric = c(
"Rows in modeling dataset",
"Area 99th percentile threshold",
"Price 99th percentile threshold",
"Share flagged by Area (top 1%)",
"Share flagged by Price (top 1%)",
"Share flagged by either (union)",
"Count of Parking_num coded as 999"),
Value = c(
nrow(house_df_outlier_flag),
as.numeric(area_q99),
as.numeric(price_q99),
mean(house_df_outlier_flag$area_outlier_99, na.rm = TRUE),
mean(house_df_outlier_flag$price_outlier_99, na.rm = TRUE),
mean(house_df_outlier_flag$any_outlier_99, na.rm = TRUE),
sum(model_df$Parking_num == 999, na.rm = TRUE)))
audit_outlier_flag %>%
knitr::kable(
caption = "Outlier Flag Audit (99th Percentile)",
digits = 4,
align = "lr") %>%
kableExtra::kable_styling(full_width = TRUE)
Outlier Flag Audit (99th Percentile)
|
Metric
|
Value
|
|
Rows in modeling dataset
|
177757.0000
|
|
Area 99th percentile threshold
|
3783.8800
|
|
Price 99th percentile threshold
|
700.0000
|
|
Share flagged by Area (top 1%)
|
0.0100
|
|
Share flagged by Price (top 1%)
|
0.0066
|
|
Share flagged by either (union)
|
0.0120
|
|
Count of Parking_num coded as 999
|
1.0000
|
# c. Skewness comparison (Raw vs Log)
skew_table <- tibble::tibble(
Variable = c("Price_Lac", "log_price"),
Skewness = c(
e1071::skewness(model_df$Price_Lac, na.rm = TRUE, type = 3),
e1071::skewness(model_df$log_price, na.rm = TRUE, type = 3)))
skew_table %>%
knitr::kable(
caption = "Skewness Audit: Raw vs Log Transform",
digits = 3,
align = "lr") %>%
kableExtra::kable_styling(full_width = TRUE)
Skewness Audit: Raw vs Log Transform
|
Variable
|
Skewness
|
|
Price_Lac
|
270.193
|
|
log_price
|
0.324
|
# d. Preview of flagged rows
area_outlier_preview <- house_df_outlier_flag %>%
filter(area_outlier_99) %>%
arrange(desc(Area_sqft)) %>%
select(location, Area_sqft, Price_Lac, log_price, Price_per_sqft, Current_Floor, Total_Floors) %>%
slice_head(n = 10)
area_outlier_preview %>%
knitr::kable(
caption = "Preview: Area Outliers Flagged (Top by Area_sqft, Values Unchanged)",
digits = 3,
align = "lrrrrrr") %>%
kableExtra::kable_styling(full_width = TRUE)
Preview: Area Outliers Flagged (Top by Area_sqft, Values Unchanged)
|
location
|
Area_sqft
|
Price_Lac
|
log_price
|
Price_per_sqft
|
Current_Floor
|
Total_Floors
|
|
guwahati
|
709222
|
60.0
|
4.111
|
5859
|
3
|
8
|
|
visakhapatnam
|
530040
|
105.0
|
4.663
|
0
|
NA
|
NA
|
|
bhiwadi
|
495970
|
19.0
|
2.996
|
2533
|
10
|
14
|
|
agra
|
282004
|
20.0
|
3.045
|
7
|
3
|
6
|
|
gurgaon
|
194936
|
260.0
|
5.565
|
6131
|
20
|
20
|
|
agra
|
113134
|
56.0
|
4.043
|
3500
|
2
|
11
|
|
jaipur
|
107806
|
68.0
|
4.234
|
4231
|
11
|
14
|
|
thrissur
|
81845
|
45.0
|
3.829
|
NA
|
3
|
7
|
|
siliguri
|
81675
|
45.7
|
3.844
|
4200
|
1
|
5
|
|
siliguri
|
71775
|
40.2
|
3.718
|
4200
|
5
|
5
|
price_outlier_preview <- house_df_outlier_flag %>%
filter(price_outlier_99) %>%
arrange(desc(Price_Lac)) %>%
select(location, Area_sqft, Price_Lac, log_price, Price_per_sqft, Current_Floor, Total_Floors) %>%
slice_head(n = 10)
price_outlier_preview %>%
knitr::kable(
caption = "Preview: Price Outliers Flagged (Top by Price_Lac, Values Unchanged)",
digits = 3,
align = "lrrrrrr") %>%
kableExtra::kable_styling(full_width = TRUE)
Preview: Price Outliers Flagged (Top by Price_Lac, Values Unchanged)
|
location
|
Area_sqft
|
Price_Lac
|
log_price
|
Price_per_sqft
|
Current_Floor
|
Total_Floors
|
|
vadodara
|
1252
|
140030
|
11.850
|
6700000
|
5
|
9
|
|
siliguri
|
970
|
51004
|
10.840
|
4041600
|
1
|
4
|
|
raipur
|
920
|
39675
|
10.589
|
3450000
|
2
|
2
|
|
raipur
|
750
|
22980
|
10.042
|
2669100
|
2
|
9
|
|
varanasi
|
20000
|
8000
|
8.987
|
40000
|
NA
|
NA
|
|
gurgaon
|
11000
|
6000
|
8.700
|
54545
|
25
|
25
|
|
gurgaon
|
5500
|
5500
|
8.613
|
52381
|
25
|
25
|
|
gurgaon
|
5400
|
5500
|
8.613
|
74324
|
10
|
30
|
|
gurgaon
|
5250
|
5200
|
8.557
|
54737
|
10
|
39
|
|
gurgaon
|
11000
|
5000
|
8.517
|
45455
|
25
|
25
|
In this section, potential extreme observations are flagged using the
99th-percentile thresholds for Area_sqft and
Price_Lac, without applying capping, winsorization, or
overwriting any original values in the dataset.
With q_hi = 0.99, the computed thresholds are 3,783.88 sqft for
Area_sqft and 700.0 Lac for Price_Lac; these
cutoffs are used solely to construct indicator variables
(area_outlier_99,price_outlier_99,
any_outlier_99).
The anomalous code Parking_num = 999 is recoded as
missing Parking_num_fix = NA to prevent it from being
interpreted as a valid numeric magnitude by the model.
Skewness is substantially reduced through logarithmic transformation,
with price skewness decreasing from 270.193 for Price_Lac
to 0.324 for log_price.
4.5.2 Missing and Unknown Treatment for Modeling
This section performs a unified treatment of missing and
unknown values in numerical, binary, and categorical features.
Missingness is audited based on NA rates, and explicitly
coded “Unknown” values in categorical variables are
analyzed together with NA values. Features with missing rates exceeding
a predefined threshold are removed, while the remaining
variables are processed according to feature type, using median
imputation with missing indicators for numerical and binary
features, or explicit level encoding for NA and “Unknown”
values in categorical features.
### 4.5.2 Missing and Unknown Treatment for Modeling
library(stringr)
data_missing_in <- house_df_outlier_flag
missing_thresh <- 0.30
# a. Missingness summary (NA-based)
missing_tbl <- data_missing_in %>%
summarise(across(everything(), ~ sum(is.na(.)))) %>%
pivot_longer(everything(), names_to = "Feature", values_to = "Missing_n") %>%
mutate(
Missing_rate = Missing_n / nrow(data_missing_in),
Type = case_when(
Feature %in% names(select(data_missing_in, where(is.numeric))) ~ "Numeric",
Feature %in% names(select(data_missing_in, where(is.logical))) ~ "Binary",
Feature %in% names(select(data_missing_in, where(is.character))) ~ "Categorical",
TRUE ~ "Other")) %>%
arrange(desc(Missing_rate))
missing_tbl %>%
slice_head(n = 15) %>%
knitr::kable(
caption = "Missingness Summary (Top 15 by NA rate)",
digits = 3,
align = "lrrl") %>%
kableExtra::kable_styling(full_width = TRUE)
Missingness Summary (Top 15 by NA rate)
|
Feature
|
Missing_n
|
Missing_rate
|
Type
|
|
Super_Area_sqft
|
101522
|
0.571
|
Numeric
|
|
Carpet_Area_sqft
|
76235
|
0.429
|
Numeric
|
|
Price_per_sqft
|
7891
|
0.044
|
Numeric
|
|
Current_Floor
|
7298
|
0.041
|
Numeric
|
|
Total_Floors
|
7298
|
0.041
|
Numeric
|
|
CurrentFloor_grp
|
7298
|
0.041
|
Other
|
|
TotalFloors_grp
|
7298
|
0.041
|
Other
|
|
Floor_Ratio
|
7298
|
0.041
|
Numeric
|
|
FloorPos_grp
|
7298
|
0.041
|
Other
|
|
Parking_num_fix
|
1
|
0.000
|
Numeric
|
|
location
|
0
|
0.000
|
Categorical
|
|
Price_Lac
|
0
|
0.000
|
Numeric
|
|
Area_sqft
|
0
|
0.000
|
Numeric
|
|
Bathroom_num
|
0
|
0.000
|
Numeric
|
|
Balcony_num
|
0
|
0.000
|
Numeric
|
# b. "Unknown" summary for categorical columns (string 'Unknown')
unknown_tbl <- data_missing_in %>%
select(where(is.character)) %>%
summarise(across(everything(), ~ sum(str_to_lower(str_trim(.)) == "unknown", na.rm = TRUE))) %>%
pivot_longer(everything(), names_to = "Feature", values_to = "Unknown_n") %>%
mutate(Unknown_rate = Unknown_n / nrow(data_missing_in)) %>%
arrange(desc(Unknown_rate))
unknown_tbl %>%
slice_head(n = 10) %>%
knitr::kable(
caption = "‘Unknown’ Level Frequency (Top 10 categorical features)",
digits = 3,
align = "lrr") %>%
kableExtra::kable_styling(full_width = TRUE)
‘Unknown’ Level Frequency (Top 10 categorical features)
|
Feature
|
Unknown_n
|
Unknown_rate
|
|
facing
|
65659
|
0.369
|
|
Ownership
|
61855
|
0.348
|
|
Furnishing
|
2063
|
0.012
|
|
Transaction
|
67
|
0.000
|
|
location
|
0
|
0.000
|
# c. Unified modeling decision table (threshold = 30%)
decision_tbl <- missing_tbl %>%
select(Feature, Type, Missing_n, Missing_rate) %>%
left_join(unknown_tbl %>% select(Feature, Unknown_n, Unknown_rate), by = "Feature") %>%
mutate(
Unknown_n = tidyr::replace_na(Unknown_n, 0L),
Unknown_rate = tidyr::replace_na(Unknown_rate, 0),
Action = case_when(
Missing_rate >= missing_thresh ~ "Drop (high missingness >= 30%)",
Type %in% c("Numeric", "Binary") & Missing_rate > 0 ~ "Keep; add missing flag + median impute",
Type == "Categorical" & (Missing_rate > 0 | Unknown_rate > 0) ~ "Keep; treat NA/'Unknown' as explicit level",
TRUE ~ "Keep"),
Issue = case_when(
Type == "Categorical" ~ paste0("NA rate = ", round(Missing_rate, 3), "; 'Unknown' rate = ", round(Unknown_rate, 3)),
TRUE ~ paste0("NA rate = ", round(Missing_rate, 3)))) %>%
arrange(desc(str_detect(Action, "^Drop")), desc(Missing_rate), desc(Unknown_rate), Feature) %>%
select(Feature, Type, Issue, Action)
decision_tbl %>%
knitr::kable(
caption = "Missing/Unknown Handling Rules for Modeling (Threshold = 30%)",
align = "llll") %>%
kableExtra::kable_styling(full_width = TRUE)
Missing/Unknown Handling Rules for Modeling (Threshold = 30%)
|
Feature
|
Type
|
Issue
|
Action
|
|
Super_Area_sqft
|
Numeric
|
NA rate = 0.571
|
Drop (high missingness >= 30%)
|
|
Carpet_Area_sqft
|
Numeric
|
NA rate = 0.429
|
Drop (high missingness >= 30%)
|
|
Price_per_sqft
|
Numeric
|
NA rate = 0.044
|
Keep; add missing flag + median impute
|
|
CurrentFloor_grp
|
Other
|
NA rate = 0.041
|
Keep
|
|
Current_Floor
|
Numeric
|
NA rate = 0.041
|
Keep; add missing flag + median impute
|
|
FloorPos_grp
|
Other
|
NA rate = 0.041
|
Keep
|
|
Floor_Ratio
|
Numeric
|
NA rate = 0.041
|
Keep; add missing flag + median impute
|
|
TotalFloors_grp
|
Other
|
NA rate = 0.041
|
Keep
|
|
Total_Floors
|
Numeric
|
NA rate = 0.041
|
Keep; add missing flag + median impute
|
|
Parking_num_fix
|
Numeric
|
NA rate = 0
|
Keep; add missing flag + median impute
|
|
facing
|
Categorical
|
NA rate = 0; ‘Unknown’ rate = 0.369
|
Keep; treat NA/‘Unknown’ as explicit level
|
|
Ownership
|
Categorical
|
NA rate = 0; ‘Unknown’ rate = 0.348
|
Keep; treat NA/‘Unknown’ as explicit level
|
|
Furnishing
|
Categorical
|
NA rate = 0; ‘Unknown’ rate = 0.012
|
Keep; treat NA/‘Unknown’ as explicit level
|
|
Transaction
|
Categorical
|
NA rate = 0; ‘Unknown’ rate = 0
|
Keep; treat NA/‘Unknown’ as explicit level
|
|
Area_sqft
|
Numeric
|
NA rate = 0
|
Keep
|
|
Balcony_grp
|
Other
|
NA rate = 0
|
Keep
|
|
Balcony_num
|
Numeric
|
NA rate = 0
|
Keep
|
|
Bathroom_grp
|
Other
|
NA rate = 0
|
Keep
|
|
Bathroom_num
|
Numeric
|
NA rate = 0
|
Keep
|
|
Parking_capped
|
Numeric
|
NA rate = 0
|
Keep
|
|
Parking_grp
|
Other
|
NA rate = 0
|
Keep
|
|
Parking_num
|
Numeric
|
NA rate = 0
|
Keep
|
|
Price_Lac
|
Numeric
|
NA rate = 0
|
Keep
|
|
any_outlier_99
|
Binary
|
NA rate = 0
|
Keep
|
|
area_outlier_99
|
Binary
|
NA rate = 0
|
Keep
|
|
location
|
Categorical
|
NA rate = 0; ‘Unknown’ rate = 0
|
Keep
|
|
log_price
|
Numeric
|
NA rate = 0
|
Keep
|
|
price_outlier_99
|
Binary
|
NA rate = 0
|
Keep
|
# d. Numeric missing handling preview (Before vs After)
pps_median <- median(data_missing_in$Price_per_sqft, na.rm = TRUE)
missing_preview <- data_missing_in %>%
mutate(
Price_per_sqft_missing = as.integer(is.na(Price_per_sqft)),
Price_per_sqft_imputed = if_else(is.na(Price_per_sqft), pps_median, as.numeric(Price_per_sqft))) %>%
filter(Price_per_sqft_missing == 1) %>%
slice_head(n = 10) %>%
select(
location,
Area_sqft,
Price_Lac,
Price_per_sqft,
Price_per_sqft_imputed,
Price_per_sqft_missing)
missing_preview %>%
knitr::kable(
caption = "Preview: Numeric Missing Handling (Price_per_sqft NA -> Median Impute)",
digits = 3,
align = "lrrrrr") %>%
kableExtra::kable_styling(full_width = TRUE)
Preview: Numeric Missing Handling (Price_per_sqft NA -> Median
Impute)
|
location
|
Area_sqft
|
Price_Lac
|
Price_per_sqft
|
Price_per_sqft_imputed
|
Price_per_sqft_missing
|
|
thane
|
530
|
25.0
|
NA
|
6034
|
1
|
|
thane
|
923
|
70.0
|
NA
|
6034
|
1
|
|
thane
|
741
|
155.0
|
NA
|
6034
|
1
|
|
thane
|
1200
|
265.0
|
NA
|
6034
|
1
|
|
thane
|
400
|
22.0
|
NA
|
6034
|
1
|
|
thane
|
550
|
25.0
|
NA
|
6034
|
1
|
|
thane
|
1600
|
400.0
|
NA
|
6034
|
1
|
|
thane
|
625
|
48.0
|
NA
|
6034
|
1
|
|
thane
|
739
|
68.6
|
NA
|
6034
|
1
|
|
thane
|
894
|
160.0
|
NA
|
6034
|
1
|
Using a 30% missingness threshold, the decision table flags 2
variables for removal due to high NA rates, while all remaining
variables are retained with explicit per-type handling rules.
Among categorical features, the highest “Unknown” frequency appears
in facing (rate = 0.3696), which will be carried forward as
an explicit level rather than being left as ambiguous missingness.
For the numeric preview example Price_per_sqft, there
are 7,891 missing observations; the median used for the illustrative
imputation is 6,034.0, and the table displays the first 10 imputed
rows.
4.5.3 Feature Creation
This section proposes a small set of interpretable derived
features such as floor position and
density-style features and conducts a lightweight
audit on their availability and basic relationship with the
transformed target log_price. The goal is to identify
feature candidates that are well-defined, have manageable
missingness, and capture housing-price mechanisms.
data_in <- house_df_outlier_flag
# a. Feature creation
feature_df <- data_in %>%
mutate(
Floor_Ratio = dplyr::if_else(!is.na(Total_Floors) & Total_Floors > 0,
Current_Floor / Total_Floors,
NA_real_),
is_top_floor = dplyr::if_else(!is.na(Current_Floor) & !is.na(Total_Floors),
Current_Floor == Total_Floors, FALSE),
is_ground_floor = dplyr::if_else(!is.na(Current_Floor),
Current_Floor == 0, FALSE),
log_area = log1p(Area_sqft),
Bathroom_per_1000sqft = Bathroom_num / (Area_sqft / 1000),
Balcony_per_1000sqft = Balcony_num / (Area_sqft / 1000),
Parking_per_1000sqft = Parking_num_fix / (Area_sqft / 1000))
# b. Availability audit for derived features
derived_vars <- c(
"Floor_Ratio", "is_top_floor", "is_ground_floor",
"log_area", "Bathroom_per_1000sqft", "Balcony_per_1000sqft", "Parking_per_1000sqft")
derived_audit <- feature_df %>%
summarise(across(all_of(derived_vars), ~ mean(is.na(.)))) %>%
pivot_longer(everything(), names_to = "Feature", values_to = "Missing_rate") %>%
mutate(
Type = case_when(
Feature %in% names(select(feature_df, where(is.numeric))) ~ "Numeric",
Feature %in% names(select(feature_df, where(is.logical))) ~ "Binary",
TRUE ~ "Other")) %>%
arrange(desc(Missing_rate))
derived_audit %>%
knitr::kable(
caption = "Derived Feature Availability Audit (Missing Rate)",
digits = 4,
align = "lrl") %>%
kableExtra::kable_styling(full_width = TRUE)
Derived Feature Availability Audit (Missing Rate)
|
Feature
|
Missing_rate
|
Type
|
|
Floor_Ratio
|
0.0411
|
Numeric
|
|
Parking_per_1000sqft
|
0.0000
|
Numeric
|
|
is_top_floor
|
0.0000
|
Binary
|
|
is_ground_floor
|
0.0000
|
Binary
|
|
log_area
|
0.0000
|
Numeric
|
|
Bathroom_per_1000sqft
|
0.0000
|
Numeric
|
|
Balcony_per_1000sqft
|
0.0000
|
Numeric
|
# c. Summary stats for key numeric derived features
derived_summary <- feature_df %>%
summarise(
Floor_Ratio_p05 = quantile(Floor_Ratio, 0.05, na.rm = TRUE),
Floor_Ratio_med = median(Floor_Ratio, na.rm = TRUE),
Floor_Ratio_p95 = quantile(Floor_Ratio, 0.95, na.rm = TRUE),
log_area_p05 = quantile(log_area, 0.05, na.rm = TRUE),
log_area_med = median(log_area, na.rm = TRUE),
log_area_p95 = quantile(log_area, 0.95, na.rm = TRUE),
BathDen_p05 = quantile(Bathroom_per_1000sqft, 0.05, na.rm = TRUE),
BathDen_med = median(Bathroom_per_1000sqft, na.rm = TRUE),
BathDen_p95 = quantile(Bathroom_per_1000sqft, 0.95, na.rm = TRUE)) %>%
pivot_longer(everything(), names_to = "Metric", values_to = "Value")
derived_summary %>%
knitr::kable(
caption = "Sanity Check: Selected Derived Feature Summary (p05 / median / p95)",
digits = 3,
align = "lr") %>%
kableExtra::kable_styling(full_width = TRUE)
Sanity Check: Selected Derived Feature Summary (p05 / median / p95)
|
Metric
|
Value
|
|
Floor_Ratio_p05
|
0.000
|
|
Floor_Ratio_med
|
0.500
|
|
Floor_Ratio_p95
|
1.000
|
|
log_area_p05
|
5.303
|
|
log_area_med
|
7.048
|
|
log_area_p95
|
7.824
|
|
BathDen_p05
|
1.289
|
|
BathDen_med
|
2.000
|
|
BathDen_p95
|
14.286
|
# d. Association check with log_price (Spearman correlation)
corr_tbl <- feature_df %>%
summarise(
rho_log_area = cor(log_price, log_area, method = "spearman", use = "complete.obs"),
rho_floor = cor(log_price, Floor_Ratio, method = "spearman", use = "complete.obs"),
rho_bath_den = cor(log_price, Bathroom_per_1000sqft, method = "spearman", use = "complete.obs"),
rho_bal_den = cor(log_price, Balcony_per_1000sqft, method = "spearman", use = "complete.obs"),
rho_park_den = cor(log_price, Parking_per_1000sqft, method = "spearman", use = "complete.obs")) %>%
pivot_longer(everything(), names_to = "Feature", values_to = "Spearman_rho") %>%
arrange(desc(abs(Spearman_rho)))
corr_tbl %>%
knitr::kable(
caption = "Spearman Correlation with log_price (Derived Features)",
digits = 3,
align = "lr") %>%
kableExtra::kable_styling(full_width = TRUE)
Spearman Correlation with log_price (Derived Features)
|
Feature
|
Spearman_rho
|
|
rho_log_area
|
0.689
|
|
rho_park_den
|
-0.561
|
|
rho_bal_den
|
-0.351
|
|
rho_bath_den
|
-0.348
|
|
rho_floor
|
0.031
|
# e. Group contrast examples (top floor / ground floor)
group_tbl <- feature_df %>%
summarise(
mean_log_price_all = mean(log_price, na.rm = TRUE),
mean_log_price_top = mean(log_price[is_top_floor], na.rm = TRUE),
mean_log_price_gnd = mean(log_price[is_ground_floor], na.rm = TRUE),
n_top = sum(is_top_floor, na.rm = TRUE),
n_gnd = sum(is_ground_floor, na.rm = TRUE)) %>%
pivot_longer(everything(), names_to = "Metric", values_to = "Value")
group_tbl %>%
knitr::kable(
caption = "Mean log_price by Floor Flags",
digits = 3,
align = "lr") %>%
kableExtra::kable_styling(full_width = TRUE)
Mean log_price by Floor Flags
|
Metric
|
Value
|
|
mean_log_price_all
|
4.427
|
|
mean_log_price_top
|
4.217
|
|
mean_log_price_gnd
|
4.155
|
|
n_top
|
21612.000
|
|
n_gnd
|
10577.000
|
# f. Preview: Newly Created Features (first 10 rows)
feature_df %>%
select(
location, Price_Lac, log_price, Area_sqft,
Floor_Ratio, is_top_floor, is_ground_floor,
log_area, Bathroom_per_1000sqft, Balcony_per_1000sqft, Parking_per_1000sqft,
area_outlier_99, price_outlier_99, any_outlier_99) %>%
slice_head(n = 10) %>%
knitr::kable(
caption = "Preview: Newly Created Features (First 10 Rows)",
digits = 3,
align = "lrrrrllrrrrlll") %>%
kableExtra::kable_styling(full_width = TRUE)
Preview: Newly Created Features (First 10 Rows)
|
location
|
Price_Lac
|
log_price
|
Area_sqft
|
Floor_Ratio
|
is_top_floor
|
is_ground_floor
|
log_area
|
Bathroom_per_1000sqft
|
Balcony_per_1000sqft
|
Parking_per_1000sqft
|
area_outlier_99
|
price_outlier_99
|
any_outlier_99
|
|
thane
|
42.0
|
3.761
|
500
|
0.909
|
FALSE
|
FALSE
|
6.217
|
2.000
|
4.000
|
2.000
|
FALSE
|
FALSE
|
FALSE
|
|
thane
|
98.0
|
4.595
|
473
|
0.136
|
FALSE
|
FALSE
|
6.161
|
4.228
|
4.228
|
2.114
|
FALSE
|
FALSE
|
FALSE
|
|
thane
|
140.0
|
4.949
|
779
|
0.345
|
FALSE
|
FALSE
|
6.659
|
2.567
|
2.567
|
1.284
|
FALSE
|
FALSE
|
FALSE
|
|
thane
|
25.0
|
3.258
|
530
|
0.333
|
FALSE
|
FALSE
|
6.275
|
1.887
|
1.887
|
1.887
|
FALSE
|
FALSE
|
FALSE
|
|
thane
|
160.0
|
5.081
|
635
|
0.476
|
FALSE
|
FALSE
|
6.455
|
3.150
|
3.150
|
1.575
|
FALSE
|
FALSE
|
FALSE
|
|
thane
|
45.0
|
3.829
|
680
|
0.286
|
FALSE
|
FALSE
|
6.524
|
1.471
|
1.471
|
1.471
|
FALSE
|
FALSE
|
FALSE
|
|
thane
|
16.5
|
2.862
|
550
|
0.800
|
FALSE
|
FALSE
|
6.312
|
1.818
|
3.636
|
1.818
|
FALSE
|
FALSE
|
FALSE
|
|
thane
|
60.0
|
4.111
|
575
|
0.000
|
FALSE
|
TRUE
|
6.356
|
1.739
|
3.478
|
1.739
|
FALSE
|
FALSE
|
FALSE
|
|
thane
|
60.0
|
4.111
|
600
|
0.000
|
FALSE
|
TRUE
|
6.399
|
1.667
|
3.333
|
1.667
|
FALSE
|
FALSE
|
FALSE
|
|
thane
|
160.0
|
5.081
|
900
|
0.111
|
FALSE
|
FALSE
|
6.804
|
3.333
|
1.111
|
1.111
|
FALSE
|
FALSE
|
FALSE
|
Seven interpretable derived features, Floor_Ratio,
is_top_floor, is_ground_floor,
log_area, Bathroom_per_1000sqft,
Balcony_per_1000sqft, and
Parking_per_1000sqft, are created from
house_df_outlier_flag, followed by a lightweight audit of
feature availability and their basic relationship with the transformed
target log_price.
In terms of availability, Floor_Ratio has a missing rate
of approximately 0.0411, mainly driven by missing floor information,
while the remaining derived features show zero missingness.
For relationship exploration, log_area shows a strong
monotonic association with log_price (\(\rho \approx 0.689\)), whereas Floor_Ratio
has a weak association (\(\rho \approx
0.031\)).
A simple group contrast further indicates that the mean log_price is
about 4.427 overall, compared with about 4.217 for top-floor units
(n=21,612) and about 4.155 for ground-floor units (n=10,577), suggesting
a directional but relatively modest floor-position effect in this
dataset.