library(tidyr)
library(ggplot2)
library(dplyr)
library(scales)
# Load the training and test sets.
# The files are located relative to a data directory instead of calling
# setwd() with a machine-specific absolute path, so the script runs unchanged
# on any machine. By default the current working directory (e.g. the project
# root) is used; set the HOUSE_PRICES_DATA_DIR environment variable to read
# the CSV files from another folder.
data_dir <- Sys.getenv("HOUSE_PRICES_DATA_DIR", unset = ".")
# Fail early if either input file is missing from the data directory.
stopifnot(file.exists(file.path(data_dir, c("train.csv", "test.csv"))))
data <- read.csv(file.path(data_dir, "train.csv"))
test_data <- read.csv(file.path(data_dir, "test.csv"))
# Shape of the training set as (rows, columns).
c(nrow(data), ncol(data))
## [1] 1460 81
# Compact per-column overview: type and leading values of every variable.
utils::str(data)
## 'data.frame': 1460 obs. of 81 variables:
## $ Id : int 1 2 3 4 5 6 7 8 9 10 ...
## $ MSSubClass : int 60 20 60 70 60 50 20 60 50 190 ...
## $ MSZoning : chr "RL" "RL" "RL" "RL" ...
## $ LotFrontage : int 65 80 68 60 84 85 75 NA 51 50 ...
## $ LotArea : int 8450 9600 11250 9550 14260 14115 10084 10382 6120 7420 ...
## $ Street : chr "Pave" "Pave" "Pave" "Pave" ...
## $ Alley : chr NA NA NA NA ...
## $ LotShape : chr "Reg" "Reg" "IR1" "IR1" ...
## $ LandContour : chr "Lvl" "Lvl" "Lvl" "Lvl" ...
## $ Utilities : chr "AllPub" "AllPub" "AllPub" "AllPub" ...
## $ LotConfig : chr "Inside" "FR2" "Inside" "Corner" ...
## $ LandSlope : chr "Gtl" "Gtl" "Gtl" "Gtl" ...
## $ Neighborhood : chr "CollgCr" "Veenker" "CollgCr" "Crawfor" ...
## $ Condition1 : chr "Norm" "Feedr" "Norm" "Norm" ...
## $ Condition2 : chr "Norm" "Norm" "Norm" "Norm" ...
## $ BldgType : chr "1Fam" "1Fam" "1Fam" "1Fam" ...
## $ HouseStyle : chr "2Story" "1Story" "2Story" "2Story" ...
## $ OverallQual : int 7 6 7 7 8 5 8 7 7 5 ...
## $ OverallCond : int 5 8 5 5 5 5 5 6 5 6 ...
## $ YearBuilt : int 2003 1976 2001 1915 2000 1993 2004 1973 1931 1939 ...
## $ YearRemodAdd : int 2003 1976 2002 1970 2000 1995 2005 1973 1950 1950 ...
## $ RoofStyle : chr "Gable" "Gable" "Gable" "Gable" ...
## $ RoofMatl : chr "CompShg" "CompShg" "CompShg" "CompShg" ...
## $ Exterior1st : chr "VinylSd" "MetalSd" "VinylSd" "Wd Sdng" ...
## $ Exterior2nd : chr "VinylSd" "MetalSd" "VinylSd" "Wd Shng" ...
## $ MasVnrType : chr "BrkFace" "None" "BrkFace" "None" ...
## $ MasVnrArea : int 196 0 162 0 350 0 186 240 0 0 ...
## $ ExterQual : chr "Gd" "TA" "Gd" "TA" ...
## $ ExterCond : chr "TA" "TA" "TA" "TA" ...
## $ Foundation : chr "PConc" "CBlock" "PConc" "BrkTil" ...
## $ BsmtQual : chr "Gd" "Gd" "Gd" "TA" ...
## $ BsmtCond : chr "TA" "TA" "TA" "Gd" ...
## $ BsmtExposure : chr "No" "Gd" "Mn" "No" ...
## $ BsmtFinType1 : chr "GLQ" "ALQ" "GLQ" "ALQ" ...
## $ BsmtFinSF1 : int 706 978 486 216 655 732 1369 859 0 851 ...
## $ BsmtFinType2 : chr "Unf" "Unf" "Unf" "Unf" ...
## $ BsmtFinSF2 : int 0 0 0 0 0 0 0 32 0 0 ...
## $ BsmtUnfSF : int 150 284 434 540 490 64 317 216 952 140 ...
## $ TotalBsmtSF : int 856 1262 920 756 1145 796 1686 1107 952 991 ...
## $ Heating : chr "GasA" "GasA" "GasA" "GasA" ...
## $ HeatingQC : chr "Ex" "Ex" "Ex" "Gd" ...
## $ CentralAir : chr "Y" "Y" "Y" "Y" ...
## $ Electrical : chr "SBrkr" "SBrkr" "SBrkr" "SBrkr" ...
## $ X1stFlrSF : int 856 1262 920 961 1145 796 1694 1107 1022 1077 ...
## $ X2ndFlrSF : int 854 0 866 756 1053 566 0 983 752 0 ...
## $ LowQualFinSF : int 0 0 0 0 0 0 0 0 0 0 ...
## $ GrLivArea : int 1710 1262 1786 1717 2198 1362 1694 2090 1774 1077 ...
## $ BsmtFullBath : int 1 0 1 1 1 1 1 1 0 1 ...
## $ BsmtHalfBath : int 0 1 0 0 0 0 0 0 0 0 ...
## $ FullBath : int 2 2 2 1 2 1 2 2 2 1 ...
## $ HalfBath : int 1 0 1 0 1 1 0 1 0 0 ...
## $ BedroomAbvGr : int 3 3 3 3 4 1 3 3 2 2 ...
## $ KitchenAbvGr : int 1 1 1 1 1 1 1 1 2 2 ...
## $ KitchenQual : chr "Gd" "TA" "Gd" "Gd" ...
## $ TotRmsAbvGrd : int 8 6 6 7 9 5 7 7 8 5 ...
## $ Functional : chr "Typ" "Typ" "Typ" "Typ" ...
## $ Fireplaces : int 0 1 1 1 1 0 1 2 2 2 ...
## $ FireplaceQu : chr NA "TA" "TA" "Gd" ...
## $ GarageType : chr "Attchd" "Attchd" "Attchd" "Detchd" ...
## $ GarageYrBlt : int 2003 1976 2001 1998 2000 1993 2004 1973 1931 1939 ...
## $ GarageFinish : chr "RFn" "RFn" "RFn" "Unf" ...
## $ GarageCars : int 2 2 2 3 3 2 2 2 2 1 ...
## $ GarageArea : int 548 460 608 642 836 480 636 484 468 205 ...
## $ GarageQual : chr "TA" "TA" "TA" "TA" ...
## $ GarageCond : chr "TA" "TA" "TA" "TA" ...
## $ PavedDrive : chr "Y" "Y" "Y" "Y" ...
## $ WoodDeckSF : int 0 298 0 0 192 40 255 235 90 0 ...
## $ OpenPorchSF : int 61 0 42 35 84 30 57 204 0 4 ...
## $ EnclosedPorch: int 0 0 0 272 0 0 0 228 205 0 ...
## $ X3SsnPorch : int 0 0 0 0 0 320 0 0 0 0 ...
## $ ScreenPorch : int 0 0 0 0 0 0 0 0 0 0 ...
## $ PoolArea : int 0 0 0 0 0 0 0 0 0 0 ...
## $ PoolQC : chr NA NA NA NA ...
## $ Fence : chr NA NA NA NA ...
## $ MiscFeature : chr NA NA NA NA ...
## $ MiscVal : int 0 0 0 0 0 700 0 350 0 0 ...
## $ MoSold : int 2 5 9 2 12 10 8 11 4 1 ...
## $ YrSold : int 2008 2007 2008 2006 2008 2009 2007 2009 2008 2008 ...
## $ SaleType : chr "WD" "WD" "WD" "WD" ...
## $ SaleCondition: chr "Normal" "Normal" "Normal" "Abnorml" ...
## $ SalePrice : int 208500 181500 223500 140000 250000 143000 307000 200000 129900 118000 ...
# Shape of the test set as (rows, columns).
c(nrow(test_data), ncol(test_data))
## [1] 1459 80
# Compact per-column overview: type and leading values of every variable.
utils::str(test_data)
## 'data.frame': 1459 obs. of 80 variables:
## $ Id : int 1461 1462 1463 1464 1465 1466 1467 1468 1469 1470 ...
## $ MSSubClass : int 20 20 60 60 120 60 20 60 20 20 ...
## $ MSZoning : chr "RH" "RL" "RL" "RL" ...
## $ LotFrontage : int 80 81 74 78 43 75 NA 63 85 70 ...
## $ LotArea : int 11622 14267 13830 9978 5005 10000 7980 8402 10176 8400 ...
## $ Street : chr "Pave" "Pave" "Pave" "Pave" ...
## $ Alley : chr NA NA NA NA ...
## $ LotShape : chr "Reg" "IR1" "IR1" "IR1" ...
## $ LandContour : chr "Lvl" "Lvl" "Lvl" "Lvl" ...
## $ Utilities : chr "AllPub" "AllPub" "AllPub" "AllPub" ...
## $ LotConfig : chr "Inside" "Corner" "Inside" "Inside" ...
## $ LandSlope : chr "Gtl" "Gtl" "Gtl" "Gtl" ...
## $ Neighborhood : chr "NAmes" "NAmes" "Gilbert" "Gilbert" ...
## $ Condition1 : chr "Feedr" "Norm" "Norm" "Norm" ...
## $ Condition2 : chr "Norm" "Norm" "Norm" "Norm" ...
## $ BldgType : chr "1Fam" "1Fam" "1Fam" "1Fam" ...
## $ HouseStyle : chr "1Story" "1Story" "2Story" "2Story" ...
## $ OverallQual : int 5 6 5 6 8 6 6 6 7 4 ...
## $ OverallCond : int 6 6 5 6 5 5 7 5 5 5 ...
## $ YearBuilt : int 1961 1958 1997 1998 1992 1993 1992 1998 1990 1970 ...
## $ YearRemodAdd : int 1961 1958 1998 1998 1992 1994 2007 1998 1990 1970 ...
## $ RoofStyle : chr "Gable" "Hip" "Gable" "Gable" ...
## $ RoofMatl : chr "CompShg" "CompShg" "CompShg" "CompShg" ...
## $ Exterior1st : chr "VinylSd" "Wd Sdng" "VinylSd" "VinylSd" ...
## $ Exterior2nd : chr "VinylSd" "Wd Sdng" "VinylSd" "VinylSd" ...
## $ MasVnrType : chr "None" "BrkFace" "None" "BrkFace" ...
## $ MasVnrArea : int 0 108 0 20 0 0 0 0 0 0 ...
## $ ExterQual : chr "TA" "TA" "TA" "TA" ...
## $ ExterCond : chr "TA" "TA" "TA" "TA" ...
## $ Foundation : chr "CBlock" "CBlock" "PConc" "PConc" ...
## $ BsmtQual : chr "TA" "TA" "Gd" "TA" ...
## $ BsmtCond : chr "TA" "TA" "TA" "TA" ...
## $ BsmtExposure : chr "No" "No" "No" "No" ...
## $ BsmtFinType1 : chr "Rec" "ALQ" "GLQ" "GLQ" ...
## $ BsmtFinSF1 : int 468 923 791 602 263 0 935 0 637 804 ...
## $ BsmtFinType2 : chr "LwQ" "Unf" "Unf" "Unf" ...
## $ BsmtFinSF2 : int 144 0 0 0 0 0 0 0 0 78 ...
## $ BsmtUnfSF : int 270 406 137 324 1017 763 233 789 663 0 ...
## $ TotalBsmtSF : int 882 1329 928 926 1280 763 1168 789 1300 882 ...
## $ Heating : chr "GasA" "GasA" "GasA" "GasA" ...
## $ HeatingQC : chr "TA" "TA" "Gd" "Ex" ...
## $ CentralAir : chr "Y" "Y" "Y" "Y" ...
## $ Electrical : chr "SBrkr" "SBrkr" "SBrkr" "SBrkr" ...
## $ X1stFlrSF : int 896 1329 928 926 1280 763 1187 789 1341 882 ...
## $ X2ndFlrSF : int 0 0 701 678 0 892 0 676 0 0 ...
## $ LowQualFinSF : int 0 0 0 0 0 0 0 0 0 0 ...
## $ GrLivArea : int 896 1329 1629 1604 1280 1655 1187 1465 1341 882 ...
## $ BsmtFullBath : int 0 0 0 0 0 0 1 0 1 1 ...
## $ BsmtHalfBath : int 0 0 0 0 0 0 0 0 0 0 ...
## $ FullBath : int 1 1 2 2 2 2 2 2 1 1 ...
## $ HalfBath : int 0 1 1 1 0 1 0 1 1 0 ...
## $ BedroomAbvGr : int 2 3 3 3 2 3 3 3 2 2 ...
## $ KitchenAbvGr : int 1 1 1 1 1 1 1 1 1 1 ...
## $ KitchenQual : chr "TA" "Gd" "TA" "Gd" ...
## $ TotRmsAbvGrd : int 5 6 6 7 5 7 6 7 5 4 ...
## $ Functional : chr "Typ" "Typ" "Typ" "Typ" ...
## $ Fireplaces : int 0 0 1 1 0 1 0 1 1 0 ...
## $ FireplaceQu : chr NA NA "TA" "Gd" ...
## $ GarageType : chr "Attchd" "Attchd" "Attchd" "Attchd" ...
## $ GarageYrBlt : int 1961 1958 1997 1998 1992 1993 1992 1998 1990 1970 ...
## $ GarageFinish : chr "Unf" "Unf" "Fin" "Fin" ...
## $ GarageCars : int 1 1 2 2 2 2 2 2 2 2 ...
## $ GarageArea : int 730 312 482 470 506 440 420 393 506 525 ...
## $ GarageQual : chr "TA" "TA" "TA" "TA" ...
## $ GarageCond : chr "TA" "TA" "TA" "TA" ...
## $ PavedDrive : chr "Y" "Y" "Y" "Y" ...
## $ WoodDeckSF : int 140 393 212 360 0 157 483 0 192 240 ...
## $ OpenPorchSF : int 0 36 34 36 82 84 21 75 0 0 ...
## $ EnclosedPorch: int 0 0 0 0 0 0 0 0 0 0 ...
## $ X3SsnPorch : int 0 0 0 0 0 0 0 0 0 0 ...
## $ ScreenPorch : int 120 0 0 0 144 0 0 0 0 0 ...
## $ PoolArea : int 0 0 0 0 0 0 0 0 0 0 ...
## $ PoolQC : chr NA NA NA NA ...
## $ Fence : chr "MnPrv" NA "MnPrv" NA ...
## $ MiscFeature : chr NA "Gar2" NA NA ...
## $ MiscVal : int 0 12500 0 0 0 0 500 0 0 0 ...
## $ MoSold : int 6 6 3 6 1 4 3 5 2 4 ...
## $ YrSold : int 2010 2010 2010 2010 2010 2010 2010 2010 2010 2010 ...
## $ SaleType : chr "WD" "WD" "WD" "WD" ...
## $ SaleCondition: chr "Normal" "Normal" "Normal" "Normal" ...
# Columns that exist in the training set but not in the test set.
setdiff(names(data), names(test_data))
## [1] "SalePrice"
The dataset contains 1460 observations and 81 variables, including both numerical and categorical features. These variables capture structural, quality, and locational aspects of houses.
# Count the columns of `data` by storage type: numeric vs. character.
num_vars <- sum(vapply(data, is.numeric, logical(1)))
cat_vars <- sum(vapply(data, is.character, logical(1)))
print(num_vars)
## [1] 38
print(cat_vars)
## [1] 43
The dataset includes both numerical and categorical variables, enabling analysis of measurable attributes such as size and price, along with qualitative features such as neighborhood.
# Working training subset: SalePrice plus the predictors used in the analysis.
# The order of names below is the column order of the resulting data frame.
# Missing values in numeric columns are replaced with 0.
df <- data %>%
  select(all_of(c(
    "SalePrice", "GrLivArea", "OverallQual", "YearBuilt",
    "TotalBsmtSF", "GarageArea", "BedroomAbvGr", "FullBath",
    "Neighborhood", "GarageCars", "TotRmsAbvGrd", "LotArea"
  ))) %>%
  mutate(across(where(is.numeric), function(col) replace_na(col, 0)))
# Matching subset of the test set (no SalePrice column is selected here).
# The order of names below is the column order of the resulting data frame.
# Missing values in numeric columns are replaced with 0.
test_df <- test_data %>%
  select(all_of(c(
    "GrLivArea", "OverallQual", "YearBuilt", "TotalBsmtSF",
    "GarageArea", "BedroomAbvGr", "FullBath", "GarageCars",
    "TotRmsAbvGrd", "LotArea", "Neighborhood"
  ))) %>%
  mutate(across(where(is.numeric), function(col) replace_na(col, 0)))
# Shape of the reduced training subset as (rows, columns).
c(nrow(df), ncol(df))
## [1] 1460 12
A refined dataset containing key variables was created to ensure focused and meaningful analysis. This dataset retains the most influential features affecting house prices while eliminating redundant or less relevant variables.
# Number of missing values in each column of the subset.
df %>%
  is.na() %>%
  colSums()
## SalePrice GrLivArea OverallQual YearBuilt TotalBsmtSF GarageArea
## 0 0 0 0 0 0
## BedroomAbvGr FullBath Neighborhood GarageCars TotRmsAbvGrd LotArea
## 0 0 0 0 0 0
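No missing values remain in the selected variables: missing numeric entries were replaced with 0 when the subset was created, and Neighborhood has no missing entries. The full dataset, however, does contain missing values in several other variables (for example LotFrontage, Alley, and PoolQC), so those would need cleaning before being used in analysis.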
# Five-number summary plus mean of the sale price.
summary(df[["SalePrice"]])
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 34900 129975 163000 180921 214000 755000
The summary statistics show a wide range of house prices, with the mean exceeding the median, indicating a right-skewed distribution.
# Append the natural log of SalePrice as a new column.
df$Log_SalePrice <- log(df$SalePrice)
# Original distribution
# 30-bin histogram of SalePrice with comma-formatted x-axis labels.
ggplot(data = df, mapping = aes(x = SalePrice)) +
  geom_histogram(bins = 30, fill = "seagreen", color = "black") +
  scale_x_continuous(labels = scales::comma) +
  ggtitle("Distribution of Sale Price") +
  xlab("Sale Price") +
  ylab("Frequency") +
  theme_minimal()
# Log-transformed distribution
# 30-bin histogram of the Log_SalePrice column.
ggplot(data = df, mapping = aes(x = Log_SalePrice)) +
  geom_histogram(bins = 30, fill = "lightgreen", color = "black") +
  ggtitle("Distribution of Log(SalePrice)") +
  xlab("Log(Sale Price)") +
  ylab("Frequency") +
  theme_minimal()
The distribution of SalePrice is positively skewed, with a concentration of houses in the lower price range and a long right tail representing high-value properties. After applying a log transformation, the distribution becomes more symmetric and less influenced by extreme values. This indicates that the transformation stabilizes the variance and makes the data more suitable for statistical modeling.
# Mean sale price per neighborhood, highest average first.
df %>%
  group_by(Neighborhood) %>%
  summarise(avg_price = mean(SalePrice), .groups = "drop") %>%
  arrange(-avg_price)
## # A tibble: 25 × 2
## Neighborhood avg_price
## <chr> <dbl>
## 1 NoRidge 335295.
## 2 NridgHt 316271.
## 3 StoneBr 310499
## 4 Timber 242247.
## 5 Veenker 238773.
## 6 Somerst 225380.
## 7 ClearCr 212565.
## 8 Crawfor 210625.
## 9 CollgCr 197966.
## 10 Blmngtn 194871.
## # ℹ 15 more rows
House prices vary significantly across neighborhoods, with NoRidge, NridgHt, and StoneBr forming the high-value segment. This indicates that location is a major determinant of property value.
# Quartiles (0%, 25%, 50%, 75%, 100%) of the sale price.
quantile(df[["SalePrice"]])
## 0% 25% 50% 75% 100%
## 34900 129975 163000 214000 755000
The quantile distribution shows that a large proportion of houses fall within lower price ranges, while a smaller fraction occupies the high-value segment.
# Interquartile range (75th minus 25th percentile) of the sale price.
IQR(df[["SalePrice"]])
## [1] 84025
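The interquartile range of 84,025 shows that even the middle 50% of sale prices spans a wide band, indicating substantial variability. Because the IQR ignores the tails, the presence of extreme values is seen instead in the distance between the third quartile (214,000) and the maximum (755,000).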
# Percentage of houses whose sale price exceeds the mean sale price.
with(df, mean(SalePrice > mean(SalePrice)) * 100)
## [1] 38.35616
Approximately 38.36% of houses are priced above the average value, indicating that the majority of houses fall below the mean. This supports the earlier observation of a right-skewed distribution, where a smaller proportion of high-value properties increases the average price disproportionately.
# Scatter plot of SalePrice against GrLivArea; points are semi-transparent
# so overlapping observations remain visible.
ggplot(data = df, mapping = aes(x = GrLivArea, y = SalePrice)) +
  geom_point(alpha = 0.6) +
  scale_y_continuous(labels = scales::comma) +
  ggtitle("SalePrice vs Living Area") +
  xlab("Living Area") +
  ylab("Sale Price") +
  theme_minimal()
The scatter plot shows a clear positive relationship between living area and house price, indicating that larger houses tend to have higher prices. However, the spread of points increases for larger houses, suggesting that size alone does not fully explain price variation. Other factors such as quality and location also influence pricing.
# Mean sale price for each OverallQual level, in ascending quality order.
df %>%
  group_by(OverallQual) %>%
  summarise(avg_price = mean(SalePrice), .groups = "drop") %>%
  arrange(OverallQual)
## # A tibble: 10 × 2
## OverallQual avg_price
## <int> <dbl>
## 1 1 50150
## 2 2 51770.
## 3 3 87474.
## 4 4 108421.
## 5 5 133523.
## 6 6 161603.
## 7 7 207716.
## 8 8 274736.
## 9 9 367513.
## 10 10 438588.
# Box plot of SalePrice for each OverallQual level; the rating is converted
# to a factor so every level gets its own box.
ggplot(data = df, mapping = aes(x = factor(OverallQual), y = SalePrice)) +
  geom_boxplot(fill = "skyblue", color = "black") +
  scale_y_continuous(labels = scales::comma) +
  ggtitle("SalePrice vs Overall Quality") +
  xlab("Overall Quality (1 = Low, 10 = High)") +
  ylab("Sale Price") +
  theme_minimal()
House prices increase sharply with overall quality, with a disproportionately large jump at higher quality levels (8–10). This indicates that premium construction quality commands a significant price premium, making overall quality one of the strongest determinants of house value.
# Mean sale price and number of houses for each BedroomAbvGr value.
df %>%
  group_by(BedroomAbvGr) %>%
  summarise(avg_price = mean(SalePrice), count = n(), .groups = "drop") %>%
  arrange(BedroomAbvGr)
## # A tibble: 8 × 3
## BedroomAbvGr avg_price count
## <int> <dbl> <int>
## 1 0 221493. 6
## 2 1 173162. 50
## 3 2 158198. 358
## 4 3 181057. 804
## 5 4 220421. 213
## 6 5 180819. 21
## 7 6 143779 7
## 8 8 200000 1
The number of bedrooms does not show a consistent relationship with house price. Prices increase up to a certain point but decline for higher bedroom counts, indicating diminishing returns. This suggests that simply increasing the number of bedrooms does not add value unless supported by overall size and quality. In some cases, more bedrooms may reflect inefficient space usage rather than higher property value.
# Mean sale price and number of houses for each FullBath value.
df %>%
  group_by(FullBath) %>%
  summarise(avg_price = mean(SalePrice), count = n(), .groups = "drop") %>%
  arrange(FullBath)
## # A tibble: 4 × 3
## FullBath avg_price count
## <int> <dbl> <int>
## 1 0 165201. 9
## 2 1 134751. 650
## 3 2 213010. 768
## 4 3 347823. 33
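From one to three full bathrooms, average price rises sharply (about 134,751 → 213,010 → 347,823), indicating that additional bathrooms enhance property value by improving functionality and comfort. The only exception is the zero-bathroom group, whose higher average (165,201) is based on just 9 houses and is therefore unreliable. Unlike bedrooms, this relationship is more consistent, suggesting that bathrooms are a more reliable indicator of increased housing value.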
# Neighborhood ranking by mean sale price, highest average first.
df %>%
  group_by(Neighborhood) %>%
  summarise(avg_price = mean(SalePrice), .groups = "drop") %>%
  arrange(-avg_price)
## # A tibble: 25 × 2
## Neighborhood avg_price
## <chr> <dbl>
## 1 NoRidge 335295.
## 2 NridgHt 316271.
## 3 StoneBr 310499
## 4 Timber 242247.
## 5 Veenker 238773.
## 6 Somerst 225380.
## 7 ClearCr 212565.
## 8 Crawfor 210625.
## 9 CollgCr 197966.
## 10 Blmngtn 194871.
## # ℹ 15 more rows
House prices exhibit strong variation across neighborhoods, with a small number of areas such as NoRidge, NridgHt, and StoneBr dominating the high-value segment. This highlights a clear market segmentation, where location creates a significant price premium independent of structural features.
# Average sale price and house count per age band, with age measured in
# years relative to 2010.
df %>%
  mutate(
    Age = 2010 - YearBuilt,
    # cut() builds right-closed intervals by default, so the first band would
    # be (0,20] and a house with Age == 0 (built in 2010) would become NA.
    # include.lowest = TRUE closes the first band on the left ([0,20]) so
    # every house is assigned to a band.
    AgeGroup = cut(
      Age,
      breaks = c(0, 20, 40, 60, 80, 150),
      include.lowest = TRUE
    )
  ) %>%
  group_by(AgeGroup) %>%
  summarise(avg_price = mean(SalePrice), count = n())
## # A tibble: 6 × 3
## AgeGroup avg_price count
## <fct> <dbl> <int>
## 1 (0,20] 238003. 550
## 2 (20,40] 161954. 249
## 3 (40,60] 147545. 342
## 4 (60,80] 134004. 133
## 5 (80,150] 131022. 185
## 6 <NA> 394432 1
House prices decline steadily with increasing age, indicating that newer houses are valued higher in the market. The largest drop occurs between the (0,20] and (20,40] groups, while average prices level off for houses older than 60 years. This flattening suggests that age alone does not determine value, as older houses can still achieve high prices if they possess strong attributes such as superior construction quality or prime location.
House prices are influenced by multiple structural and locational factors, with overall quality and neighborhood emerging as the strongest determinants. While features such as bathrooms show a consistent positive relationship with price, others like bedroom count demonstrate diminishing or inconsistent impact, highlighting the importance of quality and efficient space utilization over simple quantity-based measures.
# Mean sale price and number of houses per neighborhood, highest average
# first.
df %>%
  group_by(Neighborhood) %>%
  summarise(avg_price = mean(SalePrice), count = n(), .groups = "drop") %>%
  arrange(-avg_price)
## # A tibble: 25 × 3
## Neighborhood avg_price count
## <chr> <dbl> <int>
## 1 NoRidge 335295. 41
## 2 NridgHt 316271. 77
## 3 StoneBr 310499 25
## 4 Timber 242247. 38
## 5 Veenker 238773. 11
## 6 Somerst 225380. 86
## 7 ClearCr 212565. 28
## 8 Crawfor 210625. 51
## 9 CollgCr 197966. 150
## 10 Blmngtn 194871. 17
## # ℹ 15 more rows
A small number of neighborhoods such as NoRidge, NridgHt, and StoneBr dominate the high-price segment of the housing market. These areas not only have the highest average prices but also represent a distinct premium category. The distribution of prices across neighborhoods indicates a strong location-based hierarchy, where a limited number of areas command a disproportionate share of high-value properties.
# The five houses with the largest GrLivArea, shown with their price,
# neighborhood and quality rating.
df %>%
  select(GrLivArea, SalePrice, Neighborhood, OverallQual) %>%
  arrange(desc(GrLivArea)) %>%
  head(n = 5)
## GrLivArea SalePrice Neighborhood OverallQual
## 1 5642 160000 Edwards 10
## 2 4676 184750 Edwards 10
## 3 4476 745000 NoRidge 10
## 4 4316 755000 NoRidge 10
## 5 3627 625000 NoRidge 10
The largest houses do not consistently correspond to the highest prices, indicating that size alone is not a sufficient determinant of house value. While some large houses are highly priced, others are relatively lower in value, suggesting that factors such as neighborhood and overall quality significantly influence pricing. This highlights that the impact of size on price is conditional rather than absolute.
# Total_SF: sum of the GrLivArea and TotalBsmtSF columns.
df$Total_SF <- df$GrLivArea + df$TotalBsmtSF
# Price_per_sqft: SalePrice divided by Total_SF.
df$Price_per_sqft <- df$SalePrice / df$Total_SF
# The five houses with the highest Price_per_sqft.
df %>%
  arrange(desc(Price_per_sqft)) %>%
  select(Neighborhood, SalePrice, Total_SF, Price_per_sqft) %>%
  head(n = 5)
## Neighborhood SalePrice Total_SF Price_per_sqft
## 1 StoneBr 392000 2838 138.1254
## 2 NridgHt 611657 4694 130.3061
## 3 NAmes 107500 827 129.9879
## 4 NridgHt 582933 4556 127.9484
## 5 NAmes 106500 882 120.7483
The houses with the highest price per square foot fall into two groups: high-priced properties in premium neighborhoods such as StoneBr and NridgHt, and very small houses (under 900 square feet of total area) in NAmes, where a modest price is divided by a very small area. This indicates that price efficiency is not driven by size alone: location and construction quality matter, but the ratio is also inflated for unusually small houses. Smaller houses in desirable areas can achieve higher value per square foot compared to larger properties in less favorable locations.
The ranking and comparison analysis reveals that house prices are not determined solely by size. While larger houses may have higher total prices, price efficiency and overall valuation are strongly influenced by location and construction quality. This reinforces that high-value properties are defined by a combination of factors rather than a single attribute.
# Engineered features added to df:
#   Total_SF       - GrLivArea plus TotalBsmtSF
#   Price_per_sqft - SalePrice divided by Total_SF
#   Price_per_room - SalePrice divided by TotRmsAbvGrd
#   Age            - 2024 minus YearBuilt
# NOTE(review): Age is measured from 2024 here, whereas the age-group price
# summary earlier in this file uses 2010 - YearBuilt; confirm which reference
# year is intended before comparing the two age tables.
df <- mutate(
  df,
  Total_SF = GrLivArea + TotalBsmtSF,
  Price_per_sqft = SalePrice / Total_SF,
  Price_per_room = SalePrice / TotRmsAbvGrd,
  Age = 2024 - YearBuilt
)
# Five-number summary plus mean of the price per square foot.
summary(df[["Price_per_sqft"]])
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 13.61 60.33 69.51 69.81 78.93 138.13
# 30-bin histogram of the Price_per_sqft column.
ggplot(data = df, mapping = aes(x = Price_per_sqft)) +
  geom_histogram(bins = 30, fill = "steelblue", color = "black") +
  ggtitle("Distribution of Price per Sqft") +
  xlab("Price per Sqft") +
  ylab("Frequency") +
  theme_minimal()
Price per square foot shows substantial variation across houses, indicating differences in pricing efficiency. While most houses fall within a moderate range, a small number of properties exhibit extremely high values, reflecting premium-quality houses in desirable locations. This suggests that pricing efficiency is influenced more by quality and location than by size alone.
# Number of houses per age band, using the Age column of df.
df %>%
  mutate(
    # An upper break of 150 turned any house older than 150 years into NA.
    # Using Inf as the last break keeps such houses in the oldest band, and
    # include.lowest = TRUE keeps Age == 0 in the first band, so no house
    # is dropped from the table.
    AgeGroup = cut(
      Age,
      breaks = c(0, 20, 40, 60, 80, Inf),
      include.lowest = TRUE
    )
  ) %>%
  group_by(AgeGroup) %>%
  summarise(count = n())
## # A tibble: 6 × 2
## AgeGroup count
## <fct> <int>
## 1 (0,20] 276
## 2 (20,40] 311
## 3 (40,60] 322
## 4 (60,80] 277
## 5 (80,150] 273
## 6 <NA> 1
House age (measured here relative to 2024) is spread fairly evenly across the age groups, with each band containing roughly 270–320 houses and only a slight concentration in the 20–60 year range. This suggests a mature housing market with a balanced mix of newer and older properties. The presence of both newer and older houses provides diversity, enabling analysis of how age interacts with other factors such as quality and location.
# Five-number summary plus mean of the price per room.
summary(df[["Price_per_room"]])
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 6317 21943 26467 27909 32378 78500
Price per room varies significantly across houses, indicating that room count alone does not determine property value. Houses with fewer rooms can exhibit higher price per room if they are of superior quality or located in premium neighborhoods. This highlights that value is driven more by quality and efficient space utilization than by the number of rooms.
# Five-number summary plus mean of the combined Total_SF column.
summary(df[["Total_SF"]])
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 334 2014 2479 2573 3008 11752
Total usable space varies widely across houses, indicating the presence of both compact homes and extremely large luxury properties. While most houses fall within a moderate size range, the presence of very large houses suggests a segment of high-end properties. However, large size alone does not guarantee higher value, reinforcing that space must be considered alongside quality and location.
The engineered features provide deeper insight into pricing dynamics, showing that efficiency-based measures such as price per square foot and price per room reveal variations that are not captured by total price alone. These features highlight that house value is influenced not just by size, but by how effectively space is utilized, along with quality and location.