Introduction

This project predicts house sale prices in the King County housing dataset (`kc_house_data.csv`) and compares two models: a linear regression and a random forest.

Data Preparation

# Load the King County sales records and, in the same pipeline, drop the
# record identifier and the sale date, which are not used as predictors.
data <- read.csv("kc_house_data.csv") %>%
  select(-c(id, date))

# Per-column summary statistics as a quick sanity check on ranges.
summary(data)
##      price            bedrooms        bathrooms      sqft_living   
##  Min.   :  75000   Min.   : 0.000   Min.   :0.000   Min.   :  290  
##  1st Qu.: 321950   1st Qu.: 3.000   1st Qu.:1.750   1st Qu.: 1427  
##  Median : 450000   Median : 3.000   Median :2.250   Median : 1910  
##  Mean   : 540088   Mean   : 3.371   Mean   :2.115   Mean   : 2080  
##  3rd Qu.: 645000   3rd Qu.: 4.000   3rd Qu.:2.500   3rd Qu.: 2550  
##  Max.   :7700000   Max.   :33.000   Max.   :8.000   Max.   :13540  
##     sqft_lot           floors        waterfront            view       
##  Min.   :    520   Min.   :1.000   Min.   :0.000000   Min.   :0.0000  
##  1st Qu.:   5040   1st Qu.:1.000   1st Qu.:0.000000   1st Qu.:0.0000  
##  Median :   7618   Median :1.500   Median :0.000000   Median :0.0000  
##  Mean   :  15107   Mean   :1.494   Mean   :0.007542   Mean   :0.2343  
##  3rd Qu.:  10688   3rd Qu.:2.000   3rd Qu.:0.000000   3rd Qu.:0.0000  
##  Max.   :1651359   Max.   :3.500   Max.   :1.000000   Max.   :4.0000  
##    condition         grade          sqft_above   sqft_basement   
##  Min.   :1.000   Min.   : 1.000   Min.   : 290   Min.   :   0.0  
##  1st Qu.:3.000   1st Qu.: 7.000   1st Qu.:1190   1st Qu.:   0.0  
##  Median :3.000   Median : 7.000   Median :1560   Median :   0.0  
##  Mean   :3.409   Mean   : 7.657   Mean   :1788   Mean   : 291.5  
##  3rd Qu.:4.000   3rd Qu.: 8.000   3rd Qu.:2210   3rd Qu.: 560.0  
##  Max.   :5.000   Max.   :13.000   Max.   :9410   Max.   :4820.0  
##     yr_built     yr_renovated       zipcode           lat       
##  Min.   :1900   Min.   :   0.0   Min.   :98001   Min.   :47.16  
##  1st Qu.:1951   1st Qu.:   0.0   1st Qu.:98033   1st Qu.:47.47  
##  Median :1975   Median :   0.0   Median :98065   Median :47.57  
##  Mean   :1971   Mean   :  84.4   Mean   :98078   Mean   :47.56  
##  3rd Qu.:1997   3rd Qu.:   0.0   3rd Qu.:98118   3rd Qu.:47.68  
##  Max.   :2015   Max.   :2015.0   Max.   :98199   Max.   :47.78  
##       long        sqft_living15    sqft_lot15    
##  Min.   :-122.5   Min.   : 399   Min.   :   651  
##  1st Qu.:-122.3   1st Qu.:1490   1st Qu.:  5100  
##  Median :-122.2   Median :1840   Median :  7620  
##  Mean   :-122.2   Mean   :1987   Mean   : 12768  
##  3rd Qu.:-122.1   3rd Qu.:2360   3rd Qu.: 10083  
##  Max.   :-121.3   Max.   :6210   Max.   :871200

Exploratory Data Analysis (EDA)

# Correlation matrix of all numeric columns, drawn as a colour-coded grid.
# `select(where(is.numeric))` replaces the superseded `select_if()` scoped
# verb; it selects the same columns.
data %>%
  select(where(is.numeric)) %>%
  cor() %>%
  corrplot(method = "color")

# Distribution of the response variable (sale price), 50 bins.
ggplot(data, aes(price)) +
  geom_histogram(bins = 50, fill = "steelblue", color = "white")

Data Splitting

# Fix the RNG seed so the partition below is the same on every run.
set.seed(123)

# Row indices for the training set: 80% of the rows, sampled by caret with
# respect to `price`. `list = FALSE` returns the indices as a matrix rather
# than a list, so they can be used directly for row subsetting.
split <- createDataPartition(
  y = data$price,
  p = 0.8,
  list = FALSE
)

# Selected rows form the training set; all remaining rows form the test set.
train <- data[split, ]
test <- data[-split, ]

Model Building and Evaluation

Linear Regression

# Fit an ordinary least-squares model of price on the other columns.
#
# `sqft_basement` is excluded: in this dataset `sqft_living` appears to equal
# `sqft_above + sqft_basement` (the summary means agree: 2080 ~ 1788 + 291.5),
# so keeping all three makes the design matrix rank-deficient. lm() then
# reports an NA coefficient for the redundant term and predict() warns about
# a rank-deficient fit. Dropping the redundant column avoids both; if the
# identity holds exactly, the fitted values and predictions are unchanged.
lm_model <- lm(price ~ . - sqft_basement, data = train)

# Predict prices for the held-out rows.
pred_lm <- predict(lm_model, newdata = test)

# Test-set RMSE; rmse() is called with observed prices first, then predictions.
rmse_lm <- rmse(test$price, pred_lm)
rmse_lm
## [1] 208007.3

Random Forest

# Fit a random forest of 100 trees using every remaining column as a
# predictor of price.
rf_model <- randomForest(
  price ~ .,
  data = train,
  ntree = 100
)

# Predict prices for the held-out rows.
pred_rf <- predict(rf_model, newdata = test)

# Test-set RMSE, computed the same way as for the linear model.
rmse_rf <- rmse(test$price, pred_rf)
rmse_rf
## [1] 137859.4

Model Comparison

# Side-by-side table of the test-set RMSE for each model (lower is better).
results <- data.frame(
  Model = c("Linear Regression", "Random Forest"),
  RMSE = c(rmse_lm, rmse_rf)
)
results
##               Model     RMSE
## 1 Linear Regression 208007.3
## 2     Random Forest 137859.4

Conclusion

On the held-out 20% test set, Random Forest achieves a lower RMSE than Linear Regression (about 137,859 vs. 208,007), indicating better performance in predicting house prices.