This project predicts house prices in the King County housing dataset by fitting two models, a linear regression and a random forest, and comparing their error on held-out data.
# Read the raw records from the CSV in the working directory.
data <- read.csv("kc_house_data.csv")
# Drop the `id` and `date` columns so they are not used as predictors.
data <- select(data, -c(id, date))
# Per-column five-number summary and mean, as a quick sanity check.
summary(data)
## price bedrooms bathrooms sqft_living
## Min. : 75000 Min. : 0.000 Min. :0.000 Min. : 290
## 1st Qu.: 321950 1st Qu.: 3.000 1st Qu.:1.750 1st Qu.: 1427
## Median : 450000 Median : 3.000 Median :2.250 Median : 1910
## Mean : 540088 Mean : 3.371 Mean :2.115 Mean : 2080
## 3rd Qu.: 645000 3rd Qu.: 4.000 3rd Qu.:2.500 3rd Qu.: 2550
## Max. :7700000 Max. :33.000 Max. :8.000 Max. :13540
## sqft_lot floors waterfront view
## Min. : 520 Min. :1.000 Min. :0.000000 Min. :0.0000
## 1st Qu.: 5040 1st Qu.:1.000 1st Qu.:0.000000 1st Qu.:0.0000
## Median : 7618 Median :1.500 Median :0.000000 Median :0.0000
## Mean : 15107 Mean :1.494 Mean :0.007542 Mean :0.2343
## 3rd Qu.: 10688 3rd Qu.:2.000 3rd Qu.:0.000000 3rd Qu.:0.0000
## Max. :1651359 Max. :3.500 Max. :1.000000 Max. :4.0000
## condition grade sqft_above sqft_basement
## Min. :1.000 Min. : 1.000 Min. : 290 Min. : 0.0
## 1st Qu.:3.000 1st Qu.: 7.000 1st Qu.:1190 1st Qu.: 0.0
## Median :3.000 Median : 7.000 Median :1560 Median : 0.0
## Mean :3.409 Mean : 7.657 Mean :1788 Mean : 291.5
## 3rd Qu.:4.000 3rd Qu.: 8.000 3rd Qu.:2210 3rd Qu.: 560.0
## Max. :5.000 Max. :13.000 Max. :9410 Max. :4820.0
## yr_built yr_renovated zipcode lat
## Min. :1900 Min. : 0.0 Min. :98001 Min. :47.16
## 1st Qu.:1951 1st Qu.: 0.0 1st Qu.:98033 1st Qu.:47.47
## Median :1975 Median : 0.0 Median :98065 Median :47.57
## Mean :1971 Mean : 84.4 Mean :98078 Mean :47.56
## 3rd Qu.:1997 3rd Qu.: 0.0 3rd Qu.:98118 3rd Qu.:47.68
## Max. :2015 Max. :2015.0 Max. :98199 Max. :47.78
## long sqft_living15 sqft_lot15
## Min. :-122.5 Min. : 399 Min. : 651
## 1st Qu.:-122.3 1st Qu.:1490 1st Qu.: 5100
## Median :-122.2 Median :1840 Median : 7620
## Mean :-122.2 Mean :1987 Mean : 12768
## 3rd Qu.:-122.1 3rd Qu.:2360 3rd Qu.: 10083
## Max. :-121.3 Max. :6210 Max. :871200
# Correlation matrix of the numeric columns, drawn as a colour-coded grid.
# `select(where(is.numeric))` replaces the superseded `select_if()` and picks
# the same columns.
corrplot(
  cor(data %>% select(where(is.numeric))),
  method = "color"
)
# Histogram of the `price` column using 50 bins.
ggplot(data, aes(x = price)) +
  geom_histogram(bins = 50, fill = "steelblue", color = "white")
# Fix the RNG seed so the partition below is reproducible.
set.seed(123)
# Row indices for the training set; p = 0.8 requests 80% of rows, and
# list = FALSE returns the indices in a form usable for row subsetting.
split <- createDataPartition(y = data[["price"]], p = 0.8, list = FALSE)
# Rows selected by `split` form the training set; all other rows form the
# test set.
train <- data[split, ]
test <- data[-split, ]
# Baseline model: linear regression of `price` on every other column of
# `train`.
# NOTE(review): the summary means suggest sqft_living = sqft_above +
# sqft_basement; if that holds exactly, one coefficient is aliased and
# predict() may warn about a rank-deficient fit. Confirm with
# summary(lm_model).
lm_model <- lm(formula = price ~ ., data = train)
# Predict on the held-out rows and score against the observed prices.
pred_lm <- predict(lm_model, newdata = test)
rmse_lm <- rmse(test[["price"]], pred_lm)
rmse_lm
## [1] 208007.3
# Random forest of 100 trees on the same formula and training rows.
# No seed is set here, so the fit depends on the RNG state left by the
# earlier set.seed(123) and the statements that ran after it.
rf_model <- randomForest(
  price ~ .,
  data = train,
  ntree = 100
)
# Score on the same held-out rows used for the linear model.
pred_rf <- predict(rf_model, newdata = test)
rmse_rf <- rmse(test[["price"]], pred_rf)
rmse_rf
## [1] 137859.4
# Side-by-side test-set RMSE for the two fitted models.
results <- data.frame(
  Model = c("Linear Regression", "Random Forest"),
  RMSE = c(rmse_lm, rmse_rf)
)
results
## Model RMSE
## 1 Linear Regression 208007.3
## 2 Random Forest 137859.4
Random Forest achieves a lower RMSE than Linear Regression on the held-out test set (about 137,859 vs. 208,007), indicating better performance in predicting house prices.