The goal of this project is to build machine learning models that predict house sale prices in King County, USA. The data are available on Kaggle and can be downloaded through the following link; a description of the dataset is available here.
library(tidyverse)
library(caret)
library(corrplot) # correlation plot
library(MASS) # Box-Cox transformation
library(car) # VIF
library(glmnet) # LASSO, ridge
library(xgboost)
data <- read.csv("kc_house_data.csv")
str(data)
## 'data.frame': 21613 obs. of 21 variables:
## $ id : num 7.13e+09 6.41e+09 5.63e+09 2.49e+09 1.95e+09 ...
## $ date : chr "20141013T000000" "20141209T000000" "20150225T000000" "20141209T000000" ...
## $ price : num 221900 538000 180000 604000 510000 ...
## $ bedrooms : int 3 3 2 4 3 4 3 3 3 3 ...
## $ bathrooms : num 1 2.25 1 3 2 4.5 2.25 1.5 1 2.5 ...
## $ sqft_living : int 1180 2570 770 1960 1680 5420 1715 1060 1780 1890 ...
## $ sqft_lot : int 5650 7242 10000 5000 8080 101930 6819 9711 7470 6560 ...
## $ floors : num 1 2 1 1 1 1 2 1 1 2 ...
## $ waterfront : int 0 0 0 0 0 0 0 0 0 0 ...
## $ view : int 0 0 0 0 0 0 0 0 0 0 ...
## $ condition : int 3 3 3 5 3 3 3 3 3 3 ...
## $ grade : int 7 7 6 7 8 11 7 7 7 7 ...
## $ sqft_above : int 1180 2170 770 1050 1680 3890 1715 1060 1050 1890 ...
## $ sqft_basement: int 0 400 0 910 0 1530 0 0 730 0 ...
## $ yr_built : int 1955 1951 1933 1965 1987 2001 1995 1963 1960 2003 ...
## $ yr_renovated : int 0 1991 0 0 0 0 0 0 0 0 ...
## $ zipcode : int 98178 98125 98028 98136 98074 98053 98003 98198 98146 98038 ...
## $ lat : num 47.5 47.7 47.7 47.5 47.6 ...
## $ long : num -122 -122 -122 -122 -122 ...
## $ sqft_living15: int 1340 1690 2720 1360 1800 4760 2238 1650 1780 2390 ...
## $ sqft_lot15 : int 5650 7639 8062 5000 7503 101930 6819 9711 8113 7570 ...
data <- data %>% dplyr::select(-c(date))
corrplot(cor(data), method="number", type="lower")
# Convert the integer-coded categorical variables to factors
newdata <- data %>% mutate_at(c("waterfront", "view", "condition"), as.factor)
# If a house was renovated, treat the renovation year as its effective build year,
# then derive age from it
for (i in 1:nrow(newdata)){
  if (newdata$yr_renovated[i] != 0)
    newdata$yr_built[i] <- newdata$yr_renovated[i]
}
newdata$age <- 2020 - newdata$yr_built
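# Aside: the renovation loop above can equivalently be written as a vectorized
# one-liner with ifelse (a sketch; re-running it here leaves the data unchanged)
newdata$yr_built <- ifelse(newdata$yr_renovated != 0, newdata$yr_renovated, newdata$yr_built)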
newdata <- newdata %>% dplyr::select(price, bedrooms, bathrooms, sqft_living, sqft_lot, floors, waterfront,
view, condition, age, lat, long)
str(newdata)
## 'data.frame': 21613 obs. of 12 variables:
## $ price : num 221900 538000 180000 604000 510000 ...
## $ bedrooms : int 3 3 2 4 3 4 3 3 3 3 ...
## $ bathrooms : num 1 2.25 1 3 2 4.5 2.25 1.5 1 2.5 ...
## $ sqft_living: int 1180 2570 770 1960 1680 5420 1715 1060 1780 1890 ...
## $ sqft_lot : int 5650 7242 10000 5000 8080 101930 6819 9711 7470 6560 ...
## $ floors : num 1 2 1 1 1 1 2 1 1 2 ...
## $ waterfront : Factor w/ 2 levels "0","1": 1 1 1 1 1 1 1 1 1 1 ...
## $ view : Factor w/ 5 levels "0","1","2","3",..: 1 1 1 1 1 1 1 1 1 1 ...
## $ condition : Factor w/ 5 levels "1","2","3","4",..: 3 3 3 5 3 3 3 3 3 3 ...
## $ age : num 65 29 87 55 33 19 25 57 60 17 ...
## $ lat : num 47.5 47.7 47.7 47.5 47.6 ...
## $ long : num -122 -122 -122 -122 -122 ...
# Remove any rows with missing values; in this case there are none.
newdata <- newdata[rowSums(is.na(newdata))==0,]
# Data visualization
plot(price~sqft_living, data=newdata, main="Price vs. Sqft_living",
xlab="Sqft_living", ylab="Price", xlim=c(0,8000), ylim=c(0,6000000))
boxplot(price~bedrooms, data=newdata, col=(c(2:8)), main="Price vs. Bedrooms",
xlab="Bedrooms", ylab="Price")
boxplot(price~bathrooms, data=newdata, col=(c(2:8)), main="Price vs. Bathrooms",
xlab="Bathrooms", ylab="Price")
# Preprocessing: scale and center
params <- preProcess(newdata[c(-1)], method=c("center", "scale"))
newdata <- predict(params, newdata)
# Train-Test Split
set.seed(123456)
index <- sample(1:nrow(newdata), 0.8*nrow(newdata))
train <- newdata[index,]
validation <- newdata[-index,]
# Evaluation metrics
# Root mean squared error between actual and predicted values
RMSE <- function(actual, pred){
  sqrt(mean((actual - pred)^2))
}
# Coefficient of determination (R^2)
R2 <- function(actual, pred){
  rss <- sum((pred - actual)^2)
  tss <- sum((actual - mean(actual))^2)
  1 - rss/tss
}
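# Quick sanity check of the two metrics on an arbitrary toy example:
RMSE(c(1, 2, 3), c(1, 2, 4)) # sqrt(1/3), about 0.577
R2(c(1, 2, 3), c(1, 2, 4))   # 1 - 1/2 = 0.5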
# Linear Regression
lm.fit <- lm(price~., data=newdata)
# summary(lm.fit)
par(mfrow=c(1,2))
# Diagnostics: price is strongly right-skewed and the residuals fan out, so a transformation of the response is needed
qqnorm(newdata$price)
qqline(newdata$price, col=2)
lm.res <- resid(lm.fit)
plot(lm.fit$fitted.values, lm.res, xlab="Fitted value", ylab="Residuals")
abline(h=0,col="red")
# Box-Cox transformation
par(mfrow=c(1,1))
set.seed(123456)
bc <- boxcox(lm.fit, lambda=seq(-1,1,0.1))
best.lam <- bc$x[which(bc$y==max(bc$y))]
best.lam
## [1] -0.03030303
lm.fit <- lm((price)^best.lam~., data=newdata)
par(mfrow=c(1,2))
qqnorm(newdata$price^best.lam)
qqline(newdata$price^best.lam, col=2)
lm.res <- resid(lm.fit)
plot(lm.fit$fitted.values, lm.res, xlab="Fitted value", ylab="Residuals")
abline(h=0,col="red")
par(mfrow=c(1,1))
# Apply the Box-Cox transformation to the training and validation sets
newtrain <- train
newval <- validation
newtrain$price <- (newtrain$price)^best.lam
newval$price <- (newval$price)^best.lam
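# Note on back-transformation: the Box-Cox power transform used here is
# z = price^lambda, so the original scale is recovered with price = z^(1/lambda).
# The expression exp(log(z)/lambda) used later is algebraically identical;
# a quick check on an arbitrary example price of 450000:
z.check <- 450000^best.lam
all.equal(exp(log(z.check)/best.lam), z.check^(1/best.lam))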
# 10-fold CV
train.control <- trainControl(method="cv", number=10,
savePredictions="all")
lm.model <- train(price~., data=newtrain, method="lm", trControl=train.control)
print(lm.model)
## Linear Regression
##
## 17290 samples
## 11 predictor
##
## No pre-processing
## Resampling: Cross-Validated (10 fold)
## Summary of sample sizes: 15562, 15560, 15560, 15561, 15562, 15562, ...
## Resampling results:
##
## RMSE Rsquared MAE
## 0.005860284 0.7014231 0.004494142
##
## Tuning parameter 'intercept' was held constant at a value of TRUE
summary(lm.model)
##
## Call:
## lm(formula = .outcome ~ ., data = dat)
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.029777 -0.003559 0.000002 0.003621 0.063396
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 6.801e-01 1.174e-03 579.146 < 2e-16 ***
## bedrooms 5.713e-04 5.590e-05 10.220 < 2e-16 ***
## bathrooms -1.169e-03 8.052e-05 -14.520 < 2e-16 ***
## sqft_living -6.094e-03 7.609e-05 -80.091 < 2e-16 ***
## sqft_lot -2.849e-04 4.688e-05 -6.076 1.26e-09 ***
## floors -1.083e-03 5.549e-05 -19.512 < 2e-16 ***
## waterfront1 -7.543e-03 6.484e-04 -11.633 < 2e-16 ***
## view1 -4.720e-03 3.689e-04 -12.795 < 2e-16 ***
## view2 -3.567e-03 2.224e-04 -16.042 < 2e-16 ***
## view3 -5.031e-03 3.025e-04 -16.634 < 2e-16 ***
## view4 -6.089e-03 4.826e-04 -12.618 < 2e-16 ***
## condition2 -2.005e-03 1.271e-03 -1.578 0.114695
## condition3 -5.515e-03 1.176e-03 -4.689 2.77e-06 ***
## condition4 -6.983e-03 1.176e-03 -5.937 2.97e-09 ***
## condition5 -8.040e-03 1.184e-03 -6.792 1.14e-11 ***
## age -7.696e-04 6.544e-05 -11.761 < 2e-16 ***
## lat -4.365e-03 4.630e-05 -94.288 < 2e-16 ***
## long 1.955e-04 5.076e-05 3.853 0.000117 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.005855 on 17272 degrees of freedom
## Multiple R-squared: 0.7021, Adjusted R-squared: 0.7018
## F-statistic: 2395 on 17 and 17272 DF, p-value: < 2.2e-16
lm.fit <- lm(price~., data=newtrain)
vif(lm.fit)
## GVIF Df GVIF^(1/(2*Df))
## bedrooms 1.617188 1 1.271687
## bathrooms 3.296910 1 1.815740
## sqft_living 2.946622 1 1.716573
## sqft_lot 1.105054 1 1.051216
## floors 1.561581 1 1.249633
## waterfront 1.618351 1 1.272144
## view 1.827844 4 1.078307
## condition 1.330435 4 1.036333
## age 2.176354 1 1.475247
## lat 1.078977 1 1.038738
## long 1.303349 1 1.141643
# lm prediction
lm.pred <- predict(lm.fit, newdata=newval[c(-1)])
# Back-transform the validation prices to the original dollar scale (inverse Box-Cox)
y <- exp(log(newval$price)/(best.lam))
# Back-transform the linear model predictions to the original dollar scale
yhat.lm <- as.numeric(exp(log(lm.pred)/(best.lam)))
lm.R2 <- summary(lm.fit)$r.squared
lm.R2
## [1] 0.7021131
lm.RMSE <- RMSE(y, yhat.lm)
lm.RMSE
## [1] 238338.4
X <- model.matrix(price~., newtrain)[,-1]
Y <- newtrain$price
X.test <- model.matrix(price~., newval)[,-1]
lasso <- glmnet(X, Y, alpha=1)
# Coefficient 3 (sqft_living) is the last to shrink to zero; coefficient 16 (lat) is second-to-last
plot(lasso, xvar="lambda", label=TRUE)
lasso.cv <- cv.glmnet(X, Y, type.measure="mse", alpha=1)
plot(lasso.cv)
lambda_min_lasso <- lasso.cv$lambda.min
lambda_min_lasso
## [1] 1.455347e-05
lasso.fit <- glmnet(X, Y, alpha=1, lambda=lambda_min_lasso)
coef(lasso.fit)
## 18 x 1 sparse Matrix of class "dgCMatrix"
## s0
## (Intercept) 0.6745520873
## bedrooms 0.0005317245
## bathrooms -0.0011440725
## sqft_living -0.0060852764
## sqft_lot -0.0002667097
## floors -0.0010588593
## waterfront1 -0.0074725807
## view1 -0.0045969511
## view2 -0.0035213889
## view3 -0.0049699748
## view4 -0.0059904676
## condition2 0.0033339876
## condition3 .
## condition4 -0.0014488235
## condition5 -0.0024996432
## age -0.0007290651
## lat -0.0043585236
## long 0.0001871925
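# Only one coefficient (condition3) is shrunk exactly to zero at lambda.min; the
# retained coefficients can be pulled out directly, e.g.:
lasso.coefs <- as.matrix(coef(lasso.fit))
lasso.coefs[lasso.coefs[, 1] != 0, , drop = FALSE]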
lasso.pred <- as.numeric(predict(lasso.fit, newx=X.test))
lasso.R2 <- R2(newval$price, lasso.pred)
lasso.R2
## [1] 0.7046674
# Back-transform the LASSO predictions to the original dollar scale
yhat.lasso <- as.numeric(exp(log(lasso.pred)/(best.lam)))
lasso.RMSE <- RMSE(y, yhat.lasso)
lasso.RMSE
## [1] 236769.2
ridge <- glmnet(X, Y, alpha=0)
plot(ridge, xvar="lambda", label=TRUE)
ridge.fit <- cv.glmnet(X, Y, type.measure="mse", alpha=0)
lambda_min_ridge <- ridge.fit$lambda.min
lambda_min_ridge
## [1] 0.0007413738
ridge.pred <- as.numeric(predict(ridge.fit, s=lambda_min_ridge, newx=X.test))
ridge.R2 <- R2(newval$price, ridge.pred)
ridge.R2
## [1] 0.7016186
# Back-transform the ridge predictions to the original dollar scale
yhat.ridge <- as.numeric(exp(log(ridge.pred)/(best.lam)))
ridge.RMSE <- RMSE(y, yhat.ridge)
ridge.RMSE
## [1] 221165
# One-hot encoding
dmy <- dummyVars(" ~ .", data=train)
trainOneHot <- data.frame(predict(dmy, newdata=train))
xx <- as.matrix(dplyr::select(trainOneHot, -price))
yy <- trainOneHot$price
train_control <- trainControl(
method="cv",
number=10,
savePredictions="all"
)
set.seed(123456)
# tuneLength specifies the number of candidate cp values to evaluate
regTree <- train(x=xx, y=yy, trControl=train_control, method="rpart",
tuneLength=10)
print(regTree)
## CART
##
## 17290 samples
## 20 predictor
##
## No pre-processing
## Resampling: Cross-Validated (10 fold)
## Summary of sample sizes: 15562, 15560, 15561, 15560, 15562, 15560, ...
## Resampling results across tuning parameters:
##
## cp RMSE Rsquared MAE
## 0.01341019 224030.2 0.6268869 136550.6
## 0.01442151 226002.1 0.6198600 139024.2
## 0.01551795 231596.1 0.6021456 142846.0
## 0.02359081 239975.1 0.5720958 147953.3
## 0.02801113 242928.6 0.5617225 149642.2
## 0.03248538 250590.2 0.5321393 152205.9
## 0.05987884 260603.7 0.4948882 158774.1
## 0.06608682 276285.9 0.4312377 168165.4
## 0.09080874 289402.1 0.3747255 184382.2
## 0.31208437 333359.4 0.2888034 216409.3
##
## RMSE was used to select the optimal model using the smallest value.
## The final value used for the model was cp = 0.01341019.
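# To inspect the selected tree itself, the final rpart model stored by caret can
# be plotted; a minimal sketch assuming the rpart.plot package is installed
# (it is not among the libraries loaded above):
library(rpart.plot)
rpart.plot(regTree$finalModel)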
# Default Hyperparameters
grid_default <- expand.grid(
nrounds = 100,
max_depth = 6,
eta = 0.3,
gamma = 0,
colsample_bytree = 1,
min_child_weight = 1,
subsample = 1
)
train_control <- trainControl(
method = "cv",
verboseIter = FALSE,
allowParallel = TRUE,
number=10,
savePredictions="all"
)
set.seed(123456)
xgb_base <- train(
price~.,
data=train,
trControl = train_control,
tuneGrid = grid_default,
method = "xgbTree",
verbose = TRUE
)
## [23:14:56] WARNING: amalgamation/../src/objective/regression_obj.cu:174: reg:linear is now deprecated in favor of reg:squarederror.
print(xgb_base)
## eXtreme Gradient Boosting
##
## 17290 samples
## 11 predictor
##
## No pre-processing
## Resampling: Cross-Validated (10 fold)
## Summary of sample sizes: 15562, 15560, 15561, 15560, 15562, 15560, ...
## Resampling results:
##
## RMSE Rsquared MAE
## 140488.3 0.8544111 74309.33
##
## Tuning parameter 'nrounds' was held constant at a value of 100
## Tuning parameters 'max_depth', 'eta', 'gamma', 'colsample_bytree',
## 'min_child_weight', and 'subsample' were likewise held constant at the
## values given in grid_default above
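# The default grid above could be expanded to tune the learning rate and tree
# depth as well; a minimal sketch (not evaluated in this report, since training
# over this grid is considerably slower):
grid_tuned <- expand.grid(
  nrounds = c(100, 200),
  max_depth = c(4, 6, 8),
  eta = c(0.05, 0.1, 0.3),
  gamma = 0,
  colsample_bytree = 1,
  min_child_weight = 1,
  subsample = 1
)
xgb_tuned <- train(price~., data=train, trControl=train_control,
                   tuneGrid=grid_tuned, method="xgbTree", verbose=TRUE)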
xgb.pred <- predict(xgb_base, newdata=validation)
xgb.R2 <- R2(validation$price, xgb.pred)
xgb.R2
## [1] 0.8942625
xgb.RMSE <- RMSE(validation$price, xgb.pred)
xgb.RMSE
## [1] 119420
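# Finally, the validation metrics computed above can be collected into one
# comparison table. Note that the R2 values are not all on the same scale (the
# linear model value is a training R-squared and the LASSO/ridge values are on
# the Box-Cox scale), so RMSE in dollars is the fairer side-by-side comparison.
results <- data.frame(
  Model = c("Linear Regression", "LASSO", "Ridge", "XGBoost"),
  RMSE = c(lm.RMSE, lasso.RMSE, ridge.RMSE, xgb.RMSE),
  R2 = c(lm.R2, lasso.R2, ridge.R2, xgb.R2)
)
results[order(results$RMSE), ]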