Objective

The goal of this project is to build machine learning models to predict house sale prices in King County, USA. The data come from Kaggle, where the dataset and a description of its variables can be downloaded.

library(tidyverse)
library(caret)
library(corrplot) # Pearson correlation
library(MASS)     # Box-Cox transformation
library(car)      # VIF
library(glmnet)   # LASSO and Ridge
library(xgboost)
data <- read.csv("kc_house_data.csv")
str(data)
## 'data.frame':    21613 obs. of  21 variables:
##  $ id           : num  7.13e+09 6.41e+09 5.63e+09 2.49e+09 1.95e+09 ...
##  $ date         : chr  "20141013T000000" "20141209T000000" "20150225T000000" "20141209T000000" ...
##  $ price        : num  221900 538000 180000 604000 510000 ...
##  $ bedrooms     : int  3 3 2 4 3 4 3 3 3 3 ...
##  $ bathrooms    : num  1 2.25 1 3 2 4.5 2.25 1.5 1 2.5 ...
##  $ sqft_living  : int  1180 2570 770 1960 1680 5420 1715 1060 1780 1890 ...
##  $ sqft_lot     : int  5650 7242 10000 5000 8080 101930 6819 9711 7470 6560 ...
##  $ floors       : num  1 2 1 1 1 1 2 1 1 2 ...
##  $ waterfront   : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ view         : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ condition    : int  3 3 3 5 3 3 3 3 3 3 ...
##  $ grade        : int  7 7 6 7 8 11 7 7 7 7 ...
##  $ sqft_above   : int  1180 2170 770 1050 1680 3890 1715 1060 1050 1890 ...
##  $ sqft_basement: int  0 400 0 910 0 1530 0 0 730 0 ...
##  $ yr_built     : int  1955 1951 1933 1965 1987 2001 1995 1963 1960 2003 ...
##  $ yr_renovated : int  0 1991 0 0 0 0 0 0 0 0 ...
##  $ zipcode      : int  98178 98125 98028 98136 98074 98053 98003 98198 98146 98038 ...
##  $ lat          : num  47.5 47.7 47.7 47.5 47.6 ...
##  $ long         : num  -122 -122 -122 -122 -122 ...
##  $ sqft_living15: int  1340 1690 2720 1360 1800 4760 2238 1650 1780 2390 ...
##  $ sqft_lot15   : int  5650 7639 8062 5000 7503 101930 6819 9711 8113 7570 ...
data <- data %>% dplyr::select(-c(date))
corrplot(cor(data), method="number", type="lower")

# Convert categorical variables stored as integers into factors
factorfun <- function(x) as.factor(x)
newdata <- data %>% mutate_at(c("waterfront", "view", "condition"), factorfun)

# Combine yr_built and yr_renovated into a single age variable:
# if a house was renovated, use the renovation year as its effective build year
newdata$yr_built <- ifelse(newdata$yr_renovated != 0, newdata$yr_renovated, newdata$yr_built)
newdata$age <- 2020 - newdata$yr_built

newdata <- newdata %>% dplyr::select(price, bedrooms, bathrooms, sqft_living, sqft_lot, floors, waterfront, 
                              view, condition, age, lat, long)
str(newdata)
## 'data.frame':    21613 obs. of  12 variables:
##  $ price      : num  221900 538000 180000 604000 510000 ...
##  $ bedrooms   : int  3 3 2 4 3 4 3 3 3 3 ...
##  $ bathrooms  : num  1 2.25 1 3 2 4.5 2.25 1.5 1 2.5 ...
##  $ sqft_living: int  1180 2570 770 1960 1680 5420 1715 1060 1780 1890 ...
##  $ sqft_lot   : int  5650 7242 10000 5000 8080 101930 6819 9711 7470 6560 ...
##  $ floors     : num  1 2 1 1 1 1 2 1 1 2 ...
##  $ waterfront : Factor w/ 2 levels "0","1": 1 1 1 1 1 1 1 1 1 1 ...
##  $ view       : Factor w/ 5 levels "0","1","2","3",..: 1 1 1 1 1 1 1 1 1 1 ...
##  $ condition  : Factor w/ 5 levels "1","2","3","4",..: 3 3 3 5 3 3 3 3 3 3 ...
##  $ age        : num  65 29 87 55 33 19 25 57 60 17 ...
##  $ lat        : num  47.5 47.7 47.7 47.5 47.6 ...
##  $ long       : num  -122 -122 -122 -122 -122 ...
# Remove rows with missing values (this dataset has none)
newdata <- newdata[rowSums(is.na(newdata))==0,]
# Data visualization
plot(price~sqft_living, data=newdata, main="Price vs. Sqft_living", 
     xlab="Sqft_living", ylab="Price", xlim=c(0,8000), ylim=c(0,6000000))

boxplot(price~bedrooms, data=newdata, col=(c(2:8)), main="Price vs. Bedrooms", 
        xlab="Bedrooms", ylab="Price")

boxplot(price~bathrooms, data=newdata, col=(c(2:8)), main="Price vs. Bathrooms", 
        xlab="Bathrooms", ylab="Price")

# Preprocessing: scale and center
params <- preProcess(newdata[c(-1)], method=c("center", "scale"))
newdata <- predict(params, newdata)
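
The fitted preProcess object keeps the centering means and scaling standard deviations it applies to the numeric predictors. They can be pulled out directly (an optional check, assuming caret's standard preProcess fields; not part of the original script):

# Optional: centering means and scaling SDs estimated by preProcess
params$mean
params$std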

# Train-Test Split
set.seed(123456)
index <- sample(1:nrow(newdata), 0.8*nrow(newdata)) 
train <- newdata[index,]
validation <- newdata[-index,]
# Evaluations
RMSE <- function(x, y){
      sqrt(mean((x-y)^2))
}

R2 <- function(actual, pred){
      rss <- sum((pred-actual)^2)
      tss <- sum((actual-mean(actual))^2)
      r2 <- 1 - rss/tss
      return(r2)
}
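
As a quick sanity check of these helpers (not part of the original analysis), a perfect prediction should return an RMSE of 0 and an R-squared of 1:

# Sanity check: identical actual and predicted vectors give RMSE = 0 and R2 = 1
RMSE(c(100, 200, 300), c(100, 200, 300))
R2(c(100, 200, 300), c(100, 200, 300))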

Linear Regression

# Linear Regression
lm.fit <- lm(price~., data=newdata)
# summary(lm.fit)
par(mfrow=c(1,2))
# The price distribution is heavily right-skewed, so the normality assumption fails and a transformation is needed
qqnorm(newdata$price)
qqline(newdata$price, col=2)

lm.res <- resid(lm.fit)
plot(lm.fit$fitted.values, lm.res, xlab="Fitted value", ylab="Residuals")
abline(h=0,col="red")

# Box-cox Trans
par(mfrow=c(1,1))
set.seed(123456)
bc <- boxcox(lm.fit, lambda=seq(-1,1,0.1))

best.lam <- bc$x[which(bc$y==max(bc$y))]
best.lam
## [1] -0.03030303
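
Because the selected lambda is close to zero, the transform price^lambda behaves almost like a log transform. Predictions made on the transformed scale are mapped back to dollars by raising them to the power 1/lambda; the expression exp(log(x)/best.lam) used for the back-transformation later on is algebraically the same operation. A minimal helper illustrating this (not used in the original script):

# Hypothetical helper: invert the Box-Cox power transform y_bc = y^lambda
inv_boxcox <- function(y_bc, lambda) {
  y_bc^(1/lambda)  # identical to exp(log(y_bc)/lambda) for positive values
}
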
lm.fit <- lm((price)^best.lam~., data=newdata)
par(mfrow=c(1,2))
qqnorm(newdata$price^best.lam)
qqline(newdata$price^best.lam, col=2)
lm.res <- resid(lm.fit)
plot(lm.fit$fitted.values, lm.res, xlab="Fitted value", ylab="Residuals")
abline(h=0,col="red")

par(mfrow=c(1,1))
# Apply the Box-Cox transformation to the training and validation sets
newtrain <- train
newval <- validation
newtrain$price <- (newtrain$price)^best.lam
newval$price <- (newval$price)^best.lam

# 10-fold CV
train.control <- trainControl(method="cv", number=10,
                       savePredictions="all")

Linear Regression

lm.model <- train(price~., data=newtrain, method="lm", trControl=train.control)
print(lm.model)
## Linear Regression 
## 
## 17290 samples
##    11 predictor
## 
## No pre-processing
## Resampling: Cross-Validated (10 fold) 
## Summary of sample sizes: 15562, 15560, 15560, 15561, 15562, 15562, ... 
## Resampling results:
## 
##   RMSE         Rsquared   MAE        
##   0.005860284  0.7014231  0.004494142
## 
## Tuning parameter 'intercept' was held constant at a value of TRUE
summary(lm.model)
## 
## Call:
## lm(formula = .outcome ~ ., data = dat)
## 
## Residuals:
##       Min        1Q    Median        3Q       Max 
## -0.029777 -0.003559  0.000002  0.003621  0.063396 
## 
## Coefficients:
##               Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  6.801e-01  1.174e-03 579.146  < 2e-16 ***
## bedrooms     5.713e-04  5.590e-05  10.220  < 2e-16 ***
## bathrooms   -1.169e-03  8.052e-05 -14.520  < 2e-16 ***
## sqft_living -6.094e-03  7.609e-05 -80.091  < 2e-16 ***
## sqft_lot    -2.849e-04  4.688e-05  -6.076 1.26e-09 ***
## floors      -1.083e-03  5.549e-05 -19.512  < 2e-16 ***
## waterfront1 -7.543e-03  6.484e-04 -11.633  < 2e-16 ***
## view1       -4.720e-03  3.689e-04 -12.795  < 2e-16 ***
## view2       -3.567e-03  2.224e-04 -16.042  < 2e-16 ***
## view3       -5.031e-03  3.025e-04 -16.634  < 2e-16 ***
## view4       -6.089e-03  4.826e-04 -12.618  < 2e-16 ***
## condition2  -2.005e-03  1.271e-03  -1.578 0.114695    
## condition3  -5.515e-03  1.176e-03  -4.689 2.77e-06 ***
## condition4  -6.983e-03  1.176e-03  -5.937 2.97e-09 ***
## condition5  -8.040e-03  1.184e-03  -6.792 1.14e-11 ***
## age         -7.696e-04  6.544e-05 -11.761  < 2e-16 ***
## lat         -4.365e-03  4.630e-05 -94.288  < 2e-16 ***
## long         1.955e-04  5.076e-05   3.853 0.000117 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.005855 on 17272 degrees of freedom
## Multiple R-squared:  0.7021, Adjusted R-squared:  0.7018 
## F-statistic:  2395 on 17 and 17272 DF,  p-value: < 2.2e-16
lm.fit <- lm(price~., data=newtrain)
vif(lm.fit)
##                 GVIF Df GVIF^(1/(2*Df))
## bedrooms    1.617188  1        1.271687
## bathrooms   3.296910  1        1.815740
## sqft_living 2.946622  1        1.716573
## sqft_lot    1.105054  1        1.051216
## floors      1.561581  1        1.249633
## waterfront  1.618351  1        1.272144
## view        1.827844  4        1.078307
## condition   1.330435  4        1.036333
## age         2.176354  1        1.475247
## lat         1.078977  1        1.038738
## long        1.303349  1        1.141643
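
All of the scaled GVIF values above sit close to 1, well below the usual warning level of about 2 (roughly equivalent to a VIF of 4-5 for single-df terms), so multicollinearity does not appear to be a serious issue. A one-line check of the largest value (optional, not in the original script):

# Largest scaled GVIF across predictors; values near 1 indicate little collinearity
max(vif(lm.fit)[, "GVIF^(1/(2*Df))"])
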
# Linear-model predictions on the hold-out set (Box-Cox scale)
lm.pred <- predict(lm.fit, newdata=newval[c(-1)])

# Back-transform the hold-out prices to the original dollar scale
y <- exp(log(newval$price)/(best.lam))
# Back-transform the predictions to the original dollar scale
yhat.lm <- as.numeric(exp(log(lm.pred)/(best.lam)))

lm.R2 <- summary(lm.fit)$r.squared
lm.R2
## [1] 0.7021131
lm.RMSE <- RMSE(y, yhat.lm)
lm.RMSE
## [1] 238338.4

LASSO and Ridge

X <- model.matrix(price~., newtrain)[,-1]
Y <- newtrain$price
X.test <- model.matrix(price~., newval)[,-1]

lasso <- glmnet(X, Y, alpha=1)
# Coefficient 3 (sqft_living) is the slowest to shrink to zero; coefficient 16 (lat) is the second slowest
plot(lasso, xvar="lambda", label=TRUE)
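
The numeric labels on the coefficient-path plot refer to the column order of the model matrix; mapping them back to names confirms the comment above (an optional check, not in the original script):

# Columns 3 and 16 of the design matrix: should print "sqft_living" and "lat"
colnames(X)[c(3, 16)]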

lasso.cv <- cv.glmnet(X, Y, type.measure="mse", alpha=1)
plot(lasso.cv)

lambda_min_lasso <- lasso.cv$lambda.min
lambda_min_lasso
## [1] 1.455347e-05
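
lambda.min is the penalty that minimizes the cross-validated MSE. A common, slightly more regularized alternative (not used here) is the largest lambda whose error is within one standard error of that minimum:

# Optional: the more conservative one-standard-error choice of penalty
lasso.cv$lambda.1se
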
lasso.fit <- glmnet(X, Y, alpha=1, lambda=lambda_min_lasso)
coef(lasso.fit)
## 18 x 1 sparse Matrix of class "dgCMatrix"
##                        s0
## (Intercept)  0.6745520873
## bedrooms     0.0005317245
## bathrooms   -0.0011440725
## sqft_living -0.0060852764
## sqft_lot    -0.0002667097
## floors      -0.0010588593
## waterfront1 -0.0074725807
## view1       -0.0045969511
## view2       -0.0035213889
## view3       -0.0049699748
## view4       -0.0059904676
## condition2   0.0033339876
## condition3   .           
## condition4  -0.0014488235
## condition5  -0.0024996432
## age         -0.0007290651
## lat         -0.0043585236
## long         0.0001871925
lasso.pred <- as.numeric(predict(lasso.fit, newx=X.test))

lasso.R2 <- R2(newval$price, lasso.pred)
lasso.R2
## [1] 0.7046674
# Back-transform the LASSO predictions to the original dollar scale
yhat.lasso <- as.numeric(exp(log(lasso.pred)/(best.lam)))
lasso.RMSE <- RMSE(y, yhat.lasso)
lasso.RMSE
## [1] 236769.2
ridge <- glmnet(X, Y, alpha=0)
plot(ridge, xvar="lambda", label=TRUE)

ridge.fit <- cv.glmnet(X, Y, type.measure="mse", alpha=0)
lambda_min_ridge <- ridge.fit$lambda.min
lambda_min_ridge
## [1] 0.0007413738
ridge.pred <- as.numeric(predict(ridge.fit, s=lambda_min_ridge, newx=X.test))

ridge.R2 <- R2(newval$price, ridge.pred)
ridge.R2
## [1] 0.7016186
# Back-transform the Ridge predictions to the original dollar scale
yhat.ridge <- as.numeric(exp(log(ridge.pred)/(best.lam)))
ridge.RMSE <- RMSE(y, yhat.ridge)
ridge.RMSE
## [1] 221165
# One-hot encoding
dmy <- dummyVars(" ~ .", data=train)
trainOneHot <- data.frame(predict(dmy, newdata=train))
xx <- as.matrix(dplyr::select(trainOneHot, -price))
yy <- trainOneHot$price
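
The x/y interface used for the tree below needs an all-numeric design, which is why the factor variables are expanded into dummy columns first. A quick look at the expanded matrix (optional, not part of the original script):

# The dummy-encoded design matrix has one numeric column per factor level
dim(xx)
colnames(xx)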

Regression Tree

train_control <- trainControl(
  method="cv",
  number=10,
  savePredictions="all"
)

set.seed(123456)
# tuneLength sets how many candidate cp values caret evaluates
regTree <- train(x=xx, y=yy, trControl=train_control, method="rpart", 
                 tuneLength=10)
print(regTree)
## CART 
## 
## 17290 samples
##    20 predictor
## 
## No pre-processing
## Resampling: Cross-Validated (10 fold) 
## Summary of sample sizes: 15562, 15560, 15561, 15560, 15562, 15560, ... 
## Resampling results across tuning parameters:
## 
##   cp          RMSE      Rsquared   MAE     
##   0.01341019  224030.2  0.6268869  136550.6
##   0.01442151  226002.1  0.6198600  139024.2
##   0.01551795  231596.1  0.6021456  142846.0
##   0.02359081  239975.1  0.5720958  147953.3
##   0.02801113  242928.6  0.5617225  149642.2
##   0.03248538  250590.2  0.5321393  152205.9
##   0.05987884  260603.7  0.4948882  158774.1
##   0.06608682  276285.9  0.4312377  168165.4
##   0.09080874  289402.1  0.3747255  184382.2
##   0.31208437  333359.4  0.2888034  216409.3
## 
## RMSE was used to select the optimal model using the smallest value.
## The final value used for the model was cp = 0.01341019.
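
The resampling results above are computed on the training folds only. As a sketch of how the tuned tree could be scored on the same hold-out set used for the other models (this step is not in the original analysis; it reuses the dummyVars encoder dmy and the R2/RMSE helpers defined earlier):

# Hypothetical hold-out evaluation of the tuned regression tree
valOneHot <- data.frame(predict(dmy, newdata=validation))
tree.pred <- predict(regTree, newdata=dplyr::select(valOneHot, -price))
R2(validation$price, tree.pred)
RMSE(validation$price, tree.pred)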

XGBoost

# Default Hyperparameters
grid_default <- expand.grid(
  nrounds = 100,
  max_depth = 6,
  eta = 0.3,
  gamma = 0,
  colsample_bytree = 1,
  min_child_weight = 1,
  subsample = 1
)

train_control <- trainControl(
  method = "cv",
  verboseIter = FALSE, 
  allowParallel = TRUE,
  number=10,
  savePredictions="all"
)

set.seed(123456)
xgb_base <- train(
      price~., 
      data=train,
      trControl = train_control,
      tuneGrid = grid_default,
      method = "xgbTree",
      verbose = TRUE
)
## [23:14:56] WARNING: amalgamation/../src/objective/regression_obj.cu:174: reg:linear is now deprecated in favor of reg:squarederror.
## (the same warning is repeated for each resampling fold)
print(xgb_base)
## eXtreme Gradient Boosting 
## 
## 17290 samples
##    11 predictor
## 
## No pre-processing
## Resampling: Cross-Validated (10 fold) 
## Summary of sample sizes: 15562, 15560, 15561, 15560, 15562, 15560, ... 
## Resampling results:
## 
##   RMSE      Rsquared   MAE     
##   140488.3  0.8544111  74309.33
## 
## Tuning parameter 'nrounds' was held constant at a value of 100
## Tuning
##  held constant at a value of 1
## Tuning parameter 'subsample' was held
##  constant at a value of 1
xgb.pred <- predict(xgb_base, newdata=validation)

xgb.R2 <- R2(validation$price, xgb.pred)
xgb.R2
## [1] 0.8942625
xgb.RMSE <- RMSE(validation$price, xgb.pred)
xgb.RMSE
## [1] 119420
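
Finally, a short summary (not part of the original write-up) collecting the hold-out RMSEs computed above. The R-squared values reported earlier are not directly comparable (the linear model's is a training R-squared and the penalized models' are on the Box-Cox scale), so RMSE in dollars is the fairer basis for comparison:

# Side-by-side comparison of hold-out RMSE on the original price scale
results <- data.frame(
  model = c("Linear regression", "LASSO", "Ridge", "XGBoost"),
  RMSE  = c(lm.RMSE, lasso.RMSE, ridge.RMSE, xgb.RMSE)
)
results[order(results$RMSE), ]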