Read in data

# load packages
library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr     1.1.1     ✔ readr     2.1.4
## ✔ forcats   1.0.0     ✔ stringr   1.5.0
## ✔ ggplot2   3.4.2     ✔ tibble    3.2.1
## ✔ lubridate 1.9.3     ✔ tidyr     1.3.0
## ✔ purrr     1.0.1     
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
# read in data
data=read.csv("C:\\Users\\hed2\\Downloads\\statistics in biomedical\\week3\\HousePrices.csv")
summary(data)
##      date               price             bedrooms       bathrooms    
##  Length:4600        Min.   :       0   Min.   :0.000   Min.   :0.000  
##  Class :character   1st Qu.:  322875   1st Qu.:3.000   1st Qu.:1.750  
##  Mode  :character   Median :  460943   Median :3.000   Median :2.250  
##                     Mean   :  551963   Mean   :3.401   Mean   :2.161  
##                     3rd Qu.:  654962   3rd Qu.:4.000   3rd Qu.:2.500  
##                     Max.   :26590000   Max.   :9.000   Max.   :8.000  
##   sqft_living       sqft_lot           floors        waterfront      
##  Min.   :  370   Min.   :    638   Min.   :1.000   Min.   :0.000000  
##  1st Qu.: 1460   1st Qu.:   5001   1st Qu.:1.000   1st Qu.:0.000000  
##  Median : 1980   Median :   7683   Median :1.500   Median :0.000000  
##  Mean   : 2139   Mean   :  14852   Mean   :1.512   Mean   :0.007174  
##  3rd Qu.: 2620   3rd Qu.:  11001   3rd Qu.:2.000   3rd Qu.:0.000000  
##  Max.   :13540   Max.   :1074218   Max.   :3.500   Max.   :1.000000  
##       view          condition       sqft_above   sqft_basement   
##  Min.   :0.0000   Min.   :1.000   Min.   : 370   Min.   :   0.0  
##  1st Qu.:0.0000   1st Qu.:3.000   1st Qu.:1190   1st Qu.:   0.0  
##  Median :0.0000   Median :3.000   Median :1590   Median :   0.0  
##  Mean   :0.2407   Mean   :3.452   Mean   :1827   Mean   : 312.1  
##  3rd Qu.:0.0000   3rd Qu.:4.000   3rd Qu.:2300   3rd Qu.: 610.0  
##  Max.   :4.0000   Max.   :5.000   Max.   :9410   Max.   :4820.0  
##     yr_built     yr_renovated       street              city          
##  Min.   :1900   Min.   :   0.0   Length:4600        Length:4600       
##  1st Qu.:1951   1st Qu.:   0.0   Class :character   Class :character  
##  Median :1976   Median :   0.0   Mode  :character   Mode  :character  
##  Mean   :1971   Mean   : 808.6                                        
##  3rd Qu.:1997   3rd Qu.:1999.0                                        
##  Max.   :2014   Max.   :2014.0                                        
##    statezip           country         
##  Length:4600        Length:4600       
##  Class :character   Class :character  
##  Mode  :character   Mode  :character  
##                                       
##                                       
## 

Check missing data

# check for missing values
library(mice)
## 
## Attaching package: 'mice'
## The following object is masked from 'package:stats':
## 
##     filter
## The following objects are masked from 'package:base':
## 
##     cbind, rbind
md.pattern(data)
##  /\     /\
## {  `---'  }
## {  O   O  }
## ==>  V <==  No need for mice. This data set is completely observed.
##  \  \|/  /
##   `-----'

##      date price bedrooms bathrooms sqft_living sqft_lot floors waterfront view
## 4600    1     1        1         1           1        1      1          1    1
##         0     0        0         0           0        0      0          0    0
##      condition sqft_above sqft_basement yr_built yr_renovated street city
## 4600         1          1             1        1            1      1    1
##              0          0             0        0            0      0    0
##      statezip country  
## 4600        1       1 0
##             0       0 0

Data manipulation

Transform variables

# transform date from string to Date, then extract month and week
# (lubridate is already attached by tidyverse 2.0, see the startup message above)
data$date_time=as.Date(data$date, "%Y-%m-%d")
data$month=lubridate::month(data$date_time)
data$week=lubridate::week(data$date_time)

# encode categorical strings as integer codes
data$statezip_cat = as.integer(factor(data$statezip, levels = unique(data$statezip)))
data$city_cat = as.integer(factor(data$city, levels = unique(data$city)))

# strip house numbers and ordinal suffixes so streets collapse to street-name categories
data$street_cat=gsub('[0-9]+', '', data$street)
data$street_cat=gsub('th ', '', data$street_cat)
data$street_cat=gsub('st ', '', data$street_cat)
data$street_cat=gsub('rd ', '', data$street_cat)
data$street_cat=gsub('nd ', '', data$street_cat)
data$street_cat=gsub('- ', '', data$street_cat)
data$street_cat=gsub('-/ ', '', data$street_cat)
data$street_cat = as.integer(factor(data$street_cat))
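
To make the cleaning concrete, here is what those substitutions do to one made-up address (hypothetical, not a row from the data):

# hypothetical address, only to illustrate the cleaning steps above
addr <- "18810 124th Ave NE"
addr <- gsub('[0-9]+', '', addr)  # strip the house number and digits: " th Ave NE"
addr <- gsub('th ', '', addr)     # strip the leftover ordinal suffix: " Ave NE"

After every address is reduced this way, factor() assigns one integer code per remaining street name.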

# examine distribution 
library(DataExplorer)
plot_histogram(data)

Log transformation

# log-transform skewed variables toward approximate normality
# (price has a minimum of 0, so price_log contains -Inf values;
#  those rows are dropped later by the outlier filter)
data$price_log=log(data$price)
data$sqft_lot_log=log(data$sqft_lot)

data$sqft_basement[data$sqft_basement==0]=0.001 # replace 0 with 0.001 to avoid log(0) = -Inf
data$sqft_basement_log=log(data$sqft_basement)
data$sqft_above_log=log(data$sqft_above)
data$sqft_living_log=log(data$sqft_living)
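
For reference, a gentler alternative to the 0.001 substitution is log1p(), which maps 0 to 0 instead of to log(0.001) ≈ -6.9; it is not used here, so the outputs below reflect the substitution above.

# alternative (not used in this analysis): log1p(x) = log(1 + x) keeps zeros at 0
# data$sqft_basement_log = log1p(data$sqft_basement)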

# compute derived age variables, then log-transform them
data$house_years=2014-data$yr_built
data$house_reno_years=ifelse(data$yr_renovated==0,data$house_years,2014-data$yr_renovated)
# houses built (or renovated) in 2014 give log(0) = -Inf; those rows are dropped later by the outlier filter
data$house_years_log=log(data$house_years)
data$house_reno_years_log=log(data$house_reno_years)
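
As a quick worked check of the ifelse() logic (hypothetical years, not from the data): a house built in 1960 and renovated in 2000 gets house_years = 54 but house_reno_years = 14, while an unrenovated 1960 house keeps 54 for both.

# hypothetical check of the derivation above
yr_b <- c(1960, 1960); yr_r <- c(2000, 0)
2014 - yr_b                                  # house_years: 54 54
ifelse(yr_r == 0, 2014 - yr_b, 2014 - yr_r)  # house_reno_years: 14 54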

# select variables for analysis
data_new = data %>% select(c(
  "bedrooms", "bathrooms", "floors", "view", "waterfront", "condition",
  "statezip_cat", "city_cat", "street_cat",
  "price_log", "sqft_lot_log", "sqft_basement_log", "sqft_above_log",
  "house_years_log", "house_reno_years_log", "sqft_living_log",
  "month", "week"
))

Examine outliers

# examine outliers (categorical variables excluded)
boxplot(data_new[,!names(data_new) %in% c("statezip_cat", "city_cat", "street_cat")])

# remove outliers using the 1.5 * IQR rule
outliers <- function(x) {
  # flag values beyond the 1.5 * IQR fences
  Q1 <- quantile(x, probs=.25)
  Q3 <- quantile(x, probs=.75)
  iqr = Q3-Q1
  upper_limit = Q3 + (iqr*1.5)
  lower_limit = Q1 - (iqr*1.5)
  x > upper_limit | x < lower_limit
}
remove_outliers <- function(df, cols = names(df)) {
  # drop every row flagged as an outlier in any of the given columns
  for (col in cols) {
    df <- df[!outliers(df[[col]]),]
  }
  df
}
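
A toy check of the 1.5 * IQR rule (hypothetical vector): for c(1:10, 100), Q1 = 3.5 and Q3 = 8.5, so the fences are 3.5 - 7.5 = -4 and 8.5 + 7.5 = 16, and only the value 100 is flagged.

# toy example of the fences computed by outliers()
outliers(c(1:10, 100))  # FALSE for 1..10, TRUE for 100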
data_no_outlier=remove_outliers(data_new,names(data_new))
nrow(data_new) #4600 before removing outliers
## [1] 4600
nrow(data_no_outlier) #3085
## [1] 3085
# optional: to test the effect of sample size
# library(caTools)
# split <- sample.split(data_no_outlier, SplitRatio = 1)
# train_cl <- subset(data_no_outlier, split == "TRUE")
# nrow(train_cl)


# check outliers and distributions again (categorical variables excluded)
boxplot(data_no_outlier[,!names(data_no_outlier) %in% c("statezip_cat", "city_cat", "street_cat")])

plot_histogram(data_no_outlier )

Exploratory analysis with plots

# drop near-constant variables (view and waterfront are almost all zeros)
data_no_outlier=data_no_outlier %>% select(-c("view", "waterfront"))

# plot features
library(psych)
## 
## Attaching package: 'psych'
## The following objects are masked from 'package:ggplot2':
## 
##     %+%, alpha
pairs.panels(data_no_outlier[c(1:10)])

pairs.panels(data_no_outlier[c(10:16)])

Convert variables to factors

# convert categorical variables to factors
factor_cols <- c("bedrooms", "bathrooms", "floors", "condition", "statezip_cat",
                 "city_cat", "street_cat", "month", "week")
data_no_outlier[factor_cols] <- lapply(data_no_outlier[factor_cols], as.factor)
str(data_no_outlier)
## 'data.frame':    3085 obs. of  16 variables:
##  $ bedrooms            : Factor w/ 4 levels "2","3","4","5": 2 2 2 3 1 1 3 2 3 2 ...
##  $ bathrooms           : Factor w/ 11 levels "0.75","1","1.5",..: 3 5 6 7 2 5 5 4 7 4 ...
##  $ floors              : Factor w/ 5 levels "1","1.5","2",..: 2 1 1 1 1 1 2 1 2 1 ...
##  $ condition           : Factor w/ 4 levels "2","3","4","5": 2 3 3 3 2 2 2 2 4 2 ...
##  $ statezip_cat        : Factor w/ 70 levels "1","2","3","4",..: 1 3 4 5 6 5 6 9 10 11 ...
##  $ city_cat            : Factor w/ 34 levels "1","2","3","4",..: 1 3 4 5 2 5 2 8 2 9 ...
##  $ street_cat          : Factor w/ 551 levels "1","2","5","6",..: 70 472 271 18 240 18 18 481 469 271 ...
##  $ price_log           : num  12.7 12.7 12.9 13.2 13.1 ...
##  $ sqft_lot_log        : num  8.98 9.39 8.99 9.26 8.76 ...
##  $ sqft_basement_log   : num  -6.91 -6.91 6.91 6.68 -6.91 ...
##  $ sqft_above_log      : num  7.2 7.57 6.91 7.04 6.78 ...
##  $ house_years_log     : num  4.08 3.87 3.93 3.64 4.33 ...
##  $ house_reno_years_log: num  2.2 3.87 3.93 3.09 3 ...
##  $ sqft_living_log     : num  7.2 7.57 7.6 7.57 6.78 ...
##  $ month               : Factor w/ 3 levels "5","6","7": 1 1 1 1 1 1 1 1 1 1 ...
##  $ week                : Factor w/ 11 levels "18","19","20",..: 1 1 1 1 1 1 1 1 1 1 ...

Modeling - part 1: regression analysis

Scale features

################ scale features and split
library(caret)
## Loading required package: lattice
## 
## Attaching package: 'caret'
## The following object is masked from 'package:purrr':
## 
##     lift
# standardize the continuous predictors (the outcome and categorical columns are excluded)
x_scale <- scale(data_no_outlier[,!names(data_no_outlier) %in%
                                   c("price_log", "bedrooms", "bathrooms", "floors", "condition",
                                     "statezip_cat", "city_cat", "street_cat", "month", "week")])

# one hot encoding, create the dummy variables
dummies_model <- dummyVars( ~ bedrooms+bathrooms +floors + condition+
                            statezip_cat+city_cat+ street_cat+ month+ week, data=data_no_outlier)
x_hot <- predict(dummies_model, newdata = data_no_outlier)
data_s_hot <- data.frame(x_scale,x_hot,price_log=data_no_outlier$price_log )
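
dummyVars() expands each factor into one 0/1 indicator column per level; a minimal sketch on a toy factor (hypothetical data) shows the shape of the encoding:

# toy illustration of caret::dummyVars one-hot encoding
toy <- data.frame(floors = factor(c("1", "1.5", "2")))
predict(dummyVars(~ floors, data = toy), newdata = toy)
# expected: a 3 x 3 matrix of 0/1 indicators, one column per factor level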

Split data

# Split the data into training and test set
set.seed(12356)
training.samples <- data_s_hot$price_log %>%
  createDataPartition(p = 0.8, list = FALSE)
train.data  <- data_s_hot[training.samples, ]
test.data <- data_s_hot[-training.samples, ]

Elastic net regression

################## using the caret package
# Elastic net regression 
# Set up a grid of lambda values (used by the lasso and ridge models below)
lambda <- 10^seq(-3, 3, length = 100)
# Build the model
elastic <- train(
  price_log ~ . + poly(sqft_lot_log, degree=3) + poly(sqft_basement_log, degree=3)
  + poly(sqft_living_log, degree=3) + poly(sqft_above_log, degree=3)
  + poly(house_years_log, degree=3) + poly(house_reno_years_log, degree=3),
  data = train.data, method = "glmnet",
  trControl = trainControl("cv", number = 10),
  tuneLength = 10
)
# Model coefficients
# coef(elastic$finalModel, elastic$bestTune$lambda)
# Make predictions
predictions <- elastic %>% predict(test.data)
# Model prediction performance
elastic_rsqu=data.frame(
  RMSE = RMSE(predictions, test.data$price_log),
  Rsquare = R2(predictions, test.data$price_log)
)
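
For reference, glmnet (which caret calls here) minimizes a penalized least-squares objective; tuneLength = 10 lets caret search over both the mixing parameter alpha (alpha = 1 is the lasso, alpha = 0 is ridge) and the penalty strength lambda:

$$\min_{\beta_0,\,\beta}\ \frac{1}{2n}\sum_{i=1}^{n}\big(y_i-\beta_0-x_i^{\top}\beta\big)^2+\lambda\Big(\frac{1-\alpha}{2}\lVert\beta\rVert_2^2+\alpha\lVert\beta\rVert_1\Big)$$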

Lasso regression

##############
# lasso regression
# Build the model
lasso <- train(
  price_log ~ . + poly(sqft_lot_log, degree=3) + poly(sqft_basement_log, degree=3)
  + poly(sqft_living_log, degree=3) + poly(sqft_above_log, degree=3)
  + poly(house_years_log, degree=3) + poly(house_reno_years_log, degree=3),
  data = train.data, method = "glmnet",
  trControl = trainControl("cv", number = 10),
  tuneGrid = expand.grid(alpha = 1, lambda = lambda)
)
## Warning in nominalTrainWorkflow(x = x, y = y, wts = weights, info = trainInfo,
## : There were missing values in resampled performance measures.
# Model coefficients
# lasso_coef=coef(lasso$finalModel, lasso$bestTune$lambda)
# Make predictions
predictions <- lasso %>% predict(test.data)
# Model prediction performance
lasso_rsqu=data.frame(
  RMSE = RMSE(predictions, test.data$price_log),
  Rsquare = R2(predictions, test.data$price_log)
)

Ridge regression

################
# ridge regression
# Build the model
ridge <- train(
  price_log ~ . + poly(sqft_lot_log, degree=3) + poly(sqft_basement_log, degree=3)
  + poly(sqft_living_log, degree=3) + poly(sqft_above_log, degree=3)
  + poly(house_years_log, degree=3) + poly(house_reno_years_log, degree=3),
  data = train.data, method = "glmnet",
  trControl = trainControl("cv", number = 10),
  tuneGrid = expand.grid(alpha = 0, lambda = lambda)
)
## Warning in nominalTrainWorkflow(x = x, y = y, wts = weights, info = trainInfo,
## : There were missing values in resampled performance measures.
# Model coefficients
# coef(ridge$finalModel, ridge$bestTune$lambda)
# Make predictions
predictions <- ridge %>% predict(test.data)
# Model prediction performance
ridge_rsqu=data.frame(
  RMSE = RMSE(predictions, test.data$price_log),
  Rsquare = R2(predictions, test.data$price_log)
)

Linear regression

################
# linear regression
# Build the model
linear <- train(
  price_log ~ . + poly(sqft_lot_log, degree=3) + poly(sqft_basement_log, degree=3)
  + poly(sqft_living_log, degree=3) + poly(sqft_above_log, degree=3)
  + poly(house_years_log, degree=3) + poly(house_reno_years_log, degree=3),
  data = train.data, method = "lm",
  trControl = trainControl("cv", number = 10))
# Model coefficients
# coef(linear$finalModel) 
# Make predictions
predictions <- linear %>% predict(test.data)
# Model prediction performance
linear_rsqu=data.frame(
  RMSE = RMSE(predictions, test.data$price_log),
  Rsquare = R2(predictions, test.data$price_log)
)

Compare models

###############
# compare models performance
# training data
models <- list( linear=linear,ridge=ridge, lasso = lasso, elastic = elastic)
resamples(models) %>% summary(metric = "Rsquared")
## 
## Call:
## summary.resamples(object = ., metric = "Rsquared")
## 
## Models: linear, ridge, lasso, elastic 
## Number of resamples: 10 
## 
## Rsquared 
##              Min.   1st Qu.    Median      Mean   3rd Qu.      Max. NA's
## linear  0.7344266 0.7641324 0.7819561 0.7843697 0.8094866 0.8265475    0
## ridge   0.7641145 0.8038247 0.8297073 0.8247490 0.8531208 0.8681333    0
## lasso   0.7734900 0.8290738 0.8408816 0.8368813 0.8516262 0.8666030    0
## elastic 0.8112687 0.8302773 0.8405050 0.8419475 0.8532001 0.8672771    0
# test data
linear_rsqu
##        RMSE   Rsquare
## 1 0.2037262 0.8143295
ridge_rsqu
##        RMSE   Rsquare
## 1 0.1828943 0.8420651
lasso_rsqu
##        RMSE  Rsquare
## 1 0.1822103 0.843335
elastic_rsqu # best test-set performance
##        RMSE  Rsquare
## 1 0.1811502 0.845032

Modeling - part 2: machine learning

Multivariate Adaptive Regression Splines (MARS)

# Train a MARS model (method 'earth') on the training data
model_mars = train(price_log ~ ., data=train.data, method='earth')

# calculate variable importance
# library(ggplot2)
varimp_mars <- varImp(model_mars)
plot(varimp_mars, main="Variable Importance with MARS")

# Predict on the test data (earth returns a one-column matrix; flatten it to a vector)
predicted <- as.vector(predict(model_mars, test.data))

# Model prediction performance
MARS_rsqu=data.frame(
  RMSE = RMSE(predicted, test.data$price_log),
  Rsquare = R2(predicted, test.data$price_log)
)
#        RMSE   Rsquare
# 1 0.2017003 0.8083039

Random forest

# random forest (regression, so class probabilities do not apply)
# fitControl must exist before this call; it is defined here and reused by the models below
fitControl <- trainControl(method = 'cv', number = 5, savePredictions = 'final')
model_rf = train(price_log ~ ., data=train.data, method='rf', tuneLength=5, trControl = fitControl)

# Predict on testData
predicted <- predict(model_rf, test.data)

# Model prediction performance
rf_rsqu=data.frame(
  RMSE = RMSE(predicted, test.data$price_log),
  Rsquare = R2(predicted, test.data$price_log)
)
# RMSE   Rsquare
# 1 0.210986 0.7915845

SVM

# Train the model using SVM
model_svmRadial = train(price_log ~ ., data=train.data, method='svmRadial', tuneLength=15, trControl = fitControl)

# Predict on testData
predicted <- predict(model_svmRadial, test.data)

# Model prediction performance
svm_rsqu=data.frame(
  RMSE = RMSE(predicted, test.data$price_log),
  Rsquare = R2(predicted, test.data$price_log)
)
# RMSE   Rsquare
# 1 0.1940989 0.8223975

KNN

# Train the model using KNN
model_knn = train(price_log ~ ., data=train.data, method='knn', tuneLength=15, trControl = fitControl)

# Predict on testData
predicted <- predict(model_knn, test.data)

# Model prediction performance
knn_rsqu=data.frame(
  RMSE = RMSE(predicted, test.data$price_log),
  Rsquare = R2(predicted, test.data$price_log)
)
knn_rsqu
#        RMSE   Rsquare
# 1 0.2805626 0.6335133

Tuning hyperparameters to optimize the model

# The training control: 5-fold cross-validation (the same object defined before the random forest model)
fitControl <- trainControl(
  method = 'cv',              # k-fold cross-validation
  number = 5,                 # number of folds
  savePredictions = 'final'   # save predictions for the optimal tuning parameter
)

# Step 1: Define the tuneGrid (values of k, the number of neighbors)
knnGrid <- expand.grid(k = c(2, 4, 6, 8, 10))

# Step 2: Tune hyperparameters by setting tuneGrid
model_knn3 = train(price_log ~ ., data=train.data, method='knn', metric='RMSE', tuneGrid = knnGrid, trControl = fitControl)

# Step 3: Predict on the test data and compute performance
predicted3 <- predict(model_knn3, test.data)

# Model prediction performance
knn_tune_rsqu=data.frame(
  RMSE = RMSE(predicted3, test.data$price_log),
  Rsquare = R2(predicted3, test.data$price_log)
)
knn_tune_rsqu
# RMSE   Rsquare
# 1 0.2775675 0.6442353

Ensemble predictions from multiple models

library(caretEnsemble)

# Stacking algorithms - run multiple algorithms in one call
# (named ensControl to avoid shadowing caret's trainControl() function)
ensControl <- trainControl(method="repeatedcv", 
                           number=10, 
                           repeats=3,
                           savePredictions=TRUE)

algorithmList <- c('rf', 'knn', 'svmRadial')
  # e.g. c('rf', 'adaboost', 'earth', 'knn', 'svmRadial')

models_ens <- caretList(price_log ~ ., data=train.data, trControl=ensControl, methodList=algorithmList)
results <- resamples(models_ens)
summary(results)
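
Stacking tends to help most when the base models' errors are not strongly correlated; caret's modelCor() on the resamples object gives a quick check:

# pairwise correlation of the base models' resampled performance;
# lower correlation suggests the stack has more to gain
modelCor(results)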

# comparison by visualization
# Box plots to compare models
scales <- list(x=list(relation="free"), y=list(relation="free"))
bwplot(results, scales=scales)

# Create the trainControl
stackControl <- trainControl(method="repeatedcv", 
                             number=10, 
                             repeats=3,
                             savePredictions=TRUE )

# Ensemble the predictions of models_ens into a new combined prediction via glm
stack.glm <- caretStack(models_ens, method="glm", metric="RMSE", trControl=stackControl)

# Predict on testData
stack_predicteds <- predict(stack.glm, newdata=test.data)

# Model prediction performance
ensem_rsqu=data.frame(
  RMSE = RMSE(stack_predicteds, test.data$price_log),
  Rsquare = R2(stack_predicteds, test.data$price_log)
)
# RMSE   Rsquare
# 1 0.2788662 0.6335133
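
Conceptually, caretStack fits a meta-model on the base learners' predictions. A minimal manual sketch of that structure is below (illustrative only: caretStack itself trains the meta-model on the saved out-of-fold predictions, which avoids the optimism of refitting on in-sample predictions as this sketch does):

# illustrative sketch of the stacking idea, not caretStack's internal code
base_preds <- data.frame(
  rf        = predict(models_ens$rf,        newdata = train.data),
  knn       = predict(models_ens$knn,       newdata = train.data),
  svmRadial = predict(models_ens$svmRadial, newdata = train.data),
  price_log = train.data$price_log
)
meta <- glm(price_log ~ rf + knn + svmRadial, data = base_preds)  # linear meta-model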

Summary