Read in data

# load packages
library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr     1.1.1     ✔ readr     2.1.4
## ✔ forcats   1.0.0     ✔ stringr   1.5.0
## ✔ ggplot2   3.4.2     ✔ tibble    3.2.1
## ✔ lubridate 1.9.3     ✔ tidyr     1.3.0
## ✔ purrr     1.0.1     
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
# read in data
data=read.csv("C:\\Users\\hed2\\Downloads\\statistics in biomedical\\week3\\HousePrices.csv")
summary(data)
##      date               price             bedrooms       bathrooms    
##  Length:4600        Min.   :       0   Min.   :0.000   Min.   :0.000  
##  Class :character   1st Qu.:  322875   1st Qu.:3.000   1st Qu.:1.750  
##  Mode  :character   Median :  460943   Median :3.000   Median :2.250  
##                     Mean   :  551963   Mean   :3.401   Mean   :2.161  
##                     3rd Qu.:  654962   3rd Qu.:4.000   3rd Qu.:2.500  
##                     Max.   :26590000   Max.   :9.000   Max.   :8.000  
##   sqft_living       sqft_lot           floors        waterfront      
##  Min.   :  370   Min.   :    638   Min.   :1.000   Min.   :0.000000  
##  1st Qu.: 1460   1st Qu.:   5001   1st Qu.:1.000   1st Qu.:0.000000  
##  Median : 1980   Median :   7683   Median :1.500   Median :0.000000  
##  Mean   : 2139   Mean   :  14852   Mean   :1.512   Mean   :0.007174  
##  3rd Qu.: 2620   3rd Qu.:  11001   3rd Qu.:2.000   3rd Qu.:0.000000  
##  Max.   :13540   Max.   :1074218   Max.   :3.500   Max.   :1.000000  
##       view          condition       sqft_above   sqft_basement   
##  Min.   :0.0000   Min.   :1.000   Min.   : 370   Min.   :   0.0  
##  1st Qu.:0.0000   1st Qu.:3.000   1st Qu.:1190   1st Qu.:   0.0  
##  Median :0.0000   Median :3.000   Median :1590   Median :   0.0  
##  Mean   :0.2407   Mean   :3.452   Mean   :1827   Mean   : 312.1  
##  3rd Qu.:0.0000   3rd Qu.:4.000   3rd Qu.:2300   3rd Qu.: 610.0  
##  Max.   :4.0000   Max.   :5.000   Max.   :9410   Max.   :4820.0  
##     yr_built     yr_renovated       street              city          
##  Min.   :1900   Min.   :   0.0   Length:4600        Length:4600       
##  1st Qu.:1951   1st Qu.:   0.0   Class :character   Class :character  
##  Median :1976   Median :   0.0   Mode  :character   Mode  :character  
##  Mean   :1971   Mean   : 808.6                                        
##  3rd Qu.:1997   3rd Qu.:1999.0                                        
##  Max.   :2014   Max.   :2014.0                                        
##    statezip           country         
##  Length:4600        Length:4600       
##  Class :character   Class :character  
##  Mode  :character   Mode  :character  
##                                       
##                                       
## 

Check missing data

# check for missing values
library(mice)
## 
## Attaching package: 'mice'
## The following object is masked from 'package:stats':
## 
##     filter
## The following objects are masked from 'package:base':
## 
##     cbind, rbind
md.pattern(data)
##  /\     /\
## {  `---'  }
## {  O   O  }
## ==>  V <==  No need for mice. This data set is completely observed.
##  \  \|/  /
##   `-----'

##      date price bedrooms bathrooms sqft_living sqft_lot floors waterfront view
## 4600    1     1        1         1           1        1      1          1    1
##         0     0        0         0           0        0      0          0    0
##      condition sqft_above sqft_basement yr_built yr_renovated street city
## 4600         1          1             1        1            1      1    1
##              0          0             0        0            0      0    0
##      statezip country  
## 4600        1       1 0
##             0       0 0

Data manipulation

Transform variables

# transform date from string to Date, then extract month and week
# (lubridate is already attached by tidyverse 2.0, see the startup message above)
data$date_time=as.Date(data$date, "%Y-%m-%d")
data$month=lubridate::month(data$date_time)
data$week=lubridate::week(data$date_time)

# encode categorical strings as integer codes
data$statezip_cat = as.integer(factor(data$statezip, levels = unique(data$statezip)))
data$city_cat = as.integer(factor(data$city, levels = unique(data$city)))

# strip house numbers and ordinal suffixes so streets collapse to street-name categories
data$street_cat=gsub('[0-9]+', '', data$street)
data$street_cat=gsub('th ', '', data$street_cat)
data$street_cat=gsub('st ', '', data$street_cat)
data$street_cat=gsub('rd ', '', data$street_cat)
data$street_cat=gsub('nd ', '', data$street_cat)
data$street_cat=gsub('- ', '', data$street_cat)
data$street_cat=gsub('-/ ', '', data$street_cat)
data$street_cat = as.integer(factor(data$street_cat))
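
To make the cleaning concrete, here is what those substitutions do to one made-up address (hypothetical, not a row from the data):

# hypothetical address, only to illustrate the cleaning steps above
addr <- "18810 124th Ave NE"
addr <- gsub('[0-9]+', '', addr)  # strip the house number and digits: " th Ave NE"
addr <- gsub('th ', '', addr)     # strip the leftover ordinal suffix: " Ave NE"

After every address is reduced this way, factor() assigns one integer code per remaining street name.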

# examine distribution 
library(DataExplorer)
plot_histogram(data)

Log transformation

# log-transform skewed variables toward approximate normality
# (price has a minimum of 0, so price_log contains -Inf values;
#  those rows are dropped later by the outlier filter)
data$price_log=log(data$price)
data$sqft_lot_log=log(data$sqft_lot)

data$sqft_basement[data$sqft_basement==0]=0.001 # replace 0 with 0.001 to avoid log(0) = -Inf
data$sqft_basement_log=log(data$sqft_basement)
data$sqft_above_log=log(data$sqft_above)
data$sqft_living_log=log(data$sqft_living)
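
For reference, a gentler alternative to the 0.001 substitution is log1p(), which maps 0 to 0 instead of to log(0.001) ≈ -6.9; it is not used here, so the outputs below reflect the substitution above.

# alternative (not used in this analysis): log1p(x) = log(1 + x) keeps zeros at 0
# data$sqft_basement_log = log1p(data$sqft_basement)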

# compute derived age variables, then log-transform them
data$house_years=2014-data$yr_built
data$house_reno_years=ifelse(data$yr_renovated==0,data$house_years,2014-data$yr_renovated)
# houses built (or renovated) in 2014 give log(0) = -Inf; those rows are dropped later by the outlier filter
data$house_years_log=log(data$house_years)
data$house_reno_years_log=log(data$house_reno_years)
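
As a quick worked check of the ifelse() logic (hypothetical years, not from the data): a house built in 1960 and renovated in 2000 gets house_years = 54 but house_reno_years = 14, while an unrenovated 1960 house keeps 54 for both.

# hypothetical check of the derivation above
yr_b <- c(1960, 1960); yr_r <- c(2000, 0)
2014 - yr_b                                  # house_years: 54 54
ifelse(yr_r == 0, 2014 - yr_b, 2014 - yr_r)  # house_reno_years: 14 54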

# select variables for analysis
data_new = data %>% select(c(
  "bedrooms", "bathrooms", "floors", "view", "waterfront", "condition",
  "statezip_cat", "city_cat", "street_cat",
  "price_log", "sqft_lot_log", "sqft_basement_log", "sqft_above_log",
  "house_years_log", "house_reno_years_log", "sqft_living_log",
  "month", "week"
))

Examine outliers

# examine outliers (categorical variables excluded)
boxplot(data_new[,!names(data_new) %in% c("statezip_cat", "city_cat", "street_cat")])

# remove outliers using the 1.5 * IQR rule
outliers <- function(x) {
  # flag values beyond the 1.5 * IQR fences
  Q1 <- quantile(x, probs=.25)
  Q3 <- quantile(x, probs=.75)
  iqr = Q3-Q1
  upper_limit = Q3 + (iqr*1.5)
  lower_limit = Q1 - (iqr*1.5)
  x > upper_limit | x < lower_limit
}
remove_outliers <- function(df, cols = names(df)) {
  # drop every row flagged as an outlier in any of the given columns
  for (col in cols) {
    df <- df[!outliers(df[[col]]),]
  }
  df
}
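
A toy check of the 1.5 * IQR rule (hypothetical vector): for c(1:10, 100), Q1 = 3.5 and Q3 = 8.5, so the fences are 3.5 - 7.5 = -4 and 8.5 + 7.5 = 16, and only the value 100 is flagged.

# toy example of the fences computed by outliers()
outliers(c(1:10, 100))  # FALSE for 1..10, TRUE for 100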
data_no_outlier=remove_outliers(data_new,names(data_new))
nrow(data_new) #4600 before removing outliers
## [1] 4600
nrow(data_no_outlier) #3085
## [1] 3085
# optional: to test the effect of sample size
# library(caTools)
# split <- sample.split(data_no_outlier, SplitRatio = 1)
# train_cl <- subset(data_no_outlier, split == "TRUE")
# nrow(train_cl)


# check outliers and distributions again (categorical variables excluded)
boxplot(data_no_outlier[,!names(data_no_outlier) %in% c("statezip_cat", "city_cat", "street_cat")])

plot_histogram(data_no_outlier )

Exploratory analysis with plots

# drop near-constant variables (view and waterfront are almost all zeros)
data_no_outlier=data_no_outlier %>% select(-c("view", "waterfront"))

# plot features
library(psych)
## 
## Attaching package: 'psych'
## The following objects are masked from 'package:ggplot2':
## 
##     %+%, alpha
pairs.panels(data_no_outlier[c(1:10)])

pairs.panels(data_no_outlier[c(10:16)])

Convert variables to factors

# convert categorical variables to factors
factor_cols <- c("bedrooms", "bathrooms", "floors", "condition", "statezip_cat",
                 "city_cat", "street_cat", "month", "week")
data_no_outlier[factor_cols] <- lapply(data_no_outlier[factor_cols], as.factor)
str(data_no_outlier)
## 'data.frame':    3085 obs. of  16 variables:
##  $ bedrooms            : Factor w/ 4 levels "2","3","4","5": 2 2 2 3 1 1 3 2 3 2 ...
##  $ bathrooms           : Factor w/ 11 levels "0.75","1","1.5",..: 3 5 6 7 2 5 5 4 7 4 ...
##  $ floors              : Factor w/ 5 levels "1","1.5","2",..: 2 1 1 1 1 1 2 1 2 1 ...
##  $ condition           : Factor w/ 4 levels "2","3","4","5": 2 3 3 3 2 2 2 2 4 2 ...
##  $ statezip_cat        : Factor w/ 70 levels "1","2","3","4",..: 1 3 4 5 6 5 6 9 10 11 ...
##  $ city_cat            : Factor w/ 34 levels "1","2","3","4",..: 1 3 4 5 2 5 2 8 2 9 ...
##  $ street_cat          : Factor w/ 551 levels "1","2","5","6",..: 70 472 271 18 240 18 18 481 469 271 ...
##  $ price_log           : num  12.7 12.7 12.9 13.2 13.1 ...
##  $ sqft_lot_log        : num  8.98 9.39 8.99 9.26 8.76 ...
##  $ sqft_basement_log   : num  -6.91 -6.91 6.91 6.68 -6.91 ...
##  $ sqft_above_log      : num  7.2 7.57 6.91 7.04 6.78 ...
##  $ house_years_log     : num  4.08 3.87 3.93 3.64 4.33 ...
##  $ house_reno_years_log: num  2.2 3.87 3.93 3.09 3 ...
##  $ sqft_living_log     : num  7.2 7.57 7.6 7.57 6.78 ...
##  $ month               : Factor w/ 3 levels "5","6","7": 1 1 1 1 1 1 1 1 1 1 ...
##  $ week                : Factor w/ 11 levels "18","19","20",..: 1 1 1 1 1 1 1 1 1 1 ...

Modeling - part 1: regression analysis

Scale features

################ scale features and split
library(caret)
## Loading required package: lattice
## 
## Attaching package: 'caret'
## The following object is masked from 'package:purrr':
## 
##     lift
# standardize the continuous predictors (the outcome and categorical columns are excluded)
x_scale <- scale(data_no_outlier[,!names(data_no_outlier) %in%
                                   c("price_log", "bedrooms", "bathrooms", "floors", "condition",
                                     "statezip_cat", "city_cat", "street_cat", "month", "week")])

# one hot encoding, create the dummy variables
dummies_model <- dummyVars( ~ bedrooms+bathrooms +floors + condition+
                            statezip_cat+city_cat+ street_cat+ month+ week, data=data_no_outlier)
x_hot <- predict(dummies_model, newdata = data_no_outlier)
data_s_hot <- data.frame(x_scale,x_hot,price_log=data_no_outlier$price_log )
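
dummyVars() expands each factor into one 0/1 indicator column per level; a minimal sketch on a toy factor (hypothetical data) shows the shape of the encoding:

# toy illustration of caret::dummyVars one-hot encoding
toy <- data.frame(floors = factor(c("1", "1.5", "2")))
predict(dummyVars(~ floors, data = toy), newdata = toy)
# expected: a 3 x 3 matrix of 0/1 indicators, one column per factor level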

Split data

# Split the data into training and test set
set.seed(12356)
training.samples <- data_s_hot$price_log %>%
  createDataPartition(p = 0.8, list = FALSE)
train.data  <- data_s_hot[training.samples, ]
test.data <- data_s_hot[-training.samples, ]

Elastic net regression

################## using the caret package
# Elastic net regression 
# Set up a grid of lambda values (used by the lasso and ridge models below)
lambda <- 10^seq(-3, 3, length = 100)
# Build the model
elastic <- train(
  price_log ~ . + poly(sqft_lot_log, degree=3) + poly(sqft_basement_log, degree=3)
  + poly(sqft_living_log, degree=3) + poly(sqft_above_log, degree=3)
  + poly(house_years_log, degree=3) + poly(house_reno_years_log, degree=3),
  data = train.data, method = "glmnet",
  trControl = trainControl("cv", number = 10),
  tuneLength = 10
)
# Model coefficients
# coef(elastic$finalModel, elastic$bestTune$lambda)
# Make predictions
predictions <- elastic %>% predict(test.data)
# Model prediction performance
elastic_rsqu=data.frame(
  RMSE = RMSE(predictions, test.data$price_log),
  Rsquare = R2(predictions, test.data$price_log)
)
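
For reference, glmnet (which caret calls here) minimizes a penalized least-squares objective; tuneLength = 10 lets caret search over both the mixing parameter alpha (alpha = 1 is the lasso, alpha = 0 is ridge) and the penalty strength lambda:

$$\min_{\beta_0,\,\beta}\ \frac{1}{2n}\sum_{i=1}^{n}\big(y_i-\beta_0-x_i^{\top}\beta\big)^2+\lambda\Big(\frac{1-\alpha}{2}\lVert\beta\rVert_2^2+\alpha\lVert\beta\rVert_1\Big)$$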

Lasso regression

##############
# lasso regression
# Build the model
lasso <- train(
  price_log ~ . + poly(sqft_lot_log, degree=3) + poly(sqft_basement_log, degree=3)
  + poly(sqft_living_log, degree=3) + poly(sqft_above_log, degree=3)
  + poly(house_years_log, degree=3) + poly(house_reno_years_log, degree=3),
  data = train.data, method = "glmnet",
  trControl = trainControl("cv", number = 10),
  tuneGrid = expand.grid(alpha = 1, lambda = lambda)
)
## Warning in nominalTrainWorkflow(x = x, y = y, wts = weights, info = trainInfo,
## : There were missing values in resampled performance measures.
# Model coefficients
# lasso_coef=coef(lasso$finalModel, lasso$bestTune$lambda)
# Make predictions
predictions <- lasso %>% predict(test.data)
# Model prediction performance
lasso_rsqu=data.frame(
  RMSE = RMSE(predictions, test.data$price_log),
  Rsquare = R2(predictions, test.data$price_log)
)

Ridge regression

################
# ridge regression
# Build the model
ridge <- train(
  price_log ~ . + poly(sqft_lot_log, degree=3) + poly(sqft_basement_log, degree=3)
  + poly(sqft_living_log, degree=3) + poly(sqft_above_log, degree=3)
  + poly(house_years_log, degree=3) + poly(house_reno_years_log, degree=3),
  data = train.data, method = "glmnet",
  trControl = trainControl("cv", number = 10),
  tuneGrid = expand.grid(alpha = 0, lambda = lambda)
)
## Warning in nominalTrainWorkflow(x = x, y = y, wts = weights, info = trainInfo,
## : There were missing values in resampled performance measures.
# Model coefficients
# coef(ridge$finalModel, ridge$bestTune$lambda)
# Make predictions
predictions <- ridge %>% predict(test.data)
# Model prediction performance
ridge_rsqu=data.frame(
  RMSE = RMSE(predictions, test.data$price_log),
  Rsquare = R2(predictions, test.data$price_log)
)

Linear regression

################
# linear regression
# Build the model
linear <- train(
  price_log ~ . + poly(sqft_lot_log, degree=3) + poly(sqft_basement_log, degree=3)
  + poly(sqft_living_log, degree=3) + poly(sqft_above_log, degree=3)
  + poly(house_years_log, degree=3) + poly(house_reno_years_log, degree=3),
  data = train.data, method = "lm",
  trControl = trainControl("cv", number = 10))
# Model coefficients
# coef(linear$finalModel) 
# Make predictions
predictions <- linear %>% predict(test.data)
# Model prediction performance
linear_rsqu=data.frame(
  RMSE = RMSE(predictions, test.data$price_log),
  Rsquare = R2(predictions, test.data$price_log)
)

Compare models

###############
# compare models performance
# training data
models <- list( linear=linear,ridge=ridge, lasso = lasso, elastic = elastic)
resamples(models) %>% summary(metric = "Rsquared")
## 
## Call:
## summary.resamples(object = ., metric = "Rsquared")
## 
## Models: linear, ridge, lasso, elastic 
## Number of resamples: 10 
## 
## Rsquared 
##              Min.   1st Qu.    Median      Mean   3rd Qu.      Max. NA's
## linear  0.7344266 0.7641324 0.7819561 0.7843697 0.8094866 0.8265475    0
## ridge   0.7641145 0.8038247 0.8297073 0.8247490 0.8531208 0.8681333    0
## lasso   0.7734900 0.8290738 0.8408816 0.8368813 0.8516262 0.8666030    0
## elastic 0.8112687 0.8302773 0.8405050 0.8419475 0.8532001 0.8672771    0
# test data
linear_rsqu
##        RMSE   Rsquare
## 1 0.2037262 0.8143295
ridge_rsqu
##        RMSE   Rsquare
## 1 0.1828943 0.8420651
lasso_rsqu
##        RMSE  Rsquare
## 1 0.1822103 0.843335
elastic_rsqu # best test-set performance
##        RMSE  Rsquare
## 1 0.1811502 0.845032

Modeling - part 2: machine learning

Multivariate Adaptive Regression Splines (MARS)

# Train a MARS model (method 'earth') on the training data
model_mars = train(price_log ~ ., data=train.data, method='earth')

# calculate variable importance
# library(ggplot2)
varimp_mars <- varImp(model_mars)
plot(varimp_mars, main="Variable Importance with MARS")

# Predict on the test data (earth returns a one-column matrix; flatten it to a vector)
predicted <- as.vector(predict(model_mars, test.data))

# Model prediction performance
MARS_rsqu=data.frame(
  RMSE = RMSE(predicted, test.data$price_log),
  Rsquare = R2(predicted, test.data$price_log)
)
#        RMSE   Rsquare
# 1 0.2017003 0.8083039

Random forest

# random forest (regression, so class probabilities do not apply)
# fitControl must exist before this call; it is defined here and reused by the models below
fitControl <- trainControl(method = 'cv', number = 5, savePredictions = 'final')
model_rf = train(price_log ~ ., data=train.data, method='rf', tuneLength=5, trControl = fitControl)

# Predict on testData
predicted <- predict(model_rf, test.data)

# Model prediction performance
rf_rsqu=data.frame(
  RMSE = RMSE(predicted, test.data$price_log),
  Rsquare = R2(predicted, test.data$price_log)
)
# RMSE   Rsquare
# 1 0.210986 0.7915845

SVM

# Train the model using SVM
model_svmRadial = train(price_log ~ ., data=train.data, method='svmRadial', tuneLength=15, trControl = fitControl)

# Predict on testData
predicted <- predict(model_svmRadial, test.data)

# Model prediction performance
svm_rsqu=data.frame(
  RMSE = RMSE(predicted, test.data$price_log),
  Rsquare = R2(predicted, test.data$price_log)
)
# RMSE   Rsquare
# 1 0.1940989 0.8223975

KNN

# Train the model using KNN
model_knn = train(price_log ~ ., data=train.data, method='knn', tuneLength=15, trControl = fitControl)

# Predict on testData
predicted <- predict(model_knn, test.data)

# Model prediction performance
knn_rsqu=data.frame(
  RMSE = RMSE(predicted, test.data$price_log),
  Rsquare = R2(predicted, test.data$price_log)
)
knn_rsqu
#        RMSE   Rsquare
# 1 0.2805626 0.6335133

Tuning hyperparameters to optimize the model

# The training control: 5-fold cross-validation (the same object defined before the random forest model)
fitControl <- trainControl(
  method = 'cv',              # k-fold cross-validation
  number = 5,                 # number of folds
  savePredictions = 'final'   # save predictions for the optimal tuning parameter
)

# Step 1: Define the tuneGrid (values of k, the number of neighbors)
knnGrid <- expand.grid(k = c(2, 4, 6, 8, 10))

# Step 2: Tune hyperparameters by setting tuneGrid
model_knn3 = train(price_log ~ ., data=train.data, method='knn', metric='RMSE', tuneGrid = knnGrid, trControl = fitControl)

# Step 3: Predict on the test data and compute performance
predicted3 <- predict(model_knn3, test.data)

# Model prediction performance
knn_tune_rsqu=data.frame(
  RMSE = RMSE(predicted3, test.data$price_log),
  Rsquare = R2(predicted3, test.data$price_log)
)
knn_tune_rsqu
# RMSE   Rsquare
# 1 0.2775675 0.6442353

Ensemble predictions from multiple models

library(caretEnsemble)

# Stacking algorithms - run multiple algorithms in one call
# (named ensControl to avoid shadowing caret's trainControl() function)
ensControl <- trainControl(method="repeatedcv", 
                           number=10, 
                           repeats=3,
                           savePredictions=TRUE)

algorithmList <- c('rf', 'knn', 'svmRadial')
  # e.g. c('rf', 'adaboost', 'earth', 'knn', 'svmRadial')

models_ens <- caretList(price_log ~ ., data=train.data, trControl=ensControl, methodList=algorithmList)
results <- resamples(models_ens)
summary(results)
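
Stacking tends to help most when the base models' errors are not strongly correlated; caret's modelCor() on the resamples object gives a quick check:

# pairwise correlation of the base models' resampled performance;
# lower correlation suggests the stack has more to gain
modelCor(results)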

# comparison by visualization
# Box plots to compare models
scales <- list(x=list(relation="free"), y=list(relation="free"))
bwplot(results, scales=scales)

# Create the trainControl
stackControl <- trainControl(method="repeatedcv", 
                             number=10, 
                             repeats=3,
                             savePredictions=TRUE )

# Ensemble the predictions of models_ens into a new combined prediction via glm
stack.glm <- caretStack(models_ens, method="glm", metric="RMSE", trControl=stackControl)

# Predict on testData
stack_predicteds <- predict(stack.glm, newdata=test.data)

# Model prediction performance
ensem_rsqu=data.frame(
  RMSE = RMSE(stack_predicteds, test.data$price_log),
  Rsquare = R2(stack_predicteds, test.data$price_log)
)
# RMSE   Rsquare
# 1 0.2788662 0.6335133
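
Conceptually, caretStack fits a meta-model on the base learners' predictions. A minimal manual sketch of that structure is below (illustrative only: caretStack itself trains the meta-model on the saved out-of-fold predictions, which avoids the optimism of refitting on in-sample predictions as this sketch does):

# illustrative sketch of the stacking idea, not caretStack's internal code
base_preds <- data.frame(
  rf        = predict(models_ens$rf,        newdata = train.data),
  knn       = predict(models_ens$knn,       newdata = train.data),
  svmRadial = predict(models_ens$svmRadial, newdata = train.data),
  price_log = train.data$price_log
)
meta <- glm(price_log ~ rf + knn + svmRadial, data = base_preds)  # linear meta-model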

Summary