Read in data
# load packages
library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr 1.1.1 ✔ readr 2.1.4
## ✔ forcats 1.0.0 ✔ stringr 1.5.0
## ✔ ggplot2 3.4.2 ✔ tibble 3.2.1
## ✔ lubridate 1.9.3 ✔ tidyr 1.3.0
## ✔ purrr 1.0.1
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
# read in data
data=read.csv("C:\\Users\\hed2\\Downloads\\statistics in biomedical\\week3\\HousePrices.csv")
summary(data)
## date price bedrooms bathrooms
## Length:4600 Min. : 0 Min. :0.000 Min. :0.000
## Class :character 1st Qu.: 322875 1st Qu.:3.000 1st Qu.:1.750
## Mode :character Median : 460943 Median :3.000 Median :2.250
## Mean : 551963 Mean :3.401 Mean :2.161
## 3rd Qu.: 654962 3rd Qu.:4.000 3rd Qu.:2.500
## Max. :26590000 Max. :9.000 Max. :8.000
## sqft_living sqft_lot floors waterfront
## Min. : 370 Min. : 638 Min. :1.000 Min. :0.000000
## 1st Qu.: 1460 1st Qu.: 5001 1st Qu.:1.000 1st Qu.:0.000000
## Median : 1980 Median : 7683 Median :1.500 Median :0.000000
## Mean : 2139 Mean : 14852 Mean :1.512 Mean :0.007174
## 3rd Qu.: 2620 3rd Qu.: 11001 3rd Qu.:2.000 3rd Qu.:0.000000
## Max. :13540 Max. :1074218 Max. :3.500 Max. :1.000000
## view condition sqft_above sqft_basement
## Min. :0.0000 Min. :1.000 Min. : 370 Min. : 0.0
## 1st Qu.:0.0000 1st Qu.:3.000 1st Qu.:1190 1st Qu.: 0.0
## Median :0.0000 Median :3.000 Median :1590 Median : 0.0
## Mean :0.2407 Mean :3.452 Mean :1827 Mean : 312.1
## 3rd Qu.:0.0000 3rd Qu.:4.000 3rd Qu.:2300 3rd Qu.: 610.0
## Max. :4.0000 Max. :5.000 Max. :9410 Max. :4820.0
## yr_built yr_renovated street city
## Min. :1900 Min. : 0.0 Length:4600 Length:4600
## 1st Qu.:1951 1st Qu.: 0.0 Class :character Class :character
## Median :1976 Median : 0.0 Mode :character Mode :character
## Mean :1971 Mean : 808.6
## 3rd Qu.:1997 3rd Qu.:1999.0
## Max. :2014 Max. :2014.0
## statezip country
## Length:4600 Length:4600
## Class :character Class :character
## Mode :character Mode :character
##
##
##
Check missing values
# check missing
library(mice)
##
## Attaching package: 'mice'
## The following object is masked from 'package:stats':
##
## filter
## The following objects are masked from 'package:base':
##
## cbind, rbind
md.pattern(data)
## /\ /\
## { `---' }
## { O O }
## ==> V <== No need for mice. This data set is completely observed.
## \ \|/ /
## `-----'

## date price bedrooms bathrooms sqft_living sqft_lot floors waterfront view
## 4600 1 1 1 1 1 1 1 1 1
## 0 0 0 0 0 0 0 0 0
## condition sqft_above sqft_basement yr_built yr_renovated street city
## 4600 1 1 1 1 1 1 1
## 0 0 0 0 0 0 0
## statezip country
## 4600 1 1 0
## 0 0 0
Data manipulation
Log transformation
# log-transform skewed variables to obtain nearly normal distributions
data$price_log=log(data$price) # note: the raw data contain price = 0, so price_log has -Inf values; the outlier filter below drops those rows
data$sqft_lot_log=log(data$sqft_lot)
data$sqft_basement[data$sqft_basement==0]=0.001 # replace 0 with 0.001 to avoid log(0) = -Inf
data$sqft_basement_log=log(data$sqft_basement)
data$sqft_above_log=log(data$sqft_above)
data$sqft_living_log=log(data$sqft_living)
# compute derived age variables, then log-transform them
data$house_years=2014-data$yr_built
data$house_reno_years=ifelse(data$yr_renovated==0,data$house_years,2014-data$yr_renovated)
# note: houses built or renovated in 2014 give 0 years and hence log(0) = -Inf;
# such rows are removed by the outlier filter below
data$house_years_log=log(data$house_years)
data$house_reno_years_log=log(data$house_reno_years)
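The select() call below also uses several derived variables (statezip_cat, city_cat, street_cat, month, week) whose construction is not shown in this report. A minimal sketch of one way to build them, assuming the *_cat columns are integer codes of the raw character columns and that the date column parses with lubridate's ymd_hms():
# hypothetical reconstruction of the derived variables used below
data$statezip_cat=as.integer(factor(data$statezip)) # integer code per zip
data$city_cat=as.integer(factor(data$city))         # integer code per city
data$street_cat=as.integer(factor(data$street))     # integer code per street
data$month=month(ymd_hms(data$date))                # sale month from the date column
data$week=week(ymd_hms(data$date))                  # sale week from the date column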
# select variables for analysis
data_new = data %>% select(
  bedrooms, bathrooms, floors, view, waterfront,
  condition, statezip_cat, city_cat, street_cat,
  price_log, sqft_lot_log, sqft_basement_log, sqft_above_log,
  house_years_log, house_reno_years_log, sqft_living_log, month, week
)
Examine outliers
# examine outliers
boxplot(data_new[,!names(data_new) %in% c("statezip_cat", "city_cat", "street_cat")]) # omit categorical variables

# remove outliers
outliers <- function(x) {
  Q1 <- quantile(x, probs = .25)
  Q3 <- quantile(x, probs = .75)
  iqr <- Q3 - Q1
  upper_limit <- Q3 + 1.5 * iqr
  lower_limit <- Q1 - 1.5 * iqr
  x > upper_limit | x < lower_limit  # TRUE for values outside the 1.5*IQR fences
}
remove_outliers <- function(df, cols = names(df)) {
  for (col in cols) {
    df <- df[!outliers(df[[col]]), ]
  }
  df
}
data_no_outlier=remove_outliers(data_new,names(data_new))
nrow(data_new) #4600 before removing outliers
## [1] 4600
nrow(data_no_outlier) #3085
## [1] 3085
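As a quick sanity check (a sketch using the outliers() helper above), count how many rows each numeric variable flags:
# count flagged rows per variable; category columns are excluded
num_cols <- setdiff(names(data_new), c("statezip_cat", "city_cat", "street_cat"))
sapply(data_new[num_cols], function(col) sum(outliers(col)))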
# optional: test the effect of training-set size on performance
# library(caTools)
# split <- sample.split(data_no_outlier, SplitRatio = 1)
# train_cl <- subset(data_no_outlier, split == "TRUE")
# nrow(train_cl)
# check outliers and distribution again
boxplot(data_no_outlier[,!names(data_no_outlier) %in% c("statezip_cat", "city_cat", "street_cat")]) # omit categorical variables

library(DataExplorer) # provides plot_histogram()
plot_histogram(data_no_outlier)

Exploratory analysis with plots
# drop view and waterfront: after outlier removal they are constant (all zero)
data_no_outlier=data_no_outlier %>% select(-c("view", "waterfront"))
# plot features
library(psych)
##
## Attaching package: 'psych'
## The following objects are masked from 'package:ggplot2':
##
## %+%, alpha
pairs.panels(data_no_outlier [c(1:10)])

pairs.panels(data_no_outlier [c(10:16)])

Convert variables to factors
# convert categorical variables to factor
data_no_outlier$bedrooms=as.factor(data_no_outlier$bedrooms)
data_no_outlier$bathrooms=as.factor(data_no_outlier$bathrooms)
data_no_outlier$floors=as.factor(data_no_outlier$floors)
data_no_outlier$condition=as.factor(data_no_outlier$condition)
data_no_outlier$statezip_cat=as.factor(data_no_outlier$statezip_cat)
data_no_outlier$city_cat=as.factor(data_no_outlier$city_cat)
data_no_outlier$street_cat=as.factor(data_no_outlier$street_cat)
data_no_outlier$month=as.factor(data_no_outlier$month)
data_no_outlier$week=as.factor(data_no_outlier$week)
str(data_no_outlier)
## 'data.frame': 3085 obs. of 16 variables:
## $ bedrooms : Factor w/ 4 levels "2","3","4","5": 2 2 2 3 1 1 3 2 3 2 ...
## $ bathrooms : Factor w/ 11 levels "0.75","1","1.5",..: 3 5 6 7 2 5 5 4 7 4 ...
## $ floors : Factor w/ 5 levels "1","1.5","2",..: 2 1 1 1 1 1 2 1 2 1 ...
## $ condition : Factor w/ 4 levels "2","3","4","5": 2 3 3 3 2 2 2 2 4 2 ...
## $ statezip_cat : Factor w/ 70 levels "1","2","3","4",..: 1 3 4 5 6 5 6 9 10 11 ...
## $ city_cat : Factor w/ 34 levels "1","2","3","4",..: 1 3 4 5 2 5 2 8 2 9 ...
## $ street_cat : Factor w/ 551 levels "1","2","5","6",..: 70 472 271 18 240 18 18 481 469 271 ...
## $ price_log : num 12.7 12.7 12.9 13.2 13.1 ...
## $ sqft_lot_log : num 8.98 9.39 8.99 9.26 8.76 ...
## $ sqft_basement_log : num -6.91 -6.91 6.91 6.68 -6.91 ...
## $ sqft_above_log : num 7.2 7.57 6.91 7.04 6.78 ...
## $ house_years_log : num 4.08 3.87 3.93 3.64 4.33 ...
## $ house_reno_years_log: num 2.2 3.87 3.93 3.09 3 ...
## $ sqft_living_log : num 7.2 7.57 7.6 7.57 6.78 ...
## $ month : Factor w/ 3 levels "5","6","7": 1 1 1 1 1 1 1 1 1 1 ...
## $ week : Factor w/ 11 levels "18","19","20",..: 1 1 1 1 1 1 1 1 1 1 ...
Modeling, part 1: regression analysis
Scale features
################ scale features and split
library(caret)
## Loading required package: lattice
##
## Attaching package: 'caret'
## The following object is masked from 'package:purrr':
##
## lift
# standardize the continuous features (z-scores via scale())
x_scale <- scale(data_no_outlier[,!names(data_no_outlier) %in% c("price_log", "bedrooms", "bathrooms", "floors", "condition",
                                                                 "statezip_cat", "city_cat", "street_cat", "month", "week")])
# one hot encoding, create the dummy variables
dummies_model <- dummyVars( ~ bedrooms+bathrooms +floors + condition+
statezip_cat+city_cat+ street_cat+ month+ week, data=data_no_outlier)
x_hot <- predict(dummies_model, newdata = data_no_outlier)
data_s_hot <- data.frame(x_scale,x_hot,price_log=data_no_outlier$price_log )
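A quick dimension check (sketch) to confirm the encoding:
dim(x_hot)      # rows x number of dummy columns
dim(data_s_hot) # rows x (scaled features + dummies + price_log)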
Split data
# Split the data into training and test set
set.seed(12356)
training.samples <- data_s_hot$price_log %>%
createDataPartition(p = 0.8, list = FALSE)
train.data <- data_s_hot[training.samples, ]
test.data <- data_s_hot[-training.samples, ]
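A sketch to verify the split sizes:
nrow(train.data) # about 80% of the rows
nrow(test.data)  # the remaining ~20%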
Elastic net regression
##################using caret package
# Elastic net regression
# Set up a grid of lambda values (used by the lasso and ridge models below)
lambda <- 10^seq(-3, 3, length = 100)
# Build the model
elastic <- train(
  price_log ~ . + poly(sqft_lot_log, degree=3) + poly(sqft_basement_log, degree=3)
    + poly(sqft_living_log, degree=3) + poly(sqft_above_log, degree=3)
    + poly(house_years_log, degree=3) + poly(house_reno_years_log, degree=3),
  data = train.data, method = "glmnet",
  trControl = trainControl("cv", number = 10),
  tuneLength = 10
)
# Model coefficients
# coef(elastic$finalModel, elastic$bestTune$lambda)
# Make predictions
predictions <- elastic %>% predict(test.data)
# Model prediction performance
elastic_rsqu=data.frame(
RMSE = RMSE(predictions, test.data$price_log),
Rsquare = R2(predictions, test.data$price_log)
)
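To see which mixing parameter and penalty the cross-validation selected, inspect caret's bestTune field:
elastic$bestTune # selected alpha (mixing) and lambda (penalty)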
Lasso regression
##############
# lasso regression
# Build the model
lasso <- train(
  price_log ~ . + poly(sqft_lot_log, degree=3) + poly(sqft_basement_log, degree=3)
    + poly(sqft_living_log, degree=3) + poly(sqft_above_log, degree=3)
    + poly(house_years_log, degree=3) + poly(house_reno_years_log, degree=3),
  data = train.data, method = "glmnet",
  trControl = trainControl("cv", number = 10),
  tuneGrid = expand.grid(alpha = 1, lambda = lambda)
)
## Warning in nominalTrainWorkflow(x = x, y = y, wts = weights, info = trainInfo,
## : There were missing values in resampled performance measures.
# Model coefficients
# lasso_coef=coef(lasso$finalModel, lasso$bestTune$lambda)
# Make predictions
predictions <- lasso %>% predict(test.data)
# Model prediction performance
lasso_rsqu=data.frame(
RMSE = RMSE(predictions, test.data$price_log),
Rsquare = R2(predictions, test.data$price_log)
)
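A sketch to check how sparse the selected lasso fit is (coefficients shrunk exactly to zero are dropped from the model):
lasso_coef <- coef(lasso$finalModel, lasso$bestTune$lambda) # sparse coefficient matrix
sum(lasso_coef != 0) # number of coefficients retained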
Ridge regression
################
# ridge regression
# Build the model
ridge <- train(
  price_log ~ . + poly(sqft_lot_log, degree=3) + poly(sqft_basement_log, degree=3)
    + poly(sqft_living_log, degree=3) + poly(sqft_above_log, degree=3)
    + poly(house_years_log, degree=3) + poly(house_reno_years_log, degree=3),
  data = train.data, method = "glmnet",
  trControl = trainControl("cv", number = 10),
  tuneGrid = expand.grid(alpha = 0, lambda = lambda)
)
## Warning in nominalTrainWorkflow(x = x, y = y, wts = weights, info = trainInfo,
## : There were missing values in resampled performance measures.
# Model coefficients
# coef(ridge$finalModel, ridge$bestTune$lambda)
# Make predictions
predictions <- ridge %>% predict(test.data)
# Model prediction performance
ridge_rsqu=data.frame(
RMSE = RMSE(predictions, test.data$price_log),
Rsquare = R2(predictions, test.data$price_log)
)
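To visualize how the cross-validated error varies across the lambda grid, caret's plot method for train objects can be used:
plot(ridge) # RMSE across the lambda grid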
Linear regression
################
# linear regression
# Build the model
linear <- train(
  price_log ~ . + poly(sqft_lot_log, degree=3) + poly(sqft_basement_log, degree=3)
    + poly(sqft_living_log, degree=3) + poly(sqft_above_log, degree=3)
    + poly(house_years_log, degree=3) + poly(house_reno_years_log, degree=3),
  data = train.data, method = "lm",
  trControl = trainControl("repeatedcv", number = 10, repeats = 20)) # repeats belongs inside trainControl
# Model coefficients
# coef(linear$finalModel)
# Make predictions
predictions <- linear %>% predict(test.data)
# Model prediction performance
linear_rsqu=data.frame(
RMSE = RMSE(predictions, test.data$price_log),
Rsquare = R2(predictions, test.data$price_log)
)
Compare models
###############
# compare models performance
# training data
models <- list(linear = linear, ridge = ridge, lasso = lasso, elastic = elastic)
resamples(models) %>% summary(metric = "Rsquared")
##
## Call:
## summary.resamples(object = ., metric = "Rsquared")
##
## Models: linear, ridge, lasso, elastic
## Number of resamples: 10
##
## Rsquared
##              Min.   1st Qu.    Median      Mean   3rd Qu.      Max. NA's
## linear  0.7344266 0.7641324 0.7819561 0.7843697 0.8094866 0.8265475    0
## ridge   0.7641145 0.8038247 0.8297073 0.8247490 0.8531208 0.8681333    0
## lasso   0.7734900 0.8290738 0.8408816 0.8368813 0.8516262 0.8666030    0
## elastic 0.8112687 0.8302773 0.8405050 0.8419475 0.8532001 0.8672771    0
# test data
linear_rsqu
## RMSE Rsquare
## 1 0.2037262 0.8143295
ridge_rsqu
## RMSE Rsquare
## 1 0.1828943 0.8420651
lasso_rsqu
## RMSE Rsquare
## 1 0.1822103 0.843335
elastic_rsqu #best
## RMSE Rsquare
## 1 0.1811502 0.845032
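For a visual comparison of the resampled R-squared values, a sketch using caret's resamples plotting:
dotplot(resamples(models), metric = "Rsquared") # resampled R-squared per model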
Modeling, part 2: machine learning
Multivariate Adaptive Regression Splines (MARS)
# Train a MARS model (caret method 'earth') and evaluate it on the test data
model_mars = train(price_log ~ ., data=train.data, method='earth')
# compute variable importance
# library(ggplot2)
varimp_mars <- varImp(model_mars)
plot(varimp_mars, main="Variable Importance with MARS")
# Predict on testData
predicted <- as.numeric(predict(model_mars, test.data)) # flatten in case predict returns a one-column matrix
# Model prediction performance
MARS_rsqu=data.frame(
RMSE = RMSE(predicted, test.data$price_log),
Rsquare = R2(predicted, test.data$price_log)
)
# RMSE Rsquare
# 1 0.2017003 0.8083039
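The selected MARS tuning parameters can be inspected via caret's bestTune field:
model_mars$bestTune # selected nprune and degree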
Random forest
# define the cross-validation control shared by the models in this part
fitControl <- trainControl(method = 'cv', number = 5, savePredictions = 'final')
# random forest (a regression task here, so class probabilities do not apply)
model_rf = train(price_log ~ ., data=train.data, method='rf', tuneLength=5, trControl = fitControl)
# Predict on testData
predicted <- predict(model_rf, test.data)
# Model prediction performance
rf_rsqu=data.frame(
RMSE = RMSE(predicted, test.data$price_log),
Rsquare = R2(predicted, test.data$price_log)
)
# RMSE Rsquare
# 1 0.210986 0.7915845
SVM
# Train the model using SVM
model_svmRadial = train(price_log ~ ., data=train.data, method='svmRadial', tuneLength=15, trControl = fitControl)
# Predict on testData
predicted <- predict(model_svmRadial, test.data)
# Model prediction performance
svm_rsqu=data.frame(
RMSE = RMSE(predicted, test.data$price_log),
Rsquare = R2(predicted, test.data$price_log)
)
# RMSE Rsquare
# 1 0.1940989 0.8223975
KNN
# Train the model using KNN
model_knn = train(price_log ~ ., data=train.data, method='knn', tuneLength=15, trControl = fitControl)
# Predict on testData
predicted <- predict(model_knn, test.data)
# Model prediction performance
knn_rsqu=data.frame(
RMSE = RMSE(predicted, test.data$price_log),
Rsquare = R2(predicted, test.data$price_log)
)
knn_rsqu
# RMSE Rsquare
# 1 0.2805626 0.6335133
Tuning hyperparameters to optimize the model
# Define the training control (the same 5-fold CV control used by the models above)
fitControl <- trainControl(
  method = 'cv',             # k-fold cross validation
  number = 5,                # number of folds
  savePredictions = 'final'  # save predictions for the optimal tuning parameter
  # classProbs = TRUE,       # return class probabilities (classification only)
  # summaryFunction = twoClassSummary # classification results summary function
)
# Step 1: Define the tuneGrid (candidate numbers of neighbors k for KNN)
knnGrid <- expand.grid(k = c(2, 4, 6, 8, 10))
# Step 2: Tune hyperparameters by setting tuneGrid
model_knn3 = train(price_log ~ ., data=train.data, method='knn', metric='RMSE', tuneGrid = knnGrid, trControl = fitControl)
# Step 3: Predict on testData
predicted3 <- predict(model_knn3, test.data)
# Model prediction performance
knn_tune_rsqu=data.frame(
RMSE = RMSE(predicted3, test.data$price_log),
Rsquare = R2(predicted3, test.data$price_log)
)
knn_tune_rsqu
# RMSE Rsquare
# 1 0.2775675 0.6442353
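To see how the cross-validated RMSE changes across the candidate k values (sketch):
plot(model_knn3) # cross-validated RMSE versus number of neighbors k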
Ensemble predictions from multiple models
library(caretEnsemble)
# Stacking Algorithms - Run multiple algos in one call.
# repeated cross-validation control for the ensemble members
# (named ensControl so it does not mask caret's trainControl() function)
ensControl <- trainControl(method="repeatedcv",
                           number=10,
                           repeats=3,
                           savePredictions=TRUE)
algorithmList <- c('rf', 'knn', 'svmRadial')
# other candidates: c('rf', 'adaboost', 'earth', 'knn', 'svmRadial')
models_ens <- caretList(price_log ~ ., data=train.data, trControl=ensControl, methodList=algorithmList)
results <- resamples(models_ens)
summary(results)
# comparison by visualization
# Box plots to compare models
scales <- list(x=list(relation="free"), y=list(relation="free"))
bwplot(results, scales=scales)
# Create the trainControl
stackControl <- trainControl(method="repeatedcv",
number=10,
repeats=3,
savePredictions=TRUE )
# Ensemble the predictions of models_ens into a combined prediction with a glm meta-model
stack.glm <- caretStack(models_ens, method="glm", metric="RMSE", trControl=stackControl)
# Predict on testData
stack_predicteds <- predict(stack.glm, newdata=test.data)
# Model prediction performance
ensem_rsqu=data.frame(
RMSE = RMSE(stack_predicteds, test.data$price_log),
Rsquare = R2(stack_predicteds, test.data$price_log)
)
# RMSE Rsquare
# 1 0.2788662 0.6335133
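Finally, a sketch that gathers the test-set results computed above into one table for side-by-side comparison:
# assemble all test-set performance data frames computed above
results_test <- rbind(linear=linear_rsqu, ridge=ridge_rsqu, lasso=lasso_rsqu,
                      elastic=elastic_rsqu, MARS=MARS_rsqu, rf=rf_rsqu,
                      svm=svm_rsqu, knn=knn_rsqu, knn_tuned=knn_tune_rsqu,
                      ensemble=ensem_rsqu)
results_test[order(results_test$RMSE), ] # lowest RMSE first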