Library importing
pacman::p_load(dplyr, broom, caTools, ggplot2, gridExtra, forecast)
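pacman::p_load() installs any of these packages that are missing before attaching them. If pacman itself might not be installed yet, a one-line guard (an assumed addition, not part of the original setup) covers that case:
# Hypothetical guard: install pacman once if it is not yet available
if (!requireNamespace('pacman', quietly = TRUE)) install.packages('pacman')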
Dataset importing
dataset = read.csv('realestate_roc.csv')
dataset = dataset[, c(3, 4, 5, 8)]   # keep house age, distance to MRT, store count, unit price
colnames(dataset) = c('age', 'd.mrt', 'n.store', 'price')
glimpse(dataset)
## Rows: 414
## Columns: 4
## $ age <dbl> 32.0, 19.5, 13.3, 13.3, 5.0, 7.1, 34.5, 20.3, 31.7, 17.9, 3...
## $ d.mrt <dbl> 84.87882, 306.59470, 561.98450, 561.98450, 390.56840, 2175....
## $ n.store <int> 10, 9, 5, 5, 5, 3, 7, 6, 1, 3, 1, 9, 5, 4, 4, 2, 6, 1, 8, 7...
## $ price <dbl> 37.9, 42.2, 47.3, 54.8, 43.1, 32.1, 40.3, 46.7, 18.8, 22.1,...
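Before partitioning, a quick completeness check is worthwhile; this sketch (not part of the original output) counts missing values and summarises each column:
# Count NA values per column and inspect variable ranges
colSums(is.na(dataset))
summary(dataset)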
Dataset partitioning
set.seed(123)
split = sample.split(dataset$price, SplitRatio = 2/3)
training.set = subset(dataset, split == TRUE)
test.set = subset(dataset, split == FALSE)
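sample.split() stratifies on the outcome, so roughly 2/3 of the 414 rows should land in the training set; a quick size check (illustrative, not from the original):
# Confirm the 2/3 vs 1/3 split
nrow(training.set)
nrow(test.set)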
Model fitting
# Multiple linear regression
mod.mlr = lm(price ~ .,
             data = training.set)
# Support vector regression
library('e1071')
mod.svr = svm(price ~ .,
              data = training.set,
              type = 'eps-regression',
              kernel = 'linear')
# Kernel support vector regression (e1071 already loaded above)
mod.ksvr = svm(price ~ .,
               data = training.set,
               type = 'eps-regression',
               kernel = 'radial')
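Both SVR fits use the e1071 defaults (cost = 1; gamma = 1/3 here for the radial kernel, i.e. one over the number of predictors). A grid search with e1071::tune() is the usual way to choose these; the ranges below are illustrative assumptions, not values from the original analysis:
# Hypothetical grid search over cost and gamma for the radial kernel
set.seed(123)
tune.out = tune(svm, price ~ .,
                data = training.set,
                type = 'eps-regression',
                kernel = 'radial',
                ranges = list(cost = 2^(0:4), gamma = 2^(-4:0)))
tune.out$best.parameters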
# Decision tree regression
library('rpart')
mod.dt = rpart(price ~ .,
               data = training.set,
               method = 'anova',
               control = rpart.control(xval = 10))   # 10-fold cross-validation for cp
plotcp(mod.dt)

mod.dt.p = prune(mod.dt, cp = 0.025)
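The cp = 0.025 above is read off the plotcp() curve by eye; the same choice can be made programmatically from the tree's complexity table. The names cp.best and mod.dt.p2 below are illustrative:
# Pick the cp that minimises cross-validated error (alternative to the manual choice)
cp.best = mod.dt$cptable[which.min(mod.dt$cptable[, 'xerror']), 'CP']
mod.dt.p2 = prune(mod.dt, cp = cp.best)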
# Random forest regression
library('randomForest')
mod.rf = randomForest(price ~ .,
                      data = training.set,
                      ntree = 500,
                      mtry = 1,
                      importance = TRUE)
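Since the forest was grown with importance = TRUE, the per-predictor importance scores are available; a quick look (not part of the original output):
# Permutation (%IncMSE) and node-purity importance for each predictor
importance(mod.rf)
varImpPlot(mod.rf)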
Prediction & Tuning
# Multiple linear regression
y.pred = predict(mod.mlr, newdata = test.set)
error = accuracy(y.pred, test.set$price)
rmse = round(error[2], 2)   # column 2 of forecast::accuracy() output is the RMSE
eval = data.frame(RMSE = rmse)
# Support vector regression
y.pred = predict(mod.svr, newdata = test.set)
error = accuracy(y.pred, test.set$price)
rmse = round(error[2], 2)
eval = rbind(eval, rmse)
# Kernel support vector regression
y.pred = predict(mod.ksvr, newdata = test.set)
error = accuracy(y.pred, test.set$price)
rmse = round(error[2], 2)
eval = rbind(eval, rmse)
# Decision tree regression
y.pred = predict(mod.dt.p, newdata = test.set)
error = accuracy(y.pred, test.set$price)
rmse = round(error[2], 2)
eval = rbind(eval, rmse)
# Random forest regression
y.pred = predict(mod.rf, newdata = test.set)
error = accuracy(y.pred, test.set$price)
rmse = round(error[2], 2)
eval = rbind(eval, rmse)
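The five blocks above repeat the same predict-then-score pattern; the table can equally be built in one pass over the fitted models. The names models and eval.alt are illustrative, and eval.alt should reproduce eval:
# Hypothetical refactor: test RMSE for every model in a single loop
models = list(MLR = mod.mlr, SVR = mod.svr, KSVR = mod.ksvr,
              DT = mod.dt.p, RF = mod.rf)
eval.alt = data.frame(RMSE = sapply(models, function(m)
  round(accuracy(predict(m, newdata = test.set), test.set$price)[2], 2)))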
Model evaluating
rownames(eval) = c('MLR', 'SVR', 'KSVR', 'DT', 'RF')
eval[order(eval$RMSE), , drop = FALSE]
## RMSE
## RF 9.82
## KSVR 10.14
## DT 10.21
## MLR 10.83
## SVR 10.96
eval %>%
  arrange(RMSE) %>%
  # labels are hard-coded to match the RMSE-sorted order shown above
  mutate(ML.name = factor(c('RF', 'KSVR', 'DT', 'MLR', 'SVR'),
                          levels = c('RF', 'KSVR', 'DT', 'MLR', 'SVR'))) %>%
  ggplot(aes(x = ML.name, y = RMSE)) +
  geom_col() +
  coord_cartesian(ylim = c(9, 12)) +
  labs(title = 'Regression ML vs RMSE',
       subtitle = 'Random forest has the lowest test RMSE',
       x = '')
