Library importing

pacman::p_load(dplyr, broom, caTools, ggplot2, gridExtra, forecast)
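
pacman::p_load() installs any of these packages that are missing before attaching them. If pacman itself is not yet installed, a one-line bootstrap (a common pattern, not part of the original script) takes care of that too:

if (!requireNamespace('pacman', quietly = TRUE)) install.packages('pacman')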

Dataset importing

dataset = read.csv('realestate_roc.csv')
dataset = dataset[, c(3, 4, 5, 8)]   # keep house age, distance to MRT, store count, unit price
colnames(dataset) = c('age', 'd.mrt', 'n.store', 'price')
glimpse(dataset)
## Rows: 414
## Columns: 4
## $ age     <dbl> 32.0, 19.5, 13.3, 13.3, 5.0, 7.1, 34.5, 20.3, 31.7, 17.9, 3...
## $ d.mrt   <dbl> 84.87882, 306.59470, 561.98450, 561.98450, 390.56840, 2175....
## $ n.store <int> 10, 9, 5, 5, 5, 3, 7, 6, 1, 3, 1, 9, 5, 4, 4, 2, 6, 1, 8, 7...
## $ price   <dbl> 37.9, 42.2, 47.3, 54.8, 43.1, 32.1, 40.3, 46.7, 18.8, 22.1,...
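
A quick sanity check (optional, not part of the original pipeline) shows the predictors live on very different scales, with d.mrt spanning a far wider range than n.store:

summary(dataset)
round(cor(dataset), 2)   # pairwise correlations, including each predictor vs price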

Dataset partitioning

set.seed(123)
split = sample.split(dataset$price, SplitRatio = 2/3)   # 2/3 training, 1/3 test
training.set = subset(dataset, split == TRUE)
test.set = subset(dataset, split == FALSE)
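
A quick check (again optional) confirms that roughly two thirds of the rows landed in the training set:

table(split)
nrow(training.set) / nrow(dataset)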

Model fitting

# Multiple linear regression
mod.mlr = lm(price ~ .,
             data = training.set)
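
Since broom is already loaded, the fitted coefficients and overall fit statistics can be inspected as tidy data frames:

tidy(mod.mlr)     # coefficient estimates with standard errors and p-values
glance(mod.mlr)   # R-squared, adjusted R-squared, residual sigma, etc.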

# Support vector regression
library('e1071')
mod.svr = svm(price ~ .,
              data = training.set,
              type = 'eps-regression',
              kernel = 'linear')
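
Note that svm() centres and scales all variables by default (scale = TRUE) and falls back on default hyperparameters, which can be inspected on the fitted object:

mod.svr$cost      # default cost is 1
mod.svr$epsilon   # default epsilon is 0.1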

# Kernel support vector regression
mod.ksvr = svm(price ~ .,
               data = training.set,
               type = 'eps-regression',
               kernel = 'radial')
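
The radial kernel adds a gamma parameter on top of cost. A grid search with e1071's tune.svm() is one way to check whether the defaults hold up; the grid below is illustrative, not a set of tuned values from this analysis:

set.seed(123)
tune.out = tune.svm(price ~ ., data = training.set,
                    gamma = 10^(-2:1), cost = 10^(-1:2))
tune.out$best.parameters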

# Decision tree regression
library('rpart')
mod.dt = rpart(price ~ .,
               data = training.set,
               method = 'anova',
               control = rpart.control(xval = 10))
plotcp(mod.dt)   # cross-validated error for each complexity parameter value

mod.dt.p = prune(mod.dt, cp = 0.025)
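
Rather than reading the value off the plot by eye (cp = 0.025 above was presumably taken from the plotcp() curve), the cp minimising cross-validated error can be pulled from the complexity table directly; a sketch, with mod.dt.min as a new name for illustration:

cp.min = mod.dt$cptable[which.min(mod.dt$cptable[, 'xerror']), 'CP']
mod.dt.min = prune(mod.dt, cp = cp.min)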

# Random forest regression
library('randomForest')
mod.rf = randomForest(price ~ .,
                      data = training.set,
                      ntree = 500,
                      mtry = 1,          # 1 of the 3 predictors tried per split (regression default p/3)
                      importance = TRUE) # record permutation variable importance
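
Because importance = TRUE was set, the forest records a permutation importance score for each predictor, which can be printed or plotted:

importance(mod.rf)   # %IncMSE and IncNodePurity per predictor
varImpPlot(mod.rf)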

Prediction & tuning

# Multiple linear regression
y.pred = predict(mod.mlr, newdata = test.set)
error = accuracy(y.pred, test.set$price)
rmse = round(error[2], 2)   # element 2 of the accuracy() matrix is the RMSE
eval = data.frame(RMSE = rmse)

# Support vector regression
y.pred = predict(mod.svr, newdata = test.set)
error = accuracy(y.pred, test.set$price)
rmse = round(error[2], 2)
eval = rbind(eval, rmse)

# Kernel support vector regression
y.pred = predict(mod.ksvr, newdata = test.set)
error = accuracy(y.pred, test.set$price)
rmse = round(error[2], 2)
eval = rbind(eval, rmse)

# Decision tree regression
y.pred = predict(mod.dt.p, newdata = test.set)
error = accuracy(y.pred, test.set$price)
rmse = round(error[2], 2)
eval = rbind(eval, rmse)

# Random forest regression
y.pred = predict(mod.rf, newdata = test.set)
error = accuracy(y.pred, test.set$price)
rmse = round(error[2], 2)
eval = rbind(eval, rmse)
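
The five blocks above repeat the same predict-then-score pattern. As a sketch, they could be collapsed into a single sapply() over a named list of models, computing RMSE directly rather than via forecast::accuracy():

models = list(MLR = mod.mlr, SVR = mod.svr, KSVR = mod.ksvr,
              DT = mod.dt.p, RF = mod.rf)
rmse.all = sapply(models, function(m) {
    pred = predict(m, newdata = test.set)
    round(sqrt(mean((pred - test.set$price)^2)), 2)
})
eval = data.frame(RMSE = rmse.all)   # same eval table; rownames come from the list names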

Model evaluating

rownames(eval) = c('MLR',
                   'SVR',
                   'KSVR',
                   'DT',
                   'RF')

eval[order(eval$RMSE), , drop = FALSE]
##       RMSE
## RF    9.82
## KSVR 10.14
## DT   10.21
## MLR  10.83
## SVR  10.96
eval %>% 
    tibble::rownames_to_column('ML.name') %>% 
    arrange(RMSE) %>% 
    mutate(ML.name = factor(ML.name, levels = ML.name)) %>% 
    ggplot(aes(x = ML.name, y = RMSE)) +
    geom_col() +
    coord_cartesian(ylim = c(9, 12)) +
    labs(title = 'Regression ML vs RMSE',
         subtitle = 'Random Forest performs best on the test set',
         x = '')