library(tidyverse)
library(tidymodels)
library(highcharter)
# Read the OYO listings file into a tibble and preview the first rows.
oyo <- read_csv("oyoall.csv")
head(oyo)
## # A tibble: 6 x 26
##   name  city  room_type room_size  price rating fasility  cctv free_wifi
##   <chr> <chr> <chr>         <dbl>  <dbl> <chr>  <chr>    <dbl>     <dbl>
## 1 OYO ~ jaka~ Standard~       338 136895 4.7    AC TV F~     1         1
## 2 OYO ~ jaka~ Deluxe D~       154 190000 4.7    AC TV F~     0         1
## 3 OYO ~ jaka~ Standard~        96 154072 4.6    Free_Wi~     1         1
## 4 OYO ~ jaka~ Standard~       100 143910 4.5    Free_Wi~     1         1
## 5 Capi~ jaka~ Standard~       154 378153 4.6    Free_Wi~     1         1
## 6 OYO ~ jaka~ Standard~        90 233449 4.4    Free_Wi~     1         1
## # ... with 17 more variables: modern_wardrobe <dbl>, ac <dbl>,
## #   house_keeping <dbl>, mini_fridge <dbl>, seating_area <dbl>, geyser <dbl>,
## #   Parking_Facility <dbl>, Coffee_Tea_Maker <dbl>, Room_heater <dbl>,
## #   Hair_Dryer <dbl>, Refrigerator <dbl>, Toaster <dbl>, Playstation <dbl>,
## #   Water_Purifier <dbl>, Washing_Machine <dbl>, Smoking_Room <dbl>,
## #   Mini_Bar <dbl>

Data cleaning

# Recode missing ratings and the literal "no rating" to 0, then convert the
# column from character to numeric.
oyo <- mutate(
  oyo,
  rating = as.numeric(
    ifelse(is.na(rating) | rating == "no rating", 0, rating)
  )
)
glimpse(oyo)
## Observations: 909
## Variables: 26
## $ name             <chr> "OYO Flagship 210 Amethyst Kemayoran", "OYO Townho...
## $ city             <chr> "jakarta", "jakarta", "jakarta", "jakarta", "jakar...
## $ room_type        <chr> "Standard Twin", "Deluxe Double", "Standard Double...
## $ room_size        <dbl> 338, 154, 96, 100, 154, 90, 69, 80, 110, 100, 165,...
## $ price            <dbl> 136895, 190000, 154072, 143910, 378153, 233449, 13...
## $ rating           <dbl> 4.7, 4.7, 4.6, 4.5, 4.6, 4.4, 3.7, 4.4, 4.4, 4.3, ...
## $ fasility         <chr> "AC TV Free_Wifi Twin_Single_Bed Swimming_Pool Pow...
## $ cctv             <dbl> 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1,...
## $ free_wifi        <dbl> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,...
## $ modern_wardrobe  <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,...
## $ ac               <dbl> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,...
## $ house_keeping    <dbl> 0, 0, 1, 0, 1, 1, 1, 0, 0, 1, 1, 0, 1, 1, 1, 1, 0,...
## $ mini_fridge      <dbl> 0, 0, 1, 1, 1, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0,...
## $ seating_area     <dbl> 0, 1, 1, 1, 1, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0,...
## $ geyser           <dbl> 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,...
## $ Parking_Facility <dbl> 1, 0, 0, 0, 1, 1, 0, 1, 0, 0, 1, 1, 0, 1, 1, 1, 1,...
## $ Coffee_Tea_Maker <dbl> 0, 1, 0, 1, 1, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0,...
## $ Room_heater      <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,...
## $ Hair_Dryer       <dbl> 0, 0, 0, 1, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0,...
## $ Refrigerator     <dbl> 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,...
## $ Toaster          <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,...
## $ Playstation      <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,...
## $ Water_Purifier   <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,...
## $ Washing_Machine  <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,...
## $ Smoking_Room     <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,...
## $ Mini_Bar         <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,...

The “skimr” package is used to summarise the statistics of the data.

# Per-column summary (missingness, quantiles, inline histograms) via skimr.
oyo %>%
  skimr::skim()
Data summary
Name Piped data
Number of rows 909
Number of columns 26
_______________________
Column type frequency:
character 4
numeric 22
________________________
Group variables

Variable type: character

skim_variable n_missing complete_rate min max empty n_unique whitespace
name 0 1 14 56 0 897 0
city 0 1 4 10 0 9 0
room_type 0 1 7 28 0 29 0
fasility 0 1 2 539 0 794 0

Variable type: numeric

skim_variable n_missing complete_rate mean sd p0 p25 p50 p75 p100 hist
room_size 144 0.84 108.84 55.57 6 96.0 103.0 130.0 774 ▇▁▁▁▁
price 0 1.00 294321.20 505963.40 38922 117875.0 144570.0 177760.0 3000000 ▇▁▁▁▁
rating 0 1.00 3.53 1.84 0 3.8 4.4 4.6 5 ▂▁▁▁▇
cctv 0 1.00 0.72 0.45 0 0.0 1.0 1.0 1 ▃▁▁▁▇
free_wifi 0 1.00 0.78 0.42 0 1.0 1.0 1.0 1 ▂▁▁▁▇
modern_wardrobe 0 1.00 0.00 0.07 0 0.0 0.0 0.0 1 ▇▁▁▁▁
ac 0 1.00 0.99 0.10 0 1.0 1.0 1.0 1 ▁▁▁▁▇
house_keeping 0 1.00 0.58 0.49 0 0.0 1.0 1.0 1 ▆▁▁▁▇
mini_fridge 0 1.00 0.23 0.42 0 0.0 0.0 0.0 1 ▇▁▁▁▂
seating_area 0 1.00 0.25 0.43 0 0.0 0.0 1.0 1 ▇▁▁▁▃
geyser 0 1.00 0.01 0.09 0 0.0 0.0 0.0 1 ▇▁▁▁▁
Parking_Facility 0 1.00 0.71 0.46 0 0.0 1.0 1.0 1 ▃▁▁▁▇
Coffee_Tea_Maker 0 1.00 0.20 0.40 0 0.0 0.0 0.0 1 ▇▁▁▁▂
Room_heater 0 1.00 0.00 0.06 0 0.0 0.0 0.0 1 ▇▁▁▁▁
Hair_Dryer 0 1.00 0.18 0.38 0 0.0 0.0 0.0 1 ▇▁▁▁▂
Refrigerator 0 1.00 0.07 0.26 0 0.0 0.0 0.0 1 ▇▁▁▁▁
Toaster 0 1.00 0.01 0.08 0 0.0 0.0 0.0 1 ▇▁▁▁▁
Playstation 0 1.00 0.00 0.06 0 0.0 0.0 0.0 1 ▇▁▁▁▁
Water_Purifier 0 1.00 0.00 0.06 0 0.0 0.0 0.0 1 ▇▁▁▁▁
Washing_Machine 0 1.00 0.00 0.06 0 0.0 0.0 0.0 1 ▇▁▁▁▁
Smoking_Room 0 1.00 0.00 0.06 0 0.0 0.0 0.0 1 ▇▁▁▁▁
Mini_Bar 0 1.00 0.00 0.06 0 0.0 0.0 0.0 1 ▇▁▁▁▁
#### Summary statistic of numerical variables
- The min/minimum or the cheapest room price is Rp38.922
- The max/maximum or the most expensive room price is Rp3.000.000
# Base summary() of the three continuous columns.
summary(select(oyo, price, room_size, rating))
##      price           room_size         rating    
##  Min.   :  38922   Min.   :  6.0   Min.   :0.00  
##  1st Qu.: 117875   1st Qu.: 96.0   1st Qu.:3.80  
##  Median : 144570   Median :103.0   Median :4.40  
##  Mean   : 294321   Mean   :108.8   Mean   :3.53  
##  3rd Qu.: 177760   3rd Qu.:130.0   3rd Qu.:4.60  
##  Max.   :3000000   Max.   :774.0   Max.   :5.00  
##                    NA's   :144

Visual Analysis

# Interactive scatter plot of price against room size, one series per city.
hchart(
  select(oyo, price, room_size, city, name),
  type = "scatter",
  hcaes(x = room_size, y = price, group = city)
) %>%
  hc_add_theme(hc_theme_flat()) %>%
  hc_title(text = "Relationship between price and room size")

- After removing the outliers

# Same scatter plot, restricted to rooms with price < 500000 and
# room_size < 500 (rows with a missing room_size are dropped by filter()).
oyo %>%
  filter(price < 500000, room_size < 500) %>%
  select(price, room_size, city, name) %>%
  hchart(
    type = "scatter",
    hcaes(x = room_size, y = price, group = city)
  ) %>%
  hc_title(
    text = "Relationship between price and room size after remove the outliers"
  ) %>%
  hc_add_theme(hc_theme_flat())
# Look up the listing whose room size exceeds 700 while priced below 500000.
oyo %>%
  select(name, city, room_type, price, room_size, rating, fasility) %>%
  filter(room_size > 700, price < 500000)
## # A tibble: 1 x 7
##   name       city    room_type  price room_size rating fasility                 
##   <chr>      <chr>   <chr>      <dbl>     <dbl>  <dbl> <chr>                    
## 1 OYO 722 U~ yogyak~ Suite Fa~ 245910       774      5 Free_Wifi Twin_Single_Be~
# Look up the listings priced below 50000.
oyo %>%
  select(name, city, room_type, price, room_size, rating, fasility) %>%
  filter(price < 50000)
## # A tibble: 1 x 7
##   name              city   room_type price room_size rating fasility            
##   <chr>             <chr>  <chr>     <dbl>     <dbl>  <dbl> <chr>               
## 1 OYO 3133 Wisma Y~ jakar~ unknown   38922        NA    3.5 Twin_Single_Bed AC ~
# Average room price per city, highest first, drawn as a bar chart.
oyo %>%
  group_by(city) %>%
  # `digits` must sit inside round(); placed outside, summarize() treated the
  # bare 2 as an extra summary column and the mean was rounded to 0 digits.
  summarize(average = round(mean(price), 2)) %>%
  arrange(desc(average)) %>%
  hchart(
    type = "bar",
    hcaes(x = city, y = average),
    dataLabels = list(enabled = TRUE),
    name = "Average Price"
  ) %>%
  hc_add_theme(hc_theme_flat()) %>%
  hc_title(text = "Average price each city")
## `summarise()` ungrouping output (override with `.groups` argument)
# Number of listed rooms per city, largest first, drawn as a bar chart.
oyo %>%
  group_by(city) %>%
  summarize(total = n()) %>%
  arrange(desc(total)) %>%
  hchart(
    type = "bar",
    hcaes(x = city, y = total),
    # TRUE rather than T: T is an ordinary variable that can be reassigned.
    dataLabels = list(enabled = TRUE),
    name = "Total room"
  ) %>%
  hc_title(text = "Total room each city") %>%
  hc_add_theme(hc_theme_hcrt())
## `summarise()` ungrouping output (override with `.groups` argument)
# Average room size per city (rows without a room size excluded), largest
# first, drawn as a bar chart.
oyo %>%
  filter(!is.na(room_size)) %>%
  group_by(city) %>%
  # `digits` must sit inside round(); placed outside, summarize() treated the
  # bare 0 as an extra summary column.
  summarize(average = round(mean(room_size), 0)) %>%
  arrange(desc(average)) %>%
  hchart(
    type = "bar",
    hcaes(x = city, y = average),
    name = "average",
    dataLabels = list(enabled = TRUE)
  ) %>%
  hc_title(text = "Average Room Size Each City") %>%
  hc_add_theme(hc_theme_smpl())
## `summarise()` ungrouping output (override with `.groups` argument)
# Average rating per city, highest first, drawn as a bar chart.
oyo %>%
  # A rating of 0 is the placeholder written during data cleaning for
  # listings without a rating, so those rows are left out of the average.
  filter(!is.na(rating) & rating != 0) %>%
  group_by(city) %>%
  summarize(average = round(mean(rating), 2)) %>%
  arrange(desc(average)) %>%
  hchart(
    type = "bar",
    hcaes(x = city, y = average),
    # TRUE rather than T: T is an ordinary variable that can be reassigned.
    dataLabels = list(enabled = TRUE)
  ) %>%
  hc_title(text = "Average rating of room each city") %>%
  hc_add_theme(hc_theme_elementary())
## `summarise()` ungrouping output (override with `.groups` argument)

Modeling

Data Preprocessing

  • Detecting missing data
# Count the missing values in every column; returns a named list.
map(oyo, function(column) sum(is.na(column)))
## $name
## [1] 0
## 
## $city
## [1] 0
## 
## $room_type
## [1] 0
## 
## $room_size
## [1] 144
## 
## $price
## [1] 0
## 
## $rating
## [1] 0
## 
## $fasility
## [1] 0
## 
## $cctv
## [1] 0
## 
## $free_wifi
## [1] 0
## 
## $modern_wardrobe
## [1] 0
## 
## $ac
## [1] 0
## 
## $house_keeping
## [1] 0
## 
## $mini_fridge
## [1] 0
## 
## $seating_area
## [1] 0
## 
## $geyser
## [1] 0
## 
## $Parking_Facility
## [1] 0
## 
## $Coffee_Tea_Maker
## [1] 0
## 
## $Room_heater
## [1] 0
## 
## $Hair_Dryer
## [1] 0
## 
## $Refrigerator
## [1] 0
## 
## $Toaster
## [1] 0
## 
## $Playstation
## [1] 0
## 
## $Water_Purifier
## [1] 0
## 
## $Washing_Machine
## [1] 0
## 
## $Smoking_Room
## [1] 0
## 
## $Mini_Bar
## [1] 0
  • Data cleaning and processing
# Build the modelling table: drop the free-text columns, remove rows with an
# unknown room type, a missing room size or the 0 "no rating" placeholder,
# and encode the categorical columns as factors.
oyomodel <- oyo %>%
  select(-fasility, -name) %>%
  filter(
    room_type != "unknown",
    !is.na(room_size),
    rating != 0
  ) %>%
  mutate(
    # Replace the spaces first and convert to factor last: str_replace_all()
    # returns character, so running it after as.factor() silently undid the
    # factor conversion.
    room_type = as.factor(str_replace_all(room_type, " ", "_")),
    city = as.factor(city)
  )

# Recipe with price as the outcome and every other column as a predictor;
# room_type is converted to integer codes. The recipe is prepped on oyomodel
# and applied back to the same data.
oyo_prep <- prep(
  step_integer(recipes::recipe(price ~ ., data = oyomodel), room_type)
)

oyodone <- bake(oyo_prep, new_data = oyomodel)
glimpse(oyodone)
## Observations: 642
## Variables: 24
## $ city             <fct> jakarta, jakarta, jakarta, jakarta, jakarta, jakar...
## $ room_type        <dbl> 17, 2, 15, 15, 15, 15, 16, 15, 15, 15, 2, 15, 15, ...
## $ room_size        <dbl> 338, 154, 96, 100, 154, 90, 69, 80, 110, 100, 165,...
## $ rating           <dbl> 4.7, 4.7, 4.6, 4.5, 4.6, 4.4, 3.7, 4.4, 4.4, 4.3, ...
## $ cctv             <dbl> 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1,...
## $ free_wifi        <dbl> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,...
## $ modern_wardrobe  <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,...
## $ ac               <dbl> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,...
## $ house_keeping    <dbl> 0, 0, 1, 0, 1, 1, 1, 0, 0, 1, 1, 0, 1, 1, 1, 1, 0,...
## $ mini_fridge      <dbl> 0, 0, 1, 1, 1, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0,...
## $ seating_area     <dbl> 0, 1, 1, 1, 1, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0,...
## $ geyser           <dbl> 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,...
## $ Parking_Facility <dbl> 1, 0, 0, 0, 1, 1, 0, 1, 0, 0, 1, 1, 0, 1, 1, 1, 1,...
## $ Coffee_Tea_Maker <dbl> 0, 1, 0, 1, 1, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0,...
## $ Room_heater      <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,...
## $ Hair_Dryer       <dbl> 0, 0, 0, 1, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0,...
## $ Refrigerator     <dbl> 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,...
## $ Toaster          <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,...
## $ Playstation      <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,...
## $ Water_Purifier   <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,...
## $ Washing_Machine  <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,...
## $ Smoking_Room     <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,...
## $ Mini_Bar         <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,...
## $ price            <dbl> 136895, 190000, 154072, 143910, 378153, 233449, 13...
# Run the project-local helper script. calc_metrics(), called on each fitted
# model further down, is presumably defined there -- confirm against the file.
source("calc_metrics.R")
  • Split data into test and train
# Seeded 80/20 train/test split, stratified on city.
set.seed(123)
split_object <- rsample::initial_split(oyodone, prop = 0.8, strata = "city")

train_tbl <- training(split_object)
test_tbl <- testing(split_object)

Machine Learning Modeling

  • Linear Regression model
# Linear regression of price on all remaining columns, using the lm engine.
linear_regmodel <- linear_reg(mode = "regression") %>%
  set_engine("lm") %>%
  fit(price ~ ., data = train_tbl)

# Predictions for the held-out rows.
predict(linear_regmodel, new_data = test_tbl)
## Warning in predict.lm(object = object$fit, newdata = new_data, type =
## "response"): prediction from a rank-deficient fit may be misleading
## # A tibble: 128 x 1
##      .pred
##      <dbl>
##  1 175455.
##  2 187502.
##  3 167363.
##  4 288531.
##  5 302710.
##  6 156556.
##  7 182572.
##  8 196564.
##  9 167919.
## 10 165770.
## # ... with 118 more rows
# Test-set error metrics for the linear model.
calc_metrics(linear_regmodel, new_data = test_tbl, truth = price)
## # A tibble: 1 x 3
##      mae   rmse    rsq
##    <dbl>  <dbl>  <dbl>
## 1 40065. 80589. 0.0295
  • Random Forest Model
library(randomForest)
## randomForest 4.6-14
## Type rfNews() to see new features/changes/bug fixes.
## 
## Attaching package: 'randomForest'
## The following object is masked from 'package:dials':
## 
##     margin
## The following object is masked from 'package:dplyr':
## 
##     combine
## The following object is masked from 'package:ggplot2':
## 
##     margin
# Random forest regression of price on all remaining columns, using the
# randomForest engine with its default hyper-parameters.
rfmod <- rand_forest(mode = "regression") %>%
  set_engine("randomForest") %>%
  fit(price ~ ., data = train_tbl)

# Predictions for the held-out rows.
predict(rfmod, new_data = test_tbl)
## # A tibble: 128 x 1
##      .pred
##      <dbl>
##  1 178485.
##  2 170908.
##  3 150016.
##  4 297296.
##  5 225435.
##  6 180569.
##  7 170883.
##  8 190194.
##  9 203322.
## 10 204037.
## # ... with 118 more rows
# Test-set error metrics for the random forest.
calc_metrics(rfmod, new_data = test_tbl, truth = price)
## # A tibble: 1 x 3
##      mae   rmse   rsq
##    <dbl>  <dbl> <dbl>
## 1 35884. 70516. 0.193
library(vip)
## Warning: package 'vip' was built under R version 3.6.3
## 
## Attaching package: 'vip'
## The following object is masked from 'package:utils':
## 
##     vi
# Variable importance of the fitted random forest.
# The former `lambda = lrmod$fit` argument was dropped: no `lrmod` object is
# defined in this analysis and `lambda` has no meaning for a random forest; the
# call only ran because the unused argument was never evaluated.
vi(rfmod$fit)
## # A tibble: 30 x 2
##    Variable         Importance
##    <chr>                 <dbl>
##  1 rating              1.65e12
##  2 room_size           7.73e11
##  3 citysurabaya        7.61e11
##  4 house_keeping       3.29e11
##  5 citymalang          3.25e11
##  6 Refrigerator        3.12e11
##  7 room_type           2.26e11
##  8 Hair_Dryer          9.59e10
##  9 Coffee_Tea_Maker    8.70e10
## 10 cityjakarta         8.55e10
## # ... with 20 more rows
  • Xgboost model
# Fix the RNG seed ahead of the boosted-tree fit, then attach xgboost.
set.seed(12)
library(xgboost)
## 
## Attaching package: 'xgboost'
## The following object is masked from 'package:dplyr':
## 
##     slice
# Gradient-boosted trees with hand-picked hyper-parameters, fitted with the
# xgboost engine on the training rows.
xgboost_spec <- boost_tree(
  mode = "regression",
  mtry = 10,
  trees = 642,
  min_n = 2,
  tree_depth = 6,
  learn_rate = 0.35,
  loss_reduction = 0.0001
)
model_04_xgboost <- xgboost_spec %>%
  set_engine("xgboost") %>%
  fit(price ~ ., data = train_tbl)

# Predictions for the held-out rows.
predict(model_04_xgboost, new_data = test_tbl)
## # A tibble: 128 x 1
##      .pred
##      <dbl>
##  1 176268.
##  2 157875.
##  3 135172.
##  4 190768.
##  5 168886.
##  6 166003.
##  7 153852.
##  8 157386.
##  9 202954.
## 10 231018.
## # ... with 118 more rows
# Test-set error metrics for the boosted model.
calc_metrics(model_04_xgboost, new_data = test_tbl, truth = price)
## # A tibble: 1 x 3
##      mae   rmse   rsq
##    <dbl>  <dbl> <dbl>
## 1 45128. 80442. 0.119

Feature importance

  • Room size is the most important feature according to the XGBoost algorithm, which means that room size could affect the price of a room
# Extract the importance table from the underlying xgboost booster and plot it.
xgb.plot.importance(
  xgb.importance(model = model_04_xgboost$fit),
  main = "XGBoost Feature Importance"
)