Oyo Indonesia Data Analysis and Machine Learning Modeling

library(tidyverse)
library(tidymodels)
library(highcharter)

# Read the raw OYO room listings and preview the first rows.
oyo <- read_csv(file = "oyoall.csv")

head(oyo)

## # A tibble: 6 x 26
##   name  city  room_type room_size  price rating fasility  cctv free_wifi
##   <chr> <chr> <chr>         <dbl>  <dbl> <chr>  <chr>    <dbl>     <dbl>
## 1 OYO ~ jaka~ Standard~       338 136895 4.7    AC TV F~     1         1
## 2 OYO ~ jaka~ Deluxe D~       154 190000 4.7    AC TV F~     0         1
## 3 OYO ~ jaka~ Standard~        96 154072 4.6    Free_Wi~     1         1
## 4 OYO ~ jaka~ Standard~       100 143910 4.5    Free_Wi~     1         1
## 5 Capi~ jaka~ Standard~       154 378153 4.6    Free_Wi~     1         1
## 6 OYO ~ jaka~ Standard~        90 233449 4.4    Free_Wi~     1         1
## # ... with 17 more variables: modern_wardrobe <dbl>, ac <dbl>,
## #   house_keeping <dbl>, mini_fridge <dbl>, seating_area <dbl>, geyser <dbl>,
## #   Parking_Facility <dbl>, Coffee_Tea_Maker <dbl>, Room_heater <dbl>,
## #   Hair_Dryer <dbl>, Refrigerator <dbl>, Toaster <dbl>, Playstation <dbl>,
## #   Water_Purifier <dbl>, Washing_Machine <dbl>, Smoking_Room <dbl>,
## #   Mini_Bar <dbl>

Data cleaning

Convert missing ratings (NA) and the "no rating" label to 0
Convert the rating column's data type to numeric

# Ratings are read as text. Missing values and the "no rating" placeholder are
# both recoded to 0, then the column is converted to numeric.
oyo <- oyo %>%
  mutate(
    rating = replace(rating, is.na(rating) | rating == "no rating", 0),
    rating = as.numeric(rating)
  )

Glimpse

# Column-wise overview of the cleaned data (types and first values).
glimpse(oyo)

## Observations: 909
## Variables: 26
## $ name             <chr> "OYO Flagship 210 Amethyst Kemayoran", "OYO Townho...
## $ city             <chr> "jakarta", "jakarta", "jakarta", "jakarta", "jakar...
## $ room_type        <chr> "Standard Twin", "Deluxe Double", "Standard Double...
## $ room_size        <dbl> 338, 154, 96, 100, 154, 90, 69, 80, 110, 100, 165,...
## $ price            <dbl> 136895, 190000, 154072, 143910, 378153, 233449, 13...
## $ rating           <dbl> 4.7, 4.7, 4.6, 4.5, 4.6, 4.4, 3.7, 4.4, 4.4, 4.3, ...
## $ fasility         <chr> "AC TV Free_Wifi Twin_Single_Bed Swimming_Pool Pow...
## $ cctv             <dbl> 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1,...
## $ free_wifi        <dbl> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,...
## $ modern_wardrobe  <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,...
## $ ac               <dbl> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,...
## $ house_keeping    <dbl> 0, 0, 1, 0, 1, 1, 1, 0, 0, 1, 1, 0, 1, 1, 1, 1, 0,...
## $ mini_fridge      <dbl> 0, 0, 1, 1, 1, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0,...
## $ seating_area     <dbl> 0, 1, 1, 1, 1, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0,...
## $ geyser           <dbl> 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,...
## $ Parking_Facility <dbl> 1, 0, 0, 0, 1, 1, 0, 1, 0, 0, 1, 1, 0, 1, 1, 1, 1,...
## $ Coffee_Tea_Maker <dbl> 0, 1, 0, 1, 1, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0,...
## $ Room_heater      <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,...
## $ Hair_Dryer       <dbl> 0, 0, 0, 1, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0,...
## $ Refrigerator     <dbl> 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,...
## $ Toaster          <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,...
## $ Playstation      <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,...
## $ Water_Purifier   <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,...
## $ Washing_Machine  <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,...
## $ Smoking_Room     <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,...
## $ Mini_Bar         <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,...

The “skimr” package is used to summarise the statistics of the data

We have 26 variables (columns): 4 character and 22 numeric
The “room_size” column also has 144 missing values

# Per-column summary statistics; kept as a pipe so the data is passed the same
# way as before.
oyo %>%
  skimr::skim()

Data summary
Name	Piped data
Number of rows	909
Number of columns	26
_______________________
Column type frequency:
character	4
numeric	22
________________________
Group variables

Variable type: character

skim_variable	complete_rate	min	max	n_unique
name	1	14	56	897
city	1	4	10	9
room_type	1	7	28	29
fasility	1	2	539	794

Variable type: numeric

skim_variable	n_missing	complete_rate	mean	sd	p0	p25	p50	p75	p100	hist
room_size	144	0.84	108.84	55.57	6	96.0	103.0	130.0	774	▇▁▁▁▁
price	0	1.00	294321.20	505963.40	38922	117875.0	144570.0	177760.0	3000000	▇▁▁▁▁
rating	0	1.00	3.53	1.84	0	3.8	4.4	4.6	5	▂▁▁▁▇
cctv	0	1.00	0.72	0.45	0	0.0	1.0	1.0	1	▃▁▁▁▇
free_wifi	0	1.00	0.78	0.42	0	1.0	1.0	1.0	1	▂▁▁▁▇
modern_wardrobe	0	1.00	0.00	0.07	0	0.0	0.0	0.0	1	▇▁▁▁▁
ac	0	1.00	0.99	0.10	0	1.0	1.0	1.0	1	▁▁▁▁▇
house_keeping	0	1.00	0.58	0.49	0	0.0	1.0	1.0	1	▆▁▁▁▇
mini_fridge	0	1.00	0.23	0.42	0	0.0	0.0	0.0	1	▇▁▁▁▂
seating_area	0	1.00	0.25	0.43	0	0.0	0.0	1.0	1	▇▁▁▁▃
geyser	0	1.00	0.01	0.09	0	0.0	0.0	0.0	1	▇▁▁▁▁
Parking_Facility	0	1.00	0.71	0.46	0	0.0	1.0	1.0	1	▃▁▁▁▇
Coffee_Tea_Maker	0	1.00	0.20	0.40	0	0.0	0.0	0.0	1	▇▁▁▁▂
Room_heater	0	1.00	0.00	0.06	0	0.0	0.0	0.0	1	▇▁▁▁▁
Hair_Dryer	0	1.00	0.18	0.38	0	0.0	0.0	0.0	1	▇▁▁▁▂
Refrigerator	0	1.00	0.07	0.26	0	0.0	0.0	0.0	1	▇▁▁▁▁
Toaster	0	1.00	0.01	0.08	0	0.0	0.0	0.0	1	▇▁▁▁▁
Playstation	0	1.00	0.00	0.06	0	0.0	0.0	0.0	1	▇▁▁▁▁
Water_Purifier	0	1.00	0.00	0.06	0	0.0	0.0	0.0	1	▇▁▁▁▁
Washing_Machine	0	1.00	0.00	0.06	0	0.0	0.0	0.0	1	▇▁▁▁▁
Smoking_Room	0	1.00	0.00	0.06	0	0.0	0.0	0.0	1	▇▁▁▁▁
Mini_Bar	0	1.00	0.00	0.06	0	0.0	0.0	0.0	1	▇▁▁▁▁
#### Summary statistic of numerical variables
- The min/minimum or the cheapest room price is Rp38.922
- The max/maximum or the most expensive room price is Rp3.000.000

# Base summary() of the three continuous columns.
summary(select(oyo, price, room_size, rating))

##      price           room_size         rating    
##  Min.   :  38922   Min.   :  6.0   Min.   :0.00  
##  1st Qu.: 117875   1st Qu.: 96.0   1st Qu.:3.80  
##  Median : 144570   Median :103.0   Median :4.40  
##  Mean   : 294321   Mean   :108.8   Mean   :3.53  
##  3rd Qu.: 177760   3rd Qu.:130.0   3rd Qu.:4.60  
##  Max.   :3000000   Max.   :774.0   Max.   :5.00  
##                    NA's   :144

Visual Analysis

The scatterplot below shows that there are outliers in the dataset

# Scatter plot of price against room size, one series per city.
hchart(
  select(oyo, price, room_size, city, name),
  type = "scatter",
  hcaes(x = room_size, y = price, group = city)
) %>%
  hc_add_theme(hc_theme_flat()) %>%
  hc_title(text = "Relationship between price and room size")

- After removing the outliers

# Same scatter plot restricted to rooms under Rp500,000 and under 500 in size.
# Rows with a missing room_size are dropped by the filter as well.
hchart(
  filter(
    select(oyo, price, room_size, city, name),
    price < 500000,
    room_size < 500
  ),
  type = "scatter",
  hcaes(x = room_size, y = price, group = city)
) %>%
  hc_title(text = "Relationship between price and room size after remove the outliers") %>%
  hc_add_theme(hc_theme_flat())

The room with the largest size

# Rooms larger than 700 that still cost less than Rp500,000.
oyo %>%
  filter(room_size > 700, price < 500000) %>%
  select(name, city, room_type, price, room_size, rating, fasility)

## # A tibble: 1 x 7
##   name       city    room_type  price room_size rating fasility                 
##   <chr>      <chr>   <chr>      <dbl>     <dbl>  <dbl> <chr>                    
## 1 OYO 722 U~ yogyak~ Suite Fa~ 245910       774      5 Free_Wifi Twin_Single_Be~

The cheapest room

# Rooms priced below Rp50,000.
oyo %>%
  select(name, city, room_type, price, room_size, rating, fasility) %>%
  filter(price < 50000)

## # A tibble: 1 x 7
##   name              city   room_type price room_size rating fasility            
##   <chr>             <chr>  <chr>     <dbl>     <dbl>  <dbl> <chr>               
## 1 OYO 3133 Wisma Y~ jakar~ unknown   38922        NA    3.5 Twin_Single_Bed AC ~

# Average room price per city, highest first, as a bar chart.
# `digits` must sit inside round(): the original `round(mean(price)), 2`
# rounded to 0 digits and handed the stray `2` to summarize(), which added an
# extra constant column named `2`.
oyo %>%
  select(city, price) %>%
  group_by(city) %>%
  summarize(average = round(mean(price), 2)) %>%
  arrange(desc(average)) %>%
  hchart(
    type = "bar",
    hcaes(x = city, y = average),
    # TRUE rather than T: T is an ordinary variable that can be reassigned.
    dataLabels = list(enabled = TRUE),
    name = "Average Price"
  ) %>%
  hc_add_theme(hc_theme_flat()) %>%
  hc_title(text = "Average price each city")

## `summarise()` ungrouping output (override with `.groups` argument)

# Number of listed rooms per city, largest first, as a bar chart.
# count() replaces group_by() + summarize(n()) and returns ungrouped output.
oyo %>%
  count(city, name = "total") %>%
  arrange(desc(total)) %>%
  hchart(
    type = "bar",
    hcaes(x = city, y = total),
    # TRUE rather than T: T is an ordinary variable that can be reassigned.
    dataLabels = list(enabled = TRUE),
    name = "Total room"
  ) %>%
  hc_title(text = "Total room each city") %>%
  hc_add_theme(hc_theme_hcrt())

## `summarise()` ungrouping output (override with `.groups` argument)

# Average room size per city (rooms with a known size only), largest first.
# `digits` must sit inside round(): the original `round(mean(room_size)), 0`
# passed the stray `0` to summarize(), which added a constant column named `0`.
oyo %>%
  select(city, room_size) %>%
  filter(!is.na(room_size)) %>%
  group_by(city) %>%
  summarize(average = round(mean(room_size), 0)) %>%
  arrange(desc(average)) %>%
  hchart(
    type = "bar",
    hcaes(x = city, y = average),
    name = "average",
    # TRUE rather than T: T is an ordinary variable that can be reassigned.
    dataLabels = list(enabled = TRUE)
  ) %>%
  hc_title(text = "Average Room Size Each City") %>%
  hc_add_theme(hc_theme_smpl())

## `summarise()` ungrouping output (override with `.groups` argument)

# Average rating per city, highest first, as a bar chart.
# Rooms with rating 0 are excluded so they do not drag the averages down.
# filter() already drops rows where the condition is NA, so a separate
# is.na() test is not needed.
oyo %>%
  select(rating, city) %>%
  filter(rating != 0) %>%
  group_by(city) %>%
  summarize(average = round(mean(rating), 2)) %>%
  arrange(desc(average)) %>%
  hchart(
    type = "bar",
    hcaes(x = city, y = average),
    # Series name added so this chart is labelled like the other bar charts.
    name = "Average rating",
    # TRUE rather than T: T is an ordinary variable that can be reassigned.
    dataLabels = list(enabled = TRUE)
  ) %>%
  hc_title(text = "Average rating of room each city") %>%
  hc_add_theme(hc_theme_elementary())

## `summarise()` ungrouping output (override with `.groups` argument)

Modeling

Data Preprocessing

Detecting missing data

# Count the missing values in every column; returns a named list.
map(oyo, function(column) sum(is.na(column)))

## $name
## [1] 0
## 
## $city
## [1] 0
## 
## $room_type
## [1] 0
## 
## $room_size
## [1] 144
## 
## $price
## [1] 0
## 
## $rating
## [1] 0
## 
## $fasility
## [1] 0
## 
## $cctv
## [1] 0
## 
## $free_wifi
## [1] 0
## 
## $modern_wardrobe
## [1] 0
## 
## $ac
## [1] 0
## 
## $house_keeping
## [1] 0
## 
## $mini_fridge
## [1] 0
## 
## $seating_area
## [1] 0
## 
## $geyser
## [1] 0
## 
## $Parking_Facility
## [1] 0
## 
## $Coffee_Tea_Maker
## [1] 0
## 
## $Room_heater
## [1] 0
## 
## $Hair_Dryer
## [1] 0
## 
## $Refrigerator
## [1] 0
## 
## $Toaster
## [1] 0
## 
## $Playstation
## [1] 0
## 
## $Water_Purifier
## [1] 0
## 
## $Washing_Machine
## [1] 0
## 
## $Smoking_Room
## [1] 0
## 
## $Mini_Bar
## [1] 0

Data cleaning and processing

# Build the modelling table: drop the free-text columns, remove rows with an
# unknown room type, a missing room size or a rating of 0, and encode the
# categorical columns as factors.
# Spaces in room_type are replaced BEFORE the factor conversion: the original
# called as.factor() first and then str_replace_all(), which returns a
# character vector and so silently undid the conversion.
oyomodel <- oyo %>%
  select(-fasility, -name) %>%
  filter(
    room_type != "unknown",
    !is.na(room_size),
    rating != 0
  ) %>%
  mutate(
    room_type = as.factor(str_replace_all(room_type, " ", "_")),
    city = as.factor(city)
  )

# Recipe with price as the outcome and every other column as a predictor;
# room_type is converted to integer codes.
oyo_prep <- prep(
  step_integer(
    recipes::recipe(price ~ ., data = oyomodel),
    room_type
  )
)

# Apply the prepared recipe to the same data to obtain the final table.
oyodone <- oyo_prep %>% bake(new_data = oyomodel)

glimpse(oyodone)

## Observations: 642
## Variables: 24
## $ city             <fct> jakarta, jakarta, jakarta, jakarta, jakarta, jakar...
## $ room_type        <dbl> 17, 2, 15, 15, 15, 15, 16, 15, 15, 15, 2, 15, 15, ...
## $ room_size        <dbl> 338, 154, 96, 100, 154, 90, 69, 80, 110, 100, 165,...
## $ rating           <dbl> 4.7, 4.7, 4.6, 4.5, 4.6, 4.4, 3.7, 4.4, 4.4, 4.3, ...
## $ cctv             <dbl> 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1,...
## $ free_wifi        <dbl> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,...
## $ modern_wardrobe  <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,...
## $ ac               <dbl> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,...
## $ house_keeping    <dbl> 0, 0, 1, 0, 1, 1, 1, 0, 0, 1, 1, 0, 1, 1, 1, 1, 0,...
## $ mini_fridge      <dbl> 0, 0, 1, 1, 1, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0,...
## $ seating_area     <dbl> 0, 1, 1, 1, 1, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0,...
## $ geyser           <dbl> 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,...
## $ Parking_Facility <dbl> 1, 0, 0, 0, 1, 1, 0, 1, 0, 0, 1, 1, 0, 1, 1, 1, 1,...
## $ Coffee_Tea_Maker <dbl> 0, 1, 0, 1, 1, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0,...
## $ Room_heater      <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,...
## $ Hair_Dryer       <dbl> 0, 0, 0, 1, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0,...
## $ Refrigerator     <dbl> 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,...
## $ Toaster          <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,...
## $ Playstation      <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,...
## $ Water_Purifier   <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,...
## $ Washing_Machine  <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,...
## $ Smoking_Room     <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,...
## $ Mini_Bar         <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,...
## $ price            <dbl> 136895, 190000, 154072, 143910, 378153, 233449, 13...

# Run the helper script; calc_metrics(), used in the chunks below, is
# presumably defined there (confirm against calc_metrics.R).
source(file = "calc_metrics.R")

Split data into test and train

# Reproducible 80/20 train/test split, stratified by city.
set.seed(123)
split_object <- rsample::initial_split(oyodone, prop = 0.8, strata = "city")

train_tbl <- training(split_object)
test_tbl <- testing(split_object)

Machine Learning Modeling

Linear Regression model

# Linear regression of price on all remaining columns, fitted with lm.
linear_regmodel <- fit(
  set_engine(linear_reg(mode = "regression"), "lm"),
  price ~ .,
  data = train_tbl
)

# Predicted prices for the held-out test rows.
predict(linear_regmodel, new_data = test_tbl)

## Warning in predict.lm(object = object$fit, newdata = new_data, type =
## "response"): prediction from a rank-deficient fit may be misleading

## # A tibble: 128 x 1
##      .pred
##      <dbl>
##  1 175455.
##  2 187502.
##  3 167363.
##  4 288531.
##  5 302710.
##  6 156556.
##  7 182572.
##  8 196564.
##  9 167919.
## 10 165770.
## # ... with 118 more rows

# Test-set metrics for the linear model, computed by the calc_metrics() helper.
calc_metrics(linear_regmodel, new_data = test_tbl, truth = price)

## # A tibble: 1 x 3
##      mae   rmse    rsq
##    <dbl>  <dbl>  <dbl>
## 1 40065. 80589. 0.0295

Random Forest Model

library(randomForest)

## randomForest 4.6-14

## Type rfNews() to see new features/changes/bug fixes.

## 
## Attaching package: 'randomForest'

## The following object is masked from 'package:dials':
## 
##     margin

## The following object is masked from 'package:dplyr':
## 
##     combine

## The following object is masked from 'package:ggplot2':
## 
##     margin

# Random forest regression of price on all remaining columns
# (randomForest engine, default hyperparameters).
rfmod <- fit(
  set_engine(rand_forest(mode = "regression"), "randomForest"),
  price ~ .,
  data = train_tbl
)

# Predicted prices for the held-out test rows.
predict(rfmod, new_data = test_tbl)

## # A tibble: 128 x 1
##      .pred
##      <dbl>
##  1 178485.
##  2 170908.
##  3 150016.
##  4 297296.
##  5 225435.
##  6 180569.
##  7 170883.
##  8 190194.
##  9 203322.
## 10 204037.
## # ... with 118 more rows

# Test-set metrics for the random forest, computed by the calc_metrics() helper.
calc_metrics(rfmod, new_data = test_tbl, truth = price)

## # A tibble: 1 x 3
##      mae   rmse   rsq
##    <dbl>  <dbl> <dbl>
## 1 35884. 70516. 0.193

library(vip)

## Warning: package 'vip' was built under R version 3.6.3

## 
## Attaching package: 'vip'

## The following object is masked from 'package:utils':
## 
##     vi

# Variable importance of the fitted random forest.
# The original call also passed `lambda = lrmod$fit`, but no object named
# `lrmod` is created in this document (the linear model is `linear_regmodel`).
# The call presumably only ran because R never evaluated that argument, so it
# is removed here.
vi(rfmod$fit)

## # A tibble: 30 x 2
##    Variable         Importance
##    <chr>                 <dbl>
##  1 rating              1.65e12
##  2 room_size           7.73e11
##  3 citysurabaya        7.61e11
##  4 house_keeping       3.29e11
##  5 citymalang          3.25e11
##  6 Refrigerator        3.12e11
##  7 room_type           2.26e11
##  8 Hair_Dryer          9.59e10
##  9 Coffee_Tea_Maker    8.70e10
## 10 cityjakarta         8.55e10
## # ... with 20 more rows

Xgboost model

# Fix the RNG state before the XGBoost fit below.
set.seed(seed = 12)
library(xgboost)

## 
## Attaching package: 'xgboost'

## The following object is masked from 'package:dplyr':
## 
##     slice

# Gradient-boosted tree regression of price on all remaining columns
# (xgboost engine, hand-picked hyperparameters).
model_04_xgboost <- fit(
  set_engine(
    boost_tree(
      mode = "regression",
      mtry = 10,
      trees = 642,
      min_n = 2,
      tree_depth = 6,
      learn_rate = 0.35,
      loss_reduction = 0.0001
    ),
    "xgboost"
  ),
  price ~ .,
  data = train_tbl
)

# Predicted prices for the held-out test rows.
predict(model_04_xgboost, new_data = test_tbl)

## # A tibble: 128 x 1
##      .pred
##      <dbl>
##  1 176268.
##  2 157875.
##  3 135172.
##  4 190768.
##  5 168886.
##  6 166003.
##  7 153852.
##  8 157386.
##  9 202954.
## 10 231018.
## # ... with 118 more rows

# Test-set metrics for the XGBoost model, computed by the calc_metrics() helper.
calc_metrics(model_04_xgboost, new_data = test_tbl, truth = price)

## # A tibble: 1 x 3
##      mae   rmse   rsq
##    <dbl>  <dbl> <dbl>
## 1 45128. 80442. 0.119

Feature importance

Room size is the most important feature according to the XGBoost algorithm, which means that room size could affect the price of a room

# Compute the feature importance table from the underlying xgboost booster
# and plot it.
xgb.plot.importance(
  xgb.importance(model = model_04_xgboost$fit),
  main = "XGBoost Feature Importance"
)