library(tidyverse)
library(tidymodels)
library(highcharter)
# Read the raw room listings from disk and preview the first rows.
oyo <- read_csv(file = "oyoall.csv")
head(oyo)
## # A tibble: 6 x 26
## name city room_type room_size price rating fasility cctv free_wifi
## <chr> <chr> <chr> <dbl> <dbl> <chr> <chr> <dbl> <dbl>
## 1 OYO ~ jaka~ Standard~ 338 136895 4.7 AC TV F~ 1 1
## 2 OYO ~ jaka~ Deluxe D~ 154 190000 4.7 AC TV F~ 0 1
## 3 OYO ~ jaka~ Standard~ 96 154072 4.6 Free_Wi~ 1 1
## 4 OYO ~ jaka~ Standard~ 100 143910 4.5 Free_Wi~ 1 1
## 5 Capi~ jaka~ Standard~ 154 378153 4.6 Free_Wi~ 1 1
## 6 OYO ~ jaka~ Standard~ 90 233449 4.4 Free_Wi~ 1 1
## # ... with 17 more variables: modern_wardrobe <dbl>, ac <dbl>,
## # house_keeping <dbl>, mini_fridge <dbl>, seating_area <dbl>, geyser <dbl>,
## # Parking_Facility <dbl>, Coffee_Tea_Maker <dbl>, Room_heater <dbl>,
## # Hair_Dryer <dbl>, Refrigerator <dbl>, Toaster <dbl>, Playstation <dbl>,
## # Water_Purifier <dbl>, Washing_Machine <dbl>, Smoking_Room <dbl>,
## # Mini_Bar <dbl>
# `rating` is read as text. Missing values and the literal "no rating" are
# both recoded to "0" before the column is converted to numeric.
oyo <- oyo %>%
  mutate(
    rating = if_else(is.na(rating) | rating == "no rating", "0", rating),
    rating = as.numeric(rating)
  )
glimpse(oyo)
## Observations: 909
## Variables: 26
## $ name <chr> "OYO Flagship 210 Amethyst Kemayoran", "OYO Townho...
## $ city <chr> "jakarta", "jakarta", "jakarta", "jakarta", "jakar...
## $ room_type <chr> "Standard Twin", "Deluxe Double", "Standard Double...
## $ room_size <dbl> 338, 154, 96, 100, 154, 90, 69, 80, 110, 100, 165,...
## $ price <dbl> 136895, 190000, 154072, 143910, 378153, 233449, 13...
## $ rating <dbl> 4.7, 4.7, 4.6, 4.5, 4.6, 4.4, 3.7, 4.4, 4.4, 4.3, ...
## $ fasility <chr> "AC TV Free_Wifi Twin_Single_Bed Swimming_Pool Pow...
## $ cctv <dbl> 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1,...
## $ free_wifi <dbl> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,...
## $ modern_wardrobe <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,...
## $ ac <dbl> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,...
## $ house_keeping <dbl> 0, 0, 1, 0, 1, 1, 1, 0, 0, 1, 1, 0, 1, 1, 1, 1, 0,...
## $ mini_fridge <dbl> 0, 0, 1, 1, 1, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0,...
## $ seating_area <dbl> 0, 1, 1, 1, 1, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0,...
## $ geyser <dbl> 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,...
## $ Parking_Facility <dbl> 1, 0, 0, 0, 1, 1, 0, 1, 0, 0, 1, 1, 0, 1, 1, 1, 1,...
## $ Coffee_Tea_Maker <dbl> 0, 1, 0, 1, 1, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0,...
## $ Room_heater <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,...
## $ Hair_Dryer <dbl> 0, 0, 0, 1, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0,...
## $ Refrigerator <dbl> 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,...
## $ Toaster <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,...
## $ Playstation <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,...
## $ Water_Purifier <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,...
## $ Washing_Machine <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,...
## $ Smoking_Room <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,...
## $ Mini_Bar <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,...
# Per-column overview (missingness and distribution) of the whole table.
oyo %>% skimr::skim()
| Name | Piped data |
| Number of rows | 909 |
| Number of columns | 26 |
| _______________________ | |
| Column type frequency: | |
| character | 4 |
| numeric | 22 |
| ________________________ | |
| Group variables |
Variable type: character
| skim_variable | n_missing | complete_rate | min | max | empty | n_unique | whitespace |
|---|---|---|---|---|---|---|---|
| name | 0 | 1 | 14 | 56 | 0 | 897 | 0 |
| city | 0 | 1 | 4 | 10 | 0 | 9 | 0 |
| room_type | 0 | 1 | 7 | 28 | 0 | 29 | 0 |
| fasility | 0 | 1 | 2 | 539 | 0 | 794 | 0 |
Variable type: numeric
| skim_variable | n_missing | complete_rate | mean | sd | p0 | p25 | p50 | p75 | p100 | hist |
|---|---|---|---|---|---|---|---|---|---|---|
| room_size | 144 | 0.84 | 108.84 | 55.57 | 6 | 96.0 | 103.0 | 130.0 | 774 | ▇▁▁▁▁ |
| price | 0 | 1.00 | 294321.20 | 505963.40 | 38922 | 117875.0 | 144570.0 | 177760.0 | 3000000 | ▇▁▁▁▁ |
| rating | 0 | 1.00 | 3.53 | 1.84 | 0 | 3.8 | 4.4 | 4.6 | 5 | ▂▁▁▁▇ |
| cctv | 0 | 1.00 | 0.72 | 0.45 | 0 | 0.0 | 1.0 | 1.0 | 1 | ▃▁▁▁▇ |
| free_wifi | 0 | 1.00 | 0.78 | 0.42 | 0 | 1.0 | 1.0 | 1.0 | 1 | ▂▁▁▁▇ |
| modern_wardrobe | 0 | 1.00 | 0.00 | 0.07 | 0 | 0.0 | 0.0 | 0.0 | 1 | ▇▁▁▁▁ |
| ac | 0 | 1.00 | 0.99 | 0.10 | 0 | 1.0 | 1.0 | 1.0 | 1 | ▁▁▁▁▇ |
| house_keeping | 0 | 1.00 | 0.58 | 0.49 | 0 | 0.0 | 1.0 | 1.0 | 1 | ▆▁▁▁▇ |
| mini_fridge | 0 | 1.00 | 0.23 | 0.42 | 0 | 0.0 | 0.0 | 0.0 | 1 | ▇▁▁▁▂ |
| seating_area | 0 | 1.00 | 0.25 | 0.43 | 0 | 0.0 | 0.0 | 1.0 | 1 | ▇▁▁▁▃ |
| geyser | 0 | 1.00 | 0.01 | 0.09 | 0 | 0.0 | 0.0 | 0.0 | 1 | ▇▁▁▁▁ |
| Parking_Facility | 0 | 1.00 | 0.71 | 0.46 | 0 | 0.0 | 1.0 | 1.0 | 1 | ▃▁▁▁▇ |
| Coffee_Tea_Maker | 0 | 1.00 | 0.20 | 0.40 | 0 | 0.0 | 0.0 | 0.0 | 1 | ▇▁▁▁▂ |
| Room_heater | 0 | 1.00 | 0.00 | 0.06 | 0 | 0.0 | 0.0 | 0.0 | 1 | ▇▁▁▁▁ |
| Hair_Dryer | 0 | 1.00 | 0.18 | 0.38 | 0 | 0.0 | 0.0 | 0.0 | 1 | ▇▁▁▁▂ |
| Refrigerator | 0 | 1.00 | 0.07 | 0.26 | 0 | 0.0 | 0.0 | 0.0 | 1 | ▇▁▁▁▁ |
| Toaster | 0 | 1.00 | 0.01 | 0.08 | 0 | 0.0 | 0.0 | 0.0 | 1 | ▇▁▁▁▁ |
| Playstation | 0 | 1.00 | 0.00 | 0.06 | 0 | 0.0 | 0.0 | 0.0 | 1 | ▇▁▁▁▁ |
| Water_Purifier | 0 | 1.00 | 0.00 | 0.06 | 0 | 0.0 | 0.0 | 0.0 | 1 | ▇▁▁▁▁ |
| Washing_Machine | 0 | 1.00 | 0.00 | 0.06 | 0 | 0.0 | 0.0 | 0.0 | 1 | ▇▁▁▁▁ |
| Smoking_Room | 0 | 1.00 | 0.00 | 0.06 | 0 | 0.0 | 0.0 | 0.0 | 1 | ▇▁▁▁▁ |
| Mini_Bar | 0 | 1.00 | 0.00 | 0.06 | 0 | 0.0 | 0.0 | 0.0 | 1 | ▇▁▁▁▁ |
#### Summary statistics of numerical variables
- The minimum (cheapest) room price is Rp38.922
- The maximum (most expensive) room price is Rp3.000.000
# Min / quartiles / mean / max for the three continuous columns.
summary(select(oyo, price, room_size, rating))
## price room_size rating
## Min. : 38922 Min. : 6.0 Min. :0.00
## 1st Qu.: 117875 1st Qu.: 96.0 1st Qu.:3.80
## Median : 144570 Median :103.0 Median :4.40
## Mean : 294321 Mean :108.8 Mean :3.53
## 3rd Qu.: 177760 3rd Qu.:130.0 3rd Qu.:4.60
## Max. :3000000 Max. :774.0 Max. :5.00
## NA's :144
# Interactive scatter plot of price against room size, one series per city.
oyo %>%
  select(price, room_size, city, name) %>%
  hchart(
    mapping = hcaes(x = room_size, y = price, group = city),
    type = "scatter"
  ) %>%
  hc_add_theme(hc_theme_flat()) %>%
  hc_title(text = "Relationship between price and room size")
- After removing the outliers - Seem
# Same scatter plot, restricted to rooms priced below 500000 and smaller
# than 500 so the extreme points no longer dominate the axes.
oyo %>%
  select(price, room_size, city, name) %>%
  filter(price < 500000, room_size < 500) %>%
  hchart(
    mapping = hcaes(x = room_size, y = price, group = city),
    type = "scatter"
  ) %>%
  hc_title(
    text = "Relationship between price and room size after remove the outliers"
  ) %>%
  hc_add_theme(hc_theme_flat())
# Look up the listing with a room size above 700 but a price below 500000.
oyo %>%
  select(name, city, room_type, price, room_size, rating, fasility) %>%
  filter(room_size > 700, price < 500000)
## # A tibble: 1 x 7
## name city room_type price room_size rating fasility
## <chr> <chr> <chr> <dbl> <dbl> <dbl> <chr>
## 1 OYO 722 U~ yogyak~ Suite Fa~ 245910 774 5 Free_Wifi Twin_Single_Be~
# Look up the listings priced below 50000.
oyo %>%
  select(name, city, room_type, price, room_size, rating, fasility) %>%
  filter(price < 50000)
## # A tibble: 1 x 7
## name city room_type price room_size rating fasility
## <chr> <chr> <chr> <dbl> <dbl> <dbl> <chr>
## 1 OYO 3133 Wisma Y~ jakar~ unknown 38922 NA 3.5 Twin_Single_Bed AC ~
# Mean room price per city, highest first, drawn as a labelled bar chart.
# The rounding precision must be passed to round() itself: written outside
# the call, summarize() treated the bare `2` as a second summary expression
# and appended a constant column named `2`, while the mean was rounded to
# zero decimals.
oyo %>%
  select(city, price) %>%
  group_by(city) %>%
  summarize(average = round(mean(price), digits = 2)) %>%
  arrange(desc(average)) %>%
  hchart(
    type = "bar",
    hcaes(x = city, y = average),
    # TRUE rather than T: `T` is an ordinary, reassignable variable.
    dataLabels = list(enabled = TRUE),
    name = "Average Price"
  ) %>%
  hc_add_theme(hc_theme_flat()) %>%
  hc_title(text = "Average price each city")
## `summarise()` ungrouping output (override with `.groups` argument)
# Row count per city, largest first, drawn as a labelled bar chart.
oyo %>%
  group_by(city) %>%
  summarize(total = n()) %>%
  arrange(desc(total)) %>%
  hchart(
    type = "bar",
    hcaes(x = city, y = total),
    # TRUE rather than T: `T` is an ordinary, reassignable variable.
    dataLabels = list(enabled = TRUE),
    name = "Total room"
  ) %>%
  hc_title(text = "Total room each city") %>%
  hc_add_theme(hc_theme_hcrt())
## `summarise()` ungrouping output (override with `.groups` argument)
# Mean room size per city (rows without a room size are dropped first),
# largest first, drawn as a labelled bar chart.
# The digits argument belongs inside round(): written outside the call,
# summarize() treated the bare `0` as a second summary expression and
# appended a constant column named `0`.
oyo %>%
  select(city, room_size) %>%
  filter(!is.na(room_size)) %>%
  group_by(city) %>%
  summarize(average = round(mean(room_size), digits = 0)) %>%
  arrange(desc(average)) %>%
  hchart(
    type = "bar",
    hcaes(x = city, y = average),
    name = "average",
    # TRUE rather than T: `T` is an ordinary, reassignable variable.
    dataLabels = list(enabled = TRUE)
  ) %>%
  hc_title(text = "Average Room Size Each City") %>%
  hc_add_theme(hc_theme_smpl())
## `summarise()` ungrouping output (override with `.groups` argument)
# Mean rating per city, highest first, drawn as a labelled bar chart.
# Rows whose rating is missing or 0 are excluded so they do not drag the
# averages down.
oyo %>%
  select(rating, city) %>%
  filter(!is.na(rating) & rating != 0) %>%
  group_by(city) %>%
  summarize(average = round(mean(rating), 2)) %>%
  arrange(desc(average)) %>%
  hchart(
    type = "bar",
    hcaes(x = city, y = average),
    # TRUE rather than T: `T` is an ordinary, reassignable variable.
    dataLabels = list(enabled = TRUE)
  ) %>%
  hc_title(text = "Average rating of room each city") %>%
  hc_add_theme(hc_theme_elementary())
## `summarise()` ungrouping output (override with `.groups` argument)
# Count the missing values in every column; returns a list named by column.
map(oyo, function(column) sum(is.na(column)))
## $name
## [1] 0
##
## $city
## [1] 0
##
## $room_type
## [1] 0
##
## $room_size
## [1] 144
##
## $price
## [1] 0
##
## $rating
## [1] 0
##
## $fasility
## [1] 0
##
## $cctv
## [1] 0
##
## $free_wifi
## [1] 0
##
## $modern_wardrobe
## [1] 0
##
## $ac
## [1] 0
##
## $house_keeping
## [1] 0
##
## $mini_fridge
## [1] 0
##
## $seating_area
## [1] 0
##
## $geyser
## [1] 0
##
## $Parking_Facility
## [1] 0
##
## $Coffee_Tea_Maker
## [1] 0
##
## $Room_heater
## [1] 0
##
## $Hair_Dryer
## [1] 0
##
## $Refrigerator
## [1] 0
##
## $Toaster
## [1] 0
##
## $Playstation
## [1] 0
##
## $Water_Purifier
## [1] 0
##
## $Washing_Machine
## [1] 0
##
## $Smoking_Room
## [1] 0
##
## $Mini_Bar
## [1] 0
# Build the modelling table:
#   * drop the free-text columns (`fasility`, `name`);
#   * keep only rows with a known room type, a recorded room size and a
#     non-zero rating;
#   * store `city` and `room_type` as factors.
# Spaces in `room_type` are replaced BEFORE the factor conversion. Doing it
# afterwards (as before) made str_replace_all() hand back a character vector,
# silently discarding the factor.
oyomodel <- oyo %>%
  select(-fasility, -name) %>%
  filter(
    room_type != "unknown",
    !is.na(room_size),
    rating != 0
  ) %>%
  mutate(
    room_type = as.factor(str_replace_all(room_type, " ", "_")),
    city = as.factor(city)
  )
# Preprocessing recipe: price is the outcome, every other column is a
# predictor, and room_type is replaced by an integer code.
oyo_prep <- prep(
  step_integer(
    recipes::recipe(price ~ ., data = oyomodel),
    room_type
  )
)

# Apply the trained recipe to the modelling table and inspect the result.
oyodone <- oyo_prep %>% bake(new_data = oyomodel)
glimpse(oyodone)
## Observations: 642
## Variables: 24
## $ city <fct> jakarta, jakarta, jakarta, jakarta, jakarta, jakar...
## $ room_type <dbl> 17, 2, 15, 15, 15, 15, 16, 15, 15, 15, 2, 15, 15, ...
## $ room_size <dbl> 338, 154, 96, 100, 154, 90, 69, 80, 110, 100, 165,...
## $ rating <dbl> 4.7, 4.7, 4.6, 4.5, 4.6, 4.4, 3.7, 4.4, 4.4, 4.3, ...
## $ cctv <dbl> 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1,...
## $ free_wifi <dbl> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,...
## $ modern_wardrobe <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,...
## $ ac <dbl> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,...
## $ house_keeping <dbl> 0, 0, 1, 0, 1, 1, 1, 0, 0, 1, 1, 0, 1, 1, 1, 1, 0,...
## $ mini_fridge <dbl> 0, 0, 1, 1, 1, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0,...
## $ seating_area <dbl> 0, 1, 1, 1, 1, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0,...
## $ geyser <dbl> 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,...
## $ Parking_Facility <dbl> 1, 0, 0, 0, 1, 1, 0, 1, 0, 0, 1, 1, 0, 1, 1, 1, 1,...
## $ Coffee_Tea_Maker <dbl> 0, 1, 0, 1, 1, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0,...
## $ Room_heater <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,...
## $ Hair_Dryer <dbl> 0, 0, 0, 1, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0,...
## $ Refrigerator <dbl> 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,...
## $ Toaster <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,...
## $ Playstation <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,...
## $ Water_Purifier <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,...
## $ Washing_Machine <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,...
## $ Smoking_Room <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,...
## $ Mini_Bar <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,...
## $ price <dbl> 136895, 190000, 154072, 143910, 378153, 233449, 13...
# Helper script; presumably provides calc_metrics() used further down --
# confirm against calc_metrics.R.
source("calc_metrics.R")

# 80/20 train/test split, stratified by city; seeded for reproducibility.
set.seed(123)
split_object <- oyodone %>%
  rsample::initial_split(prop = 0.8, strata = "city")
train_tbl <- training(split_object)
test_tbl <- testing(split_object)

# Baseline: ordinary least squares on all predictors.
linear_regmodel <- linear_reg(mode = "regression") %>%
  set_engine(engine = "lm") %>%
  fit(formula = price ~ ., data = train_tbl)

predict(linear_regmodel, new_data = test_tbl)
## Warning in predict.lm(object = object$fit, newdata = new_data, type =
## "response"): prediction from a rank-deficient fit may be misleading
## # A tibble: 128 x 1
## .pred
## <dbl>
## 1 175455.
## 2 187502.
## 3 167363.
## 4 288531.
## 5 302710.
## 6 156556.
## 7 182572.
## 8 196564.
## 9 167919.
## 10 165770.
## # ... with 118 more rows
# Hold-out metrics for the linear model (calc_metrics() is not defined in
# this file).
calc_metrics(linear_regmodel, new_data = test_tbl, truth = price)
## # A tibble: 1 x 3
## mae rmse rsq
## <dbl> <dbl> <dbl>
## 1 40065. 80589. 0.0295
library(randomForest)
## randomForest 4.6-14
## Type rfNews() to see new features/changes/bug fixes.
##
## Attaching package: 'randomForest'
## The following object is masked from 'package:dials':
##
## margin
## The following object is masked from 'package:dplyr':
##
## combine
## The following object is masked from 'package:ggplot2':
##
## margin
# Random forest regression on the same training data, via the randomForest
# engine.
rfmod <- rand_forest(mode = "regression") %>%
  set_engine(engine = "randomForest") %>%
  fit(formula = price ~ ., data = train_tbl)

predict(rfmod, new_data = test_tbl)
## # A tibble: 128 x 1
## .pred
## <dbl>
## 1 178485.
## 2 170908.
## 3 150016.
## 4 297296.
## 5 225435.
## 6 180569.
## 7 170883.
## 8 190194.
## 9 203322.
## 10 204037.
## # ... with 118 more rows
# Hold-out metrics for the random forest (calc_metrics() is not defined in
# this file).
calc_metrics(rfmod, new_data = test_tbl, truth = price)
## # A tibble: 1 x 3
## mae rmse rsq
## <dbl> <dbl> <dbl>
## 1 35884. 70516. 0.193
library(vip)
## Warning: package 'vip' was built under R version 3.6.3
##
## Attaching package: 'vip'
## The following object is masked from 'package:utils':
##
## vi
# Variable importance scores from the fitted random forest.
# The former `lambda = lrmod$fit` argument was removed: `lrmod` is never
# defined in this script (the linear model is `linear_regmodel`), and a
# `lambda` penalty has no meaning for a randomForest fit. The call only ran
# because the argument was never evaluated.
vi(rfmod$fit)
## # A tibble: 30 x 2
## Variable Importance
## <chr> <dbl>
## 1 rating 1.65e12
## 2 room_size 7.73e11
## 3 citysurabaya 7.61e11
## 4 house_keeping 3.29e11
## 5 citymalang 3.25e11
## 6 Refrigerator 3.12e11
## 7 room_type 2.26e11
## 8 Hair_Dryer 9.59e10
## 9 Coffee_Tea_Maker 8.70e10
## 10 cityjakarta 8.55e10
## # ... with 20 more rows
# Seed R's random number generator ahead of the boosted-tree fit.
set.seed(seed = 12)
library(xgboost)
##
## Attaching package: 'xgboost'
## The following object is masked from 'package:dplyr':
##
## slice
# Gradient-boosted trees (xgboost engine) with hand-picked hyperparameters,
# fitted on the training split.
model_04_xgboost <- boost_tree(
  mode           = "regression",
  trees          = 642,
  tree_depth     = 6,
  mtry           = 10,
  min_n          = 2,
  learn_rate     = 0.35,
  loss_reduction = 0.0001
) %>%
  set_engine(engine = "xgboost") %>%
  fit(formula = price ~ ., data = train_tbl)

predict(model_04_xgboost, new_data = test_tbl)
## # A tibble: 128 x 1
## .pred
## <dbl>
## 1 176268.
## 2 157875.
## 3 135172.
## 4 190768.
## 5 168886.
## 6 166003.
## 7 153852.
## 8 157386.
## 9 202954.
## 10 231018.
## # ... with 118 more rows
# Hold-out metrics for the boosted-tree model (calc_metrics() is not defined
# in this file).
calc_metrics(model_04_xgboost, new_data = test_tbl, truth = price)
## # A tibble: 1 x 3
## mae rmse rsq
## <dbl> <dbl> <dbl>
## 1 45128. 80442. 0.119
# Compute the feature importance table from the underlying xgboost booster
# and draw it as a bar plot.
xgb.plot.importance(
  xgb.importance(model = model_04_xgboost$fit),
  main = "XGBoost Feature Importance"
)