Goal: to predict the price. Click [here for the data]https://github.com/rfordatascience/tidytuesday/blob/master/data/2020/2020-11-03/readme.md.
ikea <- readr::read_csv('https://raw.githubusercontent.com/rfordatascience/tidytuesday/master/data/2020/2020-11-03/ikea.csv')
skimr::skim(ikea)
Name | ikea |
Number of rows | 3694 |
Number of columns | 14 |
_______________________ | |
Column type frequency: | |
character | 7 |
logical | 1 |
numeric | 6 |
________________________ | |
Group variables | None |
Variable type: character
skim_variable | n_missing | complete_rate | min | max | empty | n_unique | whitespace |
---|---|---|---|---|---|---|---|
name | 0 | 1 | 3 | 27 | 0 | 607 | 0 |
category | 0 | 1 | 4 | 36 | 0 | 17 | 0 |
old_price | 0 | 1 | 4 | 13 | 0 | 365 | 0 |
link | 0 | 1 | 52 | 163 | 0 | 2962 | 0 |
other_colors | 0 | 1 | 2 | 3 | 0 | 2 | 0 |
short_description | 0 | 1 | 3 | 63 | 0 | 1706 | 0 |
designer | 0 | 1 | 3 | 1261 | 0 | 381 | 0 |
Variable type: logical
skim_variable | n_missing | complete_rate | mean | count |
---|---|---|---|---|
sellable_online | 0 | 1 | 0.99 | TRU: 3666, FAL: 28 |
Variable type: numeric
skim_variable | n_missing | complete_rate | mean | sd | p0 | p25 | p50 | p75 | p100 | hist |
---|---|---|---|---|---|---|---|---|---|---|
…1 | 0 | 1.00 | 1846.50 | 1066.51 | 0 | 923.25 | 1846.5 | 2769.75 | 3693 | ▇▇▇▇▇ |
item_id | 0 | 1.00 | 48632396.79 | 28887094.10 | 58487 | 20390574.00 | 49288078.0 | 70403572.75 | 99932615 | ▇▇▇▇▇ |
price | 0 | 1.00 | 1078.21 | 1374.65 | 3 | 180.90 | 544.7 | 1429.50 | 9585 | ▇▁▁▁▁ |
depth | 1463 | 0.60 | 54.38 | 29.96 | 1 | 38.00 | 47.0 | 60.00 | 257 | ▇▃▁▁▁ |
height | 988 | 0.73 | 101.68 | 61.10 | 1 | 67.00 | 83.0 | 124.00 | 700 | ▇▂▁▁▁ |
width | 589 | 0.84 | 104.47 | 71.13 | 1 | 60.00 | 80.0 | 140.00 | 420 | ▇▅▂▁▁ |
data <- ikea %>%
# Treat missing values
select(-other_colors, -old_price, -sellable_online, -link, -designer) %>%
na.omit() %>%
# log transform variables with pos-skewed distribution
mutate(price = log(price))
Identify good predictors.
item_id
data %>%
ggplot(aes(price, height)) +
scale_y_log10() +
geom_point()
Category
data %>%
ggplot(aes(price, fct_reorder(as.factor(width), price))) +
geom_boxplot()
Category
data %>%
# tokenize title
unnest_tokens(output = name, input = category) %>%
# calculate average rent per word
group_by(name) %>%
summarise(price = mean(price),
n = n()) %>%
ungroup() %>%
filter(n > 10, !str_detect(name, "\\d")) %>%
slice_max(order_by = price, n =100) %>%
# Plot
ggplot(aes(price, fct_reorder(name, price))) +
geom_point() +
labs(y = "Words in Title")
# Step 1: Prepare data
data_binarized_tbl <- data %>%
select(-item_id, -short_description) %>%
binarize()
data_binarized_tbl %>% glimpse()
## Rows: 1,899
## Columns: 59
## $ ...8 <dbl> 1, 1, 1, 1, 1, 1, 1, …
## $ ...9 <dbl> 0, 0, 0, 0, 0, 0, 0, …
## $ ...10 <dbl> 0, 0, 0, 0, 0, 0, 0, …
## $ ...11 <dbl> 0, 0, 0, 0, 0, 0, 0, …
## $ name__ALGOT <dbl> 0, 0, 0, 0, 0, 0, 0, …
## $ name__BEKANT <dbl> 0, 0, 0, 0, 0, 0, 0, …
## $ name__BESTÅ <dbl> 0, 0, 0, 0, 0, 0, 0, …
## $ `name__BILLY_/_OXBERG` <dbl> 0, 0, 0, 0, 0, 0, 0, …
## $ name__BRIMNES <dbl> 0, 0, 0, 0, 0, 0, 0, …
## $ name__BROR <dbl> 0, 0, 0, 0, 0, 0, 0, …
## $ name__EKET <dbl> 0, 0, 0, 0, 0, 0, 0, …
## $ name__GRÖNLID <dbl> 0, 0, 0, 0, 0, 0, 0, …
## $ name__HAVSTA <dbl> 0, 0, 0, 0, 0, 0, 0, …
## $ name__HAVSTEN <dbl> 0, 0, 0, 0, 0, 0, 0, …
## $ name__HEMNES <dbl> 0, 0, 0, 0, 0, 0, 0, …
## $ name__IVAR <dbl> 0, 0, 0, 0, 0, 0, 0, …
## $ name__JONAXEL <dbl> 0, 0, 0, 0, 0, 0, 0, …
## $ name__KALLAX <dbl> 0, 0, 0, 0, 0, 0, 0, …
## $ name__LIDHULT <dbl> 0, 0, 0, 0, 0, 0, 0, …
## $ name__LIXHULT <dbl> 0, 0, 0, 0, 0, 0, 0, …
## $ name__NORDLI <dbl> 0, 0, 0, 0, 0, 0, 0, …
## $ name__PAX <dbl> 0, 0, 0, 0, 0, 0, 0, …
## $ name__PLATSA <dbl> 0, 0, 0, 0, 0, 0, 0, …
## $ `name__STUVA_/_FRITIDS` <dbl> 0, 0, 0, 0, 0, 0, 0, …
## $ name__TROFAST <dbl> 0, 0, 0, 0, 0, 0, 0, …
## $ name__VALLENTUNA <dbl> 0, 0, 0, 0, 0, 0, 0, …
## $ name__VIMLE <dbl> 0, 0, 0, 0, 0, 0, 0, …
## $ `name__-OTHER` <dbl> 1, 1, 1, 1, 1, 1, 1, …
## $ category__Bar_furniture <dbl> 1, 1, 1, 1, 1, 1, 1, …
## $ category__Beds <dbl> 0, 0, 0, 0, 0, 0, 0, …
## $ `category__Bookcases_&_shelving_units` <dbl> 0, 0, 0, 0, 0, 0, 0, …
## $ `category__Cabinets_&_cupboards` <dbl> 0, 0, 0, 0, 0, 0, 0, …
## $ category__Chairs <dbl> 0, 0, 0, 0, 0, 0, 0, …
## $ `category__Chests_of_drawers_&_drawer_units` <dbl> 0, 0, 0, 0, 0, 0, 0, …
## $ `category__Children's_furniture` <dbl> 0, 0, 0, 0, 0, 0, 0, …
## $ category__Nursery_furniture <dbl> 0, 0, 0, 0, 0, 0, 0, …
## $ category__Outdoor_furniture <dbl> 0, 0, 0, 0, 0, 0, 0, …
## $ `category__Sideboards,_buffets_&_console_tables` <dbl> 0, 0, 0, 0, 0, 0, 0, …
## $ `category__Sofas_&_armchairs` <dbl> 0, 0, 0, 0, 0, 0, 0, …
## $ `category__Tables_&_desks` <dbl> 0, 0, 0, 0, 0, 0, 0, …
## $ `category__TV_&_media_furniture` <dbl> 0, 0, 0, 0, 0, 0, 0, …
## $ category__Wardrobes <dbl> 0, 0, 0, 0, 0, 0, 0, …
## $ `category__-OTHER` <dbl> 0, 0, 0, 0, 0, 0, 0, …
## $ `price__-Inf_5.68697535633982` <dbl> 1, 1, 0, 1, 1, 1, 0, …
## $ price__5.68697535633982_6.52209279817015 <dbl> 0, 0, 1, 0, 0, 0, 1, …
## $ price__6.52209279817015_7.37085996851068 <dbl> 0, 0, 0, 0, 0, 0, 0, …
## $ price__7.37085996851068_Inf <dbl> 0, 0, 0, 0, 0, 0, 0, …
## $ `depth__-Inf_40` <dbl> 0, 0, 0, 0, 0, 0, 0, …
## $ depth__40_47 <dbl> 0, 0, 1, 1, 1, 1, 1, …
## $ depth__47_60 <dbl> 1, 1, 0, 0, 0, 0, 0, …
## $ depth__60_Inf <dbl> 0, 0, 0, 0, 0, 0, 0, …
## $ `height__-Inf_71` <dbl> 0, 1, 0, 0, 0, 0, 0, …
## $ height__71_92 <dbl> 0, 0, 1, 0, 0, 0, 0, …
## $ height__92_171 <dbl> 1, 0, 0, 1, 1, 1, 1, …
## $ height__171_Inf <dbl> 0, 0, 0, 0, 0, 0, 0, …
## $ `width__-Inf_60` <dbl> 1, 0, 1, 1, 1, 1, 1, …
## $ width__60_93 <dbl> 0, 1, 0, 0, 0, 0, 0, …
## $ width__93_161.5 <dbl> 0, 0, 0, 0, 0, 0, 0, …
## $ width__161.5_Inf <dbl> 0, 0, 0, 0, 0, 0, 0, …
# Step 2: Correlate
data_corr_tbl <- data_binarized_tbl %>%
correlate(price__7.37085996851068_Inf)
## Warning: Expected 2 pieces. Missing pieces filled with `NA` in 4 rows [1, 2, 3,
## 4].
data_corr_tbl
## # A tibble: 59 × 3
## feature bin correlation
## <fct> <chr> <dbl>
## 1 price 7.37085996851068_Inf 1
## 2 width 161.5_Inf 0.579
## 3 depth 60_Inf 0.447
## 4 category Sofas_&_armchairs 0.379
## 5 width -Inf_60 -0.374
## 6 price -Inf_5.68697535633982 -0.336
## 7 price 6.52209279817015_7.37085996851068 -0.333
## 8 price 5.68697535633982_6.52209279817015 -0.331
## 9 name PAX 0.302
## 10 category Wardrobes 0.279
## # ℹ 49 more rows
# Step 3: Plot
data_corr_tbl %>%
plot_correlation_funnel()
## Warning: Removed 4 rows containing missing values or values outside the scale range
## (`geom_text_repel()`).
## Warning: ggrepel: 35 unlabeled data points (too many overlaps). Consider
## increasing max.overlaps
data <- data %>%
select(item_id, name, category, price, short_description, depth, height, width)
Split data
data <- data %>% sample_n(100)
# Split into train and test dataset
set.seed(1234)
data_split <- rsample::initial_split(data)
data_train <- training(data_split)
data_test <- testing(data_split)
# Further split training dataset for cross-validation
set.seed(2345)
data_cv <- rsample::vfold_cv(data_train)
data_cv
## # 10-fold cross-validation
## # A tibble: 10 × 2
## splits id
## <list> <chr>
## 1 <split [67/8]> Fold01
## 2 <split [67/8]> Fold02
## 3 <split [67/8]> Fold03
## 4 <split [67/8]> Fold04
## 5 <split [67/8]> Fold05
## 6 <split [68/7]> Fold06
## 7 <split [68/7]> Fold07
## 8 <split [68/7]> Fold08
## 9 <split [68/7]> Fold09
## 10 <split [68/7]> Fold10
library(usemodels)
use_xgboost(price ~ ., data = data_train)
## xgboost_recipe <-
## recipe(formula = price ~ ., data = data_train) %>%
## step_zv(all_predictors())
##
## xgboost_spec <-
## boost_tree(trees = tune(), min_n = tune(), tree_depth = tune(), learn_rate = tune(),
## loss_reduction = tune(), sample_size = tune()) %>%
## set_mode("classification") %>%
## set_engine("xgboost")
##
## xgboost_workflow <-
## workflow() %>%
## add_recipe(xgboost_recipe) %>%
## add_model(xgboost_spec)
##
## set.seed(6804)
## xgboost_tune <-
## tune_grid(xgboost_workflow, resamples = stop("add your rsample object"), grid = stop("add number of candidate points"))
xgboost_recipe <-
recipe(formula = price ~ ., data = data_train) %>%
recipes::update_role(item_id, new_role = "id variables") %>%
step_tokenize(short_description) %>%
step_tokenfilter(short_description, max_tokens = 100) %>%
step_tokenfilter(max_tokens = 40) %>%
step_tfidf(short_description) %>%
step_other(category) %>%
step_dummy(category, name, one_hot = TRUE) %>%
step_log(height, width) %>%
step_YeoJohnson(width, height) %>%
step_impute_knn(all_predictors())
# step_tokenize() %>%
xgboost_recipe %>% prep() %>% bake(new_data = NULL) %>% glimpse()
## Rows: 75
## Columns: 164
## $ item_id <dbl> 79306333, 30466281, 59279585…
## $ depth <dbl> 44, 30, 50, 42, 42, 57, 46, …
## $ height <dbl> 96.54439, 24.50599, 48.39329…
## $ width <dbl> 27.85003, 15.78814, 18.78546…
## $ price <dbl> 7.661527, 2.708050, 5.926926…
## $ tfidf_short_description_1 <dbl> 0.0000000, 0.0000000, 0.0000…
## $ tfidf_short_description_108x96 <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ tfidf_short_description_120x30x202 <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ tfidf_short_description_120x30x237 <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ tfidf_short_description_120x42x48 <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ tfidf_short_description_121x35x123 <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ tfidf_short_description_130x117 <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ tfidf_short_description_150x44x236 <dbl> 1.443578, 0.000000, 0.000000…
## $ tfidf_short_description_160x99 <dbl> 0.0000000, 0.0000000, 0.0000…
## $ tfidf_short_description_180x41x49 <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ tfidf_short_description_180x42x64 <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ tfidf_short_description_193x25x176 <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ tfidf_short_description_2 <dbl> 0.0000000, 0.0000000, 0.0000…
## $ tfidf_short_description_200x58x201 <dbl> 0.000000, 0.000000, 0.000000…
## $ tfidf_short_description_200x66x236 <dbl> 0.000000, 0.000000, 0.000000…
## $ tfidf_short_description_222x80x90 <dbl> 0.0000000, 0.0000000, 0.0000…
## $ tfidf_short_description_240x42x230 <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ tfidf_short_description_25x51x70 <dbl> 0.0000000, 0.0000000, 0.0000…
## $ tfidf_short_description_29 <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ tfidf_short_description_3 <dbl> 0.0000000, 0.0000000, 0.0000…
## $ tfidf_short_description_300x42x193 <dbl> 0.0000000, 0.0000000, 0.0000…
## $ tfidf_short_description_300x42x210 <dbl> 0.0000000, 0.0000000, 0.0000…
## $ tfidf_short_description_300x60x236 <dbl> 0.000000, 0.000000, 0.000000…
## $ tfidf_short_description_35x35x10 <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ tfidf_short_description_4 <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ tfidf_short_description_40x28x106 <dbl> 0.0000000, 0.0000000, 0.0000…
## $ tfidf_short_description_40x99 <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ tfidf_short_description_42x30x10 <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ tfidf_short_description_42x30x23 <dbl> 0.000000, 1.082683, 0.000000…
## $ tfidf_short_description_42x61 <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ tfidf_short_description_44x55x216 <dbl> 0.000000, 0.000000, 0.000000…
## $ tfidf_short_description_45x47x92 <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ tfidf_short_description_5 <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ tfidf_short_description_50x51x70 <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ tfidf_short_description_6 <dbl> 0.0000000, 0.0000000, 0.0000…
## $ tfidf_short_description_60x20x38 <dbl> 0.000000, 0.000000, 0.000000…
## $ tfidf_short_description_60x22x128 <dbl> 0.0000000, 0.0000000, 0.0000…
## $ tfidf_short_description_60x22x202 <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ tfidf_short_description_60x25x40 <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ tfidf_short_description_60x40x128 <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ tfidf_short_description_60x50x64 <dbl> 0.000000, 0.000000, 1.082683…
## $ tfidf_short_description_60x57x73 <dbl> 0.0000000, 0.0000000, 0.0000…
## $ tfidf_short_description_63x126 <dbl> 0.0000000, 0.0000000, 0.0000…
## $ tfidf_short_description_64x60 <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ tfidf_short_description_70x47 <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ tfidf_short_description_75 <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ tfidf_short_description_80x35x210 <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ tfidf_short_description_80x42x221 <dbl> 0.0000000, 0.0000000, 0.0000…
## $ tfidf_short_description_81x35x123 <dbl> 0.000000, 0.000000, 0.000000…
## $ tfidf_short_description_81x47x212 <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ tfidf_short_description_9 <dbl> 0.0000000, 0.0000000, 0.0000…
## $ tfidf_short_description_90x200 <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ tfidf_short_description_90x40x38 <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ tfidf_short_description_90x83 <dbl> 0.0000000, 0.0000000, 0.0000…
## $ tfidf_short_description_94x44x52 <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ tfidf_short_description_add <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ tfidf_short_description_armrest <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ tfidf_short_description_backrest <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ tfidf_short_description_bar <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ tfidf_short_description_baskets <dbl> 0.000000, 0.000000, 0.000000…
## $ tfidf_short_description_bed <dbl> 0.0000000, 0.0000000, 0.0000…
## $ tfidf_short_description_bench <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ tfidf_short_description_bookcase <dbl> 0.0000000, 0.0000000, 0.0000…
## $ tfidf_short_description_box <dbl> 0.0000000, 0.9126646, 0.0000…
## $ tfidf_short_description_cabinet <dbl> 0.0000000, 0.0000000, 0.0000…
## $ tfidf_short_description_castors <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ tfidf_short_description_chair <dbl> 0.000000, 0.000000, 0.000000…
## $ tfidf_short_description_chaise <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ tfidf_short_description_chest <dbl> 0.0000000, 0.0000000, 0.0000…
## $ tfidf_short_description_cm <dbl> 0.3014854, 0.2261141, 0.2261…
## $ tfidf_short_description_combination <dbl> 0.0000000, 0.0000000, 0.0000…
## $ tfidf_short_description_corner <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ tfidf_short_description_cushion <dbl> 0.000000, 0.000000, 0.000000…
## $ tfidf_short_description_desk <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ tfidf_short_description_display <dbl> 0.0000000, 0.0000000, 0.0000…
## $ tfidf_short_description_door <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ tfidf_short_description_doors <dbl> 0.0000000, 0.0000000, 0.0000…
## $ tfidf_short_description_drawer <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ tfidf_short_description_drawers <dbl> 0.0000000, 0.0000000, 0.0000…
## $ tfidf_short_description_folding <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ tfidf_short_description_frame <dbl> 0.0000000, 0.0000000, 0.0000…
## $ tfidf_short_description_glass <dbl> 0.0000000, 0.0000000, 0.0000…
## $ tfidf_short_description_leg <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ tfidf_short_description_modular <dbl> 0.0000000, 0.0000000, 0.0000…
## $ tfidf_short_description_mounted <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ tfidf_short_description_of <dbl> 0.0000000, 0.0000000, 0.0000…
## $ tfidf_short_description_on <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ tfidf_short_description_outdoor <dbl> 0.0000000, 0.0000000, 0.0000…
## $ tfidf_short_description_seat <dbl> 0.0000000, 0.0000000, 0.0000…
## $ tfidf_short_description_section <dbl> 0.0000000, 0.0000000, 0.0000…
## $ tfidf_short_description_shelf <dbl> 0.0000000, 0.0000000, 0.0000…
## $ tfidf_short_description_sofa <dbl> 0.0000000, 0.0000000, 0.0000…
## $ tfidf_short_description_stool <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ tfidf_short_description_storage <dbl> 0.0000000, 0.5141130, 0.5141…
## $ tfidf_short_description_tv <dbl> 0.0000000, 0.0000000, 0.0000…
## $ tfidf_short_description_unit <dbl> 0.000000, 0.000000, 0.000000…
## $ tfidf_short_description_wall <dbl> 0.0000000, 0.0000000, 0.0000…
## $ tfidf_short_description_wardrobe <dbl> 0.8202697, 0.0000000, 0.0000…
## $ tfidf_short_description_wire <dbl> 0.000000, 0.000000, 0.000000…
## $ tfidf_short_description_with <dbl> 0.0000000, 0.0000000, 0.4345…
## $ category_Bookcases...shelving.units <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ category_Cabinets...cupboards <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ category_Chairs <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ category_Chests.of.drawers...drawer.units <dbl> 0, 0, 0, 0, 0, 1, 0, 0, 0, 0…
## $ category_Children.s.furniture <dbl> 0, 1, 1, 0, 0, 0, 0, 0, 0, 0…
## $ category_Sofas...armchairs <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 1…
## $ category_TV...media.furniture <dbl> 0, 0, 0, 1, 0, 0, 0, 0, 0, 0…
## $ category_Wardrobes <dbl> 1, 0, 0, 0, 1, 0, 0, 0, 1, 0…
## $ category_other <dbl> 0, 0, 0, 0, 0, 0, 1, 1, 0, 0…
## $ name_ÄPPLARÖ <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ name_BESTÅ <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ name_BESTÅ...EKET <dbl> 0, 0, 0, 1, 0, 0, 0, 0, 0, 0…
## $ name_BESTÅ.BURS <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ name_BILLY...BOTTNA <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ name_BILLY...OXBERG <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ name_BRIMNES <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ name_DELAKTIG <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ name_EKEDALEN <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ name_EKET <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ name_ELVARLI <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ name_GODVIN <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ name_GRÖNLID <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ name_GUNDE <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ name_HÄLLAN <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ name_HAVSTA <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ name_HEMNES <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ name_IDÅSEN <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ name_JANINGE <dbl> 0, 0, 0, 0, 0, 0, 1, 0, 0, 0…
## $ name_JONAXEL <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ name_KOPPANG <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ name_KORNSJÖ <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ name_KUNGSHAMN <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ name_KUNGSHOLMEN <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ name_LANDSKRONA <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ name_LANGUR <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ name_LEIFARNE <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ name_LIDHULT <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ name_LIXHULT <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ name_LYCKSELE.LÖVÅS <dbl> 0, 0, 0, 0, 0, 0, 0, 1, 0, 0…
## $ name_MOSJÖ <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ name_NORDLI <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ name_NORDVIKEN <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ name_NORRARYD <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ name_NYHAMN <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ name_PÅHL <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ name_PAX <dbl> 1, 0, 0, 0, 0, 0, 0, 0, 1, 0…
## $ name_PAX...MEHAMN.AULI <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ name_PLATSA <dbl> 0, 0, 0, 0, 1, 1, 0, 0, 0, 0…
## $ name_POÄNG <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ name_SÖDERHAMN <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 1…
## $ name_SOLLERÖN <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ name_STALLARP <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ name_STUBBARP <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ name_STUVA...FRITIDS <dbl> 0, 0, 1, 0, 0, 0, 0, 0, 0, 0…
## $ name_SVALNÄS <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ name_TROFAST <dbl> 0, 1, 0, 0, 0, 0, 0, 0, 0, 0…
## $ name_VALLENTUNA <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ name_VIMLE <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ name_VISTHUS <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
xgboost_spec <-
boost_tree(trees = tune(), min_n = tune(), tree_depth = tune(), learn_rate = tune(),
loss_reduction = tune(), sample_size = tune()) %>%
set_mode("regression") %>%
set_engine("xgboost")
xgboost_workflow <-
workflow() %>%
add_recipe(xgboost_recipe) %>%
add_model(xgboost_spec)
set.seed(6804)
doParallel::registerDoParallel()
xgboost_tune <-
tune_grid(xgboost_workflow,
resamples = data_cv,
grid = 5)
tune::show_best(xgboost_tune, metric = "rmse")
## # A tibble: 5 × 12
## trees min_n tree_depth learn_rate loss_reduction sample_size .metric
## <int> <int> <int> <dbl> <dbl> <dbl> <chr>
## 1 431 9 10 0.0100 7.91e-10 0.427 rmse
## 2 1309 24 2 0.00694 2.65e+ 0 0.905 rmse
## 3 1020 11 5 0.00253 4.65e- 5 0.672 rmse
## 4 1926 32 7 0.0768 5.00e- 8 0.553 rmse
## 5 121 34 15 0.212 1.08e- 1 0.110 rmse
## # ℹ 5 more variables: .estimator <chr>, mean <dbl>, n <int>, std_err <dbl>,
## # .config <chr>
# Update the model by selecting the best hyperparameters
xgboost_fw <- tune::finalize_workflow(xgboost_workflow,
tune::select_best(xgboost_tune, metric = "rmse"))
# Fit the model on the entire traning data and test it on the test data
data_fit <- tune::last_fit(xgboost_fw, data_split)
tune::collect_metrics(data_fit)
## # A tibble: 2 × 4
## .metric .estimator .estimate .config
## <chr> <chr> <dbl> <chr>
## 1 rmse standard 0.868 Preprocessor1_Model1
## 2 rsq standard 0.649 Preprocessor1_Model1
tune::collect_predictions(data_fit) %>%
ggplot(aes(price, .pred)) +
geom_point(alpha = 0.3, fill = "midnightblue") +
geom_abline(lty = 2, color = "gray50") +
coord_fixed()
ranger_spec <-
rand_forest(trees = 500) %>%
set_engine("ranger") %>%
set_mode("regression")
ranger_spec
## Random Forest Model Specification (regression)
##
## Main Arguments:
## trees = 500
##
## Computational engine: ranger
svm_spec <-
svm_linear() %>%
set_engine("LiblineaR") %>%
set_mode("regression")
svm_spec
## Linear Support Vector Machine Model Specification (regression)
##
## Computational engine: LiblineaR
ranger_wf <- workflow(xgboost_recipe, ranger_spec)
svm_wf <- workflow(xgboost_recipe, svm_spec)
doParallel::registerDoParallel()
contrl_preds <- control_resamples(save_pred = TRUE)
svm_rs <- fit_resamples(
svm_wf,
resamples = data_cv,
control = contrl_preds
)
## Warning: package 'LiblineaR' was built under R version 4.4.1
ranger_rs <- fit_resamples(
ranger_wf,
resamples = data_cv,
control = contrl_preds
)
collect_metrics(svm_rs)
## # A tibble: 2 × 6
## .metric .estimator mean n std_err .config
## <chr> <chr> <dbl> <int> <dbl> <chr>
## 1 rmse standard 0.818 10 0.0741 Preprocessor1_Model1
## 2 rsq standard 0.842 10 0.0249 Preprocessor1_Model1
collect_metrics(ranger_rs)
## # A tibble: 2 × 6
## .metric .estimator mean n std_err .config
## <chr> <chr> <dbl> <int> <dbl> <chr>
## 1 rmse standard 0.715 10 0.0922 Preprocessor1_Model1
## 2 rsq standard 0.823 10 0.0306 Preprocessor1_Model1
bind_rows(
collect_predictions(svm_rs) %>%
mutate(mod = "SVM"), collect_predictions(ranger_rs) %>%
mutate(mod = "ranger")) %>%
ggplot(aes(price, .pred)) +
geom_abline(lty = 2, color = "gray50") +
geom_point(alpha = 0.3, fill = "midnightblue") +
coord_fixed()
final_fitted <- last_fit(svm_wf, data_split)
collect_metrics(final_fitted)
## # A tibble: 2 × 4
## .metric .estimator .estimate .config
## <chr> <chr> <dbl> <chr>
## 1 rmse standard 0.990 Preprocessor1_Model1
## 2 rsq standard 0.711 Preprocessor1_Model1
final_wf <- extract_workflow(final_fitted)
predict(final_wf, data_test)
## Warning: ! There are new levels in a factor: `MASTHOLMEN`, `BEKANT`, `SMÅGÖRA`, `IVAR`,
## `STUK`, `ARKELSTORP`, `GALANT`, `BRUSALI`, `INGOLF`, `VADHOLMA`, `FÖRSIKTIG`,
## `GODISHUS`, and `VÄDDÖ`.
## # A tibble: 25 × 1
## .pred
## <dbl>
## 1 6.37
## 2 6.18
## 3 7.09
## 4 5.39
## 5 5.38
## 6 8.19
## 7 6.67
## 8 0.927
## 9 3.71
## 10 8.17
## # ℹ 15 more rows
Comments
I made the inclusion of name as a predictor. With the use of step_tokenize() and take the first 40 words, I also tried to implement the mtry = tune() as a predictor but all models failed. With the use Random Forest, and SVM i found that xgboost is performing better for IKEA.