Import data

library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr     1.1.4     ✔ readr     2.1.5
## ✔ forcats   1.0.0     ✔ stringr   1.5.2
## ✔ ggplot2   4.0.0     ✔ tibble    3.3.0
## ✔ lubridate 1.9.4     ✔ tidyr     1.3.1
## ✔ purrr     1.1.0     
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(correlationfunnel)
## Warning: package 'correlationfunnel' was built under R version 4.5.2
## ══ correlationfunnel Tip #1 ════════════════════════════════════════════════════
## Make sure your data is not overly imbalanced prior to using `correlate()`.
## If less than 5% imbalance, consider sampling. :)
library(textrecipes)
## Warning: package 'textrecipes' was built under R version 4.5.2
## Loading required package: recipes
## 
## Attaching package: 'recipes'
## 
## The following object is masked from 'package:stringr':
## 
##     fixed
## 
## The following object is masked from 'package:stats':
## 
##     step
data <- readr::read_csv('https://raw.githubusercontent.com/rfordatascience/tidytuesday/main/data/2020/2020-11-03/ikea.csv')
## New names:
## Rows: 3694 Columns: 14
## ── Column specification
## ──────────────────────────────────────────────────────── Delimiter: "," chr
## (7): name, category, old_price, link, other_colors, short_description, d... dbl
## (6): ...1, item_id, price, depth, height, width lgl (1): sellable_online
## ℹ Use `spec()` to retrieve the full column specification for this data. ℹ
## Specify the column types or set `show_col_types = FALSE` to quiet this message.
## • `` -> `...1`
skimr::skim(data)
Data summary
Name data
Number of rows 3694
Number of columns 14
_______________________
Column type frequency:
character 7
logical 1
numeric 6
________________________
Group variables None

Variable type: character

skim_variable n_missing complete_rate min max empty n_unique whitespace
name 0 1 3 27 0 607 0
category 0 1 4 36 0 17 0
old_price 0 1 4 13 0 365 0
link 0 1 52 163 0 2962 0
other_colors 0 1 2 3 0 2 0
short_description 0 1 3 63 0 1706 0
designer 0 1 3 1261 0 381 0

Variable type: logical

skim_variable n_missing complete_rate mean count
sellable_online 0 1 0.99 TRU: 3666, FAL: 28

Variable type: numeric

skim_variable n_missing complete_rate mean sd p0 p25 p50 p75 p100 hist
…1 0 1.00 1846.50 1066.51 0 923.25 1846.5 2769.75 3693 ▇▇▇▇▇
item_id 0 1.00 48632396.79 28887094.10 58487 20390574.00 49288078.0 70403572.75 99932615 ▇▇▇▇▇
price 0 1.00 1078.21 1374.65 3 180.90 544.7 1429.50 9585 ▇▁▁▁▁
depth 1463 0.60 54.38 29.96 1 38.00 47.0 60.00 257 ▇▃▁▁▁
height 988 0.73 101.68 61.10 1 67.00 83.0 124.00 700 ▇▂▁▁▁
width 589 0.84 104.47 71.13 1 60.00 80.0 140.00 420 ▇▅▂▁▁

Clean data

data_clean <- data %>%

    mutate(across(is.logical, as.factor),
    sellable_online = factor(sellable_online, levels = c(FALSE, TRUE), labels = c("No", "Yes")),
    sellable_online = fct_relevel(sellable_online, "Yes", "No")) %>%
    select(-old_price, -link, -...1) %>%
    na.omit() %>%

mutate(price = log(price))
## Warning: There was 1 warning in `mutate()`.
## ℹ In argument: `across(is.logical, as.factor)`.
## Caused by warning:
## ! Use of bare predicate functions was deprecated in tidyselect 1.1.0.
## ℹ Please use wrap predicates in `where()` instead.
##   # Was:
##   data %>% select(is.logical)
## 
##   # Now:
##   data %>% select(where(is.logical))

Explore data

data_clean %>% count(sellable_online)
## # A tibble: 2 × 2
##   sellable_online     n
##   <fct>           <int>
## 1 Yes              1886
## 2 No                 13
data_clean %>%
    ggplot(aes(sellable_online)) +
    geom_bar()

sellable_online vs price

data_clean %>% 
  ggplot(aes(sellable_online, price)) + 
  geom_boxplot()

Correlation plot

# Step 1: Binarize
data_binarized <- data_clean %>%
  select(-item_id, -short_description) %>%
  binarize()

data_binarized %>% glimpse()
## Rows: 1,899
## Columns: 82
## $ name__ALGOT                                      <dbl> 0, 0, 0, 0, 0, 0, 0, …
## $ name__BEKANT                                     <dbl> 0, 0, 0, 0, 0, 0, 0, …
## $ name__BESTÅ                                      <dbl> 0, 0, 0, 0, 0, 0, 0, …
## $ `name__BILLY_/_OXBERG`                           <dbl> 0, 0, 0, 0, 0, 0, 0, …
## $ name__BRIMNES                                    <dbl> 0, 0, 0, 0, 0, 0, 0, …
## $ name__BROR                                       <dbl> 0, 0, 0, 0, 0, 0, 0, …
## $ name__EKET                                       <dbl> 0, 0, 0, 0, 0, 0, 0, …
## $ name__GRÖNLID                                    <dbl> 0, 0, 0, 0, 0, 0, 0, …
## $ name__HAVSTA                                     <dbl> 0, 0, 0, 0, 0, 0, 0, …
## $ name__HAVSTEN                                    <dbl> 0, 0, 0, 0, 0, 0, 0, …
## $ name__HEMNES                                     <dbl> 0, 0, 0, 0, 0, 0, 0, …
## $ name__IVAR                                       <dbl> 0, 0, 0, 0, 0, 0, 0, …
## $ name__JONAXEL                                    <dbl> 0, 0, 0, 0, 0, 0, 0, …
## $ name__KALLAX                                     <dbl> 0, 0, 0, 0, 0, 0, 0, …
## $ name__LIDHULT                                    <dbl> 0, 0, 0, 0, 0, 0, 0, …
## $ name__LIXHULT                                    <dbl> 0, 0, 0, 0, 0, 0, 0, …
## $ name__NORDLI                                     <dbl> 0, 0, 0, 0, 0, 0, 0, …
## $ name__PAX                                        <dbl> 0, 0, 0, 0, 0, 0, 0, …
## $ name__PLATSA                                     <dbl> 0, 0, 0, 0, 0, 0, 0, …
## $ `name__STUVA_/_FRITIDS`                          <dbl> 0, 0, 0, 0, 0, 0, 0, …
## $ name__TROFAST                                    <dbl> 0, 0, 0, 0, 0, 0, 0, …
## $ name__VALLENTUNA                                 <dbl> 0, 0, 0, 0, 0, 0, 0, …
## $ name__VIMLE                                      <dbl> 0, 0, 0, 0, 0, 0, 0, …
## $ `name__-OTHER`                                   <dbl> 1, 1, 1, 1, 1, 1, 1, …
## $ category__Bar_furniture                          <dbl> 1, 1, 1, 1, 1, 1, 1, …
## $ category__Beds                                   <dbl> 0, 0, 0, 0, 0, 0, 0, …
## $ `category__Bookcases_&_shelving_units`           <dbl> 0, 0, 0, 0, 0, 0, 0, …
## $ `category__Cabinets_&_cupboards`                 <dbl> 0, 0, 0, 0, 0, 0, 0, …
## $ category__Chairs                                 <dbl> 0, 0, 0, 0, 0, 0, 0, …
## $ `category__Chests_of_drawers_&_drawer_units`     <dbl> 0, 0, 0, 0, 0, 0, 0, …
## $ `category__Children's_furniture`                 <dbl> 0, 0, 0, 0, 0, 0, 0, …
## $ category__Nursery_furniture                      <dbl> 0, 0, 0, 0, 0, 0, 0, …
## $ category__Outdoor_furniture                      <dbl> 0, 0, 0, 0, 0, 0, 0, …
## $ `category__Sideboards,_buffets_&_console_tables` <dbl> 0, 0, 0, 0, 0, 0, 0, …
## $ `category__Sofas_&_armchairs`                    <dbl> 0, 0, 0, 0, 0, 0, 0, …
## $ `category__Tables_&_desks`                       <dbl> 0, 0, 0, 0, 0, 0, 0, …
## $ `category__TV_&_media_furniture`                 <dbl> 0, 0, 0, 0, 0, 0, 0, …
## $ category__Wardrobes                              <dbl> 0, 0, 0, 0, 0, 0, 0, …
## $ `category__-OTHER`                               <dbl> 0, 0, 0, 0, 0, 0, 0, …
## $ `price__-Inf_5.68697535633982`                   <dbl> 1, 1, 0, 1, 1, 1, 0, …
## $ price__5.68697535633982_6.52209279817015         <dbl> 0, 0, 1, 0, 0, 0, 1, …
## $ price__6.52209279817015_7.37085996851068         <dbl> 0, 0, 0, 0, 0, 0, 0, …
## $ price__7.37085996851068_Inf                      <dbl> 0, 0, 0, 0, 0, 0, 0, …
## $ sellable_online__Yes                             <dbl> 1, 1, 1, 1, 1, 1, 1, …
## $ `sellable_online__-OTHER`                        <dbl> 0, 0, 0, 0, 0, 0, 0, …
## $ other_colors__No                                 <dbl> 0, 1, 1, 1, 1, 1, 1, …
## $ other_colors__Yes                                <dbl> 1, 0, 0, 0, 0, 0, 0, …
## $ designer__Andreas_Fredriksson                    <dbl> 0, 0, 0, 0, 0, 0, 0, …
## $ designer__Carina_Bengs                           <dbl> 0, 0, 1, 0, 0, 0, 1, …
## $ designer__Carl_Öjerstam                          <dbl> 0, 0, 0, 0, 0, 0, 0, …
## $ designer__Ebba_Strandmark                        <dbl> 0, 0, 0, 0, 0, 0, 0, …
## $ designer__Ehlén_Johansson                        <dbl> 0, 0, 0, 0, 0, 0, 0, …
## $ `designer__Ehlén_Johansson/IKEA_of_Sweden`       <dbl> 0, 0, 0, 0, 0, 0, 0, …
## $ designer__Eva_Lilja_Löwenhielm                   <dbl> 0, 0, 0, 0, 0, 0, 0, …
## $ designer__Francis_Cayouette                      <dbl> 0, 0, 0, 0, 0, 0, 0, …
## $ designer__Gillis_Lundgren                        <dbl> 0, 0, 0, 0, 0, 0, 0, …
## $ designer__Henrik_Preutz                          <dbl> 1, 0, 0, 0, 0, 0, 0, …
## $ designer__IKEA_of_Sweden                         <dbl> 0, 0, 0, 0, 0, 0, 0, …
## $ `designer__IKEA_of_Sweden/Ehlén_Johansson`       <dbl> 0, 0, 0, 0, 0, 0, 0, …
## $ `designer__IKEA_of_Sweden/Jon_Karlsson`          <dbl> 0, 0, 0, 0, 0, 0, 0, …
## $ designer__Johan_Kroon                            <dbl> 0, 0, 0, 0, 0, 0, 0, …
## $ designer__Jon_Karlsson                           <dbl> 0, 0, 0, 0, 0, 0, 0, …
## $ `designer__Jon_Karlsson/IKEA_of_Sweden`          <dbl> 0, 0, 0, 0, 0, 0, 0, …
## $ `designer__K_Hagberg/M_Hagberg`                  <dbl> 0, 0, 0, 1, 1, 1, 0, …
## $ designer__Mia_Lagerman                           <dbl> 0, 0, 0, 0, 0, 0, 0, …
## $ designer__Nike_Karlsson                          <dbl> 0, 0, 0, 0, 0, 0, 0, …
## $ designer__Ola_Wihlborg                           <dbl> 0, 0, 0, 0, 0, 0, 0, …
## $ designer__Studio_Copenhagen                      <dbl> 0, 0, 0, 0, 0, 0, 0, …
## $ designer__Tord_Björklund                         <dbl> 0, 0, 0, 0, 0, 0, 0, …
## $ `designer__-OTHER`                               <dbl> 0, 1, 0, 0, 0, 0, 0, …
## $ `depth__-Inf_40`                                 <dbl> 0, 0, 0, 0, 0, 0, 0, …
## $ depth__40_47                                     <dbl> 0, 0, 1, 1, 1, 1, 1, …
## $ depth__47_60                                     <dbl> 1, 1, 0, 0, 0, 0, 0, …
## $ depth__60_Inf                                    <dbl> 0, 0, 0, 0, 0, 0, 0, …
## $ `height__-Inf_71`                                <dbl> 0, 1, 0, 0, 0, 0, 0, …
## $ height__71_92                                    <dbl> 0, 0, 1, 0, 0, 0, 0, …
## $ height__92_171                                   <dbl> 1, 0, 0, 1, 1, 1, 1, …
## $ height__171_Inf                                  <dbl> 0, 0, 0, 0, 0, 0, 0, …
## $ `width__-Inf_60`                                 <dbl> 1, 0, 1, 1, 1, 1, 1, …
## $ width__60_93                                     <dbl> 0, 1, 0, 0, 0, 0, 0, …
## $ width__93_161.5                                  <dbl> 0, 0, 0, 0, 0, 0, 0, …
## $ width__161.5_Inf                                 <dbl> 0, 0, 0, 0, 0, 0, 0, …
# Step 2: Correlation 
data_correlation <- data_binarized %>%
  correlate(sellable_online__Yes)

data_correlation
## # A tibble: 82 × 3
##    feature         bin                   correlation
##    <fct>           <chr>                       <dbl>
##  1 sellable_online Yes                        1     
##  2 sellable_online -OTHER                    -1     
##  3 name            TROFAST                   -0.332 
##  4 category        Children's_furniture      -0.144 
##  5 price           -Inf_5.68697535633982     -0.143 
##  6 category        Nursery_furniture         -0.128 
##  7 width           -Inf_60                   -0.128 
##  8 designer        Studio_Copenhagen         -0.112 
##  9 depth           -Inf_40                   -0.0806
## 10 designer        Francis_Cayouette         -0.0573
## # ℹ 72 more rows
# Step 3: Plot
data_correlation %>%
    correlationfunnel::plot_correlation_funnel()
## Warning: The `size` argument of `element_line()` is deprecated as of ggplot2 3.4.0.
## ℹ Please use the `linewidth` argument instead.
## ℹ The deprecated feature was likely used in the correlationfunnel package.
##   Please report the issue at
##   <https://github.com/business-science/correlationfunnel/issues>.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.
## Warning: The `size` argument of `element_rect()` is deprecated as of ggplot2 3.4.0.
## ℹ Please use the `linewidth` argument instead.
## ℹ The deprecated feature was likely used in the correlationfunnel package.
##   Please report the issue at
##   <https://github.com/business-science/correlationfunnel/issues>.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.
## Warning: ggrepel: 61 unlabeled data points (too many overlaps). Consider
## increasing max.overlaps

Model Building

Split data

library(tidymodels)
## Warning: package 'tidymodels' was built under R version 4.5.2
## ── Attaching packages ────────────────────────────────────── tidymodels 1.4.1 ──
## ✔ broom        1.0.10     ✔ tailor       0.1.0 
## ✔ dials        1.4.2      ✔ tune         2.0.1 
## ✔ infer        1.1.0      ✔ workflows    1.3.0 
## ✔ modeldata    1.5.1      ✔ workflowsets 1.1.1 
## ✔ parsnip      1.4.1      ✔ yardstick    1.3.2 
## ✔ rsample      1.3.1
## Warning: package 'dials' was built under R version 4.5.2
## Warning: package 'infer' was built under R version 4.5.2
## Warning: package 'modeldata' was built under R version 4.5.2
## Warning: package 'parsnip' was built under R version 4.5.2
## Warning: package 'tailor' was built under R version 4.5.2
## Warning: package 'tune' was built under R version 4.5.2
## Warning: package 'workflows' was built under R version 4.5.2
## Warning: package 'workflowsets' was built under R version 4.5.2
## Warning: package 'yardstick' was built under R version 4.5.2
## ── Conflicts ───────────────────────────────────────── tidymodels_conflicts() ──
## ✖ scales::discard() masks purrr::discard()
## ✖ dplyr::filter()   masks stats::filter()
## ✖ recipes::fixed()  masks stringr::fixed()
## ✖ dplyr::lag()      masks stats::lag()
## ✖ yardstick::spec() masks readr::spec()
## ✖ recipes::step()   masks stats::step()
set.seed(1234)
# data_clean <- data_clean %>% sample_n(100)

data_split <- initial_split(data_clean, strata = sellable_online)
data_train <- training(data_split)
data_test  <- testing(data_split)

data_cv <- vfold_cv(data_train, v = 5, strata = sellable_online)
data_cv
## #  5-fold cross-validation using stratification 
## # A tibble: 5 × 2
##   splits             id   
##   <list>             <chr>
## 1 <split [1139/285]> Fold1
## 2 <split [1139/285]> Fold2
## 3 <split [1139/285]> Fold3
## 4 <split [1139/285]> Fold4
## 5 <split [1140/284]> Fold5

Preprocess data

library(themis)
## Warning: package 'themis' was built under R version 4.5.2
xgboost_rec <- recipes::recipe(sellable_online ~ ., data = data_train) %>%
    update_role(item_id, new_role = "ID") %>%
    step_tokenize(short_description) %>%
    step_tokenfilter(short_description, max_tokens = 100) %>%
    step_tf(short_description) %>%
    step_other(name, designer, threshold = 0.02) %>%
    step_dummy(all_nominal_predictors(), designer) %>%
    step_zv(all_predictors()) %>%
    step_smote(sellable_online, over_ratio = 1)


xgboost_rec %>% prep() %>% juice() %>% glimpse()
## Rows: 2,828
## Columns: 145
## $ item_id                                       <dbl> 70404875, 50406465, 9040…
## $ price                                         <dbl> 4.859812, 4.859812, 5.00…
## $ depth                                         <dbl> 44, 44, 44, 52, 51, 44, …
## $ height                                        <dbl> 95, 95, 103, 114, 102, 1…
## $ width                                         <dbl> 50, 50, 52, 43, 40, 52, …
## $ sellable_online                               <fct> Yes, Yes, Yes, Yes, Yes,…
## $ tf_short_description_1                        <dbl> 0, 0, 0, 0, 0, 0, 0, 0, …
## $ tf_short_description_147x147                  <dbl> 0, 0, 0, 0, 0, 0, 0, 0, …
## $ tf_short_description_150x44x236               <dbl> 0, 0, 0, 0, 0, 0, 0, 0, …
## $ tf_short_description_150x60x236               <dbl> 0, 0, 0, 0, 0, 0, 0, 0, …
## $ tf_short_description_150x66x236               <dbl> 0, 0, 0, 0, 0, 0, 0, 0, …
## $ tf_short_description_2                        <dbl> 0, 0, 0, 0, 0, 0, 0, 0, …
## $ tf_short_description_200x60x236               <dbl> 0, 0, 0, 0, 0, 0, 0, 0, …
## $ tf_short_description_200x66x236               <dbl> 0, 0, 0, 0, 0, 0, 0, 0, …
## $ tf_short_description_25x51x70                 <dbl> 0, 0, 0, 0, 0, 0, 0, 0, …
## $ tf_short_description_3                        <dbl> 0, 0, 0, 0, 0, 0, 0, 0, …
## $ tf_short_description_35x35x35                 <dbl> 0, 0, 0, 0, 0, 0, 0, 0, …
## $ tf_short_description_4                        <dbl> 0, 0, 0, 0, 0, 0, 0, 0, …
## $ tf_short_description_41x61                    <dbl> 0, 0, 0, 0, 0, 0, 0, 0, …
## $ tf_short_description_5                        <dbl> 0, 0, 0, 0, 0, 0, 0, 0, …
## $ tf_short_description_50x51x70                 <dbl> 0, 0, 0, 0, 0, 0, 0, 0, …
## $ tf_short_description_6                        <dbl> 0, 0, 0, 0, 0, 0, 0, 0, …
## $ tf_short_description_60x50x128                <dbl> 0, 0, 0, 0, 0, 0, 0, 0, …
## $ tf_short_description_63                       <dbl> 1, 1, 0, 0, 0, 0, 0, 0, …
## $ tf_short_description_74                       <dbl> 0, 0, 1, 0, 1, 1, 0, 0, …
## $ tf_short_description_75                       <dbl> 0, 0, 0, 1, 0, 0, 0, 1, …
## $ tf_short_description_8                        <dbl> 0, 0, 0, 0, 0, 0, 0, 0, …
## $ tf_short_description_80x30x202                <dbl> 0, 0, 0, 0, 0, 0, 0, 0, …
## $ tf_short_description_add                      <dbl> 0, 0, 0, 0, 0, 0, 0, 0, …
## $ tf_short_description_and                      <dbl> 0, 0, 0, 0, 0, 0, 0, 0, …
## $ tf_short_description_armchair                 <dbl> 0, 0, 0, 0, 0, 0, 0, 0, …
## $ tf_short_description_armrest                  <dbl> 0, 0, 0, 0, 0, 0, 0, 0, …
## $ tf_short_description_armrests                 <dbl> 0, 0, 0, 0, 0, 0, 0, 0, …
## $ tf_short_description_backrest                 <dbl> 1, 1, 1, 1, 1, 1, 0, 1, …
## $ tf_short_description_bar                      <dbl> 1, 1, 1, 1, 1, 1, 1, 1, …
## $ tf_short_description_baskets                  <dbl> 0, 0, 0, 0, 0, 0, 0, 0, …
## $ tf_short_description_bed                      <dbl> 0, 0, 0, 0, 0, 0, 0, 0, …
## $ tf_short_description_bedside                  <dbl> 0, 0, 0, 0, 0, 0, 0, 0, …
## $ tf_short_description_bench                    <dbl> 0, 0, 0, 0, 0, 0, 0, 0, …
## $ tf_short_description_bookcase                 <dbl> 0, 0, 0, 0, 0, 0, 0, 0, …
## $ tf_short_description_box                      <dbl> 0, 0, 0, 0, 0, 0, 0, 0, …
## $ tf_short_description_cabinet                  <dbl> 0, 0, 0, 0, 0, 0, 0, 0, …
## $ tf_short_description_cabinets                 <dbl> 0, 0, 0, 0, 0, 0, 0, 0, …
## $ tf_short_description_castors                  <dbl> 0, 0, 0, 0, 0, 0, 0, 0, …
## $ tf_short_description_chair                    <dbl> 0, 0, 0, 0, 0, 0, 0, 0, …
## $ tf_short_description_chaise                   <dbl> 0, 0, 0, 0, 0, 0, 0, 0, …
## $ tf_short_description_changing                 <dbl> 0, 0, 0, 0, 0, 0, 0, 0, …
## $ tf_short_description_chest                    <dbl> 0, 0, 0, 0, 0, 0, 0, 0, …
## $ `tf_short_description_children's`             <dbl> 0, 0, 0, 0, 0, 0, 0, 0, …
## $ tf_short_description_clothes                  <dbl> 0, 0, 0, 0, 0, 0, 0, 0, …
## $ tf_short_description_cm                       <dbl> 1, 1, 1, 1, 1, 1, 1, 1, …
## $ tf_short_description_combination              <dbl> 0, 0, 0, 0, 0, 0, 0, 0, …
## $ tf_short_description_corner                   <dbl> 0, 0, 0, 0, 0, 0, 0, 0, …
## $ tf_short_description_cover                    <dbl> 0, 0, 0, 0, 0, 0, 0, 0, …
## $ tf_short_description_desk                     <dbl> 0, 0, 0, 0, 0, 0, 0, 0, …
## $ tf_short_description_door                     <dbl> 0, 0, 0, 0, 0, 0, 0, 0, …
## $ tf_short_description_doors                    <dbl> 0, 0, 0, 0, 0, 0, 0, 0, …
## $ tf_short_description_drawer                   <dbl> 0, 0, 0, 0, 0, 0, 0, 0, …
## $ tf_short_description_drawers                  <dbl> 0, 0, 0, 0, 0, 0, 0, 0, …
## $ tf_short_description_feet                     <dbl> 0, 0, 0, 0, 0, 0, 0, 0, …
## $ tf_short_description_foldable                 <dbl> 1, 1, 1, 0, 0, 1, 0, 0, …
## $ tf_short_description_for                      <dbl> 0, 0, 0, 0, 0, 0, 0, 0, …
## $ tf_short_description_frame                    <dbl> 0, 0, 0, 0, 1, 0, 0, 0, …
## $ tf_short_description_glass                    <dbl> 0, 0, 0, 0, 0, 0, 0, 0, …
## $ tf_short_description_high                     <dbl> 0, 0, 0, 0, 0, 0, 0, 0, …
## $ tf_short_description_highchair                <dbl> 0, 0, 0, 0, 0, 0, 0, 0, …
## $ tf_short_description_in                       <dbl> 0, 0, 0, 0, 0, 0, 0, 0, …
## $ tf_short_description_inserts                  <dbl> 0, 0, 0, 0, 0, 0, 0, 0, …
## $ tf_short_description_junior                   <dbl> 0, 0, 0, 0, 0, 0, 0, 0, …
## $ tf_short_description_leg                      <dbl> 0, 0, 0, 0, 0, 0, 0, 0, …
## $ tf_short_description_legs                     <dbl> 0, 0, 0, 0, 0, 0, 0, 0, …
## $ tf_short_description_lock                     <dbl> 0, 0, 0, 0, 0, 0, 0, 0, …
## $ tf_short_description_longue                   <dbl> 0, 0, 0, 0, 0, 0, 0, 0, …
## $ tf_short_description_mesh                     <dbl> 0, 0, 0, 0, 0, 0, 0, 0, …
## $ tf_short_description_modular                  <dbl> 0, 0, 0, 0, 0, 0, 0, 0, …
## $ tf_short_description_mounted                  <dbl> 0, 0, 0, 0, 0, 0, 0, 0, …
## $ tf_short_description_of                       <dbl> 0, 0, 0, 0, 0, 0, 0, 0, …
## $ tf_short_description_on                       <dbl> 0, 0, 0, 0, 0, 0, 0, 0, …
## $ tf_short_description_outdoor                  <dbl> 0, 0, 0, 0, 0, 0, 0, 0, …
## $ tf_short_description_panel                    <dbl> 0, 0, 0, 0, 0, 0, 0, 0, …
## $ tf_short_description_plinth                   <dbl> 0, 0, 0, 0, 0, 0, 0, 0, …
## $ tf_short_description_rail                     <dbl> 0, 0, 0, 0, 0, 0, 0, 0, …
## $ tf_short_description_seat                     <dbl> 0, 0, 0, 0, 0, 0, 0, 0, …
## $ tf_short_description_section                  <dbl> 0, 0, 0, 0, 0, 0, 0, 0, …
## $ tf_short_description_sections                 <dbl> 0, 0, 0, 0, 0, 0, 0, 0, …
## $ tf_short_description_shelf                    <dbl> 0, 0, 0, 0, 0, 0, 0, 0, …
## $ tf_short_description_shelves                  <dbl> 0, 0, 0, 0, 0, 0, 0, 0, …
## $ tf_short_description_shelving                 <dbl> 0, 0, 0, 0, 0, 0, 0, 0, …
## $ tf_short_description_side                     <dbl> 0, 0, 0, 0, 0, 0, 0, 0, …
## $ tf_short_description_sliding                  <dbl> 0, 0, 0, 0, 0, 0, 0, 0, …
## $ tf_short_description_smart                    <dbl> 0, 0, 0, 0, 0, 0, 0, 0, …
## $ tf_short_description_sofa                     <dbl> 0, 0, 0, 0, 0, 0, 0, 0, …
## $ tf_short_description_stool                    <dbl> 1, 1, 1, 1, 1, 1, 1, 1, …
## $ tf_short_description_storage                  <dbl> 0, 0, 0, 0, 0, 0, 0, 0, …
## $ tf_short_description_table                    <dbl> 0, 0, 0, 0, 0, 0, 0, 0, …
## $ tf_short_description_top                      <dbl> 0, 0, 0, 0, 0, 0, 0, 0, …
## $ tf_short_description_tv                       <dbl> 0, 0, 0, 0, 0, 0, 0, 0, …
## $ tf_short_description_two                      <dbl> 0, 0, 0, 0, 0, 0, 0, 0, …
## $ tf_short_description_underframe               <dbl> 0, 0, 0, 0, 0, 0, 0, 0, …
## $ tf_short_description_unit                     <dbl> 0, 0, 0, 0, 0, 0, 0, 0, …
## $ tf_short_description_upright                  <dbl> 0, 0, 0, 0, 0, 0, 0, 0, …
## $ tf_short_description_w                        <dbl> 0, 0, 0, 0, 0, 0, 0, 0, …
## $ tf_short_description_wall                     <dbl> 0, 0, 0, 0, 0, 0, 0, 0, …
## $ tf_short_description_wardrobe                 <dbl> 0, 0, 0, 0, 0, 0, 0, 0, …
## $ tf_short_description_wire                     <dbl> 0, 0, 0, 0, 0, 0, 0, 0, …
## $ tf_short_description_with                     <dbl> 1, 1, 1, 1, 1, 1, 0, 1, …
## $ name_EKET                                     <dbl> 0, 0, 0, 0, 0, 0, 0, 0, …
## $ name_GRÖNLID                                  <dbl> 0, 0, 0, 0, 0, 0, 0, 0, …
## $ name_IVAR                                     <dbl> 0, 0, 0, 0, 0, 0, 0, 0, …
## $ name_JONAXEL                                  <dbl> 0, 0, 0, 0, 0, 0, 0, 0, …
## $ name_LIDHULT                                  <dbl> 0, 0, 0, 0, 0, 0, 0, 0, …
## $ name_NORDLI                                   <dbl> 0, 0, 0, 0, 0, 0, 0, 0, …
## $ name_PAX                                      <dbl> 0, 0, 0, 0, 0, 0, 0, 0, …
## $ name_PLATSA                                   <dbl> 0, 0, 0, 0, 0, 0, 0, 0, …
## $ name_TROFAST                                  <dbl> 0, 0, 0, 0, 0, 0, 0, 0, …
## $ name_VIMLE                                    <dbl> 0, 0, 0, 0, 0, 0, 0, 0, …
## $ name_other                                    <dbl> 1, 1, 1, 1, 1, 1, 1, 1, …
## $ category_Beds                                 <dbl> 0, 0, 0, 0, 0, 0, 0, 0, …
## $ category_Bookcases...shelving.units           <dbl> 0, 0, 0, 0, 0, 0, 0, 0, …
## $ category_Cabinets...cupboards                 <dbl> 0, 0, 0, 0, 0, 0, 0, 0, …
## $ category_Café.furniture                       <dbl> 0, 0, 0, 0, 0, 0, 0, 0, …
## $ category_Chairs                               <dbl> 0, 0, 0, 0, 0, 0, 0, 0, …
## $ category_Chests.of.drawers...drawer.units     <dbl> 0, 0, 0, 0, 0, 0, 0, 0, …
## $ category_Children.s.furniture                 <dbl> 0, 0, 0, 0, 0, 0, 0, 0, …
## $ category_Nursery.furniture                    <dbl> 0, 0, 0, 0, 0, 0, 0, 0, …
## $ category_Outdoor.furniture                    <dbl> 0, 0, 0, 0, 0, 0, 0, 0, …
## $ category_Room.dividers                        <dbl> 0, 0, 0, 0, 0, 0, 0, 0, …
## $ category_Sideboards..buffets...console.tables <dbl> 0, 0, 0, 0, 0, 0, 0, 0, …
## $ category_Sofas...armchairs                    <dbl> 0, 0, 0, 0, 0, 0, 0, 0, …
## $ category_Tables...desks                       <dbl> 0, 0, 0, 0, 0, 0, 0, 0, …
## $ category_Trolleys                             <dbl> 0, 0, 0, 0, 0, 0, 0, 0, …
## $ category_TV...media.furniture                 <dbl> 0, 0, 0, 0, 0, 0, 0, 0, …
## $ category_Wardrobes                            <dbl> 0, 0, 0, 0, 0, 0, 0, 0, …
## $ other_colors_Yes                              <dbl> 0, 0, 0, 0, 0, 0, 0, 0, …
## $ designer_Ehlén.Johansson                      <dbl> 0, 0, 0, 1, 0, 0, 0, 1, …
## $ designer_Ehlén.Johansson.IKEA.of.Sweden       <dbl> 0, 0, 0, 0, 0, 0, 0, 0, …
## $ designer_Francis.Cayouette                    <dbl> 0, 0, 0, 0, 0, 0, 0, 0, …
## $ designer_Henrik.Preutz                        <dbl> 0, 0, 0, 0, 0, 0, 0, 0, …
## $ designer_IKEA.of.Sweden                       <dbl> 0, 0, 0, 0, 0, 0, 0, 0, …
## $ designer_IKEA.of.Sweden.Ehlén.Johansson       <dbl> 0, 0, 0, 0, 0, 0, 0, 0, …
## $ designer_Jon.Karlsson                         <dbl> 0, 0, 0, 0, 0, 0, 0, 0, …
## $ designer_K.Hagberg.M.Hagberg                  <dbl> 1, 1, 1, 0, 0, 1, 0, 0, …
## $ designer_Ola.Wihlborg                         <dbl> 0, 0, 0, 0, 0, 0, 0, 0, …
## $ designer_Studio.Copenhagen                    <dbl> 0, 0, 0, 0, 0, 0, 0, 0, …
## $ designer_other                                <dbl> 0, 0, 0, 0, 1, 0, 1, 0, …

Specify model

xgboost_spec <- 
  boost_tree(trees = tune(), 
             min_n = tune(),
             tree_depth = tune(),
             learn_rate = tune(),
             loss_reduction = tune(),
             sample_size = tune()) %>%
    set_mode("classification") %>% 
    set_engine("xgboost")

xgboost_workflow <- 
  workflow() %>% 
  add_recipe(xgboost_rec) %>% 
  add_model(xgboost_spec) 

Tune hyperparameters

doParallel::registerDoParallel()

set.seed(1234)
xgboost_tune <-
  tune_grid(xgboost_workflow,
            resamples = data_cv,
            grid = 5,
            metrics = metric_set(roc_auc, accuracy),
            control = control_grid(save_pred = TRUE))

Model Evaluation

Identify optimal values for hyperparameters

collect_metrics(xgboost_tune)
## # A tibble: 10 × 12
##    trees min_n tree_depth learn_rate loss_reduction sample_size .metric 
##    <int> <int>      <int>      <dbl>          <dbl>       <dbl> <chr>   
##  1     1    30         15    0.0750    0.0422             0.625 accuracy
##  2     1    30         15    0.0750    0.0422             0.625 roc_auc 
##  3   500    21          1    0.316     0.0000562          1     accuracy
##  4   500    21          1    0.316     0.0000562          1     roc_auc 
##  5  1000     2          8    0.0178    0.0000000001       0.5   accuracy
##  6  1000     2          8    0.0178    0.0000000001       0.5   roc_auc 
##  7  1500    11          4    0.001    31.6                0.75  accuracy
##  8  1500    11          4    0.001    31.6                0.75  roc_auc 
##  9  2000    40         11    0.00422   0.0000000750       0.875 accuracy
## 10  2000    40         11    0.00422   0.0000000750       0.875 roc_auc 
## # ℹ 5 more variables: .estimator <chr>, mean <dbl>, n <int>, std_err <dbl>,
## #   .config <chr>
collect_predictions(xgboost_tune) %>%
    group_by(id) %>%
    roc_curve(sellable_online, .pred_Yes) %>%
    autoplot()

Fit the model for the last time

xgboost_last <- xgboost_workflow %>%
    finalize_workflow(select_best(xgboost_tune, metric = "accuracy")) %>%
    last_fit(data_split)

collect_metrics(xgboost_last)
## # A tibble: 3 × 4
##   .metric     .estimator .estimate .config        
##   <chr>       <chr>          <dbl> <chr>          
## 1 accuracy    binary       0.996   pre0_mod0_post0
## 2 roc_auc     binary       0.968   pre0_mod0_post0
## 3 brier_class binary       0.00365 pre0_mod0_post0
collect_predictions(xgboost_last) %>%
    yardstick::conf_mat(sellable_online, .pred_class) %>%
    autoplot()

Variable importance

library(vip)
## Warning: package 'vip' was built under R version 4.5.3
## 
## Attaching package: 'vip'
## The following object is masked from 'package:utils':
## 
##     vi
xgboost_last %>%
    workflows::extract_fit_engine() %>%
    vip()

Conclusion

The previous model had accuracy of 0.996 and AUC of 0.968.