Goal: to predict the price. Click [here for the data]https://github.com/rfordatascience/tidytuesday/blob/master/data/2020/2020-11-03/readme.md.

Import Data

ikea <- readr::read_csv('https://raw.githubusercontent.com/rfordatascience/tidytuesday/master/data/2020/2020-11-03/ikea.csv')

skimr::skim(ikea)

Data summary
Name	ikea
Number of rows	3694
Number of columns	14
_______________________
Column type frequency:
character	7
logical	1
numeric	6
________________________
Group variables	None

Variable type: character

skim_variable	complete_rate	min	max	n_unique
name	1	3	27	607
category	1	4	36	17
old_price	1	4	13	365
link	1	52	163	2962
other_colors	1	2	3	2
short_description	1	3	63	1706
designer	1	3	1261	381

Variable type: logical

skim_variable	n_missing	complete_rate	mean	count
sellable_online	0	1	0.99	TRU: 3666, FAL: 28

Variable type: numeric

skim_variable	n_missing	complete_rate	mean	sd	p0	p25	p50	p75	p100	hist
…1	0	1.00	1846.50	1066.51	0	923.25	1846.5	2769.75	3693	▇▇▇▇▇
item_id	0	1.00	48632396.79	28887094.10	58487	20390574.00	49288078.0	70403572.75	99932615	▇▇▇▇▇
price	0	1.00	1078.21	1374.65	3	180.90	544.7	1429.50	9585	▇▁▁▁▁
depth	1463	0.60	54.38	29.96	1	38.00	47.0	60.00	257	▇▃▁▁▁
height	988	0.73	101.68	61.10	1	67.00	83.0	124.00	700	▇▂▁▁▁
width	589	0.84	104.47	71.13	1	60.00	80.0	140.00	420	▇▅▂▁▁

data <- ikea %>%
    
    # Treat missing values
    select(-other_colors, -old_price, -sellable_online, -link, -designer) %>%
    na.omit() %>%
    
    # log transform variables with pos-skewed distribution
    mutate(price = log(price))

Explore Data

Identify good predictors.

item_id

data %>%
    ggplot(aes(price, height)) +
    scale_y_log10() +
    geom_point()

EDA shortcut

# Step 1: Prepare data
data_binarized_tbl <- data %>%
    select(-item_id, -short_description) %>%
    binarize()

data_binarized_tbl %>% glimpse()

## Rows: 1,899
## Columns: 59
## $ ...8                                             <dbl> 1, 1, 1, 1, 1, 1, 1, …
## $ ...9                                             <dbl> 0, 0, 0, 0, 0, 0, 0, …
## $ ...10                                            <dbl> 0, 0, 0, 0, 0, 0, 0, …
## $ ...11                                            <dbl> 0, 0, 0, 0, 0, 0, 0, …
## $ name__ALGOT                                      <dbl> 0, 0, 0, 0, 0, 0, 0, …
## $ name__BEKANT                                     <dbl> 0, 0, 0, 0, 0, 0, 0, …
## $ name__BESTÅ                                      <dbl> 0, 0, 0, 0, 0, 0, 0, …
## $ `name__BILLY_/_OXBERG`                           <dbl> 0, 0, 0, 0, 0, 0, 0, …
## $ name__BRIMNES                                    <dbl> 0, 0, 0, 0, 0, 0, 0, …
## $ name__BROR                                       <dbl> 0, 0, 0, 0, 0, 0, 0, …
## $ name__EKET                                       <dbl> 0, 0, 0, 0, 0, 0, 0, …
## $ name__GRÖNLID                                    <dbl> 0, 0, 0, 0, 0, 0, 0, …
## $ name__HAVSTA                                     <dbl> 0, 0, 0, 0, 0, 0, 0, …
## $ name__HAVSTEN                                    <dbl> 0, 0, 0, 0, 0, 0, 0, …
## $ name__HEMNES                                     <dbl> 0, 0, 0, 0, 0, 0, 0, …
## $ name__IVAR                                       <dbl> 0, 0, 0, 0, 0, 0, 0, …
## $ name__JONAXEL                                    <dbl> 0, 0, 0, 0, 0, 0, 0, …
## $ name__KALLAX                                     <dbl> 0, 0, 0, 0, 0, 0, 0, …
## $ name__LIDHULT                                    <dbl> 0, 0, 0, 0, 0, 0, 0, …
## $ name__LIXHULT                                    <dbl> 0, 0, 0, 0, 0, 0, 0, …
## $ name__NORDLI                                     <dbl> 0, 0, 0, 0, 0, 0, 0, …
## $ name__PAX                                        <dbl> 0, 0, 0, 0, 0, 0, 0, …
## $ name__PLATSA                                     <dbl> 0, 0, 0, 0, 0, 0, 0, …
## $ `name__STUVA_/_FRITIDS`                          <dbl> 0, 0, 0, 0, 0, 0, 0, …
## $ name__TROFAST                                    <dbl> 0, 0, 0, 0, 0, 0, 0, …
## $ name__VALLENTUNA                                 <dbl> 0, 0, 0, 0, 0, 0, 0, …
## $ name__VIMLE                                      <dbl> 0, 0, 0, 0, 0, 0, 0, …
## $ `name__-OTHER`                                   <dbl> 1, 1, 1, 1, 1, 1, 1, …
## $ category__Bar_furniture                          <dbl> 1, 1, 1, 1, 1, 1, 1, …
## $ category__Beds                                   <dbl> 0, 0, 0, 0, 0, 0, 0, …
## $ `category__Bookcases_&_shelving_units`           <dbl> 0, 0, 0, 0, 0, 0, 0, …
## $ `category__Cabinets_&_cupboards`                 <dbl> 0, 0, 0, 0, 0, 0, 0, …
## $ category__Chairs                                 <dbl> 0, 0, 0, 0, 0, 0, 0, …
## $ `category__Chests_of_drawers_&_drawer_units`     <dbl> 0, 0, 0, 0, 0, 0, 0, …
## $ `category__Children's_furniture`                 <dbl> 0, 0, 0, 0, 0, 0, 0, …
## $ category__Nursery_furniture                      <dbl> 0, 0, 0, 0, 0, 0, 0, …
## $ category__Outdoor_furniture                      <dbl> 0, 0, 0, 0, 0, 0, 0, …
## $ `category__Sideboards,_buffets_&_console_tables` <dbl> 0, 0, 0, 0, 0, 0, 0, …
## $ `category__Sofas_&_armchairs`                    <dbl> 0, 0, 0, 0, 0, 0, 0, …
## $ `category__Tables_&_desks`                       <dbl> 0, 0, 0, 0, 0, 0, 0, …
## $ `category__TV_&_media_furniture`                 <dbl> 0, 0, 0, 0, 0, 0, 0, …
## $ category__Wardrobes                              <dbl> 0, 0, 0, 0, 0, 0, 0, …
## $ `category__-OTHER`                               <dbl> 0, 0, 0, 0, 0, 0, 0, …
## $ `price__-Inf_5.68697535633982`                   <dbl> 1, 1, 0, 1, 1, 1, 0, …
## $ price__5.68697535633982_6.52209279817015         <dbl> 0, 0, 1, 0, 0, 0, 1, …
## $ price__6.52209279817015_7.37085996851068         <dbl> 0, 0, 0, 0, 0, 0, 0, …
## $ price__7.37085996851068_Inf                      <dbl> 0, 0, 0, 0, 0, 0, 0, …
## $ `depth__-Inf_40`                                 <dbl> 0, 0, 0, 0, 0, 0, 0, …
## $ depth__40_47                                     <dbl> 0, 0, 1, 1, 1, 1, 1, …
## $ depth__47_60                                     <dbl> 1, 1, 0, 0, 0, 0, 0, …
## $ depth__60_Inf                                    <dbl> 0, 0, 0, 0, 0, 0, 0, …
## $ `height__-Inf_71`                                <dbl> 0, 1, 0, 0, 0, 0, 0, …
## $ height__71_92                                    <dbl> 0, 0, 1, 0, 0, 0, 0, …
## $ height__92_171                                   <dbl> 1, 0, 0, 1, 1, 1, 1, …
## $ height__171_Inf                                  <dbl> 0, 0, 0, 0, 0, 0, 0, …
## $ `width__-Inf_60`                                 <dbl> 1, 0, 1, 1, 1, 1, 1, …
## $ width__60_93                                     <dbl> 0, 1, 0, 0, 0, 0, 0, …
## $ width__93_161.5                                  <dbl> 0, 0, 0, 0, 0, 0, 0, …
## $ width__161.5_Inf                                 <dbl> 0, 0, 0, 0, 0, 0, 0, …

# Step 2: Correlate
data_corr_tbl <- data_binarized_tbl %>%
    correlate(price__7.37085996851068_Inf)

## Warning: Expected 2 pieces. Missing pieces filled with `NA` in 4 rows [1, 2, 3,
## 4].

data_corr_tbl

## # A tibble: 59 × 3
##    feature  bin                               correlation
##    <fct>    <chr>                                   <dbl>
##  1 price    7.37085996851068_Inf                    1    
##  2 width    161.5_Inf                               0.579
##  3 depth    60_Inf                                  0.447
##  4 category Sofas_&_armchairs                       0.379
##  5 width    -Inf_60                                -0.374
##  6 price    -Inf_5.68697535633982                  -0.336
##  7 price    6.52209279817015_7.37085996851068      -0.333
##  8 price    5.68697535633982_6.52209279817015      -0.331
##  9 name     PAX                                     0.302
## 10 category Wardrobes                               0.279
## # ℹ 49 more rows

# Step 3: Plot
data_corr_tbl %>%
    plot_correlation_funnel()

## Warning: Removed 4 rows containing missing values or values outside the scale range
## (`geom_text_repel()`).

## Warning: ggrepel: 35 unlabeled data points (too many overlaps). Consider
## increasing max.overlaps

data <- data %>%
    
    select(item_id, name, category, price, short_description, depth, height, width)

Build Models

Split data

data <- data %>% sample_n(100)

# Split into train and test dataset
set.seed(1234)
data_split <- rsample::initial_split(data)
data_train <- training(data_split)
data_test <- testing(data_split)

# Further split training dataset for cross-validation
set.seed(2345)
data_cv <- rsample::vfold_cv(data_train)
data_cv

## #  10-fold cross-validation 
## # A tibble: 10 × 2
##    splits         id    
##    <list>         <chr> 
##  1 <split [67/8]> Fold01
##  2 <split [67/8]> Fold02
##  3 <split [67/8]> Fold03
##  4 <split [67/8]> Fold04
##  5 <split [67/8]> Fold05
##  6 <split [68/7]> Fold06
##  7 <split [68/7]> Fold07
##  8 <split [68/7]> Fold08
##  9 <split [68/7]> Fold09
## 10 <split [68/7]> Fold10

library(usemodels)
use_xgboost(price ~ ., data = data_train)

## xgboost_recipe <- 
##   recipe(formula = price ~ ., data = data_train) %>% 
##   step_zv(all_predictors()) 
## 
## xgboost_spec <- 
##   boost_tree(trees = tune(), min_n = tune(), tree_depth = tune(), learn_rate = tune(), 
##     loss_reduction = tune(), sample_size = tune()) %>% 
##   set_mode("classification") %>% 
##   set_engine("xgboost") 
## 
## xgboost_workflow <- 
##   workflow() %>% 
##   add_recipe(xgboost_recipe) %>% 
##   add_model(xgboost_spec) 
## 
## set.seed(6804)
## xgboost_tune <-
##   tune_grid(xgboost_workflow, resamples = stop("add your rsample object"), grid = stop("add number of candidate points"))

xgboost_recipe <- 
  recipe(formula = price ~ ., data = data_train) %>%
    recipes::update_role(item_id, new_role = "id variables") %>%
    step_tokenize(short_description) %>%
    step_tokenfilter(short_description, max_tokens = 100) %>%
    step_tokenfilter(max_tokens = 40) %>%
    step_tfidf(short_description) %>%
    step_other(category) %>%
    step_dummy(category, name, one_hot = TRUE) %>%
    step_log(height, width) %>%
    step_YeoJohnson(width, height) %>%
    step_impute_knn(all_predictors())

# step_tokenize() %>%
xgboost_recipe %>% prep() %>% bake(new_data = NULL) %>% glimpse()

## Rows: 75
## Columns: 164
## $ item_id                                   <dbl> 79306333, 30466281, 59279585…
## $ depth                                     <dbl> 44, 30, 50, 42, 42, 57, 46, …
## $ height                                    <dbl> 96.54439, 24.50599, 48.39329…
## $ width                                     <dbl> 27.85003, 15.78814, 18.78546…
## $ price                                     <dbl> 7.661527, 2.708050, 5.926926…
## $ tfidf_short_description_1                 <dbl> 0.0000000, 0.0000000, 0.0000…
## $ tfidf_short_description_108x96            <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ tfidf_short_description_120x30x202        <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ tfidf_short_description_120x30x237        <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ tfidf_short_description_120x42x48         <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ tfidf_short_description_121x35x123        <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ tfidf_short_description_130x117           <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ tfidf_short_description_150x44x236        <dbl> 1.443578, 0.000000, 0.000000…
## $ tfidf_short_description_160x99            <dbl> 0.0000000, 0.0000000, 0.0000…
## $ tfidf_short_description_180x41x49         <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ tfidf_short_description_180x42x64         <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ tfidf_short_description_193x25x176        <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ tfidf_short_description_2                 <dbl> 0.0000000, 0.0000000, 0.0000…
## $ tfidf_short_description_200x58x201        <dbl> 0.000000, 0.000000, 0.000000…
## $ tfidf_short_description_200x66x236        <dbl> 0.000000, 0.000000, 0.000000…
## $ tfidf_short_description_222x80x90         <dbl> 0.0000000, 0.0000000, 0.0000…
## $ tfidf_short_description_240x42x230        <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ tfidf_short_description_25x51x70          <dbl> 0.0000000, 0.0000000, 0.0000…
## $ tfidf_short_description_29                <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ tfidf_short_description_3                 <dbl> 0.0000000, 0.0000000, 0.0000…
## $ tfidf_short_description_300x42x193        <dbl> 0.0000000, 0.0000000, 0.0000…
## $ tfidf_short_description_300x42x210        <dbl> 0.0000000, 0.0000000, 0.0000…
## $ tfidf_short_description_300x60x236        <dbl> 0.000000, 0.000000, 0.000000…
## $ tfidf_short_description_35x35x10          <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ tfidf_short_description_4                 <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ tfidf_short_description_40x28x106         <dbl> 0.0000000, 0.0000000, 0.0000…
## $ tfidf_short_description_40x99             <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ tfidf_short_description_42x30x10          <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ tfidf_short_description_42x30x23          <dbl> 0.000000, 1.082683, 0.000000…
## $ tfidf_short_description_42x61             <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ tfidf_short_description_44x55x216         <dbl> 0.000000, 0.000000, 0.000000…
## $ tfidf_short_description_45x47x92          <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ tfidf_short_description_5                 <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ tfidf_short_description_50x51x70          <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ tfidf_short_description_6                 <dbl> 0.0000000, 0.0000000, 0.0000…
## $ tfidf_short_description_60x20x38          <dbl> 0.000000, 0.000000, 0.000000…
## $ tfidf_short_description_60x22x128         <dbl> 0.0000000, 0.0000000, 0.0000…
## $ tfidf_short_description_60x22x202         <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ tfidf_short_description_60x25x40          <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ tfidf_short_description_60x40x128         <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ tfidf_short_description_60x50x64          <dbl> 0.000000, 0.000000, 1.082683…
## $ tfidf_short_description_60x57x73          <dbl> 0.0000000, 0.0000000, 0.0000…
## $ tfidf_short_description_63x126            <dbl> 0.0000000, 0.0000000, 0.0000…
## $ tfidf_short_description_64x60             <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ tfidf_short_description_70x47             <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ tfidf_short_description_75                <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ tfidf_short_description_80x35x210         <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ tfidf_short_description_80x42x221         <dbl> 0.0000000, 0.0000000, 0.0000…
## $ tfidf_short_description_81x35x123         <dbl> 0.000000, 0.000000, 0.000000…
## $ tfidf_short_description_81x47x212         <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ tfidf_short_description_9                 <dbl> 0.0000000, 0.0000000, 0.0000…
## $ tfidf_short_description_90x200            <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ tfidf_short_description_90x40x38          <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ tfidf_short_description_90x83             <dbl> 0.0000000, 0.0000000, 0.0000…
## $ tfidf_short_description_94x44x52          <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ tfidf_short_description_add               <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ tfidf_short_description_armrest           <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ tfidf_short_description_backrest          <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ tfidf_short_description_bar               <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ tfidf_short_description_baskets           <dbl> 0.000000, 0.000000, 0.000000…
## $ tfidf_short_description_bed               <dbl> 0.0000000, 0.0000000, 0.0000…
## $ tfidf_short_description_bench             <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ tfidf_short_description_bookcase          <dbl> 0.0000000, 0.0000000, 0.0000…
## $ tfidf_short_description_box               <dbl> 0.0000000, 0.9126646, 0.0000…
## $ tfidf_short_description_cabinet           <dbl> 0.0000000, 0.0000000, 0.0000…
## $ tfidf_short_description_castors           <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ tfidf_short_description_chair             <dbl> 0.000000, 0.000000, 0.000000…
## $ tfidf_short_description_chaise            <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ tfidf_short_description_chest             <dbl> 0.0000000, 0.0000000, 0.0000…
## $ tfidf_short_description_cm                <dbl> 0.3014854, 0.2261141, 0.2261…
## $ tfidf_short_description_combination       <dbl> 0.0000000, 0.0000000, 0.0000…
## $ tfidf_short_description_corner            <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ tfidf_short_description_cushion           <dbl> 0.000000, 0.000000, 0.000000…
## $ tfidf_short_description_desk              <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ tfidf_short_description_display           <dbl> 0.0000000, 0.0000000, 0.0000…
## $ tfidf_short_description_door              <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ tfidf_short_description_doors             <dbl> 0.0000000, 0.0000000, 0.0000…
## $ tfidf_short_description_drawer            <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ tfidf_short_description_drawers           <dbl> 0.0000000, 0.0000000, 0.0000…
## $ tfidf_short_description_folding           <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ tfidf_short_description_frame             <dbl> 0.0000000, 0.0000000, 0.0000…
## $ tfidf_short_description_glass             <dbl> 0.0000000, 0.0000000, 0.0000…
## $ tfidf_short_description_leg               <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ tfidf_short_description_modular           <dbl> 0.0000000, 0.0000000, 0.0000…
## $ tfidf_short_description_mounted           <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ tfidf_short_description_of                <dbl> 0.0000000, 0.0000000, 0.0000…
## $ tfidf_short_description_on                <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ tfidf_short_description_outdoor           <dbl> 0.0000000, 0.0000000, 0.0000…
## $ tfidf_short_description_seat              <dbl> 0.0000000, 0.0000000, 0.0000…
## $ tfidf_short_description_section           <dbl> 0.0000000, 0.0000000, 0.0000…
## $ tfidf_short_description_shelf             <dbl> 0.0000000, 0.0000000, 0.0000…
## $ tfidf_short_description_sofa              <dbl> 0.0000000, 0.0000000, 0.0000…
## $ tfidf_short_description_stool             <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ tfidf_short_description_storage           <dbl> 0.0000000, 0.5141130, 0.5141…
## $ tfidf_short_description_tv                <dbl> 0.0000000, 0.0000000, 0.0000…
## $ tfidf_short_description_unit              <dbl> 0.000000, 0.000000, 0.000000…
## $ tfidf_short_description_wall              <dbl> 0.0000000, 0.0000000, 0.0000…
## $ tfidf_short_description_wardrobe          <dbl> 0.8202697, 0.0000000, 0.0000…
## $ tfidf_short_description_wire              <dbl> 0.000000, 0.000000, 0.000000…
## $ tfidf_short_description_with              <dbl> 0.0000000, 0.0000000, 0.4345…
## $ category_Bookcases...shelving.units       <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ category_Cabinets...cupboards             <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ category_Chairs                           <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ category_Chests.of.drawers...drawer.units <dbl> 0, 0, 0, 0, 0, 1, 0, 0, 0, 0…
## $ category_Children.s.furniture             <dbl> 0, 1, 1, 0, 0, 0, 0, 0, 0, 0…
## $ category_Sofas...armchairs                <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 1…
## $ category_TV...media.furniture             <dbl> 0, 0, 0, 1, 0, 0, 0, 0, 0, 0…
## $ category_Wardrobes                        <dbl> 1, 0, 0, 0, 1, 0, 0, 0, 1, 0…
## $ category_other                            <dbl> 0, 0, 0, 0, 0, 0, 1, 1, 0, 0…
## $ name_ÄPPLARÖ                              <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ name_BESTÅ                                <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ name_BESTÅ...EKET                         <dbl> 0, 0, 0, 1, 0, 0, 0, 0, 0, 0…
## $ name_BESTÅ.BURS                           <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ name_BILLY...BOTTNA                       <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ name_BILLY...OXBERG                       <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ name_BRIMNES                              <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ name_DELAKTIG                             <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ name_EKEDALEN                             <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ name_EKET                                 <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ name_ELVARLI                              <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ name_GODVIN                               <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ name_GRÖNLID                              <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ name_GUNDE                                <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ name_HÄLLAN                               <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ name_HAVSTA                               <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ name_HEMNES                               <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ name_IDÅSEN                               <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ name_JANINGE                              <dbl> 0, 0, 0, 0, 0, 0, 1, 0, 0, 0…
## $ name_JONAXEL                              <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ name_KOPPANG                              <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ name_KORNSJÖ                              <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ name_KUNGSHAMN                            <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ name_KUNGSHOLMEN                          <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ name_LANDSKRONA                           <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ name_LANGUR                               <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ name_LEIFARNE                             <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ name_LIDHULT                              <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ name_LIXHULT                              <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ name_LYCKSELE.LÖVÅS                       <dbl> 0, 0, 0, 0, 0, 0, 0, 1, 0, 0…
## $ name_MOSJÖ                                <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ name_NORDLI                               <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ name_NORDVIKEN                            <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ name_NORRARYD                             <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ name_NYHAMN                               <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ name_PÅHL                                 <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ name_PAX                                  <dbl> 1, 0, 0, 0, 0, 0, 0, 0, 1, 0…
## $ name_PAX...MEHAMN.AULI                    <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ name_PLATSA                               <dbl> 0, 0, 0, 0, 1, 1, 0, 0, 0, 0…
## $ name_POÄNG                                <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ name_SÖDERHAMN                            <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 1…
## $ name_SOLLERÖN                             <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ name_STALLARP                             <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ name_STUBBARP                             <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ name_STUVA...FRITIDS                      <dbl> 0, 0, 1, 0, 0, 0, 0, 0, 0, 0…
## $ name_SVALNÄS                              <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ name_TROFAST                              <dbl> 0, 1, 0, 0, 0, 0, 0, 0, 0, 0…
## $ name_VALLENTUNA                           <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ name_VIMLE                                <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ name_VISTHUS                              <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…

xgboost_spec <- 
  boost_tree(trees = tune(), min_n = tune(), tree_depth = tune(), learn_rate = tune(), 
    loss_reduction = tune(), sample_size = tune()) %>% 
  set_mode("regression") %>% 
  set_engine("xgboost") 

xgboost_workflow <- 
  workflow() %>% 
  add_recipe(xgboost_recipe) %>% 
  add_model(xgboost_spec) 

set.seed(6804)
doParallel::registerDoParallel()
xgboost_tune <-
  tune_grid(xgboost_workflow, 
            resamples = data_cv, 
            grid = 5)

Evaluate Models

tune::show_best(xgboost_tune, metric = "rmse")

## # A tibble: 5 × 12
##   trees min_n tree_depth learn_rate loss_reduction sample_size .metric
##   <int> <int>      <int>      <dbl>          <dbl>       <dbl> <chr>  
## 1   431     9         10    0.0100        7.91e-10       0.427 rmse   
## 2  1309    24          2    0.00694       2.65e+ 0       0.905 rmse   
## 3  1020    11          5    0.00253       4.65e- 5       0.672 rmse   
## 4  1926    32          7    0.0768        5.00e- 8       0.553 rmse   
## 5   121    34         15    0.212         1.08e- 1       0.110 rmse   
## # ℹ 5 more variables: .estimator <chr>, mean <dbl>, n <int>, std_err <dbl>,
## #   .config <chr>

# Update the model by selecting the best hyperparameters
xgboost_fw <- tune::finalize_workflow(xgboost_workflow,
                        tune::select_best(xgboost_tune, metric = "rmse"))

# Fit the model on the entire traning data and test it on the test data
data_fit <- tune::last_fit(xgboost_fw, data_split)

tune::collect_metrics(data_fit)

## # A tibble: 2 × 4
##   .metric .estimator .estimate .config             
##   <chr>   <chr>          <dbl> <chr>               
## 1 rmse    standard       0.868 Preprocessor1_Model1
## 2 rsq     standard       0.649 Preprocessor1_Model1

tune::collect_predictions(data_fit) %>%
    ggplot(aes(price, .pred)) +
    geom_point(alpha = 0.3, fill = "midnightblue") +
    geom_abline(lty = 2, color = "gray50") +
    coord_fixed()

Models

ranger_spec <-
  rand_forest(trees = 500) %>%
  set_engine("ranger") %>%
  set_mode("regression")

ranger_spec

## Random Forest Model Specification (regression)
## 
## Main Arguments:
##   trees = 500
## 
## Computational engine: ranger

svm_spec <-
    svm_linear() %>%
    set_engine("LiblineaR") %>%
    set_mode("regression")

svm_spec

## Linear Support Vector Machine Model Specification (regression)
## 
## Computational engine: LiblineaR

ranger_wf <- workflow(xgboost_recipe, ranger_spec)
svm_wf <- workflow(xgboost_recipe, svm_spec)

doParallel::registerDoParallel()
contrl_preds <- control_resamples(save_pred = TRUE)

svm_rs <- fit_resamples(
  svm_wf,
  resamples = data_cv,
  control = contrl_preds
)

## Warning: package 'LiblineaR' was built under R version 4.4.1

ranger_rs <- fit_resamples(
  ranger_wf,
  resamples = data_cv,
  control = contrl_preds
)

collect_metrics(svm_rs)

## # A tibble: 2 × 6
##   .metric .estimator  mean     n std_err .config             
##   <chr>   <chr>      <dbl> <int>   <dbl> <chr>               
## 1 rmse    standard   0.818    10  0.0741 Preprocessor1_Model1
## 2 rsq     standard   0.842    10  0.0249 Preprocessor1_Model1

collect_metrics(ranger_rs)

## # A tibble: 2 × 6
##   .metric .estimator  mean     n std_err .config             
##   <chr>   <chr>      <dbl> <int>   <dbl> <chr>               
## 1 rmse    standard   0.715    10  0.0922 Preprocessor1_Model1
## 2 rsq     standard   0.823    10  0.0306 Preprocessor1_Model1

bind_rows(
  collect_predictions(svm_rs) %>%
    mutate(mod = "SVM"), collect_predictions(ranger_rs) %>%
    mutate(mod = "ranger")) %>%
    
  ggplot(aes(price, .pred)) +
    geom_abline(lty = 2, color = "gray50") +
    geom_point(alpha = 0.3, fill = "midnightblue") +
    coord_fixed()

Final

final_fitted <- last_fit(svm_wf, data_split)
collect_metrics(final_fitted)

## # A tibble: 2 × 4
##   .metric .estimator .estimate .config             
##   <chr>   <chr>          <dbl> <chr>               
## 1 rmse    standard       0.990 Preprocessor1_Model1
## 2 rsq     standard       0.711 Preprocessor1_Model1

final_wf <- extract_workflow(final_fitted)
predict(final_wf, data_test)

## Warning: ! There are new levels in a factor: `MASTHOLMEN`, `BEKANT`, `SMÅGÖRA`, `IVAR`,
##   `STUK`, `ARKELSTORP`, `GALANT`, `BRUSALI`, `INGOLF`, `VADHOLMA`, `FÖRSIKTIG`,
##   `GODISHUS`, and `VÄDDÖ`.

## # A tibble: 25 × 1
##    .pred
##    <dbl>
##  1 6.37 
##  2 6.18 
##  3 7.09 
##  4 5.39 
##  5 5.38 
##  6 8.19 
##  7 6.67 
##  8 0.927
##  9 3.71 
## 10 8.17 
## # ℹ 15 more rows

Comments

I made the inclusion of name as a predictor. With the use of step_tokenize() and take the first 40 words, I also tried to implement the mtry = tune() as a predictor but all models failed. With the use Random Forest, and SVM i found that xgboost is performing better for IKEA.

Apply 4: Ikea Price Prediction

Julius Mondschein

2024-09-12