Explore Data

library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr     1.1.3     ✔ readr     2.1.4
## ✔ forcats   1.0.0     ✔ stringr   1.5.0
## ✔ ggplot2   3.4.3     ✔ tibble    3.2.1
## ✔ lubridate 1.9.2     ✔ tidyr     1.3.0
## ✔ purrr     1.0.2     
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
ikea <- read_csv("https://raw.githubusercontent.com/rfordatascience/tidytuesday/master/data/2020/2020-11-03/ikea.csv")
## New names:
## Rows: 3694 Columns: 14
## ── Column specification
## ──────────────────────────────────────────────────────── Delimiter: "," chr
## (7): name, category, old_price, link, other_colors, short_description, d... dbl
## (6): ...1, item_id, price, depth, height, width lgl (1): sellable_online
## ℹ Use `spec()` to retrieve the full column specification for this data. ℹ
## Specify the column types or set `show_col_types = FALSE` to quiet this message.
## • `` -> `...1`
ikea %>%
    select(...1, price, depth:width) %>%
    pivot_longer(depth:width, names_to = "dim") %>%
    ggplot(aes(value, price, color = dim)) + 
    geom_point(alpha = 0.4, show.legend = FALSE) +
    scale_y_log10() + 
    facet_wrap(~ dim, scale = "free_x") +
    labs(x = NULL)

ikea_df <- ikea %>%
    select(price, name, category, depth, height, width) %>%
    mutate(price = log10(price)) %>%
    mutate_if(is.character, factor)

Build a model

library(tidymodels)
## ── Attaching packages ────────────────────────────────────── tidymodels 1.1.1 ──
## ✔ broom        1.0.5     ✔ rsample      1.2.0
## ✔ dials        1.2.0     ✔ tune         1.1.2
## ✔ infer        1.0.5     ✔ workflows    1.1.3
## ✔ modeldata    1.2.0     ✔ workflowsets 1.0.1
## ✔ parsnip      1.1.1     ✔ yardstick    1.2.0
## ✔ recipes      1.0.8
## ── Conflicts ───────────────────────────────────────── tidymodels_conflicts() ──
## ✖ scales::discard() masks purrr::discard()
## ✖ dplyr::filter()   masks stats::filter()
## ✖ recipes::fixed()  masks stringr::fixed()
## ✖ dplyr::lag()      masks stats::lag()
## ✖ yardstick::spec() masks readr::spec()
## ✖ recipes::step()   masks stats::step()
## • Use tidymodels_prefer() to resolve common conflicts.
set.seed(123)
ikea_split <- initial_split(ikea_df, strata = price)
ikea_train <- training(ikea_split)
ikea_test <- testing(ikea_split)

set.seed(234)
ikea_folds <- bootstraps(ikea_train, strata = price)
ikea_folds
## # Bootstrap sampling using stratification 
## # A tibble: 25 × 2
##    splits              id         
##    <list>              <chr>      
##  1 <split [2770/994]>  Bootstrap01
##  2 <split [2770/1003]> Bootstrap02
##  3 <split [2770/1037]> Bootstrap03
##  4 <split [2770/1010]> Bootstrap04
##  5 <split [2770/1014]> Bootstrap05
##  6 <split [2770/1007]> Bootstrap06
##  7 <split [2770/1036]> Bootstrap07
##  8 <split [2770/1016]> Bootstrap08
##  9 <split [2770/1021]> Bootstrap09
## 10 <split [2770/1043]> Bootstrap10
## # ℹ 15 more rows
library(usemodels)

use_ranger(price ~ ., data = ikea_train)
## ranger_recipe <- 
##   recipe(formula = price ~ ., data = ikea_train) 
## 
## ranger_spec <- 
##   rand_forest(mtry = tune(), min_n = tune(), trees = 1000) %>% 
##   set_mode("classification") %>% 
##   set_engine("ranger") 
## 
## ranger_workflow <- 
##   workflow() %>% 
##   add_recipe(ranger_recipe) %>% 
##   add_model(ranger_spec) 
## 
## set.seed(67013)
## ranger_tune <-
##   tune_grid(ranger_workflow, resamples = stop("add your rsample object"), grid = stop("add number of candidate points"))
library(textrecipes)
ranger_recipe <-
  recipe(formula = price ~ ., data = ikea_train) %>%
  step_other(name, category, threshold = 0.01) %>%
  step_clean_levels(name, category) %>%
  step_impute_knn(depth, height, width)

ranger_spec <-
  rand_forest(mtry = tune(), min_n = tune(), trees = 1000) %>%
  set_mode("regression") %>%
  set_engine("ranger")

ranger_workflow <-
  workflow() %>%
  add_recipe(ranger_recipe) %>%
  add_model(ranger_spec)

set.seed(8577)
doParallel::registerDoParallel()
ranger_tune <-
  tune_grid(ranger_workflow,
    resamples = ikea_folds,
    grid = 11)
## i Creating pre-processing data to finalize unknown parameter: mtry

Explore Results

show_best(ranger_tune, metric = "rmse")
## # A tibble: 5 × 8
##    mtry min_n .metric .estimator  mean     n std_err .config              
##   <int> <int> <chr>   <chr>      <dbl> <int>   <dbl> <chr>                
## 1     2     4 rmse    standard   0.340    25 0.00203 Preprocessor1_Model10
## 2     4    10 rmse    standard   0.348    25 0.00226 Preprocessor1_Model05
## 3     5     6 rmse    standard   0.349    25 0.00235 Preprocessor1_Model06
## 4     3    18 rmse    standard   0.350    25 0.00218 Preprocessor1_Model01
## 5     2    21 rmse    standard   0.352    25 0.00200 Preprocessor1_Model08
show_best(ranger_tune, metric = "rsq")
## # A tibble: 5 × 8
##    mtry min_n .metric .estimator  mean     n std_err .config              
##   <int> <int> <chr>   <chr>      <dbl> <int>   <dbl> <chr>                
## 1     2     4 rsq     standard   0.726    25 0.00332 Preprocessor1_Model10
## 2     4    10 rsq     standard   0.713    25 0.00372 Preprocessor1_Model05
## 3     5     6 rsq     standard   0.711    25 0.00385 Preprocessor1_Model06
## 4     3    18 rsq     standard   0.709    25 0.00368 Preprocessor1_Model01
## 5     2    21 rsq     standard   0.707    25 0.00347 Preprocessor1_Model08
autoplot(ranger_tune)

final_rf <- ranger_workflow %>%
  finalize_workflow(select_best(ranger_tune))

final_rf
## ══ Workflow ════════════════════════════════════════════════════════════════════
## Preprocessor: Recipe
## Model: rand_forest()
## 
## ── Preprocessor ────────────────────────────────────────────────────────────────
## 3 Recipe Steps
## 
## • step_other()
## • step_clean_levels()
## • step_impute_knn()
## 
## ── Model ───────────────────────────────────────────────────────────────────────
## Random Forest Model Specification (regression)
## 
## Main Arguments:
##   mtry = 2
##   trees = 1000
##   min_n = 4
## 
## Computational engine: ranger
ikea_fit <- last_fit(final_rf, ikea_split)
ikea_fit
## # Resampling results
## # Manual resampling 
## # A tibble: 1 × 6
##   splits             id               .metrics .notes   .predictions .workflow 
##   <list>             <chr>            <list>   <list>   <list>       <list>    
## 1 <split [2770/924]> train/test split <tibble> <tibble> <tibble>     <workflow>
collect_metrics(ikea_fit)
## # A tibble: 2 × 4
##   .metric .estimator .estimate .config             
##   <chr>   <chr>          <dbl> <chr>               
## 1 rmse    standard       0.318 Preprocessor1_Model1
## 2 rsq     standard       0.753 Preprocessor1_Model1
collect_predictions(ikea_fit) %>%
  ggplot(aes(price, .pred)) +
  geom_abline(lty = 2, color = "gray50") +
  geom_point(alpha = 0.5, color = "midnightblue") +
  coord_fixed()

predict(ikea_fit$.workflow[[1]], ikea_test[15, ])
## # A tibble: 1 × 1
##   .pred
##   <dbl>
## 1  2.42
library(vip)
## 
## Attaching package: 'vip'
## The following object is masked from 'package:utils':
## 
##     vi
imp_spec <- ranger_spec %>%
  finalize_model(select_best(ranger_tune)) %>%
  set_engine("ranger", importance = "permutation")

workflow() %>%
  add_recipe(ranger_recipe) %>%
  add_model(imp_spec) %>%
  fit(ikea_train) %>%
  pull_workflow_fit() %>%
  vip(aesthetics = list(alpha = 0.8, fill = "midnightblue"))