Explore the data

Our modeling goal is to predict the price of IKEA furniture from other furniture characteristics like category and size.

library(tidyverse)
## -- Attaching packages --------
## v ggplot2 3.3.2     v purrr   0.3.4
## v tibble  3.0.3     v dplyr   1.0.0
## v tidyr   1.1.0     v stringr 1.4.0
## v readr   1.3.1     v forcats 0.5.0
## -- Conflicts -----------------
## x dplyr::filter() masks stats::filter()
## x dplyr::lag()    masks stats::lag()
ikea <- read_csv("https://raw.githubusercontent.com/rfordatascience/tidytuesday/master/data/2020/2020-11-03/ikea.csv")
## Warning: Missing column names filled in: 'X1' [1]
## Parsed with column specification:
## cols(
##   X1 = col_double(),
##   item_id = col_double(),
##   name = col_character(),
##   category = col_character(),
##   price = col_double(),
##   old_price = col_character(),
##   sellable_online = col_logical(),
##   link = col_character(),
##   other_colors = col_character(),
##   short_description = col_character(),
##   designer = col_character(),
##   depth = col_double(),
##   height = col_double(),
##   width = col_double()
## )

How is the price related to the furniture dimensions?

theme_set(theme_light())

library(foreach)   # tune uses foreach for parallel processing


ikea %>%
               select(X1, price, depth:width) %>%
               pivot_longer(depth:width, names_to = 'dim') %>%
               ggplot(aes(value, price, color = dim)) +
               geom_point(alpha = 0.4, show.legend = FALSE) +
               scale_y_log10() + 
               facet_wrap(~dim, scales = 'free_x') + labs(x = NULL)

There are lots more great examples of TidyTuesday EDA out there to explore on Twitter! Let’s do a bit of data preparation for modeling. There are still lots of NA values for furniture dimensions, but we are going to impute those.

ikea_df <- ikea %>%
               select(price, name, category, depth, height, width) %>%
               mutate(price = log10(price)) %>%
               mutate_if(is.character, factor)

ikea_df
## # A tibble: 3,694 x 6
##    price name                  category      depth height width
##    <dbl> <fct>                 <fct>         <dbl>  <dbl> <dbl>
##  1  2.42 FREKVENS              Bar furniture    NA     99    51
##  2  3.00 NORDVIKEN             Bar furniture    NA    105    80
##  3  3.32 NORDVIKEN / NORDVIKEN Bar furniture    NA     NA    NA
##  4  1.84 STIG                  Bar furniture    50    100    60
##  5  2.35 NORBERG               Bar furniture    60     43    74
##  6  2.54 INGOLF                Bar furniture    45     91    40
##  7  2.11 FRANKLIN              Bar furniture    44     95    50
##  8  2.29 DALFRED               Bar furniture    50     NA    50
##  9  2.11 FRANKLIN              Bar furniture    44     95    50
## 10  3.34 EKEDALEN / EKEDALEN   Bar furniture    NA     NA    NA
## # ... with 3,684 more rows
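
Before imputing, it’s worth quantifying how much is missing. Here is a minimal check (not part of the original output) of the proportion of NA values in each dimension:

# Share of missing values in each dimension column
ikea_df %>%
               summarise(across(depth:width, ~ mean(is.na(.x))))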

Build a model

We can start by loading the tidymodels metapackage, splitting our data into training and testing sets, and creating resamples.

library(tidymodels)
## -- Attaching packages --------
## v broom     0.7.0      v recipes   0.1.15
## v dials     0.0.9      v rsample   0.0.8 
## v infer     0.5.3      v tune      0.1.2 
## v modeldata 0.0.2      v workflows 0.2.1 
## v parsnip   0.1.4      v yardstick 0.0.7
## Warning: package 'dials' was built under R version 4.0.3
## Warning: package 'parsnip' was built under R version 4.0.3
## Warning: package 'recipes' was built under R version 4.0.3
## Warning: package 'rsample' was built under R version 4.0.3
## Warning: package 'tune' was built under R version 4.0.3
## Warning: package 'workflows' was built under R version 4.0.3
## -- Conflicts -----------------
## x foreach::accumulate() masks purrr::accumulate()
## x scales::discard()     masks purrr::discard()
## x dplyr::filter()       masks stats::filter()
## x recipes::fixed()      masks stringr::fixed()
## x dplyr::lag()          masks stats::lag()
## x yardstick::spec()     masks readr::spec()
## x recipes::step()       masks stats::step()
## x foreach::when()       masks purrr::when()
set.seed(123)



ikea_split <- initial_split(ikea_df, strata = price)

ikea_train <- training(ikea_split)
ikea_test <- testing(ikea_split)

set.seed(234)
ikea_folds <- bootstraps(ikea_train, strata = price)

ikea_folds
## # Bootstrap sampling using stratification 
## # A tibble: 25 x 2
##    splits             id         
##    <list>             <chr>      
##  1 <split [2.8K/998]> Bootstrap01
##  2 <split [2.8K/1K]>  Bootstrap02
##  3 <split [2.8K/1K]>  Bootstrap03
##  4 <split [2.8K/1K]>  Bootstrap04
##  5 <split [2.8K/1K]>  Bootstrap05
##  6 <split [2.8K/1K]>  Bootstrap06
##  7 <split [2.8K/1K]>  Bootstrap07
##  8 <split [2.8K/1K]>  Bootstrap08
##  9 <split [2.8K/1K]>  Bootstrap09
## 10 <split [2.8K/1K]>  Bootstrap10
## # ... with 15 more rows
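
Each split pairs a bootstrap sample for fitting with the out-of-bag rows for assessment. A quick sketch (not in the original post) of pulling the two pieces out of one split with rsample’s accessors:

# The analysis set is the bootstrap sample; the assessment set
# holds the out-of-bag rows
first_split <- ikea_folds$splits[[1]]
analysis(first_split)
assessment(first_split)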

In this analysis, we are using a function from usemodels to provide scaffolding for getting started with tidymodels tuning. The two inputs we need are the model formula and our training data.

library(usemodels)
## Warning: package 'usemodels' was built under R version 4.0.3
use_ranger(price ~ ., data = ikea_train)
## ranger_recipe <- 
##   recipe(formula = price ~ ., data = ikea_train) 
## 
## ranger_spec <- 
##   rand_forest(mtry = tune(), min_n = tune(), trees = 1000) %>% 
##   set_mode("regression") %>% 
##   set_engine("ranger") 
## 
## ranger_workflow <- 
##   workflow() %>% 
##   add_recipe(ranger_recipe) %>% 
##   add_model(ranger_spec) 
## 
## set.seed(8577)
## ranger_tune <-
##   tune_grid(ranger_workflow, resamples = stop("add your rsample object"), grid = stop("add number of candidate points"))

The output that we get from the usemodels scaffolding sets us up for random forest tuning, and we can add just a few more feature engineering steps to take care of the numerous factor levels in the furniture name and category, “cleaning” the factor levels, and imputing the missing data in the furniture dimensions. Then it’s time to tune!

library(textrecipes)
## Warning: package 'textrecipes' was built under R version 4.0.3
ranger_recipe <- 
               recipe(formula = price ~ ., data = ikea_train) %>%
               step_other(name, category, threshold = 0.01) %>%
               step_clean_levels(name, category) %>%
               step_knnimpute(depth, height, width)


ranger_spec <- rand_forest(mtry = tune(), min_n = tune(), trees = 1000) %>%
               set_mode('regression') %>%
               set_engine('ranger')



ranger_workflow <- 
               workflow() %>% 
               add_recipe(ranger_recipe) %>%
               add_model(ranger_spec)


set.seed(8577)

ranger_tune <- tune_grid(ranger_workflow, resamples = ikea_folds, grid = 11)
## i Creating pre-processing data to finalize unknown parameter: mtry
## 
## Attaching package: 'rlang'
## The following objects are masked from 'package:purrr':
## 
##     %@%, as_function, flatten, flatten_chr, flatten_dbl, flatten_int,
##     flatten_lgl, flatten_raw, invoke, list_along, modify, prepend,
##     splice
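
Fitting 1,000-tree forests for 11 candidate parameter sets across 25 bootstraps is a lot of computation. Since tune parallelizes via foreach, one option is to register a backend before calling tune_grid(); a minimal sketch, assuming the doParallel package is installed and four cores are available:

# Register a parallel backend so tune_grid() runs resamples in parallel
library(doParallel)
registerDoParallel(cores = 4)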

The usemodels output required us to decide for ourselves on the resamples and grid to use; it provides sensible defaults for many options based on our data, but we still need to use good judgment for some modeling inputs.

Explore results

Now let’s see how we did. We can check out the best-performing models in the tuning results.

show_best(ranger_tune, metric = 'rmse')
## # A tibble: 5 x 8
##    mtry min_n .metric .estimator  mean     n std_err .config              
##   <int> <int> <chr>   <chr>      <dbl> <int>   <dbl> <chr>                
## 1     2     4 rmse    standard   0.342    25 0.00210 Preprocessor1_Model10
## 2     4    10 rmse    standard   0.348    25 0.00235 Preprocessor1_Model05
## 3     5     6 rmse    standard   0.348    25 0.00266 Preprocessor1_Model06
## 4     3    18 rmse    standard   0.351    25 0.00212 Preprocessor1_Model01
## 5     2    21 rmse    standard   0.355    25 0.00200 Preprocessor1_Model08
show_best(ranger_tune, metric = 'rsq')
## # A tibble: 5 x 8
##    mtry min_n .metric .estimator  mean     n std_err .config              
##   <int> <int> <chr>   <chr>      <dbl> <int>   <dbl> <chr>                
## 1     2     4 rsq     standard   0.714    25 0.00335 Preprocessor1_Model10
## 2     4    10 rsq     standard   0.705    25 0.00367 Preprocessor1_Model05
## 3     5     6 rsq     standard   0.704    25 0.00407 Preprocessor1_Model06
## 4     3    18 rsq     standard   0.698    25 0.00336 Preprocessor1_Model01
## 5     2    21 rsq     standard   0.694    25 0.00326 Preprocessor1_Model08

How did all the possible parameter combinations do?

autoplot(ranger_tune)
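
If you want more control than autoplot() gives, the same information is available from collect_metrics(); here is one hand-rolled sketch for RMSE:

# Plot RMSE across the tuning grid by hand
collect_metrics(ranger_tune) %>%
               filter(.metric == 'rmse') %>%
               pivot_longer(c(mtry, min_n), names_to = 'parameter') %>%
               ggplot(aes(value, mean, color = parameter)) +
               geom_point(show.legend = FALSE) +
               facet_wrap(~parameter, scales = 'free_x') +
               labs(x = NULL, y = 'RMSE')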

We can finalize our random forest workflow with the best performing parameters.

final_rf <- ranger_workflow %>% finalize_workflow(select_best(ranger_tune))
## Warning: No value of `metric` was given; metric 'rmse' will be used.
final_rf
## == Workflow ==================
## Preprocessor: Recipe
## Model: rand_forest()
## 
## -- Preprocessor --------------
## 3 Recipe Steps
## 
## * step_other()
## * step_clean_levels()
## * step_knnimpute()
## 
## -- Model ---------------------
## Random Forest Model Specification (regression)
## 
## Main Arguments:
##   mtry = 2
##   trees = 1000
##   min_n = 4
## 
## Computational engine: ranger

The function last_fit() fits this finalized random forest one last time to the training data and evaluates one last time on the testing data.

ikea_fit <- last_fit(final_rf, ikea_split)

ikea_fit
## # Resampling results
## # Manual resampling 
## # A tibble: 1 x 6
##   splits        id           .metrics      .notes      .predictions    .workflow
##   <list>        <chr>        <list>        <list>      <list>          <list>   
## 1 <split [2.8K~ train/test ~ <tibble [2 x~ <tibble [0~ <tibble [922 x~ <workflo~

The metrics in ikea_fit are computed using the testing data.

collect_metrics(ikea_fit)
## # A tibble: 2 x 4
##   .metric .estimator .estimate .config             
##   <chr>   <chr>          <dbl> <chr>               
## 1 rmse    standard       0.314 Preprocessor1_Model1
## 2 rsq     standard       0.769 Preprocessor1_Model1
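
As a sanity check, the same numbers can be recomputed directly from the test-set predictions with yardstick:

# RMSE recomputed by hand from the test-set predictions
collect_predictions(ikea_fit) %>%
               rmse(price, .pred)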

The predictions in ikea_fit are also for the testing data.

collect_predictions(ikea_fit) 
## # A tibble: 922 x 5
##    id               .pred  .row price .config             
##    <chr>            <dbl> <int> <dbl> <chr>               
##  1 train/test split  2.47    14  2.25 Preprocessor1_Model1
##  2 train/test split  2.89    16  2.84 Preprocessor1_Model1
##  3 train/test split  2.52    17  2.60 Preprocessor1_Model1
##  4 train/test split  2.96    24  2.90 Preprocessor1_Model1
##  5 train/test split  2.53    26  2.54 Preprocessor1_Model1
##  6 train/test split  2.52    28  2.84 Preprocessor1_Model1
##  7 train/test split  2.52    32  2.64 Preprocessor1_Model1
##  8 train/test split  2.71    48  2.60 Preprocessor1_Model1
##  9 train/test split  2.74    50  2.77 Preprocessor1_Model1
## 10 train/test split  3.38    51  3.32 Preprocessor1_Model1
## # ... with 912 more rows

We can visualize how these test-set predictions compare with the true prices.

collect_predictions(ikea_fit) %>%
               ggplot(aes(price, .pred)) +
               geom_abline(lty = 2, color = 'gray50') +
               geom_point(alpha = 0.5, color = 'midnightblue') +
               coord_fixed()

We can use the trained workflow from ikea_fit for prediction, or save it to use later.

predict(ikea_fit$.workflow[[1]], ikea_test[15, ])
## # A tibble: 1 x 1
##   .pred
##   <dbl>
## 1  2.71
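
To reuse the trained workflow in a later session, one option is saveRDS(); a minimal sketch (the file name here is just an example):

# Save the trained workflow to disk; the path is hypothetical
trained_wf <- ikea_fit$.workflow[[1]]
saveRDS(trained_wf, 'ikea_rf_workflow.rds')
# Later: trained_wf <- readRDS('ikea_rf_workflow.rds')
# predict(trained_wf, new_data)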

Lastly, let’s learn about feature importance for this model using the vip package. For a ranger model, we do need to go back to the model specification itself and update the engine with importance = "permutation" in order to compute feature importance. This means fitting the model one more time.

library(vip)
## 
## Attaching package: 'vip'
## The following object is masked from 'package:utils':
## 
##     vi
imp_spec <- ranger_spec %>% 
               finalize_model(select_best(ranger_tune)) %>%
               set_engine('ranger', importance = 'permutation')
## Warning: No value of `metric` was given; metric 'rmse' will be used.
workflow() %>% add_recipe(ranger_recipe) %>%
               add_model(imp_spec) %>%
               fit(ikea_train) %>%
               pull_workflow_fit() %>%
               vip(aesthetics = list(alpha = 0.8, fill = 'midnightblue'))
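
If you’d rather have the importance scores as a tibble than as a plot, vi() from the same package returns them directly. A sketch that stores the fitted workflow first (note this refits the model, duplicating the work above):

# Fit once, keep the parsnip fit, and extract raw importance scores
imp_fit <- workflow() %>%
               add_recipe(ranger_recipe) %>%
               add_model(imp_spec) %>%
               fit(ikea_train) %>%
               pull_workflow_fit()

vi(imp_fit)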