library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr 1.1.4 ✔ readr 2.1.6
## ✔ forcats 1.0.1 ✔ stringr 1.6.0
## ✔ ggplot2 4.0.2 ✔ tibble 3.3.1
## ✔ lubridate 1.9.4 ✔ tidyr 1.3.2
## ✔ purrr 1.2.1
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(correlationfunnel)
## ══ correlationfunnel Tip #2 ════════════════════════════════════════════════════
## Clean your NA's prior to using `binarize()`.
## Missing values and cleaning data are critical to getting great correlations. :)
library(recipes)
##
## Attaching package: 'recipes'
##
## The following object is masked from 'package:stringr':
##
## fixed
##
## The following object is masked from 'package:stats':
##
## step
library(tidymodels)
## ── Attaching packages ────────────────────────────────────── tidymodels 1.4.1 ──
## ✔ broom 1.0.12 ✔ tailor 0.1.0
## ✔ dials 1.4.2 ✔ tune 2.0.1
## ✔ infer 1.1.0 ✔ workflows 1.3.0
## ✔ modeldata 1.5.1 ✔ workflowsets 1.1.1
## ✔ parsnip 1.4.1 ✔ yardstick 1.3.2
## ✔ rsample 1.3.2
## ── Conflicts ───────────────────────────────────────── tidymodels_conflicts() ──
## ✖ scales::discard() masks purrr::discard()
## ✖ dplyr::filter() masks stats::filter()
## ✖ recipes::fixed() masks stringr::fixed()
## ✖ dplyr::lag() masks stats::lag()
## ✖ yardstick::spec() masks readr::spec()
## ✖ recipes::step() masks stats::step()
library(themis)
library(doParallel)
## Loading required package: foreach
##
## Attaching package: 'foreach'
##
## The following objects are masked from 'package:purrr':
##
## accumulate, when
##
## Loading required package: iterators
## Loading required package: parallel
# Load the TidyTuesday (2020-11-03) IKEA furniture data straight from GitHub.
data <- read_csv(
  file = "https://raw.githubusercontent.com/rfordatascience/tidytuesday/main/data/2020/2020-11-03/ikea.csv"
)
## New names:
## Rows: 3694 Columns: 14
## ── Column specification
## ──────────────────────────────────────────────────────── Delimiter: "," chr
## (7): name, category, old_price, link, other_colors, short_description, d... dbl
## (6): ...1, item_id, price, depth, height, width lgl (1): sellable_online
## ℹ Use `spec()` to retrieve the full column specification for this data. ℹ
## Specify the column types or set `show_col_types = FALSE` to quiet this message.
## • `` -> `...1`
Issues with data
Missing values
depth, height, width
Factors or numeric variables
category, sellable_online, other_colors, designer
Zero variance variables
none identified
Character variables
name, short_description, link
Unbalanced target variable
sellable_online (the modelled target) is heavily imbalanced, with very few "No" items; price distribution may also be skewed across categories
ID variable
item_id
# Cast the categorical columns to factors, give the logical outcome readable
# "No"/"Yes" labels, and drop the row index, identifiers and free-text columns.
data_clean <- data %>%
  mutate(
    across(c(category, other_colors, designer), as.factor),
    sellable_online = factor(
      sellable_online,
      levels = c(FALSE, TRUE),
      labels = c("No", "Yes")
    )
  ) %>%
  select(-c(`...1`, item_id, link, short_description, old_price, name))
# Number of items per furniture category.
count(data_clean, category)
## # A tibble: 17 × 2
## category n
## <fct> <int>
## 1 Bar furniture 47
## 2 Beds 208
## 3 Bookcases & shelving units 548
## 4 Cabinets & cupboards 292
## 5 Café furniture 26
## 6 Chairs 481
## 7 Chests of drawers & drawer units 125
## 8 Children's furniture 124
## 9 Nursery furniture 97
## 10 Outdoor furniture 216
## 11 Room dividers 13
## 12 Sideboards, buffets & console tables 23
## 13 Sofas & armchairs 428
## 14 Tables & desks 612
## 15 Trolleys 28
## 16 TV & media furniture 190
## 17 Wardrobes 236
# Number of items per category, drawn as horizontal bars.
ggplot(data_clean, aes(x = category)) +
  geom_bar() +
  coord_flip()
# Price distribution within each category, drawn as horizontal boxplots.
ggplot(data_clean, aes(x = category, y = price)) +
  geom_boxplot() +
  coord_flip()
# step 1: drop rows with missing dimensions -- correlationfunnel asks for NAs
# to be cleaned before calling binarize()
data_model <- data_clean %>%
  drop_na(depth, height, width)
# step 2: binarize -- numeric columns are cut into bins and categorical columns
# are expanded into 0/1 indicator columns (infrequent levels become `-OTHER`)
data_binarized <- data_model %>%
  binarize()
# Inspect the resulting indicator columns.
data_binarized %>% glimpse()
## Rows: 1,899
## Columns: 58
## $ category__Bar_furniture <dbl> 1, 1, 1, 1, 1, 1, 1, …
## $ category__Beds <dbl> 0, 0, 0, 0, 0, 0, 0, …
## $ `category__Bookcases_&_shelving_units` <dbl> 0, 0, 0, 0, 0, 0, 0, …
## $ `category__Cabinets_&_cupboards` <dbl> 0, 0, 0, 0, 0, 0, 0, …
## $ category__Chairs <dbl> 0, 0, 0, 0, 0, 0, 0, …
## $ `category__Chests_of_drawers_&_drawer_units` <dbl> 0, 0, 0, 0, 0, 0, 0, …
## $ `category__Children's_furniture` <dbl> 0, 0, 0, 0, 0, 0, 0, …
## $ category__Nursery_furniture <dbl> 0, 0, 0, 0, 0, 0, 0, …
## $ category__Outdoor_furniture <dbl> 0, 0, 0, 0, 0, 0, 0, …
## $ `category__Sideboards,_buffets_&_console_tables` <dbl> 0, 0, 0, 0, 0, 0, 0, …
## $ `category__Sofas_&_armchairs` <dbl> 0, 0, 0, 0, 0, 0, 0, …
## $ `category__Tables_&_desks` <dbl> 0, 0, 0, 0, 0, 0, 0, …
## $ `category__TV_&_media_furniture` <dbl> 0, 0, 0, 0, 0, 0, 0, …
## $ category__Wardrobes <dbl> 0, 0, 0, 0, 0, 0, 0, …
## $ `category__-OTHER` <dbl> 0, 0, 0, 0, 0, 0, 0, …
## $ `price__-Inf_295` <dbl> 1, 1, 0, 1, 1, 1, 0, …
## $ price__295_680 <dbl> 0, 0, 1, 0, 0, 0, 1, …
## $ price__680_1589 <dbl> 0, 0, 0, 0, 0, 0, 0, …
## $ price__1589_Inf <dbl> 0, 0, 0, 0, 0, 0, 0, …
## $ sellable_online__Yes <dbl> 1, 1, 1, 1, 1, 1, 1, …
## $ `sellable_online__-OTHER` <dbl> 0, 0, 0, 0, 0, 0, 0, …
## $ other_colors__No <dbl> 0, 1, 1, 1, 1, 1, 1, …
## $ other_colors__Yes <dbl> 1, 0, 0, 0, 0, 0, 0, …
## $ designer__Andreas_Fredriksson <dbl> 0, 0, 0, 0, 0, 0, 0, …
## $ designer__Carina_Bengs <dbl> 0, 0, 1, 0, 0, 0, 1, …
## $ designer__Carl_Öjerstam <dbl> 0, 0, 0, 0, 0, 0, 0, …
## $ designer__Ebba_Strandmark <dbl> 0, 0, 0, 0, 0, 0, 0, …
## $ designer__Ehlén_Johansson <dbl> 0, 0, 0, 0, 0, 0, 0, …
## $ `designer__Ehlén_Johansson/IKEA_of_Sweden` <dbl> 0, 0, 0, 0, 0, 0, 0, …
## $ designer__Eva_Lilja_Löwenhielm <dbl> 0, 0, 0, 0, 0, 0, 0, …
## $ designer__Francis_Cayouette <dbl> 0, 0, 0, 0, 0, 0, 0, …
## $ designer__Gillis_Lundgren <dbl> 0, 0, 0, 0, 0, 0, 0, …
## $ designer__Henrik_Preutz <dbl> 1, 0, 0, 0, 0, 0, 0, …
## $ designer__IKEA_of_Sweden <dbl> 0, 0, 0, 0, 0, 0, 0, …
## $ `designer__IKEA_of_Sweden/Ehlén_Johansson` <dbl> 0, 0, 0, 0, 0, 0, 0, …
## $ `designer__IKEA_of_Sweden/Jon_Karlsson` <dbl> 0, 0, 0, 0, 0, 0, 0, …
## $ designer__Johan_Kroon <dbl> 0, 0, 0, 0, 0, 0, 0, …
## $ designer__Jon_Karlsson <dbl> 0, 0, 0, 0, 0, 0, 0, …
## $ `designer__Jon_Karlsson/IKEA_of_Sweden` <dbl> 0, 0, 0, 0, 0, 0, 0, …
## $ `designer__K_Hagberg/M_Hagberg` <dbl> 0, 0, 0, 1, 1, 1, 0, …
## $ designer__Mia_Lagerman <dbl> 0, 0, 0, 0, 0, 0, 0, …
## $ designer__Nike_Karlsson <dbl> 0, 0, 0, 0, 0, 0, 0, …
## $ designer__Ola_Wihlborg <dbl> 0, 0, 0, 0, 0, 0, 0, …
## $ designer__Studio_Copenhagen <dbl> 0, 0, 0, 0, 0, 0, 0, …
## $ designer__Tord_Björklund <dbl> 0, 0, 0, 0, 0, 0, 0, …
## $ `designer__-OTHER` <dbl> 0, 1, 0, 0, 0, 0, 0, …
## $ `depth__-Inf_40` <dbl> 0, 0, 0, 0, 0, 0, 0, …
## $ depth__40_47 <dbl> 0, 0, 1, 1, 1, 1, 1, …
## $ depth__47_60 <dbl> 1, 1, 0, 0, 0, 0, 0, …
## $ depth__60_Inf <dbl> 0, 0, 0, 0, 0, 0, 0, …
## $ `height__-Inf_71` <dbl> 0, 1, 0, 0, 0, 0, 0, …
## $ height__71_92 <dbl> 0, 0, 1, 0, 0, 0, 0, …
## $ height__92_171 <dbl> 1, 0, 0, 1, 1, 1, 1, …
## $ height__171_Inf <dbl> 0, 0, 0, 0, 0, 0, 0, …
## $ `width__-Inf_60` <dbl> 1, 0, 1, 1, 1, 1, 1, …
## $ width__60_93 <dbl> 0, 1, 0, 0, 0, 0, 0, …
## $ width__93_161.5 <dbl> 0, 0, 0, 0, 0, 0, 0, …
## $ width__161.5_Inf <dbl> 0, 0, 0, 0, 0, 0, 0, …
# step 3: correlate every indicator column with the "sellable online = Yes" flag
data_correlation <- correlate(data_binarized, target = sellable_online__Yes)
# step 4: draw the correlation funnel
correlationfunnel::plot_correlation_funnel(data_correlation)
## Warning: The `size` argument of `element_line()` is deprecated as of ggplot2 3.4.0.
## ℹ Please use the `linewidth` argument instead.
## ℹ The deprecated feature was likely used in the correlationfunnel package.
## Please report the issue at
## <https://github.com/business-science/correlationfunnel/issues>.
## This warning is displayed once per session.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.
## Warning: The `size` argument of `element_rect()` is deprecated as of ggplot2 3.4.0.
## ℹ Please use the `linewidth` argument instead.
## ℹ The deprecated feature was likely used in the correlationfunnel package.
## Please report the issue at
## <https://github.com/business-science/correlationfunnel/issues>.
## This warning is displayed once per session.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.
## Warning: ggrepel: 38 unlabeled data points (too many overlaps). Consider
## increasing max.overlaps
# Rebuild the modelling table from the raw data: categorical columns become
# factors, the logical outcome gets "No"/"Yes" labels, and the row index,
# identifier, link, description and old price are removed. No rows are dropped
# here.
# NOTE(review): `name` is removed in the earlier `data_clean` but kept as a
# predictor here -- confirm this is intentional.
data_clean <- data %>%
  mutate(
    across(c(category, other_colors, designer), as.factor),
    sellable_online = factor(
      sellable_online,
      levels = c(FALSE, TRUE),
      labels = c("No", "Yes")
    )
  ) %>%
  select(-c(`...1`, item_id, link, short_description, old_price))
# Split data into training/test sets and build cross-validation folds.
# The "No" class of `sellable_online` is rare. With rsample's default
# `pool = 0.1` a stratum that small is pooled into the other one, so the
# resampling is effectively unstratified and some folds can end up with no
# "No" rows at all (yardstick then warns "No event observations were
# detected"). A lower `pool` keeps the rare class as its own stratum; rsample
# warns that stratifying on such a small group is statistically risky, which is
# accepted here so that every fold contains both classes.
set.seed(1234)
data_split <- initial_split(data_clean, strata = sellable_online, pool = 0.001)
data_train <- training(data_split)
data_test <- testing(data_split)
data_cv <- vfold_cv(data_train, strata = sellable_online, pool = 0.001)
# Preprocessing for the boosted-tree model: `sellable_online` is the outcome and
# every other column of the training set is a predictor.
xgboost_recipe <- recipe(sellable_online ~ ., data = data_train) %>%
  # fill missing numeric predictors with the median
  step_impute_median(all_numeric_predictors()) %>%
  # reserve a factor level for categories not seen during training
  step_novel(all_nominal_predictors()) %>%
  # give missing categorical values their own level
  step_unknown(all_nominal_predictors()) %>%
  # convert categorical predictors to numeric indicator columns
  step_dummy(all_nominal_predictors()) %>%
  # oversample the minority outcome class with SMOTE
  step_smote(sellable_online)
# Boosted-tree classifier on the xgboost engine; six hyperparameters are left
# as tune() placeholders to be filled in by the grid search.
xgboost_spec <- boost_tree(
  mode = "classification",
  engine = "xgboost",
  trees = tune(),
  min_n = tune(),
  tree_depth = tune(),
  learn_rate = tune(),
  loss_reduction = tune(),
  sample_size = tune()
)
# Bundle the preprocessing recipe and the model specification into one workflow.
xgboost_workflow <- workflow(
  preprocessor = xgboost_recipe,
  spec = xgboost_spec
)
# NOTE(review): recent tune releases moved parallel processing from foreach to
# future/mirai, so this foreach backend may no longer be picked up by
# tune_grid() -- confirm against the installed tune version.
doParallel::registerDoParallel()
set.seed(65743)
# Evaluate 5 automatically generated hyperparameter candidates on every
# cross-validation fold, scoring accuracy and ROC AUC and keeping the
# out-of-fold predictions for later plots.
xgboost_tune <- xgboost_workflow %>%
  tune_grid(
    resamples = data_cv,
    grid = 5,
    metrics = metric_set(accuracy, roc_auc),
    control = control_grid(save_pred = TRUE)
  )
## → A | warning: No event observations were detected in `truth` with event level 'No'.
## There were issues with some computations A: x5There were issues with some computations A: x5
# ROC curves from the out-of-fold tuning predictions, one per candidate.
# yardstick treats the first factor level ("No") as the event by default, so
# the matching probability column is `.pred_No`; passing `.pred_Yes` scores the
# wrong class and flips the curve. Grouping by `.config` keeps the different
# hyperparameter candidates from being pooled into a single curve.
collect_predictions(xgboost_tune) %>%
  group_by(.config) %>%
  roc_curve(sellable_online, .pred_No) %>%
  autoplot()
# Plug the best candidate (by accuracy) into the workflow, fit it on the
# training portion of the split and evaluate it once on the test portion.
# NOTE(review): the outcome classes look heavily imbalanced, so accuracy may
# barely separate the candidates -- consider selecting on "roc_auc" instead.
xgboost_last <- last_fit(
  finalize_workflow(
    xgboost_workflow,
    select_best(xgboost_tune, metric = "accuracy")
  ),
  split = data_split
)
# Test-set metrics of the final fit.
collect_metrics(xgboost_last)
## # A tibble: 3 × 4
## .metric .estimator .estimate .config
## <chr> <chr> <dbl> <chr>
## 1 accuracy binary 0.995 pre0_mod0_post0
## 2 roc_auc binary 0.953 pre0_mod0_post0
## 3 brier_class binary 0.00375 pre0_mod0_post0
# Confusion matrix of the test-set class predictions from the final fit.
autoplot(
  yardstick::conf_mat(
    collect_predictions(xgboost_last),
    truth = sellable_online,
    estimate = .pred_class
  )
)
library(vip)
##
## Attaching package: 'vip'
## The following object is masked from 'package:utils':
##
## vi
# Variable importance of the final model.
# last_fit() has already fitted the finalized workflow on the training set, so
# that fitted workflow is reused here. Refitting with fit() repeated the
# training and, because no seed was set before it, may have produced a model
# that differs from the one whose test metrics and confusion matrix are
# reported above.
xgboost_fit <- extract_workflow(xgboost_last)
xgboost_fit %>%
  extract_fit_parsnip() %>%
  vip()