Import data

library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr     1.1.4     ✔ readr     2.1.5
## ✔ forcats   1.0.0     ✔ stringr   1.5.2
## ✔ ggplot2   4.0.0     ✔ tibble    3.3.0
## ✔ lubridate 1.9.4     ✔ tidyr     1.3.1
## ✔ purrr     1.1.0     
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
suppressWarnings(library(correlationfunnel))
## ══ correlationfunnel Tip #3 ════════════════════════════════════════════════════
## Using `binarize()` with data containing many columns or many rows can increase dimensionality substantially.
## Try subsetting your data column-wise or row-wise to avoid creating too many columns.
## You can always make a big problem smaller by sampling. :)
data <- readr::read_csv('https://raw.githubusercontent.com/rfordatascience/tidytuesday/main/data/2020/2020-11-03/ikea.csv')
## New names:
## Rows: 3694 Columns: 14
## ── Column specification
## ──────────────────────────────────────────────────────── Delimiter: "," chr
## (7): name, category, old_price, link, other_colors, short_description, d... dbl
## (6): ...1, item_id, price, depth, height, width lgl (1): sellable_online
## ℹ Use `spec()` to retrieve the full column specification for this data. ℹ
## Specify the column types or set `show_col_types = FALSE` to quiet this message.
## • `` -> `...1`
skimr::skim(data)
Data summary
Name data
Number of rows 3694
Number of columns 14
_______________________
Column type frequency:
character 7
logical 1
numeric 6
________________________
Group variables None

Variable type: character

skim_variable n_missing complete_rate min max empty n_unique whitespace
name 0 1 3 27 0 607 0
category 0 1 4 36 0 17 0
old_price 0 1 4 13 0 365 0
link 0 1 52 163 0 2962 0
other_colors 0 1 2 3 0 2 0
short_description 0 1 3 63 0 1706 0
designer 0 1 3 1261 0 381 0

Variable type: logical

skim_variable n_missing complete_rate mean count
sellable_online 0 1 0.99 TRU: 3666, FAL: 28

Variable type: numeric

skim_variable n_missing complete_rate mean sd p0 p25 p50 p75 p100 hist
…1 0 1.00 1846.50 1066.51 0 923.25 1846.5 2769.75 3693 ▇▇▇▇▇
item_id 0 1.00 48632396.79 28887094.10 58487 20390574.00 49288078.0 70403572.75 99932615 ▇▇▇▇▇
price 0 1.00 1078.21 1374.65 3 180.90 544.7 1429.50 9585 ▇▁▁▁▁
depth 1463 0.60 54.38 29.96 1 38.00 47.0 60.00 257 ▇▃▁▁▁
height 988 0.73 101.68 61.10 1 67.00 83.0 124.00 700 ▇▂▁▁▁
width 589 0.84 104.47 71.13 1 60.00 80.0 140.00 420 ▇▅▂▁▁
data_clean <- data %>%

    mutate(across(is.logical, as.factor)) %>%
    select(-old_price, -link, -...1) %>%
    na.omit() %>%
    
mutate(price = log(price))
## Warning: There was 1 warning in `mutate()`.
## ℹ In argument: `across(is.logical, as.factor)`.
## Caused by warning:
## ! Use of bare predicate functions was deprecated in tidyselect 1.1.0.
## ℹ Please use wrap predicates in `where()` instead.
##   # Was:
##   data %>% select(is.logical)
## 
##   # Now:
##   data %>% select(where(is.logical))

Explore data

data_clean %>% count(sellable_online)
## # A tibble: 2 × 2
##   sellable_online     n
##   <fct>           <int>
## 1 FALSE              13
## 2 TRUE             1886
data_clean %>%
    ggplot(aes(sellable_online)) +
    geom_bar()

sellable_online vs price

data_clean %>% 
  ggplot(aes(sellable_online, price)) + 
  geom_boxplot()

Correlation plot

# Step 1: Binarize
data_binarized <- data_clean %>%
  select(-item_id) %>%
  binarize()

data_binarized %>% glimpse()
## Rows: 1,899
## Columns: 88
## $ name__ALGOT                                           <dbl> 0, 0, 0, 0, 0, 0…
## $ name__BEKANT                                          <dbl> 0, 0, 0, 0, 0, 0…
## $ name__BESTÅ                                           <dbl> 0, 0, 0, 0, 0, 0…
## $ `name__BILLY_/_OXBERG`                                <dbl> 0, 0, 0, 0, 0, 0…
## $ name__BRIMNES                                         <dbl> 0, 0, 0, 0, 0, 0…
## $ name__BROR                                            <dbl> 0, 0, 0, 0, 0, 0…
## $ name__EKET                                            <dbl> 0, 0, 0, 0, 0, 0…
## $ name__GRÖNLID                                         <dbl> 0, 0, 0, 0, 0, 0…
## $ name__HAVSTA                                          <dbl> 0, 0, 0, 0, 0, 0…
## $ name__HAVSTEN                                         <dbl> 0, 0, 0, 0, 0, 0…
## $ name__HEMNES                                          <dbl> 0, 0, 0, 0, 0, 0…
## $ name__IVAR                                            <dbl> 0, 0, 0, 0, 0, 0…
## $ name__JONAXEL                                         <dbl> 0, 0, 0, 0, 0, 0…
## $ name__KALLAX                                          <dbl> 0, 0, 0, 0, 0, 0…
## $ name__LIDHULT                                         <dbl> 0, 0, 0, 0, 0, 0…
## $ name__LIXHULT                                         <dbl> 0, 0, 0, 0, 0, 0…
## $ name__NORDLI                                          <dbl> 0, 0, 0, 0, 0, 0…
## $ name__PAX                                             <dbl> 0, 0, 0, 0, 0, 0…
## $ name__PLATSA                                          <dbl> 0, 0, 0, 0, 0, 0…
## $ `name__STUVA_/_FRITIDS`                               <dbl> 0, 0, 0, 0, 0, 0…
## $ name__TROFAST                                         <dbl> 0, 0, 0, 0, 0, 0…
## $ name__VALLENTUNA                                      <dbl> 0, 0, 0, 0, 0, 0…
## $ name__VIMLE                                           <dbl> 0, 0, 0, 0, 0, 0…
## $ `name__-OTHER`                                        <dbl> 1, 1, 1, 1, 1, 1…
## $ category__Bar_furniture                               <dbl> 1, 1, 1, 1, 1, 1…
## $ category__Beds                                        <dbl> 0, 0, 0, 0, 0, 0…
## $ `category__Bookcases_&_shelving_units`                <dbl> 0, 0, 0, 0, 0, 0…
## $ `category__Cabinets_&_cupboards`                      <dbl> 0, 0, 0, 0, 0, 0…
## $ category__Chairs                                      <dbl> 0, 0, 0, 0, 0, 0…
## $ `category__Chests_of_drawers_&_drawer_units`          <dbl> 0, 0, 0, 0, 0, 0…
## $ `category__Children's_furniture`                      <dbl> 0, 0, 0, 0, 0, 0…
## $ category__Nursery_furniture                           <dbl> 0, 0, 0, 0, 0, 0…
## $ category__Outdoor_furniture                           <dbl> 0, 0, 0, 0, 0, 0…
## $ `category__Sideboards,_buffets_&_console_tables`      <dbl> 0, 0, 0, 0, 0, 0…
## $ `category__Sofas_&_armchairs`                         <dbl> 0, 0, 0, 0, 0, 0…
## $ `category__Tables_&_desks`                            <dbl> 0, 0, 0, 0, 0, 0…
## $ `category__TV_&_media_furniture`                      <dbl> 0, 0, 0, 0, 0, 0…
## $ category__Wardrobes                                   <dbl> 0, 0, 0, 0, 0, 0…
## $ `category__-OTHER`                                    <dbl> 0, 0, 0, 0, 0, 0…
## $ `price__-Inf_5.68697535633982`                        <dbl> 1, 1, 0, 1, 1, 1…
## $ price__5.68697535633982_6.52209279817015              <dbl> 0, 0, 1, 0, 0, 0…
## $ price__6.52209279817015_7.37085996851068              <dbl> 0, 0, 0, 0, 0, 0…
## $ price__7.37085996851068_Inf                           <dbl> 0, 0, 0, 0, 0, 0…
## $ sellable_online__TRUE                                 <dbl> 1, 1, 1, 1, 1, 1…
## $ `sellable_online__-OTHER`                             <dbl> 0, 0, 0, 0, 0, 0…
## $ other_colors__No                                      <dbl> 0, 1, 1, 1, 1, 1…
## $ other_colors__Yes                                     <dbl> 1, 0, 0, 0, 0, 0…
## $ `short_description__3-seat_sofa`                      <dbl> 0, 0, 0, 0, 0, 0…
## $ `short_description__3-seat_sofa-bed`                  <dbl> 0, 0, 0, 0, 0, 0…
## $ short_description__Armchair                           <dbl> 0, 0, 0, 0, 0, 0…
## $ short_description__Chair                              <dbl> 0, 0, 0, 0, 0, 0…
## $ `short_description__Wardrobe,__________150x66x236_cm` <dbl> 0, 0, 0, 0, 0, 0…
## $ `short_description__-OTHER`                           <dbl> 1, 1, 1, 1, 1, 1…
## $ designer__Andreas_Fredriksson                         <dbl> 0, 0, 0, 0, 0, 0…
## $ designer__Carina_Bengs                                <dbl> 0, 0, 1, 0, 0, 0…
## $ designer__Carl_Öjerstam                               <dbl> 0, 0, 0, 0, 0, 0…
## $ designer__Ebba_Strandmark                             <dbl> 0, 0, 0, 0, 0, 0…
## $ designer__Ehlén_Johansson                             <dbl> 0, 0, 0, 0, 0, 0…
## $ `designer__Ehlén_Johansson/IKEA_of_Sweden`            <dbl> 0, 0, 0, 0, 0, 0…
## $ designer__Eva_Lilja_Löwenhielm                        <dbl> 0, 0, 0, 0, 0, 0…
## $ designer__Francis_Cayouette                           <dbl> 0, 0, 0, 0, 0, 0…
## $ designer__Gillis_Lundgren                             <dbl> 0, 0, 0, 0, 0, 0…
## $ designer__Henrik_Preutz                               <dbl> 1, 0, 0, 0, 0, 0…
## $ designer__IKEA_of_Sweden                              <dbl> 0, 0, 0, 0, 0, 0…
## $ `designer__IKEA_of_Sweden/Ehlén_Johansson`            <dbl> 0, 0, 0, 0, 0, 0…
## $ `designer__IKEA_of_Sweden/Jon_Karlsson`               <dbl> 0, 0, 0, 0, 0, 0…
## $ designer__Johan_Kroon                                 <dbl> 0, 0, 0, 0, 0, 0…
## $ designer__Jon_Karlsson                                <dbl> 0, 0, 0, 0, 0, 0…
## $ `designer__Jon_Karlsson/IKEA_of_Sweden`               <dbl> 0, 0, 0, 0, 0, 0…
## $ `designer__K_Hagberg/M_Hagberg`                       <dbl> 0, 0, 0, 1, 1, 1…
## $ designer__Mia_Lagerman                                <dbl> 0, 0, 0, 0, 0, 0…
## $ designer__Nike_Karlsson                               <dbl> 0, 0, 0, 0, 0, 0…
## $ designer__Ola_Wihlborg                                <dbl> 0, 0, 0, 0, 0, 0…
## $ designer__Studio_Copenhagen                           <dbl> 0, 0, 0, 0, 0, 0…
## $ designer__Tord_Björklund                              <dbl> 0, 0, 0, 0, 0, 0…
## $ `designer__-OTHER`                                    <dbl> 0, 1, 0, 0, 0, 0…
## $ `depth__-Inf_40`                                      <dbl> 0, 0, 0, 0, 0, 0…
## $ depth__40_47                                          <dbl> 0, 0, 1, 1, 1, 1…
## $ depth__47_60                                          <dbl> 1, 1, 0, 0, 0, 0…
## $ depth__60_Inf                                         <dbl> 0, 0, 0, 0, 0, 0…
## $ `height__-Inf_71`                                     <dbl> 0, 1, 0, 0, 0, 0…
## $ height__71_92                                         <dbl> 0, 0, 1, 0, 0, 0…
## $ height__92_171                                        <dbl> 1, 0, 0, 1, 1, 1…
## $ height__171_Inf                                       <dbl> 0, 0, 0, 0, 0, 0…
## $ `width__-Inf_60`                                      <dbl> 1, 0, 1, 1, 1, 1…
## $ width__60_93                                          <dbl> 0, 1, 0, 0, 0, 0…
## $ width__93_161.5                                       <dbl> 0, 0, 0, 0, 0, 0…
## $ width__161.5_Inf                                      <dbl> 0, 0, 0, 0, 0, 0…
# Step 2: Correlation 
data_correlation <- data_binarized %>%
  correlate(sellable_online__TRUE)
## Warning: Expected 2 pieces. Additional pieces discarded in 1 rows [52].
data_correlation
## # A tibble: 88 × 3
##    feature         bin                   correlation
##    <fct>           <chr>                       <dbl>
##  1 sellable_online TRUE                       1     
##  2 sellable_online -OTHER                    -1     
##  3 name            TROFAST                   -0.332 
##  4 category        Children's_furniture      -0.144 
##  5 price           -Inf_5.68697535633982     -0.143 
##  6 category        Nursery_furniture         -0.128 
##  7 width           -Inf_60                   -0.128 
##  8 designer        Studio_Copenhagen         -0.112 
##  9 depth           -Inf_40                   -0.0806
## 10 designer        Francis_Cayouette         -0.0573
## # ℹ 78 more rows
# Step 3: Plot
data_correlation %>%
    correlationfunnel::plot_correlation_funnel()
## Warning: The `size` argument of `element_line()` is deprecated as of ggplot2 3.4.0.
## ℹ Please use the `linewidth` argument instead.
## ℹ The deprecated feature was likely used in the correlationfunnel package.
##   Please report the issue at
##   <https://github.com/business-science/correlationfunnel/issues>.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.
## Warning: The `size` argument of `element_rect()` is deprecated as of ggplot2 3.4.0.
## ℹ Please use the `linewidth` argument instead.
## ℹ The deprecated feature was likely used in the correlationfunnel package.
##   Please report the issue at
##   <https://github.com/business-science/correlationfunnel/issues>.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.
## Warning: ggrepel: 67 unlabeled data points (too many overlaps). Consider
## increasing max.overlaps