Import Data

chocolate <- readr::read_csv('https://raw.githubusercontent.com/rfordatascience/tidytuesday/master/data/2022/2022-01-18/chocolate.csv')
## Rows: 2530 Columns: 10
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (7): company_manufacturer, company_location, country_of_bean_origin, spe...
## dbl (3): ref, review_date, rating
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
skimr::skim(chocolate)
Data summary
Name chocolate
Number of rows 2530
Number of columns 10
_______________________
Column type frequency:
character 7
numeric 3
________________________
Group variables None

Variable type: character

skim_variable n_missing complete_rate min max empty n_unique whitespace
company_manufacturer 0 1.00 2 39 0 580 0
company_location 0 1.00 4 21 0 67 0
country_of_bean_origin 0 1.00 4 21 0 62 0
specific_bean_origin_or_bar_name 0 1.00 3 51 0 1605 0
cocoa_percent 0 1.00 3 6 0 46 0
ingredients 87 0.97 4 14 0 21 0
most_memorable_characteristics 0 1.00 3 37 0 2487 0

Variable type: numeric

skim_variable n_missing complete_rate mean sd p0 p25 p50 p75 p100 hist
ref 0 1 1429.80 757.65 5 802 1454.00 2079.0 2712 ▆▇▇▇▇
review_date 0 1 2014.37 3.97 2006 2012 2015.00 2018.0 2021 ▃▅▇▆▅
rating 0 1 3.20 0.45 1 3 3.25 3.5 4 ▁▁▅▇▇
data <- chocolate %>%
  
  separate_rows(most_memorable_characteristics) %>% 
  
  separate_rows(specific_bean_origin_or_bar_name) %>%
  
  # Treat missing values 
  select(-specific_bean_origin_or_bar_name, -ref) %>%
  na.omit() %>%
  
  # Log Transform Variables with pos-skewed Distribution
  mutate(rating = log(rating))
# Step 1: Prepare data 
data_binarized_tbl <- data %>%
  select(-review_date) %>%
  binarize()

data_binarized_tbl %>% glimpse()
## Rows: 23,520
## Columns: 95
## $ company_manufacturer__Bonnat               <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ company_manufacturer__Castronovo           <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ company_manufacturer__Dandelion            <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ company_manufacturer__Fresco               <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ company_manufacturer__Soma                 <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ `company_manufacturer__-OTHER`             <dbl> 1, 1, 1, 1, 1, 1, 1, 1, 1, …
## $ company_location__Australia                <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ company_location__Austria                  <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ company_location__Belgium                  <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ company_location__Brazil                   <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ company_location__Canada                   <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ company_location__Denmark                  <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ company_location__Ecuador                  <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ company_location__France                   <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ company_location__Germany                  <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ company_location__Italy                    <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ company_location__New_Zealand              <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ company_location__Switzerland              <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ company_location__U.K.                     <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ company_location__U.S.A.                   <dbl> 1, 1, 1, 1, 1, 1, 1, 1, 1, …
## $ company_location__Venezuela                <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ `company_location__-OTHER`                 <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ country_of_bean_origin__Belize             <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ country_of_bean_origin__Blend              <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ country_of_bean_origin__Bolivia            <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ country_of_bean_origin__Brazil             <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ country_of_bean_origin__Colombia           <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ country_of_bean_origin__Costa_Rica         <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ country_of_bean_origin__Dominican_Republic <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ country_of_bean_origin__Ecuador            <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ country_of_bean_origin__Ghana              <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ country_of_bean_origin__Guatemala          <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ country_of_bean_origin__India              <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ country_of_bean_origin__Jamaica            <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ country_of_bean_origin__Madagascar         <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ country_of_bean_origin__Mexico             <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ country_of_bean_origin__Nicaragua          <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ country_of_bean_origin__Papua_New_Guinea   <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ country_of_bean_origin__Peru               <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ country_of_bean_origin__Philippines        <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ country_of_bean_origin__Tanzania           <dbl> 1, 1, 1, 1, 1, 1, 1, 1, 1, …
## $ country_of_bean_origin__Trinidad           <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ country_of_bean_origin__U.S.A.             <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ country_of_bean_origin__Venezuela          <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ country_of_bean_origin__Vietnam            <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ `country_of_bean_origin__-OTHER`           <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ `cocoa_percent__60%`                       <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ `cocoa_percent__64%`                       <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ `cocoa_percent__65%`                       <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ `cocoa_percent__67%`                       <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ `cocoa_percent__68%`                       <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ `cocoa_percent__70%`                       <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ `cocoa_percent__71%`                       <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ `cocoa_percent__72%`                       <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ `cocoa_percent__73%`                       <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ `cocoa_percent__74%`                       <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ `cocoa_percent__75%`                       <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ `cocoa_percent__76%`                       <dbl> 1, 1, 1, 1, 1, 1, 1, 1, 1, …
## $ `cocoa_percent__77%`                       <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ `cocoa_percent__80%`                       <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ `cocoa_percent__85%`                       <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ `cocoa_percent__-OTHER`                    <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ `ingredients__2-_B,S`                      <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ `ingredients__2-_B,S*`                     <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ `ingredients__3-_B,S,C`                    <dbl> 1, 1, 1, 1, 1, 1, 1, 1, 1, …
## $ `ingredients__4-_B,S,C,L`                  <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ `ingredients__4-_B,S,C,V`                  <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ `ingredients__5-_B,S,C,V,L`                <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ `ingredients__-OTHER`                      <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ most_memorable_characteristics__bitter     <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ most_memorable_characteristics__cocoa      <dbl> 0, 0, 0, 0, 1, 1, 1, 1, 0, …
## $ most_memorable_characteristics__creamy     <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ most_memorable_characteristics__dried      <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ most_memorable_characteristics__earthy     <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ most_memorable_characteristics__fatty      <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 1, …
## $ most_memorable_characteristics__floral     <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ most_memorable_characteristics__fruit      <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ most_memorable_characteristics__intense    <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ most_memorable_characteristics__mild       <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ most_memorable_characteristics__molasses   <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ most_memorable_characteristics__nutty      <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ most_memorable_characteristics__off        <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ most_memorable_characteristics__rich       <dbl> 1, 1, 1, 1, 0, 0, 0, 0, 0, …
## $ most_memorable_characteristics__roasty     <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ most_memorable_characteristics__sandy      <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ most_memorable_characteristics__sour       <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ most_memorable_characteristics__spicy      <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ most_memorable_characteristics__sweet      <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ most_memorable_characteristics__tart       <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ most_memorable_characteristics__woody      <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ `most_memorable_characteristics__-OTHER`   <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ `rating__-Inf_1.09861228866811`            <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ rating__1.09861228866811_1.17865499634165  <dbl> 1, 1, 1, 1, 1, 1, 1, 1, 1, …
## $ rating__1.17865499634165_1.25276296849537  <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ rating__1.25276296849537_Inf               <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, …
# Step 2: Correlate 
data_corr_tbl <- data_binarized_tbl %>%
  correlate(rating__1.25276296849537_Inf)

data_corr_tbl
## # A tibble: 95 × 3
##    feature                        bin                               correlation
##    <fct>                          <chr>                                   <dbl>
##  1 rating                         1.25276296849537_Inf                   1     
##  2 rating                         -Inf_1.09861228866811                 -0.384 
##  3 rating                         1.17865499634165_1.25276296849537     -0.270 
##  4 rating                         1.09861228866811_1.17865499634165     -0.237 
##  5 company_manufacturer           -OTHER                                -0.153 
##  6 company_manufacturer           Soma                                   0.149 
##  7 most_memorable_characteristics creamy                                 0.104 
##  8 company_manufacturer           Bonnat                                 0.0903
##  9 cocoa_percent                  67%                                    0.0897
## 10 company_location               Canada                                 0.0808
## # … with 85 more rows
# Step 3: Plot 
data_corr_tbl %>%
  plot_correlation_funnel()
## Warning: ggrepel: 85 unlabeled data points (too many overlaps). Consider
## increasing max.overlaps