Import Data
chocolate <- readr::read_csv('https://raw.githubusercontent.com/rfordatascience/tidytuesday/master/data/2022/2022-01-18/chocolate.csv')
## Rows: 2530 Columns: 10
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (7): company_manufacturer, company_location, country_of_bean_origin, spe...
## dbl (3): ref, review_date, rating
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
skimr::skim(chocolate)
Data summary
Name |
chocolate |
Number of rows |
2530 |
Number of columns |
10 |
_______________________ |
|
Column type frequency: |
|
character |
7 |
numeric |
3 |
________________________ |
|
Group variables |
None |
Variable type: character
company_manufacturer |
0 |
1.00 |
2 |
39 |
0 |
580 |
0 |
company_location |
0 |
1.00 |
4 |
21 |
0 |
67 |
0 |
country_of_bean_origin |
0 |
1.00 |
4 |
21 |
0 |
62 |
0 |
specific_bean_origin_or_bar_name |
0 |
1.00 |
3 |
51 |
0 |
1605 |
0 |
cocoa_percent |
0 |
1.00 |
3 |
6 |
0 |
46 |
0 |
ingredients |
87 |
0.97 |
4 |
14 |
0 |
21 |
0 |
most_memorable_characteristics |
0 |
1.00 |
3 |
37 |
0 |
2487 |
0 |
Variable type: numeric
ref |
0 |
1 |
1429.80 |
757.65 |
5 |
802 |
1454.00 |
2079.0 |
2712 |
▆▇▇▇▇ |
review_date |
0 |
1 |
2014.37 |
3.97 |
2006 |
2012 |
2015.00 |
2018.0 |
2021 |
▃▅▇▆▅ |
rating |
0 |
1 |
3.20 |
0.45 |
1 |
3 |
3.25 |
3.5 |
4 |
▁▁▅▇▇ |
data <- chocolate %>%
separate_rows(most_memorable_characteristics) %>%
separate_rows(specific_bean_origin_or_bar_name) %>%
# Treat missing values
select(-specific_bean_origin_or_bar_name, -ref) %>%
na.omit() %>%
# Log Transform Variables with pos-skewed Distribution
mutate(rating = log(rating))
# Step 1: Prepare data
data_binarized_tbl <- data %>%
select(-review_date) %>%
binarize()
data_binarized_tbl %>% glimpse()
## Rows: 23,520
## Columns: 95
## $ company_manufacturer__Bonnat <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ company_manufacturer__Castronovo <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ company_manufacturer__Dandelion <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ company_manufacturer__Fresco <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ company_manufacturer__Soma <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ `company_manufacturer__-OTHER` <dbl> 1, 1, 1, 1, 1, 1, 1, 1, 1, …
## $ company_location__Australia <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ company_location__Austria <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ company_location__Belgium <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ company_location__Brazil <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ company_location__Canada <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ company_location__Denmark <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ company_location__Ecuador <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ company_location__France <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ company_location__Germany <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ company_location__Italy <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ company_location__New_Zealand <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ company_location__Switzerland <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ company_location__U.K. <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ company_location__U.S.A. <dbl> 1, 1, 1, 1, 1, 1, 1, 1, 1, …
## $ company_location__Venezuela <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ `company_location__-OTHER` <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ country_of_bean_origin__Belize <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ country_of_bean_origin__Blend <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ country_of_bean_origin__Bolivia <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ country_of_bean_origin__Brazil <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ country_of_bean_origin__Colombia <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ country_of_bean_origin__Costa_Rica <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ country_of_bean_origin__Dominican_Republic <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ country_of_bean_origin__Ecuador <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ country_of_bean_origin__Ghana <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ country_of_bean_origin__Guatemala <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ country_of_bean_origin__India <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ country_of_bean_origin__Jamaica <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ country_of_bean_origin__Madagascar <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ country_of_bean_origin__Mexico <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ country_of_bean_origin__Nicaragua <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ country_of_bean_origin__Papua_New_Guinea <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ country_of_bean_origin__Peru <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ country_of_bean_origin__Philippines <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ country_of_bean_origin__Tanzania <dbl> 1, 1, 1, 1, 1, 1, 1, 1, 1, …
## $ country_of_bean_origin__Trinidad <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ country_of_bean_origin__U.S.A. <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ country_of_bean_origin__Venezuela <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ country_of_bean_origin__Vietnam <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ `country_of_bean_origin__-OTHER` <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ `cocoa_percent__60%` <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ `cocoa_percent__64%` <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ `cocoa_percent__65%` <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ `cocoa_percent__67%` <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ `cocoa_percent__68%` <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ `cocoa_percent__70%` <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ `cocoa_percent__71%` <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ `cocoa_percent__72%` <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ `cocoa_percent__73%` <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ `cocoa_percent__74%` <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ `cocoa_percent__75%` <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ `cocoa_percent__76%` <dbl> 1, 1, 1, 1, 1, 1, 1, 1, 1, …
## $ `cocoa_percent__77%` <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ `cocoa_percent__80%` <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ `cocoa_percent__85%` <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ `cocoa_percent__-OTHER` <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ `ingredients__2-_B,S` <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ `ingredients__2-_B,S*` <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ `ingredients__3-_B,S,C` <dbl> 1, 1, 1, 1, 1, 1, 1, 1, 1, …
## $ `ingredients__4-_B,S,C,L` <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ `ingredients__4-_B,S,C,V` <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ `ingredients__5-_B,S,C,V,L` <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ `ingredients__-OTHER` <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ most_memorable_characteristics__bitter <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ most_memorable_characteristics__cocoa <dbl> 0, 0, 0, 0, 1, 1, 1, 1, 0, …
## $ most_memorable_characteristics__creamy <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ most_memorable_characteristics__dried <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ most_memorable_characteristics__earthy <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ most_memorable_characteristics__fatty <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 1, …
## $ most_memorable_characteristics__floral <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ most_memorable_characteristics__fruit <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ most_memorable_characteristics__intense <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ most_memorable_characteristics__mild <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ most_memorable_characteristics__molasses <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ most_memorable_characteristics__nutty <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ most_memorable_characteristics__off <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ most_memorable_characteristics__rich <dbl> 1, 1, 1, 1, 0, 0, 0, 0, 0, …
## $ most_memorable_characteristics__roasty <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ most_memorable_characteristics__sandy <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ most_memorable_characteristics__sour <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ most_memorable_characteristics__spicy <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ most_memorable_characteristics__sweet <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ most_memorable_characteristics__tart <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ most_memorable_characteristics__woody <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ `most_memorable_characteristics__-OTHER` <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ `rating__-Inf_1.09861228866811` <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ rating__1.09861228866811_1.17865499634165 <dbl> 1, 1, 1, 1, 1, 1, 1, 1, 1, …
## $ rating__1.17865499634165_1.25276296849537 <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ rating__1.25276296849537_Inf <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, …
# Step 2: Correlate
data_corr_tbl <- data_binarized_tbl %>%
correlate(rating__1.25276296849537_Inf)
data_corr_tbl
## # A tibble: 95 × 3
## feature bin correlation
## <fct> <chr> <dbl>
## 1 rating 1.25276296849537_Inf 1
## 2 rating -Inf_1.09861228866811 -0.384
## 3 rating 1.17865499634165_1.25276296849537 -0.270
## 4 rating 1.09861228866811_1.17865499634165 -0.237
## 5 company_manufacturer -OTHER -0.153
## 6 company_manufacturer Soma 0.149
## 7 most_memorable_characteristics creamy 0.104
## 8 company_manufacturer Bonnat 0.0903
## 9 cocoa_percent 67% 0.0897
## 10 company_location Canada 0.0808
## # … with 85 more rows
# Step 3: Plot
data_corr_tbl %>%
plot_correlation_funnel()
## Warning: ggrepel: 85 unlabeled data points (too many overlaps). Consider
## increasing max.overlaps
