This is an R Markdown document. Markdown is a simple formatting syntax for authoring HTML, PDF, and MS Word documents. For more details on using R Markdown see http://rmarkdown.rstudio.com.
When you click the Knit button a document will be generated that includes both content as well as the output of any embedded R code chunks within the document. You can embed an R code chunk like this:
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(stringr)
library(readr)
library(tidyr)
library(forcats)
boston_cocktails <- readr::read_csv("https://raw.githubusercontent.com/rfordatascience/tidytuesday/master/data/2020/2020-05-26/boston_cocktails.csv")
## Rows: 3643 Columns: 6
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (4): name, category, ingredient, measure
## dbl (2): row_id, ingredient_number
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
boston_cocktails %>%
count(ingredient, sort = TRUE)
## # A tibble: 569 × 2
## ingredient n
## <chr> <int>
## 1 Gin 176
## 2 Fresh lemon juice 138
## 3 Simple Syrup 115
## 4 Vodka 114
## 5 Light Rum 113
## 6 Dry Vermouth 107
## 7 Fresh Lime Juice 107
## 8 Triple Sec 107
## 9 Powdered Sugar 90
## 10 Grenadine 85
## # ℹ 559 more rows
#Clean the ingredient and measure columns
cocktails_parsed <- boston_cocktails %>%
mutate(
ingredient = str_to_lower(ingredient),
ingredient = str_replace_all(ingredient, "-", " "),
ingredient = str_remove(ingredient, " liqueur$"),
ingredient = str_remove(ingredient, " (if desired)$"),
ingredient = case_when(
str_detect(ingredient, "bitters") ~ "bitters",
str_detect(ingredient, "lemon") ~ "lemon juice",
str_detect(ingredient, "lime") ~ "lime juice",
str_detect(ingredient, "grapefruit") ~ "grapefruit juice",
str_detect(ingredient, "orange") ~ "orange juice",
TRUE ~ ingredient
),
measure = case_when(
str_detect(ingredient, "bitters") ~ str_replace(measure, "oz$", "dash"),
TRUE ~ measure
),
measure = str_replace(measure, " ?1/2", ".5"),
measure = str_replace(measure, " ?3/4", ".75"),
measure = str_replace(measure, " ?1/4", ".25"),
measure_number = parse_number(measure),
measure_number = if_else(str_detect(measure, "dash$"),
measure_number / 50,
measure_number
)
) %>%
add_count(ingredient) %>%
filter(n > 15) %>%
select(-n) %>%
distinct(row_id, ingredient, .keep_all = TRUE) %>%
na.omit()
## Warning: There was 1 warning in `mutate()`.
## ℹ In argument: `measure_number = parse_number(measure)`.
## Caused by warning:
## ! 20 parsing failures.
## row col expected actual
## 637 -- a number For glass
## 671 -- a number For glass
## 794 -- a number For glass
## 878 -- a number For glass
## 928 -- a number For glass
## ... ... ........ .........
## See problems(...) for more details.
cocktails_parsed
## # A tibble: 2,542 × 7
## name category row_id ingredient_number ingredient measure measure_number
## <chr> <chr> <dbl> <dbl> <chr> <chr> <dbl>
## 1 Gauguin Cocktai… 1 1 light rum 2 oz 2
## 2 Gauguin Cocktai… 1 3 lemon jui… 1 oz 1
## 3 Gauguin Cocktai… 1 4 lime juice 1 oz 1
## 4 Fort Lau… Cocktai… 2 1 light rum 1.5 oz 1.5
## 5 Fort Lau… Cocktai… 2 2 sweet ver… .5 oz 0.5
## 6 Fort Lau… Cocktai… 2 3 orange ju… .25 oz 0.25
## 7 Fort Lau… Cocktai… 2 4 lime juice .25 oz 0.25
## 8 Cuban Co… Cocktai… 4 1 lime juice .5 oz 0.5
## 9 Cuban Co… Cocktai… 4 2 powdered … .5 oz 0.5
## 10 Cuban Co… Cocktai… 4 3 light rum 2 oz 2
## # ℹ 2,532 more rows
#Reshape data into a wider format
cocktails_df <- cocktails_parsed %>%
select(-ingredient_number, -row_id, -measure) %>%
pivot_wider(names_from = ingredient, values_from = measure_number, values_fill = 0) %>%
janitor::clean_names() %>%
na.omit()
Principal Component analysis
#Load tidymodels and implement pca
library(tidymodels)
## ── Attaching packages ────────────────────────────────────── tidymodels 1.2.0 ──
## ✔ broom 1.0.7 ✔ recipes 1.1.0
## ✔ dials 1.3.0 ✔ rsample 1.2.1
## ✔ ggplot2 3.5.1 ✔ tibble 3.2.1
## ✔ infer 1.0.7 ✔ tune 1.2.1
## ✔ modeldata 1.4.0 ✔ workflows 1.1.4
## ✔ parsnip 1.2.1 ✔ workflowsets 1.1.0
## ✔ purrr 1.0.2 ✔ yardstick 1.3.1
## ── Conflicts ───────────────────────────────────────── tidymodels_conflicts() ──
## ✖ purrr::discard() masks scales::discard()
## ✖ dplyr::filter() masks stats::filter()
## ✖ recipes::fixed() masks stringr::fixed()
## ✖ dplyr::lag() masks stats::lag()
## ✖ yardstick::spec() masks readr::spec()
## ✖ recipes::step() masks stats::step()
## • Use tidymodels_prefer() to resolve common conflicts.
pca_rec <- recipe(~., data = cocktails_df) %>%
update_role(name, category, new_role = "id") %>%
step_normalize(all_predictors()) %>%
step_pca(all_predictors())
pca_prep <- prep(pca_rec)
pca_prep
##
## ── Recipe ──────────────────────────────────────────────────────────────────────
##
## ── Inputs
## Number of variables by role
## predictor: 40
## id: 2
##
## ── Training information
## Training data contained 937 data points and no incomplete rows.
##
## ── Operations
## • Centering and scaling for: light_rum and lemon_juice, ... | Trained
## • PCA extraction with: light_rum, lemon_juice, lime_juice, ... | Trained
#Tidy the PCA and make a visualization
tidied_pca <- tidy(pca_prep, 2)
tidied_pca %>%
filter(component %in% paste0("PC", 1:5)) %>%
mutate(component = fct_inorder(component)) %>%
ggplot(aes(value, terms, fill = terms)) +
geom_col(show.legend = FALSE) +
facet_wrap(~component, nrow = 1) +
labs(y = NULL)
tidied_pca %>%
filter(component %in% paste0("PC", 1:5)) %>%
mutate(component = fct_inorder(component)) %>%
ggplot(aes(value, terms, fill = terms)) +
geom_col(show.legend = FALSE) +
facet_wrap(~component, nrow = 1) +
labs(y = NULL)
#New visualization to show which ingredients contribute in the positive and negative directions
library(tidytext)
tidied_pca %>%
filter(component %in% paste0("PC", 1:4)) %>%
group_by(component) %>%
top_n(8, abs(value)) %>%
ungroup() %>%
mutate(terms = reorder_within(terms, abs(value), component)) %>%
ggplot(aes(abs(value), terms, fill = value > 0)) +
geom_col() +
facet_wrap(~component, scales = "free_y") +
scale_y_reordered() +
labs(
x = "Absolute value of contribution",
y = NULL, fill = "Positive?"
)
#So PC1 is about powdered sugar + egg + gin drinks vs simple syrup + lime + tequila drinks
#PC1 explains the most variation in drinks
#Visualization to show how the cocktails are distributed in the plane of the first two components.
juice(pca_prep) %>%
ggplot(aes(PC1, PC2, label = name)) +
geom_point(aes(color = category), alpha = 0.7, size = 2) +
geom_text(check_overlap = TRUE, hjust = "inward") +
labs(color = NULL)
UMAP
library(embed)
umap_rec <- recipe(~., data = cocktails_df) %>%
update_role(name, category, new_role = "id") %>%
step_normalize(all_predictors()) %>%
step_umap(all_predictors())
umap_rec
##
## ── Recipe ──────────────────────────────────────────────────────────────────────
##
## ── Inputs
## Number of variables by role
## predictor: 40
## id: 2
##
## ── Operations
## • Centering and scaling for: all_predictors()
## • UMAP embedding for: all_predictors()
umap_prep <- prep(umap_rec)
#Show how the cocktails are distributed in the plane of the first two UMAP components
juice(umap_prep)
## # A tibble: 937 × 4
## name category UMAP1 UMAP2
## <fct> <fct> <dbl> <dbl>
## 1 Gauguin Cocktail Classics 2.33 0.671
## 2 Fort Lauderdale Cocktail Classics 1.98 0.679
## 3 Cuban Cocktail No. 1 Cocktail Classics 2.22 0.688
## 4 Cool Carlos Cocktail Classics 4.03 0.858
## 5 John Collins Whiskies 2.76 9.92
## 6 Cherry Rum Cocktail Classics -3.06 0.666
## 7 Casa Blanca Cocktail Classics -2.10 0.565
## 8 Caribbean Champagne Cocktail Classics -1.54 1.72
## 9 Amber Amour Cordials and Liqueurs 2.74 3.88
## 10 The Joe Lewis Whiskies -0.302 0.618
## # ℹ 927 more rows
juice(umap_prep) %>%
ggplot(aes(UMAP1, UMAP2, label = name)) +
geom_point(aes(color = category), alpha = 0.7, size = 2) +
geom_text(check_overlap = TRUE, hjust = "inward") +
labs(color = NULL)
Extension of PCA
cocktails_pca <- juice(pca_prep)
cocktails_pca
## # A tibble: 937 × 7
## name category PC1 PC2 PC3 PC4 PC5
## <fct> <fct> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 Gauguin Cocktail Classics 1.38 -1.15 1.34 -1.12 1.52
## 2 Fort Lauderdale Cocktail Classics 0.684 0.548 0.0308 -0.370 1.41
## 3 Cuban Cocktail No. 1 Cocktail Classics 0.285 -0.967 0.454 -0.931 2.02
## 4 Cool Carlos Cocktail Classics 2.19 -0.935 -1.21 2.47 1.80
## 5 John Collins Whiskies 1.28 -1.07 0.403 -1.09 -2.21
## 6 Cherry Rum Cocktail Classics -0.757 -0.460 0.909 0.0154 -0.748
## 7 Casa Blanca Cocktail Classics 1.53 -0.392 3.29 -3.39 3.87
## 8 Caribbean Champagne Cocktail Classics 0.324 0.137 -0.134 -0.147 0.303
## 9 Amber Amour Cordials and Liqu… 1.31 -0.234 -1.55 0.839 -1.19
## 10 The Joe Lewis Whiskies 0.138 -0.0401 -0.0365 -0.100 -0.531
## # ℹ 927 more rows
pca1_2 <- cocktails_pca %>%
mutate(mixed = ifelse(category %in% c("Cocktail Classics", "Cordials and Liqueurs", "Non-alcoholic Drinks", "Rum - Daiquiris", "Shooters"), 'mixed', 'not mixed')) %>%
ggplot(aes(PC1, PC2, label = name))+
geom_point(aes(color = mixed), alpha = 0.7, size = 2) +
geom_text(check_overlap = TRUE, hjust = "inward") +
scale_color_manual(values = c("mixed" = "lightblue", "not mixed" = "darkblue"),
labels = c("mixed" = "Mixed", "not mixed" = "Not Mixed")) +
labs(color = "Drink Type") +
theme(legend.title = element_text(face = "bold"))
pca1_2
pca1_4 <- cocktails_pca %>%
mutate(mixed = ifelse(category %in% c("Cocktail Classics", "Cordials and Liqueurs", "Non-alcoholic Drinks", "Rum - Daiquiris", "Shooters"), 'mixed', 'not mixed')) %>%
ggplot(aes(PC1, PC4, label = name))+
geom_point(aes(color = mixed), alpha = 0.7, size = 2) +
geom_text(check_overlap = TRUE, hjust = "inward") +
scale_color_manual(values = c("mixed" = "lightblue", "not mixed" = "darkblue"),
labels = c("mixed" = "Mixed", "not mixed" = "Not Mixed")) +
labs(color = "Drink Type") +
theme(legend.title = element_text(face = "bold"))
pca1_4