PCA and UMAP Replications

R Markdown

This is an R Markdown document. Markdown is a simple formatting syntax for authoring HTML, PDF, and MS Word documents. For more details on using R Markdown see http://rmarkdown.rstudio.com.

When you click the Knit button a document will be generated that includes both content as well as the output of any embedded R code chunks within the document. You can embed an R code chunk like this:

library(dplyr)

## 
## Attaching package: 'dplyr'

## The following objects are masked from 'package:stats':
## 
##     filter, lag

## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union

library(stringr)
library(readr)
library(tidyr)
library(forcats)
boston_cocktails <- readr::read_csv("https://raw.githubusercontent.com/rfordatascience/tidytuesday/master/data/2020/2020-05-26/boston_cocktails.csv")

## Rows: 3643 Columns: 6

## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (4): name, category, ingredient, measure
## dbl (2): row_id, ingredient_number
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.

boston_cocktails %>%
  count(ingredient, sort = TRUE)

## # A tibble: 569 × 2
##    ingredient            n
##    <chr>             <int>
##  1 Gin                 176
##  2 Fresh lemon juice   138
##  3 Simple Syrup        115
##  4 Vodka               114
##  5 Light Rum           113
##  6 Dry Vermouth        107
##  7 Fresh Lime Juice    107
##  8 Triple Sec          107
##  9 Powdered Sugar       90
## 10 Grenadine            85
## # ℹ 559 more rows

#Clean the ingredient and measure columns
cocktails_parsed <- boston_cocktails %>%
  mutate(
    ingredient = str_to_lower(ingredient),
    ingredient = str_replace_all(ingredient, "-", " "),
    ingredient = str_remove(ingredient, " liqueur$"),
    ingredient = str_remove(ingredient, " (if desired)$"),
    ingredient = case_when(
      str_detect(ingredient, "bitters") ~ "bitters",
      str_detect(ingredient, "lemon") ~ "lemon juice",
      str_detect(ingredient, "lime") ~ "lime juice",
      str_detect(ingredient, "grapefruit") ~ "grapefruit juice",
      str_detect(ingredient, "orange") ~ "orange juice",
      TRUE ~ ingredient
    ),
    measure = case_when(
      str_detect(ingredient, "bitters") ~ str_replace(measure, "oz$", "dash"),
      TRUE ~ measure
    ),
    measure = str_replace(measure, " ?1/2", ".5"),
    measure = str_replace(measure, " ?3/4", ".75"),
    measure = str_replace(measure, " ?1/4", ".25"),
    measure_number = parse_number(measure),
    measure_number = if_else(str_detect(measure, "dash$"),
      measure_number / 50,
      measure_number
    )
  ) %>%
  add_count(ingredient) %>%
  filter(n > 15) %>%
  select(-n) %>%
  distinct(row_id, ingredient, .keep_all = TRUE) %>%
  na.omit()

## Warning: There was 1 warning in `mutate()`.
## ℹ In argument: `measure_number = parse_number(measure)`.
## Caused by warning:
## ! 20 parsing failures.
## row col expected    actual
## 637  -- a number For glass
## 671  -- a number For glass
## 794  -- a number For glass
## 878  -- a number For glass
## 928  -- a number For glass
## ... ... ........ .........
## See problems(...) for more details.

cocktails_parsed

## # A tibble: 2,542 × 7
##    name      category row_id ingredient_number ingredient measure measure_number
##    <chr>     <chr>     <dbl>             <dbl> <chr>      <chr>            <dbl>
##  1 Gauguin   Cocktai…      1                 1 light rum  2 oz              2   
##  2 Gauguin   Cocktai…      1                 3 lemon jui… 1 oz              1   
##  3 Gauguin   Cocktai…      1                 4 lime juice 1 oz              1   
##  4 Fort Lau… Cocktai…      2                 1 light rum  1.5 oz            1.5 
##  5 Fort Lau… Cocktai…      2                 2 sweet ver… .5 oz             0.5 
##  6 Fort Lau… Cocktai…      2                 3 orange ju… .25 oz            0.25
##  7 Fort Lau… Cocktai…      2                 4 lime juice .25 oz            0.25
##  8 Cuban Co… Cocktai…      4                 1 lime juice .5 oz             0.5 
##  9 Cuban Co… Cocktai…      4                 2 powdered … .5 oz             0.5 
## 10 Cuban Co… Cocktai…      4                 3 light rum  2 oz              2   
## # ℹ 2,532 more rows

#Reshape data into a wider format
cocktails_df <- cocktails_parsed %>%
  select(-ingredient_number, -row_id, -measure) %>%
  pivot_wider(names_from = ingredient, values_from = measure_number, values_fill = 0) %>%
  janitor::clean_names() %>%
  na.omit()

Principal Component analysis

#Load tidymodels and implement pca
library(tidymodels)

## ── Attaching packages ────────────────────────────────────── tidymodels 1.2.0 ──

## ✔ broom        1.0.7     ✔ recipes      1.1.0
## ✔ dials        1.3.0     ✔ rsample      1.2.1
## ✔ ggplot2      3.5.1     ✔ tibble       3.2.1
## ✔ infer        1.0.7     ✔ tune         1.2.1
## ✔ modeldata    1.4.0     ✔ workflows    1.1.4
## ✔ parsnip      1.2.1     ✔ workflowsets 1.1.0
## ✔ purrr        1.0.2     ✔ yardstick    1.3.1

## ── Conflicts ───────────────────────────────────────── tidymodels_conflicts() ──
## ✖ purrr::discard()  masks scales::discard()
## ✖ dplyr::filter()   masks stats::filter()
## ✖ recipes::fixed()  masks stringr::fixed()
## ✖ dplyr::lag()      masks stats::lag()
## ✖ yardstick::spec() masks readr::spec()
## ✖ recipes::step()   masks stats::step()
## • Use tidymodels_prefer() to resolve common conflicts.

pca_rec <- recipe(~., data = cocktails_df) %>%
  update_role(name, category, new_role = "id") %>%
  step_normalize(all_predictors()) %>%
  step_pca(all_predictors())

pca_prep <- prep(pca_rec)

pca_prep

##

## ── Recipe ──────────────────────────────────────────────────────────────────────

##

## ── Inputs

## Number of variables by role

## predictor: 40
## id:         2

##

## ── Training information

## Training data contained 937 data points and no incomplete rows.

##

## ── Operations

## • Centering and scaling for: light_rum and lemon_juice, ... | Trained

## • PCA extraction with: light_rum, lemon_juice, lime_juice, ... | Trained

#Tidy the PCA and make a visualization
tidied_pca <- tidy(pca_prep, 2)

tidied_pca %>%
  filter(component %in% paste0("PC", 1:5)) %>%
  mutate(component = fct_inorder(component)) %>%
  ggplot(aes(value, terms, fill = terms)) +
  geom_col(show.legend = FALSE) +
  facet_wrap(~component, nrow = 1) +
  labs(y = NULL)

tidied_pca %>%
  filter(component %in% paste0("PC", 1:5)) %>%
  mutate(component = fct_inorder(component)) %>%
  ggplot(aes(value, terms, fill = terms)) +
  geom_col(show.legend = FALSE) +
  facet_wrap(~component, nrow = 1) +
  labs(y = NULL)

#New visualization to show which ingredients contribute in the positive and negative directions
library(tidytext)

tidied_pca %>%
  filter(component %in% paste0("PC", 1:4)) %>%
  group_by(component) %>%
  top_n(8, abs(value)) %>%
  ungroup() %>%
  mutate(terms = reorder_within(terms, abs(value), component)) %>%
  ggplot(aes(abs(value), terms, fill = value > 0)) +
  geom_col() +
  facet_wrap(~component, scales = "free_y") +
  scale_y_reordered() +
  labs(
    x = "Absolute value of contribution",
    y = NULL, fill = "Positive?"
  )

#So PC1 is about powdered sugar + egg + gin drinks vs simple syrup + lime + tequila drinks
#PC1 explains the most variation in drinks

#Visualization to show how the cocktails are distributed in the plane of the first two components. 
juice(pca_prep) %>%
  ggplot(aes(PC1, PC2, label = name)) +
  geom_point(aes(color = category), alpha = 0.7, size = 2) +
  geom_text(check_overlap = TRUE, hjust = "inward") +
  labs(color = NULL)

UMAP

library(embed)

umap_rec <- recipe(~., data = cocktails_df) %>%
  update_role(name, category, new_role = "id") %>%
  step_normalize(all_predictors()) %>%
  step_umap(all_predictors())

umap_rec

##

## ── Recipe ──────────────────────────────────────────────────────────────────────

##

## ── Inputs

## Number of variables by role

## predictor: 40
## id:         2

##

## ── Operations

## • Centering and scaling for: all_predictors()

## • UMAP embedding for: all_predictors()

umap_prep <- prep(umap_rec)

#Show how the cocktails are distributed in the plane of the first two UMAP components
juice(umap_prep)

## # A tibble: 937 × 4
##    name                 category               UMAP1 UMAP2
##    <fct>                <fct>                  <dbl> <dbl>
##  1 Gauguin              Cocktail Classics      2.33  0.671
##  2 Fort Lauderdale      Cocktail Classics      1.98  0.679
##  3 Cuban Cocktail No. 1 Cocktail Classics      2.22  0.688
##  4 Cool Carlos          Cocktail Classics      4.03  0.858
##  5 John Collins         Whiskies               2.76  9.92 
##  6 Cherry Rum           Cocktail Classics     -3.06  0.666
##  7 Casa Blanca          Cocktail Classics     -2.10  0.565
##  8 Caribbean Champagne  Cocktail Classics     -1.54  1.72 
##  9 Amber Amour          Cordials and Liqueurs  2.74  3.88 
## 10 The Joe Lewis        Whiskies              -0.302 0.618
## # ℹ 927 more rows

juice(umap_prep) %>%
  ggplot(aes(UMAP1, UMAP2, label = name)) +
  geom_point(aes(color = category), alpha = 0.7, size = 2) +
  geom_text(check_overlap = TRUE, hjust = "inward") +
  labs(color = NULL)

Extension of PCA

cocktails_pca <- juice(pca_prep)
cocktails_pca

## # A tibble: 937 × 7
##    name                 category              PC1     PC2     PC3     PC4    PC5
##    <fct>                <fct>               <dbl>   <dbl>   <dbl>   <dbl>  <dbl>
##  1 Gauguin              Cocktail Classics   1.38  -1.15    1.34   -1.12    1.52 
##  2 Fort Lauderdale      Cocktail Classics   0.684  0.548   0.0308 -0.370   1.41 
##  3 Cuban Cocktail No. 1 Cocktail Classics   0.285 -0.967   0.454  -0.931   2.02 
##  4 Cool Carlos          Cocktail Classics   2.19  -0.935  -1.21    2.47    1.80 
##  5 John Collins         Whiskies            1.28  -1.07    0.403  -1.09   -2.21 
##  6 Cherry Rum           Cocktail Classics  -0.757 -0.460   0.909   0.0154 -0.748
##  7 Casa Blanca          Cocktail Classics   1.53  -0.392   3.29   -3.39    3.87 
##  8 Caribbean Champagne  Cocktail Classics   0.324  0.137  -0.134  -0.147   0.303
##  9 Amber Amour          Cordials and Liqu…  1.31  -0.234  -1.55    0.839  -1.19 
## 10 The Joe Lewis        Whiskies            0.138 -0.0401 -0.0365 -0.100  -0.531
## # ℹ 927 more rows

pca1_2 <- cocktails_pca %>%
  mutate(mixed = ifelse(category %in% c("Cocktail Classics", "Cordials and Liqueurs", "Non-alcoholic Drinks", "Rum - Daiquiris", "Shooters"), 'mixed', 'not mixed')) %>%
  ggplot(aes(PC1, PC2, label = name))+
  geom_point(aes(color = mixed), alpha = 0.7, size = 2) +
  geom_text(check_overlap = TRUE, hjust = "inward") +
  scale_color_manual(values = c("mixed" = "lightblue", "not mixed" = "darkblue"),
                     labels = c("mixed" = "Mixed", "not mixed" = "Not Mixed")) +  
  labs(color = "Drink Type") + 
  theme(legend.title = element_text(face = "bold"))
pca1_2

pca1_4 <- cocktails_pca %>%
  mutate(mixed = ifelse(category %in% c("Cocktail Classics", "Cordials and Liqueurs", "Non-alcoholic Drinks", "Rum - Daiquiris", "Shooters"), 'mixed', 'not mixed')) %>%
  ggplot(aes(PC1, PC4, label = name))+
  geom_point(aes(color = mixed), alpha = 0.7, size = 2) +
  geom_text(check_overlap = TRUE, hjust = "inward") +
  scale_color_manual(values = c("mixed" = "lightblue", "not mixed" = "darkblue"),
                     labels = c("mixed" = "Mixed", "not mixed" = "Not Mixed")) +  
  labs(color = "Drink Type") + 
  theme(legend.title = element_text(face = "bold"))
pca1_4

PCA and UMAP Replications

2025-02-04

R Markdown