Goal

The goal is to explore predictors of chocolate ratings using the provided dataset.

Import Data

# Read the TidyTuesday 2022-01-18 chocolate ratings CSV directly from GitHub.
chocolate <- readr::read_csv(
  file = "https://raw.githubusercontent.com/rfordatascience/tidytuesday/main/data/2022/2022-01-18/chocolate.csv"
)
## Rows: 2530 Columns: 10
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (7): company_manufacturer, company_location, country_of_bean_origin, spe...
## dbl (3): ref, review_date, rating
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
# Print a per-column summary (type, missingness, distribution) of the raw data.
skimr::skim(data = chocolate)
Data summary
Name chocolate
Number of rows 2530
Number of columns 10
_______________________
Column type frequency:
character 7
numeric 3
________________________
Group variables None

Variable type: character

skim_variable n_missing complete_rate min max empty n_unique whitespace
company_manufacturer 0 1.00 2 39 0 580 0
company_location 0 1.00 4 21 0 67 0
country_of_bean_origin 0 1.00 4 21 0 62 0
specific_bean_origin_or_bar_name 0 1.00 3 51 0 1605 0
cocoa_percent 0 1.00 3 6 0 46 0
ingredients 87 0.97 4 14 0 21 0
most_memorable_characteristics 0 1.00 3 37 0 2487 0

Variable type: numeric

skim_variable n_missing complete_rate mean sd p0 p25 p50 p75 p100 hist
ref 0 1 1429.80 757.65 5 802 1454.00 2079.0 2712 ▆▇▇▇▇
review_date 0 1 2014.37 3.97 2006 2012 2015.00 2018.0 2021 ▃▅▇▆▅
rating 0 1 3.20 0.45 1 3 3.25 3.5 4 ▁▁▅▇▇

Data Cleaning & Transformation

# Build the analysis table: keep rating plus the candidate predictors, parse
# cocoa_percent into a number, and put rating on the log scale.
data <- chocolate %>%
  # Drop the columns that are not explored as predictors below.
  select(
    -ref,
    -review_date,
    -company_location,
    -specific_bean_origin_or_bar_name
  ) %>%
  mutate(
    # cocoa_percent is read as character; strip the "%" and parse the rest.
    cocoa_percent = as.numeric(str_remove(cocoa_percent, "%")),
    # rating is already numeric, so no conversion is needed. log() is only
    # meaningful for positive values; anything else becomes NA and is dropped.
    rating = if_else(rating > 0, log(rating), NA_real_)
  ) %>%
  # A single pass removes rows with missing ingredients as well as any NA
  # introduced by the conversions above.
  drop_na()

Explore Data

Cocoa Percent

# Scatter plot of cocoa percentage (y) against the rating stored in `data` (x).
ggplot(data, aes(x = rating, y = cocoa_percent)) +
  geom_point()

Company

# Rating distribution per manufacturer. The column is `company_manufacturer`
# (there is no `company` column). The raw data has 580 distinct manufacturers,
# so only the most frequently reviewed ones are kept to keep the plot legible.
data %>%
  # Lump everything outside the 20 most common manufacturers into "Other",
  # then discard that catch-all group.
  mutate(manufacturer = fct_lump_n(company_manufacturer, n = 20)) %>%
  filter(manufacturer != "Other") %>%
  # Order the boxes by median rating (fct_reorder's default summary).
  ggplot(aes(rating, fct_reorder(manufacturer, rating))) +
  geom_boxplot() +
  labs(y = "Manufacturer")

Ingredients

# Average rating per ingredient token, for tokens that occur often enough.
data %>%
  # One row per token found in the ingredients string.
  unnest_tokens(ingredient, ingredients) %>%
  group_by(ingredient) %>%
  summarise(
    rating = mean(rating),
    n_rows = n()
  ) %>%
  ungroup() %>%
  # Keep tokens seen more than 10 times ...
  filter(n_rows > 10) %>%
  # ... and discard tokens that contain a digit.
  filter(!str_detect(ingredient, "\\d")) %>%
  slice_max(order_by = rating, n = 20) %>%
  ggplot(aes(x = rating, y = fct_reorder(ingredient, rating))) +
  geom_point() +
  labs(y = "Ingredients")

Correlation Analysis

Preprocess Data

# Data preprocessing steps go here

Build Models

# Modeling steps go here

Evaluate Models

# Model evaluation steps go here

Make Predictions

# Prediction steps go here