Coffee Data Analytics

Kate C

2022-02-24

Load Packages and Dataset

Note: The dataset is gathered from Coffee Quality Institute (CQI) <https://database.coffeeinstitute.org> in January, 2018.

Summary of Dataset

  • look at first ten rows of the dataset
coffee_ratings <- read_csv("coffee_ratings.csv")
coffee_ratings
## # A tibble: 1,338 × 44
##     ...1 total_cup_points species owner   country_of_orig… farm_name  lot_number
##    <dbl>            <dbl> <chr>   <chr>   <chr>            <chr>      <chr>     
##  1     1             90.6 Arabica metad … Ethiopia         "metad pl… <NA>      
##  2     2             89.9 Arabica metad … Ethiopia         "metad pl… <NA>      
##  3     3             89.8 Arabica ground… Guatemala        "san marc… <NA>      
##  4     4             89   Arabica yidnek… Ethiopia         "yidnekac… <NA>      
##  5     5             88.8 Arabica metad … Ethiopia         "metad pl… <NA>      
##  6     6             88.8 Arabica ji-ae … Brazil            <NA>      <NA>      
##  7     7             88.8 Arabica hugo v… Peru              <NA>      <NA>      
##  8     8             88.7 Arabica ethiop… Ethiopia         "aolme"    <NA>      
##  9     9             88.4 Arabica ethiop… Ethiopia         "aolme"    <NA>      
## 10    10             88.2 Arabica diamon… Ethiopia         "tulla co… <NA>      
## # … with 1,328 more rows, and 37 more variables: mill <chr>, ico_number <chr>,
## #   company <chr>, altitude <chr>, region <chr>, producer <chr>,
## #   number_of_bags <dbl>, bag_weight <chr>, in_country_partner <chr>,
## #   harvest_year <chr>, grading_date <chr>, owner_1 <chr>, variety <chr>,
## #   processing_method <chr>, aroma <dbl>, flavor <dbl>, aftertaste <dbl>,
## #   acidity <dbl>, body <dbl>, balance <dbl>, uniformity <dbl>,
## #   clean_cup <dbl>, sweetness <dbl>, cupper_points <dbl>, moisture <dbl>, …
  • observations
coffee_ratings %>% 
  mutate(coffee_id = row_number()) %>% 
  select(coffee_id, total_cup_points, species, variety, owner, farm_name, country_of_origin, processing_method:aftertaste)
## # A tibble: 1,338 × 11
##    coffee_id total_cup_points species variety owner  farm_name  country_of_orig…
##        <int>            <dbl> <chr>   <chr>   <chr>  <chr>      <chr>           
##  1         1             90.6 Arabica <NA>    metad… "metad pl… Ethiopia        
##  2         2             89.9 Arabica Other   metad… "metad pl… Ethiopia        
##  3         3             89.8 Arabica Bourbon groun… "san marc… Guatemala       
##  4         4             89   Arabica <NA>    yidne… "yidnekac… Ethiopia        
##  5         5             88.8 Arabica Other   metad… "metad pl… Ethiopia        
##  6         6             88.8 Arabica <NA>    ji-ae…  <NA>      Brazil          
##  7         7             88.8 Arabica Other   hugo …  <NA>      Peru            
##  8         8             88.7 Arabica <NA>    ethio… "aolme"    Ethiopia        
##  9         9             88.4 Arabica <NA>    ethio… "aolme"    Ethiopia        
## 10        10             88.2 Arabica Other   diamo… "tulla co… Ethiopia        
## # … with 1,328 more rows, and 4 more variables: processing_method <chr>,
## #   aroma <dbl>, flavor <dbl>, aftertaste <dbl>

Total points vs Variety

coffee_lumped <- coffee_ratings %>% 
  filter(!is.na(variety),
         total_cup_points > 10) %>% 
  mutate(variety = fct_lump(variety, 10), sort = TRUE)

coffee_lumped %>% 
  mutate(variety = fct_reorder(variety, total_cup_points)) %>% 
  ggplot(aes(total_cup_points, variety)) +
  geom_boxplot()

coffee_lumped %>% 
  ggplot(aes(total_cup_points, fill = variety)) +
  geom_histogram(binwidth = 2) +
  facet_wrap(~variety, scales = "free_y") +
  theme(legend.position = "none")

Coffee ratings per processing method

coffee_ratings %>% 
  filter(!is.na(processing_method)) %>% 
  group_by(processing_method) %>% 
  summarize(points = mean(total_cup_points)) %>% 
  arrange(desc(points))
## # A tibble: 5 × 2
##   processing_method         points
##   <chr>                      <dbl>
## 1 Pulped natural / honey      82.8
## 2 Semi-washed / Semi-pulped   82.6
## 3 Natural / Dry               82.2
## 4 Washed / Wet                82.0
## 5 Other                       81.3

Number of coffees by country/region

coffee_ratings %>% 
  count(country = fct_lump(country_of_origin, 12), sort = TRUE) %>% 
  filter(!is.na(country)) %>% 
  mutate(country_region = fct_reorder(country, n)) %>% 
  ggplot(aes(n, country_region, fill = country)) +
  geom_col() +
  scale_fill_viridis_d() +
  ylab("country/region") +
  theme(legend.position = "none")

Coffee ratings per country/region

coffee_ratings %>% 
  filter(!is.na(country_of_origin)) %>% 
  filter(total_cup_points >50) %>% 
  mutate(country = fct_lump(country_of_origin, 8),
         country = fct_reorder(country, total_cup_points)) %>% 
  ggplot(aes(total_cup_points, country)) +
  geom_boxplot() +
  ylab("country/region")

More Dive into datasets

coffee_metrics <- coffee_ratings %>% 
  mutate(coffee_id = row_number()) %>% 
  select(coffee_id, total_cup_points, variety, company, country_of_origin, processing_method, aroma:cupper_points, altitude_mean_meters) %>% 
  pivot_longer(aroma:cupper_points, "metric", values_to = "value")

How each features are rated

library(ggridges)
coffee_metrics %>% 
  mutate(metric = fct_reorder(metric, value)) %>% 
  ggplot(aes(value, metric)) +
  geom_density_ridges() +
  xlab("points")

  • by numeric results
coffee_metrics %>% 
  group_by(metric) %>% 
  summarize(average = mean(value), sd = sd(value)) %>% 
  arrange(desc(average))
## # A tibble: 10 × 3
##    metric        average    sd
##    <chr>           <dbl> <dbl>
##  1 sweetness        9.86 0.554
##  2 clean_cup        9.84 0.715
##  3 uniformity       9.84 0.485
##  4 aroma            7.57 0.316
##  5 acidity          7.54 0.319
##  6 flavor           7.53 0.341
##  7 balance          7.52 0.354
##  8 body             7.52 0.308
##  9 cupper_points    7.51 0.427
## 10 aftertaste       7.41 0.350

Does altitude matter to total cup points?

coffee_ratings %>% 
  filter(!is.na(altitude_mean_meters)) %>% 
  filter(!is.na(species)) %>% 
  filter(altitude_mean_meters < 2600, altitude != 1, total_cup_points >70) %>% 
  filter(altitude_mean_meters > 600) %>% 
  mutate(altitude_mean_meters = pmin(altitude_mean_meters,2600)) %>% 
  ggplot(aes(altitude_mean_meters, total_cup_points)) +
  geom_point(color = "#da8620") +
  geom_smooth(method = lm, se = TRUE, color = "dark blue")

How is altitude impacting scoring?

coffee_metrics %>% 
  filter(!is.na(altitude_mean_meters)) %>% 
  filter(altitude_mean_meters < 2600, total_cup_points >70) %>% 
  mutate(altitude_mean_meters = pmin(altitude_mean_meters,2600)) %>% 
  mutate(km = altitude_mean_meters/1000) %>% 
  group_by(metric) %>% 
  summarize(correlation = cor(altitude_mean_meters, value), 
            model = list(lm(value ~ km))) %>%
  mutate(tidied = map(model, broom:: tidy, conf.int = TRUE)) %>% 
  unnest(tidied) %>% 
  filter(term == "km") %>% 
  ungroup() %>% 
  mutate(metric = fct_reorder(metric, estimate)) %>% 
  ggplot(aes(estimate, metric, color = p.value < .05)) +
  geom_point(color = "sky blue") +
  geom_errorbarh(aes(xmin = conf.low, xmax = conf.high), height = 0.1) +
  xlab("total points per kilometer") +
  ylab("characteristics of coffee")