Load Packages and Dataset
Note: The dataset is gathered from Coffee Quality Institute (CQI) <https://database.coffeeinstitute.org> in January, 2018.
Summary of Dataset
- look at first ten rows of the dataset
coffee_ratings <- read_csv("coffee_ratings.csv")
coffee_ratings## # A tibble: 1,338 × 44
## ...1 total_cup_points species owner country_of_orig… farm_name lot_number
## <dbl> <dbl> <chr> <chr> <chr> <chr> <chr>
## 1 1 90.6 Arabica metad … Ethiopia "metad pl… <NA>
## 2 2 89.9 Arabica metad … Ethiopia "metad pl… <NA>
## 3 3 89.8 Arabica ground… Guatemala "san marc… <NA>
## 4 4 89 Arabica yidnek… Ethiopia "yidnekac… <NA>
## 5 5 88.8 Arabica metad … Ethiopia "metad pl… <NA>
## 6 6 88.8 Arabica ji-ae … Brazil <NA> <NA>
## 7 7 88.8 Arabica hugo v… Peru <NA> <NA>
## 8 8 88.7 Arabica ethiop… Ethiopia "aolme" <NA>
## 9 9 88.4 Arabica ethiop… Ethiopia "aolme" <NA>
## 10 10 88.2 Arabica diamon… Ethiopia "tulla co… <NA>
## # … with 1,328 more rows, and 37 more variables: mill <chr>, ico_number <chr>,
## # company <chr>, altitude <chr>, region <chr>, producer <chr>,
## # number_of_bags <dbl>, bag_weight <chr>, in_country_partner <chr>,
## # harvest_year <chr>, grading_date <chr>, owner_1 <chr>, variety <chr>,
## # processing_method <chr>, aroma <dbl>, flavor <dbl>, aftertaste <dbl>,
## # acidity <dbl>, body <dbl>, balance <dbl>, uniformity <dbl>,
## # clean_cup <dbl>, sweetness <dbl>, cupper_points <dbl>, moisture <dbl>, …
- observations
coffee_ratings %>%
mutate(coffee_id = row_number()) %>%
select(coffee_id, total_cup_points, species, variety, owner, farm_name, country_of_origin, processing_method:aftertaste)## # A tibble: 1,338 × 11
## coffee_id total_cup_points species variety owner farm_name country_of_orig…
## <int> <dbl> <chr> <chr> <chr> <chr> <chr>
## 1 1 90.6 Arabica <NA> metad… "metad pl… Ethiopia
## 2 2 89.9 Arabica Other metad… "metad pl… Ethiopia
## 3 3 89.8 Arabica Bourbon groun… "san marc… Guatemala
## 4 4 89 Arabica <NA> yidne… "yidnekac… Ethiopia
## 5 5 88.8 Arabica Other metad… "metad pl… Ethiopia
## 6 6 88.8 Arabica <NA> ji-ae… <NA> Brazil
## 7 7 88.8 Arabica Other hugo … <NA> Peru
## 8 8 88.7 Arabica <NA> ethio… "aolme" Ethiopia
## 9 9 88.4 Arabica <NA> ethio… "aolme" Ethiopia
## 10 10 88.2 Arabica Other diamo… "tulla co… Ethiopia
## # … with 1,328 more rows, and 4 more variables: processing_method <chr>,
## # aroma <dbl>, flavor <dbl>, aftertaste <dbl>
Total points vs Variety
coffee_lumped <- coffee_ratings %>%
filter(!is.na(variety),
total_cup_points > 10) %>%
mutate(variety = fct_lump(variety, 10), sort = TRUE)
coffee_lumped %>%
mutate(variety = fct_reorder(variety, total_cup_points)) %>%
ggplot(aes(total_cup_points, variety)) +
geom_boxplot()coffee_lumped %>%
ggplot(aes(total_cup_points, fill = variety)) +
geom_histogram(binwidth = 2) +
facet_wrap(~variety, scales = "free_y") +
theme(legend.position = "none")Coffee ratings per processing method
coffee_ratings %>%
filter(!is.na(processing_method)) %>%
group_by(processing_method) %>%
summarize(points = mean(total_cup_points)) %>%
arrange(desc(points))## # A tibble: 5 × 2
## processing_method points
## <chr> <dbl>
## 1 Pulped natural / honey 82.8
## 2 Semi-washed / Semi-pulped 82.6
## 3 Natural / Dry 82.2
## 4 Washed / Wet 82.0
## 5 Other 81.3
Number of coffees by country/region
coffee_ratings %>%
count(country = fct_lump(country_of_origin, 12), sort = TRUE) %>%
filter(!is.na(country)) %>%
mutate(country_region = fct_reorder(country, n)) %>%
ggplot(aes(n, country_region, fill = country)) +
geom_col() +
scale_fill_viridis_d() +
ylab("country/region") +
theme(legend.position = "none")Coffee ratings per country/region
coffee_ratings %>%
filter(!is.na(country_of_origin)) %>%
filter(total_cup_points >50) %>%
mutate(country = fct_lump(country_of_origin, 8),
country = fct_reorder(country, total_cup_points)) %>%
ggplot(aes(total_cup_points, country)) +
geom_boxplot() +
ylab("country/region")More Dive into datasets
coffee_metrics <- coffee_ratings %>%
mutate(coffee_id = row_number()) %>%
select(coffee_id, total_cup_points, variety, company, country_of_origin, processing_method, aroma:cupper_points, altitude_mean_meters) %>%
pivot_longer(aroma:cupper_points, "metric", values_to = "value")How each features are rated
library(ggridges)
coffee_metrics %>%
mutate(metric = fct_reorder(metric, value)) %>%
ggplot(aes(value, metric)) +
geom_density_ridges() +
xlab("points")- by numeric results
coffee_metrics %>%
group_by(metric) %>%
summarize(average = mean(value), sd = sd(value)) %>%
arrange(desc(average))## # A tibble: 10 × 3
## metric average sd
## <chr> <dbl> <dbl>
## 1 sweetness 9.86 0.554
## 2 clean_cup 9.84 0.715
## 3 uniformity 9.84 0.485
## 4 aroma 7.57 0.316
## 5 acidity 7.54 0.319
## 6 flavor 7.53 0.341
## 7 balance 7.52 0.354
## 8 body 7.52 0.308
## 9 cupper_points 7.51 0.427
## 10 aftertaste 7.41 0.350
Does altitude matter to total cup points?
coffee_ratings %>%
filter(!is.na(altitude_mean_meters)) %>%
filter(!is.na(species)) %>%
filter(altitude_mean_meters < 2600, altitude != 1, total_cup_points >70) %>%
filter(altitude_mean_meters > 600) %>%
mutate(altitude_mean_meters = pmin(altitude_mean_meters,2600)) %>%
ggplot(aes(altitude_mean_meters, total_cup_points)) +
geom_point(color = "#da8620") +
geom_smooth(method = lm, se = TRUE, color = "dark blue")How is altitude impacting scoring?
coffee_metrics %>%
filter(!is.na(altitude_mean_meters)) %>%
filter(altitude_mean_meters < 2600, total_cup_points >70) %>%
mutate(altitude_mean_meters = pmin(altitude_mean_meters,2600)) %>%
mutate(km = altitude_mean_meters/1000) %>%
group_by(metric) %>%
summarize(correlation = cor(altitude_mean_meters, value),
model = list(lm(value ~ km))) %>%
mutate(tidied = map(model, broom:: tidy, conf.int = TRUE)) %>%
unnest(tidied) %>%
filter(term == "km") %>%
ungroup() %>%
mutate(metric = fct_reorder(metric, estimate)) %>%
ggplot(aes(estimate, metric, color = p.value < .05)) +
geom_point(color = "sky blue") +
geom_errorbarh(aes(xmin = conf.low, xmax = conf.high), height = 0.1) +
xlab("total points per kilometer") +
ylab("characteristics of coffee")