Step 1: Load in data

library(readr)
library(dplyr)
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ forcats   1.0.0     ✔ stringr   1.5.2
## ✔ ggplot2   4.0.0     ✔ tibble    3.3.0
## ✔ lubridate 1.9.4     ✔ tidyr     1.3.1
## ✔ purrr     1.1.0
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(htmltools)
packageVersion("htmltools")
## [1] '0.5.8.1'
library(readxl)
library(tibble)
library(ggplot2)

# Load in data
coffee <- read_csv("~/Downloads/coffee_analysis.csv")
## Rows: 2095 Columns: 12
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (10): name, roaster, roast, loc_country, origin_1, origin_2, review_date...
## dbl  (2): 100g_USD, rating
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
# Show column types
spec(coffee) # double = numeric
## cols(
##   name = col_character(),
##   roaster = col_character(),
##   roast = col_character(),
##   loc_country = col_character(),
##   origin_1 = col_character(),
##   origin_2 = col_character(),
##   `100g_USD` = col_double(),
##   rating = col_double(),
##   review_date = col_character(),
##   desc_1 = col_character(),
##   desc_2 = col_character(),
##   desc_3 = col_character()
## )

Step 2: Numeric data summaries

summary(coffee[, sapply(coffee, is.numeric)]) 
##     100g_USD           rating     
##  Min.   :  0.120   Min.   :84.00  
##  1st Qu.:  4.930   1st Qu.:92.00  
##  Median :  5.860   Median :93.00  
##  Mean   :  9.323   Mean   :93.11  
##  3rd Qu.:  8.785   3rd Qu.:94.00  
##  Max.   :132.280   Max.   :98.00
# Output: Min/Max, Mean & Median (quartiles), 1st & 3rd quartiles
numeric_summary <- sapply(
  coffee[, sapply(coffee, is.numeric)],
  function(x) c(
    min = min(x, na.rm = TRUE),
    max = max(x, na.rm = TRUE),
    mean = mean(x, na.rm = TRUE),
    median = median(x, na.rm = TRUE),
    sd = sd(x, na.rm = TRUE),
    quantiles = quantile(x, probs = c(0.25, 0.5, 0.75), na.rm = TRUE)
  )
)

numeric_summary
##                 100g_USD    rating
## min             0.120000 84.000000
## max           132.280000 98.000000
## mean            9.323313 93.114081
## median          5.860000 93.000000
## sd             11.430659  1.563024
## quantiles.25%   4.930000 92.000000
## quantiles.50%   5.860000 93.000000
## quantiles.75%   8.785000 94.000000
# Output: Mean, SD, quantiles

Step 3: Categorical column summaries

categorical_summary <- lapply(
  coffee[, sapply(coffee, is.factor) | sapply(coffee, is.character)],
  function(x) table(x)
)

# Created DataFrame 'categorical_summary'
# Output: Unique values and counts

Step 4: Complete Numeric and Categorical Summaries

# Numeric
coffee %>%
  summarise(across(
    where(is.numeric),
    list(
      min = ~min(.x, na.rm = TRUE),
      max = ~max(.x, na.rm = TRUE),
      mean = ~mean(.x, na.rm = TRUE),
      median = ~median(.x, na.rm = TRUE),
      q25 = ~quantile(.x, 0.25, na.rm = TRUE),
      q75 = ~quantile(.x, 0.75, na.rm = TRUE)
    )
  ))
## # A tibble: 1 × 12
##   `100g_USD_min` `100g_USD_max` `100g_USD_mean` `100g_USD_median` `100g_USD_q25`
##            <dbl>          <dbl>           <dbl>             <dbl>          <dbl>
## 1           0.12           132.            9.32              5.86           4.93
## # ℹ 7 more variables: `100g_USD_q75` <dbl>, rating_min <dbl>, rating_max <dbl>,
## #   rating_mean <dbl>, rating_median <dbl>, rating_q25 <dbl>, rating_q75 <dbl>
# Categorical
coffee %>%
  summarise(across(
    where(~is.factor(.x) | is.character(.x)),
    ~list(table(.x))
  ))
## # A tibble: 1 × 10
##   name       roaster    roast      loc_country origin_1   origin_2   review_date
##   <list>     <list>     <list>     <list>      <list>     <list>     <list>     
## 1 <table[…]> <table[…]> <table[…]> <table[…]>  <table[…]> <table[…]> <table[…]> 
## # ℹ 3 more variables: desc_1 <list>, desc_2 <list>, desc_3 <list>

Questions & Analysis

  1. Are coffees that tend to receive consistently higher ratings generally more expensive?

‘cor_price_rating’ is correlation between coffee price and rating. 0 = No relationship, 0.25 = Weak, 0.75 = Strong. The correlation between price and rating in this coffee reviews data set is a weak correlation, as the cor_price_rating = 0.259.

coffee %>%
  summarise(
    avg_price = mean(`100g_USD`, na.rm = TRUE),
    avg_rating = mean(rating, na.rm = TRUE),
    cor_price_rating = cor(`100g_USD`, rating, use = "complete.obs") 
  )
## # A tibble: 1 × 3
##   avg_price avg_rating cor_price_rating
##       <dbl>      <dbl>            <dbl>
## 1      9.32       93.1            0.260
  1. Which type of roast on average receives the highest ratings world wide?

The highest rated coffe roast world wide gathered from this data set is a light roast, earning an average rating of 93.5 out of 100. This tibble does show a strong correlation between rating and price when arranging by type of roast, The two highest rated roasts were the most expensive coffees while dark roasts received the lowest overall rating, making it the cheapest to purchase.

coffee %>%
  group_by(roast) %>%
  summarise(
    avg_price = mean(`100g_USD`, na.rm = TRUE),
    avg_rating = mean(rating, na.rm = TRUE),
    n = n()
  ) %>%
  arrange(desc(avg_rating))
## # A tibble: 6 × 4
##   roast        avg_price avg_rating     n
##   <chr>            <dbl>      <dbl> <int>
## 1 <NA>             42.2        93.5    15
## 2 Light            11.4        93.5   287
## 3 Medium-Light      8.99       93.2  1490
## 4 Medium            7.06       92.3   259
## 5 Medium-Dark       9.51       91.9    39
## 6 Dark              4.46       88.2     5
  1. What places of origin produce the highest rated coffees on average?

This question aims to provide insights into where the highest rated coffee beans are grown. The broader goal is to discover the top regions around the world where coffee is grown, based on aggregated ratings from this data set.

coffee %>%
  group_by(origin_1) %>%
  summarise(
    avg_rating = mean(rating, na.rm = TRUE),
    count = n()
  ) %>%
  filter(count >= 5) %>%   # avoids tiny samples
  arrange(desc(avg_rating))
## # A tibble: 88 × 3
##    origin_1               avg_rating count
##    <chr>                       <dbl> <int>
##  1 Chiriqui Province            94.9     7
##  2 Gedeo Zone                   94.6     5
##  3 Kirinyaga                    94.6     5
##  4 Boquete Growing Region       94.6    39
##  5 Piedra Candela               94.6     9
##  6 Kiambu Growing Region        94.5    12
##  7 Sidama Growing Region        94.4    14
##  8 Alto Quiel                   94.3     6
##  9 Embu County                  94.3     6
## 10 Holualoa                     94.3    41
## # ℹ 78 more rows
  1. What kind of trends does this dataset reveal based on how the cost of coffee has fluctuated over time?

The price of coffee appears to be steadily rising. This data set contains coffee reviews from 2017-2022. In 2017, the average cost of coffee was $7.87 and in 2022, the average cost increased to $9.27, after eclipsing an average of $10 throughout 2021. Aggregating the price of coffee by month gives more insight into increased prices over time, although there is higher fluctuation in monthly prices, resulting in the yearly aggegrating providing a more steady analysis.

# Converting the review date to date to give R a full date to process in order to successfully aggregate the average cost of coffee over time
coffee <- coffee %>%
  mutate(review_date = as.Date(paste0("01 ", review_date), format = "%d %B %Y"),
         year = year(review_date))
# Aggregating average coffee price by year
price_by_year <- coffee %>%
  group_by(year) %>%
  summarise(
    avg_price = mean(`100g_USD`, na.rm = TRUE),
    coffees = n()
  ) %>%
  arrange(year)

price_by_year
## # A tibble: 6 × 3
##    year avg_price coffees
##   <dbl>     <dbl>   <int>
## 1  2017      7.87      68
## 2  2018      8.22     297
## 3  2019      9.29     291
## 4  2020      9.43     426
## 5  2021     10.1      543
## 6  2022      9.27     470
# Aggregating average coffee price by month and year
price_by_month <- coffee %>%
  mutate(year_month = floor_date(review_date, "month")) %>%
  group_by(year_month) %>%
  summarise(
    avg_price = mean(`100g_USD`, na.rm = TRUE),
    coffees = n()
  ) %>%
  arrange(year_month)
price_by_month
## # A tibble: 61 × 3
##    year_month avg_price coffees
##    <date>         <dbl>   <int>
##  1 2017-11-01      6.97      40
##  2 2017-12-01      9.15      28
##  3 2018-01-01     19.4       17
##  4 2018-02-01      5.94      22
##  5 2018-03-01      5.87      22
##  6 2018-04-01     12.2       27
##  7 2018-05-01      6.15      22
##  8 2018-06-01      5.16      20
##  9 2018-07-01      4.82      12
## 10 2018-08-01      9.37      32
## # ℹ 51 more rows

Visual Summary

Q1 Visualization Analysis

# Exploratory Distribution Visuals
ggplot(coffee, aes(x = `100g_USD`)) +
  geom_histogram(bins = 30, fill = "steelblue", alpha = 0.7) +
  labs(
    title = "Distribution of Coffee Prices (per 100g)",
    x = "Price (USD per 100g)",
    y = "Count"
  )

ggplot(coffee, aes(x = rating)) +
  geom_histogram(bins = 30, fill = "darkgreen", alpha = 0.7) +
  labs(
    title = "Distribution of Coffee Ratings",
    x = "Rating",
    y = "Count"
  )

‘Coffee Price vs Rating’ plot below directly answers Q1 by visualizing the correlation and trend between the variables, showing how ratings subtly rise in relation to higher priced coffee. This insight leads us to believe that people are willing to pay for more expensive coffee if the quality is excellent.

ggplot(coffee, aes(x = `100g_USD`, y = rating)) +
  geom_point(alpha = 0.6) +
  geom_smooth(method = "lm", se = FALSE) +
  labs(
    title = "Coffee Price vs Rating",
    x = "Price (USD per 100g)",
    y = "Rating"
  )
## `geom_smooth()` using formula = 'y ~ x'

Q2 Visualization Analysis

# Visualizing rating & price trends by type of coffee roast
ggplot(coffee, aes(x = `100g_USD`, y = rating, color = roast)) +
  geom_point(alpha = 0.7) +
  geom_smooth(method = "lm", se = FALSE) +
  labs(
    title = "Price vs Rating by Roast Level",
    x = "Price (USD per 100g)",
    y = "Rating",
    color = "Roast"
  )
## `geom_smooth()` using formula = 'y ~ x'