library(readr)
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ forcats 1.0.0 ✔ stringr 1.5.2
## ✔ ggplot2 4.0.0 ✔ tibble 3.3.0
## ✔ lubridate 1.9.4 ✔ tidyr 1.3.1
## ✔ purrr 1.1.0
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(htmltools)
packageVersion("htmltools")
## [1] '0.5.8.1'
library(readxl)
library(tibble)
library(ggplot2)
# Load in data
coffee <- read_csv("~/Downloads/coffee_analysis.csv")
## Rows: 2095 Columns: 12
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (10): name, roaster, roast, loc_country, origin_1, origin_2, review_date...
## dbl (2): 100g_USD, rating
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
# Show column types
spec(coffee) # double = numeric
## cols(
## name = col_character(),
## roaster = col_character(),
## roast = col_character(),
## loc_country = col_character(),
## origin_1 = col_character(),
## origin_2 = col_character(),
## `100g_USD` = col_double(),
## rating = col_double(),
## review_date = col_character(),
## desc_1 = col_character(),
## desc_2 = col_character(),
## desc_3 = col_character()
## )
summary(coffee[, sapply(coffee, is.numeric)])
## 100g_USD rating
## Min. : 0.120 Min. :84.00
## 1st Qu.: 4.930 1st Qu.:92.00
## Median : 5.860 Median :93.00
## Mean : 9.323 Mean :93.11
## 3rd Qu.: 8.785 3rd Qu.:94.00
## Max. :132.280 Max. :98.00
# Output: Min/Max, Mean & Median (quartiles), 1st & 3rd quartiles
numeric_summary <- sapply(
coffee[, sapply(coffee, is.numeric)],
function(x) c(
min = min(x, na.rm = TRUE),
max = max(x, na.rm = TRUE),
mean = mean(x, na.rm = TRUE),
median = median(x, na.rm = TRUE),
sd = sd(x, na.rm = TRUE),
quantiles = quantile(x, probs = c(0.25, 0.5, 0.75), na.rm = TRUE)
)
)
numeric_summary
## 100g_USD rating
## min 0.120000 84.000000
## max 132.280000 98.000000
## mean 9.323313 93.114081
## median 5.860000 93.000000
## sd 11.430659 1.563024
## quantiles.25% 4.930000 92.000000
## quantiles.50% 5.860000 93.000000
## quantiles.75% 8.785000 94.000000
# Output: Mean, SD, quantiles
categorical_summary <- lapply(
coffee[, sapply(coffee, is.factor) | sapply(coffee, is.character)],
function(x) table(x)
)
# Created DataFrame 'categorical_summary'
# Output: Unique values and counts
# Numeric
coffee %>%
summarise(across(
where(is.numeric),
list(
min = ~min(.x, na.rm = TRUE),
max = ~max(.x, na.rm = TRUE),
mean = ~mean(.x, na.rm = TRUE),
median = ~median(.x, na.rm = TRUE),
q25 = ~quantile(.x, 0.25, na.rm = TRUE),
q75 = ~quantile(.x, 0.75, na.rm = TRUE)
)
))
## # A tibble: 1 × 12
## `100g_USD_min` `100g_USD_max` `100g_USD_mean` `100g_USD_median` `100g_USD_q25`
## <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 0.12 132. 9.32 5.86 4.93
## # ℹ 7 more variables: `100g_USD_q75` <dbl>, rating_min <dbl>, rating_max <dbl>,
## # rating_mean <dbl>, rating_median <dbl>, rating_q25 <dbl>, rating_q75 <dbl>
# Categorical
coffee %>%
summarise(across(
where(~is.factor(.x) | is.character(.x)),
~list(table(.x))
))
## # A tibble: 1 × 10
## name roaster roast loc_country origin_1 origin_2 review_date
## <list> <list> <list> <list> <list> <list> <list>
## 1 <table[…]> <table[…]> <table[…]> <table[…]> <table[…]> <table[…]> <table[…]>
## # ℹ 3 more variables: desc_1 <list>, desc_2 <list>, desc_3 <list>
‘cor_price_rating’ is correlation between coffee price and rating. 0 = No relationship, 0.25 = Weak, 0.75 = Strong. The correlation between price and rating in this coffee reviews data set is a weak correlation, as the cor_price_rating = 0.259.
coffee %>%
summarise(
avg_price = mean(`100g_USD`, na.rm = TRUE),
avg_rating = mean(rating, na.rm = TRUE),
cor_price_rating = cor(`100g_USD`, rating, use = "complete.obs")
)
## # A tibble: 1 × 3
## avg_price avg_rating cor_price_rating
## <dbl> <dbl> <dbl>
## 1 9.32 93.1 0.260
The highest rated coffe roast world wide gathered from this data set is a light roast, earning an average rating of 93.5 out of 100. This tibble does show a strong correlation between rating and price when arranging by type of roast, The two highest rated roasts were the most expensive coffees while dark roasts received the lowest overall rating, making it the cheapest to purchase.
coffee %>%
group_by(roast) %>%
summarise(
avg_price = mean(`100g_USD`, na.rm = TRUE),
avg_rating = mean(rating, na.rm = TRUE),
n = n()
) %>%
arrange(desc(avg_rating))
## # A tibble: 6 × 4
## roast avg_price avg_rating n
## <chr> <dbl> <dbl> <int>
## 1 <NA> 42.2 93.5 15
## 2 Light 11.4 93.5 287
## 3 Medium-Light 8.99 93.2 1490
## 4 Medium 7.06 92.3 259
## 5 Medium-Dark 9.51 91.9 39
## 6 Dark 4.46 88.2 5
This question aims to provide insights into where the highest rated coffee beans are grown. The broader goal is to discover the top regions around the world where coffee is grown, based on aggregated ratings from this data set.
coffee %>%
group_by(origin_1) %>%
summarise(
avg_rating = mean(rating, na.rm = TRUE),
count = n()
) %>%
filter(count >= 5) %>% # avoids tiny samples
arrange(desc(avg_rating))
## # A tibble: 88 × 3
## origin_1 avg_rating count
## <chr> <dbl> <int>
## 1 Chiriqui Province 94.9 7
## 2 Gedeo Zone 94.6 5
## 3 Kirinyaga 94.6 5
## 4 Boquete Growing Region 94.6 39
## 5 Piedra Candela 94.6 9
## 6 Kiambu Growing Region 94.5 12
## 7 Sidama Growing Region 94.4 14
## 8 Alto Quiel 94.3 6
## 9 Embu County 94.3 6
## 10 Holualoa 94.3 41
## # ℹ 78 more rows
The price of coffee appears to be steadily rising. This data set contains coffee reviews from 2017-2022. In 2017, the average cost of coffee was $7.87 and in 2022, the average cost increased to $9.27, after eclipsing an average of $10 throughout 2021. Aggregating the price of coffee by month gives more insight into increased prices over time, although there is higher fluctuation in monthly prices, resulting in the yearly aggegrating providing a more steady analysis.
# Converting the review date to date to give R a full date to process in order to successfully aggregate the average cost of coffee over time
coffee <- coffee %>%
mutate(review_date = as.Date(paste0("01 ", review_date), format = "%d %B %Y"),
year = year(review_date))
# Aggregating average coffee price by year
price_by_year <- coffee %>%
group_by(year) %>%
summarise(
avg_price = mean(`100g_USD`, na.rm = TRUE),
coffees = n()
) %>%
arrange(year)
price_by_year
## # A tibble: 6 × 3
## year avg_price coffees
## <dbl> <dbl> <int>
## 1 2017 7.87 68
## 2 2018 8.22 297
## 3 2019 9.29 291
## 4 2020 9.43 426
## 5 2021 10.1 543
## 6 2022 9.27 470
# Aggregating average coffee price by month and year
price_by_month <- coffee %>%
mutate(year_month = floor_date(review_date, "month")) %>%
group_by(year_month) %>%
summarise(
avg_price = mean(`100g_USD`, na.rm = TRUE),
coffees = n()
) %>%
arrange(year_month)
price_by_month
## # A tibble: 61 × 3
## year_month avg_price coffees
## <date> <dbl> <int>
## 1 2017-11-01 6.97 40
## 2 2017-12-01 9.15 28
## 3 2018-01-01 19.4 17
## 4 2018-02-01 5.94 22
## 5 2018-03-01 5.87 22
## 6 2018-04-01 12.2 27
## 7 2018-05-01 6.15 22
## 8 2018-06-01 5.16 20
## 9 2018-07-01 4.82 12
## 10 2018-08-01 9.37 32
## # ℹ 51 more rows
Q1 Visualization Analysis
# Exploratory Distribution Visuals
ggplot(coffee, aes(x = `100g_USD`)) +
geom_histogram(bins = 30, fill = "steelblue", alpha = 0.7) +
labs(
title = "Distribution of Coffee Prices (per 100g)",
x = "Price (USD per 100g)",
y = "Count"
)
ggplot(coffee, aes(x = rating)) +
geom_histogram(bins = 30, fill = "darkgreen", alpha = 0.7) +
labs(
title = "Distribution of Coffee Ratings",
x = "Rating",
y = "Count"
)
‘Coffee Price vs Rating’ plot below directly answers Q1 by visualizing the correlation and trend between the variables, showing how ratings subtly rise in relation to higher priced coffee. This insight leads us to believe that people are willing to pay for more expensive coffee if the quality is excellent.
ggplot(coffee, aes(x = `100g_USD`, y = rating)) +
geom_point(alpha = 0.6) +
geom_smooth(method = "lm", se = FALSE) +
labs(
title = "Coffee Price vs Rating",
x = "Price (USD per 100g)",
y = "Rating"
)
## `geom_smooth()` using formula = 'y ~ x'
Q2 Visualization Analysis
# Visualizing rating & price trends by type of coffee roast
ggplot(coffee, aes(x = `100g_USD`, y = rating, color = roast)) +
geom_point(alpha = 0.7) +
geom_smooth(method = "lm", se = FALSE) +
labs(
title = "Price vs Rating by Roast Level",
x = "Price (USD per 100g)",
y = "Rating",
color = "Roast"
)
## `geom_smooth()` using formula = 'y ~ x'