library(readr)
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ forcats 1.0.0 ✔ stringr 1.5.2
## ✔ ggplot2 4.0.0 ✔ tibble 3.3.0
## ✔ lubridate 1.9.4 ✔ tidyr 1.3.1
## ✔ purrr 1.1.0
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(htmltools)
packageVersion("htmltools")
## [1] '0.5.8.1'
library(readxl)
library(tibble)
library(ggplot2)
# Load in data
coffee <- read_csv("~/Downloads/coffee_analysis.csv")
## Rows: 2095 Columns: 12
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (10): name, roaster, roast, loc_country, origin_1, origin_2, review_date...
## dbl (2): 100g_USD, rating
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
# Load in new CSV
coffee_clean <- read_csv("coffee_clean.csv")
## Rows: 2080 Columns: 12
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (10): name, roaster, roast, loc_country, origin_1, origin_2, review_date...
## dbl (2): 100g_USD, rating
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
# Show column types
spec(coffee) # double = numeric
## cols(
## name = col_character(),
## roaster = col_character(),
## roast = col_character(),
## loc_country = col_character(),
## origin_1 = col_character(),
## origin_2 = col_character(),
## `100g_USD` = col_double(),
## rating = col_double(),
## review_date = col_character(),
## desc_1 = col_character(),
## desc_2 = col_character(),
## desc_3 = col_character()
## )
# Build DataFrame with Random Samples
# choose your sampling fraction (must be >= 0.10)
sample_frac <- 0.20
# choose number of samples (3-7)
n_samples <- 3
df_samples <- tibble()
for (sample_i in 1:n_samples) {
df_i <- coffee_clean %>%
sample_n(size = sample_frac * nrow(coffee), replace = TRUE) %>%
mutate(sample_num = sample_i)
df_samples <- bind_rows(df_samples, df_i)
}
# Using 'group_by' to summarize numeric values for the 3 sample groups
df_samples %>%
group_by(sample_num) %>%
summarise(
n = n(),
mean_price = mean(`100g_USD`, na.rm = TRUE),
sd_price = sd(`100g_USD`, na.rm = TRUE),
mean_rating = mean(rating, na.rm = TRUE),
sd_rating = sd(rating, na.rm = TRUE),
min_price = min(`100g_USD`, na.rm = TRUE),
max_price = max(`100g_USD`, na.rm = TRUE)
)
## # A tibble: 3 × 8
## sample_num n mean_price sd_price mean_rating sd_rating min_price max_price
## <int> <int> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 1 419 7.86 7.01 93.1 1.54 1.47 83.8
## 2 2 419 8.98 10.8 93.1 1.41 1.36 128.
## 3 3 419 9.08 10.7 93.1 1.58 0.17 90.8
Numeric Subsamples Insights/Comparisons After testing the numeric results numerous times by adjusting the ‘sample_frac’ and the ‘n_samples’, I landed on comparing 3 samples, each that are 20% of the total DataFrame. This numeric summary of the 3 random subsamples analyzes the average min, mean, max, and sd price as well s the mean and sd rating. Between the 3 subsamples, the ‘max_price’ differentiated most significantly. While the 3 subsamples are quite similar accross the board of these statistical categories, a notable takeaway is the high variability among the extremes between them. Both the ‘min_price’ and ‘max_price’ give insight into the vast difference in extremes, where as the other categories such as ‘mean_rating’ and ‘mean_price’ are very similar for all the subsamples.
Price Distribution
# Group subsamples into quantiles to summarize price distributions
df_samples %>%
group_by(sample_num) %>%
summarise(
Q_1 = quantile(`100g_USD`, 0.25, na.rm = TRUE), # Quantile 1 = 25%
median = median(`100g_USD`, na.rm = TRUE), # Median = 50%
Q_3 = quantile(`100g_USD`, 0.75, na.rm = TRUE) # Quantile 3 = 75%
)
## # A tibble: 3 × 4
## sample_num Q_1 median Q_3
## <int> <dbl> <dbl> <dbl>
## 1 1 4.85 5.64 7.97
## 2 2 4.9 5.73 8.8
## 3 3 4.85 5.73 8.51
Rating Distribution
# Group subsamples into quantiles to summarize rating distributions
df_samples %>%
group_by(sample_num) %>%
summarise(
Q_1 = quantile(`rating`, 0.25, na.rm = TRUE), # Quantile 1 = 25%
median = median(`rating`, na.rm = TRUE), # Median = 50%
Q_3 = quantile(`rating`, 0.75, na.rm = TRUE) # Quantile 3 = 75%
)
## # A tibble: 3 × 4
## sample_num Q_1 median Q_3
## <int> <dbl> <dbl> <dbl>
## 1 1 92 93 94
## 2 2 92 93 94
## 3 3 92 93 94
Quartile Distribution Analysis To summarize the quartiles for continuous variariables for the subsamples, being the price and rating, I grouped the subsamples into 3 quantiles (Q_1, Median, Q_3). These 3 quantiles enables a comparison analysis to happen. There was variance evident in the price distribution between the 3 subsamples as subsample 1 has the lowest coffee prices while subsample 3 has the highest, based on each quantile. There is no anomaly in these subsamples regarding either distribution as the results are quite similiar and consistent. Especially in the rating distribution, the quantiles are the exact same for all 3 subsamples, which means the subsamples are quite evenly divided.
# Analyze correlation between subsamples for price and rating
df_samples %>%
group_by(sample_num) %>%
summarise(
correlation = cor(`100g_USD`, rating, use = "complete.obs")
)
## # A tibble: 3 × 2
## sample_num correlation
## <int> <dbl>
## 1 1 0.293
## 2 2 0.228
## 3 3 0.229
Correlaton Analysis Insights/Conclusions The lower numbers above suggest low correlation between coffee price and rating in the subsamples, and the quantile distribution analysis supports this. Despite prices slightly varying between subsamples shown in the price distribution, the subsamples still all average the exact same rating for each of their subsamples, suggesting that their is weak correlation. For their to be strong correlation between the subsamples regarding price and rating, the rating distribution analysis would need to more closely follow the price distribution trend, in which it does not in this current scenario.
Compare categorical variables across subsamples
# Compare 'roast' counts and proportions between the subsamples
df_samples %>%
group_by(sample_num, roast) %>%
summarise(count = n(), .groups = "drop_last") %>%
mutate(prop = count / sum(count)) # prop is proportion of total sample count
## # A tibble: 14 × 4
## # Groups: sample_num [3]
## sample_num roast count prop
## <int> <chr> <int> <dbl>
## 1 1 Dark 1 0.00239
## 2 1 Light 58 0.138
## 3 1 Medium 46 0.110
## 4 1 Medium-Dark 5 0.0119
## 5 1 Medium-Light 309 0.737
## 6 2 Dark 1 0.00239
## 7 2 Light 68 0.162
## 8 2 Medium 54 0.129
## 9 2 Medium-Dark 3 0.00716
## 10 2 Medium-Light 293 0.699
## 11 3 Light 67 0.160
## 12 3 Medium 45 0.107
## 13 3 Medium-Dark 10 0.0239
## 14 3 Medium-Light 297 0.709
# Compare 'loc_country' counts and proportions between the subsamples
df_samples %>%
group_by(sample_num, loc_country) %>%
summarise(count = n(), .groups = "drop_last") %>%
mutate(prop = count / sum(count)) # prop is proportion of total sample count
## # A tibble: 28 × 4
## # Groups: sample_num [3]
## sample_num loc_country count prop
## <int> <chr> <int> <dbl>
## 1 1 Canada 5 0.0119
## 2 1 Guatemala 8 0.0191
## 3 1 Hawai'i 13 0.0310
## 4 1 Hong Kong 2 0.00477
## 5 1 Japan 1 0.00239
## 6 1 Malaysia 2 0.00477
## 7 1 Peru 3 0.00716
## 8 1 Taiwan 120 0.286
## 9 1 United States 265 0.632
## 10 2 Canada 6 0.0143
## # ℹ 18 more rows
Categorical Variable Insights/Analysis The two categorical variables I analyzed from the subsamples are ‘roast’ and ‘loc_country’. I wanted to gain insight into how category distributions vary across subsamples. In addition to ‘count’, ‘prop’ helped me better undestanding the subsamples for these two categories by taking the fraction of the count based on the total for each subsample. For instance, I learned that ‘Medium-Light’ roasts dominate each subsample as they make up over 70% of the total observations in all three of them. This likely reflected the whole dataset as ‘Medium-Light’ roasts dominate the overall reviews as the most popular roast. There was more distribution evident in the location country in which the coffee is brewed between the subsamples, although The United States and Taiwan dominate the count and prop consistently across the board. Overall, these categorical summaries provide great insight to the dataset as a whole, as if a variable is consistent across each subsample, it is likely reflected to be true in representing the whole ‘coffee’ dataset.