library(readr)
library(dplyr)
library(ggplot2)
# Load the cleaned coffee survey responses and print a per-column summary
# (the summary output is pasted in the comment block that follows).
mydata <- read_csv(file = "coffee_survey_clean.csv")
mydata %>% summary()
## visit_freq shops_visited preferred_shop local_shop_desc
## Min. :1.000 Min. :1.000 Min. :1.000 Min. :1.000
## 1st Qu.:3.000 1st Qu.:1.000 1st Qu.:1.000 1st Qu.:1.500
## Median :3.000 Median :1.000 Median :2.000 Median :2.000
## Mean :3.148 Mean :1.185 Mean :2.519 Mean :2.148
## 3rd Qu.:4.000 3rd Qu.:1.000 3rd Qu.:4.000 3rd Qu.:2.000
## Max. :5.000 Max. :2.000 Max. :4.000 Max. :5.000
##
## drink_specialty drink_cold_brew drink_hot_drip drink_energy
## Min. :1 Min. :1 Min. :1.000 Min. :1.000
## 1st Qu.:1 1st Qu.:1 1st Qu.:1.000 1st Qu.:2.000
## Median :2 Median :2 Median :1.000 Median :3.000
## Mean :2 Mean :2 Mean :1.762 Mean :2.474
## 3rd Qu.:3 3rd Qu.:3 3rd Qu.:2.000 3rd Qu.:3.000
## Max. :3 Max. :4 Max. :4.000 Max. :4.000
## NA's :4 NA's :4 NA's :6 NA's :8
## spend_coffee spend_per_visit age gender employment
## Min. : 2.000 Min. :2.000 Min. :2.000 Min. :1.000 Min. :1
## 1st Qu.: 7.000 1st Qu.:3.000 1st Qu.:2.000 1st Qu.:1.500 1st Qu.:1
## Median : 8.000 Median :3.000 Median :2.000 Median :2.000 Median :2
## Mean : 9.385 Mean :2.963 Mean :2.333 Mean :1.815 Mean :2
## 3rd Qu.:10.750 3rd Qu.:3.000 3rd Qu.:2.000 3rd Qu.:2.000 3rd Qu.:3
## Max. :20.000 Max. :4.000 Max. :5.000 Max. :4.000 Max. :4
## NA's :1
# Bar chart of how many respondents fall in each visit-frequency code.
# visit_freq is wrapped in factor() so each code gets its own discrete bar.
mydata %>%
  ggplot(mapping = aes(x = factor(visit_freq))) +
  geom_bar(fill = "steelblue") +
  ggtitle("How Often Do You Visit a Coffee Shop?") +
  xlab("Visit Frequency (1=Never … 5=Daily)") +
  ylab("Count") +
  theme_minimal()

# Lookup from the numeric preferred_shop code (as a string) to a display name,
# used to relabel the x axis of the bar chart below.
shop_labels <- setNames(
  c("Starbucks", "Dutch Bros", "Dunkin'", "Local/Other"),
  c("1", "2", "3", "4")
)

# Bar chart of respondent counts per preferred coffee shop.
mydata %>%
  ggplot(mapping = aes(x = factor(preferred_shop))) +
  geom_bar(fill = "coral") +
  scale_x_discrete(labels = shop_labels) +
  ggtitle("Preferred Coffee Shop") +
  xlab("Coffee Shop") +
  ylab("Count") +
  theme_minimal()

# Histogram of spend_coffee in 2-unit bins.
# Missing values are dropped explicitly (as the drink-frequency plot does)
# rather than relying on ggplot2 to discard them with a warning.
# NOTE(review): the title says "per Visit" but the plotted column is
# spend_coffee, while the data also has a spend_per_visit column -- confirm
# which variable/title is intended.
mydata %>%
  filter(!is.na(spend_coffee)) %>%
  ggplot(aes(x = spend_coffee)) +
  geom_histogram(binwidth = 2, fill = "darkgreen", color = "white") +
  labs(
    title = "Distribution of Typical Spend per Visit",
    x = "Dollars Spent",
    y = "Count"
  ) +
  theme_minimal()

library(tidyr)
# Reshape the four drink-frequency columns to long form: one row per
# (respondent, drink type) answer, with unanswered (NA) items dropped.
drink_data <- mydata %>%
  select(drink_specialty, drink_cold_brew, drink_hot_drip, drink_energy) %>%
  pivot_longer(
    cols = everything(),
    names_to = "drink_type",
    values_to = "frequency",
    values_drop_na = TRUE
  )

# Side-by-side boxplots of the frequency answers for each drink type.
# The fill legend is hidden because it would only repeat the x-axis labels.
drink_data %>%
  ggplot(mapping = aes(x = drink_type, y = frequency, fill = drink_type)) +
  geom_boxplot() +
  ggtitle("Typical Weekly Drink Frequency by Type") +
  xlab("Drink Type") +
  ylab("Frequency") +
  theme_minimal() +
  theme(legend.position = "none")

# Segmentation inputs: visit frequency, the four drink-frequency items and
# coffee spend. Respondents with a missing value in any of these are dropped.
cluster_vars <- na.omit(
  select(
    mydata,
    visit_freq, drink_specialty, drink_cold_brew, drink_hot_drip,
    drink_energy, spend_coffee
  )
)

# Standardise each column (scale() centres and scales by default) so that no
# single variable dominates the distance calculation.
cluster_scaled <- scale(cluster_vars)

# Pairwise distances between respondents, then Ward (ward.D2) agglomerative
# clustering on those distances.
dist_matrix <- dist(cluster_scaled)
seg_hclust <- hclust(dist_matrix, method = "ward.D2")

# Dendrogram with the 3-cluster cut outlined in red.
plot(
  seg_hclust,
  main = "Hierarchical Clustering Dendrogram",
  xlab = "Respondents",
  ylab = "Height",
  sub = ""
)
rect.hclust(seg_hclust, k = 3, border = "red")

# Cut the dendrogram into three clusters and show how many respondents
# landed in each one.
groups_3 <- cutree(tree = seg_hclust, k = 3)
table(groups_3)
## groups_3
## 1 2 3
## 3 6 8
# Per-cluster mean of every clustering variable (on the original, unscaled
# values), printed rounded to two decimals.
cluster_means <- aggregate(
  x = cluster_vars,
  by = list(Cluster = groups_3),
  FUN = mean
)
print(round(cluster_means, digits = 2))
## Cluster visit_freq drink_specialty drink_cold_brew drink_hot_drip
## 1 1 2.0 2.00 1.67 3.33
## 2 2 3.5 1.50 1.33 1.33
## 3 3 3.5 2.62 2.88 1.50
## drink_energy spend_coffee
## 1 3.00 11.67
## 2 2.17 10.33
## 3 2.38 7.38
# Per-cluster median of every clustering variable (on the original, unscaled
# values).
cluster_medians <- aggregate(
  x = cluster_vars,
  by = list(Cluster = groups_3),
  FUN = median
)
print(cluster_medians)
## Cluster visit_freq drink_specialty drink_cold_brew drink_hot_drip
## 1 1 2.0 2.0 2 3
## 2 2 3.5 1.5 1 1
## 3 3 4.0 3.0 3 1
## drink_energy spend_coffee
## 1 3 9.0
## 2 2 10.0
## 3 3 7.5
# Export cluster membership and the per-cluster profiles.
#
# cluster_vars had incomplete rows removed by na.omit(), so groups_3 is indexed
# by position among the complete cases only. Writing it directly labels rows
# 1..n, which does not match rows of mydata and makes it impossible to join
# the cluster ID back to the survey. Recover the original mydata row index of
# each clustered respondent and use it as the row label instead.
clustered_rows <- which(complete.cases(mydata[names(cluster_vars)]))

# Guard against the two objects drifting apart (e.g. if mydata was modified
# after cluster_vars was built).
stopifnot(length(clustered_rows) == length(groups_3))

# The file shape is kept as before (row-label column + value column "x") so
# existing readers keep working; only the row labels change meaning: they are
# now row numbers in mydata.
cluster_ids <- data.frame(x = as.vector(groups_3), row.names = clustered_rows)
write.csv(cluster_ids, "clusterID.csv")
write.csv(cluster_means, "cluster_means.csv")
write.csv(cluster_medians, "cluster_medians.csv")