library(readr)
library(dplyr)
library(ggplot2)
library(tidyr)

# Load the cleaned survey export into `mydata` and print a per-column
# summary (quartiles, mean, NA counts) as a quick sanity check.
mydata <- read_csv(file = "coffee_survey_clean.csv")
mydata %>% summary()
##    visit_freq    shops_visited   preferred_shop  local_shop_desc
##  Min.   :1.000   Min.   :1.000   Min.   :1.000   Min.   :1.000  
##  1st Qu.:3.000   1st Qu.:1.000   1st Qu.:1.000   1st Qu.:1.500  
##  Median :3.000   Median :1.000   Median :2.000   Median :2.000  
##  Mean   :3.148   Mean   :1.185   Mean   :2.519   Mean   :2.148  
##  3rd Qu.:4.000   3rd Qu.:1.000   3rd Qu.:4.000   3rd Qu.:2.000  
##  Max.   :5.000   Max.   :2.000   Max.   :4.000   Max.   :5.000  
##                                                                 
##  drink_specialty drink_cold_brew drink_hot_drip   drink_energy  
##  Min.   :1       Min.   :1       Min.   :1.000   Min.   :1.000  
##  1st Qu.:1       1st Qu.:1       1st Qu.:1.000   1st Qu.:2.000  
##  Median :2       Median :2       Median :1.000   Median :3.000  
##  Mean   :2       Mean   :2       Mean   :1.762   Mean   :2.474  
##  3rd Qu.:3       3rd Qu.:3       3rd Qu.:2.000   3rd Qu.:3.000  
##  Max.   :3       Max.   :4       Max.   :4.000   Max.   :4.000  
##  NA's   :4       NA's   :4       NA's   :6       NA's   :8      
##   spend_coffee    spend_per_visit      age            gender        employment
##  Min.   : 2.000   Min.   :2.000   Min.   :2.000   Min.   :1.000   Min.   :1   
##  1st Qu.: 7.000   1st Qu.:3.000   1st Qu.:2.000   1st Qu.:1.500   1st Qu.:1   
##  Median : 8.000   Median :3.000   Median :2.000   Median :2.000   Median :2   
##  Mean   : 9.385   Mean   :2.963   Mean   :2.333   Mean   :1.815   Mean   :2   
##  3rd Qu.:10.750   3rd Qu.:3.000   3rd Qu.:2.000   3rd Qu.:2.000   3rd Qu.:3   
##  Max.   :20.000   Max.   :4.000   Max.   :5.000   Max.   :4.000   Max.   :4   
##  NA's   :1
# Bar chart of how many respondents gave each visit-frequency answer.
# visit_freq is wrapped in factor() so every answer code gets its own bar
# instead of being placed on a continuous axis.
mydata %>%
  ggplot(mapping = aes(x = factor(visit_freq))) +
  geom_bar(fill = "steelblue") +
  ggtitle("How Often Do You Visit a Coffee Shop?") +
  xlab("Visit Frequency (1=Never … 5=Daily)") +
  ylab("Count") +
  theme_minimal()

# Display names for the preferred_shop answer codes, keyed by the code as a
# string so scale_x_discrete() can match them to the factor levels by name.
shop_labels <- setNames(
  c("Starbucks", "Dutch Bros", "Dunkin'", "Local/Other"),
  c("1", "2", "3", "4")
)

# Bar chart of respondents per preferred shop, with codes shown as names.
mydata %>%
  ggplot(mapping = aes(x = factor(preferred_shop))) +
  geom_bar(fill = "coral") +
  scale_x_discrete(labels = shop_labels) +
  ggtitle("Preferred Coffee Shop") +
  xlab("Coffee Shop") +
  ylab("Count") +
  theme_minimal()

# Histogram of spend_coffee using bins of width 2.
# spend_coffee contains a missing response; na.rm = TRUE drops it explicitly
# so the plot renders without ggplot2's "removed rows containing non-finite
# values" warning.
# NOTE(review): the title says "per Visit" but the plotted column is
# spend_coffee, and the data also has a separate spend_per_visit column --
# confirm the title matches the survey question behind spend_coffee.
ggplot(mydata, aes(x = spend_coffee)) +
  geom_histogram(
    binwidth = 2,
    fill = "darkgreen",
    color = "white",
    na.rm = TRUE
  ) +
  labs(
    title = "Distribution of Typical Spend per Visit",
    x = "Dollars Spent",
    y = "Count"
  ) +
  theme_minimal()

# Reshape the four drink-frequency columns into long format: one row per
# answered (respondent, drink type) pair, with the source column name in
# `drink_type` and the answer in `frequency`.
# values_drop_na = TRUE discards unanswered items during the pivot, so no
# separate NA filter is needed afterwards.
# tidyr (for pivot_longer) is attached with the other packages at the top of
# the script.
drink_data <- mydata %>%
  select(drink_specialty, drink_cold_brew, drink_hot_drip, drink_energy) %>%
  pivot_longer(
    cols = everything(),
    names_to = "drink_type",
    values_to = "frequency",
    values_drop_na = TRUE
  )

# Side-by-side boxplots comparing the answer distribution of each drink type.
# The legend is hidden because the fill colour only repeats the x axis.
ggplot(drink_data, aes(x = drink_type, y = frequency, fill = drink_type)) +
  geom_boxplot() +
  labs(
    title = "Typical Weekly Drink Frequency by Type",
    x = "Drink Type",
    y = "Frequency"
  ) +
  theme_minimal() +
  theme(legend.position = "none")

# Segment respondents with hierarchical clustering (Ward linkage on
# standardized answers).
#
# Only respondents who answered every clustering question can be used, so
# incomplete rows are dropped. Each kept row carries its original row number
# in `mydata` as its row name; that label propagates through scale(), dist(),
# hclust() and cutree(), so the dendrogram leaves and the names of `groups_3`
# identify the surveyed respondent rather than a position in the filtered
# subset.
cluster_cols <- c(
  "visit_freq", "drink_specialty", "drink_cold_brew", "drink_hot_drip",
  "drink_energy", "spend_coffee"
)
complete_rows <- complete.cases(mydata[, cluster_cols])
# Plain data.frame: row names are not supported on tibbles.
cluster_vars <- as.data.frame(mydata[complete_rows, cluster_cols])
rownames(cluster_vars) <- as.character(which(complete_rows))

# Centre and scale every variable so that columns measured on a wider numeric
# range do not dominate the Euclidean distances.
cluster_scaled <- scale(cluster_vars, center = TRUE, scale = TRUE)
dist_matrix <- dist(cluster_scaled)
seg_hclust  <- hclust(dist_matrix, method = "ward.D2")

# Number of segments. Defined once so the boxes drawn on the dendrogram and
# the cluster assignment below always use the same cut.
n_clusters <- 3

plot(seg_hclust, main = "Hierarchical Clustering Dendrogram",
     xlab = "Respondents", ylab = "Height", sub = "")
rect.hclust(seg_hclust, k = n_clusters, border = "red")

# Cluster membership per respondent, followed by the size of each cluster.
groups_3 <- cutree(seg_hclust, k = n_clusters)
table(groups_3)
## groups_3
## 1 2 3 
## 3 6 8
# Mean of every clustering variable within each cluster, printed to two
# decimal places for the segment profile.
cluster_means <- aggregate(
  x = cluster_vars,
  by = list(Cluster = groups_3),
  FUN = mean
)
print(round(cluster_means, digits = 2))
##   Cluster visit_freq drink_specialty drink_cold_brew drink_hot_drip
## 1       1        2.0            2.00            1.67           3.33
## 2       2        3.5            1.50            1.33           1.33
## 3       3        3.5            2.62            2.88           1.50
##   drink_energy spend_coffee
## 1         3.00        11.67
## 2         2.17        10.33
## 3         2.38         7.38
# Median of every clustering variable within each cluster, as a companion to
# the per-cluster means.
cluster_medians <- aggregate(
  x = cluster_vars,
  by = list(Cluster = groups_3),
  FUN = median
)
print(cluster_medians)
##   Cluster visit_freq drink_specialty drink_cold_brew drink_hot_drip
## 1       1        2.0             2.0               2              3
## 2       2        3.5             1.5               1              1
## 3       3        4.0             3.0               3              1
##   drink_energy spend_coffee
## 1            3          9.0
## 2            2         10.0
## 3            3          7.5
# Export the cluster assignment and both cluster profile tables as CSV files
# in the current working directory.
write.csv(x = groups_3, file = "clusterID.csv")
write.csv(x = cluster_means, file = "cluster_means.csv")
write.csv(x = cluster_medians, file = "cluster_medians.csv")