Data Preparation

# Create synthetic yogurt beverage consumer data.
#
# Builds `yogurt_data`, one row per simulated survey respondent, then
# overwrites selected columns for three equally sized row ranges so that
# downstream clustering has planted structure to recover. The remaining rows
# keep their uniformly random answers.
set.seed(456)
n_consumers <- 300

# Each planted segment covers one quarter of the respondents; the last
# quarter is left unpatterned. Deriving the size from n_consumers keeps the
# row ranges below valid if the sample size is changed.
segment_size <- n_consumers %/% 4
health_rows <- seq_len(segment_size)
taste_rows <- segment_size + seq_len(segment_size)
convenience_rows <- 2 * segment_size + seq_len(segment_size)

# Create synthetic consumer survey data with basic structure.
# NOTE: the order of the sample() calls determines the random stream, so
# reordering columns here changes the generated data.
yogurt_data <- data.frame(
  # Basic demographics and consumption variables
  age = sample(18:65, n_consumers, replace = TRUE),
  gender = sample(c("Male", "Female", "Non-binary"), n_consumers,
                  replace = TRUE, prob = c(0.45, 0.50, 0.05)),
  consumption_frequency = sample(1:5, n_consumers, replace = TRUE),
  # 0/1 indicators; prob is given as c(P(0), P(1))
  morning_consumption = sample(0:1, n_consumers, replace = TRUE,
                               prob = c(0.4, 0.6)),
  afternoon_consumption = sample(0:1, n_consumers, replace = TRUE,
                                 prob = c(0.5, 0.5)),
  evening_consumption = sample(0:1, n_consumers, replace = TRUE,
                               prob = c(0.7, 0.3)),

  # Preferences (all on 1-5 scale)
  pref_fruit = sample(1:5, n_consumers, replace = TRUE),
  pref_vanilla = sample(1:5, n_consumers, replace = TRUE),
  pref_chocolate = sample(1:5, n_consumers, replace = TRUE),
  pref_coffee = sample(1:5, n_consumers, replace = TRUE),
  pref_exotic = sample(1:5, n_consumers, replace = TRUE),

  # Nutritional priorities and purchase drivers
  importance_protein = sample(1:5, n_consumers, replace = TRUE),
  importance_low_sugar = sample(1:5, n_consumers, replace = TRUE),
  importance_probiotics = sample(1:5, n_consumers, replace = TRUE),
  importance_natural = sample(1:5, n_consumers, replace = TRUE),
  driver_taste = sample(1:5, n_consumers, replace = TRUE),
  driver_health = sample(1:5, n_consumers, replace = TRUE),
  driver_convenience = sample(1:5, n_consumers, replace = TRUE),
  driver_price = sample(1:5, n_consumers, replace = TRUE),
  driver_packaging = sample(1:5, n_consumers, replace = TRUE)
)

# Create patterns for clusters: within each segment the listed columns are
# replaced by high ratings (4 or 5). The number of draws is computed from the
# segment size and the column list so the two cannot drift apart.

# Health-conscious consumers
health_cols <- c("importance_protein", "importance_low_sugar",
                 "importance_probiotics", "importance_natural",
                 "driver_health")
yogurt_data[health_rows, health_cols] <-
  sample(4:5, segment_size * length(health_cols), replace = TRUE)

# Taste-driven consumers
taste_cols <- c("pref_fruit", "pref_vanilla", "driver_taste", "driver_price")
yogurt_data[taste_rows, taste_cols] <-
  sample(4:5, segment_size * length(taste_cols), replace = TRUE)

# Convenience-focused consumers (also all marked as morning consumers)
convenience_cols <- c("driver_convenience", "driver_packaging")
yogurt_data[convenience_rows, convenience_cols] <-
  sample(4:5, segment_size * length(convenience_cols), replace = TRUE)
yogurt_data[convenience_rows, "morning_consumption"] <- 1

# Convert categorical variables to factors
yogurt_data$gender <- as.factor(yogurt_data$gender)
yogurt_data$morning_consumption <- as.factor(yogurt_data$morning_consumption)
yogurt_data$afternoon_consumption <-
  as.factor(yogurt_data$afternoon_consumption)
yogurt_data$evening_consumption <- as.factor(yogurt_data$evening_consumption)

Cluster Analysis

# Number of consumer segments to extract. Defined once so the marker on the
# elbow plot and the k-means fit below always agree.
k <- 4

# Select variables for clustering: consumption frequency plus every
# preference, importance and driver rating column.
cluster_vars <- yogurt_data %>%
  select(consumption_frequency,
         starts_with("pref_"),
         starts_with("importance_"),
         starts_with("driver_"))

# Scale the data (scale() centers and scales each column by default)
cluster_data_scaled <- scale(cluster_vars)

# Determine optimal number of clusters with elbow method; the dashed vertical
# line marks the chosen k.
# NOTE(review): the RNG is seeded only here, so the kmeans() call below
# presumably continues from whatever state fviz_nbclust() leaves behind --
# changing this plot call may therefore change the cluster assignment.
set.seed(123)
fviz_nbclust(cluster_data_scaled, kmeans, method = "wss") +
  geom_vline(xintercept = k, linetype = 2) +
  labs(title = "Elbow Method for Optimal k")

# Perform k-means clustering with the chosen k, keeping the best of 25
# random starts, and store the assignment on the survey data as a factor.
km_result <- kmeans(cluster_data_scaled, centers = k, nstart = 25)
yogurt_data$cluster <- as.factor(km_result$cluster)

# PCA visualization of clusters
fviz_cluster(list(data = cluster_data_scaled, cluster = km_result$cluster),
             ellipse.type = "convex",
             repel = TRUE,
             ggtheme = theme_minimal(),
             main = "Consumer Segments for Yogurt Beverages")

Key Cluster Insights

# Flavor preferences by cluster: mean rating of each pref_* column per
# cluster, drawn as dodged bars (one bar per cluster within each flavor).
yogurt_data %>%
  group_by(cluster) %>%
  # Extra arguments passed through across(...) are deprecated since
  # dplyr 1.1.0, so na.rm is supplied inside an explicit function instead.
  summarise(across(
    starts_with("pref_"),
    function(x) mean(x, na.rm = TRUE)
  )) %>%
  pivot_longer(-cluster, names_to = "flavor", values_to = "rating") %>%
  # Strip the column prefix so axis labels read "fruit", "vanilla", ...
  mutate(flavor = str_replace(flavor, "pref_", "")) %>%
  ggplot(aes(x = flavor, y = rating, fill = cluster)) +
  geom_col(position = position_dodge()) +
  scale_fill_brewer(palette = "Set2") +
  theme_minimal() +
  theme(axis.text.x = element_text(angle = 45, hjust = 1)) +
  labs(title = "Flavor Preferences by Consumer Segment",
       x = "Flavor", y = "Average Rating (1-5)")

# Purchase drivers by cluster: mean rating of each driver_* column per
# cluster, drawn as dodged bars (one bar per cluster within each driver).
yogurt_data %>%
  group_by(cluster) %>%
  # Extra arguments passed through across(...) are deprecated since
  # dplyr 1.1.0, so na.rm is supplied inside an explicit function instead.
  summarise(across(
    starts_with("driver_"),
    function(x) mean(x, na.rm = TRUE)
  )) %>%
  pivot_longer(-cluster, names_to = "driver", values_to = "rating") %>%
  # Strip the column prefix so axis labels read "taste", "health", ...
  mutate(driver = str_replace(driver, "driver_", "")) %>%
  ggplot(aes(x = driver, y = rating, fill = cluster)) +
  geom_col(position = position_dodge()) +
  scale_fill_brewer(palette = "Set2") +
  theme_minimal() +
  theme(axis.text.x = element_text(angle = 45, hjust = 1)) +
  labs(title = "Purchase Drivers by Consumer Segment",
       x = "Purchase Driver", y = "Average Importance (1-5)")

# Cluster profiles across a hand-picked subset of variables.
# Despite the "radar" naming, the plot drawn below is a line profile (one
# line per cluster across the variables), not a radar chart.
radar_vars <- c("consumption_frequency",
                "pref_fruit", "pref_vanilla", "pref_chocolate",
                "importance_protein", "importance_low_sugar",
                "driver_taste", "driver_health", "driver_convenience")

# Calculate cluster means.
# Extra arguments passed through across(...) are deprecated since
# dplyr 1.1.0, so na.rm is supplied inside an explicit function instead.
radar_data <- yogurt_data %>%
  group_by(cluster) %>%
  summarise(across(
    all_of(radar_vars),
    function(x) mean(x, na.rm = TRUE)
  ))

# Matrix form of the cluster means (rows = clusters, columns = variables).
# NOTE(review): not used by the plot below; kept in case later code reads it.
radar_matrix <- as.matrix(radar_data[, -1])
rownames(radar_matrix) <- paste("Cluster", radar_data$cluster)

# Long format (one row per cluster/variable pair) for plotting
radar_long <- radar_data %>%
  pivot_longer(-cluster, names_to = "variable", values_to = "value")

# One line per cluster; group = cluster is needed so geom_line connects
# points across the discrete x axis.
ggplot(radar_long,
       aes(x = variable, y = value, color = cluster, group = cluster)) +
  geom_line(linewidth = 1) +
  geom_point(size = 2) +
  theme_minimal() +
  theme(axis.text.x = element_text(angle = 45, hjust = 1)) +
  scale_color_brewer(palette = "Set2") +
  labs(title = "Cluster Profiles Across Key Variables",
       x = "", y = "Average Score (1-5)")

Cluster Summary and Recommendations

# Per-cluster summary table: segment size, share of all respondents (%),
# mean age, mean consumption frequency and share of morning consumers (%).
yogurt_data %>%
  group_by(cluster) %>%
  summarise(
    count = n(),
    proportion = count / nrow(yogurt_data) * 100,
    age_mean = mean(age),
    consumption_mean = mean(consumption_frequency),
    # morning_consumption is a factor; map each value back to its numeric
    # level label before averaging.
    morning_pct = 100 * mean(
      as.numeric(levels(morning_consumption))[morning_consumption]
    )
  ) %>%
  knitr::kable(caption = "Cluster Summary Statistics", digits = 1)
Cluster Summary Statistics
cluster count proportion age_mean consumption_mean morning_pct
1 59 19.7 43.1 3.4 71.2
2 65 21.7 42.9 3.4 89.2
3 75 25.0 39.9 3.0 64.0
4 101 33.7 42.5 2.8 61.4

Cluster Interpretation

  1. Cluster 1: Health-Conscious Enthusiasts
    • Focus on protein, probiotics, natural ingredients
    • Driven by health benefits
  2. Cluster 2: Flavor-Focused Consumers
    • Strong preferences for fruit and vanilla flavors
    • Taste is primary purchase driver
  3. Cluster 3: Convenience-Oriented Pragmatists
    • Value convenience and packaging
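    • Strong preference for morning consumption (note: in the summary table above the highest morning share is 89.2% in Cluster 2, versus 64.0% for Cluster 3; k-means cluster numbers are arbitrary, so verify this cluster-to-segment mapping against the cluster means)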
  4. Cluster 4: Balanced Moderate Consumers
    • No strong preferences in any dimension
    • Middle-of-the-road in most attributes

Product Development Recommendations

  • Health Segment: High-protein formulations with probiotics and clean labels
  • Flavor Segment: Premium indulgent flavors with authentic taste profiles
  • Convenience Segment: On-the-go packaging optimized for morning consumption
  • Balanced Segment: Mainstream flavors with balanced nutritional profiles