Data Preparation
# Create synthetic yogurt beverage consumer data.
# The seed fixes every random draw below; the sample() calls must stay in
# this order for the generated data set to remain reproducible.
set.seed(456)
n_consumers <- 300
# The first three quarters of the rows each receive one planted segment
# pattern further down; the remaining rows keep their purely random answers.
# Deriving the block size from n_consumers (instead of hard-coding 75) keeps
# the row ranges and the number of replacement draws consistent if the
# sample size is changed.
segment_size <- n_consumers %/% 4
health_rows <- seq_len(segment_size)
taste_rows <- segment_size + seq_len(segment_size)
convenience_rows <- 2 * segment_size + seq_len(segment_size)
# Create synthetic consumer survey data with basic structure
yogurt_data <- data.frame(
  # Basic demographics and consumption variables
  age = sample(18:65, n_consumers, replace = TRUE),
  gender = sample(c("Male", "Female", "Non-binary"), n_consumers,
    replace = TRUE, prob = c(0.45, 0.50, 0.05)
  ),
  consumption_frequency = sample(1:5, n_consumers, replace = TRUE),
  # Time-of-day indicators (0 = no, 1 = yes) with different base rates
  morning_consumption = sample(0:1, n_consumers,
    replace = TRUE, prob = c(0.4, 0.6)
  ),
  afternoon_consumption = sample(0:1, n_consumers,
    replace = TRUE, prob = c(0.5, 0.5)
  ),
  evening_consumption = sample(0:1, n_consumers,
    replace = TRUE, prob = c(0.7, 0.3)
  ),
  # Preferences (all on 1-5 scale)
  pref_fruit = sample(1:5, n_consumers, replace = TRUE),
  pref_vanilla = sample(1:5, n_consumers, replace = TRUE),
  pref_chocolate = sample(1:5, n_consumers, replace = TRUE),
  pref_coffee = sample(1:5, n_consumers, replace = TRUE),
  pref_exotic = sample(1:5, n_consumers, replace = TRUE),
  # Nutritional priorities and purchase drivers
  importance_protein = sample(1:5, n_consumers, replace = TRUE),
  importance_low_sugar = sample(1:5, n_consumers, replace = TRUE),
  importance_probiotics = sample(1:5, n_consumers, replace = TRUE),
  importance_natural = sample(1:5, n_consumers, replace = TRUE),
  driver_taste = sample(1:5, n_consumers, replace = TRUE),
  driver_health = sample(1:5, n_consumers, replace = TRUE),
  driver_convenience = sample(1:5, n_consumers, replace = TRUE),
  driver_price = sample(1:5, n_consumers, replace = TRUE),
  driver_packaging = sample(1:5, n_consumers, replace = TRUE)
)
# Create patterns for clusters: overwrite selected ratings with high scores
# (4 or 5) so that each row block forms a recognisable segment.
# Health-conscious consumers
health_cols <- c(
  "importance_protein", "importance_low_sugar",
  "importance_probiotics", "importance_natural",
  "driver_health"
)
yogurt_data[health_rows, health_cols] <-
  sample(4:5, segment_size * length(health_cols), replace = TRUE)
# Taste-driven consumers
taste_cols <- c("pref_fruit", "pref_vanilla", "driver_taste", "driver_price")
yogurt_data[taste_rows, taste_cols] <-
  sample(4:5, segment_size * length(taste_cols), replace = TRUE)
# Convenience-focused consumers (all of them also consume in the morning)
convenience_cols <- c("driver_convenience", "driver_packaging")
yogurt_data[convenience_rows, convenience_cols] <-
  sample(4:5, segment_size * length(convenience_cols), replace = TRUE)
yogurt_data[convenience_rows, "morning_consumption"] <- 1
# Convert categorical variables to factors
factor_cols <- c(
  "gender", "morning_consumption",
  "afternoon_consumption", "evening_consumption"
)
yogurt_data[factor_cols] <- lapply(yogurt_data[factor_cols], as.factor)
Cluster Analysis
# Assemble the clustering inputs: consumption frequency plus every flavor
# preference (pref_*), nutritional priority (importance_*) and purchase
# driver (driver_*) rating. Demographics and time-of-day flags are left out.
cluster_vars <- select(
  yogurt_data,
  consumption_frequency,
  starts_with("pref_"),
  starts_with("importance_"),
  starts_with("driver_")
)
# Standardise the selected columns with scale() before clustering
cluster_data_scaled <- scale(cluster_vars)
# Elbow method: plot the within-cluster sum of squares ("wss") against the
# number of clusters and mark the chosen k = 4 with a dashed vertical line.
# The seed is set first so the plotted curve is reproducible.
set.seed(123)
fviz_nbclust(x = cluster_data_scaled, FUNcluster = kmeans, method = "wss") +
  labs(title = "Elbow Method for Optimal k") +
  geom_vline(xintercept = 4, linetype = 2)

# Fit the final k-means model with k = 4 on the scaled ratings, trying 25
# random starts (nstart), and store each respondent's cluster label as a
# factor column on the survey data.
# NOTE(review): no seed is set directly before kmeans(); the cluster numbering
# therefore depends on the random-number state left by the preceding code.
k <- 4
km_result <- kmeans(x = cluster_data_scaled, centers = k, nstart = 25)
yogurt_data[["cluster"]] <- as.factor(km_result[["cluster"]])
# PCA visualization of clusters, each outlined by its convex hull
fviz_cluster(
  object = list(data = cluster_data_scaled, cluster = km_result[["cluster"]]),
  main = "Consumer Segments for Yogurt Beverages",
  ellipse.type = "convex",
  repel = TRUE,
  ggtheme = theme_minimal()
)

Key Cluster Insights
# Flavor preferences by cluster: mean rating of every pref_* column per
# segment, reshaped to long form and drawn as dodged bars (one bar per
# cluster within each flavor).
yogurt_data %>%
  group_by(cluster) %>%
  # An explicit function replaces across(..., mean, na.rm = TRUE): passing
  # extra arguments through across()'s `...` is deprecated since dplyr 1.1.0.
  summarise(
    across(starts_with("pref_"), function(x) mean(x, na.rm = TRUE))
  ) %>%
  pivot_longer(-cluster, names_to = "flavor", values_to = "rating") %>%
  # Drop the "pref_" prefix so the axis shows the bare flavor name
  mutate(flavor = str_replace(flavor, "pref_", "")) %>%
  ggplot(aes(x = flavor, y = rating, fill = cluster)) +
  # geom_col() is the idiomatic form of geom_bar(stat = "identity")
  geom_col(position = position_dodge()) +
  scale_fill_brewer(palette = "Set2") +
  theme_minimal() +
  theme(axis.text.x = element_text(angle = 45, hjust = 1)) +
  labs(
    title = "Flavor Preferences by Consumer Segment",
    x = "Flavor", y = "Average Rating (1-5)"
  )

# Purchase drivers by cluster: mean importance of every driver_* column per
# segment, reshaped to long form and drawn as dodged bars (one bar per
# cluster within each driver).
yogurt_data %>%
  group_by(cluster) %>%
  # An explicit function replaces across(..., mean, na.rm = TRUE): passing
  # extra arguments through across()'s `...` is deprecated since dplyr 1.1.0.
  summarise(
    across(starts_with("driver_"), function(x) mean(x, na.rm = TRUE))
  ) %>%
  pivot_longer(-cluster, names_to = "driver", values_to = "rating") %>%
  # Drop the "driver_" prefix so the axis shows the bare driver name
  mutate(driver = str_replace(driver, "driver_", "")) %>%
  ggplot(aes(x = driver, y = rating, fill = cluster)) +
  # geom_col() is the idiomatic form of geom_bar(stat = "identity")
  geom_col(position = position_dodge()) +
  scale_fill_brewer(palette = "Set2") +
  theme_minimal() +
  theme(axis.text.x = element_text(angle = 45, hjust = 1)) +
  labs(
    title = "Purchase Drivers by Consumer Segment",
    x = "Purchase Driver", y = "Average Importance (1-5)"
  )

# Cluster profiles across key variables. Originally planned as a radar chart;
# the profiles are drawn as a line chart (one line per cluster) instead.
radar_vars <- c(
  "consumption_frequency",
  "pref_fruit", "pref_vanilla", "pref_chocolate",
  "importance_protein", "importance_low_sugar",
  "driver_taste", "driver_health", "driver_convenience"
)
# Calculate cluster means.
# An explicit function replaces across(..., mean, na.rm = TRUE): passing extra
# arguments through across()'s `...` is deprecated since dplyr 1.1.0.
radar_data <- yogurt_data %>%
  group_by(cluster) %>%
  summarise(across(all_of(radar_vars), function(x) mean(x, na.rm = TRUE)))
# Matrix form of the cluster means (one row per cluster, labelled
# "Cluster <id>"), as a radar-chart function would expect.
# NOTE(review): not used by the plot below; kept in case later code reads it.
radar_matrix <- as.matrix(radar_data[, -1])
rownames(radar_matrix) <- paste("Cluster", radar_data$cluster)
# Long form for ggplot: one row per (cluster, variable) pair
radar_long <- radar_data %>%
  pivot_longer(-cluster, names_to = "variable", values_to = "value")
ggplot(
  radar_long,
  aes(x = variable, y = value, color = cluster, group = cluster)
) +
  geom_line(linewidth = 1) +
  geom_point(size = 2) +
  theme_minimal() +
  theme(axis.text.x = element_text(angle = 45, hjust = 1)) +
  scale_color_brewer(palette = "Set2") +
  labs(
    title = "Cluster Profiles Across Key Variables",
    x = "", y = "Average Score (1-5)"
  )

Cluster Summary and Recommendations
# Per-cluster summary table: segment size, share of all respondents (percent),
# mean age, mean consumption frequency and percentage of morning consumers.
yogurt_data %>%
  group_by(cluster) %>%
  summarise(
    count = n(),
    proportion = count / nrow(yogurt_data) * 100,
    age_mean = mean(age),
    consumption_mean = mean(consumption_frequency),
    # morning_consumption is a factor here, so go through character to
    # recover the numeric values before averaging
    morning_pct = mean(as.numeric(as.character(morning_consumption))) * 100
  ) %>%
  knitr::kable(caption = "Cluster Summary Statistics", digits = 1)
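Cluster Summary Statistics

| cluster | count | proportion | age_mean | consumption_mean | morning_pct |
|:--------|------:|-----------:|---------:|-----------------:|------------:|
| 1       |    59 |       19.7 |     43.1 |              3.4 |        71.2 |
| 2       |    65 |       21.7 |     42.9 |              3.4 |        89.2 |
| 3       |    75 |       25.0 |     39.9 |              3.0 |        64.0 |
| 4       |   101 |       33.7 |     42.5 |              2.8 |        61.4 |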
Cluster Interpretation
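- Cluster 1: Health-Conscious Enthusiasts
  - Focus on protein, probiotics, natural ingredients
  - Driven by health benefits
- Cluster 2: Flavor-Focused Consumers
  - Strong preferences for fruit and vanilla flavors
  - Taste is primary purchase driver
- Cluster 3: Convenience-Oriented Pragmatists
  - Value convenience and packaging
  - Strong preference for morning consumption
- Cluster 4: Balanced Moderate Consumers
  - No strong preferences in any dimension
  - Middle-of-the-road in most attributes
- Note: k-means cluster numbers are arbitrary, so confirm each label against the cluster means before relying on it. In the summary table the morning-consumption share is highest in Cluster 2 (89.2%), not Cluster 3 (64.0%), which does not match the strong morning preference described for Cluster 3.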
Product Development Recommendations
- Health Segment: High-protein formulations with
probiotics and clean labels
- Flavor Segment: Premium indulgent flavors with
authentic taste profiles
- Convenience Segment: On-the-go packaging optimized
for morning consumption
- Balanced Segment: Mainstream flavors with balanced
nutritional profiles