library(tidyverse)
library(cluster) # For Gower distance and PAM
library(factoextra) # For clustering visualization
library(dplyr)
library(patchwork) # For combining plots
# Read the cleaned review export (path is relative to the working directory)
ryanair_clean <- read.csv(file = "ryanair_reviews_cleaned.csv")

# Print a section heading followed by the number of reviews that were loaded
cat("## Dataset Overview\n")
## ## Dataset Overview
n_reviews <- nrow(ryanair_clean)
cat("Total reviews:", n_reviews, "\n")
## Total reviews: 2119
# Service rating columns used throughout the analysis (defined before first use)
service_vars <- c(
  "Seat.Comfort",
  "Cabin.Staff.Service",
  "Food...Beverages",
  "Ground.Service",
  "Value.For.Money"
)

# Derive the calendar year of each review when a Review.Date column exists;
# dates are parsed as YYYY-MM-DD. Without that column the year is left as NA.
has_review_date <- "Review.Date" %in% names(ryanair_clean)
if (has_review_date) {
  parsed_review_dates <- as.Date(ryanair_clean$Review.Date, format = "%Y-%m-%d")
  ryanair_clean$review_year <- as.numeric(format(parsed_review_dates, "%Y"))
} else {
  ryanair_clean$review_year <- NA
}
# Build the clustering input: the service ratings, traveller type and review
# year, restricted to reviews with every service rating present and a known
# traveller type ("Unknown" records are dropped)
clustering_data_mixed <- ryanair_clean %>%
  select(all_of(service_vars), Type.Of.Traveller, review_year) %>%
  filter(
    complete.cases(across(all_of(service_vars))),
    Type.Of.Traveller != "Unknown"
  ) %>%
  mutate(Type.Of.Traveller = as.factor(Type.Of.Traveller))

cat("### Mixed Data Clustering Dataset (After Dropping Unknowns)\n")
## ### Mixed Data Clustering Dataset (After Dropping Unknowns)
n_clustering_rows <- nrow(clustering_data_mixed)
cat("Final observations:", n_clustering_rows, "\n")
## Final observations: 1635
# Remove review_year so that it does not enter the distance calculation
clustering_data_for_gower <- clustering_data_mixed %>% select(-review_year)

# Calculate Gower distance over the mixed-type columns.
# Column types are declared by name rather than by position, so the call keeps
# working if service_vars gains, loses or reorders entries.
gower_dist <- daisy(
  clustering_data_for_gower,
  metric = "gower",
  type = list(numeric = service_vars, factor = "Type.Of.Traveller")
)

cat("### Gower Distance Matrix Calculated\n")
## ### Gower Distance Matrix Calculated
# Read the number of observations from the dissimilarity object itself instead
# of expanding it into a dense n x n matrix just to print its dimensions
n_gower_obs <- attr(gower_dist, "Size")
cat("Distance matrix dimensions:", n_gower_obs, n_gower_obs, "\n")
## Distance matrix dimensions: 1635 1635
# Limit the k-selection diagnostics to at most 1000 observations: with more
# rows than that, draw a seeded random subset and cut the matching rows and
# columns out of the full distance matrix; otherwise use every observation.
n_gower_rows <- nrow(clustering_data_for_gower)
sample_size <- min(1000, n_gower_rows)
if (n_gower_rows <= 1000) {
  gower_sample <- gower_dist
} else {
  set.seed(123)
  sample_indices <- sample(seq_len(n_gower_rows), sample_size)
  gower_full_matrix <- as.matrix(gower_dist)
  gower_sample <- as.dist(gower_full_matrix[sample_indices, sample_indices])
}
# Fit PAM once per candidate k on the sampled dissimilarities and reuse each
# fit for both diagnostics, so every k is clustered only once
candidate_k <- 2:6
pam_fits <- map(candidate_k, function(k) pam(gower_sample, k = k))

# Calculate silhouette scores: average silhouette width of each fit
silhouette_scores <- map_dbl(pam_fits, function(fit) fit$silinfo$avg.width)

# Calculate the score used for the elbow method.
# NOTE(review): objective[1] is the first of the two objective values pam()
# reports (documented as the values after the first and second step of the
# algorithm), not a within-cluster sum of squares. Confirm whether the final
# value, objective[2], was intended before relabelling or changing it.
wss_scores <- map_dbl(pam_fits, function(fit) fit$objective[1])
# Find optimal k: the scores were computed for k = 2..6, so the position of the
# best silhouette score is offset by one from the k it belongs to
best_silhouette_position <- which.max(silhouette_scores)
optimal_k_silhouette <- best_silhouette_position + 1

cat("### Cluster Evaluation Results\n")
## ### Cluster Evaluation Results
# One row per candidate k with both diagnostics rounded to three decimals
results_df <- data.frame(
  k = 2:6,
  silhouette_score = round(silhouette_scores, digits = 3),
  wss = round(wss_scores, digits = 3)
)
knitr::kable(results_df)
| k | silhouette_score | wss |
|---|---|---|
| 2 | 0.474 | 0.253 |
| 3 | 0.307 | 0.185 |
| 4 | 0.403 | 0.162 |
| 5 | 0.365 | 0.146 |
| 6 | 0.407 | 0.133 |
# Visualization: compare both methods on the sampled Gower dissimilarities.
#
# fviz_nbclust() documents its first argument as a data matrix that is handed
# to the clustering function, and computes the silhouette / within-SS from
# dist(x) (Euclidean) unless `diss` is given. Passing the distance matrix alone
# therefore made pam() cluster on Euclidean distances between the rows of the
# distance matrix. The wrapper below turns the matrix back into a dissimilarity
# object for pam(), and `diss` makes the plotted statistics use the same Gower
# dissimilarities.
gower_sample_matrix <- as.matrix(gower_sample)
gower_sample_diss <- as.dist(gower_sample_matrix)

pam_on_dissimilarity <- function(x, k, ...) {
  fit <- pam(as.dist(x), k = k, diss = TRUE, ...)
  # fviz_nbclust() expects a `cluster` component; pam() names it `clustering`
  fit$cluster <- fit$clustering
  fit
}

p1 <- fviz_nbclust(
  gower_sample_matrix,
  FUNcluster = pam_on_dissimilarity,
  method = "silhouette",
  diss = gower_sample_diss,
  k.max = 6
) +
  labs(title = "Silhouette Method") +
  theme_minimal()

p2 <- fviz_nbclust(
  gower_sample_matrix,
  FUNcluster = pam_on_dissimilarity,
  method = "wss",
  diss = gower_sample_diss,
  k.max = 6
) +
  labs(title = "Elbow Method (Within-Cluster Sum of Squares)") +
  theme_minimal()

# Combine plots side by side (patchwork)
p1 + p2
# Use the silhouette-based choice as the final number of clusters
optimal_k <- optimal_k_silhouette
cat("\n**Selected Number of Clusters**: k =", optimal_k, "\n")
##
## **Selected Number of Clusters**: k = 2
# Explain why k = 3 was rejected using the computed silhouette widths rather
# than hard-coded numbers (silhouette_scores[1] is k = 2, [2] is k = 3). The
# sentence describes a drop from k = 2 to k = 3, so it is only printed when
# k = 2 is the selected solution.
if (optimal_k == 2) {
  cat(sprintf(
    "**Reason for not choosing k=3**: While k=3 shows a WSS reduction, the silhouette score drops significantly from %.3f (k=2) to %.3f (k=3), indicating poorer cluster quality and overlapping segments.\n",
    silhouette_scores[1], silhouette_scores[2]
  ))
}
## **Reason for not choosing k=3**: While k=3 shows a WSS reduction, the silhouette score drops significantly from 0.474 (k=2) to 0.307 (k=3), indicating poorer cluster quality and overlapping segments.
# Perform PAM clustering with optimal k on the full (unsampled) Gower distances
set.seed(123)
pam_result <- pam(gower_dist, k = optimal_k)
# Add cluster assignments to data (one label per row of the clustering input)
ryanair_segmented_mixed <- clustering_data_mixed %>%
  mutate(cluster = as.factor(pam_result$clustering))

# Add back other columns for analysis.
# dplyr::filter() renumbers row names, so rownames(clustering_data_mixed) are
# simply 1..n and cannot be used to index back into ryanair_clean. Instead,
# re-apply the same row conditions that built clustering_data_mixed to find the
# surviving rows of ryanair_clean (which() also drops NA comparisons, matching
# filter()).
original_indices <- which(
  complete.cases(ryanair_clean[, service_vars, drop = FALSE]) &
    ryanair_clean$Type.Of.Traveller != "Unknown"
)
# Guard against the two row selections drifting apart
stopifnot(length(original_indices) == nrow(clustering_data_mixed))

additional_cols <- c("Overall.Rating", "Recommended", "Seat.Type")
available_cols <- intersect(additional_cols, names(ryanair_clean))
if (length(available_cols) > 0) {
  ryanair_segmented_mixed <- ryanair_segmented_mixed %>%
    bind_cols(ryanair_clean[original_indices, available_cols, drop = FALSE])
}

cat("### PAM Clustering Results\n")
## ### PAM Clustering Results
cat("Cluster sizes:\n")
## Cluster sizes:
table(pam_result$clustering) %>% knitr::kable()
| Var1 | Freq |
|---|---|
| 1 | 598 |
| 2 | 1037 |
# Summarise every cluster: size, mean service ratings, overall rating,
# traveller-type mix (as percentages) and recommendation rate.

# Optional columns are only summarised when they were attached earlier
has_overall_rating <- "Overall.Rating" %in% names(ryanair_segmented_mixed)
has_recommended <- "Recommended" %in% names(ryanair_segmented_mixed)

named_traveller_types <- c(
  "Business", "Family Leisure", "Solo Leisure", "Couple Leisure"
)
n_segmented <- nrow(ryanair_segmented_mixed)

# Share of `part` in `whole`, as a percentage rounded to one decimal
as_percent <- function(part, whole) {
  round(part / whole * 100, 1)
}

cluster_profiles_mixed <- ryanair_segmented_mixed %>%
  group_by(cluster) %>%
  summarize(
    n_customers = n(),
    percent_total = as_percent(n(), n_segmented),
    # Mean service ratings
    avg_seat_comfort = round(mean(Seat.Comfort, na.rm = TRUE), 2),
    avg_cabin_staff = round(mean(Cabin.Staff.Service, na.rm = TRUE), 2),
    avg_food_beverages = round(mean(Food...Beverages, na.rm = TRUE), 2),
    avg_ground_service = round(mean(Ground.Service, na.rm = TRUE), 2),
    avg_value_money = round(mean(Value.For.Money, na.rm = TRUE), 2),
    # Mean overall rating (NA when the column is unavailable)
    avg_overall_rating = if (has_overall_rating) {
      round(mean(Overall.Rating, na.rm = TRUE), 2)
    } else {
      NA
    },
    # Traveller-type head counts (temporary; dropped after the percentages)
    business_count = sum(Type.Of.Traveller == "Business"),
    family_count = sum(Type.Of.Traveller == "Family Leisure"),
    solo_count = sum(Type.Of.Traveller == "Solo Leisure"),
    couples_count = sum(Type.Of.Traveller == "Couple Leisure"),
    other_count = sum(!Type.Of.Traveller %in% named_traveller_types),
    # Traveller-type shares within the cluster
    business_pct = as_percent(business_count, n()),
    family_pct = as_percent(family_count, n()),
    solo_pct = as_percent(solo_count, n()),
    couples_pct = as_percent(couples_count, n()),
    other_pct = as_percent(other_count, n()),
    # Share of reviews with Recommended == "yes" (NA when unavailable)
    recommendation_rate = if (has_recommended) {
      as_percent(sum(Recommended == "yes", na.rm = TRUE), n())
    } else {
      NA
    }
  ) %>%
  select(-ends_with("_count"))

cat("### Customer Segment Profiles\n")
## ### Customer Segment Profiles
knitr::kable(cluster_profiles_mixed)
| cluster | n_customers | percent_total | avg_seat_comfort | avg_cabin_staff | avg_food_beverages | avg_ground_service | avg_value_money | avg_overall_rating | business_pct | family_pct | solo_pct | couples_pct | other_pct | recommendation_rate |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 1 | 598 | 36.6 | 3.26 | 4.06 | 2.59 | 3.69 | 4.40 | 7.42 | 7.5 | 21.4 | 46.7 | 24.4 | 0 | 85.1 |
| 2 | 1037 | 63.4 | 1.52 | 1.85 | 1.72 | 1.21 | 1.44 | 1.56 | 7.8 | 23.7 | 25.9 | 42.5 | 0 | 3.4 |
# Human-readable names for the known clusters, keyed by cluster label; any
# other cluster falls back to "Segment <label>"
named_segments <- c(
  "1" = "Satisfied Value-Focused Leisure Travelers",
  "2" = "Completely Dissatisfied Couples & Families"
)

# Lookup table: cluster -> segment_name
segment_names <- cluster_profiles_mixed %>%
  transmute(
    cluster,
    segment_name = coalesce(
      unname(named_segments[as.character(cluster)]),
      paste("Segment", cluster)
    )
  )

# Attach the segment names to the profile table
cluster_profiles_mixed <- left_join(
  cluster_profiles_mixed,
  segment_names,
  by = "cluster"
)
# Colour palettes shared by the plots
main_colors <- c("#073590", "#FFD200") # For profile plot lines
traveler_colors <- c("#073590", "#1E90FF", "#FFD200", "#32CD32", "#808080") # For boxplot fill

# 1. Combined faceted boxplot (built from every clustered review)

# Axis labels for the service columns, listed in plotting order
service_display_names <- c(
  "Seat.Comfort" = "Seat Comfort",
  "Cabin.Staff.Service" = "Cabin Staff",
  "Food...Beverages" = "Food & Beverages",
  "Ground.Service" = "Ground Service",
  "Value.For.Money" = "Value for Money"
)

# One row per review and service dimension, with segment names attached
ratings_long <- ryanair_segmented_mixed %>%
  select(cluster, all_of(service_vars), Type.Of.Traveller) %>%
  pivot_longer(
    cols = all_of(service_vars),
    names_to = "service",
    values_to = "rating"
  ) %>%
  left_join(segment_names, by = "cluster") %>%
  mutate(
    # Unmapped column names are kept as they are
    service = coalesce(unname(service_display_names[service]), service),
    service = factor(service, levels = unname(service_display_names))
  )

combined_plot <- ggplot(
  ratings_long,
  aes(x = service, y = rating, fill = Type.Of.Traveller)
) +
  geom_boxplot(alpha = 0.8, outlier.alpha = 0.3) +
  facet_wrap(~segment_name, ncol = 2) +
  scale_fill_manual(values = traveler_colors) +
  scale_y_continuous(limits = c(0, 5), breaks = seq(0, 5, by = 1)) +
  labs(
    title = "Service Rating Distributions by Traveler Type Within Customer Segments",
    subtitle = "Note: Ground Service shows minimal variation in dissatisfied segment (consistently rated 1/5)",
    x = "Service Dimension",
    y = "Rating (1-5)",
    fill = "Traveler Type"
  ) +
  theme_minimal() +
  theme(
    axis.text.x = element_text(angle = 45, hjust = 1),
    plot.subtitle = element_text(size = 10, color = "gray40", face = "italic")
  )
# 2. Simple profile plot: mean rating per service dimension, one line per
#    segment, with the traveller-type composition shown in the caption

# Legend order is pinned explicitly. Unnamed `labels` are matched to the scale
# breaks by position, so the breaks are fixed here instead of relying on the
# default break order plus a guide override to force the key colours.
segment_breaks <- c(
  "Satisfied Value-Focused Leisure Travelers",
  "Completely Dissatisfied Couples & Families"
)
segment_legend_labels <- c(
  "Satisfied Value-Focused\nLeisure Travelers",
  "Completely Dissatisfied\nCouples & Families"
)
segment_line_colors <- c(
  "Satisfied Value-Focused Leisure Travelers" = "#073590", # Blue for satisfied
  "Completely Dissatisfied Couples & Families" = "#FFD200" # Yellow for dissatisfied
)

# Build the caption from the computed profile table so the percentages always
# match the current clustering instead of being typed in by hand
composition_lines <- paste0(
  cluster_profiles_mixed$segment_name,
  ": Solo: ", cluster_profiles_mixed$solo_pct,
  "% | Couples: ", cluster_profiles_mixed$couples_pct,
  "% | Family: ", cluster_profiles_mixed$family_pct,
  "% | Business: ", cluster_profiles_mixed$business_pct, "%"
)
composition_caption <- paste(
  c("Segment Compositions:", composition_lines),
  collapse = "\n"
)

# Axis labels for the averaged service columns, listed in plotting order
profile_service_labels <- c(
  avg_seat_comfort = "Seat Comfort",
  avg_cabin_staff = "Cabin Staff",
  avg_food_beverages = "Food & Beverages",
  avg_ground_service = "Ground Service",
  avg_value_money = "Value for Money"
)

simple_profile <- cluster_profiles_mixed %>%
  select(cluster, segment_name, all_of(names(profile_service_labels))) %>%
  pivot_longer(
    cols = all_of(names(profile_service_labels)),
    names_to = "service",
    values_to = "rating"
  ) %>%
  mutate(
    service = factor(
      unname(profile_service_labels[service]),
      levels = unname(profile_service_labels)
    )
  ) %>%
  ggplot(aes(x = service, y = rating, color = segment_name, group = segment_name)) +
  # NOTE(review): ggplot2 >= 3.4.0 prefers `linewidth` for line thickness; kept
  # as `size` because the installed ggplot2 version is not visible here
  geom_line(size = 1.5) +
  geom_point(size = 3) +
  labs(
    title = "Service Rating Patterns by Customer Segment",
    x = "Service Dimension",
    y = "Average Rating (1-5)",
    color = NULL,
    caption = composition_caption
  ) +
  theme_minimal() +
  theme(
    axis.text.x = element_text(angle = 45, hjust = 1, size = 11),
    # Extra bottom margin leaves room for the legend and multi-line caption
    plot.margin = margin(b = 120, t = 20, l = 20, r = 20),
    legend.position = "bottom",
    legend.direction = "horizontal",
    legend.text = element_text(size = 9),
    legend.key.width = unit(2, "cm"),
    legend.spacing.x = unit(1, "cm"),
    panel.grid.major = element_blank(),
    panel.grid.minor = element_blank(),
    panel.background = element_blank(),
    axis.line = element_line(color = "black"),
    plot.caption = element_text(hjust = 0, size = 9, margin = margin(t = 10))
  ) +
  scale_color_manual(
    values = segment_line_colors,
    breaks = segment_breaks,
    labels = segment_legend_labels
  ) +
  scale_y_continuous(
    breaks = seq(0, 5, by = 1),
    limits = c(0, 5),
    expand = expansion(mult = c(0.1, 0.1))
  )
# 3. Traveler type composition: stacked bars of traveller-type shares per segment

# Left-to-right order of the segments on the x axis
segment_axis_order <- c(
  "Satisfied Value-Focused Leisure Travelers",
  "Completely Dissatisfied Couples & Families"
)

# Share of each traveller type within its cluster, with segment names attached
traveler_shares <- ryanair_segmented_mixed %>%
  count(cluster, Type.Of.Traveller) %>%
  group_by(cluster) %>%
  mutate(percent = n / sum(n) * 100) %>%
  left_join(segment_names, by = "cluster") %>%
  mutate(segment_spaced = factor(segment_name, levels = segment_axis_order))

traveler_plot <- ggplot(
  traveler_shares,
  aes(x = segment_spaced, y = percent, fill = Type.Of.Traveller)
) +
  geom_col(position = "stack", width = 0.7) +
  scale_fill_manual(values = traveler_colors) +
  # Wrap long segment names so the axis labels do not overlap
  scale_x_discrete(labels = function(x) str_wrap(x, width = 20)) +
  labs(
    title = "Traveler Type Composition by Segment",
    x = "Customer Segment",
    y = "Percentage",
    fill = "Traveler Type"
  ) +
  theme_minimal() +
  theme(axis.text.x = element_text(angle = 0, hjust = 0.5, size = 9))

# Display plots
combined_plot
simple_profile
traveler_plot
Recommended Segment Names:

- Cluster 1: “Satisfied Value-Focused Leisure Travelers”
  - Leisure-dominated (Solo 46.7% + Couples 24.4% + Family 21.4% = 92.5% leisure)
  - Excellent value perception (4.40/5)
  - High overall satisfaction (7.42/10)
  - Very high recommendation rate (85.1%)
- Cluster 2: “Completely Dissatisfied Couples & Families”
  - Couples are the largest group (42.5%)
  - Families are the second-largest group (23.7%)
  - Terrible ratings across ALL service dimensions
  - Extremely low recommendation rate (3.4%)
Key Strategic Insight:

The customer base is polarized:

- 36.6% are advocates who love Ryanair’s value proposition
- 63.4% are critics who hate the experience

This suggests that Ryanair’s ultra-low-cost model works well for budget-conscious solo leisure travelers but fails miserably for couples and families, who likely have different expectations and needs.
cat("## Detailed Business Insights\n\n")
## ## Detailed Business Insights
# Labels are the same for every segment, so they are defined once up front.
# Their order must match the order of the values collected inside the loop.
services <- c("Seat Comfort", "Cabin Staff", "Food & Beverages", "Ground Service", "Value for Money")
traveler_types <- c("Business", "Family Leisure", "Solo Leisure", "Couple Leisure", "Other")

# Print a short text summary for each segment. seq_len() keeps the loop from
# running at all when the profile table has no rows (1:0 would iterate twice).
for (i in seq_len(nrow(cluster_profiles_mixed))) {
  cluster_data <- cluster_profiles_mixed[i, ]

  # Heading and size of the segment
  cat("###", cluster_data$segment_name, "(", cluster_data$percent_total, "% of customers)\n")
  cat("**Profile**:", cluster_data$n_customers, "customers")
  if (!is.na(cluster_data$recommendation_rate)) {
    cat(" |", cluster_data$recommendation_rate, "% recommendation rate")
  }
  cat("\n")

  # Service patterns: best- and worst-rated service dimension
  ratings <- c(
    cluster_data$avg_seat_comfort, cluster_data$avg_cabin_staff,
    cluster_data$avg_food_beverages, cluster_data$avg_ground_service,
    cluster_data$avg_value_money
  )
  top_service <- services[which.max(ratings)]
  worst_service <- services[which.min(ratings)]
  cat("**Service Pattern**: Strongest on", top_service, "(", max(ratings), "/5) |",
      "Weakest on", worst_service, "(", min(ratings), "/5)\n")

  # Traveler type insights: the traveller type with the largest share
  traveler_pcts <- c(
    cluster_data$business_pct, cluster_data$family_pct,
    cluster_data$solo_pct, cluster_data$couples_pct, cluster_data$other_pct
  )
  dominant_traveler <- traveler_types[which.max(traveler_pcts)]
  cat("**Traveler Mix**: Dominated by", dominant_traveler, "travelers (",
      max(traveler_pcts), "%)\n\n")
}
## ### Satisfied Value-Focused Leisure Travelers ( 36.6 % of customers)
## **Profile**: 598 customers | 85.1 % recommendation rate
## **Service Pattern**: Strongest on Value for Money ( 4.4 /5) | Weakest on Food & Beverages ( 2.59 /5)
## **Traveler Mix**: Dominated by Solo Leisure travelers ( 46.7 %)
##
## ### Completely Dissatisfied Couples & Families ( 63.4 % of customers)
## **Profile**: 1037 customers | 3.4 % recommendation rate
## **Service Pattern**: Strongest on Cabin Staff ( 1.85 /5) | Weakest on Ground Service ( 1.21 /5)
## **Traveler Mix**: Dominated by Couple Leisure travelers ( 42.5 %)
- Clear Segmentation: Identified 2 natural customer segments with distinct profiles
- Polarized Base: 36.6% are highly satisfied advocates vs. 63.4% completely dissatisfied critics
- Value Proposition Works: The ultra-low-cost model resonates strongly with solo leisure travelers
- Critical Gap: The current experience fails to meet the needs of couples and families
- Actionable Insights: Clear strategic directions for marketing and service improvement

Next Steps:

- Deep dive into specific pain points for couples and families
- Develop targeted service packages for different traveler types
- Create segment-specific marketing and communication strategies
- Implement tracking to monitor segment migration over time