Customer Segmentation Analysis with Traveler Types

Setup and Data Loading

library(tidyverse)
library(cluster)    # For Gower distance and PAM
library(factoextra) # For clustering visualization
library(dplyr)
library(patchwork)  # For combining plots
# Read the pre-cleaned review export from the working directory.
reviews_csv_path <- "ryanair_reviews_cleaned.csv"
ryanair_clean <- read.csv(reviews_csv_path)

# Print a short overview: a heading line followed by the row count.
cat("## Dataset Overview\n")
## ## Dataset Overview
n_reviews <- nrow(ryanair_clean)
cat("Total reviews:", n_reviews, "\n")
## Total reviews: 2119

Data Preparation for Mixed Data Clustering

# Service-rating columns used as the numeric clustering features.
# Declared first because the later steps select columns through this vector.
service_vars <- c(
  "Seat.Comfort",
  "Cabin.Staff.Service",
  "Food...Beverages",
  "Ground.Service",
  "Value.For.Money"
)

# Derive the review year when a Review.Date column exists (parsed as
# YYYY-MM-DD); otherwise create an all-NA column so the select() below works.
if ("Review.Date" %in% names(ryanair_clean)) {
  review_dates <- as.Date(ryanair_clean$Review.Date, format = "%Y-%m-%d")
  ryanair_clean$review_year <- as.numeric(format(review_dates, "%Y"))
} else {
  ryanair_clean$review_year <- NA
}

# Build the clustering table: keep only the features, then drop rows with a
# missing service rating and rows whose traveller type is "Unknown".
clustering_data_mixed <- ryanair_clean %>%
  select(all_of(service_vars), Type.Of.Traveller, review_year) %>%
  filter(
    complete.cases(across(all_of(service_vars))),
    Type.Of.Traveller != "Unknown"
  ) %>%
  mutate(Type.Of.Traveller = as.factor(Type.Of.Traveller))

cat("### Mixed Data Clustering Dataset (After Dropping Unknowns)\n")
## ### Mixed Data Clustering Dataset (After Dropping Unknowns)
n_clustering_rows <- nrow(clustering_data_mixed)
cat("Final observations:", n_clustering_rows, "\n")
## Final observations: 1635

Calculate Gower Distance

# review_year is descriptive only, so it is excluded from the distance input.
clustering_data_for_gower <- select(clustering_data_mixed, -review_year)

# Gower dissimilarity over the mixed-type columns: columns 1-5 (the service
# ratings) are treated as numeric and column 6 (traveller type) as a factor.
gower_dist <- daisy(
  clustering_data_for_gower,
  metric = "gower",
  type = list(numeric = 1:5, factor = 6)
)

cat("### Gower Distance Matrix Calculated\n")
## ### Gower Distance Matrix Calculated
# The dissimilarity object records its observation count in the "Size"
# attribute, so the n x n dimensions can be reported without building the
# full matrix.
n_observations <- attr(gower_dist, "Size")
cat("Distance matrix dimensions:", rep(n_observations, 2), "\n")
## Distance matrix dimensions: 1635 1635

Determine Optimal Number of Clusters

# Evaluate candidate cluster counts on (at most) 1000 observations to keep the
# repeated PAM fits fast. When the data set is small enough, the full
# dissimilarity object is used unchanged.
sample_size <- min(1000, nrow(clustering_data_for_gower))
if (nrow(clustering_data_for_gower) > sample_size) {
  set.seed(123)
  sample_indices <- sample(seq_len(nrow(clustering_data_for_gower)), sample_size)
  gower_sample <- as.dist(as.matrix(gower_dist)[sample_indices, sample_indices])
} else {
  gower_sample <- gower_dist
}

# Fit PAM once per candidate k and reuse the fit for both criteria
# (previously every k was fitted twice).
candidate_k <- 2:6
pam_fits <- map(candidate_k, function(k) pam(gower_sample, k = k))

# Average silhouette width of each fit.
silhouette_scores <- map_dbl(pam_fits, function(fit) fit$silinfo$avg.width)

# Elbow criterion. pam() reports its objective after the BUILD phase and after
# the SWAP phase; the SWAP value belongs to the final clustering, so that is
# the one to compare across k. The name `wss` is kept for compatibility, but
# the quantity is PAM's objective (dissimilarity to the medoids), not a true
# within-cluster sum of squares.
wss_scores <- map_dbl(pam_fits, function(fit) fit$objective[["swap"]])

# k with the highest average silhouette width.
optimal_k_silhouette <- candidate_k[which.max(silhouette_scores)]

cat("### Cluster Evaluation Results\n")
## ### Cluster Evaluation Results
results_df <- data.frame(
  k = candidate_k,
  silhouette_score = round(silhouette_scores, 3),
  wss = round(wss_scores, 3)
)
results_df %>% knitr::kable()
k silhouette_score wss
2 0.474 0.253
3 0.307 0.185
4 0.403 0.162
5 0.365 0.146
6 0.407 0.133
# Visualization: compare both methods on the Gower dissimilarities.
#
# fviz_nbclust() treats its first argument as raw data. Passing the distance
# matrix with a bare `pam` therefore clusters on Euclidean distances between
# rows of that matrix, and scores the result with dist(x) as well. Two things
# keep the plots on the same footing as the table above:
#   * a wrapper that tells pam() its input is a dissimilarity, and
#   * `diss = gower_sample`, so silhouette / WSS use the Gower distances.
pam_on_dissimilarity <- function(x, k, ...) {
  pam(as.dist(x), k = k, diss = TRUE, ...)
}

gower_sample_matrix <- as.matrix(gower_sample)

p1 <- fviz_nbclust(
  gower_sample_matrix,
  FUNcluster = pam_on_dissimilarity,
  method = "silhouette",
  diss = gower_sample,
  k.max = 6
) +
  labs(title = "Silhouette Method") +
  theme_minimal()

p2 <- fviz_nbclust(
  gower_sample_matrix,
  FUNcluster = pam_on_dissimilarity,
  method = "wss",
  diss = gower_sample,
  k.max = 6
) +
  labs(title = "Elbow Method (Within-Cluster Sum of Squares)") +
  theme_minimal()

# Combine plots side by side (patchwork)
p1 + p2

# The silhouette criterion decides the number of clusters.
optimal_k <- optimal_k_silhouette

cat("\n**Selected Number of Clusters**: k =", optimal_k, "\n")
## 
## **Selected Number of Clusters**: k = 2
# Justify passing over k = 3 with the scores that were actually computed
# instead of hard-coded numbers. silhouette_scores is indexed by k = 2:6, so
# element 2 is the k = 3 score. Nothing is printed when k = 3 is itself the
# selected value.
if (optimal_k != 3) {
  cat(sprintf(
    "**Reason for not choosing k=3**: While k=3 shows a WSS reduction, the silhouette score drops significantly from %.3f (k=%d) to %.3f (k=3), indicating poorer cluster quality and overlapping segments.\n",
    max(silhouette_scores),
    as.integer(optimal_k),
    silhouette_scores[2]
  ))
}
## **Reason for not choosing k=3**: While k=3 shows a WSS reduction, the silhouette score drops significantly from 0.474 (k=2) to 0.307 (k=3), indicating poorer cluster quality and overlapping segments.

Perform PAM Clustering

# Perform PAM clustering with optimal k on the full Gower dissimilarity.
set.seed(123)
pam_result <- pam(gower_dist, k = optimal_k)

# Add cluster assignments to data (same row order as clustering_data_mixed).
ryanair_segmented_mixed <- clustering_data_mixed %>%
  mutate(cluster = as.factor(pam_result$clustering))

# Add back other columns for analysis.
#
# The row names of clustering_data_mixed cannot be used to find the source
# rows: dplyr::filter() renumbers automatic row names to 1..n, so they would
# select the FIRST n rows of ryanair_clean rather than the rows that survived
# filtering. Instead, rebuild the same keep-mask that produced
# clustering_data_mixed (complete service ratings, traveller type present and
# not "Unknown") and index ryanair_clean with it.
kept_rows <- complete.cases(ryanair_clean[, service_vars, drop = FALSE]) &
  !is.na(ryanair_clean$Type.Of.Traveller) &
  ryanair_clean$Type.Of.Traveller != "Unknown"
original_indices <- which(kept_rows)

# Guard against the mask drifting out of sync with the preparation step.
stopifnot(length(original_indices) == nrow(clustering_data_mixed))

additional_cols <- c("Overall.Rating", "Recommended", "Seat.Type")

# Only attach the columns that actually exist in the source data.
available_cols <- intersect(additional_cols, names(ryanair_clean))

if (length(available_cols) > 0) {
  ryanair_segmented_mixed <- ryanair_segmented_mixed %>%
    bind_cols(ryanair_clean[original_indices, available_cols, drop = FALSE])
}

cat("### PAM Clustering Results\n")
## ### PAM Clustering Results
cat("Cluster sizes:\n")
## Cluster sizes:
table(pam_result$clustering) %>% knitr::kable()
Var1 Freq
1 598
2 1037

Cluster Profiling with Mixed Data

# Per-cluster profile: size, mean service ratings, traveller-type mix and
# (when the columns were attached) overall rating and recommendation rate.

# Optional columns are checked once up front; a missing column yields NA.
has_overall_rating <- "Overall.Rating" %in% names(ryanair_segmented_mixed)
has_recommended <- "Recommended" %in% names(ryanair_segmented_mixed)

named_traveller_types <- c(
  "Business", "Family Leisure", "Solo Leisure", "Couple Leisure"
)

# Mean of a rating column, ignoring NAs, rounded to 2 decimals.
mean_rating <- function(x) round(mean(x, na.rm = TRUE), 2)

# Share (in %) of TRUE values within the current group, rounded to 1 decimal.
share_pct <- function(flag) round(sum(flag) / length(flag) * 100, 1)

cluster_profiles_mixed <- ryanair_segmented_mixed %>%
  group_by(cluster) %>%
  summarize(
    n_customers = n(),
    percent_total = round(n() / nrow(ryanair_segmented_mixed) * 100, 1),

    # Service ratings
    avg_seat_comfort = mean_rating(Seat.Comfort),
    avg_cabin_staff = mean_rating(Cabin.Staff.Service),
    avg_food_beverages = mean_rating(Food...Beverages),
    avg_ground_service = mean_rating(Ground.Service),
    avg_value_money = mean_rating(Value.For.Money),

    # Overall rating
    avg_overall_rating = if (has_overall_rating) {
      mean_rating(Overall.Rating)
    } else {
      NA
    },

    # Traveler type composition (percent of the cluster)
    business_pct = share_pct(Type.Of.Traveller == "Business"),
    family_pct = share_pct(Type.Of.Traveller == "Family Leisure"),
    solo_pct = share_pct(Type.Of.Traveller == "Solo Leisure"),
    couples_pct = share_pct(Type.Of.Traveller == "Couple Leisure"),
    other_pct = share_pct(!Type.Of.Traveller %in% named_traveller_types),

    # Recommendation rate: "yes" answers over all cluster members
    recommendation_rate = if (has_recommended) {
      round(sum(Recommended == "yes", na.rm = TRUE) / n() * 100, 1)
    } else {
      NA
    }
  )

cat("### Customer Segment Profiles\n")
## ### Customer Segment Profiles
cluster_profiles_mixed %>% knitr::kable()
cluster n_customers percent_total avg_seat_comfort avg_cabin_staff avg_food_beverages avg_ground_service avg_value_money avg_overall_rating business_pct family_pct solo_pct couples_pct other_pct recommendation_rate
1 598 36.6 3.26 4.06 2.59 3.69 4.40 7.42 7.5 21.4 46.7 24.4 0 85.1
2 1037 63.4 1.52 1.85 1.72 1.21 1.44 1.56 7.8 23.7 25.9 42.5 0 3.4

Interpret Clusters and Assign Names

# Human-readable names keyed by cluster label; any cluster without an entry
# falls back to "Segment <cluster>".
segment_name_lookup <- c(
  "1" = "Satisfied Value-Focused Leisure Travelers",
  "2" = "Completely Dissatisfied Couples & Families"
)

# Two-column lookup table: cluster -> segment_name.
segment_names <- cluster_profiles_mixed %>%
  transmute(
    cluster,
    segment_name = coalesce(
      unname(segment_name_lookup[as.character(cluster)]),
      paste("Segment", cluster)
    )
  )

# Attach the names to the profile table.
cluster_profiles_mixed <- left_join(
  cluster_profiles_mixed,
  segment_names,
  by = "cluster"
)

Visualization of Mixed Data Clusters

# Shared colour palettes.
main_colors <- c("#073590", "#FFD200")  # For profile plot lines
traveler_colors <- c("#073590", "#1E90FF", "#FFD200", "#32CD32", "#808080")  # For boxplot fill

# 1. Faceted boxplot of every service rating, split by traveller type, with
#    one panel per segment. Built from the full segmented data.

# Display label for each service column; also fixes the x-axis order.
service_display_names <- c(
  "Seat.Comfort" = "Seat Comfort",
  "Cabin.Staff.Service" = "Cabin Staff",
  "Food...Beverages" = "Food & Beverages",
  "Ground.Service" = "Ground Service",
  "Value.For.Money" = "Value for Money"
)

# Long format: one row per (review, service dimension).
boxplot_data <- ryanair_segmented_mixed %>%
  select(cluster, all_of(service_vars), Type.Of.Traveller) %>%
  pivot_longer(
    cols = all_of(service_vars),
    names_to = "service",
    values_to = "rating"
  ) %>%
  left_join(segment_names, by = "cluster") %>%
  mutate(
    service = factor(
      unname(service_display_names[service]),
      levels = unname(service_display_names)
    )
  )

combined_plot <- ggplot(
  boxplot_data,
  aes(x = service, y = rating, fill = Type.Of.Traveller)
) +
  geom_boxplot(alpha = 0.8, outlier.alpha = 0.3) +
  facet_wrap(~segment_name, ncol = 2) +
  scale_fill_manual(values = traveler_colors) +
  scale_y_continuous(limits = c(0, 5), breaks = seq(0, 5, by = 1)) +
  labs(
    title = "Service Rating Distributions by Traveler Type Within Customer Segments",
    subtitle = "Note: Ground Service shows minimal variation in dissatisfied segment (consistently rated 1/5)",
    x = "Service Dimension",
    y = "Rating (1-5)",
    fill = "Traveler Type"
  ) +
  theme_minimal() +
  theme(
    axis.text.x = element_text(angle = 45, hjust = 1),
    plot.subtitle = element_text(size = 10, color = "gray40", face = "italic")
  )

# 2. Profile plot: mean rating per service dimension, one line per segment,
#    with each segment's traveller mix in the caption.

# Line colour per segment. The names double as the legend order through
# `breaks` below, so labels and colours are matched explicitly instead of
# relying on alphabetical break order plus an override.aes colour patch.
segment_palette <- c(
  "Satisfied Value-Focused Leisure Travelers" = "#073590",  # Blue for satisfied
  "Completely Dissatisfied Couples & Families" = "#FFD200"  # Yellow for dissatisfied
)

# Line-wrapped legend labels, in the same order as segment_palette.
segment_legend_labels <- c(
  "Satisfied Value-Focused\nLeisure Travelers",
  "Completely Dissatisfied\nCouples & Families"
)

# Display label for each averaged column; also fixes the x-axis order.
profile_service_labels <- c(
  avg_seat_comfort = "Seat Comfort",
  avg_cabin_staff = "Cabin Staff",
  avg_food_beverages = "Food & Beverages",
  avg_ground_service = "Ground Service",
  avg_value_money = "Value for Money"
)

# Caption built from the computed percentages (one line per segment, in
# cluster order) so it cannot drift from the profile table.
composition_caption <- paste0(
  "Segment Compositions:\n",
  paste(
    sprintf(
      "%s: Solo: %.1f%% | Couples: %.1f%% | Family: %.1f%% | Business: %.1f%%",
      cluster_profiles_mixed$segment_name,
      cluster_profiles_mixed$solo_pct,
      cluster_profiles_mixed$couples_pct,
      cluster_profiles_mixed$family_pct,
      cluster_profiles_mixed$business_pct
    ),
    collapse = "\n"
  )
)

simple_profile <- cluster_profiles_mixed %>%
  select(cluster, segment_name, all_of(names(profile_service_labels))) %>%
  pivot_longer(
    cols = starts_with("avg_"),
    names_to = "service",
    values_to = "rating"
  ) %>%
  mutate(
    service = factor(
      unname(profile_service_labels[service]),
      levels = unname(profile_service_labels)
    )
  ) %>%
  ggplot(aes(x = service, y = rating, color = segment_name, group = segment_name)) +
  # `size` is kept for compatibility with older ggplot2 releases; ggplot2 3.4+
  # prefers `linewidth` for lines and warns about `size`.
  geom_line(size = 1.5) +
  geom_point(size = 3) +
  scale_color_manual(
    values = segment_palette,
    breaks = names(segment_palette),
    labels = segment_legend_labels
  ) +
  scale_y_continuous(
    breaks = seq(0, 5, by = 1),
    limits = c(0, 5),
    expand = expansion(mult = c(0.1, 0.1))
  ) +
  labs(
    title = "Service Rating Patterns by Customer Segment",
    x = "Service Dimension",
    y = "Average Rating (1-5)",
    color = NULL,
    caption = composition_caption
  ) +
  theme_minimal() +
  theme(
    axis.text.x = element_text(angle = 45, hjust = 1, size = 11),
    # Large bottom margin leaves room for the legend and multi-line caption.
    plot.margin = margin(b = 120, t = 20, l = 20, r = 20),
    legend.position = "bottom",
    legend.direction = "horizontal",
    legend.text = element_text(size = 9),
    legend.key.width = unit(2, "cm"),
    legend.spacing.x = unit(1, "cm"),
    panel.grid.major = element_blank(),
    panel.grid.minor = element_blank(),
    panel.background = element_blank(),
    axis.line = element_line(color = "black"),
    plot.caption = element_text(hjust = 0, size = 9, margin = margin(t = 10))
  )

# 3. Stacked bars showing the traveller-type mix of each segment.

# Left-to-right order of the segments on the x axis.
segment_axis_order <- c(
  "Satisfied Value-Focused Leisure Travelers",
  "Completely Dissatisfied Couples & Families"
)

# Share of each traveller type within its cluster, in percent.
traveler_share <- ryanair_segmented_mixed %>%
  count(cluster, Type.Of.Traveller) %>%
  group_by(cluster) %>%
  mutate(percent = n / sum(n) * 100) %>%
  left_join(segment_names, by = "cluster") %>%
  mutate(segment_spaced = factor(segment_name, levels = segment_axis_order))

traveler_plot <- ggplot(
  traveler_share,
  aes(x = segment_spaced, y = percent, fill = Type.Of.Traveller)
) +
  geom_col(position = "stack", width = 0.7) +
  scale_fill_manual(values = traveler_colors) +
  # Wrap the long segment names so they fit under the bars.
  scale_x_discrete(labels = function(x) str_wrap(x, width = 20)) +
  labs(
    title = "Traveler Type Composition by Segment",
    x = "Customer Segment",
    y = "Percentage",
    fill = "Traveler Type"
  ) +
  theme_minimal() +
  theme(axis.text.x = element_text(angle = 0, hjust = 0.5, size = 9))

# Render the three figures in order.
combined_plot

simple_profile

traveler_plot

Segment Profiles and Strategic Insights

Recommended Segment Names:

Cluster 1: “Satisfied Value-Focused Leisure Travelers”

Leisure-dominated (Solo 46.7% + Couples 24.4% + Family 21.4% = 92.5% leisure)

Excellent value perception (4.40/5)

High overall satisfaction (7.42/10)

Very high recommendation rate (85.1%)

Cluster 2: “Completely Dissatisfied Couples & Families”

Couples are largest group (42.5%)

Family is second largest (23.7%)

Terrible across ALL service dimensions

Extremely low recommendation (3.4%)

Key Strategic Insight:

You have a polarized customer base:

36.6% Advocates who love Ryanair’s value proposition

63.4% Critics who hate the experience

This suggests Ryanair’s ultra-low-cost model works well for budget-conscious solo/leisure travelers but fails miserably for couples and families who likely have different expectations and needs.

Business Insights

# Print a short narrative summary for every segment in the profile table.
cat("## Detailed Business Insights\n\n")
## ## Detailed Business Insights

# Labels are loop-invariant; their order matches the value vectors built below.
services <- c("Seat Comfort", "Cabin Staff", "Food & Beverages", "Ground Service", "Value for Money")
traveler_types <- c("Business", "Family Leisure", "Solo Leisure", "Couple Leisure", "Other")

# seq_len() yields an empty sequence for an empty profile table, whereas
# 1:nrow() would iterate over c(1, 0).
for (i in seq_len(nrow(cluster_profiles_mixed))) {
  cluster_data <- cluster_profiles_mixed[i, ]

  cat("###", cluster_data$segment_name, "(", cluster_data$percent_total, "% of customers)\n")
  cat("**Profile**:", cluster_data$n_customers, "customers")

  # The recommendation rate is NA when the source column was unavailable.
  if (!is.na(cluster_data$recommendation_rate)) {
    cat(" |", cluster_data$recommendation_rate, "% recommendation rate")
  }
  cat("\n")

  # Service patterns: best- and worst-rated dimension of this segment
  ratings <- c(cluster_data$avg_seat_comfort, cluster_data$avg_cabin_staff,
               cluster_data$avg_food_beverages, cluster_data$avg_ground_service,
               cluster_data$avg_value_money)

  top_service <- services[which.max(ratings)]
  worst_service <- services[which.min(ratings)]

  cat("**Service Pattern**: Strongest on", top_service, "(", max(ratings), "/5) |",
      "Weakest on", worst_service, "(", min(ratings), "/5)\n")

  # Traveler type insights: the most common traveller type in this segment
  traveler_pcts <- c(cluster_data$business_pct, cluster_data$family_pct,
                     cluster_data$solo_pct, cluster_data$couples_pct, cluster_data$other_pct)
  dominant_traveler <- traveler_types[which.max(traveler_pcts)]

  cat("**Traveler Mix**: Dominated by", dominant_traveler, "travelers (",
      max(traveler_pcts), "%)\n\n")
}
## ### Satisfied Value-Focused Leisure Travelers ( 36.6 % of customers)
## **Profile**: 598 customers | 85.1 % recommendation rate
## **Service Pattern**: Strongest on Value for Money ( 4.4 /5) | Weakest on Food & Beverages ( 2.59 /5)
## **Traveler Mix**: Dominated by Solo Leisure travelers ( 46.7 %)
## 
## ### Completely Dissatisfied Couples & Families ( 63.4 % of customers)
## **Profile**: 1037 customers | 3.4 % recommendation rate
## **Service Pattern**: Strongest on Cabin Staff ( 1.85 /5) | Weakest on Ground Service ( 1.21 /5)
## **Traveler Mix**: Dominated by Couple Leisure travelers ( 42.5 %)

Executive Summary

Clear Segmentation: Identified 2 natural customer segments with distinct profiles

Polarized Base: 36.6% are highly satisfied advocates vs 63.4% completely dissatisfied critics

Value Proposition Works: Ultra-low-cost model resonates strongly with solo leisure travelers

Critical Gap: Current experience fails to meet needs of couples and families

Actionable Insights: Clear strategic directions for marketing and service improvement

Next Steps:

- Deep dive into specific pain points for couples and families
- Develop targeted service packages for different traveler types
- Create segment-specific marketing and communication strategies
- Implement tracking to monitor segment migration over time