Logistic Regression: What Drives Customer Recommendations?

Setup

Data Preparation for Logistic Regression

# Load required libraries before any other statement, so every later chunk
# can find the functions it calls.
library(tidyverse)
library(cluster)
# broom supplies tidy(), which the coefficient tables below rely on.
# Attaching tidyverse does not attach broom, so it is loaded explicitly here
# rather than only in the later visualisation chunk.
library(broom)

# Load the cleaned dataset
ryanair_clean <- read.csv("ryanair_reviews_cleaned.csv")

# Segment customers on their service ratings; the result is ryanair_segmented
service_vars <- c(
  "Seat.Comfort", "Cabin.Staff.Service", "Food...Beverages",
  "Ground.Service", "Value.For.Money"
)

# Keep only the rating columns and standardise them column by column
clustering_data <- as.data.frame(
  scale(select(ryanair_clean, all_of(service_vars)))
)

# K-means with 3 centres and 25 random starts; the seed is fixed beforehand
set.seed(123)
kmeans_result <- kmeans(clustering_data, centers = 3, nstart = 25)

# Attach each review's cluster label as a factor column
ryanair_segmented <- mutate(
  ryanair_clean,
  kmeans_cluster = as.factor(kmeans_result$cluster)
)

cat("Clustering completed - 3 customer segments created\n")

## Clustering completed - 3 customer segments created

# Logistic regression preparation, in three steps:
#   1. encode the outcome as 1 ("yes") / 0 (anything else)
#   2. coerce the five rating predictors to numeric
#   3. attach each review's cluster label by matching on the X column
ryanair_logistic <- ryanair_clean %>%
  mutate(recommended_binary = ifelse(Recommended == "yes", 1, 0)) %>%
  mutate(
    across(
      c(Seat.Comfort, Cabin.Staff.Service, Food...Beverages,
        Ground.Service, Value.For.Money),
      as.numeric
    )
  ) %>%
  left_join(select(ryanair_segmented, X, kmeans_cluster), by = "X")

# Check recommendation distribution and balance
cat("### Recommendation Distribution - Balance Check\n")

## ### Recommendation Distribution - Balance Check

# Counts of non-recommenders (0) and recommenders (1)
rec_dist <- table(ryanair_logistic$recommended_binary)
print(rec_dist)

## 
##    0    1 
## 1247  872

# Share of reviews that recommend; computed once and reused for both the
# printed rate and the balance check below
rec_rate <- mean(ryanair_logistic$recommended_binary, na.rm = TRUE)
cat("\nRecommendation rate:", rec_rate * 100, "%\n")

## 
## Recommendation rate: 41.15149 %

# Check if dataset is balanced (typical threshold: 40-60% split).
# rec_rate is a single number, so the scalar, short-circuiting `||` is the
# right operator for this `if` condition (not the elementwise `|`).
if (rec_rate < 0.4 || rec_rate > 0.6) {
  cat("⚠️ WARNING: Dataset is IMBALANCED - may need sampling techniques\n")
} else {
  cat("Dataset is reasonably balanced for logistic regression\n")
}

## Dataset is reasonably balanced for logistic regression

# Keep only the modelling columns, then drop every row that has a missing
# value in any of them
ryanair_logistic_clean <- na.omit(
  select(
    ryanair_logistic,
    recommended_binary, Seat.Comfort, Cabin.Staff.Service, Food...Beverages,
    Ground.Service, Value.For.Money, kmeans_cluster, Overall.Rating
  )
)

# Report how many complete observations remain for modelling
cat("Final dataset for logistic regression:", nrow(ryanair_logistic_clean), "observations\n")

## Final dataset for logistic regression: 2119 observations

Overall Logistic Regression Model

# Overall logistic regression: recommendation (0/1) explained by the five
# service ratings
overall_model <- glm(
  recommended_binary ~ Seat.Comfort + Cabin.Staff.Service +
    Food...Beverages + Ground.Service + Value.For.Money,
  data = ryanair_logistic_clean,
  family = binomial
)

# Exponentiated coefficients, kept as an object but not displayed
odds_ratios <- exp(coef(overall_model))

# Display label for each model term; a term missing from this lookup
# would come out as NA
coef_labels <- c(
  "(Intercept)" = "Intercept",
  "Seat.Comfort" = "Seat Comfort",
  "Cabin.Staff.Service" = "Cabin Staff",
  "Food...Beverages" = "Food & Beverages",
  "Ground.Service" = "Ground Service",
  "Value.For.Money" = "Value for Money"
)

# Coefficient table: label, estimate, standard error, odds ratio, p-value
coef_details <- tidy(overall_model) %>%
  transmute(
    Service = unname(coef_labels[term]),
    Coefficient = estimate,
    Standard_Error = std.error,
    Odds_Ratio = exp(estimate),
    P_Value = p.value
  )

cat("### Logistic Regression Results\n")

## ### Logistic Regression Results

knitr::kable(coef_details, digits = 3)

Service	Coefficient	Standard_Error	Odds_Ratio	P_Value
Intercept	-10.042	0.511	0.000	0.000
Seat Comfort	0.428	0.108	1.534	0.000
Cabin Staff	0.570	0.093	1.769	0.000
Food & Beverages	0.201	0.090	1.223	0.026
Ground Service	0.482	0.086	1.619	0.000
Value for Money	1.718	0.104	5.575	0.000

# Model performance (keep this): share of the modelling rows whose predicted
# class, using a 0.5 probability cut-off, equals the observed outcome
predicted_probs <- predict(overall_model, type = "response")
predicted_class <- ifelse(predicted_probs > 0.5, 1, 0)
accuracy <- mean(
  ryanair_logistic_clean$recommended_binary == predicted_class
)
cat("\nModel Accuracy:", round(accuracy * 100, 1), "%\n")

## 
## Model Accuracy: 93.8 %

Visualize Feature Importance

library(broom)

# Odds-ratio bar chart: one horizontal bar per service dimension, with the
# intercept excluded and bars ordered by odds ratio
odds_plot_labels <- c(
  "Seat.Comfort" = "Seat Comfort",
  "Cabin.Staff.Service" = "Cabin Staff",
  "Food...Beverages" = "Food & Beverages",
  "Ground.Service" = "Ground Service",
  "Value.For.Money" = "Value for Money"
)

odds_plot <- tidy(overall_model) %>%
  filter(term != "(Intercept)") %>%
  mutate(
    term_clean = unname(odds_plot_labels[term]),
    odds_ratio = exp(estimate)
  ) %>%
  ggplot(aes(x = reorder(term_clean, odds_ratio), y = odds_ratio)) +
  # geom_col() draws bar heights straight from the data values
  geom_col(fill = "#073590", alpha = 0.8, width = 0.7) +
  # Value labels, nudged just past the end of each bar
  geom_text(
    aes(label = round(odds_ratio, 2)),
    hjust = -0.2, size = 4, color = "#073590"
  ) +
  # No padding at the bar base, 10% head-room at the far end
  scale_y_continuous(expand = expansion(mult = c(0, 0.1))) +
  coord_flip() +
  theme_minimal() +
  labs(
    title = "How Service Improvements Drive Recommendations",
    subtitle = "Each 1-point rating increase multiplies odds of recommendation",
    x = "Service Dimension",
    y = "Odds Ratio"
  )

odds_plot

Business Interpretation - Easy to Understand

# Business-facing summary of the overall model: one row per service
# dimension (intercept excluded) with its odds ratio, an impact tier and a
# plain-language reading, sorted from largest to smallest odds ratio.
business_insights <- tidy(overall_model) %>%
  filter(term != "(Intercept)") %>%
  mutate(
    Service = case_when(
      term == "Seat.Comfort" ~ "Seat Comfort",
      term == "Cabin.Staff.Service" ~ "Cabin Staff Service",
      term == "Food...Beverages" ~ "Food & Beverages",
      term == "Ground.Service" ~ "Ground Service", 
      term == "Value.For.Money" ~ "Value for Money"
    ),
    `Odds Ratio` = round(exp(estimate), 2),
    # Impact tiers are assigned from the rounded odds ratio
    `Impact` = case_when(
      `Odds Ratio` >= 3 ~ "VERY HIGH",
      `Odds Ratio` >= 2 ~ "HIGH",
      `Odds Ratio` >= 1.5 ~ "MEDIUM",
      TRUE ~ "LOW"
    ),
    # An odds ratio scales the ODDS of recommending, not the probability, so
    # the sentence is worded in terms of odds rather than "x more likely"
    `Business Interpretation` = paste0("1-point improvement multiplies the odds of recommending by ", `Odds Ratio`, "x")
  ) %>%
  select(Service, `Odds Ratio`, Impact, `Business Interpretation`) %>%
  arrange(desc(`Odds Ratio`))

cat("### How Service Improvements Drive Recommendations\n")

## ### How Service Improvements Drive Recommendations

business_insights %>% knitr::kable()

Service	Odds Ratio	Impact	Business Interpretation
Value for Money	5.57	VERY HIGH	1-point improvement multiplies the odds of recommending by 5.57x
Cabin Staff Service	1.77	MEDIUM	1-point improvement multiplies the odds of recommending by 1.77x
Ground Service	1.62	MEDIUM	1-point improvement multiplies the odds of recommending by 1.62x
Seat Comfort	1.53	MEDIUM	1-point improvement multiplies the odds of recommending by 1.53x
Food & Beverages	1.22	LOW	1-point improvement multiplies the odds of recommending by 1.22x

Strategic Recommendations

Priority #1: Focus on Value for Money - delivers 5.57x return on improvements
• Communicate value proposition clearly
• Review pricing strategy and transparency
• Highlight what’s included vs extra costs
Priority #2: Improve Cabin Staff Service - delivers 1.77x return
• Implement staff training programs
• Set clear service standards
• Improve complaint handling
Priority #3: Enhance Ground Service - delivers 1.62x return
• Streamline airport operations
• Reduce check-in and boarding wait times
• Improve baggage handling
Secondary: Seat Comfort improvements - delivers 1.53x return
• Basic comfort enhancements
• Legroom optimization where possible
• Cleanliness and maintenance
Lowest Priority: Food & Beverages - delivers 1.22x return
• Maintain current standards
• No major investment needed

Cluster-Specific Logistic Regression

# Segment names for the three k-means clusters, listed in cluster-id order
cluster_names <- data.frame(
  kmeans_cluster = 1:3,
  segment_name = c(
    "Satisfied Service-Experienced",
    "Highly Dissatisfied Customers",
    "Value-Satisfied but Ground Service Critics"
  )
)

# Fit the same logistic regression separately within each cluster and store
# the fits under "Cluster_<id>"; a cluster with no rows is skipped
cluster_models <- list()

for (cluster_num in 1:3) {
  cluster_data <- filter(ryanair_logistic_clean, kmeans_cluster == cluster_num)

  # Nothing to fit for an empty cluster
  if (nrow(cluster_data) == 0) {
    next
  }

  model <- glm(
    recommended_binary ~ Seat.Comfort + Cabin.Staff.Service +
      Food...Beverages + Ground.Service + Value.For.Money,
    data = cluster_data,
    family = binomial
  )
  cluster_models[[paste0("Cluster_", cluster_num)]] <- model
}

# Compare coefficients across clusters - CREATE cluster_coefs FIRST.
# One row per (cluster, predictor): the tidy() output of each cluster model
# without the intercept, plus the odds ratio. The `cluster` column holds the
# list names of cluster_models ("Cluster_1", ...).
cluster_coefs <- map_dfr(cluster_models, ~{
  tidy(.x) %>% 
    filter(term != "(Intercept)") %>%
    mutate(odds_ratio = exp(estimate))
}, .id = "cluster")

# Key the manual scales by level name. With positional values/labels, a
# missing level (e.g. every coefficient significant, or a skipped cluster)
# would shift the remaining colours, shapes and legend labels onto the wrong
# level; named vectors keep each one attached to its own level.
segment_colors <- c("1" = "#073590", "2" = "#FFD200", "3" = "#2E8B57")
segment_labels <- setNames(cluster_names$segment_name,
                           cluster_names$kmeans_cluster)
significance_shapes <- c("FALSE" = 1, "TRUE" = 16)  # 1 = hollow, 16 = solid
significance_labels <- c("FALSE" = "Not Significant",
                         "TRUE" = "Significant (p < 0.05)")

# Axis range: keep the original 0.9 floor, but widen it when an odds ratio
# falls below that, because values outside the scale limits are dropped from
# the plot. na.rm guards against a non-estimable coefficient.
odds_axis_limits <- c(
  min(0.9, min(cluster_coefs$odds_ratio, na.rm = TRUE) * 0.9),
  max(cluster_coefs$odds_ratio, na.rm = TRUE) * 1.1
)

# NOW create the plot with significance coding
cluster_comparison_plot <- cluster_coefs %>%
  mutate(
    # fixed = TRUE: the dots in the column names are literal characters,
    # not regex wildcards
    term = gsub("Seat.Comfort", "Seat Comfort", term, fixed = TRUE),
    term = gsub("Cabin.Staff.Service", "Cabin Staff", term, fixed = TRUE),
    term = gsub("Food...Beverages", "Food & Beverages", term, fixed = TRUE),
    term = gsub("Ground.Service", "Ground Service", term, fixed = TRUE),
    term = gsub("Value.For.Money", "Value for Money", term, fixed = TRUE),
    cluster_num = as.numeric(gsub("Cluster_", "", cluster)),
    # Add significance indicator
    significant = p.value < 0.05
  ) %>%
  ggplot(aes(x = term, y = odds_ratio, color = as.factor(cluster_num), shape = significant)) +
  geom_point(size = 4, position = position_dodge(width = 0.5), stroke = 1.5) +
  # Reference line at an odds ratio of 1 (no effect).
  # NOTE(review): ggplot2 >= 3.4.0 deprecates `size` for lines in favour of
  # `linewidth`; switch once the minimum ggplot2 version is confirmed.
  geom_hline(yintercept = 1, linetype = "dashed", color = "red", size = 1) +
  scale_color_manual(values = segment_colors,
                     labels = segment_labels,
                     name = "Customer Segment") +
  scale_shape_manual(values = significance_shapes,
                     labels = significance_labels,
                     name = "Statistical Significance") +
  coord_flip() +
  labs(title = "Recommendation Drivers by Customer Segment",
       subtitle = "Solid dots = statistically significant | Hollow dots = not significant",
       x = "Service Dimension", y = "Odds Ratio") +
  theme_minimal() +
  scale_y_continuous(limits = odds_axis_limits)

cluster_comparison_plot

Cluster-Specific Logistic Regression Results

# Segment names for the three k-means clusters, listed in cluster-id order
cluster_names <- data.frame(
  kmeans_cluster = 1:3,
  segment_name = c(
    "Satisfied Service-Experienced",
    "Highly Dissatisfied Customers",
    "Value-Satisfied but Ground Service Critics"
  )
)

# Display label for each predictor; a term missing from this lookup would
# come out as NA
segment_term_labels <- c(
  "Seat.Comfort" = "Seat Comfort",
  "Cabin.Staff.Service" = "Cabin Staff",
  "Food...Beverages" = "Food & Beverages",
  "Ground.Service" = "Ground Service",
  "Value.For.Money" = "Value for Money"
)

# Fit one logistic regression per cluster and stack the per-cluster
# coefficient tables (intercept excluded) into a single results table.
# A cluster with no rows returns NULL and so contributes no rows.
cluster_results <- bind_rows(lapply(1:3, function(cluster_num) {
  cluster_data <- filter(ryanair_logistic_clean, kmeans_cluster == cluster_num)

  if (nrow(cluster_data) == 0) {
    return(NULL)
  }

  model <- glm(
    recommended_binary ~ Seat.Comfort + Cabin.Staff.Service +
      Food...Beverages + Ground.Service + Value.For.Money,
    data = cluster_data,
    family = binomial
  )

  tidy(model) %>%
    filter(term != "(Intercept)") %>%
    transmute(
      Segment = cluster_names$segment_name[cluster_num],
      Cluster = cluster_num,
      Service = unname(segment_term_labels[term]),
      Coefficient = estimate,
      Odds_Ratio = exp(estimate),
      P_Value = p.value,
      Customers = nrow(cluster_data)
    )
}))

cat("### Cluster-Specific Logistic Regression Results\n")

## ### Cluster-Specific Logistic Regression Results

# The segment name already identifies each row, so the numeric cluster id is
# dropped from the displayed table
knitr::kable(select(cluster_results, -Cluster), digits = 3)

Segment	Service	Coefficient	Odds_Ratio	P_Value	Customers
Satisfied Service-Experienced	Seat Comfort	0.089	1.093	0.692	487
Satisfied Service-Experienced	Cabin Staff	0.679	1.971	0.001	487
Satisfied Service-Experienced	Food & Beverages	0.243	1.275	0.377	487
Satisfied Service-Experienced	Ground Service	0.012	1.012	0.965	487
Satisfied Service-Experienced	Value for Money	1.361	3.902	0.000	487
Highly Dissatisfied Customers	Seat Comfort	0.418	1.518	0.066	1155
Highly Dissatisfied Customers	Cabin Staff	0.238	1.269	0.253	1155
Highly Dissatisfied Customers	Food & Beverages	0.477	1.611	0.074	1155
Highly Dissatisfied Customers	Ground Service	0.652	1.919	0.002	1155
Highly Dissatisfied Customers	Value for Money	1.616	5.033	0.000	1155
Value-Satisfied but Ground Service Critics	Seat Comfort	0.552	1.737	0.005	477
Value-Satisfied but Ground Service Critics	Cabin Staff	0.497	1.644	0.008	477
Value-Satisfied but Ground Service Critics	Food & Beverages	0.161	1.175	0.150	477
Value-Satisfied but Ground Service Critics	Ground Service	0.644	1.904	0.130	477
Value-Satisfied but Ground Service Critics	Value for Money	1.923	6.838	0.000	477

The table above gives the detailed values behind the preceding plot, showing how each service dimension affects recommendations within each segment.

Insights by Customer Segment:

Satisfied Service-Experienced : Most influenced by Value for Money (Odds Ratio = 3.9 x)

Highly Dissatisfied Customers : Most influenced by Value for Money (Odds Ratio = 5.03 x)

Value-Satisfied but Ground Service Critics : Most influenced by Value for Money (Odds Ratio = 6.84 x)

Cross-check with Overall Rating

# Average ratings for non-recommenders (0) and recommenders (1), computed
# only on reviews with no missing value in the selected columns
recommendation_analysis <- na.omit(
  select(
    ryanair_logistic,
    recommended_binary, Seat.Comfort, Cabin.Staff.Service, Food...Beverages,
    Ground.Service, Value.For.Money, Overall.Rating
  )
) %>%
  group_by(recommended_binary) %>%
  summarise(
    avg_overall = mean(Overall.Rating),
    avg_seat = mean(Seat.Comfort),
    avg_cabin = mean(Cabin.Staff.Service),
    avg_food = mean(Food...Beverages),
    avg_ground = mean(Ground.Service),
    avg_value = mean(Value.For.Money),
    n_customers = n()
  )

cat("### Service Ratings: Recommending vs Non-Recommending Customers\n")

## ### Service Ratings: Recommending vs Non-Recommending Customers

knitr::kable(recommendation_analysis, digits = 2)

recommended_binary	avg_overall	avg_seat	avg_cabin	avg_food	avg_ground	avg_value	n_customers
0	1.70	1.69	1.96	1.67	1.24	1.62	1247
1	8.22	3.40	4.08	2.54	2.75	4.54	872

Service Rating Comparison

# Grouped bar chart of average ratings, recommenders vs non-recommenders.
# The caption states the rating scales, which differ between the service
# dimensions and the overall rating.
rating_labels <- c(
  "avg_overall" = "Overall Rating",
  "avg_seat" = "Seat Comfort",
  "avg_cabin" = "Cabin Staff",
  "avg_food" = "Food & Beverages",
  "avg_ground" = "Ground Service",
  "avg_value" = "Value for Money"
)

comparison_plot <- recommendation_analysis %>%
  # One row per (recommendation group, rating) pair
  pivot_longer(
    cols = starts_with("avg_"),
    names_to = "service",
    values_to = "rating"
  ) %>%
  mutate(
    service_clean = unname(rating_labels[service]),
    recommendation_status = ifelse(recommended_binary == 1, "Recommends", "Doesn't Recommend")
  ) %>%
  ggplot(aes(x = reorder(service_clean, rating), y = rating, fill = recommendation_status)) +
  geom_col(position = "dodge") +
  # Value labels sit just above each bar, dodged to match the bars
  geom_text(
    aes(label = round(rating, 1)),
    position = position_dodge(width = 0.9),
    vjust = -0.5, size = 3
  ) +
  scale_fill_manual(values = c("#FFD200", "#073590")) +
  labs(
    title = "Service Ratings: Recommending vs Non-Recommending Customers",
    subtitle = "How much higher do ratings need to be to get recommendations?",
    x = "Service",
    y = "Average Rating",
    fill = "",
    caption = "Note: Service dimensions rated on 5-point scale (1-5)\nOverall satisfaction rated on 10-point scale (1-10)"
  ) +
  theme_minimal() +
  theme(
    axis.text.x = element_text(angle = 45, hjust = 1),
    plot.caption = element_text(size = 10, color = "gray40")
  )

comparison_plot

Summary

The logistic regression reveals exactly which service improvements will yield the highest return on investment for customer recommendations. The odds ratios quantify how much each 1-point service improvement multiplies the odds of customers recommending Ryanair to others.

Business Recommendations

Based on our logistic regression analysis, Value for Money emerges as the most critical driver of customer recommendations, with a 1-point rating improvement multiplying recommendation odds by 5.6x. This significantly outpaces other service dimensions, making it the highest priority for intervention.

Cabin Staff Service, Ground Service, and Seat Comfort form a second tier of importance, each delivering substantial returns (1.5-1.8x odds improvement) and representing clear opportunities for meaningful impact. These services should receive focused but secondary investment following value perception improvements.

Food & Beverages, while still positively influencing recommendations, shows the smallest marginal return (1.2x odds improvement) and should be maintained at current service levels rather than receiving significant new investment. The analysis clearly indicates that Ryanair’s recommendation challenge is fundamentally a value proposition issue, not primarily a service quality problem.

Implementation Priority:

Immediate: Address value perception through pricing transparency and communication
Short-term: Enhance cabin staff and ground service operations
Medium-term: Consider basic comfort improvements
Maintain: Current food and beverage standards

Check if clusters match original analysis (this is just for validation)

# Validation only: print how many reviews fall into each k-means cluster
cat("Cluster distribution in logistic analysis:\n")

## Cluster distribution in logistic analysis:

table(ryanair_segmented[["kmeans_cluster"]])

## 
##    1    2    3 
##  487 1155  477

# Cluster sizes expected from the previous analysis:
#   Cluster 1: 487  (23%)
#   Cluster 2: 1155 (54.5%)
#   Cluster 3: 477  (22.5%)

We also looked into traveler types, but including this might confuse our audience, since the analysis already focuses on the clusters.

The results below are just extra information for curious readers.

We also ran logistic regressions by traveler type to see whether any interesting patterns emerge. Value for Money is the top recommendation driver for every traveler type except Business travelers, for whom it is not statistically significant.

Recommendation Rates by Traveler Type

# Recommendation rate (in %) and review count for each traveler type
traveler_recommendation <- summarise(
  group_by(ryanair_segmented, Type.Of.Traveller),
  recommendation_rate = mean(ifelse(Recommended == "yes", 1, 0)) * 100,
  n_customers = n()
)

# Horizontal bar chart of the rates: bars ordered by rate, no grid lines,
# no legend, and a percentage label at the end of every bar
traveler_recommendation_plot <- ggplot(
  traveler_recommendation,
  aes(
    x = recommendation_rate,
    y = reorder(Type.Of.Traveller, recommendation_rate),
    fill = Type.Of.Traveller
  )
) +
  geom_col(alpha = 0.8, width = 0.5) +
  geom_text(
    aes(label = paste0(round(recommendation_rate, 1), "%")),
    hjust = -0.2, size = 4, fontface = "bold"
  ) +
  scale_fill_manual(values = c("#073590", "#1E90FF", "#FFD200", "#32CD32", "#808080")) +
  # No padding at the bar base; 15% extra on the right leaves room for labels
  scale_x_continuous(expand = expansion(mult = c(0, 0.15))) +
  labs(
    title = "Recommendation Rates by Traveler Type",
    subtitle = "Percentage of customers who recommend Ryanair",
    x = "Recommendation Rate (%)",
    y = "Traveler Type",
    fill = "Traveler Type"
  ) +
  theme_minimal() +
  theme(
    legend.position = "none",
    # Remove grid lines
    panel.grid.major = element_blank(),
    panel.grid.minor = element_blank(),
    # Wider right-hand margin around the plot
    plot.margin = margin(10, 40, 10, 10)
  )

traveler_recommendation_plot

Ryanair Logistic regression

2025-11-13