Logistic Regression: What Drives Customer Recommendations?

Setup

Data Preparation for Logistic Regression

# Read the cleaned review data (path is relative to the working directory).
# Its X column is used further down as the key when cluster labels are joined.
ryanair_clean <- read.csv(file = "ryanair_reviews_cleaned.csv")

# Load required libraries
library(tidyverse)
library(cluster)

# Segment customers with k-means so that ryanair_segmented exists for the
# sections below. The five service-rating columns are the clustering features.
service_vars <- c(
  "Seat.Comfort",
  "Cabin.Staff.Service",
  "Food...Beverages",
  "Ground.Service",
  "Value.For.Money"
)

# Centre and scale each rating column so all features enter k-means on the
# same scale, then convert the resulting matrix back to a data frame.
clustering_data <- as.data.frame(scale(ryanair_clean[service_vars]))

# A fixed seed keeps the 25 random starts reproducible between runs.
set.seed(123)
kmeans_result <- kmeans(x = clustering_data, centers = 3, nstart = 25)

# Attach each row's cluster label as a factor column.
ryanair_segmented <- mutate(
  ryanair_clean,
  kmeans_cluster = as.factor(kmeans_result$cluster)
)

cat("Clustering completed - 3 customer segments created\n")
## Clustering completed - 3 customer segments created
# Build the modelling table in one pass: a 0/1 outcome, numeric predictors,
# and the segment label produced by the clustering step.
ryanair_logistic <- ryanair_clean %>%
  mutate(
    # 1 when the reviewer answered "yes", 0 for any other non-missing answer
    recommended_binary = ifelse(Recommended == "yes", 1, 0),
    # Coerce the five service ratings to numeric
    across(all_of(service_vars), as.numeric)
  ) %>%
  # Bring in the cluster label, matching rows on the X column
  left_join(select(ryanair_segmented, X, kmeans_cluster), by = "X")

# Check recommendation distribution and balance
cat("### Recommendation Distribution - Balance Check\n")
## ### Recommendation Distribution - Balance Check
rec_dist <- table(ryanair_logistic$recommended_binary)
print(rec_dist)
## 
##    0    1 
## 1247  872
# Share of reviews with recommended_binary == 1 (missing outcomes ignored);
# computed once and reused for both the printout and the balance check.
rec_rate <- mean(ryanair_logistic$recommended_binary, na.rm = TRUE)
cat("\nRecommendation rate:", rec_rate * 100, "%\n")
## 
## Recommendation rate: 41.15149 %
# Treat the outcome as balanced when the positive share lies within 40-60%.
# rec_rate is a single number, so the scalar operator || is used in the if().
if (rec_rate < 0.4 || rec_rate > 0.6) {
  cat("⚠️ WARNING: Dataset is IMBALANCED - may need sampling techniques\n")
} else {
  cat("Dataset is reasonably balanced for logistic regression\n")
}
## Dataset is reasonably balanced for logistic regression
# Keep the outcome, the five predictors, the segment label and the overall
# rating, then drop every row with a missing value in any of those columns.
# NOTE(review): Overall.Rating is retained here, so a row missing only that
# value is dropped as well.
ryanair_logistic_clean <- na.omit(
  select(
    ryanair_logistic,
    recommended_binary, all_of(service_vars), kmeans_cluster, Overall.Rating
  )
)

# Report how many complete rows remain for modelling
cat(
  "Final dataset for logistic regression:",
  nrow(ryanair_logistic_clean),
  "observations\n"
)
## Final dataset for logistic regression: 2119 observations

Overall Logistic Regression Model

# Fit a logistic regression of the 0/1 recommendation flag on the five
# service ratings, using all complete rows.
overall_model <- glm(recommended_binary ~ Seat.Comfort + Cabin.Staff.Service + 
                     Food...Beverages + Ground.Service + Value.For.Money,
                   data = ryanair_logistic_clean, family = binomial)

# Exponentiated coefficients (odds ratios); kept as an object, not displayed
odds_ratios <- exp(coef(overall_model))

# Coefficient table with readable labels and odds ratios.
# tidy() is called as broom::tidy() because library(broom) is only attached
# further down the file, after this point.
coef_details <- broom::tidy(overall_model) %>%
  mutate(
    term_clean = case_when(
      term == "(Intercept)" ~ "Intercept",
      term == "Seat.Comfort" ~ "Seat Comfort",
      term == "Cabin.Staff.Service" ~ "Cabin Staff",
      term == "Food...Beverages" ~ "Food & Beverages", 
      term == "Ground.Service" ~ "Ground Service",
      term == "Value.For.Money" ~ "Value for Money"
    ),
    odds_ratio = exp(estimate)
  ) %>%
  select(Service = term_clean, Coefficient = estimate, 
         Standard_Error = std.error, Odds_Ratio = odds_ratio, P_Value = p.value)

cat("### Logistic Regression Results\n")
## ### Logistic Regression Results
coef_details %>% knitr::kable(digits = 3)
Service Coefficient Standard_Error Odds_Ratio P_Value
Intercept -10.042 0.511 0.000 0.000
Seat Comfort 0.428 0.108 1.534 0.000
Cabin Staff 0.570 0.093 1.769 0.000
Food & Beverages 0.201 0.090 1.223 0.026
Ground Service 0.482 0.086 1.619 0.000
Value for Money 1.718 0.104 5.575 0.000
# In-sample accuracy: a review is classified as "recommend" when its fitted
# probability exceeds 0.5, and the result is compared with the observed
# outcome on the same rows the model was fitted to.
predicted_probs <- predict(overall_model, type = "response")
predicted_class <- ifelse(predicted_probs > 0.5, 1, 0)
accuracy <- mean(ryanair_logistic_clean$recommended_binary == predicted_class)
cat("\nModel Accuracy:", round(100 * accuracy, 1), "%\n")
## 
## Model Accuracy: 93.8 %

Visualize Feature Importance

library(broom)

# Horizontal bar chart of the odds ratio for each service dimension
# (the intercept is excluded), ordered from smallest to largest.
odds_plot <- tidy(overall_model) %>%
  filter(term != "(Intercept)") %>%
  mutate(
    odds_ratio = exp(estimate),
    # Look up a display label for each model term
    term_clean = unname(c(
      "Seat.Comfort" = "Seat Comfort",
      "Cabin.Staff.Service" = "Cabin Staff",
      "Food...Beverages" = "Food & Beverages",
      "Ground.Service" = "Ground Service",
      "Value.For.Money" = "Value for Money"
    )[term])
  ) %>%
  ggplot(aes(x = reorder(term_clean, odds_ratio), y = odds_ratio)) +
  geom_col(fill = "#073590", alpha = 0.8, width = 0.7) +
  # Value label placed just past the end of each bar
  geom_text(
    aes(label = round(odds_ratio, 2)),
    hjust = -0.2, size = 4, color = "#073590"
  ) +
  coord_flip() +
  labs(
    title = "How Service Improvements Drive Recommendations",
    subtitle = "Each 1-point rating increase multiplies odds of recommendation",
    x = "Service Dimension",
    y = "Odds Ratio"
  ) +
  theme_minimal() +
  # No padding at zero; 10% headroom at the top so labels are not clipped
  scale_y_continuous(expand = expansion(mult = c(0, 0.1)))

odds_plot

Business Interpretation - Easy to Understand

# Plain-language summary of the odds ratios, sorted from largest to smallest.
# NOTE(review): an odds ratio multiplies the odds of recommending, which is
# not the same as the probability; the "x more likely" wording generated
# below is a simplification - confirm it is acceptable for the audience.
business_insights <- tidy(overall_model) %>%
  filter(term != "(Intercept)") %>%
  mutate(
    # Look up a display label for each model term
    Service = unname(c(
      "Seat.Comfort" = "Seat Comfort",
      "Cabin.Staff.Service" = "Cabin Staff Service",
      "Food...Beverages" = "Food & Beverages",
      "Ground.Service" = "Ground Service",
      "Value.For.Money" = "Value for Money"
    )[term]),
    `Odds Ratio` = round(exp(estimate), 2),
    # Impact bands are assigned from the rounded odds ratio
    Impact = case_when(
      `Odds Ratio` >= 3 ~ "VERY HIGH",
      `Odds Ratio` >= 2 ~ "HIGH",
      `Odds Ratio` >= 1.5 ~ "MEDIUM",
      TRUE ~ "LOW"
    ),
    `Business Interpretation` = paste0(
      "1-point improvement makes customers ",
      `Odds Ratio`,
      "x more likely to recommend"
    )
  ) %>%
  select(Service, `Odds Ratio`, Impact, `Business Interpretation`) %>%
  arrange(desc(`Odds Ratio`))

cat("### How Service Improvements Drive Recommendations\n")
## ### How Service Improvements Drive Recommendations
knitr::kable(business_insights)
Service Odds Ratio Impact Business Interpretation
Value for Money 5.57 VERY HIGH 1-point improvement makes customers 5.57x more likely to recommend
Cabin Staff Service 1.77 MEDIUM 1-point improvement makes customers 1.77x more likely to recommend
Ground Service 1.62 MEDIUM 1-point improvement makes customers 1.62x more likely to recommend
Seat Comfort 1.53 MEDIUM 1-point improvement makes customers 1.53x more likely to recommend
Food & Beverages 1.22 LOW 1-point improvement makes customers 1.22x more likely to recommend

Strategic Recommendations

  1. Priority #1: Focus on Value for Money - delivers 5.57x return on improvements
    • Communicate value proposition clearly
    • Review pricing strategy and transparency
    • Highlight what’s included vs extra costs

  2. Priority #2: Improve Cabin Staff Service - delivers 1.77x return
    • Implement staff training programs
    • Set clear service standards
    • Improve complaint handling

  3. Priority #3: Enhance Ground Service - delivers 1.62x return
    • Streamline airport operations
    • Reduce check-in and boarding wait times
    • Improve baggage handling

  4. Secondary: Seat Comfort improvements - delivers 1.53x return
    • Basic comfort enhancements
    • Legroom optimization where possible
    • Cleanliness and maintenance

  5. Lowest Priority: Food & Beverages - delivers 1.22x return
    • Maintain current standards
    • No major investment needed

Cluster-Specific Logistic Regression

# Human-readable names for the three k-means segments, in cluster-id order
cluster_names <- data.frame(
  kmeans_cluster = 1:3,
  segment_name = c(
    "Satisfied Service-Experienced",
    "Highly Dissatisfied Customers",
    "Value-Satisfied but Ground Service Critics"
  )
)

# Fit the same logistic specification separately within each segment and
# collect the fits in a list with entries named Cluster_1, Cluster_2, ...
cluster_models <- list()

for (cluster_num in 1:3) {
  cluster_data <- filter(ryanair_logistic_clean, kmeans_cluster == cluster_num)

  # A segment without rows gets no model
  if (nrow(cluster_data) == 0) {
    next
  }

  model <- glm(
    recommended_binary ~ Seat.Comfort + Cabin.Staff.Service +
      Food...Beverages + Ground.Service + Value.For.Money,
    data = cluster_data,
    family = binomial
  )
  cluster_models[[paste0("Cluster_", cluster_num)]] <- model
}

# Stack the non-intercept coefficients of every segment model into one table;
# the list names (Cluster_1, ...) are stored in the "cluster" column.
cluster_coefs <- map_dfr(
  cluster_models,
  function(fit) {
    fit %>%
      tidy() %>%
      filter(term != "(Intercept)") %>%
      mutate(odds_ratio = exp(estimate))
  },
  .id = "cluster"
)

# Dot plot comparing odds ratios across segments; point shape encodes whether
# the coefficient is significant at the 5% level.
cluster_comparison_plot <- cluster_coefs %>%
  mutate(
    # fixed = TRUE: the dots in the column names are matched literally rather
    # than as regex wildcards
    term = gsub("Seat.Comfort", "Seat Comfort", term, fixed = TRUE),
    term = gsub("Cabin.Staff.Service", "Cabin Staff", term, fixed = TRUE),
    term = gsub("Food...Beverages", "Food & Beverages", term, fixed = TRUE),
    term = gsub("Ground.Service", "Ground Service", term, fixed = TRUE),
    term = gsub("Value.For.Money", "Value for Money", term, fixed = TRUE),
    cluster_num = as.numeric(gsub("Cluster_", "", cluster, fixed = TRUE)),
    # Add significance indicator
    significant = p.value < 0.05
  ) %>%
  ggplot(aes(x = term, y = odds_ratio, color = as.factor(cluster_num), shape = significant)) +
  geom_point(size = 4, position = position_dodge(width = 0.5), stroke = 1.5) +
  # Reference line at an odds ratio of 1 (no effect).
  # NOTE(review): `size` for lines is deprecated from ggplot2 3.4.0 in favour
  # of `linewidth`; switch once the installed ggplot2 version is confirmed.
  geom_hline(yintercept = 1, linetype = "dashed", color = "red", size = 1) +
  scale_color_manual(values = c("#073590", "#FFD200", "#2E8B57"),
                     labels = cluster_names$segment_name,
                     name = "Customer Segment") +
  # Values and labels are keyed by level name so that hollow/solid stay
  # correctly assigned even when only one of FALSE/TRUE occurs in the data.
  scale_shape_manual(values = c("FALSE" = 1, "TRUE" = 16),  # 1 = hollow, 16 = solid
                     labels = c("FALSE" = "Not Significant",
                                "TRUE" = "Significant (p < 0.05)"),
                     name = "Statistical Significance") +
  coord_flip() +
  labs(title = "Recommendation Drivers by Customer Segment",
       subtitle = "Solid dots = statistically significant | Hollow dots = not significant",
       x = "Service Dimension", y = "Odds Ratio") +
  theme_minimal() +
  # Lower limit is 0.9 unless an odds ratio falls below it, so that such
  # points are not dropped from the plot.
  scale_y_continuous(limits = c(min(0.9, cluster_coefs$odds_ratio),
                                max(cluster_coefs$odds_ratio) * 1.1))

cluster_comparison_plot

Cluster-Specific Logistic Regression Results

# Human-readable names for the three k-means segments, in cluster-id order
cluster_names <- data.frame(
  kmeans_cluster = 1:3,
  segment_name = c(
    "Satisfied Service-Experienced",
    "Highly Dissatisfied Customers",
    "Value-Satisfied but Ground Service Critics"
  )
)

# Fit one logistic model per segment and stack the non-intercept coefficients
# into a single results table (one row per segment x service dimension).
cluster_results <- map_dfr(1:3, function(cluster_num) {
  cluster_data <- filter(ryanair_logistic_clean, kmeans_cluster == cluster_num)

  # A segment without rows contributes nothing to the table
  if (nrow(cluster_data) == 0) {
    return(NULL)
  }

  model <- glm(
    recommended_binary ~ Seat.Comfort + Cabin.Staff.Service +
      Food...Beverages + Ground.Service + Value.For.Money,
    data = cluster_data,
    family = binomial
  )

  model %>%
    tidy() %>%
    filter(term != "(Intercept)") %>%
    mutate(
      # Look up a display label for each model term
      term_clean = unname(c(
        "Seat.Comfort" = "Seat Comfort",
        "Cabin.Staff.Service" = "Cabin Staff",
        "Food...Beverages" = "Food & Beverages",
        "Ground.Service" = "Ground Service",
        "Value.For.Money" = "Value for Money"
      )[term]),
      odds_ratio = exp(estimate),
      cluster = cluster_num,
      segment_name = cluster_names$segment_name[cluster_num],
      n_customers = nrow(cluster_data)
    ) %>%
    select(
      Segment = segment_name,
      Cluster = cluster,
      Service = term_clean,
      Coefficient = estimate,
      Odds_Ratio = odds_ratio,
      P_Value = p.value,
      Customers = n_customers
    )
})

cat("### Cluster-Specific Logistic Regression Results\n")
## ### Cluster-Specific Logistic Regression Results
# The numeric cluster id is dropped because the segment name is clearer
knitr::kable(select(cluster_results, -Cluster), digits = 3)
Segment Service Coefficient Odds_Ratio P_Value Customers
Satisfied Service-Experienced Seat Comfort 0.089 1.093 0.692 487
Satisfied Service-Experienced Cabin Staff 0.679 1.971 0.001 487
Satisfied Service-Experienced Food & Beverages 0.243 1.275 0.377 487
Satisfied Service-Experienced Ground Service 0.012 1.012 0.965 487
Satisfied Service-Experienced Value for Money 1.361 3.902 0.000 487
Highly Dissatisfied Customers Seat Comfort 0.418 1.518 0.066 1155
Highly Dissatisfied Customers Cabin Staff 0.238 1.269 0.253 1155
Highly Dissatisfied Customers Food & Beverages 0.477 1.611 0.074 1155
Highly Dissatisfied Customers Ground Service 0.652 1.919 0.002 1155
Highly Dissatisfied Customers Value for Money 1.616 5.033 0.000 1155
Value-Satisfied but Ground Service Critics Seat Comfort 0.552 1.737 0.005 477
Value-Satisfied but Ground Service Critics Cabin Staff 0.497 1.644 0.008 477
Value-Satisfied but Ground Service Critics Food & Beverages 0.161 1.175 0.150 477
Value-Satisfied but Ground Service Critics Ground Service 0.644 1.904 0.130 477
Value-Satisfied but Ground Service Critics Value for Money 1.923 6.838 0.000 477

The table lists the detailed values used in the plot above, showing how each service dimension affects recommendations within each segment.

Insights by Customer Segment:

Satisfied Service-Experienced : Most influenced by Value for Money (Odds Ratio = 3.9 x)

Highly Dissatisfied Customers : Most influenced by Value for Money (Odds Ratio = 5.03 x)

Value-Satisfied but Ground Service Critics : Most influenced by Value for Money (Odds Ratio = 6.84 x)

Cross-check with Overall Rating

# Average ratings for recommenders (1) versus non-recommenders (0), computed
# on rows that are complete for the outcome, the five service ratings and the
# overall rating.
recommendation_analysis <- ryanair_logistic %>%
  select(all_of(c("recommended_binary", service_vars, "Overall.Rating"))) %>%
  na.omit() %>%
  group_by(recommended_binary) %>%
  summarise(
    avg_overall = mean(Overall.Rating),
    avg_seat = mean(Seat.Comfort),
    avg_cabin = mean(Cabin.Staff.Service),
    avg_food = mean(Food...Beverages),
    avg_ground = mean(Ground.Service),
    avg_value = mean(Value.For.Money),
    n_customers = n()
  )

cat("### Service Ratings: Recommending vs Non-Recommending Customers\n")
## ### Service Ratings: Recommending vs Non-Recommending Customers
knitr::kable(recommendation_analysis, digits = 2)
recommended_binary avg_overall avg_seat avg_cabin avg_food avg_ground avg_value n_customers
0 1.70 1.69 1.96 1.67 1.24 1.62 1247
1 8.22 3.40 4.08 2.54 2.75 4.54 872

Service Rating Comparison

# Grouped bar chart of average ratings by recommendation status. The caption
# states the rating scales, since the overall rating and the service
# dimensions share one axis.
comparison_plot <- recommendation_analysis %>%
  pivot_longer(
    cols = starts_with("avg_"),
    names_to = "service",
    values_to = "rating"
  ) %>%
  mutate(
    # Look up a display label for each summary column
    service_clean = unname(c(
      "avg_overall" = "Overall Rating",
      "avg_seat" = "Seat Comfort",
      "avg_cabin" = "Cabin Staff",
      "avg_food" = "Food & Beverages",
      "avg_ground" = "Ground Service",
      "avg_value" = "Value for Money"
    )[service]),
    recommendation_status = ifelse(
      recommended_binary == 1, "Recommends", "Doesn't Recommend"
    )
  ) %>%
  ggplot(aes(
    x = reorder(service_clean, rating),
    y = rating,
    fill = recommendation_status
  )) +
  geom_col(position = "dodge") +
  # Value labels sit above each bar, dodged to line up with the bars
  geom_text(
    aes(label = round(rating, 1)),
    position = position_dodge(width = 0.9),
    vjust = -0.5,
    size = 3
  ) +
  # Colours are assigned to the fill levels in order
  scale_fill_manual(values = c("#FFD200", "#073590")) +
  labs(
    title = "Service Ratings: Recommending vs Non-Recommending Customers",
    subtitle = "How much higher do ratings need to be to get recommendations?",
    x = "Service",
    y = "Average Rating",
    fill = "",
    caption = "Note: Service dimensions rated on 5-point scale (1-5)\nOverall satisfaction rated on 10-point scale (1-10)"
  ) +
  theme_minimal() +
  theme(
    axis.text.x = element_text(angle = 45, hjust = 1),
    plot.caption = element_text(size = 10, color = "gray40")
  )

comparison_plot

Summary

The logistic regression indicates which service dimensions are most strongly associated with customer recommendations, and therefore where improvements are likely to yield the highest return. The odds ratios quantify how much each 1-point increase in a service rating multiplies the odds of a customer recommending Ryanair to others.

Business Recommendations

Based on our logistic regression analysis, Value for Money emerges as the most critical driver of customer recommendations, with a 1-point rating improvement multiplying recommendation odds by 5.6x. This significantly outpaces other service dimensions, making it the highest priority for intervention.

Cabin Staff Service, Ground Service, and Seat Comfort form a second tier of importance, each delivering substantial returns (1.5-1.8x odds improvement) and representing clear opportunities for meaningful impact. These services should receive focused but secondary investment following value perception improvements.

Food & Beverages, while still positively influencing recommendations, shows the smallest marginal return (1.2x odds improvement) and should be maintained at current service levels rather than receiving significant new investment. The analysis clearly indicates that Ryanair’s recommendation challenge is fundamentally a value proposition issue, not primarily a service quality problem.

Implementation Priority:

  1. Immediate: Address value perception through pricing transparency and communication

  2. Short-term: Enhance cabin staff and ground service operations

  3. Medium-term: Consider basic comfort improvements

  4. Maintain: Current food and beverage standards

Check if clusters match original analysis (this is just for validation)

# Print the number of reviews assigned to each k-means segment
cat("Cluster distribution in logistic analysis:\n")
## Cluster distribution in logistic analysis:
table(ryanair_segmented[["kmeans_cluster"]])
## 
##    1    2    3 
##  487 1155  477
# Expected segment sizes from the previous analysis, for comparison:
#   Cluster 1:  487 (23%)
#   Cluster 2: 1155 (54.5%)
#   Cluster 3:  477 (22.5%)

We also explored traveler types, but including this might confuse our audience because the analysis already focuses on the clusters.

They are just extra info if you are curious.

We also ran logistic regressions by traveler type to see whether any interesting patterns emerge. Value for Money is the top recommendation driver for every traveler type, except business travelers, for whom the effect is not statistically significant.

Recommendation Rates by Traveler Type

# Recommendation rate (in percent) and review count per traveler type.
# Missing Recommended values are ignored in the rate, matching the
# na.rm = TRUE used for the overall recommendation rate earlier in the file;
# n_customers still counts every row in the group.
traveler_recommendation <- ryanair_segmented %>%
  group_by(Type.Of.Traveller) %>%
  summarize(
    recommendation_rate = mean(Recommended == "yes", na.rm = TRUE) * 100,
    n_customers = n()
  )

# Horizontal bar chart of recommendation rate per traveler type, ordered by
# rate, without grid lines or a legend.
traveler_recommendation_plot <- ggplot(
  traveler_recommendation,
  aes(
    x = recommendation_rate,
    y = reorder(Type.Of.Traveller, recommendation_rate),
    fill = Type.Of.Traveller
  )
) +
  geom_col(alpha = 0.8, width = 0.5) +
  # Percentage label placed just past the end of each bar
  geom_text(
    aes(label = paste0(round(recommendation_rate, 1), "%")),
    hjust = -0.2, size = 4, fontface = "bold"
  ) +
  scale_fill_manual(
    values = c("#073590", "#1E90FF", "#FFD200", "#32CD32", "#808080")
  ) +
  labs(
    title = "Recommendation Rates by Traveler Type",
    subtitle = "Percentage of customers who recommend Ryanair",
    x = "Recommendation Rate (%)",
    y = "Traveler Type",
    fill = "Traveler Type"
  ) +
  theme_minimal() +
  theme(
    legend.position = "none",
    # No grid lines
    panel.grid.major = element_blank(),
    panel.grid.minor = element_blank(),
    # Extra right-hand margin around the plot
    plot.margin = margin(10, 40, 10, 10)
  ) +
  # No padding at zero; 15% headroom on the right so labels fit
  scale_x_continuous(expand = expansion(mult = c(0, 0.15)))

traveler_recommendation_plot