# Attach the packages used throughout the analysis
library(tidyverse)
library(cluster)
# Read the cleaned review data
ryanair_clean <- read.csv("ryanair_reviews_cleaned.csv")
# Service rating columns that drive the customer segmentation
service_vars <- c(
  "Seat.Comfort", "Cabin.Staff.Service", "Food...Beverages",
  "Ground.Service", "Value.For.Money"
)
# Standardise the ratings; scale() returns a matrix, so convert it back
clustering_data <- as.data.frame(
  scale(select(ryanair_clean, all_of(service_vars)))
)
# K-means with a fixed seed so the segment assignment is reproducible
set.seed(123)
kmeans_result <- kmeans(clustering_data, centers = 3, nstart = 25)
# ryanair_segmented = cleaned data plus the cluster label of every row
ryanair_segmented <- ryanair_clean
ryanair_segmented$kmeans_cluster <- as.factor(kmeans_result$cluster)
cat("Clustering completed - 3 customer segments created\n")
## Clustering completed - 3 customer segments created
# Prepare the modelling frame for the logistic regression:
# - recommended_binary: 1 when Recommended == "yes", 0 otherwise (NA stays NA)
# - the five service ratings are coerced to numeric
# - kmeans_cluster: segment label copied from ryanair_segmented
# ryanair_segmented was created from ryanair_clean by adding one column, so
# both have the same rows in the same order. Copying the column avoids a join
# on the `X` row-id column, which would duplicate rows if `X` were not unique
# and fail if the CSV carried no such column.
stopifnot(nrow(ryanair_segmented) == nrow(ryanair_clean))
ryanair_logistic <- ryanair_clean %>%
  mutate(
    recommended_binary = ifelse(Recommended == "yes", 1, 0),
    # Ensure all predictor variables are numeric
    across(
      c(Seat.Comfort, Cabin.Staff.Service, Food...Beverages,
        Ground.Service, Value.For.Money),
      as.numeric
    ),
    kmeans_cluster = ryanair_segmented$kmeans_cluster
  )
# Check how balanced the binary outcome is before fitting the model
cat("### Recommendation Distribution - Balance Check\n")
## ### Recommendation Distribution - Balance Check
rec_dist <- table(ryanair_logistic$recommended_binary)
print(rec_dist)
##
## 0 1
## 1247 872
# Share of reviews that recommend the airline (missing outcomes are ignored);
# computed once and reused for the printout and the balance check
rec_rate <- mean(ryanair_logistic$recommended_binary, na.rm = TRUE)
cat("\nRecommendation rate:", rec_rate * 100, "%\n")
##
## Recommendation rate: 41.15149 %
# Treat the data as balanced when the positive share lies within 40-60%.
# `||` is the scalar, short-circuiting operator meant for `if` conditions.
if (rec_rate < 0.4 || rec_rate > 0.6) {
  cat("⚠️ WARNING: Dataset is IMBALANCED - may need sampling techniques\n")
} else {
  cat("Dataset is reasonably balanced for logistic regression\n")
}
## Dataset is reasonably balanced for logistic regression
# Keep only the modelling columns and drop every row with a missing value
model_columns <- c(
  "recommended_binary", "Seat.Comfort", "Cabin.Staff.Service",
  "Food...Beverages", "Ground.Service", "Value.For.Money",
  "kmeans_cluster", "Overall.Rating"
)
ryanair_logistic_clean <- na.omit(
  select(ryanair_logistic, all_of(model_columns))
)
# Report how many complete cases are available to the model
cat("Final dataset for logistic regression:", nrow(ryanair_logistic_clean), "observations\n")
## Final dataset for logistic regression: 2119 observations
# Fit the overall logistic regression: recommendation (0/1) explained by the
# five service ratings
overall_model <- glm(
  recommended_binary ~ Seat.Comfort + Cabin.Staff.Service +
    Food...Beverages + Ground.Service + Value.For.Money,
  data = ryanair_logistic_clean, family = binomial
)
# Odds ratios (exponentiated coefficients); kept for later use, not displayed
odds_ratios <- exp(coef(overall_model))
# Coefficient table with readable labels.
# tidy() is namespace-qualified because library(broom) is only called further
# down in this script and library(tidyverse) does not attach broom.
coef_details <- broom::tidy(overall_model) %>%
  mutate(
    term_clean = case_when(
      term == "(Intercept)" ~ "Intercept",
      term == "Seat.Comfort" ~ "Seat Comfort",
      term == "Cabin.Staff.Service" ~ "Cabin Staff",
      term == "Food...Beverages" ~ "Food & Beverages",
      term == "Ground.Service" ~ "Ground Service",
      term == "Value.For.Money" ~ "Value for Money"
    ),
    odds_ratio = exp(estimate)
  ) %>%
  select(
    Service = term_clean, Coefficient = estimate,
    Standard_Error = std.error, Odds_Ratio = odds_ratio, P_Value = p.value
  )
cat("### Logistic Regression Results\n")
## ### Logistic Regression Results
coef_details %>% knitr::kable(digits = 3)
| Service | Coefficient | Standard_Error | Odds_Ratio | P_Value |
|---|---|---|---|---|
| Intercept | -10.042 | 0.511 | 0.000 | 0.000 |
| Seat Comfort | 0.428 | 0.108 | 1.534 | 0.000 |
| Cabin Staff | 0.570 | 0.093 | 1.769 | 0.000 |
| Food & Beverages | 0.201 | 0.090 | 1.223 | 0.026 |
| Ground Service | 0.482 | 0.086 | 1.619 | 0.000 |
| Value for Money | 1.718 | 0.104 | 5.575 | 0.000 |
# In-sample performance: predict "recommend" when the fitted probability
# exceeds 0.5 and compare the prediction with the observed outcome
predicted_probs <- predict(overall_model, type = "response")
predicted_class <- (predicted_probs > 0.5) * 1
is_correct <- predicted_class == ryanair_logistic_clean$recommended_binary
accuracy <- mean(is_correct)
cat("\nModel Accuracy:", round(accuracy * 100, 1), "%\n")
##
## Model Accuracy: 93.8 %
library(broom)
# Horizontal bar chart of the odds ratio of each service rating
# (intercept excluded, bars ordered by odds ratio)
odds_plot_labels <- c(
  "Seat.Comfort" = "Seat Comfort",
  "Cabin.Staff.Service" = "Cabin Staff",
  "Food...Beverages" = "Food & Beverages",
  "Ground.Service" = "Ground Service",
  "Value.For.Money" = "Value for Money"
)
odds_plot_data <- tidy(overall_model) %>%
  filter(term != "(Intercept)") %>%
  mutate(
    term_clean = unname(odds_plot_labels[term]),
    odds_ratio = exp(estimate)
  )
odds_plot <- ggplot(
  odds_plot_data,
  aes(x = reorder(term_clean, odds_ratio), y = odds_ratio)
) +
  geom_col(fill = "#073590", alpha = 0.8, width = 0.7) +
  # Value label placed just past the end of each bar
  geom_text(
    aes(label = round(odds_ratio, 2)),
    hjust = -0.2, size = 4, color = "#073590"
  ) +
  coord_flip() +
  # Extra room at the upper end so the value labels are not clipped
  scale_y_continuous(expand = expansion(mult = c(0, 0.1))) +
  labs(
    title = "How Service Improvements Drive Recommendations",
    subtitle = "Each 1-point rating increase multiplies odds of recommendation",
    x = "Service Dimension", y = "Odds Ratio"
  ) +
  theme_minimal()
odds_plot
# Business-facing summary of the overall model: odds ratio per service rating,
# an impact tier and a one-line interpretation, strongest driver first
insight_labels <- c(
  "Seat.Comfort" = "Seat Comfort",
  "Cabin.Staff.Service" = "Cabin Staff Service",
  "Food...Beverages" = "Food & Beverages",
  "Ground.Service" = "Ground Service",
  "Value.For.Money" = "Value for Money"
)
service_terms <- filter(tidy(overall_model), term != "(Intercept)")
business_insights <- service_terms %>%
  transmute(
    Service = unname(insight_labels[term]),
    `Odds Ratio` = round(exp(estimate), 2),
    # Tiers are evaluated top-down on the rounded odds ratio
    Impact = case_when(
      `Odds Ratio` >= 3 ~ "VERY HIGH",
      `Odds Ratio` >= 2 ~ "HIGH",
      `Odds Ratio` >= 1.5 ~ "MEDIUM",
      TRUE ~ "LOW"
    ),
    `Business Interpretation` = paste0(
      "1-point improvement makes customers ", `Odds Ratio`,
      "x more likely to recommend"
    )
  ) %>%
  arrange(desc(`Odds Ratio`))
cat("### How Service Improvements Drive Recommendations\n")
## ### How Service Improvements Drive Recommendations
knitr::kable(business_insights)
| Service | Odds Ratio | Impact | Business Interpretation |
|---|---|---|---|
| Value for Money | 5.57 | VERY HIGH | 1-point improvement makes customers 5.57x more likely to recommend |
| Cabin Staff Service | 1.77 | MEDIUM | 1-point improvement makes customers 1.77x more likely to recommend |
| Ground Service | 1.62 | MEDIUM | 1-point improvement makes customers 1.62x more likely to recommend |
| Seat Comfort | 1.53 | MEDIUM | 1-point improvement makes customers 1.53x more likely to recommend |
| Food & Beverages | 1.22 | LOW | 1-point improvement makes customers 1.22x more likely to recommend |
Priority #1: Focus on Value for Money - delivers
5.57x return on improvements
• Communicate value proposition clearly
• Review pricing strategy and transparency
• Highlight what’s included vs extra costs
Priority #2: Improve Cabin Staff Service -
delivers 1.77x return
• Implement staff training programs
• Set clear service standards
• Improve complaint handling
Priority #3: Enhance Ground Service - delivers
1.62x return
• Streamline airport operations
• Reduce check-in and boarding wait times
• Improve baggage handling
Secondary: Seat Comfort improvements - delivers
1.53x return
• Basic comfort enhancements
• Legroom optimization where possible
• Cleanliness and maintenance
Lowest Priority: Food & Beverages - delivers
1.22x return
• Maintain current standards
• No major investment needed
# Segment names for the three k-means clusters (row i describes cluster i)
cluster_names <- data.frame(
  kmeans_cluster = 1:3,
  segment_name = c("Satisfied Service-Experienced",
                   "Highly Dissatisfied Customers",
                   "Value-Satisfied but Ground Service Critics")
)
# Fit one logistic regression per cluster, stored under "Cluster_<n>".
# A cluster without any rows is skipped, so it gets no entry in the list.
cluster_models <- list()
for (cluster_num in cluster_names$kmeans_cluster) {
  cluster_data <- ryanair_logistic_clean %>%
    filter(kmeans_cluster == cluster_num)
  if (nrow(cluster_data) > 0) {
    model <- glm(
      recommended_binary ~ Seat.Comfort + Cabin.Staff.Service +
        Food...Beverages + Ground.Service + Value.For.Money,
      data = cluster_data, family = binomial
    )
    cluster_models[[paste0("Cluster_", cluster_num)]] <- model
  }
}
# Stack the per-cluster coefficients (intercept removed) and add odds ratios;
# the `cluster` column holds the list name ("Cluster_<n>")
cluster_coefs <- map_dfr(cluster_models, ~{
  tidy(.x) %>%
    filter(term != "(Intercept)") %>%
    mutate(odds_ratio = exp(estimate))
}, .id = "cluster")
# Exact-match lookup for display labels; the previous gsub() calls treated the
# dots in the column names as regex wildcards
comparison_labels <- c(
  "Seat.Comfort" = "Seat Comfort",
  "Cabin.Staff.Service" = "Cabin Staff",
  "Food...Beverages" = "Food & Beverages",
  "Ground.Service" = "Ground Service",
  "Value.For.Money" = "Value for Money"
)
# Colours and shapes are keyed by name so each segment / significance level
# keeps its own colour, shape and legend label even when a cluster was skipped
# or when every coefficient falls on the same side of p = 0.05
segment_colours <- setNames(
  c("#073590", "#FFD200", "#2E8B57"),
  cluster_names$segment_name
)
significance_shapes <- c(
  "Not Significant" = 1,         # 1 = hollow
  "Significant (p < 0.05)" = 16  # 16 = solid
)
# The axis starts at 0.9 as before, but widens when an odds ratio is smaller so
# that no point is silently dropped by the axis limits
odds_ratio_limits <- c(
  min(0.9, min(cluster_coefs$odds_ratio) * 0.9),
  max(cluster_coefs$odds_ratio) * 1.1
)
# NOW create the plot with significance coding
cluster_comparison_plot <- cluster_coefs %>%
  mutate(
    # Fall back to the raw term name for anything not in the lookup
    term = coalesce(unname(comparison_labels[term]), term),
    cluster_num = as.numeric(sub("^Cluster_", "", cluster)),
    segment = factor(
      cluster_num,
      levels = cluster_names$kmeans_cluster,
      labels = cluster_names$segment_name
    ),
    # Add significance indicator
    significant = factor(
      p.value < 0.05,
      levels = c(FALSE, TRUE),
      labels = names(significance_shapes)
    )
  ) %>%
  ggplot(aes(x = term, y = odds_ratio, color = segment, shape = significant)) +
  geom_point(size = 4, position = position_dodge(width = 0.5), stroke = 1.5) +
  # Reference line at odds ratio = 1 (no effect)
  geom_hline(yintercept = 1, linetype = "dashed", color = "red", size = 1) +
  scale_color_manual(
    values = segment_colours,
    name = "Customer Segment",
    drop = FALSE
  ) +
  scale_shape_manual(
    values = significance_shapes,
    name = "Statistical Significance",
    drop = FALSE
  ) +
  coord_flip() +
  labs(
    title = "Recommendation Drivers by Customer Segment",
    subtitle = "Solid dots = statistically significant | Hollow dots = not significant",
    x = "Service Dimension", y = "Odds Ratio"
  ) +
  theme_minimal() +
  scale_y_continuous(limits = odds_ratio_limits)
cluster_comparison_plot
# Segment names for the three k-means clusters (row i describes cluster i)
cluster_names <- data.frame(
  kmeans_cluster = 1:3,
  segment_name = c(
    "Satisfied Service-Experienced",
    "Highly Dissatisfied Customers",
    "Value-Satisfied but Ground Service Critics"
  )
)
# Display labels for the model terms
cluster_service_labels <- c(
  "Seat.Comfort" = "Seat Comfort",
  "Cabin.Staff.Service" = "Cabin Staff",
  "Food...Beverages" = "Food & Beverages",
  "Ground.Service" = "Ground Service",
  "Value.For.Money" = "Value for Money"
)
# Fit the logistic regression on one cluster and return its coefficient table
# (intercept removed); returns NULL when the cluster has no rows
summarise_cluster_model <- function(cluster_num) {
  cluster_data <- filter(ryanair_logistic_clean, kmeans_cluster == cluster_num)
  if (nrow(cluster_data) == 0) {
    return(invisible(NULL))
  }
  fit <- glm(
    recommended_binary ~ Seat.Comfort + Cabin.Staff.Service +
      Food...Beverages + Ground.Service + Value.For.Money,
    data = cluster_data, family = binomial
  )
  tidy(fit) %>%
    filter(term != "(Intercept)") %>%
    transmute(
      Segment = cluster_names$segment_name[cluster_num],
      Cluster = cluster_num,
      Service = unname(cluster_service_labels[term]),
      Coefficient = estimate,
      Odds_Ratio = exp(estimate),
      P_Value = p.value,
      Customers = nrow(cluster_data)
    )
}
# One table for all clusters, rows stacked in cluster order
cluster_results <- map_dfr(1:3, summarise_cluster_model)
cat("### Cluster-Specific Logistic Regression Results\n")
## ### Cluster-Specific Logistic Regression Results
# The segment name already identifies the cluster, so the number is dropped
knitr::kable(select(cluster_results, -Cluster), digits = 3)
| Segment | Service | Coefficient | Odds_Ratio | P_Value | Customers |
|---|---|---|---|---|---|
| Satisfied Service-Experienced | Seat Comfort | 0.089 | 1.093 | 0.692 | 487 |
| Satisfied Service-Experienced | Cabin Staff | 0.679 | 1.971 | 0.001 | 487 |
| Satisfied Service-Experienced | Food & Beverages | 0.243 | 1.275 | 0.377 | 487 |
| Satisfied Service-Experienced | Ground Service | 0.012 | 1.012 | 0.965 | 487 |
| Satisfied Service-Experienced | Value for Money | 1.361 | 3.902 | 0.000 | 487 |
| Highly Dissatisfied Customers | Seat Comfort | 0.418 | 1.518 | 0.066 | 1155 |
| Highly Dissatisfied Customers | Cabin Staff | 0.238 | 1.269 | 0.253 | 1155 |
| Highly Dissatisfied Customers | Food & Beverages | 0.477 | 1.611 | 0.074 | 1155 |
| Highly Dissatisfied Customers | Ground Service | 0.652 | 1.919 | 0.002 | 1155 |
| Highly Dissatisfied Customers | Value for Money | 1.616 | 5.033 | 0.000 | 1155 |
| Value-Satisfied but Ground Service Critics | Seat Comfort | 0.552 | 1.737 | 0.005 | 477 |
| Value-Satisfied but Ground Service Critics | Cabin Staff | 0.497 | 1.644 | 0.008 | 477 |
| Value-Satisfied but Ground Service Critics | Food & Beverages | 0.161 | 1.175 | 0.150 | 477 |
| Value-Satisfied but Ground Service Critics | Ground Service | 0.644 | 1.904 | 0.130 | 477 |
| Value-Satisfied but Ground Service Critics | Value for Money | 1.923 | 6.838 | 0.000 | 477 |
The table lists the detailed values used in the plot above, showing how each service dimension affects recommendations within each segment.
Insights by Customer Segment:
Satisfied Service-Experienced : Most influenced by Value for Money (Odds Ratio = 3.9 x)
Highly Dissatisfied Customers : Most influenced by Value for Money (Odds Ratio = 5.03 x)
Value-Satisfied but Ground Service Critics : Most influenced by Value for Money (Odds Ratio = 6.84 x)
# Average ratings of recommending vs non-recommending customers, computed on
# the rows that have no missing value in any of the columns below
rating_columns <- c(
  "recommended_binary", "Seat.Comfort", "Cabin.Staff.Service",
  "Food...Beverages", "Ground.Service", "Value.For.Money", "Overall.Rating"
)
complete_ratings <- na.omit(select(ryanair_logistic, all_of(rating_columns)))
recommendation_analysis <- complete_ratings %>%
  group_by(recommended_binary) %>%
  summarise(
    avg_overall = mean(Overall.Rating),
    avg_seat = mean(Seat.Comfort),
    avg_cabin = mean(Cabin.Staff.Service),
    avg_food = mean(Food...Beverages),
    avg_ground = mean(Ground.Service),
    avg_value = mean(Value.For.Money),
    n_customers = n()
  )
cat("### Service Ratings: Recommending vs Non-Recommending Customers\n")
## ### Service Ratings: Recommending vs Non-Recommending Customers
knitr::kable(recommendation_analysis, digits = 2)
| recommended_binary | avg_overall | avg_seat | avg_cabin | avg_food | avg_ground | avg_value | n_customers |
|---|---|---|---|---|---|---|---|
| 0 | 1.70 | 1.69 | 1.96 | 1.67 | 1.24 | 1.62 | 1247 |
| 1 | 8.22 | 3.40 | 4.08 | 2.54 | 2.75 | 4.54 | 872 |
# Grouped bar chart comparing the average ratings of recommending and
# non-recommending customers; the caption states the two rating scales
rating_labels <- c(
  "avg_overall" = "Overall Rating",
  "avg_seat" = "Seat Comfort",
  "avg_cabin" = "Cabin Staff",
  "avg_food" = "Food & Beverages",
  "avg_ground" = "Ground Service",
  "avg_value" = "Value for Money"
)
# One row per (recommendation group, rating) pair
comparison_data <- recommendation_analysis %>%
  pivot_longer(
    cols = starts_with("avg_"),
    names_to = "service",
    values_to = "rating"
  ) %>%
  mutate(
    service_clean = unname(rating_labels[service]),
    recommendation_status = ifelse(
      recommended_binary == 1, "Recommends", "Doesn't Recommend"
    )
  )
comparison_plot <- ggplot(
  comparison_data,
  aes(
    x = reorder(service_clean, rating),
    y = rating,
    fill = recommendation_status
  )
) +
  geom_col(position = "dodge") +
  # Value labels dodged by the same width as the bars so they sit above them
  geom_text(
    aes(label = round(rating, 1)),
    position = position_dodge(width = 0.9),
    vjust = -0.5, size = 3
  ) +
  scale_fill_manual(values = c("#FFD200", "#073590")) +
  labs(
    title = "Service Ratings: Recommending vs Non-Recommending Customers",
    subtitle = "How much higher do ratings need to be to get recommendations?",
    x = "Service", y = "Average Rating",
    fill = "",
    caption = "Note: Service dimensions rated on 5-point scale (1-5)\nOverall satisfaction rated on 10-point scale (1-10)"
  ) +
  theme_minimal() +
  theme(
    axis.text.x = element_text(angle = 45, hjust = 1),
    plot.caption = element_text(size = 10, color = "gray40")
  )
comparison_plot
The logistic regression indicates which service improvements are likely to yield the highest return on investment for customer recommendations. The odds ratios quantify how much each 1-point improvement in a service rating multiplies the odds (not the probability) of a customer recommending Ryanair to others, holding the other ratings constant.
Based on our logistic regression analysis, Value for Money emerges as the most critical driver of customer recommendations, with a 1-point rating improvement multiplying recommendation odds by 5.6x. This significantly outpaces other service dimensions, making it the highest priority for intervention.
Cabin Staff Service, Ground Service, and Seat Comfort form a second tier of importance, each delivering substantial returns (1.5-1.8x odds improvement) and representing clear opportunities for meaningful impact. These services should receive focused but secondary investment following value perception improvements.
Food & Beverages, while still positively influencing recommendations, shows the smallest marginal return (1.2x odds improvement) and should be maintained at current service levels rather than receiving significant new investment. The analysis clearly indicates that Ryanair’s recommendation challenge is fundamentally a value proposition issue, not primarily a service quality problem.
Implementation Priority:
Immediate: Address value perception through pricing transparency and communication
Short-term: Enhance cabin staff and ground service operations
Medium-term: Consider basic comfort improvements
Maintain: Current food and beverage standards
# Segment sizes among the rows actually used in the logistic regression
# (ryanair_logistic_clean), so that the table matches its heading
cat("Cluster distribution in logistic analysis:\n")
## Cluster distribution in logistic analysis:
table(ryanair_logistic_clean$kmeans_cluster)
##
## 1 2 3
## 487 1155 477
# Compare with what you expect from previous analysis:
# Cluster 1: 487 (23%)
# Cluster 2: 1155 (54.5%)
# Cluster 3: 477 (22.5%)
We also looked at traveler types, but this may confuse the audience because the analysis already focuses on the clusters.
The results below are extra information for curious readers.
We also ran the logistic regression by traveler type to see whether any interesting patterns emerge. Value for Money is the top recommendation driver for all traveler types except Business travelers, for whom the effect is not statistically significant.
# Recommendation rate (in %) and number of reviews per traveler type
traveler_recommendation <- ryanair_segmented %>%
  group_by(Type.Of.Traveller) %>%
  summarise(
    recommendation_rate = mean(Recommended == "yes") * 100,
    n_customers = n()
  )
# Horizontal bars ordered by recommendation rate, without grid lines or legend
traveler_recommendation_plot <- ggplot(
  traveler_recommendation,
  aes(
    x = recommendation_rate,
    y = reorder(Type.Of.Traveller, recommendation_rate),
    fill = Type.Of.Traveller
  )
) +
  geom_col(alpha = 0.8, width = 0.5) +
  # Percentage label placed just past the end of each bar
  geom_text(
    aes(label = paste0(round(recommendation_rate, 1), "%")),
    hjust = -0.2, size = 4, fontface = "bold"
  ) +
  scale_fill_manual(
    values = c("#073590", "#1E90FF", "#FFD200", "#32CD32", "#808080")
  ) +
  # Extend the x-axis to make room for the labels
  scale_x_continuous(expand = expansion(mult = c(0, 0.15))) +
  labs(
    title = "Recommendation Rates by Traveler Type",
    subtitle = "Percentage of customers who recommend Ryanair",
    x = "Recommendation Rate (%)", y = "Traveler Type",
    fill = "Traveler Type"
  ) +
  theme_minimal() +
  theme(
    legend.position = "none",
    panel.grid.major = element_blank(),
    panel.grid.minor = element_blank(),
    # Wider right margin so the bar labels are not cut off
    plot.margin = margin(10, 40, 10, 10)
  )
traveler_recommendation_plot