1. Load Dataset
library(readxl)
df <- read_excel("satisfaction.xlsx", sheet = "satisfaction_v2")
head(df)
## # A tibble: 6 × 24
## id satisfaction_v2 Gender `Customer Type` Age `Type of Travel` Class
## <dbl> <chr> <chr> <chr> <dbl> <chr> <chr>
## 1 11112 satisfied Female Loyal Customer 65 Personal Travel Eco
## 2 110278 satisfied Male Loyal Customer 47 Personal Travel Business
## 3 103199 satisfied Female Loyal Customer 15 Personal Travel Eco
## 4 47462 satisfied Female Loyal Customer 60 Personal Travel Eco
## 5 120011 satisfied Female Loyal Customer 70 Personal Travel Eco
## 6 100744 satisfied Male Loyal Customer 30 Personal Travel Eco
## # ℹ 17 more variables: `Flight Distance` <dbl>, `Seat comfort` <dbl>,
## # `Departure/Arrival time convenient` <dbl>, `Food and drink` <dbl>,
## # `Gate location` <dbl>, `Inflight wifi service` <dbl>,
## # `Inflight entertainment` <dbl>, `Online support` <dbl>,
## # `Ease of Online booking` <dbl>, `On-board service` <dbl>,
## # `Leg room service` <dbl>, `Baggage handling` <dbl>,
## # `Checkin service` <dbl>, Cleanliness <dbl>, `Online boarding` <dbl>, …
2. Preprocessing
df <- na.omit(df)
df$satisfaction <- ifelse(df$satisfaction_v2 == "satisfied", 1, 0)
# Label encoding
df$Gender <- as.numeric(factor(df$Gender))
df$Customer.Type <- as.numeric(factor(df$`Customer Type`))
df$Type.of.Travel <- as.numeric(factor(df$`Type of Travel`))
df$Class <- as.numeric(factor(df$Class))
# Normalisasi
num_cols <- c("Flight Distance", "Seat comfort", "Departure/Arrival time convenient",
"Food and drink", "Inflight wifi service", "Inflight entertainment",
"Online support", "Ease of Online booking", "On-board service",
"Leg room service", "Baggage handling", "Checkin service",
"Cleanliness", "Online boarding", "Departure Delay in Minutes",
"Arrival Delay in Minutes")
df[num_cols] <- scale(df[num_cols])
3. Eksplorasi Data
library(ggplot2)
ggplot(df, aes(x = Age)) +
geom_histogram(bins = 30, fill = "steelblue", color = "black") +
labs(title = "Distribusi Usia Penumpang")

library(tidyr)
library(dplyr)
num_data <- df[, num_cols]
long_data <- pivot_longer(num_data, cols = everything(), names_to = "Feature", values_to = "Value")
ggplot(long_data, aes(x = Value)) +
geom_histogram(bins = 30, fill = "steelblue", color = "black") +
facet_wrap(~ Feature, scales = "free", ncol = 4) +
theme_minimal()

4. Feature Selection (EFA)
library(psych)
efa_data <- df[, num_cols]
KMO(efa_data)
## Kaiser-Meyer-Olkin factor adequacy
## Call: KMO(r = efa_data)
## Overall MSA = 0.74
## MSA for each item =
## Flight Distance Seat comfort
## 0.71 0.71
## Departure/Arrival time convenient Food and drink
## 0.69 0.65
## Inflight wifi service Inflight entertainment
## 0.84 0.78
## Online support Ease of Online booking
## 0.84 0.77
## On-board service Leg room service
## 0.85 0.89
## Baggage handling Checkin service
## 0.82 0.73
## Cleanliness Online boarding
## 0.80 0.82
## Departure Delay in Minutes Arrival Delay in Minutes
## 0.51 0.51
fa.parallel(efa_data, fa = "fa")

## Parallel analysis suggests that the number of factors = 6 and the number of components = NA
efa_result <- fa(efa_data, nfactors = 6, rotate = "varimax", fm = "ml")
print(efa_result, cut = 0.3)
## Factor Analysis using method = ml
## Call: fa(r = efa_data, nfactors = 6, rotate = "varimax", fm = "ml")
## Standardized loadings (pattern matrix) based upon correlation matrix
## ML1 ML5 ML2 ML4 ML3 ML6 h2
## Flight Distance 0.017
## Seat comfort 0.76 0.660
## Departure/Arrival time convenient 0.62 0.404
## Food and drink 0.89 0.825
## Inflight wifi service 0.75 0.560
## Inflight entertainment 0.31 0.86 0.932
## Online support 0.75 0.634
## Ease of Online booking 0.80 0.52 0.985
## On-board service 0.70 0.508
## Leg room service 0.54 0.308
## Baggage handling 0.76 0.599
## Checkin service 0.42 0.270
## Cleanliness 0.79 0.635
## Online boarding 0.84 0.731
## Departure Delay in Minutes 0.98 0.967
## Arrival Delay in Minutes 0.98 0.964
## u2 com
## Flight Distance 0.983 1.4
## Seat comfort 0.340 1.3
## Departure/Arrival time convenient 0.596 1.1
## Food and drink 0.175 1.1
## Inflight wifi service 0.440 1.0
## Inflight entertainment 0.068 1.6
## Online support 0.366 1.3
## Ease of Online booking 0.015 2.0
## On-board service 0.492 1.1
## Leg room service 0.692 1.1
## Baggage handling 0.401 1.1
## Checkin service 0.730 2.1
## Cleanliness 0.365 1.1
## Online boarding 0.269 1.1
## Departure Delay in Minutes 0.033 1.0
## Arrival Delay in Minutes 0.036 1.0
##
## ML1 ML5 ML2 ML4 ML3 ML6
## SS loadings 2.61 2.36 1.93 1.82 0.91 0.36
## Proportion Var 0.16 0.15 0.12 0.11 0.06 0.02
## Cumulative Var 0.16 0.31 0.43 0.55 0.60 0.62
## Proportion Explained 0.26 0.24 0.19 0.18 0.09 0.04
## Cumulative Proportion 0.26 0.50 0.69 0.87 0.96 1.00
##
## Mean item complexity = 1.3
## Test of the hypothesis that 6 factors are sufficient.
##
## df null model = 120 with the objective function = 8.24 with Chi Square = 1066670
## df of the model are 39 and the objective function was 0.03
##
## The root mean square of the residuals (RMSR) is 0.01
## The df corrected root mean square of the residuals is 0.02
##
## The harmonic n.obs is 129487 with the empirical chi square 2500.04 with prob < 0
## The total n.obs was 129487 with Likelihood Chi Square = 4228.36 with prob < 0
##
## Tucker Lewis Index of factoring reliability = 0.988
## RMSEA index = 0.029 and the 90 % confidence intervals are 0.028 0.03
## BIC = 3769.27
## Fit based upon off diagonal values = 1
## Measures of factor score adequacy
## ML1 ML5 ML2 ML4 ML3 ML6
## Correlation of (regression) scores with factors 0.95 0.93 0.99 0.93 0.93 0.76
## Multiple R square of scores with factors 0.91 0.86 0.98 0.86 0.87 0.58
## Minimum correlation of possible factor scores 0.82 0.71 0.95 0.72 0.74 0.17
5. Modeling
library(caret)
selected_vars <- c("Online boarding", "Departure Delay in Minutes", "Inflight entertainment",
"Cleanliness", "Food and drink", "Checkin service")
model_formula <- as.formula(
paste("satisfaction ~", paste(sprintf("`%s`", selected_vars), collapse = " + "))
)
set.seed(123)
train_index <- createDataPartition(df$satisfaction, p = 0.8, list = FALSE)
train_data <- df[train_index, ]
test_data <- df[-train_index, ]
Logistic Regression
log_model <- glm(model_formula, data = train_data, family = binomial)
summary(log_model)
##
## Call:
## glm(formula = model_formula, family = binomial, data = train_data)
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) 0.236027 0.007825 30.16 <2e-16 ***
## `Online boarding` 0.405657 0.008310 48.81 <2e-16 ***
## `Departure Delay in Minutes` -0.163794 0.008660 -18.91 <2e-16 ***
## `Inflight entertainment` 1.240332 0.010247 121.04 <2e-16 ***
## Cleanliness 0.490031 0.008078 60.66 <2e-16 ***
## `Food and drink` -0.189073 0.009023 -20.95 <2e-16 ***
## `Checkin service` 0.273420 0.008146 33.56 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 142660 on 103589 degrees of freedom
## Residual deviance: 100789 on 103583 degrees of freedom
## AIC: 100803
##
## Number of Fisher Scoring iterations: 4
anova(log_model, test = "Chisq")
## Analysis of Deviance Table
##
## Model: binomial, link: logit
##
## Response: satisfaction
##
## Terms added sequentially (first to last)
##
##
## Df Deviance Resid. Df Resid. Dev Pr(>Chi)
## NULL 103589 142660
## `Online boarding` 1 12125.0 103588 130535 < 2.2e-16 ***
## `Departure Delay in Minutes` 1 543.1 103587 129992 < 2.2e-16 ***
## `Inflight entertainment` 1 22529.4 103586 107462 < 2.2e-16 ***
## Cleanliness 1 4996.5 103585 102466 < 2.2e-16 ***
## `Food and drink` 1 539.2 103584 101927 < 2.2e-16 ***
## `Checkin service` 1 1137.5 103583 100789 < 2.2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
exp(coef(log_model))
## (Intercept) `Online boarding`
## 1.2662082 1.5002878
## `Departure Delay in Minutes` `Inflight entertainment`
## 0.8489168 3.4567612
## Cleanliness `Food and drink`
## 1.6323663 0.8277261
## `Checkin service`
## 1.3144523
log_probs <- predict(log_model, newdata = test_data, type = "response")
log_preds <- ifelse(log_probs > 0.5, 1, 0)
confusionMatrix(factor(log_preds), factor(test_data$satisfaction), positive = "1")
## Confusion Matrix and Statistics
##
## Reference
## Prediction 0 1
## 0 8832 2344
## 1 2925 11796
##
## Accuracy : 0.7965
## 95% CI : (0.7916, 0.8014)
## No Information Rate : 0.546
## P-Value [Acc > NIR] : < 2.2e-16
##
## Kappa : 0.5879
##
## Mcnemar's Test P-Value : 1.346e-15
##
## Sensitivity : 0.8342
## Specificity : 0.7512
## Pos Pred Value : 0.8013
## Neg Pred Value : 0.7903
## Prevalence : 0.5460
## Detection Rate : 0.4555
## Detection Prevalence : 0.5684
## Balanced Accuracy : 0.7927
##
## 'Positive' Class : 1
##
LDA
library(MASS)
lda_model <- lda(model_formula, data = train_data)
lda_model$scaling
## LD1
## `Online boarding` 0.29819167
## `Departure Delay in Minutes` -0.09514001
## `Inflight entertainment` 0.94150690
## Cleanliness 0.35899294
## `Food and drink` -0.11689066
## `Checkin service` 0.20405678
lda_preds <- predict(lda_model, newdata = test_data)$class
confusionMatrix(factor(lda_preds), factor(test_data$satisfaction), positive = "1")
## Confusion Matrix and Statistics
##
## Reference
## Prediction 0 1
## 0 8768 2279
## 1 2989 11861
##
## Accuracy : 0.7966
## 95% CI : (0.7916, 0.8015)
## No Information Rate : 0.546
## P-Value [Acc > NIR] : < 2.2e-16
##
## Kappa : 0.5876
##
## Mcnemar's Test P-Value : < 2.2e-16
##
## Sensitivity : 0.8388
## Specificity : 0.7458
## Pos Pred Value : 0.7987
## Neg Pred Value : 0.7937
## Prevalence : 0.5460
## Detection Rate : 0.4580
## Detection Prevalence : 0.5734
## Balanced Accuracy : 0.7923
##
## 'Positive' Class : 1
##
lda_probs <- predict(lda_model, newdata = test_data)$posterior[, 2]
6. Evaluasi & Visualisasi
library(pROC)
roc_log <- roc(test_data$satisfaction, log_probs)
roc_lda <- roc(test_data$satisfaction, lda_probs)
plot(roc_log, col = "blue", lwd = 2, main = "ROC Curve Logistic vs LDA")
lines(roc_lda, col = "red", lwd = 2)
legend("bottomright",
legend = c(paste("Logistic AUC =", round(auc(roc_log), 3)),
paste("LDA AUC =", round(auc(roc_lda), 3))),
col = c("blue", "red"), lwd = 2)
