1. Load Dataset

library(readxl)
df <- read_excel("satisfaction.xlsx", sheet = "satisfaction_v2")
head(df)
## # A tibble: 6 × 24
##       id satisfaction_v2 Gender `Customer Type`   Age `Type of Travel` Class   
##    <dbl> <chr>           <chr>  <chr>           <dbl> <chr>            <chr>   
## 1  11112 satisfied       Female Loyal Customer     65 Personal Travel  Eco     
## 2 110278 satisfied       Male   Loyal Customer     47 Personal Travel  Business
## 3 103199 satisfied       Female Loyal Customer     15 Personal Travel  Eco     
## 4  47462 satisfied       Female Loyal Customer     60 Personal Travel  Eco     
## 5 120011 satisfied       Female Loyal Customer     70 Personal Travel  Eco     
## 6 100744 satisfied       Male   Loyal Customer     30 Personal Travel  Eco     
## # ℹ 17 more variables: `Flight Distance` <dbl>, `Seat comfort` <dbl>,
## #   `Departure/Arrival time convenient` <dbl>, `Food and drink` <dbl>,
## #   `Gate location` <dbl>, `Inflight wifi service` <dbl>,
## #   `Inflight entertainment` <dbl>, `Online support` <dbl>,
## #   `Ease of Online booking` <dbl>, `On-board service` <dbl>,
## #   `Leg room service` <dbl>, `Baggage handling` <dbl>,
## #   `Checkin service` <dbl>, Cleanliness <dbl>, `Online boarding` <dbl>, …

2. Preprocessing

df <- na.omit(df)
df$satisfaction <- ifelse(df$satisfaction_v2 == "satisfied", 1, 0)

# Label encoding
df$Gender <- as.numeric(factor(df$Gender))
df$Customer.Type <- as.numeric(factor(df$`Customer Type`))
df$Type.of.Travel <- as.numeric(factor(df$`Type of Travel`))
df$Class <- as.numeric(factor(df$Class))

# Normalisasi
num_cols <- c("Flight Distance", "Seat comfort", "Departure/Arrival time convenient",
              "Food and drink", "Inflight wifi service", "Inflight entertainment",
              "Online support", "Ease of Online booking", "On-board service",
              "Leg room service", "Baggage handling", "Checkin service",
              "Cleanliness", "Online boarding", "Departure Delay in Minutes",
              "Arrival Delay in Minutes")

df[num_cols] <- scale(df[num_cols])

3. Eksplorasi Data

library(ggplot2)
ggplot(df, aes(x = Age)) +
  geom_histogram(bins = 30, fill = "steelblue", color = "black") +
  labs(title = "Distribusi Usia Penumpang")

library(tidyr)
library(dplyr)
num_data <- df[, num_cols]
long_data <- pivot_longer(num_data, cols = everything(), names_to = "Feature", values_to = "Value")

ggplot(long_data, aes(x = Value)) +
  geom_histogram(bins = 30, fill = "steelblue", color = "black") +
  facet_wrap(~ Feature, scales = "free", ncol = 4) +
  theme_minimal()

4. Feature Selection (EFA)

library(psych)
efa_data <- df[, num_cols]
KMO(efa_data)
## Kaiser-Meyer-Olkin factor adequacy
## Call: KMO(r = efa_data)
## Overall MSA =  0.74
## MSA for each item = 
##                   Flight Distance                      Seat comfort 
##                              0.71                              0.71 
## Departure/Arrival time convenient                    Food and drink 
##                              0.69                              0.65 
##             Inflight wifi service            Inflight entertainment 
##                              0.84                              0.78 
##                    Online support            Ease of Online booking 
##                              0.84                              0.77 
##                  On-board service                  Leg room service 
##                              0.85                              0.89 
##                  Baggage handling                   Checkin service 
##                              0.82                              0.73 
##                       Cleanliness                   Online boarding 
##                              0.80                              0.82 
##        Departure Delay in Minutes          Arrival Delay in Minutes 
##                              0.51                              0.51
fa.parallel(efa_data, fa = "fa")

## Parallel analysis suggests that the number of factors =  6  and the number of components =  NA
efa_result <- fa(efa_data, nfactors = 6, rotate = "varimax", fm = "ml")
print(efa_result, cut = 0.3)
## Factor Analysis using method =  ml
## Call: fa(r = efa_data, nfactors = 6, rotate = "varimax", fm = "ml")
## Standardized loadings (pattern matrix) based upon correlation matrix
##                                     ML1   ML5   ML2   ML4   ML3   ML6    h2
## Flight Distance                                                       0.017
## Seat comfort                                         0.76             0.660
## Departure/Arrival time convenient                    0.62             0.404
## Food and drink                                       0.89             0.825
## Inflight wifi service              0.75                               0.560
## Inflight entertainment             0.31                    0.86       0.932
## Online support                     0.75                               0.634
## Ease of Online booking             0.80  0.52                         0.985
## On-board service                         0.70                         0.508
## Leg room service                         0.54                         0.308
## Baggage handling                         0.76                         0.599
## Checkin service                                                  0.42 0.270
## Cleanliness                              0.79                         0.635
## Online boarding                    0.84                               0.731
## Departure Delay in Minutes                     0.98                   0.967
## Arrival Delay in Minutes                       0.98                   0.964
##                                      u2 com
## Flight Distance                   0.983 1.4
## Seat comfort                      0.340 1.3
## Departure/Arrival time convenient 0.596 1.1
## Food and drink                    0.175 1.1
## Inflight wifi service             0.440 1.0
## Inflight entertainment            0.068 1.6
## Online support                    0.366 1.3
## Ease of Online booking            0.015 2.0
## On-board service                  0.492 1.1
## Leg room service                  0.692 1.1
## Baggage handling                  0.401 1.1
## Checkin service                   0.730 2.1
## Cleanliness                       0.365 1.1
## Online boarding                   0.269 1.1
## Departure Delay in Minutes        0.033 1.0
## Arrival Delay in Minutes          0.036 1.0
## 
##                        ML1  ML5  ML2  ML4  ML3  ML6
## SS loadings           2.61 2.36 1.93 1.82 0.91 0.36
## Proportion Var        0.16 0.15 0.12 0.11 0.06 0.02
## Cumulative Var        0.16 0.31 0.43 0.55 0.60 0.62
## Proportion Explained  0.26 0.24 0.19 0.18 0.09 0.04
## Cumulative Proportion 0.26 0.50 0.69 0.87 0.96 1.00
## 
## Mean item complexity =  1.3
## Test of the hypothesis that 6 factors are sufficient.
## 
## df null model =  120  with the objective function =  8.24 with Chi Square =  1066670
## df of  the model are 39  and the objective function was  0.03 
## 
## The root mean square of the residuals (RMSR) is  0.01 
## The df corrected root mean square of the residuals is  0.02 
## 
## The harmonic n.obs is  129487 with the empirical chi square  2500.04  with prob <  0 
## The total n.obs was  129487  with Likelihood Chi Square =  4228.36  with prob <  0 
## 
## Tucker Lewis Index of factoring reliability =  0.988
## RMSEA index =  0.029  and the 90 % confidence intervals are  0.028 0.03
## BIC =  3769.27
## Fit based upon off diagonal values = 1
## Measures of factor score adequacy             
##                                                    ML1  ML5  ML2  ML4  ML3  ML6
## Correlation of (regression) scores with factors   0.95 0.93 0.99 0.93 0.93 0.76
## Multiple R square of scores with factors          0.91 0.86 0.98 0.86 0.87 0.58
## Minimum correlation of possible factor scores     0.82 0.71 0.95 0.72 0.74 0.17

5. Modeling

library(caret)
selected_vars <- c("Online boarding", "Departure Delay in Minutes", "Inflight entertainment",
                   "Cleanliness", "Food and drink", "Checkin service")
model_formula <- as.formula(
  paste("satisfaction ~", paste(sprintf("`%s`", selected_vars), collapse = " + "))
)

set.seed(123)
train_index <- createDataPartition(df$satisfaction, p = 0.8, list = FALSE)
train_data <- df[train_index, ]
test_data <- df[-train_index, ]

Logistic Regression

log_model <- glm(model_formula, data = train_data, family = binomial)
summary(log_model)
## 
## Call:
## glm(formula = model_formula, family = binomial, data = train_data)
## 
## Coefficients:
##                               Estimate Std. Error z value Pr(>|z|)    
## (Intercept)                   0.236027   0.007825   30.16   <2e-16 ***
## `Online boarding`             0.405657   0.008310   48.81   <2e-16 ***
## `Departure Delay in Minutes` -0.163794   0.008660  -18.91   <2e-16 ***
## `Inflight entertainment`      1.240332   0.010247  121.04   <2e-16 ***
## Cleanliness                   0.490031   0.008078   60.66   <2e-16 ***
## `Food and drink`             -0.189073   0.009023  -20.95   <2e-16 ***
## `Checkin service`             0.273420   0.008146   33.56   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 142660  on 103589  degrees of freedom
## Residual deviance: 100789  on 103583  degrees of freedom
## AIC: 100803
## 
## Number of Fisher Scoring iterations: 4
anova(log_model, test = "Chisq")
## Analysis of Deviance Table
## 
## Model: binomial, link: logit
## 
## Response: satisfaction
## 
## Terms added sequentially (first to last)
## 
## 
##                              Df Deviance Resid. Df Resid. Dev  Pr(>Chi)    
## NULL                                        103589     142660              
## `Online boarding`             1  12125.0    103588     130535 < 2.2e-16 ***
## `Departure Delay in Minutes`  1    543.1    103587     129992 < 2.2e-16 ***
## `Inflight entertainment`      1  22529.4    103586     107462 < 2.2e-16 ***
## Cleanliness                   1   4996.5    103585     102466 < 2.2e-16 ***
## `Food and drink`              1    539.2    103584     101927 < 2.2e-16 ***
## `Checkin service`             1   1137.5    103583     100789 < 2.2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
exp(coef(log_model))
##                  (Intercept)            `Online boarding` 
##                    1.2662082                    1.5002878 
## `Departure Delay in Minutes`     `Inflight entertainment` 
##                    0.8489168                    3.4567612 
##                  Cleanliness             `Food and drink` 
##                    1.6323663                    0.8277261 
##            `Checkin service` 
##                    1.3144523
log_probs <- predict(log_model, newdata = test_data, type = "response")
log_preds <- ifelse(log_probs > 0.5, 1, 0)
confusionMatrix(factor(log_preds), factor(test_data$satisfaction), positive = "1")
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction     0     1
##          0  8832  2344
##          1  2925 11796
##                                           
##                Accuracy : 0.7965          
##                  95% CI : (0.7916, 0.8014)
##     No Information Rate : 0.546           
##     P-Value [Acc > NIR] : < 2.2e-16       
##                                           
##                   Kappa : 0.5879          
##                                           
##  Mcnemar's Test P-Value : 1.346e-15       
##                                           
##             Sensitivity : 0.8342          
##             Specificity : 0.7512          
##          Pos Pred Value : 0.8013          
##          Neg Pred Value : 0.7903          
##              Prevalence : 0.5460          
##          Detection Rate : 0.4555          
##    Detection Prevalence : 0.5684          
##       Balanced Accuracy : 0.7927          
##                                           
##        'Positive' Class : 1               
## 

LDA

library(MASS)
lda_model <- lda(model_formula, data = train_data)
lda_model$scaling
##                                      LD1
## `Online boarding`             0.29819167
## `Departure Delay in Minutes` -0.09514001
## `Inflight entertainment`      0.94150690
## Cleanliness                   0.35899294
## `Food and drink`             -0.11689066
## `Checkin service`             0.20405678
lda_preds <- predict(lda_model, newdata = test_data)$class
confusionMatrix(factor(lda_preds), factor(test_data$satisfaction), positive = "1")
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction     0     1
##          0  8768  2279
##          1  2989 11861
##                                           
##                Accuracy : 0.7966          
##                  95% CI : (0.7916, 0.8015)
##     No Information Rate : 0.546           
##     P-Value [Acc > NIR] : < 2.2e-16       
##                                           
##                   Kappa : 0.5876          
##                                           
##  Mcnemar's Test P-Value : < 2.2e-16       
##                                           
##             Sensitivity : 0.8388          
##             Specificity : 0.7458          
##          Pos Pred Value : 0.7987          
##          Neg Pred Value : 0.7937          
##              Prevalence : 0.5460          
##          Detection Rate : 0.4580          
##    Detection Prevalence : 0.5734          
##       Balanced Accuracy : 0.7923          
##                                           
##        'Positive' Class : 1               
## 
lda_probs <- predict(lda_model, newdata = test_data)$posterior[, 2]

6. Evaluasi & Visualisasi

library(pROC)
roc_log <- roc(test_data$satisfaction, log_probs)
roc_lda <- roc(test_data$satisfaction, lda_probs)

plot(roc_log, col = "blue", lwd = 2, main = "ROC Curve Logistic vs LDA")
lines(roc_lda, col = "red", lwd = 2)
legend("bottomright",
       legend = c(paste("Logistic AUC =", round(auc(roc_log), 3)),
                  paste("LDA AUC =", round(auc(roc_lda), 3))),
       col = c("blue", "red"), lwd = 2)