Dataset yang digunakan adalah **Airline Passenger Satisfaction Survey**, yang berisi hasil survei kepuasan penumpang maskapai penerbangan dengan total 103.904 observasi pada data training.

Keterangan Variabel:

Load Library

library(MASS)
## Warning: package 'MASS' was built under R version 4.5.3
library(dplyr)
## Warning: package 'dplyr' was built under R version 4.5.3
## 
## Attaching package: 'dplyr'
## The following object is masked from 'package:MASS':
## 
##     select
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
library(ggplot2)
library(car)
## Warning: package 'car' was built under R version 4.5.3
## Loading required package: carData
## Warning: package 'carData' was built under R version 4.5.3
## 
## Attaching package: 'car'
## The following object is masked from 'package:dplyr':
## 
##     recode
library(caret)
## Loading required package: lattice
library(brant)
## Warning: package 'brant' was built under R version 4.5.3
library(tidyr)
library(kableExtra)
## Warning: package 'kableExtra' was built under R version 4.5.3
## 
## Attaching package: 'kableExtra'
## The following object is masked from 'package:dplyr':
## 
##     group_rows

Load Data

# Read the training and test CSV files and drop every row that contains
# at least one missing value.
train <- na.omit(read.csv("train.csv"))
test  <- na.omit(read.csv("test.csv"))

# Derive the modelling variables so that train and test are encoded
# identically.
#
# Args:
#   df: data frame containing the columns Class, Customer.Type,
#       Type.of.Travel and satisfaction.
#
# Returns:
#   `df` with Class converted to an ordered factor
#   (Eco < Eco Plus < Business) and three numeric 0/1 indicator columns
#   added or overwritten: loyal, business_travel and satisfaction_bin.
#
# Raises:
#   An error naming the missing column(s) when a required column is absent.
prepare_data <- function(df) {
  required_cols <- c("Class", "Customer.Type", "Type.of.Travel", "satisfaction")
  missing_cols <- setdiff(required_cols, names(df))
  if (length(missing_cols) > 0) {
    # Fail early with a readable message instead of the cryptic
    # "replacement has 0 rows" error raised by assigning from a NULL column.
    stop(
      "prepare_data(): missing required column(s): ",
      paste(missing_cols, collapse = ", "),
      call. = FALSE
    )
  }

  df$Class <- factor(df$Class, levels = c("Eco", "Eco Plus", "Business"), ordered = TRUE)
  # ifelse() yields numeric 1/0 flags; values that are NA in the input stay NA.
  df$loyal <- ifelse(df$Customer.Type == "Loyal Customer", 1, 0)
  df$business_travel <- ifelse(df$Type.of.Travel == "Business travel", 1, 0)
  df$satisfaction_bin <- ifelse(df$satisfaction == "satisfied", 1, 0)
  df
}

# Apply the same variable preparation to both data sets.
train <- prepare_data(train)
test  <- prepare_data(test)

Exploratory Data Analysis (EDA)

Struktur Data

# Show the row/column counts plus each column's type and first values.
glimpse(train)
## Rows: 103,594
## Columns: 28
## $ X                                 <int> 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11…
## $ id                                <int> 70172, 5047, 110028, 24026, 119299, …
## $ Gender                            <chr> "Male", "Male", "Female", "Female", …
## $ Customer.Type                     <chr> "Loyal Customer", "disloyal Customer…
## $ Age                               <int> 13, 25, 26, 25, 61, 26, 47, 52, 41, …
## $ Type.of.Travel                    <chr> "Personal Travel", "Business travel"…
## $ Class                             <ord> Eco Plus, Business, Business, Busine…
## $ Flight.Distance                   <int> 460, 235, 1142, 562, 214, 1180, 1276…
## $ Inflight.wifi.service             <int> 3, 3, 2, 2, 3, 3, 2, 4, 1, 3, 4, 2, …
## $ Departure.Arrival.time.convenient <int> 4, 2, 2, 5, 3, 4, 4, 3, 2, 3, 5, 4, …
## $ Ease.of.Online.booking            <int> 3, 3, 2, 5, 3, 2, 2, 4, 2, 3, 5, 2, …
## $ Gate.location                     <int> 1, 3, 2, 5, 3, 1, 3, 4, 2, 4, 4, 2, …
## $ Food.and.drink                    <int> 5, 1, 5, 2, 4, 1, 2, 5, 4, 2, 2, 1, …
## $ Online.boarding                   <int> 3, 3, 5, 2, 5, 2, 2, 5, 3, 3, 5, 2, …
## $ Seat.comfort                      <int> 5, 1, 5, 2, 5, 1, 2, 5, 3, 3, 2, 1, …
## $ Inflight.entertainment            <int> 5, 1, 5, 2, 3, 1, 2, 5, 1, 2, 2, 1, …
## $ On.board.service                  <int> 4, 1, 4, 2, 3, 3, 3, 5, 1, 2, 3, 1, …
## $ Leg.room.service                  <int> 3, 5, 3, 5, 4, 4, 3, 5, 2, 3, 3, 2, …
## $ Baggage.handling                  <int> 4, 3, 4, 3, 4, 4, 4, 5, 1, 4, 5, 5, …
## $ Checkin.service                   <int> 4, 1, 4, 1, 3, 4, 3, 4, 4, 4, 3, 5, …
## $ Inflight.service                  <int> 5, 4, 4, 4, 3, 4, 5, 5, 1, 3, 5, 5, …
## $ Cleanliness                       <int> 5, 1, 5, 2, 3, 1, 2, 4, 2, 2, 2, 1, …
## $ Departure.Delay.in.Minutes        <int> 25, 1, 0, 11, 0, 0, 9, 4, 0, 0, 0, 0…
## $ Arrival.Delay.in.Minutes          <dbl> 18, 6, 0, 9, 0, 0, 23, 0, 0, 0, 0, 0…
## $ satisfaction                      <chr> "neutral or dissatisfied", "neutral …
## $ loyal                             <dbl> 1, 0, 1, 1, 1, 1, 1, 1, 1, 0, 0, 1, …
## $ business_travel                   <dbl> 0, 1, 1, 1, 1, 0, 0, 1, 1, 1, 1, 0, …
## $ satisfaction_bin                  <dbl> 0, 0, 1, 0, 1, 0, 0, 1, 0, 0, 0, 0, …

Missing Values

# Count the missing values in each column of the training data.
colSums(is.na(train))
##                                 X                                id 
##                                 0                                 0 
##                            Gender                     Customer.Type 
##                                 0                                 0 
##                               Age                    Type.of.Travel 
##                                 0                                 0 
##                             Class                   Flight.Distance 
##                                 0                                 0 
##             Inflight.wifi.service Departure.Arrival.time.convenient 
##                                 0                                 0 
##            Ease.of.Online.booking                     Gate.location 
##                                 0                                 0 
##                    Food.and.drink                   Online.boarding 
##                                 0                                 0 
##                      Seat.comfort            Inflight.entertainment 
##                                 0                                 0 
##                  On.board.service                  Leg.room.service 
##                                 0                                 0 
##                  Baggage.handling                   Checkin.service 
##                                 0                                 0 
##                  Inflight.service                       Cleanliness 
##                                 0                                 0 
##        Departure.Delay.in.Minutes          Arrival.Delay.in.Minutes 
##                                 0                                 0 
##                      satisfaction                             loyal 
##                                 0                                 0 
##                   business_travel                  satisfaction_bin 
##                                 0                                 0

Distribusi Target

# Absolute frequency of each travel class (the model's target variable).
table(train$Class)
## 
##      Eco Eco Plus Business 
##    46593     7468    49533
# Relative frequency (share of all rows) of each travel class.
prop.table(table(train$Class))
## 
##        Eco   Eco Plus   Business 
## 0.44976543 0.07208912 0.47814545
# Bar chart of the number of passengers in each travel class.
ggplot(data = train, mapping = aes(x = Class, fill = Class)) +
  geom_bar() +
  theme_minimal()

Kepuasan Penumpang per Kelas

# Share of satisfied vs. neutral/dissatisfied passengers within each class.
# `breaks` is listed explicitly so that every legend label is paired with its
# own category: unnamed `labels` are matched to the breaks by position, and
# without `breaks` that order comes from the data, not from the order in which
# `values` is written, which attaches the labels to the wrong categories.
ggplot(train, aes(x = Class, fill = satisfaction)) +
  geom_bar(position = "fill", color = "white") +
  scale_fill_manual(values = c("satisfied"               = "#00b894",
                               "neutral or dissatisfied" = "#d63031"),
                    breaks = c("satisfied", "neutral or dissatisfied"),
                    labels = c("Satisfied", "Neutral/Dissatisfied")) +
  scale_y_continuous(labels = scales::percent) +
  labs(title = "Proporsi Kepuasan Penumpang per Kelas",
        x = "Kelas", y = "Proporsi", fill = "Kepuasan") +
  theme_minimal()

Distribusi Usia per Kelas

# Age distribution of passengers within each travel class.
# The outline colour also applies to the whiskers and outlier points, so it
# must contrast with the white background of theme_minimal(); a white outline
# would make them invisible.
ggplot(train, aes(x = Class, y = Age, fill = Class)) +
  geom_boxplot(color = "grey40", alpha = 0.85) +
  scale_fill_manual(values = c("Eco" = "#74b9ff",
                               "Eco Plus" = "#0984e3",
                               "Business" = "#2d3436")) +
  labs(title = "Distribusi Usia Penumpang per Kelas",
        x = "Kelas", y = "Usia (Tahun)") +
  theme_minimal() +
  # The x axis already names the classes, so the fill legend is redundant.
  theme(legend.position = "none")

Jarak Penerbangan per Kelas

# Flight-distance distribution within each travel class.
# The outline colour also applies to the whiskers and outlier points, so it
# must contrast with the white background of theme_minimal(); a white outline
# would make them invisible.
ggplot(train, aes(x = Class, y = Flight.Distance, fill = Class)) +
  geom_boxplot(color = "grey40", alpha = 0.85) +
  scale_fill_manual(values = c("Eco" = "#74b9ff",
                               "Eco Plus" = "#0984e3",
                               "Business" = "#2d3436")) +
  labs(title = "Distribusi Jarak Penerbangan per Kelas",
        x = "Kelas", y = "Jarak (mil)") +
  theme_minimal() +
  # The x axis already names the classes, so the fill legend is redundant.
  theme(legend.position = "none")

Preprocessing

# Drop rows with a missing arrival delay. Rows containing any NA were already
# removed with na.omit() when the data was loaded, so this acts as a safeguard.
train <- train %>% filter(!is.na(Arrival.Delay.in.Minutes))
test  <- test  %>% filter(!is.na(Arrival.Delay.in.Minutes))

# Encode the ordered target (Eco < Eco Plus < Business) and the 0/1 indicators
# satisfaction_bin, loyal and business_travel through the shared helper, so the
# encoding rules live in one place and stay identical for train and test.
# prepare_data() is idempotent: re-applying it to already prepared data
# reproduces the same columns.
train <- prepare_data(train)
test  <- prepare_data(test)

Pemilihan Variabel

# Names of the predictor columns used for the VIF check and the ordinal
# logistic regression; the order here is the order of the model terms.
predictors <- c(
  # passenger and trip characteristics
  "Age", "Flight.Distance", "business_travel", "loyal",
  # service ratings
  "Inflight.wifi.service", "Seat.comfort", "Inflight.entertainment",
  "On.board.service", "Food.and.drink", "Cleanliness",
  # delay and overall satisfaction flag
  "Departure.Delay.in.Minutes", "satisfaction_bin"
)

Uji Asumsi

Uji Multikolinearitas (VIF)

# Build `as.numeric(Class) ~ <all predictors>` for an auxiliary linear model.
# The ordered target is cast to numeric only so that lm() accepts it as a
# response; the model exists solely to feed vif().
formula_vif <- reformulate(predictors, response = "as.numeric(Class)")

model_vif <- lm(formula = formula_vif, data = train)
vif(model_vif)
##                        Age            Flight.Distance 
##                   1.131370                   1.237154 
##            business_travel                      loyal 
##                   1.767073                   1.554721 
##      Inflight.wifi.service               Seat.comfort 
##                   1.115294                   2.182682 
##     Inflight.entertainment           On.board.service 
##                   3.037469                   1.451024 
##             Food.and.drink                Cleanliness 
##                   2.100972                   2.739152 
## Departure.Delay.in.Minutes           satisfaction_bin 
##                   1.005301                   1.910842

Uji Outlier

# Draw the three numeric predictors side by side to inspect outliers.
# par() returns the previous settings, which are kept for restoring afterwards.
old_par <- par(mfrow = c(1, 3))
boxplot(train$Age, main = "Age", col = "#74b9ff")
boxplot(train$Flight.Distance, main = "Flight Distance", col = "#fd79a8")
boxplot(train$Departure.Delay.in.Minutes, main = "Departure Delay", col = "#55efc4")

# Restore the caller's graphics layout instead of assuming it was 1 x 1.
par(old_par)

Model OLR

# Build `Class ~ <all predictors>` from the predictor name vector.
formula_olr <- as.formula(paste("Class ~",
                                paste(predictors, collapse = " + ")))

# NOTE(review): no random step is visible in this fit; the seed is presumably
# kept for reproducibility of the session as a whole -- confirm.
set.seed(42)
# Ordinal (proportional-odds) logistic regression of the ordered Class target.
# Hess = TRUE keeps the Hessian so that summary() can report standard errors.
model_olr <- polr(formula_olr,
                  data   = train,
                  Hess   = TRUE,
                  method = "logistic")

# Coefficients, cut-point intercepts, residual deviance and AIC.
summary(model_olr)
## Call:
## polr(formula = formula_olr, data = train, Hess = TRUE, method = "logistic")
## 
## Coefficients:
##                                 Value Std. Error  t value
## Age                         5.761e-03  5.517e-04  10.4421
## Flight.Distance             9.309e-04  9.488e-06  98.1068
## business_travel             2.452e+00  2.131e-02 115.0529
## loyal                       7.771e-01  2.173e-02  35.7672
## Inflight.wifi.service      -2.195e-01  6.551e-03 -33.5070
## Seat.comfort                2.309e-01  8.904e-03  25.9269
## Inflight.entertainment     -1.632e-01  1.062e-02 -15.3679
## On.board.service            3.099e-01  7.092e-03  43.7001
## Food.and.drink             -1.958e-02  9.075e-03  -2.1579
## Cleanliness                 9.709e-03  1.044e-02   0.9299
## Departure.Delay.in.Minutes -7.974e-05  2.051e-04  -0.3887
## satisfaction_bin            1.083e+00  2.031e-02  53.3286
## 
## Intercepts:
##                   Value    Std. Error t value 
## Eco|Eco Plus        4.2749   0.0454    94.0837
## Eco Plus|Business   4.8078   0.0461   104.2861
## 
## Residual Deviance: 125311.50 
## AIC: 125339.50

Uji Signifikansi Parameter

Uji Parsial (Wald Test)

# Two-sided Wald p-values: the "t value" column is referred to the standard
# normal distribution, and the upper-tail probability of |t| is doubled.
coef_table <- coef(summary(model_olr))
p_values <- 2 * pnorm(abs(coef_table[, "t value"]), lower.tail = FALSE)

# Append the p-values as an extra column of the coefficient table.
cbind(coef_table, `p value` = p_values)
##                                    Value   Std. Error     t value       p value
## Age                         5.761047e-03 5.517139e-04  10.4420924  1.592611e-25
## Flight.Distance             9.308542e-04 9.488177e-06  98.1067540  0.000000e+00
## business_travel             2.452213e+00 2.131378e-02 115.0529349  0.000000e+00
## loyal                       7.770760e-01 2.172591e-02  35.7672458 3.569002e-280
## Inflight.wifi.service      -2.195177e-01 6.551391e-03 -33.5070319 3.807240e-246
## Seat.comfort                2.308573e-01 8.904148e-03  25.9269346 3.310397e-148
## Inflight.entertainment     -1.632444e-01 1.062244e-02 -15.3678821  2.688136e-53
## On.board.service            3.099054e-01 7.091647e-03  43.7000575  0.000000e+00
## Food.and.drink             -1.958241e-02 9.074801e-03  -2.1578891  3.093646e-02
## Cleanliness                 9.709493e-03 1.044153e-02   0.9298921  3.524270e-01
## Departure.Delay.in.Minutes -7.974476e-05 2.051421e-04  -0.3887295  6.974763e-01
## satisfaction_bin            1.082958e+00 2.030728e-02  53.3285551  0.000000e+00
## Eco|Eco Plus                4.274861e+00 4.543678e-02  94.0837214  0.000000e+00
## Eco Plus|Business           4.807775e+00 4.610178e-02 104.2861078  0.000000e+00

Uji Serentak

# Simultaneous test: compare the intercept-only model with the full model
# through a likelihood-ratio test.
model_null <- polr(Class ~ 1, data = train, Hess = TRUE)
lrt        <- anova(model_null, model_olr)
print(lrt)
## Likelihood ratio tests of ordinal regression models
## 
## Response: Class
##                                                                                                                                                                                                               Model
## 1                                                                                                                                                                                                                 1
## 2 Age + Flight.Distance + business_travel + loyal + Inflight.wifi.service + Seat.comfort + Inflight.entertainment + On.board.service + Food.and.drink + Cleanliness + Departure.Delay.in.Minutes + satisfaction_bin
##   Resid. df Resid. Dev   Test    Df LR stat. Pr(Chi)
## 1    103592   186832.7                              
## 2    103580   125311.5 1 vs 2    12 61521.18       0

Odds Ratio

# Exponentiate the predictor coefficients to express them as odds ratios.
exp(coef(model_olr))
##                        Age            Flight.Distance 
##                  1.0057777                  1.0009313 
##            business_travel                      loyal 
##                 11.6140149                  2.1751029 
##      Inflight.wifi.service               Seat.comfort 
##                  0.8029060                  1.2596794 
##     Inflight.entertainment           On.board.service 
##                  0.8493836                  1.3632961 
##             Food.and.drink                Cleanliness 
##                  0.9806081                  1.0097568 
## Departure.Delay.in.Minutes           satisfaction_bin 
##                  0.9999203                  2.9534026

Uji Proportional Odds (Brant Test)

# Brant test of the proportional-odds (parallel regression) assumption,
# reported for the whole model (omnibus) and for each predictor.
brant(model_olr)
## ------------------------------------------------------------ 
## Test for         X2  df  probability 
## ------------------------------------------------------------ 
## Omnibus              5465.91 12  0
## Age              98.26   1   0
## Flight.Distance      1856.28 1   0
## business_travel      1053.65 1   0
## loyal                1192.44 1   0
## Inflight.wifi.service        261.52  1   0
## Seat.comfort         173.63  1   0
## Inflight.entertainment   57.69   1   0
## On.board.service     389.82  1   0
## Food.and.drink           0.61    1   0.43
## Cleanliness          3.05    1   0.08
## Departure.Delay.in.Minutes   0   1   1
## satisfaction_bin     374.21  1   0
## ------------------------------------------------------------ 
## 
## H0: Parallel Regression Assumption holds

Prediksi dan Evaluasi Model

Prediksi Probabilitas

# Predicted class probabilities and the predicted class for the test set.
pred_prob  <- predict(model_olr, newdata = test, type = "probs")
pred_class <- predict(model_olr, newdata = test, type = "class")

# Preview the first rows next to the actual class. head() is used instead of
# a hard-coded 1:8 index so the preview still works (with fewer rows) when the
# test set has fewer than `n_preview` observations.
n_preview <- 8
prob_preview <- data.frame(
  Umur     = head(test$Age, n_preview),
  Jarak    = head(test$Flight.Distance, n_preview),
  Puas     = head(test$satisfaction, n_preview),
  round(head(pred_prob, n_preview), 4),
  Prediksi = head(pred_class, n_preview),
  Aktual   = head(test$Class, n_preview)
)

prob_preview
##   Umur Jarak                    Puas    Eco Eco.Plus Business Prediksi   Aktual
## 1   52   160               satisfied 0.3100   0.1236   0.5664 Business      Eco
## 2   36  2863               satisfied 0.0123   0.0085   0.9791 Business Business
## 3   20   192 neutral or dissatisfied 0.6485   0.1102   0.2413      Eco      Eco
## 4   44  3377               satisfied 0.0112   0.0077   0.9811 Business Business
## 5   49  1182               satisfied 0.1549   0.0831   0.7620 Business      Eco
## 6   16   311               satisfied 0.3043   0.1227   0.5730 Business      Eco
## 7   77  3987               satisfied 0.0070   0.0049   0.9881 Business Business
## 8   43  2556               satisfied 0.0196   0.0134   0.9670 Business Business

Confusion Matrix

# Cross-tabulate predicted against actual classes on the test set.
cm <- confusionMatrix(pred_class, test$Class)

# Long-format counts with descriptive column names for plotting.
cm_df <- setNames(
  as.data.frame(cm$table),
  c("Prediksi", "Aktual", "Frekuensi")
)

# Heat map of the confusion matrix with the cell counts printed on the tiles.
ggplot(cm_df, aes(x = Aktual, y = Prediksi, fill = Frekuensi)) +
  geom_tile(color = "white") +
  geom_text(aes(label = Frekuensi), size = 5, fontface = "bold") +
  scale_fill_gradient(low = "#dfe6e9", high = "#0984e3") +
  labs(
    title    = "Confusion Matrix — Prediksi vs Aktual",
    subtitle = "Data Testing",
    x        = "Kelas Aktual",
    y        = "Kelas Prediksi"
  ) +
  theme_minimal()

Akurasi Model

# Accuracy: share of test rows whose predicted class equals the actual class.
# Both sides are compared as character strings.
akurasi <- mean(as.character(pred_class) == as.character(test$Class))

# Report the accuracy as a percentage rounded to two decimals.
cat("Akurasi Model:", round(akurasi * 100, 2), "%\n")
## Akurasi Model: 75.89 %

Visualisasi Hasil Model

# Scenario grid for the effect plot: flight distance varies over its observed
# range in the training data (200 equally spaced points) while every other
# predictor is held fixed -- mean age, business traveller, loyal customer,
# all service ratings at 3, no departure delay, satisfied.
new_data_fd <- data.frame(
  Age                        = mean(train$Age),
  Flight.Distance            = seq(min(train$Flight.Distance),
                                   max(train$Flight.Distance), length.out = 200),
  business_travel            = 1,
  loyal                      = 1,
  Inflight.wifi.service      = 3,
  Seat.comfort               = 3,
  Inflight.entertainment     = 3,
  On.board.service           = 3,
  Food.and.drink             = 3,
  Cleanliness                = 3,
  Departure.Delay.in.Minutes = 0,
  satisfaction_bin           = 1
)

# Predicted class probabilities for every row of the scenario grid, with the
# flight distance attached for plotting.
prob_fd    <- predict(model_olr, newdata = new_data_fd, type = "probs")
prob_fd_df <- as.data.frame(prob_fd)
prob_fd_df$Flight.Distance <- new_data_fd$Flight.Distance

# Reshape to one row per (distance, class) pair and fix the class order so the
# legend follows Eco, Eco Plus, Business.
prob_long <- prob_fd_df %>%
  pivot_longer(
    cols      = c("Eco", "Eco Plus", "Business"),
    names_to  = "Class",
    values_to = "Probability"
  ) %>%
  mutate(Class = factor(Class, levels = c("Eco", "Eco Plus", "Business")))

# Predicted probability of each class as a function of flight distance.
# `linewidth` replaces `size`, which is deprecated for lines since
# ggplot2 3.4.0 and triggers a deprecation warning.
ggplot(prob_long, aes(x = Flight.Distance, y = Probability, color = Class)) +
  geom_line(linewidth = 1.3) +
  scale_color_manual(values = c("Eco" = "#74b9ff",
                                "Eco Plus" = "#0984e3",
                                "Business" = "#2d3436")) +
  labs(title    = "Prediksi Probabilitas Kelas vs Jarak Penerbangan",
       subtitle = "Kondisi: Business traveler, loyal, semua rating layanan = 3, satisfied",
       x = "Jarak Penerbangan (mil)", y = "Probabilitas", color = "Kelas") +
  theme_minimal()
## Warning: Using `size` aesthetic for lines was deprecated in ggplot2 3.4.0.
## ℹ Please use `linewidth` instead.
## This warning is displayed once per session.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.


Kesimpulan