Dataset yang digunakan adalah Airline Passenger Satisfaction Survey, berisi hasil survei kepuasan penumpang maskapai penerbangan dengan total 103.904 observasi pada data training.
Keterangan Variabel:
Variabel Target (Y): Class sebagai indikator kelas penerbangan penumpang. Terdiri dari tiga kategori berurutan yaitu Eco < Eco Plus < Business.
Variabel Prediktor (X): Variabel yang digunakan meliputi faktor demografi, karakteristik perjalanan, layanan, dan operasional, yaitu: Age, Flight Distance, business_travel, loyal, Inflight wifi service, Seat comfort, Inflight entertainment, On-board service, Food and drink, Cleanliness, Departure Delay, satisfaction_bin.
library(MASS)
## Warning: package 'MASS' was built under R version 4.5.3
library(dplyr)
## Warning: package 'dplyr' was built under R version 4.5.3
##
## Attaching package: 'dplyr'
## The following object is masked from 'package:MASS':
##
## select
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(ggplot2)
library(car)
## Warning: package 'car' was built under R version 4.5.3
## Loading required package: carData
## Warning: package 'carData' was built under R version 4.5.3
##
## Attaching package: 'car'
## The following object is masked from 'package:dplyr':
##
## recode
library(caret)
## Loading required package: lattice
library(brant)
## Warning: package 'brant' was built under R version 4.5.3
library(tidyr)
library(kableExtra)
## Warning: package 'kableExtra' was built under R version 4.5.3
##
## Attaching package: 'kableExtra'
## The following object is masked from 'package:dplyr':
##
## group_rows
# Load the training and testing sets; na.omit() drops every row that
# contains a missing value.
train <- na.omit(read.csv("train.csv"))
test <- na.omit(read.csv("test.csv"))
# Derive the modelling variables in one place so that the training and
# testing sets are encoded identically.
#
# df: data frame containing the columns Class, Customer.Type,
#     Type.of.Travel and satisfaction.
# Returns df with Class converted to an ordered factor
# (Eco < Eco Plus < Business) and the 0/1 indicator columns loyal,
# business_travel and satisfaction_bin set.
prepare_data <- function(df) {
  # 1 where the column equals `value`, 0 otherwise.
  as_flag <- function(column, value) ifelse(column == value, 1, 0)

  class_order <- c("Eco", "Eco Plus", "Business")
  df$Class <- factor(df$Class, levels = class_order, ordered = TRUE)

  df$loyal <- as_flag(df$Customer.Type, "Loyal Customer")
  df$business_travel <- as_flag(df$Type.of.Travel, "Business travel")
  df$satisfaction_bin <- as_flag(df$satisfaction, "satisfied")

  df
}
# Apply the shared encoding to both data sets, then inspect the structure of
# the training set.
train <- train %>% prepare_data()
test <- test %>% prepare_data()
glimpse(train)
## Rows: 103,594
## Columns: 28
## $ X <int> 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11…
## $ id <int> 70172, 5047, 110028, 24026, 119299, …
## $ Gender <chr> "Male", "Male", "Female", "Female", …
## $ Customer.Type <chr> "Loyal Customer", "disloyal Customer…
## $ Age <int> 13, 25, 26, 25, 61, 26, 47, 52, 41, …
## $ Type.of.Travel <chr> "Personal Travel", "Business travel"…
## $ Class <ord> Eco Plus, Business, Business, Busine…
## $ Flight.Distance <int> 460, 235, 1142, 562, 214, 1180, 1276…
## $ Inflight.wifi.service <int> 3, 3, 2, 2, 3, 3, 2, 4, 1, 3, 4, 2, …
## $ Departure.Arrival.time.convenient <int> 4, 2, 2, 5, 3, 4, 4, 3, 2, 3, 5, 4, …
## $ Ease.of.Online.booking <int> 3, 3, 2, 5, 3, 2, 2, 4, 2, 3, 5, 2, …
## $ Gate.location <int> 1, 3, 2, 5, 3, 1, 3, 4, 2, 4, 4, 2, …
## $ Food.and.drink <int> 5, 1, 5, 2, 4, 1, 2, 5, 4, 2, 2, 1, …
## $ Online.boarding <int> 3, 3, 5, 2, 5, 2, 2, 5, 3, 3, 5, 2, …
## $ Seat.comfort <int> 5, 1, 5, 2, 5, 1, 2, 5, 3, 3, 2, 1, …
## $ Inflight.entertainment <int> 5, 1, 5, 2, 3, 1, 2, 5, 1, 2, 2, 1, …
## $ On.board.service <int> 4, 1, 4, 2, 3, 3, 3, 5, 1, 2, 3, 1, …
## $ Leg.room.service <int> 3, 5, 3, 5, 4, 4, 3, 5, 2, 3, 3, 2, …
## $ Baggage.handling <int> 4, 3, 4, 3, 4, 4, 4, 5, 1, 4, 5, 5, …
## $ Checkin.service <int> 4, 1, 4, 1, 3, 4, 3, 4, 4, 4, 3, 5, …
## $ Inflight.service <int> 5, 4, 4, 4, 3, 4, 5, 5, 1, 3, 5, 5, …
## $ Cleanliness <int> 5, 1, 5, 2, 3, 1, 2, 4, 2, 2, 2, 1, …
## $ Departure.Delay.in.Minutes <int> 25, 1, 0, 11, 0, 0, 9, 4, 0, 0, 0, 0…
## $ Arrival.Delay.in.Minutes <dbl> 18, 6, 0, 9, 0, 0, 23, 0, 0, 0, 0, 0…
## $ satisfaction <chr> "neutral or dissatisfied", "neutral …
## $ loyal <dbl> 1, 0, 1, 1, 1, 1, 1, 1, 1, 0, 0, 1, …
## $ business_travel <dbl> 0, 1, 1, 1, 1, 0, 0, 1, 1, 1, 1, 0, …
## $ satisfaction_bin <dbl> 0, 0, 1, 0, 1, 0, 0, 1, 0, 0, 0, 0, …
# Number of missing values in each column of the training set.
train %>%
  is.na() %>%
  colSums()
## X id
## 0 0
## Gender Customer.Type
## 0 0
## Age Type.of.Travel
## 0 0
## Class Flight.Distance
## 0 0
## Inflight.wifi.service Departure.Arrival.time.convenient
## 0 0
## Ease.of.Online.booking Gate.location
## 0 0
## Food.and.drink Online.boarding
## 0 0
## Seat.comfort Inflight.entertainment
## 0 0
## On.board.service Leg.room.service
## 0 0
## Baggage.handling Checkin.service
## 0 0
## Inflight.service Cleanliness
## 0 0
## Departure.Delay.in.Minutes Arrival.Delay.in.Minutes
## 0 0
## satisfaction loyal
## 0 0
## business_travel satisfaction_bin
## 0 0
# Frequency of each travel class in the training set.
table(train[["Class"]])
##
## Eco Eco Plus Business
## 46593 7468 49533
# Share of each travel class in the training set.
proportions(table(train[["Class"]]))
##
## Eco Eco Plus Business
## 0.44976543 0.07208912 0.47814545
# Bar chart of the number of passengers in each travel class.
ggplot(data = train, mapping = aes(x = Class, fill = Class)) +
  geom_bar() +
  theme_minimal()
# Stacked 100% bar chart: share of satisfied vs. neutral/dissatisfied
# passengers within each travel class.
ggplot(train, aes(x = Class, fill = satisfaction)) +
  geom_bar(position = "fill", color = "white") +
  scale_fill_manual(
    values = c(
      "satisfied" = "#00b894",
      "neutral or dissatisfied" = "#d63031"
    ),
    # Labels are named so each one is matched to its own level. An unnamed
    # vector is matched to the breaks by position, and the breaks of this
    # character column are in alphabetical order ("neutral or dissatisfied"
    # first), which would put "Satisfied" on the dissatisfied colour.
    labels = c(
      "satisfied" = "Satisfied",
      "neutral or dissatisfied" = "Neutral/Dissatisfied"
    )
  ) +
  scale_y_continuous(labels = scales::percent) +
  labs(
    title = "Proporsi Kepuasan Penumpang per Kelas",
    x = "Kelas",
    y = "Proporsi",
    fill = "Kepuasan"
  ) +
  theme_minimal()
# Fill colour for each travel class, shared by the two boxplots below.
class_palette <- c(
  "Eco" = "#74b9ff",
  "Eco Plus" = "#0984e3",
  "Business" = "#2d3436"
)

# NOTE(review): color = "white" is the outline colour of the boxplot, which
# presumably also applies to whiskers and outlier points; these may be hard
# to see on theme_minimal()'s light background -- confirm this is intended.

# Passenger age by travel class.
ggplot(train, aes(x = Class, y = Age, fill = Class)) +
  geom_boxplot(color = "white", alpha = 0.85) +
  scale_fill_manual(values = class_palette) +
  labs(
    title = "Distribusi Usia Penumpang per Kelas",
    x = "Kelas",
    y = "Usia (Tahun)"
  ) +
  theme_minimal() +
  theme(legend.position = "none")

# Flight distance by travel class.
ggplot(train, aes(x = Class, y = Flight.Distance, fill = Class)) +
  geom_boxplot(color = "white", alpha = 0.85) +
  scale_fill_manual(values = class_palette) +
  labs(
    title = "Distribusi Jarak Penerbangan per Kelas",
    x = "Kelas",
    y = "Jarak (mil)"
  ) +
  theme_minimal() +
  theme(legend.position = "none")
# Safeguard: drop rows with a missing arrival delay (a no-op when the data
# contain no missing values at this point).
train <- train %>% filter(!is.na(Arrival.Delay.in.Minutes))
test <- test %>% filter(!is.na(Arrival.Delay.in.Minutes))

# Encode the target (ordered factor Eco < Eco Plus < Business) and the 0/1
# indicators loyal, business_travel and satisfaction_bin through the shared
# prepare_data() helper instead of repeating the factor()/ifelse() calls by
# hand, so the training and testing encodings cannot drift apart.
train <- prepare_data(train)
test <- prepare_data(test)
# Predictors used by every model in this analysis.
predictors <- c(
  "Age", "Flight.Distance",
  "business_travel", "loyal",
  "Inflight.wifi.service", "Seat.comfort",
  "Inflight.entertainment", "On.board.service",
  "Food.and.drink", "Cleanliness",
  "Departure.Delay.in.Minutes", "satisfaction_bin"
)

# Multicollinearity check: fit a linear model on the numeric class codes
# only so that vif() can be computed for the predictors.
formula_vif <- reformulate(predictors, response = quote(as.numeric(Class)))
model_vif <- lm(formula_vif, data = train)
vif(model_vif)
## Age Flight.Distance
## 1.131370 1.237154
## business_travel loyal
## 1.767073 1.554721
## Inflight.wifi.service Seat.comfort
## 1.115294 2.182682
## Inflight.entertainment On.board.service
## 3.037469 1.451024
## Food.and.drink Cleanliness
## 2.100972 2.739152
## Departure.Delay.in.Minutes satisfaction_bin
## 1.005301 1.910842
# Side-by-side boxplots of the three continuous predictors.
# par() returns the previous settings; keep them so the layout that was
# active before this block is restored afterwards instead of assuming it
# was a single panel.
old_par <- par(mfrow = c(1, 3))
boxplot(train$Age, main = "Age", col = "#74b9ff")
boxplot(train$Flight.Distance, main = "Flight Distance", col = "#fd79a8")
boxplot(train$Departure.Delay.in.Minutes, main = "Departure Delay", col = "#55efc4")
par(old_par)
# Ordinal logistic regression of travel class on all predictors.
formula_olr <- reformulate(predictors, response = "Class")

# NOTE(review): the fit itself does not appear to need a seed; it is kept so
# the random-number state is the same as before for any later code.
set.seed(42)

model_olr <- polr(
  formula_olr,
  data = train,
  Hess = TRUE, # keep the Hessian so summary() can report standard errors
  method = "logistic"
)
summary(model_olr)
## Call:
## polr(formula = formula_olr, data = train, Hess = TRUE, method = "logistic")
##
## Coefficients:
## Value Std. Error t value
## Age 5.761e-03 5.517e-04 10.4421
## Flight.Distance 9.309e-04 9.488e-06 98.1068
## business_travel 2.452e+00 2.131e-02 115.0529
## loyal 7.771e-01 2.173e-02 35.7672
## Inflight.wifi.service -2.195e-01 6.551e-03 -33.5070
## Seat.comfort 2.309e-01 8.904e-03 25.9269
## Inflight.entertainment -1.632e-01 1.062e-02 -15.3679
## On.board.service 3.099e-01 7.092e-03 43.7001
## Food.and.drink -1.958e-02 9.075e-03 -2.1579
## Cleanliness 9.709e-03 1.044e-02 0.9299
## Departure.Delay.in.Minutes -7.974e-05 2.051e-04 -0.3887
## satisfaction_bin 1.083e+00 2.031e-02 53.3286
##
## Intercepts:
## Value Std. Error t value
## Eco|Eco Plus 4.2749 0.0454 94.0837
## Eco Plus|Business 4.8078 0.0461 104.2861
##
## Residual Deviance: 125311.50
## AIC: 125339.50
# summary() of the polr fit reports no p-values, so derive two-sided ones
# from the "t value" column using the standard normal distribution.
coef_table <- coef(summary(model_olr))
z_stat <- coef_table[, "t value"]
p_values <- 2 * pnorm(abs(z_stat), lower.tail = FALSE)
cbind(coef_table, "p value" = p_values)
## Value Std. Error t value p value
## Age 5.761047e-03 5.517139e-04 10.4420924 1.592611e-25
## Flight.Distance 9.308542e-04 9.488177e-06 98.1067540 0.000000e+00
## business_travel 2.452213e+00 2.131378e-02 115.0529349 0.000000e+00
## loyal 7.770760e-01 2.172591e-02 35.7672458 3.569002e-280
## Inflight.wifi.service -2.195177e-01 6.551391e-03 -33.5070319 3.807240e-246
## Seat.comfort 2.308573e-01 8.904148e-03 25.9269346 3.310397e-148
## Inflight.entertainment -1.632444e-01 1.062244e-02 -15.3678821 2.688136e-53
## On.board.service 3.099054e-01 7.091647e-03 43.7000575 0.000000e+00
## Food.and.drink -1.958241e-02 9.074801e-03 -2.1578891 3.093646e-02
## Cleanliness 9.709493e-03 1.044153e-02 0.9298921 3.524270e-01
## Departure.Delay.in.Minutes -7.974476e-05 2.051421e-04 -0.3887295 6.974763e-01
## satisfaction_bin 1.082958e+00 2.030728e-02 53.3285551 0.000000e+00
## Eco|Eco Plus 4.274861e+00 4.543678e-02 94.0837214 0.000000e+00
## Eco Plus|Business 4.807775e+00 4.610178e-02 104.2861078 0.000000e+00
# Likelihood-ratio test of the full model against an intercept-only model.
model_null <- polr(
  formula = Class ~ 1,
  data = train,
  Hess = TRUE
)
lrt <- anova(model_null, model_olr)
print(lrt)
## Likelihood ratio tests of ordinal regression models
##
## Response: Class
## Model
## 1 1
## 2 Age + Flight.Distance + business_travel + loyal + Inflight.wifi.service + Seat.comfort + Inflight.entertainment + On.board.service + Food.and.drink + Cleanliness + Departure.Delay.in.Minutes + satisfaction_bin
## Resid. df Resid. Dev Test Df LR stat. Pr(Chi)
## 1 103592 186832.7
## 2 103580 125311.5 1 vs 2 12 61521.18 0
# Exponentiate the coefficients to express them as odds ratios.
model_olr %>%
  coef() %>%
  exp()
## Age Flight.Distance
## 1.0057777 1.0009313
## business_travel loyal
## 11.6140149 2.1751029
## Inflight.wifi.service Seat.comfort
## 0.8029060 1.2596794
## Inflight.entertainment On.board.service
## 0.8493836 1.3632961
## Food.and.drink Cleanliness
## 0.9806081 1.0097568
## Departure.Delay.in.Minutes satisfaction_bin
## 0.9999203 2.9534026
# Brant test of the parallel regression assumption (H0: the assumption
# holds), reported overall and per predictor.
brant::brant(model_olr)
## ------------------------------------------------------------
## Test for X2 df probability
## ------------------------------------------------------------
## Omnibus 5465.91 12 0
## Age 98.26 1 0
## Flight.Distance 1856.28 1 0
## business_travel 1053.65 1 0
## loyal 1192.44 1 0
## Inflight.wifi.service 261.52 1 0
## Seat.comfort 173.63 1 0
## Inflight.entertainment 57.69 1 0
## On.board.service 389.82 1 0
## Food.and.drink 0.61 1 0.43
## Cleanliness 3.05 1 0.08
## Departure.Delay.in.Minutes 0 1 1
## satisfaction_bin 374.21 1 0
## ------------------------------------------------------------
##
## H0: Parallel Regression Assumption holds
# Predict on the testing set: per-class probabilities and the predicted class.
pred_prob <- predict(model_olr, newdata = test, type = "probs")
pred_class <- predict(model_olr, newdata = test, type = "class")

# Preview the first eight test rows next to their predictions.
preview_idx <- 1:8
prob_preview <- data.frame(
  Umur = test$Age[preview_idx],
  Jarak = test$Flight.Distance[preview_idx],
  Puas = test$satisfaction[preview_idx],
  round(pred_prob[preview_idx, ], digits = 4),
  Prediksi = pred_class[preview_idx],
  Aktual = test$Class[preview_idx]
)
prob_preview
## Umur Jarak Puas Eco Eco.Plus Business Prediksi Aktual
## 1 52 160 satisfied 0.3100 0.1236 0.5664 Business Eco
## 2 36 2863 satisfied 0.0123 0.0085 0.9791 Business Business
## 3 20 192 neutral or dissatisfied 0.6485 0.1102 0.2413 Eco Eco
## 4 44 3377 satisfied 0.0112 0.0077 0.9811 Business Business
## 5 49 1182 satisfied 0.1549 0.0831 0.7620 Business Eco
## 6 16 311 satisfied 0.3043 0.1227 0.5730 Business Eco
## 7 77 3987 satisfied 0.0070 0.0049 0.9881 Business Business
## 8 43 2556 satisfied 0.0196 0.0134 0.9670 Business Business
# Confusion matrix of predicted vs. actual class on the testing set,
# reshaped to long format for plotting.
cm <- confusionMatrix(data = pred_class, reference = test$Class)
cm_df <- setNames(
  as.data.frame(cm$table),
  c("Prediksi", "Aktual", "Frekuensi")
)

# Heat map of the confusion matrix with the cell counts printed on top.
ggplot(cm_df, aes(x = Aktual, y = Prediksi, fill = Frekuensi)) +
  geom_tile(color = "white") +
  geom_text(aes(label = Frekuensi), size = 5, fontface = "bold") +
  scale_fill_gradient(low = "#dfe6e9", high = "#0984e3") +
  labs(
    title = "Confusion Matrix — Prediksi vs Aktual",
    subtitle = "Data Testing",
    x = "Kelas Aktual",
    y = "Kelas Prediksi"
  ) +
  theme_minimal()

# Overall accuracy: share of test rows whose predicted label equals the
# actual label.
is_correct <- as.character(pred_class) == as.character(test$Class)
akurasi <- mean(is_correct)
cat("Akurasi Model:", round(akurasi * 100, 2), "%\n")
## Akurasi Model: 75.89 %
# Scenario grid: vary flight distance over its observed range (200 points)
# while holding every other predictor fixed -- mean age, business traveller,
# loyal, all service ratings at 3, no departure delay, satisfied.
distance_range <- range(train$Flight.Distance)
new_data_fd <- data.frame(
  Age = mean(train$Age),
  Flight.Distance = seq(
    from = distance_range[1],
    to = distance_range[2],
    length.out = 200
  ),
  business_travel = 1,
  loyal = 1,
  Inflight.wifi.service = 3,
  Seat.comfort = 3,
  Inflight.entertainment = 3,
  On.board.service = 3,
  Food.and.drink = 3,
  Cleanliness = 3,
  Departure.Delay.in.Minutes = 0,
  satisfaction_bin = 1
)

# Predicted class probabilities for every point of the grid.
prob_fd <- predict(model_olr, newdata = new_data_fd, type = "probs")
prob_fd_df <- as.data.frame(prob_fd)
prob_fd_df$Flight.Distance <- new_data_fd$Flight.Distance

# Long format (one row per distance and class) for plotting, with the
# classes kept in their natural order.
class_levels <- c("Eco", "Eco Plus", "Business")
prob_long <- prob_fd_df %>%
  pivot_longer(
    cols = all_of(class_levels),
    names_to = "Class",
    values_to = "Probability"
  )
prob_long$Class <- factor(prob_long$Class, levels = class_levels)
# Predicted probability of each travel class as flight distance increases.
ggplot(prob_long, aes(x = Flight.Distance, y = Probability, color = Class)) +
  # `linewidth` sets the line thickness; using `size` for lines has been
  # deprecated since ggplot2 3.4.0 and raises a warning.
  geom_line(linewidth = 1.3) +
  scale_color_manual(
    values = c(
      "Eco" = "#74b9ff",
      "Eco Plus" = "#0984e3",
      "Business" = "#2d3436"
    )
  ) +
  labs(
    title = "Prediksi Probabilitas Kelas vs Jarak Penerbangan",
    subtitle = "Kondisi: Business traveler, loyal, semua rating layanan = 3, satisfied",
    x = "Jarak Penerbangan (mil)",
    y = "Probabilitas",
    color = "Kelas"
  ) +
  theme_minimal()
## Warning: Using `size` aesthetic for lines was deprecated in ggplot2 3.4.0.
## ℹ Please use `linewidth` instead.
## This warning is displayed once per session.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.