Dataset: Airline Passenger Satisfaction. Link data: https://www.kaggle.com/datasets/teejmahal20/airline-passenger-satisfaction
library(MASS)
## Warning: package 'MASS' was built under R version 4.5.3
library(car)
## Warning: package 'car' was built under R version 4.5.3
## Loading required package: carData
## Warning: package 'carData' was built under R version 4.5.3
library(ggplot2)
## Warning: package 'ggplot2' was built under R version 4.5.3
library(tidyverse)
## Warning: package 'tidyverse' was built under R version 4.5.3
## Warning: package 'tibble' was built under R version 4.5.3
## Warning: package 'tidyr' was built under R version 4.5.3
## Warning: package 'readr' was built under R version 4.5.3
## Warning: package 'purrr' was built under R version 4.5.3
## Warning: package 'dplyr' was built under R version 4.5.3
## Warning: package 'forcats' was built under R version 4.5.3
## Warning: package 'lubridate' was built under R version 4.5.3
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr 1.2.1 ✔ readr 2.2.0
## ✔ forcats 1.0.1 ✔ stringr 1.6.0
## ✔ lubridate 1.9.5 ✔ tibble 3.3.1
## ✔ purrr 1.2.2 ✔ tidyr 1.3.2
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
## ✖ dplyr::recode() masks car::recode()
## ✖ dplyr::select() masks MASS::select()
## ✖ purrr::some() masks car::some()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(corrplot)
## Warning: package 'corrplot' was built under R version 4.5.3
## corrplot 0.95 loaded
library(gridExtra)
## Warning: package 'gridExtra' was built under R version 4.5.3
##
## Attaching package: 'gridExtra'
##
## The following object is masked from 'package:dplyr':
##
## combine
library(nnet)
## Warning: package 'nnet' was built under R version 4.5.3
library(lmtest)
## Warning: package 'lmtest' was built under R version 4.5.3
## Loading required package: zoo
## Warning: package 'zoo' was built under R version 4.5.3
##
## Attaching package: 'zoo'
##
## The following objects are masked from 'package:base':
##
## as.Date, as.Date.numeric
library(dplyr)
library(haven)
## Warning: package 'haven' was built under R version 4.5.3
# Load the training file from the current working directory.
df <- read.csv("train.csv")
# Preview the first rows to check how the columns were parsed.
head(df)
## X id Gender Customer.Type Age Type.of.Travel Class
## 1 0 70172 Male Loyal Customer 13 Personal Travel Eco Plus
## 2 1 5047 Male disloyal Customer 25 Business travel Business
## 3 2 110028 Female Loyal Customer 26 Business travel Business
## 4 3 24026 Female Loyal Customer 25 Business travel Business
## 5 4 119299 Male Loyal Customer 61 Business travel Business
## 6 5 111157 Female Loyal Customer 26 Personal Travel Eco
## Flight.Distance Inflight.wifi.service Departure.Arrival.time.convenient
## 1 460 3 4
## 2 235 3 2
## 3 1142 2 2
## 4 562 2 5
## 5 214 3 3
## 6 1180 3 4
## Ease.of.Online.booking Gate.location Food.and.drink Online.boarding
## 1 3 1 5 3
## 2 3 3 1 3
## 3 2 2 5 5
## 4 5 5 2 2
## 5 3 3 4 5
## 6 2 1 1 2
## Seat.comfort Inflight.entertainment On.board.service Leg.room.service
## 1 5 5 4 3
## 2 1 1 1 5
## 3 5 5 4 3
## 4 2 2 2 5
## 5 5 3 3 4
## 6 1 1 3 4
## Baggage.handling Checkin.service Inflight.service Cleanliness
## 1 4 4 5 5
## 2 3 1 4 1
## 3 4 4 4 5
## 4 3 1 4 2
## 5 4 3 3 3
## 6 4 4 4 1
## Departure.Delay.in.Minutes Arrival.Delay.in.Minutes satisfaction
## 1 25 18 neutral or dissatisfied
## 2 1 6 neutral or dissatisfied
## 3 0 0 satisfied
## 4 11 9 neutral or dissatisfied
## 5 0 0 satisfied
## 6 0 0 neutral or dissatisfied
# Count the missing values in each column of the raw data.
colSums(is.na(df))
## X id
## 0 0
## Gender Customer.Type
## 0 0
## Age Type.of.Travel
## 0 0
## Class Flight.Distance
## 0 0
## Inflight.wifi.service Departure.Arrival.time.convenient
## 0 0
## Ease.of.Online.booking Gate.location
## 0 0
## Food.and.drink Online.boarding
## 0 0
## Seat.comfort Inflight.entertainment
## 0 0
## On.board.service Leg.room.service
## 0 0
## Baggage.handling Checkin.service
## 0 0
## Inflight.service Cleanliness
## 0 0
## Departure.Delay.in.Minutes Arrival.Delay.in.Minutes
## 0 310
## satisfaction
## 0
# Show the storage type and a few sample values of every column.
str(df)
## 'data.frame': 103904 obs. of 25 variables:
## $ X : int 0 1 2 3 4 5 6 7 8 9 ...
## $ id : int 70172 5047 110028 24026 119299 111157 82113 96462 79485 65725 ...
## $ Gender : chr "Male" "Male" "Female" "Female" ...
## $ Customer.Type : chr "Loyal Customer" "disloyal Customer" "Loyal Customer" "Loyal Customer" ...
## $ Age : int 13 25 26 25 61 26 47 52 41 20 ...
## $ Type.of.Travel : chr "Personal Travel" "Business travel" "Business travel" "Business travel" ...
## $ Class : chr "Eco Plus" "Business" "Business" "Business" ...
## $ Flight.Distance : int 460 235 1142 562 214 1180 1276 2035 853 1061 ...
## $ Inflight.wifi.service : int 3 3 2 2 3 3 2 4 1 3 ...
## $ Departure.Arrival.time.convenient: int 4 2 2 5 3 4 4 3 2 3 ...
## $ Ease.of.Online.booking : int 3 3 2 5 3 2 2 4 2 3 ...
## $ Gate.location : int 1 3 2 5 3 1 3 4 2 4 ...
## $ Food.and.drink : int 5 1 5 2 4 1 2 5 4 2 ...
## $ Online.boarding : int 3 3 5 2 5 2 2 5 3 3 ...
## $ Seat.comfort : int 5 1 5 2 5 1 2 5 3 3 ...
## $ Inflight.entertainment : int 5 1 5 2 3 1 2 5 1 2 ...
## $ On.board.service : int 4 1 4 2 3 3 3 5 1 2 ...
## $ Leg.room.service : int 3 5 3 5 4 4 3 5 2 3 ...
## $ Baggage.handling : int 4 3 4 3 4 4 4 5 1 4 ...
## $ Checkin.service : int 4 1 4 1 3 4 3 4 4 4 ...
## $ Inflight.service : int 5 4 4 4 3 4 5 5 1 3 ...
## $ Cleanliness : int 5 1 5 2 3 1 2 4 2 2 ...
## $ Departure.Delay.in.Minutes : int 25 1 0 11 0 0 9 4 0 0 ...
## $ Arrival.Delay.in.Minutes : num 18 6 0 9 0 0 23 0 0 0 ...
## $ satisfaction : chr "neutral or dissatisfied" "neutral or dissatisfied" "satisfied" "neutral or dissatisfied" ...
# Fix the RNG state so the random subsample drawn below is reproducible.
set.seed(123)

# Keep the response (Seat.comfort) and the candidate predictors, drop every
# row that has a missing value, then draw a random subsample of 10,000 rows.
df_model <- df |>
  select(
    all_of(c(
      "Seat.comfort",
      "Age",
      "Flight.Distance",
      "Departure.Delay.in.Minutes",
      "Arrival.Delay.in.Minutes",
      "Class",
      "Type.of.Travel",
      "Gender",
      "Customer.Type",
      "Food.and.drink",
      "Inflight.wifi.service"
    ))
  ) |>
  na.omit() |>
  slice_sample(n = 10000)

# Confirm that no missing values remain in the modelling data.
colSums(is.na(df_model))
## Seat.comfort Age
## 0 0
## Flight.Distance Departure.Delay.in.Minutes
## 0 0
## Arrival.Delay.in.Minutes Class
## 0 0
## Type.of.Travel Gender
## 0 0
## Customer.Type Food.and.drink
## 0 0
## Inflight.wifi.service
## 0
Handling Outlier
# Summarise IQR-rule outliers for one numeric variable.
#
# A value is counted as an outlier when it lies below Q1 - 1.5 * IQR or above
# Q3 + 1.5 * IQR. Missing values are ignored when computing the quartiles, the
# outlier count and the outlier percentage.
#
# Arguments:
#   x        Numeric vector to screen.
#   var_name Label stored in the `Variabel` column of the result.
#
# Returns a one-row data frame with the columns Variabel, Batas_Bawah (lower
# fence), Batas_Atas (upper fence), Jumlah_Outlier (count) and Persen (share of
# the non-missing values, in percent; NA when there are no such values).
detect_outliers <- function(x, var_name) {
  # names = FALSE stops the "25%"/"75%" labels from becoming row names of the
  # returned data frame.
  quartiles <- quantile(x, c(0.25, 0.75), na.rm = TRUE, names = FALSE)
  iqr_val <- quartiles[2] - quartiles[1]
  lower <- quartiles[1] - 1.5 * iqr_val
  upper <- quartiles[2] + 1.5 * iqr_val

  n_out <- sum(x < lower | x > upper, na.rm = TRUE)
  # The count above skips NAs, so the denominator must skip them as well;
  # otherwise the percentage is understated whenever x contains NAs.
  n_obs <- sum(!is.na(x))
  pct_out <- if (n_obs > 0) round(n_out / n_obs * 100, 2) else NA_real_

  data.frame(
    Variabel = var_name,
    Batas_Bawah = round(lower, 3),
    Batas_Atas = round(upper, 3),
    Jumlah_Outlier = n_out,
    Persen = pct_out
  )
}
# Continuous predictors that are screened with the IQR rule.
continuous_vars_for_outlier_check <- select(
  df_model,
  Flight.Distance, Departure.Delay.in.Minutes, Arrival.Delay.in.Minutes
)

# Build one summary row per column (column values + column name), then stack
# the rows into a single table with default row names.
outlier_tbl <- do.call(
  rbind,
  unname(Map(
    detect_outliers,
    continuous_vars_for_outlier_check,
    names(continuous_vars_for_outlier_check)
  ))
)
rownames(outlier_tbl) <- NULL

knitr::kable(outlier_tbl, caption = "Deteksi Outlier")
| Variabel | Batas_Bawah | Batas_Atas | Jumlah_Outlier | Persen |
|---|---|---|---|---|
| Flight.Distance | -1530.25 | 3635.75 | 293 | 2.93 |
| Departure.Delay.in.Minutes | -18.00 | 30.00 | 1368 | 13.68 |
| Arrival.Delay.in.Minutes | -19.50 | 32.50 | 1326 | 13.26 |
# One boxplot per screened variable before capping; outliers are drawn in red.
plots_before <- lapply(
  names(continuous_vars_for_outlier_check),
  function(v) {
    box_layer <- geom_boxplot(
      fill = "#4292C6", color = "#084594",
      outlier.color = "red", outlier.size = 2
    )
    centred_bold_title <- theme(
      plot.title = element_text(face = "bold", hjust = 0.5)
    )
    ggplot(continuous_vars_for_outlier_check, aes(y = .data[[v]])) +
      box_layer +
      labs(title = v) +
      theme_minimal(base_size = 14) +
      centred_bold_title
  }
)

# Arrange the plots on a two-column grid under a shared heading.
grid.arrange(grobs = plots_before, ncol = 2, top = "Boxplot")
# Cap (winsorise) a numeric vector at its IQR fences.
#
# Values below Q1 - 1.5 * IQR are replaced by that lower fence and values above
# Q3 + 1.5 * IQR by that upper fence; the quartiles are computed with missing
# values removed.
#
# Arguments:
#   x  Numeric vector.
#
# Returns x with the out-of-fence values replaced by the nearest fence.
cap_outliers <- function(x) {
  quartiles <- quantile(x, c(0.25, 0.75), na.rm = TRUE)
  spread <- quartiles[[2]] - quartiles[[1]]
  lower <- quartiles[[1]] - 1.5 * spread
  upper <- quartiles[[2]] + 1.5 * spread

  capped <- replace(x, x < lower, lower)
  replace(capped, capped > upper, upper)
}
# Columns whose extreme values are capped at the IQR fences.
cols_to_cap <- c(
  "Flight.Distance",
  "Departure.Delay.in.Minutes",
  "Arrival.Delay.in.Minutes"
)
# Replace each listed column by its capped version.
df_model <- mutate(df_model, across(all_of(cols_to_cap), cap_outliers))
No Outlier
# Outlier counts per capped column: before capping (from the snapshot taken
# earlier in continuous_vars_for_outlier_check) and after capping (df_model).
hasil_perbandingan <- data.frame(
  Variabel = cols_to_cap,
  # before capping
  Outlier_Sebelum = vapply(
    cols_to_cap,
    function(v_name) {
      detect_outliers(
        continuous_vars_for_outlier_check[[v_name]], v_name
      )$Jumlah_Outlier
    },
    integer(1),
    USE.NAMES = FALSE
  ),
  # after capping
  Outlier_Sesudah = vapply(
    cols_to_cap,
    function(v_name) detect_outliers(df_model[[v_name]], v_name)$Jumlah_Outlier,
    integer(1),
    USE.NAMES = FALSE
  )
)

knitr::kable(
  hasil_perbandingan,
  caption = "Perbandingan Outlier Sebelum dan Sesudah Capping"
)
| Variabel | Outlier_Sebelum | Outlier_Sesudah |
|---|---|---|
| Flight.Distance | 293 | 0 |
| Departure.Delay.in.Minutes | 1368 | 0 |
| Arrival.Delay.in.Minutes | 1326 | 0 |
# One boxplot per capped variable, drawn from the capped data in df_model.
plots_after <- lapply(
  cols_to_cap,
  function(v) {
    box_layer <- geom_boxplot(
      fill = "#9ecae1", color = "#08519c",
      outlier.color = "red", outlier.size = 2
    )
    centred_bold_title <- theme(
      plot.title = element_text(face = "bold", hjust = 0.5)
    )
    ggplot(df_model, aes(y = .data[[v]])) +
      box_layer +
      labs(title = v) +
      theme_minimal(base_size = 12) +
      centred_bold_title
  }
)

# Arrange the plots on a two-column grid under a shared heading.
grid.arrange(
  grobs = plots_after, ncol = 2,
  top = "Boxplot (Sesudah Capping)"
)
# Drop rows that repeat an earlier row on every retained column.
# NOTE(review): the id column is not part of df_model, so two different
# passengers with identical values are treated as duplicates - confirm that
# this is intended.
df_model <- df_model[!duplicated(df_model), ]

df_model <- df_model |>
  mutate(
    # Shift both delay variables up by one minute.
    Departure.Delay.in.Minutes = Departure.Delay.in.Minutes + 1,
    Arrival.Delay.in.Minutes = Arrival.Delay.in.Minutes + 1,
    # Ordered response for the ordinal model, plus its integer level codes as
    # a numeric copy for the linear diagnostics.
    Seat.comfort = as.ordered(Seat.comfort),
    y_num = as.numeric(Seat.comfort),
    # Categorical predictors and the two service ratings become factors.
    across(
      c(
        Class, Type.of.Travel, Gender, Customer.Type,
        Food.and.drink, Inflight.wifi.service
      ),
      as.factor
    )
  )

summary(df_model)
## Seat.comfort Age Flight.Distance Departure.Delay.in.Minutes
## 1:1146 Min. : 7.0 Min. : 31 Min. : 1.00
## 2:1471 1st Qu.:27.0 1st Qu.: 407 1st Qu.: 1.00
## 3:1815 Median :40.0 Median : 842 Median : 1.00
## 4:3074 Mean :39.2 Mean :1166 Mean : 8.34
## 5:2490 3rd Qu.:50.0 3rd Qu.:1698 3rd Qu.:13.00
## Max. :80.0 Max. :3636 Max. :31.00
## Arrival.Delay.in.Minutes Class Type.of.Travel Gender
## Min. : 1.000 Business:4720 Business travel:6923 Female:5069
## 1st Qu.: 1.000 Eco :4599 Personal Travel:3073 Male :4927
## Median : 1.000 Eco Plus: 677
## Mean : 8.853
## 3rd Qu.:14.000
## Max. :33.500
## Customer.Type Food.and.drink Inflight.wifi.service y_num
## disloyal Customer:1864 0: 11 0: 271 Min. :1.000
## Loyal Customer :8132 1:1221 1:1700 1st Qu.:2.000
## 2:2135 2:2489 Median :4.000
## 3:2206 3:2531 Mean :3.429
## 4:2357 4:1926 3rd Qu.:4.000
## 5:2066 5:1079 Max. :5.000
# Install psych only when it is not available. requireNamespace() tests for
# the package without attaching it (require() would attach it as a side effect
# of the test); attaching is left to the library(psych) call that follows.
if (!requireNamespace("psych", quietly = TRUE)) install.packages("psych")
## Loading required package: psych
## Warning: package 'psych' was built under R version 4.5.3
##
## Attaching package: 'psych'
## The following objects are masked from 'package:ggplot2':
##
## %+%, alpha
## The following object is masked from 'package:car':
##
## logit
library(psych)
# Descriptive statistics (mean, sd, median, skew, kurtosis, ...) for the four
# numeric predictors.
describe(df_model[, c("Age", "Flight.Distance", "Departure.Delay.in.Minutes", "Arrival.Delay.in.Minutes")])
## vars n mean sd median trimmed mad min
## Age 1 9996 39.20 14.96 40 39.16 17.79 7
## Flight.Distance 2 9996 1166.34 970.92 842 1024.69 757.61 31
## Departure.Delay.in.Minutes 3 9996 8.34 11.16 1 6.43 0.00 1
## Arrival.Delay.in.Minutes 4 9996 8.85 11.89 1 6.76 0.00 1
## max range skew kurtosis se
## Age 80.00 73.00 0.02 -0.71 0.15
## Flight.Distance 3635.75 3604.75 1.09 0.15 9.71
## Departure.Delay.in.Minutes 31.00 30.00 1.23 -0.16 0.11
## Arrival.Delay.in.Minutes 33.50 32.50 1.25 -0.06 0.12
# Log-transform Age and Arrival.Delay.in.Minutes; the results are stored as
# new columns next to the original ones.
df_model <- df_model |>
  mutate(
    Age_log = log(Age),
    Arrival.Delay.in.Minutes_log = log(Arrival.Delay.in.Minutes)
  )
Interpretasi Variabel
# Baseline ordinal logistic regression of Seat.comfort on the numeric and
# categorical passenger/flight predictors (service ratings not yet included).
# Hess = TRUE keeps the Hessian so that summary() can report standard errors.
model_olr <- polr(
Seat.comfort ~ Age_log + Flight.Distance +
Departure.Delay.in.Minutes +
Arrival.Delay.in.Minutes_log +
Class + Type.of.Travel +
Gender + Customer.Type,
data = df_model,
Hess = TRUE
)
summary(model_olr)
## Call:
## polr(formula = Seat.comfort ~ Age_log + Flight.Distance + Departure.Delay.in.Minutes +
## Arrival.Delay.in.Minutes_log + Class + Type.of.Travel + Gender +
## Customer.Type, data = df_model, Hess = TRUE)
##
## Coefficients:
## Value Std. Error t value
## Age_log 4.709e-01 1.820e-02 25.8678
## Flight.Distance 3.808e-05 2.545e-05 1.4958
## Departure.Delay.in.Minutes -2.403e-03 2.605e-03 -0.9222
## Arrival.Delay.in.Minutes_log -3.378e-02 2.033e-02 -1.6616
## ClassEco -5.264e-01 4.736e-02 -11.1163
## ClassEco Plus -6.324e-01 7.704e-02 -8.2084
## Type.of.TravelPersonal Travel -1.853e-01 5.232e-02 -3.5417
## GenderMale -1.448e-01 3.566e-02 -4.0612
## Customer.TypeLoyal Customer 5.757e-01 5.589e-02 10.3021
##
## Intercepts:
## Value Std. Error t value
## 1|2 -0.4155 0.0091 -45.8215
## 2|3 0.6413 0.0269 23.8564
## 3|4 1.5127 0.0317 47.7605
## 4|5 2.9221 0.0378 77.3862
##
## Residual Deviance: 30145.72
## AIC: 30171.72
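Berdasarkan nilai t (|t| > 1,96 pada taraf signifikansi 5%), variabel usia, kelas penerbangan, tipe perjalanan, gender, dan tipe pelanggan berpengaruh signifikan terhadap tingkat kenyamanan kursi (Seat Comfort). Sementara itu, jarak penerbangan (t = 1,50), keterlambatan keberangkatan (t = -0,92), dan keterlambatan kedatangan (t = -1,66) tidak menunjukkan pengaruh signifikan pada taraf 5%; keterlambatan kedatangan hanya signifikan pada taraf 10%.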
Linearity (p-value > 0.05)
# Box-Tidwell-style linearity check: every continuous predictor enters together
# with an x * log(x) companion term; a non-significant companion term is read
# as no evidence against linearity for that predictor.
# NOTE(review): glm() is called without a family argument, so this fits a
# Gaussian model to the numeric score y_num rather than the ordinal logit
# model itself - confirm that this proxy is acceptable.
# NOTE(review): the two delay terms use log(x + 1) instead of log(x), unlike
# the Age_log and Flight.Distance terms, and Departure.Delay.in.Minutes was
# already shifted by +1 earlier in the script - confirm that this is intended.
model_lin <- glm(
y_num ~ Age_log + Flight.Distance +
Departure.Delay.in.Minutes +
Arrival.Delay.in.Minutes_log +
I(Age_log * log(Age_log)) +
I(Flight.Distance * log(Flight.Distance)) +
I(Departure.Delay.in.Minutes * log(Departure.Delay.in.Minutes + 1)) +
I(Arrival.Delay.in.Minutes_log * log(Arrival.Delay.in.Minutes_log + 1)),
data = df_model
)
summary(model_lin)
##
## Call:
## glm(formula = y_num ~ Age_log + Flight.Distance + Departure.Delay.in.Minutes +
## Arrival.Delay.in.Minutes_log + I(Age_log * log(Age_log)) +
## I(Flight.Distance * log(Flight.Distance)) + I(Departure.Delay.in.Minutes *
## log(Departure.Delay.in.Minutes + 1)) + I(Arrival.Delay.in.Minutes_log *
## log(Arrival.Delay.in.Minutes_log + 1)), data = df_model)
##
## Coefficients:
## Estimate
## (Intercept) 2.615e+00
## Age_log -2.200e-01
## Flight.Distance -2.973e-04
## Departure.Delay.in.Minutes 1.415e-02
## Arrival.Delay.in.Minutes_log 4.922e-02
## I(Age_log * log(Age_log)) 3.231e-01
## I(Flight.Distance * log(Flight.Distance)) 5.580e-05
## I(Departure.Delay.in.Minutes * log(Departure.Delay.in.Minutes + 1)) -4.238e-03
## I(Arrival.Delay.in.Minutes_log * log(Arrival.Delay.in.Minutes_log + 1)) -5.469e-02
## Std. Error
## (Intercept) 8.966e-01
## Age_log 5.992e-01
## Flight.Distance 3.177e-04
## Departure.Delay.in.Minutes 1.573e-02
## Arrival.Delay.in.Minutes_log 7.245e-02
## I(Age_log * log(Age_log)) 2.725e-01
## I(Flight.Distance * log(Flight.Distance)) 3.835e-05
## I(Departure.Delay.in.Minutes * log(Departure.Delay.in.Minutes + 1)) 4.532e-03
## I(Arrival.Delay.in.Minutes_log * log(Arrival.Delay.in.Minutes_log + 1)) 5.344e-02
## t value
## (Intercept) 2.916
## Age_log -0.367
## Flight.Distance -0.936
## Departure.Delay.in.Minutes 0.900
## Arrival.Delay.in.Minutes_log 0.679
## I(Age_log * log(Age_log)) 1.186
## I(Flight.Distance * log(Flight.Distance)) 1.455
## I(Departure.Delay.in.Minutes * log(Departure.Delay.in.Minutes + 1)) -0.935
## I(Arrival.Delay.in.Minutes_log * log(Arrival.Delay.in.Minutes_log + 1)) -1.023
## Pr(>|t|)
## (Intercept) 0.00355
## Age_log 0.71355
## Flight.Distance 0.34943
## Departure.Delay.in.Minutes 0.36827
## Arrival.Delay.in.Minutes_log 0.49690
## I(Age_log * log(Age_log)) 0.23578
## I(Flight.Distance * log(Flight.Distance)) 0.14567
## I(Departure.Delay.in.Minutes * log(Departure.Delay.in.Minutes + 1)) 0.34973
## I(Arrival.Delay.in.Minutes_log * log(Arrival.Delay.in.Minutes_log + 1)) 0.30610
##
## (Intercept) **
## Age_log
## Flight.Distance
## Departure.Delay.in.Minutes
## Arrival.Delay.in.Minutes_log
## I(Age_log * log(Age_log))
## I(Flight.Distance * log(Flight.Distance))
## I(Departure.Delay.in.Minutes * log(Departure.Delay.in.Minutes + 1))
## I(Arrival.Delay.in.Minutes_log * log(Arrival.Delay.in.Minutes_log + 1))
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for gaussian family taken to be 1.638498)
##
## Null deviance: 17247 on 9995 degrees of freedom
## Residual deviance: 16364 on 9987 degrees of freedom
## AIC: 33314
##
## Number of Fisher Scoring iterations: 2
Berdasarkan uji Box-Tidwell, seluruh variabel numerik memiliki nilai p-value > 0.05 pada komponen interaksi dengan logaritmanya. Hal ini menunjukkan bahwa hubungan antara prediktor kontinu dengan logit bersifat linear, sehingga asumsi linearitas pada model Ordinal Logistic Regression telah terpenuhi.
Independence Of Observation
# List the columns currently held in the modelling data.
names(df_model)
## [1] "Seat.comfort" "Age"
## [3] "Flight.Distance" "Departure.Delay.in.Minutes"
## [5] "Arrival.Delay.in.Minutes" "Class"
## [7] "Type.of.Travel" "Gender"
## [9] "Customer.Type" "Food.and.drink"
## [11] "Inflight.wifi.service" "y_num"
## [13] "Age_log" "Arrival.Delay.in.Minutes_log"
# Count the rows that repeat an earlier row on every column.
# NOTE(review): duplicated rows were already removed from df_model earlier in
# the script, so this count is expected to be 0 by construction.
sum(duplicated(df_model))
## [1] 0
# Number of rows in the raw data (before NA removal and subsampling).
nrow(df)
## [1] 103904
Berdasarkan pemeriksaan terhadap data, tidak ditemukan observasi duplikat (sum duplicated = 0), sehingga setiap pengamatan dianggap independen. Dengan demikian, asumsi independence of observation pada model Ordinal Logistic Regression telah terpenuhi.
No Multicollinearity (GVIF<10)
# Pairwise correlations between ALL continuous predictors used in the models.
# Departure.Delay.in.Minutes is included on purpose: it is a model predictor,
# and leaving it out would hide any correlation between the two delay
# variables from this check.
cont_vars <- df_model %>%
  select(
    Age,
    Flight.Distance,
    Departure.Delay.in.Minutes,
    Arrival.Delay.in.Minutes
  )
cor_mat <- cor(cont_vars, use = "complete.obs")

# Heat map of the correlation matrix with the coefficients printed in each cell.
corrplot(cor_mat,
  method = "color",
  type = "full",
  addCoef.col = "black",
  number.cex = 0.8,
  tl.cex = 0.9,
  tl.col = "black",
  col = colorRampPalette(c("#2166AC", "white", "#B2182B"))(200),
  title = "Correlation Matrix Variabel Kontinu",
  mar = c(0, 0, 2, 0)
)
Semua nilai korelasi antar variabel numerik (independen) tersebut berada jauh di bawah ambang batas kritis. Secara umum, multikolinearitas dianggap serius jika nilai korelasi antar variabel independen melebihi 0.7 atau 0.8. Dari Correlation matrix diatas, variabel numerik berkorelasi < 0.1 sehingga asumsi No Multicollinearity terpenuhi.
# Auxiliary linear model used only to compute (G)VIF values. Its right-hand
# side mirrors the predictors of the final ordinal model exactly (including
# Departure.Delay.in.Minutes and the log-transformed Age and arrival delay),
# so the collinearity diagnostics describe the model that is actually
# interpreted.
model_vif <- lm(
  y_num ~ Age_log + Flight.Distance +
    Departure.Delay.in.Minutes +
    Arrival.Delay.in.Minutes_log +
    Class + Type.of.Travel +
    Gender + Customer.Type +
    Food.and.drink + Inflight.wifi.service,
  data = df_model
)
# GVIF per term; GVIF^(1/(2*Df)) makes terms with several dummies comparable.
vif(model_vif)
## GVIF Df GVIF^(1/(2*Df))
## Age 1.116954 1 1.056860
## Flight.Distance 1.349677 1 1.161756
## Arrival.Delay.in.Minutes 1.007879 1 1.003932
## Class 1.819921 2 1.161484
## Type.of.Travel 1.841931 1 1.357178
## Gender 1.003236 1 1.001617
## Customer.Type 1.451381 1 1.204733
## Food.and.drink 1.129553 5 1.012257
## Inflight.wifi.service 1.165839 5 1.015462
Berdasarkan hasil uji Variance Inflation Factor (VIF), seluruh variabel independen memiliki nilai GVIF yang berkisar antara 1,00 hingga 1,84. Karena nilai tersebut jauh di bawah ambang batas 10, maka dapat disimpulkan bahwa asumsi no-multicollinearity terpenuhi. Model tidak mengandung hubungan linear yang kuat antar variabel independen, sehingga estimasi parameter yang dihasilkan bersifat valid.
# Final ordinal logistic regression: the baseline predictors plus the two
# service ratings (Food.and.drink, Inflight.wifi.service), which were converted
# to factors earlier and therefore enter as dummy variables.
# Hess = TRUE keeps the Hessian so that summary() can report standard errors.
model_ordinallogisticregression <- polr(
Seat.comfort ~ Age_log +
Flight.Distance +
Departure.Delay.in.Minutes +
Arrival.Delay.in.Minutes_log +
Class +
Type.of.Travel +
Gender +
Customer.Type +
Food.and.drink +
Inflight.wifi.service,
data = df_model,
Hess = TRUE
)
summary(model_ordinallogisticregression)
## Call:
## polr(formula = Seat.comfort ~ Age_log + Flight.Distance + Departure.Delay.in.Minutes +
## Arrival.Delay.in.Minutes_log + Class + Type.of.Travel + Gender +
## Customer.Type + Food.and.drink + Inflight.wifi.service, data = df_model,
## Hess = TRUE)
##
## Coefficients:
## Value Std. Error t value
## Age_log 5.658e-01 1.955e-02 28.9429
## Flight.Distance 4.027e-05 2.588e-05 1.5557
## Departure.Delay.in.Minutes -4.183e-03 2.698e-03 -1.5503
## Arrival.Delay.in.Minutes_log -9.397e-03 2.107e-02 -0.4460
## ClassEco -5.105e-01 4.858e-02 -10.5087
## ClassEco Plus -6.649e-01 7.763e-02 -8.5646
## Type.of.TravelPersonal Travel -7.592e-02 5.378e-02 -1.4118
## GenderMale -2.167e-01 3.716e-02 -5.8315
## Customer.TypeLoyal Customer 4.950e-01 5.597e-02 8.8454
## Food.and.drink1 -2.974e+00 5.811e-02 -51.1850
## Food.and.drink2 -8.266e-01 3.785e-02 -21.8409
## Food.and.drink3 -1.180e-01 3.611e-02 -3.2670
## Food.and.drink4 6.857e-01 3.681e-02 18.6287
## Food.and.drink5 2.099e+00 4.702e-02 44.6442
## Inflight.wifi.service1 -1.919e-01 4.145e-02 -4.6303
## Inflight.wifi.service2 -3.072e-01 3.568e-02 -8.6091
## Inflight.wifi.service3 -2.625e-01 3.544e-02 -7.4078
## Inflight.wifi.service4 -2.784e-02 3.865e-02 -0.7204
## Inflight.wifi.service5 1.645e-01 5.135e-02 3.2026
##
## Intercepts:
## Value Std. Error t value
## 1|2 -1.0440 0.0092 -113.7470
## 2|3 0.4573 0.0374 12.2348
## 3|4 1.6625 0.0445 37.3873
## 4|5 3.5277 0.0533 66.2369
##
## Residual Deviance: 25603.67
## AIC: 25649.67
# Coefficient table (estimate, standard error, t value) of the final model.
ctable <- summary(model_ordinallogisticregression) |> coef()

# Two-sided p-values from the normal approximation to the t values.
p_value <- 2 * pnorm(abs(ctable[, "t value"]), lower.tail = FALSE)

# Estimates next to their exponentiated values and p-values.
# NOTE(review): ctable also holds the threshold rows (1|2, 2|3, ...); the
# exponentiated value of a threshold is not an odds ratio.
result <- cbind(
  Estimate = ctable[, "Value"],
  "Odds Ratio" = exp(ctable[, "Value"]),
  "p value" = p_value
)
round(result, digits = 4)
## Estimate Odds Ratio p value
## Age_log 0.5658 1.7608 0.0000
## Flight.Distance 0.0000 1.0000 0.1198
## Departure.Delay.in.Minutes -0.0042 0.9958 0.1211
## Arrival.Delay.in.Minutes_log -0.0094 0.9906 0.6556
## ClassEco -0.5105 0.6002 0.0000
## ClassEco Plus -0.6649 0.5143 0.0000
## Type.of.TravelPersonal Travel -0.0759 0.9269 0.1580
## GenderMale -0.2167 0.8052 0.0000
## Customer.TypeLoyal Customer 0.4950 1.6406 0.0000
## Food.and.drink1 -2.9745 0.0511 0.0000
## Food.and.drink2 -0.8266 0.4375 0.0000
## Food.and.drink3 -0.1180 0.8887 0.0011
## Food.and.drink4 0.6857 1.9852 0.0000
## Food.and.drink5 2.0990 8.1582 0.0000
## Inflight.wifi.service1 -0.1919 0.8254 0.0000
## Inflight.wifi.service2 -0.3072 0.7355 0.0000
## Inflight.wifi.service3 -0.2625 0.7691 0.0000
## Inflight.wifi.service4 -0.0278 0.9725 0.4713
## Inflight.wifi.service5 0.1645 1.1788 0.0014
## 1|2 -1.0440 0.3520 0.0000
## 2|3 0.4573 1.5798 0.0000
## 3|4 1.6625 5.2724 0.0000
## 4|5 3.5277 34.0454 0.0000
# Intercept-only (null) ordinal model, used as the reference for the overall
# likelihood-ratio test of the final model.
model_null <- polr(
Seat.comfort ~ 1,
data = df_model,
Hess = TRUE
)
# Likelihood-ratio test: null model versus the final model.
anova(model_null, model_ordinallogisticregression)
## Likelihood ratio tests of ordinal regression models
##
## Response: Seat.comfort
## Model
## 1 1
## 2 Age_log + Flight.Distance + Departure.Delay.in.Minutes + Arrival.Delay.in.Minutes_log + Class + Type.of.Travel + Gender + Customer.Type + Food.and.drink + Inflight.wifi.service
## Resid. df Resid. Dev Test Df LR stat. Pr(Chi)
## 1 9992 30966.45
## 2 9973 25603.67 1 vs 2 19 5362.782 0
# Compare the null and the final model by AIC (a lower value indicates the
# better trade-off between fit and complexity).
AIC(model_null, model_ordinallogisticregression)
## df AIC
## model_null 4 30974.45
## model_ordinallogisticregression 23 25649.67