Tugas Modul 4 - Ordinal Logistic Regression

Dataset: Airline Passenger Satisfaction. Link data: https://www.kaggle.com/datasets/teejmahal20/airline-passenger-satisfaction

library(MASS)
## Warning: package 'MASS' was built under R version 4.5.3
library(car)
## Warning: package 'car' was built under R version 4.5.3
## Loading required package: carData
## Warning: package 'carData' was built under R version 4.5.3
library(ggplot2)
## Warning: package 'ggplot2' was built under R version 4.5.3
library(tidyverse)
## Warning: package 'tidyverse' was built under R version 4.5.3
## Warning: package 'tibble' was built under R version 4.5.3
## Warning: package 'tidyr' was built under R version 4.5.3
## Warning: package 'readr' was built under R version 4.5.3
## Warning: package 'purrr' was built under R version 4.5.3
## Warning: package 'dplyr' was built under R version 4.5.3
## Warning: package 'forcats' was built under R version 4.5.3
## Warning: package 'lubridate' was built under R version 4.5.3
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr     1.2.1     ✔ readr     2.2.0
## ✔ forcats   1.0.1     ✔ stringr   1.6.0
## ✔ lubridate 1.9.5     ✔ tibble    3.3.1
## ✔ purrr     1.2.2     ✔ tidyr     1.3.2
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
## ✖ dplyr::recode() masks car::recode()
## ✖ dplyr::select() masks MASS::select()
## ✖ purrr::some()   masks car::some()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(corrplot)
## Warning: package 'corrplot' was built under R version 4.5.3
## corrplot 0.95 loaded
library(gridExtra)
## Warning: package 'gridExtra' was built under R version 4.5.3
## 
## Attaching package: 'gridExtra'
## 
## The following object is masked from 'package:dplyr':
## 
##     combine
library(nnet)
## Warning: package 'nnet' was built under R version 4.5.3
library(lmtest)
## Warning: package 'lmtest' was built under R version 4.5.3
## Loading required package: zoo
## Warning: package 'zoo' was built under R version 4.5.3
## 
## Attaching package: 'zoo'
## 
## The following objects are masked from 'package:base':
## 
##     as.Date, as.Date.numeric
library(dplyr)
library(haven)
## Warning: package 'haven' was built under R version 4.5.3
# Load the training split of the airline passenger satisfaction data from the
# working directory.
df <- read.csv("train.csv")
# Preview the first rows to check how the columns were parsed.
head(df)
##   X     id Gender     Customer.Type Age  Type.of.Travel    Class
## 1 0  70172   Male    Loyal Customer  13 Personal Travel Eco Plus
## 2 1   5047   Male disloyal Customer  25 Business travel Business
## 3 2 110028 Female    Loyal Customer  26 Business travel Business
## 4 3  24026 Female    Loyal Customer  25 Business travel Business
## 5 4 119299   Male    Loyal Customer  61 Business travel Business
## 6 5 111157 Female    Loyal Customer  26 Personal Travel      Eco
##   Flight.Distance Inflight.wifi.service Departure.Arrival.time.convenient
## 1             460                     3                                 4
## 2             235                     3                                 2
## 3            1142                     2                                 2
## 4             562                     2                                 5
## 5             214                     3                                 3
## 6            1180                     3                                 4
##   Ease.of.Online.booking Gate.location Food.and.drink Online.boarding
## 1                      3             1              5               3
## 2                      3             3              1               3
## 3                      2             2              5               5
## 4                      5             5              2               2
## 5                      3             3              4               5
## 6                      2             1              1               2
##   Seat.comfort Inflight.entertainment On.board.service Leg.room.service
## 1            5                      5                4                3
## 2            1                      1                1                5
## 3            5                      5                4                3
## 4            2                      2                2                5
## 5            5                      3                3                4
## 6            1                      1                3                4
##   Baggage.handling Checkin.service Inflight.service Cleanliness
## 1                4               4                5           5
## 2                3               1                4           1
## 3                4               4                4           5
## 4                3               1                4           2
## 5                4               3                3           3
## 6                4               4                4           1
##   Departure.Delay.in.Minutes Arrival.Delay.in.Minutes            satisfaction
## 1                         25                       18 neutral or dissatisfied
## 2                          1                        6 neutral or dissatisfied
## 3                          0                        0               satisfied
## 4                         11                        9 neutral or dissatisfied
## 5                          0                        0               satisfied
## 6                          0                        0 neutral or dissatisfied
# Count missing values per column of the raw data.
colSums(is.na(df))
##                                 X                                id 
##                                 0                                 0 
##                            Gender                     Customer.Type 
##                                 0                                 0 
##                               Age                    Type.of.Travel 
##                                 0                                 0 
##                             Class                   Flight.Distance 
##                                 0                                 0 
##             Inflight.wifi.service Departure.Arrival.time.convenient 
##                                 0                                 0 
##            Ease.of.Online.booking                     Gate.location 
##                                 0                                 0 
##                    Food.and.drink                   Online.boarding 
##                                 0                                 0 
##                      Seat.comfort            Inflight.entertainment 
##                                 0                                 0 
##                  On.board.service                  Leg.room.service 
##                                 0                                 0 
##                  Baggage.handling                   Checkin.service 
##                                 0                                 0 
##                  Inflight.service                       Cleanliness 
##                                 0                                 0 
##        Departure.Delay.in.Minutes          Arrival.Delay.in.Minutes 
##                                 0                               310 
##                      satisfaction 
##                                 0
# Show the storage type and leading values of every column.
str(df)
## 'data.frame':    103904 obs. of  25 variables:
##  $ X                                : int  0 1 2 3 4 5 6 7 8 9 ...
##  $ id                               : int  70172 5047 110028 24026 119299 111157 82113 96462 79485 65725 ...
##  $ Gender                           : chr  "Male" "Male" "Female" "Female" ...
##  $ Customer.Type                    : chr  "Loyal Customer" "disloyal Customer" "Loyal Customer" "Loyal Customer" ...
##  $ Age                              : int  13 25 26 25 61 26 47 52 41 20 ...
##  $ Type.of.Travel                   : chr  "Personal Travel" "Business travel" "Business travel" "Business travel" ...
##  $ Class                            : chr  "Eco Plus" "Business" "Business" "Business" ...
##  $ Flight.Distance                  : int  460 235 1142 562 214 1180 1276 2035 853 1061 ...
##  $ Inflight.wifi.service            : int  3 3 2 2 3 3 2 4 1 3 ...
##  $ Departure.Arrival.time.convenient: int  4 2 2 5 3 4 4 3 2 3 ...
##  $ Ease.of.Online.booking           : int  3 3 2 5 3 2 2 4 2 3 ...
##  $ Gate.location                    : int  1 3 2 5 3 1 3 4 2 4 ...
##  $ Food.and.drink                   : int  5 1 5 2 4 1 2 5 4 2 ...
##  $ Online.boarding                  : int  3 3 5 2 5 2 2 5 3 3 ...
##  $ Seat.comfort                     : int  5 1 5 2 5 1 2 5 3 3 ...
##  $ Inflight.entertainment           : int  5 1 5 2 3 1 2 5 1 2 ...
##  $ On.board.service                 : int  4 1 4 2 3 3 3 5 1 2 ...
##  $ Leg.room.service                 : int  3 5 3 5 4 4 3 5 2 3 ...
##  $ Baggage.handling                 : int  4 3 4 3 4 4 4 5 1 4 ...
##  $ Checkin.service                  : int  4 1 4 1 3 4 3 4 4 4 ...
##  $ Inflight.service                 : int  5 4 4 4 3 4 5 5 1 3 ...
##  $ Cleanliness                      : int  5 1 5 2 3 1 2 4 2 2 ...
##  $ Departure.Delay.in.Minutes       : int  25 1 0 11 0 0 9 4 0 0 ...
##  $ Arrival.Delay.in.Minutes         : num  18 6 0 9 0 0 23 0 0 0 ...
##  $ satisfaction                     : chr  "neutral or dissatisfied" "neutral or dissatisfied" "satisfied" "neutral or dissatisfied" ...
# Seed the RNG so the random 10,000-row subsample is reproducible.
set.seed(123)

# Keep the ordinal response (Seat.comfort) and the candidate predictors, drop
# rows with any missing value, then draw the working sample.
df_model <- slice_sample(
  na.omit(
    df[, c(
      "Seat.comfort",
      "Age",
      "Flight.Distance",
      "Departure.Delay.in.Minutes",
      "Arrival.Delay.in.Minutes",
      "Class",
      "Type.of.Travel",
      "Gender",
      "Customer.Type",
      "Food.and.drink",
      "Inflight.wifi.service"
    )]
  ),
  n = 10000
)
# Confirm that no missing values remain in the sample.
colSums(is.na(df_model))
##               Seat.comfort                        Age 
##                          0                          0 
##            Flight.Distance Departure.Delay.in.Minutes 
##                          0                          0 
##   Arrival.Delay.in.Minutes                      Class 
##                          0                          0 
##             Type.of.Travel                     Gender 
##                          0                          0 
##              Customer.Type             Food.and.drink 
##                          0                          0 
##      Inflight.wifi.service 
##                          0

Handling Outlier

#' Summarise IQR-rule outliers for one numeric variable
#'
#' A value counts as an outlier when it lies below Q1 - 1.5 * IQR or above
#' Q3 + 1.5 * IQR. Missing values are ignored both when computing the
#' quartiles and when counting, and the percentage is taken over the
#' non-missing observations so that it stays consistent with the count.
#'
#' @param x Numeric vector to screen.
#' @param var_name Label stored in the `Variabel` column of the result.
#' @return A one-row data frame with columns `Variabel`, `Batas_Bawah`
#'   (lower fence), `Batas_Atas` (upper fence), `Jumlah_Outlier` (count) and
#'   `Persen` (share of non-missing values, in percent).
detect_outliers <- function(x, var_name) {
  Q1  <- quantile(x, 0.25, na.rm = TRUE)
  Q3  <- quantile(x, 0.75, na.rm = TRUE)
  IQR_val <- Q3 - Q1
  lower <- Q1 - 1.5 * IQR_val
  upper <- Q3 + 1.5 * IQR_val
  n_out <- sum(x < lower | x > upper, na.rm = TRUE)
  # Denominator excludes NAs, matching the na.rm = TRUE used everywhere above;
  # length(x) would understate the percentage whenever x has missing values.
  n_obs <- sum(!is.na(x))
  data.frame(
    Variabel = var_name,
    Batas_Bawah = round(lower, 3), Batas_Atas = round(upper, 3),
    Jumlah_Outlier = n_out,
    Persen = round(n_out / n_obs * 100, 2))
}

# Continuous predictors that are screened for outliers.
continuous_vars_for_outlier_check <- df_model[, c(
  "Flight.Distance",
  "Departure.Delay.in.Minutes",
  "Arrival.Delay.in.Minutes"
)]

# Build one summary row per variable and stack the rows into a single table.
outlier_tbl <- do.call(
  rbind,
  unname(Map(
    detect_outliers,
    continuous_vars_for_outlier_check,
    names(continuous_vars_for_outlier_check)
  ))
)
rownames(outlier_tbl) <- NULL

knitr::kable(outlier_tbl, caption = "Deteksi Outlier")
Deteksi Outlier
Variabel Batas_Bawah Batas_Atas Jumlah_Outlier Persen
Flight.Distance -1530.25 3635.75 293 2.93
Departure.Delay.in.Minutes -18.00 30.00 1368 13.68
Arrival.Delay.in.Minutes -19.50 32.50 1326 13.26
# One boxplot per screened variable, drawn from the data before capping.
plots_before <- lapply(
  names(continuous_vars_for_outlier_check),
  function(v) {
    box_layer <- geom_boxplot(
      fill = "#4292C6",
      color = "#084594",
      outlier.color = "red",
      outlier.size = 2
    )
    centered_bold_title <- theme(
      plot.title = element_text(face = "bold", hjust = 0.5)
    )

    ggplot(continuous_vars_for_outlier_check, aes(y = .data[[v]])) +
      box_layer +
      ggtitle(v) +
      theme_minimal(base_size = 14) +
      centered_bold_title
  }
)

# Arrange the plots in a two-column grid under a shared heading.
grid.arrange(grobs = plots_before, ncol = 2, top = "Boxplot")

#' Cap a numeric vector at its IQR fences
#'
#' Values below Q1 - 1.5 * IQR are raised to that lower fence and values above
#' Q3 + 1.5 * IQR are lowered to that upper fence. Quartiles are computed with
#' missing values removed.
#'
#' @param x Numeric vector.
#' @return `x` with out-of-fence values replaced by the nearest fence.
cap_outliers <- function(x) {
  quartiles <- quantile(x, probs = c(0.25, 0.75), na.rm = TRUE, names = FALSE)
  fence_width <- 1.5 * (quartiles[2] - quartiles[1])
  fence_low <- quartiles[1] - fence_width
  fence_high <- quartiles[2] + fence_width

  x <- replace(x, x < fence_low, fence_low)
  replace(x, x > fence_high, fence_high)
}

# Columns whose extreme values are capped at the IQR fences.
cols_to_cap <- c(
  "Flight.Distance",
  "Departure.Delay.in.Minutes",
  "Arrival.Delay.in.Minutes"
)

# Apply the capping column by column, leaving every other column untouched.
df_model <- df_model %>%
  mutate(across(all_of(cols_to_cap), cap_outliers))

No Outlier

# Outlier counts per capped column: the "before" counts come from the
# pre-capping copy, the "after" counts from the capped modelling frame.
hasil_perbandingan <- data.frame(
  Variabel = cols_to_cap,
  Outlier_Sebelum = vapply(
    cols_to_cap,
    function(v_name) {
      detect_outliers(
        continuous_vars_for_outlier_check[[v_name]], v_name
      )$Jumlah_Outlier
    },
    integer(1),
    USE.NAMES = FALSE
  ),
  Outlier_Sesudah = vapply(
    cols_to_cap,
    function(v_name) {
      detect_outliers(df_model[[v_name]], v_name)$Jumlah_Outlier
    },
    integer(1),
    USE.NAMES = FALSE
  )
)
knitr::kable(
  hasil_perbandingan,
  caption = "Perbandingan Outlier Sebelum dan Sesudah Capping"
)
Perbandingan Outlier Sebelum dan Sesudah Capping
Variabel Outlier_Sebelum Outlier_Sesudah
Flight.Distance 293 0
Departure.Delay.in.Minutes 1368 0
Arrival.Delay.in.Minutes 1326 0
# One boxplot per capped column, drawn from the data after capping.
plots_after <- lapply(
  cols_to_cap,
  function(v) {
    box_layer <- geom_boxplot(
      fill = "#9ecae1",
      color = "#08519c",
      outlier.color = "red",
      outlier.size = 2
    )
    centered_bold_title <- theme(
      plot.title = element_text(face = "bold", hjust = 0.5)
    )

    ggplot(df_model, aes(y = .data[[v]])) +
      box_layer +
      ggtitle(v) +
      theme_minimal(base_size = 12) +
      centered_bold_title
  }
)

# Arrange the plots in a two-column grid under a shared heading.
grid.arrange(
  grobs = plots_after,
  ncol = 2,
  top = "Boxplot (Sesudah Capping)"
)

# Drop fully duplicated rows, then prepare the modelling columns in one pass:
#  - shift both delay columns by +1 so that zero delays become 1,
#  - turn the response into an ordered factor and keep its numeric codes in
#    y_num,
#  - store the categorical predictors and the two rating predictors as factors.
df_model <- unique(df_model) %>%
  mutate(
    across(
      c(Departure.Delay.in.Minutes, Arrival.Delay.in.Minutes),
      function(delay) delay + 1
    ),
    Seat.comfort = as.ordered(Seat.comfort),
    y_num = as.numeric(Seat.comfort),
    across(
      c(
        Class, Type.of.Travel, Gender, Customer.Type,
        Food.and.drink, Inflight.wifi.service
      ),
      as.factor
    )
  )
summary(df_model)
##  Seat.comfort      Age       Flight.Distance Departure.Delay.in.Minutes
##  1:1146       Min.   : 7.0   Min.   :  31    Min.   : 1.00             
##  2:1471       1st Qu.:27.0   1st Qu.: 407    1st Qu.: 1.00             
##  3:1815       Median :40.0   Median : 842    Median : 1.00             
##  4:3074       Mean   :39.2   Mean   :1166    Mean   : 8.34             
##  5:2490       3rd Qu.:50.0   3rd Qu.:1698    3rd Qu.:13.00             
##               Max.   :80.0   Max.   :3636    Max.   :31.00             
##  Arrival.Delay.in.Minutes      Class              Type.of.Travel    Gender    
##  Min.   : 1.000           Business:4720   Business travel:6923   Female:5069  
##  1st Qu.: 1.000           Eco     :4599   Personal Travel:3073   Male  :4927  
##  Median : 1.000           Eco Plus: 677                                       
##  Mean   : 8.853                                                               
##  3rd Qu.:14.000                                                               
##  Max.   :33.500                                                               
##            Customer.Type  Food.and.drink Inflight.wifi.service     y_num      
##  disloyal Customer:1864   0:  11         0: 271                Min.   :1.000  
##  Loyal Customer   :8132   1:1221         1:1700                1st Qu.:2.000  
##                           2:2135         2:2489                Median :4.000  
##                           3:2206         3:2531                Mean   :3.429  
##                           4:2357         4:1926                3rd Qu.:4.000  
##                           5:2066         5:1079                Max.   :5.000
# Install psych only when it is missing. requireNamespace() checks
# availability without attaching the package; the library() call below then
# attaches it and fails loudly if the package still cannot be loaded.
if (!requireNamespace("psych", quietly = TRUE)) install.packages("psych")
library(psych)
## Warning: package 'psych' was built under R version 4.5.3
## 
## Attaching package: 'psych'
## The following objects are masked from 'package:ggplot2':
## 
##     %+%, alpha
## The following object is masked from 'package:car':
## 
##     logit
# Descriptive statistics for the continuous modelling variables.
describe(df_model[, c("Age", "Flight.Distance", "Departure.Delay.in.Minutes", "Arrival.Delay.in.Minutes")])
##                            vars    n    mean     sd median trimmed    mad min
## Age                           1 9996   39.20  14.96     40   39.16  17.79   7
## Flight.Distance               2 9996 1166.34 970.92    842 1024.69 757.61  31
## Departure.Delay.in.Minutes    3 9996    8.34  11.16      1    6.43   0.00   1
## Arrival.Delay.in.Minutes      4 9996    8.85  11.89      1    6.76   0.00   1
##                                max   range skew kurtosis   se
## Age                          80.00   73.00 0.02    -0.71 0.15
## Flight.Distance            3635.75 3604.75 1.09     0.15 9.71
## Departure.Delay.in.Minutes   31.00   30.00 1.23    -0.16 0.11
## Arrival.Delay.in.Minutes     33.50   32.50 1.25    -0.06 0.12

Uji Asumsi Ordinal Logistic Regression

# Log-transform Age and Arrival.Delay.in.Minutes; the results are stored as
# new columns so the original values stay available.
df_model <- df_model %>%
  mutate(
    Age_log = log(Age),
    Arrival.Delay.in.Minutes_log = log(Arrival.Delay.in.Minutes)
  )

Interpretasi Variabel

# First ordinal (proportional-odds) fit: continuous and categorical
# predictors only, without the two rating predictors. Hess = TRUE keeps the
# Hessian so summary() can report standard errors.
model_olr <- polr(
  Seat.comfort ~
    Age_log +
    Flight.Distance +
    Departure.Delay.in.Minutes +
    Arrival.Delay.in.Minutes_log +
    Class +
    Type.of.Travel +
    Gender +
    Customer.Type,
  data = df_model,
  Hess = TRUE
)

summary(model_olr)
## Call:
## polr(formula = Seat.comfort ~ Age_log + Flight.Distance + Departure.Delay.in.Minutes + 
##     Arrival.Delay.in.Minutes_log + Class + Type.of.Travel + Gender + 
##     Customer.Type, data = df_model, Hess = TRUE)
## 
## Coefficients:
##                                    Value Std. Error  t value
## Age_log                        4.709e-01  1.820e-02  25.8678
## Flight.Distance                3.808e-05  2.545e-05   1.4958
## Departure.Delay.in.Minutes    -2.403e-03  2.605e-03  -0.9222
## Arrival.Delay.in.Minutes_log  -3.378e-02  2.033e-02  -1.6616
## ClassEco                      -5.264e-01  4.736e-02 -11.1163
## ClassEco Plus                 -6.324e-01  7.704e-02  -8.2084
## Type.of.TravelPersonal Travel -1.853e-01  5.232e-02  -3.5417
## GenderMale                    -1.448e-01  3.566e-02  -4.0612
## Customer.TypeLoyal Customer    5.757e-01  5.589e-02  10.3021
## 
## Intercepts:
##     Value    Std. Error t value 
## 1|2  -0.4155   0.0091   -45.8215
## 2|3   0.6413   0.0269    23.8564
## 3|4   1.5127   0.0317    47.7605
## 4|5   2.9221   0.0378    77.3862
## 
## Residual Deviance: 30145.72 
## AIC: 30171.72

Variabel usia, kelas penerbangan, tipe perjalanan, gender, dan tipe pelanggan berpengaruh signifikan terhadap tingkat kenyamanan kursi (Seat Comfort) pada taraf signifikansi 5% (|t| > 1,96). Sementara itu, jarak penerbangan, keterlambatan keberangkatan, dan keterlambatan kedatangan (|t| = 1,66) tidak menunjukkan pengaruh signifikan.

Linearity (p-value > 0.05)

# Linearity check in the spirit of Box-Tidwell: every continuous predictor
# enters together with an x * log(x) companion term, and the p-values of the
# companion terms are inspected in the summary.
# NOTE(review): glm() is called without a `family` argument, so this fits a
# Gaussian linear model to the numeric scores y_num (the summary reports the
# gaussian family) rather than a model on the logit scale -- confirm that this
# is the intended test for the ordinal model.
# NOTE(review): Departure.Delay.in.Minutes was already shifted by +1 earlier,
# so log(. + 1) shifts it a second time, and the arrival-delay term also uses
# log(x + 1) instead of log(x) -- presumably to avoid log(0); verify.
model_lin <- glm(
  y_num ~ Age_log + Flight.Distance +
    Departure.Delay.in.Minutes +
    Arrival.Delay.in.Minutes_log +
    I(Age_log * log(Age_log)) +
    I(Flight.Distance * log(Flight.Distance)) +
    I(Departure.Delay.in.Minutes * log(Departure.Delay.in.Minutes + 1)) +
    I(Arrival.Delay.in.Minutes_log * log(Arrival.Delay.in.Minutes_log + 1)),
  data = df_model
)

summary(model_lin)
## 
## Call:
## glm(formula = y_num ~ Age_log + Flight.Distance + Departure.Delay.in.Minutes + 
##     Arrival.Delay.in.Minutes_log + I(Age_log * log(Age_log)) + 
##     I(Flight.Distance * log(Flight.Distance)) + I(Departure.Delay.in.Minutes * 
##     log(Departure.Delay.in.Minutes + 1)) + I(Arrival.Delay.in.Minutes_log * 
##     log(Arrival.Delay.in.Minutes_log + 1)), data = df_model)
## 
## Coefficients:
##                                                                           Estimate
## (Intercept)                                                              2.615e+00
## Age_log                                                                 -2.200e-01
## Flight.Distance                                                         -2.973e-04
## Departure.Delay.in.Minutes                                               1.415e-02
## Arrival.Delay.in.Minutes_log                                             4.922e-02
## I(Age_log * log(Age_log))                                                3.231e-01
## I(Flight.Distance * log(Flight.Distance))                                5.580e-05
## I(Departure.Delay.in.Minutes * log(Departure.Delay.in.Minutes + 1))     -4.238e-03
## I(Arrival.Delay.in.Minutes_log * log(Arrival.Delay.in.Minutes_log + 1)) -5.469e-02
##                                                                         Std. Error
## (Intercept)                                                              8.966e-01
## Age_log                                                                  5.992e-01
## Flight.Distance                                                          3.177e-04
## Departure.Delay.in.Minutes                                               1.573e-02
## Arrival.Delay.in.Minutes_log                                             7.245e-02
## I(Age_log * log(Age_log))                                                2.725e-01
## I(Flight.Distance * log(Flight.Distance))                                3.835e-05
## I(Departure.Delay.in.Minutes * log(Departure.Delay.in.Minutes + 1))      4.532e-03
## I(Arrival.Delay.in.Minutes_log * log(Arrival.Delay.in.Minutes_log + 1))  5.344e-02
##                                                                         t value
## (Intercept)                                                               2.916
## Age_log                                                                  -0.367
## Flight.Distance                                                          -0.936
## Departure.Delay.in.Minutes                                                0.900
## Arrival.Delay.in.Minutes_log                                              0.679
## I(Age_log * log(Age_log))                                                 1.186
## I(Flight.Distance * log(Flight.Distance))                                 1.455
## I(Departure.Delay.in.Minutes * log(Departure.Delay.in.Minutes + 1))      -0.935
## I(Arrival.Delay.in.Minutes_log * log(Arrival.Delay.in.Minutes_log + 1))  -1.023
##                                                                         Pr(>|t|)
## (Intercept)                                                              0.00355
## Age_log                                                                  0.71355
## Flight.Distance                                                          0.34943
## Departure.Delay.in.Minutes                                               0.36827
## Arrival.Delay.in.Minutes_log                                             0.49690
## I(Age_log * log(Age_log))                                                0.23578
## I(Flight.Distance * log(Flight.Distance))                                0.14567
## I(Departure.Delay.in.Minutes * log(Departure.Delay.in.Minutes + 1))      0.34973
## I(Arrival.Delay.in.Minutes_log * log(Arrival.Delay.in.Minutes_log + 1))  0.30610
##                                                                           
## (Intercept)                                                             **
## Age_log                                                                   
## Flight.Distance                                                           
## Departure.Delay.in.Minutes                                                
## Arrival.Delay.in.Minutes_log                                              
## I(Age_log * log(Age_log))                                                 
## I(Flight.Distance * log(Flight.Distance))                                 
## I(Departure.Delay.in.Minutes * log(Departure.Delay.in.Minutes + 1))       
## I(Arrival.Delay.in.Minutes_log * log(Arrival.Delay.in.Minutes_log + 1))   
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for gaussian family taken to be 1.638498)
## 
##     Null deviance: 17247  on 9995  degrees of freedom
## Residual deviance: 16364  on 9987  degrees of freedom
## AIC: 33314
## 
## Number of Fisher Scoring iterations: 2

Berdasarkan uji Box-Tidwell, seluruh variabel numerik memiliki nilai p-value > 0.05 pada komponen interaksi dengan logaritmanya. Hal ini menunjukkan bahwa hubungan antara prediktor kontinu dengan logit bersifat linear, sehingga asumsi linearitas pada model Ordinal Logistic Regression telah terpenuhi.

Independence Of Observation

# List the columns currently held in the modelling frame.
names(df_model)
##  [1] "Seat.comfort"                 "Age"                         
##  [3] "Flight.Distance"              "Departure.Delay.in.Minutes"  
##  [5] "Arrival.Delay.in.Minutes"     "Class"                       
##  [7] "Type.of.Travel"               "Gender"                      
##  [9] "Customer.Type"                "Food.and.drink"              
## [11] "Inflight.wifi.service"        "y_num"                       
## [13] "Age_log"                      "Arrival.Delay.in.Minutes_log"
# Count fully duplicated rows. Duplicates were already removed earlier with
# !duplicated(df_model), so a count of zero is expected here.
sum(duplicated(df_model))
## [1] 0
# Row count of the raw data frame df (not of the modelling sample df_model).
nrow(df)
## [1] 103904

Berdasarkan pemeriksaan terhadap data (setelah baris duplikat dihapus pada tahap praproses), tidak ditemukan observasi duplikat (sum duplicated = 0), sehingga setiap pengamatan dianggap independen. Dengan demikian, asumsi independence of observation pada model Ordinal Logistic Regression dianggap terpenuhi.

No Multicollinearity (GVIF<10)

# Correlation screen over the continuous predictors exactly as they enter the
# ordinal model: log-transformed age and arrival delay, flight distance, and
# departure delay. Departure delay is a model predictor and must be part of
# the screen, otherwise a strong departure/arrival-delay correlation would go
# unnoticed.
# NOTE: re-knit the report after this change so the plot and its
# interpretation reflect the full set of predictors.
cont_vars <- df_model %>%
  select(
    Age_log,
    Flight.Distance,
    Departure.Delay.in.Minutes,
    Arrival.Delay.in.Minutes_log
  )

cor_mat <- cor(cont_vars, use = "complete.obs")
corrplot(cor_mat,
         method      = "color",
         type        = "full",
         addCoef.col = "black",
         number.cex  = 0.8,
         tl.cex      = 0.9,
         tl.col      = "black",
         col         = colorRampPalette(c("#2166AC", "white", "#B2182B"))(200),
         title       = "Correlation Matrix Variabel Kontinu",
         mar         = c(0, 0, 2, 0))

Semua nilai korelasi antar variabel numerik (independen) tersebut berada jauh di bawah ambang batas kritis. Secara umum, multikolinearitas dianggap serius jika nilai korelasi antar variabel independen melebihi 0.7 atau 0.8. Dari correlation matrix di atas, variabel numerik berkorelasi < 0.1 sehingga asumsi No Multicollinearity terpenuhi.

# Auxiliary linear model used only to obtain (G)VIF values. Its right-hand
# side mirrors the predictors of the final ordinal model one-to-one (including
# Departure.Delay.in.Minutes and the log-transformed age and arrival delay),
# so the collinearity diagnostics describe the model that is actually fitted.
# NOTE: re-knit the report after this change so the printed GVIF table and its
# interpretation are refreshed.
model_vif <- lm(
  y_num ~ Age_log + Flight.Distance +
    Departure.Delay.in.Minutes +
    Arrival.Delay.in.Minutes_log +
    Class + Type.of.Travel +
    Gender + Customer.Type +
    Food.and.drink + Inflight.wifi.service,
  data = df_model
)
vif(model_vif)
##                              GVIF Df GVIF^(1/(2*Df))
## Age                      1.116954  1        1.056860
## Flight.Distance          1.349677  1        1.161756
## Arrival.Delay.in.Minutes 1.007879  1        1.003932
## Class                    1.819921  2        1.161484
## Type.of.Travel           1.841931  1        1.357178
## Gender                   1.003236  1        1.001617
## Customer.Type            1.451381  1        1.204733
## Food.and.drink           1.129553  5        1.012257
## Inflight.wifi.service    1.165839  5        1.015462

Berdasarkan hasil uji Variance Inflation Factor (VIF), seluruh variabel independen memiliki nilai GVIF yang berkisar antara 1,00 hingga 1,84. Karena nilai tersebut jauh di bawah ambang batas 10, maka dapat disimpulkan bahwa asumsi no-multicollinearity terpenuhi. Model tidak mengandung hubungan linear yang kuat antar variabel independen, sehingga estimasi parameter yang dihasilkan bersifat valid.

ORDINAL LOGISTIC REGRESSION

# Final ordinal (proportional-odds) model: the predictors of the first fit
# plus the two rating predictors entered as factors. Hess = TRUE keeps the
# Hessian so summary() can report standard errors.
model_ordinallogisticregression <- polr(
  Seat.comfort ~
    Age_log + Flight.Distance +
    Departure.Delay.in.Minutes + Arrival.Delay.in.Minutes_log +
    Class + Type.of.Travel + Gender + Customer.Type +
    Food.and.drink + Inflight.wifi.service,
  data = df_model,
  Hess = TRUE
)
summary(model_ordinallogisticregression)
## Call:
## polr(formula = Seat.comfort ~ Age_log + Flight.Distance + Departure.Delay.in.Minutes + 
##     Arrival.Delay.in.Minutes_log + Class + Type.of.Travel + Gender + 
##     Customer.Type + Food.and.drink + Inflight.wifi.service, data = df_model, 
##     Hess = TRUE)
## 
## Coefficients:
##                                    Value Std. Error  t value
## Age_log                        5.658e-01  1.955e-02  28.9429
## Flight.Distance                4.027e-05  2.588e-05   1.5557
## Departure.Delay.in.Minutes    -4.183e-03  2.698e-03  -1.5503
## Arrival.Delay.in.Minutes_log  -9.397e-03  2.107e-02  -0.4460
## ClassEco                      -5.105e-01  4.858e-02 -10.5087
## ClassEco Plus                 -6.649e-01  7.763e-02  -8.5646
## Type.of.TravelPersonal Travel -7.592e-02  5.378e-02  -1.4118
## GenderMale                    -2.167e-01  3.716e-02  -5.8315
## Customer.TypeLoyal Customer    4.950e-01  5.597e-02   8.8454
## Food.and.drink1               -2.974e+00  5.811e-02 -51.1850
## Food.and.drink2               -8.266e-01  3.785e-02 -21.8409
## Food.and.drink3               -1.180e-01  3.611e-02  -3.2670
## Food.and.drink4                6.857e-01  3.681e-02  18.6287
## Food.and.drink5                2.099e+00  4.702e-02  44.6442
## Inflight.wifi.service1        -1.919e-01  4.145e-02  -4.6303
## Inflight.wifi.service2        -3.072e-01  3.568e-02  -8.6091
## Inflight.wifi.service3        -2.625e-01  3.544e-02  -7.4078
## Inflight.wifi.service4        -2.784e-02  3.865e-02  -0.7204
## Inflight.wifi.service5         1.645e-01  5.135e-02   3.2026
## 
## Intercepts:
##     Value     Std. Error t value  
## 1|2   -1.0440    0.0092  -113.7470
## 2|3    0.4573    0.0374    12.2348
## 3|4    1.6625    0.0445    37.3873
## 4|5    3.5277    0.0533    66.2369
## 
## Residual Deviance: 25603.67 
## AIC: 25649.67
# Coefficient table (slopes and cut-points) of the final model.
ctable <- coef(summary(model_ordinallogisticregression))
# Two-sided p-values from the normal approximation to the reported t values.
p_value <- 2 * pnorm(abs(ctable[, "t value"]), lower.tail = FALSE)

# Estimates next to their exponentiated values and p-values.
result <- cbind(ctable[, "Value"], exp(ctable[, "Value"]), p_value)
colnames(result) <- c("Estimate", "Odds Ratio", "p value")

round(result, 4)
##                               Estimate Odds Ratio p value
## Age_log                         0.5658     1.7608  0.0000
## Flight.Distance                 0.0000     1.0000  0.1198
## Departure.Delay.in.Minutes     -0.0042     0.9958  0.1211
## Arrival.Delay.in.Minutes_log   -0.0094     0.9906  0.6556
## ClassEco                       -0.5105     0.6002  0.0000
## ClassEco Plus                  -0.6649     0.5143  0.0000
## Type.of.TravelPersonal Travel  -0.0759     0.9269  0.1580
## GenderMale                     -0.2167     0.8052  0.0000
## Customer.TypeLoyal Customer     0.4950     1.6406  0.0000
## Food.and.drink1                -2.9745     0.0511  0.0000
## Food.and.drink2                -0.8266     0.4375  0.0000
## Food.and.drink3                -0.1180     0.8887  0.0011
## Food.and.drink4                 0.6857     1.9852  0.0000
## Food.and.drink5                 2.0990     8.1582  0.0000
## Inflight.wifi.service1         -0.1919     0.8254  0.0000
## Inflight.wifi.service2         -0.3072     0.7355  0.0000
## Inflight.wifi.service3         -0.2625     0.7691  0.0000
## Inflight.wifi.service4         -0.0278     0.9725  0.4713
## Inflight.wifi.service5          0.1645     1.1788  0.0014
## 1|2                            -1.0440     0.3520  0.0000
## 2|3                             0.4573     1.5798  0.0000
## 3|4                             1.6625     5.2724  0.0000
## 4|5                             3.5277    34.0454  0.0000
# Intercept-only model that serves as the baseline for the overall test.
model_null <- polr(Seat.comfort ~ 1, data = df_model, Hess = TRUE)
# Likelihood-ratio test of the baseline against the final ordinal model.
anova(model_null, model_ordinallogisticregression)
## Likelihood ratio tests of ordinal regression models
## 
## Response: Seat.comfort
##                                                                                                                                                                              Model
## 1                                                                                                                                                                                1
## 2 Age_log + Flight.Distance + Departure.Delay.in.Minutes + Arrival.Delay.in.Minutes_log + Class + Type.of.Travel + Gender + Customer.Type + Food.and.drink + Inflight.wifi.service
##   Resid. df Resid. Dev   Test    Df LR stat. Pr(Chi)
## 1      9992   30966.45                              
## 2      9973   25603.67 1 vs 2    19 5362.782       0
# Compare the two fits by AIC.
AIC(model_null, model_ordinallogisticregression)
##                                 df      AIC
## model_null                       4 30974.45
## model_ordinallogisticregression 23 25649.67