1. Load Dataset

library(readxl)
# Read the "satisfaction_v2" sheet of the Excel workbook into `df`, then
# preview the first six rows.
df <- read_excel(path = "satisfaction.xlsx", sheet = "satisfaction_v2")
head(df, n = 6L)
## # A tibble: 6 × 24
##       id satisfaction_v2 Gender `Customer Type`   Age `Type of Travel` Class   
##    <dbl> <chr>           <chr>  <chr>           <dbl> <chr>            <chr>   
## 1  11112 satisfied       Female Loyal Customer     65 Personal Travel  Eco     
## 2 110278 satisfied       Male   Loyal Customer     47 Personal Travel  Business
## 3 103199 satisfied       Female Loyal Customer     15 Personal Travel  Eco     
## 4  47462 satisfied       Female Loyal Customer     60 Personal Travel  Eco     
## 5 120011 satisfied       Female Loyal Customer     70 Personal Travel  Eco     
## 6 100744 satisfied       Male   Loyal Customer     30 Personal Travel  Eco     
## # ℹ 17 more variables: `Flight Distance` <dbl>, `Seat comfort` <dbl>,
## #   `Departure/Arrival time convenient` <dbl>, `Food and drink` <dbl>,
## #   `Gate location` <dbl>, `Inflight wifi service` <dbl>,
## #   `Inflight entertainment` <dbl>, `Online support` <dbl>,
## #   `Ease of Online booking` <dbl>, `On-board service` <dbl>,
## #   `Leg room service` <dbl>, `Baggage handling` <dbl>,
## #   `Checkin service` <dbl>, Cleanliness <dbl>, `Online boarding` <dbl>, …

2. Preprocessing

# Handle missing values: first count the NA entries in every column
colSums(x = is.na(df), na.rm = FALSE)
##                                id                   satisfaction_v2 
##                                 0                                 0 
##                            Gender                     Customer Type 
##                                 0                                 0 
##                               Age                    Type of Travel 
##                                 0                                 0 
##                             Class                   Flight Distance 
##                                 0                                 0 
##                      Seat comfort Departure/Arrival time convenient 
##                                 0                                 0 
##                    Food and drink                     Gate location 
##                                 0                                 0 
##             Inflight wifi service            Inflight entertainment 
##                                 0                                 0 
##                    Online support            Ease of Online booking 
##                                 0                                 0 
##                  On-board service                  Leg room service 
##                                 0                                 0 
##                  Baggage handling                   Checkin service 
##                                 0                                 0 
##                       Cleanliness                   Online boarding 
##                                 0                                 0 
##        Departure Delay in Minutes          Arrival Delay in Minutes 
##                                 0                               393
# Remove every row that contains at least one NA, then report the remaining
# number of rows and columns.
df <- stats::na.omit(df)
dim(df)
## [1] 129487     24
# Turn the target into a binary variable: 1 when satisfaction_v2 equals
# "satisfied", 0 otherwise. Then show the class counts.
df$satisfaction <- as.numeric(df$satisfaction_v2 == "satisfied")
table(df$satisfaction)
## 
##     0     1 
## 58605 70882
# Label-encode the categorical columns: each category is replaced by the
# numeric code of its factor level. Gender and Class are overwritten in place;
# the two columns whose names contain spaces are written to new dot-named
# columns (the original columns are left untouched).
df[["Gender"]] <- as.numeric(factor(df[["Gender"]]))
df[["Customer.Type"]] <- as.numeric(factor(df[["Customer Type"]]))
df[["Type.of.Travel"]] <- as.numeric(factor(df[["Type of Travel"]]))
df[["Class"]] <- as.numeric(factor(df[["Class"]]))

# Inspect the structure of the four encoded columns
str(df[, c("Gender", "Customer.Type", "Type.of.Travel", "Class")])
## tibble [129,487 × 4] (S3: tbl_df/tbl/data.frame)
##  $ Gender        : num [1:129487] 1 2 1 1 1 2 1 2 1 2 ...
##  $ Customer.Type : num [1:129487] 2 2 2 2 2 2 2 2 2 2 ...
##  $ Type.of.Travel: num [1:129487] 2 2 2 2 2 2 2 2 2 2 ...
##  $ Class         : num [1:129487] 2 1 2 2 2 2 2 2 1 2 ...
##  - attr(*, "na.action")= 'omit' Named int [1:393] 146 247 711 737 819 1163 1268 1302 1911 2035 ...
##   ..- attr(*, "names")= chr [1:393] "146" "247" "711" "737" ...
# Standardize the numeric features with scale(): every listed column is
# centred to mean 0 and scaled to standard deviation 1.
# NOTE(review): the scaling uses the full data set, i.e. it runs before the
# train/test split performed later in this script -- confirm this is intended.
# NOTE(review): "Gate location" and "Age" are not in this list -- confirm the
# omission is deliberate.
num_cols <- c(
  "Flight Distance",
  "Seat comfort",
  "Departure/Arrival time convenient",
  "Food and drink",
  "Inflight wifi service",
  "Inflight entertainment",
  "Online support",
  "Ease of Online booking",
  "On-board service",
  "Leg room service",
  "Baggage handling",
  "Checkin service",
  "Cleanliness",
  "Online boarding",
  "Departure Delay in Minutes",
  "Arrival Delay in Minutes"
)
df[num_cols] <- scale(df[num_cols], center = TRUE, scale = TRUE)
summary(df[num_cols])
##  Flight Distance     Seat comfort     Departure/Arrival time convenient
##  Min.   :-1.88045   Min.   :-2.0379   Min.   :-1.958034                
##  1st Qu.:-0.60572   1st Qu.:-0.6021   1st Qu.:-0.648434                
##  Median :-0.05552   Median : 0.1159   Median : 0.006367                
##  Mean   : 0.00000   Mean   : 0.0000   Mean   : 0.000000                
##  3rd Qu.: 0.54728   3rd Qu.: 0.8338   3rd Qu.: 0.661167                
##  Max.   : 4.83988   Max.   : 1.5518   Max.   : 1.315967                
##  Food and drink    Inflight wifi service Inflight entertainment
##  Min.   :-1.9757   Min.   :-2.4638       Min.   :-2.5140       
##  1st Qu.:-0.5902   1st Qu.:-0.9472       1st Qu.:-1.0281       
##  Median : 0.1025   Median :-0.1889       Median : 0.4579       
##  Mean   : 0.0000   Mean   : 0.0000       Mean   : 0.0000       
##  3rd Qu.: 0.7952   3rd Qu.: 0.5694       3rd Qu.: 0.4579       
##  Max.   : 1.4879   Max.   : 1.3276       Max.   : 1.2008       
##  Online support    Ease of Online booking On-board service  Leg room service 
##  Min.   :-2.6946   Min.   :-2.6595        Min.   :-2.7268   Min.   :-2.6981  
##  1st Qu.:-0.3980   1st Qu.:-1.1276        1st Qu.:-0.3660   1st Qu.:-1.1502  
##  Median : 0.3675   Median : 0.4043        Median : 0.4209   Median : 0.3977  
##  Mean   : 0.0000   Mean   : 0.0000        Mean   : 0.0000   Mean   : 0.0000  
##  3rd Qu.: 1.1330   3rd Qu.: 1.1702        3rd Qu.: 0.4209   3rd Qu.: 1.1717  
##  Max.   : 1.1330   Max.   : 1.1702        Max.   : 1.2078   Max.   : 1.1717  
##  Baggage handling  Checkin service    Cleanliness      Online boarding  
##  Min.   :-2.3307   Min.   :-2.6502   Min.   :-3.2178   Min.   :-2.5816  
##  1st Qu.:-0.6014   1st Qu.:-0.2703   1st Qu.:-0.6129   1st Qu.:-1.0415  
##  Median : 0.2633   Median :-0.2703   Median : 0.2554   Median : 0.4986  
##  Mean   : 0.0000   Mean   : 0.0000   Mean   : 0.0000   Mean   : 0.0000  
##  3rd Qu.: 1.1280   3rd Qu.: 0.5230   3rd Qu.: 1.1237   3rd Qu.: 0.4986  
##  Max.   : 1.1280   Max.   : 1.3163   Max.   : 1.1237   Max.   : 1.2686  
##  Departure Delay in Minutes Arrival Delay in Minutes
##  Min.   :-0.38603           Min.   :-0.39233        
##  1st Qu.:-0.38603           1st Qu.:-0.39233        
##  Median :-0.38603           Median :-0.39233        
##  Mean   : 0.00000           Mean   : 0.00000        
##  3rd Qu.:-0.06969           3rd Qu.:-0.05436        
##  Max.   :41.58285           Max.   :40.78727
# Check for outliers with one boxplot per selected column.
# The columns were already standardized by scale() earlier in the script, so
# the value axis shows z-scores rather than raw units.
invisible(Map(
  function(column, title) boxplot(df[[column]], main = title),
  c("Flight Distance", "Departure Delay in Minutes", "Arrival Delay in Minutes"),
  c("Outlier - Flight Distance", "Outlier - Departure Delay", "Outlier - Arrival Delay")
))

3. Eksplorasi Data

library(ggplot2)
# Age distribution of the passengers (30-bin histogram)
ggplot(data = df, mapping = aes(x = Age)) +
  geom_histogram(bins = 30, fill = "steelblue", color = "black") +
  ggtitle("Distribusi Usia Penumpang") +
  xlab("Usia") +
  ylab("Frekuensi")

library(tidyr)
library(dplyr)
# Distribution of the numeric features: reshape them to long format (one row
# per feature/value pair) and draw one histogram facet per feature.
temp_num_data <- df[num_cols]
long_data <- temp_num_data %>%
  pivot_longer(cols = everything(), names_to = "Feature", values_to = "Value")

ggplot(data = long_data, mapping = aes(x = Value)) +
  geom_histogram(bins = 30, fill = "steelblue", color = "black") +
  facet_wrap(~ Feature, scales = "free", ncol = 4) +
  labs(title = "Distribusi Fitur Numerik", x = NULL, y = NULL) +
  theme_minimal(base_size = 10)

# Distribution of the (label-encoded) categorical features: reshape to long
# format and draw one bar-chart facet per feature.
cat_cols <- c("Gender", "Customer.Type", "Type.of.Travel", "Class")
cat_data <- df[cat_cols]
long_cat <- cat_data %>%
  pivot_longer(cols = everything(), names_to = "Feature", values_to = "Category")

ggplot(data = long_cat, mapping = aes(x = Category)) +
  geom_bar(fill = "cornflowerblue", color = "black") +
  facet_wrap(~ Feature, scales = "free", ncol = 3) +
  labs(title = "Distribusi Fitur Kategorikal", x = NULL, y = "Jumlah") +
  theme_minimal(base_size = 10) +
  # Must come after theme_minimal(), which would otherwise reset it
  theme(axis.text.x = element_text(angle = 45, hjust = 1))

# Target distribution: number of passengers in each satisfaction class
# (0 = not "satisfied", 1 = "satisfied").
ggplot(df, aes(x = factor(satisfaction), fill = factor(satisfaction))) +
  geom_bar() +
  labs(title = "Distribusi Frekuensi Kepuasan", x = "Satisfaction", y = "Jumlah") +
  # Named values pin each colour to its class level; the two classes get
  # distinct colours so the legend can actually tell them apart.
  scale_fill_manual(values = c("0" = "salmon", "1" = "lightgreen"),
                    name = "Satisfaction",
                    labels = c("neutral or dissatisfied", "satisfied")) +
  theme_minimal()

# Pairwise correlations between the numeric features, shown as a coloured
# upper-triangle matrix with coefficients, ordered by hierarchical clustering.
library(corrplot)
corr_matrix <- cor(df[num_cols])
corrplot(corr_matrix, method = "color", type = "upper", order = "hclust",
         tl.cex = 0.7, addCoef.col = "black", number.cex = 0.5)

# Boxplots of every numeric feature split by the target.
# The target is converted to a factor here; later steps in this script use
# `df$satisfaction` as a factor with levels "0" and "1".
df$satisfaction <- as.factor(df$satisfaction)
df_long <- pivot_longer(
  df,
  cols = all_of(num_cols),
  names_to = "Feature",
  values_to = "Value"
)

ggplot(data = df_long, mapping = aes(x = satisfaction, y = Value, fill = satisfaction)) +
  geom_boxplot(outlier.size = 0.5, outlier.alpha = 0.3) +
  facet_wrap(~ Feature, scales = "free", ncol = 3) +
  labs(
    title = "Hubungan Fitur Numerik dengan Kepuasan (Satisfaction)",
    x = "Satisfaction (0 = Tidak Puas, 1 = Puas)",
    y = "Nilai Fitur"
  ) +
  theme_minimal(base_size = 11) +
  # Fill duplicates the x axis, so the legend is hidden
  theme(legend.position = "none")

# Descriptive statistics of the numeric features (these columns were
# standardized earlier, so every mean is 0)
summary(object = df[num_cols])
##  Flight Distance     Seat comfort     Departure/Arrival time convenient
##  Min.   :-1.88045   Min.   :-2.0379   Min.   :-1.958034                
##  1st Qu.:-0.60572   1st Qu.:-0.6021   1st Qu.:-0.648434                
##  Median :-0.05552   Median : 0.1159   Median : 0.006367                
##  Mean   : 0.00000   Mean   : 0.0000   Mean   : 0.000000                
##  3rd Qu.: 0.54728   3rd Qu.: 0.8338   3rd Qu.: 0.661167                
##  Max.   : 4.83988   Max.   : 1.5518   Max.   : 1.315967                
##  Food and drink    Inflight wifi service Inflight entertainment
##  Min.   :-1.9757   Min.   :-2.4638       Min.   :-2.5140       
##  1st Qu.:-0.5902   1st Qu.:-0.9472       1st Qu.:-1.0281       
##  Median : 0.1025   Median :-0.1889       Median : 0.4579       
##  Mean   : 0.0000   Mean   : 0.0000       Mean   : 0.0000       
##  3rd Qu.: 0.7952   3rd Qu.: 0.5694       3rd Qu.: 0.4579       
##  Max.   : 1.4879   Max.   : 1.3276       Max.   : 1.2008       
##  Online support    Ease of Online booking On-board service  Leg room service 
##  Min.   :-2.6946   Min.   :-2.6595        Min.   :-2.7268   Min.   :-2.6981  
##  1st Qu.:-0.3980   1st Qu.:-1.1276        1st Qu.:-0.3660   1st Qu.:-1.1502  
##  Median : 0.3675   Median : 0.4043        Median : 0.4209   Median : 0.3977  
##  Mean   : 0.0000   Mean   : 0.0000        Mean   : 0.0000   Mean   : 0.0000  
##  3rd Qu.: 1.1330   3rd Qu.: 1.1702        3rd Qu.: 0.4209   3rd Qu.: 1.1717  
##  Max.   : 1.1330   Max.   : 1.1702        Max.   : 1.2078   Max.   : 1.1717  
##  Baggage handling  Checkin service    Cleanliness      Online boarding  
##  Min.   :-2.3307   Min.   :-2.6502   Min.   :-3.2178   Min.   :-2.5816  
##  1st Qu.:-0.6014   1st Qu.:-0.2703   1st Qu.:-0.6129   1st Qu.:-1.0415  
##  Median : 0.2633   Median :-0.2703   Median : 0.2554   Median : 0.4986  
##  Mean   : 0.0000   Mean   : 0.0000   Mean   : 0.0000   Mean   : 0.0000  
##  3rd Qu.: 1.1280   3rd Qu.: 0.5230   3rd Qu.: 1.1237   3rd Qu.: 0.4986  
##  Max.   : 1.1280   Max.   : 1.3163   Max.   : 1.1237   Max.   : 1.2686  
##  Departure Delay in Minutes Arrival Delay in Minutes
##  Min.   :-0.38603           Min.   :-0.39233        
##  1st Qu.:-0.38603           1st Qu.:-0.39233        
##  Median :-0.38603           Median :-0.39233        
##  Mean   : 0.00000           Mean   : 0.00000        
##  3rd Qu.:-0.06969           3rd Qu.:-0.05436        
##  Max.   :41.58285           Max.   :40.78727
# More complete descriptive statistics (sd, trimmed mean, mad, skew,
# kurtosis, ...) via psych::describe()
library(psych)
psych::describe(df[num_cols])
##                                   vars      n mean sd median trimmed  mad   min
## Flight Distance                      1 129487    0  1  -0.06   -0.04 0.86 -1.88
## Seat comfort                         2 129487    0  1   0.12    0.00 1.06 -2.04
## Departure/Arrival time convenient    3 129487    0  1   0.01    0.04 0.97 -1.96
## Food and drink                       4 129487    0  1   0.10    0.01 1.03 -1.98
## Inflight wifi service                5 129487    0  1  -0.19    0.05 1.12 -2.46
## Inflight entertainment               6 129487    0  1   0.46    0.09 1.10 -2.51
## Online support                       7 129487    0  1   0.37    0.10 1.13 -2.69
## Ease of Online booking               8 129487    0  1   0.40    0.09 1.14 -2.66
## On-board service                     9 129487    0  1   0.42    0.09 1.17 -2.73
## Leg room service                    10 129487    0  1   0.40    0.09 1.15 -2.70
## Baggage handling                    11 129487    0  1   0.26    0.11 1.28 -2.33
## Checkin service                     12 129487    0  1  -0.27    0.07 1.18 -2.65
## Cleanliness                         13 129487    0  1   0.26    0.11 1.29 -3.22
## Online boarding                     14 129487    0  1   0.50    0.07 1.14 -2.58
## Departure Delay in Minutes          15 129487    0  1  -0.39   -0.23 0.00 -0.39
## Arrival Delay in Minutes            16 129487    0  1  -0.39   -0.23 0.00 -0.39
##                                     max range  skew kurtosis se
## Flight Distance                    4.84  6.72  0.47     0.36  0
## Seat comfort                       1.55  3.59 -0.09    -0.94  0
## Departure/Arrival time convenient  1.32  3.27 -0.25    -1.09  0
## Food and drink                     1.49  3.46 -0.12    -0.99  0
## Inflight wifi service              1.33  3.79 -0.19    -1.12  0
## Inflight entertainment             1.20  3.71 -0.61    -0.53  0
## Online support                     1.13  3.83 -0.58    -0.81  0
## Ease of Online booking             1.17  3.83 -0.49    -0.91  0
## On-board service                   1.21  3.93 -0.51    -0.78  0
## Leg room service                   1.17  3.87 -0.50    -0.84  0
## Baggage handling                   1.13  3.46 -0.74    -0.24  0
## Checkin service                    1.32  3.97 -0.39    -0.79  0
## Cleanliness                        1.12  4.34 -0.76    -0.21  0
## Online boarding                    1.27  3.85 -0.37    -0.94  0
## Departure Delay in Minutes        41.58 41.97  6.85   101.88  0
## Arrival Delay in Minutes          40.79 41.18  6.67    95.11  0
# Descriptive statistics of the label-encoded categorical features:
# absolute counts followed by proportions for each column.
table(df[["Gender"]])
## 
##     1     2 
## 65703 63784
prop.table(table(df[["Gender"]]))
## 
##       1       2 
## 0.50741 0.49259
table(df[["Customer.Type"]])
## 
##      1      2 
##  23714 105773
prop.table(table(df[["Customer.Type"]]))
## 
##         1         2 
## 0.1831381 0.8168619
table(df[["Type.of.Travel"]])
## 
##     1     2 
## 89445 40042
prop.table(table(df[["Type.of.Travel"]]))
## 
##         1         2 
## 0.6907643 0.3092357
table(df[["Class"]])
## 
##     1     2     3 
## 61990 58117  9380
prop.table(table(df[["Class"]]))
## 
##         1         2         3 
## 0.4787353 0.4488250 0.0724397
# Numeric feature vs target: seat comfort by satisfaction class.
# The fill is mapped through the aesthetic and a manual scale instead of a
# fixed length-2 vector, so the plot keeps working when the number of boxes is
# not exactly two (e.g. an NA group); the legend is hidden because it would
# only repeat the x axis.
ggplot(df, aes(x = factor(satisfaction), y = `Seat comfort`,
               fill = factor(satisfaction))) +
  geom_boxplot(show.legend = FALSE) +
  scale_fill_manual(values = c("0" = "salmon", "1" = "lightgreen")) +
  labs(title = "Seat Comfort vs Kepuasan", x = "Kepuasan (0 = Tidak, 1 = Ya)", y = "Seat Comfort")

# Categorical feature vs target: share of each satisfaction class within
# every (label-encoded) gender, drawn as bars normalised to 1.
ggplot(data = df, mapping = aes(x = factor(Gender), fill = factor(satisfaction))) +
  geom_bar(position = "fill") +
  scale_fill_manual(name = "Satisfaction", values = c("red", "green"),
                    labels = c("Tidak", "Puas")) +
  labs(title = "Proporsi Kepuasan Berdasarkan Gender", x = "Gender", y = "Proporsi")

4. Feature Selection (EFA)

library(psych)
library(corpcor)

# Data for the exploratory factor analysis: the standardized numeric features
efa_data <- df[num_cols]

# Sampling adequacy: Kaiser-Meyer-Olkin measure (Bartlett's test follows)
KMO(efa_data)
## Kaiser-Meyer-Olkin factor adequacy
## Call: KMO(r = efa_data)
## Overall MSA =  0.74
## MSA for each item = 
##                   Flight Distance                      Seat comfort 
##                              0.71                              0.71 
## Departure/Arrival time convenient                    Food and drink 
##                              0.69                              0.65 
##             Inflight wifi service            Inflight entertainment 
##                              0.84                              0.78 
##                    Online support            Ease of Online booking 
##                              0.84                              0.77 
##                  On-board service                  Leg room service 
##                              0.85                              0.89 
##                  Baggage handling                   Checkin service 
##                              0.82                              0.73 
##                       Cleanliness                   Online boarding 
##                              0.80                              0.82 
##        Departure Delay in Minutes          Arrival Delay in Minutes 
##                              0.51                              0.51
# Bartlett's test on the correlation matrix of the EFA data
cortest.bartlett(R = cor(efa_data), n = nrow(efa_data))
## $chisq
## [1] 1066670
## 
## $p.value
## [1] 0
## 
## $df
## [1] 120
# Scree plot and parallel analysis to choose the number of factors.
# Parallel analysis compares against simulated data, so the RNG is seeded to
# make the plot and the suggested number of factors reproducible.
set.seed(123)
fa.parallel(efa_data, fa = "fa", n.iter = 100, show.legend = TRUE,
            main = "Scree Plot dan Parallel Analysis")

## Parallel analysis suggests that the number of factors =  6  and the number of components =  NA
# Run the EFA: maximum-likelihood extraction with six factors and a varimax
# rotation; loadings below 0.3 in absolute value are hidden when printing.
efa_result <- fa(r = efa_data, nfactors = 6, rotate = "varimax", fm = "ml")
print(x = efa_result, cut = 0.3)
## Factor Analysis using method =  ml
## Call: fa(r = efa_data, nfactors = 6, rotate = "varimax", fm = "ml")
## Standardized loadings (pattern matrix) based upon correlation matrix
##                                     ML1   ML5   ML2   ML4   ML3   ML6    h2
## Flight Distance                                                       0.017
## Seat comfort                                         0.76             0.660
## Departure/Arrival time convenient                    0.62             0.404
## Food and drink                                       0.89             0.825
## Inflight wifi service              0.75                               0.560
## Inflight entertainment             0.31                    0.86       0.932
## Online support                     0.75                               0.634
## Ease of Online booking             0.80  0.52                         0.985
## On-board service                         0.70                         0.508
## Leg room service                         0.54                         0.308
## Baggage handling                         0.76                         0.599
## Checkin service                                                  0.42 0.270
## Cleanliness                              0.79                         0.635
## Online boarding                    0.84                               0.731
## Departure Delay in Minutes                     0.98                   0.967
## Arrival Delay in Minutes                       0.98                   0.964
##                                      u2 com
## Flight Distance                   0.983 1.4
## Seat comfort                      0.340 1.3
## Departure/Arrival time convenient 0.596 1.1
## Food and drink                    0.175 1.1
## Inflight wifi service             0.440 1.0
## Inflight entertainment            0.068 1.6
## Online support                    0.366 1.3
## Ease of Online booking            0.015 2.0
## On-board service                  0.492 1.1
## Leg room service                  0.692 1.1
## Baggage handling                  0.401 1.1
## Checkin service                   0.730 2.1
## Cleanliness                       0.365 1.1
## Online boarding                   0.269 1.1
## Departure Delay in Minutes        0.033 1.0
## Arrival Delay in Minutes          0.036 1.0
## 
##                        ML1  ML5  ML2  ML4  ML3  ML6
## SS loadings           2.61 2.36 1.93 1.82 0.91 0.36
## Proportion Var        0.16 0.15 0.12 0.11 0.06 0.02
## Cumulative Var        0.16 0.31 0.43 0.55 0.60 0.62
## Proportion Explained  0.26 0.24 0.19 0.18 0.09 0.04
## Cumulative Proportion 0.26 0.50 0.69 0.87 0.96 1.00
## 
## Mean item complexity =  1.3
## Test of the hypothesis that 6 factors are sufficient.
## 
## df null model =  120  with the objective function =  8.24 with Chi Square =  1066670
## df of  the model are 39  and the objective function was  0.03 
## 
## The root mean square of the residuals (RMSR) is  0.01 
## The df corrected root mean square of the residuals is  0.02 
## 
## The harmonic n.obs is  129487 with the empirical chi square  2500.04  with prob <  0 
## The total n.obs was  129487  with Likelihood Chi Square =  4228.36  with prob <  0 
## 
## Tucker Lewis Index of factoring reliability =  0.988
## RMSEA index =  0.029  and the 90 % confidence intervals are  0.028 0.03
## BIC =  3769.27
## Fit based upon off diagonal values = 1
## Measures of factor score adequacy             
##                                                    ML1  ML5  ML2  ML4  ML3  ML6
## Correlation of (regression) scores with factors   0.95 0.93 0.99 0.93 0.93 0.76
## Multiple R square of scores with factors          0.91 0.86 0.98 0.86 0.87 0.58
## Minimum correlation of possible factor scores     0.82 0.71 0.95 0.72 0.74 0.17
# Extract the factor loadings as a data frame (rows = variables,
# columns = factors). seq_len() + drop = FALSE keep the result a matrix even
# for a one-factor solution, where `[, 1:1]` would collapse to a bare vector.
loadings_matrix <- as.data.frame(
  efa_result$loadings[, seq_len(efa_result$factors), drop = FALSE]
)
print(round(loadings_matrix, 2))
##                                     ML1   ML5   ML2  ML4   ML3   ML6
## Flight Distance                    0.00 -0.01  0.12 0.00 -0.03  0.04
## Seat comfort                       0.12  0.12 -0.04 0.76  0.23 -0.09
## Departure/Arrival time convenient -0.01  0.05  0.02 0.62 -0.11  0.10
## Food and drink                    -0.01  0.01 -0.02 0.89  0.17 -0.04
## Inflight wifi service              0.75  0.00 -0.02 0.04  0.01  0.00
## Inflight entertainment             0.31  0.12 -0.06 0.25  0.86  0.14
## Online support                     0.75  0.08 -0.02 0.00  0.21  0.15
## Ease of Online booking             0.80  0.52 -0.04 0.03  0.04 -0.25
## On-board service                   0.10  0.70 -0.03 0.03  0.07  0.05
## Leg room service                   0.08  0.54  0.00 0.06  0.07 -0.01
## Baggage handling                   0.04  0.76  0.01 0.04 -0.01  0.13
## Checkin service                    0.13  0.26  0.02 0.01  0.11  0.42
## Cleanliness                        0.04  0.79 -0.04 0.04 -0.03  0.11
## Online boarding                    0.84  0.07  0.00 0.02  0.08  0.13
## Departure Delay in Minutes        -0.02 -0.01  0.98 0.00  0.05 -0.09
## Arrival Delay in Minutes          -0.02 -0.01  0.98 0.00  0.05 -0.10
# For every factor, pick the variable with the largest absolute loading.
# vapply() iterates over the data-frame columns directly (no coercion to a
# matrix as with apply()) and guarantees a character result; the variable
# name is looked up explicitly from the row names.
vapply(
  loadings_matrix,
  function(x) rownames(loadings_matrix)[which.max(abs(x))],
  character(1)
)
##                          ML1                          ML5 
##            "Online boarding"                "Cleanliness" 
##                          ML2                          ML4 
## "Departure Delay in Minutes"             "Food and drink" 
##                          ML3                          ML6 
##     "Inflight entertainment"            "Checkin service"

5. Modeling

library(caret)
# Predictors kept for modelling: one variable per EFA factor.
selected_vars <- c(
  "Online boarding",
  "Departure Delay in Minutes",
  "Inflight entertainment",
  "Cleanliness",
  "Food and drink",
  "Checkin service"
)

# Build `satisfaction ~ ...`; each name is wrapped in backticks because the
# column names contain spaces.
model_formula <- as.formula(
  paste("satisfaction ~", paste(paste0("`", selected_vars, "`"), collapse = " + "))
)

# Stratified 80/20 train/test split on the target, with a fixed seed.
# NOTE(review): the features were standardized on the full data set before
# this split -- confirm that is acceptable for the evaluation.
set.seed(123)
train_index <- createDataPartition(y = df$satisfaction, p = 0.8, list = FALSE)
train_data <- df[train_index, ]
test_data <- df[-train_index, ]

Logistic Regression

# Fit the logistic regression on the training data and show its summary
log_model <- glm(formula = model_formula, family = binomial, data = train_data)
summary(object = log_model)
## 
## Call:
## glm(formula = model_formula, family = binomial, data = train_data)
## 
## Coefficients:
##                               Estimate Std. Error z value Pr(>|z|)    
## (Intercept)                   0.232312   0.007845   29.61   <2e-16 ***
## `Online boarding`             0.406712   0.008338   48.78   <2e-16 ***
## `Departure Delay in Minutes` -0.170827   0.008630  -19.79   <2e-16 ***
## `Inflight entertainment`      1.253449   0.010309  121.58   <2e-16 ***
## Cleanliness                   0.489677   0.008097   60.47   <2e-16 ***
## `Food and drink`             -0.190354   0.009053  -21.02   <2e-16 ***
## `Checkin service`             0.273191   0.008164   33.47   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 142674  on 103589  degrees of freedom
## Residual deviance: 100346  on 103583  degrees of freedom
## AIC: 100360
## 
## Number of Fisher Scoring iterations: 4
# Sequential analysis of deviance with chi-square tests
anova(object = log_model, test = "Chisq")
## Analysis of Deviance Table
## 
## Model: binomial, link: logit
## 
## Response: satisfaction
## 
## Terms added sequentially (first to last)
## 
## 
##                              Df Deviance Resid. Df Resid. Dev  Pr(>Chi)    
## NULL                                        103589     142674              
## `Online boarding`             1  12369.6    103588     130304 < 2.2e-16 ***
## `Departure Delay in Minutes`  1    574.1    103587     129730 < 2.2e-16 ***
## `Inflight entertainment`      1  22734.9    103586     106995 < 2.2e-16 ***
## Cleanliness                   1   4976.5    103585     102019 < 2.2e-16 ***
## `Food and drink`              1    541.5    103584     101477 < 2.2e-16 ***
## `Checkin service`             1   1130.8    103583     100346 < 2.2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
# Coefficient table of the logistic model, extended with the odds ratio
# (exp of the estimate) for every term.
summary_df <- coef(summary(log_model))
odds_ratio <- exp(summary_df[, "Estimate"])
summary_df <- cbind(summary_df, OddsRatio = odds_ratio)
print(summary_df)
##                                Estimate  Std. Error   z value      Pr(>|z|)
## (Intercept)                   0.2323125 0.007845300  29.61168 1.057110e-192
## `Online boarding`             0.4067117 0.008337750  48.77954  0.000000e+00
## `Departure Delay in Minutes` -0.1708265 0.008630456 -19.79345  3.389983e-87
## `Inflight entertainment`      1.2534489 0.010309320 121.58405  0.000000e+00
## Cleanliness                   0.4896771 0.008097360  60.47368  0.000000e+00
## `Food and drink`             -0.1903536 0.009053481 -21.02546  3.836452e-98
## `Checkin service`             0.2731911 0.008163516  33.46488 1.563820e-245
##                              OddsRatio
## (Intercept)                  1.2615139
## `Online boarding`            1.5018710
## `Departure Delay in Minutes` 0.8429678
## `Inflight entertainment`     3.5024016
## Cleanliness                  1.6317893
## `Food and drink`             0.8266668
## `Checkin service`            1.3141513
# Predicted probabilities on the test set, thresholded at 0.5 into 0/1 labels.
log_probs <- predict(log_model, newdata = test_data, type = "response")
log_preds <- ifelse(log_probs > 0.5, 1, 0)
# The predictions are given the same factor levels as the reference so that
# confusionMatrix() still works if one class is never predicted (a bare
# factor(log_preds) would silently lose that level).
confusionMatrix(
  data = factor(log_preds, levels = levels(test_data$satisfaction)),
  reference = test_data$satisfaction,
  positive = "1"
)
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction     0     1
##          0  8729  2449
##          1  2992 11727
##                                           
##                Accuracy : 0.7899          
##                  95% CI : (0.7849, 0.7948)
##     No Information Rate : 0.5474          
##     P-Value [Acc > NIR] : < 2.2e-16       
##                                           
##                   Kappa : 0.5743          
##                                           
##  Mcnemar's Test P-Value : 2.014e-13       
##                                           
##             Sensitivity : 0.8272          
##             Specificity : 0.7447          
##          Pos Pred Value : 0.7967          
##          Neg Pred Value : 0.7809          
##              Prevalence : 0.5474          
##          Detection Rate : 0.4528          
##    Detection Prevalence : 0.5684          
##       Balanced Accuracy : 0.7860          
##                                           
##        'Positive' Class : 1               
## 

Linear Discriminant Analysis (LDA)

library(MASS)
# Fit the LDA on the training data and show the coefficients of the linear
# discriminant.
lda_model <- lda(formula = model_formula, data = train_data)
lda_model[["scaling"]]
##                                      LD1
## `Online boarding`             0.29749291
## `Departure Delay in Minutes` -0.09660415
## `Inflight entertainment`      0.94531947
## Cleanliness                   0.35723061
## `Food and drink`             -0.11655007
## `Checkin service`             0.20203351
# Predicted classes on the test set. Both vectors are passed as they are:
# wrapping them in factor() again would drop a level that never occurs and
# make confusionMatrix() fail on mismatching levels.
lda_preds <- predict(lda_model, newdata = test_data)$class
confusionMatrix(data = lda_preds, reference = test_data$satisfaction, positive = "1")
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction     0     1
##          0  8653  2385
##          1  3068 11791
##                                           
##                Accuracy : 0.7894          
##                  95% CI : (0.7844, 0.7944)
##     No Information Rate : 0.5474          
##     P-Value [Acc > NIR] : < 2.2e-16       
##                                           
##                   Kappa : 0.5729          
##                                           
##  Mcnemar's Test P-Value : < 2.2e-16       
##                                           
##             Sensitivity : 0.8318          
##             Specificity : 0.7382          
##          Pos Pred Value : 0.7935          
##          Neg Pred Value : 0.7839          
##              Prevalence : 0.5474          
##          Detection Rate : 0.4553          
##    Detection Prevalence : 0.5738          
##       Balanced Accuracy : 0.7850          
##                                           
##        'Positive' Class : 1               
## 
# Posterior probability of class "1" for every test observation
lda_probs <- predict(lda_model, newdata = test_data)$posterior[, "1"]

Perbandingan Performa Model

# Manual accuracy. The predictions are cast to the reference levels first so
# the cross-tabulations are always square; otherwise diag() would pick the
# wrong cells whenever one class is never predicted.
table_log <- table(
  Predicted = factor(log_preds, levels = levels(test_data$satisfaction)),
  Actual = test_data$satisfaction
)
table_lda <- table(
  Predicted = factor(lda_preds, levels = levels(test_data$satisfaction)),
  Actual = test_data$satisfaction
)

# Accuracy = correctly classified (diagonal) / all test observations
accuracy_log <- sum(diag(table_log)) / sum(table_log)
accuracy_lda <- sum(diag(table_lda)) / sum(table_lda)

cat("Akurasi Logistic Regression:", round(accuracy_log, 4), "\n")
## Akurasi Logistic Regression: 0.7899
cat("Akurasi LDA:", round(accuracy_lda, 4), "\n")
## Akurasi LDA: 0.7894
# ROC curves and AUC for both models.
# `levels` (control, case) and `direction` are stated explicitly instead of
# being auto-detected, so the AUC is always computed for "higher score means
# class 1" and cannot be silently flipped.
library(pROC)
roc_log <- roc(response = test_data$satisfaction, predictor = log_probs,
               levels = c("0", "1"), direction = "<")
auc_log <- auc(roc_log)

roc_lda <- roc(response = test_data$satisfaction, predictor = lda_probs,
               levels = c("0", "1"), direction = "<")
auc_lda <- auc(roc_lda)

# Both curves in one plot, with the AUC values in the legend
plot(roc_log, col = "blue", main = "ROC Curve: Logistic vs LDA", legacy.axes = TRUE)
lines(roc_lda, col = "red")
legend("bottomright", legend = c(paste0("Logistic AUC = ", round(auc_log, 3)),
                                 paste0("LDA AUC = ", round(auc_lda, 3))),
       col = c("blue", "red"), lwd = 2)

6. Evaluasi & Visualisasi

Visualisasi Fitur Penting

# Estimated coefficients of the logistic model without the intercept
# (first row dropped, first column = Estimate).
log_coefs <- summary(log_model)$coefficients[-1, 1]

# The coefficient names still carry the backticks from the model formula
# (e.g. `Online boarding`); they are stripped so the axis shows plain names.
coef_df <- data.frame(
  Variable = gsub("`", "", names(log_coefs), fixed = TRUE),
  Coefficient = log_coefs
)

library(ggplot2)
# Horizontal bars ordered by coefficient value
ggplot(coef_df, aes(x = reorder(Variable, Coefficient), y = Coefficient)) +
  geom_col(fill = "steelblue") +
  coord_flip() +
  labs(title = "Koefisien Fitur - Logistic Regression", x = "Fitur", y = "Koefisien")

# ROC curve plots per model (tidier version)
# Both ROC curves in a single chart
library(pROC)

plot(roc_log, main = "ROC Curve Logistic vs LDA", col = "blue", lwd = 2,
     legacy.axes = TRUE)
lines(roc_lda, col = "red", lwd = 2)

# Legend entries show the rounded AUC of each model
legend(
  "bottomright",
  legend = c(
    paste("Logistic AUC =", round(auc_log, 3)),
    paste("LDA AUC =", round(auc_lda, 3))
  ),
  col = c("blue", "red"),
  lwd = 2
)

# Separate ROC plots
# Logistic regression ROC
plot(roc_log, main = "ROC Curve - Logistic Regression", col = "blue",
     legacy.axes = TRUE)

# LDA ROC
plot(roc_lda, main = "ROC Curve - LDA", col = "red", legacy.axes = TRUE)