1. Load Dataset
library(readxl)
# Import the "satisfaction_v2" worksheet and preview its first rows.
df <- read_excel(
  path = "satisfaction.xlsx",
  sheet = "satisfaction_v2"
)
head(x = df, n = 6L)
## # A tibble: 6 × 24
## id satisfaction_v2 Gender `Customer Type` Age `Type of Travel` Class
## <dbl> <chr> <chr> <chr> <dbl> <chr> <chr>
## 1 11112 satisfied Female Loyal Customer 65 Personal Travel Eco
## 2 110278 satisfied Male Loyal Customer 47 Personal Travel Business
## 3 103199 satisfied Female Loyal Customer 15 Personal Travel Eco
## 4 47462 satisfied Female Loyal Customer 60 Personal Travel Eco
## 5 120011 satisfied Female Loyal Customer 70 Personal Travel Eco
## 6 100744 satisfied Male Loyal Customer 30 Personal Travel Eco
## # ℹ 17 more variables: `Flight Distance` <dbl>, `Seat comfort` <dbl>,
## # `Departure/Arrival time convenient` <dbl>, `Food and drink` <dbl>,
## # `Gate location` <dbl>, `Inflight wifi service` <dbl>,
## # `Inflight entertainment` <dbl>, `Online support` <dbl>,
## # `Ease of Online booking` <dbl>, `On-board service` <dbl>,
## # `Leg room service` <dbl>, `Baggage handling` <dbl>,
## # `Checkin service` <dbl>, Cleanliness <dbl>, `Online boarding` <dbl>, …
2. Preprocessing
# Count the missing values in every column before deciding how to handle them.
na_counts <- colSums(is.na(df))
na_counts
## id satisfaction_v2
## 0 0
## Gender Customer Type
## 0 0
## Age Type of Travel
## 0 0
## Class Flight Distance
## 0 0
## Seat comfort Departure/Arrival time convenient
## 0 0
## Food and drink Gate location
## 0 0
## Inflight wifi service Inflight entertainment
## 0 0
## Online support Ease of Online booking
## 0 0
## On-board service Leg room service
## 0 0
## Baggage handling Checkin service
## 0 0
## Cleanliness Online boarding
## 0 0
## Departure Delay in Minutes Arrival Delay in Minutes
## 0 393
# Drop every row that contains a missing value, then report the remaining
# number of rows and columns.
df <- stats::na.omit(df)
c(nrow(df), ncol(df))
## [1] 129487 24
# Encode the target as a binary indicator: 1 for "satisfied", 0 otherwise.
is_satisfied <- df$satisfaction_v2 == "satisfied"
df$satisfaction <- as.numeric(is_satisfied)
table(df$satisfaction)
##
## 0 1
## 58605 70882
# Label-encode the categorical columns: each value becomes the position of its
# level in the factor built from that column. Gender and Class are overwritten
# in place; Customer.Type and Type.of.Travel are added as new columns next to
# the original space-containing ones.
df[["Gender"]] <- as.numeric(factor(df[["Gender"]]))
df[["Customer.Type"]] <- as.numeric(factor(df[["Customer Type"]]))
df[["Type.of.Travel"]] <- as.numeric(factor(df[["Type of Travel"]]))
df[["Class"]] <- as.numeric(factor(df[["Class"]]))
encoded_cols <- c("Gender", "Customer.Type", "Type.of.Travel", "Class")
str(df[, encoded_cols])
## tibble [129,487 × 4] (S3: tbl_df/tbl/data.frame)
## $ Gender : num [1:129487] 1 2 1 1 1 2 1 2 1 2 ...
## $ Customer.Type : num [1:129487] 2 2 2 2 2 2 2 2 2 2 ...
## $ Type.of.Travel: num [1:129487] 2 2 2 2 2 2 2 2 2 2 ...
## $ Class : num [1:129487] 2 1 2 2 2 2 2 2 1 2 ...
## - attr(*, "na.action")= 'omit' Named int [1:393] 146 247 711 737 819 1163 1268 1302 1911 2035 ...
## ..- attr(*, "names")= chr [1:393] "146" "247" "711" "737" ...
# Standardise the numeric features with scale() (centre and scale each column).
# num_cols lists the columns treated as numeric features for the rest of the
# analysis (scaling, correlation, EFA).
# NOTE(review): `Age` and `Gate location` are not in this list - confirm that
# leaving them out is intentional.
num_cols <- c(
"Flight Distance", "Seat comfort", "Departure/Arrival time convenient",
"Food and drink", "Inflight wifi service", "Inflight entertainment", "Online support",
"Ease of Online booking", "On-board service", "Leg room service", "Baggage handling",
"Checkin service", "Cleanliness", "Online boarding", "Departure Delay in Minutes",
"Arrival Delay in Minutes"
)
# NOTE(review): the scaling statistics are computed on the full data set, before
# the train/test split performed later in this file, so the test rows contribute
# to the centring/scaling values - confirm this is acceptable.
df[num_cols] <- scale(df[num_cols])
# Summary of the scaled columns.
summary(df[num_cols])
## Flight Distance Seat comfort Departure/Arrival time convenient
## Min. :-1.88045 Min. :-2.0379 Min. :-1.958034
## 1st Qu.:-0.60572 1st Qu.:-0.6021 1st Qu.:-0.648434
## Median :-0.05552 Median : 0.1159 Median : 0.006367
## Mean : 0.00000 Mean : 0.0000 Mean : 0.000000
## 3rd Qu.: 0.54728 3rd Qu.: 0.8338 3rd Qu.: 0.661167
## Max. : 4.83988 Max. : 1.5518 Max. : 1.315967
## Food and drink Inflight wifi service Inflight entertainment
## Min. :-1.9757 Min. :-2.4638 Min. :-2.5140
## 1st Qu.:-0.5902 1st Qu.:-0.9472 1st Qu.:-1.0281
## Median : 0.1025 Median :-0.1889 Median : 0.4579
## Mean : 0.0000 Mean : 0.0000 Mean : 0.0000
## 3rd Qu.: 0.7952 3rd Qu.: 0.5694 3rd Qu.: 0.4579
## Max. : 1.4879 Max. : 1.3276 Max. : 1.2008
## Online support Ease of Online booking On-board service Leg room service
## Min. :-2.6946 Min. :-2.6595 Min. :-2.7268 Min. :-2.6981
## 1st Qu.:-0.3980 1st Qu.:-1.1276 1st Qu.:-0.3660 1st Qu.:-1.1502
## Median : 0.3675 Median : 0.4043 Median : 0.4209 Median : 0.3977
## Mean : 0.0000 Mean : 0.0000 Mean : 0.0000 Mean : 0.0000
## 3rd Qu.: 1.1330 3rd Qu.: 1.1702 3rd Qu.: 0.4209 3rd Qu.: 1.1717
## Max. : 1.1330 Max. : 1.1702 Max. : 1.2078 Max. : 1.1717
## Baggage handling Checkin service Cleanliness Online boarding
## Min. :-2.3307 Min. :-2.6502 Min. :-3.2178 Min. :-2.5816
## 1st Qu.:-0.6014 1st Qu.:-0.2703 1st Qu.:-0.6129 1st Qu.:-1.0415
## Median : 0.2633 Median :-0.2703 Median : 0.2554 Median : 0.4986
## Mean : 0.0000 Mean : 0.0000 Mean : 0.0000 Mean : 0.0000
## 3rd Qu.: 1.1280 3rd Qu.: 0.5230 3rd Qu.: 1.1237 3rd Qu.: 0.4986
## Max. : 1.1280 Max. : 1.3163 Max. : 1.1237 Max. : 1.2686
## Departure Delay in Minutes Arrival Delay in Minutes
## Min. :-0.38603 Min. :-0.39233
## 1st Qu.:-0.38603 1st Qu.:-0.39233
## Median :-0.38603 Median :-0.39233
## Mean : 0.00000 Mean : 0.00000
## 3rd Qu.:-0.06969 3rd Qu.:-0.05436
## Max. :41.58285 Max. :40.78727
# Inspect outliers with one box plot per selected column.
# Names are the columns to draw, values are the plot titles.
outlier_panels <- c(
  "Flight Distance" = "Outlier - Flight Distance",
  "Departure Delay in Minutes" = "Outlier - Departure Delay",
  "Arrival Delay in Minutes" = "Outlier - Arrival Delay"
)
for (col_name in names(outlier_panels)) {
  boxplot(df[[col_name]], main = outlier_panels[[col_name]])
}

3. Eksplorasi Data
library(ggplot2)
# Age distribution of the passengers (30-bin histogram).
age_base <- ggplot(data = df, mapping = aes(x = Age))
age_base +
  geom_histogram(bins = 30, fill = "steelblue", color = "black") +
  labs(title = "Distribusi Usia Penumpang", x = "Usia", y = "Frekuensi")

library(tidyr)
library(dplyr)
# Distribution of every numeric feature: reshape to long form (one row per
# feature/value pair) and draw one histogram panel per feature.
temp_num_data <- df[, num_cols]
long_data <- temp_num_data %>%
  pivot_longer(cols = everything(), names_to = "Feature", values_to = "Value")
numeric_hist <- ggplot(data = long_data, mapping = aes(x = Value)) +
  geom_histogram(bins = 30, fill = "steelblue", color = "black")
numeric_hist +
  facet_wrap(~ Feature, scales = "free", ncol = 4) +
  theme_minimal(base_size = 10) +
  labs(title = "Distribusi Fitur Numerik", x = NULL, y = NULL)

# Distribution of the label-encoded categorical features: reshape to long form
# and draw one bar-chart panel per feature.
cat_cols <- c("Gender", "Customer.Type", "Type.of.Travel", "Class")
cat_data <- df[, cat_cols]
long_cat <- cat_data %>%
  pivot_longer(cols = everything(), names_to = "Feature", values_to = "Category")
category_bars <- ggplot(data = long_cat, mapping = aes(x = Category)) +
  geom_bar(fill = "cornflowerblue", color = "black")
category_bars +
  facet_wrap(~ Feature, scales = "free", ncol = 3) +
  theme_minimal(base_size = 10) +
  labs(title = "Distribusi Fitur Kategorikal", x = NULL, y = "Jumlah") +
  theme(axis.text.x = element_text(angle = 45, hjust = 1))

# Frequency of each target class (0 / 1).
# Both bars deliberately share one colour; the fill mapping only drives the
# legend. The legend label spelling is corrected to "dissatisfied".
ggplot(df, aes(x = factor(satisfaction), fill = factor(satisfaction))) +
  geom_bar() +
  labs(title = "Distribusi Frekuensi Kepuasan", x = "Satisfaction", y = "Jumlah") +
  scale_fill_manual(
    values = c("cornflowerblue", "cornflowerblue"),
    name = "Satisfaction",
    labels = c("neutral or dissatisfied", "satisfied")
  ) +
  theme_minimal()

# Correlation heat map of the numeric features (upper triangle, coefficients
# printed in each cell, variables ordered by hierarchical clustering).
library(corrplot)
corr_matrix <- cor(df[, num_cols])
corrplot(
  corr_matrix,
  method = "color",
  type = "upper",
  order = "hclust",
  addCoef.col = "black",
  number.cex = 0.5,
  tl.cex = 0.7
)

# Box plots of every numeric feature split by satisfaction class.
# The target column is converted to a factor here and stays a factor for the
# code that follows.
df$satisfaction <- as.factor(df$satisfaction)
df_long <- pivot_longer(
  df,
  cols = all_of(num_cols),
  names_to = "Feature",
  values_to = "Value"
)
feature_boxes <- ggplot(
  data = df_long,
  mapping = aes(x = satisfaction, y = Value, fill = satisfaction)
) +
  geom_boxplot(outlier.size = 0.5, outlier.alpha = 0.3) +
  facet_wrap(~ Feature, scales = "free", ncol = 3)
feature_boxes +
  theme_minimal(base_size = 11) +
  labs(
    title = "Hubungan Fitur Numerik dengan Kepuasan (Satisfaction)",
    x = "Satisfaction (0 = Tidak Puas, 1 = Puas)",
    y = "Nilai Fitur"
  ) +
  theme(legend.position = "none")

# Statistik deskriptif fitur numerik
summary(df[num_cols])
## Flight Distance Seat comfort Departure/Arrival time convenient
## Min. :-1.88045 Min. :-2.0379 Min. :-1.958034
## 1st Qu.:-0.60572 1st Qu.:-0.6021 1st Qu.:-0.648434
## Median :-0.05552 Median : 0.1159 Median : 0.006367
## Mean : 0.00000 Mean : 0.0000 Mean : 0.000000
## 3rd Qu.: 0.54728 3rd Qu.: 0.8338 3rd Qu.: 0.661167
## Max. : 4.83988 Max. : 1.5518 Max. : 1.315967
## Food and drink Inflight wifi service Inflight entertainment
## Min. :-1.9757 Min. :-2.4638 Min. :-2.5140
## 1st Qu.:-0.5902 1st Qu.:-0.9472 1st Qu.:-1.0281
## Median : 0.1025 Median :-0.1889 Median : 0.4579
## Mean : 0.0000 Mean : 0.0000 Mean : 0.0000
## 3rd Qu.: 0.7952 3rd Qu.: 0.5694 3rd Qu.: 0.4579
## Max. : 1.4879 Max. : 1.3276 Max. : 1.2008
## Online support Ease of Online booking On-board service Leg room service
## Min. :-2.6946 Min. :-2.6595 Min. :-2.7268 Min. :-2.6981
## 1st Qu.:-0.3980 1st Qu.:-1.1276 1st Qu.:-0.3660 1st Qu.:-1.1502
## Median : 0.3675 Median : 0.4043 Median : 0.4209 Median : 0.3977
## Mean : 0.0000 Mean : 0.0000 Mean : 0.0000 Mean : 0.0000
## 3rd Qu.: 1.1330 3rd Qu.: 1.1702 3rd Qu.: 0.4209 3rd Qu.: 1.1717
## Max. : 1.1330 Max. : 1.1702 Max. : 1.2078 Max. : 1.1717
## Baggage handling Checkin service Cleanliness Online boarding
## Min. :-2.3307 Min. :-2.6502 Min. :-3.2178 Min. :-2.5816
## 1st Qu.:-0.6014 1st Qu.:-0.2703 1st Qu.:-0.6129 1st Qu.:-1.0415
## Median : 0.2633 Median :-0.2703 Median : 0.2554 Median : 0.4986
## Mean : 0.0000 Mean : 0.0000 Mean : 0.0000 Mean : 0.0000
## 3rd Qu.: 1.1280 3rd Qu.: 0.5230 3rd Qu.: 1.1237 3rd Qu.: 0.4986
## Max. : 1.1280 Max. : 1.3163 Max. : 1.1237 Max. : 1.2686
## Departure Delay in Minutes Arrival Delay in Minutes
## Min. :-0.38603 Min. :-0.39233
## 1st Qu.:-0.38603 1st Qu.:-0.39233
## Median :-0.38603 Median :-0.39233
## Mean : 0.00000 Mean : 0.00000
## 3rd Qu.:-0.06969 3rd Qu.:-0.05436
## Max. :41.58285 Max. :40.78727
# Jika ingin statistik yang lebih lengkap
library(psych)
describe(df[num_cols])
## vars n mean sd median trimmed mad min
## Flight Distance 1 129487 0 1 -0.06 -0.04 0.86 -1.88
## Seat comfort 2 129487 0 1 0.12 0.00 1.06 -2.04
## Departure/Arrival time convenient 3 129487 0 1 0.01 0.04 0.97 -1.96
## Food and drink 4 129487 0 1 0.10 0.01 1.03 -1.98
## Inflight wifi service 5 129487 0 1 -0.19 0.05 1.12 -2.46
## Inflight entertainment 6 129487 0 1 0.46 0.09 1.10 -2.51
## Online support 7 129487 0 1 0.37 0.10 1.13 -2.69
## Ease of Online booking 8 129487 0 1 0.40 0.09 1.14 -2.66
## On-board service 9 129487 0 1 0.42 0.09 1.17 -2.73
## Leg room service 10 129487 0 1 0.40 0.09 1.15 -2.70
## Baggage handling 11 129487 0 1 0.26 0.11 1.28 -2.33
## Checkin service 12 129487 0 1 -0.27 0.07 1.18 -2.65
## Cleanliness 13 129487 0 1 0.26 0.11 1.29 -3.22
## Online boarding 14 129487 0 1 0.50 0.07 1.14 -2.58
## Departure Delay in Minutes 15 129487 0 1 -0.39 -0.23 0.00 -0.39
## Arrival Delay in Minutes 16 129487 0 1 -0.39 -0.23 0.00 -0.39
## max range skew kurtosis se
## Flight Distance 4.84 6.72 0.47 0.36 0
## Seat comfort 1.55 3.59 -0.09 -0.94 0
## Departure/Arrival time convenient 1.32 3.27 -0.25 -1.09 0
## Food and drink 1.49 3.46 -0.12 -0.99 0
## Inflight wifi service 1.33 3.79 -0.19 -1.12 0
## Inflight entertainment 1.20 3.71 -0.61 -0.53 0
## Online support 1.13 3.83 -0.58 -0.81 0
## Ease of Online booking 1.17 3.83 -0.49 -0.91 0
## On-board service 1.21 3.93 -0.51 -0.78 0
## Leg room service 1.17 3.87 -0.50 -0.84 0
## Baggage handling 1.13 3.46 -0.74 -0.24 0
## Checkin service 1.32 3.97 -0.39 -0.79 0
## Cleanliness 1.12 4.34 -0.76 -0.21 0
## Online boarding 1.27 3.85 -0.37 -0.94 0
## Departure Delay in Minutes 41.58 41.97 6.85 101.88 0
## Arrival Delay in Minutes 40.79 41.18 6.67 95.11 0
# Statistik deskriptif fitur kategorikal
table(df$Gender)
##
## 1 2
## 65703 63784
prop.table(table(df$Gender))
##
## 1 2
## 0.50741 0.49259
table(df$Customer.Type)
##
## 1 2
## 23714 105773
prop.table(table(df$Customer.Type))
##
## 1 2
## 0.1831381 0.8168619
table(df$Type.of.Travel)
##
## 1 2
## 89445 40042
prop.table(table(df$Type.of.Travel))
##
## 1 2
## 0.6907643 0.3092357
table(df$Class)
##
## 1 2 3
## 61990 58117 9380
prop.table(table(df$Class))
##
## 1 2 3
## 0.4787353 0.4488250 0.0724397
# Seat-comfort rating split by satisfaction class.
# The fill colour is mapped to the class and set through a manual scale rather
# than passed as a fixed two-element vector, so the plot does not depend on
# exactly two boxes being drawn. The legend is hidden because the x axis
# already identifies the classes.
ggplot(
  df,
  aes(x = factor(satisfaction), y = `Seat comfort`, fill = factor(satisfaction))
) +
  geom_boxplot() +
  scale_fill_manual(values = c("salmon", "lightgreen")) +
  labs(title = "Seat Comfort vs Kepuasan", x = "Kepuasan (0 = Tidak, 1 = Ya)", y = "Seat Comfort") +
  theme(legend.position = "none")

# Share of each satisfaction class within each (label-encoded) gender.
ggplot(data = df, mapping = aes(x = factor(Gender), fill = factor(satisfaction))) +
  geom_bar(position = "fill") +
  scale_fill_manual(
    name = "Satisfaction",
    values = c("red", "green"),
    labels = c("Tidak", "Puas")
  ) +
  labs(title = "Proporsi Kepuasan Berdasarkan Gender", x = "Gender", y = "Proporsi")

4. Feature Selection (EFA)
library(psych)
library(corpcor)
# Numeric feature columns used as input for the factor analysis.
efa_data <- df[num_cols]
# Kaiser-Meyer-Olkin measure of sampling adequacy (the Bartlett test follows).
KMO(r = efa_data)
## Kaiser-Meyer-Olkin factor adequacy
## Call: KMO(r = efa_data)
## Overall MSA = 0.74
## MSA for each item =
## Flight Distance Seat comfort
## 0.71 0.71
## Departure/Arrival time convenient Food and drink
## 0.69 0.65
## Inflight wifi service Inflight entertainment
## 0.84 0.78
## Online support Ease of Online booking
## 0.84 0.77
## On-board service Leg room service
## 0.85 0.89
## Baggage handling Checkin service
## 0.82 0.73
## Cleanliness Online boarding
## 0.80 0.82
## Departure Delay in Minutes Arrival Delay in Minutes
## 0.51 0.51
# Bartlett test on the correlation matrix of the EFA input, with the number of
# observations taken from the data.
efa_cor <- cor(efa_data)
cortest.bartlett(efa_cor, n = nrow(efa_data))
## $chisq
## [1] 1066670
##
## $p.value
## [1] 0
##
## $df
## [1] 120
# Scree plot and parallel analysis to choose the number of factors.
# Parallel analysis compares the observed eigenvalues with those of simulated
# data (n.iter runs), so the RNG is seeded first to make the suggested number
# of factors reproducible between runs. The modelling section re-seeds before
# the train/test split, so this does not change the partition.
set.seed(123)
fa.parallel(
  efa_data,
  fa = "fa",
  n.iter = 100,
  show.legend = TRUE,
  main = "Scree Plot dan Parallel Analysis"
)

## Parallel analysis suggests that the number of factors = 6 and the number of components = NA
# Fit the exploratory factor model: six factors, maximum-likelihood
# extraction, varimax rotation. The print call passes cut = 0.3 for the
# loading table.
efa_result <- fa(r = efa_data, nfactors = 6, rotate = "varimax", fm = "ml")
print(x = efa_result, cut = 0.3)
## Factor Analysis using method = ml
## Call: fa(r = efa_data, nfactors = 6, rotate = "varimax", fm = "ml")
## Standardized loadings (pattern matrix) based upon correlation matrix
## ML1 ML5 ML2 ML4 ML3 ML6 h2
## Flight Distance 0.017
## Seat comfort 0.76 0.660
## Departure/Arrival time convenient 0.62 0.404
## Food and drink 0.89 0.825
## Inflight wifi service 0.75 0.560
## Inflight entertainment 0.31 0.86 0.932
## Online support 0.75 0.634
## Ease of Online booking 0.80 0.52 0.985
## On-board service 0.70 0.508
## Leg room service 0.54 0.308
## Baggage handling 0.76 0.599
## Checkin service 0.42 0.270
## Cleanliness 0.79 0.635
## Online boarding 0.84 0.731
## Departure Delay in Minutes 0.98 0.967
## Arrival Delay in Minutes 0.98 0.964
## u2 com
## Flight Distance 0.983 1.4
## Seat comfort 0.340 1.3
## Departure/Arrival time convenient 0.596 1.1
## Food and drink 0.175 1.1
## Inflight wifi service 0.440 1.0
## Inflight entertainment 0.068 1.6
## Online support 0.366 1.3
## Ease of Online booking 0.015 2.0
## On-board service 0.492 1.1
## Leg room service 0.692 1.1
## Baggage handling 0.401 1.1
## Checkin service 0.730 2.1
## Cleanliness 0.365 1.1
## Online boarding 0.269 1.1
## Departure Delay in Minutes 0.033 1.0
## Arrival Delay in Minutes 0.036 1.0
##
## ML1 ML5 ML2 ML4 ML3 ML6
## SS loadings 2.61 2.36 1.93 1.82 0.91 0.36
## Proportion Var 0.16 0.15 0.12 0.11 0.06 0.02
## Cumulative Var 0.16 0.31 0.43 0.55 0.60 0.62
## Proportion Explained 0.26 0.24 0.19 0.18 0.09 0.04
## Cumulative Proportion 0.26 0.50 0.69 0.87 0.96 1.00
##
## Mean item complexity = 1.3
## Test of the hypothesis that 6 factors are sufficient.
##
## df null model = 120 with the objective function = 8.24 with Chi Square = 1066670
## df of the model are 39 and the objective function was 0.03
##
## The root mean square of the residuals (RMSR) is 0.01
## The df corrected root mean square of the residuals is 0.02
##
## The harmonic n.obs is 129487 with the empirical chi square 2500.04 with prob < 0
## The total n.obs was 129487 with Likelihood Chi Square = 4228.36 with prob < 0
##
## Tucker Lewis Index of factoring reliability = 0.988
## RMSEA index = 0.029 and the 90 % confidence intervals are 0.028 0.03
## BIC = 3769.27
## Fit based upon off diagonal values = 1
## Measures of factor score adequacy
## ML1 ML5 ML2 ML4 ML3 ML6
## Correlation of (regression) scores with factors 0.95 0.93 0.99 0.93 0.93 0.76
## Multiple R square of scores with factors 0.91 0.86 0.98 0.86 0.87 0.58
## Minimum correlation of possible factor scores 0.82 0.71 0.95 0.72 0.74 0.17
# Extract the loading matrix (variables in rows, factors in columns) as a
# data frame and print it rounded to two decimals.
# seq_len() gives a safe column index, and drop = FALSE keeps the result
# two-dimensional even if the model had a single factor.
factor_idx <- seq_len(efa_result$factors)
loadings_matrix <- as.data.frame(efa_result$loadings[, factor_idx, drop = FALSE])
print(round(loadings_matrix, 2))
## ML1 ML5 ML2 ML4 ML3 ML6
## Flight Distance 0.00 -0.01 0.12 0.00 -0.03 0.04
## Seat comfort 0.12 0.12 -0.04 0.76 0.23 -0.09
## Departure/Arrival time convenient -0.01 0.05 0.02 0.62 -0.11 0.10
## Food and drink -0.01 0.01 -0.02 0.89 0.17 -0.04
## Inflight wifi service 0.75 0.00 -0.02 0.04 0.01 0.00
## Inflight entertainment 0.31 0.12 -0.06 0.25 0.86 0.14
## Online support 0.75 0.08 -0.02 0.00 0.21 0.15
## Ease of Online booking 0.80 0.52 -0.04 0.03 0.04 -0.25
## On-board service 0.10 0.70 -0.03 0.03 0.07 0.05
## Leg room service 0.08 0.54 0.00 0.06 0.07 -0.01
## Baggage handling 0.04 0.76 0.01 0.04 -0.01 0.13
## Checkin service 0.13 0.26 0.02 0.01 0.11 0.42
## Cleanliness 0.04 0.79 -0.04 0.04 -0.03 0.11
## Online boarding 0.84 0.07 0.00 0.02 0.08 0.13
## Departure Delay in Minutes -0.02 -0.01 0.98 0.00 0.05 -0.09
## Arrival Delay in Minutes -0.02 -0.01 0.98 0.00 0.05 -0.10
# For each factor, report the variable with the largest absolute loading.
top_var_per_factor <- vapply(
  loadings_matrix,
  function(loading_col) {
    rownames(loadings_matrix)[which.max(abs(loading_col))]
  },
  character(1)
)
top_var_per_factor
## ML1 ML5
## "Online boarding" "Cleanliness"
## ML2 ML4
## "Departure Delay in Minutes" "Food and drink"
## ML3 ML6
## "Inflight entertainment" "Checkin service"
5. Modeling
library(caret)
# Predictors used by both classifiers.
selected_vars <- c(
  "Online boarding", "Departure Delay in Minutes", "Inflight entertainment",
  "Cleanliness", "Food and drink", "Checkin service"
)
# The names contain spaces, so each one is back-quoted before being joined
# into the right-hand side of the formula.
quoted_terms <- paste0("`", selected_vars, "`")
model_formula <- as.formula(
  paste("satisfaction ~", paste(quoted_terms, collapse = " + "))
)
# Seeded 80/20 train/test partition built on the target column.
set.seed(123)
train_index <- createDataPartition(y = df$satisfaction, p = 0.8, list = FALSE)
train_data <- df[train_index, ]
test_data <- df[-train_index, ]
Logistic Regression
# Fit the logistic regression on the training partition and show the
# coefficient table.
log_model <- glm(
  formula = model_formula,
  family = binomial,
  data = train_data
)
summary(object = log_model)
##
## Call:
## glm(formula = model_formula, family = binomial, data = train_data)
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) 0.232312 0.007845 29.61 <2e-16 ***
## `Online boarding` 0.406712 0.008338 48.78 <2e-16 ***
## `Departure Delay in Minutes` -0.170827 0.008630 -19.79 <2e-16 ***
## `Inflight entertainment` 1.253449 0.010309 121.58 <2e-16 ***
## Cleanliness 0.489677 0.008097 60.47 <2e-16 ***
## `Food and drink` -0.190354 0.009053 -21.02 <2e-16 ***
## `Checkin service` 0.273191 0.008164 33.47 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 142674 on 103589 degrees of freedom
## Residual deviance: 100346 on 103583 degrees of freedom
## AIC: 100360
##
## Number of Fisher Scoring iterations: 4
anova(log_model, test = "Chisq")
## Analysis of Deviance Table
##
## Model: binomial, link: logit
##
## Response: satisfaction
##
## Terms added sequentially (first to last)
##
##
## Df Deviance Resid. Df Resid. Dev Pr(>Chi)
## NULL 103589 142674
## `Online boarding` 1 12369.6 103588 130304 < 2.2e-16 ***
## `Departure Delay in Minutes` 1 574.1 103587 129730 < 2.2e-16 ***
## `Inflight entertainment` 1 22734.9 103586 106995 < 2.2e-16 ***
## Cleanliness 1 4976.5 103585 102019 < 2.2e-16 ***
## `Food and drink` 1 541.5 103584 101477 < 2.2e-16 ***
## `Checkin service` 1 1130.8 103583 100346 < 2.2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
# Coefficient table with an extra column holding exp(Estimate), i.e. the odds
# ratio for each term.
coef_table <- summary(log_model)$coefficients
odds_ratio <- exp(coef_table[, "Estimate"])
summary_df <- cbind(coef_table, OddsRatio = odds_ratio)
print(summary_df)
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) 0.2323125 0.007845300 29.61168 1.057110e-192
## `Online boarding` 0.4067117 0.008337750 48.77954 0.000000e+00
## `Departure Delay in Minutes` -0.1708265 0.008630456 -19.79345 3.389983e-87
## `Inflight entertainment` 1.2534489 0.010309320 121.58405 0.000000e+00
## Cleanliness 0.4896771 0.008097360 60.47368 0.000000e+00
## `Food and drink` -0.1903536 0.009053481 -21.02546 3.836452e-98
## `Checkin service` 0.2731911 0.008163516 33.46488 1.563820e-245
## OddsRatio
## (Intercept) 1.2615139
## `Online boarding` 1.5018710
## `Departure Delay in Minutes` 0.8429678
## `Inflight entertainment` 3.5024016
## Cleanliness 1.6317893
## `Food and drink` 0.8266668
## `Checkin service` 1.3141513
# Score the held-out partition, threshold the probabilities at 0.5 and compare
# the predicted classes with the true labels.
log_probs <- predict(log_model, newdata = test_data, type = "response")
log_preds <- ifelse(log_probs > 0.5, 1, 0)
# Pin both factors to the levels 0 / 1 so the confusion matrix always has both
# classes in the same order, even if every thresholded prediction falls into a
# single class.
confusionMatrix(
  data = factor(log_preds, levels = c(0, 1)),
  reference = factor(test_data$satisfaction, levels = c(0, 1)),
  positive = "1"
)
## Confusion Matrix and Statistics
##
## Reference
## Prediction 0 1
## 0 8729 2449
## 1 2992 11727
##
## Accuracy : 0.7899
## 95% CI : (0.7849, 0.7948)
## No Information Rate : 0.5474
## P-Value [Acc > NIR] : < 2.2e-16
##
## Kappa : 0.5743
##
## Mcnemar's Test P-Value : 2.014e-13
##
## Sensitivity : 0.8272
## Specificity : 0.7447
## Pos Pred Value : 0.7967
## Neg Pred Value : 0.7809
## Prevalence : 0.5474
## Detection Rate : 0.4528
## Detection Prevalence : 0.5684
## Balanced Accuracy : 0.7860
##
## 'Positive' Class : 1
##
Linear Discriminant Analysis (LDA)
library(MASS)
# Fit the linear discriminant model on the training partition and show the
# discriminant coefficients.
lda_model <- lda(model_formula, data = train_data)
lda_model[["scaling"]]
## LD1
## `Online boarding` 0.29749291
## `Departure Delay in Minutes` -0.09660415
## `Inflight entertainment` 0.94531947
## Cleanliness 0.35723061
## `Food and drink` -0.11655007
## `Checkin service` 0.20203351
# Predicted classes on the held-out partition, compared with the true labels.
lda_prediction <- predict(lda_model, newdata = test_data)
lda_preds <- lda_prediction$class
confusionMatrix(
  data = factor(lda_preds),
  reference = factor(test_data$satisfaction),
  positive = "1"
)
## Confusion Matrix and Statistics
##
## Reference
## Prediction 0 1
## 0 8653 2385
## 1 3068 11791
##
## Accuracy : 0.7894
## 95% CI : (0.7844, 0.7944)
## No Information Rate : 0.5474
## P-Value [Acc > NIR] : < 2.2e-16
##
## Kappa : 0.5729
##
## Mcnemar's Test P-Value : < 2.2e-16
##
## Sensitivity : 0.8318
## Specificity : 0.7382
## Pos Pred Value : 0.7935
## Neg Pred Value : 0.7839
## Prevalence : 0.5474
## Detection Rate : 0.4553
## Detection Prevalence : 0.5738
## Balanced Accuracy : 0.7850
##
## 'Positive' Class : 1
##
# Posterior probability of the second class for each held-out row.
lda_posterior <- predict(lda_model, newdata = test_data)$posterior
lda_probs <- lda_posterior[, 2]
6. Evaluasi & Visualisasi
Visualisasi Fitur Penting
# Horizontal bar chart of the logistic-regression coefficients, intercept
# excluded, ordered by coefficient value.
log_coefs <- summary(log_model)$coefficients[-1, "Estimate"]
coef_df <- data.frame(Variable = names(log_coefs), Coefficient = log_coefs)
library(ggplot2)
ggplot(
  data = coef_df,
  mapping = aes(x = reorder(Variable, Coefficient), y = Coefficient)
) +
  geom_col(fill = "steelblue") +
  coord_flip() +
  labs(title = "Koefisien Fitur - Logistic Regression", x = "Fitur", y = "Koefisien")

# ROC curves for both classifiers on the held-out partition.
library(pROC)

# Build the ROC objects and AUC values that the plots below use; they are not
# created anywhere else in this script. The predictors are the test-set
# probabilities computed earlier (log_probs for the logistic model, lda_probs
# for LDA). Class "0" is the control and "1" the case, and direction = "<"
# states that cases are expected to score higher than controls.
roc_log <- roc(
  response = test_data$satisfaction,
  predictor = log_probs,
  levels = c("0", "1"),
  direction = "<"
)
roc_lda <- roc(
  response = test_data$satisfaction,
  predictor = lda_probs,
  levels = c("0", "1"),
  direction = "<"
)
auc_log <- as.numeric(auc(roc_log))
auc_lda <- as.numeric(auc(roc_lda))

# Both ROC curves on one chart, with the AUC of each model in the legend.
plot(roc_log, col = "blue", lwd = 2, main = "ROC Curve Logistic vs LDA", legacy.axes = TRUE)
lines(roc_lda, col = "red", lwd = 2)
legend("bottomright",
       legend = c(paste("Logistic AUC =", round(auc_log, 3)),
                  paste("LDA AUC =", round(auc_lda, 3))),
       col = c("blue", "red"),
       lwd = 2)

# Separate ROC plots
# Logistic regression ROC
plot(roc_log, col = "blue", main = "ROC Curve - Logistic Regression", legacy.axes = TRUE)

# LDA ROC
plot(roc_lda, col = "red", main = "ROC Curve - LDA", legacy.axes = TRUE)
