# 1. Install dan Muat Package
# install.packages("nnet") # Hapus tanda '#' jika package belum terinstal
# install.packages("tidyverse") # Untuk manipulasi data
library(nnet)
library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr 1.1.4 ✔ readr 2.1.5
## ✔ forcats 1.0.1 ✔ stringr 1.5.2
## ✔ ggplot2 4.0.1 ✔ tibble 3.3.0
## ✔ lubridate 1.9.4 ✔ tidyr 1.3.1
## ✔ purrr 1.1.0
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
# 2. Load the data
# The workbook is an Excel file read from a hard-coded absolute path; adjust
# the path when running on another machine.
library(readxl)
data_hotel <- read_excel(
  path = "C:/Users/tengk/Downloads/data_multinom.xlsx"
)
## New names:
## • `` -> `...8`
## • `` -> `...9`
# Show the imported table for a quick visual check.
data_hotel
# 3. Check and clean the variables
# Drop rows with a missing customer_type or country, then store the four
# categorical columns as factors.
# NOTE(review): drop_na() removes NA values only; if missing countries are
# stored as the literal text "NULL", those rows are kept. Confirm against the
# raw data.
data_hotel <- data_hotel %>%
  drop_na(customer_type, country) %>%
  mutate(
    across(c(customer_type, deposit_type, meal, country), as.factor)
  )
# 4. Handling the 'country' variable
# 'country' has too many categories (more than 100 according to the original
# author's note), which would make the model overly complex and hard to
# interpret. It is collapsed to the five most frequent countries plus a
# catch-all "Other_Country" value.

# Find the five most frequent countries. slice_max() replaces the superseded
# top_n(); like top_n() it keeps ties, so more than five countries can be
# returned when counts are tied at the cut-off.
top_countries <- data_hotel %>%
  count(country) %>%
  slice_max(order_by = n, n = 5) %>%
  pull(country)

# Build the condensed country variable: the country code itself for the top
# countries, "Other_Country" for everything else (character at this point).
data_hotel_clean <- data_hotel %>%
  mutate(
    country_grouped = if_else(
      country %in% top_countries,
      as.character(country),
      "Other_Country"
    )
  )
# Make 'Transient' the reference (baseline) level of the outcome and store the
# grouped country variable as a factor.
data_hotel_clean <- data_hotel_clean %>%
  mutate(
    customer_type = relevel(customer_type, ref = "Transient"),
    country_grouped = as.factor(country_grouped)
  )
# Check the levels of the dependent variable (customer_type).
print(levels(data_hotel_clean[["customer_type"]]))
## [1] "Transient" "Contract" "Group" "Transient-Party"
# Expected output: [1] "Transient" "Contract" "Group" "Transient-Party"
library(ggplot2)
# Bar plot: distribution of the categorical outcome (customer_type)
ggplot(data = data_hotel_clean, mapping = aes(x = customer_type)) +
  geom_bar() +
  labs(title = "Distribusi Customer Type")

# Box plot: numeric predictor (lead_time) across the outcome categories
ggplot(
  data = data_hotel_clean,
  mapping = aes(x = customer_type, y = lead_time)
) +
  geom_boxplot() +
  labs(title = "Lead Time vs Customer Type")

# Proportional stacked bars: outcome share within each deposit_type
ggplot(
  data = data_hotel_clean,
  mapping = aes(x = deposit_type, fill = customer_type)
) +
  geom_bar(position = "fill") +
  labs(title = "Proporsi Customer Type Berdasarkan Deposit Type")

# Jitter plot: lead_time against the outcome; points are semi-transparent
# to reduce overplotting
ggplot(
  data = data_hotel_clean,
  mapping = aes(x = lead_time, y = customer_type)
) +
  geom_jitter(alpha = 0.3) +
  labs(title = "Hubungan Lead Time dan Customer Type")
# 5. Fit the multinomial logistic regression model
# customer_type is modelled on lead_time, deposit_type and meal.
model_multinom <- multinom(
  formula = customer_type ~ lead_time + deposit_type + meal,
  data = data_hotel_clean,
  # Raised weight limit, leaving headroom when many categorical predictors
  # are used.
  MaxNWts = 10000
)
## # weights: 36 (24 variable)
## initial value 13861.557317
## iter 10 value 6712.741398
## iter 20 value 5743.531842
## iter 30 value 5710.305254
## iter 40 value 5709.769930
## iter 50 value 5709.727993
## final value 5709.727836
## converged
Ringkasan Koefisien Model
# Coefficient and standard-error tables of the fitted model
summary(object = model_multinom)
## Call:
## multinom(formula = customer_type ~ lead_time + deposit_type +
## meal, data = data_hotel_clean, MaxNWts = 10000)
##
## Coefficients:
## (Intercept) lead_time deposit_typeNon Refund
## Contract -4.013989 0.007097974 -2.955072
## Group -6.867487 0.010195050 -6.200084
## Transient-Party -2.368490 0.006197245 -2.744857
## deposit_typeRefundable mealFB mealHB mealSC
## Contract -6.30831846 -1.279515 0.6097505 1.910643
## Group 0.08438098 -6.116210 -0.5731621 4.160511
## Transient-Party 18.85994417 2.041346 1.0521860 3.159730
## mealUndefined
## Contract -7.487018
## Group -13.426999
## Transient-Party 3.407874
##
## Std. Errors:
## (Intercept) lead_time deposit_typeNon Refund
## Contract 0.10424660 0.0004979097 0.5087348208
## Group 0.37274718 0.0014746561 0.0005622585
## Transient-Party 0.05065217 0.0002714361 0.1490421570
## deposit_typeRefundable mealFB mealHB mealSC
## Contract 1.198351e-12 1.0031667314 0.12796605 0.7675904
## Group 5.284522e-10 0.0000298242 0.61089930 0.6703743
## Transient-Party 1.090951e-09 0.1320012976 0.06182913 0.4529346
## mealUndefined
## Contract 1.317620e-05
## Group 1.026026e-08
## Transient-Party 1.554517e-01
##
## Residual Deviance: 11419.46
## AIC: 11467.46
Uji Wald / Parsial
# Wald (partial) tests for each coefficient.
# The model summary is computed once and reused instead of calling summary()
# twice, which avoids repeating the standard-error computation.
model_summary <- summary(model_multinom)
# z-statistics: each coefficient divided by its standard error
z_stats <- model_summary$coefficients / model_summary$standard.errors
# Two-sided p-values taken from the upper tail of the standard normal.
# lower.tail = FALSE avoids the cancellation in 1 - pnorm(), which rounds to
# exactly 0 once |z| exceeds roughly 8.
p_values <- 2 * pnorm(abs(z_stats), mean = 0, sd = 1, lower.tail = FALSE)
# NOTE(review): the recorded summary shows standard errors near 1e-12 for
# deposit_typeRefundable and mealUndefined, which may indicate
# (quasi-)separation; treat the Wald p-values of those terms with caution.
# Display the p-values as a transposed data frame (one row per coefficient,
# one column per outcome level).
data.frame(t(p_values))
# Relative Risk Ratios (RRR): the exponentiated model coefficients
rrr <- model_multinom %>%
  coef() %>%
  exp()
# Print a banner, then the RRR table
cat("\n\n*** Relative Risk Ratios (RRR) ***\n")
##
##
## *** Relative Risk Ratios (RRR) ***
print(rrr)
## (Intercept) lead_time deposit_typeNon Refund
## Contract 0.01806120 1.007123 0.052074912
## Group 0.00104109 1.010247 0.002029261
## Transient-Party 0.09362195 1.006216 0.064257489
## deposit_typeRefundable mealFB mealHB mealSC
## Contract 1.821093e-03 0.278172171 1.839972 6.757433
## Group 1.088043e+00 0.002206804 0.563740 64.104246
## Transient-Party 1.551564e+08 7.700969984 2.863905 23.564242
## mealUndefined
## Contract 5.603114e-04
## Group 1.474783e-06
## Transient-Party 3.020097e+01
Interpretasi RRR (Relative Risk Ratio): koefisien yang dieksponensialkan (RRR) adalah cara termudah untuk menginterpretasikan hasil:
RRR > 1: Peningkatan satu unit pada prediktor akan meningkatkan risiko relatif kategori tersebut, dibandingkan dengan kategori dasar (Transient).
RRR < 1: Peningkatan satu unit pada prediktor akan menurunkan risiko relatif kategori tersebut, dibandingkan dengan kategori dasar (Transient).
Uji signifikansi model: untuk mengetahui apakah prediktor secara keseluruhan signifikan dalam memprediksi variabel dependen.
# Null model (intercept only), used as the baseline for the overall
# likelihood ratio test
model_null <- multinom(
  formula = customer_type ~ 1,
  data = data_hotel_clean
)
## # weights: 8 (3 variable)
## initial value 13861.557317
## iter 10 value 7647.625611
## final value 6799.647944
## converged
# Likelihood ratio test: compare the full model against the null model.
# test = "Chisq" is the default for this comparison and is spelled out here
# to match the per-variable tests.
lr_test <- anova(model_null, model_multinom, test = "Chisq")
cat("\n\n*** Uji Rasio Kemungkinan (ANOVA) ***\n")
##
##
## *** Uji Rasio Kemungkinan (ANOVA) ***
print(lr_test)
## Model Resid. df Resid. Dev Test Df LR stat.
## 1 1 29994 13599.30 NA NA
## 2 lead_time + deposit_type + meal 29973 11419.46 1 vs 2 21 2179.84
## Pr(Chi)
## 1 NA
## 2 0
Kecocokan model (model fit) dan goodness of fit — modul memakai pendekatan ini
# utils::install.packages("DescTools")  # run once if DescTools is not installed
library(DescTools)
# McFadden, Cox-Snell and Nagelkerke pseudo R-squared for the full model.
# The recorded warning below reports that PseudoR2() could not find the model
# or data element on the multinom object and therefore refits the null model
# from 'data_hotel_clean'; the data must not change between the fit and this call.
pseudo_r2_all <- PseudoR2(model_multinom, which = c("McFadden", "CoxSnell", "Nagelkerke"))
## Warning in PseudoR2(model_multinom, which = c("McFadden", "CoxSnell",
## "Nagelkerke")): Could not find model or data element of multinom object for
## evaluating PseudoR2 null model. Will fit null model with new evaluation of
## 'data_hotel_clean'. Ensure object has not changed since initial call, or try
## running multinom with 'model = TRUE'
print(pseudo_r2_all)
## McFadden CoxSnell Nagelkerke
## 0.1602907 0.1958792 0.2635066
Uji Independensi (masing-masing variabel)
# Lead time: refit the model without lead_time, then compare it with the
# full model using a likelihood ratio (chi-square) test
model_no_lead <- multinom(
  formula = customer_type ~ deposit_type + meal,
  data = data_hotel_clean
)
## # weights: 32 (21 variable)
## initial value 13861.557317
## iter 10 value 7081.901534
## iter 20 value 6263.460813
## iter 30 value 6049.995920
## iter 40 value 6039.606359
## iter 40 value 6039.606348
## iter 40 value 6039.606348
## final value 6039.606348
## converged
anova(model_no_lead, model_multinom, test = "Chisq")
# Deposit type: refit the model without deposit_type, then compare it with
# the full model using a likelihood ratio (chi-square) test
model_no_deposit <- multinom(
  formula = customer_type ~ lead_time + meal,
  data = data_hotel_clean
)
## # weights: 28 (18 variable)
## initial value 13861.557317
## iter 10 value 6968.926215
## iter 20 value 6143.326295
## iter 30 value 6135.751640
## final value 6135.750981
## converged
anova(model_no_deposit, model_multinom, test = "Chisq")
# Meal: refit the model without meal, then compare it with the full model
# using a likelihood ratio (chi-square) test
model_no_meal <- multinom(
  formula = customer_type ~ lead_time + deposit_type,
  data = data_hotel_clean
)
## # weights: 20 (12 variable)
## initial value 13861.557317
## iter 10 value 6716.416089
## iter 20 value 6225.987523
## final value 6225.738787
## converged
anova(model_no_meal, model_multinom, test = "Chisq")
Jika p-value <= 0,05, variabel berpengaruh signifikan.