R Notebook

# 1. Install dan Muat Package
# install.packages("nnet") # Hapus tanda '#' jika package belum terinstal
# install.packages("tidyverse") # Untuk manipulasi data
library(nnet)
library(tidyverse)

## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr     1.1.4     ✔ readr     2.1.5
## ✔ forcats   1.0.1     ✔ stringr   1.5.2
## ✔ ggplot2   4.0.1     ✔ tibble    3.3.0
## ✔ lubridate 1.9.4     ✔ tidyr     1.3.1
## ✔ purrr     1.1.0     
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors

# 2. Load the data.
# The source is an Excel workbook (not a CSV) read from an absolute path;
# change the path below to wherever the file lives on your machine.
library(readxl)
data_hotel <- read_excel(path = "C:/Users/tengk/Downloads/data_multinom.xlsx")

## New names:
## • `` -> `...8`
## • `` -> `...9`

# Auto-print the imported data for a quick visual check of columns and types
data_hotel

# 3. Variable checks and cleaning
# Step 1: drop rows whose customer_type or country is missing (NA).
# NOTE(review): drop_na() only removes real NA values. If missing countries
# are stored as the literal text "NULL" in the workbook, those rows are kept
# -- confirm against the raw data.
# Step 2: store every categorical variable as a factor.
data_hotel <- data_hotel %>%
  drop_na(customer_type, country) %>%
  mutate(
    across(c(customer_type, deposit_type, meal, country), as.factor)
  )

# 4. Handling the 'country' variable
# The original author notes that 'country' has too many categories (more than
# 100), which would make the model overly complex and hard to interpret.
# The categories are therefore reduced to the 5 most frequent countries plus
# a catch-all "Other_Country" level.

# Find the 5 countries with the most observations.
# slice_max() replaces the superseded top_n(); with_ties = TRUE keeps the
# same tie handling (countries tied for 5th place are all retained).
top_countries <- data_hotel %>%
  count(country) %>%
  slice_max(order_by = n, n = 5, with_ties = TRUE) %>%
  pull(country)

# Build the condensed country variable (character at this point).
# NOTE(review): country_grouped is not used in any model formula visible in
# this file -- confirm whether it was meant to be a predictor.
data_hotel_clean <- data_hotel %>%
  mutate(
    country_grouped = if_else(
      country %in% top_countries,
      as.character(country),
      "Other_Country"
    )
  )

# Make "Transient" the reference (baseline) level of the response and store
# the condensed country variable as a factor.
data_hotel_clean <- data_hotel_clean %>%
  mutate(
    customer_type = relevel(customer_type, ref = "Transient"),
    country_grouped = as.factor(country_grouped)
  )

# Check the level order of the dependent variable (baseline comes first)
print(levels(data_hotel_clean$customer_type))

## [1] "Transient"       "Contract"        "Group"           "Transient-Party"

# Output yang diharapkan: [1] "Transient" "Contract" "Group" "Transient-Party"

library(ggplot2)

# Bar plot: distribution of the categorical response (Y)
ggplot(data = data_hotel_clean, mapping = aes(x = customer_type)) +
  geom_bar() +
  labs(title = "Distribusi Customer Type")

# Box plot: numeric predictor (lead_time) against the categorical response
ggplot(
  data = data_hotel_clean,
  mapping = aes(x = customer_type, y = lead_time)
) +
  geom_boxplot() +
  labs(title = "Lead Time vs Customer Type")

# Stacked proportion bar plot: categorical predictor (deposit_type) split by
# the response
ggplot(
  data = data_hotel_clean,
  mapping = aes(x = deposit_type, fill = customer_type)
) +
  geom_bar(position = "fill") +
  labs(title = "Proporsi Customer Type Berdasarkan Deposit Type")

# Jitter plot: numeric predictor on the x axis, response on the y axis
ggplot(
  data = data_hotel_clean,
  mapping = aes(x = lead_time, y = customer_type)
) +
  geom_jitter(alpha = 0.3) +
  labs(title = "Hubungan Lead Time dan Customer Type")

## 5. Fit the multinomial logit model
# Response: customer_type (baseline "Transient");
# predictors: lead_time, deposit_type and meal.
# MaxNWts raises the allowed number of weights, which matters when there are
# many categorical predictors; this fit reports only 36 weights.
model_multinom <- multinom(
  formula = customer_type ~ lead_time + deposit_type + meal,
  data = data_hotel_clean,
  MaxNWts = 10000
)

## # weights:  36 (24 variable)
## initial  value 13861.557317 
## iter  10 value 6712.741398
## iter  20 value 5743.531842
## iter  30 value 5710.305254
## iter  40 value 5709.769930
## iter  50 value 5709.727993
## final  value 5709.727836 
## converged

Ringkasan Koefisien Model

# Model summary: prints the call, the coefficient and standard-error tables
# (one row per non-baseline customer type), the residual deviance and the AIC
summary(model_multinom)

## Call:
## multinom(formula = customer_type ~ lead_time + deposit_type + 
##     meal, data = data_hotel_clean, MaxNWts = 10000)
## 
## Coefficients:
##                 (Intercept)   lead_time deposit_typeNon Refund
## Contract          -4.013989 0.007097974              -2.955072
## Group             -6.867487 0.010195050              -6.200084
## Transient-Party   -2.368490 0.006197245              -2.744857
##                 deposit_typeRefundable    mealFB     mealHB   mealSC
## Contract                   -6.30831846 -1.279515  0.6097505 1.910643
## Group                       0.08438098 -6.116210 -0.5731621 4.160511
## Transient-Party            18.85994417  2.041346  1.0521860 3.159730
##                 mealUndefined
## Contract            -7.487018
## Group              -13.426999
## Transient-Party      3.407874
## 
## Std. Errors:
##                 (Intercept)    lead_time deposit_typeNon Refund
## Contract         0.10424660 0.0004979097           0.5087348208
## Group            0.37274718 0.0014746561           0.0005622585
## Transient-Party  0.05065217 0.0002714361           0.1490421570
##                 deposit_typeRefundable       mealFB     mealHB    mealSC
## Contract                  1.198351e-12 1.0031667314 0.12796605 0.7675904
## Group                     5.284522e-10 0.0000298242 0.61089930 0.6703743
## Transient-Party           1.090951e-09 0.1320012976 0.06182913 0.4529346
##                 mealUndefined
## Contract         1.317620e-05
## Group            1.026026e-08
## Transient-Party  1.554517e-01
## 
## Residual Deviance: 11419.46 
## AIC: 11467.46

Uji Wald (uji parsial)

# Wald (partial) tests: z = coefficient / standard error, for every
# coefficient of every non-baseline outcome level.
# summary() is evaluated once and reused for both matrices instead of being
# recomputed for the numerator and the denominator.
model_summary <- summary(model_multinom)
z_stats <- model_summary$coefficients / model_summary$standard.errors

# Two-sided p-values taken from the upper tail of the standard normal.
# lower.tail = FALSE avoids the cancellation in 1 - pnorm(|z|), which rounds
# to exactly 0 once |z| is large.
p_values <- 2 * pnorm(abs(z_stats), lower.tail = FALSE)

# NOTE(review): several standard errors printed by summary() are around 1e-9
# or smaller (e.g. deposit_typeRefundable, mealUndefined). That looks like
# sparse cells / separation, in which case these Wald p-values are not
# reliable -- confirm with the likelihood-ratio tests.

# Display the p-values transposed: one row per coefficient, one column per
# outcome level
data.frame(t(p_values))

# Relative risk ratios (RRR): the exponentiated model coefficients
rrr <- model_multinom %>%
  coef() %>%
  exp()

# Section header for the RRR table
cat("\n\n*** Relative Risk Ratios (RRR) ***\n")

## 
## 
## *** Relative Risk Ratios (RRR) ***

# Print the relative risk ratio matrix stored in rrr
print(rrr)

##                 (Intercept) lead_time deposit_typeNon Refund
## Contract         0.01806120  1.007123            0.052074912
## Group            0.00104109  1.010247            0.002029261
## Transient-Party  0.09362195  1.006216            0.064257489
##                 deposit_typeRefundable      mealFB   mealHB    mealSC
## Contract                  1.821093e-03 0.278172171 1.839972  6.757433
## Group                     1.088043e+00 0.002206804 0.563740 64.104246
## Transient-Party           1.551564e+08 7.700969984 2.863905 23.564242
##                 mealUndefined
## Contract         5.603114e-04
## Group            1.474783e-06
## Transient-Party  3.020097e+01

Interpretasi RRR (Relative Risk Ratio): koefisien yang dieksponensialkan (RRR) adalah cara termudah untuk menginterpretasikan hasil:

RRR > 1: Peningkatan satu unit pada prediktor akan meningkatkan risiko relatif kategori tersebut, dibandingkan dengan kategori dasar (Transient).

RRR < 1: Peningkatan satu unit pada prediktor akan menurunkan risiko relatif kategori tersebut, dibandingkan dengan kategori dasar (Transient).

Uji signifikansi model: untuk mengetahui apakah prediktor secara keseluruhan signifikan dalam memprediksi variabel dependen.

# Null model: intercept only, used as the baseline for the overall test
model_null <- multinom(
  formula = customer_type ~ 1,
  data = data_hotel_clean
)

## # weights:  8 (3 variable)
## initial  value 13861.557317 
## iter  10 value 7647.625611
## final  value 6799.647944 
## converged

# Likelihood-ratio test of the full model against the null model.
# test = "Chisq" is the default; it is spelled out to match the per-variable
# tests further down.
lr_test <- anova(model_null, model_multinom, test = "Chisq")

# Section header for the test table
cat("\n\n*** Uji Rasio Kemungkinan (ANOVA) ***\n")

## 
## 
## *** Uji Rasio Kemungkinan (ANOVA) ***

# Print the likelihood-ratio test table stored in lr_test
print(lr_test)

##                             Model Resid. df Resid. Dev   Test    Df LR stat.
## 1                               1     29994   13599.30           NA       NA
## 2 lead_time + deposit_type + meal     29973   11419.46 1 vs 2    21  2179.84
##   Pr(Chi)
## 1      NA
## 2       0

Model fit dan goodness of fit: modul menggunakan ukuran pseudo-R² berikut.

# Uncomment to install DescTools if it is not available:
# utils::install.packages("DescTools")
library(DescTools) 
# Pseudo R-squared measures (McFadden, Cox-Snell, Nagelkerke) for the full
# model. PseudoR2() warns here that it cannot find the model/data element and
# will refit the null model from 'data_hotel_clean'; the warning itself
# suggests fitting multinom with 'model = TRUE' to avoid that.
pseudo_r2_all <- PseudoR2(model_multinom, which = c("McFadden",  "CoxSnell", "Nagelkerke"))

## Warning in PseudoR2(model_multinom, which = c("McFadden", "CoxSnell",
## "Nagelkerke")): Could not find model or data element of multinom object for
## evaluating PseudoR2 null model. Will fit null model with new evaluation of
## 'data_hotel_clean'. Ensure object has not changed since initial call, or try
## running multinom with 'model = TRUE'

# Print the three pseudo R-squared values stored in pseudo_r2_all
print(pseudo_r2_all)

##   McFadden   CoxSnell Nagelkerke 
##  0.1602907  0.1958792  0.2635066

Uji independensi (masing-masing variabel)

# Lead time: refit the model without lead_time
model_no_lead <- multinom(
  formula = customer_type ~ deposit_type + meal,
  data = data_hotel_clean
)

## # weights:  32 (21 variable)
## initial  value 13861.557317 
## iter  10 value 7081.901534
## iter  20 value 6263.460813
## iter  30 value 6049.995920
## iter  40 value 6039.606359
## iter  40 value 6039.606348
## iter  40 value 6039.606348
## final  value 6039.606348 
## converged

# LR test for lead_time: reduced model (no lead_time) vs. the full model
anova(model_no_lead, model_multinom, test = "Chisq")

# Deposit type: refit the model without deposit_type
model_no_deposit <- multinom(
  formula = customer_type ~ lead_time + meal,
  data = data_hotel_clean
)

## # weights:  28 (18 variable)
## initial  value 13861.557317 
## iter  10 value 6968.926215
## iter  20 value 6143.326295
## iter  30 value 6135.751640
## final  value 6135.750981 
## converged

# LR test for deposit_type: reduced model (no deposit_type) vs. the full model
anova(model_no_deposit, model_multinom, test = "Chisq")

# Meal: refit the model without meal
model_no_meal <- multinom(
  formula = customer_type ~ lead_time + deposit_type,
  data = data_hotel_clean
)

## # weights:  20 (12 variable)
## initial  value 13861.557317 
## iter  10 value 6716.416089
## iter  20 value 6225.987523
## final  value 6225.738787 
## converged

# LR test for meal: reduced model (no meal) vs. the full model
anova(model_no_meal, model_multinom, test = "Chisq")

Jika p-value ≤ 0,05, variabel tersebut berpengaruh signifikan.