library(dplyr)
library(tidyr)
library(ggplot2)
library(knitr)
library(kableExtra)
library(scales)
library(nnet)    
library(MASS)    

1. Regresi Logistik Multinomial

1.1 Latar Belakang & Data

Dataset: Online Shoppers Purchasing Intention
Sumber: UCI Machine Learning Repository
Jumlah observasi: 12.330 sesi kunjungan

Dataset ini berisi data perilaku pengunjung situs e-commerce selama satu sesi kunjungan. Ditujukan untuk mengidentifikasi faktor-faktor yang berkaitan dengan jenis pengunjung (New Visitor, Returning Visitor, atau Other) berdasarkan aktivitas mereka di website.

Variabel Jenis Keterangan
VisitorType Y (respons) Jenis pengunjung: New_Visitor, Returning_Visitor, Other — nominal, 3 kategori, tanpa urutan
BounceRates X kontinu Proporsi sesi yang langsung meninggalkan halaman
ExitRates X kontinu Proporsi halaman yang menjadi halaman terakhir
PageValues X kontinu Nilai rata-rata halaman yang dikunjungi sebelum transaksi
ProductRelated X kontinu Jumlah halaman produk yang dikunjungi
Administrative X kontinu Jumlah halaman administratif yang dikunjungi
Weekend X biner 1 = kunjungan saat akhir pekan
Revenue X biner 1 = sesi menghasilkan transaksi
df_shop <- read.csv("C:/Users/Zahra Putri/Downloads/ADKREGRESI/online_shoppers_clean.csv", stringsAsFactors = FALSE)

df_shop$VisitorType <- factor(df_shop$VisitorType,
                               levels = c("New_Visitor", "Returning_Visitor", "Other"))
df_shop$Weekend <- as.integer(df_shop$Weekend)
df_shop$Revenue <- as.integer(df_shop$Revenue)

glimpse(df_shop)
## Rows: 12,330
## Columns: 11
## $ VisitorType    <fct> Returning_Visitor, Returning_Visitor, Returning_Visitor…
## $ Administrative <int> 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 2, 0, 0, 0…
## $ Informational  <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ ProductRelated <int> 1, 2, 1, 2, 10, 19, 1, 0, 2, 3, 3, 16, 7, 6, 2, 23, 1, …
## $ BounceRates    <dbl> 0.200000000, 0.000000000, 0.200000000, 0.050000000, 0.0…
## $ ExitRates      <dbl> 0.200000000, 0.100000000, 0.200000000, 0.140000000, 0.0…
## $ PageValues     <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ SpecialDay     <dbl> 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.4, 0.0, 0.8, 0.4, 0.0, …
## $ Month          <chr> "Feb", "Feb", "Feb", "Feb", "Feb", "Feb", "Feb", "Feb",…
## $ Weekend        <int> 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0…
## $ Revenue        <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…

1.2 Distribusi Kategori Respons

df_shop %>%
  count(VisitorType) %>%
  mutate(proporsi = n / sum(n)) %>%
  kable(caption = "Distribusi Jenis Pengunjung", digits = 3) %>%
  kable_styling(bootstrap_options = c("striped","hover","condensed"), full_width = FALSE)
Distribusi Jenis Pengunjung
VisitorType n proporsi
New_Visitor 1694 0.137
Returning_Visitor 10551 0.856
Other 85 0.007
df_shop %>%
  count(VisitorType) %>%
  mutate(proporsi = n / sum(n)) %>%
  ggplot(aes(x = VisitorType, y = proporsi, fill = VisitorType)) +
  geom_col(width = 0.6, alpha = 0.92) +
  geom_text(aes(label = percent(proporsi, accuracy = 0.1)),
            vjust = -0.4, fontface = "bold") +
  scale_y_continuous(labels = percent_format(), limits = c(0, 0.95)) +
  scale_fill_manual(values = c("#8B475D", "#CD6889", "#FF82AB")) +
  labs(title = "Distribusi Jenis Pengunjung",
       subtitle = "Respons multinomial: 3 kategori nominal, tidak ada urutan alami.",
       x = NULL, y = "Proporsi") +
  theme_minimal(base_size = 12) +
  theme(legend.position = "none")

1.3 Estimasi Model Multinomial

fit_multi <- nnet::multinom(
  VisitorType ~ BounceRates + ExitRates + PageValues +
                ProductRelated + Administrative + Weekend + Revenue,
  data  = df_shop,
  trace = FALSE
)

summary(fit_multi)
## Call:
## nnet::multinom(formula = VisitorType ~ BounceRates + ExitRates + 
##     PageValues + ProductRelated + Administrative + Weekend + 
##     Revenue, data = df_shop, trace = FALSE)
## 
## Coefficients:
##                   (Intercept) BounceRates ExitRates   PageValues ProductRelated
## Returning_Visitor  -0.1310171   -11.94519  43.80824 -0.003853412   0.0475086463
## Other              -4.1512516   -10.99646  43.21727  0.012529515  -0.0002229309
##                   Administrative    Weekend    Revenue
## Returning_Visitor    -0.05446757 -0.2026824 -0.4818629
## Other                -0.01430138 -1.3055439 -0.2841169
## 
## Std. Errors:
##                   (Intercept) BounceRates ExitRates  PageValues ProductRelated
## Returning_Visitor  0.07342134    2.547536  2.003455 0.001464643    0.002081928
## Other              0.24746790    1.428562  1.649217 0.002890596    0.010481231
##                   Administrative    Weekend    Revenue
## Returning_Visitor    0.009851435 0.06364857 0.08317437
## Other                0.049615811 0.39963797 0.36010289
## 
## Residual Deviance: 9014.63 
## AIC: 9046.63

1.4 Tabel Hasil & Uji Signifikansi

multi_sum <- summary(fit_multi)

coef_long <- as.data.frame(multi_sum$coefficients) %>%
  tibble::rownames_to_column("kategori") %>%
  pivot_longer(-kategori, names_to = "variabel", values_to = "estimate")

se_long <- as.data.frame(multi_sum$standard.errors) %>%
  tibble::rownames_to_column("kategori") %>%
  pivot_longer(-kategori, names_to = "variabel", values_to = "std_error")

result_multi <- left_join(coef_long, se_long, by = c("kategori","variabel")) %>%
  mutate(
    z_value = estimate / std_error,
    p_value = 2 * (1 - pnorm(abs(z_value))),
    RRR     = exp(estimate),
    CI_low  = exp(estimate - 1.96 * std_error),
    CI_high = exp(estimate + 1.96 * std_error)
  )

result_multi %>%
  mutate(across(c(estimate, std_error, z_value, p_value, RRR, CI_low, CI_high),
                ~ round(.x, 4))) %>%
  kable(caption = "Ringkasan Model Regresi Logistik Multinomial",
        col.names = c("Kategori","Variabel","Estimate","SE","z","p-value",
                      "RRR","CI 2.5%","CI 97.5%")) %>%
  kable_styling(bootstrap_options = c("striped","hover","condensed"), full_width = TRUE)
Ringkasan Model Regresi Logistik Multinomial
Kategori Variabel Estimate SE z p-value RRR CI 2.5% CI 97.5%
Returning_Visitor (Intercept) -0.1310 0.0734 -1.7845 0.0743 8.772000e-01 7.596000e-01 1.013000e+00
Returning_Visitor BounceRates -11.9452 2.5475 -4.6889 0.0000 0.000000e+00 0.000000e+00 1.000000e-03
Returning_Visitor ExitRates 43.8082 2.0035 21.8663 0.0000 1.060909e+19 2.090752e+17 5.383363e+20
Returning_Visitor PageValues -0.0039 0.0015 -2.6310 0.0085 9.962000e-01 9.933000e-01 9.990000e-01
Returning_Visitor ProductRelated 0.0475 0.0021 22.8195 0.0000 1.048700e+00 1.044400e+00 1.052900e+00
Returning_Visitor Administrative -0.0545 0.0099 -5.5289 0.0000 9.470000e-01 9.289000e-01 9.655000e-01
Returning_Visitor Weekend -0.2027 0.0636 -3.1844 0.0015 8.165000e-01 7.208000e-01 9.250000e-01
Returning_Visitor Revenue -0.4819 0.0832 -5.7934 0.0000 6.176000e-01 5.247000e-01 7.270000e-01
Other (Intercept) -4.1513 0.2475 -16.7749 0.0000 1.570000e-02 9.700000e-03 2.560000e-02
Other BounceRates -10.9965 1.4286 -7.6976 0.0000 0.000000e+00 0.000000e+00 3.000000e-04
Other ExitRates 43.2173 1.6492 26.2047 0.0000 5.875163e+18 2.318344e+17 1.488888e+20
Other PageValues 0.0125 0.0029 4.3346 0.0000 1.012600e+00 1.006900e+00 1.018400e+00
Other ProductRelated -0.0002 0.0105 -0.0213 0.9830 9.998000e-01 9.794000e-01 1.020500e+00
Other Administrative -0.0143 0.0496 -0.2882 0.7732 9.858000e-01 8.944000e-01 1.086500e+00
Other Weekend -1.3055 0.3996 -3.2668 0.0011 2.710000e-01 1.238000e-01 5.932000e-01
Other Revenue -0.2841 0.3601 -0.7890 0.4301 7.527000e-01 3.716000e-01 1.524500e+00

1.5 Interpretasi

  • PageValues (Returning_Visitor vs New_Visitor): RRR = 0,9962, artinya setiap kenaikan 1 unit PageValues menurunkan peluang menjadi Returning_Visitor sebesar 0,38%.
  • ProductRelated (Returning_Visitor vs New_Visitor): RRR = 1,0487, artinya setiap kenaikan 1 unit ProductRelated meningkatkan peluang menjadi Returning_Visitor sebesar 4,87%.
  • Administrative (Returning_Visitor vs New_Visitor): RRR = 0,9470, artinya setiap kenaikan 1 unit Administrative menurunkan peluang menjadi Returning_Visitor sebesar 5,30%.
  • Weekend (Returning_Visitor vs New_Visitor): RRR = 0,8165, artinya pengunjung akhir pekan memiliki peluang menjadi Returning_Visitor 18,35% lebih rendah.
  • Revenue (Returning_Visitor vs New_Visitor): RRR = 0,6176, artinya pengunjung yang melakukan transaksi memiliki peluang menjadi Returning_Visitor 38,24% lebih rendah.
  • PageValues (Other vs New_Visitor): RRR = 1,0126, artinya setiap kenaikan 1 unit PageValues meningkatkan peluang masuk kategori Other sebesar 1,26%.
  • Weekend (Other vs New_Visitor): RRR = 0,2710, artinya pengunjung akhir pekan memiliki peluang masuk kategori Other 72,90% lebih rendah.

1.6 Prediksi Probabilitas Multinomial

pred_prob_multi <- predict(fit_multi, type = "probs")

data_multi_pred <- df_shop %>%
  bind_cols(as.data.frame(pred_prob_multi)) %>%
  mutate(
    prediksi = predict(fit_multi, type = "class")
  )

head(pred_prob_multi)
##   New_Visitor Returning_Visitor       Other
## 1 0.001819601         0.9801636 0.018016784
## 2 0.012616092         0.9724296 0.014954351
## 3 0.001819601         0.9801636 0.018016784
## 4 0.004007705         0.9805499 0.015442378
## 5 0.109432969         0.8873217 0.003245345
## 6 0.158924077         0.8350215 0.006054415
data_multi_pred %>%
  head() %>%
  kable(
    caption = "Hasil Prediksi Regresi Logistik Multinomial",
    digits = 4
  ) %>%
  kable_styling(
    bootstrap_options = c("striped","hover","condensed"),
    full_width = FALSE
  ) %>%
  scroll_box(
    width = "100%",
    height = "300px"
  )
Hasil Prediksi Regresi Logistik Multinomial
VisitorType Administrative Informational ProductRelated BounceRates ExitRates PageValues SpecialDay Month Weekend Revenue New_Visitor Returning_Visitor Other prediksi
Returning_Visitor 0 0 1 0.2000 0.2000 0 0 Feb 0 0 0.0018 0.9802 0.0180 Returning_Visitor
Returning_Visitor 0 0 2 0.0000 0.1000 0 0 Feb 0 0 0.0126 0.9724 0.0150 Returning_Visitor
Returning_Visitor 0 0 1 0.2000 0.2000 0 0 Feb 0 0 0.0018 0.9802 0.0180 Returning_Visitor
Returning_Visitor 0 0 2 0.0500 0.1400 0 0 Feb 0 0 0.0040 0.9805 0.0154 Returning_Visitor
Returning_Visitor 0 0 10 0.0200 0.0500 0 0 Feb 1 0 0.1094 0.8873 0.0032 Returning_Visitor
Returning_Visitor 0 0 19 0.0158 0.0246 0 0 Feb 0 0 0.1589 0.8350 0.0061 Returning_Visitor

1.7 Uji Model (LRT)

fit_multi_null <- nnet::multinom(VisitorType ~ 1, data = df_shop, trace = FALSE)

lrt_stat <- 2 * (as.numeric(logLik(fit_multi)) - as.numeric(logLik(fit_multi_null)))
df_lrt   <- length(coef(fit_multi)) - 2   # (K-1) * p_prediktor
p_lrt    <- 1 - pchisq(lrt_stat, df_lrt)

tibble::tibble(
  `G² (Devians)` = lrt_stat,
  `df`           = df_lrt,
  `p-value`      = p_lrt
) %>%
  mutate(across(c(`G² (Devians)`, `p-value`), ~ round(.x, 4))) %>%
  kable(caption = "Uji Kebaikan Model (Likelihood Ratio Test)") %>%
  kable_styling(bootstrap_options = c("striped","hover"), full_width = FALSE)
Uji Kebaikan Model (Likelihood Ratio Test)
G² (Devians) df p-value
1844.471 14 0

1.8 Confusion Matrix

conf_multi <- table(
  Aktual = df_shop$VisitorType,
  Prediksi = data_multi_pred$prediksi
)

conf_multi
##                    Prediksi
## Aktual              New_Visitor Returning_Visitor Other
##   New_Visitor                77              1616     1
##   Returning_Visitor          59             10492     0
##   Other                       5                80     0
accuracy_multi <- sum(diag(conf_multi)) /
  sum(conf_multi)

accuracy_multi
## [1] 0.8571776

1.9 Visualisasi Probabilitas Prediksi

grid_multi <- expand.grid(
  PageValues = seq(
    min(df_shop$PageValues),
    max(df_shop$PageValues),
    length.out = 100
  ),
  BounceRates = mean(df_shop$BounceRates),
  ExitRates = mean(df_shop$ExitRates),
  ProductRelated = mean(df_shop$ProductRelated),
  Administrative = mean(df_shop$Administrative),
  Weekend = 0,
  Revenue = 0
)

grid_prob_multi <- predict(
  fit_multi,
  newdata = grid_multi,
  type = "probs"
)

plot_multi <- grid_multi %>%
  bind_cols(as.data.frame(grid_prob_multi)) %>%
  pivot_longer(
    cols = c(
      "New_Visitor",
      "Returning_Visitor",
      "Other"
    ),
    names_to = "kategori",
    values_to = "prob"
  )

ggplot(
  plot_multi,
  aes(PageValues, prob, color = kategori)
) +
  geom_line(linewidth = 1.5) +
  scale_color_manual(
    values = c(
      "New_Visitor" = "#8B1C62",
      "Returning_Visitor" = "#EE30A7",
      "Other" = "#FF82AB"
    )
  ) +
  scale_y_continuous(labels = percent_format()) +
  labs(
    title = "Probabilitas Prediksi Jenis Pengunjung",
    x = "Page Values",
    y = "Probabilitas",
    color = "Kategori"
  ) +
  theme_minimal(base_size = 12) +
  theme(
    legend.position = "bottom",
    plot.title = element_text(face = "bold")
  )


2. Regresi Logistik Ordinal

2.1 Latar Belakang & Data

Dataset: Student Smoking, Alcohol, and Mental Health
Sumber: Mendeley
Jumlah observasi: 706 mahasiswa

Dataset ini memuat informasi mengenai kebiasaan merokok, konsumsi alkohol, strategi mengatasi stres, dan karakteristik mahasiswa yang dikaitkan dengan tingkat kesehatan psikologis (wellness). Tujuan analisis adalah untuk mengetahui faktor-faktor yang berhubungan dengan peningkatan atau penurunan tingkat kesejahteraan psikologis mahasiswa. Variabel respons diklasifikasikan ke dalam kategori berurutan mulai dari Poor/Below Average, Average/Good, hingga Excellent.

Variabel Jenis Keterangan
wellness_3 Y (respons) Kesehatan psikologis: Poor/Below average < Average/Good < Excellentordinal, 3 kategori
freq_smoking X kategorik Frekuensi merokok (5 level)
freq_alcohol2 X kategorik Frekuensi konsumsi alkohol (4 level; Monthly & Weekly digabung)
cope_stress2 X kategorik Cara mengatasi stres (4 level; Meditation, Other, Socializing digabung)
age_group X kategorik Kelompok usia
year_study X kategorik Tahun studi
df_stud <- read.csv("C:/Users/Zahra Putri/Downloads/ADKREGRESI/student_clean.csv", stringsAsFactors = FALSE)


df_stud$wellness_3 <- dplyr::case_when(
  df_stud$wellness %in% c("Poor","Below average") ~ "Poor/Below average",
  df_stud$wellness %in% c("Average","Good")        ~ "Average/Good",
  df_stud$wellness == "Excellent"                  ~ "Excellent"
)
df_stud$wellness_3 <- ordered(df_stud$wellness_3,
  levels = c("Poor/Below average","Average/Good","Excellent"))


df_stud$freq_alcohol2 <- ifelse(
  df_stud$freq_alcohol %in% c("Monthly","Weekly"), "Monthly/Weekly",
  df_stud$freq_alcohol
)
df_stud$freq_alcohol2 <- factor(df_stud$freq_alcohol2,
  levels = c("Don't consume alcohol","Only on social occasions",
             "Monthly/Weekly","Daily"))


df_stud$cope_stress2 <- ifelse(
  df_stud$cope_stress %in% c("Meditation","Other","Socializing"), "Other",
  df_stud$cope_stress
)
df_stud$cope_stress2 <- factor(df_stud$cope_stress2,
  levels = c("Drinking alcohol","Smoking","Exercise","Other"))


df_stud$freq_smoking <- factor(df_stud$freq_smoking,
  levels = c("Don't smoke","Only on social occasions","Monthly","Weekly","Daily"))
df_stud$age_group <- factor(df_stud$age_group,
  levels = c("18-20","21-23","24-26","27 or older"))
df_stud$year_study <- factor(df_stud$year_study,
  levels = c("1st year","2nd year","3rd year","4th year","Graduate student"))

glimpse(df_stud)
## Rows: 706
## Columns: 15
## $ wellness           <chr> "Good", "Good", "Good", "Excellent", "Excellent", "…
## $ freq_smoking       <fct> NA, Daily, Daily, Daily, Weekly, Daily, Daily, Mont…
## $ freq_alcohol       <chr> "Don’t consume alcohol", "Don’t consume alcohol", "…
## $ help_seeking       <chr> "No", "Yes", "Yes", "Yes", "Yes", "Yes", "Yes", "Ye…
## $ cope_stress        <chr> "Other", "Smoking", "Drinking alcohol", "Exercise",…
## $ age_group          <fct> 24-26, 24-26, 27 or older, 21-23, 18-20, 18-20, 18-…
## $ year_study         <fct> Graduate student, Graduate student, 4th year, Gradu…
## $ wellness_num       <int> 4, 4, 4, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, …
## $ wellness_3         <ord> Average/Good, Average/Good, Average/Good, Excellent…
## $ cope_stress2       <fct> Other, Smoking, Drinking alcohol, Exercise, Drinkin…
## $ freq_alcohol2      <fct> NA, NA, Only on social occasions, Daily, Only on so…
## $ cope_stress_clean  <chr> "Other", "Substance_use", "Substance_use", "Exercis…
## $ freq_alcohol_clean <chr> "Don’t consume alcohol", "Don’t consume alcohol", "…
## $ smoker             <chr> "Non_smoker", "Smoker", "Smoker", "Smoker", "Smoker…
## $ drinker            <chr> "Non_drinker", "Non_drinker", "Drinker", "Drinker",…

2.2 Distribusi Kategori Respons

df_stud %>%
  count(wellness_3) %>%
  mutate(proporsi = n / sum(n)) %>%
  kable(caption = "Distribusi Tingkat Kesehatan Psikologis (wellness_3)", digits = 3) %>%
  kable_styling(bootstrap_options = c("striped","hover","condensed"), full_width = FALSE)
Distribusi Tingkat Kesehatan Psikologis (wellness_3)
wellness_3 n proporsi
Poor/Below average 589 0.834
Average/Good 35 0.050
Excellent 82 0.116
df_stud %>%
  count(wellness_3) %>%
  mutate(proporsi = n / sum(n)) %>%
  ggplot(aes(x = wellness_3, y = proporsi, fill = wellness_3)) +
  geom_col(width = 0.65, alpha = 0.92) +
  geom_text(aes(label = percent(proporsi, accuracy = 0.1)),
            vjust = -0.4, fontface = "bold") +
  scale_y_continuous(labels = percent_format(), limits = c(0, 0.90)) +
  scale_fill_manual(values = c("#D02090","#EE3A8C","#FF3E96")) +
  labs(title = "Distribusi Tingkat Kesehatan Psikologis",
       subtitle = "Respons ordinal: Poor/Below average → Average/Good → Excellent.",
       x = NULL, y = "Proporsi") +
  theme_minimal(base_size = 12) +
  theme(legend.position = "none")

2.3 Estimasi Model Ordinal

fit_ord <- MASS::polr(
  wellness_3 ~ freq_smoking + freq_alcohol2 + cope_stress2 +
               age_group + year_study,
  data   = df_stud,
  method = "logistic",
  Hess   = TRUE
)

summary(fit_ord)
## Call:
## MASS::polr(formula = wellness_3 ~ freq_smoking + freq_alcohol2 + 
##     cope_stress2 + age_group + year_study, data = df_stud, Hess = TRUE, 
##     method = "logistic")
## 
## Coefficients:
##                                          Value Std. Error  t value
## freq_smokingOnly on social occasions  -0.32403     1.5248 -0.21251
## freq_smokingMonthly                   -2.78986     2.0421 -1.36616
## freq_smokingWeekly                    -0.10085     1.0531 -0.09576
## freq_alcohol2Only on social occasions  0.42065     1.1204  0.37546
## freq_alcohol2Monthly/Weekly            0.13183     1.3559  0.09723
## cope_stress2Smoking                    1.82697     0.9056  2.01749
## cope_stress2Exercise                  11.47952     2.3913  4.80048
## cope_stress2Other                      8.29811     1.9297  4.30020
## age_group21-23                        -3.48180     1.5144 -2.29907
## age_group24-26                        -4.51354     1.6306 -2.76802
## age_group27 or older                  -2.23651     2.2024 -1.01549
## year_study2nd year                     1.80758     1.7868  1.01161
## year_study3rd year                     3.06678     1.7976  1.70600
## year_study4th year                     1.98616     1.7116  1.16043
## year_studyGraduate student             0.02569     2.2474  0.01143
## 
## Intercepts:
##                                 Value   Std. Error t value
## Poor/Below average|Average/Good  3.8616  1.2105     3.1901
## Average/Good|Excellent           6.5672  1.5059     4.3610
## 
## Residual Deviance: 76.03743 
## AIC: 110.0374 
## (96 observations deleted due to missingness)

2.4 Tabel Hasil & Uji Signifikansi

ord_coef <- coef(summary(fit_ord))

result_ord <- as.data.frame(ord_coef) %>%
  tibble::rownames_to_column("parameter") %>%
  dplyr::rename(estimate = Value, std_error = `Std. Error`, t_value = `t value`) %>%
  mutate(
    p_value        = 2 * (1 - pnorm(abs(t_value))),
    jenis          = ifelse(grepl("\\|", parameter), "Cutpoint", "Koefisien"),
    OR_naik        = ifelse(jenis == "Koefisien", exp(-estimate), NA_real_),
    CI_low_naik    = ifelse(jenis == "Koefisien", exp(-estimate - 1.96 * std_error), NA_real_),
    CI_high_naik   = ifelse(jenis == "Koefisien", exp(-estimate + 1.96 * std_error), NA_real_)
  )

result_ord %>%
  mutate(across(c(estimate, std_error, t_value, p_value, OR_naik, CI_low_naik, CI_high_naik),
                ~ round(.x, 4))) %>%
  kable(caption = "Ringkasan Model Regresi Logistik Ordinal (Proportional Odds)",
        col.names = c("Parameter","Estimate (polr)","SE","t-value","p-value",
                      "Jenis","OR (naik)","CI 2.5%","CI 97.5%")) %>%
  kable_styling(bootstrap_options = c("striped","hover","condensed"), full_width = TRUE) %>%
  row_spec(which(result_ord$jenis == "Cutpoint"), background = "#f0f0f0")
Ringkasan Model Regresi Logistik Ordinal (Proportional Odds)
Parameter Estimate (polr) SE t-value p-value Jenis OR (naik) CI 2.5% CI 97.5%
freq_smokingOnly on social occasions -0.3240 1.5248 -0.2125 0.8317 Koefisien 1.3827 0.0696 27.4591
freq_smokingMonthly -2.7899 2.0421 -1.3662 0.1719 Koefisien 16.2787 0.2974 891.0659
freq_smokingWeekly -0.1008 1.0531 -0.0958 0.9237 Koefisien 1.1061 0.1404 8.7143
freq_alcohol2Only on social occasions 0.4207 1.1204 0.3755 0.7073 Koefisien 0.6566 0.0731 5.9017
freq_alcohol2Monthly/Weekly 0.1318 1.3559 0.0972 0.9225 Koefisien 0.8765 0.0615 12.4997
cope_stress2Smoking 1.8270 0.9056 2.0175 0.0436 Koefisien 0.1609 0.0273 0.9493
cope_stress2Exercise 11.4795 2.3913 4.8005 0.0000 Koefisien 0.0000 0.0000 0.0011
cope_stress2Other 8.2981 1.9297 4.3002 0.0000 Koefisien 0.0002 0.0000 0.0109
age_group21-23 -3.4818 1.5144 -2.2991 0.0215 Koefisien 32.5183 1.6711 632.7666
age_group24-26 -4.5135 1.6306 -2.7680 0.0056 Koefisien 91.2446 3.7343 2229.4850
age_group27 or older -2.2365 2.2024 -1.0155 0.3099 Koefisien 9.3606 0.1249 701.4864
year_study2nd year 1.8076 1.7868 1.0116 0.3117 Koefisien 0.1641 0.0049 5.4445
year_study3rd year 3.0668 1.7976 1.7060 0.0880 Koefisien 0.0466 0.0014 1.5787
year_study4th year 1.9862 1.7116 1.1604 0.2459 Koefisien 0.1372 0.0048 3.9295
year_studyGraduate student 0.0257 2.2474 0.0114 0.9909 Koefisien 0.9746 0.0119 79.7796
Poor/Below average&#124;Average/Good 3.8616 1.2105 3.1901 0.0014 Cutpoint NA NA NA
Average/Good&#124;Excellent 6.5672 1.5059 4.3610 0.0000 Cutpoint NA NA NA

2.5 Interpretasi

  • cope_stress = Smoking: OR = 0,1609, artinya mahasiswa yang mengatasi stres dengan merokok memiliki odds berada pada kategori wellness yang lebih tinggi 83,91% lebih rendah dibanding kategori referensi.
  • cope_stress = Exercise: OR ≈ 0,0000, artinya mahasiswa yang mengatasi stres dengan olahraga memiliki odds berada pada kategori wellness yang lebih tinggi jauh lebih rendah dibanding kategori referensi.
  • cope_stress = Other: OR = 0,0002, artinya mahasiswa yang menggunakan cara lain untuk mengatasi stres memiliki odds berada pada kategori wellness yang lebih tinggi 99,98% lebih rendah dibanding kategori referensi.
  • age_group 21–23: OR = 32,5183, artinya mahasiswa usia 21–23 tahun memiliki odds berada pada kategori wellness yang lebih tinggi 32,52 kali lebih besar dibanding kelompok usia referensi.
  • age_group 24–26: OR = 91,2446, artinya mahasiswa usia 24–26 tahun memiliki odds berada pada kategori wellness yang lebih tinggi 91,24 kali lebih besar dibanding kelompok usia referensi.

2.6 Prediksi Probabilitas Ordinal

pred_prob_ord <- predict(
  fit_ord,
  type = "probs"
)

head(pred_prob_ord)
##   Poor/Below average Average/Good   Excellent
## 3       0.9756678240  0.022668194 0.001663982
## 4       0.0153407378  0.173703845 0.810955418
## 5       0.6165745499  0.343522633 0.039902817
## 6       0.0004913311  0.006810168 0.992698501
## 7       0.0004913311  0.006810168 0.992698501
## 8       0.0079386458  0.098991531 0.893069823
pred_class_ord <- predict(
  fit_ord,
  type = "class"
)

data_ord_pred <- model.frame(fit_ord) %>%
  bind_cols(
    as.data.frame(pred_prob_ord)
  ) %>%
  mutate(
    prediksi = pred_class_ord
  )

data_ord_pred %>%
  head() %>%
  kable(
    caption = "Hasil Prediksi Regresi Logistik Ordinal",
    digits = 4
  ) %>%
  kable_styling(
    bootstrap_options = c(
      "striped",
      "hover",
      "condensed"
    ),
    full_width = FALSE
  ) %>%
  scroll_box(
    width = "100%",
    height = "300px"
  )
Hasil Prediksi Regresi Logistik Ordinal
wellness_3 freq_smoking freq_alcohol2 cope_stress2 age_group year_study Poor/Below average Average/Good Excellent prediksi
3 Average/Good Daily Only on social occasions Drinking alcohol 27 or older 4th year 0.9757 0.0227 0.0017 Poor/Below average
4 Excellent Daily Daily Exercise 21-23 Graduate student 0.0153 0.1737 0.8110 Excellent
5 Excellent Weekly Only on social occasions Drinking alcohol 18-20 3rd year 0.6166 0.3435 0.0399 Poor/Below average
6 Excellent Daily Daily Exercise 18-20 1st year 0.0005 0.0068 0.9927 Excellent
7 Excellent Daily Daily Exercise 18-20 1st year 0.0005 0.0068 0.9927 Excellent
8 Excellent Monthly Daily Exercise 18-20 1st year 0.0079 0.0990 0.8931 Excellent

2.7 Uji Model (LRT)

fit_ord_null <- MASS::polr(wellness_3 ~ 1, data = df_stud,
                            method = "logistic", Hess = TRUE)

lrt_ord  <- 2 * (as.numeric(logLik(fit_ord)) - as.numeric(logLik(fit_ord_null)))
df_ord   <- length(coef(fit_ord))
p_ord    <- 1 - pchisq(lrt_ord, df_ord)

tibble::tibble(
  `G² (Devians)` = lrt_ord,
  `df`           = df_ord,
  `p-value`      = p_ord
) %>%
  mutate(across(c(`G² (Devians)`,`p-value`), ~ round(.x, 4))) %>%
  kable(caption = "Uji Kebaikan Model Ordinal (LRT vs Model Null)") %>%
  kable_styling(bootstrap_options = c("striped","hover"), full_width = FALSE)
Uji Kebaikan Model Ordinal (LRT vs Model Null)
G² (Devians) df p-value
700.7769 15 0

2.8 Confusion Matrix

conf_ord <- table(
  Aktual = data_ord_pred$wellness_3,
  Prediksi = data_ord_pred$prediksi
)

conf_ord
##                     Prediksi
## Aktual               Poor/Below average Average/Good Excellent
##   Poor/Below average                576            1         0
##   Average/Good                        6            2         2
##   Excellent                           2            0        21
accuracy_ord <- sum(diag(conf_ord)) /
  sum(conf_ord)

accuracy_ord
## [1] 0.9819672

3. Regresi Logistik Biner

3.1 Latar Belakang & Data

Dataset: Bank Marketing
Sumber: UCI Machine Learning Repository
Jumlah observasi: 4.521 nasabah

Dataset ini berasal dari kampanye pemasaran langsung suatu bank Portugis yang menawarkan produk deposito berjangka kepada nasabah. Tujuan analisis adalah untuk mengetahui faktor-faktor yang memengaruhi keputusan nasabah untuk berlangganan deposito (yes) atau tidak (no).

Variabel Peran Keterangan
y Y (respons) Nasabah berlangganan deposito berjangka: yes/nobiner
age X kontinu Usia nasabah (tahun)
balance X kontinu Saldo rekening rata-rata (euro)
duration X kontinu Durasi kontak terakhir (detik)
campaign X kontinu Jumlah kontak selama kampanye ini
job X kategorik Pekerjaan
marital X kategorik Status pernikahan
education X kategorik Tingkat pendidikan
housing X biner Memiliki kredit perumahan (yes/no)
loan X biner Memiliki pinjaman pribadi (yes/no)
poutcome X kategorik Hasil kampanye pemasaran sebelumnya
df_bank <- read.csv("C:/Users/Zahra Putri/Downloads/ADKREGRESI/bank_clean.csv", stringsAsFactors = FALSE, sep = ",")

# Faktor
df_bank$y       <- factor(df_bank$y, levels = c("no","yes"))
df_bank$job     <- factor(df_bank$job)
df_bank$marital <- factor(df_bank$marital)
df_bank$education <- factor(df_bank$education,
                             levels = c("primary","secondary","tertiary","unknown"))
df_bank$housing  <- factor(df_bank$housing)
df_bank$loan     <- factor(df_bank$loan)
df_bank$poutcome <- factor(df_bank$poutcome,
                            levels = c("unknown","failure","other","success"))

glimpse(df_bank)
## Rows: 4,521
## Columns: 13
## $ y         <fct> no, no, no, no, no, no, no, no, no, no, no, no, no, yes, no,…
## $ age       <int> 30, 33, 35, 30, 59, 35, 36, 39, 41, 43, 39, 43, 36, 20, 31, …
## $ job       <fct> unemployed, services, management, management, blue-collar, m…
## $ marital   <fct> married, married, single, married, married, single, married,…
## $ education <fct> primary, secondary, tertiary, tertiary, secondary, tertiary,…
## $ default   <chr> "no", "no", "no", "no", "no", "no", "no", "no", "no", "no", …
## $ balance   <int> 1787, 4789, 1350, 1476, 0, 747, 307, 147, 221, -88, 9374, 26…
## $ housing   <fct> no, yes, yes, yes, yes, no, yes, yes, yes, yes, yes, yes, no…
## $ loan      <fct> no, yes, no, yes, no, no, no, no, no, yes, no, no, no, no, y…
## $ duration  <int> 79, 220, 185, 199, 226, 141, 341, 151, 57, 313, 273, 113, 32…
## $ campaign  <int> 1, 1, 1, 4, 1, 2, 1, 2, 2, 1, 1, 2, 2, 1, 1, 2, 5, 1, 1, 1, …
## $ poutcome  <fct> unknown, failure, failure, unknown, unknown, failure, other,…
## $ y_bin     <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, …

3.2 Distribusi Kategori Respons

df_bank %>%
  count(y) %>%
  mutate(proporsi = n / sum(n)) %>%
  kable(caption = "Distribusi Variabel Respons (Langganan Deposito)", digits = 3) %>%
  kable_styling(bootstrap_options = c("striped","hover"), full_width = FALSE)
Distribusi Variabel Respons (Langganan Deposito)
y n proporsi
no 4000 0.885
yes 521 0.115
df_bank %>%
  count(y) %>%
  mutate(proporsi = n / sum(n)) %>%
  ggplot(aes(x = y, y = proporsi, fill = y)) +
  geom_col(width = 0.5, alpha = 0.92) +
  geom_text(aes(label = percent(proporsi, accuracy = 0.1)),
            vjust = -0.4, fontface = "bold") +
  scale_y_continuous(labels = percent_format(), limits = c(0, 1)) +
  scale_fill_manual(values = c("#8B1C62","#EE30A7")) +
  labs(title = "Distribusi Keputusan Nasabah",
       subtitle = "Respons biner: yes = berlangganan deposito, no = tidak.",
       x = "Berlangganan deposito?", y = "Proporsi") +
  theme_minimal(base_size = 12) +
  theme(legend.position = "none")

3.3 Estimasi Model Biner

fit_bin <- glm(
  y ~ age + balance + duration + campaign +
      job + marital + education + housing + loan + poutcome,
  data   = df_bank,
  family = binomial(link = "logit")
)

summary(fit_bin)
## 
## Call:
## glm(formula = y ~ age + balance + duration + campaign + job + 
##     marital + education + housing + loan + poutcome, family = binomial(link = "logit"), 
##     data = df_bank)
## 
## Coefficients:
##                      Estimate Std. Error z value Pr(>|z|)    
## (Intercept)        -2.899e+00  4.417e-01  -6.564 5.24e-11 ***
## age                 8.041e-04  6.783e-03   0.119 0.905632    
## balance             3.508e-06  1.755e-05   0.200 0.841598    
## duration            3.957e-03  1.917e-04  20.641  < 2e-16 ***
## campaign           -6.445e-02  2.610e-02  -2.469 0.013533 *  
## jobblue-collar     -5.127e-01  2.338e-01  -2.193 0.028338 *  
## jobentrepreneur    -4.108e-01  3.636e-01  -1.130 0.258539    
## jobhousemaid       -3.948e-01  3.985e-01  -0.991 0.321826    
## jobmanagement      -8.183e-02  2.338e-01  -0.350 0.726371    
## jobretired          6.055e-01  2.967e-01   2.040 0.041310 *  
## jobself-employed   -3.238e-01  3.389e-01  -0.955 0.339335    
## jobservices        -2.693e-01  2.626e-01  -1.025 0.305159    
## jobstudent          5.523e-01  3.581e-01   1.542 0.123048    
## jobtechnician      -2.479e-01  2.217e-01  -1.118 0.263590    
## jobunemployed      -6.685e-01  4.085e-01  -1.636 0.101780    
## jobunknown          4.758e-01  5.365e-01   0.887 0.375149    
## maritalmarried     -4.268e-01  1.665e-01  -2.563 0.010387 *  
## maritalsingle      -1.696e-01  1.950e-01  -0.870 0.384402    
## educationsecondary  1.042e-01  1.940e-01   0.537 0.591156    
## educationtertiary   4.099e-01  2.238e-01   1.831 0.067050 .  
## educationunknown   -4.056e-01  3.356e-01  -1.209 0.226806    
## housingyes         -6.343e-01  1.206e-01  -5.262 1.42e-07 ***
## loanyes            -7.221e-01  1.917e-01  -3.767 0.000165 ***
## poutcomefailure     4.995e-01  1.696e-01   2.945 0.003235 ** 
## poutcomeother       9.757e-01  2.172e-01   4.492 7.05e-06 ***
## poutcomesuccess     2.870e+00  2.154e-01  13.323  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 3231.0  on 4520  degrees of freedom
## Residual deviance: 2323.3  on 4495  degrees of freedom
## AIC: 2375.3
## 
## Number of Fisher Scoring iterations: 6

3.4 Tabel Hasil & Odds Ratio

bin_coef <- as.data.frame(coef(summary(fit_bin))) %>%
  tibble::rownames_to_column("parameter") %>%
  dplyr::rename(
    estimate  = Estimate,
    std_error = `Std. Error`,
    z_value   = `z value`,
    p_value   = `Pr(>|z|)`
  ) %>%
  mutate(
    OR      = exp(estimate),
    CI_low  = exp(estimate - 1.96 * std_error),
    CI_high = exp(estimate + 1.96 * std_error)
  )

bin_coef %>%
  mutate(across(c(estimate, std_error, z_value, p_value, OR, CI_low, CI_high),
                ~ round(.x, 4))) %>%
  kable(caption = "Ringkasan Model Regresi Logistik Biner",
        col.names = c("Parameter","Estimate","SE","z-value","p-value",
                      "OR","CI 2.5%","CI 97.5%")) %>%
  kable_styling(bootstrap_options = c("striped","hover","condensed"), full_width = TRUE)
Ringkasan Model Regresi Logistik Biner
Parameter Estimate SE z-value p-value OR CI 2.5% CI 97.5%
(Intercept) -2.8995 0.4417 -6.5640 0.0000 0.0551 0.0232 0.1309
age 0.0008 0.0068 0.1185 0.9056 1.0008 0.9876 1.0142
balance 0.0000 0.0000 0.1998 0.8416 1.0000 1.0000 1.0000
duration 0.0040 0.0002 20.6409 0.0000 1.0040 1.0036 1.0043
campaign -0.0644 0.0261 -2.4694 0.0135 0.9376 0.8908 0.9868
jobblue-collar -0.5127 0.2338 -2.1926 0.0283 0.5989 0.3787 0.9471
jobentrepreneur -0.4108 0.3636 -1.1299 0.2585 0.6631 0.3252 1.3523
jobhousemaid -0.3948 0.3985 -0.9907 0.3218 0.6738 0.3085 1.4715
jobmanagement -0.0818 0.2338 -0.3500 0.7264 0.9214 0.5827 1.4571
jobretired 0.6055 0.2967 2.0404 0.0413 1.8321 1.0241 3.2775
jobself-employed -0.3238 0.3389 -0.9555 0.3393 0.7234 0.3723 1.4056
jobservices -0.2693 0.2626 -1.0254 0.3052 0.7639 0.4566 1.2782
jobstudent 0.5523 0.3581 1.5421 0.1230 1.7372 0.8610 3.5050
jobtechnician -0.2479 0.2217 -1.1179 0.2636 0.7804 0.5053 1.2053
jobunemployed -0.6685 0.4085 -1.6363 0.1018 0.5125 0.2301 1.1414
jobunknown 0.4758 0.5365 0.8869 0.3751 1.6094 0.5623 4.6063
maritalmarried -0.4268 0.1665 -2.5627 0.0104 0.6526 0.4709 0.9045
maritalsingle -0.1696 0.1950 -0.8698 0.3844 0.8440 0.5760 1.2368
educationsecondary 0.1042 0.1940 0.5372 0.5912 1.1099 0.7587 1.6235
educationtertiary 0.4099 0.2238 1.8313 0.0670 1.5067 0.9716 2.3364
educationunknown -0.4056 0.3356 -1.2086 0.2268 0.6666 0.3453 1.2868
housingyes -0.6343 0.1206 -5.2620 0.0000 0.5303 0.4187 0.6716
loanyes -0.7221 0.1917 -3.7668 0.0002 0.4857 0.3336 0.7072
poutcomefailure 0.4995 0.1696 2.9445 0.0032 1.6480 1.1818 2.2980
poutcomeother 0.9757 0.2172 4.4922 0.0000 2.6530 1.7332 4.0608
poutcomesuccess 2.8699 0.2154 13.3226 0.0000 17.6355 11.5617 26.9002

3.5 Interpretasi

  • duration: OR = 1,0040, artinya setiap tambahan 1 detik durasi kontak meningkatkan odds nasabah berlangganan deposito sebesar 0,4%.
  • campaign: OR = 0,9376, artinya setiap tambahan 1 kali kontak selama kampanye menurunkan odds nasabah berlangganan deposito sebesar 6,24%.
  • jobblue-collar (vs pekerjaan referensi): OR = 0,5989, artinya nasabah dengan pekerjaan blue-collar memiliki odds berlangganan deposito 40,11% lebih rendah dibanding kategori pekerjaan referensi.
  • jobretired (vs pekerjaan referensi): OR = 1,8321, artinya nasabah pensiunan memiliki odds berlangganan deposito 1,83 kali lebih besar dibanding kategori pekerjaan referensi.
  • maritalmarried (vs status referensi): OR = 0,6526, artinya nasabah yang menikah memiliki odds berlangganan deposito 34,74% lebih rendah dibanding status referensi.
  • housingyes: OR = 0,5303, artinya nasabah yang memiliki kredit perumahan memiliki odds berlangganan deposito 46,97% lebih rendah dibanding yang tidak memiliki kredit perumahan.
  • loanyes: OR = 0,4857, artinya nasabah yang memiliki pinjaman pribadi memiliki odds berlangganan deposito 51,43% lebih rendah dibanding yang tidak memiliki pinjaman.
  • poutcomefailure (vs kategori referensi): OR = 1,6480, artinya nasabah dengan hasil kampanye sebelumnya gagal memiliki odds berlangganan deposito 1,65 kali lebih besar dibanding kategori referensi.
  • poutcomeother (vs kategori referensi): OR = 2,6530, artinya nasabah dengan hasil kampanye sebelumnya kategori other memiliki odds berlangganan deposito 2,65 kali lebih besar dibanding kategori referensi.
  • poutcomesuccess (vs kategori referensi): OR = 17,6355, artinya nasabah dengan hasil kampanye sebelumnya berhasil memiliki odds berlangganan deposito 17,64 kali lebih besar dibanding kategori referensi.

3.6 Prediksi Probabilitas Biner

pred_prob_bin <- predict(
  fit_bin,
  type = "response"
)

data_bin_pred <- df_bank %>%
  mutate(
    Prob_Yes = pred_prob_bin,
    Prob_No = 1 - pred_prob_bin,
    prediksi = ifelse(
      pred_prob_bin >= 0.5,
      "yes",
      "no"
    )
  )

head(data_bin_pred)
##    y age         job marital education default balance housing loan duration
## 1 no  30  unemployed married   primary      no    1787      no   no       79
## 2 no  33    services married secondary      no    4789     yes  yes      220
## 3 no  35  management  single  tertiary      no    1350     yes   no      185
## 4 no  30  management married  tertiary      no    1476     yes  yes      199
## 5 no  59 blue-collar married secondary      no       0     yes   no      226
## 6 no  35  management  single  tertiary      no     747      no   no      141
##   campaign poutcome y_bin   Prob_Yes   Prob_No prediksi
## 1        1  unknown     0 0.02374948 0.9762505       no
## 2        1  failure     0 0.02934545 0.9706545       no
## 3        1  failure     0 0.10198746 0.8980125       no
## 4        4  unknown     0 0.02197348 0.9780265       no
## 5        1  unknown     0 0.02954577 0.9704542       no
## 6        2  failure     0 0.14409773 0.8559023       no
data_bin_pred %>%
  head() %>%
  kable(
    caption = "Hasil Prediksi Regresi Logistik Biner",
    digits = 4
  ) %>%
  kable_styling(
    bootstrap_options = c(
      "striped",
      "hover",
      "condensed"
    ),
    full_width = FALSE
  ) %>%
  scroll_box(
    width = "100%",
    height = "300px"
  )
Hasil Prediksi Regresi Logistik Biner
y age job marital education default balance housing loan duration campaign poutcome y_bin Prob_Yes Prob_No prediksi
no 30 unemployed married primary no 1787 no no 79 1 unknown 0 0.0237 0.9763 no
no 33 services married secondary no 4789 yes yes 220 1 failure 0 0.0293 0.9707 no
no 35 management single tertiary no 1350 yes no 185 1 failure 0 0.1020 0.8980 no
no 30 management married tertiary no 1476 yes yes 199 4 unknown 0 0.0220 0.9780 no
no 59 blue-collar married secondary no 0 yes no 226 1 unknown 0 0.0295 0.9705 no
no 35 management single tertiary no 747 no no 141 2 failure 0 0.1441 0.8559 no

3.7 Confusion Matrix

conf_bin <- table(
  Aktual = df_bank$y,
  Prediksi = data_bin_pred$prediksi
)

conf_bin
##       Prediksi
## Aktual   no  yes
##    no  3917   83
##    yes  362  159
accuracy_bin <- sum(diag(conf_bin)) / sum(conf_bin)
cat("Akurasi klasifikasi:", round(accuracy_bin, 4))
## Akurasi klasifikasi: 0.9016

3.8 Visualisasi Probabilitas Prediksi

grid_bin <- expand.grid(
  duration = seq(
    min(df_bank$duration),
    max(df_bank$duration),
    length.out = 100
  ),
  age = mean(df_bank$age),
  balance = mean(df_bank$balance),
  campaign = mean(df_bank$campaign),
  job = levels(df_bank$job)[1],
  marital = levels(df_bank$marital)[1],
  education = levels(df_bank$education)[1],
  housing = "no",
  loan = "no",
  poutcome = "unknown"
)

grid_bin$prob <- predict(
  fit_bin,
  newdata = grid_bin,
  type = "response"
)

ggplot(
  grid_bin,
  aes(duration, prob)
) +
  geom_line(
    linewidth = 1.5,
    color = "#EE30A7"
  ) +
  scale_y_continuous(labels = percent_format()) +
  labs(
    title = "Probabilitas Berlangganan Deposito",
    x = "Duration",
    y = "Probabilitas"
  ) +
  theme_minimal(base_size = 12) +
  theme(
    plot.title = element_text(face = "bold")
  )


4. Regresi Poisson

4.1 Latar Belakang & Data

Dataset: NPHA (National Poll on Healthy Aging) — Doctor Visits
Sumber: UCI Machine Learning Repository
Jumlah observasi: 696 responden

Dataset ini berisi informasi kesehatan dan kondisi tidur responden lanjut usia yang dikumpulkan dalam survei National Poll on Healthy Aging (NPHA). Tujuan analisisnya adalah untuk mengidentifikasi faktor-faktor yang berhubungan dengan jumlah dokter yang dikunjungi oleh responden.

Variabel Peran Keterangan
Number_of_Doctors_Visited Y (respons) Jumlah dokter yang dikunjungi: 1, 2, atau 3 — hitungan (count)
Phyiscal_Health X ordinal Status kesehatan fisik (1=buruk … 5=sangat baik)
Mental_Health X ordinal Status kesehatan mental (1–5)
Trouble_Sleeping X ordinal Gangguan tidur (1–3)
Prescription_Sleep_Medication X ordinal Penggunaan obat tidur resep (1–3)
Employment X kategorik Status pekerjaan
Gender X kategorik Jenis kelamin
Race X kategorik Ras
Stress_Keeps_Patient_from_Sleeping X biner Stres mengganggu tidur (0/1)
Pain_Keeps_Patient_from_Sleeping X biner Nyeri mengganggu tidur (0/1)
df_npha <- read.csv("C:/Users/Zahra Putri/Downloads/ADKREGRESI/npha_clean.csv", stringsAsFactors = FALSE)

df_npha$Gender     <- factor(df_npha$Gender, levels = c("Male","Female"))
df_npha$Race       <- factor(df_npha$Race)
df_npha$Employment <- factor(df_npha$Employment,
                              levels = c("Employed","Self_employed","Retired","Unemployed"))

glimpse(df_npha)
## Rows: 696
## Columns: 14
## $ Number_of_Doctors_Visited                  <int> 3, 2, 3, 1, 3, 2, 3, 2, 2, …
## $ Phyiscal_Health                            <dbl> 4, 4, 3, 3, 3, 3, 4, 3, 3, …
## $ Mental_Health                              <dbl> 3, 2, 2, 2, 3, 2, 1, 2, 1, …
## $ Dental_Health                              <dbl> 3, 3, 3, 3, 3, 4, 1, 6, 2, …
## $ Employment                                 <fct> Retired, Retired, Retired, …
## $ Stress_Keeps_Patient_from_Sleeping         <int> 0, 1, 0, 0, 1, 0, 0, 1, 0, …
## $ Medication_Keeps_Patient_from_Sleeping     <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ Pain_Keeps_Patient_from_Sleeping           <int> 0, 0, 0, 0, 0, 0, 1, 0, 1, …
## $ Bathroom_Needs_Keeps_Patient_from_Sleeping <int> 0, 1, 0, 1, 0, 1, 1, 0, 1, …
## $ Uknown_Keeps_Patient_from_Sleeping         <int> 1, 0, 1, 0, 0, 0, 0, 0, 0, …
## $ Trouble_Sleeping                           <dbl> 2, 3, 3, 3, 2, 3, 2, 3, 3, …
## $ Prescription_Sleep_Medication              <dbl> 3, 3, 3, 3, 3, 3, 1, 3, 3, …
## $ Race                                       <fct> White, White, Asian, Asian,…
## $ Gender                                     <fct> Female, Male, Male, Female,…

4.2 Distribusi Kategori Respons

df_npha %>%
  count(Number_of_Doctors_Visited) %>%
  mutate(proporsi = n / sum(n)) %>%
  kable(caption = "Distribusi Jumlah Dokter yang Dikunjungi", digits = 3) %>%
  kable_styling(bootstrap_options = c("striped","hover"), full_width = FALSE)
Distribusi Jumlah Dokter yang Dikunjungi
Number_of_Doctors_Visited n proporsi
1 126 0.181
2 363 0.522
3 207 0.297
df_npha %>%
  count(Number_of_Doctors_Visited) %>%
  ggplot(aes(x = Number_of_Doctors_Visited, y = n)) +
  geom_col(width = 0.6, fill = "#B03060", alpha = 0.92) +
  geom_text(aes(label = n), vjust = -0.4, fontface = "bold") +
  scale_x_continuous(breaks = 1:3) +
  labs(title = "Distribusi Jumlah Dokter yang Dikunjungi",
       subtitle = "Respons Poisson: data cacah (count), bilangan bulat nonnegatif.",
       x = "Jumlah Dokter", y = "Frekuensi") +
  theme_minimal(base_size = 12)

4.3 Estimasi Model Poisson

fit_pois <- glm(
  Number_of_Doctors_Visited ~
    Phyiscal_Health + Mental_Health + Trouble_Sleeping +
    Prescription_Sleep_Medication + Employment + Gender + Race +
    Stress_Keeps_Patient_from_Sleeping + Pain_Keeps_Patient_from_Sleeping,
  data   = df_npha,
  family = poisson(link = "log")
)

summary(fit_pois)
## 
## Call:
## glm(formula = Number_of_Doctors_Visited ~ Phyiscal_Health + Mental_Health + 
##     Trouble_Sleeping + Prescription_Sleep_Medication + Employment + 
##     Gender + Race + Stress_Keeps_Patient_from_Sleeping + Pain_Keeps_Patient_from_Sleeping, 
##     family = poisson(link = "log"), data = df_npha)
## 
## Coefficients:
##                                      Estimate Std. Error z value Pr(>|z|)  
## (Intercept)                         0.5830124  0.2669421   2.184    0.029 *
## Phyiscal_Health                     0.0555237  0.0344872   1.610    0.107  
## Mental_Health                      -0.0227791  0.0334276  -0.681    0.496  
## Trouble_Sleeping                    0.0140007  0.0448053   0.312    0.755  
## Prescription_Sleep_Medication      -0.0747015  0.0536400  -1.393    0.164  
## EmploymentSelf_employed             0.0422049  0.1430003   0.295    0.768  
## EmploymentRetired                   0.0891016  0.1099064   0.811    0.418  
## EmploymentUnemployed                0.1067899  0.2027444   0.527    0.598  
## GenderFemale                       -0.0006536  0.0535056  -0.012    0.990  
## RaceBlack                           0.1075818  0.1531745   0.702    0.482  
## RaceHispanic                        0.1032178  0.2002224   0.516    0.606  
## RaceOther                           0.2093353  0.1892143   1.106    0.269  
## RaceWhite                           0.1488566  0.1200194   1.240    0.215  
## Stress_Keeps_Patient_from_Sleeping  0.0507501  0.0622897   0.815    0.415  
## Pain_Keeps_Patient_from_Sleeping    0.0149139  0.0668779   0.223    0.824  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for poisson family taken to be 1)
## 
##     Null deviance: 162.29  on 695  degrees of freedom
## Residual deviance: 152.51  on 681  degrees of freedom
## AIC: 2002.6
## 
## Number of Fisher Scoring iterations: 4

4.4 Tabel Hasil & IRR

pois_coef <- as.data.frame(coef(summary(fit_pois))) %>%
  tibble::rownames_to_column("parameter") %>%
  dplyr::rename(
    estimate  = Estimate,
    std_error = `Std. Error`,
    z_value   = `z value`,
    p_value   = `Pr(>|z|)`
  ) %>%
  mutate(
    IRR              = exp(estimate),
    CI_low           = exp(estimate - 1.96 * std_error),
    CI_high          = exp(estimate + 1.96 * std_error),
    perubahan_persen = 100 * (IRR - 1)
  )

pois_coef %>%
  mutate(across(c(estimate, std_error, z_value, p_value,
                  IRR, CI_low, CI_high, perubahan_persen), ~ round(.x, 4))) %>%
  kable(caption = "Ringkasan Model Regresi Poisson",
        col.names = c("Parameter","Estimate","SE","z-value","p-value",
                      "IRR","CI 2.5%","CI 97.5%","Δ%")) %>%
  kable_styling(bootstrap_options = c("striped","hover","condensed"), full_width = TRUE)
Ringkasan Model Regresi Poisson
Parameter Estimate SE z-value p-value IRR CI 2.5% CI 97.5% Δ%
(Intercept) 0.5830 0.2669 2.1840 0.0290 1.7914 1.0616 3.0229 79.1427
Phyiscal_Health 0.0555 0.0345 1.6100 0.1074 1.0571 0.9880 1.1310 5.7094
Mental_Health -0.0228 0.0334 -0.6814 0.4956 0.9775 0.9155 1.0437 -2.2522
Trouble_Sleeping 0.0140 0.0448 0.3125 0.7547 1.0141 0.9288 1.1072 1.4099
Prescription_Sleep_Medication -0.0747 0.0536 -1.3926 0.1637 0.9280 0.8354 1.0309 -7.1980
EmploymentSelf_employed 0.0422 0.1430 0.2951 0.7679 1.0431 0.7881 1.3806 4.3108
EmploymentRetired 0.0891 0.1099 0.8107 0.4175 1.0932 0.8813 1.3560 9.3192
EmploymentUnemployed 0.1068 0.2027 0.5267 0.5984 1.1127 0.7478 1.6556 11.2700
GenderFemale -0.0007 0.0535 -0.0122 0.9903 0.9993 0.8999 1.1098 -0.0653
RaceBlack 0.1076 0.1532 0.7023 0.4825 1.1136 0.8248 1.5035 11.3582
RaceHispanic 0.1032 0.2002 0.5155 0.6062 1.1087 0.7488 1.6416 10.8733
RaceOther 0.2093 0.1892 1.1063 0.2686 1.2329 0.8508 1.7864 23.2858
RaceWhite 0.1489 0.1200 1.2403 0.2149 1.1605 0.9172 1.4683 16.0507
Stress_Keeps_Patient_from_Sleeping 0.0508 0.0623 0.8147 0.4152 1.0521 0.9311 1.1887 5.2060
Pain_Keeps_Patient_from_Sleeping 0.0149 0.0669 0.2230 0.8235 1.0150 0.8903 1.1572 1.5026

4.5 Interpretasi

  • Phyiscal_Health: IRR = 1,0571, artinya setiap kenaikan 1 level kesehatan fisik meningkatkan rata-rata jumlah kunjungan dokter sebesar 5,71%.
  • Mental_Health: IRR = 0,9775, artinya setiap kenaikan 1 level kesehatan mental menurunkan rata-rata jumlah kunjungan dokter sebesar 2,25%.
  • Trouble_Sleeping: IRR = 1,0141, artinya responden yang mengalami kesulitan tidur memiliki rata-rata jumlah kunjungan dokter 1,41% lebih tinggi.
  • Prescription_Sleep_Medication: IRR = 0,9280, artinya responden yang menggunakan obat tidur memiliki rata-rata jumlah kunjungan dokter 7,20% lebih rendah.
  • EmploymentRetired (vs Employed): IRR = 1,0932, artinya pensiunan memiliki rata-rata jumlah kunjungan dokter 9,32% lebih tinggi dibanding yang bekerja.
  • EmploymentUnemployed (vs Employed): IRR = 1,1127, artinya pengangguran memiliki rata-rata jumlah kunjungan dokter 11,27% lebih tinggi dibanding yang bekerja.
  • GenderFemale (vs Male): IRR = 0,9993, artinya perempuan memiliki rata-rata jumlah kunjungan dokter hampir sama dengan laki-laki.
  • RaceWhite (vs kategori referensi): IRR = 1,1605, artinya responden kulit putih memiliki rata-rata jumlah kunjungan dokter 16,05% lebih tinggi dibanding kategori referensi.
  • Stress_Keeps_Patient_from_Sleeping: IRR = 1,0521, artinya responden yang tidurnya terganggu karena stres memiliki rata-rata jumlah kunjungan dokter 5,21% lebih tinggi.
  • Pain_Keeps_Patient_from_Sleeping: IRR = 1,0150, artinya responden yang tidurnya terganggu karena nyeri memiliki rata-rata jumlah kunjungan dokter 1,50% lebih tinggi.

Namun, seluruh variabel memiliki p-value > 0,05 sehingga tidak terdapat bukti yang cukup bahwa variabel-variabel tersebut berpengaruh signifikan terhadap jumlah kunjungan dokter.

# Prediksi rata-rata hitungan
head(predict(fit_pois, type = "response"))
##        1        2        3        4        5        6 
## 2.177051 2.377751 1.842318 1.841114 2.166684 2.138022

4.6 Cek Overdispersi

Regresi Poisson mengasumsikan mean = variance. Periksa dengan statistik dispersi Pearson:

disp_pois <- sum(residuals(fit_pois, type = "pearson")^2) / df.residual(fit_pois)

tibble::tibble(
  `Statistik Dispersi Pearson` = disp_pois,
  `Interpretasi` = dplyr::case_when(
    disp_pois < 1.5 ~ "Tidak ada overdispersion berat — model Poisson sudah tepat",
    disp_pois < 2.5 ~ "Ada indikasi overdispersion sedang — pertimbangkan Quasi-Poisson",
    TRUE            ~ "Overdispersion kuat — pertimbangkan Negative Binomial"
  )
) %>%
  mutate(`Statistik Dispersi Pearson` = round(`Statistik Dispersi Pearson`, 3)) %>%
  kable(caption = "Pemeriksaan Overdispersion Model Poisson") %>%
  kable_styling(bootstrap_options = c("striped","hover"), full_width = FALSE)
Pemeriksaan Overdispersion Model Poisson
Statistik Dispersi Pearson Interpretasi
0.212 Tidak ada overdispersion berat — model Poisson sudah tepat

4.7 Visualisasi Prediksi Rate Poisson

grid_pois <- expand.grid(
  Phyiscal_Health = seq(
    min(df_npha$Phyiscal_Health),
    max(df_npha$Phyiscal_Health),
    length.out = 120
  ),
  Mental_Health = mean(df_npha$Mental_Health),
  Trouble_Sleeping = mean(df_npha$Trouble_Sleeping),
  Prescription_Sleep_Medication =
    mean(df_npha$Prescription_Sleep_Medication),

  Employment = "Employed",
  Gender = "Female",
  Race = levels(df_npha$Race)[1],

  Stress_Keeps_Patient_from_Sleeping = 0,
  Pain_Keeps_Patient_from_Sleeping = 0
)

grid_pois$Employment <- factor(
  grid_pois$Employment,
  levels = levels(df_npha$Employment)
)

grid_pois$Gender <- factor(
  grid_pois$Gender,
  levels = levels(df_npha$Gender)
)

grid_pois$Race <- factor(
  grid_pois$Race,
  levels = levels(df_npha$Race)
)

pred_pois <- predict(
  fit_pois,
  newdata = grid_pois,
  type = "link",
  se.fit = TRUE
)

grid_pois_plot <- grid_pois %>%
  mutate(
    fit_link = pred_pois$fit,
    se_link = pred_pois$se.fit,

    rate = exp(fit_link),

    rate_low = exp(
      fit_link - 1.96 * se_link
    ),

    rate_high = exp(
      fit_link + 1.96 * se_link
    )
  )

ggplot(
  grid_pois_plot,
  aes(
    x = Phyiscal_Health,
    y = rate
  )
) +
  geom_ribbon(
    aes(
      ymin = rate_low,
      ymax = rate_high
    ),
    fill = "#FF82AB",
    alpha = 0.20
  ) +
  geom_line(
    color = "#8B1C62",
    linewidth = 1.4
  ) +
  labs(
    title = "Prediksi Jumlah Kunjungan Dokter",
    subtitle = "Variabel lain ditahan konstan",
    x = "Kesehatan Fisik",
    y = "Prediksi Jumlah Kunjungan Dokter"
  ) +
  theme_minimal(base_size = 12) +
  theme(
    plot.title = element_text(face = "bold")
  )