library(dplyr)
library(tidyr)
library(ggplot2)
library(knitr)
library(kableExtra)
library(scales)
library(nnet)    
library(MASS)

1. Regresi Logistik Multinomial

1.1 Latar Belakang & Data

Dataset: Online Shoppers Purchasing Intention
Sumber: UCI Machine Learning Repository
Jumlah observasi: 12.330 sesi kunjungan

Dataset ini berisi data perilaku pengunjung situs e-commerce selama satu sesi kunjungan. Ditujukan untuk mengidentifikasi faktor-faktor yang berkaitan dengan jenis pengunjung (New Visitor, Returning Visitor, atau Other) berdasarkan aktivitas mereka di website.

Variabel	Jenis	Keterangan
`VisitorType`	Y (respons)	Jenis pengunjung: New_Visitor, Returning_Visitor, Other — nominal, 3 kategori, tanpa urutan
`BounceRates`	X kontinu	Proporsi sesi yang langsung meninggalkan halaman
`ExitRates`	X kontinu	Proporsi halaman yang menjadi halaman terakhir
`PageValues`	X kontinu	Nilai rata-rata halaman yang dikunjungi sebelum transaksi
`ProductRelated`	X kontinu	Jumlah halaman produk yang dikunjungi
`Administrative`	X kontinu	Jumlah halaman administratif yang dikunjungi
`Weekend`	X biner	1 = kunjungan saat akhir pekan
`Revenue`	X biner	1 = sesi menghasilkan transaksi

df_shop <- read.csv("C:/Users/Zahra Putri/Downloads/ADKREGRESI/online_shoppers_clean.csv", stringsAsFactors = FALSE)

df_shop$VisitorType <- factor(df_shop$VisitorType,
                               levels = c("New_Visitor", "Returning_Visitor", "Other"))
df_shop$Weekend <- as.integer(df_shop$Weekend)
df_shop$Revenue <- as.integer(df_shop$Revenue)

glimpse(df_shop)

## Rows: 12,330
## Columns: 11
## $ VisitorType    <fct> Returning_Visitor, Returning_Visitor, Returning_Visitor…
## $ Administrative <int> 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 2, 0, 0, 0…
## $ Informational  <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ ProductRelated <int> 1, 2, 1, 2, 10, 19, 1, 0, 2, 3, 3, 16, 7, 6, 2, 23, 1, …
## $ BounceRates    <dbl> 0.200000000, 0.000000000, 0.200000000, 0.050000000, 0.0…
## $ ExitRates      <dbl> 0.200000000, 0.100000000, 0.200000000, 0.140000000, 0.0…
## $ PageValues     <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ SpecialDay     <dbl> 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.4, 0.0, 0.8, 0.4, 0.0, …
## $ Month          <chr> "Feb", "Feb", "Feb", "Feb", "Feb", "Feb", "Feb", "Feb",…
## $ Weekend        <int> 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0…
## $ Revenue        <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…

1.2 Distribusi Kategori Respons

df_shop %>%
  count(VisitorType) %>%
  mutate(proporsi = n / sum(n)) %>%
  kable(caption = "Distribusi Jenis Pengunjung", digits = 3) %>%
  kable_styling(bootstrap_options = c("striped","hover","condensed"), full_width = FALSE)

Distribusi Jenis Pengunjung
VisitorType	n	proporsi
New_Visitor	1694	0.137
Returning_Visitor	10551	0.856
Other	85	0.007

df_shop %>%
  count(VisitorType) %>%
  mutate(proporsi = n / sum(n)) %>%
  ggplot(aes(x = VisitorType, y = proporsi, fill = VisitorType)) +
  geom_col(width = 0.6, alpha = 0.92) +
  geom_text(aes(label = percent(proporsi, accuracy = 0.1)),
            vjust = -0.4, fontface = "bold") +
  scale_y_continuous(labels = percent_format(), limits = c(0, 0.95)) +
  scale_fill_manual(values = c("#8B475D", "#CD6889", "#FF82AB")) +
  labs(title = "Distribusi Jenis Pengunjung",
       subtitle = "Respons multinomial: 3 kategori nominal, tidak ada urutan alami.",
       x = NULL, y = "Proporsi") +
  theme_minimal(base_size = 12) +
  theme(legend.position = "none")

1.3 Estimasi Model Multinomial

fit_multi <- nnet::multinom(
  VisitorType ~ BounceRates + ExitRates + PageValues +
                ProductRelated + Administrative + Weekend + Revenue,
  data  = df_shop,
  trace = FALSE
)

summary(fit_multi)

## Call:
## nnet::multinom(formula = VisitorType ~ BounceRates + ExitRates + 
##     PageValues + ProductRelated + Administrative + Weekend + 
##     Revenue, data = df_shop, trace = FALSE)
## 
## Coefficients:
##                   (Intercept) BounceRates ExitRates   PageValues ProductRelated
## Returning_Visitor  -0.1310171   -11.94519  43.80824 -0.003853412   0.0475086463
## Other              -4.1512516   -10.99646  43.21727  0.012529515  -0.0002229309
##                   Administrative    Weekend    Revenue
## Returning_Visitor    -0.05446757 -0.2026824 -0.4818629
## Other                -0.01430138 -1.3055439 -0.2841169
## 
## Std. Errors:
##                   (Intercept) BounceRates ExitRates  PageValues ProductRelated
## Returning_Visitor  0.07342134    2.547536  2.003455 0.001464643    0.002081928
## Other              0.24746790    1.428562  1.649217 0.002890596    0.010481231
##                   Administrative    Weekend    Revenue
## Returning_Visitor    0.009851435 0.06364857 0.08317437
## Other                0.049615811 0.39963797 0.36010289
## 
## Residual Deviance: 9014.63 
## AIC: 9046.63

1.4 Tabel Hasil & Uji Signifikansi

multi_sum <- summary(fit_multi)

coef_long <- as.data.frame(multi_sum$coefficients) %>%
  tibble::rownames_to_column("kategori") %>%
  pivot_longer(-kategori, names_to = "variabel", values_to = "estimate")

se_long <- as.data.frame(multi_sum$standard.errors) %>%
  tibble::rownames_to_column("kategori") %>%
  pivot_longer(-kategori, names_to = "variabel", values_to = "std_error")

result_multi <- left_join(coef_long, se_long, by = c("kategori","variabel")) %>%
  mutate(
    z_value = estimate / std_error,
    p_value = 2 * (1 - pnorm(abs(z_value))),
    RRR     = exp(estimate),
    CI_low  = exp(estimate - 1.96 * std_error),
    CI_high = exp(estimate + 1.96 * std_error)
  )

result_multi %>%
  mutate(across(c(estimate, std_error, z_value, p_value, RRR, CI_low, CI_high),
                ~ round(.x, 4))) %>%
  kable(caption = "Ringkasan Model Regresi Logistik Multinomial",
        col.names = c("Kategori","Variabel","Estimate","SE","z","p-value",
                      "RRR","CI 2.5%","CI 97.5%")) %>%
  kable_styling(bootstrap_options = c("striped","hover","condensed"), full_width = TRUE)

Ringkasan Model Regresi Logistik Multinomial
Kategori	Variabel	Estimate	SE	z	p-value	RRR	CI 2.5%	CI 97.5%
Returning_Visitor	(Intercept)	-0.1310	0.0734	-1.7845	0.0743	8.772000e-01	7.596000e-01	1.013000e+00
Returning_Visitor	BounceRates	-11.9452	2.5475	-4.6889	0.0000	0.000000e+00	0.000000e+00	1.000000e-03
Returning_Visitor	ExitRates	43.8082	2.0035	21.8663	0.0000	1.060909e+19	2.090752e+17	5.383363e+20
Returning_Visitor	PageValues	-0.0039	0.0015	-2.6310	0.0085	9.962000e-01	9.933000e-01	9.990000e-01
Returning_Visitor	ProductRelated	0.0475	0.0021	22.8195	0.0000	1.048700e+00	1.044400e+00	1.052900e+00
Returning_Visitor	Administrative	-0.0545	0.0099	-5.5289	0.0000	9.470000e-01	9.289000e-01	9.655000e-01
Returning_Visitor	Weekend	-0.2027	0.0636	-3.1844	0.0015	8.165000e-01	7.208000e-01	9.250000e-01
Returning_Visitor	Revenue	-0.4819	0.0832	-5.7934	0.0000	6.176000e-01	5.247000e-01	7.270000e-01
Other	(Intercept)	-4.1513	0.2475	-16.7749	0.0000	1.570000e-02	9.700000e-03	2.560000e-02
Other	BounceRates	-10.9965	1.4286	-7.6976	0.0000	0.000000e+00	0.000000e+00	3.000000e-04
Other	ExitRates	43.2173	1.6492	26.2047	0.0000	5.875163e+18	2.318344e+17	1.488888e+20
Other	PageValues	0.0125	0.0029	4.3346	0.0000	1.012600e+00	1.006900e+00	1.018400e+00
Other	ProductRelated	-0.0002	0.0105	-0.0213	0.9830	9.998000e-01	9.794000e-01	1.020500e+00
Other	Administrative	-0.0143	0.0496	-0.2882	0.7732	9.858000e-01	8.944000e-01	1.086500e+00
Other	Weekend	-1.3055	0.3996	-3.2668	0.0011	2.710000e-01	1.238000e-01	5.932000e-01
Other	Revenue	-0.2841	0.3601	-0.7890	0.4301	7.527000e-01	3.716000e-01	1.524500e+00

1.5 Interpretasi

PageValues (Returning_Visitor vs New_Visitor): RRR = 0,9962, artinya setiap kenaikan 1 unit PageValues menurunkan peluang menjadi Returning_Visitor sebesar 0,38%.
ProductRelated (Returning_Visitor vs New_Visitor): RRR = 1,0487, artinya setiap kenaikan 1 unit ProductRelated meningkatkan peluang menjadi Returning_Visitor sebesar 4,87%.
Administrative (Returning_Visitor vs New_Visitor): RRR = 0,9470, artinya setiap kenaikan 1 unit Administrative menurunkan peluang menjadi Returning_Visitor sebesar 5,30%.
Weekend (Returning_Visitor vs New_Visitor): RRR = 0,8165, artinya pengunjung akhir pekan memiliki peluang menjadi Returning_Visitor 18,35% lebih rendah.
Revenue (Returning_Visitor vs New_Visitor): RRR = 0,6176, artinya pengunjung yang melakukan transaksi memiliki peluang menjadi Returning_Visitor 38,24% lebih rendah.
PageValues (Other vs New_Visitor): RRR = 1,0126, artinya setiap kenaikan 1 unit PageValues meningkatkan peluang masuk kategori Other sebesar 1,26%.
Weekend (Other vs New_Visitor): RRR = 0,2710, artinya pengunjung akhir pekan memiliki peluang masuk kategori Other 72,90% lebih rendah.

1.6 Prediksi Probabilitas Multinomial

pred_prob_multi <- predict(fit_multi, type = "probs")

data_multi_pred <- df_shop %>%
  bind_cols(as.data.frame(pred_prob_multi)) %>%
  mutate(
    prediksi = predict(fit_multi, type = "class")
  )

head(pred_prob_multi)

##   New_Visitor Returning_Visitor       Other
## 1 0.001819601         0.9801636 0.018016784
## 2 0.012616092         0.9724296 0.014954351
## 3 0.001819601         0.9801636 0.018016784
## 4 0.004007705         0.9805499 0.015442378
## 5 0.109432969         0.8873217 0.003245345
## 6 0.158924077         0.8350215 0.006054415

data_multi_pred %>%
  head() %>%
  kable(
    caption = "Hasil Prediksi Regresi Logistik Multinomial",
    digits = 4
  ) %>%
  kable_styling(
    bootstrap_options = c("striped","hover","condensed"),
    full_width = FALSE
  ) %>%
  scroll_box(
    width = "100%",
    height = "300px"
  )

Hasil Prediksi Regresi Logistik Multinomial
VisitorType	ProductRelated	BounceRates	ExitRates	Month	Weekend	New_Visitor	Returning_Visitor	Other	prediksi
Returning_Visitor	1	0.2000	0.2000	Feb	0	0.0018	0.9802	0.0180	Returning_Visitor
Returning_Visitor	2	0.0000	0.1000	Feb	0	0.0126	0.9724	0.0150	Returning_Visitor
Returning_Visitor	1	0.2000	0.2000	Feb	0	0.0018	0.9802	0.0180	Returning_Visitor
Returning_Visitor	2	0.0500	0.1400	Feb	0	0.0040	0.9805	0.0154	Returning_Visitor
Returning_Visitor	10	0.0200	0.0500	Feb	1	0.1094	0.8873	0.0032	Returning_Visitor
Returning_Visitor	19	0.0158	0.0246	Feb	0	0.1589	0.8350	0.0061	Returning_Visitor

1.7 Uji Model (LRT)

fit_multi_null <- nnet::multinom(VisitorType ~ 1, data = df_shop, trace = FALSE)

lrt_stat <- 2 * (as.numeric(logLik(fit_multi)) - as.numeric(logLik(fit_multi_null)))
df_lrt   <- length(coef(fit_multi)) - 2   # (K-1) * p_prediktor
p_lrt    <- 1 - pchisq(lrt_stat, df_lrt)

tibble::tibble(
  `G² (Devians)` = lrt_stat,
  `df`           = df_lrt,
  `p-value`      = p_lrt
) %>%
  mutate(across(c(`G² (Devians)`, `p-value`), ~ round(.x, 4))) %>%
  kable(caption = "Uji Kebaikan Model (Likelihood Ratio Test)") %>%
  kable_styling(bootstrap_options = c("striped","hover"), full_width = FALSE)

Uji Kebaikan Model (Likelihood Ratio Test)
G² (Devians)	df	p-value
1844.471	14	0

1.8 Confusion Matrix

conf_multi <- table(
  Aktual = df_shop$VisitorType,
  Prediksi = data_multi_pred$prediksi
)

conf_multi

##                    Prediksi
## Aktual              New_Visitor Returning_Visitor Other
##   New_Visitor                77              1616     1
##   Returning_Visitor          59             10492     0
##   Other                       5                80     0

accuracy_multi <- sum(diag(conf_multi)) /
  sum(conf_multi)

accuracy_multi

## [1] 0.8571776

1.9 Visualisasi Probabilitas Prediksi

grid_multi <- expand.grid(
  PageValues = seq(
    min(df_shop$PageValues),
    max(df_shop$PageValues),
    length.out = 100
  ),
  BounceRates = mean(df_shop$BounceRates),
  ExitRates = mean(df_shop$ExitRates),
  ProductRelated = mean(df_shop$ProductRelated),
  Administrative = mean(df_shop$Administrative),
  Weekend = 0,
  Revenue = 0
)

grid_prob_multi <- predict(
  fit_multi,
  newdata = grid_multi,
  type = "probs"
)

plot_multi <- grid_multi %>%
  bind_cols(as.data.frame(grid_prob_multi)) %>%
  pivot_longer(
    cols = c(
      "New_Visitor",
      "Returning_Visitor",
      "Other"
    ),
    names_to = "kategori",
    values_to = "prob"
  )

ggplot(
  plot_multi,
  aes(PageValues, prob, color = kategori)
) +
  geom_line(linewidth = 1.5) +
  scale_color_manual(
    values = c(
      "New_Visitor" = "#8B1C62",
      "Returning_Visitor" = "#EE30A7",
      "Other" = "#FF82AB"
    )
  ) +
  scale_y_continuous(labels = percent_format()) +
  labs(
    title = "Probabilitas Prediksi Jenis Pengunjung",
    x = "Page Values",
    y = "Probabilitas",
    color = "Kategori"
  ) +
  theme_minimal(base_size = 12) +
  theme(
    legend.position = "bottom",
    plot.title = element_text(face = "bold")
  )

2. Regresi Logistik Ordinal

2.1 Latar Belakang & Data

Dataset: Student Smoking, Alcohol, and Mental Health
Sumber: Mendeley
Jumlah observasi: 706 mahasiswa

Dataset ini memuat informasi mengenai kebiasaan merokok, konsumsi alkohol, strategi mengatasi stres, dan karakteristik mahasiswa yang dikaitkan dengan tingkat kesehatan psikologis (wellness). Tujuan analisis adalah untuk mengetahui faktor-faktor yang berhubungan dengan peningkatan atau penurunan tingkat kesejahteraan psikologis mahasiswa. Variabel respons diklasifikasikan ke dalam kategori berurutan mulai dari Poor/Below Average, Average/Good, hingga Excellent.

Variabel	Jenis	Keterangan
`wellness_3`	Y (respons)	Kesehatan psikologis: Poor/Below average < Average/Good < Excellent — ordinal, 3 kategori
`freq_smoking`	X kategorik	Frekuensi merokok (5 level)
`freq_alcohol2`	X kategorik	Frekuensi konsumsi alkohol (4 level; Monthly & Weekly digabung)
`cope_stress2`	X kategorik	Cara mengatasi stres (4 level; Meditation, Other, Socializing digabung)
`age_group`	X kategorik	Kelompok usia
`year_study`	X kategorik	Tahun studi

df_stud <- read.csv("C:/Users/Zahra Putri/Downloads/ADKREGRESI/student_clean.csv", stringsAsFactors = FALSE)


df_stud$wellness_3 <- dplyr::case_when(
  df_stud$wellness %in% c("Poor","Below average") ~ "Poor/Below average",
  df_stud$wellness %in% c("Average","Good")        ~ "Average/Good",
  df_stud$wellness == "Excellent"                  ~ "Excellent"
)
df_stud$wellness_3 <- ordered(df_stud$wellness_3,
  levels = c("Poor/Below average","Average/Good","Excellent"))


df_stud$freq_alcohol2 <- ifelse(
  df_stud$freq_alcohol %in% c("Monthly","Weekly"), "Monthly/Weekly",
  df_stud$freq_alcohol
)
df_stud$freq_alcohol2 <- factor(df_stud$freq_alcohol2,
  levels = c("Don't consume alcohol","Only on social occasions",
             "Monthly/Weekly","Daily"))


df_stud$cope_stress2 <- ifelse(
  df_stud$cope_stress %in% c("Meditation","Other","Socializing"), "Other",
  df_stud$cope_stress
)
df_stud$cope_stress2 <- factor(df_stud$cope_stress2,
  levels = c("Drinking alcohol","Smoking","Exercise","Other"))


df_stud$freq_smoking <- factor(df_stud$freq_smoking,
  levels = c("Don't smoke","Only on social occasions","Monthly","Weekly","Daily"))
df_stud$age_group <- factor(df_stud$age_group,
  levels = c("18-20","21-23","24-26","27 or older"))
df_stud$year_study <- factor(df_stud$year_study,
  levels = c("1st year","2nd year","3rd year","4th year","Graduate student"))

glimpse(df_stud)

## Rows: 706
## Columns: 15
## $ wellness           <chr> "Good", "Good", "Good", "Excellent", "Excellent", "…
## $ freq_smoking       <fct> NA, Daily, Daily, Daily, Weekly, Daily, Daily, Mont…
## $ freq_alcohol       <chr> "Don’t consume alcohol", "Don’t consume alcohol", "…
## $ help_seeking       <chr> "No", "Yes", "Yes", "Yes", "Yes", "Yes", "Yes", "Ye…
## $ cope_stress        <chr> "Other", "Smoking", "Drinking alcohol", "Exercise",…
## $ age_group          <fct> 24-26, 24-26, 27 or older, 21-23, 18-20, 18-20, 18-…
## $ year_study         <fct> Graduate student, Graduate student, 4th year, Gradu…
## $ wellness_num       <int> 4, 4, 4, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, …
## $ wellness_3         <ord> Average/Good, Average/Good, Average/Good, Excellent…
## $ cope_stress2       <fct> Other, Smoking, Drinking alcohol, Exercise, Drinkin…
## $ freq_alcohol2      <fct> NA, NA, Only on social occasions, Daily, Only on so…
## $ cope_stress_clean  <chr> "Other", "Substance_use", "Substance_use", "Exercis…
## $ freq_alcohol_clean <chr> "Don’t consume alcohol", "Don’t consume alcohol", "…
## $ smoker             <chr> "Non_smoker", "Smoker", "Smoker", "Smoker", "Smoker…
## $ drinker            <chr> "Non_drinker", "Non_drinker", "Drinker", "Drinker",…

2.2 Distribusi Kategori Respons

df_stud %>%
  count(wellness_3) %>%
  mutate(proporsi = n / sum(n)) %>%
  kable(caption = "Distribusi Tingkat Kesehatan Psikologis (wellness_3)", digits = 3) %>%
  kable_styling(bootstrap_options = c("striped","hover","condensed"), full_width = FALSE)

Distribusi Tingkat Kesehatan Psikologis (wellness_3)
wellness_3	n	proporsi
Poor/Below average	589	0.834
Average/Good	35	0.050
Excellent	82	0.116

df_stud %>%
  count(wellness_3) %>%
  mutate(proporsi = n / sum(n)) %>%
  ggplot(aes(x = wellness_3, y = proporsi, fill = wellness_3)) +
  geom_col(width = 0.65, alpha = 0.92) +
  geom_text(aes(label = percent(proporsi, accuracy = 0.1)),
            vjust = -0.4, fontface = "bold") +
  scale_y_continuous(labels = percent_format(), limits = c(0, 0.90)) +
  scale_fill_manual(values = c("#D02090","#EE3A8C","#FF3E96")) +
  labs(title = "Distribusi Tingkat Kesehatan Psikologis",
       subtitle = "Respons ordinal: Poor/Below average → Average/Good → Excellent.",
       x = NULL, y = "Proporsi") +
  theme_minimal(base_size = 12) +
  theme(legend.position = "none")

2.3 Estimasi Model Ordinal

fit_ord <- MASS::polr(
  wellness_3 ~ freq_smoking + freq_alcohol2 + cope_stress2 +
               age_group + year_study,
  data   = df_stud,
  method = "logistic",
  Hess   = TRUE
)

summary(fit_ord)

## Call:
## MASS::polr(formula = wellness_3 ~ freq_smoking + freq_alcohol2 + 
##     cope_stress2 + age_group + year_study, data = df_stud, Hess = TRUE, 
##     method = "logistic")
## 
## Coefficients:
##                                          Value Std. Error  t value
## freq_smokingOnly on social occasions  -0.32403     1.5248 -0.21251
## freq_smokingMonthly                   -2.78986     2.0421 -1.36616
## freq_smokingWeekly                    -0.10085     1.0531 -0.09576
## freq_alcohol2Only on social occasions  0.42065     1.1204  0.37546
## freq_alcohol2Monthly/Weekly            0.13183     1.3559  0.09723
## cope_stress2Smoking                    1.82697     0.9056  2.01749
## cope_stress2Exercise                  11.47952     2.3913  4.80048
## cope_stress2Other                      8.29811     1.9297  4.30020
## age_group21-23                        -3.48180     1.5144 -2.29907
## age_group24-26                        -4.51354     1.6306 -2.76802
## age_group27 or older                  -2.23651     2.2024 -1.01549
## year_study2nd year                     1.80758     1.7868  1.01161
## year_study3rd year                     3.06678     1.7976  1.70600
## year_study4th year                     1.98616     1.7116  1.16043
## year_studyGraduate student             0.02569     2.2474  0.01143
## 
## Intercepts:
##                                 Value   Std. Error t value
## Poor/Below average|Average/Good  3.8616  1.2105     3.1901
## Average/Good|Excellent           6.5672  1.5059     4.3610
## 
## Residual Deviance: 76.03743 
## AIC: 110.0374 
## (96 observations deleted due to missingness)

2.4 Tabel Hasil & Uji Signifikansi

ord_coef <- coef(summary(fit_ord))

result_ord <- as.data.frame(ord_coef) %>%
  tibble::rownames_to_column("parameter") %>%
  dplyr::rename(estimate = Value, std_error = `Std. Error`, t_value = `t value`) %>%
  mutate(
    p_value        = 2 * (1 - pnorm(abs(t_value))),
    jenis          = ifelse(grepl("\\|", parameter), "Cutpoint", "Koefisien"),
    OR_naik        = ifelse(jenis == "Koefisien", exp(-estimate), NA_real_),
    CI_low_naik    = ifelse(jenis == "Koefisien", exp(-estimate - 1.96 * std_error), NA_real_),
    CI_high_naik   = ifelse(jenis == "Koefisien", exp(-estimate + 1.96 * std_error), NA_real_)
  )

result_ord %>%
  mutate(across(c(estimate, std_error, t_value, p_value, OR_naik, CI_low_naik, CI_high_naik),
                ~ round(.x, 4))) %>%
  kable(caption = "Ringkasan Model Regresi Logistik Ordinal (Proportional Odds)",
        col.names = c("Parameter","Estimate (polr)","SE","t-value","p-value",
                      "Jenis","OR (naik)","CI 2.5%","CI 97.5%")) %>%
  kable_styling(bootstrap_options = c("striped","hover","condensed"), full_width = TRUE) %>%
  row_spec(which(result_ord$jenis == "Cutpoint"), background = "#f0f0f0")

Ringkasan Model Regresi Logistik Ordinal (Proportional Odds)
Parameter	Estimate (polr)	SE	t-value	p-value	Jenis	OR (naik)	CI 2.5%	CI 97.5%
freq_smokingOnly on social occasions	-0.3240	1.5248	-0.2125	0.8317	Koefisien	1.3827	0.0696	27.4591
freq_smokingMonthly	-2.7899	2.0421	-1.3662	0.1719	Koefisien	16.2787	0.2974	891.0659
freq_smokingWeekly	-0.1008	1.0531	-0.0958	0.9237	Koefisien	1.1061	0.1404	8.7143
freq_alcohol2Only on social occasions	0.4207	1.1204	0.3755	0.7073	Koefisien	0.6566	0.0731	5.9017
freq_alcohol2Monthly/Weekly	0.1318	1.3559	0.0972	0.9225	Koefisien	0.8765	0.0615	12.4997
cope_stress2Smoking	1.8270	0.9056	2.0175	0.0436	Koefisien	0.1609	0.0273	0.9493
cope_stress2Exercise	11.4795	2.3913	4.8005	0.0000	Koefisien	0.0000	0.0000	0.0011
cope_stress2Other	8.2981	1.9297	4.3002	0.0000	Koefisien	0.0002	0.0000	0.0109
age_group21-23	-3.4818	1.5144	-2.2991	0.0215	Koefisien	32.5183	1.6711	632.7666
age_group24-26	-4.5135	1.6306	-2.7680	0.0056	Koefisien	91.2446	3.7343	2229.4850
age_group27 or older	-2.2365	2.2024	-1.0155	0.3099	Koefisien	9.3606	0.1249	701.4864
year_study2nd year	1.8076	1.7868	1.0116	0.3117	Koefisien	0.1641	0.0049	5.4445
year_study3rd year	3.0668	1.7976	1.7060	0.0880	Koefisien	0.0466	0.0014	1.5787
year_study4th year	1.9862	1.7116	1.1604	0.2459	Koefisien	0.1372	0.0048	3.9295
year_studyGraduate student	0.0257	2.2474	0.0114	0.9909	Koefisien	0.9746	0.0119	79.7796
Poor/Below average\|Average/Good	3.8616	1.2105	3.1901	0.0014	Cutpoint	NA	NA	NA
Average/Good\|Excellent	6.5672	1.5059	4.3610	0.0000	Cutpoint	NA	NA	NA

2.5 Interpretasi

cope_stress = Smoking: OR = 0,1609, artinya mahasiswa yang mengatasi stres dengan merokok memiliki odds berada pada kategori wellness yang lebih tinggi 83,91% lebih rendah dibanding kategori referensi.
cope_stress = Exercise: OR ≈ 0,0000, artinya mahasiswa yang mengatasi stres dengan olahraga memiliki odds berada pada kategori wellness yang lebih tinggi jauh lebih rendah dibanding kategori referensi.
cope_stress = Other: OR = 0,0002, artinya mahasiswa yang menggunakan cara lain untuk mengatasi stres memiliki odds berada pada kategori wellness yang lebih tinggi 99,98% lebih rendah dibanding kategori referensi.
age_group 21–23: OR = 32,5183, artinya mahasiswa usia 21–23 tahun memiliki odds berada pada kategori wellness yang lebih tinggi 32,52 kali lebih besar dibanding kelompok usia referensi.
age_group 24–26: OR = 91,2446, artinya mahasiswa usia 24–26 tahun memiliki odds berada pada kategori wellness yang lebih tinggi 91,24 kali lebih besar dibanding kelompok usia referensi.

2.6 Prediksi Probabilitas Ordinal

pred_prob_ord <- predict(
  fit_ord,
  type = "probs"
)

head(pred_prob_ord)

##   Poor/Below average Average/Good   Excellent
## 3       0.9756678240  0.022668194 0.001663982
## 4       0.0153407378  0.173703845 0.810955418
## 5       0.6165745499  0.343522633 0.039902817
## 6       0.0004913311  0.006810168 0.992698501
## 7       0.0004913311  0.006810168 0.992698501
## 8       0.0079386458  0.098991531 0.893069823

pred_class_ord <- predict(
  fit_ord,
  type = "class"
)

data_ord_pred <- model.frame(fit_ord) %>%
  bind_cols(
    as.data.frame(pred_prob_ord)
  ) %>%
  mutate(
    prediksi = pred_class_ord
  )

data_ord_pred %>%
  head() %>%
  kable(
    caption = "Hasil Prediksi Regresi Logistik Ordinal",
    digits = 4
  ) %>%
  kable_styling(
    bootstrap_options = c(
      "striped",
      "hover",
      "condensed"
    ),
    full_width = FALSE
  ) %>%
  scroll_box(
    width = "100%",
    height = "300px"
  )

Hasil Prediksi Regresi Logistik Ordinal
	wellness_3	freq_smoking	freq_alcohol2	cope_stress2	age_group	year_study	Poor/Below average	Average/Good	Excellent	prediksi
3	Average/Good	Daily	Only on social occasions	Drinking alcohol	27 or older	4th year	0.9757	0.0227	0.0017	Poor/Below average
4	Excellent	Daily	Daily	Exercise	21-23	Graduate student	0.0153	0.1737	0.8110	Excellent
5	Excellent	Weekly	Only on social occasions	Drinking alcohol	18-20	3rd year	0.6166	0.3435	0.0399	Poor/Below average
6	Excellent	Daily	Daily	Exercise	18-20	1st year	0.0005	0.0068	0.9927	Excellent
7	Excellent	Daily	Daily	Exercise	18-20	1st year	0.0005	0.0068	0.9927	Excellent
8	Excellent	Monthly	Daily	Exercise	18-20	1st year	0.0079	0.0990	0.8931	Excellent

2.7 Uji Model (LRT)

fit_ord_null <- MASS::polr(wellness_3 ~ 1, data = df_stud,
                            method = "logistic", Hess = TRUE)

lrt_ord  <- 2 * (as.numeric(logLik(fit_ord)) - as.numeric(logLik(fit_ord_null)))
df_ord   <- length(coef(fit_ord))
p_ord    <- 1 - pchisq(lrt_ord, df_ord)

tibble::tibble(
  `G² (Devians)` = lrt_ord,
  `df`           = df_ord,
  `p-value`      = p_ord
) %>%
  mutate(across(c(`G² (Devians)`,`p-value`), ~ round(.x, 4))) %>%
  kable(caption = "Uji Kebaikan Model Ordinal (LRT vs Model Null)") %>%
  kable_styling(bootstrap_options = c("striped","hover"), full_width = FALSE)

Uji Kebaikan Model Ordinal (LRT vs Model Null)
G² (Devians)	df	p-value
700.7769	15	0

2.8 Confusion Matrix

conf_ord <- table(
  Aktual = data_ord_pred$wellness_3,
  Prediksi = data_ord_pred$prediksi
)

conf_ord

##                     Prediksi
## Aktual               Poor/Below average Average/Good Excellent
##   Poor/Below average                576            1         0
##   Average/Good                        6            2         2
##   Excellent                           2            0        21

accuracy_ord <- sum(diag(conf_ord)) /
  sum(conf_ord)

accuracy_ord

## [1] 0.9819672

3. Regresi Logistik Biner

3.1 Latar Belakang & Data

Dataset: Bank Marketing
Sumber: UCI Machine Learning Repository
Jumlah observasi: 4.521 nasabah

Dataset ini berasal dari kampanye pemasaran langsung suatu bank Portugis yang menawarkan produk deposito berjangka kepada nasabah. Tujuan analisis adalah untuk mengetahui faktor-faktor yang memengaruhi keputusan nasabah untuk berlangganan deposito (yes) atau tidak (no).

Variabel	Peran	Keterangan
`y`	Y (respons)	Nasabah berlangganan deposito berjangka: yes/no — biner
`age`	X kontinu	Usia nasabah (tahun)
`balance`	X kontinu	Saldo rekening rata-rata (euro)
`duration`	X kontinu	Durasi kontak terakhir (detik)
`campaign`	X kontinu	Jumlah kontak selama kampanye ini
`job`	X kategorik	Pekerjaan
`marital`	X kategorik	Status pernikahan
`education`	X kategorik	Tingkat pendidikan
`housing`	X biner	Memiliki kredit perumahan (yes/no)
`loan`	X biner	Memiliki pinjaman pribadi (yes/no)
`poutcome`	X kategorik	Hasil kampanye pemasaran sebelumnya

df_bank <- read.csv("C:/Users/Zahra Putri/Downloads/ADKREGRESI/bank_clean.csv", stringsAsFactors = FALSE, sep = ",")

# Faktor
df_bank$y       <- factor(df_bank$y, levels = c("no","yes"))
df_bank$job     <- factor(df_bank$job)
df_bank$marital <- factor(df_bank$marital)
df_bank$education <- factor(df_bank$education,
                             levels = c("primary","secondary","tertiary","unknown"))
df_bank$housing  <- factor(df_bank$housing)
df_bank$loan     <- factor(df_bank$loan)
df_bank$poutcome <- factor(df_bank$poutcome,
                            levels = c("unknown","failure","other","success"))

glimpse(df_bank)

## Rows: 4,521
## Columns: 13
## $ y         <fct> no, no, no, no, no, no, no, no, no, no, no, no, no, yes, no,…
## $ age       <int> 30, 33, 35, 30, 59, 35, 36, 39, 41, 43, 39, 43, 36, 20, 31, …
## $ job       <fct> unemployed, services, management, management, blue-collar, m…
## $ marital   <fct> married, married, single, married, married, single, married,…
## $ education <fct> primary, secondary, tertiary, tertiary, secondary, tertiary,…
## $ default   <chr> "no", "no", "no", "no", "no", "no", "no", "no", "no", "no", …
## $ balance   <int> 1787, 4789, 1350, 1476, 0, 747, 307, 147, 221, -88, 9374, 26…
## $ housing   <fct> no, yes, yes, yes, yes, no, yes, yes, yes, yes, yes, yes, no…
## $ loan      <fct> no, yes, no, yes, no, no, no, no, no, yes, no, no, no, no, y…
## $ duration  <int> 79, 220, 185, 199, 226, 141, 341, 151, 57, 313, 273, 113, 32…
## $ campaign  <int> 1, 1, 1, 4, 1, 2, 1, 2, 2, 1, 1, 2, 2, 1, 1, 2, 5, 1, 1, 1, …
## $ poutcome  <fct> unknown, failure, failure, unknown, unknown, failure, other,…
## $ y_bin     <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, …

3.2 Distribusi Kategori Respons

df_bank %>%
  count(y) %>%
  mutate(proporsi = n / sum(n)) %>%
  kable(caption = "Distribusi Variabel Respons (Langganan Deposito)", digits = 3) %>%
  kable_styling(bootstrap_options = c("striped","hover"), full_width = FALSE)

Distribusi Variabel Respons (Langganan Deposito)
y	n	proporsi
no	4000	0.885
yes	521	0.115

df_bank %>%
  count(y) %>%
  mutate(proporsi = n / sum(n)) %>%
  ggplot(aes(x = y, y = proporsi, fill = y)) +
  geom_col(width = 0.5, alpha = 0.92) +
  geom_text(aes(label = percent(proporsi, accuracy = 0.1)),
            vjust = -0.4, fontface = "bold") +
  scale_y_continuous(labels = percent_format(), limits = c(0, 1)) +
  scale_fill_manual(values = c("#8B1C62","#EE30A7")) +
  labs(title = "Distribusi Keputusan Nasabah",
       subtitle = "Respons biner: yes = berlangganan deposito, no = tidak.",
       x = "Berlangganan deposito?", y = "Proporsi") +
  theme_minimal(base_size = 12) +
  theme(legend.position = "none")

3.3 Estimasi Model Biner

fit_bin <- glm(
  y ~ age + balance + duration + campaign +
      job + marital + education + housing + loan + poutcome,
  data   = df_bank,
  family = binomial(link = "logit")
)

summary(fit_bin)

## 
## Call:
## glm(formula = y ~ age + balance + duration + campaign + job + 
##     marital + education + housing + loan + poutcome, family = binomial(link = "logit"), 
##     data = df_bank)
## 
## Coefficients:
##                      Estimate Std. Error z value Pr(>|z|)    
## (Intercept)        -2.899e+00  4.417e-01  -6.564 5.24e-11 ***
## age                 8.041e-04  6.783e-03   0.119 0.905632    
## balance             3.508e-06  1.755e-05   0.200 0.841598    
## duration            3.957e-03  1.917e-04  20.641  < 2e-16 ***
## campaign           -6.445e-02  2.610e-02  -2.469 0.013533 *  
## jobblue-collar     -5.127e-01  2.338e-01  -2.193 0.028338 *  
## jobentrepreneur    -4.108e-01  3.636e-01  -1.130 0.258539    
## jobhousemaid       -3.948e-01  3.985e-01  -0.991 0.321826    
## jobmanagement      -8.183e-02  2.338e-01  -0.350 0.726371    
## jobretired          6.055e-01  2.967e-01   2.040 0.041310 *  
## jobself-employed   -3.238e-01  3.389e-01  -0.955 0.339335    
## jobservices        -2.693e-01  2.626e-01  -1.025 0.305159    
## jobstudent          5.523e-01  3.581e-01   1.542 0.123048    
## jobtechnician      -2.479e-01  2.217e-01  -1.118 0.263590    
## jobunemployed      -6.685e-01  4.085e-01  -1.636 0.101780    
## jobunknown          4.758e-01  5.365e-01   0.887 0.375149    
## maritalmarried     -4.268e-01  1.665e-01  -2.563 0.010387 *  
## maritalsingle      -1.696e-01  1.950e-01  -0.870 0.384402    
## educationsecondary  1.042e-01  1.940e-01   0.537 0.591156    
## educationtertiary   4.099e-01  2.238e-01   1.831 0.067050 .  
## educationunknown   -4.056e-01  3.356e-01  -1.209 0.226806    
## housingyes         -6.343e-01  1.206e-01  -5.262 1.42e-07 ***
## loanyes            -7.221e-01  1.917e-01  -3.767 0.000165 ***
## poutcomefailure     4.995e-01  1.696e-01   2.945 0.003235 ** 
## poutcomeother       9.757e-01  2.172e-01   4.492 7.05e-06 ***
## poutcomesuccess     2.870e+00  2.154e-01  13.323  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 3231.0  on 4520  degrees of freedom
## Residual deviance: 2323.3  on 4495  degrees of freedom
## AIC: 2375.3
## 
## Number of Fisher Scoring iterations: 6

3.4 Tabel Hasil & Odds Ratio

bin_coef <- as.data.frame(coef(summary(fit_bin))) %>%
  tibble::rownames_to_column("parameter") %>%
  dplyr::rename(
    estimate  = Estimate,
    std_error = `Std. Error`,
    z_value   = `z value`,
    p_value   = `Pr(>|z|)`
  ) %>%
  mutate(
    OR      = exp(estimate),
    CI_low  = exp(estimate - 1.96 * std_error),
    CI_high = exp(estimate + 1.96 * std_error)
  )

bin_coef %>%
  mutate(across(c(estimate, std_error, z_value, p_value, OR, CI_low, CI_high),
                ~ round(.x, 4))) %>%
  kable(caption = "Ringkasan Model Regresi Logistik Biner",
        col.names = c("Parameter","Estimate","SE","z-value","p-value",
                      "OR","CI 2.5%","CI 97.5%")) %>%
  kable_styling(bootstrap_options = c("striped","hover","condensed"), full_width = TRUE)

Ringkasan Model Regresi Logistik Biner
Parameter	Estimate	SE	z-value	p-value	OR	CI 2.5%	CI 97.5%
(Intercept)	-2.8995	0.4417	-6.5640	0.0000	0.0551	0.0232	0.1309
age	0.0008	0.0068	0.1185	0.9056	1.0008	0.9876	1.0142
balance	0.0000	0.0000	0.1998	0.8416	1.0000	1.0000	1.0000
duration	0.0040	0.0002	20.6409	0.0000	1.0040	1.0036	1.0043
campaign	-0.0644	0.0261	-2.4694	0.0135	0.9376	0.8908	0.9868
jobblue-collar	-0.5127	0.2338	-2.1926	0.0283	0.5989	0.3787	0.9471
jobentrepreneur	-0.4108	0.3636	-1.1299	0.2585	0.6631	0.3252	1.3523
jobhousemaid	-0.3948	0.3985	-0.9907	0.3218	0.6738	0.3085	1.4715
jobmanagement	-0.0818	0.2338	-0.3500	0.7264	0.9214	0.5827	1.4571
jobretired	0.6055	0.2967	2.0404	0.0413	1.8321	1.0241	3.2775
jobself-employed	-0.3238	0.3389	-0.9555	0.3393	0.7234	0.3723	1.4056
jobservices	-0.2693	0.2626	-1.0254	0.3052	0.7639	0.4566	1.2782
jobstudent	0.5523	0.3581	1.5421	0.1230	1.7372	0.8610	3.5050
jobtechnician	-0.2479	0.2217	-1.1179	0.2636	0.7804	0.5053	1.2053
jobunemployed	-0.6685	0.4085	-1.6363	0.1018	0.5125	0.2301	1.1414
jobunknown	0.4758	0.5365	0.8869	0.3751	1.6094	0.5623	4.6063
maritalmarried	-0.4268	0.1665	-2.5627	0.0104	0.6526	0.4709	0.9045
maritalsingle	-0.1696	0.1950	-0.8698	0.3844	0.8440	0.5760	1.2368
educationsecondary	0.1042	0.1940	0.5372	0.5912	1.1099	0.7587	1.6235
educationtertiary	0.4099	0.2238	1.8313	0.0670	1.5067	0.9716	2.3364
educationunknown	-0.4056	0.3356	-1.2086	0.2268	0.6666	0.3453	1.2868
housingyes	-0.6343	0.1206	-5.2620	0.0000	0.5303	0.4187	0.6716
loanyes	-0.7221	0.1917	-3.7668	0.0002	0.4857	0.3336	0.7072
poutcomefailure	0.4995	0.1696	2.9445	0.0032	1.6480	1.1818	2.2980
poutcomeother	0.9757	0.2172	4.4922	0.0000	2.6530	1.7332	4.0608
poutcomesuccess	2.8699	0.2154	13.3226	0.0000	17.6355	11.5617	26.9002

3.5 Interpretasi

duration: OR = 1,0040, artinya setiap tambahan 1 detik durasi kontak meningkatkan odds nasabah berlangganan deposito sebesar 0,4%.
campaign: OR = 0,9376, artinya setiap tambahan 1 kali kontak selama kampanye menurunkan odds nasabah berlangganan deposito sebesar 6,24%.
jobblue-collar (vs pekerjaan referensi): OR = 0,5989, artinya nasabah dengan pekerjaan blue-collar memiliki odds berlangganan deposito 40,11% lebih rendah dibanding kategori pekerjaan referensi.
jobretired (vs pekerjaan referensi): OR = 1,8321, artinya nasabah pensiunan memiliki odds berlangganan deposito 1,83 kali lebih besar dibanding kategori pekerjaan referensi.
maritalmarried (vs status referensi): OR = 0,6526, artinya nasabah yang menikah memiliki odds berlangganan deposito 34,74% lebih rendah dibanding status referensi.
housingyes: OR = 0,5303, artinya nasabah yang memiliki kredit perumahan memiliki odds berlangganan deposito 46,97% lebih rendah dibanding yang tidak memiliki kredit perumahan.
loanyes: OR = 0,4857, artinya nasabah yang memiliki pinjaman pribadi memiliki odds berlangganan deposito 51,43% lebih rendah dibanding yang tidak memiliki pinjaman.
poutcomefailure (vs kategori referensi): OR = 1,6480, artinya nasabah dengan hasil kampanye sebelumnya gagal memiliki odds berlangganan deposito 1,65 kali lebih besar dibanding kategori referensi.
poutcomeother (vs kategori referensi): OR = 2,6530, artinya nasabah dengan hasil kampanye sebelumnya kategori other memiliki odds berlangganan deposito 2,65 kali lebih besar dibanding kategori referensi.
poutcomesuccess (vs kategori referensi): OR = 17,6355, artinya nasabah dengan hasil kampanye sebelumnya berhasil memiliki odds berlangganan deposito 17,64 kali lebih besar dibanding kategori referensi.

3.6 Prediksi Probabilitas Biner

pred_prob_bin <- predict(
  fit_bin,
  type = "response"
)

data_bin_pred <- df_bank %>%
  mutate(
    Prob_Yes = pred_prob_bin,
    Prob_No = 1 - pred_prob_bin,
    prediksi = ifelse(
      pred_prob_bin >= 0.5,
      "yes",
      "no"
    )
  )

head(data_bin_pred)

##    y age         job marital education default balance housing loan duration
## 1 no  30  unemployed married   primary      no    1787      no   no       79
## 2 no  33    services married secondary      no    4789     yes  yes      220
## 3 no  35  management  single  tertiary      no    1350     yes   no      185
## 4 no  30  management married  tertiary      no    1476     yes  yes      199
## 5 no  59 blue-collar married secondary      no       0     yes   no      226
## 6 no  35  management  single  tertiary      no     747      no   no      141
##   campaign poutcome y_bin   Prob_Yes   Prob_No prediksi
## 1        1  unknown     0 0.02374948 0.9762505       no
## 2        1  failure     0 0.02934545 0.9706545       no
## 3        1  failure     0 0.10198746 0.8980125       no
## 4        4  unknown     0 0.02197348 0.9780265       no
## 5        1  unknown     0 0.02954577 0.9704542       no
## 6        2  failure     0 0.14409773 0.8559023       no

data_bin_pred %>%
  head() %>%
  kable(
    caption = "Hasil Prediksi Regresi Logistik Biner",
    digits = 4
  ) %>%
  kable_styling(
    bootstrap_options = c(
      "striped",
      "hover",
      "condensed"
    ),
    full_width = FALSE
  ) %>%
  scroll_box(
    width = "100%",
    height = "300px"
  )

Hasil Prediksi Regresi Logistik Biner
y	age	job	marital	education	default	balance	housing	loan	duration	campaign	poutcome	Prob_Yes	Prob_No	prediksi
no	30	unemployed	married	primary	no	1787	no	no	79	1	unknown	0.0237	0.9763	no
no	33	services	married	secondary	no	4789	yes	yes	220	1	failure	0.0293	0.9707	no
no	35	management	single	tertiary	no	1350	yes	no	185	1	failure	0.1020	0.8980	no
no	30	management	married	tertiary	no	1476	yes	yes	199	4	unknown	0.0220	0.9780	no
no	59	blue-collar	married	secondary	no	0	yes	no	226	1	unknown	0.0295	0.9705	no
no	35	management	single	tertiary	no	747	no	no	141	2	failure	0.1441	0.8559	no

3.7 Confusion Matrix

conf_bin <- table(
  Aktual = df_bank$y,
  Prediksi = data_bin_pred$prediksi
)

conf_bin

##       Prediksi
## Aktual   no  yes
##    no  3917   83
##    yes  362  159

accuracy_bin <- sum(diag(conf_bin)) / sum(conf_bin)
cat("Akurasi klasifikasi:", round(accuracy_bin, 4))

## Akurasi klasifikasi: 0.9016

3.8 Visualisasi Probabilitas Prediksi

grid_bin <- expand.grid(
  duration = seq(
    min(df_bank$duration),
    max(df_bank$duration),
    length.out = 100
  ),
  age = mean(df_bank$age),
  balance = mean(df_bank$balance),
  campaign = mean(df_bank$campaign),
  job = levels(df_bank$job)[1],
  marital = levels(df_bank$marital)[1],
  education = levels(df_bank$education)[1],
  housing = "no",
  loan = "no",
  poutcome = "unknown"
)

grid_bin$prob <- predict(
  fit_bin,
  newdata = grid_bin,
  type = "response"
)

ggplot(
  grid_bin,
  aes(duration, prob)
) +
  geom_line(
    linewidth = 1.5,
    color = "#EE30A7"
  ) +
  scale_y_continuous(labels = percent_format()) +
  labs(
    title = "Probabilitas Berlangganan Deposito",
    x = "Duration",
    y = "Probabilitas"
  ) +
  theme_minimal(base_size = 12) +
  theme(
    plot.title = element_text(face = "bold")
  )

4. Regresi Poisson

4.1 Latar Belakang & Data

Dataset: NPHA (National Poll on Healthy Aging) — Doctor Visits
Sumber: UCI Machine Learning Repository
Jumlah observasi: 696 responden

Dataset ini berisi informasi kesehatan dan kondisi tidur responden lanjut usia yang dikumpulkan dalam survei National Poll on Healthy Aging (NPHA). Tujuan analisisnya adalah untuk mengidentifikasi faktor-faktor yang berhubungan dengan jumlah dokter yang dikunjungi oleh responden.

Variabel	Peran	Keterangan
`Number_of_Doctors_Visited`	Y (respons)	Jumlah dokter yang dikunjungi: 1, 2, atau 3 — hitungan (count)
`Phyiscal_Health`	X ordinal	Status kesehatan fisik (1=buruk … 5=sangat baik)
`Mental_Health`	X ordinal	Status kesehatan mental (1–5)
`Trouble_Sleeping`	X ordinal	Gangguan tidur (1–3)
`Prescription_Sleep_Medication`	X ordinal	Penggunaan obat tidur resep (1–3)
`Employment`	X kategorik	Status pekerjaan
`Gender`	X kategorik	Jenis kelamin
`Race`	X kategorik	Ras
`Stress_Keeps_Patient_from_Sleeping`	X biner	Stres mengganggu tidur (0/1)
`Pain_Keeps_Patient_from_Sleeping`	X biner	Nyeri mengganggu tidur (0/1)

df_npha <- read.csv("C:/Users/Zahra Putri/Downloads/ADKREGRESI/npha_clean.csv", stringsAsFactors = FALSE)

df_npha$Gender     <- factor(df_npha$Gender, levels = c("Male","Female"))
df_npha$Race       <- factor(df_npha$Race)
df_npha$Employment <- factor(df_npha$Employment,
                              levels = c("Employed","Self_employed","Retired","Unemployed"))

glimpse(df_npha)

## Rows: 696
## Columns: 14
## $ Number_of_Doctors_Visited                  <int> 3, 2, 3, 1, 3, 2, 3, 2, 2, …
## $ Phyiscal_Health                            <dbl> 4, 4, 3, 3, 3, 3, 4, 3, 3, …
## $ Mental_Health                              <dbl> 3, 2, 2, 2, 3, 2, 1, 2, 1, …
## $ Dental_Health                              <dbl> 3, 3, 3, 3, 3, 4, 1, 6, 2, …
## $ Employment                                 <fct> Retired, Retired, Retired, …
## $ Stress_Keeps_Patient_from_Sleeping         <int> 0, 1, 0, 0, 1, 0, 0, 1, 0, …
## $ Medication_Keeps_Patient_from_Sleeping     <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ Pain_Keeps_Patient_from_Sleeping           <int> 0, 0, 0, 0, 0, 0, 1, 0, 1, …
## $ Bathroom_Needs_Keeps_Patient_from_Sleeping <int> 0, 1, 0, 1, 0, 1, 1, 0, 1, …
## $ Uknown_Keeps_Patient_from_Sleeping         <int> 1, 0, 1, 0, 0, 0, 0, 0, 0, …
## $ Trouble_Sleeping                           <dbl> 2, 3, 3, 3, 2, 3, 2, 3, 3, …
## $ Prescription_Sleep_Medication              <dbl> 3, 3, 3, 3, 3, 3, 1, 3, 3, …
## $ Race                                       <fct> White, White, Asian, Asian,…
## $ Gender                                     <fct> Female, Male, Male, Female,…

4.2 Distribusi Kategori Respons

df_npha %>%
  count(Number_of_Doctors_Visited) %>%
  mutate(proporsi = n / sum(n)) %>%
  kable(caption = "Distribusi Jumlah Dokter yang Dikunjungi", digits = 3) %>%
  kable_styling(bootstrap_options = c("striped","hover"), full_width = FALSE)

Distribusi Jumlah Dokter yang Dikunjungi
Number_of_Doctors_Visited	n	proporsi
1	126	0.181
2	363	0.522
3	207	0.297

df_npha %>%
  count(Number_of_Doctors_Visited) %>%
  ggplot(aes(x = Number_of_Doctors_Visited, y = n)) +
  geom_col(width = 0.6, fill = "#B03060", alpha = 0.92) +
  geom_text(aes(label = n), vjust = -0.4, fontface = "bold") +
  scale_x_continuous(breaks = 1:3) +
  labs(title = "Distribusi Jumlah Dokter yang Dikunjungi",
       subtitle = "Respons Poisson: data cacah (count), bilangan bulat nonnegatif.",
       x = "Jumlah Dokter", y = "Frekuensi") +
  theme_minimal(base_size = 12)

4.3 Estimasi Model Poisson

fit_pois <- glm(
  Number_of_Doctors_Visited ~
    Phyiscal_Health + Mental_Health + Trouble_Sleeping +
    Prescription_Sleep_Medication + Employment + Gender + Race +
    Stress_Keeps_Patient_from_Sleeping + Pain_Keeps_Patient_from_Sleeping,
  data   = df_npha,
  family = poisson(link = "log")
)

summary(fit_pois)

## 
## Call:
## glm(formula = Number_of_Doctors_Visited ~ Phyiscal_Health + Mental_Health + 
##     Trouble_Sleeping + Prescription_Sleep_Medication + Employment + 
##     Gender + Race + Stress_Keeps_Patient_from_Sleeping + Pain_Keeps_Patient_from_Sleeping, 
##     family = poisson(link = "log"), data = df_npha)
## 
## Coefficients:
##                                      Estimate Std. Error z value Pr(>|z|)  
## (Intercept)                         0.5830124  0.2669421   2.184    0.029 *
## Phyiscal_Health                     0.0555237  0.0344872   1.610    0.107  
## Mental_Health                      -0.0227791  0.0334276  -0.681    0.496  
## Trouble_Sleeping                    0.0140007  0.0448053   0.312    0.755  
## Prescription_Sleep_Medication      -0.0747015  0.0536400  -1.393    0.164  
## EmploymentSelf_employed             0.0422049  0.1430003   0.295    0.768  
## EmploymentRetired                   0.0891016  0.1099064   0.811    0.418  
## EmploymentUnemployed                0.1067899  0.2027444   0.527    0.598  
## GenderFemale                       -0.0006536  0.0535056  -0.012    0.990  
## RaceBlack                           0.1075818  0.1531745   0.702    0.482  
## RaceHispanic                        0.1032178  0.2002224   0.516    0.606  
## RaceOther                           0.2093353  0.1892143   1.106    0.269  
## RaceWhite                           0.1488566  0.1200194   1.240    0.215  
## Stress_Keeps_Patient_from_Sleeping  0.0507501  0.0622897   0.815    0.415  
## Pain_Keeps_Patient_from_Sleeping    0.0149139  0.0668779   0.223    0.824  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for poisson family taken to be 1)
## 
##     Null deviance: 162.29  on 695  degrees of freedom
## Residual deviance: 152.51  on 681  degrees of freedom
## AIC: 2002.6
## 
## Number of Fisher Scoring iterations: 4

4.4 Tabel Hasil & IRR

pois_coef <- as.data.frame(coef(summary(fit_pois))) %>%
  tibble::rownames_to_column("parameter") %>%
  dplyr::rename(
    estimate  = Estimate,
    std_error = `Std. Error`,
    z_value   = `z value`,
    p_value   = `Pr(>|z|)`
  ) %>%
  mutate(
    IRR              = exp(estimate),
    CI_low           = exp(estimate - 1.96 * std_error),
    CI_high          = exp(estimate + 1.96 * std_error),
    perubahan_persen = 100 * (IRR - 1)
  )

pois_coef %>%
  mutate(across(c(estimate, std_error, z_value, p_value,
                  IRR, CI_low, CI_high, perubahan_persen), ~ round(.x, 4))) %>%
  kable(caption = "Ringkasan Model Regresi Poisson",
        col.names = c("Parameter","Estimate","SE","z-value","p-value",
                      "IRR","CI 2.5%","CI 97.5%","Δ%")) %>%
  kable_styling(bootstrap_options = c("striped","hover","condensed"), full_width = TRUE)

Ringkasan Model Regresi Poisson
Parameter	Estimate	SE	z-value	p-value	IRR	CI 2.5%	CI 97.5%	Δ%
(Intercept)	0.5830	0.2669	2.1840	0.0290	1.7914	1.0616	3.0229	79.1427
Phyiscal_Health	0.0555	0.0345	1.6100	0.1074	1.0571	0.9880	1.1310	5.7094
Mental_Health	-0.0228	0.0334	-0.6814	0.4956	0.9775	0.9155	1.0437	-2.2522
Trouble_Sleeping	0.0140	0.0448	0.3125	0.7547	1.0141	0.9288	1.1072	1.4099
Prescription_Sleep_Medication	-0.0747	0.0536	-1.3926	0.1637	0.9280	0.8354	1.0309	-7.1980
EmploymentSelf_employed	0.0422	0.1430	0.2951	0.7679	1.0431	0.7881	1.3806	4.3108
EmploymentRetired	0.0891	0.1099	0.8107	0.4175	1.0932	0.8813	1.3560	9.3192
EmploymentUnemployed	0.1068	0.2027	0.5267	0.5984	1.1127	0.7478	1.6556	11.2700
GenderFemale	-0.0007	0.0535	-0.0122	0.9903	0.9993	0.8999	1.1098	-0.0653
RaceBlack	0.1076	0.1532	0.7023	0.4825	1.1136	0.8248	1.5035	11.3582
RaceHispanic	0.1032	0.2002	0.5155	0.6062	1.1087	0.7488	1.6416	10.8733
RaceOther	0.2093	0.1892	1.1063	0.2686	1.2329	0.8508	1.7864	23.2858
RaceWhite	0.1489	0.1200	1.2403	0.2149	1.1605	0.9172	1.4683	16.0507
Stress_Keeps_Patient_from_Sleeping	0.0508	0.0623	0.8147	0.4152	1.0521	0.9311	1.1887	5.2060
Pain_Keeps_Patient_from_Sleeping	0.0149	0.0669	0.2230	0.8235	1.0150	0.8903	1.1572	1.5026

4.5 Interpretasi

Phyiscal_Health: IRR = 1,0571, artinya setiap kenaikan 1 level kesehatan fisik meningkatkan rata-rata jumlah kunjungan dokter sebesar 5,71%.
Mental_Health: IRR = 0,9775, artinya setiap kenaikan 1 level kesehatan mental menurunkan rata-rata jumlah kunjungan dokter sebesar 2,25%.
Trouble_Sleeping: IRR = 1,0141, artinya responden yang mengalami kesulitan tidur memiliki rata-rata jumlah kunjungan dokter 1,41% lebih tinggi.
Prescription_Sleep_Medication: IRR = 0,9280, artinya responden yang menggunakan obat tidur memiliki rata-rata jumlah kunjungan dokter 7,20% lebih rendah.
EmploymentRetired (vs Employed): IRR = 1,0932, artinya pensiunan memiliki rata-rata jumlah kunjungan dokter 9,32% lebih tinggi dibanding yang bekerja.
EmploymentUnemployed (vs Employed): IRR = 1,1127, artinya pengangguran memiliki rata-rata jumlah kunjungan dokter 11,27% lebih tinggi dibanding yang bekerja.
GenderFemale (vs Male): IRR = 0,9993, artinya perempuan memiliki rata-rata jumlah kunjungan dokter hampir sama dengan laki-laki.
RaceWhite (vs kategori referensi): IRR = 1,1605, artinya responden kulit putih memiliki rata-rata jumlah kunjungan dokter 16,05% lebih tinggi dibanding kategori referensi.
Stress_Keeps_Patient_from_Sleeping: IRR = 1,0521, artinya responden yang tidurnya terganggu karena stres memiliki rata-rata jumlah kunjungan dokter 5,21% lebih tinggi.
Pain_Keeps_Patient_from_Sleeping: IRR = 1,0150, artinya responden yang tidurnya terganggu karena nyeri memiliki rata-rata jumlah kunjungan dokter 1,50% lebih tinggi.

Namun, seluruh variabel memiliki p-value > 0,05 sehingga tidak terdapat bukti yang cukup bahwa variabel-variabel tersebut berpengaruh signifikan terhadap jumlah kunjungan dokter.

# Prediksi rata-rata hitungan
head(predict(fit_pois, type = "response"))

##        1        2        3        4        5        6 
## 2.177051 2.377751 1.842318 1.841114 2.166684 2.138022

4.6 Cek Overdispersi

Regresi Poisson mengasumsikan mean = variance. Periksa dengan statistik dispersi Pearson:

disp_pois <- sum(residuals(fit_pois, type = "pearson")^2) / df.residual(fit_pois)

tibble::tibble(
  `Statistik Dispersi Pearson` = disp_pois,
  `Interpretasi` = dplyr::case_when(
    disp_pois < 1.5 ~ "Tidak ada overdispersion berat — model Poisson sudah tepat",
    disp_pois < 2.5 ~ "Ada indikasi overdispersion sedang — pertimbangkan Quasi-Poisson",
    TRUE            ~ "Overdispersion kuat — pertimbangkan Negative Binomial"
  )
) %>%
  mutate(`Statistik Dispersi Pearson` = round(`Statistik Dispersi Pearson`, 3)) %>%
  kable(caption = "Pemeriksaan Overdispersion Model Poisson") %>%
  kable_styling(bootstrap_options = c("striped","hover"), full_width = FALSE)

Pemeriksaan Overdispersion Model Poisson
Statistik Dispersi Pearson	Interpretasi
0.212	Tidak ada overdispersion berat — model Poisson sudah tepat

4.7 Visualisasi Prediksi Rate Poisson

grid_pois <- expand.grid(
  Phyiscal_Health = seq(
    min(df_npha$Phyiscal_Health),
    max(df_npha$Phyiscal_Health),
    length.out = 120
  ),
  Mental_Health = mean(df_npha$Mental_Health),
  Trouble_Sleeping = mean(df_npha$Trouble_Sleeping),
  Prescription_Sleep_Medication =
    mean(df_npha$Prescription_Sleep_Medication),

  Employment = "Employed",
  Gender = "Female",
  Race = levels(df_npha$Race)[1],

  Stress_Keeps_Patient_from_Sleeping = 0,
  Pain_Keeps_Patient_from_Sleeping = 0
)

grid_pois$Employment <- factor(
  grid_pois$Employment,
  levels = levels(df_npha$Employment)
)

grid_pois$Gender <- factor(
  grid_pois$Gender,
  levels = levels(df_npha$Gender)
)

grid_pois$Race <- factor(
  grid_pois$Race,
  levels = levels(df_npha$Race)
)

pred_pois <- predict(
  fit_pois,
  newdata = grid_pois,
  type = "link",
  se.fit = TRUE
)

grid_pois_plot <- grid_pois %>%
  mutate(
    fit_link = pred_pois$fit,
    se_link = pred_pois$se.fit,

    rate = exp(fit_link),

    rate_low = exp(
      fit_link - 1.96 * se_link
    ),

    rate_high = exp(
      fit_link + 1.96 * se_link
    )
  )

ggplot(
  grid_pois_plot,
  aes(
    x = Phyiscal_Health,
    y = rate
  )
) +
  geom_ribbon(
    aes(
      ymin = rate_low,
      ymax = rate_high
    ),
    fill = "#FF82AB",
    alpha = 0.20
  ) +
  geom_line(
    color = "#8B1C62",
    linewidth = 1.4
  ) +
  labs(
    title = "Prediksi Jumlah Kunjungan Dokter",
    subtitle = "Variabel lain ditahan konstan",
    x = "Kesehatan Fisik",
    y = "Prediksi Jumlah Kunjungan Dokter"
  ) +
  theme_minimal(base_size = 12) +
  theme(
    plot.title = element_text(face = "bold")
  )

Regresi Logistik Multinomial, Ordinal, Biner, dan Regresi Poisson

Zahra Putri Argakoesoemah - 140610240047