library(dplyr)
library(tidyr)
library(ggplot2)
library(knitr)
library(kableExtra)
library(scales)
library(nnet)
library(MASS) Dataset: Online Shoppers Purchasing Intention
Sumber: UCI Machine Learning Repository
Jumlah observasi: 12.330 sesi kunjungan
Dataset ini berisi data perilaku pengunjung situs e-commerce selama satu sesi kunjungan. Ditujukan untuk mengidentifikasi faktor-faktor yang berkaitan dengan jenis pengunjung (New Visitor, Returning Visitor, atau Other) berdasarkan aktivitas mereka di website.
| Variabel | Jenis | Keterangan |
|---|---|---|
VisitorType |
Y (respons) | Jenis pengunjung: New_Visitor, Returning_Visitor, Other — nominal, 3 kategori, tanpa urutan |
BounceRates |
X kontinu | Proporsi sesi yang langsung meninggalkan halaman |
ExitRates |
X kontinu | Proporsi halaman yang menjadi halaman terakhir |
PageValues |
X kontinu | Nilai rata-rata halaman yang dikunjungi sebelum transaksi |
ProductRelated |
X kontinu | Jumlah halaman produk yang dikunjungi |
Administrative |
X kontinu | Jumlah halaman administratif yang dikunjungi |
Weekend |
X biner | 1 = kunjungan saat akhir pekan |
Revenue |
X biner | 1 = sesi menghasilkan transaksi |
df_shop <- read.csv("C:/Users/Zahra Putri/Downloads/ADKREGRESI/online_shoppers_clean.csv", stringsAsFactors = FALSE)
df_shop$VisitorType <- factor(df_shop$VisitorType,
levels = c("New_Visitor", "Returning_Visitor", "Other"))
df_shop$Weekend <- as.integer(df_shop$Weekend)
df_shop$Revenue <- as.integer(df_shop$Revenue)
glimpse(df_shop)## Rows: 12,330
## Columns: 11
## $ VisitorType <fct> Returning_Visitor, Returning_Visitor, Returning_Visitor…
## $ Administrative <int> 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 2, 0, 0, 0…
## $ Informational <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ ProductRelated <int> 1, 2, 1, 2, 10, 19, 1, 0, 2, 3, 3, 16, 7, 6, 2, 23, 1, …
## $ BounceRates <dbl> 0.200000000, 0.000000000, 0.200000000, 0.050000000, 0.0…
## $ ExitRates <dbl> 0.200000000, 0.100000000, 0.200000000, 0.140000000, 0.0…
## $ PageValues <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ SpecialDay <dbl> 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.4, 0.0, 0.8, 0.4, 0.0, …
## $ Month <chr> "Feb", "Feb", "Feb", "Feb", "Feb", "Feb", "Feb", "Feb",…
## $ Weekend <int> 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0…
## $ Revenue <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
df_shop %>%
count(VisitorType) %>%
mutate(proporsi = n / sum(n)) %>%
kable(caption = "Distribusi Jenis Pengunjung", digits = 3) %>%
kable_styling(bootstrap_options = c("striped","hover","condensed"), full_width = FALSE)| VisitorType | n | proporsi |
|---|---|---|
| New_Visitor | 1694 | 0.137 |
| Returning_Visitor | 10551 | 0.856 |
| Other | 85 | 0.007 |
df_shop %>%
count(VisitorType) %>%
mutate(proporsi = n / sum(n)) %>%
ggplot(aes(x = VisitorType, y = proporsi, fill = VisitorType)) +
geom_col(width = 0.6, alpha = 0.92) +
geom_text(aes(label = percent(proporsi, accuracy = 0.1)),
vjust = -0.4, fontface = "bold") +
scale_y_continuous(labels = percent_format(), limits = c(0, 0.95)) +
scale_fill_manual(values = c("#8B475D", "#CD6889", "#FF82AB")) +
labs(title = "Distribusi Jenis Pengunjung",
subtitle = "Respons multinomial: 3 kategori nominal, tidak ada urutan alami.",
x = NULL, y = "Proporsi") +
theme_minimal(base_size = 12) +
theme(legend.position = "none")fit_multi <- nnet::multinom(
VisitorType ~ BounceRates + ExitRates + PageValues +
ProductRelated + Administrative + Weekend + Revenue,
data = df_shop,
trace = FALSE
)
summary(fit_multi)## Call:
## nnet::multinom(formula = VisitorType ~ BounceRates + ExitRates +
## PageValues + ProductRelated + Administrative + Weekend +
## Revenue, data = df_shop, trace = FALSE)
##
## Coefficients:
## (Intercept) BounceRates ExitRates PageValues ProductRelated
## Returning_Visitor -0.1310171 -11.94519 43.80824 -0.003853412 0.0475086463
## Other -4.1512516 -10.99646 43.21727 0.012529515 -0.0002229309
## Administrative Weekend Revenue
## Returning_Visitor -0.05446757 -0.2026824 -0.4818629
## Other -0.01430138 -1.3055439 -0.2841169
##
## Std. Errors:
## (Intercept) BounceRates ExitRates PageValues ProductRelated
## Returning_Visitor 0.07342134 2.547536 2.003455 0.001464643 0.002081928
## Other 0.24746790 1.428562 1.649217 0.002890596 0.010481231
## Administrative Weekend Revenue
## Returning_Visitor 0.009851435 0.06364857 0.08317437
## Other 0.049615811 0.39963797 0.36010289
##
## Residual Deviance: 9014.63
## AIC: 9046.63
multi_sum <- summary(fit_multi)
coef_long <- as.data.frame(multi_sum$coefficients) %>%
tibble::rownames_to_column("kategori") %>%
pivot_longer(-kategori, names_to = "variabel", values_to = "estimate")
se_long <- as.data.frame(multi_sum$standard.errors) %>%
tibble::rownames_to_column("kategori") %>%
pivot_longer(-kategori, names_to = "variabel", values_to = "std_error")
result_multi <- left_join(coef_long, se_long, by = c("kategori","variabel")) %>%
mutate(
z_value = estimate / std_error,
p_value = 2 * (1 - pnorm(abs(z_value))),
RRR = exp(estimate),
CI_low = exp(estimate - 1.96 * std_error),
CI_high = exp(estimate + 1.96 * std_error)
)
result_multi %>%
mutate(across(c(estimate, std_error, z_value, p_value, RRR, CI_low, CI_high),
~ round(.x, 4))) %>%
kable(caption = "Ringkasan Model Regresi Logistik Multinomial",
col.names = c("Kategori","Variabel","Estimate","SE","z","p-value",
"RRR","CI 2.5%","CI 97.5%")) %>%
kable_styling(bootstrap_options = c("striped","hover","condensed"), full_width = TRUE)| Kategori | Variabel | Estimate | SE | z | p-value | RRR | CI 2.5% | CI 97.5% |
|---|---|---|---|---|---|---|---|---|
| Returning_Visitor | (Intercept) | -0.1310 | 0.0734 | -1.7845 | 0.0743 | 8.772000e-01 | 7.596000e-01 | 1.013000e+00 |
| Returning_Visitor | BounceRates | -11.9452 | 2.5475 | -4.6889 | 0.0000 | 0.000000e+00 | 0.000000e+00 | 1.000000e-03 |
| Returning_Visitor | ExitRates | 43.8082 | 2.0035 | 21.8663 | 0.0000 | 1.060909e+19 | 2.090752e+17 | 5.383363e+20 |
| Returning_Visitor | PageValues | -0.0039 | 0.0015 | -2.6310 | 0.0085 | 9.962000e-01 | 9.933000e-01 | 9.990000e-01 |
| Returning_Visitor | ProductRelated | 0.0475 | 0.0021 | 22.8195 | 0.0000 | 1.048700e+00 | 1.044400e+00 | 1.052900e+00 |
| Returning_Visitor | Administrative | -0.0545 | 0.0099 | -5.5289 | 0.0000 | 9.470000e-01 | 9.289000e-01 | 9.655000e-01 |
| Returning_Visitor | Weekend | -0.2027 | 0.0636 | -3.1844 | 0.0015 | 8.165000e-01 | 7.208000e-01 | 9.250000e-01 |
| Returning_Visitor | Revenue | -0.4819 | 0.0832 | -5.7934 | 0.0000 | 6.176000e-01 | 5.247000e-01 | 7.270000e-01 |
| Other | (Intercept) | -4.1513 | 0.2475 | -16.7749 | 0.0000 | 1.570000e-02 | 9.700000e-03 | 2.560000e-02 |
| Other | BounceRates | -10.9965 | 1.4286 | -7.6976 | 0.0000 | 0.000000e+00 | 0.000000e+00 | 3.000000e-04 |
| Other | ExitRates | 43.2173 | 1.6492 | 26.2047 | 0.0000 | 5.875163e+18 | 2.318344e+17 | 1.488888e+20 |
| Other | PageValues | 0.0125 | 0.0029 | 4.3346 | 0.0000 | 1.012600e+00 | 1.006900e+00 | 1.018400e+00 |
| Other | ProductRelated | -0.0002 | 0.0105 | -0.0213 | 0.9830 | 9.998000e-01 | 9.794000e-01 | 1.020500e+00 |
| Other | Administrative | -0.0143 | 0.0496 | -0.2882 | 0.7732 | 9.858000e-01 | 8.944000e-01 | 1.086500e+00 |
| Other | Weekend | -1.3055 | 0.3996 | -3.2668 | 0.0011 | 2.710000e-01 | 1.238000e-01 | 5.932000e-01 |
| Other | Revenue | -0.2841 | 0.3601 | -0.7890 | 0.4301 | 7.527000e-01 | 3.716000e-01 | 1.524500e+00 |
PageValues (Returning_Visitor vs
New_Visitor): RRR = 0,9962, artinya setiap kenaikan 1 unit
PageValues menurunkan peluang menjadi Returning_Visitor sebesar
0,38%.ProductRelated (Returning_Visitor vs
New_Visitor): RRR = 1,0487, artinya setiap kenaikan 1 unit
ProductRelated meningkatkan peluang menjadi Returning_Visitor sebesar
4,87%.Administrative (Returning_Visitor vs
New_Visitor): RRR = 0,9470, artinya setiap kenaikan 1 unit
Administrative menurunkan peluang menjadi Returning_Visitor sebesar
5,30%.Weekend (Returning_Visitor vs
New_Visitor): RRR = 0,8165, artinya pengunjung akhir pekan
memiliki peluang menjadi Returning_Visitor 18,35% lebih rendah.Revenue (Returning_Visitor vs
New_Visitor): RRR = 0,6176, artinya pengunjung yang melakukan
transaksi memiliki peluang menjadi Returning_Visitor 38,24% lebih
rendah.PageValues (Other vs New_Visitor): RRR
= 1,0126, artinya setiap kenaikan 1 unit PageValues meningkatkan peluang
masuk kategori Other sebesar 1,26%.Weekend (Other vs New_Visitor): RRR =
0,2710, artinya pengunjung akhir pekan memiliki peluang masuk kategori
Other 72,90% lebih rendah.pred_prob_multi <- predict(fit_multi, type = "probs")
data_multi_pred <- df_shop %>%
bind_cols(as.data.frame(pred_prob_multi)) %>%
mutate(
prediksi = predict(fit_multi, type = "class")
)
head(pred_prob_multi)## New_Visitor Returning_Visitor Other
## 1 0.001819601 0.9801636 0.018016784
## 2 0.012616092 0.9724296 0.014954351
## 3 0.001819601 0.9801636 0.018016784
## 4 0.004007705 0.9805499 0.015442378
## 5 0.109432969 0.8873217 0.003245345
## 6 0.158924077 0.8350215 0.006054415
data_multi_pred %>%
head() %>%
kable(
caption = "Hasil Prediksi Regresi Logistik Multinomial",
digits = 4
) %>%
kable_styling(
bootstrap_options = c("striped","hover","condensed"),
full_width = FALSE
) %>%
scroll_box(
width = "100%",
height = "300px"
)| VisitorType | Administrative | Informational | ProductRelated | BounceRates | ExitRates | PageValues | SpecialDay | Month | Weekend | Revenue | New_Visitor | Returning_Visitor | Other | prediksi |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| Returning_Visitor | 0 | 0 | 1 | 0.2000 | 0.2000 | 0 | 0 | Feb | 0 | 0 | 0.0018 | 0.9802 | 0.0180 | Returning_Visitor |
| Returning_Visitor | 0 | 0 | 2 | 0.0000 | 0.1000 | 0 | 0 | Feb | 0 | 0 | 0.0126 | 0.9724 | 0.0150 | Returning_Visitor |
| Returning_Visitor | 0 | 0 | 1 | 0.2000 | 0.2000 | 0 | 0 | Feb | 0 | 0 | 0.0018 | 0.9802 | 0.0180 | Returning_Visitor |
| Returning_Visitor | 0 | 0 | 2 | 0.0500 | 0.1400 | 0 | 0 | Feb | 0 | 0 | 0.0040 | 0.9805 | 0.0154 | Returning_Visitor |
| Returning_Visitor | 0 | 0 | 10 | 0.0200 | 0.0500 | 0 | 0 | Feb | 1 | 0 | 0.1094 | 0.8873 | 0.0032 | Returning_Visitor |
| Returning_Visitor | 0 | 0 | 19 | 0.0158 | 0.0246 | 0 | 0 | Feb | 0 | 0 | 0.1589 | 0.8350 | 0.0061 | Returning_Visitor |
fit_multi_null <- nnet::multinom(VisitorType ~ 1, data = df_shop, trace = FALSE)
lrt_stat <- 2 * (as.numeric(logLik(fit_multi)) - as.numeric(logLik(fit_multi_null)))
df_lrt <- length(coef(fit_multi)) - 2 # (K-1) * p_prediktor
p_lrt <- 1 - pchisq(lrt_stat, df_lrt)
tibble::tibble(
`G² (Devians)` = lrt_stat,
`df` = df_lrt,
`p-value` = p_lrt
) %>%
mutate(across(c(`G² (Devians)`, `p-value`), ~ round(.x, 4))) %>%
kable(caption = "Uji Kebaikan Model (Likelihood Ratio Test)") %>%
kable_styling(bootstrap_options = c("striped","hover"), full_width = FALSE)| G² (Devians) | df | p-value |
|---|---|---|
| 1844.471 | 14 | 0 |
## Prediksi
## Aktual New_Visitor Returning_Visitor Other
## New_Visitor 77 1616 1
## Returning_Visitor 59 10492 0
## Other 5 80 0
## [1] 0.8571776
grid_multi <- expand.grid(
PageValues = seq(
min(df_shop$PageValues),
max(df_shop$PageValues),
length.out = 100
),
BounceRates = mean(df_shop$BounceRates),
ExitRates = mean(df_shop$ExitRates),
ProductRelated = mean(df_shop$ProductRelated),
Administrative = mean(df_shop$Administrative),
Weekend = 0,
Revenue = 0
)
grid_prob_multi <- predict(
fit_multi,
newdata = grid_multi,
type = "probs"
)
plot_multi <- grid_multi %>%
bind_cols(as.data.frame(grid_prob_multi)) %>%
pivot_longer(
cols = c(
"New_Visitor",
"Returning_Visitor",
"Other"
),
names_to = "kategori",
values_to = "prob"
)
ggplot(
plot_multi,
aes(PageValues, prob, color = kategori)
) +
geom_line(linewidth = 1.5) +
scale_color_manual(
values = c(
"New_Visitor" = "#8B1C62",
"Returning_Visitor" = "#EE30A7",
"Other" = "#FF82AB"
)
) +
scale_y_continuous(labels = percent_format()) +
labs(
title = "Probabilitas Prediksi Jenis Pengunjung",
x = "Page Values",
y = "Probabilitas",
color = "Kategori"
) +
theme_minimal(base_size = 12) +
theme(
legend.position = "bottom",
plot.title = element_text(face = "bold")
)Dataset: Student Smoking, Alcohol, and Mental
Health
Sumber: Mendeley
Jumlah observasi: 706 mahasiswa
Dataset ini memuat informasi mengenai kebiasaan merokok, konsumsi alkohol, strategi mengatasi stres, dan karakteristik mahasiswa yang dikaitkan dengan tingkat kesehatan psikologis (wellness). Tujuan analisis adalah untuk mengetahui faktor-faktor yang berhubungan dengan peningkatan atau penurunan tingkat kesejahteraan psikologis mahasiswa. Variabel respons diklasifikasikan ke dalam kategori berurutan mulai dari Poor/Below Average, Average/Good, hingga Excellent.
| Variabel | Jenis | Keterangan |
|---|---|---|
wellness_3 |
Y (respons) | Kesehatan psikologis: Poor/Below average < Average/Good < Excellent — ordinal, 3 kategori |
freq_smoking |
X kategorik | Frekuensi merokok (5 level) |
freq_alcohol2 |
X kategorik | Frekuensi konsumsi alkohol (4 level; Monthly & Weekly digabung) |
cope_stress2 |
X kategorik | Cara mengatasi stres (4 level; Meditation, Other, Socializing digabung) |
age_group |
X kategorik | Kelompok usia |
year_study |
X kategorik | Tahun studi |
df_stud <- read.csv("C:/Users/Zahra Putri/Downloads/ADKREGRESI/student_clean.csv", stringsAsFactors = FALSE)
df_stud$wellness_3 <- dplyr::case_when(
df_stud$wellness %in% c("Poor","Below average") ~ "Poor/Below average",
df_stud$wellness %in% c("Average","Good") ~ "Average/Good",
df_stud$wellness == "Excellent" ~ "Excellent"
)
df_stud$wellness_3 <- ordered(df_stud$wellness_3,
levels = c("Poor/Below average","Average/Good","Excellent"))
df_stud$freq_alcohol2 <- ifelse(
df_stud$freq_alcohol %in% c("Monthly","Weekly"), "Monthly/Weekly",
df_stud$freq_alcohol
)
df_stud$freq_alcohol2 <- factor(df_stud$freq_alcohol2,
levels = c("Don't consume alcohol","Only on social occasions",
"Monthly/Weekly","Daily"))
df_stud$cope_stress2 <- ifelse(
df_stud$cope_stress %in% c("Meditation","Other","Socializing"), "Other",
df_stud$cope_stress
)
df_stud$cope_stress2 <- factor(df_stud$cope_stress2,
levels = c("Drinking alcohol","Smoking","Exercise","Other"))
df_stud$freq_smoking <- factor(df_stud$freq_smoking,
levels = c("Don't smoke","Only on social occasions","Monthly","Weekly","Daily"))
df_stud$age_group <- factor(df_stud$age_group,
levels = c("18-20","21-23","24-26","27 or older"))
df_stud$year_study <- factor(df_stud$year_study,
levels = c("1st year","2nd year","3rd year","4th year","Graduate student"))
glimpse(df_stud)## Rows: 706
## Columns: 15
## $ wellness <chr> "Good", "Good", "Good", "Excellent", "Excellent", "…
## $ freq_smoking <fct> NA, Daily, Daily, Daily, Weekly, Daily, Daily, Mont…
## $ freq_alcohol <chr> "Don’t consume alcohol", "Don’t consume alcohol", "…
## $ help_seeking <chr> "No", "Yes", "Yes", "Yes", "Yes", "Yes", "Yes", "Ye…
## $ cope_stress <chr> "Other", "Smoking", "Drinking alcohol", "Exercise",…
## $ age_group <fct> 24-26, 24-26, 27 or older, 21-23, 18-20, 18-20, 18-…
## $ year_study <fct> Graduate student, Graduate student, 4th year, Gradu…
## $ wellness_num <int> 4, 4, 4, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, …
## $ wellness_3 <ord> Average/Good, Average/Good, Average/Good, Excellent…
## $ cope_stress2 <fct> Other, Smoking, Drinking alcohol, Exercise, Drinkin…
## $ freq_alcohol2 <fct> NA, NA, Only on social occasions, Daily, Only on so…
## $ cope_stress_clean <chr> "Other", "Substance_use", "Substance_use", "Exercis…
## $ freq_alcohol_clean <chr> "Don’t consume alcohol", "Don’t consume alcohol", "…
## $ smoker <chr> "Non_smoker", "Smoker", "Smoker", "Smoker", "Smoker…
## $ drinker <chr> "Non_drinker", "Non_drinker", "Drinker", "Drinker",…
df_stud %>%
count(wellness_3) %>%
mutate(proporsi = n / sum(n)) %>%
kable(caption = "Distribusi Tingkat Kesehatan Psikologis (wellness_3)", digits = 3) %>%
kable_styling(bootstrap_options = c("striped","hover","condensed"), full_width = FALSE)| wellness_3 | n | proporsi |
|---|---|---|
| Poor/Below average | 589 | 0.834 |
| Average/Good | 35 | 0.050 |
| Excellent | 82 | 0.116 |
df_stud %>%
count(wellness_3) %>%
mutate(proporsi = n / sum(n)) %>%
ggplot(aes(x = wellness_3, y = proporsi, fill = wellness_3)) +
geom_col(width = 0.65, alpha = 0.92) +
geom_text(aes(label = percent(proporsi, accuracy = 0.1)),
vjust = -0.4, fontface = "bold") +
scale_y_continuous(labels = percent_format(), limits = c(0, 0.90)) +
scale_fill_manual(values = c("#D02090","#EE3A8C","#FF3E96")) +
labs(title = "Distribusi Tingkat Kesehatan Psikologis",
subtitle = "Respons ordinal: Poor/Below average → Average/Good → Excellent.",
x = NULL, y = "Proporsi") +
theme_minimal(base_size = 12) +
theme(legend.position = "none")fit_ord <- MASS::polr(
wellness_3 ~ freq_smoking + freq_alcohol2 + cope_stress2 +
age_group + year_study,
data = df_stud,
method = "logistic",
Hess = TRUE
)
summary(fit_ord)## Call:
## MASS::polr(formula = wellness_3 ~ freq_smoking + freq_alcohol2 +
## cope_stress2 + age_group + year_study, data = df_stud, Hess = TRUE,
## method = "logistic")
##
## Coefficients:
## Value Std. Error t value
## freq_smokingOnly on social occasions -0.32403 1.5248 -0.21251
## freq_smokingMonthly -2.78986 2.0421 -1.36616
## freq_smokingWeekly -0.10085 1.0531 -0.09576
## freq_alcohol2Only on social occasions 0.42065 1.1204 0.37546
## freq_alcohol2Monthly/Weekly 0.13183 1.3559 0.09723
## cope_stress2Smoking 1.82697 0.9056 2.01749
## cope_stress2Exercise 11.47952 2.3913 4.80048
## cope_stress2Other 8.29811 1.9297 4.30020
## age_group21-23 -3.48180 1.5144 -2.29907
## age_group24-26 -4.51354 1.6306 -2.76802
## age_group27 or older -2.23651 2.2024 -1.01549
## year_study2nd year 1.80758 1.7868 1.01161
## year_study3rd year 3.06678 1.7976 1.70600
## year_study4th year 1.98616 1.7116 1.16043
## year_studyGraduate student 0.02569 2.2474 0.01143
##
## Intercepts:
## Value Std. Error t value
## Poor/Below average|Average/Good 3.8616 1.2105 3.1901
## Average/Good|Excellent 6.5672 1.5059 4.3610
##
## Residual Deviance: 76.03743
## AIC: 110.0374
## (96 observations deleted due to missingness)
ord_coef <- coef(summary(fit_ord))
result_ord <- as.data.frame(ord_coef) %>%
tibble::rownames_to_column("parameter") %>%
dplyr::rename(estimate = Value, std_error = `Std. Error`, t_value = `t value`) %>%
mutate(
p_value = 2 * (1 - pnorm(abs(t_value))),
jenis = ifelse(grepl("\\|", parameter), "Cutpoint", "Koefisien"),
OR_naik = ifelse(jenis == "Koefisien", exp(-estimate), NA_real_),
CI_low_naik = ifelse(jenis == "Koefisien", exp(-estimate - 1.96 * std_error), NA_real_),
CI_high_naik = ifelse(jenis == "Koefisien", exp(-estimate + 1.96 * std_error), NA_real_)
)
result_ord %>%
mutate(across(c(estimate, std_error, t_value, p_value, OR_naik, CI_low_naik, CI_high_naik),
~ round(.x, 4))) %>%
kable(caption = "Ringkasan Model Regresi Logistik Ordinal (Proportional Odds)",
col.names = c("Parameter","Estimate (polr)","SE","t-value","p-value",
"Jenis","OR (naik)","CI 2.5%","CI 97.5%")) %>%
kable_styling(bootstrap_options = c("striped","hover","condensed"), full_width = TRUE) %>%
row_spec(which(result_ord$jenis == "Cutpoint"), background = "#f0f0f0")| Parameter | Estimate (polr) | SE | t-value | p-value | Jenis | OR (naik) | CI 2.5% | CI 97.5% |
|---|---|---|---|---|---|---|---|---|
| freq_smokingOnly on social occasions | -0.3240 | 1.5248 | -0.2125 | 0.8317 | Koefisien | 1.3827 | 0.0696 | 27.4591 |
| freq_smokingMonthly | -2.7899 | 2.0421 | -1.3662 | 0.1719 | Koefisien | 16.2787 | 0.2974 | 891.0659 |
| freq_smokingWeekly | -0.1008 | 1.0531 | -0.0958 | 0.9237 | Koefisien | 1.1061 | 0.1404 | 8.7143 |
| freq_alcohol2Only on social occasions | 0.4207 | 1.1204 | 0.3755 | 0.7073 | Koefisien | 0.6566 | 0.0731 | 5.9017 |
| freq_alcohol2Monthly/Weekly | 0.1318 | 1.3559 | 0.0972 | 0.9225 | Koefisien | 0.8765 | 0.0615 | 12.4997 |
| cope_stress2Smoking | 1.8270 | 0.9056 | 2.0175 | 0.0436 | Koefisien | 0.1609 | 0.0273 | 0.9493 |
| cope_stress2Exercise | 11.4795 | 2.3913 | 4.8005 | 0.0000 | Koefisien | 0.0000 | 0.0000 | 0.0011 |
| cope_stress2Other | 8.2981 | 1.9297 | 4.3002 | 0.0000 | Koefisien | 0.0002 | 0.0000 | 0.0109 |
| age_group21-23 | -3.4818 | 1.5144 | -2.2991 | 0.0215 | Koefisien | 32.5183 | 1.6711 | 632.7666 |
| age_group24-26 | -4.5135 | 1.6306 | -2.7680 | 0.0056 | Koefisien | 91.2446 | 3.7343 | 2229.4850 |
| age_group27 or older | -2.2365 | 2.2024 | -1.0155 | 0.3099 | Koefisien | 9.3606 | 0.1249 | 701.4864 |
| year_study2nd year | 1.8076 | 1.7868 | 1.0116 | 0.3117 | Koefisien | 0.1641 | 0.0049 | 5.4445 |
| year_study3rd year | 3.0668 | 1.7976 | 1.7060 | 0.0880 | Koefisien | 0.0466 | 0.0014 | 1.5787 |
| year_study4th year | 1.9862 | 1.7116 | 1.1604 | 0.2459 | Koefisien | 0.1372 | 0.0048 | 3.9295 |
| year_studyGraduate student | 0.0257 | 2.2474 | 0.0114 | 0.9909 | Koefisien | 0.9746 | 0.0119 | 79.7796 |
| Poor/Below average|Average/Good | 3.8616 | 1.2105 | 3.1901 | 0.0014 | Cutpoint | NA | NA | NA |
| Average/Good|Excellent | 6.5672 | 1.5059 | 4.3610 | 0.0000 | Cutpoint | NA | NA | NA |
cope_stress = Smoking: OR = 0,1609,
artinya mahasiswa yang mengatasi stres dengan merokok memiliki odds
berada pada kategori wellness yang lebih tinggi 83,91%
lebih rendah dibanding kategori referensi.cope_stress = Exercise: OR ≈ 0,0000,
artinya mahasiswa yang mengatasi stres dengan olahraga memiliki odds
berada pada kategori wellness yang lebih tinggi jauh lebih rendah
dibanding kategori referensi.cope_stress = Other: OR = 0,0002,
artinya mahasiswa yang menggunakan cara lain untuk mengatasi stres
memiliki odds berada pada kategori wellness yang lebih tinggi
99,98% lebih rendah dibanding kategori referensi.age_group 21–23: OR = 32,5183, artinya
mahasiswa usia 21–23 tahun memiliki odds berada pada kategori wellness
yang lebih tinggi 32,52 kali lebih besar dibanding
kelompok usia referensi.age_group 24–26: OR = 91,2446, artinya
mahasiswa usia 24–26 tahun memiliki odds berada pada kategori wellness
yang lebih tinggi 91,24 kali lebih besar dibanding
kelompok usia referensi.## Poor/Below average Average/Good Excellent
## 3 0.9756678240 0.022668194 0.001663982
## 4 0.0153407378 0.173703845 0.810955418
## 5 0.6165745499 0.343522633 0.039902817
## 6 0.0004913311 0.006810168 0.992698501
## 7 0.0004913311 0.006810168 0.992698501
## 8 0.0079386458 0.098991531 0.893069823
pred_class_ord <- predict(
fit_ord,
type = "class"
)
data_ord_pred <- model.frame(fit_ord) %>%
bind_cols(
as.data.frame(pred_prob_ord)
) %>%
mutate(
prediksi = pred_class_ord
)
data_ord_pred %>%
head() %>%
kable(
caption = "Hasil Prediksi Regresi Logistik Ordinal",
digits = 4
) %>%
kable_styling(
bootstrap_options = c(
"striped",
"hover",
"condensed"
),
full_width = FALSE
) %>%
scroll_box(
width = "100%",
height = "300px"
)| wellness_3 | freq_smoking | freq_alcohol2 | cope_stress2 | age_group | year_study | Poor/Below average | Average/Good | Excellent | prediksi | |
|---|---|---|---|---|---|---|---|---|---|---|
| 3 | Average/Good | Daily | Only on social occasions | Drinking alcohol | 27 or older | 4th year | 0.9757 | 0.0227 | 0.0017 | Poor/Below average |
| 4 | Excellent | Daily | Daily | Exercise | 21-23 | Graduate student | 0.0153 | 0.1737 | 0.8110 | Excellent |
| 5 | Excellent | Weekly | Only on social occasions | Drinking alcohol | 18-20 | 3rd year | 0.6166 | 0.3435 | 0.0399 | Poor/Below average |
| 6 | Excellent | Daily | Daily | Exercise | 18-20 | 1st year | 0.0005 | 0.0068 | 0.9927 | Excellent |
| 7 | Excellent | Daily | Daily | Exercise | 18-20 | 1st year | 0.0005 | 0.0068 | 0.9927 | Excellent |
| 8 | Excellent | Monthly | Daily | Exercise | 18-20 | 1st year | 0.0079 | 0.0990 | 0.8931 | Excellent |
fit_ord_null <- MASS::polr(wellness_3 ~ 1, data = df_stud,
method = "logistic", Hess = TRUE)
lrt_ord <- 2 * (as.numeric(logLik(fit_ord)) - as.numeric(logLik(fit_ord_null)))
df_ord <- length(coef(fit_ord))
p_ord <- 1 - pchisq(lrt_ord, df_ord)
tibble::tibble(
`G² (Devians)` = lrt_ord,
`df` = df_ord,
`p-value` = p_ord
) %>%
mutate(across(c(`G² (Devians)`,`p-value`), ~ round(.x, 4))) %>%
kable(caption = "Uji Kebaikan Model Ordinal (LRT vs Model Null)") %>%
kable_styling(bootstrap_options = c("striped","hover"), full_width = FALSE)| G² (Devians) | df | p-value |
|---|---|---|
| 700.7769 | 15 | 0 |
## Prediksi
## Aktual Poor/Below average Average/Good Excellent
## Poor/Below average 576 1 0
## Average/Good 6 2 2
## Excellent 2 0 21
## [1] 0.9819672
Dataset: Bank Marketing
Sumber: UCI Machine Learning Repository
Jumlah observasi: 4.521 nasabah
Dataset ini berasal dari kampanye pemasaran langsung suatu bank Portugis yang menawarkan produk deposito berjangka kepada nasabah. Tujuan analisis adalah untuk mengetahui faktor-faktor yang memengaruhi keputusan nasabah untuk berlangganan deposito (yes) atau tidak (no).
| Variabel | Peran | Keterangan |
|---|---|---|
y |
Y (respons) | Nasabah berlangganan deposito berjangka: yes/no — biner |
age |
X kontinu | Usia nasabah (tahun) |
balance |
X kontinu | Saldo rekening rata-rata (euro) |
duration |
X kontinu | Durasi kontak terakhir (detik) |
campaign |
X kontinu | Jumlah kontak selama kampanye ini |
job |
X kategorik | Pekerjaan |
marital |
X kategorik | Status pernikahan |
education |
X kategorik | Tingkat pendidikan |
housing |
X biner | Memiliki kredit perumahan (yes/no) |
loan |
X biner | Memiliki pinjaman pribadi (yes/no) |
poutcome |
X kategorik | Hasil kampanye pemasaran sebelumnya |
df_bank <- read.csv("C:/Users/Zahra Putri/Downloads/ADKREGRESI/bank_clean.csv", stringsAsFactors = FALSE, sep = ",")
# Faktor
df_bank$y <- factor(df_bank$y, levels = c("no","yes"))
df_bank$job <- factor(df_bank$job)
df_bank$marital <- factor(df_bank$marital)
df_bank$education <- factor(df_bank$education,
levels = c("primary","secondary","tertiary","unknown"))
df_bank$housing <- factor(df_bank$housing)
df_bank$loan <- factor(df_bank$loan)
df_bank$poutcome <- factor(df_bank$poutcome,
levels = c("unknown","failure","other","success"))
glimpse(df_bank)## Rows: 4,521
## Columns: 13
## $ y <fct> no, no, no, no, no, no, no, no, no, no, no, no, no, yes, no,…
## $ age <int> 30, 33, 35, 30, 59, 35, 36, 39, 41, 43, 39, 43, 36, 20, 31, …
## $ job <fct> unemployed, services, management, management, blue-collar, m…
## $ marital <fct> married, married, single, married, married, single, married,…
## $ education <fct> primary, secondary, tertiary, tertiary, secondary, tertiary,…
## $ default <chr> "no", "no", "no", "no", "no", "no", "no", "no", "no", "no", …
## $ balance <int> 1787, 4789, 1350, 1476, 0, 747, 307, 147, 221, -88, 9374, 26…
## $ housing <fct> no, yes, yes, yes, yes, no, yes, yes, yes, yes, yes, yes, no…
## $ loan <fct> no, yes, no, yes, no, no, no, no, no, yes, no, no, no, no, y…
## $ duration <int> 79, 220, 185, 199, 226, 141, 341, 151, 57, 313, 273, 113, 32…
## $ campaign <int> 1, 1, 1, 4, 1, 2, 1, 2, 2, 1, 1, 2, 2, 1, 1, 2, 5, 1, 1, 1, …
## $ poutcome <fct> unknown, failure, failure, unknown, unknown, failure, other,…
## $ y_bin <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, …
df_bank %>%
count(y) %>%
mutate(proporsi = n / sum(n)) %>%
kable(caption = "Distribusi Variabel Respons (Langganan Deposito)", digits = 3) %>%
kable_styling(bootstrap_options = c("striped","hover"), full_width = FALSE)| y | n | proporsi |
|---|---|---|
| no | 4000 | 0.885 |
| yes | 521 | 0.115 |
df_bank %>%
count(y) %>%
mutate(proporsi = n / sum(n)) %>%
ggplot(aes(x = y, y = proporsi, fill = y)) +
geom_col(width = 0.5, alpha = 0.92) +
geom_text(aes(label = percent(proporsi, accuracy = 0.1)),
vjust = -0.4, fontface = "bold") +
scale_y_continuous(labels = percent_format(), limits = c(0, 1)) +
scale_fill_manual(values = c("#8B1C62","#EE30A7")) +
labs(title = "Distribusi Keputusan Nasabah",
subtitle = "Respons biner: yes = berlangganan deposito, no = tidak.",
x = "Berlangganan deposito?", y = "Proporsi") +
theme_minimal(base_size = 12) +
theme(legend.position = "none")fit_bin <- glm(
y ~ age + balance + duration + campaign +
job + marital + education + housing + loan + poutcome,
data = df_bank,
family = binomial(link = "logit")
)
summary(fit_bin)##
## Call:
## glm(formula = y ~ age + balance + duration + campaign + job +
## marital + education + housing + loan + poutcome, family = binomial(link = "logit"),
## data = df_bank)
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -2.899e+00 4.417e-01 -6.564 5.24e-11 ***
## age 8.041e-04 6.783e-03 0.119 0.905632
## balance 3.508e-06 1.755e-05 0.200 0.841598
## duration 3.957e-03 1.917e-04 20.641 < 2e-16 ***
## campaign -6.445e-02 2.610e-02 -2.469 0.013533 *
## jobblue-collar -5.127e-01 2.338e-01 -2.193 0.028338 *
## jobentrepreneur -4.108e-01 3.636e-01 -1.130 0.258539
## jobhousemaid -3.948e-01 3.985e-01 -0.991 0.321826
## jobmanagement -8.183e-02 2.338e-01 -0.350 0.726371
## jobretired 6.055e-01 2.967e-01 2.040 0.041310 *
## jobself-employed -3.238e-01 3.389e-01 -0.955 0.339335
## jobservices -2.693e-01 2.626e-01 -1.025 0.305159
## jobstudent 5.523e-01 3.581e-01 1.542 0.123048
## jobtechnician -2.479e-01 2.217e-01 -1.118 0.263590
## jobunemployed -6.685e-01 4.085e-01 -1.636 0.101780
## jobunknown 4.758e-01 5.365e-01 0.887 0.375149
## maritalmarried -4.268e-01 1.665e-01 -2.563 0.010387 *
## maritalsingle -1.696e-01 1.950e-01 -0.870 0.384402
## educationsecondary 1.042e-01 1.940e-01 0.537 0.591156
## educationtertiary 4.099e-01 2.238e-01 1.831 0.067050 .
## educationunknown -4.056e-01 3.356e-01 -1.209 0.226806
## housingyes -6.343e-01 1.206e-01 -5.262 1.42e-07 ***
## loanyes -7.221e-01 1.917e-01 -3.767 0.000165 ***
## poutcomefailure 4.995e-01 1.696e-01 2.945 0.003235 **
## poutcomeother 9.757e-01 2.172e-01 4.492 7.05e-06 ***
## poutcomesuccess 2.870e+00 2.154e-01 13.323 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 3231.0 on 4520 degrees of freedom
## Residual deviance: 2323.3 on 4495 degrees of freedom
## AIC: 2375.3
##
## Number of Fisher Scoring iterations: 6
bin_coef <- as.data.frame(coef(summary(fit_bin))) %>%
tibble::rownames_to_column("parameter") %>%
dplyr::rename(
estimate = Estimate,
std_error = `Std. Error`,
z_value = `z value`,
p_value = `Pr(>|z|)`
) %>%
mutate(
OR = exp(estimate),
CI_low = exp(estimate - 1.96 * std_error),
CI_high = exp(estimate + 1.96 * std_error)
)
bin_coef %>%
mutate(across(c(estimate, std_error, z_value, p_value, OR, CI_low, CI_high),
~ round(.x, 4))) %>%
kable(caption = "Ringkasan Model Regresi Logistik Biner",
col.names = c("Parameter","Estimate","SE","z-value","p-value",
"OR","CI 2.5%","CI 97.5%")) %>%
kable_styling(bootstrap_options = c("striped","hover","condensed"), full_width = TRUE)| Parameter | Estimate | SE | z-value | p-value | OR | CI 2.5% | CI 97.5% |
|---|---|---|---|---|---|---|---|
| (Intercept) | -2.8995 | 0.4417 | -6.5640 | 0.0000 | 0.0551 | 0.0232 | 0.1309 |
| age | 0.0008 | 0.0068 | 0.1185 | 0.9056 | 1.0008 | 0.9876 | 1.0142 |
| balance | 0.0000 | 0.0000 | 0.1998 | 0.8416 | 1.0000 | 1.0000 | 1.0000 |
| duration | 0.0040 | 0.0002 | 20.6409 | 0.0000 | 1.0040 | 1.0036 | 1.0043 |
| campaign | -0.0644 | 0.0261 | -2.4694 | 0.0135 | 0.9376 | 0.8908 | 0.9868 |
| jobblue-collar | -0.5127 | 0.2338 | -2.1926 | 0.0283 | 0.5989 | 0.3787 | 0.9471 |
| jobentrepreneur | -0.4108 | 0.3636 | -1.1299 | 0.2585 | 0.6631 | 0.3252 | 1.3523 |
| jobhousemaid | -0.3948 | 0.3985 | -0.9907 | 0.3218 | 0.6738 | 0.3085 | 1.4715 |
| jobmanagement | -0.0818 | 0.2338 | -0.3500 | 0.7264 | 0.9214 | 0.5827 | 1.4571 |
| jobretired | 0.6055 | 0.2967 | 2.0404 | 0.0413 | 1.8321 | 1.0241 | 3.2775 |
| jobself-employed | -0.3238 | 0.3389 | -0.9555 | 0.3393 | 0.7234 | 0.3723 | 1.4056 |
| jobservices | -0.2693 | 0.2626 | -1.0254 | 0.3052 | 0.7639 | 0.4566 | 1.2782 |
| jobstudent | 0.5523 | 0.3581 | 1.5421 | 0.1230 | 1.7372 | 0.8610 | 3.5050 |
| jobtechnician | -0.2479 | 0.2217 | -1.1179 | 0.2636 | 0.7804 | 0.5053 | 1.2053 |
| jobunemployed | -0.6685 | 0.4085 | -1.6363 | 0.1018 | 0.5125 | 0.2301 | 1.1414 |
| jobunknown | 0.4758 | 0.5365 | 0.8869 | 0.3751 | 1.6094 | 0.5623 | 4.6063 |
| maritalmarried | -0.4268 | 0.1665 | -2.5627 | 0.0104 | 0.6526 | 0.4709 | 0.9045 |
| maritalsingle | -0.1696 | 0.1950 | -0.8698 | 0.3844 | 0.8440 | 0.5760 | 1.2368 |
| educationsecondary | 0.1042 | 0.1940 | 0.5372 | 0.5912 | 1.1099 | 0.7587 | 1.6235 |
| educationtertiary | 0.4099 | 0.2238 | 1.8313 | 0.0670 | 1.5067 | 0.9716 | 2.3364 |
| educationunknown | -0.4056 | 0.3356 | -1.2086 | 0.2268 | 0.6666 | 0.3453 | 1.2868 |
| housingyes | -0.6343 | 0.1206 | -5.2620 | 0.0000 | 0.5303 | 0.4187 | 0.6716 |
| loanyes | -0.7221 | 0.1917 | -3.7668 | 0.0002 | 0.4857 | 0.3336 | 0.7072 |
| poutcomefailure | 0.4995 | 0.1696 | 2.9445 | 0.0032 | 1.6480 | 1.1818 | 2.2980 |
| poutcomeother | 0.9757 | 0.2172 | 4.4922 | 0.0000 | 2.6530 | 1.7332 | 4.0608 |
| poutcomesuccess | 2.8699 | 0.2154 | 13.3226 | 0.0000 | 17.6355 | 11.5617 | 26.9002 |
duration: OR = 1,0040, artinya setiap
tambahan 1 detik durasi kontak meningkatkan odds nasabah berlangganan
deposito sebesar 0,4%.campaign: OR = 0,9376, artinya setiap
tambahan 1 kali kontak selama kampanye menurunkan odds nasabah
berlangganan deposito sebesar 6,24%.jobblue-collar (vs pekerjaan referensi): OR =
0,5989, artinya nasabah dengan pekerjaan blue-collar memiliki
odds berlangganan deposito 40,11% lebih rendah
dibanding kategori pekerjaan referensi.jobretired (vs pekerjaan referensi): OR =
1,8321, artinya nasabah pensiunan memiliki odds berlangganan
deposito 1,83 kali lebih besar dibanding kategori
pekerjaan referensi.maritalmarried (vs status referensi): OR =
0,6526, artinya nasabah yang menikah memiliki odds berlangganan
deposito 34,74% lebih rendah dibanding status
referensi.housingyes: OR = 0,5303, artinya
nasabah yang memiliki kredit perumahan memiliki odds berlangganan
deposito 46,97% lebih rendah dibanding yang tidak
memiliki kredit perumahan.loanyes: OR = 0,4857, artinya nasabah
yang memiliki pinjaman pribadi memiliki odds berlangganan deposito
51,43% lebih rendah dibanding yang tidak memiliki
pinjaman.poutcomefailure (vs kategori referensi): OR =
1,6480, artinya nasabah dengan hasil kampanye sebelumnya gagal
memiliki odds berlangganan deposito 1,65 kali lebih
besar dibanding kategori referensi.poutcomeother (vs kategori referensi): OR =
2,6530, artinya nasabah dengan hasil kampanye sebelumnya
kategori other memiliki odds berlangganan deposito 2,65
kali lebih besar dibanding kategori referensi.poutcomesuccess (vs kategori referensi): OR =
17,6355, artinya nasabah dengan hasil kampanye sebelumnya
berhasil memiliki odds berlangganan deposito 17,64 kali
lebih besar dibanding kategori referensi.pred_prob_bin <- predict(
fit_bin,
type = "response"
)
data_bin_pred <- df_bank %>%
mutate(
Prob_Yes = pred_prob_bin,
Prob_No = 1 - pred_prob_bin,
prediksi = ifelse(
pred_prob_bin >= 0.5,
"yes",
"no"
)
)
head(data_bin_pred)## y age job marital education default balance housing loan duration
## 1 no 30 unemployed married primary no 1787 no no 79
## 2 no 33 services married secondary no 4789 yes yes 220
## 3 no 35 management single tertiary no 1350 yes no 185
## 4 no 30 management married tertiary no 1476 yes yes 199
## 5 no 59 blue-collar married secondary no 0 yes no 226
## 6 no 35 management single tertiary no 747 no no 141
## campaign poutcome y_bin Prob_Yes Prob_No prediksi
## 1 1 unknown 0 0.02374948 0.9762505 no
## 2 1 failure 0 0.02934545 0.9706545 no
## 3 1 failure 0 0.10198746 0.8980125 no
## 4 4 unknown 0 0.02197348 0.9780265 no
## 5 1 unknown 0 0.02954577 0.9704542 no
## 6 2 failure 0 0.14409773 0.8559023 no
data_bin_pred %>%
head() %>%
kable(
caption = "Hasil Prediksi Regresi Logistik Biner",
digits = 4
) %>%
kable_styling(
bootstrap_options = c(
"striped",
"hover",
"condensed"
),
full_width = FALSE
) %>%
scroll_box(
width = "100%",
height = "300px"
)| y | age | job | marital | education | default | balance | housing | loan | duration | campaign | poutcome | y_bin | Prob_Yes | Prob_No | prediksi |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| no | 30 | unemployed | married | primary | no | 1787 | no | no | 79 | 1 | unknown | 0 | 0.0237 | 0.9763 | no |
| no | 33 | services | married | secondary | no | 4789 | yes | yes | 220 | 1 | failure | 0 | 0.0293 | 0.9707 | no |
| no | 35 | management | single | tertiary | no | 1350 | yes | no | 185 | 1 | failure | 0 | 0.1020 | 0.8980 | no |
| no | 30 | management | married | tertiary | no | 1476 | yes | yes | 199 | 4 | unknown | 0 | 0.0220 | 0.9780 | no |
| no | 59 | blue-collar | married | secondary | no | 0 | yes | no | 226 | 1 | unknown | 0 | 0.0295 | 0.9705 | no |
| no | 35 | management | single | tertiary | no | 747 | no | no | 141 | 2 | failure | 0 | 0.1441 | 0.8559 | no |
## Prediksi
## Aktual no yes
## no 3917 83
## yes 362 159
accuracy_bin <- sum(diag(conf_bin)) / sum(conf_bin)
cat("Akurasi klasifikasi:", round(accuracy_bin, 4))## Akurasi klasifikasi: 0.9016
grid_bin <- expand.grid(
duration = seq(
min(df_bank$duration),
max(df_bank$duration),
length.out = 100
),
age = mean(df_bank$age),
balance = mean(df_bank$balance),
campaign = mean(df_bank$campaign),
job = levels(df_bank$job)[1],
marital = levels(df_bank$marital)[1],
education = levels(df_bank$education)[1],
housing = "no",
loan = "no",
poutcome = "unknown"
)
grid_bin$prob <- predict(
fit_bin,
newdata = grid_bin,
type = "response"
)
ggplot(
grid_bin,
aes(duration, prob)
) +
geom_line(
linewidth = 1.5,
color = "#EE30A7"
) +
scale_y_continuous(labels = percent_format()) +
labs(
title = "Probabilitas Berlangganan Deposito",
x = "Duration",
y = "Probabilitas"
) +
theme_minimal(base_size = 12) +
theme(
plot.title = element_text(face = "bold")
)Dataset: NPHA (National Poll on Healthy Aging) —
Doctor Visits
Sumber: UCI Machine Learning Repository
Jumlah observasi: 696 responden
Dataset ini berisi informasi kesehatan dan kondisi tidur responden lanjut usia yang dikumpulkan dalam survei National Poll on Healthy Aging (NPHA). Tujuan analisisnya adalah untuk mengidentifikasi faktor-faktor yang berhubungan dengan jumlah dokter yang dikunjungi oleh responden.
| Variabel | Peran | Keterangan |
|---|---|---|
Number_of_Doctors_Visited |
Y (respons) | Jumlah dokter yang dikunjungi: 1, 2, atau 3 — hitungan (count) |
Phyiscal_Health |
X ordinal | Status kesehatan fisik (1=buruk … 5=sangat baik) |
Mental_Health |
X ordinal | Status kesehatan mental (1–5) |
Trouble_Sleeping |
X ordinal | Gangguan tidur (1–3) |
Prescription_Sleep_Medication |
X ordinal | Penggunaan obat tidur resep (1–3) |
Employment |
X kategorik | Status pekerjaan |
Gender |
X kategorik | Jenis kelamin |
Race |
X kategorik | Ras |
Stress_Keeps_Patient_from_Sleeping |
X biner | Stres mengganggu tidur (0/1) |
Pain_Keeps_Patient_from_Sleeping |
X biner | Nyeri mengganggu tidur (0/1) |
df_npha <- read.csv("C:/Users/Zahra Putri/Downloads/ADKREGRESI/npha_clean.csv", stringsAsFactors = FALSE)
df_npha$Gender <- factor(df_npha$Gender, levels = c("Male","Female"))
df_npha$Race <- factor(df_npha$Race)
df_npha$Employment <- factor(df_npha$Employment,
levels = c("Employed","Self_employed","Retired","Unemployed"))
glimpse(df_npha)## Rows: 696
## Columns: 14
## $ Number_of_Doctors_Visited <int> 3, 2, 3, 1, 3, 2, 3, 2, 2, …
## $ Phyiscal_Health <dbl> 4, 4, 3, 3, 3, 3, 4, 3, 3, …
## $ Mental_Health <dbl> 3, 2, 2, 2, 3, 2, 1, 2, 1, …
## $ Dental_Health <dbl> 3, 3, 3, 3, 3, 4, 1, 6, 2, …
## $ Employment <fct> Retired, Retired, Retired, …
## $ Stress_Keeps_Patient_from_Sleeping <int> 0, 1, 0, 0, 1, 0, 0, 1, 0, …
## $ Medication_Keeps_Patient_from_Sleeping <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ Pain_Keeps_Patient_from_Sleeping <int> 0, 0, 0, 0, 0, 0, 1, 0, 1, …
## $ Bathroom_Needs_Keeps_Patient_from_Sleeping <int> 0, 1, 0, 1, 0, 1, 1, 0, 1, …
## $ Uknown_Keeps_Patient_from_Sleeping <int> 1, 0, 1, 0, 0, 0, 0, 0, 0, …
## $ Trouble_Sleeping <dbl> 2, 3, 3, 3, 2, 3, 2, 3, 3, …
## $ Prescription_Sleep_Medication <dbl> 3, 3, 3, 3, 3, 3, 1, 3, 3, …
## $ Race <fct> White, White, Asian, Asian,…
## $ Gender <fct> Female, Male, Male, Female,…
df_npha %>%
count(Number_of_Doctors_Visited) %>%
mutate(proporsi = n / sum(n)) %>%
kable(caption = "Distribusi Jumlah Dokter yang Dikunjungi", digits = 3) %>%
kable_styling(bootstrap_options = c("striped","hover"), full_width = FALSE)| Number_of_Doctors_Visited | n | proporsi |
|---|---|---|
| 1 | 126 | 0.181 |
| 2 | 363 | 0.522 |
| 3 | 207 | 0.297 |
df_npha %>%
count(Number_of_Doctors_Visited) %>%
ggplot(aes(x = Number_of_Doctors_Visited, y = n)) +
geom_col(width = 0.6, fill = "#B03060", alpha = 0.92) +
geom_text(aes(label = n), vjust = -0.4, fontface = "bold") +
scale_x_continuous(breaks = 1:3) +
labs(title = "Distribusi Jumlah Dokter yang Dikunjungi",
subtitle = "Respons Poisson: data cacah (count), bilangan bulat nonnegatif.",
x = "Jumlah Dokter", y = "Frekuensi") +
theme_minimal(base_size = 12)fit_pois <- glm(
Number_of_Doctors_Visited ~
Phyiscal_Health + Mental_Health + Trouble_Sleeping +
Prescription_Sleep_Medication + Employment + Gender + Race +
Stress_Keeps_Patient_from_Sleeping + Pain_Keeps_Patient_from_Sleeping,
data = df_npha,
family = poisson(link = "log")
)
summary(fit_pois)##
## Call:
## glm(formula = Number_of_Doctors_Visited ~ Phyiscal_Health + Mental_Health +
## Trouble_Sleeping + Prescription_Sleep_Medication + Employment +
## Gender + Race + Stress_Keeps_Patient_from_Sleeping + Pain_Keeps_Patient_from_Sleeping,
## family = poisson(link = "log"), data = df_npha)
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) 0.5830124 0.2669421 2.184 0.029 *
## Phyiscal_Health 0.0555237 0.0344872 1.610 0.107
## Mental_Health -0.0227791 0.0334276 -0.681 0.496
## Trouble_Sleeping 0.0140007 0.0448053 0.312 0.755
## Prescription_Sleep_Medication -0.0747015 0.0536400 -1.393 0.164
## EmploymentSelf_employed 0.0422049 0.1430003 0.295 0.768
## EmploymentRetired 0.0891016 0.1099064 0.811 0.418
## EmploymentUnemployed 0.1067899 0.2027444 0.527 0.598
## GenderFemale -0.0006536 0.0535056 -0.012 0.990
## RaceBlack 0.1075818 0.1531745 0.702 0.482
## RaceHispanic 0.1032178 0.2002224 0.516 0.606
## RaceOther 0.2093353 0.1892143 1.106 0.269
## RaceWhite 0.1488566 0.1200194 1.240 0.215
## Stress_Keeps_Patient_from_Sleeping 0.0507501 0.0622897 0.815 0.415
## Pain_Keeps_Patient_from_Sleeping 0.0149139 0.0668779 0.223 0.824
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for poisson family taken to be 1)
##
## Null deviance: 162.29 on 695 degrees of freedom
## Residual deviance: 152.51 on 681 degrees of freedom
## AIC: 2002.6
##
## Number of Fisher Scoring iterations: 4
pois_coef <- as.data.frame(coef(summary(fit_pois))) %>%
tibble::rownames_to_column("parameter") %>%
dplyr::rename(
estimate = Estimate,
std_error = `Std. Error`,
z_value = `z value`,
p_value = `Pr(>|z|)`
) %>%
mutate(
IRR = exp(estimate),
CI_low = exp(estimate - 1.96 * std_error),
CI_high = exp(estimate + 1.96 * std_error),
perubahan_persen = 100 * (IRR - 1)
)
pois_coef %>%
mutate(across(c(estimate, std_error, z_value, p_value,
IRR, CI_low, CI_high, perubahan_persen), ~ round(.x, 4))) %>%
kable(caption = "Ringkasan Model Regresi Poisson",
col.names = c("Parameter","Estimate","SE","z-value","p-value",
"IRR","CI 2.5%","CI 97.5%","Δ%")) %>%
kable_styling(bootstrap_options = c("striped","hover","condensed"), full_width = TRUE)| Parameter | Estimate | SE | z-value | p-value | IRR | CI 2.5% | CI 97.5% | Δ% |
|---|---|---|---|---|---|---|---|---|
| (Intercept) | 0.5830 | 0.2669 | 2.1840 | 0.0290 | 1.7914 | 1.0616 | 3.0229 | 79.1427 |
| Phyiscal_Health | 0.0555 | 0.0345 | 1.6100 | 0.1074 | 1.0571 | 0.9880 | 1.1310 | 5.7094 |
| Mental_Health | -0.0228 | 0.0334 | -0.6814 | 0.4956 | 0.9775 | 0.9155 | 1.0437 | -2.2522 |
| Trouble_Sleeping | 0.0140 | 0.0448 | 0.3125 | 0.7547 | 1.0141 | 0.9288 | 1.1072 | 1.4099 |
| Prescription_Sleep_Medication | -0.0747 | 0.0536 | -1.3926 | 0.1637 | 0.9280 | 0.8354 | 1.0309 | -7.1980 |
| EmploymentSelf_employed | 0.0422 | 0.1430 | 0.2951 | 0.7679 | 1.0431 | 0.7881 | 1.3806 | 4.3108 |
| EmploymentRetired | 0.0891 | 0.1099 | 0.8107 | 0.4175 | 1.0932 | 0.8813 | 1.3560 | 9.3192 |
| EmploymentUnemployed | 0.1068 | 0.2027 | 0.5267 | 0.5984 | 1.1127 | 0.7478 | 1.6556 | 11.2700 |
| GenderFemale | -0.0007 | 0.0535 | -0.0122 | 0.9903 | 0.9993 | 0.8999 | 1.1098 | -0.0653 |
| RaceBlack | 0.1076 | 0.1532 | 0.7023 | 0.4825 | 1.1136 | 0.8248 | 1.5035 | 11.3582 |
| RaceHispanic | 0.1032 | 0.2002 | 0.5155 | 0.6062 | 1.1087 | 0.7488 | 1.6416 | 10.8733 |
| RaceOther | 0.2093 | 0.1892 | 1.1063 | 0.2686 | 1.2329 | 0.8508 | 1.7864 | 23.2858 |
| RaceWhite | 0.1489 | 0.1200 | 1.2403 | 0.2149 | 1.1605 | 0.9172 | 1.4683 | 16.0507 |
| Stress_Keeps_Patient_from_Sleeping | 0.0508 | 0.0623 | 0.8147 | 0.4152 | 1.0521 | 0.9311 | 1.1887 | 5.2060 |
| Pain_Keeps_Patient_from_Sleeping | 0.0149 | 0.0669 | 0.2230 | 0.8235 | 1.0150 | 0.8903 | 1.1572 | 1.5026 |
Phyiscal_Health: IRR = 1,0571, artinya
setiap kenaikan 1 level kesehatan fisik meningkatkan rata-rata jumlah
kunjungan dokter sebesar 5,71%.Mental_Health: IRR = 0,9775, artinya
setiap kenaikan 1 level kesehatan mental menurunkan rata-rata jumlah
kunjungan dokter sebesar 2,25%.Trouble_Sleeping: IRR = 1,0141,
artinya responden yang mengalami kesulitan tidur memiliki rata-rata
jumlah kunjungan dokter 1,41% lebih tinggi.Prescription_Sleep_Medication: IRR =
0,9280, artinya responden yang menggunakan obat tidur memiliki
rata-rata jumlah kunjungan dokter 7,20% lebih
rendah.EmploymentRetired (vs Employed): IRR =
1,0932, artinya pensiunan memiliki rata-rata jumlah kunjungan
dokter 9,32% lebih tinggi dibanding yang bekerja.EmploymentUnemployed (vs Employed): IRR =
1,1127, artinya pengangguran memiliki rata-rata jumlah
kunjungan dokter 11,27% lebih tinggi dibanding yang
bekerja.GenderFemale (vs Male): IRR = 0,9993,
artinya perempuan memiliki rata-rata jumlah kunjungan dokter
hampir sama dengan laki-laki.RaceWhite (vs kategori referensi): IRR =
1,1605, artinya responden kulit putih memiliki rata-rata jumlah
kunjungan dokter 16,05% lebih tinggi dibanding kategori
referensi.Stress_Keeps_Patient_from_Sleeping: IRR =
1,0521, artinya responden yang tidurnya terganggu karena stres
memiliki rata-rata jumlah kunjungan dokter 5,21% lebih
tinggi.Pain_Keeps_Patient_from_Sleeping: IRR =
1,0150, artinya responden yang tidurnya terganggu karena nyeri
memiliki rata-rata jumlah kunjungan dokter 1,50% lebih
tinggi.Namun, seluruh variabel memiliki p-value > 0,05 sehingga tidak terdapat bukti yang cukup bahwa variabel-variabel tersebut berpengaruh signifikan terhadap jumlah kunjungan dokter.
## 1 2 3 4 5 6
## 2.177051 2.377751 1.842318 1.841114 2.166684 2.138022
Regresi Poisson mengasumsikan mean = variance. Periksa dengan statistik dispersi Pearson:
disp_pois <- sum(residuals(fit_pois, type = "pearson")^2) / df.residual(fit_pois)
tibble::tibble(
`Statistik Dispersi Pearson` = disp_pois,
`Interpretasi` = dplyr::case_when(
disp_pois < 1.5 ~ "Tidak ada overdispersion berat — model Poisson sudah tepat",
disp_pois < 2.5 ~ "Ada indikasi overdispersion sedang — pertimbangkan Quasi-Poisson",
TRUE ~ "Overdispersion kuat — pertimbangkan Negative Binomial"
)
) %>%
mutate(`Statistik Dispersi Pearson` = round(`Statistik Dispersi Pearson`, 3)) %>%
kable(caption = "Pemeriksaan Overdispersion Model Poisson") %>%
kable_styling(bootstrap_options = c("striped","hover"), full_width = FALSE)| Statistik Dispersi Pearson | Interpretasi |
|---|---|
| 0.212 | Tidak ada overdispersion berat — model Poisson sudah tepat |
grid_pois <- expand.grid(
Phyiscal_Health = seq(
min(df_npha$Phyiscal_Health),
max(df_npha$Phyiscal_Health),
length.out = 120
),
Mental_Health = mean(df_npha$Mental_Health),
Trouble_Sleeping = mean(df_npha$Trouble_Sleeping),
Prescription_Sleep_Medication =
mean(df_npha$Prescription_Sleep_Medication),
Employment = "Employed",
Gender = "Female",
Race = levels(df_npha$Race)[1],
Stress_Keeps_Patient_from_Sleeping = 0,
Pain_Keeps_Patient_from_Sleeping = 0
)
grid_pois$Employment <- factor(
grid_pois$Employment,
levels = levels(df_npha$Employment)
)
grid_pois$Gender <- factor(
grid_pois$Gender,
levels = levels(df_npha$Gender)
)
grid_pois$Race <- factor(
grid_pois$Race,
levels = levels(df_npha$Race)
)
pred_pois <- predict(
fit_pois,
newdata = grid_pois,
type = "link",
se.fit = TRUE
)
grid_pois_plot <- grid_pois %>%
mutate(
fit_link = pred_pois$fit,
se_link = pred_pois$se.fit,
rate = exp(fit_link),
rate_low = exp(
fit_link - 1.96 * se_link
),
rate_high = exp(
fit_link + 1.96 * se_link
)
)
ggplot(
grid_pois_plot,
aes(
x = Phyiscal_Health,
y = rate
)
) +
geom_ribbon(
aes(
ymin = rate_low,
ymax = rate_high
),
fill = "#FF82AB",
alpha = 0.20
) +
geom_line(
color = "#8B1C62",
linewidth = 1.4
) +
labs(
title = "Prediksi Jumlah Kunjungan Dokter",
subtitle = "Variabel lain ditahan konstan",
x = "Kesehatan Fisik",
y = "Prediksi Jumlah Kunjungan Dokter"
) +
theme_minimal(base_size = 12) +
theme(
plot.title = element_text(face = "bold")
)