Dataset Adult Income (Census
Income) — UCI ML Repository
Link
archive.ics.uci.edu/dataset/2/adult
Sumber US Census Bureau 1994 (Kohavi,
1996)
Tujuan Memprediksi apakah
pendapatan seseorang >50K atau ≤50K
per tahun berdasarkan karakteristik demografis.
| Variabel | Tipe | Keterangan |
|---|---|---|
income_bin |
Respons | 0 = ≤50K, 1 = >50K |
age |
Prediktor numerik | Usia (tahun) |
education_num |
Prediktor numerik | Level pendidikan (1–16) |
hours_per_week |
Prediktor numerik | Jam kerja per minggu |
capital_gain |
Prediktor numerik | Keuntungan modal (USD) |
sex |
Prediktor kategorik | Jenis kelamin (Female / Male) |
race |
Prediktor kategorik | Ras/etnis (5 kategori) |
col_names <- c("age","workclass","fnlwgt","education","education_num",
"marital_status","occupation","relationship","race","sex",
"capital_gain","capital_loss","hours_per_week","native_country","income")
adult_raw <- tryCatch(
read_csv("https://archive.ics.uci.edu/ml/machine-learning-databases/adult/adult.data",
col_names = col_names, na = c("?"," ?",""), trim_ws = TRUE,
show_col_types = FALSE),
error = function(e) NULL
)
if (is.null(adult_raw) || nrow(adult_raw) < 100) {
# Fallback data simulasi
set.seed(42)
n <- 3000
adult_raw <- data.frame(
age = sample(18:75, n, replace = TRUE),
education_num = sample(1:16, n, replace = TRUE),
hours_per_week= sample(20:60, n, replace = TRUE),
capital_gain = sample(c(rep(0,8), 5000, 15000, 99999), n, replace = TRUE),
sex = sample(c(" Male"," Female"), n, replace = TRUE, prob = c(0.67, 0.33)),
race = sample(c(" White"," Black"," Asian-Pac-Islander",
" Amer-Indian-Eskimo"," Other"), n, replace = TRUE,
prob = c(0.86, 0.095, 0.03, 0.01, 0.005)),
income = sample(c("<=50K",">50K"), n, replace = TRUE, prob = c(0.76, 0.24))
)
cat("Menggunakan data simulasi (n =", n, ")\n")
}
adult <- adult_raw %>%
drop_na(age, education_num, hours_per_week, sex, race, capital_gain, income) %>%
mutate(
income_bin = factor(if_else(str_detect(income, ">50K"), 1L, 0L),
levels = c(0,1), labels = c("<=50K",">50K")),
sex = factor(str_trim(sex)),
race = factor(str_trim(race)),
education_grp = factor(case_when(
education_num <= 8 ~ "SD/SMP",
education_num <= 12 ~ "SMA",
education_num == 13 ~ "Diploma/S1",
TRUE ~ "S2/S3"
), levels = c("SD/SMP","SMA","Diploma/S1","S2/S3"))
)
cat("Dimensi data:", nrow(adult), "baris x", ncol(adult), "kolom\n")## Dimensi data: 32561 baris x 17 kolom
| Kategori | Frekuensi | Persentase |
|---|---|---|
| <=50K | 24720 | 75.9% |
| >50K | 7841 | 24.1% |
| age | education_grp | hours_per_week | sex | capital_gain | income_bin |
|---|---|---|---|---|---|
| 39 | Diploma/S1 | 40 | Male | 2174 | <=50K |
| 50 | Diploma/S1 | 13 | Male | 0 | <=50K |
| 38 | SMA | 40 | Male | 0 | <=50K |
| 53 | SD/SMP | 40 | Male | 0 | <=50K |
| 28 | Diploma/S1 | 40 | Female | 0 | <=50K |
p1 <- ggplot(adult, aes(x = income_bin, fill = income_bin)) +
geom_bar(width = 0.5, show.legend = FALSE) +
geom_text(stat = "count", aes(label = after_stat(count)), vjust = -0.4, fontface = "bold") +
scale_fill_manual(values = c("#3498db","#e74c3c")) +
labs(title = "Distribusi Variabel Respons", subtitle = "Income <=50K vs >50K",
x = "Kategori Income", y = "Frekuensi") + theme_tugas()
p2 <- ggplot(adult, aes(x = income_bin, y = age, fill = income_bin)) +
geom_boxplot(alpha = 0.7, show.legend = FALSE) +
scale_fill_manual(values = c("#3498db","#e74c3c")) +
labs(title = "Distribusi Usia per Kategori Income", x = "Income", y = "Usia") +
theme_tugas()
gridExtra::grid.arrange(p1, p2, ncol = 2)p3 <- adult %>%
count(education_grp, income_bin) %>%
group_by(education_grp) %>%
mutate(prop = n / sum(n)) %>%
filter(income_bin == ">50K") %>%
ggplot(aes(x = fct_reorder(education_grp, prop), y = prop, fill = education_grp)) +
geom_col(show.legend = FALSE) +
geom_text(aes(label = percent(prop, 1)), hjust = -0.1) +
coord_flip() +
scale_y_continuous(labels = percent, limits = c(0, 0.75)) +
scale_fill_brewer(palette = "Blues") +
labs(title = "Proporsi Income >50K per Pendidikan", x = NULL, y = "Proporsi") +
theme_tugas()
p4 <- adult %>%
count(sex, income_bin) %>%
group_by(sex) %>%
mutate(prop = n / sum(n)) %>%
ggplot(aes(x = sex, y = prop, fill = income_bin)) +
geom_col(position = "fill") +
scale_y_continuous(labels = percent) +
scale_fill_manual(values = c("#3498db","#e74c3c")) +
labs(title = "Proporsi Income per Jenis Kelamin",
x = NULL, y = "Proporsi", fill = "Income") + theme_tugas()
gridExtra::grid.arrange(p3, p4, ncol = 2)ggplot(adult, aes(x = hours_per_week, fill = income_bin)) +
geom_density(alpha = 0.6) +
scale_fill_manual(values = c("#3498db","#e74c3c")) +
labs(title = "Distribusi Jam Kerja per Minggu",
x = "Jam Kerja/Minggu", y = "Densitas", fill = "Income") + theme_tugas()set.seed(42)
idx <- sample(nrow(adult), 0.8 * nrow(adult))
train <- adult[idx, ]
test <- adult[-idx, ]
model_biner <- glm(
income_bin ~ age + education_num + hours_per_week + sex + race + capital_gain,
data = train, family = binomial(link = "logit")
)
summary(model_biner)##
## Call:
## glm(formula = income_bin ~ age + education_num + hours_per_week +
## sex + race + capital_gain, family = binomial(link = "logit"),
## data = train)
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -9.692e+00 2.722e-01 -35.607 < 2e-16 ***
## age 4.182e-02 1.382e-03 30.260 < 2e-16 ***
## education_num 3.336e-01 7.735e-03 43.123 < 2e-16 ***
## hours_per_week 3.473e-02 1.494e-03 23.248 < 2e-16 ***
## sexMale 1.142e+00 4.483e-02 25.484 < 2e-16 ***
## raceAsian-Pac-Islander 6.228e-01 2.583e-01 2.411 0.015899 *
## raceBlack 4.093e-01 2.487e-01 1.646 0.099798 .
## raceOther 1.695e-01 3.590e-01 0.472 0.636916
## raceWhite 8.347e-01 2.393e-01 3.488 0.000486 ***
## capital_gain 3.157e-04 1.092e-05 28.923 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 28763 on 26047 degrees of freedom
## Residual deviance: 20639 on 26038 degrees of freedom
## AIC: 20659
##
## Number of Fisher Scoring iterations: 7
coef_df <- tidy(model_biner, exponentiate = TRUE, conf.int = TRUE)
coef_df %>%
rename(Variabel = term, OR = estimate,
`CI 95% Bawah` = conf.low, `CI 95% Atas` = conf.high,
SE = std.error, `z-value` = statistic, `p-value` = p.value) %>%
mutate(Signifikan = if_else(`p-value` < 0.05, "✓", "")) %>%
kable(digits = 4, caption = "Odds Ratio dan Uji Signifikansi — Model Regresi Logistik Biner") %>%
kable_styling(bootstrap_options = c("striped","hover","condensed"), full_width = FALSE) %>%
row_spec(which(tidy(model_biner)$p.value < 0.05), background = "#eafaf1")| Variabel | OR | SE | z-value | p-value | CI 95% Bawah | CI 95% Atas | Signifikan |
|---|---|---|---|---|---|---|---|
| (Intercept) | 0.0001 | 0.2722 | -35.6070 | 0.0000 | 0.0000 | 0.0001 | ✓ |
| age | 1.0427 | 0.0014 | 30.2596 | 0.0000 | 1.0399 | 1.0455 | ✓ |
| education_num | 1.3960 | 0.0077 | 43.1227 | 0.0000 | 1.3750 | 1.4174 | ✓ |
| hours_per_week | 1.0353 | 0.0015 | 23.2484 | 0.0000 | 1.0323 | 1.0384 | ✓ |
| sexMale | 3.1345 | 0.0448 | 25.4841 | 0.0000 | 2.8723 | 3.4242 | ✓ |
| raceAsian-Pac-Islander | 1.8641 | 0.2583 | 2.4112 | 0.0159 | 1.1424 | 3.1535 | ✓ |
| raceBlack | 1.5058 | 0.2487 | 1.6458 | 0.0998 | 0.9418 | 2.5039 | |
| raceOther | 1.1847 | 0.3590 | 0.4720 | 0.6369 | 0.5809 | 2.3878 | |
| raceWhite | 2.3042 | 0.2393 | 3.4883 | 0.0005 | 1.4705 | 3.7680 | ✓ |
| capital_gain | 1.0003 | 0.0000 | 28.9226 | 0.0000 | 1.0003 | 1.0003 | ✓ |
## INTERPRETASI ODDS RATIO (OR):
## OR > 1: prediktor meningkatkan peluang income >50K
## OR < 1: prediktor menurunkan peluang income >50K
for (i in 2:nrow(coef_df)) {
nm <- coef_df$term[i]
or <- coef_df$estimate[i]
pv <- coef_df$p.value[i]
sig <- if_else(pv < 0.05, "(signifikan)", "(tidak signifikan)")
if (or >= 1) {
cat(sprintf("- %-30s OR = %.3f -> meningkatkan odds >50K sebesar %.1f%% %s\n",
nm, or, (or-1)*100, sig))
} else {
cat(sprintf("- %-30s OR = %.3f -> menurunkan odds >50K sebesar %.1f%% %s\n",
nm, or, (1-or)*100, sig))
}
}## - age OR = 1.043 -> meningkatkan odds >50K sebesar 4.3% (signifikan)
## - education_num OR = 1.396 -> meningkatkan odds >50K sebesar 39.6% (signifikan)
## - hours_per_week OR = 1.035 -> meningkatkan odds >50K sebesar 3.5% (signifikan)
## - sexMale OR = 3.135 -> meningkatkan odds >50K sebesar 213.5% (signifikan)
## - raceAsian-Pac-Islander OR = 1.864 -> meningkatkan odds >50K sebesar 86.4% (signifikan)
## - raceBlack OR = 1.506 -> meningkatkan odds >50K sebesar 50.6% (tidak signifikan)
## - raceOther OR = 1.185 -> meningkatkan odds >50K sebesar 18.5% (tidak signifikan)
## - raceWhite OR = 2.304 -> meningkatkan odds >50K sebesar 130.4% (signifikan)
## - capital_gain OR = 1.000 -> meningkatkan odds >50K sebesar 0.0% (signifikan)
Asumsi yang diperiksa: (1) tidak ada multikolinearitas antar prediktor (VIF), (2) tidak ada observasi berpengaruh ekstrem.
# 1. Variance Inflation Factor (VIF)
# car::vif() mengembalikan matriks (GVIF) jika ada prediktor kategorik
# Ambil kolom pertama (GVIF) atau nilai tunggal (VIF numerik)
vif_raw <- car::vif(model_biner)
if (is.matrix(vif_raw)) {
# Prediktor kategorik -> gunakan GVIF^(1/(2*Df)) sebagai VIF adjusted
vif_val <- vif_raw[, "GVIF^(1/(2*Df))"]^2 # setara VIF
vif_name <- rownames(vif_raw)
vif_note <- "GVIF adjusted (setara VIF)"
} else {
vif_val <- vif_raw
vif_name <- names(vif_raw)
vif_note <- "VIF standar"
}
data.frame(
Variabel = vif_name,
VIF = round(vif_val, 3),
Status = if_else(vif_val < 5, "OK (< 5)", "Perlu perhatian (>= 5)")
) %>%
kable(caption = paste("Variance Inflation Factor —", vif_note)) %>%
kable_styling(bootstrap_options = c("striped","hover","condensed"), full_width = FALSE) %>%
row_spec(which(vif_val >= 5), background = "#fdecea")| Variabel | VIF | Status | |
|---|---|---|---|
| age | age | 1.027 | OK (< 5) |
| education_num | education_num | 1.029 | OK (< 5) |
| hours_per_week | hours_per_week | 1.036 | OK (< 5) |
| sex | sex | 1.032 | OK (< 5) |
| race | race | 1.005 | OK (< 5) |
| capital_gain | capital_gain | 1.007 | OK (< 5) |
##
## Kesimpulan: VIF < 5 = tidak ada multikolinearitas yang mengkhawatirkan.
# 2. Cook's Distance — deteksi observasi berpengaruh
cooksd <- cooks.distance(model_biner)
plot_df <- data.frame(idx = seq_along(cooksd), cooksd = cooksd)
ggplot(plot_df, aes(x = idx, y = cooksd)) +
geom_point(alpha = 0.3, color = "#3498db", size = 0.8) +
geom_hline(yintercept = 4/nrow(train), linetype = "dashed", color = "#e74c3c") +
annotate("text", x = nrow(train)*0.8, y = 4/nrow(train)*1.2,
label = "Batas 4/n", color = "#e74c3c", size = 3.5) +
labs(title = "Cook's Distance — Deteksi Observasi Berpengaruh",
subtitle = "Titik di atas garis merah perlu diperiksa lebih lanjut",
x = "Indeks Observasi", y = "Cook's Distance") +
theme_tugas()prob_test <- predict(model_biner, newdata = test, type = "response")
pred_class <- factor(if_else(prob_test > 0.5, ">50K", "<=50K"),
levels = c("<=50K",">50K"))
cm <- confusionMatrix(pred_class, test$income_bin, positive = ">50K")
print(cm)## Confusion Matrix and Statistics
##
## Reference
## Prediction <=50K >50K
## <=50K 4697 932
## >50K 250 634
##
## Accuracy : 0.8185
## 95% CI : (0.8089, 0.8278)
## No Information Rate : 0.7596
## P-Value [Acc > NIR] : < 2.2e-16
##
## Kappa : 0.4163
##
## Mcnemar's Test P-Value : < 2.2e-16
##
## Sensitivity : 0.40485
## Specificity : 0.94946
## Pos Pred Value : 0.71719
## Neg Pred Value : 0.83443
## Prevalence : 0.24044
## Detection Rate : 0.09734
## Detection Prevalence : 0.13573
## Balanced Accuracy : 0.67716
##
## 'Positive' Class : >50K
##
roc_obj <- roc(as.numeric(test$income_bin == ">50K"), prob_test, quiet = TRUE)
auc_val <- auc(roc_obj)
roc_df <- data.frame(fpr = 1 - roc_obj$specificities, tpr = roc_obj$sensitivities)
ggplot(roc_df, aes(x = fpr, y = tpr)) +
geom_line(color = "#e74c3c", linewidth = 1.2) +
geom_abline(linetype = "dashed", color = "grey60") +
annotate("text", x = 0.65, y = 0.25,
label = sprintf("AUC = %.3f", auc_val),
size = 5, color = "#e74c3c", fontface = "bold") +
labs(title = "Kurva ROC — Regresi Logistik Biner",
subtitle = "AUC mendekati 1 menunjukkan model yang baik",
x = "False Positive Rate (1 - Spesifisitas)",
y = "True Positive Rate (Sensitivitas)") +
theme_tugas()coef_df %>%
filter(term != "(Intercept)") %>%
ggplot(aes(x = fct_reorder(term, estimate), y = estimate, color = estimate > 1)) +
geom_point(size = 3) +
geom_errorbar(aes(ymin = conf.low, ymax = conf.high), width = 0.25) +
geom_hline(yintercept = 1, linetype = "dashed", color = "grey50") +
coord_flip() +
scale_color_manual(values = c("#3498db","#e74c3c"),
labels = c("Menurunkan odds","Meningkatkan odds")) +
labs(title = "Odds Ratio dengan 95% Confidence Interval",
subtitle = "OR > 1: meningkatkan peluang income >50K",
x = NULL, y = "Odds Ratio (OR)", color = NULL) +
theme_tugas()Dataset Dry Bean
Dataset — UCI ML Repository
Link
archive.ics.uci.edu/dataset/602/dry+bean+dataset
Referensi Koklu & Ozkan (2020),
Computers and Electronics in Agriculture
Tujuan Mengklasifikasikan 7 jenis
kacang kering berdasarkan fitur morfologi biji.
| Variabel | Tipe | Keterangan |
|---|---|---|
Class |
Respons | 7 jenis: SEKER, BARBUNYA, BOMBAY, CALI, DERMASON, HOROZ, SIRA |
Area |
Prediktor | Luas permukaan biji (piksel²) |
Perimeter |
Prediktor | Keliling biji (piksel) |
MajorAxisLength |
Prediktor | Panjang sumbu mayor |
MinorAxisLength |
Prediktor | Panjang sumbu minor |
Eccentricity |
Prediktor | Eksentrisitas elips biji (0–1) |
Compactness |
Prediktor | Indeks kekompakan biji |
if (!"readxl" %in% installed.packages()[,"Package"])
install.packages("readxl", quiet = TRUE)
library(readxl)
tmp_zip <- tempfile(fileext = ".zip")
tmp_dir <- tempdir()
bean_raw <- tryCatch({
download.file("https://archive.ics.uci.edu/static/public/602/dry+bean+dataset.zip",
tmp_zip, mode = "wb", quiet = TRUE)
unzip(tmp_zip, exdir = tmp_dir)
xlsx_f <- list.files(tmp_dir, pattern = "\\.xlsx$", full.names = TRUE, recursive = TRUE)[1]
d <- read_excel(xlsx_f)
cat("Data UCI berhasil dimuat:", nrow(d), "baris\n"); d
}, error = function(e) {
cat("Fallback ke data simulasi\n")
set.seed(99); n <- 800
cls <- c("SEKER","BARBUNYA","BOMBAY","CALI","DERMASON","HOROZ","SIRA")
data.frame(
Area = c(rnorm(n/7,55000,8000), rnorm(n/7,90000,12000), rnorm(n/7,150000,20000),
rnorm(n/7,80000,10000), rnorm(n/7,35000,5000), rnorm(n/7,65000,9000), rnorm(n/7,60000,9000)),
Perimeter = c(rnorm(n/7,900,80), rnorm(n/7,1100,100), rnorm(n/7,1600,150),
rnorm(n/7,1050,90), rnorm(n/7,720,60), rnorm(n/7,1000,85), rnorm(n/7,950,80)),
MajorAxisLength = c(rnorm(n/7,250,25), rnorm(n/7,320,30), rnorm(n/7,480,40),
rnorm(n/7,300,28), rnorm(n/7,200,20), rnorm(n/7,280,26), rnorm(n/7,265,24)),
MinorAxisLength = c(rnorm(n/7,185,18), rnorm(n/7,230,22), rnorm(n/7,320,30),
rnorm(n/7,220,20), rnorm(n/7,145,14), rnorm(n/7,195,18), rnorm(n/7,188,17)),
Eccentricity = c(rnorm(n/7,0.65,.06), rnorm(n/7,0.70,.06), rnorm(n/7,0.72,.06),
rnorm(n/7,0.71,.06), rnorm(n/7,0.67,.06), rnorm(n/7,0.73,.06), rnorm(n/7,0.68,.06)),
Compactness = c(rnorm(n/7,0.84,.04), rnorm(n/7,0.82,.04), rnorm(n/7,0.78,.04),
rnorm(n/7,0.83,.04), rnorm(n/7,0.86,.04), rnorm(n/7,0.81,.04), rnorm(n/7,0.85,.04)),
Class = rep(cls, each = n/7)
)
})## Data UCI berhasil dimuat: 13611 baris
bean <- bean_raw %>%
clean_names() %>%
rename_with(~"Class", matches("^class$", ignore.case = TRUE)) %>%
mutate(Class = factor(str_trim(Class))) %>%
drop_na(Class)
cat("Dimensi:", nrow(bean), "x", ncol(bean), "| Kelas:", levels(bean$Class), "\n")## Dimensi: 13611 x 17 | Kelas: BARBUNYA BOMBAY CALI DERMASON HOROZ SEKER SIRA
| Jenis Kacang | Frekuensi | Persentase |
|---|---|---|
| BARBUNYA | 1322 | 9.7% |
| BOMBAY | 522 | 3.8% |
| CALI | 1630 | 12% |
| DERMASON | 3546 | 26.1% |
| HOROZ | 1928 | 14.2% |
| SEKER | 2027 | 14.9% |
| SIRA | 2636 | 19.4% |
ggplot(bean, aes(x = fct_infreq(Class), fill = Class)) +
geom_bar(show.legend = FALSE) +
geom_text(stat = "count", aes(label = after_stat(count)), vjust = -0.4) +
scale_fill_brewer(palette = "Set2") +
labs(title = "Distribusi Jenis Kacang (Variabel Respons)",
x = "Jenis Kacang", y = "Frekuensi") + theme_tugas() +
theme(axis.text.x = element_text(angle = 15, hjust = 1))fitur_ada <- c("area","perimeter","major_axis_length","minor_axis_length",
"eccentricity","compactness")
fitur_ada <- fitur_ada[fitur_ada %in% names(bean)]
if (length(fitur_ada) >= 2) {
dplyr::select(bean, Class, dplyr::all_of(fitur_ada[1:min(4, length(fitur_ada))])) %>%
pivot_longer(-Class, names_to = "Fitur", values_to = "Nilai") %>%
ggplot(aes(x = Class, y = Nilai, fill = Class)) +
geom_boxplot(alpha = 0.7, show.legend = FALSE, outlier.size = 0.4) +
facet_wrap(~Fitur, scales = "free_y") +
scale_fill_brewer(palette = "Set2") +
labs(title = "Distribusi Fitur Morfologi per Jenis Kacang",
x = NULL, y = "Nilai") + theme_tugas() +
theme(axis.text.x = element_text(angle = 30, hjust = 1))
}if (length(fitur_ada) >= 2) {
f1 <- fitur_ada[1]; f2 <- fitur_ada[2]
bean %>% slice_sample(n = min(1000, nrow(bean))) %>%
ggplot(aes(x = .data[[f1]], y = .data[[f2]], color = Class)) +
geom_point(alpha = 0.5, size = 1.5) +
scale_color_brewer(palette = "Set2") +
labs(title = paste("Scatter Plot:", f1, "vs", f2),
subtitle = "Warna menunjukkan jenis kacang",
x = f1, y = f2, color = "Jenis Kacang") + theme_tugas()
}set.seed(42)
fitur_num <- bean %>% dplyr::select(-Class) %>% dplyr::select(where(is.numeric)) %>% names()
fitur_model <- fitur_num[1:min(6, length(fitur_num))]
bean_sc <- bean %>% mutate(across(all_of(fitur_model), scale))
idx_m <- sample(nrow(bean_sc), 0.8 * nrow(bean_sc))
train_m <- bean_sc[idx_m, ]; test_m <- bean_sc[-idx_m, ]
formula_m <- as.formula(paste("Class ~", paste(fitur_model, collapse = " + ")))
cat("Formula model:"); print(formula_m)## Formula model:
## Class ~ area + perimeter + major_axis_length + minor_axis_length +
## aspect_ration + eccentricity
model_multi <- multinom(formula_m, data = train_m, trace = FALSE, MaxNWts = 5000)
summary(model_multi)## Call:
## multinom(formula = formula_m, data = train_m, trace = FALSE,
## MaxNWts = 5000)
##
## Coefficients:
## (Intercept) area perimeter major_axis_length minor_axis_length
## BOMBAY -29.5635613 10.418188 -45.195631 34.07248 16.51987
## CALI -5.3264600 -40.895159 -26.229782 60.10282 11.56385
## DERMASON -3.2104361 -4.426138 -30.520272 67.13621 -43.34648
## HOROZ -4.5094629 -71.145177 -8.089909 89.98468 -22.43155
## SEKER 0.1297737 10.885249 -18.518308 55.60573 -41.89059
## SIRA -1.8088902 -60.119211 -18.742607 92.54253 -24.66747
## aspect_ration eccentricity
## BOMBAY 0.9716656 -8.885827
## CALI -9.1025759 -0.991670
## DERMASON -28.4946295 -2.424344
## HOROZ -18.5703544 -10.469927
## SEKER -29.4243980 -6.488644
## SIRA -30.3753532 -1.578155
##
## Std. Errors:
## (Intercept) area perimeter major_axis_length minor_axis_length
## BOMBAY 45.3340959 79.728093 99.704691 28.192418 72.623840
## CALI 0.7540501 4.293620 1.437490 6.007646 4.326970
## DERMASON 1.4489908 13.395488 1.825688 9.346993 6.108543
## HOROZ 0.7449877 7.201142 1.423362 7.979468 6.204546
## SEKER 0.8001198 9.241103 1.674229 13.003139 5.169411
## SIRA 0.7180104 7.426669 1.342274 6.674453 5.151782
## aspect_ration eccentricity
## BOMBAY 62.634051 29.372139
## CALI 2.935654 1.676703
## DERMASON 2.976447 1.592890
## HOROZ 3.586364 1.492527
## SEKER 5.052157 1.539004
## SIRA 2.817116 1.409862
##
## Residual Deviance: 4995.461
## AIC: 5079.461
coef_m <- coef(model_multi)
se_m <- summary(model_multi)$standard.errors
z_m <- coef_m / se_m
pval_m <- 2 * (1 - pnorm(abs(z_m)))
or_m <- exp(coef_m)
as.data.frame(round(or_m, 3)) %>%
rownames_to_column("Kelas (vs referensi)") %>%
kable(caption = "Relative Risk Ratio — exp(beta) — Model Multinomial") %>%
kable_styling(bootstrap_options = c("striped","hover","condensed"),
full_width = TRUE, font_size = 11) %>%
scroll_box(width = "100%")| Kelas (vs referensi) | (Intercept) | area | perimeter | major_axis_length | minor_axis_length | aspect_ration | eccentricity |
|---|---|---|---|---|---|---|---|
| BOMBAY | 0.000 | 33462.761 | 0 | 6.273195e+14 | 14944691.9 | 2.642 | 0.000 |
| CALI | 0.005 | 0.000 | 0 | 1.265679e+26 | 105224.1 | 0.000 | 0.371 |
| DERMASON | 0.040 | 0.012 | 0 | 1.435109e+29 | 0.0 | 0.000 | 0.089 |
| HOROZ | 0.011 | 0.000 | 0 | 1.201845e+39 | 0.0 | 0.000 | 0.000 |
| SEKER | 1.139 | 53383.061 | 0 | 1.410134e+24 | 0.0 | 0.000 | 0.002 |
| SIRA | 0.164 | 0.000 | 0 | 1.551358e+40 | 0.0 | 0.000 | 0.206 |
## Kategori REFERENSI: BARBUNYA
## Interpretasi Relative Risk Ratio (RRR = exp(beta)):
## RRR > 1: kenaikan prediktor meningkatkan peluang masuk kelas tsb vs referensi
## RRR < 1: kenaikan prediktor menurunkan peluang relatif vs referensi
# Tampilkan RRR signifikan per kelas (p < 0.05)
for (k in rownames(coef_m)) {
sig_idx <- which(pval_m[k,] < 0.05 & colnames(pval_m) != "(Intercept)")
if (length(sig_idx) > 0) {
cat(sprintf("[%s vs %s] Prediktor signifikan:\n", k, ref_class))
for (j in sig_idx) {
nm <- colnames(coef_m)[j]
cat(sprintf(" %-22s RRR = %.3f (p = %.4f)\n", nm, or_m[k, j], pval_m[k, j]))
}
cat("\n")
}
}## [CALI vs BARBUNYA] Prediktor signifikan:
## area RRR = 0.000 (p = 0.0000)
## perimeter RRR = 0.000 (p = 0.0000)
## major_axis_length RRR = 126567903658093233903239168.000 (p = 0.0000)
## minor_axis_length RRR = 105224.110 (p = 0.0075)
## aspect_ration RRR = 0.000 (p = 0.0019)
##
## [DERMASON vs BARBUNYA] Prediktor signifikan:
## perimeter RRR = 0.000 (p = 0.0000)
## major_axis_length RRR = 143510908930988938261428174848.000 (p = 0.0000)
## minor_axis_length RRR = 0.000 (p = 0.0000)
## aspect_ration RRR = 0.000 (p = 0.0000)
##
## [HOROZ vs BARBUNYA] Prediktor signifikan:
## area RRR = 0.000 (p = 0.0000)
## perimeter RRR = 0.000 (p = 0.0000)
## major_axis_length RRR = 1201845003173593425977308601526744252416.000 (p = 0.0000)
## minor_axis_length RRR = 0.000 (p = 0.0003)
## aspect_ration RRR = 0.000 (p = 0.0000)
## eccentricity RRR = 0.000 (p = 0.0000)
##
## [SEKER vs BARBUNYA] Prediktor signifikan:
## perimeter RRR = 0.000 (p = 0.0000)
## major_axis_length RRR = 1410133996369884320104448.000 (p = 0.0000)
## minor_axis_length RRR = 0.000 (p = 0.0000)
## aspect_ration RRR = 0.000 (p = 0.0000)
## eccentricity RRR = 0.002 (p = 0.0000)
##
## [SIRA vs BARBUNYA] Prediktor signifikan:
## area RRR = 0.000 (p = 0.0000)
## perimeter RRR = 0.000 (p = 0.0000)
## major_axis_length RRR = 15513582638288680616007664432843321245696.000 (p = 0.0000)
## minor_axis_length RRR = 0.000 (p = 0.0000)
## aspect_ration RRR = 0.000 (p = 0.0000)
Asumsi utama regresi logistik multinomial: (1) observasi independen, (2) tidak ada multikolinearitas berat, (3) Independence of Irrelevant Alternatives (IIA).
# Cek korelasi antar prediktor numerik
cor_mat <- cor(dplyr::select(bean_sc, all_of(fitur_model)), use = "complete.obs")
cat("Matriks korelasi antar prediktor:\n")## Matriks korelasi antar prediktor:
## area perimeter major_axis_length minor_axis_length
## area 1.00 0.97 0.93 0.95
## perimeter 0.97 1.00 0.98 0.91
## major_axis_length 0.93 0.98 1.00 0.83
## minor_axis_length 0.95 0.91 0.83 1.00
## aspect_ration 0.24 0.39 0.55 -0.01
## eccentricity 0.27 0.39 0.54 0.02
## aspect_ration eccentricity
## area 0.24 0.27
## perimeter 0.39 0.39
## major_axis_length 0.55 0.54
## minor_axis_length -0.01 0.02
## aspect_ration 1.00 0.92
## eccentricity 0.92 1.00
# Heatmap korelasi
cor_df <- reshape2::melt(cor_mat)
ggplot(cor_df, aes(x = Var1, y = Var2, fill = value)) +
geom_tile(color = "white") +
geom_text(aes(label = round(value, 2)), size = 3) +
scale_fill_gradient2(low = "#3498db", mid = "white", high = "#e74c3c",
midpoint = 0, limits = c(-1, 1)) +
labs(title = "Heatmap Korelasi Antar Prediktor",
subtitle = "Korelasi tinggi (|r| > 0.8) mengindikasikan potensi multikolinearitas",
x = NULL, y = NULL, fill = "Korelasi") +
theme_tugas() + theme(axis.text.x = element_text(angle = 30, hjust = 1))Catatan IIA: Model multinomial mengasumsikan
Independence of Irrelevant Alternatives — rasio peluang antara dua
kategori tidak bergantung pada ada/tidaknya kategori lain. Uji formal
(Hausman-McFadden) dapat dilakukan dengan package mlogit
bila diperlukan.
pred_m <- predict(model_multi, newdata = test_m)
acc_m <- mean(pred_m == test_m$Class)
cat(sprintf("Akurasi model multinomial: %.1f%%\n", acc_m * 100))## Akurasi model multinomial: 90.9%
## Aktual
## Prediksi BARBUNYA BOMBAY CALI DERMASON HOROZ SEKER SIRA
## BARBUNYA 230 0 10 1 1 3 2
## BOMBAY 0 109 1 0 0 0 0
## CALI 21 0 312 0 9 0 0
## DERMASON 0 0 0 632 7 8 47
## HOROZ 0 0 7 1 349 0 5
## SEKER 2 0 0 16 0 383 11
## SIRA 6 0 4 58 7 21 460
as.data.frame(cm_m) %>%
group_by(Aktual) %>%
mutate(Prop = Freq / sum(Freq)) %>%
ggplot(aes(x = Aktual, y = Prediksi, fill = Prop)) +
geom_tile(color = "white") +
geom_text(aes(label = sprintf("%.0f\n(%.0f%%)", Freq, Prop*100)),
size = 3, lineheight = 1.2) +
scale_fill_gradient(low = "#f0f8ff", high = "#2980b9") +
labs(title = "Confusion Matrix — Model Multinomial",
subtitle = paste0("Akurasi: ", round(acc_m*100,1), "%"),
fill = "Proporsi") +
theme_tugas() + theme(axis.text.x = element_text(angle = 30, hjust = 1))Dataset Drug Consumption
(Quantified) — UCI ML Repository
Link
archive.ics.uci.edu/dataset/373/drug+consumption+quantified
Referensi Fehrman et al. (2017), The
Five Factor Model of Personality and Evaluation of Drug Consumption
Risk
Tujuan Memprediksi
frekuensi konsumsi Cannabis berdasarkan profil
kepribadian NEO-FFI.
Skala respons (ordinal, dari jarang ke sering):
Never → Over Decade → Last Decade
→ Last Year → Last Month →
Last Week → Last Day
| Variabel | Tipe | Keterangan |
|---|---|---|
cannabis_ord |
Respons | Frekuensi konsumsi (7 level ordinal) |
nscore |
Prediktor | Neuroticism (NEO-FFI) |
escore |
Prediktor | Extraversion |
oscore |
Prediktor | Openness to experience |
ascore |
Prediktor | Agreeableness |
cscore |
Prediktor | Conscientiousness |
impulsive |
Prediktor | Impulsivitas (Eysenck) |
ss |
Prediktor | Sensation seeking |
gender_f |
Prediktor | Jenis kelamin |
lvl_can <- c("never","over_decade_ago","last_decade",
"last_year","last_month","last_week","last_day")
col_drug <- c("id","age","gender","education","country","ethnicity",
"nscore","escore","oscore","ascore","cscore","impulsive","ss",
"alcohol","amphet","amyl","benzos","caff","cannabis","choc",
"coke","crack","ecstasy","heroin","ketamine","legalh",
"lsd","meth","mushrooms","nicotine","semer","vsa")
make_drug_sim <- function(lvl) {
set.seed(2024); n <- 1885
ls <- with(list(
ns = rnorm(n,.07,.96), es = rnorm(n,-.10,.96),
os = rnorm(n,.10,.96), as_ = rnorm(n,-.08,.96),
cs = rnorm(n,-.11,.96), im = rnorm(n,.06,.96),
ss = rnorm(n,.08,.96)
), -0.5 + 0.3*os + 0.4*ss - 0.2*cs + 0.15*im + rnorm(n,0,1.2))
cuts <- quantile(ls, probs = seq(0,1,length.out=8))
data.frame(
age = rnorm(n,-.07,.95), gender = sample(c(-.48,.48),n,replace=TRUE),
nscore=rnorm(n,.07,.96), escore=rnorm(n,-.10,.96), oscore=rnorm(n,.10,.96),
ascore=rnorm(n,-.08,.96), cscore=rnorm(n,-.11,.96),
impulsive=rnorm(n,.06,.96), ss=rnorm(n,.08,.96),
cannabis_ord = factor(lvl[as.integer(cut(ls, breaks=cuts, include.lowest=TRUE))],
levels=lvl, ordered=TRUE)
)
}
drug_raw <- tryCatch(
read_csv("https://archive.ics.uci.edu/ml/machine-learning-databases/00373/drug_consumption.data",
col_names = FALSE, show_col_types = FALSE),
error = function(e) NULL
)
if (!is.null(drug_raw) && nrow(drug_raw) > 100) {
colnames(drug_raw) <- col_drug[1:ncol(drug_raw)]
can_num <- suppressWarnings(as.numeric(as.character(drug_raw$cannabis)))
if (sum(!is.na(can_num)) >= 100) {
drug_raw$cannabis_ord <- factor(
cut(can_num, breaks = c(-Inf,-.72,-.53,-.38,-.25,-.05,.20,Inf),
labels = lvl_can, right = FALSE),
levels = lvl_can, ordered = TRUE)
cat("Data UCI dimuat:", nrow(drug_raw), "baris |",
sum(!is.na(drug_raw$cannabis_ord)), "valid\n")
if (sum(!is.na(drug_raw$cannabis_ord)) < 100) drug_raw <- make_drug_sim(lvl_can)
} else {
drug_raw <- make_drug_sim(lvl_can)
cat("Encode gagal, menggunakan simulasi\n")
}
} else {
drug_raw <- make_drug_sim(lvl_can); cat("Menggunakan data simulasi\n")
}## Encode gagal, menggunakan simulasi
drug <- drug_raw %>%
mutate(
gender_f = factor(if_else(as.numeric(gender) < 0, "Female", "Male")),
age_grp = factor(cut(as.numeric(age), breaks = c(-Inf,-.5,0,.5,Inf),
labels = c("<25","25-34","35-44","45+")))
) %>%
filter(!is.na(cannabis_ord), !is.na(nscore), !is.na(escore),
!is.na(oscore), !is.na(ascore), !is.na(cscore)) %>%
mutate(cannabis_ord = droplevels(cannabis_ord))
cat("Dimensi akhir:", nrow(drug), "x", ncol(drug), "\n")## Dimensi akhir: 1885 x 12
## Level aktif: never over_decade_ago last_decade last_year last_month last_week last_day
| Kategori (Ordinal) | Frekuensi | Persentase |
|---|---|---|
| never | 270 | 14.3% |
| over_decade_ago | 269 | 14.3% |
| last_decade | 269 | 14.3% |
| last_year | 269 | 14.3% |
| last_month | 269 | 14.3% |
| last_week | 269 | 14.3% |
| last_day | 270 | 14.3% |
| cannabis_ord | nscore | escore | oscore | ascore | cscore | impulsive | ss | gender_f |
|---|---|---|---|---|---|---|---|---|
| last_week | 0.160 | -2.345 | 1.152 | 1.222 | -0.153 | 0.913 | 0.384 | Male |
| last_year | -0.261 | -0.623 | 0.456 | -0.390 | -0.828 | 1.956 | -0.314 | Female |
| last_month | 0.668 | 0.555 | 0.274 | 0.746 | -2.461 | 0.106 | 0.668 | Female |
| last_week | -0.214 | 0.712 | 1.246 | -0.569 | -0.352 | 0.868 | 1.573 | Male |
| last_decade | -0.017 | -2.205 | 0.331 | 0.344 | 0.163 | 0.790 | 0.427 | Female |
drug %>%
count(cannabis_ord) %>%
mutate(pct = n / sum(n)) %>%
ggplot(aes(x = cannabis_ord, y = n, fill = cannabis_ord)) +
geom_col(show.legend = FALSE) +
geom_text(aes(label = paste0(n, "\n(", percent(pct,1), ")")),
vjust = -0.2, size = 3.2, lineheight = 1.1) +
scale_fill_brewer(palette = "YlOrRd") +
labs(title = "Distribusi Frekuensi Konsumsi Cannabis",
subtitle = "Variabel respons ordinal — 7 kategori terurut",
x = "Frekuensi Konsumsi", y = "Frekuensi") + theme_tugas() +
theme(axis.text.x = element_text(angle = 20, hjust = 1))p_n <- ggplot(drug, aes(x = cannabis_ord, y = nscore, fill = cannabis_ord)) +
geom_boxplot(alpha = 0.7, show.legend = FALSE) + scale_fill_brewer(palette = "YlOrRd") +
labs(title = "Neuroticism per Kelompok", x = NULL, y = "N-score") + theme_tugas() +
theme(axis.text.x = element_text(angle = 20, hjust = 1))
p_o <- ggplot(drug, aes(x = cannabis_ord, y = oscore, fill = cannabis_ord)) +
geom_boxplot(alpha = 0.7, show.legend = FALSE) + scale_fill_brewer(palette = "YlOrRd") +
labs(title = "Openness per Kelompok", x = NULL, y = "O-score") + theme_tugas() +
theme(axis.text.x = element_text(angle = 20, hjust = 1))
gridExtra::grid.arrange(p_n, p_o, ncol = 2)drug %>%
count(gender_f, cannabis_ord) %>%
group_by(gender_f) %>%
mutate(prop = n / sum(n)) %>%
ggplot(aes(x = cannabis_ord, y = prop, fill = gender_f)) +
geom_col(position = "dodge") +
scale_fill_manual(values = c("#e91e8c","#1e88e5")) +
scale_y_continuous(labels = percent) +
labs(title = "Proporsi Konsumsi Cannabis per Gender",
x = "Frekuensi Konsumsi", y = "Proporsi", fill = "Gender") + theme_tugas() +
theme(axis.text.x = element_text(angle = 20, hjust = 1))set.seed(42)
drug_clean <- drug %>%
dplyr::select(cannabis_ord, nscore, escore, oscore, ascore,
cscore, impulsive, ss, gender_f) %>%
drop_na()
drug_clean$cannabis_ord <- droplevels(drug_clean$cannabis_ord)
cat("Baris valid:", nrow(drug_clean), "| Level:", levels(drug_clean$cannabis_ord), "\n")## Baris valid: 1885 | Level: never over_decade_ago last_decade last_year last_month last_week last_day
idx_o <- sample(nrow(drug_clean), 0.8 * nrow(drug_clean))
train_o <- drug_clean[idx_o, ]
test_o <- drug_clean[-idx_o, ]
model_ord <- MASS::polr(
cannabis_ord ~ nscore + escore + oscore + ascore + cscore + impulsive + ss + gender_f,
data = train_o, Hess = TRUE, method = "logistic"
)
summary(model_ord)## Call:
## MASS::polr(formula = cannabis_ord ~ nscore + escore + oscore +
## ascore + cscore + impulsive + ss + gender_f, data = train_o,
## Hess = TRUE, method = "logistic")
##
## Coefficients:
## Value Std. Error t value
## nscore -0.040153 0.04722 -0.85037
## escore -0.088162 0.04680 -1.88373
## oscore 0.058096 0.04813 1.20717
## ascore 0.017391 0.04776 0.36416
## cscore 0.001555 0.04543 0.03422
## impulsive 0.002689 0.04612 0.05831
## ss -0.044970 0.04720 -0.95278
## gender_fMale 0.100776 0.09052 1.11326
##
## Intercepts:
## Value Std. Error t value
## never|over_decade_ago -1.7482 0.0869 -20.1200
## over_decade_ago|last_decade -0.8456 0.0734 -11.5203
## last_decade|last_year -0.2247 0.0699 -3.2120
## last_year|last_month 0.3489 0.0702 4.9671
## last_month|last_week 0.9570 0.0742 12.8996
## last_week|last_day 1.8197 0.0872 20.8713
##
## Residual Deviance: 5859.751
## AIC: 5887.751
ctable <- coef(summary(model_ord))
pval_o <- pnorm(abs(ctable[,"t value"]), lower.tail = FALSE) * 2
or_o <- exp(ctable[,"Value"])
hasil_ord <- data.frame(
Variabel = rownames(ctable),
Koefisien = round(ctable[,"Value"], 4),
SE = round(ctable[,"Std. Error"], 4),
`t-value` = round(ctable[,"t value"], 4),
`p-value` = round(pval_o, 4),
OR = round(or_o, 4),
check.names = FALSE
) %>% mutate(Signifikan = if_else(`p-value` < 0.05, "✓", ""))
is_coef <- !grepl("\\|", hasil_ord$Variabel)
hasil_ord %>%
kable(digits = 4, caption = "Koefisien, OR, dan p-value — Model Proportional Odds") %>%
kable_styling(bootstrap_options = c("striped","hover","condensed"), full_width = FALSE) %>%
row_spec(which(is_coef & hasil_ord$`p-value` < 0.05), background = "#eafaf1") %>%
pack_rows("Koefisien Prediktor", 1, sum(is_coef)) %>%
pack_rows("Intercept (Cutpoints)", sum(is_coef)+1, nrow(hasil_ord))| Variabel | Koefisien | SE | t-value | p-value | OR | Signifikan | |
|---|---|---|---|---|---|---|---|
| Koefisien Prediktor | |||||||
| nscore | nscore | -0.0402 | 0.0472 | -0.8504 | 0.3951 | 0.9606 | |
| escore | escore | -0.0882 | 0.0468 | -1.8837 | 0.0596 | 0.9156 | |
| oscore | oscore | 0.0581 | 0.0481 | 1.2072 | 0.2274 | 1.0598 | |
| ascore | ascore | 0.0174 | 0.0478 | 0.3642 | 0.7157 | 1.0175 | |
| cscore | cscore | 0.0016 | 0.0454 | 0.0342 | 0.9727 | 1.0016 | |
| impulsive | impulsive | 0.0027 | 0.0461 | 0.0583 | 0.9535 | 1.0027 | |
| ss | ss | -0.0450 | 0.0472 | -0.9528 | 0.3407 | 0.9560 | |
| gender_fMale | gender_fMale | 0.1008 | 0.0905 | 1.1133 | 0.2656 | 1.1060 | |
| Intercept (Cutpoints) | |||||||
| never|over_decade_ago | never|over_decade_ago | -1.7482 | 0.0869 | -20.1200 | 0.0000 | 0.1741 | ✓ |
| over_decade_ago|last_decade | over_decade_ago|last_decade | -0.8456 | 0.0734 | -11.5203 | 0.0000 | 0.4293 | ✓ |
| last_decade|last_year | last_decade|last_year | -0.2247 | 0.0699 | -3.2120 | 0.0013 | 0.7988 | ✓ |
| last_year|last_month | last_year|last_month | 0.3489 | 0.0702 | 4.9671 | 0.0000 | 1.4176 | ✓ |
| last_month|last_week | last_month|last_week | 0.9570 | 0.0742 | 12.8996 | 0.0000 | 2.6040 | ✓ |
| last_week|last_day | last_week|last_day | 1.8197 | 0.0872 | 20.8713 | 0.0000 | 6.1702 | ✓ |
Konvensi tanda polr() R: Model yang digunakan adalah
logit[P(Y ≤ j)] = α_j + β·x. Artinya β > 0 (OR
> 1) berarti kenaikan prediktor meningkatkan odds untuk
berada di kategori yang lebih rendah (konsumsi lebih jarang).
β < 0 (OR < 1) berarti cenderung ke kategori
lebih tinggi (konsumsi lebih sering). Ini berlawanan dengan konvensi
beberapa buku (termasuk Agresti) yang menggunakan P(Y ≥ j).
betas <- coef(model_ord)
cat("INTERPRETASI KOEFISIEN (konvensi polr: logit[P(Y<=j)] = alpha_j + beta*x):\n\n")## INTERPRETASI KOEFISIEN (konvensi polr: logit[P(Y<=j)] = alpha_j + beta*x):
for (nm in names(betas)) {
pv <- pval_o[nm]
sig <- if_else(pv < 0.05, "(signifikan *)", "(tidak signifikan)")
arah <- if_else(betas[nm] < 0, "lebih SERING konsumsi", "lebih JARANG konsumsi")
cat(sprintf(" %-12s beta=%+.3f, OR=%.3f -> %s %s\n",
nm, betas[nm], exp(betas[nm]), arah, sig))
}## nscore beta=-0.040, OR=0.961 -> lebih SERING konsumsi (tidak signifikan)
## escore beta=-0.088, OR=0.916 -> lebih SERING konsumsi (tidak signifikan)
## oscore beta=+0.058, OR=1.060 -> lebih JARANG konsumsi (tidak signifikan)
## ascore beta=+0.017, OR=1.018 -> lebih JARANG konsumsi (tidak signifikan)
## cscore beta=+0.002, OR=1.002 -> lebih JARANG konsumsi (tidak signifikan)
## impulsive beta=+0.003, OR=1.003 -> lebih JARANG konsumsi (tidak signifikan)
## ss beta=-0.045, OR=0.956 -> lebih SERING konsumsi (tidak signifikan)
## gender_fMale beta=+0.101, OR=1.106 -> lebih JARANG konsumsi (tidak signifikan)
Asumsi utama model ordinal: Proportional Odds — efek setiap prediktor bersifat konsisten di semua titik cut-off (slope sama untuk semua j).
# Uji Proportional Odds: bandingkan model penuh vs model terpisah
# Pendekatan: cek likelihood ratio antara model constrained vs unconstrained
# (Uji formal Brant memerlukan package brant; di sini kita lakukan secara manual)
cat("Pemeriksaan asumsi Proportional Odds:\n\n")## Pemeriksaan asumsi Proportional Odds:
# Buat model biner untuk setiap cut-point dan bandingkan koefisiennya
n_levels <- nlevels(train_o$cannabis_ord)
coef_per_cut <- list()
for (j in 1:(n_levels - 1)) {
lvls <- levels(train_o$cannabis_ord)
y_bin <- as.integer(train_o$cannabis_ord > lvls[j])
df_tmp <- train_o %>% mutate(y_bin = y_bin)
m_tmp <- glm(y_bin ~ nscore + escore + oscore + ascore + cscore + impulsive + ss + gender_f,
data = df_tmp, family = binomial)
coef_per_cut[[j]] <- coef(m_tmp)[-1]
}
coef_mat <- do.call(rbind, coef_per_cut)
rownames(coef_mat) <- paste0("Cut ", 1:(n_levels-1))
cat("Koefisien model biner per cut-point (harusnya relatif konsisten):\n")## Koefisien model biner per cut-point (harusnya relatif konsisten):
## nscore escore oscore ascore cscore impulsive ss gender_fMale
## Cut 1 0.055 -0.093 0.151 0.012 0.029 0.055 -0.112 0.192
## Cut 2 -0.044 -0.096 0.095 -0.011 -0.016 -0.054 0.006 0.117
## Cut 3 -0.039 -0.071 0.071 0.003 0.011 0.005 0.004 0.038
## Cut 4 -0.048 -0.101 0.032 0.000 0.011 0.000 -0.078 0.151
## Cut 5 -0.049 -0.116 0.017 0.059 0.015 0.034 -0.040 0.092
## Cut 6 -0.109 -0.027 -0.007 0.076 -0.052 0.002 -0.106 0.067
# Hitung koefisien variasi
cv_coef <- apply(coef_mat, 2, function(x) sd(x)/abs(mean(x))*100)
cat("\nKoefisien Variasi antar cut-point (CV% < 30% = relatif konsisten):\n")##
## Koefisien Variasi antar cut-point (CV% < 30% = relatif konsisten):
## nscore escore oscore ascore cscore impulsive
## 135.6 37.4 96.5 152.7 12596.9 537.1
## ss gender_fMale
## 96.9 51.4
pred_o <- factor(as.character(predict(model_ord, newdata = test_o)),
levels = levels(drug_clean$cannabis_ord), ordered = TRUE)
aktual_o <- test_o$cannabis_ord
acc_o <- mean(as.character(pred_o) == as.character(aktual_o))
cat(sprintf("Akurasi model ordinal: %.1f%%\n", acc_o * 100))## Akurasi model ordinal: 13.0%
## Aktual
## Prediksi never over_decade_ago last_decade last_year last_month
## never 8 8 4 6 10
## over_decade_ago 15 19 20 23 23
## last_decade 0 0 0 0 0
## last_year 0 0 0 0 0
## last_month 0 0 0 0 0
## last_week 0 0 0 0 0
## last_day 33 20 32 26 28
## Aktual
## Prediksi last_week last_day
## never 11 8
## over_decade_ago 18 18
## last_decade 0 0
## last_year 0 0
## last_month 0 0
## last_week 0 0
## last_day 25 22
grid_o <- data.frame(
nscore=0, escore=0, oscore=seq(-2,2,by=0.1),
ascore=0, cscore=0, impulsive=0, ss=0,
gender_f=factor("Male", levels=levels(drug_clean$gender_f))
)
prob_o <- predict(model_ord, newdata = grid_o, type = "probs")
as.data.frame(prob_o) %>%
mutate(oscore = grid_o$oscore) %>%
pivot_longer(-oscore, names_to = "Kategori", values_to = "Probabilitas") %>%
mutate(Kategori = factor(Kategori, levels = levels(drug_clean$cannabis_ord), ordered=TRUE)) %>%
ggplot(aes(x = oscore, y = Probabilitas, color = Kategori)) +
geom_line(linewidth = 1) +
scale_color_brewer(palette = "YlOrRd") +
labs(title = "Probabilitas Prediksi per Kategori Cannabis",
subtitle = "Variasi Openness (oscore), prediktor lain = 0",
x = "Openness Score (standardized)", y = "Probabilitas",
color = "Frekuensi") + theme_tugas()hasil_ord %>%
filter(!grepl("\\|", Variabel)) %>%
ggplot(aes(x = fct_reorder(Variabel, OR), y = OR, color = OR < 1)) +
geom_point(size = 3) +
geom_hline(yintercept = 1, linetype = "dashed", color = "grey50") +
coord_flip() +
scale_color_manual(values = c("#e74c3c","#27ae60"),
labels = c("Konsumsi lebih SERING (OR<1)",
"Konsumsi lebih JARANG (OR>1)")) +
labs(title = "Odds Ratio — Model Ordinal",
x = NULL, y = "Odds Ratio (exp beta)", color = NULL) + theme_tugas()Dataset Chicago Crimes
2001–Present — Chicago Data Portal
Link
data.cityofchicago.org
Penyedia City of Chicago / Chicago
Police Department
Tujuan Memodelkan
jumlah kejahatan per distrik per bulan.
| Variabel | Tipe | Keterangan |
|---|---|---|
n_crimes |
Respons | Jumlah kejahatan per distrik per bulan (count ≥ 0) |
musim |
Prediktor kategorik | Musim: Dingin / Semi / Panas / Gugur |
year |
Prediktor numerik | Tahun (2019–2023) |
district |
Prediktor kategorik | Distrik kepolisian Chicago |
url_crime <- paste0(
"https://data.cityofchicago.org/resource/ijzp-q8t2.json",
"?$limit=50000&$select=district,year,month",
"&$where=year>=2019%20AND%20year<=2023"
)
crime_raw <- tryCatch({
resp <- httr::GET(url_crime, httr::add_headers("X-App-Token"=""), httr::timeout(30))
if (httr::status_code(resp) == 200) {
d <- jsonlite::fromJSON(rawToChar(resp$content))
cat("Data Chicago dimuat:", nrow(d), "baris\n"); d
} else NULL
}, error = function(e) NULL)
if (is.null(crime_raw) || nrow(crime_raw) < 100) {
cat("Fallback ke data simulasi\n")
set.seed(2024); n <- 5000
crime_raw <- data.frame(
district = sample(paste0("D", sprintf("%02d",1:25)), n, replace=TRUE),
year = sample(2019:2023, n, replace=TRUE),
month = sample(1:12, n, replace=TRUE)
)
}## Fallback ke data simulasi
crime <- crime_raw %>%
mutate(year = as.integer(year),
month = as.integer(month),
district = factor(str_pad(str_trim(as.character(district)), 2, pad="0"))) %>%
count(district, year, month, name = "n_crimes") %>%
mutate(
musim = factor(case_when(
month %in% c(12,1,2) ~ "Dingin",
month %in% 3:5 ~ "Semi",
month %in% 6:8 ~ "Panas",
TRUE ~ "Gugur"
), levels = c("Dingin","Semi","Panas","Gugur"))
)
cat("Dimensi agregat:", nrow(crime), "baris\n")## Dimensi agregat: 1447 baris
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 1.000 2.000 3.000 3.455 5.000 11.000
| Statistik | Nilai |
|---|---|
| Min | 1.00 |
| Q1 | 2.00 |
| Median | 3.00 |
| Mean | 3.46 |
| Q3 | 5.00 |
| Max | 11.00 |
| Std. Dev | 1.73 |
ggplot(crime, aes(x = n_crimes)) +
geom_histogram(bins = 30, fill = "#e74c3c", alpha = 0.8, color = "white") +
geom_vline(xintercept = mean(crime$n_crimes), linetype="dashed",
color="#2c3e50", linewidth=1) +
annotate("text", x = mean(crime$n_crimes)*1.4,
y = max(table(cut(crime$n_crimes,30)))*0.85,
label = sprintf("Mean = %.1f\nVar = %.1f",
mean(crime$n_crimes), var(crime$n_crimes)),
color="#2c3e50", size=4) +
labs(title = "Distribusi Jumlah Kejahatan per Distrik per Bulan",
subtitle = "Periksa apakah Mean ≈ Variance (asumsi Poisson)",
x = "Jumlah Kejahatan", y = "Frekuensi") + theme_tugas()p_musim <- crime %>%
group_by(musim) %>%
summarise(mean_c = mean(n_crimes), se = sd(n_crimes)/sqrt(n())) %>%
ggplot(aes(x = musim, y = mean_c, fill = musim)) +
geom_col(show.legend = FALSE) +
geom_errorbar(aes(ymin = mean_c-se, ymax = mean_c+se), width=.3) +
scale_fill_manual(values = c("#3498db","#2ecc71","#e74c3c","#f39c12")) +
labs(title = "Rata-rata Kejahatan per Musim",
x = "Musim", y = "Rata-rata") + theme_tugas()
p_year <- crime %>%
group_by(year) %>%
summarise(total = sum(n_crimes)) %>%
ggplot(aes(x = year, y = total)) +
geom_line(color = "#e74c3c", linewidth=1.2) +
geom_point(size=3, color="#e74c3c") +
geom_text(aes(label = comma(total)), vjust=-0.8, size=3) +
scale_y_continuous(labels = comma) +
labs(title = "Total Kejahatan per Tahun",
x = "Tahun", y = "Total") + theme_tugas()
gridExtra::grid.arrange(p_musim, p_year, ncol=2)crime %>%
group_by(district) %>%
summarise(mean_c = mean(n_crimes)) %>%
slice_max(mean_c, n=10) %>%
ggplot(aes(x = fct_reorder(district, mean_c), y = mean_c, fill = mean_c)) +
geom_col(show.legend = FALSE) +
geom_text(aes(label = round(mean_c,1)), hjust = -0.1) +
coord_flip() +
scale_fill_gradient(low = "#f9ebea", high = "#e74c3c") +
labs(title = "10 Distrik dengan Rata-rata Kejahatan Tertinggi",
x = "Distrik", y = "Rata-rata Kejahatan/Bulan") + theme_tugas()mn <- mean(crime$n_crimes)
vr <- var(crime$n_crimes)
rasio <- vr / mn
cat("=== UJI ASUMSI EKUIDISPERSI (Mean = Variance) ===\n\n")## === UJI ASUMSI EKUIDISPERSI (Mean = Variance) ===
## Mean : 3.455
## Variance : 2.981
## Rasio : 0.863
if (rasio > 2) {
cat("OVERDISPERSI BERAT: Variance >> Mean (rasio > 2)\n")
cat("Disarankan: Quasi-Poisson atau Negative Binomial\n")
} else if (rasio > 1.3) {
cat("OVERDISPERSI RINGAN: Model Poisson masih dapat digunakan\n")
cat("namun pertimbangkan quasi-Poisson untuk SE yang lebih konservatif.\n")
} else {
cat("Asumsi ekuidispersi terpenuhi. Model Poisson sesuai.\n")
}## Asumsi ekuidispersi terpenuhi. Model Poisson sesuai.
# Gunakan musim + year + district sebagai prediktor
# district memberikan informasi lokasi yang lebih informatif
model_poi <- glm(
n_crimes ~ musim + year + district,
data = crime, family = poisson(link = "log")
)
summary(model_poi)##
## Call:
## glm(formula = n_crimes ~ musim + year + district, family = poisson(link = "log"),
## data = crime)
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -29.934343 20.216969 -1.481 0.13870
## musimSemi 0.004520 0.039360 0.115 0.90857
## musimPanas -0.009402 0.039639 -0.237 0.81252
## musimGugur -0.101103 0.040472 -2.498 0.01249 *
## year 0.015494 0.010003 1.549 0.12142
## districtD02 -0.043845 0.095068 -0.461 0.64465
## districtD03 -0.034194 0.095299 -0.359 0.71974
## districtD04 -0.205388 0.098372 -2.088 0.03681 *
## districtD05 -0.104541 0.096627 -1.082 0.27930
## districtD06 -0.236364 0.100179 -2.359 0.01830 *
## districtD07 -0.159331 0.098517 -1.617 0.10582
## districtD08 -0.013908 0.094734 -0.147 0.88328
## districtD09 -0.136237 0.097012 -1.404 0.16022
## districtD10 -0.277405 0.101857 -2.723 0.00646 **
## districtD11 -0.049644 0.094839 -0.523 0.60066
## districtD12 -0.160615 0.098092 -1.637 0.10155
## districtD13 -0.049631 0.096133 -0.516 0.60566
## districtD14 0.033947 0.093663 0.362 0.71703
## districtD15 -0.106728 0.096631 -1.104 0.26938
## districtD16 -0.172291 0.097952 -1.759 0.07859 .
## districtD17 -0.085100 0.096132 -0.885 0.37602
## districtD18 -0.181301 0.098661 -1.838 0.06612 .
## districtD19 -0.172440 0.098374 -1.753 0.07962 .
## districtD20 -0.009056 0.093455 -0.097 0.92281
## districtD21 -0.072927 0.095411 -0.764 0.44466
## districtD22 -0.280451 0.101858 -2.753 0.00590 **
## districtD23 -0.121782 0.098963 -1.231 0.21848
## districtD24 -0.095679 0.096881 -0.988 0.32336
## districtD25 -0.173824 0.097952 -1.775 0.07597 .
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for poisson family taken to be 1)
##
## Null deviance: 1266.0 on 1446 degrees of freedom
## Residual deviance: 1219.9 on 1418 degrees of freedom
## AIC: 5617.2
##
## Number of Fisher Scoring iterations: 4
irr_df <- tidy(model_poi, exponentiate = TRUE, conf.int = TRUE)
irr_df %>%
rename(Variabel = term, IRR = estimate,
`CI 95% Bawah` = conf.low, `CI 95% Atas` = conf.high,
SE = std.error, `z-value` = statistic, `p-value` = p.value) %>%
mutate(Signifikan = if_else(`p-value` < 0.05, "✓", "")) %>%
kable(digits = 4, caption = "Incidence Rate Ratio (IRR = exp(beta)) — Model Regresi Poisson") %>%
kable_styling(bootstrap_options = c("striped","hover","condensed"), full_width = FALSE) %>%
row_spec(which(tidy(model_poi)$p.value < 0.05), background = "#fdedec") %>%
scroll_box(height = "400px")| Variabel | IRR | SE | z-value | p-value | CI 95% Bawah | CI 95% Atas | Signifikan |
|---|---|---|---|---|---|---|---|
| (Intercept) | 0.0000 | 20.2170 | -1.4807 | 0.1387 | 0.0000 | 16113.5040 | |
| musimSemi | 1.0045 | 0.0394 | 0.1148 | 0.9086 | 0.9299 | 1.0851 | |
| musimPanas | 0.9906 | 0.0396 | -0.2372 | 0.8125 | 0.9166 | 1.0707 | |
| musimGugur | 0.9038 | 0.0405 | -2.4981 | 0.0125 | 0.8349 | 0.9784 | ✓ |
| year | 1.0156 | 0.0100 | 1.5488 | 0.1214 | 0.9959 | 1.0357 | |
| districtD02 | 0.9571 | 0.0951 | -0.4612 | 0.6447 | 0.7942 | 1.1531 | |
| districtD03 | 0.9664 | 0.0953 | -0.3588 | 0.7197 | 0.8015 | 1.1648 | |
| districtD04 | 0.8143 | 0.0984 | -2.0879 | 0.0368 | 0.6710 | 0.9871 | ✓ |
| districtD05 | 0.9007 | 0.0966 | -1.0819 | 0.2793 | 0.7450 | 1.0883 | |
| districtD06 | 0.7895 | 0.1002 | -2.3594 | 0.0183 | 0.6481 | 0.9601 | ✓ |
| districtD07 | 0.8527 | 0.0985 | -1.6173 | 0.1058 | 0.7025 | 1.0339 | |
| districtD08 | 0.9862 | 0.0947 | -0.1468 | 0.8833 | 0.8189 | 1.1874 | |
| districtD09 | 0.8726 | 0.0970 | -1.4043 | 0.1602 | 0.7211 | 1.0551 | |
| districtD10 | 0.7577 | 0.1019 | -2.7235 | 0.0065 | 0.6199 | 0.9244 | ✓ |
| districtD11 | 0.9516 | 0.0948 | -0.5235 | 0.6007 | 0.7899 | 1.1459 | |
| districtD12 | 0.8516 | 0.0981 | -1.6374 | 0.1015 | 0.7022 | 1.0317 | |
| districtD13 | 0.9516 | 0.0961 | -0.5163 | 0.6057 | 0.7878 | 1.1487 | |
| districtD14 | 1.0345 | 0.0937 | 0.3624 | 0.7170 | 0.8609 | 1.2432 | |
| districtD15 | 0.8988 | 0.0966 | -1.1045 | 0.2694 | 0.7433 | 1.0859 | |
| districtD16 | 0.8417 | 0.0980 | -1.7589 | 0.0786 | 0.6942 | 1.0195 | |
| districtD17 | 0.9184 | 0.0961 | -0.8852 | 0.3760 | 0.7604 | 1.1087 | |
| districtD18 | 0.8342 | 0.0987 | -1.8376 | 0.0661 | 0.6870 | 1.0117 | |
| districtD19 | 0.8416 | 0.0984 | -1.7529 | 0.0796 | 0.6935 | 1.0201 | |
| districtD20 | 0.9910 | 0.0935 | -0.0969 | 0.9228 | 0.8250 | 1.1904 | |
| districtD21 | 0.9297 | 0.0954 | -0.7643 | 0.4447 | 0.7708 | 1.1208 | |
| districtD22 | 0.7554 | 0.1019 | -2.7534 | 0.0059 | 0.6180 | 0.9216 | ✓ |
| districtD23 | 0.8853 | 0.0990 | -1.2306 | 0.2185 | 0.7287 | 1.0743 | |
| districtD24 | 0.9088 | 0.0969 | -0.9876 | 0.3234 | 0.7512 | 1.0985 | |
| districtD25 | 0.8404 | 0.0980 | -1.7746 | 0.0760 | 0.6932 | 1.0179 |
## INTERPRETASI INCIDENCE RATE RATIO (IRR):
## IRR > 1: prediktor meningkatkan rata-rata jumlah kejahatan
## IRR < 1: prediktor menurunkan rata-rata jumlah kejahatan
for (i in 2:min(8, nrow(irr_df))) {
nm <- irr_df$term[i]
irr <- irr_df$estimate[i]
pv <- irr_df$p.value[i]
sig <- if_else(pv < 0.05, "(signifikan)", "(tidak signifikan)")
cat(sprintf(" %-20s IRR = %.3f -> rata-rata kejahatan %s %.1f%% %s\n",
nm, irr,
if_else(irr > 1, "lebih tinggi", "lebih rendah"),
abs(irr-1)*100, sig))
}## musimSemi IRR = 1.005 -> rata-rata kejahatan lebih tinggi 0.5% (tidak signifikan)
## musimPanas IRR = 0.991 -> rata-rata kejahatan lebih rendah 0.9% (tidak signifikan)
## musimGugur IRR = 0.904 -> rata-rata kejahatan lebih rendah 9.6% (signifikan)
## year IRR = 1.016 -> rata-rata kejahatan lebih tinggi 1.6% (tidak signifikan)
## districtD02 IRR = 0.957 -> rata-rata kejahatan lebih rendah 4.3% (tidak signifikan)
## districtD03 IRR = 0.966 -> rata-rata kejahatan lebih rendah 3.4% (tidak signifikan)
## districtD04 IRR = 0.814 -> rata-rata kejahatan lebih rendah 18.6% (signifikan)
# Uji goodness-of-fit: Pearson chi-square
pearson_chisq <- sum(residuals(model_poi, type="pearson")^2)
df_res <- df.residual(model_poi)
p_gof <- pchisq(pearson_chisq, df_res, lower.tail = FALSE)
cat("=== GOODNESS-OF-FIT TEST ===\n")## === GOODNESS-OF-FIT TEST ===
## Pearson chi-square : 1191.77
## df residual : 1418
## p-value : 1.0000
if (p_gof < 0.05) {
cat("Model kurang fit (p < 0.05). Pertimbangkan penambahan prediktor\n")
cat("atau beralih ke Negative Binomial jika overdispersi berat.\n")
} else {
cat("Model fit baik (p >= 0.05).\n")
}## Model fit baik (p >= 0.05).
## Devians Residual : 1219.91 (df = 1418)
## AIC : 5617.16
cat(sprintf("Pseudo R2 McFadden: %.4f\n",
1 - logLik(model_poi)/logLik(glm(n_crimes~1,data=crime,family=poisson))))## Pseudo R2 McFadden: 0.0082
crime %>%
mutate(pred = predict(model_poi, type = "response")) %>%
group_by(musim) %>%
summarise(Aktual = mean(n_crimes), Prediksi = mean(pred)) %>%
pivot_longer(-musim, names_to = "Tipe", values_to = "Nilai") %>%
ggplot(aes(x = musim, y = Nilai, fill = Tipe)) +
geom_col(position = "dodge") +
scale_fill_manual(values = c("#2980b9","#e74c3c")) +
labs(title = "Aktual vs Prediksi per Musim",
subtitle = "Model Regresi Poisson",
x = "Musim", y = "Rata-rata Jumlah Kejahatan", fill = NULL) + theme_tugas()crime_diag <- crime %>%
mutate(fitted = fitted(model_poi),
resid_p = resid(model_poi, type="pearson"),
resid_d = resid(model_poi, type="deviance"))
p_r1 <- ggplot(crime_diag, aes(x=fitted, y=resid_p)) +
geom_point(alpha=0.3, color="#e74c3c", size=0.8) +
geom_hline(yintercept=0, linetype="dashed") +
geom_smooth(method="loess", se=FALSE, color="#2c3e50") +
labs(title="Residual Pearson vs Fitted",
x="Fitted", y="Residual Pearson") + theme_tugas()
p_r2 <- ggplot(crime_diag, aes(sample=resid_d)) +
stat_qq(color="#e74c3c", alpha=0.4) + stat_qq_line(color="#2c3e50") +
labs(title="Q-Q Plot Residual Devians",
x="Theoretical Quantiles", y="Sample Quantiles") + theme_tugas()
gridExtra::grid.arrange(p_r1, p_r2, ncol=2)| Aspek | Biner | Multinomial | Ordinal | Poisson |
|---|---|---|---|---|
| Dataset | Adult Income (UCI) | Dry Bean (UCI) | Drug Consumption (UCI) | Chicago Crime |
| n observasi | ~45.000 | 13.611 | 1.885 | Agregat distrik |
| Variabel Y | Income >50K/<=50K | Jenis kacang (7 kelas) | Konsumsi cannabis (7 level) | Jml. kejahatan/bulan |
| Skala Y | Nominal biner | Nominal (tak berurutan) | Ordinal (berurutan) | Count (cacahan) |
| Link function | logit | log (softmax) | logit kumulatif | log |
| Package R | glm (stats) | multinom (nnet) | polr (MASS) | glm (stats) |
| Interpretasi β | Odds Ratio (OR) | Relative Risk Ratio (RRR) | Cumulative Odds Ratio | Incidence Rate Ratio (IRR) |
| Asumsi tambahan | Tidak ada multikolinearitas | IIA (Irrelevant Alt.) | Proportional Odds | Ekuidispersi (mean=var) |
| Kondisi Variabel Y | Model | Fungsi R |
|---|---|---|
| 2 kategori (Ya/Tidak) | Regresi Logistik Biner | glm(…, family=binomial) |
| >=3 kategori TANPA urutan | Regresi Logistik Multinomial | multinom() — package nnet |
| >=3 kategori DENGAN urutan | Regresi Logistik Ordinal | polr() — package MASS |
| Data hitung (0,1,2,…) | Regresi Poisson | glm(…, family=poisson) |
| Count + overdispersi berat | Regresi Negative Binomial | glm.nb() — package MASS |