library(nnet)
library(car)
library(ggplot2)
library(dplyr)
library(tidyr)
library(broom)
library(png)
library(mlogit)
library(margins)
df_raw <- read.csv("Palmer Archipelago (Antarctica) Penguin Data\\penguins_lter.csv",
stringsAsFactors = FALSE)
cat("Dimensi data asli:", dim(df_raw), "\n")
## Dimensi data asli: 344 17
df_raw <- dplyr::rename(df_raw,
species_raw = Species,
island_raw = Island,
culmen_length = Culmen.Length..mm.,
culmen_depth = Culmen.Depth..mm.,
flipper_length = Flipper.Length..mm.,
body_mass = Body.Mass..g.,
sex = Sex
)
df_raw <- df_raw %>%
mutate(
species = case_when(
grepl("Adelie", species_raw, ignore.case = TRUE) ~ "Adelie",
grepl("Gentoo", species_raw, ignore.case = TRUE) ~ "Gentoo",
TRUE ~ "Others"
),
island = case_when(
island_raw == "Biscoe" ~ "Biscoe",
island_raw == "Dream" ~ "Dream",
TRUE ~ "Others"
)
)
# Bersihkan NA, filter sex valid, buat factor dengan reference level eksplisit
df <- df_raw %>%
select(species, island, culmen_length, culmen_depth,
flipper_length, body_mass, sex) %>%
filter(
!is.na(culmen_length), !is.na(culmen_depth),
!is.na(flipper_length), !is.na(body_mass),
sex %in% c("MALE", "FEMALE")
) %>%
mutate(
species = factor(species, levels = c("Adelie", "Gentoo", "Others")),
island = factor(island, levels = c("Biscoe", "Dream", "Others")),
sex = factor(sex, levels = c("FEMALE", "MALE"))
)
# MLR pakai log-transformation
df <- df %>%
mutate(
log_culmen_length = log(culmen_length),
log_culmen_depth = log(culmen_depth),
log_flipper_length = log(flipper_length),
log_body_mass = log(body_mass)
)
vars_log <- c("log_culmen_length", "log_culmen_depth",
"log_flipper_length", "log_body_mass")
cat("Data siap:", nrow(df), "baris\n")
## Data siap: 333 baris
cat("Reference category (base): Adelie\n")
## Reference category (base): Adelie
cat("Distribusi species:\n")
## Distribusi species:
print(table(df$species))
##
## Adelie Gentoo Others
## 146 119 68
cat("ASUMSI 1: LINEARITY OF LOG-ODDS\n")
## ASUMSI 1: LINEARITY OF LOG-ODDS
cat("Ringkasan prediktor SEBELUM log-transformation:\n")
## Ringkasan prediktor SEBELUM log-transformation:
print(summary(df[, c("culmen_length", "culmen_depth",
"flipper_length", "body_mass")]))
## culmen_length culmen_depth flipper_length body_mass
## Min. :32.10 Min. :13.10 Min. :172 Min. :2700
## 1st Qu.:39.50 1st Qu.:15.60 1st Qu.:190 1st Qu.:3550
## Median :44.50 Median :17.30 Median :197 Median :4050
## Mean :43.99 Mean :17.16 Mean :201 Mean :4207
## 3rd Qu.:48.60 3rd Qu.:18.70 3rd Qu.:213 3rd Qu.:4775
## Max. :59.60 Max. :21.50 Max. :231 Max. :6300
cat("\nRingkasan prediktor SETELAH log-transformation:\n")
##
## Ringkasan prediktor SETELAH log-transformation:
print(summary(df[, vars_log]))
## log_culmen_length log_culmen_depth log_flipper_length log_body_mass
## Min. :3.469 Min. :2.573 Min. :5.147 Min. :7.901
## 1st Qu.:3.676 1st Qu.:2.747 1st Qu.:5.247 1st Qu.:8.175
## Median :3.795 Median :2.851 Median :5.283 Median :8.306
## Mean :3.776 Mean :2.836 Mean :5.301 Mean :8.327
## 3rd Qu.:3.884 3rd Qu.:2.929 3rd Qu.:5.361 3rd Qu.:8.471
## Max. :4.088 Max. :3.068 Max. :5.442 Max. :8.748
status_lin <- "TERPENUHI (via log-transformation)"
cat(sprintf("\n>> STATUS ASUMSI 1 (Linearity) : %s\n", status_lin))
##
## >> STATUS ASUMSI 1 (Linearity) : TERPENUHI (via log-transformation)
Asumsi linearity dipenuhi melalui log-transformation pada keempat prediktor numerik (culmen length, culmen depth, flipper length, body mass). Log-transformation menstabilkan varians dan memungkinkan hubungan multiplikatif antara prediktor dan response.
cat("ASUMSI 2: NO MULTICOLLINEARITY\n")
## ASUMSI 2: NO MULTICOLLINEARITY
cat("Uji: Variance Inflation Factor (VIF)\n")
## Uji: Variance Inflation Factor (VIF)
cat("Threshold: VIF > 10 mengindikasikan multikolinearitas serius\n")
## Threshold: VIF > 10 mengindikasikan multikolinearitas serius
cat(" VIF > 5 perlu diwaspadai\n\n")
## VIF > 5 perlu diwaspadai
lm_proxy <- lm(as.numeric(species) ~ log_culmen_length + log_culmen_depth +
log_flipper_length + log_body_mass + island + sex,
data = df)
vif_vals <- vif(lm_proxy)
cat("VIF tiap prediktor:\n")
## VIF tiap prediktor:
print(round(vif_vals, 4))
## GVIF Df GVIF^(1/(2*Df))
## log_culmen_length 2.3204 1 1.5233
## log_culmen_depth 3.3711 1 1.8361
## log_flipper_length 5.5855 1 2.3634
## log_body_mass 5.6607 1 2.3792
## island 2.6338 2 1.2739
## sex 2.5470 1 1.5959
vif_check <- if (is.matrix(vif_vals)) vif_vals[, 3]^2 else vif_vals
status_mc <- if (all(vif_check < 10)) "TERPENUHI" else "TIDAK TERPENUHI"
cat(sprintf("\n>> STATUS ASUMSI 2 (No Multicollinearity) : %s\n", status_mc))
##
## >> STATUS ASUMSI 2 (No Multicollinearity) : TERPENUHI
Semua VIF berada di bawah threshold 10, mengindikasikan tidak ada masalah multikolinearitas serius.
cat("ASUMSI 3: INDEPENDENCE OF OBSERVATIONS\n")
## ASUMSI 3: INDEPENDENCE OF OBSERVATIONS
cat(sprintf("Jumlah observasi unik: %d\n", nrow(df)))
## Jumlah observasi unik: 333
cat("Tidak ada duplikasi atau clustering temporal.\n")
## Tidak ada duplikasi atau clustering temporal.
status_ind <- "TERPENUHI (cross-sectional design)"
cat(sprintf("\n>> STATUS ASUMSI 3 (Independence) : %s\n", status_ind))
##
## >> STATUS ASUMSI 3 (Independence) : TERPENUHI (cross-sectional design)
cat("RINGKASAN ASUMSI MULTINOMIAL LOGISTIC REGRESSION\n")
## RINGKASAN ASUMSI MULTINOMIAL LOGISTIC REGRESSION
asumsi_mlr_names <- c("Linearity of Log-Odds",
"No Multicollinearity",
"Independence of Observations")
asumsi_mlr_stats <- c(status_lin, status_mc, status_ind)
for (i in seq_along(asumsi_mlr_names)) {
tanda <- if (grepl("TERPENUHI", asumsi_mlr_stats[i])) "[v]" else "[x]"
cat(sprintf(" %s %-30s : %s\n",
tanda, asumsi_mlr_names[i], asumsi_mlr_stats[i]))
}
## [v] Linearity of Log-Odds : TERPENUHI (via log-transformation)
## [v] No Multicollinearity : TERPENUHI
## [v] Independence of Observations : TERPENUHI (cross-sectional design)
cat("\nKESIMPULAN: Asumsi MLR terpenuhi, model dapat di-fit.\n")
##
## KESIMPULAN: Asumsi MLR terpenuhi, model dapat di-fit.
cat("STEP 3: FIT MODEL MLR\n")
## STEP 3: FIT MODEL MLR
cat("Reference category: Adelie\n")
## Reference category: Adelie
cat("Predictors: log(culmen_length), log(culmen_depth), log(flipper_length),\n")
## Predictors: log(culmen_length), log(culmen_depth), log(flipper_length),
cat(" log(body_mass), island, sex\n\n")
## log(body_mass), island, sex
df$species <- relevel(df$species, ref = "Adelie")
# Fit MLR dengan log-transformed predictors + island + sex
mlr_model <- multinom(species ~ log_culmen_length + log_culmen_depth +
log_flipper_length + log_body_mass +
island + sex,
data = df,
trace = FALSE)
cat("Ringkasan Model MLR:\n")
## Ringkasan Model MLR:
print(summary(mlr_model))
## Call:
## multinom(formula = species ~ log_culmen_length + log_culmen_depth +
## log_flipper_length + log_body_mass + island + sex, data = df,
## trace = FALSE)
##
## Coefficients:
## (Intercept) log_culmen_length log_culmen_depth log_flipper_length
## Gentoo -140.8718 464.3764 -1039.7359 -77.25628
## Others -120.4244 1306.3289 -356.3137 -207.23278
## log_body_mass islandDream islandOthers sexMALE
## Gentoo 200.7984 -35.26019 11.93033 65.33625
## Others -341.7207 130.12291 79.27830 -15.77810
##
## Std. Errors:
## (Intercept) log_culmen_length log_culmen_depth log_flipper_length
## Gentoo 1.123483 4.223028 3.222035 5.929871
## Others 1786.296303 1613.643790 11423.714790 15290.935272
## log_body_mass islandDream islandOthers sexMALE
## Gentoo 9.499404 3.031093e-12 1.123483e+00 1.123483e+00
## Others 11751.830565 1.786296e+03 7.274925e-09 4.283925e-11
##
## Residual Deviance: 0.000149664
## AIC: 32.00015
# Ekstrak koefisien dan p-value (Wald test)
mlr_summary <- summary(mlr_model)
mlr_coef <- mlr_summary$coefficients
mlr_se <- mlr_summary$standard.errors
# Z-score dan p-value (two-tailed Wald test)
z_scores <- mlr_coef / mlr_se
p_values <- 2 * (1 - pnorm(abs(z_scores)))
cat("Koefisien (Log-Odds):\n")
## Koefisien (Log-Odds):
print(round(mlr_coef, 4))
## (Intercept) log_culmen_length log_culmen_depth log_flipper_length
## Gentoo -140.8718 464.3764 -1039.7359 -77.2563
## Others -120.4244 1306.3289 -356.3137 -207.2328
## log_body_mass islandDream islandOthers sexMALE
## Gentoo 200.7984 -35.2602 11.9303 65.3363
## Others -341.7207 130.1229 79.2783 -15.7781
cat("\nStandard Error:\n")
##
## Standard Error:
print(round(mlr_se, 4))
## (Intercept) log_culmen_length log_culmen_depth log_flipper_length
## Gentoo 1.1235 4.223 3.222 5.9299
## Others 1786.2963 1613.644 11423.715 15290.9353
## log_body_mass islandDream islandOthers sexMALE
## Gentoo 9.4994 0.000 1.1235 1.1235
## Others 11751.8306 1786.296 0.0000 0.0000
cat("\nZ-score:\n")
##
## Z-score:
print(round(z_scores, 4))
## (Intercept) log_culmen_length log_culmen_depth log_flipper_length
## Gentoo -125.3885 109.9629 -322.6954 -13.0283
## Others -0.0674 0.8096 -0.0312 -0.0136
## log_body_mass islandDream islandOthers sexMALE
## Gentoo 21.1380 -1.163283e+13 1.061910e+01 5.815510e+01
## Others -0.0291 7.280000e-02 1.089747e+10 -3.683095e+11
cat("\np-value (Wald Test):\n")
##
## p-value (Wald Test):
print(round(p_values, 4))
## (Intercept) log_culmen_length log_culmen_depth log_flipper_length
## Gentoo 0.0000 0.0000 0.0000 0.0000
## Others 0.9463 0.4182 0.9751 0.9892
## log_body_mass islandDream islandOthers sexMALE
## Gentoo 0.0000 0.0000 0 0
## Others 0.9768 0.9419 0 0
Koefisien MLR diinterpretasikan pada skala log-odds. Karena prediktor numerik di-log-transform, satu unit kenaikan adalah pada skala log (bukan original scale).
cat("STEP 4: INTERPRETASI RELATIVE LOG-ODDS\n")
## STEP 4: INTERPRETASI RELATIVE LOG-ODDS
make_coef_table <- function(coef_mat, pval_mat, contrast_name) {
data.frame(
Predictor = colnames(coef_mat),
Beta = round(coef_mat[contrast_name, ], 4),
P_value = round(pval_mat[contrast_name, ], 4),
Sig = ifelse(pval_mat[contrast_name, ] < 0.001, "***",
ifelse(pval_mat[contrast_name, ] < 0.01, "**",
ifelse(pval_mat[contrast_name, ] < 0.05, "*",
ifelse(pval_mat[contrast_name, ] < 0.1, ".", ""))))
)
}
cat("\n Contrast 1: Gentoo vs Adelie\n")
##
## Contrast 1: Gentoo vs Adelie
coef_gentoo <- make_coef_table(mlr_coef, p_values, "Gentoo")
print(coef_gentoo, row.names = FALSE)
## Predictor Beta P_value Sig
## (Intercept) -140.8718 0 ***
## log_culmen_length 464.3764 0 ***
## log_culmen_depth -1039.7359 0 ***
## log_flipper_length -77.2563 0 ***
## log_body_mass 200.7984 0 ***
## islandDream -35.2602 0 ***
## islandOthers 11.9303 0 ***
## sexMALE 65.3363 0 ***
cat("\n Contrast 2: Others (Chinstrap) vs Adelie\n")
##
## Contrast 2: Others (Chinstrap) vs Adelie
coef_others <- make_coef_table(mlr_coef, p_values, "Others")
print(coef_others, row.names = FALSE)
## Predictor Beta P_value Sig
## (Intercept) -120.4244 0.9463
## log_culmen_length 1306.3289 0.4182
## log_culmen_depth -356.3137 0.9751
## log_flipper_length -207.2328 0.9892
## log_body_mass -341.7207 0.9768
## islandDream 130.1229 0.9419
## islandOthers 79.2783 0.0000 ***
## sexMALE -15.7781 0.0000 ***
cat("\nKeterangan: *** p<0.001, ** p<0.01, * p<0.05, . p<0.10\n")
##
## Keterangan: *** p<0.001, ** p<0.01, * p<0.05, . p<0.10
cat("INTERPRETASI LOG-ODDS\n")
## INTERPRETASI LOG-ODDS
cat("\n PREDIKTOR NUMERIK (Log-Transformed)\n")
##
## PREDIKTOR NUMERIK (Log-Transformed)
cat("Interpretasi: Satu unit kenaikan log(X) -> log-odds berubah sebesar beta\n\n")
## Interpretasi: Satu unit kenaikan log(X) -> log-odds berubah sebesar beta
numeric_vars <- c("log_culmen_length", "log_culmen_depth",
"log_flipper_length", "log_body_mass")
for (v in numeric_vars) {
cat(sprintf("Variable: %s\n", v))
cat(sprintf(" Gentoo vs Adelie : beta = %.4f (p = %.4f)\n",
mlr_coef["Gentoo", v], p_values["Gentoo", v]))
cat(sprintf(" Others vs Adelie : beta = %.4f (p = %.4f)\n",
mlr_coef["Others", v], p_values["Others", v]))
cat("\n")
}
## Variable: log_culmen_length
## Gentoo vs Adelie : beta = 464.3764 (p = 0.0000)
## Others vs Adelie : beta = 1306.3289 (p = 0.4182)
##
## Variable: log_culmen_depth
## Gentoo vs Adelie : beta = -1039.7359 (p = 0.0000)
## Others vs Adelie : beta = -356.3137 (p = 0.9751)
##
## Variable: log_flipper_length
## Gentoo vs Adelie : beta = -77.2563 (p = 0.0000)
## Others vs Adelie : beta = -207.2328 (p = 0.9892)
##
## Variable: log_body_mass
## Gentoo vs Adelie : beta = 200.7984 (p = 0.0000)
## Others vs Adelie : beta = -341.7207 (p = 0.9768)
cat("PREDIKTOR KATEGORIK\n")
## PREDIKTOR KATEGORIK
cat("Interpretasi: Selisih log-odds antara kategori vs reference\n")
## Interpretasi: Selisih log-odds antara kategori vs reference
cat("Reference: Island = Biscoe | Sex = FEMALE\n\n")
## Reference: Island = Biscoe | Sex = FEMALE
categorical_vars <- grep("^island|^sex", colnames(mlr_coef), value = TRUE)
for (v in categorical_vars) {
cat(sprintf("Variable: %s\n", v))
cat(sprintf(" Gentoo vs Adelie : beta = %.4f (p = %.4f)\n",
mlr_coef["Gentoo", v], p_values["Gentoo", v]))
cat(sprintf(" Others vs Adelie : beta = %.4f (p = %.4f)\n",
mlr_coef["Others", v], p_values["Others", v]))
cat("\n")
}
## Variable: islandDream
## Gentoo vs Adelie : beta = -35.2602 (p = 0.0000)
## Others vs Adelie : beta = 130.1229 (p = 0.9419)
##
## Variable: islandOthers
## Gentoo vs Adelie : beta = 11.9303 (p = 0.0000)
## Others vs Adelie : beta = 79.2783 (p = 0.0000)
##
## Variable: sexMALE
## Gentoo vs Adelie : beta = 65.3363 (p = 0.0000)
## Others vs Adelie : beta = -15.7781 (p = 0.0000)
cat("STEP 5: RELATIVE RISK RATIOS (RRR)\n")
## STEP 5: RELATIVE RISK RATIOS (RRR)
cat("RRR = exp(beta) -> seberapa besar risk relatif berubah\n")
## RRR = exp(beta) -> seberapa besar risk relatif berubah
cat("RRR > 1 : meningkatkan kemungkinan species j (vs Adelie)\n")
## RRR > 1 : meningkatkan kemungkinan species j (vs Adelie)
cat("RRR < 1 : menurunkan kemungkinan species j (vs Adelie)\n")
## RRR < 1 : menurunkan kemungkinan species j (vs Adelie)
cat("RRR = 1 : tidak ada perubahan\n\n")
## RRR = 1 : tidak ada perubahan
# Eksponentialkan koefisien
rrr <- exp(mlr_coef)
cat("Tabel Relative Risk Ratios:\n")
## Tabel Relative Risk Ratios:
print(round(rrr, 4))
## (Intercept) log_culmen_length log_culmen_depth log_flipper_length
## Gentoo 0 4.743703e+201 0 0
## Others 0 Inf 0 0
## log_body_mass islandDream islandOthers sexMALE
## Gentoo 1.605582e+87 0.000000e+00 1.518017e+05 2.372322e+28
## Others 0.000000e+00 3.248347e+56 2.692328e+34 0.000000e+00
cat("INTERPRETASI RRR\n")
## INTERPRETASI RRR
cat("\n- A. NUMERIK (Log-Transformed) -\n")
##
## - A. NUMERIK (Log-Transformed) -
cat("Interpretasi: jika log(X) naik 1 unit (atau X dikalikan e ~ 2.718),\n")
## Interpretasi: jika log(X) naik 1 unit (atau X dikalikan e ~ 2.718),
cat(" relative risk vs Adelie berubah sebesar faktor RRR\n\n")
## relative risk vs Adelie berubah sebesar faktor RRR
for (v in numeric_vars) {
cat(sprintf("%s:\n", v))
cat(sprintf(" RRR Gentoo = %.4e | RRR Others = %.4e\n",
rrr["Gentoo", v], rrr["Others", v]))
}
## log_culmen_length:
## RRR Gentoo = 4.7437e+201 | RRR Others = Inf
## log_culmen_depth:
## RRR Gentoo = 0.0000e+00 | RRR Others = 1.7986e-155
## log_flipper_length:
## RRR Gentoo = 2.8056e-34 | RRR Others = 9.9988e-91
## log_body_mass:
## RRR Gentoo = 1.6056e+87 | RRR Others = 3.9136e-149
cat("\n- B. KATEGORIK -\n")
##
## - B. KATEGORIK -
cat("Interpretasi per kategori (vs reference): tiap level non-reference\n")
## Interpretasi per kategori (vs reference): tiap level non-reference
cat("dibandingkan terhadap reference, holding other variables constant\n\n")
## dibandingkan terhadap reference, holding other variables constant
for (v in categorical_vars) {
cat(sprintf("%s:\n", v))
cat(sprintf(" RRR Gentoo = %.4f | RRR Others = %.4f\n",
rrr["Gentoo", v], rrr["Others", v]))
}
## islandDream:
## RRR Gentoo = 0.0000 | RRR Others = 324834735872812779520006262080260682620204224442284844806.0000
## islandOthers:
## RRR Gentoo = 151801.6825 | RRR Others = 26923279014863810530680440044222608.0000
## sexMALE:
## RRR Gentoo = 23723220945913667986628040886.0000 | RRR Others = 0.0000
rrr_table_gentoo <- data.frame(
Predictor = colnames(rrr),
RRR = round(rrr["Gentoo", ], 4),
Beta = round(mlr_coef["Gentoo", ], 4),
P_value = round(p_values["Gentoo", ], 4)
)
rrr_table_others <- data.frame(
Predictor = colnames(rrr),
RRR = round(rrr["Others", ], 4),
Beta = round(mlr_coef["Others", ], 4),
P_value = round(p_values["Others", ], 4)
)
cat("Tabel RRR untuk Gentoo vs Adelie:\n")
## Tabel RRR untuk Gentoo vs Adelie:
print(rrr_table_gentoo, row.names = FALSE)
## Predictor RRR Beta P_value
## (Intercept) 0.000000e+00 -140.8718 0
## log_culmen_length 4.743703e+201 464.3764 0
## log_culmen_depth 0.000000e+00 -1039.7359 0
## log_flipper_length 0.000000e+00 -77.2563 0
## log_body_mass 1.605582e+87 200.7984 0
## islandDream 0.000000e+00 -35.2602 0
## islandOthers 1.518017e+05 11.9303 0
## sexMALE 2.372322e+28 65.3363 0
cat("\nTabel RRR untuk Others (Chinstrap) vs Adelie:\n")
##
## Tabel RRR untuk Others (Chinstrap) vs Adelie:
print(rrr_table_others, row.names = FALSE)
## Predictor RRR Beta P_value
## (Intercept) 0.000000e+00 -120.4244 0.9463
## log_culmen_length Inf 1306.3289 0.4182
## log_culmen_depth 0.000000e+00 -356.3137 0.9751
## log_flipper_length 0.000000e+00 -207.2328 0.9892
## log_body_mass 0.000000e+00 -341.7207 0.9768
## islandDream 3.248347e+56 130.1229 0.9419
## islandOthers 2.692328e+34 79.2783 0.0000
## sexMALE 0.000000e+00 -15.7781 0.0000
Marginal Effect mengukur perubahan probabilitas prediksi suatu kategori species ketika satu prediktor berubah satu unit, dengan prediktor lain dipegang konstan (evaluated at the mean / average marginal effect).
cat("STEP 6: MARGINAL EFFECTS\n")
## STEP 6: MARGINAL EFFECTS
cat("Metode: Average Marginal Effects (AME)\n")
## Metode: Average Marginal Effects (AME)
softmax_pred <- function(model, newdata) {
predict(model, newdata = newdata, type = "probs")
}
delta <- 1e-5
numeric_preds <- c("log_culmen_length", "log_culmen_depth",
"log_flipper_length", "log_body_mass")
cat("AME untuk Prediktor Numerik (Log-Transformed)\n")
## AME untuk Prediktor Numerik (Log-Transformed)
cat("Interpretasi: Kenaikan 1 unit log(X) -> rata-rata perubahan P(species) sebesar:\n\n")
## Interpretasi: Kenaikan 1 unit log(X) -> rata-rata perubahan P(species) sebesar:
ame_numeric <- data.frame()
for (v in numeric_preds) {
df_plus <- df
df_minus <- df
df_plus[[v]] <- df[[v]] + delta
df_minus[[v]] <- df[[v]] - delta
prob_plus <- softmax_pred(mlr_model, df_plus)
prob_minus <- softmax_pred(mlr_model, df_minus)
# Numerical derivative: (f(x+h) - f(x-h)) / 2h
me_mat <- (prob_plus - prob_minus) / (2 * delta)
# AME = rata-rata di seluruh observasi
ame_row <- colMeans(me_mat)
ame_numeric <- rbind(ame_numeric, data.frame(
Predictor = v,
AME_Adelie = round(ame_row["Adelie"], 6),
AME_Gentoo = round(ame_row["Gentoo"], 6),
AME_Others = round(ame_row["Others"], 6)
))
cat(sprintf(" %s:\n", v))
cat(sprintf(" AME P(Adelie) = %+.6f\n", ame_row["Adelie"]))
cat(sprintf(" AME P(Gentoo) = %+.6f\n", ame_row["Gentoo"]))
cat(sprintf(" AME P(Others) = %+.6f\n", ame_row["Others"]))
cat(sprintf(" [Catatan: Jumlah AME ketiga kategori = %.6f (harus ~ 0)]\n\n",
sum(ame_row)))
}
## log_culmen_length:
## AME P(Adelie) = -0.000165
## AME P(Gentoo) = +0.000071
## AME P(Others) = +0.000094
## [Catatan: Jumlah AME ketiga kategori = 0.000000 (harus ~ 0)]
##
## log_culmen_depth:
## AME P(Adelie) = +0.000185
## AME P(Gentoo) = -0.000159
## AME P(Others) = -0.000026
## [Catatan: Jumlah AME ketiga kategori = 0.000000 (harus ~ 0)]
##
## log_flipper_length:
## AME P(Adelie) = +0.000027
## AME P(Gentoo) = -0.000012
## AME P(Others) = -0.000015
## [Catatan: Jumlah AME ketiga kategori = 0.000000 (harus ~ 0)]
##
## log_body_mass:
## AME P(Adelie) = -0.000006
## AME P(Gentoo) = +0.000031
## AME P(Others) = -0.000025
## [Catatan: Jumlah AME ketiga kategori = 0.000000 (harus ~ 0)]
cat("Ringkasan AME Numerik:\n")
## Ringkasan AME Numerik:
print(ame_numeric, row.names = FALSE)
## Predictor AME_Adelie AME_Gentoo AME_Others
## log_culmen_length -0.000165 0.000071 9.4e-05
## log_culmen_depth 0.000185 -0.000159 -2.6e-05
## log_flipper_length 0.000027 -0.000012 -1.5e-05
## log_body_mass -0.000006 0.000031 -2.5e-05
cat("\n AME untuk Prediktor Kategorik (Discrete Change)\n")
##
## AME untuk Prediktor Kategorik (Discrete Change)
# Island: Biscoe (reference) -> Dream, Others
# Sex: FEMALE (reference) -> MALE
ame_categorical <- data.frame()
# Island: Dream vs Biscoe
df_biscoe <- df; df_biscoe$island <- factor("Biscoe", levels = levels(df$island))
df_dream <- df; df_dream$island <- factor("Dream", levels = levels(df$island))
df_others_isl <- df; df_others_isl$island <- factor("Others", levels = levels(df$island))
prob_biscoe <- colMeans(softmax_pred(mlr_model, df_biscoe))
prob_dream <- colMeans(softmax_pred(mlr_model, df_dream))
prob_others_isl <- colMeans(softmax_pred(mlr_model, df_others_isl))
ame_dream <- prob_dream - prob_biscoe
ame_oth_i <- prob_others_isl - prob_biscoe
cat("Island: Dream vs Biscoe (reference)\n")
## Island: Dream vs Biscoe (reference)
cat(sprintf(" AME P(Adelie) = %+.6f\n", ame_dream["Adelie"]))
## AME P(Adelie) = -0.100799
cat(sprintf(" AME P(Gentoo) = %+.6f\n", ame_dream["Gentoo"]))
## AME P(Gentoo) = -0.029758
cat(sprintf(" AME P(Others) = %+.6f\n\n", ame_dream["Others"]))
## AME P(Others) = +0.130557
cat("Island: Others (Torgersen) vs Biscoe (reference)\n")
## Island: Others (Torgersen) vs Biscoe (reference)
cat(sprintf(" AME P(Adelie) = %+.6f\n", ame_oth_i["Adelie"]))
## AME P(Adelie) = -0.075112
cat(sprintf(" AME P(Gentoo) = %+.6f\n", ame_oth_i["Gentoo"]))
## AME P(Gentoo) = -0.000009
cat(sprintf(" AME P(Others) = %+.6f\n\n", ame_oth_i["Others"]))
## AME P(Others) = +0.075121
# Sex: MALE vs FEMALE
df_female <- df; df_female$sex <- factor("FEMALE", levels = levels(df$sex))
df_male <- df; df_male$sex <- factor("MALE", levels = levels(df$sex))
prob_female <- colMeans(softmax_pred(mlr_model, df_female))
prob_male <- colMeans(softmax_pred(mlr_model, df_male))
ame_sex <- prob_male - prob_female
cat("Sex: MALE vs FEMALE (reference)\n")
## Sex: MALE vs FEMALE (reference)
cat(sprintf(" AME P(Adelie) = %+.6f\n", ame_sex["Adelie"]))
## AME P(Adelie) = +0.002052
cat(sprintf(" AME P(Gentoo) = %+.6f\n", ame_sex["Gentoo"]))
## AME P(Gentoo) = +0.003003
cat(sprintf(" AME P(Others) = %+.6f\n\n", ame_sex["Others"]))
## AME P(Others) = -0.005055
# Tabel ringkasan
ame_cat_df <- data.frame(
Contrast = c("island: Dream vs Biscoe",
"island: Others vs Biscoe",
"sex: MALE vs FEMALE"),
AME_Adelie = round(c(ame_dream["Adelie"], ame_oth_i["Adelie"], ame_sex["Adelie"]), 6),
AME_Gentoo = round(c(ame_dream["Gentoo"], ame_oth_i["Gentoo"], ame_sex["Gentoo"]), 6),
AME_Others = round(c(ame_dream["Others"], ame_oth_i["Others"], ame_sex["Others"]), 6)
)
cat("Tabel Ringkasan AME:\n")
## Tabel Ringkasan AME:
print(ame_cat_df, row.names = FALSE)
## Contrast AME_Adelie AME_Gentoo AME_Others
## island: Dream vs Biscoe -0.100799 -0.029758 0.130557
## island: Others vs Biscoe -0.075112 -0.000009 0.075121
## sex: MALE vs FEMALE 0.002052 0.003003 -0.005055
cat("\nINTERPRETASI MARGINAL EFFECTS:\n")
##
## INTERPRETASI MARGINAL EFFECTS:
cat("AME positif : kenaikan prediktor MENINGKATKAN probabilitas kategori\n")
## AME positif : kenaikan prediktor MENINGKATKAN probabilitas kategori
cat("AME negatif : kenaikan prediktor MENURUNKAN probabilitas kategori\n")
## AME negatif : kenaikan prediktor MENURUNKAN probabilitas kategori
cat("Jumlah AME ketiga kategori = 0 (identitas probabilitas)\n")
## Jumlah AME ketiga kategori = 0 (identitas probabilitas)
cat("Satuan AME : satuan probabilitas (bukan log-odds, bukan RRR)\n")
## Satuan AME : satuan probabilitas (bukan log-odds, bukan RRR)
cat("Numerik : marginal effect dihitung pada log(X), bukan X asli\n")
## Numerik : marginal effect dihitung pada log(X), bukan X asli
cat("untuk interpretasi pada X asli, AME perlu dibagi X (chain rule)\n")
## untuk interpretasi pada X asli, AME perlu dibagi X (chain rule)
Uji serentak menguji apakah model dengan semua prediktor secara bersama-sama lebih baik dari model null (hanya intercept). Hipotesis: - H₀: Semua koefisien prediktor = 0 (model null, tidak ada pengaruh prediktor) - H₁: Minimal satu koefisien ≠ 0
cat("STEP 7A: UJI SIGNIFIKANSI SERENTAK (LIKELIHOOD RATIO TEST)\n")
## STEP 7A: UJI SIGNIFIKANSI SERENTAK (LIKELIHOOD RATIO TEST)
cat("H0: Semua beta = 0 (model null)\n")
## H0: Semua beta = 0 (model null)
cat("H1: Minimal satu beta != 0\n\n")
## H1: Minimal satu beta != 0
# Fit model null
mlr_null <- multinom(species ~ 1, data = df, trace = FALSE)
# Likelihood Ratio Test: G^2 = -2*(logLik(null) - logLik(full))
ll_null <- logLik(mlr_null)
ll_full <- logLik(mlr_model)
G2 <- -2 * (as.numeric(ll_null) - as.numeric(ll_full))
df_lrt <- attr(ll_full, "df") - attr(ll_null, "df")
p_lrt <- pchisq(G2, df = df_lrt, lower.tail = FALSE)
cat(sprintf("Log-Likelihood Model Null : %.4f\n", as.numeric(ll_null)))
## Log-Likelihood Model Null : -350.8627
cat(sprintf("Log-Likelihood Model Full : %.4f\n", as.numeric(ll_full)))
## Log-Likelihood Model Full : -0.0001
cat(sprintf("Statistik G^2 (Chi-Square): %.4f\n", G2))
## Statistik G^2 (Chi-Square): 701.7252
cat(sprintf("Derajat Bebas : %d\n", df_lrt))
## Derajat Bebas : 14
cat(sprintf("p-value : %.6f\n", p_lrt))
## p-value : 0.000000
if (p_lrt < 0.05) {
cat("\nKESIMPULAN: Tolak H0. Model dengan prediktor signifikan lebih baik dari\n")
cat(" model null. Minimal satu prediktor berpengaruh signifikan\n")
cat(" terhadap klasifikasi species penguin.\n")
} else {
cat("\nKESIMPULAN: Gagal tolak H0. Model tidak signifikan secara serentak.\n")
}
##
## KESIMPULAN: Tolak H0. Model dengan prediktor signifikan lebih baik dari
## model null. Minimal satu prediktor berpengaruh signifikan
## terhadap klasifikasi species penguin.
# McFadden's Pseudo R-squared (ukuran goodness-of-fit)
cat("\n Pseudo R-Squared (McFadden)\n")
##
## Pseudo R-Squared (McFadden)
pseudo_r2 <- 1 - (as.numeric(ll_full) / as.numeric(ll_null))
cat(sprintf("McFadden's Pseudo R^2 = %.4f\n", pseudo_r2))
## McFadden's Pseudo R^2 = 1.0000
cat("Interpretasi:\n")
## Interpretasi:
cat(" 0.20 - 0.40 : model excellent fit\n")
## 0.20 - 0.40 : model excellent fit
cat(" 0.10 - 0.20 : model acceptable fit\n")
## 0.10 - 0.20 : model acceptable fit
cat(sprintf(" Nilai %.4f: %s\n", pseudo_r2,
ifelse(pseudo_r2 > 0.20, "Excellent fit", "Acceptable fit")))
## Nilai 1.0000: Excellent fit
Uji parsial menguji apakah masing-masing prediktor berkontribusi signifikan terhadap model. Hipotesis untuk tiap prediktor X_k: - H₀: β_k = 0 (prediktor X_k tidak berpengaruh) - H₁: β_k ≠ 0
cat("STEP 7B: UJI SIGNIFIKANSI PARSIAL (WALD TEST)\n")
## STEP 7B: UJI SIGNIFIKANSI PARSIAL (WALD TEST)
cat("Statistik uji: Z = beta / SE ~ N(0,1)\n")
## Statistik uji: Z = beta / SE ~ N(0,1)
cat("p-value: 2 * P(Z > |z_obs|)\n\n")
## p-value: 2 * P(Z > |z_obs|)
cat("Gentoo vs Adelie\n")
## Gentoo vs Adelie
wald_gentoo <- data.frame(
Predictor = colnames(mlr_coef),
Beta = round(mlr_coef["Gentoo", ], 4),
SE = round(mlr_se["Gentoo", ], 4),
Z = round(z_scores["Gentoo", ], 4),
P_value = round(p_values["Gentoo", ], 4),
Sig = ifelse(p_values["Gentoo", ] < 0.001, "***",
ifelse(p_values["Gentoo", ] < 0.01, "**",
ifelse(p_values["Gentoo", ] < 0.05, "*",
ifelse(p_values["Gentoo", ] < 0.1, ".", ""))))
)
print(wald_gentoo, row.names = FALSE)
## Predictor Beta SE Z P_value Sig
## (Intercept) -140.8718 1.1235 -1.253885e+02 0 ***
## log_culmen_length 464.3764 4.2230 1.099629e+02 0 ***
## log_culmen_depth -1039.7359 3.2220 -3.226954e+02 0 ***
## log_flipper_length -77.2563 5.9299 -1.302830e+01 0 ***
## log_body_mass 200.7984 9.4994 2.113800e+01 0 ***
## islandDream -35.2602 0.0000 -1.163283e+13 0 ***
## islandOthers 11.9303 1.1235 1.061910e+01 0 ***
## sexMALE 65.3363 1.1235 5.815510e+01 0 ***
cat("\n Others (Chinstrap) vs Adelie\n")
##
## Others (Chinstrap) vs Adelie
wald_others <- data.frame(
Predictor = colnames(mlr_coef),
Beta = round(mlr_coef["Others", ], 4),
SE = round(mlr_se["Others", ], 4),
Z = round(z_scores["Others", ], 4),
P_value = round(p_values["Others", ], 4),
Sig = ifelse(p_values["Others", ] < 0.001, "***",
ifelse(p_values["Others", ] < 0.01, "**",
ifelse(p_values["Others", ] < 0.05, "*",
ifelse(p_values["Others", ] < 0.1, ".", ""))))
)
print(wald_others, row.names = FALSE)
## Predictor Beta SE Z P_value Sig
## (Intercept) -120.4244 1786.296 -6.740000e-02 0.9463
## log_culmen_length 1306.3289 1613.644 8.096000e-01 0.4182
## log_culmen_depth -356.3137 11423.715 -3.120000e-02 0.9751
## log_flipper_length -207.2328 15290.935 -1.360000e-02 0.9892
## log_body_mass -341.7207 11751.831 -2.910000e-02 0.9768
## islandDream 130.1229 1786.296 7.280000e-02 0.9419
## islandOthers 79.2783 0.000 1.089747e+10 0.0000 ***
## sexMALE -15.7781 0.000 -3.683095e+11 0.0000 ***
cat("\nKeterangan: *** p<0.001 | ** p<0.01 | * p<0.05 | . p<0.10\n")
##
## Keterangan: *** p<0.001 | ** p<0.01 | * p<0.05 | . p<0.10
cat("\n Ringkasan Prediktor Signifikan (p < 0.05)\n")
##
## Ringkasan Prediktor Signifikan (p < 0.05)
sig_gentoo <- names(p_values["Gentoo", ])[p_values["Gentoo", ] < 0.05]
sig_others <- names(p_values["Others", ])[p_values["Others", ] < 0.05]
not_sig_g <- names(p_values["Gentoo", ])[p_values["Gentoo", ] >= 0.05]
not_sig_o <- names(p_values["Others", ])[p_values["Others", ] >= 0.05]
cat("Gentoo vs Adelie - SIGNIFIKAN:\n")
## Gentoo vs Adelie - SIGNIFIKAN:
for (v in sig_gentoo) cat(sprintf(" [*] %s\n", v))
## [*] (Intercept)
## [*] log_culmen_length
## [*] log_culmen_depth
## [*] log_flipper_length
## [*] log_body_mass
## [*] islandDream
## [*] islandOthers
## [*] sexMALE
cat("Gentoo vs Adelie - TIDAK SIGNIFIKAN:\n")
## Gentoo vs Adelie - TIDAK SIGNIFIKAN:
for (v in not_sig_g) cat(sprintf(" [ ] %s\n", v))
cat("\nOthers vs Adelie - SIGNIFIKAN:\n")
##
## Others vs Adelie - SIGNIFIKAN:
for (v in sig_others) cat(sprintf(" [*] %s\n", v))
## [*] islandOthers
## [*] sexMALE
cat("Others vs Adelie - TIDAK SIGNIFIKAN:\n")
## Others vs Adelie - TIDAK SIGNIFIKAN:
for (v in not_sig_o) cat(sprintf(" [ ] %s\n", v))
## [ ] (Intercept)
## [ ] log_culmen_length
## [ ] log_culmen_depth
## [ ] log_flipper_length
## [ ] log_body_mass
## [ ] islandDream
cat("STEP 8: EVALUASI MLR\n")
## STEP 8: EVALUASI MLR
pred_mlr <- predict(mlr_model, type = "class")
conf_mlr <- table(Actual = df$species, Predicted = pred_mlr)
cat("Confusion Matrix (MLR):\n")
## Confusion Matrix (MLR):
print(conf_mlr)
## Predicted
## Actual Adelie Gentoo Others
## Adelie 146 0 0
## Gentoo 0 119 0
## Others 0 0 68
n_total <- nrow(df)
n_correct_mlr <- sum(diag(conf_mlr))
n_wrong_mlr <- n_total - n_correct_mlr
APER_mlr <- n_wrong_mlr / n_total
acc_mlr <- n_correct_mlr / n_total
cat(sprintf("\nTotal observasi : %d\n", n_total))
##
## Total observasi : 333
cat(sprintf("Benar diklasifikasi : %d\n", n_correct_mlr))
## Benar diklasifikasi : 333
cat(sprintf("Salah diklasifikasi : %d\n", n_wrong_mlr))
## Salah diklasifikasi : 0
cat(sprintf("APER (MLR) : %.4f (%.2f%%)\n", APER_mlr, APER_mlr*100))
## APER (MLR) : 0.0000 (0.00%)
cat(sprintf("Akurasi (MLR) : %.4f (%.2f%%)\n", acc_mlr, acc_mlr*100))
## Akurasi (MLR) : 1.0000 (100.00%)
cat("\nAPER per kelas (MLR):\n")
##
## APER per kelas (MLR):
for (sp in levels(df$species)) {
n_sp <- sum(df$species == sp)
n_err <- n_sp - conf_mlr[sp, sp]
aper_sp <- n_err / n_sp
cat(sprintf(" %-8s : %d salah dari %d (APER = %.4f)\n",
sp, n_err, n_sp, aper_sp))
}
## Adelie : 0 salah dari 146 (APER = 0.0000)
## Gentoo : 0 salah dari 119 (APER = 0.0000)
## Others : 0 salah dari 68 (APER = 0.0000)
pred_prob <- predict(mlr_model, type = "probs")
pred_prob_df <- data.frame(
round(as.data.frame(pred_prob), 4),
Actual = df$species
)
cat("Contoh 10 baris pertama Predicted Probabilities:\n")
## Contoh 10 baris pertama Predicted Probabilities:
print(head(pred_prob_df, 10))
## Adelie Gentoo Others Actual
## 1 1 0 0 Adelie
## 2 1 0 0 Adelie
## 3 1 0 0 Adelie
## 4 1 0 0 Adelie
## 5 1 0 0 Adelie
## 6 1 0 0 Adelie
## 7 1 0 0 Adelie
## 8 1 0 0 Adelie
## 9 1 0 0 Adelie
## 10 1 0 0 Adelie
# Plot RRR
rrr_long <- data.frame(
Predictor = rep(colnames(rrr), 2),
Contrast = rep(c("Gentoo vs Adelie", "Others vs Adelie"), each = ncol(rrr)),
RRR = c(rrr["Gentoo", ], rrr["Others", ]),
P_value = c(p_values["Gentoo", ], p_values["Others", ])
) %>%
filter(Predictor != "(Intercept)") %>%
mutate(
Significant = ifelse(P_value < 0.05, "Significant", "Not Significant"),
log_RRR = log10(pmax(RRR, 1e-30))
)
p_rrr <- ggplot(rrr_long, aes(x = Predictor, y = log_RRR, fill = Significant)) +
geom_col(position = "dodge", alpha = 0.85) +
geom_hline(yintercept = 0, linetype = "dashed", color = "gray40") +
facet_wrap(~Contrast, ncol = 1) +
scale_fill_manual(values = c("Significant" = "#2ecc71",
"Not Significant" = "#bdc3c7")) +
labs(title = "MLR: log10(Relative Risk Ratios) per Predictor",
subtitle = "log10(RRR) > 0 = meningkatkan risk vs Adelie",
x = "Predictor", y = "log10(RRR)") +
theme_minimal() +
theme(axis.text.x = element_text(angle = 45, hjust = 1))
png("plot_mlr_rrr.png", width = 1100, height = 800, res = 110)
print(p_rrr)
dev.off()
## png
## 2
img <- readPNG("plot_mlr_rrr.png")
plot.new(); rasterImage(img, 0, 0, 1, 1)
# Plot Average Marginal Effects untuk prediktor numerik
ame_long <- ame_numeric %>%
pivot_longer(cols = starts_with("AME_"),
names_to = "Species",
names_prefix = "AME_",
values_to = "AME")
p_ame <- ggplot(ame_long, aes(x = Predictor, y = AME, fill = Species)) +
geom_col(position = "dodge", alpha = 0.85) +
geom_hline(yintercept = 0, linetype = "dashed", color = "gray40") +
scale_fill_manual(values = c("Adelie" = "#E07B54",
"Gentoo" = "#457B9D",
"Others" = "#6A994E")) +
labs(title = "Average Marginal Effects (AME) - Prediktor Numerik",
subtitle = "Perubahan probabilitas per 1 unit kenaikan log(X)",
x = "Predictor (log-transformed)", y = "AME (delta probabilitas)") +
theme_minimal() +
theme(axis.text.x = element_text(angle = 45, hjust = 1))
png("plot_mlr_ame.png", width = 1100, height = 600, res = 110)
print(p_ame)
dev.off()
## png
## 2
img2 <- readPNG("plot_mlr_ame.png")
plot.new(); rasterImage(img2, 0, 0, 1, 1)
cat("
RINGKASAN AKHIR MULTINOMIAL LOGISTIC REGRESSION
Dataset : Palmer Archipelago Penguin
Reference category: Adelie
ASUMSI
")
##
## RINGKASAN AKHIR MULTINOMIAL LOGISTIC REGRESSION
## Dataset : Palmer Archipelago Penguin
## Reference category: Adelie
##
## ASUMSI
cat(sprintf(" [v] Linearity of Log-Odds : %s\n", status_lin))
## [v] Linearity of Log-Odds : TERPENUHI (via log-transformation)
cat(sprintf(" [%s] No Multicollinearity : %s\n",
ifelse(grepl('TERPENUHI', status_mc), 'v', 'x'),
status_mc))
## [v] No Multicollinearity : TERPENUHI
cat(sprintf(" [v] Independence of Observations : %s\n", status_ind))
## [v] Independence of Observations : TERPENUHI (cross-sectional design)
cat("\nUJI SERENTAK (Likelihood Ratio Test)\n")
##
## UJI SERENTAK (Likelihood Ratio Test)
cat(sprintf(" G^2 = %.4f | df = %d | p-value = %.6f\n",
G2, df_lrt, p_lrt))
## G^2 = 701.7252 | df = 14 | p-value = 0.000000
cat(sprintf(" McFadden Pseudo R^2 = %.4f\n",
pseudo_r2))
## McFadden Pseudo R^2 = 1.0000
cat(sprintf(" Kesimpulan: %s\n",
ifelse(p_lrt < 0.05,
"Model SIGNIFIKAN secara serentak (Tolak H0)",
"Model TIDAK signifikan secara serentak (Gagal Tolak H0)")))
## Kesimpulan: Model SIGNIFIKAN secara serentak (Tolak H0)
cat("\nEVALUASI MODEL\n")
##
## EVALUASI MODEL
cat(sprintf(" Akurasi : %.2f%%\n", acc_mlr*100))
## Akurasi : 100.00%
cat(sprintf(" APER : %.2f%%\n", APER_mlr*100))
## APER : 0.00%
cat("\nPREDIKTOR SIGNIFIKAN PARSIAL (p < 0.05)\n")
##
## PREDIKTOR SIGNIFIKAN PARSIAL (p < 0.05)
cat(" Gentoo vs Adelie : ",
paste(sig_gentoo, collapse = ", "), "\n")
## Gentoo vs Adelie : (Intercept), log_culmen_length, log_culmen_depth, log_flipper_length, log_body_mass, islandDream, islandOthers, sexMALE
cat(" Others vs Adelie : ",
paste(sig_others, collapse = ", "), "\n")
## Others vs Adelie : islandOthers, sexMALE