FIX REGLOGBINYES

Library
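
The analysis below relies on functions from the following packages; a minimal setup sketch, with the package-to-function mapping inferred from the calls used in this document:

library(skimr)      # skim()
library(ggplot2)    # plots
library(visdat)     # vis_miss()
library(ggcorrplot) # ggcorrplot()
library(dplyr)      # slice_sample()
library(caret)      # createDataPartition(), confusionMatrix()
library(rstanarm)   # stan_glm(), posterior_predict(), posterior_interval()
library(bayesplot)  # mcmc_dens(), mcmc_trace(), mcmc_acf()
library(car)        # vif()
library(arm)        # binnedplot()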

Input Data

# importing the dataset
df <- read.csv("C:\\Users\\Ghonniyu\\Downloads\\breast_cancer.csv")

Data description:

  • Y = breast cancer class

  • X1 = Clump thickness

  • X2 = Uniformity of cell size

  • X3 = Uniformity of cell shape

  • X4 = Marginal adhesion

  • X5 = Single epithelial cell size

  • X6 = Bare nuclei

  • X7 = Bland chromatin

  • X8 = Normal nucleoli

  • X9 = Mitoses

# studying the dataset
head(df)
##   X1 X2 X3 X4 X5 X6 X7 X8 X9 Class
## 1  5  1  1  1  2  1  3  1  1     2
## 2  5  4  4  5  7 10  3  2  1     2
## 3  3  1  1  1  2  2  3  1  1     2
## 4  6  8  8  1  3  4  3  7  1     2
## 5  4  1  1  3  2  1  3  1  1     2
## 6  8 10 10  8  7 10  9  7  1     4
unique(df$Class)
## [1] 2 4
# recode the binary labels (2 = benign, 4 = malignant) to 0/1

df$Class <- ifelse(df$Class == 2, 0, 1)
skim(df)
Data summary
Name df
Number of rows 683
Number of columns 10
_______________________
Column type frequency:
numeric 10
________________________
Group variables None

Variable type: numeric

skim_variable n_missing complete_rate mean sd p0 p25 p50 p75 p100 hist
X1 0 1 4.44 2.82 1 2 4 6 10 ▇▇▇▃▃
X2 0 1 3.15 3.07 1 1 1 5 10 ▇▂▁▁▂
X3 0 1 3.22 2.99 1 1 1 5 10 ▇▂▁▁▁
X4 0 1 2.83 2.86 1 1 1 4 10 ▇▂▁▁▁
X5 0 1 3.23 2.22 1 2 2 4 10 ▇▂▂▁▁
X6 0 1 3.54 3.64 1 1 1 6 10 ▇▁▁▁▂
X7 0 1 3.45 2.45 1 2 3 5 10 ▇▅▁▂▁
X8 0 1 2.87 3.05 1 1 1 4 10 ▇▁▁▁▁
X9 0 1 1.60 1.73 1 1 1 1 10 ▇▁▁▁▁
Class 0 1 0.35 0.48 0 0 0 1 1 ▇▁▁▁▅
# check for missing values

colSums(is.na(df))
##    X1    X2    X3    X4    X5    X6    X7    X8    X9 Class 
##     0     0     0     0     0     0     0     0     0     0
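
As a quick check that the recode produced the expected 0/1 labels, the class counts can be tabulated (a minimal sketch; the counts themselves are not part of the original output):

# frequency of each recoded class
table(df$Class)
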
library(ggplot2)

ggplot(df, aes(x = Class, fill = Class == 1)) +
  geom_bar() +
  geom_text(stat = "count", aes(label = ..count..), vjust = -0.5) +
  scale_fill_manual(values = c("TRUE" = "red", "FALSE" = "skyblue")) +
  theme_minimal() +
  theme(panel.grid = element_blank()) +
  labs(x = "Cancer Class", y = "Count") +
  guides(fill = "none")  # hide the legend

# using vis_miss function to visually identify missing values
vis_miss(df)

Exploration

# correlation matrix of the predictors (Class excluded)
correlation <- cor(df[, !(names(df) %in% "Class")])

ggcorrplot(correlation, lab = TRUE)

# pairwise scatterplot matrix of all variables
pairs(df,
      cex.labels = 0.3,
      col = "#E7AB79",
      pch = 21,
      main = "Pairwise correlation")

SPLITTING

library(caret)

set.seed(123)  # for reproducibility
index <- createDataPartition(df$Class, p = 0.7, list = FALSE)
train_data <- df[index, ]
test_data <- df[-index, ]
head(train_data)
##    X1 X2 X3 X4 X5 X6 X7 X8 X9 Class
## 4   6  8  8  1  3  4  3  7  1     0
## 5   4  1  1  3  2  1  3  1  1     0
## 6   8 10 10  8  7 10  9  7  1     1
## 7   1  1  1  1  2 10  3  1  1     0
## 8   2  1  2  1  2  1  3  1  1     0
## 10  4  2  1  1  2  1  2  1  1     0
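
As a sanity check on the split (not shown in the original output), the class proportions in the two partitions can be compared; a minimal sketch:

# compare class balance between train and test sets
prop.table(table(train_data$Class))
prop.table(table(test_data$Class))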

MODELING

The model is fit by MCMC with iter = 4000 per chain (rstanarm's default is 2000), half of which is warmup, giving a posterior sample of 8000 draws across the four default chains. rstanarm's default coefficient prior is N(0, 2.5/sd(x_k)); the priors actually specified are listed below.

Data (likelihood): Yi | β0, β1, …, β9 ∼ Bernoulli(πi)

with

logit(πi) = β0 + β1X1 + β2X2 + … + β9X9

Priors:

β0 ∼ N(0.35, 0.48)  Note: follows the mean and standard deviation of the response variable

β1, β2, …, β9 ∼ N(0, 1)

σ ∼ Exp(1)  (passed as prior_aux; the binomial family has no auxiliary parameter, so this prior is effectively unused)

modG <- stan_glm(Class ~ .,
                 family = "binomial",
                 data = train_data,
                 prior_intercept = normal(location = 0.35, scale = 0.48),
                 prior = normal(),
                 prior_aux = exponential(1),
                 iter = 4000)
## 
## SAMPLING FOR MODEL 'bernoulli' NOW (CHAIN 1).
## Chain 1: 
## Chain 1: Gradient evaluation took 0.000129 seconds
## Chain 1: 1000 transitions using 10 leapfrog steps per transition would take 1.29 seconds.
## Chain 1: Adjust your expectations accordingly!
## Chain 1: 
## Chain 1: 
## Chain 1: Iteration:    1 / 4000 [  0%]  (Warmup)
## Chain 1: Iteration:  400 / 4000 [ 10%]  (Warmup)
## Chain 1: Iteration:  800 / 4000 [ 20%]  (Warmup)
## Chain 1: Iteration: 1200 / 4000 [ 30%]  (Warmup)
## Chain 1: Iteration: 1600 / 4000 [ 40%]  (Warmup)
## Chain 1: Iteration: 2000 / 4000 [ 50%]  (Warmup)
## Chain 1: Iteration: 2001 / 4000 [ 50%]  (Sampling)
## Chain 1: Iteration: 2400 / 4000 [ 60%]  (Sampling)
## Chain 1: Iteration: 2800 / 4000 [ 70%]  (Sampling)
## Chain 1: Iteration: 3200 / 4000 [ 80%]  (Sampling)
## Chain 1: Iteration: 3600 / 4000 [ 90%]  (Sampling)
## Chain 1: Iteration: 4000 / 4000 [100%]  (Sampling)
## Chain 1: 
## Chain 1:  Elapsed Time: 2.164 seconds (Warm-up)
## Chain 1:                2.192 seconds (Sampling)
## Chain 1:                4.356 seconds (Total)
## Chain 1: 
## 
## SAMPLING FOR MODEL 'bernoulli' NOW (CHAIN 2).
## Chain 2: 
## Chain 2: Gradient evaluation took 7.3e-05 seconds
## Chain 2: 1000 transitions using 10 leapfrog steps per transition would take 0.73 seconds.
## Chain 2: Adjust your expectations accordingly!
## Chain 2: 
## Chain 2: 
## Chain 2: Iteration:    1 / 4000 [  0%]  (Warmup)
## Chain 2: Iteration:  400 / 4000 [ 10%]  (Warmup)
## Chain 2: Iteration:  800 / 4000 [ 20%]  (Warmup)
## Chain 2: Iteration: 1200 / 4000 [ 30%]  (Warmup)
## Chain 2: Iteration: 1600 / 4000 [ 40%]  (Warmup)
## Chain 2: Iteration: 2000 / 4000 [ 50%]  (Warmup)
## Chain 2: Iteration: 2001 / 4000 [ 50%]  (Sampling)
## Chain 2: Iteration: 2400 / 4000 [ 60%]  (Sampling)
## Chain 2: Iteration: 2800 / 4000 [ 70%]  (Sampling)
## Chain 2: Iteration: 3200 / 4000 [ 80%]  (Sampling)
## Chain 2: Iteration: 3600 / 4000 [ 90%]  (Sampling)
## Chain 2: Iteration: 4000 / 4000 [100%]  (Sampling)
## Chain 2: 
## Chain 2:  Elapsed Time: 2.207 seconds (Warm-up)
## Chain 2:                2.125 seconds (Sampling)
## Chain 2:                4.332 seconds (Total)
## Chain 2: 
## 
## SAMPLING FOR MODEL 'bernoulli' NOW (CHAIN 3).
## Chain 3: 
## Chain 3: Gradient evaluation took 7.7e-05 seconds
## Chain 3: 1000 transitions using 10 leapfrog steps per transition would take 0.77 seconds.
## Chain 3: Adjust your expectations accordingly!
## Chain 3: 
## Chain 3: 
## Chain 3: Iteration:    1 / 4000 [  0%]  (Warmup)
## Chain 3: Iteration:  400 / 4000 [ 10%]  (Warmup)
## Chain 3: Iteration:  800 / 4000 [ 20%]  (Warmup)
## Chain 3: Iteration: 1200 / 4000 [ 30%]  (Warmup)
## Chain 3: Iteration: 1600 / 4000 [ 40%]  (Warmup)
## Chain 3: Iteration: 2000 / 4000 [ 50%]  (Warmup)
## Chain 3: Iteration: 2001 / 4000 [ 50%]  (Sampling)
## Chain 3: Iteration: 2400 / 4000 [ 60%]  (Sampling)
## Chain 3: Iteration: 2800 / 4000 [ 70%]  (Sampling)
## Chain 3: Iteration: 3200 / 4000 [ 80%]  (Sampling)
## Chain 3: Iteration: 3600 / 4000 [ 90%]  (Sampling)
## Chain 3: Iteration: 4000 / 4000 [100%]  (Sampling)
## Chain 3: 
## Chain 3:  Elapsed Time: 2.117 seconds (Warm-up)
## Chain 3:                2.16 seconds (Sampling)
## Chain 3:                4.277 seconds (Total)
## Chain 3: 
## 
## SAMPLING FOR MODEL 'bernoulli' NOW (CHAIN 4).
## Chain 4: 
## Chain 4: Gradient evaluation took 6e-05 seconds
## Chain 4: 1000 transitions using 10 leapfrog steps per transition would take 0.6 seconds.
## Chain 4: Adjust your expectations accordingly!
## Chain 4: 
## Chain 4: 
## Chain 4: Iteration:    1 / 4000 [  0%]  (Warmup)
## Chain 4: Iteration:  400 / 4000 [ 10%]  (Warmup)
## Chain 4: Iteration:  800 / 4000 [ 20%]  (Warmup)
## Chain 4: Iteration: 1200 / 4000 [ 30%]  (Warmup)
## Chain 4: Iteration: 1600 / 4000 [ 40%]  (Warmup)
## Chain 4: Iteration: 2000 / 4000 [ 50%]  (Warmup)
## Chain 4: Iteration: 2001 / 4000 [ 50%]  (Sampling)
## Chain 4: Iteration: 2400 / 4000 [ 60%]  (Sampling)
## Chain 4: Iteration: 2800 / 4000 [ 70%]  (Sampling)
## Chain 4: Iteration: 3200 / 4000 [ 80%]  (Sampling)
## Chain 4: Iteration: 3600 / 4000 [ 90%]  (Sampling)
## Chain 4: Iteration: 4000 / 4000 [100%]  (Sampling)
## Chain 4: 
## Chain 4:  Elapsed Time: 2.194 seconds (Warm-up)
## Chain 4:                2.155 seconds (Sampling)
## Chain 4:                4.349 seconds (Total)
## Chain 4:
print(summary(modG, probs = c(0.025, 0.5, 0.975)), digits = 2)
## 
## Model Info:
##  function:     stan_glm
##  family:       binomial [logit]
##  formula:      Class ~ .
##  algorithm:    sampling
##  sample:       8000 (posterior sample size)
##  priors:       see help('prior_summary')
##  observations: 479
##  predictors:   10
## 
## Estimates:
##               mean   sd     2.5%   50%    97.5%
## (Intercept) -10.41   1.43 -13.57 -10.28  -7.96 
## X1            0.38   0.18   0.04   0.37   0.75 
## X2            0.35   0.29  -0.18   0.34   0.95 
## X3            0.21   0.29  -0.38   0.22   0.77 
## X4            0.39   0.17   0.07   0.39   0.74 
## X5            0.10   0.23  -0.37   0.09   0.56 
## X6            0.51   0.13   0.27   0.51   0.78 
## X7            0.53   0.26   0.05   0.52   1.06 
## X8            0.11   0.14  -0.17   0.11   0.40 
## X9            0.67   0.32   0.08   0.66   1.35 
## 
## Fit Diagnostics:
##            mean   sd   2.5%   50%   97.5%
## mean_PPD 0.37   0.01 0.35   0.37  0.38   
## 
## The mean_ppd is the sample average posterior predictive distribution of the outcome variable (for details see help('summary.stanreg')).
## 
## MCMC diagnostics
##               mcse Rhat n_eff
## (Intercept)   0.02 1.00  7698
## X1            0.00 1.00  8812
## X2            0.00 1.00  6693
## X3            0.00 1.00  6814
## X4            0.00 1.00 10636
## X5            0.00 1.00  8379
## X6            0.00 1.00  9488
## X7            0.00 1.00 10427
## X8            0.00 1.00 10500
## X9            0.00 1.00  8777
## mean_PPD      0.00 1.00 10358
## log-posterior 0.04 1.00  3378
## 
## For each parameter, mcse is Monte Carlo standard error, n_eff is a crude measure of effective sample size, and Rhat is the potential scale reduction factor on split chains (at convergence Rhat=1).
vif(modG)
##       X1       X2       X3       X4       X5       X6       X7       X8 
## 1.267988 2.390018 2.590973 1.131896 1.452702 1.067683 1.198123 1.365804 
##       X9 
## 1.055011
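
The priors rstanarm actually applied (including any internal rescaling) can be verified against the specification above with prior_summary(); a minimal sketch:

# report the priors used for the intercept, coefficients, and auxiliary parameter
prior_summary(modG)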

Reading the 2.5%-97.5% credible intervals (CI), the significant parameters are X1, X4, X6, X7, and X9: for each of these the 95% CI does not contain 0, while the intervals for X2, X3, X5, and X8 all straddle 0.
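
The same selection can be read off programmatically with rstanarm's posterior_interval(); a minimal sketch:

ci95 <- posterior_interval(modG, prob = 0.95)
# parameters whose 95% credible interval excludes zero (the intercept also qualifies)
rownames(ci95)[ci95[, "2.5%"] * ci95[, "97.5%"] > 0]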

modG2 <- stan_glm(Class ~ X1 + X4 + X6 + X7 + X9,
                  family = "binomial",
                  data = train_data,  # fit on the training data only
                  prior_intercept = normal(location = 0.35, scale = 0.48),
                  prior = normal(),
                  prior_aux = exponential(1),
                  iter = 4000)
## 
## SAMPLING FOR MODEL 'bernoulli' NOW (CHAIN 1).
## Chain 1: 
## Chain 1: Gradient evaluation took 7.1e-05 seconds
## Chain 1: 1000 transitions using 10 leapfrog steps per transition would take 0.71 seconds.
## Chain 1: Adjust your expectations accordingly!
## Chain 1: 
## Chain 1: 
## Chain 1: Iteration:    1 / 4000 [  0%]  (Warmup)
## Chain 1: Iteration:  400 / 4000 [ 10%]  (Warmup)
## Chain 1: Iteration:  800 / 4000 [ 20%]  (Warmup)
## Chain 1: Iteration: 1200 / 4000 [ 30%]  (Warmup)
## Chain 1: Iteration: 1600 / 4000 [ 40%]  (Warmup)
## Chain 1: Iteration: 2000 / 4000 [ 50%]  (Warmup)
## Chain 1: Iteration: 2001 / 4000 [ 50%]  (Sampling)
## Chain 1: Iteration: 2400 / 4000 [ 60%]  (Sampling)
## Chain 1: Iteration: 2800 / 4000 [ 70%]  (Sampling)
## Chain 1: Iteration: 3200 / 4000 [ 80%]  (Sampling)
## Chain 1: Iteration: 3600 / 4000 [ 90%]  (Sampling)
## Chain 1: Iteration: 4000 / 4000 [100%]  (Sampling)
## Chain 1: 
## Chain 1:  Elapsed Time: 1.515 seconds (Warm-up)
## Chain 1:                1.269 seconds (Sampling)
## Chain 1:                2.784 seconds (Total)
## Chain 1: 
## 
## SAMPLING FOR MODEL 'bernoulli' NOW (CHAIN 2).
## Chain 2: 
## Chain 2: Gradient evaluation took 6.8e-05 seconds
## Chain 2: 1000 transitions using 10 leapfrog steps per transition would take 0.68 seconds.
## Chain 2: Adjust your expectations accordingly!
## Chain 2: 
## Chain 2: 
## Chain 2: Iteration:    1 / 4000 [  0%]  (Warmup)
## Chain 2: Iteration:  400 / 4000 [ 10%]  (Warmup)
## Chain 2: Iteration:  800 / 4000 [ 20%]  (Warmup)
## Chain 2: Iteration: 1200 / 4000 [ 30%]  (Warmup)
## Chain 2: Iteration: 1600 / 4000 [ 40%]  (Warmup)
## Chain 2: Iteration: 2000 / 4000 [ 50%]  (Warmup)
## Chain 2: Iteration: 2001 / 4000 [ 50%]  (Sampling)
## Chain 2: Iteration: 2400 / 4000 [ 60%]  (Sampling)
## Chain 2: Iteration: 2800 / 4000 [ 70%]  (Sampling)
## Chain 2: Iteration: 3200 / 4000 [ 80%]  (Sampling)
## Chain 2: Iteration: 3600 / 4000 [ 90%]  (Sampling)
## Chain 2: Iteration: 4000 / 4000 [100%]  (Sampling)
## Chain 2: 
## Chain 2:  Elapsed Time: 1.173 seconds (Warm-up)
## Chain 2:                1.339 seconds (Sampling)
## Chain 2:                2.512 seconds (Total)
## Chain 2: 
## 
## SAMPLING FOR MODEL 'bernoulli' NOW (CHAIN 3).
## Chain 3: 
## Chain 3: Gradient evaluation took 6.2e-05 seconds
## Chain 3: 1000 transitions using 10 leapfrog steps per transition would take 0.62 seconds.
## Chain 3: Adjust your expectations accordingly!
## Chain 3: 
## Chain 3: 
## Chain 3: Iteration:    1 / 4000 [  0%]  (Warmup)
## Chain 3: Iteration:  400 / 4000 [ 10%]  (Warmup)
## Chain 3: Iteration:  800 / 4000 [ 20%]  (Warmup)
## Chain 3: Iteration: 1200 / 4000 [ 30%]  (Warmup)
## Chain 3: Iteration: 1600 / 4000 [ 40%]  (Warmup)
## Chain 3: Iteration: 2000 / 4000 [ 50%]  (Warmup)
## Chain 3: Iteration: 2001 / 4000 [ 50%]  (Sampling)
## Chain 3: Iteration: 2400 / 4000 [ 60%]  (Sampling)
## Chain 3: Iteration: 2800 / 4000 [ 70%]  (Sampling)
## Chain 3: Iteration: 3200 / 4000 [ 80%]  (Sampling)
## Chain 3: Iteration: 3600 / 4000 [ 90%]  (Sampling)
## Chain 3: Iteration: 4000 / 4000 [100%]  (Sampling)
## Chain 3: 
## Chain 3:  Elapsed Time: 1.955 seconds (Warm-up)
## Chain 3:                1.579 seconds (Sampling)
## Chain 3:                3.534 seconds (Total)
## Chain 3: 
## 
## SAMPLING FOR MODEL 'bernoulli' NOW (CHAIN 4).
## Chain 4: 
## Chain 4: Gradient evaluation took 6.2e-05 seconds
## Chain 4: 1000 transitions using 10 leapfrog steps per transition would take 0.62 seconds.
## Chain 4: Adjust your expectations accordingly!
## Chain 4: 
## Chain 4: 
## Chain 4: Iteration:    1 / 4000 [  0%]  (Warmup)
## Chain 4: Iteration:  400 / 4000 [ 10%]  (Warmup)
## Chain 4: Iteration:  800 / 4000 [ 20%]  (Warmup)
## Chain 4: Iteration: 1200 / 4000 [ 30%]  (Warmup)
## Chain 4: Iteration: 1600 / 4000 [ 40%]  (Warmup)
## Chain 4: Iteration: 2000 / 4000 [ 50%]  (Warmup)
## Chain 4: Iteration: 2001 / 4000 [ 50%]  (Sampling)
## Chain 4: Iteration: 2400 / 4000 [ 60%]  (Sampling)
## Chain 4: Iteration: 2800 / 4000 [ 70%]  (Sampling)
## Chain 4: Iteration: 3200 / 4000 [ 80%]  (Sampling)
## Chain 4: Iteration: 3600 / 4000 [ 90%]  (Sampling)
## Chain 4: Iteration: 4000 / 4000 [100%]  (Sampling)
## Chain 4: 
## Chain 4:  Elapsed Time: 1.313 seconds (Warm-up)
## Chain 4:                1.222 seconds (Sampling)
## Chain 4:                2.535 seconds (Total)
## Chain 4:
print(summary(modG2, probs = c(0.025, 0.5, 0.975)), digits = 2)
## 
## Model Info:
##  function:     stan_glm
##  family:       binomial [logit]
##  formula:      Class ~ X1 + X4 + X6 + X7 + X9
##  algorithm:    sampling
##  sample:       8000 (posterior sample size)
##  priors:       see help('prior_summary')
##  observations: 479
##  predictors:   6
## 
## Estimates:
##               mean   sd     2.5%   50%    97.5%
## (Intercept) -10.91   1.35 -13.74 -10.81  -8.51 
## X1            0.68   0.15   0.41   0.68   1.00 
## X4            0.50   0.16   0.21   0.50   0.83 
## X6            0.60   0.12   0.38   0.59   0.84 
## X7            0.75   0.20   0.39   0.74   1.18 
## X9            0.80   0.31   0.22   0.79   1.44 
## 
## Fit Diagnostics:
##            mean   sd   2.5%   50%   97.5%
## mean_PPD 0.37   0.01 0.35   0.37  0.39   
## 
## The mean_ppd is the sample average posterior predictive distribution of the outcome variable (for details see help('summary.stanreg')).
## 
## MCMC diagnostics
##               mcse Rhat n_eff
## (Intercept)   0.02 1.00 7163 
## X1            0.00 1.00 7802 
## X4            0.00 1.00 8355 
## X6            0.00 1.00 8896 
## X7            0.00 1.00 8661 
## X9            0.00 1.00 7683 
## mean_PPD      0.00 1.00 8628 
## log-posterior 0.03 1.00 3279 
## 
## For each parameter, mcse is Monte Carlo standard error, n_eff is a crude measure of effective sample size, and Rhat is the potential scale reduction factor on split chains (at convergence Rhat=1).
vif(modG2)
##       X1       X4       X6       X7       X9 
## 1.068071 1.024002 1.058219 1.011166 1.019663

MODEL DIAGNOSTICS

# posterior densities of the parameters
mcmc_dens(as.array(modG2), facet_args = list(dir = "v"))

# traceplots for assessing mixing and convergence
mcmc_trace(as.array(modG2), facet_args = list(dir = "v"))

# autocorrelation of the chains
mcmc_acf(as.array(modG2))

# posterior predictive checks
pp_check(modG2, plotfun = "dens_overlay")

pp_check(modG2, plotfun = "hist")
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

pp_check(modG2, plotfun = "stat", stat = "mean")
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

# binned residual plot for the logistic fit
binnedplot(fitted(modG2), resid(modG2, type = "response"),
           xlab = "Fitted Values", ylab = "Residuals",
           main = "Binned Residual Plot", col.pts = "blue", col.int = "red")

The traceplots show that:

  • All chains overlap and are evenly dispersed, with no meaningful upward or downward trend, indicating that each chain has reached its stationary distribution and the model has converged well.

  • The fluctuations are random and dense, a further sign that the samples reflect a stable posterior distribution.

  • No chain deviates noticeably from the others (none is "off-track"), so the parameter estimates from this model can be trusted; a numeric check is sketched below.
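
To back the visual inspection with numbers, the Rhat and effective-sample-size columns can be pulled from the model summary (a minimal sketch; the column names are assumed to match the summary output shown above):

# Rhat near 1 and large n_eff support the convergence conclusion
round(summary(modG2)[, c("Rhat", "n_eff")], 3)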

ODDS RATIO

posterior_coefs <- as.matrix(modG2)  # posterior draws of the coefficients
odds_ratios <- exp(posterior_coefs)

head(odds_ratios)
##           parameters
## iterations  (Intercept)       X1       X4       X6       X7       X9
##       [1,] 1.228671e-04 1.568652 1.343994 1.705776 1.933886 2.120013
##       [2,] 8.299641e-05 1.610245 1.471254 1.708378 1.942708 2.142728
##       [3,] 1.107196e-05 2.297673 1.562910 1.702388 2.694424 1.355485
##       [4,] 5.428535e-05 1.492731 1.757840 1.888475 1.695420 3.230878
##       [5,] 3.295645e-05 1.609277 1.672955 1.893904 1.984371 2.633592
##       [6,] 8.167541e-05 1.857742 1.626414 1.777937 1.687802 1.666309
# posterior mean of each coefficient
posterior_means <- colMeans(as.matrix(modG2))

# compute the odds ratios
odds_ratios <- exp(posterior_means)

# display the results
odds_ratios
##  (Intercept)           X1           X4           X6           X7           X9 
## 1.834944e-05 1.983130e+00 1.655357e+00 1.814076e+00 2.118335e+00 2.215852e+00
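
Because exp() is monotone, 95% credible intervals on the odds-ratio scale are obtained by exponentiating the coefficient intervals; a minimal sketch:

# 95% credible intervals for the odds ratios
exp(posterior_interval(modG2, prob = 0.95))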

MODEL PREDICTION

fitted_value <- posterior_predict(modG2,seed = 123)
dim(fitted_value)
## [1] 8000  479
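
(The 8000 rows are the retained posterior draws: 4 chains × 2000 post-warmup iterations; the 479 columns correspond to the 479 training observations.)
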
predict_new2 <- posterior_predict(modG2,seed = 123,newdata = df |> slice_sample(n = 6))
dim(predict_new2)
## [1] 8000    6
mcmc_dens(predict_new2,facet_args = list(dir="v"))

posterior_interval(predict_new2,prob = 0.95)
##   2.5% 97.5%
## 1    0     0
## 2    0     0
## 3    1     1
## 4    0     1
## 5    1     1
## 6    0     0
apply(predict_new2,2,median)
## 1 2 3 4 5 6 
## 0 0 1 0 1 0
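
posterior_predict() returns 0/1 draws from the posterior predictive distribution, so the medians above are hard class labels. For event probabilities, rstanarm's posterior_epred() gives draws of πi directly; a minimal sketch for a subsample of the same kind:

set.seed(123)
new_obs <- df |> slice_sample(n = 6)
prob_new <- posterior_epred(modG2, newdata = new_obs)
colMeans(prob_new)  # posterior mean P(Y = 1) for each observation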

MODEL PERFORMANCE

pred_probs <- posterior_predict(modG2, newdata = df)
# proportion of draws equal to 1 per observation, thresholded at 0.5
pred_classes <- ifelse(colMeans(pred_probs) > 0.5, 1, 0)

Training Data Performance

library(caret)

# predictions on the training set
fitted_train <- posterior_predict(modG2, newdata = train_data, seed = 123)
pred_train_class <- round(colMeans(fitted_train))  # majority vote across draws

# convert to factors
pred_classes_train <- factor(pred_train_class)
actual_classes_train <- factor(train_data$Class)

# confusion matrix & metrics
conf_mat_train <- confusionMatrix(pred_classes_train, actual_classes_train)
conf_mat_train
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction   0   1
##          0 298   8
##          1   8 165
##                                           
##                Accuracy : 0.9666          
##                  95% CI : (0.9463, 0.9808)
##     No Information Rate : 0.6388          
##     P-Value [Acc > NIR] : <2e-16          
##                                           
##                   Kappa : 0.9276          
##                                           
##  Mcnemar's Test P-Value : 1               
##                                           
##             Sensitivity : 0.9739          
##             Specificity : 0.9538          
##          Pos Pred Value : 0.9739          
##          Neg Pred Value : 0.9538          
##              Prevalence : 0.6388          
##          Detection Rate : 0.6221          
##    Detection Prevalence : 0.6388          
##       Balanced Accuracy : 0.9638          
##                                           
##        'Positive' Class : 0               
## 
# extract the metrics
accuracy_train <- conf_mat_train$overall["Accuracy"]
specificity_train <- conf_mat_train$byClass["Specificity"]
recall_train <- conf_mat_train$byClass["Sensitivity"]
precision_train <- conf_mat_train$byClass["Precision"]
f1_score_train <- conf_mat_train$byClass["F1"]

# display the training metrics
cat("=== Training Data Metrics ===\n")
## === Training Data Metrics ===
cat("Accuracy:", round(accuracy_train, 4), "\n")
## Accuracy: 0.9666
cat("Specificity:", round(specificity_train, 4), "\n")
## Specificity: 0.9538
cat("Recall (Sensitivity):", round(recall_train, 4), "\n")
## Recall (Sensitivity): 0.9739
cat("Precision:", round(precision_train, 4), "\n")
## Precision: 0.9739
cat("F1-Score:", round(f1_score_train, 4), "\n\n")
## F1-Score: 0.9739
# predictions on the test set
fitted_test <- posterior_predict(modG2, newdata = test_data, seed = 123)
pred_test_class <- round(colMeans(fitted_test))

# convert to factors
pred_classes_test <- factor(pred_test_class)
actual_classes_test <- factor(test_data$Class)

# confusion matrix & metrics
conf_mat_test <- confusionMatrix(pred_classes_test, actual_classes_test)
conf_mat_test
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction   0   1
##          0 135   3
##          1   3  63
##                                           
##                Accuracy : 0.9706          
##                  95% CI : (0.9371, 0.9891)
##     No Information Rate : 0.6765          
##     P-Value [Acc > NIR] : <2e-16          
##                                           
##                   Kappa : 0.9328          
##                                           
##  Mcnemar's Test P-Value : 1               
##                                           
##             Sensitivity : 0.9783          
##             Specificity : 0.9545          
##          Pos Pred Value : 0.9783          
##          Neg Pred Value : 0.9545          
##              Prevalence : 0.6765          
##          Detection Rate : 0.6618          
##    Detection Prevalence : 0.6765          
##       Balanced Accuracy : 0.9664          
##                                           
##        'Positive' Class : 0               
## 
# extract the metrics
accuracy_test <- conf_mat_test$overall["Accuracy"]
specificity_test <- conf_mat_test$byClass["Specificity"]
recall_test <- conf_mat_test$byClass["Sensitivity"]
precision_test <- conf_mat_test$byClass["Precision"]
f1_score_test <- conf_mat_test$byClass["F1"]

# display the test metrics
cat("=== Test Data Metrics ===\n")
## === Test Data Metrics ===
cat("Accuracy:", accuracy_test, "\n")
## Accuracy: 0.9705882
cat("Specificity:", specificity_test, "\n")
## Specificity: 0.9545455
cat("Recall (Sensitivity):", recall_test, "\n")
## Recall (Sensitivity): 0.9782609
cat("Precision:", precision_test, "\n")
## Precision: 0.9782609
cat("F1-Score:", f1_score_test, "\n")
## F1-Score: 0.9782609

Both the training data and the test data show very good model performance: accuracy around 97%, specificity around 95%, sensitivity around 97%, and precision around 97%.
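
These figures follow directly from the confusion matrices. For the test set, for example: accuracy = (135 + 63)/204 ≈ 0.9706, sensitivity = 135/(135 + 3) ≈ 0.9783, and specificity = 63/(63 + 3) ≈ 0.9545 (with class 0 as the positive class).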

Full Data Performance

library(caret)

# convert to factors if not already
pred_classes <- factor(pred_classes)
actual_classes <- factor(df$Class)

# Confusion matrix
conf_mat <- confusionMatrix(pred_classes, actual_classes)
conf_mat
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction   0   1
##          0 433  11
##          1  11 228
##                                           
##                Accuracy : 0.9678          
##                  95% CI : (0.9516, 0.9797)
##     No Information Rate : 0.6501          
##     P-Value [Acc > NIR] : <2e-16          
##                                           
##                   Kappa : 0.9292          
##                                           
##  Mcnemar's Test P-Value : 1               
##                                           
##             Sensitivity : 0.9752          
##             Specificity : 0.9540          
##          Pos Pred Value : 0.9752          
##          Neg Pred Value : 0.9540          
##              Prevalence : 0.6501          
##          Detection Rate : 0.6340          
##    Detection Prevalence : 0.6501          
##       Balanced Accuracy : 0.9646          
##                                           
##        'Positive' Class : 0               
## 
# extract the metrics
accuracy <- conf_mat$overall["Accuracy"]
specificity <- conf_mat$byClass["Specificity"]
recall <- conf_mat$byClass["Sensitivity"]
precision <- conf_mat$byClass["Precision"]
f1_score<- conf_mat$byClass["F1"]

# display the full-data metrics
cat("=== Full Data Metrics ===\n")
## === Full Data Metrics ===
cat("Accuracy:", accuracy, "\n")
## Accuracy: 0.9677892
cat("Specificity:", specificity, "\n")
## Specificity: 0.9539749
cat("Recall (Sensitivity):", recall, "\n")
## Recall (Sensitivity): 0.9752252
cat("Precision:", precision, "\n")
## Precision: 0.9752252
cat("F1-Score:", f1_score, "\n")
## F1-Score: 0.9752252

CONCLUSION

A binary logistic regression model was obtained via the Bayesian approach for the breast cancer case study, distinguishing benign (0) from malignant (1) cases, with five explanatory variables: clump thickness (X1), marginal adhesion (X4), bare nuclei (X6), bland chromatin (X7), and mitoses (X9). Based on the performance metrics, the model falls in the very good category for classifying breast cancer cases.