# =====================================================
# PRAKTIKUM 12 - REGRESI LOGISTIK BINER DI R
# =====================================================
# =====================================================
# PRAKTIKUM REGRESI LOGISTIK BINER
# Menggunakan Data.xlsx
# =====================================================

# 1. Install package (jalankan sekali saja)
#install.packages("readxl")
#install.packages("pscl")
#install.packages("ResourceSelection")

# 2. Memanggil library
library(readxl)
## Warning: package 'readxl' was built under R version 4.5.3
library(pscl)
## Warning: package 'pscl' was built under R version 4.5.3
## Classes and Methods for R originally developed in the
## Political Science Computational Laboratory
## Department of Political Science
## Stanford University (2002-2015),
## by and under the direction of Simon Jackman.
## hurdle and zeroinfl functions by Achim Zeileis.
library(ResourceSelection)
## Warning: package 'ResourceSelection' was built under R version 4.5.3
## ResourceSelection 0.3-6   2023-06-27
# 3. Import data Excel
data <- read_excel("D:/S2 IPB/Semester 3/ASPRAK/Responsi STA1513 Ganjil/Pertemuan 12/Data.xlsx")
# Melihat data
head(data)
## # A tibble: 6 × 3
##    Umur `Jenis Kelamin` `Status Pembelian`
##   <dbl>           <dbl>              <dbl>
## 1    22               0                  0
## 2    40               1                  1
## 3    35               1                  1
## 4    36               0                  1
## 5    45               1                  1
## 6    31               0                  0
str(data)
## tibble [29 × 3] (S3: tbl_df/tbl/data.frame)
##  $ Umur            : num [1:29] 22 40 35 36 45 31 50 20 30 33 ...
##  $ Jenis Kelamin   : num [1:29] 0 1 1 0 1 0 1 0 1 0 ...
##  $ Status Pembelian: num [1:29] 0 1 1 1 1 0 1 0 1 1 ...
# =====================================================
# 4. Mengubah variabel kategorik menjadi factor
# =====================================================

# Misal:
# SP = status pembelian
# JK = jenis kelamin

# Cek nama kolom
names(data)
## [1] "Umur"             "Jenis Kelamin"    "Status Pembelian"
# Ubah nama kolom agar mudah dipanggil
names(data) <- c("Umur", "JK", "SP")

# Ubah menjadi factor
data$SP <- factor(data$SP, levels = c(0, 1),
                  labels = c("Tidak", "Ya"))

data$JK <- factor(data$JK, levels = c(0, 1),
                  labels = c("Laki-laki", "Perempuan"))
# 3. Model Regresi Logistik Biner
# Model regresi logistik biner
model <- glm(SP ~ Umur + JK,
             data = data,
             family = binomial)

summary(model)
## 
## Call:
## glm(formula = SP ~ Umur + JK, family = binomial, data = data)
## 
## Coefficients:
##             Estimate Std. Error z value Pr(>|z|)   
## (Intercept) -2.39062    1.45121  -1.647  0.09949 . 
## Umur         0.05453    0.04416   1.235  0.21694   
## JKPerempuan  2.63872    0.99578   2.650  0.00805 **
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 38.496  on 28  degrees of freedom
## Residual deviance: 26.401  on 26  degrees of freedom
## AIC: 32.401
## 
## Number of Fisher Scoring iterations: 5
# 4. Model Logit
coef(model)
## (Intercept)        Umur JKPerempuan 
## -2.39062412  0.05452874  2.63872358
# 5. Odds Ratio
OR <- exp(coef(model))
OR
## (Intercept)        Umur JKPerempuan 
##  0.09157251  1.05604282 13.99532827
# 6. Confidence Interval Odds Ratio
exp(confint(model))
## Waiting for profiling to be done...
##                  2.5 %     97.5 %
## (Intercept) 0.00334252   1.220979
## Umur        0.97389176   1.168616
## JKPerempuan 2.30841618 129.636777
# 7. Uji Rasio Likelihood
model_null <- glm(SP ~ 1, data = data, family = binomial)
anova(model_null, model, test = "Chisq")
## Analysis of Deviance Table
## 
## Model 1: SP ~ 1
## Model 2: SP ~ Umur + JK
##   Resid. Df Resid. Dev Df Deviance Pr(>Chi)   
## 1        28     38.496                        
## 2        26     26.401  2   12.095 0.002364 **
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
# 8. Uji Wald
summary(model)
## 
## Call:
## glm(formula = SP ~ Umur + JK, family = binomial, data = data)
## 
## Coefficients:
##             Estimate Std. Error z value Pr(>|z|)   
## (Intercept) -2.39062    1.45121  -1.647  0.09949 . 
## Umur         0.05453    0.04416   1.235  0.21694   
## JKPerempuan  2.63872    0.99578   2.650  0.00805 **
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 38.496  on 28  degrees of freedom
## Residual deviance: 26.401  on 26  degrees of freedom
## AIC: 32.401
## 
## Number of Fisher Scoring iterations: 5
# 9. Pseudo R-Square
pR2(model)
## fitting null model for pseudo-r2
##         llh     llhNull          G2    McFadden        r2ML        r2CU 
## -13.2004682 -19.2480394  12.0951424   0.3141915   0.3410278   0.4640795
# 10. Uji Hosmer-Lemeshow
hoslem.test(as.numeric(data$SP) - 1, fitted(model), g = 10)
## 
##  Hosmer and Lemeshow goodness of fit (GOF) test
## 
## data:  as.numeric(data$SP) - 1, fitted(model)
## X-squared = 7.5302, df = 8, p-value = 0.4807
# 11. Prediksi Peluang
data$probabilitas <- predict(model, type = "response")
# 12. Klasifikasi dengan cut-off 0.5
data$prediksi <- ifelse(data$probabilitas >= 0.5, "Ya", "Tidak")
# 13. Confusion Matrix
table(Aktual = data$SP, Prediksi = data$prediksi)
##        Prediksi
## Aktual  Tidak Ya
##   Tidak     8  3
##   Ya        4 14
# 14. Akurasi
akurasi <- mean(as.character(data$SP) == data$prediksi)
akurasi
## [1] 0.7586207
# 15. Menampilkan Data Akhir
data
## # A tibble: 29 × 5
##     Umur JK        SP    probabilitas prediksi
##    <dbl> <fct>     <fct>        <dbl> <chr>   
##  1    22 Laki-laki Tidak        0.233 Tidak   
##  2    40 Perempuan Ya           0.919 Ya      
##  3    35 Perempuan Ya           0.896 Ya      
##  4    36 Laki-laki Ya           0.395 Tidak   
##  5    45 Perempuan Ya           0.937 Ya      
##  6    31 Laki-laki Tidak        0.332 Tidak   
##  7    50 Perempuan Ya           0.951 Ya      
##  8    20 Laki-laki Tidak        0.214 Tidak   
##  9    30 Perempuan Ya           0.868 Ya      
## 10    33 Laki-laki Ya           0.356 Tidak   
## # ℹ 19 more rows
# =====================================================
# VISUALISASI REGRESI LOGISTIK BINER
# =====================================================

library(ggplot2)

# ==========================================
# 1. Scatter Plot + Kurva Probabilitas
# ==========================================

ggplot(data, aes(x = Umur,
                 y = probabilitas,
                 color = JK)) +
  geom_point(size = 3) +
  geom_smooth(method = "glm",
              method.args = list(family = "binomial"),
              se = FALSE) +
  labs(title = "Kurva Regresi Logistik",
       x = "Umur",
       y = "Probabilitas Membeli") +
  theme_minimal()
## `geom_smooth()` using formula = 'y ~ x'
## Warning in eval(family$initialize): non-integer #successes in a binomial glm!
## Warning in eval(family$initialize): non-integer #successes in a binomial glm!

# ==========================================
# 2. Visualisasi Status Aktual
# ==========================================

ggplot(data, aes(x = Umur,
                 fill = SP)) +
  geom_histogram(binwidth = 5,
                 alpha = 0.7,
                 position = "identity") +
  labs(title = "Distribusi Status Pembelian Berdasarkan Umur",
       x = "Umur",
       y = "Frekuensi") +
  theme_minimal()

# ==========================================
# 3. Barplot Jenis Kelamin vs Status Pembelian
# ==========================================

ggplot(data, aes(x = JK,
                 fill = SP)) +
  geom_bar(position = "dodge") +
  labs(title = "Status Pembelian Berdasarkan Jenis Kelamin",
       x = "Jenis Kelamin",
       y = "Jumlah") +
  theme_minimal()

# ==========================================
# 4. Confusion Matrix Heatmap
# ==========================================

cm <- as.data.frame(table(Aktual = data$SP,
                          Prediksi = data$prediksi))

ggplot(cm, aes(x = Prediksi,
               y = Aktual,
               fill = Freq)) +
  geom_tile() +
  geom_text(aes(label = Freq),
            color = "white",
            size = 6) +
  labs(title = "Confusion Matrix",
       x = "Prediksi",
       y = "Aktual") +
  theme_minimal()

# ==========================================
# 5. Plot Odds Ratio
# ==========================================

OR <- exp(coef(model))

or_data <- data.frame(
  Variabel = names(OR),
  OddsRatio = OR
)

ggplot(or_data[-1,], aes(x = Variabel,
                         y = OddsRatio,
                         fill = Variabel)) +
  geom_bar(stat = "identity") +
  geom_hline(yintercept = 1,
             linetype = "dashed",
             color = "red") +
  labs(title = "Odds Ratio Variabel",
       y = "Odds Ratio",
       x = "Variabel") +
  theme_minimal()

# ==========================================
# 6. Plot Probabilitas Prediksi
# ==========================================

ggplot(data,
       aes(x = probabilitas,
           fill = SP)) +
  geom_histogram(binwidth = 0.1,
                 alpha = 0.7,
                 position = "identity") +
  labs(title = "Distribusi Probabilitas Prediksi",
       x = "Probabilitas",
       y = "Frekuensi") +
  theme_minimal()