# =====================================================
# PRAKTIKUM 12 - REGRESI LOGISTIK BINER DI R
# =====================================================
# =====================================================
# PRAKTIKUM REGRESI LOGISTIK BINER
# Menggunakan Data.xlsx
# =====================================================
# 1. Install package (jalankan sekali saja)
#install.packages("readxl")
#install.packages("pscl")
#install.packages("ResourceSelection")
# 2. Memanggil library
library(readxl)
## Warning: package 'readxl' was built under R version 4.5.3
library(pscl)
## Warning: package 'pscl' was built under R version 4.5.3
## Classes and Methods for R originally developed in the
## Political Science Computational Laboratory
## Department of Political Science
## Stanford University (2002-2015),
## by and under the direction of Simon Jackman.
## hurdle and zeroinfl functions by Achim Zeileis.
library(ResourceSelection)
## Warning: package 'ResourceSelection' was built under R version 4.5.3
## ResourceSelection 0.3-6 2023-06-27
# 3. Import data Excel
data <- read_excel("D:/S2 IPB/Semester 3/ASPRAK/Responsi STA1513 Ganjil/Pertemuan 12/Data.xlsx")
# Melihat data
head(data)
## # A tibble: 6 × 3
## Umur `Jenis Kelamin` `Status Pembelian`
## <dbl> <dbl> <dbl>
## 1 22 0 0
## 2 40 1 1
## 3 35 1 1
## 4 36 0 1
## 5 45 1 1
## 6 31 0 0
str(data)
## tibble [29 × 3] (S3: tbl_df/tbl/data.frame)
## $ Umur : num [1:29] 22 40 35 36 45 31 50 20 30 33 ...
## $ Jenis Kelamin : num [1:29] 0 1 1 0 1 0 1 0 1 0 ...
## $ Status Pembelian: num [1:29] 0 1 1 1 1 0 1 0 1 1 ...
# =====================================================
# 4. Mengubah variabel kategorik menjadi factor
# =====================================================
# Misal:
# SP = status pembelian
# JK = jenis kelamin
# Cek nama kolom
names(data)
## [1] "Umur" "Jenis Kelamin" "Status Pembelian"
# Ubah nama kolom agar mudah dipanggil
names(data) <- c("Umur", "JK", "SP")
# Ubah menjadi factor
data$SP <- factor(data$SP, levels = c(0, 1),
labels = c("Tidak", "Ya"))
data$JK <- factor(data$JK, levels = c(0, 1),
labels = c("Laki-laki", "Perempuan"))
# 3. Model Regresi Logistik Biner
# Model regresi logistik biner
model <- glm(SP ~ Umur + JK,
data = data,
family = binomial)
summary(model)
##
## Call:
## glm(formula = SP ~ Umur + JK, family = binomial, data = data)
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -2.39062 1.45121 -1.647 0.09949 .
## Umur 0.05453 0.04416 1.235 0.21694
## JKPerempuan 2.63872 0.99578 2.650 0.00805 **
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 38.496 on 28 degrees of freedom
## Residual deviance: 26.401 on 26 degrees of freedom
## AIC: 32.401
##
## Number of Fisher Scoring iterations: 5
# 4. Model Logit
coef(model)
## (Intercept) Umur JKPerempuan
## -2.39062412 0.05452874 2.63872358
# 5. Odds Ratio
OR <- exp(coef(model))
OR
## (Intercept) Umur JKPerempuan
## 0.09157251 1.05604282 13.99532827
# 6. Confidence Interval Odds Ratio
exp(confint(model))
## Waiting for profiling to be done...
## 2.5 % 97.5 %
## (Intercept) 0.00334252 1.220979
## Umur 0.97389176 1.168616
## JKPerempuan 2.30841618 129.636777
# 7. Uji Rasio Likelihood
model_null <- glm(SP ~ 1, data = data, family = binomial)
anova(model_null, model, test = "Chisq")
## Analysis of Deviance Table
##
## Model 1: SP ~ 1
## Model 2: SP ~ Umur + JK
## Resid. Df Resid. Dev Df Deviance Pr(>Chi)
## 1 28 38.496
## 2 26 26.401 2 12.095 0.002364 **
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
# 8. Uji Wald
summary(model)
##
## Call:
## glm(formula = SP ~ Umur + JK, family = binomial, data = data)
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -2.39062 1.45121 -1.647 0.09949 .
## Umur 0.05453 0.04416 1.235 0.21694
## JKPerempuan 2.63872 0.99578 2.650 0.00805 **
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 38.496 on 28 degrees of freedom
## Residual deviance: 26.401 on 26 degrees of freedom
## AIC: 32.401
##
## Number of Fisher Scoring iterations: 5
# 9. Pseudo R-Square
pR2(model)
## fitting null model for pseudo-r2
## llh llhNull G2 McFadden r2ML r2CU
## -13.2004682 -19.2480394 12.0951424 0.3141915 0.3410278 0.4640795
# 10. Uji Hosmer-Lemeshow
hoslem.test(as.numeric(data$SP) - 1, fitted(model), g = 10)
##
## Hosmer and Lemeshow goodness of fit (GOF) test
##
## data: as.numeric(data$SP) - 1, fitted(model)
## X-squared = 7.5302, df = 8, p-value = 0.4807
# 11. Prediksi Peluang
data$probabilitas <- predict(model, type = "response")
# 12. Klasifikasi dengan cut-off 0.5
data$prediksi <- ifelse(data$probabilitas >= 0.5, "Ya", "Tidak")
# 13. Confusion Matrix
table(Aktual = data$SP, Prediksi = data$prediksi)
## Prediksi
## Aktual Tidak Ya
## Tidak 8 3
## Ya 4 14
# 14. Akurasi
akurasi <- mean(as.character(data$SP) == data$prediksi)
akurasi
## [1] 0.7586207
# 15. Menampilkan Data Akhir
data
## # A tibble: 29 × 5
## Umur JK SP probabilitas prediksi
## <dbl> <fct> <fct> <dbl> <chr>
## 1 22 Laki-laki Tidak 0.233 Tidak
## 2 40 Perempuan Ya 0.919 Ya
## 3 35 Perempuan Ya 0.896 Ya
## 4 36 Laki-laki Ya 0.395 Tidak
## 5 45 Perempuan Ya 0.937 Ya
## 6 31 Laki-laki Tidak 0.332 Tidak
## 7 50 Perempuan Ya 0.951 Ya
## 8 20 Laki-laki Tidak 0.214 Tidak
## 9 30 Perempuan Ya 0.868 Ya
## 10 33 Laki-laki Ya 0.356 Tidak
## # ℹ 19 more rows
# =====================================================
# VISUALISASI REGRESI LOGISTIK BINER
# =====================================================
library(ggplot2)
# ==========================================
# 1. Scatter Plot + Kurva Probabilitas
# ==========================================
ggplot(data, aes(x = Umur,
y = probabilitas,
color = JK)) +
geom_point(size = 3) +
geom_smooth(method = "glm",
method.args = list(family = "binomial"),
se = FALSE) +
labs(title = "Kurva Regresi Logistik",
x = "Umur",
y = "Probabilitas Membeli") +
theme_minimal()
## `geom_smooth()` using formula = 'y ~ x'
## Warning in eval(family$initialize): non-integer #successes in a binomial glm!
## Warning in eval(family$initialize): non-integer #successes in a binomial glm!

# ==========================================
# 2. Visualisasi Status Aktual
# ==========================================
ggplot(data, aes(x = Umur,
fill = SP)) +
geom_histogram(binwidth = 5,
alpha = 0.7,
position = "identity") +
labs(title = "Distribusi Status Pembelian Berdasarkan Umur",
x = "Umur",
y = "Frekuensi") +
theme_minimal()

# ==========================================
# 3. Barplot Jenis Kelamin vs Status Pembelian
# ==========================================
ggplot(data, aes(x = JK,
fill = SP)) +
geom_bar(position = "dodge") +
labs(title = "Status Pembelian Berdasarkan Jenis Kelamin",
x = "Jenis Kelamin",
y = "Jumlah") +
theme_minimal()

# ==========================================
# 4. Confusion Matrix Heatmap
# ==========================================
cm <- as.data.frame(table(Aktual = data$SP,
Prediksi = data$prediksi))
ggplot(cm, aes(x = Prediksi,
y = Aktual,
fill = Freq)) +
geom_tile() +
geom_text(aes(label = Freq),
color = "white",
size = 6) +
labs(title = "Confusion Matrix",
x = "Prediksi",
y = "Aktual") +
theme_minimal()

# ==========================================
# 5. Plot Odds Ratio
# ==========================================
OR <- exp(coef(model))
or_data <- data.frame(
Variabel = names(OR),
OddsRatio = OR
)
ggplot(or_data[-1,], aes(x = Variabel,
y = OddsRatio,
fill = Variabel)) +
geom_bar(stat = "identity") +
geom_hline(yintercept = 1,
linetype = "dashed",
color = "red") +
labs(title = "Odds Ratio Variabel",
y = "Odds Ratio",
x = "Variabel") +
theme_minimal()

# ==========================================
# 6. Plot Probabilitas Prediksi
# ==========================================
ggplot(data,
aes(x = probabilitas,
fill = SP)) +
geom_histogram(binwidth = 0.1,
alpha = 0.7,
position = "identity") +
labs(title = "Distribusi Probabilitas Prediksi",
x = "Probabilitas",
y = "Frekuensi") +
theme_minimal()
