---
title: "Analisis Regresi Logistik Biner"
author: "Tanti Putri Ramadani"
date: "2026-03-09"
output: html_document
---

Import Data

# Read the Titanic CSV export into a data frame.
# NOTE(review): the path is absolute and machine-specific, so this only runs
# where that exact file exists.
data <- read.csv(file = "C:/Users/Ashilah/Downloads/Titanic_data.csv")

# Show the first six rows as a quick sanity check of the import.
head(x = data, n = 6L)
##     X PassengerId Survived Sex    Age       Fare Pclass_1 Pclass_2 Pclass_3
## 1 791         792        0   1 0.2000 0.05074862        0        1        0
## 2 792         793        0   0 0.3500 0.13575256        0        0        1
## 3 793         794        0   1 0.3500 0.05991421        1        0        0
## 4 794         795        0   1 0.3125 0.01541158        0        0        1
## 5 795         796        0   1 0.4875 0.02537431        0        1        0
## 6 796         797        1   0 0.6125 0.05061043        1        0        0
##   Family_size Title_1 Title_2 Title_3 Title_4 Emb_1 Emb_2 Emb_3
## 1           0       1       0       0       0     0     0     1
## 2           1       0       0       0       1     0     0     1
## 3           0       1       0       0       0     1     0     0
## 4           0       1       0       0       0     0     0     1
## 5           0       1       0       0       0     0     0     1
## 6           0       0       1       0       0     0     0     1

Nama Variabel dalam Dataset

# List the column names of the imported data frame.
colnames(data)
##  [1] "X"           "PassengerId" "Survived"    "Sex"         "Age"        
##  [6] "Fare"        "Pclass_1"    "Pclass_2"    "Pclass_3"    "Family_size"
## [11] "Title_1"     "Title_2"     "Title_3"     "Title_4"     "Emb_1"      
## [16] "Emb_2"       "Emb_3"

Struktur Data

# Compact overview: number of rows/columns and the storage type of each column.
str(object = data)
## 'data.frame':    100 obs. of  17 variables:
##  $ X          : int  791 792 793 794 795 796 797 798 799 800 ...
##  $ PassengerId: int  792 793 794 795 796 797 798 799 800 801 ...
##  $ Survived   : int  0 0 0 0 0 1 1 0 0 0 ...
##  $ Sex        : int  1 0 1 1 1 0 0 1 0 1 ...
##  $ Age        : num  0.2 0.35 0.35 0.312 0.487 ...
##  $ Fare       : num  0.0507 0.1358 0.0599 0.0154 0.0254 ...
##  $ Pclass_1   : int  0 0 1 0 0 1 0 0 0 0 ...
##  $ Pclass_2   : int  1 0 0 0 1 0 0 0 0 1 ...
##  $ Pclass_3   : int  0 1 0 1 0 0 1 1 1 0 ...
##  $ Family_size: num  0 1 0 0 0 0 0 0 0.2 0 ...
##  $ Title_1    : int  1 0 1 1 1 0 1 1 1 1 ...
##  $ Title_2    : int  0 0 0 0 0 1 0 0 0 0 ...
##  $ Title_3    : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ Title_4    : int  0 1 0 0 0 0 0 0 0 0 ...
##  $ Emb_1      : int  0 0 1 0 0 0 0 1 0 0 ...
##  $ Emb_2      : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ Emb_3      : int  1 1 0 1 1 1 1 0 1 1 ...
# Per-column descriptive statistics (min, quartiles, mean, max).
summary(object = data)
##        X          PassengerId       Survived         Sex      
##  Min.   :791.0   Min.   :792.0   Min.   :0.00   Min.   :0.00  
##  1st Qu.:815.8   1st Qu.:816.8   1st Qu.:0.00   1st Qu.:0.00  
##  Median :840.5   Median :841.5   Median :0.00   Median :1.00  
##  Mean   :840.5   Mean   :841.5   Mean   :0.36   Mean   :0.65  
##  3rd Qu.:865.2   3rd Qu.:866.2   3rd Qu.:1.00   3rd Qu.:1.00  
##  Max.   :890.0   Max.   :891.0   Max.   :1.00   Max.   :1.00  
##       Age               Fare            Pclass_1       Pclass_2  
##  Min.   :0.00525   Min.   :0.00000   Min.   :0.00   Min.   :0.0  
##  1st Qu.:0.27187   1st Qu.:0.01541   1st Qu.:0.00   1st Qu.:0.0  
##  Median :0.35000   Median :0.02537   Median :0.00   Median :0.0  
##  Mean   :0.35566   Mean   :0.04833   Mean   :0.23   Mean   :0.2  
##  3rd Qu.:0.42656   3rd Qu.:0.05856   3rd Qu.:0.00   3rd Qu.:0.0  
##  Max.   :0.92500   Max.   :0.32180   Max.   :1.00   Max.   :1.0  
##     Pclass_3     Family_size       Title_1        Title_2        Title_3    
##  Min.   :0.00   Min.   :0.000   Min.   :0.00   Min.   :0.00   Min.   :0.00  
##  1st Qu.:0.00   1st Qu.:0.000   1st Qu.:1.00   1st Qu.:0.00   1st Qu.:0.00  
##  Median :1.00   Median :0.000   Median :1.00   Median :0.00   Median :0.00  
##  Mean   :0.57   Mean   :0.104   Mean   :0.77   Mean   :0.01   Mean   :0.08  
##  3rd Qu.:1.00   3rd Qu.:0.100   3rd Qu.:1.00   3rd Qu.:0.00   3rd Qu.:0.00  
##  Max.   :1.00   Max.   :1.000   Max.   :1.00   Max.   :1.00   Max.   :1.00  
##     Title_4         Emb_1          Emb_2          Emb_3     
##  Min.   :0.00   Min.   :0.00   Min.   :0.00   Min.   :0.00  
##  1st Qu.:0.00   1st Qu.:0.00   1st Qu.:0.00   1st Qu.:0.00  
##  Median :0.00   Median :0.00   Median :0.00   Median :1.00  
##  Mean   :0.14   Mean   :0.21   Mean   :0.04   Mean   :0.74  
##  3rd Qu.:0.00   3rd Qu.:0.00   3rd Qu.:0.00   3rd Qu.:1.00  
##  Max.   :1.00   Max.   :1.00   Max.   :1.00   Max.   :1.00

Cek Missing Value

# Count the missing values in every column.
colSums(x = is.na(data), na.rm = FALSE)
##           X PassengerId    Survived         Sex         Age        Fare 
##           0           0           0           0           0           0 
##    Pclass_1    Pclass_2    Pclass_3 Family_size     Title_1     Title_2 
##           0           0           0           0           0           0 
##     Title_3     Title_4       Emb_1       Emb_2       Emb_3 
##           0           0           0           0           0

Membersihkan data dari missing value

# Drop every row that contains at least one missing value.
data <- stats::na.omit(object = data)

Korelasi Variabel Numerik

# Keep only the numeric columns. vapply() always returns a logical mask,
# whereas sapply() returns an empty list for a zero-column data frame and
# would break the subsetting.
num_data <- data[vapply(data, is.numeric, logical(1))]

# A correlation matrix is only meaningful with at least two numeric columns.
if (ncol(num_data) > 1) {
  corr_matrix <- cor(num_data, use = "complete.obs")

  # Colour-coded upper triangle of the pairwise correlations.
  corrplot(
    corr_matrix,
    method = "color",
    type = "upper",
    tl.cex = 0.8
  )
}


Model Regresi Logistik

Model logistik:

\[ \ln\left(\frac{p}{1-p}\right) = \beta_0 + \beta_1 X_1 + \beta_2 X_2 + \cdots + \beta_k X_k \]

Model akan menggunakan semua variabel numerik sebagai prediktor.

# Every numeric column except the response is used as a predictor.
# NOTE(review): this keeps the identifier-like columns X and PassengerId and
# every level of each dummy group (Pclass_*, Title_*, Emb_*); the fitted
# summary reports 3 coefficients as not defined because of singularities.
# Consider dropping identifiers and one reference level per group.
predictors <- setdiff(names(num_data), "Survived")

# Assemble `Survived ~ x1 + x2 + ...` from the predictor names.
formula_model <- reformulate(predictors, response = "Survived")

# Binary logistic regression fitted on the cleaned data.
model_logit <- glm(formula = formula_model, family = binomial, data = data)

summary(model_logit)
## 
## Call:
## glm(formula = formula_model, family = binomial, data = data)
## 
## Coefficients: (3 not defined because of singularities)
##               Estimate Std. Error z value Pr(>|z|)    
## (Intercept)  4.629e+00  2.400e+03   0.002  0.99846    
## X            4.563e-03  1.308e-02   0.349  0.72712    
## PassengerId         NA         NA      NA       NA    
## Sex         -6.381e+00  1.483e+00  -4.301  1.7e-05 ***
## Age         -2.168e+00  4.467e+00  -0.485  0.62735    
## Fare         2.836e+01  1.405e+01   2.019  0.04352 *  
## Pclass_1     1.555e+00  1.086e+00   1.432  0.15225    
## Pclass_2    -5.604e-02  1.057e+00  -0.053  0.95772    
## Pclass_3            NA         NA      NA       NA    
## Family_size -1.741e+01  5.660e+00  -3.077  0.00209 ** 
## Title_1      3.852e+00  1.502e+00   2.564  0.01034 *  
## Title_2      1.567e+01  2.400e+03   0.007  0.99479    
## Title_3      1.179e+01  3.644e+00   3.234  0.00122 ** 
## Title_4             NA         NA      NA       NA    
## Emb_1       -7.170e+00  2.400e+03  -0.003  0.99762    
## Emb_2       -6.453e+00  2.400e+03  -0.003  0.99785    
## Emb_3       -9.029e+00  2.400e+03  -0.004  0.99700    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 130.684  on 99  degrees of freedom
## Residual deviance:  51.135  on 86  degrees of freedom
## AIC: 79.135
## 
## Number of Fisher Scoring iterations: 15

Odds Ratio

# Exponentiate the fitted coefficients to express them as odds ratios,
# one row per model term (terms without an estimate stay NA).
odds_ratio <- data.frame(
  Variabel = names(stats::coef(model_logit)),
  Odds_Ratio = exp(stats::coef(model_logit))
)

odds_ratio
##                Variabel   Odds_Ratio
## (Intercept) (Intercept) 1.023736e+02
## X                     X 1.004573e+00
## PassengerId PassengerId           NA
## Sex                 Sex 1.694074e-03
## Age                 Age 1.143600e-01
## Fare               Fare 2.063109e+12
## Pclass_1       Pclass_1 4.737205e+00
## Pclass_2       Pclass_2 9.454967e-01
## Pclass_3       Pclass_3           NA
## Family_size Family_size 2.739101e-08
## Title_1         Title_1 4.709295e+01
## Title_2         Title_2 6.400800e+06
## Title_3         Title_3 1.315294e+05
## Title_4         Title_4           NA
## Emb_1             Emb_1 7.692061e-04
## Emb_2             Emb_2 1.575650e-03
## Emb_3             Emb_3 1.198812e-04

Prediksi Probabilitas

# Fitted probabilities on the response scale for the rows used to fit the model.
prob <- predict(model_logit, type = "response")

# Classify as 1 when the fitted probability exceeds the 0.5 cut-off, else 0.
prediksi <- ifelse(prob > 0.5, 1, 0)

# seq_len() stays empty for zero rows, whereas 1:nrow(data) would give c(1, 0)
# and make data.frame() fail on mismatched lengths.
hasil_prediksi <- data.frame(
  Observasi = seq_len(nrow(data)),
  Probabilitas = round(prob, 4),
  Prediksi = prediksi
)

head(hasil_prediksi)
##   Observasi Probabilitas Prediksi
## 1         1       0.0855        0
## 2         2       0.0000        0
## 3         3       0.7396        1
## 4         4       0.0280        0
## 5         5       0.0242        0
## 6         6       1.0000        1

Confusion Matrix

# Cross-tabulate predicted against observed classes.
# Both factors get the same explicit 0/1 levels so the level sets always
# match, even if one class never occurs in the predictions or the outcomes.
# NOTE(review): no `positive` argument is given and the printed result shows
# 'Positive' Class : 0, so sensitivity/specificity describe class 0 —
# confirm that this is the intended reference class.
confusionMatrix(
  data = factor(prediksi, levels = c(0, 1)),
  reference = factor(data$Survived, levels = c(0, 1))
)
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction  0  1
##          0 60  6
##          1  4 30
##                                          
##                Accuracy : 0.9            
##                  95% CI : (0.8238, 0.951)
##     No Information Rate : 0.64           
##     P-Value [Acc > NIR] : 2.814e-09      
##                                          
##                   Kappa : 0.7803         
##                                          
##  Mcnemar's Test P-Value : 0.7518         
##                                          
##             Sensitivity : 0.9375         
##             Specificity : 0.8333         
##          Pos Pred Value : 0.9091         
##          Neg Pred Value : 0.8824         
##              Prevalence : 0.6400         
##          Detection Rate : 0.6000         
##    Detection Prevalence : 0.6600         
##       Balanced Accuracy : 0.8854         
##                                          
##        'Positive' Class : 0              
## 

ROC Curve

# Build the ROC object from the observed outcome and the fitted probabilities.
roc_curve <- roc(response = data$Survived, predictor = prob)
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
# Draw the ROC curve of the logistic model in blue.
plot(
  roc_curve,
  main = "ROC Curve Model Logistik",
  col = "blue"
)

# Area under the ROC curve.
auc(roc_curve)
## Area under the curve: 0.9431

Kesimpulan

Analisis ini menggunakan Regresi Logistik Biner untuk memodelkan probabilitas penumpang selamat (variabel dependen `Survived` = 1) berdasarkan variabel-variabel numerik pada data Titanic.

Tahapan analisis:

  1. Import data CSV
  2. Eksplorasi data
  3. Analisis korelasi variabel numerik
  4. Estimasi model regresi logistik
  5. Interpretasi menggunakan Odds Ratio
  6. Evaluasi model menggunakan Confusion Matrix dan ROC Curve