1. Load Library

library(tidyverse)
library(caret)
library(pROC)
library(MASS)
library(corrplot)
library(car)
library(DescTools)

2. Import Data

data <- read.csv("D:/Dataset Anmul.csv", sep = ";", stringsAsFactors = FALSE)

3. Cek Struktur Data

str(data)
## 'data.frame':    7043 obs. of  21 variables:
##  $ customerID      : chr  "7590-VHVEG" "5575-GNVDE" "3668-QPYBK" "7795-CFOCW" ...
##  $ gender          : chr  "Female" "Male" "Male" "Male" ...
##  $ SeniorCitizen   : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ Partner         : chr  "Yes" "No" "No" "No" ...
##  $ Dependents      : chr  "No" "No" "No" "No" ...
##  $ tenure          : int  1 34 2 45 2 8 22 10 28 62 ...
##  $ PhoneService    : chr  "No" "Yes" "Yes" "No" ...
##  $ MultipleLines   : chr  "No phone service" "No" "No" "No phone service" ...
##  $ InternetService : chr  "DSL" "DSL" "DSL" "DSL" ...
##  $ OnlineSecurity  : chr  "No" "Yes" "Yes" "Yes" ...
##  $ OnlineBackup    : chr  "Yes" "No" "Yes" "No" ...
##  $ DeviceProtection: chr  "No" "Yes" "No" "Yes" ...
##  $ TechSupport     : chr  "No" "No" "No" "Yes" ...
##  $ StreamingTV     : chr  "No" "No" "No" "No" ...
##  $ StreamingMovies : chr  "No" "No" "No" "No" ...
##  $ Contract        : chr  "Month-to-month" "One year" "Month-to-month" "One year" ...
##  $ PaperlessBilling: chr  "Yes" "No" "Yes" "No" ...
##  $ PaymentMethod   : chr  "Electronic check" "Mailed check" "Mailed check" "Bank transfer (automatic)" ...
##  $ MonthlyCharges  : num  29.9 57 53.9 42.3 70.7 ...
##  $ TotalCharges    : num  29.9 1889.5 108.2 1840.8 151.7 ...
##  $ Churn           : chr  "No" "No" "Yes" "No" ...

4. Konversi Tipe Data

data$Churn <- factor(data$Churn, levels = c("No", "Yes"))
cat_vars <- c("gender","SeniorCitizen","Partner","Dependents","PhoneService",
              "MultipleLines","InternetService","OnlineSecurity","OnlineBackup",
              "DeviceProtection","TechSupport","StreamingTV","StreamingMovies",
              "Contract","PaperlessBilling","PaymentMethod")
data[cat_vars] <- lapply(data[cat_vars], factor)
num_vars <- c("tenure", "MonthlyCharges", "TotalCharges")
data$SeniorCitizen <- as.numeric(data$SeniorCitizen)
data[num_vars] <- lapply(data[num_vars], function(x) as.numeric(as.character(x)))

5. Tangani Missing Value

cat("Jumlah NA sebelum pembersihan:", sum(is.na(data)), "\n")
## Jumlah NA sebelum pembersihan: 11
data <- na.omit(data)
cat("Jumlah baris setelah na.omit:", nrow(data), "\n")
## Jumlah baris setelah na.omit: 7032

6. Tangani Outliers

for (col in num_vars) {
  Q1 <- quantile(data[[col]], 0.25)
  Q3 <- quantile(data[[col]], 0.75)
  IQR <- Q3 - Q1
  lower <- Q1 - 1.5 * IQR
  upper <- Q3 + 1.5 * IQR
  data <- data[data[[col]] >= lower & data[[col]] <= upper, ]
}

7. Eksplorasi Data (EDA)

ggplot(data, aes(Churn)) + geom_bar(fill="steelblue") + ggtitle("Distribusi Churn")

ggplot(data, aes(x = tenure, fill = Churn)) + 
  geom_histogram(bins = 30, position="dodge") + 
  ggtitle("Distribusi Tenure")

ggplot(data, aes(x = Churn, y = TotalCharges, fill = Churn)) + 
  geom_boxplot() + ggtitle("Total Charges by Churn")

cor_mat <- cor(data[, num_vars])
corrplot(cor_mat, method = "circle")

8. Split Data

set.seed(123)
train_idx <- createDataPartition(data$Churn, p = 0.8, list = FALSE)
train <- data[train_idx, ]
test <- data[-train_idx, ]

9. Logistic Regression

logit_model <- glm(Churn ~ tenure + MonthlyCharges + TotalCharges + InternetService + OnlineSecurity +
                     TechSupport + Contract, data=train, family=binomial)
summary(logit_model)
## 
## Call:
## glm(formula = Churn ~ tenure + MonthlyCharges + TotalCharges + 
##     InternetService + OnlineSecurity + TechSupport + Contract, 
##     family = binomial, data = train)
## 
## Coefficients: (2 not defined because of singularities)
##                                     Estimate Std. Error z value Pr(>|z|)    
## (Intercept)                       -1.722e-01  2.197e-01  -0.784 0.433147    
## tenure                            -6.031e-02  7.015e-03  -8.598  < 2e-16 ***
## MonthlyCharges                     5.811e-03  4.093e-03   1.420 0.155712    
## TotalCharges                       3.568e-04  7.967e-05   4.479 7.50e-06 ***
## InternetServiceFiber optic         5.975e-01  1.563e-01   3.822 0.000133 ***
## InternetServiceNo                 -1.116e+00  1.750e-01  -6.379 1.78e-10 ***
## OnlineSecurityNo internet service         NA         NA      NA       NA    
## OnlineSecurityYes                 -5.369e-01  9.371e-02  -5.729 1.01e-08 ***
## TechSupportNo internet service            NA         NA      NA       NA    
## TechSupportYes                    -4.334e-01  9.716e-02  -4.461 8.16e-06 ***
## ContractOne year                  -7.775e-01  1.170e-01  -6.644 3.05e-11 ***
## ContractTwo year                  -1.592e+00  1.953e-01  -8.150 3.64e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 6517.2  on 5626  degrees of freedom
## Residual deviance: 4781.3  on 5617  degrees of freedom
## AIC: 4801.3
## 
## Number of Fisher Scoring iterations: 6
anova(glm(Churn ~ 1, data=train, family=binomial), logit_model, test="Chisq")
## Analysis of Deviance Table
## 
## Model 1: Churn ~ 1
## Model 2: Churn ~ tenure + MonthlyCharges + TotalCharges + InternetService + 
##     OnlineSecurity + TechSupport + Contract
##   Resid. Df Resid. Dev Df Deviance  Pr(>Chi)    
## 1      5626     6517.2                          
## 2      5617     4781.3  9   1735.9 < 2.2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
logit_prob <- predict(logit_model, newdata=test, type="response")
logit_class <- factor(ifelse(logit_prob > 0.5, "Yes", "No"), levels=c("No", "Yes"))
confusionMatrix(logit_class, test$Churn, positive="Yes")
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction  No Yes
##        No  917 170
##        Yes 115 203
##                                           
##                Accuracy : 0.7972          
##                  95% CI : (0.7752, 0.8179)
##     No Information Rate : 0.7345          
##     P-Value [Acc > NIR] : 2.778e-08       
##                                           
##                   Kappa : 0.4542          
##                                           
##  Mcnemar's Test P-Value : 0.001381        
##                                           
##             Sensitivity : 0.5442          
##             Specificity : 0.8886          
##          Pos Pred Value : 0.6384          
##          Neg Pred Value : 0.8436          
##              Prevalence : 0.2655          
##          Detection Rate : 0.1445          
##    Detection Prevalence : 0.2263          
##       Balanced Accuracy : 0.7164          
##                                           
##        'Positive' Class : Yes             
## 
roc_logit <- roc(test$Churn, logit_prob)
plot(roc_logit, main="ROC - Logistic Regression")

auc(roc_logit)
## Area under the curve: 0.8429

10. Odds Ratio

odds_ratios <- exp(coef(logit_model))
confint_odds <- exp(confint(logit_model))
OR_table <- data.frame(
  Variable = names(odds_ratios),
  OddsRatio = odds_ratios,
  CI_lower = confint_odds[,1],
  CI_upper = confint_odds[,2]
)
print(OR_table)
##                                                            Variable OddsRatio
## (Intercept)                                             (Intercept) 0.8417899
## tenure                                                       tenure 0.9414690
## MonthlyCharges                                       MonthlyCharges 1.0058274
## TotalCharges                                           TotalCharges 1.0003569
## InternetServiceFiber optic               InternetServiceFiber optic 1.8175059
## InternetServiceNo                                 InternetServiceNo 0.3274452
## OnlineSecurityNo internet service OnlineSecurityNo internet service        NA
## OnlineSecurityYes                                 OnlineSecurityYes 0.5845462
## TechSupportNo internet service       TechSupportNo internet service        NA
## TechSupportYes                                       TechSupportYes 0.6482912
## ContractOne year                                   ContractOne year 0.4595379
## ContractTwo year                                   ContractTwo year 0.2035833
##                                    CI_lower  CI_upper
## (Intercept)                       0.5468193 1.2943837
## tenure                            0.9283717 0.9542605
## MonthlyCharges                    0.9977986 1.0139418
## TotalCharges                      1.0002025 1.0005150
## InternetServiceFiber optic        1.3388848 2.4715567
## InternetServiceNo                 0.2317902 0.4604725
## OnlineSecurityNo internet service        NA        NA
## OnlineSecurityYes                 0.4860416 0.7018958
## TechSupportNo internet service           NA        NA
## TechSupportYes                    0.5355008 0.7838174
## ContractOne year                  0.3644314 0.5767241
## ContractTwo year                  0.1370771 0.2952625
OR_table_plot <- OR_table %>% filter(Variable != "(Intercept)")
ggplot(OR_table_plot, aes(x = reorder(Variable, OddsRatio), y = OddsRatio)) +
  geom_bar(stat = "identity", fill = "skyblue") +
  geom_errorbar(aes(ymin = CI_lower, ymax = CI_upper), width = 0.2) +
  coord_flip() + theme_minimal() +
  labs(title = "Visualisasi Odds Ratio", x = "Variabel", y = "Odds Ratio") +
  geom_hline(yintercept = 1, linetype="dashed", color="red")

11. LDA Model

lda_model <- lda(Churn ~ tenure + MonthlyCharges + TotalCharges + InternetService + OnlineSecurity + 
                   TechSupport + Contract, data=train)
lda_pred <- predict(lda_model, newdata=test)
lda_class <- lda_pred$class
lda_prob <- lda_pred$posterior[, "Yes"]
confusionMatrix(lda_class, test$Churn, positive="Yes")
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction  No Yes
##        No  916 172
##        Yes 116 201
##                                           
##                Accuracy : 0.795           
##                  95% CI : (0.7729, 0.8158)
##     No Information Rate : 0.7345          
##     P-Value [Acc > NIR] : 8.013e-08       
##                                           
##                   Kappa : 0.4479          
##                                           
##  Mcnemar's Test P-Value : 0.001192        
##                                           
##             Sensitivity : 0.5389          
##             Specificity : 0.8876          
##          Pos Pred Value : 0.6341          
##          Neg Pred Value : 0.8419          
##              Prevalence : 0.2655          
##          Detection Rate : 0.1431          
##    Detection Prevalence : 0.2256          
##       Balanced Accuracy : 0.7132          
##                                           
##        'Positive' Class : Yes             
## 
roc_lda <- roc(test$Churn, lda_prob)
plot(roc_lda, main="ROC - LDA")

auc(roc_lda)
## Area under the curve: 0.8362

12. Simpan Hasil Prediksi

logit_results <- data.frame(
  ID = test$customerID,
  Actual = test$Churn,
  Predicted_Logit = logit_class,
  Prob_Logit = round(logit_prob, 4)
)
lda_results <- data.frame(
  Predicted_LDA = lda_class,
  Prob_LDA = round(lda_prob, 4)
)
hasil_prediksi <- cbind(logit_results, lda_results)
write.csv(hasil_prediksi, file = "D:/hasil_prediksi_anmul.csv", row.names = FALSE)
cat("✅ Hasil prediksi disimpan ke: D:/hasil_prediksi_anmul.csv\n")
## ✅ Hasil prediksi disimpan ke: D:/hasil_prediksi_anmul.csv