Analisis Churn Pelanggan

1. Load Library

library(tidyverse)
library(caret)
library(pROC)
library(MASS)
library(corrplot)
library(car)
library(DescTools)

2. Import Data

data <- read.csv("D:/Dataset Anmul.csv", sep = ";", stringsAsFactors = FALSE)

3. Cek Struktur Data

str(data)

## 'data.frame':    7043 obs. of  21 variables:
##  $ customerID      : chr  "7590-VHVEG" "5575-GNVDE" "3668-QPYBK" "7795-CFOCW" ...
##  $ gender          : chr  "Female" "Male" "Male" "Male" ...
##  $ SeniorCitizen   : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ Partner         : chr  "Yes" "No" "No" "No" ...
##  $ Dependents      : chr  "No" "No" "No" "No" ...
##  $ tenure          : int  1 34 2 45 2 8 22 10 28 62 ...
##  $ PhoneService    : chr  "No" "Yes" "Yes" "No" ...
##  $ MultipleLines   : chr  "No phone service" "No" "No" "No phone service" ...
##  $ InternetService : chr  "DSL" "DSL" "DSL" "DSL" ...
##  $ OnlineSecurity  : chr  "No" "Yes" "Yes" "Yes" ...
##  $ OnlineBackup    : chr  "Yes" "No" "Yes" "No" ...
##  $ DeviceProtection: chr  "No" "Yes" "No" "Yes" ...
##  $ TechSupport     : chr  "No" "No" "No" "Yes" ...
##  $ StreamingTV     : chr  "No" "No" "No" "No" ...
##  $ StreamingMovies : chr  "No" "No" "No" "No" ...
##  $ Contract        : chr  "Month-to-month" "One year" "Month-to-month" "One year" ...
##  $ PaperlessBilling: chr  "Yes" "No" "Yes" "No" ...
##  $ PaymentMethod   : chr  "Electronic check" "Mailed check" "Mailed check" "Bank transfer (automatic)" ...
##  $ MonthlyCharges  : num  29.9 57 53.9 42.3 70.7 ...
##  $ TotalCharges    : num  29.9 1889.5 108.2 1840.8 151.7 ...
##  $ Churn           : chr  "No" "No" "Yes" "No" ...

4. Konversi Tipe Data

data$Churn <- factor(data$Churn, levels = c("No", "Yes"))
cat_vars <- c("gender","SeniorCitizen","Partner","Dependents","PhoneService",
              "MultipleLines","InternetService","OnlineSecurity","OnlineBackup",
              "DeviceProtection","TechSupport","StreamingTV","StreamingMovies",
              "Contract","PaperlessBilling","PaymentMethod")
data[cat_vars] <- lapply(data[cat_vars], factor)
num_vars <- c("tenure", "MonthlyCharges", "TotalCharges")
data$SeniorCitizen <- as.numeric(data$SeniorCitizen)
data[num_vars] <- lapply(data[num_vars], function(x) as.numeric(as.character(x)))

5. Tangani Missing Value

cat("Jumlah NA sebelum pembersihan:", sum(is.na(data)), "\n")

## Jumlah NA sebelum pembersihan: 11

data <- na.omit(data)
cat("Jumlah baris setelah na.omit:", nrow(data), "\n")

## Jumlah baris setelah na.omit: 7032

6. Tangani Outliers

for (col in num_vars) {
  Q1 <- quantile(data[[col]], 0.25)
  Q3 <- quantile(data[[col]], 0.75)
  IQR <- Q3 - Q1
  lower <- Q1 - 1.5 * IQR
  upper <- Q3 + 1.5 * IQR
  data <- data[data[[col]] >= lower & data[[col]] <= upper, ]
}

7. Eksplorasi Data (EDA)

ggplot(data, aes(Churn)) + geom_bar(fill="steelblue") + ggtitle("Distribusi Churn")

ggplot(data, aes(x = tenure, fill = Churn)) + 
  geom_histogram(bins = 30, position="dodge") + 
  ggtitle("Distribusi Tenure")

ggplot(data, aes(x = Churn, y = TotalCharges, fill = Churn)) + 
  geom_boxplot() + ggtitle("Total Charges by Churn")

cor_mat <- cor(data[, num_vars])
corrplot(cor_mat, method = "circle")

8. Split Data

set.seed(123)
train_idx <- createDataPartition(data$Churn, p = 0.8, list = FALSE)
train <- data[train_idx, ]
test <- data[-train_idx, ]

9. Logistic Regression

logit_model <- glm(Churn ~ tenure + MonthlyCharges + TotalCharges + InternetService + OnlineSecurity +
                     TechSupport + Contract, data=train, family=binomial)
summary(logit_model)

## 
## Call:
## glm(formula = Churn ~ tenure + MonthlyCharges + TotalCharges + 
##     InternetService + OnlineSecurity + TechSupport + Contract, 
##     family = binomial, data = train)
## 
## Coefficients: (2 not defined because of singularities)
##                                     Estimate Std. Error z value Pr(>|z|)    
## (Intercept)                       -1.722e-01  2.197e-01  -0.784 0.433147    
## tenure                            -6.031e-02  7.015e-03  -8.598  < 2e-16 ***
## MonthlyCharges                     5.811e-03  4.093e-03   1.420 0.155712    
## TotalCharges                       3.568e-04  7.967e-05   4.479 7.50e-06 ***
## InternetServiceFiber optic         5.975e-01  1.563e-01   3.822 0.000133 ***
## InternetServiceNo                 -1.116e+00  1.750e-01  -6.379 1.78e-10 ***
## OnlineSecurityNo internet service         NA         NA      NA       NA    
## OnlineSecurityYes                 -5.369e-01  9.371e-02  -5.729 1.01e-08 ***
## TechSupportNo internet service            NA         NA      NA       NA    
## TechSupportYes                    -4.334e-01  9.716e-02  -4.461 8.16e-06 ***
## ContractOne year                  -7.775e-01  1.170e-01  -6.644 3.05e-11 ***
## ContractTwo year                  -1.592e+00  1.953e-01  -8.150 3.64e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 6517.2  on 5626  degrees of freedom
## Residual deviance: 4781.3  on 5617  degrees of freedom
## AIC: 4801.3
## 
## Number of Fisher Scoring iterations: 6

anova(glm(Churn ~ 1, data=train, family=binomial), logit_model, test="Chisq")

## Analysis of Deviance Table
## 
## Model 1: Churn ~ 1
## Model 2: Churn ~ tenure + MonthlyCharges + TotalCharges + InternetService + 
##     OnlineSecurity + TechSupport + Contract
##   Resid. Df Resid. Dev Df Deviance  Pr(>Chi)    
## 1      5626     6517.2                          
## 2      5617     4781.3  9   1735.9 < 2.2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

logit_prob <- predict(logit_model, newdata=test, type="response")
logit_class <- factor(ifelse(logit_prob > 0.5, "Yes", "No"), levels=c("No", "Yes"))
confusionMatrix(logit_class, test$Churn, positive="Yes")

## Confusion Matrix and Statistics
## 
##           Reference
## Prediction  No Yes
##        No  917 170
##        Yes 115 203
##                                           
##                Accuracy : 0.7972          
##                  95% CI : (0.7752, 0.8179)
##     No Information Rate : 0.7345          
##     P-Value [Acc > NIR] : 2.778e-08       
##                                           
##                   Kappa : 0.4542          
##                                           
##  Mcnemar's Test P-Value : 0.001381        
##                                           
##             Sensitivity : 0.5442          
##             Specificity : 0.8886          
##          Pos Pred Value : 0.6384          
##          Neg Pred Value : 0.8436          
##              Prevalence : 0.2655          
##          Detection Rate : 0.1445          
##    Detection Prevalence : 0.2263          
##       Balanced Accuracy : 0.7164          
##                                           
##        'Positive' Class : Yes             
##

roc_logit <- roc(test$Churn, logit_prob)
plot(roc_logit, main="ROC - Logistic Regression")

auc(roc_logit)

## Area under the curve: 0.8429

10. Odds Ratio

odds_ratios <- exp(coef(logit_model))
confint_odds <- exp(confint(logit_model))
OR_table <- data.frame(
  Variable = names(odds_ratios),
  OddsRatio = odds_ratios,
  CI_lower = confint_odds[,1],
  CI_upper = confint_odds[,2]
)
print(OR_table)

##                                                            Variable OddsRatio
## (Intercept)                                             (Intercept) 0.8417899
## tenure                                                       tenure 0.9414690
## MonthlyCharges                                       MonthlyCharges 1.0058274
## TotalCharges                                           TotalCharges 1.0003569
## InternetServiceFiber optic               InternetServiceFiber optic 1.8175059
## InternetServiceNo                                 InternetServiceNo 0.3274452
## OnlineSecurityNo internet service OnlineSecurityNo internet service        NA
## OnlineSecurityYes                                 OnlineSecurityYes 0.5845462
## TechSupportNo internet service       TechSupportNo internet service        NA
## TechSupportYes                                       TechSupportYes 0.6482912
## ContractOne year                                   ContractOne year 0.4595379
## ContractTwo year                                   ContractTwo year 0.2035833
##                                    CI_lower  CI_upper
## (Intercept)                       0.5468193 1.2943837
## tenure                            0.9283717 0.9542605
## MonthlyCharges                    0.9977986 1.0139418
## TotalCharges                      1.0002025 1.0005150
## InternetServiceFiber optic        1.3388848 2.4715567
## InternetServiceNo                 0.2317902 0.4604725
## OnlineSecurityNo internet service        NA        NA
## OnlineSecurityYes                 0.4860416 0.7018958
## TechSupportNo internet service           NA        NA
## TechSupportYes                    0.5355008 0.7838174
## ContractOne year                  0.3644314 0.5767241
## ContractTwo year                  0.1370771 0.2952625

OR_table_plot <- OR_table %>% filter(Variable != "(Intercept)")
ggplot(OR_table_plot, aes(x = reorder(Variable, OddsRatio), y = OddsRatio)) +
  geom_bar(stat = "identity", fill = "skyblue") +
  geom_errorbar(aes(ymin = CI_lower, ymax = CI_upper), width = 0.2) +
  coord_flip() + theme_minimal() +
  labs(title = "Visualisasi Odds Ratio", x = "Variabel", y = "Odds Ratio") +
  geom_hline(yintercept = 1, linetype="dashed", color="red")

11. LDA Model

lda_model <- lda(Churn ~ tenure + MonthlyCharges + TotalCharges + InternetService + OnlineSecurity + 
                   TechSupport + Contract, data=train)
lda_pred <- predict(lda_model, newdata=test)
lda_class <- lda_pred$class
lda_prob <- lda_pred$posterior[, "Yes"]
confusionMatrix(lda_class, test$Churn, positive="Yes")

## Confusion Matrix and Statistics
## 
##           Reference
## Prediction  No Yes
##        No  916 172
##        Yes 116 201
##                                           
##                Accuracy : 0.795           
##                  95% CI : (0.7729, 0.8158)
##     No Information Rate : 0.7345          
##     P-Value [Acc > NIR] : 8.013e-08       
##                                           
##                   Kappa : 0.4479          
##                                           
##  Mcnemar's Test P-Value : 0.001192        
##                                           
##             Sensitivity : 0.5389          
##             Specificity : 0.8876          
##          Pos Pred Value : 0.6341          
##          Neg Pred Value : 0.8419          
##              Prevalence : 0.2655          
##          Detection Rate : 0.1431          
##    Detection Prevalence : 0.2256          
##       Balanced Accuracy : 0.7132          
##                                           
##        'Positive' Class : Yes             
##

roc_lda <- roc(test$Churn, lda_prob)
plot(roc_lda, main="ROC - LDA")

auc(roc_lda)

## Area under the curve: 0.8362

12. Simpan Hasil Prediksi

logit_results <- data.frame(
  ID = test$customerID,
  Actual = test$Churn,
  Predicted_Logit = logit_class,
  Prob_Logit = round(logit_prob, 4)
)
lda_results <- data.frame(
  Predicted_LDA = lda_class,
  Prob_LDA = round(lda_prob, 4)
)
hasil_prediksi <- cbind(logit_results, lda_results)
write.csv(hasil_prediksi, file = "D:/hasil_prediksi_anmul.csv", row.names = FALSE)
cat("✅ Hasil prediksi disimpan ke: D:/hasil_prediksi_anmul.csv\n")

## ✅ Hasil prediksi disimpan ke: D:/hasil_prediksi_anmul.csv

Analisis Churn Pelanggan

Putri Ikrima Al-Fadhilah

1. Load Library

2. Import Data

3. Cek Struktur Data

4. Konversi Tipe Data

5. Tangani Missing Value

6. Tangani Outliers

7. Eksplorasi Data (EDA)

8. Split Data

9. Logistic Regression

10. Odds Ratio

11. LDA Model

12. Simpan Hasil Prediksi