1. Load Library
library(tidyverse)
library(caret)
library(pROC)
library(MASS)
library(corrplot)
library(car)
library(DescTools)
2. Import Data
data <- read.csv("D:/Dataset Anmul.csv", sep = ";", stringsAsFactors = FALSE)
3. Cek Struktur Data
str(data)
## 'data.frame': 7043 obs. of 21 variables:
## $ customerID : chr "7590-VHVEG" "5575-GNVDE" "3668-QPYBK" "7795-CFOCW" ...
## $ gender : chr "Female" "Male" "Male" "Male" ...
## $ SeniorCitizen : int 0 0 0 0 0 0 0 0 0 0 ...
## $ Partner : chr "Yes" "No" "No" "No" ...
## $ Dependents : chr "No" "No" "No" "No" ...
## $ tenure : int 1 34 2 45 2 8 22 10 28 62 ...
## $ PhoneService : chr "No" "Yes" "Yes" "No" ...
## $ MultipleLines : chr "No phone service" "No" "No" "No phone service" ...
## $ InternetService : chr "DSL" "DSL" "DSL" "DSL" ...
## $ OnlineSecurity : chr "No" "Yes" "Yes" "Yes" ...
## $ OnlineBackup : chr "Yes" "No" "Yes" "No" ...
## $ DeviceProtection: chr "No" "Yes" "No" "Yes" ...
## $ TechSupport : chr "No" "No" "No" "Yes" ...
## $ StreamingTV : chr "No" "No" "No" "No" ...
## $ StreamingMovies : chr "No" "No" "No" "No" ...
## $ Contract : chr "Month-to-month" "One year" "Month-to-month" "One year" ...
## $ PaperlessBilling: chr "Yes" "No" "Yes" "No" ...
## $ PaymentMethod : chr "Electronic check" "Mailed check" "Mailed check" "Bank transfer (automatic)" ...
## $ MonthlyCharges : num 29.9 57 53.9 42.3 70.7 ...
## $ TotalCharges : num 29.9 1889.5 108.2 1840.8 151.7 ...
## $ Churn : chr "No" "No" "Yes" "No" ...
4. Konversi Tipe Data
data$Churn <- factor(data$Churn, levels = c("No", "Yes"))
cat_vars <- c("gender","SeniorCitizen","Partner","Dependents","PhoneService",
"MultipleLines","InternetService","OnlineSecurity","OnlineBackup",
"DeviceProtection","TechSupport","StreamingTV","StreamingMovies",
"Contract","PaperlessBilling","PaymentMethod")
data[cat_vars] <- lapply(data[cat_vars], factor)
num_vars <- c("tenure", "MonthlyCharges", "TotalCharges")
data$SeniorCitizen <- as.numeric(data$SeniorCitizen)
data[num_vars] <- lapply(data[num_vars], function(x) as.numeric(as.character(x)))
5. Tangani Missing Value
cat("Jumlah NA sebelum pembersihan:", sum(is.na(data)), "\n")
## Jumlah NA sebelum pembersihan: 11
data <- na.omit(data)
cat("Jumlah baris setelah na.omit:", nrow(data), "\n")
## Jumlah baris setelah na.omit: 7032
6. Tangani Outliers
for (col in num_vars) {
Q1 <- quantile(data[[col]], 0.25)
Q3 <- quantile(data[[col]], 0.75)
IQR <- Q3 - Q1
lower <- Q1 - 1.5 * IQR
upper <- Q3 + 1.5 * IQR
data <- data[data[[col]] >= lower & data[[col]] <= upper, ]
}
7. Eksplorasi Data (EDA)
ggplot(data, aes(Churn)) + geom_bar(fill="steelblue") + ggtitle("Distribusi Churn")

ggplot(data, aes(x = tenure, fill = Churn)) +
geom_histogram(bins = 30, position="dodge") +
ggtitle("Distribusi Tenure")

ggplot(data, aes(x = Churn, y = TotalCharges, fill = Churn)) +
geom_boxplot() + ggtitle("Total Charges by Churn")

cor_mat <- cor(data[, num_vars])
corrplot(cor_mat, method = "circle")

8. Split Data
set.seed(123)
train_idx <- createDataPartition(data$Churn, p = 0.8, list = FALSE)
train <- data[train_idx, ]
test <- data[-train_idx, ]
9. Logistic Regression
logit_model <- glm(Churn ~ tenure + MonthlyCharges + TotalCharges + InternetService + OnlineSecurity +
TechSupport + Contract, data=train, family=binomial)
summary(logit_model)
##
## Call:
## glm(formula = Churn ~ tenure + MonthlyCharges + TotalCharges +
## InternetService + OnlineSecurity + TechSupport + Contract,
## family = binomial, data = train)
##
## Coefficients: (2 not defined because of singularities)
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -1.722e-01 2.197e-01 -0.784 0.433147
## tenure -6.031e-02 7.015e-03 -8.598 < 2e-16 ***
## MonthlyCharges 5.811e-03 4.093e-03 1.420 0.155712
## TotalCharges 3.568e-04 7.967e-05 4.479 7.50e-06 ***
## InternetServiceFiber optic 5.975e-01 1.563e-01 3.822 0.000133 ***
## InternetServiceNo -1.116e+00 1.750e-01 -6.379 1.78e-10 ***
## OnlineSecurityNo internet service NA NA NA NA
## OnlineSecurityYes -5.369e-01 9.371e-02 -5.729 1.01e-08 ***
## TechSupportNo internet service NA NA NA NA
## TechSupportYes -4.334e-01 9.716e-02 -4.461 8.16e-06 ***
## ContractOne year -7.775e-01 1.170e-01 -6.644 3.05e-11 ***
## ContractTwo year -1.592e+00 1.953e-01 -8.150 3.64e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 6517.2 on 5626 degrees of freedom
## Residual deviance: 4781.3 on 5617 degrees of freedom
## AIC: 4801.3
##
## Number of Fisher Scoring iterations: 6
anova(glm(Churn ~ 1, data=train, family=binomial), logit_model, test="Chisq")
## Analysis of Deviance Table
##
## Model 1: Churn ~ 1
## Model 2: Churn ~ tenure + MonthlyCharges + TotalCharges + InternetService +
## OnlineSecurity + TechSupport + Contract
## Resid. Df Resid. Dev Df Deviance Pr(>Chi)
## 1 5626 6517.2
## 2 5617 4781.3 9 1735.9 < 2.2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
logit_prob <- predict(logit_model, newdata=test, type="response")
logit_class <- factor(ifelse(logit_prob > 0.5, "Yes", "No"), levels=c("No", "Yes"))
confusionMatrix(logit_class, test$Churn, positive="Yes")
## Confusion Matrix and Statistics
##
## Reference
## Prediction No Yes
## No 917 170
## Yes 115 203
##
## Accuracy : 0.7972
## 95% CI : (0.7752, 0.8179)
## No Information Rate : 0.7345
## P-Value [Acc > NIR] : 2.778e-08
##
## Kappa : 0.4542
##
## Mcnemar's Test P-Value : 0.001381
##
## Sensitivity : 0.5442
## Specificity : 0.8886
## Pos Pred Value : 0.6384
## Neg Pred Value : 0.8436
## Prevalence : 0.2655
## Detection Rate : 0.1445
## Detection Prevalence : 0.2263
## Balanced Accuracy : 0.7164
##
## 'Positive' Class : Yes
##
roc_logit <- roc(test$Churn, logit_prob)
plot(roc_logit, main="ROC - Logistic Regression")

auc(roc_logit)
## Area under the curve: 0.8429
10. Odds Ratio
odds_ratios <- exp(coef(logit_model))
confint_odds <- exp(confint(logit_model))
OR_table <- data.frame(
Variable = names(odds_ratios),
OddsRatio = odds_ratios,
CI_lower = confint_odds[,1],
CI_upper = confint_odds[,2]
)
print(OR_table)
## Variable OddsRatio
## (Intercept) (Intercept) 0.8417899
## tenure tenure 0.9414690
## MonthlyCharges MonthlyCharges 1.0058274
## TotalCharges TotalCharges 1.0003569
## InternetServiceFiber optic InternetServiceFiber optic 1.8175059
## InternetServiceNo InternetServiceNo 0.3274452
## OnlineSecurityNo internet service OnlineSecurityNo internet service NA
## OnlineSecurityYes OnlineSecurityYes 0.5845462
## TechSupportNo internet service TechSupportNo internet service NA
## TechSupportYes TechSupportYes 0.6482912
## ContractOne year ContractOne year 0.4595379
## ContractTwo year ContractTwo year 0.2035833
## CI_lower CI_upper
## (Intercept) 0.5468193 1.2943837
## tenure 0.9283717 0.9542605
## MonthlyCharges 0.9977986 1.0139418
## TotalCharges 1.0002025 1.0005150
## InternetServiceFiber optic 1.3388848 2.4715567
## InternetServiceNo 0.2317902 0.4604725
## OnlineSecurityNo internet service NA NA
## OnlineSecurityYes 0.4860416 0.7018958
## TechSupportNo internet service NA NA
## TechSupportYes 0.5355008 0.7838174
## ContractOne year 0.3644314 0.5767241
## ContractTwo year 0.1370771 0.2952625
OR_table_plot <- OR_table %>% filter(Variable != "(Intercept)")
ggplot(OR_table_plot, aes(x = reorder(Variable, OddsRatio), y = OddsRatio)) +
geom_bar(stat = "identity", fill = "skyblue") +
geom_errorbar(aes(ymin = CI_lower, ymax = CI_upper), width = 0.2) +
coord_flip() + theme_minimal() +
labs(title = "Visualisasi Odds Ratio", x = "Variabel", y = "Odds Ratio") +
geom_hline(yintercept = 1, linetype="dashed", color="red")

11. LDA Model
lda_model <- lda(Churn ~ tenure + MonthlyCharges + TotalCharges + InternetService + OnlineSecurity +
TechSupport + Contract, data=train)
lda_pred <- predict(lda_model, newdata=test)
lda_class <- lda_pred$class
lda_prob <- lda_pred$posterior[, "Yes"]
confusionMatrix(lda_class, test$Churn, positive="Yes")
## Confusion Matrix and Statistics
##
## Reference
## Prediction No Yes
## No 916 172
## Yes 116 201
##
## Accuracy : 0.795
## 95% CI : (0.7729, 0.8158)
## No Information Rate : 0.7345
## P-Value [Acc > NIR] : 8.013e-08
##
## Kappa : 0.4479
##
## Mcnemar's Test P-Value : 0.001192
##
## Sensitivity : 0.5389
## Specificity : 0.8876
## Pos Pred Value : 0.6341
## Neg Pred Value : 0.8419
## Prevalence : 0.2655
## Detection Rate : 0.1431
## Detection Prevalence : 0.2256
## Balanced Accuracy : 0.7132
##
## 'Positive' Class : Yes
##
roc_lda <- roc(test$Churn, lda_prob)
plot(roc_lda, main="ROC - LDA")

auc(roc_lda)
## Area under the curve: 0.8362
12. Simpan Hasil Prediksi
logit_results <- data.frame(
ID = test$customerID,
Actual = test$Churn,
Predicted_Logit = logit_class,
Prob_Logit = round(logit_prob, 4)
)
lda_results <- data.frame(
Predicted_LDA = lda_class,
Prob_LDA = round(lda_prob, 4)
)
hasil_prediksi <- cbind(logit_results, lda_results)
write.csv(hasil_prediksi, file = "D:/hasil_prediksi_anmul.csv", row.names = FALSE)
cat("✅ Hasil prediksi disimpan ke: D:/hasil_prediksi_anmul.csv\n")
## ✅ Hasil prediksi disimpan ke: D:/hasil_prediksi_anmul.csv