# Load library
library(tidyverse)
## Warning: package 'purrr' was built under R version 4.4.3
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr 1.1.4 ✔ readr 2.1.5
## ✔ forcats 1.0.0 ✔ stringr 1.5.1
## ✔ ggplot2 3.5.1 ✔ tibble 3.2.1
## ✔ lubridate 1.9.3 ✔ tidyr 1.3.1
## ✔ purrr 1.0.4
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(ggplot2)
library(corrplot)
## Warning: package 'corrplot' was built under R version 4.4.3
## corrplot 0.95 loaded
library(DataExplorer)
library(naniar)
## Warning: package 'naniar' was built under R version 4.4.3
# Membaca data
data <- read.csv("C:/Users/ASUS/Documents/UNY/MySta/SEM 4/Kecerdasan Buatan/UTS Lana KCB/Large_Customer_Churn_Dataset.csv")
# Struktur data
glimpse(data)
## Rows: 1,000
## Columns: 11
## $ CustomerID <int> 1000001, 1000002, 1000003, 1000004, 1000005, 1000006, 1…
## $ Gender <chr> "Male", "Female", "Male", "Male", "Male", "Female", "Ma…
## $ Age <int> 34, 26, 50, 37, 30, 45, 65, 46, 30, 63, 52, 23, 35, 68,…
## $ Geography <chr> "France", "Spain", "Germany", "Spain", "Spain", "Italy"…
## $ Tenure <int> 14, 14, 57, 34, 53, 57, 20, 11, 16, 51, 31, 55, 6, 56, …
## $ Contract <chr> "Two-year", "Month-to-month", "Two-year", "Month-to-mon…
## $ MonthlyCharges <dbl> 21.58, 27.71, 111.12, 55.49, 62.48, 41.53, 79.71, 79.48…
## $ TotalCharges <dbl> 7933.34, 5869.34, 6321.20, 7956.44, 4922.75, 2601.51, 6…
## $ PaymentMethod <chr> "Bank transfer", "Credit card", "Bank transfer", "Bank …
## $ IsActiveMember <int> 1, 0, 1, 0, 1, 0, 0, 1, 1, 1, 1, 1, 1, 0, 1, 0, 0, 1, 0…
## $ Churn <chr> "No", "Yes", "No", "Yes", "No", "Yes", "Yes", "Yes", "Y…
# Konversi fitur kategorikal menjadi faktor
data <- data %>%
mutate(
Gender = as.factor(Gender),
Geography = as.factor(Geography),
Contract = as.factor(Contract),
PaymentMethod = as.factor(PaymentMethod),
Churn = as.factor(Churn),
IsActiveMember = as.factor(IsActiveMember)
)
# Cek struktur kolom setelah konversi
str(data)
## 'data.frame': 1000 obs. of 11 variables:
## $ CustomerID : int 1000001 1000002 1000003 1000004 1000005 1000006 1000007 1000008 1000009 1000010 ...
## $ Gender : Factor w/ 2 levels "Female","Male": 2 1 2 2 2 1 2 2 2 1 ...
## $ Age : int 34 26 50 37 30 45 65 46 30 63 ...
## $ Geography : Factor w/ 5 levels "France","Germany",..: 1 4 2 4 4 3 5 2 4 2 ...
## $ Tenure : int 14 14 57 34 53 57 20 11 16 51 ...
## $ Contract : Factor w/ 3 levels "Month-to-month",..: 3 1 3 1 3 1 3 2 3 3 ...
## $ MonthlyCharges: num 21.6 27.7 111.1 55.5 62.5 ...
## $ TotalCharges : num 7933 5869 6321 7956 4923 ...
## $ PaymentMethod : Factor w/ 4 levels "Bank transfer",..: 1 2 1 1 3 2 2 1 3 2 ...
## $ IsActiveMember: Factor w/ 2 levels "0","1": 2 1 2 1 2 1 1 2 2 2 ...
## $ Churn : Factor w/ 2 levels "No","Yes": 1 2 1 2 1 2 2 2 2 2 ...
# Pengecekan missing value
colSums(is.na(data))
## CustomerID Gender Age Geography Tenure
## 0 0 0 0 0
## Contract MonthlyCharges TotalCharges PaymentMethod IsActiveMember
## 0 0 0 0 0
## Churn
## 0
gg_miss_var(data)
# Hapus kolom CustomerID karena tidak relevan
data_norm <- data %>% select(-CustomerID)
# Visualisasi distribusi fitur numerik
data_norm %>%
select(where(is.numeric)) %>%
pivot_longer(everything(), names_to = "Feature", values_to = "Value") %>%
ggplot(aes(Value)) +
geom_histogram(fill = "#00ADB5", bins = 30) +
facet_wrap(~Feature, scales = "free") +
theme_minimal() +
labs(title = "Histogram Fitur Numerik")
# Visualisasi boxplot fitur numerik
data_norm %>%
select(where(is.numeric)) %>%
pivot_longer(everything(), names_to = "Feature", values_to = "Value") %>%
ggplot(aes(x = Feature, y = Value)) +
geom_boxplot(fill = "#FFB6B9") +
theme_minimal() +
coord_flip() +
labs(title = "Boxplot Fitur Numerik")
# Visualisasi distribusi fitur kategorikal (setelah dikonversi ke faktor)
data_norm %>%
select(where(is.factor)) %>%
pivot_longer(everything(), names_to = "Feature", values_to = "Value") %>%
ggplot(aes(x = Value)) +
geom_bar(fill = "#F08A5D") +
facet_wrap(~Feature, scales = "free") +
theme_minimal() +
theme(axis.text.x = element_text(angle = 45, hjust = 1)) +
labs(title = "Distribusi Fitur Kategorikal")
# Visualisasi MonthlyCharges vs Churn
ggplot(data_norm, aes(x = Churn, y = MonthlyCharges)) +
geom_boxplot(fill = "#f08a5d") +
theme_minimal() +
labs(title = "Monthly Charges vs Churn")
# Visualisasi TotalCharges vs Churn (Boxplot)
ggplot(data_norm, aes(x = Churn, y = TotalCharges, fill = Churn)) +
geom_boxplot() +
theme_minimal() +
labs(title = "Total Charges vs Churn (Boxplot)",
x = "Churn",
y = "Total Charges")
# Visualisasi Tenure vs Churn (Boxplot)
ggplot(data_norm, aes(x = Churn, y = Tenure, fill = Churn)) +
geom_boxplot() +
theme_minimal() +
labs(title = "Tenure vs Churn (Boxplot)",
x = "Churn",
y = "Tenure (Months)")
# Uji t untuk MonthlyCharges vs Churn
t.test(MonthlyCharges ~ Churn, data = data_norm)
##
## Welch Two Sample t-test
##
## data: MonthlyCharges by Churn
## t = 0.37292, df = 996.65, p-value = 0.7093
## alternative hypothesis: true difference in means between group No and group Yes is not equal to 0
## 95 percent confidence interval:
## -2.864145 4.208151
## sample estimates:
## mean in group No mean in group Yes
## 70.46135 69.78934
# Uji t untuk TotalCharges vs Churn
t.test(TotalCharges ~ Churn, data = data_norm)
##
## Welch Two Sample t-test
##
## data: TotalCharges by Churn
## t = -1.9009, df = 998, p-value = 0.0576
## alternative hypothesis: true difference in means between group No and group Yes is not equal to 0
## 95 percent confidence interval:
## -544.93405 8.66342
## sample estimates:
## mean in group No mean in group Yes
## 3931.618 4199.753
# Uji t untuk Tenure vs Churn
t.test(Tenure ~ Churn, data = data_norm)
##
## Welch Two Sample t-test
##
## data: Tenure by Churn
## t = 0.46232, df = 996.58, p-value = 0.644
## alternative hypothesis: true difference in means between group No and group Yes is not equal to 0
## 95 percent confidence interval:
## -1.616067 2.612243
## sample estimates:
## mean in group No mean in group Yes
## 31.01004 30.51195
# Visualisasi distribusi IsActiveMember (Barplot)
ggplot(data_norm, aes(x = IsActiveMember)) +
geom_bar(fill = "#6A0572") +
labs(x = "Is Active Member", y = "Jumlah", title = "Distribusi Member Aktif") +
theme_minimal()
# Cek korelasi antara fitur numerik
num_data <- data_norm %>% select(where(is.numeric)) %>% na.omit()
cor_matrix <- cor(num_data)
# Visualisasi matriks korelasi
corrplot(cor_matrix,
method = "circle",
type = "lower",
tl.cex = 0.8,
title = "Matriks Korelasi Numerik",
mar = c(0,0,1,0))
# Visualisasi distribusi Churn
ggplot(data_norm, aes(x = Churn)) +
geom_bar(fill = "#6A0572") +
theme_minimal() +
labs(title = "Distribusi Churn", y = "Jumlah", x = "Churn")
# Proporsi Churn
prop.table(table(data_norm$Churn))
##
## No Yes
## 0.498 0.502
============== MODELLING DATA ==============
# Load library
library(caret)
## Warning: package 'caret' was built under R version 4.4.3
## Loading required package: lattice
##
## Attaching package: 'caret'
## The following object is masked from 'package:purrr':
##
## lift
library(e1071) # Untuk Naive Bayes
library(rpart) # Untuk Decision Tree
library(rpart.plot) # Visualisasi Decision Tree
# Membaca data
data <- read.csv("C:/Users/ASUS/Documents/UNY/MySta/SEM 4/Kecerdasan Buatan/UTS Lana KCB/Large_Customer_Churn_Dataset.csv")
# Struktur data
str(data)
## 'data.frame': 1000 obs. of 11 variables:
## $ CustomerID : int 1000001 1000002 1000003 1000004 1000005 1000006 1000007 1000008 1000009 1000010 ...
## $ Gender : chr "Male" "Female" "Male" "Male" ...
## $ Age : int 34 26 50 37 30 45 65 46 30 63 ...
## $ Geography : chr "France" "Spain" "Germany" "Spain" ...
## $ Tenure : int 14 14 57 34 53 57 20 11 16 51 ...
## $ Contract : chr "Two-year" "Month-to-month" "Two-year" "Month-to-month" ...
## $ MonthlyCharges: num 21.6 27.7 111.1 55.5 62.5 ...
## $ TotalCharges : num 7933 5869 6321 7956 4923 ...
## $ PaymentMethod : chr "Bank transfer" "Credit card" "Bank transfer" "Bank transfer" ...
## $ IsActiveMember: int 1 0 1 0 1 0 0 1 1 1 ...
## $ Churn : chr "No" "Yes" "No" "Yes" ...
# Ubah kolom kategorikal menjadi numerik
var_cols <- c("Gender", "Geography", "Contract", "PaymentMethod")
data[var_cols] <- lapply(data[var_cols], function(x) as.numeric(as.factor(x)))
data$Churn <- as.factor(data$Churn) # Biarkan Churn tetap faktor
# Hilangkan kolom ID dulu
data_norm <- data %>% select(-CustomerID)
str(data_norm)
## 'data.frame': 1000 obs. of 10 variables:
## $ Gender : num 2 1 2 2 2 1 2 2 2 1 ...
## $ Age : int 34 26 50 37 30 45 65 46 30 63 ...
## $ Geography : num 1 4 2 4 4 3 5 2 4 2 ...
## $ Tenure : int 14 14 57 34 53 57 20 11 16 51 ...
## $ Contract : num 3 1 3 1 3 1 3 2 3 3 ...
## $ MonthlyCharges: num 21.6 27.7 111.1 55.5 62.5 ...
## $ TotalCharges : num 7933 5869 6321 7956 4923 ...
## $ PaymentMethod : num 1 2 1 1 3 2 2 1 3 2 ...
## $ IsActiveMember: int 1 0 1 0 1 0 0 1 1 1 ...
## $ Churn : Factor w/ 2 levels "No","Yes": 1 2 1 2 1 2 2 2 2 2 ...
# Normalisasi numerik (kecuali kolom target 'Churn')
data_norm[ , -ncol(data_norm)] <- scale(data_norm[ , -ncol(data_norm)])
#Splitting Data
set.seed(101)
library(caret)
# Set the proportion of data to be used for training
train_proportion <- 0.8
# Create the train/test indices with stratified sampling
train_index <- createDataPartition(data_norm$Churn, p = train_proportion, list = FALSE, times = 1)
# Split the data into training and testing sets
train_set <- data_norm[train_index, ]
test_set <- data_norm[-train_index, ]
head(train_set)
## Gender Age Geography Tenure Contract MonthlyCharges
## 1 1.0196938 -0.64748860 -1.38643262 -0.9843945 1.224512 -1.7047145
## 2 -0.9797058 -1.17610466 0.70998767 -0.9843945 -1.185945 -1.4894480
## 3 1.0196938 0.40974353 -0.68762586 1.5412000 1.224512 1.4396522
## 4 1.0196938 -0.44925758 0.70998767 0.1903006 -1.185945 -0.5139006
## 5 1.0196938 -0.91179663 0.70998767 1.3062610 1.224512 -0.2684335
## 6 -0.9797058 0.07935849 0.01118091 1.5412000 -1.185945 -1.0041325
## TotalCharges PaymentMethod IsActiveMember Churn
## 1 1.7315979 -1.3574109 0.9935208 No
## 2 0.8073908 -0.4626114 -1.0055150 Yes
## 3 1.0097223 -1.3574109 0.9935208 No
## 4 1.7419415 -1.3574109 -1.0055150 Yes
## 5 0.3835317 0.4321882 0.9935208 No
## 6 -0.6558611 -0.4626114 -1.0055150 Yes
head(test_set)
## Gender Age Geography Tenure Contract MonthlyCharges
## 10 -0.9797058 1.2687446 -0.6876259 1.18879144 1.22451200 -0.5908066
## 11 1.0196938 0.5418975 -1.3864326 0.01409634 -1.18594470 1.3476460
## 21 -0.9797058 1.5330527 0.7099877 -0.63198597 -1.18594470 0.9462598
## 38 1.0196938 0.4097435 -1.3864326 1.54119998 1.22451200 0.9929653
## 43 -0.9797058 -0.3831806 0.7099877 -1.39553778 0.01928365 -1.0539985
## 52 -0.9797058 -1.5725667 -0.6876259 1.42373047 1.22451200 -0.3246206
## TotalCharges PaymentMethod IsActiveMember Churn
## 10 -0.8453370 -0.4626114 0.9935208 Yes
## 11 0.9104103 -1.3574109 0.9935208 Yes
## 21 -1.2863414 -1.3574109 0.9935208 Yes
## 38 1.1162748 -0.4626114 0.9935208 Yes
## 43 1.1057297 -1.3574109 -1.0055150 No
## 52 1.1667300 -0.4626114 0.9935208 Yes
=================== KNN ===================
#Using Initial K
#The initial value for k is generally chosen as the square root of the number of observations.
# Set jumlah fold untuk cross-validation
initial_k <- sqrt(NROW(data))
initial_k
## [1] 31.62278
library(class)
## Warning: package 'class' was built under R version 4.4.3
# run KNN with k=31 and k=32
knn.31 <- knn(train=train_set[,-10], test=test_set[,-10], cl=train_set$Churn,k=floor(initial_k))
# use confusion matrix to calculate accuracy
cf.31 <- confusionMatrix(factor(knn.31), factor(test_set$Churn))
cf.31
## Confusion Matrix and Statistics
##
## Reference
## Prediction No Yes
## No 49 52
## Yes 50 48
##
## Accuracy : 0.4874
## 95% CI : (0.4161, 0.5591)
## No Information Rate : 0.5025
## P-Value [Acc > NIR] : 0.6901
##
## Kappa : -0.025
##
## Mcnemar's Test P-Value : 0.9211
##
## Sensitivity : 0.4949
## Specificity : 0.4800
## Pos Pred Value : 0.4851
## Neg Pred Value : 0.4898
## Prevalence : 0.4975
## Detection Rate : 0.2462
## Detection Prevalence : 0.5075
## Balanced Accuracy : 0.4875
##
## 'Positive' Class : No
##
knn.32 <- knn(train=train_set[,-10], test=test_set[,-10], cl=train_set$Churn,k=ceiling(initial_k))
# use confusion matrix to calculate accuracy
cf.32 <- confusionMatrix(factor(knn.32), factor(test_set$Churn))
cf.32
## Confusion Matrix and Statistics
##
## Reference
## Prediction No Yes
## No 46 54
## Yes 53 46
##
## Accuracy : 0.4623
## 95% CI : (0.3916, 0.5342)
## No Information Rate : 0.5025
## P-Value [Acc > NIR] : 0.886
##
## Kappa : -0.0753
##
## Mcnemar's Test P-Value : 1.000
##
## Sensitivity : 0.4646
## Specificity : 0.4600
## Pos Pred Value : 0.4600
## Neg Pred Value : 0.4646
## Prevalence : 0.4975
## Detection Rate : 0.2312
## Detection Prevalence : 0.5025
## Balanced Accuracy : 0.4623
##
## 'Positive' Class : No
##
# ===== Tambahan: Cross-Validation KNN (k = 20 sampai 40) =====
library(caret)
# Siapkan data latih (sudah dari data_norm)
train_data_cv <- data_norm[train_index, ]
train_data_cv$Churn <- as.factor(train_data_cv$Churn)
# Konfigurasi cross-validation
set.seed(101)
ctrl <- trainControl(method = "cv", number = 10)
# Uji nilai k dari 20 sampai 40
knn_cv <- train(
Churn ~ .,
data = train_data_cv,
method = "knn",
trControl = ctrl,
tuneGrid = data.frame(k = 20:40)
)
# Tampilkan hasil
print(knn_cv)
## k-Nearest Neighbors
##
## 801 samples
## 9 predictor
## 2 classes: 'No', 'Yes'
##
## No pre-processing
## Resampling: Cross-Validated (10 fold)
## Summary of sample sizes: 721, 721, 721, 720, 722, 721, ...
## Resampling results across tuning parameters:
##
## k Accuracy Kappa
## 20 0.5093829 0.018782862
## 21 0.4981483 -0.003529786
## 22 0.5143683 0.028873061
## 23 0.5093520 0.018818594
## 24 0.4967421 -0.006728192
## 25 0.4955538 -0.009026289
## 26 0.4929126 -0.014259752
## 27 0.4967093 -0.006841561
## 28 0.5055218 0.010803692
## 29 0.5105071 0.020651001
## 30 0.5217579 0.043110516
## 31 0.5229458 0.045654329
## 32 0.5104770 0.020817559
## 33 0.5217891 0.043443923
## 34 0.5181175 0.036092729
## 35 0.5180561 0.035960364
## 36 0.5068370 0.013461071
## 37 0.5042741 0.008375946
## 38 0.5168050 0.033506238
## 39 0.5154929 0.030849098
## 40 0.5055388 0.010850984
##
## Accuracy was used to select the optimal model using the largest value.
## The final value used for the model was k = 31.
============= DECISION TREE =============
# Membaca data
data <- read.csv("C:/Users/ASUS/Documents/UNY/MySta/SEM 4/Kecerdasan Buatan/UTS Lana KCB/Large_Customer_Churn_Dataset.csv")
# Ubah kolom-kolom kategorikal jadi factor
faktor_vars <- c("Gender", "Geography", "Contract", "PaymentMethod", "Churn")
data[faktor_vars] <- lapply(data[faktor_vars], as.factor)
data$IsActiveMember <- as.factor(data$IsActiveMember)
# Hapus kolom ID
data_norm <- data %>% select(-CustomerID)
# 'IsActiveMember' dan 'Churn' tetap sebagai faktor
data_norm$IsActiveMember <- as.factor(data_norm$IsActiveMember)
data_norm$Churn <- as.factor(data_norm$Churn)
# Normalisasi hanya kolom numerik selain kolom target
num_cols <- sapply(data_norm, is.numeric)
data_norm[, num_cols] <- scale(data_norm[, num_cols])
#Partisi Data
set.seed(101)
# Set the proportion of data to be used for training
train_proportion <- 0.8
# Create the train/test indices with stratified sampling
train_index <- createDataPartition(data_norm$Churn, p = train_proportion, list = FALSE, times = 1)
# Split the data into training and testing sets
train_set1 <- data_norm[train_index, ]
test_set1 <- data_norm[-train_index, ]
head(train_set1)
## Gender Age Geography Tenure Contract MonthlyCharges
## 1 Male -0.64748860 France -0.9843945 Two-year -1.7047145
## 2 Female -1.17610466 Spain -0.9843945 Month-to-month -1.4894480
## 3 Male 0.40974353 Germany 1.5412000 Two-year 1.4396522
## 4 Male -0.44925758 Spain 0.1903006 Month-to-month -0.5139006
## 5 Male -0.91179663 Spain 1.3062610 Two-year -0.2684335
## 6 Female 0.07935849 Italy 1.5412000 Month-to-month -1.0041325
## TotalCharges PaymentMethod IsActiveMember Churn
## 1 1.7315979 Bank transfer 1 No
## 2 0.8073908 Credit card 0 Yes
## 3 1.0097223 Bank transfer 1 No
## 4 1.7419415 Bank transfer 0 Yes
## 5 0.3835317 Direct debit 1 No
## 6 -0.6558611 Credit card 0 Yes
head(test_set1)
## Gender Age Geography Tenure Contract MonthlyCharges
## 10 Female 1.2687446 Germany 1.18879144 Two-year -0.5908066
## 11 Male 0.5418975 France 0.01409634 Month-to-month 1.3476460
## 21 Female 1.5330527 Spain -0.63198597 Month-to-month 0.9462598
## 38 Male 0.4097435 France 1.54119998 Two-year 0.9929653
## 43 Female -0.3831806 Spain -1.39553778 One-year -1.0539985
## 52 Female -1.5725667 Germany 1.42373047 Two-year -0.3246206
## TotalCharges PaymentMethod IsActiveMember Churn
## 10 -0.8453370 Credit card 1 Yes
## 11 0.9104103 Bank transfer 1 Yes
## 21 -1.2863414 Bank transfer 1 Yes
## 38 1.1162748 Credit card 1 Yes
## 43 1.1057297 Bank transfer 0 No
## 52 1.1667300 Credit card 1 Yes
#Decision Tree
set.seed(101)
data_class <- rpart(formula = Churn ~ .,
data = train_set1,
method = "class") # classification (not regression))
rpart.plot(data_class, yesno = TRUE)
prediksi_test <- predict(data_class, test_set1, type = "class")
plot(test_set1$Churn, prediksi_test, main = "Simple Classification: Predicted vs. Actual",xlab = "Actual",ylab = "Predicted")
conf_mat <- confusionMatrix(data = prediksi_test,
reference = test_set1$Churn)
print(conf_mat)
## Confusion Matrix and Statistics
##
## Reference
## Prediction No Yes
## No 59 54
## Yes 40 46
##
## Accuracy : 0.5276
## 95% CI : (0.4558, 0.5986)
## No Information Rate : 0.5025
## P-Value [Acc > NIR] : 0.2618
##
## Kappa : 0.0559
##
## Mcnemar's Test P-Value : 0.1800
##
## Sensitivity : 0.5960
## Specificity : 0.4600
## Pos Pred Value : 0.5221
## Neg Pred Value : 0.5349
## Prevalence : 0.4975
## Detection Rate : 0.2965
## Detection Prevalence : 0.5678
## Balanced Accuracy : 0.5280
##
## 'Positive' Class : No
##
=========== NAIVE BAYES ===========
# Membaca data
data <- read.csv("C:/Users/ASUS/Documents/UNY/MySta/SEM 4/Kecerdasan Buatan/UTS Lana KCB/Large_Customer_Churn_Dataset.csv")
#Splitting Data
set.seed(101)
#- split data in training and test set.
Index <- sample(1:nrow(data), size = round(0.8*nrow(data)), replace=FALSE)
train <- data[Index ,]
test <- data[-Index ,]
head(train)
## CustomerID Gender Age Geography Tenure Contract MonthlyCharges
## 841 1000841 Male 27 UK 26 Month-to-month 62.00
## 825 1000825 Female 40 Italy 29 Two-year 54.05
## 430 1000430 Female 50 Germany 52 One-year 90.67
## 95 1000095 Female 31 France 45 Two-year 58.48
## 209 1000209 Female 24 Italy 15 Two-year 95.85
## 442 1000442 Female 53 Italy 9 Two-year 59.57
## TotalCharges PaymentMethod IsActiveMember Churn
## 841 1909.18 Bank transfer 1 No
## 825 7888.99 Electronic check 1 Yes
## 430 2608.66 Electronic check 0 No
## 95 575.12 Credit card 1 Yes
## 209 7016.22 Credit card 0 Yes
## 442 1825.52 Direct debit 1 No
head(test)
## CustomerID Gender Age Geography Tenure Contract MonthlyCharges
## 8 1000008 Male 46 Germany 11 One-year 79.48
## 10 1000010 Female 63 Germany 51 Two-year 53.30
## 12 1000012 Male 23 Italy 55 Month-to-month 58.33
## 16 1000016 Male 64 UK 37 Month-to-month 59.74
## 29 1000029 Female 66 UK 21 Month-to-month 23.50
## 32 1000032 Male 43 France 39 One-year 107.37
## TotalCharges PaymentMethod IsActiveMember Churn
## 8 4992.99 Bank transfer 1 Yes
## 10 2178.36 Credit card 1 Yes
## 12 837.00 Credit card 1 No
## 16 3650.46 Credit card 0 Yes
## 29 5230.24 Bank transfer 0 No
## 32 7151.36 Credit card 1 Yes
#Training Process
NBClassifier <- naiveBayes(Churn ~., data = train)
NBClassifier
##
## Naive Bayes Classifier for Discrete Predictors
##
## Call:
## naiveBayes.default(x = X, y = Y, laplace = laplace)
##
## A-priori probabilities:
## Y
## No Yes
## 0.5 0.5
##
## Conditional probabilities:
## CustomerID
## Y [,1] [,2]
## No 1000490 282.1720
## Yes 1000516 294.2419
##
## Gender
## Y Female Male
## No 0.5175 0.4825
## Yes 0.5125 0.4875
##
## Age
## Y [,1] [,2]
## No 44.1225 15.13722
## Yes 44.2475 14.79945
##
## Geography
## Y France Germany Italy Spain UK
## No 0.1775 0.2075 0.2275 0.1800 0.2075
## Yes 0.2225 0.1825 0.1900 0.1675 0.2375
##
## Tenure
## Y [,1] [,2]
## No 31.2600 17.14635
## Yes 31.4125 16.71259
##
## Contract
## Y Month-to-month One-year Two-year
## No 0.3650 0.3025 0.3325
## Yes 0.3425 0.3300 0.3275
##
## MonthlyCharges
## Y [,1] [,2]
## No 70.85435 28.39937
## Yes 69.30215 28.54168
##
## TotalCharges
## Y [,1] [,2]
## No 3896.906 2212.261
## Yes 4150.544 2258.995
##
## PaymentMethod
## Y Bank transfer Credit card Direct debit Electronic check
## No 0.2425 0.2325 0.2700 0.2550
## Yes 0.2575 0.2575 0.2175 0.2675
##
## IsActiveMember
## Y [,1] [,2]
## No 0.4875 0.5004697
## Yes 0.5100 0.5005260
#Testing Process
# Predict using Naive Bayes
test$predicted <- predict(NBClassifier,test)
test$actual <- test$Churn
#confusion matrix
confusionMatrix(factor(test$predicted),
factor(test$actual))
## Confusion Matrix and Statistics
##
## Reference
## Prediction No Yes
## No 52 51
## Yes 46 51
##
## Accuracy : 0.515
## 95% CI : (0.4435, 0.5861)
## No Information Rate : 0.51
## P-Value [Acc > NIR] : 0.4720
##
## Kappa : 0.0306
##
## Mcnemar's Test P-Value : 0.6846
##
## Sensitivity : 0.5306
## Specificity : 0.5000
## Pos Pred Value : 0.5049
## Neg Pred Value : 0.5258
## Prevalence : 0.4900
## Detection Rate : 0.2600
## Detection Prevalence : 0.5150
## Balanced Accuracy : 0.5153
##
## 'Positive' Class : No
##
============== EVALUASI MODEL KNN ==============
# Membaca data
data <- read.csv("C:/Users/ASUS/Documents/UNY/MySta/SEM 4/Kecerdasan Buatan/UTS Lana KCB/Large_Customer_Churn_Dataset.csv")
# Ubah kolom kategorikal menjadi numerik
var_cols <- c("Gender", "Geography", "Contract", "PaymentMethod")
data[var_cols] <- lapply(data[var_cols], function(x) as.numeric(as.factor(x)))
data$Churn <- as.factor(data$Churn) # Biarkan Churn tetap faktor
# Hilangkan kolom ID
data_norm <- data %>% select(-CustomerID)
# Normalisasi semua kolom numerik kecuali 'Churn'
data_norm[, -ncol(data_norm)] <- scale(data_norm[, -ncol(data_norm)])
# ==================== SPLIT DATA ====================
set.seed(101)
# Set the proportion of data to be used for training
train_proportion <- 0.8
# Create the train/test indices with stratified sampling
train_index <- createDataPartition(data_norm$Churn, p = train_proportion, list = FALSE, times = 1)
# Split the data into training and testing sets
train_set <- data_norm[train_index, ]
test_set <- data_norm[-train_index, ]
# ==================== PREDIKSI TEST SET (Tanpa CV) ====================
initial_k <- sqrt(NROW(data))
# K=31
knn.31 <- knn(train = train_set[,-10], test = test_set[,-10], cl = train_set$Churn, k = floor(initial_k))
cf.31 <- confusionMatrix(factor(knn.31), factor(test_set$Churn))
# K=32
knn.32 <- knn(train=train_set[,-10], test=test_set[,-10], cl=train_set$Churn,k=ceiling(initial_k))
cf.32 <- confusionMatrix(factor(knn.32), factor(test_set$Churn))
# ==================== DENGAN CROSS-VALIDATION ====================
# Fungsi untuk KNN + CV
run_knn_cv <- function(data, k, folds) {
ctrl <- trainControl(method = "cv", number = folds)
model <- train(Churn ~ ., data = data, method = "knn",
trControl = ctrl, tuneGrid = data.frame(k = k))
return(model)
}
# Model CV
knn_cv_31_5 <- run_knn_cv(data_norm, k = floor(initial_k), folds = 5)
knn_cv_31_10 <- run_knn_cv(data_norm, k = floor(initial_k), folds = 10)
knn_cv_32_5 <- run_knn_cv(data_norm, k = ceiling(initial_k), folds = 5)
knn_cv_32_10 <- run_knn_cv(data_norm, k = ceiling(initial_k), folds = 10)
# Prediksi
pred_cv_31_5 <- predict(knn_cv_31_5, newdata = test_set)
pred_cv_31_10 <- predict(knn_cv_31_10, newdata = test_set)
pred_cv_32_5 <- predict(knn_cv_32_5, newdata = test_set)
pred_cv_32_10 <- predict(knn_cv_32_10, newdata = test_set)
# Confusion Matrix
cf_cv_31_5 <- confusionMatrix(pred_cv_31_5, test_set$Churn)
cf_cv_31_10 <- confusionMatrix(pred_cv_31_10, test_set$Churn)
cf_cv_32_5 <- confusionMatrix(pred_cv_32_5, test_set$Churn)
cf_cv_32_10 <- confusionMatrix(pred_cv_32_10, test_set$Churn)
# ==================== EKSTRAKSI METRIK ====================
extract_metrics <- function(cm) {
acc <- cm$overall["Accuracy"]
prec <- cm$byClass["Precision"]
rec <- cm$byClass["Recall"]
f1 <- cm$byClass["F1"]
return(data.frame(Akurasi = acc, Presisi = prec, Recall = rec, F1_Score = f1))
}
# Metrik
metrics_31 <- extract_metrics(cf.31)
metrics_32 <- extract_metrics(cf.32)
metrics_cv_31_5 <- extract_metrics(cf_cv_31_5)
metrics_cv_31_10 <- extract_metrics(cf_cv_31_10)
metrics_cv_32_5 <- extract_metrics(cf_cv_32_5)
metrics_cv_32_10 <- extract_metrics(cf_cv_32_10)
# ==================== TABEL PERBANDINGAN ====================
comparison_table <- data.frame(
Model = c("KNN (Tanpa CV) K=31", "KNN (Tanpa CV) K=32",
"KNN (CV 5-Fold) K=31", "KNN (CV 10-Fold) K=31",
"KNN (CV 5-Fold) K=32", "KNN (CV 10-Fold) K=32"),
Accuracy = c(metrics_31$Akurasi, metrics_32$Akurasi,
metrics_cv_31_5$Akurasi, metrics_cv_31_10$Akurasi,
metrics_cv_32_5$Akurasi, metrics_cv_32_10$Akurasi),
Precision = c(metrics_31$Presisi, metrics_32$Presisi,
metrics_cv_31_5$Presisi, metrics_cv_31_10$Presisi,
metrics_cv_32_5$Presisi, metrics_cv_32_10$Presisi),
Recall = c(metrics_31$Recall, metrics_32$Recall,
metrics_cv_31_5$Recall, metrics_cv_31_10$Recall,
metrics_cv_32_5$Recall, metrics_cv_32_10$Recall),
F1_Score = c(metrics_31$F1_Score, metrics_32$F1_Score,
metrics_cv_31_5$F1_Score, metrics_cv_31_10$F1_Score,
metrics_cv_32_5$F1_Score, metrics_cv_32_10$F1_Score)
)
cat("\n=== Tabel Perbandingan Metrik KNN ===\n")
##
## === Tabel Perbandingan Metrik KNN ===
print(comparison_table)
## Model Accuracy Precision Recall F1_Score
## 1 KNN (Tanpa CV) K=31 0.4874372 0.4851485 0.4949495 0.4900000
## 2 KNN (Tanpa CV) K=32 0.4623116 0.4600000 0.4646465 0.4623116
## 3 KNN (CV 5-Fold) K=31 0.5226131 0.5185185 0.5656566 0.5410628
## 4 KNN (CV 10-Fold) K=31 0.5226131 0.5185185 0.5656566 0.5410628
## 5 KNN (CV 5-Fold) K=32 0.5427136 0.5357143 0.6060606 0.5687204
## 6 KNN (CV 10-Fold) K=32 0.5025126 0.5000000 0.5555556 0.5263158
===================== EVALUASI MODEL DECISION TREE ====================
# === LOAD LIBRARY ===
library(caret)
library(rpart)
library(rpart.plot)
library(dplyr)
# === LOAD DATA ===
data <- read.csv("C:/Users/ASUS/Documents/UNY/MySta/SEM 4/Kecerdasan Buatan/UTS Lana KCB/Large_Customer_Churn_Dataset.csv")
# === UBAH KOLOM KATEGORIKAL MENJADI FAKTOR ===
faktor_vars <- c("Gender", "Geography", "Contract", "PaymentMethod", "Churn")
data[faktor_vars] <- lapply(data[faktor_vars], as.factor)
data$Churn <- as.factor(data$Churn) # Biarkan Churn tetap sebagai faktor
# === HAPUS KOLOM ID ===
data_norm <- data %>% select(-CustomerID)
# === KONVERSI 'IsActiveMember' DAN 'Churn' MENJADI FAKTOR ===
data_norm$IsActiveMember <- as.factor(data_norm$IsActiveMember)
data_norm$Churn <- as.factor(data_norm$Churn)
# === NORMALISASI KOLOM NUMERIK ===
num_cols <- sapply(data_norm, is.numeric)
data_norm[, num_cols] <- scale(data_norm[, num_cols])
# === PARTISI DATA ===
set.seed(101)
train_proportion <- 0.8
train_index <- createDataPartition(data_norm$Churn, p = train_proportion, list = FALSE, times = 1)
train_set1 <- data_norm[train_index, ]
test_set1 <- data_norm[-train_index, ]
# === TRAINING MODEL DECISION TREE (TANPA CV) ===
model_tree <- rpart(Churn ~ ., data = train_set1, method = "class")
pred_test <- predict(model_tree, test_set1, type = "class")
conf_test <- confusionMatrix(pred_test, test_set1$Churn)
# METRIK TANPA CV
accuracy <- conf_test$overall["Accuracy"]
precision <- conf_test$byClass["Pos Pred Value"]
recall <- conf_test$byClass["Sensitivity"]
f1_score <- 2 * (precision * recall) / (precision + recall)
# === CV 5-FOLD ===
ctrl_5 <- trainControl(method = "cv", number = 5, savePredictions = TRUE)
dt_cv5 <- train(Churn ~ ., data = train_set1, method = "rpart", trControl = ctrl_5)
conf_test_cv5 <- confusionMatrix(predict(dt_cv5, test_set1), test_set1$Churn)
# METRIK TEST SET DARI CV 5-FOLD
accuracy_cv5 <- conf_test_cv5$overall["Accuracy"]
precision_cv5 <- conf_test_cv5$byClass["Pos Pred Value"]
recall_cv5 <- conf_test_cv5$byClass["Sensitivity"]
f1_cv5 <- 2 * (precision_cv5 * recall_cv5) / (precision_cv5 + recall_cv5)
# METRIK RATA-RATA DARI HASIL CV (TRAINING SET)
metrics_cv5_train <- dt_cv5$resample %>%
summarise(Accuracy = mean(Accuracy))
# === CV 10-FOLD ===
ctrl_10 <- trainControl(method = "cv", number = 10, savePredictions = TRUE)
dt_cv10 <- train(Churn ~ ., data = train_set1, method = "rpart", trControl = ctrl_10)
conf_test_cv10 <- confusionMatrix(predict(dt_cv10, test_set1), test_set1$Churn)
# METRIK TEST SET DARI CV 10-FOLD
accuracy_cv10 <- conf_test_cv10$overall["Accuracy"]
precision_cv10 <- conf_test_cv10$byClass["Pos Pred Value"]
recall_cv10 <- conf_test_cv10$byClass["Sensitivity"]
f1_cv10 <- 2 * (precision_cv10 * recall_cv10) / (precision_cv10 + recall_cv10)
# METRIK RATA-RATA DARI HASIL CV (TRAINING SET)
metrics_cv10_train <- dt_cv10$resample %>%
summarise(Accuracy = mean(Accuracy))
# === TABEL PERBANDINGAN ===
comparison_table <- data.frame(
Model = c("Decision Tree (Tanpa CV)",
"Decision Tree (CV 5-Fold) - Test Set",
"Decision Tree (CV 10-Fold) - Test Set"),
Accuracy = c(accuracy, accuracy_cv5, accuracy_cv10),
Precision = c(precision, precision_cv5, precision_cv10),
Recall = c(recall, recall_cv5, recall_cv10),
F1_Score = c(f1_score, f1_cv5, f1_cv10)
)
print("=== Perbandingan Evaluasi di Test Set ===")
## [1] "=== Perbandingan Evaluasi di Test Set ==="
print(comparison_table)
## Model Accuracy Precision Recall F1_Score
## 1 Decision Tree (Tanpa CV) 0.5276382 0.5221239 0.5959596 0.5566038
## 2 Decision Tree (CV 5-Fold) - Test Set 0.4673367 0.4678899 0.5151515 0.4903846
## 3 Decision Tree (CV 10-Fold) - Test Set 0.4673367 0.4678899 0.5151515 0.4903846
# === HASIL RATA-RATA DARI CROSS-VALIDATION DI TRAINING SET ===
cat("\n=== Akurasi Rata-rata dari CV di Training Set ===\n")
##
## === Akurasi Rata-rata dari CV di Training Set ===
cat("CV 5-Fold (Train Set):", round(metrics_cv5_train$Accuracy, 4), "\n")
## CV 5-Fold (Train Set): 0.5169
cat("CV 10-Fold (Train Set):", round(metrics_cv10_train$Accuracy, 4), "\n")
## CV 10-Fold (Train Set): 0.493
========== NAIVE BAYES ==========
# === Load Library ===
library(e1071)
library(caret)
library(dplyr)
# === Membaca Data ===
data <- read.csv("C:/Users/ASUS/Documents/UNY/MySta/SEM 4/Kecerdasan Buatan/UTS Lana KCB/Large_Customer_Churn_Dataset.csv")
# === Ubah Kolom-Kolom Kategorikal Jadi Faktor ===
factor_vars <- c("Gender", "Geography", "Contract", "PaymentMethod", "Churn")
data[factor_vars] <- lapply(data[factor_vars], as.factor)
data$IsActiveMember <- as.factor(data$IsActiveMember)
# === Split Data ===
set.seed(101)
Index <- sample(1:nrow(data), size = round(0.8 * nrow(data)), replace = FALSE)
train <- data[Index, ]
test <- data[-Index, ]
# ========== MODEL TANPA CV ==========
# Training model Naive Bayes
NBClassifier <- naiveBayes(Churn ~ ., data = train)
# Prediksi di test set
test$predicted <- predict(NBClassifier, test)
test$actual <- test$Churn
# Confusion Matrix
cf_nb <- confusionMatrix(factor(test$predicted), factor(test$actual))
# ====== Evaluasi Tanpa CV ======
extract_metrics <- function(cm) {
acc <- cm$overall["Accuracy"]
prec <- cm$byClass["Precision"]
rec <- cm$byClass["Recall"]
f1 <- cm$byClass["F1"]
data.frame(Akurasi = acc, Presisi = prec, Recall = rec, F1_Score = f1)
}
# Extract metrics for the model without CV
metrics_nb <- extract_metrics(cf_nb)
# ========== MODEL DENGAN 5-FOLD CV ==========
set.seed(101)
ctrl_5 <- trainControl(method = "cv", number = 5)
nb_cv_5 <- train(
Churn ~ .,
data = train,
method = "naive_bayes",
trControl = ctrl_5
)
# Prediksi test set dari model CV 5-fold
test$predicted_cv_5 <- predict(nb_cv_5, newdata = test)
# Confusion Matrix for 5-fold CV
cf_nb_cv_5 <- confusionMatrix(test$predicted_cv_5, test$Churn)
# Extract metrics for 5-fold CV model
metrics_nb_cv_5 <- extract_metrics(cf_nb_cv_5)
# ========== MODEL DENGAN 10-FOLD CV ==========
set.seed(101)
ctrl_10 <- trainControl(method = "cv", number = 10)
nb_cv_10 <- train(
Churn ~ .,
data = train,
method = "naive_bayes",
trControl = ctrl_10
)
# Prediksi test set dari model CV 10-fold
test$predicted_cv_10 <- predict(nb_cv_10, newdata = test)
# Confusion Matrix for 10-fold CV
cf_nb_cv_10 <- confusionMatrix(test$predicted_cv_10, test$Churn)
# Extract metrics for 10-fold CV model
metrics_nb_cv_10 <- extract_metrics(cf_nb_cv_10)
# ====== TABEL PERBANDINGAN ======
comparison_table_nb <- data.frame(
Model = c("Naive Bayes (Tanpa CV)", "Naive Bayes (CV 5-Fold)", "Naive Bayes (CV 10-Fold)"),
Accuracy = c(metrics_nb$Akurasi, metrics_nb_cv_5$Akurasi, metrics_nb_cv_10$Akurasi),
Precision = c(metrics_nb$Presisi, metrics_nb_cv_5$Presisi, metrics_nb_cv_10$Presisi),
Recall = c(metrics_nb$Recall, metrics_nb_cv_5$Recall, metrics_nb_cv_10$Recall),
F1_Score = c(metrics_nb$F1_Score, metrics_nb_cv_5$F1_Score, metrics_nb_cv_10$F1_Score)
)
# Print tabel perbandingan
print("=== TABEL PERBANDINGAN NAIVE BAYES ===")
## [1] "=== TABEL PERBANDINGAN NAIVE BAYES ==="
print(comparison_table_nb)
## Model Accuracy Precision Recall F1_Score
## 1 Naive Bayes (Tanpa CV) 0.515 0.5048544 0.5306122 0.5174129
## 2 Naive Bayes (CV 5-Fold) 0.465 0.4579439 0.5000000 0.4780488
## 3 Naive Bayes (CV 10-Fold) 0.470 0.4459459 0.3367347 0.3837209
# ==================== PERSIAPAN ====================
library(caret)
library(dplyr)
# Baca dan siapkan data
data <- read.csv("C:/Users/ASUS/Documents/UNY/MySta/SEM 4/Kecerdasan Buatan/UTS Lana KCB/Large_Customer_Churn_Dataset.csv")
var_cols <- c("Gender", "Geography", "Contract", "PaymentMethod")
data[var_cols] <- lapply(data[var_cols], function(x) as.numeric(as.factor(x)))
data$Churn <- as.factor(ifelse(data$Churn == "Yes", 1, 0))
data_knn <- data %>% select(-CustomerID)
data_knn[, -ncol(data_knn)] <- scale(data_knn[, -ncol(data_knn)])
# Buat train/test split
set.seed(101)
train_index <- createDataPartition(data_knn$Churn, p = 0.8, list = FALSE)
train_set <- data_knn[train_index, ]
test_set <- data_knn[-train_index, ]
# ==================== MODEL KNN ASLI ====================
# Latih model asli dengan semua fitur
ctrl <- trainControl(method = "cv", number = 5)
knn_full <- train(Churn ~ ., data = train_set, method = "knn",
trControl = ctrl, tuneGrid = data.frame(k = 32))
# Prediksi & hitung recall
pred_full <- predict(knn_full, newdata = test_set)
cm_full <- confusionMatrix(pred_full, test_set$Churn, positive = "1")
recall_full <- cm_full$byClass["Recall"]
# ==================== PERMUTATION FEATURE IMPORTANCE ====================
fitur_list <- colnames(train_set)[colnames(train_set) != "Churn"]
importance_results <- data.frame(Fitur = fitur_list, Recall_Drop = NA)
for (i in seq_along(fitur_list)) {
# Copy data test & acak 1 fitur
test_permuted <- test_set
test_permuted[[fitur_list[i]]] <- sample(test_permuted[[fitur_list[i]]]) # acak nilai fitur
# Prediksi ulang dengan fitur yang diacak
pred_perm <- predict(knn_full, newdata = test_permuted)
cm_perm <- confusionMatrix(pred_perm, test_set$Churn, positive = "1")
recall_perm <- cm_perm$byClass["Recall"]
# Simpan penurunan recall
importance_results$Recall_Drop[i] <- recall_full - recall_perm
}
# Urutkan berdasarkan penurunan recall (semakin tinggi, semakin penting)
importance_results <- importance_results[order(-importance_results$Recall_Drop), ]
# Hilangkan rownames
rownames(importance_results) <- NULL
# Tampilkan hasil
cat("\n=== Fitur Terpenting untuk KNN (berdasarkan Recall Drop) ===\n")
##
## === Fitur Terpenting untuk KNN (berdasarkan Recall Drop) ===
print(importance_results)
## Fitur Recall_Drop
## 1 Gender 0.01
## 2 TotalCharges 0.01
## 3 Tenure -0.01
## 4 PaymentMethod -0.01
## 5 MonthlyCharges -0.03
## 6 Age -0.04
## 7 Contract -0.04
## 8 IsActiveMember -0.05
## 9 Geography -0.08
# (Opsional) Visualisasi
library(ggplot2)
ggplot(importance_results, aes(x = reorder(Fitur, Recall_Drop), y = Recall_Drop)) +
geom_bar(stat = "identity", fill = "steelblue") +
coord_flip() +
labs(title = "Permutation Importance (KNN, Recall-Based)",
x = "Fitur",
y = "Recall Drop (Semakin besar = Semakin penting)") +
theme_minimal()