# Load library
library(tidyverse)
## Warning: package 'purrr' was built under R version 4.4.3
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr     1.1.4     ✔ readr     2.1.5
## ✔ forcats   1.0.0     ✔ stringr   1.5.1
## ✔ ggplot2   3.5.1     ✔ tibble    3.2.1
## ✔ lubridate 1.9.3     ✔ tidyr     1.3.1
## ✔ purrr     1.0.4     
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(ggplot2)
library(corrplot)
## Warning: package 'corrplot' was built under R version 4.4.3
## corrplot 0.95 loaded
library(DataExplorer)
library(naniar)
## Warning: package 'naniar' was built under R version 4.4.3
# Read the customer churn dataset.
# NOTE(review): the path is absolute and machine-specific, so the script only
# runs where this exact file location exists.
data <- read.csv(
  file = "C:/Users/ASUS/Documents/UNY/MySta/SEM 4/Kecerdasan Buatan/UTS Lana KCB/Large_Customer_Churn_Dataset.csv"
)

# Compact, column-by-column overview of the data
glimpse(x = data)
## Rows: 1,000
## Columns: 11
## $ CustomerID     <int> 1000001, 1000002, 1000003, 1000004, 1000005, 1000006, 1…
## $ Gender         <chr> "Male", "Female", "Male", "Male", "Male", "Female", "Ma…
## $ Age            <int> 34, 26, 50, 37, 30, 45, 65, 46, 30, 63, 52, 23, 35, 68,…
## $ Geography      <chr> "France", "Spain", "Germany", "Spain", "Spain", "Italy"…
## $ Tenure         <int> 14, 14, 57, 34, 53, 57, 20, 11, 16, 51, 31, 55, 6, 56, …
## $ Contract       <chr> "Two-year", "Month-to-month", "Two-year", "Month-to-mon…
## $ MonthlyCharges <dbl> 21.58, 27.71, 111.12, 55.49, 62.48, 41.53, 79.71, 79.48…
## $ TotalCharges   <dbl> 7933.34, 5869.34, 6321.20, 7956.44, 4922.75, 2601.51, 6…
## $ PaymentMethod  <chr> "Bank transfer", "Credit card", "Bank transfer", "Bank …
## $ IsActiveMember <int> 1, 0, 1, 0, 1, 0, 0, 1, 1, 1, 1, 1, 1, 0, 1, 0, 0, 1, 0…
## $ Churn          <chr> "No", "Yes", "No", "Yes", "No", "Yes", "Yes", "Yes", "Y…
# Convert the categorical features (and the Churn target) to factors.
# IsActiveMember is stored as 0/1 integers but is treated as categorical here.
data <- mutate(
  data,
  across(
    c(Gender, Geography, Contract, PaymentMethod, Churn, IsActiveMember),
    as.factor
  )
)
# Inspect the column types after the conversion
str(object = data)
## 'data.frame':    1000 obs. of  11 variables:
##  $ CustomerID    : int  1000001 1000002 1000003 1000004 1000005 1000006 1000007 1000008 1000009 1000010 ...
##  $ Gender        : Factor w/ 2 levels "Female","Male": 2 1 2 2 2 1 2 2 2 1 ...
##  $ Age           : int  34 26 50 37 30 45 65 46 30 63 ...
##  $ Geography     : Factor w/ 5 levels "France","Germany",..: 1 4 2 4 4 3 5 2 4 2 ...
##  $ Tenure        : int  14 14 57 34 53 57 20 11 16 51 ...
##  $ Contract      : Factor w/ 3 levels "Month-to-month",..: 3 1 3 1 3 1 3 2 3 3 ...
##  $ MonthlyCharges: num  21.6 27.7 111.1 55.5 62.5 ...
##  $ TotalCharges  : num  7933 5869 6321 7956 4923 ...
##  $ PaymentMethod : Factor w/ 4 levels "Bank transfer",..: 1 2 1 1 3 2 2 1 3 2 ...
##  $ IsActiveMember: Factor w/ 2 levels "0","1": 2 1 2 1 2 1 1 2 2 2 ...
##  $ Churn         : Factor w/ 2 levels "No","Yes": 1 2 1 2 1 2 2 2 2 2 ...
# Count missing values per column
data %>%
  is.na() %>%
  colSums()
##     CustomerID         Gender            Age      Geography         Tenure 
##              0              0              0              0              0 
##       Contract MonthlyCharges   TotalCharges  PaymentMethod IsActiveMember 
##              0              0              0              0              0 
##          Churn 
##              0
# Plot the number of missing values per variable (naniar)
gg_miss_var(x = data)

# Drop CustomerID: it is an identifier, not an informative feature
data_norm <- select(data, -CustomerID)

# Histograms of every numeric feature, one facet per feature.
# The data is reshaped to long form so a single ggplot call can facet on it.
data_norm |>
  select(where(is.numeric)) |>
  pivot_longer(
    cols = everything(),
    names_to = "Feature",
    values_to = "Value"
  ) |>
  ggplot(mapping = aes(x = Value)) +
  geom_histogram(bins = 30, fill = "#00ADB5") +
  facet_wrap(vars(Feature), scales = "free") +
  ggtitle("Histogram Fitur Numerik") +
  theme_minimal()

# Horizontal boxplots of every numeric feature on a shared value axis
data_norm |>
  select(where(is.numeric)) |>
  pivot_longer(
    cols = everything(),
    names_to = "Feature",
    values_to = "Value"
  ) |>
  ggplot(mapping = aes(x = Feature, y = Value)) +
  geom_boxplot(fill = "#FFB6B9") +
  coord_flip() +
  ggtitle("Boxplot Fitur Numerik") +
  theme_minimal()

# Bar charts of every categorical feature (the columns converted to factors),
# one facet per feature.
data_norm |>
  select(where(is.factor)) |>
  pivot_longer(
    cols = everything(),
    names_to = "Feature",
    values_to = "Value"
  ) |>
  ggplot(mapping = aes(x = Value)) +
  geom_bar(fill = "#F08A5D") +
  facet_wrap(vars(Feature), scales = "free") +
  ggtitle("Distribusi Fitur Kategorikal") +
  theme_minimal() +
  # Must come after theme_minimal(), which would otherwise reset the rotation
  theme(axis.text.x = element_text(angle = 45, hjust = 1))

# MonthlyCharges by Churn group
ggplot(data = data_norm, mapping = aes(x = Churn, y = MonthlyCharges)) +
  geom_boxplot(fill = "#f08a5d") +
  ggtitle("Monthly Charges vs Churn") +
  theme_minimal()

# TotalCharges by Churn group (boxes coloured by group)
ggplot(
  data = data_norm,
  mapping = aes(x = Churn, y = TotalCharges, fill = Churn)
) +
  geom_boxplot() +
  ggtitle("Total Charges vs Churn (Boxplot)") +
  xlab("Churn") +
  ylab("Total Charges") +
  theme_minimal()

# Tenure by Churn group (boxes coloured by group)
ggplot(
  data = data_norm,
  mapping = aes(x = Churn, y = Tenure, fill = Churn)
) +
  geom_boxplot() +
  ggtitle("Tenure vs Churn (Boxplot)") +
  xlab("Churn") +
  ylab("Tenure (Months)") +
  theme_minimal()

# Two-sample t-test: does mean MonthlyCharges differ between the Churn groups?
# (t.test() defaults to the Welch variant, as the recorded output shows.)
t.test(MonthlyCharges ~ Churn, data = data_norm)
## 
##  Welch Two Sample t-test
## 
## data:  MonthlyCharges by Churn
## t = 0.37292, df = 996.65, p-value = 0.7093
## alternative hypothesis: true difference in means between group No and group Yes is not equal to 0
## 95 percent confidence interval:
##  -2.864145  4.208151
## sample estimates:
##  mean in group No mean in group Yes 
##          70.46135          69.78934
# Two-sample t-test: does mean TotalCharges differ between the Churn groups?
t.test(TotalCharges ~ Churn, data = data_norm)
## 
##  Welch Two Sample t-test
## 
## data:  TotalCharges by Churn
## t = -1.9009, df = 998, p-value = 0.0576
## alternative hypothesis: true difference in means between group No and group Yes is not equal to 0
## 95 percent confidence interval:
##  -544.93405    8.66342
## sample estimates:
##  mean in group No mean in group Yes 
##          3931.618          4199.753
# Two-sample t-test: does mean Tenure differ between the Churn groups?
t.test(Tenure ~ Churn, data = data_norm)
## 
##  Welch Two Sample t-test
## 
## data:  Tenure by Churn
## t = 0.46232, df = 996.58, p-value = 0.644
## alternative hypothesis: true difference in means between group No and group Yes is not equal to 0
## 95 percent confidence interval:
##  -1.616067  2.612243
## sample estimates:
##  mean in group No mean in group Yes 
##          31.01004          30.51195
# Bar chart of the IsActiveMember distribution
ggplot(data = data_norm, mapping = aes(x = IsActiveMember)) +
  geom_bar(fill = "#6A0572") +
  xlab("Is Active Member") +
  ylab("Jumlah") +
  ggtitle("Distribusi Member Aktif") +
  theme_minimal()

# Correlation matrix of the numeric features (rows with NA are dropped first)
num_data <- na.omit(select(data_norm, where(is.numeric)))
cor_matrix <- cor(num_data)

# Draw the lower triangle of the correlation matrix as circles.
# The top margin is widened so the title is not clipped.
corrplot(
  corr = cor_matrix,
  method = "circle",
  type = "lower",
  title = "Matriks Korelasi Numerik",
  tl.cex = 0.8,
  mar = c(0, 0, 1, 0)
)

# Bar chart of the target (Churn) distribution
ggplot(data = data_norm, mapping = aes(x = Churn)) +
  geom_bar(fill = "#6A0572") +
  ggtitle("Distribusi Churn") +
  xlab("Churn") +
  ylab("Jumlah") +
  theme_minimal()

# Class proportions of Churn
proportions(table(data_norm$Churn))
## 
##    No   Yes 
## 0.498 0.502

# ============== MODELLING DATA ==============

# Load library
library(caret)
## Warning: package 'caret' was built under R version 4.4.3
## Loading required package: lattice
## 
## Attaching package: 'caret'
## The following object is masked from 'package:purrr':
## 
##     lift
library(e1071)       # Untuk Naive Bayes
library(rpart)       # Untuk Decision Tree
library(rpart.plot)  # Visualisasi Decision Tree
# Re-read the raw dataset so modelling starts from untouched columns.
# NOTE(review): the path is absolute and machine-specific.
data <- read.csv(
  file = "C:/Users/ASUS/Documents/UNY/MySta/SEM 4/Kecerdasan Buatan/UTS Lana KCB/Large_Customer_Churn_Dataset.csv"
)

# Column types of the raw data
str(object = data)
## 'data.frame':    1000 obs. of  11 variables:
##  $ CustomerID    : int  1000001 1000002 1000003 1000004 1000005 1000006 1000007 1000008 1000009 1000010 ...
##  $ Gender        : chr  "Male" "Female" "Male" "Male" ...
##  $ Age           : int  34 26 50 37 30 45 65 46 30 63 ...
##  $ Geography     : chr  "France" "Spain" "Germany" "Spain" ...
##  $ Tenure        : int  14 14 57 34 53 57 20 11 16 51 ...
##  $ Contract      : chr  "Two-year" "Month-to-month" "Two-year" "Month-to-month" ...
##  $ MonthlyCharges: num  21.6 27.7 111.1 55.5 62.5 ...
##  $ TotalCharges  : num  7933 5869 6321 7956 4923 ...
##  $ PaymentMethod : chr  "Bank transfer" "Credit card" "Bank transfer" "Bank transfer" ...
##  $ IsActiveMember: int  1 0 1 0 1 0 0 1 1 1 ...
##  $ Churn         : chr  "No" "Yes" "No" "Yes" ...
# Encode the categorical predictors as the integer codes of their factor
# levels (levels are in sorted order), and keep the Churn target as a factor.
# NOTE(review): these are nominal variables; integer codes impose an arbitrary
# order/distance on them, which distance-based models such as KNN will use.
var_cols <- c("Gender", "Geography", "Contract", "PaymentMethod")
data <- data %>%
  mutate(
    across(all_of(var_cols), \(x) as.numeric(as.factor(x))),
    Churn = as.factor(Churn)
  )
# Drop the ID column before modelling
data_norm <- select(data, -CustomerID)
str(object = data_norm)
## 'data.frame':    1000 obs. of  10 variables:
##  $ Gender        : num  2 1 2 2 2 1 2 2 2 1 ...
##  $ Age           : int  34 26 50 37 30 45 65 46 30 63 ...
##  $ Geography     : num  1 4 2 4 4 3 5 2 4 2 ...
##  $ Tenure        : int  14 14 57 34 53 57 20 11 16 51 ...
##  $ Contract      : num  3 1 3 1 3 1 3 2 3 3 ...
##  $ MonthlyCharges: num  21.6 27.7 111.1 55.5 62.5 ...
##  $ TotalCharges  : num  7933 5869 6321 7956 4923 ...
##  $ PaymentMethod : num  1 2 1 1 3 2 2 1 3 2 ...
##  $ IsActiveMember: int  1 0 1 0 1 0 0 1 1 1 ...
##  $ Churn         : Factor w/ 2 levels "No","Yes": 1 2 1 2 1 2 2 2 2 2 ...
# Standardise every predictor to mean 0 / sd 1; the Churn target is left as is.
# NOTE(review): the scaling statistics are computed on the full dataset before
# the train/test split, so test rows influence the centring and scaling.
data_norm <- data_norm %>%
  mutate(across(-Churn, \(x) as.numeric(scale(x))))

# Train/test split
set.seed(101)
# Share of rows used for training
train_proportion <- 0.8
# Training-row indices, sampled within each Churn class (stratified)
train_index <- createDataPartition(
  y = data_norm$Churn,
  times = 1,
  p = train_proportion,
  list = FALSE
)
# Rows in train_index form the training set; all remaining rows the test set
train_set <- data_norm[train_index, ]
test_set <- data_norm[-train_index, ]
head(train_set, n = 6)
##       Gender         Age   Geography     Tenure  Contract MonthlyCharges
## 1  1.0196938 -0.64748860 -1.38643262 -0.9843945  1.224512     -1.7047145
## 2 -0.9797058 -1.17610466  0.70998767 -0.9843945 -1.185945     -1.4894480
## 3  1.0196938  0.40974353 -0.68762586  1.5412000  1.224512      1.4396522
## 4  1.0196938 -0.44925758  0.70998767  0.1903006 -1.185945     -0.5139006
## 5  1.0196938 -0.91179663  0.70998767  1.3062610  1.224512     -0.2684335
## 6 -0.9797058  0.07935849  0.01118091  1.5412000 -1.185945     -1.0041325
##   TotalCharges PaymentMethod IsActiveMember Churn
## 1    1.7315979    -1.3574109      0.9935208    No
## 2    0.8073908    -0.4626114     -1.0055150   Yes
## 3    1.0097223    -1.3574109      0.9935208    No
## 4    1.7419415    -1.3574109     -1.0055150   Yes
## 5    0.3835317     0.4321882      0.9935208    No
## 6   -0.6558611    -0.4626114     -1.0055150   Yes
# First rows of the held-out test set
head(test_set, n = 6)
##        Gender        Age  Geography      Tenure    Contract MonthlyCharges
## 10 -0.9797058  1.2687446 -0.6876259  1.18879144  1.22451200     -0.5908066
## 11  1.0196938  0.5418975 -1.3864326  0.01409634 -1.18594470      1.3476460
## 21 -0.9797058  1.5330527  0.7099877 -0.63198597 -1.18594470      0.9462598
## 38  1.0196938  0.4097435 -1.3864326  1.54119998  1.22451200      0.9929653
## 43 -0.9797058 -0.3831806  0.7099877 -1.39553778  0.01928365     -1.0539985
## 52 -0.9797058 -1.5725667 -0.6876259  1.42373047  1.22451200     -0.3246206
##    TotalCharges PaymentMethod IsActiveMember Churn
## 10   -0.8453370    -0.4626114      0.9935208   Yes
## 11    0.9104103    -1.3574109      0.9935208   Yes
## 21   -1.2863414    -1.3574109      0.9935208   Yes
## 38    1.1162748    -0.4626114      0.9935208   Yes
## 43    1.1057297    -1.3574109     -1.0055150    No
## 52    1.1667300    -0.4626114      0.9935208   Yes

# =================== KNN ===================

# Initial k
# A common starting value for k is the square root of the number of
# observations.
# NOTE(review): this uses the row count of the full dataset (`data`), not of
# the training set that KNN actually searches.
initial_k <- sqrt(NROW(data))
initial_k
## [1] 31.62278
library(class)
## Warning: package 'class' was built under R version 4.4.3
# Fit KNN with k = floor(initial_k), i.e. k = 31.
# Predictors are selected by excluding the target by name rather than by a
# hard-coded column position, so the call stays correct if columns are
# reordered or added.
knn.31 <- knn(
  train = train_set[, names(train_set) != "Churn"],
  test = test_set[, names(test_set) != "Churn"],
  cl = train_set$Churn,
  k = floor(initial_k)
)
# Confusion matrix and accuracy statistics on the test set
cf.31 <- confusionMatrix(factor(knn.31), factor(test_set$Churn))
cf.31
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction No Yes
##        No  49  52
##        Yes 50  48
##                                           
##                Accuracy : 0.4874          
##                  95% CI : (0.4161, 0.5591)
##     No Information Rate : 0.5025          
##     P-Value [Acc > NIR] : 0.6901          
##                                           
##                   Kappa : -0.025          
##                                           
##  Mcnemar's Test P-Value : 0.9211          
##                                           
##             Sensitivity : 0.4949          
##             Specificity : 0.4800          
##          Pos Pred Value : 0.4851          
##          Neg Pred Value : 0.4898          
##              Prevalence : 0.4975          
##          Detection Rate : 0.2462          
##    Detection Prevalence : 0.5075          
##       Balanced Accuracy : 0.4875          
##                                           
##        'Positive' Class : No              
## 
# Fit KNN with k = ceiling(initial_k), i.e. k = 32.
# Predictors are selected by excluding the target by name rather than by a
# hard-coded column position.
knn.32 <- knn(
  train = train_set[, names(train_set) != "Churn"],
  test = test_set[, names(test_set) != "Churn"],
  cl = train_set$Churn,
  k = ceiling(initial_k)
)
# Confusion matrix and accuracy statistics on the test set
cf.32 <- confusionMatrix(factor(knn.32), factor(test_set$Churn))
cf.32
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction No Yes
##        No  46  54
##        Yes 53  46
##                                           
##                Accuracy : 0.4623          
##                  95% CI : (0.3916, 0.5342)
##     No Information Rate : 0.5025          
##     P-Value [Acc > NIR] : 0.886           
##                                           
##                   Kappa : -0.0753         
##                                           
##  Mcnemar's Test P-Value : 1.000           
##                                           
##             Sensitivity : 0.4646          
##             Specificity : 0.4600          
##          Pos Pred Value : 0.4600          
##          Neg Pred Value : 0.4646          
##              Prevalence : 0.4975          
##          Detection Rate : 0.2312          
##    Detection Prevalence : 0.5025          
##       Balanced Accuracy : 0.4623          
##                                           
##        'Positive' Class : No              
## 
# ===== Additional: cross-validated KNN (k from 20 to 40) =====

# Training rows of the already-standardised data_norm, with a factor target
train_data_cv <- transform(
  data_norm[train_index, ],
  Churn = as.factor(Churn)
)

# 10-fold cross-validation; the seed is set before the folds are drawn
set.seed(101)
ctrl <- trainControl(number = 10, method = "cv")

# Evaluate every k in 20..40 and keep the one with the best CV accuracy
knn_cv <- train(
  Churn ~ .,
  data = train_data_cv,
  method = "knn",
  tuneGrid = data.frame(k = seq(20, 40)),
  trControl = ctrl
)

# Show the resampling results per k and the selected value
print(knn_cv)
## k-Nearest Neighbors 
## 
## 801 samples
##   9 predictor
##   2 classes: 'No', 'Yes' 
## 
## No pre-processing
## Resampling: Cross-Validated (10 fold) 
## Summary of sample sizes: 721, 721, 721, 720, 722, 721, ... 
## Resampling results across tuning parameters:
## 
##   k   Accuracy   Kappa       
##   20  0.5093829   0.018782862
##   21  0.4981483  -0.003529786
##   22  0.5143683   0.028873061
##   23  0.5093520   0.018818594
##   24  0.4967421  -0.006728192
##   25  0.4955538  -0.009026289
##   26  0.4929126  -0.014259752
##   27  0.4967093  -0.006841561
##   28  0.5055218   0.010803692
##   29  0.5105071   0.020651001
##   30  0.5217579   0.043110516
##   31  0.5229458   0.045654329
##   32  0.5104770   0.020817559
##   33  0.5217891   0.043443923
##   34  0.5181175   0.036092729
##   35  0.5180561   0.035960364
##   36  0.5068370   0.013461071
##   37  0.5042741   0.008375946
##   38  0.5168050   0.033506238
##   39  0.5154929   0.030849098
##   40  0.5055388   0.010850984
## 
## Accuracy was used to select the optimal model using the largest value.
## The final value used for the model was k = 31.

# ============= DECISION TREE =============

# Read the raw dataset again so the tree works with the original categories.
# NOTE(review): the path is absolute and machine-specific.
data <- read.csv("C:/Users/ASUS/Documents/UNY/MySta/SEM 4/Kecerdasan Buatan/UTS Lana KCB/Large_Customer_Churn_Dataset.csv")
# Convert the categorical columns, the 0/1 IsActiveMember flag and the Churn
# target to factors in a single pass (no repeated re-conversion afterwards).
faktor_vars <- c("Gender", "Geography", "Contract", "PaymentMethod", "Churn")
data[c(faktor_vars, "IsActiveMember")] <-
  lapply(data[c(faktor_vars, "IsActiveMember")], as.factor)
# Drop the ID column
data_norm <- data %>% select(-CustomerID)
# Standardise only the numeric columns; factors (including the target) are
# left untouched. vapply() guarantees a logical mask of the expected type.
num_cols <- vapply(data_norm, is.numeric, logical(1))
data_norm[, num_cols] <- scale(data_norm[, num_cols])
# Partition the data
set.seed(101)
# Share of rows used for training
train_proportion <- 0.8
# Training-row indices, sampled within each Churn class (stratified)
train_index <- createDataPartition(data_norm$Churn, p = train_proportion, list = FALSE, times = 1)
# Rows in train_index form the training set; all remaining rows the test set
train_set1 <- data_norm[train_index, ]
test_set1 <- data_norm[-train_index, ]
head(train_set1)
##   Gender         Age Geography     Tenure       Contract MonthlyCharges
## 1   Male -0.64748860    France -0.9843945       Two-year     -1.7047145
## 2 Female -1.17610466     Spain -0.9843945 Month-to-month     -1.4894480
## 3   Male  0.40974353   Germany  1.5412000       Two-year      1.4396522
## 4   Male -0.44925758     Spain  0.1903006 Month-to-month     -0.5139006
## 5   Male -0.91179663     Spain  1.3062610       Two-year     -0.2684335
## 6 Female  0.07935849     Italy  1.5412000 Month-to-month     -1.0041325
##   TotalCharges PaymentMethod IsActiveMember Churn
## 1    1.7315979 Bank transfer              1    No
## 2    0.8073908   Credit card              0   Yes
## 3    1.0097223 Bank transfer              1    No
## 4    1.7419415 Bank transfer              0   Yes
## 5    0.3835317  Direct debit              1    No
## 6   -0.6558611   Credit card              0   Yes
# First rows of the decision-tree test set
head(test_set1, n = 6)
##    Gender        Age Geography      Tenure       Contract MonthlyCharges
## 10 Female  1.2687446   Germany  1.18879144       Two-year     -0.5908066
## 11   Male  0.5418975    France  0.01409634 Month-to-month      1.3476460
## 21 Female  1.5330527     Spain -0.63198597 Month-to-month      0.9462598
## 38   Male  0.4097435    France  1.54119998       Two-year      0.9929653
## 43 Female -0.3831806     Spain -1.39553778       One-year     -1.0539985
## 52 Female -1.5725667   Germany  1.42373047       Two-year     -0.3246206
##    TotalCharges PaymentMethod IsActiveMember Churn
## 10   -0.8453370   Credit card              1   Yes
## 11    0.9104103 Bank transfer              1   Yes
## 21   -1.2863414 Bank transfer              1   Yes
## 38    1.1162748   Credit card              1   Yes
## 43    1.1057297 Bank transfer              0    No
## 52    1.1667300   Credit card              1   Yes
# Decision tree
set.seed(101)
# Classification tree (method = "class", not regression) on all predictors
data_class <- rpart(Churn ~ ., data = train_set1, method = "class")
rpart.plot(data_class, yesno = TRUE)

# Predicted class labels for the test set
prediksi_test <- predict(data_class, newdata = test_set1, type = "class")
# Actual vs. predicted classes
plot(
  x = test_set1$Churn,
  y = prediksi_test,
  xlab = "Actual",
  ylab = "Predicted",
  main = "Simple Classification: Predicted vs. Actual"
)

# Confusion matrix and accuracy statistics on the test set
conf_mat <- confusionMatrix(prediksi_test, test_set1$Churn)
print(conf_mat)
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction No Yes
##        No  59  54
##        Yes 40  46
##                                           
##                Accuracy : 0.5276          
##                  95% CI : (0.4558, 0.5986)
##     No Information Rate : 0.5025          
##     P-Value [Acc > NIR] : 0.2618          
##                                           
##                   Kappa : 0.0559          
##                                           
##  Mcnemar's Test P-Value : 0.1800          
##                                           
##             Sensitivity : 0.5960          
##             Specificity : 0.4600          
##          Pos Pred Value : 0.5221          
##          Neg Pred Value : 0.5349          
##              Prevalence : 0.4975          
##          Detection Rate : 0.2965          
##    Detection Prevalence : 0.5678          
##       Balanced Accuracy : 0.5280          
##                                           
##        'Positive' Class : No              
## 

# =========== NAIVE BAYES ===========

# Read the raw dataset (categorical columns stay as character strings).
# NOTE(review): the path is absolute and machine-specific.
data <- read.csv("C:/Users/ASUS/Documents/UNY/MySta/SEM 4/Kecerdasan Buatan/UTS Lana KCB/Large_Customer_Churn_Dataset.csv")
# Train/test split
set.seed(101)
# Simple random 80/20 split of the row indices, without replacement.
# seq_len() is used instead of 1:nrow(data) so an empty data frame yields an
# empty index set rather than c(1, 0).
# NOTE(review): this split is not stratified, unlike the createDataPartition()
# splits used for the KNN and decision-tree sections, so the models are
# evaluated on different test rows.
Index <- sample(seq_len(nrow(data)), size = round(0.8 * nrow(data)), replace = FALSE)
train <- data[Index, ]
test <- data[-Index, ]
head(train)
##     CustomerID Gender Age Geography Tenure       Contract MonthlyCharges
## 841    1000841   Male  27        UK     26 Month-to-month          62.00
## 825    1000825 Female  40     Italy     29       Two-year          54.05
## 430    1000430 Female  50   Germany     52       One-year          90.67
## 95     1000095 Female  31    France     45       Two-year          58.48
## 209    1000209 Female  24     Italy     15       Two-year          95.85
## 442    1000442 Female  53     Italy      9       Two-year          59.57
##     TotalCharges    PaymentMethod IsActiveMember Churn
## 841      1909.18    Bank transfer              1    No
## 825      7888.99 Electronic check              1   Yes
## 430      2608.66 Electronic check              0    No
## 95        575.12      Credit card              1   Yes
## 209      7016.22      Credit card              0   Yes
## 442      1825.52     Direct debit              1    No
# First rows of the Naive Bayes test set
head(test, n = 6)
##    CustomerID Gender Age Geography Tenure       Contract MonthlyCharges
## 8     1000008   Male  46   Germany     11       One-year          79.48
## 10    1000010 Female  63   Germany     51       Two-year          53.30
## 12    1000012   Male  23     Italy     55 Month-to-month          58.33
## 16    1000016   Male  64        UK     37 Month-to-month          59.74
## 29    1000029 Female  66        UK     21 Month-to-month          23.50
## 32    1000032   Male  43    France     39       One-year         107.37
##    TotalCharges PaymentMethod IsActiveMember Churn
## 8       4992.99 Bank transfer              1   Yes
## 10      2178.36   Credit card              1   Yes
## 12       837.00   Credit card              1    No
## 16      3650.46   Credit card              0   Yes
## 29      5230.24 Bank transfer              0    No
## 32      7151.36   Credit card              1   Yes
# Training process
# CustomerID is a row identifier, not a customer attribute, so it is excluded
# from the predictors (the EDA, KNN and decision-tree sections drop it too).
# Leaving it in makes the model fit a Gaussian likelihood to the ID values.
# Each conditional table is estimated per feature, so the tables recorded
# below for the remaining features are unaffected by this exclusion.
# NOTE: the confusion matrix recorded after the testing step further below was
# produced with CustomerID still included; re-run the script to refresh it.
NBClassifier <- naiveBayes(
  Churn ~ .,
  data = train[, names(train) != "CustomerID"]
)
NBClassifier
## 
## Naive Bayes Classifier for Discrete Predictors
## 
## Call:
## naiveBayes.default(x = X, y = Y, laplace = laplace)
## 
## A-priori probabilities:
## Y
##  No Yes 
## 0.5 0.5 
## 
## Conditional probabilities:
##      Gender
## Y     Female   Male
##   No  0.5175 0.4825
##   Yes 0.5125 0.4875
## 
##      Age
## Y        [,1]     [,2]
##   No  44.1225 15.13722
##   Yes 44.2475 14.79945
## 
##      Geography
## Y     France Germany  Italy  Spain     UK
##   No  0.1775  0.2075 0.2275 0.1800 0.2075
##   Yes 0.2225  0.1825 0.1900 0.1675 0.2375
## 
##      Tenure
## Y        [,1]     [,2]
##   No  31.2600 17.14635
##   Yes 31.4125 16.71259
## 
##      Contract
## Y     Month-to-month One-year Two-year
##   No          0.3650   0.3025   0.3325
##   Yes         0.3425   0.3300   0.3275
## 
##      MonthlyCharges
## Y         [,1]     [,2]
##   No  70.85435 28.39937
##   Yes 69.30215 28.54168
## 
##      TotalCharges
## Y         [,1]     [,2]
##   No  3896.906 2212.261
##   Yes 4150.544 2258.995
## 
##      PaymentMethod
## Y     Bank transfer Credit card Direct debit Electronic check
##   No         0.2425      0.2325       0.2700           0.2550
##   Yes        0.2575      0.2575       0.2175           0.2675
## 
##      IsActiveMember
## Y       [,1]      [,2]
##   No  0.4875 0.5004697
##   Yes 0.5100 0.5005260
# Testing process
# Store the Naive Bayes predictions next to the true labels in the test frame
test[["predicted"]] <- predict(NBClassifier, newdata = test)
test[["actual"]] <- test[["Churn"]]
# Confusion matrix and accuracy statistics on the test set
confusionMatrix(
  data = factor(test[["predicted"]]),
  reference = factor(test[["actual"]])
)
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction No Yes
##        No  52  51
##        Yes 46  51
##                                           
##                Accuracy : 0.515           
##                  95% CI : (0.4435, 0.5861)
##     No Information Rate : 0.51            
##     P-Value [Acc > NIR] : 0.4720          
##                                           
##                   Kappa : 0.0306          
##                                           
##  Mcnemar's Test P-Value : 0.6846          
##                                           
##             Sensitivity : 0.5306          
##             Specificity : 0.5000          
##          Pos Pred Value : 0.5049          
##          Neg Pred Value : 0.5258          
##              Prevalence : 0.4900          
##          Detection Rate : 0.2600          
##    Detection Prevalence : 0.5150          
##       Balanced Accuracy : 0.5153          
##                                           
##        'Positive' Class : No              
## 

============== EVALUASI MODEL KNN ==============

# Read the data
data <- read.csv(file = "C:/Users/ASUS/Documents/UNY/MySta/SEM 4/Kecerdasan Buatan/UTS Lana KCB/Large_Customer_Churn_Dataset.csv")

# Encode the categorical predictors as numeric factor codes and keep Churn
# as a factor (it is the class label).
var_cols <- c("Gender", "Geography", "Contract", "PaymentMethod")
data <- data %>%
  mutate(
    across(all_of(var_cols), ~ as.numeric(as.factor(.x))),
    Churn = as.factor(Churn)
  )

# Drop the ID column
data_norm <- select(data, -CustomerID)

# Standardise every column except the last one (Churn)
data_norm[, -ncol(data_norm)] <- scale(
  x = data_norm[, -ncol(data_norm)],
  center = TRUE,
  scale = TRUE
)

# ==================== SPLIT DATA ====================
set.seed(101)
# Share of rows that goes to the training set
train_proportion <- 0.8
# Row indices of the training partition, sampled on Churn by
# createDataPartition() (returned as a matrix because list = FALSE)
train_index <- createDataPartition(
  y = data_norm$Churn,
  times = 1,
  p = train_proportion,
  list = FALSE
)
# Sampled rows form the training set; all remaining rows form the test set
train_set <- data_norm[train_index, ]
test_set <- data_norm[-train_index, ]
# ==================== TEST-SET PREDICTION (without CV) ====================
# Starting value for k: square root of the number of rows in `data`.
initial_k <- sqrt(NROW(data))

# Select the predictors by name instead of dropping column 10 by position, so
# the label is still excluded if the column layout of the data ever changes.
knn_features <- setdiff(names(train_set), "Churn")

# K=31 (floor of initial_k)
knn.31 <- knn(
  train = train_set[, knn_features, drop = FALSE],
  test = test_set[, knn_features, drop = FALSE],
  cl = train_set$Churn,
  k = floor(initial_k)
)
cf.31 <- confusionMatrix(factor(knn.31), factor(test_set$Churn))

# K=32 (ceiling of initial_k)
knn.32 <- knn(
  train = train_set[, knn_features, drop = FALSE],
  test = test_set[, knn_features, drop = FALSE],
  cl = train_set$Churn,
  k = ceiling(initial_k)
)
cf.32 <- confusionMatrix(factor(knn.32), factor(test_set$Churn))
# ==================== DENGAN CROSS-VALIDATION ====================
#' Fit a KNN classifier for Churn with k-fold cross-validation
#'
#' @param data Data frame with a `Churn` column; all other columns are used as
#'   predictors through the formula `Churn ~ .`.
#' @param k Number of neighbours; the single value placed in the tuning grid.
#' @param folds Number of cross-validation folds.
#' @return The fitted object returned by `train()`.
run_knn_cv <- function(data, k, folds) {
  cv_control <- trainControl(method = "cv", number = folds)
  train(
    Churn ~ .,
    data = data,
    method = "knn",
    trControl = cv_control,
    tuneGrid = data.frame(k = k)
  )
}

# CV models
# Fit on the training partition only. Fitting on `data_norm` would include the
# very rows of `test_set` that these models are scored on afterwards, so every
# test point would be its own nearest neighbour and the test metrics would be
# optimistically biased.
# NOTE: re-knit the report so the printed comparison table is refreshed.
knn_cv_31_5 <- run_knn_cv(train_set, k = floor(initial_k), folds = 5)
knn_cv_31_10 <- run_knn_cv(train_set, k = floor(initial_k), folds = 10)
knn_cv_32_5 <- run_knn_cv(train_set, k = ceiling(initial_k), folds = 5)
knn_cv_32_10 <- run_knn_cv(train_set, k = ceiling(initial_k), folds = 10)

# Predictions of each CV-tuned model on the test set
pred_cv_31_5 <- predict(object = knn_cv_31_5, newdata = test_set)
pred_cv_31_10 <- predict(object = knn_cv_31_10, newdata = test_set)
pred_cv_32_5 <- predict(object = knn_cv_32_5, newdata = test_set)
pred_cv_32_10 <- predict(object = knn_cv_32_10, newdata = test_set)

# Confusion matrices: predictions as `data`, true labels as `reference`
cf_cv_31_5 <- confusionMatrix(data = pred_cv_31_5, reference = test_set$Churn)
cf_cv_31_10 <- confusionMatrix(data = pred_cv_31_10, reference = test_set$Churn)
cf_cv_32_5 <- confusionMatrix(data = pred_cv_32_5, reference = test_set$Churn)
cf_cv_32_10 <- confusionMatrix(data = pred_cv_32_10, reference = test_set$Churn)

# ==================== EKSTRAKSI METRIK ====================
#' Collect the headline metrics of a confusion-matrix object
#'
#' @param cm Object whose `overall` element has an "Accuracy" entry and whose
#'   `byClass` element has "Precision", "Recall" and "F1" entries (the
#'   elements indexed below), e.g. a `confusionMatrix()` result.
#' @return One-row data frame with columns Akurasi, Presisi, Recall, F1_Score.
extract_metrics <- function(cm) {
  by_class <- cm$byClass
  data.frame(
    Akurasi = cm$overall["Accuracy"],
    Presisi = by_class["Precision"],
    Recall = by_class["Recall"],
    F1_Score = by_class["F1"]
  )
}

# Metrics for every KNN variant
metrics_31 <- extract_metrics(cm = cf.31)
metrics_32 <- extract_metrics(cm = cf.32)
metrics_cv_31_5 <- extract_metrics(cm = cf_cv_31_5)
metrics_cv_31_10 <- extract_metrics(cm = cf_cv_31_10)
metrics_cv_32_5 <- extract_metrics(cm = cf_cv_32_5)
metrics_cv_32_10 <- extract_metrics(cm = cf_cv_32_10)

# ==================== COMPARISON TABLE ====================
# Stack the one-row metric frames in the same order as the model labels, then
# copy each metric column into the table.
knn_metrics <- rbind(
  metrics_31, metrics_32,
  metrics_cv_31_5, metrics_cv_31_10,
  metrics_cv_32_5, metrics_cv_32_10
)
comparison_table <- data.frame(
  Model = c(
    "KNN (Tanpa CV) K=31", "KNN (Tanpa CV) K=32",
    "KNN (CV 5-Fold) K=31", "KNN (CV 10-Fold) K=31",
    "KNN (CV 5-Fold) K=32", "KNN (CV 10-Fold) K=32"
  ),
  Accuracy = knn_metrics$Akurasi,
  Precision = knn_metrics$Presisi,
  Recall = knn_metrics$Recall,
  F1_Score = knn_metrics$F1_Score
)

cat("\n=== Tabel Perbandingan Metrik KNN ===\n")
## 
## === Tabel Perbandingan Metrik KNN ===
print(comparison_table)
##                   Model  Accuracy Precision    Recall  F1_Score
## 1   KNN (Tanpa CV) K=31 0.4874372 0.4851485 0.4949495 0.4900000
## 2   KNN (Tanpa CV) K=32 0.4623116 0.4600000 0.4646465 0.4623116
## 3  KNN (CV 5-Fold) K=31 0.5226131 0.5185185 0.5656566 0.5410628
## 4 KNN (CV 10-Fold) K=31 0.5226131 0.5185185 0.5656566 0.5410628
## 5  KNN (CV 5-Fold) K=32 0.5427136 0.5357143 0.6060606 0.5687204
## 6 KNN (CV 10-Fold) K=32 0.5025126 0.5000000 0.5555556 0.5263158

===================== EVALUASI MODEL DECISION TREE ====================

# === LOAD LIBRARY ===
library(caret)
library(rpart)
library(rpart.plot)
library(dplyr)

# === LOAD DATA ===
data <- read.csv("C:/Users/ASUS/Documents/UNY/MySta/SEM 4/Kecerdasan Buatan/UTS Lana KCB/Large_Customer_Churn_Dataset.csv")

# === CONVERT CATEGORICAL COLUMNS TO FACTORS ===
# Churn is part of this list, so it needs no further as.factor() calls.
faktor_vars <- c("Gender", "Geography", "Contract", "PaymentMethod", "Churn")
data[faktor_vars] <- lapply(data[faktor_vars], as.factor)

# === DROP THE ID COLUMN ===
data_norm <- data %>% select(-CustomerID)

# === TREAT THE 0/1 FLAG 'IsActiveMember' AS A FACTOR ===
data_norm$IsActiveMember <- as.factor(data_norm$IsActiveMember)

# === STANDARDISE THE NUMERIC COLUMNS ===
# vapply() guarantees a logical vector with one entry per column.
num_cols <- vapply(data_norm, is.numeric, logical(1))
data_norm[, num_cols] <- scale(data_norm[, num_cols])

# === PARTITION THE DATA ===
set.seed(101)
train_proportion <- 0.8
# Training-row indices sampled on Churn (matrix result because list = FALSE)
train_index <- createDataPartition(
  y = data_norm$Churn,
  times = 1,
  p = train_proportion,
  list = FALSE
)
# Sampled rows are used for training, the rest for testing
train_set1 <- data_norm[train_index, ]
test_set1 <- data_norm[-train_index, ]

# === TRAIN THE DECISION TREE (WITHOUT CV) ===
# Classification tree on all predictors, scored on the test set.
model_tree <- rpart(formula = Churn ~ ., data = train_set1, method = "class")
pred_test <- predict(model_tree, newdata = test_set1, type = "class")
conf_test <- confusionMatrix(data = pred_test, reference = test_set1$Churn)

# Metrics without CV: precision is read from "Pos Pred Value", recall from
# "Sensitivity", and F1 is their harmonic mean.
accuracy <- conf_test$overall["Accuracy"]
precision <- conf_test$byClass["Pos Pred Value"]
recall <- conf_test$byClass["Sensitivity"]
f1_score <- 2 * (precision * recall) / (precision + recall)

# === 5-FOLD CV ===
# Tune an rpart tree with 5-fold CV on the training set, then score it on the
# test set.
ctrl_5 <- trainControl(method = "cv", number = 5, savePredictions = TRUE)
dt_cv5 <- train(
  Churn ~ .,
  data = train_set1,
  method = "rpart",
  trControl = ctrl_5
)
conf_test_cv5 <- confusionMatrix(
  data = predict(dt_cv5, newdata = test_set1),
  reference = test_set1$Churn
)

# Test-set metrics of the 5-fold CV model
accuracy_cv5 <- conf_test_cv5$overall["Accuracy"]
precision_cv5 <- conf_test_cv5$byClass["Pos Pred Value"]
recall_cv5 <- conf_test_cv5$byClass["Sensitivity"]
f1_cv5 <- 2 * (precision_cv5 * recall_cv5) / (precision_cv5 + recall_cv5)

# Mean accuracy over the resamples stored in the fitted object (training set)
metrics_cv5_train <- data.frame(Accuracy = mean(dt_cv5$resample$Accuracy))

# === 10-FOLD CV ===
# Tune an rpart tree with 10-fold CV on the training set, then score it on the
# test set.
ctrl_10 <- trainControl(method = "cv", number = 10, savePredictions = TRUE)
dt_cv10 <- train(
  Churn ~ .,
  data = train_set1,
  method = "rpart",
  trControl = ctrl_10
)
conf_test_cv10 <- confusionMatrix(
  data = predict(dt_cv10, newdata = test_set1),
  reference = test_set1$Churn
)

# Test-set metrics of the 10-fold CV model
accuracy_cv10 <- conf_test_cv10$overall["Accuracy"]
precision_cv10 <- conf_test_cv10$byClass["Pos Pred Value"]
recall_cv10 <- conf_test_cv10$byClass["Sensitivity"]
f1_cv10 <- 2 * (precision_cv10 * recall_cv10) / (precision_cv10 + recall_cv10)

# Mean accuracy over the resamples stored in the fitted object (training set)
metrics_cv10_train <- data.frame(Accuracy = mean(dt_cv10$resample$Accuracy))

# === COMPARISON TABLE ===
# One row per decision-tree variant, all evaluated on the test set.
# NOTE(review): this reuses the name `comparison_table` from the KNN section,
# so the KNN table is overwritten at this point.
comparison_table <- data.frame(
  Model = c(
    "Decision Tree (Tanpa CV)",
    "Decision Tree (CV 5-Fold) - Test Set",
    "Decision Tree (CV 10-Fold) - Test Set"
  ),
  Accuracy = c(accuracy, accuracy_cv5, accuracy_cv10),
  Precision = c(precision, precision_cv5, precision_cv10),
  Recall = c(recall, recall_cv5, recall_cv10),
  F1_Score = c(f1_score, f1_cv5, f1_cv10)
)

print(x = "=== Perbandingan Evaluasi di Test Set ===")
## [1] "=== Perbandingan Evaluasi di Test Set ==="
print(x = comparison_table)
##                                   Model  Accuracy Precision    Recall  F1_Score
## 1              Decision Tree (Tanpa CV) 0.5276382 0.5221239 0.5959596 0.5566038
## 2  Decision Tree (CV 5-Fold) - Test Set 0.4673367 0.4678899 0.5151515 0.4903846
## 3 Decision Tree (CV 10-Fold) - Test Set 0.4673367 0.4678899 0.5151515 0.4903846
# === MEAN CROSS-VALIDATION ACCURACY ON THE TRAINING SET ===
cat("\n=== Akurasi Rata-rata dari CV di Training Set ===\n")
## 
## === Akurasi Rata-rata dari CV di Training Set ===
cat("CV 5-Fold (Train Set):", round(metrics_cv5_train[["Accuracy"]], digits = 4), "\n")
## CV 5-Fold (Train Set): 0.5169
cat("CV 10-Fold (Train Set):", round(metrics_cv10_train[["Accuracy"]], digits = 4), "\n")
## CV 10-Fold (Train Set): 0.493

========== NAIVE BAYES ==========

# === Load Library ===
library(e1071)
library(caret)
library(dplyr)

# === Read the data ===
data <- read.csv("C:/Users/ASUS/Documents/UNY/MySta/SEM 4/Kecerdasan Buatan/UTS Lana KCB/Large_Customer_Churn_Dataset.csv")

# === Drop the ID column ===
# CustomerID only identifies the row. Left in place, every `Churn ~ .` model
# in this section would use it as a predictor (the KNN and decision-tree
# sections already remove it).
# NOTE: re-knit the report so the printed Naive Bayes table is refreshed.
data$CustomerID <- NULL

# === Convert the categorical columns to factors ===
factor_vars <- c("Gender", "Geography", "Contract", "PaymentMethod", "Churn")
data[factor_vars] <- lapply(data[factor_vars], as.factor)
data$IsActiveMember <- as.factor(data$IsActiveMember)

# === Split the data ===
# Simple random 80/20 split; seq_len() is safe even for a zero-row data frame.
set.seed(101)
Index <- sample(seq_len(nrow(data)), size = round(0.8 * nrow(data)), replace = FALSE)
train <- data[Index, ]
test <- data[-Index, ]

# ========== MODEL WITHOUT CV ==========
# Fit Naive Bayes on the training rows
NBClassifier <- naiveBayes(Churn ~ ., data = train)

# Predict on the test set and keep a copy of the true label next to it
test[["predicted"]] <- predict(NBClassifier, newdata = test)
test[["actual"]] <- test[["Churn"]]

# Confusion matrix: predictions as `data`, true labels as `reference`
cf_nb <- confusionMatrix(
  data = factor(test[["predicted"]]),
  reference = factor(test[["actual"]])
)

# ====== Evaluasi Tanpa CV ======
#' Collect the headline metrics of a confusion-matrix object
#'
#' @param cm Object whose `overall` element has an "Accuracy" entry and whose
#'   `byClass` element has "Precision", "Recall" and "F1" entries (the
#'   elements indexed below), e.g. a `confusionMatrix()` result.
#' @return One-row data frame with columns Akurasi, Presisi, Recall, F1_Score.
extract_metrics <- function(cm) {
  overall <- cm$overall
  by_class <- cm$byClass
  data.frame(
    Akurasi = overall["Accuracy"],
    Presisi = by_class["Precision"],
    Recall = by_class["Recall"],
    F1_Score = by_class["F1"]
  )
}

# Metrics of the model fitted without CV
metrics_nb <- extract_metrics(cm = cf_nb)

# ========== MODEL WITH 5-FOLD CV ==========
set.seed(101)
ctrl_5 <- trainControl(method = "cv", number = 5)

# caret "naive_bayes" model resampled with the 5-fold control object
nb_cv_5 <- train(Churn ~ ., data = train, method = "naive_bayes", trControl = ctrl_5)

# Test-set predictions of the 5-fold CV model
test[["predicted_cv_5"]] <- predict(nb_cv_5, newdata = test)

# Confusion matrix of the 5-fold CV model
cf_nb_cv_5 <- confusionMatrix(
  data = test[["predicted_cv_5"]],
  reference = test[["Churn"]]
)

# Metrics of the 5-fold CV model
metrics_nb_cv_5 <- extract_metrics(cm = cf_nb_cv_5)

# ========== MODEL WITH 10-FOLD CV ==========
set.seed(101)
ctrl_10 <- trainControl(method = "cv", number = 10)

# caret "naive_bayes" model resampled with the 10-fold control object
nb_cv_10 <- train(Churn ~ ., data = train, method = "naive_bayes", trControl = ctrl_10)

# Test-set predictions of the 10-fold CV model
test[["predicted_cv_10"]] <- predict(nb_cv_10, newdata = test)

# Confusion matrix of the 10-fold CV model
cf_nb_cv_10 <- confusionMatrix(
  data = test[["predicted_cv_10"]],
  reference = test[["Churn"]]
)

# Metrics of the 10-fold CV model
metrics_nb_cv_10 <- extract_metrics(cm = cf_nb_cv_10)

# ====== COMPARISON TABLE ======
# Stack the one-row metric frames in the same order as the model labels, then
# copy each metric column into the table.
nb_metrics <- rbind(metrics_nb, metrics_nb_cv_5, metrics_nb_cv_10)
comparison_table_nb <- data.frame(
  Model = c(
    "Naive Bayes (Tanpa CV)",
    "Naive Bayes (CV 5-Fold)",
    "Naive Bayes (CV 10-Fold)"
  ),
  Accuracy = nb_metrics$Akurasi,
  Precision = nb_metrics$Presisi,
  Recall = nb_metrics$Recall,
  F1_Score = nb_metrics$F1_Score
)

# Print the comparison table
print(x = "=== TABEL PERBANDINGAN NAIVE BAYES ===")
## [1] "=== TABEL PERBANDINGAN NAIVE BAYES ==="
print(x = comparison_table_nb)
##                      Model Accuracy Precision    Recall  F1_Score
## 1   Naive Bayes (Tanpa CV)    0.515 0.5048544 0.5306122 0.5174129
## 2  Naive Bayes (CV 5-Fold)    0.465 0.4579439 0.5000000 0.4780488
## 3 Naive Bayes (CV 10-Fold)    0.470 0.4459459 0.3367347 0.3837209
# ==================== PERSIAPAN ====================
library(caret)
library(dplyr)

# Read and prepare the data
data <- read.csv(file = "C:/Users/ASUS/Documents/UNY/MySta/SEM 4/Kecerdasan Buatan/UTS Lana KCB/Large_Customer_Churn_Dataset.csv")
var_cols <- c("Gender", "Geography", "Contract", "PaymentMethod")
# Categorical predictors become numeric factor codes; Churn becomes a factor
# with level "1" for "Yes" and "0" otherwise.
data <- data %>%
  mutate(
    across(all_of(var_cols), ~ as.numeric(as.factor(.x))),
    Churn = as.factor(as.numeric(Churn == "Yes"))
  )
data_knn <- select(data, -CustomerID)
# Standardise every column except the last one (Churn)
data_knn[, -ncol(data_knn)] <- scale(
  x = data_knn[, -ncol(data_knn)],
  center = TRUE,
  scale = TRUE
)

# Build the train/test split
set.seed(101)
train_index <- createDataPartition(y = data_knn$Churn, p = 0.8, list = FALSE)
train_set <- data_knn[train_index, ]
test_set <- data_knn[-train_index, ]

# ==================== BASELINE KNN MODEL ====================
# Train the baseline model on all features (5-fold CV, k fixed at 32)
ctrl <- trainControl(method = "cv", number = 5)
knn_full <- train(
  Churn ~ .,
  data = train_set,
  method = "knn",
  trControl = ctrl,
  tuneGrid = data.frame(k = 32)
)
# Predict on the test set and record recall with "1" as the positive class
pred_full <- predict(knn_full, newdata = test_set)
cm_full <- confusionMatrix(
  data = pred_full,
  reference = test_set$Churn,
  positive = "1"
)
recall_full <- cm_full$byClass["Recall"]

# ==================== PERMUTATION FEATURE IMPORTANCE ====================
# Every column of the training set except the label is a candidate feature.
fitur_list <- setdiff(colnames(train_set), "Churn")

# Recall lost when the test-set values of one feature are shuffled.
# Uses `knn_full`, `test_set` and `recall_full` from the enclosing script.
permuted_recall_drop <- function(feature) {
  # Copy the test data and shuffle the values of this single feature
  test_permuted <- test_set
  test_permuted[[feature]] <- sample(test_permuted[[feature]])

  # Re-predict with the shuffled feature and recompute recall for class "1"
  pred_perm <- predict(knn_full, newdata = test_permuted)
  cm_perm <- confusionMatrix(pred_perm, test_set$Churn, positive = "1")

  unname(recall_full - cm_perm$byClass["Recall"])
}

# vapply() visits the features in order, so the random draws happen in the
# same sequence as an index loop would produce, and the result column is
# numeric from the start. The per-feature temporaries stay inside the helper.
importance_results <- data.frame(
  Fitur = fitur_list,
  Recall_Drop = vapply(
    fitur_list,
    permuted_recall_drop,
    numeric(1),
    USE.NAMES = FALSE
  )
)

# Sort by recall drop, largest first (a larger drop means a more important
# feature)
importance_results <- importance_results %>%
  arrange(desc(Recall_Drop))

# Reset the row names
rownames(importance_results) <- NULL

# Show the result
cat("\n=== Fitur Terpenting untuk KNN (berdasarkan Recall Drop) ===\n")
## 
## === Fitur Terpenting untuk KNN (berdasarkan Recall Drop) ===
print(x = importance_results)
##            Fitur Recall_Drop
## 1         Gender        0.01
## 2   TotalCharges        0.01
## 3         Tenure       -0.01
## 4  PaymentMethod       -0.01
## 5 MonthlyCharges       -0.03
## 6            Age       -0.04
## 7       Contract       -0.04
## 8 IsActiveMember       -0.05
## 9      Geography       -0.08
# (Optional) Visualisation
# Horizontal bar chart of the recall drop per feature, bars ordered by the
# size of the drop.
library(ggplot2)
ggplot(
  data = importance_results,
  mapping = aes(x = reorder(Fitur, Recall_Drop), y = Recall_Drop)
) +
  geom_col(fill = "steelblue") +
  coord_flip() +
  labs(
    title = "Permutation Importance (KNN, Recall-Based)",
    x = "Fitur",
    y = "Recall Drop (Semakin besar = Semakin penting)"
  ) +
  theme_minimal()