library(dplyr)
## Warning: package 'dplyr' was built under R version 4.5.3
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
library(ggplot2)
## Warning: package 'ggplot2' was built under R version 4.5.3
library(caret)
## Warning: package 'caret' was built under R version 4.5.3
## Loading required package: lattice
library(tidyr)
# Read the raw customer table; text columns stay as character vectors.
df_churn <- read.csv("customer_churn_dataset.csv", stringsAsFactors = FALSE)

# Feature table for clustering: every column except the row identifier and
# the churn label.
df_fitur <- select(df_churn, -CustomerID, -Churn)

# Expand the feature table into an all-numeric design matrix with
# caret::dummyVars, then wrap the predicted matrix in a data frame.
dummy_model <- dummyVars(" ~ .", data = df_fitur)
df_dummy <- data.frame(predict(dummy_model, newdata = df_fitur))

# Standardise every column with scale() so that no single feature dominates
# the distance computation purely because of its units.
df_scaled <- as.data.frame(scale(df_dummy))
# Elbow search: fit k-means once per candidate number of clusters and record
# the total within-cluster sum of squares (WSS) of each fit.
set.seed(123)
k_values <- 1:10
wss_values <- numeric(length(k_values))

# Iterate by position, not by the value of k, so that wss_values stays aligned
# with k_values even if the candidate range is later changed to something that
# does not start at 1 (e.g. 2:10).
for (i in seq_along(k_values)) {
  k <- k_values[i]
  kmeans_model <- kmeans(df_scaled, centers = k, nstart = 25)
  wss_values[i] <- kmeans_model$tot.withinss
}

# WSS against K; frame.plot is spelled out in full instead of relying on
# partial argument matching of `frame`.
plot(k_values, wss_values,
     type = "b", pch = 19, col = "blue", frame.plot = FALSE,
     xlab = "K", ylab = "WSS")

# Final model: refit k-means with the selected number of clusters, re-seeding
# first so this fit is reproducible on its own.
k_optimal <- 3
set.seed(123)
kmeans_final <- kmeans(df_scaled, centers = k_optimal, nstart = 25)

# Store each row's cluster assignment on the original table as a factor.
df_churn$Cluster <- as.factor(kmeans_final$cluster)

# Project the features onto principal components. df_scaled comes out of
# scale(), so prcomp is told not to center or rescale a second time.
pca_result <- prcomp(df_scaled, center = FALSE, scale. = FALSE)

# Scores on the first two components together with the cluster labels.
df_pca <- data.frame(
  pca_result$x[, c("PC1", "PC2")],
  Cluster = df_churn$Cluster
)

# Scatter of the two components coloured by cluster, with a dashed 95%
# ellipse drawn per cluster.
ggplot(data = df_pca, mapping = aes(x = PC1, y = PC2, colour = Cluster)) +
  geom_point(alpha = 0.5) +
  stat_ellipse(level = 0.95, linetype = 2) +
  theme_minimal()

# Return the most frequent value of a vector.
#
# v: a vector (any atomic type).
# Returns a length-one vector of the same type as v. When several values are
# tied for the highest count, the one that occurs first in v is returned.
# NA is counted like any other value.
get_mode <- function(v) {
  distinct_values <- unique(v)
  # Position of each element of v within the distinct values
  positions <- match(v, distinct_values)
  # Occurrence count per distinct value, in first-seen order
  counts <- tabulate(positions, nbins = length(distinct_values))
  distinct_values[which.max(counts)]
}
# Per-cluster profile: group size, mean of each numeric behaviour column,
# churn rate expressed as a percentage, and the most frequent value of each
# categorical column (via get_mode).
profil_cluster <- summarise(
  group_by(df_churn, Cluster),
  Populasi = n(),
  Rata_Umur = mean(Age, na.rm = TRUE),
  Rata_Tenure = mean(Tenure, na.rm = TRUE),
  Rata_Usage = mean(Usage.Frequency, na.rm = TRUE),
  Rata_Support = mean(Support.Calls, na.rm = TRUE),
  Rata_Delay = mean(Payment.Delay, na.rm = TRUE),
  Rata_Spend = mean(Total.Spend, na.rm = TRUE),
  Rata_LastInteraction = mean(Last.Interaction, na.rm = TRUE),
  # Mean of Churn scaled by 100 to read as a percentage
  Persentase_Churn = mean(Churn, na.rm = TRUE) * 100,
  Modus_Gender = get_mode(Gender),
  Modus_Subscription = get_mode(Subscription.Type),
  Modus_Contract = get_mode(Contract.Length)
)

print(profil_cluster)
## # A tibble: 3 × 13
##   Cluster Populasi Rata_Umur Rata_Tenure Rata_Usage Rata_Support Rata_Delay
##   <fct>      <int>     <dbl>       <dbl>      <dbl>        <dbl>      <dbl>
## 1 1           1668      40.7        30.7       15.7         4.91       15.1
## 2 2           1648      41.1        31.6       15.3         5.04       14.8
## 3 3           1677      41.3        29.9       15.6         4.97       15.1
## # ℹ 6 more variables: Rata_Spend <dbl>, Rata_LastInteraction <dbl>,
## #   Persentase_Churn <dbl>, Modus_Gender <chr>, Modus_Subscription <chr>,
## #   Modus_Contract <chr>

Interpretasi Profil Cluster (Segmen Pelanggan)

Berdasarkan hasil pemodelan K-Means, algoritma membagi pelanggan ke dalam 3 kelompok yang ukurannya hampir sama besar (sekitar 1.648–1.677 pelanggan per kelompok). Karakteristik numerik antar-kelompok tampak sangat seragam (rata-rata umur ~41 tahun, rata-rata panggilan ke layanan dukungan ~5 kali, rata-rata keterlambatan pembayaran ~15 hari). Oleh karena itu, pembeda utama antar-segmen terletak pada variabel kategorikal, yaitu demografi (gender) serta jenis langganan dan kontraknya:

1. Segmen 1: Pengguna Premium Jangka Panjang (Pria)

2. Segmen 2: Pengguna Standard Menengah (Wanita)

3. Segmen 3: Pengguna Basic Jangka Panjang (Pria)