1. DESKRIPSI DATA

Goals 
a). Memprediksi pelanggan yang akan pergi (Churn Predicition)
b). Mengklasifikasi pelanggan VIP 
c). Memprediksi pembelian di bulan depan

#Import Library

library(tidyverse)
## Warning: package 'tidyverse' was built under R version 4.5.2
## Warning: package 'tidyr' was built under R version 4.5.2
## Warning: package 'readr' was built under R version 4.5.2
## Warning: package 'purrr' was built under R version 4.5.2
## Warning: package 'dplyr' was built under R version 4.5.2
## Warning: package 'stringr' was built under R version 4.5.2
## Warning: package 'forcats' was built under R version 4.5.2
## Warning: package 'lubridate' was built under R version 4.5.2
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## âś” dplyr     1.1.4     âś” readr     2.1.6
## âś” forcats   1.0.1     âś” stringr   1.6.0
## âś” ggplot2   4.0.0     âś” tibble    3.3.0
## âś” lubridate 1.9.4     âś” tidyr     1.3.1
## âś” purrr     1.2.0     
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## âś– dplyr::filter() masks stats::filter()
## âś– dplyr::lag()    masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(lubridate)
library(caret)
## Warning: package 'caret' was built under R version 4.5.2
## Loading required package: lattice
## 
## Attaching package: 'caret'
## 
## The following object is masked from 'package:purrr':
## 
##     lift
library(corrplot)
## Warning: package 'corrplot' was built under R version 4.5.2
## corrplot 0.95 loaded
library(randomForest)
## Warning: package 'randomForest' was built under R version 4.5.2
## randomForest 4.7-1.2
## Type rfNews() to see new features/changes/bug fixes.
## 
## Attaching package: 'randomForest'
## 
## The following object is masked from 'package:dplyr':
## 
##     combine
## 
## The following object is masked from 'package:ggplot2':
## 
##     margin

#Load data

dfo_retail <- readr::read_csv2("online_retail.csv", locale = readr::locale(decimal_mark = ","))
## Rows: 541909 Columns: 8
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ";"
## chr (5): InvoiceNo, StockCode, Description, InvoiceDate, Country
## dbl (3): Quantity, UnitPrice, CustomerID
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.

Cek struktur awal data

str(dfo_retail)
## spc_tbl_ [541,909 Ă— 8] (S3: spec_tbl_df/tbl_df/tbl/data.frame)
##  $ InvoiceNo  : chr [1:541909] "536365" "536365" "536365" "536365" ...
##  $ StockCode  : chr [1:541909] "85123A" "71053" "84406B" "84029G" ...
##  $ Description: chr [1:541909] "WHITE HANGING HEART T-LIGHT HOLDER" "WHITE METAL LANTERN" "CREAM CUPID HEARTS COAT HANGER" "KNITTED UNION FLAG HOT WATER BOTTLE" ...
##  $ Quantity   : num [1:541909] 6 6 8 6 6 2 6 6 6 32 ...
##  $ InvoiceDate: chr [1:541909] "01/12/2010 08:26" "01/12/2010 08:26" "01/12/2010 08:26" "01/12/2010 08:26" ...
##  $ UnitPrice  : num [1:541909] 2.55 3.39 2.75 3.39 3.39 7.65 4.25 1.85 1.85 1.69 ...
##  $ CustomerID : num [1:541909] 17850 17850 17850 17850 17850 ...
##  $ Country    : chr [1:541909] "United Kingdom" "United Kingdom" "United Kingdom" "United Kingdom" ...
##  - attr(*, "spec")=
##   .. cols(
##   ..   InvoiceNo = col_character(),
##   ..   StockCode = col_character(),
##   ..   Description = col_character(),
##   ..   Quantity = col_double(),
##   ..   InvoiceDate = col_character(),
##   ..   UnitPrice = col_double(),
##   ..   CustomerID = col_double(),
##   ..   Country = col_character()
##   .. )
##  - attr(*, "problems")=<externalptr>

Parse Invoide Date

library(stringr)
dfo_retail <- dfo_retail %>%
  mutate(
    InvoiceDate = as.character(InvoiceDate),
    InvoiceDate = str_squish(InvoiceDate),
    InvoiceDate = parse_date_time(
      InvoiceDate,
      orders = c("d/m/Y H:M", "d-m-Y H:M", 
                 "Y-m-d H:M:S", "Y/m/d H:M:S")
    )
  )
sum(is.na(dfo_retail$InvoiceDate))
## [1] 0
head(dfo_retail$InvoiceDate)
## [1] "2010-12-01 08:26:00 UTC" "2010-12-01 08:26:00 UTC"
## [3] "2010-12-01 08:26:00 UTC" "2010-12-01 08:26:00 UTC"
## [5] "2010-12-01 08:26:00 UTC" "2010-12-01 08:26:00 UTC"
summary(dfo_retail$InvoiceDate)
##                  Min.               1st Qu.                Median 
## "2010-12-01 08:26:00" "2011-03-28 11:34:00" "2011-07-19 17:17:00" 
##                  Mean               3rd Qu.                  Max. 
## "2011-07-04 13:34:57" "2011-10-19 11:27:00" "2011-12-09 12:50:00"

2. DATA CLEANING

# konversi tipe data
dfo_retail <- dfo_retail %>%
  mutate(
    UnitPrice = as.character(UnitPrice),
    UnitPrice = gsub(",", ".", UnitPrice),    
    UnitPrice = trimws(UnitPrice),
    UnitPrice = as.numeric(UnitPrice),         # jadi NA bila masih non-numeric
    Quantity  = as.numeric(Quantity),
    CustomerID = as.character(CustomerID)      # simpan sebagai char dulu
  )
summary(dfo_retail$UnitPrice)
##       Min.    1st Qu.     Median       Mean    3rd Qu.       Max. 
## -11062.060      1.250      2.080      4.611      4.130  38970.000
summary(dfo_retail$Quantity)
##       Min.    1st Qu.     Median       Mean    3rd Qu.       Max. 
## -80995.000      1.000      3.000      9.552     10.000  80995.000
# Menangani invalid
df_retail <- dfo_retail %>%
  filter(!is.na(CustomerID),
         !is.na(InvoiceDate),
         !is.na(UnitPrice),
         !is.na(Quantity),
         UnitPrice > 0,
         Quantity != 0)
# Jika invoice cancel ditandai dengan 'C' di InvoiceNo, kamu bisa hapus baris tersebut:
if("InvoiceNo" %in% names(df_retail)) {
  df_retail <- df_retail %>% filter(!str_starts(as.character(InvoiceNo), "C"))
}
dim(df_retail)
## [1] 397884      8
sum(is.na(df_retail$InvoiceDate))
## [1] 0
##cek duplikasi
duplicates <- sum(duplicated(df_retail))
print(duplicates)
## [1] 5192
##hapus duplikasi
df_retail <- df_retail %>%
  distinct()
##cek duplikasi lagi
sum(duplicated(df_retail))
## [1] 0
str(df_retail)
## tibble [392,692 Ă— 8] (S3: tbl_df/tbl/data.frame)
##  $ InvoiceNo  : chr [1:392692] "536365" "536365" "536365" "536365" ...
##  $ StockCode  : chr [1:392692] "85123A" "71053" "84406B" "84029G" ...
##  $ Description: chr [1:392692] "WHITE HANGING HEART T-LIGHT HOLDER" "WHITE METAL LANTERN" "CREAM CUPID HEARTS COAT HANGER" "KNITTED UNION FLAG HOT WATER BOTTLE" ...
##  $ Quantity   : num [1:392692] 6 6 8 6 6 2 6 6 6 32 ...
##  $ InvoiceDate: POSIXct[1:392692], format: "2010-12-01 08:26:00" "2010-12-01 08:26:00" ...
##  $ UnitPrice  : num [1:392692] 2.55 3.39 2.75 3.39 3.39 7.65 4.25 1.85 1.85 1.69 ...
##  $ CustomerID : chr [1:392692] "17850" "17850" "17850" "17850" ...
##  $ Country    : chr [1:392692] "United Kingdom" "United Kingdom" "United Kingdom" "United Kingdom" ...
summary(df_retail)
##   InvoiceNo          StockCode         Description           Quantity       
##  Length:392692      Length:392692      Length:392692      Min.   :    1.00  
##  Class :character   Class :character   Class :character   1st Qu.:    2.00  
##  Mode  :character   Mode  :character   Mode  :character   Median :    6.00  
##                                                           Mean   :   13.12  
##                                                           3rd Qu.:   12.00  
##                                                           Max.   :80995.00  
##   InvoiceDate                    UnitPrice         CustomerID       
##  Min.   :2010-12-01 08:26:00   Min.   :   0.001   Length:392692     
##  1st Qu.:2011-04-07 11:12:00   1st Qu.:   1.250   Class :character  
##  Median :2011-07-31 12:02:00   Median :   1.950   Mode  :character  
##  Mean   :2011-07-10 19:13:07   Mean   :   3.126                     
##  3rd Qu.:2011-10-20 12:53:00   3rd Qu.:   3.750                     
##  Max.   :2011-12-09 12:50:00   Max.   :8142.750                     
##    Country         
##  Length:392692     
##  Class :character  
##  Mode  :character  
##                    
##                    
## 

3. MENETUKAN VAR X DAN VAR Y

Untuk semua goals, perlu membuat variabel dari data transaksi. Variabel Independen (X) yang diperoleh dari analisis RFM: - Keterkinian: Hari sejak pembelian terakhir - Frekuensi: Jumlah pembelian - Moneter: Total pengeluaran

#Hitung RFM
max_tanggal <- max(df_retail$InvoiceDate, na.rm = TRUE) + days(1)

data_rfm <- df_retail %>%
  group_by(CustomerID) %>%
  summarise(
    LastPurchase = max(InvoiceDate, na.rm = TRUE),
    Recency   = as.numeric(difftime(max_tanggal, LastPurchase, units = "days")),
    Frequency = n_distinct(InvoiceNo),
    Monetary = sum(Quantity * UnitPrice, na.rm = TRUE),
    .groups = "drop"
  )
#Bersihkan nilai aneh
data_rfm <- data_rfm %>%
  mutate(
    Recency = ifelse(is.infinite(Recency) | is.nan(Recency), NA, Recency),
    Monetary = ifelse(is.nan(Monetary), 0, Monetary)
  )
#cek
summary(data_rfm$Recency)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##    1.00   18.07   51.09   93.05  142.73  374.12
sum(is.na(data_rfm$Recency))
## [1] 0

Mendefinisikan Y untuk tiap goals

#1.Prediksi Churn: Churn jika Recency > 90 hari (1 jika churn, 0 sebaliknya)
data_rfm <- data_rfm %>%
  mutate(Churn = ifelse(!is.na(Recency) & Recency > 90, 1, 0))

#2. Klasifikasi Pelanggan Bernilai Tinggi: Nilai tinggi jika Moneter di atas 20% (1 jika bernilai tinggi, 0 sebaliknya)
batas_monetary <- quantile(data_rfm$Monetary, 0.8, na.rm = TRUE)
data_rfm <- data_rfm %>%
  mutate(HighValue = ifelse(!is.na(Monetary) & Monetary >= batas_monetary, 1, 0))

#3. Peluang Pembelian: Prediksi pembelian bulan depan jika Recency <= 30 hari (1 jika ya, 0 sebaliknya)
data_rfm <- data_rfm %>%
  mutate(PurchaseNextMonth = ifelse(!is.na(Recency) & Recency <= 30, 1, 0))
summary(data_rfm)
##   CustomerID         LastPurchase                    Recency      
##  Length:4338        Min.   :2010-12-01 09:53:00   Min.   :  1.00  
##  Class :character   1st Qu.:2011-07-20 19:18:00   1st Qu.: 18.07  
##  Mode  :character   Median :2011-10-20 10:40:30   Median : 51.09  
##                     Mean   :2011-09-08 11:38:59   Mean   : 93.05  
##                     3rd Qu.:2011-11-22 11:05:45   3rd Qu.:142.73  
##                     Max.   :2011-12-09 12:50:00   Max.   :374.12  
##    Frequency          Monetary             Churn          HighValue     
##  Min.   :  1.000   Min.   :     3.75   Min.   :0.0000   Min.   :0.0000  
##  1st Qu.:  1.000   1st Qu.:   306.48   1st Qu.:0.0000   1st Qu.:0.0000  
##  Median :  2.000   Median :   668.57   Median :0.0000   Median :0.0000  
##  Mean   :  4.272   Mean   :  2048.69   Mean   :0.3361   Mean   :0.2001  
##  3rd Qu.:  5.000   3rd Qu.:  1660.60   3rd Qu.:1.0000   3rd Qu.:0.0000  
##  Max.   :209.000   Max.   :280206.02   Max.   :1.0000   Max.   :1.0000  
##  PurchaseNextMonth
##  Min.   :0.0000   
##  1st Qu.:0.0000   
##  Median :0.0000   
##  Mean   :0.3704   
##  3rd Qu.:1.0000   
##  Max.   :1.0000
sum(is.na(data_rfm$Recency))
## [1] 0
sum(is.na(data_rfm$Monetary))
## [1] 0

Variabel X : Recency, Frequency, Monetary Variabel Y : Churn, HighValue, PurchaseNextMonth

4. EXPLORATORY DATA ANALYSIS (EDA)

A. Statistika Deskriptif

library(psych)
## Warning: package 'psych' was built under R version 4.5.2
## 
## Attaching package: 'psych'
## The following object is masked from 'package:randomForest':
## 
##     outlier
## The following objects are masked from 'package:ggplot2':
## 
##     %+%, alpha
describe(data_rfm %>% select(Recency, Frequency, Monetary))
##           vars    n    mean      sd median trimmed    mad  min       max
## Recency      1 4338   93.05  100.01  51.09   76.30  59.47 1.00    374.12
## Frequency    2 4338    4.27    7.70   2.00    2.90   1.48 1.00    209.00
## Monetary     3 4338 2048.69 8985.23 668.57  974.58 688.81 3.75 280206.02
##               range  skew kurtosis     se
## Recency      373.12  1.24     0.43   1.52
## Frequency    208.00 12.06   248.65   0.12
## Monetary  280202.27 19.33   477.91 136.42
library(psych)
describe(data_rfm %>% select(Churn, HighValue, PurchaseNextMonth))
##                   vars    n mean   sd median trimmed mad min max range skew
## Churn                1 4338 0.34 0.47      0    0.30   0   0   1     1 0.69
## HighValue            2 4338 0.20 0.40      0    0.13   0   0   1     1 1.50
## PurchaseNextMonth    3 4338 0.37 0.48      0    0.34   0   0   1     1 0.54
##                   kurtosis   se
## Churn                -1.52 0.01
## HighValue             0.25 0.01
## PurchaseNextMonth    -1.71 0.01
summary(data_rfm$Recency)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##    1.00   18.07   51.09   93.05  142.73  374.12

B. Korelasi

cor_matrix <- cor(data_rfm %>% select(Recency, Frequency, Monetary))
corrplot(cor_matrix, method = "circle")

Insight: Recency berkorelasi negatif dengan Frequency/Monetary (pelanggan terbaru belanja lebih sering/banyak), Frequency dan Monetary berkorelasi positif.

C. Visualisasi Distribusi

# Histogram of Recency
ggplot(data_rfm, aes(x = Recency)) +
  geom_histogram(binwidth = 10, fill = "maroon", alpha = 0.7) +
  labs(title = "Distribution of Recency", x = "Recency (days)", y = "Count")

Insight: Menunjukkan grafik ini skewed right (menceng ke kanan). Mayoritas pelanggan baru saja bertransaksi dalam waktu dekat (kurang dari 50 hari), menandakan basis pelanggan yang saat ini sedang aktif.

# Histogram of Frequency
ggplot(data_rfm, aes(x = Frequency)) +
  geom_histogram(binwidth = 1, fill = "darkseagreen", alpha = 0.7) +
  coord_cartesian(xlim = c(0,100)) + # BATAS X SAMPAI 100
  labs(title = "Distribution of Frequency", x = "Frequency", y = "Count") +
  theme_minimal()

Insight: Hampir seluruh pelanggan hanya melakukan pembelian satu kali (one-time buyers), mengindikasikan masalah serius pada loyalitas dan retensi pelanggan.

# Histogram of Monetary
ggplot(data_rfm, aes(x = Monetary)) +
  geom_histogram(binwidth = 2, fill = "burlywood", alpha = 0.7) +
  coord_cartesian(xlim = c(0,2500)) + # BATAS X SAMPAI 50
  labs(title = "Distribution of Monetary", x = "Monetary", y = "Count") + 
  theme_minimal()

Insight: Kebanyakan pelanggan membelanjakan uang dalam nominal kecil hingga menengah, namun terdapat sebagian kecil pelanggan bernilai tinggi (whales) yang tersebar di ekor kanan grafik.

#Bar Plot for Churn
ggplot(data_rfm, aes(x = factor(Churn))) +
  geom_bar(fill = "darksalmon", alpha = 0.7) +
  labs(title = "Churn Distribution", x = "Churn (0=No, 1=Yes)", y = "Count")

Insight : umlah pelanggan yang tidak Churn (0) lebih banyak—sekitar dua kali lipat—dibandingkan pelanggan yang Churn (1).

# Bar Plot for High Value
ggplot(data_rfm, aes(x = factor(HighValue))) +
  geom_bar(fill = "bisque3", alpha = 0.7) +
  labs(title = "High-Value Customer Distribution", x = "High-Value (0=No, 1=Yes)", y = "Count")

Insight: Terlihat ketimpangan yang cukup signifikan. Jumlah pelanggan biasa (0) mencapai sekitar 3.500, sedangkan pelanggan bernilai tinggi (1) hanya berada di kisaran 900-an.

# Bar Plot for Purcase Next Month
ggplot(data_rfm, aes(x = factor(PurchaseNextMonth))) +
  geom_bar(fill = "pink3", alpha = 0.7) +
  labs(title = "Next Month Purchase Distribution", x = "Next Month Purchase (0=No, 1=Yes)", y = "Count")

Insight: Mayoritas pelanggan (batang 0, sekitar 2.700 orang) diprediksi tidak akan melakukan pembelian bulan depan, dibandingkan dengan mereka yang diprediksi akan membeli (batang 1, sekitar 1.600 orang).

summary(data_rfm$PurchaseNextMonth)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##  0.0000  0.0000  0.0000  0.3704  1.0000  1.0000
table(data_rfm$PurchaseNextMonth, useNA = "ifany")
## 
##    0    1 
## 2731 1607
unique(data_rfm$PurchaseNextMonth)
## [1] 0 1
str(data_rfm$PurchaseNextMonth)
##  num [1:4338] 0 1 0 1 0 0 0 0 0 1 ...

5. MODELLING DATA

#Untuk setiap tujuan, kami akan menggunakan Random Forest untuk klasifikasi (cocok untuk data yang tidak seimbang dan pentingnya fitur).

A. Churn Prediction

# Ubah factor 0/1 menjadi No/Yes agar valid di R
# Asumsi: 0 = Tidak Churn, 1 = Churn
data_rfm$Churn <- factor(data_rfm$Churn, 
                         levels = c("0", "1"), 
                         labels = c("No", "Yes"))
# Cek apakah sudah berubah jadi huruf
table(data_rfm$Churn)
## 
##   No  Yes 
## 2880 1458
# pastikan faktor
data_rfm$Churn <- as.factor(data_rfm$Churn)

#FEATURE ENGINEERING
# AOV: Rata-rata belanja per kunjungan
data_rfm$AOV <- data_rfm$Monetary / ifelse(data_rfm$Frequency == 0, 1, data_rfm$Frequency)
# Log Transform: Menormalkan data uang yang jomplang
data_rfm$Log_Monetary <- log(data_rfm$Monetary + 1)

# Cek keseimbangan kelas (Penting untuk melihat seberapa parah imbalance-nya)
table(data_rfm$Churn)
## 
##   No  Yes 
## 2880 1458
set.seed(42)
# Split data
indeks_churn <- createDataPartition(data_rfm$Churn, p = 0.8, list = FALSE)
churn_train <- data_rfm[indeks_churn, ]
churn_test <- data_rfm[-indeks_churn, ]

# 2. KONTROL TRAINING (Rahasia Kappa Tinggi)
# Gunakan sampling = "up" (Up-sampling) atau "smote"
# Ini memaksa model belajar mengenali kelas minoritas dengan lebih adil.
kontrol <- trainControl(
  method = "cv", 
  number = 5,
  sampling = "up",  # INI KUNCINYA
  classProbs = TRUE, # Diperlukan jika ingin melihat probabilitas
  verboseIter = TRUE
)

# Tuning parameter mtry dan ntree yang lebih kecil
grid <- expand.grid(mtry = c(1, 2, 3))
# 4. TRAINING MODEL
# Prediksi Churn hanya berdasarkan kebiasaan belanja (F & M)
set.seed(42)
churn_model <- train(
  Churn ~ Frequency + Monetary + AOV + Log_Monetary,
  data = churn_train,
  method = "rf",
  trControl = kontrol,
  tuneGrid = grid,
  ntree = 200,      # Tambah pohon sedikit agar lebih stabil
  nodesize = 5      # Mencegah overfitting (pohon tidak terlalu rimbun)
)
## + Fold1: mtry=1 
## - Fold1: mtry=1 
## + Fold1: mtry=2 
## - Fold1: mtry=2 
## + Fold1: mtry=3 
## - Fold1: mtry=3 
## + Fold2: mtry=1 
## - Fold2: mtry=1 
## + Fold2: mtry=2 
## - Fold2: mtry=2 
## + Fold2: mtry=3 
## - Fold2: mtry=3 
## + Fold3: mtry=1 
## - Fold3: mtry=1 
## + Fold3: mtry=2 
## - Fold3: mtry=2 
## + Fold3: mtry=3 
## - Fold3: mtry=3 
## + Fold4: mtry=1 
## - Fold4: mtry=1 
## + Fold4: mtry=2 
## - Fold4: mtry=2 
## + Fold4: mtry=3 
## - Fold4: mtry=3 
## + Fold5: mtry=1 
## - Fold5: mtry=1 
## + Fold5: mtry=2 
## - Fold5: mtry=2 
## + Fold5: mtry=3 
## - Fold5: mtry=3 
## Aggregating results
## Selecting tuning parameters
## Fitting mtry = 1 on full training set
# 5. Evaluasi
churn_prediksi <- predict(churn_model, churn_test)

# Confusion Matrix
# Pastikan positive class-nya benar ("Yes" = Churn)
confusionMatrix(churn_prediksi, churn_test$Churn, positive = "Yes")
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction  No Yes
##        No  339  63
##        Yes 237 228
##                                           
##                Accuracy : 0.654           
##                  95% CI : (0.6213, 0.6857)
##     No Information Rate : 0.6644          
##     P-Value [Acc > NIR] : 0.7534          
##                                           
##                   Kappa : 0.3241          
##                                           
##  Mcnemar's Test P-Value : <2e-16          
##                                           
##             Sensitivity : 0.7835          
##             Specificity : 0.5885          
##          Pos Pred Value : 0.4903          
##          Neg Pred Value : 0.8433          
##              Prevalence : 0.3356          
##          Detection Rate : 0.2630          
##    Detection Prevalence : 0.5363          
##       Balanced Accuracy : 0.6860          
##                                           
##        'Positive' Class : Yes             
## 

B. High Value Customer Classification

# 1. Pastikan Recency, Frequency, Monetary ada
# Hapus kolom score/hasil perhitungan lama agar bersih
data_rfm$R_Score <- NULL
data_rfm$F_Score <- NULL
data_rfm$M_Score <- NULL
data_rfm$RFM_Score <- NULL
# 2. Buat Ulang Target (High Value = Top 30% berdasarkan R, F, M)
# Hitung score manual dulu hanya untuk LABELING (Kunci Jawaban)
r_score <- ntile(-data_rfm$Recency, 5) # Negatif karena makin kecil makin bagus
f_score <- ntile(data_rfm$Frequency, 5)
m_score <- ntile(data_rfm$Monetary, 5)
total_score <- r_score + f_score + m_score
# Tentukan batas atas (Top 30%)
batas <- quantile(total_score, 0.70)
data_rfm$HighValue <- ifelse(total_score >= batas, "Yes", "No") # Gunakan Yes/No agar lebih jelas
data_rfm$HighValue <- factor(data_rfm$HighValue, levels = c("No", "Yes"))
# 3. FEATURE ENGINEERING (Kunci menaikkan Kappa)
# Buat fitur turunan agar model lebih pintar membaca pola
data_rfm$Log_Frequency <- log(data_rfm$Frequency + 1) # Menormalkan distribusi
data_rfm$Interaction_RF <- data_rfm$Recency * data_rfm$Frequency # Interaksi R dan F

# Cek proporsi data (biasanya Yes lebih sedikit)
table(data_rfm$HighValue)
## 
##   No  Yes 
## 2746 1592
set.seed(42)

# 1. Split Data
indeks_hv <- createDataPartition(data_rfm$HighValue, p = 0.8, list = FALSE)
hv_train <- data_rfm[indeks_hv, ]
hv_test  <- data_rfm[-indeks_hv, ]

# 2. Kontrol Training dengan SAMPLING
# 'sampling = "up"' akan menduplikasi data minoritas (Yes) agar model tidak bias ke 'No'
kontrol_hv <- trainControl(
  method = "cv", 
  number = 5,
  sampling = "up",  # INI KUNCINYA AGAR KAPPA NAIK
  verboseIter = TRUE
)
# 3. Training Model
# Perhatikan formulanya: TIDAK ADA 'Monetary'.
# Kita memprediksi HighValue hanya dari perilaku kunjungan (Recency & Frequency)
hv_model <- train(
  HighValue ~ Recency + Frequency + Log_Frequency + Interaction_RF,
  data = hv_train,
  method = "rf",
  trControl = kontrol_hv,
  ntree = 200,      # Jumlah pohon
  tuneLength = 5    # Biarkan Caret mencari mtry terbaik otomatis
)
## note: only 3 unique complexity parameters in default grid. Truncating the grid to 3 .
## 
## + Fold1: mtry=2 
## - Fold1: mtry=2 
## + Fold1: mtry=3 
## - Fold1: mtry=3 
## + Fold1: mtry=4 
## - Fold1: mtry=4 
## + Fold2: mtry=2 
## - Fold2: mtry=2 
## + Fold2: mtry=3 
## - Fold2: mtry=3 
## + Fold2: mtry=4 
## - Fold2: mtry=4 
## + Fold3: mtry=2 
## - Fold3: mtry=2 
## + Fold3: mtry=3 
## - Fold3: mtry=3 
## + Fold3: mtry=4 
## - Fold3: mtry=4 
## + Fold4: mtry=2 
## - Fold4: mtry=2 
## + Fold4: mtry=3 
## - Fold4: mtry=3 
## + Fold4: mtry=4 
## - Fold4: mtry=4 
## + Fold5: mtry=2 
## - Fold5: mtry=2 
## + Fold5: mtry=3 
## - Fold5: mtry=3 
## + Fold5: mtry=4 
## - Fold5: mtry=4 
## Aggregating results
## Selecting tuning parameters
## Fitting mtry = 2 on full training set
# 4. Evaluasi
hv_prediksi <- predict(hv_model, hv_test)

# Tampilkan hasil
print(hv_model)
## Random Forest 
## 
## 3471 samples
##    4 predictor
##    2 classes: 'No', 'Yes' 
## 
## No pre-processing
## Resampling: Cross-Validated (5 fold) 
## Summary of sample sizes: 2777, 2777, 2777, 2776, 2777 
## Addtional sampling using up-sampling
## 
## Resampling results across tuning parameters:
## 
##   mtry  Accuracy   Kappa    
##   2     0.9423797  0.8769491
##   3     0.9311438  0.8518798
##   4     0.9297037  0.8486802
## 
## Accuracy was used to select the optimal model using the largest value.
## The final value used for the model was mtry = 2.
confusionMatrix(hv_prediksi, hv_test$HighValue, positive = "Yes")
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction  No Yes
##        No  515  19
##        Yes  34 299
##                                           
##                Accuracy : 0.9389          
##                  95% CI : (0.9208, 0.9539)
##     No Information Rate : 0.6332          
##     P-Value [Acc > NIR] : < 2e-16         
##                                           
##                   Kappa : 0.8697          
##                                           
##  Mcnemar's Test P-Value : 0.05447         
##                                           
##             Sensitivity : 0.9403          
##             Specificity : 0.9381          
##          Pos Pred Value : 0.8979          
##          Neg Pred Value : 0.9644          
##              Prevalence : 0.3668          
##          Detection Rate : 0.3449          
##    Detection Prevalence : 0.3841          
##       Balanced Accuracy : 0.9392          
##                                           
##        'Positive' Class : Yes             
## 

C. Purchase Probability (Next Month Purchase)

# Hapus var yang rusak
data_rfm$PurchaseNextMonth <- NULL

buat ulang PurchaseNextMonth

summary(data_rfm$Recency)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##    1.00   18.07   51.09   93.05  142.73  374.12
data_rfm$PurchaseNextMonth <- ifelse(data_rfm$Recency <= 50, "Yes", "No")
data_rfm$PurchaseNextMonth <- factor(data_rfm$PurchaseNextMonth, levels = c("No","Yes"))

# cek apakah sudah ada dua kelas
table(data_rfm$PurchaseNextMonth)
## 
##   No  Yes 
## 2213 2125
prop.table(table(data_rfm$PurchaseNextMonth))
## 
##        No       Yes 
## 0.5101429 0.4898571
# Modelling 
set.seed(123)
indeks_purchase <- createDataPartition(data_rfm$PurchaseNextMonth, p = 0.8, list = FALSE)
purchase_train <- data_rfm[indeks_purchase, ]
purchase_test <- data_rfm[-indeks_purchase, ]
purchase_model <- randomForest(
  PurchaseNextMonth ~ Frequency + Monetary,
  data = purchase_train,
  ntree = 150,
  nodesize = 10, # Mencegah pohon terlalu dalam (mengurangi overfitting)
  maxnodes = 20 #Membatasi kompleksitas pohon
)

purchase_pred <- predict(purchase_model, purchase_test)

confusionMatrix(purchase_pred, purchase_test$PurchaseNextMonth)
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction  No Yes
##        No  331 178
##        Yes 111 247
##                                          
##                Accuracy : 0.6667         
##                  95% CI : (0.6342, 0.698)
##     No Information Rate : 0.5098         
##     P-Value [Acc > NIR] : < 2.2e-16      
##                                          
##                   Kappa : 0.331          
##                                          
##  Mcnemar's Test P-Value : 0.0001035      
##                                          
##             Sensitivity : 0.7489         
##             Specificity : 0.5812         
##          Pos Pred Value : 0.6503         
##          Neg Pred Value : 0.6899         
##              Prevalence : 0.5098         
##          Detection Rate : 0.3818         
##    Detection Prevalence : 0.5871         
##       Balanced Accuracy : 0.6650         
##                                          
##        'Positive' Class : No             
##