Goals
a). Memprediksi pelanggan yang akan pergi (Churn Predicition)
b). Mengklasifikasi pelanggan VIP
c). Memprediksi pembelian di bulan depan
#Import Library
library(tidyverse)
## Warning: package 'tidyverse' was built under R version 4.5.2
## Warning: package 'tidyr' was built under R version 4.5.2
## Warning: package 'readr' was built under R version 4.5.2
## Warning: package 'purrr' was built under R version 4.5.2
## Warning: package 'dplyr' was built under R version 4.5.2
## Warning: package 'stringr' was built under R version 4.5.2
## Warning: package 'forcats' was built under R version 4.5.2
## Warning: package 'lubridate' was built under R version 4.5.2
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## âś” dplyr 1.1.4 âś” readr 2.1.6
## âś” forcats 1.0.1 âś” stringr 1.6.0
## âś” ggplot2 4.0.0 âś” tibble 3.3.0
## âś” lubridate 1.9.4 âś” tidyr 1.3.1
## âś” purrr 1.2.0
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## âś– dplyr::filter() masks stats::filter()
## âś– dplyr::lag() masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(lubridate)
library(caret)
## Warning: package 'caret' was built under R version 4.5.2
## Loading required package: lattice
##
## Attaching package: 'caret'
##
## The following object is masked from 'package:purrr':
##
## lift
library(corrplot)
## Warning: package 'corrplot' was built under R version 4.5.2
## corrplot 0.95 loaded
library(randomForest)
## Warning: package 'randomForest' was built under R version 4.5.2
## randomForest 4.7-1.2
## Type rfNews() to see new features/changes/bug fixes.
##
## Attaching package: 'randomForest'
##
## The following object is masked from 'package:dplyr':
##
## combine
##
## The following object is masked from 'package:ggplot2':
##
## margin
#Load data
dfo_retail <- readr::read_csv2("online_retail.csv", locale = readr::locale(decimal_mark = ","))
## Rows: 541909 Columns: 8
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ";"
## chr (5): InvoiceNo, StockCode, Description, InvoiceDate, Country
## dbl (3): Quantity, UnitPrice, CustomerID
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
Cek struktur awal data
str(dfo_retail)
## spc_tbl_ [541,909 Ă— 8] (S3: spec_tbl_df/tbl_df/tbl/data.frame)
## $ InvoiceNo : chr [1:541909] "536365" "536365" "536365" "536365" ...
## $ StockCode : chr [1:541909] "85123A" "71053" "84406B" "84029G" ...
## $ Description: chr [1:541909] "WHITE HANGING HEART T-LIGHT HOLDER" "WHITE METAL LANTERN" "CREAM CUPID HEARTS COAT HANGER" "KNITTED UNION FLAG HOT WATER BOTTLE" ...
## $ Quantity : num [1:541909] 6 6 8 6 6 2 6 6 6 32 ...
## $ InvoiceDate: chr [1:541909] "01/12/2010 08:26" "01/12/2010 08:26" "01/12/2010 08:26" "01/12/2010 08:26" ...
## $ UnitPrice : num [1:541909] 2.55 3.39 2.75 3.39 3.39 7.65 4.25 1.85 1.85 1.69 ...
## $ CustomerID : num [1:541909] 17850 17850 17850 17850 17850 ...
## $ Country : chr [1:541909] "United Kingdom" "United Kingdom" "United Kingdom" "United Kingdom" ...
## - attr(*, "spec")=
## .. cols(
## .. InvoiceNo = col_character(),
## .. StockCode = col_character(),
## .. Description = col_character(),
## .. Quantity = col_double(),
## .. InvoiceDate = col_character(),
## .. UnitPrice = col_double(),
## .. CustomerID = col_double(),
## .. Country = col_character()
## .. )
## - attr(*, "problems")=<externalptr>
Parse Invoide Date
library(stringr)
dfo_retail <- dfo_retail %>%
mutate(
InvoiceDate = as.character(InvoiceDate),
InvoiceDate = str_squish(InvoiceDate),
InvoiceDate = parse_date_time(
InvoiceDate,
orders = c("d/m/Y H:M", "d-m-Y H:M",
"Y-m-d H:M:S", "Y/m/d H:M:S")
)
)
sum(is.na(dfo_retail$InvoiceDate))
## [1] 0
head(dfo_retail$InvoiceDate)
## [1] "2010-12-01 08:26:00 UTC" "2010-12-01 08:26:00 UTC"
## [3] "2010-12-01 08:26:00 UTC" "2010-12-01 08:26:00 UTC"
## [5] "2010-12-01 08:26:00 UTC" "2010-12-01 08:26:00 UTC"
summary(dfo_retail$InvoiceDate)
## Min. 1st Qu. Median
## "2010-12-01 08:26:00" "2011-03-28 11:34:00" "2011-07-19 17:17:00"
## Mean 3rd Qu. Max.
## "2011-07-04 13:34:57" "2011-10-19 11:27:00" "2011-12-09 12:50:00"
# konversi tipe data
dfo_retail <- dfo_retail %>%
mutate(
UnitPrice = as.character(UnitPrice),
UnitPrice = gsub(",", ".", UnitPrice),
UnitPrice = trimws(UnitPrice),
UnitPrice = as.numeric(UnitPrice), # jadi NA bila masih non-numeric
Quantity = as.numeric(Quantity),
CustomerID = as.character(CustomerID) # simpan sebagai char dulu
)
summary(dfo_retail$UnitPrice)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## -11062.060 1.250 2.080 4.611 4.130 38970.000
summary(dfo_retail$Quantity)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## -80995.000 1.000 3.000 9.552 10.000 80995.000
# Menangani invalid
df_retail <- dfo_retail %>%
filter(!is.na(CustomerID),
!is.na(InvoiceDate),
!is.na(UnitPrice),
!is.na(Quantity),
UnitPrice > 0,
Quantity != 0)
# Jika invoice cancel ditandai dengan 'C' di InvoiceNo, kamu bisa hapus baris tersebut:
if("InvoiceNo" %in% names(df_retail)) {
df_retail <- df_retail %>% filter(!str_starts(as.character(InvoiceNo), "C"))
}
dim(df_retail)
## [1] 397884 8
sum(is.na(df_retail$InvoiceDate))
## [1] 0
##cek duplikasi
duplicates <- sum(duplicated(df_retail))
print(duplicates)
## [1] 5192
##hapus duplikasi
df_retail <- df_retail %>%
distinct()
##cek duplikasi lagi
sum(duplicated(df_retail))
## [1] 0
str(df_retail)
## tibble [392,692 Ă— 8] (S3: tbl_df/tbl/data.frame)
## $ InvoiceNo : chr [1:392692] "536365" "536365" "536365" "536365" ...
## $ StockCode : chr [1:392692] "85123A" "71053" "84406B" "84029G" ...
## $ Description: chr [1:392692] "WHITE HANGING HEART T-LIGHT HOLDER" "WHITE METAL LANTERN" "CREAM CUPID HEARTS COAT HANGER" "KNITTED UNION FLAG HOT WATER BOTTLE" ...
## $ Quantity : num [1:392692] 6 6 8 6 6 2 6 6 6 32 ...
## $ InvoiceDate: POSIXct[1:392692], format: "2010-12-01 08:26:00" "2010-12-01 08:26:00" ...
## $ UnitPrice : num [1:392692] 2.55 3.39 2.75 3.39 3.39 7.65 4.25 1.85 1.85 1.69 ...
## $ CustomerID : chr [1:392692] "17850" "17850" "17850" "17850" ...
## $ Country : chr [1:392692] "United Kingdom" "United Kingdom" "United Kingdom" "United Kingdom" ...
summary(df_retail)
## InvoiceNo StockCode Description Quantity
## Length:392692 Length:392692 Length:392692 Min. : 1.00
## Class :character Class :character Class :character 1st Qu.: 2.00
## Mode :character Mode :character Mode :character Median : 6.00
## Mean : 13.12
## 3rd Qu.: 12.00
## Max. :80995.00
## InvoiceDate UnitPrice CustomerID
## Min. :2010-12-01 08:26:00 Min. : 0.001 Length:392692
## 1st Qu.:2011-04-07 11:12:00 1st Qu.: 1.250 Class :character
## Median :2011-07-31 12:02:00 Median : 1.950 Mode :character
## Mean :2011-07-10 19:13:07 Mean : 3.126
## 3rd Qu.:2011-10-20 12:53:00 3rd Qu.: 3.750
## Max. :2011-12-09 12:50:00 Max. :8142.750
## Country
## Length:392692
## Class :character
## Mode :character
##
##
##
Untuk semua goals, perlu membuat variabel dari data transaksi. Variabel Independen (X) yang diperoleh dari analisis RFM: - Keterkinian: Hari sejak pembelian terakhir - Frekuensi: Jumlah pembelian - Moneter: Total pengeluaran
#Hitung RFM
max_tanggal <- max(df_retail$InvoiceDate, na.rm = TRUE) + days(1)
data_rfm <- df_retail %>%
group_by(CustomerID) %>%
summarise(
LastPurchase = max(InvoiceDate, na.rm = TRUE),
Recency = as.numeric(difftime(max_tanggal, LastPurchase, units = "days")),
Frequency = n_distinct(InvoiceNo),
Monetary = sum(Quantity * UnitPrice, na.rm = TRUE),
.groups = "drop"
)
#Bersihkan nilai aneh
data_rfm <- data_rfm %>%
mutate(
Recency = ifelse(is.infinite(Recency) | is.nan(Recency), NA, Recency),
Monetary = ifelse(is.nan(Monetary), 0, Monetary)
)
#cek
summary(data_rfm$Recency)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 1.00 18.07 51.09 93.05 142.73 374.12
sum(is.na(data_rfm$Recency))
## [1] 0
#1.Prediksi Churn: Churn jika Recency > 90 hari (1 jika churn, 0 sebaliknya)
data_rfm <- data_rfm %>%
mutate(Churn = ifelse(!is.na(Recency) & Recency > 90, 1, 0))
#2. Klasifikasi Pelanggan Bernilai Tinggi: Nilai tinggi jika Moneter di atas 20% (1 jika bernilai tinggi, 0 sebaliknya)
batas_monetary <- quantile(data_rfm$Monetary, 0.8, na.rm = TRUE)
data_rfm <- data_rfm %>%
mutate(HighValue = ifelse(!is.na(Monetary) & Monetary >= batas_monetary, 1, 0))
#3. Peluang Pembelian: Prediksi pembelian bulan depan jika Recency <= 30 hari (1 jika ya, 0 sebaliknya)
data_rfm <- data_rfm %>%
mutate(PurchaseNextMonth = ifelse(!is.na(Recency) & Recency <= 30, 1, 0))
summary(data_rfm)
## CustomerID LastPurchase Recency
## Length:4338 Min. :2010-12-01 09:53:00 Min. : 1.00
## Class :character 1st Qu.:2011-07-20 19:18:00 1st Qu.: 18.07
## Mode :character Median :2011-10-20 10:40:30 Median : 51.09
## Mean :2011-09-08 11:38:59 Mean : 93.05
## 3rd Qu.:2011-11-22 11:05:45 3rd Qu.:142.73
## Max. :2011-12-09 12:50:00 Max. :374.12
## Frequency Monetary Churn HighValue
## Min. : 1.000 Min. : 3.75 Min. :0.0000 Min. :0.0000
## 1st Qu.: 1.000 1st Qu.: 306.48 1st Qu.:0.0000 1st Qu.:0.0000
## Median : 2.000 Median : 668.57 Median :0.0000 Median :0.0000
## Mean : 4.272 Mean : 2048.69 Mean :0.3361 Mean :0.2001
## 3rd Qu.: 5.000 3rd Qu.: 1660.60 3rd Qu.:1.0000 3rd Qu.:0.0000
## Max. :209.000 Max. :280206.02 Max. :1.0000 Max. :1.0000
## PurchaseNextMonth
## Min. :0.0000
## 1st Qu.:0.0000
## Median :0.0000
## Mean :0.3704
## 3rd Qu.:1.0000
## Max. :1.0000
sum(is.na(data_rfm$Recency))
## [1] 0
sum(is.na(data_rfm$Monetary))
## [1] 0
Variabel X : Recency, Frequency, Monetary Variabel Y : Churn, HighValue, PurchaseNextMonth
A. Statistika Deskriptif
library(psych)
## Warning: package 'psych' was built under R version 4.5.2
##
## Attaching package: 'psych'
## The following object is masked from 'package:randomForest':
##
## outlier
## The following objects are masked from 'package:ggplot2':
##
## %+%, alpha
describe(data_rfm %>% select(Recency, Frequency, Monetary))
## vars n mean sd median trimmed mad min max
## Recency 1 4338 93.05 100.01 51.09 76.30 59.47 1.00 374.12
## Frequency 2 4338 4.27 7.70 2.00 2.90 1.48 1.00 209.00
## Monetary 3 4338 2048.69 8985.23 668.57 974.58 688.81 3.75 280206.02
## range skew kurtosis se
## Recency 373.12 1.24 0.43 1.52
## Frequency 208.00 12.06 248.65 0.12
## Monetary 280202.27 19.33 477.91 136.42
library(psych)
describe(data_rfm %>% select(Churn, HighValue, PurchaseNextMonth))
## vars n mean sd median trimmed mad min max range skew
## Churn 1 4338 0.34 0.47 0 0.30 0 0 1 1 0.69
## HighValue 2 4338 0.20 0.40 0 0.13 0 0 1 1 1.50
## PurchaseNextMonth 3 4338 0.37 0.48 0 0.34 0 0 1 1 0.54
## kurtosis se
## Churn -1.52 0.01
## HighValue 0.25 0.01
## PurchaseNextMonth -1.71 0.01
summary(data_rfm$Recency)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 1.00 18.07 51.09 93.05 142.73 374.12
B. Korelasi
cor_matrix <- cor(data_rfm %>% select(Recency, Frequency, Monetary))
corrplot(cor_matrix, method = "circle")
Insight: Recency berkorelasi negatif dengan Frequency/Monetary
(pelanggan terbaru belanja lebih sering/banyak), Frequency dan Monetary
berkorelasi positif.
C. Visualisasi Distribusi
# Histogram of Recency
ggplot(data_rfm, aes(x = Recency)) +
geom_histogram(binwidth = 10, fill = "maroon", alpha = 0.7) +
labs(title = "Distribution of Recency", x = "Recency (days)", y = "Count")
Insight: Menunjukkan grafik ini skewed right (menceng ke kanan).
Mayoritas pelanggan baru saja bertransaksi dalam waktu dekat (kurang
dari 50 hari), menandakan basis pelanggan yang saat ini sedang
aktif.
# Histogram of Frequency
ggplot(data_rfm, aes(x = Frequency)) +
geom_histogram(binwidth = 1, fill = "darkseagreen", alpha = 0.7) +
coord_cartesian(xlim = c(0,100)) + # BATAS X SAMPAI 100
labs(title = "Distribution of Frequency", x = "Frequency", y = "Count") +
theme_minimal()
Insight: Hampir seluruh pelanggan hanya melakukan pembelian satu kali
(one-time buyers), mengindikasikan masalah serius pada loyalitas dan
retensi pelanggan.
# Histogram of Monetary
ggplot(data_rfm, aes(x = Monetary)) +
geom_histogram(binwidth = 2, fill = "burlywood", alpha = 0.7) +
coord_cartesian(xlim = c(0,2500)) + # BATAS X SAMPAI 50
labs(title = "Distribution of Monetary", x = "Monetary", y = "Count") +
theme_minimal()
Insight: Kebanyakan pelanggan membelanjakan uang dalam nominal kecil
hingga menengah, namun terdapat sebagian kecil pelanggan bernilai tinggi
(whales) yang tersebar di ekor kanan grafik.
#Bar Plot for Churn
ggplot(data_rfm, aes(x = factor(Churn))) +
geom_bar(fill = "darksalmon", alpha = 0.7) +
labs(title = "Churn Distribution", x = "Churn (0=No, 1=Yes)", y = "Count")
Insight : umlah pelanggan yang tidak Churn (0) lebih banyak—sekitar dua
kali lipat—dibandingkan pelanggan yang Churn (1).
# Bar Plot for High Value
ggplot(data_rfm, aes(x = factor(HighValue))) +
geom_bar(fill = "bisque3", alpha = 0.7) +
labs(title = "High-Value Customer Distribution", x = "High-Value (0=No, 1=Yes)", y = "Count")
Insight: Terlihat ketimpangan yang cukup signifikan. Jumlah pelanggan
biasa (0) mencapai sekitar 3.500, sedangkan pelanggan bernilai tinggi
(1) hanya berada di kisaran 900-an.
# Bar Plot for Purcase Next Month
ggplot(data_rfm, aes(x = factor(PurchaseNextMonth))) +
geom_bar(fill = "pink3", alpha = 0.7) +
labs(title = "Next Month Purchase Distribution", x = "Next Month Purchase (0=No, 1=Yes)", y = "Count")
Insight: Mayoritas pelanggan (batang 0, sekitar 2.700 orang) diprediksi
tidak akan melakukan pembelian bulan depan, dibandingkan dengan mereka
yang diprediksi akan membeli (batang 1, sekitar 1.600 orang).
summary(data_rfm$PurchaseNextMonth)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.0000 0.0000 0.0000 0.3704 1.0000 1.0000
table(data_rfm$PurchaseNextMonth, useNA = "ifany")
##
## 0 1
## 2731 1607
unique(data_rfm$PurchaseNextMonth)
## [1] 0 1
str(data_rfm$PurchaseNextMonth)
## num [1:4338] 0 1 0 1 0 0 0 0 0 1 ...
#Untuk setiap tujuan, kami akan menggunakan Random Forest untuk klasifikasi (cocok untuk data yang tidak seimbang dan pentingnya fitur).
# Ubah factor 0/1 menjadi No/Yes agar valid di R
# Asumsi: 0 = Tidak Churn, 1 = Churn
data_rfm$Churn <- factor(data_rfm$Churn,
levels = c("0", "1"),
labels = c("No", "Yes"))
# Cek apakah sudah berubah jadi huruf
table(data_rfm$Churn)
##
## No Yes
## 2880 1458
# pastikan faktor
data_rfm$Churn <- as.factor(data_rfm$Churn)
#FEATURE ENGINEERING
# AOV: Rata-rata belanja per kunjungan
data_rfm$AOV <- data_rfm$Monetary / ifelse(data_rfm$Frequency == 0, 1, data_rfm$Frequency)
# Log Transform: Menormalkan data uang yang jomplang
data_rfm$Log_Monetary <- log(data_rfm$Monetary + 1)
# Cek keseimbangan kelas (Penting untuk melihat seberapa parah imbalance-nya)
table(data_rfm$Churn)
##
## No Yes
## 2880 1458
set.seed(42)
# Split data
indeks_churn <- createDataPartition(data_rfm$Churn, p = 0.8, list = FALSE)
churn_train <- data_rfm[indeks_churn, ]
churn_test <- data_rfm[-indeks_churn, ]
# 2. KONTROL TRAINING (Rahasia Kappa Tinggi)
# Gunakan sampling = "up" (Up-sampling) atau "smote"
# Ini memaksa model belajar mengenali kelas minoritas dengan lebih adil.
kontrol <- trainControl(
method = "cv",
number = 5,
sampling = "up", # INI KUNCINYA
classProbs = TRUE, # Diperlukan jika ingin melihat probabilitas
verboseIter = TRUE
)
# Tuning parameter mtry dan ntree yang lebih kecil
grid <- expand.grid(mtry = c(1, 2, 3))
# 4. TRAINING MODEL
# Prediksi Churn hanya berdasarkan kebiasaan belanja (F & M)
set.seed(42)
churn_model <- train(
Churn ~ Frequency + Monetary + AOV + Log_Monetary,
data = churn_train,
method = "rf",
trControl = kontrol,
tuneGrid = grid,
ntree = 200, # Tambah pohon sedikit agar lebih stabil
nodesize = 5 # Mencegah overfitting (pohon tidak terlalu rimbun)
)
## + Fold1: mtry=1
## - Fold1: mtry=1
## + Fold1: mtry=2
## - Fold1: mtry=2
## + Fold1: mtry=3
## - Fold1: mtry=3
## + Fold2: mtry=1
## - Fold2: mtry=1
## + Fold2: mtry=2
## - Fold2: mtry=2
## + Fold2: mtry=3
## - Fold2: mtry=3
## + Fold3: mtry=1
## - Fold3: mtry=1
## + Fold3: mtry=2
## - Fold3: mtry=2
## + Fold3: mtry=3
## - Fold3: mtry=3
## + Fold4: mtry=1
## - Fold4: mtry=1
## + Fold4: mtry=2
## - Fold4: mtry=2
## + Fold4: mtry=3
## - Fold4: mtry=3
## + Fold5: mtry=1
## - Fold5: mtry=1
## + Fold5: mtry=2
## - Fold5: mtry=2
## + Fold5: mtry=3
## - Fold5: mtry=3
## Aggregating results
## Selecting tuning parameters
## Fitting mtry = 1 on full training set
# 5. Evaluasi
churn_prediksi <- predict(churn_model, churn_test)
# Confusion Matrix
# Pastikan positive class-nya benar ("Yes" = Churn)
confusionMatrix(churn_prediksi, churn_test$Churn, positive = "Yes")
## Confusion Matrix and Statistics
##
## Reference
## Prediction No Yes
## No 339 63
## Yes 237 228
##
## Accuracy : 0.654
## 95% CI : (0.6213, 0.6857)
## No Information Rate : 0.6644
## P-Value [Acc > NIR] : 0.7534
##
## Kappa : 0.3241
##
## Mcnemar's Test P-Value : <2e-16
##
## Sensitivity : 0.7835
## Specificity : 0.5885
## Pos Pred Value : 0.4903
## Neg Pred Value : 0.8433
## Prevalence : 0.3356
## Detection Rate : 0.2630
## Detection Prevalence : 0.5363
## Balanced Accuracy : 0.6860
##
## 'Positive' Class : Yes
##
# 1. Pastikan Recency, Frequency, Monetary ada
# Hapus kolom score/hasil perhitungan lama agar bersih
data_rfm$R_Score <- NULL
data_rfm$F_Score <- NULL
data_rfm$M_Score <- NULL
data_rfm$RFM_Score <- NULL
# 2. Buat Ulang Target (High Value = Top 30% berdasarkan R, F, M)
# Hitung score manual dulu hanya untuk LABELING (Kunci Jawaban)
r_score <- ntile(-data_rfm$Recency, 5) # Negatif karena makin kecil makin bagus
f_score <- ntile(data_rfm$Frequency, 5)
m_score <- ntile(data_rfm$Monetary, 5)
total_score <- r_score + f_score + m_score
# Tentukan batas atas (Top 30%)
batas <- quantile(total_score, 0.70)
data_rfm$HighValue <- ifelse(total_score >= batas, "Yes", "No") # Gunakan Yes/No agar lebih jelas
data_rfm$HighValue <- factor(data_rfm$HighValue, levels = c("No", "Yes"))
# 3. FEATURE ENGINEERING (Kunci menaikkan Kappa)
# Buat fitur turunan agar model lebih pintar membaca pola
data_rfm$Log_Frequency <- log(data_rfm$Frequency + 1) # Menormalkan distribusi
data_rfm$Interaction_RF <- data_rfm$Recency * data_rfm$Frequency # Interaksi R dan F
# Cek proporsi data (biasanya Yes lebih sedikit)
table(data_rfm$HighValue)
##
## No Yes
## 2746 1592
set.seed(42)
# 1. Split Data
indeks_hv <- createDataPartition(data_rfm$HighValue, p = 0.8, list = FALSE)
hv_train <- data_rfm[indeks_hv, ]
hv_test <- data_rfm[-indeks_hv, ]
# 2. Kontrol Training dengan SAMPLING
# 'sampling = "up"' akan menduplikasi data minoritas (Yes) agar model tidak bias ke 'No'
kontrol_hv <- trainControl(
method = "cv",
number = 5,
sampling = "up", # INI KUNCINYA AGAR KAPPA NAIK
verboseIter = TRUE
)
# 3. Training Model
# Perhatikan formulanya: TIDAK ADA 'Monetary'.
# Kita memprediksi HighValue hanya dari perilaku kunjungan (Recency & Frequency)
hv_model <- train(
HighValue ~ Recency + Frequency + Log_Frequency + Interaction_RF,
data = hv_train,
method = "rf",
trControl = kontrol_hv,
ntree = 200, # Jumlah pohon
tuneLength = 5 # Biarkan Caret mencari mtry terbaik otomatis
)
## note: only 3 unique complexity parameters in default grid. Truncating the grid to 3 .
##
## + Fold1: mtry=2
## - Fold1: mtry=2
## + Fold1: mtry=3
## - Fold1: mtry=3
## + Fold1: mtry=4
## - Fold1: mtry=4
## + Fold2: mtry=2
## - Fold2: mtry=2
## + Fold2: mtry=3
## - Fold2: mtry=3
## + Fold2: mtry=4
## - Fold2: mtry=4
## + Fold3: mtry=2
## - Fold3: mtry=2
## + Fold3: mtry=3
## - Fold3: mtry=3
## + Fold3: mtry=4
## - Fold3: mtry=4
## + Fold4: mtry=2
## - Fold4: mtry=2
## + Fold4: mtry=3
## - Fold4: mtry=3
## + Fold4: mtry=4
## - Fold4: mtry=4
## + Fold5: mtry=2
## - Fold5: mtry=2
## + Fold5: mtry=3
## - Fold5: mtry=3
## + Fold5: mtry=4
## - Fold5: mtry=4
## Aggregating results
## Selecting tuning parameters
## Fitting mtry = 2 on full training set
# 4. Evaluasi
hv_prediksi <- predict(hv_model, hv_test)
# Tampilkan hasil
print(hv_model)
## Random Forest
##
## 3471 samples
## 4 predictor
## 2 classes: 'No', 'Yes'
##
## No pre-processing
## Resampling: Cross-Validated (5 fold)
## Summary of sample sizes: 2777, 2777, 2777, 2776, 2777
## Addtional sampling using up-sampling
##
## Resampling results across tuning parameters:
##
## mtry Accuracy Kappa
## 2 0.9423797 0.8769491
## 3 0.9311438 0.8518798
## 4 0.9297037 0.8486802
##
## Accuracy was used to select the optimal model using the largest value.
## The final value used for the model was mtry = 2.
confusionMatrix(hv_prediksi, hv_test$HighValue, positive = "Yes")
## Confusion Matrix and Statistics
##
## Reference
## Prediction No Yes
## No 515 19
## Yes 34 299
##
## Accuracy : 0.9389
## 95% CI : (0.9208, 0.9539)
## No Information Rate : 0.6332
## P-Value [Acc > NIR] : < 2e-16
##
## Kappa : 0.8697
##
## Mcnemar's Test P-Value : 0.05447
##
## Sensitivity : 0.9403
## Specificity : 0.9381
## Pos Pred Value : 0.8979
## Neg Pred Value : 0.9644
## Prevalence : 0.3668
## Detection Rate : 0.3449
## Detection Prevalence : 0.3841
## Balanced Accuracy : 0.9392
##
## 'Positive' Class : Yes
##
# Hapus var yang rusak
data_rfm$PurchaseNextMonth <- NULL
summary(data_rfm$Recency)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 1.00 18.07 51.09 93.05 142.73 374.12
data_rfm$PurchaseNextMonth <- ifelse(data_rfm$Recency <= 50, "Yes", "No")
data_rfm$PurchaseNextMonth <- factor(data_rfm$PurchaseNextMonth, levels = c("No","Yes"))
# cek apakah sudah ada dua kelas
table(data_rfm$PurchaseNextMonth)
##
## No Yes
## 2213 2125
prop.table(table(data_rfm$PurchaseNextMonth))
##
## No Yes
## 0.5101429 0.4898571
# Modelling
set.seed(123)
indeks_purchase <- createDataPartition(data_rfm$PurchaseNextMonth, p = 0.8, list = FALSE)
purchase_train <- data_rfm[indeks_purchase, ]
purchase_test <- data_rfm[-indeks_purchase, ]
purchase_model <- randomForest(
PurchaseNextMonth ~ Frequency + Monetary,
data = purchase_train,
ntree = 150,
nodesize = 10, # Mencegah pohon terlalu dalam (mengurangi overfitting)
maxnodes = 20 #Membatasi kompleksitas pohon
)
purchase_pred <- predict(purchase_model, purchase_test)
confusionMatrix(purchase_pred, purchase_test$PurchaseNextMonth)
## Confusion Matrix and Statistics
##
## Reference
## Prediction No Yes
## No 331 178
## Yes 111 247
##
## Accuracy : 0.6667
## 95% CI : (0.6342, 0.698)
## No Information Rate : 0.5098
## P-Value [Acc > NIR] : < 2.2e-16
##
## Kappa : 0.331
##
## Mcnemar's Test P-Value : 0.0001035
##
## Sensitivity : 0.7489
## Specificity : 0.5812
## Pos Pred Value : 0.6503
## Neg Pred Value : 0.6899
## Prevalence : 0.5098
## Detection Rate : 0.3818
## Detection Prevalence : 0.5871
## Balanced Accuracy : 0.6650
##
## 'Positive' Class : No
##