# ============================================================
# TUGAS KLASIFIKASI - BANK MARKETING DATASET
# ============================================================
# Dataset: dataset_A.xlsx
# Target: y (apakah nasabah berlangganan deposito? yes/no)
# ============================================================
# Load libraries (packages must already be installed)
library(readxl)
library(readr)
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(ggplot2)
library(caret)
## Loading required package: lattice
library(randomForest)
## randomForest 4.7-1.2
## Type rfNews() to see new features/changes/bug fixes.
##
## Attaching package: 'randomForest'
## The following object is masked from 'package:ggplot2':
##
## margin
## The following object is masked from 'package:dplyr':
##
## combine
library(e1071)
##
## Attaching package: 'e1071'
## The following object is masked from 'package:ggplot2':
##
## element
library(gbm)
## Loaded gbm 2.2.3
## This version of gbm is no longer under development. Consider transitioning to gbm3, https://github.com/gbm-developers/gbm3
library(nnet)
library(ROSE)
## Loaded ROSE 0.0-4
# ============================================================
# 1. LOAD DATA
# ============================================================
# Print the working directory for reference.
getwd()
## [1] "C:/Users/Lenovo/OneDrive - untirta.ac.id/Kuliah/Semester 4/Statistical machinne learning/tugas"
# Read dataset A. The original absolute location is tried first; when it does
# not exist (e.g. on another machine) the copy in the working directory is
# used instead, so the script is no longer tied to one user's folder layout.
path_dataset_a <- "C:/Users/Lenovo/OneDrive - untirta.ac.id/Kuliah/Semester 4/Statistical machinne learning/tugas/dataset_A.xlsx"
if (!file.exists(path_dataset_a)) {
  path_dataset_a <- basename(path_dataset_a)
}
df <- read_excel(path_dataset_a)
# NOTE(review): in the recorded str() output emp.var.rate (11), cons.price.idx
# (93994) and cons.conf.idx (-364) appear as whole numbers - presumably the
# decimal separator was lost when the file was exported. Confirm the scale of
# these columns before interpreting the models.
cat("Dimensi data:", nrow(df), "baris x", ncol(df), "kolom\n")
## Dimensi data: 37071 baris x 21 kolom
# ============================================================
# 2. EXPLORATORY DATA ANALYSIS (EDA)
# ============================================================
# Section header followed by the column types of the raw data.
writeLines(c("", "--- Tipe Data ---"))
##
## --- Tipe Data ---
utils::str(df)
## tibble [37,071 × 21] (S3: tbl_df/tbl/data.frame)
## $ age : num [1:37071] 46 31 34 45 45 39 29 44 29 32 ...
## $ job : chr [1:37071] "admin." "admin." "admin." "entrepreneur" ...
## $ marital : chr [1:37071] "divorced" "divorced" "married" "married" ...
## $ education : chr [1:37071] "high.school" "university.degree" "university.degree" "basic.9y" ...
## $ default : chr [1:37071] "unknown" "no" "no" "no" ...
## $ housing : chr [1:37071] "no" "no" "no" "no" ...
## $ loan : chr [1:37071] "yes" "no" "no" "no" ...
## $ contact : chr [1:37071] "telephone" "telephone" "telephone" "telephone" ...
## $ month : chr [1:37071] "may" "may" "may" "may" ...
## $ day_of_week : chr [1:37071] "mon" "mon" "mon" "mon" ...
## $ duration : num [1:37071] 178 53 349 34 48 182 147 457 193 142 ...
## $ campaign : num [1:37071] 5 5 2 9 3 11 1 5 1 3 ...
## $ pdays : num [1:37071] 999 999 999 999 999 999 999 999 999 999 ...
## $ previous : num [1:37071] 0 0 0 0 0 0 0 0 0 0 ...
## $ poutcome : chr [1:37071] "nonexistent" "nonexistent" "nonexistent" "nonexistent" ...
## $ emp.var.rate : num [1:37071] 11 11 11 11 11 11 11 11 11 11 ...
## $ cons.price.idx: num [1:37071] 93994 93994 93994 93994 93994 ...
## $ cons.conf.idx : num [1:37071] -364 -364 -364 -364 -364 -364 -364 -364 -364 -364 ...
## $ euribor3m : num [1:37071] 4.86 4.86 4.86 4.86 4.86 ...
## $ nr.employed : num [1:37071] 5191 5191 5191 5191 5191 ...
## $ y : chr [1:37071] "no" "no" "no" "no" ...
# Section header followed by the number of NA values in every column.
writeLines(c("", "--- Missing Values ---"))
##
## --- Missing Values ---
na_per_column <- colSums(is.na(df))
print(na_per_column)
## age job marital education default
## 0 0 0 0 0
## housing loan contact month day_of_week
## 0 0 0 0 0
## duration campaign pdays previous poutcome
## 0 0 0 0 0
## emp.var.rate cons.price.idx cons.conf.idx euribor3m nr.employed
## 0 0 0 0 0
## y
## 0
# Absolute and relative frequency of the target classes; the table is built
# once and reused for both printouts.
writeLines(c("", "--- Distribusi Target (y) ---"))
##
## --- Distribusi Target (y) ---
target_counts <- table(df$y)
print(target_counts)
##
## no yes
## 32545 4526
print(prop.table(target_counts))
##
## no yes
## 0.87791 0.12209
# Visualise the class distribution of the target: one bar per class with the
# row count printed above it.
ggplot(df, aes(x = y, fill = y)) +
  geom_bar() +
  # after_stat(count) replaces the `..count..` notation that was deprecated in
  # ggplot2 3.4.0, so the deprecation warning is no longer raised.
  geom_text(stat = "count", aes(label = after_stat(count)), vjust = -0.5) +
  scale_fill_manual(values = c("no" = "#E74C3C", "yes" = "#2ECC71")) +
  labs(
    title = "Distribusi Kelas Target (y)",
    subtitle = "Terdapat ketidakseimbangan kelas (class imbalance)",
    x = "y (no/yes)",
    y = "Jumlah"
  ) +
  theme_minimal()

# ============================================================
# 3. DATA PREPROCESSING
# ============================================================
# Work on a copy so the raw data in `df` stays untouched.
df_clean <- df
# 3a. Replace the placeholder "unknown" in the selected categorical columns
# with that column's most frequent known value (its mode).
# NOTE(review): the mode is computed on the full dataset, i.e. before the
# train/test split further down - confirm this is acceptable.
cols_unknown <- c("job", "marital", "education", "default", "housing", "loan")
replace_unknown_with_mode <- function(values) {
  values <- as.character(values)
  is_unknown <- values == "unknown"
  # Sorting the frequency table in decreasing order puts the mode first.
  known_counts <- sort(table(values[!is_unknown]), decreasing = TRUE)
  values[is_unknown] <- names(known_counts)[1]
  values
}
df_clean[cols_unknown] <- lapply(df_clean[cols_unknown], replace_unknown_with_mode)
# 3b. Encoding: convert every categorical predictor and the target `y` to a
# factor, then report how many NA values remain in the cleaned data.
cat_cols <- c(
  "job", "marital", "education", "default", "housing", "loan",
  "contact", "month", "day_of_week", "poutcome"
)
factor_cols <- c(cat_cols, "y")
df_clean[factor_cols] <- lapply(df_clean[factor_cols], as.factor)
na_total <- sum(is.na(df_clean))
cat("\nSetelah cleaning - jumlah NA:", na_total, "\n")
##
## Setelah cleaning - jumlah NA: 0
# 3c. Train/test split (80:20). createDataPartition samples within each class
# of `y`, so both parts keep a similar class ratio.
set.seed(2006)
train_idx <- createDataPartition(y = df_clean$y, p = 0.8, list = FALSE)
train_rows <- train_idx[, 1]
train_data <- df_clean[train_rows, ]
test_data <- df_clean[-train_rows, ]
n_train <- nrow(train_data)
n_test <- nrow(test_data)
cat("Train:", n_train, "| Test:", n_test, "\n")
## Train: 29657 | Test: 7414
cat("Distribusi kelas train:\n")
## Distribusi kelas train:
train_class_counts <- table(train_data$y)
print(prop.table(train_class_counts))
##
## no yes
## 0.877904 0.122096
# 3d. Handle class imbalance with ROSE. The recorded output below shows that
# the row count stays at 29657 while both classes end up close to one half,
# so the result is a re-drawn balanced sample rather than a pure oversample.
# NOTE(review): ROSE also receives `seed = 42`; the set.seed() call is kept
# as in the original but is presumably redundant - confirm against the ROSE
# documentation.
set.seed(2006)
rose_fit <- ROSE(y ~ ., data = train_data, seed = 42)
train_balanced <- rose_fit$data
cat("\nSetelah ROSE balancing:\n")
##
## Setelah ROSE balancing:
balanced_counts <- table(train_balanced$y)
print(balanced_counts)
##
## no yes
## 14890 14767
# Row count of the cleaned full dataset (df_clean).
n_clean <- nrow(df_clean)
cat("Jumlah data setelah cleaning:", n_clean, "baris\n")
## Jumlah data setelah cleaning: 37071 baris
# Row count of the balanced training set (train_balanced).
n_balanced <- nrow(train_balanced)
cat("Jumlah data train setelah ROSE balancing:", n_balanced, "baris\n")
## Jumlah data train setelah ROSE balancing: 29657 baris
# Rows and columns of the balanced training set at once.
dim(train_balanced)
## [1] 29657 21
# ============================================================
# 4. PENGEMBANGAN MODEL
# ============================================================
# Shared resampling setup for all models: 5-fold cross-validation with class
# probabilities and twoClassSummary, which the train() calls need in order to
# tune on metric = "ROC". Hold-out predictions are saved.
# NOTE(review): the models are fitted on train_balanced, which was balanced
# before cross-validation, so the held-out folds also contain ROSE-generated
# rows and the CV ROC may be optimistic. Consider resampling inside the folds
# instead (trainControl(sampling = "rose") on train_data).
train_control <- trainControl(
  method = "cv",
  number = 5,
  savePredictions = TRUE,
  summaryFunction = twoClassSummary,
  classProbs = TRUE
)
# Collects model, test predictions and test metrics per model name.
results <- list()
# Random forest: tune mtry over {4, 6, 8} by cross-validated ROC, then score
# the selected model on the untouched test split ("yes" = positive class).
cat("\nTraining Random Forest...\n")
##
## Training Random Forest...
set.seed(2006)
rf_grid <- data.frame(mtry = c(4, 6, 8))
rf_model <- train(
  y ~ .,
  data = train_balanced,
  method = "rf",
  metric = "ROC",
  trControl = train_control,
  tuneGrid = rf_grid,
  ntree = 100
)
pred_rf <- predict(rf_model, newdata = test_data)
cm_rf <- confusionMatrix(data = pred_rf, reference = test_data$y, positive = "yes")
rf_accuracy <- cm_rf$overall["Accuracy"]
rf_f1 <- cm_rf$byClass["F1"]
results[["Random Forest"]] <- list(
  model = rf_model,
  pred = pred_rf,
  cm = cm_rf,
  accuracy = rf_accuracy,
  f1 = rf_f1
)
cat("RF | Accuracy:", round(rf_accuracy, 4), "| F1:", round(rf_f1, 4), "\n")
## RF | Accuracy: 0.8897 | F1: 0.5986
# Gradient boosting: tune the number of trees and the interaction depth by
# cross-validated ROC; shrinkage and minimum node size are held fixed.
# NOTE(review): the recorded warnings below report that the dummy column
# `defaultyes` has no variation in some fits - presumably `default` is almost
# always "no"; consider dropping it. Verify against the data.
cat("\nTraining Gradient Boosting...\n")
##
## Training Gradient Boosting...
set.seed(2006)
gbm_grid <- expand.grid(
  n.trees = c(100, 200),
  interaction.depth = c(3, 5),
  shrinkage = 0.1,
  n.minobsinnode = 10
)
gbm_model <- train(
  y ~ .,
  data = train_balanced,
  method = "gbm",
  metric = "ROC",
  trControl = train_control,
  tuneGrid = gbm_grid,
  verbose = FALSE
)
## Warning in (function (x, y, offset = NULL, misc = NULL, distribution =
## "bernoulli", : variable 20: defaultyes has no variation.
## Warning in (function (x, y, offset = NULL, misc = NULL, distribution =
## "bernoulli", : variable 20: defaultyes has no variation.
## Warning in (function (x, y, offset = NULL, misc = NULL, distribution =
## "bernoulli", : variable 20: defaultyes has no variation.
## Warning in (function (x, y, offset = NULL, misc = NULL, distribution =
## "bernoulli", : variable 20: defaultyes has no variation.
## Warning in (function (x, y, offset = NULL, misc = NULL, distribution =
## "bernoulli", : variable 20: defaultyes has no variation.
## Warning in (function (x, y, offset = NULL, misc = NULL, distribution =
## "bernoulli", : variable 20: defaultyes has no variation.
## Warning in (function (x, y, offset = NULL, misc = NULL, distribution =
## "bernoulli", : variable 20: defaultyes has no variation.
## Warning in (function (x, y, offset = NULL, misc = NULL, distribution =
## "bernoulli", : variable 20: defaultyes has no variation.
## Warning in (function (x, y, offset = NULL, misc = NULL, distribution =
## "bernoulli", : variable 20: defaultyes has no variation.
## Warning in (function (x, y, offset = NULL, misc = NULL, distribution =
## "bernoulli", : variable 20: defaultyes has no variation.
## Warning in (function (x, y, offset = NULL, misc = NULL, distribution =
## "bernoulli", : variable 20: defaultyes has no variation.
# Score the tuned GBM on the test split ("yes" = positive class) and store
# the model, its predictions and its headline metrics in `results`.
pred_gbm <- predict(gbm_model, newdata = test_data)
cm_gbm <- confusionMatrix(data = pred_gbm, reference = test_data$y, positive = "yes")
gbm_accuracy <- cm_gbm$overall["Accuracy"]
gbm_f1 <- cm_gbm$byClass["F1"]
results[["Gradient Boosting"]] <- list(
  model = gbm_model,
  pred = pred_gbm,
  cm = cm_gbm,
  accuracy = gbm_accuracy,
  f1 = gbm_f1
)
cat("GBM | Accuracy:", round(gbm_accuracy, 4), "| F1:", round(gbm_f1, 4), "\n")
## GBM | Accuracy: 0.8967 | F1: 0.5904
# Single-hidden-layer neural network (nnet): tune hidden units and weight
# decay by cross-validated ROC, then score on the test split.
cat("\nTraining ANN...\n")
##
## Training ANN...
set.seed(2006)
ann_model <- train(
  y ~ ., data = train_balanced,
  method = "nnet",
  # The predictors sit on very different scales (the recorded str() output
  # shows values from single digits up to tens of thousands). Centering and
  # scaling them keeps the network from being dominated by the largest
  # columns; caret re-applies the same transformation inside predict().
  preProcess = c("center", "scale"),
  trControl = train_control,
  metric = "ROC",
  trace = FALSE,
  tuneGrid = expand.grid(size = c(10, 20), decay = c(0.001, 0.01)),
  MaxNWts = 5000, maxit = 200
)
pred_ann <- predict(ann_model, test_data)
cm_ann <- confusionMatrix(pred_ann, test_data$y, positive = "yes")
results[["ANN"]] <- list(
  model = ann_model, pred = pred_ann, cm = cm_ann,
  accuracy = cm_ann$overall["Accuracy"],
  f1 = cm_ann$byClass["F1"]
)
# NOTE: the ANN figures recorded in the `##` output lines of this script were
# produced before scaling was added; re-run the script to refresh them.
cat("ANN | Accuracy:", round(cm_ann$overall["Accuracy"], 4),
    "| F1:", round(cm_ann$byClass["F1"], 4), "\n")
## ANN | Accuracy: 0.8129 | F1: 0.5472
# ============================================================
# 5. EVALUASI & PERBANDINGAN MODEL
# ============================================================
# Build one row of test-set metrics per model stored in `results`.
# vapply() (instead of sapply()) guarantees a numeric vector with exactly one
# value per model, so a malformed entry fails loudly instead of silently
# changing the column type.
collect_metric <- function(extract) {
  vapply(results, function(r) round(as.numeric(extract(r)), 4), numeric(1))
}
summary_df <- data.frame(
  Model = names(results),
  Accuracy = collect_metric(function(r) r$accuracy),
  F1_Score = collect_metric(function(r) r$f1),
  Precision = collect_metric(function(r) r$cm$byClass["Precision"]),
  Recall = collect_metric(function(r) r$cm$byClass["Recall"])
)
# Best F1 first; later code takes row 1 as the winning model.
summary_df <- summary_df[order(summary_df$F1_Score, decreasing = TRUE), ]
cat("\n===== RINGKASAN EVALUASI MODEL =====\n")
##
## ===== RINGKASAN EVALUASI MODEL =====
print(summary_df, row.names = FALSE)
## Model Accuracy F1_Score Precision Recall
## Random Forest 0.8897 0.5986 0.5384 0.6740
## Gradient Boosting 0.8967 0.5904 0.5720 0.6099
## ANN 0.8129 0.5472 0.3883 0.9260
# Model comparison plot: Accuracy and F1 side by side for every model.
# The long format is built with base R so the script no longer needs the
# reshape2 package, which is never loaded via library() above.
metric_cols <- c("Accuracy", "F1_Score")
plot_df <- data.frame(
  Model = rep(summary_df$Model, times = length(metric_cols)),
  Metric = factor(rep(metric_cols, each = nrow(summary_df)), levels = metric_cols),
  value = unlist(summary_df[metric_cols], use.names = FALSE)
)
ggplot(plot_df, aes(x = reorder(Model, value), y = value, fill = Metric)) +
  geom_bar(stat = "identity", position = "dodge") +
  coord_flip() +
  # Colours are named so each metric keeps its colour regardless of order.
  scale_fill_manual(values = c("Accuracy" = "#3498DB", "F1_Score" = "#E74C3C")) +
  labs(
    title = "Perbandingan Model: Accuracy vs F1-Score",
    x = "Model", y = "Score"
  ) +
  theme_minimal() +
  geom_text(
    aes(label = round(value, 3)),
    position = position_dodge(width = 0.9), hjust = -0.1, size = 3
  )

# ============================================================
# 6. PEMILIHAN MODEL TERBAIK
# ============================================================
# The first row of summary_df is taken as the winner; look up its stored
# model and confusion matrix in `results`.
# NOTE(review): if summary_df is ordered by test-set F1, the model is chosen
# on the same data used to report its metrics, so the reported figures are
# presumably slightly optimistic - confirm this is acceptable.
best_row <- summary_df[1, ]
best_name <- best_row$Model
best_res <- results[[best_name]]
best_model <- best_res[["model"]]
best_cm <- best_res[["cm"]]
cat("\n===== PEMILIHAN MODEL TERBAIK =====\n")
##
## ===== PEMILIHAN MODEL TERBAIK =====
cat("Berdasarkan tabel evaluasi berikut:\n\n")
## Berdasarkan tabel evaluasi berikut:
print(summary_df, row.names = FALSE)
## Model Accuracy F1_Score Precision Recall
## Random Forest 0.8897 0.5986 0.5384 0.6740
## Gradient Boosting 0.8967 0.5904 0.5720 0.6099
## ANN 0.8129 0.5472 0.3883 0.9260
cat("\n>>> MODEL TERBAIK: ", best_name, " <<<\n", sep = "")
##
## >>> MODEL TERBAIK: Random Forest <<<
cat("F1-Score:", best_row$F1_Score, "\n")
## F1-Score: 0.5986
cat("Accuracy:", best_row$Accuracy, "\n")
## Accuracy: 0.8897
cat("\nConfusion Matrix:\n")
##
## Confusion Matrix:
print(best_cm$table)
## Reference
## Prediction no yes
## no 5986 295
## yes 523 610
cat("\nClassification Report:\n")
##
## Classification Report:
print(best_cm$byClass)
## Sensitivity Specificity Pos Pred Value
## 0.67403315 0.91964972 0.53839365
## Neg Pred Value Precision Recall
## 0.95303296 0.53839365 0.67403315
## F1 Prevalence Detection Rate
## 0.59862610 0.12206636 0.08227677
## Detection Prevalence Balanced Accuracy
## 0.15281899 0.79684143
# Feature importance plot (top 15), drawn only when the winning model is one
# of the two tree-based learners.
tree_based_models <- c("Random Forest", "Gradient Boosting")
if (is.element(best_name, tree_based_models)) {
  imp <- varImp(best_model)
  importance_title <- paste("Feature Importance -", best_name)
  plot(imp, top = 15, main = importance_title)
}

# ============================================================
# 7. PREDIKSI DATA UJI (Gunakan model terbaik)
# ============================================================
# --- Load dataset B ---
# The original absolute location is tried first; when it does not exist
# (e.g. on another machine) the copy in the working directory is used.
# NOTE(review): in the recorded str() output nr.employed is 5191 for dataset A
# but 50762 / 52281 for dataset B - the two files presumably differ in scale
# (lost decimal separator?). Verify before trusting predictions on dataset B.
path_dataset_b <- "C:/Users/Lenovo/OneDrive - untirta.ac.id/Kuliah/Semester 4/Statistical machinne learning/tugas/dataset_B(in).csv"
if (!file.exists(path_dataset_b)) {
  path_dataset_b <- basename(path_dataset_b)
}
df_data_b <- read_csv(path_dataset_b)
## Rows: 4117 Columns: 20
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (10): job, marital, education, default, housing, loan, contact, month, d...
## dbl (10): age, duration, campaign, pdays, previous, emp.var.rate, cons.price...
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
# Report the size of dataset B, then its column types.
dim_b <- dim(df_data_b)
cat("Dimensi dataset B:", dim_b[1], "baris x", dim_b[2], "kolom\n")
## Dimensi dataset B: 4117 baris x 20 kolom
writeLines(c("", "--- Tipe Data Dataset B ---"))
##
## --- Tipe Data Dataset B ---
utils::str(df_data_b)
## spc_tbl_ [4,117 × 20] (S3: spec_tbl_df/tbl_df/tbl/data.frame)
## $ age : num [1:4117] 32 31 32 53 58 64 25 34 79 72 ...
## $ job : chr [1:4117] "blue-collar" "management" "blue-collar" "retired" ...
## $ marital : chr [1:4117] "unknown" "married" "married" "married" ...
## $ education : chr [1:4117] "basic.9y" "university.degree" "basic.9y" "basic.9y" ...
## $ default : chr [1:4117] "no" "no" "no" "unknown" ...
## $ housing : chr [1:4117] "yes" "yes" "unknown" "yes" ...
## $ loan : chr [1:4117] "no" "no" "unknown" "no" ...
## $ contact : chr [1:4117] "cellular" "cellular" "cellular" "cellular" ...
## $ month : chr [1:4117] "jun" "jun" "may" "jul" ...
## $ day_of_week : chr [1:4117] "mon" "tue" "thu" "fri" ...
## $ duration : num [1:4117] 314 305 1020 817 412 252 260 266 594 257 ...
## $ campaign : num [1:4117] 1 1 1 2 1 4 2 1 1 1 ...
## $ pdays : num [1:4117] 999 999 999 999 999 999 999 999 3 999 ...
## $ previous : num [1:4117] 0 0 0 0 0 0 0 0 1 0 ...
## $ poutcome : chr [1:4117] "nonexistent" "nonexistent" "nonexistent" "nonexistent" ...
## $ emp.var.rate : num [1:4117] -29 -29 -18 14 -17 -29 -18 -1 -34 -34 ...
## $ cons.price.idx: num [1:4117] 92963 92963 92893 93918 94027 ...
## $ cons.conf.idx : num [1:4117] -408 -408 -462 -427 -383 -314 -471 -404 -301 -301 ...
## $ euribor3m : num [1:4117] 1.266 1.262 1.266 4.963 0.904 ...
## $ nr.employed : num [1:4117] 50762 50762 50991 52281 49916 ...
## - attr(*, "spec")=
## .. cols(
## .. age = col_double(),
## .. job = col_character(),
## .. marital = col_character(),
## .. education = col_character(),
## .. default = col_character(),
## .. housing = col_character(),
## .. loan = col_character(),
## .. contact = col_character(),
## .. month = col_character(),
## .. day_of_week = col_character(),
## .. duration = col_double(),
## .. campaign = col_double(),
## .. pdays = col_double(),
## .. previous = col_double(),
## .. poutcome = col_character(),
## .. emp.var.rate = col_double(),
## .. cons.price.idx = col_double(),
## .. cons.conf.idx = col_double(),
## .. euribor3m = col_double(),
## .. nr.employed = col_double()
## .. )
## - attr(*, "problems")=<externalptr>
# --- Preprocess dataset B ---
# Apply the same preprocessing as dataset A so the fitted model receives
# identically encoded predictors.
df_b_clean <- df_data_b
# Replace "unknown" with the mode taken from train_data (dataset A), never
# from dataset B, to avoid data leakage.
cols_unknown <- c("job", "marital", "education", "default", "housing", "loan")
for (col in cols_unknown) {
  df_b_clean[[col]] <- as.character(df_b_clean[[col]])
  train_values <- as.character(train_data[[col]])
  # Sorting the frequency table in decreasing order puts the mode first.
  train_counts <- sort(table(train_values[train_values != "unknown"]),
                       decreasing = TRUE)
  mode_val <- names(train_counts)[1]
  df_b_clean[[col]][df_b_clean[[col]] == "unknown"] <- mode_val
}
# Encode categoricals as factors using the SAME levels as the training data,
# otherwise prediction fails or is misaligned.
cat_cols <- c("job", "marital", "education", "default", "housing", "loan",
              "contact", "month", "day_of_week", "poutcome")
for (col in cat_cols) {
  train_levels <- levels(train_data[[col]])
  # factor() turns any value outside `levels` into NA without a message, so
  # report unseen categories explicitly before converting.
  unseen <- setdiff(unique(df_b_clean[[col]]), c(train_levels, NA))
  if (length(unseen) > 0) {
    warning(
      "Column '", col, "' in dataset B has values not seen in training (",
      paste(unseen, collapse = ", "), "); they become NA.",
      call. = FALSE
    )
  }
  df_b_clean[[col]] <- factor(df_b_clean[[col]], levels = train_levels)
}
# Flag numeric columns whose values fall outside the range seen in training;
# such rows are scored outside the region the model was fitted on.
is_shared_numeric <- vapply(
  names(df_b_clean),
  function(nm) is.numeric(df_b_clean[[nm]]) && is.numeric(train_data[[nm]]),
  logical(1)
)
num_cols <- names(df_b_clean)[is_shared_numeric]
n_outside <- vapply(num_cols, function(nm) {
  train_range <- range(train_data[[nm]], na.rm = TRUE)
  as.numeric(sum(
    df_b_clean[[nm]] < train_range[1] | df_b_clean[[nm]] > train_range[2],
    na.rm = TRUE
  ))
}, numeric(1))
if (any(n_outside > 0)) {
  drifted <- n_outside[n_outside > 0]
  warning(
    "Dataset B has values outside the training range in: ",
    paste0(names(drifted), " (", drifted, " rows)", collapse = ", "),
    call. = FALSE
  )
}
cat("\nCek missing value setelah preprocessing dataset B:\n")
##
## Cek missing value setelah preprocessing dataset B:
print(colSums(is.na(df_b_clean)))
## age job marital education default
## 0 0 0 0 0
## housing loan contact month day_of_week
## 0 0 0 0 0
## duration campaign pdays previous poutcome
## 0 0 0 0 0
## emp.var.rate cons.price.idx cons.conf.idx euribor3m nr.employed
## 0 0 0 0 0
# --- Predict dataset B with the selected model (`best_model`) ---
# Class labels first, then the per-class probabilities from the same model.
pred_dataset_b <- predict(best_model, newdata = df_b_clean, type = "raw")
pred_dataset_b_prob <- predict(best_model, newdata = df_b_clean, type = "prob")
cat("\nDistribusi hasil prediksi pada Dataset B:\n")
##
## Distribusi hasil prediksi pada Dataset B:
pred_counts <- table(pred_dataset_b)
print(pred_counts)
## pred_dataset_b
## no yes
## 2660 1457
print(prop.table(pred_counts))
## pred_dataset_b
## no yes
## 0.6461015 0.3538985
# --- Save the predictions to CSV ---
# The status messages below announce prob_no / prob_yes, so the class
# probabilities are written next to the predicted label (previously only
# `id` and `y_pred` were saved and pred_dataset_b_prob was never used).
hasil_prediksi <- data.frame(
  id = seq_len(nrow(df_data_b)),
  y_pred = pred_dataset_b,
  prob_no = pred_dataset_b_prob[["no"]],
  prob_yes = pred_dataset_b_prob[["yes"]]
)
write.csv(hasil_prediksi,
          "hasil_prediksi_dataset_B_ariq.csv",
          row.names = FALSE)
cat("\nHasil prediksi berhasil disimpan ke: hasil_prediksi_dataset_B_ariq.csv\n")
##
## Hasil prediksi berhasil disimpan ke: hasil_prediksi_dataset_B_ariq.csv
cat("Kolom tambahan:\n")
## Kolom tambahan:
cat(" - y_pred : label prediksi (no/yes)\n")
## - y_pred : label prediksi (no/yes)
cat(" - prob_no : probabilitas kelas 'no'\n")
## - prob_no : probabilitas kelas 'no'
cat(" - prob_yes : probabilitas kelas 'yes'\n")
## - prob_yes : probabilitas kelas 'yes'