Import Library

library(dplyr)

## 
## Attaching package: 'dplyr'

## The following objects are masked from 'package:stats':
## 
##     filter, lag

## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union

library(party)

## Warning: package 'party' was built under R version 4.4.3

## Loading required package: grid

## Loading required package: mvtnorm

## Loading required package: modeltools

## Loading required package: stats4

## Loading required package: strucchange

## Warning: package 'strucchange' was built under R version 4.4.2

## Loading required package: zoo

## Warning: package 'zoo' was built under R version 4.4.2

## 
## Attaching package: 'zoo'

## The following objects are masked from 'package:base':
## 
##     as.Date, as.Date.numeric

## Loading required package: sandwich

## Warning: package 'sandwich' was built under R version 4.4.2

## 
## Attaching package: 'party'

## The following object is masked from 'package:dplyr':
## 
##     where

library(randomForest)

## Warning: package 'randomForest' was built under R version 4.4.3

## randomForest 4.7-1.2

## Type rfNews() to see new features/changes/bug fixes.

## 
## Attaching package: 'randomForest'

## The following object is masked from 'package:dplyr':
## 
##     combine

library(caret)

## Warning: package 'caret' was built under R version 4.4.2

## Loading required package: ggplot2

## 
## Attaching package: 'ggplot2'

## The following object is masked from 'package:randomForest':
## 
##     margin

## Loading required package: lattice

library(class)
library(nnet)

## Warning: package 'nnet' was built under R version 4.4.3

library(mlbench)

## Warning: package 'mlbench' was built under R version 4.4.3

library(e1071)
library(rJava)

## 
## Attaching package: 'rJava'

## The following object is masked from 'package:modeltools':
## 
##     clone

library(RWeka)

## Warning: package 'RWeka' was built under R version 4.4.3

library(partykit)

## Warning: package 'partykit' was built under R version 4.4.3

## Loading required package: libcoin

## Warning: package 'libcoin' was built under R version 4.4.3

## 
## Attaching package: 'partykit'

## The following objects are masked from 'package:party':
## 
##     cforest, ctree, ctree_control, edge_simple, mob, mob_control,
##     node_barplot, node_bivplot, node_boxplot, node_inner, node_surv,
##     node_terminal, varimp

library(kknn)

## Warning: package 'kknn' was built under R version 4.4.3

## 
## Attaching package: 'kknn'

## The following object is masked from 'package:caret':
## 
##     contr.dummy

library(fastDummies)

## Warning: package 'fastDummies' was built under R version 4.4.3

library(MLmetrics)

## Warning: package 'MLmetrics' was built under R version 4.4.3

## 
## Attaching package: 'MLmetrics'

## The following objects are masked from 'package:caret':
## 
##     MAE, RMSE

## The following object is masked from 'package:base':
## 
##     Recall

library(rpart)

## Warning: package 'rpart' was built under R version 4.4.3

library(RColorBrewer)
library(rpart.plot)

## Warning: package 'rpart.plot' was built under R version 4.4.3

library(rattle)

## Warning: package 'rattle' was built under R version 4.4.3

## Loading required package: tibble

## Loading required package: bitops

## Rattle: A free graphical interface for data science with R.
## Version 5.5.1 Copyright (c) 2006-2021 Togaware Pty Ltd.
## Type 'rattle()' to shake, rattle, and roll your data.

## 
## Attaching package: 'rattle'

## The following object is masked from 'package:randomForest':
## 
##     importance

library(neuralnet)

## Warning: package 'neuralnet' was built under R version 4.4.3

## 
## Attaching package: 'neuralnet'

## The following object is masked from 'package:dplyr':
## 
##     compute

library(readxl)
library(NeuralNetTools)

## Warning: package 'NeuralNetTools' was built under R version 4.4.3

Import Dataset

# Load the training bank dataset from its CSV file into a data frame.
raw_data <- read.csv(file = "bank latih.csv")

# Show each column's name, type, and first values to inspect the structure.
str(object = raw_data)

## 'data.frame':    4521 obs. of  17 variables:
##  $ Age      : int  30 33 35 30 59 35 36 39 41 43 ...
##  $ job      : chr  "unemployed" "services" "management" "management" ...
##  $ marital  : chr  "married" "married" "single" "married" ...
##  $ education: chr  "primary" "secondary" "tertiary" "tertier" ...
##  $ default  : chr  "no" "no" "no" "no" ...
##  $ balance  : int  1787 4789 1350 1476 0 747 307 147 221 -88 ...
##  $ housing  : chr  "no" "yes" "yes" "yes" ...
##  $ loan     : chr  "no" "yes" "no" "yes" ...
##  $ contact  : chr  "cellular" "cellular" "cellular" "unknown" ...
##  $ day      : int  19 11 16 3 5 23 14 6 14 17 ...
##  $ month    : chr  "10" "may" "apr" "jun" ...
##  $ duration : int  79 220 185 199 226 141 341 151 57 313 ...
##  $ campaign : int  1 1 1 4 1 2 1 2 2 1 ...
##  $ pdays    : int  -1 339 330 -1 -1 176 330 -1 -1 147 ...
##  $ previous : int  0 4 1 0 0 3 2 0 0 2 ...
##  $ poutcome : chr  "unknown" "failure" "failure" "unknown" ...
##  $ y        : chr  "no" "no" "no" "no" ...

Preprocessing

# Work on a copy so the raw import stays available unchanged.
data <- raw_data

# List the distinct values of every character column to spot inconsistent
# spellings (the result is auto-printed).
is_text_col <- vapply(data, is.character, logical(1))
lapply(data[, is_text_col], unique)

## $job
##  [1] "unemployed"    "services"      "management"    "blue-collar"  
##  [5] "self-employed" "technician"    "entrepreneur"  "admin."       
##  [9] "student"       "housemaid"     "retired"       "unknown"      
## 
## $marital
## [1] "married"  "single"   "menikah"  "divorced"
## 
## $education
## [1] "primary"   "secondary" "tertiary"  "tertier"   "sekunder"  "unknown"  
## 
## $default
## [1] "no"  "yes"
## 
## $housing
## [1] "no"    "yes"   "tidak"
## 
## $loan
## [1] "no"    "yes"   "tidak"
## 
## $contact
## [1] "cellular"  "unknown"   "seluler"   "telephone"
## 
## $month
##  [1] "10"  "may" "apr" "jun" "feb" "aug" "jan" "7"   "nov" "jul" "oct" "sep"
## [13] "mar" "dec"
## 
## $poutcome
## [1] "unknown" "failure" "other"   "success"
## 
## $y
## [1] "no"  "iya" "yes"

# Harmonise inconsistent entries: Indonesian spellings and numeric month codes
# are mapped to the English labels used by the rest of each column.
# Each element maps "value found in the data" -> "canonical value".
recode_map <- list(
  marital   = c(menikah = "married"),
  education = c(tertier = "tertiary", sekunder = "secondary"),
  housing   = c(tidak = "no"),
  loan      = c(tidak = "no"),
  contact   = c(seluler = "cellular"),
  month     = c("10" = "oct", "7" = "jul"),
  y         = c(iya = "yes")
)

for (col in names(recode_map)) {
  lookup <- recode_map[[col]]
  # Only entries that match a key are rewritten; everything else is kept.
  hit <- data[[col]] %in% names(lookup)
  data[[col]][hit] <- unname(lookup[data[[col]][hit]])
}
str(data)

## 'data.frame':    4521 obs. of  17 variables:
##  $ Age      : int  30 33 35 30 59 35 36 39 41 43 ...
##  $ job      : chr  "unemployed" "services" "management" "management" ...
##  $ marital  : chr  "married" "married" "single" "married" ...
##  $ education: chr  "primary" "secondary" "tertiary" "tertiary" ...
##  $ default  : chr  "no" "no" "no" "no" ...
##  $ balance  : int  1787 4789 1350 1476 0 747 307 147 221 -88 ...
##  $ housing  : chr  "no" "yes" "yes" "yes" ...
##  $ loan     : chr  "no" "yes" "no" "yes" ...
##  $ contact  : chr  "cellular" "cellular" "cellular" "unknown" ...
##  $ day      : int  19 11 16 3 5 23 14 6 14 17 ...
##  $ month    : chr  "oct" "may" "apr" "jun" ...
##  $ duration : int  79 220 185 199 226 141 341 151 57 313 ...
##  $ campaign : int  1 1 1 4 1 2 1 2 2 1 ...
##  $ pdays    : int  -1 339 330 -1 -1 176 330 -1 -1 147 ...
##  $ previous : int  0 4 1 0 0 3 2 0 0 2 ...
##  $ poutcome : chr  "unknown" "failure" "failure" "unknown" ...
##  $ y        : chr  "no" "no" "no" "no" ...

# Convert every categorical column (including the target `y`) to a factor.
factor_cols <- c(
  "job", "marital", "education", "default", "housing",
  "loan", "contact", "poutcome", "y", "month"
)
data[factor_cols] <- lapply(data[factor_cols], as.factor)
head(data)

str(data)

## 'data.frame':    4521 obs. of  17 variables:
##  $ Age      : int  30 33 35 30 59 35 36 39 41 43 ...
##  $ job      : Factor w/ 12 levels "admin.","blue-collar",..: 11 8 5 5 2 5 7 10 3 8 ...
##  $ marital  : Factor w/ 3 levels "divorced","married",..: 2 2 3 2 2 3 2 2 2 2 ...
##  $ education: Factor w/ 4 levels "primary","secondary",..: 1 2 3 3 2 3 3 2 3 1 ...
##  $ default  : Factor w/ 2 levels "no","yes": 1 1 1 1 1 1 1 1 1 1 ...
##  $ balance  : int  1787 4789 1350 1476 0 747 307 147 221 -88 ...
##  $ housing  : Factor w/ 2 levels "no","yes": 1 2 2 2 2 1 2 2 2 2 ...
##  $ loan     : Factor w/ 2 levels "no","yes": 1 2 1 2 1 1 1 1 1 2 ...
##  $ contact  : Factor w/ 3 levels "cellular","telephone",..: 1 1 1 3 3 1 1 1 3 1 ...
##  $ day      : int  19 11 16 3 5 23 14 6 14 17 ...
##  $ month    : Factor w/ 12 levels "apr","aug","dec",..: 11 9 1 7 9 4 9 9 9 1 ...
##  $ duration : int  79 220 185 199 226 141 341 151 57 313 ...
##  $ campaign : int  1 1 1 4 1 2 1 2 2 1 ...
##  $ pdays    : int  -1 339 330 -1 -1 176 330 -1 -1 147 ...
##  $ previous : int  0 4 1 0 0 3 2 0 0 2 ...
##  $ poutcome : Factor w/ 4 levels "failure","other",..: 4 1 1 4 4 1 2 4 4 1 ...
##  $ y        : Factor w/ 2 levels "no","yes": 1 1 1 1 1 1 1 1 1 1 ...

Pemisahan Data Training dan testing

# Train/test split shared by the RF, DT, NB and KNN models.
set.seed(777)

# Partition on the target `y`, keeping 80% of the rows for training.
inTrain <- createDataPartition(y = data[["y"]], p = 0.8, list = FALSE)
train <- data[inTrain, ]
test <- data[-inTrain, ]

# Separate predictors and target for model evaluation.
predictor_names <- setdiff(names(data), "y")

X_train <- train[, predictor_names]
X_test <- test[, predictor_names]

y_train <- train[["y"]]
y_test <- test[["y"]]

head(train)

head(test)

Preprocessing Deep Learning

# The minority class "yes" was upsampled by random resampling beforehand,
# in Python with the sklearn library; this chunk only loads that result.
upsample_data <- read_excel(path = "Hasil Preprocessing dan Upsample.xlsx")
head(upsample_data)

One-Hot Encoding

# One-hot encode every categorical predictor.
categorical_cols <- c(
  "job", "marital", "education", "default", "housing",
  "loan", "contact", "poutcome", "month"
)

# Encode one column at a time; dropping the intercept (`- 1`) on a
# single-column model matrix yields one indicator column per category.
encode_column <- function(col) {
  model.matrix(~ . - 1, data = upsample_data[col])
}
ohe_parts <- lapply(categorical_cols, encode_column)

# Bind the per-column indicator matrices side by side.
ohe_all <- do.call(cbind, ohe_parts)

# Keep the remaining (non-categorical) columns and append the indicators.
keep_numeric <- !(names(upsample_data) %in% categorical_cols)
data_numeric <- upsample_data[, keep_numeric]
data2 <- cbind(data_numeric, ohe_all)
head(data2)

Pembagian Data Training dan Testing untuk Metode ANN

set.seed(777)

# NOTE(review): `data2` comes from data that was upsampled BEFORE this split,
# so copies of the same minority-class row may land in both train2 and test2
# and inflate the ANN test metrics -- confirm, and ideally upsample the
# training part only.
train.index <- createDataPartition(data2$y, p = 0.8, list = FALSE)
train2 <- data2[train.index, ]
test2 <- data2[-train.index, ]

# Select the predictors by name instead of a hard-coded column position, so
# the target is excluded even if the column order of `data2` changes.
feature_cols <- setdiff(names(train2), "y")

# Fit min-max scaling (range 0-1) on the training predictors only.
preprocessParams <- preProcess(
  train2[, feature_cols, drop = FALSE],
  method = c("range")
)

# Apply the training-set scaling to both splits.
train_X <- as.matrix(
  predict(preprocessParams, train2[, feature_cols, drop = FALSE])
)
test_X <- as.matrix(
  predict(preprocessParams, test2[, feature_cols, drop = FALSE])
)
# test_new_x <-as.matrix(predict(preprocessParams, test_new_ann))
# test_new_x_df <- data.frame(test_new_x)
# head(test_new_x_df)

# Encode the target as 0/1 using one shared set of levels, so the coding is
# the same in both splits (first level -> 0, second level -> 1).
y_levels <- levels(factor(data2$y))
train_y <- as.numeric(factor(train2$y, levels = y_levels)) - 1
test_y <- as.numeric(factor(test2$y, levels = y_levels)) - 1

# Combine scaled predictors and numeric target into one data frame per split.
train_data <- data.frame(train_X)
train_data$y <- train_y

test_data <- data.frame(test_X)
test_data$y <- test_y

head(train_data)

head(test_data)

Pemodelan

Naive Bayes

set.seed(777)

# Fit a Naive Bayes classifier of `y` on all other columns of `train`.
output.nb <- naiveBayes(y ~ ., data = train)

# NOTE(review): the confusion-matrix output reports 'Positive' Class : no, so
# the precision / recall / F1 values below describe the majority class "no",
# not the "yes" class.

# --- Training split: predictions, confusion matrix and derived metrics ---
pred_train_nb <- predict(output.nb, newdata = train)
conf_matrix_train_nb <- confusionMatrix(data = pred_train_nb, reference = train$y)
acc_train_nb <- conf_matrix_train_nb$overall["Accuracy"]
precision_train_nb <- precision(data = pred_train_nb, reference = y_train)
recall_train_nb <- recall(data = pred_train_nb, reference = y_train)
f1_train_nb <- F_meas(data = pred_train_nb, reference = y_train)

# --- Test split: same metrics on held-out rows ---
pred_test_nb <- predict(output.nb, newdata = test)
conf_matrix_test_nb <- confusionMatrix(data = pred_test_nb, reference = test$y)
acc_test_nb <- conf_matrix_test_nb$overall["Accuracy"]
precision_test_nb <- precision(data = pred_test_nb, reference = y_test)
recall_test_nb <- recall(data = pred_test_nb, reference = y_test)
f1_test_nb <- F_meas(data = pred_test_nb, reference = y_test)

# 7. Cetak hasil
cat("=== TRAINING DATA ===\n")

## === TRAINING DATA ===

print(conf_matrix_train_nb )

## Confusion Matrix and Statistics
## 
##           Reference
## Prediction   no  yes
##        no  2905  206
##        yes  295  211
##                                           
##                Accuracy : 0.8615          
##                  95% CI : (0.8498, 0.8726)
##     No Information Rate : 0.8847          
##     P-Value [Acc > NIR] : 1               
##                                           
##                   Kappa : 0.3787          
##                                           
##  Mcnemar's Test P-Value : 8.44e-05        
##                                           
##             Sensitivity : 0.9078          
##             Specificity : 0.5060          
##          Pos Pred Value : 0.9338          
##          Neg Pred Value : 0.4170          
##              Prevalence : 0.8847          
##          Detection Rate : 0.8032          
##    Detection Prevalence : 0.8601          
##       Balanced Accuracy : 0.7069          
##                                           
##        'Positive' Class : no              
##

cat(sprintf("Akurasi: %.4f\n", acc_train_nb ))

## Akurasi: 0.8615

cat(sprintf("Precision: %.4f\n", precision_train_nb ))

## Precision: 0.9338

cat(sprintf("Recall: %.4f\n", recall_train_nb ))

## Recall: 0.9078

cat(sprintf("F1 Score: %.4f\n\n", f1_train_nb ))

## F1 Score: 0.9206

cat("=== TESTING DATA ===\n")

## === TESTING DATA ===

print(conf_matrix_test_nb )

## Confusion Matrix and Statistics
## 
##           Reference
## Prediction  no yes
##        no  735  51
##        yes  65  53
##                                           
##                Accuracy : 0.8717          
##                  95% CI : (0.8481, 0.8928)
##     No Information Rate : 0.885           
##     P-Value [Acc > NIR] : 0.9022          
##                                           
##                   Kappa : 0.4047          
##                                           
##  Mcnemar's Test P-Value : 0.2274          
##                                           
##             Sensitivity : 0.9187          
##             Specificity : 0.5096          
##          Pos Pred Value : 0.9351          
##          Neg Pred Value : 0.4492          
##              Prevalence : 0.8850          
##          Detection Rate : 0.8131          
##    Detection Prevalence : 0.8695          
##       Balanced Accuracy : 0.7142          
##                                           
##        'Positive' Class : no              
##

cat(sprintf("Akurasi: %.4f\n", acc_test_nb ))

## Akurasi: 0.8717

cat(sprintf("Precision: %.4f\n", precision_test_nb ))

## Precision: 0.9351

cat(sprintf("Recall: %.4f\n", recall_test_nb ))

## Recall: 0.9187

cat(sprintf("F1 Score: %.4f\n", f1_test_nb ))

## F1 Score: 0.9269

Random Forest

set.seed(777)
# 1. Train the model: 50 trees, 3 candidate variables tried at each split.
# (The call previously ended with a trailing comma, which passes an extra
# empty argument; it is removed here.)
output.rf <- randomForest(
  y ~ .,
  data = train,
  ntree = 50,
  mtry = 3
)

# 2. Predict on the training and test splits
pred_train_rf <- predict(output.rf, newdata = train)
pred_test_rf <- predict(output.rf, newdata = test)

# 3. Confusion matrices
conf_matrix_train_rf <- confusionMatrix(pred_train_rf, train$y)
conf_matrix_test_rf <- confusionMatrix(pred_test_rf, test$y)

# 4. Accuracy
acc_train_rf <- conf_matrix_train_rf$overall["Accuracy"]
acc_test_rf <- conf_matrix_test_rf$overall["Accuracy"]

# 5. Precision and recall
# NOTE(review): the confusion-matrix output reports 'Positive' Class : no, so
# these values (and F1 below) describe the majority class "no".
precision_train_rf <- precision(data = pred_train_rf, reference = y_train)
precision_test_rf <- precision(data = pred_test_rf, reference = y_test)

recall_train_rf <- recall(data = pred_train_rf, reference = y_train)
recall_test_rf <- recall(data = pred_test_rf, reference = y_test)

# 6. F1 score
f1_train_rf <- F_meas(data = pred_train_rf, reference = y_train)
f1_test_rf <- F_meas(data = pred_test_rf, reference = y_test)

# 7. Cetak hasil
cat("=== TRAINING DATA ===\n")

## === TRAINING DATA ===

print(conf_matrix_train_rf )

## Confusion Matrix and Statistics
## 
##           Reference
## Prediction   no  yes
##        no  3200    6
##        yes    0  411
##                                           
##                Accuracy : 0.9983          
##                  95% CI : (0.9964, 0.9994)
##     No Information Rate : 0.8847          
##     P-Value [Acc > NIR] : < 2e-16         
##                                           
##                   Kappa : 0.9918          
##                                           
##  Mcnemar's Test P-Value : 0.04123         
##                                           
##             Sensitivity : 1.0000          
##             Specificity : 0.9856          
##          Pos Pred Value : 0.9981          
##          Neg Pred Value : 1.0000          
##              Prevalence : 0.8847          
##          Detection Rate : 0.8847          
##    Detection Prevalence : 0.8864          
##       Balanced Accuracy : 0.9928          
##                                           
##        'Positive' Class : no              
##

cat(sprintf("Akurasi: %.4f\n", acc_train_rf ))

## Akurasi: 0.9983

cat(sprintf("Precision: %.4f\n", precision_train_rf ))

## Precision: 0.9981

cat(sprintf("Recall: %.4f\n", recall_train_rf ))

## Recall: 1.0000

cat(sprintf("F1 Score: %.4f\n\n", f1_train_rf ))

## F1 Score: 0.9991

cat("=== TESTING DATA ===\n")

## === TESTING DATA ===

print(conf_matrix_test_rf )

## Confusion Matrix and Statistics
## 
##           Reference
## Prediction  no yes
##        no  783  70
##        yes  17  34
##                                           
##                Accuracy : 0.9038          
##                  95% CI : (0.8826, 0.9222)
##     No Information Rate : 0.885           
##     P-Value [Acc > NIR] : 0.04018         
##                                           
##                   Kappa : 0.3927          
##                                           
##  Mcnemar's Test P-Value : 2.476e-08       
##                                           
##             Sensitivity : 0.9788          
##             Specificity : 0.3269          
##          Pos Pred Value : 0.9179          
##          Neg Pred Value : 0.6667          
##              Prevalence : 0.8850          
##          Detection Rate : 0.8662          
##    Detection Prevalence : 0.9436          
##       Balanced Accuracy : 0.6528          
##                                           
##        'Positive' Class : no              
##

cat(sprintf("Akurasi: %.4f\n", acc_test_rf ))

## Akurasi: 0.9038

cat(sprintf("Precision: %.4f\n", precision_test_rf ))

## Precision: 0.9179

cat(sprintf("Recall: %.4f\n", recall_test_rf ))

## Recall: 0.9788

cat(sprintf("F1 Score: %.4f\n", f1_test_rf ))

## F1 Score: 0.9474

KNN

set.seed(777)
# 1. Train the model
# train.kknn() does its own search over k = 1..kmax (leave-one-out
# cross-validation per the kknn documentation). The `tuneGrid` and `trControl`
# arguments that used to be passed here belong to caret::train(); train.kknn()
# silently ignored them, so no 10-fold CV was ever run. They are removed to
# stop the call from advertising a tuning scheme it does not use.
output.knn <- train.kknn(y ~ ., data = train,
                         kmax = 20,
                         distance = 2,
                         kernel = "gaussian")


# 2. Predict on the training split (the test-split prediction follows below)
pred_train_knn <- predict(output.knn, newdata = train)

## Warning in model.matrix.default(mt2, test, contrasts.arg = contrasts.arg):
## variable 'y' is absent, its contrast will be ignored

pred_test_knn <- predict(output.knn, newdata = test)

## Warning in model.matrix.default(mt2, test, contrasts.arg = contrasts.arg):
## variable 'y' is absent, its contrast will be ignored

# Evaluation of the KNN predictions, grouped per split.
# NOTE(review): the confusion-matrix output reports 'Positive' Class : no, so
# precision / recall / F1 below describe the majority class "no".

# --- Training split ---
conf_matrix_train_knn <- confusionMatrix(data = pred_train_knn, reference = train$y)
acc_train_knn <- conf_matrix_train_knn$overall["Accuracy"]
precision_train_knn <- precision(data = pred_train_knn, reference = y_train)
recall_train_knn <- recall(data = pred_train_knn, reference = y_train)
f1_train_knn <- F_meas(data = pred_train_knn, reference = y_train)

# --- Test split ---
conf_matrix_test_knn <- confusionMatrix(data = pred_test_knn, reference = test$y)
acc_test_knn <- conf_matrix_test_knn$overall["Accuracy"]
precision_test_knn <- precision(data = pred_test_knn, reference = y_test)
recall_test_knn <- recall(data = pred_test_knn, reference = y_test)
f1_test_knn <- F_meas(data = pred_test_knn, reference = y_test)

# 7. Cetak hasil
cat("=== TRAINING DATA ===\n")

## === TRAINING DATA ===

print(conf_matrix_train_knn )

## Confusion Matrix and Statistics
## 
##           Reference
## Prediction   no  yes
##        no  3188  283
##        yes   12  134
##                                          
##                Accuracy : 0.9184         
##                  95% CI : (0.909, 0.9272)
##     No Information Rate : 0.8847         
##     P-Value [Acc > NIR] : 1.719e-11      
##                                          
##                   Kappa : 0.4427         
##                                          
##  Mcnemar's Test P-Value : < 2.2e-16      
##                                          
##             Sensitivity : 0.9962         
##             Specificity : 0.3213         
##          Pos Pred Value : 0.9185         
##          Neg Pred Value : 0.9178         
##              Prevalence : 0.8847         
##          Detection Rate : 0.8814         
##    Detection Prevalence : 0.9596         
##       Balanced Accuracy : 0.6588         
##                                          
##        'Positive' Class : no             
##

cat(sprintf("Akurasi: %.4f\n", acc_train_knn ))

## Akurasi: 0.9184

cat(sprintf("Precision: %.4f\n", precision_train_knn ))

## Precision: 0.9185

cat(sprintf("Recall: %.4f\n", recall_train_knn ))

## Recall: 0.9962

cat(sprintf("F1 Score: %.4f\n\n", f1_train_knn ))

## F1 Score: 0.9558

cat("=== TESTING DATA ===\n")

## === TESTING DATA ===

print(conf_matrix_test_knn )

## Confusion Matrix and Statistics
## 
##           Reference
## Prediction  no yes
##        no  792  85
##        yes   8  19
##                                           
##                Accuracy : 0.8971          
##                  95% CI : (0.8755, 0.9162)
##     No Information Rate : 0.885           
##     P-Value [Acc > NIR] : 0.1362          
##                                           
##                   Kappa : 0.2547          
##                                           
##  Mcnemar's Test P-Value : 3.252e-15       
##                                           
##             Sensitivity : 0.9900          
##             Specificity : 0.1827          
##          Pos Pred Value : 0.9031          
##          Neg Pred Value : 0.7037          
##              Prevalence : 0.8850          
##          Detection Rate : 0.8761          
##    Detection Prevalence : 0.9701          
##       Balanced Accuracy : 0.5863          
##                                           
##        'Positive' Class : no              
##

cat(sprintf("Akurasi: %.3f\n", acc_test_knn ))

## Akurasi: 0.897

cat(sprintf("Precision: %.3f\n", precision_test_knn ))

## Precision: 0.903

cat(sprintf("Recall: %.3f\n", recall_test_knn ))

## Recall: 0.990

cat(sprintf("F1 Score: %.3f\n", f1_test_knn ))

## F1 Score: 0.945

Decision Tree

set.seed(777)

# Fit a classification tree of `y` on all other columns of `train`
# (minsplit = 10, complexity parameter cp = 0.01).
tree_control <- rpart.control(minsplit = 10, cp = 0.01)
output.tree <- rpart(y ~ ., data = train, control = tree_control)

# Predict class labels for both splits and pin their factor levels to those
# of the reference vectors.
pred_train_tree <- factor(
  predict(output.tree, newdata = X_train, type = "class"),
  levels = levels(y_train)
)
pred_test_tree <- factor(
  predict(output.tree, newdata = X_test, type = "class"),
  levels = levels(y_test)
)

# NOTE(review): the confusion-matrix output reports 'Positive' Class : no, so
# precision / recall / F1 below describe the majority class "no".

# --- Training split ---
conf_matrix_train_tree <- confusionMatrix(data = pred_train_tree, reference = train$y)
acc_train_tree <- conf_matrix_train_tree$overall["Accuracy"]
precision_train_tree <- precision(data = pred_train_tree, reference = y_train)
recall_train_tree <- recall(data = pred_train_tree, reference = y_train)
f1_train_tree <- F_meas(data = pred_train_tree, reference = y_train)

# --- Test split ---
conf_matrix_test_tree <- confusionMatrix(data = pred_test_tree, reference = test$y)
acc_test_tree <- conf_matrix_test_tree$overall["Accuracy"]
precision_test_tree <- precision(data = pred_test_tree, reference = y_test)
recall_test_tree <- recall(data = pred_test_tree, reference = y_test)
f1_test_tree <- F_meas(data = pred_test_tree, reference = y_test)

# 7. Cetak hasil
cat("=== TRAINING DATA ===\n")

## === TRAINING DATA ===

print(conf_matrix_train_tree )

## Confusion Matrix and Statistics
## 
##           Reference
## Prediction   no  yes
##        no  3099  222
##        yes  101  195
##                                           
##                Accuracy : 0.9107          
##                  95% CI : (0.9009, 0.9198)
##     No Information Rate : 0.8847          
##     P-Value [Acc > NIR] : 2.366e-07       
##                                           
##                   Kappa : 0.499           
##                                           
##  Mcnemar's Test P-Value : 2.439e-11       
##                                           
##             Sensitivity : 0.9684          
##             Specificity : 0.4676          
##          Pos Pred Value : 0.9332          
##          Neg Pred Value : 0.6588          
##              Prevalence : 0.8847          
##          Detection Rate : 0.8568          
##    Detection Prevalence : 0.9182          
##       Balanced Accuracy : 0.7180          
##                                           
##        'Positive' Class : no              
##

cat(sprintf("Akurasi: %.4f\n", acc_train_tree ))

## Akurasi: 0.9107

cat(sprintf("Precision: %.4f\n", precision_train_tree ))

## Precision: 0.9332

cat(sprintf("Recall: %.4f\n", recall_train_tree ))

## Recall: 0.9684

cat(sprintf("F1 Score: %.4f\n\n", f1_train_tree ))

## F1 Score: 0.9505

cat("=== TESTING DATA ===\n")

## === TESTING DATA ===

print(conf_matrix_test_tree )

## Confusion Matrix and Statistics
## 
##           Reference
## Prediction  no yes
##        no  777  57
##        yes  23  47
##                                           
##                Accuracy : 0.9115          
##                  95% CI : (0.8911, 0.9292)
##     No Information Rate : 0.885           
##     P-Value [Acc > NIR] : 0.0057884       
##                                           
##                   Kappa : 0.4933          
##                                           
##  Mcnemar's Test P-Value : 0.0002247       
##                                           
##             Sensitivity : 0.9712          
##             Specificity : 0.4519          
##          Pos Pred Value : 0.9317          
##          Neg Pred Value : 0.6714          
##              Prevalence : 0.8850          
##          Detection Rate : 0.8595          
##    Detection Prevalence : 0.9226          
##       Balanced Accuracy : 0.7116          
##                                           
##        'Positive' Class : no              
##

cat(sprintf("Akurasi: %.4f\n", acc_test_tree ))

## Akurasi: 0.9115

cat(sprintf("Precision: %.4f\n", precision_test_tree ))

## Precision: 0.9317

cat(sprintf("Recall: %.4f\n", recall_test_tree ))

## Recall: 0.9712

cat(sprintf("F1 Score: %.4f\n", f1_test_tree ))

## F1 Score: 0.9510

# Plot the fitted decision tree.
rpart.plot(output.tree, box.palette = "auto", nn = TRUE)

ANN

set.seed(777)
# 1. Train the model
# Build the formula "y ~ <all other columns>" explicitly from the column names.
n <- names(train_data)
f <- as.formula(paste("y ~", paste(n[!n %in% "y"], collapse = " + ")))

# Three hidden layers with 8, 5 and 3 units. linear.output = FALSE applies the
# activation function to the output unit as well (binary classification).
# The call previously ended with a trailing comma, which passes an extra empty
# argument; it is removed here.
nn <- neuralnet(
  formula = f,
  data = train_data,
  hidden = c(8, 5, 3),
  linear.output = FALSE
)

set.seed(777)
# 2. Predict on the training and test splits
# compute() is called with an explicit namespace because both dplyr and
# neuralnet export a function with that name; which one a bare call reaches
# depends on package attach order.
pr.nn_train <- neuralnet::compute(nn, train_data)
# Binary classification with a 0.5 threshold on the network output.
roundedresults_train <- ifelse(pr.nn_train$net.result > 0.5, 1, 0)
# Pair the predictions with the actual values.
results_train <- data.frame(actual = train_data$y, prediction = roundedresults_train)
# Convert both to factors with identical levels in identical order.
actual_train <- factor(results_train$actual, levels = c(0, 1))
predicted_train <- factor(results_train$prediction, levels = c(0, 1))

pr.nn_test <- neuralnet::compute(nn, test_data)
roundedresults_test <- ifelse(pr.nn_test$net.result > 0.5, 1, 0)
results_test <- data.frame(actual = test_data$y, prediction = roundedresults_test)
actual_test <- factor(results_test$actual, levels = c(0, 1))
predicted_test <- factor(results_test$prediction, levels = c(0, 1))

# 3. Confusion matrices
conf_matrix_train_ann <- confusionMatrix(predicted_train, actual_train)
conf_matrix_test_ann <- confusionMatrix(predicted_test, actual_test)

# 4. Accuracy
acc_train_ann <- conf_matrix_train_ann$overall["Accuracy"]
acc_test_ann <- conf_matrix_test_ann$overall["Accuracy"]

# 5. Precision and recall
# NOTE(review): the confusion-matrix output reports 'Positive' Class : 0, so
# these values describe class 0, not class 1.
precision_train_ann <- conf_matrix_train_ann$byClass["Precision"]
recall_train_ann <- conf_matrix_train_ann$byClass["Recall"]
precision_test_ann <- conf_matrix_test_ann$byClass["Precision"]
recall_test_ann <- conf_matrix_test_ann$byClass["Recall"]

# 6. F1 score (harmonic mean of precision and recall)
f1_train_ann <- 2 * ((precision_train_ann * recall_train_ann) / (precision_train_ann + recall_train_ann))
f1_test_ann <- 2 * ((precision_test_ann * recall_test_ann) / (precision_test_ann + recall_test_ann))

# 7. Cetak hasil
cat("=== TRAINING DATA ===\n")

## === TRAINING DATA ===

print(conf_matrix_train_ann )

## Confusion Matrix and Statistics
## 
##           Reference
## Prediction    0    1
##          0 3034   83
##          1  166 3117
##                                           
##                Accuracy : 0.9611          
##                  95% CI : (0.9561, 0.9657)
##     No Information Rate : 0.5             
##     P-Value [Acc > NIR] : < 2.2e-16       
##                                           
##                   Kappa : 0.9222          
##                                           
##  Mcnemar's Test P-Value : 2.03e-07        
##                                           
##             Sensitivity : 0.9481          
##             Specificity : 0.9741          
##          Pos Pred Value : 0.9734          
##          Neg Pred Value : 0.9494          
##              Prevalence : 0.5000          
##          Detection Rate : 0.4741          
##    Detection Prevalence : 0.4870          
##       Balanced Accuracy : 0.9611          
##                                           
##        'Positive' Class : 0               
##

cat(sprintf("Akurasi: %.4f\n", acc_train_ann ))

## Akurasi: 0.9611

cat(sprintf("Precision: %.4f\n", precision_train_ann ))

## Precision: 0.9734

cat(sprintf("Recall: %.4f\n", recall_train_ann ))

## Recall: 0.9481

cat(sprintf("F1 Score: %.4f\n\n", f1_train_ann ))

## F1 Score: 0.9606

cat("=== TESTING DATA ===\n")

## === TESTING DATA ===

print(conf_matrix_test_ann )

## Confusion Matrix and Statistics
## 
##           Reference
## Prediction   0   1
##          0 710  21
##          1  90 779
##                                           
##                Accuracy : 0.9306          
##                  95% CI : (0.9171, 0.9426)
##     No Information Rate : 0.5             
##     P-Value [Acc > NIR] : < 2.2e-16       
##                                           
##                   Kappa : 0.8613          
##                                           
##  Mcnemar's Test P-Value : 1.087e-10       
##                                           
##             Sensitivity : 0.8875          
##             Specificity : 0.9738          
##          Pos Pred Value : 0.9713          
##          Neg Pred Value : 0.8964          
##              Prevalence : 0.5000          
##          Detection Rate : 0.4437          
##    Detection Prevalence : 0.4569          
##       Balanced Accuracy : 0.9306          
##                                           
##        'Positive' Class : 0               
##

cat(sprintf("Akurasi: %.4f\n", acc_test_ann ))

## Akurasi: 0.9306

cat(sprintf("Precision: %.4f\n", precision_test_ann ))

## Precision: 0.9713

cat(sprintf("Recall: %.4f\n", recall_test_ann ))

## Recall: 0.8875

cat(sprintf("F1 Score: %.4f\n", f1_test_ann ))

## F1 Score: 0.9275

# Plot the fitted neural network.
plotnet(nn)

Perbandingan Metode

# Side-by-side summary of all five models. Each metric column lists the models
# in the same order as `Metode`; names are stripped so only the values remain.
# Precision and Recall are the test-split values.
# NOTE(review): the ANN row is scored on the split of the upsampled data
# (`test_data`), the other rows on `test`, so the rows are not evaluated on
# the same observations.
hasil_perbandingan <- data.frame(
  Metode = c("Random Forest", "Naive Bayes", "Decision Tree", "KNN", "ANN"),
  Akurasi_Train = as.numeric(c(
    acc_train_rf, acc_train_nb, acc_train_tree, acc_train_knn, acc_train_ann
  )),
  Akurasi_Test = as.numeric(c(
    acc_test_rf, acc_test_nb, acc_test_tree, acc_test_knn, acc_test_ann
  )),
  F1_Train = as.numeric(c(
    f1_train_rf, f1_train_nb, f1_train_tree, f1_train_knn, f1_train_ann
  )),
  F1_Test = as.numeric(c(
    f1_test_rf, f1_test_nb, f1_test_tree, f1_test_knn, f1_test_ann
  )),
  Precision = as.numeric(c(
    precision_test_rf, precision_test_nb, precision_test_tree,
    precision_test_knn, precision_test_ann
  )),
  Recall = as.numeric(c(
    recall_test_rf, recall_test_nb, recall_test_tree,
    recall_test_knn, recall_test_ann
  ))
)

print(hasil_perbandingan)

##          Metode Akurasi_Train Akurasi_Test  F1_Train   F1_Test Precision
## 1 Random Forest     0.9983412    0.9037611 0.9990634 0.9473684 0.9179367
## 2   Naive Bayes     0.8614874    0.8716814 0.9206148 0.9268600 0.9351145
## 3 Decision Tree     0.9106995    0.9115044 0.9504677 0.9510404 0.9316547
## 4           KNN     0.9184407    0.8971239 0.9557787 0.9445438 0.9030787
## 5           ANN     0.9610937    0.9306250 0.9605826 0.9274984 0.9712722
##    Recall
## 1 0.97875
## 2 0.91875
## 3 0.97125
## 4 0.99000
## 5 0.88750

Mencoba Memprediksi Menggunakan Data Baru Menggunakan ANN

# Load the unseen records to be scored and preview them.
test_new <- read.csv("unseen data.csv")
head(test_new)

# Numeric predictors: every column that is not one-hot encoded below.
data_numeric_new <- test_new[, !(names(test_new) %in% categorical_cols)]

# Give each categorical column the factor levels of the training data
# (`train`), so the one-hot encoding below produces one indicator column per
# training level, in the same order.
for (col in categorical_cols) {
  raw_values <- as.character(test_new[[col]])
  aligned <- factor(raw_values, levels = levels(train[[col]]))

  # A value that is not a training level turns into NA. Rows with NA are
  # dropped by the model matrix, which would break or misalign the bind
  # below, so stop early with a message that names the offending values.
  unseen <- unique(raw_values[is.na(aligned) & !is.na(raw_values)])
  if (length(unseen) > 0) {
    stop(
      "Column '", col, "' in the new data has categories not seen in ",
      "training: ", paste(unseen, collapse = ", "),
      call. = FALSE
    )
  }

  test_new[[col]] <- aligned
}

# One-hot encode each categorical column and bind the results side by side.
ohe_all_new <- do.call(cbind, lapply(categorical_cols, function(col) {
  model.matrix(~ . - 1, data = test_new[col])
}))
test_new_ann <- cbind(data_numeric_new, ohe_all_new)

# Apply the scaling fitted on the ANN training data.
test_new_x <- as.matrix(predict(preprocessParams, test_new_ann))
test_new_x_df <- data.frame(test_new_x)

# Score with the trained network. compute() is namespace-qualified because
# dplyr also exports a function with that name.
pr.nn_test_new <- neuralnet::compute(nn, test_new_x_df)
# Threshold the network output at 0.5 and turn it into a labelled factor.
roundedresults_test_new <- ifelse(pr.nn_test_new$net.result > 0.5, 1, 0)
roundedresults_test_new_label <- factor(
  ifelse(roundedresults_test_new == 1, "Yes", "No"),
  levels = c("No", "Yes")
)

# Attach the predicted label to the new records and display them.
test_new$y <- roundedresults_test_new_label
test_new

Tugas Data Mining Pertemuan 6

Kelompok 10 - 3SD2

2025-04-25

Import Library

Import Dataset

Preprocessing

Pemisahan Data Training dan testing

Preprocessing Deep Learning

One-Hot Encoding

Pembagian Data Training dan Testing untuk Metode ANN

Pemodelan

Naive Bayes

Random Forest

KNN

Decision Tree

ANN

Perbandingan Metode

Mencoba Memprediksi Menggunakan Data Baru Menggunakan ANN