library(readxl)
library(dplyr)
library(tidyr)
library(neuralnet)
## Warning: package 'neuralnet' was built under R version 4.4.2
library(caret)
## Warning: package 'caret' was built under R version 4.4.3
library(randomForest)
## Warning: package 'randomForest' was built under R version 4.4.2
# Read one semicolon-delimited CSV file and recode every empty-string cell
# as NA, so that is.na() can be used to count the blank answers.
read_blank_as_na <- function(path) {
  data <- read.csv(path, sep = ";")
  data[data == ""] <- NA
  data
}
# Training set, then test set (same order as the file reads them).
df.train <- read_blank_as_na("C:\\Users\\Acer\\Downloads\\Data Training.csv")
df.test <- read_blank_as_na("C:\\Users\\Acer\\Downloads\\Data Testing.csv")
# Overview of the training data: print a heading, then the per-column
# summary (numeric quantiles, or length/class/mode for character columns).
cat("Informasi Umum dari Data",'\n')
## Informasi Umum dari Data
summary(df.train)
## age job marital education
## Min. :17.00 Length:41188 Length:41188 Length:41188
## 1st Qu.:32.00 Class :character Class :character Class :character
## Median :38.00 Mode :character Mode :character Mode :character
## Mean :40.02
## 3rd Qu.:47.00
## Max. :98.00
## default housing loan contact
## Length:41188 Length:41188 Length:41188 Length:41188
## Class :character Class :character Class :character Class :character
## Mode :character Mode :character Mode :character Mode :character
##
##
##
## month day_of_week duration campaign
## Length:41188 Length:41188 Min. : 0.0 Min. : 1.000
## Class :character Class :character 1st Qu.: 102.0 1st Qu.: 1.000
## Mode :character Mode :character Median : 180.0 Median : 2.000
## Mean : 258.3 Mean : 2.568
## 3rd Qu.: 319.0 3rd Qu.: 3.000
## Max. :4918.0 Max. :56.000
## pdays previous poutcome emp.var.rate
## Min. : 0.0 Min. :0.000 Length:41188 Min. :-3.40000
## 1st Qu.:999.0 1st Qu.:0.000 Class :character 1st Qu.:-1.80000
## Median :999.0 Median :0.000 Mode :character Median : 1.10000
## Mean :962.5 Mean :0.173 Mean : 0.08189
## 3rd Qu.:999.0 3rd Qu.:0.000 3rd Qu.: 1.40000
## Max. :999.0 Max. :7.000 Max. : 1.40000
## cons.price.idx cons.conf.idx euribor3m nr.employed
## Min. :92.20 Min. :-50.8 Min. :0.634 Min. :4964
## 1st Qu.:93.08 1st Qu.:-42.7 1st Qu.:1.344 1st Qu.:5099
## Median :93.75 Median :-41.8 Median :4.857 Median :5191
## Mean :93.58 Mean :-40.5 Mean :3.621 Mean :5167
## 3rd Qu.:93.99 3rd Qu.:-36.4 3rd Qu.:4.961 3rd Qu.:5228
## Max. :94.77 Max. :-26.9 Max. :5.045 Max. :5228
## y
## Length:41188
## Class :character
## Mode :character
##
##
##
# Count the missing (NA) values in each column of the training data.
cat("Jumlah Data Kosong Pada Setiap Variabel",'\n')
## Jumlah Data Kosong Pada Setiap Variabel
colSums(is.na(df.train))
## age job marital education default
## 0 330 80 1731 8597
## housing loan contact month day_of_week
## 990 990 0 0 0
## duration campaign pdays previous poutcome
## 0 0 0 0 0
## emp.var.rate cons.price.idx cons.conf.idx euribor3m nr.employed
## 0 0 0 0 0
## y
## 0
# Three-way cross-tabulation of the customer status variables:
# job (rows) by marital status (columns), one slice per education level.
# table() leaves out rows where any of the three values is NA.
table(df.train$job,df.train$marital,df.train$education)
## , , = basic.4y
##
##
## divorced married single
## admin. 4 60 13
## blue-collar 193 1832 290
## entrepreneur 7 124 6
## housemaid 68 383 22
## management 6 85 9
## retired 151 428 16
## self-employed 5 77 11
## services 26 88 18
## student 0 0 26
## technician 10 45 3
## unemployed 16 64 32
##
## , , = basic.6y
##
##
## divorced married single
## admin. 16 106 29
## blue-collar 82 1136 205
## entrepreneur 4 63 4
## housemaid 14 62 1
## management 10 74 1
## retired 8 59 8
## self-employed 1 19 3
## services 39 137 50
## student 1 0 12
## technician 5 66 15
## unemployed 2 28 4
##
## , , = basic.9y
##
##
## divorced married single
## admin. 76 286 137
## blue-collar 286 2582 752
## entrepreneur 18 166 26
## housemaid 16 68 10
## management 9 123 34
## retired 19 117 7
## self-employed 20 179 21
## services 54 229 105
## student 0 2 96
## technician 44 252 88
## unemployed 23 136 25
##
## , , = high.school
##
##
## divorced married single
## admin. 439 1719 1170
## blue-collar 71 457 347
## entrepreneur 43 160 30
## housemaid 27 117 30
## management 36 216 45
## retired 51 204 21
## self-employed 12 72 34
## services 349 1554 775
## student 0 14 343
## technician 120 471 279
## unemployed 41 152 65
##
## , , = illiterate
##
##
## divorced married single
## admin. 0 1 0
## blue-collar 0 7 1
## entrepreneur 0 2 0
## housemaid 0 1 0
## management 0 0 0
## retired 2 1 0
## self-employed 0 3 0
## services 0 0 0
## student 0 0 0
## technician 0 0 0
## unemployed 0 0 0
##
## , , = professional.course
##
##
## divorced married single
## admin. 49 207 106
## blue-collar 35 335 83
## entrepreneur 15 100 20
## housemaid 11 34 14
## management 14 62 13
## retired 51 177 13
## self-employed 23 111 32
## services 27 125 65
## student 1 6 36
## technician 412 1888 1018
## unemployed 19 102 21
##
## , , = university.degree
##
##
## divorced married single
## admin. 671 2727 2343
## blue-collar 2 31 61
## entrepreneur 82 414 112
## housemaid 16 85 36
## management 244 1431 386
## retired 57 203 24
## self-employed 68 426 270
## services 20 73 80
## student 4 17 149
## technician 150 828 826
## unemployed 22 139 99
# Overview of the test data: print a heading, then the per-column summary.
cat("Informasi Umum dari Data",'\n')
## Informasi Umum dari Data
summary(df.test)
## age job marital education
## Min. :18.00 Length:4119 Length:4119 Length:4119
## 1st Qu.:32.00 Class :character Class :character Class :character
## Median :38.00 Mode :character Mode :character Mode :character
## Mean :40.11
## 3rd Qu.:47.00
## Max. :88.00
## default housing loan contact
## Length:4119 Length:4119 Length:4119 Length:4119
## Class :character Class :character Class :character Class :character
## Mode :character Mode :character Mode :character Mode :character
##
##
##
## month day_of_week duration campaign
## Length:4119 Length:4119 Min. : 0.0 Min. : 1.000
## Class :character Class :character 1st Qu.: 103.0 1st Qu.: 1.000
## Mode :character Mode :character Median : 181.0 Median : 2.000
## Mean : 256.8 Mean : 2.537
## 3rd Qu.: 317.0 3rd Qu.: 3.000
## Max. :3643.0 Max. :35.000
## pdays previous poutcome emp.var.rate
## Min. : 0.0 Min. :0.0000 Length:4119 Min. :-3.40000
## 1st Qu.:999.0 1st Qu.:0.0000 Class :character 1st Qu.:-1.80000
## Median :999.0 Median :0.0000 Mode :character Median : 1.10000
## Mean :960.4 Mean :0.1903 Mean : 0.08497
## 3rd Qu.:999.0 3rd Qu.:0.0000 3rd Qu.: 1.40000
## Max. :999.0 Max. :6.0000 Max. : 1.40000
## cons.price.idx cons.conf.idx euribor3m nr.employed
## Min. :92.20 Min. :-50.8 Min. :0.635 Min. :4964
## 1st Qu.:93.08 1st Qu.:-42.7 1st Qu.:1.334 1st Qu.:5099
## Median :93.75 Median :-41.8 Median :4.857 Median :5191
## Mean :93.58 Mean :-40.5 Mean :3.621 Mean :5166
## 3rd Qu.:93.99 3rd Qu.:-36.4 3rd Qu.:4.961 3rd Qu.:5228
## Max. :94.77 Max. :-26.9 Max. :5.045 Max. :5228
## y
## Length:4119
## Class :character
## Mode :character
##
##
##
# Count the missing (NA) values in each column of the test data.
cat("Jumlah Data Kosong Pada Setiap Variabel",'\n')
## Jumlah Data Kosong Pada Setiap Variabel
colSums(is.na(df.test))
## age job marital education default
## 0 39 11 167 803
## housing loan contact month day_of_week
## 105 105 0 0 0
## duration campaign pdays previous poutcome
## 0 0 0 0 0
## emp.var.rate cons.price.idx cons.conf.idx euribor3m nr.employed
## 0 0 0 0 0
## y
## 0
# Cross-tabulate job (rows) against education (columns) in the training data
# to see which job is most frequent at each education level.
table(df.train$job,df.train$education)
##
## basic.4y basic.6y basic.9y high.school illiterate
## admin. 77 151 499 3329 1
## blue-collar 2318 1426 3623 878 8
## entrepreneur 137 71 210 234 2
## housemaid 474 77 94 174 1
## management 100 85 166 298 0
## retired 597 75 145 276 3
## self-employed 93 25 220 118 3
## services 132 226 388 2682 0
## student 26 13 99 357 0
## technician 58 87 384 873 0
## unemployed 112 34 186 259 0
##
## professional.course university.degree
## admin. 363 5753
## blue-collar 453 94
## entrepreneur 135 610
## housemaid 59 139
## management 89 2063
## retired 241 285
## self-employed 168 765
## services 218 173
## student 43 170
## technician 3320 1809
## unemployed 142 262
# Fix the RNG state so the random draws made later in the script are
# reproducible.
set.seed(10)
# Fill the missing "default" values with "no" (described in the original
# analysis as the mode of that column).
df.train2 <- mutate(df.train, default = coalesce(default, "no"))
# Fill the missing entries of a vector with randomly drawn values.
#
# Args:
#   x: vector that may contain NA. A factor is converted to character first,
#     so that replacement values which are not existing levels can be stored.
#   words: candidate replacement values.
#   probabilities: sampling weights, one per element of `words`.
#
# Returns:
#   `x` with every NA replaced by an independent draw (with replacement) from
#   `words`; non-missing entries, length and type of `x` are kept, including
#   for zero-length input.
replace_na_random <- function(x, words, probabilities) {
  stopifnot(length(words) == length(probabilities))
  if (is.factor(x)) {
    x <- as.character(x)
  }
  missing.idx <- which(is.na(x))
  # Draw exactly one value per missing entry; skip sampling when nothing is
  # missing so no random numbers are consumed needlessly.
  if (length(missing.idx) > 0) {
    x[missing.idx] <- sample(
      words,
      size = length(missing.idx),
      replace = TRUE,
      prob = probabilities
    )
  }
  x
}
# ---- Imputation constants -------------------------------------------------
# Candidate values and sampling weights for the random imputations. The
# weights are hard-coded category counts divided by a hard-coded total
# (presumably tabulated from the training data - verify if the data changes).
yes.no <- c("yes", "no")
prob.hous <- c(21576, 18622) / 40198
prob.loan <- c(6248, 33950) / 40198
educate <- c(
  "basic.4y", "basic.6y", "basic.9y", "high.school",
  "illiterate", "professional.course", "university.degree"
)
prob.ed <- c(4176, 2292, 6045, 9515, 18, 5243, 12168) / 39457
# Missing marital status is only ever filled with "divorced" or "single":
# the analysis assumes respondents who left it blank are not married.
marit <- c("divorced", "single")
prob.marit <- c(4612, 11568) / 16180
# Most frequent job for each education level, taken from the job x education
# cross-tabulation of the training data. For "high.school" the mode is
# "admin." (3329 rows) rather than "services" (2682 rows).
job.by.education <- c(
  "basic.4y" = "blue-collar",
  "basic.6y" = "blue-collar",
  "basic.9y" = "blue-collar",
  "high.school" = "admin.",
  "illiterate" = "blue-collar",
  "professional.course" = "technician",
  "university.degree" = "admin."
)
# Fill a missing job with the modal job of the row's education level; rows
# whose education is not in `lookup` keep their (missing) job.
impute_job_by_education <- function(data, lookup = job.by.education) {
  mutate(
    data,
    job = if_else(is.na(job), unname(lookup[education]), job)
  )
}

# ---- Training data --------------------------------------------------------
# Randomly impute housing, then loan.
df.train3 <- df.train2 %>%
  mutate(housing = replace_na_random(housing, yes.no, prob.hous))
df.train4 <- df.train3 %>%
  mutate(loan = replace_na_random(loan, yes.no, prob.loan))
# Drop rows that still have three or more missing values.
df.train5 <- df.train4 %>%
  filter(rowSums(is.na(.)) < 3)
# Randomly impute education, then marital status.
df.train6 <- df.train5 %>%
  mutate(education = replace_na_random(education, educate, prob.ed))
df.train7 <- df.train6 %>%
  mutate(marital = replace_na_random(marital, marit, prob.marit))
# Impute job from the (now complete) education column.
clean.train <- impute_job_by_education(df.train7)

# ---- Test data ------------------------------------------------------------
# The test set goes through the same steps with the same constants, so no
# statistic is re-estimated from the test data.
# Fill the missing "default" values with "no".
df.test2 <- df.test %>%
  mutate(default = replace_na(default, "no"))
df.test3 <- df.test2 %>%
  mutate(housing = replace_na_random(housing, yes.no, prob.hous))
df.test4 <- df.test3 %>%
  mutate(loan = replace_na_random(loan, yes.no, prob.loan))
# Drop rows that still have three or more missing values.
df.test5 <- df.test4 %>%
  filter(rowSums(is.na(.)) < 3)
df.test6 <- df.test5 %>%
  mutate(education = replace_na_random(education, educate, prob.ed))
df.test7 <- df.test6 %>%
  mutate(marital = replace_na_random(marital, marit, prob.marit))
clean.test <- impute_job_by_education(df.test7)
# Prepare the training frame for modelling.
# `duration` (the 11th column) is dropped by name rather than by position so
# the step does not silently remove a different column if the column order
# changes; it is not among the predictors of the model fitted afterwards.
model.train <- clean.train[, names(clean.train) != "duration", drop = FALSE]
# Categorical predictors and the response `y` are converted to factors.
factor.cols <- c(
  "job", "marital", "education", "default", "housing", "loan",
  "contact", "month", "day_of_week", "poutcome", "y"
)
model.train[factor.cols] <- lapply(clean.train[factor.cols], as.factor)
Karena pemodelan dengan ANN tidak dapat dilakukan, pemodelan akan dilakukan dengan menggunakan random forest.
# Fit a 500-tree random forest classifier for `y` on every prepared
# predictor, then print the fitted model (call, OOB error rate and OOB
# confusion matrix).
rf <- randomForest(
  y ~ age + job + marital + education + default + housing + loan + contact +
    month + day_of_week + campaign + pdays + previous + poutcome +
    emp.var.rate + cons.price.idx + cons.conf.idx + euribor3m + nr.employed,
  data = model.train,
  ntree = 500
)
print(rf)
##
## Call:
## randomForest(formula = y ~ age + job + marital + education + default + housing + loan + contact + month + day_of_week + campaign + pdays + previous + poutcome + emp.var.rate + cons.price.idx + cons.conf.idx + euribor3m + nr.employed, data = model.train, ntree = 500)
## Type of random forest: classification
## Number of trees: 500
## No. of variables tried at each split: 4
##
## OOB estimate of error rate: 10.12%
## Confusion matrix:
## no yes class.error
## no 35692 853 0.02334109
## yes 3314 1324 0.71453213
# Confusion matrix values, read from the fitted forest instead of being typed
# in by hand so they always match the model that was actually trained.
# `rf$confusion` holds the OOB confusion matrix: rows are the actual classes,
# columns the predicted classes (plus a trailing class.error column).
oob.confusion <- rf$confusion
TN <- oob.confusion["no", "no"]
FP <- oob.confusion["no", "yes"]
FN <- oob.confusion["yes", "no"]
TP <- oob.confusion["yes", "yes"]
# Accuracy: share of all observations classified correctly
accuracy <- (TP + TN) / (TP + TN + FP + FN)
# Sensitivity (recall): share of actual "yes" predicted as "yes"
sensitivity <- TP / (TP + FN)
# Specificity: share of actual "no" predicted as "no"
specificity <- TN / (TN + FP)
# Balanced accuracy: mean of sensitivity and specificity
balanced_accuracy <- (sensitivity + specificity) / 2
# Precision: share of predicted "yes" that are actually "yes"
precision <- TP / (TP + FP)
# F1 score: harmonic mean of precision and sensitivity
f1_score <- 2 * ((precision * sensitivity) / (precision + sensitivity))
# Print the evaluation metrics computed above for the training (OOB) results.
# Each `##` line appears to be the output recorded when the document was
# rendered. `precision` is computed above but not printed here.
cat("Akurasi:", accuracy, "\n")
## Akurasi: 0.8989874
cat("Sensitivitas (Recall):", sensitivity, "\n")
## Sensitivitas (Recall): 0.2878396
cat("Spesifisitas:", specificity, "\n")
## Spesifisitas: 0.9765495
cat("Balanced Accuracy:", balanced_accuracy, "\n")
## Balanced Accuracy: 0.6321945
cat("F1 Score:", f1_score, "\n")
## F1 Score: 0.3909224
Model memiliki akurasi yang cukup bagus dan spesifisitas yang baik, tetapi sensitivitas yang buruk, balanced accuracy yang cukup buruk, dan F1 score yang buruk. Nilai-nilai yang kecil tersebut muncul pada metrik yang bergantung pada TP: karena kelas “yes” terlalu sedikit dalam data training, terjadi ketimpangan kelas yang tidak diharapkan.
# Prepare the test frame for prediction, mirroring the training preparation.
# `duration` (the 11th column) is dropped by name rather than by position.
model.test <- clean.test[, names(clean.test) != "duration", drop = FALSE]
# Categorical columns are converted to factors that reuse the level sets of
# the training factors. predict() on a randomForest requires matching levels,
# and deriving them from the test data alone would fail whenever a category
# (for example a rare education level) is absent from the test set.
factor.cols <- c(
  "job", "marital", "education", "default", "housing", "loan",
  "contact", "month", "day_of_week", "poutcome", "y"
)
model.test[factor.cols] <- lapply(factor.cols, function(col) {
  train.levels <- levels(model.train[[col]])
  unseen <- setdiff(unique(na.omit(clean.test[[col]])), train.levels)
  # A value never seen in training cannot be scored; fail loudly instead of
  # letting factor() turn it into NA.
  if (length(unseen) > 0) {
    stop(
      "Column '", col, "' has values not present in the training data: ",
      paste(unseen, collapse = ", "),
      call. = FALSE
    )
  }
  factor(clean.test[[col]], levels = train.levels)
})
# The test set carries the true labels, so predictions can be scored.
actual_labels <- model.test[["y"]]
# Predict the class of every test row with the fitted random forest.
predictions <- predict(rf, newdata = model.test)
# Confusion matrix: predicted class in rows, actual class in columns.
conf_matrix <- table(Predicted = predictions, Actual = actual_labels)
# Show the confusion matrix.
print(conf_matrix)
## Actual
## Predicted no yes
## no 3666 201
## yes 2 250
# Evaluation metrics on the test set, computed from the predicted and
# actual label vectors.
pred.yes <- predictions == "yes"
pred.no <- predictions == "no"
actual.yes <- actual_labels == "yes"
actual.no <- actual_labels == "no"
# True positives and true negatives
n.tp <- sum(pred.yes & actual.yes)
n.tn <- sum(pred.no & actual.no)
# Accuracy: share of test rows predicted correctly
accuracy <- sum(predictions == actual_labels) / length(actual_labels)
# Sensitivity (recall): share of actual "yes" predicted as "yes"
sensitivity <- n.tp / sum(actual.yes)
# Specificity: share of actual "no" predicted as "no"
specificity <- n.tn / sum(actual.no)
# Precision: share of predicted "yes" that are actually "yes"
precision <- n.tp / sum(pred.yes)
# Balanced accuracy: mean of sensitivity and specificity
balanced_accuracy <- (sensitivity + specificity) / 2
# F1 score: harmonic mean of precision and sensitivity
f1_score <- 2 * ((precision * sensitivity) / (precision + sensitivity))
# Print the evaluation metrics (`precision` itself is not printed).
cat("Akurasi:", accuracy, "\n")
## Akurasi: 0.9507162
cat("Sensitivitas (Recall):", sensitivity, "\n")
## Sensitivitas (Recall): 0.5543237
cat("Spesifisitas:", specificity, "\n")
## Spesifisitas: 0.9994547
cat("Balanced Accuracy:", balanced_accuracy, "\n")
## Balanced Accuracy: 0.7768892
cat("F1 Score:", f1_score, "\n")
## F1 Score: 0.7112376
Ketika model diujikan pada data uji, pola yang didapatkan sama dengan hasil pada data training: spesifisitas tinggi dan sensitivitas jauh lebih rendah. Selain itu, performa model pada data uji lebih baik dibandingkan estimasi OOB pada data training di semua metrik, yang menunjukkan bahwa model sudah cukup baik. Namun, perlu dipastikan bahwa data uji tidak tumpang-tindih dengan data training, karena tumpang-tindih akan membuat hasil pada data uji terlalu optimis.