# Impor Library
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(corrplot)
## Warning: package 'corrplot' was built under R version 4.3.3
## corrplot 0.92 loaded
library(FactoMineR)
## Warning: package 'FactoMineR' was built under R version 4.3.3
# Set direktori kerja (opsional, ganti dengan path folder yang sesuai)
setwd("C:/Users/asus/Documents/1 - Dok - Back Up/Faruq/STIS/6 - Data Mining/P2")
# Impor data CSV ke dalam dataframe
bank <- read.csv("bank latih.csv", TRUE, ";")
head(bank)
## Age job marital education default balance housing loan contact day
## 1 30 unemployed married primary no 1787 no no cellular 19
## 2 33 services married secondary no 4789 yes yes cellular 11
## 3 35 management single tertiary no 1350 yes no cellular 16
## 4 30 management married tertier no 1476 yes yes unknown 3
## 5 59 blue-collar married secondary no 0 yes no unknown 5
## 6 35 management single tertiary no 747 no no cellular 23
## month duration campaign pdays previous poutcome y
## 1 10 79 1 -1 0 unknown no
## 2 may 220 1 339 4 failure no
## 3 apr 185 1 330 1 failure no
## 4 jun 199 4 -1 0 unknown no
## 5 may 226 1 -1 0 unknown no
## 6 feb 141 2 176 3 failure no
str(bank)
## 'data.frame': 4521 obs. of 17 variables:
## $ Age : int 30 33 35 30 59 35 36 39 41 43 ...
## $ job : chr "unemployed" "services" "management" "management" ...
## $ marital : chr "married" "married" "single" "married" ...
## $ education: chr "primary" "secondary" "tertiary" "tertier" ...
## $ default : chr "no" "no" "no" "no" ...
## $ balance : int 1787 4789 1350 1476 0 747 307 147 221 -88 ...
## $ housing : chr "no" "yes" "yes" "yes" ...
## $ loan : chr "no" "yes" "no" "yes" ...
## $ contact : chr "cellular" "cellular" "cellular" "unknown" ...
## $ day : int 19 11 16 3 5 23 14 6 14 17 ...
## $ month : chr "10" "may" "apr" "jun" ...
## $ duration : int 79 220 185 199 226 141 341 151 57 313 ...
## $ campaign : int 1 1 1 4 1 2 1 2 2 1 ...
## $ pdays : int -1 339 330 -1 -1 176 330 -1 -1 147 ...
## $ previous : int 0 4 1 0 0 3 2 0 0 2 ...
## $ poutcome : chr "unknown" "failure" "failure" "unknown" ...
## $ y : chr "no" "no" "no" "no" ...
# ubah type data
bank[,c("job","marital","education","default","housing","loan","contact", "day", "month", "poutcome","y")] <- lapply(bank[,c("job","marital","education","default","housing","loan","contact", "day", "month", "poutcome","y")], FUN = as.factor )
str(bank)
## 'data.frame': 4521 obs. of 17 variables:
## $ Age : int 30 33 35 30 59 35 36 39 41 43 ...
## $ job : Factor w/ 12 levels "admin.","blue-collar",..: 11 8 5 5 2 5 7 10 3 8 ...
## $ marital : Factor w/ 4 levels "divorced","married",..: 2 2 4 2 2 4 2 2 2 2 ...
## $ education: Factor w/ 6 levels "primary","secondary",..: 1 2 4 5 2 4 4 2 4 1 ...
## $ default : Factor w/ 2 levels "no","yes": 1 1 1 1 1 1 1 1 1 1 ...
## $ balance : int 1787 4789 1350 1476 0 747 307 147 221 -88 ...
## $ housing : Factor w/ 3 levels "no","tidak","yes": 1 3 3 3 3 1 3 3 3 3 ...
## $ loan : Factor w/ 3 levels "no","tidak","yes": 1 3 1 3 1 1 1 1 1 3 ...
## $ contact : Factor w/ 4 levels "cellular","seluler",..: 1 1 1 4 4 1 1 1 4 1 ...
## $ day : Factor w/ 31 levels "1","2","3","4",..: 19 11 16 3 5 23 14 6 14 17 ...
## $ month : Factor w/ 14 levels "10","7","apr",..: 1 11 3 9 11 6 11 11 11 3 ...
## $ duration : int 79 220 185 199 226 141 341 151 57 313 ...
## $ campaign : int 1 1 1 4 1 2 1 2 2 1 ...
## $ pdays : int -1 339 330 -1 -1 176 330 -1 -1 147 ...
## $ previous : int 0 4 1 0 0 3 2 0 0 2 ...
## $ poutcome : Factor w/ 4 levels "failure","other",..: 4 1 1 4 4 1 2 4 4 1 ...
## $ y : Factor w/ 3 levels "iya","no","yes": 2 2 2 2 2 2 2 2 2 2 ...
# Menghitung jumlah NA
colSums(is.na(bank))
## Age job marital education default balance housing loan
## 0 0 0 0 0 0 0 0
## contact day month duration campaign pdays previous poutcome
## 0 0 0 0 0 0 0 0
## y
## 0
# Imputasi menggunakan metode median untuk numerik dan modus untuk kategori
bank <- bank %>%
mutate(across(where(is.numeric), ~ ifelse(is.na(.), median(., na.rm=TRUE), .))) %>%
mutate(across(where(is.character), ~ ifelse(is.na(.), names(which.max(table(.))), .)))
# Menangani inkonsistensi pada atribut marital
bank$marital <- recode(bank$marital, "single" = "Single", "married" = "Married", "divorced" = "Divorced", "menikah" = "Married")
# Memperbaiki data tidak konsisten pada education
bank$education <- recode(bank$education, "tertier" = "tertiary", "sekunder" = "secondary")
# Menangani inkonsistensi pada atribut housing
bank$housing <- recode(bank$housing, "yes" = "Yes", "no" = "No", "tidak" = "No")
# Menangani inkonsistensi pada atribut loan
bank$loan <- recode(bank$loan, "yes" = "Yes", "no" = "No", "tidak" = "No")
# Menangani inkonsistensi pada atribut contact
bank$contact <- recode(bank$contact, "cellular" = "Cellular", "telephone" = "Telephone", "unknown" = "Unknown", "seluler" = "Cellular")
# Menangani inkonsistensi pada atribut month
bank$month <- as.character(bank$month)
bank$month <- recode(bank$month,
"1" = "January", "2" = "February", "3" = "March", "4" = "April", "5" = "May", "6" = "June",
"7" = "July", "8" = "August", "9" = "September", "10" = "October", "11" = "November",
"12" = "December", "13" = "January", "14" = "February","jan" = "January", "feb" = "February", "mar" = "March",
"apr" = "April", "may" = "May", "jun" = "June", "jul" = "July", "aug" = "August", "sep" = "September",
"oct" = "October", "nov" = "November", "dec" = "December")
# Menangani inkonsistensi pada atribut y
bank$y <- recode(bank$y, "yes" = "Yes", "no" = "No", "iya" = "Yes")
# Cek nilai yang telah dibersihkan
head(bank[, c("marital", "education", "housing", "loan", "contact", "month", "pdays", "y")])
## marital education housing loan contact month pdays y
## 1 Married primary No No Cellular October -1 No
## 2 Married secondary Yes Yes Cellular May 339 No
## 3 Single tertiary Yes No Cellular April 330 No
## 4 Married tertiary Yes Yes Unknown June -1 No
## 5 Married secondary Yes No Unknown May -1 No
## 6 Single tertiary No No Cellular February 176 No
# Memeriksa nilai unik dari kolom kategori untuk melihat anomali lain
unique(bank$job)
## [1] unemployed services management blue-collar self-employed
## [6] technician entrepreneur admin. student housemaid
## [11] retired unknown
## 12 Levels: admin. blue-collar entrepreneur housemaid management ... unknown
unique(bank$marital)
## [1] Married Single Divorced
## Levels: Divorced Married Single
unique(bank$education)
## [1] primary secondary tertiary unknown
## Levels: primary secondary tertiary unknown
unique(bank$contact)
## [1] Cellular Unknown Telephone
## Levels: Cellular Telephone Unknown
unique(bank$poutcome)
## [1] unknown failure other success
## Levels: failure other success unknown
unique(bank$month)
## [1] "October" "May" "April" "June" "February" "August"
## [7] "January" "July" "November" "September" "March" "December"
# Menampilkan jumlah kategori "unknown" di setiap kolom
unknown_counts <- sapply(bank, function(x) sum(x == "unknown"))
print(unknown_counts)
## Age job marital education default balance housing loan
## 0 38 0 187 0 0 0 0
## contact day month duration campaign pdays previous poutcome
## 0 0 0 0 0 0 0 3705
## y
## 0
# Menentukan batasan 5% dari total record
threshold <- 0.05 * nrow(bank) # 5% dari total record (4521) = 226
# Menentukan kolom mana saja yang bisa dihapus berdasarkan ambang batas
columns_to_clean <- names(unknown_counts[unknown_counts < threshold])
# Hapus baris dengan "unknown" hanya jika kolomnya berada dalam daftar columns_to_clean
# Cek kembali jumlah data setelah pembersihan
print(paste("Jumlah record setelah pembersihan:", nrow(bank)))
## [1] "Jumlah record setelah pembersihan: 4521"
# ubah type data
bank[,c("job","marital","education","default","housing","loan","contact", "day", "month", "poutcome","y")] <- lapply(bank[,c("job","marital","education","default","housing","loan","contact", "day", "month", "poutcome","y")], FUN = as.factor )
summary(bank)
## Age job marital education default
## Min. :19.00 management :969 Divorced: 528 primary : 678 no :4445
## 1st Qu.:33.00 blue-collar:946 Married :2797 secondary:2306 yes: 76
## Median :39.00 technician :768 Single :1196 tertiary :1350
## Mean :41.17 admin. :478 unknown : 187
## 3rd Qu.:49.00 services :417
## Max. :87.00 retired :230
## (Other) :713
## balance housing loan contact day
## Min. :-3313 No :1962 No :3830 Cellular :2896 20 : 257
## 1st Qu.: 69 Yes:2559 Yes: 691 Telephone: 301 18 : 226
## Median : 444 Unknown :1324 19 : 201
## Mean : 1423 21 : 198
## 3rd Qu.: 1480 14 : 195
## Max. :71188 17 : 191
## (Other):3253
## month duration campaign pdays
## May :1398 Min. : 4 Min. : 1.000 Min. : -1.00
## July : 706 1st Qu.: 104 1st Qu.: 1.000 1st Qu.: -1.00
## August : 633 Median : 185 Median : 2.000 Median : -1.00
## June : 531 Mean : 264 Mean : 2.794 Mean : 39.77
## November: 389 3rd Qu.: 329 3rd Qu.: 3.000 3rd Qu.: -1.00
## April : 293 Max. :3025 Max. :50.000 Max. :871.00
## (Other) : 571
## previous poutcome y
## Min. : 0.0000 failure: 490 Yes: 521
## 1st Qu.: 0.0000 other : 197 No :4000
## Median : 0.0000 success: 129
## Mean : 0.5426 unknown:3705
## 3rd Qu.: 0.0000
## Max. :25.0000
##
# Deteksi kolom numerik
numeric_cols <- bank %>% select(where(is.numeric))
# Loop untuk buat boxplot tiap kolom numeric
par(mfrow=c(3,3)) # Atur layout biar rapih 3x3
for (colname in names(numeric_cols)) {
boxplot(na.omit(numeric_cols[[colname]]),
main = paste("Boxplot", colname),
col = "lightblue")
}
par(mfrow=c(1,1)) # Balikin layout ke normal
Berdasarkan gambar diatas:
# Hapus data jika balance > 40000, duration > 2300, campaign > 30, atau pdays > 600
bank <- bank %>% filter(balance <= 40000, duration <= 2300, campaign <= 30, pdays <= 600)
# Cek jumlah data setelah penghapusan
print(nrow(bank))
## [1] 4504
numeric_vars <- bank %>% select(where(is.numeric))
cor_matrix <- cor(numeric_vars)
corrplot(cor_matrix, method="circle")
cor(bank$pdays, bank$previous, use = "complete.obs")
## [1] 0.5824258
tidak ada yang memiliki korelasi yang kuat antar variabel sehingga tidak perlu reduksi antar variabel
pca_result <- PCA(numeric_vars, graph=FALSE)
plot(pca_result, choix="var")
df_pca <- as.data.frame(pca_result$ind$coord)
Jika diperlukan reduksi, bisa dibagi menjadi 3.
Jika ingin mempertahankan distribusi asli untuk model numerik
# Normalisasi balance menggunakan Z-score
bank$balance_z <- scale(bank$balance)
# Cek hasil statistik setelah normalisasi
summary(bank$balance_z)
## V1
## Min. :-1.7051
## 1st Qu.:-0.4818
## Median :-0.3458
## Mean : 0.0000
## 3rd Qu.: 0.0280
## Max. : 9.5280
Data variabel ‘balance’ untuk menghindari outlier dapat dibagi juga menjadi 4 kategori “Negatif”: < 0 (Saldo negatif), “Rendah”: 0 - 1000, “Menengah”: 1000 - 5000, dan “Tinggi”: > 5000
# Diskretisasi balance ke dalam 4 kategori
bank$balance_group <- cut(bank$balance,
breaks = c(-Inf, 0, 1000, 5000, Inf),
labels = c("Negatif", "Rendah", "Menengah", "Tinggi"))
# Cek hasil diskretisasi
table(bank$balance_group)
##
## Negatif Rendah Menengah Tinggi
## 721 2308 1168 307
Data Variabel ‘age’ dibagi menjadi 3 kategori berdasarkan kelompok umur Muda: 18–34 tahun, Dewasa: 35–54 tahun, dan Lansia: 55+ tahun.
# Diskretisasi Data Usia (Age)
bank$age_group <- cut(bank$Age,
breaks = c(18, 34, 54, Inf),
labels = c("Muda", "Dewasa", "Lansia"),
right = TRUE)
# Cek hasil diskretisasi
table(bank$age_group)
##
## Muda Dewasa Lansia
## 1470 2452 582
Data Variabel ‘duration’ dibagi menjadi 3 kategori: Pendek: <100 detik, Sedang: 100–300 detik, dan Lama: >300 detik
# Diskretisasi duration
bank$duration_group <- cut(bank$duration,
breaks = c(-Inf, 100, 300, Inf),
labels = c("Pendek", "Sedang", "Lama"))
# Cek hasil diskretisasi
table(bank$duration_group)
##
## Pendek Sedang Lama
## 1064 2183 1257
Data Variabel ‘campaign’ dibagi menjadi 4 kategori beikut: “1”, “1-3”, “3-6”, “>6”
# Diskretisasi campaign
bank$campaign_group <- cut(bank$campaign,
breaks = c(-Inf, 1, 3, 6, Inf),
labels = c("1", "1-3",
"3-6", ">6"))
# Cek hasil diskretisasi
table(bank$campaign_group)
##
## 1 1-3 3-6 >6
## 1727 1818 646 313
Data Variabel ‘pdays’ dibagi menjadi 3 grup: “Tidak dihubungi” (pdays = -1), “1 tahun” (pdays = 1–365), dan “Lebih dari 1 tahun” (pdays > 366)
# Ubah -1 menjadi kategori baru
bank$pdays_group <- cut(bank$pdays,
breaks = c(-2, 0, 365, Inf),
labels = c("Tidak dihubungi", "1 tahun", "Lebih dari 1 tahun"),
right = TRUE)
# Cek hasil diskretisasi
table(bank$pdays_group)
##
## Tidak dihubungi 1 tahun Lebih dari 1 tahun
## 3695 750 59
Data Variabel ‘previous’ dibagi menjadi 3 kategori yaitu Tidak Pernah: o kali, Pernah: 1 kali, dan sering: lebih dari 1 kali.
# Diskretisasi previous
bank$previous_group <- cut(bank$previous,
breaks = c(-Inf, 0, 1, Inf),
labels = c("Tidak Pernah", "Pernah", "sering"))
# Cek hasil diskretisasi
table(bank$previous_group)
##
## Tidak Pernah Pernah sering
## 3695 284 525
# simpan
write.csv(bank, "bank_latih_cleaned.csv", row.names=FALSE)