Data Processing

1. Input Data

# Impor Library
library(dplyr)
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
library(corrplot)
## Warning: package 'corrplot' was built under R version 4.3.3
## corrplot 0.92 loaded
library(FactoMineR)
## Warning: package 'FactoMineR' was built under R version 4.3.3
# Set direktori kerja (opsional, ganti dengan path folder yang sesuai)
setwd("C:/Users/asus/Documents/1 - Dok - Back Up/Faruq/STIS/6 - Data Mining/P2")

# Impor data CSV ke dalam dataframe
bank <- read.csv("bank latih.csv", TRUE, ";")

2. Data Inspection

head(bank)
##   Age         job marital education default balance housing loan  contact day
## 1  30  unemployed married   primary      no    1787      no   no cellular  19
## 2  33    services married secondary      no    4789     yes  yes cellular  11
## 3  35  management  single  tertiary      no    1350     yes   no cellular  16
## 4  30  management married   tertier      no    1476     yes  yes  unknown   3
## 5  59 blue-collar married secondary      no       0     yes   no  unknown   5
## 6  35  management  single  tertiary      no     747      no   no cellular  23
##   month duration campaign pdays previous poutcome  y
## 1    10       79        1    -1        0  unknown no
## 2   may      220        1   339        4  failure no
## 3   apr      185        1   330        1  failure no
## 4   jun      199        4    -1        0  unknown no
## 5   may      226        1    -1        0  unknown no
## 6   feb      141        2   176        3  failure no
str(bank)
## 'data.frame':    4521 obs. of  17 variables:
##  $ Age      : int  30 33 35 30 59 35 36 39 41 43 ...
##  $ job      : chr  "unemployed" "services" "management" "management" ...
##  $ marital  : chr  "married" "married" "single" "married" ...
##  $ education: chr  "primary" "secondary" "tertiary" "tertier" ...
##  $ default  : chr  "no" "no" "no" "no" ...
##  $ balance  : int  1787 4789 1350 1476 0 747 307 147 221 -88 ...
##  $ housing  : chr  "no" "yes" "yes" "yes" ...
##  $ loan     : chr  "no" "yes" "no" "yes" ...
##  $ contact  : chr  "cellular" "cellular" "cellular" "unknown" ...
##  $ day      : int  19 11 16 3 5 23 14 6 14 17 ...
##  $ month    : chr  "10" "may" "apr" "jun" ...
##  $ duration : int  79 220 185 199 226 141 341 151 57 313 ...
##  $ campaign : int  1 1 1 4 1 2 1 2 2 1 ...
##  $ pdays    : int  -1 339 330 -1 -1 176 330 -1 -1 147 ...
##  $ previous : int  0 4 1 0 0 3 2 0 0 2 ...
##  $ poutcome : chr  "unknown" "failure" "failure" "unknown" ...
##  $ y        : chr  "no" "no" "no" "no" ...
# ubah type data
bank[,c("job","marital","education","default","housing","loan","contact", "day", "month", "poutcome","y")] <- lapply(bank[,c("job","marital","education","default","housing","loan","contact", "day", "month", "poutcome","y")], FUN = as.factor )

str(bank)
## 'data.frame':    4521 obs. of  17 variables:
##  $ Age      : int  30 33 35 30 59 35 36 39 41 43 ...
##  $ job      : Factor w/ 12 levels "admin.","blue-collar",..: 11 8 5 5 2 5 7 10 3 8 ...
##  $ marital  : Factor w/ 4 levels "divorced","married",..: 2 2 4 2 2 4 2 2 2 2 ...
##  $ education: Factor w/ 6 levels "primary","secondary",..: 1 2 4 5 2 4 4 2 4 1 ...
##  $ default  : Factor w/ 2 levels "no","yes": 1 1 1 1 1 1 1 1 1 1 ...
##  $ balance  : int  1787 4789 1350 1476 0 747 307 147 221 -88 ...
##  $ housing  : Factor w/ 3 levels "no","tidak","yes": 1 3 3 3 3 1 3 3 3 3 ...
##  $ loan     : Factor w/ 3 levels "no","tidak","yes": 1 3 1 3 1 1 1 1 1 3 ...
##  $ contact  : Factor w/ 4 levels "cellular","seluler",..: 1 1 1 4 4 1 1 1 4 1 ...
##  $ day      : Factor w/ 31 levels "1","2","3","4",..: 19 11 16 3 5 23 14 6 14 17 ...
##  $ month    : Factor w/ 14 levels "10","7","apr",..: 1 11 3 9 11 6 11 11 11 3 ...
##  $ duration : int  79 220 185 199 226 141 341 151 57 313 ...
##  $ campaign : int  1 1 1 4 1 2 1 2 2 1 ...
##  $ pdays    : int  -1 339 330 -1 -1 176 330 -1 -1 147 ...
##  $ previous : int  0 4 1 0 0 3 2 0 0 2 ...
##  $ poutcome : Factor w/ 4 levels "failure","other",..: 4 1 1 4 4 1 2 4 4 1 ...
##  $ y        : Factor w/ 3 levels "iya","no","yes": 2 2 2 2 2 2 2 2 2 2 ...

3. Data Cleaning

3.1. Handling Missing Values

# Menghitung jumlah NA
colSums(is.na(bank))
##       Age       job   marital education   default   balance   housing      loan 
##         0         0         0         0         0         0         0         0 
##   contact       day     month  duration  campaign     pdays  previous  poutcome 
##         0         0         0         0         0         0         0         0 
##         y 
##         0
# Imputasi menggunakan metode median untuk numerik dan modus untuk kategori
bank <- bank %>%
  mutate(across(where(is.numeric), ~ ifelse(is.na(.), median(., na.rm=TRUE), .))) %>%
  mutate(across(where(is.character), ~ ifelse(is.na(.), names(which.max(table(.))), .)))

3.2. Handling Inconsistent Data

# Menangani inkonsistensi pada atribut marital
bank$marital <- recode(bank$marital, "single" = "Single", "married" = "Married", "divorced" = "Divorced", "menikah" = "Married")

# Memperbaiki data tidak konsisten pada education
bank$education <- recode(bank$education, "tertier" = "tertiary", "sekunder" = "secondary")

# Menangani inkonsistensi pada atribut housing
bank$housing <- recode(bank$housing, "yes" = "Yes", "no" = "No", "tidak" = "No")

# Menangani inkonsistensi pada atribut loan
bank$loan <- recode(bank$loan, "yes" = "Yes", "no" = "No", "tidak" = "No")

# Menangani inkonsistensi pada atribut contact
bank$contact <- recode(bank$contact, "cellular" = "Cellular", "telephone" = "Telephone", "unknown" = "Unknown", "seluler" = "Cellular")

# Menangani inkonsistensi pada atribut month
bank$month <- as.character(bank$month)
bank$month <- recode(bank$month, 
                     "1" = "January", "2" = "February", "3" = "March", "4" = "April", "5" = "May", "6" = "June", 
                     "7" = "July", "8" = "August", "9" = "September", "10" = "October", "11" = "November", 
                     "12" = "December", "13" = "January", "14" = "February","jan" = "January", "feb" = "February", "mar" = "March", 
                     "apr" = "April", "may" = "May", "jun" = "June", "jul" = "July", "aug" = "August", "sep" = "September", 
                     "oct" = "October", "nov" = "November", "dec" = "December")

# Menangani inkonsistensi pada atribut y
bank$y <- recode(bank$y, "yes" = "Yes", "no" = "No", "iya" = "Yes")



# Cek nilai yang telah dibersihkan
head(bank[, c("marital", "education", "housing", "loan", "contact", "month", "pdays", "y")])
##   marital education housing loan  contact    month pdays  y
## 1 Married   primary      No   No Cellular  October    -1 No
## 2 Married secondary     Yes  Yes Cellular      May   339 No
## 3  Single  tertiary     Yes   No Cellular    April   330 No
## 4 Married  tertiary     Yes  Yes  Unknown     June    -1 No
## 5 Married secondary     Yes   No  Unknown      May    -1 No
## 6  Single  tertiary      No   No Cellular February   176 No

3.3. Deteksi dan Koreksi Annomali

# Memeriksa nilai unik dari kolom kategori untuk melihat anomali lain
unique(bank$job)
##  [1] unemployed    services      management    blue-collar   self-employed
##  [6] technician    entrepreneur  admin.        student       housemaid    
## [11] retired       unknown      
## 12 Levels: admin. blue-collar entrepreneur housemaid management ... unknown
unique(bank$marital)
## [1] Married  Single   Divorced
## Levels: Divorced Married Single
unique(bank$education)
## [1] primary   secondary tertiary  unknown  
## Levels: primary secondary tertiary unknown
unique(bank$contact)
## [1] Cellular  Unknown   Telephone
## Levels: Cellular Telephone Unknown
unique(bank$poutcome)
## [1] unknown failure other   success
## Levels: failure other success unknown
unique(bank$month)
##  [1] "October"   "May"       "April"     "June"      "February"  "August"   
##  [7] "January"   "July"      "November"  "September" "March"     "December"
# Menampilkan jumlah kategori "unknown" di setiap kolom
unknown_counts <- sapply(bank, function(x) sum(x == "unknown"))
print(unknown_counts)
##       Age       job   marital education   default   balance   housing      loan 
##         0        38         0       187         0         0         0         0 
##   contact       day     month  duration  campaign     pdays  previous  poutcome 
##         0         0         0         0         0         0         0      3705 
##         y 
##         0
# Menentukan batasan 5% dari total record
threshold <- 0.05 * nrow(bank)  # 5% dari total record (4521) = 226

# Menentukan kolom mana saja yang bisa dihapus berdasarkan ambang batas
columns_to_clean <- names(unknown_counts[unknown_counts < threshold])

# Hapus baris dengan "unknown" hanya jika kolomnya berada dalam daftar columns_to_clean


# Cek kembali jumlah data setelah pembersihan
print(paste("Jumlah record setelah pembersihan:", nrow(bank)))
## [1] "Jumlah record setelah pembersihan: 4521"

3.4. Noisy Data (Outlier Detection)

# ubah type data
bank[,c("job","marital","education","default","housing","loan","contact", "day", "month", "poutcome","y")] <- lapply(bank[,c("job","marital","education","default","housing","loan","contact", "day", "month", "poutcome","y")], FUN = as.factor )

summary(bank)
##       Age                 job          marital         education    default   
##  Min.   :19.00   management :969   Divorced: 528   primary  : 678   no :4445  
##  1st Qu.:33.00   blue-collar:946   Married :2797   secondary:2306   yes:  76  
##  Median :39.00   technician :768   Single  :1196   tertiary :1350             
##  Mean   :41.17   admin.     :478                   unknown  : 187             
##  3rd Qu.:49.00   services   :417                                              
##  Max.   :87.00   retired    :230                                              
##                  (Other)    :713                                              
##     balance      housing     loan           contact          day      
##  Min.   :-3313   No :1962   No :3830   Cellular :2896   20     : 257  
##  1st Qu.:   69   Yes:2559   Yes: 691   Telephone: 301   18     : 226  
##  Median :  444                         Unknown  :1324   19     : 201  
##  Mean   : 1423                                          21     : 198  
##  3rd Qu.: 1480                                          14     : 195  
##  Max.   :71188                                          17     : 191  
##                                                         (Other):3253  
##       month         duration       campaign          pdays       
##  May     :1398   Min.   :   4   Min.   : 1.000   Min.   : -1.00  
##  July    : 706   1st Qu.: 104   1st Qu.: 1.000   1st Qu.: -1.00  
##  August  : 633   Median : 185   Median : 2.000   Median : -1.00  
##  June    : 531   Mean   : 264   Mean   : 2.794   Mean   : 39.77  
##  November: 389   3rd Qu.: 329   3rd Qu.: 3.000   3rd Qu.: -1.00  
##  April   : 293   Max.   :3025   Max.   :50.000   Max.   :871.00  
##  (Other) : 571                                                   
##     previous          poutcome      y       
##  Min.   : 0.0000   failure: 490   Yes: 521  
##  1st Qu.: 0.0000   other  : 197   No :4000  
##  Median : 0.0000   success: 129             
##  Mean   : 0.5426   unknown:3705             
##  3rd Qu.: 0.0000                            
##  Max.   :25.0000                            
## 
# Deteksi kolom numerik
numeric_cols <- bank %>% select(where(is.numeric))

# Loop untuk buat boxplot tiap kolom numeric
par(mfrow=c(3,3)) # Atur layout biar rapih 3x3
for (colname in names(numeric_cols)) {
  boxplot(na.omit(numeric_cols[[colname]]), 
          main = paste("Boxplot", colname), 
          col = "lightblue")
}
par(mfrow=c(1,1)) # Balikin layout ke normal

Berdasarkan gambar diatas:

  • ‘age’ lebih cocok untuk dilakukan diskretisasi. Dikonversi menjadi kategori: Muda (18–34), Dewasa (35–54), dan Lansia (55+)
  • ’balance’memiliki nilai negatif, yang menunjukkan kemungkinan saldo negatif di rekening. Distribusi saldo cenderung skewed (tidak normal) karena ada orang dengan saldo yang sangat besar atau sangat kecil.
  • ‘day’ menunjukkan tanggal dalam bulan (1–31). Data ini sudah dalam format numerik dan tidak perlu diubah.
  • ‘duration’ adalah lama panggilan (detik) yang mungkin memiliki distribusi yang menceng.
  • ‘campaign’ menunjukkan berapa kali nasabah dihubungi selama kampanye pemasaran. Distribusi sebagian besar nasabah hanya dihubungi 1–3 kali, tapi ada yang dihubungi lebih dari 10 kali.
  • ‘pdays’ menunjukkan jumlah hari sejak nasabah terakhir dihubungi. Jika bernilai -1, artinya nasabah belum pernah dihubungi sebelumnya. Sebagian besar nilai pdays adalah -1, sehingga bisa menyebabkan bias.
  • ‘previous’ menunjukkan jumlah interaksi sebelumnya dengan nasabah. Banyak nasabah yang tidak memiliki interaksi sebelumnya (previous = 0). Bisa bervariasi dari 0 hingga beberapa kali.
# Hapus data jika balance > 40000, duration > 2300, campaign > 30, atau pdays > 600
bank <- bank %>% filter(balance <= 40000, duration <= 2300, campaign <= 30, pdays <= 600)

# Cek jumlah data setelah penghapusan
print(nrow(bank))
## [1] 4504

4. Data Reduction

4.1. Korelasi antar variabel

numeric_vars <- bank %>% select(where(is.numeric))
cor_matrix <- cor(numeric_vars)
corrplot(cor_matrix, method="circle")

cor(bank$pdays, bank$previous, use = "complete.obs")
## [1] 0.5824258

tidak ada yang memiliki korelasi yang kuat antar variabel sehingga tidak perlu reduksi antar variabel

4.2. Principal Component Analysis (PCA)

pca_result <- PCA(numeric_vars, graph=FALSE)
plot(pca_result, choix="var")

df_pca <- as.data.frame(pca_result$ind$coord)

Jika diperlukan reduksi, bisa dibagi menjadi 3.

5. Data Transformation

5.1 Data Transformation by Normalization

Jika ingin mempertahankan distribusi asli untuk model numerik

# Normalisasi balance menggunakan Z-score
bank$balance_z <- scale(bank$balance)

# Cek hasil statistik setelah normalisasi
summary(bank$balance_z)
##        V1         
##  Min.   :-1.7051  
##  1st Qu.:-0.4818  
##  Median :-0.3458  
##  Mean   : 0.0000  
##  3rd Qu.: 0.0280  
##  Max.   : 9.5280

5.2. Data Transformation by Diskretisasi

Data variabel ‘balance’ untuk menghindari outlier dapat dibagi juga menjadi 4 kategori “Negatif”: < 0 (Saldo negatif), “Rendah”: 0 - 1000, “Menengah”: 1000 - 5000, dan “Tinggi”: > 5000

# Diskretisasi balance ke dalam 4 kategori
bank$balance_group <- cut(bank$balance, 
                          breaks = c(-Inf, 0, 1000, 5000, Inf), 
                          labels = c("Negatif", "Rendah", "Menengah", "Tinggi"))

# Cek hasil diskretisasi
table(bank$balance_group)
## 
##  Negatif   Rendah Menengah   Tinggi 
##      721     2308     1168      307

Data Variabel ‘age’ dibagi menjadi 3 kategori berdasarkan kelompok umur Muda: 18–34 tahun, Dewasa: 35–54 tahun, dan Lansia: 55+ tahun.

# Diskretisasi Data Usia (Age)
bank$age_group <- cut(bank$Age, 
                    breaks = c(18, 34, 54, Inf), 
                    labels = c("Muda", "Dewasa", "Lansia"), 
                    right = TRUE)

# Cek hasil diskretisasi
table(bank$age_group)
## 
##   Muda Dewasa Lansia 
##   1470   2452    582

Data Variabel ‘duration’ dibagi menjadi 3 kategori: Pendek: <100 detik, Sedang: 100–300 detik, dan Lama: >300 detik

# Diskretisasi duration
bank$duration_group <- cut(bank$duration, 
                         breaks = c(-Inf, 100, 300, Inf), 
                         labels = c("Pendek", "Sedang", "Lama"))

# Cek hasil diskretisasi
table(bank$duration_group)
## 
## Pendek Sedang   Lama 
##   1064   2183   1257

Data Variabel ‘campaign’ dibagi menjadi 4 kategori beikut: “1”, “1-3”, “3-6”, “>6”

# Diskretisasi campaign
bank$campaign_group <- cut(bank$campaign, 
                         breaks = c(-Inf, 1, 3, 6, Inf), 
                         labels = c("1", "1-3", 
                                    "3-6", ">6"))
# Cek hasil diskretisasi
table(bank$campaign_group)
## 
##    1  1-3  3-6   >6 
## 1727 1818  646  313

Data Variabel ‘pdays’ dibagi menjadi 3 grup: “Tidak dihubungi” (pdays = -1), “1 tahun” (pdays = 1–365), dan “Lebih dari 1 tahun” (pdays > 366)

# Ubah -1 menjadi kategori baru
bank$pdays_group <- cut(bank$pdays, 
                        breaks = c(-2, 0, 365, Inf), 
                        labels = c("Tidak dihubungi", "1 tahun", "Lebih dari 1 tahun"), 
                        right = TRUE)

# Cek hasil diskretisasi
table(bank$pdays_group)
## 
##    Tidak dihubungi            1 tahun Lebih dari 1 tahun 
##               3695                750                 59

Data Variabel ‘previous’ dibagi menjadi 3 kategori yaitu Tidak Pernah: o kali, Pernah: 1 kali, dan sering: lebih dari 1 kali.

# Diskretisasi previous
bank$previous_group <- cut(bank$previous, 
                         breaks = c(-Inf, 0, 1, Inf), 
                         labels = c("Tidak Pernah", "Pernah", "sering"))
# Cek hasil diskretisasi
table(bank$previous_group)
## 
## Tidak Pernah       Pernah       sering 
##         3695          284          525

6. Simpan Data

# simpan
write.csv(bank, "bank_latih_cleaned.csv", row.names=FALSE)