Data Processing

1. Input Data

# Impor Library
library(dplyr)

## 
## Attaching package: 'dplyr'

## The following objects are masked from 'package:stats':
## 
##     filter, lag

## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union

library(corrplot)

## Warning: package 'corrplot' was built under R version 4.3.3

## corrplot 0.92 loaded

library(FactoMineR)

## Warning: package 'FactoMineR' was built under R version 4.3.3

# Set direktori kerja (opsional, ganti dengan path folder yang sesuai)
setwd("C:/Users/asus/Documents/1 - Dok - Back Up/Faruq/STIS/6 - Data Mining/P2")

# Impor data CSV ke dalam dataframe
bank <- read.csv("bank latih.csv", TRUE, ";")

2. Data Inspection

head(bank)

##   Age         job marital education default balance housing loan  contact day
## 1  30  unemployed married   primary      no    1787      no   no cellular  19
## 2  33    services married secondary      no    4789     yes  yes cellular  11
## 3  35  management  single  tertiary      no    1350     yes   no cellular  16
## 4  30  management married   tertier      no    1476     yes  yes  unknown   3
## 5  59 blue-collar married secondary      no       0     yes   no  unknown   5
## 6  35  management  single  tertiary      no     747      no   no cellular  23
##   month duration campaign pdays previous poutcome  y
## 1    10       79        1    -1        0  unknown no
## 2   may      220        1   339        4  failure no
## 3   apr      185        1   330        1  failure no
## 4   jun      199        4    -1        0  unknown no
## 5   may      226        1    -1        0  unknown no
## 6   feb      141        2   176        3  failure no

str(bank)

## 'data.frame':    4521 obs. of  17 variables:
##  $ Age      : int  30 33 35 30 59 35 36 39 41 43 ...
##  $ job      : chr  "unemployed" "services" "management" "management" ...
##  $ marital  : chr  "married" "married" "single" "married" ...
##  $ education: chr  "primary" "secondary" "tertiary" "tertier" ...
##  $ default  : chr  "no" "no" "no" "no" ...
##  $ balance  : int  1787 4789 1350 1476 0 747 307 147 221 -88 ...
##  $ housing  : chr  "no" "yes" "yes" "yes" ...
##  $ loan     : chr  "no" "yes" "no" "yes" ...
##  $ contact  : chr  "cellular" "cellular" "cellular" "unknown" ...
##  $ day      : int  19 11 16 3 5 23 14 6 14 17 ...
##  $ month    : chr  "10" "may" "apr" "jun" ...
##  $ duration : int  79 220 185 199 226 141 341 151 57 313 ...
##  $ campaign : int  1 1 1 4 1 2 1 2 2 1 ...
##  $ pdays    : int  -1 339 330 -1 -1 176 330 -1 -1 147 ...
##  $ previous : int  0 4 1 0 0 3 2 0 0 2 ...
##  $ poutcome : chr  "unknown" "failure" "failure" "unknown" ...
##  $ y        : chr  "no" "no" "no" "no" ...

# ubah type data
bank[,c("job","marital","education","default","housing","loan","contact", "day", "month", "poutcome","y")] <- lapply(bank[,c("job","marital","education","default","housing","loan","contact", "day", "month", "poutcome","y")], FUN = as.factor )

str(bank)

## 'data.frame':    4521 obs. of  17 variables:
##  $ Age      : int  30 33 35 30 59 35 36 39 41 43 ...
##  $ job      : Factor w/ 12 levels "admin.","blue-collar",..: 11 8 5 5 2 5 7 10 3 8 ...
##  $ marital  : Factor w/ 4 levels "divorced","married",..: 2 2 4 2 2 4 2 2 2 2 ...
##  $ education: Factor w/ 6 levels "primary","secondary",..: 1 2 4 5 2 4 4 2 4 1 ...
##  $ default  : Factor w/ 2 levels "no","yes": 1 1 1 1 1 1 1 1 1 1 ...
##  $ balance  : int  1787 4789 1350 1476 0 747 307 147 221 -88 ...
##  $ housing  : Factor w/ 3 levels "no","tidak","yes": 1 3 3 3 3 1 3 3 3 3 ...
##  $ loan     : Factor w/ 3 levels "no","tidak","yes": 1 3 1 3 1 1 1 1 1 3 ...
##  $ contact  : Factor w/ 4 levels "cellular","seluler",..: 1 1 1 4 4 1 1 1 4 1 ...
##  $ day      : Factor w/ 31 levels "1","2","3","4",..: 19 11 16 3 5 23 14 6 14 17 ...
##  $ month    : Factor w/ 14 levels "10","7","apr",..: 1 11 3 9 11 6 11 11 11 3 ...
##  $ duration : int  79 220 185 199 226 141 341 151 57 313 ...
##  $ campaign : int  1 1 1 4 1 2 1 2 2 1 ...
##  $ pdays    : int  -1 339 330 -1 -1 176 330 -1 -1 147 ...
##  $ previous : int  0 4 1 0 0 3 2 0 0 2 ...
##  $ poutcome : Factor w/ 4 levels "failure","other",..: 4 1 1 4 4 1 2 4 4 1 ...
##  $ y        : Factor w/ 3 levels "iya","no","yes": 2 2 2 2 2 2 2 2 2 2 ...

3. Data Cleaning

3.1. Handling Missing Values

# Menghitung jumlah NA
colSums(is.na(bank))

##       Age       job   marital education   default   balance   housing      loan 
##         0         0         0         0         0         0         0         0 
##   contact       day     month  duration  campaign     pdays  previous  poutcome 
##         0         0         0         0         0         0         0         0 
##         y 
##         0

# Imputasi menggunakan metode median untuk numerik dan modus untuk kategori
bank <- bank %>%
  mutate(across(where(is.numeric), ~ ifelse(is.na(.), median(., na.rm=TRUE), .))) %>%
  mutate(across(where(is.character), ~ ifelse(is.na(.), names(which.max(table(.))), .)))

3.2. Handling Inconsistent Data

# Menangani inkonsistensi pada atribut marital
bank$marital <- recode(bank$marital, "single" = "Single", "married" = "Married", "divorced" = "Divorced", "menikah" = "Married")

# Memperbaiki data tidak konsisten pada education
bank$education <- recode(bank$education, "tertier" = "tertiary", "sekunder" = "secondary")

# Menangani inkonsistensi pada atribut housing
bank$housing <- recode(bank$housing, "yes" = "Yes", "no" = "No", "tidak" = "No")

# Menangani inkonsistensi pada atribut loan
bank$loan <- recode(bank$loan, "yes" = "Yes", "no" = "No", "tidak" = "No")

# Menangani inkonsistensi pada atribut contact
bank$contact <- recode(bank$contact, "cellular" = "Cellular", "telephone" = "Telephone", "unknown" = "Unknown", "seluler" = "Cellular")

# Menangani inkonsistensi pada atribut month
bank$month <- as.character(bank$month)
bank$month <- recode(bank$month, 
                     "1" = "January", "2" = "February", "3" = "March", "4" = "April", "5" = "May", "6" = "June", 
                     "7" = "July", "8" = "August", "9" = "September", "10" = "October", "11" = "November", 
                     "12" = "December", "13" = "January", "14" = "February","jan" = "January", "feb" = "February", "mar" = "March", 
                     "apr" = "April", "may" = "May", "jun" = "June", "jul" = "July", "aug" = "August", "sep" = "September", 
                     "oct" = "October", "nov" = "November", "dec" = "December")

# Menangani inkonsistensi pada atribut y
bank$y <- recode(bank$y, "yes" = "Yes", "no" = "No", "iya" = "Yes")



# Cek nilai yang telah dibersihkan
head(bank[, c("marital", "education", "housing", "loan", "contact", "month", "pdays", "y")])

##   marital education housing loan  contact    month pdays  y
## 1 Married   primary      No   No Cellular  October    -1 No
## 2 Married secondary     Yes  Yes Cellular      May   339 No
## 3  Single  tertiary     Yes   No Cellular    April   330 No
## 4 Married  tertiary     Yes  Yes  Unknown     June    -1 No
## 5 Married secondary     Yes   No  Unknown      May    -1 No
## 6  Single  tertiary      No   No Cellular February   176 No

3.3. Deteksi dan Koreksi Annomali

# Memeriksa nilai unik dari kolom kategori untuk melihat anomali lain
unique(bank$job)

##  [1] unemployed    services      management    blue-collar   self-employed
##  [6] technician    entrepreneur  admin.        student       housemaid    
## [11] retired       unknown      
## 12 Levels: admin. blue-collar entrepreneur housemaid management ... unknown

unique(bank$marital)

## [1] Married  Single   Divorced
## Levels: Divorced Married Single

unique(bank$education)

## [1] primary   secondary tertiary  unknown  
## Levels: primary secondary tertiary unknown

unique(bank$contact)

## [1] Cellular  Unknown   Telephone
## Levels: Cellular Telephone Unknown

unique(bank$poutcome)

## [1] unknown failure other   success
## Levels: failure other success unknown

unique(bank$month)

##  [1] "October"   "May"       "April"     "June"      "February"  "August"   
##  [7] "January"   "July"      "November"  "September" "March"     "December"

# Menampilkan jumlah kategori "unknown" di setiap kolom
unknown_counts <- sapply(bank, function(x) sum(x == "unknown"))
print(unknown_counts)

##       Age       job   marital education   default   balance   housing      loan 
##         0        38         0       187         0         0         0         0 
##   contact       day     month  duration  campaign     pdays  previous  poutcome 
##         0         0         0         0         0         0         0      3705 
##         y 
##         0

# Menentukan batasan 5% dari total record
threshold <- 0.05 * nrow(bank)  # 5% dari total record (4521) = 226

# Menentukan kolom mana saja yang bisa dihapus berdasarkan ambang batas
columns_to_clean <- names(unknown_counts[unknown_counts < threshold])

# Hapus baris dengan "unknown" hanya jika kolomnya berada dalam daftar columns_to_clean


# Cek kembali jumlah data setelah pembersihan
print(paste("Jumlah record setelah pembersihan:", nrow(bank)))

## [1] "Jumlah record setelah pembersihan: 4521"

3.4. Noisy Data (Outlier Detection)

# ubah type data
bank[,c("job","marital","education","default","housing","loan","contact", "day", "month", "poutcome","y")] <- lapply(bank[,c("job","marital","education","default","housing","loan","contact", "day", "month", "poutcome","y")], FUN = as.factor )

summary(bank)

##       Age                 job          marital         education    default   
##  Min.   :19.00   management :969   Divorced: 528   primary  : 678   no :4445  
##  1st Qu.:33.00   blue-collar:946   Married :2797   secondary:2306   yes:  76  
##  Median :39.00   technician :768   Single  :1196   tertiary :1350             
##  Mean   :41.17   admin.     :478                   unknown  : 187             
##  3rd Qu.:49.00   services   :417                                              
##  Max.   :87.00   retired    :230                                              
##                  (Other)    :713                                              
##     balance      housing     loan           contact          day      
##  Min.   :-3313   No :1962   No :3830   Cellular :2896   20     : 257  
##  1st Qu.:   69   Yes:2559   Yes: 691   Telephone: 301   18     : 226  
##  Median :  444                         Unknown  :1324   19     : 201  
##  Mean   : 1423                                          21     : 198  
##  3rd Qu.: 1480                                          14     : 195  
##  Max.   :71188                                          17     : 191  
##                                                         (Other):3253  
##       month         duration       campaign          pdays       
##  May     :1398   Min.   :   4   Min.   : 1.000   Min.   : -1.00  
##  July    : 706   1st Qu.: 104   1st Qu.: 1.000   1st Qu.: -1.00  
##  August  : 633   Median : 185   Median : 2.000   Median : -1.00  
##  June    : 531   Mean   : 264   Mean   : 2.794   Mean   : 39.77  
##  November: 389   3rd Qu.: 329   3rd Qu.: 3.000   3rd Qu.: -1.00  
##  April   : 293   Max.   :3025   Max.   :50.000   Max.   :871.00  
##  (Other) : 571                                                   
##     previous          poutcome      y       
##  Min.   : 0.0000   failure: 490   Yes: 521  
##  1st Qu.: 0.0000   other  : 197   No :4000  
##  Median : 0.0000   success: 129             
##  Mean   : 0.5426   unknown:3705             
##  3rd Qu.: 0.0000                            
##  Max.   :25.0000                            
##

# Deteksi kolom numerik
numeric_cols <- bank %>% select(where(is.numeric))

# Loop untuk buat boxplot tiap kolom numeric
par(mfrow=c(3,3)) # Atur layout biar rapih 3x3
for (colname in names(numeric_cols)) {
  boxplot(na.omit(numeric_cols[[colname]]), 
          main = paste("Boxplot", colname), 
          col = "lightblue")
}
par(mfrow=c(1,1)) # Balikin layout ke normal

Berdasarkan gambar diatas:

‘age’ lebih cocok untuk dilakukan diskretisasi. Dikonversi menjadi kategori: Muda (18–34), Dewasa (35–54), dan Lansia (55+)
’balance’memiliki nilai negatif, yang menunjukkan kemungkinan saldo negatif di rekening. Distribusi saldo cenderung skewed (tidak normal) karena ada orang dengan saldo yang sangat besar atau sangat kecil.
‘day’ menunjukkan tanggal dalam bulan (1–31). Data ini sudah dalam format numerik dan tidak perlu diubah.
‘duration’ adalah lama panggilan (detik) yang mungkin memiliki distribusi yang menceng.
‘campaign’ menunjukkan berapa kali nasabah dihubungi selama kampanye pemasaran. Distribusi sebagian besar nasabah hanya dihubungi 1–3 kali, tapi ada yang dihubungi lebih dari 10 kali.
‘pdays’ menunjukkan jumlah hari sejak nasabah terakhir dihubungi. Jika bernilai -1, artinya nasabah belum pernah dihubungi sebelumnya. Sebagian besar nilai pdays adalah -1, sehingga bisa menyebabkan bias.
‘previous’ menunjukkan jumlah interaksi sebelumnya dengan nasabah. Banyak nasabah yang tidak memiliki interaksi sebelumnya (previous = 0). Bisa bervariasi dari 0 hingga beberapa kali.

# Hapus data jika balance > 40000, duration > 2300, campaign > 30, atau pdays > 600
bank <- bank %>% filter(balance <= 40000, duration <= 2300, campaign <= 30, pdays <= 600)

# Cek jumlah data setelah penghapusan
print(nrow(bank))

## [1] 4504

4. Data Reduction

4.1. Korelasi antar variabel

numeric_vars <- bank %>% select(where(is.numeric))
cor_matrix <- cor(numeric_vars)
corrplot(cor_matrix, method="circle")

cor(bank$pdays, bank$previous, use = "complete.obs")

## [1] 0.5824258

tidak ada yang memiliki korelasi yang kuat antar variabel sehingga tidak perlu reduksi antar variabel

4.2. Principal Component Analysis (PCA)

pca_result <- PCA(numeric_vars, graph=FALSE)
plot(pca_result, choix="var")

df_pca <- as.data.frame(pca_result$ind$coord)

Jika diperlukan reduksi, bisa dibagi menjadi 3.

5. Data Transformation

5.1 Data Transformation by Normalization

Jika ingin mempertahankan distribusi asli untuk model numerik

# Normalisasi balance menggunakan Z-score
bank$balance_z <- scale(bank$balance)

# Cek hasil statistik setelah normalisasi
summary(bank$balance_z)

##        V1         
##  Min.   :-1.7051  
##  1st Qu.:-0.4818  
##  Median :-0.3458  
##  Mean   : 0.0000  
##  3rd Qu.: 0.0280  
##  Max.   : 9.5280

5.2. Data Transformation by Diskretisasi

Data variabel ‘balance’ untuk menghindari outlier dapat dibagi juga menjadi 4 kategori “Negatif”: < 0 (Saldo negatif), “Rendah”: 0 - 1000, “Menengah”: 1000 - 5000, dan “Tinggi”: > 5000

# Diskretisasi balance ke dalam 4 kategori
bank$balance_group <- cut(bank$balance, 
                          breaks = c(-Inf, 0, 1000, 5000, Inf), 
                          labels = c("Negatif", "Rendah", "Menengah", "Tinggi"))

# Cek hasil diskretisasi
table(bank$balance_group)

## 
##  Negatif   Rendah Menengah   Tinggi 
##      721     2308     1168      307

Data Variabel ‘age’ dibagi menjadi 3 kategori berdasarkan kelompok umur Muda: 18–34 tahun, Dewasa: 35–54 tahun, dan Lansia: 55+ tahun.

# Diskretisasi Data Usia (Age)
bank$age_group <- cut(bank$Age, 
                    breaks = c(18, 34, 54, Inf), 
                    labels = c("Muda", "Dewasa", "Lansia"), 
                    right = TRUE)

# Cek hasil diskretisasi
table(bank$age_group)

## 
##   Muda Dewasa Lansia 
##   1470   2452    582

Data Variabel ‘duration’ dibagi menjadi 3 kategori: Pendek: <100 detik, Sedang: 100–300 detik, dan Lama: >300 detik

# Diskretisasi duration
bank$duration_group <- cut(bank$duration, 
                         breaks = c(-Inf, 100, 300, Inf), 
                         labels = c("Pendek", "Sedang", "Lama"))

# Cek hasil diskretisasi
table(bank$duration_group)

## 
## Pendek Sedang   Lama 
##   1064   2183   1257

Data Variabel ‘campaign’ dibagi menjadi 4 kategori beikut: “1”, “1-3”, “3-6”, “>6”

# Diskretisasi campaign
bank$campaign_group <- cut(bank$campaign, 
                         breaks = c(-Inf, 1, 3, 6, Inf), 
                         labels = c("1", "1-3", 
                                    "3-6", ">6"))
# Cek hasil diskretisasi
table(bank$campaign_group)

## 
##    1  1-3  3-6   >6 
## 1727 1818  646  313

Data Variabel ‘pdays’ dibagi menjadi 3 grup: “Tidak dihubungi” (pdays = -1), “1 tahun” (pdays = 1–365), dan “Lebih dari 1 tahun” (pdays > 366)

# Ubah -1 menjadi kategori baru
bank$pdays_group <- cut(bank$pdays, 
                        breaks = c(-2, 0, 365, Inf), 
                        labels = c("Tidak dihubungi", "1 tahun", "Lebih dari 1 tahun"), 
                        right = TRUE)

# Cek hasil diskretisasi
table(bank$pdays_group)

## 
##    Tidak dihubungi            1 tahun Lebih dari 1 tahun 
##               3695                750                 59

Data Variabel ‘previous’ dibagi menjadi 3 kategori yaitu Tidak Pernah: o kali, Pernah: 1 kali, dan sering: lebih dari 1 kali.

# Diskretisasi previous
bank$previous_group <- cut(bank$previous, 
                         breaks = c(-Inf, 0, 1, Inf), 
                         labels = c("Tidak Pernah", "Pernah", "sering"))
# Cek hasil diskretisasi
table(bank$previous_group)

## 
## Tidak Pernah       Pernah       sering 
##         3695          284          525

6. Simpan Data

# simpan
write.csv(bank, "bank_latih_cleaned.csv", row.names=FALSE)

Data Mining – Data Processing: Bank Latih

Kelompok 4 3SD2 2025: Faruq, Irsan, Jernita

2025-03-10