# Inspect the distinct labels of the target column before recoding it.
unique(data$y)
## [1] "no"  "yes"
# Recode the target as numeric: 1 where y equals "yes", 0 otherwise.
data$y <- as.numeric(data$y == "yes")

# Check the result
table(data$y)
## 
##     0     1 
## 39922  5289

# Missing values: count the NA entries in every column.

vapply(data, function(column) sum(is.na(column)), numeric(1))
##       age       job   marital education   default   balance   housing      loan 
##         0         0         0         0         0         0         0         0 
##   contact       day     month  duration  campaign     pdays  previous  poutcome 
##         0         0         0         0         0         0         0         0 
##         y 
##         0

# Duplicate rows: report how many rows repeat an earlier row, then keep
# only the first occurrence of each distinct row.

sum(duplicated(data))
## [1] 0
data <- unique(data)

# Encode categorical variables: convert each listed column to a factor.

factor_cols <- c(
  "job", "marital", "education", "default", "housing",
  "loan", "contact", "month", "poutcome"
)
data[factor_cols] <- lapply(data[factor_cols], as.factor)

# Outliers: drop rows whose balance lies outside the 1.5 * IQR fences,
# drawing a boxplot of balance before and after the filter.

boxplot(data[["balance"]], main = "Boxplot Balance (Sebelum)")

# First and third quartiles of balance, and the distance between them.
# Note: the object `IQR` shares its name with stats::IQR(); calls such as
# IQR(x) still resolve to the function because R skips non-function
# objects when looking up a function name.
Q1 <- quantile(data[["balance"]], probs = 0.25)
Q3 <- quantile(data[["balance"]], probs = 0.75)
IQR <- Q3 - Q1

# Keep a row unless its balance is below the lower fence or above the
# upper fence.
data <- data[!(data[["balance"]] < (Q1 - 1.5 * IQR) |
                 data[["balance"]] > (Q3 + 1.5 * IQR)), ]

boxplot(data[["balance"]], main = "Boxplot Balance (Sesudah)")

# Transformation: add log-transformed copies of balance and duration.

# Smallest balance in the data; subtracting it shifts the column so that
# its minimum becomes 0 and the argument of log() is at least 1.
min_balance <- min(data[["balance"]])

data[["balance_log"]] <- log(data[["balance"]] - min_balance + 1)
data[["duration_log"]] <- log(data[["duration"]] + 1)

# Check the result
summary(data[["balance_log"]])
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##   0.000   7.596   7.738   7.810   7.981   8.595

# Descriptive statistics: print a per-column summary of the whole data
# frame (numeric columns get quantiles and mean, factor columns get counts).

summary(data)
##       age                 job           marital          education    
##  Min.   :18.00   blue-collar:8951   divorced: 4750   primary  : 6163  
##  1st Qu.:32.00   management :8137   married :24198   secondary:21208  
##  Median :39.00   technician :6853   single  :11534   tertiary :11476  
##  Mean   :40.63   admin.     :4726                    unknown  : 1635  
##  3rd Qu.:48.00   services   :3843                                     
##  Max.   :95.00   retired    :1914                                     
##                  (Other)    :6058                                     
##  default        balance        housing      loan            contact     
##  no :39679   Min.   :-1944.0   no :17627   no :33632   cellular :26140  
##  yes:  803   1st Qu.:   46.0   yes:22855   yes: 6850   telephone: 2498  
##              Median :  349.0                           unknown  :11844  
##              Mean   :  640.6                                            
##              3rd Qu.:  980.8                                            
##              Max.   : 3462.0                                            
##                                                                         
##       day            month          duration         campaign     
##  Min.   : 1.00   may    :12731   Min.   :   0.0   Min.   : 1.000  
##  1st Qu.: 8.00   jul    : 6443   1st Qu.: 103.0   1st Qu.: 1.000  
##  Median :16.00   aug    : 5579   Median : 179.0   Median : 2.000  
##  Mean   :15.77   jun    : 4665   Mean   : 256.2   Mean   : 2.774  
##  3rd Qu.:21.00   nov    : 3072   3rd Qu.: 316.0   3rd Qu.: 3.000  
##  Max.   :31.00   apr    : 2593   Max.   :3881.0   Max.   :58.000  
##                  (Other): 5399                                    
##      pdays          previous           poutcome           y         
##  Min.   : -1.0   Min.   :  0.0000   failure: 4337   Min.   :0.0000  
##  1st Qu.: -1.0   1st Qu.:  0.0000   other  : 1638   1st Qu.:0.0000  
##  Median : -1.0   Median :  0.0000   success: 1286   Median :0.0000  
##  Mean   : 40.3   Mean   :  0.5676   unknown:33221   Mean   :0.1117  
##  3rd Qu.: -1.0   3rd Qu.:  0.0000                   3rd Qu.:0.0000  
##  Max.   :871.0   Max.   :275.0000                   Max.   :1.0000  
##                                                                     
##   balance_log     duration_log  
##  Min.   :0.000   Min.   :0.000  
##  1st Qu.:7.596   1st Qu.:4.644  
##  Median :7.738   Median :5.193  
##  Mean   :7.810   Mean   :5.165  
##  3rd Qu.:7.981   3rd Qu.:5.759  
##  Max.   :8.595   Max.   :8.264  
## 

# Visualisation

# Histograms of age, balance and duration.
hist(
  data[["age"]],
  main = "Distribusi Umur", col = "lightblue", xlab = "Age"
)

hist(
  data[["balance"]],
  main = "Distribusi Saldo", col = "lightgreen", xlab = "Balance"
)

hist(
  data[["duration"]],
  main = "Distribusi Durasi", col = "lightcoral", xlab = "Duration"
)

# Boxplots of age and balance.
boxplot(data[["age"]], main = "Boxplot Umur")

boxplot(data[["balance"]], main = "Boxplot Saldo")

# Bar charts of the category counts for job, marital status and education.
# las = 2 draws the job labels perpendicular to the axis.
barplot(table(data[["job"]]), main = "Distribusi Pekerjaan", las = 2)

barplot(table(data[["marital"]]), main = "Status Pernikahan")

barplot(table(data[["education"]]), main = "Pendidikan")

# Scatter plot of balance against age.
plot(
  data[["age"]], data[["balance"]],
  main = "Age vs Balance",
  xlab = "Age", ylab = "Balance"
)

# Balance grouped by job, and age grouped by marital status.
boxplot(
  balance ~ job,
  data = data,
  main = "Saldo berdasarkan Pekerjaan",
  las = 2
)

boxplot(
  age ~ marital,
  data = data,
  main = "Umur berdasarkan Status Pernikahan"
)

# Check patterns & trends in the data

# Scatter plot of balance against age.
plot(data$age, data$balance,
     xlab = "Age", ylab = "Balance",
     main = "Pola Age vs Balance")

# Trend with a regression line: the same scatter with the least-squares
# fit of balance on age overlaid in red. Axis labels and a title are set
# explicitly; without them plot() labels the axes with the deparsed
# expressions "data$age" and "data$balance" and draws no title.
plot(data$age, data$balance,
     xlab = "Age", ylab = "Balance",
     main = "Trend Age vs Balance")
abline(lm(balance ~ age, data = data), col = "red")

# Scatter plot of balance against call duration.
plot(data$duration, data$balance,
     xlab = "Duration", ylab = "Balance",
     main = "Duration vs Balance")

# Correlation analysis: correlation matrix of all numeric columns.

# vapply() always returns a logical vector (one flag per column), whereas
# sapply() changes its return type with the input, e.g. it yields an empty
# list for a data frame with no columns.
data_num <- data[vapply(data, is.numeric, logical(1))]
cor_matrix <- cor(data_num)

print(cor_matrix)
##                        age       balance           day     duration
## age           1.0000000000  0.0957114229 -0.0066220291 -0.010996557
## balance       0.0957114229  1.0000000000 -0.0001468277  0.037598586
## day          -0.0066220291 -0.0001468277  1.0000000000 -0.031916570
## duration     -0.0109965566  0.0375985864 -0.0319165696  1.000000000
## campaign      0.0067527843 -0.0337945812  0.1665749558 -0.083418019
## pdays        -0.0237253645  0.0287152460 -0.0961230507 -0.004728600
## previous      0.0008275984  0.0356609666 -0.0528602667 -0.001389868
## y             0.0154687772  0.0910488789 -0.0313013436  0.395707785
## balance_log   0.0862380510  0.9595664920 -0.0071456101  0.037316498
## duration_log -0.0084402407  0.0375709622 -0.0587988388  0.813343593
##                  campaign         pdays      previous           y balance_log
## age           0.006752784 -2.372536e-02  0.0008275984  0.01546878  0.08623805
## balance      -0.033794581  2.871525e-02  0.0356609666  0.09104888  0.95956649
## day           0.166574956 -9.612305e-02 -0.0528602667 -0.03130134 -0.00714561
## duration     -0.083418019 -4.728600e-03 -0.0013898675  0.39570778  0.03731650
## campaign      1.000000000 -8.916572e-02 -0.0322965993 -0.07249821 -0.03498384
## pdays        -0.089165716  1.000000e+00  0.4494793728  0.10074150  0.03343287
## previous     -0.032296599  4.494794e-01  1.0000000000  0.09017872  0.03822598
## y            -0.072498207  1.007415e-01  0.0901787232  1.00000000  0.09044515
## balance_log  -0.034983836  3.343287e-02  0.0382259751  0.09044515  1.00000000
## duration_log -0.208532199  3.228521e-05 -0.0062161377  0.33961950  0.03661333
##               duration_log
## age          -8.440241e-03
## balance       3.757096e-02
## day          -5.879884e-02
## duration      8.133436e-01
## campaign     -2.085322e-01
## pdays         3.228521e-05
## previous     -6.216138e-03
## y             3.396195e-01
## balance_log   3.661333e-02
## duration_log  1.000000e+00
# Draw the correlation matrix as a heatmap. symm = TRUE treats the matrix
# as symmetric, so rows and columns share one ordering, and scale = "none"
# keeps the colours tied to the raw correlation values. With the defaults
# heatmap() centres and scales every row, so equal colours in different
# rows would not mean equal correlations.
heatmap(cor_matrix, symm = TRUE, scale = "none")