# Inspect the distinct labels of the target column
unique(data[["y"]])
## [1] "no" "yes"
# Recode the target as numeric: 1 where y is "yes", 0 otherwise
data[["y"]] <- as.numeric(data[["y"]] == "yes")
# Check the result: class counts after recoding
table(data[["y"]])
##
## 0 1
## 39922 5289
# Missing values: number of NA cells in each column
colSums(is.na(data))
## age job marital education default balance housing loan
## 0 0 0 0 0 0 0 0
## contact day month duration campaign pdays previous poutcome
## 0 0 0 0 0 0 0 0
## y
## 0
# Duplicate rows: count rows that exactly repeat an earlier row
sum(duplicated(data))
## [1] 0
# Keep only the first occurrence of every row
data <- unique(data)
# Encode the categorical variables as factors.
# local() keeps the helper vector of column names out of the global
# environment; only `data` is reassigned.
data <- local({
  categorical <- c(
    "job", "marital", "education", "default", "housing",
    "loan", "contact", "month", "poutcome"
  )
  data[categorical] <- lapply(data[categorical], as.factor)
  data
})
# Outliers: drop rows whose balance lies more than 1.5 * IQR below the
# first quartile or above the third quartile.
boxplot(data[["balance"]], main = "Boxplot Balance (Sebelum)")
Q1 <- quantile(data[["balance"]], probs = 0.25)
Q3 <- quantile(data[["balance"]], probs = 0.75)
# NOTE: this numeric variable shares its name with the stats::IQR() function.
IQR <- Q3 - Q1
# Keep every row that is NOT below the lower fence or above the upper fence
data <- data[
  !(data[["balance"]] < (Q1 - 1.5 * IQR) |
    data[["balance"]] > (Q3 + 1.5 * IQR)),
]
boxplot(data[["balance"]], main = "Boxplot Balance (Sesudah)")
# Transformation: log-scale versions of balance and duration.
# balance is shifted by its minimum so the smallest value maps to log(1) = 0.
min_balance <- min(data[["balance"]])
data[["balance_log"]] <- log(1 + (data[["balance"]] - min_balance))
data[["duration_log"]] <- log(1 + data[["duration"]])
# Check the result
summary(data[["balance_log"]])
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.000 7.596 7.738 7.810 7.981 8.595
# Descriptive statistics: print a summary of every column in `data`
# (the console output is kept below as `##` comments).
summary(data)
## age job marital education
## Min. :18.00 blue-collar:8951 divorced: 4750 primary : 6163
## 1st Qu.:32.00 management :8137 married :24198 secondary:21208
## Median :39.00 technician :6853 single :11534 tertiary :11476
## Mean :40.63 admin. :4726 unknown : 1635
## 3rd Qu.:48.00 services :3843
## Max. :95.00 retired :1914
## (Other) :6058
## default balance housing loan contact
## no :39679 Min. :-1944.0 no :17627 no :33632 cellular :26140
## yes: 803 1st Qu.: 46.0 yes:22855 yes: 6850 telephone: 2498
## Median : 349.0 unknown :11844
## Mean : 640.6
## 3rd Qu.: 980.8
## Max. : 3462.0
##
## day month duration campaign
## Min. : 1.00 may :12731 Min. : 0.0 Min. : 1.000
## 1st Qu.: 8.00 jul : 6443 1st Qu.: 103.0 1st Qu.: 1.000
## Median :16.00 aug : 5579 Median : 179.0 Median : 2.000
## Mean :15.77 jun : 4665 Mean : 256.2 Mean : 2.774
## 3rd Qu.:21.00 nov : 3072 3rd Qu.: 316.0 3rd Qu.: 3.000
## Max. :31.00 apr : 2593 Max. :3881.0 Max. :58.000
## (Other): 5399
## pdays previous poutcome y
## Min. : -1.0 Min. : 0.0000 failure: 4337 Min. :0.0000
## 1st Qu.: -1.0 1st Qu.: 0.0000 other : 1638 1st Qu.:0.0000
## Median : -1.0 Median : 0.0000 success: 1286 Median :0.0000
## Mean : 40.3 Mean : 0.5676 unknown:33221 Mean :0.1117
## 3rd Qu.: -1.0 3rd Qu.: 0.0000 3rd Qu.:0.0000
## Max. :871.0 Max. :275.0000 Max. :1.0000
##
## balance_log duration_log
## Min. :0.000 Min. :0.000
## 1st Qu.:7.596 1st Qu.:4.644
## Median :7.738 Median :5.193
## Mean :7.810 Mean :5.165
## 3rd Qu.:7.981 3rd Qu.:5.759
## Max. :8.595 Max. :8.264
##
# Visualisation
# Histograms of the numeric variables
hist(data[["age"]], main = "Distribusi Umur", col = "lightblue", xlab = "Age")
hist(data[["balance"]], main = "Distribusi Saldo", col = "lightgreen",
     xlab = "Balance")
hist(data[["duration"]], main = "Distribusi Durasi", col = "lightcoral",
     xlab = "Duration")
# Box plots of age and balance
boxplot(data[["age"]], main = "Boxplot Umur")
boxplot(data[["balance"]], main = "Boxplot Saldo")
# Bar charts of category frequencies (las = 2 rotates the axis labels)
barplot(table(data[["job"]]), main = "Distribusi Pekerjaan", las = 2)
barplot(table(data[["marital"]]), main = "Status Pernikahan")
barplot(table(data[["education"]]), main = "Pendidikan")
# Scatter plot of balance against age
plot(
  data[["age"]], data[["balance"]],
  main = "Age vs Balance",
  xlab = "Age", ylab = "Balance"
)
# Balance grouped by job, and age grouped by marital status
boxplot(
  balance ~ job,
  data = data,
  main = "Saldo berdasarkan Pekerjaan",
  las = 2
)
boxplot(
  age ~ marital,
  data = data,
  main = "Umur berdasarkan Status Pernikahan"
)
# Check patterns and trends in the data
plot(data$age, data$balance,
     xlab = "Age", ylab = "Balance",
     main = "Pola Age vs Balance")
# Trend with a regression line: scatter plot of balance against age with
# the least-squares fit of balance ~ age drawn in red.
# Title and axis labels are set explicitly, like the other plots; without
# them the axes would be labelled with the raw expressions `data$age` and
# `data$balance`.
plot(data$age, data$balance,
     xlab = "Age", ylab = "Balance",
     main = "Trend Age vs Balance")
abline(lm(balance ~ age, data = data), col = "red")
# Scatter plot of balance against call duration
plot(data$duration, data$balance,
     xlab = "Duration", ylab = "Balance",
     main = "Duration vs Balance")
# Correlation analysis
# Select the numeric columns only. vapply() is used instead of sapply() so
# the selector is always a logical vector, whatever the shape of `data`.
data_num <- data[vapply(data, is.numeric, logical(1))]
# Pairwise correlation matrix of the numeric columns (cor() defaults)
cor_matrix <- cor(data_num)
print(cor_matrix)
## age balance day duration
## age 1.0000000000 0.0957114229 -0.0066220291 -0.010996557
## balance 0.0957114229 1.0000000000 -0.0001468277 0.037598586
## day -0.0066220291 -0.0001468277 1.0000000000 -0.031916570
## duration -0.0109965566 0.0375985864 -0.0319165696 1.000000000
## campaign 0.0067527843 -0.0337945812 0.1665749558 -0.083418019
## pdays -0.0237253645 0.0287152460 -0.0961230507 -0.004728600
## previous 0.0008275984 0.0356609666 -0.0528602667 -0.001389868
## y 0.0154687772 0.0910488789 -0.0313013436 0.395707785
## balance_log 0.0862380510 0.9595664920 -0.0071456101 0.037316498
## duration_log -0.0084402407 0.0375709622 -0.0587988388 0.813343593
## campaign pdays previous y balance_log
## age 0.006752784 -2.372536e-02 0.0008275984 0.01546878 0.08623805
## balance -0.033794581 2.871525e-02 0.0356609666 0.09104888 0.95956649
## day 0.166574956 -9.612305e-02 -0.0528602667 -0.03130134 -0.00714561
## duration -0.083418019 -4.728600e-03 -0.0013898675 0.39570778 0.03731650
## campaign 1.000000000 -8.916572e-02 -0.0322965993 -0.07249821 -0.03498384
## pdays -0.089165716 1.000000e+00 0.4494793728 0.10074150 0.03343287
## previous -0.032296599 4.494794e-01 1.0000000000 0.09017872 0.03822598
## y -0.072498207 1.007415e-01 0.0901787232 1.00000000 0.09044515
## balance_log -0.034983836 3.343287e-02 0.0382259751 0.09044515 1.00000000
## duration_log -0.208532199 3.228521e-05 -0.0062161377 0.33961950 0.03661333
## duration_log
## age -8.440241e-03
## balance 3.757096e-02
## day -5.879884e-02
## duration 8.133436e-01
## campaign -2.085322e-01
## pdays 3.228521e-05
## previous -6.216138e-03
## y 3.396195e-01
## balance_log 3.661333e-02
## duration_log 1.000000e+00
# Heatmap of the correlation matrix.
# symm = TRUE treats the matrix as symmetric: rows and columns share the
# same ordering, and heatmap()'s default row-wise scaling is switched off
# so the colours reflect the actual correlation values.
heatmap(cor_matrix, symm = TRUE)