Deposito berjangka adalah produk bank sejenis tabungan, dimana uang yang disetorkan dalam deposito berjangka tidak boleh ditarik nasabah dan baru bisa dicairkan sesuai dengan tanggal jatuh temponya. Proses klasterisasi menggunakan metode K-MEANS untuk melakukan pengelompokan dari pelanggan yang tidak menggunakan deposito berjangka.
Membaca data dari file format csv
data <- read.csv("bank-additional-full.csv", header = T, sep=";")
Struktur Data
str(data)
## 'data.frame': 41188 obs. of 21 variables:
## $ age : int 56 57 37 40 56 45 59 41 24 25 ...
## $ job : Factor w/ 12 levels "admin.","blue-collar",..: 4 8 8 1 8 8 1 2 10 8 ...
## $ marital : Factor w/ 4 levels "divorced","married",..: 2 2 2 2 2 2 2 2 3 3 ...
## $ education : Factor w/ 8 levels "basic.4y","basic.6y",..: 1 4 4 2 4 3 6 8 6 4 ...
## $ default : Factor w/ 3 levels "no","unknown",..: 1 2 1 1 1 2 1 2 1 1 ...
## $ housing : Factor w/ 3 levels "no","unknown",..: 1 1 3 1 1 1 1 1 3 3 ...
## $ loan : Factor w/ 3 levels "no","unknown",..: 1 1 1 1 3 1 1 1 1 1 ...
## $ contact : Factor w/ 2 levels "cellular","telephone": 2 2 2 2 2 2 2 2 2 2 ...
## $ month : Factor w/ 10 levels "apr","aug","dec",..: 7 7 7 7 7 7 7 7 7 7 ...
## $ day_of_week : Factor w/ 5 levels "fri","mon","thu",..: 2 2 2 2 2 2 2 2 2 2 ...
## $ duration : int 261 149 226 151 307 198 139 217 380 50 ...
## $ campaign : int 1 1 1 1 1 1 1 1 1 1 ...
## $ pdays : int 999 999 999 999 999 999 999 999 999 999 ...
## $ previous : int 0 0 0 0 0 0 0 0 0 0 ...
## $ poutcome : Factor w/ 3 levels "failure","nonexistent",..: 2 2 2 2 2 2 2 2 2 2 ...
## $ emp.var.rate : num 1.1 1.1 1.1 1.1 1.1 1.1 1.1 1.1 1.1 1.1 ...
## $ cons.price.idx: num 94 94 94 94 94 ...
## $ cons.conf.idx : num -36.4 -36.4 -36.4 -36.4 -36.4 -36.4 -36.4 -36.4 -36.4 -36.4 ...
## $ euribor3m : num 4.86 4.86 4.86 4.86 4.86 ...
## $ nr.employed : num 5191 5191 5191 5191 5191 ...
## $ y : Factor w/ 2 levels "no","yes": 1 1 1 1 1 1 1 1 1 1 ...
Deskriptif Data
dim(data)
## [1] 41188 21
names(data)
## [1] "age" "job" "marital" "education"
## [5] "default" "housing" "loan" "contact"
## [9] "month" "day_of_week" "duration" "campaign"
## [13] "pdays" "previous" "poutcome" "emp.var.rate"
## [17] "cons.price.idx" "cons.conf.idx" "euribor3m" "nr.employed"
## [21] "y"
summary(data)
## age job marital
## Min. :17.00 admin. :10422 divorced: 4612
## 1st Qu.:32.00 blue-collar: 9254 married :24928
## Median :38.00 technician : 6743 single :11568
## Mean :40.02 services : 3969 unknown : 80
## 3rd Qu.:47.00 management : 2924
## Max. :98.00 retired : 1720
## (Other) : 6156
## education default housing
## university.degree :12168 no :32588 no :18622
## high.school : 9515 unknown: 8597 unknown: 990
## basic.9y : 6045 yes : 3 yes :21576
## professional.course: 5243
## basic.4y : 4176
## basic.6y : 2292
## (Other) : 1749
## loan contact month day_of_week
## no :33950 cellular :26144 may :13769 fri:7827
## unknown: 990 telephone:15044 jul : 7174 mon:8514
## yes : 6248 aug : 6178 thu:8623
## jun : 5318 tue:8090
## nov : 4101 wed:8134
## apr : 2632
## (Other): 2016
## duration campaign pdays previous
## Min. : 0.0 Min. : 1.000 Min. : 0.0 Min. :0.000
## 1st Qu.: 102.0 1st Qu.: 1.000 1st Qu.:999.0 1st Qu.:0.000
## Median : 180.0 Median : 2.000 Median :999.0 Median :0.000
## Mean : 258.3 Mean : 2.568 Mean :962.5 Mean :0.173
## 3rd Qu.: 319.0 3rd Qu.: 3.000 3rd Qu.:999.0 3rd Qu.:0.000
## Max. :4918.0 Max. :56.000 Max. :999.0 Max. :7.000
##
## poutcome emp.var.rate cons.price.idx cons.conf.idx
## failure : 4252 Min. :-3.40000 Min. :92.20 Min. :-50.8
## nonexistent:35563 1st Qu.:-1.80000 1st Qu.:93.08 1st Qu.:-42.7
## success : 1373 Median : 1.10000 Median :93.75 Median :-41.8
## Mean : 0.08189 Mean :93.58 Mean :-40.5
## 3rd Qu.: 1.40000 3rd Qu.:93.99 3rd Qu.:-36.4
## Max. : 1.40000 Max. :94.77 Max. :-26.9
##
## euribor3m nr.employed y
## Min. :0.634 Min. :4964 no :36548
## 1st Qu.:1.344 1st Qu.:5099 yes: 4640
## Median :4.857 Median :5191
## Mean :3.621 Mean :5167
## 3rd Qu.:4.961 3rd Qu.:5228
## Max. :5.045 Max. :5228
##
Visualisasi barlot untuk setiap variabel data bank
par(mfrow=c(2,2))
for(i in 1:length(data))
{barplot(prop.table(table(data[,i])) ,
xlab=names(data[i]), ylab= "Frequency (%)" , col = rainbow(3))}
Melihat data yang tidak bernilai
sum(is.na(data))
## [1] 0
Mengubah data ke tipe numerik
data$job <- as.numeric(data$job)
data$marital <- as.numeric(data$marital)
data$education <- as.numeric(data$education)
data$default <- as.numeric(data$default)
data$housing <- as.numeric(data$housing)
data$loan <- as.numeric(data$loan)
data$contact <- as.numeric(data$contact)
data$month <- as.numeric(data$month)
data$day_of_week <- as.numeric(data$day_of_week)
data$day_of_week <- as.numeric(data$poutcome)
data$age <- as.numeric(data$age)
data$duration <- as.numeric(data$duration)
data$campaign <- as.numeric(data$campaign)
data$pdays <- as.numeric(data$pdays)
data$previous <- as.numeric(data$previous)
data$emp.var.rate <- as.numeric(data$emp.var.rate)
data$cons.price.idx <- as.numeric(data$cons.price.idx)
data$cons.conf.idx <- as.numeric(data$cons.conf.idx)
data$nr.employed <- as.numeric(data$nr.employed)
data$poutcome <- as.numeric(data$poutcome)
Melihat data yang telah diubah
sapply(data,class)
## age job marital education default
## "numeric" "numeric" "numeric" "numeric" "numeric"
## housing loan contact month day_of_week
## "numeric" "numeric" "numeric" "numeric" "numeric"
## duration campaign pdays previous poutcome
## "numeric" "numeric" "numeric" "numeric" "numeric"
## emp.var.rate cons.price.idx cons.conf.idx euribor3m nr.employed
## "numeric" "numeric" "numeric" "numeric" "numeric"
## y
## "factor"
Mengubah data ke tipe biner
data$y <- ifelse(data$y =="yes", 1,0)
bank.tidak.berjangka <- data[which(data$y<1),]
Mengatur ulang data dan menghilangkan nilai y
bank.tidak.berjangka2 <- bank.tidak.berjangka[-21]
Normalisasi data
bank.tidak.berjangka_scale <- scale(bank.tidak.berjangka2)
str(bank.tidak.berjangka_scale)
## num [1:36548, 1:20] 1.62544 1.72647 -0.29411 0.00897 1.62544 ...
## - attr(*, "dimnames")=List of 2
## ..$ : chr [1:36548] "1" "2" "3" "4" ...
## ..$ : chr [1:20] "age" "job" "marital" "education" ...
## - attr(*, "scaled:center")= Named num [1:20] 39.91 4.69 2.16 4.7 1.22 ...
## ..- attr(*, "names")= chr [1:20] "age" "job" "marital" "education" ...
## - attr(*, "scaled:scale")= Named num [1:20] 9.898 3.587 0.605 2.134 0.417 ...
## ..- attr(*, "names")= chr [1:20] "age" "job" "marital" "education" ...
Selanjutnya, kita menentukan jumlah cluster
par(mfrow=c(1,1))
wss <- (nrow(bank.tidak.berjangka_scale))*sum(apply(bank.tidak.berjangka_scale,2,var))
for(i in 2:7){
wss[i]<-sum(kmeans(bank.tidak.berjangka_scale,centers=i)$withinss)
}
plot(1:7, wss, type="b", xlab="Jumlah Kluster",
ylab="Jumlah Pengelompokan")
Berdasarkan terjadinya siku keatas pada nilai 3, jadi nilai kluster 3 Membuat model KMEANS
fit <- kmeans(bank.tidak.berjangka_scale, 3)
Hasil Yang didapat
aggregate(bank.tidak.berjangka2, by=list(fit$cluster), FUN = mean)
## Group.1 age job marital education default housing loan
## 1 1 39.01144 4.668458 2.252859 4.721097 1.121951 2.164255 1.330302
## 2 2 39.62757 4.633872 2.199781 4.719375 1.133187 2.138394 1.334886
## 3 3 40.20619 4.707511 2.131971 4.695812 1.264761 2.030263 1.327354
## contact month day_of_week duration campaign pdays previous
## 1 1.119057 5.367645 2.065730 220.4066 2.217445 933.7065 0.09204906
## 2 1.072897 6.000274 1.001096 211.7841 2.028227 979.7555 1.14277884
## 3 1.511934 5.089814 2.000000 222.2582 2.836791 999.0000 0.00000000
## poutcome emp.var.rate cons.price.idx cons.conf.idx euribor3m nr.employed
## 1 2.065730 -2.003528 92.96657 -43.54418 1.231444 5081.824
## 2 1.001096 -1.611209 93.05368 -43.34130 1.740004 5099.719
## 3 2.000000 1.151033 93.86237 -39.36682 4.836460 5213.746
table(fit$cluster,bank.tidak.berjangka$y)
##
## 0
## 1 7257
## 2 3649
## 3 25642
Visualisasi hasil kluster
plot(bank.tidak.berjangka2[c(1,2)],col=fit$cluster)
bank.berjangka <- data[which(data$y<1),]
Mengatur ulang data dan menghilangkan nilai y
bank.berjangka2 <- bank.berjangka[-21]
Normalisasi data
bank.berjangka_scale <- scale(bank.berjangka2)
str(bank.berjangka_scale)
## num [1:36548, 1:20] 1.62544 1.72647 -0.29411 0.00897 1.62544 ...
## - attr(*, "dimnames")=List of 2
## ..$ : chr [1:36548] "1" "2" "3" "4" ...
## ..$ : chr [1:20] "age" "job" "marital" "education" ...
## - attr(*, "scaled:center")= Named num [1:20] 39.91 4.69 2.16 4.7 1.22 ...
## ..- attr(*, "names")= chr [1:20] "age" "job" "marital" "education" ...
## - attr(*, "scaled:scale")= Named num [1:20] 9.898 3.587 0.605 2.134 0.417 ...
## ..- attr(*, "names")= chr [1:20] "age" "job" "marital" "education" ...
Menentukan jumlah cluster
wss <- (nrow(bank.berjangka_scale))*sum(apply(bank.berjangka_scale,2,var))
for(i in 2:7){
wss[i]<-sum(kmeans(bank.berjangka_scale,centers=i)$withinss)
}
plot(1:7, wss, type="b", xlab="Jumlah Klaster",
ylab="Jumlah Pengelompokan")
Berdasarkan terjadinya siku keatas pada nilai 5, jadi nilai kluster 5
fit <- kmeans(bank.berjangka_scale, 5)
Hasil yang di dapat
aggregate(bank.berjangka2, by=list(fit$cluster), FUN = mean)
## Group.1 age job marital education default housing loan
## 1 1 40.04989 4.635956 2.145459 4.759768 1.264256 2.140444 2.996832
## 2 2 39.20481 4.659402 2.235448 4.720313 1.125361 2.156375 1.331191
## 3 3 40.28053 4.574660 2.104646 4.366495 1.307743 1.922290 1.033881
## 4 4 40.21435 4.877898 2.152851 5.023584 1.221813 2.108867 1.023385
## 5 5 39.96967 4.661107 2.165277 4.680061 1.231236 1.996967 1.225171
## contact month day_of_week duration campaign pdays previous
## 1 1.492344 5.009768 2.001320 195.6613 2.845565 997.6869 1.583949e-03
## 2 1.103381 5.575114 1.703083 204.3762 2.151998 952.1256 4.444444e-01
## 3 2.000000 6.139934 2.000094 196.3047 2.920601 998.9068 9.385265e-05
## 4 1.005672 3.966564 2.002488 182.0909 2.772714 996.5245 2.587322e-03
## 5 1.432146 5.491281 1.991660 920.7172 2.552691 991.4754 2.501895e-02
## poutcome emp.var.rate cons.price.idx cons.conf.idx euribor3m nr.employed
## 1 2.001320 1.1557550 93.85056 -39.40847 4.837511 5214.309
## 2 1.703083 -1.8773214 92.99589 -43.49213 1.392734 5087.452
## 3 2.000094 1.1908024 94.12015 -38.72639 4.873467 5205.953
## 4 2.002488 1.1152951 93.59432 -39.96152 4.801161 5221.961
## 5 1.991660 0.7423048 93.74362 -40.28749 4.435521 5199.273
table(fit$cluster,bank.berjangka$y)
##
## 0
## 1 3788
## 2 10737
## 3 10655
## 4 10049
## 5 1319
Visualisasi hasil kluster
plot(bank.berjangka2[c(1,2)],col=fit$cluster)