고객 데이터를 바탕으로 고객 특성을 분석하여 고객의 정기예금 가입 여부를 예측

1 Data Prepration

getwd()
## [1] "C:/Users/Administrator/Desktop/BIG DATA"
setwd("C:/Users/Administrator/Desktop/BIG DATA/03 R 금융데이터")
Bank<-read.csv("bank.csv",header = TRUE, na.string = c("", " "))
str(Bank)
## 'data.frame':    11162 obs. of  17 variables:
##  $ age      : int  59 56 41 55 54 42 56 60 37 28 ...
##  $ job      : Factor w/ 12 levels "admin.","blue-collar",..: 1 1 10 8 1 5 5 6 10 8 ...
##  $ marital  : Factor w/ 3 levels "divorced","married",..: 2 2 2 2 2 3 2 1 2 3 ...
##  $ education: Factor w/ 4 levels "primary","secondary",..: 2 2 2 2 3 3 3 2 2 2 ...
##  $ default  : Factor w/ 2 levels "no","yes": 1 1 1 1 1 1 1 1 1 1 ...
##  $ balance  : int  2343 45 1270 2476 184 0 830 545 1 5090 ...
##  $ housing  : Factor w/ 2 levels "no","yes": 2 1 2 2 1 2 2 2 2 2 ...
##  $ loan     : Factor w/ 2 levels "no","yes": 1 1 1 1 1 2 2 1 1 1 ...
##  $ contact  : Factor w/ 3 levels "cellular","telephone",..: 3 3 3 3 3 3 3 3 3 3 ...
##  $ day      : int  5 5 5 5 5 5 6 6 6 6 ...
##  $ month    : Factor w/ 12 levels "apr","aug","dec",..: 9 9 9 9 9 9 9 9 9 9 ...
##  $ duration : int  1042 1467 1389 579 673 562 1201 1030 608 1297 ...
##  $ campaign : int  1 1 1 1 2 2 1 1 1 3 ...
##  $ pdays    : int  -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 ...
##  $ previous : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ poutcome : Factor w/ 4 levels "failure","other",..: 4 4 4 4 4 4 4 4 4 4 ...
##  $ deposit  : Factor w/ 2 levels "no","yes": 2 2 2 2 2 2 2 2 2 2 ...
Bank_Flow <-Bank

1.1 Finding Missing Values

colSums(is.na(Bank))
##       age       job   marital education   default   balance   housing      loan 
##         0         0         0         0         0         0         0         0 
##   contact       day     month  duration  campaign     pdays  previous  poutcome 
##         0         0         0         0         0         0         0         0 
##   deposit 
##         0

1.2 Outliers

boxplot(Bank)

1.3 Outliers-Age

Box_Age <-boxplot(Bank$age, main="Age")

Upper_side_remove_age <-quantile(Bank$age, 0.75)+1.5*IQR(Bank$age)
Bank[Bank$age > round(Upper_side_remove_age),"age"] <-round(Upper_side_remove_age)

Bank$age_2<-trunc(Bank$age/10)*10
View(Bank)

Bank$age_group <-ifelse(Bank$age >= 18 & Bank$age <40, "Adult",
                        ifelse(Bank$age >=40 & Bank$age < 60, "Middle Age",
                               ifelse(Bank$age >=60, "Older","Child")))

Bank$age_group <-as.factor(Bank$age_group)
table(Bank$age_group) # Categorical Variable using table (Freqency)
## 
##      Adult Middle Age      Older 
##       5869       4513        780
boxplot(Bank$age, main="Age")

1.4 Outlier _ Duration 나누기 60.

boxplot(Bank$duration/60, main="Duration")

Upper_outlier_duration <-quantile(Bank$duration, 0.75)+1.5*IQR(Bank$duration)
Bank[Bank$duration > round(Upper_outlier_duration),"duration"] <-round(Upper_outlier_duration)

boxplot(Bank$duration, main="Duration")

2 Data Engineering

2.1 Job

summary(Bank$job)
##        admin.   blue-collar  entrepreneur     housemaid    management 
##          1334          1944           328           274          2566 
##       retired self-employed      services       student    technician 
##           778           405           923           360          1823 
##    unemployed       unknown 
##           357            70
Bank$job <- ifelse(Bank$job == 'admin.' | Bank$job == 'management', 0,
                   ifelse(Bank$job == 'blue-collar' |Bank$job == 'technician', 1,
                          ifelse(Bank$job == 'entrepreneur' | Bank$job == 'self-employed', 2,
                                 ifelse(Bank$job == 'housemaid' | Bank$job == 'services', 3,
                                        ifelse(Bank$job == 'retired', 4,
                                               ifelse(Bank$job == 'student', 5,
                                                      ifelse(Bank$job == 'unemployed', 6, 7)))))))

table(Bank$job)
## 
##    0    1    2    3    4    5    6    7 
## 3900 3767  733 1197  778  360  357   70

2.2 Marital

summary(Bank$marital)
## divorced  married   single 
##     1293     6351     3518
# 이혼 0, 결혼 1, 싱글 : 2 
Bank$marital <-ifelse(Bank$marital=='divorced',0,
                          ifelse(Bank$marital == 'married',1, 2))

table(Bank$marital)
## 
##    0    1    2 
## 1293 6351 3518

2.3 Duration

summary(Bank$duration_group)
## Length  Class   Mode 
##      0   NULL   NULL
Bank$duration_group <-round(Bank$duration/60,0)

Bank$duration_group <-ifelse(Bank$duration < 5, "1",
                             ifelse(Bank$duration >=5 & Bank$duration <10,"2",
                                    ifelse(Bank$duration>=10 & Bank$duration <15,"3","4")))

Bank$duration_group <-as.factor(Bank$duration_group)
table(Bank$duration_group)
## 
##     1     2     3     4 
##     4    52    69 11037

2.4 Month_Data >> Numeric factor

Bank$month <- ifelse(Bank$month == 'apr', 4,
                     ifelse(Bank$month == 'aug', 8,
                            ifelse(Bank$month == 'dec', 12,
                                   ifelse(Bank$month == 'feb', 2,
                                          ifelse(Bank$month == 'jan', 1,
                                                 ifelse(Bank$month == 'jul', 7,
                                                        ifelse(Bank$month == 'jun', 6,
                                                               ifelse(Bank$month == 'mar', 3,
                                                                      ifelse(Bank$month == 'may', 5,
                                                                             ifelse(Bank$month == 'dec', 12,
                                                                                    ifelse(Bank$month == 'oct', 10,
                                                                                           ifelse(Bank$month == 'nov', 11, 9))))))))))))

table(Bank$month)
## 
##    1    2    3    4    5    6    7    8    9   10   11   12 
##  344  776  276  923 2824 1222 1514 1519  319  392  943  110

2.5 Education

summary(Bank$education)
##   primary secondary  tertiary   unknown 
##      1500      5476      3689       497
Bank$education <-ifelse(Bank$education == 'primary',0,
                      ifelse(Bank$education == 'secondary',1,
                             ifelse(Bank$education == 'tertiary',2,3)))

table(Bank$education)
## 
##    0    1    2    3 
## 1500 5476 3689  497

2.6 Default

Bank$default <-ifelse(Bank$default=='no',0,1)

table(Bank$default)
## 
##     0     1 
## 10994   168

2.7 Housing Loan, Contact, poutcome

Bank$housing <-ifelse(Bank$housing=='no',0,1)
table(Bank$housing)
## 
##    0    1 
## 5881 5281
Bank$loan <-ifelse(Bank$loan=='no',0,1)
table(Bank$loan)
## 
##    0    1 
## 9702 1460
summary(Bank$contact)
##  cellular telephone   unknown 
##      8042       774      2346
Bank$contact <-ifelse(Bank$contact == 'cellular',0,
                          ifelse(Bank$contact=='telephone',1,2))

table(Bank$New_contact)
## < table of extent 0 >
summary(Bank$poutcome)
## failure   other success unknown 
##    1228     537    1071    8326
Bank$poutcome <-ifelse(Bank$poutcome == 'other' | Bank$poutcome == 'unknown',0,
                          ifelse(Bank$contact=='failure',1,2))


table(Bank$poutcome)
## 
##    0    2 
## 8863 2299

3 Visualization

3.1 Deposit Rate

Deposit<-Bank%>%
  group_by(deposit)%>%
  summarise(cnt=n())%>%
  mutate(Percentage=round(cnt/sum(cnt)*100))

Deposit
## # A tibble: 2 x 3
##   deposit   cnt Percentage
##   <fct>   <int>      <dbl>
## 1 no       5873         53
## 2 yes      5289         47
ggplot(Deposit, aes(x=deposit, y=Percentage))+
  geom_bar(stat = "identity", fill="lightgreen", col="black")+
  geom_label(data=Deposit, aes(x=deposit, y=Percentage, label=paste0(Percentage, "%"),
                               group=deposit))+
  theme_bw()

3.2 Age

Bank$age <-as.numeric(Bank$age)

ggplot(Bank, aes(age, fill=deposit))+
  geom_bar()+
  ggtitle("Deposit by Age")+
  theme_economist_white() 

ggplot(Bank, aes(x=age_group, fill=deposit))+
  geom_bar(stat="count",position="fill")+
  facet_wrap(~job)

ggplot(Bank, aes(x=age_group, fill=deposit))+
  geom_bar(position="stack")+
  scale_fill_brewer(palette = "Set1")+
  theme_economist_white()+
  geom_hline(yintercept = mean(Bank$deposit))+
  ggtitle("Deposite rate by Age Groups")

  • Depsotie Yes + Age_ Groups (Older Middling Adults) - Bar Chart
Bank$age_group<-as.factor(Bank$age_group)
age_group_deposit<-xtabs(~age_group + deposit, data=Bank)
age_group_deposit<-as.data.frame(age_group_deposit)


age_group_deposit_yes <-subset(age_group_deposit, age_group_deposit$deposit=="yes")
kable(age_group_deposit_yes)
age_group deposit Freq
4 Adult yes 2841
5 Middle Age yes 1848
6 Older yes 600
ggplot(age_group_deposit_yes, aes(x=age_group, y=Freq))+
  geom_bar(stat = "identity", fill="green", col="black")+
  coord_flip()+
  geom_label(data=age_group_deposit_yes, aes(x=age_group, y=Freq, label=paste0(Freq,""),
                                             group=deposit), position=position_stack(vjust=0.5))

*Depsotie Yes + Age_ Groups (Older Middling Adults) - Pie Chart

ggplot(age_group_deposit_yes, aes(x="", y=Freq, fill=factor(age_group)))+
  geom_bar(width = 1, stat = "identity")+
  theme(axis.line = element_blank(),
        plot.title = element_text(hjust = 0.5)) +
  labs(fill = "age", x = NULL, y = NULL,
       title = "Deposit Yes Age_group Contribution") +
  coord_polar(theta = "y", start = 0)+
  theme_economist()

## Job & Deposit

ggplot(Bank, aes(job, fill=deposit))+
  geom_bar(position = "stack")+
  ylab("Customers")+
  theme_minimal()

3.3 Marital=3 & Deposit

ggplot(Bank,aes(marital, fill=deposit))+
  geom_bar(position = "fill")+
  ylab("Customer")+
  theme_minimal()

3.4 Educcation & Deposit

ggplot(Bank, aes(education, fill=deposit))+
  geom_bar(position = "stack")+
  scale_fill_brewer(palette = "Set2")+
  theme_minimal()

3.5 Default & Deposit

ggplot(Bank, aes(default, fill=deposit))+
  geom_bar(position = "stack")+
  scale_fill_brewer(palette = "Set2")

####Deposit rate by default and eudacation

ggplot(Bank, aes(default, fill=deposit))+
  geom_bar(position="stack")+
  facet_wrap(~education)

3.6 Balance & Deposit

ggplot(Bank, aes(balance))+geom_histogram(aes(fill=deposit), color="black")+
  ggtitle("Blance VS Deposit")+
  theme_economist()

ggplot(Bank, aes(x=deposit, y=balance))+
  geom_point(stat="identity")+
  theme_minimal()+
  coord_flip()

3.7 Housing (Yes, No) & Deposit

Housing_Data<-Bank%>%
  group_by(housing, deposit)%>%
  summarise(count=n())%>%
  mutate(Percent= round(count/sum(count)*100))

kable(Housing_Data)
housing deposit count Percent
0 no 2527 43
0 yes 3354 57
1 no 3346 63
1 yes 1935 37
ggplot(Housing_Data, aes(x=housing,y=Percent, fill=factor(deposit)))+
  geom_bar(stat= "identity", position = "stack")+
  geom_label(data=Housing_Data, aes(x=housing, y=Percent,label=paste0(Percent,"%"),
                               group=deposit), position = position_stack(vjust=0.5))+
  theme_minimal()

3.8 Loan (Yes, No) - Deposite

Loan_Data<-Bank%>%
  group_by(loan, deposit)%>%
  summarise(count=n())%>%
  mutate(Percent=round(count/sum(count)*100))

kable(Loan_Data, format = "markdown")
loan deposit count Percent
0 no 4897 50
0 yes 4805 50
1 no 976 67
1 yes 484 33
ggplot(Loan_Data, aes(x=loan,y=Percent, fill=factor(deposit)))+
  geom_bar(stat= "identity", position = "stack")+
  geom_label(data=Loan_Data, aes(x=loan, y=Percent,label=paste0(Percent,"%"),
                               group=deposit), position = position_stack(vjust=0.5))+
  theme_minimal()

#- admin., management : 0
#- blue-collar, technician : 1
#- entrepreneur, self-employed : 2
#- housemaid, services : 3  
#- retired : 4
#- student : 5
#- unemployed : 6
#- unknown : 7


Bank$job_name <- factor(Bank$job, level= c("0", "1","2","3","4","5","6","7"),
                   labels = c("'Admin & Management ",
                               "blue-collar, technician",
                               "entrepreneur, self-employed",
                               "housemaid, services",
                                "retired",
                               "student",
                               "unemployed",
                               "unknown"))

ggplot(Bank, aes(x=loan, fill=deposit))+
  geom_bar(position="fill", col="black")+
  facet_wrap(~job_name)+
  scale_fill_brewer(palette = "Set1")+
  theme_minimal()

3.9 Month - Deposite

Bank$month <-as.factor(Bank$month)

ggplot(Bank, aes(x=month, fill=deposit))+
  geom_bar(position="stack")+
  scale_fill_brewer(palette = "Set2")+
  theme_minimal()

Month_deposite <-Bank%>%
  group_by(deposit, month)%>%  #groupby 함수 사용시 , y~ x 
  summarise(count=n())%>%
  mutate(prob=count/sum(count))

kable(Month_deposite)
deposit month count prob
no 1 202 0.0343947
no 2 335 0.0570407
no 3 28 0.0047676
no 4 346 0.0589137
no 5 1899 0.3233441
no 6 676 0.1151030
no 7 887 0.1510301
no 8 831 0.1414950
no 9 50 0.0085135
no 10 69 0.0117487
no 11 540 0.0919462
no 12 10 0.0017027
yes 1 142 0.0268482
yes 2 441 0.0833806
yes 3 248 0.0468898
yes 4 577 0.1090943
yes 5 925 0.1748913
yes 6 546 0.1032331
yes 7 627 0.1185479
yes 8 688 0.1300813
yes 9 269 0.0508603
yes 10 323 0.0610701
yes 11 403 0.0761959
yes 12 100 0.0189072
ggplot(Month_deposite, aes(x=month, y=prob, colour=deposit))+
  geom_point(data=Month_deposite, aes(group=deposit),size=3)+
  geom_line(data=Month_deposite, aes(group=deposit))+
  scale_x_discrete("Month")+
  theme_minimal()

3.10 Campaine (이산형) - Deposite

ggplot(Bank, aes(x=campaign, fill=deposit))+
  geom_bar(position = "stack")

Top_compaign <- Bank[order(Bank$campaign, decreasing = T),]%>%
  slice(1:20)
kable(Top_compaign)
age job marital education default balance housing loan contact day month duration campaign pdays previous poutcome deposit age_2 age_group duration_group job_name
45 0 1 3 0 9051 1 0 2 19 5 124 63 -1 0 0 no 40 Middle Age 4 ’Admin & Management
51 1 1 3 0 41 1 0 1 9 7 16 43 -1 0 0 no 50 Middle Age 4 blue-collar, technician
33 1 1 1 0 0 1 1 0 31 7 16 43 -1 0 0 no 30 Adult 4 blue-collar, technician
42 1 1 0 0 170 1 0 2 19 5 51 41 -1 0 0 no 40 Middle Age 4 blue-collar, technician
49 3 0 1 0 933 0 1 0 29 7 92 33 -1 0 0 no 40 Middle Age 4 housemaid, services
50 2 1 0 0 461 1 0 2 15 5 651 32 -1 0 0 yes 50 Middle Age 4 entrepreneur, self-employed
42 2 1 2 0 1932 1 0 2 16 5 192 32 -1 0 0 no 40 Middle Age 4 entrepreneur, self-employed
31 3 1 3 0 -82 1 0 0 31 7 543 31 -1 0 0 no 30 Adult 4 housemaid, services
44 3 0 1 0 608 1 1 2 18 6 19 30 -1 0 0 no 40 Middle Age 4 housemaid, services
51 3 1 1 0 7 1 0 0 31 7 8 30 -1 0 0 no 50 Middle Age 2 housemaid, services
33 1 2 2 0 34 0 0 0 28 8 13 30 -1 0 0 no 30 Adult 3 blue-collar, technician
35 0 1 2 0 3168 0 0 0 22 8 24 30 -1 0 0 no 30 Adult 4 ’Admin & Management
46 3 1 1 0 271 1 0 0 30 7 1013 29 -1 0 0 yes 40 Middle Age 4 housemaid, services
58 0 1 1 0 1464 1 1 2 5 6 53 29 -1 0 0 no 50 Middle Age 4 ’Admin & Management
38 1 0 1 0 1686 1 0 2 19 6 9 28 -1 0 0 no 30 Adult 2 blue-collar, technician
32 1 2 2 0 723 0 0 0 28 8 68 27 -1 0 0 no 30 Adult 4 blue-collar, technician
24 5 2 0 0 0 0 0 2 5 6 25 26 -1 0 0 no 20 Adult 4 student
26 1 2 1 0 0 0 1 0 30 7 65 26 -1 0 0 no 20 Adult 4 blue-collar, technician
38 0 0 2 0 155 1 0 2 20 6 25 26 -1 0 0 no 30 Adult 4 ’Admin & Management
47 1 1 2 1 -813 1 1 2 16 6 179 25 -1 0 0 no 40 Middle Age 4 blue-collar, technician
ggplot(Top_compaign,aes(x=education, fill=deposit))+
  geom_bar(position="stack")

4 Flow Graph

Bank_Flow%>%
  count(marital, education, default, deposit)%>%
  mutate(marital=fct_relevel(as.factor(marital), c("divorced","married","single")))%>%
  mutate(education=fct_relevel(as.factor(education), c("primary","secondary","tertiary","unknown")))%>%
  mutate(default=fct_relevel(as.factor(default), c("no","yes")))%>%
  mutate(deposit=fct_relevel(as.factor(deposit), c("no","yes")))%>%
  ggplot(aes(axis1=marital, axis2=education, axis3=default, y=deposit))+
  geom_alluvium(aes(fill=deposit), aes.bind=TRUE, knot.pos=1/6)+
  geom_stratum(width = 1/3, fill = "white", color = "black")+
  geom_text(stat = "stratum", label.strata = TRUE)+
  scale_x_discrete(limits = c("marital", "education", "default"), expand = c(.05, .05))

5 Mosaic Plot

Bank_DF <-Bank_Flow[,2:4]

mosaic(~marital+education, data=Bank_DF,shade = TRUE, legend = TRUE)

mosaic(~job+education+marital, data=Bank_DF,shade = TRUE, legend = TRUE)

6 Stastics

7 Modelling