Data Prepration
getwd()
## [1] "C:/Users/Administrator/Desktop/BIG DATA"
setwd("C:/Users/Administrator/Desktop/BIG DATA/03 R 금융데이터")
Bank<-read.csv("bank.csv",header = TRUE, na.string = c("", " "))
str(Bank)
## 'data.frame': 11162 obs. of 17 variables:
## $ age : int 59 56 41 55 54 42 56 60 37 28 ...
## $ job : Factor w/ 12 levels "admin.","blue-collar",..: 1 1 10 8 1 5 5 6 10 8 ...
## $ marital : Factor w/ 3 levels "divorced","married",..: 2 2 2 2 2 3 2 1 2 3 ...
## $ education: Factor w/ 4 levels "primary","secondary",..: 2 2 2 2 3 3 3 2 2 2 ...
## $ default : Factor w/ 2 levels "no","yes": 1 1 1 1 1 1 1 1 1 1 ...
## $ balance : int 2343 45 1270 2476 184 0 830 545 1 5090 ...
## $ housing : Factor w/ 2 levels "no","yes": 2 1 2 2 1 2 2 2 2 2 ...
## $ loan : Factor w/ 2 levels "no","yes": 1 1 1 1 1 2 2 1 1 1 ...
## $ contact : Factor w/ 3 levels "cellular","telephone",..: 3 3 3 3 3 3 3 3 3 3 ...
## $ day : int 5 5 5 5 5 5 6 6 6 6 ...
## $ month : Factor w/ 12 levels "apr","aug","dec",..: 9 9 9 9 9 9 9 9 9 9 ...
## $ duration : int 1042 1467 1389 579 673 562 1201 1030 608 1297 ...
## $ campaign : int 1 1 1 1 2 2 1 1 1 3 ...
## $ pdays : int -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 ...
## $ previous : int 0 0 0 0 0 0 0 0 0 0 ...
## $ poutcome : Factor w/ 4 levels "failure","other",..: 4 4 4 4 4 4 4 4 4 4 ...
## $ deposit : Factor w/ 2 levels "no","yes": 2 2 2 2 2 2 2 2 2 2 ...
Bank_Flow <-Bank
Finding Missing Values
colSums(is.na(Bank))
## age job marital education default balance housing loan
## 0 0 0 0 0 0 0 0
## contact day month duration campaign pdays previous poutcome
## 0 0 0 0 0 0 0 0
## deposit
## 0
Outliers
boxplot(Bank)

Outliers-Age
Box_Age <-boxplot(Bank$age, main="Age")

Upper_side_remove_age <-quantile(Bank$age, 0.75)+1.5*IQR(Bank$age)
Bank[Bank$age > round(Upper_side_remove_age),"age"] <-round(Upper_side_remove_age)
Bank$age_2<-trunc(Bank$age/10)*10
View(Bank)
Bank$age_group <-ifelse(Bank$age >= 18 & Bank$age <40, "Adult",
ifelse(Bank$age >=40 & Bank$age < 60, "Middle Age",
ifelse(Bank$age >=60, "Older","Child")))
Bank$age_group <-as.factor(Bank$age_group)
table(Bank$age_group) # Categorical Variable using table (Freqency)
##
## Adult Middle Age Older
## 5869 4513 780
boxplot(Bank$age, main="Age")

Outlier _ Duration 나누기 60.
boxplot(Bank$duration/60, main="Duration")

Upper_outlier_duration <-quantile(Bank$duration, 0.75)+1.5*IQR(Bank$duration)
Bank[Bank$duration > round(Upper_outlier_duration),"duration"] <-round(Upper_outlier_duration)
boxplot(Bank$duration, main="Duration")

Data Engineering
Job
summary(Bank$job)
## admin. blue-collar entrepreneur housemaid management
## 1334 1944 328 274 2566
## retired self-employed services student technician
## 778 405 923 360 1823
## unemployed unknown
## 357 70
Bank$job <- ifelse(Bank$job == 'admin.' | Bank$job == 'management', 0,
ifelse(Bank$job == 'blue-collar' |Bank$job == 'technician', 1,
ifelse(Bank$job == 'entrepreneur' | Bank$job == 'self-employed', 2,
ifelse(Bank$job == 'housemaid' | Bank$job == 'services', 3,
ifelse(Bank$job == 'retired', 4,
ifelse(Bank$job == 'student', 5,
ifelse(Bank$job == 'unemployed', 6, 7)))))))
table(Bank$job)
##
## 0 1 2 3 4 5 6 7
## 3900 3767 733 1197 778 360 357 70
Marital
summary(Bank$marital)
## divorced married single
## 1293 6351 3518
# 이혼 0, 결혼 1, 싱글 : 2
Bank$marital <-ifelse(Bank$marital=='divorced',0,
ifelse(Bank$marital == 'married',1, 2))
table(Bank$marital)
##
## 0 1 2
## 1293 6351 3518
Duration
summary(Bank$duration_group)
## Length Class Mode
## 0 NULL NULL
Bank$duration_group <-round(Bank$duration/60,0)
Bank$duration_group <-ifelse(Bank$duration < 5, "1",
ifelse(Bank$duration >=5 & Bank$duration <10,"2",
ifelse(Bank$duration>=10 & Bank$duration <15,"3","4")))
Bank$duration_group <-as.factor(Bank$duration_group)
table(Bank$duration_group)
##
## 1 2 3 4
## 4 52 69 11037
Month_Data >> Numeric factor
Bank$month <- ifelse(Bank$month == 'apr', 4,
ifelse(Bank$month == 'aug', 8,
ifelse(Bank$month == 'dec', 12,
ifelse(Bank$month == 'feb', 2,
ifelse(Bank$month == 'jan', 1,
ifelse(Bank$month == 'jul', 7,
ifelse(Bank$month == 'jun', 6,
ifelse(Bank$month == 'mar', 3,
ifelse(Bank$month == 'may', 5,
ifelse(Bank$month == 'dec', 12,
ifelse(Bank$month == 'oct', 10,
ifelse(Bank$month == 'nov', 11, 9))))))))))))
table(Bank$month)
##
## 1 2 3 4 5 6 7 8 9 10 11 12
## 344 776 276 923 2824 1222 1514 1519 319 392 943 110
Education
summary(Bank$education)
## primary secondary tertiary unknown
## 1500 5476 3689 497
Bank$education <-ifelse(Bank$education == 'primary',0,
ifelse(Bank$education == 'secondary',1,
ifelse(Bank$education == 'tertiary',2,3)))
table(Bank$education)
##
## 0 1 2 3
## 1500 5476 3689 497
Default
Bank$default <-ifelse(Bank$default=='no',0,1)
table(Bank$default)
##
## 0 1
## 10994 168
Visualization
Deposit Rate
Deposit<-Bank%>%
group_by(deposit)%>%
summarise(cnt=n())%>%
mutate(Percentage=round(cnt/sum(cnt)*100))
Deposit
## # A tibble: 2 x 3
## deposit cnt Percentage
## <fct> <int> <dbl>
## 1 no 5873 53
## 2 yes 5289 47
ggplot(Deposit, aes(x=deposit, y=Percentage))+
geom_bar(stat = "identity", fill="lightgreen", col="black")+
geom_label(data=Deposit, aes(x=deposit, y=Percentage, label=paste0(Percentage, "%"),
group=deposit))+
theme_bw()

Age
Bank$age <-as.numeric(Bank$age)
ggplot(Bank, aes(age, fill=deposit))+
geom_bar()+
ggtitle("Deposit by Age")+
theme_economist_white()

ggplot(Bank, aes(x=age_group, fill=deposit))+
geom_bar(stat="count",position="fill")+
facet_wrap(~job)

ggplot(Bank, aes(x=age_group, fill=deposit))+
geom_bar(position="stack")+
scale_fill_brewer(palette = "Set1")+
theme_economist_white()+
geom_hline(yintercept = mean(Bank$deposit))+
ggtitle("Deposite rate by Age Groups")

- Depsotie Yes + Age_ Groups (Older Middling Adults) - Bar Chart
Bank$age_group<-as.factor(Bank$age_group)
age_group_deposit<-xtabs(~age_group + deposit, data=Bank)
age_group_deposit<-as.data.frame(age_group_deposit)
age_group_deposit_yes <-subset(age_group_deposit, age_group_deposit$deposit=="yes")
kable(age_group_deposit_yes)
| 4 |
Adult |
yes |
2841 |
| 5 |
Middle Age |
yes |
1848 |
| 6 |
Older |
yes |
600 |
ggplot(age_group_deposit_yes, aes(x=age_group, y=Freq))+
geom_bar(stat = "identity", fill="green", col="black")+
coord_flip()+
geom_label(data=age_group_deposit_yes, aes(x=age_group, y=Freq, label=paste0(Freq,""),
group=deposit), position=position_stack(vjust=0.5))

*Depsotie Yes + Age_ Groups (Older Middling Adults) - Pie Chart
ggplot(age_group_deposit_yes, aes(x="", y=Freq, fill=factor(age_group)))+
geom_bar(width = 1, stat = "identity")+
theme(axis.line = element_blank(),
plot.title = element_text(hjust = 0.5)) +
labs(fill = "age", x = NULL, y = NULL,
title = "Deposit Yes Age_group Contribution") +
coord_polar(theta = "y", start = 0)+
theme_economist()
## Job & Deposit
ggplot(Bank, aes(job, fill=deposit))+
geom_bar(position = "stack")+
ylab("Customers")+
theme_minimal()

Marital=3 & Deposit
ggplot(Bank,aes(marital, fill=deposit))+
geom_bar(position = "fill")+
ylab("Customer")+
theme_minimal()

Educcation & Deposit
ggplot(Bank, aes(education, fill=deposit))+
geom_bar(position = "stack")+
scale_fill_brewer(palette = "Set2")+
theme_minimal()

Default & Deposit
ggplot(Bank, aes(default, fill=deposit))+
geom_bar(position = "stack")+
scale_fill_brewer(palette = "Set2")

####Deposit rate by default and eudacation
ggplot(Bank, aes(default, fill=deposit))+
geom_bar(position="stack")+
facet_wrap(~education)

Balance & Deposit
ggplot(Bank, aes(balance))+geom_histogram(aes(fill=deposit), color="black")+
ggtitle("Blance VS Deposit")+
theme_economist()

ggplot(Bank, aes(x=deposit, y=balance))+
geom_point(stat="identity")+
theme_minimal()+
coord_flip()

Housing (Yes, No) & Deposit
Housing_Data<-Bank%>%
group_by(housing, deposit)%>%
summarise(count=n())%>%
mutate(Percent= round(count/sum(count)*100))
kable(Housing_Data)
| 0 |
no |
2527 |
43 |
| 0 |
yes |
3354 |
57 |
| 1 |
no |
3346 |
63 |
| 1 |
yes |
1935 |
37 |
ggplot(Housing_Data, aes(x=housing,y=Percent, fill=factor(deposit)))+
geom_bar(stat= "identity", position = "stack")+
geom_label(data=Housing_Data, aes(x=housing, y=Percent,label=paste0(Percent,"%"),
group=deposit), position = position_stack(vjust=0.5))+
theme_minimal()

Loan (Yes, No) - Deposite
Loan_Data<-Bank%>%
group_by(loan, deposit)%>%
summarise(count=n())%>%
mutate(Percent=round(count/sum(count)*100))
kable(Loan_Data, format = "markdown")
| 0 |
no |
4897 |
50 |
| 0 |
yes |
4805 |
50 |
| 1 |
no |
976 |
67 |
| 1 |
yes |
484 |
33 |
ggplot(Loan_Data, aes(x=loan,y=Percent, fill=factor(deposit)))+
geom_bar(stat= "identity", position = "stack")+
geom_label(data=Loan_Data, aes(x=loan, y=Percent,label=paste0(Percent,"%"),
group=deposit), position = position_stack(vjust=0.5))+
theme_minimal()

#- admin., management : 0
#- blue-collar, technician : 1
#- entrepreneur, self-employed : 2
#- housemaid, services : 3
#- retired : 4
#- student : 5
#- unemployed : 6
#- unknown : 7
Bank$job_name <- factor(Bank$job, level= c("0", "1","2","3","4","5","6","7"),
labels = c("'Admin & Management ",
"blue-collar, technician",
"entrepreneur, self-employed",
"housemaid, services",
"retired",
"student",
"unemployed",
"unknown"))
ggplot(Bank, aes(x=loan, fill=deposit))+
geom_bar(position="fill", col="black")+
facet_wrap(~job_name)+
scale_fill_brewer(palette = "Set1")+
theme_minimal()

Month - Deposite
Bank$month <-as.factor(Bank$month)
ggplot(Bank, aes(x=month, fill=deposit))+
geom_bar(position="stack")+
scale_fill_brewer(palette = "Set2")+
theme_minimal()

Month_deposite <-Bank%>%
group_by(deposit, month)%>% #groupby 함수 사용시 , y~ x
summarise(count=n())%>%
mutate(prob=count/sum(count))
kable(Month_deposite)
| no |
1 |
202 |
0.0343947 |
| no |
2 |
335 |
0.0570407 |
| no |
3 |
28 |
0.0047676 |
| no |
4 |
346 |
0.0589137 |
| no |
5 |
1899 |
0.3233441 |
| no |
6 |
676 |
0.1151030 |
| no |
7 |
887 |
0.1510301 |
| no |
8 |
831 |
0.1414950 |
| no |
9 |
50 |
0.0085135 |
| no |
10 |
69 |
0.0117487 |
| no |
11 |
540 |
0.0919462 |
| no |
12 |
10 |
0.0017027 |
| yes |
1 |
142 |
0.0268482 |
| yes |
2 |
441 |
0.0833806 |
| yes |
3 |
248 |
0.0468898 |
| yes |
4 |
577 |
0.1090943 |
| yes |
5 |
925 |
0.1748913 |
| yes |
6 |
546 |
0.1032331 |
| yes |
7 |
627 |
0.1185479 |
| yes |
8 |
688 |
0.1300813 |
| yes |
9 |
269 |
0.0508603 |
| yes |
10 |
323 |
0.0610701 |
| yes |
11 |
403 |
0.0761959 |
| yes |
12 |
100 |
0.0189072 |
ggplot(Month_deposite, aes(x=month, y=prob, colour=deposit))+
geom_point(data=Month_deposite, aes(group=deposit),size=3)+
geom_line(data=Month_deposite, aes(group=deposit))+
scale_x_discrete("Month")+
theme_minimal()

Campaine (이산형) - Deposite
ggplot(Bank, aes(x=campaign, fill=deposit))+
geom_bar(position = "stack")

Top_compaign <- Bank[order(Bank$campaign, decreasing = T),]%>%
slice(1:20)
kable(Top_compaign)
| 45 |
0 |
1 |
3 |
0 |
9051 |
1 |
0 |
2 |
19 |
5 |
124 |
63 |
-1 |
0 |
0 |
no |
40 |
Middle Age |
4 |
’Admin & Management |
| 51 |
1 |
1 |
3 |
0 |
41 |
1 |
0 |
1 |
9 |
7 |
16 |
43 |
-1 |
0 |
0 |
no |
50 |
Middle Age |
4 |
blue-collar, technician |
| 33 |
1 |
1 |
1 |
0 |
0 |
1 |
1 |
0 |
31 |
7 |
16 |
43 |
-1 |
0 |
0 |
no |
30 |
Adult |
4 |
blue-collar, technician |
| 42 |
1 |
1 |
0 |
0 |
170 |
1 |
0 |
2 |
19 |
5 |
51 |
41 |
-1 |
0 |
0 |
no |
40 |
Middle Age |
4 |
blue-collar, technician |
| 49 |
3 |
0 |
1 |
0 |
933 |
0 |
1 |
0 |
29 |
7 |
92 |
33 |
-1 |
0 |
0 |
no |
40 |
Middle Age |
4 |
housemaid, services |
| 50 |
2 |
1 |
0 |
0 |
461 |
1 |
0 |
2 |
15 |
5 |
651 |
32 |
-1 |
0 |
0 |
yes |
50 |
Middle Age |
4 |
entrepreneur, self-employed |
| 42 |
2 |
1 |
2 |
0 |
1932 |
1 |
0 |
2 |
16 |
5 |
192 |
32 |
-1 |
0 |
0 |
no |
40 |
Middle Age |
4 |
entrepreneur, self-employed |
| 31 |
3 |
1 |
3 |
0 |
-82 |
1 |
0 |
0 |
31 |
7 |
543 |
31 |
-1 |
0 |
0 |
no |
30 |
Adult |
4 |
housemaid, services |
| 44 |
3 |
0 |
1 |
0 |
608 |
1 |
1 |
2 |
18 |
6 |
19 |
30 |
-1 |
0 |
0 |
no |
40 |
Middle Age |
4 |
housemaid, services |
| 51 |
3 |
1 |
1 |
0 |
7 |
1 |
0 |
0 |
31 |
7 |
8 |
30 |
-1 |
0 |
0 |
no |
50 |
Middle Age |
2 |
housemaid, services |
| 33 |
1 |
2 |
2 |
0 |
34 |
0 |
0 |
0 |
28 |
8 |
13 |
30 |
-1 |
0 |
0 |
no |
30 |
Adult |
3 |
blue-collar, technician |
| 35 |
0 |
1 |
2 |
0 |
3168 |
0 |
0 |
0 |
22 |
8 |
24 |
30 |
-1 |
0 |
0 |
no |
30 |
Adult |
4 |
’Admin & Management |
| 46 |
3 |
1 |
1 |
0 |
271 |
1 |
0 |
0 |
30 |
7 |
1013 |
29 |
-1 |
0 |
0 |
yes |
40 |
Middle Age |
4 |
housemaid, services |
| 58 |
0 |
1 |
1 |
0 |
1464 |
1 |
1 |
2 |
5 |
6 |
53 |
29 |
-1 |
0 |
0 |
no |
50 |
Middle Age |
4 |
’Admin & Management |
| 38 |
1 |
0 |
1 |
0 |
1686 |
1 |
0 |
2 |
19 |
6 |
9 |
28 |
-1 |
0 |
0 |
no |
30 |
Adult |
2 |
blue-collar, technician |
| 32 |
1 |
2 |
2 |
0 |
723 |
0 |
0 |
0 |
28 |
8 |
68 |
27 |
-1 |
0 |
0 |
no |
30 |
Adult |
4 |
blue-collar, technician |
| 24 |
5 |
2 |
0 |
0 |
0 |
0 |
0 |
2 |
5 |
6 |
25 |
26 |
-1 |
0 |
0 |
no |
20 |
Adult |
4 |
student |
| 26 |
1 |
2 |
1 |
0 |
0 |
0 |
1 |
0 |
30 |
7 |
65 |
26 |
-1 |
0 |
0 |
no |
20 |
Adult |
4 |
blue-collar, technician |
| 38 |
0 |
0 |
2 |
0 |
155 |
1 |
0 |
2 |
20 |
6 |
25 |
26 |
-1 |
0 |
0 |
no |
30 |
Adult |
4 |
’Admin & Management |
| 47 |
1 |
1 |
2 |
1 |
-813 |
1 |
1 |
2 |
16 |
6 |
179 |
25 |
-1 |
0 |
0 |
no |
40 |
Middle Age |
4 |
blue-collar, technician |
ggplot(Top_compaign,aes(x=education, fill=deposit))+
geom_bar(position="stack")

Flow Graph
Bank_Flow%>%
count(marital, education, default, deposit)%>%
mutate(marital=fct_relevel(as.factor(marital), c("divorced","married","single")))%>%
mutate(education=fct_relevel(as.factor(education), c("primary","secondary","tertiary","unknown")))%>%
mutate(default=fct_relevel(as.factor(default), c("no","yes")))%>%
mutate(deposit=fct_relevel(as.factor(deposit), c("no","yes")))%>%
ggplot(aes(axis1=marital, axis2=education, axis3=default, y=deposit))+
geom_alluvium(aes(fill=deposit), aes.bind=TRUE, knot.pos=1/6)+
geom_stratum(width = 1/3, fill = "white", color = "black")+
geom_text(stat = "stratum", label.strata = TRUE)+
scale_x_discrete(limits = c("marital", "education", "default"), expand = c(.05, .05))
