# Read the semicolon-separated file into a data frame and show its first rows.
bank <- read.csv(file = "bank-full.csv", sep = ";")
head(bank, n = 6L)
## age job marital education default balance housing loan contact day
## 1 58 management married tertiary no 2143 yes no unknown 5
## 2 44 technician single secondary no 29 yes no unknown 5
## 3 33 entrepreneur married secondary no 2 yes yes unknown 5
## 4 47 blue-collar married unknown no 1506 yes no unknown 5
## 5 33 unknown single unknown no 1 no no unknown 5
## 6 35 management married tertiary no 231 yes no unknown 5
## month duration campaign pdays previous poutcome y
## 1 may 261 1 -1 0 unknown no
## 2 may 151 1 -1 0 unknown no
## 3 may 76 1 -1 0 unknown no
## 4 may 92 1 -1 0 unknown no
## 5 may 198 1 -1 0 unknown no
## 6 may 139 1 -1 0 unknown no
# Compact overview of every column: its type and first few values.
str(object = bank)
## 'data.frame': 45211 obs. of 17 variables:
## $ age : int 58 44 33 47 33 35 28 42 58 43 ...
## $ job : chr "management" "technician" "entrepreneur" "blue-collar" ...
## $ marital : chr "married" "single" "married" "married" ...
## $ education: chr "tertiary" "secondary" "secondary" "unknown" ...
## $ default : chr "no" "no" "no" "no" ...
## $ balance : int 2143 29 2 1506 1 231 447 2 121 593 ...
## $ housing : chr "yes" "yes" "yes" "yes" ...
## $ loan : chr "no" "no" "yes" "no" ...
## $ contact : chr "unknown" "unknown" "unknown" "unknown" ...
## $ day : int 5 5 5 5 5 5 5 5 5 5 ...
## $ month : chr "may" "may" "may" "may" ...
## $ duration : int 261 151 76 92 198 139 217 380 50 55 ...
## $ campaign : int 1 1 1 1 1 1 1 1 1 1 ...
## $ pdays : int -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 ...
## $ previous : int 0 0 0 0 0 0 0 0 0 0 ...
## $ poutcome : chr "unknown" "unknown" "unknown" "unknown" ...
## $ y : chr "no" "no" "no" "no" ...
# Per-column summary: quantiles and mean for numeric columns, length/class
# for character columns.
summary(object = bank)
## age job marital education
## Min. :18.00 Length:45211 Length:45211 Length:45211
## 1st Qu.:33.00 Class :character Class :character Class :character
## Median :39.00 Mode :character Mode :character Mode :character
## Mean :40.94
## 3rd Qu.:48.00
## Max. :95.00
## default balance housing loan
## Length:45211 Min. : -8019 Length:45211 Length:45211
## Class :character 1st Qu.: 72 Class :character Class :character
## Mode :character Median : 448 Mode :character Mode :character
## Mean : 1362
## 3rd Qu.: 1428
## Max. :102127
## contact day month duration
## Length:45211 Min. : 1.00 Length:45211 Min. : 0.0
## Class :character 1st Qu.: 8.00 Class :character 1st Qu.: 103.0
## Mode :character Median :16.00 Mode :character Median : 180.0
## Mean :15.81 Mean : 258.2
## 3rd Qu.:21.00 3rd Qu.: 319.0
## Max. :31.00 Max. :4918.0
## campaign pdays previous poutcome
## Min. : 1.000 Min. : -1.0 Min. : 0.0000 Length:45211
## 1st Qu.: 1.000 1st Qu.: -1.0 1st Qu.: 0.0000 Class :character
## Median : 2.000 Median : -1.0 Median : 0.0000 Mode :character
## Mean : 2.764 Mean : 40.2 Mean : 0.5803
## 3rd Qu.: 3.000 3rd Qu.: -1.0 3rd Qu.: 0.0000
## Max. :63.000 Max. :871.0 Max. :275.0000
## y
## Length:45211
## Class :character
## Mode :character
##
##
##
library(ggplot2)
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(ggforce)
library(tidyverse)
## ── Attaching packages ─────────────────────────────────────── tidyverse 1.3.0 ──
## ✓ tibble 3.0.5 ✓ purrr 0.3.4
## ✓ tidyr 1.1.3 ✓ stringr 1.4.0
## ✓ readr 1.4.0 ✓ forcats 0.5.1
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## x dplyr::filter() masks stats::filter()
## x dplyr::lag() masks stats::lag()
# Histogram of customer age with a yellow vertical line at the mean age.
# Only `binwidth` is given: when both `bins` and `binwidth` are supplied,
# ggplot2 uses `binwidth` and ignores `bins`, so the old `bins = 35` was dead.
ggplot(data = bank, aes(x = age)) +
  geom_histogram(binwidth = 4, color = "black", fill = "maroon") +
  labs(
    title = "Customer Age Distribution",
    subtitle = "yellow line indicate average Age",
    x = "Age(years)",
    y = "Count"
  ) +
  scale_x_continuous(breaks = seq(0, 100, 10)) +
  geom_vline(xintercept = mean(bank$age), color = "yellow", linetype = 5)
# Number of customers per marital status, split by housing value.
# `housing` is a yes/no text column, so it cannot serve as a bar height for
# geom_col(); instead rows are counted with geom_bar() and `housing` is mapped
# to the fill colour, with the yes/no bars placed side by side.
ggplot(bank, aes(x = marital, fill = housing)) +
  geom_bar(position = "dodge") +
  ggtitle("Housing and Marital")
# Number of customers per education level, stacked by the outcome column `y`.
# `y` is mapped to fill so the legend titled below actually appears; with a
# constant fill colour the guides(fill = ...) call had nothing to label.
ggplot(data = bank, aes(x = education, fill = y)) +
  geom_bar() +
  ggtitle("Subscription based on Education Level") +
  xlab(" Education Level") +
  guides(fill = guide_legend(title = "Subscription of Term Deposit"))
# Scatter plot of balance against duration, drawn as hollow purple circles,
# with one panel (column) per value of `y`.
ggplot(data = bank, mapping = aes(x = duration, y = balance)) +
  geom_point(shape = 1, color = "purple") +
  facet_grid(. ~ y) +
  labs(title = "Duration VS Balance")
# Print the data grouped by education and job. The grouped result is only
# displayed, not stored, so `bank` itself is left unchanged.
group_by(bank, education, job)
## # A tibble: 45,211 x 17
## # Groups: education, job [48]
## age job marital education default balance housing loan contact day
## <int> <chr> <chr> <chr> <chr> <int> <chr> <chr> <chr> <int>
## 1 58 mana… married tertiary no 2143 yes no unknown 5
## 2 44 tech… single secondary no 29 yes no unknown 5
## 3 33 entr… married secondary no 2 yes yes unknown 5
## 4 47 blue… married unknown no 1506 yes no unknown 5
## 5 33 unkn… single unknown no 1 no no unknown 5
## 6 35 mana… married tertiary no 231 yes no unknown 5
## 7 28 mana… single tertiary no 447 yes yes unknown 5
## 8 42 entr… divorc… tertiary yes 2 yes no unknown 5
## 9 58 reti… married primary no 121 yes no unknown 5
## 10 43 tech… single secondary no 593 yes no unknown 5
## # … with 45,201 more rows, and 7 more variables: month <chr>, duration <int>,
## # campaign <int>, pdays <int>, previous <int>, poutcome <chr>, y <chr>
# Balance by education level, with side-by-side bars coloured by job.
# Every row of `bank` is drawn as its own bar (no aggregation is done here).
ggplot(data = bank, mapping = aes(x = education, y = balance, fill = job)) +
  geom_col(position = "dodge") +
  scale_fill_brewer(palette = "Paired")
# Age bands used for the loan pies and the balance box plot.
# cut() builds right-closed intervals, so "20-25" means 20 < age <= 25, which
# matches the original band definitions. The open-ended first band captures
# customers aged 20 and under, who previously matched no band and were lumped
# into an NA group.
age_breaks <- c(-Inf, 20, 25, 30, 35, 40, 45, 50, 55, Inf)
age_labels <- c(
  "20 and under", "20-25", "25-30", "30-35", "35-40",
  "40-45", "45-50", "50-55", "above 55"
)

# Add the age band to the customer-level data. `bank` keeps all of its
# original columns (it is no longer replaced by the counts table) because the
# box plot at the end of this section needs `balance`.
bank <- bank %>%
  mutate(age_group = cut(age, breaks = age_breaks, labels = age_labels))

# Number of customers for each combination of age band and loan status.
loan_counts <- bank %>%
  count(age_group, loan)

# calculate the start and end angles for each pie
bank_pies <- loan_counts %>%
  group_by(age_group) %>%
  mutate(
    n_total = sum(n),                            # customers in this age band
    end_angle = 2 * pi * cumsum(n) / n_total,    # ending angle for each pie slice
    start_angle = lag(end_angle, default = 0),   # starting angle for each pie slice
    mid_angle = 0.5 * (start_angle + end_angle)  # middle of each slice, for the text label
  ) %>%
  ungroup()

rpie <- 1 # pie radius
# radius of the labels; a number slightly larger than 0.5 seems to work better,
# but 0.5 would place the label exactly in the middle of the slice.
rlabel <- 0.6 * rpie

# draw the pies
ggplot(bank_pies) +
  geom_arc_bar(
    aes(
      x0 = 0, y0 = 0, r0 = 0, r = rpie,
      start = start_angle, end = end_angle, fill = loan
    )
  ) +
  geom_text(
    aes(x = rlabel * sin(mid_angle), y = rlabel * cos(mid_angle), label = n),
    hjust = 0.5, vjust = 0.5
  ) +
  coord_fixed() +
  scale_x_continuous(limits = c(-1, 1), name = "", breaks = NULL, labels = NULL) +
  scale_y_continuous(limits = c(-1, 1), name = "", breaks = NULL, labels = NULL) +
  facet_wrap(~age_group) +
  ggtitle("Loan By Age Group")
loan_counts
## age_group loan n
## 1 20 and under no 94
## 2 20 and under yes 3
## 3 20-25 no 1063
## 4 20-25 yes 176
## 5 25-30 no 4694
## 6 25-30 yes 1000
## 7 30-35 no 8327
## 8 30-35 yes 1550
## 9 35-40 no 6633
## 10 35-40 yes 1177
## 11 40-45 no 5061
## 12 40-45 yes 985
## 13 45-50 no 4277
## 14 45-50 yes 916
## 15 50-55 no 3549
## 16 50-55 yes 806
## 17 above 55 no 4269
## 18 above 55 yes 631
# Distribution of balance within each age band. `balance` is referenced as a
# bare column name so it is looked up in the plot data rather than pulled from
# the global `bank` object with `$`.
ggplot(bank, aes(x = age_group, y = balance, fill = age_group)) +
  geom_boxplot() +
  theme(plot.title = element_text(size = 14)) +
  ggtitle("Balance by Age Group") +
  xlab("Years") +
  ylab("Balance")
#Conclusions
There are several insights we can draw from the plots:
The plot of balance by job and education level shows that the customer with the highest balance works in management and has a tertiary education, while the customer with the lowest balance is a student with a primary education.
The Loan by Age Group plot segments loan customers by age: borrowers aged 30-35 form the largest group of loan holders, followed by those aged 35-40 in second place and those aged 25-30 in third.
The last plot shows that older individuals are likely to have higher balances in their savings accounts.