# Load the semicolon-delimited bank data set and preview its first six rows.
bank <- read.csv(file = "bank-full.csv", sep = ";")
head(bank, n = 6L)
##   age          job marital education default balance housing loan contact day
## 1  58   management married  tertiary      no    2143     yes   no unknown   5
## 2  44   technician  single secondary      no      29     yes   no unknown   5
## 3  33 entrepreneur married secondary      no       2     yes  yes unknown   5
## 4  47  blue-collar married   unknown      no    1506     yes   no unknown   5
## 5  33      unknown  single   unknown      no       1      no   no unknown   5
## 6  35   management married  tertiary      no     231     yes   no unknown   5
##   month duration campaign pdays previous poutcome  y
## 1   may      261        1    -1        0  unknown no
## 2   may      151        1    -1        0  unknown no
## 3   may       76        1    -1        0  unknown no
## 4   may       92        1    -1        0  unknown no
## 5   may      198        1    -1        0  unknown no
## 6   may      139        1    -1        0  unknown no
# Compact overview of every column: name, type and the first few values.
str(object = bank)
## 'data.frame':    45211 obs. of  17 variables:
##  $ age      : int  58 44 33 47 33 35 28 42 58 43 ...
##  $ job      : chr  "management" "technician" "entrepreneur" "blue-collar" ...
##  $ marital  : chr  "married" "single" "married" "married" ...
##  $ education: chr  "tertiary" "secondary" "secondary" "unknown" ...
##  $ default  : chr  "no" "no" "no" "no" ...
##  $ balance  : int  2143 29 2 1506 1 231 447 2 121 593 ...
##  $ housing  : chr  "yes" "yes" "yes" "yes" ...
##  $ loan     : chr  "no" "no" "yes" "no" ...
##  $ contact  : chr  "unknown" "unknown" "unknown" "unknown" ...
##  $ day      : int  5 5 5 5 5 5 5 5 5 5 ...
##  $ month    : chr  "may" "may" "may" "may" ...
##  $ duration : int  261 151 76 92 198 139 217 380 50 55 ...
##  $ campaign : int  1 1 1 1 1 1 1 1 1 1 ...
##  $ pdays    : int  -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 ...
##  $ previous : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ poutcome : chr  "unknown" "unknown" "unknown" "unknown" ...
##  $ y        : chr  "no" "no" "no" "no" ...
# Per-column summary (quantiles for numeric columns, length/class for character ones).
summary(object = bank)
##       age            job              marital           education        
##  Min.   :18.00   Length:45211       Length:45211       Length:45211      
##  1st Qu.:33.00   Class :character   Class :character   Class :character  
##  Median :39.00   Mode  :character   Mode  :character   Mode  :character  
##  Mean   :40.94                                                           
##  3rd Qu.:48.00                                                           
##  Max.   :95.00                                                           
##    default             balance         housing              loan          
##  Length:45211       Min.   : -8019   Length:45211       Length:45211      
##  Class :character   1st Qu.:    72   Class :character   Class :character  
##  Mode  :character   Median :   448   Mode  :character   Mode  :character  
##                     Mean   :  1362                                        
##                     3rd Qu.:  1428                                        
##                     Max.   :102127                                        
##    contact               day           month              duration     
##  Length:45211       Min.   : 1.00   Length:45211       Min.   :   0.0  
##  Class :character   1st Qu.: 8.00   Class :character   1st Qu.: 103.0  
##  Mode  :character   Median :16.00   Mode  :character   Median : 180.0  
##                     Mean   :15.81                      Mean   : 258.2  
##                     3rd Qu.:21.00                      3rd Qu.: 319.0  
##                     Max.   :31.00                      Max.   :4918.0  
##     campaign          pdays          previous          poutcome        
##  Min.   : 1.000   Min.   : -1.0   Min.   :  0.0000   Length:45211      
##  1st Qu.: 1.000   1st Qu.: -1.0   1st Qu.:  0.0000   Class :character  
##  Median : 2.000   Median : -1.0   Median :  0.0000   Mode  :character  
##  Mean   : 2.764   Mean   : 40.2   Mean   :  0.5803                     
##  3rd Qu.: 3.000   3rd Qu.: -1.0   3rd Qu.:  0.0000                     
##  Max.   :63.000   Max.   :871.0   Max.   :275.0000                     
##       y            
##  Length:45211      
##  Class :character  
##  Mode  :character  
##                    
##                    
## 
library(ggplot2)
library(dplyr)
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
library(ggforce)
library(tidyverse)
## ── Attaching packages ─────────────────────────────────────── tidyverse 1.3.0 ──
## ✓ tibble  3.0.5     ✓ purrr   0.3.4
## ✓ tidyr   1.1.3     ✓ stringr 1.4.0
## ✓ readr   1.4.0     ✓ forcats 0.5.1
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## x dplyr::filter() masks stats::filter()
## x dplyr::lag()    masks stats::lag()
# Histogram of customer ages in 4-year-wide bins, with a yellow vertical line
# at the mean age.
# `bins = 35` was removed: ggplot2 ignores `bins` whenever `binwidth` is
# supplied, so keeping both only obscured which setting was in effect.
ggplot(data = bank, aes(x = age)) +
  geom_histogram(binwidth = 4, color = "black", fill = "maroon") +
  labs(
    title = "Customer Age Distribution",
    x = "Age(years)",
    y = "Count",
    subtitle = "yellow line indicate average Age"
  ) +
  scale_x_continuous(breaks = seq(0, 100, 10)) +
  geom_vline(xintercept = mean(bank$age), color = "yellow", linetype = 5)

# Number of customers per marital status, split by the yes/no `housing` flag.
# `housing` is a character column, so it cannot serve as a bar height; the
# rows are counted with geom_bar() and `housing` is mapped to the fill colour,
# with the yes/no bars placed side by side for comparison.
ggplot(bank, aes(x = marital, fill = housing)) +
  geom_bar(position = "dodge") +
  ggtitle("Housing and Marital")

# Number of customers per education level, split by the outcome column `y`.
# NOTE(review): `y` is presumed to be the term-deposit subscription flag that
# the title and legend refer to -- confirm against the data dictionary.
# Without a `fill` mapping the legend configured in guides() was never drawn
# and the plot showed education counts only.
ggplot(data = bank, aes(x = education, fill = y)) +
  geom_bar(position = "dodge") +
  ggtitle("Subscription based on Education Level") +
  xlab(" Education Level") +
  guides(fill = guide_legend(title = "Subscription of Term Deposit"))

# Scatter plot of `balance` against `duration`, drawn as open purple circles,
# with one panel (column) per value of `y`.
ggplot(data = bank, mapping = aes(x = duration, y = balance)) +
  geom_point(shape = 1, colour = "purple") +
  facet_grid(. ~ y) +
  labs(title = "Duration VS Balance")

# Preview the data grouped by education and job. The grouped tibble is only
# printed, not assigned, so `bank` itself is left unchanged.
group_by(bank, education, job)
## # A tibble: 45,211 x 17
## # Groups:   education, job [48]
##      age job   marital education default balance housing loan  contact   day
##    <int> <chr> <chr>   <chr>     <chr>     <int> <chr>   <chr> <chr>   <int>
##  1    58 mana… married tertiary  no         2143 yes     no    unknown     5
##  2    44 tech… single  secondary no           29 yes     no    unknown     5
##  3    33 entr… married secondary no            2 yes     yes   unknown     5
##  4    47 blue… married unknown   no         1506 yes     no    unknown     5
##  5    33 unkn… single  unknown   no            1 no      no    unknown     5
##  6    35 mana… married tertiary  no          231 yes     no    unknown     5
##  7    28 mana… single  tertiary  no          447 yes     yes   unknown     5
##  8    42 entr… divorc… tertiary  yes           2 yes     no    unknown     5
##  9    58 reti… married primary   no          121 yes     no    unknown     5
## 10    43 tech… single  secondary no          593 yes     no    unknown     5
## # … with 45,201 more rows, and 7 more variables: month <chr>, duration <int>,
## #   campaign <int>, pdays <int>, previous <int>, poutcome <chr>, y <chr>
# Bars of `balance` per education level, dodged and coloured by job.
# NOTE(review): every row is drawn as its own bar, so bars sharing the same
# education/job combination are painted over each other -- confirm this is
# the intended view rather than an aggregate such as the mean.
ggplot(data = bank, mapping = aes(x = education, y = balance, fill = job)) +
  geom_col(position = "dodge") +
  scale_fill_brewer(palette = "Paired")

# Bucket customers into age bands and count them per band and `loan` value.
# NOTE: this replaces the customer-level `bank` data frame with the count
# table (columns: age_group, loan, n).
#
# cut() with right-closed intervals reproduces the previous bounds exactly
# ((20, 25] -> "20-25", ..., (55, Inf) -> "above 55") and adds an explicit
# band for ages of 20 and below, which previously matched no rule and were
# reported as NA. The result is a factor, so the bands keep this order in
# tables, facets and axes.
bank <- bank %>%
  mutate(
    age_group = cut(
      age,
      breaks = c(-Inf, 20, 25, 30, 35, 40, 45, 50, 55, Inf),
      labels = c(
        "20 and under", "20-25", "25-30", "30-35", "35-40",
        "40-45", "45-50", "50-55", "above 55"
      ),
      right = TRUE
    )
  ) %>%
  count(age_group, loan)

# Compute the slice geometry for one pie per age group.
# `bank` is expected to hold one row per age_group/loan combination with its
# count in `n`. Within each age group the slices are laid out cumulatively
# around the full circle (2 * pi radians).
# The group total is computed with a grouped mutate instead of joining the
# table to a summary of itself, which avoids the implicit join key.
bank_pies <- bank %>%
  group_by(age_group) %>%
  mutate(
    n_total = sum(n),                                    # customers in this age group
    end_angle = 2 * pi * cumsum(n) / n_total,            # ending angle of each slice
    start_angle = dplyr::lag(end_angle, default = 0),    # starting angle of each slice
    mid_angle = 0.5 * (start_angle + end_angle)          # slice centre, used for the label
  ) %>%
  ungroup()
## Joining, by = "age_group"
# Pie geometry: the slice radius and the radius at which count labels sit.
rpie <- 1
# 0.5 * rpie would centre each label radially; a slightly larger factor
# tends to read better.
rlabel <- 0.6 * rpie

# One pie per age group: slices are filled by `loan` and labelled with `n`.
ggplot(data = bank_pies) +
  geom_arc_bar(
    mapping = aes(
      x0 = 0, y0 = 0, r0 = 0, r = rpie,
      start = start_angle, end = end_angle, fill = loan
    )
  ) +
  geom_text(
    mapping = aes(
      x = rlabel * sin(mid_angle),
      y = rlabel * cos(mid_angle),
      label = n
    ),
    hjust = 0.5, vjust = 0.5
  ) +
  facet_wrap(vars(age_group)) +
  scale_x_continuous(name = "", breaks = NULL, labels = NULL, limits = c(-1, 1)) +
  scale_y_continuous(name = "", breaks = NULL, labels = NULL, limits = c(-1, 1)) +
  coord_fixed() +
  labs(title = "Loan By Age Group")

# Show the current contents of `bank` (the age_group/loan count table).
print(bank)
##    age_group loan    n
## 1      20-25   no 1063
## 2      20-25  yes  176
## 3      25-30   no 4694
## 4      25-30  yes 1000
## 5      30-35   no 8327
## 6      30-35  yes 1550
## 7      35-40   no 6633
## 8      35-40  yes 1177
## 9      40-45   no 5061
## 10     40-45  yes  985
## 11     45-50   no 4277
## 12     45-50  yes  916
## 13     50-55   no 3549
## 14     50-55  yes  806
## 15  above 55   no 4269
## 16  above 55  yes  631
## 17      <NA>   no   94
## 18      <NA>  yes    3
# Box plot of account balance per age group.
# Earlier in this script `bank` is replaced by the age_group/loan count table,
# which has no `balance` column, so `bank$balance` evaluated to NULL and no
# balances were plotted. The customer-level data is therefore rebuilt here
# and the bare column name is mapped, as ggplot2 recommends.
bank_balance <- read.csv("bank-full.csv", sep = ";") %>%
  mutate(
    # Right-closed age bands: (20, 25] -> "20-25", ..., (55, Inf) -> "above 55".
    age_group = cut(
      age,
      breaks = c(-Inf, 20, 25, 30, 35, 40, 45, 50, 55, Inf),
      labels = c(
        "20 and under", "20-25", "25-30", "30-35", "35-40",
        "40-45", "45-50", "50-55", "above 55"
      ),
      right = TRUE
    )
  )

ggplot(bank_balance, aes(x = age_group, y = balance, fill = age_group)) +
  geom_boxplot() +
  theme(plot.title = element_text(size = 14)) +
  ggtitle("Balance by Age Group") +
  xlab("Years") +
  ylab("Balance")
## Warning: Use of `bank$balance` is discouraged. Use `balance` instead.

# Conclusions

The plots above suggest the following insights:

  1. Married customers show a stronger tendency toward housing than customers with other marital statuses.
  2. Most subscribing customers have a secondary education level, followed by those with tertiary and then primary education levels.
  3. Further insights can be drawn from the Duration VS Balance plot and the plots that follow it:
  1. The Balance, Job, and Education Level plot shows that the customer with the highest balance works in the management field and has a tertiary education level, while the customer with the lowest balance is a student with a primary education level.

  2. The Loan By Age Group plot segments loan customers by age: the 30-35 age group has the most borrowers, followed by adults aged 35 to 40, and then by the 25-30 age group.

  3. The last plot shows that older individuals are likely to have higher balances in their savings accounts.