Banks EDA

library(tidyverse)

## -- Attaching packages ------------------------------------------------------------------------------- tidyverse 1.3.0 --

## v ggplot2 3.3.0     v purrr   0.3.3
## v tibble  2.1.3     v dplyr   0.8.5
## v tidyr   1.0.2     v stringr 1.4.0
## v readr   1.3.1     v forcats 0.5.0

## -- Conflicts ---------------------------------------------------------------------------------- tidyverse_conflicts() --
## x dplyr::filter() masks stats::filter()
## x dplyr::lag()    masks stats::lag()

library(ggplot2)
library(corrplot)

## corrplot 0.84 loaded

# Load the bank marketing data into a tibble. The path is machine-specific and
# must point at your own copy of bank.csv.
banks_Df <- read_csv("C:/Users/HP/PycharmProjects/Complete_EDA/Data_1/bank.csv")

## Parsed with column specification:
## cols(
##   age = col_double(),
##   job = col_character(),
##   marital = col_character(),
##   education = col_character(),
##   default = col_character(),
##   balance = col_double(),
##   housing = col_character(),
##   loan = col_character(),
##   contact = col_character(),
##   day = col_double(),
##   month = col_character(),
##   duration = col_double(),
##   campaign = col_double(),
##   pdays = col_double(),
##   previous = col_double(),
##   poutcome = col_character(),
##   y = col_double()
## )

## Note: replace the path above with the location of bank.csv on your own machine. If in doubt, right-click the file and copy its path, then change the backslashes to forward slashes after you've pasted it.

# Preview the first rows of the loaded data.
banks_Df %>% head()

## # A tibble: 6 x 17
##     age job   marital education default balance housing loan  contact   day
##   <dbl> <chr> <chr>   <chr>     <chr>     <dbl> <chr>   <chr> <chr>   <dbl>
## 1    30 unem~ married primary   no         1787 no      no    cellul~    19
## 2    33 serv~ married secondary no         4789 yes     yes   cellul~    11
## 3    35 mana~ single  tertiary  no         1350 yes     no    cellul~    16
## 4    30 mana~ married tertiary  no         1476 yes     yes   unknown     3
## 5    59 blue~ married secondary no            0 yes     no    unknown     5
## 6    35 mana~ single  tertiary  no          747 no      no    cellul~    23
## # ... with 7 more variables: month <chr>, duration <dbl>, campaign <dbl>,
## #   pdays <dbl>, previous <dbl>, poutcome <chr>, y <dbl>

# Compact column-by-column overview: name, type and leading values.
banks_Df %>% glimpse()

## Observations: 4,521
## Variables: 17
## $ age       <dbl> 30, 33, 35, 30, 59, 35, 36, 39, 41, 43, 39, 43, 36, 20, 3...
## $ job       <chr> "unemployed", "services", "management", "management", "bl...
## $ marital   <chr> "married", "married", "single", "married", "married", "si...
## $ education <chr> "primary", "secondary", "tertiary", "tertiary", "secondar...
## $ default   <chr> "no", "no", "no", "no", "no", "no", "no", "no", "no", "no...
## $ balance   <dbl> 1787, 4789, 1350, 1476, 0, 747, 307, 147, 221, -88, 9374,...
## $ housing   <chr> "no", "yes", "yes", "yes", "yes", "no", "yes", "yes", "ye...
## $ loan      <chr> "no", "yes", "no", "yes", "no", "no", "no", "no", "no", "...
## $ contact   <chr> "cellular", "cellular", "cellular", "unknown", "unknown",...
## $ day       <dbl> 19, 11, 16, 3, 5, 23, 14, 6, 14, 17, 20, 17, 13, 30, 29, ...
## $ month     <chr> "oct", "may", "apr", "jun", "may", "feb", "may", "may", "...
## $ duration  <dbl> 79, 220, 185, 199, 226, 141, 341, 151, 57, 313, 273, 113,...
## $ campaign  <dbl> 1, 1, 1, 4, 1, 2, 1, 2, 2, 1, 1, 2, 2, 1, 1, 2, 5, 1, 1, ...
## $ pdays     <dbl> -1, 339, 330, -1, -1, 176, 330, -1, -1, 147, -1, -1, -1, ...
## $ previous  <dbl> 0, 4, 1, 0, 0, 3, 2, 0, 0, 2, 0, 0, 0, 0, 1, 0, 0, 2, 0, ...
## $ poutcome  <chr> "unknown", "failure", "failure", "unknown", "unknown", "f...
## $ y         <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, ...

# Per-column summary statistics for the whole data set.
banks_Df %>% summary()

##       age            job              marital           education        
##  Min.   :19.00   Length:4521        Length:4521        Length:4521       
##  1st Qu.:33.00   Class :character   Class :character   Class :character  
##  Median :39.00   Mode  :character   Mode  :character   Mode  :character  
##  Mean   :41.17                                                           
##  3rd Qu.:49.00                                                           
##  Max.   :87.00                                                           
##    default             balance        housing              loan          
##  Length:4521        Min.   :-3313   Length:4521        Length:4521       
##  Class :character   1st Qu.:   69   Class :character   Class :character  
##  Mode  :character   Median :  444   Mode  :character   Mode  :character  
##                     Mean   : 1423                                        
##                     3rd Qu.: 1480                                        
##                     Max.   :71188                                        
##    contact               day           month              duration   
##  Length:4521        Min.   : 1.00   Length:4521        Min.   :   4  
##  Class :character   1st Qu.: 9.00   Class :character   1st Qu.: 104  
##  Mode  :character   Median :16.00   Mode  :character   Median : 185  
##                     Mean   :15.92                      Mean   : 264  
##                     3rd Qu.:21.00                      3rd Qu.: 329  
##                     Max.   :31.00                      Max.   :3025  
##     campaign          pdays           previous         poutcome        
##  Min.   : 1.000   Min.   : -1.00   Min.   : 0.0000   Length:4521       
##  1st Qu.: 1.000   1st Qu.: -1.00   1st Qu.: 0.0000   Class :character  
##  Median : 2.000   Median : -1.00   Median : 0.0000   Mode  :character  
##  Mean   : 2.794   Mean   : 39.77   Mean   : 0.5426                     
##  3rd Qu.: 3.000   3rd Qu.: -1.00   3rd Qu.: 0.0000                     
##  Max.   :50.000   Max.   :871.00   Max.   :25.0000                     
##        y         
##  Min.   :0.0000  
##  1st Qu.:0.0000  
##  Median :0.0000  
##  Mean   :0.1152  
##  3rd Qu.:0.0000  
##  Max.   :1.0000

# Count missing (NA) cells across every column; 0 means the data set is
# complete and needs no imputation.
banks_Df %>%
  is.na() %>%
  sum()

## [1] 0

Since our data is clean, the next step is to visualize the dataset. This will give us a first, gentle introduction to the relationships within the data.

# Bar chart of respondent counts per education level, drawn as one panel per
# value of `loan`, with the panels laid out two to a row.
banks_Df %>%
  ggplot(aes(x = education)) +
  geom_bar() +
  facet_wrap(~loan, ncol = 2)

# Bar chart of respondent counts per job, one panel per value of `loan`
# (two panels per row). The axes are flipped so the job categories run along
# the vertical axis.
banks_Df %>%
  ggplot(aes(x = job)) +
  geom_bar() +
  facet_wrap(~loan, ncol = 2) +
  coord_flip()

banks Distribution by Factors

# Age against education level, one panel per value of `loan` stacked in a
# single column, with the axes flipped. A line layer and a linear-model
# smoother are drawn on top of each other.
# NOTE(review): `education` is read as a character column, so the "lm"
# smoother is fitted against a discrete axis -- confirm this is the intended
# visual.
ggplot(banks_Df, aes(x = age, y = education)) +
  geom_line() +
  stat_smooth(method = "lm") +
  facet_wrap(~loan, ncol = 1) +
  coord_flip()

## `geom_smooth()` using formula 'y ~ x'

BiVariate Relationships

# Print a heading for the per-job age summaries of respondents with a loan.
cat("Respondents with Loans: \n", sep = "")

## Respondents with Loans:

# For respondents whose `loan` is "yes", print the summary statistics of age
# within each job category.
banks_Df %>%
  filter(loan == "yes") %>%
  with(by(age, job, summary))

## job: admin.
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##   22.00   32.50   39.00   39.49   46.00   60.00 
## ------------------------------------------------------------ 
## job: blue-collar
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##   24.00   33.00   38.00   39.21   45.00   58.00 
## ------------------------------------------------------------ 
## job: entrepreneur
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##   25.00   34.00   40.00   40.78   49.00   54.00 
## ------------------------------------------------------------ 
## job: housemaid
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##   27.00   42.00   54.00   49.31   58.00   60.00 
## ------------------------------------------------------------ 
## job: management
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##   25.00   33.00   40.00   41.17   49.25   60.00 
## ------------------------------------------------------------ 
## job: retired
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##   24.00   54.00   56.00   54.12   57.00   61.00 
## ------------------------------------------------------------ 
## job: self-employed
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##   29.00   37.50   49.50   45.83   54.50   60.00 
## ------------------------------------------------------------ 
## job: services
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##   25.00   32.00   38.00   39.38   46.75   57.00 
## ------------------------------------------------------------ 
## job: student
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##      22      22      22      22      22      22 
## ------------------------------------------------------------ 
## job: technician
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##   22.00   32.00   36.00   39.28   47.00   58.00 
## ------------------------------------------------------------ 
## job: unemployed
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##   25.00   33.00   43.00   40.23   45.00   52.00 
## ------------------------------------------------------------ 
## job: unknown
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##      50      50      50      50      50      50

Feature Relationship

Linear Regression: Quantifying the relationship between age and bank loans, especially for married respondents.

# Print a heading for the regression output that follows.
cat("Relationship between age and the Target Variable: \n", sep = "")

## Relationship between age and the Target Variable:

# Fit a simple linear model with age as the response and the 0/1 target `y`
# as the only predictor, then print the coefficient table and fit statistics.
lm(age ~ y, data = banks_Df) %>%
  summary()

## 
## Call:
## lm(formula = age ~ y, data = banks_Df)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -23.491  -7.998  -1.998   7.002  45.002 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  40.9980     0.1671 245.390  < 2e-16 ***
## y             1.4934     0.4922   3.034  0.00242 ** 
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 10.57 on 4519 degrees of freedom
## Multiple R-squared:  0.002033,   Adjusted R-squared:  0.001812 
## F-statistic: 9.207 on 1 and 4519 DF,  p-value: 0.002425

# Pearson correlation matrix for respondents whose `loan` is "yes", computed
# over the columns that remain after dropping the ones listed in `select`
# (age, balance, day, duration, pdays and y are kept).
# `method` is an argument of cor(), not subset(), so it is supplied to the
# outer call; subset() would silently ignore it through `...`.
corr <- cor(
  subset(
    banks_Df,
    loan == "yes",
    select = -c(job, marital, education, default, housing, loan, contact,
                month, campaign, previous, poutcome)
  ),
  method = "pearson"
)

# Plot the correlation matrix. abs() discards the sign, so the figure shows
# the strength of each correlation only, not its direction.
corr %>%
  abs() %>%
  corrplot.mixed()

# Print the signed correlation matrix for reference.
print(corr)

##                  age     balance         day     duration        pdays
## age       1.00000000  0.06866649  0.03735216 -0.099959442 -0.023321190
## balance   0.06866649  1.00000000  0.04562914 -0.053782511 -0.012586861
## day       0.03735216  0.04562914  1.00000000 -0.020048853 -0.066080265
## duration -0.09995944 -0.05378251 -0.02004885  1.000000000 -0.005133647
## pdays    -0.02332119 -0.01258686 -0.06608026 -0.005133647  1.000000000
## y        -0.04364700 -0.01709895  0.01851498  0.487867499  0.111742887
##                    y
## age      -0.04364700
## balance  -0.01709895
## day       0.01851498
## duration  0.48786750
## pdays     0.11174289
## y         1.00000000

ASSIGNMENT

The end user does not really understand the 1 and 0 outcome. Using the previous tutorials, adjust the ‘Outcome’ feature (column `y` in this dataset) so that 0 reads ‘No’ and 1 reads ‘Yes’, and then adjust your code accordingly.

Banks EDA

Kaggle

9/30/2020

banks Distribution by Factors

Feature Relationship

Linear Regression: Quantifying the relationship between age and bank loans, especially for married respondents.

ASSIGNMENT