library(tidyverse)
## -- Attaching packages ------------------------------------------------------------------------------- tidyverse 1.3.0 --
## v ggplot2 3.3.0 v purrr 0.3.3
## v tibble 2.1.3 v dplyr 0.8.5
## v tidyr 1.0.2 v stringr 1.4.0
## v readr 1.3.1 v forcats 0.5.0
## -- Conflicts ---------------------------------------------------------------------------------- tidyverse_conflicts() --
## x dplyr::filter() masks stats::filter()
## x dplyr::lag() masks stats::lag()
library(ggplot2)
library(corrplot)
## corrplot 0.84 loaded
# Read the bank CSV into a tibble; readr guesses the column types and echoes
# the specification it chose.
banks_Df <- read_csv(
  "C:/Users/HP/PycharmProjects/Complete_EDA/Data_1/bank.csv"
)
## Parsed with column specification:
## cols(
## age = col_double(),
## job = col_character(),
## marital = col_character(),
## education = col_character(),
## default = col_character(),
## balance = col_double(),
## housing = col_character(),
## loan = col_character(),
## contact = col_character(),
## day = col_double(),
## month = col_character(),
## duration = col_double(),
## campaign = col_double(),
## pdays = col_double(),
## previous = col_double(),
## poutcome = col_character(),
## y = col_double()
## )
## Note: replace the path above with the location of bank.csv on your own machine. If in doubt, right-click the file and copy its path, then change the backslashes to forward slashes after you've pasted it.
# Preview the first six rows to sanity-check the import.
banks_Df %>%
  head(n = 6L)
## # A tibble: 6 x 17
## age job marital education default balance housing loan contact day
## <dbl> <chr> <chr> <chr> <chr> <dbl> <chr> <chr> <chr> <dbl>
## 1 30 unem~ married primary no 1787 no no cellul~ 19
## 2 33 serv~ married secondary no 4789 yes yes cellul~ 11
## 3 35 mana~ single tertiary no 1350 yes no cellul~ 16
## 4 30 mana~ married tertiary no 1476 yes yes unknown 3
## 5 59 blue~ married secondary no 0 yes no unknown 5
## 6 35 mana~ single tertiary no 747 no no cellul~ 23
## # ... with 7 more variables: month <chr>, duration <dbl>, campaign <dbl>,
## # pdays <dbl>, previous <dbl>, poutcome <chr>, y <dbl>
# Transposed overview: one line per column with its type and first values.
banks_Df %>%
  glimpse()
## Observations: 4,521
## Variables: 17
## $ age <dbl> 30, 33, 35, 30, 59, 35, 36, 39, 41, 43, 39, 43, 36, 20, 3...
## $ job <chr> "unemployed", "services", "management", "management", "bl...
## $ marital <chr> "married", "married", "single", "married", "married", "si...
## $ education <chr> "primary", "secondary", "tertiary", "tertiary", "secondar...
## $ default <chr> "no", "no", "no", "no", "no", "no", "no", "no", "no", "no...
## $ balance <dbl> 1787, 4789, 1350, 1476, 0, 747, 307, 147, 221, -88, 9374,...
## $ housing <chr> "no", "yes", "yes", "yes", "yes", "no", "yes", "yes", "ye...
## $ loan <chr> "no", "yes", "no", "yes", "no", "no", "no", "no", "no", "...
## $ contact <chr> "cellular", "cellular", "cellular", "unknown", "unknown",...
## $ day <dbl> 19, 11, 16, 3, 5, 23, 14, 6, 14, 17, 20, 17, 13, 30, 29, ...
## $ month <chr> "oct", "may", "apr", "jun", "may", "feb", "may", "may", "...
## $ duration <dbl> 79, 220, 185, 199, 226, 141, 341, 151, 57, 313, 273, 113,...
## $ campaign <dbl> 1, 1, 1, 4, 1, 2, 1, 2, 2, 1, 1, 2, 2, 1, 1, 2, 5, 1, 1, ...
## $ pdays <dbl> -1, 339, 330, -1, -1, 176, 330, -1, -1, 147, -1, -1, -1, ...
## $ previous <dbl> 0, 4, 1, 0, 0, 3, 2, 0, 0, 2, 0, 0, 0, 0, 1, 0, 0, 2, 0, ...
## $ poutcome <chr> "unknown", "failure", "failure", "unknown", "unknown", "f...
## $ y <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, ...
# Per-column summary: quartiles and mean for numeric columns, length/class/mode
# for character columns.
banks_Df %>%
  summary()
## age job marital education
## Min. :19.00 Length:4521 Length:4521 Length:4521
## 1st Qu.:33.00 Class :character Class :character Class :character
## Median :39.00 Mode :character Mode :character Mode :character
## Mean :41.17
## 3rd Qu.:49.00
## Max. :87.00
## default balance housing loan
## Length:4521 Min. :-3313 Length:4521 Length:4521
## Class :character 1st Qu.: 69 Class :character Class :character
## Mode :character Median : 444 Mode :character Mode :character
## Mean : 1423
## 3rd Qu.: 1480
## Max. :71188
## contact day month duration
## Length:4521 Min. : 1.00 Length:4521 Min. : 4
## Class :character 1st Qu.: 9.00 Class :character 1st Qu.: 104
## Mode :character Median :16.00 Mode :character Median : 185
## Mean :15.92 Mean : 264
## 3rd Qu.:21.00 3rd Qu.: 329
## Max. :31.00 Max. :3025
## campaign pdays previous poutcome
## Min. : 1.000 Min. : -1.00 Min. : 0.0000 Length:4521
## 1st Qu.: 1.000 1st Qu.: -1.00 1st Qu.: 0.0000 Class :character
## Median : 2.000 Median : -1.00 Median : 0.0000 Mode :character
## Mean : 2.794 Mean : 39.77 Mean : 0.5426
## 3rd Qu.: 3.000 3rd Qu.: -1.00 3rd Qu.: 0.0000
## Max. :50.000 Max. :871.00 Max. :25.0000
## y
## Min. :0.0000
## 1st Qu.:0.0000
## Median :0.0000
## Mean :0.1152
## 3rd Qu.:0.0000
## Max. :1.0000
# Count missing (NA) cells across the whole data frame; a result of 0 means
# there is nothing to impute or drop.
banks_Df %>%
  is.na() %>%
  sum()
## [1] 0
Since our data is clean, the next step is to visualize the dataset; this will give us a gentle introduction to the relationships within the data.
# Bar chart of row counts per education level, one panel per value of `loan`.
banks_Df %>%
  ggplot(aes(x = education)) +
  geom_bar() +
  facet_wrap(vars(loan), ncol = 2)
# Bar chart of row counts per job, one panel per value of `loan`.
# The axes are flipped so the job labels are listed along the vertical axis.
banks_Df %>%
  ggplot(aes(x = job)) +
  geom_bar() +
  facet_wrap(vars(loan), ncol = 2) +
  coord_flip()
# Age distribution for each education level, one panel per value of `loan`.
# `education` is a character (categorical) column, so a line geometry and a
# linear-model smoother are not meaningful here: the line would join unrelated
# observations and the fit would run on arbitrary category positions.
# A box plot compares the age distributions across categories instead.
# Mapping education to x and age to y keeps the on-screen orientation of the
# earlier flipped plot (education horizontal, age vertical) without coord_flip().
banks_Df %>%
  ggplot(aes(x = education, y = age)) +
  geom_boxplot() +
  facet_wrap(~loan, ncol = 1)
Bivariate Relationships
cat("Respondents with Loans: \n")
## Respondents with Loans:
# Summary statistics of age for each job, restricted to rows with loan == "yes".
# by() is called inside with() so the printed group labels stay as `job: ...`.
banks_Df %>%
  filter(loan == "yes") %>%
  with(by(age, job, summary))
## job: admin.
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 22.00 32.50 39.00 39.49 46.00 60.00
## ------------------------------------------------------------
## job: blue-collar
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 24.00 33.00 38.00 39.21 45.00 58.00
## ------------------------------------------------------------
## job: entrepreneur
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 25.00 34.00 40.00 40.78 49.00 54.00
## ------------------------------------------------------------
## job: housemaid
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 27.00 42.00 54.00 49.31 58.00 60.00
## ------------------------------------------------------------
## job: management
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 25.00 33.00 40.00 41.17 49.25 60.00
## ------------------------------------------------------------
## job: retired
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 24.00 54.00 56.00 54.12 57.00 61.00
## ------------------------------------------------------------
## job: self-employed
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 29.00 37.50 49.50 45.83 54.50 60.00
## ------------------------------------------------------------
## job: services
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 25.00 32.00 38.00 39.38 46.75 57.00
## ------------------------------------------------------------
## job: student
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 22 22 22 22 22 22
## ------------------------------------------------------------
## job: technician
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 22.00 32.00 36.00 39.28 47.00 58.00
## ------------------------------------------------------------
## job: unemployed
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 25.00 33.00 43.00 40.23 45.00 52.00
## ------------------------------------------------------------
## job: unknown
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 50 50 50 50 50 50
cat("Relationship between age and the Target Variable: \n")
## Relationship between age and the Target Variable:
# Fit a simple linear model with age as the response and the 0/1 column y as
# the single predictor, then print the coefficient table and fit statistics.
lm(formula = age ~ y, data = banks_Df) %>%
  summary()
##
## Call:
## lm(formula = age ~ y, data = banks_Df)
##
## Residuals:
## Min 1Q Median 3Q Max
## -23.491 -7.998 -1.998 7.002 45.002
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 40.9980 0.1671 245.390 < 2e-16 ***
## y 1.4934 0.4922 3.034 0.00242 **
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 10.57 on 4519 degrees of freedom
## Multiple R-squared: 0.002033, Adjusted R-squared: 0.001812
## F-statistic: 9.207 on 1 and 4519 DF, p-value: 0.002425
# Pearson correlation matrix of the remaining numeric columns, restricted to
# rows with loan == "yes".
# `method` is passed to cor() here. A misplaced parenthesis previously put it
# inside subset(), where it was silently absorbed by `...` and never reached
# cor(). The duplicated `loan` entry in the exclusion list is also removed.
corr <- cor(
  subset(
    banks_Df,
    loan == "yes",
    select = -c(
      job, marital, education, default, housing, loan,
      contact, month, campaign, previous, poutcome
    )
  ),
  method = "pearson"
)
# Plot absolute values so the display reflects strength of association only;
# the signed matrix is printed underneath.
corrplot.mixed(abs(corr))
corr
## age balance day duration pdays
## age 1.00000000 0.06866649 0.03735216 -0.099959442 -0.023321190
## balance 0.06866649 1.00000000 0.04562914 -0.053782511 -0.012586861
## day 0.03735216 0.04562914 1.00000000 -0.020048853 -0.066080265
## duration -0.09995944 -0.05378251 -0.02004885 1.000000000 -0.005133647
## pdays -0.02332119 -0.01258686 -0.06608026 -0.005133647 1.000000000
## y -0.04364700 -0.01709895 0.01851498 0.487867499 0.111742887
## y
## age -0.04364700
## balance -0.01709895
## day 0.01851498
## duration 0.48786750
## pdays 0.11174289
## y 1.00000000
The end user may not understand the 1 and 0 outcome coding. Using the previous tutorials, recode the ‘Outcome’ feature (the `y` column) so that 0 becomes ‘No’ and 1 becomes ‘Yes’, and then adjust your code accordingly.