Data Dictionary:
### bank client data:
1 - age (numeric; note: this column does not appear in the 16-variable data sets shown below)
2 - job : type of job (categorical:“admin.”,“unknown”,“unemployed”,“management”,“housemaid”,“entrepreneur”,“student”, “blue-collar”,“self-employed”,“retired”,“technician”,“services”)
3 - marital : marital status (categorical: “married”,“divorced”,“single”; note: “divorced” means divorced or widowed)
4 - education (categorical: “unknown”,“secondary”,“primary”,“tertiary”)
5 - default: has credit in default? (binary: “yes”,“no”)
6 - balance: average yearly balance, in euros (numeric)
7 - housing: has housing loan? (binary: “yes”,“no”)
8 - loan: has personal loan? (binary: “yes”,“no”)
### related with the last contact of the current campaign:
9 - contact: contact communication type (categorical: “unknown”,“telephone”,“cellular”)
10 - day: last contact day of the month (numeric)
11 - month: last contact month of year (categorical: “jan”, “feb”, “mar”, …, “nov”, “dec”)
12 - duration: last contact duration, in seconds (numeric)
#### other attributes:
13 - campaign: number of contacts performed during this campaign and for this client (numeric, includes last contact)
14 - pdays: number of days that passed by after the client was last contacted from a previous campaign (numeric, -1 means client was not previously contacted)
15 - previous: number of contacts performed before this campaign and for this client (numeric)
16 - poutcome: outcome of the previous marketing campaign (categorical: “unknown”,“other”,“failure”,“success”)
#### TARGET Output variable (desired target):
17 - y: has the client subscribed to a term deposit? (binary: “yes”,“no”)
The synthetic data set has 3 fewer observations than the real data set (45,208 vs. 45,211).
# Column-by-column overview (type and first values) of the synthetic data.
synthetic %>% glimpse()
## Observations: 45,208
## Variables: 16
## $ job <chr> "unemployed", "management", "blue-collar", "blue-colla…
## $ marital <chr> "married", "married", "married", "married", "married",…
## $ education <chr> "primary", "tertiary", "secondary", "primary", "second…
## $ default <chr> "no", "no", "no", "no", "no", "no", "no", "no", "no", …
## $ balance <dbl> 5.626440e+02, 4.456826e+02, -2.638169e+01, 1.026250e+0…
## $ housing <chr> "yes", "no", "yes", "no", "yes", "yes", "yes", "yes", …
## $ loan <chr> "no", "no", "no", "no", "no", "no", "no", "no", "no", …
## $ contact <chr> "cellular", "cellular", "unknown", "cellular", "unknow…
## $ day <dbl> 17, 19, 5, 28, 16, 20, 28, 14, 21, 8, 29, 13, 20, 13, …
## $ month <chr> "nov", "aug", "may", "jan", "may", "may", "may", "may"…
## $ duration <dbl> 158.02823, 144.25000, 286.37500, 64.28934, 159.25000, …
## $ campaign <dbl> 1, 2, 1, 3, 1, 1, 2, 2, 2, 1, 2, 1, 1, 3, 1, 2, 1, 4, …
## $ pdays <dbl> 173, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, 3…
## $ previous <dbl> 1.000000e+00, -4.746630e-53, -1.029628e-52, 3.352838e-…
## $ poutcome <chr> "failure", "unknown", "unknown", "unknown", "unknown",…
## $ y <chr> "no", "no", "no", "no", "no", "no", "no", "no", "no", …
# Column-by-column overview (type and first values) of the real data.
real %>% glimpse()
## Observations: 45,211
## Variables: 16
## $ job <chr> "management", "technician", "entrepreneur", "blue-coll…
## $ marital <chr> "married", "single", "married", "married", "single", "…
## $ education <chr> "tertiary", "secondary", "secondary", "unknown", "unkn…
## $ default <chr> "no", "no", "no", "no", "no", "no", "no", "yes", "no",…
## $ balance <dbl> 2143, 29, 2, 1506, 1, 231, 447, 2, 121, 593, 270, 390,…
## $ housing <chr> "yes", "yes", "yes", "yes", "no", "yes", "yes", "yes",…
## $ loan <chr> "no", "no", "yes", "no", "no", "no", "yes", "no", "no"…
## $ contact <chr> "unknown", "unknown", "unknown", "unknown", "unknown",…
## $ day <dbl> 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, …
## $ month <chr> "may", "may", "may", "may", "may", "may", "may", "may"…
## $ duration <dbl> 261, 151, 76, 92, 198, 139, 217, 380, 50, 55, 222, 137…
## $ campaign <dbl> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, …
## $ pdays <dbl> -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1…
## $ previous <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ poutcome <chr> "unknown", "unknown", "unknown", "unknown", "unknown",…
## $ y <chr> "no", "no", "no", "no", "no", "no", "no", "no", "no", …
Reference (ggplot2 visualization examples): http://r-statistics.co/Top50-Ggplot2-Visualizations-MasterList-R-Code.html
# Per-column summary statistics of the synthetic data.
synthetic %>% summary()
## job marital education
## Length:45208 Length:45208 Length:45208
## Class :character Class :character Class :character
## Mode :character Mode :character Mode :character
##
##
##
## default balance housing
## Length:45208 Min. :-3313.0 Length:45208
## Class :character 1st Qu.: 263.6 Class :character
## Mode :character Median : 555.6 Mode :character
## Mean : 992.0
## 3rd Qu.: 1115.0
## Max. :46732.5
## loan contact day month
## Length:45208 Length:45208 Min. : 1.00 Length:45208
## Class :character Class :character 1st Qu.: 7.00 Class :character
## Mode :character Mode :character Median :13.00 Mode :character
## Mean :13.94
## 3rd Qu.:20.00
## Max. :31.00
## duration campaign pdays previous
## Min. : 0.0 Min. : 1.000 Min. : -1.0 Min. : 0.0000
## 1st Qu.: 167.0 1st Qu.: 1.000 1st Qu.: -1.0 1st Qu.: 0.0000
## Median : 232.1 Median : 2.000 Median : -1.0 Median : 0.0000
## Mean : 268.3 Mean : 2.157 Mean : 15.4 Mean : 0.2203
## 3rd Qu.: 319.6 3rd Qu.: 2.000 3rd Qu.: -1.0 3rd Qu.: 0.0000
## Max. :3183.0 Max. :63.000 Max. :831.0 Max. :55.0000
## poutcome y
## Length:45208 Length:45208
## Class :character Class :character
## Mode :character Mode :character
##
##
##
# Per-column summary statistics of the real data.
real %>% summary()
## job marital education
## Length:45211 Length:45211 Length:45211
## Class :character Class :character Class :character
## Mode :character Mode :character Mode :character
##
##
##
## default balance housing loan
## Length:45211 Min. : -8019 Length:45211 Length:45211
## Class :character 1st Qu.: 72 Class :character Class :character
## Mode :character Median : 448 Mode :character Mode :character
## Mean : 1362
## 3rd Qu.: 1428
## Max. :102127
## contact day month duration
## Length:45211 Min. : 1.00 Length:45211 Min. : 0.0
## Class :character 1st Qu.: 8.00 Class :character 1st Qu.: 103.0
## Mode :character Median :16.00 Mode :character Median : 180.0
## Mean :15.81 Mean : 258.2
## 3rd Qu.:21.00 3rd Qu.: 319.0
## Max. :31.00 Max. :4918.0
## campaign pdays previous poutcome
## Min. : 1.000 Min. : -1.0 Min. : 0.0000 Length:45211
## 1st Qu.: 1.000 1st Qu.: -1.0 1st Qu.: 0.0000 Class :character
## Median : 2.000 Median : -1.0 Median : 0.0000 Mode :character
## Mean : 2.764 Mean : 40.2 Mean : 0.5803
## 3rd Qu.: 3.000 3rd Qu.: -1.0 3rd Qu.: 0.0000
## Max. :63.000 Max. :871.0 Max. :275.0000
## y
## Length:45211
## Class :character
## Mode :character
##
##
##
Categorical data only
# Drop the six numeric columns from the synthetic data so that only the
# character (categorical) columns remain, then show the result.
synthetic.cat <- select(
  synthetic,
  -c(balance, day, duration, campaign, pdays, previous)
)
synthetic.cat %>% glimpse()
## Observations: 45,208
## Variables: 10
## $ job <chr> "unemployed", "management", "blue-collar", "blue-colla…
## $ marital <chr> "married", "married", "married", "married", "married",…
## $ education <chr> "primary", "tertiary", "secondary", "primary", "second…
## $ default <chr> "no", "no", "no", "no", "no", "no", "no", "no", "no", …
## $ housing <chr> "yes", "no", "yes", "no", "yes", "yes", "yes", "yes", …
## $ loan <chr> "no", "no", "no", "no", "no", "no", "no", "no", "no", …
## $ contact <chr> "cellular", "cellular", "unknown", "cellular", "unknow…
## $ month <chr> "nov", "aug", "may", "jan", "may", "may", "may", "may"…
## $ poutcome <chr> "failure", "unknown", "unknown", "unknown", "unknown",…
## $ y <chr> "no", "no", "no", "no", "no", "no", "no", "no", "no", …
# Drop the six numeric columns from the real data so that only the
# character (categorical) columns remain, then show the result.
real.cat <- select(
  real,
  -c(balance, day, duration, campaign, pdays, previous)
)
real.cat %>% glimpse()
## Observations: 45,211
## Variables: 10
## $ job <chr> "management", "technician", "entrepreneur", "blue-coll…
## $ marital <chr> "married", "single", "married", "married", "single", "…
## $ education <chr> "tertiary", "secondary", "secondary", "unknown", "unkn…
## $ default <chr> "no", "no", "no", "no", "no", "no", "no", "yes", "no",…
## $ housing <chr> "yes", "yes", "yes", "yes", "no", "yes", "yes", "yes",…
## $ loan <chr> "no", "no", "yes", "no", "no", "no", "yes", "no", "no"…
## $ contact <chr> "unknown", "unknown", "unknown", "unknown", "unknown",…
## $ month <chr> "may", "may", "may", "may", "may", "may", "may", "may"…
## $ poutcome <chr> "unknown", "unknown", "unknown", "unknown", "unknown",…
## $ y <chr> "no", "no", "no", "no", "no", "no", "no", "no", "no", …
# Density plot of the `job` column of the synthetic categorical data,
# with one fill colour per job and slanted x-axis labels.
theme_set(theme_classic())
g.synth.jobs <- ggplot(data = synthetic.cat, mapping = aes(x = job))
g.synth.jobs +
  geom_density(mapping = aes(fill = factor(job)), alpha = 0.8) +
  labs(
    title = "Density plot Synthetic Data",
    x = "Jobs",
    fill = "Jobs"
  ) +
  theme(axis.text.x = element_text(angle = 65, vjust = 0.6))
# Density plot of the `job` column of the real categorical data (real.cat),
# with one fill colour per job and slanted x-axis labels.
# The title names the real data set because this plot is built from real.cat.
# NOTE(review): despite the "synth" in its name, g.synth.real holds the plot
# base for the real data; the name is kept in case later code refers to it.
theme_set(theme_classic())
g.synth.real <- ggplot(real.cat, aes(job))
g.synth.real +
  geom_density(aes(fill = factor(job)), alpha = 0.8) +
  theme(axis.text.x = element_text(angle = 65, vjust = 0.6)) +
  labs(
    title = "Density plot Real Data",
    x = "Jobs",
    fill = "Jobs"
  )
Numerical data only with Correlation Matrix
# Keep only the six numeric columns of each data set; these feed the
# correlation matrices computed next.
synthetic.num <- select(
  synthetic,
  balance, day, duration, campaign, pdays, previous
)
real.num <- select(
  real,
  balance, day, duration, campaign, pdays, previous
)
# Correlation matrix of the synthetic numeric columns, rounded to 2 decimals.
cormat.synth <- synthetic.num %>%
  cor() %>%
  round(2)
print(cormat.synth)
## balance day duration campaign pdays previous
## balance 1.00 0.07 0.02 0.01 0.05 0.05
## day 0.07 1.00 0.00 0.13 -0.01 0.00
## duration 0.02 0.00 1.00 -0.04 0.00 0.01
## campaign 0.01 0.13 -0.04 1.00 -0.01 0.00
## pdays 0.05 -0.01 0.00 -0.01 1.00 0.59
## previous 0.05 0.00 0.01 0.00 0.59 1.00
# Correlation matrix of the real numeric columns, rounded to 2 decimals.
cormat.real <- real.num %>%
  cor() %>%
  round(2)
print(cormat.real)
## balance day duration campaign pdays previous
## balance 1.00 0.00 0.02 -0.01 0.00 0.02
## day 0.00 1.00 -0.03 0.16 -0.09 -0.05
## duration 0.02 -0.03 1.00 -0.08 0.00 0.00
## campaign -0.01 0.16 -0.08 1.00 -0.09 -0.03
## pdays 0.00 -0.09 0.00 -0.09 1.00 0.45
## previous 0.02 -0.05 0.00 -0.03 0.45 1.00
References:
http://www.sthda.com/english/wiki/ggcorrplot-visualization-of-a-correlation-matrix-using-ggplot2
http://r-statistics.co/Top50-Ggplot2-Visualizations-MasterList-R-Code.html
# Correlogram of the synthetic correlation matrix: lower triangle only,
# circles with the coefficient printed on top, variables left unordered.
ggcorrplot(
  corr = cormat.synth,
  method = "circle",
  type = "lower",
  hc.order = FALSE,
  lab = TRUE,
  lab_size = 3,
  colors = c("tomato2", "white", "springgreen3"),
  title = "Correlogram of Synthetic Dataset",
  ggtheme = theme_bw
)
# Correlogram of the real correlation matrix: lower triangle only,
# circles with the coefficient printed on top, variables left unordered.
ggcorrplot(
  corr = cormat.real,
  method = "circle",
  type = "lower",
  hc.order = FALSE,
  lab = TRUE,
  lab_size = 3,
  colors = c("tomato2", "white", "springgreen3"),
  title = "Correlogram of Real Dataset",
  ggtheme = theme_bw
)
# Correlations with their p-values for the synthetic numeric columns;
# rcorr() is given a matrix, so the data frame is converted first.
synth.pvals <- synthetic.num %>%
  as.matrix() %>%
  rcorr()
synth.pvals
## balance day duration campaign pdays previous
## balance 1.00 0.07 0.02 0.01 0.05 0.05
## day 0.07 1.00 0.00 0.13 -0.01 0.00
## duration 0.02 0.00 1.00 -0.04 0.00 0.01
## campaign 0.01 0.13 -0.04 1.00 -0.01 0.00
## pdays 0.05 -0.01 0.00 -0.01 1.00 0.59
## previous 0.05 0.00 0.01 0.00 0.59 1.00
##
## n= 45208
##
##
## P
## balance day duration campaign pdays previous
## balance 0.0000 0.0002 0.0045 0.0000 0.0000
## day 0.0000 0.3949 0.0000 0.0024 0.6866
## duration 0.0002 0.3949 0.0000 0.5462 0.1392
## campaign 0.0045 0.0000 0.0000 0.0041 0.7947
## pdays 0.0000 0.0024 0.5462 0.0041 0.0000
## previous 0.0000 0.6866 0.1392 0.7947 0.0000
# Correlations with their p-values for the real numeric columns;
# rcorr() is given a matrix, so the data frame is converted first.
real.pvals <- real.num %>%
  as.matrix() %>%
  rcorr()
real.pvals
## balance day duration campaign pdays previous
## balance 1.00 0.00 0.02 -0.01 0.00 0.02
## day 0.00 1.00 -0.03 0.16 -0.09 -0.05
## duration 0.02 -0.03 1.00 -0.08 0.00 0.00
## campaign -0.01 0.16 -0.08 1.00 -0.09 -0.03
## pdays 0.00 -0.09 0.00 -0.09 1.00 0.45
## previous 0.02 -0.05 0.00 -0.03 0.45 1.00
##
## n= 45211
##
##
## P
## balance day duration campaign pdays previous
## balance 0.3384 0.0000 0.0019 0.4651 0.0004
## day 0.3384 0.0000 0.0000 0.0000 0.0000
## duration 0.0000 0.0000 0.0000 0.7394 0.7981
## campaign 0.0019 0.0000 0.0000 0.0000 0.0000
## pdays 0.4651 0.0000 0.7394 0.0000 0.0000
## previous 0.0004 0.0000 0.7981 0.0000 0.0000