# set up
library(rmarkdown)
library(psych)
library(scatterplot3d)
library(caret)
## Loading required package: ggplot2
##
## Attaching package: 'ggplot2'
## The following objects are masked from 'package:psych':
##
## %+%, alpha
## Loading required package: lattice
library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr 1.1.1 ✔ readr 2.1.4
## ✔ forcats 1.0.0 ✔ stringr 1.5.0
## ✔ lubridate 1.9.2 ✔ tibble 3.2.1
## ✔ purrr 1.0.1 ✔ tidyr 1.3.0
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ ggplot2::%+%() masks psych::%+%()
## ✖ ggplot2::alpha() masks psych::alpha()
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
## ✖ purrr::lift() masks caret::lift()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
cloud_wd <- getwd()
setwd(cloud_wd)
cd <- read.csv(file = "CD_additional_balanced.csv", stringsAsFactors = FALSE)
# 1A
# examine data structure
cd %>% str()
## 'data.frame': 9280 obs. of 21 variables:
## $ age : int 41 49 49 41 45 42 39 28 44 42 ...
## $ job : chr "blue-collar" "entrepreneur" "technician" "technician" ...
## $ marital : chr "divorced" "married" "married" "married" ...
## $ education : chr "basic.4y" "university.degree" "basic.9y" "professional.course" ...
## $ default : chr "unknown" "unknown" "no" "unknown" ...
## $ housing : chr "yes" "yes" "no" "yes" ...
## $ loan : chr "no" "no" "no" "no" ...
## $ contact : chr "telephone" "telephone" "telephone" "telephone" ...
## $ month : chr "may" "may" "may" "may" ...
## $ day_of_week : chr "mon" "mon" "mon" "mon" ...
## $ duration : int 1575 1042 1467 579 461 673 935 1201 1030 1623 ...
## $ campaign : int 1 1 1 1 1 2 3 1 1 1 ...
## $ pdays : int 999 999 999 999 999 999 999 999 999 999 ...
## $ previous : int 0 0 0 0 0 0 0 0 0 0 ...
## $ poutcome : chr "nonexistent" "nonexistent" "nonexistent" "nonexistent" ...
## $ emp.var.rate : num 1.1 1.1 1.1 1.1 1.1 1.1 1.1 1.1 1.1 1.1 ...
## $ cons.price.idx: num 94 94 94 94 94 ...
## $ cons.conf.idx : num -36.4 -36.4 -36.4 -36.4 -36.4 -36.4 -36.4 -36.4 -36.4 -36.4 ...
## $ euribor3m : num 4.86 4.86 4.86 4.86 4.86 ...
## $ nr.employed : num 5191 5191 5191 5191 5191 ...
## $ y : chr "yes" "yes" "yes" "yes" ...
# 1B
# Factor categorical columns
cd$job <- factor(cd$job)
cd$marital <- factor(cd$marital)
cd$education <- factor(cd$education)
cd$default <- factor(cd$default)
cd$housing <- factor(cd$housing)
cd$loan <- factor(cd$loan)
cd$contact <- factor(cd$contact)
cd$month <- factor(cd$month)
cd$day_of_week <- factor(cd$day_of_week)
cd$poutcome <- factor(cd$poutcome)
cd$y <- factor(cd$y)
cd %>% str()
## 'data.frame': 9280 obs. of 21 variables:
## $ age : int 41 49 49 41 45 42 39 28 44 42 ...
## $ job : Factor w/ 12 levels "admin.","blue-collar",..: 2 3 10 10 2 2 4 12 8 10 ...
## $ marital : Factor w/ 4 levels "divorced","married",..: 1 2 2 2 2 2 2 3 2 2 ...
## $ education : Factor w/ 8 levels "basic.4y","basic.6y",..: 1 7 3 6 3 3 3 8 4 6 ...
## $ default : Factor w/ 2 levels "no","unknown": 2 2 1 2 2 1 1 2 1 1 ...
## $ housing : Factor w/ 3 levels "no","unknown",..: 3 3 1 3 3 3 3 3 3 1 ...
## $ loan : Factor w/ 3 levels "no","unknown",..: 1 1 1 1 1 3 1 3 1 1 ...
## $ contact : Factor w/ 2 levels "cellular","telephone": 2 2 2 2 2 2 2 2 2 2 ...
## $ month : Factor w/ 10 levels "apr","aug","dec",..: 7 7 7 7 7 7 7 7 7 7 ...
## $ day_of_week : Factor w/ 5 levels "fri","mon","thu",..: 2 2 2 2 2 2 2 4 4 4 ...
## $ duration : int 1575 1042 1467 579 461 673 935 1201 1030 1623 ...
## $ campaign : int 1 1 1 1 1 2 3 1 1 1 ...
## $ pdays : int 999 999 999 999 999 999 999 999 999 999 ...
## $ previous : int 0 0 0 0 0 0 0 0 0 0 ...
## $ poutcome : Factor w/ 3 levels "failure","nonexistent",..: 2 2 2 2 2 2 2 2 2 2 ...
## $ emp.var.rate : num 1.1 1.1 1.1 1.1 1.1 1.1 1.1 1.1 1.1 1.1 ...
## $ cons.price.idx: num 94 94 94 94 94 ...
## $ cons.conf.idx : num -36.4 -36.4 -36.4 -36.4 -36.4 -36.4 -36.4 -36.4 -36.4 -36.4 ...
## $ euribor3m : num 4.86 4.86 4.86 4.86 4.86 ...
## $ nr.employed : num 5191 5191 5191 5191 5191 ...
## $ y : Factor w/ 2 levels "no","yes": 2 2 2 2 2 2 2 2 2 2 ...
cd %>% summary()
## age job marital education
## Min. :17.0 admin. :2517 divorced:1021 university.degree :3007
## 1st Qu.:31.0 blue-collar:1769 married :5338 high.school :2102
## Median :38.0 technician :1459 single :2900 professional.course:1190
## Mean :40.4 services : 773 unknown : 21 basic.9y :1177
## 3rd Qu.:48.0 management : 651 basic.4y : 895
## Max. :98.0 retired : 595 basic.6y : 458
## (Other) :1516 (Other) : 451
## default housing loan contact month
## no :7824 no :4104 no :7688 cellular :6672 may :2533
## unknown:1456 unknown: 225 unknown: 225 telephone:2608 jul :1477
## yes :4951 yes :1367 aug :1353
## jun :1169
## nov : 886
## apr : 785
## (Other):1077
## day_of_week duration campaign pdays previous
## fri:1763 Min. : 1.0 Min. : 1.000 Min. : 0.0 Min. :0.0000
## mon:1846 1st Qu.: 145.0 1st Qu.: 1.000 1st Qu.:999.0 1st Qu.:0.0000
## thu:2000 Median : 265.0 Median : 2.000 Median :999.0 Median :0.0000
## tue:1810 Mean : 387.4 Mean : 2.333 Mean :887.3 Mean :0.3153
## wed:1861 3rd Qu.: 528.0 3rd Qu.: 3.000 3rd Qu.:999.0 3rd Qu.:0.0000
## Max. :4199.0 Max. :39.000 Max. :999.0 Max. :6.0000
##
## poutcome emp.var.rate cons.price.idx cons.conf.idx
## failure :1074 Min. :-3.4000 Min. :92.20 Min. :-50.80
## nonexistent:7244 1st Qu.:-1.8000 1st Qu.:92.89 1st Qu.:-42.70
## success : 962 Median :-0.1000 Median :93.44 Median :-41.80
## Mean :-0.4963 Mean :93.48 Mean :-40.22
## 3rd Qu.: 1.4000 3rd Qu.:93.99 3rd Qu.:-36.40
## Max. : 1.4000 Max. :94.77 Max. :-26.90
##
## euribor3m nr.employed y
## Min. :0.634 Min. :4964 no :4640
## 1st Qu.:1.244 1st Qu.:5076 yes:4640
## Median :4.021 Median :5191
## Mean :2.960 Mean :5135
## 3rd Qu.:4.959 3rd Qu.:5228
## Max. :5.045 Max. :5228
##
# 2A
# histograms
cd %>% ggplot() +
geom_histogram(aes(x=age),binwidth = 20) +
ggtitle('Histogram of Age')
cd %>% ggplot() +
geom_histogram(aes(x=duration),binwidth = 20) +
ggtitle('Histogram of Duration')
cd %>% ggplot() +
geom_histogram(aes(x=campaign),binwidth = 20) +
ggtitle('Histogram of Campaign')
cd %>% ggplot() +
geom_histogram(aes(x=pdays),binwidth = 20) +
ggtitle('Histogram of # of Days Passed by after the Client was Last Contacted from a Previous Campaign')
# 2B
# boxplots
cd %>%
ggplot(aes(x=age)) +
geom_boxplot() +
ggtitle('Boxplot of Age')
cd %>%
ggplot(aes(x=duration)) +
geom_boxplot() +
ggtitle('Boxplot of Duration')
cd %>%
ggplot(aes(x=campaign)) +
geom_boxplot() +
ggtitle('Boxplot of Campaign')
cd %>%
ggplot(aes(x=pdays)) +
geom_boxplot() +
ggtitle('Boxplot of # of Days Passed by after the Client was Last Contacted from a Previous Campaign')
# 2C
# deciles
cd %>% pull(age) %>% quantile(., seq(from = 0, to = 1, by = 0.10), na.rm=TRUE)
## 0% 10% 20% 30% 40% 50% 60% 70% 80% 90% 100%
## 17 27 30 33 35 38 41 46 51 57 98
cd %>% pull(duration) %>% quantile(., seq(from = 0, to = 1, by = 0.10), na.rm=TRUE)
## 0% 10% 20% 30% 40% 50% 60% 70% 80% 90% 100%
## 1 80 124 167 211 265 340 452 615 860 4199
cd %>% pull(campaign) %>% quantile(., seq(from = 0, to = 1, by = 0.10), na.rm=TRUE)
## 0% 10% 20% 30% 40% 50% 60% 70% 80% 90% 100%
## 1 1 1 1 1 2 2 2 3 4 39
cd %>% pull(pdays) %>% quantile(., seq(from = 0, to = 1, by = 0.10), na.rm=TRUE)
## 0% 10% 20% 30% 40% 50% 60% 70% 80% 90% 100%
## 0 11 999 999 999 999 999 999 999 999 999
# 3A
cd %>% count(y)
## y n
## 1 no 4640
## 2 yes 4640
cd %>% count(job)
## job n
## 1 admin. 2517
## 2 blue-collar 1769
## 3 entrepreneur 308
## 4 housemaid 216
## 5 management 651
## 6 retired 595
## 7 self-employed 306
## 8 services 773
## 9 student 358
## 10 technician 1459
## 11 unemployed 248
## 12 unknown 80
cd %>% count(marital)
## marital n
## 1 divorced 1021
## 2 married 5338
## 3 single 2900
## 4 unknown 21
cd %>% count(education)
## education n
## 1 basic.4y 895
## 2 basic.6y 458
## 3 basic.9y 1177
## 4 high.school 2102
## 5 illiterate 6
## 6 professional.course 1190
## 7 university.degree 3007
## 8 unknown 445
prop.table(table(cd$y))
##
## no yes
## 0.5 0.5
prop.table(table(cd$job))
##
## admin. blue-collar entrepreneur housemaid management
## 0.27122845 0.19062500 0.03318966 0.02327586 0.07015086
## retired self-employed services student technician
## 0.06411638 0.03297414 0.08329741 0.03857759 0.15721983
## unemployed unknown
## 0.02672414 0.00862069
prop.table(table(cd$marital))
##
## divorced married single unknown
## 0.110021552 0.575215517 0.312500000 0.002262931
prop.table(table(cd$education))
##
## basic.4y basic.6y basic.9y high.school
## 0.0964439655 0.0493534483 0.1268318966 0.2265086207
## illiterate professional.course university.degree unknown
## 0.0006465517 0.1282327586 0.3240301724 0.0479525862
# 3B
# barplots
cd %>% ggplot() +
geom_bar(aes(x=y)) +
ggtitle("Barplot of Whether the Client Has Subscribed a Certified Term Deposit (CD)")
cd %>% ggplot() +
geom_bar(aes(x=job)) +
ggtitle("Barplot of Type of Job")
cd %>% ggplot() +
geom_bar(aes(x=marital)) +
ggtitle("Barplot of Marital Status")
cd %>% ggplot() +
geom_bar(aes(x=education)) +
ggtitle("Barplot of Education")
# 4A
# correlations
cov(cd[,c("age","duration","campaign","pdays","euribor3m","emp.var.rate","nr.employed")])
## age duration campaign pdays euribor3m
## age 145.492563 -89.42929 0.1039050 -202.56369 -1.0175989
## duration -89.429289 127957.69034 -21.6052246 3248.11379 38.7740642
## campaign 0.103905 -21.60522 5.4497372 65.41819 0.7728313
## pdays -202.563686 3248.11379 65.4181946 98471.95568 230.0119052
## euribor3m -1.017599 38.77406 0.7728313 230.01191 3.5736198
## emp.var.rate -1.018392 43.98545 0.7463054 180.87900 3.1184175
## nr.employed -78.470653 1814.43009 35.9863278 12983.36899 154.8741019
## emp.var.rate nr.employed
## age -1.0183920 -78.47065
## duration 43.9854493 1814.43009
## campaign 0.7463054 35.98633
## pdays 180.8789986 12983.36899
## euribor3m 3.1184175 154.87410
## emp.var.rate 2.9625424 130.06529
## nr.employed 130.0652867 7587.33632
cd %>% select(age,duration,campaign,pdays,euribor3m,emp.var.rate,nr.employed) %>% pairs.panels()
# 4B
# boxplots
cd %>%
ggplot() +
geom_boxplot(aes(x=y,y=duration)) +
ggtitle('Duration by Whether the Client Has Subscribed a Certified Term Deposit (CD)')
cd %>%
ggplot() +
geom_boxplot(aes(x=y,y=emp.var.rate)) +
ggtitle('Quarterly Employment Variation Rate by Whether the Client Has Subscribed a Certified Term Deposit (CD)')
cd %>%
ggplot() +
geom_boxplot(aes(x=y,y=cons.price.idx)) +
ggtitle('Monthly Consumer Price Index by Whether the Client Has Subscribed a Certified Term Deposit (CD)')
cd %>%
ggplot() +
geom_boxplot(aes(x=y,y=cons.conf.idx)) +
ggtitle('Monthly Consumer Confidence Index by Whether the Client Has Subscribed a Certified Term Deposit (CD)')
# aggregate
aggregate(duration~y, summary, data = cd)
## y duration.Min. duration.1st Qu. duration.Median duration.Mean
## 1 no 1.0000 94.0000 166.0000 221.5323
## 2 yes 37.0000 253.0000 449.0000 553.1912
## duration.3rd Qu. duration.Max.
## 1 279.2500 1994.0000
## 2 741.2500 4199.0000
aggregate(emp.var.rate~y, summary, data = cd)
## y emp.var.rate.Min. emp.var.rate.1st Qu. emp.var.rate.Median
## 1 no -3.4000000 -1.8000000 1.1000000
## 2 yes -3.4000000 -1.8000000 -1.8000000
## emp.var.rate.Mean emp.var.rate.3rd Qu. emp.var.rate.Max.
## 1 0.2409052 1.4000000 1.4000000
## 2 -1.2334483 -0.1000000 1.4000000
aggregate(cons.price.idx~y, summary, data = cd)
## y cons.price.idx.Min. cons.price.idx.1st Qu. cons.price.idx.Median
## 1 no 92.20100 93.07500 93.91800
## 2 yes 92.20100 92.89300 93.20000
## cons.price.idx.Mean cons.price.idx.3rd Qu. cons.price.idx.Max.
## 1 93.60397 93.99400 94.76700
## 2 93.35439 93.91800 94.76700
aggregate(cons.conf.idx~y, summary, data = cd)
## y cons.conf.idx.Min. cons.conf.idx.1st Qu. cons.conf.idx.Median
## 1 no -50.80000 -42.70000 -41.80000
## 2 yes -50.80000 -46.20000 -40.40000
## cons.conf.idx.Mean cons.conf.idx.3rd Qu. cons.conf.idx.Max.
## 1 -40.64647 -36.40000 -26.90000
## 2 -39.78978 -36.10000 -26.90000
Observations: This dataset consists of 9,280 rows of observations and 21 columns of variables. The target variable is ‘y,’ which indicates whether the client has subscribed to a certified term deposit (CD). According to the pairs panel, there was a high positive correlation (i.e., a correlation above 0.85) among the numeric social and economic factors: interest rate, the employment variation rate, and the number of employees. Interestingly, these variables also exhibited a similar distribution of data, with higher values at both ends. Additionally, in the box plots presented in #4B, these social and economic factors showed a negative relationship with ‘y.’ In other words, when society expected the economy to be on the downturn—such as when banks offered high-interest rates or when there were significant layoffs in companies—people were more inclined to subscribe to a CD. This trend was also more commonly observed among younger married employees.