# set up
library(rmarkdown)
library(psych)
library(scatterplot3d) 
library(caret) 
## Loading required package: ggplot2
## 
## Attaching package: 'ggplot2'
## The following objects are masked from 'package:psych':
## 
##     %+%, alpha
## Loading required package: lattice
library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr     1.1.1     ✔ readr     2.1.4
## ✔ forcats   1.0.0     ✔ stringr   1.5.0
## ✔ lubridate 1.9.2     ✔ tibble    3.2.1
## ✔ purrr     1.0.1     ✔ tidyr     1.3.0
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ ggplot2::%+%()   masks psych::%+%()
## ✖ ggplot2::alpha() masks psych::alpha()
## ✖ dplyr::filter()  masks stats::filter()
## ✖ dplyr::lag()     masks stats::lag()
## ✖ purrr::lift()    masks caret::lift()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
cloud_wd <- getwd()
setwd(cloud_wd)

cd <- read.csv(file = "CD_additional_balanced.csv", stringsAsFactors = FALSE)

# 1A
# examine data structure
cd %>% str()
## 'data.frame':    9280 obs. of  21 variables:
##  $ age           : int  41 49 49 41 45 42 39 28 44 42 ...
##  $ job           : chr  "blue-collar" "entrepreneur" "technician" "technician" ...
##  $ marital       : chr  "divorced" "married" "married" "married" ...
##  $ education     : chr  "basic.4y" "university.degree" "basic.9y" "professional.course" ...
##  $ default       : chr  "unknown" "unknown" "no" "unknown" ...
##  $ housing       : chr  "yes" "yes" "no" "yes" ...
##  $ loan          : chr  "no" "no" "no" "no" ...
##  $ contact       : chr  "telephone" "telephone" "telephone" "telephone" ...
##  $ month         : chr  "may" "may" "may" "may" ...
##  $ day_of_week   : chr  "mon" "mon" "mon" "mon" ...
##  $ duration      : int  1575 1042 1467 579 461 673 935 1201 1030 1623 ...
##  $ campaign      : int  1 1 1 1 1 2 3 1 1 1 ...
##  $ pdays         : int  999 999 999 999 999 999 999 999 999 999 ...
##  $ previous      : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ poutcome      : chr  "nonexistent" "nonexistent" "nonexistent" "nonexistent" ...
##  $ emp.var.rate  : num  1.1 1.1 1.1 1.1 1.1 1.1 1.1 1.1 1.1 1.1 ...
##  $ cons.price.idx: num  94 94 94 94 94 ...
##  $ cons.conf.idx : num  -36.4 -36.4 -36.4 -36.4 -36.4 -36.4 -36.4 -36.4 -36.4 -36.4 ...
##  $ euribor3m     : num  4.86 4.86 4.86 4.86 4.86 ...
##  $ nr.employed   : num  5191 5191 5191 5191 5191 ...
##  $ y             : chr  "yes" "yes" "yes" "yes" ...
# 1B
# Factor categorical columns
cd$job <- factor(cd$job)
cd$marital <- factor(cd$marital)
cd$education <- factor(cd$education)
cd$default <- factor(cd$default)
cd$housing <- factor(cd$housing)
cd$loan <- factor(cd$loan)
cd$contact <- factor(cd$contact)
cd$month <- factor(cd$month)
cd$day_of_week <- factor(cd$day_of_week)
cd$poutcome <- factor(cd$poutcome)
cd$y <- factor(cd$y)

cd %>% str()
## 'data.frame':    9280 obs. of  21 variables:
##  $ age           : int  41 49 49 41 45 42 39 28 44 42 ...
##  $ job           : Factor w/ 12 levels "admin.","blue-collar",..: 2 3 10 10 2 2 4 12 8 10 ...
##  $ marital       : Factor w/ 4 levels "divorced","married",..: 1 2 2 2 2 2 2 3 2 2 ...
##  $ education     : Factor w/ 8 levels "basic.4y","basic.6y",..: 1 7 3 6 3 3 3 8 4 6 ...
##  $ default       : Factor w/ 2 levels "no","unknown": 2 2 1 2 2 1 1 2 1 1 ...
##  $ housing       : Factor w/ 3 levels "no","unknown",..: 3 3 1 3 3 3 3 3 3 1 ...
##  $ loan          : Factor w/ 3 levels "no","unknown",..: 1 1 1 1 1 3 1 3 1 1 ...
##  $ contact       : Factor w/ 2 levels "cellular","telephone": 2 2 2 2 2 2 2 2 2 2 ...
##  $ month         : Factor w/ 10 levels "apr","aug","dec",..: 7 7 7 7 7 7 7 7 7 7 ...
##  $ day_of_week   : Factor w/ 5 levels "fri","mon","thu",..: 2 2 2 2 2 2 2 4 4 4 ...
##  $ duration      : int  1575 1042 1467 579 461 673 935 1201 1030 1623 ...
##  $ campaign      : int  1 1 1 1 1 2 3 1 1 1 ...
##  $ pdays         : int  999 999 999 999 999 999 999 999 999 999 ...
##  $ previous      : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ poutcome      : Factor w/ 3 levels "failure","nonexistent",..: 2 2 2 2 2 2 2 2 2 2 ...
##  $ emp.var.rate  : num  1.1 1.1 1.1 1.1 1.1 1.1 1.1 1.1 1.1 1.1 ...
##  $ cons.price.idx: num  94 94 94 94 94 ...
##  $ cons.conf.idx : num  -36.4 -36.4 -36.4 -36.4 -36.4 -36.4 -36.4 -36.4 -36.4 -36.4 ...
##  $ euribor3m     : num  4.86 4.86 4.86 4.86 4.86 ...
##  $ nr.employed   : num  5191 5191 5191 5191 5191 ...
##  $ y             : Factor w/ 2 levels "no","yes": 2 2 2 2 2 2 2 2 2 2 ...
cd %>% summary()
##       age                job           marital                   education   
##  Min.   :17.0   admin.     :2517   divorced:1021   university.degree  :3007  
##  1st Qu.:31.0   blue-collar:1769   married :5338   high.school        :2102  
##  Median :38.0   technician :1459   single  :2900   professional.course:1190  
##  Mean   :40.4   services   : 773   unknown :  21   basic.9y           :1177  
##  3rd Qu.:48.0   management : 651                   basic.4y           : 895  
##  Max.   :98.0   retired    : 595                   basic.6y           : 458  
##                 (Other)    :1516                   (Other)            : 451  
##     default        housing          loan           contact         month     
##  no     :7824   no     :4104   no     :7688   cellular :6672   may    :2533  
##  unknown:1456   unknown: 225   unknown: 225   telephone:2608   jul    :1477  
##                 yes    :4951   yes    :1367                    aug    :1353  
##                                                                jun    :1169  
##                                                                nov    : 886  
##                                                                apr    : 785  
##                                                                (Other):1077  
##  day_of_week    duration         campaign          pdays          previous     
##  fri:1763    Min.   :   1.0   Min.   : 1.000   Min.   :  0.0   Min.   :0.0000  
##  mon:1846    1st Qu.: 145.0   1st Qu.: 1.000   1st Qu.:999.0   1st Qu.:0.0000  
##  thu:2000    Median : 265.0   Median : 2.000   Median :999.0   Median :0.0000  
##  tue:1810    Mean   : 387.4   Mean   : 2.333   Mean   :887.3   Mean   :0.3153  
##  wed:1861    3rd Qu.: 528.0   3rd Qu.: 3.000   3rd Qu.:999.0   3rd Qu.:0.0000  
##              Max.   :4199.0   Max.   :39.000   Max.   :999.0   Max.   :6.0000  
##                                                                                
##         poutcome     emp.var.rate     cons.price.idx  cons.conf.idx   
##  failure    :1074   Min.   :-3.4000   Min.   :92.20   Min.   :-50.80  
##  nonexistent:7244   1st Qu.:-1.8000   1st Qu.:92.89   1st Qu.:-42.70  
##  success    : 962   Median :-0.1000   Median :93.44   Median :-41.80  
##                     Mean   :-0.4963   Mean   :93.48   Mean   :-40.22  
##                     3rd Qu.: 1.4000   3rd Qu.:93.99   3rd Qu.:-36.40  
##                     Max.   : 1.4000   Max.   :94.77   Max.   :-26.90  
##                                                                       
##    euribor3m      nr.employed     y       
##  Min.   :0.634   Min.   :4964   no :4640  
##  1st Qu.:1.244   1st Qu.:5076   yes:4640  
##  Median :4.021   Median :5191             
##  Mean   :2.960   Mean   :5135             
##  3rd Qu.:4.959   3rd Qu.:5228             
##  Max.   :5.045   Max.   :5228             
## 

1 Explore numeric variables

# 2A
# histograms
cd %>% ggplot() +
  geom_histogram(aes(x=age),binwidth = 20) +
  ggtitle('Histogram of Age')

cd %>% ggplot() +
  geom_histogram(aes(x=duration),binwidth = 20) +
  ggtitle('Histogram of Duration')

cd %>% ggplot() +
  geom_histogram(aes(x=campaign),binwidth = 20) +
  ggtitle('Histogram of Campaign')

cd %>% ggplot() +
  geom_histogram(aes(x=pdays),binwidth = 20) +
  ggtitle('Histogram of # of Days Passed by after the Client was Last Contacted from a Previous Campaign')

# 2B
# boxplots
cd %>% 
  ggplot(aes(x=age)) + 
  geom_boxplot() +
  ggtitle('Boxplot of Age')

cd %>% 
  ggplot(aes(x=duration)) + 
  geom_boxplot() +
  ggtitle('Boxplot of Duration')

cd %>% 
  ggplot(aes(x=campaign)) + 
  geom_boxplot() +
  ggtitle('Boxplot of Campaign')

cd %>% 
  ggplot(aes(x=pdays)) + 
  geom_boxplot() +
  ggtitle('Boxplot of # of Days Passed by after the Client was Last Contacted from a Previous Campaign')

# 2C
# deciles
cd %>% pull(age) %>% quantile(., seq(from = 0, to = 1, by = 0.10), na.rm=TRUE)
##   0%  10%  20%  30%  40%  50%  60%  70%  80%  90% 100% 
##   17   27   30   33   35   38   41   46   51   57   98
cd %>% pull(duration) %>% quantile(., seq(from = 0, to = 1, by = 0.10), na.rm=TRUE)
##   0%  10%  20%  30%  40%  50%  60%  70%  80%  90% 100% 
##    1   80  124  167  211  265  340  452  615  860 4199
cd %>% pull(campaign) %>% quantile(., seq(from = 0, to = 1, by = 0.10), na.rm=TRUE)
##   0%  10%  20%  30%  40%  50%  60%  70%  80%  90% 100% 
##    1    1    1    1    1    2    2    2    3    4   39
cd %>% pull(pdays) %>% quantile(., seq(from = 0, to = 1, by = 0.10), na.rm=TRUE)
##   0%  10%  20%  30%  40%  50%  60%  70%  80%  90% 100% 
##    0   11  999  999  999  999  999  999  999  999  999

2 Explore factor variables

# 3A
cd %>% count(y)
##     y    n
## 1  no 4640
## 2 yes 4640
cd %>% count(job)
##              job    n
## 1         admin. 2517
## 2    blue-collar 1769
## 3   entrepreneur  308
## 4      housemaid  216
## 5     management  651
## 6        retired  595
## 7  self-employed  306
## 8       services  773
## 9        student  358
## 10    technician 1459
## 11    unemployed  248
## 12       unknown   80
cd %>% count(marital)
##    marital    n
## 1 divorced 1021
## 2  married 5338
## 3   single 2900
## 4  unknown   21
cd %>% count(education)
##             education    n
## 1            basic.4y  895
## 2            basic.6y  458
## 3            basic.9y 1177
## 4         high.school 2102
## 5          illiterate    6
## 6 professional.course 1190
## 7   university.degree 3007
## 8             unknown  445
prop.table(table(cd$y))
## 
##  no yes 
## 0.5 0.5
prop.table(table(cd$job))
## 
##        admin.   blue-collar  entrepreneur     housemaid    management 
##    0.27122845    0.19062500    0.03318966    0.02327586    0.07015086 
##       retired self-employed      services       student    technician 
##    0.06411638    0.03297414    0.08329741    0.03857759    0.15721983 
##    unemployed       unknown 
##    0.02672414    0.00862069
prop.table(table(cd$marital))
## 
##    divorced     married      single     unknown 
## 0.110021552 0.575215517 0.312500000 0.002262931
prop.table(table(cd$education))
## 
##            basic.4y            basic.6y            basic.9y         high.school 
##        0.0964439655        0.0493534483        0.1268318966        0.2265086207 
##          illiterate professional.course   university.degree             unknown 
##        0.0006465517        0.1282327586        0.3240301724        0.0479525862
# 3B
# barplots
cd %>% ggplot() +
  geom_bar(aes(x=y)) +
  ggtitle("Barplot of Whether the Client Has Subscribed a Certified Term Deposit (CD)")

cd %>% ggplot() +
  geom_bar(aes(x=job)) +
  ggtitle("Barplot of Type of Job")

cd %>% ggplot() +
  geom_bar(aes(x=marital)) +
  ggtitle("Barplot of Marital Status")

cd %>% ggplot() +
  geom_bar(aes(x=education)) +
  ggtitle("Barplot of Education")

# 4A
# correlations
cov(cd[,c("age","duration","campaign","pdays","euribor3m","emp.var.rate","nr.employed")])
##                      age     duration    campaign       pdays   euribor3m
## age           145.492563    -89.42929   0.1039050  -202.56369  -1.0175989
## duration      -89.429289 127957.69034 -21.6052246  3248.11379  38.7740642
## campaign        0.103905    -21.60522   5.4497372    65.41819   0.7728313
## pdays        -202.563686   3248.11379  65.4181946 98471.95568 230.0119052
## euribor3m      -1.017599     38.77406   0.7728313   230.01191   3.5736198
## emp.var.rate   -1.018392     43.98545   0.7463054   180.87900   3.1184175
## nr.employed   -78.470653   1814.43009  35.9863278 12983.36899 154.8741019
##              emp.var.rate nr.employed
## age            -1.0183920   -78.47065
## duration       43.9854493  1814.43009
## campaign        0.7463054    35.98633
## pdays         180.8789986 12983.36899
## euribor3m       3.1184175   154.87410
## emp.var.rate    2.9625424   130.06529
## nr.employed   130.0652867  7587.33632
cd %>% select(age,duration,campaign,pdays,euribor3m,emp.var.rate,nr.employed) %>% pairs.panels()

# 4B
# boxplots
cd %>% 
  ggplot() +
  geom_boxplot(aes(x=y,y=duration)) +
  ggtitle('Duration by Whether the Client Has Subscribed a Certified Term Deposit (CD)')

cd %>% 
  ggplot() +
  geom_boxplot(aes(x=y,y=emp.var.rate)) +
  ggtitle('Quarterly Employment Variation Rate by Whether the Client Has Subscribed a Certified Term Deposit (CD)')

cd %>% 
  ggplot() +
  geom_boxplot(aes(x=y,y=cons.price.idx)) +
  ggtitle('Monthly Consumer Price Index by Whether the Client Has Subscribed a Certified Term Deposit (CD)')

cd %>% 
  ggplot() +
  geom_boxplot(aes(x=y,y=cons.conf.idx)) +
  ggtitle('Monthly Consumer Confidence Index by Whether the Client Has Subscribed a Certified Term Deposit (CD)')

# aggregate
aggregate(duration~y, summary, data = cd)
##     y duration.Min. duration.1st Qu. duration.Median duration.Mean
## 1  no        1.0000          94.0000        166.0000      221.5323
## 2 yes       37.0000         253.0000        449.0000      553.1912
##   duration.3rd Qu. duration.Max.
## 1         279.2500     1994.0000
## 2         741.2500     4199.0000
aggregate(emp.var.rate~y, summary, data = cd)
##     y emp.var.rate.Min. emp.var.rate.1st Qu. emp.var.rate.Median
## 1  no        -3.4000000           -1.8000000           1.1000000
## 2 yes        -3.4000000           -1.8000000          -1.8000000
##   emp.var.rate.Mean emp.var.rate.3rd Qu. emp.var.rate.Max.
## 1         0.2409052            1.4000000         1.4000000
## 2        -1.2334483           -0.1000000         1.4000000
aggregate(cons.price.idx~y, summary, data = cd)
##     y cons.price.idx.Min. cons.price.idx.1st Qu. cons.price.idx.Median
## 1  no            92.20100               93.07500              93.91800
## 2 yes            92.20100               92.89300              93.20000
##   cons.price.idx.Mean cons.price.idx.3rd Qu. cons.price.idx.Max.
## 1            93.60397               93.99400            94.76700
## 2            93.35439               93.91800            94.76700
aggregate(cons.conf.idx~y, summary, data = cd)
##     y cons.conf.idx.Min. cons.conf.idx.1st Qu. cons.conf.idx.Median
## 1  no          -50.80000             -42.70000            -41.80000
## 2 yes          -50.80000             -46.20000            -40.40000
##   cons.conf.idx.Mean cons.conf.idx.3rd Qu. cons.conf.idx.Max.
## 1          -40.64647             -36.40000          -26.90000
## 2          -39.78978             -36.10000          -26.90000

Observations: This dataset consists of 9,280 rows of observations and 21 columns of variables. The target variable is ‘y,’ which indicates whether the client has subscribed to a certified term deposit (CD). According to the pairs panel, there was a high positive correlation (i.e., a correlation above 0.85) among the numeric social and economic factors: interest rate, the employment variation rate, and the number of employees. Interestingly, these variables also exhibited a similar distribution of data, with higher values at both ends. Additionally, in the box plots presented in #4B, these social and economic factors showed a negative relationship with ‘y.’ In other words, when society expected the economy to be on the downturn—such as when banks offered high-interest rates or when there were significant layoffs in companies—people were more inclined to subscribe to a CD. This trend was also more commonly observed among younger married employees.