library(gmodels) # Cross Tables [CrossTable()]
library(ggplot2)
library(ggmosaic) # Mosaic plot with ggplot [geom_mosaic()]
library(corrplot) # Correlation plot [corrplot()]
library(ggpubr) # Arranging ggplots together [ggarrange()]
library(cowplot) # Arranging ggplots together [plot_grid()]
library(caret) # ML [train(), confusionMatrix(), createDataPartition(), varImp(), trainControl()]
library(ROCR) # Model performance [performance(), prediction()]
library(plotROC) # ROC Curve with ggplot [geom_roc()]
library(pROC) # AUC computation [auc()]
library(PRROC) # AUPR computation [pr.curve()]
library(vcd)
library(rpart) # Decision trees [rpart(), plotcp(), prune()]
library(rpart.plot) # Decision trees plotting [rpart.plot()]
library(ranger) # Optimized Random Forest [ranger()]
library(lightgbm) # Light GBM [lgb.train()]
library(xgboost) # XGBoost [xgb.DMatrix(), xgb.train()]
library(MLmetrics) # Custom metrics (F1 score for example)
library(tidyverse) # Data manipulation
# library(doMC) # Parallel processing
bank = read.csv(file = "./bank-additional-full.csv", sep = ";", stringsAsFactors = F)
head(bank)
rows <- nrow(bank)
dim(bank)
## [1] 41188 21
names(bank)
## [1] "age" "job" "marital" "education"
## [5] "default" "housing" "loan" "contact"
## [9] "month" "day_of_week" "duration" "campaign"
## [13] "pdays" "previous" "poutcome" "emp.var.rate"
## [17] "cons.price.idx" "cons.conf.idx" "euribor3m" "nr.employed"
## [21] "y"
CrossTable(bank$y)
##
##
## Cell Contents
## |-------------------------|
## | N |
## | N / Table Total |
## |-------------------------|
##
##
## Total Observations in Table: 41188
##
##
## | no | yes |
## |-----------|-----------|
## | 36548 | 4640 |
## | 0.887 | 0.113 |
## |-----------|-----------|
##
##
##
##
bank = bank %>%
mutate(y = factor(if_else(y == "yes", "1", "0"), levels = c("0", "1")))
sum(is.na(bank))
## [1] 0
There are 12,718 unknown values in the dataset, let’s try to find out which variables suffer the most from those “missing values”.
6 features have at least 1 unknown value. Before deciding how to manage those missing values, we’ll study each variable and take a decision after visualisations. We can’t afford to delete 8,597 rows in our dataset, it’s more than 20% of our observations.
sum(bank == "unknown")
## [1] 12718
# summarise_all is a function from the dplyr package that applies a function to
# all columns in the data frame. In this case, the function is list(~sum(. ==
# 'unknown')), which uses the ~ notation to define an anonymous function that
# counts the number of times the value 'unknown' appears in each column. The
# output is a new data frame with a single row that summarizes the counts for
# each column. gather is a function from the tidyr package that reshapes data
# from wide format to long format. The key parameter specifies the name of the
# new column that will contain the names of the original columns, and the value
# parameter specifies the name of the new column that will contain the counts.
# The resulting data frame has three columns: 'variable', 'nr_unknown', and the
# names of the original columns. arrange is a function from the dplyr package
# that sorts rows based on one or more columns. In this case, -Count specifies
# that the rows should be sorted in descending order based on the values in the
# 'Count' column.
bank %>%
summarise_all(list(~sum(. == "unknown"))) %>%
gather(key = "Variable Name", value = "Unknown_Count") %>%
arrange(-Unknown_Count)
crosstable_f = function(df, x1, x2){
# df: dataframe containing both columns to cross
# var1, var2: columns to cross together.
CrossTable(df[, x1], df[, x2],
prop.r = T, # include raw percentages
prop.c = F,
prop.t = F,
prop.chisq = F,
dnn = c(x1, x2)) # set names of columns
}
mosaic_theme <- theme(axis.text.x = element_text(angle = 90, hjust = 1, vjust = 0.5),
axis.text.y = element_blank(),
axis.ticks.y = element_blank())
fun_mosaic_plot <- function(data, x_var, y_var, x_label, y_label){
data %>%
ggplot() +
geom_mosaic(aes_string(x = paste0("product(", y_var, ",", x_var, ")"), fill = y_var)) +
mosaic_theme +
xlab(x_label) +
ylab(y_label)
}
summary(bank$age)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 17.00 32.00 38.00 40.02 47.00 98.00
bank %>%
ggplot() + # begin plot construction
aes(x = age) + # specify x axis = age
geom_bar() + # plot a bar plot faceted on y
facet_grid(y ~ .,
scales = "free_y") + # scales vary across y, y-axis scales of each panel will be independent of each other
scale_x_continuous(breaks = seq(0, 100, 5))
# first creates the age_60 variable Then, count() is used to compute the
# frequency of each combination of age_60 and y. Next, the data is grouped by
# y and nr_y is computed as the sum of n within each y group.
# Finally, relative_freq is computed as a percentage of n within each elder60-y
# group, and the results are ungrouped and selected for output.
bank %>%
mutate(age_60 = if_else(age > 60, "1", "0")) %>%
count(age_60, y) %>%
group_by(y) %>%
mutate(nr_y = sum(n)) %>%
mutate(relative_freq = round(100 * n/nr_y, 2)) %>%
ungroup() %>%
select(age_60, y, n, relative_freq)
We can also slice the age feature at 30 years to make three easily interpretable classes : [0, 30[, [30, 60[ and [60, +Inf[. The minimum and maximum values are 17 and 98 but we can expect new observations outside this range. We’re replacing the continious variable “age” by this categorical variable.
We might lose some information from this continious-to-discrete transformation, but there wasn’t any clear pattern between years. Cutting into classes make the algorithms easier to interpret later.
bank = bank %>%
mutate(age = if_else(age > 60, "high", if_else(age > 30, "mid", "low")))
crosstable_f(bank, "age", "y")
##
##
## Cell Contents
## |-------------------------|
## | N |
## | N / Row Total |
## |-------------------------|
##
##
## Total Observations in Table: 41188
##
##
## | y
## age | 0 | 1 | Row Total |
## -------------|-----------|-----------|-----------|
## high | 496 | 414 | 910 |
## | 0.545 | 0.455 | 0.022 |
## -------------|-----------|-----------|-----------|
## low | 6259 | 1124 | 7383 |
## | 0.848 | 0.152 | 0.179 |
## -------------|-----------|-----------|-----------|
## mid | 29793 | 3102 | 32895 |
## | 0.906 | 0.094 | 0.799 |
## -------------|-----------|-----------|-----------|
## Column Total | 36548 | 4640 | 41188 |
## -------------|-----------|-----------|-----------|
##
##
# types of jobs
table(bank$job)
##
## admin. blue-collar entrepreneur housemaid management
## 10422 9254 1456 1060 2924
## retired self-employed services student technician
## 1720 1421 3969 875 6743
## unemployed unknown
## 1014 330
crosstable_f(bank, "job", "y")
##
##
## Cell Contents
## |-------------------------|
## | N |
## | N / Row Total |
## |-------------------------|
##
##
## Total Observations in Table: 41188
##
##
## | y
## job | 0 | 1 | Row Total |
## --------------|-----------|-----------|-----------|
## admin. | 9070 | 1352 | 10422 |
## | 0.870 | 0.130 | 0.253 |
## --------------|-----------|-----------|-----------|
## blue-collar | 8616 | 638 | 9254 |
## | 0.931 | 0.069 | 0.225 |
## --------------|-----------|-----------|-----------|
## entrepreneur | 1332 | 124 | 1456 |
## | 0.915 | 0.085 | 0.035 |
## --------------|-----------|-----------|-----------|
## housemaid | 954 | 106 | 1060 |
## | 0.900 | 0.100 | 0.026 |
## --------------|-----------|-----------|-----------|
## management | 2596 | 328 | 2924 |
## | 0.888 | 0.112 | 0.071 |
## --------------|-----------|-----------|-----------|
## retired | 1286 | 434 | 1720 |
## | 0.748 | 0.252 | 0.042 |
## --------------|-----------|-----------|-----------|
## self-employed | 1272 | 149 | 1421 |
## | 0.895 | 0.105 | 0.035 |
## --------------|-----------|-----------|-----------|
## services | 3646 | 323 | 3969 |
## | 0.919 | 0.081 | 0.096 |
## --------------|-----------|-----------|-----------|
## student | 600 | 275 | 875 |
## | 0.686 | 0.314 | 0.021 |
## --------------|-----------|-----------|-----------|
## technician | 6013 | 730 | 6743 |
## | 0.892 | 0.108 | 0.164 |
## --------------|-----------|-----------|-----------|
## unemployed | 870 | 144 | 1014 |
## | 0.858 | 0.142 | 0.025 |
## --------------|-----------|-----------|-----------|
## unknown | 293 | 37 | 330 |
## | 0.888 | 0.112 | 0.008 |
## --------------|-----------|-----------|-----------|
## Column Total | 36548 | 4640 | 41188 |
## --------------|-----------|-----------|-----------|
##
##
bank = bank %>%
filter(job != "unknown")
head(bank)
fun_mosaic_plot(bank, "job", "y", "Job", "Proportion")
crosstable_f(bank, "marital", "y")
##
##
## Cell Contents
## |-------------------------|
## | N |
## | N / Row Total |
## |-------------------------|
##
##
## Total Observations in Table: 40858
##
##
## | y
## marital | 0 | 1 | Row Total |
## -------------|-----------|-----------|-----------|
## divorced | 4126 | 473 | 4599 |
## | 0.897 | 0.103 | 0.113 |
## -------------|-----------|-----------|-----------|
## married | 22178 | 2516 | 24694 |
## | 0.898 | 0.102 | 0.604 |
## -------------|-----------|-----------|-----------|
## single | 9889 | 1605 | 11494 |
## | 0.860 | 0.140 | 0.281 |
## -------------|-----------|-----------|-----------|
## unknown | 62 | 9 | 71 |
## | 0.873 | 0.127 | 0.002 |
## -------------|-----------|-----------|-----------|
## Column Total | 36255 | 4603 | 40858 |
## -------------|-----------|-----------|-----------|
##
##
bank = bank %>%
filter(marital != "unknown")
fun_mosaic_plot(bank, "marital", "y", "Job", "Proportion")
crosstable_f(bank, "education", "y")
##
##
## Cell Contents
## |-------------------------|
## | N |
## | N / Row Total |
## |-------------------------|
##
##
## Total Observations in Table: 40787
##
##
## | y
## education | 0 | 1 | Row Total |
## --------------------|-----------|-----------|-----------|
## basic.4y | 3695 | 423 | 4118 |
## | 0.897 | 0.103 | 0.101 |
## --------------------|-----------|-----------|-----------|
## basic.6y | 2077 | 187 | 2264 |
## | 0.917 | 0.083 | 0.056 |
## --------------------|-----------|-----------|-----------|
## basic.9y | 5536 | 470 | 6006 |
## | 0.922 | 0.078 | 0.147 |
## --------------------|-----------|-----------|-----------|
## high.school | 8436 | 1028 | 9464 |
## | 0.891 | 0.109 | 0.232 |
## --------------------|-----------|-----------|-----------|
## illiterate | 14 | 4 | 18 |
## | 0.778 | 0.222 | 0.000 |
## --------------------|-----------|-----------|-----------|
## professional.course | 4631 | 594 | 5225 |
## | 0.886 | 0.114 | 0.128 |
## --------------------|-----------|-----------|-----------|
## university.degree | 10442 | 1654 | 12096 |
## | 0.863 | 0.137 | 0.297 |
## --------------------|-----------|-----------|-----------|
## unknown | 1362 | 234 | 1596 |
## | 0.853 | 0.147 | 0.039 |
## --------------------|-----------|-----------|-----------|
## Column Total | 36193 | 4594 | 40787 |
## --------------------|-----------|-----------|-----------|
##
##
bank = bank %>%
filter(education != "illiterate")
Among the 1,596 rows containing the “unknown” value, 234 of them subscribed to a term deposit. This is around 5% of the total group of subscribers. Since we’re facing a very unbalanced dependent variable situation, we can not afford to discard those rows. Because this category has the highest relative frequency of “y = 1” (14.7%), we’re going to add them in the “university.degree” level. It has the second highest “y = 1” relative frequency (13.7%).
It appears that a positive correlation between the number of years of education and the odds to subscribe to a term deposit exists.
bank = bank %>%
mutate(education = recode(education, unknown = "university.degree"))
fun_mosaic_plot(bank, "education", "y", "Job", "Proportion")
bank %>%
ggplot() + aes(x = education, y = ..count../rows, fill = y) + geom_bar() + ylab("relative frequency")
crosstable_f(bank, "default", "y")
##
##
## Cell Contents
## |-------------------------|
## | N |
## | N / Row Total |
## |-------------------------|
##
##
## Total Observations in Table: 40769
##
##
## | y
## default | 0 | 1 | Row Total |
## -------------|-----------|-----------|-----------|
## no | 28182 | 4155 | 32337 |
## | 0.872 | 0.128 | 0.793 |
## -------------|-----------|-----------|-----------|
## unknown | 7994 | 435 | 8429 |
## | 0.948 | 0.052 | 0.207 |
## -------------|-----------|-----------|-----------|
## yes | 3 | 0 | 3 |
## | 1.000 | 0.000 | 0.000 |
## -------------|-----------|-----------|-----------|
## Column Total | 36179 | 4590 | 40769 |
## -------------|-----------|-----------|-----------|
##
##
bank = bank %>%
select(-default)
One way to assess the strength and significance of the association between the response and a categorical predictor is by using a chi-squared test.
The chi-squared test helps us to determine whether there is a statistically significant difference in the distribution of the response variable across different categories of the predictor variable. In other words, it tests whether the proportions of the response variable in different levels of the categorical predictor variable are significantly different from each other
The p-value associated to the Chi-squared test equals to 0.065, which is higher than a 0.05-threshold. So, for a confidence level of 95%, there’s no association between the dependent variable y and our feature housing. We’re removing it from the dataset.
# Stacked bar chart for categorical variable and binary response variable
bank %>%
ggplot(aes(x = y, fill = housing)) + geom_bar() + labs(x = "Response Variable",
y = "Count") + scale_fill_discrete(name = "Housing Loan")
crosstable_f(bank, "housing", "y")
##
##
## Cell Contents
## |-------------------------|
## | N |
## | N / Row Total |
## |-------------------------|
##
##
## Total Observations in Table: 40769
##
##
## | y
## housing | 0 | 1 | Row Total |
## -------------|-----------|-----------|-----------|
## no | 16416 | 2003 | 18419 |
## | 0.891 | 0.109 | 0.452 |
## -------------|-----------|-----------|-----------|
## unknown | 877 | 107 | 984 |
## | 0.891 | 0.109 | 0.024 |
## -------------|-----------|-----------|-----------|
## yes | 18886 | 2480 | 21366 |
## | 0.884 | 0.116 | 0.524 |
## -------------|-----------|-----------|-----------|
## Column Total | 36179 | 4590 | 40769 |
## -------------|-----------|-----------|-----------|
##
##
chisq.test(bank$housing, bank$y)
##
## Pearson's Chi-squared test
##
## data: bank$housing and bank$y
## X-squared = 5.4627, df = 2, p-value = 0.06513
bank = bank %>%
select(-housing)
crosstable_f(bank, "loan", "y")
##
##
## Cell Contents
## |-------------------------|
## | N |
## | N / Row Total |
## |-------------------------|
##
##
## Total Observations in Table: 40769
##
##
## | y
## loan | 0 | 1 | Row Total |
## -------------|-----------|-----------|-----------|
## no | 29799 | 3806 | 33605 |
## | 0.887 | 0.113 | 0.824 |
## -------------|-----------|-----------|-----------|
## unknown | 877 | 107 | 984 |
## | 0.891 | 0.109 | 0.024 |
## -------------|-----------|-----------|-----------|
## yes | 5503 | 677 | 6180 |
## | 0.890 | 0.110 | 0.152 |
## -------------|-----------|-----------|-----------|
## Column Total | 36179 | 4590 | 40769 |
## -------------|-----------|-----------|-----------|
##
##
chisq.test(bank$loan, bank$y)
##
## Pearson's Chi-squared test
##
## data: bank$loan and bank$y
## X-squared = 0.86841, df = 2, p-value = 0.6478
bank = bank %>%
select(-loan)
crosstable_f(bank, "contact", "y")
##
##
## Cell Contents
## |-------------------------|
## | N |
## | N / Row Total |
## |-------------------------|
##
##
## Total Observations in Table: 40769
##
##
## | y
## contact | 0 | 1 | Row Total |
## -------------|-----------|-----------|-----------|
## cellular | 22098 | 3815 | 25913 |
## | 0.853 | 0.147 | 0.636 |
## -------------|-----------|-----------|-----------|
## telephone | 14081 | 775 | 14856 |
## | 0.948 | 0.052 | 0.364 |
## -------------|-----------|-----------|-----------|
## Column Total | 36179 | 4590 | 40769 |
## -------------|-----------|-----------|-----------|
##
##
fun_mosaic_plot(bank, "contact", "y", "Job", "Proportion")
bank %>%
ggplot() + aes(x = contact, y = ..count../rows, fill = y) + geom_bar() + ylab("relative frequency")
crosstable_f(bank, "month", "y")
##
##
## Cell Contents
## |-------------------------|
## | N |
## | N / Row Total |
## |-------------------------|
##
##
## Total Observations in Table: 40769
##
##
## | y
## month | 0 | 1 | Row Total |
## -------------|-----------|-----------|-----------|
## apr | 2082 | 536 | 2618 |
## | 0.795 | 0.205 | 0.064 |
## -------------|-----------|-----------|-----------|
## aug | 5459 | 644 | 6103 |
## | 0.894 | 0.106 | 0.150 |
## -------------|-----------|-----------|-----------|
## dec | 92 | 88 | 180 |
## | 0.511 | 0.489 | 0.004 |
## -------------|-----------|-----------|-----------|
## jul | 6471 | 642 | 7113 |
## | 0.910 | 0.090 | 0.174 |
## -------------|-----------|-----------|-----------|
## jun | 4697 | 548 | 5245 |
## | 0.896 | 0.104 | 0.129 |
## -------------|-----------|-----------|-----------|
## mar | 267 | 274 | 541 |
## | 0.494 | 0.506 | 0.013 |
## -------------|-----------|-----------|-----------|
## may | 12734 | 882 | 13616 |
## | 0.935 | 0.065 | 0.334 |
## -------------|-----------|-----------|-----------|
## nov | 3672 | 412 | 4084 |
## | 0.899 | 0.101 | 0.100 |
## -------------|-----------|-----------|-----------|
## oct | 396 | 311 | 707 |
## | 0.560 | 0.440 | 0.017 |
## -------------|-----------|-----------|-----------|
## sep | 309 | 253 | 562 |
## | 0.550 | 0.450 | 0.014 |
## -------------|-----------|-----------|-----------|
## Column Total | 36179 | 4590 | 40769 |
## -------------|-----------|-----------|-----------|
##
##
bank %>%
ggplot() + aes(x = month, y = ..count../rows, fill = y) + geom_bar() + ylab("relative frequency")
crosstable_f(bank, "day_of_week", "y")
##
##
## Cell Contents
## |-------------------------|
## | N |
## | N / Row Total |
## |-------------------------|
##
##
## Total Observations in Table: 40769
##
##
## | y
## day_of_week | 0 | 1 | Row Total |
## -------------|-----------|-----------|-----------|
## fri | 6936 | 839 | 7775 |
## | 0.892 | 0.108 | 0.191 |
## -------------|-----------|-----------|-----------|
## mon | 7578 | 841 | 8419 |
## | 0.900 | 0.100 | 0.207 |
## -------------|-----------|-----------|-----------|
## thu | 7493 | 1031 | 8524 |
## | 0.879 | 0.121 | 0.209 |
## -------------|-----------|-----------|-----------|
## tue | 7056 | 945 | 8001 |
## | 0.882 | 0.118 | 0.196 |
## -------------|-----------|-----------|-----------|
## wed | 7116 | 934 | 8050 |
## | 0.884 | 0.116 | 0.197 |
## -------------|-----------|-----------|-----------|
## Column Total | 36179 | 4590 | 40769 |
## -------------|-----------|-----------|-----------|
##
##
bank %>%
ggplot() + aes(x = day_of_week, y = ..count../rows, fill = y) + geom_bar() +
ylab("relative frequency")
ggplot(bank, aes(x = duration, fill = y)) + geom_density(alpha = 0.5) + ggtitle("Distribution of y and duration") +
xlab("Duration") + ylab("Density")
# How many times was the client contacted during this campaign?
bank %>%
ggplot() + aes(x = campaign, y = y, fill = y) + geom_boxplot() + ylab("relative frequency")
bank %>%
ggplot() + # begin plot construction
aes(x = campaign) + # specify x axis = campaign
geom_bar() + # plot a bar plot faceted on y
facet_grid(y ~ .,
scales = "free_y") + # scales vary across y, y-axis scales of each panel will be independent of each other
scale_x_continuous(breaks = seq(0, 100, 5))
bank = bank %>%
filter(campaign <= 10)
bank %>%
ggplot() + # begin plot construction
aes(x = campaign) + # specify x axis = campaign
geom_bar() + # plot a bar plot faceted on y
facet_grid(y ~ .,
scales = "free_y") + # scales vary across y, y-axis scales of each panel will be independent of each other
scale_x_continuous(breaks = seq(0, 100, 5))
- We can see that more number of times a customer is contacted in a
campaign, the less likely it is they will subscribe for a term
deposit
bank %>%
ggplot() + aes(x = campaign, y = ..count../rows, fill = y) + geom_bar() + ylab("relative frequency")
fun_mosaic_plot(bank, "campaign", "y", "Campaign", "Proportion")
This is the number of days that passed by after the client was last contacted from a previous campaign. 999 value means the client wasn’t previously contacted. Let’s make a dummy out of it.
Clients who haven’t been contacted in a previous campaign will be labeled “0” in the pdays_dummy variable.
Interesting to note that, Recontacting a client after a previous campaign seems to highly increase the odds of subscription
# number of days that passed by after the client was last contacted from a
# previous campaign
bank_counts <- table(bank$pdays)
c <- data.frame(bank_counts)
c
bank = bank %>%
mutate(pdays_dummy = if_else(pdays == 999, "0", "1")) %>%
select(-pdays)
fun_mosaic_plot(bank, "pdays_dummy", "y", "Previous Campaign Contact", "Proportion")
# previous: number of contacts performed before this campaign and for this
# client
fun_mosaic_plot(bank, "previous", "y", "Number of Contacts in Previous Campaign",
"Proportion")
bank %>%
ggplot() + aes(x = previous, y = ..count../rows, fill = y) + geom_bar() + ylab("relative frequency")
crosstable_f(bank, "poutcome", "y")
##
##
## Cell Contents
## |-------------------------|
## | N |
## | N / Row Total |
## |-------------------------|
##
##
## Total Observations in Table: 39912
##
##
## | y
## poutcome | 0 | 1 | Row Total |
## -------------|-----------|-----------|-----------|
## failure | 3616 | 595 | 4211 |
## | 0.859 | 0.141 | 0.106 |
## -------------|-----------|-----------|-----------|
## nonexistent | 31268 | 3085 | 34353 |
## | 0.910 | 0.090 | 0.861 |
## -------------|-----------|-----------|-----------|
## success | 464 | 884 | 1348 |
## | 0.344 | 0.656 | 0.034 |
## -------------|-----------|-----------|-----------|
## Column Total | 35348 | 4564 | 39912 |
## -------------|-----------|-----------|-----------|
##
##
3 month rate is the interest rate at which euro interbank term deposits are offered by one prime bank to another prime bank within the eurozone. It is an important benchmark used in financial markets and is used as a reference rate for various financial instruments. In the context of a person subscribing for a term deposit, a higher euribor 3 month rate generally indicates that interest rates are higher, and the returns on the term deposit will also be higher. This may make the term deposit more attractive to potential investors, including individuals who may be considering subscribing for a term deposit. Conversely, a lower euribor 3 month rate may make term deposits less attractive, as the returns on the investment would be lower.
is an economic indicator that measures changes in the number of employed people in a given population over a certain period, usually a quarter. The rate is calculated as the percentage change in the number of employed people from one period to another. The employment variation rate can have an impact on a person’s decision to subscribe for a term deposit in several ways. For example, if the employment variation rate is high, it may indicate a strong and growing economy, which can lead to higher confidence among consumers and increased spending. This increased spending may include investments in term deposits as a way to save money and earn interest.
library(reshape2)
# Select the variables to include in the correlation matrix
vars <- bank %>%
select(emp.var.rate, cons.price.idx, cons.conf.idx, euribor3m, nr.employed)
# Calculate the correlation matrix
cor_matrix <- cor(vars)
# Convert the correlation matrix to a data frame
cor_df <- cor_matrix %>%
as.data.frame() %>%
rownames_to_column(var = "var1") %>%
pivot_longer(cols = -var1, names_to = "var2", values_to = "correlation")
# Plot the correlation heatmap
ggplot(cor_df, aes(var1, var2)) + geom_tile(aes(fill = correlation)) + scale_fill_gradient2(low = "blue",
mid = "white", high = "red", midpoint = 0) + geom_text(aes(label = round(correlation,
2)), color = "black", size = 3) + theme_minimal() + theme(axis.text.x = element_text(angle = 90)) +
coord_fixed() + labs(title = "Correlation Heatmap", x = "", y = "")
pivot_longer function is used to reshape the data frame from a wide format to a long format. It takes the columns emp.var.rate, cons.price.idx, cons.conf.idx, euribor3m, nr.employed and stacks them into a single column named variable, while creating a new column named value to store the actual values.
facet_wrap function is used to create a faceted plot. It takes the variable column as the facetting variable and sets the scales argument to “free”, so that the scales of each panel can be adjusted independently.
geom_histogram is used to add a histogram layer to the plot.
scale_fill_manual is used to set the fill colors for the two levels of the y variable.
theme is used to customize the appearance of the plot. In this case, it is used to tilt the x axis labels by 90 degrees.
bank %>%
pivot_longer(cols = c(emp.var.rate, cons.price.idx, cons.conf.idx, euribor3m, nr.employed),
names_to = "variable") %>%
ggplot(aes(x = value, fill = y)) +
facet_wrap(~variable, scales = "free") +
geom_histogram(alpha = 0.5, position = "identity", bins = 20) +
xlab("Variable Value") +
ylab("Count") +
ggtitle("Histogram of Financial Variables against Response") +
scale_fill_manual(values = c("#999999", "#E69F00")) + # custom colors for the fill
theme(axis.text.x = element_text(angle = 90))
library(tidyverse)
# Scatterplot matrix
plot(bank %>%
select(emp.var.rate, cons.price.idx, cons.conf.idx, euribor3m, nr.employed))
# Boxplot of employment variation rate by response
ggplot(bank, aes(x = y, y = emp.var.rate)) + geom_boxplot()
# Density plot of consumer price index by response
ggplot(bank, aes(x = cons.price.idx, fill = y)) + geom_density(alpha = 0.5)
# Histogram of consumer confidence index by response
ggplot(bank, aes(x = cons.conf.idx, fill = y)) + geom_histogram(alpha = 0.5, bins = 30)
# Density plot of euribor 3 month rate by response
ggplot(bank, aes(x = euribor3m, fill = y)) + geom_density(alpha = 0.5)
# Boxplot of number of employees by response
ggplot(bank, aes(x = y, y = nr.employed)) + geom_boxplot()
library(ggplot2)
library(dplyr)
# Scatter plots for numeric variables
bank %>%
ggplot(aes(x = emp.var.rate, y = nr.employed)) + geom_point() + labs(x = "Employment Variation Rate",
y = "Number of Employees")
bank %>%
ggplot(aes(x = cons.price.idx, y = euribor3m)) + geom_point() + labs(x = "Consumer Price Index",
y = "Euribor 3 Month Rate")
bank %>%
ggplot(aes(x = cons.conf.idx, y = nr.employed)) + geom_point() + labs(x = "Consumer Confidence Index",
y = "Number of Employees")
# Boxplots for numeric variables and binary response variable
bank %>%
ggplot(aes(x = y, y = emp.var.rate)) + geom_boxplot() + labs(x = "Response Variable",
y = "Employment Variation Rate")
bank %>%
ggplot(aes(x = y, y = cons.price.idx)) + geom_boxplot() + labs(x = "Response Variable",
y = "Consumer Price Index")
bank %>%
ggplot(aes(x = y, y = cons.conf.idx)) + geom_boxplot() + labs(x = "Response Variable",
y = "Consumer Confidence Index")
bank %>%
ggplot(aes(x = y, y = euribor3m)) + geom_boxplot() + labs(x = "Response Variable",
y = "Euribor 3 Month Rate")
bank %>%
ggplot(aes(x = y, y = nr.employed)) + geom_boxplot() + labs(x = "Response Variable",
y = "Number of Employees")
library(rcompanion)
cat_vars = c(2, 3, 4, 5, 6, 7, 8, 9, 10, 15)
y_cV <- setNames(data.frame(matrix(ncol = 4, nrow = 0)), c("Variable", "Pvalue",
"CramerV", "DF"))
for (var in cat_vars) {
ct <- xtabs(~bank[, var] + y, data = bank)
cV <- cramerV(ct)
y_cV[nrow(y_cV) + 1, 1] <- names(bank)[var]
y_cV[nrow(y_cV), 2] <- round(chisq.test(ct)$p.value, 4)
y_cV[nrow(y_cV), 3] <- cV
y_cV[nrow(y_cV), 4] <- min(length(unique(bank[, var])) - 1, length(unique(bank)) -
1)
}
y_cV <- y_cV[order(y_cV$CramerV, decreasing = TRUE), ]
y_cV
# y is independent of housing and loan
# Create a bar plot
ggplot(y_cV[c("Variable", "CramerV")], aes(x = Variable, y = CramerV, fill = CramerV)) +
geom_col() + #scale_fill_gradient(low = 'white', high = 'red') + geom_col()
geom_col() + #scale_fill_gradient(low = 'white', high = 'red') + +
geom_col() + #scale_fill_gradient(low = 'white', high = 'red') + #scale_fill_gradient(low
geom_col() + #scale_fill_gradient(low = 'white', high = 'red') + = 'white',
geom_col() + #scale_fill_gradient(low = 'white', high = 'red') + high =
geom_col() + #scale_fill_gradient(low = 'white', high = 'red') + 'red') +
ggtitle("Cramer V for different variables") + xlab("Variables") + ylab("Cramer V vs Response") +
theme_minimal() + theme(axis.text.x = element_text(angle = 90, vjust = 0.5))
cat_vars2 = c(2, 3, 4, 5, 8, 9, 10, 15)
cat_cV <- setNames(data.frame(matrix(ncol = 4, nrow = 0)), c("Variable1", "Variable2",
"CramerV", "DF"))
for (var1 in cat_vars2) {
for (var2 in cat_vars2) {
if (var1 == var2) {
break
} else {
ct <- xtabs(~bank[, var1] + bank[, var2], data = bank)
cV <- cramerV(ct)
cat_cV[nrow(cat_cV) + 1, 1] <- names(bank)[var1]
cat_cV[nrow(cat_cV), 2] <- names(bank)[var2]
cat_cV[nrow(cat_cV), 3] <- cV
cat_cV[nrow(cat_cV), 4] <- min(length(unique(bank[, var1])) - 1, length(unique(bank[,
var2])) - 1)
}
}
}
cat_cV <- cat_cV[order(cat_cV$CramerV, decreasing = TRUE), ]
cat_cV
library(caret)
# Split the dataset into training (60%), testing (20%), and validation (20%)
# sets
set.seed(123) # For reproducibility
trainIndex <- createDataPartition(bank$y, p = 0.6, list = FALSE, times = 1)
train_data <- bank[trainIndex, ]
test <- bank[-trainIndex, ]
testIndex <- createDataPartition(test$y, p = 0.5, list = FALSE, times = 1)
test_data <- test[-testIndex, ]
val_data <- test[testIndex, ]
# Check the class distribution in each dataset
prop.table(table(bank$y))
##
## 0 1
## 0.8856484 0.1143516
prop.table(table(train_data$y))
##
## 0 1
## 0.8856272 0.1143728
prop.table(table(test_data$y))
##
## 0 1
## 0.8857286 0.1142714
prop.table(table(val_data$y))
##
## 0 1
## 0.885632 0.114368
nrow(train_data)
## [1] 23948
nrow(test_data)
## [1] 7981
nrow(val_data)
## [1] 7983