#############################################
################ DATA #######################
#############################################
library(tidyverse)
## Warning: package 'tidyverse' was built under R version 4.4.2
## Warning: package 'ggplot2' was built under R version 4.4.2
## Warning: package 'tibble' was built under R version 4.4.2
## Warning: package 'tidyr' was built under R version 4.4.2
## Warning: package 'readr' was built under R version 4.4.2
## Warning: package 'purrr' was built under R version 4.4.2
## Warning: package 'dplyr' was built under R version 4.4.2
## Warning: package 'stringr' was built under R version 4.4.2
## Warning: package 'forcats' was built under R version 4.4.2
## Warning: package 'lubridate' was built under R version 4.4.2
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr 1.1.4 ✔ readr 2.1.5
## ✔ forcats 1.0.0 ✔ stringr 1.5.1
## ✔ ggplot2 3.5.1 ✔ tibble 3.2.1
## ✔ lubridate 1.9.4 ✔ tidyr 1.3.1
## ✔ purrr 1.0.2
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(caret)
## Warning: package 'caret' was built under R version 4.4.2
## Loading required package: lattice
##
## Attaching package: 'caret'
##
## The following object is masked from 'package:purrr':
##
## lift
library(pROC)
## Warning: package 'pROC' was built under R version 4.4.2
## Type 'citation("pROC")' for a citation.
##
## Attaching package: 'pROC'
##
## The following objects are masked from 'package:stats':
##
## cov, smooth, var
# Load the dataset
data <- read.csv("bank-additional-full.csv", sep = ";", stringsAsFactors = FALSE)
#Basic information
head(data)
## age job marital education default housing loan contact month
## 1 56 housemaid married basic.4y no no no telephone may
## 2 57 services married high.school unknown no no telephone may
## 3 37 services married high.school no yes no telephone may
## 4 40 admin. married basic.6y no no no telephone may
## 5 56 services married high.school no no yes telephone may
## 6 45 services married basic.9y unknown no no telephone may
## day_of_week duration campaign pdays previous poutcome emp.var.rate
## 1 mon 261 1 999 0 nonexistent 1.1
## 2 mon 149 1 999 0 nonexistent 1.1
## 3 mon 226 1 999 0 nonexistent 1.1
## 4 mon 151 1 999 0 nonexistent 1.1
## 5 mon 307 1 999 0 nonexistent 1.1
## 6 mon 198 1 999 0 nonexistent 1.1
## cons.price.idx cons.conf.idx euribor3m nr.employed y
## 1 93.994 -36.4 4.857 5191 no
## 2 93.994 -36.4 4.857 5191 no
## 3 93.994 -36.4 4.857 5191 no
## 4 93.994 -36.4 4.857 5191 no
## 5 93.994 -36.4 4.857 5191 no
## 6 93.994 -36.4 4.857 5191 no
str(data)
## 'data.frame': 41188 obs. of 21 variables:
## $ age : int 56 57 37 40 56 45 59 41 24 25 ...
## $ job : chr "housemaid" "services" "services" "admin." ...
## $ marital : chr "married" "married" "married" "married" ...
## $ education : chr "basic.4y" "high.school" "high.school" "basic.6y" ...
## $ default : chr "no" "unknown" "no" "no" ...
## $ housing : chr "no" "no" "yes" "no" ...
## $ loan : chr "no" "no" "no" "no" ...
## $ contact : chr "telephone" "telephone" "telephone" "telephone" ...
## $ month : chr "may" "may" "may" "may" ...
## $ day_of_week : chr "mon" "mon" "mon" "mon" ...
## $ duration : int 261 149 226 151 307 198 139 217 380 50 ...
## $ campaign : int 1 1 1 1 1 1 1 1 1 1 ...
## $ pdays : int 999 999 999 999 999 999 999 999 999 999 ...
## $ previous : int 0 0 0 0 0 0 0 0 0 0 ...
## $ poutcome : chr "nonexistent" "nonexistent" "nonexistent" "nonexistent" ...
## $ emp.var.rate : num 1.1 1.1 1.1 1.1 1.1 1.1 1.1 1.1 1.1 1.1 ...
## $ cons.price.idx: num 94 94 94 94 94 ...
## $ cons.conf.idx : num -36.4 -36.4 -36.4 -36.4 -36.4 -36.4 -36.4 -36.4 -36.4 -36.4 ...
## $ euribor3m : num 4.86 4.86 4.86 4.86 4.86 ...
## $ nr.employed : num 5191 5191 5191 5191 5191 ...
## $ y : chr "no" "no" "no" "no" ...
summary(data)
## age job marital education
## Min. :17.00 Length:41188 Length:41188 Length:41188
## 1st Qu.:32.00 Class :character Class :character Class :character
## Median :38.00 Mode :character Mode :character Mode :character
## Mean :40.02
## 3rd Qu.:47.00
## Max. :98.00
## default housing loan contact
## Length:41188 Length:41188 Length:41188 Length:41188
## Class :character Class :character Class :character Class :character
## Mode :character Mode :character Mode :character Mode :character
##
##
##
## month day_of_week duration campaign
## Length:41188 Length:41188 Min. : 0.0 Min. : 1.000
## Class :character Class :character 1st Qu.: 102.0 1st Qu.: 1.000
## Mode :character Mode :character Median : 180.0 Median : 2.000
## Mean : 258.3 Mean : 2.568
## 3rd Qu.: 319.0 3rd Qu.: 3.000
## Max. :4918.0 Max. :56.000
## pdays previous poutcome emp.var.rate
## Min. : 0.0 Min. :0.000 Length:41188 Min. :-3.40000
## 1st Qu.:999.0 1st Qu.:0.000 Class :character 1st Qu.:-1.80000
## Median :999.0 Median :0.000 Mode :character Median : 1.10000
## Mean :962.5 Mean :0.173 Mean : 0.08189
## 3rd Qu.:999.0 3rd Qu.:0.000 3rd Qu.: 1.40000
## Max. :999.0 Max. :7.000 Max. : 1.40000
## cons.price.idx cons.conf.idx euribor3m nr.employed
## Min. :92.20 Min. :-50.8 Min. :0.634 Min. :4964
## 1st Qu.:93.08 1st Qu.:-42.7 1st Qu.:1.344 1st Qu.:5099
## Median :93.75 Median :-41.8 Median :4.857 Median :5191
## Mean :93.58 Mean :-40.5 Mean :3.621 Mean :5167
## 3rd Qu.:93.99 3rd Qu.:-36.4 3rd Qu.:4.961 3rd Qu.:5228
## Max. :94.77 Max. :-26.9 Max. :5.045 Max. :5228
## y
## Length:41188
## Class :character
## Mode :character
##
##
##
####################Preprocessing#####################################
# Convert key variables to factors
categorical_vars <- c("job", "marital", "education", "default", "housing",
"loan", "contact", "month", "day_of_week", "poutcome", "y")
data[categorical_vars] <- lapply(data[categorical_vars], as.factor)
# Create a new factor variable 'previous_contact' (if pdays==999, then no previous contact)
data$previous_contact <- factor(ifelse(data$pdays == 999, "no", "yes"))
# Drop the 'duration' variable (not available at prediction time)
data <- data %>% select(-duration)
# Print structure and summary after preprocessing
str(data)
## 'data.frame': 41188 obs. of 21 variables:
## $ age : int 56 57 37 40 56 45 59 41 24 25 ...
## $ job : Factor w/ 12 levels "admin.","blue-collar",..: 4 8 8 1 8 8 1 2 10 8 ...
## $ marital : Factor w/ 4 levels "divorced","married",..: 2 2 2 2 2 2 2 2 3 3 ...
## $ education : Factor w/ 8 levels "basic.4y","basic.6y",..: 1 4 4 2 4 3 6 8 6 4 ...
## $ default : Factor w/ 3 levels "no","unknown",..: 1 2 1 1 1 2 1 2 1 1 ...
## $ housing : Factor w/ 3 levels "no","unknown",..: 1 1 3 1 1 1 1 1 3 3 ...
## $ loan : Factor w/ 3 levels "no","unknown",..: 1 1 1 1 3 1 1 1 1 1 ...
## $ contact : Factor w/ 2 levels "cellular","telephone": 2 2 2 2 2 2 2 2 2 2 ...
## $ month : Factor w/ 10 levels "apr","aug","dec",..: 7 7 7 7 7 7 7 7 7 7 ...
## $ day_of_week : Factor w/ 5 levels "fri","mon","thu",..: 2 2 2 2 2 2 2 2 2 2 ...
## $ campaign : int 1 1 1 1 1 1 1 1 1 1 ...
## $ pdays : int 999 999 999 999 999 999 999 999 999 999 ...
## $ previous : int 0 0 0 0 0 0 0 0 0 0 ...
## $ poutcome : Factor w/ 3 levels "failure","nonexistent",..: 2 2 2 2 2 2 2 2 2 2 ...
## $ emp.var.rate : num 1.1 1.1 1.1 1.1 1.1 1.1 1.1 1.1 1.1 1.1 ...
## $ cons.price.idx : num 94 94 94 94 94 ...
## $ cons.conf.idx : num -36.4 -36.4 -36.4 -36.4 -36.4 -36.4 -36.4 -36.4 -36.4 -36.4 ...
## $ euribor3m : num 4.86 4.86 4.86 4.86 4.86 ...
## $ nr.employed : num 5191 5191 5191 5191 5191 ...
## $ y : Factor w/ 2 levels "no","yes": 1 1 1 1 1 1 1 1 1 1 ...
## $ previous_contact: Factor w/ 2 levels "no","yes": 1 1 1 1 1 1 1 1 1 1 ...
summary(data)
## age job marital
## Min. :17.00 admin. :10422 divorced: 4612
## 1st Qu.:32.00 blue-collar: 9254 married :24928
## Median :38.00 technician : 6743 single :11568
## Mean :40.02 services : 3969 unknown : 80
## 3rd Qu.:47.00 management : 2924
## Max. :98.00 retired : 1720
## (Other) : 6156
## education default housing loan
## university.degree :12168 no :32588 no :18622 no :33950
## high.school : 9515 unknown: 8597 unknown: 990 unknown: 990
## basic.9y : 6045 yes : 3 yes :21576 yes : 6248
## professional.course: 5243
## basic.4y : 4176
## basic.6y : 2292
## (Other) : 1749
## contact month day_of_week campaign pdays
## cellular :26144 may :13769 fri:7827 Min. : 1.000 Min. : 0.0
## telephone:15044 jul : 7174 mon:8514 1st Qu.: 1.000 1st Qu.:999.0
## aug : 6178 thu:8623 Median : 2.000 Median :999.0
## jun : 5318 tue:8090 Mean : 2.568 Mean :962.5
## nov : 4101 wed:8134 3rd Qu.: 3.000 3rd Qu.:999.0
## apr : 2632 Max. :56.000 Max. :999.0
## (Other): 2016
## previous poutcome emp.var.rate cons.price.idx
## Min. :0.000 failure : 4252 Min. :-3.40000 Min. :92.20
## 1st Qu.:0.000 nonexistent:35563 1st Qu.:-1.80000 1st Qu.:93.08
## Median :0.000 success : 1373 Median : 1.10000 Median :93.75
## Mean :0.173 Mean : 0.08189 Mean :93.58
## 3rd Qu.:0.000 3rd Qu.: 1.40000 3rd Qu.:93.99
## Max. :7.000 Max. : 1.40000 Max. :94.77
##
## cons.conf.idx euribor3m nr.employed y previous_contact
## Min. :-50.8 Min. :0.634 Min. :4964 no :36548 no :39673
## 1st Qu.:-42.7 1st Qu.:1.344 1st Qu.:5099 yes: 4640 yes: 1515
## Median :-41.8 Median :4.857 Median :5191
## Mean :-40.5 Mean :3.621 Mean :5167
## 3rd Qu.:-36.4 3rd Qu.:4.961 3rd Qu.:5228
## Max. :-26.9 Max. :5.045 Max. :5228
##
table(data$previous_contact)
##
## no yes
## 39673 1515
colSums(is.na(data))
## age job marital education
## 0 0 0 0
## default housing loan contact
## 0 0 0 0
## month day_of_week campaign pdays
## 0 0 0 0
## previous poutcome emp.var.rate cons.price.idx
## 0 0 0 0
## cons.conf.idx euribor3m nr.employed y
## 0 0 0 0
## previous_contact
## 0
#############################################
################ Analysis ###################
#############################################
# This script performs Exploratory Data Analysis (EDA) and basic statistical tests.
library(ggplot2)
library(dplyr)
library(GGally)
## Warning: package 'GGally' was built under R version 4.4.2
## Registered S3 method overwritten by 'GGally':
## method from
## +.gg ggplot2
# 1. Age Distribution
ggplot(data, aes(x = age)) +
geom_histogram(binwidth = 5, fill = "skyblue", color = "black") +
theme_minimal() +
labs(title = "Age Distribution", x = "Age", y = "Count")

# 2. Subscription Count
ggplot(data, aes(x = y)) +
geom_bar(fill = "lightgreen", color = "black") +
theme_minimal() +
labs(title = "Subscription Count", x = "Subscription (y)", y = "Count")

yes_count <- sum(data$y == "yes")
no_count <- sum(data$y == "no")
print(prop.test(x = yes_count, n = yes_count + no_count, p = 0.5, alternative = "two.sided"))
##
## 1-sample proportions test with continuity correction
##
## data: yes_count out of yes_count + no_count, null probability 0.5
## X-squared = 24717, df = 1, p-value < 2.2e-16
## alternative hypothesis: true p is not equal to 0.5
## 95 percent confidence interval:
## 0.1096248 0.1157560
## sample estimates:
## p
## 0.1126542
# 3. Age Distribution by Subscription Status (Boxplot)
ggplot(data, aes(x = y, y = age)) +
geom_boxplot(fill = "orange", color = "black") +
theme_minimal() +
labs(title = "Age by Subscription", x = "Subscription (y)", y = "Age")

# Normality test (for 'yes' group only due to sample size)
print(shapiro.test(data$age[data$y == "yes"]))
##
## Shapiro-Wilk normality test
##
## data: data$age[data$y == "yes"]
## W = 0.92629, p-value < 2.2e-16
# Compare age by subscription: t-test and Wilcoxon test
print(t.test(age ~ y, data = data))
##
## Welch Two Sample t-test
##
## data: age by y
## t = -4.7795, df = 5258.5, p-value = 1.805e-06
## alternative hypothesis: true difference in means between group no and group yes is not equal to 0
## 95 percent confidence interval:
## -1.4129336 -0.5909889
## sample estimates:
## mean in group no mean in group yes
## 39.91119 40.91315
print(wilcox.test(age ~ y, data = data))
##
## Wilcoxon rank sum test with continuity correction
##
## data: age by y
## W = 86626887, p-value = 0.01608
## alternative hypothesis: true location shift is not equal to 0
# 4. Job Category Distribution by Subscription
ggplot(data, aes(x = job, fill = y)) +
geom_bar(position = "dodge") +
theme(axis.text.x = element_text(angle = 45, hjust = 1)) +
labs(title = "Job Category by Subscription", x = "Job", y = "Count")

job_table <- table(data$job, data$y)
print(job_table)
##
## no yes
## admin. 9070 1352
## blue-collar 8616 638
## entrepreneur 1332 124
## housemaid 954 106
## management 2596 328
## retired 1286 434
## self-employed 1272 149
## services 3646 323
## student 600 275
## technician 6013 730
## unemployed 870 144
## unknown 293 37
print(chisq.test(job_table))
##
## Pearson's Chi-squared test
##
## data: job_table
## X-squared = 961.24, df = 11, p-value < 2.2e-16
# 5. Distribution of Campaign Contacts
ggplot(data, aes(x = campaign)) +
geom_histogram(binwidth = 1, fill = "purple", color = "black") +
theme_minimal() +
labs(title = "Campaign Contacts", x = "Contacts", y = "Count")

print(summary(data$campaign[data$y == "yes"]))
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 1.000 1.000 2.000 2.052 2.000 23.000
print(summary(data$campaign[data$y == "no"]))
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 1.000 1.000 2.000 2.633 3.000 56.000
print(wilcox.test(campaign ~ y, data = data))
##
## Wilcoxon rank sum test with continuity correction
##
## data: campaign by y
## W = 94153912, p-value < 2.2e-16
## alternative hypothesis: true location shift is not equal to 0
# 6. Subscription Rate by Campaign Contacts
campaign_rates <- data %>%
group_by(campaign) %>%
summarise(
total = n(),
yes_count = sum(y == "yes"),
subscription_rate = yes_count / total
) %>%
arrange(campaign)
print(head(campaign_rates, 10))
## # A tibble: 10 × 4
## campaign total yes_count subscription_rate
## <int> <int> <int> <dbl>
## 1 1 17642 2300 0.130
## 2 2 10570 1211 0.115
## 3 3 5341 574 0.107
## 4 4 2651 249 0.0939
## 5 5 1599 120 0.0750
## 6 6 979 75 0.0766
## 7 7 629 38 0.0604
## 8 8 400 17 0.0425
## 9 9 283 17 0.0601
## 10 10 225 12 0.0533
ggplot(campaign_rates, aes(x = campaign, y = subscription_rate)) +
geom_line() +
geom_point() +
theme_minimal() +
labs(title = "Subscription Rate by Contacts", x = "Contacts", y = "Rate")

# 7. Euribor3m Density by Subscription
ggplot(data, aes(x = euribor3m, fill = y)) +
geom_density(alpha = 0.5) +
theme_minimal() +
labs(title = "Euribor3m Density by Subscription", x = "Euribor3m", y = "Density")

print(summary(data$euribor3m[data$y == "yes"]))
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.634 0.849 1.266 2.123 4.406 5.045
print(summary(data$euribor3m[data$y == "no"]))
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.634 1.405 4.857 3.811 4.962 5.045
print(wilcox.test(euribor3m ~ y, data = data))
##
## Wilcoxon rank sum test with continuity correction
##
## data: euribor3m by y
## W = 126080979, p-value < 2.2e-16
## alternative hypothesis: true location shift is not equal to 0
print(t.test(euribor3m ~ y, data = data))
##
## Welch Two Sample t-test
##
## data: euribor3m by y
## t = 62.58, df = 5729.2, p-value < 2.2e-16
## alternative hypothesis: true difference in means between group no and group yes is not equal to 0
## 95 percent confidence interval:
## 1.635467 1.741246
## sample estimates:
## mean in group no mean in group yes
## 3.811491 2.123135
#############################################
# 8a. Marital Status Distribution and Subscription Rates
#############################################
library(ggplot2)
library(dplyr)
# Proportion Plot: Visualize subscription proportions across marital statuses
plot8a <- ggplot(data, aes(x = marital, fill = y)) +
geom_bar(position = "fill") +
theme_minimal() +
labs(title = "Proportion of Subscription by Marital Status",
x = "Marital Status",
y = "Proportion")
print(plot8a)

# Count Table: Display counts for each combination of marital status and subscription outcome
marital_table <- table(data$marital, data$y)
print(marital_table)
##
## no yes
## divorced 4136 476
## married 22396 2532
## single 9948 1620
## unknown 68 12
# Chi-Square Test: Assess the independence between marital status and subscription
chi_test <- chisq.test(marital_table)
print(chi_test)
##
## Pearson's Chi-squared test
##
## data: marital_table
## X-squared = 122.66, df = 3, p-value < 2.2e-16
#############################################
# 8b. Education Level Distribution and Subscription Rates
#############################################
library(ggplot2)
# Proportion Plot: Visualize subscription proportions across education levels
plot8b <- ggplot(data, aes(x = education, fill = y)) +
geom_bar(position = "fill") +
theme_minimal() +
labs(
title = "Proportion of Subscription by Education Level",
x = "Education",
y = "Proportion"
) +
# Rotate the x-axis labels by 45 degrees
theme(axis.text.x = element_text(angle = 45, hjust = 1))
print(plot8b)

#############################################
# 8c. Financial Status: default, housing, and loan
#############################################
library(ggplot2)
# List of financial status variables
fin_vars <- c("default", "housing", "loan")
for (var in fin_vars) {
cat("Analyzing:", var, "\n")
# 1. Proportion Plot
plot_fin <- ggplot(data, aes_string(x = var, fill = "y")) +
geom_bar(position = "fill") +
theme_minimal() +
labs(
title = paste("Proportion of Subscription by", var),
x = var,
y = "Proportion"
) +
theme(axis.text.x = element_text(angle = 45, hjust = 1))
print(plot_fin)
# 2. Count Table
fin_table <- table(data[[var]], data$y)
print(fin_table)
# 3. Chi-Square Test
fin_chi_test <- chisq.test(fin_table)
print(fin_chi_test)
cat("\n---------------------------------------\n\n")
}
## Analyzing: default
## Warning: `aes_string()` was deprecated in ggplot2 3.0.0.
## ℹ Please use tidy evaluation idioms with `aes()`.
## ℹ See also `vignette("ggplot2-in-packages")` for more information.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.

##
## no yes
## no 28391 4197
## unknown 8154 443
## yes 3 0
## Warning in chisq.test(fin_table): Chi-squared approximation may be incorrect
##
## Pearson's Chi-squared test
##
## data: fin_table
## X-squared = 406.58, df = 2, p-value < 2.2e-16
##
##
## ---------------------------------------
##
## Analyzing: housing

##
## no yes
## no 16596 2026
## unknown 883 107
## yes 19069 2507
##
## Pearson's Chi-squared test
##
## data: fin_table
## X-squared = 5.6845, df = 2, p-value = 0.05829
##
##
## ---------------------------------------
##
## Analyzing: loan

##
## no yes
## no 30100 3850
## unknown 883 107
## yes 5565 683
##
## Pearson's Chi-squared test
##
## data: fin_table
## X-squared = 1.094, df = 2, p-value = 0.5787
##
##
## ---------------------------------------
#############################################
# 8d. Communication and Timing Variables
# (contact, month, day_of_week, poutcome)
#############################################
library(ggplot2)
comm_vars <- c("contact", "month", "day_of_week", "poutcome")
for (var in comm_vars) {
cat("Analyzing:", var, "\n")
# 1. Proportion Plot
plot_comm <- ggplot(data, aes_string(x = var, fill = "y")) +
geom_bar(position = "fill") +
theme_minimal() +
labs(
title = paste("Proportion of Subscription by", var),
x = var,
y = "Proportion"
) +
theme(axis.text.x = element_text(angle = 45, hjust = 1))
print(plot_comm)
# 2. Count Table
comm_table <- table(data[[var]], data$y)
print(comm_table)
# 3. Chi-Square Test
comm_chi_test <- chisq.test(comm_table)
print(comm_chi_test)
cat("\n---------------------------------------\n\n")
}
## Analyzing: contact

##
## no yes
## cellular 22291 3853
## telephone 14257 787
##
## Pearson's Chi-squared test with Yates' continuity correction
##
## data: comm_table
## X-squared = 862.32, df = 1, p-value < 2.2e-16
##
##
## ---------------------------------------
##
## Analyzing: month

##
## no yes
## apr 2093 539
## aug 5523 655
## dec 93 89
## jul 6525 649
## jun 4759 559
## mar 270 276
## may 12883 886
## nov 3685 416
## oct 403 315
## sep 314 256
##
## Pearson's Chi-squared test
##
## data: comm_table
## X-squared = 3101.1, df = 9, p-value < 2.2e-16
##
##
## ---------------------------------------
##
## Analyzing: day_of_week

##
## no yes
## fri 6981 846
## mon 7667 847
## thu 7578 1045
## tue 7137 953
## wed 7185 949
##
## Pearson's Chi-squared test
##
## data: comm_table
## X-squared = 26.145, df = 4, p-value = 2.958e-05
##
##
## ---------------------------------------
##
## Analyzing: poutcome

##
## no yes
## failure 3647 605
## nonexistent 32422 3141
## success 479 894
##
## Pearson's Chi-squared test
##
## data: comm_table
## X-squared = 4230.5, df = 2, p-value < 2.2e-16
##
##
## ---------------------------------------
# Create a data frame for CONTACT counts
contact_df <- data.frame(
contact = c("cellular", "telephone"),
no = c(22291, 14257),
yes = c(3853, 787)
)
# Reshape data for plotting
contact_long <- contact_df %>%
tidyr::pivot_longer(cols = c("no", "yes"), names_to = "subscription", values_to = "count")
# Plot counts for contact
ggplot(contact_long, aes(x = contact, y = count, fill = subscription)) +
geom_bar(stat = "identity", position = "dodge") +
theme_minimal() +
labs(title = "Subscription Counts by Contact Type", x = "Contact Type", y = "Count")

# Create a data frame for MONTH counts (you can add more months)
month_df <- data.frame(
month = c("apr", "aug", "dec", "jul", "mar", "may"),
no = c(2093, 5523, 93, 6525, 270, 12883),
yes = c(539, 655, 89, 649, 276, 886)
)
# Reshape data for plotting
month_long <- month_df %>%
tidyr::pivot_longer(cols = c("no", "yes"), names_to = "subscription", values_to = "count")
# Plot counts for month
ggplot(month_long, aes(x = month, y = count, fill = subscription)) +
geom_bar(stat = "identity", position = "dodge") +
theme_minimal() +
labs(title = "Subscription Counts by Month", x = "Month", y = "Count")

###
# Calculate proportions for contact
contact_df$Total <- contact_df$no + contact_df$yes
contact_df$no_prop <- contact_df$no / contact_df$Total
contact_df$yes_prop <- contact_df$yes / contact_df$Total
# Reshape for plotting
contact_prop_long <- contact_df %>%
select(contact, no_prop, yes_prop) %>%
tidyr::pivot_longer(cols = c("no_prop", "yes_prop"), names_to = "subscription", values_to = "proportion")
# Plot proportions for contact
ggplot(contact_prop_long, aes(x = contact, y = proportion, fill = subscription)) +
geom_bar(stat = "identity", position = "fill") +
theme_minimal() +
labs(title = "Subscription Proportions by Contact Type", x = "Contact Type", y = "Proportion")

# Create a data frame for day_of_week counts
day_df <- data.frame(
day = c("fri", "mon", "thu", "tue", "wed"),
no = c(6981, 7667, 7578, 7137, 7185),
yes = c(846, 847, 1045, 953, 949)
)
# Reshape data for plotting
day_long <- day_df %>%
tidyr::pivot_longer(cols = c("no", "yes"), names_to = "subscription", values_to = "count")
# Plot counts for day_of_week
ggplot(day_long, aes(x = day, y = count, fill = subscription)) +
geom_bar(stat = "identity", position = "dodge") +
theme_minimal() +
labs(title = "Subscription Counts by Day of the Week", x = "Day of the Week", y = "Count")

# Create a data frame for poutcome counts
poutcome_df <- data.frame(
poutcome = c("failure", "nonexistent", "success"),
no = c(3647, 32422, 479),
yes = c(605, 3141, 894)
)
# Reshape data for plotting
poutcome_long <- poutcome_df %>%
tidyr::pivot_longer(cols = c("no", "yes"), names_to = "subscription", values_to = "count")
# Plot counts for poutcome
ggplot(poutcome_long, aes(x = poutcome, y = count, fill = subscription)) +
geom_bar(stat = "identity", position = "dodge") +
theme_minimal() +
labs(title = "Subscription Counts by Outcome of Previous Campaign (poutcome)",
x = "Previous Outcome", y = "Count")

#############################################
# 9a. Economic Indicators Analysis
#############################################
#############################################
# EDA for emp.var.rate (Employment Variation Rate)
#############################################
library(ggplot2)
library(dplyr)
# 1. Summary Statistics
summary(data$emp.var.rate)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## -3.40000 -1.80000 1.10000 0.08189 1.40000 1.40000
# 2. Density Plot (Compare yes vs no)
ggplot(data, aes(x = emp.var.rate, fill = y)) +
geom_density(alpha = 0.5) +
theme_minimal() +
labs(
title = "Density Plot of Employment Variation Rate by Subscription",
x = "Employment Variation Rate",
y = "Density"
)

# 3. Boxplot (Compare yes vs no)
ggplot(data, aes(x = y, y = emp.var.rate, fill = y)) +
geom_boxplot() +
theme_minimal() +
labs(
title = "Boxplot of Employment Variation Rate by Subscription",
x = "Subscription",
y = "Employment Variation Rate"
)

# 4. Statistical Tests (Optional)
# Wilcoxon test for non-normal distributions
wilcox.test(emp.var.rate ~ y, data = data)
##
## Wilcoxon rank sum test with continuity correction
##
## data: emp.var.rate by y
## W = 121548923, p-value < 2.2e-16
## alternative hypothesis: true location shift is not equal to 0
# T-test if the distributions seem approximately normal
t.test(emp.var.rate ~ y, data = data)
##
## Welch Two Sample t-test
##
## data: emp.var.rate by y
## t = 59.137, df = 5665.6, p-value < 2.2e-16
## alternative hypothesis: true difference in means between group no and group yes is not equal to 0
## 95 percent confidence interval:
## 1.433185 1.531463
## sample estimates:
## mean in group no mean in group yes
## 0.2488755 -1.2334483
#############################################
# EDA for cons.price.idx (Consumer Price Index)
#############################################
library(ggplot2)
library(dplyr)
# 1. Summary Statistics
summary(data$cons.price.idx)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 92.20 93.08 93.75 93.58 93.99 94.77
# 2. Density Plot (Compare yes vs no)
ggplot(data, aes(x = cons.price.idx, fill = y)) +
geom_density(alpha = 0.5) +
theme_minimal() +
labs(
title = "Density Plot of Consumer Price Index by Subscription",
x = "Consumer Price Index",
y = "Density"
)

# 3. Boxplot (Compare yes vs no)
ggplot(data, aes(x = y, y = cons.price.idx, fill = y)) +
geom_boxplot() +
theme_minimal() +
labs(
title = "Boxplot of Consumer Price Index by Subscription",
x = "Subscription",
y = "Consumer Price Index"
)

# 4. Statistical Tests (Optional)
# Wilcoxon test for non-normal distributions
wilcox.test(cons.price.idx ~ y, data = data)
##
## Wilcoxon rank sum test with continuity correction
##
## data: cons.price.idx by y
## W = 103540749, p-value < 2.2e-16
## alternative hypothesis: true location shift is not equal to 0
# T-test if the distributions seem approximately normal
t.test(cons.price.idx ~ y, data = data)
##
## Welch Two Sample t-test
##
## data: cons.price.idx by y
## t = 24.082, df = 5472.5, p-value < 2.2e-16
## alternative hypothesis: true difference in means between group no and group yes is not equal to 0
## 95 percent confidence interval:
## 0.2290714 0.2696708
## sample estimates:
## mean in group no mean in group yes
## 93.60376 93.35439
#############################################
# EDA for cons.conf.idx (Consumer Confidence Index)
#############################################
library(ggplot2)
library(dplyr)
# 1. Summary Statistics
summary(data$cons.conf.idx)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## -50.8 -42.7 -41.8 -40.5 -36.4 -26.9
# 2. Density Plot (Compare yes vs no)
ggplot(data, aes(x = cons.conf.idx, fill = y)) +
geom_density(alpha = 0.5) +
theme_minimal() +
labs(
title = "Density Plot of Consumer Confidence Index by Subscription",
x = "Consumer Confidence Index",
y = "Density"
)

# 3. Boxplot (Compare yes vs no)
ggplot(data, aes(x = y, y = cons.conf.idx, fill = y)) +
geom_boxplot() +
theme_minimal() +
labs(
title = "Boxplot of Consumer Confidence Index by Subscription",
x = "Subscription",
y = "Consumer Confidence Index"
)

# 4. Statistical Tests (Optional)
# Wilcoxon test for non-normal distributions
wilcox.test(cons.conf.idx ~ y, data = data)
##
## Wilcoxon rank sum test with continuity correction
##
## data: cons.conf.idx by y
## W = 78464909, p-value < 2.2e-16
## alternative hypothesis: true location shift is not equal to 0
# T-test if the distributions seem approximately normal
t.test(cons.conf.idx ~ y, data = data)
##
## Welch Two Sample t-test
##
## data: cons.conf.idx by y
## t = -8.6365, df = 5258.3, p-value < 2.2e-16
## alternative hypothesis: true difference in means between group no and group yes is not equal to 0
## 95 percent confidence interval:
## -0.9856585 -0.6209660
## sample estimates:
## mean in group no mean in group yes
## -40.59310 -39.78978
#############################################
# EDA for euribor3m (3-Month Euribor Rate)
#############################################
library(ggplot2)
library(dplyr)
# 1. Summary Statistics
summary(data$euribor3m)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.634 1.344 4.857 3.621 4.961 5.045
# 2. Density Plot (Compare yes vs no)
ggplot(data, aes(x = euribor3m, fill = y)) +
geom_density(alpha = 0.5) +
theme_minimal() +
labs(
title = "Density Plot of 3-Month Euribor Rate by Subscription",
x = "3-Month Euribor Rate",
y = "Density"
)

# 3. Boxplot (Compare yes vs no)
ggplot(data, aes(x = y, y = euribor3m, fill = y)) +
geom_boxplot() +
theme_minimal() +
labs(
title = "Boxplot of 3-Month Euribor Rate by Subscription",
x = "Subscription",
y = "3-Month Euribor Rate"
)

# 4. Statistical Tests (Optional)
# Wilcoxon test for non-normal distributions
wilcox.test(euribor3m ~ y, data = data)
##
## Wilcoxon rank sum test with continuity correction
##
## data: euribor3m by y
## W = 126080979, p-value < 2.2e-16
## alternative hypothesis: true location shift is not equal to 0
# T-test if the distributions seem approximately normal
t.test(euribor3m ~ y, data = data)
##
## Welch Two Sample t-test
##
## data: euribor3m by y
## t = 62.58, df = 5729.2, p-value < 2.2e-16
## alternative hypothesis: true difference in means between group no and group yes is not equal to 0
## 95 percent confidence interval:
## 1.635467 1.741246
## sample estimates:
## mean in group no mean in group yes
## 3.811491 2.123135