EDA.R

#############################################
################ DATA #######################
#############################################
library(tidyverse)

## Warning: package 'tidyverse' was built under R version 4.4.2

## Warning: package 'ggplot2' was built under R version 4.4.2

## Warning: package 'tibble' was built under R version 4.4.2

## Warning: package 'tidyr' was built under R version 4.4.2

## Warning: package 'readr' was built under R version 4.4.2

## Warning: package 'purrr' was built under R version 4.4.2

## Warning: package 'dplyr' was built under R version 4.4.2

## Warning: package 'stringr' was built under R version 4.4.2

## Warning: package 'forcats' was built under R version 4.4.2

## Warning: package 'lubridate' was built under R version 4.4.2

## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr     1.1.4     ✔ readr     2.1.5
## ✔ forcats   1.0.0     ✔ stringr   1.5.1
## ✔ ggplot2   3.5.1     ✔ tibble    3.2.1
## ✔ lubridate 1.9.4     ✔ tidyr     1.3.1
## ✔ purrr     1.0.2     
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors

library(caret)

## Warning: package 'caret' was built under R version 4.4.2

## Loading required package: lattice
## 
## Attaching package: 'caret'
## 
## The following object is masked from 'package:purrr':
## 
##     lift

library(pROC)

## Warning: package 'pROC' was built under R version 4.4.2

## Type 'citation("pROC")' for a citation.
## 
## Attaching package: 'pROC'
## 
## The following objects are masked from 'package:stats':
## 
##     cov, smooth, var

# Load the dataset
data <- read.csv("bank-additional-full.csv", sep = ";", stringsAsFactors = FALSE)

#Basic information
head(data)

##   age       job marital   education default housing loan   contact month
## 1  56 housemaid married    basic.4y      no      no   no telephone   may
## 2  57  services married high.school unknown      no   no telephone   may
## 3  37  services married high.school      no     yes   no telephone   may
## 4  40    admin. married    basic.6y      no      no   no telephone   may
## 5  56  services married high.school      no      no  yes telephone   may
## 6  45  services married    basic.9y unknown      no   no telephone   may
##   day_of_week duration campaign pdays previous    poutcome emp.var.rate
## 1         mon      261        1   999        0 nonexistent          1.1
## 2         mon      149        1   999        0 nonexistent          1.1
## 3         mon      226        1   999        0 nonexistent          1.1
## 4         mon      151        1   999        0 nonexistent          1.1
## 5         mon      307        1   999        0 nonexistent          1.1
## 6         mon      198        1   999        0 nonexistent          1.1
##   cons.price.idx cons.conf.idx euribor3m nr.employed  y
## 1         93.994         -36.4     4.857        5191 no
## 2         93.994         -36.4     4.857        5191 no
## 3         93.994         -36.4     4.857        5191 no
## 4         93.994         -36.4     4.857        5191 no
## 5         93.994         -36.4     4.857        5191 no
## 6         93.994         -36.4     4.857        5191 no

str(data)

## 'data.frame':    41188 obs. of  21 variables:
##  $ age           : int  56 57 37 40 56 45 59 41 24 25 ...
##  $ job           : chr  "housemaid" "services" "services" "admin." ...
##  $ marital       : chr  "married" "married" "married" "married" ...
##  $ education     : chr  "basic.4y" "high.school" "high.school" "basic.6y" ...
##  $ default       : chr  "no" "unknown" "no" "no" ...
##  $ housing       : chr  "no" "no" "yes" "no" ...
##  $ loan          : chr  "no" "no" "no" "no" ...
##  $ contact       : chr  "telephone" "telephone" "telephone" "telephone" ...
##  $ month         : chr  "may" "may" "may" "may" ...
##  $ day_of_week   : chr  "mon" "mon" "mon" "mon" ...
##  $ duration      : int  261 149 226 151 307 198 139 217 380 50 ...
##  $ campaign      : int  1 1 1 1 1 1 1 1 1 1 ...
##  $ pdays         : int  999 999 999 999 999 999 999 999 999 999 ...
##  $ previous      : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ poutcome      : chr  "nonexistent" "nonexistent" "nonexistent" "nonexistent" ...
##  $ emp.var.rate  : num  1.1 1.1 1.1 1.1 1.1 1.1 1.1 1.1 1.1 1.1 ...
##  $ cons.price.idx: num  94 94 94 94 94 ...
##  $ cons.conf.idx : num  -36.4 -36.4 -36.4 -36.4 -36.4 -36.4 -36.4 -36.4 -36.4 -36.4 ...
##  $ euribor3m     : num  4.86 4.86 4.86 4.86 4.86 ...
##  $ nr.employed   : num  5191 5191 5191 5191 5191 ...
##  $ y             : chr  "no" "no" "no" "no" ...

summary(data)

##       age            job              marital           education        
##  Min.   :17.00   Length:41188       Length:41188       Length:41188      
##  1st Qu.:32.00   Class :character   Class :character   Class :character  
##  Median :38.00   Mode  :character   Mode  :character   Mode  :character  
##  Mean   :40.02                                                           
##  3rd Qu.:47.00                                                           
##  Max.   :98.00                                                           
##    default            housing              loan             contact         
##  Length:41188       Length:41188       Length:41188       Length:41188      
##  Class :character   Class :character   Class :character   Class :character  
##  Mode  :character   Mode  :character   Mode  :character   Mode  :character  
##                                                                             
##                                                                             
##                                                                             
##     month           day_of_week           duration         campaign     
##  Length:41188       Length:41188       Min.   :   0.0   Min.   : 1.000  
##  Class :character   Class :character   1st Qu.: 102.0   1st Qu.: 1.000  
##  Mode  :character   Mode  :character   Median : 180.0   Median : 2.000  
##                                        Mean   : 258.3   Mean   : 2.568  
##                                        3rd Qu.: 319.0   3rd Qu.: 3.000  
##                                        Max.   :4918.0   Max.   :56.000  
##      pdays          previous       poutcome          emp.var.rate     
##  Min.   :  0.0   Min.   :0.000   Length:41188       Min.   :-3.40000  
##  1st Qu.:999.0   1st Qu.:0.000   Class :character   1st Qu.:-1.80000  
##  Median :999.0   Median :0.000   Mode  :character   Median : 1.10000  
##  Mean   :962.5   Mean   :0.173                      Mean   : 0.08189  
##  3rd Qu.:999.0   3rd Qu.:0.000                      3rd Qu.: 1.40000  
##  Max.   :999.0   Max.   :7.000                      Max.   : 1.40000  
##  cons.price.idx  cons.conf.idx     euribor3m      nr.employed  
##  Min.   :92.20   Min.   :-50.8   Min.   :0.634   Min.   :4964  
##  1st Qu.:93.08   1st Qu.:-42.7   1st Qu.:1.344   1st Qu.:5099  
##  Median :93.75   Median :-41.8   Median :4.857   Median :5191  
##  Mean   :93.58   Mean   :-40.5   Mean   :3.621   Mean   :5167  
##  3rd Qu.:93.99   3rd Qu.:-36.4   3rd Qu.:4.961   3rd Qu.:5228  
##  Max.   :94.77   Max.   :-26.9   Max.   :5.045   Max.   :5228  
##       y            
##  Length:41188      
##  Class :character  
##  Mode  :character  
##                    
##                    
##

####################Preprocessing#####################################
# Convert key variables to factors
categorical_vars <- c("job", "marital", "education", "default", "housing", 
                      "loan", "contact", "month", "day_of_week", "poutcome", "y")
data[categorical_vars] <- lapply(data[categorical_vars], as.factor)

# Create a new factor variable 'previous_contact' (if pdays==999, then no previous contact)
data$previous_contact <- factor(ifelse(data$pdays == 999, "no", "yes"))

# Drop the 'duration' variable (not available at prediction time)
data <- data %>% select(-duration)

# Print structure and summary after preprocessing
str(data)

## 'data.frame':    41188 obs. of  21 variables:
##  $ age             : int  56 57 37 40 56 45 59 41 24 25 ...
##  $ job             : Factor w/ 12 levels "admin.","blue-collar",..: 4 8 8 1 8 8 1 2 10 8 ...
##  $ marital         : Factor w/ 4 levels "divorced","married",..: 2 2 2 2 2 2 2 2 3 3 ...
##  $ education       : Factor w/ 8 levels "basic.4y","basic.6y",..: 1 4 4 2 4 3 6 8 6 4 ...
##  $ default         : Factor w/ 3 levels "no","unknown",..: 1 2 1 1 1 2 1 2 1 1 ...
##  $ housing         : Factor w/ 3 levels "no","unknown",..: 1 1 3 1 1 1 1 1 3 3 ...
##  $ loan            : Factor w/ 3 levels "no","unknown",..: 1 1 1 1 3 1 1 1 1 1 ...
##  $ contact         : Factor w/ 2 levels "cellular","telephone": 2 2 2 2 2 2 2 2 2 2 ...
##  $ month           : Factor w/ 10 levels "apr","aug","dec",..: 7 7 7 7 7 7 7 7 7 7 ...
##  $ day_of_week     : Factor w/ 5 levels "fri","mon","thu",..: 2 2 2 2 2 2 2 2 2 2 ...
##  $ campaign        : int  1 1 1 1 1 1 1 1 1 1 ...
##  $ pdays           : int  999 999 999 999 999 999 999 999 999 999 ...
##  $ previous        : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ poutcome        : Factor w/ 3 levels "failure","nonexistent",..: 2 2 2 2 2 2 2 2 2 2 ...
##  $ emp.var.rate    : num  1.1 1.1 1.1 1.1 1.1 1.1 1.1 1.1 1.1 1.1 ...
##  $ cons.price.idx  : num  94 94 94 94 94 ...
##  $ cons.conf.idx   : num  -36.4 -36.4 -36.4 -36.4 -36.4 -36.4 -36.4 -36.4 -36.4 -36.4 ...
##  $ euribor3m       : num  4.86 4.86 4.86 4.86 4.86 ...
##  $ nr.employed     : num  5191 5191 5191 5191 5191 ...
##  $ y               : Factor w/ 2 levels "no","yes": 1 1 1 1 1 1 1 1 1 1 ...
##  $ previous_contact: Factor w/ 2 levels "no","yes": 1 1 1 1 1 1 1 1 1 1 ...

summary(data)

##       age                 job            marital     
##  Min.   :17.00   admin.     :10422   divorced: 4612  
##  1st Qu.:32.00   blue-collar: 9254   married :24928  
##  Median :38.00   technician : 6743   single  :11568  
##  Mean   :40.02   services   : 3969   unknown :   80  
##  3rd Qu.:47.00   management : 2924                   
##  Max.   :98.00   retired    : 1720                   
##                  (Other)    : 6156                   
##                education        default         housing           loan      
##  university.degree  :12168   no     :32588   no     :18622   no     :33950  
##  high.school        : 9515   unknown: 8597   unknown:  990   unknown:  990  
##  basic.9y           : 6045   yes    :    3   yes    :21576   yes    : 6248  
##  professional.course: 5243                                                  
##  basic.4y           : 4176                                                  
##  basic.6y           : 2292                                                  
##  (Other)            : 1749                                                  
##       contact          month       day_of_week    campaign          pdays      
##  cellular :26144   may    :13769   fri:7827    Min.   : 1.000   Min.   :  0.0  
##  telephone:15044   jul    : 7174   mon:8514    1st Qu.: 1.000   1st Qu.:999.0  
##                    aug    : 6178   thu:8623    Median : 2.000   Median :999.0  
##                    jun    : 5318   tue:8090    Mean   : 2.568   Mean   :962.5  
##                    nov    : 4101   wed:8134    3rd Qu.: 3.000   3rd Qu.:999.0  
##                    apr    : 2632               Max.   :56.000   Max.   :999.0  
##                    (Other): 2016                                               
##     previous            poutcome      emp.var.rate      cons.price.idx 
##  Min.   :0.000   failure    : 4252   Min.   :-3.40000   Min.   :92.20  
##  1st Qu.:0.000   nonexistent:35563   1st Qu.:-1.80000   1st Qu.:93.08  
##  Median :0.000   success    : 1373   Median : 1.10000   Median :93.75  
##  Mean   :0.173                       Mean   : 0.08189   Mean   :93.58  
##  3rd Qu.:0.000                       3rd Qu.: 1.40000   3rd Qu.:93.99  
##  Max.   :7.000                       Max.   : 1.40000   Max.   :94.77  
##                                                                        
##  cons.conf.idx     euribor3m      nr.employed     y         previous_contact
##  Min.   :-50.8   Min.   :0.634   Min.   :4964   no :36548   no :39673       
##  1st Qu.:-42.7   1st Qu.:1.344   1st Qu.:5099   yes: 4640   yes: 1515       
##  Median :-41.8   Median :4.857   Median :5191                               
##  Mean   :-40.5   Mean   :3.621   Mean   :5167                               
##  3rd Qu.:-36.4   3rd Qu.:4.961   3rd Qu.:5228                               
##  Max.   :-26.9   Max.   :5.045   Max.   :5228                               
##

table(data$previous_contact)

## 
##    no   yes 
## 39673  1515

colSums(is.na(data))

##              age              job          marital        education 
##                0                0                0                0 
##          default          housing             loan          contact 
##                0                0                0                0 
##            month      day_of_week         campaign            pdays 
##                0                0                0                0 
##         previous         poutcome     emp.var.rate   cons.price.idx 
##                0                0                0                0 
##    cons.conf.idx        euribor3m      nr.employed                y 
##                0                0                0                0 
## previous_contact 
##                0

#############################################
################ Analysis ###################
#############################################
# This script performs Exploratory Data Analysis (EDA) and basic statistical tests.
library(ggplot2)
library(dplyr)
library(GGally)

## Warning: package 'GGally' was built under R version 4.4.2

## Registered S3 method overwritten by 'GGally':
##   method from   
##   +.gg   ggplot2

# 1. Age Distribution
ggplot(data, aes(x = age)) +
  geom_histogram(binwidth = 5, fill = "skyblue", color = "black") +
  theme_minimal() +
  labs(title = "Age Distribution", x = "Age", y = "Count")

# 2. Subscription Count
ggplot(data, aes(x = y)) +
  geom_bar(fill = "lightgreen", color = "black") +
  theme_minimal() +
  labs(title = "Subscription Count", x = "Subscription (y)", y = "Count")

yes_count <- sum(data$y == "yes")
no_count  <- sum(data$y == "no")
print(prop.test(x = yes_count, n = yes_count + no_count, p = 0.5, alternative = "two.sided"))

## 
##  1-sample proportions test with continuity correction
## 
## data:  yes_count out of yes_count + no_count, null probability 0.5
## X-squared = 24717, df = 1, p-value < 2.2e-16
## alternative hypothesis: true p is not equal to 0.5
## 95 percent confidence interval:
##  0.1096248 0.1157560
## sample estimates:
##         p 
## 0.1126542

# 3. Age Distribution by Subscription Status (Boxplot)
ggplot(data, aes(x = y, y = age)) +
  geom_boxplot(fill = "orange", color = "black") +
  theme_minimal() +
  labs(title = "Age by Subscription", x = "Subscription (y)", y = "Age")

# Normality test (for 'yes' group only due to sample size)
print(shapiro.test(data$age[data$y == "yes"]))

## 
##  Shapiro-Wilk normality test
## 
## data:  data$age[data$y == "yes"]
## W = 0.92629, p-value < 2.2e-16

# Compare age by subscription: t-test and Wilcoxon test
print(t.test(age ~ y, data = data))

## 
##  Welch Two Sample t-test
## 
## data:  age by y
## t = -4.7795, df = 5258.5, p-value = 1.805e-06
## alternative hypothesis: true difference in means between group no and group yes is not equal to 0
## 95 percent confidence interval:
##  -1.4129336 -0.5909889
## sample estimates:
##  mean in group no mean in group yes 
##          39.91119          40.91315

print(wilcox.test(age ~ y, data = data))

## 
##  Wilcoxon rank sum test with continuity correction
## 
## data:  age by y
## W = 86626887, p-value = 0.01608
## alternative hypothesis: true location shift is not equal to 0

# 4. Job Category Distribution by Subscription
ggplot(data, aes(x = job, fill = y)) +
  geom_bar(position = "dodge") +
  theme(axis.text.x = element_text(angle = 45, hjust = 1)) +
  labs(title = "Job Category by Subscription", x = "Job", y = "Count")

job_table <- table(data$job, data$y)
print(job_table)

##                
##                   no  yes
##   admin.        9070 1352
##   blue-collar   8616  638
##   entrepreneur  1332  124
##   housemaid      954  106
##   management    2596  328
##   retired       1286  434
##   self-employed 1272  149
##   services      3646  323
##   student        600  275
##   technician    6013  730
##   unemployed     870  144
##   unknown        293   37

print(chisq.test(job_table))

## 
##  Pearson's Chi-squared test
## 
## data:  job_table
## X-squared = 961.24, df = 11, p-value < 2.2e-16

# 5. Distribution of Campaign Contacts
ggplot(data, aes(x = campaign)) +
  geom_histogram(binwidth = 1, fill = "purple", color = "black") +
  theme_minimal() +
  labs(title = "Campaign Contacts", x = "Contacts", y = "Count")

print(summary(data$campaign[data$y == "yes"]))

##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##   1.000   1.000   2.000   2.052   2.000  23.000

print(summary(data$campaign[data$y == "no"]))

##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##   1.000   1.000   2.000   2.633   3.000  56.000

print(wilcox.test(campaign ~ y, data = data))

## 
##  Wilcoxon rank sum test with continuity correction
## 
## data:  campaign by y
## W = 94153912, p-value < 2.2e-16
## alternative hypothesis: true location shift is not equal to 0

# 6. Subscription Rate by Campaign Contacts
campaign_rates <- data %>%
  group_by(campaign) %>%
  summarise(
    total = n(),
    yes_count = sum(y == "yes"),
    subscription_rate = yes_count / total
  ) %>%
  arrange(campaign)
print(head(campaign_rates, 10))

## # A tibble: 10 × 4
##    campaign total yes_count subscription_rate
##       <int> <int>     <int>             <dbl>
##  1        1 17642      2300            0.130 
##  2        2 10570      1211            0.115 
##  3        3  5341       574            0.107 
##  4        4  2651       249            0.0939
##  5        5  1599       120            0.0750
##  6        6   979        75            0.0766
##  7        7   629        38            0.0604
##  8        8   400        17            0.0425
##  9        9   283        17            0.0601
## 10       10   225        12            0.0533

ggplot(campaign_rates, aes(x = campaign, y = subscription_rate)) +
  geom_line() +
  geom_point() +
  theme_minimal() +
  labs(title = "Subscription Rate by Contacts", x = "Contacts", y = "Rate")

# 7. Euribor3m Density by Subscription
ggplot(data, aes(x = euribor3m, fill = y)) +
  geom_density(alpha = 0.5) +
  theme_minimal() +
  labs(title = "Euribor3m Density by Subscription", x = "Euribor3m", y = "Density")

print(summary(data$euribor3m[data$y == "yes"]))

##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##   0.634   0.849   1.266   2.123   4.406   5.045

print(summary(data$euribor3m[data$y == "no"]))

##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##   0.634   1.405   4.857   3.811   4.962   5.045

print(wilcox.test(euribor3m ~ y, data = data))

## 
##  Wilcoxon rank sum test with continuity correction
## 
## data:  euribor3m by y
## W = 126080979, p-value < 2.2e-16
## alternative hypothesis: true location shift is not equal to 0

print(t.test(euribor3m ~ y, data = data))

## 
##  Welch Two Sample t-test
## 
## data:  euribor3m by y
## t = 62.58, df = 5729.2, p-value < 2.2e-16
## alternative hypothesis: true difference in means between group no and group yes is not equal to 0
## 95 percent confidence interval:
##  1.635467 1.741246
## sample estimates:
##  mean in group no mean in group yes 
##          3.811491          2.123135

#############################################
# 8a. Marital Status Distribution and Subscription Rates
#############################################
library(ggplot2)
library(dplyr)

# Proportion Plot: Visualize subscription proportions across marital statuses
plot8a <- ggplot(data, aes(x = marital, fill = y)) +
  geom_bar(position = "fill") +
  theme_minimal() +
  labs(title = "Proportion of Subscription by Marital Status", 
       x = "Marital Status", 
       y = "Proportion")
print(plot8a)

# Count Table: Display counts for each combination of marital status and subscription outcome
marital_table <- table(data$marital, data$y)
print(marital_table)

##           
##               no   yes
##   divorced  4136   476
##   married  22396  2532
##   single    9948  1620
##   unknown     68    12

# Chi-Square Test: Assess the independence between marital status and subscription
chi_test <- chisq.test(marital_table)
print(chi_test)

## 
##  Pearson's Chi-squared test
## 
## data:  marital_table
## X-squared = 122.66, df = 3, p-value < 2.2e-16

#############################################
# 8b. Education Level Distribution and Subscription Rates
#############################################
library(ggplot2)

# Proportion Plot: Visualize subscription proportions across education levels
plot8b <- ggplot(data, aes(x = education, fill = y)) +
  geom_bar(position = "fill") +
  theme_minimal() +
  labs(
    title = "Proportion of Subscription by Education Level", 
    x = "Education", 
    y = "Proportion"
  ) +
  # Rotate the x-axis labels by 45 degrees
  theme(axis.text.x = element_text(angle = 45, hjust = 1))

print(plot8b)

#############################################
# 8c. Financial Status: default, housing, and loan
#############################################
library(ggplot2)

# List of financial status variables
fin_vars <- c("default", "housing", "loan")

for (var in fin_vars) {
  cat("Analyzing:", var, "\n")
  
  # 1. Proportion Plot
  plot_fin <- ggplot(data, aes_string(x = var, fill = "y")) +
    geom_bar(position = "fill") +
    theme_minimal() +
    labs(
      title = paste("Proportion of Subscription by", var),
      x = var,
      y = "Proportion"
    ) +
    theme(axis.text.x = element_text(angle = 45, hjust = 1))
  print(plot_fin)
  
  # 2. Count Table
  fin_table <- table(data[[var]], data$y)
  print(fin_table)
  
  # 3. Chi-Square Test
  fin_chi_test <- chisq.test(fin_table)
  print(fin_chi_test)
  
  cat("\n---------------------------------------\n\n")
}

## Analyzing: default

## Warning: `aes_string()` was deprecated in ggplot2 3.0.0.
## ℹ Please use tidy evaluation idioms with `aes()`.
## ℹ See also `vignette("ggplot2-in-packages")` for more information.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.

##          
##              no   yes
##   no      28391  4197
##   unknown  8154   443
##   yes         3     0

## Warning in chisq.test(fin_table): Chi-squared approximation may be incorrect

## 
##  Pearson's Chi-squared test
## 
## data:  fin_table
## X-squared = 406.58, df = 2, p-value < 2.2e-16
## 
## 
## ---------------------------------------
## 
## Analyzing: housing

##          
##              no   yes
##   no      16596  2026
##   unknown   883   107
##   yes     19069  2507
## 
##  Pearson's Chi-squared test
## 
## data:  fin_table
## X-squared = 5.6845, df = 2, p-value = 0.05829
## 
## 
## ---------------------------------------
## 
## Analyzing: loan

##          
##              no   yes
##   no      30100  3850
##   unknown   883   107
##   yes      5565   683
## 
##  Pearson's Chi-squared test
## 
## data:  fin_table
## X-squared = 1.094, df = 2, p-value = 0.5787
## 
## 
## ---------------------------------------

#############################################
# 8d. Communication and Timing Variables
# (contact, month, day_of_week, poutcome)
#############################################
library(ggplot2)

comm_vars <- c("contact", "month", "day_of_week", "poutcome")

for (var in comm_vars) {
  cat("Analyzing:", var, "\n")
  
  # 1. Proportion Plot
  plot_comm <- ggplot(data, aes_string(x = var, fill = "y")) +
    geom_bar(position = "fill") +
    theme_minimal() +
    labs(
      title = paste("Proportion of Subscription by", var),
      x = var,
      y = "Proportion"
    ) +
    theme(axis.text.x = element_text(angle = 45, hjust = 1))
  print(plot_comm)
  
  # 2. Count Table
  comm_table <- table(data[[var]], data$y)
  print(comm_table)
  
  # 3. Chi-Square Test
  comm_chi_test <- chisq.test(comm_table)
  print(comm_chi_test)
  
  cat("\n---------------------------------------\n\n")
}

## Analyzing: contact

##            
##                no   yes
##   cellular  22291  3853
##   telephone 14257   787
## 
##  Pearson's Chi-squared test with Yates' continuity correction
## 
## data:  comm_table
## X-squared = 862.32, df = 1, p-value < 2.2e-16
## 
## 
## ---------------------------------------
## 
## Analyzing: month

##      
##          no   yes
##   apr  2093   539
##   aug  5523   655
##   dec    93    89
##   jul  6525   649
##   jun  4759   559
##   mar   270   276
##   may 12883   886
##   nov  3685   416
##   oct   403   315
##   sep   314   256
## 
##  Pearson's Chi-squared test
## 
## data:  comm_table
## X-squared = 3101.1, df = 9, p-value < 2.2e-16
## 
## 
## ---------------------------------------
## 
## Analyzing: day_of_week

##      
##         no  yes
##   fri 6981  846
##   mon 7667  847
##   thu 7578 1045
##   tue 7137  953
##   wed 7185  949
## 
##  Pearson's Chi-squared test
## 
## data:  comm_table
## X-squared = 26.145, df = 4, p-value = 2.958e-05
## 
## 
## ---------------------------------------
## 
## Analyzing: poutcome

##              
##                  no   yes
##   failure      3647   605
##   nonexistent 32422  3141
##   success       479   894
## 
##  Pearson's Chi-squared test
## 
## data:  comm_table
## X-squared = 4230.5, df = 2, p-value < 2.2e-16
## 
## 
## ---------------------------------------

# Create a data frame for CONTACT counts
contact_df <- data.frame(
  contact = c("cellular", "telephone"),
  no = c(22291, 14257),
  yes = c(3853, 787)
)

# Reshape data for plotting
contact_long <- contact_df %>%
  tidyr::pivot_longer(cols = c("no", "yes"), names_to = "subscription", values_to = "count")

# Plot counts for contact
ggplot(contact_long, aes(x = contact, y = count, fill = subscription)) +
  geom_bar(stat = "identity", position = "dodge") +
  theme_minimal() +
  labs(title = "Subscription Counts by Contact Type", x = "Contact Type", y = "Count")

# Create a data frame for MONTH counts (you can add more months)
month_df <- data.frame(
  month = c("apr", "aug", "dec", "jul", "mar", "may"),
  no = c(2093, 5523, 93, 6525, 270, 12883),
  yes = c(539, 655, 89, 649, 276, 886)
)

# Reshape data for plotting
month_long <- month_df %>%
  tidyr::pivot_longer(cols = c("no", "yes"), names_to = "subscription", values_to = "count")

# Plot counts for month
ggplot(month_long, aes(x = month, y = count, fill = subscription)) +
  geom_bar(stat = "identity", position = "dodge") +
  theme_minimal() +
  labs(title = "Subscription Counts by Month", x = "Month", y = "Count")

###
# Calculate proportions for contact
contact_df$Total <- contact_df$no + contact_df$yes
contact_df$no_prop <- contact_df$no / contact_df$Total
contact_df$yes_prop <- contact_df$yes / contact_df$Total

# Reshape for plotting
contact_prop_long <- contact_df %>%
  select(contact, no_prop, yes_prop) %>%
  tidyr::pivot_longer(cols = c("no_prop", "yes_prop"), names_to = "subscription", values_to = "proportion")

# Plot proportions for contact
ggplot(contact_prop_long, aes(x = contact, y = proportion, fill = subscription)) +
  geom_bar(stat = "identity", position = "fill") +
  theme_minimal() +
  labs(title = "Subscription Proportions by Contact Type", x = "Contact Type", y = "Proportion")

# Create a data frame for day_of_week counts
day_df <- data.frame(
  day = c("fri", "mon", "thu", "tue", "wed"),
  no = c(6981, 7667, 7578, 7137, 7185),
  yes = c(846, 847, 1045, 953, 949)
)

# Reshape data for plotting
day_long <- day_df %>%
  tidyr::pivot_longer(cols = c("no", "yes"), names_to = "subscription", values_to = "count")

# Plot counts for day_of_week
ggplot(day_long, aes(x = day, y = count, fill = subscription)) +
  geom_bar(stat = "identity", position = "dodge") +
  theme_minimal() +
  labs(title = "Subscription Counts by Day of the Week", x = "Day of the Week", y = "Count")

# Create a data frame for poutcome counts
poutcome_df <- data.frame(
  poutcome = c("failure", "nonexistent", "success"),
  no = c(3647, 32422, 479),
  yes = c(605, 3141, 894)
)

# Reshape data for plotting
poutcome_long <- poutcome_df %>%
  tidyr::pivot_longer(cols = c("no", "yes"), names_to = "subscription", values_to = "count")

# Plot counts for poutcome
ggplot(poutcome_long, aes(x = poutcome, y = count, fill = subscription)) +
  geom_bar(stat = "identity", position = "dodge") +
  theme_minimal() +
  labs(title = "Subscription Counts by Outcome of Previous Campaign (poutcome)",
       x = "Previous Outcome", y = "Count")

#############################################
# 9a. Economic Indicators Analysis
#############################################
#############################################
# EDA for emp.var.rate (Employment Variation Rate)
#############################################

library(ggplot2)
library(dplyr)

# 1. Summary Statistics
summary(data$emp.var.rate)

##     Min.  1st Qu.   Median     Mean  3rd Qu.     Max. 
## -3.40000 -1.80000  1.10000  0.08189  1.40000  1.40000

# 2. Density Plot (Compare yes vs no)
ggplot(data, aes(x = emp.var.rate, fill = y)) +
  geom_density(alpha = 0.5) +
  theme_minimal() +
  labs(
    title = "Density Plot of Employment Variation Rate by Subscription",
    x = "Employment Variation Rate",
    y = "Density"
  )

# 3. Boxplot (Compare yes vs no)
ggplot(data, aes(x = y, y = emp.var.rate, fill = y)) +
  geom_boxplot() +
  theme_minimal() +
  labs(
    title = "Boxplot of Employment Variation Rate by Subscription",
    x = "Subscription",
    y = "Employment Variation Rate"
  )

# 4. Statistical Tests (Optional)
# Wilcoxon test for non-normal distributions
wilcox.test(emp.var.rate ~ y, data = data)

## 
##  Wilcoxon rank sum test with continuity correction
## 
## data:  emp.var.rate by y
## W = 121548923, p-value < 2.2e-16
## alternative hypothesis: true location shift is not equal to 0

# T-test if the distributions seem approximately normal
t.test(emp.var.rate ~ y, data = data)

## 
##  Welch Two Sample t-test
## 
## data:  emp.var.rate by y
## t = 59.137, df = 5665.6, p-value < 2.2e-16
## alternative hypothesis: true difference in means between group no and group yes is not equal to 0
## 95 percent confidence interval:
##  1.433185 1.531463
## sample estimates:
##  mean in group no mean in group yes 
##         0.2488755        -1.2334483

#############################################
# EDA for cons.price.idx (Consumer Price Index)
#############################################

library(ggplot2)
library(dplyr)

# 1. Summary Statistics
summary(data$cons.price.idx)

##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##   92.20   93.08   93.75   93.58   93.99   94.77

# 2. Density Plot (Compare yes vs no)
ggplot(data, aes(x = cons.price.idx, fill = y)) +
  geom_density(alpha = 0.5) +
  theme_minimal() +
  labs(
    title = "Density Plot of Consumer Price Index by Subscription",
    x = "Consumer Price Index",
    y = "Density"
  )

# 3. Boxplot (Compare yes vs no)
ggplot(data, aes(x = y, y = cons.price.idx, fill = y)) +
  geom_boxplot() +
  theme_minimal() +
  labs(
    title = "Boxplot of Consumer Price Index by Subscription",
    x = "Subscription",
    y = "Consumer Price Index"
  )

# 4. Statistical Tests (Optional)
# Wilcoxon test for non-normal distributions
wilcox.test(cons.price.idx ~ y, data = data)

## 
##  Wilcoxon rank sum test with continuity correction
## 
## data:  cons.price.idx by y
## W = 103540749, p-value < 2.2e-16
## alternative hypothesis: true location shift is not equal to 0

# T-test if the distributions seem approximately normal
t.test(cons.price.idx ~ y, data = data)

## 
##  Welch Two Sample t-test
## 
## data:  cons.price.idx by y
## t = 24.082, df = 5472.5, p-value < 2.2e-16
## alternative hypothesis: true difference in means between group no and group yes is not equal to 0
## 95 percent confidence interval:
##  0.2290714 0.2696708
## sample estimates:
##  mean in group no mean in group yes 
##          93.60376          93.35439

#############################################
# EDA for cons.conf.idx (Consumer Confidence Index)
#############################################

library(ggplot2)
library(dplyr)

# 1. Summary Statistics
summary(data$cons.conf.idx)

##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##   -50.8   -42.7   -41.8   -40.5   -36.4   -26.9

# 2. Density Plot (Compare yes vs no)
ggplot(data, aes(x = cons.conf.idx, fill = y)) +
  geom_density(alpha = 0.5) +
  theme_minimal() +
  labs(
    title = "Density Plot of Consumer Confidence Index by Subscription",
    x = "Consumer Confidence Index",
    y = "Density"
  )

# 3. Boxplot (Compare yes vs no)
ggplot(data, aes(x = y, y = cons.conf.idx, fill = y)) +
  geom_boxplot() +
  theme_minimal() +
  labs(
    title = "Boxplot of Consumer Confidence Index by Subscription",
    x = "Subscription",
    y = "Consumer Confidence Index"
  )

# 4. Statistical Tests (Optional)
# Wilcoxon test for non-normal distributions
wilcox.test(cons.conf.idx ~ y, data = data)

## 
##  Wilcoxon rank sum test with continuity correction
## 
## data:  cons.conf.idx by y
## W = 78464909, p-value < 2.2e-16
## alternative hypothesis: true location shift is not equal to 0

# T-test if the distributions seem approximately normal
t.test(cons.conf.idx ~ y, data = data)

## 
##  Welch Two Sample t-test
## 
## data:  cons.conf.idx by y
## t = -8.6365, df = 5258.3, p-value < 2.2e-16
## alternative hypothesis: true difference in means between group no and group yes is not equal to 0
## 95 percent confidence interval:
##  -0.9856585 -0.6209660
## sample estimates:
##  mean in group no mean in group yes 
##         -40.59310         -39.78978

#############################################
# EDA for euribor3m (3-Month Euribor Rate)
#############################################

library(ggplot2)
library(dplyr)

# 1. Summary Statistics
summary(data$euribor3m)

##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##   0.634   1.344   4.857   3.621   4.961   5.045

# 2. Density Plot (Compare yes vs no)
ggplot(data, aes(x = euribor3m, fill = y)) +
  geom_density(alpha = 0.5) +
  theme_minimal() +
  labs(
    title = "Density Plot of 3-Month Euribor Rate by Subscription",
    x = "3-Month Euribor Rate",
    y = "Density"
  )

# 3. Boxplot (Compare yes vs no)
ggplot(data, aes(x = y, y = euribor3m, fill = y)) +
  geom_boxplot() +
  theme_minimal() +
  labs(
    title = "Boxplot of 3-Month Euribor Rate by Subscription",
    x = "Subscription",
    y = "3-Month Euribor Rate"
  )

# 4. Statistical Tests (Optional)
# Wilcoxon test for non-normal distributions
wilcox.test(euribor3m ~ y, data = data)

## 
##  Wilcoxon rank sum test with continuity correction
## 
## data:  euribor3m by y
## W = 126080979, p-value < 2.2e-16
## alternative hypothesis: true location shift is not equal to 0

# T-test if the distributions seem approximately normal
t.test(euribor3m ~ y, data = data)

## 
##  Welch Two Sample t-test
## 
## data:  euribor3m by y
## t = 62.58, df = 5729.2, p-value < 2.2e-16
## alternative hypothesis: true difference in means between group no and group yes is not equal to 0
## 95 percent confidence interval:
##  1.635467 1.741246
## sample estimates:
##  mean in group no mean in group yes 
##          3.811491          2.123135

EDA.R

ASUS

2025-03-10