#loading libraries
library(tidyverse)
library(ggplot2)
library(corrplot)
library(DescTools)
library(vcd)
library(reshape2)

EDA

# Pull the bank marketing CSV directly from GitHub. read.csv2() is used
# because it defaults to ";" as the field separator; text columns are
# converted to factors so the categorical analysis below can use them as-is.
Bank_data <- read.csv2(
  file = "https://raw.githubusercontent.com/Andreina-A/Data-622/refs/heads/main/bank-full.csv",
  stringsAsFactors = TRUE
)
# Preview the first rows
Bank_data |>
  head()
##   age          job marital education default balance housing loan contact day
## 1  58   management married  tertiary      no    2143     yes   no unknown   5
## 2  44   technician  single secondary      no      29     yes   no unknown   5
## 3  33 entrepreneur married secondary      no       2     yes  yes unknown   5
## 4  47  blue-collar married   unknown      no    1506     yes   no unknown   5
## 5  33      unknown  single   unknown      no       1      no   no unknown   5
## 6  35   management married  tertiary      no     231     yes   no unknown   5
##   month duration campaign pdays previous poutcome  y
## 1   may      261        1    -1        0  unknown no
## 2   may      151        1    -1        0  unknown no
## 3   may       76        1    -1        0  unknown no
## 4   may       92        1    -1        0  unknown no
## 5   may      198        1    -1        0  unknown no
## 6   may      139        1    -1        0  unknown no
# Per-column overview: quantiles and mean for numeric columns, level counts
# for factor columns
Bank_data |>
  summary()
##       age                 job           marital          education    
##  Min.   :18.00   blue-collar:9732   divorced: 5207   primary  : 6851  
##  1st Qu.:33.00   management :9458   married :27214   secondary:23202  
##  Median :39.00   technician :7597   single  :12790   tertiary :13301  
##  Mean   :40.94   admin.     :5171                    unknown  : 1857  
##  3rd Qu.:48.00   services   :4154                                     
##  Max.   :95.00   retired    :2264                                     
##                  (Other)    :6835                                     
##  default        balance       housing      loan            contact     
##  no :44396   Min.   : -8019   no :20081   no :37967   cellular :29285  
##  yes:  815   1st Qu.:    72   yes:25130   yes: 7244   telephone: 2906  
##              Median :   448                           unknown  :13020  
##              Mean   :  1362                                            
##              3rd Qu.:  1428                                            
##              Max.   :102127                                            
##                                                                        
##       day            month          duration         campaign     
##  Min.   : 1.00   may    :13766   Min.   :   0.0   Min.   : 1.000  
##  1st Qu.: 8.00   jul    : 6895   1st Qu.: 103.0   1st Qu.: 1.000  
##  Median :16.00   aug    : 6247   Median : 180.0   Median : 2.000  
##  Mean   :15.81   jun    : 5341   Mean   : 258.2   Mean   : 2.764  
##  3rd Qu.:21.00   nov    : 3970   3rd Qu.: 319.0   3rd Qu.: 3.000  
##  Max.   :31.00   apr    : 2932   Max.   :4918.0   Max.   :63.000  
##                  (Other): 6060                                    
##      pdays          previous           poutcome       y        
##  Min.   : -1.0   Min.   :  0.0000   failure: 4901   no :39922  
##  1st Qu.: -1.0   1st Qu.:  0.0000   other  : 1840   yes: 5289  
##  Median : -1.0   Median :  0.0000   success: 1511              
##  Mean   : 40.2   Mean   :  0.5803   unknown:36959              
##  3rd Qu.: -1.0   3rd Qu.:  0.0000                              
##  Max.   :871.0   Max.   :275.0000                              
## 
# Inspect the storage type of every feature (integer vs. factor)
Bank_data |>
  str()
## 'data.frame':    45211 obs. of  17 variables:
##  $ age      : int  58 44 33 47 33 35 28 42 58 43 ...
##  $ job      : Factor w/ 12 levels "admin.","blue-collar",..: 5 10 3 2 12 5 5 3 6 10 ...
##  $ marital  : Factor w/ 3 levels "divorced","married",..: 2 3 2 2 3 2 3 1 2 3 ...
##  $ education: Factor w/ 4 levels "primary","secondary",..: 3 2 2 4 4 3 3 3 1 2 ...
##  $ default  : Factor w/ 2 levels "no","yes": 1 1 1 1 1 1 1 2 1 1 ...
##  $ balance  : int  2143 29 2 1506 1 231 447 2 121 593 ...
##  $ housing  : Factor w/ 2 levels "no","yes": 2 2 2 2 1 2 2 2 2 2 ...
##  $ loan     : Factor w/ 2 levels "no","yes": 1 1 2 1 1 1 2 1 1 1 ...
##  $ contact  : Factor w/ 3 levels "cellular","telephone",..: 3 3 3 3 3 3 3 3 3 3 ...
##  $ day      : int  5 5 5 5 5 5 5 5 5 5 ...
##  $ month    : Factor w/ 12 levels "apr","aug","dec",..: 9 9 9 9 9 9 9 9 9 9 ...
##  $ duration : int  261 151 76 92 198 139 217 380 50 55 ...
##  $ campaign : int  1 1 1 1 1 1 1 1 1 1 ...
##  $ pdays    : int  -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 ...
##  $ previous : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ poutcome : Factor w/ 4 levels "failure","other",..: 4 4 4 4 4 4 4 4 4 4 ...
##  $ y        : Factor w/ 2 levels "no","yes": 1 1 1 1 1 1 1 1 1 1 ...

Are the features (columns) of your data correlated? The correlation matrix of the numeric features shows only a small correlation between age and balance (average yearly balance), while previous and pdays have a higher correlation. The variable pdays is the number of days that passed after the client was last contacted in a previous campaign (numeric; -1 means the client was not previously contacted), and previous is the number of contacts performed before this campaign for this client. pdays and campaign seem to have a slight negative correlation, indicating that as the number of contacts performed in the current campaign increases, the number of days since the client was last contacted in a previous campaign tends to decrease. This suggests a campaign strategy in which most effort goes to clients who engaged recently, while those who have been out of touch longer are contacted less in the current campaign.

# Correlation plot for numeric features.
# cor() is applied to the numeric columns only and the coefficients are drawn
# as numbers. select(where()) replaces the superseded select_if().
cor_matrix <- Bank_data |>
  select(where(is.numeric)) |>
  cor()
corrplot(cor_matrix, method = "number")

What is the overall distribution of each variable? The age variable seems to be slightly skewed to the right. The balance variable is skewed to the right, showing that few clients hold a high balance at the bank and most hold a smaller one. The campaign variable is skewed to the right; in the summary, the campaign mean was higher than the median, indicating possible outliers. Day seems to be normally distributed. Duration is skewed to the right, indicating most calls were short in length. pdays is skewed to the right, indicating the majority of the clients weren’t previously contacted. Previous is skewed to the right; in the summary, previous had a median of zero and a mean of only 0.58, meaning the majority of the clients had not been contacted before this campaign.

# Distribution of the numeric variables: one histogram per column.
# The numeric columns are stacked into a long key/value table so facet_wrap()
# can draw one panel per variable, each with its own axis ranges.
# select(where()) and pivot_longer() replace the superseded select_if() and
# gather(); the "key"/"value" column names are kept.
Bank_data |>
  select(where(is.numeric)) |>
  pivot_longer(everything(), names_to = "key", values_to = "value") |>
  ggplot(aes(x = value)) +
  geom_histogram(bins = 30, fill = "skyblue", color = "black") +
  facet_wrap(~key, scales = "free") +
  theme_minimal() +
  ggtitle("Numerical Variable Distribution")

In the distribution of the categorical variables, the contact plot shows that most contacts were made to cell phones. The default variable indicates whether a client has credit in default; the majority of clients do not. For education, the majority of the clients have secondary education as their highest level of education. For housing loans, the clients are split fairly close to 50/50, with slightly more clients having a housing loan than not. The job plot shows that the largest group of clients have blue-collar jobs, the second largest group is management, and the third is technician jobs. The loan variable shows that the majority of clients don’t have a personal loan, and the majority of the clients are married. The month variable shows that May is the month in which most clients were contacted, based on the plot in R. The poutcome variable is the outcome of the previous campaign; the majority of outcomes were “unknown”, followed by “failure”. The distribution of the subscription variable shows that most of the clients aren’t subscribed to the bank term deposit.

# Distribution of the categorical variables: one bar chart per factor column.
# gather() on factors whose level sets differ emits a warning and silently
# drops the factor attributes; pivot_longer() with an explicit as.character
# conversion produces the same character "value" column (so bars stay in
# alphabetical order within each panel) without the warning.
Bank_data |>
  select(where(is.factor)) |>
  pivot_longer(
    everything(),
    names_to = "key",
    values_to = "value",
    values_transform = list(value = as.character)
  ) |>
  ggplot(aes(x = value)) +
  geom_bar(fill = "blue") +
  facet_wrap(~key, scales = "free", ncol = 3) +
  theme(axis.text.x = element_text(angle = 45, hjust = 1)) +
  ggtitle("Categorical Variable Distribution")

Are there any outliers present? The balance, campaign, duration, and previous variables all seem to have outliers.

# Boxplots for outliers: one horizontal boxplot per numeric column, each panel
# with its own axis range.
# Uses select(where()) + pivot_longer() like the other distribution plots,
# instead of purrr::keep() and the superseded gather().
Bank_data |>
  select(where(is.numeric)) |>
  pivot_longer(everything(), names_to = "key", values_to = "value") |>
  ggplot(aes(x = value)) +
  geom_boxplot() +
  facet_wrap(~key, scales = "free") +
  ggtitle("Boxplots of Numerical Predictors")

Do any patterns or trends emerge in the data? The majority of clients are not subscribed to the bank term deposit, which shows that the target class is imbalanced.

# Share of subscribed vs. not-subscribed clients for each contact month.
# The month factor is stored with alphabetical levels (apr, aug, dec, ...),
# which scrambles the time axis, so it is re-levelled to calendar order for
# this plot only (Bank_data itself is not modified). month.abb is lower-cased
# to match the three-letter labels used in the data.
ggplot(
  Bank_data,
  aes(x = factor(month, levels = tolower(month.abb)), fill = y)
) +
  geom_bar(position = "fill") +
  xlab("month") + # keep the original axis title instead of the factor() call
  ylab("Subscription Rate") +
  theme_minimal()

library(ggplot2)
# 100%-stacked bars: for every job category, the proportion of clients who did
# and did not subscribe. Job labels are tilted so they do not overlap.
Bank_data |>
  ggplot(aes(x = job, fill = y)) +
  geom_bar(position = "fill") +
  scale_y_continuous(labels = scales::percent) +
  theme(axis.text.x = element_text(angle = 45, hjust = 1)) +
  labs(
    title = "Subscription Rate by Job",
    x = "Job",
    y = "Proportion",
    fill = "Subscribed"
  )

# 100%-stacked bars: for every marital status, the proportion of clients who
# did and did not subscribe.
Bank_data |>
  ggplot(aes(x = marital, fill = y)) +
  geom_bar(position = "fill") +
  scale_y_continuous(labels = scales::percent) +
  labs(
    title = "Subscription Rate by Marital Status",
    x = "Marital Status",
    y = "Proportion",
    fill = "Subscribed"
  )

What is the central tendency and spread of each variable?

# Central tendency (mean, median) and spread (sd, IQR) of every numeric column.
# across(where(is.numeric), ...) replaces the superseded select_if() +
# summarise_all(); result columns are named <column>_<statistic>.
# The trailing select() restores the statistic-by-statistic column order
# (all means, then medians, sds, IQRs) that summarise_all() produced.
Bank_data |>
  summarise(
    across(
      where(is.numeric),
      list(mean = mean, median = median, sd = sd, IQR = IQR)
    )
  ) |>
  select(
    ends_with("_mean"),
    ends_with("_median"),
    ends_with("_sd"),
    ends_with("_IQR")
  )
##   age_mean balance_mean day_mean duration_mean campaign_mean pdays_mean
## 1 40.93621     1362.272 15.80642      258.1631      2.763841   40.19783
##   previous_mean age_median balance_median day_median duration_median
## 1     0.5803234         39            448         16             180
##   campaign_median pdays_median previous_median   age_sd balance_sd   day_sd
## 1               2           -1               0 10.61876   3044.766 8.322476
##   duration_sd campaign_sd pdays_sd previous_sd age_IQR balance_IQR day_IQR
## 1    257.5278    3.098021 100.1287    2.303441      15        1356      13
##   duration_IQR campaign_IQR pdays_IQR previous_IQR
## 1          216            2         0            0

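Are there any missing values and how significant are they? There are no NA values in any column, but the categorical variables use the label “unknown” in place of NA, so their missing values are hidden behind that label. Out of 45,211 rows, “unknown” appears 288 times in job (about 0.6%), 1,857 times in education (about 4.1%), 13,020 times in contact (about 28.8%), and 36,959 times in poutcome (about 81.7%). The missingness is therefore negligible for job, small for education, and substantial for contact and especially poutcome.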

# Check missing data: number of NA entries in each column
Missing_numeric <- Bank_data |>
  is.na() |>
  colSums()
print(Missing_numeric)
##       age       job   marital education   default   balance   housing      loan 
##         0         0         0         0         0         0         0         0 
##   contact       day     month  duration  campaign     pdays  previous  poutcome 
##         0         0         0         0         0         0         0         0 
##         y 
##         0
# Check for unknown values: how often the literal "unknown" placeholder occurs
# in every column.
# vapply() pins the result to exactly one integer per column (sapply() can
# silently change its return type), and na.rm = TRUE keeps a real NA in a
# column from turning that column's whole count into NA.
vapply(
  Bank_data,
  function(column) sum(column == "unknown", na.rm = TRUE),
  integer(1)
)
##       age       job   marital education   default   balance   housing      loan 
##         0       288         0      1857         0         0         0         0 
##   contact       day     month  duration  campaign     pdays  previous  poutcome 
##     13020         0         0         0         0         0         0     36959 
##         y 
##         0

What are the relationships between different variables? Based on the Cramér's V matrix plot of the categorical variables, default has a very low association with the subscription outcome, whereas poutcome seems to be associated with “y”. The month of contact and housing loan have a moderate association.

# Keep only the categorical (factor) columns for the association analysis.
# select(where()) replaces the superseded select_if().
Bank_cat <- Bank_data |>
  select(where(is.factor))

# Empty square matrix for the pairwise Cramer's V values, with one row and one
# column per categorical variable. ncol() already returns a number, so no
# as.numeric() coercion is needed.
n <- ncol(Bank_cat)
cramer_matrix <- matrix(
  NA,
  nrow = n,
  ncol = n,
  dimnames = list(names(Bank_cat), names(Bank_cat))
)

# Calculate Cramer's V, a 0-1 measure of association between two categorical
# vectors, from the uncorrected chi-squared statistic of their contingency
# table: V = sqrt(chi2 / (N * (min(rows, cols) - 1))).
#
# x, y: vectors (typically factors or character) of equal length.
# Returns a single numeric value, or NA_real_ when either variable has fewer
# than two observed categories, in which case V is undefined.
cramer_v <- function(x, y) {
  tbl <- table(x, y)
  # Drop categories that never occur (e.g. unused factor levels): their
  # expected counts are 0, which would turn the statistic into NaN.
  tbl <- tbl[rowSums(tbl) > 0, colSums(tbl) > 0, drop = FALSE]
  min_dim <- min(dim(tbl)) - 1
  # With a single row or column the denominator is 0, and chisq.test() would
  # fall back to a goodness-of-fit test instead of a test of association.
  if (min_dim < 1) {
    return(NA_real_)
  }
  # chisq.test() warns about small expected counts; only the statistic is
  # used here, so that warning is silenced.
  chi2 <- suppressWarnings(chisq.test(tbl, correct = FALSE)$statistic)
  sqrt(unname(chi2) / (sum(tbl) * min_dim))
}

# Fill the matrix: entry [i, j] is Cramer's V between the i-th and j-th
# categorical columns. seq_len() is used instead of 1:n so the loops run zero
# times (rather than over c(1, 0)) if there are no categorical columns.
for (i in seq_len(n)) {
  for (j in seq_len(n)) {
    cramer_matrix[i, j] <- cramer_v(Bank_cat[[i]], Bank_cat[[j]])
  }
}
# Reshape the square matrix to long form (Var1, Var2, value), dropping any NA
# cells, so it can be drawn as a heatmap.
cramerdf <- melt(cramer_matrix, na.rm = TRUE)
# Plot: one tile per variable pair, labelled with V rounded to two decimals.
# `limits` is spelled out in full; the previous `limit =` only worked through
# partial argument matching.
ggplot(cramerdf, aes(x = Var1, y = Var2, fill = value)) +
  geom_tile(color = "white") +
  geom_text(aes(label = round(value, 2)), size = 3) +
  scale_fill_gradient2(
    name = "Cramer's V",
    midpoint = 0.3,
    limits = c(0, 1),
    space = "Lab"
  ) +
  theme_minimal() +
  theme(axis.text.x = element_text(angle = 45, vjust = 1, hjust = 1)) +
  labs(title = "Categorical Variables correlation", x = "", y = "")