GOAL - Which clients are more likely to subscribe to a term deposit or other low risk investment options

Steps Taken To Perform Analysis

Created contingency tables to examine the relationship between categorical variables and the ‘y’ variable.
Performed a chi-square test to assess the independence between categorical variables and the ‘y’ variable. This test helps determine if there is a significant association between categorical variable.
Examine the coefficients and their significance levels in the glm() model summary.

Data Prep

Libraries

Reading data into data_frame

data_frame = read.csv('C:/Users/prera/OneDrive/Desktop/INFO-I590/bank-full2.csv',header=TRUE, sep = ",")
data_frame_copy <- data_frame

Columns of data_frame

summary(data_frame)

##       age            job              marital           education        
##  Min.   :18.00   Length:45211       Length:45211       Length:45211      
##  1st Qu.:33.00   Class :character   Class :character   Class :character  
##  Median :39.00   Mode  :character   Mode  :character   Mode  :character  
##  Mean   :40.94                                                           
##  3rd Qu.:48.00                                                           
##  Max.   :95.00                                                           
##    default             balance         housing              loan          
##  Length:45211       Min.   : -8019   Length:45211       Length:45211      
##  Class :character   1st Qu.:    72   Class :character   Class :character  
##  Mode  :character   Median :   448   Mode  :character   Mode  :character  
##                     Mean   :  1362                                        
##                     3rd Qu.:  1428                                        
##                     Max.   :102127                                        
##    contact               day           month              duration     
##  Length:45211       Min.   : 1.00   Length:45211       Min.   :   0.0  
##  Class :character   1st Qu.: 8.00   Class :character   1st Qu.: 103.0  
##  Mode  :character   Median :16.00   Mode  :character   Median : 180.0  
##                     Mean   :15.81                      Mean   : 258.2  
##                     3rd Qu.:21.00                      3rd Qu.: 319.0  
##                     Max.   :31.00                      Max.   :4918.0  
##     campaign          pdays          previous          poutcome        
##  Min.   : 1.000   Min.   : -1.0   Min.   :  0.0000   Length:45211      
##  1st Qu.: 1.000   1st Qu.: -1.0   1st Qu.:  0.0000   Class :character  
##  Median : 2.000   Median : -1.0   Median :  0.0000   Mode  :character  
##  Mean   : 2.764   Mean   : 40.2   Mean   :  0.5803                     
##  3rd Qu.: 3.000   3rd Qu.: -1.0   3rd Qu.:  0.0000                     
##  Max.   :63.000   Max.   :871.0   Max.   :275.0000                     
##       y            
##  Length:45211      
##  Class :character  
##  Mode  :character  
##                    
##                    
##

Making Data Column

data_frame['month_value'] <- unclass(factor(data_frame$month, levels = month.name))

data_frame_2008 <- data_frame[1:27729,]
data_frame_2008 <- data_frame_2008 |> mutate(year = 2008)
data_frame_2009 <- data_frame[27730:42591,]
data_frame_2009<- data_frame_2009 |> mutate(year = 2009)
data_frame_2010 <- data_frame[42592:45211,]
data_frame_2010 <- data_frame_2010 |> mutate(year = 2010)
data_frame <- rbind(data_frame_2008,data_frame_2009)
data_frame <- rbind(data_frame,data_frame_2010)
data_frame$date<-as.Date(with(data_frame,paste(year,month_value,day,sep="-")),"%Y-%m-%d")
head(data_frame)

##   age          job marital education default balance housing loan contact day
## 1  58   management married  tertiary      no    2143     yes   no    <NA>   5
## 2  44   technician  single secondary      no      29     yes   no    <NA>   5
## 3  33 entrepreneur married secondary      no       2     yes  yes    <NA>   5
## 4  47  blue-collar married      <NA>      no    1506     yes   no    <NA>   5
## 5  33         <NA>  single      <NA>      no       1      no   no    <NA>   5
## 6  35   management married  tertiary      no     231     yes   no    <NA>   5
##   month duration campaign pdays previous poutcome  y month_value year
## 1   May      261        1    -1        0     <NA> no           5 2008
## 2   May      151        1    -1        0     <NA> no           5 2008
## 3   May       76        1    -1        0     <NA> no           5 2008
## 4   May       92        1    -1        0     <NA> no           5 2008
## 5   May      198        1    -1        0     <NA> no           5 2008
## 6   May      139        1    -1        0     <NA> no           5 2008
##         date
## 1 2008-05-05
## 2 2008-05-05
## 3 2008-05-05
## 4 2008-05-05
## 5 2008-05-05
## 6 2008-05-05

Removing unnecessary columns

data_frame <- data_frame[ -c(10,11,18,19) ]

Removing NA Values

data_frame <- na.omit(data_frame)

Grouping by categories

y vs age (creating bins for age)

data_frame <- data_frame %>% mutate(age_group = case_when(age >= 80  & age < 90 ~ '8',
                                             age >= 70  & age < 80 ~ '7',
                                             age >= 60  & age < 70 ~ '6',
                                             age >= 50  & age < 60 ~ '5',
                                             age >= 40  & age < 50 ~ '4',
                                             age >= 30  & age < 40 ~ '3',
                                             age >= 20  & age < 30 ~ '2',
                                             age >= 10  & age < 20 ~ '1'))

data_frame_age_group <- data_frame|>
    group_by(age_group,y) |>
      summarise(category_mean_age_group = mean(age,na.rm=TRUE), count=n())

## `summarise()` has grouped output by 'age_group'. You can override using the
## `.groups` argument.

data_frame_age_group

## # A tibble: 16 × 4
## # Groups:   age_group [8]
##    age_group y     category_mean_age_group count
##    <chr>     <chr>                   <dbl> <int>
##  1 1         no                       18.9     8
##  2 1         yes                      18.7     3
##  3 2         no                       26.9   628
##  4 2         yes                      26.3   284
##  5 3         no                       34.3  2737
##  6 3         yes                      34.2   671
##  7 4         no                       43.9  1486
##  8 4         yes                      44.5   348
##  9 5         no                       54.1   936
## 10 5         yes                      54.6   245
## 11 6         no                       62.4   161
## 12 6         yes                      63.0   145
## 13 7         no                       73.3    81
## 14 7         yes                      74.0    67
## 15 8         no                       83.2    19
## 16 8         yes                      81.5    23

a <- ggplot(data = data_frame_age_group, aes( x = count, y = age_group, fill = y )) +
  geom_bar(stat = 'identity', position = 'dodge')
a

Contingency Tables

education vs y

table(data_frame$education, data_frame$y, dnn = c("education","y"))

##            y
## education     no  yes
##   primary    840  172
##   secondary 3359  838
##   tertiary  1857  776

ggplot(data_frame) + 
  geom_mosaic(
    aes(x=product(job,y),fill=y),
    divider=c("vspine", "hspine"),offset = 0.02 # equivalent to divider=ddecker()
 )

## Warning: `unite_()` was deprecated in tidyr 1.2.0.
## ℹ Please use `unite()` instead.
## ℹ The deprecated feature was likely used in the ggmosaic package.
##   Please report the issue at <https://github.com/haleyjeppson/ggmosaic>.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.

Job vs y

table(data_frame$job, data_frame$y, dnn = c("job","y"))

##                y
## job               no  yes
##   admin.         817  240
##   blue-collar   1364  173
##   entrepreneur   186   25
##   housemaid      114   32
##   management    1260  493
##   retired        266  192
##   self-employed  200   64
##   services       571  111
##   student        133  104
##   technician    1018  271
##   unemployed     127   81

ggplot(data_frame) + 
  geom_mosaic(
    aes(x=product(job,y),fill=y),
    divider=c("vspine", "hspine"),offset = 0.02 # equivalent to divider=ddecker()
 )

poutcome vs y

table(data_frame$poutcome, data_frame$y, dnn = c("poucome","y"))

##          y
## poucome     no  yes
##   failure 4095  584
##   other   1462  288
##   success  499  914

ggplot(data_frame) + 
  geom_mosaic(
    aes(x=product(poutcome),fill=y),
    divider=c("vspine", "hspine") # equivalent to divider=ddecker()
 )+
 geom_mosaic_text(aes(x = product(poutcome),fill=y),divider=c("vspine","hspine"))

loans vs y

table(data_frame$loan, data_frame$housing, data_frame$y, dnn = c("loan","housing","y"))

## , , y = no
## 
##      housing
## loan    no  yes
##   no  1512 3569
##   yes  222  753
## 
## , , y = yes
## 
##      housing
## loan    no  yes
##   no  1122  550
##   yes   44   70

ggplot(data_frame) + 
  geom_mosaic(
    aes(x=product(loan, housing),fill=y),
    divider=c("hspine","vspine","vspine"),offset = 0.03 # equivalent to divider=ddecker()
 )

Marital vs y

table(data_frame$marital, data_frame$y, dnn = c("marital","y"))

##           y
## marital      no  yes
##   divorced  715  172
##   married  3521  980
##   single   1820  634

ggplot(data_frame) + 
  geom_mosaic(
    aes(x=product(marital),fill=y),
    divider=c("hspine","vspine"),offset = 0.03 # equivalent to divider=ddecker()
 )

default vs y

table(data_frame$default, data_frame$y, dnn = c("default","y"))

##        y
## default   no  yes
##     no  6004 1782
##     yes   52    4

ggplot(data_frame) + 
  geom_mosaic(
    aes(x=product(default),fill=y),
    divider=c("vspine","hspine"),offset = 0.02
 )

contact vs y

table(data_frame$contact, data_frame$y, dnn = c("contact","y"))

##            y
## contact       no  yes
##   cellular  5594 1663
##   telephone  462  123

ggplot(data_frame) + 
  geom_mosaic(
    aes(x=product(contact),fill=y),
    divider=c("vspine","hspine"),offset = 0.02
 )

Chi - Square Tests

Since a correlation matrix is typically used to measure the linear relationship between pairs of continuous variables, I have used chi-square tests to measure the association. This test helps determine if there is a significant association between categorical variable.

default vs y

results_default <- chisq.test(table(data_frame$default, data_frame$y))
results_default

## 
##  Pearson's Chi-squared test with Yates' continuity correction
## 
## data:  table(data_frame$default, data_frame$y)
## X-squared = 6.9667, df = 1, p-value = 0.008304

p_vals <- (results_default$p.value)

marital vs y

results_marital <- chisq.test(table(data_frame$marital, data_frame$y))
results_marital

## 
##  Pearson's Chi-squared test
## 
## data:  table(data_frame$marital, data_frame$y)
## X-squared = 21.412, df = 2, p-value = 2.241e-05

there is evidence of an association between the ‘default’ and ‘y’ variables.

p_vals <- append(p_vals,results_marital$p.value)

job vs y

results_job <- chisq.test(table(data_frame$job, data_frame$y))
results_job

## 
##  Pearson's Chi-squared test
## 
## data:  table(data_frame$job, data_frame$y)
## X-squared = 364.21, df = 10, p-value < 2.2e-16

p_vals <- append(p_vals,results_job$p.value)

education vs y

results_education <- chisq.test(table(data_frame$education, data_frame$y))
results_education

## 
##  Pearson's Chi-squared test
## 
## data:  table(data_frame$education, data_frame$y)
## X-squared = 105.18, df = 2, p-value < 2.2e-16

p_vals <- append(p_vals,results_job$p.value)

loan vs y

results_loan <- chisq.test(table(data_frame$loan, data_frame$y))
results_loan

## 
##  Pearson's Chi-squared test with Yates' continuity correction
## 
## data:  table(data_frame$loan, data_frame$y)
## X-squared = 108.09, df = 1, p-value < 2.2e-16

p_vals <- append(p_vals,results_loan$p.value)

housing vs y

results_housing <- chisq.test(table(data_frame$housing, data_frame$y))
results_housing

## 
##  Pearson's Chi-squared test with Yates' continuity correction
## 
## data:  table(data_frame$housing, data_frame$y)
## X-squared = 793.5, df = 1, p-value < 2.2e-16

p_vals <- append(p_vals,results_housing$p.value)

poutcome vs y

results_poutcome <- chisq.test(table(data_frame$poutcome, data_frame$y))
results_poutcome

## 
##  Pearson's Chi-squared test
## 
## data:  table(data_frame$poutcome, data_frame$y)
## X-squared = 1732.7, df = 2, p-value < 2.2e-16

p_vals <- append(p_vals,results_poutcome$p.value)

contact vs y

results_contact <- chisq.test(table(data_frame$contact, data_frame$y))
results_contact

## 
##  Pearson's Chi-squared test with Yates' continuity correction
## 
## data:  table(data_frame$contact, data_frame$y)
## X-squared = 0.99485, df = 1, p-value = 0.3186

p_vals <- append(p_vals,results_contact$p.value)

p_vals

## [1]  8.303980e-03  2.240731e-05  3.827864e-72  3.827864e-72  2.574899e-25
## [6] 1.395835e-174  0.000000e+00  3.185600e-01

plot(p_vals)

From the above tests and plot we can see that all the categorical variables have some degree of association with the target variable.

GLM

data_frame <- data_frame %>% mutate(y = case_when(y == 'yes' ~ 1, y== 'no' ~ 0))

model <- glm(y ~ age + job + marital + education + default + balance + housing + loan +contact + duration + campaign + date + pdays + previous + poutcome, data = data_frame, family = "binomial")

summary(model)

## 
## Call:
## glm(formula = y ~ age + job + marital + education + default + 
##     balance + housing + loan + contact + duration + campaign + 
##     date + pdays + previous + poutcome, family = "binomial", 
##     data = data_frame)
## 
## Coefficients:
##                      Estimate Std. Error z value Pr(>|z|)    
## (Intercept)        -6.361e+01  2.990e+00 -21.277  < 2e-16 ***
## age                -3.281e-03  4.382e-03  -0.749  0.45404    
## jobblue-collar     -3.527e-01  1.443e-01  -2.443  0.01455 *  
## jobentrepreneur    -7.817e-01  2.974e-01  -2.628  0.00858 ** 
## jobhousemaid       -3.404e-01  2.761e-01  -1.233  0.21771    
## jobmanagement       1.060e-01  1.360e-01   0.779  0.43569    
## jobretired         -2.401e-01  1.935e-01  -1.241  0.21450    
## jobself-employed   -1.096e-01  2.145e-01  -0.511  0.60931    
## jobservices        -5.418e-02  1.638e-01  -0.331  0.74086    
## jobstudent          1.777e-01  2.022e-01   0.879  0.37940    
## jobtechnician      -1.208e-01  1.295e-01  -0.933  0.35094    
## jobunemployed       3.034e-01  2.192e-01   1.384  0.16629    
## maritalmarried      8.021e-02  1.205e-01   0.666  0.50557    
## maritalsingle       6.088e-02  1.382e-01   0.441  0.65957    
## educationsecondary  1.552e-01  1.331e-01   1.166  0.24357    
## educationtertiary   2.678e-01  1.527e-01   1.754  0.07943 .  
## defaultyes         -3.944e-01  5.780e-01  -0.682  0.49506    
## balance             1.530e-05  1.073e-05   1.426  0.15396    
## housingyes         -4.946e-01  8.554e-02  -5.783 7.35e-09 ***
## loanyes            -2.874e-01  1.258e-01  -2.284  0.02237 *  
## contacttelephone   -3.673e-01  1.483e-01  -2.477  0.01324 *  
## duration            3.742e-03  1.526e-04  24.526  < 2e-16 ***
## campaign           -1.374e-01  2.799e-02  -4.910 9.10e-07 ***
## date                4.240e-03  2.064e-04  20.542  < 2e-16 ***
## pdays              -6.864e-04  3.211e-04  -2.138  0.03252 *  
## previous           -1.548e-03  8.899e-03  -0.174  0.86189    
## poutcomeother       1.683e-01  9.389e-02   1.792  0.07313 .  
## poutcomesuccess     1.821e+00  8.609e-02  21.154  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 8415.1  on 7841  degrees of freedom
## Residual deviance: 5229.3  on 7814  degrees of freedom
## AIC: 5285.3
## 
## Number of Fisher Scoring iterations: 5

Result Analysis

Customers to target

poutcome as success results in an increase in the chance of subscribing to a term deposit by 1.821
education level tertiary results in an increase in the chance of subscribing to a term deposit 0.2678
education level secondary results in an increase in the chance of subscribing to a term deposit 0.1552
having a management job results in an increase in the chance of subscribing to a term deposit 0.106

Customers to avoid

having a housing a loan decreses the chance of subscribing to a term deposit by approximately 0.4946
having a job as an entrepreneur decreses the chance of subscribing to a term deposit by approximately 0.7817

Final Data Dive

PreranaRajole

2023-11-30