Created contingency tables to examine the relationship between categorical variables and the ‘y’ variable.
Performed a chi-square test to assess the independence between categorical variables and the ‘y’ variable. This test helps determine if there is a significant association between categorical variable.
Examine the coefficients and their significance levels in the glm() model summary.
data_frame = read.csv('C:/Users/prera/OneDrive/Desktop/INFO-I590/bank-full2.csv',header=TRUE, sep = ",")
data_frame_copy <- data_frame
summary(data_frame)
## age job marital education
## Min. :18.00 Length:45211 Length:45211 Length:45211
## 1st Qu.:33.00 Class :character Class :character Class :character
## Median :39.00 Mode :character Mode :character Mode :character
## Mean :40.94
## 3rd Qu.:48.00
## Max. :95.00
## default balance housing loan
## Length:45211 Min. : -8019 Length:45211 Length:45211
## Class :character 1st Qu.: 72 Class :character Class :character
## Mode :character Median : 448 Mode :character Mode :character
## Mean : 1362
## 3rd Qu.: 1428
## Max. :102127
## contact day month duration
## Length:45211 Min. : 1.00 Length:45211 Min. : 0.0
## Class :character 1st Qu.: 8.00 Class :character 1st Qu.: 103.0
## Mode :character Median :16.00 Mode :character Median : 180.0
## Mean :15.81 Mean : 258.2
## 3rd Qu.:21.00 3rd Qu.: 319.0
## Max. :31.00 Max. :4918.0
## campaign pdays previous poutcome
## Min. : 1.000 Min. : -1.0 Min. : 0.0000 Length:45211
## 1st Qu.: 1.000 1st Qu.: -1.0 1st Qu.: 0.0000 Class :character
## Median : 2.000 Median : -1.0 Median : 0.0000 Mode :character
## Mean : 2.764 Mean : 40.2 Mean : 0.5803
## 3rd Qu.: 3.000 3rd Qu.: -1.0 3rd Qu.: 0.0000
## Max. :63.000 Max. :871.0 Max. :275.0000
## y
## Length:45211
## Class :character
## Mode :character
##
##
##
data_frame['month_value'] <- unclass(factor(data_frame$month, levels = month.name))
data_frame_2008 <- data_frame[1:27729,]
data_frame_2008 <- data_frame_2008 |> mutate(year = 2008)
data_frame_2009 <- data_frame[27730:42591,]
data_frame_2009<- data_frame_2009 |> mutate(year = 2009)
data_frame_2010 <- data_frame[42592:45211,]
data_frame_2010 <- data_frame_2010 |> mutate(year = 2010)
data_frame <- rbind(data_frame_2008,data_frame_2009)
data_frame <- rbind(data_frame,data_frame_2010)
data_frame$date<-as.Date(with(data_frame,paste(year,month_value,day,sep="-")),"%Y-%m-%d")
head(data_frame)
## age job marital education default balance housing loan contact day
## 1 58 management married tertiary no 2143 yes no <NA> 5
## 2 44 technician single secondary no 29 yes no <NA> 5
## 3 33 entrepreneur married secondary no 2 yes yes <NA> 5
## 4 47 blue-collar married <NA> no 1506 yes no <NA> 5
## 5 33 <NA> single <NA> no 1 no no <NA> 5
## 6 35 management married tertiary no 231 yes no <NA> 5
## month duration campaign pdays previous poutcome y month_value year
## 1 May 261 1 -1 0 <NA> no 5 2008
## 2 May 151 1 -1 0 <NA> no 5 2008
## 3 May 76 1 -1 0 <NA> no 5 2008
## 4 May 92 1 -1 0 <NA> no 5 2008
## 5 May 198 1 -1 0 <NA> no 5 2008
## 6 May 139 1 -1 0 <NA> no 5 2008
## date
## 1 2008-05-05
## 2 2008-05-05
## 3 2008-05-05
## 4 2008-05-05
## 5 2008-05-05
## 6 2008-05-05
data_frame <- data_frame[ -c(10,11,18,19) ]
data_frame <- na.omit(data_frame)
data_frame <- data_frame %>% mutate(age_group = case_when(age >= 80 & age < 90 ~ '8',
age >= 70 & age < 80 ~ '7',
age >= 60 & age < 70 ~ '6',
age >= 50 & age < 60 ~ '5',
age >= 40 & age < 50 ~ '4',
age >= 30 & age < 40 ~ '3',
age >= 20 & age < 30 ~ '2',
age >= 10 & age < 20 ~ '1'))
data_frame_age_group <- data_frame|>
group_by(age_group,y) |>
summarise(category_mean_age_group = mean(age,na.rm=TRUE), count=n())
## `summarise()` has grouped output by 'age_group'. You can override using the
## `.groups` argument.
data_frame_age_group
## # A tibble: 16 × 4
## # Groups: age_group [8]
## age_group y category_mean_age_group count
## <chr> <chr> <dbl> <int>
## 1 1 no 18.9 8
## 2 1 yes 18.7 3
## 3 2 no 26.9 628
## 4 2 yes 26.3 284
## 5 3 no 34.3 2737
## 6 3 yes 34.2 671
## 7 4 no 43.9 1486
## 8 4 yes 44.5 348
## 9 5 no 54.1 936
## 10 5 yes 54.6 245
## 11 6 no 62.4 161
## 12 6 yes 63.0 145
## 13 7 no 73.3 81
## 14 7 yes 74.0 67
## 15 8 no 83.2 19
## 16 8 yes 81.5 23
a <- ggplot(data = data_frame_age_group, aes( x = count, y = age_group, fill = y )) +
geom_bar(stat = 'identity', position = 'dodge')
a
table(data_frame$education, data_frame$y, dnn = c("education","y"))
## y
## education no yes
## primary 840 172
## secondary 3359 838
## tertiary 1857 776
ggplot(data_frame) +
geom_mosaic(
aes(x=product(job,y),fill=y),
divider=c("vspine", "hspine"),offset = 0.02 # equivalent to divider=ddecker()
)
## Warning: `unite_()` was deprecated in tidyr 1.2.0.
## ℹ Please use `unite()` instead.
## ℹ The deprecated feature was likely used in the ggmosaic package.
## Please report the issue at <https://github.com/haleyjeppson/ggmosaic>.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.
table(data_frame$job, data_frame$y, dnn = c("job","y"))
## y
## job no yes
## admin. 817 240
## blue-collar 1364 173
## entrepreneur 186 25
## housemaid 114 32
## management 1260 493
## retired 266 192
## self-employed 200 64
## services 571 111
## student 133 104
## technician 1018 271
## unemployed 127 81
ggplot(data_frame) +
geom_mosaic(
aes(x=product(job,y),fill=y),
divider=c("vspine", "hspine"),offset = 0.02 # equivalent to divider=ddecker()
)
table(data_frame$poutcome, data_frame$y, dnn = c("poucome","y"))
## y
## poucome no yes
## failure 4095 584
## other 1462 288
## success 499 914
ggplot(data_frame) +
geom_mosaic(
aes(x=product(poutcome),fill=y),
divider=c("vspine", "hspine") # equivalent to divider=ddecker()
)+
geom_mosaic_text(aes(x = product(poutcome),fill=y),divider=c("vspine","hspine"))
table(data_frame$loan, data_frame$housing, data_frame$y, dnn = c("loan","housing","y"))
## , , y = no
##
## housing
## loan no yes
## no 1512 3569
## yes 222 753
##
## , , y = yes
##
## housing
## loan no yes
## no 1122 550
## yes 44 70
ggplot(data_frame) +
geom_mosaic(
aes(x=product(loan, housing),fill=y),
divider=c("hspine","vspine","vspine"),offset = 0.03 # equivalent to divider=ddecker()
)
table(data_frame$marital, data_frame$y, dnn = c("marital","y"))
## y
## marital no yes
## divorced 715 172
## married 3521 980
## single 1820 634
ggplot(data_frame) +
geom_mosaic(
aes(x=product(marital),fill=y),
divider=c("hspine","vspine"),offset = 0.03 # equivalent to divider=ddecker()
)
table(data_frame$default, data_frame$y, dnn = c("default","y"))
## y
## default no yes
## no 6004 1782
## yes 52 4
ggplot(data_frame) +
geom_mosaic(
aes(x=product(default),fill=y),
divider=c("vspine","hspine"),offset = 0.02
)
table(data_frame$contact, data_frame$y, dnn = c("contact","y"))
## y
## contact no yes
## cellular 5594 1663
## telephone 462 123
ggplot(data_frame) +
geom_mosaic(
aes(x=product(contact),fill=y),
divider=c("vspine","hspine"),offset = 0.02
)
Since a correlation matrix is typically used to measure the linear relationship between pairs of continuous variables, I have used chi-square tests to measure the association. This test helps determine if there is a significant association between categorical variable.
results_default <- chisq.test(table(data_frame$default, data_frame$y))
results_default
##
## Pearson's Chi-squared test with Yates' continuity correction
##
## data: table(data_frame$default, data_frame$y)
## X-squared = 6.9667, df = 1, p-value = 0.008304
p_vals <- (results_default$p.value)
results_marital <- chisq.test(table(data_frame$marital, data_frame$y))
results_marital
##
## Pearson's Chi-squared test
##
## data: table(data_frame$marital, data_frame$y)
## X-squared = 21.412, df = 2, p-value = 2.241e-05
there is evidence of an association between the ‘default’ and ‘y’ variables.
p_vals <- append(p_vals,results_marital$p.value)
results_job <- chisq.test(table(data_frame$job, data_frame$y))
results_job
##
## Pearson's Chi-squared test
##
## data: table(data_frame$job, data_frame$y)
## X-squared = 364.21, df = 10, p-value < 2.2e-16
p_vals <- append(p_vals,results_job$p.value)
results_education <- chisq.test(table(data_frame$education, data_frame$y))
results_education
##
## Pearson's Chi-squared test
##
## data: table(data_frame$education, data_frame$y)
## X-squared = 105.18, df = 2, p-value < 2.2e-16
p_vals <- append(p_vals,results_job$p.value)
results_loan <- chisq.test(table(data_frame$loan, data_frame$y))
results_loan
##
## Pearson's Chi-squared test with Yates' continuity correction
##
## data: table(data_frame$loan, data_frame$y)
## X-squared = 108.09, df = 1, p-value < 2.2e-16
p_vals <- append(p_vals,results_loan$p.value)
results_housing <- chisq.test(table(data_frame$housing, data_frame$y))
results_housing
##
## Pearson's Chi-squared test with Yates' continuity correction
##
## data: table(data_frame$housing, data_frame$y)
## X-squared = 793.5, df = 1, p-value < 2.2e-16
p_vals <- append(p_vals,results_housing$p.value)
results_poutcome <- chisq.test(table(data_frame$poutcome, data_frame$y))
results_poutcome
##
## Pearson's Chi-squared test
##
## data: table(data_frame$poutcome, data_frame$y)
## X-squared = 1732.7, df = 2, p-value < 2.2e-16
p_vals <- append(p_vals,results_poutcome$p.value)
results_contact <- chisq.test(table(data_frame$contact, data_frame$y))
results_contact
##
## Pearson's Chi-squared test with Yates' continuity correction
##
## data: table(data_frame$contact, data_frame$y)
## X-squared = 0.99485, df = 1, p-value = 0.3186
p_vals <- append(p_vals,results_contact$p.value)
p_vals
## [1] 8.303980e-03 2.240731e-05 3.827864e-72 3.827864e-72 2.574899e-25
## [6] 1.395835e-174 0.000000e+00 3.185600e-01
plot(p_vals)
From the above tests and plot we can see that all the categorical variables have some degree of association with the target variable.
data_frame <- data_frame %>% mutate(y = case_when(y == 'yes' ~ 1, y== 'no' ~ 0))
model <- glm(y ~ age + job + marital + education + default + balance + housing + loan +contact + duration + campaign + date + pdays + previous + poutcome, data = data_frame, family = "binomial")
summary(model)
##
## Call:
## glm(formula = y ~ age + job + marital + education + default +
## balance + housing + loan + contact + duration + campaign +
## date + pdays + previous + poutcome, family = "binomial",
## data = data_frame)
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -6.361e+01 2.990e+00 -21.277 < 2e-16 ***
## age -3.281e-03 4.382e-03 -0.749 0.45404
## jobblue-collar -3.527e-01 1.443e-01 -2.443 0.01455 *
## jobentrepreneur -7.817e-01 2.974e-01 -2.628 0.00858 **
## jobhousemaid -3.404e-01 2.761e-01 -1.233 0.21771
## jobmanagement 1.060e-01 1.360e-01 0.779 0.43569
## jobretired -2.401e-01 1.935e-01 -1.241 0.21450
## jobself-employed -1.096e-01 2.145e-01 -0.511 0.60931
## jobservices -5.418e-02 1.638e-01 -0.331 0.74086
## jobstudent 1.777e-01 2.022e-01 0.879 0.37940
## jobtechnician -1.208e-01 1.295e-01 -0.933 0.35094
## jobunemployed 3.034e-01 2.192e-01 1.384 0.16629
## maritalmarried 8.021e-02 1.205e-01 0.666 0.50557
## maritalsingle 6.088e-02 1.382e-01 0.441 0.65957
## educationsecondary 1.552e-01 1.331e-01 1.166 0.24357
## educationtertiary 2.678e-01 1.527e-01 1.754 0.07943 .
## defaultyes -3.944e-01 5.780e-01 -0.682 0.49506
## balance 1.530e-05 1.073e-05 1.426 0.15396
## housingyes -4.946e-01 8.554e-02 -5.783 7.35e-09 ***
## loanyes -2.874e-01 1.258e-01 -2.284 0.02237 *
## contacttelephone -3.673e-01 1.483e-01 -2.477 0.01324 *
## duration 3.742e-03 1.526e-04 24.526 < 2e-16 ***
## campaign -1.374e-01 2.799e-02 -4.910 9.10e-07 ***
## date 4.240e-03 2.064e-04 20.542 < 2e-16 ***
## pdays -6.864e-04 3.211e-04 -2.138 0.03252 *
## previous -1.548e-03 8.899e-03 -0.174 0.86189
## poutcomeother 1.683e-01 9.389e-02 1.792 0.07313 .
## poutcomesuccess 1.821e+00 8.609e-02 21.154 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 8415.1 on 7841 degrees of freedom
## Residual deviance: 5229.3 on 7814 degrees of freedom
## AIC: 5285.3
##
## Number of Fisher Scoring iterations: 5
poutcome as success results in an increase in the chance of subscribing to a term deposit by 1.821
education level tertiary results in an increase in the chance of subscribing to a term deposit 0.2678
education level secondary results in an increase in the chance of subscribing to a term deposit 0.1552
having a management job results in an increase in the chance of subscribing to a term deposit 0.106
having a housing a loan decreses the chance of subscribing to a term deposit by approximately 0.4946
having a job as an entrepreneur decreses the chance of subscribing to a term deposit by approximately 0.7817