Data Dive Week 8

Libraries & Data

library(tidyverse)

## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr     1.1.2     ✔ readr     2.1.4
## ✔ forcats   1.0.0     ✔ stringr   1.5.0
## ✔ ggplot2   3.4.3     ✔ tibble    3.2.1
## ✔ lubridate 1.9.2     ✔ tidyr     1.3.0
## ✔ purrr     1.0.2     
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors

library(ggthemes)
library(plotly)

## 
## Attaching package: 'plotly'
## 
## The following object is masked from 'package:ggplot2':
## 
##     last_plot
## 
## The following object is masked from 'package:stats':
## 
##     filter
## 
## The following object is masked from 'package:graphics':
## 
##     layout

library(ggrepel)
library(effsize)
library(pwrss)

## 
## Attaching package: 'pwrss'
## 
## The following object is masked from 'package:stats':
## 
##     power.t.test

library(ggplot2)
library(broom)
library(lindia)

# Read the bank marketing CSV into a data frame. The first row holds the
# column names and fields are comma-separated.
# NOTE(review): the path is absolute and machine-specific; the script only
# runs where this exact file location exists.
data_frame <- read.csv(
  "C:/Users/prera/OneDrive/Desktop/INFO-I590/bank-full2.csv",
  header = TRUE,
  sep = ","
)

library(patchwork)

Summary of data frame

# Column-by-column overview: quartiles for numeric columns, length/class for
# character columns.
data_frame |>
  summary()

##       age            job              marital           education        
##  Min.   :18.00   Length:45211       Length:45211       Length:45211      
##  1st Qu.:33.00   Class :character   Class :character   Class :character  
##  Median :39.00   Mode  :character   Mode  :character   Mode  :character  
##  Mean   :40.94                                                           
##  3rd Qu.:48.00                                                           
##  Max.   :95.00                                                           
##    default             balance         housing              loan          
##  Length:45211       Min.   : -8019   Length:45211       Length:45211      
##  Class :character   1st Qu.:    72   Class :character   Class :character  
##  Mode  :character   Median :   448   Mode  :character   Mode  :character  
##                     Mean   :  1362                                        
##                     3rd Qu.:  1428                                        
##                     Max.   :102127                                        
##    contact               day           month              duration     
##  Length:45211       Min.   : 1.00   Length:45211       Min.   :   0.0  
##  Class :character   1st Qu.: 8.00   Class :character   1st Qu.: 103.0  
##  Mode  :character   Median :16.00   Mode  :character   Median : 180.0  
##                     Mean   :15.81                      Mean   : 258.2  
##                     3rd Qu.:21.00                      3rd Qu.: 319.0  
##                     Max.   :31.00                      Max.   :4918.0  
##     campaign          pdays          previous          poutcome        
##  Min.   : 1.000   Min.   : -1.0   Min.   :  0.0000   Length:45211      
##  1st Qu.: 1.000   1st Qu.: -1.0   1st Qu.:  0.0000   Class :character  
##  Median : 2.000   Median : -1.0   Median :  0.0000   Mode  :character  
##  Mean   : 2.764   Mean   : 40.2   Mean   :  0.5803                     
##  3rd Qu.: 3.000   3rd Qu.: -1.0   3rd Qu.:  0.0000                     
##  Max.   :63.000   Max.   :871.0   Max.   :275.0000                     
##       y            
##  Length:45211      
##  Class :character  
##  Mode  :character  
##                    
##                    
##

Description of the columns

1 - age;

2 - job;

3 - marital(marital status);

4 - education;

5 - default: has credit in default?;

6 - balance: average yearly balance, in euros

7 - housing: has housing loan?;

8 - loan: has personal loan?;

9 - contact: contact communication type;

10 - day: last contact day of the month

11 - month: last contact month of year;

12 - duration: last contact duration, in seconds;

13 - campaign: number of contacts performed during this campaign and for this client

14 - pdays: number of days that passed by after the client was last contacted from a previous campaign

15 - previous: number of contacts performed before this campaign and for this client

16 - poutcome: outcome of the previous marketing campaign;

17 - y : has the client subscribed a term deposit?

Selecting the explanatory and response variables

Response Variable : duration - the total time spent on a marketing call for a particular client.

Explanatory variable : job - job of the client.

Null Hypothesis

H₀ - The average duration of a marketing call is the same for all groups when the data is grouped by type of job.

# Keep only the rows that have no missing value in any column.
data_frame_no_NA <- data_frame |>
  na.omit()

For better understanding, I am adding a column to convert the duration in seconds to minutes.

# Derive the call duration in minutes from the duration column (seconds),
# then preview the first six rows.
data_frame_no_NA[["duration_in_mins"]] <- data_frame_no_NA[["duration"]] / 60
head(data_frame_no_NA, n = 6L)

##       age         job marital education default balance housing loan   contact
## 24061  33      admin. married  tertiary      no     882      no   no telephone
## 24063  42      admin.  single secondary      no    -247     yes  yes telephone
## 24065  33    services married secondary      no    3444     yes   no telephone
## 24073  36  management married  tertiary      no    2415     yes   no telephone
## 24078  36  management married  tertiary      no       0     yes   no telephone
## 24087  44 blue-collar married secondary      no    1324     yes   no telephone
##       day month duration campaign pdays previous poutcome   y duration_in_mins
## 24061  21   oct       39        1   151        3  failure  no         0.650000
## 24063  21   oct      519        1   166        1    other yes         8.650000
## 24065  21   oct      144        1    91        4  failure yes         2.400000
## 24073  22   oct       73        1    86        4    other  no         1.216667
## 24078  23   oct      140        1   143        3  failure yes         2.333333
## 24087  25   oct      119        1    89        2    other  no         1.983333

# Five-number summary plus mean of the call duration in minutes.
data_frame_no_NA[["duration_in_mins"]] |>
  summary()

##     Min.  1st Qu.   Median     Mean  3rd Qu.     Max. 
##  0.08333  1.88333  3.23333  4.35484  5.40000 36.98333

ANOVA Test

Grouping data by type of Job

# Per-job mean, standard deviation and group size of call duration (seconds).
data_frame_no_NA |>
  group_by(job) |>
  summarise(
    avg_duration = mean(duration, na.rm = TRUE),
    sd_duration = sd(duration, na.rm = TRUE),
    size = n()
  )

## # A tibble: 11 × 4
##    job           avg_duration sd_duration  size
##    <chr>                <dbl>       <dbl> <int>
##  1 admin.                247.        216.  1057
##  2 blue-collar           252.        234.  1537
##  3 entrepreneur          280.        273.   211
##  4 housemaid             237.        206.   146
##  5 management            261.        239.  1753
##  6 retired               326.        248.   458
##  7 self-employed         274.        259.   264
##  8 services              259.        242.   682
##  9 student               262.        208.   237
## 10 technician            253.        235.  1289
## 11 unemployed            307.        243.   208

Box plot to visualize

# Box plot of call duration (minutes) per job type. The x-axis labels are
# rotated 90 degrees so the job names do not overlap.
rotated_x_labels <- theme(
  axis.text.x = element_text(angle = 90, vjust = 0.5, hjust = 1)
)

data_frame_no_NA |>
  ggplot(mapping = aes(x = job, y = duration_in_mins)) +
  geom_boxplot() +
  xlab("Job type") +
  ylab("Duration in minutes") +
  rotated_x_labels

# Histogram of call duration (seconds, 100 bins) for a single job category.
#
# job_name: one value of the `job` column to filter on; also used as the
#           panel title so the combined panels can be told apart.
# Returns a ggplot object.
#
# The former `par(mfrow = c(5, 2))` call was dropped: `par()` only configures
# base graphics and has no effect on ggplot2 output; the panel layout is
# produced by patchwork's `+` below.
plot_job_duration <- function(job_name) {
  data_frame_no_NA |>
    filter(job == job_name) |>
    ggplot(aes(x = duration)) +
    geom_histogram(bins = 100) +
    labs(title = job_name) +
    theme_minimal()
}

p1 <- plot_job_duration("admin.")
p2 <- plot_job_duration("blue-collar")
p3 <- plot_job_duration("entrepreneur")
p4 <- plot_job_duration("housemaid")
p5 <- plot_job_duration("management")
p6 <- plot_job_duration("retired")
p7 <- plot_job_duration("self-employed")
p8 <- plot_job_duration("services")
p9 <- plot_job_duration("student")
p10 <- plot_job_duration("technician")

# NOTE(review): the "unemployed" job group appears in the grouped summary and
# the ANOVA but has no histogram here -- confirm whether that is intentional.

# Combine the ten panels into a single figure with patchwork.
p1 + p2 + p3 + p4 + p5 + p6 + p7 + p8 + p9 + p10

F Distribution for distinct jobs

# n = number of observations, k = number of distinct job groups.
n <- nrow(data_frame_no_NA)
k <- length(unique(data_frame_no_NA$job))

# Density of the F distribution with (k - 1, n - k) degrees of freedom.
f_density <- function(x) {
  df(x, df1 = k - 1, df2 = n - k)
}

# Draw the density on [0, 10] with a blue vertical reference line at F = 1.
ggplot() +
  geom_function(fun = f_density, xlim = c(0, 10)) +
  geom_vline(xintercept = 1, color = "blue") +
  labs(
    title = "F Distribution for different jobs",
    x = "F Values",
    y = "Probability Density"
  ) +
  theme_hc()

Summarizing the analysis of variance model

# One-way ANOVA: does mean call duration (minutes) differ between job groups?
m <- aov(
  formula = duration_in_mins ~ job,
  data = data_frame_no_NA
)
summary(m)

##               Df Sum Sq Mean Sq F value   Pr(>F)    
## job           10    834   83.41   5.412 4.93e-08 ***
## Residuals   7831 120684   15.41                     
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

Since the p-value (4.93e-08) is far below 0.05, we have enough evidence to reject the null hypothesis. Hence we can conclude that the mean duration of the marketing calls differs for at least some of the job groups.

Pairwise.t.test

# Pairwise comparisons of mean call duration between job groups, with
# Bonferroni-adjusted p-values. The argument expressions are kept verbatim
# because they are echoed in the "data:" line of the printed result.
pairwise.t.test(
  x = data_frame_no_NA$duration_in_mins,
  g = data_frame_no_NA$job,
  p.adjust.method = "bonferroni"
)

## 
##  Pairwise comparisons using t tests with pooled SD 
## 
## data:  data_frame_no_NA$duration_in_mins and data_frame_no_NA$job 
## 
##               admin.  blue-collar entrepreneur housemaid management retired
## blue-collar   1.00000 -           -            -         -          -      
## entrepreneur  1.00000 1.00000     -            -         -          -      
## housemaid     1.00000 1.00000     1.00000      -         -          -      
## management    1.00000 1.00000     1.00000      1.00000   -          -      
## retired       1.1e-07 2.0e-07     1.00000      0.00404   8.2e-06    -      
## self-employed 1.00000 1.00000     1.00000      1.00000   1.00000    0.26555
## services      1.00000 1.00000     1.00000      1.00000   1.00000    0.00016
## student       1.00000 1.00000     1.00000      1.00000   1.00000    0.03925
## technician    1.00000 1.00000     1.00000      1.00000   1.00000    8.3e-07
## unemployed    0.03985 0.07896     1.00000      0.31906   0.40002    1.00000
##               self-employed services student technician
## blue-collar   -             -        -       -         
## entrepreneur  -             -        -       -         
## housemaid     -             -        -       -         
## management    -             -        -       -         
## retired       -             -        -       -         
## self-employed -             -        -       -         
## services      1.00000       -        -       -         
## student       1.00000       1.00000  -       -         
## technician    1.00000       1.00000  1.00000 -         
## unemployed    1.00000       0.55068  1.00000 0.11946   
## 
## P value adjustment method: bonferroni

Single Variable Linear Regression

Balance -

# Express the balance (euros) in thousands of euros, then preview the first
# six rows.
data_frame_no_NA[["balance_in_thousands"]] <- data_frame_no_NA[["balance"]] / 1000
head(data_frame_no_NA, n = 6L)

##       age         job marital education default balance housing loan   contact
## 24061  33      admin. married  tertiary      no     882      no   no telephone
## 24063  42      admin.  single secondary      no    -247     yes  yes telephone
## 24065  33    services married secondary      no    3444     yes   no telephone
## 24073  36  management married  tertiary      no    2415     yes   no telephone
## 24078  36  management married  tertiary      no       0     yes   no telephone
## 24087  44 blue-collar married secondary      no    1324     yes   no telephone
##       day month duration campaign pdays previous poutcome   y duration_in_mins
## 24061  21   oct       39        1   151        3  failure  no         0.650000
## 24063  21   oct      519        1   166        1    other yes         8.650000
## 24065  21   oct      144        1    91        4  failure yes         2.400000
## 24073  22   oct       73        1    86        4    other  no         1.216667
## 24078  23   oct      140        1   143        3  failure yes         2.333333
## 24087  25   oct      119        1    89        2    other  no         1.983333
##       balance_in_thousands
## 24061                0.882
## 24063               -0.247
## 24065                3.444
## 24073                2.415
## 24078                0.000
## 24087                1.324

# Scatter plot with a fitted straight line for the simple regression.
# The response (duration) goes on the y-axis and the predictor (balance) on
# the x-axis, so that the line drawn by geom_smooth(method = "lm") -- which
# fits y ~ x -- is the same model as lm(duration_in_mins ~ balance_in_thousands)
# reported below. With the axes the other way round the plotted line is the
# regression of balance on duration, which is a different model.
data_frame_no_NA |>
  ggplot(mapping = aes(x = balance_in_thousands, y = duration_in_mins)) +
  geom_point(size = 2, color = "lightpink") +
  geom_smooth(method = "lm", se = TRUE, color = "darkblue") +
  labs(
    x = "Balance (thousands of euros)",
    y = "Duration (minutes)"
  ) +
  theme_minimal()

## `geom_smooth()` using formula = 'y ~ x'

# Simple linear regression of call duration (minutes) on balance (thousands),
# followed by the fitted intercept and slope.
model <- lm(
  formula = duration_in_mins ~ balance_in_thousands,
  data = data_frame_no_NA
)
coef(model)

##          (Intercept) balance_in_thousands 
##           4.27450598           0.05175247

For the linear regression model fitted above, the equation of the regression line is

Duration in minutes (y) = 4.27 + 0.0518 × Balance in thousands (x)

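With the above equation we can describe the relationship between duration and balance: each additional 1,000 euros of balance is associated with about 0.05 more minutes (roughly 3 seconds) of call duration.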

# Refit the simple regression and show the coefficient table together with
# confidence intervals for each estimate.
model <- lm(
  formula = duration_in_mins ~ balance_in_thousands,
  data = data_frame_no_NA
)
model |>
  tidy(conf.int = TRUE)

## # A tibble: 2 × 7
##   term                 estimate std.error statistic  p.value conf.low conf.high
##   <chr>                   <dbl>     <dbl>     <dbl>    <dbl>    <dbl>     <dbl>
## 1 (Intercept)            4.27      0.0497     86.0  0          4.18      4.37  
## 2 balance_in_thousands   0.0518    0.0144      3.59 0.000328   0.0235    0.0800

The above output includes information about hypothesis tests for each coefficient:

estimate - estimated value of the coefficient.

std.error - standard error of the estimate.

statistic - value of the t-statistic used in the hypothesis test for the coefficient.

p.value - Provides the p-value for the hypothesis test.

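From the above we can see that the p-value for the slope (0.000328) is less than 0.05, hence the null hypothesis that the slope is zero is rejected. However, since the estimated effect is very small and the relation between balance and duration is not clearly positive and straight, this result should be interpreted with caution.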

Diagnostic plot

# Refit the simple regression and draw its residuals-versus-fitted
# diagnostic plot.
model <- lm(
  formula = duration_in_mins ~ balance_in_thousands,
  data = data_frame_no_NA
)
residual_plot <- gg_resfitted(model)
residual_plot + theme_minimal()

From this particular plot, we can already see that one of our assumptions is violated.

Multiple Variable Linear Regression

The response variable is duration and the explanatory variables are balance and loan; job is not a term in the model but is used to restrict the data to a single job group.

I am going to consider only the rows where the balance is above 0 and the job is ‘management’, and mutate the loan column into a numeric 0/1 indicator (loan_value).

# Restrict to management clients with a positive balance and encode the
# personal-loan flag as a numeric indicator: 1 when loan is "yes", else 0.
data_frame_basic <- data_frame_no_NA |>
  filter(balance > 0 & job == "management") |>
  mutate(loan_value = as.numeric(loan %in% "yes"))

head(data_frame_basic, n = 6L)

##   age        job  marital education default balance housing loan   contact day
## 1  36 management  married  tertiary      no    2415     yes   no telephone  22
## 2  30 management   single  tertiary      no    1243     yes   no telephone  13
## 3  51 management divorced  tertiary      no     119      no   no  cellular  17
## 4  44 management  married  tertiary      no    6203     yes  yes  cellular  17
## 5  49 management  married  tertiary      no    1533      no   no  cellular  17
## 6  40 management   single secondary      no    1623     yes   no  cellular  17
##   month duration campaign pdays previous poutcome  y duration_in_mins
## 1   oct       73        1    86        4    other no        1.2166667
## 2   nov       86        1   174        1  failure no        1.4333333
## 3   nov      200        1   165        2  failure no        3.3333333
## 4   nov       58        1   188        1  failure no        0.9666667
## 5   nov      324        1   172        1  failure no        5.4000000
## 6   nov      161        1   167        2  failure no        2.6833333
##   balance_in_thousands loan_value
## 1                2.415          0
## 2                1.243          0
## 3                0.119          0
## 4                6.203          1
## 5                1.533          0
## 6                1.623          0

# Number of clients in each loan_value group.
data_frame_basic |>
  group_by(loan_value) |>
  summarise(
    num = n()
  )

## # A tibble: 2 × 2
##   loan_value   num
##        <dbl> <int>
## 1          0  1418
## 2          1   154

Plotting Facet averages

# Mean balance for each loan_value group; drawn as a dashed reference line
# in the matching facet below.
df_grouped <- data_frame_basic |>
  group_by(loan_value) |>
  summarise(mean_balance = mean(balance))

# Balance against duration, one panel per loan_value.
data_frame_basic |>
  ggplot() +
  geom_point(
    mapping = aes(x = duration, y = balance),
    color = "lightblue"
  ) +
  geom_hline(
    data = df_grouped,
    mapping = aes(yintercept = mean_balance),
    linetype = "dashed",
    color = "black"
  ) +
  facet_wrap(vars(loan_value), labeller = label_both) +
  labs(
    title = "Balance VS Duration",
    subtitle = "Faceted by client having a personal loan",
    x = "duration",
    y = "balance"
  ) +
  theme_minimal()

# Multiple regression of call duration (seconds) on balance and the loan
# indicator, followed by the fitted coefficients.
model <- lm(
  formula = duration ~ balance + loan_value,
  data = data_frame_basic
)
coef(model)

##   (Intercept)       balance    loan_value 
##  2.677673e+02 -4.309976e-04 -3.152233e+01

# Duration (minutes) against balance (thousands), coloured by loan_value,
# with one straight-line fit per colour group. Points are jittered
# horizontally only (height = 0) to reduce overplotting.
data_frame_basic |>
  ggplot(
    mapping = aes(
      x = balance_in_thousands,
      y = duration_in_mins,
      color = factor(loan_value)
    )
  ) +
  geom_jitter(width = 0.1, height = 0, size = 3, shape = "o") +
  geom_smooth(method = "lm", linewidth = 0.5, se = FALSE) +
  scale_color_brewer(palette = "Paired") +
  labs(
    title = "Balance VS Duration",
    subtitle = "Colored by the client having a loan (1= having a loan, 0 = not having a loan)",
    x = "Balance (x-jittered)",
    y = "Duration",
    color = "client having a loan i.e. 1= Having a loan; 0 = Not having a loan"
  ) +
  theme_hc()

## `geom_smooth()` using formula = 'y ~ x'

Since the points all overlap, it becomes difficult to interpret the separately fitted lines (i.e. the interaction between balance and loan) from the plot alone.

# Refit the multiple regression and show a compact coefficient table.
model <- lm(
  formula = duration ~ balance + loan_value,
  data = data_frame_basic
)

# Keep only the term names and their estimates, rounded to one decimal place.
# NOTE(review): at one decimal the balance slope (about -0.00043 per euro)
# is displayed as 0, which hides its sign and size.
model |>
  tidy() |>
  mutate(estimate = round(estimate, digits = 1)) |>
  select(term, estimate)

## # A tibble: 3 × 2
##   term        estimate
##   <chr>          <dbl>
## 1 (Intercept)    268. 
## 2 balance          0  
## 3 loan_value     -31.5