library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr 1.1.2 ✔ readr 2.1.4
## ✔ forcats 1.0.0 ✔ stringr 1.5.0
## ✔ ggplot2 3.4.3 ✔ tibble 3.2.1
## ✔ lubridate 1.9.2 ✔ tidyr 1.3.0
## ✔ purrr 1.0.2
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(ggthemes)
library(plotly)
##
## Attaching package: 'plotly'
##
## The following object is masked from 'package:ggplot2':
##
## last_plot
##
## The following object is masked from 'package:stats':
##
## filter
##
## The following object is masked from 'package:graphics':
##
## layout
library(ggrepel)
library(effsize)
library(pwrss)
##
## Attaching package: 'pwrss'
##
## The following object is masked from 'package:stats':
##
## power.t.test
library(ggplot2)
library(broom)
library(lindia)
# Load the bank marketing data (comma-separated file with a header row).
data_frame <- read.csv(
  file = "C:/Users/prera/OneDrive/Desktop/INFO-I590/bank-full2.csv",
  header = TRUE,
  sep = ","
)
library(patchwork)
# Column-by-column overview of the raw data.
summary(object = data_frame)
## age job marital education
## Min. :18.00 Length:45211 Length:45211 Length:45211
## 1st Qu.:33.00 Class :character Class :character Class :character
## Median :39.00 Mode :character Mode :character Mode :character
## Mean :40.94
## 3rd Qu.:48.00
## Max. :95.00
## default balance housing loan
## Length:45211 Min. : -8019 Length:45211 Length:45211
## Class :character 1st Qu.: 72 Class :character Class :character
## Mode :character Median : 448 Mode :character Mode :character
## Mean : 1362
## 3rd Qu.: 1428
## Max. :102127
## contact day month duration
## Length:45211 Min. : 1.00 Length:45211 Min. : 0.0
## Class :character 1st Qu.: 8.00 Class :character 1st Qu.: 103.0
## Mode :character Median :16.00 Mode :character Median : 180.0
## Mean :15.81 Mean : 258.2
## 3rd Qu.:21.00 3rd Qu.: 319.0
## Max. :31.00 Max. :4918.0
## campaign pdays previous poutcome
## Min. : 1.000 Min. : -1.0 Min. : 0.0000 Length:45211
## 1st Qu.: 1.000 1st Qu.: -1.0 1st Qu.: 0.0000 Class :character
## Median : 2.000 Median : -1.0 Median : 0.0000 Mode :character
## Mean : 2.764 Mean : 40.2 Mean : 0.5803
## 3rd Qu.: 3.000 3rd Qu.: -1.0 3rd Qu.: 0.0000
## Max. :63.000 Max. :871.0 Max. :275.0000
## y
## Length:45211
## Class :character
## Mode :character
##
##
##
1 - age;
2 - job;
3 - marital(marital status);
4 - education;
5 - default: has credit in default?;
6 - balance: average yearly balance, in euros
7 - housing: has housing loan?;
8 - loan: has personal loan?;
9 - contact: contact communication type;
10 - day: last contact day of the month
11 - month: last contact month of year;
12 - duration: last contact duration, in seconds;
13 - campaign: number of contacts performed during this campaign and for this client
14 - pdays: number of days that passed by after the client was last contacted from a previous campaign
15 - previous: number of contacts performed before this campaign and for this client
16 - poutcome: outcome of the previous marketing campaign;
17 - y : has the client subscribed a term deposit?
H0 - The average duration of a marketing call is the same for all groups when the data is grouped by the type of job.
# Keep only the rows that have no missing value in any column.
data_frame_no_NA <- na.omit(object = data_frame)
For better understanding, I am adding a column to convert the duration in seconds to minutes.
# Derived column: call duration converted from seconds to minutes.
data_frame_no_NA[["duration_in_mins"]] <- data_frame_no_NA[["duration"]] / 60
head(x = data_frame_no_NA)
## age job marital education default balance housing loan contact
## 24061 33 admin. married tertiary no 882 no no telephone
## 24063 42 admin. single secondary no -247 yes yes telephone
## 24065 33 services married secondary no 3444 yes no telephone
## 24073 36 management married tertiary no 2415 yes no telephone
## 24078 36 management married tertiary no 0 yes no telephone
## 24087 44 blue-collar married secondary no 1324 yes no telephone
## day month duration campaign pdays previous poutcome y duration_in_mins
## 24061 21 oct 39 1 151 3 failure no 0.650000
## 24063 21 oct 519 1 166 1 other yes 8.650000
## 24065 21 oct 144 1 91 4 failure yes 2.400000
## 24073 22 oct 73 1 86 4 other no 1.216667
## 24078 23 oct 140 1 143 3 failure yes 2.333333
## 24087 25 oct 119 1 89 2 other no 1.983333
# Five-number summary (plus mean) of the call duration in minutes.
summary(data_frame_no_NA[["duration_in_mins"]])
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.08333 1.88333 3.23333 4.35484 5.40000 36.98333
# Mean and standard deviation of the call duration (seconds) and the number
# of rows, computed separately for every job category.
summarise(
  group_by(data_frame_no_NA, job),
  avg_duration = mean(duration, na.rm = TRUE),
  sd_duration = sd(duration, na.rm = TRUE),
  size = n()
)
## # A tibble: 11 × 4
## job avg_duration sd_duration size
## <chr> <dbl> <dbl> <int>
## 1 admin. 247. 216. 1057
## 2 blue-collar 252. 234. 1537
## 3 entrepreneur 280. 273. 211
## 4 housemaid 237. 206. 146
## 5 management 261. 239. 1753
## 6 retired 326. 248. 458
## 7 self-employed 274. 259. 264
## 8 services 259. 242. 682
## 9 student 262. 208. 237
## 10 technician 253. 235. 1289
## 11 unemployed 307. 243. 208
# Box plot of call duration (minutes) per job category; the job labels are
# rotated by 90 degrees so that they do not overlap.
ggplot(
  data = data_frame_no_NA,
  mapping = aes(x = job, y = duration_in_mins)
) +
  geom_boxplot() +
  labs(
    x = "Job type",
    y = "Duration in minutes"
  ) +
  theme(axis.text.x = element_text(angle = 90, vjust = 0.5, hjust = 1))
# Histograms of call duration (seconds), one panel per job category.
#
# The former `par(mfrow = c(5, 2))` call was removed: `par()` only configures
# base graphics, so it had no effect on these ggplot objects (patchwork does
# the panel layout) and merely left the global graphics state modified.

# Build the duration histogram for a single job category.
#
# job_name: value of the `job` column to keep.
# data:     data frame with `job` and `duration` columns.
# Returns a ggplot object titled with the job name so that the panels can be
# told apart in the combined figure.
plot_job_duration_hist <- function(job_name, data = data_frame_no_NA) {
  data |>
    filter(job == job_name) |>
    ggplot(aes(x = duration)) +
    geom_histogram(bins = 100) +
    labs(title = job_name) +
    theme_minimal()
}

p1 <- plot_job_duration_hist("admin.")
p2 <- plot_job_duration_hist("blue-collar")
p3 <- plot_job_duration_hist("entrepreneur")
p4 <- plot_job_duration_hist("housemaid")
p5 <- plot_job_duration_hist("management")
p6 <- plot_job_duration_hist("retired")
p7 <- plot_job_duration_hist("self-employed")
p8 <- plot_job_duration_hist("services")
p9 <- plot_job_duration_hist("student")
p10 <- plot_job_duration_hist("technician")
# "unemployed" is one of the eleven job groups compared in the analysis, so
# its distribution is shown alongside the others.
p11 <- plot_job_duration_hist("unemployed")

# patchwork's `+` combines the individual plots into one multi-panel figure.
p1 + p2 + p3 + p4 + p5 + p6 + p7 + p8 + p9 + p10 + p11
# n = number of observations, k = number of distinct job groups; they give
# the degrees of freedom (k - 1, n - k) of the F distribution drawn here.
n <- nrow(data_frame_no_NA)
k <- length(unique(data_frame_no_NA$job))
ggplot() +
  geom_function(
    fun = function(f_value) df(f_value, df1 = k - 1, df2 = n - k),
    xlim = c(0, 10)
  ) +
  # Vertical reference line at F = 1.
  geom_vline(xintercept = 1, color = "blue") +
  labs(
    title = "F Distribution for different jobs",
    x = "F Values",
    y = "Probability Density"
  ) +
  theme_hc()
# One-way ANOVA of call duration (minutes) across job categories.
m <- aov(formula = duration_in_mins ~ job, data = data_frame_no_NA)
summary(object = m)
## Df Sum Sq Mean Sq F value Pr(>F)
## job 10 834 83.41 5.412 4.93e-08 ***
## Residuals 7831 120684 15.41
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
Since the p-value (4.93e-08) is far below 0.05, we have enough evidence to reject the null hypothesis. Hence we can conclude that the average duration of the marketing calls is not the same for all job groups, i.e. at least one job group differs from the others.
# Pairwise t-tests of call duration between job categories with Bonferroni
# adjustment. The `x` and `g` expressions are written out in full because
# they are echoed in the "data:" line of the printed result.
pairwise.t.test(
  x = data_frame_no_NA$duration_in_mins,
  g = data_frame_no_NA$job,
  p.adjust.method = "bonferroni"
)
##
## Pairwise comparisons using t tests with pooled SD
##
## data: data_frame_no_NA$duration_in_mins and data_frame_no_NA$job
##
## admin. blue-collar entrepreneur housemaid management retired
## blue-collar 1.00000 - - - - -
## entrepreneur 1.00000 1.00000 - - - -
## housemaid 1.00000 1.00000 1.00000 - - -
## management 1.00000 1.00000 1.00000 1.00000 - -
## retired 1.1e-07 2.0e-07 1.00000 0.00404 8.2e-06 -
## self-employed 1.00000 1.00000 1.00000 1.00000 1.00000 0.26555
## services 1.00000 1.00000 1.00000 1.00000 1.00000 0.00016
## student 1.00000 1.00000 1.00000 1.00000 1.00000 0.03925
## technician 1.00000 1.00000 1.00000 1.00000 1.00000 8.3e-07
## unemployed 0.03985 0.07896 1.00000 0.31906 0.40002 1.00000
## self-employed services student technician
## blue-collar - - - -
## entrepreneur - - - -
## housemaid - - - -
## management - - - -
## retired - - - -
## self-employed - - - -
## services 1.00000 - - -
## student 1.00000 1.00000 - -
## technician 1.00000 1.00000 1.00000 -
## unemployed 1.00000 0.55068 1.00000 0.11946
##
## P value adjustment method: bonferroni
Balance -
# Derived column: account balance rescaled from units to thousands.
data_frame_no_NA[["balance_in_thousands"]] <- data_frame_no_NA[["balance"]] / 1000
head(x = data_frame_no_NA)
## age job marital education default balance housing loan contact
## 24061 33 admin. married tertiary no 882 no no telephone
## 24063 42 admin. single secondary no -247 yes yes telephone
## 24065 33 services married secondary no 3444 yes no telephone
## 24073 36 management married tertiary no 2415 yes no telephone
## 24078 36 management married tertiary no 0 yes no telephone
## 24087 44 blue-collar married secondary no 1324 yes no telephone
## day month duration campaign pdays previous poutcome y duration_in_mins
## 24061 21 oct 39 1 151 3 failure no 0.650000
## 24063 21 oct 519 1 166 1 other yes 8.650000
## 24065 21 oct 144 1 91 4 failure yes 2.400000
## 24073 22 oct 73 1 86 4 other no 1.216667
## 24078 23 oct 140 1 143 3 failure yes 2.333333
## 24087 25 oct 119 1 89 2 other no 1.983333
## balance_in_thousands
## 24061 0.882
## 24063 -0.247
## 24065 3.444
## 24073 2.415
## 24078 0.000
## 24087 1.324
# Scatter plot of call duration against balance with a least-squares line.
#
# Duration is the response and balance the explanatory variable in the
# regression used in this analysis (duration_in_mins ~ balance_in_thousands),
# so balance belongs on the x-axis. geom_smooth(method = "lm") fits y ~ x;
# with the axes the other way round it would draw balance ~ duration, which
# is a different line from the reported model.
data_frame_no_NA |>
  ggplot(mapping = aes(x = balance_in_thousands, y = duration_in_mins)) +
  geom_point(size = 2, color = "lightpink") +
  geom_smooth(method = "lm", se = TRUE, color = "darkblue") +
  labs(
    x = "Balance (thousands of euros)",
    y = "Duration (minutes)"
  ) +
  theme_minimal()
## `geom_smooth()` using formula = 'y ~ x'
# Simple linear regression: call duration (minutes) explained by balance
# (thousands); print the intercept and slope.
model <- lm(
  formula = duration_in_mins ~ balance_in_thousands,
  data = data_frame_no_NA
)
coef(model)
## (Intercept) balance_in_thousands
## 4.27450598 0.05175247
For the above linear regression model, the fitted line has the equation
Duration (y, in minutes) = 0.0518 * Balance (x, in thousands of euros) + 4.27
With the above equation we can describe the relation between duration and balance: each additional 1,000 euros of balance is associated with roughly 0.05 more minutes of call duration.
# Refit the simple regression and tabulate each coefficient with its
# standard error, test statistic, p-value and confidence interval.
model <- lm(
  formula = duration_in_mins ~ balance_in_thousands,
  data = data_frame_no_NA
)
tidy(x = model, conf.int = TRUE)
## # A tibble: 2 × 7
## term estimate std.error statistic p.value conf.low conf.high
## <chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 (Intercept) 4.27 0.0497 86.0 0 4.18 4.37
## 2 balance_in_thousands 0.0518 0.0144 3.59 0.000328 0.0235 0.0800
The above output includes information about hypothesis tests for each coefficient:
estimate - estimated value of the coefficient.
std.error - standard error of the estimate.
statistic - value of the t-statistic used in the hypothesis test.
p.value - Provides the p-value for the hypothesis test.
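From the above we can see that the p-value for balance_in_thousands (0.000328) is less than 0.05, hence the null hypothesis of no linear relationship is rejected at the 5% significance level. However, since the relation between balance and duration is not entirely positive and straight, this conclusion should be treated with caution: a straight line may not describe the relationship well.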
# Refit the simple regression and draw the lindia residuals-vs-fitted
# diagnostic plot for it.
model <- lm(
  formula = duration_in_mins ~ balance_in_thousands,
  data = data_frame_no_NA
)
gg_resfitted(model) + theme_minimal()
From this particular plot, we can already see that one of our assumptions is violated.
The Response variable = Duration and Explanatory Variable = Balance, job, loan
I am going to consider only the rows where the balance is above 0 and the job is ‘management’, and use mutate to create a numeric loan_value column from the loan column.
# Subset: management clients with a positive balance. `loan_value` encodes
# the `loan` column numerically (1 when loan is "yes", otherwise 0).
data_frame_basic <- data_frame_no_NA |>
  filter(balance > 0 & job == "management") |>
  mutate(loan_value = as.numeric(loan %in% "yes"))
head(x = data_frame_basic)
## age job marital education default balance housing loan contact day
## 1 36 management married tertiary no 2415 yes no telephone 22
## 2 30 management single tertiary no 1243 yes no telephone 13
## 3 51 management divorced tertiary no 119 no no cellular 17
## 4 44 management married tertiary no 6203 yes yes cellular 17
## 5 49 management married tertiary no 1533 no no cellular 17
## 6 40 management single secondary no 1623 yes no cellular 17
## month duration campaign pdays previous poutcome y duration_in_mins
## 1 oct 73 1 86 4 other no 1.2166667
## 2 nov 86 1 174 1 failure no 1.4333333
## 3 nov 200 1 165 2 failure no 3.3333333
## 4 nov 58 1 188 1 failure no 0.9666667
## 5 nov 324 1 172 1 failure no 5.4000000
## 6 nov 161 1 167 2 failure no 2.6833333
## balance_in_thousands loan_value
## 1 2.415 0
## 2 1.243 0
## 3 0.119 0
## 4 6.203 1
## 5 1.533 0
## 6 1.623 0
# Number of rows in each loan_value group.
summarise(
  group_by(data_frame_basic, loan_value),
  num = n()
)
## # A tibble: 2 × 2
## loan_value num
## <dbl> <int>
## 1 0 1418
## 2 1 154
# Mean balance per loan_value group, drawn below as a dashed reference line.
df_grouped <- summarise(
  group_by(data_frame_basic, loan_value),
  mean_balance = mean(balance)
)

# Balance against duration, one panel per loan_value group.
ggplot(data = data_frame_basic) +
  geom_point(
    mapping = aes(x = duration, y = balance),
    color = "lightblue"
  ) +
  geom_hline(
    data = df_grouped,
    mapping = aes(yintercept = mean_balance),
    color = "black",
    linetype = "dashed"
  ) +
  facet_wrap(vars(loan_value), labeller = label_both) +
  labs(
    title = "Balance VS Duration",
    subtitle = "Faceted by client having a personal loan",
    x = "duration",
    y = "balance"
  ) +
  theme_minimal()
# Regression of call duration (seconds) on balance and the loan indicator;
# print the fitted coefficients.
model <- lm(
  formula = duration ~ balance + loan_value,
  data = data_frame_basic
)
coef(model)
## (Intercept) balance loan_value
## 2.677673e+02 -4.309976e-04 -3.152233e+01
# Duration (minutes) against balance (thousands), coloured by the loan
# indicator, with a separate least-squares line for each colour group.
# Points are jittered horizontally only.
ggplot(
  data = data_frame_basic,
  mapping = aes(
    x = balance_in_thousands,
    y = duration_in_mins,
    color = factor(loan_value)
  )
) +
  geom_jitter(height = 0, width = 0.1, shape = "o", size = 3) +
  geom_smooth(method = "lm", se = FALSE, linewidth = 0.5) +
  scale_color_brewer(palette = "Paired") +
  labs(
    title = "Balance VS Duration",
    subtitle = "Colored by the client having a loan (1= having a loan, 0 = not having a loan)",
    x = "Balance (x-jittered)",
    y = "Duration",
    color = "client having a loan i.e. 1= Having a loan; 0 = Not having a loan"
  ) +
  theme_hc()
## `geom_smooth()` using formula = 'y ~ x'
Since the points are all overlapping, it becomes difficult to interpret the added interaction terms.
# Refit the two-predictor model and show only the term names with their
# estimates rounded to one decimal place, which is easier to read.
model <- lm(
  formula = duration ~ balance + loan_value,
  data = data_frame_basic
)
mutate(
  select(tidy(model), term, estimate),
  estimate = round(estimate, digits = 1)
)
## # A tibble: 3 × 2
## term estimate
## <chr> <dbl>
## 1 (Intercept) 268.
## 2 balance 0
## 3 loan_value -31.5