library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr 1.1.2 ✔ readr 2.1.4
## ✔ forcats 1.0.0 ✔ stringr 1.5.0
## ✔ ggplot2 3.4.3 ✔ tibble 3.2.1
## ✔ lubridate 1.9.2 ✔ tidyr 1.3.0
## ✔ purrr 1.0.2
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(ggthemes)
library(plotly)
##
## Attaching package: 'plotly'
##
## The following object is masked from 'package:ggplot2':
##
## last_plot
##
## The following object is masked from 'package:stats':
##
## filter
##
## The following object is masked from 'package:graphics':
##
## layout
library(ggrepel)
library(effsize)
library(pwrss)
##
## Attaching package: 'pwrss'
##
## The following object is masked from 'package:stats':
##
## power.t.test
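The startup messages above note that dplyr masks stats::filter() and stats::lag(), and that plotly and pwrss mask further functions. If a masked function is ever needed, the conflicted package mentioned in the tidyverse message can pin a preferred version; a minimal, optional sketch (not part of the original setup):
library(conflicted)
# Always resolve filter() and lag() to the dplyr versions
conflicts_prefer(dplyr::filter)
conflicts_prefer(dplyr::lag)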
data_frame <- read.csv('C:/Users/prera/OneDrive/Desktop/INFO-I590/bank-full2.csv', header = TRUE, sep = ",")
summary(data_frame)
## age job marital education
## Min. :18.00 Length:45211 Length:45211 Length:45211
## 1st Qu.:33.00 Class :character Class :character Class :character
## Median :39.00 Mode :character Mode :character Mode :character
## Mean :40.94
## 3rd Qu.:48.00
## Max. :95.00
## default balance housing loan
## Length:45211 Min. : -8019 Length:45211 Length:45211
## Class :character 1st Qu.: 72 Class :character Class :character
## Mode :character Median : 448 Mode :character Mode :character
## Mean : 1362
## 3rd Qu.: 1428
## Max. :102127
## contact day month duration
## Length:45211 Min. : 1.00 Length:45211 Min. : 0.0
## Class :character 1st Qu.: 8.00 Class :character 1st Qu.: 103.0
## Mode :character Median :16.00 Mode :character Median : 180.0
## Mean :15.81 Mean : 258.2
## 3rd Qu.:21.00 3rd Qu.: 319.0
## Max. :31.00 Max. :4918.0
## campaign pdays previous poutcome
## Min. : 1.000 Min. : -1.0 Min. : 0.0000 Length:45211
## 1st Qu.: 1.000 1st Qu.: -1.0 1st Qu.: 0.0000 Class :character
## Median : 2.000 Median : -1.0 Median : 0.0000 Mode :character
## Mean : 2.764 Mean : 40.2 Mean : 0.5803
## 3rd Qu.: 3.000 3rd Qu.: -1.0 3rd Qu.: 0.0000
## Max. :63.000 Max. :871.0 Max. :275.0000
## y
## Length:45211
## Class :character
## Mode :character
##
##
##
1 - age
2 - job
3 - marital: marital status
4 - education
5 - default: has credit in default?
6 - balance: average yearly balance, in euros
7 - housing: has housing loan?
8 - loan: has personal loan?
9 - contact: contact communication type
10 - day: last contact day of the month
11 - month: last contact month of year
12 - duration: last contact duration, in seconds
13 - campaign: number of contacts performed during this campaign and for this client
14 - pdays: number of days that passed since the client was last contacted during a previous campaign
15 - previous: number of contacts performed before this campaign and for this client
16 - poutcome: outcome of the previous marketing campaign
17 - y: has the client subscribed to a term deposit?
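The summary above stores every categorical column as character. Optionally converting those columns to factors would make summary() and table() report level counts; a minimal sketch (not part of the original run, which keeps them as character):
# Convert all character columns to factors so summaries show level counts
data_frame <- data_frame |>
  mutate(across(where(is.character), as.factor))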
\[ H_0: \text{Average balance remains the same for clients having a housing loan and clients not having a housing loan.} \]
The significance level α represents the probability of making a Type I error. A smaller α is more stringent and requires stronger evidence to reject the null hypothesis: if the impact of a false positive is high, we should choose a smaller α to minimize that risk, and with a large sample we can afford a smaller α.
Since the dataset is large and a client's housing-loan status and account balance are important to the bank, I set α to 3%, i.e. α = 0.03.
Power is a study's ability to detect a true difference. It is denoted 1 - β, where β is the probability of failing to reject the null hypothesis when it is false; the higher the power, the better the ability to detect effects. I set the power to 0.85.
# Cohen's d for balance: clients without a housing loan vs. clients with one
effect_size <- cohen.d(d = filter(data_frame, housing == 'no') |> pluck("balance"),
                       f = filter(data_frame, housing == 'yes') |> pluck("balance"))
effect_size
##
## Cohen's d
##
## d estimate: 0.1387277 (negligible)
## 95 percent confidence interval:
## lower upper
## 0.1201536 0.1573019
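As a sanity check, Cohen's d can be reproduced by hand from its definition, d = (mean1 - mean2) / s_pooled. A minimal sketch on the same two balance vectors (the bal_no/bal_yes names are mine):
bal_no  <- filter(data_frame, housing == 'no')  |> pluck("balance")
bal_yes <- filter(data_frame, housing == 'yes') |> pluck("balance")
# Pooled standard deviation, weighted by each group's degrees of freedom
s_pooled <- sqrt(((length(bal_no) - 1) * var(bal_no) +
                  (length(bal_yes) - 1) * var(bal_yes)) /
                 (length(bal_no) + length(bal_yes) - 2))
(mean(bal_no) - mean(bal_yes)) / s_pooled  # should be close to 0.1387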
# Power analysis: Cohen's d as the standardized mean difference (sd1 = 1)
test <- pwrss.t.2means(mu1 = 0.1387277,
                       sd1 = 1,
                       kappa = 1,
                       power = .85, alpha = 0.03,
                       alternative = "not equal")
## Difference between Two means
## (Independent Samples t Test)
## H0: mu1 = mu2
## HA: mu1 != mu2
## ------------------------------
## Statistical power = 0.85
## n1 = 1070
## n2 = 1070
## ------------------------------
## Alternative = "not equal"
## Degrees of freedom = 2138
## Non-centrality parameter = 3.209
## Type I error rate = 0.03
## Type II error rate = 0.15
plot(test)
## Warning in qt(1 - prob.extreme, df = df, ncp = ncp, lower.tail = TRUE): full
## precision may not have been achieved in 'pnt{final}'
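The required n can be cross-checked against base R's power routine; note that pwrss masks power.t.test, so the stats:: prefix is needed. This is only a verification sketch:
# Same inputs as pwrss.t.2means above; n is reported per group
stats::power.t.test(delta = 0.1387277, sd = 1,
                    sig.level = 0.03, power = 0.85,
                    type = "two.sample", alternative = "two.sided")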
From the above calculation, the required sample size is 1070 per group. Now summarizing the data in each category:
avg_balances <- data_frame |>
  filter(!is.na(housing)) |>
  group_by(housing) |>
  summarise(avg_balance = mean(balance, na.rm = TRUE),
            max_balance = max(balance, na.rm = TRUE),
            size = n())
avg_balances
## # A tibble: 2 × 4
## housing avg_balance max_balance size
## <chr> <dbl> <int> <int>
## 1 no 1597. 102127 20081
## 2 yes 1175. 58544 25130
Since each category contains far more observations than the required sample size, I can carry out a Neyman-Pearson hypothesis test.
\[ H_A: \text{Average balance does not remain the same for the clients having a housing loan or not having a housing loan.} \]
# avg_balances$avg_balance[1] - not having a housing loan
# avg_balances$avg_balance[2] - having a housing loan
observed_diff <- (avg_balances$avg_balance[1] - avg_balances$avg_balance[2])
paste("Observed Difference: ", observed_diff)
## [1] "Observed Difference: 421.398205790226"
bootstrap <- function(x, func = mean, n_iter = 10^4) {
  func_values <- numeric(n_iter)
  for (i in 1:n_iter) {
    # Resample whole rows of the data frame with replacement
    # (nrow(), not length(): length() of a data frame is its column count)
    x_sample <- sample_n(x, size = nrow(x), replace = TRUE)
    func_values[i] <- func(x_sample)
  }
  return(func_values)
}
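A quick way to sanity-check the helper is to bootstrap a simple statistic first, here the standard error of the overall mean balance (a small illustrative run; the se_mean_balance name and the iteration count are mine):
# Bootstrap standard error of the overall mean balance
se_mean_balance <- sd(bootstrap(data_frame,
                                func = function(df) mean(df$balance),
                                n_iter = 200))
se_mean_balance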
diff_in_avg <- function (x_data) {
avg_balances <- x_data |>
group_by(housing) |>
summarize(avg_balance = mean(balance)) |>
arrange(housing)
#not having a housing loan - having a housing loan
diff <- (avg_balances$avg_balance[1] - avg_balances$avg_balance[2])
return(diff)
}
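Applied to the full data, diff_in_avg() should reproduce the observed difference computed earlier:
diff_in_avg(data_frame)  # expected to match observed_diff, about 421.4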
diffs_in_balance_avgs <- bootstrap(data_frame, diff_in_avg, n_iter = 500)
ggplot() +
geom_function(xlim = c(-500, 500),
fun = function(x) dnorm(x, mean = 0,
sd = sd(diffs_in_balance_avgs))) +
geom_vline(mapping = aes(xintercept = observed_diff,
color = paste("observed: ",
round(observed_diff)))) +
labs(title = "Bootstrapped Sampling Distribution of Balance Differences",
x = "Difference in Balance Calculated",
y = "Probability Density",
color = "") +
theme_minimal()
alpha <- 0.03 # Significance level
critical_value <- qnorm(1 - alpha/2) # Two-tailed test
critical_value
## [1] 2.17009
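Under this normal null model (mean 0, bootstrap SD), the observed difference can be standardized and compared against the critical value; a minimal sketch (z_stat is my name):
# Standardize the observed difference against the bootstrap null SD
z_stat <- observed_diff / sd(diffs_in_balance_avgs)
z_stat > critical_value   # TRUE means reject H0 at alpha = 0.03
2 * pnorm(-abs(z_stat))   # corresponding two-tailed p-value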
delta <- 0.1387277
f_0 <- function(x) dnorm(x, mean = 0)
f_a <- function(x) dnorm(x, mean = delta)
ggplot() +
stat_function(mapping = aes(fill = 'power'),
fun = f_a,
xlim = c(critical_value, 4),
geom = "area") +
stat_function(mapping = aes(fill = 'alpha'),
fun = f_0,
xlim = c(critical_value, 4),
geom = "area") +
geom_function(mapping = aes(color = 'Null Hypothesis'),
xlim = c(-4, 4),
fun = f_0) +
geom_function(mapping = aes(color = 'Alternative Hypothesis'),
xlim = c(-4, 4),
fun = f_a) +
geom_vline(mapping = aes(xintercept = critical_value,
color = "Critical Value")) +
geom_vline(mapping = aes(xintercept = delta,
color = "Delta")) +
geom_vline(mapping = aes(xintercept = 0),
color = 'gray', linetype=2) +
labs(title = "One-Tailed Test Illustration",
x = "Test Statistic",
y = "Probability Density",
color = "",
fill = "") +
scale_x_continuous(breaks = seq(-4, 4, 1)) +
scale_fill_manual(values = c('lightblue', 'pink')) +
scale_color_manual(values = c('darkred', 'darkorange', 'darkblue',
'darkgreen')) +
theme_minimal()
sample_loan_yes <- data_frame$balance[data_frame$housing == 'yes']
sample_loan_no <- data_frame$balance[data_frame$housing == 'no']
# Impute missing values with the mean of the respective samples
sample_loan_yes[is.na(sample_loan_yes)] <- mean(sample_loan_yes, na.rm = TRUE)
sample_loan_no[is.na(sample_loan_no)] <- mean(sample_loan_no, na.rm = TRUE)
t_test_result <- t.test(sample_loan_yes, sample_loan_no)
# Print t-test results
print(t_test_result)
##
## Welch Two Sample t-test
##
## data: sample_loan_yes and sample_loan_no
## t = -14.081, df = 34204, p-value < 2.2e-16
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
## -480.0551 -362.7413
## sample estimates:
## mean of x mean of y
## 1175.103 1596.501
if (t_test_result$p.value < alpha) {
cat("Reject the null hypothesis. There is a significant difference in average balance between clients having a housing loan and not having a housing loan")
} else {
cat("Fail to reject the null hypothesis. There is no significant difference in average balance between clients having a housing loan and not having a housing loan")
}
## Reject the null hypothesis. There is a significant difference in average balance between clients having a housing loan and not having a housing loan
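One refinement, sketched here as an optional variation rather than what produced the output above: t.test() defaults to a 95% confidence interval, while α in this analysis is 0.03, so the interval can be matched to the chosen level:
# Report a 97% CI to match alpha = 0.03
t.test(sample_loan_yes, sample_loan_no, conf.level = 1 - alpha)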
A Fisher's exact test needs counts of successes and non-successes in a small contingency table to be meaningful; given the nature of my data (a continuous balance) and the null hypothesis, that test would not be appropriate, so I opt for a Chi-Square test instead.
contingency_table <- table(data_frame$balance, data_frame$housing)
chisq.test(contingency_table, correct = TRUE)
## Warning in chisq.test(contingency_table, correct = TRUE): Chi-squared
## approximation may be incorrect
##
## Pearson's Chi-squared test
##
## data: contingency_table
## X-squared = 9661.8, df = 7167, p-value < 2.2e-16
The p-value is far smaller than the alpha level of 0.03, which provides strong evidence against the null hypothesis.
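The warning arises because the chi-square test above treats every distinct balance as its own category, producing a very sparse table. A hedged alternative sketch: dichotomizing balance at its median (my choice of cut point) yields a 2×2 table on which both chisq.test() and fisher.test() are well behaved:
# 2x2 table: balance above/below the median vs. housing-loan status
balance_split <- table(data_frame$balance > median(data_frame$balance),
                       data_frame$housing)
chisq.test(balance_split, correct = TRUE)
fisher.test(balance_split)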
ggplot(data_frame, aes(x = housing, y = balance)) +
geom_boxplot() +
labs(title = "Having a Housing Loan VS Balance", x = "Having a housing loan", y = "Balance") +
theme_minimal()
p <- avg_balances |>
  ggplot(aes(x = housing, y = avg_balance)) +
  geom_col(fill = "lightpink") +
  theme_minimal()
p
As the graph above shows, there is a clear difference in the average balances of the two groups of clients.
\[ H_0: \text{Average age is the same for clients having a housing loan and clients not having a housing loan.} \]
As before, α represents the probability of making a Type I error, and a smaller α requires stronger evidence to reject the null hypothesis.
Since the dataset is large and a client's age is important to the bank for improving targeted marketing, I set α to 5%, i.e. α = 0.05.
Power, 1 - β, is again the study's ability to detect a true effect; here I set it to 0.80.
# Cohen's d for age: clients without a housing loan vs. clients with one
effect_size <- cohen.d(d = filter(data_frame, housing == 'no') |> pluck("age"),
                       f = filter(data_frame, housing == 'yes') |> pluck("age"))
effect_size
##
## Cohen's d
##
## d estimate: 0.3799486 (small)
## 95 percent confidence interval:
## lower upper
## 0.3612320 0.3986653
test <- pwrss.t.2means(mu1 = 0.3799486,
sd1 = 1,
kappa = 1,
power = .80, alpha = 0.05,
alternative = "not equal")
## Difference between Two means
## (Independent Samples t Test)
## H0: mu1 = mu2
## HA: mu1 != mu2
## ------------------------------
## Statistical power = 0.8
## n1 = 110
## n2 = 110
## ------------------------------
## Alternative = "not equal"
## Degrees of freedom = 218
## Non-centrality parameter = 2.818
## Type I error rate = 0.05
## Type II error rate = 0.2
plot(test)
## Warning in qt(1 - prob.extreme, df = df, ncp = ncp, lower.tail = TRUE): full
## precision may not have been achieved in 'pnt{final}'
From the above calculation, the required sample size is 110 per group. Now summarizing the data in each category:
avg_ages <- data_frame |>
  filter(!is.na(housing)) |>
  group_by(housing) |>
  summarise(avg_age = mean(age, na.rm = TRUE), size = n())
avg_ages
## # A tibble: 2 × 3
## housing avg_age size
## <chr> <dbl> <int>
## 1 no 43.1 20081
## 2 yes 39.2 25130
Since each category contains far more observations than the required sample size, I can carry out a Neyman-Pearson hypothesis test.
\[ H_A: \text{Average age is not the same for the clients having a housing loan or not having a housing loan.} \]
observed_diff <- (avg_ages$avg_age[1] - avg_ages$avg_age[2])
paste("Observed Difference: ", observed_diff)
## [1] "Observed Difference: 3.96459497213761"
diff_in_avg <- function (x_data) {
avg_ages <- x_data |>
group_by(housing) |>
summarize(avg_age = mean(age)) |>
arrange(housing)
#not having a housing loan - having a housing loan
diff <- (avg_ages$avg_age[1] - avg_ages$avg_age[2])
return(diff)
}
diffs_in_age_avgs <- bootstrap(data_frame, diff_in_avg, n_iter = 500)
ggplot() +
  # xlim narrowed from the balance plot's c(-500, 500): bootstrapped age
  # differences span only a few years, so a +/-10 window shows the curve
  geom_function(xlim = c(-10, 10),
                fun = function(x) dnorm(x, mean = 0,
                                        sd = sd(diffs_in_age_avgs))) +
geom_vline(mapping = aes(xintercept = observed_diff,
color = paste("observed: ",
round(observed_diff)))) +
labs(title = "Bootstrapped Sampling Distribution of Age Differences",
x = "Difference in Age Calculated",
y = "Probability Density",
color = "") +
theme_minimal()
alpha <- 0.05 # Significance level
critical_value <- qnorm(1 - alpha/2)
critical_value
## [1] 1.959964
delta <- 0.3799486
f_0 <- function(x) dnorm(x, mean = 0)
f_a <- function(x) dnorm(x, mean = delta)
ggplot() +
stat_function(mapping = aes(fill = 'power'),
fun = f_a,
xlim = c(critical_value, 4),
geom = "area") +
stat_function(mapping = aes(fill = 'alpha'),
fun = f_0,
xlim = c(critical_value, 4),
geom = "area") +
geom_function(mapping = aes(color = 'Null Hypothesis'),
xlim = c(-4, 4),
fun = f_0) +
geom_function(mapping = aes(color = 'Alternative Hypothesis'),
xlim = c(-4, 4),
fun = f_a) +
geom_vline(mapping = aes(xintercept = critical_value,
color = "Critical Value")) +
geom_vline(mapping = aes(xintercept = delta,
color = "Delta")) +
geom_vline(mapping = aes(xintercept = 0),
color = 'gray', linetype=2) +
labs(title = "One-Tailed Test Illustration",
x = "Test Statistic",
y = "Probability Density",
color = "",
fill = "") +
scale_x_continuous(breaks = seq(-4, 4, 1)) +
scale_fill_manual(values = c('lightblue', 'pink')) +
scale_color_manual(values = c('darkred', 'darkorange', 'darkblue',
'darkgreen')) +
theme_minimal()
sample_loan_yes <- data_frame$age[data_frame$housing == 'yes']
sample_loan_no <- data_frame$age[data_frame$housing == 'no']
# Impute missing values with the mean of the respective samples
sample_loan_yes[is.na(sample_loan_yes)] <- mean(sample_loan_yes, na.rm = TRUE)
sample_loan_no[is.na(sample_loan_no)] <- mean(sample_loan_no, na.rm = TRUE)
t_test_result <- t.test(sample_loan_no, sample_loan_yes)
# Print t-test results
print(t_test_result)
##
## Welch Two Sample t-test
##
## data: sample_loan_no and sample_loan_yes
## t = 38.853, df = 36001, p-value < 2.2e-16
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
## 3.764592 4.164598
## sample estimates:
## mean of x mean of y
## 43.13988 39.17529
if (t_test_result$p.value < alpha) {
cat("Reject the null hypothesis. There is a significant difference in average age between clients having a housing loan and not having a housing loan")
} else {
cat("Fail to reject the null hypothesis. There is no significant difference in average age between clients having a housing loan and not having a housing loan")
}
## Reject the null hypothesis. There is a significant difference in average age between clients having a housing loan and not having a housing loan
As with balance, a Fisher's exact test would need success and non-success counts in a small contingency table and is not appropriate for this data and null hypothesis, so I again opt for a Chi-Square test.
contingency_table <- table(data_frame$age, data_frame$housing)
chisq.test(contingency_table, correct = TRUE)
## Warning in chisq.test(contingency_table, correct = TRUE): Chi-squared
## approximation may be incorrect
##
## Pearson's Chi-squared test
##
## data: contingency_table
## X-squared = 2688.3, df = 76, p-value < 2.2e-16
The p-value is far smaller than the alpha level of 0.05, which provides strong evidence against the null hypothesis.
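As with balance, the raw-age table is sparse (hence the warning above). Binning age into a few groups first gives a better-conditioned table; the breakpoints below are illustrative assumptions:
# Chi-square on binned ages vs. housing-loan status
age_groups <- cut(data_frame$age,
                  breaks = c(17, 30, 40, 50, 60, 100),
                  labels = c("18-30", "31-40", "41-50", "51-60", "60+"))
chisq.test(table(age_groups, data_frame$housing))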
ggplot(data_frame, aes(x = housing, y = age)) +
geom_boxplot() +
labs(title = "Having a Housing Loan VS Age", x = "Having a housing loan", y = "Age") +
theme_minimal()
p <- avg_ages |>
  ggplot(aes(x = housing, y = avg_age)) +
  geom_col(fill = "lightblue") +
  theme_minimal()
p