library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(ggplot2)
library(readr)
# Load the churn data set from the working directory; column types are guessed
# by readr and reported in the message below.
bank_data <- readr::read_csv(file = "BankChurners.csv")
## Rows: 10127 Columns: 23
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (6): Attrition_Flag, Gender, Education_Level, Marital_Status, Income_Ca...
## dbl (17): CLIENTNUM, Customer_Age, Dependent_count, Months_on_book, Total_Re...
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
Find two categorical variables (other than Attrition_Flag) that are
strongly dependent on each other. Then create a graph to illustrate
that dependence.
# Cross-tabulate gender against income category and test the two for
# independence with Pearson's chi-squared test. The result is stored so the
# test statistic can be reused for the effect size below.
tab_G_I <- table(bank_data$Gender, bank_data$Income_Category)
chisq_G_I <- chisq.test(tab_G_I)
chisq_G_I
##
## Pearson's Chi-squared test
##
## data: tab_G_I
## X-squared = 7138.4, df = 5, p-value < 2.2e-16
# With about ten thousand rows even a weak association yields a tiny p-value,
# so the test alone cannot show that the dependence is *strong*. Cramér's V
# rescales the statistic to [0, 1]:
#   V = sqrt(X^2 / (n * (min(rows, cols) - 1)))
cramers_v_G_I <- sqrt(
  unname(chisq_G_I$statistic) /
    (sum(tab_G_I) * (min(dim(tab_G_I)) - 1))
)
# From the figures printed above (X-squared = 7138.4; n = 10127, assuming
# table() drops no missing values) this works out to roughly 0.84.
cramers_v_G_I
# One stacked bar per gender, rescaled to 100 %, so the income-category mix of
# the genders can be compared directly regardless of group size.
ggplot(
  data = bank_data,
  mapping = aes(x = Gender, fill = Income_Category)
) +
  geom_bar(position = "fill") +
  scale_y_continuous(labels = scales::percent) +
  ggtitle("Income category distribution by gender") +
  ylab("Proportion")

Find at least four variables that have a non-negligible correlation with, or
dependence on, Attrition_Flag. Show how you found them.
# Show how the candidate variables were found: rank every numeric column by
# the absolute point-biserial correlation (Pearson correlation with a 0/1
# attrition indicator). Unlike a p-value, this is an effect size, so it can
# separate negligible from non-negligible associations at this sample size.
# Only numeric columns are screened here.
is_attrited <- as.integer(bank_data$Attrition_Flag == "Attrited Customer")
is_numeric_col <- vapply(bank_data, is.numeric, logical(1))
candidate_vars <- names(bank_data)[is_numeric_col]
# NOTE(review): CLIENTNUM looks like a customer identifier rather than a
# measurement, and the public BankChurners.csv is reported to ship
# Naive_Bayes_* columns derived from the target. Both are excluded if present;
# confirm against the file.
candidate_vars <- candidate_vars[
  candidate_vars != "CLIENTNUM" & !startsWith(candidate_vars, "Naive_Bayes")
]
attrition_cor <- vapply(
  candidate_vars,
  function(v) cor(bank_data[[v]], is_attrited, use = "complete.obs"),
  numeric(1)
)
sort(abs(attrition_cor), decreasing = TRUE)
# Welch two-sample t-test: does the mean transaction count differ between
# attrited and existing customers?
t.test(Total_Trans_Ct ~ Attrition_Flag, data = bank_data)
##
## Welch Two Sample t-test
##
## data: Total_Trans_Ct by Attrition_Flag
## t = -54.142, df = 3386.1, p-value < 2.2e-16
## alternative hypothesis: true difference in means between group Attrited Customer and group Existing Customer is not equal to 0
## 95 percent confidence interval:
## -24.59864 -22.87930
## sample estimates:
## mean in group Attrited Customer mean in group Existing Customer
## 44.93362 68.67259
# Welch two-sample t-test: does the mean total transaction amount differ
# between attrited and existing customers?
t.test(
  formula = Total_Trans_Amt ~ Attrition_Flag,
  data = bank_data
)
##
## Welch Two Sample t-test
##
## data: Total_Trans_Amt by Attrition_Flag
## t = -22.686, df = 3264.5, p-value < 2.2e-16
## alternative hypothesis: true difference in means between group Attrited Customer and group Existing Customer is not equal to 0
## 95 percent confidence interval:
## -1694.425 -1424.835
## sample estimates:
## mean in group Attrited Customer mean in group Existing Customer
## 3095.026 4654.656
# Welch two-sample t-test: does the mean of Months_Inactive_12_mon differ
# between attrited and existing customers?
t.test(
  formula = Months_Inactive_12_mon ~ Attrition_Flag,
  data = bank_data
)
##
## Welch Two Sample t-test
##
## data: Months_Inactive_12_mon by Attrition_Flag
## t = 16.862, df = 2489.8, p-value < 2.2e-16
## alternative hypothesis: true difference in means between group Attrited Customer and group Existing Customer is not equal to 0
## 95 percent confidence interval:
## 0.3707468 0.4683249
## sample estimates:
## mean in group Attrited Customer mean in group Existing Customer
## 2.693301 2.273765
# Welch two-sample t-test: does the mean of Avg_Utilization_Ratio differ
# between attrited and existing customers?
t.test(
  formula = Avg_Utilization_Ratio ~ Attrition_Flag,
  data = bank_data
)
##
## Welch Two Sample t-test
##
## data: Avg_Utilization_Ratio by Attrition_Flag
## t = -18.623, df = 2336, p-value < 2.2e-16
## alternative hypothesis: true difference in means between group Attrited Customer and group Existing Customer is not equal to 0
## 95 percent confidence interval:
## -0.1480402 -0.1198331
## sample estimates:
## mean in group Attrited Customer mean in group Existing Customer
## 0.1624751 0.2964118