library(dplyr)
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
library(ggplot2)
library(readr)
# Load the bank-churn data set from the working directory into a tibble.
bank_data <- read_csv(file = "BankChurners.csv")
## Rows: 10127 Columns: 23
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr  (6): Attrition_Flag, Gender, Education_Level, Marital_Status, Income_Ca...
## dbl (17): CLIENTNUM, Customer_Age, Dependent_count, Months_on_book, Total_Re...
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.

Find two numeric variables that are highly correlated by checking the correlation coefficient. Then create a graph to illustrate that.

# Correlation (cor() default: Pearson) between Credit_Limit and
# Avg_Open_To_Buy, returned as a one-row tibble with the column cor_CL_AOB.
summarise(
  bank_data,
  cor_CL_AOB = cor(x = Credit_Limit, y = Avg_Open_To_Buy)
)
## # A tibble: 1 × 1
##   cor_CL_AOB
##        <dbl>
## 1      0.996
# Scatter plot of Avg_Open_To_Buy against Credit_Limit with a fitted straight
# line (method = "lm", no confidence band) drawn in red on top of the points.
bank_data %>%
  ggplot(mapping = aes(x = Credit_Limit, y = Avg_Open_To_Buy)) +
  geom_point(alpha = 0.3) +
  geom_smooth(method = "lm", colour = "red", se = FALSE) +
  ggtitle(
    label = "Avg_Open_To_Buy vs Credit_Limit",
    subtitle = "Strong positive linear relationship (|r| ≈ 0.996)"
  ) +
  xlab("Credit Limit") +
  ylab("Average Open To Buy")
## `geom_smooth()` using formula = 'y ~ x'

Find two categorical variables (other than Attrition_Flag) that are strongly dependent on each other. Then create a graph to illustrate that.

# Cross-tabulate Gender against Income_Category, then run Pearson's
# chi-squared test of independence on the resulting contingency table.
tab_G_I <- table(bank_data[["Gender"]], bank_data[["Income_Category"]])
chisq.test(x = tab_G_I)
## 
##  Pearson's Chi-squared test
## 
## data:  tab_G_I
## X-squared = 7138.4, df = 5, p-value < 2.2e-16
# 100% stacked bar chart: for each Gender, the share of customers in each
# Income_Category. position = "fill" rescales every bar to a total height of 1,
# so the y axis is a within-gender proportion rather than a count.
ggplot(bank_data,
       aes(x = Gender, fill = Income_Category)) +
  geom_bar(position = "fill") +
  # label_percent() is the current replacement for the superseded
  # scales::percent helper; it formats the 0-1 axis breaks as percentages.
  scale_y_continuous(labels = scales::label_percent()) +
  labs(title = "Income category distribution by gender",
       y = "Proportion",
       # Give the legend a readable title instead of the raw column name.
       fill = "Income category")

Find at least 4 variables that have non-negligible correlation or dependence with Attrition_Flag. Show how you find them.

# Welch two-sample t-test: does mean Total_Trans_Ct differ between the two
# Attrition_Flag groups? Defaults are spelled out for the reader.
t.test(
  Total_Trans_Ct ~ Attrition_Flag,
  data = bank_data,
  alternative = "two.sided",
  var.equal = FALSE,
  conf.level = 0.95
)
## 
##  Welch Two Sample t-test
## 
## data:  Total_Trans_Ct by Attrition_Flag
## t = -54.142, df = 3386.1, p-value < 2.2e-16
## alternative hypothesis: true difference in means between group Attrited Customer and group Existing Customer is not equal to 0
## 95 percent confidence interval:
##  -24.59864 -22.87930
## sample estimates:
## mean in group Attrited Customer mean in group Existing Customer 
##                        44.93362                        68.67259
# Welch two-sample t-test: does mean Total_Trans_Amt differ between the two
# Attrition_Flag groups? Defaults are spelled out for the reader.
t.test(
  Total_Trans_Amt ~ Attrition_Flag,
  data = bank_data,
  alternative = "two.sided",
  var.equal = FALSE,
  conf.level = 0.95
)
## 
##  Welch Two Sample t-test
## 
## data:  Total_Trans_Amt by Attrition_Flag
## t = -22.686, df = 3264.5, p-value < 2.2e-16
## alternative hypothesis: true difference in means between group Attrited Customer and group Existing Customer is not equal to 0
## 95 percent confidence interval:
##  -1694.425 -1424.835
## sample estimates:
## mean in group Attrited Customer mean in group Existing Customer 
##                        3095.026                        4654.656
# Welch two-sample t-test: does mean Months_Inactive_12_mon differ between the
# two Attrition_Flag groups? Defaults are spelled out for the reader.
t.test(
  Months_Inactive_12_mon ~ Attrition_Flag,
  data = bank_data,
  alternative = "two.sided",
  var.equal = FALSE,
  conf.level = 0.95
)
## 
##  Welch Two Sample t-test
## 
## data:  Months_Inactive_12_mon by Attrition_Flag
## t = 16.862, df = 2489.8, p-value < 2.2e-16
## alternative hypothesis: true difference in means between group Attrited Customer and group Existing Customer is not equal to 0
## 95 percent confidence interval:
##  0.3707468 0.4683249
## sample estimates:
## mean in group Attrited Customer mean in group Existing Customer 
##                        2.693301                        2.273765
# Welch two-sample t-test: does mean Avg_Utilization_Ratio differ between the
# two Attrition_Flag groups? Defaults are spelled out for the reader.
t.test(
  Avg_Utilization_Ratio ~ Attrition_Flag,
  data = bank_data,
  alternative = "two.sided",
  var.equal = FALSE,
  conf.level = 0.95
)
## 
##  Welch Two Sample t-test
## 
## data:  Avg_Utilization_Ratio by Attrition_Flag
## t = -18.623, df = 2336, p-value < 2.2e-16
## alternative hypothesis: true difference in means between group Attrited Customer and group Existing Customer is not equal to 0
## 95 percent confidence interval:
##  -0.1480402 -0.1198331
## sample estimates:
## mean in group Attrited Customer mean in group Existing Customer 
##                       0.1624751                       0.2964118