Find two numeric variables that are highly correlated by checking the correlation coefficient. Then create a graph to illustrate that:

library(tidyverse)
## Warning: package 'ggplot2' was built under R version 4.4.3
## Warning: package 'tidyr' was built under R version 4.4.3
## Warning: package 'purrr' was built under R version 4.4.3
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr     1.1.4     ✔ readr     2.1.5
## ✔ forcats   1.0.1     ✔ stringr   1.6.0
## ✔ ggplot2   4.0.2     ✔ tibble    3.2.1
## ✔ lubridate 1.9.4     ✔ tidyr     1.3.2
## ✔ purrr     1.2.1     
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(cowplot)
## 
## Attaching package: 'cowplot'
## 
## The following object is masked from 'package:lubridate':
## 
##     stamp
# Load the bank-churn data set.
# NOTE(review): this is an absolute, machine-specific path, so the script only
# runs where this exact file exists -- consider a project-relative path.
df <- read.csv("/Users/HoangDucVinh/Downloads/BankChurners.csv")

# Keep only the numeric columns; cor() requires numeric input.
numeric_df <- df %>% select(where(is.numeric))

# Pairwise Pearson correlations. Reuses numeric_df instead of re-selecting the
# same columns from df. Pairwise deletion keeps a missing value in one column
# from turning that column's whole row/column of the matrix into NA.
cor_matrix <- cor(numeric_df, use = "pairwise.complete.obs")

# Long format: one row per (Var1, Var2) pair, which is what geom_tile() needs.
cor_tidy <- as.data.frame(cor_matrix) %>%
  rownames_to_column(var = "Var1") %>%
  pivot_longer(cols = -Var1, names_to = "Var2", values_to = "Correlation")

# Rank the distinct off-diagonal pairs by |r| so the most strongly correlated
# variables can be read off directly. `Var1 < Var2` drops the diagonal
# (always 1) and keeps each unordered pair only once.
cor_tidy %>%
  filter(Var1 < Var2) %>%
  arrange(desc(abs(Correlation))) %>%
  slice_head(n = 5)

# Heatmap of the full correlation matrix on a fixed [-1, 1] colour scale.
# `limits` is spelled out in full rather than relying on partial matching.
ggplot(cor_tidy, aes(Var1, Var2, fill = Correlation)) +
  geom_tile() +
  scale_fill_gradient2(
    low = "steelblue",
    mid = "darkgreen",
    high = "coral",
    limits = c(-1, 1)
  ) +
  theme(axis.text.x = element_text(angle = 45, vjust = 1, hjust = 1))


Find two categorical variables (other than Attrition_Flag) that are strongly dependent on each other. Then create a graph to illustrate that:

# Pearson chi-squared test of independence between Gender and Income_Category;
# a small p-value is evidence that the two variables are not independent.
chisq.test(x = df$Gender, y = df$Income_Category)
## 
##  Pearson's Chi-squared test
## 
## data:  df$Gender and df$Income_Category
## X-squared = 7138.4, df = 5, p-value < 2.2e-16
# Proportional (position = "fill") bars: within each income category, the share
# of rows belonging to each gender. coord_flip() draws the bars horizontally.
ggplot(data = df) +
  geom_bar(
    mapping = aes(x = Income_Category, fill = Gender),
    position = "fill"
  ) +
  coord_flip() +
  labs(
    x = "Income Category",
    y = "Proportion",
    title = "Dependence: Income Category by Gender"
  )


Find at least 4 variables that have non-negligible correlation or dependence with Attrition_Flag. Show how you find them:

# Build one box plot of a numeric column, split and coloured by Attrition_Flag.
#   data:  data frame containing Attrition_Flag and the column named by y_col
#   y_col: name of the column to draw on the y axis, given as a string
attrition_boxplot <- function(data, y_col) {
  ggplot(
    data,
    aes(x = Attrition_Flag, y = .data[[y_col]], fill = Attrition_Flag)
  ) +
    geom_boxplot() +
    labs(y = y_col)  # keep the plain column name as the axis title
}

p1 <- attrition_boxplot(df, "Total_Trans_Ct")
p2 <- attrition_boxplot(df, "Total_Ct_Chng_Q4_Q1")
p3 <- attrition_boxplot(df, "Total_Revolving_Bal")
p4 <- attrition_boxplot(df, "Total_Relationship_Count")

# Arrange the four panels in a two-column grid for side-by-side comparison.
plot_grid(p1, p2, p3, p4, ncol = 2)