Part I: Lab Exercise



Lab 1:

1. Check whether Marital_Status or Dependent_count is correlated with Attrition_Flag or not with a graph and a statistical test.


Code:

glimpse(bank_data)
## Rows: 10,127
## Columns: 23
## $ CLIENTNUM                                                                                                                          <dbl> …
## $ Attrition_Flag                                                                                                                     <chr> …
## $ Customer_Age                                                                                                                       <dbl> …
## $ Gender                                                                                                                             <chr> …
## $ Dependent_count                                                                                                                    <dbl> …
## $ Education_Level                                                                                                                    <chr> …
## $ Marital_Status                                                                                                                     <chr> …
## $ Income_Category                                                                                                                    <chr> …
## $ Card_Category                                                                                                                      <chr> …
## $ Months_on_book                                                                                                                     <dbl> …
## $ Total_Relationship_Count                                                                                                           <dbl> …
## $ Months_Inactive_12_mon                                                                                                             <dbl> …
## $ Contacts_Count_12_mon                                                                                                              <dbl> …
## $ Credit_Limit                                                                                                                       <dbl> …
## $ Total_Revolving_Bal                                                                                                                <dbl> …
## $ Avg_Open_To_Buy                                                                                                                    <dbl> …
## $ Total_Amt_Chng_Q4_Q1                                                                                                               <dbl> …
## $ Total_Trans_Amt                                                                                                                    <dbl> …
## $ Total_Trans_Ct                                                                                                                     <dbl> …
## $ Total_Ct_Chng_Q4_Q1                                                                                                                <dbl> …
## $ Avg_Utilization_Ratio                                                                                                              <dbl> …
## $ Naive_Bayes_Classifier_Attrition_Flag_Card_Category_Contacts_Count_12_mon_Dependent_count_Education_Level_Months_Inactive_12_mon_1 <dbl> …
## $ Naive_Bayes_Classifier_Attrition_Flag_Card_Category_Contacts_Count_12_mon_Dependent_count_Education_Level_Months_Inactive_12_mon_2 <dbl> …
ggplot(bank_data) +
  geom_bar(aes(x = Marital_Status, fill = Attrition_Flag), 
           alpha = 0.7, position = "fill") +
  labs(title = "Marital Status vs Attrition",
       x = "Marital status",
       y = "Proportion") +
  theme(plot.title = element_text(hjust = 0.5, size = rel(2), 
                                  color = "tomato4", margin = margin(10,15,10,10)),
        axis.title = element_text(hjust = 0.5, size = rel(1.6), 
                                  color = "tomato3"),
        axis.title.x = element_text(margin = margin(15,10,10,10)),
        axis.title.y = element_text(margin = margin(10,10,15,10)),
        axis.text = element_text(size = rel(1.1)))

ggplot(bank_data) +
  geom_bar(aes(x = as.factor(Dependent_count), fill = Attrition_Flag), 
           alpha = 0.7, position = "fill") +
  labs(title = "Dependents vs Attrition",
       x = "Quantity of dependent",
       y = "Proportion") +
  theme(plot.title = element_text(hjust = 0.5, size = rel(2), 
                                  color = "tomato4", margin = margin(10,15,10,10)),
        axis.title = element_text(hjust = 0.5, size = rel(1.6), 
                                  color = "tomato3"),
        axis.title.x = element_text(margin = margin(15,10,10,10)),
        axis.title.y = element_text(margin = margin(10,10,15,10)),
        axis.text = element_text(size = rel(1.1)))

chisq.test(table(bank_data$Attrition_Flag, bank_data$Marital_Status))
## 
##  Pearson's Chi-squared test
## 
## data:  table(bank_data$Attrition_Flag, bank_data$Marital_Status)
## X-squared = 6.0561, df = 3, p-value = 0.1089
chisq.test(table(bank_data$Attrition_Flag, bank_data$Dependent_count))
## 
##  Pearson's Chi-squared test
## 
## data:  table(bank_data$Attrition_Flag, bank_data$Dependent_count)
## X-squared = 9.4764, df = 5, p-value = 0.0915


2. Are Marital_Status and Dependent_count correlated with each other? Can you explain the result?


Code:

ggplot(bank_data) +
  geom_bar(aes(x = as.factor(Dependent_count), fill = Marital_Status), 
           alpha = 0.7, position = "fill") +
  labs(title = "Relation Between Marital Status and Dependents",
       x = "Counts of dependents",
       y = "Proportion") +
  theme(plot.title = element_text(hjust = 0.5, size = rel(1.7), 
                                  color = "skyblue4", margin = margin(10,15,10,10)),
        axis.title = element_text(hjust = 0.5, size = rel(1.4), 
                                  color = "skyblue3"),
        axis.title.x = element_text(margin = margin(15,10,10,10)),
        axis.title.y = element_text(margin = margin(10,10,15,10)))

chisq.test(table(bank_data$Marital_Status, bank_data$Dependent_count))
## 
##  Pearson's Chi-squared test
## 
## data:  table(bank_data$Marital_Status, bank_data$Dependent_count)
## X-squared = 56.253, df = 15, p-value = 1.098e-06

Answer: The chi-square test results suggest a significant association between marital status and dependent count. Since the p-value is less than 0.05, we reject the null hypothesis, indicating that marital status and dependent count are not independent.

In my opinion, married individuals are more likely to have children as their dependents, while single individuals are less likely to have as many dependents as those who are married or divorced.




Lab 2: Use your common sense to find two numeric variables in the bank customer data that are closely correlated with each other (correlation coefficient larger than 0.5 or less than -0.5). Verify your speculation.


Code:

ggplot(bank_data, aes(x = Total_Trans_Amt, y = Total_Trans_Ct)) +
  geom_point(color = "pink3", fill = "pink2", shape = 21) +
  geom_smooth(color = "palegreen4", linetype = "dashed", linewidth = 1.5) +
  labs(title = "Total Transaction Amount vs Count",
       x = "Total transaction amount",
       y = "Total transaction count") +
  theme(plot.title = element_text(hjust = 0.5, size = rel(1.8), 
                                  color = "deeppink4", margin = margin(10,15,10,10)),
        axis.title = element_text(hjust = 0.5, size = rel(1.5), 
                                  color = "deeppink3"),
        axis.title.x = element_text(margin = margin(15,10,10,10)),
        axis.title.y = element_text(margin = margin(10,10,15,10)))

cor(bank_data$Total_Trans_Amt, bank_data$Total_Trans_Ct)
## [1] 0.807192




Lab 3: Analyze whether the variable Avg_Unilization_Ratio is correlated with Attrition_Flag or not, by using a graph and a statistical test. Explain your result.


Code:

ggplot(bank_data) +
  geom_density(aes(x = Avg_Utilization_Ratio, fill = Attrition_Flag), alpha = 0.5) +
  labs(title = "Average Utilization Ratio and Attrition",
       x = "Mean of use ratio",
       y = "Denity") +
  theme(plot.title = element_text(hjust = 0.5, size = rel(1.8), 
                                  color = "darkolivegreen", margin = margin(10,15,10,10)),
        axis.title = element_text(hjust = 0.5, size = rel(1.5), 
                                  color = "olivedrab"),
        axis.title.x = element_text(margin = margin(15,10,10,10)),
        axis.title.y = element_text(margin = margin(10,10,15,10)),
        axis.text = element_text(size = rel(1.1)))

data1 <- bank_data$Avg_Utilization_Ratio[bank_data$Attrition_Flag != "Existing Customer"]
data2 <- bank_data$Avg_Utilization_Ratio[bank_data$Attrition_Flag == "Existing Customer"]
t.test(data1, data2)
## 
##  Welch Two Sample t-test
## 
## data:  data1 and data2
## t = -18.623, df = 2336, p-value < 2.2e-16
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
##  -0.1480402 -0.1198331
## sample estimates:
## mean of x mean of y 
## 0.1624751 0.2964118

Answer: The density plot shows that attrited customers (red) have a higher density at very low utilization ratios compared to existing customers (blue), suggesting that lower utilization may be associated with customer attrition.

The t-test results (t = -18.623, p-value < 2.2e-16) indicate that the null hypothesis is rejected since the p-value is lower than 0.05. This suggests that the two variables are not independent. Furthermore, the mean utilization ratio for attrited customers is significantly lower than that of existing customers.

What I think: customers who maintain high utilization ratio may depend more on their credit cards, making them less likely to leave, whereas those who use their credit cards sparingly might not rely on the service and are more likely to close their accounts.




Lab 4: Cut Total_Trans_Ct into a few reasonable categories and study its effect on Attrition_Flag.


Code:

table(bank_data$Total_Trans_Ct)
## 
##  10  11  12  13  14  15  16  17  18  19  20  21  22  23  24  25  26  27  28  29 
##   4   2   4   5   9  16  13  13  23  11  19  33  35  34  50  57  56  82  73  75 
##  30  31  32  33  34  35  36  37  38  39  40  41  42  43  44  45  46  47  48  49 
##  84 100 104 116 107 136 135 141 139 126 136 138 132 147 127 129 100 110  98 118 
##  50  51  52  53  54  55  56  57  58  59  60  61  62  63  64  65  66  67  68  69 
##  91  92  64  85  89  78 106  94 103  97 111 118 134 150 158 166 164 186 170 202 
##  70  71  72  73  74  75  76  77  78  79  80  81  82  83  84  85  86  87  88  89 
## 193 203 168 183 190 203 198 197 190 184 173 208 202 169 147 148 133 137 114  93 
##  90  91  92  93  94  95  96  97  98  99 100 101 102 103 104 105 106 107 108 109 
##  83  62  66  55  51  40  44  42  31  38  38  25  30  31  31  32  31  14  21  22 
## 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 
##  25  22  24  23  23  25  32  21  22  16  31  22  18  15  28  12  10  12  10   6 
## 130 131 132 134 138 139 
##   5   6   1   1   1   1
bank_data <- mutate(bank_data, trans_ct_group = cut
                    (Total_Trans_Ct, breaks = c(10, 30, 50, 70 ,90, 110, 130, Inf)))

ggplot(bank_data) +
  geom_bar(aes(x = trans_ct_group, fill = Attrition_Flag), 
           alpha = 0.7, position = "fill") +
  labs(title = "Total Transaction Count & Attirtion",
       x = "Groups of transaction count",
       y = "Proportion") +
  theme(plot.title = element_text(hjust = 0.5, size = rel(1.8), 
                                  color = "navyblue", margin = margin(10,15,10,10)),
        axis.title = element_text(hjust = 0.5, size = rel(1.5), 
                                  color = "dodgerblue3"),
        axis.title.x = element_text(margin = margin(15,10,10,10)),
        axis.title.y = element_text(margin = margin(10,10,15,10)))




Lab 5: Analyze the composite effect of Education_Level and Marital_Status on Attrition_Flag.


Code:

bank_data %>%
  group_by(Education_Level, Marital_Status) %>%
  summarise(Attrition_ratio = prop.table(table(Attrition_Flag))[1]) %>%
  arrange(Attrition_ratio)
## # A tibble: 28 × 3
## # Groups:   Education_Level [7]
##    Education_Level Marital_Status Attrition_ratio
##    <chr>           <chr>                    <dbl>
##  1 College         Unknown                  0.122
##  2 College         Divorced                 0.128
##  3 Unknown         Unknown                  0.132
##  4 Uneducated      Married                  0.142
##  5 Graduate        Married                  0.144
##  6 High School     Married                  0.144
##  7 Uneducated      Divorced                 0.147
##  8 High School     Single                   0.151
##  9 College         Married                  0.152
## 10 Post-Graduate   Married                  0.152
## # ℹ 18 more rows
bank_edu <- factor(bank_data$Education_Level, 
                   levels = unique(bank_data$Education_Level)
                   [c("College", "Doctorate", "Graduate", "High School", 
                      "Post-Graduate", "Uneducated", "Unknown")])

ggplot(bank_data) +
  geom_bar(aes(y = Education_Level, fill = Marital_Status), 
           alpha = 0.7, position = "fill") +
  facet_wrap(~Attrition_Flag, nrow = 2) +
  labs(title = "Edu Levels, Marital Status, & Attrition",
       x = "Proportion",
       y = "Levels of Education") +
  theme(plot.title = element_text(hjust = 0.5, size = rel(1.8), 
                                  color = "purple4", margin = margin(10,15,10,10)),
        axis.title = element_text(hjust = 0.5, size = rel(1.5), 
                                  color = "mediumpurple3"),
        axis.title.x = element_text(margin = margin(15,10,10,10)),
        axis.title.y = element_text(margin = margin(10,10,15,10)))

Answer: According to the graph, married individuals make up the largest proportion across all education levels, followed by single and divorced individuals. While marital status may influence attrition, education level does not show a strong correlation with customer retention. The distribution remains relatively stable across both groups, suggesting that other factors might play a more significant role in customer attrition. Further statistical analysis would be needed to confirm these observations.


Part II: More Exercise



1. Find two numeric variables that are highly correlated by checking the correlation coefficient. Then create a graph to illustrate that.


Code:

cor(bank_data$Total_Revolving_Bal, bank_data$Avg_Utilization_Ratio)
## [1] 0.624022
ggplot(bank_data, aes(x = Total_Revolving_Bal, y = Avg_Utilization_Ratio)) +
  geom_point(color = "palevioletred3", fill = "palevioletred1", shape = 21) + 
  geom_smooth(color = "forestgreen", linetype = "dashed", linewidth = 1.5) +
  labs(title = "Revolving Balance vs Utilization Ratio",
       x = "Total revolving balance",
       y = "Mean utilzation ratio") +
  theme(plot.title = element_text(hjust = 0.5, size = rel(2), color = "darkgoldenrod4", 
                                  margin = margin(10,15,10,10)),
        axis.title = element_text(hjust = 0.5, size = rel(1.6), color = "darkgoldenrod3"),
        axis.title.x = element_text(margin = margin(15,10,10,10)),
        axis.title.y = element_text(margin = margin(10,10,15,10)),
        axis.text = element_text(size = rel(1.1)))


2. Find two categorical variables (other than Attrition_Flag) that are strongly dependent of each other. Then create a graph to illustrate that.


Code:

chisq.test(bank_data$Dependent_count, bank_data$Marital_Status)
## 
##  Pearson's Chi-squared test
## 
## data:  bank_data$Dependent_count and bank_data$Marital_Status
## X-squared = 56.253, df = 15, p-value = 1.098e-06
ggplot(bank_data) +
  geom_bar(aes(x = Dependent_count, fill = Marital_Status), 
           alpha = 0.7, position = "fill") +
  labs(title = "Relation Between Marital Status & Dependent",
       x = "Number of dependent",
       y = "Proportion") +
  scale_x_continuous(breaks = seq(0, 5, 1)) +
  theme(plot.title = element_text(hjust = 0.5, size = rel(1.8), color = "firebrick4",
                                  margin = margin(10,15,10,10)),
        axis.title = element_text(hjust = 0.5, size = rel(1.6), color = "firebrick"),
        axis.title.x = element_text(margin = margin(15,10,10,10)),
        axis.title.y = element_text(margin = margin(10,10,15,10)),
        axis.text = element_text(size = rel(1.1)))


3. Find at least 4 variables that have non-negligible correlation or dependence with Attrition_Flag. Show how you find them.


Code:

att_rev1 <- bank_data$Avg_Utilization_Ratio[bank_data$Attrition_Flag != "Existing Customer"]
att_rev2 <- bank_data$Avg_Utilization_Ratio[bank_data$Attrition_Flag == "Existing Customer"]
t.test(att_rev1, att_rev2)
## 
##  Welch Two Sample t-test
## 
## data:  att_rev1 and att_rev2
## t = -18.623, df = 2336, p-value < 2.2e-16
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
##  -0.1480402 -0.1198331
## sample estimates:
## mean of x mean of y 
## 0.1624751 0.2964118
ggplot(bank_data, aes(x = Months_Inactive_12_mon, fill = Attrition_Flag)) +
  geom_bar(alpha = 0.7, position = "fill") +
  scale_x_continuous(breaks = seq(0, 6, 1)) +
  labs(title = "Relation Between Inactive Month & Attrition",
       x = "Inactive months",
       y = "Proportion") +
  scale_x_continuous(breaks = seq(0, 5, 1)) +
  theme(plot.title = element_text(hjust = 0.5, size = rel(1.8), color = "cornflowerblue",
                                  margin = margin(10,15,10,10)),
        axis.title = element_text(hjust = 0.5, size = rel(1.6), color = "orange2"),
        axis.title.x = element_text(margin = margin(15,10,10,10)),
        axis.title.y = element_text(margin = margin(10,10,15,10)),
        axis.text = element_text(size = rel(1.1)))

chisq.test(bank_data$Months_Inactive_12_mon, bank_data$Attrition_Flag) 
## 
##  Pearson's Chi-squared test
## 
## data:  bank_data$Months_Inactive_12_mon and bank_data$Attrition_Flag
## X-squared = 396.46, df = 6, p-value < 2.2e-16
ggplot(bank_data, 
       aes(x = Total_Amt_Chng_Q4_Q1, y = Total_Ct_Chng_Q4_Q1, color = Attrition_Flag)) +
  geom_point(alpha = 0.3) +
  geom_smooth() +
  labs(title = "Total Transaction Amount & Count Ratio vs Attrition",
       x = "Total amount in 4th:1st quarter",
       y = "Total amount in 4th:1st quarter") +
  theme(plot.title = element_text(hjust = 0.5, size = rel(1.7), color = "sienna3",
                                  margin = margin(10,15,10,10)),
        axis.title = element_text(hjust = 0.5, size = rel(1.4), color = "olivedrab4"),
        axis.title.x = element_text(margin = margin(15,10,10,10)),
        axis.title.y = element_text(margin = margin(10,10,15,10)),
        axis.text = element_text(size = rel(1.1)))

Answer:
1. Avg_Utilization_Ratio vs Attrition_Flag
2. Months_Inactive_12_mon vs Attrition_Flag
3. Total_Amt_Chng_Q4_Q1, Total_Ct_Chng_Q4_Q1 vs Attrition_Flag