R Implementation

# Load required libraries
library(tidyverse)      # For data manipulation and visualization

## Warning: package 'ggplot2' was built under R version 4.4.3

## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr     1.1.4     ✔ readr     2.1.5
## ✔ forcats   1.0.0     ✔ stringr   1.5.1
## ✔ ggplot2   3.5.2     ✔ tibble    3.2.1
## ✔ lubridate 1.9.4     ✔ tidyr     1.3.1
## ✔ purrr     1.0.2     
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors

library(caret)          # For machine learning workflows

## Loading required package: lattice
## 
## Attaching package: 'caret'
## 
## The following object is masked from 'package:purrr':
## 
##     lift

library(corrplot)       # For correlation visualization

## corrplot 0.95 loaded

library(randomForest)   # For Random Forest model

## randomForest 4.7-1.2
## Type rfNews() to see new features/changes/bug fixes.
## 
## Attaching package: 'randomForest'
## 
## The following object is masked from 'package:dplyr':
## 
##     combine
## 
## The following object is masked from 'package:ggplot2':
## 
##     margin

library(xgboost)        # For XGBoost model

## Warning: package 'xgboost' was built under R version 4.4.3

## 
## Attaching package: 'xgboost'
## 
## The following object is masked from 'package:dplyr':
## 
##     slice

library(pROC)           # For ROC curve analysis

## Type 'citation("pROC")' for a citation.
## 
## Attaching package: 'pROC'
## 
## The following objects are masked from 'package:stats':
## 
##     cov, smooth, var

library(e1071)          # For other ML algorithms
library(ggplot2)        # For advanced visualizations
library(dplyr)
library(dplyr)
library(corrplot)
library(pander)

## Warning: package 'pander' was built under R version 4.4.3

library(car)

## Loading required package: carData
## 
## Attaching package: 'car'
## 
## The following object is masked from 'package:dplyr':
## 
##     recode
## 
## The following object is masked from 'package:purrr':
## 
##     some

# Set seed for reproducibility
set.seed(123)

# 1. DATA PREPARATION ---------------------------------------------------------

# Load the dataset (it's in CSV format)

churn <- read.csv("/Users/doris/OneDrive/Documents/STA6013 Project/Bank Customer Churn Prediction.csv")

#Select n=300 subset with relatively balanced categorical variables

#Set seed for reproducibility
set.seed(999)

#Filter by "country" then and sample 100 from each
france_sample  <- churn[churn$country == "France",  ][sample(sum(churn$country == "France"),  100), ]
germany_sample <- churn[churn$country == "Germany", ][sample(sum(churn$country == "Germany"), 100), ]
spain_sample   <- churn[churn$country == "Spain",   ][sample(sum(churn$country == "Spain"),   100), ]

#Combine the subsets
churnsub <- rbind(france_sample, germany_sample, spain_sample)

#Frequency of each outcome for 'country'
country_freq <- table(churnsub$country)
country_df <- data.frame(Country = names(country_freq), Count = as.vector(country_freq))
pander(country_df, caption = "Frequency Table for Country")

Frequency Table for Country
Country	Count
France	100
Germany	100
Spain	100

#Frequency of each outcome for 'gender'
gender_freq <- table(churnsub$gender)
gender_df <- data.frame(Gender = names(gender_freq), Count = as.vector(gender_freq))
pander(gender_df, caption = "Frequency Table for Gender")

Frequency Table for Gender
Gender	Count
Female	148
Male	152

# 2. DATA CLEANING AND PREPARATION
# Check for missing values
colSums(is.na(churnsub))

##      customer_id     credit_score          country           gender 
##                0                0                0                0 
##              age           tenure          balance  products_number 
##                0                0                0                0 
##      credit_card    active_member estimated_salary            churn 
##                0                0                0                0

# Check for duplicate rows
duplicates <- churnsub[duplicated(churnsub), ]
cat("Number of duplicate rows: ", nrow(duplicates), "\n")

## Number of duplicate rows:  0

# Our dataset has no missing values and no duplicates, thus its clean

# Check data structure
str(churnsub)

## 'data.frame':    300 obs. of  12 variables:
##  $ customer_id     : int  15723720 15582841 15588219 15780770 15809999 15610165 15640491 15580227 15594450 15586959 ...
##  $ credit_score    : int  591 600 850 445 709 761 464 803 695 468 ...
##  $ country         : chr  "France" "France" "France" "France" ...
##  $ gender          : chr  "Female" "Male" "Female" "Male" ...
##  $ age             : int  31 29 38 31 41 26 33 33 49 42 ...
##  $ tenure          : int  7 8 1 7 3 1 10 6 9 5 ...
##  $ balance         : num  0 0 106872 145057 150301 ...
##  $ products_number : int  2 2 2 1 2 2 2 2 1 2 ...
##  $ credit_card     : int  0 0 1 1 1 1 1 1 1 1 ...
##  $ active_member   : int  1 1 0 1 0 1 0 0 0 0 ...
##  $ estimated_salary: num  48778 34747 29333 175894 71673 ...
##  $ churn           : int  0 0 0 0 0 0 0 0 0 0 ...

summary(churnsub)

##   customer_id        credit_score     country             gender         
##  Min.   :15565706   Min.   :386.0   Length:300         Length:300        
##  1st Qu.:15626314   1st Qu.:581.8   Class :character   Class :character  
##  Median :15684530   Median :651.5   Mode  :character   Mode  :character  
##  Mean   :15687437   Mean   :652.1                                        
##  3rd Qu.:15750989   3rd Qu.:717.0                                        
##  Max.   :15815070   Max.   :850.0                                        
##       age            tenure          balance       products_number
##  Min.   :19.00   Min.   : 0.000   Min.   :     0   Min.   :1.000  
##  1st Qu.:32.00   1st Qu.: 3.000   1st Qu.:     0   1st Qu.:1.000  
##  Median :37.00   Median : 5.000   Median : 99965   Median :1.000  
##  Mean   :38.45   Mean   : 4.927   Mean   : 81915   Mean   :1.503  
##  3rd Qu.:43.00   3rd Qu.: 7.000   3rd Qu.:130175   3rd Qu.:2.000  
##  Max.   :77.00   Max.   :10.000   Max.   :192391   Max.   :4.000  
##   credit_card     active_member    estimated_salary       churn     
##  Min.   :0.0000   Min.   :0.0000   Min.   :   417.4   Min.   :0.00  
##  1st Qu.:0.0000   1st Qu.:0.0000   1st Qu.: 52671.0   1st Qu.:0.00  
##  Median :1.0000   Median :1.0000   Median : 95604.2   Median :0.00  
##  Mean   :0.6967   Mean   :0.5133   Mean   : 98968.3   Mean   :0.21  
##  3rd Qu.:1.0000   3rd Qu.:1.0000   3rd Qu.:144791.1   3rd Qu.:0.00  
##  Max.   :1.0000   Max.   :1.0000   Max.   :199409.2   Max.   :1.00

# Our random sampled dataset consist of 300 observations and 12 variables 
# From above its clear that some variables are misclassified, this needs to be 
# corrected to ensure smooth analysis, we shall not use customer id since  
# this column is likely just an identifier and doesn't provide any actionable insights # and thus will not be useful for our analysis
#our summarise data provides more details about our data, like limits for outliers
# factor variable, just more detailed information

# Check levels
levels(as.factor(churnsub$country))

## [1] "France"  "Germany" "Spain"

levels(as.factor(churnsub$gender))

## [1] "Female" "Male"

levels(as.factor(churnsub$active_member))

## [1] "0" "1"

levels(as.factor(churnsub$credit_card))

## [1] "0" "1"

levels(as.factor(churnsub$churn))

## [1] "0" "1"

# Convert appropriate variables to factors
churnsub$country <- as.factor(churnsub$country)
churnsub$gender <- as.factor(churnsub$gender)
churnsub$credit_card <- as.factor(churnsub$credit_card)
churnsub$active_member <- as.factor(churnsub$active_member)
churnsub$churn <- as.factor(churnsub$churn)

# Remove the customer_id column
churnsub <- churnsub %>% 
 dplyr::select(-customer_id)


# See the change
str(churnsub)

## 'data.frame':    300 obs. of  11 variables:
##  $ credit_score    : int  591 600 850 445 709 761 464 803 695 468 ...
##  $ country         : Factor w/ 3 levels "France","Germany",..: 1 1 1 1 1 1 1 1 1 1 ...
##  $ gender          : Factor w/ 2 levels "Female","Male": 1 2 1 2 1 1 1 2 2 1 ...
##  $ age             : int  31 29 38 31 41 26 33 33 49 42 ...
##  $ tenure          : int  7 8 1 7 3 1 10 6 9 5 ...
##  $ balance         : num  0 0 106872 145057 150301 ...
##  $ products_number : int  2 2 2 1 2 2 2 2 1 2 ...
##  $ credit_card     : Factor w/ 2 levels "0","1": 1 1 2 2 2 2 2 2 2 2 ...
##  $ active_member   : Factor w/ 2 levels "0","1": 2 2 1 2 1 2 1 1 1 1 ...
##  $ estimated_salary: num  48778 34747 29333 175894 71673 ...
##  $ churn           : Factor w/ 2 levels "0","1": 1 1 1 1 1 1 1 1 1 1 ...

# All our variables are now correctly classified.

# Converting the Boolean values to the desired values
churnsub <- churnsub %>% 
  mutate(credit_card = fct_recode(credit_card, "Credit Card" = "1", "No-Credit Card" = "0"),
         active_member = fct_recode(active_member, "Active" = "1", "Inactive" = "0"),
         churn = fct_recode(churn, "Churned" = "1", "Not Churned" = "0"))

# View the first few rows of the updated dataset
head(churnsub)

# 3. EXPLORATORY DATA ANALYSIS ------------------------------------------------
# Distribution of the target variable
churn_distribution <- table(churnsub$churn)
print(churn_distribution)

## 
## Not Churned     Churned 
##         237          63

barplot(churn_distribution, 
        main="Distribution of Customer Churn", 
        xlab="Churn Status", 
        ylab="Count",
        col=c("green", "red"))

# Based from this bar plot we can see that many customers are retained and few are
# exited, our goal is to find out what factors influenced this behavior and fine
# possible solutions

# Calculate the percentage of churned vs. non-churned customers per country
churn_counts_percentage <- churnsub %>%
  group_by(country, churn) %>%
  summarise(customer_count = n(), .groups = "drop") %>%
  group_by(country) %>%
  mutate(percentage = customer_count / sum(customer_count) * 100) %>%
  mutate(churn_label = ifelse(churn == "Churned", "Churned", "Not Churned"))

# Create the side-by-side bar plot with percentages
ggplot(churn_counts_percentage, aes(x = country, y = percentage, fill = churn_label)) +
  geom_bar(stat = "identity", position = "dodge") + 
  geom_text(aes(label = sprintf("%.1f%%", percentage)), vjust = -0.5, position = position_dodge(0.9)) +  
  labs(title = "Percentage of Churn vs Retained Customers per Country", 
       x = "Country", 
       y = "Percentage of Customers") +
  scale_fill_manual(values = c("Not Churned" = "lightblue", "Churned" = "salmon")) +  
  theme_minimal()

# Based on the above plot we can see that Germany has the highest customer churned
# 0f 31.9% followed by Spain(18.1%) and lastly by France with 16.7%

# Filter the data for churned customers only
churned_data <- churnsub %>% filter(churn == "Churned")  # Use "Churned" instead of 1

# Ensure 'active_member' is a factor for proper categorization in the plot
churned_data <- churned_data %>% mutate(active_member = factor(active_member, levels = c("Inactive", "Active")))

# Create the plot showing churned customers by active_member status per country
ggplot(churned_data, aes(x = active_member, fill = active_member)) +
  geom_bar() +  # Plot bars for churned customers only
  facet_wrap(~ country) +  # Create separate plots for each country
  labs(title = "Churned Customers by Active Member Status per Country", 
       x = "Active Member Status", 
       y = "Count of Churned Customers") +
  scale_fill_manual(values = c("Inactive" = "lightblue", "Active" = "lightgreen")) +  # Custom colors and labels
  theme_minimal() +
  theme(legend.title = element_blank())

# From the plot we can see that non active members have higher rates of churning 
# compared to active members and this is consistent across all countries

# Ensure 'churn' and 'gender' are factors and correctly formatted
churnsub <- churnsub %>%
  mutate(
    churn = factor(churn, levels = c("Not Churned", "Churned")),
    gender = factor(gender, levels = c("Male", "Female"))
  )

# Calculate the percentage of churned vs. non-churned customers by gender per country
gender_churn_by_country <- churnsub %>%
  group_by(country, gender, churn) %>%
  summarise(customer_count = n(), .groups = "drop") %>%  # Add .groups = "drop" to suppress the message
  group_by(country, gender) %>%
  mutate(percentage = customer_count / sum(customer_count) * 100) %>%
  mutate(churn_label = ifelse(churn == "Churned", "Churned", "Not Churned"))

# Create the plot showing gender distribution of churn across countries
ggplot(gender_churn_by_country, aes(x = gender, y = percentage, fill = churn_label)) +
  geom_bar(stat = "identity", position = "dodge") + 
  geom_text(aes(label = sprintf("%.1f%%", percentage)), 
            position = position_dodge(0.9), 
            vjust = -0.5, 
            size = 3) +
  facet_wrap(~ country) +
  labs(title = "Churn Distribution by Gender Across Countries", 
       x = "Gender", 
       y = "Percentage of Customers",
       fill = "Churn Status") +
  scale_fill_manual(values = c("Not Churned" = "lightblue", "Churned" = "salmon")) +
  theme_minimal() +
  theme(axis.text.x = element_text(angle = 0))

# From the above plot we can see that Germany has the highest churned rate in terms of
# gender, followed by France and Spain which we saw above
# We also notice that females have the highest churn rate and this is consistent
# across all 3 countries

# Interactive effect of active membership and credit card on churn
active_card_churn <- churnsub %>%
  group_by(active_member, credit_card, churn) %>%
  summarise(count = n(), .groups = "drop") %>%  # Add .groups = "drop" to avoid the message
  group_by(active_member, credit_card) %>%
  mutate(percentage = count / sum(count) * 100) %>%
  filter(churn == "Churned")  

# Plot the churn rate by active membership and credit card ownership
ggplot(active_card_churn, aes(x = active_member, y = percentage, fill = credit_card)) +
  geom_bar(stat = "identity", position = "dodge") +
  geom_text(aes(label = sprintf("%.1f%%", percentage)), 
            position = position_dodge(width = 0.9), 
            vjust = -0.5, size = 3) +
  labs(title = "Churn Rate by Active Membership and Credit Card Ownership", 
       x = "Active Member", 
       y = "Churn Percentage") +
  scale_x_discrete(labels = c("Not Active", "Active")) +  
  scale_fill_discrete(labels = c("No Card", "Has Card")) +  
  theme_minimal()

# CORRELATION MATRIX WITH VISUALIZATION

# Convert churn to numeric (Churned -> 1, Not Churned -> 0)
numeric_data <- churnsub %>%
  mutate(churn_numeric = as.numeric(churn) - 1) %>%  # Convert factor to 0/1 (Churned=1, Not Churned=0)
  dplyr::select(credit_score, age, tenure, balance, estimated_salary, churn_numeric, products_number)

# Create correlation matrix for the selected numerical variables
cor_matrix <- cor(numeric_data, use = "complete.obs")  

# Plot the correlation matrix using 'corrplot' for a visually appealing matrix
corrplot(cor_matrix, method = "circle", type = "upper", 
         tl.col = "black", tl.srt = 45, addCoef.col = "black",
         title = "Correlation Matrix of Numeric Variables",
         mar = c(0, 0, 2, 0))

# DATA METHODOLOGY

# Load the required package for splitting data
library(caret)

# Create a train-test split (70% train, 30% test)
set.seed(123)  # for reproducibility
trainIndex <- createDataPartition(churnsub$churn, p = 0.7, list = FALSE)
train_data <- churnsub[trainIndex, ]
test_data <- churnsub[-trainIndex, ]

# Check the distribution of churn in both sets
table(train_data$churn)

## 
## Not Churned     Churned 
##         166          45

table(test_data$churn)

## 
## Not Churned     Churned 
##          71          18

since the train data is unbalanced, thus we need to resample the training data to balance classes using the ROSE function

# Resample the training data to balance classes
library(ROSE)

## Loaded ROSE 0.0-4

# Apply ROSE to balance the train data
set.seed(123)
train_data_balanced <- ROSE(churn ~ ., data = train_data, seed = 123)$data

# Check the new balanced churn distribution in train set
table(train_data_balanced$churn)

## 
## Not Churned     Churned 
##         110         101

# (test set remains untouched)
table(test_data$churn)

## 
## Not Churned     Churned 
##          71          18

# Train a logistic regression model on the balanced training data
log_model <- glm(churn ~ ., data = train_data_balanced, family = binomial())

# Summary of the logistic regression model
summary(log_model)

## 
## Call:
## glm(formula = churn ~ ., family = binomial(), data = train_data_balanced)
## 
## Coefficients:
##                          Estimate Std. Error z value Pr(>|z|)    
## (Intercept)            -1.784e+00  1.103e+00  -1.618  0.10573    
## credit_score           -8.951e-04  1.358e-03  -0.659  0.50985    
## countryGermany          1.798e+00  4.476e-01   4.016 5.91e-05 ***
## countrySpain            8.560e-01  4.369e-01   1.959  0.05007 .  
## genderFemale            9.490e-01  3.326e-01   2.853  0.00433 ** 
## age                     4.964e-02  1.600e-02   3.102  0.00192 ** 
## tenure                 -5.036e-02  5.168e-02  -0.974  0.32990    
## balance                -5.232e-07  2.578e-06  -0.203  0.83917    
## products_number        -1.861e-02  2.159e-01  -0.086  0.93129    
## credit_cardCredit Card -3.311e-01  3.588e-01  -0.923  0.35612    
## active_memberActive     2.327e-02  3.308e-01   0.070  0.94392    
## estimated_salary       -6.547e-06  2.738e-06  -2.392  0.01677 *  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 292.12  on 210  degrees of freedom
## Residual deviance: 239.70  on 199  degrees of freedom
## AIC: 263.7
## 
## Number of Fisher Scoring iterations: 4

vif(log_model)

##                      GVIF Df GVIF^(1/(2*Df))
## credit_score     1.098155  1        1.047929
## country          1.523161  2        1.110929
## gender           1.116204  1        1.056506
## age              1.113013  1        1.054994
## tenure           1.170610  1        1.081947
## balance          1.340092  1        1.157623
## products_number  1.187203  1        1.089588
## credit_card      1.094559  1        1.046212
## active_member    1.108449  1        1.052829
## estimated_salary 1.084432  1        1.041361

# VIF Interpretation:

# A VIF value of 1 indicates no multicollinearity.

# A VIF between 1 and 5 typically suggests moderate collinearity, which is not a serious concern.

# A VIF value greater than 5 or 10 may indicate high multicollinearity, which can lead to unreliable estimates and inflated standard errors.

# Generate predictions
test_predictions <- predict(log_model, newdata = test_data, type = "response")

# Check lengths to confirm they match
length(test_predictions)  # Should match nrow(test_data)

## [1] 89

nrow(test_data)

## [1] 89

# Assign predictions to test_data
test_data$predicted_churn <- ifelse(test_predictions > 0.5, "Churned", "Not Churned")

# Ensure factor levels are consistent
test_data$churn <- factor(test_data$churn, levels = c("Not Churned", "Churned"))
test_data$predicted_churn <- factor(test_data$predicted_churn, levels = c("Not Churned", "Churned"))

# Confusion Matrix
caret::confusionMatrix(test_data$predicted_churn, test_data$churn, positive = "Churned")

## Confusion Matrix and Statistics
## 
##              Reference
## Prediction    Not Churned Churned
##   Not Churned          51       9
##   Churned              20       9
##                                           
##                Accuracy : 0.6742          
##                  95% CI : (0.5666, 0.7698)
##     No Information Rate : 0.7978          
##     P-Value [Acc > NIR] : 0.99796         
##                                           
##                   Kappa : 0.1778          
##                                           
##  Mcnemar's Test P-Value : 0.06332         
##                                           
##             Sensitivity : 0.5000          
##             Specificity : 0.7183          
##          Pos Pred Value : 0.3103          
##          Neg Pred Value : 0.8500          
##              Prevalence : 0.2022          
##          Detection Rate : 0.1011          
##    Detection Prevalence : 0.3258          
##       Balanced Accuracy : 0.6092          
##                                           
##        'Positive' Class : Churned         
##

# ROC Curve and AUC
library(pROC)
roc_curve <- roc(test_data$churn, test_predictions)

## Setting levels: control = Not Churned, case = Churned

## Setting direction: controls < cases

plot(roc_curve, main="ROC Curve")

auc(roc_curve)

## Area under the curve: 0.5759

STA_project

Doris Mbitazi Asongafac

2025-04-21

Customer Churn Prediction for ABC Multistate Bank

R Implementation