R Implementation
# Load required libraries
library(tidyverse) # For data manipulation and visualization
## Warning: package 'ggplot2' was built under R version 4.4.3
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr 1.1.4 ✔ readr 2.1.5
## ✔ forcats 1.0.0 ✔ stringr 1.5.1
## ✔ ggplot2 3.5.2 ✔ tibble 3.2.1
## ✔ lubridate 1.9.4 ✔ tidyr 1.3.1
## ✔ purrr 1.0.2
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(caret) # For machine learning workflows
## Loading required package: lattice
##
## Attaching package: 'caret'
##
## The following object is masked from 'package:purrr':
##
## lift
library(corrplot) # For correlation visualization
## corrplot 0.95 loaded
library(randomForest) # For Random Forest model
## randomForest 4.7-1.2
## Type rfNews() to see new features/changes/bug fixes.
##
## Attaching package: 'randomForest'
##
## The following object is masked from 'package:dplyr':
##
## combine
##
## The following object is masked from 'package:ggplot2':
##
## margin
library(xgboost) # For XGBoost model
## Warning: package 'xgboost' was built under R version 4.4.3
##
## Attaching package: 'xgboost'
##
## The following object is masked from 'package:dplyr':
##
## slice
library(pROC) # For ROC curve analysis
## Type 'citation("pROC")' for a citation.
##
## Attaching package: 'pROC'
##
## The following objects are masked from 'package:stats':
##
## cov, smooth, var
library(e1071) # For other ML algorithms
library(ggplot2) # For advanced visualizations
library(dplyr)
library(dplyr)
library(corrplot)
library(pander)
## Warning: package 'pander' was built under R version 4.4.3
library(car)
## Loading required package: carData
##
## Attaching package: 'car'
##
## The following object is masked from 'package:dplyr':
##
## recode
##
## The following object is masked from 'package:purrr':
##
## some
# Set seed for reproducibility
set.seed(123)
# 1. DATA PREPARATION ---------------------------------------------------------
# Load the dataset (it's in CSV format)
churn <- read.csv("/Users/doris/OneDrive/Documents/STA6013 Project/Bank Customer Churn Prediction.csv")
#Select n=300 subset with relatively balanced categorical variables
#Set seed for reproducibility
set.seed(999)
#Filter by "country" then and sample 100 from each
france_sample <- churn[churn$country == "France", ][sample(sum(churn$country == "France"), 100), ]
germany_sample <- churn[churn$country == "Germany", ][sample(sum(churn$country == "Germany"), 100), ]
spain_sample <- churn[churn$country == "Spain", ][sample(sum(churn$country == "Spain"), 100), ]
#Combine the subsets
churnsub <- rbind(france_sample, germany_sample, spain_sample)
#Frequency of each outcome for 'country'
country_freq <- table(churnsub$country)
country_df <- data.frame(Country = names(country_freq), Count = as.vector(country_freq))
pander(country_df, caption = "Frequency Table for Country")
Frequency Table for Country
| France |
100 |
| Germany |
100 |
| Spain |
100 |
#Frequency of each outcome for 'gender'
gender_freq <- table(churnsub$gender)
gender_df <- data.frame(Gender = names(gender_freq), Count = as.vector(gender_freq))
pander(gender_df, caption = "Frequency Table for Gender")
Frequency Table for Gender
| Female |
148 |
| Male |
152 |
# 2. DATA CLEANING AND PREPARATION
# Check for missing values
colSums(is.na(churnsub))
## customer_id credit_score country gender
## 0 0 0 0
## age tenure balance products_number
## 0 0 0 0
## credit_card active_member estimated_salary churn
## 0 0 0 0
# Check for duplicate rows
duplicates <- churnsub[duplicated(churnsub), ]
cat("Number of duplicate rows: ", nrow(duplicates), "\n")
## Number of duplicate rows: 0
# Our dataset has no missing values and no duplicates, thus its clean
# Check data structure
str(churnsub)
## 'data.frame': 300 obs. of 12 variables:
## $ customer_id : int 15723720 15582841 15588219 15780770 15809999 15610165 15640491 15580227 15594450 15586959 ...
## $ credit_score : int 591 600 850 445 709 761 464 803 695 468 ...
## $ country : chr "France" "France" "France" "France" ...
## $ gender : chr "Female" "Male" "Female" "Male" ...
## $ age : int 31 29 38 31 41 26 33 33 49 42 ...
## $ tenure : int 7 8 1 7 3 1 10 6 9 5 ...
## $ balance : num 0 0 106872 145057 150301 ...
## $ products_number : int 2 2 2 1 2 2 2 2 1 2 ...
## $ credit_card : int 0 0 1 1 1 1 1 1 1 1 ...
## $ active_member : int 1 1 0 1 0 1 0 0 0 0 ...
## $ estimated_salary: num 48778 34747 29333 175894 71673 ...
## $ churn : int 0 0 0 0 0 0 0 0 0 0 ...
summary(churnsub)
## customer_id credit_score country gender
## Min. :15565706 Min. :386.0 Length:300 Length:300
## 1st Qu.:15626314 1st Qu.:581.8 Class :character Class :character
## Median :15684530 Median :651.5 Mode :character Mode :character
## Mean :15687437 Mean :652.1
## 3rd Qu.:15750989 3rd Qu.:717.0
## Max. :15815070 Max. :850.0
## age tenure balance products_number
## Min. :19.00 Min. : 0.000 Min. : 0 Min. :1.000
## 1st Qu.:32.00 1st Qu.: 3.000 1st Qu.: 0 1st Qu.:1.000
## Median :37.00 Median : 5.000 Median : 99965 Median :1.000
## Mean :38.45 Mean : 4.927 Mean : 81915 Mean :1.503
## 3rd Qu.:43.00 3rd Qu.: 7.000 3rd Qu.:130175 3rd Qu.:2.000
## Max. :77.00 Max. :10.000 Max. :192391 Max. :4.000
## credit_card active_member estimated_salary churn
## Min. :0.0000 Min. :0.0000 Min. : 417.4 Min. :0.00
## 1st Qu.:0.0000 1st Qu.:0.0000 1st Qu.: 52671.0 1st Qu.:0.00
## Median :1.0000 Median :1.0000 Median : 95604.2 Median :0.00
## Mean :0.6967 Mean :0.5133 Mean : 98968.3 Mean :0.21
## 3rd Qu.:1.0000 3rd Qu.:1.0000 3rd Qu.:144791.1 3rd Qu.:0.00
## Max. :1.0000 Max. :1.0000 Max. :199409.2 Max. :1.00
# Our random sampled dataset consist of 300 observations and 12 variables
# From above its clear that some variables are misclassified, this needs to be
# corrected to ensure smooth analysis, we shall not use customer id since
# this column is likely just an identifier and doesn't provide any actionable insights # and thus will not be useful for our analysis
#our summarise data provides more details about our data, like limits for outliers
# factor variable, just more detailed information
# Check levels
levels(as.factor(churnsub$country))
## [1] "France" "Germany" "Spain"
levels(as.factor(churnsub$gender))
## [1] "Female" "Male"
levels(as.factor(churnsub$active_member))
## [1] "0" "1"
levels(as.factor(churnsub$credit_card))
## [1] "0" "1"
levels(as.factor(churnsub$churn))
## [1] "0" "1"
# Convert appropriate variables to factors
churnsub$country <- as.factor(churnsub$country)
churnsub$gender <- as.factor(churnsub$gender)
churnsub$credit_card <- as.factor(churnsub$credit_card)
churnsub$active_member <- as.factor(churnsub$active_member)
churnsub$churn <- as.factor(churnsub$churn)
# Remove the customer_id column
churnsub <- churnsub %>%
dplyr::select(-customer_id)
# See the change
str(churnsub)
## 'data.frame': 300 obs. of 11 variables:
## $ credit_score : int 591 600 850 445 709 761 464 803 695 468 ...
## $ country : Factor w/ 3 levels "France","Germany",..: 1 1 1 1 1 1 1 1 1 1 ...
## $ gender : Factor w/ 2 levels "Female","Male": 1 2 1 2 1 1 1 2 2 1 ...
## $ age : int 31 29 38 31 41 26 33 33 49 42 ...
## $ tenure : int 7 8 1 7 3 1 10 6 9 5 ...
## $ balance : num 0 0 106872 145057 150301 ...
## $ products_number : int 2 2 2 1 2 2 2 2 1 2 ...
## $ credit_card : Factor w/ 2 levels "0","1": 1 1 2 2 2 2 2 2 2 2 ...
## $ active_member : Factor w/ 2 levels "0","1": 2 2 1 2 1 2 1 1 1 1 ...
## $ estimated_salary: num 48778 34747 29333 175894 71673 ...
## $ churn : Factor w/ 2 levels "0","1": 1 1 1 1 1 1 1 1 1 1 ...
# All our variables are now correctly classified.
# Converting the Boolean values to the desired values
churnsub <- churnsub %>%
mutate(credit_card = fct_recode(credit_card, "Credit Card" = "1", "No-Credit Card" = "0"),
active_member = fct_recode(active_member, "Active" = "1", "Inactive" = "0"),
churn = fct_recode(churn, "Churned" = "1", "Not Churned" = "0"))
# View the first few rows of the updated dataset
head(churnsub)
# 3. EXPLORATORY DATA ANALYSIS ------------------------------------------------
# Distribution of the target variable
churn_distribution <- table(churnsub$churn)
print(churn_distribution)
##
## Not Churned Churned
## 237 63
barplot(churn_distribution,
main="Distribution of Customer Churn",
xlab="Churn Status",
ylab="Count",
col=c("green", "red"))

# Based from this bar plot we can see that many customers are retained and few are
# exited, our goal is to find out what factors influenced this behavior and fine
# possible solutions
# Calculate the percentage of churned vs. non-churned customers per country
churn_counts_percentage <- churnsub %>%
group_by(country, churn) %>%
summarise(customer_count = n(), .groups = "drop") %>%
group_by(country) %>%
mutate(percentage = customer_count / sum(customer_count) * 100) %>%
mutate(churn_label = ifelse(churn == "Churned", "Churned", "Not Churned"))
# Create the side-by-side bar plot with percentages
ggplot(churn_counts_percentage, aes(x = country, y = percentage, fill = churn_label)) +
geom_bar(stat = "identity", position = "dodge") +
geom_text(aes(label = sprintf("%.1f%%", percentage)), vjust = -0.5, position = position_dodge(0.9)) +
labs(title = "Percentage of Churn vs Retained Customers per Country",
x = "Country",
y = "Percentage of Customers") +
scale_fill_manual(values = c("Not Churned" = "lightblue", "Churned" = "salmon")) +
theme_minimal()

# Based on the above plot we can see that Germany has the highest customer churned
# 0f 31.9% followed by Spain(18.1%) and lastly by France with 16.7%
# Filter the data for churned customers only
churned_data <- churnsub %>% filter(churn == "Churned") # Use "Churned" instead of 1
# Ensure 'active_member' is a factor for proper categorization in the plot
churned_data <- churned_data %>% mutate(active_member = factor(active_member, levels = c("Inactive", "Active")))
# Create the plot showing churned customers by active_member status per country
ggplot(churned_data, aes(x = active_member, fill = active_member)) +
geom_bar() + # Plot bars for churned customers only
facet_wrap(~ country) + # Create separate plots for each country
labs(title = "Churned Customers by Active Member Status per Country",
x = "Active Member Status",
y = "Count of Churned Customers") +
scale_fill_manual(values = c("Inactive" = "lightblue", "Active" = "lightgreen")) + # Custom colors and labels
theme_minimal() +
theme(legend.title = element_blank())

# From the plot we can see that non active members have higher rates of churning
# compared to active members and this is consistent across all countries
# Ensure 'churn' and 'gender' are factors and correctly formatted
churnsub <- churnsub %>%
mutate(
churn = factor(churn, levels = c("Not Churned", "Churned")),
gender = factor(gender, levels = c("Male", "Female"))
)
# Calculate the percentage of churned vs. non-churned customers by gender per country
gender_churn_by_country <- churnsub %>%
group_by(country, gender, churn) %>%
summarise(customer_count = n(), .groups = "drop") %>% # Add .groups = "drop" to suppress the message
group_by(country, gender) %>%
mutate(percentage = customer_count / sum(customer_count) * 100) %>%
mutate(churn_label = ifelse(churn == "Churned", "Churned", "Not Churned"))
# Create the plot showing gender distribution of churn across countries
ggplot(gender_churn_by_country, aes(x = gender, y = percentage, fill = churn_label)) +
geom_bar(stat = "identity", position = "dodge") +
geom_text(aes(label = sprintf("%.1f%%", percentage)),
position = position_dodge(0.9),
vjust = -0.5,
size = 3) +
facet_wrap(~ country) +
labs(title = "Churn Distribution by Gender Across Countries",
x = "Gender",
y = "Percentage of Customers",
fill = "Churn Status") +
scale_fill_manual(values = c("Not Churned" = "lightblue", "Churned" = "salmon")) +
theme_minimal() +
theme(axis.text.x = element_text(angle = 0))

# From the above plot we can see that Germany has the highest churned rate in terms of
# gender, followed by France and Spain which we saw above
# We also notice that females have the highest churn rate and this is consistent
# across all 3 countries
# Interactive effect of active membership and credit card on churn
active_card_churn <- churnsub %>%
group_by(active_member, credit_card, churn) %>%
summarise(count = n(), .groups = "drop") %>% # Add .groups = "drop" to avoid the message
group_by(active_member, credit_card) %>%
mutate(percentage = count / sum(count) * 100) %>%
filter(churn == "Churned")
# Plot the churn rate by active membership and credit card ownership
ggplot(active_card_churn, aes(x = active_member, y = percentage, fill = credit_card)) +
geom_bar(stat = "identity", position = "dodge") +
geom_text(aes(label = sprintf("%.1f%%", percentage)),
position = position_dodge(width = 0.9),
vjust = -0.5, size = 3) +
labs(title = "Churn Rate by Active Membership and Credit Card Ownership",
x = "Active Member",
y = "Churn Percentage") +
scale_x_discrete(labels = c("Not Active", "Active")) +
scale_fill_discrete(labels = c("No Card", "Has Card")) +
theme_minimal()

# CORRELATION MATRIX WITH VISUALIZATION
# Convert churn to numeric (Churned -> 1, Not Churned -> 0)
numeric_data <- churnsub %>%
mutate(churn_numeric = as.numeric(churn) - 1) %>% # Convert factor to 0/1 (Churned=1, Not Churned=0)
dplyr::select(credit_score, age, tenure, balance, estimated_salary, churn_numeric, products_number)
# Create correlation matrix for the selected numerical variables
cor_matrix <- cor(numeric_data, use = "complete.obs")
# Plot the correlation matrix using 'corrplot' for a visually appealing matrix
corrplot(cor_matrix, method = "circle", type = "upper",
tl.col = "black", tl.srt = 45, addCoef.col = "black",
title = "Correlation Matrix of Numeric Variables",
mar = c(0, 0, 2, 0))

# DATA METHODOLOGY
# Load the required package for splitting data
library(caret)
# Create a train-test split (70% train, 30% test)
set.seed(123) # for reproducibility
trainIndex <- createDataPartition(churnsub$churn, p = 0.7, list = FALSE)
train_data <- churnsub[trainIndex, ]
test_data <- churnsub[-trainIndex, ]
# Check the distribution of churn in both sets
table(train_data$churn)
##
## Not Churned Churned
## 166 45
table(test_data$churn)
##
## Not Churned Churned
## 71 18
since the train data is unbalanced, thus we need to resample the
training data to balance classes using the ROSE function
# Resample the training data to balance classes
library(ROSE)
## Loaded ROSE 0.0-4
# Apply ROSE to balance the train data
set.seed(123)
train_data_balanced <- ROSE(churn ~ ., data = train_data, seed = 123)$data
# Check the new balanced churn distribution in train set
table(train_data_balanced$churn)
##
## Not Churned Churned
## 110 101
# (test set remains untouched)
table(test_data$churn)
##
## Not Churned Churned
## 71 18
# Train a logistic regression model on the balanced training data
log_model <- glm(churn ~ ., data = train_data_balanced, family = binomial())
# Summary of the logistic regression model
summary(log_model)
##
## Call:
## glm(formula = churn ~ ., family = binomial(), data = train_data_balanced)
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -1.784e+00 1.103e+00 -1.618 0.10573
## credit_score -8.951e-04 1.358e-03 -0.659 0.50985
## countryGermany 1.798e+00 4.476e-01 4.016 5.91e-05 ***
## countrySpain 8.560e-01 4.369e-01 1.959 0.05007 .
## genderFemale 9.490e-01 3.326e-01 2.853 0.00433 **
## age 4.964e-02 1.600e-02 3.102 0.00192 **
## tenure -5.036e-02 5.168e-02 -0.974 0.32990
## balance -5.232e-07 2.578e-06 -0.203 0.83917
## products_number -1.861e-02 2.159e-01 -0.086 0.93129
## credit_cardCredit Card -3.311e-01 3.588e-01 -0.923 0.35612
## active_memberActive 2.327e-02 3.308e-01 0.070 0.94392
## estimated_salary -6.547e-06 2.738e-06 -2.392 0.01677 *
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 292.12 on 210 degrees of freedom
## Residual deviance: 239.70 on 199 degrees of freedom
## AIC: 263.7
##
## Number of Fisher Scoring iterations: 4
vif(log_model)
## GVIF Df GVIF^(1/(2*Df))
## credit_score 1.098155 1 1.047929
## country 1.523161 2 1.110929
## gender 1.116204 1 1.056506
## age 1.113013 1 1.054994
## tenure 1.170610 1 1.081947
## balance 1.340092 1 1.157623
## products_number 1.187203 1 1.089588
## credit_card 1.094559 1 1.046212
## active_member 1.108449 1 1.052829
## estimated_salary 1.084432 1 1.041361
# VIF Interpretation:
# A VIF value of 1 indicates no multicollinearity.
# A VIF between 1 and 5 typically suggests moderate collinearity, which is not a serious concern.
# A VIF value greater than 5 or 10 may indicate high multicollinearity, which can lead to unreliable estimates and inflated standard errors.
# Generate predictions
test_predictions <- predict(log_model, newdata = test_data, type = "response")
# Check lengths to confirm they match
length(test_predictions) # Should match nrow(test_data)
## [1] 89
nrow(test_data)
## [1] 89
# Assign predictions to test_data
test_data$predicted_churn <- ifelse(test_predictions > 0.5, "Churned", "Not Churned")
# Ensure factor levels are consistent
test_data$churn <- factor(test_data$churn, levels = c("Not Churned", "Churned"))
test_data$predicted_churn <- factor(test_data$predicted_churn, levels = c("Not Churned", "Churned"))
# Confusion Matrix
caret::confusionMatrix(test_data$predicted_churn, test_data$churn, positive = "Churned")
## Confusion Matrix and Statistics
##
## Reference
## Prediction Not Churned Churned
## Not Churned 51 9
## Churned 20 9
##
## Accuracy : 0.6742
## 95% CI : (0.5666, 0.7698)
## No Information Rate : 0.7978
## P-Value [Acc > NIR] : 0.99796
##
## Kappa : 0.1778
##
## Mcnemar's Test P-Value : 0.06332
##
## Sensitivity : 0.5000
## Specificity : 0.7183
## Pos Pred Value : 0.3103
## Neg Pred Value : 0.8500
## Prevalence : 0.2022
## Detection Rate : 0.1011
## Detection Prevalence : 0.3258
## Balanced Accuracy : 0.6092
##
## 'Positive' Class : Churned
##
# ROC Curve and AUC
library(pROC)
roc_curve <- roc(test_data$churn, test_predictions)
## Setting levels: control = Not Churned, case = Churned
## Setting direction: controls < cases
plot(roc_curve, main="ROC Curve")

auc(roc_curve)
## Area under the curve: 0.5759