#Essential libraries:
library(ggplot2)
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(scales)
library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ forcats 1.0.1 ✔ stringr 1.6.0
## ✔ lubridate 1.9.4 ✔ tibble 3.3.1
## ✔ purrr 1.2.1 ✔ tidyr 1.3.2
## ✔ readr 2.1.6
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ readr::col_factor() masks scales::col_factor()
## ✖ purrr::discard() masks scales::discard()
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
#Data Import & Initial Inspection:
bank_churn_data <- read.csv("D:/DCS402/Bank Customer Churn Prediction.csv")
# Initial Inspection
glimpse(bank_churn_data)
## Rows: 10,000
## Columns: 12
## $ customer_id <int> 15634602, 15647311, 15619304, 15701354, 15737888, 155…
## $ credit_score <int> 619, 608, 502, 699, 850, 645, 822, 376, 501, 684, 528…
## $ country <chr> "France", "Spain", "France", "France", "Spain", "Spai…
## $ gender <chr> "Female", "Female", "Female", "Female", "Female", "Ma…
## $ age <int> 42, 41, 42, 39, 43, 44, 50, 29, 44, 27, 31, 24, 34, 2…
## $ tenure <int> 2, 1, 8, 1, 2, 8, 7, 4, 4, 2, 6, 3, 10, 5, 7, 3, 1, 9…
## $ balance <dbl> 0.00, 83807.86, 159660.80, 0.00, 125510.82, 113755.78…
## $ products_number <int> 1, 1, 3, 2, 1, 2, 2, 4, 2, 1, 2, 2, 2, 2, 2, 2, 1, 2,…
## $ credit_card <int> 1, 0, 1, 0, 1, 1, 1, 1, 0, 1, 0, 1, 1, 0, 1, 0, 1, 1,…
## $ active_member <int> 1, 1, 0, 0, 1, 0, 1, 0, 1, 1, 0, 0, 0, 0, 1, 1, 0, 1,…
## $ estimated_salary <dbl> 101348.88, 112542.58, 113931.57, 93826.63, 79084.10, …
## $ churn <int> 1, 0, 1, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0,…
summary(bank_churn_data)
## customer_id credit_score country gender
## Min. :15565701 Min. :350.0 Length:10000 Length:10000
## 1st Qu.:15628528 1st Qu.:584.0 Class :character Class :character
## Median :15690738 Median :652.0 Mode :character Mode :character
## Mean :15690941 Mean :650.5
## 3rd Qu.:15753234 3rd Qu.:718.0
## Max. :15815690 Max. :850.0
## age tenure balance products_number
## Min. :18.00 Min. : 0.000 Min. : 0 Min. :1.00
## 1st Qu.:32.00 1st Qu.: 3.000 1st Qu.: 0 1st Qu.:1.00
## Median :37.00 Median : 5.000 Median : 97199 Median :1.00
## Mean :38.92 Mean : 5.013 Mean : 76486 Mean :1.53
## 3rd Qu.:44.00 3rd Qu.: 7.000 3rd Qu.:127644 3rd Qu.:2.00
## Max. :92.00 Max. :10.000 Max. :250898 Max. :4.00
## credit_card active_member estimated_salary churn
## Min. :0.0000 Min. :0.0000 Min. : 11.58 Min. :0.0000
## 1st Qu.:0.0000 1st Qu.:0.0000 1st Qu.: 51002.11 1st Qu.:0.0000
## Median :1.0000 Median :1.0000 Median :100193.91 Median :0.0000
## Mean :0.7055 Mean :0.5151 Mean :100090.24 Mean :0.2037
## 3rd Qu.:1.0000 3rd Qu.:1.0000 3rd Qu.:149388.25 3rd Qu.:0.0000
## Max. :1.0000 Max. :1.0000 Max. :199992.48 Max. :1.0000
#Data Cleaning:
#Null/Missing Value
missing_counts <- colSums(is.na(bank_churn_data))
missing_churndata <- data.frame(variable = names(missing_counts), missing = missing_counts)
missing_churndata
## variable missing
## customer_id customer_id 0
## credit_score credit_score 0
## country country 0
## gender gender 0
## age age 0
## tenure tenure 0
## balance balance 0
## products_number products_number 0
## credit_card credit_card 0
## active_member active_member 0
## estimated_salary estimated_salary 0
## churn churn 0
#Cleaning Inconsistencies:
churn_data <- bank_churn_data %>%
mutate(country = trimws(tolower(country)),
gender = trimws(tolower(gender))) %>%
distinct() %>%
na.omit()
#Outlier Detection: Credit Score:
Q1 <- quantile(churn_data$credit_score, 0.25)
Q3 <- quantile(churn_data$credit_score, 0.75)
IQR_val <- Q3 - Q1
# Filtering Outliers
churn_clean <- churn_data %>%
filter(credit_score >= (Q1 - 1.5 * IQR_val) &
credit_score <= (Q3 + 1.5 * IQR_val))
#Visualization Chart
# Comparison plots
ggplot(bank_churn_data, aes(y = credit_score)) +
geom_boxplot(fill = "lightblue", outlier.color = "red") +
labs(title = "Credit Score: Before Outlier Removal", subtitle = "Red dots indicate outliers") +
theme_minimal()

ggplot(churn_clean, aes(y = credit_score)) +
geom_boxplot(fill = "lightgreen") +
labs(title = "Credit Score: Post-Cleaning (IQR Method)") +
theme_minimal()

#Data Transformation:
churn_final <- churn_clean %>%
mutate(
log_salary = log10(estimated_salary + 1),
scaled_salary = (estimated_salary - min(estimated_salary)) /
(max(estimated_salary) - min(estimated_salary)),
z_age = as.numeric(scale(age))
)
# Reshaping for visualization
churn_plots_long <- churn_final %>%
pivot_longer(
cols = c(log_salary, scaled_salary),
names_to = "method",
values_to = "transformed_value"
)
#Transformation Comparison:
ggplot(churn_plots_long, aes(x = transformed_value, fill = method)) +
geom_histogram(bins = 30, color = "white", show.legend = FALSE) +
facet_wrap(~method, scales = "free") +
theme_minimal() +
scale_fill_manual(values = c("log_salary" = "darkgreen", "scaled_salary" = "steelblue")) +
labs(title = "Comparison of Salary Transformations", x = "Value", y = "Frequency")

#Exploratory Data Analysis (EDA):
churn_summary <- churn_final %>%
group_by(country) %>%
summarise(
churn_rate = mean(churn, na.rm = TRUE) * 100,
customer_count = n(),
avg_balance = mean(balance, na.rm = TRUE)
) %>%
arrange(desc(churn_rate))
# Insight Plot: Churn Geography
ggplot(churn_summary, aes(x = reorder(country, -churn_rate), y = churn_rate)) +
geom_bar(stat = "identity", fill = "lightpink", width = 0.6) +
geom_text(aes(label = paste0(round(churn_rate, 1), "%")), vjust = -0.5) +
labs(title = "Churn Rate by Country", x = "Country", y = "Churn Rate (%)") +
theme_minimal()

##Statistical Modeling & Correlation
#Linear Regression:
model <- lm(balance ~ age + credit_score + country, data = churn_final)
summary(model)
##
## Call:
## lm(formula = balance ~ age + credit_score + country, data = churn_final)
##
## Residuals:
## Min 1Q Median 3Q Max
## -94851 -61627 -155 48013 189212
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 57900.399 4483.020 12.915 <2e-16 ***
## age 56.796 54.617 1.040 0.298
## credit_score 3.126 5.955 0.525 0.600
## countrygermany 57518.119 1400.677 41.065 <2e-16 ***
## countryspain -420.412 1405.318 -0.299 0.765
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 57160 on 9980 degrees of freedom
## Multiple R-squared: 0.161, Adjusted R-squared: 0.1607
## F-statistic: 478.8 on 4 and 9980 DF, p-value: < 2.2e-16
#Correlation Heatmap:
numeric_cols <- churn_final %>%
select(credit_score, age, balance, estimated_salary, log_salary)
cor_matrix <- cor(numeric_cols, use = "complete.obs")
cor_df <- as.data.frame(as.table(cor_matrix))
ggplot(cor_df, aes(x = Var1, y = Var2, fill = Freq)) +
geom_tile(color = "white") +
scale_fill_gradient2(low = "blue", high = "red", mid = "white", midpoint = 0) +
geom_text(aes(label = round(Freq, 2)), color = "black") +
labs(title = "Correlation Heatmap", subtitle = "Identifying Multicollinearity") +
theme_minimal() +
coord_fixed()
