#Essential libraries:

library(ggplot2)
library(dplyr)
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
library(scales) 
library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ forcats   1.0.1     ✔ stringr   1.6.0
## ✔ lubridate 1.9.4     ✔ tibble    3.3.1
## ✔ purrr     1.2.1     ✔ tidyr     1.3.2
## ✔ readr     2.1.6
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ readr::col_factor() masks scales::col_factor()
## ✖ purrr::discard()    masks scales::discard()
## ✖ dplyr::filter()     masks stats::filter()
## ✖ dplyr::lag()        masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
#Data Import & Initial Inspection:
bank_churn_data <- read.csv("D:/DCS402/Bank Customer Churn Prediction.csv")

# Initial Inspection
glimpse(bank_churn_data)
## Rows: 10,000
## Columns: 12
## $ customer_id      <int> 15634602, 15647311, 15619304, 15701354, 15737888, 155…
## $ credit_score     <int> 619, 608, 502, 699, 850, 645, 822, 376, 501, 684, 528…
## $ country          <chr> "France", "Spain", "France", "France", "Spain", "Spai…
## $ gender           <chr> "Female", "Female", "Female", "Female", "Female", "Ma…
## $ age              <int> 42, 41, 42, 39, 43, 44, 50, 29, 44, 27, 31, 24, 34, 2…
## $ tenure           <int> 2, 1, 8, 1, 2, 8, 7, 4, 4, 2, 6, 3, 10, 5, 7, 3, 1, 9…
## $ balance          <dbl> 0.00, 83807.86, 159660.80, 0.00, 125510.82, 113755.78…
## $ products_number  <int> 1, 1, 3, 2, 1, 2, 2, 4, 2, 1, 2, 2, 2, 2, 2, 2, 1, 2,…
## $ credit_card      <int> 1, 0, 1, 0, 1, 1, 1, 1, 0, 1, 0, 1, 1, 0, 1, 0, 1, 1,…
## $ active_member    <int> 1, 1, 0, 0, 1, 0, 1, 0, 1, 1, 0, 0, 0, 0, 1, 1, 0, 1,…
## $ estimated_salary <dbl> 101348.88, 112542.58, 113931.57, 93826.63, 79084.10, …
## $ churn            <int> 1, 0, 1, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0,…
summary(bank_churn_data)
##   customer_id        credit_score     country             gender         
##  Min.   :15565701   Min.   :350.0   Length:10000       Length:10000      
##  1st Qu.:15628528   1st Qu.:584.0   Class :character   Class :character  
##  Median :15690738   Median :652.0   Mode  :character   Mode  :character  
##  Mean   :15690941   Mean   :650.5                                        
##  3rd Qu.:15753234   3rd Qu.:718.0                                        
##  Max.   :15815690   Max.   :850.0                                        
##       age            tenure          balance       products_number
##  Min.   :18.00   Min.   : 0.000   Min.   :     0   Min.   :1.00   
##  1st Qu.:32.00   1st Qu.: 3.000   1st Qu.:     0   1st Qu.:1.00   
##  Median :37.00   Median : 5.000   Median : 97199   Median :1.00   
##  Mean   :38.92   Mean   : 5.013   Mean   : 76486   Mean   :1.53   
##  3rd Qu.:44.00   3rd Qu.: 7.000   3rd Qu.:127644   3rd Qu.:2.00   
##  Max.   :92.00   Max.   :10.000   Max.   :250898   Max.   :4.00   
##   credit_card     active_member    estimated_salary        churn       
##  Min.   :0.0000   Min.   :0.0000   Min.   :    11.58   Min.   :0.0000  
##  1st Qu.:0.0000   1st Qu.:0.0000   1st Qu.: 51002.11   1st Qu.:0.0000  
##  Median :1.0000   Median :1.0000   Median :100193.91   Median :0.0000  
##  Mean   :0.7055   Mean   :0.5151   Mean   :100090.24   Mean   :0.2037  
##  3rd Qu.:1.0000   3rd Qu.:1.0000   3rd Qu.:149388.25   3rd Qu.:0.0000  
##  Max.   :1.0000   Max.   :1.0000   Max.   :199992.48   Max.   :1.0000
#Data Cleaning:
#Null/Missing Value
missing_counts <- colSums(is.na(bank_churn_data))
missing_churndata <- data.frame(variable = names(missing_counts), missing = missing_counts)
missing_churndata
##                          variable missing
## customer_id           customer_id       0
## credit_score         credit_score       0
## country                   country       0
## gender                     gender       0
## age                           age       0
## tenure                     tenure       0
## balance                   balance       0
## products_number   products_number       0
## credit_card           credit_card       0
## active_member       active_member       0
## estimated_salary estimated_salary       0
## churn                       churn       0
#Cleaning Inconsistencies:
churn_data <- bank_churn_data %>%
  mutate(country = trimws(tolower(country)),
         gender = trimws(tolower(gender))) %>%
  distinct() %>%
  na.omit()
#Outlier Detection: Credit Score:
Q1 <- quantile(churn_data$credit_score, 0.25)
Q3 <- quantile(churn_data$credit_score, 0.75)
IQR_val <- Q3 - Q1

# Filtering Outliers
churn_clean <- churn_data %>%
  filter(credit_score >= (Q1 - 1.5 * IQR_val) & 
           credit_score <= (Q3 + 1.5 * IQR_val))
#Visualization Chart
# Comparison plots
ggplot(bank_churn_data, aes(y = credit_score)) +
  geom_boxplot(fill = "lightblue", outlier.color = "red") +
  labs(title = "Credit Score: Before Outlier Removal", subtitle = "Red dots indicate outliers") + 
  theme_minimal()

ggplot(churn_clean, aes(y = credit_score)) +
  geom_boxplot(fill = "lightgreen") +
  labs(title = "Credit Score: Post-Cleaning (IQR Method)") + 
  theme_minimal()

#Data Transformation:
churn_final <- churn_clean %>%
  mutate(
    log_salary = log10(estimated_salary + 1),
    scaled_salary = (estimated_salary - min(estimated_salary)) / 
      (max(estimated_salary) - min(estimated_salary)),
    z_age = as.numeric(scale(age))
  )

# Reshaping for visualization
churn_plots_long <- churn_final %>%
  pivot_longer(
    cols = c(log_salary, scaled_salary), 
    names_to = "method", 
    values_to = "transformed_value"
  )
#Transformation Comparison:
ggplot(churn_plots_long, aes(x = transformed_value, fill = method)) +
  geom_histogram(bins = 30, color = "white", show.legend = FALSE) +
  facet_wrap(~method, scales = "free") +
  theme_minimal() +
  scale_fill_manual(values = c("log_salary" = "darkgreen", "scaled_salary" = "steelblue")) +
  labs(title = "Comparison of Salary Transformations", x = "Value", y = "Frequency")

#Exploratory Data Analysis (EDA):
churn_summary <- churn_final %>%
  group_by(country) %>%
  summarise(
    churn_rate = mean(churn, na.rm = TRUE) * 100,
    customer_count = n(),
    avg_balance = mean(balance, na.rm = TRUE)
  ) %>%
  arrange(desc(churn_rate))

# Insight Plot: Churn Geography
ggplot(churn_summary, aes(x = reorder(country, -churn_rate), y = churn_rate)) +
  geom_bar(stat = "identity", fill = "lightpink", width = 0.6) +
  geom_text(aes(label = paste0(round(churn_rate, 1), "%")), vjust = -0.5) +
  labs(title = "Churn Rate by Country", x = "Country", y = "Churn Rate (%)") +
  theme_minimal()

##Statistical Modeling & Correlation
#Linear Regression:
model <- lm(balance ~ age + credit_score + country, data = churn_final)
summary(model)
## 
## Call:
## lm(formula = balance ~ age + credit_score + country, data = churn_final)
## 
## Residuals:
##    Min     1Q Median     3Q    Max 
## -94851 -61627   -155  48013 189212 
## 
## Coefficients:
##                 Estimate Std. Error t value Pr(>|t|)    
## (Intercept)    57900.399   4483.020  12.915   <2e-16 ***
## age               56.796     54.617   1.040    0.298    
## credit_score       3.126      5.955   0.525    0.600    
## countrygermany 57518.119   1400.677  41.065   <2e-16 ***
## countryspain    -420.412   1405.318  -0.299    0.765    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 57160 on 9980 degrees of freedom
## Multiple R-squared:  0.161,  Adjusted R-squared:  0.1607 
## F-statistic: 478.8 on 4 and 9980 DF,  p-value: < 2.2e-16
#Correlation Heatmap:
numeric_cols <- churn_final %>% 
  select(credit_score, age, balance, estimated_salary, log_salary)

cor_matrix <- cor(numeric_cols, use = "complete.obs")
cor_df <- as.data.frame(as.table(cor_matrix))

ggplot(cor_df, aes(x = Var1, y = Var2, fill = Freq)) +
  geom_tile(color = "white") +
  scale_fill_gradient2(low = "blue", high = "red", mid = "white", midpoint = 0) +
  geom_text(aes(label = round(Freq, 2)), color = "black") +
  labs(title = "Correlation Heatmap", subtitle = "Identifying Multicollinearity") +
  theme_minimal() + 
  coord_fixed()