# Set global chunk options for the document
knitr::opts_chunk$set(
  echo = TRUE,      # Show code in the output
  warning = FALSE,  # Suppress warnings
  message = FALSE   # Suppress messages
)

Morgan State University

Department of Information Science & Systems

Fall 2024

INSS 615: Data Wrangling for Visualization

Title: Customer Churn Analysis For a European Bank

Name: AMOLE ADEYEMI & OLUWAFEMI SAMMUEL ESAN

# Load necessary library
library(rvest)
library(tidyverse)
library(readxl)
library(dplyr)
library(openxlsx)
bank <- read.csv("C:/Users/tunne/Downloads/Dirty_Bank_Churn.csv")

head(bank)
##   CustomerId  Surname CreditScore Geography Gender Age Tenure     Balance
## 1   15634602 HARGRAVE        -500         1 Female  42      2     £ -5000
## 2   15647311     hill        1500         3 Female  41      1   £ 9999999
## 3   15619304     ONIO        2000         1 Female  42      8      £ -300
## 4   15701354     boni        -100         1 Female  39      1    £ 888888
## 5   15737888 MITCHELL        9999         3 Female  43      2     £ 1e+09
## 6   15574012      chu         645         3   Male  44      8 £ 113755.78
##   NumOfProducts HasCrCard IsActiveMember EstimatedSalary Exited
## 1             1         1              1     £ 101348.88      1
## 2             1         0              1     £ 112542.58      0
## 3             3         1              0     £ 113931.57      1
## 4             2         0              0      £ 93826.63      0
## 5             1         1              1       £ 79084.1      0
## 6             2         1              0     £ 149756.71      1
# Remove all "-" characters from the dataset
bank <- bank %>%
  mutate(across(everything(), ~ str_replace_all(., "-", "")))

# View the cleaned dataset
print(head(bank))
##   CustomerId  Surname CreditScore Geography Gender Age Tenure     Balance
## 1   15634602 HARGRAVE         500         1 Female  42      2      £ 5000
## 2   15647311     hill        1500         3 Female  41      1   £ 9999999
## 3   15619304     ONIO        2000         1 Female  42      8       £ 300
## 4   15701354     boni         100         1 Female  39      1    £ 888888
## 5   15737888 MITCHELL        9999         3 Female  43      2     £ 1e+09
## 6   15574012      chu         645         3   Male  44      8 £ 113755.78
##   NumOfProducts HasCrCard IsActiveMember EstimatedSalary Exited
## 1             1         1              1     £ 101348.88      1
## 2             1         0              1     £ 112542.58      0
## 3             3         1              0     £ 113931.57      1
## 4             2         0              0      £ 93826.63      0
## 5             1         1              1       £ 79084.1      0
## 6             2         1              0     £ 149756.71      1
# Remove £ from all columns
bank <- bank %>%
  mutate(across(everything(), ~ str_remove(., "£")))

# View the cleaned DataFrame
print(head(bank))
##   CustomerId  Surname CreditScore Geography Gender Age Tenure    Balance
## 1   15634602 HARGRAVE         500         1 Female  42      2       5000
## 2   15647311     hill        1500         3 Female  41      1    9999999
## 3   15619304     ONIO        2000         1 Female  42      8        300
## 4   15701354     boni         100         1 Female  39      1     888888
## 5   15737888 MITCHELL        9999         3 Female  43      2      1e+09
## 6   15574012      chu         645         3   Male  44      8  113755.78
##   NumOfProducts HasCrCard IsActiveMember EstimatedSalary Exited
## 1             1         1              1       101348.88      1
## 2             1         0              1       112542.58      0
## 3             3         1              0       113931.57      1
## 4             2         0              0        93826.63      0
## 5             1         1              1         79084.1      0
## 6             2         1              0       149756.71      1
# checking data types

glimpse(bank)
## Rows: 10,005
## Columns: 13
## $ CustomerId      <chr> "15634602", "15647311", "15619304", "15701354", "15737…
## $ Surname         <chr> "HARGRAVE", "hill", "ONIO", "boni", "MITCHELL", "chu",…
## $ CreditScore     <chr> "500", "1500", "2000", "100", "9999", "645", "822", "3…
## $ Geography       <chr> "1", "3", "1", "1", "3", "3", "1", "2", "1", "1", "1",…
## $ Gender          <chr> "Female", "Female", "Female", "Female", "Female", "Mal…
## $ Age             <chr> "42", "41", "42", "39", "43", "44", "50", "29", "44", …
## $ Tenure          <chr> "2", "1", "8", "1", "2", "8", "7", "4", "4", NA, "6", …
## $ Balance         <chr> " 5000", " 9999999", " 300", " 888888", " 1e+09", " 11…
## $ NumOfProducts   <chr> "1", "1", "3", "2", "1", "2", "2", "4", "2", "1", "2",…
## $ HasCrCard       <chr> "1", "0", "1", "0", "1", "1", "1", "1", "0", "1", "0",…
## $ IsActiveMember  <chr> "1", "1", "0", "0", "1", "0", "1", "0", "1", "1", "0",…
## $ EstimatedSalary <chr> " 101348.88", " 112542.58", " 113931.57", " 93826.63",…
## $ Exited          <chr> "1", "0", "1", "0", "0", "1", "0", "1", "0", "0", "0",…
#enforcing correct data types 
colnames(bank)
##  [1] "CustomerId"      "Surname"         "CreditScore"     "Geography"      
##  [5] "Gender"          "Age"             "Tenure"          "Balance"        
##  [9] "NumOfProducts"   "HasCrCard"       "IsActiveMember"  "EstimatedSalary"
## [13] "Exited"
# Enforce correct data types
bank <- bank %>%
  mutate(
    CustomerId = as.character(CustomerId),       # CustomerId as character
    Surname = as.character(Surname),             # Surname as character
    CreditScore = as.integer(CreditScore),       # CreditScore as integer
    Geography = as.factor(Geography),            # Geography as factor
    Gender = as.factor(Gender),                  # Gender as factor
    Age = as.numeric(Age),                       # Age as integer
    Tenure = as.integer(Tenure),                 # Tenure as integer
    Balance = as.numeric(Balance),               # Balance as numeric
    NumOfProducts = as.integer(NumOfProducts),   # NumOfProducts as integer
    HasCrCard = as.factor(HasCrCard),           # HasCrCard as logical (1/0 -> TRUE/FALSE)
    IsActiveMember = as.factor(IsActiveMember), # IsActiveMember as logical (1/0 -> TRUE/FALSE)
    EstimatedSalary = round(as.numeric(EstimatedSalary),2), # EstimatedSalary as numeric
    Exited = as.factor(Exited)                  # Exited as logical (1/0 -> TRUE/FALSE)
  )

# View structure of the updated dataset
str(bank)
## 'data.frame':    10005 obs. of  13 variables:
##  $ CustomerId     : chr  "15634602" "15647311" "15619304" "15701354" ...
##  $ Surname        : chr  "HARGRAVE" "hill" "ONIO" "boni" ...
##  $ CreditScore    : int  500 1500 2000 100 9999 645 822 376 501 684 ...
##  $ Geography      : Factor w/ 3 levels "1","2","3": 1 3 1 1 3 3 1 2 1 1 ...
##  $ Gender         : Factor w/ 2 levels "Female","Male": 1 1 1 1 1 2 2 1 2 2 ...
##  $ Age            : num  42 41 42 39 43 44 50 29 44 27 ...
##  $ Tenure         : int  2 1 8 1 2 8 7 4 4 NA ...
##  $ Balance        : num  5.00e+03 1.00e+07 3.00e+02 8.89e+05 1.00e+09 ...
##  $ NumOfProducts  : int  1 1 3 2 1 2 2 4 2 1 ...
##  $ HasCrCard      : Factor w/ 2 levels "0","1": 2 1 2 1 2 2 2 2 1 2 ...
##  $ IsActiveMember : Factor w/ 2 levels "0","1": 2 2 1 1 2 1 2 1 2 2 ...
##  $ EstimatedSalary: num  101349 112543 113932 93827 79084 ...
##  $ Exited         : Factor w/ 2 levels "0","1": 2 1 2 1 1 2 1 2 1 1 ...
head(bank)
##   CustomerId  Surname CreditScore Geography Gender Age Tenure      Balance
## 1   15634602 HARGRAVE         500         1 Female  42      2       5000.0
## 2   15647311     hill        1500         3 Female  41      1    9999999.0
## 3   15619304     ONIO        2000         1 Female  42      8        300.0
## 4   15701354     boni         100         1 Female  39      1     888888.0
## 5   15737888 MITCHELL        9999         3 Female  43      2 1000000000.0
## 6   15574012      chu         645         3   Male  44      8     113755.8
##   NumOfProducts HasCrCard IsActiveMember EstimatedSalary Exited
## 1             1         1              1       101348.88      1
## 2             1         0              1       112542.58      0
## 3             3         1              0       113931.57      1
## 4             2         0              0        93826.63      0
## 5             1         1              1        79084.10      0
## 6             2         1              0       149756.71      1
#checking missing values 
#--is.na(bank)



# Calculate percentage of missing values for each column
# sapply(bank, function(col) {
#   mean(is.na(col)) * 100
# })
# statistical distribution 

summary (bank)
##   CustomerId          Surname           CreditScore     Geography    Gender    
##  Length:10005       Length:10005       Min.   : 100.0   1:5017    Female:4548  
##  Class :character   Class :character   1st Qu.: 584.0   2:2509    Male  :5457  
##  Mode  :character   Mode  :character   Median : 652.0   3:2479                 
##                                        Mean   : 652.7                          
##                                        3rd Qu.: 718.0                          
##                                        Max.   :9999.0                          
##                                                                                
##       Age            Tenure          Balance          NumOfProducts  HasCrCard
##  Min.   :18.00   Min.   : 0.000   Min.   :0.000e+00   Min.   :1.00   0:2947   
##  1st Qu.:32.00   1st Qu.: 3.000   1st Qu.:0.000e+00   1st Qu.:1.00   1:7058   
##  Median :37.00   Median : 5.000   Median :9.724e+04   Median :1.00            
##  Mean   :38.92   Mean   : 5.012   Mean   :2.785e+05   Mean   :1.53            
##  3rd Qu.:44.00   3rd Qu.: 7.000   3rd Qu.:1.277e+05   3rd Qu.:2.00            
##  Max.   :92.00   Max.   :10.000   Max.   :1.000e+09   Max.   :4.00            
##                  NA's   :3                                                    
##  IsActiveMember EstimatedSalary     Exited  
##  0:4851         Min.   :    11.58   0:7966  
##  1:5154         1st Qu.: 51012.47   1:2039  
##                 Median :100193.91           
##                 Mean   :100080.74           
##                 3rd Qu.:149378.72           
##                 Max.   :199992.48           
##                 NA's   :3
  cleaned_bank <- bank
# Load necessary libraries
library(dplyr)
library(summarytools)

# 1. Summary Statistics for Numeric Columns
numeric_summary <- bank %>%
  select(CreditScore, Age, Tenure, Balance, NumOfProducts, EstimatedSalary) %>%
  summary()
# Print Results
cat("\n1. Summary Statistics for Numeric Columns:\n")
## 
## 1. Summary Statistics for Numeric Columns:
print(numeric_summary)
##   CreditScore          Age            Tenure          Balance         
##  Min.   : 100.0   Min.   :18.00   Min.   : 0.000   Min.   :0.000e+00  
##  1st Qu.: 584.0   1st Qu.:32.00   1st Qu.: 3.000   1st Qu.:0.000e+00  
##  Median : 652.0   Median :37.00   Median : 5.000   Median :9.724e+04  
##  Mean   : 652.7   Mean   :38.92   Mean   : 5.012   Mean   :2.785e+05  
##  3rd Qu.: 718.0   3rd Qu.:44.00   3rd Qu.: 7.000   3rd Qu.:1.277e+05  
##  Max.   :9999.0   Max.   :92.00   Max.   :10.000   Max.   :1.000e+09  
##                                   NA's   :3                           
##  NumOfProducts  EstimatedSalary    
##  Min.   :1.00   Min.   :    11.58  
##  1st Qu.:1.00   1st Qu.: 51012.47  
##  Median :1.00   Median :100193.91  
##  Mean   :1.53   Mean   :100080.74  
##  3rd Qu.:2.00   3rd Qu.:149378.72  
##  Max.   :4.00   Max.   :199992.48  
##                 NA's   :3
# 2. Frequency Counts for Categorical Variables
categorical_summary <- bank %>%
  select(Geography, Gender, HasCrCard, IsActiveMember, Exited) %>%
  summarytools::freq()

cat("\n2. Frequency Counts for Categorical Variables:\n")
## 
## 2. Frequency Counts for Categorical Variables:
print(categorical_summary)
## Frequencies  
## bank$Geography  
## Type: Factor  
## 
##                Freq   % Valid   % Valid Cum.   % Total   % Total Cum.
## ----------- ------- --------- -------------- --------- --------------
##           1    5017     50.14          50.14     50.14          50.14
##           2    2509     25.08          75.22     25.08          75.22
##           3    2479     24.78         100.00     24.78         100.00
##        <NA>       0                               0.00         100.00
##       Total   10005    100.00         100.00    100.00         100.00
## 
## bank$Gender  
## Type: Factor  
## 
##                 Freq   % Valid   % Valid Cum.   % Total   % Total Cum.
## ------------ ------- --------- -------------- --------- --------------
##       Female    4548     45.46          45.46     45.46          45.46
##         Male    5457     54.54         100.00     54.54         100.00
##         <NA>       0                               0.00         100.00
##        Total   10005    100.00         100.00    100.00         100.00
## 
## bank$HasCrCard  
## Type: Factor  
## 
##                Freq   % Valid   % Valid Cum.   % Total   % Total Cum.
## ----------- ------- --------- -------------- --------- --------------
##           0    2947     29.46          29.46     29.46          29.46
##           1    7058     70.54         100.00     70.54         100.00
##        <NA>       0                               0.00         100.00
##       Total   10005    100.00         100.00    100.00         100.00
## 
## bank$IsActiveMember  
## Type: Factor  
## 
##                Freq   % Valid   % Valid Cum.   % Total   % Total Cum.
## ----------- ------- --------- -------------- --------- --------------
##           0    4851     48.49          48.49     48.49          48.49
##           1    5154     51.51         100.00     51.51         100.00
##        <NA>       0                               0.00         100.00
##       Total   10005    100.00         100.00    100.00         100.00
## 
## bank$Exited  
## Type: Factor  
## 
##                Freq   % Valid   % Valid Cum.   % Total   % Total Cum.
## ----------- ------- --------- -------------- --------- --------------
##           0    7966     79.62          79.62     79.62          79.62
##           1    2039     20.38         100.00     20.38         100.00
##        <NA>       0                               0.00         100.00
##       Total   10005    100.00         100.00    100.00         100.00
# 4. Grouped Analysis: Average Credit Score and Balance by Geography and Gender
grouped_analysis <- bank %>%
  group_by(Geography, Gender) %>%
  summarise(
    AvgCreditScore = mean(CreditScore, na.rm = TRUE),
    AvgBalance = mean(Balance, na.rm = TRUE),
    AvgEstimatedSalary = mean(EstimatedSalary, na.rm = TRUE)
  )

cat("\n4. Grouped Analysis (Average Metrics by Geography and Gender):\n")
## 
## 4. Grouped Analysis (Average Metrics by Geography and Gender):
print(grouped_analysis)
## # A tibble: 6 × 5
## # Groups:   Geography [3]
##   Geography Gender AvgCreditScore AvgBalance AvgEstimatedSalary
##   <fct>     <fct>           <dbl>      <dbl>              <dbl>
## 1 1         Female           650.     60962.             99530.
## 2 1         Male             650.     63546.            100174.
## 3 2         Female           653.    119146.            102446.
## 4 2         Male             650.    120260.             99905.
## 5 3         Female           670.   1911073.            100719.
## 6 3         Male             651.     63353.             98426.
# 5. Correlation Analysis for Numeric Variables
numeric_correlation <- bank %>%
  select(CreditScore, Age, Tenure, Balance, NumOfProducts, EstimatedSalary) %>%
  cor(use = "complete.obs")


cat("\n3. Correlation Matrix for Numeric Variables:\n")
## 
## 3. Correlation Matrix for Numeric Variables:
print(numeric_correlation)
##                   CreditScore          Age       Tenure      Balance
## CreditScore      1.0000000000  0.002683752 -0.009997899  0.799652586
## Age              0.0026837519  1.000000000 -0.010145893  0.005643235
## Tenure          -0.0099978993 -0.010145893  1.000000000 -0.014992810
## Balance          0.7996525855  0.005643235 -0.014992810  1.000000000
## NumOfProducts    0.0001141537 -0.030498650  0.013755324 -0.014349903
## EstimatedSalary -0.0044682344 -0.007391629  0.007607030 -0.005081936
##                 NumOfProducts EstimatedSalary
## CreditScore      0.0001141537    -0.004468234
## Age             -0.0304986501    -0.007391629
## Tenure           0.0137553244     0.007607030
## Balance         -0.0143499035    -0.005081936
## NumOfProducts    1.0000000000     0.014479221
## EstimatedSalary  0.0144792212     1.000000000
head(bank)
##   CustomerId  Surname CreditScore Geography Gender Age Tenure      Balance
## 1   15634602 HARGRAVE         500         1 Female  42      2       5000.0
## 2   15647311     hill        1500         3 Female  41      1    9999999.0
## 3   15619304     ONIO        2000         1 Female  42      8        300.0
## 4   15701354     boni         100         1 Female  39      1     888888.0
## 5   15737888 MITCHELL        9999         3 Female  43      2 1000000000.0
## 6   15574012      chu         645         3   Male  44      8     113755.8
##   NumOfProducts HasCrCard IsActiveMember EstimatedSalary Exited
## 1             1         1              1       101348.88      1
## 2             1         0              1       112542.58      0
## 3             3         1              0       113931.57      1
## 4             2         0              0        93826.63      0
## 5             1         1              1        79084.10      0
## 6             2         1              0       149756.71      1

checking variables whichare highly correlated

# Load necessary libraries
#install.packages("ggcorrplot")
library(ggcorrplot)
library(dplyr)

# Select only numeric variables
numeric_data <- bank %>%
  select_if(is.numeric)

# Compute the correlation matrix
cor_matrix <- cor(numeric_data, use = "complete.obs")

# Visualize the correlation matrix
ggcorrplot(
  cor_matrix,
  method = "circle",        # Type of plot (circle, square, etc.)
  type = "upper",           # Show only the upper triangle
  lab = TRUE,               # Add correlation coefficients
  lab_size = 3,             # Size of labels
  colors = c("red", "white", "blue"), # Gradient colors
  title = "Correlation Matrix",
  ggtheme = theme_minimal() # Clean theme
)

#Scatter Plot of Balance and Credit Score

# Load ggplot2 library
library(ggplot2)

# Create the scatter plot with trend line
ggplot(bank, aes(x = `CreditScore`, y = `Balance`)) +
  geom_point(color = "blue", alpha = 0.6) +
  geom_smooth(method = "lm", color = "red", se = TRUE, linetype = "dashed") + # Add trend line
  scale_x_continuous(limits = c(0, 900), expand = c(0, 0)) + # X-axis limits
  scale_y_continuous(limits = c(0, 250000), expand = c(0, 0)) + # Y-axis limits
  labs(
    title = "Scatter Plot of Balance vs Credit Score with Trend Line",
    x = "Credit Score",
    y = "Balance"
  ) +
  theme_minimal() +
  theme(
    plot.title = element_text(size = 14, face = "bold"),
    axis.title = element_text(size = 12)
  )

# Load ggplot2 library
library(ggplot2)

# Create the scatter plot with trend line
ggplot(bank, aes(x = Age, y = as.numeric(Exited))) +
  geom_point(color = "blue", alpha = 0.6) +
  geom_smooth(method = "lm", color = "red", se = TRUE, linetype = "dashed") + # Add trend line
  scale_x_continuous(limits = c(min(bank$Age, na.rm = TRUE), max(bank$Age, na.rm = TRUE)), expand = c(0, 0)) +
  scale_y_continuous(limits = c(0, 1), breaks = c(0, 1), labels = c("No Churn", "Churn")) +
  labs(
    title = "Scatter Plot of Age vs Exited with Trend Line",
    x = "Age",
    y = "Exited (Churn Status)"
  ) +
  theme_minimal() +
  theme(
    plot.title = element_text(size = 14, face = "bold"),
    axis.title = element_text(size = 12),
    axis.text.y = element_text(size = 10)
  )

Demographic Analysis

Age Distribution:

Histogram or density plot to visualize the age distribution of customers. Plot a histogram of age with an overlay of churn

# Load necessary library
library(ggplot2)

# Plot the histogram with churn overlay
ggplot(bank, aes(x = Age, fill = as.factor(Exited))) +
  geom_histogram(binwidth = 5, alpha = 0.7, position = "identity") +
  scale_fill_manual(values = c("0" = "lightblue", "1" = "red"), labels = c("No Churn", "Churn")) +
  labs(
    title = "Age Distribution with Churn Overlay",
    x = "Age",
    y = "Frequency",
    fill = "Churn Status"
  ) +
  theme_minimal() +
  theme(
    plot.title = element_text(size = 14, face = "bold"),
    axis.title = element_text(size = 12),
    legend.title = element_text(size = 12, face = "bold"),
    legend.text = element_text(size = 10)
  )

Age Distribution:

density plot to visualize the age distribution of customers. Plot a histogram of age with an overlay of churn

# Load ggplot2 library
library(ggplot2)

# Create the density plot with churn overlay
ggplot(bank, aes(x = Age, fill = as.factor(Exited))) +
  geom_density(alpha = 0.7) +
  scale_fill_manual(values = c("0" = "lightblue", "1" = "red"), labels = c("No Churn", "Churn")) +
  labs(
    title = "Age Distribution with Churn Overlay",
    x = "Age",
    y = "Density",
    fill = "Churn Status"
  ) +
  theme_minimal() +
  theme(
    plot.title = element_text(size = 14, face = "bold"),
    axis.title = element_text(size = 12),
    legend.title = element_text(size = 12, face = "bold"),
    legend.text = element_text(size = 10)
  )

# Load ggplot2 library
library(ggplot2)

# Create the box plot
ggplot(bank, aes(x = as.factor(Exited), y = Age, fill = as.factor(Exited))) +
  geom_boxplot(alpha = 0.8) +
  scale_fill_manual(
    values = c("0" = "blue", "1" = "red"),
    labels = c("No Churn", "Churn")
  ) +
  labs(
    title = "Age Distribution by Churn Status",
    x = "Churn Status",
    y = "Age",
    fill = "Churn Status"
  ) +
  theme_minimal() +
  theme(
    plot.title = element_text(size = 14, face = "bold"),
    axis.title = element_text(size = 12),
    legend.title = element_text(size = 12),
    legend.text = element_text(size = 10),
    axis.text.x = element_text(size = 12, face = "bold")
  )

Gender Analysis:

Bar plot of churn rates by gender to show whether churn varies by gender. Bar plot of gender grouped by churn status.

# Load necessary library
library(ggplot2)

# Plot the bar chart with totals and y-axis scale limit
ggplot(bank, aes(x = Gender, fill = as.factor(Exited))) +
  geom_bar(position = "dodge", alpha = 0.8) +
  geom_text(
    stat = "count",
    aes(label = ..count..),
    position = position_dodge(width = 0.9),
    vjust = -0.5, 
    size = 4
  ) +
  scale_fill_manual(values = c("0" = "lightblue", "1" = "red"), labels = c("No Churn", "Churn")) +
  scale_y_continuous(limits = c(0, 6000), expand = c(0, 0)) + # Set y-axis limits to 5000
  labs(
    title = "Bar Plot of Gender Grouped by Churn Status",
    x = "Gender",
    y = "Count",
    fill = "Churn Status"
  ) +
  theme_minimal() +
  theme(
    plot.title = element_text(size = 14, face = "bold"),
    axis.title = element_text(size = 12),
    legend.title = element_text(size = 12, face = "bold"),
    legend.text = element_text(size = 10)
  )

# Load necessary libraries
library(ggplot2)
library(dplyr)

# Calculate churn rate by gender
churn_rate_by_gender <- bank %>%
  group_by(Gender) %>%
  summarise(
    Churned = sum(Exited == 1),  # Count churned customers
    Total = n(),                 # Total customers
    ChurnRate = round((Churned / Total) * 100, 1) # Calculate churn rate as percentage
  )

# Create the column chart with total count and limited y-axis
ggplot(churn_rate_by_gender, aes(x = Gender, y = Churned, fill = Gender)) +
  geom_col(width = 0.7, alpha = 0.8) +
  scale_fill_manual(values = c("Male" = "blue", "Female" = "pink")) + # Custom colors
  geom_text(aes(label = paste0(Churned, " ")), 
            vjust = -0.5, size = 5, color = "black") + # Add total count of churned
  scale_y_continuous(limits = c(0, 1500), expand = c(0, 0)) + # Set y-axis limits
  labs(
    title = "Total Count of Churn by Gender",
    x = "Gender",
    y = "Total Churned",
    fill = "Gender"
  ) +
  theme_minimal() +
  theme(
    plot.title = element_text(size = 14, face = "bold"),
    axis.title = element_text(size = 12),
    legend.title = element_text(size = 12),
    legend.text = element_text(size = 10)
  )

Nationality Analysis: Stacked bar plot of churn rates by nationality. Nationality vs. churn rate

# Load ggplot2 library
library(ggplot2)

# Create the bar plot
ggplot(bank, aes(x = Geography, fill = as.factor(Exited))) + 
  geom_bar(position = "fill", alpha = 0.8) +
  scale_fill_manual(
    values = c("0" = "lightblue", "1" = "red"),  # Custom colors for churn status
    labels = c("No Churn", "Churn")         # Legend labels
  ) +
  labs(
    title = "Churn Rates by Nationality",
    x = "Nationality",
    y = "Proportion",
    fill = "Churned"
  ) +
  theme_minimal() +
  theme(
    plot.title = element_text(size = 14, face = "bold"),
    axis.title = element_text(size = 12),
    legend.title = element_text(size = 12, face = "bold"),
    legend.text = element_text(size = 10)
  )

Comparative Analysis by Nationality

Objective: Compare customer behavior (e.g., balance, credit score, number of products) between customers. Boxplots: Use boxplots to compare distributions of numerical variables (e.g., balance, credit score, number of products) across nationalities.

# Create the boxplot with y-axis scaled to 300,000
ggplot(bank, aes(x = Geography, y = Balance, fill = Geography)) +
  geom_boxplot(alpha = 0.8) +
  scale_fill_manual(
    values = c("1" = "#1f77b4", "2" = "#ff7f0e", "3" = "#2ca02c")
  ) +
  scale_y_continuous(limits = c(0, 300000), expand = c(0, 0)) + # Maintain scale but include outliers
  labs(
    title = "Balance by Nationality",
    x = "Nationality",
    y = "Balance",
    fill = "Geography"
  ) +
  theme_minimal() +
  theme(
    plot.title = element_text(size = 14, face = "bold"),
    axis.title = element_text(size = 12),
    axis.text.x = element_text(size = 10, face = "bold"),
    legend.position = "none" # Remove legend
  )

ggplot(bank, aes(x = as.factor(Exited), y = Balance, fill = as.factor(Exited))) +
  geom_boxplot(alpha = 0.8) +
  scale_fill_manual(
    values = c("0" = "lightblue", "1" = "red"),
    labels = c("No Churn", "Churn")
  ) +
  labs(
    title = "Balance by Churn Status",
    x = "Churn Status",
    y = "Balance",
    fill = "Churned"
  ) +
  scale_y_continuous(limits = c(0, 300000), expand = c(0, 0)) + # Maintain scale but include outliers
  theme_minimal() +
  theme(
    plot.title = element_text(size = 14, face = "bold"),
    axis.title = element_text(size = 12),
    legend.position = "none", # Optional: Remove legend if unnecessary
    axis.text.x = element_text(size = 12, face = "bold")
  )

#account balance by credit card ownership

ggplot(bank, aes(x = as.factor(HasCrCard), y = Balance, fill = as.factor(HasCrCard))) +
  geom_boxplot(alpha = 0.8) +
  scale_fill_manual(
    values = c("0" = "lightblue", "1" = "red"),
    labels = c("No Churn", "Churn")
  ) +
  labs(
    title = "Balance by Credit Card Ownership",
    x = "Credit Card Ownership",
    y = "Balance",
    fill = "HasCrCard"
  ) +
  scale_y_continuous(limits = c(0, 200000), expand = c(0, 0)) + # Maintain scale but include outliers
  theme_minimal() +
  theme(
    plot.title = element_text(size = 14, face = "bold"),
    axis.title = element_text(size = 12),
    legend.position = "none", # Optional: Remove legend if unnecessary
    axis.text.x = element_text(size = 12, face = "bold")
  )

Violin Plot or Strip Plot: To visualize the spread of credit scores across nationalities.

ggplot(bank, aes(x = Geography, y = CreditScore, fill = Geography)) +
  geom_violin(alpha = 0.7) +
  labs(title = "Credit Score by Nationality", x = "Nationality", y = "Credit Score") +
  theme_minimal()

Heatmaps or Faceted Density Plots:

Show how multiple variables interact across nationalities.

ggplot(bank, aes(x = CreditScore, y = Balance, color = Geography)) +
  geom_point(alpha = 0.5) +
  facet_wrap(~Geography) +
  labs(title = "Credit Score vs Balance by Nationality", x = "Credit Score", y = "Balance") +
  theme_minimal()

3. Customer Segmentation Objective: Identify customer segments and their characteristics. Graphs to Plot: Cluster Plot:

Use K-means or hierarchical clustering to group customers based on variables like balance, credit score, and age. Visualize clusters using scatterplots.

ggplot(bank, aes(x = CreditScore, y = Balance, color = "")) +
  geom_point(alpha = 0.7) +
  labs(title = "Customer Segmentation Based on Credit Score and Balance", x = "Credit Score", y = "Balance") +
  theme_minimal()

# Load necessary libraries
library(ggplot2)
library(dplyr)

# Calculate percentages for each Exited category
pie_data <- bank %>%
  group_by(Exited) %>%
  summarise(Count = n()) %>%
  mutate(Percentage = round((Count / sum(Count)) * 100, 1)) # Calculate percentages

# Create the pie chart
ggplot(pie_data, aes(x = "", y = Percentage, fill = as.factor(Exited))) +
  geom_bar(stat = "identity", width = 1, color = "white") +
  coord_polar(theta = "y") +
  scale_fill_manual(
    values = c("0" = "lightblue", "1" = "red"),
    labels = c("No Churn", "Churn")
  ) +
  labs(
    title = "Churn Status Distribution",
    fill = "Churn Status"
  ) +
  theme_minimal() +
  theme(
    axis.title = element_blank(),
    axis.text = element_blank(),
    axis.ticks = element_blank(),
    plot.title = element_text(size = 14, face = "bold"),
    legend.title = element_text(size = 12),
    legend.text = element_text(size = 10)
  ) +
  geom_text(aes(label = paste0(Percentage, "%")), 
            position = position_stack(vjust = 0.5), size = 5)

#Pie Chart of Churn Rate by Gender

# Load necessary libraries
library(ggplot2)
library(dplyr)

# Calculate churn rate by gender
churn_rate_by_gender <- bank %>%
  group_by(Gender) %>%
  summarise(
    Churned = sum(Exited == 1),  # Count churned customers
    Total = n(),                 # Total customers
    ChurnRate = round((Churned / Total) * 100, 1) # Calculate churn rate as percentage
  )

# Create a pie chart
ggplot(churn_rate_by_gender, aes(x = "", y = ChurnRate, fill = Gender)) +
  geom_bar(stat = "identity", width = 1, color = "white") +
  coord_polar(theta = "y") +
  scale_fill_manual(values = c("Male" = "blue", "Female" = "pink")) + # Custom colors
  labs(
    title = "Churn Rate by Gender",
    fill = "Gender"
  ) +
  theme_minimal() +
  theme(
    axis.title = element_blank(),
    axis.text = element_blank(),
    axis.ticks = element_blank(),
    plot.title = element_text(size = 14, face = "bold"),
    legend.title = element_text(size = 12),
    legend.text = element_text(size = 10)
  ) +
  geom_text(aes(label = paste0(ChurnRate, "%")), 
            position = position_stack(vjust = 0.5), size = 5)

Exited by Geography

# Load necessary libraries
library(ggplot2)
library(dplyr)

# Prepare data
clustered_data <- bank %>%
  group_by(Geography, Exited) %>%
  summarise(Count = n(), .groups = "drop") # Count the number of customers in each group

# Create the clustered column chart
ggplot(clustered_data, aes(x = Geography, y = Count, fill = as.factor(Exited))) +
  geom_bar(stat = "identity", position = "dodge", alpha = 0.8) +
  scale_fill_manual(
    values = c("0" = "lightblue", "1" = "red"),
    labels = c("No Churn", "Churn")
  ) +
  labs(
    title = "Churn Status by Geography",
    x = "Geography",
    y = "Customer Count",
    fill = "Churn Status"
  ) +
  theme_minimal() +
  theme(
    plot.title = element_text(size = 14, face = "bold"),
    axis.title = element_text(size = 12),
    legend.title = element_text(size = 12, face = "bold"),
    legend.text = element_text(size = 10),
    axis.text.x = element_text(size = 10, face = "bold")
  )

Exited by Credit Card Status

# Load necessary library
library(ggplot2)

# Create the clustered column chart
ggplot(bank, aes(x = as.factor(HasCrCard), fill = as.factor(Exited))) +
  geom_bar(position = "dodge", alpha = 0.8) +
  scale_fill_manual(
    values = c("0" = "lightblue", "1" = "red"),
    labels = c("No Churn", "Churn")
  ) +
  labs(
    title = "Churn Status by Credit Card Ownership",
    x = "Has Credit Card",
    y = "Count",
    fill = "Churn Status"
  ) +
  theme_minimal() +
  theme(
    plot.title = element_text(size = 14, face = "bold"),
    axis.title = element_text(size = 12),
    axis.text.x = element_text(size = 10, face = "bold"),
    legend.title = element_text(size = 12),
    legend.text = element_text(size = 10)
  )

#Exited by Activity Status

# Load necessary libraries
library(ggplot2)
library(dplyr)

# Create the clustered column chart
ggplot(bank, aes(x = as.factor(IsActiveMember), fill = as.factor(Exited))) +
  geom_bar(position = "dodge", alpha = 0.8) +
  scale_fill_manual(
    values = c("0" = "lightblue", "1" = "red"),
    labels = c("No Churn", "Churn")
  ) +
  labs(
    title = "Churn Status by Activity Status",
    x = "Activity Status (Active Member)",
    y = "Count",
    fill = "Churn Status"
  ) +
  theme_minimal() +
  theme(
    plot.title = element_text(size = 14, face = "bold"),
    axis.title = element_text(size = 12),
    legend.title = element_text(size = 12),
    legend.text = element_text(size = 10),
    axis.text.x = element_text(size = 12, face = "bold")
  )

# 
# #save as CSV
# # Specify the file path and name
# output_bank <- "cleaned_bank.csv"
# 
# # Save the final_data dataset to a CSV file
# write.csv(cleaned_bank, file = output_file, row.names = FALSE)
# 
# # Print a confirmation message
# cat("The processed final_data has been saved to:", output_bank, "\n")
# 
# #save as excel workbook
# # Load the openxlsx library
# library(openxlsx)
# 
# # Specify the file path and name for Excel
# output_bank <- "cleaned_bank.xlsx"
# 
# # Save the final_data dataset to an Excel file
# write.xlsx(cleaned_bank, file = output_bank)
# 
# # Print a confirmation message
# cat("The processed final_data has been saved to:", output_bank, "\n")