Description

This document presents bank churn prediction using the Random Forest algorithm. The dataset contains details of a bank’s customers, and the target variable is a binary variable indicating whether the customer left the bank (closed their account) or continues to be a customer.

The dataset link: Here

1. Data Extraction

Import Libraries

library(ggplot2)

Read Bank Churn datasets

# Load the raw churn table from the CSV file and print its column structure.
df <- read.csv(file = "Churn Modeling.csv")
str(object = df)
## 'data.frame':    10000 obs. of  14 variables:
##  $ RowNumber      : int  1 2 3 4 5 6 7 8 9 10 ...
##  $ CustomerId     : int  15634602 15647311 15619304 15701354 15737888 15574012 15592531 15656148 15792365 15592389 ...
##  $ Surname        : chr  "Hargrave" "Hill" "Onio" "Boni" ...
##  $ CreditScore    : int  619 608 502 699 850 645 822 376 501 684 ...
##  $ Geography      : chr  "France" "Spain" "France" "France" ...
##  $ Gender         : chr  "Female" "Female" "Female" "Female" ...
##  $ Age            : int  42 41 42 39 43 44 50 29 44 27 ...
##  $ Tenure         : int  2 1 8 1 2 8 7 4 4 2 ...
##  $ Balance        : num  0 83808 159661 0 125511 ...
##  $ NumOfProducts  : int  1 1 3 2 1 2 2 4 2 1 ...
##  $ HasCrCard      : int  1 0 1 0 1 1 1 1 0 1 ...
##  $ IsActiveMember : int  1 1 0 0 1 0 1 0 1 1 ...
##  $ EstimatedSalary: num  101349 112543 113932 93827 79084 ...
##  $ Exited         : int  1 0 1 0 0 1 0 1 0 0 ...
# Convert the target and the categorical predictors to factors in one pass.
categorical_cols <- c("Exited", "HasCrCard", "IsActiveMember",
                      "Geography", "Gender")
df[categorical_cols] <- lapply(df[categorical_cols], factor)

This dataset contains 10000 rows and 14 columns. The target variable is Exited.

2. Exploratory Data Analysis

2.1 Plot Distribution of Exited VS Gender (Bar)

# Stacked bar chart of customer counts per gender, filled by churn status.
# `Exited` is given as a bare column name so ggplot2 looks it up inside
# `data`; writing `df$Exited` bypasses that lookup and titles the legend
# "df$Exited".
ggplot(data = df,
       aes(x = Gender,
           fill = Exited)) +
  geom_bar()

2.2 Plot Distribution of Exited VS HasCrCard (Bar)

# Stacked bar chart of customer counts by credit-card ownership, filled by
# churn status. HasCrCard is a factor with levels "0" and "1", so the breaks
# are given as character values that match those levels exactly instead of
# relying on numeric-to-character coercion.
ggplot(data = df,
       aes(x = HasCrCard,
           fill = Exited)) +
  geom_bar() +
  scale_x_discrete(breaks = c("0", "1"),
                   labels = c("Not Have",
                              "Have"))

2.3 Plot Distribution of Exited VS Geography (Bar)

# Stacked bar chart of customer counts per country, filled by churn status.
ggplot(df, aes(Geography, fill = Exited)) +
  geom_bar()

2.4 Plot Distribution of Exited VS Tenure (Bar)

# Stacked bar chart of customer counts per tenure value, filled by churn
# status.
ggplot(df, aes(Tenure, fill = Exited)) +
  geom_bar()

2.5 Plot Distribution of Exited VS Active Member (Bar)

# Stacked bar chart of customer counts by active-member flag, filled by
# churn status.
ggplot(
  data = df,
  mapping = aes(x = IsActiveMember, fill = Exited)
) +
  geom_bar()

2.6 Plot Distribution of Exited VS Age (Boxplot)

# Box plots of customer age, one box per churn status (distinguished by
# fill colour).
ggplot(
  data = df,
  mapping = aes(y = Age, fill = Exited)
) +
  geom_boxplot()

3.Data Preprocessing

3.1 Data Cleansing

Remove the unused columns RowNumber, CustomerId, and Surname

# Drop the identifier columns by name rather than by position, so the
# selection does not silently change if the CSV column order changes.
id_cols <- c("RowNumber", "CustomerId", "Surname")
churn_df <- df[, setdiff(names(df), id_cols), drop = FALSE]

3.2 Imbalance Data Handling

library(ROSE)
## Loaded ROSE 0.0-4
# Fix the RNG state, then count the customers in each churn class.
set.seed(seed = 2022)
table(churn_df[["Exited"]])
## 
##    0    1 
## 7963 2037

There are 7963 Not Exited customers and 2037 Exited customers. The data is imbalanced, so imbalanced-data handling is needed.

# method = "both" combines under-sampling and over-sampling; the resampled
# data frame is stored in the `data` element of the result.
rose_result <- ovun.sample(
  Exited ~ .,
  data = churn_df,
  method = "both",
  p = 0.5,
  N = 10000,
  seed = 1
)
data_balanced <- rose_result$data
# Class counts after resampling
table(data_balanced[["Exited"]])
## 
##    0    1 
## 5047 4953

3.3 Feature Extraction

One Hot Encoding for Geography and Gender features

# Everything except the two multi-level categorical columns, which are
# encoded separately.
balanced_num <- data_balanced
balanced_num[c("Geography", "Gender")] <- NULL
### 1. Single-column data frames for Geography and Gender
balanced_geo <- data_balanced[["Geography"]]
Geography_df <- data.frame(Geography = balanced_geo)

balanced_Gender <- data_balanced[["Gender"]]
Gender_df <- data.frame(Gender = balanced_Gender)

### 2. one hot encoding Geography and Gender dataframe
library(caret)
## Loading required package: lattice
# Build a dummy-variable encoder for each categorical frame and apply it to
# that same frame to obtain the indicator columns.
df1 <- dummyVars(formula = ~ ., data = Geography_df)
df2 <- data.frame(predict(object = df1, newdata = Geography_df))

df3 <- dummyVars(formula = ~ ., data = Gender_df)
df4 <- data.frame(predict(object = df3, newdata = Gender_df))

# Indicator columns first, followed by the remaining columns.
balanced_df <- cbind(df2, df4, balanced_num)

### 3. Training and Testing Split
# Randomly assign 85% of the rows to training; the rest form the test set.
# NOTE(review): balanced_df was resampled with ovun.sample(method = "both")
# before this split. If the over-sampling duplicates rows, the same
# customer can land in both partitions and inflate the test metrics.
# Consider splitting first and balancing only the training part.
row <- nrow(balanced_df)
train_idx <- sample(seq_len(row), size = row * 0.85)
training_data <- balanced_df[train_idx, ]
testing_data <- balanced_df[-train_idx, ]

4. Modeling

Create Random Forest

# randomForest() is provided by the randomForest package, which is not
# loaded elsewhere in this document.
library(randomForest)

# True labels of the test partition, kept for the evaluation step.
answer <- testing_data$Exited
# Fit a random forest predicting Exited from all other columns.
rf <- randomForest(formula = Exited ~ ., data = training_data)

5. Evaluations

# Predicted labels for the held-out test rows.
pred.rf <- predict(object = rf, newdata = testing_data)
#' Print accuracy, precision, recall and F1 score for a binary classifier
#'
#' Cross-tabulates the true and predicted labels into a 2x2 confusion matrix
#' and prints the derived metrics, rounded to three decimals. The second
#' class label (in level order) is treated as the positive class.
#'
#' @param prediction Predicted class labels (factor or atomic vector).
#' @param actual True class labels, the same length as `prediction`.
#' @param nama_model Model name shown in the first line of the report.
#' @return Called for its printed output; returns `NULL` invisibly.
performance <- function(prediction, actual, nama_model) {
  # Use one shared, ordered set of class labels for both rows and columns so
  # the matrix is always square and aligned, even when a class is absent
  # from one of the vectors or the two factors order their levels
  # differently. Factor levels take precedence because they carry the
  # intended class order.
  if (is.factor(actual)) {
    classes <- union(levels(actual), levels(as.factor(prediction)))
  } else if (is.factor(prediction)) {
    classes <- union(levels(prediction), levels(as.factor(actual)))
  } else {
    classes <- levels(factor(c(actual, prediction)))
  }
  if (length(classes) != 2L) {
    stop("performance() expects exactly two classes, found ",
         length(classes), call. = FALSE)
  }

  cm <- table(factor(actual, levels = classes),
              factor(prediction, levels = classes),
              dnn = c("Actual", "Prediction"))
  TP <- cm[2, 2]
  TN <- cm[1, 1]
  FN <- cm[2, 1]
  FP <- cm[1, 2]

  # A ratio is NaN when its denominator is zero (e.g. no positive
  # predictions); it is printed as-is.
  accuracy <- (TP + TN) / (TP + TN + FP + FN)
  precision <- TP / (TP + FP)
  recall <- TP / (TP + FN)
  f1_score <- (2 * precision * recall) / (precision + recall)

  result <- paste("Model      : ", nama_model,
                  "\nAccuracy   : ", round(accuracy, 3),
                  "\nPrecision  : ", round(precision, 3),
                  "\nRecall     : ", round(recall, 3),
                  "\nF1 Score   : ", round(f1_score, 3))
  cat(result)
}
# Report the test-set metrics of the random forest model.
performance(prediction = pred.rf,
            actual = answer,
            nama_model = "Random Forest")
## Model      :  Random Forest 
## Accuracy   :  0.925 
## Precision  :  0.912 
## Recall     :  0.943 
## F1 Score   :  0.927