This document demonstrates bank churn prediction using the Random Forest algorithm. The data set contains details of a bank’s customers, and the target variable is a binary variable indicating whether the customer left the bank (closed their account) or remains a customer.
The dataset link: Here
Import Libraries
library(ggplot2)
Read Bank Churn datasets
# Load the raw churn data from the CSV file and inspect its structure.
df <- read.csv(file = "Churn Modeling.csv")
str(object = df)
## 'data.frame': 10000 obs. of 14 variables:
## $ RowNumber : int 1 2 3 4 5 6 7 8 9 10 ...
## $ CustomerId : int 15634602 15647311 15619304 15701354 15737888 15574012 15592531 15656148 15792365 15592389 ...
## $ Surname : chr "Hargrave" "Hill" "Onio" "Boni" ...
## $ CreditScore : int 619 608 502 699 850 645 822 376 501 684 ...
## $ Geography : chr "France" "Spain" "France" "France" ...
## $ Gender : chr "Female" "Female" "Female" "Female" ...
## $ Age : int 42 41 42 39 43 44 50 29 44 27 ...
## $ Tenure : int 2 1 8 1 2 8 7 4 4 2 ...
## $ Balance : num 0 83808 159661 0 125511 ...
## $ NumOfProducts : int 1 1 3 2 1 2 2 4 2 1 ...
## $ HasCrCard : int 1 0 1 0 1 1 1 1 0 1 ...
## $ IsActiveMember : int 1 1 0 0 1 0 1 0 1 1 ...
## $ EstimatedSalary: num 101349 112543 113932 93827 79084 ...
## $ Exited : int 1 0 1 0 0 1 0 1 0 0 ...
# Convert the target and the categorical predictors to factors in one pass.
factor_columns <- c("Exited", "HasCrCard", "IsActiveMember",
                    "Geography", "Gender")
df[factor_columns] <- lapply(df[factor_columns], factor)
This dataset contains 10000 rows and 14 columns. The target variable is Exited.
# Exploratory plots: distribution of each feature, coloured by churn status.
# Columns are referenced by bare name inside aes() so that ggplot2 resolves
# them from `data`; using `df$Exited` there bypasses the data argument and
# breaks as soon as the plot is faceted or fed a filtered data frame.

# Churn counts by gender.
ggplot(data = df, mapping = aes(x = Gender, fill = Exited)) +
  geom_bar()

# Churn counts by credit-card ownership. HasCrCard is a factor whose levels
# are the strings "0" and "1", so the breaks are given as those levels.
ggplot(data = df, mapping = aes(x = HasCrCard, fill = Exited)) +
  geom_bar() +
  scale_x_discrete(
    breaks = c("0", "1"),
    labels = c("Not Have", "Have")
  )

# Churn counts by country.
ggplot(data = df, mapping = aes(x = Geography, fill = Exited)) +
  geom_bar()

# Churn counts by tenure.
ggplot(data = df, mapping = aes(x = Tenure, fill = Exited)) +
  geom_bar()

# Churn counts by activity flag.
ggplot(data = df, mapping = aes(x = IsActiveMember, fill = Exited)) +
  geom_bar()

# Age distribution, one box per churn class.
ggplot(data = df, mapping = aes(y = Age, fill = Exited)) +
  geom_boxplot()
Remove the unused columns RowNumber, CustomerId, and Surname
# Drop the identifier columns by name rather than by position, so the
# selection stays correct even if the CSV column order changes.
id_columns <- c("RowNumber", "CustomerId", "Surname")
churn_df <- df[, setdiff(names(df), id_columns), drop = FALSE]
library(ROSE)
## Loaded ROSE 0.0-4
set.seed(2022)
# Class counts of the target before any resampling.
table(churn_df$Exited)
##
## 0 1
## 7963 2037
There are 7963 customers who did not exit and 2037 who exited. The data is imbalanced, so imbalanced-data handling is needed.
# method = "both": under-sampling combined with over-sampling.
resampled <- ovun.sample(
  formula = Exited ~ .,
  data = churn_df,
  method = "both",
  p = 0.5,
  N = 10000,
  seed = 1
)
data_balanced <- resampled$data
# Class counts of the target after resampling.
table(data_balanced$Exited)
##
## 0 1
## 5047 4953
One Hot Encoding for Geography and Gender features
# Everything except the two categorical features that get one-hot encoded.
balanced_num <- data_balanced
balanced_num[c("Geography", "Gender")] <- NULL

### 1. Single-column data frames for Geography and Gender
Geography_df <- data.frame(Geography = data_balanced$Geography)
Gender_df <- data.frame(Gender = data_balanced$Gender)

### 2. One-hot encode the Geography and Gender data frames
library(caret)
## Loading required package: lattice
geography_encoder <- dummyVars("~.", data = Geography_df)
geography_dummies <- data.frame(
  predict(geography_encoder, newdata = Geography_df)
)
gender_encoder <- dummyVars("~.", data = Gender_df)
gender_dummies <- data.frame(
  predict(gender_encoder, newdata = Gender_df)
)

# Encoded columns first, then the untouched remaining columns.
balanced_df <- cbind(geography_dummies, gender_dummies, balanced_num)
### 3.Training and Testing Split
# Random 85% / 15% split of the rows into training and testing sets.
# NOTE(review): balanced_df comes from ovun.sample(method = "both"); if that
# resampling duplicates rows, splitting afterwards may place copies of the
# same customer in both sets and inflate test metrics -- confirm.
n_rows <- nrow(balanced_df)
train_idx <- sample(n_rows, n_rows * 0.85)
training_data <- balanced_df[train_idx, ]
testing_data <- balanced_df[-train_idx, ]
Create Random Forest
# randomForest() comes from the randomForest package, which is not attached
# by any of the library() calls visible earlier in this script.
library(randomForest)

# True labels of the held-out rows, kept for evaluation.
answer <- testing_data$Exited

# Fit a random forest on all remaining columns and predict the test set.
rf <- randomForest(formula = Exited ~ ., data = training_data)
pred.rf <- predict(rf, newdata = testing_data)
#' Print (and return) binary classification metrics.
#'
#' Cross-tabulates `actual` against `prediction` and derives accuracy,
#' precision, recall and F1 score, treating the second class level as the
#' positive class. Both inputs are tabulated on a shared set of levels so the
#' confusion matrix is always 2 x 2, even when one class never appears in the
#' predictions.
#'
#' @param prediction Predicted class labels (factor or atomic vector).
#' @param actual True class labels, same length as `prediction`.
#' @param nama_model Model name shown in the printed report.
#' @return Invisibly, a named list with `confusion_matrix`, `accuracy`,
#'   `precision`, `recall` and `f1_score`. The report itself is written to
#'   the console with `cat()`. A metric is `NaN` when its denominator is zero.
performance <- function(prediction, actual, nama_model) {
  stopifnot(length(prediction) == length(actual))

  # Levels of a factor as declared; otherwise the levels factor() would infer.
  class_levels_of <- function(x) {
    if (is.factor(x)) levels(x) else levels(factor(x))
  }
  class_levels <- union(class_levels_of(actual), class_levels_of(prediction))
  if (length(class_levels) != 2L) {
    stop("performance() expects exactly two classes, found ",
         length(class_levels), call. = FALSE)
  }

  cm <- table(
    factor(actual, levels = class_levels),
    factor(prediction, levels = class_levels),
    dnn = c("Actual", "Prediction")
  )

  # Rows are actual classes, columns are predicted classes.
  TP <- cm[2, 2]
  TN <- cm[1, 1]
  FN <- cm[2, 1]
  FP <- cm[1, 2]

  accuracy <- (TP + TN) / (TP + TN + FP + FN)
  precision <- TP / (TP + FP)
  recall <- TP / (TP + FN)
  f1_score <- (2 * precision * recall) / (precision + recall)

  result <- paste("Model : ",nama_model,
                  "\nAccuracy : ",round(accuracy,3),
                  "\nPrecision : ",round(precision,3),
                  "\nRecall : ", round(recall,3),
                  "\nF1 Score : ",round(f1_score,3))
  cat(result)

  invisible(list(
    confusion_matrix = cm,
    accuracy = accuracy,
    precision = precision,
    recall = recall,
    f1_score = f1_score
  ))
}
# Report the random forest's metrics on the held-out test set.
performance(
  prediction = pred.rf,
  actual = answer,
  nama_model = "Random Forest"
)
## Model : Random Forest
## Accuracy : 0.925
## Precision : 0.912
## Recall : 0.943
## F1 Score : 0.927