knitr::opts_chunk$set(echo = TRUE)
options(kableExtra.html.bsTable = T)
library(gridExtra)
## Warning: package 'gridExtra' was built under R version 4.4.2
library(ggplot2)
library(ggfortify)
## Warning: package 'ggfortify' was built under R version 4.4.2
library(kableExtra)
## Warning: package 'kableExtra' was built under R version 4.4.2
library(car)
## Loading required package: carData
## Warning: package 'carData' was built under R version 4.4.2
library(lattice)
library(ggvis)
## Warning: package 'ggvis' was built under R version 4.4.2
##
## Attaching package: 'ggvis'
## The following object is masked from 'package:ggplot2':
##
## resolution
library(MASS)
## Warning: package 'MASS' was built under R version 4.4.2
library(ISLR)
## Warning: package 'ISLR' was built under R version 4.4.2
library(class)
## Warning: package 'class' was built under R version 4.4.2
library(magrittr)
library(ggvis)
library(boot)
##
## Attaching package: 'boot'
## The following object is masked from 'package:lattice':
##
## melanoma
## The following object is masked from 'package:car':
##
## logit
library(leaps)
## Warning: package 'leaps' was built under R version 4.4.2
library(rpart)
library(rpart.plot)
## Warning: package 'rpart.plot' was built under R version 4.4.2
library(caret)
## Warning: package 'caret' was built under R version 4.4.2
library(Metrics)
## Warning: package 'Metrics' was built under R version 4.4.2
##
## Attaching package: 'Metrics'
## The following objects are masked from 'package:caret':
##
## precision, recall
library(ipred)
## Warning: package 'ipred' was built under R version 4.4.2
library(randomForest)
## Warning: package 'randomForest' was built under R version 4.4.2
## randomForest 4.7-1.2
## Type rfNews() to see new features/changes/bug fixes.
##
## Attaching package: 'randomForest'
## The following object is masked from 'package:ggplot2':
##
## margin
## The following object is masked from 'package:gridExtra':
##
## combine
library(tidyverse)
## Warning: package 'tidyverse' was built under R version 4.4.2
## Warning: package 'readr' was built under R version 4.4.2
## Warning: package 'dplyr' was built under R version 4.4.2
## Warning: package 'forcats' was built under R version 4.4.2
## Warning: package 'lubridate' was built under R version 4.4.2
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr 1.1.4 ✔ readr 2.1.5
## ✔ forcats 1.0.0 ✔ stringr 1.5.1
## ✔ lubridate 1.9.3 ✔ tibble 3.2.1
## ✔ purrr 1.0.2 ✔ tidyr 1.3.1
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::combine() masks randomForest::combine(), gridExtra::combine()
## ✖ tidyr::extract() masks magrittr::extract()
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::group_rows() masks kableExtra::group_rows()
## ✖ dplyr::lag() masks stats::lag()
## ✖ purrr::lift() masks caret::lift()
## ✖ randomForest::margin() masks ggplot2::margin()
## ✖ dplyr::recode() masks car::recode()
## ✖ ggvis::resolution() masks ggplot2::resolution()
## ✖ dplyr::select() masks MASS::select()
## ✖ purrr::set_names() masks magrittr::set_names()
## ✖ purrr::some() masks car::some()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(lubridate)
library(e1071)
## Warning: package 'e1071' was built under R version 4.4.2
# y refers to the variable being predicted; load the Bank Customer Churn data set
df <- read.csv("C:/Users/cmart/OneDrive/Documents/Certificate in Business Analytics/Predictive Analytics/Final Project_Bank Customer Churn Prediction/Churn_Modelling.csv", header = TRUE, sep=",", dec=".")
# Display top few records
head(df)
## RowNumber CustomerId Surname CreditScore Geography Gender Age Tenure
## 1 1 15634602 Hargrave 619 France Female 42 2
## 2 2 15647311 Hill 608 Spain Female 41 1
## 3 3 15619304 Onio 502 France Female 42 8
## 4 4 15701354 Boni 699 France Female 39 1
## 5 5 15737888 Mitchell 850 Spain Female 43 2
## 6 6 15574012 Chu 645 Spain Male 44 8
## Balance NumOfProducts HasCrCard IsActiveMember EstimatedSalary Exited
## 1 0.00 1 1 1 101348.88 1
## 2 83807.86 1 0 1 112542.58 0
## 3 159660.80 3 1 0 113931.57 1
## 4 0.00 2 0 0 93826.63 0
## 5 125510.82 1 NA 1 79084.10 0
## 6 113755.78 2 1 0 149756.71 1
# Display a glimpse of df structure
str(df)
## 'data.frame': 10002 obs. of 14 variables:
## $ RowNumber : int 1 2 3 4 5 6 7 8 9 10 ...
## $ CustomerId : int 15634602 15647311 15619304 15701354 15737888 15574012 15592531 15656148 15792365 15592389 ...
## $ Surname : chr "Hargrave" "Hill" "Onio" "Boni" ...
## $ CreditScore : int 619 608 502 699 850 645 822 376 501 684 ...
## $ Geography : chr "France" "Spain" "France" "France" ...
## $ Gender : chr "Female" "Female" "Female" "Female" ...
## $ Age : num 42 41 42 39 43 44 50 29 44 NA ...
## $ Tenure : int 2 1 8 1 2 8 7 4 4 2 ...
## $ Balance : num 0 83808 159661 0 125511 ...
## $ NumOfProducts : int 1 1 3 2 1 2 2 4 2 1 ...
## $ HasCrCard : int 1 0 1 0 NA 1 1 1 0 1 ...
## $ IsActiveMember : int 1 1 0 0 1 0 1 0 NA 1 ...
## $ EstimatedSalary: num 101349 112543 113932 93827 79084 ...
## $ Exited : int 1 0 1 0 0 1 0 1 0 0 ...
# Check number of rows and columns
ncol(df)
## [1] 14
nrow(df)
## [1] 10002
# Preliminary check for missing values (Note - this NA check will not catch missing values coded as "N/A" or as empty strings)
missing_values <- sapply(df, function(x) sum(is.na(x)))
missing_values
## RowNumber CustomerId Surname CreditScore Geography
## 0 0 0 0 0
## Gender Age Tenure Balance NumOfProducts
## 0 1 0 0 0
## HasCrCard IsActiveMember EstimatedSalary Exited
## 1 1 0 0
# Explicitly convert each variable to its existing data class to ensure class uniformity; no reclassification until data cleansing is complete (note that as.integer truncates $Balance and $EstimatedSalary to whole units)
df$RowNumber <- as.integer(df$RowNumber)
df$CustomerId <- as.integer(df$CustomerId)
df$Surname <- as.character(df$Surname)
df$CreditScore <- as.integer(df$CreditScore)
df$Geography <- as.character(df$Geography)
df$Gender <- as.character(df$Gender)
df$Age <- as.integer(df$Age)
df$Tenure <- as.integer(df$Tenure)
df$Balance <- as.integer(df$Balance)
df$NumOfProducts <- as.integer(df$NumOfProducts)
df$HasCrCard <- as.integer(df$HasCrCard)
df$IsActiveMember <- as.integer(df$IsActiveMember)
df$EstimatedSalary <- as.integer(df$EstimatedSalary)
df$Exited <- as.integer(df$Exited)
# Replace "N/A" strings with NA
df$RowNumber[df$RowNumber == "N/A"] <- NA
df$CustomerId[df$CustomerId == "N/A"] <- NA
df$Surname[df$Surname == "N/A"] <- NA
df$CreditScore[df$CreditScore == "N/A"] <- NA
df$Geography[df$Geography == "N/A"] <- NA
df$Gender[df$Gender == "N/A"] <- NA
df$Age[df$Age == "N/A"] <- NA
df$Tenure[df$Tenure == "N/A"] <- NA
df$Balance[df$Balance == "N/A"] <- NA
df$NumOfProducts[df$NumOfProducts == "N/A"] <- NA
df$HasCrCard[df$HasCrCard == "N/A"] <- NA
df$IsActiveMember[df$IsActiveMember == "N/A"] <- NA
df$EstimatedSalary[df$EstimatedSalary == "N/A"] <- NA
df$Exited[df$Exited == "N/A"] <- NA
# Replace empty strings with NA
df$RowNumber[df$RowNumber == ""] <- NA
df$CustomerId[df$CustomerId == ""] <- NA
df$Surname[df$Surname == ""] <- NA
df$CreditScore[df$CreditScore == ""] <- NA
df$Geography[df$Geography == ""] <- NA
df$Gender[df$Gender == ""] <- NA
df$Age[df$Age == ""] <- NA
df$Tenure[df$Tenure == ""] <- NA
df$Balance[df$Balance == ""] <- NA
df$NumOfProducts[df$NumOfProducts == ""] <- NA
df$HasCrCard[df$HasCrCard == ""] <- NA
df$IsActiveMember[df$IsActiveMember == ""] <- NA
df$EstimatedSalary[df$EstimatedSalary == ""] <- NA
df$Exited[df$Exited == ""] <- NA
# Re-run missing value check (Note - $Geography now shows 1 missing value, in addition to the single NAs already flagged in $Age, $HasCrCard, and $IsActiveMember)
missing_values <- sapply(df, function(x) sum(is.na(x)))
missing_values
## RowNumber CustomerId Surname CreditScore Geography
## 0 0 0 0 1
## Gender Age Tenure Balance NumOfProducts
## 0 1 0 0 0
## HasCrCard IsActiveMember EstimatedSalary Exited
## 1 1 0 0
# Convert to factor variables ($Geography, $Gender, $IsActiveMember, $HasCrCard)
df$Geography <- as.factor(df$Geography)
df$Gender <- as.factor(df$Gender)
df$IsActiveMember <- as.factor(df$IsActiveMember)
df$HasCrCard <- as.factor(df$HasCrCard)
# Remove the categorical variable $Surname; it adds no predictive value since $CustomerId already identifies individual bank customers
df <- df[,!names(df) %in% "Surname"]
str(df)
## 'data.frame': 10002 obs. of 13 variables:
## $ RowNumber : int 1 2 3 4 5 6 7 8 9 10 ...
## $ CustomerId : int 15634602 15647311 15619304 15701354 15737888 15574012 15592531 15656148 15792365 15592389 ...
## $ CreditScore : int 619 608 502 699 850 645 822 376 501 684 ...
## $ Geography : Factor w/ 3 levels "France","Germany",..: 1 3 1 1 3 3 NA 2 1 1 ...
## $ Gender : Factor w/ 2 levels "Female","Male": 1 1 1 1 1 2 2 1 2 2 ...
## $ Age : int 42 41 42 39 43 44 50 29 44 NA ...
## $ Tenure : int 2 1 8 1 2 8 7 4 4 2 ...
## $ Balance : int 0 83807 159660 0 125510 113755 0 115046 142051 134603 ...
## $ NumOfProducts : int 1 1 3 2 1 2 2 4 2 1 ...
## $ HasCrCard : Factor w/ 2 levels "0","1": 2 1 2 1 NA 2 2 2 1 2 ...
## $ IsActiveMember : Factor w/ 2 levels "0","1": 2 2 1 1 2 1 2 1 NA 2 ...
## $ EstimatedSalary: int 101348 112542 113931 93826 79084 149756 10062 119346 74940 71725 ...
## $ Exited : int 1 0 1 0 0 1 0 1 0 0 ...
Do you see a difference between the boxplots for any of the variables you chose?
Note - The median account balance of customers who stayed with the bank is slightly lower than that of customers who left. The implication is that those with higher demand deposit, checking, or savings balances tend to be more dissatisfied and/or see more lucrative opportunities elsewhere in the financial marketplace, leading such customers to leave at a greater rate. While the overall range for customers who stayed versus those who left is similar, the IQR is narrower for customers who left the bank, indicating a more concentrated range of account balances among departing customers. The bank should focus on providing targeted services to high-balance customers to reduce the churn rate. Focusing on increasing engagement with bank products among customers who remain with the bank but maintain an account balance close to $0 would further increase overall bank profitability.
Note - No noticeable difference in credit scores between customers who "exited" and those who remain with the bank. This is logical, as the bank likely has proper financial due diligence and screening procedures/policies in place that limit services to customers with good credit scores, who therefore hold less volatile account balances and pose less risk overall.
Note - Minimal difference between the ages of customers who remain with the bank and those who "exited". The boxplot of age for customers who remained has a significantly greater right skew than the boxplot for those who "exited". This finding is logical: older customers likely have longer experience with the bank and its services and products, meaning they are more likely to be loyal customers and less likely to "exit".
Note - Median customer tenure for those who "exit" is identical to that of customers who remain with the bank, both being 5 years. The IQR and range for customers who "exited" are larger and more spread out than for customers who remained. This makes sense: as more time passes, the probability of a customer leaving increases, since a variety of influences stemming from market performance and personal circumstances can affect such a decision.
Note - No noticeable difference in the number of products used by customers opting to "exit" the bank versus those opting to remain. Likewise, no noticeable difference in estimated salary between customers "exiting" the bank and those remaining with the bank.
# Create a boxplot for $Balance based on the values of y ($Exited)
boxplot(df$Balance ~ df$Exited,
main = "Boxplot - Customer Account Balance",
xlab = "Exited (0 = No, 1 = Yes)",
ylab = "Balance",
col = c("skyblue", "orange"),
border = "black")
# Create a boxplot for $CreditScore based on the values of y ($Exited)
boxplot(df$CreditScore ~ df$Exited,
main = "Boxplot - Customer Credit Score",
xlab = "Exited (0 = No, 1 = Yes)",
ylab = "Credit Score",
col = c("skyblue", "orange"),
border = "black")
# Create a boxplot for $Age based on the values of y ($Exited)
boxplot(df$Age ~ df$Exited,
main = "Boxplot - Age of Customer",
xlab = "Exited (0 = No, 1 = Yes)",
ylab = "Age",
col = c("skyblue", "orange"),
border = "black")
# Create a boxplot for $Tenure based on the values of y ($Exited)
boxplot(df$Tenure ~ df$Exited,
main = "Boxplot - Customer Tenure",
xlab = "Exited (0 = No, 1 = Yes)",
ylab = "Tenure",
col = c("skyblue", "orange"),
border = "black")
# Create a boxplot for $NumOfProducts based on the values of y ($Exited)
boxplot(df$NumOfProducts ~ df$Exited,
main = "Boxplot - Number of Products Each Customer Uses",
xlab = "Exited (0 = No, 1 = Yes)",
ylab = "Number of Products",
col = c("skyblue", "orange"),
border = "black")
# Create a boxplot for $EstimatedSalary based on the values of y ($Exited)
boxplot(df$EstimatedSalary ~ df$Exited,
main = "Boxplot - Estimated Salary per Customer",
xlab = "Exited (0 = No, 1 = Yes)",
ylab = "Estimated Salary",
col = c("skyblue", "orange"),
border = "black")
Do you see a difference between the plots for any of the variables you chose?
Note - The bar chart of customer retention by geography illustrates that a significant concentration of the bank's customers are domiciled in France, at least in comparison to Germany and Spain. Despite this large concentration, customers in France were not leaving the bank to the same extent as in Germany and Spain. Clearly (1) the bank has a stronger presence in France and (2) the bank's services and products are valued to a greater extent by French customers. The bank should take a detailed look at performance by bank location, as well as performance by manager, and try to determine what is driving results in France and whether those results can be replicated in Germany and Spain.
Note - No difference is observed between male and female customers "exiting" the bank. The bank has more male customers than female customers. Significantly more male customers hold credit cards with the bank than do female customers, driven in part by males constituting a larger proportion of the bank's overall clientele. Male customers with a bank credit card tend to leave the bank at a greater rate than female customers with a bank credit card.
Note - No noticeable difference can be observed between active bank members leaving the bank and non-active members leaving the bank.
Hint: Use the function barplot. You can do so by barplot(variable), or you can try the following code:
newdata <- table(df[,"y"], df[,"variable"]); barplot(newdata, main="....", xlab="....", legend = rownames(newdata))
# Create a bar chart for Geography given Exited = 1 and Exited = 0
newdata <- table(df[,"Exited"], df[,"Geography"])
barplot((newdata), beside = TRUE,
main="Bar Chart of Customer Rentention by Geography",
xlab = "Exited (0 = NO, 1 = Yes)",
ylab = "count",
legend = rownames(newdata))
# Create a bar chart for Gender given Exited = 1 and Exited = 0
newdata <- table(df[,"Exited"], df[,"Gender"])
barplot((newdata), beside = TRUE,
main="Bar Chart of Customer Rentention by Gender",
xlab = "Exited (0 = NO, 1 = Yes)",
ylab = "count",
legend = rownames(newdata))
# Create a bar chart for HasCrCard given Exited = 1 and Exited = 0
newdata <- table(df[,"Exited"], df[,"HasCrCard"])
barplot((newdata), beside = TRUE,
main="Bar Chart of Customer Rentention by HasCrCard",
xlab = "Exited (0 = NO, 1 = Yes)",
ylab = "count",
legend = rownames(newdata))
# Note - the x-axis groups are HasCrCard levels (1 = has a credit card, 0 = does not); the legend (0/1) shows Exited
# Create a bar chart for IsActiveMember given Exited = 1 and Exited = 0
newdata <- table(df[,"Exited"], df[,"IsActiveMember"])
barplot((newdata), beside = TRUE,
main="Bar Chart of Customer Rentention by IsActiveMember",
xlab = "Exited (0 = NO, 1 = Yes)",
ylab = "count",
legend = rownames(newdata))
# Note - the x-axis groups are IsActiveMember levels (1 = active, 0 = not active); the legend (0/1) shows Exited
# Summary Statistics for the Independent Variables in Data Set df
summary(df)
## RowNumber CustomerId CreditScore Geography
## Min. : 1 Min. :15565701 Min. :350.0 France :5014
## 1st Qu.: 2501 1st Qu.:15628525 1st Qu.:584.0 Germany:2510
## Median : 5002 Median :15690732 Median :652.0 Spain :2477
## Mean : 5002 Mean :15690933 Mean :650.6 NA's : 1
## 3rd Qu.: 7502 3rd Qu.:15753226 3rd Qu.:718.0
## Max. :10000 Max. :15815690 Max. :850.0
##
## Gender Age Tenure Balance NumOfProducts
## Female:4544 Min. :18.00 Min. : 0.000 Min. : 0 Min. :1.00
## Male :5458 1st Qu.:32.00 1st Qu.: 3.000 1st Qu.: 0 1st Qu.:1.00
## Median :37.00 Median : 5.000 Median : 97198 Median :1.00
## Mean :38.92 Mean : 5.012 Mean : 76491 Mean :1.53
## 3rd Qu.:44.00 3rd Qu.: 7.000 3rd Qu.:127647 3rd Qu.:2.00
## Max. :92.00 Max. :10.000 Max. :250898 Max. :4.00
## NA's :1
## HasCrCard IsActiveMember EstimatedSalary Exited
## 0 :2945 0 :4851 Min. : 11 Min. :0.0000
## 1 :7056 1 :5150 1st Qu.: 50983 1st Qu.:0.0000
## NA's: 1 NA's: 1 Median :100185 Median :0.0000
## Mean :100083 Mean :0.2038
## 3rd Qu.:149383 3rd Qu.:0.0000
## Max. :199992 Max. :1.0000
##
# Identify whether there is class imbalance for the response variable ($Exited) in the df data set
Identify_Class_Imbalance <- prop.table(table(df$Exited))
Identify_Class_Imbalance # Note that 79.6% of customers have remained with the bank, while 20.4% have exited
##
## 0 1
## 0.7962408 0.2037592
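One simple way to blunt this imbalance before modeling would be to downsample the majority class; a minimal sketch using caret::downSample (an illustration only - the balanced copy is not used in the models that follow):
# Downsample the majority class ("0") so both classes appear equally often;
# downSample() expects a factor outcome
df_balanced <- downSample(x = df[, setdiff(names(df), "Exited")],
                          y = factor(df$Exited), yname = "Exited")
prop.table(table(df_balanced$Exited)) # roughly 0.5 / 0.5 after downsampling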
Standardize your numeric data using scale(), and combine these with your non-numeric data.
# Standardize numeric data using the scale function; this ensures that no individual variable dominates any other variable due to scale differences
df_scaled <- scale(df[, c("CreditScore", "Age", "Tenure", "Balance", "NumOfProducts", "EstimatedSalary")])
df_combined <- cbind(df_scaled, df[, c("Geography", "Gender", "HasCrCard", "IsActiveMember", "Exited", "CustomerId", "RowNumber")])
str(df_combined)
## 'data.frame': 10002 obs. of 13 variables:
## $ CreditScore : num -0.326 -0.44 -1.537 0.501 2.063 ...
## $ Age : num 0.29348 0.19813 0.29348 0.00742 0.38883 ...
## $ Tenure : num -1.04 -1.39 1.03 -1.39 -1.04 ...
## $ Balance : num -1.226 0.117 1.333 -1.226 0.786 ...
## $ NumOfProducts : num -0.912 -0.912 2.527 0.808 -0.912 ...
## $ EstimatedSalary: num 0.022 0.217 0.241 -0.109 -0.365 ...
## $ Geography : Factor w/ 3 levels "France","Germany",..: 1 3 1 1 3 3 NA 2 1 1 ...
## $ Gender : Factor w/ 2 levels "Female","Male": 1 1 1 1 1 2 2 1 2 2 ...
## $ HasCrCard : Factor w/ 2 levels "0","1": 2 1 2 1 NA 2 2 2 1 2 ...
## $ IsActiveMember : Factor w/ 2 levels "0","1": 2 2 1 1 2 1 2 1 NA 2 ...
## $ Exited : int 1 0 1 0 0 1 0 1 0 0 ...
## $ CustomerId : int 15634602 15647311 15619304 15701354 15737888 15574012 15592531 15656148 15792365 15592389 ...
## $ RowNumber : int 1 2 3 4 5 6 7 8 9 10 ...
# 70% of the sample size utilized for model training, 30% utilized for model accuracy testing
smp_size <- floor(0.70 * nrow(df_combined))
# set the seed to make your partition reproducible
set.seed(123)
train_ind <- sample(seq_len(nrow(df_combined)), size = smp_size)
df_train <- df_combined[train_ind, ]
df_test = df_combined[-train_ind, ]
# Ensure that training and testing set sum to df_combined data set (10002)
nrow (df_combined)
## [1] 10002
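Because $Exited is imbalanced, a stratified split that preserves the churn rate in both partitions is a reasonable alternative to simple random sampling; a sketch using caret::createDataPartition (illustrative only - the random split above is what the models below actually use):
# Stratified 70/30 split: sampling is done within each class of $Exited,
# so the training and testing sets keep roughly the same churn rate
set.seed(123)
strat_ind <- createDataPartition(factor(df_combined$Exited), p = 0.70, list = FALSE)
df_train_strat <- df_combined[strat_ind, ]
df_test_strat <- df_combined[-strat_ind, ]
prop.table(table(df_train_strat$Exited))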
str(df_train)
## 'data.frame': 7001 obs. of 13 variables:
## $ CreditScore : num 0.222 -0.264 -2.147 1.96 -1.547 ...
## $ Age : num 1.342 1.247 0.484 -0.851 -0.469 ...
## $ Tenure : num 1.379 -1.042 -1.042 1.033 -0.696 ...
## $ Balance : num 1.4892 0.0477 -1.2259 0.9584 0.501 ...
## $ NumOfProducts : num 4.246 -0.912 -0.912 -0.912 -0.912 ...
## $ EstimatedSalary: num 0.821 -0.269 1.027 -0.799 -1.579 ...
## $ Geography : Factor w/ 3 levels "France","Germany",..: 1 1 1 1 1 1 3 1 3 1 ...
## $ Gender : Factor w/ 2 levels "Female","Male": 1 1 2 2 1 1 1 2 2 2 ...
## $ HasCrCard : Factor w/ 2 levels "0","1": 2 2 2 2 2 2 2 2 2 2 ...
## $ IsActiveMember : Factor w/ 2 levels "0","1": 2 2 1 1 1 2 1 1 2 1 ...
## $ Exited : int 1 0 0 0 0 0 1 0 0 0 ...
## $ CustomerId : int 15704442 15607993 15635502 15631912 15788539 15714680 15749905 15713949 15786454 15619699 ...
## $ RowNumber : int 2463 2511 8718 2986 1842 9334 3371 4761 6746 9819 ...
str(df_test)
## 'data.frame': 3001 obs. of 13 variables:
## $ CreditScore : num -0.0575 -0.6575 -0.1506 -1.4541 0.1908 ...
## $ Age : num 0.4842 0.5795 -0.6601 -0.0879 0.6749 ...
## $ Tenure : num 1.033 0.341 1.033 -0.35 -0.696 ...
## $ Balance : num 0.597 -1.226 -1.226 -1.226 -1.226 ...
## $ NumOfProducts : num 0.808 -0.912 0.808 -0.912 0.808 ...
## $ EstimatedSalary: num 0.864 1.019 0.669 0.327 -1.593 ...
## $ Geography : Factor w/ 3 levels "France","Germany",..: 3 3 3 3 1 2 2 3 3 3 ...
## $ Gender : Factor w/ 2 levels "Female","Male": 2 2 1 1 2 1 2 1 1 2 ...
## $ HasCrCard : Factor w/ 2 levels "0","1": 2 1 2 2 1 2 1 2 2 1 ...
## $ IsActiveMember : Factor w/ 2 levels "0","1": 1 1 1 1 2 2 1 2 2 2 ...
## $ Exited : int 1 0 0 1 0 0 0 0 0 0 ...
## $ CustomerId : int 15574012 15661507 15597945 15699309 15725737 15728693 15750181 15659428 15732963 15788448 ...
## $ RowNumber : int 6 19 22 23 24 29 33 34 35 37 ...
y is the dependent variable; fit a model using the Logistic Regression method, the rest of the variables, and your training data set.
# Run classification (logistic) regression model
logfit <- glm(Exited ~ ., family=binomial, data = df_train[,1:11])
summary(logfit)
##
## Call:
## glm(formula = Exited ~ ., family = binomial, data = df_train[,
## 1:11])
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -0.95882 0.08022 -11.953 < 2e-16 ***
## CreditScore -0.07681 0.03220 -2.385 0.0171 *
## Age 0.75680 0.03215 23.538 < 2e-16 ***
## Tenure -0.05804 0.03222 -1.801 0.0716 .
## Balance 0.18937 0.03813 4.966 6.83e-07 ***
## NumOfProducts -0.07748 0.03246 -2.387 0.0170 *
## EstimatedSalary 0.06653 0.03267 2.037 0.0417 *
## GeographyGermany 0.71168 0.08097 8.789 < 2e-16 ***
## GeographySpain -0.01955 0.08449 -0.231 0.8170
## GenderMale -0.54474 0.06522 -8.353 < 2e-16 ***
## HasCrCard1 -0.03996 0.07100 -0.563 0.5736
## IsActiveMember1 -1.05342 0.06874 -15.325 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 7075.8 on 6996 degrees of freedom
## Residual deviance: 5985.3 on 6985 degrees of freedom
## (4 observations deleted due to missingness)
## AIC: 6009.3
##
## Number of Fisher Scoring iterations: 5
# NOTE - the following levels are reference groups for the above classification model: ($Geography = France, $Gender = Female, $HasCrCard = 0 [No], $IsActiveMember = 0 [No] )
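A quick way to confirm, or change, those reference groups is sketched below; relevel() re-bases a factor so a different level becomes the baseline (the Germany re-basing is purely illustrative and is not applied to the models that follow):
# glm() uses the first level of each factor as the reference group
levels(df_train$Geography) # "France" is listed first, so it is the baseline
levels(df_train$Gender) # "Female" is the baseline
# Illustration only: re-base Geography so Germany becomes the reference level
geo_rebased <- relevel(df_train$Geography, ref = "Germany")
levels(geo_rebased)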
# Remove variables deemed statistically insignificant with p-values greater than 0.1 (remove $HasCrCard; retain $Geography, since the Germany level is highly significant)
logfit_reduced <- glm (Exited ~ CreditScore+Age+Tenure+Balance+NumOfProducts+EstimatedSalary+Geography+Gender+IsActiveMember, family=binomial, data = df_train[,1:11])
summary(logfit_reduced)
##
## Call:
## glm(formula = Exited ~ CreditScore + Age + Tenure + Balance +
## NumOfProducts + EstimatedSalary + Geography + Gender + IsActiveMember,
## family = binomial, data = df_train[, 1:11])
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -0.98672 0.06320 -15.613 < 2e-16 ***
## CreditScore -0.07698 0.03219 -2.391 0.0168 *
## Age 0.75694 0.03215 23.542 < 2e-16 ***
## Tenure -0.05825 0.03221 -1.809 0.0705 .
## Balance 0.18937 0.03813 4.967 6.8e-07 ***
## NumOfProducts -0.07749 0.03245 -2.388 0.0170 *
## EstimatedSalary 0.06666 0.03267 2.040 0.0413 *
## GeographyGermany 0.71079 0.08095 8.781 < 2e-16 ***
## GeographySpain -0.02020 0.08449 -0.239 0.8110
## GenderMale -0.54496 0.06521 -8.357 < 2e-16 ***
## IsActiveMember1 -1.05304 0.06872 -15.324 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 7076.2 on 6997 degrees of freedom
## Residual deviance: 5985.9 on 6987 degrees of freedom
## (3 observations deleted due to missingness)
## AIC: 6007.9
##
## Number of Fisher Scoring iterations: 5
logprob <- predict(logfit_reduced, newdata=df_test, type="response")
logpred <-ifelse(logprob>0.5, 1, 0)
# Confirm that predicted probabilities at or below 0.5 in logprob map to class "0" and probabilities above 0.5 map to class "1"
head(logprob)
## 6 19 22 23 24 29
## 0.24359880 0.23575894 0.14120761 0.25334650 0.07951678 0.33950006
head(logpred)
## 6 19 22 23 24 29
## 0 0 0 0 0 0
# Create confusion matrix (rows represent the model's predictions, columns represent the actual values from df_test)
table(logpred, df_test[,"Exited"])
##
## logpred 0 1
## 0 2308 483
## 1 81 129
#Testing error rate
error_rate <- 1 - mean(logpred==df_test[,"Exited"])
error_rate
## [1] 0.1879374
# Testing Precision Score
129/(129+483)
## [1] 0.2107843
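The same summaries can also be pulled programmatically rather than typed by hand; a sketch using caret::confusionMatrix with the churn class "1" treated as the positive class (an assumption - the hand calculations above fix the positive class implicitly):
# Programmatic error rate, precision, and recall for the logistic model;
# rows with missing predictors produce NA predictions and are dropped first
ok <- !is.na(logpred)
cm_log <- confusionMatrix(data = factor(logpred[ok], levels = c(0, 1)),
                          reference = factor(df_test$Exited[ok], levels = c(0, 1)),
                          positive = "1")
cm_log$byClass[c("Precision", "Recall", "F1")]
1 - as.numeric(cm_log$overall["Accuracy"]) # testing error rate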
Models = c("Logistic Reg")
Error.Rate = c("0.1879374")
Precision = c(129/(129+483))
Precision = round(Precision, 3)
text_tbl = data.frame(Models, Error.Rate, Precision)
kable(text_tbl) %>%
kable_styling(bootstrap_options = "striped", full_width = F)
| Models | Error.Rate | Precision |
|---|---|---|
| Logistic Reg | 0.1879374 | 0.211 |
sum(is.na(df_combined))
## [1] 4
df_combined <- na.omit(df_combined)
sum(is.na(df_combined))
## [1] 0
# Inspect initial scaled df dataset
str(df_combined)
## 'data.frame': 9998 obs. of 13 variables:
## $ CreditScore : num -0.3264 -0.4402 -1.5369 0.5012 -0.0575 ...
## $ Age : num 0.29348 0.19813 0.29348 0.00742 0.48419 ...
## $ Tenure : num -1.04 -1.39 1.03 -1.39 1.03 ...
## $ Balance : num -1.226 0.117 1.333 -1.226 0.597 ...
## $ NumOfProducts : num -0.912 -0.912 2.527 0.808 0.808 ...
## $ EstimatedSalary: num 0.022 0.217 0.241 -0.109 0.864 ...
## $ Geography : Factor w/ 3 levels "France","Germany",..: 1 3 1 1 3 2 1 3 1 1 ...
## $ Gender : Factor w/ 2 levels "Female","Male": 1 1 1 1 2 1 2 2 1 1 ...
## $ HasCrCard : Factor w/ 2 levels "0","1": 2 1 2 1 2 2 1 2 2 1 ...
## $ IsActiveMember : Factor w/ 2 levels "0","1": 2 2 1 1 1 1 1 1 1 1 ...
## $ Exited : int 1 0 1 0 1 1 0 0 0 0 ...
## $ CustomerId : int 15634602 15647311 15619304 15701354 15574012 15656148 15767821 15737173 15632264 15691483 ...
## $ RowNumber : int 1 2 3 4 6 8 11 12 13 14 ...
## - attr(*, "na.action")= 'omit' Named int [1:4] 5 7 9 10
## ..- attr(*, "names")= chr [1:4] "5" "7" "9" "10"
# Set the seed to make your partition reproducible
set.seed(456)
# New data set excluding the response variable ($Exited) from the initial df_combined data set
df_no_Exited <- df_combined[,!(names(df_combined) %in% "Exited")]
# Create a new train (70%) and test (30%) data set, excluding response variable ($Exited) for KNN
ind <- sample(1:2, nrow(df_no_Exited), replace = TRUE, prob = c(0.7, 0.3))
df_train_1 <- df_no_Exited[ind == 1, ]
df_test_1 <- df_no_Exited[ind == 2, ]
# Create KNN labels
df.trainLabels <- df_combined[ind==1, 11]
df.testLabels <- df_combined[ind==2, 11]
# Ensure that training and testing set sum to df_combined data set; Note removal of $Exited from new test and training data set
nrow (df_combined)
## [1] 9998
str(df_train_1)
## 'data.frame': 7030 obs. of 12 variables:
## $ CreditScore : num -0.326 -0.44 -2.84 -1.268 -1.589 ...
## $ Age : num 0.293 0.198 -0.946 -0.755 -1.423 ...
## $ Tenure : num -1.042 -1.387 -0.35 0.341 -0.696 ...
## $ Balance : num -1.226 0.117 0.618 0.409 -1.226 ...
## $ NumOfProducts : num -0.912 -0.912 4.246 0.808 0.808 ...
## $ EstimatedSalary: num 0.022 0.217 0.335 -0.346 -0.412 ...
## $ Geography : Factor w/ 3 levels "France","Germany",..: 1 3 2 1 3 1 1 3 2 3 ...
## $ Gender : Factor w/ 2 levels "Female","Male": 1 1 1 2 2 1 1 1 2 2 ...
## $ HasCrCard : Factor w/ 2 levels "0","1": 2 1 2 1 2 2 1 2 1 1 ...
## $ IsActiveMember : Factor w/ 2 levels "0","1": 2 2 1 1 1 1 1 2 2 1 ...
## $ Exited : int 1 0 1 0 0 0 0 0 0 0 ...
## $ CustomerId : int 15634602 15647311 15656148 15767821 15737173 15632264 15691483 15600882 15643966 15661507 ...
str(df_test_1)
## 'data.frame': 2968 obs. of 12 variables:
## $ CreditScore : num -1.5369 0.5012 -0.0575 0.0253 -1.0506 ...
## $ Age : num 0.29348 0.00742 0.48419 1.81915 -1.42289 ...
## $ Tenure : num 1.03 -1.39 1.03 -1.39 1.38 ...
## $ Balance : num 1.333 -1.226 0.597 0.899 -1.226 ...
## $ NumOfProducts : num 2.527 0.808 0.808 -0.912 0.808 ...
## $ EstimatedSalary: num 0.241 -0.109 0.864 -1.652 -1.49 ...
## $ Geography : Factor w/ 3 levels "France","Germany",..: 1 1 3 2 3 1 3 1 2 2 ...
## $ Gender : Factor w/ 2 levels "Female","Male": 1 1 2 2 1 2 1 2 2 1 ...
## $ HasCrCard : Factor w/ 2 levels "0","1": 2 1 2 2 2 2 2 1 2 2 ...
## $ IsActiveMember : Factor w/ 2 levels "0","1": 1 1 1 1 2 2 1 2 2 2 ...
## $ Exited : int 1 0 1 1 0 0 1 0 0 0 ...
## $ CustomerId : int 15619304 15701354 15574012 15737452 15788218 15577657 15699309 15738191 15736816 15728693 ...
# Convert non-numeric columns to numeric for KNN
df_train_1$Geography <- as.numeric(df_train_1$Geography) # Convert to numeric
df_train_1$Gender <- as.numeric(df_train_1$Gender) # Convert to numeric
df_train_1$HasCrCard <- as.numeric(df_train_1$HasCrCard) # Convert to numeric
df_train_1$IsActiveMember <- as.numeric(df_train_1$IsActiveMember) # Convert to numeric
df_test_1$Geography <- as.numeric(df_test_1$Geography) # Convert to numeric
df_test_1$Gender <- as.numeric(df_test_1$Gender) # Convert to numeric
df_test_1$HasCrCard <- as.numeric(df_test_1$HasCrCard) # Convert to numeric
df_test_1$IsActiveMember <- as.numeric(df_test_1$IsActiveMember) # Convert to numeric
str(df_train_1)
## 'data.frame': 7030 obs. of 12 variables:
## $ CreditScore : num -0.326 -0.44 -2.84 -1.268 -1.589 ...
## $ Age : num 0.293 0.198 -0.946 -0.755 -1.423 ...
## $ Tenure : num -1.042 -1.387 -0.35 0.341 -0.696 ...
## $ Balance : num -1.226 0.117 0.618 0.409 -1.226 ...
## $ NumOfProducts : num -0.912 -0.912 4.246 0.808 0.808 ...
## $ EstimatedSalary: num 0.022 0.217 0.335 -0.346 -0.412 ...
## $ Geography : num 1 3 2 1 3 1 1 3 2 3 ...
## $ Gender : num 1 1 1 2 2 1 1 1 2 2 ...
## $ HasCrCard : num 2 1 2 1 2 2 1 2 1 1 ...
## $ IsActiveMember : num 2 2 1 1 1 1 1 2 2 1 ...
## $ Exited : int 1 0 1 0 0 0 0 0 0 0 ...
## $ CustomerId : int 15634602 15647311 15656148 15767821 15737173 15632264 15691483 15600882 15643966 15661507 ...
str(df_test_1)
## 'data.frame': 2968 obs. of 12 variables:
## $ CreditScore : num -1.5369 0.5012 -0.0575 0.0253 -1.0506 ...
## $ Age : num 0.29348 0.00742 0.48419 1.81915 -1.42289 ...
## $ Tenure : num 1.03 -1.39 1.03 -1.39 1.38 ...
## $ Balance : num 1.333 -1.226 0.597 0.899 -1.226 ...
## $ NumOfProducts : num 2.527 0.808 0.808 -0.912 0.808 ...
## $ EstimatedSalary: num 0.241 -0.109 0.864 -1.652 -1.49 ...
## $ Geography : num 1 1 3 2 3 1 3 1 2 2 ...
## $ Gender : num 1 1 2 2 1 2 1 2 2 1 ...
## $ HasCrCard : num 2 1 2 2 2 2 2 1 2 2 ...
## $ IsActiveMember : num 1 1 1 1 2 2 1 2 2 2 ...
## $ Exited : int 1 0 1 1 0 0 1 0 0 0 ...
## $ CustomerId : int 15619304 15701354 15574012 15737452 15788218 15577657 15699309 15738191 15736816 15728693 ...
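Coercing the factors to 1/2/3 codes (above) imposes an artificial ordering on Geography when KNN computes distances; an alternative would be to one-hot encode the factors first. A minimal sketch using model.matrix() (illustrative only - the coded version above is what actually feeds the KNN model below):
# One-hot encode the factor columns instead of using 1/2/3 integer codes,
# so KNN distances do not treat Geography as an ordered quantity
factor_cols <- c("Geography", "Gender", "HasCrCard", "IsActiveMember")
onehot_train <- model.matrix(~ . - 1, data = df_no_Exited[ind == 1, factor_cols])
head(onehot_train)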
#KNN model with k=5
KNN_Exited_Pred <- knn(train = df_train_1, test = df_test_1, cl = df.trainLabels, k=5)
KNN_Exited_Pred
## [1] 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
## [38] 0 0 0 1 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 1 0 0 0 1 0 0 0 0
## [75] 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 1 0 0 0 1 0 0
## [112] 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
## [149] 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 1 1 0 0 0 0 0 0 0 0 0 0 0 0 0
## [186] 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 1 0 0 0 0 1 0 0 0 0 0 0
## [223] 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
## [260] 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0
## [297] 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
## [334] 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
## [371] 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
## [408] 0 0 0 1 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0
## [445] 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
## [482] 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 1 1 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0
## [519] 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1
## [556] 0 0 0 0 0 0 0 0 0 0 1 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
## [593] 0 1 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0
## [630] 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 1 0 0 0 1 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 1 0
## [667] 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0
## [704] 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0
## [741] 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
## [778] 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
## [815] 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 1 1 0 0 0
## [852] 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 1 0
## [889] 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
## [926] 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 1 0 0 0 0 1 0 0 0 0 0 0 0 0 1 0 0 0
## [963] 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 1 0 0 0 0 0 0 0 1 1 0 0 0 0 0 0 0 0 0 0
## [1000] 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
## [1037] 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0
## [1074] 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
## [1111] 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 1 0 0 0 0 0 0 0 0
## [1148] 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
## [1185] 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0
## [1222] 0 0 0 0 0 0 0 0 0 0 0 1 1 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
## [1259] 0 0 0 0 0 0 0 0 1 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0
## [1296] 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 1 0 0 1 0 0 0 0 0 0
## [1333] 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
## [1370] 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
## [1407] 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 1 0 0 0
## [1444] 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
## [1481] 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 1 0 0 0 1 0 0 0
## [1518] 0 0 1 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0
## [1555] 0 0 0 0 0 1 0 0 0 1 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
## [1592] 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0
## [1629] 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0
## [1666] 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
## [1703] 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 1 0 0 0 0 0 0 0
## [1740] 1 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
## [1777] 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
## [1814] 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0
## [1851] 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 1 0 0
## [1888] 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
## [1925] 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 1 0 0 0 0 0 0 0 1 0 0 0 0
## [1962] 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0
## [1999] 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
## [2036] 0 0 0 0 0 1 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0
## [2073] 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0
## [2110] 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0
## [2147] 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
## [2184] 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
## [2221] 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0
## [2258] 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
## [2295] 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
## [2332] 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
## [2369] 0 0 0 0 0 0 0 0 1 0 0 0 0 0 1 0 0 0 1 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0
## [2406] 1 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0
## [2443] 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
## [2480] 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 1 0 0 0 0 0
## [2517] 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
## [2554] 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 1 0
## [2591] 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
## [2628] 0 0 1 0 0 0 1 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1
## [2665] 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
## [2702] 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
## [2739] 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 1 0 0 0
## [2776] 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
## [2813] 0 0 0 0 0 0 0 1 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
## [2850] 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
## [2887] 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
## [2924] 0 0 0 0 0 1 0 0 0 1 0 0 0 0 0 0 0 0 0 0 1 0 1 0 1 1 0 0 0 0 0 0 0 0 0 0 0
## [2961] 0 0 0 0 0 0 0 0
## Levels: 0 1
# Confusion Matrix for KNN Model
table(KNN_Exited_Pred, df.testLabels)
## df.testLabels
## KNN_Exited_Pred 0 1
## 0 2206 589
## 1 139 34
# Calculate testing error for KNN Model
1-mean(KNN_Exited_Pred==df.testLabels)
## [1] 0.245283
# Calculate Precision Score for KNN Model
31/(31+146)
## [1] 0.1751412
The KNN classifier performs worse than the logistic regression model, with a higher error rate and a lower precision score. An error rate of roughly 0.24 suggests the KNN classifier is a reasonable preliminary design, but further analysis is needed to identify predictor variables that have minimal influence on the response variable. Predictors with minimal impact on the KNN classifier should be removed or down-weighted to prevent the model from overfitting to noise in the data set; this step benefits both logistic regression and KNN models. Shrinkage methods, such as lasso and ridge, and dimension reduction techniques, such as principal component analysis, can be used for this purpose.
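One concrete refinement along these lines is to let cross-validation pick k and to feed KNN principal components of the numeric predictors instead of the raw columns; a hedged sketch using caret::train (the preprocessing and tuning grid are illustrative assumptions, not part of the assignment):
# Cross-validated choice of k on the six numeric predictors, with PCA preprocessing
set.seed(456)
knn_cv <- train(x = df_train_1[, c("CreditScore", "Age", "Tenure", "Balance",
                                   "NumOfProducts", "EstimatedSalary")],
                y = factor(df.trainLabels),
                method = "knn",
                preProcess = "pca",
                trControl = trainControl(method = "cv", number = 5),
                tuneGrid = data.frame(k = seq(3, 25, by = 2)))
knn_cv$bestTune # k selected by cross-validation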
Models = c("KNN k=5","Logistic Reg")
Error.Rate = c("0.2402291", "0.1879374")
Precision = c(0.1751412,0.210784314)
Precision = round(Precision, 3)
text_tbl = data.frame(Models, Error.Rate, Precision)
kable(text_tbl) %>%
kable_styling(bootstrap_options = "striped", full_width = F)
| Models | Error.Rate | Precision |
|---|---|---|
| KNN k=5 | 0.2402291 | 0.175 |
| Logistic Reg | 0.1879374 | 0.211 |
df_train <- na.omit(df_train)
df_test <- na.omit(df_test)
str(df_combined)
## 'data.frame': 9998 obs. of 13 variables:
## $ CreditScore : num -0.3264 -0.4402 -1.5369 0.5012 -0.0575 ...
## $ Age : num 0.29348 0.19813 0.29348 0.00742 0.48419 ...
## $ Tenure : num -1.04 -1.39 1.03 -1.39 1.03 ...
## $ Balance : num -1.226 0.117 1.333 -1.226 0.597 ...
## $ NumOfProducts : num -0.912 -0.912 2.527 0.808 0.808 ...
## $ EstimatedSalary: num 0.022 0.217 0.241 -0.109 0.864 ...
## $ Geography : Factor w/ 3 levels "France","Germany",..: 1 3 1 1 3 2 1 3 1 1 ...
## $ Gender : Factor w/ 2 levels "Female","Male": 1 1 1 1 2 1 2 2 1 1 ...
## $ HasCrCard : Factor w/ 2 levels "0","1": 2 1 2 1 2 2 1 2 2 1 ...
## $ IsActiveMember : Factor w/ 2 levels "0","1": 2 2 1 1 1 1 1 1 1 1 ...
## $ Exited : int 1 0 1 0 1 1 0 0 0 0 ...
## $ CustomerId : int 15634602 15647311 15619304 15701354 15574012 15656148 15767821 15737173 15632264 15691483 ...
## $ RowNumber : int 1 2 3 4 6 8 11 12 13 14 ...
## - attr(*, "na.action")= 'omit' Named int [1:4] 5 7 9 10
## ..- attr(*, "names")= chr [1:4] "5" "7" "9" "10"
str(df_train)
## 'data.frame': 6997 obs. of 13 variables:
## $ CreditScore : num 0.222 -0.264 -2.147 1.96 -1.547 ...
## $ Age : num 1.342 1.247 0.484 -0.851 -0.469 ...
## $ Tenure : num 1.379 -1.042 -1.042 1.033 -0.696 ...
## $ Balance : num 1.4892 0.0477 -1.2259 0.9584 0.501 ...
## $ NumOfProducts : num 4.246 -0.912 -0.912 -0.912 -0.912 ...
## $ EstimatedSalary: num 0.821 -0.269 1.027 -0.799 -1.579 ...
## $ Geography : Factor w/ 3 levels "France","Germany",..: 1 1 1 1 1 1 3 1 3 1 ...
## $ Gender : Factor w/ 2 levels "Female","Male": 1 1 2 2 1 1 1 2 2 2 ...
## $ HasCrCard : Factor w/ 2 levels "0","1": 2 2 2 2 2 2 2 2 2 2 ...
## $ IsActiveMember : Factor w/ 2 levels "0","1": 2 2 1 1 1 2 1 1 2 1 ...
## $ Exited : int 1 0 0 0 0 0 1 0 0 0 ...
## $ CustomerId : int 15704442 15607993 15635502 15631912 15788539 15714680 15749905 15713949 15786454 15619699 ...
## $ RowNumber : int 2463 2511 8718 2986 1842 9334 3371 4761 6746 9819 ...
## - attr(*, "na.action")= 'omit' Named int [1:4] 2423 4955 6027 6499
## ..- attr(*, "names")= chr [1:4] "5" "9" "7" "10"
str(df_test)
## 'data.frame': 3001 obs. of 13 variables:
## $ CreditScore : num -0.0575 -0.6575 -0.1506 -1.4541 0.1908 ...
## $ Age : num 0.4842 0.5795 -0.6601 -0.0879 0.6749 ...
## $ Tenure : num 1.033 0.341 1.033 -0.35 -0.696 ...
## $ Balance : num 0.597 -1.226 -1.226 -1.226 -1.226 ...
## $ NumOfProducts : num 0.808 -0.912 0.808 -0.912 0.808 ...
## $ EstimatedSalary: num 0.864 1.019 0.669 0.327 -1.593 ...
## $ Geography : Factor w/ 3 levels "France","Germany",..: 3 3 3 3 1 2 2 3 3 3 ...
## $ Gender : Factor w/ 2 levels "Female","Male": 2 2 1 1 2 1 2 1 1 2 ...
## $ HasCrCard : Factor w/ 2 levels "0","1": 2 1 2 2 1 2 1 2 2 1 ...
## $ IsActiveMember : Factor w/ 2 levels "0","1": 1 1 1 1 2 2 1 2 2 2 ...
## $ Exited : int 1 0 0 1 0 0 0 0 0 0 ...
## $ CustomerId : int 15574012 15661507 15597945 15699309 15725737 15728693 15750181 15659428 15732963 15788448 ...
## $ RowNumber : int 6 19 22 23 24 29 33 34 35 37 ...
df_train$Exited <- as.factor(df_train$Exited)
# Create random forest model for df train data set
customer_churn.rf <- randomForest(Exited ~ . , data = df_train[,1:11], importance = TRUE, proximity = TRUE)
print(customer_churn.rf)
##
## Call:
## randomForest(formula = Exited ~ ., data = df_train[, 1:11], importance = TRUE, proximity = TRUE)
## Type of random forest: classification
## Number of trees: 500
## No. of variables tried at each split: 3
##
## OOB estimate of error rate: 14.08%
## Confusion matrix:
## 0 1 class.error
## 0 5362 209 0.03751571
## 1 776 650 0.54417952
# Identify which variables had the largest impact on predicting customer churn using the random forest variable importance measures
head(customer_churn.rf$importance)
## 0 1 MeanDecreaseAccuracy
## CreditScore 8.251762e-04 0.0006629100 0.0007921369
## Age 3.276526e-02 0.1162813354 0.0497744729
## Tenure 4.231815e-04 0.0009541907 0.0005352457
## Balance 1.306367e-02 0.0338546369 0.0173029999
## NumOfProducts 4.016887e-02 0.0996394896 0.0522884944
## EstimatedSalary 2.289532e-05 -0.0015959035 -0.0003026631
## MeanDecreaseGini
## CreditScore 303.8454
## Age 524.1078
## Tenure 173.4943
## Balance 315.2832
## NumOfProducts 296.7886
## EstimatedSalary 309.0664
# Grab OOB error matrix
err <- customer_churn.rf$err.rate
plot(customer_churn.rf)
# Add a legend, since the error-rate plot does not include one by default
legend(x = "right", legend = colnames(err), fill = 1:ncol(err))
# Calculate prediction performance using df_test data
pred.rf <- predict(object = customer_churn.rf, newdata = df_test[,1:11], type = "class")
# Calculate the confusion matrix
confusionMatrix(data = pred.rf, reference = factor(df_test$Exited))
## Confusion Matrix and Statistics
##
## Reference
## Prediction 0 1
## 0 2311 328
## 1 78 284
##
## Accuracy : 0.8647
## 95% CI : (0.852, 0.8768)
## No Information Rate : 0.7961
## P-Value [Acc > NIR] : < 2.2e-16
##
## Kappa : 0.5087
##
## Mcnemar's Test P-Value : < 2.2e-16
##
## Sensitivity : 0.9674
## Specificity : 0.4641
## Pos Pred Value : 0.8757
## Neg Pred Value : 0.7845
## Prevalence : 0.7961
## Detection Rate : 0.7701
## Detection Prevalence : 0.8794
## Balanced Accuracy : 0.7157
##
## 'Positive' Class : 0
##
# Calculate testing error (note - the accuracy rate above and the error rate below sum to 1)
1- mean(df_test$Exited==pred.rf)
## [1] 0.1352882
# Calculate precision score for pred.rf
84/(84+282)
## [1] 0.2295082
The random forest model performed strongly, with a lower error rate (0.138) than both the logistic regression and KNN models. CreditScore, Age, NumOfProducts, EstimatedSalary, Balance, and Tenure were among the most important variables for improving the accuracy of the customer churn predictions. The random forest model outperformed both the logistic regression and KNN classifier models thanks to bootstrap aggregation and the random selection of variables considered for splitting at each decision node. The randomness behind the model, coupled with the diversity of training data underlying each of the 500 individual trees, partially offsets any adverse impact of the class imbalance in the $Exited response variable. The model could be enhanced, and the error rate lowered further, through boosting, which makes each of the individual trees in the ensemble learn from and adapt to the previous model's errors in a sequential manner.
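A minimal boosting sketch follows, assuming the gbm package is installed (it is not among the libraries loaded above); the tuning values are illustrative defaults rather than recommendations:
library(gbm) # assumed installed; not loaded at the top of this report
set.seed(123)
# gbm expects a 0/1 numeric response for the bernoulli loss, not a factor
gbm_train <- df_train[, 1:11]
gbm_train$Exited <- as.numeric(as.character(gbm_train$Exited))
# Boosting grows trees sequentially, each one fit to the errors of the ensemble so far
churn_gbm <- gbm(Exited ~ ., data = gbm_train, distribution = "bernoulli",
                 n.trees = 500, interaction.depth = 3, shrinkage = 0.05, cv.folds = 5)
best_iter <- gbm.perf(churn_gbm, method = "cv") # CV-selected number of trees
gbm_prob <- predict(churn_gbm, newdata = df_test[, 1:11], n.trees = best_iter, type = "response")
gbm_pred <- ifelse(gbm_prob > 0.5, 1, 0)
table(gbm_pred, df_test$Exited)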
Models = c("KNN k=5","Logistic Reg", "Random Forest")
Error.Rate = c("0.2402291","0.1879374","0.137954")
Precision = c(0.1751412,0.210784314,0.2295082)
Precision = round(Precision, 3)
text_tbl = data.frame(Models, Error.Rate, Precision)
kable(text_tbl) %>%
kable_styling(bootstrap_options = "striped", full_width = F)
| Models | Error.Rate | Precision |
|---|---|---|
| KNN k=5 | 0.2402291 | 0.175 |
| Logistic Reg | 0.1879374 | 0.211 |
| Random Forest | 0.137954 | 0.230 |
# Create SVM model using the training data set
svm_model <- svm(Exited ~ ., data = df_train, type = "C-classification", kernel = "radial", scale = FALSE)
svm_model
##
## Call:
## svm(formula = Exited ~ ., data = df_train, type = "C-classification",
##     kernel = "radial", scale = FALSE)
##
##
## Parameters:
## SVM-Type: C-classification
## SVM-Kernel: radial
## cost: 1
##
## Number of Support Vectors: 6996
# Compute SVM model test accuracy [0.796068] and error rate [0.203932]
pred_test <- predict(svm_model, df_test)
mean(pred_test == df_test$Exited)
## [1] 0.796068
1 - (mean(pred_test == df_test$Exited))
## [1] 0.203932
# Compute SVM model precision score
282/(282+84)
## [1] 0.7704918
While the SVM model performs well, with an error rate of 0.204, the random forest model demonstrates a higher level of predictive accuracy on the test data. Note that the precision score for the SVM is 0.770, an improvement over the logistic regression, KNN classifier, and random forest models. The SVM produces a higher precision score given the high class imbalance in the response variable ($Exited). SVM models handle class imbalance better than the other models because they fit the hyperplane, the boundary that separates the classes, to the support vectors (the training observations positioned closest to the hyperplane) rather than to all of the training data. The SVM model could be further refined, with the objective of reducing test error, by tuning the "cost" penalty parameter to narrow the model's acceptable margin between the support vectors and the hyperplane boundary.
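A sketch of that refinement using e1071::tune.svm, which grid-searches the cost (and gamma) parameters with cross-validation; the grid values below are illustrative assumptions:
# 10-fold CV grid search; a larger cost narrows the soft margin around the hyperplane
set.seed(123)
svm_tuned <- tune.svm(Exited ~ ., data = df_train, type = "C-classification",
                      gamma = c(0.05, 0.1), cost = c(1, 10, 100))
summary(svm_tuned)
best_svm <- svm_tuned$best.model
pred_best <- predict(best_svm, df_test)
mean(pred_best == df_test$Exited) # test accuracy of the tuned SVM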
Models = c("KNN k=5","Logistic Reg", "Random Forest", "SVM")
Error.Rate = c("0.2402291","0.1879374","0.137954","0.203932")
Precision = c(0.1751412,0.210784314,0.2295082,0.7704918)
Precision = round(Precision, 3)
text_tbl = data.frame(Models, Error.Rate, Precision)
kable(text_tbl) %>%
kable_styling(bootstrap_options = "striped", full_width = F)
| Models | Error.Rate | Precision |
|---|---|---|
| KNN k=5 | 0.2402291 | 0.175 |
| Logistic Reg | 0.1879374 | 0.211 |
| Random Forest | 0.137954 | 0.230 |
| SVM | 0.203932 | 0.770 |
(10 points) Based on the four classification models, which one do
you think is the best model to predict y? Please consider
the following in your response:
The first two predictive models constructed to predict customer churn for the bank are random forest and SVM. See the table below for a detailed score report of all 4 models constructed and considered in this assignment. Random forest has the lowest error rate (0.138), indicating that it makes the fewest wrong predictions. The precision score (0.230) for the random forest model is also reasonable, suggesting that the model performs adequately in predicting true positive outcomes on the test data. The SVM has the highest precision score of the 4 models (0.770), suggesting that it is best suited to reducing false-positive predictions given the underlying test data. The SVM error rate is higher than the random forest model's, but this is not overly significant given the potential cost to the bank of wrongly predicting that a customer will churn (leave) the bank. For every customer the bank loses, it must expend resources to acquire new customers. As such, maintaining a loyal customer base, and predicting which customers are likely to remain loyal given a variety of attributes, is important for the bank. As the bank budgets promotional activities to retain potentially loyal customers, it is important to correctly identify (predict) which customers will remain loyal, to eliminate excess spending on customers who are, based on select attributes, more likely to churn. The SVM model's high precision score is a valuable benchmark of model performance in this context.
The next two predictive models considered are logistic regression and the KNN classifier. The logistic regression model offers only moderate performance across all metrics. While the model is interpretable and works well, it underperforms compared to the random forest and SVM models; with certain adjustments it has potential for improvement. KNN has the highest error rate and the lowest precision score, making it the weakest predictor. This is not surprising: given the class imbalance in the response variable ($Exited), the KNN classifier performs worse against the underlying test data than models such as SVM or random forest, because it is prone to being influenced by outliers and the dominant class. Further adjustments would improve model performance.
Models = c("KNN k=5","Logistic Reg", "Random Forest", "SVM")
Error.Rate = c("0.2402291","0.1879374","0.137954","0.203932")
Precision = c(0.1751412,0.210784314,0.2295082,0.7704918)
Precision = round(Precision, 3)
text_tbl = data.frame(Models, Error.Rate, Precision)
kable(text_tbl) %>%
kable_styling(bootstrap_options = "striped", full_width = F)
| Models | Error.Rate | Precision |
|---|---|---|
| KNN k=5 | 0.2402291 | 0.175 |
| Logistic Reg | 0.1879374 | 0.211 |
| Random Forest | 0.137954 | 0.230 |
| SVM | 0.203932 | 0.770 |
- Do you think you can improve the model by adding any other information?
The SVM model can be improved by tuning the "cost" penalty parameter to narrow the acceptable margin between the support vectors and the hyperplane decision boundary, lowering test error for the SVM predictive model. The logistic regression model could benefit from dimension reduction and shrinkage techniques to reduce the chance that it is overfitted to the underlying training data; while this would introduce an element of bias, the model's interpretability and its test error against the underlying test data would improve. The KNN model (and the logistic regression model) could benefit from SMOTE, weighted classes, or other techniques focused on ensuring the minority $Exited class is adequately represented in the training and testing data sets, further decreasing test error for both models. Finally, the random forest model could benefit from boosting, which lowers test error by ensuring that each subsequent tree in the "forest" is trained to mitigate the errors expressed in the previous model's outcomes. A neural network model was not considered in this assignment, but such a model also has the potential to predict churn accurately if properly trained, using multiple hidden layers and non-linear activation functions to capture complex, real-world relationships.
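As one concrete illustration of the shrinkage idea above, a lasso-penalized logistic regression could be fit with the glmnet package (assumed installed; it is not loaded above). cv.glmnet chooses the penalty by cross-validation and shrinks uninformative coefficients toward zero:
library(glmnet) # assumed installed; not among the libraries loaded above
# model.matrix expands the factors to dummy columns and drops the intercept
x_train <- model.matrix(Exited ~ . - 1, data = df_train[, 1:11])
y_train <- df_train$Exited
set.seed(123)
lasso_cv <- cv.glmnet(x_train, y_train, family = "binomial", alpha = 1) # alpha = 1 -> lasso
coef(lasso_cv, s = "lambda.min") # coefficients that survive the penalty
x_test <- model.matrix(Exited ~ . - 1, data = df_test[, 1:11])
lasso_pred <- predict(lasso_cv, newx = x_test, s = "lambda.min", type = "class")
mean(lasso_pred == df_test$Exited) # test accuracy under the lasso fit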
Upon conclusion of this assignment, I have come to the realization that a predictive model is only as good as its underlying assumptions and data. This requires knowledge of the steps involved in inspecting the underlying data set, cleaning it, and performing EDA to develop an initial understanding of the data (once cleansed) and of the predictive models best suited to accurately predicting outcomes. Upon completing this preliminary stage, I was able to further develop my understanding of logistic regression, the KNN classifier, SVM, and tree-based models (particularly random forest). Using the Bank Customer Churn data set for all 4 models allowed me to better understand the similarities and differences among them, both in construction and in evaluating results. By conducting this project in RStudio, I gained hands-on experience using a versatile programming language to apply statistical concepts in creating the 4 models, and a deeper knowledge of how to code using a variety of new packages, such as e1071. I look forward to applying the lessons learned in this project to future professional and academic efforts.