#Data source: https://www.kaggle.com/datasets/adammaus/predicting-churn-for-bank-customers?resource=download&select=Churn_Modelling.csv
library(openxlsx)
setwd("C:/Users/Korisnik/Desktop/Homework_3_RMT")
BankData <- read.xlsx("Churn_Modelling_Excel.xlsx")
BankData$HasCrCard = as.factor(BankData$HasCrCard)
BankData$IsActiveMember = as.factor(BankData$IsActiveMember)
BankData$Exited = as.factor(BankData$Exited)
BankData$Gender = as.factor(BankData$Gender)
#Creating dummy variables without intercepts for each dummy variable
Geography_dummy <- model.matrix(~ Geography - 1, data = BankData)
colnames(Geography_dummy) <- gsub("Country", "", colnames(Geography_dummy))
BankData <- cbind(BankData, Geography_dummy)
#Removing the original variable
BankData <- BankData[, !(names(BankData) %in% c("Geography"))]
head(BankData)
## RowNumber CustomerId Surname CreditScore Gender Age Tenure Balance
## 1 1 15634602 Hargrave 619 Female 42 2 0.00
## 2 2 15647311 Hill 608 Female 41 1 83807.86
## 3 3 15619304 Onio 502 Female 42 8 159660.80
## 4 4 15701354 Boni 699 Female 39 1 0.00
## 5 5 15737888 Mitchell 850 Female 43 2 125510.82
## 6 6 15574012 Chu 645 Male 44 8 113755.78
## NumOfProducts HasCrCard IsActiveMember EstimatedSalary Exited GeographyFrance
## 1 1 1 1 101348.88 1 1
## 2 1 0 1 112542.58 0 0
## 3 3 1 0 113931.57 1 1
## 4 2 0 0 93826.63 0 1
## 5 1 1 1 79084.10 0 0
## 6 2 1 0 149756.71 1 0
## GeographyGermany GeographySpain
## 1 0 0
## 2 0 1
## 3 0 0
## 4 0 0
## 5 0 1
## 6 0 1
#Descriptive statistics:
#Dataset consists of 10000 observations and 13 variables.
#Dummy variables were added (3) - GeographyFrance, GeographyGermany, GeographySpain
#According to skewness and kurtosis thresholds (-2 / +2), distributions of variables are approximately normal
#The genders are relatively balanced, with 45.43% females and 54.57% males
#The age range is between 18 and 92 years
#According to the means of dummy variables, the most clients are from France, while there is similar number of clients from Germany and Spain in comparison to each other
#When it comes to the dependent variable, there are 79.63% of clients who stayed and 20.37% of clients who exited
library(psych)
describe(BankData)
## vars n mean sd median trimmed
## RowNumber 1 10000 5000.50 2886.90 5000.50 5000.50
## CustomerId 2 10000 15690940.57 71936.19 15690738.00 15690938.68
## Surname* 3 10000 1508.78 846.20 1543.00 1512.94
## CreditScore 4 10000 650.53 96.65 652.00 651.01
## Gender* 5 10000 1.55 0.50 2.00 1.56
## Age 6 10000 38.92 10.49 37.00 37.91
## Tenure 7 10000 5.01 2.89 5.00 5.01
## Balance 8 10000 76485.89 62397.41 97198.54 74827.80
## NumOfProducts 9 10000 1.53 0.58 1.00 1.49
## HasCrCard* 10 10000 1.71 0.46 2.00 1.76
## IsActiveMember* 11 10000 1.52 0.50 2.00 1.52
## EstimatedSalary 12 10000 100090.24 57510.49 100193.91 100114.86
## Exited* 13 10000 1.20 0.40 1.00 1.13
## GeographyFrance 14 10000 0.50 0.50 1.00 0.50
## GeographyGermany 15 10000 0.25 0.43 0.00 0.19
## GeographySpain 16 10000 0.25 0.43 0.00 0.18
## mad min max range skew kurtosis se
## RowNumber 3706.50 1.00 10000.0 9999.0 0.00 -1.20 28.87
## CustomerId 92562.42 15565701.00 15815690.0 249989.0 0.00 -1.20 719.36
## Surname* 1085.26 1.00 2932.0 2931.0 -0.02 -1.20 8.46
## CreditScore 99.33 350.00 850.0 500.0 -0.07 -0.43 0.97
## Gender* 0.00 1.00 2.0 1.0 -0.18 -1.97 0.00
## Age 8.90 18.00 92.0 74.0 1.01 1.39 0.10
## Tenure 2.97 0.00 10.0 10.0 0.01 -1.17 0.03
## Balance 69336.44 0.00 250898.1 250898.1 -0.14 -1.49 623.97
## NumOfProducts 0.00 1.00 4.0 3.0 0.75 0.58 0.01
## HasCrCard* 0.00 1.00 2.0 1.0 -0.90 -1.19 0.00
## IsActiveMember* 0.00 1.00 2.0 1.0 -0.06 -2.00 0.00
## EstimatedSalary 72941.18 11.58 199992.5 199980.9 0.00 -1.18 575.10
## Exited* 0.00 1.00 2.0 1.0 1.47 0.16 0.00
## GeographyFrance 0.00 0.00 1.0 1.0 -0.01 -2.00 0.01
## GeographyGermany 0.00 0.00 1.0 1.0 1.15 -0.68 0.00
## GeographySpain 0.00 0.00 1.0 1.0 1.17 -0.63 0.00
summary(BankData)
## RowNumber CustomerId Surname CreditScore
## Min. : 1 Min. :15565701 Length:10000 Min. :350.0
## 1st Qu.: 2501 1st Qu.:15628528 Class :character 1st Qu.:584.0
## Median : 5000 Median :15690738 Mode :character Median :652.0
## Mean : 5000 Mean :15690941 Mean :650.5
## 3rd Qu.: 7500 3rd Qu.:15753234 3rd Qu.:718.0
## Max. :10000 Max. :15815690 Max. :850.0
## Gender Age Tenure Balance NumOfProducts
## Female:4543 Min. :18.00 Min. : 0.000 Min. : 0 Min. :1.00
## Male :5457 1st Qu.:32.00 1st Qu.: 3.000 1st Qu.: 0 1st Qu.:1.00
## Median :37.00 Median : 5.000 Median : 97199 Median :1.00
## Mean :38.92 Mean : 5.013 Mean : 76486 Mean :1.53
## 3rd Qu.:44.00 3rd Qu.: 7.000 3rd Qu.:127644 3rd Qu.:2.00
## Max. :92.00 Max. :10.000 Max. :250898 Max. :4.00
## HasCrCard IsActiveMember EstimatedSalary Exited GeographyFrance
## 0:2945 0:4849 Min. : 11.58 0:7963 Min. :0.0000
## 1:7055 1:5151 1st Qu.: 51002.11 1:2037 1st Qu.:0.0000
## Median :100193.91 Median :1.0000
## Mean :100090.24 Mean :0.5014
## 3rd Qu.:149388.25 3rd Qu.:1.0000
## Max. :199992.48 Max. :1.0000
## GeographyGermany GeographySpain
## Min. :0.0000 Min. :0.0000
## 1st Qu.:0.0000 1st Qu.:0.0000
## Median :0.0000 Median :0.0000
## Mean :0.2509 Mean :0.2477
## 3rd Qu.:1.0000 3rd Qu.:0.0000
## Max. :1.0000 Max. :1.0000
#ResearchQuestion1: Is there a significant relation between gender and being an active member at our bank?
#Chi-squared test was conducted, given the categorical nature of both IsActiveMember and Gender variables
#Chi-squared test was statistically significant at the level p<0.05, with the value of 4.9923
#Comparing the observed and expected frequencies, there are only 56.1 discrepancies among them - particularly 56.1 more females were estimated to be active members, and 56.1 males were estimated to be inactive
#Cramer's V statistic for effect size has shown that with the size of 0.02 the effect is tiny
#Odd's ratio statistic for effect size has shown that the ratio is 1.09 and thus very small
#In conclusion, the chi-squared test found that the association between the gender and activity of the members is significant and exists, with male members being more active in general, however the measures of effect size show that the relation is weak
barplot(table(BankData$IsActiveMember, BankData$Gender),
beside = TRUE,
col = c("pink", "lightblue"),
legend = TRUE,
names.arg = c("Female", "Male"),
main = "Gender vs. Activity",
xlab = "Activity",
ylab = "Count")

chi_square <- chisq.test(BankData$IsActiveMember, BankData$Gender,
correct = TRUE)
chi_square
##
## Pearson's Chi-squared test with Yates' continuity correction
##
## data: BankData$IsActiveMember and BankData$Gender
## X-squared = 4.9923, df = 1, p-value = 0.02546
addmargins(chi_square$observed)
## BankData$Gender
## BankData$IsActiveMember Female Male Sum
## 0 2259 2590 4849
## 1 2284 2867 5151
## Sum 4543 5457 10000
addmargins(round(chi_square$expected, 2))
## BankData$Gender
## BankData$IsActiveMember Female Male Sum
## 0 2202.9 2646.1 4849
## 1 2340.1 2810.9 5151
## Sum 4543.0 5457.0 10000
library(effectsize)
##
## Attaching package: 'effectsize'
## The following object is masked from 'package:psych':
##
## phi
effectsize::cramers_v(BankData$IsActiveMember, BankData$Gender)
## Cramer's V (adj.) | 95% CI
## --------------------------------
## 0.02 | [0.00, 1.00]
##
## - One-sided CIs: upper bound fixed at [1.00].
interpret_cramers_v(0.02)
## [1] "tiny"
## (Rules: funder2019)
oddsratio(BankData$IsActiveMember, BankData$Gender)
## Odds ratio | 95% CI
## -------------------------
## 1.09 | [1.01, 1.18]
interpret_oddsratio(1.09)
## [1] "very small"
## (Rules: chen2010)
#ResearchQuestion2: Can we predict whether the client exited our bank based on the available variables?
#Available variables: Client's credit score, Gender, Age, How long were they with us,...
#...Balance of their account in the end, Number of our products they've been using,...
#...Whether they have a credit card, Whether they were an active member, Their estimated salary, Their country of origin
#The fit0 binary logistic function without the explanatory variables shows that the estimate is equal to -1.36333 (significant at p<0.001), which when transformed as an exponent gives us 0.255 - this means that the odds of members exiting are about 1/0.255 times of odds of members staying with our bank (this was apparent from the descriptive statistics as well - 7963/2037)
#The vif values are low, all below 2, indicating that there is no multicolinearity between the predictors
#The analysis of standardized residuals shows that there are no outliers with residuals below the value of -3 or above the value of 3
#The analysis of influential points similarly shows that there are no influential points above the value of 1, and there are no significant gaps noticed
#The analysis of deviance is highly statistically significant (p<0.001), indicating that the more complex model with all of our predictors is better at predicting the dependent variable than the fit0 model with just the intercept
#The summary of the fit1 model shows that out of the analyzed predictors, the significant ones were: CreditScore, Gender, Age, Balance, NumOfProducts, IsActiveMember, Geography (Germany)
#The ratio/odds (RO) additionally shows the particular times for which the odds ratio would be increased or decreased with the additional one unit of a certain predictor, holding all the others constant
#The interpretation of results is that members that are: with lower credit score, female, older, with higher account balance, with the usage of fewer of our products, inactive, and from Germany - are significantly more likely to exit our bank than the others
#We could perhaps use these results to lower the churn rates by focusing on improving the experiences of these members with our bank
fit0 <- glm(Exited ~ 1,
family = binomial,
data = BankData)
summary(fit0)
##
## Call:
## glm(formula = Exited ~ 1, family = binomial, data = BankData)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -0.675 -0.675 -0.675 -0.675 1.784
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -1.36333 0.02483 -54.91 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 10110 on 9999 degrees of freedom
## Residual deviance: 10110 on 9999 degrees of freedom
## AIC: 10112
##
## Number of Fisher Scoring iterations: 4
head(fitted(fit0))
## 1 2 3 4 5 6
## 0.2037 0.2037 0.2037 0.2037 0.2037 0.2037
exp(cbind(odds = fit0$coefficients, confint.default(fit0)))
## odds 2.5 % 97.5 %
## (Intercept) 0.2558081 0.2436573 0.2685648
Pseudo_R2_fit1 <- 7963/10000
Pseudo_R2_fit1
## [1] 0.7963
fit1 <- glm(Exited ~ CreditScore + Gender + Age + Tenure + Balance + NumOfProducts + HasCrCard +
IsActiveMember + EstimatedSalary + GeographyFrance + GeographyGermany,
family = binomial,
data = BankData)
library(car)
## Warning: package 'car' was built under R version 4.0.4
## Loading required package: carData
## Warning: package 'carData' was built under R version 4.0.5
##
## Attaching package: 'car'
## The following object is masked from 'package:psych':
##
## logit
vif(fit1)
## CreditScore Gender Age Tenure
## 1.001410 1.003915 1.081634 1.002863
## Balance NumOfProducts HasCrCard IsActiveMember
## 1.291751 1.079209 1.002089 1.078350
## EstimatedSalary GeographyFrance GeographyGermany
## 1.001414 1.666020 1.881879
BankData$StdResid <- rstandard(fit1)
BankData$Cook <- cooks.distance(fit1)
library(ggplot2)
##
## Attaching package: 'ggplot2'
## The following objects are masked from 'package:psych':
##
## %+%, alpha
StdResid <- ggplot(BankData, aes(x=StdResid)) +
theme_linedraw() +
geom_histogram() +
xlab("Standardized residuals")
Cook <- ggplot(BankData, aes(x=Cook)) +
theme_linedraw() +
geom_histogram() +
xlab("Cook's distance")
library(ggpubr)
ggarrange(StdResid, Cook,
ncol = 2, nrow = 1)
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

head(BankData[order(BankData$StdResid), c("RowNumber", "StdResid")], 5)
## RowNumber StdResid
## 4816 4816 -2.310921
## 9588 9588 -2.255057
## 7693 7693 -2.011095
## 8157 8157 -1.983400
## 9293 9293 -1.935901
head(BankData[order(-BankData$StdResid), c("RowNumber", "StdResid")], 5)
## RowNumber StdResid
## 2142 2142 2.994159
## 9573 9573 2.926443
## 2432 2432 2.863175
## 5216 5216 2.746696
## 4923 4923 2.731139
head(BankData[order(-BankData$Cook), c("RowNumber", "Cook")], 5)
## RowNumber Cook
## 7725 7725 0.001848151
## 3366 3366 0.001761374
## 2093 2093 0.001690396
## 5701 5701 0.001684427
## 4823 4823 0.001652999
#identify which rows are outliers based on their standard residuals being above 3 or below -3
outliers <- which(BankData$StdResid > 3 | BankData$StdResid < -3)
print(outliers)
## integer(0)
#identify which rows are influential based on the Cook's distance being above 1
threshold <- 1
influential_obs <- which(BankData$Cook > threshold)
print(influential_obs)
## integer(0)
fit0 <- glm(Exited ~ 1,
family = binomial,
data = BankData)
fit1 <- glm(Exited ~ CreditScore + Gender + Age + Tenure + Balance + NumOfProducts + HasCrCard +
IsActiveMember + EstimatedSalary + GeographyFrance + GeographyGermany,
family = binomial,
data = BankData)
anova(fit0, fit1, test = "Chi")
## Analysis of Deviance Table
##
## Model 1: Exited ~ 1
## Model 2: Exited ~ CreditScore + Gender + Age + Tenure + Balance + NumOfProducts +
## HasCrCard + IsActiveMember + EstimatedSalary + GeographyFrance +
## GeographyGermany
## Resid. Df Resid. Dev Df Deviance Pr(>Chi)
## 1 9999 10109.8
## 2 9988 8561.4 11 1548.4 < 2.2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
summary(fit1)
##
## Call:
## glm(formula = Exited ~ CreditScore + Gender + Age + Tenure +
## Balance + NumOfProducts + HasCrCard + IsActiveMember + EstimatedSalary +
## GeographyFrance + GeographyGermany, family = binomial, data = BankData)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -2.3097 -0.6589 -0.4560 -0.2697 2.9940
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -3.357e+00 2.493e-01 -13.467 < 2e-16 ***
## CreditScore -6.683e-04 2.803e-04 -2.384 0.0171 *
## GenderMale -5.285e-01 5.449e-02 -9.699 < 2e-16 ***
## Age 7.271e-02 2.576e-03 28.230 < 2e-16 ***
## Tenure -1.595e-02 9.355e-03 -1.705 0.0882 .
## Balance 2.637e-06 5.142e-07 5.128 2.92e-07 ***
## NumOfProducts -1.015e-01 4.713e-02 -2.154 0.0312 *
## HasCrCard1 -4.468e-02 5.934e-02 -0.753 0.4515
## IsActiveMember1 -1.075e+00 5.769e-02 -18.643 < 2e-16 ***
## EstimatedSalary 4.807e-07 4.737e-07 1.015 0.3102
## GeographyFrance -3.522e-02 7.064e-02 -0.499 0.6181
## GeographyGermany 7.395e-01 7.881e-02 9.383 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 10109.8 on 9999 degrees of freedom
## Residual deviance: 8561.4 on 9988 degrees of freedom
## AIC: 8585.4
##
## Number of Fisher Scoring iterations: 5
exp(cbind(RO = fit1$coefficients, confint.default(fit1)))
## RO 2.5 % 97.5 %
## (Intercept) 0.03483742 0.02137238 0.05678573
## CreditScore 0.99933189 0.99878295 0.99988114
## GenderMale 0.58949870 0.52978819 0.65593898
## Age 1.07541433 1.06999942 1.08085664
## Tenure 0.98417741 0.96629674 1.00238894
## Balance 1.00000264 1.00000163 1.00000364
## NumOfProducts 0.90346071 0.82373695 0.99090037
## HasCrCard1 0.95630690 0.85130934 1.07425450
## IsActiveMember1 0.34114633 0.30467627 0.38198190
## EstimatedSalary 1.00000048 0.99999955 1.00000141
## GeographyFrance 0.96539516 0.84057799 1.10874639
## GeographyGermany 2.09488073 1.79503264 2.44481642