#Data source: https://www.kaggle.com/datasets/adammaus/predicting-churn-for-bank-customers?resource=download&select=Churn_Modelling.csv
library(openxlsx)
setwd("C:/Users/Korisnik/Desktop/Homework_3_RMT")
BankData <- read.xlsx("Churn_Modelling_Excel.xlsx")

BankData$HasCrCard = as.factor(BankData$HasCrCard)
BankData$IsActiveMember = as.factor(BankData$IsActiveMember)
BankData$Exited = as.factor(BankData$Exited)
BankData$Gender = as.factor(BankData$Gender)

#Creating dummy variables without intercepts for each dummy variable
Geography_dummy <- model.matrix(~ Geography - 1, data = BankData)
colnames(Geography_dummy) <- gsub("Country", "", colnames(Geography_dummy))
BankData <- cbind(BankData, Geography_dummy)

#Removing the original variable
BankData <- BankData[, !(names(BankData) %in% c("Geography"))]

head(BankData)
##   RowNumber CustomerId  Surname CreditScore Gender Age Tenure   Balance
## 1         1   15634602 Hargrave         619 Female  42      2      0.00
## 2         2   15647311     Hill         608 Female  41      1  83807.86
## 3         3   15619304     Onio         502 Female  42      8 159660.80
## 4         4   15701354     Boni         699 Female  39      1      0.00
## 5         5   15737888 Mitchell         850 Female  43      2 125510.82
## 6         6   15574012      Chu         645   Male  44      8 113755.78
##   NumOfProducts HasCrCard IsActiveMember EstimatedSalary Exited GeographyFrance
## 1             1         1              1       101348.88      1               1
## 2             1         0              1       112542.58      0               0
## 3             3         1              0       113931.57      1               1
## 4             2         0              0        93826.63      0               1
## 5             1         1              1        79084.10      0               0
## 6             2         1              0       149756.71      1               0
##   GeographyGermany GeographySpain
## 1                0              0
## 2                0              1
## 3                0              0
## 4                0              0
## 5                0              1
## 6                0              1
#Descriptive statistics: 
#Dataset consists of 10000 observations and 13 variables.
#Dummy variables were added (3) - GeographyFrance, GeographyGermany, GeographySpain
#According to skewness and kurtosis thresholds (-2 / +2), distributions of variables are approximately normal
#The genders are relatively balanced, with 45.43% females and 54.57% males
#The age range is between 18 and 92 years
#According to the means of dummy variables, the most clients are from France, while there is similar number of clients from Germany and Spain in comparison to each other
#When it comes to the dependent variable, there are 79.63% of clients who stayed and 20.37% of clients who exited


library(psych)
describe(BankData)
##                  vars     n        mean       sd      median     trimmed
## RowNumber           1 10000     5000.50  2886.90     5000.50     5000.50
## CustomerId          2 10000 15690940.57 71936.19 15690738.00 15690938.68
## Surname*            3 10000     1508.78   846.20     1543.00     1512.94
## CreditScore         4 10000      650.53    96.65      652.00      651.01
## Gender*             5 10000        1.55     0.50        2.00        1.56
## Age                 6 10000       38.92    10.49       37.00       37.91
## Tenure              7 10000        5.01     2.89        5.00        5.01
## Balance             8 10000    76485.89 62397.41    97198.54    74827.80
## NumOfProducts       9 10000        1.53     0.58        1.00        1.49
## HasCrCard*         10 10000        1.71     0.46        2.00        1.76
## IsActiveMember*    11 10000        1.52     0.50        2.00        1.52
## EstimatedSalary    12 10000   100090.24 57510.49   100193.91   100114.86
## Exited*            13 10000        1.20     0.40        1.00        1.13
## GeographyFrance    14 10000        0.50     0.50        1.00        0.50
## GeographyGermany   15 10000        0.25     0.43        0.00        0.19
## GeographySpain     16 10000        0.25     0.43        0.00        0.18
##                       mad         min        max    range  skew kurtosis     se
## RowNumber         3706.50        1.00    10000.0   9999.0  0.00    -1.20  28.87
## CustomerId       92562.42 15565701.00 15815690.0 249989.0  0.00    -1.20 719.36
## Surname*          1085.26        1.00     2932.0   2931.0 -0.02    -1.20   8.46
## CreditScore         99.33      350.00      850.0    500.0 -0.07    -0.43   0.97
## Gender*              0.00        1.00        2.0      1.0 -0.18    -1.97   0.00
## Age                  8.90       18.00       92.0     74.0  1.01     1.39   0.10
## Tenure               2.97        0.00       10.0     10.0  0.01    -1.17   0.03
## Balance          69336.44        0.00   250898.1 250898.1 -0.14    -1.49 623.97
## NumOfProducts        0.00        1.00        4.0      3.0  0.75     0.58   0.01
## HasCrCard*           0.00        1.00        2.0      1.0 -0.90    -1.19   0.00
## IsActiveMember*      0.00        1.00        2.0      1.0 -0.06    -2.00   0.00
## EstimatedSalary  72941.18       11.58   199992.5 199980.9  0.00    -1.18 575.10
## Exited*              0.00        1.00        2.0      1.0  1.47     0.16   0.00
## GeographyFrance      0.00        0.00        1.0      1.0 -0.01    -2.00   0.01
## GeographyGermany     0.00        0.00        1.0      1.0  1.15    -0.68   0.00
## GeographySpain       0.00        0.00        1.0      1.0  1.17    -0.63   0.00
summary(BankData)
##    RowNumber       CustomerId         Surname           CreditScore   
##  Min.   :    1   Min.   :15565701   Length:10000       Min.   :350.0  
##  1st Qu.: 2501   1st Qu.:15628528   Class :character   1st Qu.:584.0  
##  Median : 5000   Median :15690738   Mode  :character   Median :652.0  
##  Mean   : 5000   Mean   :15690941                      Mean   :650.5  
##  3rd Qu.: 7500   3rd Qu.:15753234                      3rd Qu.:718.0  
##  Max.   :10000   Max.   :15815690                      Max.   :850.0  
##     Gender          Age            Tenure          Balance       NumOfProducts 
##  Female:4543   Min.   :18.00   Min.   : 0.000   Min.   :     0   Min.   :1.00  
##  Male  :5457   1st Qu.:32.00   1st Qu.: 3.000   1st Qu.:     0   1st Qu.:1.00  
##                Median :37.00   Median : 5.000   Median : 97199   Median :1.00  
##                Mean   :38.92   Mean   : 5.013   Mean   : 76486   Mean   :1.53  
##                3rd Qu.:44.00   3rd Qu.: 7.000   3rd Qu.:127644   3rd Qu.:2.00  
##                Max.   :92.00   Max.   :10.000   Max.   :250898   Max.   :4.00  
##  HasCrCard IsActiveMember EstimatedSalary     Exited   GeographyFrance 
##  0:2945    0:4849         Min.   :    11.58   0:7963   Min.   :0.0000  
##  1:7055    1:5151         1st Qu.: 51002.11   1:2037   1st Qu.:0.0000  
##                           Median :100193.91            Median :1.0000  
##                           Mean   :100090.24            Mean   :0.5014  
##                           3rd Qu.:149388.25            3rd Qu.:1.0000  
##                           Max.   :199992.48            Max.   :1.0000  
##  GeographyGermany GeographySpain  
##  Min.   :0.0000   Min.   :0.0000  
##  1st Qu.:0.0000   1st Qu.:0.0000  
##  Median :0.0000   Median :0.0000  
##  Mean   :0.2509   Mean   :0.2477  
##  3rd Qu.:1.0000   3rd Qu.:0.0000  
##  Max.   :1.0000   Max.   :1.0000
#ResearchQuestion1: Is there a significant relation between gender and being an active member at our bank?
#Chi-squared test was conducted, given the categorical nature of both IsActiveMember and Gender variables
#Chi-squared test was statistically significant at the level p<0.05, with the value of 4.9923
#Comparing the observed and expected frequencies, there are only 56.1 discrepancies among them - particularly 56.1 more females were estimated to be active members, and 56.1 males were estimated to be inactive
#Cramer's V statistic for effect size has shown that with the size of 0.02 the effect is tiny
#Odd's ratio statistic for effect size has shown that the ratio is 1.09 and thus very small
#In conclusion, the chi-squared test found that the association between the gender and activity of the members is significant and exists, with male members being more active in general, however the measures of effect size show that the relation is weak

barplot(table(BankData$IsActiveMember, BankData$Gender),
        beside = TRUE,
        col = c("pink", "lightblue"),
        legend = TRUE,
        names.arg = c("Female", "Male"),
        main = "Gender vs. Activity",
        xlab = "Activity",
        ylab = "Count")

chi_square <- chisq.test(BankData$IsActiveMember, BankData$Gender,
                         correct = TRUE)
chi_square
## 
##  Pearson's Chi-squared test with Yates' continuity correction
## 
## data:  BankData$IsActiveMember and BankData$Gender
## X-squared = 4.9923, df = 1, p-value = 0.02546
addmargins(chi_square$observed)
##                        BankData$Gender
## BankData$IsActiveMember Female  Male   Sum
##                     0     2259  2590  4849
##                     1     2284  2867  5151
##                     Sum   4543  5457 10000
addmargins(round(chi_square$expected, 2))
##                        BankData$Gender
## BankData$IsActiveMember Female   Male   Sum
##                     0   2202.9 2646.1  4849
##                     1   2340.1 2810.9  5151
##                     Sum 4543.0 5457.0 10000
library(effectsize)
## 
## Attaching package: 'effectsize'
## The following object is masked from 'package:psych':
## 
##     phi
effectsize::cramers_v(BankData$IsActiveMember, BankData$Gender)
## Cramer's V (adj.) |       95% CI
## --------------------------------
## 0.02              | [0.00, 1.00]
## 
## - One-sided CIs: upper bound fixed at [1.00].
interpret_cramers_v(0.02)
## [1] "tiny"
## (Rules: funder2019)
oddsratio(BankData$IsActiveMember, BankData$Gender)
## Odds ratio |       95% CI
## -------------------------
## 1.09       | [1.01, 1.18]
interpret_oddsratio(1.09)
## [1] "very small"
## (Rules: chen2010)
#ResearchQuestion2: Can we predict whether the client exited our bank based on the available variables?
#Available variables: Client's credit score, Gender, Age, How long were they with us,... 
#...Balance of their account in the end, Number of our products they've been using,... 
#...Whether they have a credit card, Whether they were an active member, Their estimated salary, Their country of origin
#The fit0 binary logistic function without the explanatory variables shows that the estimate is equal to -1.36333 (significant at p<0.001), which when transformed as an exponent gives us 0.255 - this means that the odds of members exiting are about 1/0.255 times of odds of members staying with our bank (this was apparent from the descriptive statistics as well - 7963/2037)
#The vif values are low, all below 2, indicating that there is no multicolinearity between the predictors
#The analysis of standardized residuals shows that there are no outliers with residuals below the value of -3 or above the value of 3
#The analysis of influential points similarly shows that there are no influential points above the value of 1, and there are no significant gaps noticed
#The analysis of deviance is highly statistically significant (p<0.001), indicating that the more complex model with all of our predictors is better at predicting the dependent variable than the fit0 model with just the intercept
#The summary of the fit1 model shows that out of the analyzed predictors, the significant ones were: CreditScore, Gender, Age, Balance, NumOfProducts, IsActiveMember, Geography (Germany)
#The ratio/odds (RO) additionally shows the particular times for which the odds ratio would be increased or decreased with the additional one unit of a certain predictor, holding all the others constant
#The interpretation of results is that members that are: with lower credit score, female, older, with higher account balance, with the usage of fewer of our products, inactive, and from Germany - are significantly more likely to exit our bank than the others
#We could perhaps use these results to lower the churn rates by focusing on improving the experiences of these members with our bank

fit0 <- glm(Exited ~ 1,
            family = binomial,
            data = BankData)

summary(fit0)
## 
## Call:
## glm(formula = Exited ~ 1, family = binomial, data = BankData)
## 
## Deviance Residuals: 
##    Min      1Q  Median      3Q     Max  
## -0.675  -0.675  -0.675  -0.675   1.784  
## 
## Coefficients:
##             Estimate Std. Error z value Pr(>|z|)    
## (Intercept) -1.36333    0.02483  -54.91   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 10110  on 9999  degrees of freedom
## Residual deviance: 10110  on 9999  degrees of freedom
## AIC: 10112
## 
## Number of Fisher Scoring iterations: 4
head(fitted(fit0))
##      1      2      3      4      5      6 
## 0.2037 0.2037 0.2037 0.2037 0.2037 0.2037
exp(cbind(odds = fit0$coefficients, confint.default(fit0)))
##                  odds     2.5 %    97.5 %
## (Intercept) 0.2558081 0.2436573 0.2685648
Pseudo_R2_fit1 <- 7963/10000
Pseudo_R2_fit1
## [1] 0.7963
fit1 <- glm(Exited ~ CreditScore + Gender + Age + Tenure + Balance + NumOfProducts + HasCrCard +
            IsActiveMember + EstimatedSalary + GeographyFrance + GeographyGermany,
            family = binomial,
            data = BankData)

library(car)
## Warning: package 'car' was built under R version 4.0.4
## Loading required package: carData
## Warning: package 'carData' was built under R version 4.0.5
## 
## Attaching package: 'car'
## The following object is masked from 'package:psych':
## 
##     logit
vif(fit1)
##      CreditScore           Gender              Age           Tenure 
##         1.001410         1.003915         1.081634         1.002863 
##          Balance    NumOfProducts        HasCrCard   IsActiveMember 
##         1.291751         1.079209         1.002089         1.078350 
##  EstimatedSalary  GeographyFrance GeographyGermany 
##         1.001414         1.666020         1.881879
BankData$StdResid <- rstandard(fit1)
BankData$Cook <- cooks.distance(fit1)

library(ggplot2)
## 
## Attaching package: 'ggplot2'
## The following objects are masked from 'package:psych':
## 
##     %+%, alpha
StdResid <- ggplot(BankData, aes(x=StdResid)) +
              theme_linedraw() +
              geom_histogram() +
              xlab("Standardized residuals")

Cook <- ggplot(BankData, aes(x=Cook)) +
          theme_linedraw() +
          geom_histogram() +
          xlab("Cook's distance")

library(ggpubr)
ggarrange(StdResid, Cook,
         ncol = 2, nrow = 1)
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

head(BankData[order(BankData$StdResid), c("RowNumber", "StdResid")], 5)
##      RowNumber  StdResid
## 4816      4816 -2.310921
## 9588      9588 -2.255057
## 7693      7693 -2.011095
## 8157      8157 -1.983400
## 9293      9293 -1.935901
head(BankData[order(-BankData$StdResid), c("RowNumber", "StdResid")], 5)
##      RowNumber StdResid
## 2142      2142 2.994159
## 9573      9573 2.926443
## 2432      2432 2.863175
## 5216      5216 2.746696
## 4923      4923 2.731139
head(BankData[order(-BankData$Cook), c("RowNumber", "Cook")], 5)
##      RowNumber        Cook
## 7725      7725 0.001848151
## 3366      3366 0.001761374
## 2093      2093 0.001690396
## 5701      5701 0.001684427
## 4823      4823 0.001652999
#identify which rows are outliers based on their standard residuals being above 3 or below -3
outliers <- which(BankData$StdResid > 3 | BankData$StdResid < -3)
print(outliers)
## integer(0)
#identify which rows are influential based on the Cook's distance being above 1
threshold <- 1
influential_obs <- which(BankData$Cook > threshold)
print(influential_obs)
## integer(0)
fit0 <- glm(Exited ~ 1,
            family = binomial,
            data = BankData)

fit1 <- glm(Exited ~ CreditScore + Gender + Age + Tenure + Balance + NumOfProducts + HasCrCard +
            IsActiveMember + EstimatedSalary + GeographyFrance + GeographyGermany,
            family = binomial,
            data = BankData)

anova(fit0, fit1, test = "Chi")
## Analysis of Deviance Table
## 
## Model 1: Exited ~ 1
## Model 2: Exited ~ CreditScore + Gender + Age + Tenure + Balance + NumOfProducts + 
##     HasCrCard + IsActiveMember + EstimatedSalary + GeographyFrance + 
##     GeographyGermany
##   Resid. Df Resid. Dev Df Deviance  Pr(>Chi)    
## 1      9999    10109.8                          
## 2      9988     8561.4 11   1548.4 < 2.2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
summary(fit1)
## 
## Call:
## glm(formula = Exited ~ CreditScore + Gender + Age + Tenure + 
##     Balance + NumOfProducts + HasCrCard + IsActiveMember + EstimatedSalary + 
##     GeographyFrance + GeographyGermany, family = binomial, data = BankData)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -2.3097  -0.6589  -0.4560  -0.2697   2.9940  
## 
## Coefficients:
##                    Estimate Std. Error z value Pr(>|z|)    
## (Intercept)      -3.357e+00  2.493e-01 -13.467  < 2e-16 ***
## CreditScore      -6.683e-04  2.803e-04  -2.384   0.0171 *  
## GenderMale       -5.285e-01  5.449e-02  -9.699  < 2e-16 ***
## Age               7.271e-02  2.576e-03  28.230  < 2e-16 ***
## Tenure           -1.595e-02  9.355e-03  -1.705   0.0882 .  
## Balance           2.637e-06  5.142e-07   5.128 2.92e-07 ***
## NumOfProducts    -1.015e-01  4.713e-02  -2.154   0.0312 *  
## HasCrCard1       -4.468e-02  5.934e-02  -0.753   0.4515    
## IsActiveMember1  -1.075e+00  5.769e-02 -18.643  < 2e-16 ***
## EstimatedSalary   4.807e-07  4.737e-07   1.015   0.3102    
## GeographyFrance  -3.522e-02  7.064e-02  -0.499   0.6181    
## GeographyGermany  7.395e-01  7.881e-02   9.383  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 10109.8  on 9999  degrees of freedom
## Residual deviance:  8561.4  on 9988  degrees of freedom
## AIC: 8585.4
## 
## Number of Fisher Scoring iterations: 5
exp(cbind(RO = fit1$coefficients, confint.default(fit1)))
##                          RO      2.5 %     97.5 %
## (Intercept)      0.03483742 0.02137238 0.05678573
## CreditScore      0.99933189 0.99878295 0.99988114
## GenderMale       0.58949870 0.52978819 0.65593898
## Age              1.07541433 1.06999942 1.08085664
## Tenure           0.98417741 0.96629674 1.00238894
## Balance          1.00000264 1.00000163 1.00000364
## NumOfProducts    0.90346071 0.82373695 0.99090037
## HasCrCard1       0.95630690 0.85130934 1.07425450
## IsActiveMember1  0.34114633 0.30467627 0.38198190
## EstimatedSalary  1.00000048 0.99999955 1.00000141
## GeographyFrance  0.96539516 0.84057799 1.10874639
## GeographyGermany 2.09488073 1.79503264 2.44481642