library(readxl)
library(Hmisc)

## 
## Attaching package: 'Hmisc'

## The following objects are masked from 'package:base':
## 
##     format.pval, units

library(factoextra)

## Loading required package: ggplot2

## Welcome! Want to learn more? See two factoextra-related books at https://goo.gl/ve3WBa

library(dplyr)

## 
## Attaching package: 'dplyr'

## The following objects are masked from 'package:Hmisc':
## 
##     src, summarize

## The following objects are masked from 'package:stats':
## 
##     filter, lag

## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union

library(ggplot2)
library(tidyr)
library(readxl)
library(naniar)
library(tibble)
library(FactoMineR)
library(ggpubr)
library(rstatix)

## 
## Attaching package: 'rstatix'

## The following object is masked from 'package:stats':
## 
##     filter

library(tidyverse)

## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ forcats   1.0.0     ✔ readr     2.1.4
## ✔ lubridate 1.9.3     ✔ stringr   1.5.1
## ✔ purrr     1.0.2

## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ rstatix::filter()  masks dplyr::filter(), stats::filter()
## ✖ dplyr::lag()       masks stats::lag()
## ✖ dplyr::src()       masks Hmisc::src()
## ✖ dplyr::summarize() masks Hmisc::summarize()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors

Introduction of the data, explaining the variables, and cleaning the data

#install.packages("readxl")
excel_file_path <- "C:/Users/ACER/Desktop/R Assignment/NLB Anka.xlsx"
mydata <- read_excel("C:/Users/ACER/Desktop/R Assignment/NLB Anka.xlsx")
head(mydata)

## # A tibble: 6 × 107
##      ID Q1      Q1_10_text Q2a   Q2b   Q2c   Q2d   Q2e   Q2f   Q3a   Q3b   Q3c  
##   <dbl> <chr>   <chr>      <chr> <chr> <chr> <chr> <chr> <chr> <chr> <chr> <chr>
## 1    NA Katera… Drugo (pr… NLB   NKBM  SKB   Unic… Revo… N26   NLB   NKBM  SKB  
## 2     1 1       -2         4     2     1     2     3     3     4     3     1    
## 3     2 3       -2         3     6     6     6     4     3     4     3     3    
## 4     3 1       -2         3     6     6     6     6     6     3     6     2    
## 5     4 1       -2         4     6     3     6     4     6     4     6     4    
## 6     5 10      uporablja… 6     4     6     6     6     6     6     4     6    
## # ℹ 95 more variables: Q3d <chr>, Q3e <chr>, Q3f <chr>, Q4a <chr>, Q4b <chr>,
## #   Q4c <chr>, Q4d <chr>, Q4e <chr>, Q4f <chr>, Q5a <chr>, Q5b <chr>,
## #   Q5c <chr>, Q5d <chr>, Q5e <chr>, Q5f <chr>, Q6 <chr>, Q7a <chr>, Q7b <chr>,
## #   Q7c <chr>, Q7d <chr>, Q7e <chr>, Q7f <chr>, Q7g <chr>, Q7h <chr>,
## #   Q7i <chr>, Q7i_text <chr>, Q8 <chr>, Q9 <chr>, Q10 <chr>, Q11a <chr>,
## #   Q11b <chr>, Q11c <chr>, Q11d <chr>, Q11e <chr>, Q11f <chr>, Q11g <chr>,
## #   Q11h <chr>, Q11h_text <chr>, Q12 <chr>, Q13 <chr>, Q13_7_text <chr>, …

n=174

Unit of observation: One survey taker

Description of the variables:

Q1 - Which is your primary bank? (1 - NLB; 2 - SKB; 3 - Gorenjska Banka; 4 - NKBM; 5 - Addiko Bank; 6 - Delavska hranilnica; 7 - Banka Intesa Sanpaolo; 8 - Sparkasse; 9 - DBS; 10 - Other (please specify): Q1_10_text )

Q2 - How much do you agree that services of banks are easy to use? (Q2a - NLB; Q2b - NKBM; Q2c - SKB, Q2d - UniCredit, Q2e - Revolut, Q2f - N26)

Q3 - How much do you agree that the stated bank seems safe to you (in terms of everyday transactions, cybernetic security, paying abroad)? (Q3a - NLB; Q3b - NKBM; Q3c - SKB, Q3d - UniCredit, Q3e - Revolut, Q3f - N26)

Q4 - How much do you agree that the stated bank seems advanced to you? (Q4a - NLB; Q4b - NKBM; Q4c - SKB, Q4d - UniCredit, Q4e - Revolut, Q4f - N26)

Q5 - How much do you agree that the stated bank is widespread among young people? (Q5a - NLB; Q5b - NKBM; Q5c - SKB, Q5d - UniCredit, Q5e - Revolut, Q5f - N26) (1 - I don’t agree at all; 2 - I don’t agree; 3 - undecided; 4 - I agree; 5 - I fully agree; 6 - I don’t know)

Q6 - How frequently do you use mobile banking apps? (1 - several times a day; 2 - daily; 3 - weekly; 4 - monthly; 5 - less than monthly, 6 - I don’t use mobile bank)

Q7 - Please select the functions your mobile bank offers that you are aware of? (Q7a - paying bills by scanning QR code; Q7b - checking account balance; Q7c - monthly spending overview; Q7d - cost planning; Q7e - integrated Flik; Q7f - submitting a rapid loan request; Q7g - accessing archived documents; Q7h - investing options; Q7i - Other (please specify): Q7i_text)

Q8 - How important is it for you to be able to access all services of your bank through one mobile app? (1 - Not important at all; 7 - Very important)

Q9 - How familiar are you with the latest features and updates available in your mobile banking apps? (1 - Not familiar at all; 7 - Very familiar)

Q10 - In the past year, how often have you actively explored new features in your mobile banking app? (1 - more than once a month; 2 - every second month; 3 - three to four times; 4 - once to twice; 5 - never)

Q11 - Through which channels are you most often notified about new and better bank services? (Q11a - social media; Q11b - traditional media (TV, radio); Q11c - friends and family; Q11d - in-branch assistance; Q11e - the bank’s official website; Q11f - forums; Q11g - mobile ban; Q11h - other (please specify): Q11h_text)

Q12 - How effective do you find social media as a source of information about mobile banking functions? (1 - Not effective at all; 7 - Very effective)

Q13 - What specific social media platform do you prefer the most for receiving information about banking services? (1 - Instagram; 2 - Facebook; 3 - X (Twitter); 4 - TikTok; 5 - LinkedIn; 6 - YouTube; 7 - other (please specify): Q13_7_text)

Q14 - Would you like to participate in any rewards programs offered by your bank? ( 1 - yes; 2 - no; 3 - I don’t know)

Q15 - Order next reward incentives from least motivational to the most motivational for you, when exploring new banking services? (Q15a - discounts on banking services and purchases; Q15b - cash rewards; Q15c - tickets to events; Q15d - customized financial advice)

Q16 - Are you aware of any rewards programs offered by your bank in the past? (1 - yes; 2 - no; 3 - I don’t know)

Q17 - How important are each of the stated properties of a banking service to you? (Q17a - instant money transfers; Q17b - fast and accessible customer support; Q17c - fast new account opening; Q17d - one mobile banking app for all bank services; Q17e - loan accessible through online bank; Q17f - low costs of managing the account; Q17g - fast and complete information about banking services) (1 - completely unimportant; 2 - unimportant; 3 - slightly unimportant; 4 - undecided; 5 - slightly important; 6 - important; 7 - very important)

Q18 - Do you use a Neobank (fintech companies that offer financial products and services to consumers via a digital platform such as Revolut, N26 or mBills)? (1 - yes; 2 - no)

Q19 - Would you consider trying services of a Neobank in the future? (1 - yes; 2 - no; 3 - I don’t know) (BO)

Q20 - Which bank institution would you choose in the next situations? (Q20a - sending money among friends; Q20b - overview of self spending habits; Q20c - online shopping; Q20d - everyday shopping like groceries; Q20e - abroad payments; Q20f - payments in foreign currency; Q20g - savings account) (1 - neobank; 2 - traditional bank; 3 - neobank and traditional bank)

Q21 - When considering banking services, which is more important to you, simplicity or security? (1 - prioritize simplicity; 7 - prioritize security)

Q22 - How important are stated properties of banks to you? (Q22a - polite service from employees; Q22b - modern look of the branch office; Q22c - easy and convenient mobile banking; Q22d - access to information about services; Q22e - feeling of financial security; Q22f - fast services/speed of services; Q22g - good reputation of the bank; Q22h - technological advancement; Q22i - Accessibility; Q22j - benefits) (1 - completely unimportant; 2 - unimportant; 3 - slightly unimportant; 4 - undecided; 5 - slightly important; 6 - important; 7 - very important)

Q23 - How likely would you decide to change the bank in the next situations? (Q23a - long lines at branches; Q23b - unprofessional attitude of employees; Q23c - high provisions; Q23d - lack of useful properties in mobile banking app; Q23e - not getting a loan; Q24f - too much of bureaucracy; Q23g - deficient information about services, Q23h - recommendation of other bank from friend/family) (1 - for sure not; 2 - not; 3 - probably not; 4 - maybe; 5 - probably yes; 6 - yes; 7 - for sure yes )

Q24 - How likely are you to recommend your current bank to your friend? (10 - Very likely; 1 - Very unlikely)

Q25 - How old are you? (x - age)

Q26 - Gender (1 - male; 2 - female; 3 - other)

Q27 - Highest education attained (1 - primary school; 2 - secondary school; 3 - gymnasium; 4 - bachelor’s degree; 5 - master’s degree)

Q28 - Employment status (1 - highschool student; 2 - student; 3 - full-time employed; 4 - part-time employed; 5 - unemployed)

Q29 - What is your average personal monthly income? (1 - less than 500€; 2 - 501€ - 1000€; 3 - 1001€ - 1500€; 4 - 1501€ - 2000€; 5 - more than 2000€)

Q30 - Mark the region of your residence. (1- Gorenjska; 2 - Goriška; 3 - Osrednjeslovenska; 4 - Obalno-kraška; 5 - Notranjska; 6 - Dolenjska; 7 - Posavska; 8 - Zasavska; 9 - Savinjska; 10 - Koroška; 11 - Podravska; 12 - Pomurska)

mydata <- mydata[-1,]

mydata$BankF <- factor(mydata$Q1, 
                            levels = c(1,2,3,4,5,6,7,8,9,10), 
                            labels = c("NLB", "SKB", "Gorenjska Banka", "NKBM", "Addiko    Bank","Delavska hranilnica","Banka Intesa Sanpaolo","Sparkasse","DBS","Other"))

mydata$FrequencyF <- factor(mydata$Q6, 
                            levels = c(1,2,3,4,5,6), 
                            labels = c("several times a day", "daily", "weekly", "monthly", "less than monthly","I don’t use mobile bank"))

mydata$Q7aF <- factor(mydata$Q7a, 
                            levels = c(0,1), 
                            labels = c("No", "Yes"))

mydata$Q7bF <- factor(mydata$Q7b, 
                            levels = c(0,1), 
                            labels = c("No", "Yes"))


mydata$Q7cF <- factor(mydata$Q7c, 
                            levels = c(0,1), 
                            labels = c("No", "Yes"))

mydata$Q7dF <- factor(mydata$Q7d, 
                            levels = c(0,1), 
                            labels = c("No", "Yes"))

mydata$Q7eF <- factor(mydata$Q7e, 
                            levels = c(0,1), 
                            labels = c("No", "Yes"))

mydata$Q7fF <- factor(mydata$Q7f, 
                            levels = c(0,1), 
                            labels = c("No", "Yes"))

mydata$Q7gF <- factor(mydata$Q7g, 
                            levels = c(0,1), 
                            labels = c("No", "Yes"))

mydata$Q7hF <- factor(mydata$Q7h, 
                            levels = c(0,1), 
                            labels = c("No", "Yes"))

mydata$Q7iF <- factor(mydata$Q7i, 
                            levels = c(0,1), 
                            labels = c("No", "Yes"))

mydata$ExploreF <- factor(mydata$Q10, 
                            levels = c(1,2,3,4,5), 
                            labels = c("more than once a month", "every second month", "three to four times", "once to twice", "never"))

mydata$Q11aF <- factor(mydata$Q11a, 
                            levels = c(0,1), 
                            labels = c("No", "Yes"))

mydata$Q11bF <- factor(mydata$Q11b, 
                            levels = c(0,1), 
                            labels = c("No", "Yes"))


mydata$Q11cF <- factor(mydata$Q11c, 
                            levels = c(0,1), 
                            labels = c("No", "Yes"))

mydata$Q11dF <- factor(mydata$Q11d, 
                            levels = c(0,1), 
                            labels = c("No", "Yes"))

mydata$Q11eF <- factor(mydata$Q11e, 
                            levels = c(0,1), 
                            labels = c("No", "Yes"))

mydata$Q11fF <- factor(mydata$Q11f, 
                            levels = c(0,1), 
                            labels = c("No", "Yes"))

mydata$Q11gF <- factor(mydata$Q11g, 
                            levels = c(0,1), 
                            labels = c("No", "Yes"))

mydata$Q11hF <- factor(mydata$Q11h, 
                            levels = c(0,1), 
                            labels = c("No", "Yes"))

mydata$Q7aF <- factor(mydata$Q7a, 
                            levels = c(0,1), 
                            labels = c("No", "Yes"))

mydata$Q7bF <- factor(mydata$Q7b, 
                            levels = c(0,1), 
                            labels = c("No", "Yes"))


mydata$Q7cF <- factor(mydata$Q7c, 
                            levels = c(0,1), 
                            labels = c("No", "Yes"))

mydata$Q7dF <- factor(mydata$Q7d, 
                            levels = c(0,1), 
                            labels = c("No", "Yes"))

mydata$Q7eF <- factor(mydata$Q7e, 
                            levels = c(0,1), 
                            labels = c("No", "Yes"))

mydata$Q7fF <- factor(mydata$Q7f, 
                            levels = c(0,1), 
                            labels = c("No", "Yes"))

mydata$Q7gF <- factor(mydata$Q7g, 
                            levels = c(0,1), 
                            labels = c("No", "Yes"))

mydata$Q14F <- factor(mydata$Q14, 
                            levels = c(1,2,3), 
                            labels = c("Yes", "No", "I don't know"))

mydata$Q16F <- factor(mydata$Q16, 
                            levels = c(1,2,3), 
                            labels = c("Yes", "No", "I don't know"))

mydata$Q18F <- factor(mydata$Q18, 
                            levels = c(1,2), 
                            labels = c("Yes", "No"))

mydata$Q19F <- factor(mydata$Q19, 
                            levels = c(1,2,3,-2), 
                            labels = c("Yes", "No", "I don't know","Already use them"))

mydata$Q20aF <- factor(mydata$Q20a, 
                            levels = c(1,2,3,4), 
                            labels = c("neobank", "traditional bank", "neobank and traditional bank","none"))

mydata$Q20bF <- factor(mydata$Q20b, 
                            levels = c(1,2,3,4), 
                            labels = c("neobank", "traditional bank", "neobank and traditional bank","none"))

mydata$Q20cF <- factor(mydata$Q20c, 
                            levels = c(1,2,3,4), 
                            labels = c("neobank", "traditional bank", "neobank and traditional bank","none"))

mydata$Q20dF <- factor(mydata$Q20d, 
                            levels = c(1,2,3,4), 
                            labels = c("neobank", "traditional bank", "neobank and traditional bank","none"))

mydata$Q20eF <- factor(mydata$Q20e, 
                            levels = c(1,2,3,4), 
                            labels = c("neobank", "traditional bank", "neobank and traditional bank","none"))

mydata$Q20fF <- factor(mydata$Q20f, 
                            levels = c(1,2,3,4), 
                            labels = c("neobank", "traditional bank", "neobank and traditional bank","none"))

mydata$Q20gF <- factor(mydata$Q20g, 
                            levels = c(1,2,3,4), 
                            labels = c("neobank", "traditional bank", "neobank and traditional bank","none"))

mydata$GenderF <- factor(mydata$Q26, 
                            levels = c(1,2,3), 
                            labels = c("male", "female", "other"))

mydata$EducationF <- factor(mydata$Q27, 
                            levels = c(1,2,3,4,5), 
                            labels = c("primary school", "secondary school", "gymnasium","bachelor's degree","master's degree"))

mydata$EmploymentF <- factor(mydata$Q28, 
                            levels = c(1,2,3,4,5), 
                            labels = c("highschool student", "student", "full-time employed","part-time employed","unemployed"))

mydata$IncomeF <- factor(mydata$Q29, 
                            levels = c(1,2,3,4,5), 
                            labels = c("less than 500€", "501€ - 1000€", "1001€ - 1500€","1501€ - 2000€","more than 2000€"))

mydata$RegionF <- factor(mydata$Q30, 
                            levels = c(1,2,3,4,5,6,7,8,9,10,11,12), 
                            labels = c("Gorenjska", "Goriška", "Osrednjeslovenska","Obalno-kraška","Notranjska","Dolenjska","Posavska","Zasavska","Savinjska","Koroška","Podravska","Pomurska"))

mydata$Bank_CombinedF <- factor(mydata$Bank_Combined,
                                levels = c("NLB", "NKBM", "Others"),
                                labels = c("NLB", "NKBM", "Others"))

mydata$Region_CombinedF <- factor(mydata$Region_Combined,
                                  levels = c("Gorenjska", "Osrednjeslovenska", "Others"),
                                  labels = c("Gorenjska", "Osrednjeslovenska", "Others"))

mydata$Education_CombinedF <- factor(mydata$Education_Combined,
                                     levels = c("lower education", "higher education"),
                                     labels = c("lower education", "higher education"))

mydata$Employment_CombinedF <- factor(mydata$Employment_Combined,
                                     levels = c("student", "employed"),
                                     labels = c("student", "employed"))

mydata[4:27] <- lapply(mydata[4:27], as.numeric)
mydata[39:40] <- lapply(mydata[39:40], as.numeric)
mydata[51] <- lapply(mydata[51], as.numeric)
mydata[55:58] <- lapply(mydata[55:58], as.numeric)
mydata[60:66] <- lapply(mydata[60:66], as.numeric)
mydata[76:96] <- lapply(mydata[76:96], as.numeric)

Demographic Analysis

Gender

gender_summary <- table(mydata$GenderF)

print(gender_summary)

## 
##   male female  other 
##     76     95      3

barplot(gender_summary, col = c("blue", "pink", "green"), main = "Gender Distribution", 
        xlab = "Gender", ylab = "Count", names.arg = names(gender_summary))

Income

income_summary <- table(mydata$IncomeF)

print(income_summary)

## 
##  less than 500€    501€ - 1000€   1001€ - 1500€   1501€ - 2000€ more than 2000€ 
##              78              46              24              16              10

barplot(income_summary, col = c("blue", "pink", "green","yellow","red"), main = "Income Distribution", 
        xlab = "Income", ylab = "Count", names.arg = names(income_summary), las = 1, cex.names = 0.8)

Age

hist(mydata$Q25, col = "skyblue", main = "Age Distribution", xlab = "Age", ylab = "Frequency")

Education

education_summary <- table(mydata$EducationF)

print(education_summary)

## 
##    primary school  secondary school         gymnasium bachelor's degree 
##                 8                48                40                61 
##   master's degree 
##                17

barplot(education_summary, col = c("green"), main = "Education Distribution", 
        xlab = "Education", ylab = "Count", names.arg = names(education_summary), las = 1, cex.names = 0.5)

Employment

employment_summary <- table(mydata$EmploymentF)

print(employment_summary)

## 
## highschool student            student full-time employed part-time employed 
##                 26                 99                 40                  6 
##         unemployed 
##                  3

barplot(employment_summary, col = c("green"), main = "Employment Distribution", 
        xlab = "Employment", ylab = "Count", names.arg = names(employment_summary), las = 1, cex.names = 0.5)

Bank

bank_summary <- table(mydata$BankF)

print(bank_summary)

## 
##                   NLB                   SKB       Gorenjska Banka 
##                    65                    21                    10 
##                  NKBM        Addiko    Bank   Delavska hranilnica 
##                    34                     2                     6 
## Banka Intesa Sanpaolo             Sparkasse                   DBS 
##                     9                     3                     8 
##                 Other 
##                    16

barplot(bank_summary, col = c("green"), main = "Bank Distribution", 
        xlab = "Bank", ylab = "Count", names.arg = names(bank_summary), las = 2, cex.names = 0.5)

Region

region_summary <- table(mydata$RegionF)

print(region_summary)

## 
##         Gorenjska           Goriška Osrednjeslovenska     Obalno-kraška 
##                55                 7                74                 8 
##        Notranjska         Dolenjska          Posavska          Zasavska 
##                 7                 9                 1                 4 
##         Savinjska           Koroška         Podravska          Pomurska 
##                 5                 0                 2                 2

barplot(region_summary, col = c("green"), main = "Region Distribution", 
        xlab = "Region", ylab = "Count", names.arg = names(region_summary), las = 2, cex.names = 0.5)

Association

In this section we would check the association between the primary bank of a survey taker and other demographic characteristics-Income, Education level, Employment, Gender, Geographical Region. Since the bank and the stated characteristics are categorical variables, the associations are tested using the Pearson Chi square method

H0: There is no association

H1: There is an association

Based on Income

resultsI <- chisq.test(mydata$IncomeF, mydata$Bank_CombinedF,
           correct = FALSE)

## Warning in chisq.test(mydata$IncomeF, mydata$Bank_CombinedF, correct = FALSE):
## Chi-squared approximation may be incorrect

resultsI

## 
##  Pearson's Chi-squared test
## 
## data:  mydata$IncomeF and mydata$Bank_CombinedF
## X-squared = 8.0185, df = 8, p-value = 0.4317

As p>0.05, we cannot reject that there is no association between the Income level and the primary bank

Based on Education

resultsE <- chisq.test(mydata$EducationF, mydata$Bank_CombinedF,
           correct = FALSE)

## Warning in chisq.test(mydata$EducationF, mydata$Bank_CombinedF, correct =
## FALSE): Chi-squared approximation may be incorrect

resultsE

## 
##  Pearson's Chi-squared test
## 
## data:  mydata$EducationF and mydata$Bank_CombinedF
## X-squared = 7.5102, df = 8, p-value = 0.4827

As p>0.05, we cannot reject that there is no association between the Education level and the primary bank

Based on Employment status

resultsEm <- chisq.test(mydata$EmploymentF, mydata$Bank_CombinedF,
           correct = FALSE)

## Warning in chisq.test(mydata$EmploymentF, mydata$Bank_CombinedF, correct =
## FALSE): Chi-squared approximation may be incorrect

resultsEm

## 
##  Pearson's Chi-squared test
## 
## data:  mydata$EmploymentF and mydata$Bank_CombinedF
## X-squared = 8.7914, df = 8, p-value = 0.3602

As p>0.05, we cannot reject that there is no association between the Employment status and the primary bank

Based on Gender

resultsG <- chisq.test(mydata$GenderF, mydata$Bank_CombinedF,
           correct = FALSE)

## Warning in chisq.test(mydata$GenderF, mydata$Bank_CombinedF, correct = FALSE):
## Chi-squared approximation may be incorrect

resultsG

## 
##  Pearson's Chi-squared test
## 
## data:  mydata$GenderF and mydata$Bank_CombinedF
## X-squared = 7.8106, df = 4, p-value = 0.09877

As p>0.05, we cannot reject that there is no association between the Gender and the primary bank

Based on Region

resultsR <- chisq.test(mydata$Region_CombinedF, mydata$Bank_CombinedF,
           correct = FALSE)

resultsR

## 
##  Pearson's Chi-squared test
## 
## data:  mydata$Region_CombinedF and mydata$Bank_CombinedF
## X-squared = 31.028, df = 4, p-value = 3.022e-06

We can reject H0(p<0.001) and can say there is an association between the geographical region and the primary bank. Further, we can check the observed and expected frequencies and the differences between them

addmargins(resultsR$observed)

##                        mydata$Bank_CombinedF
## mydata$Region_CombinedF NLB NKBM Others Sum
##       Gorenjska           9   19     27  55
##       Osrednjeslovenska  44   18     12  74
##       Others             12   18     15  45
##       Sum                65   55     54 174

round(resultsR$expected)

##                        mydata$Bank_CombinedF
## mydata$Region_CombinedF NLB NKBM Others
##       Gorenjska          21   17     17
##       Osrednjeslovenska  28   23     23
##       Others             17   14     14

round(resultsR$res, 2)

##                        mydata$Bank_CombinedF
## mydata$Region_CombinedF   NLB  NKBM Others
##       Gorenjska         -2.55  0.39   2.40
##       Osrednjeslovenska  3.11 -1.11  -2.29
##       Others            -1.17  1.00   0.28

Based on above, we can say that there are less than expected NLB users in the Gorenjska region and more than expected NLB users in the Osrednjeslovenska region

Clustering

Correlation, scaling, dissimilarity, and cluster tendency

Cluster Variables:

Q12: How effective do you find social media as a source of information about mobile banking functions? (1 - Not effective at all; 7 - Very effective)
Q17b: How important is fast and accessible customer support for you(1 - completely unimportant; 2 - unimportant; 3 - slightly unimportant; 4 - undecided; 5 - slightly important; 6 - important; 7 - very important)
Q21: Q21 - When considering banking services, which is more important to you, simplicity or security? (1 - prioritize simplicity; 7 - prioritize security)
Q22j: How important are benefits probvided by a bank for you? (1 - completely unimportant; 2 - unimportant; 3 - slightly unimportant; 4 - undecided; 5 - slightly important; 6 - important; 7 - very important)

mydataNew <- mydata

cor_matrixN <- cor(mydataNew[, c("Q21","Q22j","Q12","Q17b")])
print(cor_matrixN)

##             Q21      Q22j        Q12      Q17b
## Q21   1.0000000 0.1694530 -0.0548354 0.0701421
## Q22j  0.1694530 1.0000000  0.1587895 0.2807387
## Q12  -0.0548354 0.1587895  1.0000000 0.1628189
## Q17b  0.0701421 0.2807387  0.1628189 1.0000000

mydataNew$Q21_z <- scale(mydataNew$Q21)
mydataNew$Q17b_z <- scale(mydataNew$Q17b)
mydataNew$Q22j_z <- scale(mydataNew$Q22j)
mydataNew$Q12_z <- scale(mydataNew$Q12)

rcorr(as.matrix(mydataNew[ , c("Q21_z", "Q17b_z", "Q22j_z", "Q12_z")]),
      type = "pearson")

##        Q21_z Q17b_z Q22j_z Q12_z
## Q21_z   1.00   0.07   0.17 -0.05
## Q17b_z  0.07   1.00   0.28  0.16
## Q22j_z  0.17   0.28   1.00  0.16
## Q12_z  -0.05   0.16   0.16  1.00
## 
## n= 174 
## 
## 
## P
##        Q21_z  Q17b_z Q22j_z Q12_z 
## Q21_z         0.3577 0.0254 0.4724
## Q17b_z 0.3577        0.0002 0.0318
## Q22j_z 0.0254 0.0002        0.0364
## Q12_z  0.4724 0.0318 0.0364

mydataNew$Dissimilarity1 <- sqrt(mydataNew$Q21_z^2 + mydataNew$Q17b_z^2 + mydataNew$Q22j_z^2 + mydataNew$Q12_z^2)

head(mydataNew[order(-mydataNew$Dissimilarity1), c(1, 152)], 10)

## # A tibble: 10 × 2
##       ID Dissimilarity1[,1]
##    <dbl>              <dbl>
##  1   153               4.44
##  2   152               4.30
##  3   151               4.28
##  4   110               4.17
##  5   162               4.08
##  6    64               3.85
##  7   169               3.67
##  8   154               3.64
##  9   150               3.56
## 10    78               3.51

hist(mydataNew$Dissimilarity1, 
     xlab = "Dissimilarity", 
     ylab = "Frequency", 
     main = "Histogram of Dissimilarity")

distanceNew <- get_dist(mydataNew[c("Q21_z", "Q17b_z", "Q22j_z", "Q12_z")],
                     method = "euclidian")

distanceNew2 <- distanceNew^2

fviz_dist(distanceNew2,
          gradient = list(low = "darkred", mid = "lightblue", high = "blue"))

get_clust_tendency(mydataNew[c("Q21_z", "Q17b_z", "Q22j_z", "Q12_z")],
                   n = nrow(mydataNew) - 1,
                   graph = FALSE)

## $hopkins_stat
## [1] 0.6124007
## 
## $plot
## NULL

As this value is 0.5, we can say our data is clusterable

Dendrogram

WARDNew <- mydataNew[c("Q21_z", "Q17b_z", "Q22j_z", "Q12_z")] %>%
           get_dist(method = "euclidian") %>%
           hclust(method = "ward.D2")

WARDNew

## 
## Call:
## hclust(d = ., method = "ward.D2")
## 
## Cluster method   : ward.D2 
## Distance         : euclidean 
## Number of objects: 174

fviz_dend(WARDNew,
          k = 5,
          cex = 0.5,
          palette = "jama",
          color_labels_by_k = TRUE,
          rect = TRUE)

## Warning: The `<scale>` argument of `guides()` cannot be `FALSE`. Use "none" instead as
## of ggplot2 3.3.4.
## ℹ The deprecated feature was likely used in the factoextra package.
##   Please report the issue at <https://github.com/kassambara/factoextra/issues>.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.

Based on the above dendrogram structure 5 clusters are the most appropriate

mydataNew$ClusterWard2 <- cutree(WARDNew,
                             k = 5)
head(mydataNew[c(1, 153)])

## # A tibble: 6 × 2
##      ID ClusterWard2
##   <dbl>        <int>
## 1     1            1
## 2     2            2
## 3     3            3
## 4     4            3
## 5     5            1
## 6     6            4

Initial_leadersNew <- aggregate(mydataNew[ , c("Q21_z", "Q17b_z", "Q22j_z", "Q12_z")],
                             by = list(mydataNew$ClusterWard2),
                             FUN = mean)
Initial_leadersNew

##   Group.1       Q21_z      Q17b_z     Q22j_z       Q12_z
## 1       1  1.03300792  0.29833863  0.3112749  0.46828266
## 2       2 -1.07180564  0.41615065  0.4504003  1.56815074
## 3       3 -0.09723225  0.36128255  0.3551104 -0.72308065
## 4       4 -0.53144035 -0.01396479 -1.4726985 -0.03593379
## 5       5 -0.13558241 -2.28655011 -0.1059309 -0.32075639

Clustering using the K Means method

K_MEANSNew <- hkmeans(mydataNew[c("Q21_z", "Q17b_z", "Q22j_z", "Q12_z")], 
                   k = 5,
                   hc.metric = "euclidean",
                   hc.method = "ward.D2")

K_MEANSNew

## Hierarchical K-means clustering with 5 clusters of sizes 38, 24, 70, 26, 16
## 
## Cluster means:
##         Q21_z     Q17b_z     Q22j_z      Q12_z
## 1  1.27618278  0.3351549  0.3436296  0.4196689
## 2 -0.94827619  0.3689031  0.4622637  1.4535219
## 3 -0.17432634  0.3235841  0.3436296 -0.5755567
## 4 -0.56139823 -0.1632806 -1.7370290 -0.1777341
## 5  0.06643004 -2.4996969 -0.1902236 -0.3701181
## 
## Clustering vector:
##   [1] 1 2 1 3 1 4 3 3 3 1 1 5 4 1 3 3 1 4 3 4 3 1 2 3 3 1 3 1 3 4 3 3 3 2 1 1 1
##  [38] 2 3 3 1 1 2 3 3 5 3 1 4 2 3 1 4 3 4 2 1 4 2 3 4 2 4 4 1 4 3 3 3 3 3 3 1 1
##  [75] 3 3 3 4 1 1 3 3 1 3 3 3 4 3 3 3 3 1 4 4 3 1 3 5 4 3 2 3 3 3 1 5 4 3 1 5 3
## [112] 1 2 5 5 3 2 4 1 1 3 1 5 3 1 5 3 3 3 2 3 3 1 3 3 3 3 4 3 5 2 3 3 2 1 4 1 4
## [149] 2 5 5 4 5 5 2 2 2 3 2 2 2 5 2 2 3 1 3 5 4 1 3 4 3 3
## 
## Within cluster sum of squares by cluster:
## [1] 59.36713 29.96816 95.95803 54.48124 62.32125
##  (between_SS / total_SS =  56.3 %)
## 
## Available components:
## 
##  [1] "cluster"      "centers"      "totss"        "withinss"     "tot.withinss"
##  [6] "betweenss"    "size"         "iter"         "ifault"       "data"        
## [11] "hclust"

fviz_cluster(K_MEANSNew, 
             palette = "jama", 
             repel = FALSE,
             ggtheme = theme_classic())

mydataNew$ClusterK_MeansNew <- K_MEANSNew$cluster
head(mydataNew[c("Q21_z", "Q17b_z", "Q22j_z", "Q12_z")])

## # A tibble: 6 × 4
##   Q21_z[,1] Q17b_z[,1] Q22j_z[,1] Q12_z[,1]
##       <dbl>      <dbl>      <dbl>     <dbl>
## 1     0.905    -0.0698      1.06     0.568 
## 2    -1.21     -0.880       0.344    1.19  
## 3     1.61     -0.0698      0.344   -1.31  
## 4    -0.507     0.740       0.344   -0.683 
## 5     0.199    -0.0698      0.344    0.568 
## 6    -0.507    -0.0698     -1.08    -0.0575

table(mydataNew$ClusterWard2)

## 
##  1  2  3  4  5 
## 44 20 62 29 19

table(mydataNew$ClusterK_MeansNew)

## 
##  1  2  3  4  5 
## 38 24 70 26 16

table(mydataNew$ClusterWard2, mydataNew$ClusterK_MeansNew)

##    
##      1  2  3  4  5
##   1 35  2  7  0  0
##   2  0 20  0  0  0
##   3  3  1 55  3  0
##   4  0  0  7 22  0
##   5  0  1  1  1 16

CentroidsNew <- K_MEANSNew$centers
CentroidsNew

##         Q21_z     Q17b_z     Q22j_z      Q12_z
## 1  1.27618278  0.3351549  0.3436296  0.4196689
## 2 -0.94827619  0.3689031  0.4622637  1.4535219
## 3 -0.17432634  0.3235841  0.3436296 -0.5755567
## 4 -0.56139823 -0.1632806 -1.7370290 -0.1777341
## 5  0.06643004 -2.4996969 -0.1902236 -0.3701181

FigureNew <- as.data.frame(CentroidsNew)
FigureNew$id <- 1:nrow(FigureNew)
FigureNew <- pivot_longer(FigureNew, cols = c("Q21_z", "Q17b_z", "Q22j_z", "Q12_z"))

FigureNew$Groups <- factor(FigureNew$id, 
                        levels = c(1, 2, 3, 4, 5), 
                        labels = c("1", "2", "3", "4", "5"))

FigureNew$nameFactor <- factor(FigureNew$name, 
                            levels = c("Q21_z", "Q17b_z", "Q22j_z", "Q12_z"), 
                            labels = c("Q21_z", "Q17b_z", "Q22j_z", "Q12_z"))

ggplot(FigureNew, aes(x = nameFactor, y = value)) +
  geom_hline(yintercept = 0) +
  theme_bw() +
  geom_point(aes(shape = Groups, col = Groups), size = 3) +
  geom_line(aes(group = id), linewidth = 1) +
  ylab("Averages") +
  xlab("Cluster variables") +
  ylim(-3, 3)

Checking cluster fit

H0: Mean(Cluster k variable, Cluster 1) = Mean(Cluster k variable, Cluster 2) = Mean(Cluster k variable, Cluster 3) = Mean(Cluster k variable, Cluster 4) = Mean(Cluster k variable, Cluster 5)

H1: At least one Mean is different from others

clusterfitNew <- aov(cbind(Q21_z, Q17b_z, Q22j_z, Q12_z) ~ as.factor(ClusterK_MeansNew), 
                 data = mydataNew)

summary(clusterfitNew)

##  Response 1 :
##                               Df Sum Sq Mean Sq F value    Pr(>F)    
## as.factor(ClusterK_MeansNew)   4 93.862 23.4655  50.111 < 2.2e-16 ***
## Residuals                    169 79.138  0.4683                      
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
##  Response 2 :
##                               Df  Sum Sq Mean Sq F value    Pr(>F)    
## as.factor(ClusterK_MeansNew)   4 115.533  28.883  84.941 < 2.2e-16 ***
## Residuals                    169  57.467   0.340                      
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
##  Response 3 :
##                               Df Sum Sq Mean Sq F value    Pr(>F)    
## as.factor(ClusterK_MeansNew)   4 96.909 24.2273   53.81 < 2.2e-16 ***
## Residuals                    169 76.091  0.4502                      
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
##  Response 4 :
##                               Df Sum Sq Mean Sq F value    Pr(>F)    
## as.factor(ClusterK_MeansNew)   4   83.6  20.900  39.509 < 2.2e-16 ***
## Residuals                    169   89.4   0.529                      
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

We reject H0 for all the cluster variables(p<0.001) and can say that the variables are statistically appropriate for clustering

Validation

Using Age

H0: Mean(Age, Cluster 1) = Mean(Age, Cluster 2) = Mean(Age, Cluster 3) = Mean(Age, Cluster 4) = Mean(Age, Cluster 5)

H1: At least one Mean is different from others

aggregate(mydataNew$Q25, 
          by = list(mydataNew$ClusterK_MeansNew), 
          FUN = "mean")

##   Group.1        x
## 1       1 21.13158
## 2       2 21.45833
## 3       3 21.64286
## 4       4 21.61538
## 5       5 21.43750

agefitNew <- aov(Q25 ~ as.factor(ClusterK_MeansNew), 
           data = mydataNew)

summary(agefitNew)

##                               Df Sum Sq Mean Sq F value Pr(>F)
## as.factor(ClusterK_MeansNew)   4      7   1.746   0.243  0.914
## Residuals                    169   1214   7.186

As p>0.05, we cannot reject H0 and as a result we cannot validate our clusters using age

Using recommender score, Q24 - How likely are you to recommend your current bank to your friend? (10 - Very likely; 1 - Very unlikely)

H0: Mean(Recommender Score, Cluster 1) = Mean(Recommender Score, Cluster 2) = Mean(Recommender Score, Cluster 3) = Mean(Recommender Score, Cluster 4) = Mean(Recommender Score, Cluster 5)

H1: At least one Mean is different from others

aggregate(mydataNew$Q24, 
          by = list(mydataNew$ClusterK_MeansNew), 
          FUN = "mean")

##   Group.1        x
## 1       1 7.263158
## 2       2 6.250000
## 3       3 6.142857
## 4       4 6.576923
## 5       5 5.312500

agefitNew <- aov(Q24 ~ as.factor(ClusterK_MeansNew), 
           data = mydataNew)

summary(agefitNew)

##                               Df Sum Sq Mean Sq F value Pr(>F)  
## as.factor(ClusterK_MeansNew)   4   53.2  13.300   2.734 0.0307 *
## Residuals                    169  822.2   4.865                 
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

We can reject H0(p<0.05) and can say At least one Mean is different from others and as a result we are able to validate our clusters using the recommender score

Using familiarity score, Q9 - Q9 - How familiar are you with the latest features and updates available in your mobile banking apps? (1 - Not familiar at all; 7 - Very familiar)

H0: Mean(Familiarity Score, Cluster 1) = Mean(Familiarity Score, Cluster 2) = Mean(Familiarity Score, Cluster 3) = Mean(Familiarity Score, Cluster 4) = Mean(Familiarity Score, Cluster 5)

H1: At least one Mean is different from others

aggregate(mydataNew$Q9, 
          by = list(mydataNew$ClusterK_MeansNew), 
          FUN = "mean")

##   Group.1        x
## 1       1 4.684211
## 2       2 4.041667
## 3       3 4.514286
## 4       4 4.653846
## 5       5 4.562500

agefitNew <- aov(Q9 ~ as.factor(ClusterK_MeansNew), 
           data = mydataNew)

summary(agefitNew)

##                               Df Sum Sq Mean Sq F value Pr(>F)
## as.factor(ClusterK_MeansNew)   4    7.0   1.750   0.746  0.562
## Residuals                    169  396.5   2.346

As p>0.05, we cannot reject H0 and as a result we cannot validate our clusters using familiarity score

Using gender

H0: There is no association between the clusters and gender

H1: There is an association

chisq_results <- chisq.test(mydataNew$GenderF, as.factor(mydataNew$ClusterK_MeansNew))

## Warning in chisq.test(mydataNew$GenderF,
## as.factor(mydataNew$ClusterK_MeansNew)): Chi-squared approximation may be
## incorrect

chisq_results

## 
##  Pearson's Chi-squared test
## 
## data:  mydataNew$GenderF and as.factor(mydataNew$ClusterK_MeansNew)
## X-squared = 18.492, df = 8, p-value = 0.01783

addmargins(chisq_results$observed)

##                  
## mydataNew$GenderF   1   2   3   4   5 Sum
##            male    11  15  37   8   5  76
##            female  27   9  32  16  11  95
##            other    0   0   1   2   0   3
##            Sum     38  24  70  26  16 174

addmargins(chisq_results$expected, 2)

##                  
## mydataNew$GenderF          1          2         3          4         5 Sum
##            male   16.5977011 10.4827586 30.574713 11.3563218 6.9885057  76
##            female 20.7471264 13.1034483 38.218391 14.1954023 8.7356322  95
##            other   0.6551724  0.4137931  1.206897  0.4482759 0.2758621   3

round(chisq_results$res, 2)

##                  
## mydataNew$GenderF     1     2     3     4     5
##            male   -1.37  1.40  1.16 -1.00 -0.75
##            female  1.37 -1.13 -1.01  0.48  0.77
##            other  -0.81 -0.64 -0.19  2.32 -0.53

We can reject H0(p<0.05) and can say there is an association betweeen the clusters and gender but we still cannot validate the clusters using it because the Other gender population is very small(3)

Using Employment status

chisq_results <- chisq.test(mydataNew$EmploymentF, as.factor(mydataNew$ClusterK_MeansNew))

## Warning in chisq.test(mydataNew$EmploymentF,
## as.factor(mydataNew$ClusterK_MeansNew)): Chi-squared approximation may be
## incorrect

chisq_results

## 
##  Pearson's Chi-squared test
## 
## data:  mydataNew$EmploymentF and as.factor(mydataNew$ClusterK_MeansNew)
## X-squared = 24.03, df = 16, p-value = 0.08884

We cannot reject H0 as p>0.05 and cannot reject there is no association between the clusters and employment status

addmargins(chisq_results$observed)

##                      
## mydataNew$EmploymentF   1   2   3   4   5 Sum
##    highschool student   7   4  11   4   0  26
##    student             24  15  35  17   8  99
##    full-time employed   7   4  20   5   4  40
##    part-time employed   0   0   3   0   3   6
##    unemployed           0   1   1   0   1   3
##    Sum                 38  24  70  26  16 174

Using Education level

chisq_results <- chisq.test(mydataNew$EducationF, as.factor(mydataNew$ClusterK_MeansNew))

## Warning in chisq.test(mydataNew$EducationF,
## as.factor(mydataNew$ClusterK_MeansNew)): Chi-squared approximation may be
## incorrect

chisq_results

## 
##  Pearson's Chi-squared test
## 
## data:  mydataNew$EducationF and as.factor(mydataNew$ClusterK_MeansNew)
## X-squared = 14.546, df = 16, p-value = 0.5581

We cannot reject H0 as p>0.05 and cannot reject there is no association between the clusters and education level

addmargins(chisq_results$observed)

##                     
## mydataNew$EducationF   1   2   3   4   5 Sum
##    primary school      2   2   2   2   0   8
##    secondary school   13   6  22   3   4  48
##    gymnasium           7   4  17   9   3  40
##    bachelor's degree  12   9  24  11   5  61
##    master's degree     4   3   5   1   4  17
##    Sum                38  24  70  26  16 174

Using Income level

chisq_resultsInc <- chisq.test(mydataNew$IncomeF, as.factor(mydataNew$ClusterK_MeansNew))

## Warning in chisq.test(mydataNew$IncomeF,
## as.factor(mydataNew$ClusterK_MeansNew)): Chi-squared approximation may be
## incorrect

chisq_resultsInc

## 
##  Pearson's Chi-squared test
## 
## data:  mydataNew$IncomeF and as.factor(mydataNew$ClusterK_MeansNew)
## X-squared = 21.262, df = 16, p-value = 0.1687

We cannot reject H0 as p>0.05 and cannot reject there is no association between the clusters and Income level

addmargins(chisq_resultsInc$observed)

##                  
## mydataNew$IncomeF   1   2   3   4   5 Sum
##   less than 500€   21  16  21  13   7  78
##   501€ - 1000€      8   3  25   7   3  46
##   1001€ - 1500€     5   3  11   2   3  24
##   1501€ - 2000€     2   0   9   4   1  16
##   more than 2000€   2   2   4   0   2  10
##   Sum              38  24  70  26  16 174

Using Primary Bank

chisq_results <- chisq.test(mydataNew$Bank_CombinedF, as.factor(mydataNew$ClusterK_MeansNew))

## Warning in chisq.test(mydataNew$Bank_CombinedF,
## as.factor(mydataNew$ClusterK_MeansNew)): Chi-squared approximation may be
## incorrect

chisq_results

## 
##  Pearson's Chi-squared test
## 
## data:  mydataNew$Bank_CombinedF and as.factor(mydataNew$ClusterK_MeansNew)
## X-squared = 9.0388, df = 8, p-value = 0.339

We cannot reject H0 as p>0.05 and cannot reject there is no association between the clusters and primary bank

addmargins(chisq_results$observed)

##                         
## mydataNew$Bank_CombinedF   1   2   3   4   5 Sum
##                   NLB     20   9  24   8   4  65
##                   NKBM    10   6  24  11   4  55
##                   Others   8   9  22   7   8  54
##                   Sum     38  24  70  26  16 174

Using neobank Usage

chisq_resultsNeo <- chisq.test(mydataNew$Q18F, as.factor(mydataNew$ClusterK_MeansNew))

chisq_resultsNeo

## 
##  Pearson's Chi-squared test
## 
## data:  mydataNew$Q18F and as.factor(mydataNew$ClusterK_MeansNew)
## X-squared = 5.4042, df = 4, p-value = 0.2483

We cannot reject H0 as p>0.05 and cannot reject there is no association between the clusters and neobank usage

addmargins(chisq_resultsNeo$observed)

##               
## mydataNew$Q18F   1   2   3   4   5 Sum
##            Yes  11  14  27  10   6  68
##            No   27  10  43  16  10 106
##            Sum  38  24  70  26  16 174

Using exploration

chisq_results <- chisq.test(mydataNew$ExploreF, as.factor(mydataNew$ClusterK_MeansNew))

## Warning in chisq.test(mydataNew$ExploreF,
## as.factor(mydataNew$ClusterK_MeansNew)): Chi-squared approximation may be
## incorrect

chisq_results

## 
##  Pearson's Chi-squared test
## 
## data:  mydataNew$ExploreF and as.factor(mydataNew$ClusterK_MeansNew)
## X-squared = 14.535, df = 16, p-value = 0.5589

We cannot reject H0 as p>0.05 and cannot reject there is no association between the clusters and exploration level

addmargins(chisq_results$observed)

##                         
## mydataNew$ExploreF         1   2   3   4   5 Sum
##   more than once a month   4   3   5   1   1  14
##   every second month       6   2   8   3   2  21
##   three to four times      4   1   6   4   4  19
##   once to twice           13  12  26   5   6  62
##   never                   11   6  25  13   3  58
##   Sum                     38  24  70  26  16 174

Using frequency of usage

chisq_resultsFre <- chisq.test(mydataNew$FrequencyF, as.factor(mydataNew$ClusterK_MeansNew))

## Warning in chisq.test(mydataNew$FrequencyF,
## as.factor(mydataNew$ClusterK_MeansNew)): Chi-squared approximation may be
## incorrect

chisq_resultsFre

## 
##  Pearson's Chi-squared test
## 
## data:  mydataNew$FrequencyF and as.factor(mydataNew$ClusterK_MeansNew)
## X-squared = 44.201, df = 20, p-value = 0.001415

We can reject H0(p<0.05) and can say there is an association betweeen the clusters and frequency of usage

addmargins(chisq_resultsFre$observed)

##                          
## mydataNew$FrequencyF        1   2   3   4   5 Sum
##   several times a day       8   6   9   5   3  31
##   daily                    13  11  23   6   4  57
##   weekly                   11   6  28  11   4  60
##   monthly                   4   0   2   4   2  12
##   less than monthly         0   1   0   0   3   4
##   I don’t use mobile bank   2   0   8   0   0  10
##   Sum                      38  24  70  26  16 174

Using importance of one mobile app

importanceNew <- aov(Q8 ~ as.factor(ClusterK_MeansNew), 
           data = mydataNew)

summary(importanceNew)

##                               Df Sum Sq Mean Sq F value Pr(>F)  
## as.factor(ClusterK_MeansNew)   4   24.1   6.031   2.992 0.0203 *
## Residuals                    169  340.6   2.016                 
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

We can reject H0(p<0.05) and can say At least one Mean is different from others and as a result we are able to validate our clusters using the score of the importance of one app

aggregate(mydataNew$Q8, 
          by = list(mydataNew$ClusterK_MeansNew), 
          FUN = "mean")

##   Group.1        x
## 1       1 6.184211
## 2       2 6.250000
## 3       3 5.442857
## 4       4 5.615385
## 5       5 5.250000

Perception Map

excel_file_path <- "C:/Users/ACER/Downloads/Perception_map.xlsx"
mydataMap <- read_excel("C:/Users/ACER/Downloads/Perception_map.xlsx")
head(mydataMap)

## # A tibble: 6 × 24
##   NLB_Simple NKBM_Simple SKB_Simple Unicredit_Simple Revolut_Simple N26_Simple
##        <dbl>       <dbl>      <dbl>            <dbl>          <dbl>      <dbl>
## 1          4           2          1                2              3          3
## 2          3           6          6                6              4          3
## 3          3           6          6                6              6          6
## 4          4           6          3                6              4          6
## 5          6           4          6                6              6          6
## 6          1           4          6                6              1          6
## # ℹ 18 more variables: NLB_Safe <dbl>, NKBM_Safe <dbl>, SKB_Safe <dbl>,
## #   Unicredit_Safe <dbl>, Revolut_Safe <dbl>, N26_Safe <dbl>,
## #   NLB_Advanced <dbl>, NKBM_Advanced <dbl>, SKB_Advanced <dbl>,
## #   Unicredit_Advanced <dbl>, Revolut_Advanced <dbl>, N26_Advanced <dbl>,
## #   NLB_Popular <dbl>, NKBM_Popular <dbl>, SKB_Popular <dbl>,
## #   Unicredit_Popular <dbl>, Revolut_Popular <dbl>, N26_Popular <dbl>

mydataMap[1:24] <- lapply(mydataMap[1:24], as.numeric)
head(mydataMap)

## # A tibble: 6 × 24
##   NLB_Simple NKBM_Simple SKB_Simple Unicredit_Simple Revolut_Simple N26_Simple
##        <dbl>       <dbl>      <dbl>            <dbl>          <dbl>      <dbl>
## 1          4           2          1                2              3          3
## 2          3           6          6                6              4          3
## 3          3           6          6                6              6          6
## 4          4           6          3                6              4          6
## 5          6           4          6                6              6          6
## 6          1           4          6                6              1          6
## # ℹ 18 more variables: NLB_Safe <dbl>, NKBM_Safe <dbl>, SKB_Safe <dbl>,
## #   Unicredit_Safe <dbl>, Revolut_Safe <dbl>, N26_Safe <dbl>,
## #   NLB_Advanced <dbl>, NKBM_Advanced <dbl>, SKB_Advanced <dbl>,
## #   Unicredit_Advanced <dbl>, Revolut_Advanced <dbl>, N26_Advanced <dbl>,
## #   NLB_Popular <dbl>, NKBM_Popular <dbl>, SKB_Popular <dbl>,
## #   Unicredit_Popular <dbl>, Revolut_Popular <dbl>, N26_Popular <dbl>

mydataMap <- mydataMap %>%
             replace_with_na_all(condition = ~.x %in% c(6)) 

#Replace all NA with averages by columns
mydataMap <- mydataMap %>% 
  mutate_all(~ifelse(is.na(.x), mean(.x, na.rm = TRUE), .x))  

head(mydataMap[1:7])

## # A tibble: 6 × 7
##   NLB_Simple NKBM_Simple SKB_Simple Unicredit_Simple Revolut_Simple N26_Simple
##        <dbl>       <dbl>      <dbl>            <dbl>          <dbl>      <dbl>
## 1       4           2          1                2              3          3   
## 2       3           2.80       2.39             2.26           4          3   
## 3       3           2.80       2.39             2.26           3.27       3.05
## 4       4           2.80       3                2.26           4          3.05
## 5       2.90        4          2.39             2.26           3.27       3.05
## 6       1           4          2.39             2.26           1          3.05
## # ℹ 1 more variable: NLB_Safe <dbl>

mydata_PCA <- mydataMap %>% 
  pivot_longer(everything(), names_to = "name", values_to = "score")  %>% 
  separate(name, into = c("retailer", "dimension"), sep = "_")%>% 
  pivot_wider(names_from = dimension, values_from = score, values_fn = mean) %>%
  column_to_rownames(var = "retailer")

print(mydata_PCA)

##             Simple     Safe Advanced  Popular
## NLB       2.904348 3.206612 2.641667 2.992424
## NKBM      2.802632 3.023256 2.477273 2.388350
## SKB       2.393443 2.986667 2.295775 2.367347
## Unicredit 2.255319 2.815385 2.310345 1.915663
## Revolut   3.265306 2.715789 3.405941 3.169355
## N26       3.054545 2.303571 3.190476 2.487179

pca <- PCA(mydata_PCA, 
           scale.unit = TRUE, 
           graph = FALSE,
           ncp = 2)

print(pca$var$cor)

##               Dim.1      Dim.2
## Simple    0.9686056  0.1408423
## Safe     -0.4779192  0.8756541
## Advanced  0.9773167 -0.1842975
## Popular   0.8051848  0.5740152

fviz_pca_biplot(pca, 
                repel = TRUE)

Hypothesis Testing

More than half of young people have limited awareness of the useful functions available in bank mobile apps, with significant gaps in knowledge, particularly regarding advanced or less commonly used features

Relevant question: Q9 - How familiar are you with the latest features and updates available in your mobile banking apps? (1 - Not familiar at all; 7 - Very familiar)

We use a test of population proportion for checking this. For this purpose, we define awareness as a score of 5 and above and 1-4 as lack of awareness

frequency <- table(mydataNew$Q9 < 5)
frequency

## 
## FALSE  TRUE 
##    80    94

H0: x = 0.5

H1: x > 0.5

prop.test(x = 94,
          n = 174,
          p = 0.5,
          correct = FALSE,
          alternative = "greater")

## 
##  1-sample proportions test without continuity correction
## 
## data:  94 out of 174, null probability 0.5
## X-squared = 1.1264, df = 1, p-value = 0.1443
## alternative hypothesis: true p is greater than 0.5
## 95 percent confidence interval:
##  0.4779426 1.0000000
## sample estimates:
##         p 
## 0.5402299

As p>0.05 we cannot reject H0, implying we cannot reject that the number of young people having awareness about useful functions is not greater than the number of young people who lack awareness

Social media as a communication channel is preferred over traditional media for banking services

Relevant Question: Q12 - How effective do you find social media as a source of information about mobile banking functions? (1 - Not effective at all; 7 - Very effective)

We use a test of population proportion for checking this. For this purpose, we define preference as a score of 5 and above and 1-4 as not preferring social media

frequency <- table(mydataNew$Q12 < 5)
frequency

## 
## FALSE  TRUE 
##    51   123

prop.test(x = 51,
          n = 174,
          p = 0.5,
          correct = FALSE,
          alternative = "greater")

## 
##  1-sample proportions test without continuity correction
## 
## data:  51 out of 174, null probability 0.5
## X-squared = 29.793, df = 1, p-value = 1
## alternative hypothesis: true p is greater than 0.5
## 95 percent confidence interval:
##  0.2398586 1.0000000
## sample estimates:
##         p 
## 0.2931034

As p>0.05 we cannot reject H0, implying we cannot reject that the number of young people finding social media effective is not greater than those who don’t find social media effective

Simplicity is a game changer rather than other factors in young people’s choice of banking services

Relevant question: Q21 - When considering banking services, which is more important to you, simplicity or security? (1 - prioritize simplicity; 7 - prioritize security)

We use a test of population proportion for checking this. For this purpose, we define preference as a score of 5 and above as leaning towards security 1-4 as leaning towards simplicity

frequency <- table(mydataNew$Q21 < 5)
frequency

## 
## FALSE  TRUE 
##    77    97

prop.test(x = 97,
          n = 174,
          p = 0.5,
          correct = FALSE,
          alternative = "greater")

## 
##  1-sample proportions test without continuity correction
## 
## data:  97 out of 174, null probability 0.5
## X-squared = 2.2989, df = 1, p-value = 0.06473
## alternative hypothesis: true p is greater than 0.5
## 95 percent confidence interval:
##  0.4951262 1.0000000
## sample estimates:
##         p 
## 0.5574713

As p>0.05 we cannot reject H0 in statistical terms but since its quite close to 0.05, we can say that the young people lean more towards simplicity than security

The type and amount of incentives is a significant motivator for young people to actively use and explore various mobile banking services offered by the institution

Relevant question: Q14 - Would you like to participate in any rewards programs offered by your bank? ( 1 - yes; 2 - no; 3 - I don’t know)

We use a test of population proportion for checking this. For this purpose, we include the responders who marked 1 as those who are motivated by the incentives and 2 as those who are not motivated and we removed I don’t know responses for this analysis

frequency <- table(mydataNew$Q14 < 2)
frequency

## 
## FALSE  TRUE 
##    99    75

prop.test(x = 75,
          n = 140,
          p = 0.5,
          correct = FALSE,
          alternative = "greater")

## 
##  1-sample proportions test without continuity correction
## 
## data:  75 out of 140, null probability 0.5
## X-squared = 0.71429, df = 1, p-value = 0.199
## alternative hypothesis: true p is greater than 0.5
## 95 percent confidence interval:
##  0.466364 1.000000
## sample estimates:
##         p 
## 0.5357143

As p>0.05 we cannot reject H0, implying we cannot reject that the number of young people motivated by incentives is not greater than those who are not motivated by incentives

For the next hypotheses, we use Comparison of three or more arithmetic means for dependent samples (extension of paired samples t-test), rANOVA

H0: Mean(1) = Mean(2) = ….. Mean(k)

H1: At least one mean is different

We also tested normality for all the hypotheses and since each one of them wasn’t distributed normally, we used the non parametric test(Friedman ANOVA) instead

H0: All distribution locations of variables are the same

H1: At least one distribution location of variable is different

Most important type of incentive

Relevant question: Q15 - Order next reward incentives from least motivational to the most motivational for you, when exploring new banking services? (Q15a - discounts on banking services and purchases; Q15b - cash rewards; Q15c - tickets to events; Q15d - customized financial advice)

mydata_motivation <- mydata[c(1, 55:58)]
head(mydata_motivation)

## # A tibble: 6 × 5
##      ID  Q15a  Q15b  Q15c  Q15d
##   <dbl> <dbl> <dbl> <dbl> <dbl>
## 1     1     2     4     3     1
## 2     2     1     2     4     3
## 3     3     4     3     2     1
## 4     4     3     4     2     1
## 5     5     2     1     3     4
## 6     6     3     4     2     1

mydata_long <- mydata_motivation %>%
                 pivot_longer(
                   cols = c("Q15a", "Q15b", "Q15c", "Q15d"),
                   names_to = "Motivation", 
                   values_to = "Score") %>%
                 convert_as_factor(Motivation)


mydata_long <- as.data.frame(mydata_long)

tail(mydata_long, 10)

##      ID Motivation Score
## 687 172       Q15c     1
## 688 172       Q15d     4
## 689 173       Q15a     3
## 690 173       Q15b     4
## 691 173       Q15c     2
## 692 173       Q15d     1
## 693 174       Q15a     3
## 694 174       Q15b     4
## 695 174       Q15c     2
## 696 174       Q15d     1

ggboxplot(mydata_long, 
          x = "Motivation", 
          y = "Score", 
          add = "jitter")

mydata_long %>%
  group_by(Motivation) %>%
  shapiro_test(Score)

## # A tibble: 4 × 4
##   Motivation variable statistic        p
##   <fct>      <chr>        <dbl>    <dbl>
## 1 Q15a       Score        0.854 6.47e-12
## 2 Q15b       Score        0.830 6.04e-13
## 3 Q15c       Score        0.841 1.75e-12
## 4 Q15d       Score        0.857 9.03e-12

As none of the variables are distributed normally, we use the Friedman ANOVA method

library(dplyr)

mydata_long <- mydata_long %>%
  select(ID, Motivation, Score)

library(rstatix)

mydata_long %>%
  group_by(Motivation) %>%
  get_summary_stats(Score, type = "common")

## # A tibble: 4 × 11
##   Motivation variable     n   min   max median   iqr  mean    sd    se    ci
##   <fct>      <fct>    <dbl> <dbl> <dbl>  <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 Q15a       Score      174     1     4      3     2  2.71  1.09 0.083 0.163
## 2 Q15b       Score      174     1     4      3     2  2.75  1.15 0.087 0.172
## 3 Q15c       Score      174     1     4      2     2  2.09  1.00 0.076 0.15 
## 4 Q15d       Score      174     1     4      2     1  2.45  1.11 0.084 0.166

Friedman Anova

# Load the necessary libraries
library(rstatix)

# Perform Friedman test with RowNumber as a blocking variable
FriedmanANOVA <- friedman_test(Score ~ Motivation | ID,
                               data = mydata_long)

# Display the results
FriedmanANOVA

## # A tibble: 1 × 6
##   .y.       n statistic    df          p method       
## * <chr> <int>     <dbl> <dbl>      <dbl> <chr>        
## 1 Score   174      28.4     3 0.00000292 Friedman test

We can reject H0(p<0.001) and can say At least one distribution location of variable is different

library(rstatix)

#Wilcoxon signed rank tests - comparing all possible paires.
paires_nonpar <- wilcox_test(Score ~ Motivation, 
                             paired = TRUE, 
                             p.adjust.method = "bonferroni",
                             data = mydata_long)

paires_nonpar

## # A tibble: 6 × 9
##   .y.   group1 group2    n1    n2 statistic         p     p.adj p.adj.signif
## * <chr> <chr>  <chr>  <int> <int>     <dbl>     <dbl>     <dbl> <chr>       
## 1 Score Q15a   Q15b     174   174     7304. 0.635     1         ns          
## 2 Score Q15a   Q15c     174   174    10525  0.0000082 0.0000492 ****        
## 3 Score Q15a   Q15d     174   174     8832. 0.062     0.37      ns          
## 4 Score Q15b   Q15c     174   174    10372. 0.0000239 0.000143  ***         
## 5 Score Q15b   Q15d     174   174     8894  0.051     0.303     ns          
## 6 Score Q15c   Q15d     174   174     5750. 0.004     0.023     *

library(rstatix)
comparisons <- paires_nonpar %>% 
                 add_y_position(fun = "median", step.increase = 0.35)

library(ggpubr)
ggboxplot(mydata_long, x = "Motivation", y = "Score", add = "point", ylim=c(0, 5)) +
  stat_pvalue_manual(comparisons, hide.ns = FALSE) +
  stat_summary(fun = median, geom = "point", shape = 16, size = 4,
               aes(group = Motivation), color = "darkred",
               position = position_dodge(width = 0.8)) +
  stat_summary(fun = median, colour = "darkred", 
               position = position_dodge(width = 0.8),
               geom = "text", vjust = 0.5, hjust = -8,
               aes(label = round(after_stat(y), digits = 2), group = Motivation)) +
  labs(subtitle = get_test_label(FriedmanANOVA,  detailed = TRUE),
       caption = get_pwc_label(comparisons))

Based on the above table we can say discounts on banking services and purchases and cash rewards are significantly more important for young people than the event tickets and financial advice

Most important properties of banking service

Relevant question: Q17 - How important are each of the stated properties of a banking service to you? (Q17a - instant money transfers; Q17b - fast and accessible customer support; Q17c - fast new account opening; Q17d - one mobile banking app for all bank services; Q17e - loan accessible through online bank; Q17f - low costs of managing the account; Q17g - fast and complete information about banking services) (1 - completely unimportant; 2 - unimportant; 3 - slightly unimportant; 4 - undecided; 5 - slightly important; 6 - important; 7 - very important)

mydata_important <- mydata[c(1, 60:66)]
head(mydata_important)

## # A tibble: 6 × 8
##      ID  Q17a  Q17b  Q17c  Q17d  Q17e  Q17f  Q17g
##   <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1     1     6     6     5     6     4     5     6
## 2     2     7     5     5     7     5     7     6
## 3     3     5     6     5     2     2     7     6
## 4     4     7     7     7     7     6     7     7
## 5     5     6     6     5     6     2     7     5
## 6     6     7     6     5     5     3     6     6

mydata_important_long <- mydata_important %>%
                 pivot_longer(
                   cols = c("Q17a", "Q17b", "Q17c", "Q17d","Q17e","Q17f","Q17g"),
                   names_to = "Importance", 
                   values_to = "ScoreI") %>%
                 convert_as_factor(Importance)

mydata_important_long <- as.data.frame(mydata_important_long)

tail(mydata_important_long, 10)

##       ID Importance ScoreI
## 1209 173       Q17e      5
## 1210 173       Q17f      7
## 1211 173       Q17g      6
## 1212 174       Q17a      7
## 1213 174       Q17b      6
## 1214 174       Q17c      7
## 1215 174       Q17d      7
## 1216 174       Q17e      4
## 1217 174       Q17f      6
## 1218 174       Q17g      6

ggboxplot(mydata_important_long, 
          x = "Importance", 
          y = "ScoreI", 
          add = "jitter")

mydata_important_long %>%
  group_by(Importance) %>%
  shapiro_test(ScoreI)

## # A tibble: 7 × 4
##   Importance variable statistic        p
##   <fct>      <chr>        <dbl>    <dbl>
## 1 Q17a       ScoreI       0.706 3.05e-17
## 2 Q17b       ScoreI       0.734 2.10e-16
## 3 Q17c       ScoreI       0.847 3.28e-12
## 4 Q17d       ScoreI       0.747 5.30e-16
## 5 Q17e       ScoreI       0.921 4.08e- 8
## 6 Q17f       ScoreI       0.712 4.71e-17
## 7 Q17g       ScoreI       0.778 5.92e-15

As none of the variables are distributed normally, we use the Friedman ANOVA method

mydata_important_long %>%
  group_by(Importance) %>%
  get_summary_stats(ScoreI, type = "common")

## # A tibble: 7 × 11
##   Importance variable     n   min   max median   iqr  mean    sd    se    ci
##   <fct>      <fct>    <dbl> <dbl> <dbl>  <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 Q17a       ScoreI     174     1     7      6     1  6.06  1.33 0.101 0.199
## 2 Q17b       ScoreI     174     1     7      6     1  6.09  1.24 0.094 0.185
## 3 Q17c       ScoreI     174     1     7      6     2  5.59  1.39 0.106 0.209
## 4 Q17d       ScoreI     174     2     7      7     1  6.12  1.20 0.091 0.179
## 5 Q17e       ScoreI     174     1     7      5     2  4.57  1.64 0.125 0.246
## 6 Q17f       ScoreI     174     2     7      7     1  6.29  1.01 0.077 0.152
## 7 Q17g       ScoreI     174     2     7      6     1  6.06  1.12 0.085 0.168

# Perform Friedman test with RowNumber as a blocking variable
FriedmanANOVA <- friedman_test(ScoreI ~ Importance | ID,
                               data = mydata_important_long)

# Display the results
FriedmanANOVA

## # A tibble: 1 × 6
##   .y.        n statistic    df        p method       
## * <chr>  <int>     <dbl> <dbl>    <dbl> <chr>        
## 1 ScoreI   174      275.     6 1.58e-56 Friedman test

We can reject H0(p<0.001) and can say At least one distribution location of variable is different

#Wilcoxon signed rank tests - comparing all possible paires.
paires_nonparI <- wilcox_test(ScoreI ~ Importance, 
                             paired = TRUE, 
                             p.adjust.method = "bonferroni",
                             data = mydata_important_long)

paires_nonparI

## # A tibble: 21 × 9
##    .y.    group1 group2    n1    n2 statistic        p    p.adj p.adj.signif
##  * <chr>  <chr>  <chr>  <int> <int>     <dbl>    <dbl>    <dbl> <chr>       
##  1 ScoreI Q17a   Q17b     174   174     1784  8.47e- 1 1   e+ 0 ns          
##  2 ScoreI Q17a   Q17c     174   174     3769  1.10e- 5 2.31e- 4 ***         
##  3 ScoreI Q17a   Q17d     174   174     1886. 9.06e- 1 1   e+ 0 ns          
##  4 ScoreI Q17a   Q17e     174   174     8690. 8.38e-18 1.76e-16 ****        
##  5 ScoreI Q17a   Q17f     174   174     1602. 6.2 e- 2 1   e+ 0 ns          
##  6 ScoreI Q17a   Q17g     174   174     1794  3.84e- 1 1   e+ 0 ns          
##  7 ScoreI Q17b   Q17c     174   174     3503  7.1 e- 7 1.49e- 5 ****        
##  8 ScoreI Q17b   Q17d     174   174     1884. 6.17e- 1 1   e+ 0 ns          
##  9 ScoreI Q17b   Q17e     174   174     8555  1.5 e-18 3.15e-17 ****        
## 10 ScoreI Q17b   Q17f     174   174     1498. 4.6 e- 2 9.68e- 1 ns          
## # ℹ 11 more rows

comparisonsI <- paires_nonparI %>% 
                 add_y_position(fun = "median", step.increase = 0.35)

ggboxplot(mydata_important_long, x = "Importance", y = "ScoreI", add = "point", ylim=c(0, 27)) +
  stat_pvalue_manual(comparisonsI, hide.ns = FALSE) +
  stat_summary(fun = median, geom = "point", shape = 16, size = 4,
               aes(group = Importance), color = "darkred",
               position = position_dodge(width = 0.8)) +
  stat_summary(fun = median, colour = "darkred", 
               position = position_dodge(width = 0.6),
               geom = "text", vjust = 0.5, hjust = -8,
               aes(label = round(after_stat(y), digits = 2), group = Importance)) +
  labs(subtitle = get_test_label(FriedmanANOVA,  detailed = TRUE),
       caption = get_pwc_label(comparisonsI))

Based on the above table we can say one mobile banking app for all bank services are significantly more important for young people than other properties of banking services

Importance of bank characteristics

Relevant question: Q22 - How important are stated properties of banks to you? (Q22a - polite service from employees; Q22b - modern look of the branch office; Q22c - easy and convenient mobile banking; Q22d - access to information about services; Q22e - feeling of financial security; Q22f - fast services/speed of services; Q22g - good reputation of the bank; Q22h - technological advancement; Q22i - Accessibility; Q22j - benefits) (1 - completely unimportant; 2 - unimportant; 3 - slightly unimportant; 4 - undecided; 5 - slightly important; 6 - important; 7 - very important)

mydata_characterstics <- mydata[c(1, 77:86)]
head(mydata_characterstics)

## # A tibble: 6 × 11
##      ID  Q22a  Q22b  Q22c  Q22d  Q22e  Q22f  Q22g  Q22h  Q22i  Q22j
##   <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1     1     7     5     6     7     7     7     7     7     7     7
## 2     2     3     5     7     6     6     7     6     7     6     6
## 3     3     6     2     2     6     7     6     6     4     5     6
## 4     4     5     3     7     6     7     6     6     7     7     6
## 5     5     6     6     6     6     7     6     7     5     6     6
## 6     6     7     4     7     6     6     4     5     5     5     4

mydata_characterstics_long <- mydata_characterstics %>%
                 pivot_longer(
                   cols = c("Q22a", "Q22b", "Q22c", "Q22d","Q22e","Q22f","Q22g", "Q22h", "Q22i", "Q22j"),
                   names_to = "Characterstics", 
                   values_to = "ScoreC") %>%
                 convert_as_factor(Characterstics)

mydata_characterstics_long <- as.data.frame(mydata_characterstics_long)

tail(mydata_characterstics_long, 10)

##       ID Characterstics ScoreC
## 1731 174           Q22a      6
## 1732 174           Q22b      4
## 1733 174           Q22c      6
## 1734 174           Q22d      6
## 1735 174           Q22e      6
## 1736 174           Q22f      6
## 1737 174           Q22g      6
## 1738 174           Q22h      7
## 1739 174           Q22i      7
## 1740 174           Q22j      5

ggboxplot(mydata_characterstics_long, 
          x = "Characterstics", 
          y = "ScoreC", 
          add = "jitter")

mydata_characterstics_long %>%
  group_by(Characterstics) %>%
  shapiro_test(ScoreC)

## # A tibble: 10 × 4
##    Characterstics variable statistic        p
##    <fct>          <chr>        <dbl>    <dbl>
##  1 Q22a           ScoreC       0.851 4.86e-12
##  2 Q22b           ScoreC       0.932 2.40e- 7
##  3 Q22c           ScoreC       0.694 1.38e-17
##  4 Q22d           ScoreC       0.809 8.10e-14
##  5 Q22e           ScoreC       0.656 1.38e-18
##  6 Q22f           ScoreC       0.720 8.08e-17
##  7 Q22g           ScoreC       0.840 1.63e-12
##  8 Q22h           ScoreC       0.763 1.85e-15
##  9 Q22i           ScoreC       0.707 3.38e-17
## 10 Q22j           ScoreC       0.857 9.56e-12

As the variables are not normally distributed we can use the Friendman ANOVA method

mydata_characterstics_long %>%
  group_by(Characterstics) %>%
  get_summary_stats(ScoreC, type = "common")

## # A tibble: 10 × 11
##    Characterstics variable     n   min   max median   iqr  mean    sd    se
##    <fct>          <fct>    <dbl> <dbl> <dbl>  <dbl> <dbl> <dbl> <dbl> <dbl>
##  1 Q22a           ScoreC     174     1     7      6     1  5.42 1.47  0.112
##  2 Q22b           ScoreC     174     1     7      4     2  4.17 1.56  0.118
##  3 Q22c           ScoreC     174     1     7      7     1  6.17 1.26  0.095
##  4 Q22d           ScoreC     174     3     7      6     1  6.08 0.95  0.072
##  5 Q22e           ScoreC     174     2     7      7     1  6.27 1.20  0.091
##  6 Q22f           ScoreC     174     2     7      7     1  6.29 0.997 0.076
##  7 Q22g           ScoreC     174     1     7      6     1  5.61 1.29  0.098
##  8 Q22h           ScoreC     174     1     7      6     1  6.03 1.16  0.088
##  9 Q22i           ScoreC     174     2     7      7     1  6.27 1.04  0.079
## 10 Q22j           ScoreC     174     1     7      6     2  5.52 1.40  0.107
## # ℹ 1 more variable: ci <dbl>

# Load the necessary libraries


# Perform Friedman test with RowNumber as a blocking variable
FriedmanANOVA <- friedman_test(ScoreC ~ Characterstics | ID,
                               data = mydata_characterstics_long)

# Display the results
FriedmanANOVA

## # A tibble: 1 × 6
##   .y.        n statistic    df         p method       
## * <chr>  <int>     <dbl> <dbl>     <dbl> <chr>        
## 1 ScoreC   174      496.     9 3.45e-101 Friedman test

We can reject H0(p<0.001) and can say At least one distribution location of variable is different

#Wilcoxon signed rank tests - comparing all possible paires.
paires_nonparC <- wilcox_test(ScoreC ~ Characterstics, 
                             paired = TRUE, 
                             p.adjust.method = "bonferroni",
                             data = mydata_characterstics_long)

paires_nonparC

## # A tibble: 45 × 9
##    .y.    group1 group2    n1    n2 statistic        p    p.adj p.adj.signif
##  * <chr>  <chr>  <chr>  <int> <int>     <dbl>    <dbl>    <dbl> <chr>       
##  1 ScoreC Q22a   Q22b     174   174     7522  2.5 e-15 1.12e-13 ****        
##  2 ScoreC Q22a   Q22c     174   174      764. 2.02e-11 9.09e-10 ****        
##  3 ScoreC Q22a   Q22d     174   174      951  2.79e- 8 1.26e- 6 ****        
##  4 ScoreC Q22a   Q22e     174   174      634. 4.12e-13 1.85e-11 ****        
##  5 ScoreC Q22a   Q22f     174   174      812  9.21e-12 4.14e-10 ****        
##  6 ScoreC Q22a   Q22g     174   174     2534  1.1 e- 1 1   e+ 0 ns          
##  7 ScoreC Q22a   Q22h     174   174     1442. 8.66e- 7 3.90e- 5 ****        
##  8 ScoreC Q22a   Q22i     174   174      440  2.59e-13 1.17e-11 ****        
##  9 ScoreC Q22a   Q22j     174   174     2689  5.24e- 1 1   e+ 0 ns          
## 10 ScoreC Q22b   Q22c     174   174      295  4.71e-23 2.12e-21 ****        
## # ℹ 35 more rows

comparisonsC <- paires_nonparC %>% 
                 add_y_position(fun = "median", step.increase = 0.35)


ggboxplot(mydata_characterstics_long, x = "Characterstics", y = "ScoreC", add = "point", ylim=c(0, 57)) +
  stat_pvalue_manual(comparisonsC, hide.ns = FALSE) +
  stat_summary(fun = median, geom = "point", shape = 16, size = 4,
               aes(group = Characterstics), color = "darkred",
               position = position_dodge(width = 0.8)) +
  stat_summary(fun = median, colour = "darkred", 
               position = position_dodge(width = 0.6),
               geom = "text", vjust = 0.5, hjust = -8,
               aes(label = round(after_stat(y), digits = 2), group = Characterstics)) +
  labs(subtitle = get_test_label(FriedmanANOVA,  detailed = TRUE),
       caption = get_pwc_label(comparisonsC))

Based on the above table we can say the speed of services is the most important characteristic

Reasons for changing the bank

Relevant question: Q23 - How likely would you decide to change the bank in the next situations? (Q23a - long lines at branches; Q23b - unprofessional attitude of employees; Q23c - high provisions; Q23d - lack of useful properties in mobile banking app; Q23e - not getting a loan; Q24f - too much of bureaucracy; Q23g - deficient information about services, Q23h - recommendation of other bank from friend/family) (1 - for sure not; 2 - not; 3 - probably not; 4 - maybe; 5 - probably yes; 6 - yes; 7 - for sure yes )

mydata_change <- mydata[c(1, 87:94)]
head(mydata_change)

## # A tibble: 6 × 9
##      ID  Q23a  Q23b  Q23c  Q23d  Q23e  Q23f  Q23g  Q23h
##   <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1     1     4     7     7     4     4     6     6     4
## 2     2     5     5     7     6     5     5     5     5
## 3     3     4     5     7     1     5     6     6     4
## 4     4     5     6     7     5     6     5     3     4
## 5     5     3     4     7     2     5     6     6     4
## 6     6     3     5     4     5     1     2     6     4

mydata_change_long <- mydata_change %>%
                 pivot_longer(
                   cols = c("Q23a", "Q23b", "Q23c", "Q23d","Q23e","Q23f","Q23g","Q23h"),
                   names_to = "Change", 
                   values_to = "ScoreCh") %>%
                 convert_as_factor(Change)

mydata_change_long <- as.data.frame(mydata_change_long)

tail(mydata_change_long, 10)

##       ID Change ScoreCh
## 1383 173   Q23g       5
## 1384 173   Q23h       4
## 1385 174   Q23a       4
## 1386 174   Q23b       4
## 1387 174   Q23c       7
## 1388 174   Q23d       5
## 1389 174   Q23e       4
## 1390 174   Q23f       4
## 1391 174   Q23g       3
## 1392 174   Q23h       5

ggboxplot(mydata_change_long, 
          x = "Change", 
          y = "ScoreCh", 
          add = "jitter")

mydata_change_long %>%
  group_by(Change) %>%
  shapiro_test(ScoreCh)

## # A tibble: 8 × 4
##   Change variable statistic        p
##   <fct>  <chr>        <dbl>    <dbl>
## 1 Q23a   ScoreCh      0.946 3.25e- 6
## 2 Q23b   ScoreCh      0.916 1.77e- 8
## 3 Q23c   ScoreCh      0.859 1.15e-11
## 4 Q23d   ScoreCh      0.925 8.21e- 8
## 5 Q23e   ScoreCh      0.925 7.98e- 8
## 6 Q23f   ScoreCh      0.938 7.42e- 7
## 7 Q23g   ScoreCh      0.933 3.18e- 7
## 8 Q23h   ScoreCh      0.892 6.36e-10

As the variables are not distributed normally we can use the Friendman ANOVA method

mydata_change_long %>%
  group_by(Change) %>%
  get_summary_stats(ScoreCh, type = "common")

## # A tibble: 8 × 11
##   Change variable     n   min   max median   iqr  mean    sd    se    ci
##   <fct>  <fct>    <dbl> <dbl> <dbl>  <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 Q23a   ScoreCh    174     1     7      4     2  3.98  1.40 0.106 0.209
## 2 Q23b   ScoreCh    174     1     7      5     2  4.85  1.57 0.119 0.235
## 3 Q23c   ScoreCh    174     1     7      6     2  5.41  1.49 0.113 0.222
## 4 Q23d   ScoreCh    174     1     7      5     2  4.80  1.54 0.117 0.23 
## 5 Q23e   ScoreCh    174     1     7      5     1  4.60  1.41 0.107 0.211
## 6 Q23f   ScoreCh    174     1     7      5     2  4.74  1.42 0.108 0.213
## 7 Q23g   ScoreCh    174     1     7      5     2  4.67  1.51 0.114 0.226
## 8 Q23h   ScoreCh    174     1     7      4     1  4.10  1.19 0.09  0.178

# Load the necessary libraries

# Perform Friedman test with RowNumber as a blocking variable
FriedmanANOVA <- friedman_test(ScoreCh ~ Change | ID,
                               data = mydata_change_long)

# Display the results
FriedmanANOVA

## # A tibble: 1 × 6
##   .y.         n statistic    df        p method       
## * <chr>   <int>     <dbl> <dbl>    <dbl> <chr>        
## 1 ScoreCh   174      192.     7 4.74e-38 Friedman test

We can reject H0(p<0.001) and can say At least one distribution location of variable is different

#Wilcoxon signed rank tests - comparing all possible paires.
paires_nonparCh <- wilcox_test(ScoreCh ~ Change, 
                             paired = TRUE, 
                             p.adjust.method = "bonferroni",
                             data = mydata_change_long)

paires_nonparCh

## # A tibble: 28 × 9
##    .y.     group1 group2    n1    n2 statistic        p    p.adj p.adj.signif
##  * <chr>   <chr>  <chr>  <int> <int>     <dbl>    <dbl>    <dbl> <chr>       
##  1 ScoreCh Q23a   Q23b     174   174      923  4.47e-13 1.25e-11 ****        
##  2 ScoreCh Q23a   Q23c     174   174     1039  1.19e-17 3.33e-16 ****        
##  3 ScoreCh Q23a   Q23d     174   174     1566. 2.97e-10 8.32e- 9 ****        
##  4 ScoreCh Q23a   Q23e     174   174     2052. 9.91e- 6 2.77e- 4 ***         
##  5 ScoreCh Q23a   Q23f     174   174     1340. 8.55e- 9 2.39e- 7 ****        
##  6 ScoreCh Q23a   Q23g     174   174     1938. 9.36e- 7 2.62e- 5 ****        
##  7 ScoreCh Q23a   Q23h     174   174     3112. 3.42e- 1 1   e+ 0 ns          
##  8 ScoreCh Q23b   Q23c     174   174     1388. 9.65e- 7 2.70e- 5 ****        
##  9 ScoreCh Q23b   Q23d     174   174     3522. 4.79e- 1 1   e+ 0 ns          
## 10 ScoreCh Q23b   Q23e     174   174     4767  8.3 e- 2 1   e+ 0 ns          
## # ℹ 18 more rows

comparisonsCh <- paires_nonparCh %>% 
                 add_y_position(fun = "median", step.increase = 0.35)

ggboxplot(mydata_change_long, x = "Change", y = "ScoreCh", add = "point", ylim=c(0, 37)) +
  stat_pvalue_manual(comparisonsCh, hide.ns = FALSE) +
  stat_summary(fun = median, geom = "point", shape = 16, size = 2,
               aes(group = Change), color = "darkred",
               position = position_dodge(width = 0.8)) +
  stat_summary(fun = median, colour = "darkred", 
               position = position_dodge(width = 0.6),
               geom = "text", vjust = 0.5, hjust = -8,
               aes(label = round(after_stat(y), digits = 2), group = Change)) +
  labs(subtitle = get_test_label(FriedmanANOVA,  detailed = TRUE),
       caption = get_pwc_label(comparisonsCh))

Based on the above we can say high provision is the biggest reason to change the bank

NLB Complete

Group 6

2024-02-08

Introduction of the data, explaining the variables, and cleaning the data

Demographic Analysis

Gender

Income

Age

Education

Employment

Bank

Region

Association

Based on Income

Based on Education

Based on Employment status

Based on Gender

Based on Region

Clustering

Correlation, scaling, dissimilarity, and cluster tendency

Dendrogram

Clustering using the K Means method

Checking cluster fit

Validation

Perception Map

Hypothesis Testing