library(readxl)
library(Hmisc)
##
## Attaching package: 'Hmisc'
## The following objects are masked from 'package:base':
##
## format.pval, units
library(factoextra)
## Loading required package: ggplot2
## Welcome! Want to learn more? See two factoextra-related books at https://goo.gl/ve3WBa
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:Hmisc':
##
## src, summarize
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(ggplot2)
library(tidyr)
library(readxl)
library(naniar)
library(tibble)
library(FactoMineR)
library(ggpubr)
library(rstatix)
##
## Attaching package: 'rstatix'
## The following object is masked from 'package:stats':
##
## filter
library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ forcats 1.0.0 ✔ readr 2.1.4
## ✔ lubridate 1.9.3 ✔ stringr 1.5.1
## ✔ purrr 1.0.2
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ rstatix::filter() masks dplyr::filter(), stats::filter()
## ✖ dplyr::lag() masks stats::lag()
## ✖ dplyr::src() masks Hmisc::src()
## ✖ dplyr::summarize() masks Hmisc::summarize()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
#install.packages("readxl")
excel_file_path <- "C:/Users/ACER/Desktop/R Assignment/NLB Anka.xlsx"
mydata <- read_excel(excel_file_path)
head(mydata)
## # A tibble: 6 × 107
## ID Q1 Q1_10_text Q2a Q2b Q2c Q2d Q2e Q2f Q3a Q3b Q3c
## <dbl> <chr> <chr> <chr> <chr> <chr> <chr> <chr> <chr> <chr> <chr> <chr>
## 1 NA Katera… Drugo (pr… NLB NKBM SKB Unic… Revo… N26 NLB NKBM SKB
## 2 1 1 -2 4 2 1 2 3 3 4 3 1
## 3 2 3 -2 3 6 6 6 4 3 4 3 3
## 4 3 1 -2 3 6 6 6 6 6 3 6 2
## 5 4 1 -2 4 6 3 6 4 6 4 6 4
## 6 5 10 uporablja… 6 4 6 6 6 6 6 4 6
## # ℹ 95 more variables: Q3d <chr>, Q3e <chr>, Q3f <chr>, Q4a <chr>, Q4b <chr>,
## # Q4c <chr>, Q4d <chr>, Q4e <chr>, Q4f <chr>, Q5a <chr>, Q5b <chr>,
## # Q5c <chr>, Q5d <chr>, Q5e <chr>, Q5f <chr>, Q6 <chr>, Q7a <chr>, Q7b <chr>,
## # Q7c <chr>, Q7d <chr>, Q7e <chr>, Q7f <chr>, Q7g <chr>, Q7h <chr>,
## # Q7i <chr>, Q7i_text <chr>, Q8 <chr>, Q9 <chr>, Q10 <chr>, Q11a <chr>,
## # Q11b <chr>, Q11c <chr>, Q11d <chr>, Q11e <chr>, Q11f <chr>, Q11g <chr>,
## # Q11h <chr>, Q11h_text <chr>, Q12 <chr>, Q13 <chr>, Q13_7_text <chr>, …
n = 174
Unit of observation: one survey respondent
Description of the variables:
Q1 - Which is your primary bank? (1 - NLB; 2 - SKB; 3 - Gorenjska Banka; 4 - NKBM; 5 - Addiko Bank; 6 - Delavska hranilnica; 7 - Banka Intesa Sanpaolo; 8 - Sparkasse; 9 - DBS; 10 - Other (please specify): Q1_10_text )
Q2 - How much do you agree that services of banks are easy to use? (Q2a - NLB; Q2b - NKBM; Q2c - SKB, Q2d - UniCredit, Q2e - Revolut, Q2f - N26)
Q3 - How much do you agree that the stated bank seems safe to you (in terms of everyday transactions, cybernetic security, paying abroad)? (Q3a - NLB; Q3b - NKBM; Q3c - SKB, Q3d - UniCredit, Q3e - Revolut, Q3f - N26)
Q4 - How much do you agree that the stated bank seems advanced to you? (Q4a - NLB; Q4b - NKBM; Q4c - SKB, Q4d - UniCredit, Q4e - Revolut, Q4f - N26)
Q5 - How much do you agree that the stated bank is widespread among young people? (Q5a - NLB; Q5b - NKBM; Q5c - SKB, Q5d - UniCredit, Q5e - Revolut, Q5f - N26) (1 - I don’t agree at all; 2 - I don’t agree; 3 - undecided; 4 - I agree; 5 - I fully agree; 6 - I don’t know)
Q6 - How frequently do you use mobile banking apps? (1 - several times a day; 2 - daily; 3 - weekly; 4 - monthly; 5 - less than monthly, 6 - I don’t use mobile bank)
Q7 - Please select the functions your mobile bank offers that you are aware of? (Q7a - paying bills by scanning QR code; Q7b - checking account balance; Q7c - monthly spending overview; Q7d - cost planning; Q7e - integrated Flik; Q7f - submitting a rapid loan request; Q7g - accessing archived documents; Q7h - investing options; Q7i - Other (please specify): Q7i_text)
Q8 - How important is it for you to be able to access all services of your bank through one mobile app? (1 - Not important at all; 7 - Very important)
Q9 - How familiar are you with the latest features and updates available in your mobile banking apps? (1 - Not familiar at all; 7 - Very familiar)
Q10 - In the past year, how often have you actively explored new features in your mobile banking app? (1 - more than once a month; 2 - every second month; 3 - three to four times; 4 - once to twice; 5 - never)
Q11 - Through which channels are you most often notified about new and better bank services? (Q11a - social media; Q11b - traditional media (TV, radio); Q11c - friends and family; Q11d - in-branch assistance; Q11e - the bank’s official website; Q11f - forums; Q11g - mobile bank; Q11h - other (please specify): Q11h_text)
Q12 - How effective do you find social media as a source of information about mobile banking functions? (1 - Not effective at all; 7 - Very effective)
Q13 - What specific social media platform do you prefer the most for receiving information about banking services? (1 - Instagram; 2 - Facebook; 3 - X (Twitter); 4 - TikTok; 5 - LinkedIn; 6 - YouTube; 7 - other (please specify): Q13_7_text)
Q14 - Would you like to participate in any rewards programs offered by your bank? ( 1 - yes; 2 - no; 3 - I don’t know)
Q15 - Order the following reward incentives from least to most motivational for you when exploring new banking services. (Q15a - discounts on banking services and purchases; Q15b - cash rewards; Q15c - tickets to events; Q15d - customized financial advice)
Q16 - Are you aware of any rewards programs offered by your bank in the past? (1 - yes; 2 - no; 3 - I don’t know)
Q17 - How important are each of the stated properties of a banking service to you? (Q17a - instant money transfers; Q17b - fast and accessible customer support; Q17c - fast new account opening; Q17d - one mobile banking app for all bank services; Q17e - loan accessible through online bank; Q17f - low costs of managing the account; Q17g - fast and complete information about banking services) (1 - completely unimportant; 2 - unimportant; 3 - slightly unimportant; 4 - undecided; 5 - slightly important; 6 - important; 7 - very important)
Q18 - Do you use a Neobank (fintech companies that offer financial products and services to consumers via a digital platform such as Revolut, N26 or mBills)? (1 - yes; 2 - no)
Q19 - Would you consider trying services of a Neobank in the future? (1 - yes; 2 - no; 3 - I don’t know) (BO)
Q20 - Which bank institution would you choose in the next situations? (Q20a - sending money among friends; Q20b - overview of self spending habits; Q20c - online shopping; Q20d - everyday shopping like groceries; Q20e - abroad payments; Q20f - payments in foreign currency; Q20g - savings account) (1 - neobank; 2 - traditional bank; 3 - neobank and traditional bank)
Q21 - When considering banking services, which is more important to you, simplicity or security? (1 - prioritize simplicity; 7 - prioritize security)
Q22 - How important are stated properties of banks to you? (Q22a - polite service from employees; Q22b - modern look of the branch office; Q22c - easy and convenient mobile banking; Q22d - access to information about services; Q22e - feeling of financial security; Q22f - fast services/speed of services; Q22g - good reputation of the bank; Q22h - technological advancement; Q22i - Accessibility; Q22j - benefits) (1 - completely unimportant; 2 - unimportant; 3 - slightly unimportant; 4 - undecided; 5 - slightly important; 6 - important; 7 - very important)
Q23 - How likely would you decide to change the bank in the following situations? (Q23a - long lines at branches; Q23b - unprofessional attitude of employees; Q23c - high provisions; Q23d - lack of useful properties in mobile banking app; Q23e - not getting a loan; Q23f - too much bureaucracy; Q23g - deficient information about services; Q23h - recommendation of another bank from friends/family) (1 - for sure not; 2 - not; 3 - probably not; 4 - maybe; 5 - probably yes; 6 - yes; 7 - for sure yes)
Q24 - How likely are you to recommend your current bank to your friend? (10 - Very likely; 1 - Very unlikely)
Q25 - How old are you? (x - age)
Q26 - Gender (1 - male; 2 - female; 3 - other)
Q27 - Highest education attained (1 - primary school; 2 - secondary school; 3 - gymnasium; 4 - bachelor’s degree; 5 - master’s degree)
Q28 - Employment status (1 - highschool student; 2 - student; 3 - full-time employed; 4 - part-time employed; 5 - unemployed)
Q29 - What is your average personal monthly income? (1 - less than 500€; 2 - 501€ - 1000€; 3 - 1001€ - 1500€; 4 - 1501€ - 2000€; 5 - more than 2000€)
Q30 - Mark the region of your residence. (1- Gorenjska; 2 - Goriška; 3 - Osrednjeslovenska; 4 - Obalno-kraška; 5 - Notranjska; 6 - Dolenjska; 7 - Posavska; 8 - Zasavska; 9 - Savinjska; 10 - Koroška; 11 - Podravska; 12 - Pomurska)
mydata <- mydata[-1,] # drop the first row, which contains the question labels rather than responses
mydata$BankF <- factor(mydata$Q1,
levels = c(1,2,3,4,5,6,7,8,9,10),
labels = c("NLB", "SKB", "Gorenjska Banka", "NKBM", "Addiko Bank","Delavska hranilnica","Banka Intesa Sanpaolo","Sparkasse","DBS","Other"))
mydata$FrequencyF <- factor(mydata$Q6,
levels = c(1,2,3,4,5,6),
labels = c("several times a day", "daily", "weekly", "monthly", "less than monthly","I don’t use mobile bank"))
mydata$Q7aF <- factor(mydata$Q7a,
levels = c(0,1),
labels = c("No", "Yes"))
mydata$Q7bF <- factor(mydata$Q7b,
levels = c(0,1),
labels = c("No", "Yes"))
mydata$Q7cF <- factor(mydata$Q7c,
levels = c(0,1),
labels = c("No", "Yes"))
mydata$Q7dF <- factor(mydata$Q7d,
levels = c(0,1),
labels = c("No", "Yes"))
mydata$Q7eF <- factor(mydata$Q7e,
levels = c(0,1),
labels = c("No", "Yes"))
mydata$Q7fF <- factor(mydata$Q7f,
levels = c(0,1),
labels = c("No", "Yes"))
mydata$Q7gF <- factor(mydata$Q7g,
levels = c(0,1),
labels = c("No", "Yes"))
mydata$Q7hF <- factor(mydata$Q7h,
levels = c(0,1),
labels = c("No", "Yes"))
mydata$Q7iF <- factor(mydata$Q7i,
levels = c(0,1),
labels = c("No", "Yes"))
mydata$ExploreF <- factor(mydata$Q10,
levels = c(1,2,3,4,5),
labels = c("more than once a month", "every second month", "three to four times", "once to twice", "never"))
mydata$Q11aF <- factor(mydata$Q11a,
levels = c(0,1),
labels = c("No", "Yes"))
mydata$Q11bF <- factor(mydata$Q11b,
levels = c(0,1),
labels = c("No", "Yes"))
mydata$Q11cF <- factor(mydata$Q11c,
levels = c(0,1),
labels = c("No", "Yes"))
mydata$Q11dF <- factor(mydata$Q11d,
levels = c(0,1),
labels = c("No", "Yes"))
mydata$Q11eF <- factor(mydata$Q11e,
levels = c(0,1),
labels = c("No", "Yes"))
mydata$Q11fF <- factor(mydata$Q11f,
levels = c(0,1),
labels = c("No", "Yes"))
mydata$Q11gF <- factor(mydata$Q11g,
levels = c(0,1),
labels = c("No", "Yes"))
mydata$Q11hF <- factor(mydata$Q11h,
levels = c(0,1),
labels = c("No", "Yes"))
mydata$Q14F <- factor(mydata$Q14,
levels = c(1,2,3),
labels = c("Yes", "No", "I don't know"))
mydata$Q16F <- factor(mydata$Q16,
levels = c(1,2,3),
labels = c("Yes", "No", "I don't know"))
mydata$Q18F <- factor(mydata$Q18,
levels = c(1,2),
labels = c("Yes", "No"))
mydata$Q19F <- factor(mydata$Q19,
levels = c(1,2,3,-2),
labels = c("Yes", "No", "I don't know","Already use them"))
mydata$Q20aF <- factor(mydata$Q20a,
levels = c(1,2,3,4),
labels = c("neobank", "traditional bank", "neobank and traditional bank","none"))
mydata$Q20bF <- factor(mydata$Q20b,
levels = c(1,2,3,4),
labels = c("neobank", "traditional bank", "neobank and traditional bank","none"))
mydata$Q20cF <- factor(mydata$Q20c,
levels = c(1,2,3,4),
labels = c("neobank", "traditional bank", "neobank and traditional bank","none"))
mydata$Q20dF <- factor(mydata$Q20d,
levels = c(1,2,3,4),
labels = c("neobank", "traditional bank", "neobank and traditional bank","none"))
mydata$Q20eF <- factor(mydata$Q20e,
levels = c(1,2,3,4),
labels = c("neobank", "traditional bank", "neobank and traditional bank","none"))
mydata$Q20fF <- factor(mydata$Q20f,
levels = c(1,2,3,4),
labels = c("neobank", "traditional bank", "neobank and traditional bank","none"))
mydata$Q20gF <- factor(mydata$Q20g,
levels = c(1,2,3,4),
labels = c("neobank", "traditional bank", "neobank and traditional bank","none"))
mydata$GenderF <- factor(mydata$Q26,
levels = c(1,2,3),
labels = c("male", "female", "other"))
mydata$EducationF <- factor(mydata$Q27,
levels = c(1,2,3,4,5),
labels = c("primary school", "secondary school", "gymnasium","bachelor's degree","master's degree"))
mydata$EmploymentF <- factor(mydata$Q28,
levels = c(1,2,3,4,5),
labels = c("highschool student", "student", "full-time employed","part-time employed","unemployed"))
mydata$IncomeF <- factor(mydata$Q29,
levels = c(1,2,3,4,5),
labels = c("less than 500€", "501€ - 1000€", "1001€ - 1500€","1501€ - 2000€","more than 2000€"))
mydata$RegionF <- factor(mydata$Q30,
levels = c(1,2,3,4,5,6,7,8,9,10,11,12),
labels = c("Gorenjska", "Goriška", "Osrednjeslovenska","Obalno-kraška","Notranjska","Dolenjska","Posavska","Zasavska","Savinjska","Koroška","Podravska","Pomurska"))
mydata$Bank_CombinedF <- factor(mydata$Bank_Combined,
levels = c("NLB", "NKBM", "Others"),
labels = c("NLB", "NKBM", "Others"))
mydata$Region_CombinedF <- factor(mydata$Region_Combined,
levels = c("Gorenjska", "Osrednjeslovenska", "Others"),
labels = c("Gorenjska", "Osrednjeslovenska", "Others"))
mydata$Education_CombinedF <- factor(mydata$Education_Combined,
levels = c("lower education", "higher education"),
labels = c("lower education", "higher education"))
mydata$Employment_CombinedF <- factor(mydata$Employment_Combined,
levels = c("student", "employed"),
labels = c("student", "employed"))
# Convert the survey items (read in as text) to numeric
mydata[4:27] <- lapply(mydata[4:27], as.numeric)
mydata[39:40] <- lapply(mydata[39:40], as.numeric)
mydata[51] <- lapply(mydata[51], as.numeric)
mydata[55:58] <- lapply(mydata[55:58], as.numeric)
mydata[60:66] <- lapply(mydata[60:66], as.numeric)
mydata[76:96] <- lapply(mydata[76:96], as.numeric)
gender_summary <- table(mydata$GenderF)
print(gender_summary)
##
## male female other
## 76 95 3
barplot(gender_summary, col = c("blue", "pink", "green"), main = "Gender Distribution",
xlab = "Gender", ylab = "Count", names.arg = names(gender_summary))
income_summary <- table(mydata$IncomeF)
print(income_summary)
##
## less than 500€ 501€ - 1000€ 1001€ - 1500€ 1501€ - 2000€ more than 2000€
## 78 46 24 16 10
barplot(income_summary, col = c("blue", "pink", "green","yellow","red"), main = "Income Distribution",
xlab = "Income", ylab = "Count", names.arg = names(income_summary), las = 1, cex.names = 0.8)
hist(mydata$Q25, col = "skyblue", main = "Age Distribution", xlab = "Age", ylab = "Frequency")
education_summary <- table(mydata$EducationF)
print(education_summary)
##
## primary school secondary school gymnasium bachelor's degree
## 8 48 40 61
## master's degree
## 17
barplot(education_summary, col = c("green"), main = "Education Distribution",
xlab = "Education", ylab = "Count", names.arg = names(education_summary), las = 1, cex.names = 0.5)
employment_summary <- table(mydata$EmploymentF)
print(employment_summary)
##
## highschool student student full-time employed part-time employed
## 26 99 40 6
## unemployed
## 3
barplot(employment_summary, col = c("green"), main = "Employment Distribution",
xlab = "Employment", ylab = "Count", names.arg = names(employment_summary), las = 1, cex.names = 0.5)
bank_summary <- table(mydata$BankF)
print(bank_summary)
##
## NLB SKB Gorenjska Banka
## 65 21 10
## NKBM Addiko Bank Delavska hranilnica
## 34 2 6
## Banka Intesa Sanpaolo Sparkasse DBS
## 9 3 8
## Other
## 16
barplot(bank_summary, col = c("green"), main = "Bank Distribution",
xlab = "Bank", ylab = "Count", names.arg = names(bank_summary), las = 2, cex.names = 0.5)
region_summary <- table(mydata$RegionF)
print(region_summary)
##
## Gorenjska Goriška Osrednjeslovenska Obalno-kraška
## 55 7 74 8
## Notranjska Dolenjska Posavska Zasavska
## 7 9 1 4
## Savinjska Koroška Podravska Pomurska
## 5 0 2 2
barplot(region_summary, col = c("green"), main = "Region Distribution",
xlab = "Region", ylab = "Count", names.arg = names(region_summary), las = 2, cex.names = 0.5)
In this section we check the association between a respondent’s primary bank and other demographic characteristics: income, education level, employment status, gender, and geographical region. Since the primary bank and these characteristics are categorical variables, the associations are tested with Pearson’s chi-square test.
H0: There is no association
H1: There is an association
resultsI <- chisq.test(mydata$IncomeF, mydata$Bank_CombinedF,
correct = FALSE)
## Warning in chisq.test(mydata$IncomeF, mydata$Bank_CombinedF, correct = FALSE):
## Chi-squared approximation may be incorrect
resultsI
##
## Pearson's Chi-squared test
##
## data: mydata$IncomeF and mydata$Bank_CombinedF
## X-squared = 8.0185, df = 8, p-value = 0.4317
As p > 0.05, we cannot reject H0 that there is no association between income level and the primary bank.
resultsE <- chisq.test(mydata$EducationF, mydata$Bank_CombinedF,
correct = FALSE)
## Warning in chisq.test(mydata$EducationF, mydata$Bank_CombinedF, correct =
## FALSE): Chi-squared approximation may be incorrect
resultsE
##
## Pearson's Chi-squared test
##
## data: mydata$EducationF and mydata$Bank_CombinedF
## X-squared = 7.5102, df = 8, p-value = 0.4827
As p > 0.05, we cannot reject H0 that there is no association between education level and the primary bank.
resultsEm <- chisq.test(mydata$EmploymentF, mydata$Bank_CombinedF,
correct = FALSE)
## Warning in chisq.test(mydata$EmploymentF, mydata$Bank_CombinedF, correct =
## FALSE): Chi-squared approximation may be incorrect
resultsEm
##
## Pearson's Chi-squared test
##
## data: mydata$EmploymentF and mydata$Bank_CombinedF
## X-squared = 8.7914, df = 8, p-value = 0.3602
As p > 0.05, we cannot reject H0 that there is no association between employment status and the primary bank.
resultsG <- chisq.test(mydata$GenderF, mydata$Bank_CombinedF,
correct = FALSE)
## Warning in chisq.test(mydata$GenderF, mydata$Bank_CombinedF, correct = FALSE):
## Chi-squared approximation may be incorrect
resultsG
##
## Pearson's Chi-squared test
##
## data: mydata$GenderF and mydata$Bank_CombinedF
## X-squared = 7.8106, df = 4, p-value = 0.09877
As p > 0.05, we cannot reject H0 that there is no association between gender and the primary bank.
resultsR <- chisq.test(mydata$Region_CombinedF, mydata$Bank_CombinedF,
correct = FALSE)
resultsR
##
## Pearson's Chi-squared test
##
## data: mydata$Region_CombinedF and mydata$Bank_CombinedF
## X-squared = 31.028, df = 4, p-value = 3.022e-06
We can reject H0 (p < 0.001) and conclude that there is an association between geographical region and the primary bank. Next, we examine the observed and expected frequencies and the differences between them.
addmargins(resultsR$observed)
## mydata$Bank_CombinedF
## mydata$Region_CombinedF NLB NKBM Others Sum
## Gorenjska 9 19 27 55
## Osrednjeslovenska 44 18 12 74
## Others 12 18 15 45
## Sum 65 55 54 174
round(resultsR$expected)
## mydata$Bank_CombinedF
## mydata$Region_CombinedF NLB NKBM Others
## Gorenjska 21 17 17
## Osrednjeslovenska 28 23 23
## Others 17 14 14
round(resultsR$res, 2)
## mydata$Bank_CombinedF
## mydata$Region_CombinedF NLB NKBM Others
## Gorenjska -2.55 0.39 2.40
## Osrednjeslovenska 3.11 -1.11 -2.29
## Others -1.17 1.00 0.28
Based on the standardized residuals above, there are fewer NLB users than expected in the Gorenjska region and more NLB users than expected in the Osrednjeslovenska region.
Cluster Variables:
Q12: How effective do you find social media as a source of information about mobile banking functions? (1 - Not effective at all; 7 - Very effective)
Q17b: How important is fast and accessible customer support for you? (1 - completely unimportant; 2 - unimportant; 3 - slightly unimportant; 4 - undecided; 5 - slightly important; 6 - important; 7 - very important)
Q21: When considering banking services, which is more important to you, simplicity or security? (1 - prioritize simplicity; 7 - prioritize security)
Q22j: How important are benefits provided by a bank for you? (1 - completely unimportant; 2 - unimportant; 3 - slightly unimportant; 4 - undecided; 5 - slightly important; 6 - important; 7 - very important)
mydataNew <- mydata
cor_matrixN <- cor(mydataNew[, c("Q21","Q22j","Q12","Q17b")])
print(cor_matrixN)
## Q21 Q22j Q12 Q17b
## Q21 1.0000000 0.1694530 -0.0548354 0.0701421
## Q22j 0.1694530 1.0000000 0.1587895 0.2807387
## Q12 -0.0548354 0.1587895 1.0000000 0.1628189
## Q17b 0.0701421 0.2807387 0.1628189 1.0000000
mydataNew$Q21_z <- scale(mydataNew$Q21)
mydataNew$Q17b_z <- scale(mydataNew$Q17b)
mydataNew$Q22j_z <- scale(mydataNew$Q22j)
mydataNew$Q12_z <- scale(mydataNew$Q12)
rcorr(as.matrix(mydataNew[ , c("Q21_z", "Q17b_z", "Q22j_z", "Q12_z")]),
type = "pearson")
## Q21_z Q17b_z Q22j_z Q12_z
## Q21_z 1.00 0.07 0.17 -0.05
## Q17b_z 0.07 1.00 0.28 0.16
## Q22j_z 0.17 0.28 1.00 0.16
## Q12_z -0.05 0.16 0.16 1.00
##
## n= 174
##
##
## P
## Q21_z Q17b_z Q22j_z Q12_z
## Q21_z 0.3577 0.0254 0.4724
## Q17b_z 0.3577 0.0002 0.0318
## Q22j_z 0.0254 0.0002 0.0364
## Q12_z 0.4724 0.0318 0.0364
# Euclidean distance of each respondent from the origin of the standardized variables (used to screen for outliers)
mydataNew$Dissimilarity1 <- sqrt(mydataNew$Q21_z^2 + mydataNew$Q17b_z^2 + mydataNew$Q22j_z^2 + mydataNew$Q12_z^2)
head(mydataNew[order(-mydataNew$Dissimilarity1), c(1, 152)], 10)
## # A tibble: 10 × 2
## ID Dissimilarity1[,1]
## <dbl> <dbl>
## 1 153 4.44
## 2 152 4.30
## 3 151 4.28
## 4 110 4.17
## 5 162 4.08
## 6 64 3.85
## 7 169 3.67
## 8 154 3.64
## 9 150 3.56
## 10 78 3.51
hist(mydataNew$Dissimilarity1,
xlab = "Dissimilarity",
ylab = "Frequency",
main = "Histogram of Dissimilarity")
distanceNew <- get_dist(mydataNew[c("Q21_z", "Q17b_z", "Q22j_z", "Q12_z")],
method = "euclidian")
distanceNew2 <- distanceNew^2
fviz_dist(distanceNew2,
gradient = list(low = "darkred", mid = "lightblue", high = "blue"))
get_clust_tendency(mydataNew[c("Q21_z", "Q17b_z", "Q22j_z", "Q12_z")],
n = nrow(mydataNew) - 1,
graph = FALSE)
## $hopkins_stat
## [1] 0.6124007
##
## $plot
## NULL
As this value (0.61) is above 0.5, we can say the data shows some clustering tendency and is clusterable.
WARDNew <- mydataNew[c("Q21_z", "Q17b_z", "Q22j_z", "Q12_z")] %>%
get_dist(method = "euclidian") %>%
hclust(method = "ward.D2")
WARDNew
##
## Call:
## hclust(d = ., method = "ward.D2")
##
## Cluster method : ward.D2
## Distance : euclidean
## Number of objects: 174
fviz_dend(WARDNew,
k = 5,
cex = 0.5,
palette = "jama",
color_labels_by_k = TRUE,
rect = TRUE)
## Warning: The `<scale>` argument of `guides()` cannot be `FALSE`. Use "none" instead as
## of ggplot2 3.3.4.
## ℹ The deprecated feature was likely used in the factoextra package.
## Please report the issue at <https://github.com/kassambara/factoextra/issues>.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.
Based on the dendrogram structure above, five clusters appear most appropriate.
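As an optional cross-check of this choice (not part of the original analysis), the elbow and average-silhouette criteria could be inspected with factoextra; a minimal sketch:
# Illustrative cross-check of the number of clusters on the standardized cluster variables
fviz_nbclust(scale(mydataNew[c("Q21", "Q17b", "Q22j", "Q12")]), hcut,
method = "wss", hc_method = "ward.D2")
fviz_nbclust(scale(mydataNew[c("Q21", "Q17b", "Q22j", "Q12")]), hcut,
method = "silhouette", hc_method = "ward.D2")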
mydataNew$ClusterWard2 <- cutree(WARDNew,
k = 5)
head(mydataNew[c(1, 153)])
## # A tibble: 6 × 2
## ID ClusterWard2
## <dbl> <int>
## 1 1 1
## 2 2 2
## 3 3 3
## 4 4 3
## 5 5 1
## 6 6 4
Initial_leadersNew <- aggregate(mydataNew[ , c("Q21_z", "Q17b_z", "Q22j_z", "Q12_z")],
by = list(mydataNew$ClusterWard2),
FUN = mean)
Initial_leadersNew
## Group.1 Q21_z Q17b_z Q22j_z Q12_z
## 1 1 1.03300792 0.29833863 0.3112749 0.46828266
## 2 2 -1.07180564 0.41615065 0.4504003 1.56815074
## 3 3 -0.09723225 0.36128255 0.3551104 -0.72308065
## 4 4 -0.53144035 -0.01396479 -1.4726985 -0.03593379
## 5 5 -0.13558241 -2.28655011 -0.1059309 -0.32075639
K_MEANSNew <- hkmeans(mydataNew[c("Q21_z", "Q17b_z", "Q22j_z", "Q12_z")],
k = 5,
hc.metric = "euclidean",
hc.method = "ward.D2")
K_MEANSNew
## Hierarchical K-means clustering with 5 clusters of sizes 38, 24, 70, 26, 16
##
## Cluster means:
## Q21_z Q17b_z Q22j_z Q12_z
## 1 1.27618278 0.3351549 0.3436296 0.4196689
## 2 -0.94827619 0.3689031 0.4622637 1.4535219
## 3 -0.17432634 0.3235841 0.3436296 -0.5755567
## 4 -0.56139823 -0.1632806 -1.7370290 -0.1777341
## 5 0.06643004 -2.4996969 -0.1902236 -0.3701181
##
## Clustering vector:
## [1] 1 2 1 3 1 4 3 3 3 1 1 5 4 1 3 3 1 4 3 4 3 1 2 3 3 1 3 1 3 4 3 3 3 2 1 1 1
## [38] 2 3 3 1 1 2 3 3 5 3 1 4 2 3 1 4 3 4 2 1 4 2 3 4 2 4 4 1 4 3 3 3 3 3 3 1 1
## [75] 3 3 3 4 1 1 3 3 1 3 3 3 4 3 3 3 3 1 4 4 3 1 3 5 4 3 2 3 3 3 1 5 4 3 1 5 3
## [112] 1 2 5 5 3 2 4 1 1 3 1 5 3 1 5 3 3 3 2 3 3 1 3 3 3 3 4 3 5 2 3 3 2 1 4 1 4
## [149] 2 5 5 4 5 5 2 2 2 3 2 2 2 5 2 2 3 1 3 5 4 1 3 4 3 3
##
## Within cluster sum of squares by cluster:
## [1] 59.36713 29.96816 95.95803 54.48124 62.32125
## (between_SS / total_SS = 56.3 %)
##
## Available components:
##
## [1] "cluster" "centers" "totss" "withinss" "tot.withinss"
## [6] "betweenss" "size" "iter" "ifault" "data"
## [11] "hclust"
fviz_cluster(K_MEANSNew,
palette = "jama",
repel = FALSE,
ggtheme = theme_classic())
mydataNew$ClusterK_MeansNew <- K_MEANSNew$cluster
head(mydataNew[c("Q21_z", "Q17b_z", "Q22j_z", "Q12_z")])
## # A tibble: 6 × 4
## Q21_z[,1] Q17b_z[,1] Q22j_z[,1] Q12_z[,1]
## <dbl> <dbl> <dbl> <dbl>
## 1 0.905 -0.0698 1.06 0.568
## 2 -1.21 -0.880 0.344 1.19
## 3 1.61 -0.0698 0.344 -1.31
## 4 -0.507 0.740 0.344 -0.683
## 5 0.199 -0.0698 0.344 0.568
## 6 -0.507 -0.0698 -1.08 -0.0575
table(mydataNew$ClusterWard2)
##
## 1 2 3 4 5
## 44 20 62 29 19
table(mydataNew$ClusterK_MeansNew)
##
## 1 2 3 4 5
## 38 24 70 26 16
table(mydataNew$ClusterWard2, mydataNew$ClusterK_MeansNew)
##
## 1 2 3 4 5
## 1 35 2 7 0 0
## 2 0 20 0 0 0
## 3 3 1 55 3 0
## 4 0 0 7 22 0
## 5 0 1 1 1 16
CentroidsNew <- K_MEANSNew$centers
CentroidsNew
## Q21_z Q17b_z Q22j_z Q12_z
## 1 1.27618278 0.3351549 0.3436296 0.4196689
## 2 -0.94827619 0.3689031 0.4622637 1.4535219
## 3 -0.17432634 0.3235841 0.3436296 -0.5755567
## 4 -0.56139823 -0.1632806 -1.7370290 -0.1777341
## 5 0.06643004 -2.4996969 -0.1902236 -0.3701181
FigureNew <- as.data.frame(CentroidsNew)
FigureNew$id <- 1:nrow(FigureNew)
FigureNew <- pivot_longer(FigureNew, cols = c("Q21_z", "Q17b_z", "Q22j_z", "Q12_z"))
FigureNew$Groups <- factor(FigureNew$id,
levels = c(1, 2, 3, 4, 5),
labels = c("1", "2", "3", "4", "5"))
FigureNew$nameFactor <- factor(FigureNew$name,
levels = c("Q21_z", "Q17b_z", "Q22j_z", "Q12_z"),
labels = c("Q21_z", "Q17b_z", "Q22j_z", "Q12_z"))
ggplot(FigureNew, aes(x = nameFactor, y = value)) +
geom_hline(yintercept = 0) +
theme_bw() +
geom_point(aes(shape = Groups, col = Groups), size = 3) +
geom_line(aes(group = id), linewidth = 1) +
ylab("Averages") +
xlab("Cluster variables") +
ylim(-3, 3)
H0: Mean(Cluster k variable, Cluster 1) = Mean(Cluster k variable, Cluster 2) = Mean(Cluster k variable, Cluster 3) = Mean(Cluster k variable, Cluster 4) = Mean(Cluster k variable, Cluster 5)
H1: At least one Mean is different from others
clusterfitNew <- aov(cbind(Q21_z, Q17b_z, Q22j_z, Q12_z) ~ as.factor(ClusterK_MeansNew),
data = mydataNew)
summary(clusterfitNew)
## Response 1 :
## Df Sum Sq Mean Sq F value Pr(>F)
## as.factor(ClusterK_MeansNew) 4 93.862 23.4655 50.111 < 2.2e-16 ***
## Residuals 169 79.138 0.4683
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Response 2 :
## Df Sum Sq Mean Sq F value Pr(>F)
## as.factor(ClusterK_MeansNew) 4 115.533 28.883 84.941 < 2.2e-16 ***
## Residuals 169 57.467 0.340
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Response 3 :
## Df Sum Sq Mean Sq F value Pr(>F)
## as.factor(ClusterK_MeansNew) 4 96.909 24.2273 53.81 < 2.2e-16 ***
## Residuals 169 76.091 0.4502
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Response 4 :
## Df Sum Sq Mean Sq F value Pr(>F)
## as.factor(ClusterK_MeansNew) 4 83.6 20.900 39.509 < 2.2e-16 ***
## Residuals 169 89.4 0.529
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
We reject H0 for all the cluster variables (p < 0.001), so the means differ between clusters and the variables are statistically appropriate for clustering.
H0: Mean(Age, Cluster 1) = Mean(Age, Cluster 2) = Mean(Age, Cluster 3) = Mean(Age, Cluster 4) = Mean(Age, Cluster 5)
H1: At least one Mean is different from others
aggregate(mydataNew$Q25,
by = list(mydataNew$ClusterK_MeansNew),
FUN = "mean")
## Group.1 x
## 1 1 21.13158
## 2 2 21.45833
## 3 3 21.64286
## 4 4 21.61538
## 5 5 21.43750
agefitNew <- aov(Q25 ~ as.factor(ClusterK_MeansNew),
data = mydataNew)
summary(agefitNew)
## Df Sum Sq Mean Sq F value Pr(>F)
## as.factor(ClusterK_MeansNew) 4 7 1.746 0.243 0.914
## Residuals 169 1214 7.186
As p > 0.05, we cannot reject H0; as a result, we cannot validate our clusters using age.
H0: Mean(Recommender Score, Cluster 1) = Mean(Recommender Score, Cluster 2) = Mean(Recommender Score, Cluster 3) = Mean(Recommender Score, Cluster 4) = Mean(Recommender Score, Cluster 5)
H1: At least one Mean is different from others
aggregate(mydataNew$Q24,
by = list(mydataNew$ClusterK_MeansNew),
FUN = "mean")
## Group.1 x
## 1 1 7.263158
## 2 2 6.250000
## 3 3 6.142857
## 4 4 6.576923
## 5 5 5.312500
agefitNew <- aov(Q24 ~ as.factor(ClusterK_MeansNew),
data = mydataNew)
summary(agefitNew)
## Df Sum Sq Mean Sq F value Pr(>F)
## as.factor(ClusterK_MeansNew) 4 53.2 13.300 2.734 0.0307 *
## Residuals 169 822.2 4.865
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
We can reject H0 (p < 0.05) and conclude that at least one mean differs from the others; as a result, we are able to validate our clusters using the recommendation score.
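To see which clusters drive this difference, a post-hoc comparison could be run on the fitted model; a minimal sketch (an illustrative addition, not part of the original output):
# Post-hoc pairwise comparisons of mean recommendation scores between clusters (Tukey HSD)
TukeyHSD(agefitNew)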
H0: Mean(Familiarity Score, Cluster 1) = Mean(Familiarity Score, Cluster 2) = Mean(Familiarity Score, Cluster 3) = Mean(Familiarity Score, Cluster 4) = Mean(Familiarity Score, Cluster 5)
H1: At least one Mean is different from others
aggregate(mydataNew$Q9,
by = list(mydataNew$ClusterK_MeansNew),
FUN = "mean")
## Group.1 x
## 1 1 4.684211
## 2 2 4.041667
## 3 3 4.514286
## 4 4 4.653846
## 5 5 4.562500
agefitNew <- aov(Q9 ~ as.factor(ClusterK_MeansNew),
data = mydataNew)
summary(agefitNew)
## Df Sum Sq Mean Sq F value Pr(>F)
## as.factor(ClusterK_MeansNew) 4 7.0 1.750 0.746 0.562
## Residuals 169 396.5 2.346
As p > 0.05, we cannot reject H0; as a result, we cannot validate our clusters using the familiarity score.
H0: There is no association between the clusters and gender
H1: There is an association
chisq_results <- chisq.test(mydataNew$GenderF, as.factor(mydataNew$ClusterK_MeansNew))
## Warning in chisq.test(mydataNew$GenderF,
## as.factor(mydataNew$ClusterK_MeansNew)): Chi-squared approximation may be
## incorrect
chisq_results
##
## Pearson's Chi-squared test
##
## data: mydataNew$GenderF and as.factor(mydataNew$ClusterK_MeansNew)
## X-squared = 18.492, df = 8, p-value = 0.01783
addmargins(chisq_results$observed)
##
## mydataNew$GenderF 1 2 3 4 5 Sum
## male 11 15 37 8 5 76
## female 27 9 32 16 11 95
## other 0 0 1 2 0 3
## Sum 38 24 70 26 16 174
addmargins(chisq_results$expected, 2)
##
## mydataNew$GenderF 1 2 3 4 5 Sum
## male 16.5977011 10.4827586 30.574713 11.3563218 6.9885057 76
## female 20.7471264 13.1034483 38.218391 14.1954023 8.7356322 95
## other 0.6551724 0.4137931 1.206897 0.4482759 0.2758621 3
round(chisq_results$res, 2)
##
## mydataNew$GenderF 1 2 3 4 5
## male -1.37 1.40 1.16 -1.00 -0.75
## female 1.37 -1.13 -1.01 0.48 0.77
## other -0.81 -0.64 -0.19 2.32 -0.53
We can reject H0 (p < 0.05) and conclude that there is an association between the clusters and gender, but we still cannot validate the clusters with it because the “other” gender group is very small (n = 3).
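Because of the small expected counts flagged by the warning above, a chi-squared test with a simulated (Monte Carlo) p-value could serve as a robustness check; a minimal sketch (not part of the original output):
# Robustness check that does not rely on the chi-squared approximation
chisq.test(mydataNew$GenderF, as.factor(mydataNew$ClusterK_MeansNew),
simulate.p.value = TRUE, B = 10000)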
chisq_results <- chisq.test(mydataNew$EmploymentF, as.factor(mydataNew$ClusterK_MeansNew))
## Warning in chisq.test(mydataNew$EmploymentF,
## as.factor(mydataNew$ClusterK_MeansNew)): Chi-squared approximation may be
## incorrect
chisq_results
##
## Pearson's Chi-squared test
##
## data: mydataNew$EmploymentF and as.factor(mydataNew$ClusterK_MeansNew)
## X-squared = 24.03, df = 16, p-value = 0.08884
As p > 0.05, we cannot reject H0 that there is no association between the clusters and employment status.
addmargins(chisq_results$observed)
##
## mydataNew$EmploymentF 1 2 3 4 5 Sum
## highschool student 7 4 11 4 0 26
## student 24 15 35 17 8 99
## full-time employed 7 4 20 5 4 40
## part-time employed 0 0 3 0 3 6
## unemployed 0 1 1 0 1 3
## Sum 38 24 70 26 16 174
chisq_results <- chisq.test(mydataNew$EducationF, as.factor(mydataNew$ClusterK_MeansNew))
## Warning in chisq.test(mydataNew$EducationF,
## as.factor(mydataNew$ClusterK_MeansNew)): Chi-squared approximation may be
## incorrect
chisq_results
##
## Pearson's Chi-squared test
##
## data: mydataNew$EducationF and as.factor(mydataNew$ClusterK_MeansNew)
## X-squared = 14.546, df = 16, p-value = 0.5581
As p > 0.05, we cannot reject H0 that there is no association between the clusters and education level.
addmargins(chisq_results$observed)
##
## mydataNew$EducationF 1 2 3 4 5 Sum
## primary school 2 2 2 2 0 8
## secondary school 13 6 22 3 4 48
## gymnasium 7 4 17 9 3 40
## bachelor's degree 12 9 24 11 5 61
## master's degree 4 3 5 1 4 17
## Sum 38 24 70 26 16 174
chisq_resultsInc <- chisq.test(mydataNew$IncomeF, as.factor(mydataNew$ClusterK_MeansNew))
## Warning in chisq.test(mydataNew$IncomeF,
## as.factor(mydataNew$ClusterK_MeansNew)): Chi-squared approximation may be
## incorrect
chisq_resultsInc
##
## Pearson's Chi-squared test
##
## data: mydataNew$IncomeF and as.factor(mydataNew$ClusterK_MeansNew)
## X-squared = 21.262, df = 16, p-value = 0.1687
As p > 0.05, we cannot reject H0 that there is no association between the clusters and income level.
addmargins(chisq_resultsInc$observed)
##
## mydataNew$IncomeF 1 2 3 4 5 Sum
## less than 500€ 21 16 21 13 7 78
## 501€ - 1000€ 8 3 25 7 3 46
## 1001€ - 1500€ 5 3 11 2 3 24
## 1501€ - 2000€ 2 0 9 4 1 16
## more than 2000€ 2 2 4 0 2 10
## Sum 38 24 70 26 16 174
chisq_results <- chisq.test(mydataNew$Bank_CombinedF, as.factor(mydataNew$ClusterK_MeansNew))
## Warning in chisq.test(mydataNew$Bank_CombinedF,
## as.factor(mydataNew$ClusterK_MeansNew)): Chi-squared approximation may be
## incorrect
chisq_results
##
## Pearson's Chi-squared test
##
## data: mydataNew$Bank_CombinedF and as.factor(mydataNew$ClusterK_MeansNew)
## X-squared = 9.0388, df = 8, p-value = 0.339
As p > 0.05, we cannot reject H0 that there is no association between the clusters and the primary bank.
addmargins(chisq_results$observed)
##
## mydataNew$Bank_CombinedF 1 2 3 4 5 Sum
## NLB 20 9 24 8 4 65
## NKBM 10 6 24 11 4 55
## Others 8 9 22 7 8 54
## Sum 38 24 70 26 16 174
chisq_resultsNeo <- chisq.test(mydataNew$Q18F, as.factor(mydataNew$ClusterK_MeansNew))
chisq_resultsNeo
##
## Pearson's Chi-squared test
##
## data: mydataNew$Q18F and as.factor(mydataNew$ClusterK_MeansNew)
## X-squared = 5.4042, df = 4, p-value = 0.2483
As p > 0.05, we cannot reject H0 that there is no association between the clusters and neobank usage.
addmargins(chisq_resultsNeo$observed)
##
## mydataNew$Q18F 1 2 3 4 5 Sum
## Yes 11 14 27 10 6 68
## No 27 10 43 16 10 106
## Sum 38 24 70 26 16 174
chisq_results <- chisq.test(mydataNew$ExploreF, as.factor(mydataNew$ClusterK_MeansNew))
## Warning in chisq.test(mydataNew$ExploreF,
## as.factor(mydataNew$ClusterK_MeansNew)): Chi-squared approximation may be
## incorrect
chisq_results
##
## Pearson's Chi-squared test
##
## data: mydataNew$ExploreF and as.factor(mydataNew$ClusterK_MeansNew)
## X-squared = 14.535, df = 16, p-value = 0.5589
As p > 0.05, we cannot reject H0 that there is no association between the clusters and how actively respondents explore new features.
addmargins(chisq_results$observed)
##
## mydataNew$ExploreF 1 2 3 4 5 Sum
## more than once a month 4 3 5 1 1 14
## every second month 6 2 8 3 2 21
## three to four times 4 1 6 4 4 19
## once to twice 13 12 26 5 6 62
## never 11 6 25 13 3 58
## Sum 38 24 70 26 16 174
chisq_resultsFre <- chisq.test(mydataNew$FrequencyF, as.factor(mydataNew$ClusterK_MeansNew))
## Warning in chisq.test(mydataNew$FrequencyF,
## as.factor(mydataNew$ClusterK_MeansNew)): Chi-squared approximation may be
## incorrect
chisq_resultsFre
##
## Pearson's Chi-squared test
##
## data: mydataNew$FrequencyF and as.factor(mydataNew$ClusterK_MeansNew)
## X-squared = 44.201, df = 20, p-value = 0.001415
We can reject H0 (p < 0.05) and conclude that there is an association between the clusters and the frequency of mobile banking use.
addmargins(chisq_resultsFre$observed)
##
## mydataNew$FrequencyF 1 2 3 4 5 Sum
## several times a day 8 6 9 5 3 31
## daily 13 11 23 6 4 57
## weekly 11 6 28 11 4 60
## monthly 4 0 2 4 2 12
## less than monthly 0 1 0 0 3 4
## I don’t use mobile bank 2 0 8 0 0 10
## Sum 38 24 70 26 16 174
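To see which cells drive this association, the standardized residuals could be inspected in the same way as for region and gender above; a minimal sketch (not part of the original output):
# Standardized (Pearson) residuals of the frequency-of-use by cluster table
round(chisq_resultsFre$residuals, 2)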
importanceNew <- aov(Q8 ~ as.factor(ClusterK_MeansNew),
data = mydataNew)
summary(importanceNew)
## Df Sum Sq Mean Sq F value Pr(>F)
## as.factor(ClusterK_MeansNew) 4 24.1 6.031 2.992 0.0203 *
## Residuals 169 340.6 2.016
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
We can reject H0 (p < 0.05) and conclude that at least one mean differs from the others; as a result, we are able to validate our clusters using the importance of accessing all services through one app.
aggregate(mydataNew$Q8,
by = list(mydataNew$ClusterK_MeansNew),
FUN = "mean")
## Group.1 x
## 1 1 6.184211
## 2 2 6.250000
## 3 3 5.442857
## 4 4 5.615385
## 5 5 5.250000
excel_file_path <- "C:/Users/ACER/Downloads/Perception_map.xlsx"
mydataMap <- read_excel("C:/Users/ACER/Downloads/Perception_map.xlsx")
head(mydataMap)
## # A tibble: 6 × 24
## NLB_Simple NKBM_Simple SKB_Simple Unicredit_Simple Revolut_Simple N26_Simple
## <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 4 2 1 2 3 3
## 2 3 6 6 6 4 3
## 3 3 6 6 6 6 6
## 4 4 6 3 6 4 6
## 5 6 4 6 6 6 6
## 6 1 4 6 6 1 6
## # ℹ 18 more variables: NLB_Safe <dbl>, NKBM_Safe <dbl>, SKB_Safe <dbl>,
## # Unicredit_Safe <dbl>, Revolut_Safe <dbl>, N26_Safe <dbl>,
## # NLB_Advanced <dbl>, NKBM_Advanced <dbl>, SKB_Advanced <dbl>,
## # Unicredit_Advanced <dbl>, Revolut_Advanced <dbl>, N26_Advanced <dbl>,
## # NLB_Popular <dbl>, NKBM_Popular <dbl>, SKB_Popular <dbl>,
## # Unicredit_Popular <dbl>, Revolut_Popular <dbl>, N26_Popular <dbl>
mydataMap[1:24] <- lapply(mydataMap[1:24], as.numeric)
head(mydataMap)
## # A tibble: 6 × 24
## NLB_Simple NKBM_Simple SKB_Simple Unicredit_Simple Revolut_Simple N26_Simple
## <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 4 2 1 2 3 3
## 2 3 6 6 6 4 3
## 3 3 6 6 6 6 6
## 4 4 6 3 6 4 6
## 5 6 4 6 6 6 6
## 6 1 4 6 6 1 6
## # ℹ 18 more variables: NLB_Safe <dbl>, NKBM_Safe <dbl>, SKB_Safe <dbl>,
## # Unicredit_Safe <dbl>, Revolut_Safe <dbl>, N26_Safe <dbl>,
## # NLB_Advanced <dbl>, NKBM_Advanced <dbl>, SKB_Advanced <dbl>,
## # Unicredit_Advanced <dbl>, Revolut_Advanced <dbl>, N26_Advanced <dbl>,
## # NLB_Popular <dbl>, NKBM_Popular <dbl>, SKB_Popular <dbl>,
## # Unicredit_Popular <dbl>, Revolut_Popular <dbl>, N26_Popular <dbl>
# Recode the "I don't know" answers (coded 6) as missing values
mydataMap <- mydataMap %>%
replace_with_na_all(condition = ~.x %in% c(6))
# Replace the remaining NA values with the column means
mydataMap <- mydataMap %>%
mutate_all(~ifelse(is.na(.x), mean(.x, na.rm = TRUE), .x))
head(mydataMap[1:7])
## # A tibble: 6 × 7
## NLB_Simple NKBM_Simple SKB_Simple Unicredit_Simple Revolut_Simple N26_Simple
## <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 4 2 1 2 3 3
## 2 3 2.80 2.39 2.26 4 3
## 3 3 2.80 2.39 2.26 3.27 3.05
## 4 4 2.80 3 2.26 4 3.05
## 5 2.90 4 2.39 2.26 3.27 3.05
## 6 1 4 2.39 2.26 1 3.05
## # ℹ 1 more variable: NLB_Safe <dbl>
mydata_PCA <- mydataMap %>%
pivot_longer(everything(), names_to = "name", values_to = "score") %>%
separate(name, into = c("retailer", "dimension"), sep = "_")%>%
pivot_wider(names_from = dimension, values_from = score, values_fn = mean) %>%
column_to_rownames(var = "retailer")
print(mydata_PCA)
## Simple Safe Advanced Popular
## NLB 2.904348 3.206612 2.641667 2.992424
## NKBM 2.802632 3.023256 2.477273 2.388350
## SKB 2.393443 2.986667 2.295775 2.367347
## Unicredit 2.255319 2.815385 2.310345 1.915663
## Revolut 3.265306 2.715789 3.405941 3.169355
## N26 3.054545 2.303571 3.190476 2.487179
pca <- PCA(mydata_PCA,
scale.unit = TRUE,
graph = FALSE,
ncp = 2)
print(pca$var$cor)
## Dim.1 Dim.2
## Simple 0.9686056 0.1408423
## Safe -0.4779192 0.8756541
## Advanced 0.9773167 -0.1842975
## Popular 0.8051848 0.5740152
fviz_pca_biplot(pca,
repel = TRUE)
Relevant question: Q9 - How familiar are you with the latest features and updates available in your mobile banking apps? (1 - Not familiar at all; 7 - Very familiar)
We use a one-sample test of a population proportion. For this purpose, we define awareness as a score of 5 or above and scores of 1-4 as lack of awareness.
frequency <- table(mydataNew$Q9 < 5)
frequency
##
## FALSE TRUE
## 80 94
H0: p = 0.5
H1: p > 0.5
prop.test(x = 94,
n = 174,
p = 0.5,
correct = FALSE,
alternative = "greater")
##
## 1-sample proportions test without continuity correction
##
## data: 94 out of 174, null probability 0.5
## X-squared = 1.1264, df = 1, p-value = 0.1443
## alternative hypothesis: true p is greater than 0.5
## 95 percent confidence interval:
## 0.4779426 1.0000000
## sample estimates:
## p
## 0.5402299
As p > 0.05, we cannot reject H0: we cannot conclude that young people who lack awareness of these useful functions outnumber those who are aware of them.
Relevant Question: Q12 - How effective do you find social media as a source of information about mobile banking functions? (1 - Not effective at all; 7 - Very effective)
We use a one-sample test of a population proportion. For this purpose, we define a score of 5 or above as finding social media effective and scores of 1-4 as not finding it effective.
frequency <- table(mydataNew$Q12 < 5)
frequency
##
## FALSE TRUE
## 51 123
prop.test(x = 51,
n = 174,
p = 0.5,
correct = FALSE,
alternative = "greater")
##
## 1-sample proportions test without continuity correction
##
## data: 51 out of 174, null probability 0.5
## X-squared = 29.793, df = 1, p-value = 1
## alternative hypothesis: true p is greater than 0.5
## 95 percent confidence interval:
## 0.2398586 1.0000000
## sample estimates:
## p
## 0.2931034
As p > 0.05, we cannot reject H0: we cannot conclude that young people who find social media effective outnumber those who do not.
Relevant question: Q21 - When considering banking services, which is more important to you, simplicity or security? (1 - prioritize simplicity; 7 - prioritize security)
We use a one-sample test of a population proportion. For this purpose, we define a score of 5 or above as leaning towards security and scores of 1-4 as leaning towards simplicity.
frequency <- table(mydataNew$Q21 < 5)
frequency
##
## FALSE TRUE
## 77 97
prop.test(x = 97,
n = 174,
p = 0.5,
correct = FALSE,
alternative = "greater")
##
## 1-sample proportions test without continuity correction
##
## data: 97 out of 174, null probability 0.5
## X-squared = 2.2989, df = 1, p-value = 0.06473
## alternative hypothesis: true p is greater than 0.5
## 95 percent confidence interval:
## 0.4951262 1.0000000
## sample estimates:
## p
## 0.5574713
As p > 0.05 we cannot formally reject H0, but since the p-value is close to 0.05 there is weak evidence that young people lean more towards simplicity than security.
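As a sensitivity check on this borderline result (an addition, not part of the original analysis), an exact binomial test could be run on the same counts:
# Exact binomial test on the same counts (97 of 174 leaning towards simplicity)
binom.test(x = 97, n = 174, p = 0.5, alternative = "greater")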
Relevant question: Q14 - Would you like to participate in any rewards programs offered by your bank? ( 1 - yes; 2 - no; 3 - I don’t know)
We use a one-sample test of a population proportion. Respondents who answered 1 (yes) are treated as motivated by the incentives and those who answered 2 (no) as not motivated; “I don’t know” responses are excluded from this analysis.
frequency <- table(mydataNew$Q14 < 2)
frequency
##
## FALSE TRUE
## 99 75
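The FALSE count above still mixes the “no” and “I don’t know” answers; the n = 140 used in the test below presumably counts only the yes/no respondents. A minimal sketch of how that count could be obtained (an assumption about how n = 140 was derived):
# Counts per answer category and the number of yes/no respondents
table(mydataNew$Q14F)
sum(table(mydataNew$Q14F)[c("Yes", "No")])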
prop.test(x = 75,
n = 140,
p = 0.5,
correct = FALSE,
alternative = "greater")
##
## 1-sample proportions test without continuity correction
##
## data: 75 out of 140, null probability 0.5
## X-squared = 0.71429, df = 1, p-value = 0.199
## alternative hypothesis: true p is greater than 0.5
## 95 percent confidence interval:
## 0.466364 1.000000
## sample estimates:
## p
## 0.5357143
As p > 0.05, we cannot reject H0: we cannot conclude that young people motivated by the incentives outnumber those who are not.
For the next hypotheses, we compare three or more arithmetic means for dependent samples (an extension of the paired-samples t-test), i.e. repeated-measures ANOVA (rANOVA).
H0: Mean(1) = Mean(2) = ... = Mean(k)
H1: At least one mean is different
We also tested normality for each of these hypotheses, and since none of the variables were normally distributed, we used the non-parametric alternative (Friedman ANOVA) instead.
H0: All distribution locations of variables are the same
H1: At least one distribution location of variable is different
Relevant question: Q15 - Order next reward incentives from least motivational to the most motivational for you, when exploring new banking services? (Q15a - discounts on banking services and purchases; Q15b - cash rewards; Q15c - tickets to events; Q15d - customized financial advice)
mydata_motivation <- mydata[c(1, 55:58)]
head(mydata_motivation)
## # A tibble: 6 × 5
## ID Q15a Q15b Q15c Q15d
## <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 1 2 4 3 1
## 2 2 1 2 4 3
## 3 3 4 3 2 1
## 4 4 3 4 2 1
## 5 5 2 1 3 4
## 6 6 3 4 2 1
mydata_long <- mydata_motivation %>%
pivot_longer(
cols = c("Q15a", "Q15b", "Q15c", "Q15d"),
names_to = "Motivation",
values_to = "Score") %>%
convert_as_factor(Motivation)
mydata_long <- as.data.frame(mydata_long)
tail(mydata_long, 10)
## ID Motivation Score
## 687 172 Q15c 1
## 688 172 Q15d 4
## 689 173 Q15a 3
## 690 173 Q15b 4
## 691 173 Q15c 2
## 692 173 Q15d 1
## 693 174 Q15a 3
## 694 174 Q15b 4
## 695 174 Q15c 2
## 696 174 Q15d 1
ggboxplot(mydata_long,
x = "Motivation",
y = "Score",
add = "jitter")
mydata_long %>%
group_by(Motivation) %>%
shapiro_test(Score)
## # A tibble: 4 × 4
## Motivation variable statistic p
## <fct> <chr> <dbl> <dbl>
## 1 Q15a Score 0.854 6.47e-12
## 2 Q15b Score 0.830 6.04e-13
## 3 Q15c Score 0.841 1.75e-12
## 4 Q15d Score 0.857 9.03e-12
As none of the variables are distributed normally, we use the Friedman ANOVA method
library(dplyr)
mydata_long <- mydata_long %>%
select(ID, Motivation, Score)
library(rstatix)
mydata_long %>%
group_by(Motivation) %>%
get_summary_stats(Score, type = "common")
## # A tibble: 4 × 11
## Motivation variable n min max median iqr mean sd se ci
## <fct> <fct> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 Q15a Score 174 1 4 3 2 2.71 1.09 0.083 0.163
## 2 Q15b Score 174 1 4 3 2 2.75 1.15 0.087 0.172
## 3 Q15c Score 174 1 4 2 2 2.09 1.00 0.076 0.15
## 4 Q15d Score 174 1 4 2 1 2.45 1.11 0.084 0.166
Friedman ANOVA
# Load the necessary libraries
library(rstatix)
# Perform Friedman test with ID as the blocking variable
FriedmanANOVA <- friedman_test(Score ~ Motivation | ID,
data = mydata_long)
# Display the results
FriedmanANOVA
## # A tibble: 1 × 6
## .y. n statistic df p method
## * <chr> <int> <dbl> <dbl> <dbl> <chr>
## 1 Score 174 28.4 3 0.00000292 Friedman test
We can reject H0 (p < 0.001) and conclude that at least one distribution location differs from the others.
library(rstatix)
# Wilcoxon signed-rank tests - comparing all possible pairs
paires_nonpar <- wilcox_test(Score ~ Motivation,
paired = TRUE,
p.adjust.method = "bonferroni",
data = mydata_long)
paires_nonpar
## # A tibble: 6 × 9
## .y. group1 group2 n1 n2 statistic p p.adj p.adj.signif
## * <chr> <chr> <chr> <int> <int> <dbl> <dbl> <dbl> <chr>
## 1 Score Q15a Q15b 174 174 7304. 0.635 1 ns
## 2 Score Q15a Q15c 174 174 10525 0.0000082 0.0000492 ****
## 3 Score Q15a Q15d 174 174 8832. 0.062 0.37 ns
## 4 Score Q15b Q15c 174 174 10372. 0.0000239 0.000143 ***
## 5 Score Q15b Q15d 174 174 8894 0.051 0.303 ns
## 6 Score Q15c Q15d 174 174 5750. 0.004 0.023 *
library(rstatix)
comparisons <- paires_nonpar %>%
add_y_position(fun = "median", step.increase = 0.35)
library(ggpubr)
ggboxplot(mydata_long, x = "Motivation", y = "Score", add = "point", ylim=c(0, 5)) +
stat_pvalue_manual(comparisons, hide.ns = FALSE) +
stat_summary(fun = median, geom = "point", shape = 16, size = 4,
aes(group = Motivation), color = "darkred",
position = position_dodge(width = 0.8)) +
stat_summary(fun = median, colour = "darkred",
position = position_dodge(width = 0.8),
geom = "text", vjust = 0.5, hjust = -8,
aes(label = round(after_stat(y), digits = 2), group = Motivation)) +
labs(subtitle = get_test_label(FriedmanANOVA, detailed = TRUE),
caption = get_pwc_label(comparisons))
Based on the table above, tickets to events are ranked as significantly less motivational than each of the other incentives, while discounts on banking services and purchases and cash rewards receive the highest average rankings among young people.
Relevant question: Q17 - How important are each of the stated properties of a banking service to you? (Q17a - instant money transfers; Q17b - fast and accessible customer support; Q17c - fast new account opening; Q17d - one mobile banking app for all bank services; Q17e - loan accessible through online bank; Q17f - low costs of managing the account; Q17g - fast and complete information about banking services) (1 - completely unimportant; 2 - unimportant; 3 - slightly unimportant; 4 - undecided; 5 - slightly important; 6 - important; 7 - very important)
mydata_important <- mydata[c(1, 60:66)]
head(mydata_important)
## # A tibble: 6 × 8
## ID Q17a Q17b Q17c Q17d Q17e Q17f Q17g
## <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 1 6 6 5 6 4 5 6
## 2 2 7 5 5 7 5 7 6
## 3 3 5 6 5 2 2 7 6
## 4 4 7 7 7 7 6 7 7
## 5 5 6 6 5 6 2 7 5
## 6 6 7 6 5 5 3 6 6
mydata_important_long <- mydata_important %>%
pivot_longer(
cols = c("Q17a", "Q17b", "Q17c", "Q17d","Q17e","Q17f","Q17g"),
names_to = "Importance",
values_to = "ScoreI") %>%
convert_as_factor(Importance)
mydata_important_long <- as.data.frame(mydata_important_long)
tail(mydata_important_long, 10)
## ID Importance ScoreI
## 1209 173 Q17e 5
## 1210 173 Q17f 7
## 1211 173 Q17g 6
## 1212 174 Q17a 7
## 1213 174 Q17b 6
## 1214 174 Q17c 7
## 1215 174 Q17d 7
## 1216 174 Q17e 4
## 1217 174 Q17f 6
## 1218 174 Q17g 6
ggboxplot(mydata_important_long,
x = "Importance",
y = "ScoreI",
add = "jitter")
mydata_important_long %>%
group_by(Importance) %>%
shapiro_test(ScoreI)
## # A tibble: 7 × 4
## Importance variable statistic p
## <fct> <chr> <dbl> <dbl>
## 1 Q17a ScoreI 0.706 3.05e-17
## 2 Q17b ScoreI 0.734 2.10e-16
## 3 Q17c ScoreI 0.847 3.28e-12
## 4 Q17d ScoreI 0.747 5.30e-16
## 5 Q17e ScoreI 0.921 4.08e- 8
## 6 Q17f ScoreI 0.712 4.71e-17
## 7 Q17g ScoreI 0.778 5.92e-15
As none of the variables are distributed normally, we use the Friedman ANOVA method
mydata_important_long %>%
group_by(Importance) %>%
get_summary_stats(ScoreI, type = "common")
## # A tibble: 7 × 11
## Importance variable n min max median iqr mean sd se ci
## <fct> <fct> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 Q17a ScoreI 174 1 7 6 1 6.06 1.33 0.101 0.199
## 2 Q17b ScoreI 174 1 7 6 1 6.09 1.24 0.094 0.185
## 3 Q17c ScoreI 174 1 7 6 2 5.59 1.39 0.106 0.209
## 4 Q17d ScoreI 174 2 7 7 1 6.12 1.20 0.091 0.179
## 5 Q17e ScoreI 174 1 7 5 2 4.57 1.64 0.125 0.246
## 6 Q17f ScoreI 174 2 7 7 1 6.29 1.01 0.077 0.152
## 7 Q17g ScoreI 174 2 7 6 1 6.06 1.12 0.085 0.168
# Perform Friedman test with ID as the blocking variable
FriedmanANOVA <- friedman_test(ScoreI ~ Importance | ID,
data = mydata_important_long)
# Display the results
FriedmanANOVA
## # A tibble: 1 × 6
## .y. n statistic df p method
## * <chr> <int> <dbl> <dbl> <dbl> <chr>
## 1 ScoreI 174 275. 6 1.58e-56 Friedman test
We can reject H0 (p < 0.001) and conclude that at least one distribution location differs from the others.
# Wilcoxon signed-rank tests - comparing all possible pairs
paires_nonparI <- wilcox_test(ScoreI ~ Importance,
paired = TRUE,
p.adjust.method = "bonferroni",
data = mydata_important_long)
paires_nonparI
## # A tibble: 21 × 9
## .y. group1 group2 n1 n2 statistic p p.adj p.adj.signif
## * <chr> <chr> <chr> <int> <int> <dbl> <dbl> <dbl> <chr>
## 1 ScoreI Q17a Q17b 174 174 1784 8.47e- 1 1 e+ 0 ns
## 2 ScoreI Q17a Q17c 174 174 3769 1.10e- 5 2.31e- 4 ***
## 3 ScoreI Q17a Q17d 174 174 1886. 9.06e- 1 1 e+ 0 ns
## 4 ScoreI Q17a Q17e 174 174 8690. 8.38e-18 1.76e-16 ****
## 5 ScoreI Q17a Q17f 174 174 1602. 6.2 e- 2 1 e+ 0 ns
## 6 ScoreI Q17a Q17g 174 174 1794 3.84e- 1 1 e+ 0 ns
## 7 ScoreI Q17b Q17c 174 174 3503 7.1 e- 7 1.49e- 5 ****
## 8 ScoreI Q17b Q17d 174 174 1884. 6.17e- 1 1 e+ 0 ns
## 9 ScoreI Q17b Q17e 174 174 8555 1.5 e-18 3.15e-17 ****
## 10 ScoreI Q17b Q17f 174 174 1498. 4.6 e- 2 9.68e- 1 ns
## # ℹ 11 more rows
comparisonsI <- paires_nonparI %>%
add_y_position(fun = "median", step.increase = 0.35)
ggboxplot(mydata_important_long, x = "Importance", y = "ScoreI", add = "point", ylim=c(0, 27)) +
stat_pvalue_manual(comparisonsI, hide.ns = FALSE) +
stat_summary(fun = median, geom = "point", shape = 16, size = 4,
aes(group = Importance), color = "darkred",
position = position_dodge(width = 0.8)) +
stat_summary(fun = median, colour = "darkred",
position = position_dodge(width = 0.6),
geom = "text", vjust = 0.5, hjust = -8,
aes(label = round(after_stat(y), digits = 2), group = Importance)) +
labs(subtitle = get_test_label(FriedmanANOVA, detailed = TRUE),
caption = get_pwc_label(comparisonsI))
Based on the table above, having one mobile banking app for all bank services is among the most important properties of a banking service for young people, rated significantly higher than several of the other properties.
Relevant question: Q22 - How important are stated properties of banks to you? (Q22a - polite service from employees; Q22b - modern look of the branch office; Q22c - easy and convenient mobile banking; Q22d - access to information about services; Q22e - feeling of financial security; Q22f - fast services/speed of services; Q22g - good reputation of the bank; Q22h - technological advancement; Q22i - Accessibility; Q22j - benefits) (1 - completely unimportant; 2 - unimportant; 3 - slightly unimportant; 4 - undecided; 5 - slightly important; 6 - important; 7 - very important)
mydata_characterstics <- mydata[c(1, 77:86)]
head(mydata_characterstics)
## # A tibble: 6 × 11
## ID Q22a Q22b Q22c Q22d Q22e Q22f Q22g Q22h Q22i Q22j
## <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 1 7 5 6 7 7 7 7 7 7 7
## 2 2 3 5 7 6 6 7 6 7 6 6
## 3 3 6 2 2 6 7 6 6 4 5 6
## 4 4 5 3 7 6 7 6 6 7 7 6
## 5 5 6 6 6 6 7 6 7 5 6 6
## 6 6 7 4 7 6 6 4 5 5 5 4
mydata_characterstics_long <- mydata_characterstics %>%
pivot_longer(
cols = c("Q22a", "Q22b", "Q22c", "Q22d","Q22e","Q22f","Q22g", "Q22h", "Q22i", "Q22j"),
names_to = "Characterstics",
values_to = "ScoreC") %>%
convert_as_factor(Characterstics)
mydata_characterstics_long <- as.data.frame(mydata_characterstics_long)
tail(mydata_characterstics_long, 10)
## ID Characterstics ScoreC
## 1731 174 Q22a 6
## 1732 174 Q22b 4
## 1733 174 Q22c 6
## 1734 174 Q22d 6
## 1735 174 Q22e 6
## 1736 174 Q22f 6
## 1737 174 Q22g 6
## 1738 174 Q22h 7
## 1739 174 Q22i 7
## 1740 174 Q22j 5
ggboxplot(mydata_characterstics_long,
x = "Characterstics",
y = "ScoreC",
add = "jitter")
mydata_characterstics_long %>%
group_by(Characterstics) %>%
shapiro_test(ScoreC)
## # A tibble: 10 × 4
## Characterstics variable statistic p
## <fct> <chr> <dbl> <dbl>
## 1 Q22a ScoreC 0.851 4.86e-12
## 2 Q22b ScoreC 0.932 2.40e- 7
## 3 Q22c ScoreC 0.694 1.38e-17
## 4 Q22d ScoreC 0.809 8.10e-14
## 5 Q22e ScoreC 0.656 1.38e-18
## 6 Q22f ScoreC 0.720 8.08e-17
## 7 Q22g ScoreC 0.840 1.63e-12
## 8 Q22h ScoreC 0.763 1.85e-15
## 9 Q22i ScoreC 0.707 3.38e-17
## 10 Q22j ScoreC 0.857 9.56e-12
As the variables are not normally distributed, we use the Friedman ANOVA method.
mydata_characterstics_long %>%
group_by(Characterstics) %>%
get_summary_stats(ScoreC, type = "common")
## # A tibble: 10 × 11
## Characterstics variable n min max median iqr mean sd se
## <fct> <fct> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 Q22a ScoreC 174 1 7 6 1 5.42 1.47 0.112
## 2 Q22b ScoreC 174 1 7 4 2 4.17 1.56 0.118
## 3 Q22c ScoreC 174 1 7 7 1 6.17 1.26 0.095
## 4 Q22d ScoreC 174 3 7 6 1 6.08 0.95 0.072
## 5 Q22e ScoreC 174 2 7 7 1 6.27 1.20 0.091
## 6 Q22f ScoreC 174 2 7 7 1 6.29 0.997 0.076
## 7 Q22g ScoreC 174 1 7 6 1 5.61 1.29 0.098
## 8 Q22h ScoreC 174 1 7 6 1 6.03 1.16 0.088
## 9 Q22i ScoreC 174 2 7 7 1 6.27 1.04 0.079
## 10 Q22j ScoreC 174 1 7 6 2 5.52 1.40 0.107
## # ℹ 1 more variable: ci <dbl>
# Load the necessary libraries
# Perform Friedman test with ID as the blocking variable
FriedmanANOVA <- friedman_test(ScoreC ~ Characterstics | ID,
data = mydata_characterstics_long)
# Display the results
FriedmanANOVA
## # A tibble: 1 × 6
## .y. n statistic df p method
## * <chr> <int> <dbl> <dbl> <dbl> <chr>
## 1 ScoreC 174 496. 9 3.45e-101 Friedman test
We can reject H0 (p < 0.001) and conclude that at least one distribution location differs from the others.
# Wilcoxon signed-rank tests - comparing all possible pairs
paires_nonparC <- wilcox_test(ScoreC ~ Characterstics,
paired = TRUE,
p.adjust.method = "bonferroni",
data = mydata_characterstics_long)
paires_nonparC
## # A tibble: 45 × 9
## .y. group1 group2 n1 n2 statistic p p.adj p.adj.signif
## * <chr> <chr> <chr> <int> <int> <dbl> <dbl> <dbl> <chr>
## 1 ScoreC Q22a Q22b 174 174 7522 2.5 e-15 1.12e-13 ****
## 2 ScoreC Q22a Q22c 174 174 764. 2.02e-11 9.09e-10 ****
## 3 ScoreC Q22a Q22d 174 174 951 2.79e- 8 1.26e- 6 ****
## 4 ScoreC Q22a Q22e 174 174 634. 4.12e-13 1.85e-11 ****
## 5 ScoreC Q22a Q22f 174 174 812 9.21e-12 4.14e-10 ****
## 6 ScoreC Q22a Q22g 174 174 2534 1.1 e- 1 1 e+ 0 ns
## 7 ScoreC Q22a Q22h 174 174 1442. 8.66e- 7 3.90e- 5 ****
## 8 ScoreC Q22a Q22i 174 174 440 2.59e-13 1.17e-11 ****
## 9 ScoreC Q22a Q22j 174 174 2689 5.24e- 1 1 e+ 0 ns
## 10 ScoreC Q22b Q22c 174 174 295 4.71e-23 2.12e-21 ****
## # ℹ 35 more rows
comparisonsC <- paires_nonparC %>%
add_y_position(fun = "median", step.increase = 0.35)
ggboxplot(mydata_characterstics_long, x = "Characterstics", y = "ScoreC", add = "point", ylim=c(0, 57)) +
stat_pvalue_manual(comparisonsC, hide.ns = FALSE) +
stat_summary(fun = median, geom = "point", shape = 16, size = 4,
aes(group = Characterstics), color = "darkred",
position = position_dodge(width = 0.8)) +
stat_summary(fun = median, colour = "darkred",
position = position_dodge(width = 0.6),
geom = "text", vjust = 0.5, hjust = -8,
aes(label = round(after_stat(y), digits = 2), group = Characterstics)) +
labs(subtitle = get_test_label(FriedmanANOVA, detailed = TRUE),
caption = get_pwc_label(comparisonsC))
Based on the results above, the speed of services has the highest average importance rating, while the modern look of the branch office is rated as clearly the least important characteristic.
Relevant question: Q23 - How likely would you decide to change the bank in the following situations? (Q23a - long lines at branches; Q23b - unprofessional attitude of employees; Q23c - high provisions; Q23d - lack of useful properties in mobile banking app; Q23e - not getting a loan; Q23f - too much bureaucracy; Q23g - deficient information about services; Q23h - recommendation of another bank from friends/family) (1 - for sure not; 2 - not; 3 - probably not; 4 - maybe; 5 - probably yes; 6 - yes; 7 - for sure yes)
mydata_change <- mydata[c(1, 87:94)]
head(mydata_change)
## # A tibble: 6 × 9
## ID Q23a Q23b Q23c Q23d Q23e Q23f Q23g Q23h
## <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 1 4 7 7 4 4 6 6 4
## 2 2 5 5 7 6 5 5 5 5
## 3 3 4 5 7 1 5 6 6 4
## 4 4 5 6 7 5 6 5 3 4
## 5 5 3 4 7 2 5 6 6 4
## 6 6 3 5 4 5 1 2 6 4
mydata_change_long <- mydata_change %>%
pivot_longer(
cols = c("Q23a", "Q23b", "Q23c", "Q23d","Q23e","Q23f","Q23g","Q23h"),
names_to = "Change",
values_to = "ScoreCh") %>%
convert_as_factor(Change)
mydata_change_long <- as.data.frame(mydata_change_long)
tail(mydata_change_long, 10)
## ID Change ScoreCh
## 1383 173 Q23g 5
## 1384 173 Q23h 4
## 1385 174 Q23a 4
## 1386 174 Q23b 4
## 1387 174 Q23c 7
## 1388 174 Q23d 5
## 1389 174 Q23e 4
## 1390 174 Q23f 4
## 1391 174 Q23g 3
## 1392 174 Q23h 5
ggboxplot(mydata_change_long,
x = "Change",
y = "ScoreCh",
add = "jitter")
mydata_change_long %>%
group_by(Change) %>%
shapiro_test(ScoreCh)
## # A tibble: 8 × 4
## Change variable statistic p
## <fct> <chr> <dbl> <dbl>
## 1 Q23a ScoreCh 0.946 3.25e- 6
## 2 Q23b ScoreCh 0.916 1.77e- 8
## 3 Q23c ScoreCh 0.859 1.15e-11
## 4 Q23d ScoreCh 0.925 8.21e- 8
## 5 Q23e ScoreCh 0.925 7.98e- 8
## 6 Q23f ScoreCh 0.938 7.42e- 7
## 7 Q23g ScoreCh 0.933 3.18e- 7
## 8 Q23h ScoreCh 0.892 6.36e-10
As the variables are not normally distributed, we use the Friedman ANOVA method.
mydata_change_long %>%
group_by(Change) %>%
get_summary_stats(ScoreCh, type = "common")
## # A tibble: 8 × 11
## Change variable n min max median iqr mean sd se ci
## <fct> <fct> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 Q23a ScoreCh 174 1 7 4 2 3.98 1.40 0.106 0.209
## 2 Q23b ScoreCh 174 1 7 5 2 4.85 1.57 0.119 0.235
## 3 Q23c ScoreCh 174 1 7 6 2 5.41 1.49 0.113 0.222
## 4 Q23d ScoreCh 174 1 7 5 2 4.80 1.54 0.117 0.23
## 5 Q23e ScoreCh 174 1 7 5 1 4.60 1.41 0.107 0.211
## 6 Q23f ScoreCh 174 1 7 5 2 4.74 1.42 0.108 0.213
## 7 Q23g ScoreCh 174 1 7 5 2 4.67 1.51 0.114 0.226
## 8 Q23h ScoreCh 174 1 7 4 1 4.10 1.19 0.09 0.178
# Load the necessary libraries
# Perform Friedman test with ID as the blocking variable
FriedmanANOVA <- friedman_test(ScoreCh ~ Change | ID,
data = mydata_change_long)
# Display the results
FriedmanANOVA
## # A tibble: 1 × 6
## .y. n statistic df p method
## * <chr> <int> <dbl> <dbl> <dbl> <chr>
## 1 ScoreCh 174 192. 7 4.74e-38 Friedman test
We can reject H0 (p < 0.001) and conclude that at least one distribution location differs from the others.
# Wilcoxon signed-rank tests - comparing all possible pairs
paires_nonparCh <- wilcox_test(ScoreCh ~ Change,
paired = TRUE,
p.adjust.method = "bonferroni",
data = mydata_change_long)
paires_nonparCh
## # A tibble: 28 × 9
## .y. group1 group2 n1 n2 statistic p p.adj p.adj.signif
## * <chr> <chr> <chr> <int> <int> <dbl> <dbl> <dbl> <chr>
## 1 ScoreCh Q23a Q23b 174 174 923 4.47e-13 1.25e-11 ****
## 2 ScoreCh Q23a Q23c 174 174 1039 1.19e-17 3.33e-16 ****
## 3 ScoreCh Q23a Q23d 174 174 1566. 2.97e-10 8.32e- 9 ****
## 4 ScoreCh Q23a Q23e 174 174 2052. 9.91e- 6 2.77e- 4 ***
## 5 ScoreCh Q23a Q23f 174 174 1340. 8.55e- 9 2.39e- 7 ****
## 6 ScoreCh Q23a Q23g 174 174 1938. 9.36e- 7 2.62e- 5 ****
## 7 ScoreCh Q23a Q23h 174 174 3112. 3.42e- 1 1 e+ 0 ns
## 8 ScoreCh Q23b Q23c 174 174 1388. 9.65e- 7 2.70e- 5 ****
## 9 ScoreCh Q23b Q23d 174 174 3522. 4.79e- 1 1 e+ 0 ns
## 10 ScoreCh Q23b Q23e 174 174 4767 8.3 e- 2 1 e+ 0 ns
## # ℹ 18 more rows
comparisonsCh <- paires_nonparCh %>%
add_y_position(fun = "median", step.increase = 0.35)
ggboxplot(mydata_change_long, x = "Change", y = "ScoreCh", add = "point", ylim=c(0, 37)) +
stat_pvalue_manual(comparisonsCh, hide.ns = FALSE) +
stat_summary(fun = median, geom = "point", shape = 16, size = 2,
aes(group = Change), color = "darkred",
position = position_dodge(width = 0.8)) +
stat_summary(fun = median, colour = "darkred",
position = position_dodge(width = 0.6),
geom = "text", vjust = 0.5, hjust = -8,
aes(label = round(after_stat(y), digits = 2), group = Change)) +
labs(subtitle = get_test_label(FriedmanANOVA, detailed = TRUE),
caption = get_pwc_label(comparisonsCh))
Based on the results above, high provisions are the most likely reason for young people to change their bank.