h1.title {color: #33006F ;font-family: "Monaco", monospace; font-size: 48px;}
.author {color: #33006F ;font-family: "Monaco", monospace; font-size: 24px;}
.date {color: #33006F ;font-family: "Monaco", monospace; font-size: 18px;}
columns {display: flex;}
h1 {color: #33006F;font-family: "Monaco", monospace; font-size: 36px;}
columns {display: flex;}
h3 {color: #84BD00;font-family: "Monaco", monospace; font-size: 24px;}
#install.packages
library(readxl)
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
mydata_excel <- read_excel("~/Documents/Šola/IMB/2. semester/NLB project/anketa_končni podatki.xlsx")
mydata_excel <- mydata_excel[-1, ] #Delete first row in which the questions are written
mydata_excel$ID <- seq(1,nrow(mydata_excel))
head(mydata_excel)
## # A tibble: 6 × 129
## Q1 Q2 Q3 Q4 Q5 Q6a Q6b Q6c Q6d Q6e Q6f Q6g Q6h
## <chr> <chr> <chr> <chr> <chr> <chr> <chr> <chr> <chr> <chr> <chr> <chr> <chr>
## 1 2 1 1 1 1 2 1 1 2 1 2 1 1
## 2 2 1 1 2 -2 -2 -2 -2 -2 -2 -2 -2 -2
## 3 2 1 1 1 2 4 5 5 4 3 5 5 4
## 4 2 1 1 1 2 -3 -3 -3 -3 -3 -3 -3 -3
## 5 2 1 1 1 2 2 3 4 4 1 2 2 3
## 6 2 1 1 1 2 3 2 5 1 1 1 1 1
## # ℹ 116 more variables: Q6i <chr>, Q6j <chr>, Q6j_text <chr>, Q7 <chr>,
## # Assistance <chr>, Security <chr>, Transparency <chr>, Convinience <chr>,
## # `Speed and Reliability` <chr>, Q8f <chr>, Q8f_text <chr>,
## # Branch_Assistance <chr>, `Mobile bank_Assistance` <chr>,
## # Branch_Security <chr>, `Mobile bank_Security` <chr>,
## # Branch_Transparency <chr>, `Mobile bank_Transparency` <chr>,
## # Branch_Convinience <chr>, `Mobile bank_Convinience` <chr>, …
mydata <- mydata_excel[!(apply(mydata_excel == -3, 1, any)), ]
mydata <- subset(mydata, select = -c(Q21:Q40))
mydata$ID <- seq(1,nrow(mydata))
head(mydata)
## # A tibble: 6 × 83
## Q1 Q2 Q3 Q4 Q5 Q6a Q6b Q6c Q6d Q6e Q6f Q6g Q6h
## <chr> <chr> <chr> <chr> <chr> <chr> <chr> <chr> <chr> <chr> <chr> <chr> <chr>
## 1 2 1 1 1 1 2 1 1 2 1 2 1 1
## 2 2 1 1 2 -2 -2 -2 -2 -2 -2 -2 -2 -2
## 3 2 1 1 1 2 4 5 5 4 3 5 5 4
## 4 2 1 1 1 2 2 3 4 4 1 2 2 3
## 5 2 1 1 1 2 3 2 5 1 1 1 1 1
## 6 2 2 2 1 2 5 4 4 5 4 3 5 5
## # ℹ 70 more variables: Q6i <chr>, Q6j <chr>, Q6j_text <chr>, Q7 <chr>,
## # Assistance <chr>, Security <chr>, Transparency <chr>, Convinience <chr>,
## # `Speed and Reliability` <chr>, Q8f <chr>, Q8f_text <chr>,
## # Branch_Assistance <chr>, `Mobile bank_Assistance` <chr>,
## # Branch_Security <chr>, `Mobile bank_Security` <chr>,
## # Branch_Transparency <chr>, `Mobile bank_Transparency` <chr>,
## # Branch_Convinience <chr>, `Mobile bank_Convinience` <chr>, …
mydata$Q2 <- factor(mydata$Q2,
levels = c(1, 2),
labels = c("Yes","No"))
mydata$Q3 <- factor(mydata$Q3,
levels = c(1, 2),
labels = c("Yes","No"))
mydata$Q4 <- factor(mydata$Q4,
levels = c(1, 2, 3),
labels = c("Yes","No", "I don't know"))
mydata$Q5 <- factor(mydata$Q5,
levels = c(1, 2),
labels = c("Yes","No"))
mydata$Q7 <- factor(mydata$Q7,
levels = c(1, 2),
labels = c("Yes","No"))
mydata$Q16a <- factor(mydata$Q16a,
levels = c(1, 0),
labels = c("Selected","Not selected"))
mydata$Q16b <- factor(mydata$Q16b,
levels = c(1, 0),
labels = c("Selected","Not selected"))
mydata$Q16c <- factor(mydata$Q16c,
levels = c(1, 0),
labels = c("Selected","Not selected"))
mydata$Q16d <- factor(mydata$Q16d,
levels = c(1, 0),
labels = c("Selected","Not selected"))
mydata$Q16e <- factor(mydata$Q16e,
levels = c(1, 0),
labels = c("Selected","Not selected"))
mydata$Q16f <- factor(mydata$Q16f,
levels = c(1, 0),
labels = c("Selected","Not selected"))
mydata$Q16g <- factor(mydata$Q16g,
levels = c(1, 0),
labels = c("Selected","Not selected"))
mydata$Q41 <- factor(mydata$Q41,
levels = c(1, 2),
labels = c("Female","Male"))
mydata$Q43a <- factor(mydata$Q43a,
levels = c(1, 0),
labels = c("Selected","Not selected"))
mydata$Q43b <- factor(mydata$Q43b,
levels = c(1, 0),
labels = c("Selected","Not selected"))
mydata$Q43c <- factor(mydata$Q43c,
levels = c(1, 0),
labels = c("Selected","Not selected"))
mydata$Q43d <- factor(mydata$Q43d,
levels = c(1, 0),
labels = c("Selected","Not selected"))
mydata$Q43e <- factor(mydata$Q43e,
levels = c(1, 0),
labels = c("Selected","Not selected"))
mydata$Q43f <- factor(mydata$Q43f,
levels = c(1, 0),
labels = c("Selected","Not selected"))
mydata$Q43g <- factor(mydata$Q43g,
levels = c(1, 0),
labels = c("Selected","Not selected"))
mydata$Q43h <- factor(mydata$Q43h,
levels = c(1, 0),
labels = c("Selected","Not selected"))
mydata$Q44 <- factor(mydata$Q44,
levels = c(2, 6, 1, 4, 3, 5),
labels = c("1.000 – 5.000 habitants","More than 100.000 habitants", "Less than 1.000 habitants", "20.001 – 50.000 habitants", "5.001 – 20.000 habitants", "50.001 – 100.000 habitants"))
mydata$Q45 <- factor(mydata$Q45,
levels = c(3, 5, 1, 9, 12, 7, 10, 4, 11, 6),
labels = c("OTP banka d.d.","Banka Intesa Sanpaolo d.d.", "Nova Ljubljanska Banka d.d. (NLB)", "Gorenjska Banka d.d.", "Delavska Hranilnica d.d.", "Revolut", "Deželna Banka Slovenije d.d.", "Banka Sparkasse d.d.", "Addiko Bank d.d.", "UniCredit Banka Slovenija d.d."))
mydata$Q46 <- factor(mydata$Q46,
levels = c(1, 2, 3, 5, 6, 4),
labels = c("Študent/-ka","Redno zaposlen/-a", "Upokojen/-a", "Samozaposlen/-a", "Delno zaposlen/-a", "Brezposeln/-a"))
mydata$Q47 <- factor(mydata$Q47,
levels = c(1, 5, 3, 8, 2, 4, 6, 7),
labels = c("Pod 1.000€","3.001€ - 5.000€", "1.501€ - 2.000€", "I don't want to answer", "1.000€ - 1.500€", "2.001€ - 3.000€", "5.001€ - 10.000€", "Above 10.000€"))
mydata$Q48 <- factor(mydata$Q48,
levels = c(2, 6, 3, 5, 7, 4),
labels = c("Dokončana osnovna šola","Dokončana visokošolska strokovna univerzitetna izobrazba (tudi 2. bolonjska stopnja)", "Dokončana nižja ali srednja poklicna izobrazba", "Dokončana višješolska strokovna ali visokošolska strokovna izobrazba (tudi 1. bolonjska stopnja)", "Dokončana specializacija, znanstveni magisterij, doktorat", "Dokončana srednja strokovna ali splošna izobrazba"))
mydata$Q49 <- factor(mydata$Q49,
levels = c(1, 2, 3),
labels = c("Preko linka","Na tablici", "Na listu papirja"))
mydata[c(6:14, 18:22, 25:39, 48:55, 58:62)] <- mydata[c(6:14, 18:22, 25:39, 48:55, 58:62)] %>% mutate_all(as.numeric)
mydata <- mydata %>% mutate(across(where(is.numeric), ~ replace(., . == -2, mean(.[. != -2], na.rm = TRUE))))
mydata <- mydata %>% mutate(across(where(is.numeric), ~ replace(., is.na(.), mean(., na.rm = TRUE))))
mydata <- mydata %>%
filter(!ID %in% c(2, 16, 17))
mydata$ID <- seq(1, nrow(mydata))
summary(mydata[c(-1, -15, -16, -23, -24, -47, -56, -57, -63, -64, -66, -75, -78, -83)])
## Q2 Q3 Q4 Q5 Q6a
## Yes :121 Yes :131 Yes :103 Yes : 25 Min. :1.000
## No : 36 No : 26 No : 28 No :132 1st Qu.:3.000
## NA's: 1 NA's: 1 I don't know: 26 NA's: 1 Median :3.000
## NA's : 1 Mean :3.433
## 3rd Qu.:4.000
## Max. :5.000
##
## Q6b Q6c Q6d Q6e
## Min. :1.000 Min. :1.000 Min. :1.000 Min. :1.000
## 1st Qu.:3.000 1st Qu.:3.000 1st Qu.:3.000 1st Qu.:1.000
## Median :4.000 Median :4.000 Median :4.000 Median :2.000
## Mean :3.535 Mean :3.847 Mean :3.854 Mean :2.688
## 3rd Qu.:4.000 3rd Qu.:5.000 3rd Qu.:5.000 3rd Qu.:4.000
## Max. :5.000 Max. :5.000 Max. :5.000 Max. :5.000
##
## Q6f Q6g Q6h Q6i Q7
## Min. :1.000 Min. :1.000 Min. :1.00 Min. :1.000 Yes :82
## 1st Qu.:3.000 1st Qu.:2.000 1st Qu.:2.00 1st Qu.:3.000 No :75
## Median :4.000 Median :3.086 Median :3.00 Median :4.000 NA's: 1
## Mean :3.497 Mean :3.172 Mean :2.93 Mean :3.465
## 3rd Qu.:4.000 3rd Qu.:4.000 3rd Qu.:4.00 3rd Qu.:4.000
## Max. :5.000 Max. :5.000 Max. :5.00 Max. :5.000
##
## Assistance Security Transparency Convinience
## Min. :1.00 Min. :1.000 Min. :1.000 Min. :1.000
## 1st Qu.:3.00 1st Qu.:4.000 1st Qu.:4.000 1st Qu.:4.000
## Median :4.00 Median :5.000 Median :4.000 Median :4.000
## Mean :3.79 Mean :4.382 Mean :4.108 Mean :3.975
## 3rd Qu.:4.00 3rd Qu.:5.000 3rd Qu.:5.000 3rd Qu.:5.000
## Max. :5.00 Max. :5.000 Max. :5.000 Max. :5.000
##
## Speed and Reliability Branch_Assistance Mobile bank_Assistance Branch_Security
## Min. :1.000 Min. :2.000 Min. :1.00 Min. :2.000
## 1st Qu.:4.000 1st Qu.:4.000 1st Qu.:3.00 1st Qu.:3.986
## Median :4.000 Median :4.000 Median :3.00 Median :4.000
## Mean :4.185 Mean :4.006 Mean :3.14 Mean :3.981
## 3rd Qu.:5.000 3rd Qu.:5.000 3rd Qu.:4.00 3rd Qu.:5.000
## Max. :5.000 Max. :5.000 Max. :5.00 Max. :5.000
##
## Mobile bank_Security Branch_Transparency Mobile bank_Transparency
## Min. :1.000 Min. :1.000 Min. :1.000
## 1st Qu.:2.000 1st Qu.:3.000 1st Qu.:3.000
## Median :3.000 Median :4.000 Median :3.000
## Mean :3.032 Mean :3.898 Mean :3.248
## 3rd Qu.:4.000 3rd Qu.:4.000 3rd Qu.:4.000
## Max. :5.000 Max. :5.000 Max. :5.000
##
## Branch_Convinience Mobile bank_Convinience Branch_Speed and Reliability
## Min. :1.000 Min. :1.000 Min. :1.000
## 1st Qu.:3.000 1st Qu.:3.000 1st Qu.:3.000
## Median :4.000 Median :3.000 Median :4.000
## Mean :3.752 Mean :3.389 Mean :3.701
## 3rd Qu.:4.000 3rd Qu.:4.000 3rd Qu.:4.000
## Max. :5.000 Max. :5.000 Max. :5.000
##
## Mobile bank_Speed and Reliability Q15a Q15b Q15c
## Min. :1.000 Min. :1 Min. :2.000 Min. :1.000
## 1st Qu.:3.000 1st Qu.:4 1st Qu.:4.000 1st Qu.:4.000
## Median :4.000 Median :4 Median :4.000 Median :4.000
## Mean :3.586 Mean :4 Mean :4.134 Mean :4.115
## 3rd Qu.:4.000 3rd Qu.:5 3rd Qu.:5.000 3rd Qu.:5.000
## Max. :5.000 Max. :5 Max. :5.000 Max. :5.000
##
## Q15d Q15e Q16a Q16b
## Min. :1.000 Min. :2.000 Selected :98 Selected :66
## 1st Qu.:3.000 1st Qu.:4.000 Not selected:59 Not selected:91
## Median :4.000 Median :4.000 NA's : 1 NA's : 1
## Mean :3.809 Mean :4.127
## 3rd Qu.:4.000 3rd Qu.:5.000
## Max. :5.000 Max. :5.000
##
## Q16c Q16d Q16e Q16f
## Selected :68 Selected :79 Selected : 49 Selected : 36
## Not selected:89 Not selected:78 Not selected:108 Not selected:121
## NA's : 1 NA's : 1 NA's : 1 NA's : 1
##
##
##
##
## Q16g Q17a Q17b Q17c
## Selected : 9 Min. :0.000 Min. :0.000 Min. :0.000
## Not selected:148 1st Qu.:4.000 1st Qu.:4.000 1st Qu.:4.000
## NA's : 1 Median :4.000 Median :4.000 Median :4.000
## Mean :3.662 Mean :3.694 Mean :3.758
## 3rd Qu.:4.000 3rd Qu.:4.000 3rd Qu.:4.750
## Max. :5.000 Max. :5.000 Max. :5.000
##
## Q17d Q17e Q18a Q18b Q18c
## Min. :0.000 Min. :0.000 Min. :1.00 Min. :1.000 Min. :1.000
## 1st Qu.:2.000 1st Qu.:4.000 1st Qu.:3.00 1st Qu.:3.000 1st Qu.:3.000
## Median :4.000 Median :4.000 Median :4.00 Median :4.000 Median :4.000
## Mean :3.223 Mean :3.809 Mean :3.35 Mean :3.732 Mean :3.713
## 3rd Qu.:4.000 3rd Qu.:5.000 3rd Qu.:4.00 3rd Qu.:4.000 3rd Qu.:4.000
## Max. :5.000 Max. :5.000 Max. :5.00 Max. :5.000 Max. :5.000
##
## Q19a Q19b Q19c Q19d
## Min. :1.000 Min. :1.000 Min. :1.000 Min. :1.000
## 1st Qu.:3.000 1st Qu.:3.000 1st Qu.:3.000 1st Qu.:3.000
## Median :3.000 Median :4.000 Median :4.000 Median :4.000
## Mean :3.217 Mean :3.548 Mean :3.331 Mean :3.618
## 3rd Qu.:4.000 3rd Qu.:4.000 3rd Qu.:4.000 3rd Qu.:4.000
## Max. :5.000 Max. :5.000 Max. :5.000 Max. :5.000
##
## Q19e Q41 Q43a Q43b
## Min. :1.000 Female:78 Selected : 32 Selected : 12
## 1st Qu.:2.000 Male :79 Not selected:125 Not selected:145
## Median :3.000 NA's : 1 NA's : 1 NA's : 1
## Mean :3.051
## 3rd Qu.:4.000
## Max. :5.000
##
## Q43c Q43d Q43e Q43f
## Selected : 5 Selected :81 Selected : 57 Selected : 1
## Not selected:152 Not selected:76 Not selected:100 Not selected:156
## NA's : 1 NA's : 1 NA's : 1 NA's : 1
##
##
##
##
## Q43g Q43h Q44
## Selected : 29 Selected : 0 1.000 – 5.000 habitants :44
## Not selected:128 Not selected:157 More than 100.000 habitants:36
## NA's : 1 NA's : 1 Less than 1.000 habitants :30
## 20.001 – 50.000 habitants :14
## 5.001 – 20.000 habitants :22
## 50.001 – 100.000 habitants :11
## NA's : 1
## Q45 Q46
## OTP banka d.d. :71 Študent/-ka :22
## Nova Ljubljanska Banka d.d. (NLB):35 Redno zaposlen/-a:73
## Delavska Hranilnica d.d. :12 Upokojen/-a :43
## Banka Intesa Sanpaolo d.d. :10 Samozaposlen/-a :13
## Deželna Banka Slovenije d.d. : 8 Delno zaposlen/-a: 3
## (Other) :18 Brezposeln/-a : 3
## NA's : 4 NA's : 1
## Q47
## Pod 1.000€ :46
## 1.000€ - 1.500€:39
## 2.001€ - 3.000€:29
## 1.501€ - 2.000€:25
## 3.001€ - 5.000€: 6
## (Other) : 4
## NA's : 9
## Q48
## Dokončana osnovna šola : 8
## Dokončana visokošolska strokovna univerzitetna izobrazba (tudi 2. bolonjska stopnja) :35
## Dokončana nižja ali srednja poklicna izobrazba :23
## Dokončana višješolska strokovna ali visokošolska strokovna izobrazba (tudi 1. bolonjska stopnja):34
## Dokončana specializacija, znanstveni magisterij, doktorat : 5
## Dokončana srednja strokovna ali splošna izobrazba :50
## NA's : 3
## Q49
## Preko linka :84
## Na tablici :37
## Na listu papirja:17
## NA's :20
##
##
##
Q1: Ali uporabljate mobilno aplikacijo banke, kjer imate odprt primarni račun (npr. KlikIn, mBank@Net in podobno)? [1 - Da, 2 - Ne]
Q2: Ali dnevno uporabljate pametni telefon? [1 - Da, 2 - Ne]
Q3: Ali uporabljate internet vsaj nekajkrat na teden? [1 - Da, 2 - Ne]
Q4: Ali je kdo od vaših družinskih članov uporabnik mobilne banke? [1 - Da, 2 - Ne, 3 - Ne vem]
Q5: Ali imate skupni bančni račun, ki ga upravlja en član družine? [1 - Da, 2 - Ne]
Q6: Zakaj ne uporabljate mobilne banke (npr. KlikIn, mBank@Net, Addiko Mobil, …)? [1 - Sploh se ne strinjam, 2 - Se ne strinjam, 3 - Niti se ne strinjam niti se strinjam, 4 - Se strinjam, 5 - Popolnoma se strinjam]
Q7: Se vam zdi, da bi potrebovali dodatno izobraževanje, preden bi lahko uporabljali mobilno banko? [1 - Da, 2 - Ne]
Q8: Kako pomembni so naslednji dejavniki pri odločitvi med poslovalnico ali mobilno aplikacijo? [1 - Sploh ni pomembno, 2 - Ni pomembno, 3 - Niti ni pomembno niti je pomembno, 4 - Je pomembno, 5 - Zelo pomembno]
Q10: Kako zaznavate podporo in svetovanje v poslovalnici in mobilni aplikaciji? [1 - Sploh se ne strinjam, 2 - Se ne strinjam, 3 - Niti niti, 4 - Se strinjam, 5 - Popolnoma se strinjam]
Q11: Kako zaznavate zagotavljanje varnosti in zaščito vaših finančnih podatkov ter transakcij v poslovalnici in mobilni aplikaciji? [1 - Sploh se ne strinjam, 2 - Se ne strinjam, 3 - Niti niti, 4 - Se strinjam, 5 - Popolnoma se strinjam]
Q12: Kako zaznavate transparentnost delovanja banke in njenih storitev v poslovalnici in mobilni aplikaciji? [1 - Sploh se ne strinjam, 2 - Se ne strinjam, 3 - Niti niti, 4 - Se strinjam, 5 - Popolnoma se strinjam]
Q13: Kako zaznavate priročnost (npr. enostavna uporaba, dostopnost od kjerkoli, hitre transakcije) v poslovalnici in mobilni aplikaciji? [1 - Sploh se ne strinjam, 2 - Se ne strinjam, 3 - Niti niti, 4 - Se strinjam, 5 - Popolnoma se strinjam]
Q14: Kako zaznavate zanesljivost opravljene storitve v poslovalnici in mobilni aplikaciji? [1 - Sploh se ne strinjam, 2 - Se ne strinjam, 3 - Niti niti, 4 - Se strinjam, 5 - Popolnoma se strinjam]
Q15: Kako zaskrbljeni ste glede naslednjih tveganj med uporabo mobilnega bančništva? [1 - Sploh me ne skrbi, 2 - Me ne skrbi, 3 - Niti me ne skrbi niti me skrbi, 4 - Me skrbi, 5 - Zelo me skrbi]
Q16: Kateri izmed spodaj navedenih dejavnikov bi vam omogočili večje zaupanje v mobilno bančništvo? [Izbira več možnih odgovorov]
Q17: Kako koristne bi se vam zdele naslednje funkcije v aplikaciji za mobilno bančništvo? [1 - Sploh ni koristna, 2 - Ni koristna, 3 - Niti ni koristna niti je koristna, 4 - Je koristna, 5 - Izjemno koristna]
Q18: Kateri finančni spodbudni ukrepi bi vas spodbudili k uporabi mobilnega bančništva? [1 - Sploh me ne bi spodbudilo, 2 - Ne bi me spodbudilo, 3 - Niti niti, 4 - Bi me spodbudilo, 5 - Zelo bi me spodbudilo]
Q19: Kateri nefinančni spodbudni ukrepi bi vas spodbudili k uporabi mobilnega bančništva? [1 - Sploh me ne bi spodbudilo, 2 - Ne bi me spodbudilo, 3 - Niti niti, 4 - Bi me spodbudilo, 5 - Zelo bi me spodbudilo]
Q41: Spol [1 - Ženska, 2 - Moški, 3 - Ne želim odgovoriti]
Q42: Prosimo, vpišite leto rojstva. [Odprto besedilo]
Q43: S kom živite v istem gospodinjstvu? [Izbira več možnih odgovorov]
Q44: Koliko prebivalcev živi v kraju, kjer prebivate? [1 - Manj kot 1.000 prebivalcev, 2 - 1.000–5.000 prebivalcev, 3 - 5.001–20.000 prebivalcev, 4 - 20.001–50.000 prebivalcev, 5 - 50.001–100.000 prebivalcev, 6 - Več kot 100.000 prebivalcev]
Q45: Katera je vaša primarna banka? [1 - Nova Ljubljanska Banka d.d. (NLB), 2 - BKS Bank AG, Bančna podružnica, 3 - OTP banka d.d., 4 - Banka Sparkasse d.d., 5 - Banka Intesa Sanpaolo d.d., 6 - UniCredit Banka Slovenija d.d., 7 - Revolut, 8 - N26, 9 - Gorenjska Banka d.d., 10 - Deželna Banka Slovenije d.d., 11 - Addiko Bank d.d., 12 - Delavska Hranilnica d.d., 13 - Drugo]
Q46: Kakšna je vaša trenutna zaposlitev? [1 - Študent/ka, 2 - Redno zaposlen/a, 3 - Upokojen/a, 4 - Brezposeln/a, 5 - Samozaposlen/a, 6 - Delno zaposlen/a]
Q47: Kakšen je vaš mesečni neto prihodek? [1 - Pod 1.000€, 2 - 1.000€–1.500€, 3 - 1.501€–2.000€, 4 - 2.001€–3.000€, 5 - 3.001€–5.000€, 6 - 5.001€–10.000€, 7 - Nad 10.000€, 8 - Ne želim odgovoriti]
Q48: Kakšna je vaša stopnja izobrazbe? [1 - Nedokončana osnovna šola, 2 - Dokončana osnovna šola, 3 - Dokončana nižja ali srednja poklicna izobrazba, 4 - Dokončana srednja strokovna ali splošna izobrazba, 5 - Dokončana višješolska strokovna ali visokošolska strokovna izobrazba (tudi 1. bolonjska stopnja), 6 - Dokončana visokošolska strokovna ali univerzitetna izobrazba (tudi 2. bolonjska stopnja), 7 - Dokončana specializacija, znanstveni magisterij ali doktorat]
Q49: Kako ste rešili anketo? [1 - Preko linka, 2 - Na tablici, 3 - Na listu papirja]
knitr::opts_chunk$set(echo = TRUE)
options(width = 120)
#install.packages(ggplot2)
library(ggplot2)
#install.packages("ggfortify")
library(ggfortify)
#install.packages("ranger")
library(ranger)
#install.packages("dplyr")
library(dplyr)
#install.packages("Hmisc")
library(Hmisc)
##
## Attaching package: 'Hmisc'
## The following objects are masked from 'package:dplyr':
##
## src, summarize
## The following objects are masked from 'package:base':
##
## format.pval, units
#install.packages("factoextra")
library(factoextra)
## Welcome! Want to learn more? See two factoextra-related books at https://goo.gl/ve3WBa
#install.packages("cluster")
library(cluster)
#install.packages("magrittr")
library(magrittr)
#install.packages("NbClust")
library("NbClust")
library(dplyr)
mydata <- mydata %>%
mutate(Security_concerns = rowMeans(across(c(7, 8, 14)), na.rm = TRUE))
mydata <- mydata %>%
mutate(Lack_of_competence_or_support = rowMeans(across(c(12, 13)), na.rm = TRUE))
mydata <- mydata %>%
mutate(Preference_for_traditional_methods = rowMeans(across(c(9, 11)), na.rm = TRUE))
colnames(mydata) [6] <- "Aversion_to_change"
colnames(mydata) [10] <- "Physical_limitations"
From question 6, we made 5 different variables. 3 of them are grouped together in a way that we took the mean of them together (6b, 6c and 6i are Security_concerns; 6g and 6h are Lack_of_competence_or_support; 6d and 6f are Preference_for_traditional_methods) and than 6a is Aversion_to_change and 6e are Physical_limitations.
#Saving standardized cluster variables into new data frame
mydata_clu_std <- as.data.frame(scale(mydata[c("Security_concerns", "Lack_of_competence_or_support", "Preference_for_traditional_methods", "Aversion_to_change", "Physical_limitations")]))
mydata$Dissimilarity <- sqrt(mydata_clu_std$Security_concerns^2 + mydata_clu_std$Lack_of_competence_or_support^2 + mydata_clu_std$Preference_for_traditional_methods^2 +
mydata_clu_std$Aversion_to_change^2 + mydata_clu_std$Physical_limitations^2) #Finding outliers
head(mydata[order(-mydata$Dissimilarity), c("ID", "Dissimilarity")]) #Finding units with highest value of dissimilarity
## # A tibble: 6 × 2
## ID Dissimilarity
## <int> <dbl>
## 1 53 4.92
## 2 1 4.06
## 3 123 3.97
## 4 44 3.94
## 5 30 3.71
## 6 72 3.64
ID53 is a potential outlier, as there is a big jump in disimilarity numbers between units. For this reason we will remove this unit.
mydata <- mydata %>%
filter(!ID %in% c(53)) #Removing ID53 from original data frame
mydata$ID <- seq(1, nrow(mydata)) #Numbering the data again
mydata_clu_std <- as.data.frame(scale(mydata[c("Security_concerns", "Lack_of_competence_or_support", "Preference_for_traditional_methods", "Aversion_to_change", "Physical_limitations")])) #Standardizing the data again
After removing one country, the sample size is now 157.
#Finding Eudlidean distances, based on 5 Cluster variables, then saving them into object Distances
Distances <- get_dist(mydata_clu_std,
method = "euclidian")
fviz_dist(Distances, #Showing matrix of distances
gradient = list(low = "#33006F",
mid = "grey95",
high = "white"))
We can see on the matrix of distances that some groups of clusters are forming, I see 3 groups.
get_clust_tendency(mydata_clu_std, #Hopkins statistics
n = nrow(mydata_clu_std) - 1,
graph = FALSE)
## $hopkins_stat
## [1] 0.6029504
##
## $plot
## NULL
This data is clusterable as it is above 0.5. If it would be more close to 1, it would be even more appropriate. However the threshold is 0.5. Now the next question is how many clusters to use. I will check this with Hierarhical clustering (dendrogram) and K-Means clustering (Elbow method, Silhouette analysis and with the help of indices).
WARD <- mydata_clu_std %>%
get_dist(method = "euclidean") %>%
hclust(method = "ward.D2")
WARD
##
## Call:
## hclust(d = ., method = "ward.D2")
##
## Cluster method : ward.D2
## Distance : euclidean
## Number of objects: 157
fviz_dend(WARD)
## Warning: The `<scale>` argument of `guides()` cannot be `FALSE`. Use "none" instead as of ggplot2 3.3.4.
## ℹ The deprecated feature was likely used in the factoextra package.
## Please report the issue at <https://github.com/kassambara/factoextra/issues>.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was generated.
Based on the dendrogram, we would choose 2 clusters, as there is the biggest jump in vertical line.
fviz_nbclust(mydata_clu_std, kmeans, method = "wss") +
labs(subtitle = "Elbow method")
With the elbow method the slope changes most evidently at 2 clusters.
fviz_nbclust(mydata_clu_std, kmeans, method = "silhouette")+
labs(subtitle = "Silhouette analysis")
The higest value of the Silhouette analysis is at 2.
NbClust(mydata_clu_std,
distance = "euclidean",
min.nc = 2, max.nc = 10,
method = "kmeans",
index = "all")
## *** : The Hubert index is a graphical method of determining the number of clusters.
## In the plot of Hubert index, we seek a significant knee that corresponds to a
## significant increase of the value of the measure i.e the significant peak in Hubert
## index second differences plot.
##
## *** : The D index is a graphical method of determining the number of clusters.
## In the plot of D index, we seek a significant knee (the significant peak in Dindex
## second differences plot) that corresponds to a significant increase of the value of
## the measure.
##
## *******************************************************************
## * Among all indices:
## * 10 proposed 2 as the best number of clusters
## * 2 proposed 3 as the best number of clusters
## * 3 proposed 4 as the best number of clusters
## * 1 proposed 6 as the best number of clusters
## * 1 proposed 7 as the best number of clusters
## * 4 proposed 9 as the best number of clusters
## * 2 proposed 10 as the best number of clusters
##
## ***** Conclusion *****
##
## * According to the majority rule, the best number of clusters is 2
##
##
## *******************************************************************
## $All.index
## KL CH Hartigan CCC Scott Marriot TrCovW TraceW Friedman Rubin Cindex DB Silhouette
## 2 9.5966 92.7417 23.3648 -1.9059 182.2586 22365513125 9141.421 488.0082 3.5819 1.5983 0.4213 1.2674 0.3109
## 3 0.2118 64.6236 38.3498 -4.2729 287.2586 25781592519 5810.071 424.0819 5.0700 1.8393 0.3873 1.7241 0.2346
## 4 33.4822 66.1618 14.8522 -2.2561 410.1106 20958157652 4746.590 339.5305 6.7664 2.2973 0.3518 1.4934 0.2405
## 5 0.1946 57.7712 14.4020 -2.8446 501.1125 18341844976 4146.290 309.4876 8.4125 2.5203 0.3329 1.5378 0.2127
## 6 0.3402 53.1246 18.4451 -2.6373 577.8546 16200227137 3341.184 282.7016 9.7434 2.7591 0.3193 1.5636 0.2199
## 7 170.9406 52.4031 8.2807 -1.4552 648.9874 14016730597 2585.750 251.9279 11.0928 3.0961 0.3641 1.4115 0.2343
## 8 0.0053 48.2557 17.2282 -1.9713 692.5546 13871218677 2394.293 238.7479 12.0205 3.2670 0.3331 1.5076 0.2114
## 9 11.5140 48.9287 5.5136 -0.5159 772.2003 10570689341 1798.061 214.0037 13.8227 3.6448 0.4090 1.3345 0.2287
## 10 0.4798 45.4161 7.5001 -1.0486 804.3282 10635205508 1752.332 206.3176 14.7638 3.7806 0.4041 1.4314 0.2050
## Duda Pseudot2 Beale Ratkowsky Ball Ptbiserial Frey McClain Dunn Hubert SDindex Dindex SDbw
## 2 0.9784 2.1452 0.0684 0.4324 244.0041 0.4973 0.9300 0.6734 0.0959 0.0021 1.8068 1.6568 0.9625
## 3 1.0585 -4.0338 -0.1694 0.3884 141.3606 0.4739 0.2894 1.3160 0.1174 0.0024 2.1176 1.5423 1.9356
## 4 1.3475 -15.9880 -0.7858 0.3757 84.8826 0.4900 0.6398 1.8251 0.0722 0.0029 1.9498 1.3840 0.7941
## 5 1.8725 -23.7634 -1.4063 0.3470 61.8975 0.4659 0.3248 2.3156 0.0976 0.0033 2.0907 1.3079 0.7352
## 6 1.0624 -2.2896 -0.1778 0.3258 47.1169 0.4573 0.0996 2.7357 0.0987 0.0033 2.1119 1.2557 0.4736
## 7 1.3570 -8.9441 -0.7859 0.3109 35.9897 0.4659 0.7818 2.9210 0.1167 0.0033 1.9525 1.1932 0.3572
## 8 1.7019 -12.3726 -1.2101 0.2944 29.8435 0.4378 0.0754 3.4835 0.0816 0.0035 2.0991 1.1490 0.3336
## 9 0.9718 0.6384 0.0860 0.2838 23.7782 0.4444 1.2669 3.6472 0.1444 0.0035 1.9951 1.1000 0.3151
## 10 1.1576 -1.6341 -0.3788 0.2711 20.6318 0.4187 -0.1176 4.2006 0.1728 0.0036 2.2337 1.0821 0.3061
##
## $All.CriticalValues
## CritValue_Duda CritValue_PseudoT2 Fvalue_Beale
## 2 0.6717 47.4016 0.9968
## 3 0.6107 46.5380 1.0000
## 4 0.5760 45.6335 1.0000
## 5 0.5287 45.4696 1.0000
## 6 0.5452 32.5352 1.0000
## 7 0.4864 35.9027 1.0000
## 8 0.4234 40.8610 1.0000
## 9 0.4584 25.9898 0.9943
## 10 0.2868 29.8345 1.0000
##
## $Best.nc
## KL CH Hartigan CCC Scott Marriot TrCovW TraceW Friedman Rubin Cindex DB
## Number_clusters 7.0000 2.0000 4.0000 9.0000 4.000 9 3.00 4.0000 9.0000 9.000 6.0000 2.0000
## Value_Index 170.9406 92.7417 23.4976 -0.5159 122.852 3365045502 3331.35 54.5085 1.8023 -0.242 0.3193 1.2674
## Silhouette Duda PseudoT2 Beale Ratkowsky Ball PtBiserial Frey McClain Dunn Hubert SDindex
## Number_clusters 2.0000 2.0000 2.0000 2.0000 2.0000 3.0000 2.0000 1 2.0000 10.0000 0 2.0000
## Value_Index 0.3109 0.9784 2.1452 0.0684 0.4324 102.6435 0.4973 NA 0.6734 0.1728 0 1.8068
## Dindex SDbw
## Number_clusters 0 10.0000
## Value_Index 0 0.3061
##
## $Best.partition
## [1] 1 2 1 1 2 1 1 1 2 1 1 1 1 1 2 2 1 1 1 2 1 1 1 2 1 2 1 2 2 1 2 1 2 2 1 2 1 1 1 1 1 2 1 1 2 2 1 1 2 2 2 1 2 2 1 2 1
## [58] 2 1 2 2 2 2 1 1 2 2 1 2 2 1 1 1 1 1 1 1 1 2 1 2 2 2 1 1 2 2 2 1 1 1 1 1 1 1 2 2 1 1 1 1 2 1 1 2 2 1 1 2 2 2 2 2 2
## [115] 1 2 2 2 1 2 1 1 2 1 2 2 2 2 2 1 2 2 1 2 1 2 2 2 2 2 2 2 1 2 1 1 1 1 1 1 1 1 1 1 1 1 2
We will proceed with 3 clusters, as this would be the most optimal for our project.
Clustering <- kmeans(mydata_clu_std,
centers = 3, #Number of groups
nstart = 25) #Number of attempts at different starting leader positions
Clustering
## K-means clustering with 3 clusters of sizes 45, 46, 66
##
## Cluster means:
## Security_concerns Lack_of_competence_or_support Preference_for_traditional_methods Aversion_to_change
## 1 -0.96430126 -0.5695709 -1.2244132 -0.9276589
## 2 0.09075076 -0.6448033 0.2826530 0.1622943
## 3 0.59422760 0.8377522 0.6378266 0.5193805
## Physical_limitations
## 1 -0.4679937
## 2 -0.7534631
## 3 0.8442276
##
## Clustering vector:
## [1] 1 3 1 1 3 2 2 2 3 1 2 2 2 1 3 3 1 1 3 3 2 2 1 3 2 3 1 3 3 1 2 2 3 3 2 3 2 1 2 1 1 3 2 1 3 3 2 2 3 2 3 2 3 3 1 3 2
## [58] 3 2 3 3 2 3 1 1 3 3 2 3 3 1 1 1 2 2 1 3 1 3 1 3 3 2 2 1 3 3 3 2 1 2 1 1 1 1 3 2 1 2 1 1 3 1 2 2 2 2 2 3 3 3 3 3 3
## [115] 1 3 3 3 1 3 1 1 3 1 3 3 3 3 3 3 3 3 2 2 3 3 3 3 3 3 3 3 1 2 2 1 1 2 2 2 2 2 1 1 1 1 2
##
## Within cluster sum of squares by cluster:
## [1] 141.2425 107.8430 146.6024
## (between_SS / total_SS = 49.3 %)
##
## Available components:
##
## [1] "cluster" "centers" "totss" "withinss" "tot.withinss" "betweenss" "size"
## [8] "iter" "ifault"
library(factoextra)
fviz_cluster(Clustering,
palette = "Set1",
repel = FALSE,
ggtheme = theme_bw(),
data = mydata_clu_std)
Averages <- Clustering$centers
Averages #Average values of cluster variables to describe groups
## Security_concerns Lack_of_competence_or_support Preference_for_traditional_methods Aversion_to_change
## 1 -0.96430126 -0.5695709 -1.2244132 -0.9276589
## 2 0.09075076 -0.6448033 0.2826530 0.1622943
## 3 0.59422760 0.8377522 0.6378266 0.5193805
## Physical_limitations
## 1 -0.4679937
## 2 -0.7534631
## 3 0.8442276
Figure <- as.data.frame(Averages)
Figure$ID <- 1:nrow(Figure)
library(tidyr)
##
## Attaching package: 'tidyr'
## The following object is masked from 'package:magrittr':
##
## extract
Figure <- pivot_longer(Figure, cols = c("Security_concerns", "Lack_of_competence_or_support", "Preference_for_traditional_methods", "Aversion_to_change", "Physical_limitations"))
Figure$Group <- factor(Figure$ID,
levels = c(1, 2, 3),
labels = c("1", "2", "3"))
Figure$NameF <- factor(Figure$name,
levels = c("Security_concerns", "Lack_of_competence_or_support", "Preference_for_traditional_methods", "Aversion_to_change", "Physical_limitations"),
labels = c("Security_concerns", "Lack_of_competence_or_support", "Preference_for_traditional_methods", "Aversion_to_change", "Physical_limitations"))
library(ggplot2)
ggplot(Figure, aes(x = NameF, y = value)) +
geom_hline(yintercept = 0) +
theme_bw() +
geom_point(aes(shape = Group, col = Group), size = 3) +
geom_line(aes(group = ID), linewidth = 1) +
ylab("Averages") +
xlab("Cluster variables")+
ylim(-2.2, 2.2) +
theme(axis.text.x = element_text(angle = 45, vjust = 0.50, size = 10))
mydata$Group <- Clustering$cluster #Assignings units to groups
#Checking if clustering variables successfully differentiate between groups
fit <- aov(cbind(Security_concerns, Lack_of_competence_or_support, Preference_for_traditional_methods, Aversion_to_change, Physical_limitations) ~ as.factor(Group),
data = mydata)
summary(fit)
## Response Security_concerns :
## Df Sum Sq Mean Sq F value Pr(>F)
## as.factor(Group) 2 56.024 28.0120 55.771 < 2.2e-16 ***
## Residuals 154 77.350 0.5023
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Response Lack_of_competence_or_support :
## Df Sum Sq Mean Sq F value Pr(>F)
## as.factor(Group) 2 123.84 61.922 81.146 < 2.2e-16 ***
## Residuals 154 117.52 0.763
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Response Preference_for_traditional_methods :
## Df Sum Sq Mean Sq F value Pr(>F)
## as.factor(Group) 2 81.174 40.587 130.06 < 2.2e-16 ***
## Residuals 154 48.057 0.312
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Response Aversion_to_change :
## Df Sum Sq Mean Sq F value Pr(>F)
## as.factor(Group) 2 80.167 40.083 45.248 3.486e-16 ***
## Residuals 154 136.423 0.886
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Response Physical_limitations :
## Df Sum Sq Mean Sq F value Pr(>F)
## as.factor(Group) 2 162.21 81.105 87.57 < 2.2e-16 ***
## Residuals 154 142.63 0.926
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
Response for Social support:
H0: μ(Security_concerns, G1) = μ(Security_concerns, G2) = μ(Security_concerns, G3)
H1: At least one μ(Security_concerns, j) is different.
We can reject H0 at p < 0.001. We can reject H0 for all cluster variables at p < 0.001. Therefore we can assume that the groups are statistically different in the mean values of the cluster variables.
Next step is to check the criterion validity of the classification with variables that were not used in the clustering process.
chi_square <- chisq.test(mydata$Q2, as.factor(mydata$Group))
chi_square
##
## Pearson's Chi-squared test
##
## data: mydata$Q2 and as.factor(mydata$Group)
## X-squared = 20.555, df = 2, p-value = 3.439e-05
addmargins(chi_square$observed)
## as.factor(mydata$Group)
## mydata$Q2 1 2 3 Sum
## Yes 40 41 39 120
## No 5 4 27 36
## Sum 45 45 66 156
addmargins(round(chi_square$expected, 2))
## as.factor(mydata$Group)
## mydata$Q2 1 2 3 Sum
## Yes 34.62 34.62 50.77 120.01
## No 10.38 10.38 15.23 35.99
## Sum 45.00 45.00 66.00 156.00
round(chi_square$res, 2)
## as.factor(mydata$Group)
## mydata$Q2 1 2 3
## Yes 0.92 1.09 -1.65
## No -1.67 -1.98 3.02
library(effectsize)
effectsize::cramers_v(mydata$Q2, mydata$Group)
## Cramer's V (adj.) | 95% CI
## --------------------------------
## 0.35 | [0.19, 1.00]
##
## - One-sided CIs: upper bound fixed at [1.00].
H0: There is no association between usage of mobile phone and classification of customers in 3 groups.
H1: There is association between usage of mobile phone and classification of customers in 3 groups.
We can reject H0 at p < 0.001, so there are differences. All expected variables are above 5, so this assumption is met.
table_clusters1 <- table(mydata$Group, mydata$Q2)
prop_table_clusters1 <- prop.table(table_clusters1, margin = 1)
prop_df1 <- as.data.frame(as.table(prop_table_clusters1))
library(ggplot2)
ggplot(prop_df1, aes(x = Var1, y = Freq * 100, fill = Var2)) +
geom_bar(stat = "identity", position = "stack") +
scale_fill_manual(values = c("#84BD00", "#33006F")) + # Custom Colors
labs(
x = "Group",
y = "Percentage (%)",
fill = "Category",
title = "Percentage of Phone Usage by Group"
) +
theme_minimal()
chi_square <- chisq.test(mydata$Q41, as.factor(mydata$Group))
chi_square
##
## Pearson's Chi-squared test
##
## data: mydata$Q41 and as.factor(mydata$Group)
## X-squared = 1.6566, df = 2, p-value = 0.4368
addmargins(chi_square$observed)
## as.factor(mydata$Group)
## mydata$Q41 1 2 3 Sum
## Female 22 26 30 78
## Male 23 19 36 78
## Sum 45 45 66 156
addmargins(round(chi_square$expected, 2))
## as.factor(mydata$Group)
## mydata$Q41 1 2 3 Sum
## Female 22.5 22.5 33 78
## Male 22.5 22.5 33 78
## Sum 45.0 45.0 66 156
round(chi_square$res, 2)
## as.factor(mydata$Group)
## mydata$Q41 1 2 3
## Female -0.11 0.74 -0.52
## Male 0.11 -0.74 0.52
library(effectsize)
effectsize::cramers_v(mydata$Q41, mydata$Group)
## Cramer's V (adj.) | 95% CI
## --------------------------------
## 0.00 | [0.00, 1.00]
##
## - One-sided CIs: upper bound fixed at [1.00].
H0: There is no association between gender and classification of customers in 3 groups.
H1: There is association between gender and classification of customers in 3 groups.
We cannot reject H0 at p = 0.437, so there are no differences.
mydata$Q42 <- as.numeric(as.character(mydata$Q42))
current_year <- as.numeric(format(Sys.Date(), "%Y"))
mydata$Age <- current_year - mydata$Q42
mydata$AgeGroup <- cut(mydata$Age,
breaks = c(18, 40, 60, Inf),
labels = c("18-40", "41-60", "60+"))
chi_square <- chisq.test(mydata$AgeGroup, as.factor(mydata$Group))
chi_square
##
## Pearson's Chi-squared test
##
## data: mydata$AgeGroup and as.factor(mydata$Group)
## X-squared = 49.906, df = 4, p-value = 3.778e-10
addmargins(chi_square$observed)
## as.factor(mydata$Group)
## mydata$AgeGroup 1 2 3 Sum
## 18-40 21 22 3 46
## 41-60 16 18 23 57
## 60+ 7 5 40 52
## Sum 44 45 66 155
addmargins(round(chi_square$expected, 2))
## as.factor(mydata$Group)
## mydata$AgeGroup 1 2 3 Sum
## 18-40 13.06 13.35 19.59 46
## 41-60 16.18 16.55 24.27 57
## 60+ 14.76 15.10 22.14 52
## Sum 44.00 45.00 66.00 155
round(chi_square$res, 2)
## as.factor(mydata$Group)
## mydata$AgeGroup 1 2 3
## 18-40 2.20 2.37 -3.75
## 41-60 -0.04 0.36 -0.26
## 60+ -2.02 -2.60 3.80
library(effectsize)
effectsize::cramers_v(mydata$AgeGroup, mydata$Group)
## Cramer's V (adj.) | 95% CI
## --------------------------------
## 0.39 | [0.27, 1.00]
##
## - One-sided CIs: upper bound fixed at [1.00].
H0: There is no association between age and classification of customers in 3 groups.
H1: There is association between age and classification of customers in 3 groups.
We can reject H0 at p < 0.001, so there are differences.
table_clusters2 <- table(mydata$Group, mydata$AgeGroup)
prop_table_clusters2 <- prop.table(table_clusters2, margin = 1)
prop_df2 <- as.data.frame(as.table(prop_table_clusters2))
library(ggplot2)
ggplot(prop_df2, aes(x = Var1, y = Freq * 100, fill = Var2)) +
geom_bar(stat = "identity", position = "stack") +
scale_fill_manual(values = c("#84BD00", "#33006F", "#FA7800")) + # Custom Colors
labs(
x = "Group",
y = "Percentage (%)",
fill = "Category",
title = "Percentage of Age by Group"
) +
theme_minimal()
mydata[c(67:73)] <- mydata[c(67:73)] %>% mutate_all(as.numeric)
mydata$Household <- case_when(
mydata$Q43a == 1 ~ 1, # If var1 is 1, assign 1
mydata$Q43b == 1 ~ 1,
mydata$Q43c == 1 ~ 1,
mydata$Q43d == 1 ~ 1,
mydata$Q43e == 1 ~ 1,
mydata$Q43f == 1 ~ 1,
mydata$Q43g == 1 ~ 2,
TRUE ~ 0)
mydata$Household <- factor(mydata$Household,
levels = c(1, 2),
labels = c("With somebody", "Living alone"))
chi_square <- chisq.test(mydata$Household, as.factor(mydata$Group))
chi_square
##
## Pearson's Chi-squared test
##
## data: mydata$Household and as.factor(mydata$Group)
## X-squared = 5.3091, df = 2, p-value = 0.07033
addmargins(chi_square$observed)
## as.factor(mydata$Group)
## mydata$Household 1 2 3 Sum
## With somebody 41 33 56 130
## Living alone 4 12 10 26
## Sum 45 45 66 156
addmargins(round(chi_square$expected, 2))
## as.factor(mydata$Group)
## mydata$Household 1 2 3 Sum
## With somebody 37.5 37.5 55 130
## Living alone 7.5 7.5 11 26
## Sum 45.0 45.0 66 156
round(chi_square$res, 2)
## as.factor(mydata$Group)
## mydata$Household 1 2 3
## With somebody 0.57 -0.73 0.13
## Living alone -1.28 1.64 -0.30
library(effectsize)
effectsize::cramers_v(mydata$Household, mydata$Group)
## Cramer's V (adj.) | 95% CI
## --------------------------------
## 0.15 | [0.00, 1.00]
##
## - One-sided CIs: upper bound fixed at [1.00].
H0: There is no association between living situation and classification of customers in 3 groups.
H1: There is association between living situation and classification of customers in 3 groups.
We can reject H0 at p = 0.071, so there are differences.
table_clusters3 <- table(mydata$Group, mydata$Household)
prop_table_clusters3 <- prop.table(table_clusters3, margin = 1)
prop_df3 <- as.data.frame(as.table(prop_table_clusters3))
library(ggplot2)
ggplot(prop_df3, aes(x = Var1, y = Freq * 100, fill = Var2)) +
geom_bar(stat = "identity", position = "stack") +
scale_fill_manual(values = c("#84BD00", "#33006F")) + # Custom Colors
labs(
x = "Group",
y = "Percentage (%)",
fill = "Category",
title = "Percentage of Household by Group"
) +
theme_minimal()
mydata <- mydata %>%
mutate(Q44_numeric = as.integer(Q44))
mydata <- mydata %>%
mutate(Q44_numeric = case_when(
Q44_numeric %in% c(1) ~ (1 + 999) / 2,
Q44_numeric %in% c(2) ~ (1000 + 5000) / 2,
Q44_numeric %in% c(3) ~ (5001 + 20000) / 2,
Q44_numeric %in% c(4) ~ (20001 + 50000) / 2,
Q44_numeric %in% c(5) ~ (50001 + 100000) / 2,
Q44_numeric %in% c(6) ~ (100001 + 285000) / 2,
))
library(car)
## Loading required package: carData
##
## Attaching package: 'car'
## The following object is masked from 'package:dplyr':
##
## recode
leveneTest(mydata$Q44_numeric, as.factor(mydata$Group))
## Levene's Test for Homogeneity of Variance (center = median)
## Df F value Pr(>F)
## group 2 2.8609 0.06028 .
## 153
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
H0: σ2 (habitants, G1) = σ2 (habitants, G2) = σ2 (habitants, G3)
H1: At least one σ2 (habitants, j) is different.
We can reject H0 at p = 0.061, meaning that at least one variance is different from the rest. With this we checked the homogeneity of variances, therefore we can proceed with Welch F-test (if normality is not violated).
library(dplyr)
library(rstatix)
##
## Attaching package: 'rstatix'
## The following objects are masked from 'package:effectsize':
##
## cohens_d, eta_squared
## The following object is masked from 'package:stats':
##
## filter
mydata %>%
group_by(as.factor(mydata$Group)) %>%
shapiro_test(Q44_numeric)
## # A tibble: 3 × 4
## `as.factor(mydata$Group)` variable statistic p
## <fct> <chr> <dbl> <dbl>
## 1 1 Q44_numeric 0.508 4.46e-11
## 2 2 Q44_numeric 0.644 3.40e- 9
## 3 3 Q44_numeric 0.651 2.94e-11
H0: Habitants are normally distributed in G1.
H1: Habitants are not normally distributed in G1.
We reject H0 at p < 0.001.
H0: Habitants are normally distributed in G2.
H1: Habitants are not normally distributed in G2.
We reject H0 at p < 0.001.
H0: Habitants are normally distributed in G3.
H1: Habitants are not normally distributed in G3.
We reject H0 at p < 0.001.
We have to use the Kruskal-Wallis Rank Sum Test.
kruskal.test(Q44_numeric ~ as.factor(Group),
data = mydata)
##
## Kruskal-Wallis rank sum test
##
## data: Q44_numeric by as.factor(Group)
## Kruskal-Wallis chi-squared = 3.8397, df = 2, p-value = 0.1466
H0: Location distribution of habitants are the same for all groups.
H1: Location distribution of habitants are not the same for all groups.
We cannot reject H0 at p = 0.147, therefore at least one location distribution is not different from the other, so the result is not validated.
mydata <- mydata %>%
mutate(Q47_numeric = as.integer(Q47))
mydata <- mydata %>%
mutate(Q47_numeric = case_when(
Q47_numeric %in% c(1) ~ (1 + 999) / 2,
Q47_numeric %in% c(2) ~ (1000 + 1500) / 2,
Q47_numeric %in% c(3) ~ (1501 + 2000) / 2,
Q47_numeric %in% c(4) ~ (2001 + 3000) / 2,
Q47_numeric %in% c(5) ~ (3001 + 5000) / 2,
Q47_numeric %in% c(6) ~ (5001 + 10000) / 2,
Q47_numeric %in% c(7) ~ (10000 + 12000) / 2,
Q47_numeric %in% c(8) ~ NA,
))
library(car)
leveneTest(mydata$Q47_numeric, as.factor(mydata$Group))
## Levene's Test for Homogeneity of Variance (center = median)
## Df F value Pr(>F)
## group 2 3.0541 0.05023 .
## 144
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
H0: σ2 (habitants, G1) = σ2 (habitants, G2) = σ2 (habitants, G3)
H1: At least one σ2 (habitants, j) is different.
We can reject H0 at p = 0.061, meaning that at least one variance is different from the rest. With this we checked the homogeneity of variances, therefore we can proceed with Welch F-test (if normality is not violated).
library(dplyr)
library(rstatix)
mydata %>%
group_by(as.factor(mydata$Group)) %>%
shapiro_test(Q47_numeric)
## # A tibble: 3 × 4
## `as.factor(mydata$Group)` variable statistic p
## <fct> <chr> <dbl> <dbl>
## 1 1 Q47_numeric 0.741 0.000000186
## 2 2 Q47_numeric 0.834 0.0000465
## 3 3 Q47_numeric 0.843 0.00000100
H0: Habitants are normally distributed in G1.
H1: Habitants are not normally distributed in G1.
We reject H0 at p < 0.001.
H0: Habitants are normally distributed in G2.
H1: Habitants are not normally distributed in G2.
We reject H0 at p < 0.001.
H0: Habitants are normally distributed in G3.
H1: Habitants are not normally distributed in G3.
We reject H0 at p < 0.001.
We have to use the Kruskal-Wallis Rank Sum Test.
kruskal.test(Q47_numeric ~ as.factor(Group),
data = mydata)
##
## Kruskal-Wallis rank sum test
##
## data: Q47_numeric by as.factor(Group)
## Kruskal-Wallis chi-squared = 7.6769, df = 2, p-value = 0.02153
H0: Location distribution of income are the same for all groups.
H1: Location distribution of income are not the same for all groups.
We can reject H0 at p = 0.032, therefore at least one location distribution is different from the other, so the result is validated.
table_clusters4 <- table(mydata$Group, mydata$Q47_numeric)
prop_table_clusters4 <- prop.table(table_clusters4, margin = 1)
prop_df4 <- as.data.frame(as.table(prop_table_clusters4))
library(ggplot2)
ggplot(prop_df4, aes(x = Var1, y = Freq * 100, fill = Var2)) +
geom_bar(stat = "identity", position = "stack") +
scale_fill_manual(values = c("#84BD00", "#33006F", "#FA7800", "#0083B8", "#F4A900", "#007C80", "#00A499", "#002147")) + # Custom Colors
labs(
x = "Group",
y = "Percentage (%)",
fill = "Category",
title = "Percentage of Income by Group"
) +
theme_minimal()
chi_square <- chisq.test(mydata$Q3, as.factor(mydata$Group))
chi_square
##
## Pearson's Chi-squared test
##
## data: mydata$Q3 and as.factor(mydata$Group)
## X-squared = 12.145, df = 2, p-value = 0.002305
addmargins(chi_square$observed)
## as.factor(mydata$Group)
## mydata$Q3 1 2 3 Sum
## Yes 38 44 48 130
## No 7 1 18 26
## Sum 45 45 66 156
addmargins(round(chi_square$expected, 2))
## as.factor(mydata$Group)
## mydata$Q3 1 2 3 Sum
## Yes 37.5 37.5 55 130
## No 7.5 7.5 11 26
## Sum 45.0 45.0 66 156
round(chi_square$res, 2)
## as.factor(mydata$Group)
## mydata$Q3 1 2 3
## Yes 0.08 1.06 -0.94
## No -0.18 -2.37 2.11
library(effectsize)
effectsize::cramers_v(mydata$Q3, mydata$Group)
## Cramer's V (adj.) | 95% CI
## --------------------------------
## 0.26 | [0.06, 1.00]
##
## - One-sided CIs: upper bound fixed at [1.00].
H0: There is no association between age and classification of customers in 3 groups.
H1: There is association between age and classification of customers in 3 groups.
We can reject H0 at p = 0.003, so there are differences.
table_clusters5 <- table(mydata$Group, mydata$Q3)
prop_table_clusters5 <- prop.table(table_clusters5, margin = 1)
prop_df5 <- as.data.frame(as.table(prop_table_clusters5))
library(ggplot2)
ggplot(prop_df5, aes(x = Var1, y = Freq * 100, fill = Var2)) +
geom_bar(stat = "identity", position = "stack") +
scale_fill_manual(values = c("#33006F", "#84BD00")) + # Custom Colors
labs(
x = "Group",
y = "Percentage (%)",
fill = "Category",
title = "Percentage of Internet Usage by Group"
) +
theme_minimal()
chi_square <- chisq.test(mydata$Q4, as.factor(mydata$Group))
chi_square
##
## Pearson's Chi-squared test
##
## data: mydata$Q4 and as.factor(mydata$Group)
## X-squared = 4.7944, df = 4, p-value = 0.3091
addmargins(chi_square$observed)
## as.factor(mydata$Group)
## mydata$Q4 1 2 3 Sum
## Yes 28 34 40 102
## No 10 7 11 28
## I don't know 7 4 15 26
## Sum 45 45 66 156
addmargins(round(chi_square$expected, 2))
## as.factor(mydata$Group)
## mydata$Q4 1 2 3 Sum
## Yes 29.42 29.42 43.15 101.99
## No 8.08 8.08 11.85 28.01
## I don't know 7.50 7.50 11.00 26.00
## Sum 45.00 45.00 66.00 156.00
round(chi_square$res, 2)
## as.factor(mydata$Group)
## mydata$Q4 1 2 3
## Yes -0.26 0.84 -0.48
## No 0.68 -0.38 -0.25
## I don't know -0.18 -1.28 1.21
library(effectsize)
effectsize::cramers_v(mydata$Q4, mydata$Group)
## Cramer's V (adj.) | 95% CI
## --------------------------------
## 0.05 | [0.00, 1.00]
##
## - One-sided CIs: upper bound fixed at [1.00].
H0: There is no association between income level and classification of customers in 3 groups.
H1: There is association between income level and classification of customers in 3 groups.
We cannot reject H0 at p = 0.310, so there are no differences.
chi_square <- chisq.test(mydata$Q5, as.factor(mydata$Group))
chi_square
##
## Pearson's Chi-squared test
##
## data: mydata$Q5 and as.factor(mydata$Group)
## X-squared = 5.4441, df = 2, p-value = 0.06574
addmargins(chi_square$observed)
## as.factor(mydata$Group)
## mydata$Q5 1 2 3 Sum
## Yes 10 9 5 24
## No 35 36 61 132
## Sum 45 45 66 156
addmargins(round(chi_square$expected, 2))
## as.factor(mydata$Group)
## mydata$Q5 1 2 3 Sum
## Yes 6.92 6.92 10.15 23.99
## No 38.08 38.08 55.85 132.01
## Sum 45.00 45.00 66.00 156.00
round(chi_square$res, 2)
## as.factor(mydata$Group)
## mydata$Q5 1 2 3
## Yes 1.17 0.79 -1.62
## No -0.50 -0.34 0.69
library(effectsize)
effectsize::cramers_v(mydata$Q5, mydata$Group)
## Cramer's V (adj.) | 95% CI
## --------------------------------
## 0.15 | [0.00, 1.00]
##
## - One-sided CIs: upper bound fixed at [1.00].
H0: There is no association between joint bank account and classification of customers in 3 groups.
H1: There is association between joint bank account and classification of customers in 3 groups.
We can reject H0 at p = 0.066, so there are differences.
table_clusters6 <- table(mydata$Group, mydata$Q5)
prop_table_clusters6 <- prop.table(table_clusters6, margin = 1)
prop_df6 <- as.data.frame(as.table(prop_table_clusters6))
library(ggplot2)
ggplot(prop_df6, aes(x = Var1, y = Freq * 100, fill = Var2)) +
geom_bar(stat = "identity", position = "stack") +
scale_fill_manual(values = c("#33006F", "#84BD00")) + # Custom Colors
labs(
x = "Group",
y = "Percentage (%)",
fill = "Category",
title = "Percentage of Joint Bank Account by Group"
) +
theme_minimal()
chi_square <- chisq.test(mydata$Q7, as.factor(mydata$Group))
chi_square
##
## Pearson's Chi-squared test
##
## data: mydata$Q7 and as.factor(mydata$Group)
## X-squared = 52.459, df = 2, p-value = 4.061e-12
addmargins(chi_square$observed)
## as.factor(mydata$Group)
## mydata$Q7 1 2 3 Sum
## Yes 12 13 57 82
## No 33 32 9 74
## Sum 45 45 66 156
addmargins(round(chi_square$expected, 2))
## as.factor(mydata$Group)
## mydata$Q7 1 2 3 Sum
## Yes 23.65 23.65 34.69 81.99
## No 21.35 21.35 31.31 74.01
## Sum 45.00 45.00 66.00 156.00
round(chi_square$res, 2)
## as.factor(mydata$Group)
## mydata$Q7 1 2 3
## Yes -2.40 -2.19 3.79
## No 2.52 2.31 -3.99
library(effectsize)
effectsize::cramers_v(mydata$Q7, mydata$Group)
## Cramer's V (adj.) | 95% CI
## --------------------------------
## 0.57 | [0.43, 1.00]
##
## - One-sided CIs: upper bound fixed at [1.00].
H0: There is no association between Additional Education and classification of customers in 3 groups.
H1: There is association between Additional Education and classification of customers in 3 groups.
We can reject H0 at p < 0.001, so there are differences.
table_clusters7 <- table(mydata$Group, mydata$Q7)
prop_table_clusters7 <- prop.table(table_clusters7, margin = 1)
prop_df7 <- as.data.frame(as.table(prop_table_clusters7))
library(ggplot2)
ggplot(prop_df7, aes(x = Var1, y = Freq * 100, fill = Var2)) +
geom_bar(stat = "identity", position = "stack") +
scale_fill_manual(values = c("#33006F", "#84BD00")) + # Custom Colors
labs(
x = "Group",
y = "Percentage (%)",
fill = "Category",
title = "Percentage of Additional Education by Group"
) +
theme_minimal()
Our analysis has revealed three distinct customer segments based on five standardized variables (Security Concerns, Lack of Competence or Support, Preference for Traditional Methods, Aversion to Change, and Physical Limitations), each with unique attitudes, behaviors, and barriers toward mobile banking adoption. While some are on the verge of adopting digital banking, others require stronger incentives, and a final group firmly resists change. Interestingly, our research indicates that gender, city size, and whether a family member uses mobile banking are not significant differentiators.
The Practical Observers are the most open-minded of the three clusters—they are comfortable with technology, don not lack confidence or support, do not have strong security fears, and are not opposed to change. However, they have yet to find a compelling reason to adopt mobile banking.
This segment consists of a relatively young and middle-aged audience, with 48% aged 18–40 and 36% aged 41–60, while 16% are over 60. Their high engagement with technology is evident—88% use a smartphone daily, and 85% go online multiple times per week.
Financially, this group is diverse. While 43% earn below €1,000 per month, a significant 32% fall into the €3,001–€5,000 income bracket, and 21% earn between €1,501–€2,000. Additionally, 2% earn between €5,001–€10,000, and 2% exceed €10,000 net per month. This suggests that many in this group have disposable income and could benefit from mobile banking’s convenience and financial tools.
They are also socially active, with 88% living with a partner or family. However, 77% do not have a joint bank account, highlighting a preference for individual financial management. Importantly, 74% believe they don’t need additional education to start using mobile banking, reinforcing the idea that their resistance is more about habit than hesitation.
The Cautious Adopters are one step away from transitioning to mobile banking but need reassurance about security and reliability before making the leap. They are slightly more traditional than Cluster 1, showing a stronger preference for in-person banking and a mildly higher aversion to change. However, they don’t struggle with technology, and like Cluster 1, they have below-average concerns about needing external support.
Their age profile is similar, with 49% aged 18–40 and 40% aged 41–60, but fewer (only 11%) are over 60. 91% use a smartphone daily, and 98% go online frequently, indicating a high digital readiness.
Financially, they are more affluent than Cluster 1, with 32% earning €3,001–€5,000, 23% earning €5,001–€10,000, and 23% between €1,501–€2,000. Only 23% earn under €1,000, making them the strongest potential customer base for premium digital banking services.
Most (73%) live with a partner or family, but like Cluster 1, 80% manage their finances independently without a joint bank account. Importantly, 72% do not believe they need additional education before using mobile banking, suggesting that their hesitation is psychological rather than technical.
The Firm Traditionalists are deeply resistant to mobile banking. Their concerns extend beyond security—they lack confidence in digital tools, strongly prefer in-person traditional banking, and are the only cluster with above average physical limitations – making them the most averse to change.
This segment is significantly older, with 61% aged over 60, 35% aged 41–60, and only 4% aged 18–40. They are the least digitally active, with only 60% using a smartphone daily and 73% accessing the internet at least a few times per week.
While 85% live with family or a partner, 95% handle their finances independently without a joint account, indicating a preference for financial autonomy despite their reliance on traditional banking methods.
Crucially, 87% feel they need additional education before adopting mobile banking, underscoring their lack of digital confidence. Their high level of security fears, preference for in-person banking, and reluctance toward technology make them the most difficult segment to convert.
mydata_PCA <- mydata[c(18:22)]
mydata_PCA2 <- mydata[c(25:34)]
R <- cor(mydata_PCA)
library(psych)
##
## Attaching package: 'psych'
## The following object is masked from 'package:car':
##
## logit
## The following object is masked from 'package:effectsize':
##
## phi
## The following object is masked from 'package:Hmisc':
##
## describe
## The following objects are masked from 'package:ggplot2':
##
## %+%, alpha
corPlot(R)
library(psych)
cortest.bartlett(R, n = nrow(mydata_PCA))
## $chisq
## [1] 214.0942
##
## $p.value
## [1] 1.838694e-40
##
## $df
## [1] 10
library(psych)
KMO(R)
## Kaiser-Meyer-Olkin factor adequacy
## Call: KMO(r = R)
## Overall MSA = 0.78
## MSA for each item =
## Assistance Security Transparency Convinience Speed and Reliability
## 0.79 0.76 0.80 0.78 0.77
library(FactoMineR)
components <- PCA(mydata_PCA,
scale.unit = TRUE,
graph = FALSE)
library(factoextra)
get_eigenvalue(components)
## eigenvalue variance.percent cumulative.variance.percent
## Dim.1 2.7190702 54.381405 54.38140
## Dim.2 0.7872302 15.744603 70.12601
## Dim.3 0.6464620 12.929241 83.05525
## Dim.4 0.4411911 8.823822 91.87907
## Dim.5 0.4060465 8.120929 100.00000
Eigenvalue of first principal component for standardized variables is bigger than 1.
The first 1 principal components explain more than 40% of the data. We measure evaluation which is subjective, so we measure soft data for which the chosen number of components should explain around 40% of the data.
The last chosen principal component captures more than 5% of total variance of original variables (5).
fviz_eig(components,
choice = "eigenvalue",
main = "Screeplot",
ylab = "Eigenvalue",
xlab = "Principal component",
addlabels = TRUE)
When looking at the Scree plot the biggest difference between eigenvalues. This is between 1 and 2, so we should choose 1 principal component.
library(psych)
fa.parallel(mydata_PCA,
sim = FALSE,
fa = "pc")
## Parallel analysis suggests that the number of factors = NA and the number of components = 1
Parallel analysis suggests that we should choose 1 principal component.
Because we need to do the perception map, which has 2 principal components, we will use 2.
library(FactoMineR)
components <- PCA(mydata_PCA,
ncp = 2,
scale.unit = TRUE,
graph = FALSE)
components$var$cor
## Dim.1 Dim.2
## Assistance 0.6588985 0.58938063
## Security 0.7655438 0.33729834
## Transparency 0.7487942 -0.09960031
## Convinience 0.7262470 -0.48912635
## Speed and Reliability 0.7814975 -0.27735471
loadings <- components$var$cor
library(factoextra)
eigenvalue <- get_eigenvalue(components)[1:2,1 ]
coefficient1 <- loadings[1:5]/sqrt(eigenvalue)[1]
coefficient2 <- loadings[6:10]/sqrt(eigenvalue)[2]
library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────────────────────────────────────────────── tidyverse 2.0.0 ──
## ✔ forcats 1.0.0 ✔ readr 2.1.5
## ✔ lubridate 1.9.4 ✔ stringr 1.5.1
## ✔ purrr 1.0.2 ✔ tibble 3.2.1
## ── Conflicts ────────────────────────────────────────────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ psych::%+%() masks ggplot2::%+%()
## ✖ psych::alpha() masks ggplot2::alpha()
## ✖ tidyr::extract() masks magrittr::extract()
## ✖ rstatix::filter() masks dplyr::filter(), stats::filter()
## ✖ dplyr::lag() masks stats::lag()
## ✖ car::recode() masks dplyr::recode()
## ✖ purrr::set_names() masks magrittr::set_names()
## ✖ purrr::some() masks car::some()
## ✖ Hmisc::src() masks dplyr::src()
## ✖ Hmisc::summarize() masks dplyr::summarize()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(psych)
mydata_PCAD <- mydata_PCA2 %>%
pivot_longer(everything(), names_to = "name", values_to = "score") %>%
separate(name, into = c("retailer", "dimension"), sep = "_")%>%
pivot_wider(names_from = retailer, values_from = score, values_fn = mean) %>%
column_to_rownames(var = "dimension")
mydata_PCA_std <- scale(mydata_PCAD)
poslovalnica1 <- sum(mydata_PCA_std[,1]*coefficient1)
mobilna_banka1 <- sum(mydata_PCA_std[,2]*coefficient1)
poslovalnica2 <- sum(mydata_PCA_std[,1]*coefficient2)
mobilna_banka2 <- sum(mydata_PCA_std[,2]*coefficient2)
library(factoextra)
p <- fviz_pca_biplot(components, repel = TRUE, invisible = "ind", col.var = "#33006F")
p +
annotate("point", x = poslovalnica1, y = poslovalnica2, color = "#84BD00", size = 4, shape = 16) +
annotate("text", x = poslovalnica1, y = poslovalnica2, label = "Branch", vjust = -1, color = "#84BD00") +
annotate("point", x = mobilna_banka1, y = mobilna_banka2, color = "#FA7800", size = 4, shape = 16) +
annotate("text", x = mobilna_banka1, y = mobilna_banka2, label = "Mobile bank", vjust = -1, color = "#FA7800")
Principal component analysis was performed on 5 standardized variables (n = 157). The KMO measure confirms the appropriateness of the variables, KMO = 0.78, although the data falls into the category “Middling”. The MSA statistics for the individual variables are above 0.50 for all variables. Based on the component’s loadings, we conclude that PC1 (𝜆1 = 2.72) represents quality, while PC2 (𝜆2 = 0.79) represents the contrast between security&customer support and service efficiency&transparency.
The questions from the survey used for this test:
Q11: Kako zaznavate zagotavljanje varnosti in zaščito vaših finančnih podatkov ter transakcij v poslovalnici in mobilni aplikaciji? [1 - Sploh se ne strinjam, 2 - Se ne strinjam, 3 - Niti niti, 4 - Se strinjam, 5 - Popolnoma se strinjam]
mydata$Branch_Security <- as.numeric(as.character(mydata$Branch_Security))
shapiro.test(mydata$Branch_Security)
##
## Shapiro-Wilk normality test
##
## data: mydata$Branch_Security
## W = 0.8418, p-value = 9.859e-12
H0: Distribution of Branch_Security is normal.
H1: Distribution of Branch_Security is not normal.
We can reject H0 at p < 0.001, so we can proceed with the alternative non-parametric test - Wilcoxon Signed Rank Test.
summary(mydata$Branch_Security)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 2.000 4.000 4.000 3.987 5.000 5.000
wilcox.test(mydata$Branch_Security,
mu = 4,
correct = FALSE)
##
## Wilcoxon signed rank test
##
## data: mydata$Branch_Security
## V = 1655.5, p-value = 0.8143
## alternative hypothesis: true location is not equal to 4
H0: Me = 4
H1: Me ≠ 4
We cannot reject H0 at p = 0.815.
mydata$`Mobile bank_Security` <- as.numeric(as.character(mydata$`Mobile bank_Security`))
shapiro.test(mydata$`Mobile bank_Security`)
##
## Shapiro-Wilk normality test
##
## data: mydata$`Mobile bank_Security`
## W = 0.90602, p-value = 1.652e-08
H0: Distribution of Mobile bank_Security is normal.
H1: Distribution of Mobile bank_Security is not normal.
We can reject H0 at p < 0.001, so we can proceed with the alternative non-parametric test - Wilcoxon Signed Rank Test.
summary(mydata$`Mobile bank_Security`)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 1.000 2.000 3.000 3.032 4.000 5.000
wilcox.test(mydata$`Mobile bank_Security`,
mu = 3,
correct = FALSE)
##
## Wilcoxon signed rank test
##
## data: mydata$`Mobile bank_Security`
## V = 2220, p-value = 0.7385
## alternative hypothesis: true location is not equal to 3
H0: Me = 3
H1: Me ≠ 3
We cannot reject H0 at p = 0.739.
mydata$Difference1 <- mydata$Branch_Security - mydata$`Mobile bank_Security`
shapiro.test(mydata$Difference1)
##
## Shapiro-Wilk normality test
##
## data: mydata$Difference1
## W = 0.91546, p-value = 6.321e-08
H0: Distribution of Difference 1 is normal.
H1: Distribution of Difference 1 is not normal.
We can reject H0 at p < 0.001, so we can proceed with the alternative non-parametric test - Wilcoxon Signed Rank Test.
wilcox.test(mydata$Branch_Security, mydata$`Mobile bank_Security`,
paired = TRUE,
correct = FALSE,
exact = FALSE,
alternative = "two.sided")
##
## Wilcoxon signed rank test
##
## data: mydata$Branch_Security and mydata$`Mobile bank_Security`
## V = 5276.5, p-value = 3.499e-14
## alternative hypothesis: true location shift is not equal to 0
H0: Distribution location of Security in branches is equal to the distribution location of Security in mobile banks.
H0: Distribution location of Security in branches is not equal to the distribution location of Security in mobile banks.
We can reject H0 at p < 0.001.
cor.test(mydata$Branch_Security, mydata$`Mobile bank_Security`, method = "pearson")
##
## Pearson's product-moment correlation
##
## data: mydata$Branch_Security and mydata$`Mobile bank_Security`
## t = -0.1906, df = 155, p-value = 0.8491
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
## -0.1715345 0.1416702
## sample estimates:
## cor
## -0.01530765
H0: ρ (Branch_Security, Mobile bank_Security) = 0
H1: ρ (Branch_Security, Mobile bank_Security) ≠ 0
We cannot reject H0 at p = 0.850.
The results provide partial support for H1, as perceptions of security in bank branches and mobile banking are significantly different. Customers do not perceive a significant deviation from expected security levels in bank branches (p = 0.815) or mobile banking (p = 0.739), indicating that both meet expectations in terms of security. However, the Wilcoxon Signed-Rank Test shows a significant difference in security perceptions between branches and mobile banking (p < 0.001), meaning that customers do not rate their security equally in both settings. Despite this difference, the correlation test between branch and mobile banking security perceptions is not significant (p = 0.850), suggesting that trust in bank security does not directly translate into trust in mobile banking security. Thus, H1 is only partially supported—while security perceptions between branches and mobile banking differ, customers who trust their bank do not necessarily perceive mobile banking as more secure. This highlights that building trust in mobile banking security requires independent efforts beyond general bank security trust.
The questions from the survey used for this test:
Q12: Kako zaznavate transparentnost delovanja banke in njenih storitev v poslovalnici in mobilni aplikaciji? [1 - Sploh se ne strinjam, 2 - Se ne strinjam, 3 - Niti niti, 4 - Se strinjam, 5 - Popolnoma se strinjam]
mydata$Branch_Transparency <- as.numeric(as.character(mydata$Branch_Transparency))
shapiro.test(mydata$Branch_Transparency)
##
## Shapiro-Wilk normality test
##
## data: mydata$Branch_Transparency
## W = 0.8319, p-value = 3.798e-12
H0: Distribution of Branch_Transparency is normal.
H1: Distribution of Branch_Transparency is not normal.
We can reject H0 at p < 0.001, so we can proceed with the alternative non-parametric test - Wilcoxon Signed Rank Test.
summary(mydata$Branch_Transparency)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 1.000 3.000 4.000 3.904 4.000 5.000
wilcox.test(mydata$Branch_Transparency,
mu = 3,
correct = FALSE)
##
## Wilcoxon signed rank test
##
## data: mydata$Branch_Transparency
## V = 7020.5, p-value < 2.2e-16
## alternative hypothesis: true location is not equal to 3
H0: Me = 3
H1: Me ≠ 3
We can reject H0 at p < 0.001.
mydata$`Mobile bank_Transparency` <- as.numeric(as.character(mydata$`Mobile bank_Transparency`))
shapiro.test(mydata$`Mobile bank_Transparency`)
##
## Shapiro-Wilk normality test
##
## data: mydata$`Mobile bank_Transparency`
## W = 0.88932, p-value = 1.859e-09
H0: Distribution of Branch_Transparency is normal.
H1: Distribution of Branch_Transparency is not normal.
We can reject H0 at p < 0.001, so we can proceed with the alternative non-parametric test - Wilcoxon Signed Rank Test.
summary(mydata$`Mobile bank_Transparency`)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 1.000 3.000 3.000 3.256 4.000 5.000
wilcox.test(mydata$`Mobile bank_Transparency`,
mu = 3,
correct = FALSE)
##
## Wilcoxon signed rank test
##
## data: mydata$`Mobile bank_Transparency`
## V = 2972, p-value = 0.0004326
## alternative hypothesis: true location is not equal to 3
H0: Me = 4
H1: Me ≠ 4
We can reject H0 at p < 0.001.
mydata$Difference2 <- mydata$Branch_Transparency - mydata$`Mobile bank_Transparency`
shapiro.test(mydata$Difference2)
##
## Shapiro-Wilk normality test
##
## data: mydata$Difference2
## W = 0.93147, p-value = 7.665e-07
H0: Distribution of Difference 2 is normal.
H1: Distribution of Difference 2 is not normal.
We can reject H0 at p < 0.001, so we can proceed with the alternative non-parametric test - Wilcoxon Signed Rank Test.
wilcox.test(mydata$Branch_Transparency, mydata$`Mobile bank_Transparency`,
paired = TRUE,
correct = FALSE,
exact = FALSE,
alternative = "two.sided")
##
## Wilcoxon signed rank test
##
## data: mydata$Branch_Transparency and mydata$`Mobile bank_Transparency`
## V = 4145.5, p-value = 2.216e-09
## alternative hypothesis: true location shift is not equal to 0
H0: Distribution location of Transparency in branches is equal to the distribution location of Transparency in mobile banks.
H0: Distribution location of Transparency in branches is not equal to the distribution location of Transparency in mobile banks.
We can reject H0 at p < 0.001.
cor.test(mydata$Branch_Transparency, mydata$`Mobile bank_Transparency`, method = "pearson")
##
## Pearson's product-moment correlation
##
## data: mydata$Branch_Transparency and mydata$`Mobile bank_Transparency`
## t = -0.74959, df = 155, p-value = 0.4546
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
## -0.21471620 0.09745621
## sample estimates:
## cor
## -0.06009937
H0: ρ (Branch_Transparency, Mobile bank_Transparency) = 0
H1: ρ (Branch_Transparency, Mobile bank_Transparency) ≠ 0
We cannot reject H0 at p = 0.455.
The results provide partial support for H2, as transparency in privacy policies and guarantees appears to influence customer trust in mobile banking, but not in a straightforward way. Customers perceive bank branches as meeting transparency expectations, whereas mobile banking transparency significantly deviates from expectations (p < 0.001), indicating varied trust levels. Additionally, perceptions of transparency in branches and mobile banking are significantly different (p < 0.001), suggesting that customers evaluate them separately. However, there is no significant correlation between the two transparency perceptions (p = 0.455), meaning that trust in a bank’s physical transparency does not automatically extend to mobile banking. Thus, while transparency matters for trust, it must be built independently for mobile banking, as it is not inherently inherited from branch transparency. H2 is only partially supported, as transparency influences trust in mobile banking, but trust in branch transparency does not predict trust in mobile banking transparency.
The questions from the survey used for this test:
Q6: Zakaj ne uporabljate mobilne banke (npr. KlikIn, mBank@Net, Addiko Mobil, …)? [1 - Sploh se ne strinjam, 2 - Se ne strinjam, 3 - Niti se ne strinjam niti se strinjam, 4 - Se strinjam, 5 - Popolnoma se strinjam]
library(rstatix)
library(tidyverse)
mydata_H3 <- mydata %>% select(c("ID", "Security_concerns", "Lack_of_competence_or_support", "Preference_for_traditional_methods", "Aversion_to_change", "Physical_limitations"))
#Wide format of data is changed to long format.
mydata_long <- mydata_H3 %>%
pivot_longer(
cols = c("Security_concerns", "Lack_of_competence_or_support", "Preference_for_traditional_methods", "Aversion_to_change", "Physical_limitations"),
names_to = "Concerns",
values_to = "Agreeableness") %>%
convert_as_factor(Concerns)
mydata_long <- as.data.frame(mydata_long)
tail(mydata_long, 10)
## ID Concerns Agreeableness
## 776 156 Security_concerns 3.666667
## 777 156 Lack_of_competence_or_support 2.000000
## 778 156 Preference_for_traditional_methods 3.000000
## 779 156 Aversion_to_change 3.000000
## 780 156 Physical_limitations 4.000000
## 781 157 Security_concerns 4.000000
## 782 157 Lack_of_competence_or_support 2.500000
## 783 157 Preference_for_traditional_methods 4.500000
## 784 157 Aversion_to_change 4.000000
## 785 157 Physical_limitations 2.000000
library(ggpubr)
#Boxplot for each variable.
ggboxplot(mydata_long,
x = "Concerns",
y = "Agreeableness",
add = "jitter") +
scale_x_discrete(labels = c(
"Aversion_to_change" = "Aversion",
"Lack_of_competence_or_support" = "Lack",
"Physical_limitations" = "Phys. Limitations",
"Preference_for_traditional_methods" = "Traditional",
"Security_concerns" = "Security"
))
library(tidyverse)
library(ggpubr)
library(rstatix)
#Finding outliers.
mydata_long %>%
group_by(Concerns) %>%
identify_outliers(Agreeableness)
## # A tibble: 9 × 5
## Concerns ID Agreeableness is.outlier is.extreme
## <fct> <int> <dbl> <lgl> <lgl>
## 1 Aversion_to_change 27 1 TRUE FALSE
## 2 Aversion_to_change 71 1 TRUE FALSE
## 3 Aversion_to_change 75 1 TRUE FALSE
## 4 Aversion_to_change 80 1 TRUE FALSE
## 5 Aversion_to_change 85 1 TRUE FALSE
## 6 Aversion_to_change 98 1 TRUE FALSE
## 7 Aversion_to_change 121 1 TRUE FALSE
## 8 Aversion_to_change 122 1 TRUE FALSE
## 9 Security_concerns 1 1 TRUE FALSE
#Removing outliers.
mydata_long <- mydata_long %>%
filter(!ID == 27 & !ID == 71 & !ID == 75 & !ID == 80 & !ID == 85 & !ID == 98 & !ID == 121 & !ID == 122 & !ID == 1)
mydata_long %>%
group_by(Concerns) %>%
identify_outliers(Agreeableness)
## # A tibble: 1 × 5
## Concerns ID Agreeableness is.outlier is.extreme
## <fct> <int> <dbl> <lgl> <lgl>
## 1 Security_concerns 95 1.33 TRUE FALSE
#Removing outliers.
mydata_long <- mydata_long %>%
filter(!ID == 95)
mydata_long %>%
group_by(Concerns) %>%
identify_outliers(Agreeableness)
## [1] Concerns ID Agreeableness is.outlier is.extreme
## <0 rows> (or 0-length row.names)
library(rstatix)
#Checking normality with Shapiro-Wilk test
mydata_long %>%
group_by(Concerns) %>%
shapiro_test(Agreeableness)
## # A tibble: 5 × 4
## Concerns variable statistic p
## <fct> <chr> <dbl> <dbl>
## 1 Aversion_to_change Agreeableness 0.864 2.68e-10
## 2 Lack_of_competence_or_support Agreeableness 0.937 4.01e- 6
## 3 Physical_limitations Agreeableness 0.883 2.15e- 9
## 4 Preference_for_traditional_methods Agreeableness 0.917 1.75e- 7
## 5 Security_concerns Agreeableness 0.948 2.93e- 5
H0: Agreeableness is normally distributed in aversion to change.
H1: Agreeableness is not normally distributed in aversion to change.
We can reject H0 for all 5 variables, therefore we use Friedman ANOVA.
library(rstatix)
#Friedman ANOVA.
FriedmanANOVA <- friedman_test(Agreeableness ~ Concerns | ID,
data = mydata_long)
FriedmanANOVA #Summary of results.
## # A tibble: 1 × 6
## .y. n statistic df p method
## * <chr> <int> <dbl> <dbl> <dbl> <chr>
## 1 Agreeableness 147 76.6 4 8.94e-16 Friedman test
H0: Location distribution of agreeableness is the same in all five variables.
H1: At least one location distribution is different.
We can reject H0 at p < 0.001.
library(effectsize)
effectsize::kendalls_w(Agreeableness ~ Concerns | ID,
data = mydata_long)
## Warning: 124 block(s) contain ties, some containing only 1 unique ranking.
## Kendall's W | 95% CI
## --------------------------
## 0.13 | [0.09, 1.00]
##
## - One-sided CIs: upper bound fixed at [1.00].
interpret_kendalls_w(0.13)
## [1] "slight agreement"
## (Rules: landis1977)
library(rstatix)
#Wilcoxon signed rank tests - comparing all possible paires.
paires_nonpar <- wilcox_test(Agreeableness ~ Concerns,
paired = TRUE,
p.adjust.method = "bonferroni",
data = mydata_long)
paires_nonpar
## # A tibble: 10 × 9
## .y. group1 group2 n1 n2 statistic p p.adj p.adj.signif
## * <chr> <chr> <chr> <int> <int> <dbl> <dbl> <dbl> <chr>
## 1 Agreeableness Aversion_to_change Lack_of_compet… 147 147 4208 1 e- 3 1.1 e- 2 *
## 2 Agreeableness Aversion_to_change Physical_limit… 147 147 5322. 1.51e- 8 1.51e- 7 ****
## 3 Agreeableness Aversion_to_change Preference_for… 147 147 2058. 1.3 e- 2 1.29e- 1 ns
## 4 Agreeableness Aversion_to_change Security_conce… 147 147 2812. 8.1 e- 2 8.09e- 1 ns
## 5 Agreeableness Lack_of_competence_or_support Physical_limit… 147 147 3584. 2.5 e- 4 2 e- 3 **
## 6 Agreeableness Lack_of_competence_or_support Preference_for… 147 147 1183 3.35e- 8 3.35e- 7 ****
## 7 Agreeableness Lack_of_competence_or_support Security_conce… 147 147 1778. 2.76e- 7 2.76e- 6 ****
## 8 Agreeableness Physical_limitations Preference_for… 147 147 924 3.24e-12 3.24e-11 ****
## 9 Agreeableness Physical_limitations Security_conce… 147 147 1402. 1.88e-11 1.88e-10 ****
## 10 Agreeableness Preference_for_traditional_methods Security_conce… 147 147 4190. 3.42e- 1 1 e+ 0 ns
library(rstatix)
comparisons <- paires_nonpar %>%
add_y_position(fun = "median", step.increase = 0.35)
library(ggpubr)
ggboxplot(mydata_long, x = "Concerns", y = "Agreeableness", add = "point", ylim=c(0, 18)) +
stat_pvalue_manual(comparisons, hide.ns = FALSE) +
stat_summary(fun = median, geom = "point", shape = 16, size = 4,
aes(group = Concerns), color = "#84BD00",
position = position_dodge(width = 0.8)) +
stat_summary(fun = median, colour = "#33006F",
position = position_dodge(width = 0.8),
geom = "text", vjust = 0.5, hjust = -8,
aes(label = round(after_stat(y), digits = 2), group = Concerns)) +
labs(subtitle = get_test_label(FriedmanANOVA, detailed = TRUE),
caption = get_pwc_label(comparisons)) +
scale_x_discrete(labels = c(
"Aversion_to_change" = "Aversion",
"Lack_of_competence_or_support" = "Lack",
"Physical_limitations" = "Phys. Limitations",
"Preference_for_traditional_methods" = "Traditional",
"Security_concerns" = "Security"
))
The results from the Friedman ANOVA test indicate a significant difference among the various concerns non-users have regarding mobile banking (p < 0.001). This suggests that at least one concern stands out more than the others. The effect size analysis using Kendall’s W suggests a small to moderate effect size (W = 0.13), meaning the differences, while significant, are not overwhelmingly large.
To further explore these differences, we conducted Wilcoxon signed-rank tests for pairwise comparisons, adjusted with the Bonferroni correction. The results show that security concerns consistently received significantly higher ratings compared to other barriers, such as lack of competence, preference for traditional methods, and aversion to change. The median values confirm that concerns about security (e.g., fear of account breaches) were among the most highly rated trust-related barriers in comparison to other concerns.
The results partially support the hypothesis that security concerns are a major trust-related barrier for non-users. However, it is not the only significant concern - the preference for traditional banking is equally important in influencing trust-related barriers to mobile banking adoption. While security concerns are highly rated, they are not uniquely the most significant barrier. Therefore, H3 is partially supported but should also consider traditional banking preferences as a key factor.
The questions from the survey used for this test:
Q18: Kateri finančni spodbudni ukrepi bi vas spodbudili k uporabi mobilnega bančništva? [1 - Sploh me ne bi spodbudilo, 2 - Ne bi me spodbudilo, 3 - Niti niti, 4 - Bi me spodbudilo, 5 - Zelo bi me spodbudilo]
mydata <- mydata %>%
rename(
`Cashback` = "Q18a",
`Exemption from service fees` = "Q18b",
`Lower fees than trad.` = "Q18c",
)
library(rstatix)
library(tidyverse)
mydata_H4 <- mydata %>% select(c(`Cashback`, `Exemption from service fees`, `Lower fees than trad.`))
# Add an ID column starting from 1
mydata_H4 <- mydata_H4 %>%
mutate(ID = row_number())
#Wide format of data is changed to long format.
mydata_long_2 <- mydata_H4 %>%
pivot_longer(
cols = c(`Cashback`, `Exemption from service fees`, `Lower fees than trad.`),
names_to = "Incentive_Type",
values_to = "Agreeableness")%>%
convert_as_factor(Incentive_Type)
mydata_long_2 <- as.data.frame(mydata_long_2)
tail(mydata_long_2, 10)
## ID Incentive_Type Agreeableness
## 462 154 Lower fees than trad. 4
## 463 155 Cashback 4
## 464 155 Exemption from service fees 4
## 465 155 Lower fees than trad. 3
## 466 156 Cashback 5
## 467 156 Exemption from service fees 4
## 468 156 Lower fees than trad. 4
## 469 157 Cashback 4
## 470 157 Exemption from service fees 3
## 471 157 Lower fees than trad. 4
library(ggpubr)
#Boxplot for each variable.
ggboxplot(mydata_long_2,
x = "Incentive_Type",
y = "Agreeableness",
add = "jitter")
### Repeated measures ANOVA
library(tidyverse)
library(ggpubr)
library(rstatix)
#Finding outliers.
mydata_long_2 %>%
group_by("Incentive_Type") %>%
identify_outliers("Agreeableness")
## # A tibble: 22 × 6
## `"Incentive_Type"` ID Incentive_Type Agreeableness is.outlier is.extreme
## <chr> <int> <fct> <dbl> <lgl> <lgl>
## 1 Incentive_Type 5 Cashback 1 TRUE FALSE
## 2 Incentive_Type 17 Lower fees than trad. 1 TRUE FALSE
## 3 Incentive_Type 18 Cashback 1 TRUE FALSE
## 4 Incentive_Type 21 Cashback 1 TRUE FALSE
## 5 Incentive_Type 38 Cashback 1 TRUE FALSE
## 6 Incentive_Type 79 Cashback 1 TRUE FALSE
## 7 Incentive_Type 79 Exemption from service fees 1 TRUE FALSE
## 8 Incentive_Type 79 Lower fees than trad. 1 TRUE FALSE
## 9 Incentive_Type 81 Cashback 1 TRUE FALSE
## 10 Incentive_Type 81 Exemption from service fees 1 TRUE FALSE
## # ℹ 12 more rows
library(dplyr)
mydata_long_2 <- mydata_long_2 %>%
filter(ID != 5 & ID != 17 & ID != 18 & ID != 21 & ID != 38 & ID != 79 & ID != 81 & ID != 82 & ID != 94 & ID != 96 & ID != 129 & ID != 143 & ID != 145 & ID != 149)
#Finding outliers.
mydata_long_2 %>%
group_by("Incentive_Type") %>%
identify_outliers("Agreeableness")
## [1] "Incentive_Type" ID Incentive_Type Agreeableness is.outlier is.extreme
## <0 rows> (or 0-length row.names)
library(rstatix)
#Checking normality with Shapiro-Wilk test
mydata_long_2 %>%
group_by(Incentive_Type) %>%
shapiro_test(Agreeableness)
## # A tibble: 3 × 4
## Incentive_Type variable statistic p
## <fct> <chr> <dbl> <dbl>
## 1 Cashback Agreeableness 0.869 6.84e-10
## 2 Exemption from service fees Agreeableness 0.858 2.10e-10
## 3 Lower fees than trad. Agreeableness 0.841 3.77e-11
H0: Agreeableness is normally distributed in Cashback.
H1: Agreeableness is not normally distributed in Cashback.
We can reject H0 for all 3 variables, therefore we use Friedman ANOVA.
library(rstatix)
#Friedman ANOVA.
FriedmanANOVA_2 <- friedman_test(Agreeableness ~ Incentive_Type | ID,
data = mydata_long_2)
FriedmanANOVA_2 #Summary of results.
## # A tibble: 1 × 6
## .y. n statistic df p method
## * <chr> <int> <dbl> <dbl> <dbl> <chr>
## 1 Agreeableness 143 17.4 2 0.000170 Friedman test
H0: Location distribution of agreeableness is the same in all three variables.
H1: At least one location distribution is different.
We can reject H0 at p < 0.001.
library(effectsize)
effectsize::kendalls_w(Agreeableness ~ Incentive_Type | ID,
data = mydata_long_2)
## Warning: 130 block(s) contain ties, some containing only 1 unique ranking.
## Kendall's W | 95% CI
## --------------------------
## 0.06 | [0.02, 1.00]
##
## - One-sided CIs: upper bound fixed at [1.00].
interpret_kendalls_w(0.06)
## [1] "slight agreement"
## (Rules: landis1977)
library(rstatix)
#Wilcoxon signed rank tests - comparing all possible paires.
paires_nonpar_2 <- wilcox_test(Agreeableness ~ Incentive_Type,
paired = TRUE,
p.adjust.method = "bonferroni",
data = mydata_long_2)
paires_nonpar_2
## # A tibble: 3 × 9
## .y. group1 group2 n1 n2 statistic p p.adj p.adj.signif
## * <chr> <chr> <chr> <int> <int> <dbl> <dbl> <dbl> <chr>
## 1 Agreeableness Cashback Exemption from service fees 143 143 694. 1 e-3 0.003 **
## 2 Agreeableness Cashback Lower fees than trad. 143 143 485 4.95e-4 0.001 **
## 3 Agreeableness Exemption from service fees Lower fees than trad. 143 143 454 8.09e-1 1 ns
library(rstatix)
comparisons_2 <- paires_nonpar_2 %>%
add_y_position(fun = "median", step.increase = 0.35)
library(ggpubr)
ggboxplot(mydata_long_2, x = "Incentive_Type", y = "Agreeableness", add = "point", ylim=c(0, 18)) +
stat_pvalue_manual(comparisons_2, hide.ns = FALSE) +
stat_summary(fun = median, geom = "point", shape = 16, size = 4,
aes(group = Incentive_Type), color = "#84BD00",
position = position_dodge(width = 0.8)) +
stat_summary(fun = median, colour = "#33006F",
position = position_dodge(width = 0.8),
geom = "text", vjust = 0.5, hjust = -8,
aes(label = round(after_stat(y), digits = 2), group = Incentive_Type)) +
labs(subtitle = get_test_label(FriedmanANOVA_2, detailed = TRUE),
caption = get_pwc_label(comparisons_2))
The Friedman ANOVA test indicates a significant difference in how respondents perceive various financial incentives for mobile banking adoption (p < 0.001). This suggests that some financial incentives are more effective motivators than others. The effect size (Kendall’s W = 0.06) suggests a relatively small effect, meaning that while the differences are statistically significant, they may not be extremely strong in practical terms.
The Wilcoxon signed-rank tests for pairwise comparisons show that lower transaction fees (“Nižje provizije”) received significantly higher ratings than other incentives, such as cashback and fee waivers. However, the difference between cashback and fee waivers was not statistically significant, as indicated by the non-significant p-value (ns) in the pairwise tests.
The results partially support the hypothesis that financial incentives encourage mobile banking adoption, but not all financial incentives are equally effective. Cashback appears to be a stronger motivator than fee exemptions, while the difference between lower fees and exemptions is negligible. This suggests that banks should focus more on cashback rewards rather than just reducing or waiving service fees to enhance mobile banking adoption.
The questions from the survey used for this test:
Q19: Kateri nefinančni spodbudni ukrepi bi vas spodbudili k uporabi mobilnega bančništva? [1 - Sploh me ne bi spodbudilo, 2 - Ne bi me spodbudilo, 3 - Niti niti, 4 - Bi me spodbudilo, 5 - Zelo bi me spodbudilo]
mydata <- mydata %>%
rename(
`Guides` = "Q19a",
`24/7 Support` = "Q19b",
`Personalization` = "Q19c",
`Banker assistance` = "Q19d",
`Video&FAQ` = "Q19e",
)
library(rstatix)
library(tidyverse)
mydata_H5 <- mydata %>% select(c(`Guides`, `24/7 Support`, `Personalization`, `Banker assistance`, `Video&FAQ`))
# Add an ID column starting from 1
mydata_H5 <- mydata_H5 %>%
mutate(ID = row_number())
#Wide format of data is changed to long format.
mydata_long_3 <- mydata_H5 %>%
pivot_longer(
cols = c(`Guides`, `24/7 Support`, `Personalization`, `Banker assistance`, `Video&FAQ`),
names_to = "Incentive_Type",
values_to = "Agreeableness")%>%
convert_as_factor(Incentive_Type)
mydata_long_3 <- as.data.frame(mydata_long_3)
tail(mydata_long_3, 10)
## ID Incentive_Type Agreeableness
## 776 156 Guides 3
## 777 156 24/7 Support 4
## 778 156 Personalization 5
## 779 156 Banker assistance 5
## 780 156 Video&FAQ 4
## 781 157 Guides 3
## 782 157 24/7 Support 4
## 783 157 Personalization 4
## 784 157 Banker assistance 4
## 785 157 Video&FAQ 3
library(ggpubr)
#Boxplot for each variable.
ggboxplot(mydata_long_3,
x = "Incentive_Type",
y = "Agreeableness",
add = "jitter")
### Repeated measures ANOVA
library(tidyverse)
library(ggpubr)
library(rstatix)
#Finding outliers.
mydata_long_3 %>%
group_by("Incentive_Type") %>%
identify_outliers("Agreeableness")
## # A tibble: 56 × 6
## `"Incentive_Type"` ID Incentive_Type Agreeableness is.outlier is.extreme
## <chr> <int> <fct> <dbl> <lgl> <lgl>
## 1 Incentive_Type 5 Personalization 1 TRUE FALSE
## 2 Incentive_Type 6 Guides 1 TRUE FALSE
## 3 Incentive_Type 11 Guides 1 TRUE FALSE
## 4 Incentive_Type 26 Personalization 1 TRUE FALSE
## 5 Incentive_Type 31 Guides 1 TRUE FALSE
## 6 Incentive_Type 31 24/7 Support 1 TRUE FALSE
## 7 Incentive_Type 31 Personalization 1 TRUE FALSE
## 8 Incentive_Type 31 Banker assistance 1 TRUE FALSE
## 9 Incentive_Type 31 Video&FAQ 1 TRUE FALSE
## 10 Incentive_Type 60 Video&FAQ 1 TRUE FALSE
## # ℹ 46 more rows
library(dplyr)
mydata_long_3 <- mydata_long_3 %>%
filter(ID != 5 & ID != 6 & ID != 11 & ID != 26 & ID != 31 & ID != 60 & ID != 66 & ID != 68 & ID != 69 & ID != 70 & ID != 71 & ID != 75 & ID != 79 & ID != 81 & ID != 82 & ID != 85 & ID != 89 & ID != 90 & ID != 94 & ID != 96 & ID != 101 & ID != 143 & ID != 149)
#Finding outliers.
mydata_long_3 %>%
group_by("Incentive_Type") %>%
identify_outliers("Agreeableness")
## [1] "Incentive_Type" ID Incentive_Type Agreeableness is.outlier is.extreme
## <0 rows> (or 0-length row.names)
library(rstatix)
#Checking normality with Shapiro-Wilk test
mydata_long_3 %>%
group_by(Incentive_Type) %>%
shapiro_test(Agreeableness)
## # A tibble: 5 × 4
## Incentive_Type variable statistic p
## <fct> <chr> <dbl> <dbl>
## 1 24/7 Support Agreeableness 0.840 8.89e-11
## 2 Banker assistance Agreeableness 0.870 1.74e- 9
## 3 Guides Agreeableness 0.863 8.43e-10
## 4 Personalization Agreeableness 0.861 6.85e-10
## 5 Video&FAQ Agreeableness 0.867 1.36e- 9
H0: Agreeableness is normally distributed in 24/7 Support.
H1: Agreeableness is not normally distributed in 24/7 Support.
We can reject H0 for all 5 variables, therefore we use Friedman ANOVA.
library(rstatix)
#Friedman ANOVA.
FriedmanANOVA_3 <- friedman_test(Agreeableness ~ Incentive_Type | ID,
data = mydata_long_3)
FriedmanANOVA_3 #Summary of results.
## # A tibble: 1 × 6
## .y. n statistic df p method
## * <chr> <int> <dbl> <dbl> <dbl> <chr>
## 1 Agreeableness 134 46.9 4 0.00000000163 Friedman test
H0: Location distribution of agreeableness is the same in all three variables.
H1: At least one location distribution is different.
We can reject H0 at p < 0.001.
library(effectsize)
effectsize::kendalls_w(Agreeableness ~ Incentive_Type | ID,
data = mydata_long_3)
## Warning: 133 block(s) contain ties, some containing only 1 unique ranking.
## Kendall's W | 95% CI
## --------------------------
## 0.09 | [0.05, 1.00]
##
## - One-sided CIs: upper bound fixed at [1.00].
interpret_kendalls_w(0.09)
## [1] "slight agreement"
## (Rules: landis1977)
library(rstatix)
#Wilcoxon signed rank tests - comparing all possible paires.
paires_nonpar_3 <- wilcox_test(Agreeableness ~ Incentive_Type,
paired = TRUE,
p.adjust.method = "bonferroni",
data = mydata_long_3)
paires_nonpar_3
## # A tibble: 10 × 9
## .y. group1 group2 n1 n2 statistic p p.adj p.adj.signif
## * <chr> <chr> <chr> <int> <int> <dbl> <dbl> <dbl> <chr>
## 1 Agreeableness 24/7 Support Banker assistance 134 134 1100 0.487 1 ns
## 2 Agreeableness 24/7 Support Guides 134 134 1410. 0.0000217 0.000217 ***
## 3 Agreeableness 24/7 Support Personalization 134 134 1128. 0.054 0.538 ns
## 4 Agreeableness 24/7 Support Video&FAQ 134 134 2234 0.000000372 0.00000372 ****
## 5 Agreeableness Banker assistance Guides 134 134 2232 0.0000917 0.000917 ***
## 6 Agreeableness Banker assistance Personalization 134 134 1554 0.013 0.127 ns
## 7 Agreeableness Banker assistance Video&FAQ 134 134 2648. 0.000000329 0.00000329 ****
## 8 Agreeableness Guides Personalization 134 134 1017 0.08 0.801 ns
## 9 Agreeableness Guides Video&FAQ 134 134 1758. 0.062 0.616 ns
## 10 Agreeableness Personalization Video&FAQ 134 134 1989 0.00067 0.007 **
library(rstatix)
comparisons_3 <- paires_nonpar_3 %>%
add_y_position(fun = "median", step.increase = 0.35)
library(ggpubr)
ggboxplot(mydata_long_3, x = "Incentive_Type", y = "Agreeableness", add = "point", ylim=c(0, 18)) +
stat_pvalue_manual(comparisons_3, hide.ns = FALSE) +
stat_summary(fun = median, geom = "point", shape = 16, size = 4,
aes(group = Incentive_Type), color = "#84BD00",
position = position_dodge(width = 0.8)) +
stat_summary(fun = median, colour = "#33006F",
position = position_dodge(width = 0.8),
geom = "text", vjust = 0.5, hjust = -8,
aes(label = round(after_stat(y), digits = 2), group = Incentive_Type)) +
labs(subtitle = get_test_label(FriedmanANOVA_3, detailed = TRUE),
caption = get_pwc_label(comparisons_3))
The Friedman ANOVA test indicates a significant difference (p < 0.001) in how respondents perceive different non-financial incentives for mobile banking adoption. This suggests that some forms of guidance and support are considered more effective than others. The effect size (Kendall’s W = 0.09) indicates a small but meaningful effect, suggesting that these non-financial incentives do have an impact on users’ willingness to adopt mobile banking.
The Wilcoxon signed-rank tests show that guides and tutorials received significantly higher ratings than other incentives, such as 24/7 customer support and banker assistance. However, video guides and FAQs were rated significantly lower than other forms of support, suggesting that users prefer direct assistance over passive learning materials.
The results partially support H5, confirming that non-financial incentives influence mobile banking adoption, but not all types of support are equally effective. Guides and personalization features were rated significantly higher than video-based help (FAQs & tutorials), which was the least effective. Additionally, banker assistance showed mixed results, being significantly different from some incentives but not others. This suggests that users value structured guidance (like step-by-step guides) over passive content like FAQs.
mydata_excel1 <- read_excel("~/Documents/Šola/IMB/2. semester/NLB project/Logistična IMB/1ka data.xlsx")
mydata_excel2 <- read_excel("~/Documents/Šola/IMB/2. semester/NLB project/Logistična IMB/anketa_končni podatki.xlsx")
mydata_excel2 <- mydata_excel2[-1, ] #Delete first row in which the questions are written
mydata_excel2$ID <- seq(1,nrow(mydata_excel2))
mydata2 <- mydata_excel2[!(apply(mydata_excel2 == -3, 1, any)), ]
mydata2 <- subset(mydata2, select = -c(Q21:Q40))
mydata2$ID <- seq(1,nrow(mydata2))
# Keep only specific columns (replace with actual column names)
data1_log_reg <- mydata_excel1 %>% select(Q1, Q41, Q42, Q43a, Q43b, Q43c, Q43d, Q43e, Q43f, Q43g, Q43h, Q44, Q45, Q46, Q47, Q48)
data2_log_reg <- mydata_excel2 %>% select(Q1, Q41, Q42, Q43a, Q43b, Q43c, Q43d, Q43e, Q43f, Q43g, Q43h, Q44, Q45, Q46, Q47, Q48)
merged_data_log_reg <- rbind(data1_log_reg, data2_log_reg)
# Rename columns using colnames()
colnames(merged_data_log_reg) <- c("Ali uporabljate mobilno aplikacijo",
"Spol","Leto rojstva","Živim s starši",
"Živim s sorojenci","Živim s sorodniki",
"Živim s partnerko/jem", "Živim z otroki",
"Živim s skrbnikom", "Živim sam/a", "Drugo",
"Št. prebivalcev v kraju bivanja", "Primarna banka",
"Trenutna zaposlitev", "Mesečni neto prihodek", "Stopnja izobrazbe")
# Remove the first row
merged_data_log_reg1 <- merged_data_log_reg[-1, ]
categorical_vars <- c("Ali uporabljate mobilno aplikacijo", "Spol",
"Živim s starši", "Živim s sorojenci",
"Živim s sorodniki", "Živim s partnerko/jem",
"Živim z otroki", "Živim s skrbnikom",
"Živim sam/a", "Drugo", "Primarna banka",
"Trenutna zaposlitev", "Stopnja izobrazbe")
merged_data_log_reg1[categorical_vars] <- lapply(merged_data_log_reg1[categorical_vars], as.factor)
# Convert numerical variables to numeric
numeric_vars <- c("Leto rojstva", "Št. prebivalcev v kraju bivanja", "Mesečni neto prihodek")
merged_data_log_reg1[numeric_vars] <- lapply(merged_data_log_reg1[numeric_vars], as.numeric)
merged_data_log_reg1 <- as.data.frame(merged_data_log_reg1)
head(merged_data_log_reg1)
## Ali uporabljate mobilno aplikacijo Spol Leto rojstva Živim s starši Živim s sorojenci Živim s sorodniki
## 1 1 2 2000 1 0 0
## 2 1 2 1998 0 0 0
## 3 1 2 2001 1 0 0
## 4 1 2 1994 0 0 0
## 5 1 2 2000 1 1 0
## 6 1 2 2004 1 0 0
## Živim s partnerko/jem Živim z otroki Živim s skrbnikom Živim sam/a Drugo Št. prebivalcev v kraju bivanja
## 1 0 0 0 0 0 2
## 2 1 0 0 0 0 6
## 3 0 0 0 0 0 2
## 4 1 0 0 0 0 6
## 5 0 0 0 0 0 2
## 6 0 0 0 0 0 3
## Primarna banka Trenutna zaposlitev Mesečni neto prihodek Stopnja izobrazbe
## 1 3 2 2 2
## 2 3 2 3 4
## 3 1 2 2 3
## 4 12 5 6 5
## 5 1 2 3 6
## 6 1 1 8 4
merged_data_clean <- merged_data_log_reg1 %>%
filter(complete.cases(.)) # Removes rows with any NAs
# Automatically calculate age and assign age groups
merged_data_clean <- merged_data_clean %>%
mutate(
`Leto rojstva` = as.numeric(as.character(`Leto rojstva`)), # Convert to numeric (handles factors & characters)
age = as.numeric(format(Sys.Date(), "%Y")) - `Leto rojstva`, # Calculate age
age_group = case_when(
age <= 27 ~ "Young",
age > 27 & age <= 65 ~ "Professional",
age > 65 ~ "Older"))
# Ensure output is a data frame (not a tibble)
merged_data_clean <- as.data.frame(merged_data_clean)
head(merged_data_clean)
## Ali uporabljate mobilno aplikacijo Spol Leto rojstva Živim s starši Živim s sorojenci Živim s sorodniki
## 1 1 2 2000 1 0 0
## 2 1 2 1998 0 0 0
## 3 1 2 2001 1 0 0
## 4 1 2 1994 0 0 0
## 5 1 2 2000 1 1 0
## 6 1 2 2004 1 0 0
## Živim s partnerko/jem Živim z otroki Živim s skrbnikom Živim sam/a Drugo Št. prebivalcev v kraju bivanja
## 1 0 0 0 0 0 2
## 2 1 0 0 0 0 6
## 3 0 0 0 0 0 2
## 4 1 0 0 0 0 6
## 5 0 0 0 0 0 2
## 6 0 0 0 0 0 3
## Primarna banka Trenutna zaposlitev Mesečni neto prihodek Stopnja izobrazbe age age_group
## 1 3 2 2 2 25 Young
## 2 3 2 3 4 27 Young
## 3 1 2 2 3 24 Young
## 4 12 5 6 5 31 Professional
## 5 1 2 3 6 25 Young
## 6 1 1 8 4 21 Young
# Add an ID column (sequential numbering)
merged_data_clean <- merged_data_clean %>%
mutate(ID = row_number())
# Ensure it's a data frame (not a tibble)
merged_data_clean <- as.data.frame(merged_data_clean)
# Display the first 6 rows to check
head(merged_data_clean)
## Ali uporabljate mobilno aplikacijo Spol Leto rojstva Živim s starši Živim s sorojenci Živim s sorodniki
## 1 1 2 2000 1 0 0
## 2 1 2 1998 0 0 0
## 3 1 2 2001 1 0 0
## 4 1 2 1994 0 0 0
## 5 1 2 2000 1 1 0
## 6 1 2 2004 1 0 0
## Živim s partnerko/jem Živim z otroki Živim s skrbnikom Živim sam/a Drugo Št. prebivalcev v kraju bivanja
## 1 0 0 0 0 0 2
## 2 1 0 0 0 0 6
## 3 0 0 0 0 0 2
## 4 1 0 0 0 0 6
## 5 0 0 0 0 0 2
## 6 0 0 0 0 0 3
## Primarna banka Trenutna zaposlitev Mesečni neto prihodek Stopnja izobrazbe age age_group ID
## 1 3 2 2 2 25 Young 1
## 2 3 2 3 4 27 Young 2
## 3 1 2 2 3 24 Young 3
## 4 12 5 6 5 31 Professional 4
## 5 1 2 3 6 25 Young 5
## 6 1 1 8 4 21 Young 6
# Convert a specific column to numeric
merged_data_clean <- merged_data_clean %>%
mutate(across(c(`Živim s starši`, `Živim s sorojenci`, `Živim s sorodniki`, `Živim s partnerko/jem`, `Živim z otroki`, `Živim s skrbnikom`, `Živim sam/a`, `Drugo`, `age`),
~ as.numeric(as.character(.))))
merged_data_clean <- merged_data_clean %>%
mutate(`Ali uporabljate mobilno aplikacijo` = ifelse(`Ali uporabljate mobilno aplikacijo` == 1, 0, ifelse(!1 == 2, 1, V1)))
head(merged_data_clean)
## Ali uporabljate mobilno aplikacijo Spol Leto rojstva Živim s starši Živim s sorojenci Živim s sorodniki
## 1 0 2 2000 1 0 0
## 2 0 2 1998 0 0 0
## 3 0 2 2001 1 0 0
## 4 0 2 1994 0 0 0
## 5 0 2 2000 1 1 0
## 6 0 2 2004 1 0 0
## Živim s partnerko/jem Živim z otroki Živim s skrbnikom Živim sam/a Drugo Št. prebivalcev v kraju bivanja
## 1 0 0 0 0 0 2
## 2 1 0 0 0 0 6
## 3 0 0 0 0 0 2
## 4 1 0 0 0 0 6
## 5 0 0 0 0 0 2
## 6 0 0 0 0 0 3
## Primarna banka Trenutna zaposlitev Mesečni neto prihodek Stopnja izobrazbe age age_group ID
## 1 3 2 2 2 25 Young 1
## 2 3 2 3 4 27 Young 2
## 3 1 2 2 3 24 Young 3
## 4 12 5 6 5 31 Professional 4
## 5 1 2 3 6 25 Young 5
## 6 1 1 8 4 21 Young 6
fit0 <- glm(`Ali uporabljate mobilno aplikacijo` ~ 1, #Dependent and explanatory variables
family = binomial, #Binary logistic regression
data = merged_data_clean)
summary(fit0)
##
## Call:
## glm(formula = `Ali uporabljate mobilno aplikacijo` ~ 1, family = binomial,
## data = merged_data_clean)
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -0.33798 0.09521 -3.55 0.000385 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 616.6 on 453 degrees of freedom
## Residual deviance: 616.6 on 453 degrees of freedom
## AIC: 618.6
##
## Number of Fisher Scoring iterations: 4
exp(cbind(odds = fit0$coefficients, confint.default(fit0))) #Odds for Y=1
## odds 2.5 % 97.5 %
## (Intercept) 0.7132075 0.5917995 0.8595225
head(fitted(fit0)) #Estimated probability for Y=1
## 1 2 3 4 5 6
## 0.4162996 0.4162996 0.4162996 0.4162996 0.4162996 0.4162996
fit1 <- glm(`Ali uporabljate mobilno aplikacijo` ~ age_group,
family = binomial,
data = merged_data_clean)
summary(fit1)
##
## Call:
## glm(formula = `Ali uporabljate mobilno aplikacijo` ~ age_group,
## family = binomial, data = merged_data_clean)
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) 1.5506 0.2942 5.270 1.36e-07 ***
## age_groupProfessional -1.6854 0.3254 -5.180 2.22e-07 ***
## age_groupYoung -3.2341 0.3635 -8.896 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 616.6 on 453 degrees of freedom
## Residual deviance: 505.7 on 451 degrees of freedom
## AIC: 511.7
##
## Number of Fisher Scoring iterations: 3
exp(cbind(odds = fit1$coefficients, confint.default(fit1))) #Odds for Y=1
## odds 2.5 % 97.5 %
## (Intercept) 4.71428558 2.64836954 8.39176261
## age_groupProfessional 0.18536719 0.09796180 0.35075912
## age_groupYoung 0.03939394 0.01931873 0.08033047
head(fitted(fit1)) #Estimated probability for Y=1
## 1 2 3 4 5 6
## 0.1566265 0.1566265 0.1566265 0.4663462 0.1566265 0.1566265
#Ifelse za vprašanje s kom živijo
merged_data_clean <- merged_data_clean %>%
mutate(
# Create a text column listing all people they live with
S_kom_živijo = paste0(
ifelse(`Živim s starši` == 1, "Živim s starši, ", ""),
ifelse(`Živim s sorojenci` == 1, "Živim s sorojenci, ", ""),
ifelse(`Živim s sorodniki` == 1, "Živim s sorodniki, ", ""),
ifelse(`Živim s partnerko/jem` == 1, "Živim s partnerko/jem, ", ""),
ifelse(`Živim z otroki` == 1, "Živim z otroki, ", ""),
ifelse(`Živim s skrbnikom` == 1, "Živim s skrbnikom, ", ""),
ifelse(`Drugo` == 1, "Drugo, ", "")
),
S_kom_živijo = sub(", $", "", S_kom_živijo), # Remove trailing comma
# Categorize into "Lives alone" or "Lives with others"
Kako_živijo = ifelse(`Živim sam/a` == 1, "Živi sam", "Živi z drugimi")
)
anova(fit0, fit1, test = "Chi") #Comparision of models based on -2LL statistics
## Analysis of Deviance Table
##
## Model 1: `Ali uporabljate mobilno aplikacijo` ~ 1
## Model 2: `Ali uporabljate mobilno aplikacijo` ~ age_group
## Resid. Df Resid. Dev Df Deviance Pr(>Chi)
## 1 453 616.6
## 2 451 505.7 2 110.89 < 2.2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
exp(cbind(OR = fit1$coefficients, confint.default(fit1))) #Odds ratio for Y=1 (with 95% CI)
## OR 2.5 % 97.5 %
## (Intercept) 4.71428558 2.64836954 8.39176261
## age_groupProfessional 0.18536719 0.09796180 0.35075912
## age_groupYoung 0.03939394 0.01931873 0.08033047
merged_data_clean <- merged_data_clean %>%
mutate(across(c(`Št. prebivalcev v kraju bivanja`, `Trenutna zaposlitev`, `Mesečni neto prihodek`, `Stopnja izobrazbe`),
~ as.numeric(as.character(.))))
str(merged_data_clean)
## 'data.frame': 454 obs. of 21 variables:
## $ Ali uporabljate mobilno aplikacijo: num 0 0 0 0 0 0 0 0 0 0 ...
## $ Spol : Factor w/ 4 levels "-3","1","2","3": 3 3 3 3 3 3 2 3 3 2 ...
## $ Leto rojstva : num 2000 1998 2001 1994 2000 ...
## $ Živim s starši : num 1 0 1 0 1 1 0 0 0 0 ...
## $ Živim s sorojenci : num 0 0 0 0 1 0 0 0 0 0 ...
## $ Živim s sorodniki : num 0 0 0 0 0 0 0 0 0 0 ...
## $ Živim s partnerko/jem : num 0 1 0 1 0 0 1 0 1 1 ...
## $ Živim z otroki : num 0 0 0 0 0 0 0 0 1 0 ...
## $ Živim s skrbnikom : num 0 0 0 0 0 0 0 0 0 0 ...
## $ Živim sam/a : num 0 0 0 0 0 0 0 1 0 0 ...
## $ Drugo : num 0 0 0 0 0 0 0 0 0 0 ...
## $ Št. prebivalcev v kraju bivanja : num 2 6 2 6 2 3 6 6 1 4 ...
## $ Primarna banka : Factor w/ 14 levels "-3","1","10",..: 8 8 2 5 2 2 11 7 2 7 ...
## $ Trenutna zaposlitev : num 2 2 2 5 2 1 2 1 2 1 ...
## $ Mesečni neto prihodek : num 2 3 2 6 3 8 4 6 4 2 ...
## $ Stopnja izobrazbe : num 2 4 3 5 6 4 6 1 6 5 ...
## $ age : num 25 27 24 31 25 21 28 23 39 23 ...
## $ age_group : chr "Young" "Young" "Young" "Professional" ...
## $ ID : int 1 2 3 4 5 6 7 8 9 10 ...
## $ S_kom_živijo : chr "Živim s starši" "Živim s partnerko/jem" "Živim s starši" "Živim s partnerko/jem" ...
## $ Kako_živijo : chr "Živi z drugimi" "Živi z drugimi" "Živi z drugimi" "Živi z drugimi" ...
fit2 <- glm(`Ali uporabljate mobilno aplikacijo` ~ age_group + `Trenutna zaposlitev` + `Stopnja izobrazbe`,
family = binomial,
data = merged_data_clean)
summary(fit2)
##
## Call:
## glm(formula = `Ali uporabljate mobilno aplikacijo` ~ age_group +
## `Trenutna zaposlitev` + `Stopnja izobrazbe`, family = binomial,
## data = merged_data_clean)
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) 2.64298 0.50628 5.220 1.79e-07 ***
## age_groupProfessional -0.71711 0.36821 -1.948 0.0515 .
## age_groupYoung -2.51214 0.40758 -6.164 7.11e-10 ***
## `Trenutna zaposlitev` 0.07741 0.10220 0.757 0.4488
## `Stopnja izobrazbe` -0.44330 0.08969 -4.943 7.71e-07 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 616.60 on 453 degrees of freedom
## Residual deviance: 475.35 on 449 degrees of freedom
## AIC: 485.35
##
## Number of Fisher Scoring iterations: 5
exp(cbind(odds = fit2$coefficients, confint.default(fit2))) #Odds for Y=1
## odds 2.5 % 97.5 %
## (Intercept) 14.05507453 5.2105773 37.9123286
## age_groupProfessional 0.48816000 0.2372152 1.0045737
## age_groupYoung 0.08109449 0.0364804 0.1802699
## `Trenutna zaposlitev` 1.08048242 0.8843581 1.3201013
## `Stopnja izobrazbe` 0.64191150 0.5384305 0.7652805
head(fitted(fit2)) #Estimated probability for Y=1
## 1 2 3 4 5 6
## 0.35412612 0.18428812 0.26032964 0.52407772 0.08516365 0.17293487
fit3 <- glm(`Ali uporabljate mobilno aplikacijo` ~ age_group + `Št. prebivalcev v kraju bivanja`,
family = binomial,
data = merged_data_clean)
summary(fit3)
##
## Call:
## glm(formula = `Ali uporabljate mobilno aplikacijo` ~ age_group +
## `Št. prebivalcev v kraju bivanja`, family = binomial, data = merged_data_clean)
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) 1.95622 0.33318 5.871 4.32e-09 ***
## age_groupProfessional -1.18740 0.34527 -3.439 0.000584 ***
## age_groupYoung -2.80997 0.37710 -7.452 9.23e-14 ***
## `Št. prebivalcev v kraju bivanja` -0.24141 0.05459 -4.422 9.76e-06 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 616.60 on 453 degrees of freedom
## Residual deviance: 484.64 on 450 degrees of freedom
## AIC: 492.64
##
## Number of Fisher Scoring iterations: 4
exp(cbind(odds = fit0$coefficients, confint.default(fit3))) #Odds for Y=1
## odds 2.5 % 97.5 %
## (Intercept) 0.7132075 3.68101588 13.5888375
## age_groupProfessional 0.7132075 0.15503284 0.6000846
## age_groupYoung 0.7132075 0.02875141 0.1260762
## `Št. prebivalcev v kraju bivanja` 0.7132075 0.70581729 0.8742236
head(fitted(fit3)) #Estimated probability for Y=1
## 1 2 3 4 5 6
## 0.20807504 0.09094061 0.20807504 0.33634239 0.20807504 0.17108220
fit4 <- glm(`Ali uporabljate mobilno aplikacijo` ~ age_group + `Št. prebivalcev v kraju bivanja` + `Mesečni neto prihodek` + `Kako_živijo`,
family = binomial,
data = merged_data_clean)
head(merged_data_clean[, c("age_group", "Št. prebivalcev v kraju bivanja", "Mesečni neto prihodek", "Kako_živijo")])
## age_group Št. prebivalcev v kraju bivanja Mesečni neto prihodek Kako_živijo
## 1 Young 2 2 Živi z drugimi
## 2 Young 6 3 Živi z drugimi
## 3 Young 2 2 Živi z drugimi
## 4 Professional 6 6 Živi z drugimi
## 5 Young 2 3 Živi z drugimi
## 6 Young 3 8 Živi z drugimi
library(car)
vif(fit4)
## GVIF Df GVIF^(1/(2*Df))
## age_group 1.340321 2 1.075975
## `Št. prebivalcev v kraju bivanja` 1.059177 1 1.029163
## `Mesečni neto prihodek` 1.342169 1 1.158520
## Kako_živijo 1.036132 1 1.017906
All VIF values are close to 1, meaning there is no serious multicollinearity in our model. Our predictors are independent enough to be used together.
summary(fit4)
##
## Call:
## glm(formula = `Ali uporabljate mobilno aplikacijo` ~ age_group +
## `Št. prebivalcev v kraju bivanja` + `Mesečni neto prihodek` +
## Kako_živijo, family = binomial, data = merged_data_clean)
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) 2.83274 0.52146 5.432 5.56e-08 ***
## age_groupProfessional -0.89742 0.38778 -2.314 0.0207 *
## age_groupYoung -3.25271 0.43764 -7.432 1.07e-13 ***
## `Št. prebivalcev v kraju bivanja` -0.14921 0.06377 -2.340 0.0193 *
## `Mesečni neto prihodek` -0.48453 0.08232 -5.886 3.96e-09 ***
## Kako_živijoŽivi z drugimi 0.06792 0.31249 0.217 0.8279
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 616.60 on 453 degrees of freedom
## Residual deviance: 437.01 on 448 degrees of freedom
## AIC: 449.01
##
## Number of Fisher Scoring iterations: 5
exp(cbind(odds = fit4$coefficients, confint.default(fit4))) #Odds for Y=1
## odds 2.5 % 97.5 %
## (Intercept) 16.99197228 6.1147486 47.21815095
## age_groupProfessional 0.40761833 0.1906221 0.87163395
## age_groupYoung 0.03866918 0.0164000 0.09117718
## `Št. prebivalcev v kraju bivanja` 0.86138858 0.7601860 0.97606417
## `Mesečni neto prihodek` 0.61598806 0.5242040 0.72384281
## Kako_živijoŽivi z drugimi 1.07027620 0.5801038 1.97463141
head(fitted(fit4)) #Estimated probability for Y=1
## 1 2 3 4 5 6
## 0.165269812 0.062920556 0.165269812 0.141949708 0.108703138 0.009231083
fit5 <- glm(`Ali uporabljate mobilno aplikacijo` ~ age_group + `Št. prebivalcev v kraju bivanja` + `Trenutna zaposlitev` + `Mesečni neto prihodek` + `Stopnja izobrazbe` + `Kako_živijo`,
family = binomial,
data = merged_data_clean)
summary(fit5)
##
## Call:
## glm(formula = `Ali uporabljate mobilno aplikacijo` ~ age_group +
## `Št. prebivalcev v kraju bivanja` + `Trenutna zaposlitev` +
## `Mesečni neto prihodek` + `Stopnja izobrazbe` + Kako_živijo,
## family = binomial, data = merged_data_clean)
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) 3.36990 0.74444 4.527 5.99e-06 ***
## age_groupProfessional -0.67280 0.40141 -1.676 0.0937 .
## age_groupYoung -2.99739 0.48179 -6.221 4.93e-10 ***
## `Št. prebivalcev v kraju bivanja` -0.12621 0.06518 -1.936 0.0528 .
## `Trenutna zaposlitev` 0.06624 0.11807 0.561 0.5748
## `Mesečni neto prihodek` -0.43289 0.08371 -5.171 2.33e-07 ***
## `Stopnja izobrazbe` -0.22244 0.11139 -1.997 0.0458 *
## Kako_živijoŽivi z drugimi 0.01338 0.31648 0.042 0.9663
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 616.60 on 453 degrees of freedom
## Residual deviance: 432.72 on 446 degrees of freedom
## AIC: 448.72
##
## Number of Fisher Scoring iterations: 6
anova(fit0, fit1, test = "Chi") #Comparision of models based on -2LL statistics
## Analysis of Deviance Table
##
## Model 1: `Ali uporabljate mobilno aplikacijo` ~ 1
## Model 2: `Ali uporabljate mobilno aplikacijo` ~ age_group
## Resid. Df Resid. Dev Df Deviance Pr(>Chi)
## 1 453 616.6
## 2 451 505.7 2 110.89 < 2.2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
anova(fit2, fit3, test = "Chi") #Comparision of models based on -2LL statistics
## Analysis of Deviance Table
##
## Model 1: `Ali uporabljate mobilno aplikacijo` ~ age_group + `Trenutna zaposlitev` +
## `Stopnja izobrazbe`
## Model 2: `Ali uporabljate mobilno aplikacijo` ~ age_group + `Št. prebivalcev v kraju bivanja`
## Resid. Df Resid. Dev Df Deviance Pr(>Chi)
## 1 449 475.35
## 2 450 484.64 -1 -9.2898 0.002304 **
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
anova(fit3, fit4, test = "Chi") #Comparision of models based on -2LL statistics
## Analysis of Deviance Table
##
## Model 1: `Ali uporabljate mobilno aplikacijo` ~ age_group + `Št. prebivalcev v kraju bivanja`
## Model 2: `Ali uporabljate mobilno aplikacijo` ~ age_group + `Št. prebivalcev v kraju bivanja` +
## `Mesečni neto prihodek` + Kako_živijo
## Resid. Df Resid. Dev Df Deviance Pr(>Chi)
## 1 450 484.64
## 2 448 437.01 2 47.633 4.536e-11 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
anova(fit4, fit5, test = "Chi") #Comparision of models based on -2LL statistics
## Analysis of Deviance Table
##
## Model 1: `Ali uporabljate mobilno aplikacijo` ~ age_group + `Št. prebivalcev v kraju bivanja` +
## `Mesečni neto prihodek` + Kako_živijo
## Model 2: `Ali uporabljate mobilno aplikacijo` ~ age_group + `Št. prebivalcev v kraju bivanja` +
## `Trenutna zaposlitev` + `Mesečni neto prihodek` + `Stopnja izobrazbe` +
## Kako_živijo
## Resid. Df Resid. Dev Df Deviance Pr(>Chi)
## 1 448 437.01
## 2 446 432.72 2 4.2928 0.1169
We figured out that fit4 is the best fit, so we will proceed with this one.
# Extract standardized residuals and Cook's distance from fit4
merged_data_clean$StdResid <- rstandard(fit4) # Standardized residuals
merged_data_clean$Cook <- cooks.distance(fit4) # Cook's distance
merged_data_clean$StdResid <- rstandard(fit4)
merged_data_clean$CooksD <- cooks.distance(fit4)
library(ggplot2)
StdResid <- ggplot(merged_data_clean, aes(x=StdResid)) +
theme_linedraw() +
geom_histogram() +
xlab("Standardized residuals")
library(ggplot2)
Cook <- ggplot(merged_data_clean, aes(x=CooksD)) +
theme_linedraw() +
geom_histogram() +
xlab("Cook's distances")
ggarrange(StdResid, Cook,
ncol = 2, nrow = 1)
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
head(merged_data_clean[order(merged_data_clean$StdResid), c("ID", "StdResid")], 5)
## ID StdResid
## 191 191 -2.105705
## 232 232 -2.066293
## 240 240 -1.881493
## 264 264 -1.792185
## 254 254 -1.779406
head(merged_data_clean[order(-merged_data_clean$StdResid), c("ID", "StdResid")], 5)
## ID StdResid
## 440 440 2.916655
## 313 313 2.505523
## 297 297 2.358535
## 394 394 2.256942
## 446 446 2.194612
head(merged_data_clean[order(-merged_data_clean$CooksD), c("ID", "CooksD")], 10)
## ID CooksD
## 440 440 0.04439633
## 191 191 0.02805883
## 313 313 0.02379869
## 240 240 0.02354029
## 232 232 0.02054852
## 446 446 0.01904230
## 380 380 0.01837936
## 394 394 0.01827372
## 296 296 0.01789714
## 322 322 0.01789714
We have 3 high impact values and 1 potential outlier. So we dropped those values.
library(dplyr)
merged_data_clean <- merged_data_clean %>%
filter(ID != 440 & ID != 191 & ID != 313 & ID != 232)
Let’s run the fits again and check if they’re better.
fit4 <- glm(`Ali uporabljate mobilno aplikacijo` ~ age_group + `Št. prebivalcev v kraju bivanja` + `Mesečni neto prihodek` + `Kako_živijo`,
family = binomial,
data = merged_data_clean)
summary(fit4)
##
## Call:
## glm(formula = `Ali uporabljate mobilno aplikacijo` ~ age_group +
## `Št. prebivalcev v kraju bivanja` + `Mesečni neto prihodek` +
## Kako_živijo, family = binomial, data = merged_data_clean)
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) 3.32703 0.58135 5.723 1.05e-08 ***
## age_groupProfessional -1.13377 0.42267 -2.682 0.00731 **
## age_groupYoung -3.75194 0.48745 -7.697 1.39e-14 ***
## `Št. prebivalcev v kraju bivanja` -0.15270 0.06632 -2.303 0.02130 *
## `Mesečni neto prihodek` -0.57067 0.09076 -6.287 3.23e-10 ***
## Kako_živijoŽivi z drugimi 0.10018 0.32367 0.309 0.75694
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 610.94 on 449 degrees of freedom
## Residual deviance: 412.17 on 444 degrees of freedom
## AIC: 424.17
##
## Number of Fisher Scoring iterations: 6
fit5 <- glm(`Ali uporabljate mobilno aplikacijo` ~ age_group + `Št. prebivalcev v kraju bivanja` + `Trenutna zaposlitev` + `Mesečni neto prihodek` + `Stopnja izobrazbe` + `Kako_živijo`,
family = binomial,
data = merged_data_clean)
summary(fit5)
##
## Call:
## glm(formula = `Ali uporabljate mobilno aplikacijo` ~ age_group +
## `Št. prebivalcev v kraju bivanja` + `Trenutna zaposlitev` +
## `Mesečni neto prihodek` + `Stopnja izobrazbe` + Kako_živijo,
## family = binomial, data = merged_data_clean)
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) 3.99040 0.83581 4.774 1.80e-06 ***
## age_groupProfessional -0.90296 0.43243 -2.088 0.0368 *
## age_groupYoung -3.49294 0.53299 -6.553 5.62e-11 ***
## `Št. prebivalcev v kraju bivanja` -0.12915 0.06771 -1.907 0.0565 .
## `Trenutna zaposlitev` 0.07230 0.12380 0.584 0.5592
## `Mesečni neto prihodek` -0.51303 0.09142 -5.612 2.00e-08 ***
## `Stopnja izobrazbe` -0.25354 0.11856 -2.139 0.0325 *
## Kako_živijoŽivi z drugimi 0.03498 0.32861 0.106 0.9152
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 610.94 on 449 degrees of freedom
## Residual deviance: 407.18 on 442 degrees of freedom
## AIC: 423.18
##
## Number of Fisher Scoring iterations: 6
And let’s run anova again:
anova(fit4, fit5, test = "Chi") #Comparision of models based on -2LL statistics
## Analysis of Deviance Table
##
## Model 1: `Ali uporabljate mobilno aplikacijo` ~ age_group + `Št. prebivalcev v kraju bivanja` +
## `Mesečni neto prihodek` + Kako_živijo
## Model 2: `Ali uporabljate mobilno aplikacijo` ~ age_group + `Št. prebivalcev v kraju bivanja` +
## `Trenutna zaposlitev` + `Mesečni neto prihodek` + `Stopnja izobrazbe` +
## Kako_živijo
## Resid. Df Resid. Dev Df Deviance Pr(>Chi)
## 1 444 412.17
## 2 442 407.18 2 4.9891 0.08253 .
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
# A data frame for coefficients of fit4
coeff_data <- data.frame(
Variable = c("Intercept", "age_groupProfessional", "age_groupYoung",
"Št. prebivalcev v kraju bivanja", "Mesečni neto prihodek", "Kako_živijoŽivi z drugimi"),
Estimate = c(-3.74939, 1.27803, 4.01414, 0.15967, 0.63420, -0.03244),
Std_Error = c(0.62769, 0.44997, 0.52222, 0.06804, 0.09739, 0.33023),
Z_Value = c(-5.973, 2.840, 7.687, 2.347, 6.512, -0.098),
P_Value = c(2.32e-09, 0.00451, 1.51e-14, 0.01894, 7.40e-11, 0.92174))
head(coeff_data)
## Variable Estimate Std_Error Z_Value P_Value
## 1 Intercept -3.74939 0.62769 -5.973 2.3200e-09
## 2 age_groupProfessional 1.27803 0.44997 2.840 4.5100e-03
## 3 age_groupYoung 4.01414 0.52222 7.687 1.5100e-14
## 4 Št. prebivalcev v kraju bivanja 0.15967 0.06804 2.347 1.8940e-02
## 5 Mesečni neto prihodek 0.63420 0.09739 6.512 7.4000e-11
## 6 Kako_živijoŽivi z drugimi -0.03244 0.33023 -0.098 9.2174e-01
# Ensure the dependent variable is a factor (assuming LatePayF is the outcome variable)
merged_data_clean$`Ali uporabljate mobilno aplikacijo` <- as.factor(merged_data_clean$`Ali uporabljate mobilno aplikacijo`)
# Fit logistic regression model
fit4 <- glm(`Ali uporabljate mobilno aplikacijo` ~ age_group + `Št. prebivalcev v kraju bivanja` + `Mesečni neto prihodek` + `Kako_živijo`,
family = binomial,
data = merged_data_clean)
# Generate predicted probabilities (Estimated Probability)
merged_data_clean <- merged_data_clean %>%
mutate(EstProb = predict(fit4, type = "response"))
# Classify based on probability threshold (0.50)
merged_data_clean <- merged_data_clean %>%
mutate(Classification = ifelse(EstProb < 0.50, "NO", "YES"),
ClassificationF = factor(Classification, levels = c("NO", "YES")))
# Create the classification table
ClassificationTable <- table(merged_data_clean$`Ali uporabljate mobilno aplikacijo`, merged_data_clean$ClassificationF)
# Display classification table
print(ClassificationTable)
##
## NO YES
## 0 223 40
## 1 62 125
# Compute Pseudo R² (Proportion of correctly classified cases)
Pseudo_R2_fit4 <- sum(diag(ClassificationTable)) / nrow(merged_data_clean)
# Display Pseudo R²
Pseudo_R2_fit4
## [1] 0.7733333
Fit4 vs. Fit5: The simpler model (fit4) performed slightly better, and adding employment status and education level did not significantly improve the fit.
Classification Performance: The logistic regression model achieved 78.29% accuracy, indicating a good ability to distinguish between app users and non-users.
Potential Issues: The warning message (glm.fit: fitted probabilities numerically 0 or 1 occurred) suggests some separation in the data, meaning some observations were perfectly predicted, which could affect model reliability.
Now we make some slight alterations to fit 4 and make a conclusion.
merged_data_clean$ID <- seq(1, nrow(merged_data_clean))
merged_data_clean$Aplikacija <- factor(merged_data_clean$`Ali uporabljate mobilno aplikacijo`,
levels = c(0, 1),
labels = c("Ne", "Da"))
merged_data_clean$Kako_živijo <- factor(merged_data_clean$Kako_živijo,
levels = c("Živi sam", "Živi z drugimi"),
labels = c("Živi sam", "Živi z drugimi"))
merged_data_clean$Spol <- factor(merged_data_clean$Spol,
levels = c(1, 2),
labels = c("M", "Z"))
merged_data_clean <- merged_data_clean %>%
filter(age < 100)
merged_data_clean <- merged_data_clean %>%
filter(`Mesečni neto prihodek` > 0)
merged_data_clean$AgeG <- ifelse(merged_data_clean$age < 30,
yes = "Young",
no = ifelse(merged_data_clean$age > 50,
"Older",
"Middle"))
merged_data_clean$AgeG <- factor(merged_data_clean$AgeG,
levels = c("Middle", "Young", "Older"),
labels = c("Middle", "Young", "Older"))
merged_data_clean <- merged_data_clean %>%
filter(!ID %in% c(187, 190, 227))
summary(merged_data_clean[c("Aplikacija", "AgeG", "Št. prebivalcev v kraju bivanja",
"Mesečni neto prihodek", "Kako_živijo", "Spol")])
## Aplikacija AgeG Št. prebivalcev v kraju bivanja Mesečni neto prihodek Kako_živijo Spol
## Ne:260 Middle:110 Min. :1.000 Min. :1.00 Živi sam : 73 M:194
## Da:147 Young :171 1st Qu.:2.000 1st Qu.:1.00 Živi z drugimi:334 Z:213
## Older :126 Median :3.000 Median :2.00
## Mean :3.698 Mean :2.87
## 3rd Qu.:6.000 3rd Qu.:4.00
## Max. :6.000 Max. :8.00
fit_final <- glm(Aplikacija ~ AgeG + `Št. prebivalcev v kraju bivanja` +
`Mesečni neto prihodek` + `Kako_živijo` + Spol + `Stopnja izobrazbe`,
family = binomial,
data = merged_data_clean)
summary(fit_final)
##
## Call:
## glm(formula = Aplikacija ~ AgeG + `Št. prebivalcev v kraju bivanja` +
## `Mesečni neto prihodek` + Kako_živijo + Spol + `Stopnja izobrazbe`,
## family = binomial, data = merged_data_clean)
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) 2.81343 0.76612 3.672 0.00024 ***
## AgeGYoung -2.29017 0.35835 -6.391 1.65e-10 ***
## AgeGOlder 1.01713 0.30043 3.386 0.00071 ***
## `Št. prebivalcev v kraju bivanja` -0.12328 0.06954 -1.773 0.07625 .
## `Mesečni neto prihodek` -0.48960 0.10237 -4.783 1.73e-06 ***
## Kako_živijoŽivi z drugimi -0.13968 0.32957 -0.424 0.67169
## SpolZ 0.02701 0.25812 0.105 0.91667
## `Stopnja izobrazbe` -0.21857 0.12111 -1.805 0.07111 .
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 532.43 on 406 degrees of freedom
## Residual deviance: 389.14 on 399 degrees of freedom
## AIC: 405.14
##
## Number of Fisher Scoring iterations: 5
Interpretation of each variable from the logistic regression results:
AgeG (Young vs. Baseline - Professional): From the results, we can see that younger participants for each additional year (compared to the baseline group - professional) are 0.1 times less likely to use the mobile banking application, as indicated by the coefficient of -2.29. This relationship is statistically significant (p-value < 0.001), assuming all other explanatory variables remain the same.
AgeG (Older vs. Baseline - Professional): Older participants are with each additional year 2.76 times more likely to use the application compared to the baseline age group, with this result also being statistically significant (p-value < 0.001), assuming all other explanatory variables remain the same.
Number of residents in the living area (Št. prebivalcev v kraju bivanja): Results point that for each additional resident in the participant’s living area, the odds of using the application would be 0.884 of the initial odds (estimate = -0.12328) at p = 0.077, assuming all other explanatory variables remain the same.
Monthly net income (Mesečni neto prihodek): Results point that increasing the monthly net income by one income class, the odds of using the application would be 0.613 of the initial odds This result is statistically significant (p-value < 0.001), assuming all other explanatory variables remain the same.
Living situation (Kako_živi): Not statistically significant at p = 0.68.
Gender (Spol): Not statistically significant at p = 0.92.
Education level (Stopnja izobrazbe): Results point that for each additional educational level, the odds of using the application would be 0.803 of the initial odds (estimate = -0.21857) at p = 0.077, assuming all other explanatory variables remain the same.
To summarize, with logistic regression analysis we have proven that age, number of residents in the living area, monthly net income, and education level have an impact on wheather the customer is using a mobile bank or not.