#install.packages
library(readxl)
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
# Load the raw survey export. The path points into a user's home directory,
# so it has to be adjusted when the script is run on another machine.
mydata_excel <- read_excel("~/Documents/Šola/IMB/2. semester/NLB project/anketa_končni podatki.xlsx")
# Drop the first row: it holds the question wording, not answers
mydata_excel <- mydata_excel[-1, ]
# Running respondent ID; seq_len() stays valid (empty) for a sheet without
# data rows, whereas seq(1, 0) would yield c(1, 0) and fail on assignment
mydata_excel$ID <- seq_len(nrow(mydata_excel))
head(mydata_excel)
## # A tibble: 6 × 129
## Q1 Q2 Q3 Q4 Q5 Q6a Q6b Q6c Q6d Q6e Q6f Q6g Q6h
## <chr> <chr> <chr> <chr> <chr> <chr> <chr> <chr> <chr> <chr> <chr> <chr> <chr>
## 1 2 1 1 1 1 2 1 1 2 1 2 1 1
## 2 2 1 1 2 -2 -2 -2 -2 -2 -2 -2 -2 -2
## 3 2 1 1 1 2 4 5 5 4 3 5 5 4
## 4 2 1 1 1 2 -3 -3 -3 -3 -3 -3 -3 -3
## 5 2 1 1 1 2 2 3 4 4 1 2 2 3
## 6 2 1 1 1 2 3 2 5 1 1 1 1 1
## # ℹ 116 more variables: Q6i <chr>, Q6j <chr>, Q6j_text <chr>, Q7 <chr>,
## # Assistance <chr>, Security <chr>, Transparency <chr>, Convinience <chr>,
## # `Speed and Reliability` <chr>, Q8f <chr>, Q8f_text <chr>,
## # Branch_Assistance <chr>, `Mobile bank_Assistance` <chr>,
## # Branch_Security <chr>, `Mobile bank_Security` <chr>,
## # Branch_Transparency <chr>, `Mobile bank_Transparency` <chr>,
## # Branch_Convinience <chr>, `Mobile bank_Convinience` <chr>, …
# Keep only respondents that do not contain the code -3 in any column.
# NOTE(review): -3 presumably marks an interrupted/incomplete questionnaire -
# confirm against the survey tool's codebook.
# na.rm = TRUE is essential: without it, a row that holds an NA but no -3 makes
# any() return NA, and indexing rows by NA replaces that respondent with an
# all-NA row instead of keeping the real answers.
mydata <- mydata_excel[!apply(mydata_excel == -3, 1, any, na.rm = TRUE), ]
# Remove the block of questions Q21 to Q40 (all columns between them included)
mydata <- subset(mydata, select = -c(Q21:Q40))
# Renumber the remaining respondents consecutively
mydata$ID <- seq_len(nrow(mydata))
head(mydata)
## # A tibble: 6 × 83
## Q1 Q2 Q3 Q4 Q5 Q6a Q6b Q6c Q6d Q6e Q6f Q6g Q6h
## <chr> <chr> <chr> <chr> <chr> <chr> <chr> <chr> <chr> <chr> <chr> <chr> <chr>
## 1 2 1 1 1 1 2 1 1 2 1 2 1 1
## 2 2 1 1 2 -2 -2 -2 -2 -2 -2 -2 -2 -2
## 3 2 1 1 1 2 4 5 5 4 3 5 5 4
## 4 2 1 1 1 2 2 3 4 4 1 2 2 3
## 5 2 1 1 1 2 3 2 5 1 1 1 1 1
## 6 2 2 2 1 2 5 4 4 5 4 3 5 5
## # ℹ 70 more variables: Q6i <chr>, Q6j <chr>, Q6j_text <chr>, Q7 <chr>,
## # Assistance <chr>, Security <chr>, Transparency <chr>, Convinience <chr>,
## # `Speed and Reliability` <chr>, Q8f <chr>, Q8f_text <chr>,
## # Branch_Assistance <chr>, `Mobile bank_Assistance` <chr>,
## # Branch_Security <chr>, `Mobile bank_Security` <chr>,
## # Branch_Transparency <chr>, `Mobile bank_Transparency` <chr>,
## # Branch_Convinience <chr>, `Mobile bank_Convinience` <chr>, …
# Recode the categorical answers from their numeric codes into labelled factors.
# Any code that is not listed in `levels` is turned into NA by factor().
# NOTE(review): compared with the codebook printed further down in this report,
# Q41 code 3, Q45 codes 2, 8 and 13, and Q48 code 1 are not listed here and
# therefore end up as NA - confirm that this is intended.
mydata <- mydata %>%
  mutate(
    # Plain yes/no questions
    across(
      all_of(c("Q2", "Q3", "Q5", "Q7")),
      ~ factor(.x, levels = c(1, 2), labels = c("Yes", "No"))
    ),
    Q4 = factor(Q4, levels = c(1, 2, 3), labels = c("Yes", "No", "I don't know")),
    # Multiple-choice items Q16a-Q16g and Q43a-Q43h: 1 = ticked, 0 = not ticked
    across(
      all_of(c(paste0("Q16", letters[1:7]), paste0("Q43", letters[1:8]))),
      ~ factor(.x, levels = c(1, 0), labels = c("Selected", "Not selected"))
    ),
    Q41 = factor(Q41, levels = c(1, 2), labels = c("Female", "Male")),
    Q44 = factor(
      Q44,
      levels = c(2, 6, 1, 4, 3, 5),
      labels = c(
        "1.000 – 5.000 habitants",
        "More than 100.000 habitants",
        "Less than 1.000 habitants",
        "20.001 – 50.000 habitants",
        "5.001 – 20.000 habitants",
        "50.001 – 100.000 habitants"
      )
    ),
    Q45 = factor(
      Q45,
      levels = c(3, 5, 1, 9, 12, 7, 10, 4, 11, 6),
      labels = c(
        "OTP banka d.d.",
        "Banka Intesa Sanpaolo d.d.",
        "Nova Ljubljanska Banka d.d. (NLB)",
        "Gorenjska Banka d.d.",
        "Delavska Hranilnica d.d.",
        "Revolut",
        "Deželna Banka Slovenije d.d.",
        "Banka Sparkasse d.d.",
        "Addiko Bank d.d.",
        "UniCredit Banka Slovenija d.d."
      )
    ),
    Q46 = factor(
      Q46,
      levels = c(1, 2, 3, 5, 6, 4),
      labels = c(
        "Študent/-ka",
        "Redno zaposlen/-a",
        "Upokojen/-a",
        "Samozaposlen/-a",
        "Delno zaposlen/-a",
        "Brezposeln/-a"
      )
    ),
    Q47 = factor(
      Q47,
      levels = c(1, 5, 3, 8, 2, 4, 6, 7),
      labels = c(
        "Pod 1.000€",
        "3.001€ - 5.000€",
        "1.501€ - 2.000€",
        "I don't want to answer",
        "1.000€ - 1.500€",
        "2.001€ - 3.000€",
        "5.001€ - 10.000€",
        "Above 10.000€"
      )
    ),
    Q48 = factor(
      Q48,
      levels = c(2, 6, 3, 5, 7, 4),
      labels = c(
        "Dokončana osnovna šola",
        "Dokončana visokošolska strokovna univerzitetna izobrazba (tudi 2. bolonjska stopnja)",
        "Dokončana nižja ali srednja poklicna izobrazba",
        "Dokončana višješolska strokovna ali visokošolska strokovna izobrazba (tudi 1. bolonjska stopnja)",
        "Dokončana specializacija, znanstveni magisterij, doktorat",
        "Dokončana srednja strokovna ali splošna izobrazba"
      )
    ),
    Q49 = factor(
      Q49,
      levels = c(1, 2, 3),
      labels = c("Preko linka", "Na tablici", "Na listu papirja")
    )
  )
# Convert the rating-scale columns (picked by position) from character to
# numeric. across() replaces the superseded mutate_all() and avoids repeating
# the index vector on both sides of the assignment.
mydata <- mydata %>%
  mutate(across(c(6:14, 18:22, 25:39, 48:55, 58:62), as.numeric))
# Replace every -2 in a numeric column with the mean of that column's other
# values. NOTE(review): -2 presumably marks a question that was skipped by
# design - confirm that mean imputation is appropriate for it.
mydata <- mydata %>%
  mutate(across(where(is.numeric), ~ replace(., . == -2, mean(.[. != -2], na.rm = TRUE))))
# Replace the remaining missing numeric values with the column mean
mydata <- mydata %>%
  mutate(across(where(is.numeric), ~ replace(., is.na(.), mean(., na.rm = TRUE))))
# Exclude respondents 2, 16 and 17.
# NOTE(review): the reason for excluding exactly these IDs is not stated here.
mydata <- mydata %>%
  filter(!ID %in% c(2, 16, 17))
# Renumber the remaining respondents; seq_len() is safe for an empty table
mydata$ID <- seq_len(nrow(mydata))
# Descriptive statistics for all columns except those dropped by position
summary(mydata[c(-1, -15, -16, -23, -24, -47, -56, -57, -63, -64, -66, -75, -78, -83)])
## Q2 Q3 Q4 Q5 Q6a
## Yes :121 Yes :131 Yes :103 Yes : 25 Min. :1.000
## No : 36 No : 26 No : 28 No :132 1st Qu.:3.000
## NA's: 1 NA's: 1 I don't know: 26 NA's: 1 Median :3.000
## NA's : 1 Mean :3.433
## 3rd Qu.:4.000
## Max. :5.000
##
## Q6b Q6c Q6d Q6e
## Min. :1.000 Min. :1.000 Min. :1.000 Min. :1.000
## 1st Qu.:3.000 1st Qu.:3.000 1st Qu.:3.000 1st Qu.:1.000
## Median :4.000 Median :4.000 Median :4.000 Median :2.000
## Mean :3.535 Mean :3.847 Mean :3.854 Mean :2.688
## 3rd Qu.:4.000 3rd Qu.:5.000 3rd Qu.:5.000 3rd Qu.:4.000
## Max. :5.000 Max. :5.000 Max. :5.000 Max. :5.000
##
## Q6f Q6g Q6h Q6i Q7
## Min. :1.000 Min. :1.000 Min. :1.00 Min. :1.000 Yes :82
## 1st Qu.:3.000 1st Qu.:2.000 1st Qu.:2.00 1st Qu.:3.000 No :75
## Median :4.000 Median :3.086 Median :3.00 Median :4.000 NA's: 1
## Mean :3.497 Mean :3.172 Mean :2.93 Mean :3.465
## 3rd Qu.:4.000 3rd Qu.:4.000 3rd Qu.:4.00 3rd Qu.:4.000
## Max. :5.000 Max. :5.000 Max. :5.00 Max. :5.000
##
## Assistance Security Transparency Convinience
## Min. :1.00 Min. :1.000 Min. :1.000 Min. :1.000
## 1st Qu.:3.00 1st Qu.:4.000 1st Qu.:4.000 1st Qu.:4.000
## Median :4.00 Median :5.000 Median :4.000 Median :4.000
## Mean :3.79 Mean :4.382 Mean :4.108 Mean :3.975
## 3rd Qu.:4.00 3rd Qu.:5.000 3rd Qu.:5.000 3rd Qu.:5.000
## Max. :5.00 Max. :5.000 Max. :5.000 Max. :5.000
##
## Speed and Reliability Branch_Assistance Mobile bank_Assistance Branch_Security
## Min. :1.000 Min. :2.000 Min. :1.00 Min. :2.000
## 1st Qu.:4.000 1st Qu.:4.000 1st Qu.:3.00 1st Qu.:3.986
## Median :4.000 Median :4.000 Median :3.00 Median :4.000
## Mean :4.185 Mean :4.006 Mean :3.14 Mean :3.981
## 3rd Qu.:5.000 3rd Qu.:5.000 3rd Qu.:4.00 3rd Qu.:5.000
## Max. :5.000 Max. :5.000 Max. :5.00 Max. :5.000
##
## Mobile bank_Security Branch_Transparency Mobile bank_Transparency
## Min. :1.000 Min. :1.000 Min. :1.000
## 1st Qu.:2.000 1st Qu.:3.000 1st Qu.:3.000
## Median :3.000 Median :4.000 Median :3.000
## Mean :3.032 Mean :3.898 Mean :3.248
## 3rd Qu.:4.000 3rd Qu.:4.000 3rd Qu.:4.000
## Max. :5.000 Max. :5.000 Max. :5.000
##
## Branch_Convinience Mobile bank_Convinience Branch_Speed and Reliability
## Min. :1.000 Min. :1.000 Min. :1.000
## 1st Qu.:3.000 1st Qu.:3.000 1st Qu.:3.000
## Median :4.000 Median :3.000 Median :4.000
## Mean :3.752 Mean :3.389 Mean :3.701
## 3rd Qu.:4.000 3rd Qu.:4.000 3rd Qu.:4.000
## Max. :5.000 Max. :5.000 Max. :5.000
##
## Mobile bank_Speed and Reliability Q15a Q15b Q15c
## Min. :1.000 Min. :1 Min. :2.000 Min. :1.000
## 1st Qu.:3.000 1st Qu.:4 1st Qu.:4.000 1st Qu.:4.000
## Median :4.000 Median :4 Median :4.000 Median :4.000
## Mean :3.586 Mean :4 Mean :4.134 Mean :4.115
## 3rd Qu.:4.000 3rd Qu.:5 3rd Qu.:5.000 3rd Qu.:5.000
## Max. :5.000 Max. :5 Max. :5.000 Max. :5.000
##
## Q15d Q15e Q16a Q16b
## Min. :1.000 Min. :2.000 Selected :98 Selected :66
## 1st Qu.:3.000 1st Qu.:4.000 Not selected:59 Not selected:91
## Median :4.000 Median :4.000 NA's : 1 NA's : 1
## Mean :3.809 Mean :4.127
## 3rd Qu.:4.000 3rd Qu.:5.000
## Max. :5.000 Max. :5.000
##
## Q16c Q16d Q16e Q16f
## Selected :68 Selected :79 Selected : 49 Selected : 36
## Not selected:89 Not selected:78 Not selected:108 Not selected:121
## NA's : 1 NA's : 1 NA's : 1 NA's : 1
##
##
##
##
## Q16g Q17a Q17b Q17c
## Selected : 9 Min. :0.000 Min. :0.000 Min. :0.000
## Not selected:148 1st Qu.:4.000 1st Qu.:4.000 1st Qu.:4.000
## NA's : 1 Median :4.000 Median :4.000 Median :4.000
## Mean :3.662 Mean :3.694 Mean :3.758
## 3rd Qu.:4.000 3rd Qu.:4.000 3rd Qu.:4.750
## Max. :5.000 Max. :5.000 Max. :5.000
##
## Q17d Q17e Q18a Q18b Q18c
## Min. :0.000 Min. :0.000 Min. :1.00 Min. :1.000 Min. :1.000
## 1st Qu.:2.000 1st Qu.:4.000 1st Qu.:3.00 1st Qu.:3.000 1st Qu.:3.000
## Median :4.000 Median :4.000 Median :4.00 Median :4.000 Median :4.000
## Mean :3.223 Mean :3.809 Mean :3.35 Mean :3.732 Mean :3.713
## 3rd Qu.:4.000 3rd Qu.:5.000 3rd Qu.:4.00 3rd Qu.:4.000 3rd Qu.:4.000
## Max. :5.000 Max. :5.000 Max. :5.00 Max. :5.000 Max. :5.000
##
## Q19a Q19b Q19c Q19d
## Min. :1.000 Min. :1.000 Min. :1.000 Min. :1.000
## 1st Qu.:3.000 1st Qu.:3.000 1st Qu.:3.000 1st Qu.:3.000
## Median :3.000 Median :4.000 Median :4.000 Median :4.000
## Mean :3.217 Mean :3.548 Mean :3.331 Mean :3.618
## 3rd Qu.:4.000 3rd Qu.:4.000 3rd Qu.:4.000 3rd Qu.:4.000
## Max. :5.000 Max. :5.000 Max. :5.000 Max. :5.000
##
## Q19e Q41 Q43a Q43b
## Min. :1.000 Female:78 Selected : 32 Selected : 12
## 1st Qu.:2.000 Male :79 Not selected:125 Not selected:145
## Median :3.000 NA's : 1 NA's : 1 NA's : 1
## Mean :3.051
## 3rd Qu.:4.000
## Max. :5.000
##
## Q43c Q43d Q43e Q43f
## Selected : 5 Selected :81 Selected : 57 Selected : 1
## Not selected:152 Not selected:76 Not selected:100 Not selected:156
## NA's : 1 NA's : 1 NA's : 1 NA's : 1
##
##
##
##
## Q43g Q43h Q44
## Selected : 29 Selected : 0 1.000 – 5.000 habitants :44
## Not selected:128 Not selected:157 More than 100.000 habitants:36
## NA's : 1 NA's : 1 Less than 1.000 habitants :30
## 20.001 – 50.000 habitants :14
## 5.001 – 20.000 habitants :22
## 50.001 – 100.000 habitants :11
## NA's : 1
## Q45 Q46
## OTP banka d.d. :71 Študent/-ka :22
## Nova Ljubljanska Banka d.d. (NLB):35 Redno zaposlen/-a:73
## Delavska Hranilnica d.d. :12 Upokojen/-a :43
## Banka Intesa Sanpaolo d.d. :10 Samozaposlen/-a :13
## Deželna Banka Slovenije d.d. : 8 Delno zaposlen/-a: 3
## (Other) :18 Brezposeln/-a : 3
## NA's : 4 NA's : 1
## Q47
## Pod 1.000€ :46
## 1.000€ - 1.500€:39
## 2.001€ - 3.000€:29
## 1.501€ - 2.000€:25
## 3.001€ - 5.000€: 6
## (Other) : 4
## NA's : 9
## Q48
## Dokončana osnovna šola : 8
## Dokončana visokošolska strokovna univerzitetna izobrazba (tudi 2. bolonjska stopnja) :35
## Dokončana nižja ali srednja poklicna izobrazba :23
## Dokončana višješolska strokovna ali visokošolska strokovna izobrazba (tudi 1. bolonjska stopnja):34
## Dokončana specializacija, znanstveni magisterij, doktorat : 5
## Dokončana srednja strokovna ali splošna izobrazba :50
## NA's : 3
## Q49
## Preko linka :84
## Na tablici :37
## Na listu papirja:17
## NA's :20
##
##
##
Q1: Ali uporabljate mobilno aplikacijo banke, kjer imate odprt primarni račun (npr. KlikIn, mBank@Net in podobno)? [1 - Da, 2 - Ne]
Q2: Ali dnevno uporabljate pametni telefon? [1 - Da, 2 - Ne]
Q3: Ali uporabljate internet vsaj nekajkrat na teden? [1 - Da, 2 - Ne]
Q4: Ali je kdo od vaših družinskih članov uporabnik mobilne banke? [1 - Da, 2 - Ne, 3 - Ne vem]
Q5: Ali imate skupni bančni račun, ki ga upravlja en član družine? [1 - Da, 2 - Ne]
Q6: Zakaj ne uporabljate mobilne banke (npr. KlikIn, mBank@Net, Addiko Mobil, …)? [1 - Sploh se ne strinjam, 2 - Se ne strinjam, 3 - Niti se ne strinjam niti se strinjam, 4 - Se strinjam, 5 - Popolnoma se strinjam]
Q7: Se vam zdi, da bi potrebovali dodatno izobraževanje, preden bi lahko uporabljali mobilno banko? [1 - Da, 2 - Ne]
Q8: Kako pomembni so naslednji dejavniki pri odločitvi med poslovalnico ali mobilno aplikacijo? [1 - Sploh ni pomembno, 2 - Ni pomembno, 3 - Niti ni pomembno niti je pomembno, 4 - Je pomembno, 5 - Zelo pomembno]
Q10: Kako zaznavate podporo in svetovanje v poslovalnici in mobilni aplikaciji? [1 - Sploh se ne strinjam, 2 - Se ne strinjam, 3 - Niti niti, 4 - Se strinjam, 5 - Popolnoma se strinjam]
Q11: Kako zaznavate zagotavljanje varnosti in zaščito vaših finančnih podatkov ter transakcij v poslovalnici in mobilni aplikaciji? [1 - Sploh se ne strinjam, 2 - Se ne strinjam, 3 - Niti niti, 4 - Se strinjam, 5 - Popolnoma se strinjam]
Q12: Kako zaznavate transparentnost delovanja banke in njenih storitev v poslovalnici in mobilni aplikaciji? [1 - Sploh se ne strinjam, 2 - Se ne strinjam, 3 - Niti niti, 4 - Se strinjam, 5 - Popolnoma se strinjam]
Q13: Kako zaznavate priročnost (npr. enostavna uporaba, dostopnost od kjerkoli, hitre transakcije) v poslovalnici in mobilni aplikaciji? [1 - Sploh se ne strinjam, 2 - Se ne strinjam, 3 - Niti niti, 4 - Se strinjam, 5 - Popolnoma se strinjam]
Q14: Kako zaznavate zanesljivost opravljene storitve v poslovalnici in mobilni aplikaciji? [1 - Sploh se ne strinjam, 2 - Se ne strinjam, 3 - Niti niti, 4 - Se strinjam, 5 - Popolnoma se strinjam]
Q15: Kako zaskrbljeni ste glede naslednjih tveganj med uporabo mobilnega bančništva? [1 - Sploh me ne skrbi, 2 - Me ne skrbi, 3 - Niti me ne skrbi niti me skrbi, 4 - Me skrbi, 5 - Zelo me skrbi]
Q16: Kateri izmed spodaj navedenih dejavnikov bi vam omogočili večje zaupanje v mobilno bančništvo? [Izbira več možnih odgovorov]
Q17: Kako koristne bi se vam zdele naslednje funkcije v aplikaciji za mobilno bančništvo? [1 - Sploh ni koristna, 2 - Ni koristna, 3 - Niti ni koristna niti je koristna, 4 - Je koristna, 5 - Izjemno koristna]
Q18: Kateri finančni spodbudni ukrepi bi vas spodbudili k uporabi mobilnega bančništva? [1 - Sploh me ne bi spodbudilo, 2 - Ne bi me spodbudilo, 3 - Niti niti, 4 - Bi me spodbudilo, 5 - Zelo bi me spodbudilo]
Q19: Kateri nefinančni spodbudni ukrepi bi vas spodbudili k uporabi mobilnega bančništva? [1 - Sploh me ne bi spodbudilo, 2 - Ne bi me spodbudilo, 3 - Niti niti, 4 - Bi me spodbudilo, 5 - Zelo bi me spodbudilo]
Q41: Spol [1 - Ženska, 2 - Moški, 3 - Ne želim odgovoriti]
Q42: Prosimo, vpišite leto rojstva. [Odprto besedilo]
Q43: S kom živite v istem gospodinjstvu? [Izbira več možnih odgovorov]
Q44: Koliko prebivalcev živi v kraju, kjer prebivate? [1 - Manj kot 1.000 prebivalcev, 2 - 1.000–5.000 prebivalcev, 3 - 5.001–20.000 prebivalcev, 4 - 20.001–50.000 prebivalcev, 5 - 50.001–100.000 prebivalcev, 6 - Več kot 100.000 prebivalcev]
Q45: Katera je vaša primarna banka? [1 - Nova Ljubljanska Banka d.d. (NLB), 2 - BKS Bank AG, Bančna podružnica, 3 - OTP banka d.d., 4 - Banka Sparkasse d.d., 5 - Banka Intesa Sanpaolo d.d., 6 - UniCredit Banka Slovenija d.d., 7 - Revolut, 8 - N26, 9 - Gorenjska Banka d.d., 10 - Deželna Banka Slovenije d.d., 11 - Addiko Bank d.d., 12 - Delavska Hranilnica d.d., 13 - Drugo]
Q46: Kakšna je vaša trenutna zaposlitev? [1 - Študent/ka, 2 - Redno zaposlen/a, 3 - Upokojen/a, 4 - Brezposeln/a, 5 - Samozaposlen/a, 6 - Delno zaposlen/a]
Q47: Kakšen je vaš mesečni neto prihodek? [1 - Pod 1.000€, 2 - 1.000€–1.500€, 3 - 1.501€–2.000€, 4 - 2.001€–3.000€, 5 - 3.001€–5.000€, 6 - 5.001€–10.000€, 7 - Nad 10.000€, 8 - Ne želim odgovoriti]
Q48: Kakšna je vaša stopnja izobrazbe? [1 - Nedokončana osnovna šola, 2 - Dokončana osnovna šola, 3 - Dokončana nižja ali srednja poklicna izobrazba, 4 - Dokončana srednja strokovna ali splošna izobrazba, 5 - Dokončana višješolska strokovna ali visokošolska strokovna izobrazba (tudi 1. bolonjska stopnja), 6 - Dokončana visokošolska strokovna ali univerzitetna izobrazba (tudi 2. bolonjska stopnja), 7 - Dokončana specializacija, znanstveni magisterij ali doktorat]
Q49: Kako ste rešili anketo? [1 - Preko linka, 2 - Na tablici, 3 - Na listu papirja]
# Show the source code of every chunk in the knitted document
knitr::opts_chunk$set(echo = TRUE)
# Allow printed output to be up to 120 characters wide
options(width = 120)
#install.packages(ggplot2)
library(ggplot2)
#install.packages("ggfortify")
library(ggfortify)
#install.packages("ranger")
library(ranger)
#install.packages("dplyr")
library(dplyr)
#install.packages("Hmisc")
library(Hmisc)
##
## Attaching package: 'Hmisc'
## The following objects are masked from 'package:dplyr':
##
## src, summarize
## The following objects are masked from 'package:base':
##
## format.pval, units
#install.packages("factoextra")
library(factoextra)
## Welcome! Want to learn more? See two factoextra-related books at https://goo.gl/ve3WBa
#install.packages("cluster")
library(cluster)
#install.packages("magrittr")
library(magrittr)
#install.packages("NbClust")
library("NbClust")
library(dplyr)
# Build the clustering variables from the Q6 items.
# Three composites are row means of their items; Q6a and Q6e are used as
# single-item variables and are only renamed.
mydata <- mydata %>%
  mutate(
    Security_concerns = rowMeans(across(c(Q6b, Q6c, Q6i)), na.rm = TRUE),
    Lack_of_competence_or_support = rowMeans(across(c(Q6g, Q6h)), na.rm = TRUE),
    Preference_for_traditional_methods = rowMeans(across(c(Q6d, Q6f)), na.rm = TRUE)
  ) %>%
  rename(
    Aversion_to_change = Q6a,
    Physical_limitations = Q6e
  )
From question 6, we created 5 different variables. Three of them are composites, each computed as the mean of the grouped items (6b, 6c and 6i form Security_concerns; 6g and 6h form Lack_of_competence_or_support; 6d and 6f form Preference_for_traditional_methods), and then 6a is used as Aversion_to_change and 6e as Physical_limitations.
# Standardize the five clustering variables and store them in a separate data frame
mydata_clu_std <- mydata %>%
  dplyr::select(
    Security_concerns,
    Lack_of_competence_or_support,
    Preference_for_traditional_methods,
    Aversion_to_change,
    Physical_limitations
  ) %>%
  scale() %>%
  as.data.frame()

# Outlier screening: Euclidean length of each unit's standardized profile,
# i.e. its distance from the point where all five variables are at their mean
mydata$Dissimilarity <- with(
  mydata_clu_std,
  sqrt(
    Security_concerns^2 + Lack_of_competence_or_support^2 + Preference_for_traditional_methods^2 +
      Aversion_to_change^2 + Physical_limitations^2
  )
)

# The six units with the highest dissimilarity
mydata %>%
  arrange(desc(Dissimilarity)) %>%
  dplyr::select(ID, Dissimilarity) %>%
  head()
## # A tibble: 6 × 2
## ID Dissimilarity
## <int> <dbl>
## 1 53 4.92
## 2 1 4.06
## 3 123 3.97
## 4 44 3.94
## 5 30 3.71
## 6 72 3.64
ID53 is a potential outlier, as there is a big jump in dissimilarity between this unit and the next ones. For this reason we will remove this unit.
# Remove the outlying unit (ID 53) from the original data frame
mydata <- mydata %>%
  filter(!ID %in% c(53))
# Renumber the units; seq_len() is used instead of seq(1, n) so the
# assignment also works if the table were empty
mydata$ID <- seq_len(nrow(mydata))
# Standardize the clustering variables again, now without the removed unit.
# Note that the Dissimilarity column is not recomputed here.
mydata_clu_std <- as.data.frame(scale(mydata[c("Security_concerns", "Lack_of_competence_or_support", "Preference_for_traditional_methods", "Aversion_to_change", "Physical_limitations")]))
After removing one unit, the sample size is now 157.
# Euclidean distances between units, based on the 5 standardized cluster
# variables, saved in the object Distances.
# "euclidean" is the documented spelling and matches the call used for the
# hierarchical clustering later in this report.
Distances <- get_dist(mydata_clu_std,
                      method = "euclidean")
# Heat map of the distance matrix (dark red = small distance, white = large)
fviz_dist(Distances,
          gradient = list(low = "darkred",
                          mid = "grey95",
                          high = "white"))
We can see on the matrix of distances that some clusters are forming; I can see about 3 groups.
# Hopkins statistic of the standardized cluster variables,
# using all units but one as the sample size and without drawing the plot
mydata_clu_std %>%
  get_clust_tendency(
    n = nrow(.) - 1,
    graph = FALSE
  )
## $hopkins_stat
## [1] 0.6029504
##
## $plot
## NULL
The data are clusterable, as the Hopkins statistic is above the threshold of 0.5. If it were closer to 1, the data would be even more appropriate for clustering. The next question is how many clusters to use. I will check this with hierarchical clustering (dendrogram) and K-means clustering (elbow method, silhouette analysis and with the help of indices).
# Hierarchical clustering: Euclidean distances between the standardized units,
# merged with Ward's method ("ward.D2")
WARD <- mydata_clu_std %>%
get_dist(method = "euclidean") %>%
hclust(method = "ward.D2")
# Print a short description of the fitted hclust object
WARD
##
## Call:
## hclust(d = ., method = "ward.D2")
##
## Cluster method : ward.D2
## Distance : euclidean
## Number of objects: 157
# Draw the dendrogram of the Ward clustering
fviz_dend(WARD)
## Warning: The `<scale>` argument of `guides()` cannot be `FALSE`. Use "none" instead as of ggplot2 3.3.4.
## ℹ The deprecated feature was likely used in the factoextra package.
## Please report the issue at <https://github.com/kassambara/factoextra/issues>.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was generated.
Based on the dendrogram, we would choose 2 clusters, as this is where the biggest vertical jump occurs.
# Elbow method: total within-cluster sum of squares for different numbers of
# clusters. k-means starts from random centres, so the seed is fixed to make
# the curve reproducible between runs.
set.seed(1)
fviz_nbclust(mydata_clu_std, kmeans, method = "wss") +
  labs(subtitle = "Elbow method")
With the elbow method the slope changes most evidently at 2 clusters.
# Silhouette analysis: average silhouette width for different numbers of
# clusters. k-means starts from random centres, so the seed is fixed to make
# the plot reproducible between runs.
set.seed(1)
fviz_nbclust(mydata_clu_std, kmeans, method = "silhouette") +
  labs(subtitle = "Silhouette analysis")
The highest value of the Silhouette analysis is at 2 clusters.
# Compare 2 to 10 clusters with the full set of NbClust indices, using k-means
# on Euclidean distances. The seed is fixed because k-means uses random
# starting centres; without it the index table can change between runs.
set.seed(1)
NbClust(mydata_clu_std,
        distance = "euclidean",
        min.nc = 2, max.nc = 10,
        method = "kmeans",
        index = "all")
## *** : The Hubert index is a graphical method of determining the number of clusters.
## In the plot of Hubert index, we seek a significant knee that corresponds to a
## significant increase of the value of the measure i.e the significant peak in Hubert
## index second differences plot.
##
## *** : The D index is a graphical method of determining the number of clusters.
## In the plot of D index, we seek a significant knee (the significant peak in Dindex
## second differences plot) that corresponds to a significant increase of the value of
## the measure.
##
## *******************************************************************
## * Among all indices:
## * 10 proposed 2 as the best number of clusters
## * 2 proposed 3 as the best number of clusters
## * 3 proposed 4 as the best number of clusters
## * 1 proposed 6 as the best number of clusters
## * 1 proposed 7 as the best number of clusters
## * 4 proposed 9 as the best number of clusters
## * 2 proposed 10 as the best number of clusters
##
## ***** Conclusion *****
##
## * According to the majority rule, the best number of clusters is 2
##
##
## *******************************************************************
## $All.index
## KL CH Hartigan CCC Scott Marriot TrCovW TraceW Friedman Rubin Cindex DB Silhouette
## 2 9.5966 92.7417 23.3648 -1.9059 182.2586 22365513125 9141.421 488.0082 3.5819 1.5983 0.4213 1.2674 0.3109
## 3 0.2118 64.6236 38.3498 -4.2729 287.2586 25781592519 5810.071 424.0819 5.0700 1.8393 0.3873 1.7241 0.2346
## 4 33.4822 66.1618 14.8522 -2.2561 410.1106 20958157652 4746.590 339.5305 6.7664 2.2973 0.3518 1.4934 0.2405
## 5 0.1946 57.7712 14.4020 -2.8446 501.1125 18341844976 4146.290 309.4876 8.4125 2.5203 0.3329 1.5378 0.2127
## 6 0.3402 53.1246 18.4451 -2.6373 577.8546 16200227137 3341.184 282.7016 9.7434 2.7591 0.3193 1.5636 0.2199
## 7 170.9406 52.4031 8.2807 -1.4552 648.9874 14016730597 2585.750 251.9279 11.0928 3.0961 0.3641 1.4115 0.2343
## 8 0.0053 48.2557 17.2282 -1.9713 692.5546 13871218677 2394.293 238.7479 12.0205 3.2670 0.3331 1.5076 0.2114
## 9 11.5140 48.9287 5.5136 -0.5159 772.2003 10570689341 1798.061 214.0037 13.8227 3.6448 0.4090 1.3345 0.2287
## 10 0.4798 45.4161 7.5001 -1.0486 804.3282 10635205508 1752.332 206.3176 14.7638 3.7806 0.4041 1.4314 0.2050
## Duda Pseudot2 Beale Ratkowsky Ball Ptbiserial Frey McClain Dunn Hubert SDindex Dindex SDbw
## 2 0.9784 2.1452 0.0684 0.4324 244.0041 0.4973 0.9300 0.6734 0.0959 0.0021 1.8068 1.6568 0.9625
## 3 1.0585 -4.0338 -0.1694 0.3884 141.3606 0.4739 0.2894 1.3160 0.1174 0.0024 2.1176 1.5423 1.9356
## 4 1.3475 -15.9880 -0.7858 0.3757 84.8826 0.4900 0.6398 1.8251 0.0722 0.0029 1.9498 1.3840 0.7941
## 5 1.8725 -23.7634 -1.4063 0.3470 61.8975 0.4659 0.3248 2.3156 0.0976 0.0033 2.0907 1.3079 0.7352
## 6 1.0624 -2.2896 -0.1778 0.3258 47.1169 0.4573 0.0996 2.7357 0.0987 0.0033 2.1119 1.2557 0.4736
## 7 1.3570 -8.9441 -0.7859 0.3109 35.9897 0.4659 0.7818 2.9210 0.1167 0.0033 1.9525 1.1932 0.3572
## 8 1.7019 -12.3726 -1.2101 0.2944 29.8435 0.4378 0.0754 3.4835 0.0816 0.0035 2.0991 1.1490 0.3336
## 9 0.9718 0.6384 0.0860 0.2838 23.7782 0.4444 1.2669 3.6472 0.1444 0.0035 1.9951 1.1000 0.3151
## 10 1.1576 -1.6341 -0.3788 0.2711 20.6318 0.4187 -0.1176 4.2006 0.1728 0.0036 2.2337 1.0821 0.3061
##
## $All.CriticalValues
## CritValue_Duda CritValue_PseudoT2 Fvalue_Beale
## 2 0.6717 47.4016 0.9968
## 3 0.6107 46.5380 1.0000
## 4 0.5760 45.6335 1.0000
## 5 0.5287 45.4696 1.0000
## 6 0.5452 32.5352 1.0000
## 7 0.4864 35.9027 1.0000
## 8 0.4234 40.8610 1.0000
## 9 0.4584 25.9898 0.9943
## 10 0.2868 29.8345 1.0000
##
## $Best.nc
## KL CH Hartigan CCC Scott Marriot TrCovW TraceW Friedman Rubin Cindex DB
## Number_clusters 7.0000 2.0000 4.0000 9.0000 4.000 9 3.00 4.0000 9.0000 9.000 6.0000 2.0000
## Value_Index 170.9406 92.7417 23.4976 -0.5159 122.852 3365045502 3331.35 54.5085 1.8023 -0.242 0.3193 1.2674
## Silhouette Duda PseudoT2 Beale Ratkowsky Ball PtBiserial Frey McClain Dunn Hubert SDindex
## Number_clusters 2.0000 2.0000 2.0000 2.0000 2.0000 3.0000 2.0000 1 2.0000 10.0000 0 2.0000
## Value_Index 0.3109 0.9784 2.1452 0.0684 0.4324 102.6435 0.4973 NA 0.6734 0.1728 0 1.8068
## Dindex SDbw
## Number_clusters 0 10.0000
## Value_Index 0 0.3061
##
## $Best.partition
## [1] 1 2 1 1 2 1 1 1 2 1 1 1 1 1 2 2 1 1 1 2 1 1 1 2 1 2 1 2 2 1 2 1 2 2 1 2 1 1 1 1 1 2 1 1 2 2 1 1 2 2 2 1 2 2 1 2 1
## [58] 2 1 2 2 2 2 1 1 2 2 1 2 2 1 1 1 1 1 1 1 1 2 1 2 2 2 1 1 2 2 2 1 1 1 1 1 1 1 2 2 1 1 1 1 2 1 1 2 2 1 1 2 2 2 2 2 2
## [115] 1 2 2 2 1 2 1 1 2 1 2 2 2 2 2 1 2 2 1 2 1 2 2 2 2 2 2 2 1 2 1 1 1 1 1 1 1 1 1 1 1 1 2
We will proceed with 3 clusters.
# K-means clustering of the standardized cluster variables.
# The seed is fixed because the starting centres are drawn at random: without
# it, the cluster numbering (and possibly the partition) changes between runs,
# which would invalidate the interpretation of "group 1/2/3" below.
set.seed(1)
Clustering <- kmeans(mydata_clu_std,
                     centers = 3, # Number of groups
                     nstart = 25) # Number of attempts at different starting leader positions
Clustering
## K-means clustering with 3 clusters of sizes 45, 46, 66
##
## Cluster means:
## Security_concerns Lack_of_competence_or_support Preference_for_traditional_methods Aversion_to_change
## 1 -0.96430126 -0.5695709 -1.2244132 -0.9276589
## 2 0.09075076 -0.6448033 0.2826530 0.1622943
## 3 0.59422760 0.8377522 0.6378266 0.5193805
## Physical_limitations
## 1 -0.4679937
## 2 -0.7534631
## 3 0.8442276
##
## Clustering vector:
## [1] 1 3 1 1 3 2 2 2 3 1 2 2 2 1 3 3 1 1 3 3 2 2 1 3 2 3 1 3 3 1 2 2 3 3 2 3 2 1 2 1 1 3 2 1 3 3 2 2 3 2 3 2 3 3 1 3 2
## [58] 3 2 3 3 2 3 1 1 3 3 2 3 3 1 1 1 2 2 1 3 1 3 1 3 3 2 2 1 3 3 3 2 1 2 1 1 1 1 3 2 1 2 1 1 3 1 2 2 2 2 2 3 3 3 3 3 3
## [115] 1 3 3 3 1 3 1 1 3 1 3 3 3 3 3 3 3 3 2 2 3 3 3 3 3 3 3 3 1 2 2 1 1 2 2 2 2 2 1 1 1 1 2
##
## Within cluster sum of squares by cluster:
## [1] 141.2425 107.8430 146.6024
## (between_SS / total_SS = 49.3 %)
##
## Available components:
##
## [1] "cluster" "centers" "totss" "withinss" "tot.withinss" "betweenss" "size"
## [8] "iter" "ifault"
library(factoextra)
# Plot the k-means solution for the standardized cluster variables
fviz_cluster(
  Clustering,
  data = mydata_clu_std,
  ggtheme = theme_bw(),
  palette = "Set1",
  repel = FALSE
)
# Cluster centres: mean of each standardized cluster variable per group,
# used to describe the groups
Averages <- Clustering[["centers"]]
Averages
## Security_concerns Lack_of_competence_or_support Preference_for_traditional_methods Aversion_to_change
## 1 -0.96430126 -0.5695709 -1.2244132 -0.9276589
## 2 0.09075076 -0.6448033 0.2826530 0.1622943
## 3 0.59422760 0.8377522 0.6378266 0.5193805
## Physical_limitations
## 1 -0.4679937
## 2 -0.7534631
## 3 0.8442276
# Turn the matrix of cluster centres into a data frame, one row per cluster
Figure <- as.data.frame(Averages)
# Cluster number as an explicit column; seq_len() instead of 1:nrow() so the
# sequence can never run backwards (1:0) on an empty table
Figure$ID <- seq_len(nrow(Figure))
library(tidyr)
##
## Attaching package: 'tidyr'
## The following object is masked from 'package:magrittr':
##
## extract
# Reshape the cluster centres to long format (one row per cluster and variable)
# and add factor versions of the cluster number and of the variable name;
# the factor levels fix the order of the variables on the plot axis
Figure <- Figure %>%
  pivot_longer(cols = -ID) %>%
  mutate(
    Group = factor(ID, levels = 1:3),
    NameF = factor(
      name,
      levels = c(
        "Security_concerns",
        "Lack_of_competence_or_support",
        "Preference_for_traditional_methods",
        "Aversion_to_change",
        "Physical_limitations"
      )
    )
  )
library(ggplot2)
# Profile plot: standardized cluster means per variable, one line per group,
# with a reference line at the overall mean (0)
ggplot(Figure, aes(x = NameF, y = value)) +
  theme_bw() +
  theme(axis.text.x = element_text(angle = 45, vjust = 0.50, size = 10)) +
  geom_hline(yintercept = 0) +
  geom_point(aes(shape = Group, col = Group), size = 3) +
  geom_line(aes(group = ID), linewidth = 1) +
  scale_y_continuous(limits = c(-2.2, 2.2)) +
  labs(x = "Cluster variables", y = "Averages")
# Assign each unit to its k-means group
mydata$Group <- Clustering$cluster
#Checking if clustering variables successfully differentiate between groups
fit <- aov(cbind(Security_concerns, Lack_of_competence_or_support, Preference_for_traditional_methods, Aversion_to_change, Physical_limitations) ~ as.factor(Group),
data = mydata)
summary(fit)
## Response Security_concerns :
## Df Sum Sq Mean Sq F value Pr(>F)
## as.factor(Group) 2 56.024 28.0120 55.771 < 2.2e-16 ***
## Residuals 154 77.350 0.5023
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Response Lack_of_competence_or_support :
## Df Sum Sq Mean Sq F value Pr(>F)
## as.factor(Group) 2 123.84 61.922 81.146 < 2.2e-16 ***
## Residuals 154 117.52 0.763
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Response Preference_for_traditional_methods :
## Df Sum Sq Mean Sq F value Pr(>F)
## as.factor(Group) 2 81.174 40.587 130.06 < 2.2e-16 ***
## Residuals 154 48.057 0.312
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Response Aversion_to_change :
## Df Sum Sq Mean Sq F value Pr(>F)
## as.factor(Group) 2 80.167 40.083 45.248 3.486e-16 ***
## Residuals 154 136.423 0.886
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Response Physical_limitations :
## Df Sum Sq Mean Sq F value Pr(>F)
## as.factor(Group) 2 162.21 81.105 87.57 < 2.2e-16 ***
## Residuals 154 142.63 0.926
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
Response for Security_concerns:
H0: μ(Security_concerns, G1) = μ(Security_concerns, G2) = μ(Security_concerns, G3)
H1: At least one μ(Security_concerns, j) is different.
We can reject H0 at p < 0.001. We can reject H0 for all cluster variables at p < 0.001. Therefore we can assume that the groups are statistically different in the mean values of the cluster variables.
Next step is to check the criterion validity of the classification with variables that were not used in the clustering process. For this we chose Q2: Ali dnevno uporabljate pametni telefon? [1 - Da, 2 - Ne].
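Because both smartphone usage (Q2) and group membership are categorical variables, we use Pearson's chi-squared test of independence. We have to check two assumptions: 1. the observations are independent. 2. all expected frequencies are at least 5.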
# Pearson chi-squared test of independence between daily smartphone use (Q2)
# and cluster membership (Group); the test result is printed.
chi_square <- chisq.test(mydata$Q2, as.factor(mydata$Group))
chi_square
##
## Pearson's Chi-squared test
##
## data: mydata$Q2 and as.factor(mydata$Group)
## X-squared = 20.555, df = 2, p-value = 3.439e-05
addmargins(chi_square$observed)
## as.factor(mydata$Group)
## mydata$Q2 1 2 3 Sum
## Yes 40 41 39 120
## No 5 4 27 36
## Sum 45 45 66 156
addmargins(round(chi_square$expected, 2))
## as.factor(mydata$Group)
## mydata$Q2 1 2 3 Sum
## Yes 34.62 34.62 50.77 120.01
## No 10.38 10.38 15.23 35.99
## Sum 45.00 45.00 66.00 156.00
round(chi_square$res, 2)
## as.factor(mydata$Group)
## mydata$Q2 1 2 3
## Yes 0.92 1.09 -1.65
## No -1.67 -1.98 3.02
library(effectsize)
effectsize::cramers_v(mydata$Q2, mydata$Group)
## Cramer's V (adj.) | 95% CI
## --------------------------------
## 0.35 | [0.19, 1.00]
##
## - One-sided CIs: upper bound fixed at [1.00].
H0: There is no association between usage of mobile phone and classification of customers in 3 groups.
H1: There is association between usage of mobile phone and classification of customers in 3 groups.
We can reject H0, so there are differences. All expected frequencies are above 5, so this assumption is met.
Q41: Spol [1 - Ženska, 2 - Moški]
# Pearson chi-squared test of independence between gender (Q41) and cluster
# membership (Group); the test result is printed.
chi_square <- chisq.test(mydata$Q41, as.factor(mydata$Group))
chi_square
##
## Pearson's Chi-squared test
##
## data: mydata$Q41 and as.factor(mydata$Group)
## X-squared = 1.6566, df = 2, p-value = 0.4368
addmargins(chi_square$observed)
## as.factor(mydata$Group)
## mydata$Q41 1 2 3 Sum
## Female 22 26 30 78
## Male 23 19 36 78
## Sum 45 45 66 156
addmargins(round(chi_square$expected, 2))
## as.factor(mydata$Group)
## mydata$Q41 1 2 3 Sum
## Female 22.5 22.5 33 78
## Male 22.5 22.5 33 78
## Sum 45.0 45.0 66 156
round(chi_square$res, 2)
## as.factor(mydata$Group)
## mydata$Q41 1 2 3
## Female -0.11 0.74 -0.52
## Male 0.11 -0.74 0.52
library(effectsize)
effectsize::cramers_v(mydata$Q41, mydata$Group)
## Cramer's V (adj.) | 95% CI
## --------------------------------
## 0.00 | [0.00, 1.00]
##
## - One-sided CIs: upper bound fixed at [1.00].
H0: There is no association between gender and classification of customers in 3 groups.
H1: There is association between gender and classification of customers in 3 groups.
We cannot reject H0, so there are no differences.
Q42: Prosimo, vpišite leto rojstva. [Odprto besedilo]
# Derive each respondent's age from the reported year of birth (Q42).
mydata$Q42 <- as.numeric(as.character(mydata$Q42))
# NOTE(review): age is measured against the year in which the script is run,
# so the age groups can shift between runs — consider fixing the survey year.
current_year <- as.numeric(format(Sys.Date(), "%Y"))
mydata$Age <- current_year - mydata$Q42
# cut() builds right-closed intervals, i.e. (18,40], (40,60], (60,Inf].
# include.lowest = TRUE closes the first interval on the left as well, so a
# respondent aged exactly 18 lands in "18-40" instead of becoming NA.
mydata$AgeGroup <- cut(mydata$Age,
                       breaks = c(18, 40, 60, Inf),
                       labels = c("18-40", "41-60", "60+"),
                       include.lowest = TRUE)
# Pearson chi-squared test of independence between age group and cluster
# membership (Group); the test result is printed.
chi_square <- chisq.test(mydata$AgeGroup, as.factor(mydata$Group))
chi_square
##
## Pearson's Chi-squared test
##
## data: mydata$AgeGroup and as.factor(mydata$Group)
## X-squared = 49.906, df = 4, p-value = 3.778e-10
addmargins(chi_square$observed)
## as.factor(mydata$Group)
## mydata$AgeGroup 1 2 3 Sum
## 18-40 21 22 3 46
## 41-60 16 18 23 57
## 60+ 7 5 40 52
## Sum 44 45 66 155
addmargins(round(chi_square$expected, 2))
## as.factor(mydata$Group)
## mydata$AgeGroup 1 2 3 Sum
## 18-40 13.06 13.35 19.59 46
## 41-60 16.18 16.55 24.27 57
## 60+ 14.76 15.10 22.14 52
## Sum 44.00 45.00 66.00 155
round(chi_square$res, 2)
## as.factor(mydata$Group)
## mydata$AgeGroup 1 2 3
## 18-40 2.20 2.37 -3.75
## 41-60 -0.04 0.36 -0.26
## 60+ -2.02 -2.60 3.80
library(effectsize)
# Effect size for the age group x cluster table tested in this section.
# The call previously passed Q41 (gender), so the printed Cramer's V of 0.00
# belongs to the gender table and has to be regenerated.
effectsize::cramers_v(mydata$AgeGroup, mydata$Group)
## Cramer's V (adj.) | 95% CI
## --------------------------------
## 0.00 | [0.00, 1.00]
##
## - One-sided CIs: upper bound fixed at [1.00].
H0: There is no association between age and classification of customers in 3 groups.
H1: There is association between age and classification of customers in 3 groups.
We can reject H0, so there are differences.
Q43: S kom živite v istem gospodinjstvu? [Izbira več možnih odgovorov]
# Convert the multiple-choice household items to numeric so they can be
# compared with the answer code 1.
# NOTE(review): the columns are addressed by position (67:73) — presumably
# Q43a..Q43g; confirm, since positions shift whenever columns are added.
mydata <- mydata %>%
  mutate(across(67:73, as.numeric))

# Household type: 1 if any of Q43a-Q43f equals 1, otherwise 2 if Q43g
# equals 1. Respondents matching neither rule get NA (they were previously
# coded 0 and then silently dropped by factor()).
mydata <- mydata %>%
  mutate(
    Household = case_when(
      if_any(c(Q43a, Q43b, Q43c, Q43d, Q43e, Q43f), ~ .x == 1) ~ 1,
      Q43g == 1 ~ 2,
      TRUE ~ NA_real_
    )
  )
mydata$Household <- factor(mydata$Household,
                           levels = c(1, 2),
                           labels = c("With somebody", "Living alone"))

# Pearson chi-squared test of independence between household type and cluster
# membership (Group); the test result is printed.
chi_square <- chisq.test(mydata$Household, as.factor(mydata$Group))
chi_square
##
## Pearson's Chi-squared test
##
## data: mydata$Household and as.factor(mydata$Group)
## X-squared = 5.3091, df = 2, p-value = 0.07033
addmargins(chi_square$observed)
## as.factor(mydata$Group)
## mydata$Household 1 2 3 Sum
## With somebody 41 33 56 130
## Living alone 4 12 10 26
## Sum 45 45 66 156
addmargins(round(chi_square$expected, 2))
## as.factor(mydata$Group)
## mydata$Household 1 2 3 Sum
## With somebody 37.5 37.5 55 130
## Living alone 7.5 7.5 11 26
## Sum 45.0 45.0 66 156
round(chi_square$res, 2)
## as.factor(mydata$Group)
## mydata$Household 1 2 3
## With somebody 0.57 -0.73 0.13
## Living alone -1.28 1.64 -0.30
library(effectsize)
effectsize::cramers_v(mydata$Household, mydata$Group)
## Cramer's V (adj.) | 95% CI
## --------------------------------
## 0.15 | [0.00, 1.00]
##
## - One-sided CIs: upper bound fixed at [1.00].
H0: There is no association between living situation and classification of customers in 3 groups.
H1: There is association between living situation and classification of customers in 3 groups.
We cannot reject H0, so there are no differences.
Q44: Koliko prebivalcev živi v kraju, kjer prebivate? [1 - Manj kot 1.000 prebivalcev, 2 - 1.000–5.000 prebivalcev, 3 - 5.001–20.000 prebivalcev, 4 - 20.001–50.000 prebivalcev, 5 - 50.001–100.000 prebivalcev, 6 - Več kot 100.000 prebivalcev]
# Collapse the six settlement-size answer codes of Q44 into four bands, each
# represented by the midpoint of its population range. The vector below is
# indexed by answer code 1..6; any other code becomes NA.
mydata <- mydata %>%
  mutate(
    Q44_numeric = c(
      (1 + 1000) / 2,       # code 1
      (1001 + 5000) / 2,    # code 2
      (5001 + 50000) / 2,   # code 3
      (5001 + 50000) / 2,   # code 4
      (50001 + 285000) / 2, # code 5
      (50001 + 285000) / 2  # code 6
    )[match(as.integer(Q44), 1:6)]
  )

# Pearson chi-squared test of independence between settlement-size band and
# cluster membership (Group); the test result is printed.
chi_square <- chisq.test(mydata$Q44_numeric, as.factor(mydata$Group))
chi_square
##
## Pearson's Chi-squared test
##
## data: mydata$Q44_numeric and as.factor(mydata$Group)
## X-squared = 20.028, df = 6, p-value = 0.002738
addmargins(chi_square$observed)
## as.factor(mydata$Group)
## mydata$Q44_numeric 1 2 3 Sum
## 500.5 19 7 18 44
## 3000.5 3 15 18 36
## 27500.5 17 14 12 43
## 167500.5 6 9 18 33
## Sum 45 45 66 156
addmargins(round(chi_square$expected, 2))
## as.factor(mydata$Group)
## mydata$Q44_numeric 1 2 3 Sum
## 500.5 12.69 12.69 18.62 44.00
## 3000.5 10.38 10.38 15.23 35.99
## 27500.5 12.40 12.40 18.19 42.99
## 167500.5 9.52 9.52 13.96 33.00
## Sum 44.99 44.99 66.00 155.98
round(chi_square$res, 2)
## as.factor(mydata$Group)
## mydata$Q44_numeric 1 2 3
## 500.5 1.77 -1.60 -0.14
## 3000.5 -2.29 1.43 0.71
## 27500.5 1.31 0.45 -1.45
## 167500.5 -1.14 -0.17 1.08
library(effectsize)
effectsize::cramers_v(mydata$Q44_numeric, mydata$Group)
## Cramer's V (adj.) | 95% CI
## --------------------------------
## 0.21 | [0.00, 1.00]
##
## - One-sided CIs: upper bound fixed at [1.00].
H0: There is no association between the size of the place of residence (number of inhabitants) and classification of customers in 3 groups.
H1: There is association between the size of the place of residence (number of inhabitants) and classification of customers in 3 groups.
We can reject H0, so there are differences.
Q47: Kakšen je vaš mesečni neto prihodek? [1 - Pod 1.000€, 2 - 1.000€–1.500€, 3 - 1.501€–2.000€, 4 - 2.001€–3.000€, 5 - 3.001€–5.000€, 6 - 5.001€–10.000€, 7 - Nad 10.000€, 8 - Ne želim odgovoriti]
# Collapse the income answer codes of Q47 into two bands, each represented by
# the midpoint of its range. The vector below is indexed by answer code 1..7;
# any other code (including 8) becomes NA.
mydata <- mydata %>%
  mutate(
    Q47_numeric = c(
      rep((1 + 2000) / 2, 3),     # codes 1-3
      rep((2001 + 10000) / 2, 4)  # codes 4-7
    )[match(as.integer(Q47), 1:7)]
  )

# Pearson chi-squared test of independence between income band and cluster
# membership (Group); the test result is printed.
chi_square <- chisq.test(mydata$Q47_numeric, as.factor(mydata$Group))
chi_square
##
## Pearson's Chi-squared test
##
## data: mydata$Q47_numeric and as.factor(mydata$Group)
## X-squared = 4.9016, df = 2, p-value = 0.08622
addmargins(chi_square$observed)
## as.factor(mydata$Group)
## mydata$Q47_numeric 1 2 3 Sum
## 1000.5 28 21 27 76
## 6000.5 16 18 37 71
## Sum 44 39 64 147
addmargins(round(chi_square$expected, 2))
## as.factor(mydata$Group)
## mydata$Q47_numeric 1 2 3 Sum
## 1000.5 22.75 20.16 33.09 76
## 6000.5 21.25 18.84 30.91 71
## Sum 44.00 39.00 64.00 147
round(chi_square$res, 2)
## as.factor(mydata$Group)
## mydata$Q47_numeric 1 2 3
## 1000.5 1.10 0.19 -1.06
## 6000.5 -1.14 -0.19 1.10
library(effectsize)
effectsize::cramers_v(mydata$Q47_numeric, mydata$Group)
## Cramer's V (adj.) | 95% CI
## --------------------------------
## 0.14 | [0.00, 1.00]
##
## - One-sided CIs: upper bound fixed at [1.00].
H0: There is no association between income level and classification of customers in 3 groups.
H1: There is association between income level and classification of customers in 3 groups.
We cannot reject H0 at the 5% significance level (p = 0.086), so there is no evidence of differences.
Q3: Ali uporabljate internet vsaj nekajkrat na teden? [1 - Da, 2 - Ne]
# Pearson chi-squared test of independence between regular internet use (Q3)
# and cluster membership (Group); the test result is printed.
chi_square <- chisq.test(mydata$Q3, as.factor(mydata$Group))
chi_square
##
## Pearson's Chi-squared test
##
## data: mydata$Q3 and as.factor(mydata$Group)
## X-squared = 12.145, df = 2, p-value = 0.002305
addmargins(chi_square$observed)
## as.factor(mydata$Group)
## mydata$Q3 1 2 3 Sum
## Yes 38 44 48 130
## No 7 1 18 26
## Sum 45 45 66 156
addmargins(round(chi_square$expected, 2))
## as.factor(mydata$Group)
## mydata$Q3 1 2 3 Sum
## Yes 37.5 37.5 55 130
## No 7.5 7.5 11 26
## Sum 45.0 45.0 66 156
round(chi_square$res, 2)
## as.factor(mydata$Group)
## mydata$Q3 1 2 3
## Yes 0.08 1.06 -0.94
## No -0.18 -2.37 2.11
library(effectsize)
effectsize::cramers_v(mydata$Q3, mydata$Group)
## Cramer's V (adj.) | 95% CI
## --------------------------------
## 0.26 | [0.06, 1.00]
##
## - One-sided CIs: upper bound fixed at [1.00].
H0: There is no association between internet usage and classification of customers in 3 groups.
H1: There is association between internet usage and classification of customers in 3 groups.
We can reject H0, so there are differences.
Q4: Ali je kdo od vaših družinskih članov uporabnik mobilne banke? [1 - Da, 2 - Ne, 3 - Ne vem]
# Pearson chi-squared test of independence between having a family member who
# uses mobile banking (Q4) and cluster membership (Group); the result is printed.
chi_square <- chisq.test(mydata$Q4, as.factor(mydata$Group))
chi_square
##
## Pearson's Chi-squared test
##
## data: mydata$Q4 and as.factor(mydata$Group)
## X-squared = 4.7944, df = 4, p-value = 0.3091
addmargins(chi_square$observed)
## as.factor(mydata$Group)
## mydata$Q4 1 2 3 Sum
## Yes 28 34 40 102
## No 10 7 11 28
## I don't know 7 4 15 26
## Sum 45 45 66 156
addmargins(round(chi_square$expected, 2))
## as.factor(mydata$Group)
## mydata$Q4 1 2 3 Sum
## Yes 29.42 29.42 43.15 101.99
## No 8.08 8.08 11.85 28.01
## I don't know 7.50 7.50 11.00 26.00
## Sum 45.00 45.00 66.00 156.00
round(chi_square$res, 2)
## as.factor(mydata$Group)
## mydata$Q4 1 2 3
## Yes -0.26 0.84 -0.48
## No 0.68 -0.38 -0.25
## I don't know -0.18 -1.28 1.21
library(effectsize)
effectsize::cramers_v(mydata$Q4, mydata$Group)
## Cramer's V (adj.) | 95% CI
## --------------------------------
## 0.05 | [0.00, 1.00]
##
## - One-sided CIs: upper bound fixed at [1.00].
H0: There is no association between a family member's use of mobile banking and classification of customers in 3 groups.
H1: There is association between a family member's use of mobile banking and classification of customers in 3 groups.
We cannot reject H0, so there are no differences.
Q5: Ali imate skupni bančni račun, ki ga upravlja en član družine? [1 - Da, 2 - Ne]
# Pearson chi-squared test of independence between having a joint bank account
# managed by one family member (Q5) and cluster membership (Group); the result
# is printed.
chi_square <- chisq.test(mydata$Q5, as.factor(mydata$Group))
chi_square
##
## Pearson's Chi-squared test
##
## data: mydata$Q5 and as.factor(mydata$Group)
## X-squared = 5.4441, df = 2, p-value = 0.06574
addmargins(chi_square$observed)
## as.factor(mydata$Group)
## mydata$Q5 1 2 3 Sum
## Yes 10 9 5 24
## No 35 36 61 132
## Sum 45 45 66 156
addmargins(round(chi_square$expected, 2))
## as.factor(mydata$Group)
## mydata$Q5 1 2 3 Sum
## Yes 6.92 6.92 10.15 23.99
## No 38.08 38.08 55.85 132.01
## Sum 45.00 45.00 66.00 156.00
round(chi_square$res, 2)
## as.factor(mydata$Group)
## mydata$Q5 1 2 3
## Yes 1.17 0.79 -1.62
## No -0.50 -0.34 0.69
library(effectsize)
effectsize::cramers_v(mydata$Q5, mydata$Group)
## Cramer's V (adj.) | 95% CI
## --------------------------------
## 0.15 | [0.00, 1.00]
##
## - One-sided CIs: upper bound fixed at [1.00].
H0: There is no association between having a joint bank account and classification of customers in 3 groups.
H1: There is association between having a joint bank account and classification of customers in 3 groups.
We cannot reject H0 at the 5% significance level (p = 0.066), so there is no evidence of differences.
Q7: Se vam zdi, da bi potrebovali dodatno izobraževanje, preden bi lahko uporabljali mobilno banko? [1 - Da, 2 - Ne]
# Pearson chi-squared test of independence between the perceived need for
# additional education before using mobile banking (Q7) and cluster
# membership (Group). The call previously tested Q5 a second time.
# NOTE(review): assumes Q7 is coded as a categorical variable like Q5 —
# confirm, and regenerate the captured output, which was produced with Q5.
chi_square <- chisq.test(mydata$Q7, as.factor(mydata$Group))
chi_square
##
## Pearson's Chi-squared test
##
## data: mydata$Q5 and as.factor(mydata$Group)
## X-squared = 5.4441, df = 2, p-value = 0.06574
addmargins(chi_square$observed)
## as.factor(mydata$Group)
## mydata$Q5 1 2 3 Sum
## Yes 10 9 5 24
## No 35 36 61 132
## Sum 45 45 66 156
addmargins(round(chi_square$expected, 2))
## as.factor(mydata$Group)
## mydata$Q5 1 2 3 Sum
## Yes 6.92 6.92 10.15 23.99
## No 38.08 38.08 55.85 132.01
## Sum 45.00 45.00 66.00 156.00
round(chi_square$res, 2)
## as.factor(mydata$Group)
## mydata$Q5 1 2 3
## Yes 1.17 0.79 -1.62
## No -0.50 -0.34 0.69
library(effectsize)
# Effect size for the Q7 x cluster table of this section. The call previously
# reused Q5, so the printed value has to be regenerated.
effectsize::cramers_v(mydata$Q7, mydata$Group)
## Cramer's V (adj.) | 95% CI
## --------------------------------
## 0.15 | [0.00, 1.00]
##
## - One-sided CIs: upper bound fixed at [1.00].
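H0: There is no association between the perceived need for additional education and classification of customers in 3 groups.
H1: There is association between the perceived need for additional education and classification of customers in 3 groups.
Based on the output shown above (p = 0.066), we cannot reject H0 at the 5% significance level, so there is no evidence of differences.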
Based on five standardized variables (Security Concerns, Lack of Competence or Support, Preference for Traditional Methods, Aversion to Change, and Physical Limitations), we divided the dataset into three distinct groups using hierarchical and K-means clustering.
Group 1 (45/157 = 29%)
Group 2 (46/157 = 29%)
Group 3 (66/157 = 42%)
# Variables entering the PCA (columns 18-22) and the ratings used later for
# the perception map (columns 25-34).
# NOTE(review): columns are picked by position — confirm they still point at
# the intended variables whenever columns are added to or removed from mydata.
mydata_PCA <- mydata[18:22]
mydata_PCA2 <- mydata[25:34]

# Correlation matrix of the PCA variables.
R <- cor(x = mydata_PCA)
library(psych)
##
## Attaching package: 'psych'
## The following object is masked from 'package:effectsize':
##
## phi
## The following object is masked from 'package:Hmisc':
##
## describe
## The following objects are masked from 'package:ggplot2':
##
## %+%, alpha
corPlot(R)
library(psych)
cortest.bartlett(R, n = nrow(mydata_PCA))
## $chisq
## [1] 214.0942
##
## $p.value
## [1] 1.838694e-40
##
## $df
## [1] 10
library(psych)
KMO(R)
## Kaiser-Meyer-Olkin factor adequacy
## Call: KMO(r = R)
## Overall MSA = 0.78
## MSA for each item =
## Assistance Security Transparency Convinience Speed and Reliability
## 0.79 0.76 0.80 0.78 0.77
library(FactoMineR)
components <- PCA(mydata_PCA,
scale.unit = TRUE,
graph = FALSE)
library(factoextra)
get_eigenvalue(components)
## eigenvalue variance.percent cumulative.variance.percent
## Dim.1 2.7190702 54.381405 54.38140
## Dim.2 0.7872302 15.744603 70.12601
## Dim.3 0.6464620 12.929241 83.05525
## Dim.4 0.4411911 8.823822 91.87907
## Dim.5 0.4060465 8.120929 100.00000
Eigenvalue of first principal component for standardized variables is bigger than 1.
The first principal component alone explains more than 40% of the total variance (54.4%). We are measuring evaluations, which are subjective (soft data), and for such data the chosen number of components should explain around 40% of the total variance.
The last chosen principal component captures more than 5% of total variance of original variables (5).
fviz_eig(components,
choice = "eigenvalue",
main = "Screeplot",
ylab = "Eigenvalue",
xlab = "Principal component",
addlabels = TRUE)
When looking at the scree plot, we search for the biggest drop between consecutive eigenvalues. This drop is between components 1 and 2, so we should choose 1 principal component.
library(psych)
fa.parallel(mydata_PCA,
sim = FALSE,
fa = "pc")
## Parallel analysis suggests that the number of factors = NA and the number of components = 1
Parallel analysis suggests that we should choose 1 principal component.
Because we need to do the perception map, which has 2 principal components, we will use 2.
library(FactoMineR)
components <- PCA(mydata_PCA,
ncp = 2,
scale.unit = TRUE,
graph = FALSE)
components$var$cor
## Dim.1 Dim.2
## Assistance 0.6588985 0.58938063
## Security 0.7655438 0.33729834
## Transparency 0.7487942 -0.09960031
## Convinience 0.7262470 -0.48912635
## Speed and Reliability 0.7814975 -0.27735471
# Loadings: correlations between the original variables and the two retained
# components (one column per component).
loadings <- components$var$cor
library(factoextra)
# Eigenvalues of the first two components.
eigenvalue <- get_eigenvalue(components)[1:2, "eigenvalue"]
# Component coefficients: each component's loadings divided by the square
# root of its eigenvalue.
coefficient1 <- unname(loadings[, 1]) / sqrt(eigenvalue[1])
coefficient2 <- unname(loadings[, 2]) / sqrt(eigenvalue[2])
library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────────────────────────────────────────────── tidyverse 2.0.0 ──
## ✔ forcats 1.0.0 ✔ readr 2.1.5
## ✔ lubridate 1.9.4 ✔ stringr 1.5.1
## ✔ purrr 1.0.2 ✔ tibble 3.2.1
## ── Conflicts ────────────────────────────────────────────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ psych::%+%() masks ggplot2::%+%()
## ✖ psych::alpha() masks ggplot2::alpha()
## ✖ tidyr::extract() masks magrittr::extract()
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
## ✖ purrr::set_names() masks magrittr::set_names()
## ✖ Hmisc::src() masks dplyr::src()
## ✖ Hmisc::summarize() masks dplyr::summarize()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(psych)
# Build a dimension x retailer table of mean ratings for the perception map.
# Column names are split at "_" into a retailer part and a dimension part,
# and pivot_wider() averages the scores within each retailer/dimension pair.
# NOTE(review): assumes every column of mydata_PCA2 is named
# "<retailer>_<dimension>" with exactly one underscore — confirm.
mydata_PCAD <- mydata_PCA2 %>%
pivot_longer(everything(), names_to = "name", values_to = "score") %>%
separate(name, into = c("retailer", "dimension"), sep = "_")%>%
pivot_wider(names_from = retailer, values_from = score, values_fn = mean) %>%
column_to_rownames(var = "dimension")
# scale() standardizes each column, i.e. each retailer's mean ratings across
# the dimensions.
mydata_PCA_std <- scale(mydata_PCAD)
# Coordinates of the two retailers on the two components: weighted sums of the
# standardized ratings using the component coefficients.
# NOTE(review): presumably column 1 is the branch ("poslovalnica") and column 2
# the mobile bank ("mobilna banka"), and the row order matches the variable
# order of the coefficients — confirm.
poslovalnica1 <- sum(mydata_PCA_std[,1]*coefficient1)
mobilna_banka1 <- sum(mydata_PCA_std[,2]*coefficient1)
poslovalnica2 <- sum(mydata_PCA_std[,1]*coefficient2)
mobilna_banka2 <- sum(mydata_PCA_std[,2]*coefficient2)
library(factoextra)
# Perception map: PCA biplot showing only the variable arrows (individuals are
# hidden), with the two retailers added as labelled points at their component
# coordinates.
p <- fviz_pca_biplot(components, repel = TRUE, invisible = "ind", col.var = "#33006F")
p +
# Branch ("Poslovalnica"): point and label.
annotate("point", x = poslovalnica1, y = poslovalnica2, color = "#84BD00", size = 4, shape = 16) +
annotate("text", x = poslovalnica1, y = poslovalnica2, label = "Poslovalnica", vjust = -1, color = "#84BD00") +
# Mobile bank ("Mobilna banka"): point and label.
annotate("point", x = mobilna_banka1, y = mobilna_banka2, color = "#FA7800", size = 4, shape = 16) +
annotate("text", x = mobilna_banka1, y = mobilna_banka2, label = "Mobilna banka", vjust = -1, color = "#FA7800")
Principal component analysis was performed on 5 standardized variables (n = 157). The KMO measure confirms the appropriateness of the variables, KMO = 0.78, although the data falls into the category “Middling”. The MSA statistics for the individual variables are above 0.50 for all variables. Based on the component’s loadings, we conclude that PC1 (𝜆1 = 2.72) represents quality, while PC2 (𝜆2 = 0.79) represents the contrast between security&customer support and service efficiency&transparency.