#install.packages
library(readxl)
library(dplyr)
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
# Load the raw survey export. The path points to the author's machine and has
# to be adapted when the script is run elsewhere.
survey_path <- "~/Documents/Šola/IMB/2. semester/NLB project/anketa_končni podatki.xlsx"
mydata_excel <- read_excel(survey_path)

# The first data row holds the question wording, not an answer, so drop it.
mydata_excel <- mydata_excel[-1, ]

# Running respondent number. seq_len() stays valid even for an empty table,
# where seq(1, 0) would count backwards.
mydata_excel$ID <- seq_len(nrow(mydata_excel))

head(mydata_excel)
## # A tibble: 6 × 129
##   Q1    Q2    Q3    Q4    Q5    Q6a   Q6b   Q6c   Q6d   Q6e   Q6f   Q6g   Q6h  
##   <chr> <chr> <chr> <chr> <chr> <chr> <chr> <chr> <chr> <chr> <chr> <chr> <chr>
## 1 2     1     1     1     1     2     1     1     2     1     2     1     1    
## 2 2     1     1     2     -2    -2    -2    -2    -2    -2    -2    -2    -2   
## 3 2     1     1     1     2     4     5     5     4     3     5     5     4    
## 4 2     1     1     1     2     -3    -3    -3    -3    -3    -3    -3    -3   
## 5 2     1     1     1     2     2     3     4     4     1     2     2     3    
## 6 2     1     1     1     2     3     2     5     1     1     1     1     1    
## # ℹ 116 more variables: Q6i <chr>, Q6j <chr>, Q6j_text <chr>, Q7 <chr>,
## #   Assistance <chr>, Security <chr>, Transparency <chr>, Convinience <chr>,
## #   `Speed and Reliability` <chr>, Q8f <chr>, Q8f_text <chr>,
## #   Branch_Assistance <chr>, `Mobile bank_Assistance` <chr>,
## #   Branch_Security <chr>, `Mobile bank_Security` <chr>,
## #   Branch_Transparency <chr>, `Mobile bank_Transparency` <chr>,
## #   Branch_Convinience <chr>, `Mobile bank_Convinience` <chr>, …
# Drop every respondent that has at least one answer coded -3.
# NOTE(review): -3 presumably marks an interrupted survey - confirm against
# the export's codebook.
# Missing cells are ignored (na.rm = TRUE): any() without it returns NA for a
# row that contains missing cells but no -3, and an NA row index would replace
# that respondent with an all-NA row instead of keeping the answers.
has_minus3 <- rowSums(mydata_excel == -3, na.rm = TRUE) > 0
mydata <- mydata_excel[!has_minus3, ]

# Questions Q21 to Q40 are not used in this analysis.
mydata <- subset(mydata, select = -c(Q21:Q40))

# Renumber the remaining respondents consecutively.
mydata$ID <- seq_len(nrow(mydata))

head(mydata)
## # A tibble: 6 × 83
##   Q1    Q2    Q3    Q4    Q5    Q6a   Q6b   Q6c   Q6d   Q6e   Q6f   Q6g   Q6h  
##   <chr> <chr> <chr> <chr> <chr> <chr> <chr> <chr> <chr> <chr> <chr> <chr> <chr>
## 1 2     1     1     1     1     2     1     1     2     1     2     1     1    
## 2 2     1     1     2     -2    -2    -2    -2    -2    -2    -2    -2    -2   
## 3 2     1     1     1     2     4     5     5     4     3     5     5     4    
## 4 2     1     1     1     2     2     3     4     4     1     2     2     3    
## 5 2     1     1     1     2     3     2     5     1     1     1     1     1    
## 6 2     2     2     1     2     5     4     4     5     4     3     5     5    
## # ℹ 70 more variables: Q6i <chr>, Q6j <chr>, Q6j_text <chr>, Q7 <chr>,
## #   Assistance <chr>, Security <chr>, Transparency <chr>, Convinience <chr>,
## #   `Speed and Reliability` <chr>, Q8f <chr>, Q8f_text <chr>,
## #   Branch_Assistance <chr>, `Mobile bank_Assistance` <chr>,
## #   Branch_Security <chr>, `Mobile bank_Security` <chr>,
## #   Branch_Transparency <chr>, `Mobile bank_Transparency` <chr>,
## #   Branch_Convinience <chr>, `Mobile bank_Convinience` <chr>, …
# Recode the closed-ended answers from their numeric codes into labelled
# factors. Any code that is not listed in `levels` becomes NA.

# Yes/No questions (1 = Yes, 2 = No).
yes_no_cols <- c("Q2", "Q3", "Q5", "Q7")
mydata[yes_no_cols] <- lapply(
  mydata[yes_no_cols],
  factor,
  levels = c(1, 2),
  labels = c("Yes", "No")
)

# Q4 additionally offers "I don't know" (3).
mydata$Q4 <- factor(
  mydata$Q4,
  levels = c(1, 2, 3),
  labels = c("Yes", "No", "I don't know")
)

# Multiple-choice questions Q16 (items a-g) and Q43 (items a-h): one column
# per option, 1 = option ticked, 0 = option not ticked.
multi_choice_cols <- c(
  paste0("Q16", letters[1:7]),
  paste0("Q43", letters[1:8])
)
mydata[multi_choice_cols] <- lapply(
  mydata[multi_choice_cols],
  factor,
  levels = c(1, 0),
  labels = c("Selected", "Not selected")
)

# Gender. The questionnaire also offers code 3 ("Ne želim odgovoriti"); it is
# not listed here and therefore ends up as NA.
mydata$Q41 <- factor(
  mydata$Q41,
  levels = c(1, 2),
  labels = c("Female", "Male")
)

# Recode the demographic questions Q44-Q49 into labelled factors. The level
# order of the original recoding is kept, so existing factor codes do not
# change; answer codes from the questionnaire that were missing from `levels`
# (and were therefore silently turned into NA) are appended at the end.

# Q44: size of the place of residence.
mydata$Q44 <- factor(
  mydata$Q44,
  levels = c(2, 6, 1, 4, 3, 5),
  labels = c(
    "1.000 – 5.000 habitants",
    "More than 100.000 habitants",
    "Less than 1.000 habitants",
    "20.001 – 50.000 habitants",
    "5.001 – 20.000 habitants",
    "50.001 – 100.000 habitants"
  )
)

# Q45: primary bank. Codes 2 (BKS), 8 (N26) and 13 (Drugo) exist in the
# questionnaire and are appended so that such answers are not lost.
mydata$Q45 <- factor(
  mydata$Q45,
  levels = c(3, 5, 1, 9, 12, 7, 10, 4, 11, 6, 2, 8, 13),
  labels = c(
    "OTP banka d.d.",
    "Banka Intesa Sanpaolo d.d.",
    "Nova Ljubljanska Banka d.d. (NLB)",
    "Gorenjska Banka d.d.",
    "Delavska Hranilnica d.d.",
    "Revolut",
    "Deželna Banka Slovenije d.d.",
    "Banka Sparkasse d.d.",
    "Addiko Bank d.d.",
    "UniCredit Banka Slovenija d.d.",
    "BKS Bank AG, Bančna podružnica",
    "N26",
    "Drugo"
  )
)

# Q46: current employment status.
mydata$Q46 <- factor(
  mydata$Q46,
  levels = c(1, 2, 3, 5, 6, 4),
  labels = c(
    "Študent/-ka",
    "Redno zaposlen/-a",
    "Upokojen/-a",
    "Samozaposlen/-a",
    "Delno zaposlen/-a",
    "Brezposeln/-a"
  )
)

# Q47: monthly net income.
mydata$Q47 <- factor(
  mydata$Q47,
  levels = c(1, 5, 3, 8, 2, 4, 6, 7),
  labels = c(
    "Pod 1.000€",
    "3.001€ - 5.000€",
    "1.501€ - 2.000€",
    "I don't want to answer",
    "1.000€ - 1.500€",
    "2.001€ - 3.000€",
    "5.001€ - 10.000€",
    "Above 10.000€"
  )
)

# Q48: education level. Code 1 (Nedokončana osnovna šola) exists in the
# questionnaire and is appended so that such answers are not lost.
mydata$Q48 <- factor(
  mydata$Q48,
  levels = c(2, 6, 3, 5, 7, 4, 1),
  labels = c(
    "Dokončana osnovna šola",
    "Dokončana visokošolska strokovna univerzitetna izobrazba (tudi 2. bolonjska stopnja)",
    "Dokončana nižja ali srednja poklicna izobrazba",
    "Dokončana višješolska strokovna ali visokošolska strokovna izobrazba (tudi 1. bolonjska stopnja)",
    "Dokončana specializacija, znanstveni magisterij, doktorat",
    "Dokončana srednja strokovna ali splošna izobrazba",
    "Nedokončana osnovna šola"
  )
)

# Q49: how the survey was filled in.
mydata$Q49 <- factor(
  mydata$Q49,
  levels = c(1, 2, 3),
  labels = c("Preko linka", "Na tablici", "Na listu papirja")
)

# Column positions that are converted from text to numbers.
# NOTE(review): positions are hard-coded and silently shift if the column
# layout of the export changes - consider selecting by name.
rating_cols <- c(6:14, 18:22, 25:39, 48:55, 58:62)
mydata[rating_cols] <- mydata[rating_cols] %>%
  mutate(across(everything(), as.numeric))

# Replace the code -2 in every numeric column by the mean of that column's
# remaining values.
mydata <- mydata %>%
  mutate(across(where(is.numeric), ~ replace(., . == -2, mean(.[. != -2], na.rm = TRUE))))

# Replace remaining missing values in numeric columns by the column mean.
mydata <- mydata %>%
  mutate(across(where(is.numeric), ~ replace(., is.na(.), mean(., na.rm = TRUE))))

# Remove respondents 2, 16 and 17.
# NOTE(review): the reason for excluding these IDs is not recorded here, and
# the imputed means above still include their answers - confirm this order is
# intended.
mydata <- mydata %>%
  filter(!ID %in% c(2, 16, 17))

# Renumber the remaining respondents consecutively.
mydata$ID <- seq_len(nrow(mydata))

# Descriptive statistics for all columns except the excluded positions.
summary(mydata[c(-1, -15, -16, -23, -24, -47, -56, -57, -63, -64, -66, -75, -78, -83)])
##     Q2         Q3                 Q4         Q5           Q6a       
##  Yes :121   Yes :131   Yes         :103   Yes : 25   Min.   :1.000  
##  No  : 36   No  : 26   No          : 28   No  :132   1st Qu.:3.000  
##  NA's:  1   NA's:  1   I don't know: 26   NA's:  1   Median :3.000  
##                        NA's        :  1              Mean   :3.433  
##                                                      3rd Qu.:4.000  
##                                                      Max.   :5.000  
##                                                                     
##       Q6b             Q6c             Q6d             Q6e       
##  Min.   :1.000   Min.   :1.000   Min.   :1.000   Min.   :1.000  
##  1st Qu.:3.000   1st Qu.:3.000   1st Qu.:3.000   1st Qu.:1.000  
##  Median :4.000   Median :4.000   Median :4.000   Median :2.000  
##  Mean   :3.535   Mean   :3.847   Mean   :3.854   Mean   :2.688  
##  3rd Qu.:4.000   3rd Qu.:5.000   3rd Qu.:5.000   3rd Qu.:4.000  
##  Max.   :5.000   Max.   :5.000   Max.   :5.000   Max.   :5.000  
##                                                                 
##       Q6f             Q6g             Q6h            Q6i           Q7    
##  Min.   :1.000   Min.   :1.000   Min.   :1.00   Min.   :1.000   Yes :82  
##  1st Qu.:3.000   1st Qu.:2.000   1st Qu.:2.00   1st Qu.:3.000   No  :75  
##  Median :4.000   Median :3.086   Median :3.00   Median :4.000   NA's: 1  
##  Mean   :3.497   Mean   :3.172   Mean   :2.93   Mean   :3.465            
##  3rd Qu.:4.000   3rd Qu.:4.000   3rd Qu.:4.00   3rd Qu.:4.000            
##  Max.   :5.000   Max.   :5.000   Max.   :5.00   Max.   :5.000            
##                                                                          
##    Assistance      Security      Transparency    Convinience   
##  Min.   :1.00   Min.   :1.000   Min.   :1.000   Min.   :1.000  
##  1st Qu.:3.00   1st Qu.:4.000   1st Qu.:4.000   1st Qu.:4.000  
##  Median :4.00   Median :5.000   Median :4.000   Median :4.000  
##  Mean   :3.79   Mean   :4.382   Mean   :4.108   Mean   :3.975  
##  3rd Qu.:4.00   3rd Qu.:5.000   3rd Qu.:5.000   3rd Qu.:5.000  
##  Max.   :5.00   Max.   :5.000   Max.   :5.000   Max.   :5.000  
##                                                                
##  Speed and Reliability Branch_Assistance Mobile bank_Assistance Branch_Security
##  Min.   :1.000         Min.   :2.000     Min.   :1.00           Min.   :2.000  
##  1st Qu.:4.000         1st Qu.:4.000     1st Qu.:3.00           1st Qu.:3.986  
##  Median :4.000         Median :4.000     Median :3.00           Median :4.000  
##  Mean   :4.185         Mean   :4.006     Mean   :3.14           Mean   :3.981  
##  3rd Qu.:5.000         3rd Qu.:5.000     3rd Qu.:4.00           3rd Qu.:5.000  
##  Max.   :5.000         Max.   :5.000     Max.   :5.00           Max.   :5.000  
##                                                                                
##  Mobile bank_Security Branch_Transparency Mobile bank_Transparency
##  Min.   :1.000        Min.   :1.000       Min.   :1.000           
##  1st Qu.:2.000        1st Qu.:3.000       1st Qu.:3.000           
##  Median :3.000        Median :4.000       Median :3.000           
##  Mean   :3.032        Mean   :3.898       Mean   :3.248           
##  3rd Qu.:4.000        3rd Qu.:4.000       3rd Qu.:4.000           
##  Max.   :5.000        Max.   :5.000       Max.   :5.000           
##                                                                   
##  Branch_Convinience Mobile bank_Convinience Branch_Speed and Reliability
##  Min.   :1.000      Min.   :1.000           Min.   :1.000               
##  1st Qu.:3.000      1st Qu.:3.000           1st Qu.:3.000               
##  Median :4.000      Median :3.000           Median :4.000               
##  Mean   :3.752      Mean   :3.389           Mean   :3.701               
##  3rd Qu.:4.000      3rd Qu.:4.000           3rd Qu.:4.000               
##  Max.   :5.000      Max.   :5.000           Max.   :5.000               
##                                                                         
##  Mobile bank_Speed and Reliability      Q15a        Q15b            Q15c      
##  Min.   :1.000                     Min.   :1   Min.   :2.000   Min.   :1.000  
##  1st Qu.:3.000                     1st Qu.:4   1st Qu.:4.000   1st Qu.:4.000  
##  Median :4.000                     Median :4   Median :4.000   Median :4.000  
##  Mean   :3.586                     Mean   :4   Mean   :4.134   Mean   :4.115  
##  3rd Qu.:4.000                     3rd Qu.:5   3rd Qu.:5.000   3rd Qu.:5.000  
##  Max.   :5.000                     Max.   :5   Max.   :5.000   Max.   :5.000  
##                                                                               
##       Q15d            Q15e                 Q16a              Q16b   
##  Min.   :1.000   Min.   :2.000   Selected    :98   Selected    :66  
##  1st Qu.:3.000   1st Qu.:4.000   Not selected:59   Not selected:91  
##  Median :4.000   Median :4.000   NA's        : 1   NA's        : 1  
##  Mean   :3.809   Mean   :4.127                                      
##  3rd Qu.:4.000   3rd Qu.:5.000                                      
##  Max.   :5.000   Max.   :5.000                                      
##                                                                     
##            Q16c              Q16d              Q16e               Q16f    
##  Selected    :68   Selected    :79   Selected    : 49   Selected    : 36  
##  Not selected:89   Not selected:78   Not selected:108   Not selected:121  
##  NA's        : 1   NA's        : 1   NA's        :  1   NA's        :  1  
##                                                                           
##                                                                           
##                                                                           
##                                                                           
##            Q16g          Q17a            Q17b            Q17c      
##  Selected    :  9   Min.   :0.000   Min.   :0.000   Min.   :0.000  
##  Not selected:148   1st Qu.:4.000   1st Qu.:4.000   1st Qu.:4.000  
##  NA's        :  1   Median :4.000   Median :4.000   Median :4.000  
##                     Mean   :3.662   Mean   :3.694   Mean   :3.758  
##                     3rd Qu.:4.000   3rd Qu.:4.000   3rd Qu.:4.750  
##                     Max.   :5.000   Max.   :5.000   Max.   :5.000  
##                                                                    
##       Q17d            Q17e            Q18a           Q18b            Q18c      
##  Min.   :0.000   Min.   :0.000   Min.   :1.00   Min.   :1.000   Min.   :1.000  
##  1st Qu.:2.000   1st Qu.:4.000   1st Qu.:3.00   1st Qu.:3.000   1st Qu.:3.000  
##  Median :4.000   Median :4.000   Median :4.00   Median :4.000   Median :4.000  
##  Mean   :3.223   Mean   :3.809   Mean   :3.35   Mean   :3.732   Mean   :3.713  
##  3rd Qu.:4.000   3rd Qu.:5.000   3rd Qu.:4.00   3rd Qu.:4.000   3rd Qu.:4.000  
##  Max.   :5.000   Max.   :5.000   Max.   :5.00   Max.   :5.000   Max.   :5.000  
##                                                                                
##       Q19a            Q19b            Q19c            Q19d      
##  Min.   :1.000   Min.   :1.000   Min.   :1.000   Min.   :1.000  
##  1st Qu.:3.000   1st Qu.:3.000   1st Qu.:3.000   1st Qu.:3.000  
##  Median :3.000   Median :4.000   Median :4.000   Median :4.000  
##  Mean   :3.217   Mean   :3.548   Mean   :3.331   Mean   :3.618  
##  3rd Qu.:4.000   3rd Qu.:4.000   3rd Qu.:4.000   3rd Qu.:4.000  
##  Max.   :5.000   Max.   :5.000   Max.   :5.000   Max.   :5.000  
##                                                                 
##       Q19e           Q41               Q43a               Q43b    
##  Min.   :1.000   Female:78   Selected    : 32   Selected    : 12  
##  1st Qu.:2.000   Male  :79   Not selected:125   Not selected:145  
##  Median :3.000   NA's  : 1   NA's        :  1   NA's        :  1  
##  Mean   :3.051                                                    
##  3rd Qu.:4.000                                                    
##  Max.   :5.000                                                    
##                                                                   
##            Q43c               Q43d              Q43e               Q43f    
##  Selected    :  5   Selected    :81   Selected    : 57   Selected    :  1  
##  Not selected:152   Not selected:76   Not selected:100   Not selected:156  
##  NA's        :  1   NA's        : 1   NA's        :  1   NA's        :  1  
##                                                                            
##                                                                            
##                                                                            
##                                                                            
##            Q43g               Q43h                              Q44    
##  Selected    : 29   Selected    :  0   1.000 – 5.000 habitants    :44  
##  Not selected:128   Not selected:157   More than 100.000 habitants:36  
##  NA's        :  1   NA's        :  1   Less than 1.000 habitants  :30  
##                                        20.001 – 50.000 habitants  :14  
##                                        5.001 – 20.000 habitants   :22  
##                                        50.001 – 100.000 habitants :11  
##                                        NA's                       : 1  
##                                 Q45                    Q46    
##  OTP banka d.d.                   :71   Študent/-ka      :22  
##  Nova Ljubljanska Banka d.d. (NLB):35   Redno zaposlen/-a:73  
##  Delavska Hranilnica d.d.         :12   Upokojen/-a      :43  
##  Banka Intesa Sanpaolo d.d.       :10   Samozaposlen/-a  :13  
##  Deželna Banka Slovenije d.d.     : 8   Delno zaposlen/-a: 3  
##  (Other)                          :18   Brezposeln/-a    : 3  
##  NA's                             : 4   NA's             : 1  
##               Q47    
##  Pod 1.000€     :46  
##  1.000€ - 1.500€:39  
##  2.001€ - 3.000€:29  
##  1.501€ - 2.000€:25  
##  3.001€ - 5.000€: 6  
##  (Other)        : 4  
##  NA's           : 9  
##                                                                                                Q48    
##  Dokončana osnovna šola                                                                          : 8  
##  Dokončana visokošolska strokovna univerzitetna izobrazba (tudi 2. bolonjska stopnja)            :35  
##  Dokončana nižja ali srednja poklicna izobrazba                                                  :23  
##  Dokončana višješolska strokovna ali visokošolska strokovna izobrazba (tudi 1. bolonjska stopnja):34  
##  Dokončana specializacija, znanstveni magisterij, doktorat                                       : 5  
##  Dokončana srednja strokovna ali splošna izobrazba                                               :50  
##  NA's                                                                                            : 3  
##                Q49    
##  Preko linka     :84  
##  Na tablici      :37  
##  Na listu papirja:17  
##  NA's            :20  
##                       
##                       
## 

Q1: Ali uporabljate mobilno aplikacijo banke, kjer imate odprt primarni račun (npr. KlikIn, in podobno)? [1 - Da, 2 - Ne]

Q2: Ali dnevno uporabljate pametni telefon? [1 - Da, 2 - Ne]

Q3: Ali uporabljate internet vsaj nekajkrat na teden? [1 - Da, 2 - Ne]

Q4: Ali je kdo od vaših družinskih članov uporabnik mobilne banke? [1 - Da, 2 - Ne, 3 - Ne vem]

Q5: Ali imate skupni bančni račun, ki ga upravlja en član družine? [1 - Da, 2 - Ne]

Q6: Zakaj ne uporabljate mobilne banke (npr. KlikIn, Addiko Mobil, …)? [1 - Sploh se ne strinjam, 2 - Se ne strinjam, 3 - Niti se ne strinjam niti se strinjam, 4 - Se strinjam, 5 - Popolnoma se strinjam]

Q7: Se vam zdi, da bi potrebovali dodatno izobraževanje, preden bi lahko uporabljali mobilno banko? [1 - Da, 2 - Ne]

Q8: Kako pomembni so naslednji dejavniki pri odločitvi med poslovalnico ali mobilno aplikacijo? [1 - Sploh ni pomembno, 2 - Ni pomembno, 3 - Niti ni pomembno niti je pomembno, 4 - Je pomembno, 5 - Zelo pomembno]

Q10: Kako zaznavate podporo in svetovanje v poslovalnici in mobilni aplikaciji? [1 - Sploh se ne strinjam, 2 - Se ne strinjam, 3 - Niti niti, 4 - Se strinjam, 5 - Popolnoma se strinjam]

Q11: Kako zaznavate zagotavljanje varnosti in zaščito vaših finančnih podatkov ter transakcij v poslovalnici in mobilni aplikaciji? [1 - Sploh se ne strinjam, 2 - Se ne strinjam, 3 - Niti niti, 4 - Se strinjam, 5 - Popolnoma se strinjam]

Q12: Kako zaznavate transparentnost delovanja banke in njenih storitev v poslovalnici in mobilni aplikaciji? [1 - Sploh se ne strinjam, 2 - Se ne strinjam, 3 - Niti niti, 4 - Se strinjam, 5 - Popolnoma se strinjam]

Q13: Kako zaznavate priročnost (npr. enostavna uporaba, dostopnost od kjerkoli, hitre transakcije) v poslovalnici in mobilni aplikaciji? [1 - Sploh se ne strinjam, 2 - Se ne strinjam, 3 - Niti niti, 4 - Se strinjam, 5 - Popolnoma se strinjam]

Q14: Kako zaznavate zanesljivost opravljene storitve v poslovalnici in mobilni aplikaciji? [1 - Sploh se ne strinjam, 2 - Se ne strinjam, 3 - Niti niti, 4 - Se strinjam, 5 - Popolnoma se strinjam]

Q15: Kako zaskrbljeni ste glede naslednjih tveganj med uporabo mobilnega bančništva? [1 - Sploh me ne skrbi, 2 - Me ne skrbi, 3 - Niti me ne skrbi niti me skrbi, 4 - Me skrbi, 5 - Zelo me skrbi]

Q16: Kateri izmed spodaj navedenih dejavnikov bi vam omogočili večje zaupanje v mobilno bančništvo? [Izbira več možnih odgovorov]

Q17: Kako koristne bi se vam zdele naslednje funkcije v aplikaciji za mobilno bančništvo? [1 - Sploh ni koristna, 2 - Ni koristna, 3 - Niti ni koristna niti je koristna, 4 - Je koristna, 5 - Izjemno koristna]

Q18: Kateri finančni spodbudni ukrepi bi vas spodbudili k uporabi mobilnega bančništva? [1 - Sploh me ne bi spodbudilo, 2 - Ne bi me spodbudilo, 3 - Niti niti, 4 - Bi me spodbudilo, 5 - Zelo bi me spodbudilo]

Q19: Kateri nefinančni spodbudni ukrepi bi vas spodbudili k uporabi mobilnega bančništva? [1 - Sploh me ne bi spodbudilo, 2 - Ne bi me spodbudilo, 3 - Niti niti, 4 - Bi me spodbudilo, 5 - Zelo bi me spodbudilo]

Q41: Spol [1 - Ženska, 2 - Moški, 3 - Ne želim odgovoriti]

Q42: Prosimo, vpišite leto rojstva. [Odprto besedilo]

Q43: S kom živite v istem gospodinjstvu? [Izbira več možnih odgovorov]

Q44: Koliko prebivalcev živi v kraju, kjer prebivate? [1 - Manj kot 1.000 prebivalcev, 2 - 1.000–5.000 prebivalcev, 3 - 5.001–20.000 prebivalcev, 4 - 20.001–50.000 prebivalcev, 5 - 50.001–100.000 prebivalcev, 6 - Več kot 100.000 prebivalcev]

Q45: Katera je vaša primarna banka? [1 - Nova Ljubljanska Banka d.d. (NLB), 2 - BKS Bank AG, Bančna podružnica, 3 - OTP banka d.d., 4 - Banka Sparkasse d.d., 5 - Banka Intesa Sanpaolo d.d., 6 - UniCredit Banka Slovenija d.d., 7 - Revolut, 8 - N26, 9 - Gorenjska Banka d.d., 10 - Deželna Banka Slovenije d.d., 11 - Addiko Bank d.d., 12 - Delavska Hranilnica d.d., 13 - Drugo]

Q46: Kakšna je vaša trenutna zaposlitev? [1 - Študent/ka, 2 - Redno zaposlen/a, 3 - Upokojen/a, 4 - Brezposeln/a, 5 - Samozaposlen/a, 6 - Delno zaposlen/a]

Q47: Kakšen je vaš mesečni neto prihodek? [1 - Pod 1.000€, 2 - 1.000€–1.500€, 3 - 1.501€–2.000€, 4 - 2.001€–3.000€, 5 - 3.001€–5.000€, 6 - 5.001€–10.000€, 7 - Nad 10.000€, 8 - Ne želim odgovoriti]

Q48: Kakšna je vaša stopnja izobrazbe? [1 - Nedokončana osnovna šola, 2 - Dokončana osnovna šola, 3 - Dokončana nižja ali srednja poklicna izobrazba, 4 - Dokončana srednja strokovna ali splošna izobrazba, 5 - Dokončana višješolska strokovna ali visokošolska strokovna izobrazba (tudi 1. bolonjska stopnja), 6 - Dokončana visokošolska strokovna ali univerzitetna izobrazba (tudi 2. bolonjska stopnja), 7 - Dokončana specializacija, znanstveni magisterij ali doktorat]

Q49: Kako ste rešili anketo? [1 - Preko linka, 2 - Na tablici, 3 - Na listu papirja]

Clustering

# Report set-up: widen the console output to 120 characters and echo the
# source code of every chunk in the knitted document.
options(width = 120)
knitr::opts_chunk$set(echo = TRUE)
#install.packages(ggplot2)
library(ggplot2)
#install.packages("ggfortify")
library(ggfortify)
#install.packages("ranger")
library(ranger)
#install.packages("dplyr")
library(dplyr)
#install.packages("Hmisc")
library(Hmisc)
## 
## Attaching package: 'Hmisc'
## The following objects are masked from 'package:dplyr':
## 
##     src, summarize
## The following objects are masked from 'package:base':
## 
##     format.pval, units
#install.packages("factoextra")
library(factoextra)
## Welcome! Want to learn more? See two factoextra-related books at https://goo.gl/ve3WBa
#install.packages("cluster")
library(cluster)
#install.packages("magrittr")
library(magrittr)
#install.packages("NbClust")
library("NbClust")
library(dplyr)

# Build the five cluster variables from the Q6 items. Items are addressed by
# name rather than by column position, so the result does not depend on the
# column order of the export.
mydata <- mydata %>%
  mutate(
    # Mean of Q6b, Q6c and Q6i.
    Security_concerns = rowMeans(across(c(Q6b, Q6c, Q6i)), na.rm = TRUE),
    # Mean of Q6g and Q6h.
    Lack_of_competence_or_support = rowMeans(across(c(Q6g, Q6h)), na.rm = TRUE),
    # Mean of Q6d and Q6f.
    Preference_for_traditional_methods = rowMeans(across(c(Q6d, Q6f)), na.rm = TRUE)
  ) %>%
  # Q6a and Q6e are used as single-item variables under descriptive names.
  rename(
    Aversion_to_change = Q6a,
    Physical_limitations = Q6e
  )

From question 6, we constructed 5 variables. Three of them are composites, each computed as the mean of related items (6b, 6c and 6i form Security_concerns; 6g and 6h form Lack_of_competence_or_support; 6d and 6f form Preference_for_traditional_methods), while 6a is used on its own as Aversion_to_change and 6e as Physical_limitations.

#Saving standardized cluster variables into new data frame

# Standardize the five cluster variables (z-scores) into their own data frame.
cluster_vars <- c(
  "Security_concerns",
  "Lack_of_competence_or_support",
  "Preference_for_traditional_methods",
  "Aversion_to_change",
  "Physical_limitations"
)
mydata_clu_std <- as.data.frame(scale(mydata[cluster_vars]))

# Dissimilarity = square root of the sum of squared z-scores of each unit;
# unusually large values point to potential outliers.
squared_scores <- lapply(mydata_clu_std, function(z) z^2)
mydata$Dissimilarity <- sqrt(Reduce(`+`, squared_scores))

# Show the units with the highest dissimilarity first.
dissimilarity_rank <- order(-mydata$Dissimilarity)
head(mydata[dissimilarity_rank, c("ID", "Dissimilarity")])
## # A tibble: 6 × 2
##      ID Dissimilarity
##   <int>         <dbl>
## 1    53          4.92
## 2     1          4.06
## 3   123          3.97
## 4    44          3.94
## 5    30          3.71
## 6    72          3.64

ID53 is a potential outlier, as there is a big jump in dissimilarity between this unit and the next ones. For this reason we will remove this unit.

# Remove the outlying unit ID53 from the original data frame.
mydata <- mydata %>%
  filter(!ID %in% 53)

# Number the remaining units again. seq_len() stays valid even for an empty
# table, where seq(1, 0) would count backwards.
mydata$ID <- seq_len(nrow(mydata))

# Standardize the cluster variables again, now without the removed unit.
# The Dissimilarity column in mydata is not recomputed here.
mydata_clu_std <- as.data.frame(scale(mydata[c(
  "Security_concerns",
  "Lack_of_competence_or_support",
  "Preference_for_traditional_methods",
  "Aversion_to_change",
  "Physical_limitations"
)]))

After removing one unit (respondent ID53), the sample size is now 157.

#Finding Euclidean distances, based on 5 cluster variables, then saving them into object Distances

# Pairwise distances between units on the standardized cluster variables.
# The documented method name "euclidean" is used (the previous spelling
# "euclidian" is only accepted through a spelling fallback in stats::dist),
# which also matches the Ward clustering call later in this file.
Distances <- get_dist(mydata_clu_std,
                      method = "euclidean")

# Heat map of the distance matrix: dark red = small distance, white = large.
fviz_dist(Distances, #Showing matrix of distances
          gradient = list(low = "darkred",
                          mid = "grey95",
                          high = "white"))

We can see in the matrix of distances that some groups are forming; I see roughly 3 groups.

# Hopkins statistic for the standardized cluster variables; the plot is
# suppressed, so only the statistic is reported.
get_clust_tendency(
  data = mydata_clu_std,
  n = nrow(mydata_clu_std) - 1,
  graph = FALSE
)
## $hopkins_stat
## [1] 0.6029504
## 
## $plot
## NULL

The data are clusterable, as the Hopkins statistic (0.60) is above the threshold of 0.5. The closer it is to 1, the more appropriate the data are for clustering. The next question is how many clusters to use. I will check this with hierarchical clustering (dendrogram) and K-means clustering (elbow method, silhouette analysis and with the help of indices).

How many clusters to use?

# Hierarchical clustering of the standardized cluster variables:
# Euclidean distances between units, agglomerated with Ward's method
# ("ward.D2").
WARD <- mydata_clu_std %>%
  get_dist(method = "euclidean") %>%  
  hclust(method = "ward.D2")          

# Print a short summary of the fitted hclust object.
WARD
## 
## Call:
## hclust(d = ., method = "ward.D2")
## 
## Cluster method   : ward.D2 
## Distance         : euclidean 
## Number of objects: 157
fviz_dend(WARD)
## Warning: The `<scale>` argument of `guides()` cannot be `FALSE`. Use "none" instead as of ggplot2 3.3.4.
## ℹ The deprecated feature was likely used in the factoextra package.
##   Please report the issue at <https://github.com/kassambara/factoextra/issues>.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was generated.

Based on the dendrogram, we would choose 2 clusters, as there is the biggest jump in vertical line.

fviz_nbclust(mydata_clu_std, kmeans, method = "wss") +
  labs(subtitle = "Elbow method")

With the elbow method the slope changes most evidently at 2 clusters.

fviz_nbclust(mydata_clu_std, kmeans, method = "silhouette")+
  labs(subtitle = "Silhouette analysis")

The highest value in the silhouette analysis is at 2 clusters.

# Compute all NbClust indices for k-means solutions with 2 to 10 clusters
# (Euclidean distance) and report the number of clusters proposed by the
# majority of indices.
NbClust(mydata_clu_std, 
        distance = "euclidean", 
        min.nc = 2, max.nc = 10,
        method = "kmeans", 
        index = "all")

## *** : The Hubert index is a graphical method of determining the number of clusters.
##                 In the plot of Hubert index, we seek a significant knee that corresponds to a 
##                 significant increase of the value of the measure i.e the significant peak in Hubert
##                 index second differences plot. 
## 

## *** : The D index is a graphical method of determining the number of clusters. 
##                 In the plot of D index, we seek a significant knee (the significant peak in Dindex
##                 second differences plot) that corresponds to a significant increase of the value of
##                 the measure. 
##  
## ******************************************************************* 
## * Among all indices:                                                
## * 10 proposed 2 as the best number of clusters 
## * 2 proposed 3 as the best number of clusters 
## * 3 proposed 4 as the best number of clusters 
## * 1 proposed 6 as the best number of clusters 
## * 1 proposed 7 as the best number of clusters 
## * 4 proposed 9 as the best number of clusters 
## * 2 proposed 10 as the best number of clusters 
## 
##                    ***** Conclusion *****                            
##  
## * According to the majority rule, the best number of clusters is  2 
##  
##  
## *******************************************************************
## $All.index
##          KL      CH Hartigan     CCC    Scott     Marriot   TrCovW   TraceW Friedman  Rubin Cindex     DB Silhouette
## 2    9.5966 92.7417  23.3648 -1.9059 182.2586 22365513125 9141.421 488.0082   3.5819 1.5983 0.4213 1.2674     0.3109
## 3    0.2118 64.6236  38.3498 -4.2729 287.2586 25781592519 5810.071 424.0819   5.0700 1.8393 0.3873 1.7241     0.2346
## 4   33.4822 66.1618  14.8522 -2.2561 410.1106 20958157652 4746.590 339.5305   6.7664 2.2973 0.3518 1.4934     0.2405
## 5    0.1946 57.7712  14.4020 -2.8446 501.1125 18341844976 4146.290 309.4876   8.4125 2.5203 0.3329 1.5378     0.2127
## 6    0.3402 53.1246  18.4451 -2.6373 577.8546 16200227137 3341.184 282.7016   9.7434 2.7591 0.3193 1.5636     0.2199
## 7  170.9406 52.4031   8.2807 -1.4552 648.9874 14016730597 2585.750 251.9279  11.0928 3.0961 0.3641 1.4115     0.2343
## 8    0.0053 48.2557  17.2282 -1.9713 692.5546 13871218677 2394.293 238.7479  12.0205 3.2670 0.3331 1.5076     0.2114
## 9   11.5140 48.9287   5.5136 -0.5159 772.2003 10570689341 1798.061 214.0037  13.8227 3.6448 0.4090 1.3345     0.2287
## 10   0.4798 45.4161   7.5001 -1.0486 804.3282 10635205508 1752.332 206.3176  14.7638 3.7806 0.4041 1.4314     0.2050
##      Duda Pseudot2   Beale Ratkowsky     Ball Ptbiserial    Frey McClain   Dunn Hubert SDindex Dindex   SDbw
## 2  0.9784   2.1452  0.0684    0.4324 244.0041     0.4973  0.9300  0.6734 0.0959 0.0021  1.8068 1.6568 0.9625
## 3  1.0585  -4.0338 -0.1694    0.3884 141.3606     0.4739  0.2894  1.3160 0.1174 0.0024  2.1176 1.5423 1.9356
## 4  1.3475 -15.9880 -0.7858    0.3757  84.8826     0.4900  0.6398  1.8251 0.0722 0.0029  1.9498 1.3840 0.7941
## 5  1.8725 -23.7634 -1.4063    0.3470  61.8975     0.4659  0.3248  2.3156 0.0976 0.0033  2.0907 1.3079 0.7352
## 6  1.0624  -2.2896 -0.1778    0.3258  47.1169     0.4573  0.0996  2.7357 0.0987 0.0033  2.1119 1.2557 0.4736
## 7  1.3570  -8.9441 -0.7859    0.3109  35.9897     0.4659  0.7818  2.9210 0.1167 0.0033  1.9525 1.1932 0.3572
## 8  1.7019 -12.3726 -1.2101    0.2944  29.8435     0.4378  0.0754  3.4835 0.0816 0.0035  2.0991 1.1490 0.3336
## 9  0.9718   0.6384  0.0860    0.2838  23.7782     0.4444  1.2669  3.6472 0.1444 0.0035  1.9951 1.1000 0.3151
## 10 1.1576  -1.6341 -0.3788    0.2711  20.6318     0.4187 -0.1176  4.2006 0.1728 0.0036  2.2337 1.0821 0.3061
## 
## $All.CriticalValues
##    CritValue_Duda CritValue_PseudoT2 Fvalue_Beale
## 2          0.6717            47.4016       0.9968
## 3          0.6107            46.5380       1.0000
## 4          0.5760            45.6335       1.0000
## 5          0.5287            45.4696       1.0000
## 6          0.5452            32.5352       1.0000
## 7          0.4864            35.9027       1.0000
## 8          0.4234            40.8610       1.0000
## 9          0.4584            25.9898       0.9943
## 10         0.2868            29.8345       1.0000
## 
## $Best.nc
##                       KL      CH Hartigan     CCC   Scott    Marriot  TrCovW  TraceW Friedman  Rubin Cindex     DB
## Number_clusters   7.0000  2.0000   4.0000  9.0000   4.000          9    3.00  4.0000   9.0000  9.000 6.0000 2.0000
## Value_Index     170.9406 92.7417  23.4976 -0.5159 122.852 3365045502 3331.35 54.5085   1.8023 -0.242 0.3193 1.2674
##                 Silhouette   Duda PseudoT2  Beale Ratkowsky     Ball PtBiserial Frey McClain    Dunn Hubert SDindex
## Number_clusters     2.0000 2.0000   2.0000 2.0000    2.0000   3.0000     2.0000    1  2.0000 10.0000      0  2.0000
## Value_Index         0.3109 0.9784   2.1452 0.0684    0.4324 102.6435     0.4973   NA  0.6734  0.1728      0  1.8068
##                 Dindex    SDbw
## Number_clusters      0 10.0000
## Value_Index          0  0.3061
## 
## $Best.partition
##   [1] 1 2 1 1 2 1 1 1 2 1 1 1 1 1 2 2 1 1 1 2 1 1 1 2 1 2 1 2 2 1 2 1 2 2 1 2 1 1 1 1 1 2 1 1 2 2 1 1 2 2 2 1 2 2 1 2 1
##  [58] 2 1 2 2 2 2 1 1 2 2 1 2 2 1 1 1 1 1 1 1 1 2 1 2 2 2 1 1 2 2 2 1 1 1 1 1 1 1 2 2 1 1 1 1 2 1 1 2 2 1 1 2 2 2 2 2 2
## [115] 1 2 2 2 1 2 1 1 2 1 2 2 2 2 2 1 2 2 1 2 1 2 2 2 2 2 2 2 1 2 1 1 1 1 1 1 1 1 1 1 1 1 2

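Although the dendrogram, the elbow method, the silhouette analysis and the majority rule all point to 2 clusters, we will proceed with 3 clusters, in line with the 3 groups visible in the matrix of distances.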

K-Means Clustering

# k-means starts from random centers, so the cluster numbering (1/2/3) is
# arbitrary and can change between runs. Fixing the seed keeps the labels
# stable, which matters because all group descriptions in this report refer
# to these numbers.
# NOTE(review): the output recorded in this report was produced without a
# fixed seed; after re-running, check that the group numbers in the text
# still match.
set.seed(1)
Clustering <- kmeans(mydata_clu_std,
                     centers = 3, #Number of groups
                     nstart = 25) #Number of attempts at different starting leader positions

# Print cluster sizes, centers, assignments and sums of squares.
Clustering
## K-means clustering with 3 clusters of sizes 45, 46, 66
## 
## Cluster means:
##   Security_concerns Lack_of_competence_or_support Preference_for_traditional_methods Aversion_to_change
## 1       -0.96430126                    -0.5695709                         -1.2244132         -0.9276589
## 2        0.09075076                    -0.6448033                          0.2826530          0.1622943
## 3        0.59422760                     0.8377522                          0.6378266          0.5193805
##   Physical_limitations
## 1           -0.4679937
## 2           -0.7534631
## 3            0.8442276
## 
## Clustering vector:
##   [1] 1 3 1 1 3 2 2 2 3 1 2 2 2 1 3 3 1 1 3 3 2 2 1 3 2 3 1 3 3 1 2 2 3 3 2 3 2 1 2 1 1 3 2 1 3 3 2 2 3 2 3 2 3 3 1 3 2
##  [58] 3 2 3 3 2 3 1 1 3 3 2 3 3 1 1 1 2 2 1 3 1 3 1 3 3 2 2 1 3 3 3 2 1 2 1 1 1 1 3 2 1 2 1 1 3 1 2 2 2 2 2 3 3 3 3 3 3
## [115] 1 3 3 3 1 3 1 1 3 1 3 3 3 3 3 3 3 3 2 2 3 3 3 3 3 3 3 3 1 2 2 1 1 2 2 2 2 2 1 1 1 1 2
## 
## Within cluster sum of squares by cluster:
## [1] 141.2425 107.8430 146.6024
##  (between_SS / total_SS =  49.3 %)
## 
## Available components:
## 
## [1] "cluster"      "centers"      "totss"        "withinss"     "tot.withinss" "betweenss"    "size"        
## [8] "iter"         "ifault"
library(factoextra)
# Plot the k-means solution for the standardized cluster variables.
fviz_cluster(
  object = Clustering,
  data = mydata_clu_std,
  palette = "Set1",
  repel = FALSE,
  ggtheme = theme_bw()
)

Group profiles

Averages <- Clustering$centers
Averages #Average values of cluster variables to describe groups
##   Security_concerns Lack_of_competence_or_support Preference_for_traditional_methods Aversion_to_change
## 1       -0.96430126                    -0.5695709                         -1.2244132         -0.9276589
## 2        0.09075076                    -0.6448033                          0.2826530          0.1622943
## 3        0.59422760                     0.8377522                          0.6378266          0.5193805
##   Physical_limitations
## 1           -0.4679937
## 2           -0.7534631
## 3            0.8442276
# Cluster centers as a data frame, one row per cluster, plus a cluster ID.
Figure <- as.data.frame(Averages)
# seq_len() instead of 1:nrow(): the latter yields c(1, 0) for zero rows.
Figure$ID <- seq_len(nrow(Figure))

library(tidyr)
## 
## Attaching package: 'tidyr'
## The following object is masked from 'package:magrittr':
## 
##     extract
# Long format: one row per (cluster, cluster variable) pair, with the
# variable name in `name` and the cluster mean in `value`.
Figure <- Figure %>%
  pivot_longer(
    cols = all_of(c(
      "Security_concerns",
      "Lack_of_competence_or_support",
      "Preference_for_traditional_methods",
      "Aversion_to_change",
      "Physical_limitations"
    ))
  )

# Factors used by the profile plot: the cluster as a group label and the
# cluster variables in a fixed display order.
Figure <- Figure %>%
  mutate(
    Group = factor(ID, levels = 1:3),
    NameF = factor(
      name,
      levels = c(
        "Security_concerns",
        "Lack_of_competence_or_support",
        "Preference_for_traditional_methods",
        "Aversion_to_change",
        "Physical_limitations"
      )
    )
  )

library(ggplot2)
# Profile plot: standardized cluster means per cluster variable, one line
# per cluster, with a reference line at the overall mean (0).
ggplot(data = Figure, mapping = aes(x = NameF, y = value)) +
  geom_hline(yintercept = 0) +
  geom_point(aes(shape = Group, col = Group), size = 3) +
  geom_line(aes(group = ID), linewidth = 1) +
  labs(x = "Cluster variables", y = "Averages") +
  ylim(-2.2, 2.2) +
  theme_bw() +
  theme(axis.text.x = element_text(angle = 45, vjust = 0.50, size = 10))

# Store each unit's cluster membership in the original data frame.
mydata[["Group"]] <- Clustering[["cluster"]]

Appropriateness of cluster variables used

#Checking if clustering variables successfully differentiate between groups

# One-way ANOVA of each cluster variable on group membership; cbind() on the
# left-hand side fits all five responses in a single call.
fit <- aov(cbind(Security_concerns, Lack_of_competence_or_support, Preference_for_traditional_methods, Aversion_to_change, Physical_limitations) ~ as.factor(Group), 
           data = mydata)

# Prints a separate ANOVA table for every response variable.
summary(fit)
##  Response Security_concerns :
##                   Df Sum Sq Mean Sq F value    Pr(>F)    
## as.factor(Group)   2 56.024 28.0120  55.771 < 2.2e-16 ***
## Residuals        154 77.350  0.5023                      
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
##  Response Lack_of_competence_or_support :
##                   Df Sum Sq Mean Sq F value    Pr(>F)    
## as.factor(Group)   2 123.84  61.922  81.146 < 2.2e-16 ***
## Residuals        154 117.52   0.763                      
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
##  Response Preference_for_traditional_methods :
##                   Df Sum Sq Mean Sq F value    Pr(>F)    
## as.factor(Group)   2 81.174  40.587  130.06 < 2.2e-16 ***
## Residuals        154 48.057   0.312                      
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
##  Response Aversion_to_change :
##                   Df  Sum Sq Mean Sq F value    Pr(>F)    
## as.factor(Group)   2  80.167  40.083  45.248 3.486e-16 ***
## Residuals        154 136.423   0.886                      
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
##  Response Physical_limitations :
##                   Df Sum Sq Mean Sq F value    Pr(>F)    
## as.factor(Group)   2 162.21  81.105   87.57 < 2.2e-16 ***
## Residuals        154 142.63   0.926                      
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

Response for Security_concerns:

  • H0: μ(Security_concerns, G1) = μ(Security_concerns, G2) = μ(Security_concerns, G3)

  • H1: At least one μ(Security_concerns, j) is different.

We can reject H0 at p < 0.001. We can reject H0 for all cluster variables at p < 0.001. Therefore we can assume that the groups are statistically different in the mean values of the cluster variables.

Criterion Validity

Next step is to check the criterion validity of the classification with variables that were not used in the clustering process. For this we chose Q2: Ali dnevno uporabljate pametni telefon? [1 - Da, 2 - Ne].

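Because both the usage of mobile phones (Q2) and the group membership are categorical, we use Pearson's chi-squared test. We have to check two assumptions: 1. the observations are independent. 2. all expected frequencies are above 5.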

chi_square <- chisq.test(mydata$Q2, as.factor(mydata$Group))
chi_square
## 
##  Pearson's Chi-squared test
## 
## data:  mydata$Q2 and as.factor(mydata$Group)
## X-squared = 20.555, df = 2, p-value = 3.439e-05
addmargins(chi_square$observed)
##          as.factor(mydata$Group)
## mydata$Q2   1   2   3 Sum
##       Yes  40  41  39 120
##       No    5   4  27  36
##       Sum  45  45  66 156
addmargins(round(chi_square$expected, 2)) 
##          as.factor(mydata$Group)
## mydata$Q2     1     2     3    Sum
##       Yes 34.62 34.62 50.77 120.01
##       No  10.38 10.38 15.23  35.99
##       Sum 45.00 45.00 66.00 156.00
# Pearson residuals per cell; the full component name is used because
# `$res` only works through partial matching.
round(chi_square$residuals, 2)
##          as.factor(mydata$Group)
## mydata$Q2     1     2     3
##       Yes  0.92  1.09 -1.65
##       No  -1.67 -1.98  3.02
library(effectsize)
effectsize::cramers_v(mydata$Q2, mydata$Group)
## Cramer's V (adj.) |       95% CI
## --------------------------------
## 0.35              | [0.19, 1.00]
## 
## - One-sided CIs: upper bound fixed at [1.00].
  • H0: There is no association between usage of mobile phone and classification of customers in 3 groups.

  • H1: There is association between usage of mobile phone and classification of customers in 3 groups.

We can reject H0 (p < 0.001), so there are differences. All expected frequencies are above 5, so this assumption is met.

Q41: Spol [1 - Ženska, 2 - Moški]

chi_square <- chisq.test(mydata$Q41, as.factor(mydata$Group))
chi_square
## 
##  Pearson's Chi-squared test
## 
## data:  mydata$Q41 and as.factor(mydata$Group)
## X-squared = 1.6566, df = 2, p-value = 0.4368
addmargins(chi_square$observed)
##           as.factor(mydata$Group)
## mydata$Q41   1   2   3 Sum
##     Female  22  26  30  78
##     Male    23  19  36  78
##     Sum     45  45  66 156
addmargins(round(chi_square$expected, 2)) 
##           as.factor(mydata$Group)
## mydata$Q41    1    2  3 Sum
##     Female 22.5 22.5 33  78
##     Male   22.5 22.5 33  78
##     Sum    45.0 45.0 66 156
# Pearson residuals per cell; the full component name is used because
# `$res` only works through partial matching.
round(chi_square$residuals, 2)
##           as.factor(mydata$Group)
## mydata$Q41     1     2     3
##     Female -0.11  0.74 -0.52
##     Male    0.11 -0.74  0.52
library(effectsize)
effectsize::cramers_v(mydata$Q41, mydata$Group)
## Cramer's V (adj.) |       95% CI
## --------------------------------
## 0.00              | [0.00, 1.00]
## 
## - One-sided CIs: upper bound fixed at [1.00].
  • H0: There is no association between gender and classification of customers in 3 groups.

  • H1: There is association between gender and classification of customers in 3 groups.

We cannot reject H0, so there are no differences.

Q42: Prosimo, vpišite leto rojstva. [Odprto besedilo]

# Year of birth (open text answer) -> numeric; non-numeric answers become NA.
mydata$Q42 <- as.numeric(as.character(mydata$Q42))
# Age is computed relative to the year in which the script is run.
current_year <- as.numeric(format(Sys.Date(), "%Y"))
mydata$Age <- current_year - mydata$Q42
# cut() builds right-closed intervals, i.e. (18, 40], (40, 60], (60, Inf].
# Without include.lowest = TRUE an age of exactly 18 falls outside the first
# interval and becomes NA, although the label "18-40" promises to include it.
mydata$AgeGroup <- cut(mydata$Age,
                       breaks = c(18, 40, 60, Inf),
                       labels = c("18-40", "41-60", "60+"),
                       include.lowest = TRUE)
# Association between age group and cluster membership.
chi_square <- chisq.test(mydata$AgeGroup, as.factor(mydata$Group))
chi_square
## 
##  Pearson's Chi-squared test
## 
## data:  mydata$AgeGroup and as.factor(mydata$Group)
## X-squared = 49.906, df = 4, p-value = 3.778e-10
addmargins(chi_square$observed)
##                as.factor(mydata$Group)
## mydata$AgeGroup   1   2   3 Sum
##           18-40  21  22   3  46
##           41-60  16  18  23  57
##           60+     7   5  40  52
##           Sum    44  45  66 155
addmargins(round(chi_square$expected, 2)) 
##                as.factor(mydata$Group)
## mydata$AgeGroup     1     2     3 Sum
##           18-40 13.06 13.35 19.59  46
##           41-60 16.18 16.55 24.27  57
##           60+   14.76 15.10 22.14  52
##           Sum   44.00 45.00 66.00 155
# Pearson residuals per cell; the full component name is used because
# `$res` only works through partial matching.
round(chi_square$residuals, 2)
##                as.factor(mydata$Group)
## mydata$AgeGroup     1     2     3
##           18-40  2.20  2.37 -3.75
##           41-60 -0.04  0.36 -0.26
##           60+   -2.02 -2.60  3.80
library(effectsize)
# Effect size for the age-group test. The previous call passed Q41 (gender)
# by mistake, which is why the recorded output below repeats the gender
# result (0.00); re-run to refresh it.
effectsize::cramers_v(mydata$AgeGroup, mydata$Group)
## Cramer's V (adj.) |       95% CI
## --------------------------------
## 0.00              | [0.00, 1.00]
## 
## - One-sided CIs: upper bound fixed at [1.00].
  • H0: There is no association between age and classification of customers in 3 groups.

  • H1: There is association between age and classification of customers in 3 groups.

We can reject H0, so there are differences.

Q43: S kom živite v istem gospodinjstvu? [Izbira več možnih odgovorov]

  • Q43a: S starši
  • Q43b: S sorojenci
  • Q43c: S sorodniki (dedek, babica, stric, teta …)
  • Q43d: S partnerko/jem
  • Q43e: Z otroki
  • Q43f: S skrbnikom/i
  • Q43g: Živim sam/a
  • Q43h: Drugo
# Convert the household multiple-choice columns to numeric.
# NOTE(review): columns 67:73 are assumed to be Q43a-Q43g - confirm.
# across() replaces the superseded mutate_all().
mydata[c(67:73)] <- mydata[c(67:73)] %>% mutate(across(everything(), as.numeric))

# Collapse the multiple-choice answers into one variable. case_when() uses
# the first matching rule, so anyone who ticked at least one "living with ..."
# option (Q43a-Q43f) is coded 1 even if "living alone" (Q43g) is also ticked.
# Respondents matching no rule get 0.
mydata$Household <- case_when(
  mydata$Q43a == 1 ~ 1,
  mydata$Q43b == 1 ~ 1,
  mydata$Q43c == 1 ~ 1,
  mydata$Q43d == 1 ~ 1,
  mydata$Q43e == 1 ~ 1,
  mydata$Q43f == 1 ~ 1,
  mydata$Q43g == 1 ~ 2,
  TRUE ~ 0)
# Only codes 1 and 2 are declared as levels, so code 0 turns into NA and is
# dropped from the test below.
mydata$Household <- factor(mydata$Household,
                         levels = c(1, 2),
                         labels = c("With somebody", "Living alone"))
# Association between living situation and cluster membership.
chi_square <- chisq.test(mydata$Household, as.factor(mydata$Group))
chi_square
## 
##  Pearson's Chi-squared test
## 
## data:  mydata$Household and as.factor(mydata$Group)
## X-squared = 5.3091, df = 2, p-value = 0.07033
addmargins(chi_square$observed)
##                 as.factor(mydata$Group)
## mydata$Household   1   2   3 Sum
##    With somebody  41  33  56 130
##    Living alone    4  12  10  26
##    Sum            45  45  66 156
addmargins(round(chi_square$expected, 2)) 
##                 as.factor(mydata$Group)
## mydata$Household    1    2  3 Sum
##    With somebody 37.5 37.5 55 130
##    Living alone   7.5  7.5 11  26
##    Sum           45.0 45.0 66 156
# Pearson residuals per cell; the full component name is used because
# `$res` only works through partial matching.
round(chi_square$residuals, 2)
##                 as.factor(mydata$Group)
## mydata$Household     1     2     3
##    With somebody  0.57 -0.73  0.13
##    Living alone  -1.28  1.64 -0.30
library(effectsize)
effectsize::cramers_v(mydata$Household, mydata$Group)
## Cramer's V (adj.) |       95% CI
## --------------------------------
## 0.15              | [0.00, 1.00]
## 
## - One-sided CIs: upper bound fixed at [1.00].
  • H0: There is no association between living situation and classification of customers in 3 groups.

  • H1: There is association between living situation and classification of customers in 3 groups.

We cannot reject H0, so there are no differences.

Q44: Koliko prebivalcev živi v kraju, kjer prebivate? [1 - Manj kot 1.000 prebivalcev, 2 - 1.000–5.000 prebivalcev, 3 - 5.001–20.000 prebivalcev, 4 - 20.001–50.000 prebivalcev, 5 - 50.001–100.000 prebivalcev, 6 - Več kot 100.000 prebivalcev]

# Recode the size of the place of residence (Q44) into four classes, each
# represented by the midpoint of its merged interval: categories 3-4 and 5-6
# are combined. Values matching no rule become NA.
# NOTE(review): 285000 is used as the upper bound of the open-ended top
# category - confirm where this number comes from.
mydata <- mydata %>%
  mutate(
    Q44_numeric = as.integer(Q44),
    Q44_numeric = case_when(
      Q44_numeric %in% c(1) ~ (1 + 1000) / 2,
      Q44_numeric %in% c(2) ~ (1001 + 5000) / 2,
      Q44_numeric %in% c(3, 4) ~ (5001 + 50000) / 2,
      Q44_numeric %in% c(5, 6) ~ (50001 + 285000) / 2
    )
  )

# The midpoints act as category labels in the chi-squared test.
chi_square <- chisq.test(mydata$Q44_numeric, as.factor(mydata$Group))
chi_square
## 
##  Pearson's Chi-squared test
## 
## data:  mydata$Q44_numeric and as.factor(mydata$Group)
## X-squared = 20.028, df = 6, p-value = 0.002738
addmargins(chi_square$observed)
##                   as.factor(mydata$Group)
## mydata$Q44_numeric   1   2   3 Sum
##           500.5     19   7  18  44
##           3000.5     3  15  18  36
##           27500.5   17  14  12  43
##           167500.5   6   9  18  33
##           Sum       45  45  66 156
addmargins(round(chi_square$expected, 2)) 
##                   as.factor(mydata$Group)
## mydata$Q44_numeric     1     2     3    Sum
##           500.5    12.69 12.69 18.62  44.00
##           3000.5   10.38 10.38 15.23  35.99
##           27500.5  12.40 12.40 18.19  42.99
##           167500.5  9.52  9.52 13.96  33.00
##           Sum      44.99 44.99 66.00 155.98
# Pearson residuals per cell; the full component name is used because
# `$res` only works through partial matching.
round(chi_square$residuals, 2)
##                   as.factor(mydata$Group)
## mydata$Q44_numeric     1     2     3
##           500.5     1.77 -1.60 -0.14
##           3000.5   -2.29  1.43  0.71
##           27500.5   1.31  0.45 -1.45
##           167500.5 -1.14 -0.17  1.08
library(effectsize)
effectsize::cramers_v(mydata$Q44_numeric, mydata$Group)
## Cramer's V (adj.) |       95% CI
## --------------------------------
## 0.21              | [0.00, 1.00]
## 
## - One-sided CIs: upper bound fixed at [1.00].
  • H0: There is no association between the size of the place of residence (number of inhabitants) and classification of customers in 3 groups.

  • H1: There is association between the size of the place of residence (number of inhabitants) and classification of customers in 3 groups.

We can reject H0, so there are differences.

Q47: Kakšen je vaš mesečni neto prihodek? [1 - Pod 1.000€, 2 - 1.000€–1.500€, 3 - 1.501€–2.000€, 4 - 2.001€–3.000€, 5 - 3.001€–5.000€, 6 - 5.001€–10.000€, 7 - Nad 10.000€, 8 - Ne želim odgovoriti]

# Recode monthly net income (Q47) into two classes, each represented by the
# midpoint of its merged interval: categories 1-3 and 4-7. Category 8
# ("prefer not to answer") matches no rule and therefore becomes NA.
mydata <- mydata %>%
  mutate(
    Q47_numeric = as.integer(Q47),
    Q47_numeric = case_when(
      Q47_numeric %in% c(1, 2, 3) ~ (1 + 2000) / 2,
      Q47_numeric %in% c(4, 5, 6, 7) ~ (2001 + 10000) / 2
    )
  )

# The midpoints act as category labels in the chi-squared test.
chi_square <- chisq.test(mydata$Q47_numeric, as.factor(mydata$Group))
chi_square
## 
##  Pearson's Chi-squared test
## 
## data:  mydata$Q47_numeric and as.factor(mydata$Group)
## X-squared = 4.9016, df = 2, p-value = 0.08622
addmargins(chi_square$observed)
##                   as.factor(mydata$Group)
## mydata$Q47_numeric   1   2   3 Sum
##             1000.5  28  21  27  76
##             6000.5  16  18  37  71
##             Sum     44  39  64 147
addmargins(round(chi_square$expected, 2)) 
##                   as.factor(mydata$Group)
## mydata$Q47_numeric     1     2     3 Sum
##             1000.5 22.75 20.16 33.09  76
##             6000.5 21.25 18.84 30.91  71
##             Sum    44.00 39.00 64.00 147
# Pearson residuals per cell; the full component name is used because
# `$res` only works through partial matching.
round(chi_square$residuals, 2)
##                   as.factor(mydata$Group)
## mydata$Q47_numeric     1     2     3
##             1000.5  1.10  0.19 -1.06
##             6000.5 -1.14 -0.19  1.10
library(effectsize)
effectsize::cramers_v(mydata$Q47_numeric, mydata$Group)
## Cramer's V (adj.) |       95% CI
## --------------------------------
## 0.14              | [0.00, 1.00]
## 
## - One-sided CIs: upper bound fixed at [1.00].
  • H0: There is no association between income level and classification of customers in 3 groups.

  • H1: There is association between income level and classification of customers in 3 groups.

We cannot reject H0 (p = 0.086), so there are no differences.

Q3: Ali uporabljate internet vsaj nekajkrat na teden? [1 - Da, 2 - Ne]

chi_square <- chisq.test(mydata$Q3, as.factor(mydata$Group))
chi_square
## 
##  Pearson's Chi-squared test
## 
## data:  mydata$Q3 and as.factor(mydata$Group)
## X-squared = 12.145, df = 2, p-value = 0.002305
addmargins(chi_square$observed)
##          as.factor(mydata$Group)
## mydata$Q3   1   2   3 Sum
##       Yes  38  44  48 130
##       No    7   1  18  26
##       Sum  45  45  66 156
addmargins(round(chi_square$expected, 2)) 
##          as.factor(mydata$Group)
## mydata$Q3    1    2  3 Sum
##       Yes 37.5 37.5 55 130
##       No   7.5  7.5 11  26
##       Sum 45.0 45.0 66 156
# Pearson residuals per cell; the full component name is used because
# `$res` only works through partial matching.
round(chi_square$residuals, 2)
##          as.factor(mydata$Group)
## mydata$Q3     1     2     3
##       Yes  0.08  1.06 -0.94
##       No  -0.18 -2.37  2.11
library(effectsize)
effectsize::cramers_v(mydata$Q3, mydata$Group)
## Cramer's V (adj.) |       95% CI
## --------------------------------
## 0.26              | [0.06, 1.00]
## 
## - One-sided CIs: upper bound fixed at [1.00].
  • H0: There is no association between internet usage and classification of customers in 3 groups.

  • H1: There is association between internet usage and classification of customers in 3 groups.

We can reject H0, so there are differences.

Q4: Ali je kdo od vaših družinskih članov uporabnik mobilne banke? [1 - Da, 2 - Ne, 3 - Ne vem]

chi_square <- chisq.test(mydata$Q4, as.factor(mydata$Group))
chi_square
## 
##  Pearson's Chi-squared test
## 
## data:  mydata$Q4 and as.factor(mydata$Group)
## X-squared = 4.7944, df = 4, p-value = 0.3091
addmargins(chi_square$observed)
##               as.factor(mydata$Group)
## mydata$Q4        1   2   3 Sum
##   Yes           28  34  40 102
##   No            10   7  11  28
##   I don't know   7   4  15  26
##   Sum           45  45  66 156
addmargins(round(chi_square$expected, 2)) 
##               as.factor(mydata$Group)
## mydata$Q4          1     2     3    Sum
##   Yes          29.42 29.42 43.15 101.99
##   No            8.08  8.08 11.85  28.01
##   I don't know  7.50  7.50 11.00  26.00
##   Sum          45.00 45.00 66.00 156.00
# Pearson residuals per cell; the full component name is used because
# `$res` only works through partial matching.
round(chi_square$residuals, 2)
##               as.factor(mydata$Group)
## mydata$Q4          1     2     3
##   Yes          -0.26  0.84 -0.48
##   No            0.68 -0.38 -0.25
##   I don't know -0.18 -1.28  1.21
library(effectsize)
effectsize::cramers_v(mydata$Q4, mydata$Group)
## Cramer's V (adj.) |       95% CI
## --------------------------------
## 0.05              | [0.00, 1.00]
## 
## - One-sided CIs: upper bound fixed at [1.00].
  • H0: There is no association between a family member's use of mobile banking and classification of customers in 3 groups.

  • H1: There is association between a family member's use of mobile banking and classification of customers in 3 groups.

We cannot reject H0, so there are no differences.

Q5: Ali imate skupni bančni račun, ki ga upravlja en član družine? [1 - Da, 2 - Ne]

chi_square <- chisq.test(mydata$Q5, as.factor(mydata$Group))
chi_square
## 
##  Pearson's Chi-squared test
## 
## data:  mydata$Q5 and as.factor(mydata$Group)
## X-squared = 5.4441, df = 2, p-value = 0.06574
addmargins(chi_square$observed)
##          as.factor(mydata$Group)
## mydata$Q5   1   2   3 Sum
##       Yes  10   9   5  24
##       No   35  36  61 132
##       Sum  45  45  66 156
addmargins(round(chi_square$expected, 2)) 
##          as.factor(mydata$Group)
## mydata$Q5     1     2     3    Sum
##       Yes  6.92  6.92 10.15  23.99
##       No  38.08 38.08 55.85 132.01
##       Sum 45.00 45.00 66.00 156.00
# Pearson residuals per cell; the full component name is used because
# `$res` only works through partial matching.
round(chi_square$residuals, 2)
##          as.factor(mydata$Group)
## mydata$Q5     1     2     3
##       Yes  1.17  0.79 -1.62
##       No  -0.50 -0.34  0.69
library(effectsize)
effectsize::cramers_v(mydata$Q5, mydata$Group)
## Cramer's V (adj.) |       95% CI
## --------------------------------
## 0.15              | [0.00, 1.00]
## 
## - One-sided CIs: upper bound fixed at [1.00].
  • H0: There is no association between having a shared bank account managed by one family member and classification of customers in 3 groups.

  • H1: There is association between having a shared bank account managed by one family member and classification of customers in 3 groups.

We cannot reject H0 (p = 0.066), so there are no differences.

Q7: Se vam zdi, da bi potrebovali dodatno izobraževanje, preden bi lahko uporabljali mobilno banko? [1 - Da, 2 - Ne]

# Association between the perceived need for additional education (Q7) and
# cluster membership. The previous call tested Q5 again (copy-paste), so the
# output recorded below still shows the Q5 result until the report is re-run.
# NOTE(review): Q7 is assumed to be prepared like Q5 (Yes/No) - confirm.
chi_square <- chisq.test(mydata$Q7, as.factor(mydata$Group))
chi_square
## 
##  Pearson's Chi-squared test
## 
## data:  mydata$Q5 and as.factor(mydata$Group)
## X-squared = 5.4441, df = 2, p-value = 0.06574
addmargins(chi_square$observed)
##          as.factor(mydata$Group)
## mydata$Q5   1   2   3 Sum
##       Yes  10   9   5  24
##       No   35  36  61 132
##       Sum  45  45  66 156
addmargins(round(chi_square$expected, 2)) 
##          as.factor(mydata$Group)
## mydata$Q5     1     2     3    Sum
##       Yes  6.92  6.92 10.15  23.99
##       No  38.08 38.08 55.85 132.01
##       Sum 45.00 45.00 66.00 156.00
# Pearson residuals per cell; the full component name is used because
# `$res` only works through partial matching.
round(chi_square$residuals, 2)
##          as.factor(mydata$Group)
## mydata$Q5     1     2     3
##       Yes  1.17  0.79 -1.62
##       No  -0.50 -0.34  0.69
library(effectsize)
# Effect size for the Q7 section. The previous call passed Q5 by mistake
# (copy-paste), so the recorded output below repeats the Q5 result; re-run
# to refresh it.
effectsize::cramers_v(mydata$Q7, mydata$Group)
## Cramer's V (adj.) |       95% CI
## --------------------------------
## 0.15              | [0.00, 1.00]
## 
## - One-sided CIs: upper bound fixed at [1.00].
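  • H0: There is no association between the perceived need for additional education before using mobile banking and classification of customers in 3 groups.

  • H1: There is association between the perceived need for additional education before using mobile banking and classification of customers in 3 groups.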

We can reject H0, so there are differences.

Conclusion

Based on five standardized variables (Security Concerns, Lack of Competence or Support, Preference for Traditional Methods, Aversion to Change, and Physical Limitations), we divided the dataset into three distinct groups using hierarchical and K-means clustering.

Group 1 (45/157 = 29%)

Group 2 (46/157 = 29%)

Group 3 (66/157 = 42%)

PCA

# Input for the PCA: columns 18 to 22 of mydata.
mydata_PCA <- mydata[18:22]
# Second set of columns (25 to 34), kept in its own object.
mydata_PCA2 <- mydata[25:34]
# Correlation matrix of the PCA variables, used for the plot and the
# adequacy tests below.
R <- cor(x = mydata_PCA)

library(psych)
## 
## Attaching package: 'psych'
## The following object is masked from 'package:effectsize':
## 
##     phi
## The following object is masked from 'package:Hmisc':
## 
##     describe
## The following objects are masked from 'package:ggplot2':
## 
##     %+%, alpha
corPlot(R)

library(psych)
cortest.bartlett(R, n = nrow(mydata_PCA))
## $chisq
## [1] 214.0942
## 
## $p.value
## [1] 1.838694e-40
## 
## $df
## [1] 10
library(psych)
KMO(R)
## Kaiser-Meyer-Olkin factor adequacy
## Call: KMO(r = R)
## Overall MSA =  0.78
## MSA for each item = 
##            Assistance              Security          Transparency           Convinience Speed and Reliability 
##                  0.79                  0.76                  0.80                  0.78                  0.77
library(FactoMineR)

components <- PCA(mydata_PCA, 
                  scale.unit = TRUE, 
                  graph = FALSE)
library(factoextra)
get_eigenvalue(components)
##       eigenvalue variance.percent cumulative.variance.percent
## Dim.1  2.7190702        54.381405                    54.38140
## Dim.2  0.7872302        15.744603                    70.12601
## Dim.3  0.6464620        12.929241                    83.05525
## Dim.4  0.4411911         8.823822                    91.87907
## Dim.5  0.4060465         8.120929                   100.00000

How many components to retain?

Eigenvalue of first principal component for standardized variables is bigger than 1.

The first principal component explains more than 40% of the variance (54.4%). We measure evaluation, which is subjective, so we are dealing with soft data, for which the chosen number of components should explain around 40% of the variance.

The last retained principal component captures more than 5% of the total variance of the five original variables.

# Scree plot of the eigenvalues, with each bar labelled by its value.
fviz_eig(
  X = components,
  choice = "eigenvalue",
  addlabels = TRUE,
  main = "Screeplot",
  xlab = "Principal component",
  ylab = "Eigenvalue"
)

In the scree plot we look for the largest drop between consecutive eigenvalues. It occurs between components 1 and 2, so we should retain 1 principal component.

library(psych)
# Parallel analysis restricted to principal components (fa = "pc");
# sim = FALSE switches off the simulated-data comparison.
fa.parallel(x = mydata_PCA, fa = "pc", sim = FALSE)

## Parallel analysis suggests that the number of factors =  NA  and the number of components =  1

Parallel analysis suggests that we should choose 1 principal component.

However, because the perceptual map requires two dimensions, we retain 2 principal components.

library(FactoMineR)
# Re-fit the PCA keeping only the two components needed for the perceptual map.
components <- PCA(X = mydata_PCA, scale.unit = TRUE, ncp = 2, graph = FALSE)

# Loadings: correlation of every variable with Dim.1 and Dim.2.
components[["var"]][["cor"]]
##                           Dim.1       Dim.2
## Assistance            0.6588985  0.58938063
## Security              0.7655438  0.33729834
## Transparency          0.7487942 -0.09960031
## Convinience           0.7262470 -0.48912635
## Speed and Reliability 0.7814975 -0.27735471
# Loadings matrix: one row per variable, one column per retained component
# (Dim.1, Dim.2).
loadings <- components$var$cor
library(factoextra)
# Eigenvalues of the two retained components.
eigenvalue <- get_eigenvalue(components)[1:2, 1]
# Component weights (loading / sqrt(eigenvalue)), later multiplied with the
# standardized retailer means to place the retailers on the map.
# The loadings are selected by matrix column rather than by the linear indices
# 1:5 / 6:10, so the split between Dim.1 and Dim.2 stays correct even if the
# number of variables in the PCA changes. unname() keeps the result a plain
# numeric vector, as before.
coefficient1 <- unname(loadings[, 1]) / sqrt(eigenvalue[1])
coefficient2 <- unname(loadings[, 2]) / sqrt(eigenvalue[2])
library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────────────────────────────────────────────── tidyverse 2.0.0 ──
## ✔ forcats   1.0.0     ✔ readr     2.1.5
## ✔ lubridate 1.9.4     ✔ stringr   1.5.1
## ✔ purrr     1.0.2     ✔ tibble    3.2.1
## ── Conflicts ────────────────────────────────────────────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ psych::%+%()       masks ggplot2::%+%()
## ✖ psych::alpha()     masks ggplot2::alpha()
## ✖ tidyr::extract()   masks magrittr::extract()
## ✖ dplyr::filter()    masks stats::filter()
## ✖ dplyr::lag()       masks stats::lag()
## ✖ purrr::set_names() masks magrittr::set_names()
## ✖ Hmisc::src()       masks dplyr::src()
## ✖ Hmisc::summarize() masks dplyr::summarize()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(psych)

# Reshape the "<retailer>_<dimension>" columns into a small table with one row
# per dimension and one column per retailer, holding the mean score.
mydata_PCAD <- mydata_PCA2 %>%
  # one row per (respondent, column) pair
  pivot_longer(
    cols = everything(),
    names_to = "name",
    values_to = "score"
  ) %>%
  # split the column name at the underscore into retailer and dimension
  separate(
    col = name,
    into = c("retailer", "dimension"),
    sep = "_"
  ) %>%
  # average the scores for every dimension x retailer cell
  pivot_wider(
    names_from = retailer,
    values_from = score,
    values_fn = mean
  ) %>%
  # dimensions become row names so that only numeric columns remain
  column_to_rownames(var = "dimension")

# Standardise each retailer column of the dimension-by-retailer table.
# NOTE(review): scale() centres and scales every retailer's values across the
# dimension rows, not with the means/SDs of the variables used in the PCA;
# confirm this is the intended standardisation.
mydata_PCA_std <- scale(mydata_PCAD)

# Coordinates on the perceptual map: weighted sums of the standardized means,
# with coefficient1 for the horizontal axis and coefficient2 for the vertical.
# Assumes the rows of mydata_PCAD are in the same variable order as the
# loadings, and that column 1 is the branch and column 2 the mobile bank
# (TODO confirm).
poslovalnica1 <- sum(coefficient1 * mydata_PCA_std[, 1L])
poslovalnica2 <- sum(coefficient2 * mydata_PCA_std[, 1L])
mobilna_banka1 <- sum(coefficient1 * mydata_PCA_std[, 2L])
mobilna_banka2 <- sum(coefficient2 * mydata_PCA_std[, 2L])
library(factoextra)

# Biplot of the two retained components with the individuals hidden
# (invisible = "ind"), so that only the variables are drawn.
p <- fviz_pca_biplot(
  X = components,
  invisible = "ind",
  col.var = "#33006F",
  repel = TRUE
)

# Overlay the two retailers as labelled points at their computed coordinates.
p +
  annotate(
    geom = "point",
    x = poslovalnica1, y = poslovalnica2,
    shape = 16, size = 4, color = "#84BD00"
  ) +
  annotate(
    geom = "text",
    x = poslovalnica1, y = poslovalnica2,
    label = "Poslovalnica", color = "#84BD00", vjust = -1
  ) +
  annotate(
    geom = "point",
    x = mobilna_banka1, y = mobilna_banka2,
    shape = 16, size = 4, color = "#FA7800"
  ) +
  annotate(
    geom = "text",
    x = mobilna_banka1, y = mobilna_banka2,
    label = "Mobilna banka", color = "#FA7800", vjust = -1
  )

Principal component analysis was performed on 5 standardized variables (n = 157). The overall KMO measure (KMO = 0.78) confirms that the variables are appropriate for PCA, although this value falls only into the “Middling” category. The MSA statistics are above 0.50 for all individual variables. Two components were retained so that a two-dimensional perceptual map could be drawn; together they explain 70.1% of the total variance. Based on the component loadings, we conclude that PC1 (𝜆1 = 2.72) represents quality, while PC2 (𝜆2 = 0.79) represents the contrast between security & customer support on the one hand and service efficiency & transparency on the other.