#install.packages("readxl")
library(readxl)

## Warning: package 'readxl' was built under R version 4.4.2

library(dplyr)

## Warning: package 'dplyr' was built under R version 4.4.2

## 
## Attaching package: 'dplyr'

## The following objects are masked from 'package:stats':
## 
##     filter, lag

## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union

# Load the raw survey export.
# NOTE(review): absolute, machine-specific path -- the script only runs where
# this exact file location exists.
mydata <- read_excel("C:/Users/Pino/Desktop/IMB/MVA/Survey.xlsx")

mydata <- as.data.frame(mydata)

# Temporary row index, used only to drop the first spreadsheet row
# (presumably a question-label row -- confirm against the Excel file).
# seq_len() is used instead of seq(1, n) so a 0-row table does not yield c(1, 0).
mydata$ID <- seq_len(nrow(mydata))
mydata <- mydata %>%
  filter(!ID %in% 1)

# Renumber so that IDs run 1..n again after the removal.
mydata$ID <- seq_len(nrow(mydata))
head(mydata)

##   Q1 Q21 Q22a Q23a Q23b Q23c Q23d Q23e Q23f Q24 Q25 Q26 Q27a Q27b Q27c Q28 Q29a
## 1  1   6    3    1    1    1    1    1    1   2   3   1    1    1    0   2    3
## 2  1   5    4    1    1    1    1    1    1   1   4   1    1    0    0   2    4
## 3  1   6    4    2    1    1    1    2    2   2   3   2    0    0    1   1    4
## 4  1   6    5    1    1    1    1    1    1   1   4   1    0    0    1   2    5
## 5  1   6    5    1    1    1    1    1    1   1   4   1    0    0    1   2    5
## 6  1   6    5    2    1    1    1    1    1   4   3   1    0    1    0   1    4
##   Q29b Q29c Q29d Q30a Q30b Q30c Q30d Q30e Q31a Q31b Q33a Q33b Q33c Q33d Q33e
## 1    3    4    5    3    3    3    3    3    4    4    3    4    5    3    4
## 2    2    2    1    3    3    3    3    3    5    2    2    4    4    5    4
## 3    4    2    5    4    4    4    4    1    2    4    5    4    4    5    5
## 4    2    1    1    2    2    4    1    3    5    1    4    4    5    3    4
## 5    2    2    2    5    5    5    5    5    5    2    3    4    4    5    5
## 6    1    1    4    3    5    5    2    4    4    5    5    4    5    4    5
##   Q33f Poslovalnica_podpora_in_usmerjanje
## 1    4                                  4
## 2    5                                  4
## 3    5                                  5
## 4    5                                  5
## 5    5                                  4
## 6    4                                  4
##   Mobilna aplikacija_podpora_in_usmerjanje Poslovalnica_brezkrbnost
## 1                                        4                        3
## 2                                        3                        4
## 3                                        2                        5
## 4                                        3                        5
## 5                                        3                        4
## 6                                        2                        4
##   Mobilna aplikacija_brezkrbnost Poslovalnica_varnost
## 1                              3                    4
## 2                              4                    4
## 3                              2                    5
## 4                              4                    5
## 5                              4                    4
## 6                              3                    4
##   Mobilna aplikacija_varnost Poslovalnica_dostopnost
## 1                          3                       3
## 2                          4                       3
## 3                          3                       5
## 4                          4                       5
## 5                          3                       4
## 6                          3                       5
##   Mobilna aplikacija_dostopnost Poslovalnica_jasnost Mobilna aplikacija_jasnost
## 1                             3                    4                          4
## 2                             5                    5                          4
## 3                             2                    5                          2
## 4                             3                    5                          4
## 5                             5                    4                          4
## 6                             2                    4                          2
##   Poslovalnica_hitrost Mobilna aplikacija_hitrost                    Q40 Q41
## 1                    3                          4                     -1   2
## 2                    3                          5                     -1   2
## 3                    3                          4                     -1   2
## 4                    2                          5                     -1   2
## 5                    2                          4                     -1   2
## 6                    1                          5 Stay humble, only cash   2
##    Q42 Q43a Q43b Q43c Q43d Q43e Q43f Q43g Q43h Q44 Q45 Q45_13_text Q46 Q47 Q48
## 1 2000    1    0    0    0    0    0    0    0   2   3          -2   2   2   2
## 2 1998    0    0    0    1    0    0    0    0   6   3          -2   2   3   4
## 3 2001    1    0    0    0    0    0    0    0   2   1          -2   2   2   3
## 4 1994    0    0    0    1    0    0    0    0   6  12          -2   5   6   5
## 5 2000    1    1    0    0    0    0    0    0   2   1          -2   2   3   6
## 6 2004    1    0    0    0    0    0    0    0   3   1          -2   1   8   4
##   ID
## 1  1
## 2  2
## 3  3
## 4  4
## 5  5
## 6  6

# Coerce the rating / code columns to numeric. The positions are listed once
# (instead of on both sides of the assignment) so they cannot drift apart, and
# base lapply() replaces the superseded dplyr::mutate_all().
numeric_cols <- c(2:9, 11, 17:45, 48, 58)
mydata[numeric_cols] <- lapply(mydata[numeric_cols], as.numeric)
library(dplyr)
# Collapse the bank code Q45 into a coarse indicator:
#   1      -> 1
#   2..13  -> 2
#   other / NA -> 0
# The twelve identical "== k ~ 2" branches are expressed as one %in% test;
# %in% never returns NA, so missing values still fall through to 0.
mydata$BankF <- case_when(
  mydata$Q45 == 1 ~ 1,
  mydata$Q45 %in% 2:13 ~ 2,
  TRUE ~ 0
)

# Q26: codes 1 / 2 labelled "Da" / "Ne".
mydata[["Q26"]] <- factor(mydata[["Q26"]], levels = 1:2, labels = c("Da", "Ne"))

# Q27a-Q27c: 1 / 0 indicators; each item gets its own label for code 1 and
# the shared label "Ne" for code 0.
mydata[c("Q27a", "Q27b", "Q27c")] <- Map(
  function(answers, selected_label) {
    factor(answers, levels = c(1, 0), labels = c(selected_label, "Ne"))
  },
  mydata[c("Q27a", "Q27b", "Q27c")],
  c("V mobilni aplikaciji", "V poslovalnici", "Nisem upiorabljal/a")
)

# Q28: codes 1 / 2 labelled with the two channels.
mydata[["Q28"]] <- factor(
  mydata[["Q28"]],
  levels = 1:2,
  labels = c("V poslovalnici", "V mobilni aplikaciji")
)

# Summary of columns 2 to 45.
summary(mydata[2:45])

##       Q21             Q22a            Q23a            Q23b      
##  Min.   :1.000   Min.   :1.000   Min.   :1.000   Min.   :1.000  
##  1st Qu.:4.000   1st Qu.:4.000   1st Qu.:1.000   1st Qu.:1.000  
##  Median :5.000   Median :5.000   Median :1.000   Median :1.000  
##  Mean   :4.502   Mean   :4.325   Mean   :1.381   Mean   :1.192  
##  3rd Qu.:5.000   3rd Qu.:5.000   3rd Qu.:2.000   3rd Qu.:1.000  
##  Max.   :6.000   Max.   :5.000   Max.   :2.000   Max.   :2.000  
##       Q23c            Q23d            Q23e            Q23f      
##  Min.   :1.000   Min.   :1.000   Min.   :1.000   Min.   :1.000  
##  1st Qu.:1.000   1st Qu.:1.000   1st Qu.:1.000   1st Qu.:1.000  
##  Median :1.000   Median :1.000   Median :1.000   Median :2.000  
##  Mean   :1.283   Mean   :1.181   Mean   :1.355   Mean   :1.543  
##  3rd Qu.:2.000   3rd Qu.:1.000   3rd Qu.:2.000   3rd Qu.:2.000  
##  Max.   :2.000   Max.   :2.000   Max.   :2.000   Max.   :2.000  
##      Q24                 Q25        Q26                        Q27a    
##  Length:265         Min.   :1.000   Da:141   V mobilni aplikaciji:106  
##  Class :character   1st Qu.:3.000   Ne:124   Ne                  :159  
##  Mode  :character   Median :4.000                                      
##                     Mean   :3.498                                      
##                     3rd Qu.:4.000                                      
##                     Max.   :5.000                                      
##              Q27b                      Q27c                       Q28     
##  V poslovalnici: 67   Nisem upiorabljal/a:114   V poslovalnici      : 96  
##  Ne            :198   Ne                 :151   V mobilni aplikaciji:169  
##                                                                           
##                                                                           
##                                                                           
##                                                                           
##       Q29a            Q29b            Q29c            Q29d           Q30a      
##  Min.   :1.000   Min.   :1.000   Min.   :1.000   Min.   :1.00   Min.   :1.000  
##  1st Qu.:3.000   1st Qu.:2.000   1st Qu.:1.000   1st Qu.:2.00   1st Qu.:2.000  
##  Median :4.000   Median :3.000   Median :2.000   Median :3.00   Median :3.000  
##  Mean   :3.438   Mean   :2.728   Mean   :2.596   Mean   :2.97   Mean   :3.072  
##  3rd Qu.:4.000   3rd Qu.:4.000   3rd Qu.:4.000   3rd Qu.:4.00   3rd Qu.:4.000  
##  Max.   :5.000   Max.   :5.000   Max.   :5.000   Max.   :5.00   Max.   :5.000  
##       Q30b            Q30c            Q30d            Q30e      
##  Min.   :1.000   Min.   :1.000   Min.   :1.000   Min.   :1.000  
##  1st Qu.:3.000   1st Qu.:3.000   1st Qu.:2.000   1st Qu.:2.000  
##  Median :4.000   Median :4.000   Median :3.000   Median :3.000  
##  Mean   :3.377   Mean   :3.426   Mean   :3.117   Mean   :2.932  
##  3rd Qu.:4.000   3rd Qu.:4.000   3rd Qu.:4.000   3rd Qu.:4.000  
##  Max.   :5.000   Max.   :5.000   Max.   :5.000   Max.   :5.000  
##       Q31a            Q31b            Q33a           Q33b            Q33c      
##  Min.   :1.000   Min.   :1.000   Min.   :1.00   Min.   :1.000   Min.   :1.000  
##  1st Qu.:4.000   1st Qu.:2.000   1st Qu.:3.00   1st Qu.:4.000   1st Qu.:4.000  
##  Median :4.000   Median :2.000   Median :4.00   Median :4.000   Median :5.000  
##  Mean   :4.128   Mean   :2.506   Mean   :3.83   Mean   :4.075   Mean   :4.264  
##  3rd Qu.:5.000   3rd Qu.:3.000   3rd Qu.:5.00   3rd Qu.:5.000   3rd Qu.:5.000  
##  Max.   :5.000   Max.   :5.000   Max.   :5.00   Max.   :5.000   Max.   :5.000  
##       Q33d            Q33e            Q33f      
##  Min.   :1.000   Min.   :1.000   Min.   :1.000  
##  1st Qu.:4.000   1st Qu.:4.000   1st Qu.:4.000  
##  Median :4.000   Median :4.000   Median :4.000  
##  Mean   :4.042   Mean   :4.023   Mean   :4.219  
##  3rd Qu.:5.000   3rd Qu.:5.000   3rd Qu.:5.000  
##  Max.   :5.000   Max.   :5.000   Max.   :5.000  
##  Poslovalnica_podpora_in_usmerjanje Mobilna aplikacija_podpora_in_usmerjanje
##  Min.   :1.000                      Min.   :1.000                           
##  1st Qu.:3.000                      1st Qu.:3.000                           
##  Median :4.000                      Median :3.000                           
##  Mean   :3.811                      Mean   :3.125                           
##  3rd Qu.:4.000                      3rd Qu.:4.000                           
##  Max.   :5.000                      Max.   :5.000                           
##  Poslovalnica_brezkrbnost Mobilna aplikacija_brezkrbnost Poslovalnica_varnost
##  Min.   :1.000            Min.   :1.000                  Min.   :1.000       
##  1st Qu.:3.000            1st Qu.:3.000                  1st Qu.:4.000       
##  Median :4.000            Median :4.000                  Median :4.000       
##  Mean   :3.921            Mean   :3.551                  Mean   :4.211       
##  3rd Qu.:5.000            3rd Qu.:4.000                  3rd Qu.:5.000       
##  Max.   :5.000            Max.   :5.000                  Max.   :5.000       
##  Mobilna aplikacija_varnost Poslovalnica_dostopnost
##  Min.   :1.000              Min.   :1.000          
##  1st Qu.:3.000              1st Qu.:3.000          
##  Median :4.000              Median :4.000          
##  Mean   :3.698              Mean   :3.785          
##  3rd Qu.:4.000              3rd Qu.:5.000          
##  Max.   :5.000              Max.   :5.000          
##  Mobilna aplikacija_dostopnost Poslovalnica_jasnost Mobilna aplikacija_jasnost
##  Min.   :1.000                 Min.   :1.000        Min.   :1.000             
##  1st Qu.:3.000                 1st Qu.:3.000        1st Qu.:3.000             
##  Median :4.000                 Median :4.000        Median :3.000             
##  Mean   :3.792                 Mean   :3.913        Mean   :3.347             
##  3rd Qu.:4.000                 3rd Qu.:5.000        3rd Qu.:4.000             
##  Max.   :5.000                 Max.   :5.000        Max.   :5.000             
##  Poslovalnica_hitrost Mobilna aplikacija_hitrost
##  Min.   :1.000        Min.   :1.0               
##  1st Qu.:2.000        1st Qu.:4.0               
##  Median :3.000        Median :4.0               
##  Mean   :2.891        Mean   :4.2               
##  3rd Qu.:4.000        3rd Qu.:5.0               
##  Max.   :5.000        Max.   :5.0

# Q41: codes 1 / 2 / 3 with their answer labels.
mydata$Q41 <- factor(mydata$Q41,
                     levels = c(1, 2, 3),
                     labels = c("Female", "Male", "I don't want to answer"))

# Q43a-Q43h share one coding (1 = selected, 0 = not selected), so they are
# converted in a single pass rather than in eight copy-pasted calls.
q43_items <- paste0("Q43", letters[1:8])
mydata[q43_items] <- lapply(
  mydata[q43_items],
  factor,
  levels = c(1, 0),
  labels = c("Selected", "Not selected")
)

# Q44: six ordered size bands, codes 1..6.
mydata[["Q44"]] <- factor(
  mydata[["Q44"]],
  levels = 1:6,
  labels = c(
    "Less than 1.000 habitants",
    "1.000 – 5.000 habitants",
    "5.001 – 20.000 habitants",
    "20.001 – 50.000 habitants",
    "50.001 – 100.000 habitants",
    "More than 100.000 habitants"
  )
)

# Q45: bank codes. Only the ten codes listed below receive a label; any other
# value (e.g. 2, 8 or 13) becomes NA in the factor.
# NOTE(review): confirm those codes do not occur in the data or add labels.
mydata[["Q45"]] <- factor(
  mydata[["Q45"]],
  levels = c(3, 5, 1, 9, 12, 7, 10, 4, 11, 6),
  labels = c(
    "OTP banka d.d.",
    "Banka Intesa Sanpaolo d.d.",
    "Nova Ljubljanska Banka d.d. (NLB)",
    "Gorenjska Banka d.d.",
    "Delavska Hranilnica d.d.",
    "Revolut",
    "Deželna Banka Slovenije d.d.",
    "Banka Sparkasse d.d.",
    "Addiko Bank d.d.",
    "UniCredit Banka Slovenija d.d."
  )
)

# Q46: level order follows the code order given here (1, 2, 3, 5, 6, 4).
mydata[["Q46"]] <- factor(
  mydata[["Q46"]],
  levels = c(1, 2, 3, 5, 6, 4),
  labels = c(
    "Študent/-ka",
    "Redno zaposlen/-a",
    "Upokojen/-a",
    "Samozaposlen/-a",
    "Delno zaposlen/-a",
    "Brezposeln/-a"
  )
)

# Q47: eight bands, codes 1..8.
mydata[["Q47"]] <- factor(
  mydata[["Q47"]],
  levels = 1:8,
  labels = c(
    "Pod 1.000€",
    "1.000€ - 1.500€",
    "1.501€ - 2.000€",
    "2.001€ - 3.000€",
    "3.001€ - 5.000€",
    "5.001€ - 10.000€",
    "Above 10.000€",
    "I don't want to answer"
  )
)

# Q48: codes 2..7 are labelled; a code of 1 would become NA.
# NOTE(review): confirm code 1 is intentionally excluded.
mydata[["Q48"]] <- factor(
  mydata[["Q48"]],
  levels = 2:7,
  labels = c(
    "Dokončana osnovna šola",
    "Dokončana nižja ali srednja poklicna izobrazba",
    "Dokončana srednja strokovna ali splošna izobrazba",
    "Dokončana višješolska strokovna ali visokošolska strokovna izobrazba (tudi 1. bolonjska stopnja)",
    "Dokončana visokošolska strokovna univerzitetna izobrazba (tudi 2. bolonjska stopnja)",
    "Dokončana specializacija, znanstveni magisterij, doktorat"
  )
)

# BankF: 1 -> "NLB", 2 -> "Other"; the fallback code 0 is not a level and
# therefore becomes NA.
mydata[["BankF"]] <- factor(
  mydata[["BankF"]],
  levels = 1:2,
  labels = c("NLB", "Other")
)

library(dplyr)

# Subset of respondents whose BankF is "NLB" (rows with NA BankF are dropped
# by filter()).
mydataNLB <- filter(mydata, BankF == "NLB")

library(psych)
# Descriptive statistics for columns 2 to 45. describe.by() is deprecated and,
# without a grouping variable, only forwards to describe() after emitting two
# warnings -- so describe() is called directly.
describe(mydata[2:45])

## Warning in describe.by(mydata[c(2:45)]): describe.by is deprecated.  Please use
## the describeBy function

## Warning in describeBy(x = x, group = group, mat = mat, type = type, ...): no
## grouping variable requested

##                                          vars   n mean   sd median trimmed  mad
## Q21                                         1 265 4.50 1.28      5    4.66 1.48
## Q22a                                        2 265 4.32 0.87      5    4.49 0.00
## Q23a                                        3 265 1.38 0.49      1    1.35 0.00
## Q23b                                        4 265 1.19 0.39      1    1.12 0.00
## Q23c                                        5 265 1.28 0.45      1    1.23 0.00
## Q23d                                        6 265 1.18 0.39      1    1.10 0.00
## Q23e                                        7 265 1.35 0.48      1    1.32 0.00
## Q23f                                        8 265 1.54 0.50      2    1.55 0.00
## Q24*                                        9 265 2.16 1.74      1    1.83 0.00
## Q25                                        10 265 3.50 0.86      4    3.52 1.48
## Q26*                                       11 265 1.47 0.50      1    1.46 0.00
## Q27a*                                      12 265 1.60 0.49      2    1.62 0.00
## Q27b*                                      13 265 1.75 0.44      2    1.81 0.00
## Q27c*                                      14 265 1.57 0.50      2    1.59 0.00
## Q28*                                       15 265 1.64 0.48      2    1.67 0.00
## Q29a                                       16 265 3.44 1.26      4    3.54 1.48
## Q29b                                       17 265 2.73 1.13      3    2.74 1.48
## Q29c                                       18 265 2.60 1.31      2    2.50 1.48
## Q29d                                       19 265 2.97 1.32      3    2.96 1.48
## Q30a                                       20 265 3.07 1.20      3    3.09 1.48
## Q30b                                       21 265 3.38 1.18      4    3.47 1.48
## Q30c                                       22 265 3.43 1.24      4    3.52 1.48
## Q30d                                       23 265 3.12 1.25      3    3.15 1.48
## Q30e                                       24 265 2.93 1.24      3    2.92 1.48
## Q31a                                       25 265 4.13 0.86      4    4.25 1.48
## Q31b                                       26 265 2.51 1.22      2    2.42 1.48
## Q33a                                       27 265 3.83 1.06      4    3.97 1.48
## Q33b                                       28 265 4.08 0.95      4    4.22 1.48
## Q33c                                       29 265 4.26 1.02      5    4.48 0.00
## Q33d                                       30 265 4.04 0.91      4    4.16 1.48
## Q33e                                       31 265 4.02 0.92      4    4.13 1.48
## Q33f                                       32 265 4.22 0.89      4    4.35 1.48
## Poslovalnica_podpora_in_usmerjanje         33 265 3.81 0.88      4    3.86 1.48
## Mobilna aplikacija_podpora_in_usmerjanje   34 265 3.12 0.88      3    3.13 1.48
## Poslovalnica_brezkrbnost                   35 265 3.92 0.87      4    3.97 1.48
## Mobilna aplikacija_brezkrbnost             36 265 3.55 0.87      4    3.57 1.48
## Poslovalnica_varnost                       37 265 4.21 0.81      4    4.31 1.48
## Mobilna aplikacija_varnost                 38 265 3.70 0.85      4    3.73 1.48
## Poslovalnica_dostopnost                    39 265 3.78 1.00      4    3.88 1.48
## Mobilna aplikacija_dostopnost              40 265 3.79 0.89      4    3.84 1.48
## Poslovalnica_jasnost                       41 265 3.91 0.91      4    4.00 1.48
## Mobilna aplikacija_jasnost                 42 265 3.35 0.85      3    3.34 1.48
## Poslovalnica_hitrost                       43 265 2.89 1.02      3    2.90 1.48
## Mobilna aplikacija_hitrost                 44 265 4.20 0.76      4    4.28 1.48
##                                          min max range  skew kurtosis   se
## Q21                                        1   6     5 -1.10     0.76 0.08
## Q22a                                       1   5     4 -1.54     2.46 0.05
## Q23a                                       1   2     1  0.49    -1.77 0.03
## Q23b                                       1   2     1  1.55     0.41 0.02
## Q23c                                       1   2     1  0.96    -1.09 0.03
## Q23d                                       1   2     1  1.65     0.71 0.02
## Q23e                                       1   2     1  0.60    -1.64 0.03
## Q23f                                       1   2     1 -0.17    -1.98 0.03
## Q24*                                       1   6     5  1.34     0.23 0.11
## Q25                                        1   5     4 -0.33     0.31 0.05
## Q26*                                       1   2     1  0.13    -1.99 0.03
## Q27a*                                      1   2     1 -0.41    -1.84 0.03
## Q27b*                                      1   2     1 -1.13    -0.72 0.03
## Q27c*                                      1   2     1 -0.28    -1.93 0.03
## Q28*                                       1   2     1 -0.57    -1.68 0.03
## Q29a                                       1   5     4 -0.45    -0.73 0.08
## Q29b                                       1   5     4 -0.05    -0.97 0.07
## Q29c                                       1   5     4  0.35    -1.06 0.08
## Q29d                                       1   5     4 -0.09    -1.18 0.08
## Q30a                                       1   5     4 -0.24    -0.84 0.07
## Q30b                                       1   5     4 -0.57    -0.55 0.07
## Q30c                                       1   5     4 -0.43    -0.79 0.08
## Q30d                                       1   5     4 -0.31    -0.97 0.08
## Q30e                                       1   5     4 -0.01    -1.02 0.08
## Q31a                                       1   5     4 -1.28     2.22 0.05
## Q31b                                       1   5     4  0.43    -0.85 0.07
## Q33a                                       1   5     4 -0.96     0.54 0.06
## Q33b                                       1   5     4 -1.19     1.39 0.06
## Q33c                                       1   5     4 -1.60     2.10 0.06
## Q33d                                       1   5     4 -1.27     2.05 0.06
## Q33e                                       1   5     4 -1.12     1.62 0.06
## Q33f                                       1   5     4 -1.25     1.64 0.05
## Poslovalnica_podpora_in_usmerjanje         1   5     4 -0.62     0.55 0.05
## Mobilna aplikacija_podpora_in_usmerjanje   1   5     4 -0.08    -0.19 0.05
## Poslovalnica_brezkrbnost                   1   5     4 -0.59     0.39 0.05
## Mobilna aplikacija_brezkrbnost             1   5     4 -0.12    -0.22 0.05
## Poslovalnica_varnost                       1   5     4 -1.13     1.85 0.05
## Mobilna aplikacija_varnost                 1   5     4 -0.44     0.15 0.05
## Poslovalnica_dostopnost                    1   5     4 -0.63     0.10 0.06
## Mobilna aplikacija_dostopnost              1   5     4 -0.42    -0.09 0.05
## Poslovalnica_jasnost                       1   5     4 -0.72     0.33 0.06
## Mobilna aplikacija_jasnost                 1   5     4 -0.03    -0.06 0.05
## Poslovalnica_hitrost                       1   5     4  0.07    -0.59 0.06
## Mobilna aplikacija_hitrost                 1   5     4 -1.02     2.08 0.05

# Standardize (center and scale) the four clustering variables, selected by
# column position 3, 11, 26 and 27, and keep the result as a data frame.
mydata_clu_std <- mydata[c(3, 11, 26, 27)] %>%
  scale() %>%
  as.data.frame()
library(factoextra)

## Warning: package 'factoextra' was built under R version 4.4.2

## Loading required package: ggplot2

## 
## Attaching package: 'ggplot2'

## The following objects are masked from 'package:psych':
## 
##     %+%, alpha

## Welcome! Want to learn more? See two factoextra-related books at https://goo.gl/ve3WBa

# Hopkins statistic for the standardized data, sampling n - 1 points and
# skipping the plot.
get_clust_tendency(
  data = mydata_clu_std,
  n = nrow(mydata_clu_std) - 1,
  graph = FALSE
)

## $hopkins_stat
## [1] 0.5586369
## 
## $plot
## NULL

# Give the four clustering columns (positions 3, 11, 26, 27) readable names.
# mydata_clu_std was built before this rename and keeps the old names until it
# is rebuilt.
colnames(mydata)[c(3, 11, 26, 27)] <- c("Awareness", "Ease", "Value", "Trust")

I renamed the variables. The clusters are built on four clustering variables: “Awareness”, “Ease”, “Value”, and “Trust”.

library(factoextra)
library(NbClust)

# Choosing the number of clusters: total within-cluster sum of squares (elbow).
fviz_nbclust(x = mydata_clu_std, FUNcluster = kmeans, method = "wss") +
  labs(subtitle = "Elbow method")

# Choosing the number of clusters: average silhouette width.
fviz_nbclust(x = mydata_clu_std, FUNcluster = kmeans, method = "silhouette") +
  labs(subtitle = "Silhouette analysis")

library(dplyr)
library(factoextra)
# Hierarchical clustering of the standardized data: Euclidean distances fed
# into Ward's method ("ward.D2"). The pipe is kept as written because hclust()
# records its call and prints it with the object.
WARD <- mydata_clu_std %>%
  get_dist(method = "euclidean") %>%  
  hclust(method = "ward.D2")          

# Print the clustering summary (method, distance, number of objects).
WARD

## 
## Call:
## hclust(d = ., method = "ward.D2")
## 
## Cluster method   : ward.D2 
## Distance         : euclidean 
## Number of objects: 265

library(factoextra)
# Plain dendrogram of the Ward clustering.
fviz_dend(x = WARD)

## Warning: The `<scale>` argument of `guides()` cannot be `FALSE`. Use "none" instead as
## of ggplot2 3.3.4.
## ℹ The deprecated feature was likely used in the factoextra package.
##   Please report the issue at <https://github.com/kassambara/factoextra/issues>.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.

# Dendrogram cut into 5 groups, with coloured labels and a rectangle drawn
# around each group.
fviz_dend(
  x = WARD,
  k = 5,
  rect = TRUE,
  color_labels_by_k = TRUE,
  palette = "jama",
  cex = 0.5
)

library(factoextra)
library(ggplot2)

# k-means with 5 centres and 25 random starts. kmeans() draws its starting
# centres at random, so the seed is fixed to make the cluster numbering and
# assignments reproducible between runs.
set.seed(1)
Clustering <- kmeans(mydata_clu_std,
                     centers = 5,
                     nstart = 25)

# Print cluster sizes, centres, assignments and sums of squares.
Clustering

## K-means clustering with 5 clusters of sizes 27, 74, 76, 32, 56
## 
## Cluster means:
##         Q22a        Q25       Q31a       Q31b
## 1 -2.2987082 -0.6636458 -0.1920661  0.4967113
## 2  0.1713700 -0.7658139  0.1806157 -0.3039786
## 3  0.5063700  0.9025907  0.5693474 -0.7926587
## 4 -0.0860382 -0.5414867 -1.9278200  0.2004560
## 5  0.2438008  0.4164170  0.1828583  1.1234050
## 
## Clustering vector:
##   [1] 1 3 4 3 3 5 2 5 3 1 4 4 5 2 4 1 2 2 5 5 3 5 4 3 3 1 5 2 2 2 5 2 3 3 3 5 3
##  [38] 4 3 2 3 5 3 3 2 3 3 3 3 3 2 3 5 5 2 2 5 3 5 5 5 3 3 2 3 2 2 2 5 5 5 5 3 2
##  [75] 4 2 5 4 3 5 5 1 3 2 3 2 1 5 3 2 2 1 1 1 2 2 5 3 2 1 3 1 4 3 2 3 1 2 1 3 3
## [112] 5 2 2 1 2 5 1 3 2 5 2 2 4 5 1 2 2 1 2 5 2 5 3 5 1 5 4 4 2 3 1 1 4 2 4 2 3
## [149] 3 4 3 4 2 3 2 5 1 4 1 3 3 3 3 2 2 3 2 3 2 5 3 4 4 3 2 2 3 2 3 3 3 5 3 2 2
## [186] 4 2 2 2 3 1 3 5 4 4 2 2 3 3 5 4 4 2 5 2 4 3 3 5 5 2 1 3 2 5 2 5 2 5 5 4 5
## [223] 3 2 3 2 3 5 3 2 5 3 5 1 1 4 1 5 3 2 4 2 5 2 3 2 3 2 5 5 2 3 3 5 5 2 2 3 4
## [260] 4 3 3 5 4 4
## 
## Within cluster sum of squares by cluster:
## [1]  95.65988  92.49211  79.70569 116.86933  85.02220
##  (between_SS / total_SS =  55.5 %)
## 
## Available components:
## 
## [1] "cluster"      "centers"      "totss"        "withinss"     "tot.withinss"
## [6] "betweenss"    "size"         "iter"         "ifault"

library(factoextra)
# Plot the k-means solution for the standardized data.
fviz_cluster(
  object = Clustering,
  data = mydata_clu_std,
  ggtheme = theme_bw(),
  repel = TRUE,
  palette = "Set1"
)

# Euclidean distance of every standardized observation from the origin (the
# variable means after scale()). rowSums() over all columns of mydata_clu_std
# avoids hard-coding the column names (Q22a, Q25, ...), which differ from the
# renamed columns in mydata and would break once mydata_clu_std is rebuilt.
mydata$Dissimilarity <- sqrt(unname(rowSums(mydata_clu_std^2)))

# Ten observations furthest from the centre: candidate outliers.
head(mydata[order(-mydata$Dissimilarity), c("ID", "Dissimilarity")], 10)

##      ID Dissimilarity
## 191 191      6.365406
## 11   11      4.823598
## 202 202      4.716886
## 93   93      4.390156
## 241 241      4.281977
## 38   38      4.229806
## 234 234      4.116562
## 115 115      3.993249
## 103 103      3.861159
## 10   10      3.798713

# Drop observation 191, the unit with the largest Dissimilarity.
mydata <- mydata %>%
  filter(!ID %in% c(191))

# Renumber IDs 1..n; seq_len() stays correct even for an empty table.
mydata$ID <- seq_len(nrow(mydata))

# Re-standardize the four clustering variables on the reduced sample.
mydata_clu_std <- as.data.frame(scale(mydata[c(3, 11, 26, 27)]))

library(factoextra) 
# Hopkins statistic recomputed on the current standardized data (n - 1
# sampled points, no plot).
get_clust_tendency(
  data = mydata_clu_std,
  n = nrow(mydata_clu_std) - 1,
  graph = FALSE
)

## $hopkins_stat
## [1] 0.5736077
## 
## $plot
## NULL

# WARD was fitted before the outlier was removed, so it still describes the
# old sample. Refit it on the current standardized data before plotting so
# the dendrogram matches the observations actually being clustered.
WARD <- mydata_clu_std %>%
  get_dist(method = "euclidean") %>%
  hclust(method = "ward.D2")

# Dendrogram cut into 5 groups.
fviz_dend(WARD,
          k = 5,
          cex = 0.5,
          palette = "jama",
          color_labels_by_k = TRUE,
          rect = TRUE)

library(factoextra)
library(ggplot2)

# k-means (5 centres, 25 random starts) on the reduced sample. The seed is
# fixed because kmeans() picks its starting centres at random; without it the
# cluster numbering changes from run to run.
set.seed(1)
Clustering <- kmeans(mydata_clu_std,
                     centers = 5,
                     nstart = 25)

# Print the fitted solution.
Clustering

## K-means clustering with 5 clusters of sizes 56, 74, 76, 32, 26
## 
## Cluster means:
##    Awareness       Ease      Value      Trust
## 1  0.2355188  0.4112907  0.1731763  1.1381089
## 2  0.1611116 -0.7879947  0.1708796 -0.2980480
## 3  0.5052525  0.9044778  0.5689982 -0.7897317
## 4 -0.1033202 -0.5604314 -1.9884698  0.2094871
## 5 -2.3155481 -0.5971993 -0.0752230  0.4475952
## 
## Clustering vector:
##   [1] 5 3 4 3 3 1 2 1 3 5 4 4 1 2 4 5 2 2 1 1 3 1 4 3 3 5 1 2 2 2 1 2 3 3 3 1 3
##  [38] 4 3 2 3 1 3 3 2 3 3 3 3 3 2 3 1 1 2 2 1 3 1 1 1 3 3 2 3 2 2 2 1 1 1 1 3 2
##  [75] 4 2 1 4 3 1 1 5 3 2 3 2 5 1 3 2 2 5 5 5 2 2 1 3 2 5 3 5 4 3 2 3 5 2 5 3 3
## [112] 1 2 2 5 2 1 5 3 2 1 2 2 4 1 5 2 2 5 2 1 2 1 3 1 5 1 4 4 2 3 5 5 4 2 4 2 3
## [149] 3 4 3 4 2 3 2 1 5 4 5 3 3 3 3 2 2 3 2 3 2 1 3 4 4 3 2 2 3 2 3 3 3 1 3 2 2
## [186] 4 2 2 2 3 3 1 4 4 2 2 3 3 1 4 4 2 1 2 4 3 3 1 1 2 5 3 2 1 2 1 2 1 1 4 1 3
## [223] 2 3 2 3 1 3 2 1 3 1 5 5 4 5 1 3 2 4 2 1 2 3 2 3 2 1 1 2 3 3 1 1 2 2 3 4 4
## [260] 3 3 1 4 4
## 
## Within cluster sum of squares by cluster:
## [1]  88.14381  95.70238  82.80060 120.50690  75.50678
##  (between_SS / total_SS =  56.0 %)
## 
## Available components:
## 
## [1] "cluster"      "centers"      "totss"        "withinss"     "tot.withinss"
## [6] "betweenss"    "size"         "iter"         "ifault"

library(factoextra)
# Plot the current k-means solution.
fviz_cluster(
  object = Clustering,
  data = mydata_clu_std,
  ggtheme = theme_bw(),
  repel = TRUE,
  palette = "Set1"
)

# Drop observation 11.
# NOTE(review): presumably removed as a further outlier -- confirm the reason.
mydata <- mydata %>%
  filter(!ID %in% c(11))

# Renumber IDs 1..n; seq_len() stays correct even for an empty table.
mydata$ID <- seq_len(nrow(mydata))

# Re-standardize the four clustering variables on the reduced sample.
mydata_clu_std <- as.data.frame(scale(mydata[c(3, 11, 26, 27)]))

library(factoextra)
library(ggplot2)

# k-means (5 centres, 25 random starts) on the reduced sample. The seed is
# fixed because kmeans() picks its starting centres at random; without it the
# cluster numbering changes from run to run.
set.seed(1)
Clustering <- kmeans(mydata_clu_std,
                     centers = 5,
                     nstart = 25)

# Print the fitted solution.
Clustering

## K-means clustering with 5 clusters of sizes 76, 54, 75, 32, 26
## 
## Cluster means:
##     Awareness       Ease       Value      Trust
## 1  0.16017846 -0.8533305  0.11948655 -0.3059896
## 2  0.21326412  0.3776779  0.22118546  1.1460396
## 3  0.49918810  0.9111497  0.59543745 -0.7892907
## 4 -0.03067026 -0.2443742 -1.97805346  0.2824627
## 5 -2.31336479 -0.6176055 -0.09173426  0.4433489
## 
## Clustering vector:
##   [1] 5 3 4 3 3 2 1 2 3 5 4 2 1 1 5 1 1 2 2 3 2 4 3 3 5 2 1 1 1 2 1 3 3 3 2 3 4
##  [38] 3 1 3 2 3 3 1 3 3 3 3 3 1 3 2 2 1 1 2 3 2 2 2 3 3 1 3 1 1 1 2 2 2 2 3 1 4
##  [75] 1 2 4 4 2 2 5 3 1 3 1 5 2 3 1 1 5 5 5 1 1 2 3 1 5 3 5 4 3 1 3 5 1 5 3 3 2
## [112] 1 1 5 1 2 5 3 1 2 1 1 4 4 5 1 1 5 1 2 1 2 3 2 5 2 4 4 1 3 5 5 4 1 4 1 3 3
## [149] 4 3 1 1 3 1 2 5 4 5 3 3 3 3 1 1 3 1 3 1 2 3 4 4 3 1 1 3 1 3 3 3 2 3 1 1 4
## [186] 1 1 1 3 3 2 4 4 1 1 3 3 2 4 4 1 2 1 4 3 3 4 2 1 5 3 1 2 1 2 1 2 2 4 2 3 1
## [223] 3 1 3 2 3 1 2 3 2 5 5 4 5 2 3 1 4 1 2 1 3 1 3 1 2 2 1 3 3 2 2 1 1 3 4 4 3
## [260] 3 2 4 4
## 
## Within cluster sum of squares by cluster:
## [1] 109.86189  82.00624  80.53778 111.18358  76.83579
##  (between_SS / total_SS =  56.1 %)
## 
## Available components:
## 
## [1] "cluster"      "centers"      "totss"        "withinss"     "tot.withinss"
## [6] "betweenss"    "size"         "iter"         "ifault"

library(factoextra)
# Plot the current k-means solution.
fviz_cluster(
  object = Clustering,
  data = mydata_clu_std,
  ggtheme = theme_bw(),
  repel = TRUE,
  palette = "Set1"
)

# Drop five more observations by their current IDs.
# NOTE(review): presumably picked from the preceding cluster plot -- confirm.
mydata <- mydata %>%
  filter(!ID %in% c(86, 200, 143, 239, 37))

# Renumber IDs 1..n; seq_len() stays correct even for an empty table.
mydata$ID <- seq_len(nrow(mydata))

# Re-standardize the four clustering variables on the reduced sample.
mydata_clu_std <- as.data.frame(scale(mydata[c(3, 11, 26, 27)]))

library(factoextra)
library(ggplot2)

Clustering <- kmeans(mydata_clu_std, 
                     centers = 5, 
                     nstart = 25) 

Clustering

## K-means clustering with 5 clusters of sizes 79, 25, 50, 30, 74
## 
## Cluster means:
##     Awareness       Ease      Value      Trust
## 1  0.50108668  0.8823227  0.6219599 -0.7289837
## 2 -2.38619015 -0.5895733 -0.1589148  0.5303956
## 3  0.14549250  0.3470049  0.1111776  1.2357670
## 4  0.08869192 -0.3574300 -1.9775371  0.1497189
## 5  0.13693951 -0.8323177  0.1162874 -0.2966229
## 
## Clustering vector:
##   [1] 2 1 4 1 1 3 5 3 1 2 4 3 5 4 2 5 5 3 1 1 3 4 1 1 2 3 5 5 5 3 5 1 1 1 3 1 1
##  [38] 5 1 3 1 1 5 1 1 1 1 1 5 1 3 1 5 5 3 1 3 3 3 1 1 5 1 5 5 5 3 3 3 3 1 5 4 5
##  [75] 3 4 4 3 3 2 1 5 1 5 3 1 5 5 2 2 2 5 5 3 1 5 2 1 2 4 1 5 1 2 5 2 1 1 3 5 5
## [112] 2 5 3 2 1 5 3 5 5 4 4 2 5 5 2 5 3 5 3 1 3 2 1 4 4 5 1 2 2 5 4 5 1 1 4 1 4
## [149] 5 1 5 3 2 4 2 1 1 1 1 5 5 1 5 1 5 3 1 4 4 1 5 5 1 5 1 1 1 3 1 5 5 4 5 5 5
## [186] 1 1 3 4 4 5 5 1 1 3 4 5 3 5 4 1 1 4 3 5 2 1 5 3 5 3 5 3 3 4 3 1 5 1 5 1 3
## [223] 1 5 3 1 3 2 2 4 2 3 1 5 5 3 5 1 5 1 5 3 3 5 1 1 1 3 5 5 1 4 4 1 1 3 4 4
## 
## Within cluster sum of squares by cluster:
## [1]  95.40383  77.28224  80.33349  91.09260 105.03046
##  (between_SS / total_SS =  56.3 %)
## 
## Available components:
## 
## [1] "cluster"      "centers"      "totss"        "withinss"     "tot.withinss"
## [6] "betweenss"    "size"         "iter"         "ifault"

library(factoextra)
# Cluster plot of the current solution, drawn from the standardised data.
fviz_cluster(object = Clustering,
             data = mydata_clu_std,
             palette = "Set1",
             repel = TRUE,
             ggtheme = theme_bw())

# Drop the rows with these IDs (presumably outliers seen in the plot --
# confirm), then renumber the IDs consecutively from 1.
mydata <- filter(mydata, !(ID %in% c(90, 155, 85, 146, 100)))
mydata$ID <- seq(from = 1, to = nrow(mydata))

# Re-standardise the clustering variables (columns 3, 11, 26 and 27).
mydata_clu_std <- mydata[, c(3, 11, 26, 27)] %>%
  scale() %>%
  as.data.frame()

library(factoextra)
library(ggplot2)

# K-means with 5 centres; the best of 25 random starts is kept.
Clustering <- kmeans(x = mydata_clu_std, centers = 5, nstart = 25)

print(Clustering)

## K-means clustering with 5 clusters of sizes 23, 76, 49, 79, 26
## 
## Cluster means:
##     Awareness       Ease       Value       Trust
## 1 -2.41967781 -0.6566175 -0.16063729  0.44471611
## 2  0.11993807 -0.8977034  0.06023898 -0.26926030
## 3  0.11446989  0.3155981  0.08097431  1.31250594
## 4  0.49980397  0.8895872  0.61326842 -0.71180559
## 5  0.05552915 -0.0928475 -2.04997890  0.08289087
## 
## Clustering vector:
##   [1] 1 4 5 4 4 3 2 3 4 1 5 3 2 2 1 2 2 3 4 4 3 5 4 4 1 3 2 2 2 3 2 4 4 4 3 4 4
##  [38] 2 4 3 4 4 2 4 4 4 4 4 2 4 3 4 2 2 3 4 3 3 3 4 4 2 4 2 2 2 3 3 3 3 4 2 5 2
##  [75] 3 5 5 3 3 1 4 2 4 2 4 2 2 1 1 2 2 3 4 2 1 4 1 4 2 4 1 2 1 4 4 3 2 2 1 2 3
## [112] 1 4 2 3 2 2 5 5 1 2 2 1 2 3 2 3 4 3 1 4 5 5 2 4 1 1 2 5 2 4 4 4 2 2 4 2 3
## [149] 1 5 4 4 4 4 2 2 4 2 4 2 3 4 5 5 4 2 2 4 2 4 4 4 3 4 2 2 5 2 2 2 4 4 3 5 5
## [186] 2 2 4 4 3 5 2 3 2 5 4 4 5 3 2 1 4 2 3 2 3 2 3 3 5 3 4 2 4 2 4 3 4 2 3 4 3
## [223] 1 1 5 1 3 4 2 2 3 2 4 2 4 2 3 3 2 4 4 4 3 2 2 4 5 5 4 4 3 5 5
## 
## Within cluster sum of squares by cluster:
## [1]  70.16428 124.58392  80.49920  99.56269  63.35325
##  (between_SS / total_SS =  56.5 %)
## 
## Available components:
## 
## [1] "cluster"      "centers"      "totss"        "withinss"     "tot.withinss"
## [6] "betweenss"    "size"         "iter"         "ifault"

library(factoextra)
# Cluster plot of the current solution, drawn from the standardised data.
fviz_cluster(object = Clustering,
             data = mydata_clu_std,
             palette = "Set1",
             repel = TRUE,
             ggtheme = theme_bw())

# Drop the rows with these IDs (presumably outliers seen in the plot --
# confirm), then renumber the IDs consecutively from 1.
mydata <- filter(mydata, !(ID %in% c(193, 190, 164)))
mydata$ID <- seq(from = 1, to = nrow(mydata))

# Re-standardise the clustering variables (columns 3, 11, 26 and 27).
mydata_clu_std <- mydata[, c(3, 11, 26, 27)] %>%
  scale() %>%
  as.data.frame()

library(factoextra)
library(ggplot2)

# K-means with 5 centres; the best of 25 random starts is kept.
Clustering <- kmeans(x = mydata_clu_std, centers = 5, nstart = 25)

print(Clustering)

## K-means clustering with 5 clusters of sizes 75, 26, 79, 23, 47
## 
## Cluster means:
##     Awareness       Ease       Value      Trust
## 1  0.13714948 -0.8797902  0.08082592 -0.2392861
## 2  0.01602031 -0.1291655 -2.03520230 -0.0309770
## 3  0.50815392  0.9082966  0.62028545 -0.7001278
## 4 -2.40486418 -0.6499943 -0.16676336  0.4696909
## 5  0.09499952  0.2667449  0.03587920  1.3459376
## 
## Clustering vector:
##   [1] 4 3 2 3 3 5 1 5 3 4 2 5 1 2 4 1 1 5 3 3 5 2 3 3 4 5 1 1 1 5 1 3 3 3 5 3 3
##  [38] 1 3 5 3 3 1 3 3 3 3 3 1 3 5 3 1 1 5 3 5 5 5 3 3 1 3 1 1 1 5 5 5 5 3 1 2 1
##  [75] 5 2 2 5 5 4 3 1 3 1 3 1 1 4 4 1 1 5 3 1 4 3 4 3 1 3 4 1 4 3 3 5 1 1 4 1 5
## [112] 4 3 1 5 1 1 2 2 4 1 1 4 1 5 1 5 3 5 4 3 2 2 1 3 4 4 1 2 1 3 3 3 1 1 3 1 5
## [149] 4 2 3 3 3 3 1 1 3 1 3 1 5 3 2 3 1 1 3 1 3 3 3 5 3 1 1 2 1 1 1 3 3 5 2 2 1
## [186] 1 3 3 2 1 1 2 3 3 2 5 1 4 3 1 5 1 5 1 5 5 2 5 3 1 3 1 3 5 3 1 5 3 5 4 4 2
## [223] 4 5 3 1 1 5 1 3 1 3 1 5 5 1 3 3 3 5 1 1 3 2 2 3 3 5 2 2
## 
## Within cluster sum of squares by cluster:
## [1] 121.45870  62.95169 101.43697  71.50918  74.12297
##  (between_SS / total_SS =  56.7 %)
## 
## Available components:
## 
## [1] "cluster"      "centers"      "totss"        "withinss"     "tot.withinss"
## [6] "betweenss"    "size"         "iter"         "ifault"

library(factoextra)
# Cluster plot of the previous solution, drawn from the standardised data.
fviz_cluster(Clustering,
             palette = "Set1",
             repel = TRUE,
             ggtheme = theme_bw(),
             data = mydata_clu_std)

# Drop the rows with these IDs (presumably outliers seen in the plot --
# confirm), then renumber the IDs consecutively from 1.
mydata <- mydata %>%
  filter(!ID %in% c(220, 10, 109))

mydata$ID <- seq_len(nrow(mydata))

# Re-standardise the clustering variables (columns 3, 11, 26 and 27).
mydata_clu_std <- as.data.frame(scale(mydata[c(3, 11, 26, 27)]))

library(factoextra)
library(ggplot2)

# k-means starts from random centres, so the cluster numbering (1..5) can
# change from run to run. This solution is stored in mydata$Group and the
# segments are interpreted by number further down, so fix the RNG seed to make
# the numbering reproducible.
# NOTE(review): after the first seeded run, re-check that the segment numbers
# used in the written interpretation still match the cluster means.
set.seed(123)
Clustering <- kmeans(mydata_clu_std,
                     centers = 5,
                     nstart = 25)

Clustering

## K-means clustering with 5 clusters of sizes 48, 75, 79, 20, 25
## 
## Cluster means:
##     Awareness       Ease        Value       Trust
## 1  0.07755914  0.2561182 -0.003027745  1.38856900
## 2  0.10677233 -0.9230788  0.077510269 -0.22238704
## 3  0.50068099  0.9055025  0.618119866 -0.69048033
## 4 -2.49544837 -0.4992393 -0.152598342  0.31410430
## 5 -0.05502378 -0.1845070 -2.057897640 -0.06825693
## 
## Clustering vector:
##   [1] 4 3 5 3 3 1 2 1 3 5 1 2 5 4 2 2 1 3 3 1 5 3 3 4 1 2 2 2 1 2 3 3 3 1 3 3 2
##  [38] 3 1 3 3 2 3 3 3 3 3 2 3 1 3 2 2 1 3 1 1 1 3 3 2 3 2 2 2 1 1 1 1 3 2 5 2 1
##  [75] 5 5 1 1 4 3 2 3 2 3 2 2 4 4 2 2 1 3 2 4 3 4 3 2 3 4 2 4 3 3 1 2 2 2 1 4 3
## [112] 2 1 2 2 5 5 4 2 2 4 2 1 2 1 3 1 4 3 5 5 2 3 4 4 2 5 2 3 3 3 2 2 3 2 1 4 5
## [149] 3 3 3 3 2 2 3 2 3 2 1 3 5 3 2 2 3 2 3 3 3 1 3 2 2 5 2 2 2 3 3 1 5 5 2 2 3
## [186] 3 5 2 2 5 3 3 1 1 2 4 3 2 1 2 1 2 1 1 5 1 3 2 3 2 3 1 3 2 1 3 1 4 5 4 1 3
## [223] 2 2 1 2 3 2 3 2 1 1 2 3 3 3 1 2 2 3 5 5 3 3 1 5 5
## 
## Within cluster sum of squares by cluster:
## [1]  81.67665 127.65628 106.19954  49.86444  62.38244
##  (between_SS / total_SS =  56.5 %)
## 
## Available components:
## 
## [1] "cluster"      "centers"      "totss"        "withinss"     "tot.withinss"
## [6] "betweenss"    "size"         "iter"         "ifault"

library(factoextra)
# Cluster plot of the current solution, drawn from the standardised data.
fviz_cluster(object = Clustering,
             data = mydata_clu_std,
             palette = "Set1",
             repel = TRUE,
             ggtheme = theme_bw())

# Cluster centres (means of the standardised variables per cluster).
Averages <- Clustering[["centers"]]
print(Averages)

##     Awareness       Ease        Value       Trust
## 1  0.07755914  0.2561182 -0.003027745  1.38856900
## 2  0.10677233 -0.9230788  0.077510269 -0.22238704
## 3  0.50068099  0.9055025  0.618119866 -0.69048033
## 4 -2.49544837 -0.4992393 -0.152598342  0.31410430
## 5 -0.05502378 -0.1845070 -2.057897640 -0.06825693

library(tidyr)
library(ggplot2)

# Reshape the cluster centres to long format: one row per cluster (ID) and
# clustering variable, plus factor versions of both for plotting.
Figure <- Averages %>%
  as.data.frame() %>%
  mutate(ID = 1:n()) %>%
  pivot_longer(cols = c("Awareness", "Ease", "Value", "Trust")) %>%
  mutate(
    Group = factor(ID, levels = 1:5, labels = c("1", "2", "3", "4", "5")),
    NameF = factor(name, levels = c("Awareness", "Ease", "Value", "Trust"))
  )

# Profile plot: one line per cluster across the four variables, with a
# horizontal reference line at 0.
ggplot(data = Figure, mapping = aes(x = NameF, y = value)) +
  geom_hline(yintercept = 0) +
  theme_bw() +
  geom_point(mapping = aes(shape = Group, col = Group), size = 5, alpha = 0.4) +
  geom_line(mapping = aes(group = ID), linewidth = 1.5) +
  labs(x = "Cluster variables", y = "Averages") +
  ylim(-2.5, 2.5) +
  theme(axis.text.x = element_text(angle = 45, vjust = 0.50, size = 12))

# Store each respondent's cluster number in the data set.
mydata[["Group"]] <- Clustering[["cluster"]]

# One ANOVA per clustering variable, with cluster membership as the factor.
fit <- aov(
  formula = cbind(Awareness, Ease, Value, Trust) ~ as.factor(Group),
  data = mydata
)

print(summary(fit))

##  Response Awareness :
##                   Df Sum Sq Mean Sq F value    Pr(>F)    
## as.factor(Group)   4 80.074 20.0186  87.691 < 2.2e-16 ***
## Residuals        242 55.245  0.2283                      
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
##  Response Ease :
##                   Df Sum Sq Mean Sq F value    Pr(>F)    
## as.factor(Group)   4 86.860 21.7150  76.879 < 2.2e-16 ***
## Residuals        242 68.354  0.2825                      
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
##  Response Value :
##                   Df Sum Sq Mean Sq F value    Pr(>F)    
## as.factor(Group)   4 71.857 17.9643  76.009 < 2.2e-16 ***
## Residuals        242 57.195  0.2363                      
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
##  Response Trust :
##                   Df Sum Sq Mean Sq F value    Pr(>F)    
## as.factor(Group)   4 172.02  43.004  74.816 < 2.2e-16 ***
## Residuals        242 139.10   0.575                      
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

Significant difference in variables across clusters.

# Q42 is the year of birth; coerce it to numeric before doing arithmetic.
mydata$Q42 <- as.numeric(as.character(mydata$Q42))
# Use a fixed reference year rather than Sys.Date(): with the system date,
# every respondent gets older each calendar year the script is re-run and the
# age statistics and age groups silently change.
# NOTE(review): 2025 is the year on the rendered report -- confirm it is the
# intended reference (e.g. the fieldwork year).
current_year <- 2025
mydata$Age <- current_year - mydata$Q42

Changed the year of birth into age in number of years.

# Median age within each cluster.
with(mydata, aggregate(Age, by = list(Group), FUN = median))

##   Group.1    x
## 1       1 27.0
## 2       2 29.0
## 3       3 27.0
## 4       4 23.5
## 5       5 35.0

DIFFERENCES ACROSS SEGMENTS/CLUSTERS/GROUPS

library(rstatix)

## Warning: package 'rstatix' was built under R version 4.4.2

## 
## Attaching package: 'rstatix'

## The following object is masked from 'package:stats':
## 
##     filter

# Shapiro-Wilk normality test of Age, run separately within each cluster.
shapiro_test(group_by(mydata, Group), Age)

## # A tibble: 5 × 4
##   Group variable statistic            p
##   <int> <chr>        <dbl>        <dbl>
## 1     1 Age          0.804 0.00000163  
## 2     2 Age          0.863 0.000000887 
## 3     3 Age          0.832 0.0000000502
## 4     4 Age          0.728 0.0000868   
## 5     5 Age          0.897 0.0162

# Kruskal-Wallis rank sum test of Age across the clusters.
kruskal.test(formula = Age ~ Group, data = mydata)

## 
##  Kruskal-Wallis rank sum test
## 
## data:  Age by Group
## Kruskal-Wallis chi-squared = 4.5423, df = 4, p-value = 0.3376

We cannot reject H0 (p = 0.338): there is no significant difference in age between the groups.

# Median age within each cluster.
with(mydata, aggregate(Age, by = list(Group), FUN = median))

##   Group.1    x
## 1       1 27.0
## 2       2 29.0
## 3       3 27.0
## 4       4 23.5
## 5       5 35.0

# Mean age within each cluster.
with(mydata, aggregate(Age, by = list(Group), FUN = mean))

##   Group.1        x
## 1       1 34.27083
## 2       2 36.09333
## 3       3 33.40506
## 4       4 32.10000
## 5       5 38.48000

Age does not differ significantly across segments; nevertheless, the medians and means of age (in years) for each segment are reported above.

library(dplyr)

# Collapse Q21 into two usage bands: codes 1-4 and codes 5-6.
# Any other value (including NA) matches neither rule and becomes NA.
mydata <- mutate(
  mydata,
  UsageFrequency = case_when(
    Q21 %in% 1:4 ~ "Occasional Usage",
    Q21 %in% 5:6 ~ "Frequent Usage"
  )
)

# Chi-squared test of independence between usage band and cluster.
DifUsage <- chisq.test(x = mydata$UsageFrequency, y = mydata$Group)

print(DifUsage)

## 
##  Pearson's Chi-squared test
## 
## data:  mydata$UsageFrequency and mydata$Group
## X-squared = 12.12, df = 4, p-value = 0.01648

We reject H0 (p = 0.016): there is a significant association between usage frequency of the mobile app and group.

library(dplyr)
library(ggplot2)

# Cross-tabulate cluster by usage band and convert the counts to row
# proportions, so each cluster sums to 1.
usage_table <- table(mydata$Group, mydata$UsageFrequency)
usage_proportions <- proportions(usage_table, margin = 1)
usage_df <- as.data.frame(usage_proportions)

# For each usage band, keep the cluster with the largest proportion.
most_frequent_group <- slice_max(group_by(usage_df, Var2), Freq, n = 1)

print("Most Frequent Groups for Each Usage Category:")

## [1] "Most Frequent Groups for Each Usage Category:"

# One row per usage category: the group with the highest row proportion.
print(most_frequent_group)

## # A tibble: 2 × 3
## # Groups:   Var2 [2]
##   Var1  Var2              Freq
##   <fct> <fct>            <dbl>
## 1 3     Frequent Usage   0.785
## 2 5     Occasional Usage 0.52

# Stacked bar per cluster showing the percentage in each usage band.
ggplot(data = usage_df, mapping = aes(x = Var1, y = Freq * 100, fill = Var2)) +
  geom_col(position = "stack") +
  labs(
    title = "Usage Proportions Across Groups",
    x = "Group",
    y = "Proportion (%)",
    fill = "Usage Category"
  ) +
  theme_minimal() +
  theme(
    axis.text.x = element_text(angle = 45, hjust = 1),
    legend.position = "bottom"
  )

Here we see which group/segment has the largest share of frequent users, and so on.

# Chi-squared test of independence between Q28 and cluster.
DifWhere <- chisq.test(x = mydata$Q28, y = mydata$Group)

print(DifWhere)

## 
##  Pearson's Chi-squared test
## 
## data:  mydata$Q28 and mydata$Group
## X-squared = 26.786, df = 4, p-value = 2.196e-05

We can reject H0 (p < 0.001): there is a significant association between group and where individuals would carry out an advanced service if they had to do so next week.

library(dplyr)
library(ggplot2)

# Cross-tabulate cluster by Q28 and convert the counts to row proportions,
# so each cluster sums to 1.
service_table <- table(mydata$Group, mydata$Q28)
service_proportions <- proportions(service_table, margin = 1)
service_df <- as.data.frame(service_proportions)

# Replace the numeric answer codes with readable labels.
service_df$Var2 <- recode(service_df$Var2,
                          "1" = "In Branch",
                          "2" = "In Mobile App")

# For each service option, keep the cluster with the largest proportion.
most_frequent_service <- slice_max(group_by(service_df, Var2), Freq, n = 1)

print("Most Frequent Groups for Each Service Option:")

## [1] "Most Frequent Groups for Each Service Option:"

# One row per service option: the group with the highest row proportion.
print(most_frequent_service)

## # A tibble: 2 × 3
## # Groups:   Var2 [2]
##   Var1  Var2                  Freq
##   <fct> <fct>                <dbl>
## 1 4     V poslovalnici       0.55 
## 2 3     V mobilni aplikaciji 0.873

# Stacked bar per cluster showing the percentage choosing each service option.
ggplot(data = service_df, mapping = aes(x = Var1, y = Freq * 100, fill = Var2)) +
  geom_col(position = "stack") +
  labs(
    title = "Preference for Advanced Service Location by Group",
    x = "Group",
    y = "Proportion (%)",
    fill = "Service Option"
  ) +
  theme_minimal() +
  theme(
    # Rotated x-axis labels for readability; legend goes below the plot.
    axis.text.x = element_text(angle = 45, hjust = 1),
    legend.position = "bottom"
  )

Here we see the distribution of the latter across groups, used for describing the groups/segments.

# Chi-squared test of independence between Q26 and cluster.
DifNotify <- chisq.test(x = mydata$Q26, y = mydata$Group)

print(DifNotify)

## 
##  Pearson's Chi-squared test
## 
## data:  mydata$Q26 and mydata$Group
## X-squared = 24.965, df = 4, p-value = 5.113e-05

We can reject H0 (p < 0.001): there is an association between group and whether individuals noticed notifications about advanced services.

library(dplyr)
library(ggplot2)

# Cross-tabulate cluster by Q26 and convert the counts to row proportions,
# so each cluster sums to 1.
notification_table <- table(mydata$Group, mydata$Q26)
notification_proportions <- proportions(notification_table, margin = 1)
notification_df <- as.data.frame(notification_proportions)

# Replace the numeric answer codes with readable labels.
notification_df$Var2 <- recode(notification_df$Var2,
                               "1" = "Yes",
                               "2" = "No")

# For each response, keep the cluster with the largest proportion.
most_frequent_notification <- slice_max(
  group_by(notification_df, Var2),
  Freq,
  n = 1
)

print("Most Frequent Groups for Each Notification Response:")

## [1] "Most Frequent Groups for Each Notification Response:"

# One row per notification response: the group with the highest row proportion.
print(most_frequent_notification)

## # A tibble: 2 × 3
## # Groups:   Var2 [2]
##   Var1  Var2   Freq
##   <fct> <fct> <dbl>
## 1 3     Da    0.747
## 2 4     Ne    0.75

# Stacked bar per cluster showing the percentage giving each response.
ggplot(data = notification_df,
       mapping = aes(x = Var1, y = Freq * 100, fill = Var2)) +
  geom_col(position = "stack") +
  labs(
    title = "Notifications About Advanced Services by Group",
    x = "Group",
    y = "Proportion (%)",
    fill = "Notification Noticed"
  ) +
  theme_minimal() +
  theme(
    axis.text.x = element_text(angle = 45, hjust = 1),
    legend.position = "bottom"
  )

Here we see how the receipt of notifications is distributed across groups; this is used for describing the groups/segments.

# Chi-squared test of independence between Q23a and cluster.
DiffAwareness <- chisq.test(x = mydata$Q23a, y = mydata$Group)

print(DiffAwareness)

## 
##  Pearson's Chi-squared test
## 
## data:  mydata$Q23a and mydata$Group
## X-squared = 15.959, df = 4, p-value = 0.003075

We can reject H0 (p = 0.003): there is an association between group and awareness of the advanced service “Getting Consumer Loan”.

# Chi-squared test of independence between Q23e and cluster.
DiffAwareness1 <- chisq.test(x = mydata$Q23e, y = mydata$Group)

print(DiffAwareness1)

## 
##  Pearson's Chi-squared test
## 
## data:  mydata$Q23e and mydata$Group
## X-squared = 18.035, df = 4, p-value = 0.001215

We can reject H0 (p = 0.001): there is an association between group and awareness of the advanced service “Opening a Saving Account”.

# Chi-squared test of independence between Q23f and cluster.
DiffAwareness2 <- chisq.test(x = mydata$Q23f, y = mydata$Group)

print(DiffAwareness2)

## 
##  Pearson's Chi-squared test
## 
## data:  mydata$Q23f and mydata$Group
## X-squared = 14.801, df = 4, p-value = 0.005132

We can reject H0 (p = 0.005): there is an association between group and awareness of the advanced service “Term Deposits”.

Below are visual representations of awareness of the advanced services mentioned above across groups/segments. They can be used for interpreting the segments and for recommendations.

library(dplyr)
library(ggplot2)

#' Plot the within-group distribution of a coded awareness item
#'
#' Cross-tabulates `group_col` against `awareness_col`, converts the counts to
#' row proportions (each group sums to 1) and draws one stacked bar per group
#' on a 0-100 % scale.
#'
#' @param data Data frame containing both columns.
#' @param group_col Name (string) of the grouping column.
#' @param awareness_col Name (string) of the awareness column. Codes `1` and
#'   `2` are relabelled "Yes" and "No"; other codes are left unchanged.
#' @param awareness_label Text inserted into the plot title.
#' @return A ggplot object (not printed by this function).
visualize_distribution <- function(data, group_col, awareness_col, awareness_label) {
  awareness_table <- table(data[[group_col]], data[[awareness_col]])

  # margin = 1: proportions are computed within each group (row).
  awareness_proportions <- prop.table(awareness_table, margin = 1)

  awareness_df <- as.data.frame(as.table(awareness_proportions))

  awareness_df$Var2 <- recode(awareness_df$Var2,
                              `1` = "Yes",
                              `2` = "No")

  # Stack the percentages as they are. position = "fill" would rescale every
  # bar to 0-1, contradicting both the "Proportion (%)" axis label and the
  # `Freq * 100` mapping; stacking matches the other proportion plots here.
  ggplot(awareness_df, aes(x = Var1, y = Freq * 100, fill = Var2)) +
    geom_col(position = "stack") +
    labs(
      x = "Group",
      y = "Proportion (%)",
      fill = "Awareness",
      title = paste("Awareness of", awareness_label, "by Group")
    ) +
    theme_minimal() +
    theme(
      legend.position = "bottom",
      axis.text.x = element_text(angle = 45, hjust = 1)
    )
}


# Build one awareness-by-group plot per advanced service.
# Q23a: Getting Consumer Loan
plot_q23a <- visualize_distribution(mydata, "Group", "Q23a",
                                    "Getting Consumer Loan")

# Q23e: Opening a Saving Account
plot_q23e <- visualize_distribution(mydata, "Group", "Q23e",
                                    "Opening a Saving Account")

# Q23f: Term Deposits
plot_q23f <- visualize_distribution(mydata, "Group", "Q23f",
                                    "Term Deposits")

# Render the three plots in turn.
print(plot_q23a)

print(plot_q23e)

print(plot_q23f)

SOME INTERPRETATION OF EACH SEGMENT/CLUSTER/GROUP

Segment 1: Trust Champions Description: This group scores the highest in trust, while other dimensions (Awareness, Ease, and Value) remain around the average. They are confident in the app’s reliability but may not feel strongly compelled to use advanced services unless trustworthiness is a key selling point. Key Traits: Trust: Very high. Awareness, Ease, Value: Around average. Barriers: Lack of compelling motivation beyond trust to engage with advanced services. Targeting Strategy: Build on Trust: Emphasize that advanced features are just as reliable and secure as the core app services. Highlight Practical Benefits: Demonstrate how advanced features can enhance the app’s utility without compromising trust. Introduce Exclusivity: Frame advanced features as “trustworthy tools” available to a select group of valued users.

Segment 2: Cautious Navigators Description: This group is close to average on Awareness, Value and Trust, but their Ease score is clearly below average (about 0.9 standard deviations below the mean, the lowest of all segments). They may feel hesitant to adopt advanced features because they perceive them as too complicated or not intuitive. Key Traits: Awareness, Trust, Value: Around average. Ease: Clearly below average. Barriers: Perceived complexity or lack of user-friendliness. Targeting Strategy: Simplify Onboarding: Offer step-by-step tutorials, walkthroughs, or in-app prompts to demystify advanced features. Emphasize User-Friendliness: Highlight how easy it is to activate and use advanced features in marketing messages. Provide Support: Include accessible help resources or a dedicated support channel for questions about advanced services.

Segment 3: High Performers Description: This group scores above average across all dimensions, with Ease being their strongest area. However, their Trust score is the smallest and falls below average. They may engage with advanced features if their concerns around trust and reliability are addressed. Key Traits: Ease, Awareness, Value: Above average. Trust: Below average. Barriers: Lack of trust in the app’s advanced features or uncertainty about their reliability. Targeting Strategy: Address Trust Concerns: Use testimonials, endorsements, and transparent explanations to build confidence in advanced services. Promote Ease and Value: Highlight how easy and rewarding it is to use advanced features, leveraging their preference for intuitive tools. Demonstrate Reliability: Offer guarantees or show evidence of the app’s security and performance in handling advanced features.

Segment 4: Trust-Driven Skeptics Description: This group scores below average across most dimensions, but their Trust score is just above average. They may trust the app but are otherwise disengaged due to lack of awareness or motivation. Key Traits: Trust: Slightly above average. Awareness, Ease, Value: Below average. Barriers: Low awareness and lack of engagement with the app’s features. Targeting Strategy: Raise Awareness: Use in-app notifications, banners, or campaigns to explain the purpose and benefits of advanced features. Leverage Trust: Position advanced features as extensions of the app’s reliable core functionality. Encourage Exploration: Use small incentives (e.g., rewards, discounts) to motivate them to try advanced features.

Segment 5: Disengaged Low-Engagers Description: This group scores at or slightly below average on Awareness, Ease and Trust, and has by far the lowest score in Value (about 2 standard deviations below the mean). They are the most disengaged group and may not understand or see the value in the app’s advanced services. Key Traits: Value: Very low. Awareness, Ease, Trust: Around or slightly below average. Barriers: Lack of awareness and a strong sense that advanced features aren’t worth using. Targeting Strategy: Focus on Awareness: Use prominent marketing, notifications, or educational content to ensure this group understands the availability and benefits of advanced services. Demonstrate Clear Value: Show how advanced services solve specific problems or improve their app experience. Simplify Access: Make it extremely easy for this group to try advanced features, perhaps through a free trial or an opt-in demo mode.

# Bin Age into labelled age groups. cut() builds right-closed intervals
# (18,30], (30,40], ..., so without include.lowest = TRUE a respondent aged
# exactly 18 would fall outside the first bin and become NA, although the
# label says "18-30". Ages below 18 still become NA.
mydata$AgeGroup <- cut(mydata$Age,
                       breaks = c(18, 30, 40, 50, 60, Inf),
                       labels = c("18-30", "31-40", "41-50", "51-60", "60+"),
                       include.lowest = TRUE)

# Counts of each Q23a answer code within each age group.
table_age <- table(mydata$AgeGroup, mydata$Q23a)

print(table_age)

##        
##          1  2
##   18-30 74 66
##   31-40 26  2
##   41-50 26 10
##   51-60 18  7
##   60+   14  4

Awareness of advanced services across AGE GROUPS

library(ggplot2)

#' Stacked bar chart of awareness (Yes/No) by age group
#'
#' @param prop_df Data frame with columns `Var1` (age group), `Var2` (answer
#'   code, "1" or "2") and `Freq` (proportion), as produced by
#'   `as.data.frame()` on a two-way table.
#' @param title Plot title (name of the advanced service).
#' @return A ggplot object.
plot_awareness_by_age <- function(prop_df, title) {
  ggplot(prop_df, aes(x = Var1, y = Freq, fill = Var2)) +
    geom_bar(stat = "identity", position = "fill") +
    labs(
      x = "Age Group",
      y = "Proportion",
      fill = "Awareness of Advanced Service",
      title = title
    ) +
    scale_fill_manual(
      values = c("1" = "#33006F", "2" = "#84BD00"),
      labels = c("1" = "Yes", "2" = "No")
    ) +
    theme_minimal() +
    theme(
      legend.position = "bottom",
      axis.text.x = element_text(angle = 45, hjust = 1)
    )
}

# For each Q23 item: row proportions of the answer codes within each age
# group, reshaped to long format, then plotted.

# Q23a (title typo "Conmsumer" corrected)
prop_table5 <- prop.table(table(mydata$AgeGroup, mydata$Q23a), margin = 1)
prop_df5 <- as.data.frame(as.table(prop_table5))
plot_awareness_by_age(prop_df5, "Getting a Consumer Loan")

# Q23b
prop_table51 <- prop.table(table(mydata$AgeGroup, mydata$Q23b), margin = 1)
prop_df51 <- as.data.frame(as.table(prop_table51))
plot_awareness_by_age(prop_df51, "Change of the Account Limit")

# Q23c
# NOTE(review): Q23c and Q23d carry the same title apart from capitalisation;
# one of them is probably mislabelled -- check against the questionnaire.
prop_table52 <- prop.table(table(mydata$AgeGroup, mydata$Q23c), margin = 1)
prop_df52 <- as.data.frame(as.table(prop_table52))
plot_awareness_by_age(prop_df52, "Ordering a new card")

# Q23d
prop_table53 <- prop.table(table(mydata$AgeGroup, mydata$Q23d), margin = 1)
prop_df53 <- as.data.frame(as.table(prop_table53))
plot_awareness_by_age(prop_df53, "Ordering a New Card")

# Q23e
prop_table54 <- prop.table(table(mydata$AgeGroup, mydata$Q23e), margin = 1)
prop_df54 <- as.data.frame(as.table(prop_table54))
plot_awareness_by_age(prop_df54, "Opening a Saving Account")

# Q23f
prop_table55 <- prop.table(table(mydata$AgeGroup, mydata$Q23f), margin = 1)
prop_df55 <- as.data.frame(as.table(prop_table55))
plot_awareness_by_age(prop_df55, "Term Deposits")

set.seed(123)
# Keep the four numeric measurements of iris (everything except Species).
data <- iris[, names(iris) != "Species"]
kmeans_result <- kmeans(x = data, centers = 5)

# Number of observations assigned to each cluster.
print(kmeans_result$size)

## [1] 19  8 38 23 62

# Cross-tabulate cluster by Q45 and convert the counts to row proportions,
# so each cluster sums to 1.
table_clusters1 <- table(mydata$Group, mydata$Q45)
prop_table_clusters1 <- proportions(table_clusters1, margin = 1)
prop_df1 <- as.data.frame(prop_table_clusters1)

library(ggplot2)
# Side-by-side bars: percentage of each Q45 category within every cluster.
ggplot(data = prop_df1, mapping = aes(x = Var1, y = Freq * 100, fill = Var2)) +
  geom_col(position = "dodge") +
  labs(
    title = "Percentage Distribution of Primary Bank by Group",
    x = "Group",
    y = "Percentage (%)",
    fill = "Category"
  ) +
  theme_minimal()

# Q45 by cluster, expressed as column percentages (each cluster sums to 100).
table_data <- table(mydata$Q45, mydata$Group)
percentage_distribution <- 100 * proportions(table_data, margin = 2)

# Long format: one row per Q45 category and cluster.
as.data.frame(percentage_distribution)

##                                 Var1 Var2      Freq
## 1                     OTP banka d.d.    1 36.363636
## 2         Banka Intesa Sanpaolo d.d.    1 11.363636
## 3  Nova Ljubljanska Banka d.d. (NLB)    1 38.636364
## 4               Gorenjska Banka d.d.    1  2.272727
## 5           Delavska Hranilnica d.d.    1  4.545455
## 6                            Revolut    1  0.000000
## 7       Deželna Banka Slovenije d.d.    1  0.000000
## 8               Banka Sparkasse d.d.    1  2.272727
## 9                   Addiko Bank d.d.    1  0.000000
## 10    UniCredit Banka Slovenija d.d.    1  4.545455
## 11                    OTP banka d.d.    2 27.536232
## 12        Banka Intesa Sanpaolo d.d.    2 13.043478
## 13 Nova Ljubljanska Banka d.d. (NLB)    2 28.985507
## 14              Gorenjska Banka d.d.    2  2.898551
## 15          Delavska Hranilnica d.d.    2 14.492754
## 16                           Revolut    2  1.449275
## 17      Deželna Banka Slovenije d.d.    2  0.000000
## 18              Banka Sparkasse d.d.    2  4.347826
## 19                  Addiko Bank d.d.    2  1.449275
## 20    UniCredit Banka Slovenija d.d.    2  5.797101
## 21                    OTP banka d.d.    3 45.454545
## 22        Banka Intesa Sanpaolo d.d.    3  5.194805
## 23 Nova Ljubljanska Banka d.d. (NLB)    3 25.974026
## 24              Gorenjska Banka d.d.    3  2.597403
## 25          Delavska Hranilnica d.d.    3  2.597403
## 26                           Revolut    3  3.896104
## 27      Deželna Banka Slovenije d.d.    3  1.298701
## 28              Banka Sparkasse d.d.    3  5.194805
## 29                  Addiko Bank d.d.    3  2.597403
## 30    UniCredit Banka Slovenija d.d.    3  5.194805
## 31                    OTP banka d.d.    4 16.666667
## 32        Banka Intesa Sanpaolo d.d.    4  5.555556
## 33 Nova Ljubljanska Banka d.d. (NLB)    4 50.000000
## 34              Gorenjska Banka d.d.    4  5.555556
## 35          Delavska Hranilnica d.d.    4 11.111111
## 36                           Revolut    4  5.555556
## 37      Deželna Banka Slovenije d.d.    4  0.000000
## 38              Banka Sparkasse d.d.    4  0.000000
## 39                  Addiko Bank d.d.    4  0.000000
## 40    UniCredit Banka Slovenija d.d.    4  5.555556
## 41                    OTP banka d.d.    5 33.333333
## 42        Banka Intesa Sanpaolo d.d.    5  8.333333
## 43 Nova Ljubljanska Banka d.d. (NLB)    5 29.166667
## 44              Gorenjska Banka d.d.    5  4.166667
## 45          Delavska Hranilnica d.d.    5 12.500000
## 46                           Revolut    5  0.000000
## 47      Deželna Banka Slovenije d.d.    5  4.166667
## 48              Banka Sparkasse d.d.    5  4.166667
## 49                  Addiko Bank d.d.    5  0.000000
## 50    UniCredit Banka Slovenija d.d.    5  4.166667

NLB_Clustering

Pino Sekač

2025-02-05

DIFFERENCES ACROSS SEGMENTS/CLUSTERS/GROUPS

SOME INTERPRETATION OF EACH SEGMENT/CLUSTER/GROUP

Awareness of advanced services across AGE GROUPS