Importing and cleaning data

mydata <- read.table("~/Program R/MVA/NLB Project/survey-results.csv",
                     header=TRUE,
                     sep=";",
                     dec=",") # Data was partialy cleaned in 1KA and Excel

mydata$ID <- seq(1,nrow(mydata)) # Adding variable ID for better understanding of data

head(mydata)
##   Q8 Q19a Q19b Q19c Q19d Q19e Q19f Q19g Q19h Q26 Q27a_1 Q27b_1 Q27c_1 Q28 Q29
## 1  2    2    5    5    5    1    1    2    2   2      7      7      7   1   4
## 2  2    3    5    5    5    2    2    3    3   2      7      7      4   1   3
## 3  2    4    3    4    5    5    1    5    3   5      7      7      7   1   1
## 4  2    4    4    5    5    3    2    3    3   4      6      7      6   1   1
## 5  2    4    5    5    5    4    2    4    4   3      6      7      7   1   1
## 6  2    5    5    5    5    2    2    3    3   3      4      6      6   1   2
##   Q30a_1 Q30b_1 Q30c_1 Q30d_1 Q30e_1 Q30f_1 Q30g_1 Q31a_1 Q31b_1 Q31c_1 Q32a_1
## 1      6      3      5      5      5      5      5      5      3      3      6
## 2      7      6      7      4      4      5      5      5      5      4      7
## 3      7      2      6      1      2      1      2      5      7      5      6
## 4      4      3      5      4      6      2      3      5      6      6      5
## 5      6      5      6      2      5      6      5      3      6      3      6
## 6      3      2      4      3      5      2      3      3      5      3      6
##   Q32b_1 Q32c_1 Q32d_1 Q32e_1 Q33 Q34 Q35 Q36a Q36b Q36c Q36d Q36e Q46 Q1a_1
## 1      6      6      6      6   3   5   6    1    0    0    1    1  80     6
## 2      5      5      5      5   5   3   7    1    0    0    1    1  50     7
## 3      6      7      7      2   7   7   7    1    1    1    1    0  90     7
## 4      3      4      2      5   6   6   6    1    1    1    0    0  75     6
## 5      6      6      6      5   4   7   3    1    0    0    1    0  72     6
## 6      6      5      7      3   6   6   3    1    1    0    1    0  81     3
##   Q1b_1 Q1c_1 Q1d_1 Q1e_1 Q1f_1 Q2a_1 Q2b_1 Q2c_1 Q3a_1 Q3b_1 Q3c_1 Q4a_1 Q4b_1
## 1     6     7     7     7     3     7     6     7     7     7     7     7     6
## 2     7     7     7     7     7     4     4     4     7     7     7     7     7
## 3     7     5     6     7     7     7     6     5     3     7     7     6     7
## 4     6     5     6     6     6     7     5     5     3     5     6     5     6
## 5     5     6     6     6     4     6     6     5     2     4     6     5     3
## 6     6     4     5     3     2     6     6     6     5     6     5     3     6
##   Q4c_1 Q5a_1 Q5b_1 Q5c_1 Q6a_1 Q6b_1 Q6c_1 Q7a_1 Q7b_1 Q7c_1 Q39  Q40 Q37 Q38
## 1     7     7     7     7     7     6     6     7     7     7   1 1968   7  -2
## 2     7     7     7     7     7     3     3     2     7     7   2 1975   7  -2
## 3     7     3     7     7     7     6     5     2     7     7   2 1977   5  -2
## 4     6     3     6     6     7     5     7     6     6     6   2 1984   5  -2
## 5     4     2     1     4     5     7     4     4     5     7   1 1975   5  -2
## 6     5     3     7     6     5     4     4     6     5     6   2 1972   6  -2
##   Q41 Q42 Q43 Q44 Q45 ID
## 1   7   1   3   1   1  1
## 2   7   1   3   2   1  2
## 3   6   1   4   1  10  3
## 4   6   1   4   1   1  4
## 5   6   1   3   3   1  5
## 6   6   1   3   1   1  6
mydata$Q8 <- as.numeric(mydata$Q8)
mydata$Q26 <- as.numeric(mydata$Q26)
mydata$Q33 <- as.numeric(mydata$Q33)
mydata$Q34 <- as.numeric(mydata$Q34)
mydata$Q35 <- as.numeric(mydata$Q35)
mydata$Q36a <- as.numeric(mydata$Q36a)
mydata$Q36b <- as.numeric(mydata$Q36b)
mydata$Q36c <- as.numeric(mydata$Q36c)
mydata$Q36d <- as.numeric(mydata$Q36d)
mydata$Q36e <- as.numeric(mydata$Q36e)
mydata$Q46 <- as.numeric(mydata$Q46)
mydata$Q40 <- as.numeric(mydata$Q40) # Transforming variables to numeric
mydata$Q19a <- factor(mydata$Q19a,
                      levels = c(4,1,2,3,5),
                      labels = c("Mostly digital payments", "Cash only", "Mostly cash", "Half cash, half digital payments", "Only digital payments"))

mydata$Q19b <- factor(mydata$Q19b,
                      levels = c(5,1,2,3,4),
                      labels = c("Only digital payments", "Cash only", "Mostly cash", "Half cash, half digital payments", "Mostly digital payments"))

mydata$Q19c <- factor(mydata$Q19c,
                      levels = c(5,1,2,3,4),
                      labels = c("Only digital payments", "Cash only", "Mostly cash", "Half cash, half digital payments", "Mostly digital payments"))

mydata$Q19d <- factor(mydata$Q19d,
                      levels = c(5,1,2,3,4),
                      labels = c("Only digital payments", "Cash only", "Mostly cash", "Half cash, half digital payments", "Mostly digital payments"))

mydata$Q19e <- factor(mydata$Q19e,
                      levels = c(3,1,2,5,4),
                      labels = c("Half cash, half digital payments", "Cash only", "Mostly cash", "Only digital payments", "Mostly digital payments"))

mydata$Q19f <- factor(mydata$Q19f,
                      levels = c(1,2,3,4,5),
                      labels = c("Cash only", "Mostly cash", "Half cash, half digital payments", "Mostly digital payments", "Only digital payments"))

mydata$Q19g <- factor(mydata$Q19g,
                      levels = c(3,1,2,5,4),
                      labels = c("Half cash, half digital payments", "Cash only", "Mostly cash", "Only digital payments", "Mostly digital payments"))

mydata$Q19h <- factor(mydata$Q19h,
                      levels = c(4,1,2,3,5),
                      labels = c("Mostly digital payments", "Cash only", "Mostly cash", "Half cash, half digital payments", "Only digital payments"))


mydata$Q28 <- factor(mydata$Q28,
                      levels = c(4,1,2,3),
                      labels = c("Cash", "Credit/Debit cards", "Mobile payment options", "Mobile banks"))

mydata$Q29 <- factor(mydata$Q29,
                      levels = c(1,2,3,4,5),
                      labels = c("Less than 50 EUR", "50 EUR - 100 EUR", "101 EUR - 300 EUR", "301 EUR - 500 EUR", "More than 500 EUR"))

mydata$Q39 <- factor(mydata$Q39,
                      levels = c(1,2),
                      labels = c("Male", "Female"))

mydata$Q37 <- factor(mydata$Q37,
                      levels = c(5,1,6,7),
                      labels = c("1.701 EUR - 2.500 EUR", "Pension", "2.501 EUR - 3.300 EUR", "Over 3.300 EUR"))

mydata$Q38 <- factor(mydata$Q38,
                      levels = c(-2,5),
                      labels = c("No pension", "Pension over 1.110 EUR"))

mydata$Q41 <- factor(mydata$Q41,
                      levels = c(6,1,2,3,4,5,7),
                      labels = c("Completed university academic education (also 2nd Bologna level)", "Incomplete primary school", "Completed primary school", "Completed lower or secondary vocational education", "Completed secondary professional or general education", "Completed higher professional or university professional education (also 1st Bologna level)","Completed specialization, scientific master’s degree, doctorate"))

mydata$Q42 <- factor(mydata$Q42,
                      levels = c(1,2,3,4),
                      labels = c("Employed for shorter/longer working hours", "Self-employed", "Retired", "Currently unemployed"))

mydata$Q43 <- factor(mydata$Q43,
                      levels = c(3,1,2,4,5,-2),
                      labels = c("Office professions (e.g., IT, finance)", "Physical work (e.g., construction, factory work)", "Service industry (e.g., retail, hospitality)", "Public sector (e.g., healthcare, education, politics)", "Creative/artistic work", "Self-employed or retired or currently unemployed"))

mydata$Q44 <- factor(mydata$Q44,
                      levels = c(1,2,3),
                      labels = c("Urban area (city or metropolitan region)", "Suburban area (on the outskirts of the city)", "Rural area (village or countryside)"))

mydata$Q45 <- factor(mydata$Q45,
                      levels = c(1,2,3,4,5,6,7,8,9,10,11,12,13,14),
                      labels = c("NLB", "OTP", "Unicredit", "Raiffaisen", "Gorenjska banka", "Intesa Sanpaolo", "Delavska Hranilnica", "Revolut", "N26", "Sparkasse", "Sberbank", "Addiko", "Deželna banka Slovenije", "UBS")) # Adding factors
mydata$Q40_Age <- 2025-mydata$Q40 # Creating age out of variable year of birth
summary(mydata)
##        Q8                                  Q19a   
##  Min.   :2   Mostly digital payments         :74  
##  1st Qu.:2   Cash only                       : 3  
##  Median :2   Mostly cash                     :14  
##  Mean   :2   Half cash, half digital payments:26  
##  3rd Qu.:2   Only digital payments           :38  
##  Max.   :2                                        
##                                                   
##                                Q19b                                  Q19c    
##  Only digital payments           :99   Only digital payments           :104  
##  Cash only                       : 5   Cash only                       :  4  
##  Mostly cash                     : 5   Mostly cash                     :  5  
##  Half cash, half digital payments:16   Half cash, half digital payments: 12  
##  Mostly digital payments         :30   Mostly digital payments         : 30  
##                                                                              
##                                                                              
##                                Q19d                                   Q19e   
##  Only digital payments           :111   Half cash, half digital payments:52  
##  Cash only                       :  5   Cash only                       :25  
##  Mostly cash                     :  6   Mostly cash                     :34  
##  Half cash, half digital payments: 10   Only digital payments           :14  
##  Mostly digital payments         : 23   Mostly digital payments         :30  
##                                                                              
##                                                                              
##                                Q19f                                  Q19g   
##  Cash only                       :60   Half cash, half digital payments:45  
##  Mostly cash                     :63   Cash only                       :30  
##  Half cash, half digital payments:17   Mostly cash                     :39  
##  Mostly digital payments         : 8   Only digital payments           :14  
##  Only digital payments           : 7   Mostly digital payments         :27  
##                                                                             
##                                                                             
##                                Q19h         Q26            Q27a_1     
##  Mostly digital payments         :51   Min.   :1.000   Min.   :1.000  
##  Cash only                       :17   1st Qu.:2.500   1st Qu.:5.000  
##  Mostly cash                     :21   Median :3.000   Median :6.000  
##  Half cash, half digital payments:45   Mean   :3.245   Mean   :5.768  
##  Only digital payments           :21   3rd Qu.:4.000   3rd Qu.:7.000  
##                                        Max.   :7.000   Max.   :7.000  
##                                                                       
##      Q27b_1          Q27c_1                          Q28    
##  Min.   :1.000   Min.   :1.000   Cash                  :11  
##  1st Qu.:6.000   1st Qu.:6.000   Credit/Debit cards    :98  
##  Median :7.000   Median :6.000   Mobile payment options:34  
##  Mean   :6.252   Mean   :5.974   Mobile banks          :12  
##  3rd Qu.:7.000   3rd Qu.:7.000                              
##  Max.   :7.000   Max.   :7.000                              
##                                                             
##                 Q29         Q30a_1          Q30b_1      Q30c_1     
##  Less than 50 EUR :71   Min.   :1.000   Min.   :1   Min.   :1.000  
##  50 EUR - 100 EUR :60   1st Qu.:5.000   1st Qu.:2   1st Qu.:4.000  
##  101 EUR - 300 EUR:17   Median :6.000   Median :4   Median :5.000  
##  301 EUR - 500 EUR: 4   Mean   :5.497   Mean   :4   Mean   :4.974  
##  More than 500 EUR: 3   3rd Qu.:7.000   3rd Qu.:5   3rd Qu.:6.000  
##                         Max.   :7.000   Max.   :7   Max.   :7.000  
##                                                                    
##      Q30d_1          Q30e_1          Q30f_1          Q30g_1     
##  Min.   :1.000   Min.   :1.000   Min.   :1.000   Min.   :1.000  
##  1st Qu.:2.000   1st Qu.:3.000   1st Qu.:2.000   1st Qu.:2.000  
##  Median :3.000   Median :5.000   Median :4.000   Median :5.000  
##  Mean   :3.729   Mean   :4.523   Mean   :4.039   Mean   :4.219  
##  3rd Qu.:6.000   3rd Qu.:6.000   3rd Qu.:6.000   3rd Qu.:6.000  
##  Max.   :7.000   Max.   :7.000   Max.   :7.000   Max.   :7.000  
##                                                                 
##      Q31a_1          Q31b_1          Q31c_1          Q32a_1     
##  Min.   :1.000   Min.   :1.000   Min.   :1.000   Min.   :1.000  
##  1st Qu.:4.500   1st Qu.:4.000   1st Qu.:2.500   1st Qu.:5.000  
##  Median :6.000   Median :5.000   Median :4.000   Median :6.000  
##  Mean   :5.277   Mean   :5.019   Mean   :3.735   Mean   :5.594  
##  3rd Qu.:7.000   3rd Qu.:6.500   3rd Qu.:5.000   3rd Qu.:6.000  
##  Max.   :7.000   Max.   :7.000   Max.   :7.000   Max.   :7.000  
##                                                                 
##      Q32b_1          Q32c_1          Q32d_1         Q32e_1         Q33       
##  Min.   :1.000   Min.   :1.000   Min.   :1.00   Min.   :1.0   Min.   :1.000  
##  1st Qu.:4.000   1st Qu.:5.000   1st Qu.:5.00   1st Qu.:3.0   1st Qu.:3.000  
##  Median :5.000   Median :6.000   Median :6.00   Median :5.0   Median :5.000  
##  Mean   :5.135   Mean   :5.484   Mean   :5.71   Mean   :4.4   Mean   :4.381  
##  3rd Qu.:6.000   3rd Qu.:6.000   3rd Qu.:7.00   3rd Qu.:6.0   3rd Qu.:6.000  
##  Max.   :7.000   Max.   :7.000   Max.   :7.00   Max.   :7.0   Max.   :7.000  
##                                                                              
##       Q34             Q35             Q36a            Q36b       
##  Min.   :1.000   Min.   :1.000   Min.   :0.000   Min.   :0.0000  
##  1st Qu.:4.500   1st Qu.:3.000   1st Qu.:0.000   1st Qu.:0.0000  
##  Median :6.000   Median :5.000   Median :1.000   Median :0.0000  
##  Mean   :5.348   Mean   :4.529   Mean   :0.729   Mean   :0.4194  
##  3rd Qu.:6.500   3rd Qu.:6.000   3rd Qu.:1.000   3rd Qu.:1.0000  
##  Max.   :7.000   Max.   :7.000   Max.   :1.000   Max.   :1.0000  
##                                                                  
##       Q36c             Q36d             Q36e             Q46        
##  Min.   :0.0000   Min.   :0.0000   Min.   :0.0000   Min.   :  1.00  
##  1st Qu.:0.0000   1st Qu.:0.0000   1st Qu.:0.0000   1st Qu.: 68.50  
##  Median :1.0000   Median :0.0000   Median :0.0000   Median : 80.00  
##  Mean   :0.6194   Mean   :0.3613   Mean   :0.1677   Mean   : 75.92  
##  3rd Qu.:1.0000   3rd Qu.:1.0000   3rd Qu.:0.0000   3rd Qu.: 90.00  
##  Max.   :1.0000   Max.   :1.0000   Max.   :1.0000   Max.   :100.00  
##                                                                     
##      Q1a_1           Q1b_1           Q1c_1           Q1d_1           Q1e_1    
##  Min.   :1.000   Min.   :2.000   Min.   :1.000   Min.   :2.000   Min.   :1.0  
##  1st Qu.:5.000   1st Qu.:5.000   1st Qu.:5.000   1st Qu.:6.000   1st Qu.:5.0  
##  Median :6.000   Median :6.000   Median :6.000   Median :6.000   Median :6.0  
##  Mean   :5.781   Mean   :5.884   Mean   :6.045   Mean   :6.045   Mean   :5.4  
##  3rd Qu.:7.000   3rd Qu.:7.000   3rd Qu.:7.000   3rd Qu.:7.000   3rd Qu.:7.0  
##  Max.   :7.000   Max.   :7.000   Max.   :7.000   Max.   :7.000   Max.   :7.0  
##                                                                               
##      Q1f_1           Q2a_1           Q2b_1           Q2c_1      
##  Min.   :1.000   Min.   :2.000   Min.   :2.000   Min.   :1.000  
##  1st Qu.:5.000   1st Qu.:6.000   1st Qu.:5.000   1st Qu.:5.000  
##  Median :6.000   Median :6.000   Median :6.000   Median :6.000  
##  Mean   :5.477   Mean   :6.039   Mean   :5.619   Mean   :5.555  
##  3rd Qu.:7.000   3rd Qu.:7.000   3rd Qu.:6.000   3rd Qu.:6.000  
##  Max.   :7.000   Max.   :7.000   Max.   :7.000   Max.   :7.000  
##                                                                 
##      Q3a_1           Q3b_1           Q3c_1           Q4a_1      
##  Min.   :1.000   Min.   :2.000   Min.   :1.000   Min.   :1.000  
##  1st Qu.:3.000   1st Qu.:6.000   1st Qu.:6.000   1st Qu.:5.000  
##  Median :5.000   Median :6.000   Median :6.000   Median :6.000  
##  Mean   :4.813   Mean   :6.052   Mean   :5.974   Mean   :5.445  
##  3rd Qu.:6.000   3rd Qu.:7.000   3rd Qu.:7.000   3rd Qu.:7.000  
##  Max.   :7.000   Max.   :7.000   Max.   :7.000   Max.   :7.000  
##                                                                 
##      Q4b_1           Q4c_1           Q5a_1           Q5b_1      
##  Min.   :1.000   Min.   :1.000   Min.   :1.000   Min.   :1.000  
##  1st Qu.:6.000   1st Qu.:5.000   1st Qu.:3.000   1st Qu.:5.000  
##  Median :6.000   Median :6.000   Median :5.000   Median :6.000  
##  Mean   :6.032   Mean   :5.774   Mean   :4.529   Mean   :5.955  
##  3rd Qu.:7.000   3rd Qu.:7.000   3rd Qu.:6.000   3rd Qu.:7.000  
##  Max.   :7.000   Max.   :7.000   Max.   :7.000   Max.   :7.000  
##                                                                 
##      Q5c_1           Q6a_1           Q6b_1           Q6c_1      
##  Min.   :1.000   Min.   :2.000   Min.   :1.000   Min.   :1.000  
##  1st Qu.:5.000   1st Qu.:6.000   1st Qu.:3.000   1st Qu.:3.000  
##  Median :6.000   Median :7.000   Median :4.000   Median :4.000  
##  Mean   :5.826   Mean   :6.219   Mean   :4.252   Mean   :4.219  
##  3rd Qu.:7.000   3rd Qu.:7.000   3rd Qu.:5.000   3rd Qu.:6.000  
##  Max.   :7.000   Max.   :7.000   Max.   :7.000   Max.   :7.000  
##                                                                 
##      Q7a_1           Q7b_1           Q7c_1           Q39          Q40      
##  Min.   :1.000   Min.   :1.000   Min.   :1.000   Male  :60   Min.   :1954  
##  1st Qu.:2.000   1st Qu.:5.000   1st Qu.:5.000   Female:95   1st Qu.:1969  
##  Median :4.000   Median :6.000   Median :6.000               Median :1977  
##  Mean   :3.865   Mean   :5.523   Mean   :5.645               Mean   :1978  
##  3rd Qu.:6.000   3rd Qu.:7.000   3rd Qu.:7.000               3rd Qu.:1986  
##  Max.   :7.000   Max.   :7.000   Max.   :7.000               Max.   :2002  
##                                                                            
##                     Q37                         Q38     
##  1.701 EUR - 2.500 EUR:96   No pension            :154  
##  Pension              : 1   Pension over 1.110 EUR:  1  
##  2.501 EUR - 3.300 EUR:39                               
##  Over 3.300 EUR       :19                               
##                                                         
##                                                         
##                                                         
##                                                                                           Q41    
##  Completed university academic education (also 2nd Bologna level)                           :82  
##  Incomplete primary school                                                                  : 1  
##  Completed primary school                                                                   : 0  
##  Completed lower or secondary vocational education                                          : 0  
##  Completed secondary professional or general education                                      :10  
##  Completed higher professional or university professional education (also 1st Bologna level):26  
##  Completed specialization, scientific master’s degree, doctorate                            :36  
##                                         Q42     
##  Employed for shorter/longer working hours:117  
##  Self-employed                            : 27  
##  Retired                                  : 11  
##  Currently unemployed                     :  0  
##                                                 
##                                                 
##                                                 
##                                                     Q43    
##  Office professions (e.g., IT, finance)               :64  
##  Physical work (e.g., construction, factory work)     : 1  
##  Service industry (e.g., retail, hospitality)         : 9  
##  Public sector (e.g., healthcare, education, politics):33  
##  Creative/artistic work                               : 2  
##  Self-employed or retired or currently unemployed     :46  
##                                                            
##                                            Q44                  Q45    
##  Urban area (city or metropolitan region)    :93   NLB            :63  
##  Suburban area (on the outskirts of the city):31   OTP            :27  
##  Rural area (village or countryside)         :31   Intesa Sanpaolo:18  
##                                                    Unicredit      :14  
##                                                    Revolut        : 8  
##                                                    Gorenjska banka: 6  
##                                                    (Other)        :19  
##        ID           Q40_Age     
##  Min.   :  1.0   Min.   :23.00  
##  1st Qu.: 39.5   1st Qu.:38.50  
##  Median : 78.0   Median :48.00  
##  Mean   : 78.0   Mean   :46.58  
##  3rd Qu.:116.5   3rd Qu.:56.00  
##  Max.   :155.0   Max.   :71.00  
## 

Perceptual map (PCA)

mydata_PCA <- mydata[,c("Q2a_1","Q2b_1","Q2c_1","Q3a_1","Q3b_1","Q3c_1","Q4a_1","Q4b_1","Q4c_1","Q5a_1","Q5b_1","Q5c_1","Q6a_1","Q6b_1","Q6c_1","Q7a_1","Q7b_1","Q7c_1")]
colnames(mydata_PCA) <- c("Cash_Security","Credit/Debit cards_Security","Mobile payments_Security","Cash_Speed of transactions","Credit/Debit cards_Speed of transactions","Mobile payments_Speed of transactions","Cash_Ease of use","Credit/Debit cards_Ease of use","Mobile payments_Ease of use","Cash_Convenience","Credit/Debit cards_Convenience","Mobile payments_Convenience","Cash_Privacy","Credit/Debit cards_Privacy","Mobile payments_Privacy","Cash_Tracking expenses","Credit/Debit cards_Tracking expenses","Mobile payments_Tracking expenses")
library(pastecs)
round(stat.desc(mydata_PCA, basic = FALSE), 2)
##              Cash_Security Credit/Debit cards_Security Mobile payments_Security
## median                6.00                        6.00                     6.00
## mean                  6.04                        5.62                     5.55
## SE.mean               0.09                        0.09                     0.09
## CI.mean.0.95          0.18                        0.17                     0.18
## var                   1.26                        1.13                     1.26
## std.dev               1.12                        1.06                     1.12
## coef.var              0.19                        0.19                     0.20
##              Cash_Speed of transactions
## median                             5.00
## mean                               4.81
## SE.mean                            0.15
## CI.mean.0.95                       0.29
## var                                3.27
## std.dev                            1.81
## coef.var                           0.38
##              Credit/Debit cards_Speed of transactions
## median                                           6.00
## mean                                             6.05
## SE.mean                                          0.07
## CI.mean.0.95                                     0.15
## var                                              0.87
## std.dev                                          0.93
## coef.var                                         0.15
##              Mobile payments_Speed of transactions Cash_Ease of use
## median                                        6.00             6.00
## mean                                          5.97             5.45
## SE.mean                                       0.09             0.13
## CI.mean.0.95                                  0.18             0.26
## var                                           1.30             2.74
## std.dev                                       1.14             1.66
## coef.var                                      0.19             0.30
##              Credit/Debit cards_Ease of use Mobile payments_Ease of use
## median                                 6.00                        6.00
## mean                                   6.03                        5.77
## SE.mean                                0.10                        0.11
## CI.mean.0.95                           0.19                        0.21
## var                                    1.50                        1.73
## std.dev                                1.22                        1.32
## coef.var                               0.20                        0.23
##              Cash_Convenience Credit/Debit cards_Convenience
## median                   5.00                           6.00
## mean                     4.53                           5.95
## SE.mean                  0.15                           0.10
## CI.mean.0.95             0.30                           0.20
## var                      3.60                           1.55
## std.dev                  1.90                           1.24
## coef.var                 0.42                           0.21
##              Mobile payments_Convenience Cash_Privacy
## median                              6.00         7.00
## mean                                5.83         6.22
## SE.mean                             0.11         0.09
## CI.mean.0.95                        0.22         0.18
## var                                 1.89         1.28
## std.dev                             1.37         1.13
## coef.var                            0.24         0.18
##              Credit/Debit cards_Privacy Mobile payments_Privacy
## median                             4.00                    4.00
## mean                               4.25                    4.22
## SE.mean                            0.13                    0.13
## CI.mean.0.95                       0.25                    0.26
## var                                2.55                    2.69
## std.dev                            1.60                    1.64
## coef.var                           0.38                    0.39
##              Cash_Tracking expenses Credit/Debit cards_Tracking expenses
## median                         4.00                                 6.00
## mean                           3.86                                 5.52
## SE.mean                        0.16                                 0.12
## CI.mean.0.95                   0.33                                 0.23
## var                            4.20                                 2.12
## std.dev                        2.05                                 1.46
## coef.var                       0.53                                 0.26
##              Mobile payments_Tracking expenses
## median                                    6.00
## mean                                      5.65
## SE.mean                                   0.12
## CI.mean.0.95                              0.24
## var                                       2.28
## std.dev                                   1.51
## coef.var                                  0.27
R <- cor(mydata_PCA)
library(psych)
cortest.bartlett(R, n = nrow(mydata))
## $chisq
## [1] 1272.097
## 
## $p.value
## [1] 1.417576e-175
## 
## $df
## [1] 153
library(psych)
KMO(R)
## Kaiser-Meyer-Olkin factor adequacy
## Call: KMO(r = R)
## Overall MSA =  0.71
## MSA for each item = 
##                            Cash_Security 
##                                     0.76 
##              Credit/Debit cards_Security 
##                                     0.67 
##                 Mobile payments_Security 
##                                     0.73 
##               Cash_Speed of transactions 
##                                     0.73 
## Credit/Debit cards_Speed of transactions 
##                                     0.81 
##    Mobile payments_Speed of transactions 
##                                     0.76 
##                         Cash_Ease of use 
##                                     0.72 
##           Credit/Debit cards_Ease of use 
##                                     0.76 
##              Mobile payments_Ease of use 
##                                     0.78 
##                         Cash_Convenience 
##                                     0.64 
##           Credit/Debit cards_Convenience 
##                                     0.73 
##              Mobile payments_Convenience 
##                                     0.82 
##                             Cash_Privacy 
##                                     0.80 
##               Credit/Debit cards_Privacy 
##                                     0.50 
##                  Mobile payments_Privacy 
##                                     0.54 
##                   Cash_Tracking expenses 
##                                     0.67 
##     Credit/Debit cards_Tracking expenses 
##                                     0.65 
##        Mobile payments_Tracking expenses 
##                                     0.68
library(FactoMineR)
components <- PCA(mydata_PCA,
                  scale.unit = TRUE,
                  graph = FALSE)
library(factoextra)
## Loading required package: ggplot2
## 
## Attaching package: 'ggplot2'
## The following objects are masked from 'package:psych':
## 
##     %+%, alpha
## Welcome! Want to learn more? See two factoextra-related books at https://goo.gl/ve3WBa
get_eigenvalue(components)
##        eigenvalue variance.percent cumulative.variance.percent
## Dim.1   4.3576765       24.2093140                    24.20931
## Dim.2   3.2645122       18.1361787                    42.34549
## Dim.3   1.8980219       10.5445664                    52.89006
## Dim.4   1.3961992        7.7566625                    60.64672
## Dim.5   1.1869459        6.5941437                    67.24087
## Dim.6   0.9251920        5.1399558                    72.38082
## Dim.7   0.8178039        4.5433552                    76.92418
## Dim.8   0.6538489        3.6324937                    80.55667
## Dim.9   0.6307985        3.5044359                    84.06111
## Dim.10  0.5747143        3.1928575                    87.25396
## Dim.11  0.5207851        2.8932507                    90.14721
## Dim.12  0.4027546        2.2375255                    92.38474
## Dim.13  0.3591583        1.9953239                    94.38006
## Dim.14  0.2702578        1.5014323                    95.88150
## Dim.15  0.2287672        1.2709287                    97.15242
## Dim.16  0.2124628        1.1803488                    98.33277
## Dim.17  0.1681656        0.9342535                    99.26703
## Dim.18  0.1319352        0.7329734                   100.00000
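
By the Kaiser criterion, only components with an eigenvalue above 1 would be retained; a minimal check based on the table above:

sum(get_eigenvalue(components)$eigenvalue > 1) # Five components exceed 1 here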
library(factoextra)
fviz_eig(components,
         choice = "eigenvalue",
         main = "Scree plot",
         ylab = "Eigenvalue",
         addlabels = TRUE) # Elbow method

library(psych)
fa.parallel(mydata_PCA,
            sim = FALSE,
            fa = "pc") # Parallel analysis

## Parallel analysis suggests that the number of factors =  NA  and the number of components =  3
library(tibble)
library(tidyr)
## 
## Attaching package: 'tidyr'
## The following object is masked from 'package:pastecs':
## 
##     extract
mydata_PCA_final <- mydata_PCA %>% 
  pivot_longer(everything(), names_to = "name", values_to = "score")  %>% 
  separate(name, into = c("retailer", "dimension"), sep = "_")%>% 
  pivot_wider(names_from = dimension, values_from = score, values_fn = mean) %>%
  column_to_rownames(var = "retailer")

print(mydata_PCA_final)
##                    Security Speed of transactions Ease of use Convenience
## Cash               6.038710              4.812903    5.445161    4.529032
## Credit/Debit cards 5.619355              6.051613    6.032258    5.954839
## Mobile payments    5.554839              5.974194    5.774194    5.825806
##                     Privacy Tracking expenses
## Cash               6.219355          3.864516
## Credit/Debit cards 4.251613          5.522581
## Mobile payments    4.219355          5.645161
library(FactoMineR)
components <- PCA(mydata_PCA_final,
                  scale.unit = TRUE,
                  graph = FALSE,
                  ncp = 4)

components
## **Results for the Principal Component Analysis (PCA)**
## The analysis was performed on 3 individuals, described by 6 variables
## *The results are available in the following objects:
## 
##    name               description                          
## 1  "$eig"             "eigenvalues"                        
## 2  "$var"             "results for the variables"          
## 3  "$var$coord"       "coord. for the variables"           
## 4  "$var$cor"         "correlations variables - dimensions"
## 5  "$var$cos2"        "cos2 for the variables"             
## 6  "$var$contrib"     "contributions of the variables"     
## 7  "$ind"             "results for the individuals"        
## 8  "$ind$coord"       "coord. for the individuals"         
## 9  "$ind$cos2"        "cos2 for the individuals"           
## 10 "$ind$contrib"     "contributions of the individuals"   
## 11 "$call"            "summary statistics"                 
## 12 "$call$centre"     "mean of the variables"              
## 13 "$call$ecart.type" "standard error of the variables"    
## 14 "$call$row.w"      "weights for the individuals"        
## 15 "$call$col.w"      "weights for the variables"
print(components$var$cor)
##                            Dim.1        Dim.2
## Security              -0.9833236  0.181864489
## Speed of transactions  0.9999921 -0.003975209
## Ease of use            0.9233301  0.384007219
## Convenience            0.9997555  0.022112481
## Privacy               -0.9972728  0.073803412
## Tracking expenses      0.9926335 -0.121156035
print(components$var$contrib)
##                          Dim.1        Dim.2
## Security              16.67448 16.441432738
## Speed of transactions 17.24458  0.007855317
## Ease of use           14.70190 73.303147426
## Convenience           17.23642  0.243062971
## Privacy               17.15092  2.707676213
## Tracking expenses     16.99171  7.296825335
library(factoextra)
fviz_pca_biplot(components, 
                repel = TRUE) 

Segmentation (Clustering)

mydata_cluster <- mydata[,c("ID","Q19a","Q19b","Q19c","Q19d","Q19e","Q19f","Q19g","Q19h","Q28","Q29","Q30a_1","Q30b_1","Q30c_1","Q30d_1","Q30e_1","Q30f_1","Q30g_1","Q31a_1","Q31b_1","Q46","Q39","Q40_Age","Q37","Q41","Q42","Q43","Q44","Q45")]
colnames(mydata_cluster) <- c("ID","Grocery shopping", "Online shopping", "Subscriptions", "Bill payments", "Paying for services", "Tips", "Transport", "Social events","Most frequent payment method","Cash carrying","Security concerns","Lack of trust in tehcnology","Privacy concerns","Complexity of use","Hidden transaction costs","Lack of availability","Tehnical issues concerns","Cash_Safety","Cash_privacy","Digital payment usage","Gender","Age","Income","Education","Employment status","Employment","Area of living","Primary bank")
mydata_cluster$Age <- ifelse(mydata_cluster$Age>48,"Older","Younger")
mydata_cluster$Age <- factor(mydata_cluster$Age,
                             levels = c("Older","Younger"),
                             labels = c("Older","Younger"))

mydata_cluster$Education <- ifelse(
  mydata_cluster$Education == "Completed university academic education (also 2nd Bologna level)" | 
  mydata_cluster$Education == "Completed specialization, scientific master’s degree, doctorate",
  "Higher education",
  "Lower education")
mydata_cluster$Education <- factor(mydata_cluster$Education,
                             levels = c("Higher education","Lower education"),
                             labels = c("Higher education","Lower education"))

mydata_cluster$`Employment status`<-ifelse(mydata_cluster$`Employment status`=="Employed for shorter/longer working hours","Full-time employees","Others")
mydata_cluster$`Employment status`<-factor(mydata_cluster$`Employment status`,
                                           levels = c("Full-time employees","Others"),
                                           labels = c("Full-time employees","Others"))

mydata_cluster$Employment <-ifelse(mydata_cluster$Employment == "Office professions (e.g., IT, finance)","Mental work",ifelse(mydata_cluster$Employment == "Self-employed or retired or currently unemployed","Other","Physical work"))
mydata_cluster$Employment <-factor(mydata_cluster$Employment,
                                   levels = c("Mental work","Other","Physical work"),
                                   labels = c("Mental work","Other","Physical work"))

mydata_cluster$`Area of living` <- ifelse(mydata_cluster$`Area of living` ==
"Rural area (village or countryside)","Rural area","Urban area")
mydata_cluster$`Area of living` <- factor(mydata_cluster$`Area of living`,
                                          levels = c("Urban area","Rural area"),
                                          labels = c("Urban area","Rural area"))

mydata_cluster$`Primary bank` <- ifelse(
  is.na(mydata_cluster$`Primary bank`), "Other banks",
  ifelse(mydata_cluster$`Primary bank` == "NLB", "NLB",
  ifelse(mydata_cluster$`Primary bank` == "OTP", "OTP","Other banks")))
mydata_cluster$`Primary bank` <- factor(mydata_cluster$`Primary bank`,
                                        levels = c("NLB","OTP","Other banks"),
                                        labels = c("NLB","OTP","Other banks"))

mydata_cluster$`Grocery shopping` <- ifelse(mydata_cluster$`Grocery shopping`== "Mostly digital payments" |mydata_cluster$`Grocery shopping`== "Only digital payments", "Digital payments", "Cash")
mydata_cluster$`Grocery shopping` <- factor(mydata_cluster$`Grocery shopping`,
                                            levels = c("Digital payments","Cash"),
                                            labels = c("Digital payments","Cash"))

mydata_cluster$`Online shopping` <- ifelse(mydata_cluster$`Online shopping`== "Mostly digital payments" |mydata_cluster$`Online shopping`== "Only digital payments", "Digital payments", "Cash")
mydata_cluster$`Online shopping` <- factor(mydata_cluster$`Online shopping`,
                                            levels = c("Digital payments","Cash"),
                                            labels = c("Digital payments","Cash"))

mydata_cluster$Subscriptions <- ifelse(mydata_cluster$Subscriptions == "Mostly digital payments" |mydata_cluster$Subscriptions== "Only digital payments", "Digital payments", "Cash")
mydata_cluster$Subscriptions <- factor(mydata_cluster$Subscriptions,
                                            levels = c("Digital payments","Cash"),
                                            labels = c("Digital payments","Cash"))

mydata_cluster$`Bill payments` <- ifelse(mydata_cluster$`Bill payments` == "Mostly digital payments" |mydata_cluster$`Bill payments`== "Only digital payments", "Digital payments", "Cash")
mydata_cluster$`Bill payments` <- factor(mydata_cluster$`Bill payments`,
                                            levels = c("Digital payments","Cash"),
                                            labels = c("Digital payments","Cash"))

mydata_cluster$`Paying for services` <- ifelse(mydata_cluster$`Paying for services` == "Mostly digital payments" |mydata_cluster$`Paying for services`== "Only digital payments", "Digital payments", "Cash")
mydata_cluster$`Paying for services` <- factor(mydata_cluster$`Paying for services`,
                                            levels = c("Digital payments","Cash"),
                                            labels = c("Digital payments","Cash"))

mydata_cluster$Tips <- ifelse(mydata_cluster$Tips == "Mostly digital payments" |mydata_cluster$Tips == "Only digital payments", "Digital payments", "Cash")
mydata_cluster$Tips <- factor(mydata_cluster$Tips,
                                            levels = c("Digital payments","Cash"),
                                            labels = c("Digital payments","Cash"))

mydata_cluster$Transport <- ifelse(mydata_cluster$Transport == "Mostly digital payments" |mydata_cluster$Transport == "Only digital payments", "Digital payments", "Cash")
mydata_cluster$Transport <- factor(mydata_cluster$Transport,
                                            levels = c("Digital payments","Cash"),
                                            labels = c("Digital payments","Cash"))

mydata_cluster$`Social events` <- ifelse(mydata_cluster$`Social events` == "Mostly digital payments" |mydata_cluster$`Social events` == "Only digital payments", "Digital payments", "Cash")
mydata_cluster$`Social events` <- factor(mydata_cluster$`Social events`,
                                            levels = c("Digital payments","Cash"),
                                            labels = c("Digital payments","Cash"))

mydata_cluster$`Cash carrying` <-ifelse(mydata_cluster$`Cash carrying`== "Less than 50 EUR","Small amount", "Large amount")
mydata_cluster$`Cash carrying` <- factor(mydata_cluster$`Cash carrying`,
                                            levels = c("Small amount","Large amount"),
                                            labels = c("Small amount","Large amount"))

mydata_cluster$`Digital payment usage` <- ifelse(mydata_cluster$`Digital payment usage` > 80,"High usage","Low usage")
mydata_cluster$`Digital payment usage`<- factor(mydata_cluster$`Digital payment usage`,
                                                levels = c("High usage","Low usage"),
                                                labels = c("High usage","Low usage"))
mydata_cluster_std <- as.data.frame(scale(mydata_cluster[c("Security concerns","Lack of trust in tehcnology","Privacy concerns","Complexity of use","Hidden transaction costs","Lack of availability","Tehnical issues concerns")]))
mydata_cluster$Dissimilarity <- sqrt(mydata_cluster_std$`Security concerns`^2 + mydata_cluster_std$`Lack of trust in tehcnology`^2 + mydata_cluster_std$`Privacy concerns`^2 + mydata_cluster_std$`Complexity of use`^2 + 
mydata_cluster_std$`Hidden transaction costs`^2 + mydata_cluster_std$`Lack of availability`^2 + mydata_cluster_std$`Tehnical issues concerns`^2)
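
The same quantity, each respondent's Euclidean distance from the centroid of the standardized space, can be computed more compactly (a minimal sketch, equivalent to the sum above):

mydata_cluster$Dissimilarity <- sqrt(rowSums(mydata_cluster_std^2)) # Row-wise distance from the origin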

head(mydata_cluster[order(-mydata_cluster$Dissimilarity),c("ID","Dissimilarity")])
##      ID Dissimilarity
## 95   95      5.160370
## 131 131      5.017509
## 150 150      4.852034
## 135 135      4.303267
## 75   75      4.234314
## 10   10      4.147864
library(factoextra)

# Computing Euclidean distances on the 7 standardized cluster variables and saving them into the object Distances
Distances <- get_dist(mydata_cluster_std,
                      method = "euclidean")

fviz_dist(Distances, # Showing matrix of distances
          gradient = list(low = "darkred",
                          mid = "grey95",
                          high = "white"))

library(factoextra)

get_clust_tendency(mydata_cluster_std, # Hopkins statistic (clustering tendency)
                   n = nrow(mydata_cluster_std) - 1,
                   graph = FALSE)
## $hopkins_stat
## [1] 0.6029137
## 
## $plot
## NULL
library(dplyr)
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:pastecs':
## 
##     first, last
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
library(factoextra)
WARD <-mydata_cluster_std%>%get_dist(method = "euclidean")%>%hclust("ward.D2")

WARD
## 
## Call:
## hclust(d = ., method = "ward.D2")
## 
## Cluster method   : ward.D2 
## Distance         : euclidean 
## Number of objects: 155
library(factoextra)
fviz_dend(WARD)
## Warning: The `<scale>` argument of `guides()` cannot be `FALSE`. Use "none" instead as
## of ggplot2 3.3.4.
## ℹ The deprecated feature was likely used in the factoextra package.
##   Please report the issue at <https://github.com/kassambara/factoextra/issues>.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.
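
To extract a hard partition from the Ward tree (a minimal sketch; k = 5 mirrors the k-means solution below):

WARD_groups <- cutree(WARD, k = 5) # Cut the dendrogram into 5 groups
table(WARD_groups)                 # Group sizes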

library(factoextra)
library(NbClust)

fviz_nbclust(mydata_cluster_std, kmeans, method = "wss") +
  labs(subtitle = "Elbow method")

fviz_nbclust(mydata_cluster_std, kmeans, method = "silhouette") +
  labs(subtitle = "Silhouette analysis")

library(NbClust)
NbClust(mydata_cluster_std,
        distance = "euclidean",
        min.nc = 2, max.nc = 10,
        method = "kmeans",
        index = "all")

## *** : The Hubert index is a graphical method of determining the number of clusters.
##                 In the plot of Hubert index, we seek a significant knee that corresponds to a 
##                 significant increase of the value of the measure i.e the significant peak in Hubert
##                 index second differences plot. 
## 

## *** : The D index is a graphical method of determining the number of clusters. 
##                 In the plot of D index, we seek a significant knee (the significant peak in Dindex
##                 second differences plot) that corresponds to a significant increase of the value of
##                 the measure. 
##  
## ******************************************************************* 
## * Among all indices:                                                
## * 8 proposed 2 as the best number of clusters 
## * 9 proposed 3 as the best number of clusters 
## * 2 proposed 4 as the best number of clusters 
## * 1 proposed 5 as the best number of clusters 
## * 2 proposed 9 as the best number of clusters 
## * 1 proposed 10 as the best number of clusters 
## 
##                    ***** Conclusion *****                            
##  
## * According to the majority rule, the best number of clusters is  3 
##  
##  
## *******************************************************************
## $All.index
##        KL      CH Hartigan     CCC    Scott      Marriot   TrCovW   TraceW
## 2  4.0263 89.6074  32.6618 -1.8003 186.9104 9.830137e+13 8553.364 679.8392
## 3  1.5520 70.2370  22.0769 -1.8855 347.9424 7.826123e+13 5522.140 560.2412
## 4  4.2119 60.5834  12.0651 -1.6311 441.9824 7.584616e+13 4934.573 489.1898
## 5  0.3535 51.7394  14.1502 -2.2107 559.2533 5.561248e+13 4382.911 452.9951
## 6  1.1168 47.8054  12.3119 -1.6300 594.0944 6.396073e+13 3663.242 413.9457
## 7  1.8292 44.8783   8.8407 -1.2695 691.3385 4.648777e+13 3116.462 382.3520
## 8  0.6343 41.7440   9.9972 -1.1527 761.4121 3.863523e+13 2732.960 360.7997
## 9  2.6277 39.9858   6.3559 -0.6498 818.2622 3.388444e+13 2387.824 337.8248
## 10 1.6681 37.5376   2.8868 -0.7425 862.5248 3.144097e+13 2157.894 323.7317
##    Friedman  Rubin Cindex     DB Silhouette   Duda Pseudot2   Beale Ratkowsky
## 2    5.8407 1.5857 0.4185 1.3066     0.3071 1.2713 -24.5384 -0.9624    0.4254
## 3    8.5968 1.9242 0.4305 1.4932     0.2575 1.0640  -4.9961 -0.2709    0.3984
## 4   10.0629 2.2036 0.4432 1.5785     0.2164 1.2914 -14.6655 -1.0104    0.3688
## 5   12.5157 2.3797 0.4231 1.7778     0.2207 1.7604 -23.7575 -1.8992    0.3394
## 6   12.5887 2.6042 0.4280 1.6574     0.2125 1.3692 -11.5943 -1.1798    0.3202
## 7   14.7632 2.8194 0.4089 1.5824     0.1996 1.7031 -11.5596 -1.7741    0.3033
## 8   16.0798 2.9878 0.3971 1.5012     0.2089 1.7010  -6.5936 -1.6725    0.2881
## 9   17.2846 3.1910 0.3923 1.4959     0.2172 1.6978 -13.5634 -1.7828    0.2761
## 10  18.3995 3.3299 0.4535 1.4326     0.2145 0.9441   0.6514  0.2591    0.2643
##        Ball Ptbiserial    Frey McClain   Dunn Hubert SDindex Dindex   SDbw
## 2  339.9196     0.5316  0.6084  0.6705 0.1223 0.0017  1.6143 1.9983 1.0090
## 3  186.7471     0.5470  1.4263  1.1572 0.1798 0.0020  1.6116 1.8077 0.5689
## 4  122.2975     0.4850  0.7034  1.8101 0.1401 0.0023  1.7873 1.6701 0.4778
## 5   90.5990     0.4533  0.3369  2.4501 0.1136 0.0024  2.1364 1.6084 0.4460
## 6   68.9909     0.4432  0.3131  2.9374 0.1206 0.0025  1.9228 1.5406 0.4227
## 7   54.6217     0.4315  0.2341  3.4551 0.0996 0.0026  1.9782 1.4746 0.4098
## 8   45.1000     0.4268  0.0324  3.7480 0.1040 0.0026  1.9775 1.4313 0.3847
## 9   37.5361     0.4354  0.3079  3.8445 0.1502 0.0026  2.0356 1.3887 0.3669
## 10  32.3732     0.4281 -1.1872  4.1132 0.1259 0.0027  1.9915 1.3639 0.3565
## 
## $All.CriticalValues
##    CritValue_Duda CritValue_PseudoT2 Fvalue_Beale
## 2          0.7325            41.9944       1.0000
## 3          0.7168            32.7901       1.0000
## 4          0.6881            29.4597       1.0000
## 5          0.6051            35.8946       1.0000
## 6          0.5874            30.2070       1.0000
## 7          0.5300            24.8349       1.0000
## 8          0.4004            23.9565       1.0000
## 9          0.5581            26.1319       1.0000
## 10         0.5874             7.7274       0.9686
## 
## $Best.nc
##                     KL      CH Hartigan     CCC   Scott      Marriot   TrCovW
## Number_clusters 4.0000  2.0000   3.0000  9.0000   3.000 5.000000e+00    3.000
## Value_Index     4.2119 89.6074  10.5849 -0.6498 161.032 2.858193e+13 3031.224
##                  TraceW Friedman   Rubin Cindex     DB Silhouette   Duda
## Number_clusters  3.0000    3.000  4.0000 9.0000 2.0000     2.0000 2.0000
## Value_Index     48.5467    2.756 -0.1034 0.3923 1.3066     0.3071 1.2713
##                 PseudoT2   Beale Ratkowsky     Ball PtBiserial Frey McClain
## Number_clusters   2.0000  2.0000    2.0000   3.0000      3.000    1  2.0000
## Value_Index     -24.5384 -0.9624    0.4254 153.1726      0.547   NA  0.6705
##                   Dunn Hubert SDindex Dindex    SDbw
## Number_clusters 3.0000      0  3.0000      0 10.0000
## Value_Index     0.1798      0  1.6116      0  0.3565
## 
## $Best.partition
##   [1] 1 1 3 3 1 2 2 3 3 2 3 2 2 1 3 3 1 1 3 1 1 1 2 2 3 3 3 3 1 3 3 3 2 3 1 1 1
##  [38] 1 1 3 2 2 3 2 3 3 2 3 1 1 1 1 1 3 1 1 3 1 3 1 1 3 1 3 1 1 3 1 2 1 2 2 1 1
##  [75] 2 1 1 1 2 1 1 3 3 1 1 1 1 3 1 2 3 1 1 1 2 2 1 1 1 1 1 3 1 2 2 1 3 3 2 3 1
## [112] 2 3 2 2 3 3 3 2 2 1 2 1 1 1 2 1 1 2 1 2 1 1 2 2 3 1 2 1 2 3 1 3 3 2 1 1 1
## [149] 3 2 3 3 1 1 3
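
Because kmeans starts from random centroids, results can vary between runs even with nstart = 25; setting a seed first (a minimal sketch, seed value arbitrary) makes the partition reproducible:

set.seed(123) # Fix the RNG so the k-means partition is reproducible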
Clustering <- kmeans(mydata_cluster_std,centers = 5,nstart = 25)

Clustering
## K-means clustering with 5 clusters of sizes 33, 22, 46, 25, 29
## 
## Cluster means:
##   Security concerns Lack of trust in technology Privacy concerns
## 1         0.4068932                  -0.3248583        0.3594427
## 2        -0.5064599                  -0.5360163       -0.6834639
## 3         0.2316735                   0.2796607        0.3856747
## 4        -1.4273458                  -0.9862699       -1.5781226
## 5         0.7841830                   1.1829324        0.8581595
##   Complexity of use Hidden transaction costs Lack of availability
## 1        -0.6385697               -0.4393307           -0.8801734
## 2        -0.6074640               -0.1380198            0.5873926
## 3         0.4626393                0.5039033            0.3282859
## 4        -0.9079450               -1.3492894           -0.9839571
## 5         1.2363525                0.9685183            0.8834780
##   Technical issues concerns
## 1               -0.9369453
## 2                0.4006879
## 3                0.2221564
## 4               -0.9133034
## 5                1.1971535
## 
## Clustering vector:
##   [1] 3 3 1 1 3 4 4 2 1 4 1 4 4 5 1 3 3 3 1 5 5 5 4 2 3 1 1 2 3 1 1 1 2 2 5 5 5
##  [38] 5 3 1 4 4 1 4 1 1 2 1 5 3 3 5 5 2 5 5 3 3 1 5 3 1 5 1 3 5 3 3 4 5 4 4 3 5
##  [75] 4 5 3 5 2 3 3 3 1 5 3 3 3 1 5 4 1 5 2 3 4 2 3 3 3 3 3 1 3 4 2 5 1 1 4 1 3
## [112] 2 1 4 4 1 1 1 2 4 3 2 3 3 3 2 2 3 2 3 4 5 2 4 4 3 3 4 2 2 2 3 1 1 2 3 5 5
## [149] 3 4 3 3 5 5 1
## 
## Within cluster sum of squares by cluster:
## [1]  97.60250  85.55375 138.94337  74.59530  45.97807
##  (between_SS / total_SS =  58.9 %)
## 
## Available components:
## 
## [1] "cluster"      "centers"      "totss"        "withinss"     "tot.withinss"
## [6] "betweenss"    "size"         "iter"         "ifault"
library(factoextra)
fviz_cluster(Clustering,palette = "Set1", repel = TRUE, ggtheme = theme_bw(),data = mydata_cluster_std)

library(dplyr)

mydata_cluster <-mydata_cluster %>% filter(!ID %in% c(133,139))

mydata_cluster$ID <-seq(1,nrow(mydata_cluster))

mydata_cluster_std <- as.data.frame(scale(mydata_cluster[c("Security concerns","Lack of trust in tehcnology","Privacy concerns","Complexity of use","Hidden transaction costs","Lack of availability","Tehnical issues concerns")]))
Clustering <- kmeans(mydata_cluster_std,centers = 5,nstart = 25)

Clustering
## K-means clustering with 5 clusters of sizes 48, 29, 18, 33, 25
## 
## Cluster means:
##   Security concerns Lack of trust in technology Privacy concerns
## 1         0.2208215                   0.2710113        0.3838910
## 2         0.7795892                   1.1842503        0.8574089
## 3        -0.6157766                  -0.6801511       -0.8930492
## 4         0.4045221                  -0.3207630        0.3613883
## 5        -1.4189109                  -0.9809562       -1.5657021
##   Complexity of use Hidden transaction costs Lack of availability
## 1         0.4138857                0.4485749            0.3327816
## 2         1.2245582                0.9660758            0.9056486
## 3        -0.5902442               -0.1004808            0.5943471
## 4        -0.6552154               -0.4332418           -0.8677019
## 5        -0.9252877               -1.3376864           -0.9720564
##   Technical issues concerns
## 1                0.2733460
## 2                1.2078796
## 3                0.2697852
## 4               -0.9240638
## 5               -0.9004458
## 
## Clustering vector:
##   [1] 1 1 4 4 1 5 5 1 4 5 4 5 5 2 4 1 1 1 4 2 2 2 5 3 1 4 4 3 1 4 4 4 3 3 2 2 2
##  [38] 2 1 4 5 5 4 5 4 4 3 4 2 1 1 2 2 3 2 2 1 1 4 2 1 4 2 4 1 2 1 1 5 2 5 5 1 2
##  [75] 5 2 1 2 3 1 1 1 4 2 1 1 1 4 2 5 4 2 3 1 5 3 1 1 1 1 1 4 1 5 3 2 4 4 5 4 1
## [112] 3 4 5 5 4 4 4 3 5 1 3 1 1 1 3 1 1 3 1 5 2 5 5 1 1 5 3 3 1 4 4 3 1 2 2 1 5
## [149] 1 1 2 2 4
## 
## Within cluster sum of squares by cluster:
## [1] 147.22154  45.98789  67.57078  97.17606  74.23017
##  (between_SS / total_SS =  59.4 %)
## 
## Available components:
## 
## [1] "cluster"      "centers"      "totss"        "withinss"     "tot.withinss"
## [6] "betweenss"    "size"         "iter"         "ifault"
library(factoextra)
fviz_cluster(Clustering,palette = "Set1", repel = TRUE, ggtheme = theme_bw(),data = mydata_cluster_std)

Averages <-Clustering$centers
Averages
##   Security concerns Lack of trust in technology Privacy concerns
## 1         0.2208215                   0.2710113        0.3838910
## 2         0.7795892                   1.1842503        0.8574089
## 3        -0.6157766                  -0.6801511       -0.8930492
## 4         0.4045221                  -0.3207630        0.3613883
## 5        -1.4189109                  -0.9809562       -1.5657021
##   Complexity of use Hidden transaction costs Lack of availability
## 1         0.4138857                0.4485749            0.3327816
## 2         1.2245582                0.9660758            0.9056486
## 3        -0.5902442               -0.1004808            0.5943471
## 4        -0.6552154               -0.4332418           -0.8677019
## 5        -0.9252877               -1.3376864           -0.9720564
##   Technical issues concerns
## 1                0.2733460
## 2                1.2078796
## 3                0.2697852
## 4               -0.9240638
## 5               -0.9004458
Figure <- as.data.frame(Averages)
Figure$ID <- 1:nrow(Figure)

library(tidyr)
Figure <- pivot_longer(Figure, cols = c("Security concerns","Lack of trust in tehcnology","Privacy concerns","Complexity of use","Hidden transaction costs","Lack of availability","Tehnical issues concerns"))

Figure$Group <- factor(Figure$ID,
                        levels = c(1, 2, 3, 4, 5),
                        labels = c("1", "2", "3","4", "5"))

Figure$NameF <- factor(Figure$name,
                        levels = c("Security concerns","Lack of trust in tehcnology","Privacy concerns","Complexity of use","Hidden transaction costs","Lack of availability","Tehnical issues concerns"),
                        labels = c("Security concerns","Lack of trust in tehcnology","Privacy concerns","Complexity of use","Hidden transaction costs","Lack of availability","Tehnical issues concerns"))

library(ggplot2)
ggplot(Figure, aes(x = NameF, y = value)) +
  geom_hline(yintercept = 0) +
  theme_bw() +
  geom_point(aes(shape = Group, col = Group), size = 3) +
  geom_line(aes(group = ID), linewidth = 1) +
  ylab("Averages") +
  xlab("Cluster variables") +
  ylim(-2.2, 2.2) +
  theme(axis.text.x = element_text(angle = 45, vjust = 0.50, size = 10))

mydata_cluster$Group <-Clustering$cluster
fit <- aov(cbind(`Security concerns`,`Lack of trust in tehcnology`,`Privacy concerns`,`Complexity of use`,`Hidden transaction costs`,`Lack of availability`,`Tehnical issues concerns`) ~ as.factor(Group), data = mydata_cluster)

summary(fit)
##  Response Security concerns :
##                   Df Sum Sq Mean Sq F value    Pr(>F)    
## as.factor(Group)   4 216.22  54.054  43.948 < 2.2e-16 ***
## Residuals        148 182.03   1.230                      
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
##  Response Lack of trust in tehcnology :
##                   Df Sum Sq Mean Sq F value    Pr(>F)    
## as.factor(Group)   4 279.38  69.846  41.085 < 2.2e-16 ***
## Residuals        148 251.61   1.700                      
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
##  Response Privacy concerns :
##                   Df Sum Sq Mean Sq F value    Pr(>F)    
## as.factor(Group)   4 338.46  84.615  91.826 < 2.2e-16 ***
## Residuals        148 136.38   0.921                      
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
##  Response Complexity of use :
##                   Df Sum Sq Mean Sq F value    Pr(>F)    
## as.factor(Group)   4 353.31  88.328  59.221 < 2.2e-16 ***
## Residuals        148 220.74   1.491                      
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
##  Response Hidden transaction costs :
##                   Df Sum Sq Mean Sq F value    Pr(>F)    
## as.factor(Group)   4 291.36  72.841  50.649 < 2.2e-16 ***
## Residuals        148 212.84   1.438                      
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
##  Response Lack of availability :
##                   Df Sum Sq Mean Sq F value    Pr(>F)    
## as.factor(Group)   4 289.88  72.470  45.619 < 2.2e-16 ***
## Residuals        148 235.11   1.589                      
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
##  Response Tehnical issues concerns :
##                   Df Sum Sq Mean Sq F value    Pr(>F)    
## as.factor(Group)   4 363.82  90.954  62.814 < 2.2e-16 ***
## Residuals        148 214.30   1.448                      
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
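
All seven one-way ANOVAs are significant, confirming that the clusters separate on every variable. As a rough effect-size reading (a sketch using the sums of squares printed above), eta squared per response is SS_between / (SS_between + SS_residual):

# Example for Security concerns: cluster membership accounts for ~54 % of its variance
216.22 / (216.22 + 182.03) # ~0.54
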
aggregate(mydata_cluster$Cash_Safety,by=list(mydata_cluster$Group),FUN=mean) # Cross-validity check with a variable not used in the clustering
##   Group.1        x
## 1       1 5.583333
## 2       2 6.137931
## 3       3 4.555556
## 4       4 5.242424
## 5       5 4.240000
aggregate(mydata_cluster$Cash_privacy,by=list(mydata_cluster$Group),FUN=mean)
##   Group.1        x
## 1       1 5.208333
## 2       2 6.034483
## 3       3 4.500000
## 4       4 5.212121
## 5       5 3.760000

Cross-validity check successful: both validation variables (cash safety and cash privacy) show the same pattern of means across the clusters (highest in cluster 2, lowest in cluster 5), consistent with the cluster profiles.
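
The same readout can be produced in a single call (an equivalent sketch using the formula interface of aggregate()):

# Means of both validation variables by cluster in one table
aggregate(cbind(Cash_Safety, Cash_privacy) ~ Group, data = mydata_cluster, FUN = mean)
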

library(car) # Criterion validity
## Loading required package: carData
## 
## Attaching package: 'car'
## The following object is masked from 'package:dplyr':
## 
##     recode
## The following object is masked from 'package:psych':
## 
##     logit
leveneTest(mydata_cluster$Cash_Safety,as.factor(mydata_cluster$Group))
## Levene's Test for Homogeneity of Variance (center = median)
##        Df F value   Pr(>F)   
## group   4  4.0225 0.003982 **
##       148                    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
library(car)
leveneTest(mydata_cluster$Cash_privacy,as.factor(mydata_cluster$Group))
## Levene's Test for Homogeneity of Variance (center = median)
##        Df F value   Pr(>F)   
## group   4  4.3148 0.002489 **
##       148                    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
library(dplyr)
library(rstatix)
## 
## Attaching package: 'rstatix'
## The following object is masked from 'package:stats':
## 
##     filter
mydata_cluster%>%
  group_by(as.factor(mydata_cluster$Group)) %>% 
  shapiro_test(Cash_Safety)
## # A tibble: 5 × 4
##   `as.factor(mydata_cluster$Group)` variable    statistic         p
##   <fct>                             <chr>           <dbl>     <dbl>
## 1 1                                 Cash_Safety     0.867 0.0000615
## 2 2                                 Cash_Safety     0.774 0.0000295
## 3 3                                 Cash_Safety     0.875 0.0219   
## 4 4                                 Cash_Safety     0.871 0.00100  
## 5 5                                 Cash_Safety     0.906 0.0243
mydata_cluster%>%
  group_by(as.factor(mydata_cluster$Group)) %>% 
  shapiro_test(Cash_privacy) # Normality violated in every group, so the Kruskal-Wallis tests below are the appropriate check
## # A tibble: 5 × 4
##   `as.factor(mydata_cluster$Group)` variable     statistic         p
##   <fct>                             <chr>            <dbl>     <dbl>
## 1 1                                 Cash_privacy     0.875 0.000109 
## 2 2                                 Cash_privacy     0.780 0.0000366
## 3 3                                 Cash_privacy     0.863 0.0135   
## 4 4                                 Cash_privacy     0.855 0.000448 
## 5 5                                 Cash_privacy     0.880 0.00682
fit1 <-aov(cbind(mydata_cluster$Cash_Safety,mydata_cluster$Cash_privacy)~as.factor(Group),data = mydata_cluster)

summary(fit1) # Criterion validity check: the groups differ significantly on both variables
##  Response 1 :
##                   Df Sum Sq Mean Sq F value    Pr(>F)    
## as.factor(Group)   4  62.29 15.5726  6.8557 4.328e-05 ***
## Residuals        148 336.18  2.2715                      
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
##  Response 2 :
##                   Df Sum Sq Mean Sq F value    Pr(>F)    
## as.factor(Group)   4  77.22 19.3056  6.7474 5.133e-05 ***
## Residuals        148 423.46  2.8612                      
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
kruskal.test(mydata_cluster$Cash_Safety~Group,
             data = mydata_cluster) # We reject H0: the distribution location of cash safety differs across the groups, data validated.
## 
##  Kruskal-Wallis rank sum test
## 
## data:  mydata_cluster$Cash_Safety by Group
## Kruskal-Wallis chi-squared = 20.787, df = 4, p-value = 0.0003489
kruskal.test(mydata_cluster$Cash_privacy~Group,
             data = mydata_cluster) # We reject H0: the distribution location of cash privacy differs across the groups, data validated.
## 
##  Kruskal-Wallis rank sum test
## 
## data:  mydata_cluster$Cash_privacy by Group
## Kruskal-Wallis chi-squared = 20.178, df = 4, p-value = 0.0004606
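
Both Kruskal-Wallis tests are significant. An accompanying effect size would quantify how strongly the groups differ; a sketch using kruskal_effsize() from rstatix (already loaded above), which reports an eta-squared-based estimate:

# Eta-squared-based effect sizes for the Kruskal-Wallis tests above
mydata_cluster %>% kruskal_effsize(Cash_Safety ~ Group)
mydata_cluster %>% kruskal_effsize(Cash_privacy ~ Group)
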
Descriptor1 <- chisq.test(mydata_cluster$Gender,as.factor(mydata_cluster$Group))
Descriptor1 # Cannot be used as a descriptor: p-value too high, not validated
## 
##  Pearson's Chi-squared test
## 
## data:  mydata_cluster$Gender and as.factor(mydata_cluster$Group)
## X-squared = 2.3469, df = 4, p-value = 0.6722
addmargins(Descriptor1$observed)
##                      
## mydata_cluster$Gender   1   2   3   4   5 Sum
##                Male    20   9   9  13   8  59
##                Female  28  20   9  20  17  94
##                Sum     48  29  18  33  25 153
addmargins(round(Descriptor1$expected,2))
##                      
## mydata_cluster$Gender     1     2     3     4     5 Sum
##                Male   18.51 11.18  6.94 12.73  9.64  59
##                Female 29.49 17.82 11.06 20.27 15.36  94
##                Sum    48.00 29.00 18.00 33.00 25.00 153
round(Descriptor1$residuals,2)
##                      
## mydata_cluster$Gender     1     2     3     4     5
##                Male    0.35 -0.65  0.78  0.08 -0.53
##                Female -0.27  0.52 -0.62 -0.06  0.42
library(effectsize)
## 
## Attaching package: 'effectsize'
## The following objects are masked from 'package:rstatix':
## 
##     cohens_d, eta_squared
## The following object is masked from 'package:psych':
## 
##     phi
effectsize::cramers_v(mydata_cluster$Gender,mydata_cluster$Group)
## Cramer's V (adj.) |       95% CI
## --------------------------------
## 0.00              | [0.00, 1.00]
## 
## - One-sided CIs: upper bound fixed at [1.00].
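
The same four-step readout (chi-squared test, observed and expected margins, standardised residuals, Cramér's V) is repeated for every candidate descriptor below. A small helper function would keep those checks consistent; a hypothetical sketch (the original analysis repeats the calls inline):

# Hypothetical helper wrapping the repeated descriptor readout
describe_cluster_var <- function(x, group) {
  tst <- chisq.test(x, as.factor(group))
  print(tst)
  print(addmargins(tst$observed))
  print(addmargins(round(tst$expected, 2)))
  print(round(tst$residuals, 2))
  effectsize::cramers_v(x, group)
}
# Usage: describe_cluster_var(mydata_cluster$Age, mydata_cluster$Group)
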
Descriptor2 <- chisq.test(mydata_cluster$Age,as.factor(mydata_cluster$Group))
Descriptor2 # Cannot be used as a descriptor: p-value too high, not validated
## 
##  Pearson's Chi-squared test
## 
## data:  mydata_cluster$Age and as.factor(mydata_cluster$Group)
## X-squared = 4.0682, df = 4, p-value = 0.3969
addmargins(Descriptor2$observed)
##                   
## mydata_cluster$Age   1   2   3   4   5 Sum
##            Older    26  17   7  17   9  76
##            Younger  22  12  11  16  16  77
##            Sum      48  29  18  33  25 153
addmargins(round(Descriptor2$expected,2))
##                   
## mydata_cluster$Age     1     2     3     4     5 Sum
##            Older   23.84 14.41  8.94 16.39 12.42  76
##            Younger 24.16 14.59  9.06 16.61 12.58  77
##            Sum     48.00 29.00 18.00 33.00 25.00 153
round(Descriptor2$residuals,2)
##                   
## mydata_cluster$Age     1     2     3     4     5
##            Older    0.44  0.68 -0.65  0.15 -0.97
##            Younger -0.44 -0.68  0.64 -0.15  0.96
library(effectsize)
effectsize::cramers_v(mydata_cluster$Age,mydata_cluster$Group)
## Cramer's V (adj.) |       95% CI
## --------------------------------
## 0.02              | [0.00, 1.00]
## 
## - One-sided CIs: upper bound fixed at [1.00].
Descriptor3 <- chisq.test(mydata_cluster$Education,as.factor(mydata_cluster$Group))
## Warning in chisq.test(mydata_cluster$Education,
## as.factor(mydata_cluster$Group)): Chi-squared approximation may be incorrect
Descriptor3 # Used as a descriptor: p-value acceptable (2 categories, sample size 153)
## 
##  Pearson's Chi-squared test
## 
## data:  mydata_cluster$Education and as.factor(mydata_cluster$Group)
## X-squared = 9.2882, df = 4, p-value = 0.05429
addmargins(Descriptor3$observed)
##                         
## mydata_cluster$Education   1   2   3   4   5 Sum
##         Higher education  38  16  15  27  21 117
##         Lower education   10  13   3   6   4  36
##         Sum               48  29  18  33  25 153
addmargins(round(Descriptor3$expected,2))
##                         
## mydata_cluster$Education     1     2     3     4     5    Sum
##         Higher education 36.71 22.18 13.76 25.24 19.12 117.01
##         Lower education  11.29  6.82  4.24  7.76  5.88  35.99
##         Sum              48.00 29.00 18.00 33.00 25.00 153.00
round(Descriptor3$residuals,2)
##                         
## mydata_cluster$Education     1     2     3     4     5
##         Higher education  0.21 -1.31  0.33  0.35  0.43
##         Lower education  -0.39  2.36 -0.60 -0.63 -0.78
library(effectsize)
effectsize::cramers_v(mydata_cluster$Education,mydata_cluster$Group)
## Cramer's V (adj.) |       95% CI
## --------------------------------
## 0.19              | [0.00, 1.00]
## 
## - One-sided CIs: upper bound fixed at [1.00].
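
Where R warns that the chi-squared approximation may be incorrect (some expected counts fall below 5), a Monte Carlo p-value is one standard remedy alongside Fisher's exact test; a sketch for the education descriptor:

# Simulated p-value avoids the small-expected-count approximation issue
chisq.test(mydata_cluster$Education, as.factor(mydata_cluster$Group),
           simulate.p.value = TRUE, B = 10000)
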
Descriptor4 <- chisq.test(mydata_cluster$`Employment status`,as.factor(mydata_cluster$Group))
## Warning in chisq.test(mydata_cluster$`Employment status`,
## as.factor(mydata_cluster$Group)): Chi-squared approximation may be incorrect
Descriptor4 # Used as a descriptor: p-value acceptable (2 categories, sample size 153)
## 
##  Pearson's Chi-squared test
## 
## data:  mydata_cluster$`Employment status` and as.factor(mydata_cluster$Group)
## X-squared = 10.336, df = 4, p-value = 0.03514
addmargins(Descriptor4$observed)
##                      
##                         1   2   3   4   5 Sum
##   Full-time employees  36  18  18  23  21 116
##   Others               12  11   0  10   4  37
##   Sum                  48  29  18  33  25 153
addmargins(round(Descriptor4$expected,2))
##                      
##                           1     2     3     4     5 Sum
##   Full-time employees 36.39 21.99 13.65 25.02 18.95 116
##   Others              11.61  7.01  4.35  7.98  6.05  37
##   Sum                 48.00 29.00 18.00 33.00 25.00 153
round(Descriptor4$residuals,2)
##                      
##                           1     2     3     4     5
##   Full-time employees -0.07 -0.85  1.18 -0.40  0.47
##   Others               0.12  1.51 -2.09  0.71 -0.83
library(effectsize)
effectsize::cramers_v(mydata_cluster$`Employment status`,mydata_cluster$Group)
## Cramer's V (adj.) |       95% CI
## --------------------------------
## 0.20              | [0.00, 1.00]
## 
## - One-sided CIs: upper bound fixed at [1.00].
Descriptor5 <- chisq.test(mydata_cluster$Employment,as.factor(mydata_cluster$Group))
Descriptor5 # Cannot be used as a descriptor: p-value too high, not validated
## 
##  Pearson's Chi-squared test
## 
## data:  mydata_cluster$Employment and as.factor(mydata_cluster$Group)
## X-squared = 10.262, df = 8, p-value = 0.2471
addmargins(Descriptor5$observed)
##                          
## mydata_cluster$Employment   1   2   3   4   5 Sum
##             Mental work    21  11   7  14  11  64
##             Other          16  11   1  10   6  44
##             Physical work  11   7  10   9   8  45
##             Sum            48  29  18  33  25 153
addmargins(round(Descriptor5$expected,2))
##                          
## mydata_cluster$Employment     1     2     3     4     5 Sum
##             Mental work   20.08 12.13  7.53 13.80 10.46  64
##             Other         13.80  8.34  5.18  9.49  7.19  44
##             Physical work 14.12  8.53  5.29  9.71  7.35  45
##             Sum           48.00 29.00 18.00 33.00 25.00 153
round(Descriptor5$residuals,2)
##                          
## mydata_cluster$Employment     1     2     3     4     5
##             Mental work    0.21 -0.32 -0.19  0.05  0.17
##             Other          0.59  0.92 -1.84  0.17 -0.44
##             Physical work -0.83 -0.52  2.05 -0.23  0.24
library(effectsize)
effectsize::cramers_v(mydata_cluster$Employment,mydata_cluster$Group)
## Cramer's V (adj.) |       95% CI
## --------------------------------
## 0.09              | [0.00, 1.00]
## 
## - One-sided CIs: upper bound fixed at [1.00].
Descriptor6 <- chisq.test(mydata_cluster$`Area of living`,as.factor(mydata_cluster$Group))
## Warning in chisq.test(mydata_cluster$`Area of living`,
## as.factor(mydata_cluster$Group)): Chi-squared approximation may be incorrect
Descriptor6 # Cannot be used as a descriptor: p-value too high, not validated
## 
##  Pearson's Chi-squared test
## 
## data:  mydata_cluster$`Area of living` and as.factor(mydata_cluster$Group)
## X-squared = 2.7095, df = 4, p-value = 0.6076
addmargins(Descriptor6$observed)
##             
##                1   2   3   4   5 Sum
##   Urban area  36  22  16  28  21 123
##   Rural area  12   7   2   5   4  30
##   Sum         48  29  18  33  25 153
addmargins(round(Descriptor6$expected,2))
##             
##                  1     2     3     4    5 Sum
##   Urban area 38.59 23.31 14.47 26.53 20.1 123
##   Rural area  9.41  5.69  3.53  6.47  4.9  30
##   Sum        48.00 29.00 18.00 33.00 25.0 153
round(Descriptor6$residuals,2)
##             
##                  1     2     3     4     5
##   Urban area -0.42 -0.27  0.40  0.29  0.20
##   Rural area  0.84  0.55 -0.81 -0.58 -0.41
library(effectsize)
effectsize::cramers_v(mydata_cluster$`Area of living`,mydata_cluster$Group)
## Cramer's V (adj.) |       95% CI
## --------------------------------
## 0.00              | [0.00, 1.00]
## 
## - One-sided CIs: upper bound fixed at [1.00].
Descriptor7 <- chisq.test(mydata_cluster$`Primary bank`,as.factor(mydata_cluster$Group))
## Warning in chisq.test(mydata_cluster$`Primary bank`,
## as.factor(mydata_cluster$Group)): Chi-squared approximation may be incorrect
Descriptor7 # Used as a descriptor even though the p-value (0.139) is slightly above 10% (3 categories, sample size 153)
## 
##  Pearson's Chi-squared test
## 
## data:  mydata_cluster$`Primary bank` and as.factor(mydata_cluster$Group)
## X-squared = 12.279, df = 8, p-value = 0.1392
addmargins(Descriptor7$observed)
##                              
## mydata_cluster$`Primary bank`   1   2   3   4   5 Sum
##                   NLB          25  12   5  12   8  62
##                   OTP           5   8   1   8   5  27
##                   Other banks  18   9  12  13  12  64
##                   Sum          48  29  18  33  25 153
addmargins(round(Descriptor7$expected,2))
##                              
## mydata_cluster$`Primary bank`     1     2     3     4     5    Sum
##                   NLB         19.45 11.75  7.29 13.37 10.13  61.99
##                   OTP          8.47  5.12  3.18  5.82  4.41  27.00
##                   Other banks 20.08 12.13  7.53 13.80 10.46  64.00
##                   Sum         48.00 29.00 18.00 32.99 25.00 152.99
round(Descriptor7$residuals,2)
##                              
## mydata_cluster$`Primary bank`     1     2     3     4     5
##                   NLB          1.26  0.07 -0.85 -0.38 -0.67
##                   OTP         -1.19  1.27 -1.22  0.90  0.28
##                   Other banks -0.46 -0.90  1.63 -0.22  0.48
library(effectsize)
effectsize::cramers_v(mydata_cluster$`Primary bank`,mydata_cluster$Group)
## Cramer's V (adj.) |       95% CI
## --------------------------------
## 0.12              | [0.00, 1.00]
## 
## - One-sided CIs: upper bound fixed at [1.00].
Descriptor8 <- chisq.test(mydata_cluster$`Grocery shopping`,as.factor(mydata_cluster$Group))
## Warning in chisq.test(mydata_cluster$`Grocery shopping`,
## as.factor(mydata_cluster$Group)): Chi-squared approximation may be incorrect
Descriptor8 # Cannot be used as a descriptor: p-value too high, not validated
## 
##  Pearson's Chi-squared test
## 
## data:  mydata_cluster$`Grocery shopping` and as.factor(mydata_cluster$Group)
## X-squared = 5.5936, df = 4, p-value = 0.2316
addmargins(Descriptor8$observed)
##                   
##                      1   2   3   4   5 Sum
##   Digital payments  34  17  15  24  21 111
##   Cash              14  12   3   9   4  42
##   Sum               48  29  18  33  25 153
addmargins(round(Descriptor8$expected,2))
##                   
##                        1     2     3     4     5 Sum
##   Digital payments 34.82 21.04 13.06 23.94 18.14 111
##   Cash             13.18  7.96  4.94  9.06  6.86  42
##   Sum              48.00 29.00 18.00 33.00 25.00 153
round(Descriptor8$residuals,2)
##                   
##                        1     2     3     4     5
##   Digital payments -0.14 -0.88  0.54  0.01  0.67
##   Cash              0.23  1.43 -0.87 -0.02 -1.09
library(effectsize)
effectsize::cramers_v(mydata_cluster$`Grocery shopping`,mydata_cluster$Group)
## Cramer's V (adj.) |       95% CI
## --------------------------------
## 0.10              | [0.00, 1.00]
## 
## - One-sided CIs: upper bound fixed at [1.00].
Descriptor9 <- chisq.test(mydata_cluster$`Online shopping`,as.factor(mydata_cluster$Group))
## Warning in chisq.test(mydata_cluster$`Online shopping`,
## as.factor(mydata_cluster$Group)): Chi-squared approximation may be incorrect
Descriptor9 # Used as a descriptor: p-value acceptable (2 categories, sample size 153)
## 
##  Pearson's Chi-squared test
## 
## data:  mydata_cluster$`Online shopping` and as.factor(mydata_cluster$Group)
## X-squared = 12.068, df = 4, p-value = 0.01685
addmargins(Descriptor9$observed)
##                   
##                      1   2   3   4   5 Sum
##   Digital payments  39  19  15  29  25 127
##   Cash               9  10   3   4   0  26
##   Sum               48  29  18  33  25 153
addmargins(round(Descriptor9$expected,2))
##                   
##                        1     2     3     4     5    Sum
##   Digital payments 39.84 24.07 14.94 27.39 20.75 126.99
##   Cash              8.16  4.93  3.06  5.61  4.25  26.01
##   Sum              48.00 29.00 18.00 33.00 25.00 153.00
round(Descriptor9$residuals,2)
##                   
##                        1     2     3     4     5
##   Digital payments -0.13 -1.03  0.02  0.31  0.93
##   Cash              0.30  2.28 -0.03 -0.68 -2.06
library(effectsize)
effectsize::cramers_v(mydata_cluster$`Online shopping`,mydata_cluster$Group)
## Cramer's V (adj.) |       95% CI
## --------------------------------
## 0.23              | [0.00, 1.00]
## 
## - One-sided CIs: upper bound fixed at [1.00].
fisher.test(mydata_cluster$`Online shopping`,as.factor(mydata_cluster$Group)) # Fisher's exact test, since more than two expected frequencies are below 5. H0: the variables are independent; H1: they are not. We reject H0 at p = 0.010: the variables are not independent.
## 
##  Fisher's Exact Test for Count Data
## 
## data:  mydata_cluster$`Online shopping` and as.factor(mydata_cluster$Group)
## p-value = 0.01044
## alternative hypothesis: two.sided
Descriptor10 <- chisq.test(mydata_cluster$Subscriptions,as.factor(mydata_cluster$Group))
## Warning in chisq.test(mydata_cluster$Subscriptions,
## as.factor(mydata_cluster$Group)): Chi-squared approximation may be incorrect
Descriptor10 # Used as a descriptor: p-value acceptable (2 categories, sample size 153)
## 
##  Pearson's Chi-squared test
## 
## data:  mydata_cluster$Subscriptions and as.factor(mydata_cluster$Group)
## X-squared = 12.105, df = 4, p-value = 0.01658
addmargins(Descriptor10$observed)
##                             
## mydata_cluster$Subscriptions   1   2   3   4   5 Sum
##             Digital payments  42  20  15  30  25 132
##             Cash               6   9   3   3   0  21
##             Sum               48  29  18  33  25 153
addmargins(round(Descriptor10$expected,2))
##                             
## mydata_cluster$Subscriptions     1     2     3     4     5 Sum
##             Digital payments 41.41 25.02 15.53 28.47 21.57 132
##             Cash              6.59  3.98  2.47  4.53  3.43  21
##             Sum              48.00 29.00 18.00 33.00 25.00 153
round(Descriptor10$residuals,2)
##                             
## mydata_cluster$Subscriptions     1     2     3     4     5
##             Digital payments  0.09 -1.00 -0.13  0.29  0.74
##             Cash             -0.23  2.52  0.34 -0.72 -1.85
library(effectsize)
effectsize::cramers_v(mydata_cluster$Subscriptions,mydata_cluster$Group)
## Cramer's V (adj.) |       95% CI
## --------------------------------
## 0.23              | [0.00, 1.00]
## 
## - One-sided CIs: upper bound fixed at [1.00].
fisher.test(mydata_cluster$Subscriptions, as.factor(mydata_cluster$Group)) # Fisher's exact test, since more than two expected frequencies are below 5. H0: the variables are independent; H1: they are not. We reject H0 at p = 0.014: the variables are not independent.
## 
##  Fisher's Exact Test for Count Data
## 
## data:  mydata_cluster$Subscriptions and as.factor(mydata_cluster$Group)
## p-value = 0.01436
## alternative hypothesis: two.sided
Descriptor11 <- chisq.test(mydata_cluster$`Bill payments`,as.factor(mydata_cluster$Group))
## Warning in chisq.test(mydata_cluster$`Bill payments`,
## as.factor(mydata_cluster$Group)): Chi-squared approximation may be incorrect
Descriptor11 # Cannot be used as a descriptor: p-value too high, not validated
## 
##  Pearson's Chi-squared test
## 
## data:  mydata_cluster$`Bill payments` and as.factor(mydata_cluster$Group)
## X-squared = 5.5678, df = 4, p-value = 0.2338
addmargins(Descriptor11$observed)
##                               
## mydata_cluster$`Bill payments`   1   2   3   4   5 Sum
##               Digital payments  41  23  15  29  25 133
##               Cash               7   6   3   4   0  20
##               Sum               48  29  18  33  25 153
addmargins(round(Descriptor11$expected,2))
##                               
## mydata_cluster$`Bill payments`     1     2     3     4     5    Sum
##               Digital payments 41.73 25.21 15.65 28.69 21.73 133.01
##               Cash              6.27  3.79  2.35  4.31  3.27  19.99
##               Sum              48.00 29.00 18.00 33.00 25.00 153.00
round(Descriptor11$residuals,2)
##                               
## mydata_cluster$`Bill payments`     1     2     3     4     5
##               Digital payments -0.11 -0.44 -0.16  0.06  0.70
##               Cash              0.29  1.13  0.42 -0.15 -1.81
library(effectsize)
effectsize::cramers_v(mydata_cluster$`Bill payments`,mydata_cluster$Group)
## Cramer's V (adj.) |       95% CI
## --------------------------------
## 0.10              | [0.00, 1.00]
## 
## - One-sided CIs: upper bound fixed at [1.00].
fisher.test(mydata_cluster$`Bill payments`,as.factor(mydata_cluster$Group)) # Fisher's exact test, since more than two expected frequencies are below 5. H0: the variables are independent; H1: they are not. We cannot reject H0 at p = 0.149: we assume the variables are independent.
## 
##  Fisher's Exact Test for Count Data
## 
## data:  mydata_cluster$`Bill payments` and as.factor(mydata_cluster$Group)
## p-value = 0.1491
## alternative hypothesis: two.sided
Descriptor12 <- chisq.test(mydata_cluster$`Paying for services`,as.factor(mydata_cluster$Group))
Descriptor12 # Cannot be used as a descriptor: p-value too high, not validated
## 
##  Pearson's Chi-squared test
## 
## data:  mydata_cluster$`Paying for services` and as.factor(mydata_cluster$Group)
## X-squared = 2.7994, df = 4, p-value = 0.5919
addmargins(Descriptor12$observed)
##                   
##                      1   2   3   4   5 Sum
##   Digital payments  17   8   3   8   8  44
##   Cash              31  21  15  25  17 109
##   Sum               48  29  18  33  25 153
addmargins(round(Descriptor12$expected,2))
##                   
##                       1     2     3     4     5 Sum
##   Digital payments 13.8  8.34  5.18  9.49  7.19  44
##   Cash             34.2 20.66 12.82 23.51 17.81 109
##   Sum              48.0 29.00 18.00 33.00 25.00 153
round(Descriptor12$residuals,2)
##                   
##                        1     2     3     4     5
##   Digital payments  0.86 -0.12 -0.96 -0.48  0.30
##   Cash             -0.55  0.07  0.61  0.31 -0.19
library(effectsize)
effectsize::cramers_v(mydata_cluster$`Paying for services`,mydata_cluster$Group)
## Cramer's V (adj.) |       95% CI
## --------------------------------
## 0.00              | [0.00, 1.00]
## 
## - One-sided CIs: upper bound fixed at [1.00].
Descriptor13 <- chisq.test(mydata_cluster$Tips,as.factor(mydata_cluster$Group))
## Warning in chisq.test(mydata_cluster$Tips, as.factor(mydata_cluster$Group)):
## Chi-squared approximation may be incorrect
Descriptor13 # Cannot be used as a descriptor: p-value too high, not validated
## 
##  Pearson's Chi-squared test
## 
## data:  mydata_cluster$Tips and as.factor(mydata_cluster$Group)
## X-squared = 2.0889, df = 4, p-value = 0.7194
addmargins(Descriptor13$observed)
##                    
## mydata_cluster$Tips   1   2   3   4   5 Sum
##    Digital payments   4   2   2   2   4  14
##    Cash              44  27  16  31  21 139
##    Sum               48  29  18  33  25 153
addmargins(round(Descriptor13$expected,2))
##                    
## mydata_cluster$Tips     1     2     3     4     5 Sum
##    Digital payments  4.39  2.65  1.65  3.02  2.29  14
##    Cash             43.61 26.35 16.35 29.98 22.71 139
##    Sum              48.00 29.00 18.00 33.00 25.00 153
round(Descriptor13$residuals,2)
##                    
## mydata_cluster$Tips     1     2     3     4     5
##    Digital payments -0.19 -0.40  0.28 -0.59  1.13
##    Cash              0.06  0.13 -0.09  0.19 -0.36
library(effectsize)
effectsize::cramers_v(mydata_cluster$Tips,mydata_cluster$Group)
## Cramer's V (adj.) |       95% CI
## --------------------------------
## 0.00              | [0.00, 1.00]
## 
## - One-sided CIs: upper bound fixed at [1.00].
fisher.test(mydata_cluster$Tips,as.factor(mydata_cluster$Group)) # Fisher's exact test, since more than two expected frequencies are below 5. H0: the variables are independent; H1: they are not. We cannot reject H0 at p = 0.731: we assume the variables are independent.
## 
##  Fisher's Exact Test for Count Data
## 
## data:  mydata_cluster$Tips and as.factor(mydata_cluster$Group)
## p-value = 0.7314
## alternative hypothesis: two.sided
Descriptor14 <- chisq.test(mydata_cluster$Transport,as.factor(mydata_cluster$Group))
## Warning in chisq.test(mydata_cluster$Transport,
## as.factor(mydata_cluster$Group)): Chi-squared approximation may be incorrect
Descriptor14 # Cannot be used as a descriptor: p-value too high, not validated
## 
##  Pearson's Chi-squared test
## 
## data:  mydata_cluster$Transport and as.factor(mydata_cluster$Group)
## X-squared = 2.045, df = 4, p-value = 0.7275
addmargins(Descriptor14$observed)
##                         
## mydata_cluster$Transport   1   2   3   4   5 Sum
##         Digital payments  12   5   6  10   7  40
##         Cash              36  24  12  23  18 113
##         Sum               48  29  18  33  25 153
addmargins(round(Descriptor14$expected,2))
##                         
## mydata_cluster$Transport     1     2     3     4     5    Sum
##         Digital payments 12.55  7.58  4.71  8.63  6.54  40.01
##         Cash             35.45 21.42 13.29 24.37 18.46 112.99
##         Sum              48.00 29.00 18.00 33.00 25.00 153.00
round(Descriptor14$residuals,2)
##                         
## mydata_cluster$Transport     1     2     3     4     5
##         Digital payments -0.15 -0.94  0.60  0.47  0.18
##         Cash              0.09  0.56 -0.35 -0.28 -0.11
library(effectsize)
effectsize::cramers_v(mydata_cluster$Transport,mydata_cluster$Group)
## Cramer's V (adj.) |       95% CI
## --------------------------------
## 0.00              | [0.00, 1.00]
## 
## - One-sided CIs: upper bound fixed at [1.00].
Descriptor15 <- chisq.test(mydata_cluster$`Social events`,as.factor(mydata_cluster$Group))
Descriptor15 # Cannot be used as a descriptor: p-value too high, not validated
## 
##  Pearson's Chi-squared test
## 
## data:  mydata_cluster$`Social events` and as.factor(mydata_cluster$Group)
## X-squared = 1.9514, df = 4, p-value = 0.7447
addmargins(Descriptor15$observed)
##                               
## mydata_cluster$`Social events`   1   2   3   4   5 Sum
##               Digital payments  21  11  10  16  13  71
##               Cash              27  18   8  17  12  82
##               Sum               48  29  18  33  25 153
addmargins(round(Descriptor15$expected,2))
##                               
## mydata_cluster$`Social events`     1     2     3     4    5    Sum
##               Digital payments 22.27 13.46  8.35 15.31 11.6  70.99
##               Cash             25.73 15.54  9.65 17.69 13.4  82.01
##               Sum              48.00 29.00 18.00 33.00 25.0 153.00
round(Descriptor15$residuals,2)
##                               
## mydata_cluster$`Social events`     1     2     3     4     5
##               Digital payments -0.27 -0.67  0.57  0.18  0.41
##               Cash              0.25  0.62 -0.53 -0.16 -0.38
library(effectsize)
effectsize::cramers_v(mydata_cluster$`Social events`,mydata_cluster$Group)
## Cramer's V (adj.) |       95% CI
## --------------------------------
## 0.00              | [0.00, 1.00]
## 
## - One-sided CIs: upper bound fixed at [1.00].
Descriptor16 <- chisq.test(mydata_cluster$`Most frequent payment method`,as.factor(mydata_cluster$Group))
## Warning in chisq.test(mydata_cluster$`Most frequent payment method`,
## as.factor(mydata_cluster$Group)): Chi-squared approximation may be incorrect
Descriptor16 # Used as a descriptor: p-value acceptable (4 categories, sample size 153)
## 
##  Pearson's Chi-squared test
## 
## data:  mydata_cluster$`Most frequent payment method` and as.factor(mydata_cluster$Group)
## X-squared = 20.559, df = 12, p-value = 0.05722
addmargins(Descriptor16$observed)
##                         
##                            1   2   3   4   5 Sum
##   Cash                     3   3   0   3   2  11
##   Credit/Debit cards      34  23   8  17  14  96
##   Mobile payment options   5   3   8  11   7  34
##   Mobile banks             6   0   2   2   2  12
##   Sum                     48  29  18  33  25 153
addmargins(round(Descriptor16$expected,2))
##                         
##                              1     2     3     4     5    Sum
##   Cash                    3.45  2.08  1.29  2.37  1.80  10.99
##   Credit/Debit cards     30.12 18.20 11.29 20.71 15.69  96.01
##   Mobile payment options 10.67  6.44  4.00  7.33  5.56  34.00
##   Mobile banks            3.76  2.27  1.41  2.59  1.96  11.99
##   Sum                    48.00 28.99 17.99 33.00 25.01 152.99
round(Descriptor16$residuals,2)
##                         
##                              1     2     3     4     5
##   Cash                   -0.24  0.63 -1.14  0.41  0.15
##   Credit/Debit cards      0.71  1.13 -0.98 -0.81 -0.43
##   Mobile payment options -1.74 -1.36  2.00  1.35  0.61
##   Mobile banks            1.15 -1.51  0.50 -0.37  0.03
fisher.test(mydata_cluster$`Most frequent payment method`, 
            as.factor(mydata_cluster$Group), 
            workspace = 2e8)  # Larger workspace because the contingency table is too big for R's default. Fisher's exact test, since more than two expected frequencies are below 5. H0: the variables are independent; H1: they are not. We reject H0 at p = 0.031: the variables are not independent.
## 
##  Fisher's Exact Test for Count Data
## 
## data:  mydata_cluster$`Most frequent payment method` and as.factor(mydata_cluster$Group)
## p-value = 0.03078
## alternative hypothesis: two.sided
library(effectsize)
effectsize::cramers_v(mydata_cluster$`Most frequent payment method`,mydata_cluster$Group)
## Cramer's V (adj.) |       95% CI
## --------------------------------
## 0.14              | [0.00, 1.00]
## 
## - One-sided CIs: upper bound fixed at [1.00].
Descriptor17 <- chisq.test(mydata_cluster$`Cash carrying`,as.factor(mydata_cluster$Group))
Descriptor17 # Cannot be used as a descriptor: p-value too high, not validated
## 
##  Pearson's Chi-squared test
## 
## data:  mydata_cluster$`Cash carrying` and as.factor(mydata_cluster$Group)
## X-squared = 6.2186, df = 4, p-value = 0.1834
addmargins(Descriptor17$observed)
##                               
## mydata_cluster$`Cash carrying`   1   2   3   4   5 Sum
##                   Small amount  18  11  10  14  16  69
##                   Large amount  30  18   8  19   9  84
##                   Sum           48  29  18  33  25 153
addmargins(round(Descriptor17$expected,2))
##                               
## mydata_cluster$`Cash carrying`     1     2     3     4     5 Sum
##                   Small amount 21.65 13.08  8.12 14.88 11.27  69
##                   Large amount 26.35 15.92  9.88 18.12 13.73  84
##                   Sum          48.00 29.00 18.00 33.00 25.00 153
round(Descriptor17$residuals,2)
##                               
## mydata_cluster$`Cash carrying`     1     2     3     4     5
##                   Small amount -0.78 -0.57  0.66 -0.23  1.41
##                   Large amount  0.71  0.52 -0.60  0.21 -1.28
library(effectsize)
effectsize::cramers_v(mydata_cluster$`Cash carrying`,mydata_cluster$Group)
## Cramer's V (adj.) |       95% CI
## --------------------------------
## 0.12              | [0.00, 1.00]
## 
## - One-sided CIs: upper bound fixed at [1.00].
Descriptor18 <- chisq.test(mydata_cluster$`Digital payment usage`,as.factor(mydata_cluster$Group))
Descriptor18 # Used as a descriptor: p-value acceptable (2 categories, sample size 153)
## 
##  Pearson's Chi-squared test
## 
## data:  mydata_cluster$`Digital payment usage` and as.factor(mydata_cluster$Group)
## X-squared = 8.0059, df = 4, p-value = 0.09136
addmargins(Descriptor18$observed)
##             
##                1   2   3   4   5 Sum
##   High usage  18  10  11  14  16  69
##   Low usage   30  19   7  19   9  84
##   Sum         48  29  18  33  25 153
addmargins(round(Descriptor18$expected,2))
##             
##                  1     2     3     4     5 Sum
##   High usage 21.65 13.08  8.12 14.88 11.27  69
##   Low usage  26.35 15.92  9.88 18.12 13.73  84
##   Sum        48.00 29.00 18.00 33.00 25.00 153
round(Descriptor18$residuals,2)
##             
##                  1     2     3     4     5
##   High usage -0.78 -0.85  1.01 -0.23  1.41
##   Low usage   0.71  0.77 -0.92  0.21 -1.28
library(effectsize)
effectsize::cramers_v(mydata_cluster$`Digital payment usage`,mydata_cluster$Group)
## Cramer's V (adj.) |       95% CI
## --------------------------------
## 0.16              | [0.00, 1.00]
## 
## - One-sided CIs: upper bound fixed at [1.00].

Hypothesis testing

H1: Consumers who perceive digital payments as highly secure (Likert score ≥5) use them more frequently than those who perceive them as less secure.

mydata_H1 <- mydata[,c("Q2b_1","Q2c_1","Q46")]

head(mydata_H1)
##   Q2b_1 Q2c_1 Q46
## 1     6     7  80
## 2     4     4  50
## 3     6     5  90
## 4     5     5  75
## 5     6     5  72
## 6     6     6  81
mydata_H1$`Digital Payments Security` <- rowMeans(mydata_H1[, c("Q2b_1", "Q2c_1")])
mydata_H1$`Digital Payments Security` <- ifelse(mydata_H1$`Digital Payments Security`>5,"High security perception","Low security perception") # Dichotomise: mean score above 5 = high security perception
mydata_H1$`Digital Payments Security` <- factor(mydata_H1$`Digital Payments Security`,
                                                levels = c("High security perception","Low security perception"),
                                                labels = c("High security perception","Low security perception"))

colnames(mydata_H1) <- c("Cards security", "Mobile payments security", "Digital payment usage","Digital payment security")
library(psych)
describeBy(mydata_H1$`Digital payment usage`,mydata_H1$`Digital payment security`)
## 
##  Descriptive statistics by group 
## group: High security perception
##    vars   n  mean    sd median trimmed   mad min max range  skew kurtosis   se
## X1    1 107 79.36 15.85     80   81.25 14.83   1 100    99 -1.81     5.48 1.53
## ------------------------------------------------------------ 
## group: Low security perception
##    vars  n  mean    sd median trimmed   mad min max range  skew kurtosis   se
## X1    1 48 68.25 25.49     75   71.62 22.24   1  99    98 -1.17     0.56 3.68
library(ggplot2)

Security_High <- ggplot(mydata_H1[mydata_H1$`Digital payment security` == "High security perception", ], aes(x = `Digital payment usage`)) +
  theme_linedraw() +
  geom_histogram(binwidth = 1, col = "black") +
  ylab("Frequency") +
  ggtitle("High Security")

Security_Low <- ggplot(mydata_H1[mydata_H1$`Digital payment security` == "Low security perception", ], aes(x = `Digital payment usage`)) +
  theme_linedraw() +
  geom_histogram(binwidth = 1, col = "black") +
  ylab("Frequency") +
  ggtitle("Low security")

library(ggpubr)
ggarrange(Security_High, Security_Low,
          ncol = 2, nrow = 1)

The histograms are strongly left-skewed in both groups, so normality is violated (a per-group Shapiro-Wilk check is sketched after the hypotheses below) and we use the Wilcoxon rank sum test.

  • H0: The distribution location of digital payment usage is the same for individuals with low and high perceived digital payment security.
  • H1: The distribution location of digital payment usage is greater for individuals with high perceived digital payment security.
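
A formal per-group check of the visual judgement (a sketch; shapiro.test() applied within each security-perception group):

# Shapiro-Wilk test of digital payment usage within each group
by(mydata_H1$`Digital payment usage`,
   mydata_H1$`Digital payment security`,
   shapiro.test)
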
wilcox.test(mydata_H1$`Digital payment usage`~mydata_H1$`Digital payment security`,
            correct=FALSE,
            exact=FALSE,
            alternative="greater")
## 
##  Wilcoxon rank sum test
## 
## data:  mydata_H1$`Digital payment usage` by mydata_H1$`Digital payment security`
## W = 3251, p-value = 0.004052
## alternative hypothesis: true location shift is greater than 0
library(effectsize)
effectsize(wilcox.test(mydata_H1$`Digital payment usage`~mydata_H1$`Digital payment security`,
            correct=FALSE,
            exact=FALSE,
            alternative="greater"))
## r (rank biserial) |       95% CI
## --------------------------------
## 0.27              | [0.11, 1.00]
## 
## - One-sided CIs: upper bound fixed at [1.00].
interpret_rank_biserial(0.27)
## [1] "medium"
## (Rules: funder2019)

Conclusion: We reject H0 at p = 0.004 and conclude that consumers who perceive digital payments as highly secure (Likert score ≥5) use them more frequently than those who perceive them as less secure.

H2: Higher trust in bank-provided digital platforms (Likert score ≥5) is positively correlated with digital payment adoption.

mydata_H2 <- mydata[,c("Q27a_1","Q27b_1","Q27c_1","Q46")]
head(mydata_H2)
##   Q27a_1 Q27b_1 Q27c_1 Q46
## 1      7      7      7  80
## 2      7      7      4  50
## 3      7      7      7  90
## 4      6      7      6  75
## 5      6      7      7  72
## 6      4      6      6  81
mydata_H2$`Digital payment trust` <- round(rowMeans(mydata_H2[,c("Q27a_1","Q27b_1","Q27c_1")]),2)

colnames(mydata_H2) <- c("Trust large payments","Trust recurring payments", "Trust P2P","Digital payment usage","Digital payment trust")
mydata_H2f <- mydata_H2[,c("Digital payment usage","Digital payment trust")]
library(GGally)
## Registered S3 method overwritten by 'GGally':
##   method from   
##   +.gg   ggplot2
ggpairs(mydata_H2f)

cor(mydata_H2$`Digital payment usage`,mydata_H2$`Digital payment trust`,
    method = "pearson") # Linear relationship between digital payment usage and trust is positive and semi strong
## [1] 0.4940468
  • H0: Pearson's ρ is equal to 0
  • H1: Pearson's ρ is not equal to 0
cor.test(mydata_H2$`Digital payment usage`,mydata_H2$`Digital payment trust`,
    method = "pearson")
## 
##  Pearson's product-moment correlation
## 
## data:  mydata_H2$`Digital payment usage` and mydata_H2$`Digital payment trust`
## t = 7.0287, df = 153, p-value = 6.459e-11
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
##  0.3648122 0.6046050
## sample estimates:
##       cor 
## 0.4940468

Conclusion: We reject H0 at p < 0.001; Pearson's ρ is not equal to 0, and the correlation between digital payment usage and trust is positive and moderately strong (r = 0.49).
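
Because digital payment usage is skewed, a rank-based correlation is a reasonable robustness check (a sketch, not part of the original analysis):

# Spearman correlation as a robustness check on the Pearson result
cor.test(mydata_H2$`Digital payment usage`, mydata_H2$`Digital payment trust`,
         method = "spearman", exact = FALSE)
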

H3: Consumers who trust digital payments for large purchases are more likely to use them for daily transactions.

mydata_H3 <- mydata[,c("Q27a_1","Q46")]
colnames(mydata_H3) <- c("Trust large payments", "Digital payment usage")
head(mydata_H3)
##   Trust large payments Digital payment usage
## 1                    7                    80
## 2                    7                    50
## 3                    7                    90
## 4                    6                    75
## 5                    6                    72
## 6                    4                    81
mydata_H3$`Trust large payments` <- ifelse(mydata_H3$`Trust large payments`>5,"High trust","Low trust")
mydata_H3$`Trust large payments` <- factor(mydata_H3$`Trust large payments`,
                                           levels = c("High trust","Low trust"),
                                           labels = c("High trust","Low trust"))
library(ggplot2)

Trust_High <- ggplot(mydata_H3[mydata_H3$`Trust large payments` == "High trust", ], aes(x = `Digital payment usage`)) +
  theme_linedraw() +
  geom_histogram(binwidth = 1, col = "blue") +
  ylab("Frequency") +
  ggtitle("High trust")

Trust_Low <- ggplot(mydata_H3[mydata_H3$`Trust large payments` == "Low trust", ], aes(x = `Digital payment usage`)) +
  theme_linedraw() +
  geom_histogram(binwidth = 1, col = "purple") +
  ylab("Frequency") +
  ggtitle("Low trust")

library(ggpubr)
ggarrange(Trust_High, Trust_Low,
          ncol = 2, nrow = 1)

Normality is violated in both groups (the histograms are strongly left-skewed), so we use the Wilcoxon rank sum test.

  • H0: The distribution location of digital payment usage is the same for individuals with low and high trust in digital payments for large purchases.
  • H1: The distribution location of digital payment usage is greater for individuals with high trust in digital payments for large purchases.
wilcox.test(mydata_H3$`Digital payment usage`~mydata_H3$`Trust large payments`,
            correct=FALSE,
            exact=FALSE,
            alternative="greater")
## 
##  Wilcoxon rank sum test
## 
## data:  mydata_H3$`Digital payment usage` by mydata_H3$`Trust large payments`
## W = 3939.5, p-value = 5.282e-08
## alternative hypothesis: true location shift is greater than 0
library(effectsize)
effectsize(wilcox.test(mydata_H3$`Digital payment usage`~mydata_H3$`Trust large payments`,
            correct=FALSE,
            exact=FALSE,
            alternative="greater"))
## r (rank biserial) |       95% CI
## --------------------------------
## 0.53              | [0.41, 1.00]
## 
## - One-sided CIs: upper bound fixed at [1.00].
interpret_rank_biserial(0.53)
## [1] "very large"
## (Rules: funder2019)

Conclusion: We reject H0 at p < 0.001 and conclude that consumers with high trust in digital payments for large purchases (Likert score >5) are more likely to use digital payments for daily transactions than those with lower trust.

H4: The majority (more than 70%) of transactions by premium customers are conducted digitally.

mydata_H4 <- mydata["Q46"]
colnames(mydata_H4) <- "Digital payment usage"
head(mydata_H4)
##   Digital payment usage
## 1                    80
## 2                    50
## 3                    90
## 4                    75
## 5                    72
## 6                    81
  • H0: The variable digital payment usage is normally distributed
  • H1: The variable digital payment usage is not normally distributed
library(ggplot2)

ggplot(mydata_H4, aes(x = `Digital payment usage`)) +
  geom_histogram(binwidth = 1, colour = "orange", fill="blue") +
  ylab("Frequency") +
  xlab("Digital payment usage")

shapiro.test(mydata_H4$`Digital payment usage`)
## 
##  Shapiro-Wilk normality test
## 
## data:  mydata_H4$`Digital payment usage`
## W = 0.83393, p-value = 5.651e-12

Normality is violated, so we test a non-parametric hypothesis about the median.

median(mydata_H4$`Digital payment usage`)
## [1] 80
  • H0: Median is 70
  • H1: Median is not 70
wilcox.test(mydata_H4$`Digital payment usage`,
            mu = 70,
            correct = FALSE)
## 
##  Wilcoxon signed rank test
## 
## data:  mydata_H4$`Digital payment usage`
## V = 8592.5, p-value = 1.113e-07
## alternative hypothesis: true location is not equal to 70
effectsize(wilcox.test(mydata_H4$`Digital payment usage`,
            mu = 70,
            correct = FALSE))
## r (rank biserial) |       95% CI
## --------------------------------
## 0.50              | [0.35, 0.62]
## 
## - Deviation from a difference of 70.
interpret_rank_biserial(0.50)
## [1] "very large"
## (Rules: funder2019)

Conclusion: We reject H0 at p < 0.001; the sample median of digital payment usage is 80, above the 70% threshold set in the hypothesis. The majority of transactions by the premium segment are conducted digitally.
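
Since H4 is directional (more than 70%), a one-sided alternative matches the hypothesis more directly than the two-sided test above; a sketch:

# One-sided test of whether the median exceeds 70
wilcox.test(mydata_H4$`Digital payment usage`,
            mu = 70, alternative = "greater", correct = FALSE)
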

H5: Convenience is the primary motivator for the usage of digital payment methods among premium customers.

mydata_H5 <- mydata[,c("Q2b_1","Q2c_1","Q3b_1","Q3c_1","Q4b_1","Q4c_1","Q5b_1","Q5c_1","Q6b_1","Q6c_1","Q7b_1","Q7c_1","ID")]
head(mydata_H5)
##   Q2b_1 Q2c_1 Q3b_1 Q3c_1 Q4b_1 Q4c_1 Q5b_1 Q5c_1 Q6b_1 Q6c_1 Q7b_1 Q7c_1 ID
## 1     6     7     7     7     6     7     7     7     6     6     7     7  1
## 2     4     4     7     7     7     7     7     7     3     3     7     7  2
## 3     6     5     7     7     7     7     7     7     6     5     7     7  3
## 4     5     5     5     6     6     6     6     6     5     7     6     6  4
## 5     6     5     4     6     3     4     1     4     7     4     5     7  5
## 6     6     6     6     5     6     5     7     6     4     4     5     6  6
mydata_H5$Safety <- rowMeans(mydata_H5[,c("Q2b_1","Q2c_1")])
mydata_H5$Speed <- rowMeans(mydata_H5[,c("Q3b_1","Q3c_1")])
mydata_H5$Ease <- rowMeans(mydata_H5[,c("Q4b_1","Q4c_1")])
mydata_H5$Convenience <- rowMeans(mydata_H5[,c("Q5b_1","Q5c_1")])
mydata_H5$Privacy <- rowMeans(mydata_H5[,c("Q6b_1","Q6c_1")])
mydata_H5$Tracking <- rowMeans(mydata_H5[,c("Q7b_1","Q7c_1")])
mydata_H5f <- mydata_H5[,c("Safety","Speed","Ease","Convenience","Privacy","Tracking","ID")]
library(rstatix)
library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ forcats   1.0.0     ✔ readr     2.1.5
## ✔ lubridate 1.9.4     ✔ stringr   1.5.1
## ✔ purrr     1.0.2     
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ ggplot2::%+%()    masks psych::%+%()
## ✖ ggplot2::alpha()  masks psych::alpha()
## ✖ tidyr::extract()  masks pastecs::extract()
## ✖ rstatix::filter() masks dplyr::filter(), stats::filter()
## ✖ dplyr::first()    masks pastecs::first()
## ✖ dplyr::lag()      masks stats::lag()
## ✖ dplyr::last()     masks pastecs::last()
## ✖ car::recode()     masks dplyr::recode()
## ✖ purrr::some()     masks car::some()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
mydata_H5_long <- mydata_H5f %>%
  pivot_longer(cols = c("Safety","Speed","Ease","Convenience","Privacy","Tracking"),
               names_to = "Motivator",
               values_to = "Importance") %>%
  convert_as_factor(Motivator)

mydata_H5_long <- as.data.frame(mydata_H5_long)

tail(mydata_H5_long,10)
##      ID   Motivator Importance
## 921 154        Ease        7.0
## 922 154 Convenience        7.0
## 923 154     Privacy        5.0
## 924 154    Tracking        6.5
## 925 155      Safety        5.0
## 926 155       Speed        5.5
## 927 155        Ease        5.0
## 928 155 Convenience        4.5
## 929 155     Privacy        1.0
## 930 155    Tracking        6.0
library(ggpubr)

# Boxplot for each variable.
ggboxplot(mydata_H5_long,
          x = "Motivator",
          y = "Importance",
          add = "jitter")

library(tidyverse)
library(ggpubr)
library(rstatix)

# Finding outliers.
mydata_H5_long %>%
  group_by(Motivator) %>%
  identify_outliers(Importance)
## # A tibble: 25 × 5
##    Motivator      ID Importance is.outlier is.extreme
##    <fct>       <int>      <dbl> <lgl>      <lgl>     
##  1 Convenience     5        2.5 TRUE       FALSE     
##  2 Convenience    70        2.5 TRUE       FALSE     
##  3 Convenience   138        1   TRUE       FALSE     
##  4 Convenience   145        3   TRUE       FALSE     
##  5 Convenience   153        2   TRUE       FALSE     
##  6 Ease            5        3.5 TRUE       FALSE     
##  7 Ease           71        1.5 TRUE       TRUE      
##  8 Ease          116        2.5 TRUE       FALSE     
##  9 Ease          126        3.5 TRUE       FALSE     
## 10 Ease          138        1   TRUE       TRUE      
## # ℹ 15 more rows
# Removing flagged respondents entirely (all six of their motivator rows)
# keeps the blocks complete for the Friedman test below.
mydata_H5_long <- mydata_H5_long %>%
  filter(!ID %in% c(5,70,138,145,153,71,116,126,146,21,63,79,103,27,42,152))
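
The ID vector above was presumably assembled from the full outlier table (only 10 of its 25 rows are printed). Run before the filter, the same list can be derived programmatically (a sketch):

# Collect the IDs flagged by identify_outliers() (run this before filtering)
out_ids <- mydata_H5_long %>%
  group_by(Motivator) %>%
  identify_outliers(Importance) %>%
  pull(ID) %>%
  unique()
# then: filter(mydata_H5_long, !ID %in% out_ids)
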
library(rstatix)

# Checking normality with Shapiro-Wilk test
mydata_H5_long %>%
  group_by(Motivator) %>%
  shapiro_test(Importance)
## # A tibble: 6 × 4
##   Motivator   variable   statistic             p
##   <fct>       <chr>          <dbl>         <dbl>
## 1 Convenience Importance     0.876 0.00000000216
## 2 Ease        Importance     0.903 0.0000000482 
## 3 Privacy     Importance     0.951 0.0000817    
## 4 Safety      Importance     0.929 0.00000198   
## 5 Speed       Importance     0.903 0.0000000488 
## 6 Tracking    Importance     0.881 0.00000000358

Normality is violated for all 6 motivators (p < 0.001), so we use the Friedman ANOVA.

library(rstatix)

# Descriptive statistics for each variable.
mydata_H5_long %>%
  group_by(Motivator) %>%
  get_summary_stats(Importance, type = "common")
## # A tibble: 6 × 11
##   Motivator   variable       n   min   max median   iqr  mean    sd    se    ci
##   <fct>       <fct>      <dbl> <dbl> <dbl>  <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 Convenience Importance   139     4     7      6  1     6.08 0.833 0.071 0.14 
## 2 Ease        Importance   139     4     7      6  1.25  6.11 0.737 0.062 0.124
## 3 Privacy     Importance   139     1     7      4  2.5   4.31 1.50  0.127 0.251
## 4 Safety      Importance   139     4     7      6  1     5.74 0.781 0.066 0.131
## 5 Speed       Importance   139     4     7      6  1     6.09 0.709 0.06  0.119
## 6 Tracking    Importance   139     3     7      6  1.5   5.73 1.10  0.093 0.185
  • H0: The distribution location of importance is the same for all 6 motivators
  • H1: The distribution location of importance differs for at least one motivator
library(rstatix)

# Friedman ANOVA.
FriedmanANOVA <- friedman_test(Importance ~ Motivator | ID, 
                               data = mydata_H5_long)

FriedmanANOVA # Summary of results.
## # A tibble: 1 × 6
##   .y.            n statistic    df        p method       
## * <chr>      <int>     <dbl> <dbl>    <dbl> <chr>        
## 1 Importance   139      194.     5 6.53e-40 Friedman test
library(effectsize)
effectsize::kendalls_w(Importance ~ Motivator | ID, 
                               data = mydata_H5_long)
## Warning: 139 block(s) contain ties, some containing only 1 unique ranking.
## Kendall's W |       95% CI
## --------------------------
## 0.28        | [0.23, 1.00]
## 
## - One-sided CIs: upper bound fixed at [1.00].
interpret_kendalls_w(0.28)
## [1] "fair agreement"
## (Rules: landis1977)
Post-hoc tests
library(rstatix)

# Wilcoxon signed rank tests - comparing all possible pairs.
paires_nonpar <- wilcox_test(Importance ~ Motivator,
                             paired = TRUE,
                             p.adjust.method = "bonferroni",
                             data = mydata_H5_long)

paires_nonpar
## # A tibble: 15 × 9
##    .y.        group1 group2    n1    n2 statistic        p    p.adj p.adj.signif
##  * <chr>      <chr>  <chr>  <int> <int>     <dbl>    <dbl>    <dbl> <chr>       
##  1 Importance Conve… Ease     139   139     1162  6.31e- 1 1   e+ 0 ns          
##  2 Importance Conve… Priva…   139   139     6738. 3.30e-19 4.95e-18 ****        
##  3 Importance Conve… Safety   139   139     4254. 1.59e- 5 2.38e- 4 ***         
##  4 Importance Conve… Speed    139   139     1838  7.45e- 1 1   e+ 0 ns          
##  5 Importance Conve… Track…   139   139     3730  4.58e- 4 7   e- 3 **          
##  6 Importance Ease   Priva…   139   139     6780  1.14e-19 1.71e-18 ****        
##  7 Importance Ease   Safety   139   139     3483  1.64e- 6 2.46e- 5 ****        
##  8 Importance Ease   Speed    139   139     1754. 6.52e- 1 1   e+ 0 ns          
##  9 Importance Ease   Track…   139   139     4183  2.9 e- 4 4   e- 3 **          
## 10 Importance Priva… Safety   139   139      222. 1.06e-17 1.59e-16 ****        
## 11 Importance Priva… Speed    139   139      142. 1.23e-19 1.84e-18 ****        
## 12 Importance Priva… Track…   139   139      612  4.16e-14 6.24e-13 ****        
## 13 Importance Safety Speed    139   139     1120. 4.20e- 6 6.30e- 5 ****        
## 14 Importance Safety Track…   139   139     2978. 9.55e- 1 1   e+ 0 ns          
## 15 Importance Speed  Track…   139   139     3750. 1.43e- 4 2   e- 3 **
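
Effect sizes for the pairwise comparisons above can be added with rstatix (a sketch; wilcox_effsize() reports the matched rank-biserial r for each pair):

# Paired rank-biserial effect sizes for all motivator pairs
mydata_H5_long %>%
  wilcox_effsize(Importance ~ Motivator, paired = TRUE)
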
library(rstatix)
comparisons <- paires_nonpar %>%
  add_y_position(fun = "median", step.increase = 0.35)

library(ggpubr)
ggboxplot(mydata_H5_long, x = "Motivator", y = "Importance", add = "point", ylim = c(0, 10)) +
  stat_pvalue_manual(comparisons, hide.ns = FALSE) +
  stat_summary(fun = median, geom = "point", shape = 16, size = 6,
               aes(group = Motivator), color = "blue",
               position = position_dodge(width = 0.8)) +
  stat_summary(fun = median, colour = "blue",
               position = position_dodge(width = 0.8),
               geom = "text", vjust = 0.5, hjust = -8,
               aes(label = round(after_stat(y), digits = 2), group = Motivator)) +
  labs(subtitle = get_test_label(FriedmanANOVA, detailed = TRUE),
       caption = get_pwc_label(comparisons))

Conclusion: The Friedman test shows that the distribution location of importance differs for at least one motivator at p < 0.001. Post-hoc tests revealed differences for every pair except convenience-ease, convenience-speed, ease-speed and safety-tracking. Convenience is significantly higher than privacy, safety and tracking and does not differ from ease and speed, so we accept our hypothesis that convenience is a primary motivator for the use of digital payments among premium customers.

H6: Ease of use (a simple user interface, Likert >5) is positively correlated with digital payment usage.

mydata_H6 <- mydata[,c("Q4b_1","Q4c_1","Q46")]
head(mydata_H6)
##   Q4b_1 Q4c_1 Q46
## 1     6     7  80
## 2     7     7  50
## 3     7     7  90
## 4     6     6  75
## 5     3     4  72
## 6     6     5  81
mydata_H6$Ease <- rowMeans(mydata_H6[,c("Q4b_1","Q4c_1")])
mydata_H6f <- mydata_H6[,c("Q46","Ease")]
colnames(mydata_H6f) <- c("Digital payment usage","Ease")
library(GGally)
ggpairs(mydata_H6f)

cor(mydata_H6f$`Digital payment usage`,mydata_H6f$Ease,
    method = "pearson") # Linear relationship between digital payment usage and trust is positive and semi strong
## [1] 0.3366213
  • H0: Pearson's ρ is equal to 0
  • H1: Pearson's ρ is not equal to 0
cor.test(mydata_H6f$`Digital payment usage`,mydata_H6f$Ease,
    method = "pearson")
## 
##  Pearson's product-moment correlation
## 
## data:  mydata_H6f$`Digital payment usage` and mydata_H6f$Ease
## t = 4.4218, df = 153, p-value = 1.847e-05
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
##  0.1890031 0.4693616
## sample estimates:
##       cor 
## 0.3366213

Conclusion: We reject H0 at p < 0.001; Pearson's ρ is not equal to 0, and the correlation between digital payment usage and ease of use is positive and moderate (r = 0.34): a simpler user interface is associated with greater use of digital payments.