Clustering

Introduction

The aim of clustering is to group homogeneous objects into classes. Using the results obtained from the questionnaire, we have differentiated our respondents into groups with the goal of personalizing recommendations and developing the most efficient marketing strategies.

We have performed statistical tests on the variables that could be used to describe our clusters. We have tested the following demographic variables:

  • Age

  • Gender

  • Location (urban or rural)

  • Education (up to high school, or undergraduate degree and more)

  • Employment status (employed or self-employed)

  • Bank (NLB or other banks)

  • Job (white-collar or blue-collar)

Additionally, we have tested the following variables, which the respondents evaluated on a Likert scale where 1 = strongly disagree, 2 = disagree, 3 = somewhat disagree, 4 = neutral, 5 = somewhat agree, 6 = agree, and 7 = strongly agree:

  • PreferCash: I prefer to carry cash because it is not always possible to pay with other means of payment.

  • KeepCash: I prefer to pay with cash rather than digital means of payment.

  • FindSeller: I can easily find providers that accept digital payments for everyday purchases.

  • SmallCash: For smaller payments (up to €10), I find cash a faster way to pay than digital payments.

  • DigitalEasy: Using digital payments for small-value transactions (up to €10) is convenient and easy, with minimal additional steps (e.g., PIN entry, QR code scanning).

  • ConvenientFlik: I like to use mobile payment platforms (such as Apple Pay, Revolut, PayPal, Flik) because they are convenient.

library(readxl)
mydata <- read_xlsx("./REAL.xlsx")


head(mydata)
## # A tibble: 6 × 40
##      ID Q2a_1 Q2b_1 Q2c_1 Q3a_1 Q3b_1 Q3c_1 Q4a_1 Q4b_1 Q4c_1 Q5a_1 Q5b_1 Q5c_1
##   <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1     1     5     6     5     7     7     5     7     7     5     7     7     6
## 2     2     7     7     6     2     6     6     4     7     7     3     6     6
## 3     3     7     6     6     6     6     6     7     7     5     6     7     5
## 4     4     7     3     5     6     6     3     3     6     5     3     6     6
## 5     5     6     5     5     5     5     6     6     6     7     4     6     7
## 6     6     6     6     6     5     7     6     6     7     5     7     7     5
## # ℹ 27 more variables: Q6a_1 <dbl>, Q6b_1 <dbl>, Q6c_1 <dbl>, Q7a_1 <dbl>,
## #   Q7b_1 <dbl>, Q7c_1 <dbl>, Age <dbl>, Gender <dbl>, Location <dbl>,
## #   Education <dbl>, Job <dbl>, Bank <dbl>, EmplStatus <dbl>, EducFixed <dbl>,
## #   BankFixed <dbl>, EmplFixed <dbl>, JobFixed <dbl>, LocationFixed <dbl>,
## #   PreferCash <dbl>, KeepCash <dbl>, FindSeller <dbl>, SmallCash <dbl>,
## #   DigitalEasy <dbl>, ConvenientFlik <dbl>, FriendsFlik <dbl>,
## #   Hypothetical <dbl>, Income <dbl>
colnames(mydata)
##  [1] "ID"             "Q2a_1"          "Q2b_1"          "Q2c_1"         
##  [5] "Q3a_1"          "Q3b_1"          "Q3c_1"          "Q4a_1"         
##  [9] "Q4b_1"          "Q4c_1"          "Q5a_1"          "Q5b_1"         
## [13] "Q5c_1"          "Q6a_1"          "Q6b_1"          "Q6c_1"         
## [17] "Q7a_1"          "Q7b_1"          "Q7c_1"          "Age"           
## [21] "Gender"         "Location"       "Education"      "Job"           
## [25] "Bank"           "EmplStatus"     "EducFixed"      "BankFixed"     
## [29] "EmplFixed"      "JobFixed"       "LocationFixed"  "PreferCash"    
## [33] "KeepCash"       "FindSeller"     "SmallCash"      "DigitalEasy"   
## [37] "ConvenientFlik" "FriendsFlik"    "Hypothetical"   "Income"

Making Factors

First, we have converted the categorical variables into factors, collapsing some categories into broader groups where appropriate.

mydata$GenderFactor <- factor(mydata$Gender, 
                             levels = c(1, 2), 
                             labels = c("Male", "Female"))

mydata$LocationFactor <- factor(mydata$Location, 
                             levels = c(1, 2, 3), 
                             labels = c("Urban", "Urban", "Rural"))

mydata$EducationFactor <- factor(mydata$Education,
                                 levels = c(0, 1, 2, 3, 4, 5, 6),
                                 labels = c("Unfinished elementary", "Finished elementary", "Vocational school", "General high school", "Undergraduate degree", "Master's degree", "PhD"))

mydata$EmplStatusFactor <- factor(mydata$EmplStatus,
                                   levels = c(1, 2, 3, 4),
                                   labels = c("Employed", "Self-employed", "Self-employed", "Self-employed"))

#mydata$JobFactor <- factor(mydata$Job,
                                   #levels = c(1, 2, 3, 4, 5),
                                   #labels = c("Manual", "Manual", "Office", "Office", "Office"))

mydata$EducFixed <- factor(mydata$EducFixed,
                                   levels = c(0, 1),
                                   labels = c("Up to high school", "Undergrad and more"))

mydata$BankFixed <- factor(mydata$BankFixed,
                                   levels = 0:9,
                                   labels = c("NLB", rep("Other banks", 9)))

mydata$EmplFixed <- factor(mydata$EmplFixed,
                                   levels = c(0, 1),
                                   labels = c("Employed", "Others"))

mydata$JobFixed <- factor(mydata$JobFixed,
                          levels = c(0, 1),
                          labels = c("White Collar", "Blue Collar"))

Standardizing the Variables

Now we will standardize variables that will be used for clustering.

The description of the variables (each measured on the 7-point Likert scale above):

  • Q2a_1: cash and safety

  • Q3a_1: cash and speed

  • Q4a_1: cash and ease of use

  • Q5a_1: cash and convenience

  • Q6a_1: cash and privacy

  • Q7a_1: cash and tracking expenses

mydata_clu_new <- as.data.frame(scale(mydata[c("Q2a_1", "Q3a_1", "Q4a_1", "Q5a_1", "Q6a_1", "Q7a_1")])) 
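scale() centers each variable at 0 and rescales it to standard deviation 1, so all six variables contribute equally to the distance computations. A minimal check (a sketch, assuming no constant columns; na.rm handles any missing answers):

round(colMeans(mydata_clu_new, na.rm = TRUE), 3)            # means should be 0
round(apply(mydata_clu_new, 2, sd, na.rm = TRUE), 3)        # SDs should be 1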

Let us find outliers. We first replace missing values in the standardized data with 0 (the standardized mean), and then compute each unit's dissimilarity as its Euclidean distance from the center of the standardized data.

mydata_clu_new[is.na(mydata_clu_new)] <- 0

mydata$Dissimilarity <- sqrt(mydata_clu_new$Q2a_1^2 + mydata_clu_new$Q3a_1^2 + mydata_clu_new$Q4a_1^2 + mydata_clu_new$Q5a_1^2 + mydata_clu_new$Q6a_1^2 + mydata_clu_new$Q7a_1^2) 
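The same quantity can be computed more compactly: because the variables are standardized, the dissimilarity is each unit's Euclidean distance from the origin of the standardized space, i.e., from the mean profile of all units. A sketch of the equivalent one-liner:

# Equivalent to summing the six squared z-scores explicitly
mydata$Dissimilarity <- sqrt(rowSums(mydata_clu_new^2))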

Let us find the units with the highest value of dissimilarity.

head(mydata[order(-mydata$Dissimilarity), c("ID", "Dissimilarity")]) 
## # A tibble: 6 × 2
##      ID Dissimilarity
##   <dbl>         <dbl>
## 1    40          4.64
## 2    14          4.49
## 3   120          4.12
## 4    34          3.92
## 5    71          3.89
## 6   138          3.81

Let us show the three units with the highest dissimilarity (IDs 14, 40, and 120; at this point, row positions coincide with IDs).

print(mydata[c(14,40,120), ])
## # A tibble: 3 × 45
##      ID Q2a_1 Q2b_1 Q2c_1 Q3a_1 Q3b_1 Q3c_1 Q4a_1 Q4b_1 Q4c_1 Q5a_1 Q5b_1 Q5c_1
##   <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1    14     7     7     5     3     6     7     6     6     7     3     6     7
## 2    40     5     3     3     2     7     7     2     7     7     2     7     7
## 3   120     1     6     5     6     6     7     7     5     6     6     6     6
## # ℹ 32 more variables: Q6a_1 <dbl>, Q6b_1 <dbl>, Q6c_1 <dbl>, Q7a_1 <dbl>,
## #   Q7b_1 <dbl>, Q7c_1 <dbl>, Age <dbl>, Gender <dbl>, Location <dbl>,
## #   Education <dbl>, Job <dbl>, Bank <dbl>, EmplStatus <dbl>, EducFixed <fct>,
## #   BankFixed <fct>, EmplFixed <fct>, JobFixed <fct>, LocationFixed <dbl>,
## #   PreferCash <dbl>, KeepCash <dbl>, FindSeller <dbl>, SmallCash <dbl>,
## #   DigitalEasy <dbl>, ConvenientFlik <dbl>, FriendsFlik <dbl>,
## #   Hypothetical <dbl>, Income <dbl>, GenderFactor <fct>, …

Euclidean Distance

Let us calculate the Euclidean distances and show the matrix of distances.

library(factoextra) 
## Warning: package 'factoextra' was built under R version 4.4.2
## Loading required package: ggplot2
## Welcome! Want to learn more? See two factoextra-related books at https://goo.gl/ve3WBa
Distances <- get_dist(mydata_clu_new, 
                      method = "euclidean")



fviz_dist(Distances, 
          gradient = list(low = "slateblue4",
                          mid = "green",
                          high = "white"))

Hopkins Statistic

Let us calculate the Hopkins statistic. Values close to 0.5 indicate uniformly random data, while values well above 0.5 (towards 1) indicate a clustering tendency in the data.

library(factoextra) 
get_clust_tendency(mydata_clu_new, 
                   n = nrow(mydata_clu_new) - 1,
                   graph = FALSE)
## $hopkins_stat
## [1] 0.6418886
## 
## $plot
## NULL

The Hopkins statistic (0.64) is well above 0.5, so the data shows a clustering tendency.
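As a reference point (a sketch, not part of the original output), the Hopkins statistic of uniformly random data with the same dimensions should land near 0.5:

# Simulate uniform random data over the observed range of each variable
set.seed(123)
random_data <- as.data.frame(apply(mydata_clu_new, 2,
                                   function(x) runif(length(x), min(x), max(x))))
get_clust_tendency(random_data, n = nrow(random_data) - 1,
                   graph = FALSE)$hopkins_stat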

K Means Approach

The graphs below, the elbow method and the silhouette analysis, are used to determine the optimal number of clusters for the K-means approach.

library(factoextra)
library(NbClust)

fviz_nbclust(mydata_clu_new, kmeans, method = "wss") +
  labs(subtitle = "Elbow method")

fviz_nbclust(mydata_clu_new, kmeans, method = "silhouette")+
  labs(subtitle = "Silhouette analysis")
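A third common criterion, not shown in the original output, is the gap statistic, which is also available through fviz_nbclust (nboot is lowered here to keep the bootstrap computation quick):

fviz_nbclust(mydata_clu_new, kmeans, method = "gap_stat", nboot = 50) +
  labs(subtitle = "Gap statistic")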

library(dplyr)
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
library(factoextra)
WARD <- mydata_clu_new %>%
  get_dist(method = "euclidean") %>%  
  hclust(method = "ward.D2")          

WARD
## 
## Call:
## hclust(d = ., method = "ward.D2")
## 
## Cluster method   : ward.D2 
## Distance         : euclidean 
## Number of objects: 152

Hierarchical Approach

Let us check the hierarchical approach to clustering, using Ward's method (ward.D2) on the Euclidean distances computed above. The dendrogram supports a five-cluster solution, which is what we have opted for.

library(factoextra)
fviz_dend(WARD)
## Warning: The `<scale>` argument of `guides()` cannot be `FALSE`. Use "none" instead as
## of ggplot2 3.3.4.
## ℹ The deprecated feature was likely used in the factoextra package.
##   Please report the issue at <https://github.com/kassambara/factoextra/issues>.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.
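To highlight the chosen solution on the dendrogram, the tree can be cut into five groups (a sketch; the original shows the uncut dendrogram):

# Draw rectangles around the five clusters and count their sizes
fviz_dend(WARD, k = 5, rect = TRUE)
table(cutree(WARD, k = 5))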

library(NbClust)
NbClust(mydata_clu_new, 
        distance = "euclidean", 
        min.nc = 2, max.nc = 10,
        method = "kmeans", 
        index = "all")

## *** : The Hubert index is a graphical method of determining the number of clusters.
##                 In the plot of Hubert index, we seek a significant knee that corresponds to a 
##                 significant increase of the value of the measure i.e the significant peak in Hubert
##                 index second differences plot. 
## 

## *** : The D index is a graphical method of determining the number of clusters. 
##                 In the plot of D index, we seek a significant knee (the significant peak in Dindex
##                 second differences plot) that corresponds to a significant increase of the value of
##                 the measure. 
##  
## ******************************************************************* 
## * Among all indices:                                                
## * 8 proposed 2 as the best number of clusters 
## * 5 proposed 3 as the best number of clusters 
## * 3 proposed 5 as the best number of clusters 
## * 2 proposed 6 as the best number of clusters 
## * 2 proposed 9 as the best number of clusters 
## * 3 proposed 10 as the best number of clusters 
## 
##                    ***** Conclusion *****                            
##  
## * According to the majority rule, the best number of clusters is  2 
##  
##  
## *******************************************************************
## $All.index
##         KL      CH Hartigan     CCC    Scott      Marriot    TrCovW   TraceW
## 2  16.5370 59.5648  22.8451 -1.2199 190.7707 4.307472e+12 12926.431 648.4867
## 3   0.1354 45.4359  23.4358 -1.8840 367.1832 3.036401e+12  9223.823 562.7755
## 4   0.7123 42.5792  23.8440 -1.6591 474.6207 2.662356e+12  6927.557 486.2886
## 5   2.2046 42.7495  14.9309  0.1108 564.7277 2.299495e+12  4648.256 418.8142
## 6   2.5760 40.3828   9.9144  0.8422 663.8276 1.725228e+12  3722.664 380.1973
## 7   0.8891 37.3325   9.0992  0.7345 727.1115 1.548550e+12  3098.369 356.0210
## 8   0.4194 35.0637  11.8946  0.7330 782.5853 1.404137e+12  2947.080 334.9989
## 9   9.5096 34.4609   5.7515  1.5349 847.1365 1.162195e+12  2492.866 309.4387
## 10  0.0898 32.2757  11.9974  1.1503 877.7749 1.172881e+12  2235.064 297.4742
##    Friedman  Rubin Cindex     DB Silhouette   Duda Pseudot2   Beale Ratkowsky
## 2    3.2372 1.3971 0.4534 1.5833     0.2516 1.2250 -18.5509 -0.6976    0.3433
## 3    5.4346 1.6099 0.4247 1.7077     0.2047 1.5054 -24.1722 -1.2658    0.3359
## 4    6.8884 1.8631 0.4612 1.5760     0.2104 1.1178  -5.7980 -0.3969    0.3241
## 5    7.5885 2.1633 0.4244 1.3974     0.2328 1.2897 -11.6811 -0.8421    0.3275
## 6    9.3895 2.3830 0.3998 1.3793     0.2277 1.4435 -10.4458 -1.1283    0.3105
## 7   10.5507 2.5448 0.4490 1.4012     0.2230 1.0142  -0.5887 -0.0525    0.2941
## 8   11.6282 2.7045 0.4165 1.4359     0.2077 1.6961 -10.2607 -1.4862    0.2804
## 9   12.8642 2.9279 0.4229 1.4137     0.2152 1.1606  -2.4907 -0.4969    0.2704
## 10  13.4092 3.0456 0.4162 1.3749     0.2092 1.8949 -12.7511 -1.6655    0.2591
##        Ball Ptbiserial    Frey McClain   Dunn Hubert SDindex Dindex   SDbw
## 2  324.2434     0.4586  0.8104  0.7394 0.1312 0.0023  1.6254 1.9756 1.2828
## 3  187.5918     0.4468  0.1855  1.4472 0.0831 0.0026  1.6291 1.8218 1.0393
## 4  121.5722     0.4777  0.0745  1.9274 0.0977 0.0028  1.4802 1.6897 0.6214
## 5   83.7628     0.5135  0.2775  2.2588 0.1090 0.0032  1.3569 1.5742 0.5371
## 6   63.3662     0.5105  0.7706  2.6311 0.1078 0.0033  1.3411 1.5061 0.4869
## 7   50.8601     0.4811  1.1917  3.1800 0.1112 0.0034  1.5588 1.4607 0.4626
## 8   41.8749     0.4367  0.0850  4.0759 0.1051 0.0036  1.6493 1.4036 0.4293
## 9   34.3821     0.4416  0.4428  4.3256 0.1112 0.0038  1.5593 1.3558 0.3930
## 10  29.7474     0.4290 -0.1904  4.7313 0.1480 0.0039  1.5085 1.3214 0.3768
## 
## $All.CriticalValues
##    CritValue_Duda CritValue_PseudoT2 Fvalue_Beale
## 2          0.7006            43.1654            1
## 3          0.6533            38.2096            1
## 4          0.6459            30.1530            1
## 5          0.6222            31.5710            1
## 6          0.5356            29.4770            1
## 7          0.6188            25.8772            1
## 8          0.4889            26.1338            1
## 9          0.4643            20.7641            1
## 10         0.4174            37.6933            1
## 
## $Best.nc
##                     KL      CH Hartigan    CCC    Scott      Marriot   TrCovW
## Number_clusters  2.000  2.0000   5.0000 9.0000   3.0000            3    3.000
## Value_Index     16.537 59.5648   8.9131 1.5349 176.4125 897026528841 3702.608
##                  TraceW Friedman   Rubin Cindex      DB Silhouette  Duda
## Number_clusters  5.0000   3.0000  9.0000 6.0000 10.0000     2.0000 2.000
## Value_Index     28.8575   2.1974 -0.1056 0.3998  1.3749     0.2516 1.225
##                 PseudoT2   Beale Ratkowsky     Ball PtBiserial Frey McClain
## Number_clusters   2.0000  2.0000    2.0000   3.0000     5.0000    1  2.0000
## Value_Index     -18.5509 -0.6976    0.3433 136.6515     0.5135   NA  0.7394
##                   Dunn Hubert SDindex Dindex    SDbw
## Number_clusters 10.000      0  6.0000      0 10.0000
## Value_Index      0.148      0  1.3411      0  0.3768
## 
## $Best.partition
##   [1] 1 2 1 2 1 1 2 2 2 1 2 1 1 2 2 1 1 2 2 1 2 1 1 1 1 1 1 1 1 1 2 1 2 2 2 2 1
##  [38] 1 1 2 2 2 1 2 2 2 1 1 1 2 1 1 2 1 1 2 2 2 2 2 1 2 2 1 1 1 1 1 2 2 1 1 2 1
##  [75] 2 1 2 2 1 2 2 1 2 2 1 2 2 2 2 2 1 2 1 1 1 1 2 2 2 1 2 2 1 2 1 1 1 1 1 1 2
## [112] 1 2 1 2 1 1 1 1 1 1 1 2 1 1 2 2 1 2 2 2 1 1 2 1 1 2 2 1 2 2 1 2 1 2 2 2 2
## [149] 1 2 1 1

Making Clusters

Although the majority rule in NbClust suggests two clusters, several indices propose five, which is also consistent with the dendrogram above. In the first step, we therefore make five clusters with 30, 17, 24, 41, and 40 units, respectively.

Clustering <- kmeans(mydata_clu_new, 
                     centers = 5, 
                     nstart = 25) 

Clustering
## K-means clustering with 5 clusters of sizes 30, 17, 24, 41, 40
## 
## Cluster means:
##         Q2a_1       Q3a_1      Q4a_1      Q5a_1      Q6a_1      Q7a_1
## 1  0.30294872 -1.08988634 -1.2737371 -1.1121377  0.2349868 -0.4492697
## 2 -2.04299453 -0.15283293  0.1288002 -0.2342229 -0.1969184 -0.1982072
## 3  0.46619735 -0.27363453 -0.4237942 -0.3527855 -1.7529088 -0.1005993
## 4  0.32831346  0.01992193  0.4436057  0.4646443  0.5814218 -0.6697631
## 5  0.02482143  1.02612950  0.7001434  0.6690589  0.3632381  1.1680571
## 
## Clustering vector:
##   [1] 5 1 4 1 5 4 3 1 3 4 1 4 4 3 1 5 4 1 4 4 1 4 5 5 4 5 5 5 5 4 1 5 3 1 2 3 5
##  [38] 4 4 3 2 1 4 1 1 2 5 4 4 3 2 5 3 5 5 1 3 2 2 4 5 2 3 5 5 5 4 5 4 3 5 5 3 5
##  [75] 3 4 1 1 2 4 1 4 1 2 4 4 1 2 1 1 2 3 4 5 5 3 1 4 1 4 4 1 5 4 5 5 5 5 5 4 3
## [112] 5 1 5 4 4 4 4 4 2 5 4 1 4 2 4 3 5 3 3 3 4 5 3 2 4 1 3 2 1 2 5 1 5 4 1 3 3
## [149] 2 1 5 5
## 
## Within cluster sum of squares by cluster:
## [1] 101.21820  62.11656  76.96477  92.61780  85.89687
##  (between_SS / total_SS =  53.8 %)
## 
## Available components:
## 
## [1] "cluster"      "centers"      "totss"        "withinss"     "tot.withinss"
## [6] "betweenss"    "size"         "iter"         "ifault"
library(factoextra)
fviz_cluster(Clustering, 
             palette = "Set1", 
             repel = FALSE,
             ggtheme = theme_bw(),
             data = mydata_clu_new)
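A note on reproducibility: kmeans() starts from random centers, and nstart = 25 keeps the best of 25 random starts. For an exactly reproducible partition, one can fix the RNG seed before the call (a sketch; Clustering_repro is a hypothetical name, and the resulting cluster labels may be permuted relative to the run above):

# Fixing the seed makes the random initialization, and thus the partition, repeatable
set.seed(42)
Clustering_repro <- kmeans(mydata_clu_new, centers = 5, nstart = 25)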

In the cluster plot, some units lie far from their cluster centers, which suggests that they are outliers, so we remove them (IDs 133, 108, and 7). After removing these units, we standardize the data again.

mydata <- mydata %>%
  filter(!ID %in% c(133, 108, 7))

# Re-number the IDs so they again run from 1 to the number of remaining units
mydata$ID <- seq(1, nrow(mydata))


mydata_clu_new <- as.data.frame(scale(mydata[c("Q2a_1", "Q3a_1", "Q4a_1", "Q5a_1", "Q6a_1", "Q7a_1")])) 

As we have removed three units, the sizes of our five clusters changed. They now have 29, 41, 25, 37, and 17 units, respectively.

mydata_clu_new[is.na(mydata_clu_new)] <- 0

Clustering <- kmeans(mydata_clu_new, 
                     centers = 5, 
                     nstart = 25) 

Clustering
## K-means clustering with 5 clusters of sizes 29, 41, 25, 37, 17
## 
## Cluster means:
##         Q2a_1        Q3a_1      Q4a_1      Q5a_1      Q6a_1      Q7a_1
## 1  0.32527292 -1.099435794 -1.2365294 -1.1969146  0.2499536 -0.4063642
## 2  0.34475122 -0.004020726  0.4124110  0.4922125  0.5570996 -0.7193229
## 3  0.46902278 -0.119537224 -0.4047331 -0.2429699 -1.7180355  0.0505640
## 4 -0.02546811  1.014802901  0.7180670  0.6648481  0.4418536  1.1696604
## 5 -2.02064493 -0.147693467  0.1470795 -0.2350188 -0.2051434 -0.1920432
## 
## Clustering vector:
##   [1] 4 1 2 1 4 2 1 3 2 1 2 2 3 1 4 2 1 2 4 1 2 4 4 2 4 4 4 4 2 1 4 3 1 5 3 4 2
##  [38] 2 3 5 1 2 1 1 5 4 2 2 3 5 4 3 3 4 1 3 5 5 2 4 5 3 4 4 4 2 4 2 3 4 4 3 4 3
##  [75] 2 1 1 5 2 1 2 1 5 2 2 1 5 1 1 5 3 2 4 4 3 1 2 1 2 2 1 4 2 4 4 4 4 2 3 4 1
## [112] 4 2 2 2 2 2 5 4 2 1 2 5 2 3 4 3 3 3 2 3 5 2 1 3 5 2 5 4 1 3 2 1 3 3 5 1 4
## [149] 4
## 
## Within cluster sum of squares by cluster:
## [1] 94.10655 99.94698 77.03574 78.50395 62.95983
##  (between_SS / total_SS =  53.5 %)
## 
## Available components:
## 
## [1] "cluster"      "centers"      "totss"        "withinss"     "tot.withinss"
## [6] "betweenss"    "size"         "iter"         "ifault"
library(factoextra)
fviz_cluster(Clustering, 
             palette = "Set1", 
             repel = FALSE,
             ggtheme = theme_bw(),
             data = mydata_clu_new)

Averages <- Clustering$centers
Averages 
##         Q2a_1        Q3a_1      Q4a_1      Q5a_1      Q6a_1      Q7a_1
## 1  0.32527292 -1.099435794 -1.2365294 -1.1969146  0.2499536 -0.4063642
## 2  0.34475122 -0.004020726  0.4124110  0.4922125  0.5570996 -0.7193229
## 3  0.46902278 -0.119537224 -0.4047331 -0.2429699 -1.7180355  0.0505640
## 4 -0.02546811  1.014802901  0.7180670  0.6648481  0.4418536  1.1696604
## 5 -2.02064493 -0.147693467  0.1470795 -0.2350188 -0.2051434 -0.1920432

Describing the Groups

The graph below shows how different clusters perceive cash in terms of safety, speed, ease of use, convenience, privacy, and tracking expenses.

Figure <- as.data.frame(Averages)
Figure$ID <- 1:nrow(Figure)

library(tidyr)
Figure <- pivot_longer(Figure, cols = c("Q2a_1", "Q3a_1", "Q4a_1", "Q5a_1", "Q6a_1", "Q7a_1"))

Figure$Group <- factor(Figure$ID, 
                       levels = c(1, 2, 3, 4, 5), 
                       labels = c("1", "2", "3", "4", "5"))

Figure$NameF <- factor(Figure$name, 
                       levels = c("Q2a_1", "Q3a_1", "Q4a_1", "Q5a_1", "Q6a_1", "Q7a_1"), 
                       labels = c("Cash_Safety", "Cash_Speed", "Cash_Ease of Use", "Cash_Convenience", "Cash_Privacy",  "Cash_Tracking Expenses"))

library(ggplot2)
ggplot(Figure, aes(x = NameF, y = value)) +
  geom_hline(yintercept = 0) +
  theme_bw() +
  geom_point(aes(shape = Group, col = Group), size = 5) +
  geom_line(aes(group = ID), linewidth = 1) +
  ylab("Averages") +
  xlab("Cluster variables")+
  ylim(-2.5, 2.5) +
  theme(axis.text.x = element_text(angle = 45, vjust = 0.50, size = 10))

mydata$Group <- Clustering$cluster

Checking the Fit

Below are ANOVA tests of the differences between group means for the cluster variables. As p < 0.05 for every variable, the H0 that the means are equal across groups can be rejected: the clusters differ significantly in the mean values of the cluster variables. Since these variables were used to form the clusters, significant differences are expected; the tests confirm that the solution separates the groups well.

fit <- aov(cbind(Q2a_1, Q3a_1, Q4a_1, Q5a_1, Q6a_1, Q7a_1) ~ as.factor(Group), 
           data = mydata)

summary(fit)
##  Response Q2a_1 :
##                   Df Sum Sq Mean Sq F value    Pr(>F)    
## as.factor(Group)   4 158.22  39.555  45.813 < 2.2e-16 ***
## Residuals        144 124.33   0.863                      
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
##  Response Q3a_1 :
##                   Df Sum Sq Mean Sq F value    Pr(>F)    
## as.factor(Group)   4 186.27  46.568  35.889 < 2.2e-16 ***
## Residuals        144 186.84   1.298                      
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
##  Response Q4a_1 :
##                   Df Sum Sq Mean Sq F value    Pr(>F)    
## as.factor(Group)   4 166.72  41.681  36.842 < 2.2e-16 ***
## Residuals        144 162.91   1.131                      
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
##  Response Q5a_1 :
##                   Df Sum Sq Mean Sq F value    Pr(>F)    
## as.factor(Group)   4 221.47  55.367  32.526 < 2.2e-16 ***
## Residuals        144 245.12   1.702                      
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
##  Response Q6a_1 :
##                   Df  Sum Sq Mean Sq F value    Pr(>F)    
## as.factor(Group)   4 102.047 25.5117   66.99 < 2.2e-16 ***
## Residuals        144  54.839  0.3808                      
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
##  Response Q7a_1 :
##                   Df Sum Sq Mean Sq F value    Pr(>F)    
## as.factor(Group)   4 296.74  74.185  39.376 < 2.2e-16 ***
## Residuals        144 271.30   1.884                      
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
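The ANOVA only tells us that at least one cluster mean differs. As a follow-up (a sketch, not part of the original analysis), Tukey's HSD on a single cluster variable shows which pairs of clusters differ:

# Pairwise comparisons of cluster means for one of the cluster variables
TukeyHSD(aov(Q2a_1 ~ as.factor(Group), data = mydata))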

Validation - Demographics

Now we will test which demographic variables can be used to describe our clusters.

Age

aggregate(mydata$Age, 
          by = list(mydata$Group), 
          FUN = mean)
##   Group.1        x
## 1       1 36.03448
## 2       2 35.48780
## 3       3 38.28000
## 4       4 45.78378
## 5       5 40.41176

The table above shows the average age for each of the clusters. The average age in group 1 is 36.03, and 40.41 in group 5.

Let us test the normal distribution of the variable.

  • H0: normality is met.

  • H1: normality is not met.

As the output below shows, normality is rejected (p < 0.05) for clusters 1 through 4, while it cannot be rejected for cluster 5. Thus, we turn to the non-parametric test.

library(dplyr)
library(rstatix)
## Warning: package 'rstatix' was built under R version 4.4.2
## 
## Attaching package: 'rstatix'
## The following object is masked from 'package:stats':
## 
##     filter
mydata %>%
  group_by(as.factor(mydata$Group)) %>%
  shapiro_test(Age)
## # A tibble: 5 × 4
##   `as.factor(mydata$Group)` variable statistic        p
##   <fct>                     <chr>        <dbl>    <dbl>
## 1 1                         Age          0.890 0.00569 
## 2 2                         Age          0.913 0.00399 
## 3 3                         Age          0.871 0.00457 
## 4 4                         Age          0.879 0.000818
## 5 5                         Age          0.929 0.207

The hypotheses for the non-parametric test are the following:

  • H0: all distribution locations of the variable are the same.

  • H1: at least one distribution location of the variable is different.

kruskal.test(Age ~ as.factor(Group), 
             data = mydata)
## 
##  Kruskal-Wallis rank sum test
## 
## data:  Age by as.factor(Group)
## Kruskal-Wallis chi-squared = 11.386, df = 4, p-value = 0.02256

As p < 0.05, H0 can be rejected. This means that age differs significantly between clusters and can be used to describe them.
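To see which cluster pairs drive the difference (a sketch, not part of the original analysis), pairwise Wilcoxon tests with a Benjamini-Hochberg correction can follow the Kruskal-Wallis test:

# Pairwise comparisons of age between clusters, adjusted for multiple testing
pairwise.wilcox.test(mydata$Age, mydata$Group, p.adjust.method = "BH")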

Gender (either male or female)

Let us perform the Pearson Chi-Squared test. The hypotheses are the following:

  • H0: there is no association between gender and classification of people into clusters.

  • H1: there is association between gender and classification of people into clusters.

chi_square <- chisq.test(mydata$GenderFactor, as.factor(mydata$Group))
chi_square
## 
##  Pearson's Chi-squared test
## 
## data:  mydata$GenderFactor and as.factor(mydata$Group)
## X-squared = 3.0375, df = 4, p-value = 0.5516

As p > 0.05, H0 cannot be rejected. Gender is statistically not significant.

table_data_gen <- table(mydata$GenderFactor, mydata$Group)


print(table_data_gen)
##         
##           1  2  3  4  5
##   Male   10 21 10 13  8
##   Female 19 20 15 24  9

The table above shows the number of males and females in each cluster. In cluster 1, for example, 10 people are male, and 19 are female.

Education (either up to high school or undergraduate degree and more)

Let us perform the Pearson Chi-Squared test. The hypotheses are the following:

  • H0: there is no association between education and classification of people into clusters.

  • H1: there is association between education and classification of people into clusters.

chi_square <- chisq.test(mydata$EducFixed, as.factor(mydata$Group), correct=TRUE)
chi_square
## 
##  Pearson's Chi-squared test
## 
## data:  mydata$EducFixed and as.factor(mydata$Group)
## X-squared = 4.5211, df = 4, p-value = 0.3401

As p > 0.05, H0 cannot be rejected. Education is statistically not significant.

table_data_edu <- table(mydata$EducFixed, mydata$Group)


print(table_data_edu)
##                     
##                       1  2  3  4  5
##   Up to high school   7 16  6 16  7
##   Undergrad and more 22 25 19 21 10

The table above shows the number of people whose highest completed education is high school or lower, and those who have obtained at least a bachelor’s degree. In cluster 1, for example, 7 people have at most a high-school education, and 22 have at least a bachelor’s degree.

Employment status (either employed or self-employed)

Let us perform the Pearson Chi-Squared test. The hypotheses are the following:

  • H0: there is no association between employment status and classification of people into clusters.

  • H1: there is association between employment status and classification of people into clusters.

chi_square <- chisq.test(mydata$EmplStatusFactor, as.factor(mydata$Group), correct=TRUE)
## Warning in chisq.test(mydata$EmplStatusFactor, as.factor(mydata$Group), :
## Chi-squared approximation may be incorrect
chi_square
## 
##  Pearson's Chi-squared test
## 
## data:  mydata$EmplStatusFactor and as.factor(mydata$Group)
## X-squared = 16.828, df = 4, p-value = 0.002087

As p < 0.05, H0 can be rejected.

table_data_emp <- table(mydata$EmplStatusFactor, mydata$Group)


print(table_data_emp)
##                
##                  1  2  3  4  5
##   Employed      25 37 19 20 14
##   Self-employed  4  4  6 17  3

The table above shows the number of people who are employed or self-employed in each of the clusters. In cluster 1, for example, 25 people are employed, and 4 self-employed.
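The warning arises because some expected cell counts are small; the chi-squared approximation becomes unreliable when expected counts fall below 5. The expected counts can be inspected directly:

# Expected counts under independence, stored on the chisq.test result
chi_square$expected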

As the Pearson Chi-Squared test approximation may be incorrect, we have performed Fisher’s exact test.

fisher_test_emp <- fisher.test(table_data_emp, simulate.p.value = TRUE, B = 10000)  


print(fisher_test_emp)
## 
##  Fisher's Exact Test for Count Data with simulated p-value (based on
##  10000 replicates)
## 
## data:  table_data_emp
## p-value = 0.0033
## alternative hypothesis: two.sided

Employment status is statistically significant and can be used to describe clusters.

Bank (either NLB or other)

Let us perform the Pearson Chi-Squared test. The hypotheses are the following:

  • H0: there is no association between bank and classification of people into clusters.

  • H1: there is association between bank and classification of people into clusters.

chi_square <- chisq.test(mydata$BankFixed, as.factor(mydata$Group), correct=TRUE)
chi_square
## 
##  Pearson's Chi-squared test
## 
## data:  mydata$BankFixed and as.factor(mydata$Group)
## X-squared = 6.6039, df = 4, p-value = 0.1584

As p > 0.05, H0 cannot be rejected. Bank is not statistically significant.

table_data_bank <- table(mydata$BankFixed, mydata$Group)


print(table_data_bank)
##              
##                1  2  3  4  5
##   NLB         11 15  5 14 10
##   Other banks 18 26 20 23  7

The table above shows the number of people who use NLB and those who use other banks. In cluster 1, for example, 11 people use NLB, and 18 use other banks.

Job (either white-collar or blue-collar)

Let us perform the Pearson Chi-Squared test. The hypotheses are the following:

  • H0: there is no association between job type and classification of people into clusters.

  • H1: there is association between job type and classification of people into clusters.

chi_square <- chisq.test(mydata$JobFixed, as.factor(mydata$Group), correct=TRUE)
chi_square
## 
##  Pearson's Chi-squared test
## 
## data:  mydata$JobFixed and as.factor(mydata$Group)
## X-squared = 12.681, df = 4, p-value = 0.01295

As p < 0.05, H0 can be rejected. Job is statistically significant and can be used to describe clusters.

table_data_job <- table(mydata$JobFixed, mydata$Group)


print(table_data_job)
##               
##                 1  2  3  4  5
##   White Collar 17 22 20 32 11
##   Blue Collar  12 19  5  5  6

The table above shows the number of white-collar and blue-collar workers in each of the clusters. In cluster 1, for example, 17 people have white-collar jobs, and 12 have blue-collar jobs.

Location (either urban or rural)

Let us perform the Pearson Chi-Squared test. The hypotheses are the following:

  • H0: there is no association between location and classification of people into clusters.

  • H1: there is association between location and classification of people into clusters.

chi_square <- chisq.test(mydata$LocationFactor, as.factor(mydata$Group), correct=TRUE)
## Warning in chisq.test(mydata$LocationFactor, as.factor(mydata$Group), correct =
## TRUE): Chi-squared approximation may be incorrect
chi_square
## 
##  Pearson's Chi-squared test
## 
## data:  mydata$LocationFactor and as.factor(mydata$Group)
## X-squared = 5.2367, df = 4, p-value = 0.2639

As p > 0.05, H0 cannot be rejected. Location is not statistically significant.

table_data_loc <- table(mydata$LocationFactor, mydata$Group)


print(table_data_loc)
##        
##          1  2  3  4  5
##   Urban 23 35 22 25 13
##   Rural  6  6  3 12  4

The table above shows the number of people who live in urban and rural areas in each of the clusters. In cluster 1, for example, 23 people live in urban areas, and 6 in rural.

As the Pearson Chi-Squared test approximation may be incorrect, we have performed Fisher’s exact test.

fisher_test_loc <- fisher.test(table_data_loc, simulate.p.value = TRUE, B = 10000)  


print(fisher_test_loc)
## 
##  Fisher's Exact Test for Count Data with simulated p-value (based on
##  10000 replicates)
## 
## data:  table_data_loc
## p-value = 0.2793
## alternative hypothesis: two.sided

Location is not statistically significant.

The following variables are statistically significant:

  • Age

  • Job (white- or blue-collar)

  • Employment status (employed or self-employed)

We have shown that the remaining demographic variables are not statistically significant, but we will still use them to describe the clusters.

Validation - Likert Scale

I prefer to carry cash because it is not always possible to pay with other means of payment. (orig. Raje imam pri sebi gotovino, ker ni vedno mogoče plačati z drugimi plačilnimi sredstvi.)

aggregate(mydata$PreferCash, 
          by = list(mydata$Group), 
          FUN = mean)
##   Group.1        x
## 1       1 1.862069
## 2       2 3.731707
## 3       3 3.560000
## 4       4 4.351351
## 5       5 3.117647

The table above shows the average answer to the statement “I prefer to carry cash because it is not always possible to pay with other means of payment”, where 1 means strongly disagree and 7 strongly agree.

Let us perform the Shapiro-Wilk normality test to check the normal distribution of the variable. Our hypotheses are the following:

  • H0: normality is met.

  • H1: normality is not met.

library(dplyr)
library(rstatix)
mydata %>%
  group_by(as.factor(mydata$Group)) %>%
  shapiro_test(PreferCash)
## # A tibble: 5 × 4
##   `as.factor(mydata$Group)` variable   statistic          p
##   <fct>                     <chr>          <dbl>      <dbl>
## 1 1                         PreferCash     0.684 0.00000124
## 2 2                         PreferCash     0.890 0.000864  
## 3 3                         PreferCash     0.890 0.0111    
## 4 4                         PreferCash     0.865 0.000359  
## 5 5                         PreferCash     0.877 0.0281

As p < 0.05, H0 can be rejected. Thus, we have to perform the non-parametric alternative. Our hypotheses are:

  • H0: all distribution locations of the variable are the same.

  • H1: at least one distribution location of the variable is different.

kruskal.test(PreferCash ~ as.factor(Group), 
             data = mydata)
## 
##  Kruskal-Wallis rank sum test
## 
## data:  PreferCash by as.factor(Group)
## Kruskal-Wallis chi-squared = 28.631, df = 4, p-value = 9.29e-06

As p < 0.05, H0 can be rejected. There are differences between clusters.

Below is the visual representation of the attitude towards the statement “I prefer to carry cash because it is not always possible to pay with other means of payment.”

library(ggplot2)
library(dplyr)


likert_labels <- c(
  "1" = "Strongly Disagree",
  "2" = "Disagree",
  "3" = "Somewhat Disagree",
  "4" = "Neutral",
  "5" = "Somewhat Agree",
  "6" = "Agree",
  "7" = "Strongly Agree"
)


table_clusters <- table(mydata$Group, mydata$PreferCash)


prop_table_clusters <- prop.table(table_clusters, margin = 1)


prop_df <- as.data.frame(as.table(prop_table_clusters))


prop_df$Var2 <- factor(prop_df$Var2, levels = names(likert_labels), labels = likert_labels)


ggplot(prop_df, aes(x = Var1, y = Freq * 100, fill = Var2)) +  
  geom_bar(stat = "identity", position = "stack") +
  labs(
    x = "Group", 
    y = "Percentage (%)", 
    fill = "Response",  
    title = "Percentage Distribution of Those Who Keep Cash"
  ) +
  theme_minimal()
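The same table-to-proportions-to-stacked-bar pipeline is repeated for each Likert item below. A small helper function (hypothetical, not part of the original analysis) would avoid the duplication; it assumes likert_labels as defined above:

# Stacked-bar chart of Likert responses per cluster for one questionnaire item
plot_likert_by_group <- function(data, item, title) {
  tab <- prop.table(table(data$Group, data[[item]]), margin = 1)
  df  <- as.data.frame(as.table(tab))
  df$Var2 <- factor(df$Var2, levels = names(likert_labels), labels = likert_labels)
  ggplot(df, aes(x = Var1, y = Freq * 100, fill = Var2)) +
    geom_bar(stat = "identity", position = "stack") +
    labs(x = "Group", y = "Percentage (%)", fill = "Response", title = title) +
    theme_minimal()
}

# Example call: plot_likert_by_group(mydata, "KeepCash", "Prefer to Pay with Cash")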

I prefer to pay with cash rather than digital means of payment. (orig. Raje plačujem z gotovino kot z digitalnimi plačilnimi sredstvi.)

aggregate(mydata$KeepCash, 
          by = list(mydata$Group), 
          FUN = mean)
##   Group.1        x
## 1       1 3.793103
## 2       2 4.731707
## 3       3 4.040000
## 4       4 4.351351
## 5       5 4.000000

The table above shows the average answer to the statement “I prefer to pay with cash rather than digital means of payment”, where 1 means strongly disagree and 7 strongly agree.

Let us perform the Shapiro-Wilk normality test to check the normal distribution of the variable. Our hypotheses are the following:

  • H0: normality is met.

  • H1: normality is not met.

library(dplyr)
library(rstatix)
mydata %>%
  group_by(as.factor(mydata$Group)) %>%
  shapiro_test(KeepCash)
## # A tibble: 5 × 4
##   `as.factor(mydata$Group)` variable statistic        p
##   <fct>                     <chr>        <dbl>    <dbl>
## 1 1                         KeepCash     0.892 0.00638 
## 2 2                         KeepCash     0.877 0.000368
## 3 3                         KeepCash     0.855 0.00216 
## 4 4                         KeepCash     0.897 0.00250 
## 5 5                         KeepCash     0.843 0.00855

As p < 0.05, H0 can be rejected. Thus, we have to perform the non-parametric alternative. Our hypotheses are:

  • H0: all distribution locations of the variable are the same.

  • H1: at least one distribution location of the variable is different.

kruskal.test(KeepCash ~ as.factor(Group), 
             data = mydata)
## 
##  Kruskal-Wallis rank sum test
## 
## data:  KeepCash by as.factor(Group)
## Kruskal-Wallis chi-squared = 5.7094, df = 4, p-value = 0.2219

As p > 0.05, H0 cannot be rejected. We cannot say that there are statistically significant differences between the groups.

Below is the visual representation of the attitude towards the statement “I prefer to pay with cash rather than digital means of payment.”

library(ggplot2)
library(dplyr)


likert_labels <- c(
  "1" = "Strongly Disagree",
  "2" = "Disagree",
  "3" = "Somewhat Disagree",
  "4" = "Neutral",
  "5" = "Somewhat Agree",
  "6" = "Agree",
  "7" = "Strongly Agree"
)


table_clusters <- table(mydata$Group, mydata$KeepCash)


prop_table_clusters <- prop.table(table_clusters, margin = 1)


prop_df <- as.data.frame(as.table(prop_table_clusters))


prop_df$Var2 <- factor(prop_df$Var2, levels = names(likert_labels), labels = likert_labels)


ggplot(prop_df, aes(x = Var1, y = Freq * 100, fill = Var2)) +  
  geom_bar(stat = "identity", position = "stack") +
  labs(
    x = "Group", 
    y = "Percentage (%)", 
    fill = "Response",  
    title = "Percentage Distribution of Those Who Prefer Cash"
  ) +
  theme_minimal()

I can easily find providers that accept digital payments for everyday purchases. (orig. Zlahka najdem ponudnike, ki sprejemajo digitalna plačila za vsakodnevne nakupe.)

aggregate(mydata$FindSeller, 
          by = list(mydata$Group), 
          FUN = mean)
##   Group.1        x
## 1       1 6.172414
## 2       2 6.000000
## 3       3 5.240000
## 4       4 5.054054
## 5       5 5.470588

The table above shows the average answer to the statement “I can easily find providers that accept digital payments for everyday purchases”, where 1 means strongly disagree and 7 strongly agree.

Let us perform the Shapiro-Wilk normality test to check the normal distribution of the variable. Our hypotheses are the following:

  • H0: normality is met.

  • H1: normality is not met.

library(dplyr)
library(rstatix)
mydata %>%
  group_by(as.factor(mydata$Group)) %>%
  shapiro_test(FindSeller)
## # A tibble: 5 × 4
##   `as.factor(mydata$Group)` variable   statistic          p
##   <fct>                     <chr>          <dbl>      <dbl>
## 1 1                         FindSeller     0.701 0.00000216
## 2 2                         FindSeller     0.850 0.0000745 
## 3 3                         FindSeller     0.887 0.00989   
## 4 4                         FindSeller     0.887 0.00129   
## 5 5                         FindSeller     0.888 0.0434

As p < 0.05, H0 can be rejected. Thus, we have to perform the non-parametric alternative. Our hypotheses are:

  • H0: all distribution locations of the variable are the same.

  • H1: at least one distribution location of the variable is different.

kruskal.test(FindSeller ~ as.factor(Group), 
             data = mydata)
## 
##  Kruskal-Wallis rank sum test
## 
## data:  FindSeller by as.factor(Group)
## Kruskal-Wallis chi-squared = 15.429, df = 4, p-value = 0.00389

As p < 0.05, H0 can be rejected. There are differences between clusters.

Below is the visual representation of the attitude towards the statement “I can easily find providers that accept digital payments for everyday purchases.”

library(ggplot2)
library(dplyr)


likert_labels <- c(
  "1" = "Strongly Disagree",
  "2" = "Disagree",
  "3" = "Somewhat Disagree",
  "4" = "Neutral",
  "5" = "Somewhat Agree",
  "6" = "Agree",
  "7" = "Strongly Agree"
)


table_clusters <- table(mydata$Group, mydata$FindSeller)


prop_table_clusters <- prop.table(table_clusters, margin = 1)


prop_df <- as.data.frame(as.table(prop_table_clusters))


prop_df$Var2 <- factor(prop_df$Var2, levels = names(likert_labels), labels = likert_labels)


ggplot(prop_df, aes(x = Var1, y = Freq * 100, fill = Var2)) +  
  geom_bar(stat = "identity", position = "stack") +
  labs(
    x = "Group", 
    y = "Percentage (%)", 
    fill = "Response",  
    title = "Percentage Distribution of Those Who Find Vendors Accepting Digital Payments"
  ) +
  theme_minimal()

For smaller payments (up to €10), I find cash a faster way to pay than digital payments. (orig. Pri plačilih manjše vrednosti (do 10 €) se mi zdi gotovina hitrejši način plačila od digitalnih plačil.)

aggregate(mydata$SmallCash, 
          by = list(mydata$Group), 
          FUN = mean)
##   Group.1        x
## 1       1 1.862069
## 2       2 2.902439
## 3       3 3.200000
## 4       4 4.351351
## 5       5 3.176471

The table above shows the average answer to the statement “For smaller payments (up to €10), I find cash a faster way to pay than digital payments”, where 1 means strongly disagree and 7 strongly agree.

Let us perform the Shapiro-Wilk normality test to check the normal distribution of the variable. Our hypotheses are the following:

  • H0: normality is met.

  • H1: normality is not met.

library(dplyr)
library(rstatix)
mydata %>%
  group_by(as.factor(mydata$Group)) %>%
  shapiro_test(SmallCash)
## # A tibble: 5 × 4
##   `as.factor(mydata$Group)` variable  statistic        p
##   <fct>                     <chr>         <dbl>    <dbl>
## 1 1                         SmallCash     0.810 0.000126
## 2 2                         SmallCash     0.879 0.000428
## 3 3                         SmallCash     0.894 0.0138  
## 4 4                         SmallCash     0.854 0.000197
## 5 5                         SmallCash     0.844 0.00885

As p < 0.05, H0 can be rejected. Thus, we have to perform the non-parametric alternative. Our hypotheses are:

  • H0: all distribution locations of the variable are the same.

  • H1: at least one distribution location of the variable is different.

kruskal.test(SmallCash ~ as.factor(Group), 
             data = mydata)
## 
##  Kruskal-Wallis rank sum test
## 
## data:  SmallCash by as.factor(Group)
## Kruskal-Wallis chi-squared = 22.6, df = 4, p-value = 0.0001522

As p < 0.05, H0 can be rejected. There are differences between clusters.

Below is the visual representation of the attitude towards the statement “For smaller payments (up to €10), I find cash a faster way to pay than digital payments.”

library(ggplot2)
library(dplyr)


likert_labels <- c(
  "1" = "Strongly Disagree",
  "2" = "Disagree",
  "3" = "Somewhat Disagree",
  "4" = "Neutral",
  "5" = "Somewhat Agree",
  "6" = "Agree",
  "7" = "Strongly Agree"
)


table_clusters <- table(mydata$Group, mydata$SmallCash)


prop_table_clusters <- prop.table(table_clusters, margin = 1)


prop_df <- as.data.frame(as.table(prop_table_clusters))


prop_df$Var2 <- factor(prop_df$Var2, levels = names(likert_labels), labels = likert_labels)


ggplot(prop_df, aes(x = Var1, y = Freq * 100, fill = Var2)) +  
  geom_bar(stat = "identity", position = "stack") +
  labs(
    x = "Group", 
    y = "Percentage (%)", 
    fill = "Response", 
    title = "Percentage Distribution of Those Who Prefer Cash For Payments up to 10€"
  ) +
  theme_minimal()

Using digital payments for small-value transactions (up to €10) is convenient and easy, with minimal additional steps (e.g., PIN entry, QR code scanning). (orig. Uporaba digitalnih plačil za transakcije z majhno vrednostjo (do 10 €) je priročna in enostavna, z minimalnimi dodatnimi koraki (npr. vnos PIN-a, skeniranje QR kode).)

aggregate(mydata$DigitalEasy, 
          by = list(mydata$Group), 
          FUN = mean)
##   Group.1        x
## 1       1 6.068966
## 2       2 5.878049
## 3       3 5.200000
## 4       4 4.432432
## 5       5 5.705882

The table above shows the average answer to the statement “Using digital payments for small-value transactions (up to €10) is convenient and easy, with minimal additional steps (e.g., PIN entry, QR code scanning)”, where 1 means strongly disagree and 7 strongly agree.

Let us perform the Shapiro-Wilk normality test to check the normal distribution of the variable. Our hypotheses are the following:

  • H0: normality is met.

  • H1: normality is not met.

library(dplyr)
library(rstatix)
mydata %>%
  group_by(as.factor(mydata$Group)) %>%
  shapiro_test(DigitalEasy)
## # A tibble: 5 × 4
##   `as.factor(mydata$Group)` variable    statistic           p
##   <fct>                     <chr>           <dbl>       <dbl>
## 1 1                         DigitalEasy     0.668 0.000000744
## 2 2                         DigitalEasy     0.812 0.0000100  
## 3 3                         DigitalEasy     0.878 0.00640    
## 4 4                         DigitalEasy     0.901 0.00305    
## 5 5                         DigitalEasy     0.877 0.0288

As p < 0.05, H0 can be rejected. Thus, we have to perform the non-parametric alternative. Our hypotheses are:

  • H0: all distribution locations of the variable are the same.

  • H1: at least one distribution location of the variable is different.

kruskal.test(DigitalEasy ~ as.factor(Group), 
             data = mydata)
## 
##  Kruskal-Wallis rank sum test
## 
## data:  DigitalEasy by as.factor(Group)
## Kruskal-Wallis chi-squared = 20.426, df = 4, p-value = 0.0004114

As p < 0.05, H0 can be rejected. There are differences between clusters.

Below is the visual representation of the attitude towards the statement “Using digital payments for small-value transactions (up to €10) is convenient and easy, with minimal additional steps (e.g., PIN entry, QR code scanning).”

library(ggplot2)
library(dplyr)


likert_labels <- c(
  "1" = "Strongly Disagree",
  "2" = "Disagree",
  "3" = "Somewhat Disagree",
  "4" = "Neutral",
  "5" = "Somewhat Agree",
  "6" = "Agree",
  "7" = "Strongly Agree"
)


table_clusters <- table(mydata$Group, mydata$DigitalEasy)


prop_table_clusters <- prop.table(table_clusters, margin = 1)


prop_df <- as.data.frame(as.table(prop_table_clusters))


prop_df$Var2 <- factor(prop_df$Var2, levels = names(likert_labels), labels = likert_labels)


ggplot(prop_df, aes(x = Var1, y = Freq * 100, fill = Var2)) +  
  geom_bar(stat = "identity", position = "stack") +
  labs(
    x = "Group", 
    y = "Percentage (%)", 
    fill = "Response",  
    title = "Percentage Distribution of Those Who Prefer Digital For Payments up to 10€"
  ) +
  theme_minimal()

I like to use mobile payment platforms (such as Apple Pay, Revolut, PayPal, Flik) because they are convenient. (orig. Rad/a uporabljam mobilne plačilne platforme (kot so Apple Pay, Revolut, PayPal, Flik), ker so priročne.)

aggregate(mydata$ConvenientFlik, 
          by = list(mydata$Group), 
          FUN = mean)
##   Group.1        x
## 1       1 6.413793
## 2       2 5.292683
## 3       3 5.640000
## 4       4 4.081081
## 5       5 5.588235

The table above shows the average answer to the statement “I like to use mobile payment platforms (such as Apple Pay, Revolut, PayPal, Flik) because they are convenient”, where 1 means strongly disagree and 7 strongly agree.

Let us perform the Shapiro-Wilk normality test to check the normal distribution of the variable. Our hypotheses are the following:

  • H0: normality is met.

  • H1: normality is not met.

library(dplyr)
library(rstatix)
mydata %>%
  group_by(as.factor(mydata$Group)) %>%
  shapiro_test(ConvenientFlik)
## # A tibble: 5 × 4
##   `as.factor(mydata$Group)` variable       statistic            p
##   <fct>                     <chr>              <dbl>        <dbl>
## 1 1                         ConvenientFlik     0.528 0.0000000154
## 2 2                         ConvenientFlik     0.781 0.00000224  
## 3 3                         ConvenientFlik     0.777 0.0000966   
## 4 4                         ConvenientFlik     0.872 0.000532    
## 5 5                         ConvenientFlik     0.817 0.00351

As p < 0.05, H0 can be rejected. Thus, we have to perform the non-parametric alternative. Our hypotheses are:

  • H0: all distribution locations of the variable are the same.

  • H1: at least one distribution location of the variable is different.

kruskal.test(ConvenientFlik ~ as.factor(Group), 
             data = mydata)
## 
##  Kruskal-Wallis rank sum test
## 
## data:  ConvenientFlik by as.factor(Group)
## Kruskal-Wallis chi-squared = 24.896, df = 4, p-value = 5.28e-05

As p < 0.05, H0 can be rejected. There are differences between clusters.

Below is the visual representation of the attitude towards the statement “I like to use mobile payment platforms (such as Apple Pay, Revolut, PayPal, Flik) because they are convenient.”

library(ggplot2)
library(dplyr)


likert_labels <- c(
  "1" = "Strongly Disagree",
  "2" = "Disagree",
  "3" = "Somewhat Disagree",
  "4" = "Neutral",
  "5" = "Somewhat Agree",
  "6" = "Agree",
  "7" = "Strongly Agree"
)


table_clusters <- table(mydata$Group, mydata$ConvenientFlik)


prop_table_clusters <- prop.table(table_clusters, margin = 1)


prop_df <- as.data.frame(as.table(prop_table_clusters))


prop_df$Var2 <- factor(prop_df$Var2, levels = names(likert_labels), labels = likert_labels)

# Plot with updated labels
ggplot(prop_df, aes(x = Var1, y = Freq * 100, fill = Var2)) + 
  geom_bar(stat = "identity", position = "stack") +
  labs(
    x = "Group", 
    y = "Percentage (%)", 
    fill = "Response",  
    title = "Percentage Distribution of Those Who Use Mobile Payment Platforms due to Convenience"
  ) +
  theme_minimal()