# Load dataset from Excel file
library(readxl)
mydata <- read_xlsx("./REAL.xlsx")

# Display first few rows
head(mydata)
## # A tibble: 6 × 40
##      ID Q2a_1 Q2b_1 Q2c_1 Q3a_1 Q3b_1 Q3c_1 Q4a_1 Q4b_1 Q4c_1 Q5a_1 Q5b_1 Q5c_1
##   <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1     1     5     6     5     7     7     5     7     7     5     7     7     6
## 2     2     7     7     6     2     6     6     4     7     7     3     6     6
## 3     3     7     6     6     6     6     6     7     7     5     6     7     5
## 4     4     7     3     5     6     6     3     3     6     5     3     6     6
## 5     5     6     5     5     5     5     6     6     6     7     4     6     7
## 6     6     6     6     6     5     7     6     6     7     5     7     7     5
## # ℹ 27 more variables: Q6a_1 <dbl>, Q6b_1 <dbl>, Q6c_1 <dbl>, Q7a_1 <dbl>,
## #   Q7b_1 <dbl>, Q7c_1 <dbl>, Age <dbl>, Gender <dbl>, Location <dbl>,
## #   Education <dbl>, Job <dbl>, Bank <dbl>, EmplStatus <dbl>, EducFixed <dbl>,
## #   BankFixed <dbl>, EmplFixed <dbl>, JobFixed <dbl>, LocationFixed <dbl>,
## #   PreferCash <dbl>, KeepCash <dbl>, FindSeller <dbl>, SmallCash <dbl>,
## #   DigitalEasy <dbl>, ConvenientFlik <dbl>, FriendsFlik <dbl>,
## #   Hypothetical <dbl>, Income <dbl>
colnames(mydata)
##  [1] "ID"             "Q2a_1"          "Q2b_1"          "Q2c_1"         
##  [5] "Q3a_1"          "Q3b_1"          "Q3c_1"          "Q4a_1"         
##  [9] "Q4b_1"          "Q4c_1"          "Q5a_1"          "Q5b_1"         
## [13] "Q5c_1"          "Q6a_1"          "Q6b_1"          "Q6c_1"         
## [17] "Q7a_1"          "Q7b_1"          "Q7c_1"          "Age"           
## [21] "Gender"         "Location"       "Education"      "Job"           
## [25] "Bank"           "EmplStatus"     "EducFixed"      "BankFixed"     
## [29] "EmplFixed"      "JobFixed"       "LocationFixed"  "PreferCash"    
## [33] "KeepCash"       "FindSeller"     "SmallCash"      "DigitalEasy"   
## [37] "ConvenientFlik" "FriendsFlik"    "Hypothetical"   "Income"
mydata$GenderFactor <- factor(mydata$Gender,
                              levels = c(1, 2),
                              labels = c("Male", "Female"))

# Duplicate labels collapse levels: codes 1 and 2 both become "Urban"
mydata$LocationFactor <- factor(mydata$Location,
                                levels = c(1, 2, 3),
                                labels = c("Urban", "Urban", "Rural"))

mydata$EducationFactor <- factor(mydata$Education,
                                 levels = 0:6,
                                 labels = c("Unfinished elementary", "Finished elementary",
                                            "Vocational school", "General high school",
                                            "Undergraduate degree", "Master's degree", "PhD"))

# Codes 2-4 are collapsed into a single "Self-employed" level
mydata$EmplStatusFactor <- factor(mydata$EmplStatus,
                                  levels = c(1, 2, 3, 4),
                                  labels = c("Employed", "Self-employed", "Self-employed", "Self-employed"))

#mydata$JobFactor <- factor(mydata$Job,
#                           levels = c(1, 2, 3, 4, 5),
#                           labels = c("Manual", "Manual", "Office", "Office", "Office"))

mydata$EducFixed <- factor(mydata$EducFixed,
                           levels = c(0, 1),
                           labels = c("Up to high school", "Undergrad and more"))

# Codes 1-9 are collapsed into a single "Other banks" level
mydata$BankFixed <- factor(mydata$BankFixed,
                           levels = 0:9,
                           labels = c("NLB", rep("Other banks", 9)))

mydata$EmplFixed <- factor(mydata$EmplFixed,
                           levels = c(0, 1),
                           labels = c("Employed", "Others"))

mydata$JobFixed <- factor(mydata$JobFixed,
                          levels = c(0, 1),
                          labels = c("White Collar", "Blue Collar"))
#Saving standardized cluster variables into new data frame

mydata_clu_new <- as.data.frame(scale(mydata[c("Q2a_1", "Q3a_1", "Q4a_1", "Q5a_1", "Q6a_1", "Q7a_1")])) 
#Finding outliers

#Replacing missing standardized scores with 0 (i.e. the variable mean after standardization)
mydata_clu_new[is.na(mydata_clu_new)] <- 0

mydata$Dissimilarity <- sqrt(mydata_clu_new$Q2a_1^2 + mydata_clu_new$Q3a_1^2 + mydata_clu_new$Q4a_1^2 + mydata_clu_new$Q5a_1^2 + mydata_clu_new$Q6a_1^2 + mydata_clu_new$Q7a_1^2) 
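
The same dissimilarity can be written more compactly; this one-liner is equivalent:

#Equivalent form: Euclidean distance of each standardized row from the origin
mydata$Dissimilarity <- sqrt(rowSums(mydata_clu_new^2))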
#Finding units with highest value of dissimilarity

head(mydata[order(-mydata$Dissimilarity), c("ID", "Dissimilarity")]) 
## # A tibble: 6 × 2
##      ID Dissimilarity
##   <dbl>         <dbl>
## 1    40          4.64
## 2    14          4.49
## 3   120          4.12
## 4    34          3.92
## 5    71          3.89
## 6   138          3.81
#Showing units with IDs 14, 40, and 120

print(mydata[c(14,40,120), ])
## # A tibble: 3 × 45
##      ID Q2a_1 Q2b_1 Q2c_1 Q3a_1 Q3b_1 Q3c_1 Q4a_1 Q4b_1 Q4c_1 Q5a_1 Q5b_1 Q5c_1
##   <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1    14     7     7     5     3     6     7     6     6     7     3     6     7
## 2    40     5     3     3     2     7     7     2     7     7     2     7     7
## 3   120     1     6     5     6     6     7     7     5     6     6     6     6
## # ℹ 32 more variables: Q6a_1 <dbl>, Q6b_1 <dbl>, Q6c_1 <dbl>, Q7a_1 <dbl>,
## #   Q7b_1 <dbl>, Q7c_1 <dbl>, Age <dbl>, Gender <dbl>, Location <dbl>,
## #   Education <dbl>, Job <dbl>, Bank <dbl>, EmplStatus <dbl>, EducFixed <fct>,
## #   BankFixed <fct>, EmplFixed <fct>, JobFixed <fct>, LocationFixed <dbl>,
## #   PreferCash <dbl>, KeepCash <dbl>, FindSeller <dbl>, SmallCash <dbl>,
## #   DigitalEasy <dbl>, ConvenientFlik <dbl>, FriendsFlik <dbl>,
## #   Hypothetical <dbl>, Income <dbl>, GenderFactor <fct>, …
library(factoextra) 
## Warning: package 'factoextra' was built under R version 4.4.2
## Loading required package: ggplot2
## Welcome! Want to learn more? See two factoextra-related books at https://goo.gl/ve3WBa
#Computing Euclidean distances based on the 6 cluster variables and saving them into object Distances

Distances <- get_dist(mydata_clu_new, 
                      method = "euclidean")

#Showing matrix of distances

fviz_dist(Distances, 
          gradient = list(low = "slateblue4",
                          mid = "green",
                          high = "white"))

#Hopkins statistic

library(factoextra) 
get_clust_tendency(mydata_clu_new, 
                   n = nrow(mydata_clu_new) - 1,
                   graph = FALSE)
## $hopkins_stat
## [1] 0.6418886
## 
## $plot
## NULL

The Hopkins statistic (0.64) is above 0.5, so the data show a clustering tendency.
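
For intuition, the Hopkins statistic compares nearest-neighbour distances of uniform random points against those of sampled data points. A rough sketch of the idea (illustrative only, not the exact get_clust_tendency implementation):

#Rough sketch of the Hopkins statistic (illustrative only)
hopkins_sketch <- function(x, m = 10) {
  x <- as.matrix(x)
  rng <- apply(x, 2, range)                      #per-column min and max
  #m uniform random points inside the data's bounding box
  u <- sapply(seq_len(ncol(x)), function(j) runif(m, rng[1, j], rng[2, j]))
  #nearest-neighbour distance from each random point to the data
  u_d <- apply(u, 1, function(p) min(sqrt(colSums((t(x) - p)^2))))
  #nearest-neighbour distance from m sampled data points to the rest of the data
  samp <- x[sample(nrow(x), m), , drop = FALSE]
  w_d <- apply(samp, 1, function(p) {
    d <- sqrt(colSums((t(x) - p)^2))
    min(d[d > 0])                                #exclude the point itself
  })
  sum(u_d) / (sum(u_d) + sum(w_d))               #about 0.5 if random, near 1 if clustered
}
hopkins_sketch(mydata_clu_new)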

#Determining number of clusters for K-means clustering

library(factoextra)
library(NbClust)

fviz_nbclust(mydata_clu_new, kmeans, method = "wss") +
  labs(subtitle = "Elbow method")

#Determining number of clusters for K-means clustering

fviz_nbclust(mydata_clu_new, kmeans, method = "silhouette")+
  labs(subtitle = "Silhouette analysis")
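
For a single candidate k, the average silhouette width that fviz_nbclust() plots can also be computed directly with the cluster package (a sketch, assuming k = 5 and an arbitrary seed):

library(cluster)
set.seed(1)                                    #hypothetical seed for the sketch
km5 <- kmeans(mydata_clu_new, centers = 5, nstart = 25)
sil <- silhouette(km5$cluster, dist(mydata_clu_new))
mean(sil[, "sil_width"])                       #average silhouette width at k = 5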

library(dplyr)
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
library(factoextra)
WARD <- mydata_clu_new %>%
  get_dist(method = "euclidean") %>%  
  hclust(method = "ward.D2")          

WARD
## 
## Call:
## hclust(d = ., method = "ward.D2")
## 
## Cluster method   : ward.D2 
## Distance         : euclidean 
## Number of objects: 152
library(factoextra)
fviz_dend(WARD)
## Warning: The `<scale>` argument of `guides()` cannot be `FALSE`. Use "none" instead as
## of ggplot2 3.3.4.
## ℹ The deprecated feature was likely used in the factoextra package.
##   Please report the issue at <https://github.com/kassambara/factoextra/issues>.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.
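
As an aside (a sketch, not part of the original analysis), the Ward dendrogram can be cut into a chosen number of groups with cutree(), which is useful for cross-checking the hierarchical solution against k-means later:

#Cut the Ward tree into 5 groups (the number we settle on below)
ward_groups <- cutree(WARD, k = 5)
table(ward_groups)  #group sizes from the hierarchical solution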

library(NbClust)
NbClust(mydata_clu_new, 
        distance = "euclidean", 
        min.nc = 2, max.nc = 10,
        method = "kmeans", 
        index = "all")

## *** : The Hubert index is a graphical method of determining the number of clusters.
##                 In the plot of Hubert index, we seek a significant knee that corresponds to a 
##                 significant increase of the value of the measure i.e the significant peak in Hubert
##                 index second differences plot. 
## 

## *** : The D index is a graphical method of determining the number of clusters. 
##                 In the plot of D index, we seek a significant knee (the significant peak in Dindex
##                 second differences plot) that corresponds to a significant increase of the value of
##                 the measure. 
##  
## ******************************************************************* 
## * Among all indices:                                                
## * 8 proposed 2 as the best number of clusters 
## * 5 proposed 3 as the best number of clusters 
## * 3 proposed 5 as the best number of clusters 
## * 2 proposed 6 as the best number of clusters 
## * 2 proposed 9 as the best number of clusters 
## * 3 proposed 10 as the best number of clusters 
## 
##                    ***** Conclusion *****                            
##  
## * According to the majority rule, the best number of clusters is  2 
##  
##  
## *******************************************************************
## $All.index
##         KL      CH Hartigan     CCC    Scott      Marriot    TrCovW   TraceW
## 2  16.5370 59.5648  22.8451 -1.2199 190.7707 4.307472e+12 12926.431 648.4867
## 3   0.1354 45.4359  23.4358 -1.8840 367.1832 3.036401e+12  9223.823 562.7755
## 4   0.7123 42.5792  23.8440 -1.6591 474.6207 2.662356e+12  6927.557 486.2886
## 5   2.2046 42.7495  14.9309  0.1108 564.7277 2.299495e+12  4648.256 418.8142
## 6   2.5760 40.3828   9.9144  0.8422 663.8276 1.725228e+12  3722.664 380.1973
## 7   0.8891 37.3325   9.0992  0.7345 727.1115 1.548550e+12  3098.369 356.0210
## 8   0.4194 35.0637  11.8946  0.7330 782.5853 1.404137e+12  2947.080 334.9989
## 9   9.5096 34.4609   5.7515  1.5349 847.1365 1.162195e+12  2492.866 309.4387
## 10  0.0898 32.2757  11.9974  1.1503 877.7749 1.172881e+12  2235.064 297.4742
##    Friedman  Rubin Cindex     DB Silhouette   Duda Pseudot2   Beale Ratkowsky
## 2    3.2372 1.3971 0.4534 1.5833     0.2516 1.2250 -18.5509 -0.6976    0.3433
## 3    5.4346 1.6099 0.4247 1.7077     0.2047 1.5054 -24.1722 -1.2658    0.3359
## 4    6.8884 1.8631 0.4612 1.5760     0.2104 1.1178  -5.7980 -0.3969    0.3241
## 5    7.5885 2.1633 0.4244 1.3974     0.2328 1.2897 -11.6811 -0.8421    0.3275
## 6    9.3895 2.3830 0.3998 1.3793     0.2277 1.4435 -10.4458 -1.1283    0.3105
## 7   10.5507 2.5448 0.4490 1.4012     0.2230 1.0142  -0.5887 -0.0525    0.2941
## 8   11.6282 2.7045 0.4165 1.4359     0.2077 1.6961 -10.2607 -1.4862    0.2804
## 9   12.8642 2.9279 0.4229 1.4137     0.2152 1.1606  -2.4907 -0.4969    0.2704
## 10  13.4092 3.0456 0.4162 1.3749     0.2092 1.8949 -12.7511 -1.6655    0.2591
##        Ball Ptbiserial    Frey McClain   Dunn Hubert SDindex Dindex   SDbw
## 2  324.2434     0.4586  0.8104  0.7394 0.1312 0.0023  1.6254 1.9756 1.2828
## 3  187.5918     0.4468  0.1855  1.4472 0.0831 0.0026  1.6291 1.8218 1.0393
## 4  121.5722     0.4777  0.0745  1.9274 0.0977 0.0028  1.4802 1.6897 0.6214
## 5   83.7628     0.5135  0.2775  2.2588 0.1090 0.0032  1.3569 1.5742 0.5371
## 6   63.3662     0.5105  0.7706  2.6311 0.1078 0.0033  1.3411 1.5061 0.4869
## 7   50.8601     0.4811  1.1917  3.1800 0.1112 0.0034  1.5588 1.4607 0.4626
## 8   41.8749     0.4367  0.0850  4.0759 0.1051 0.0036  1.6493 1.4036 0.4293
## 9   34.3821     0.4416  0.4428  4.3256 0.1112 0.0038  1.5593 1.3558 0.3930
## 10  29.7474     0.4290 -0.1904  4.7313 0.1480 0.0039  1.5085 1.3214 0.3768
## 
## $All.CriticalValues
##    CritValue_Duda CritValue_PseudoT2 Fvalue_Beale
## 2          0.7006            43.1654            1
## 3          0.6533            38.2096            1
## 4          0.6459            30.1530            1
## 5          0.6222            31.5710            1
## 6          0.5356            29.4770            1
## 7          0.6188            25.8772            1
## 8          0.4889            26.1338            1
## 9          0.4643            20.7641            1
## 10         0.4174            37.6933            1
## 
## $Best.nc
##                     KL      CH Hartigan    CCC    Scott      Marriot   TrCovW
## Number_clusters  2.000  2.0000   5.0000 9.0000   3.0000            3    3.000
## Value_Index     16.537 59.5648   8.9131 1.5349 176.4125 897026528841 3702.608
##                  TraceW Friedman   Rubin Cindex      DB Silhouette  Duda
## Number_clusters  5.0000   3.0000  9.0000 6.0000 10.0000     2.0000 2.000
## Value_Index     28.8575   2.1974 -0.1056 0.3998  1.3749     0.2516 1.225
##                 PseudoT2   Beale Ratkowsky     Ball PtBiserial Frey McClain
## Number_clusters   2.0000  2.0000    2.0000   3.0000     5.0000    1  2.0000
## Value_Index     -18.5509 -0.6976    0.3433 136.6515     0.5135   NA  0.7394
##                   Dunn Hubert SDindex Dindex    SDbw
## Number_clusters 10.000      0  6.0000      0 10.0000
## Value_Index      0.148      0  1.3411      0  0.3768
## 
## $Best.partition
##   [1] 1 2 1 2 1 1 2 2 2 1 2 1 1 2 2 1 1 2 2 1 2 1 1 1 1 1 1 1 1 1 2 1 2 2 2 2 1
##  [38] 1 1 2 2 2 1 2 2 2 1 1 1 2 1 1 2 1 1 2 2 2 2 2 1 2 2 1 1 1 1 1 2 2 1 1 2 1
##  [75] 2 1 2 2 1 2 2 1 2 2 1 2 2 2 2 2 1 2 1 1 1 1 2 2 2 1 2 2 1 2 1 1 1 1 1 1 2
## [112] 1 2 1 2 1 1 1 1 1 1 1 2 1 1 2 2 1 2 2 2 1 1 2 1 1 2 2 1 2 2 1 2 1 2 2 2 2
## [149] 1 2 1 1

Although the majority rule favors two clusters, three indices (Hartigan, TraceW, Ptbiserial) propose five, so we proceed with five clusters.
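
One caveat before running it: kmeans() starts from random centers, so even with nstart = 25 the exact assignment can vary between runs. Fixing the RNG seed first makes the result reproducible (a sketch; the seed value is arbitrary):

set.seed(123)  #any fixed integer gives reproducible k-means runs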

Clustering <- kmeans(mydata_clu_new, 
                     centers = 5, #Number of groups
                     nstart = 25) #Number of random starts (initial center configurations)

Clustering
## K-means clustering with 5 clusters of sizes 30, 17, 24, 41, 40
## 
## Cluster means:
##         Q2a_1       Q3a_1      Q4a_1      Q5a_1      Q6a_1      Q7a_1
## 1  0.30294872 -1.08988634 -1.2737371 -1.1121377  0.2349868 -0.4492697
## 2 -2.04299453 -0.15283293  0.1288002 -0.2342229 -0.1969184 -0.1982072
## 3  0.46619735 -0.27363453 -0.4237942 -0.3527855 -1.7529088 -0.1005993
## 4  0.32831346  0.01992193  0.4436057  0.4646443  0.5814218 -0.6697631
## 5  0.02482143  1.02612950  0.7001434  0.6690589  0.3632381  1.1680571
## 
## Clustering vector:
##   [1] 5 1 4 1 5 4 3 1 3 4 1 4 4 3 1 5 4 1 4 4 1 4 5 5 4 5 5 5 5 4 1 5 3 1 2 3 5
##  [38] 4 4 3 2 1 4 1 1 2 5 4 4 3 2 5 3 5 5 1 3 2 2 4 5 2 3 5 5 5 4 5 4 3 5 5 3 5
##  [75] 3 4 1 1 2 4 1 4 1 2 4 4 1 2 1 1 2 3 4 5 5 3 1 4 1 4 4 1 5 4 5 5 5 5 5 4 3
## [112] 5 1 5 4 4 4 4 4 2 5 4 1 4 2 4 3 5 3 3 3 4 5 3 2 4 1 3 2 1 2 5 1 5 4 1 3 3
## [149] 2 1 5 5
## 
## Within cluster sum of squares by cluster:
## [1] 101.21820  62.11656  76.96477  92.61780  85.89687
##  (between_SS / total_SS =  53.8 %)
## 
## Available components:
## 
## [1] "cluster"      "centers"      "totss"        "withinss"     "tot.withinss"
## [6] "betweenss"    "size"         "iter"         "ifault"
library(factoextra)
fviz_cluster(Clustering, 
             palette = "Set1", 
             repel = FALSE,
             ggtheme = theme_bw(),
             data = mydata_clu_new)

In the cluster plot, units 7, 108, and 133 lie far from their cluster centers, so we remove them and re-run the clustering.

mydata <- mydata %>%
  filter(!ID %in% c(133, 108, 7))

mydata$ID <- seq(1, nrow(mydata))


mydata_clu_new <- as.data.frame(scale(mydata[c("Q2a_1", "Q3a_1", "Q4a_1", "Q5a_1", "Q6a_1", "Q7a_1")])) 
mydata_clu_new[is.na(mydata_clu_new)] <- 0

Clustering <- kmeans(mydata_clu_new, 
                     centers = 5, #Number of groups
                     nstart = 25) #Number of random starts (initial center configurations)

Clustering
## K-means clustering with 5 clusters of sizes 29, 41, 25, 37, 17
## 
## Cluster means:
##         Q2a_1        Q3a_1      Q4a_1      Q5a_1      Q6a_1      Q7a_1
## 1  0.32527292 -1.099435794 -1.2365294 -1.1969146  0.2499536 -0.4063642
## 2  0.34475122 -0.004020726  0.4124110  0.4922125  0.5570996 -0.7193229
## 3  0.46902278 -0.119537224 -0.4047331 -0.2429699 -1.7180355  0.0505640
## 4 -0.02546811  1.014802901  0.7180670  0.6648481  0.4418536  1.1696604
## 5 -2.02064493 -0.147693467  0.1470795 -0.2350188 -0.2051434 -0.1920432
## 
## Clustering vector:
##   [1] 4 1 2 1 4 2 1 3 2 1 2 2 3 1 4 2 1 2 4 1 2 4 4 2 4 4 4 4 2 1 4 3 1 5 3 4 2
##  [38] 2 3 5 1 2 1 1 5 4 2 2 3 5 4 3 3 4 1 3 5 5 2 4 5 3 4 4 4 2 4 2 3 4 4 3 4 3
##  [75] 2 1 1 5 2 1 2 1 5 2 2 1 5 1 1 5 3 2 4 4 3 1 2 1 2 2 1 4 2 4 4 4 4 2 3 4 1
## [112] 4 2 2 2 2 2 5 4 2 1 2 5 2 3 4 3 3 3 2 3 5 2 1 3 5 2 5 4 1 3 2 1 3 3 5 1 4
## [149] 4
## 
## Within cluster sum of squares by cluster:
## [1] 94.10655 99.94698 77.03574 78.50395 62.95983
##  (between_SS / total_SS =  53.5 %)
## 
## Available components:
## 
## [1] "cluster"      "centers"      "totss"        "withinss"     "tot.withinss"
## [6] "betweenss"    "size"         "iter"         "ifault"
library(factoextra)
fviz_cluster(Clustering, 
             palette = "Set1", 
             repel = FALSE,
             ggtheme = theme_bw(),
             data = mydata_clu_new)

#Average values of cluster variables to describe groups

Averages <- Clustering$centers
Averages 
##         Q2a_1        Q3a_1      Q4a_1      Q5a_1      Q6a_1      Q7a_1
## 1  0.32527292 -1.099435794 -1.2365294 -1.1969146  0.2499536 -0.4063642
## 2  0.34475122 -0.004020726  0.4124110  0.4922125  0.5570996 -0.7193229
## 3  0.46902278 -0.119537224 -0.4047331 -0.2429699 -1.7180355  0.0505640
## 4 -0.02546811  1.014802901  0.7180670  0.6648481  0.4418536  1.1696604
## 5 -2.02064493 -0.147693467  0.1470795 -0.2350188 -0.2051434 -0.1920432
Figure <- as.data.frame(Averages)
Figure$ID <- 1:nrow(Figure)

library(tidyr)
Figure <- pivot_longer(Figure, cols = c("Q2a_1", "Q3a_1", "Q4a_1", "Q5a_1", "Q6a_1", "Q7a_1"))

Figure$Group <- factor(Figure$ID, 
                       levels = c(1, 2, 3, 4, 5), 
                       labels = c("1", "2", "3", "4", "5"))

Figure$NameF <- factor(Figure$name, 
                       levels = c("Q2a_1", "Q3a_1", "Q4a_1", "Q5a_1", "Q6a_1", "Q7a_1"), 
                       labels = c("Cash_Safety", "Cash_Speed", "Cash_Ease of Use", "Cash_Convenience", "Cash_Privacy",  "Cash_Tracking Expenses"))

library(ggplot2)
ggplot(Figure, aes(x = NameF, y = value)) +
  geom_hline(yintercept = 0) +
  theme_bw() +
  geom_point(aes(shape = Group, col = Group), size = 5) +
  geom_line(aes(group = ID), linewidth = 1) +
  ylab("Averages") +
  xlab("Cluster variables")+
  ylim(-2.5, 2.5) +
  theme(axis.text.x = element_text(angle = 45, vjust = 0.50, size = 10))

#Saving where each unit belongs

mydata$Group <- Clustering$cluster
#Checking if clustering variables successfully differentiate between groups

fit <- aov(cbind(Q2a_1, Q3a_1, Q4a_1, Q5a_1, Q6a_1, Q7a_1) ~ as.factor(Group), 
           data = mydata)

summary(fit)
##  Response Q2a_1 :
##                   Df Sum Sq Mean Sq F value    Pr(>F)    
## as.factor(Group)   4 158.22  39.555  45.813 < 2.2e-16 ***
## Residuals        144 124.33   0.863                      
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
##  Response Q3a_1 :
##                   Df Sum Sq Mean Sq F value    Pr(>F)    
## as.factor(Group)   4 186.27  46.568  35.889 < 2.2e-16 ***
## Residuals        144 186.84   1.298                      
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
##  Response Q4a_1 :
##                   Df Sum Sq Mean Sq F value    Pr(>F)    
## as.factor(Group)   4 166.72  41.681  36.842 < 2.2e-16 ***
## Residuals        144 162.91   1.131                      
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
##  Response Q5a_1 :
##                   Df Sum Sq Mean Sq F value    Pr(>F)    
## as.factor(Group)   4 221.47  55.367  32.526 < 2.2e-16 ***
## Residuals        144 245.12   1.702                      
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
##  Response Q6a_1 :
##                   Df  Sum Sq Mean Sq F value    Pr(>F)    
## as.factor(Group)   4 102.047 25.5117   66.99 < 2.2e-16 ***
## Residuals        144  54.839  0.3808                      
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
##  Response Q7a_1 :
##                   Df Sum Sq Mean Sq F value    Pr(>F)    
## as.factor(Group)   4 296.74  74.185  39.376 < 2.2e-16 ***
## Residuals        144 271.30   1.884                      
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

The group means differ significantly for every clustering variable (all p < 0.001), so the clustering variables successfully differentiate the groups.
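
Since the clustering variables are 7-point Likert items, a nonparametric robustness check could be run alongside the ANOVA (a sketch):

#Kruskal-Wallis test per clustering variable (hypothetical robustness check)
vars <- c("Q2a_1", "Q3a_1", "Q4a_1", "Q5a_1", "Q6a_1", "Q7a_1")
sapply(vars, function(v) kruskal.test(mydata[[v]] ~ as.factor(mydata$Group))$p.value)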

Checking demographics to describe clusters

Age

#Additional variables

aggregate(mydata$Age, 
          by = list(mydata$Group), 
          FUN = mean)
##   Group.1        x
## 1       1 36.03448
## 2       2 35.48780
## 3       3 38.28000
## 4       4 45.78378
## 5       5 40.41176
#Checking normal distribution of variables

library(dplyr)
library(rstatix)
## Warning: package 'rstatix' was built under R version 4.4.2
## 
## Attaching package: 'rstatix'
## The following object is masked from 'package:stats':
## 
##     filter
mydata %>%
  group_by(as.factor(mydata$Group)) %>%
  shapiro_test(Age)
## # A tibble: 5 × 4
##   `as.factor(mydata$Group)` variable statistic        p
##   <fct>                     <chr>        <dbl>    <dbl>
## 1 1                         Age          0.890 0.00569 
## 2 2                         Age          0.913 0.00399 
## 3 3                         Age          0.871 0.00457 
## 4 4                         Age          0.879 0.000818
## 5 5                         Age          0.929 0.207
kruskal.test(Age ~ as.factor(Group), 
             data = mydata)
## 
##  Kruskal-Wallis rank sum test
## 
## data:  Age by as.factor(Group)
## Kruskal-Wallis chi-squared = 11.386, df = 4, p-value = 0.02256

Significant: age can be used to describe the clusters.
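
To see which pairs of clusters differ in age, a pairwise follow-up with a multiplicity correction could be added (a sketch):

#Pairwise Wilcoxon tests with Holm correction (follow-up to Kruskal-Wallis)
pairwise.wilcox.test(mydata$Age, as.factor(mydata$Group), p.adjust.method = "holm")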

Location

#Checking the association between the location and classification into 5 groups
chi_square <- chisq.test(mydata$LocationFactor, as.factor(mydata$Group))
## Warning in chisq.test(mydata$LocationFactor, as.factor(mydata$Group)):
## Chi-squared approximation may be incorrect
chi_square
## 
##  Pearson's Chi-squared test
## 
## data:  mydata$LocationFactor and as.factor(mydata$Group)
## X-squared = 5.2367, df = 4, p-value = 0.2639

Not significant, location can’t be used to describe clusters.

# Perform Fisher's Exact Test with simulated p-value
fisher_result <- fisher.test(table(mydata$LocationFactor, mydata$Group), simulate.p.value = TRUE, B = 10000)

# Display the result
fisher_result
## 
##  Fisher's Exact Test for Count Data with simulated p-value (based on
##  10000 replicates)
## 
## data:  table(mydata$LocationFactor, mydata$Group)
## p-value = 0.2848
## alternative hypothesis: two.sided

Gender

#Checking the association between the gender and classification into 5 groups
chi_square <- chisq.test(mydata$GenderFactor, as.factor(mydata$Group))
chi_square
## 
##  Pearson's Chi-squared test
## 
## data:  mydata$GenderFactor and as.factor(mydata$Group)
## X-squared = 3.0375, df = 4, p-value = 0.5516

Not significant, can’t be used to describe clusters.

#Checking the association between the education and classification into 5 groups
chi_square <- chisq.test(mydata$EducFixed, as.factor(mydata$Group), correct=TRUE)

chi_square
## 
##  Pearson's Chi-squared test
## 
## data:  mydata$EducFixed and as.factor(mydata$Group)
## X-squared = 4.5211, df = 4, p-value = 0.3401

Education (up to high school or undergraduate degree and more)

#Checking the association between the education and classification into 5 groups
chi_square <- chisq.test(mydata$EducFixed, as.factor(mydata$Group), correct=TRUE)
chi_square
## 
##  Pearson's Chi-squared test
## 
## data:  mydata$EducFixed and as.factor(mydata$Group)
## X-squared = 4.5211, df = 4, p-value = 0.3401
# Create a contingency table of EducationFactor and Group
table_data_edu <- table(mydata$EducFixed, mydata$Group)

# Print table to verify structure
print(table_data_edu)
##                     
##                       1  2  3  4  5
##   Up to high school   7 16  6 16  7
##   Undergrad and more 22 25 19 21 10
# Perform Fisher's Exact Test with simulation
fisher_test_edu <- fisher.test(table_data_edu, simulate.p.value = TRUE, B = 10000)  # B controls the number of simulations

# Print the results
print(fisher_test_edu)
## 
##  Fisher's Exact Test for Count Data with simulated p-value (based on
##  10000 replicates)
## 
## data:  table_data_edu
## p-value = 0.3483
## alternative hypothesis: two.sided
# Perform Fisher's Exact Test with simulation
fisher_result <- fisher.test(table(mydata$EducFixed, mydata$Group), simulate.p.value = TRUE, B = 10000)

# Display the result
fisher_result
## 
##  Fisher's Exact Test for Count Data with simulated p-value (based on
##  10000 replicates)
## 
## data:  table(mydata$EducFixed, mydata$Group)
## p-value = 0.3372
## alternative hypothesis: two.sided

Not significant, can't be used to describe clusters.

Employment status (employed or self-employed)

#Checking the association between the employment status and classification into 5 groups
chi_square <- chisq.test(mydata$EmplStatusFactor, as.factor(mydata$Group), correct=TRUE)
## Warning in chisq.test(mydata$EmplStatusFactor, as.factor(mydata$Group), :
## Chi-squared approximation may be incorrect
chi_square
## 
##  Pearson's Chi-squared test
## 
## data:  mydata$EmplStatusFactor and as.factor(mydata$Group)
## X-squared = 16.828, df = 4, p-value = 0.002087
# Create a contingency table of Employment Status Factor and Group
table_data_emp <- table(mydata$EmplStatusFactor, mydata$Group)

# Print table to verify structure
print(table_data_emp)
##                
##                  1  2  3  4  5
##   Employed      25 37 19 20 14
##   Self-employed  4  4  6 17  3
# Perform Fisher's Exact Test with simulation
fisher_test_emp <- fisher.test(table_data_emp, simulate.p.value = TRUE, B = 10000)  # B controls the number of simulations

# Print the results
print(fisher_test_emp)
## 
##  Fisher's Exact Test for Count Data with simulated p-value (based on
##  10000 replicates)
## 
## data:  table_data_emp
## p-value = 0.0031
## alternative hypothesis: two.sided

The association is significant, but the number of self-employed respondents in each cluster is small, so this descriptor should be used with caution.
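
The sparse self-employed cells are what trigger the chi-squared approximation warning; the expected counts make this visible (a sketch):

#Expected cell counts under independence; values below ~5 make the
#chi-squared approximation unreliable
suppressWarnings(
  round(chisq.test(table(mydata$EmplStatusFactor, mydata$Group))$expected, 1)
)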

Bank (NLB or other)

#Checking the association between the bank and classification into 5 groups
chi_square <- chisq.test(mydata$BankFixed, as.factor(mydata$Group), correct=TRUE)
chi_square
## 
##  Pearson's Chi-squared test
## 
## data:  mydata$BankFixed and as.factor(mydata$Group)
## X-squared = 6.6039, df = 4, p-value = 0.1584

Not significant. Can’t be used to describe clusters.

Job (White Collar or Blue Collar)

#Checking the association between the job and classification into 5 groups
chi_square <- chisq.test(mydata$JobFixed, as.factor(mydata$Group), correct=TRUE)
chi_square
## 
##  Pearson's Chi-squared test
## 
## data:  mydata$JobFixed and as.factor(mydata$Group)
## X-squared = 12.681, df = 4, p-value = 0.01295
# Create a contingency table of JobFixed and Group
table_data_job <- table(mydata$JobFixed, mydata$Group)

# Print table to verify structure
print(table_data_job)
##               
##                 1  2  3  4  5
##   White Collar 17 22 20 32 11
##   Blue Collar  12 19  5  5  6

This can be used to describe clusters.
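
Standardized residuals show which cells drive the significant association (a sketch):

#Cells with |residual| > 2 deviate notably from independence
round(chisq.test(table_data_job)$stdres, 2)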

Location (Urban or Rural)

#Checking the association between the location and classification into 5 groups
chi_square <- chisq.test(mydata$LocationFactor, as.factor(mydata$Group), correct=TRUE)
## Warning in chisq.test(mydata$LocationFactor, as.factor(mydata$Group), correct =
## TRUE): Chi-squared approximation may be incorrect
chi_square
## 
##  Pearson's Chi-squared test
## 
## data:  mydata$LocationFactor and as.factor(mydata$Group)
## X-squared = 5.2367, df = 4, p-value = 0.2639
# Create a contingency table of Location and Group
table_data_job <- table(mydata$LocationFactor, mydata$Group)

# Print table to verify structure
print(table_data_job)
##        
##          1  2  3  4  5
##   Urban 23 35 22 25 13
##   Rural  6  6  3 12  4

Not significant, can’t be used to describe clusters.

Overall, from the demographics, we can use the following to describe the clusters:

  • Age
  • Job (white collar or blue collar)

Now that the demographics have been tested, we’ll test the questions on the Likert scale.

"I prefer to keep cash on hand, because it is not always possible to pay with other means of payment." (PreferCash)

aggregate(mydata$PreferCash, 
          by = list(mydata$Group), 
          FUN = mean)
##   Group.1        x
## 1       1 1.862069
## 2       2 3.731707
## 3       3 3.560000
## 4       4 4.351351
## 5       5 3.117647
#Checking normal distribution of variables

library(dplyr)
library(rstatix)
mydata %>%
  group_by(as.factor(mydata$Group)) %>%
  shapiro_test(PreferCash)
## # A tibble: 5 × 4
##   `as.factor(mydata$Group)` variable   statistic          p
##   <fct>                     <chr>          <dbl>      <dbl>
## 1 1                         PreferCash     0.684 0.00000124
## 2 2                         PreferCash     0.890 0.000864  
## 3 3                         PreferCash     0.890 0.0111    
## 4 4                         PreferCash     0.865 0.000359  
## 5 5                         PreferCash     0.877 0.0281
kruskal.test(PreferCash ~ as.factor(Group), 
             data = mydata)
## 
##  Kruskal-Wallis rank sum test
## 
## data:  PreferCash by as.factor(Group)
## Kruskal-Wallis chi-squared = 28.631, df = 4, p-value = 9.29e-06

There are differences among groups, this can be used to describe clusters.

library(ggplot2)
library(dplyr)

# Define the labels for Likert scale categories
likert_labels <- c(
  "1" = "Strongly Disagree",
  "2" = "Disagree",
  "3" = "Somewhat Disagree",
  "4" = "Neutral",
  "5" = "Somewhat Agree",
  "6" = "Agree",
  "7" = "Strongly Agree"
)

# Create frequency table
table_clusters <- table(mydata$Group, mydata$PreferCash)

# Convert to proportions
prop_table_clusters <- prop.table(table_clusters, margin = 1)

# Convert to a dataframe
prop_df <- as.data.frame(as.table(prop_table_clusters))

# Convert Var2 to a factor with meaningful Likert labels
prop_df$Var2 <- factor(prop_df$Var2, levels = names(likert_labels), labels = likert_labels)

# Plot with updated labels
ggplot(prop_df, aes(x = Var1, y = Freq * 100, fill = Var2)) +  # Convert Freq to percentages
  geom_bar(stat = "identity", position = "stack") +
  labs(
    x = "Group", 
    y = "Percentage (%)", 
    fill = "Response",  # Changed from "Category" to "Response" for clarity
    title = "Percentage Distribution of Those Who Keep Cash"
  ) +
  theme_minimal()
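
The plotting block above is repeated verbatim for each of the seven items below. A small helper could replace the repetition (a sketch; plot_likert_by_group is a hypothetical name, not a package function, and it assumes likert_labels from above is in scope):

#Helper: stacked percentage bar chart of one Likert item by cluster
plot_likert_by_group <- function(data, var, title) {
  #Row-wise proportions of each response within each cluster
  prop_df <- as.data.frame(prop.table(table(data$Group, data[[var]]), margin = 1))
  prop_df$Var2 <- factor(prop_df$Var2, levels = names(likert_labels), labels = likert_labels)
  ggplot(prop_df, aes(x = Var1, y = Freq * 100, fill = Var2)) +
    geom_bar(stat = "identity", position = "stack") +
    labs(x = "Group", y = "Percentage (%)", fill = "Response", title = title) +
    theme_minimal()
}
#Example call: plot_likert_by_group(mydata, "KeepCash", "Percentage Distribution of Those Who Prefer Cash")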

"I prefer paying with cash over digital means of payment." (KeepCash)

aggregate(mydata$KeepCash, 
          by = list(mydata$Group), 
          FUN = mean)
##   Group.1        x
## 1       1 3.793103
## 2       2 4.731707
## 3       3 4.040000
## 4       4 4.351351
## 5       5 4.000000
#Checking normal distribution of variables

library(dplyr)
library(rstatix)
mydata %>%
  group_by(as.factor(mydata$Group)) %>%
  shapiro_test(KeepCash)
## # A tibble: 5 × 4
##   `as.factor(mydata$Group)` variable statistic        p
##   <fct>                     <chr>        <dbl>    <dbl>
## 1 1                         KeepCash     0.892 0.00638 
## 2 2                         KeepCash     0.877 0.000368
## 3 3                         KeepCash     0.855 0.00216 
## 4 4                         KeepCash     0.897 0.00250 
## 5 5                         KeepCash     0.843 0.00855
kruskal.test(KeepCash ~ as.factor(Group), 
             data = mydata)
## 
##  Kruskal-Wallis rank sum test
## 
## data:  KeepCash by as.factor(Group)
## Kruskal-Wallis chi-squared = 5.7094, df = 4, p-value = 0.2219

The Kruskal-Wallis test is not significant (p = 0.222), so there is no clear evidence of group differences for this item; we keep the plot for reference only.

library(ggplot2)
library(dplyr)

# Define the labels for Likert scale categories
likert_labels <- c(
  "1" = "Strongly Disagree",
  "2" = "Disagree",
  "3" = "Somewhat Disagree",
  "4" = "Neutral",
  "5" = "Somewhat Agree",
  "6" = "Agree",
  "7" = "Strongly Agree"
)

# Create frequency table
table_clusters <- table(mydata$Group, mydata$KeepCash)

# Convert to proportions
prop_table_clusters <- prop.table(table_clusters, margin = 1)

# Convert to a dataframe
prop_df <- as.data.frame(as.table(prop_table_clusters))

# Convert Var2 to a factor with meaningful Likert labels
prop_df$Var2 <- factor(prop_df$Var2, levels = names(likert_labels), labels = likert_labels)

# Plot with updated labels
ggplot(prop_df, aes(x = Var1, y = Freq * 100, fill = Var2)) +  # Convert Freq to percentages
  geom_bar(stat = "identity", position = "stack") +
  labs(
    x = "Group", 
    y = "Percentage (%)", 
    fill = "Response",  # Changed from "Category" to "Response" for clarity
    title = "Percentage Distribution of Those Who Prefer Cash"
  ) +
  theme_minimal()

"I can easily find vendors that accept digital payments for everyday purchases." (FindSeller)

aggregate(mydata$FindSeller, 
          by = list(mydata$Group), 
          FUN = mean)
##   Group.1        x
## 1       1 6.172414
## 2       2 6.000000
## 3       3 5.240000
## 4       4 5.054054
## 5       5 5.470588
#Checking normal distribution of variables

library(dplyr)
library(rstatix)
mydata %>%
  group_by(as.factor(mydata$Group)) %>%
  shapiro_test(FindSeller)
## # A tibble: 5 × 4
##   `as.factor(mydata$Group)` variable   statistic          p
##   <fct>                     <chr>          <dbl>      <dbl>
## 1 1                         FindSeller     0.701 0.00000216
## 2 2                         FindSeller     0.850 0.0000745 
## 3 3                         FindSeller     0.887 0.00989   
## 4 4                         FindSeller     0.887 0.00129   
## 5 5                         FindSeller     0.888 0.0434
kruskal.test(FindSeller ~ as.factor(Group), 
             data = mydata)
## 
##  Kruskal-Wallis rank sum test
## 
## data:  FindSeller by as.factor(Group)
## Kruskal-Wallis chi-squared = 15.429, df = 4, p-value = 0.00389

This can be used to describe clusters.

library(ggplot2)
library(dplyr)

# Define the labels for Likert scale categories
likert_labels <- c(
  "1" = "Strongly Disagree",
  "2" = "Disagree",
  "3" = "Somewhat Disagree",
  "4" = "Neutral",
  "5" = "Somewhat Agree",
  "6" = "Agree",
  "7" = "Strongly Agree"
)

# Create frequency table
table_clusters <- table(mydata$Group, mydata$FindSeller)

# Convert to proportions
prop_table_clusters <- prop.table(table_clusters, margin = 1)

# Convert to a dataframe
prop_df <- as.data.frame(as.table(prop_table_clusters))

# Convert Var2 to a factor with meaningful Likert labels
prop_df$Var2 <- factor(prop_df$Var2, levels = names(likert_labels), labels = likert_labels)

# Plot with updated labels
ggplot(prop_df, aes(x = Var1, y = Freq * 100, fill = Var2)) +  # Convert Freq to percentages
  geom_bar(stat = "identity", position = "stack") +
  labs(
    x = "Group", 
    y = "Percentage (%)", 
    fill = "Response",  # Changed from "Category" to "Response" for clarity
    title = "Percentage Distribution of Those Who Find Vendors Accepting Digital Payments"
  ) +
  theme_minimal()

"For low-value payments (up to €10), cash seems to me a faster way to pay than digital payments." (SmallCash)

aggregate(mydata$SmallCash, 
          by = list(mydata$Group), 
          FUN = mean)
##   Group.1        x
## 1       1 1.862069
## 2       2 2.902439
## 3       3 3.200000
## 4       4 4.351351
## 5       5 3.176471
#Checking normal distribution of variables

library(dplyr)
library(rstatix)
mydata %>%
  group_by(as.factor(mydata$Group)) %>%
  shapiro_test(SmallCash)
## # A tibble: 5 × 4
##   `as.factor(mydata$Group)` variable  statistic        p
##   <fct>                     <chr>         <dbl>    <dbl>
## 1 1                         SmallCash     0.810 0.000126
## 2 2                         SmallCash     0.879 0.000428
## 3 3                         SmallCash     0.894 0.0138  
## 4 4                         SmallCash     0.854 0.000197
## 5 5                         SmallCash     0.844 0.00885
kruskal.test(SmallCash ~ as.factor(Group), 
             data = mydata)
## 
##  Kruskal-Wallis rank sum test
## 
## data:  SmallCash by as.factor(Group)
## Kruskal-Wallis chi-squared = 22.6, df = 4, p-value = 0.0001522

This can be used to describe clusters.

library(ggplot2)
library(dplyr)

# Define the labels for Likert scale categories
likert_labels <- c(
  "1" = "Strongly Disagree",
  "2" = "Disagree",
  "3" = "Somewhat Disagree",
  "4" = "Neutral",
  "5" = "Somewhat Agree",
  "6" = "Agree",
  "7" = "Strongly Agree"
)

# Create frequency table
table_clusters <- table(mydata$Group, mydata$SmallCash)

# Convert to proportions
prop_table_clusters <- prop.table(table_clusters, margin = 1)

# Convert to a dataframe
prop_df <- as.data.frame(as.table(prop_table_clusters))

# Convert Var2 to a factor with meaningful Likert labels
prop_df$Var2 <- factor(prop_df$Var2, levels = names(likert_labels), labels = likert_labels)

# Plot with updated labels
ggplot(prop_df, aes(x = Var1, y = Freq * 100, fill = Var2)) +  # Convert Freq to percentages
  geom_bar(stat = "identity", position = "stack") +
  labs(
    x = "Group", 
    y = "Percentage (%)", 
    fill = "Response",  # Changed from "Category" to "Response" for clarity
    title = "Percentage Distribution of Those Who Prefer Cash For Payments up to 10€"
  ) +
  theme_minimal()

"Using digital payments for low-value transactions (up to €10) is convenient and easy, with minimal extra steps (e.g. entering a PIN, scanning a QR code)." (DigitalEasy)

aggregate(mydata$DigitalEasy, 
          by = list(mydata$Group), 
          FUN = mean)
##   Group.1        x
## 1       1 6.068966
## 2       2 5.878049
## 3       3 5.200000
## 4       4 4.432432
## 5       5 5.705882
#Checking normal distribution of variables

library(dplyr)
library(rstatix)
mydata %>%
  group_by(as.factor(mydata$Group)) %>%
  shapiro_test(DigitalEasy)
## # A tibble: 5 × 4
##   `as.factor(mydata$Group)` variable    statistic           p
##   <fct>                     <chr>           <dbl>       <dbl>
## 1 1                         DigitalEasy     0.668 0.000000744
## 2 2                         DigitalEasy     0.812 0.0000100  
## 3 3                         DigitalEasy     0.878 0.00640    
## 4 4                         DigitalEasy     0.901 0.00305    
## 5 5                         DigitalEasy     0.877 0.0288
kruskal.test(DigitalEasy ~ as.factor(Group), 
             data = mydata)
## 
##  Kruskal-Wallis rank sum test
## 
## data:  DigitalEasy by as.factor(Group)
## Kruskal-Wallis chi-squared = 20.426, df = 4, p-value = 0.0004114

This can be used to describe clusters.

library(ggplot2)
library(dplyr)

# Define the labels for Likert scale categories
likert_labels <- c(
  "1" = "Strongly Disagree",
  "2" = "Disagree",
  "3" = "Somewhat Disagree",
  "4" = "Neutral",
  "5" = "Somewhat Agree",
  "6" = "Agree",
  "7" = "Strongly Agree"
)

# Create frequency table
table_clusters <- table(mydata$Group, mydata$DigitalEasy)

# Convert to proportions
prop_table_clusters <- prop.table(table_clusters, margin = 1)

# Convert to a dataframe
prop_df <- as.data.frame(as.table(prop_table_clusters))

# Convert Var2 to a factor with meaningful Likert labels
prop_df$Var2 <- factor(prop_df$Var2, levels = names(likert_labels), labels = likert_labels)

# Plot with updated labels
ggplot(prop_df, aes(x = Var1, y = Freq * 100, fill = Var2)) +  # Convert Freq to percentages
  geom_bar(stat = "identity", position = "stack") +
  labs(
    x = "Group", 
    y = "Percentage (%)", 
    fill = "Response",  # Changed from "Category" to "Response" for clarity
    title = "Percentage Distribution of Those Who Prefer Digital For Payments up to 10€"
  ) +
  theme_minimal()

"I like using mobile payment platforms (such as Apple Pay, Revolut, PayPal, Flik) because they are convenient." (ConvenientFlik)

aggregate(mydata$ConvenientFlik, 
          by = list(mydata$Group), 
          FUN = mean)
##   Group.1        x
## 1       1 6.413793
## 2       2 5.292683
## 3       3 5.640000
## 4       4 4.081081
## 5       5 5.588235
#Checking normal distribution of variables

library(dplyr)
library(rstatix)
mydata %>%
  group_by(as.factor(mydata$Group)) %>%
  shapiro_test(ConvenientFlik)
## # A tibble: 5 × 4
##   `as.factor(mydata$Group)` variable       statistic            p
##   <fct>                     <chr>              <dbl>        <dbl>
## 1 1                         ConvenientFlik     0.528 0.0000000154
## 2 2                         ConvenientFlik     0.781 0.00000224  
## 3 3                         ConvenientFlik     0.777 0.0000966   
## 4 4                         ConvenientFlik     0.872 0.000532    
## 5 5                         ConvenientFlik     0.817 0.00351
kruskal.test(ConvenientFlik ~ as.factor(Group), 
             data = mydata)
## 
##  Kruskal-Wallis rank sum test
## 
## data:  ConvenientFlik by as.factor(Group)
## Kruskal-Wallis chi-squared = 24.896, df = 4, p-value = 5.28e-05

This can be used to describe clusters.

library(ggplot2)
library(dplyr)

# Define the labels for Likert scale categories
likert_labels <- c(
  "1" = "Strongly Disagree",
  "2" = "Disagree",
  "3" = "Somewhat Disagree",
  "4" = "Neutral",
  "5" = "Somewhat Agree",
  "6" = "Agree",
  "7" = "Strongly Agree"
)

# Create frequency table
table_clusters <- table(mydata$Group, mydata$ConvenientFlik)

# Convert to proportions
prop_table_clusters <- prop.table(table_clusters, margin = 1)

# Convert to a dataframe
prop_df <- as.data.frame(as.table(prop_table_clusters))

# Convert Var2 to a factor with meaningful Likert labels
prop_df$Var2 <- factor(prop_df$Var2, levels = names(likert_labels), labels = likert_labels)

# Plot with updated labels
ggplot(prop_df, aes(x = Var1, y = Freq * 100, fill = Var2)) +  # Convert Freq to percentages
  geom_bar(stat = "identity", position = "stack") +
  labs(
    x = "Group", 
    y = "Percentage (%)", 
    fill = "Response",  # Changed from "Category" to "Response" for clarity
    title = "Percentage Distribution of Those Who Use Mobile Payment Platforms due to Convenience"
  ) +
  theme_minimal()

"I am likely to use mobile payment platforms (such as Apple Pay, Revolut, PayPal, Flik) if friends, family, or peers recommend them to me." (FriendsFlik)

aggregate(mydata$FriendsFlik, 
          by = list(mydata$Group), 
          FUN = mean)
##   Group.1        x
## 1       1 5.413793
## 2       2 5.000000
## 3       3 5.040000
## 4       4 4.378378
## 5       5 5.235294
#Checking normal distribution of variables

library(dplyr)
library(rstatix)
mydata %>%
  group_by(as.factor(mydata$Group)) %>%
  shapiro_test(FriendsFlik)
## # A tibble: 5 × 4
##   `as.factor(mydata$Group)` variable    statistic         p
##   <fct>                     <chr>           <dbl>     <dbl>
## 1 1                         FriendsFlik     0.808 0.000117 
## 2 2                         FriendsFlik     0.846 0.0000602
## 3 3                         FriendsFlik     0.875 0.00558  
## 4 4                         FriendsFlik     0.898 0.00255  
## 5 5                         FriendsFlik     0.838 0.00697
kruskal.test(FriendsFlik ~ as.factor(Group), 
             data = mydata)
## 
##  Kruskal-Wallis rank sum test
## 
## data:  FriendsFlik by as.factor(Group)
## Kruskal-Wallis chi-squared = 7.617, df = 4, p-value = 0.1067

The Kruskal-Wallis p-value is 0.107, above the usual 5% threshold, so the evidence for group differences is weak; we use this item to describe clusters only with caution.

library(ggplot2)
library(dplyr)

# Define the labels for Likert scale categories
likert_labels <- c(
  "1" = "Strongly Disagree",
  "2" = "Disagree",
  "3" = "Somewhat Disagree",
  "4" = "Neutral",
  "5" = "Somewhat Agree",
  "6" = "Agree",
  "7" = "Strongly Agree"
)

# Create frequency table
table_clusters <- table(mydata$Group, mydata$FriendsFlik)

# Convert to proportions
prop_table_clusters <- prop.table(table_clusters, margin = 1)

# Convert to a dataframe
prop_df <- as.data.frame(as.table(prop_table_clusters))

# Convert Var2 to a factor with meaningful Likert labels
prop_df$Var2 <- factor(prop_df$Var2, levels = names(likert_labels), labels = likert_labels)

# Plot with updated labels
ggplot(prop_df, aes(x = Var1, y = Freq * 100, fill = Var2)) +  # Convert Freq to percentages
  geom_bar(stat = "identity", position = "stack") +
  labs(
    x = "Group", 
    y = "Percentage (%)", 
    fill = "Response",  # Changed from "Category" to "Response" for clarity
    title = "Percentage Distribution of Those Who Use Mobile Payment Platforms due to Recommendations"
  ) +
  theme_minimal()
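
Before writing up the cluster profiles, the usable descriptors can be collected into one table of group means (a sketch covering age and the significant Likert items):

#Group means of the descriptors retained above
aggregate(cbind(Age, PreferCash, FindSeller, SmallCash, DigitalEasy, ConvenientFlik) ~ Group,
          data = mydata, FUN = mean)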

Description of clusters (we’ll use more creative names in the report)

  • Cluster 1: STRONGLY DIGITAL The mean age in this group is 36.03. It contains a mix of white collar (17) and blue collar (12) workers. They strongly disagree with keeping and preferring cash, and are skeptical of cash usage even for small payments. They prefer digital payments and are in favor of mobile payment platforms. They are comfortable with non-cash transactions. This group consists of fairly young individuals who actively avoid cash and are strong proponents of digital payments.

  • Cluster 2: DIGITAL The mean age in this group is 35.49. It contains more white collar (22) than blue collar (19) workers. They are more neutral about keeping and preferring cash and lean toward digital payments. They moderately accept mobile payment platforms, and are less reliant on cash for small payments, but aren’t fully opposed to them. They are more open to various payment methods. This group is likely to transition toward digital payments but doesn’t completely reject cash.

  • Cluster 3: BALANCED The mean age in this group is 38.28. It mostly consists of white collar workers (20), and only 5 people have blue collar jobs. They have a neutral stance on keeping and preferring cash, and there's an even split between cash and digital preferences for small payments. They somewhat agree with mobile payment platforms. They can find vendors who accept digital payments but are still open to using cash. This group is balanced in its payment methods.

  • Cluster 4: STRONGLY CASH The mean age in this group is 45.78, which is the highest. It contains mostly white collar workers (32) and 5 blue collar workers. They strongly disagree with digital payments, prefer cash transactions and believe in keeping cash. They’re skeptical of digital payment platforms. They’re less likely to use mobile payment platforms because they’re convenient or because they’re recommended to them. This group consists of older, security-conscious individuals who are resistant to digital payments and prefer the reliability of cash.

  • Cluster 5: CASH The mean age in this group is 40.41, with mostly white collar workers (11) and a few blue collar workers (6). They have mixed opinions on cash vs. digital payments, but lean towards preferring cash for small payments. They have neutral or slightly positive feelings about mobile payment platforms. They're more flexible than cluster 4 but are still cash-reliant. This group consists of individuals who use cash frequently but are open to digital payments under certain conditions.

Conclusion:

  • Clusters 1 and 2: pay mostly digitally, especially cluster 1

  • Cluster 3: balanced between using cash and digital payments

  • Clusters 4 and 5: more cash-reliant, with cluster 4 being the most resistant to digital payments