# Import the survey responses from the Excel workbook
library(readxl)
mydata <- read_xlsx(path = "./REAL.xlsx")
# Preview the leading rows of the imported tibble
head(x = mydata)
## # A tibble: 6 × 40
## ID Q2a_1 Q2b_1 Q2c_1 Q3a_1 Q3b_1 Q3c_1 Q4a_1 Q4b_1 Q4c_1 Q5a_1 Q5b_1 Q5c_1
## <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 1 5 6 5 7 7 5 7 7 5 7 7 6
## 2 2 7 7 6 2 6 6 4 7 7 3 6 6
## 3 3 7 6 6 6 6 6 7 7 5 6 7 5
## 4 4 7 3 5 6 6 3 3 6 5 3 6 6
## 5 5 6 5 5 5 5 6 6 6 7 4 6 7
## 6 6 6 6 6 5 7 6 6 7 5 7 7 5
## # ℹ 27 more variables: Q6a_1 <dbl>, Q6b_1 <dbl>, Q6c_1 <dbl>, Q7a_1 <dbl>,
## # Q7b_1 <dbl>, Q7c_1 <dbl>, Age <dbl>, Gender <dbl>, Location <dbl>,
## # Education <dbl>, Job <dbl>, Bank <dbl>, EmplStatus <dbl>, EducFixed <dbl>,
## # BankFixed <dbl>, EmplFixed <dbl>, JobFixed <dbl>, LocationFixed <dbl>,
## # PreferCash <dbl>, KeepCash <dbl>, FindSeller <dbl>, SmallCash <dbl>,
## # DigitalEasy <dbl>, ConvenientFlik <dbl>, FriendsFlik <dbl>,
## # Hypothetical <dbl>, Income <dbl>
# List every column name of the imported data
names(mydata)
## [1] "ID" "Q2a_1" "Q2b_1" "Q2c_1"
## [5] "Q3a_1" "Q3b_1" "Q3c_1" "Q4a_1"
## [9] "Q4b_1" "Q4c_1" "Q5a_1" "Q5b_1"
## [13] "Q5c_1" "Q6a_1" "Q6b_1" "Q6c_1"
## [17] "Q7a_1" "Q7b_1" "Q7c_1" "Age"
## [21] "Gender" "Location" "Education" "Job"
## [25] "Bank" "EmplStatus" "EducFixed" "BankFixed"
## [29] "EmplFixed" "JobFixed" "LocationFixed" "PreferCash"
## [33] "KeepCash" "FindSeller" "SmallCash" "DigitalEasy"
## [37] "ConvenientFlik" "FriendsFlik" "Hypothetical" "Income"
# Recode the numeric demographic codes into labelled factors.
# Where a label is repeated, factor() merges those codes into a single level.
mydata$GenderFactor <- factor(mydata$Gender,
                              levels = c(1, 2),
                              labels = c("Male", "Female"))
mydata$LocationFactor <- factor(mydata$Location,
                                levels = c(1, 2, 3),
                                labels = c("Urban", "Urban", "Rural"))
mydata$EducationFactor <- factor(mydata$Education,
                                 levels = c(0, 1, 2, 3, 4, 5, 6),
                                 labels = c("Unfinished elementary",
                                            "Finished elementary",
                                            "Vocational school",
                                            "General high school",
                                            "Undergraduate degree",
                                            "Master's degree",
                                            "PhD"))
# Codes 2-4 are all reported as "Self-employed"
mydata$EmplStatusFactor <- factor(mydata$EmplStatus,
                                  levels = c(1, 2, 3, 4),
                                  labels = c("Employed", rep("Self-employed", 3)))
#mydata$JobFactor <- factor (mydata$Job,
#levels = c(1, 2 ,3, 4, 5),
#labels = c("Manual", "Manual", "Office", "Office", "Office"))
# The *Fixed columns are replaced in place by their labelled versions
mydata$EducFixed <- factor(mydata$EducFixed,
                           levels = c(0, 1),
                           labels = c("Up to high school", "Undergrad and more"))
# Code 0 is NLB; codes 1-9 are pooled into "Other banks"
mydata$BankFixed <- factor(mydata$BankFixed,
                           levels = c(0, 1, 2, 3, 4, 5, 6, 7, 8, 9),
                           labels = c("NLB", rep("Other banks", 9)))
mydata$EmplFixed <- factor(mydata$EmplFixed,
                           levels = c(0, 1),
                           labels = c("Employed", "Others"))
mydata$JobFixed <- factor(mydata$JobFixed,
                          levels = c(0, 1),
                          labels = c("White Collar", "Blue Collar"))
# Standardize the six cluster variables and keep them in a separate data frame
mydata_clu_new <- as.data.frame(
  scale(mydata[, c("Q2a_1", "Q3a_1", "Q4a_1", "Q5a_1", "Q6a_1", "Q7a_1")])
)
# Outlier screening: missing standardized values are set to 0
mydata_clu_new[is.na(mydata_clu_new)] <- 0
# Dissimilarity = square root of the summed squared standardized values
mydata$Dissimilarity <- sqrt(
  Reduce(`+`, lapply(mydata_clu_new, function(z) z^2))
)
# Units with the largest dissimilarity first
head(mydata[order(-mydata$Dissimilarity), c("ID", "Dissimilarity")])
## # A tibble: 6 × 2
## ID Dissimilarity
## <dbl> <dbl>
## 1 40 4.64
## 2 14 4.49
## 3 120 4.12
## 4 34 3.92
## 5 71 3.89
## 6 138 3.81
# Show the units with IDs 14, 40 and 120.
# Rows are matched on the ID column rather than on row position, so the
# selection stays correct even if the rows are reordered or filtered.
print(mydata[mydata$ID %in% c(14, 40, 120), ])
## # A tibble: 3 × 45
## ID Q2a_1 Q2b_1 Q2c_1 Q3a_1 Q3b_1 Q3c_1 Q4a_1 Q4b_1 Q4c_1 Q5a_1 Q5b_1 Q5c_1
## <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 14 7 7 5 3 6 7 6 6 7 3 6 7
## 2 40 5 3 3 2 7 7 2 7 7 2 7 7
## 3 120 1 6 5 6 6 7 7 5 6 6 6 6
## # ℹ 32 more variables: Q6a_1 <dbl>, Q6b_1 <dbl>, Q6c_1 <dbl>, Q7a_1 <dbl>,
## # Q7b_1 <dbl>, Q7c_1 <dbl>, Age <dbl>, Gender <dbl>, Location <dbl>,
## # Education <dbl>, Job <dbl>, Bank <dbl>, EmplStatus <dbl>, EducFixed <fct>,
## # BankFixed <fct>, EmplFixed <fct>, JobFixed <fct>, LocationFixed <dbl>,
## # PreferCash <dbl>, KeepCash <dbl>, FindSeller <dbl>, SmallCash <dbl>,
## # DigitalEasy <dbl>, ConvenientFlik <dbl>, FriendsFlik <dbl>,
## # Hypothetical <dbl>, Income <dbl>, GenderFactor <fct>, …
library(factoextra)
## Warning: package 'factoextra' was built under R version 4.4.2
## Loading required package: ggplot2
## Welcome! Want to learn more? See two factoextra-related books at https://goo.gl/ve3WBa
# Euclidean distances between units on the six standardized cluster variables,
# stored in Distances ("euclidean" spelled as in the hclust pipeline below)
Distances <- get_dist(mydata_clu_new,
                      method = "euclidean")
# Heat map of the distance matrix
fviz_dist(Distances,
          gradient = list(low = "slateblue4",
                          mid = "green",
                          high = "white"))
# Hopkins statistic of clustering tendency (no plot requested)
library(factoextra)
get_clust_tendency(data = mydata_clu_new,
                   n = nrow(mydata_clu_new) - 1,
                   graph = FALSE)
## $hopkins_stat
## [1] 0.6418886
##
## $plot
## NULL
The Hopkins statistic (0.64) is above 0.5, so the data are clusterable.
# Determining the number of clusters for k-means clustering.
# kmeans() starts from random centers, so the seed is fixed before each plot
# to make the curves reproducible between runs.
library(factoextra)
library(NbClust)
set.seed(123)
fviz_nbclust(mydata_clu_new, kmeans, method = "wss") +
  labs(subtitle = "Elbow method")
# Same question answered with the average silhouette width
set.seed(123)
fviz_nbclust(mydata_clu_new, kmeans, method = "silhouette") +
  labs(subtitle = "Silhouette analysis")
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(factoextra)
# Hierarchical clustering: Euclidean distances fed into Ward's method (ward.D2)
WARD <- mydata_clu_new %>%
  get_dist(method = "euclidean") %>%
  hclust(method = "ward.D2")
print(WARD)
##
## Call:
## hclust(d = ., method = "ward.D2")
##
## Cluster method : ward.D2
## Distance : euclidean
## Number of objects: 152
library(factoextra)
# Dendrogram of the Ward clustering
fviz_dend(x = WARD)
## Warning: The `<scale>` argument of `guides()` cannot be `FALSE`. Use "none" instead as
## of ggplot2 3.3.4.
## ℹ The deprecated feature was likely used in the factoextra package.
## Please report the issue at <https://github.com/kassambara/factoextra/issues>.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.
library(NbClust)
# Compare 2 to 10 k-means clusters across all NbClust indices
NbClust(data = mydata_clu_new,
        distance = "euclidean",
        min.nc = 2,
        max.nc = 10,
        method = "kmeans",
        index = "all")
## *** : The Hubert index is a graphical method of determining the number of clusters.
## In the plot of Hubert index, we seek a significant knee that corresponds to a
## significant increase of the value of the measure i.e the significant peak in Hubert
## index second differences plot.
##
## *** : The D index is a graphical method of determining the number of clusters.
## In the plot of D index, we seek a significant knee (the significant peak in Dindex
## second differences plot) that corresponds to a significant increase of the value of
## the measure.
##
## *******************************************************************
## * Among all indices:
## * 8 proposed 2 as the best number of clusters
## * 5 proposed 3 as the best number of clusters
## * 3 proposed 5 as the best number of clusters
## * 2 proposed 6 as the best number of clusters
## * 2 proposed 9 as the best number of clusters
## * 3 proposed 10 as the best number of clusters
##
## ***** Conclusion *****
##
## * According to the majority rule, the best number of clusters is 2
##
##
## *******************************************************************
## $All.index
## KL CH Hartigan CCC Scott Marriot TrCovW TraceW
## 2 16.5370 59.5648 22.8451 -1.2199 190.7707 4.307472e+12 12926.431 648.4867
## 3 0.1354 45.4359 23.4358 -1.8840 367.1832 3.036401e+12 9223.823 562.7755
## 4 0.7123 42.5792 23.8440 -1.6591 474.6207 2.662356e+12 6927.557 486.2886
## 5 2.2046 42.7495 14.9309 0.1108 564.7277 2.299495e+12 4648.256 418.8142
## 6 2.5760 40.3828 9.9144 0.8422 663.8276 1.725228e+12 3722.664 380.1973
## 7 0.8891 37.3325 9.0992 0.7345 727.1115 1.548550e+12 3098.369 356.0210
## 8 0.4194 35.0637 11.8946 0.7330 782.5853 1.404137e+12 2947.080 334.9989
## 9 9.5096 34.4609 5.7515 1.5349 847.1365 1.162195e+12 2492.866 309.4387
## 10 0.0898 32.2757 11.9974 1.1503 877.7749 1.172881e+12 2235.064 297.4742
## Friedman Rubin Cindex DB Silhouette Duda Pseudot2 Beale Ratkowsky
## 2 3.2372 1.3971 0.4534 1.5833 0.2516 1.2250 -18.5509 -0.6976 0.3433
## 3 5.4346 1.6099 0.4247 1.7077 0.2047 1.5054 -24.1722 -1.2658 0.3359
## 4 6.8884 1.8631 0.4612 1.5760 0.2104 1.1178 -5.7980 -0.3969 0.3241
## 5 7.5885 2.1633 0.4244 1.3974 0.2328 1.2897 -11.6811 -0.8421 0.3275
## 6 9.3895 2.3830 0.3998 1.3793 0.2277 1.4435 -10.4458 -1.1283 0.3105
## 7 10.5507 2.5448 0.4490 1.4012 0.2230 1.0142 -0.5887 -0.0525 0.2941
## 8 11.6282 2.7045 0.4165 1.4359 0.2077 1.6961 -10.2607 -1.4862 0.2804
## 9 12.8642 2.9279 0.4229 1.4137 0.2152 1.1606 -2.4907 -0.4969 0.2704
## 10 13.4092 3.0456 0.4162 1.3749 0.2092 1.8949 -12.7511 -1.6655 0.2591
## Ball Ptbiserial Frey McClain Dunn Hubert SDindex Dindex SDbw
## 2 324.2434 0.4586 0.8104 0.7394 0.1312 0.0023 1.6254 1.9756 1.2828
## 3 187.5918 0.4468 0.1855 1.4472 0.0831 0.0026 1.6291 1.8218 1.0393
## 4 121.5722 0.4777 0.0745 1.9274 0.0977 0.0028 1.4802 1.6897 0.6214
## 5 83.7628 0.5135 0.2775 2.2588 0.1090 0.0032 1.3569 1.5742 0.5371
## 6 63.3662 0.5105 0.7706 2.6311 0.1078 0.0033 1.3411 1.5061 0.4869
## 7 50.8601 0.4811 1.1917 3.1800 0.1112 0.0034 1.5588 1.4607 0.4626
## 8 41.8749 0.4367 0.0850 4.0759 0.1051 0.0036 1.6493 1.4036 0.4293
## 9 34.3821 0.4416 0.4428 4.3256 0.1112 0.0038 1.5593 1.3558 0.3930
## 10 29.7474 0.4290 -0.1904 4.7313 0.1480 0.0039 1.5085 1.3214 0.3768
##
## $All.CriticalValues
## CritValue_Duda CritValue_PseudoT2 Fvalue_Beale
## 2 0.7006 43.1654 1
## 3 0.6533 38.2096 1
## 4 0.6459 30.1530 1
## 5 0.6222 31.5710 1
## 6 0.5356 29.4770 1
## 7 0.6188 25.8772 1
## 8 0.4889 26.1338 1
## 9 0.4643 20.7641 1
## 10 0.4174 37.6933 1
##
## $Best.nc
## KL CH Hartigan CCC Scott Marriot TrCovW
## Number_clusters 2.000 2.0000 5.0000 9.0000 3.0000 3 3.000
## Value_Index 16.537 59.5648 8.9131 1.5349 176.4125 897026528841 3702.608
## TraceW Friedman Rubin Cindex DB Silhouette Duda
## Number_clusters 5.0000 3.0000 9.0000 6.0000 10.0000 2.0000 2.000
## Value_Index 28.8575 2.1974 -0.1056 0.3998 1.3749 0.2516 1.225
## PseudoT2 Beale Ratkowsky Ball PtBiserial Frey McClain
## Number_clusters 2.0000 2.0000 2.0000 3.0000 5.0000 1 2.0000
## Value_Index -18.5509 -0.6976 0.3433 136.6515 0.5135 NA 0.7394
## Dunn Hubert SDindex Dindex SDbw
## Number_clusters 10.000 0 6.0000 0 10.0000
## Value_Index 0.148 0 1.3411 0 0.3768
##
## $Best.partition
## [1] 1 2 1 2 1 1 2 2 2 1 2 1 1 2 2 1 1 2 2 1 2 1 1 1 1 1 1 1 1 1 2 1 2 2 2 2 1
## [38] 1 1 2 2 2 1 2 2 2 1 1 1 2 1 1 2 1 1 2 2 2 2 2 1 2 2 1 1 1 1 1 2 2 1 1 2 1
## [75] 2 1 2 2 1 2 2 1 2 2 1 2 2 2 2 2 1 2 1 1 1 1 2 2 2 1 2 2 1 2 1 1 1 1 1 1 2
## [112] 1 2 1 2 1 1 1 1 1 1 1 2 1 1 2 2 1 2 2 2 1 1 2 1 1 2 2 1 2 2 1 2 1 2 2 2 2
## [149] 1 2 1 1
We will make five clusters.
# K-means clustering into five groups.
# The seed is fixed so the random starting centers, and therefore the cluster
# numbering used in every later table, are reproducible between runs.
set.seed(123)
Clustering <- kmeans(mydata_clu_new,
                     centers = 5,  # Number of groups
                     nstart = 25)  # Number of random sets of starting centers
Clustering
## K-means clustering with 5 clusters of sizes 30, 17, 24, 41, 40
##
## Cluster means:
## Q2a_1 Q3a_1 Q4a_1 Q5a_1 Q6a_1 Q7a_1
## 1 0.30294872 -1.08988634 -1.2737371 -1.1121377 0.2349868 -0.4492697
## 2 -2.04299453 -0.15283293 0.1288002 -0.2342229 -0.1969184 -0.1982072
## 3 0.46619735 -0.27363453 -0.4237942 -0.3527855 -1.7529088 -0.1005993
## 4 0.32831346 0.01992193 0.4436057 0.4646443 0.5814218 -0.6697631
## 5 0.02482143 1.02612950 0.7001434 0.6690589 0.3632381 1.1680571
##
## Clustering vector:
## [1] 5 1 4 1 5 4 3 1 3 4 1 4 4 3 1 5 4 1 4 4 1 4 5 5 4 5 5 5 5 4 1 5 3 1 2 3 5
## [38] 4 4 3 2 1 4 1 1 2 5 4 4 3 2 5 3 5 5 1 3 2 2 4 5 2 3 5 5 5 4 5 4 3 5 5 3 5
## [75] 3 4 1 1 2 4 1 4 1 2 4 4 1 2 1 1 2 3 4 5 5 3 1 4 1 4 4 1 5 4 5 5 5 5 5 4 3
## [112] 5 1 5 4 4 4 4 4 2 5 4 1 4 2 4 3 5 3 3 3 4 5 3 2 4 1 3 2 1 2 5 1 5 4 1 3 3
## [149] 2 1 5 5
##
## Within cluster sum of squares by cluster:
## [1] 101.21820 62.11656 76.96477 92.61780 85.89687
## (between_SS / total_SS = 53.8 %)
##
## Available components:
##
## [1] "cluster" "centers" "totss" "withinss" "tot.withinss"
## [6] "betweenss" "size" "iter" "ifault"
library(factoextra)
# Plot the k-means solution for the standardized cluster variables
fviz_cluster(object = Clustering,
             data = mydata_clu_new,
             palette = "Set1",
             repel = FALSE,
             ggtheme = theme_bw())
Some units appear to lie far from the cluster centers, so I will remove them (IDs 7, 108, and 133).
# Drop the three units identified as outliers
mydata <- mydata %>%
  filter(!ID %in% c(133, 108, 7))
# Renumber the remaining units consecutively (the original IDs are overwritten)
mydata$ID <- seq_len(nrow(mydata))
# Re-standardize the six cluster variables on the reduced sample
mydata_clu_new <- as.data.frame(scale(mydata[c("Q2a_1", "Q3a_1", "Q4a_1", "Q5a_1", "Q6a_1", "Q7a_1")]))
# Missing standardized values are set to 0
mydata_clu_new[is.na(mydata_clu_new)] <- 0
# Fixed seed so the random starts, and the cluster numbering, are reproducible
set.seed(123)
Clustering <- kmeans(mydata_clu_new,
                     centers = 5,  # Number of groups
                     nstart = 25)  # Number of random sets of starting centers
Clustering
## K-means clustering with 5 clusters of sizes 29, 41, 25, 37, 17
##
## Cluster means:
## Q2a_1 Q3a_1 Q4a_1 Q5a_1 Q6a_1 Q7a_1
## 1 0.32527292 -1.099435794 -1.2365294 -1.1969146 0.2499536 -0.4063642
## 2 0.34475122 -0.004020726 0.4124110 0.4922125 0.5570996 -0.7193229
## 3 0.46902278 -0.119537224 -0.4047331 -0.2429699 -1.7180355 0.0505640
## 4 -0.02546811 1.014802901 0.7180670 0.6648481 0.4418536 1.1696604
## 5 -2.02064493 -0.147693467 0.1470795 -0.2350188 -0.2051434 -0.1920432
##
## Clustering vector:
## [1] 4 1 2 1 4 2 1 3 2 1 2 2 3 1 4 2 1 2 4 1 2 4 4 2 4 4 4 4 2 1 4 3 1 5 3 4 2
## [38] 2 3 5 1 2 1 1 5 4 2 2 3 5 4 3 3 4 1 3 5 5 2 4 5 3 4 4 4 2 4 2 3 4 4 3 4 3
## [75] 2 1 1 5 2 1 2 1 5 2 2 1 5 1 1 5 3 2 4 4 3 1 2 1 2 2 1 4 2 4 4 4 4 2 3 4 1
## [112] 4 2 2 2 2 2 5 4 2 1 2 5 2 3 4 3 3 3 2 3 5 2 1 3 5 2 5 4 1 3 2 1 3 3 5 1 4
## [149] 4
##
## Within cluster sum of squares by cluster:
## [1] 94.10655 99.94698 77.03574 78.50395 62.95983
## (between_SS / total_SS = 53.5 %)
##
## Available components:
##
## [1] "cluster" "centers" "totss" "withinss" "tot.withinss"
## [6] "betweenss" "size" "iter" "ifault"
library(factoextra)
# Plot the k-means solution obtained after removing the outliers
fviz_cluster(object = Clustering,
             data = mydata_clu_new,
             palette = "Set1",
             repel = FALSE,
             ggtheme = theme_bw())
# Cluster centers: average standardized value of each cluster variable per group
Averages <- Clustering[["centers"]]
print(Averages)
## Q2a_1 Q3a_1 Q4a_1 Q5a_1 Q6a_1 Q7a_1
## 1 0.32527292 -1.099435794 -1.2365294 -1.1969146 0.2499536 -0.4063642
## 2 0.34475122 -0.004020726 0.4124110 0.4922125 0.5570996 -0.7193229
## 3 0.46902278 -0.119537224 -0.4047331 -0.2429699 -1.7180355 0.0505640
## 4 -0.02546811 1.014802901 0.7180670 0.6648481 0.4418536 1.1696604
## 5 -2.02064493 -0.147693467 0.1470795 -0.2350188 -0.2051434 -0.1920432
# Reshape the cluster centers into long format and draw one profile per cluster
Figure <- as.data.frame(Averages)
# One row per cluster; seq_len() is safe even if there are zero rows
Figure$ID <- seq_len(nrow(Figure))
library(tidyr)
# Long format: one row per cluster/variable pair (columns name and value)
Figure <- pivot_longer(Figure, cols = c("Q2a_1", "Q3a_1", "Q4a_1", "Q5a_1", "Q6a_1", "Q7a_1"))
Figure$Group <- factor(Figure$ID,
                       levels = c(1, 2, 3, 4, 5),
                       labels = c("1", "2", "3", "4", "5"))
# Readable axis labels for the six cluster variables
Figure$NameF <- factor(Figure$name,
                       levels = c("Q2a_1", "Q3a_1", "Q4a_1", "Q5a_1", "Q6a_1", "Q7a_1"),
                       labels = c("Cash_Safety", "Cash_Speed", "Cash_Ease of Use", "Cash_Convenience", "Cash_Privacy", "Cash_Tracking Expenses"))
library(ggplot2)
ggplot(Figure, aes(x = NameF, y = value)) +
  geom_hline(yintercept = 0) +
  theme_bw() +
  geom_point(aes(shape = Group, col = Group), size = 5) +
  geom_line(aes(group = ID), linewidth = 1) +
  ylab("Averages") +
  xlab("Cluster variables") +
  ylim(-2.5, 2.5) +
  theme(axis.text.x = element_text(angle = 45, vjust = 0.50, size = 10))
# Store each unit's cluster membership
mydata$Group <- Clustering$cluster
# Per-variable ANOVA: do the cluster variables differ between the groups?
fit <- aov(
  formula = cbind(Q2a_1, Q3a_1, Q4a_1, Q5a_1, Q6a_1, Q7a_1) ~ as.factor(Group),
  data = mydata
)
summary(object = fit)
## Response Q2a_1 :
## Df Sum Sq Mean Sq F value Pr(>F)
## as.factor(Group) 4 158.22 39.555 45.813 < 2.2e-16 ***
## Residuals 144 124.33 0.863
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Response Q3a_1 :
## Df Sum Sq Mean Sq F value Pr(>F)
## as.factor(Group) 4 186.27 46.568 35.889 < 2.2e-16 ***
## Residuals 144 186.84 1.298
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Response Q4a_1 :
## Df Sum Sq Mean Sq F value Pr(>F)
## as.factor(Group) 4 166.72 41.681 36.842 < 2.2e-16 ***
## Residuals 144 162.91 1.131
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Response Q5a_1 :
## Df Sum Sq Mean Sq F value Pr(>F)
## as.factor(Group) 4 221.47 55.367 32.526 < 2.2e-16 ***
## Residuals 144 245.12 1.702
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Response Q6a_1 :
## Df Sum Sq Mean Sq F value Pr(>F)
## as.factor(Group) 4 102.047 25.5117 66.99 < 2.2e-16 ***
## Residuals 144 54.839 0.3808
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Response Q7a_1 :
## Df Sum Sq Mean Sq F value Pr(>F)
## as.factor(Group) 4 296.74 74.185 39.376 < 2.2e-16 ***
## Residuals 144 271.30 1.884
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
For every cluster variable, the mean differs for at least one of the groups (p < 0.001 in all six ANOVAs).
# Additional variables: mean Age per cluster
aggregate(x = mydata[["Age"]],
          by = list(mydata[["Group"]]),
          FUN = mean)
## Group.1 x
## 1 1 36.03448
## 2 2 35.48780
## 3 3 38.28000
## 4 4 45.78378
## 5 5 40.41176
#Checking normal distribution of variables
library(dplyr)
library(rstatix)
## Warning: package 'rstatix' was built under R version 4.4.2
##
## Attaching package: 'rstatix'
## The following object is masked from 'package:stats':
##
## filter
# Shapiro-Wilk normality test of Age within each cluster.
# Grouping uses the Group column of the piped data instead of reaching back
# into mydata$Group, so the test always follows the rows actually piped in.
mydata %>%
  group_by(Group = as.factor(Group)) %>%
  shapiro_test(Age)
## # A tibble: 5 × 4
## `as.factor(mydata$Group)` variable statistic p
## <fct> <chr> <dbl> <dbl>
## 1 1 Age 0.890 0.00569
## 2 2 Age 0.913 0.00399
## 3 3 Age 0.871 0.00457
## 4 4 Age 0.879 0.000818
## 5 5 Age 0.929 0.207
# Kruskal-Wallis rank sum test: Age across the clusters
kruskal.test(Age ~ as.factor(Group), mydata)
##
## Kruskal-Wallis rank sum test
##
## data: Age by as.factor(Group)
## Kruskal-Wallis chi-squared = 11.386, df = 4, p-value = 0.02256
Significant. Age can be used to describe clusters.
# Pearson chi-squared test of association between location (Urban/Rural)
# and membership in the 5 clusters; the result is stored, then printed.
chi_square <- chisq.test(mydata$LocationFactor, as.factor(mydata$Group))
## Warning in chisq.test(mydata$LocationFactor, as.factor(mydata$Group)):
## Chi-squared approximation may be incorrect
chi_square
##
## Pearson's Chi-squared test
##
## data: mydata$LocationFactor and as.factor(mydata$Group)
## X-squared = 5.2367, df = 4, p-value = 0.2639
Not significant, location can’t be used to describe clusters.
# Fisher's exact test with a Monte Carlo p-value (10000 replicates).
# The seed is fixed because the simulated p-value otherwise changes on every run.
set.seed(123)
fisher_result <- fisher.test(table(mydata$LocationFactor, mydata$Group), simulate.p.value = TRUE, B = 10000)
# Display the result
fisher_result
##
## Fisher's Exact Test for Count Data with simulated p-value (based on
## 10000 replicates)
##
## data: table(mydata$LocationFactor, mydata$Group)
## p-value = 0.2848
## alternative hypothesis: two.sided
# Pearson chi-squared test: gender versus membership in the 5 clusters
chi_square <- chisq.test(x = mydata$GenderFactor,
                         y = as.factor(mydata$Group))
print(chi_square)
##
## Pearson's Chi-squared test
##
## data: mydata$GenderFactor and as.factor(mydata$Group)
## X-squared = 3.0375, df = 4, p-value = 0.5516
Not significant, can’t be used to describe clusters.
# Pearson chi-squared test: education (EducFixed) versus membership in the 5 clusters
chi_square <- chisq.test(x = mydata$EducFixed,
                         y = as.factor(mydata$Group),
                         correct = TRUE)
print(chi_square)
##
## Pearson's Chi-squared test
##
## data: mydata$EducFixed and as.factor(mydata$Group)
## X-squared = 4.5211, df = 4, p-value = 0.3401
# Chi-squared test of EducFixed (education) against the 5 clusters.
# NOTE(review): this block was labelled "employment status" but the code tests
# EducFixed, so it repeats the education test; confirm whether EmplFixed was intended.
chi_square <- chisq.test(mydata$EducFixed, as.factor(mydata$Group), correct=TRUE)
chi_square
##
## Pearson's Chi-squared test
##
## data: mydata$EducFixed and as.factor(mydata$Group)
## X-squared = 4.5211, df = 4, p-value = 0.3401
# Cross-tabulate EducFixed against cluster membership
table_data_edu <- table(mydata[["EducFixed"]], mydata[["Group"]])
# Show the counts to verify the table's structure
table_data_edu
##
## 1 2 3 4 5
## Up to high school 7 16 6 16 7
## Undergrad and more 22 25 19 21 10
# Fisher's exact test with a Monte Carlo p-value (B = number of simulations).
# The seed is fixed because the simulated p-value otherwise changes on every run.
set.seed(123)
fisher_test_edu <- fisher.test(table_data_edu, simulate.p.value = TRUE, B = 10000)
# Print the results
print(fisher_test_edu)
##
## Fisher's Exact Test for Count Data with simulated p-value (based on
## 10000 replicates)
##
## data: table_data_edu
## p-value = 0.3483
## alternative hypothesis: two.sided
# Fisher's exact test of EducFixed by cluster with a Monte Carlo p-value.
# The seed is fixed so that repeated runs on the same table report the same
# simulated p-value instead of drifting from run to run.
set.seed(123)
fisher_result <- fisher.test(table(mydata$EducFixed, mydata$Group), simulate.p.value = TRUE, B = 10000)
# Display the result
fisher_result
##
## Fisher's Exact Test for Count Data with simulated p-value (based on
## 10000 replicates)
##
## data: table(mydata$EducFixed, mydata$Group)
## p-value = 0.3372
## alternative hypothesis: two.sided
Not significant, can’t be used to describe clusters.
# Pearson chi-squared test of association between employment status
# (EmplStatusFactor) and membership in the 5 clusters; stored, then printed.
# NOTE(review): per the chisq.test documentation, `correct` only affects
# 2 by 2 tables, so it presumably has no effect here - confirm.
chi_square <- chisq.test(mydata$EmplStatusFactor, as.factor(mydata$Group), correct=TRUE)
## Warning in chisq.test(mydata$EmplStatusFactor, as.factor(mydata$Group), :
## Chi-squared approximation may be incorrect
chi_square
##
## Pearson's Chi-squared test
##
## data: mydata$EmplStatusFactor and as.factor(mydata$Group)
## X-squared = 16.828, df = 4, p-value = 0.002087
# Cross-tabulate employment status (EmplStatusFactor) against cluster membership
table_data_emp <- table(mydata[["EmplStatusFactor"]], mydata[["Group"]])
# Show the counts to verify the table's structure
table_data_emp
##
## 1 2 3 4 5
## Employed 25 37 19 20 14
## Self-employed 4 4 6 17 3
# Fisher's exact test with a Monte Carlo p-value (B = number of simulations).
# The seed is fixed because the simulated p-value otherwise changes on every run.
set.seed(123)
fisher_test_emp <- fisher.test(table_data_emp, simulate.p.value = TRUE, B = 10000)
# Print the results
print(fisher_test_emp)
##
## Fisher's Exact Test for Count Data with simulated p-value (based on
## 10000 replicates)
##
## data: table_data_emp
## p-value = 0.0031
## alternative hypothesis: two.sided
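The association is significant, but the number of self-employed respondents is very small in most clusters (3–6 in four of the five clusters), so it is questionable whether employment status should be used to describe the clusters.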
# Pearson chi-squared test: bank (BankFixed) versus membership in the 5 clusters
chi_square <- chisq.test(x = mydata$BankFixed,
                         y = as.factor(mydata$Group),
                         correct = TRUE)
print(chi_square)
##
## Pearson's Chi-squared test
##
## data: mydata$BankFixed and as.factor(mydata$Group)
## X-squared = 6.6039, df = 4, p-value = 0.1584
Not significant. Can’t be used to describe clusters.
# Pearson chi-squared test: job type (JobFixed) versus membership in the 5 clusters
chi_square <- chisq.test(x = mydata$JobFixed,
                         y = as.factor(mydata$Group),
                         correct = TRUE)
print(chi_square)
##
## Pearson's Chi-squared test
##
## data: mydata$JobFixed and as.factor(mydata$Group)
## X-squared = 12.681, df = 4, p-value = 0.01295
# Cross-tabulate job type (JobFixed) against cluster membership
table_data_job <- table(mydata[["JobFixed"]], mydata[["Group"]])
# Show the counts to verify the table's structure
table_data_job
##
## 1 2 3 4 5
## White Collar 17 22 20 32 11
## Blue Collar 12 19 5 5 6
This can be used to describe clusters.
# Pearson chi-squared test of association between location (LocationFactor)
# and membership in the 5 clusters; the result is stored, then printed.
chi_square <- chisq.test(mydata$LocationFactor, as.factor(mydata$Group), correct=TRUE)
## Warning in chisq.test(mydata$LocationFactor, as.factor(mydata$Group), correct =
## TRUE): Chi-squared approximation may be incorrect
chi_square
##
## Pearson's Chi-squared test
##
## data: mydata$LocationFactor and as.factor(mydata$Group)
## X-squared = 5.2367, df = 4, p-value = 0.2639
# Contingency table of location by cluster.
# Stored under its own name so that the job-type table held in table_data_job
# is not overwritten with location counts.
table_data_loc <- table(mydata$LocationFactor, mydata$Group)
# Print table to verify structure
print(table_data_loc)
##
## 1 2 3 4 5
## Urban 23 35 22 25 13
## Rural 6 6 3 12 4
Not significant, can’t be used to describe clusters.
Overall, from the demographics, we can use the following to describe the clusters: age, employment status, and job type.
# Mean PreferCash response per cluster
aggregate(x = mydata[["PreferCash"]],
          by = list(mydata[["Group"]]),
          FUN = mean)
## Group.1 x
## 1 1 1.862069
## 2 2 3.731707
## 3 3 3.560000
## 4 4 4.351351
## 5 5 3.117647
# Shapiro-Wilk normality test of PreferCash within each cluster.
# Grouping uses the Group column of the piped data instead of reaching back
# into mydata$Group, so the test always follows the rows actually piped in.
library(dplyr)
library(rstatix)
mydata %>%
  group_by(Group = as.factor(Group)) %>%
  shapiro_test(PreferCash)
## # A tibble: 5 × 4
## `as.factor(mydata$Group)` variable statistic p
## <fct> <chr> <dbl> <dbl>
## 1 1 PreferCash 0.684 0.00000124
## 2 2 PreferCash 0.890 0.000864
## 3 3 PreferCash 0.890 0.0111
## 4 4 PreferCash 0.865 0.000359
## 5 5 PreferCash 0.877 0.0281
# Kruskal-Wallis rank sum test: PreferCash across the clusters
kruskal.test(PreferCash ~ as.factor(Group), mydata)
##
## Kruskal-Wallis rank sum test
##
## data: PreferCash by as.factor(Group)
## Kruskal-Wallis chi-squared = 28.631, df = 4, p-value = 9.29e-06
There are differences among groups, this can be used to describe clusters.
library(ggplot2)
library(dplyr)
# Wording for the seven Likert response codes
likert_labels <- c(
  "1" = "Strongly Disagree",
  "2" = "Disagree",
  "3" = "Somewhat Disagree",
  "4" = "Neutral",
  "5" = "Somewhat Agree",
  "6" = "Agree",
  "7" = "Strongly Agree"
)
# Counts of each PreferCash response within each cluster
table_clusters <- table(mydata$Group, mydata$PreferCash)
# Row proportions: share of each response within a cluster
prop_table_clusters <- prop.table(table_clusters, margin = 1)
# Long format: Var1 = cluster, Var2 = response code, Freq = share
prop_df <- as.data.frame(as.table(prop_table_clusters))
# Replace the response codes with their Likert wording
prop_df$Var2 <- factor(prop_df$Var2, levels = names(likert_labels), labels = likert_labels)
# Stacked bars of within-cluster percentages; the title names PreferCash,
# the variable that is actually tabulated above
ggplot(prop_df, aes(x = Var1, y = Freq * 100, fill = Var2)) +
  geom_bar(stat = "identity", position = "stack") +
  labs(
    x = "Group",
    y = "Percentage (%)",
    fill = "Response",
    title = "Percentage Distribution of Those Who Prefer Cash"
  ) +
  theme_minimal()
# Mean KeepCash response per cluster
aggregate(x = mydata[["KeepCash"]],
          by = list(mydata[["Group"]]),
          FUN = mean)
## Group.1 x
## 1 1 3.793103
## 2 2 4.731707
## 3 3 4.040000
## 4 4 4.351351
## 5 5 4.000000
# Shapiro-Wilk normality test of KeepCash within each cluster.
# Grouping uses the Group column of the piped data instead of reaching back
# into mydata$Group, so the test always follows the rows actually piped in.
library(dplyr)
library(rstatix)
mydata %>%
  group_by(Group = as.factor(Group)) %>%
  shapiro_test(KeepCash)
## # A tibble: 5 × 4
## `as.factor(mydata$Group)` variable statistic p
## <fct> <chr> <dbl> <dbl>
## 1 1 KeepCash 0.892 0.00638
## 2 2 KeepCash 0.877 0.000368
## 3 3 KeepCash 0.855 0.00216
## 4 4 KeepCash 0.897 0.00250
## 5 5 KeepCash 0.843 0.00855
# Kruskal-Wallis rank sum test: KeepCash across the clusters
kruskal.test(KeepCash ~ as.factor(Group), mydata)
##
## Kruskal-Wallis rank sum test
##
## data: KeepCash by as.factor(Group)
## Kruskal-Wallis chi-squared = 5.7094, df = 4, p-value = 0.2219
Not significant (p = 0.22). KeepCash can’t be used to describe clusters.
library(ggplot2)
library(dplyr)
# Wording for the seven Likert response codes
likert_labels <- c(
  "1" = "Strongly Disagree",
  "2" = "Disagree",
  "3" = "Somewhat Disagree",
  "4" = "Neutral",
  "5" = "Somewhat Agree",
  "6" = "Agree",
  "7" = "Strongly Agree"
)
# Counts of each KeepCash response within each cluster
table_clusters <- table(mydata$Group, mydata$KeepCash)
# Row proportions: share of each response within a cluster
prop_table_clusters <- prop.table(table_clusters, margin = 1)
# Long format: Var1 = cluster, Var2 = response code, Freq = share
prop_df <- as.data.frame(as.table(prop_table_clusters))
# Replace the response codes with their Likert wording
prop_df$Var2 <- factor(prop_df$Var2, levels = names(likert_labels), labels = likert_labels)
# Stacked bars of within-cluster percentages; the title names KeepCash,
# the variable that is actually tabulated above
ggplot(prop_df, aes(x = Var1, y = Freq * 100, fill = Var2)) +
  geom_bar(stat = "identity", position = "stack") +
  labs(
    x = "Group",
    y = "Percentage (%)",
    fill = "Response",
    title = "Percentage Distribution of Those Who Keep Cash"
  ) +
  theme_minimal()
# Mean FindSeller response per cluster
aggregate(x = mydata[["FindSeller"]],
          by = list(mydata[["Group"]]),
          FUN = mean)
## Group.1 x
## 1 1 6.172414
## 2 2 6.000000
## 3 3 5.240000
## 4 4 5.054054
## 5 5 5.470588
# Shapiro-Wilk normality test of FindSeller within each cluster.
# Grouping uses the Group column of the piped data instead of reaching back
# into mydata$Group, so the test always follows the rows actually piped in.
library(dplyr)
library(rstatix)
mydata %>%
  group_by(Group = as.factor(Group)) %>%
  shapiro_test(FindSeller)
## # A tibble: 5 × 4
## `as.factor(mydata$Group)` variable statistic p
## <fct> <chr> <dbl> <dbl>
## 1 1 FindSeller 0.701 0.00000216
## 2 2 FindSeller 0.850 0.0000745
## 3 3 FindSeller 0.887 0.00989
## 4 4 FindSeller 0.887 0.00129
## 5 5 FindSeller 0.888 0.0434
# Kruskal-Wallis rank sum test: FindSeller across the clusters
kruskal.test(FindSeller ~ as.factor(Group), mydata)
##
## Kruskal-Wallis rank sum test
##
## data: FindSeller by as.factor(Group)
## Kruskal-Wallis chi-squared = 15.429, df = 4, p-value = 0.00389
This can be used to describe clusters.
library(ggplot2)
library(dplyr)
# Likert wording, named by response code "1" to "7"
likert_labels <- setNames(
  c("Strongly Disagree", "Disagree", "Somewhat Disagree", "Neutral",
    "Somewhat Agree", "Agree", "Strongly Agree"),
  as.character(1:7)
)
# Counts of each FindSeller response within each cluster
table_clusters <- table(mydata$Group, mydata$FindSeller)
# Within-cluster shares (each row sums to 1)
prop_table_clusters <- prop.table(table_clusters, margin = 1)
# Long format: Var1 = cluster, Var2 = response code, Freq = share
prop_df <- as.data.frame(as.table(prop_table_clusters))
# Swap the response codes for their Likert wording
prop_df$Var2 <- factor(prop_df$Var2,
                       levels = names(likert_labels),
                       labels = likert_labels)
# Stacked percentage bars, one bar per cluster
ggplot(data = prop_df,
       mapping = aes(x = Var1, y = Freq * 100, fill = Var2)) +
  geom_col(position = "stack") +
  theme_minimal() +
  labs(
    title = "Percentage Distribution of Those Who Find Vendors Accepting Digital Payments",
    x = "Group",
    y = "Percentage (%)",
    fill = "Response"
  )
# Mean SmallCash response per cluster
aggregate(x = mydata[["SmallCash"]],
          by = list(mydata[["Group"]]),
          FUN = mean)
## Group.1 x
## 1 1 1.862069
## 2 2 2.902439
## 3 3 3.200000
## 4 4 4.351351
## 5 5 3.176471
# Shapiro-Wilk normality test of SmallCash within each cluster.
# Grouping uses the Group column of the piped data instead of reaching back
# into mydata$Group, so the test always follows the rows actually piped in.
library(dplyr)
library(rstatix)
mydata %>%
  group_by(Group = as.factor(Group)) %>%
  shapiro_test(SmallCash)
## # A tibble: 5 × 4
## `as.factor(mydata$Group)` variable statistic p
## <fct> <chr> <dbl> <dbl>
## 1 1 SmallCash 0.810 0.000126
## 2 2 SmallCash 0.879 0.000428
## 3 3 SmallCash 0.894 0.0138
## 4 4 SmallCash 0.854 0.000197
## 5 5 SmallCash 0.844 0.00885
# Kruskal-Wallis rank sum test: SmallCash across the clusters
kruskal.test(SmallCash ~ as.factor(Group), mydata)
##
## Kruskal-Wallis rank sum test
##
## data: SmallCash by as.factor(Group)
## Kruskal-Wallis chi-squared = 22.6, df = 4, p-value = 0.0001522
This can be used to describe clusters.
library(ggplot2)
library(dplyr)
# Likert wording, named by response code "1" to "7"
likert_labels <- setNames(
  c("Strongly Disagree", "Disagree", "Somewhat Disagree", "Neutral",
    "Somewhat Agree", "Agree", "Strongly Agree"),
  as.character(1:7)
)
# Counts of each SmallCash response within each cluster
table_clusters <- table(mydata$Group, mydata$SmallCash)
# Within-cluster shares (each row sums to 1)
prop_table_clusters <- prop.table(table_clusters, margin = 1)
# Long format: Var1 = cluster, Var2 = response code, Freq = share
prop_df <- as.data.frame(as.table(prop_table_clusters))
# Swap the response codes for their Likert wording
prop_df$Var2 <- factor(prop_df$Var2,
                       levels = names(likert_labels),
                       labels = likert_labels)
# Stacked percentage bars, one bar per cluster
ggplot(data = prop_df,
       mapping = aes(x = Var1, y = Freq * 100, fill = Var2)) +
  geom_col(position = "stack") +
  theme_minimal() +
  labs(
    title = "Percentage Distribution of Those Who Prefer Cash For Payments up to 10€",
    x = "Group",
    y = "Percentage (%)",
    fill = "Response"
  )
# Mean DigitalEasy response per cluster
aggregate(x = mydata[["DigitalEasy"]],
          by = list(mydata[["Group"]]),
          FUN = mean)
## Group.1 x
## 1 1 6.068966
## 2 2 5.878049
## 3 3 5.200000
## 4 4 4.432432
## 5 5 5.705882
# Check whether DigitalEasy is normally distributed within each cluster
# (Shapiro-Wilk test, one test per Group).
library(dplyr)
library(rstatix)
# Group by the bare column name: `mydata$Group` inside a dplyr verb bypasses
# data masking and would misalign if the piped data were filtered or reordered.
mydata %>%
  group_by(Group = as.factor(Group)) %>%
  shapiro_test(DigitalEasy)
## # A tibble: 5 × 4
## Group variable statistic p
## <fct> <chr> <dbl> <dbl>
## 1 1 DigitalEasy 0.668 0.000000744
## 2 2 DigitalEasy 0.812 0.0000100
## 3 3 DigitalEasy 0.878 0.00640
## 4 4 DigitalEasy 0.901 0.00305
## 5 5 DigitalEasy 0.877 0.0288
# Normality is rejected in every cluster (all p < 0.05), so the clusters are
# compared with the rank-based Kruskal-Wallis test.
kruskal.test(DigitalEasy ~ as.factor(Group),
             data = mydata)
##
## Kruskal-Wallis rank sum test
##
## data: DigitalEasy by as.factor(Group)
## Kruskal-Wallis chi-squared = 20.426, df = 4, p-value = 0.0004114
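The differences between clusters are statistically significant (Kruskal-Wallis p < 0.001), so this variable can be used to describe the clusters.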
library(ggplot2)
library(dplyr)
# Wording for the seven Likert response codes, keyed by the code itself
likert_labels <- setNames(
  c(
    "Strongly Disagree", "Disagree", "Somewhat Disagree", "Neutral",
    "Somewhat Agree", "Agree", "Strongly Agree"
  ),
  as.character(1:7)
)
# Cross-tabulate cluster (rows) against DigitalEasy response (columns)
table_clusters <- table(mydata$Group, mydata$DigitalEasy)
# Row-wise shares, so each cluster's responses sum to 1
prop_table_clusters <- prop.table(table_clusters, margin = 1)
# Long format with columns Var1 (cluster), Var2 (response) and Freq (share);
# the object is already a table, so it converts directly
prop_df <- as.data.frame(prop_table_clusters)
# Replace the response codes with their Likert wording
prop_df[["Var2"]] <- factor(
  prop_df[["Var2"]],
  levels = names(likert_labels),
  labels = likert_labels
)
# One stacked bar per cluster, with shares expressed as percentages
ggplot(prop_df, aes(x = Var1, y = Freq * 100, fill = Var2)) +
  geom_col(position = "stack") +
  labs(
    title = "Percentage Distribution of Those Who Prefer Digital For Payments up to 10€",
    x = "Group",
    y = "Percentage (%)",
    fill = "Response"
  ) +
  theme_minimal()
# Mean ConvenientFlik response within each cluster (Group).
with(
  mydata,
  aggregate(ConvenientFlik, by = list(Group), FUN = mean)
)
## Group.1 x
## 1 1 6.413793
## 2 2 5.292683
## 3 3 5.640000
## 4 4 4.081081
## 5 5 5.588235
# Check whether ConvenientFlik is normally distributed within each cluster
# (Shapiro-Wilk test, one test per Group).
library(dplyr)
library(rstatix)
# Group by the bare column name: `mydata$Group` inside a dplyr verb bypasses
# data masking and would misalign if the piped data were filtered or reordered.
mydata %>%
  group_by(Group = as.factor(Group)) %>%
  shapiro_test(ConvenientFlik)
## # A tibble: 5 × 4
## Group variable statistic p
## <fct> <chr> <dbl> <dbl>
## 1 1 ConvenientFlik 0.528 0.0000000154
## 2 2 ConvenientFlik 0.781 0.00000224
## 3 3 ConvenientFlik 0.777 0.0000966
## 4 4 ConvenientFlik 0.872 0.000532
## 5 5 ConvenientFlik 0.817 0.00351
# Normality is rejected in every cluster (all p < 0.05), so the clusters are
# compared with the rank-based Kruskal-Wallis test.
kruskal.test(ConvenientFlik ~ as.factor(Group),
             data = mydata)
##
## Kruskal-Wallis rank sum test
##
## data: ConvenientFlik by as.factor(Group)
## Kruskal-Wallis chi-squared = 24.896, df = 4, p-value = 5.28e-05
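The differences between clusters are statistically significant (Kruskal-Wallis p < 0.001), so this variable can be used to describe the clusters.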
library(ggplot2)
library(dplyr)
# Wording for the seven Likert response codes, keyed by the code itself
likert_labels <- setNames(
  c(
    "Strongly Disagree", "Disagree", "Somewhat Disagree", "Neutral",
    "Somewhat Agree", "Agree", "Strongly Agree"
  ),
  as.character(1:7)
)
# Cross-tabulate cluster (rows) against ConvenientFlik response (columns)
table_clusters <- table(mydata$Group, mydata$ConvenientFlik)
# Row-wise shares, so each cluster's responses sum to 1
prop_table_clusters <- prop.table(table_clusters, margin = 1)
# Long format with columns Var1 (cluster), Var2 (response) and Freq (share);
# the object is already a table, so it converts directly
prop_df <- as.data.frame(prop_table_clusters)
# Replace the response codes with their Likert wording
prop_df[["Var2"]] <- factor(
  prop_df[["Var2"]],
  levels = names(likert_labels),
  labels = likert_labels
)
# One stacked bar per cluster, with shares expressed as percentages
ggplot(prop_df, aes(x = Var1, y = Freq * 100, fill = Var2)) +
  geom_col(position = "stack") +
  labs(
    title = "Percentage Distribution of Those Who Use Mobile Payment Platforms due to Convenience",
    x = "Group",
    y = "Percentage (%)",
    fill = "Response"
  ) +
  theme_minimal()
# Mean FriendsFlik response within each cluster (Group).
with(
  mydata,
  aggregate(FriendsFlik, by = list(Group), FUN = mean)
)
## Group.1 x
## 1 1 5.413793
## 2 2 5.000000
## 3 3 5.040000
## 4 4 4.378378
## 5 5 5.235294
# Check whether FriendsFlik is normally distributed within each cluster
# (Shapiro-Wilk test, one test per Group).
library(dplyr)
library(rstatix)
# Group by the bare column name: `mydata$Group` inside a dplyr verb bypasses
# data masking and would misalign if the piped data were filtered or reordered.
mydata %>%
  group_by(Group = as.factor(Group)) %>%
  shapiro_test(FriendsFlik)
## # A tibble: 5 × 4
## Group variable statistic p
## <fct> <chr> <dbl> <dbl>
## 1 1 FriendsFlik 0.808 0.000117
## 2 2 FriendsFlik 0.846 0.0000602
## 3 3 FriendsFlik 0.875 0.00558
## 4 4 FriendsFlik 0.898 0.00255
## 5 5 FriendsFlik 0.838 0.00697
# Normality is rejected in every cluster (all p < 0.05), so the clusters are
# compared with the rank-based Kruskal-Wallis test.
kruskal.test(FriendsFlik ~ as.factor(Group),
             data = mydata)
##
## Kruskal-Wallis rank sum test
##
## data: FriendsFlik by as.factor(Group)
## Kruskal-Wallis chi-squared = 7.617, df = 4, p-value = 0.1067
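The differences between clusters are not statistically significant at the 5% level (Kruskal-Wallis p = 0.107, i.e. about 11%), so this variable can be used to describe the clusters only tentatively.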
library(ggplot2)
library(dplyr)
# Wording for the seven Likert response codes, keyed by the code itself
likert_labels <- setNames(
  c(
    "Strongly Disagree", "Disagree", "Somewhat Disagree", "Neutral",
    "Somewhat Agree", "Agree", "Strongly Agree"
  ),
  as.character(1:7)
)
# Cross-tabulate cluster (rows) against FriendsFlik response (columns)
table_clusters <- table(mydata$Group, mydata$FriendsFlik)
# Row-wise shares, so each cluster's responses sum to 1
prop_table_clusters <- prop.table(table_clusters, margin = 1)
# Long format with columns Var1 (cluster), Var2 (response) and Freq (share);
# the object is already a table, so it converts directly
prop_df <- as.data.frame(prop_table_clusters)
# Replace the response codes with their Likert wording
prop_df[["Var2"]] <- factor(
  prop_df[["Var2"]],
  levels = names(likert_labels),
  labels = likert_labels
)
# One stacked bar per cluster, with shares expressed as percentages
ggplot(prop_df, aes(x = Var1, y = Freq * 100, fill = Var2)) +
  geom_col(position = "stack") +
  labs(
    title = "Percentage Distribution of Those Who Use Mobile Payment Platforms due to Recommendations",
    x = "Group",
    y = "Percentage (%)",
    fill = "Response"
  ) +
  theme_minimal()
Cluster 1: STRONGLY DIGITAL The mean age in this group is 36.03. It contains a mix of white collar (17) and blue collar (12) workers. They strongly disagree with keeping and preferring cash, and are skeptical of cash usage even for small payments. They prefer digital payments and are in favor of mobile payment platforms. They are comfortable with non-cash transactions. This group consists of fairly young individuals who actively avoid cash and are strong proponents of digital payments.
Cluster 2: DIGITAL The mean age in this group is 35.49. It contains more white collar (22) than blue collar (19) workers. They are more neutral about keeping and preferring cash and lean toward digital payments. They moderately accept mobile payment platforms, and are less reliant on cash for small payments, but aren’t fully opposed to it. They are more open to various payment methods. This group is likely to transition toward digital payments but doesn’t completely reject cash.
Cluster 3: BALANCED The mean age in this group is 38.28. It mostly consists of white collar workers (20), and only 5 people have blue collar jobs. They have a neutral stance on keeping and preferring cash, and there’s an even distribution between cash and digital preferences for small payments. They somewhat agree with mobile payment platforms. They do find vendors who accept digital payments but are still open to using cash. This group is balanced in their payment methods.
Cluster 4: STRONGLY CASH The mean age in this group is 45.78, which is the highest. It contains mostly white collar workers (32) and 5 blue collar workers. They strongly disagree with digital payments, prefer cash transactions and believe in keeping cash. They’re skeptical of digital payment platforms. They’re less likely to use mobile payment platforms because they’re convenient or because they’re recommended to them. This group consists of older, security-conscious individuals who are resistant to digital payments and prefer the reliability of cash.
Cluster 5: CASH The mean age in this group is 40.41, with mostly white collar workers (11), and a few blue collar workers (6). They have mixed opinions on cash vs. digital payments, but lean towards preferring cash for small payments. They have neutral or slightly positive feelings about mobile payment platforms. They’re more flexible than cluster 4 but are still cash-reliant. This group consists of individuals who use cash frequently but are open to digital payments in certain conditions.
Conclusion:
Clusters 1 and 2: pay mostly digitally, especially cluster 1
Cluster 3: balanced between using cash and digital payments
Clusters 4 and 5: more cash-reliant, with cluster 4 being the most resistant to digital payments