# Load dataset from Excel file
library(readxl)

## Warning: package 'readxl' was built under R version 4.4.2

# Read the survey workbook from the working directory into a tibble
mydata <- read_xlsx(path = "./DataPerception.xlsx")

## New names:
## • `` -> `...40`

# Preview the first six rows of the imported data
head(mydata, n = 6L)

## # A tibble: 6 × 41
##      ID Q2a_1 Q2b_1 Q2c_1 Q3a_1 Q3b_1 Q3c_1 Q4a_1 Q4b_1 Q4c_1 Q5a_1 Q5b_1 Q5c_1
##   <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1     1     5     6     5     7     7     5     7     7     5     7     7     6
## 2     2     7     7     6     2     6     6     4     7     7     3     6     6
## 3     3     7     6     6     6     6     6     7     7     5     6     7     5
## 4     4     7     3     5     6     6     3     3     6     5     3     6     6
## 5     5     6     5     5     5     5     6     6     6     7     4     6     7
## 6     6     6     6     6     5     7     6     6     7     5     7     7     5
## # ℹ 28 more variables: Q6a_1 <dbl>, Q6b_1 <dbl>, Q6c_1 <dbl>, Q7a_1 <dbl>,
## #   Q7b_1 <dbl>, Q7c_1 <dbl>, Age <dbl>, Gender <dbl>, Location <dbl>,
## #   Education <dbl>, Job <dbl>, Bank <dbl>, EmplStatus <dbl>, EducFixed <chr>,
## #   BankFixed <dbl>, EmplFixed <dbl>, JobFixed <dbl>, LocationFixed <dbl>,
## #   BetterCash <dbl>, CashOnme <dbl>, FindSeller <dbl>, SmallCash <dbl>,
## #   DigitalEasy <dbl>, ConvenientFlik <dbl>, FriendsFlik <dbl>,
## #   Hypothetical <dbl>, ...40 <lgl>, CashKeep <dbl>

# List every column name of the imported data
names(mydata)

##  [1] "ID"             "Q2a_1"          "Q2b_1"          "Q2c_1"         
##  [5] "Q3a_1"          "Q3b_1"          "Q3c_1"          "Q4a_1"         
##  [9] "Q4b_1"          "Q4c_1"          "Q5a_1"          "Q5b_1"         
## [13] "Q5c_1"          "Q6a_1"          "Q6b_1"          "Q6c_1"         
## [17] "Q7a_1"          "Q7b_1"          "Q7c_1"          "Age"           
## [21] "Gender"         "Location"       "Education"      "Job"           
## [25] "Bank"           "EmplStatus"     "EducFixed"      "BankFixed"     
## [29] "EmplFixed"      "JobFixed"       "LocationFixed"  "BetterCash"    
## [33] "CashOnme"       "FindSeller"     "SmallCash"      "DigitalEasy"   
## [37] "ConvenientFlik" "FriendsFlik"    "Hypothetical"   "...40"         
## [41] "CashKeep"

# Labelled factor copies of the numerically coded demographic variables.
# The numeric originals stay untouched; each factor goes into a new *Factor column.
mydata[["GenderFactor"]] <- factor(
  mydata[["Gender"]],
  levels = 1:2,
  labels = c("Male", "Female")
)

mydata[["LocationFactor"]] <- factor(
  mydata[["Location"]],
  levels = 1:3,
  labels = c("Urban", "Suburban", "Rural")
)

# NOTE(review): "Unifinished" looks like a typo for "Unfinished"; the label is
# kept unchanged here because it appears in the printed tables.
mydata[["EducationFactor"]] <- factor(
  mydata[["Education"]],
  levels = 1:7,
  labels = c("Unifinished elementary", "Finished elementary",
             "Vocational school", "General high school",
             "Undergraduate degree", "Master's degree", "PhD")
)

mydata[["EmplStatusFactor"]] <- factor(
  mydata[["EmplStatus"]],
  levels = 1:4,
  labels = c("Employed", "Self-employed", "Retired", "Unemployed")
)

mydata[["JobFactor"]] <- factor(
  mydata[["Job"]],
  levels = 1:5,
  labels = c("Physical", "Service", "Office", "Public", "Creative")
)

# The recoded two-category ("Fixed") variables are coded 0/1.
# Each one is overwritten in place with a labelled factor.
mydata[["EducFixed"]] <- factor(
  mydata[["EducFixed"]],
  levels = 0:1,
  labels = c("Up to high school", "Undergrad and more")
)

mydata[["BankFixed"]] <- factor(
  mydata[["BankFixed"]],
  levels = 0:1,
  labels = c("NLB", "Other banks")
)

mydata[["EmplFixed"]] <- factor(
  mydata[["EmplFixed"]],
  levels = 0:1,
  labels = c("Employed", "Others")
)

mydata[["JobFixed"]] <- factor(
  mydata[["JobFixed"]],
  levels = 0:1,
  labels = c("Office", "Others")
)

mydata[["LocationFixed"]] <- factor(
  mydata[["LocationFixed"]],
  levels = 0:1,
  labels = c("Urban", "Others")
)

# Convert the seven agreement items (coded 1-7) into labelled factors.
# All items share the same 7-point scale, so one conversion is applied to
# every column instead of repeating the call per item.
likert_items <- c("BetterCash", "CashOnme", "FindSeller", "SmallCash",
                  "DigitalEasy", "ConvenientFlik", "FriendsFlik")

mydata[likert_items] <- lapply(
  mydata[likert_items],
  factor,
  levels = 1:7,
  labels = c("Strongly disagree", "Disagree", "Somewhat disagree", "Neutral",
             "Somewhat agree", "Agree", "Strongly Agree")
)

For the purpose of clustering, I chose six cluster variables: Q2a_1, Q3a_1, Q4a_1, Q5a_1, Q6a_1 and Q7a_1.

#Standardizing the six cluster variables and storing them in a new data frame

mydata_clu_new <- as.data.frame(
  scale(
    mydata[, c("Q2a_1", "Q3a_1", "Q4a_1", "Q5a_1", "Q6a_1", "Q7a_1")]
  )
)

#Finding outliers

# Missing standardized answers are set to 0, which is the variable mean after
# scaling, so they do not contribute to the distance below.
mydata_clu_new[is.na(mydata_clu_new)] <- 0

# Euclidean distance of each unit from the origin of the standardized
# variables. Every one of the six variables must be squared: an unsquared term
# can make the sum negative, and sqrt() then returns NaN.
mydata$Dissimilarity <- with(
  mydata_clu_new,
  sqrt(Q2a_1^2 + Q3a_1^2 + Q4a_1^2 + Q5a_1^2 + Q6a_1^2 + Q7a_1^2)
)

## Warning in sqrt(mydata_clu_new$Q2a_1^2 + mydata_clu_new$Q3a_1^2 +
## mydata_clu_new$Q4a_1^2 + : NaNs produced

#Listing the six units with the largest dissimilarity, in descending order

mydata[head(order(-mydata$Dissimilarity)), c("ID", "Dissimilarity")]

## # A tibble: 6 × 2
##      ID Dissimilarity
##   <dbl>         <dbl>
## 1    14          4.28
## 2    40          4.21
## 3   120          4.14
## 4    71          3.84
## 5   138          3.83
## 6    34          3.39

There is a relatively big jump between the third and the fourth unit, so I will check the first three units.

#Showing units ID14, 40, 120

# Select by the ID column rather than by row position, so the right units are
# shown even if the row order and the IDs ever stop coinciding.
print(mydata[mydata$ID %in% c(14, 40, 120), ])

## # A tibble: 3 × 47
##      ID Q2a_1 Q2b_1 Q2c_1 Q3a_1 Q3b_1 Q3c_1 Q4a_1 Q4b_1 Q4c_1 Q5a_1 Q5b_1 Q5c_1
##   <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1    14     7     7     5     3     6     7     6     6     7     3     6     7
## 2    40     5     3     3     2     7     7     2     7     7     2     7     7
## 3   120     1     6     5     6     6     7     7     5     6     6     6     6
## # ℹ 34 more variables: Q6a_1 <dbl>, Q6b_1 <dbl>, Q6c_1 <dbl>, Q7a_1 <dbl>,
## #   Q7b_1 <dbl>, Q7c_1 <dbl>, Age <dbl>, Gender <dbl>, Location <dbl>,
## #   Education <dbl>, Job <dbl>, Bank <dbl>, EmplStatus <dbl>, EducFixed <fct>,
## #   BankFixed <fct>, EmplFixed <fct>, JobFixed <fct>, LocationFixed <fct>,
## #   BetterCash <fct>, CashOnme <fct>, FindSeller <fct>, SmallCash <fct>,
## #   DigitalEasy <fct>, ConvenientFlik <fct>, FriendsFlik <fct>,
## #   Hypothetical <dbl>, ...40 <lgl>, CashKeep <dbl>, GenderFactor <fct>, …

They don’t seem unusual.

#Removing ...

library(factoextra)

## Warning: package 'factoextra' was built under R version 4.4.2

## Loading required package: ggplot2

## Welcome! Want to learn more? See two factoextra-related books at https://goo.gl/ve3WBa

#Finding Euclidean distances based on 6 Cluster variables, then saving them into object Distances

# Uses the documented spelling "euclidean", matching the other get_dist() call
# in this analysis.
Distances <- get_dist(mydata_clu_new,
                      method = "euclidean")

#Heat map of the pairwise distance matrix

fviz_dist(
  dist.obj = Distances,
  gradient = list(low = "slateblue4", mid = "skyblue3", high = "skyblue")
)

There are three or four groups of homogeneous objects forming, but they are not very evident.

#Hopkins statistic: assessing whether the data have a clustering tendency

library(factoextra)
get_clust_tendency(
  data = mydata_clu_new,
  n = nrow(mydata_clu_new) - 1,
  graph = FALSE
)

## $hopkins_stat
## [1] 0.6520884
## 
## $plot
## NULL

The Hopkins statistic (0.65) is above 0.5, so the data are clusterable.

#Choosing the number of clusters for K-means: within-cluster sum of squares (elbow)

library(factoextra)
library(NbClust)

fviz_nbclust(x = mydata_clu_new, FUNcluster = kmeans, method = "wss") +
  labs(subtitle = "Elbow method")

It seems that the biggest break is at 5 or 6, indicating that we should form 5 or 6 clusters based on Elbow method.

#Choosing the number of clusters for K-means: average silhouette width

fviz_nbclust(x = mydata_clu_new, FUNcluster = kmeans, method = "silhouette") +
  labs(subtitle = "Silhouette analysis")

Since we want average Silhouette to be as high as possible, according to this index, it is the best option to form 2 clusters, but 3, 5 or 6 is also almost as good.

library(dplyr)

## 
## Attaching package: 'dplyr'

## The following objects are masked from 'package:stats':
## 
##     filter, lag

## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union

# Hierarchical clustering of the standardized cluster variables:
# Euclidean distances between units, agglomerated with method "ward.D2".
library(factoextra)
WARD <- mydata_clu_new %>%
  get_dist(method = "euclidean") %>%  
  hclust(method = "ward.D2")          

# Print a short summary of the fitted hclust object
WARD

## 
## Call:
## hclust(d = ., method = "ward.D2")
## 
## Cluster method   : ward.D2 
## Distance         : euclidean 
## Number of objects: 168

# Draw the dendrogram of the hierarchical clustering
library(factoextra)
fviz_dend(x = WARD)

## Warning: The `<scale>` argument of `guides()` cannot be `FALSE`. Use "none" instead as
## of ggplot2 3.3.4.
## ℹ The deprecated feature was likely used in the factoextra package.
##   Please report the issue at <https://github.com/kassambara/factoextra/issues>.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.

# Compute all NbClust indices for k-means solutions with 2 to 10 clusters
# and report the number of clusters each index proposes.
library(NbClust)
NbClust(
  data = mydata_clu_new,
  method = "kmeans",
  distance = "euclidean",
  min.nc = 2,
  max.nc = 10,
  index = "all"
)

## *** : The Hubert index is a graphical method of determining the number of clusters.
##                 In the plot of Hubert index, we seek a significant knee that corresponds to a 
##                 significant increase of the value of the measure i.e the significant peak in Hubert
##                 index second differences plot. 
##

## *** : The D index is a graphical method of determining the number of clusters. 
##                 In the plot of D index, we seek a significant knee (the significant peak in Dindex
##                 second differences plot) that corresponds to a significant increase of the value of
##                 the measure. 
##  
## ******************************************************************* 
## * Among all indices:                                                
## * 6 proposed 2 as the best number of clusters 
## * 6 proposed 3 as the best number of clusters 
## * 5 proposed 5 as the best number of clusters 
## * 1 proposed 6 as the best number of clusters 
## * 3 proposed 8 as the best number of clusters 
## * 2 proposed 10 as the best number of clusters 
## 
##                    ***** Conclusion *****                            
##  
## * According to the majority rule, the best number of clusters is  2 
##  
##  
## *******************************************************************

## $All.index
##        KL      CH Hartigan     CCC    Scott      Marriot    TrCovW   TraceW
## 2  3.1827 57.9657  28.3492 -2.5629 173.9249 5.366386e+12 13199.460 671.5135
## 3  0.5625 47.8173  24.2670 -2.6210 364.6717 3.879425e+12  9614.043 573.5617
## 4  0.6372 44.3850  25.1876 -2.6681 479.6169 3.479378e+12  7191.278 500.0220
## 5  2.3310 44.4256  15.6463 -1.0277 593.1414 2.765993e+12  4990.154 433.4514
## 6  1.1173 41.8231  13.4675 -0.4471 689.9419 2.238596e+12  3894.252 395.4885
## 7  1.1191 39.7477  11.7876  0.1050 754.2201 2.078283e+12  3559.316 365.1339
## 8  4.1349 38.0102   7.4798  0.5409 817.5443 1.862045e+12  3026.031 340.2244
## 9  1.7033 35.5253   6.3092  0.2683 892.9602 1.504314e+12  2960.553 325.0297
## 10 0.3790 33.3212   3.4812 -0.0651 920.0917 1.580215e+12  2649.834 312.6247
##    Friedman  Rubin Cindex     DB Silhouette   Duda Pseudot2   Beale Ratkowsky
## 2    2.8254 1.3492 0.4405 1.6644     0.2169 1.3980 -26.1913 -1.0805    0.3259
## 3    4.7862 1.5796 0.3979 1.7596     0.2023 2.1839 -51.5006 -2.0403    0.3312
## 4    6.1517 1.8119 0.4271 1.6065     0.2171 1.1293  -6.8707 -0.4333    0.3207
## 5    7.1848 2.0902 0.3968 1.3997     0.2398 1.8716 -17.6966 -1.7103    0.3226
## 6    8.6824 2.2908 0.4134 1.4038     0.2367 1.3524  -9.1200 -0.9435    0.3060
## 7    9.5696 2.4813 0.3889 1.4472     0.2151 1.3315 -11.2042 -0.9123    0.2917
## 8   10.5086 2.6629 0.4484 1.3463     0.2313 2.5006 -17.4029 -2.1439    0.2792
## 9   12.2185 2.7874 0.3904 1.4202     0.2235 2.4389 -19.4694 -2.0952    0.2663
## 10  12.3861 2.8980 0.3539 1.3717     0.2185 1.1475  -1.7998 -0.4671    0.2558
##        Ball Ptbiserial    Frey McClain   Dunn Hubert SDindex Dindex   SDbw
## 2  335.7568     0.3510  0.1525  0.7625 0.1310 0.0019  1.8108 1.8917 9.7859
## 3  191.1872     0.4331  0.1434  1.3184 0.1245 0.0028  1.8815 1.7204 0.7724
## 4  125.0055     0.4744  0.0511  1.7343 0.1090 0.0032  1.7597 1.6095 0.6715
## 5   86.6903     0.5082  0.2441  1.9469 0.1090 0.0034  1.5438 1.5091 0.5740
## 6   65.9148     0.5116  1.0270  2.2516 0.1202 0.0037  1.5468 1.4508 0.5336
## 7   52.1620     0.4584 -0.0578  3.1030 0.1173 0.0039  1.7740 1.3634 0.4818
## 8   42.5280     0.4668  0.6230  3.1017 0.1235 0.0042  1.5471 1.3448 0.4524
## 9   36.1144     0.4359  0.3661  3.8216 0.1112 0.0042  1.7320 1.3012 0.4680
## 10  31.2625     0.4144  3.4007  4.5526 0.1301 0.0044  1.7707 1.2258 0.4359
## 
## $All.CriticalValues
##    CritValue_Duda CritValue_PseudoT2 Fvalue_Beale
## 2          0.6955            40.2797            1
## 3          0.6433            52.6826            1
## 4          0.6757            28.7988            1
## 5          0.5356            32.9448            1
## 6          0.4889            36.5873            1
## 7          0.5276            40.2959            1
## 8          0.4503            35.4073            1
## 9          0.4347            42.9164            1
## 10         0.4997            14.0160            1
## 
## $Best.nc
##                     KL      CH Hartigan    CCC    Scott      Marriot   TrCovW
## Number_clusters 8.0000  2.0000   5.0000 8.0000   3.0000 3.000000e+00    3.000
## Value_Index     4.1349 57.9657   9.5412 0.5409 190.7468 1.086913e+12 3585.416
##                  TraceW Friedman   Rubin  Cindex     DB Silhouette  Duda
## Number_clusters  5.0000   3.0000  5.0000 10.0000 8.0000     5.0000 2.000
## Value_Index     28.6078   1.9608 -0.0776  0.3539 1.3463     0.2398 1.398
##                 PseudoT2   Beale Ratkowsky     Ball PtBiserial Frey McClain
## Number_clusters   2.0000  2.0000    3.0000   3.0000     6.0000    1  2.0000
## Value_Index     -26.1913 -1.0805    0.3312 144.5695     0.5116   NA  0.7625
##                  Dunn Hubert SDindex Dindex    SDbw
## Number_clusters 2.000      0  5.0000      0 10.0000
## Value_Index     0.131      0  1.5438      0  0.4359
## 
## $Best.partition
##   [1] 1 2 1 2 1 1 2 2 2 1 2 1 1 2 2 1 1 2 2 1 2 2 1 1 1 1 1 1 1 1 2 1 2 2 2 2 1
##  [38] 1 1 2 2 2 1 2 2 2 1 1 1 2 1 1 2 1 1 2 2 2 2 2 1 2 2 1 1 1 1 1 2 2 1 1 2 1
##  [75] 2 1 2 2 1 2 2 2 2 2 1 2 2 2 2 2 2 2 1 1 1 1 2 2 2 1 2 2 1 2 1 1 1 1 1 1 2
## [112] 1 2 1 2 1 1 2 1 1 1 1 2 1 1 2 2 1 2 2 2 1 1 2 1 1 2 2 1 2 2 1 2 1 2 2 2 2
## [149] 1 2 1 1 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2

# K-means starts from random centers, so the seed is fixed to make the
# solution (and the numbering of the clusters) identical on every run.
set.seed(123)
Clustering <- kmeans(mydata_clu_new,
                     centers = 5, #Number of groups
                     nstart = 25) #Number of attempts at different starting leader positions

# Print cluster sizes, centers, memberships and sums of squares
Clustering

## K-means clustering with 5 clusters of sizes 17, 39, 60, 23, 29
## 
## Cluster means:
##         Q2a_1       Q3a_1      Q4a_1      Q5a_1      Q6a_1      Q7a_1
## 1 -2.04299453 -0.15283293  0.1288002 -0.2342229 -0.1969184 -0.1982072
## 2  0.04063469  1.04412623  0.7289623  0.6531940  0.3558390  1.1871225
## 3  0.23103022  0.00902994  0.2372070  0.3725217  0.3978323 -0.4700529
## 4  0.44911013 -0.29962981 -0.3654457 -0.3996591 -1.8156404 -0.1234300
## 5  0.30878648 -1.09562323 -1.2567691 -1.1948939  0.2537823 -0.4098687
## 
## Clustering vector:
##   [1] 2 5 3 5 2 3 4 5 4 3 5 3 3 4 5 2 3 5 3 3 5 3 2 2 3 2 2 2 2 3 5 2 4 5 1 4 2
##  [38] 3 3 4 1 5 3 5 5 1 2 3 3 4 1 2 4 2 2 5 4 1 1 3 2 1 4 2 2 2 3 2 3 4 2 2 4 2
##  [75] 4 3 5 5 1 3 5 3 5 1 3 3 5 1 5 5 1 4 3 2 2 4 5 3 5 3 3 5 2 3 2 2 2 2 2 3 4
## [112] 2 5 2 3 3 3 3 3 1 2 3 5 3 1 3 4 2 4 4 4 3 2 4 1 3 5 4 1 3 1 3 5 2 3 5 3 4
## [149] 1 5 2 2 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3
## 
## Within cluster sum of squares by cluster:
## [1]  62.11656  82.66081 124.50231  70.87847  93.29320
##  (between_SS / total_SS =  52.2 %)
## 
## Available components:
## 
## [1] "cluster"      "centers"      "totss"        "withinss"     "tot.withinss"
## [6] "betweenss"    "size"         "iter"         "ifault"

# Plot the units and their cluster membership
library(factoextra)
fviz_cluster(
  object = Clustering,
  data = mydata_clu_new,
  repel = FALSE,
  palette = "Set1",
  ggtheme = theme_bw()
)

Units 40 and 120 seem to be far away from the center, so I will remove them.

# Drop the units with ID 120 and 40, then renumber the IDs consecutively
mydata <- filter(mydata, !(ID %in% c(120, 40)))

mydata$ID <- seq(from = 1, to = nrow(mydata))


# Standardize the six cluster variables again on the reduced sample
mydata_clu_new <- as.data.frame(
  scale(
    mydata[, c("Q2a_1", "Q3a_1", "Q4a_1", "Q5a_1", "Q6a_1", "Q7a_1")]
  )
)

# Replace missing standardized values with 0
mydata_clu_new <- replace(mydata_clu_new, is.na(mydata_clu_new), 0)

# K-means starts from random centers, so the seed is fixed to make the
# solution (and the numbering of the clusters) identical on every run.
# Everything that follows is interpreted by cluster number.
set.seed(123)
Clustering <- kmeans(mydata_clu_new,
                     centers = 5, #Number of groups
                     nstart = 25) #Number of attempts at different starting leader positions

# Print cluster sizes, centers, memberships and sums of squares
Clustering

## K-means clustering with 5 clusters of sizes 25, 16, 59, 36, 30
## 
## Cluster means:
##         Q2a_1       Q3a_1       Q4a_1      Q5a_1      Q6a_1        Q7a_1
## 1  0.50200868 -0.12950095 -0.30466060 -0.2482056 -1.7246734  0.006730357
## 2 -2.05447054 -0.22662665  0.07048118 -0.3003852 -0.2750557 -0.294873759
## 3  0.20812089  0.02202012  0.31437151  0.3445813  0.4095805 -0.447511693
## 4 -0.01506026  1.07012340  0.75255714  0.6750440  0.4614548  1.229972713
## 5  0.28614495 -1.09866931 -1.30503867 -1.1206858  0.2246701 -0.444203551
## 
## Clustering vector:
##   [1] 4 5 3 5 4 3 1 5 1 3 5 3 3 1 5 4 3 5 3 3 5 3 4 4 3 4 4 4 4 3 5 4 1 5 2 1 4
##  [38] 3 3 2 5 3 5 5 2 4 3 3 1 2 4 1 1 4 5 1 2 2 3 4 2 1 4 4 4 3 4 3 1 4 4 1 4 1
##  [75] 3 5 5 2 3 5 3 5 2 3 3 5 2 5 5 2 1 3 3 4 1 5 3 5 3 3 5 4 3 4 4 4 4 4 3 1 4
## [112] 5 4 3 3 3 3 3 4 3 5 3 2 3 1 4 1 1 1 3 4 1 2 3 5 1 2 5 2 3 5 1 3 5 1 1 2 5
## [149] 4 4 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3
## 
## Within cluster sum of squares by cluster:
## [1]  75.13789  54.55411 114.37998  76.26333 104.07766
##  (between_SS / total_SS =  52.5 %)
## 
## Available components:
## 
## [1] "cluster"      "centers"      "totss"        "withinss"     "tot.withinss"
## [6] "betweenss"    "size"         "iter"         "ifault"

# Plot the units and their cluster membership for the final solution
library(factoextra)
fviz_cluster(
  object = Clustering,
  data = mydata_clu_new,
  repel = FALSE,
  palette = "Set1",
  ggtheme = theme_bw()
)

#Cluster centers: average standardized value of each cluster variable per group

Averages <- Clustering[["centers"]]
Averages

##         Q2a_1       Q3a_1       Q4a_1      Q5a_1      Q6a_1        Q7a_1
## 1  0.50200868 -0.12950095 -0.30466060 -0.2482056 -1.7246734  0.006730357
## 2 -2.05447054 -0.22662665  0.07048118 -0.3003852 -0.2750557 -0.294873759
## 3  0.20812089  0.02202012  0.31437151  0.3445813  0.4095805 -0.447511693
## 4 -0.01506026  1.07012340  0.75255714  0.6750440  0.4614548  1.229972713
## 5  0.28614495 -1.09866931 -1.30503867 -1.1206858  0.2246701 -0.444203551

# Reshape the cluster centers to long format for plotting:
# one row per combination of cluster (ID) and cluster variable.
Figure <- as.data.frame(Averages)
Figure[["ID"]] <- seq_len(nrow(Figure))

library(tidyr)
Figure <- pivot_longer(Figure, cols = -ID)

# Cluster number as a factor, used for point shape and colour
Figure[["Group"]] <- factor(
  Figure[["ID"]],
  levels = 1:5,
  labels = as.character(1:5)
)

# Readable variable names for the x axis, in the original variable order
Figure[["NameF"]] <- factor(
  Figure[["name"]],
  levels = c("Q2a_1", "Q3a_1", "Q4a_1", "Q5a_1", "Q6a_1", "Q7a_1"),
  labels = c("Cash_Safety", "Cash_Speed", "Cash_Ease of Use",
             "Cash_Convenience", "Cash_Privacy", "Cash_Tracking Expenses")
)

# Profile plot of the cluster centers: one line per cluster across the six
# cluster variables, with a reference line at the overall mean (0).
library(ggplot2)
ggplot(data = Figure, mapping = aes(x = NameF, y = value)) +
  geom_hline(yintercept = 0) +
  geom_point(mapping = aes(shape = Group, col = Group), size = 5) +
  geom_line(mapping = aes(group = ID), linewidth = 1) +
  labs(x = "Cluster variables", y = "Averages") +
  ylim(-2.5, 2.5) +
  theme_bw() +
  theme(axis.text.x = element_text(angle = 45, vjust = 0.50, size = 10))

#Storing the cluster membership of every unit in the data

mydata[["Group"]] <- Clustering[["cluster"]]

#One-way ANOVA per cluster variable: do the groups differ on the raw answers?
# NOTE(review): this uses the raw columns of mydata, so units with missing
# answers are dropped here although they were clustered with imputed values.

fit <- aov(
  cbind(Q2a_1, Q3a_1, Q4a_1, Q5a_1, Q6a_1, Q7a_1) ~ as.factor(Group),
  data = mydata
)

summary(fit)

##  Response Q2a_1 :
##                   Df Sum Sq Mean Sq F value    Pr(>F)    
## as.factor(Group)   4 140.74  35.186  41.808 < 2.2e-16 ***
## Residuals        145 122.03   0.842                      
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
##  Response Q3a_1 :
##                   Df Sum Sq Mean Sq F value    Pr(>F)    
## as.factor(Group)   4 200.48  50.120  40.602 < 2.2e-16 ***
## Residuals        145 178.99   1.234                      
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
##  Response Q4a_1 :
##                   Df Sum Sq Mean Sq F value    Pr(>F)    
## as.factor(Group)   4 176.01  44.001  44.225 < 2.2e-16 ***
## Residuals        145 144.27   0.995                      
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
##  Response Q5a_1 :
##                   Df Sum Sq Mean Sq F value    Pr(>F)    
## as.factor(Group)   4 209.54  52.385  29.362 < 2.2e-16 ***
## Residuals        145 258.70   1.784                      
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
##  Response Q6a_1 :
##                   Df  Sum Sq Mean Sq F value    Pr(>F)    
## as.factor(Group)   4 100.089 25.0223  70.354 < 2.2e-16 ***
## Residuals        145  51.571  0.3557                      
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
##  Response Q7a_1 :
##                   Df Sum Sq Mean Sq F value    Pr(>F)    
## as.factor(Group)   4 306.07  76.517  39.809 < 2.2e-16 ***
## Residuals        145 278.70   1.922                      
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## 16 observations deleted due to missingness

For every cluster variable, the mean differs for at least one of the groups (p < 0.001 in all six ANOVAs).

#Additional variables

# Mean age per group. na.rm = TRUE is passed on to mean() so that a group
# containing missing ages still gets a mean instead of NA.
aggregate(mydata$Age,
          by = list(mydata$Group),
          FUN = mean,
          na.rm = TRUE)

##   Group.1        x
## 1       1 37.76000
## 2       2 40.43750
## 3       3       NA
## 4       4 46.75000
## 5       5 36.16667

#Checking normal distribution of variables

library(dplyr)
library(rstatix)

## Warning: package 'rstatix' was built under R version 4.4.2

## 
## Attaching package: 'rstatix'

## The following object is masked from 'package:stats':
## 
##     filter

# Shapiro-Wilk normality test of Age, run separately within each group
shapiro_test(
  group_by(mydata, as.factor(mydata$Group)),
  Age
)

## # A tibble: 5 × 4
##   `as.factor(mydata$Group)` variable statistic        p
##   <fct>                     <chr>        <dbl>    <dbl>
## 1 1                         Age          0.848 0.00164 
## 2 2                         Age          0.923 0.188   
## 3 3                         Age          0.896 0.000948
## 4 4                         Age          0.896 0.00266 
## 5 5                         Age          0.899 0.00789

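Age is not normally distributed in four of the five groups (Shapiro-Wilk p < 0.05), so I use the Kruskal-Wallis test instead of ANOVA.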

# Kruskal-Wallis rank sum test: does the distribution of Age differ between groups?
kruskal.test(
  Age ~ as.factor(Group),
  data = mydata
)

## 
##  Kruskal-Wallis rank sum test
## 
## data:  Age by as.factor(Group)
## Kruskal-Wallis chi-squared = 15.871, df = 4, p-value = 0.003197

Significant.

# Chi-squared test of association between location and cluster membership.
# The clustering above was run with centers = 5, so there are 5 groups.
chi_square <- chisq.test(mydata$LocationFactor, as.factor(mydata$Group))

## Warning in chisq.test(mydata$LocationFactor, as.factor(mydata$Group)):
## Chi-squared approximation may be incorrect

# Print the test result
chi_square

## 
##  Pearson's Chi-squared test
## 
## data:  mydata$LocationFactor and as.factor(mydata$Group)
## X-squared = 7.6158, df = 8, p-value = 0.4719

Not significant.

# Cross-tabulate location (rows) against cluster membership (columns)
table_data <- table(mydata[["LocationFactor"]], mydata[["Group"]])

# Show the counts to check the structure of the table
print(table_data)

##           
##             1  2  3  4  5
##   Urban    16  7 23 17 13
##   Suburban  6  5 13  7 11
##   Rural     3  4  7 12  6

# Perform Fisher's Exact Test with a simulated (Monte Carlo) p-value.
# The seed is fixed so the simulated p-value is the same on every run.
set.seed(123)
fisher_test <- fisher.test(table_data, simulate.p.value = TRUE, B = 10000)  # B controls number of simulations

# Print the results
print(fisher_test)

## 
##  Fisher's Exact Test for Count Data with simulated p-value (based on
##  10000 replicates)
## 
## data:  table_data
## p-value = 0.5074
## alternative hypothesis: two.sided

# Perform Fisher's Exact Test with simulated p-value, building the
# location-by-group table inline.
# The seed is fixed so the simulated p-value is the same on every run.
set.seed(123)
fisher_result <- fisher.test(table(mydata$LocationFactor, mydata$Group), simulate.p.value = TRUE, B = 10000)

# Display the result
fisher_result

## 
##  Fisher's Exact Test for Count Data with simulated p-value (based on
##  10000 replicates)
## 
## data:  table(mydata$LocationFactor, mydata$Group)
## p-value = 0.5019
## alternative hypothesis: two.sided

# Chi-squared test of association between gender and cluster membership.
# The clustering above was run with centers = 5, so there are 5 groups.
chi_square <- chisq.test(mydata$GenderFactor, as.factor(mydata$Group))
chi_square

## 
##  Pearson's Chi-squared test
## 
## data:  mydata$GenderFactor and as.factor(mydata$Group)
## X-squared = 5.6518, df = 4, p-value = 0.2267

Not significant.

# Chi-squared test of association between education and cluster membership.
# The clustering above was run with centers = 5, so there are 5 groups.
chi_square <- chisq.test(mydata$EducationFactor, as.factor(mydata$Group), correct=TRUE)

## Warning in chisq.test(mydata$EducationFactor, as.factor(mydata$Group), correct
## = TRUE): Chi-squared approximation may be incorrect

# Print the test result
chi_square

## 
##  Pearson's Chi-squared test
## 
## data:  mydata$EducationFactor and as.factor(mydata$Group)
## X-squared = 16.603, df = 20, p-value = 0.6786

Not significant.

# Cross-tabulate education (rows) against cluster membership (columns)
table_data_edu <- table(mydata[["EducationFactor"]], mydata[["Group"]])

# Show the counts to check the structure of the table
print(table_data_edu)

##                         
##                           1  2  3  4  5
##   Unifinished elementary  0  0  0  0  0
##   Finished elementary     1  0  0  1  0
##   Vocational school       2  1  2  2  2
##   General high school     3  6 15 13  5
##   Undergraduate degree   10  4 15 11 14
##   Master's degree         8  4  7  7  9
##   PhD                     1  1  4  2  0

# Perform Fisher's Exact Test with a simulated (Monte Carlo) p-value.
# The seed is fixed so the simulated p-value is the same on every run.
set.seed(123)
fisher_test_edu <- fisher.test(table_data_edu, simulate.p.value = TRUE, B = 10000)  # B controls the number of simulations

# Print the results
print(fisher_test_edu)

## 
##  Fisher's Exact Test for Count Data with simulated p-value (based on
##  10000 replicates)
## 
## data:  table_data_edu
## p-value = 0.5445
## alternative hypothesis: two.sided

# Perform Fisher's Exact Test with simulation, building the
# education-by-group table inline.
# The seed is fixed so the simulated p-value is the same on every run.
set.seed(123)
fisher_result <- fisher.test(table(mydata$EducationFactor, mydata$Group), simulate.p.value = TRUE, B = 10000)

# Display the result
fisher_result

## 
##  Fisher's Exact Test for Count Data with simulated p-value (based on
##  10000 replicates)
## 
## data:  table(mydata$EducationFactor, mydata$Group)
## p-value = 0.5364
## alternative hypothesis: two.sided

# Chi-squared test of association between employment status and cluster membership.
# The clustering above was run with centers = 5, so there are 5 groups.
chi_square <- chisq.test(mydata$EmplStatusFactor, as.factor(mydata$Group), correct=TRUE)

## Warning in chisq.test(mydata$EmplStatusFactor, as.factor(mydata$Group), :
## Chi-squared approximation may be incorrect

# Print the test result
chi_square

## 
##  Pearson's Chi-squared test
## 
## data:  mydata$EmplStatusFactor and as.factor(mydata$Group)
## X-squared = 22.056, df = 8, p-value = 0.004813

# Cross-tabulate employment status (rows) against cluster membership (columns)
table_data_emp <- table(mydata[["EmplStatusFactor"]], mydata[["Group"]])

# Show the counts to check the structure of the table
print(table_data_emp)

##                
##                  1  2  3  4  5
##   Employed      19 13 39 19 26
##   Self-employed  4  1  3 14  3
##   Retired        0  0  0  0  0
##   Unemployed     2  2  1  3  1

# Perform Fisher's Exact Test with a simulated (Monte Carlo) p-value.
# The seed is fixed so the simulated p-value is the same on every run.
set.seed(123)
fisher_test_emp <- fisher.test(table_data_emp, simulate.p.value = TRUE, B = 10000)  # B controls the number of simulations

# Print the results
print(fisher_test_emp)

## 
##  Fisher's Exact Test for Count Data with simulated p-value (based on
##  10000 replicates)
## 
## data:  table_data_emp
## p-value = 0.0038
## alternative hypothesis: two.sided

# Chi-squared test of association between job type and cluster membership.
# The clustering above was run with centers = 5, so there are 5 groups.
chi_square <- chisq.test(mydata$JobFactor, as.factor(mydata$Group), correct=TRUE)

## Warning in chisq.test(mydata$JobFactor, as.factor(mydata$Group), correct =
## TRUE): Chi-squared approximation may be incorrect

# Print the test result
chi_square

## 
##  Pearson's Chi-squared test
## 
## data:  mydata$JobFactor and as.factor(mydata$Group)
## X-squared = 17.777, df = 16, p-value = 0.3371

# Cross-tabulate job type (rows) against cluster membership (columns)
table_data_job <- table(mydata[["JobFactor"]], mydata[["Group"]])

# Show the counts to check the structure of the table
print(table_data_job)

##           
##             1  2  3  4  5
##   Physical  3  1  4  3  3
##   Service   4  4  8  5  2
##   Office    5  6 19  5 13
##   Public    7  0  7  7  8
##   Creative  0  1  1  0  0

# Perform Fisher's Exact Test with a simulated (Monte Carlo) p-value.
# The seed is fixed so the simulated p-value is the same on every run.
set.seed(123)
fisher_test_job <- fisher.test(table_data_job, simulate.p.value = TRUE, B = 10000)  # B controls the number of simulations

# Print the results
print(fisher_test_job)

## 
##  Fisher's Exact Test for Count Data with simulated p-value (based on
##  10000 replicates)
## 
## data:  table_data_job
## p-value = 0.2196
## alternative hypothesis: two.sided

#WITH FIXED DEMOGRAPHIC VARIABLES
# The tests below use the two-category ("Fixed") versions of the demographics.


# Chi-squared test of association between education (two categories) and
# cluster membership. The clustering was run with centers = 5, so there are 5 groups.
chi_square <- chisq.test(mydata$EducFixed, as.factor(mydata$Group), correct=TRUE)
chi_square

## 
##  Pearson's Chi-squared test
## 
## data:  mydata$EducFixed and as.factor(mydata$Group)
## X-squared = 5.4322, df = 4, p-value = 0.2457

# Chi-squared test of association between employment status (two categories)
# and cluster membership. The clustering was run with centers = 5, so there are 5 groups.
chi_square <- chisq.test(mydata$EmplFixed, as.factor(mydata$Group), correct=TRUE)

## Warning in chisq.test(mydata$EmplFixed, as.factor(mydata$Group), correct =
## TRUE): Chi-squared approximation may be incorrect

# Print the test result
chi_square

## 
##  Pearson's Chi-squared test
## 
## data:  mydata$EmplFixed and as.factor(mydata$Group)
## X-squared = 3.131, df = 4, p-value = 0.5361

# Cross-tabulate two-category employment status (rows) against cluster membership (columns)
table_data_empl_fixed <- table(mydata[["EmplFixed"]], mydata[["Group"]])

# Show the counts to check the structure of the table
print(table_data_empl_fixed)

##           
##             1  2  3  4  5
##   Employed 23 14 42 33 29
##   Others    2  2  1  3  1

# Perform Fisher's Exact Test with a simulated (Monte Carlo) p-value.
# The seed is fixed so the simulated p-value is the same on every run.
set.seed(123)
fisher_test_empl_fixed <- fisher.test(table_data_empl_fixed, simulate.p.value = TRUE, B = 10000)  # B controls the number of simulations

# Print the results
print(fisher_test_empl_fixed)

## 
##  Fisher's Exact Test for Count Data with simulated p-value (based on
##  10000 replicates)
## 
## data:  table_data_empl_fixed
## p-value = 0.4529
## alternative hypothesis: two.sided

# Pearson chi-squared test of independence between bank (BankFixed) and
# cluster membership (Group, 5 groups).
chi_square <- chisq.test(mydata$BankFixed, as.factor(mydata$Group), correct=TRUE)
chi_square

## 
##  Pearson's Chi-squared test
## 
## data:  mydata$BankFixed and as.factor(mydata$Group)
## X-squared = 6.4764, df = 4, p-value = 0.1663

# Pearson chi-squared test of independence between type of job (JobFixed) and
# cluster membership (Group, 5 groups).
chi_square <- chisq.test(mydata$JobFixed, as.factor(mydata$Group), correct=TRUE)
chi_square

## 
##  Pearson's Chi-squared test
## 
## data:  mydata$JobFixed and as.factor(mydata$Group)
## X-squared = 12.009, df = 4, p-value = 0.01729

# Pearson chi-squared test of independence between location (LocationFixed)
# and cluster membership (Group, 5 groups).
chi_square <- chisq.test(mydata$LocationFixed, as.factor(mydata$Group), correct=TRUE)
chi_square

## 
##  Pearson's Chi-squared test
## 
## data:  mydata$LocationFixed and as.factor(mydata$Group)
## X-squared = 3.0376, df = 4, p-value = 0.5515

mydata$BetterCash <- as.numeric(mydata$BetterCash)

# Mean BetterCash score per cluster. na.rm = TRUE is forwarded to mean() so a
# missing response no longer turns a whole group's mean into NA.
aggregate(mydata$BetterCash,
          by = list(mydata$Group),
          FUN = mean,
          na.rm = TRUE)

##   Group.1        x
## 1       1 3.320000
## 2       2 3.125000
## 3       3       NA
## 4       4 4.611111
## 5       5 1.833333

# Shapiro-Wilk normality test of BetterCash within each cluster

library(dplyr)
library(rstatix)
mydata %>%
  # Group on the piped data's own Group column rather than reaching back to
  # mydata$Group, so the grouping always stays aligned with the rows tested.
  group_by(Group = as.factor(Group)) %>%
  shapiro_test(BetterCash)

## # A tibble: 5 × 4
##   `as.factor(mydata$Group)` variable   statistic           p
##   <fct>                     <chr>          <dbl>       <dbl>
## 1 1                         BetterCash     0.907 0.0265     
## 2 2                         BetterCash     0.876 0.0337     
## 3 3                         BetterCash     0.894 0.000808   
## 4 4                         BetterCash     0.852 0.000207   
## 5 5                         BetterCash     0.677 0.000000735

# Kruskal-Wallis rank sum test of BetterCash across the clusters
kruskal.test(
  BetterCash ~ as.factor(Group),
  data = mydata
)

## 
##  Kruskal-Wallis rank sum test
## 
## data:  BetterCash by as.factor(Group)
## Kruskal-Wallis chi-squared = 33.092, df = 4, p-value = 1.144e-06

mydata$CashOnme <- as.numeric(mydata$CashOnme)

# Mean CashOnme score per cluster. na.rm = TRUE is forwarded to mean() so a
# missing response no longer turns a whole group's mean into NA.
aggregate(mydata$CashOnme,
          by = list(mydata$Group),
          FUN = mean,
          na.rm = TRUE)

##   Group.1        x
## 1       1 3.960000
## 2       2 3.937500
## 3       3       NA
## 4       4 4.416667
## 5       5 3.733333

# Shapiro-Wilk normality test of CashOnme within each cluster

library(dplyr)
library(rstatix)
mydata %>%
  # Group on the piped data's own Group column rather than reaching back to
  # mydata$Group, so the grouping always stays aligned with the rows tested.
  group_by(Group = as.factor(Group)) %>%
  shapiro_test(CashOnme)

## # A tibble: 5 × 4
##   `as.factor(mydata$Group)` variable statistic        p
##   <fct>                     <chr>        <dbl>    <dbl>
## 1 1                         CashOnme     0.878 0.00633 
## 2 2                         CashOnme     0.820 0.00503 
## 3 3                         CashOnme     0.867 0.000142
## 4 4                         CashOnme     0.886 0.00142 
## 5 5                         CashOnme     0.894 0.00600

# Kruskal-Wallis rank sum test of CashOnme across the clusters
kruskal.test(
  CashOnme ~ as.factor(Group),
  data = mydata
)

## 
##  Kruskal-Wallis rank sum test
## 
## data:  CashOnme by as.factor(Group)
## Kruskal-Wallis chi-squared = 8.0913, df = 4, p-value = 0.08829

mydata$FindSeller <- as.numeric(mydata$FindSeller)

# Mean FindSeller score per cluster. na.rm = TRUE is forwarded to mean() so a
# missing response no longer turns a whole group's mean into NA.
aggregate(mydata$FindSeller,
          by = list(mydata$Group),
          FUN = mean,
          na.rm = TRUE)

##   Group.1        x
## 1       1 5.280000
## 2       2 5.562500
## 3       3       NA
## 4       4 5.055556
## 5       5 6.200000

# Shapiro-Wilk normality test of FindSeller within each cluster

library(dplyr)
library(rstatix)
mydata %>%
  # Group on the piped data's own Group column rather than reaching back to
  # mydata$Group, so the grouping always stays aligned with the rows tested.
  group_by(Group = as.factor(Group)) %>%
  shapiro_test(FindSeller)

## # A tibble: 5 × 4
##   `as.factor(mydata$Group)` variable   statistic          p
##   <fct>                     <chr>          <dbl>      <dbl>
## 1 1                         FindSeller     0.888 0.0102    
## 2 2                         FindSeller     0.859 0.0188    
## 3 3                         FindSeller     0.855 0.0000689 
## 4 4                         FindSeller     0.889 0.00172   
## 5 5                         FindSeller     0.694 0.00000128

# Kruskal-Wallis rank sum test of FindSeller across the clusters
kruskal.test(
  FindSeller ~ as.factor(Group),
  data = mydata
)

## 
##  Kruskal-Wallis rank sum test
## 
## data:  FindSeller by as.factor(Group)
## Kruskal-Wallis chi-squared = 13.872, df = 4, p-value = 0.007715

mydata$SmallCash <- as.numeric(mydata$SmallCash)

# Mean SmallCash score per cluster. na.rm = TRUE is forwarded to mean() so a
# missing response no longer turns a whole group's mean into NA.
aggregate(mydata$SmallCash,
          by = list(mydata$Group),
          FUN = mean,
          na.rm = TRUE)

##   Group.1        x
## 1       1 3.160000
## 2       2 3.062500
## 3       3       NA
## 4       4 4.638889
## 5       5 1.833333

# Shapiro-Wilk normality test of SmallCash within each cluster

library(dplyr)
library(rstatix)
mydata %>%
  # Group on the piped data's own Group column rather than reaching back to
  # mydata$Group, so the grouping always stays aligned with the rows tested.
  group_by(Group = as.factor(Group)) %>%
  shapiro_test(SmallCash)

## # A tibble: 5 × 4
##   `as.factor(mydata$Group)` variable  statistic         p
##   <fct>                     <chr>         <dbl>     <dbl>
## 1 1                         SmallCash     0.900 0.0187   
## 2 2                         SmallCash     0.816 0.00450  
## 3 3                         SmallCash     0.883 0.000404 
## 4 4                         SmallCash     0.834 0.0000828
## 5 5                         SmallCash     0.801 0.0000684

# Kruskal-Wallis rank sum test of SmallCash across the clusters
kruskal.test(
  SmallCash ~ as.factor(Group),
  data = mydata
)

## 
##  Kruskal-Wallis rank sum test
## 
## data:  SmallCash by as.factor(Group)
## Kruskal-Wallis chi-squared = 27.816, df = 4, p-value = 1.359e-05

mydata$DigitalEasy <- as.numeric(mydata$DigitalEasy)

# Mean DigitalEasy score per cluster. na.rm = TRUE is forwarded to mean() so a
# missing response no longer turns a whole group's mean into NA.
aggregate(mydata$DigitalEasy,
          by = list(mydata$Group),
          FUN = mean,
          na.rm = TRUE)

##   Group.1        x
## 1       1 5.360000
## 2       2 5.812500
## 3       3       NA
## 4       4 4.388889
## 5       5 6.100000

# Shapiro-Wilk normality test of DigitalEasy within each cluster

library(dplyr)
library(rstatix)
mydata %>%
  # Group on the piped data's own Group column rather than reaching back to
  # mydata$Group, so the grouping always stays aligned with the rows tested.
  group_by(Group = as.factor(Group)) %>%
  shapiro_test(DigitalEasy)

## # A tibble: 5 × 4
##   `as.factor(mydata$Group)` variable    statistic           p
##   <fct>                     <chr>           <dbl>       <dbl>
## 1 1                         DigitalEasy     0.877 0.00611    
## 2 2                         DigitalEasy     0.859 0.0188     
## 3 3                         DigitalEasy     0.834 0.0000211  
## 4 4                         DigitalEasy     0.898 0.00293    
## 5 5                         DigitalEasy     0.658 0.000000407

# Kruskal-Wallis rank sum test of DigitalEasy across the clusters
kruskal.test(
  DigitalEasy ~ as.factor(Group),
  data = mydata
)

## 
##  Kruskal-Wallis rank sum test
## 
## data:  DigitalEasy by as.factor(Group)
## Kruskal-Wallis chi-squared = 19.441, df = 4, p-value = 0.0006437

mydata$ConvenientFlik <- as.numeric(mydata$ConvenientFlik)

# Mean ConvenientFlik score per cluster. na.rm = TRUE is forwarded to mean()
# so a missing response no longer turns a whole group's mean into NA.
aggregate(mydata$ConvenientFlik,
          by = list(mydata$Group),
          FUN = mean,
          na.rm = TRUE)

##   Group.1        x
## 1       1 5.680000
## 2       2 5.687500
## 3       3       NA
## 4       4 3.972222
## 5       5 6.266667

# Shapiro-Wilk normality test of ConvenientFlik within each cluster

library(dplyr)
library(rstatix)
mydata %>%
  # Group on the piped data's own Group column rather than reaching back to
  # mydata$Group, so the grouping always stays aligned with the rows tested.
  group_by(Group = as.factor(Group)) %>%
  shapiro_test(ConvenientFlik)

## # A tibble: 5 × 4
##   `as.factor(mydata$Group)` variable       statistic            p
##   <fct>                     <chr>              <dbl>        <dbl>
## 1 1                         ConvenientFlik     0.762 0.0000563   
## 2 2                         ConvenientFlik     0.781 0.00154     
## 3 3                         ConvenientFlik     0.775 0.00000109  
## 4 4                         ConvenientFlik     0.865 0.000422    
## 5 5                         ConvenientFlik     0.565 0.0000000277

# Kruskal-Wallis rank sum test of ConvenientFlik across the clusters
kruskal.test(
  ConvenientFlik ~ as.factor(Group),
  data = mydata
)

## 
##  Kruskal-Wallis rank sum test
## 
## data:  ConvenientFlik by as.factor(Group)
## Kruskal-Wallis chi-squared = 23.528, df = 4, p-value = 9.93e-05

mydata$FriendsFlik <- as.numeric(mydata$FriendsFlik)

# Mean FriendsFlik score per cluster. na.rm = TRUE is forwarded to mean() so a
# missing response no longer turns a whole group's mean into NA.
aggregate(mydata$FriendsFlik,
          by = list(mydata$Group),
          FUN = mean,
          na.rm = TRUE)

##   Group.1        x
## 1       1 5.080000
## 2       2 5.375000
## 3       3       NA
## 4       4 4.361111
## 5       5 5.466667

# Shapiro-Wilk normality test of FriendsFlik within each cluster

library(dplyr)
library(rstatix)
mydata %>%
  # Group on the piped data's own Group column rather than reaching back to
  # mydata$Group, so the grouping always stays aligned with the rows tested.
  group_by(Group = as.factor(Group)) %>%
  shapiro_test(FriendsFlik)

## # A tibble: 5 × 4
##   `as.factor(mydata$Group)` variable    statistic         p
##   <fct>                     <chr>           <dbl>     <dbl>
## 1 1                         FriendsFlik     0.864 0.00326  
## 2 2                         FriendsFlik     0.843 0.0109   
## 3 3                         FriendsFlik     0.849 0.0000484
## 4 4                         FriendsFlik     0.909 0.00614  
## 5 5                         FriendsFlik     0.798 0.0000591

# Kruskal-Wallis rank sum test of FriendsFlik across the clusters
kruskal.test(
  FriendsFlik ~ as.factor(Group),
  data = mydata
)

## 
##  Kruskal-Wallis rank sum test
## 
## data:  FriendsFlik by as.factor(Group)
## Kruskal-Wallis chi-squared = 8.1413, df = 4, p-value = 0.08654

# Mean Hypothetical score per cluster. na.rm = TRUE is forwarded to mean() so
# a missing response no longer turns a whole group's mean into NA.
aggregate(mydata$Hypothetical,
          by = list(mydata$Group),
          FUN = mean,
          na.rm = TRUE)

##   Group.1        x
## 1       1 4.320000
## 2       2 3.875000
## 3       3       NA
## 4       4 3.694444
## 5       5 4.033333

# Shapiro-Wilk normality test of Hypothetical within each cluster

library(dplyr)
library(rstatix)
mydata %>%
  # Group on the piped data's own Group column rather than reaching back to
  # mydata$Group, so the grouping always stays aligned with the rows tested.
  group_by(Group = as.factor(Group)) %>%
  shapiro_test(Hypothetical)

## # A tibble: 5 × 4
##   `as.factor(mydata$Group)` variable     statistic       p
##   <fct>                     <chr>            <dbl>   <dbl>
## 1 1                         Hypothetical     0.915 0.0393 
## 2 2                         Hypothetical     0.946 0.427  
## 3 3                         Hypothetical     0.918 0.00461
## 4 4                         Hypothetical     0.902 0.00389
## 5 5                         Hypothetical     0.865 0.00131

# Kruskal-Wallis rank sum test of Hypothetical across the clusters
kruskal.test(
  Hypothetical ~ as.factor(Group),
  data = mydata
)

## 
##  Kruskal-Wallis rank sum test
## 
## data:  Hypothetical by as.factor(Group)
## Kruskal-Wallis chi-squared = 2.1751, df = 4, p-value = 0.7036

From here on, I have run the chi-square test for the variables that proved to be statistically significant, meaning that they can describe the clusters. The graphs show the characteristics of each group. I apologize that the age graph is ugly and messy; I should have recoded age into a dummy variable.

Testing Age (the p-value is too high, so I don't know what to do with it, because in the criterion validity check it was low enough)

# Pearson chi-squared test of independence between Age and cluster membership.
# NOTE(review): Age enters as a categorical variable with one level per
# distinct age (df = 156 in the output below), so the table is very sparse;
# the warning echoed below flags that the approximation may be unreliable.
chi_square <- chisq.test(mydata$Age, as.factor(mydata$Group))

## Warning in chisq.test(mydata$Age, as.factor(mydata$Group)): Chi-squared
## approximation may be incorrect

chi_square

## 
##  Pearson's Chi-squared test
## 
## data:  mydata$Age and as.factor(mydata$Group)
## X-squared = 169.73, df = 156, p-value = 0.2139

# Pearson residuals, (observed - expected) / sqrt(expected), rounded to two
# decimals; large absolute values mark the cells furthest from independence.
round(chi_square$residuals,2)

##           as.factor(mydata$Group)
## mydata$Age     1     2     3     4     5
##         20 -0.41 -0.33  1.33 -0.49 -0.45
##         22 -0.41  2.74 -0.54 -0.49 -0.45
##         23 -1.08 -0.86  1.41  0.25 -0.34
##         24  0.71  0.64 -0.24 -1.11  0.39
##         25 -0.41  0.04  0.88 -1.47  0.89
##         26 -0.62 -1.08  0.48  0.22  0.54
##         27  1.63  0.88 -0.14 -0.98 -0.89
##         28  1.00 -0.80 -1.31 -0.37  1.64
##         29  1.63 -0.65 -1.07  0.04  0.22
##         30  1.28 -0.73  1.31 -1.10 -1.00
##         31  1.15 -0.46  0.56 -0.69 -0.63
##         32 -0.58 -0.46  0.56  0.75 -0.63
##         33 -0.41 -0.33  1.33 -0.49 -0.45
##         34 -0.41 -0.33 -0.54  1.55 -0.45
##         35 -0.71 -0.57  1.23 -0.85  0.52
##         36 -0.71 -0.57  1.23 -0.85  0.52
##         38 -0.91  0.64 -0.36 -0.18  1.00
##         39  0.00  0.45  0.98 -1.20 -0.18
##         40 -0.15  1.45  0.70 -1.30 -0.34
##         41 -0.41 -0.33 -0.54  1.55 -0.45
##         42 -0.58 -0.46 -0.76  2.19 -0.63
##         43 -0.71 -0.57  0.15  0.33  0.52
##         45 -0.82  0.88 -0.14 -0.98  1.34
##         46 -0.58 -0.46 -0.76  0.75  0.95
##         47  1.63 -0.65 -1.07  0.04  0.22
##         48  0.41  0.88 -0.14  0.04 -0.89
##         49 -0.82  0.88 -0.14  0.04  0.22
##         50  2.89 -0.46 -0.76 -0.69 -0.63
##         52 -0.71 -0.57  0.15 -0.85  1.81
##         53 -0.58 -0.46  0.56  0.75 -0.63
##         54 -0.58  1.70 -0.76 -0.69  0.95
##         55 -0.58  1.70 -0.76  0.75 -0.63
##         56 -0.91  0.64 -0.36  1.64 -1.00
##         57  0.71 -0.57 -0.93  1.51 -0.77
##         60  1.15 -0.46 -0.76  0.75 -0.63
##         61 -0.71  1.20 -0.93  1.51 -0.77
##         62  2.12 -0.57 -0.93  0.33 -0.77
##         63 -0.82 -0.65 -1.07  3.10 -0.89
##         64 -0.58 -0.46  1.88 -0.69 -0.63
##         65 -0.71 -0.57 -0.93  1.51  0.52

# Fisher's exact test of Age by Group with a Monte Carlo p-value (B simulated
# tables). The RNG seed is fixed first so the simulated p-value is identical
# on every run; without it each knit reports a slightly different value.
set.seed(123)
fisher_result <- fisher.test(table(mydata$Age, mydata$Group), simulate.p.value = TRUE, B = 10000)

# Display the result
fisher_result

## 
##  Fisher's Exact Test for Count Data with simulated p-value (based on
##  10000 replicates)
## 
## data:  table(mydata$Age, mydata$Group)
## p-value = 0.1304
## alternative hypothesis: two.sided

The p-value is lower but still too high. Can age be used to describe the clusters?

# Share of each age within every cluster, drawn as dodged bars
library(tidyr)
library(dplyr)
library(ggplot2)

# Counts of Group (rows) by Age (columns), then row-wise proportions so each
# cluster sums to 1.
table_clusters <- table(mydata$Group, mydata$Age)
prop_table_clusters <- prop.table(table_clusters, margin = 1)

# Long format for ggplot: Var1 = Group, Var2 = Age, Freq = proportion.
prop_df <- prop_table_clusters %>%
  as.table() %>%
  as.data.frame()

# Freq is scaled by 100 so the y axis reads as a percentage.
ggplot(prop_df, aes(x = Var1, y = Freq * 100, fill = Var2)) +
  geom_bar(stat = "identity", position = "dodge") +
  labs(
    title = "Percentage Distribution of age by group",
    x = "Group",
    y = "Percentage (%)",
    fill = "Category"
  ) +
  theme_minimal()

Type of Job (office or other; I can't do the graph here, I don't know why)

# Pearson chi-squared test of independence between type of job (JobFixed) and
# cluster membership (Group).
chi_square <- chisq.test(mydata$JobFixed, as.factor(mydata$Group))
chi_square

## 
##  Pearson's Chi-squared test
## 
## data:  mydata$JobFixed and as.factor(mydata$Group)
## X-squared = 12.009, df = 4, p-value = 0.01729

# Pearson residuals, (observed - expected) / sqrt(expected), rounded to two
# decimals; large absolute values mark the cells furthest from independence.
round(chi_square$residuals,2)

##                as.factor(mydata$Group)
## mydata$JobFixed     1     2     3     4     5
##          Office  0.73 -0.27 -0.97  1.32 -0.75
##          Others -1.06  0.39  1.41 -1.92  1.10

# Fisher's exact test of JobFixed by Group with a Monte Carlo p-value (B
# simulated tables). The RNG seed is fixed first so the simulated p-value is
# identical on every run; without it each knit reports a different value.
set.seed(123)
fisher_result <- fisher.test(table(mydata$JobFixed, mydata$Group), simulate.p.value = TRUE, B = 10000)

# Display the result
fisher_result

## 
##  Fisher's Exact Test for Count Data with simulated p-value (based on
##  10000 replicates)
## 
## data:  table(mydata$JobFixed, mydata$Group)
## p-value = 0.0133
## alternative hypothesis: two.sided

This is okay.

# Share of each job type within every cluster, drawn as dodged bars
library(tidyr)
library(dplyr)
library(ggplot2)

# Counts of Group (rows) by JobFixed (columns), then row-wise proportions so
# each cluster sums to 1.
table_clusters <- table(mydata$Group, mydata$JobFixed)
prop_table_clusters <- prop.table(table_clusters, margin = 1)

# Long format for ggplot: Var1 = Group, Var2 = JobFixed, Freq = proportion.
prop_df <- prop_table_clusters %>%
  as.table() %>%
  as.data.frame()

# Freq is scaled by 100 so the y axis reads as a percentage.
ggplot(prop_df, aes(x = Var1, y = Freq * 100, fill = Var2)) +
  geom_bar(stat = "identity", position = "dodge") +
  labs(
    title = "Percentage Distribution of type of job per group",
    x = "Group",
    y = "Percentage (%)",
    fill = "Category"
  ) +
  theme_minimal()

Raje imam pri sebi gotovino, ker ni vedno mogoce placati z drugimi placilnimi sredstvi.

# Pearson chi-squared test of independence between the BetterCash response
# level and cluster membership (Group). The warning echoed below flags small
# expected cell counts.
chi_square <- chisq.test(mydata$BetterCash, as.factor(mydata$Group))

## Warning in chisq.test(mydata$BetterCash, as.factor(mydata$Group)): Chi-squared
## approximation may be incorrect

chi_square

## 
##  Pearson's Chi-squared test
## 
## data:  mydata$BetterCash and as.factor(mydata$Group)
## X-squared = 67.076, df = 24, p-value = 6.007e-06

# Pearson residuals, (observed - expected) / sqrt(expected), rounded to two
# decimals; large absolute values mark the cells furthest from independence.
round(chi_square$residuals,2)

##                  as.factor(mydata$Group)
## mydata$BetterCash     1     2     3     4     5
##                 1 -1.23 -1.15 -0.36 -1.05  3.55
##                 2 -0.26  0.84 -0.43 -1.16  1.41
##                 3  3.10  1.28 -1.14 -1.28 -1.00
##                 4 -0.29  1.24  1.13 -0.66 -1.26
##                 5  0.80 -0.83  0.81  0.43 -1.56
##                 6 -1.34 -0.16  0.81  1.76 -1.56
##                 7 -0.71 -1.13 -0.24  3.02 -1.55

# Fisher's exact test of BetterCash by Group with a Monte Carlo p-value (B
# simulated tables). The RNG seed is fixed first so the simulated p-value is
# identical on every run; without it each knit reports a different value.
set.seed(123)
fisher_result <- fisher.test(table(mydata$BetterCash, mydata$Group), simulate.p.value = TRUE, B = 10000)

# Display the result
fisher_result

## 
##  Fisher's Exact Test for Count Data with simulated p-value (based on
##  10000 replicates)
## 
## data:  table(mydata$BetterCash, mydata$Group)
## p-value = 9.999e-05
## alternative hypothesis: two.sided

This is okay now I think.

# Share of each BetterCash response level within every cluster, as dodged bars
library(tidyr)
library(dplyr)
library(ggplot2)

# Counts of Group (rows) by BetterCash (columns), then row-wise proportions so
# each cluster sums to 1.
table_clusters <- table(mydata$Group, mydata$BetterCash)
prop_table_clusters <- prop.table(table_clusters, margin = 1)

# Long format for ggplot: Var1 = Group, Var2 = BetterCash, Freq = proportion.
prop_df <- prop_table_clusters %>%
  as.table() %>%
  as.data.frame()

# Freq is scaled by 100 so the y axis reads as a percentage.
ggplot(prop_df, aes(x = Var1, y = Freq * 100, fill = Var2)) +
  geom_bar(stat = "identity", position = "dodge") +
  labs(
    title = "Percentage Distribution of those who keep cash",
    x = "Group",
    y = "Percentage (%)",
    fill = "Category"
  ) +
  theme_minimal()

Raje placujem z gotovino kot z digitalnimi placilnimi sredstvi.

# Pearson chi-squared test of independence between the CashOnme response
# level and cluster membership (Group). The warning echoed below flags small
# expected cell counts.
chi_square <- chisq.test(mydata$CashOnme, as.factor(mydata$Group))

## Warning in chisq.test(mydata$CashOnme, as.factor(mydata$Group)): Chi-squared
## approximation may be incorrect

chi_square

## 
##  Pearson's Chi-squared test
## 
## data:  mydata$CashOnme and as.factor(mydata$Group)
## X-squared = 39.619, df = 24, p-value = 0.02349

# Pearson residuals, (observed - expected) / sqrt(expected), rounded to two
# decimals; large absolute values mark the cells furthest from independence.
round(chi_square$residuals,2)

##                as.factor(mydata$Group)
## mydata$CashOnme     1     2     3     4     5
##               1 -0.95 -1.26 -0.14  0.74  1.15
##               2  1.03  0.68 -1.05 -0.73  0.62
##               3  0.50  2.77 -1.86  0.10 -0.37
##               4  0.00 -0.80  0.98 -0.37 -0.18
##               5  0.62 -1.70  0.39 -0.48  0.74
##               6  0.17  1.08  1.07 -0.99 -1.14
##               7 -1.83 -0.78  0.95  1.92 -1.00

# Fisher's exact test of CashOnme by Group with a Monte Carlo p-value (B
# simulated tables). The RNG seed is fixed first so the simulated p-value is
# identical on every run; without it each knit reports a different value.
set.seed(123)
fisher_result <- fisher.test(table(mydata$CashOnme, mydata$Group), simulate.p.value = TRUE, B = 10000)

# Display the result
fisher_result

## 
##  Fisher's Exact Test for Count Data with simulated p-value (based on
##  10000 replicates)
## 
## data:  table(mydata$CashOnme, mydata$Group)
## p-value = 0.0116
## alternative hypothesis: two.sided

This is okay now I think.

# Share of each CashOnme response level within every cluster, as dodged bars
library(tidyr)
library(dplyr)
library(ggplot2)

# Counts of Group (rows) by CashOnme (columns), then row-wise proportions so
# each cluster sums to 1.
table_clusters <- table(mydata$Group, mydata$CashOnme)
prop_table_clusters <- prop.table(table_clusters, margin = 1)

# Long format for ggplot: Var1 = Group, Var2 = CashOnme, Freq = proportion.
prop_df <- prop_table_clusters %>%
  as.table() %>%
  as.data.frame()

# Freq is scaled by 100 so the y axis reads as a percentage.
ggplot(prop_df, aes(x = Var1, y = Freq * 100, fill = Var2)) +
  geom_bar(stat = "identity", position = "dodge") +
  labs(
    title = "Percentage Distribution of those who prefer cash",
    x = "Group",
    y = "Percentage (%)",
    fill = "Category"
  ) +
  theme_minimal()

Zlahka najdem ponudnike, ki sprejemajo digitalna placila za vsakodnevne nakupe.

# Pearson chi-squared test of independence between the FindSeller response
# level and cluster membership (Group). The warning echoed below flags small
# expected cell counts.
chi_square <- chisq.test(mydata$FindSeller, as.factor(mydata$Group))

## Warning in chisq.test(mydata$FindSeller, as.factor(mydata$Group)): Chi-squared
## approximation may be incorrect

chi_square

## 
##  Pearson's Chi-squared test
## 
## data:  mydata$FindSeller and as.factor(mydata$Group)
## X-squared = 34.503, df = 24, p-value = 0.07612

# Pearson residuals, (observed - expected) / sqrt(expected), rounded to two
# decimals; large absolute values mark the cells furthest from independence.
round(chi_square$residuals,2)

##                  as.factor(mydata$Group)
## mydata$FindSeller     1     2     3     4     5
##                 1 -0.58 -0.46 -0.76  2.19 -0.63
##                 2 -0.58 -0.46 -0.76  0.75  0.95
##                 3  1.60  0.76 -1.78  1.45 -1.48
##                 4  0.41 -0.98 -0.36  1.25 -0.60
##                 5  0.89  1.01  0.48 -0.45 -1.63
##                 6 -0.57 -0.23  1.06 -0.99  0.50
##                 7 -0.86 -0.32 -0.17 -0.48  1.75

# Fisher's exact test of FindSeller by Group with a Monte Carlo p-value (B
# simulated tables). The RNG seed is fixed first so the simulated p-value is
# identical on every run; without it each knit reports a different value.
set.seed(123)
fisher_result <- fisher.test(table(mydata$FindSeller, mydata$Group), simulate.p.value = TRUE, B = 10000)

# Display the result
fisher_result

## 
##  Fisher's Exact Test for Count Data with simulated p-value (based on
##  10000 replicates)
## 
## data:  table(mydata$FindSeller, mydata$Group)
## p-value = 0.0324
## alternative hypothesis: two.sided

This is okay now I think.

# Share of each FindSeller response level within every cluster, as dodged bars
library(tidyr)
library(dplyr)
library(ggplot2)

# Counts of Group (rows) by FindSeller (columns), then row-wise proportions so
# each cluster sums to 1.
table_clusters <- table(mydata$Group, mydata$FindSeller)
prop_table_clusters <- prop.table(table_clusters, margin = 1)

# Long format for ggplot: Var1 = Group, Var2 = FindSeller, Freq = proportion.
prop_df <- prop_table_clusters %>%
  as.table() %>%
  as.data.frame()

# Freq is scaled by 100 so the y axis reads as a percentage.
ggplot(prop_df, aes(x = Var1, y = Freq * 100, fill = Var2)) +
  geom_bar(stat = "identity", position = "dodge") +
  labs(
    title = "Percentage Distribution of those who easily find establishments where they can pay digitally",
    x = "Group",
    y = "Percentage (%)",
    fill = "Category"
  ) +
  theme_minimal()

# Pearson chi-squared test of independence between the SmallCash response
# level and cluster membership (Group). The warning echoed below flags small
# expected cell counts.
chi_square <- chisq.test(mydata$SmallCash, as.factor(mydata$Group))

## Warning in chisq.test(mydata$SmallCash, as.factor(mydata$Group)): Chi-squared
## approximation may be incorrect

chi_square

## 
##  Pearson's Chi-squared test
## 
## data:  mydata$SmallCash and as.factor(mydata$Group)
## X-squared = 57.805, df = 24, p-value = 0.0001295

# Pearson residuals, (observed - expected) / sqrt(expected), rounded to two
# decimals; large absolute values mark the cells furthest from independence.
round(chi_square$residuals,2)

##                 as.factor(mydata$Group)
## mydata$SmallCash     1     2     3     4     5
##                1 -0.93 -1.02  0.64 -1.36  2.32
##                2  0.56  1.77 -0.56 -1.46  0.46
##                3  0.62  0.01 -0.36 -0.66  0.59
##                4 -0.29 -0.92  1.79 -0.66 -0.47
##                5  1.25 -0.33  0.66 -0.07 -1.61
##                6 -0.22  0.41 -0.01  1.44 -1.67
##                7 -0.95 -0.47 -1.59  4.43 -1.73

Pri placilih manjse vrednosti (do 10) se mi zdi gotovina hitrejsi nacin placila od digitalnih placil.

# Fisher's exact test of SmallCash by Group with a Monte Carlo p-value (B
# simulated tables). The RNG seed is fixed first so the simulated p-value is
# identical on every run; without it each knit reports a different value.
set.seed(123)
fisher_result <- fisher.test(table(mydata$SmallCash, mydata$Group), simulate.p.value = TRUE, B = 10000)

# Display the result
fisher_result

## 
##  Fisher's Exact Test for Count Data with simulated p-value (based on
##  10000 replicates)
## 
## data:  table(mydata$SmallCash, mydata$Group)
## p-value = 2e-04
## alternative hypothesis: two.sided

This is okay now I think.

# Share of each SmallCash response level within every cluster, as dodged bars
library(tidyr)
library(dplyr)
library(ggplot2)

# Counts of Group (rows) by SmallCash (columns), then row-wise proportions so
# each cluster sums to 1.
table_clusters <- table(mydata$Group, mydata$SmallCash)
prop_table_clusters <- prop.table(table_clusters, margin = 1)

# Long format for ggplot: Var1 = Group, Var2 = SmallCash, Freq = proportion.
prop_df <- prop_table_clusters %>%
  as.table() %>%
  as.data.frame()

# Freq is scaled by 100 so the y axis reads as a percentage.
ggplot(prop_df, aes(x = Var1, y = Freq * 100, fill = Var2)) +
  geom_bar(stat = "identity", position = "dodge") +
  labs(
    title = "Percentage Distribution of type of those who pay with cash up to 10 EUR",
    x = "Group",
    y = "Percentage (%)",
    fill = "Category"
  ) +
  theme_minimal()

Uporaba digitalnih placil za transakcije z majhno vrednostjo (do 10) je prirocna in enostavna, z minimalnimi dodatnimi koraki (npr. vnos PIN-a, skeniranje QR kode)

# Pearson chi-squared test of independence between the DigitalEasy response
# level and cluster membership (Group). The warning echoed below flags small
# expected cell counts.
chi_square <- chisq.test(mydata$DigitalEasy, as.factor(mydata$Group))

## Warning in chisq.test(mydata$DigitalEasy, as.factor(mydata$Group)): Chi-squared
## approximation may be incorrect

chi_square

## 
##  Pearson's Chi-squared test
## 
## data:  mydata$DigitalEasy and as.factor(mydata$Group)
## X-squared = 40.535, df = 24, p-value = 0.01872

# Pearson residuals, (observed - expected) / sqrt(expected), rounded to two
# decimals; large absolute values mark the cells furthest from independence.
round(chi_square$residuals,2)

##                   as.factor(mydata$Group)
## mydata$DigitalEasy     1     2     3     4     5
##                  1 -0.91 -0.73 -1.20  2.56  0.00
##                  2 -0.91 -0.73 -0.36  2.56 -1.00
##                  3  0.95 -0.47 -1.11  1.26 -0.58
##                  4  1.00  0.45  0.21 -0.37 -1.10
##                  5  0.37 -0.17  0.37  0.94 -1.69
##                  6  0.34  1.03  0.43 -1.97  0.59
##                  7 -0.86 -0.19  0.36 -1.21  1.82

# Fisher's exact test of DigitalEasy by Group with a Monte Carlo p-value (B
# simulated tables). The RNG seed is fixed first so the simulated p-value is
# identical on every run; without it each knit reports a different value.
set.seed(123)
fisher_result <- fisher.test(table(mydata$DigitalEasy, mydata$Group), simulate.p.value = TRUE, B = 10000)

# Display the result
fisher_result

## 
##  Fisher's Exact Test for Count Data with simulated p-value (based on
##  10000 replicates)
## 
## data:  table(mydata$DigitalEasy, mydata$Group)
## p-value = 0.0177
## alternative hypothesis: two.sided

This is okay.

# Share of each DigitalEasy response level within every cluster, as dodged bars
library(tidyr)
library(dplyr)
library(ggplot2)

# Counts of Group (rows) by DigitalEasy (columns), then row-wise proportions
# so each cluster sums to 1.
table_clusters <- table(mydata$Group, mydata$DigitalEasy)
prop_table_clusters <- prop.table(table_clusters, margin = 1)

# Long format for ggplot: Var1 = Group, Var2 = DigitalEasy, Freq = proportion.
prop_df <- prop_table_clusters %>%
  as.table() %>%
  as.data.frame()

# Freq is scaled by 100 so the y axis reads as a percentage.
ggplot(prop_df, aes(x = Var1, y = Freq * 100, fill = Var2)) +
  geom_bar(stat = "identity", position = "dodge") +
  labs(
    title = "Percentage Distribution of those who prefer digital over cash up to 10 EUR",
    x = "Group",
    y = "Percentage (%)",
    fill = "Category"
  ) +
  theme_minimal()

Rad/a uporabljam mobilne placilne platforme (kot so Apple Pay, Revolut, PayPal, Flik), ker so prirocne.

# Pearson chi-squared test of independence between the ConvenientFlik response
# level and cluster membership (Group). The warning echoed below flags small
# expected cell counts.
chi_square <- chisq.test(mydata$ConvenientFlik, as.factor(mydata$Group))

## Warning in chisq.test(mydata$ConvenientFlik, as.factor(mydata$Group)):
## Chi-squared approximation may be incorrect

chi_square

## 
##  Pearson's Chi-squared test
## 
## data:  mydata$ConvenientFlik and as.factor(mydata$Group)
## X-squared = 40.301, df = 24, p-value = 0.01985

# Pearson residuals, (observed - expected) / sqrt(expected), rounded to two
# decimals; large absolute values mark the cells furthest from independence.
round(chi_square$residuals,2)

##                      as.factor(mydata$Group)
## mydata$ConvenientFlik     1     2     3     4     5
##                     1 -1.41 -1.13  0.30  2.43 -0.90
##                     2  0.44 -0.40 -1.00  1.99 -1.08
##                     3 -1.00  1.70 -0.55  1.30 -1.10
##                     4  0.41 -0.65 -0.14  0.04  0.22
##                     5  0.90 -0.41  0.68  0.00 -1.34
##                     6 -0.64  0.74  0.93 -0.50 -0.53
##                     7  0.46  0.11 -0.48 -2.09  2.37

# Fisher's exact test of ConvenientFlik by Group with a Monte Carlo p-value (B
# simulated tables). The RNG seed is fixed first so the simulated p-value is
# identical on every run; without it each knit reports a different value.
set.seed(123)
fisher_result <- fisher.test(table(mydata$ConvenientFlik, mydata$Group), simulate.p.value = TRUE, B = 10000)

# Display the result
fisher_result

## 
##  Fisher's Exact Test for Count Data with simulated p-value (based on
##  10000 replicates)
## 
## data:  table(mydata$ConvenientFlik, mydata$Group)
## p-value = 0.013
## alternative hypothesis: two.sided

This is okay now.

# Share of each ConvenientFlik response level within every cluster, as dodged bars
library(tidyr)
library(dplyr)
library(ggplot2)

# Counts of Group (rows) by ConvenientFlik (columns), then row-wise
# proportions so each cluster sums to 1.
table_clusters <- table(mydata$Group, mydata$ConvenientFlik)
prop_table_clusters <- prop.table(table_clusters, margin = 1)

# Long format for ggplot: Var1 = Group, Var2 = ConvenientFlik, Freq = proportion.
prop_df <- prop_table_clusters %>%
  as.table() %>%
  as.data.frame()

# Freq is scaled by 100 so the y axis reads as a percentage.
ggplot(prop_df, aes(x = Var1, y = Freq * 100, fill = Var2)) +
  geom_bar(stat = "identity", position = "dodge") +
  labs(
    title = "Percentage Distribution of those who use mobile payment platforms due to convenience",
    x = "Group",
    y = "Percentage (%)",
    fill = "Category"
  ) +
  theme_minimal()

Verjetno bom uporabljal/a mobilne placilne platforme (kot so Apple Pay, Revolut, PayPal, Flik), ce mi jih priporocijo prijatelji, druzina ali vrstniki.

# Pearson chi-squared test of independence between the FriendsFlik response
# level and cluster membership (Group). The warning echoed below flags small
# expected cell counts.
chi_square <- chisq.test(mydata$FriendsFlik, as.factor(mydata$Group))

## Warning in chisq.test(mydata$FriendsFlik, as.factor(mydata$Group)): Chi-squared
## approximation may be incorrect

chi_square

## 
##  Pearson's Chi-squared test
## 
## data:  mydata$FriendsFlik and as.factor(mydata$Group)
## X-squared = 34.693, df = 24, p-value = 0.07308

# Pearson residuals, (observed - expected) / sqrt(expected), rounded to two
# decimals; large absolute values mark the cells furthest from independence.
round(chi_square$residuals,2)

##                   as.factor(mydata$Group)
## mydata$FriendsFlik     1     2     3     4     5
##                  1 -0.52 -1.03  0.67  0.39  0.00
##                  2 -1.08 -0.86  0.00  2.56 -1.18
##                  3  1.34  0.51 -1.23 -0.02 -0.10
##                  4 -1.08  1.45 -0.71  0.25  0.51
##                  5  0.66 -0.52  1.24 -0.04 -1.67
##                  6 -1.18 -0.52  0.45  0.60  0.26
##                  7  0.52  0.84 -0.73 -1.81  1.77

# Fisher's exact test of FriendsFlik by Group with a Monte Carlo p-value (B
# simulated tables). The RNG seed is fixed first so the simulated p-value is
# identical on every run; without it each knit reports a different value.
set.seed(123)
fisher_result <- fisher.test(table(mydata$FriendsFlik, mydata$Group), simulate.p.value = TRUE, B = 10000)

# Display the result
fisher_result

## 
##  Fisher's Exact Test for Count Data with simulated p-value (based on
##  10000 replicates)
## 
## data:  table(mydata$FriendsFlik, mydata$Group)
## p-value = 0.07609
## alternative hypothesis: two.sided

# Share of each FriendsFlik response level within every cluster, as dodged bars
library(tidyr)
library(dplyr)
library(ggplot2)

# Counts of Group (rows) by FriendsFlik (columns), then row-wise proportions
# so each cluster sums to 1.
table_clusters <- table(mydata$Group, mydata$FriendsFlik)
prop_table_clusters <- prop.table(table_clusters, margin = 1)

# Long format for ggplot: Var1 = Group, Var2 = FriendsFlik, Freq = proportion.
prop_df <- prop_table_clusters %>%
  as.table() %>%
  as.data.frame()

# Freq is scaled by 100 so the y axis reads as a percentage.
ggplot(prop_df, aes(x = Var1, y = Freq * 100, fill = Var2)) +
  geom_bar(stat = "identity", position = "dodge") +
  labs(
    title = "Percentage Distribution of use mobile digital payment platforms due to recommendations",
    x = "Group",
    y = "Percentage (%)",
    fill = "Category"
  ) +
  theme_minimal()

Clustering Group 4

2025-02-05

Testing Age (the p-value is too high, so I don't know what to do with it, because in the criterion validity check it was low enough)

Type of Job (office or other; I can't do the graph here, I don't know why)

Raje imam pri sebi gotovino, ker ni vedno mogoce placati z drugimi placilnimi sredstvi.

Raje placujem z gotovino kot z digitalnimi placilnimi sredstvi.

Zlahka najdem ponudnike, ki sprejemajo digitalna placila za vsakodnevne nakupe.

Pri placilih manjse vrednosti (do 10) se mi zdi gotovina hitrejsi nacin placila od digitalnih placil.

Uporaba digitalnih placil za transakcije z majhno vrednostjo (do 10) je prirocna in enostavna, z minimalnimi dodatnimi koraki (npr. vnos PIN-a, skeniranje QR kode)

Rad/a uporabljam mobilne placilne platforme (kot so Apple Pay, Revolut, PayPal, Flik), ker so prirocne.

Verjetno bom uporabljal/a mobilne placilne platforme (kot so Apple Pay, Revolut, PayPal, Flik), ce mi jih priporocijo prijatelji, druzina ali vrstniki.