# Load dataset from Excel file
library(readxl)
## Warning: package 'readxl' was built under R version 4.4.2
# The sheet contains one column without a header, which readxl names `...40`
mydata <- read_xlsx(path = "./DataPerception.xlsx")
## New names:
## • `` -> `...40`
# Preview the first six rows to check that the import worked
head(mydata, n = 6)
## # A tibble: 6 × 41
## ID Q2a_1 Q2b_1 Q2c_1 Q3a_1 Q3b_1 Q3c_1 Q4a_1 Q4b_1 Q4c_1 Q5a_1 Q5b_1 Q5c_1
## <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 1 5 6 5 7 7 5 7 7 5 7 7 6
## 2 2 7 7 6 2 6 6 4 7 7 3 6 6
## 3 3 7 6 6 6 6 6 7 7 5 6 7 5
## 4 4 7 3 5 6 6 3 3 6 5 3 6 6
## 5 5 6 5 5 5 5 6 6 6 7 4 6 7
## 6 6 6 6 6 5 7 6 6 7 5 7 7 5
## # ℹ 28 more variables: Q6a_1 <dbl>, Q6b_1 <dbl>, Q6c_1 <dbl>, Q7a_1 <dbl>,
## # Q7b_1 <dbl>, Q7c_1 <dbl>, Age <dbl>, Gender <dbl>, Location <dbl>,
## # Education <dbl>, Job <dbl>, Bank <dbl>, EmplStatus <dbl>, EducFixed <chr>,
## # BankFixed <dbl>, EmplFixed <dbl>, JobFixed <dbl>, LocationFixed <dbl>,
## # BetterCash <dbl>, CashOnme <dbl>, FindSeller <dbl>, SmallCash <dbl>,
## # DigitalEasy <dbl>, ConvenientFlik <dbl>, FriendsFlik <dbl>,
## # Hypothetical <dbl>, ...40 <lgl>, CashKeep <dbl>
# List every column name so the variables to recode can be identified
names(mydata)
## [1] "ID" "Q2a_1" "Q2b_1" "Q2c_1"
## [5] "Q3a_1" "Q3b_1" "Q3c_1" "Q4a_1"
## [9] "Q4b_1" "Q4c_1" "Q5a_1" "Q5b_1"
## [13] "Q5c_1" "Q6a_1" "Q6b_1" "Q6c_1"
## [17] "Q7a_1" "Q7b_1" "Q7c_1" "Age"
## [21] "Gender" "Location" "Education" "Job"
## [25] "Bank" "EmplStatus" "EducFixed" "BankFixed"
## [29] "EmplFixed" "JobFixed" "LocationFixed" "BetterCash"
## [33] "CashOnme" "FindSeller" "SmallCash" "DigitalEasy"
## [37] "ConvenientFlik" "FriendsFlik" "Hypothetical" "...40"
## [41] "CashKeep"
# Recode the numerically coded survey answers as labelled factors.

# Demographics with their own coding scheme are stored in new *Factor columns.
mydata$GenderFactor <- factor(mydata$Gender, levels = 1:2,
                              labels = c("Male", "Female"))
mydata$LocationFactor <- factor(mydata$Location, levels = 1:3,
                                labels = c("Urban", "Suburban", "Rural"))
# NOTE(review): "Unifinished" looks like a typo for "Unfinished"; the label is
# left unchanged here so that existing tables keep their row names.
education_labels <- c("Unifinished elementary", "Finished elementary", "Vocational school", "General high school", "Undergraduate degree", "Master's degree", "PhD")
mydata$EducationFactor <- factor(mydata$Education, levels = 1:7,
                                 labels = education_labels)
mydata$EmplStatusFactor <- factor(mydata$EmplStatus, levels = 1:4,
                                  labels = c("Employed", "Self-employed", "Retired", "Unemployed"))
mydata$JobFactor <- factor(mydata$Job, levels = 1:5,
                           labels = c("Physical", "Service", "Office", "Public", "Creative"))

# Dichotomised ("Fixed") variables are coded 0/1 and are overwritten in place.
binary_labels <- list(
  EducFixed = c("Up to high school", "Undergrad and more"),
  BankFixed = c("NLB", "Other banks"),
  EmplFixed = c("Employed", "Others"),
  JobFixed = c("Office", "Others"),
  LocationFixed = c("Urban", "Others")
)
for (binary_column in names(binary_labels)) {
  mydata[[binary_column]] <- factor(mydata[[binary_column]],
                                    levels = 0:1,
                                    labels = binary_labels[[binary_column]])
}

# The seven attitude statements share one 7-point agreement scale and are
# overwritten in place as well.
likert_labels <- c("Strongly disagree", "Disagree", "Somewhat disagree", "Neutral", "Somewhat agree", "Agree", "Strongly Agree")
likert_columns <- c("BetterCash", "CashOnme", "FindSeller", "SmallCash",
                    "DigitalEasy", "ConvenientFlik", "FriendsFlik")
for (likert_column in likert_columns) {
  mydata[[likert_column]] <- factor(mydata[[likert_column]],
                                    levels = 1:7,
                                    labels = likert_labels)
}
For the purpose of clustering, I chose six cluster variables: Q2a_1, Q3a_1, Q4a_1, Q5a_1, Q6a_1 and Q7a_1.
# Standardize the six cluster variables and keep them in a separate data frame
cluster_columns <- mydata[c("Q2a_1", "Q3a_1", "Q4a_1", "Q5a_1", "Q6a_1", "Q7a_1")]
mydata_clu_new <- as.data.frame(scale(cluster_columns))
#Finding outliers
# Missing standardized values are set to 0 so that they add nothing to the
# distance computed below.
mydata_clu_new[is.na(mydata_clu_new)] <- 0
# Dissimilarity = Euclidean distance of each unit from the origin of the
# standardized space, i.e. the square root of the row-wise sum of squares over
# all cluster variables. The earlier hand-written sum left Q5a_1 unsquared,
# which allowed a negative sum and therefore NaN from sqrt().
mydata$Dissimilarity <- sqrt(rowSums(mydata_clu_new^2))
## Warning in sqrt(mydata_clu_new$Q2a_1^2 + mydata_clu_new$Q3a_1^2 +
## mydata_clu_new$Q4a_1^2 + : NaNs produced
# List the six units with the largest dissimilarity (largest first)
by_dissimilarity <- order(-mydata$Dissimilarity)
head(mydata[by_dissimilarity, c("ID", "Dissimilarity")])
## # A tibble: 6 × 2
## ID Dissimilarity
## <dbl> <dbl>
## 1 14 4.28
## 2 40 4.21
## 3 120 4.14
## 4 71 3.84
## 5 138 3.83
## 6 34 3.39
There is a relatively big jump between the third and the fourth unit, so I will check the first three units.
#Showing units ID14, 40, 120
# Select by the ID column rather than by row position, so the intended units
# are shown even if row order and ID ever stop coinciding.
print(mydata[mydata$ID %in% c(14, 40, 120), ])
## # A tibble: 3 × 47
## ID Q2a_1 Q2b_1 Q2c_1 Q3a_1 Q3b_1 Q3c_1 Q4a_1 Q4b_1 Q4c_1 Q5a_1 Q5b_1 Q5c_1
## <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 14 7 7 5 3 6 7 6 6 7 3 6 7
## 2 40 5 3 3 2 7 7 2 7 7 2 7 7
## 3 120 1 6 5 6 6 7 7 5 6 6 6 6
## # ℹ 34 more variables: Q6a_1 <dbl>, Q6b_1 <dbl>, Q6c_1 <dbl>, Q7a_1 <dbl>,
## # Q7b_1 <dbl>, Q7c_1 <dbl>, Age <dbl>, Gender <dbl>, Location <dbl>,
## # Education <dbl>, Job <dbl>, Bank <dbl>, EmplStatus <dbl>, EducFixed <fct>,
## # BankFixed <fct>, EmplFixed <fct>, JobFixed <fct>, LocationFixed <fct>,
## # BetterCash <fct>, CashOnme <fct>, FindSeller <fct>, SmallCash <fct>,
## # DigitalEasy <fct>, ConvenientFlik <fct>, FriendsFlik <fct>,
## # Hypothetical <dbl>, ...40 <lgl>, CashKeep <dbl>, GenderFactor <fct>, …
They don’t seem unusual.
#Removing ...
library(factoextra)
## Warning: package 'factoextra' was built under R version 4.4.2
## Loading required package: ggplot2
## Welcome! Want to learn more? See two factoextra-related books at https://goo.gl/ve3WBa
#Finding Euclidean distances based on 6 Cluster variables, then saving them into object Distances
# Spelled "euclidean" to match the documented method name and the spelling
# already used for the Ward clustering in this script.
Distances <- get_dist(mydata_clu_new,
                      method = "euclidean")
# Plot the distance matrix with a blue colour gradient
distance_gradient <- list(low = "slateblue4", mid = "skyblue3", high = "skyblue")
fviz_dist(Distances, gradient = distance_gradient)
There are three or four groups of homogeneous objects forming, but they are not very evident.
# Hopkins statistic as a check of clustering tendency (no plot requested)
library(factoextra)
hopkins_sample_size <- nrow(mydata_clu_new) - 1
get_clust_tendency(mydata_clu_new, n = hopkins_sample_size, graph = FALSE)
## $hopkins_stat
## [1] 0.6520884
##
## $plot
## NULL
The Hopkins statistic is above 0.5, which suggests that the data are clusterable.
# Elbow plot (within-cluster sum of squares) to choose the number of k-means clusters
library(factoextra)
library(NbClust)
elbow_plot <- fviz_nbclust(mydata_clu_new, kmeans, method = "wss")
elbow_plot + labs(subtitle = "Elbow method")
It seems that the biggest break is at 5 or 6, indicating that we should form 5 or 6 clusters based on Elbow method.
# Average silhouette plot as a second criterion for the number of k-means clusters
silhouette_plot <- fviz_nbclust(mydata_clu_new, kmeans, method = "silhouette")
silhouette_plot + labs(subtitle = "Silhouette analysis")
Since we want the average silhouette width to be as high as possible, this index suggests forming 2 clusters, although 3, 5 or 6 clusters are almost as good.
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
# Hierarchical clustering of the standardized cluster variables:
# Euclidean distances are computed first and then passed to hclust() with the
# "ward.D2" agglomeration method.
library(factoextra)
WARD <- mydata_clu_new %>%
get_dist(method = "euclidean") %>%
hclust(method = "ward.D2")
# Print a short summary of the fitted hclust object
WARD
##
## Call:
## hclust(d = ., method = "ward.D2")
##
## Cluster method : ward.D2
## Distance : euclidean
## Number of objects: 168
library(factoextra)
# Draw the dendrogram of the Ward clustering
fviz_dend(x = WARD)
## Warning: The `<scale>` argument of `guides()` cannot be `FALSE`. Use "none" instead as
## of ggplot2 3.3.4.
## ℹ The deprecated feature was likely used in the factoextra package.
## Please report the issue at <https://github.com/kassambara/factoextra/issues>.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.
library(NbClust)
# Evaluate k-means solutions with 2 to 10 clusters on all NbClust indices
NbClust(data = mydata_clu_new, distance = "euclidean", method = "kmeans",
        min.nc = 2, max.nc = 10, index = "all")
## *** : The Hubert index is a graphical method of determining the number of clusters.
## In the plot of Hubert index, we seek a significant knee that corresponds to a
## significant increase of the value of the measure i.e the significant peak in Hubert
## index second differences plot.
##
## *** : The D index is a graphical method of determining the number of clusters.
## In the plot of D index, we seek a significant knee (the significant peak in Dindex
## second differences plot) that corresponds to a significant increase of the value of
## the measure.
##
## *******************************************************************
## * Among all indices:
## * 6 proposed 2 as the best number of clusters
## * 6 proposed 3 as the best number of clusters
## * 5 proposed 5 as the best number of clusters
## * 1 proposed 6 as the best number of clusters
## * 3 proposed 8 as the best number of clusters
## * 2 proposed 10 as the best number of clusters
##
## ***** Conclusion *****
##
## * According to the majority rule, the best number of clusters is 2
##
##
## *******************************************************************
## $All.index
## KL CH Hartigan CCC Scott Marriot TrCovW TraceW
## 2 3.1827 57.9657 28.3492 -2.5629 173.9249 5.366386e+12 13199.460 671.5135
## 3 0.5625 47.8173 24.2670 -2.6210 364.6717 3.879425e+12 9614.043 573.5617
## 4 0.6372 44.3850 25.1876 -2.6681 479.6169 3.479378e+12 7191.278 500.0220
## 5 2.3310 44.4256 15.6463 -1.0277 593.1414 2.765993e+12 4990.154 433.4514
## 6 1.1173 41.8231 13.4675 -0.4471 689.9419 2.238596e+12 3894.252 395.4885
## 7 1.1191 39.7477 11.7876 0.1050 754.2201 2.078283e+12 3559.316 365.1339
## 8 4.1349 38.0102 7.4798 0.5409 817.5443 1.862045e+12 3026.031 340.2244
## 9 1.7033 35.5253 6.3092 0.2683 892.9602 1.504314e+12 2960.553 325.0297
## 10 0.3790 33.3212 3.4812 -0.0651 920.0917 1.580215e+12 2649.834 312.6247
## Friedman Rubin Cindex DB Silhouette Duda Pseudot2 Beale Ratkowsky
## 2 2.8254 1.3492 0.4405 1.6644 0.2169 1.3980 -26.1913 -1.0805 0.3259
## 3 4.7862 1.5796 0.3979 1.7596 0.2023 2.1839 -51.5006 -2.0403 0.3312
## 4 6.1517 1.8119 0.4271 1.6065 0.2171 1.1293 -6.8707 -0.4333 0.3207
## 5 7.1848 2.0902 0.3968 1.3997 0.2398 1.8716 -17.6966 -1.7103 0.3226
## 6 8.6824 2.2908 0.4134 1.4038 0.2367 1.3524 -9.1200 -0.9435 0.3060
## 7 9.5696 2.4813 0.3889 1.4472 0.2151 1.3315 -11.2042 -0.9123 0.2917
## 8 10.5086 2.6629 0.4484 1.3463 0.2313 2.5006 -17.4029 -2.1439 0.2792
## 9 12.2185 2.7874 0.3904 1.4202 0.2235 2.4389 -19.4694 -2.0952 0.2663
## 10 12.3861 2.8980 0.3539 1.3717 0.2185 1.1475 -1.7998 -0.4671 0.2558
## Ball Ptbiserial Frey McClain Dunn Hubert SDindex Dindex SDbw
## 2 335.7568 0.3510 0.1525 0.7625 0.1310 0.0019 1.8108 1.8917 9.7859
## 3 191.1872 0.4331 0.1434 1.3184 0.1245 0.0028 1.8815 1.7204 0.7724
## 4 125.0055 0.4744 0.0511 1.7343 0.1090 0.0032 1.7597 1.6095 0.6715
## 5 86.6903 0.5082 0.2441 1.9469 0.1090 0.0034 1.5438 1.5091 0.5740
## 6 65.9148 0.5116 1.0270 2.2516 0.1202 0.0037 1.5468 1.4508 0.5336
## 7 52.1620 0.4584 -0.0578 3.1030 0.1173 0.0039 1.7740 1.3634 0.4818
## 8 42.5280 0.4668 0.6230 3.1017 0.1235 0.0042 1.5471 1.3448 0.4524
## 9 36.1144 0.4359 0.3661 3.8216 0.1112 0.0042 1.7320 1.3012 0.4680
## 10 31.2625 0.4144 3.4007 4.5526 0.1301 0.0044 1.7707 1.2258 0.4359
##
## $All.CriticalValues
## CritValue_Duda CritValue_PseudoT2 Fvalue_Beale
## 2 0.6955 40.2797 1
## 3 0.6433 52.6826 1
## 4 0.6757 28.7988 1
## 5 0.5356 32.9448 1
## 6 0.4889 36.5873 1
## 7 0.5276 40.2959 1
## 8 0.4503 35.4073 1
## 9 0.4347 42.9164 1
## 10 0.4997 14.0160 1
##
## $Best.nc
## KL CH Hartigan CCC Scott Marriot TrCovW
## Number_clusters 8.0000 2.0000 5.0000 8.0000 3.0000 3.000000e+00 3.000
## Value_Index 4.1349 57.9657 9.5412 0.5409 190.7468 1.086913e+12 3585.416
## TraceW Friedman Rubin Cindex DB Silhouette Duda
## Number_clusters 5.0000 3.0000 5.0000 10.0000 8.0000 5.0000 2.000
## Value_Index 28.6078 1.9608 -0.0776 0.3539 1.3463 0.2398 1.398
## PseudoT2 Beale Ratkowsky Ball PtBiserial Frey McClain
## Number_clusters 2.0000 2.0000 3.0000 3.0000 6.0000 1 2.0000
## Value_Index -26.1913 -1.0805 0.3312 144.5695 0.5116 NA 0.7625
## Dunn Hubert SDindex Dindex SDbw
## Number_clusters 2.000 0 5.0000 0 10.0000
## Value_Index 0.131 0 1.5438 0 0.4359
##
## $Best.partition
## [1] 1 2 1 2 1 1 2 2 2 1 2 1 1 2 2 1 1 2 2 1 2 2 1 1 1 1 1 1 1 1 2 1 2 2 2 2 1
## [38] 1 1 2 2 2 1 2 2 2 1 1 1 2 1 1 2 1 1 2 2 2 2 2 1 2 2 1 1 1 1 1 2 2 1 1 2 1
## [75] 2 1 2 2 1 2 2 2 2 2 1 2 2 2 2 2 2 2 1 1 1 1 2 2 2 1 2 2 1 2 1 1 1 1 1 1 2
## [112] 1 2 1 2 1 1 2 1 1 1 1 2 1 1 2 2 1 2 2 2 1 1 2 1 1 2 2 1 2 2 1 2 1 2 2 2 2
## [149] 1 2 1 1 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2
# k-means starts from randomly chosen centers, so the seed is fixed to make
# the cluster solution (and the cluster numbering) reproducible between runs.
set.seed(1)
Clustering <- kmeans(mydata_clu_new,
                     centers = 5, #Number of groups
                     nstart = 25) #Number of attempts at different starting leader positions
Clustering
## K-means clustering with 5 clusters of sizes 17, 39, 60, 23, 29
##
## Cluster means:
## Q2a_1 Q3a_1 Q4a_1 Q5a_1 Q6a_1 Q7a_1
## 1 -2.04299453 -0.15283293 0.1288002 -0.2342229 -0.1969184 -0.1982072
## 2 0.04063469 1.04412623 0.7289623 0.6531940 0.3558390 1.1871225
## 3 0.23103022 0.00902994 0.2372070 0.3725217 0.3978323 -0.4700529
## 4 0.44911013 -0.29962981 -0.3654457 -0.3996591 -1.8156404 -0.1234300
## 5 0.30878648 -1.09562323 -1.2567691 -1.1948939 0.2537823 -0.4098687
##
## Clustering vector:
## [1] 2 5 3 5 2 3 4 5 4 3 5 3 3 4 5 2 3 5 3 3 5 3 2 2 3 2 2 2 2 3 5 2 4 5 1 4 2
## [38] 3 3 4 1 5 3 5 5 1 2 3 3 4 1 2 4 2 2 5 4 1 1 3 2 1 4 2 2 2 3 2 3 4 2 2 4 2
## [75] 4 3 5 5 1 3 5 3 5 1 3 3 5 1 5 5 1 4 3 2 2 4 5 3 5 3 3 5 2 3 2 2 2 2 2 3 4
## [112] 2 5 2 3 3 3 3 3 1 2 3 5 3 1 3 4 2 4 4 4 3 2 4 1 3 5 4 1 3 1 3 5 2 3 5 3 4
## [149] 1 5 2 2 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3
##
## Within cluster sum of squares by cluster:
## [1] 62.11656 82.66081 124.50231 70.87847 93.29320
## (between_SS / total_SS = 52.2 %)
##
## Available components:
##
## [1] "cluster" "centers" "totss" "withinss" "tot.withinss"
## [6] "betweenss" "size" "iter" "ifault"
library(factoextra)
# Plot the units coloured by their k-means cluster
fviz_cluster(Clustering, data = mydata_clu_new,
             palette = "Set1", repel = FALSE, ggtheme = theme_bw())
Units 40 and 120 seem to be far away from the cluster centers, so I will remove them.
# Drop the two units judged to be outliers (ID 40 and 120) ...
mydata <- mydata %>%
  filter(!ID %in% c(120, 40))
# ... and renumber the remaining units consecutively. After this step ID no
# longer matches the ID in the Excel file.
mydata$ID <- seq_len(nrow(mydata))
# Standardize the cluster variables again on the reduced data set
mydata_clu_new <- as.data.frame(scale(mydata[c("Q2a_1", "Q3a_1", "Q4a_1", "Q5a_1", "Q6a_1", "Q7a_1")]))
# NOTE(review): missing standardized values are replaced by 0, so a respondent
# with no answers at all still gets assigned to a cluster. Confirm that this is
# intended; aov() later drops such rows.
mydata_clu_new[is.na(mydata_clu_new)] <- 0
# Fix the seed: k-means uses random starting centers, and an unseeded run can
# return a different solution or a different cluster numbering each time.
set.seed(1)
Clustering <- kmeans(mydata_clu_new,
                     centers = 5, #Number of groups
                     nstart = 25) #Number of attempts at different starting leader positions
Clustering
## K-means clustering with 5 clusters of sizes 25, 16, 59, 36, 30
##
## Cluster means:
## Q2a_1 Q3a_1 Q4a_1 Q5a_1 Q6a_1 Q7a_1
## 1 0.50200868 -0.12950095 -0.30466060 -0.2482056 -1.7246734 0.006730357
## 2 -2.05447054 -0.22662665 0.07048118 -0.3003852 -0.2750557 -0.294873759
## 3 0.20812089 0.02202012 0.31437151 0.3445813 0.4095805 -0.447511693
## 4 -0.01506026 1.07012340 0.75255714 0.6750440 0.4614548 1.229972713
## 5 0.28614495 -1.09866931 -1.30503867 -1.1206858 0.2246701 -0.444203551
##
## Clustering vector:
## [1] 4 5 3 5 4 3 1 5 1 3 5 3 3 1 5 4 3 5 3 3 5 3 4 4 3 4 4 4 4 3 5 4 1 5 2 1 4
## [38] 3 3 2 5 3 5 5 2 4 3 3 1 2 4 1 1 4 5 1 2 2 3 4 2 1 4 4 4 3 4 3 1 4 4 1 4 1
## [75] 3 5 5 2 3 5 3 5 2 3 3 5 2 5 5 2 1 3 3 4 1 5 3 5 3 3 5 4 3 4 4 4 4 4 3 1 4
## [112] 5 4 3 3 3 3 3 4 3 5 3 2 3 1 4 1 1 1 3 4 1 2 3 5 1 2 5 2 3 5 1 3 5 1 1 2 5
## [149] 4 4 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3
##
## Within cluster sum of squares by cluster:
## [1] 75.13789 54.55411 114.37998 76.26333 104.07766
## (between_SS / total_SS = 52.5 %)
##
## Available components:
##
## [1] "cluster" "centers" "totss" "withinss" "tot.withinss"
## [6] "betweenss" "size" "iter" "ifault"
library(factoextra)
# Plot the re-estimated clusters after the outliers were removed
fviz_cluster(Clustering, data = mydata_clu_new,
             palette = "Set1", repel = FALSE, ggtheme = theme_bw())
# Cluster centers = mean standardized value of each cluster variable per group
Averages <- Clustering[["centers"]]
Averages
## Q2a_1 Q3a_1 Q4a_1 Q5a_1 Q6a_1 Q7a_1
## 1 0.50200868 -0.12950095 -0.30466060 -0.2482056 -1.7246734 0.006730357
## 2 -2.05447054 -0.22662665 0.07048118 -0.3003852 -0.2750557 -0.294873759
## 3 0.20812089 0.02202012 0.31437151 0.3445813 0.4095805 -0.447511693
## 4 -0.01506026 1.07012340 0.75255714 0.6750440 0.4614548 1.229972713
## 5 0.28614495 -1.09866931 -1.30503867 -1.1206858 0.2246701 -0.444203551
# Reshape the cluster centers to long format and draw one profile per cluster
Figure <- as.data.frame(Averages)
# One ID per cluster; seq_len() also behaves correctly for an empty table
Figure$ID <- seq_len(nrow(Figure))
library(tidyr)
Figure <- pivot_longer(Figure, cols = c("Q2a_1", "Q3a_1", "Q4a_1", "Q5a_1", "Q6a_1", "Q7a_1"))
# Derive the group levels from the data instead of hard-coding 1 to 5, so that
# a different number of clusters does not silently turn groups into NA
Figure$Group <- factor(Figure$ID)
# Readable axis labels for the six cluster variables
Figure$NameF <- factor(Figure$name,
                       levels = c("Q2a_1", "Q3a_1", "Q4a_1", "Q5a_1", "Q6a_1", "Q7a_1"),
                       labels = c("Cash_Safety", "Cash_Speed", "Cash_Ease of Use", "Cash_Convenience", "Cash_Privacy", "Cash_Tracking Expenses"))
library(ggplot2)
ggplot(Figure, aes(x = NameF, y = value)) +
  geom_hline(yintercept = 0) +
  theme_bw() +
  geom_point(aes(shape = Group, col = Group), size = 5) +
  geom_line(aes(group = ID), linewidth = 1) +
  ylab("Averages") +
  xlab("Cluster variables") +
  ylim(-2.5, 2.5) +
  theme(axis.text.x = element_text(angle = 45, vjust = 0.50, size = 10))
#Saving where each unit belongs
# (cluster number assigned by the k-means solution stored in Clustering)
mydata$Group <- Clustering$cluster
#Checking if clustering variables successfully differentiate between groups
# The multi-response formula gives one ANOVA table per cluster variable, using
# the raw (unstandardized) answers in mydata.
# NOTE(review): aov() drops rows with missing answers, whereas the clustering
# imputed them with 0 - confirm that comparing the two is intended.
fit <- aov(cbind(Q2a_1, Q3a_1, Q4a_1, Q5a_1, Q6a_1, Q7a_1) ~ as.factor(Group),
data = mydata)
summary(fit)
## Response Q2a_1 :
## Df Sum Sq Mean Sq F value Pr(>F)
## as.factor(Group) 4 140.74 35.186 41.808 < 2.2e-16 ***
## Residuals 145 122.03 0.842
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Response Q3a_1 :
## Df Sum Sq Mean Sq F value Pr(>F)
## as.factor(Group) 4 200.48 50.120 40.602 < 2.2e-16 ***
## Residuals 145 178.99 1.234
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Response Q4a_1 :
## Df Sum Sq Mean Sq F value Pr(>F)
## as.factor(Group) 4 176.01 44.001 44.225 < 2.2e-16 ***
## Residuals 145 144.27 0.995
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Response Q5a_1 :
## Df Sum Sq Mean Sq F value Pr(>F)
## as.factor(Group) 4 209.54 52.385 29.362 < 2.2e-16 ***
## Residuals 145 258.70 1.784
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Response Q6a_1 :
## Df Sum Sq Mean Sq F value Pr(>F)
## as.factor(Group) 4 100.089 25.0223 70.354 < 2.2e-16 ***
## Residuals 145 51.571 0.3557
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Response Q7a_1 :
## Df Sum Sq Mean Sq F value Pr(>F)
## as.factor(Group) 4 306.07 76.517 39.809 < 2.2e-16 ***
## Residuals 145 278.70 1.922
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## 16 observations deleted due to missingness
For every cluster variable, the mean differs for at least one of the groups.
#Additional variables
# Mean age per cluster; na.rm = TRUE keeps a cluster that contains missing
# ages from returning NA.
aggregate(mydata$Age,
          by = list(mydata$Group),
          FUN = mean,
          na.rm = TRUE)
## Group.1 x
## 1 1 37.76000
## 2 2 40.43750
## 3 3 NA
## 4 4 46.75000
## 5 5 36.16667
#Checking normal distribution of variables
library(dplyr)
library(rstatix)
## Warning: package 'rstatix' was built under R version 4.4.2
##
## Attaching package: 'rstatix'
## The following object is masked from 'package:stats':
##
## filter
# Shapiro-Wilk normality test of Age within each cluster. Grouping uses the
# Group column of the piped data rather than mydata$Group, so the grouping
# always stays aligned with the rows flowing through the pipe.
mydata %>%
  group_by(Group = as.factor(Group)) %>%
  shapiro_test(Age)
## # A tibble: 5 × 4
## `as.factor(mydata$Group)` variable statistic p
## <fct> <chr> <dbl> <dbl>
## 1 1 Age 0.848 0.00164
## 2 2 Age 0.923 0.188
## 3 3 Age 0.896 0.000948
## 4 4 Age 0.896 0.00266
## 5 5 Age 0.899 0.00789
Normality is rejected in most groups, so I use the Kruskal-Wallis test.
# Kruskal-Wallis rank sum test: does Age differ between the clusters?
kruskal.test(Age ~ as.factor(Group), data = mydata)
##
## Kruskal-Wallis rank sum test
##
## data: Age by as.factor(Group)
## Kruskal-Wallis chi-squared = 15.871, df = 4, p-value = 0.003197
Significant.
#Checking the association between the location and classification into 5 groups
# (Group holds the k-means cluster membership)
chi_square <- chisq.test(mydata$LocationFactor, as.factor(mydata$Group))
## Warning in chisq.test(mydata$LocationFactor, as.factor(mydata$Group)):
## Chi-squared approximation may be incorrect
# Print the test result
chi_square
##
## Pearson's Chi-squared test
##
## data: mydata$LocationFactor and as.factor(mydata$Group)
## X-squared = 7.6158, df = 8, p-value = 0.4719
Not significant.
# Create a contingency table of LocationFactor and Group
table_data <- table(mydata$LocationFactor, mydata$Group)
# Print table to verify structure
print(table_data)
##
## 1 2 3 4 5
## Urban 16 7 23 17 13
## Suburban 6 5 13 7 11
## Rural 3 4 7 12 6
# Perform Fisher's Exact Test with simulation
# The p-value is simulated, so the seed is fixed to obtain the same value on
# every run.
set.seed(1)
fisher_test <- fisher.test(table_data, simulate.p.value = TRUE, B = 10000) # B controls number of simulations
# Print the results
print(fisher_test)
##
## Fisher's Exact Test for Count Data with simulated p-value (based on
## 10000 replicates)
##
## data: table_data
## p-value = 0.5074
## alternative hypothesis: two.sided
# Perform Fisher's Exact Test with simulated p-value
# This repeats the Location x Group test directly on the table() call. The
# seed is fixed because the simulated p-value otherwise changes from run to run.
set.seed(1)
fisher_result <- fisher.test(table(mydata$LocationFactor, mydata$Group), simulate.p.value = TRUE, B = 10000)
# Display the result
fisher_result
##
## Fisher's Exact Test for Count Data with simulated p-value (based on
## 10000 replicates)
##
## data: table(mydata$LocationFactor, mydata$Group)
## p-value = 0.5019
## alternative hypothesis: two.sided
#Checking the association between the gender and classification into 5 groups
chi_square <- chisq.test(mydata$GenderFactor, as.factor(mydata$Group))
# Print the test result
chi_square
##
## Pearson's Chi-squared test
##
## data: mydata$GenderFactor and as.factor(mydata$Group)
## X-squared = 5.6518, df = 4, p-value = 0.2267
Not significant.
#Checking the association between the education and classification into 5 groups
# NOTE(review): correct = TRUE is the default and, per the chisq.test
# documentation, only applies to 2 x 2 tables - confirm it is needed here.
chi_square <- chisq.test(mydata$EducationFactor, as.factor(mydata$Group), correct=TRUE)
## Warning in chisq.test(mydata$EducationFactor, as.factor(mydata$Group), correct
## = TRUE): Chi-squared approximation may be incorrect
# Print the test result
chi_square
##
## Pearson's Chi-squared test
##
## data: mydata$EducationFactor and as.factor(mydata$Group)
## X-squared = 16.603, df = 20, p-value = 0.6786
Not significant.
# Create a contingency table of EducationFactor and Group
table_data_edu <- table(mydata$EducationFactor, mydata$Group)
# Print table to verify structure
print(table_data_edu)
##
## 1 2 3 4 5
## Unifinished elementary 0 0 0 0 0
## Finished elementary 1 0 0 1 0
## Vocational school 2 1 2 2 2
## General high school 3 6 15 13 5
## Undergraduate degree 10 4 15 11 14
## Master's degree 8 4 7 7 9
## PhD 1 1 4 2 0
# Perform Fisher's Exact Test with simulation
# The p-value is simulated, so the seed is fixed to obtain the same value on
# every run.
set.seed(1)
fisher_test_edu <- fisher.test(table_data_edu, simulate.p.value = TRUE, B = 10000) # B controls the number of simulations
# Print the results
print(fisher_test_edu)
##
## Fisher's Exact Test for Count Data with simulated p-value (based on
## 10000 replicates)
##
## data: table_data_edu
## p-value = 0.5445
## alternative hypothesis: two.sided
# Perform Fisher's Exact Test with simulation
# This repeats the Education x Group test directly on the table() call. The
# seed is fixed because the simulated p-value otherwise changes from run to run.
set.seed(1)
fisher_result <- fisher.test(table(mydata$EducationFactor, mydata$Group), simulate.p.value = TRUE, B = 10000)
# Display the result
fisher_result
##
## Fisher's Exact Test for Count Data with simulated p-value (based on
## 10000 replicates)
##
## data: table(mydata$EducationFactor, mydata$Group)
## p-value = 0.5364
## alternative hypothesis: two.sided
#Checking the association between the employment status and classification into 5 groups
chi_square <- chisq.test(mydata$EmplStatusFactor, as.factor(mydata$Group), correct=TRUE)
## Warning in chisq.test(mydata$EmplStatusFactor, as.factor(mydata$Group), :
## Chi-squared approximation may be incorrect
# Print the test result
chi_square
##
## Pearson's Chi-squared test
##
## data: mydata$EmplStatusFactor and as.factor(mydata$Group)
## X-squared = 22.056, df = 8, p-value = 0.004813
# Create a contingency table of Employment Status Factor and Group
table_data_emp <- table(mydata$EmplStatusFactor, mydata$Group)
# Print table to verify structure
print(table_data_emp)
##
## 1 2 3 4 5
## Employed 19 13 39 19 26
## Self-employed 4 1 3 14 3
## Retired 0 0 0 0 0
## Unemployed 2 2 1 3 1
# Perform Fisher's Exact Test with simulation
# The p-value is simulated, so the seed is fixed to obtain the same value on
# every run.
set.seed(1)
fisher_test_emp <- fisher.test(table_data_emp, simulate.p.value = TRUE, B = 10000) # B controls the number of simulations
# Print the results
print(fisher_test_emp)
##
## Fisher's Exact Test for Count Data with simulated p-value (based on
## 10000 replicates)
##
## data: table_data_emp
## p-value = 0.0038
## alternative hypothesis: two.sided
#Checking the association between the job and classification into 5 groups
chi_square <- chisq.test(mydata$JobFactor, as.factor(mydata$Group), correct=TRUE)
## Warning in chisq.test(mydata$JobFactor, as.factor(mydata$Group), correct =
## TRUE): Chi-squared approximation may be incorrect
# Print the test result
chi_square
##
## Pearson's Chi-squared test
##
## data: mydata$JobFactor and as.factor(mydata$Group)
## X-squared = 17.777, df = 16, p-value = 0.3371
# Create a contingency table of JobFactor and Group
table_data_job <- table(mydata$JobFactor, mydata$Group)
# Print table to verify structure
print(table_data_job)
##
## 1 2 3 4 5
## Physical 3 1 4 3 3
## Service 4 4 8 5 2
## Office 5 6 19 5 13
## Public 7 0 7 7 8
## Creative 0 1 1 0 0
# Perform Fisher's Exact Test with simulation
# The p-value is simulated, so the seed is fixed to obtain the same value on
# every run.
set.seed(1)
fisher_test_job <- fisher.test(table_data_job, simulate.p.value = TRUE, B = 10000) # B controls the number of simulations
# Print the results
print(fisher_test_job)
##
## Fisher's Exact Test for Count Data with simulated p-value (based on
## 10000 replicates)
##
## data: table_data_job
## p-value = 0.2196
## alternative hypothesis: two.sided
#WITH FIXED DEMOGRAPHIC VARIABLES
# (the two-category *Fixed versions of the demographic variables)
#Checking the association between the education and classification into 5 groups
chi_square <- chisq.test(mydata$EducFixed, as.factor(mydata$Group), correct=TRUE)
# Print the test result
chi_square
##
## Pearson's Chi-squared test
##
## data: mydata$EducFixed and as.factor(mydata$Group)
## X-squared = 5.4322, df = 4, p-value = 0.2457
#Checking the association between the employment status and classification into 5 groups
chi_square <- chisq.test(mydata$EmplFixed, as.factor(mydata$Group), correct=TRUE)
## Warning in chisq.test(mydata$EmplFixed, as.factor(mydata$Group), correct =
## TRUE): Chi-squared approximation may be incorrect
# Print the test result
chi_square
##
## Pearson's Chi-squared test
##
## data: mydata$EmplFixed and as.factor(mydata$Group)
## X-squared = 3.131, df = 4, p-value = 0.5361
# Create a contingency table of EmplFixed and Group
table_data_empl_fixed <- table(mydata$EmplFixed, mydata$Group)
# Print table to verify structure
print(table_data_empl_fixed)
##
## 1 2 3 4 5
## Employed 23 14 42 33 29
## Others 2 2 1 3 1
# Perform Fisher's Exact Test with simulation
# The p-value is simulated, so the seed is fixed to obtain the same value on
# every run.
set.seed(1)
fisher_test_empl_fixed <- fisher.test(table_data_empl_fixed, simulate.p.value = TRUE, B = 10000) # B controls the number of simulations
# Print the results
print(fisher_test_empl_fixed)
##
## Fisher's Exact Test for Count Data with simulated p-value (based on
## 10000 replicates)
##
## data: table_data_empl_fixed
## p-value = 0.4529
## alternative hypothesis: two.sided
#Checking the association between the bank and classification into 5 groups
chi_square <- chisq.test(mydata$BankFixed, as.factor(mydata$Group), correct=TRUE)
# Print the test result
chi_square
##
## Pearson's Chi-squared test
##
## data: mydata$BankFixed and as.factor(mydata$Group)
## X-squared = 6.4764, df = 4, p-value = 0.1663
#Checking the association between the job and classification into 5 groups
chi_square <- chisq.test(mydata$JobFixed, as.factor(mydata$Group), correct=TRUE)
# Print the test result
chi_square
##
## Pearson's Chi-squared test
##
## data: mydata$JobFixed and as.factor(mydata$Group)
## X-squared = 12.009, df = 4, p-value = 0.01729
#Checking the association between the location and classification into 5 groups
chi_square <- chisq.test(mydata$LocationFixed, as.factor(mydata$Group), correct=TRUE)
# Print the test result
chi_square
##
## Pearson's Chi-squared test
##
## data: mydata$LocationFixed and as.factor(mydata$Group)
## X-squared = 3.0376, df = 4, p-value = 0.5515
# Turn the labelled factor back into its 1-7 score (the level codes) so that
# it can be averaged and tested
mydata$BetterCash <- as.numeric(mydata$BetterCash)
# Mean score per cluster; na.rm = TRUE keeps a cluster that contains missing
# answers from returning NA
aggregate(mydata$BetterCash,
          by = list(mydata$Group),
          FUN = mean,
          na.rm = TRUE)
## Group.1 x
## 1 1 3.320000
## 2 2 3.125000
## 3 3 NA
## 4 4 4.611111
## 5 5 1.833333
#Checking normal distribution of variables
library(dplyr)
library(rstatix)
# Group by the Group column of the piped data instead of mydata$Group, so the
# grouping always stays aligned with the rows flowing through the pipe
mydata %>%
  group_by(Group = as.factor(Group)) %>%
  shapiro_test(BetterCash)
## # A tibble: 5 × 4
## `as.factor(mydata$Group)` variable statistic p
## <fct> <chr> <dbl> <dbl>
## 1 1 BetterCash 0.907 0.0265
## 2 2 BetterCash 0.876 0.0337
## 3 3 BetterCash 0.894 0.000808
## 4 4 BetterCash 0.852 0.000207
## 5 5 BetterCash 0.677 0.000000735
# Kruskal-Wallis rank sum test: does BetterCash differ between the clusters?
kruskal.test(BetterCash ~ as.factor(Group), data = mydata)
##
## Kruskal-Wallis rank sum test
##
## data: BetterCash by as.factor(Group)
## Kruskal-Wallis chi-squared = 33.092, df = 4, p-value = 1.144e-06
# Turn the labelled factor back into its 1-7 score (the level codes) so that
# it can be averaged and tested
mydata$CashOnme <- as.numeric(mydata$CashOnme)
# Mean score per cluster; na.rm = TRUE keeps a cluster that contains missing
# answers from returning NA
aggregate(mydata$CashOnme,
          by = list(mydata$Group),
          FUN = mean,
          na.rm = TRUE)
## Group.1 x
## 1 1 3.960000
## 2 2 3.937500
## 3 3 NA
## 4 4 4.416667
## 5 5 3.733333
#Checking normal distribution of variables
library(dplyr)
library(rstatix)
# Group by the Group column of the piped data instead of mydata$Group, so the
# grouping always stays aligned with the rows flowing through the pipe
mydata %>%
  group_by(Group = as.factor(Group)) %>%
  shapiro_test(CashOnme)
## # A tibble: 5 × 4
## `as.factor(mydata$Group)` variable statistic p
## <fct> <chr> <dbl> <dbl>
## 1 1 CashOnme 0.878 0.00633
## 2 2 CashOnme 0.820 0.00503
## 3 3 CashOnme 0.867 0.000142
## 4 4 CashOnme 0.886 0.00142
## 5 5 CashOnme 0.894 0.00600
# Kruskal-Wallis rank sum test: does CashOnme differ between the clusters?
kruskal.test(CashOnme ~ as.factor(Group), data = mydata)
##
## Kruskal-Wallis rank sum test
##
## data: CashOnme by as.factor(Group)
## Kruskal-Wallis chi-squared = 8.0913, df = 4, p-value = 0.08829
# Turn the labelled factor back into its 1-7 score (the level codes) so that
# it can be averaged and tested
mydata$FindSeller <- as.numeric(mydata$FindSeller)
# Mean score per cluster; na.rm = TRUE keeps a cluster that contains missing
# answers from returning NA
aggregate(mydata$FindSeller,
          by = list(mydata$Group),
          FUN = mean,
          na.rm = TRUE)
## Group.1 x
## 1 1 5.280000
## 2 2 5.562500
## 3 3 NA
## 4 4 5.055556
## 5 5 6.200000
#Checking normal distribution of variables
library(dplyr)
library(rstatix)
# Group by the Group column of the piped data instead of mydata$Group, so the
# grouping always stays aligned with the rows flowing through the pipe
mydata %>%
  group_by(Group = as.factor(Group)) %>%
  shapiro_test(FindSeller)
## # A tibble: 5 × 4
## `as.factor(mydata$Group)` variable statistic p
## <fct> <chr> <dbl> <dbl>
## 1 1 FindSeller 0.888 0.0102
## 2 2 FindSeller 0.859 0.0188
## 3 3 FindSeller 0.855 0.0000689
## 4 4 FindSeller 0.889 0.00172
## 5 5 FindSeller 0.694 0.00000128
# Kruskal-Wallis rank sum test: does FindSeller differ between the clusters?
kruskal.test(FindSeller ~ as.factor(Group), data = mydata)
##
## Kruskal-Wallis rank sum test
##
## data: FindSeller by as.factor(Group)
## Kruskal-Wallis chi-squared = 13.872, df = 4, p-value = 0.007715
# Turn the labelled factor back into its 1-7 score (the level codes) so that
# it can be averaged and tested
mydata$SmallCash <- as.numeric(mydata$SmallCash)
# Mean score per cluster; na.rm = TRUE keeps a cluster that contains missing
# answers from returning NA
aggregate(mydata$SmallCash,
          by = list(mydata$Group),
          FUN = mean,
          na.rm = TRUE)
## Group.1 x
## 1 1 3.160000
## 2 2 3.062500
## 3 3 NA
## 4 4 4.638889
## 5 5 1.833333
#Checking normal distribution of variables
library(dplyr)
library(rstatix)
# Group by the Group column of the piped data instead of mydata$Group, so the
# grouping always stays aligned with the rows flowing through the pipe
mydata %>%
  group_by(Group = as.factor(Group)) %>%
  shapiro_test(SmallCash)
## # A tibble: 5 × 4
## `as.factor(mydata$Group)` variable statistic p
## <fct> <chr> <dbl> <dbl>
## 1 1 SmallCash 0.900 0.0187
## 2 2 SmallCash 0.816 0.00450
## 3 3 SmallCash 0.883 0.000404
## 4 4 SmallCash 0.834 0.0000828
## 5 5 SmallCash 0.801 0.0000684
# Kruskal-Wallis rank sum test: does SmallCash differ between the clusters?
kruskal.test(SmallCash ~ as.factor(Group), data = mydata)
##
## Kruskal-Wallis rank sum test
##
## data: SmallCash by as.factor(Group)
## Kruskal-Wallis chi-squared = 27.816, df = 4, p-value = 1.359e-05
# Turn the labelled factor back into its 1-7 score (the level codes) so that
# it can be averaged and tested
mydata$DigitalEasy <- as.numeric(mydata$DigitalEasy)
# Mean score per cluster; na.rm = TRUE keeps a cluster that contains missing
# answers from returning NA
aggregate(mydata$DigitalEasy,
          by = list(mydata$Group),
          FUN = mean,
          na.rm = TRUE)
## Group.1 x
## 1 1 5.360000
## 2 2 5.812500
## 3 3 NA
## 4 4 4.388889
## 5 5 6.100000
# Shapiro-Wilk normality check of DigitalEasy, run separately within each cluster
library(dplyr)
library(rstatix)
shapiro_test(
  group_by(mydata, as.factor(mydata$Group)),
  DigitalEasy
)
## # A tibble: 5 × 4
## `as.factor(mydata$Group)` variable statistic p
## <fct> <chr> <dbl> <dbl>
## 1 1 DigitalEasy 0.877 0.00611
## 2 2 DigitalEasy 0.859 0.0188
## 3 3 DigitalEasy 0.834 0.0000211
## 4 4 DigitalEasy 0.898 0.00293
## 5 5 DigitalEasy 0.658 0.000000407
# Kruskal-Wallis rank sum test: compares the distribution of DigitalEasy
# across the levels of Group (rank-based, no normality assumption).
kruskal.test(DigitalEasy ~ as.factor(Group),
data = mydata)
##
## Kruskal-Wallis rank sum test
##
## data: DigitalEasy by as.factor(Group)
## Kruskal-Wallis chi-squared = 19.441, df = 4, p-value = 0.0006437
# Make sure the item is stored as numeric before averaging.
mydata$ConvenientFlik <- as.numeric(mydata$ConvenientFlik)
# Mean ConvenientFlik score per cluster. na.rm = TRUE is forwarded to mean()
# so that a missing answer no longer turns a whole cluster mean into NA.
aggregate(mydata$ConvenientFlik,
          by = list(mydata$Group),
          FUN = mean,
          na.rm = TRUE)
## Group.1 x
## 1 1 5.680000
## 2 2 5.687500
## 3 3 NA
## 4 4 3.972222
## 5 5 6.266667
# Shapiro-Wilk normality check of ConvenientFlik, run separately within each cluster
library(dplyr)
library(rstatix)
shapiro_test(
  group_by(mydata, as.factor(mydata$Group)),
  ConvenientFlik
)
## # A tibble: 5 × 4
## `as.factor(mydata$Group)` variable statistic p
## <fct> <chr> <dbl> <dbl>
## 1 1 ConvenientFlik 0.762 0.0000563
## 2 2 ConvenientFlik 0.781 0.00154
## 3 3 ConvenientFlik 0.775 0.00000109
## 4 4 ConvenientFlik 0.865 0.000422
## 5 5 ConvenientFlik 0.565 0.0000000277
# Kruskal-Wallis rank sum test: compares the distribution of ConvenientFlik
# across the levels of Group (rank-based, no normality assumption).
kruskal.test(ConvenientFlik ~ as.factor(Group),
data = mydata)
##
## Kruskal-Wallis rank sum test
##
## data: ConvenientFlik by as.factor(Group)
## Kruskal-Wallis chi-squared = 23.528, df = 4, p-value = 9.93e-05
# Make sure the item is stored as numeric before averaging.
mydata$FriendsFlik <- as.numeric(mydata$FriendsFlik)
# Mean FriendsFlik score per cluster. na.rm = TRUE is forwarded to mean() so
# that a missing answer no longer turns a whole cluster mean into NA.
aggregate(mydata$FriendsFlik,
          by = list(mydata$Group),
          FUN = mean,
          na.rm = TRUE)
## Group.1 x
## 1 1 5.080000
## 2 2 5.375000
## 3 3 NA
## 4 4 4.361111
## 5 5 5.466667
# Shapiro-Wilk normality check of FriendsFlik, run separately within each cluster
library(dplyr)
library(rstatix)
shapiro_test(
  group_by(mydata, as.factor(mydata$Group)),
  FriendsFlik
)
## # A tibble: 5 × 4
## `as.factor(mydata$Group)` variable statistic p
## <fct> <chr> <dbl> <dbl>
## 1 1 FriendsFlik 0.864 0.00326
## 2 2 FriendsFlik 0.843 0.0109
## 3 3 FriendsFlik 0.849 0.0000484
## 4 4 FriendsFlik 0.909 0.00614
## 5 5 FriendsFlik 0.798 0.0000591
# Kruskal-Wallis rank sum test: compares the distribution of FriendsFlik
# across the levels of Group (rank-based, no normality assumption).
kruskal.test(FriendsFlik ~ as.factor(Group),
data = mydata)
##
## Kruskal-Wallis rank sum test
##
## data: FriendsFlik by as.factor(Group)
## Kruskal-Wallis chi-squared = 8.1413, df = 4, p-value = 0.08654
# Mean Hypothetical score per cluster. na.rm = TRUE is forwarded to mean() so
# that a missing answer no longer turns a whole cluster mean into NA.
aggregate(mydata$Hypothetical,
          by = list(mydata$Group),
          FUN = mean,
          na.rm = TRUE)
## Group.1 x
## 1 1 4.320000
## 2 2 3.875000
## 3 3 NA
## 4 4 3.694444
## 5 5 4.033333
# Shapiro-Wilk normality check of Hypothetical, run separately within each cluster
library(dplyr)
library(rstatix)
shapiro_test(
  group_by(mydata, as.factor(mydata$Group)),
  Hypothetical
)
## # A tibble: 5 × 4
## `as.factor(mydata$Group)` variable statistic p
## <fct> <chr> <dbl> <dbl>
## 1 1 Hypothetical 0.915 0.0393
## 2 2 Hypothetical 0.946 0.427
## 3 3 Hypothetical 0.918 0.00461
## 4 4 Hypothetical 0.902 0.00389
## 5 5 Hypothetical 0.865 0.00131
# Kruskal-Wallis rank sum test: compares the distribution of Hypothetical
# across the levels of Group (rank-based, no normality assumption).
kruskal.test(Hypothetical ~ as.factor(Group),
data = mydata)
##
## Kruskal-Wallis rank sum test
##
## data: Hypothetical by as.factor(Group)
## Kruskal-Wallis chi-squared = 2.1751, df = 4, p-value = 0.7036
# Pearson chi-squared test of independence between Age and cluster (Group).
# NOTE(review): Age enters as one category per distinct value, which gives a
# very sparse table (see the warning below) - consider binning Age or using a
# rank-based test instead.
chi_square <- chisq.test(mydata$Age, as.factor(mydata$Group))
## Warning in chisq.test(mydata$Age, as.factor(mydata$Group)): Chi-squared
## approximation may be incorrect
chi_square
##
## Pearson's Chi-squared test
##
## data: mydata$Age and as.factor(mydata$Group)
## X-squared = 169.73, df = 156, p-value = 0.2139
# Pearson residuals per cell, rounded to 2 decimals, to see which cells
# deviate most from independence.
round(chi_square$residuals,2)
## as.factor(mydata$Group)
## mydata$Age 1 2 3 4 5
## 20 -0.41 -0.33 1.33 -0.49 -0.45
## 22 -0.41 2.74 -0.54 -0.49 -0.45
## 23 -1.08 -0.86 1.41 0.25 -0.34
## 24 0.71 0.64 -0.24 -1.11 0.39
## 25 -0.41 0.04 0.88 -1.47 0.89
## 26 -0.62 -1.08 0.48 0.22 0.54
## 27 1.63 0.88 -0.14 -0.98 -0.89
## 28 1.00 -0.80 -1.31 -0.37 1.64
## 29 1.63 -0.65 -1.07 0.04 0.22
## 30 1.28 -0.73 1.31 -1.10 -1.00
## 31 1.15 -0.46 0.56 -0.69 -0.63
## 32 -0.58 -0.46 0.56 0.75 -0.63
## 33 -0.41 -0.33 1.33 -0.49 -0.45
## 34 -0.41 -0.33 -0.54 1.55 -0.45
## 35 -0.71 -0.57 1.23 -0.85 0.52
## 36 -0.71 -0.57 1.23 -0.85 0.52
## 38 -0.91 0.64 -0.36 -0.18 1.00
## 39 0.00 0.45 0.98 -1.20 -0.18
## 40 -0.15 1.45 0.70 -1.30 -0.34
## 41 -0.41 -0.33 -0.54 1.55 -0.45
## 42 -0.58 -0.46 -0.76 2.19 -0.63
## 43 -0.71 -0.57 0.15 0.33 0.52
## 45 -0.82 0.88 -0.14 -0.98 1.34
## 46 -0.58 -0.46 -0.76 0.75 0.95
## 47 1.63 -0.65 -1.07 0.04 0.22
## 48 0.41 0.88 -0.14 0.04 -0.89
## 49 -0.82 0.88 -0.14 0.04 0.22
## 50 2.89 -0.46 -0.76 -0.69 -0.63
## 52 -0.71 -0.57 0.15 -0.85 1.81
## 53 -0.58 -0.46 0.56 0.75 -0.63
## 54 -0.58 1.70 -0.76 -0.69 0.95
## 55 -0.58 1.70 -0.76 0.75 -0.63
## 56 -0.91 0.64 -0.36 1.64 -1.00
## 57 0.71 -0.57 -0.93 1.51 -0.77
## 60 1.15 -0.46 -0.76 0.75 -0.63
## 61 -0.71 1.20 -0.93 1.51 -0.77
## 62 2.12 -0.57 -0.93 0.33 -0.77
## 63 -0.82 -0.65 -1.07 3.10 -0.89
## 64 -0.58 -0.46 1.88 -0.69 -0.63
## 65 -0.71 -0.57 -0.93 1.51 0.52
# Fisher's exact test on the Age-by-Group table. The p-value is estimated by
# Monte Carlo simulation (B = 10000 tables) rather than computed exactly.
# Fix the RNG seed first so the simulated p-value is reproducible across runs.
set.seed(123)
fisher_result <- fisher.test(table(mydata$Age, mydata$Group), simulate.p.value = TRUE, B = 10000)
# Display the result
fisher_result
##
## Fisher's Exact Test for Count Data with simulated p-value (based on
## 10000 replicates)
##
## data: table(mydata$Age, mydata$Group)
## p-value = 0.1304
## alternative hypothesis: two.sided
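The simulated Fisher p-value (about 0.13) is lower than the chi-squared p-value (0.21), but it is still above 0.05, so age does not differ significantly between the clusters. Can age still be used to describe the clusters?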
# Within-cluster percentage of each Age value, drawn as dodged bars
library(tidyr)
library(dplyr)
library(ggplot2)
# Cluster (rows) by Age (columns); margin = 1 rescales every row to proportions
table_clusters <- table(mydata$Group, mydata$Age)
prop_table_clusters <- prop.table(table_clusters, margin = 1)
# Long format: Var1 = cluster, Var2 = Age, Freq = within-cluster proportion
prop_df <- prop_table_clusters %>%
  as.table() %>%
  as.data.frame()
ggplot(prop_df, aes(Var1, Freq * 100, fill = Var2)) + # proportion -> percent
  geom_col(position = "dodge") +
  labs(
    title = "Percentage Distribution of age by group",
    x = "Group",
    y = "Percentage (%)",
    fill = "Category"
  ) +
  theme_minimal()
# Pearson chi-squared test of independence between JobFixed and cluster (Group).
chi_square <- chisq.test(mydata$JobFixed, as.factor(mydata$Group))
chi_square
##
## Pearson's Chi-squared test
##
## data: mydata$JobFixed and as.factor(mydata$Group)
## X-squared = 12.009, df = 4, p-value = 0.01729
# Pearson residuals per cell, rounded to 2 decimals, to see which cells
# deviate most from independence.
round(chi_square$residuals,2)
## as.factor(mydata$Group)
## mydata$JobFixed 1 2 3 4 5
## Office 0.73 -0.27 -0.97 1.32 -0.75
## Others -1.06 0.39 1.41 -1.92 1.10
# Fisher's exact test on the JobFixed-by-Group table. The p-value is estimated
# by Monte Carlo simulation (B = 10000 tables) rather than computed exactly.
# Fix the RNG seed first so the simulated p-value is reproducible across runs.
set.seed(123)
fisher_result <- fisher.test(table(mydata$JobFixed, mydata$Group), simulate.p.value = TRUE, B = 10000)
# Display the result
fisher_result
##
## Fisher's Exact Test for Count Data with simulated p-value (based on
## 10000 replicates)
##
## data: table(mydata$JobFixed, mydata$Group)
## p-value = 0.0133
## alternative hypothesis: two.sided
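This is okay: both the chi-squared test (p = 0.017) and the simulated Fisher's exact test (p ≈ 0.013) show that job type differs between the clusters at the 5% level.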
# Within-cluster percentage of each JobFixed category, drawn as dodged bars
library(tidyr)
library(dplyr)
library(ggplot2)
# Cluster (rows) by JobFixed (columns); margin = 1 rescales rows to proportions
table_clusters <- table(mydata$Group, mydata$JobFixed)
prop_table_clusters <- prop.table(table_clusters, margin = 1)
# Long format: Var1 = cluster, Var2 = JobFixed, Freq = within-cluster proportion
prop_df <- prop_table_clusters %>%
  as.table() %>%
  as.data.frame()
ggplot(prop_df, aes(Var1, Freq * 100, fill = Var2)) + # proportion -> percent
  geom_col(position = "dodge") +
  labs(
    title = "Percentage Distribution of type of job per group",
    x = "Group",
    y = "Percentage (%)",
    fill = "Category"
  ) +
  theme_minimal()
# Pearson chi-squared test of independence between BetterCash and cluster (Group).
# The warning below is chisq.test flagging that its approximation may be
# unreliable for this table.
chi_square <- chisq.test(mydata$BetterCash, as.factor(mydata$Group))
## Warning in chisq.test(mydata$BetterCash, as.factor(mydata$Group)): Chi-squared
## approximation may be incorrect
chi_square
##
## Pearson's Chi-squared test
##
## data: mydata$BetterCash and as.factor(mydata$Group)
## X-squared = 67.076, df = 24, p-value = 6.007e-06
# Pearson residuals per cell, rounded to 2 decimals, to see which cells
# deviate most from independence.
round(chi_square$residuals,2)
## as.factor(mydata$Group)
## mydata$BetterCash 1 2 3 4 5
## 1 -1.23 -1.15 -0.36 -1.05 3.55
## 2 -0.26 0.84 -0.43 -1.16 1.41
## 3 3.10 1.28 -1.14 -1.28 -1.00
## 4 -0.29 1.24 1.13 -0.66 -1.26
## 5 0.80 -0.83 0.81 0.43 -1.56
## 6 -1.34 -0.16 0.81 1.76 -1.56
## 7 -0.71 -1.13 -0.24 3.02 -1.55
# Fisher's exact test on the BetterCash-by-Group table. The p-value is estimated
# by Monte Carlo simulation (B = 10000 tables) rather than computed exactly.
# Fix the RNG seed first so the simulated p-value is reproducible across runs.
set.seed(123)
fisher_result <- fisher.test(table(mydata$BetterCash, mydata$Group), simulate.p.value = TRUE, B = 10000)
# Display the result
fisher_result
##
## Fisher's Exact Test for Count Data with simulated p-value (based on
## 10000 replicates)
##
## data: table(mydata$BetterCash, mydata$Group)
## p-value = 9.999e-05
## alternative hypothesis: two.sided
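This is okay now, I think: both the chi-squared test (p < 0.001) and the simulated Fisher's exact test (p ≈ 0.0001) show that BetterCash differs between the clusters at the 5% level.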
# Within-cluster percentage of each BetterCash answer, drawn as dodged bars
# NOTE(review): the title says "keep cash" but the plotted item is BetterCash;
# confirm the title matches the item.
library(tidyr)
library(dplyr)
library(ggplot2)
# Cluster (rows) by BetterCash (columns); margin = 1 rescales rows to proportions
table_clusters <- table(mydata$Group, mydata$BetterCash)
prop_table_clusters <- prop.table(table_clusters, margin = 1)
# Long format: Var1 = cluster, Var2 = answer, Freq = within-cluster proportion
prop_df <- prop_table_clusters %>%
  as.table() %>%
  as.data.frame()
ggplot(prop_df, aes(Var1, Freq * 100, fill = Var2)) + # proportion -> percent
  geom_col(position = "dodge") +
  labs(
    title = "Percentage Distribution of those who keep cash",
    x = "Group",
    y = "Percentage (%)",
    fill = "Category"
  ) +
  theme_minimal()
# Pearson chi-squared test of independence between CashOnme and cluster (Group).
# The warning below is chisq.test flagging that its approximation may be
# unreliable for this table.
chi_square <- chisq.test(mydata$CashOnme, as.factor(mydata$Group))
## Warning in chisq.test(mydata$CashOnme, as.factor(mydata$Group)): Chi-squared
## approximation may be incorrect
chi_square
##
## Pearson's Chi-squared test
##
## data: mydata$CashOnme and as.factor(mydata$Group)
## X-squared = 39.619, df = 24, p-value = 0.02349
# Pearson residuals per cell, rounded to 2 decimals, to see which cells
# deviate most from independence.
round(chi_square$residuals,2)
## as.factor(mydata$Group)
## mydata$CashOnme 1 2 3 4 5
## 1 -0.95 -1.26 -0.14 0.74 1.15
## 2 1.03 0.68 -1.05 -0.73 0.62
## 3 0.50 2.77 -1.86 0.10 -0.37
## 4 0.00 -0.80 0.98 -0.37 -0.18
## 5 0.62 -1.70 0.39 -0.48 0.74
## 6 0.17 1.08 1.07 -0.99 -1.14
## 7 -1.83 -0.78 0.95 1.92 -1.00
# Fisher's exact test on the CashOnme-by-Group table. The p-value is estimated
# by Monte Carlo simulation (B = 10000 tables) rather than computed exactly.
# Fix the RNG seed first so the simulated p-value is reproducible across runs.
set.seed(123)
fisher_result <- fisher.test(table(mydata$CashOnme, mydata$Group), simulate.p.value = TRUE, B = 10000)
# Display the result
fisher_result
##
## Fisher's Exact Test for Count Data with simulated p-value (based on
## 10000 replicates)
##
## data: table(mydata$CashOnme, mydata$Group)
## p-value = 0.0116
## alternative hypothesis: two.sided
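This is okay now, I think: both the chi-squared test (p = 0.023) and the simulated Fisher's exact test (p ≈ 0.012) show that CashOnme differs between the clusters at the 5% level.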
# Within-cluster percentage of each CashOnme answer, drawn as dodged bars
# NOTE(review): the title says "prefer cash" but the plotted item is CashOnme;
# confirm the title matches the item.
library(tidyr)
library(dplyr)
library(ggplot2)
# Cluster (rows) by CashOnme (columns); margin = 1 rescales rows to proportions
table_clusters <- table(mydata$Group, mydata$CashOnme)
prop_table_clusters <- prop.table(table_clusters, margin = 1)
# Long format: Var1 = cluster, Var2 = answer, Freq = within-cluster proportion
prop_df <- prop_table_clusters %>%
  as.table() %>%
  as.data.frame()
ggplot(prop_df, aes(Var1, Freq * 100, fill = Var2)) + # proportion -> percent
  geom_col(position = "dodge") +
  labs(
    title = "Percentage Distribution of those who prefer cash",
    x = "Group",
    y = "Percentage (%)",
    fill = "Category"
  ) +
  theme_minimal()
# Pearson chi-squared test of independence between FindSeller and cluster (Group).
# The warning below is chisq.test flagging that its approximation may be
# unreliable for this table.
chi_square <- chisq.test(mydata$FindSeller, as.factor(mydata$Group))
## Warning in chisq.test(mydata$FindSeller, as.factor(mydata$Group)): Chi-squared
## approximation may be incorrect
chi_square
##
## Pearson's Chi-squared test
##
## data: mydata$FindSeller and as.factor(mydata$Group)
## X-squared = 34.503, df = 24, p-value = 0.07612
# Pearson residuals per cell, rounded to 2 decimals, to see which cells
# deviate most from independence.
round(chi_square$residuals,2)
## as.factor(mydata$Group)
## mydata$FindSeller 1 2 3 4 5
## 1 -0.58 -0.46 -0.76 2.19 -0.63
## 2 -0.58 -0.46 -0.76 0.75 0.95
## 3 1.60 0.76 -1.78 1.45 -1.48
## 4 0.41 -0.98 -0.36 1.25 -0.60
## 5 0.89 1.01 0.48 -0.45 -1.63
## 6 -0.57 -0.23 1.06 -0.99 0.50
## 7 -0.86 -0.32 -0.17 -0.48 1.75
# Fisher's exact test on the FindSeller-by-Group table. The p-value is estimated
# by Monte Carlo simulation (B = 10000 tables) rather than computed exactly.
# Fix the RNG seed first so the simulated p-value is reproducible across runs.
set.seed(123)
fisher_result <- fisher.test(table(mydata$FindSeller, mydata$Group), simulate.p.value = TRUE, B = 10000)
# Display the result
fisher_result
##
## Fisher's Exact Test for Count Data with simulated p-value (based on
## 10000 replicates)
##
## data: table(mydata$FindSeller, mydata$Group)
## p-value = 0.0324
## alternative hypothesis: two.sided
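This is okay now, I think: the chi-squared p-value (0.076) is above 0.05, but its approximation is flagged as unreliable, and the simulated Fisher's exact test (p ≈ 0.03) shows that FindSeller differs between the clusters at the 5% level.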
# Within-cluster percentage of each FindSeller answer, drawn as dodged bars
library(tidyr)
library(dplyr)
library(ggplot2)
# Cluster (rows) by FindSeller (columns); margin = 1 rescales rows to proportions
table_clusters <- table(mydata$Group, mydata$FindSeller)
prop_table_clusters <- prop.table(table_clusters, margin = 1)
# Long format: Var1 = cluster, Var2 = answer, Freq = within-cluster proportion
prop_df <- prop_table_clusters %>%
  as.table() %>%
  as.data.frame()
ggplot(prop_df, aes(Var1, Freq * 100, fill = Var2)) + # proportion -> percent
  geom_col(position = "dodge") +
  labs(
    title = "Percentage Distribution of those who easily find establishments where they can pay digitally",
    x = "Group",
    y = "Percentage (%)",
    fill = "Category"
  ) +
  theme_minimal()
# Pearson chi-squared test of independence between SmallCash and cluster (Group).
# The warning below is chisq.test flagging that its approximation may be
# unreliable for this table.
chi_square <- chisq.test(mydata$SmallCash, as.factor(mydata$Group))
## Warning in chisq.test(mydata$SmallCash, as.factor(mydata$Group)): Chi-squared
## approximation may be incorrect
chi_square
##
## Pearson's Chi-squared test
##
## data: mydata$SmallCash and as.factor(mydata$Group)
## X-squared = 57.805, df = 24, p-value = 0.0001295
# Pearson residuals per cell, rounded to 2 decimals, to see which cells
# deviate most from independence.
round(chi_square$residuals,2)
## as.factor(mydata$Group)
## mydata$SmallCash 1 2 3 4 5
## 1 -0.93 -1.02 0.64 -1.36 2.32
## 2 0.56 1.77 -0.56 -1.46 0.46
## 3 0.62 0.01 -0.36 -0.66 0.59
## 4 -0.29 -0.92 1.79 -0.66 -0.47
## 5 1.25 -0.33 0.66 -0.07 -1.61
## 6 -0.22 0.41 -0.01 1.44 -1.67
## 7 -0.95 -0.47 -1.59 4.43 -1.73
# Fisher's exact test on the SmallCash-by-Group table. The p-value is estimated
# by Monte Carlo simulation (B = 10000 tables) rather than computed exactly.
# Fix the RNG seed first so the simulated p-value is reproducible across runs.
set.seed(123)
fisher_result <- fisher.test(table(mydata$SmallCash, mydata$Group), simulate.p.value = TRUE, B = 10000)
# Display the result
fisher_result
##
## Fisher's Exact Test for Count Data with simulated p-value (based on
## 10000 replicates)
##
## data: table(mydata$SmallCash, mydata$Group)
## p-value = 2e-04
## alternative hypothesis: two.sided
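This is okay now, I think: both the chi-squared test (p ≈ 0.0001) and the simulated Fisher's exact test (p ≈ 0.0002) show that SmallCash differs between the clusters at the 5% level.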
# Within-cluster percentage of each SmallCash answer, drawn as dodged bars.
# The plot title previously read "of type of those who pay ..." (a leftover
# from the job-type plot); the stray "type of" has been removed.
library(tidyr)
library(dplyr)
library(ggplot2)
# Cluster (rows) by SmallCash (columns); margin = 1 rescales rows to proportions
table_clusters <- table(mydata$Group, mydata$SmallCash)
prop_table_clusters <- prop.table(table_clusters, margin = 1)
# Long format: Var1 = cluster, Var2 = answer, Freq = within-cluster proportion
prop_df <- as.data.frame(as.table(prop_table_clusters))
ggplot(prop_df, aes(x = Var1, y = Freq * 100, fill = Var2)) + # proportion -> percent
  geom_col(position = "dodge") +
  labs(
    x = "Group",
    y = "Percentage (%)",
    fill = "Category",
    title = "Percentage Distribution of those who pay with cash up to 10 EUR"
  ) +
  theme_minimal()
# Pearson chi-squared test of independence between DigitalEasy and cluster (Group).
# The warning below is chisq.test flagging that its approximation may be
# unreliable for this table.
chi_square <- chisq.test(mydata$DigitalEasy, as.factor(mydata$Group))
## Warning in chisq.test(mydata$DigitalEasy, as.factor(mydata$Group)): Chi-squared
## approximation may be incorrect
chi_square
##
## Pearson's Chi-squared test
##
## data: mydata$DigitalEasy and as.factor(mydata$Group)
## X-squared = 40.535, df = 24, p-value = 0.01872
# Pearson residuals per cell, rounded to 2 decimals, to see which cells
# deviate most from independence.
round(chi_square$residuals,2)
## as.factor(mydata$Group)
## mydata$DigitalEasy 1 2 3 4 5
## 1 -0.91 -0.73 -1.20 2.56 0.00
## 2 -0.91 -0.73 -0.36 2.56 -1.00
## 3 0.95 -0.47 -1.11 1.26 -0.58
## 4 1.00 0.45 0.21 -0.37 -1.10
## 5 0.37 -0.17 0.37 0.94 -1.69
## 6 0.34 1.03 0.43 -1.97 0.59
## 7 -0.86 -0.19 0.36 -1.21 1.82
# Fisher's exact test on the DigitalEasy-by-Group table. The p-value is estimated
# by Monte Carlo simulation (B = 10000 tables) rather than computed exactly.
# Fix the RNG seed first so the simulated p-value is reproducible across runs.
set.seed(123)
fisher_result <- fisher.test(table(mydata$DigitalEasy, mydata$Group), simulate.p.value = TRUE, B = 10000)
# Display the result
fisher_result
##
## Fisher's Exact Test for Count Data with simulated p-value (based on
## 10000 replicates)
##
## data: table(mydata$DigitalEasy, mydata$Group)
## p-value = 0.0177
## alternative hypothesis: two.sided
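This is okay: both the chi-squared test (p = 0.019) and the simulated Fisher's exact test (p ≈ 0.018) show that DigitalEasy differs between the clusters at the 5% level.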
# Within-cluster percentage of each DigitalEasy answer, drawn as dodged bars
library(tidyr)
library(dplyr)
library(ggplot2)
# Cluster (rows) by DigitalEasy (columns); margin = 1 rescales rows to proportions
table_clusters <- table(mydata$Group, mydata$DigitalEasy)
prop_table_clusters <- prop.table(table_clusters, margin = 1)
# Long format: Var1 = cluster, Var2 = answer, Freq = within-cluster proportion
prop_df <- prop_table_clusters %>%
  as.table() %>%
  as.data.frame()
ggplot(prop_df, aes(Var1, Freq * 100, fill = Var2)) + # proportion -> percent
  geom_col(position = "dodge") +
  labs(
    title = "Percentage Distribution of those who prefer digital over cash up to 10 EUR",
    x = "Group",
    y = "Percentage (%)",
    fill = "Category"
  ) +
  theme_minimal()
# Pearson chi-squared test of independence between ConvenientFlik and cluster (Group).
# The warning below is chisq.test flagging that its approximation may be
# unreliable for this table.
chi_square <- chisq.test(mydata$ConvenientFlik, as.factor(mydata$Group))
## Warning in chisq.test(mydata$ConvenientFlik, as.factor(mydata$Group)):
## Chi-squared approximation may be incorrect
chi_square
##
## Pearson's Chi-squared test
##
## data: mydata$ConvenientFlik and as.factor(mydata$Group)
## X-squared = 40.301, df = 24, p-value = 0.01985
# Pearson residuals per cell, rounded to 2 decimals, to see which cells
# deviate most from independence.
round(chi_square$residuals,2)
## as.factor(mydata$Group)
## mydata$ConvenientFlik 1 2 3 4 5
## 1 -1.41 -1.13 0.30 2.43 -0.90
## 2 0.44 -0.40 -1.00 1.99 -1.08
## 3 -1.00 1.70 -0.55 1.30 -1.10
## 4 0.41 -0.65 -0.14 0.04 0.22
## 5 0.90 -0.41 0.68 0.00 -1.34
## 6 -0.64 0.74 0.93 -0.50 -0.53
## 7 0.46 0.11 -0.48 -2.09 2.37
# Fisher's exact test on the ConvenientFlik-by-Group table. The p-value is
# estimated by Monte Carlo simulation (B = 10000 tables) rather than computed
# exactly.
# Fix the RNG seed first so the simulated p-value is reproducible across runs.
set.seed(123)
fisher_result <- fisher.test(table(mydata$ConvenientFlik, mydata$Group), simulate.p.value = TRUE, B = 10000)
# Display the result
fisher_result
##
## Fisher's Exact Test for Count Data with simulated p-value (based on
## 10000 replicates)
##
## data: table(mydata$ConvenientFlik, mydata$Group)
## p-value = 0.013
## alternative hypothesis: two.sided
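This is okay now: both the chi-squared test (p = 0.020) and the simulated Fisher's exact test (p ≈ 0.013) show that ConvenientFlik differs between the clusters at the 5% level.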
# Within-cluster percentage of each ConvenientFlik answer, drawn as dodged bars
library(tidyr)
library(dplyr)
library(ggplot2)
# Cluster (rows) by ConvenientFlik (columns); margin = 1 rescales rows to proportions
table_clusters <- table(mydata$Group, mydata$ConvenientFlik)
prop_table_clusters <- prop.table(table_clusters, margin = 1)
# Long format: Var1 = cluster, Var2 = answer, Freq = within-cluster proportion
prop_df <- prop_table_clusters %>%
  as.table() %>%
  as.data.frame()
ggplot(prop_df, aes(Var1, Freq * 100, fill = Var2)) + # proportion -> percent
  geom_col(position = "dodge") +
  labs(
    title = "Percentage Distribution of those who use mobile payment platforms due to convenience",
    x = "Group",
    y = "Percentage (%)",
    fill = "Category"
  ) +
  theme_minimal()
# Pearson chi-squared test of independence between FriendsFlik and cluster (Group).
# The warning below is chisq.test flagging that its approximation may be
# unreliable for this table.
chi_square <- chisq.test(mydata$FriendsFlik, as.factor(mydata$Group))
## Warning in chisq.test(mydata$FriendsFlik, as.factor(mydata$Group)): Chi-squared
## approximation may be incorrect
chi_square
##
## Pearson's Chi-squared test
##
## data: mydata$FriendsFlik and as.factor(mydata$Group)
## X-squared = 34.693, df = 24, p-value = 0.07308
# Pearson residuals per cell, rounded to 2 decimals, to see which cells
# deviate most from independence.
round(chi_square$residuals,2)
## as.factor(mydata$Group)
## mydata$FriendsFlik 1 2 3 4 5
## 1 -0.52 -1.03 0.67 0.39 0.00
## 2 -1.08 -0.86 0.00 2.56 -1.18
## 3 1.34 0.51 -1.23 -0.02 -0.10
## 4 -1.08 1.45 -0.71 0.25 0.51
## 5 0.66 -0.52 1.24 -0.04 -1.67
## 6 -1.18 -0.52 0.45 0.60 0.26
## 7 0.52 0.84 -0.73 -1.81 1.77
# Fisher's exact test on the FriendsFlik-by-Group table. The p-value is estimated
# by Monte Carlo simulation (B = 10000 tables) rather than computed exactly.
# Fix the RNG seed first so the simulated p-value is reproducible across runs.
set.seed(123)
fisher_result <- fisher.test(table(mydata$FriendsFlik, mydata$Group), simulate.p.value = TRUE, B = 10000)
# Display the result
fisher_result
##
## Fisher's Exact Test for Count Data with simulated p-value (based on
## 10000 replicates)
##
## data: table(mydata$FriendsFlik, mydata$Group)
## p-value = 0.07609
## alternative hypothesis: two.sided
# Within-cluster percentage of each FriendsFlik answer, drawn as dodged bars.
# The plot title previously read "Distribution of use mobile ..."; it now reads
# "of those who use ...", in line with the other plot titles.
library(tidyr)
library(dplyr)
library(ggplot2)
# Cluster (rows) by FriendsFlik (columns); margin = 1 rescales rows to proportions
table_clusters <- table(mydata$Group, mydata$FriendsFlik)
prop_table_clusters <- prop.table(table_clusters, margin = 1)
# Long format: Var1 = cluster, Var2 = answer, Freq = within-cluster proportion
prop_df <- as.data.frame(as.table(prop_table_clusters))
ggplot(prop_df, aes(x = Var1, y = Freq * 100, fill = Var2)) + # proportion -> percent
  geom_col(position = "dodge") +
  labs(
    x = "Group",
    y = "Percentage (%)",
    fill = "Category",
    title = "Percentage Distribution of those who use mobile digital payment platforms due to recommendations"
  ) +
  theme_minimal()