#install.packages(ggplot2)
library(ggplot2)
#install.packages("dplyr")
library(dplyr)
#install.packages("Hmisc")
library(Hmisc)
#install.packages("factoextra")
library(factoextra)
#install.packages("cluster")
library(cluster)
#install.packages("magrittr")
library(magrittr)
#install.packages("NbClust")
library(NbClust)
#install.packages("tidyr")
library(tidyr)
#install.packages("rstatix")
library(rstatix)
data <- read.table("./anketa_final_6.csv", header=TRUE, sep=",", dec=".")
head(data)
## ID Q47a Q47b Q47c Q47d Q21 Q50a Q50b Q50c Q50d Q51 Q52a Q52b Q52c Q52d Q52e
## 1 1 1 1 0 0 4 2 2 6 6 4 2 4 2 1 1
## 2 2 1 0 0 0 7 2 6 6 6 5 1 1 1 1 1
## 3 3 1 1 0 0 7 1 2 6 6 4 1 1 1 1 1
## 4 4 1 1 1 0 7 1 2 4 6 1 1 3 1 1 1
## 5 5 1 0 1 0 7 1 6 2 6 5 1 3 1 1 1
## 6 6 1 1 0 1 6 2 1 6 4 3 2 4 2 2 2
## Q52f Q52g Q53a_1 Q53b_1 Q53c_1 Q53d_1 Q54a Q54b Q54c Q54d Q55a Q55b Q55c Q55d
## 1 2 2 4 4 5 3 1 1 0 0 1 0 1 0
## 2 1 1 1 7 7 7 0 0 0 1 1 1 1 1
## 3 1 1 1 7 7 7 0 1 0 0 1 1 1 0
## 4 1 1 3 6 7 7 0 1 0 0 1 1 0 0
## 5 1 1 1 7 7 7 1 1 0 0 1 0 0 0
## 6 1 2 6 5 2 2 1 1 0 0 0 1 1 0
## Q55e Q55f Q55g Q56 Q57 Q58 Q59 Q60a Q60b Q60c Q60d Q60e Q60f Q60g Q60g_text
## 1 0 0 0 5 6 3 5 -2 -2 -2 -2 -2 -2 -2 -2
## 2 1 1 0 7 1 1 1 0 0 0 1 1 -2 0 -2
## 3 1 1 0 6 3 5 1 1 1 1 1 0 -2 0 -2
## 4 1 0 0 5 6 6 4 0 1 1 0 0 -2 0 -2
## 5 0 0 0 5 1 5 6 -2 -2 -2 -2 -2 -2 -2 -2
## 6 0 0 0 2 5 6 6 -2 -2 -2 -2 -2 -2 -2 -2
## Q61a Q61b Q61c Q61d Q61e Q61f Q61f_text Q62 Q63a_1 Q63b_1 Q63c_1 Q63d_1
## 1 1 0 0 -2 -2 0 -2 2 -2 -2 -2 -2
## 2 0 0 0 -2 1 0 -2 2 -2 -2 -2 -2
## 3 1 0 0 -2 -2 0 -2 2 -2 -2 -2 -2
## 4 1 0 0 -2 -2 0 -2 1 6 4 6 7
## 5 0 0 1 -2 -2 0 -2 2 -2 -2 -2 -2
## 6 1 0 1 -2 -2 0 -2 2 -2 -2 -2 -2
## Q63e_1 Q63f_1 Q64 Q65 Q46 Q1a_1 Q1b_1 Q1c_1 Q1d_1 Q1e_1 Q1f_1 Q2a_1 Q2b_1
## 1 -2 -2 3 1 66 4 6 6 6 5 6 6 6
## 2 -2 -2 1 1 0 7 7 7 7 7 7 7 1
## 3 -2 -2 5 3 16 7 7 7 7 7 7 7 5
## 4 6 7 5 4 70 5 7 7 7 3 3 7 6
## 5 -2 -2 1 1 49 7 7 7 7 7 7 7 1
## 6 -2 -2 6 6 85 6 6 6 5 5 6 7 6
## Q2c_1 Q3a_1 Q3b_1 Q3c_1 Q4a_1 Q4b_1 Q4c_1 Q5a_1 Q5b_1 Q5c_1 Q6a_1 Q6b_1 Q6c_1
## 1 6 5 6 4 5 5 4 5 5 4 7 5 5
## 2 1 7 1 1 7 1 1 7 1 1 7 1 1
## 3 3 7 7 5 6 7 1 7 7 1 7 5 3
## 4 6 6 7 7 7 6 5 7 6 5 6 6 6
## 5 7 7 1 7 7 1 7 7 1 7 7 1 7
## 6 5 7 6 6 7 6 6 7 6 6 7 6 6
## Q7a_1 Q7b_1 Q7c_1 Q39 Q40 Q37 Q38 Q41 Q42 Q42_5_text Q43 Q44 Q45 Q45_10_text
## 1 6 4 4 1 1958 1 3 4 3 -2 -2 2 6 -2
## 2 7 1 1 2 1942 1 1 2 3 -2 -2 3 7 -2
## 3 7 3 3 2 1953 2 -2 4 3 -2 -2 3 1 -2
## 4 3 5 6 1 1948 5 -2 3 3 -2 -2 3 1 -2
## 5 7 1 7 1 1953 5 -2 5 3 -2 -2 3 4 -2
## 6 7 6 6 1 1955 1 4 5 3 -2 -2 3 1 -2
data_seg <- as.data.frame(data[c("ID","Q51", "Q53a_1", "Q53b_1", "Q53c_1", "Q53d_1", "Q56", "Q58")])
summary(data_seg[,-1])
## Q51 Q53a_1 Q53b_1 Q53c_1
## Min. :1.000 Min. :1.000 Min. :1.000 Min. :1.000
## 1st Qu.:3.000 1st Qu.:4.000 1st Qu.:2.000 1st Qu.:1.000
## Median :3.000 Median :5.000 Median :3.000 Median :2.000
## Mean :3.481 Mean :4.877 Mean :3.642 Mean :2.877
## 3rd Qu.:4.000 3rd Qu.:6.000 3rd Qu.:6.000 3rd Qu.:4.000
## Max. :5.000 Max. :7.000 Max. :7.000 Max. :7.000
## Q53d_1 Q56 Q58
## Min. :1.000 Min. :1.000 Min. :1.000
## 1st Qu.:1.000 1st Qu.:3.000 1st Qu.:5.000
## Median :2.000 Median :5.000 Median :6.000
## Mean :3.113 Mean :4.575 Mean :5.594
## 3rd Qu.:5.000 3rd Qu.:6.000 3rd Qu.:6.000
## Max. :7.000 Max. :7.000 Max. :7.000
data_seg_std <- as.data.frame(scale(data_seg[c("Q51", "Q53a_1", "Q53b_1", "Q53c_1", "Q53d_1", "Q56", "Q58")]))
head(data_seg_std)
## Q51 Q53a_1 Q53b_1 Q53c_1 Q53d_1 Q56 Q58
## 1 0.5358171 -0.4348621 0.1583175 1.0352670 -0.05133481 0.2325582 -1.9729472
## 2 1.5684828 -1.9218101 1.4831852 2.0107185 1.76249520 1.3281658 -3.4939101
## 3 0.5358171 -1.9218101 1.4831852 2.0107185 1.76249520 0.7803620 -0.4519843
## 4 -2.5621799 -0.9305115 1.0415627 2.0107185 1.76249520 0.2325582 0.3084972
## 5 1.5684828 -1.9218101 1.4831852 2.0107185 1.76249520 0.2325582 -0.4519843
## 6 -0.4968486 0.5564365 0.5999401 -0.4279103 -0.50479231 -1.4108531 0.3084972
data_seg$Dissimilarity = sqrt(data_seg_std$Q51^2 + data_seg_std$Q53a_1^2 + data_seg_std$Q53b_1^2 + data_seg_std$Q53c_1^2 + data_seg_std$Q53d_1^2 + data_seg_std$Q56^2 + data_seg_std$Q58^2)
head(data_seg[order(-data_seg$Dissimilarity), c("ID", "Dissimilarity")], 15)
## ID Dissimilarity
## 2 2 5.429009
## 92 92 4.746060
## 63 63 4.599919
## 27 27 4.294490
## 78 78 4.276532
## 4 4 3.976695
## 106 106 3.974940
## 5 5 3.970023
## 89 89 3.937557
## 3 3 3.760708
## 83 83 3.687591
## 60 60 3.677529
## 90 90 3.415806
## 88 88 3.415193
## 47 47 3.339925
data <- data %>%
filter(!ID %in% c("2"))
data <- data %>%
mutate(ID = row_number())
data_seg <- as.data.frame(data[c("ID","Q51", "Q53a_1", "Q53b_1", "Q53c_1", "Q53d_1", "Q56", "Q58")])
data_seg_std <- as.data.frame(scale(data_seg[c(2:8)]))
head(data_seg_std)
## Q51 Q53a_1 Q53b_1 Q53c_1 Q53d_1 Q56 Q58
## 1 0.5547258 -0.4592293 0.1734642 1.0703936 -0.03490949 0.2461331 -2.1252565
## 2 0.5547258 -1.9660753 1.5061769 2.0606256 1.79783885 0.7960050 -0.5140512
## 3 -2.5656070 -0.9615113 1.0619393 2.0606256 1.79783885 0.2461331 0.2915514
## 4 1.5948368 -1.9660753 1.5061769 2.0606256 1.79783885 0.2461331 -0.5140512
## 5 -0.4853851 0.5453348 0.6177018 -0.4149544 -0.49309658 -1.4034825 0.2915514
## 6 -1.5254961 -1.4637933 1.5061769 0.5752776 1.79783885 0.7960050 0.2915514
get_clust_tendency(data_seg_std,
n = nrow(data_seg_std) - 1,
graph = FALSE)
## $hopkins_stat
## [1] 0.6028032
##
## $plot
## NULL
Distance <- get_dist(data_seg_std,
method = "euclidian")
fviz_dist(Distance,
gradient = list(low = "darkred",
mid = "grey95",
high = "white"))
fviz_nbclust(data_seg_std, kmeans, method = "wss") +
labs(subtitle = "Elbow method")
fviz_nbclust(data_seg_std, kmeans, method = "silhouette")+
labs(subtitle = "Silhouette analysis")
NbClust(data_seg_std,
distance = "euclidean",
min.nc = 2, max.nc = 10,
method = "kmeans",
index = "all")
## *** : The Hubert index is a graphical method of determining the number of clusters.
## In the plot of Hubert index, we seek a significant knee that corresponds to a
## significant increase of the value of the measure i.e the significant peak in Hubert
## index second differences plot.
##
## *** : The D index is a graphical method of determining the number of clusters.
## In the plot of D index, we seek a significant knee (the significant peak in Dindex
## second differences plot) that corresponds to a significant increase of the value of
## the measure.
##
## *******************************************************************
## * Among all indices:
## * 10 proposed 2 as the best number of clusters
## * 5 proposed 3 as the best number of clusters
## * 2 proposed 7 as the best number of clusters
## * 1 proposed 9 as the best number of clusters
## * 6 proposed 10 as the best number of clusters
##
## ***** Conclusion *****
##
## * According to the majority rule, the best number of clusters is 2
##
##
## *******************************************************************
## $All.index
## KL CH Hartigan CCC Scott Marriot TrCovW TraceW
## 2 5.8231 50.4280 17.2340 -1.1697 144.2888 1.250154e+13 6064.080 488.7244
## 3 1.2767 37.6804 12.2854 -1.5673 263.6807 9.022453e+12 4354.009 418.6722
## 4 0.6503 31.9249 12.2267 -2.0020 333.3147 8.263921e+12 3759.051 373.6658
## 5 2.2571 29.6029 7.8224 -1.7001 374.4693 8.725395e+12 2598.760 333.3159
## 6 0.5725 26.8283 8.8850 -1.7292 444.5704 6.444654e+12 2290.061 309.1341
## 7 14.2482 25.5832 4.1178 -1.3979 483.4065 6.059851e+12 1890.003 283.6750
## 8 0.0684 23.1989 7.9438 -1.9053 525.8342 5.283955e+12 1761.571 272.2360
## 9 0.5696 22.7178 11.6443 -1.3978 577.7530 4.078689e+12 1460.604 251.6290
## 10 1.7603 23.6874 7.7622 0.3559 708.3057 1.452285e+12 1303.030 224.4093
## Friedman Rubin Cindex DB Silhouette Duda Pseudot2 Beale Ratkowsky
## 2 5.5742 1.4896 0.3767 1.4160 0.2968 1.3614 -18.0518 -1.1818 0.3848
## 3 8.0354 1.7388 0.4052 1.7158 0.2097 1.3050 -13.5572 -1.0419 0.3631
## 4 8.9837 1.9483 0.3892 1.5854 0.2153 0.9076 4.7877 0.4529 0.3440
## 5 9.5475 2.1841 0.4175 1.5594 0.2047 1.4236 -9.2247 -1.2969 0.3289
## 6 11.7865 2.3550 0.4013 1.5719 0.1785 1.3557 -8.1340 -1.1350 0.3089
## 7 12.3394 2.5663 0.3800 1.5390 0.1832 2.0520 -16.9181 -2.1848 0.2949
## 8 13.8453 2.6742 0.3770 1.4809 0.1872 1.5993 -5.2459 -1.4970 0.2793
## 9 15.2710 2.8931 0.3812 1.4448 0.1870 0.7786 4.2659 1.2302 0.2694
## 10 21.6004 3.2441 0.3369 1.3719 0.2084 1.0064 -0.0765 -0.0259 0.2623
## Ball Ptbiserial Frey McClain Dunn Hubert SDindex Dindex SDbw
## 2 244.3622 0.5361 1.6236 0.6234 0.1274 0.0029 1.4482 2.0363 0.9433
## 3 139.5574 0.4672 0.1422 1.3240 0.1245 0.0034 1.6879 1.8644 0.7460
## 4 93.4165 0.4950 0.7674 1.5628 0.1280 0.0038 1.5572 1.7664 0.5824
## 5 66.6632 0.4596 0.4319 2.2372 0.1074 0.0040 1.5684 1.6678 0.5271
## 6 51.5223 0.4442 0.3317 2.7789 0.1266 0.0042 1.6926 1.6087 0.5075
## 7 40.5250 0.4326 0.0876 3.2993 0.1091 0.0044 1.6455 1.5273 0.4767
## 8 34.0295 0.4338 0.1301 3.3316 0.1993 0.0045 1.5314 1.4963 0.4557
## 9 27.9588 0.4360 0.0768 3.5227 0.2084 0.0048 1.4508 1.4449 0.4199
## 10 22.4409 0.4496 0.1131 3.9167 0.2073 0.0049 1.5063 1.3651 0.3964
##
## $All.CriticalValues
## CritValue_Duda CritValue_PseudoT2 Fvalue_Beale
## 2 0.6579 35.3646 1.0000
## 3 0.6638 29.3788 1.0000
## 4 0.6515 25.1406 0.8678
## 5 0.5737 23.0370 1.0000
## 6 0.5494 25.4245 1.0000
## 7 0.5070 32.0944 1.0000
## 8 0.3729 23.5412 1.0000
## 9 0.5494 12.3022 0.2911
## 10 0.4004 17.9674 1.0000
##
## $Best.nc
## KL CH Hartigan CCC Scott Marriot TrCovW
## Number_clusters 7.0000 2.000 3.0000 10.0000 10.0000 3.000000e+00 3.000
## Value_Index 14.2482 50.428 4.9485 0.3559 130.5526 2.720555e+12 1710.071
## TraceW Friedman Rubin Cindex DB Silhouette Duda
## Number_clusters 3.0000 10.0000 7.0000 10.0000 10.0000 2.0000 2.0000
## Value_Index 25.0458 6.3295 -0.1035 0.3369 1.3719 0.2968 1.3614
## PseudoT2 Beale Ratkowsky Ball PtBiserial Frey McClain
## Number_clusters 2.0000 2.0000 2.0000 3.0000 2.0000 2.0000 2.0000
## Value_Index -18.0518 -1.1818 0.3848 104.8048 0.5361 1.6236 0.6234
## Dunn Hubert SDindex Dindex SDbw
## Number_clusters 9.0000 0 2.0000 0 10.0000
## Value_Index 0.2084 0 1.4482 0 0.3964
##
## $Best.partition
## [1] 1 1 1 1 2 1 2 2 2 2 2 2 1 2 1 2 2 2 1 2 1 2 2 1 2 2 2 1 2 2 2 1 1 1 2 2 2
## [38] 2 2 2 2 2 1 1 2 1 1 2 2 2 1 2 2 2 1 2 1 2 1 2 1 1 2 2 2 2 1 2 1 2 2 2 2 1
## [75] 2 2 1 2 1 2 2 1 1 1 1 2 2 1 2 2 1 2 2 2 1 2 2 1 1 1 2 2 1 2 1
Clustering <- kmeans(data_seg_std,
centers = 2,
nstart = 25)
Clustering
## K-means clustering with 2 clusters of sizes 38, 67
##
## Cluster means:
## Q51 Q53a_1 Q53b_1 Q53c_1 Q53d_1 Q56 Q58
## 1 -0.3485284 -0.7896780 1.0151775 0.9922174 0.7850042 0.5644800 -0.5776514
## 2 0.1976728 0.4478771 -0.5757723 -0.5627502 -0.4452263 -0.3201528 0.3276232
##
## Clustering vector:
## [1] 1 1 1 1 2 1 2 2 2 2 2 2 1 2 1 2 2 2 1 2 1 2 2 1 2 2 2 1 2 2 2 1 1 1 2 2 2
## [38] 2 2 2 2 2 1 1 2 1 1 2 2 2 1 2 2 2 1 2 2 2 1 2 1 1 2 2 2 2 2 2 1 2 2 2 2 1
## [75] 2 2 1 2 1 2 2 1 1 1 1 2 2 1 2 2 1 2 2 2 2 2 2 1 1 1 2 2 1 2 1
##
## Within cluster sum of squares by cluster:
## [1] 215.2432 272.8390
## (between_SS / total_SS = 33.0 %)
##
## Available components:
##
## [1] "cluster" "centers" "totss" "withinss" "tot.withinss"
## [6] "betweenss" "size" "iter" "ifault"
fviz_cluster(Clustering,
palette = "Set1",
repel = TRUE,
ggtheme = theme_bw(),
labelsize = 8,
data = data_seg_std)
# Silhouette analysis
dist_matrix <- dist(data_seg_std, method = "euclidean")
silhouette_scores <- silhouette(Clustering$cluster, dist_matrix)
# Extract silhouette widths
sil_values <- silhouette_scores[, 3]
# Identify low silhouette points
low_sil_points <- which(sil_values < 0.05) # You can adjust the threshold
print(length(low_sil_points))
## [1] 6
print(low_sil_points)
## [1] 13 15 33 51 83 98
data <- data %>%
filter(!ID %in% c("93", "76", "16", "66", "28", "3", "77", "91", "62", "13", "15", "33", "51", "83", "98"))
data <- data %>%
mutate(ID = row_number())
data_seg <- as.data.frame(data[c("ID","Q51", "Q53a_1", "Q53b_1", "Q53c_1", "Q53d_1", "Q56", "Q58")])
data_seg_std <- as.data.frame(scale(data_seg[c(2:8)]))
head(data_seg_std)
## Q51 Q53a_1 Q53b_1 Q53c_1 Q53d_1 Q56
## 1 0.5447743 -0.524308838 0.2470272 1.3050474 -0.02071165 0.3491143
## 2 0.5447743 -2.079950446 1.6081976 2.3875752 1.84333700 0.9003475
## 3 1.6849995 -2.079950446 1.6081976 2.3875752 1.84333700 0.3491143
## 4 -0.5954510 0.512785567 0.7007507 -0.3187443 -0.48672381 -1.3045851
## 5 -1.7356762 -1.561403244 1.6081976 0.7637835 1.84333700 0.9003475
## 6 1.6849995 -0.005761636 -0.6604197 -0.3187443 -0.48672381 0.3491143
## Q58
## 1 -2.3438372
## 2 -0.6218344
## 3 -0.6218344
## 4 0.2391671
## 5 0.2391671
## 6 -0.6218344
Clustering <- kmeans(data_seg_std,
centers = 2,
nstart = 25)
Clustering
## K-means clustering with 2 clusters of sizes 27, 63
##
## Cluster means:
## Q51 Q53a_1 Q53b_1 Q53c_1 Q53d_1 Q56 Q58
## 1 -0.3842981 -1.0236506 1.1544741 1.1647197 0.9630918 0.5532747 -0.6856122
## 2 0.1646992 0.4387074 -0.4947746 -0.4991656 -0.4127536 -0.2371177 0.2938338
##
## Clustering vector:
## [1] 1 1 1 2 1 2 2 2 2 2 2 2 2 2 1 2 1 2 2 1 2 2 2 2 2 2 1 1 2 2 2 2 2 2 2 2 1 1
## [39] 2 1 1 2 2 2 2 2 2 1 2 2 2 1 2 1 2 2 2 2 2 1 2 2 2 2 1 2 2 1 2 2 1 1 1 2 2 1
## [77] 2 2 2 2 2 2 2 1 1 2 2 1 2 1
##
## Within cluster sum of squares by cluster:
## [1] 140.0226 267.4152
## (between_SS / total_SS = 34.6 %)
##
## Available components:
##
## [1] "cluster" "centers" "totss" "withinss" "tot.withinss"
## [6] "betweenss" "size" "iter" "ifault"
fviz_cluster(Clustering,
palette = "Set1",
repel = TRUE,
ggtheme = theme_bw(),
labelsize = 8,
data = data_seg_std)
data <- data %>%
filter(!ID %in% c("22", "80", "49", "3", "54"))
data <- data %>%
mutate(ID = row_number())
data_seg <- as.data.frame(data[c("ID","Q51", "Q53a_1", "Q53b_1", "Q53c_1", "Q53d_1", "Q56", "Q58")])
data_seg_std <- as.data.frame(scale(data_seg[c(2:8)]))
head(data_seg_std)
## Q51 Q53a_1 Q53b_1 Q53c_1 Q53d_1 Q56
## 1 0.5426290 -0.55068417 0.2514460 1.3788762 -0.02741576 0.3734578
## 2 0.5426290 -2.14641671 1.6453311 2.5004502 1.83685610 0.9207665
## 3 -0.6104577 0.51313752 0.7160743 -0.3034847 -0.49348373 -1.2684686
## 4 -1.7635444 -1.61450586 1.6453311 0.8180892 1.83685610 0.9207665
## 5 1.6957157 -0.01877332 -0.6778108 -0.3034847 -0.49348373 0.3734578
## 6 -0.6104577 -1.61450586 -0.6778108 -0.8642717 -0.49348373 0.9207665
## Q58
## 1 -2.7318215
## 2 -0.7886849
## 3 0.1828834
## 4 0.1828834
## 5 -0.7886849
## 6 0.1828834
Clustering <- kmeans(data_seg_std,
centers = 2,
nstart = 25)
Clustering
## K-means clustering with 2 clusters of sizes 59, 26
##
## Cluster means:
## Q51 Q53a_1 Q53b_1 Q53c_1 Q53d_1 Q56 Q58
## 1 0.2299276 0.4860912 -0.4966845 -0.493582 -0.4144892 -0.2202331 0.3804906
## 2 -0.5217587 -1.1030531 1.1270918 1.120051 0.9405716 0.4997598 -0.8634209
##
## Clustering vector:
## [1] 2 2 1 2 1 1 1 1 1 1 1 1 1 2 1 2 1 1 2 1 1 1 1 1 2 2 1 1 1 1 1 1 1 1 2 2 1 2
## [39] 2 1 1 1 1 1 1 2 1 1 2 2 1 1 1 1 1 2 1 1 1 1 2 1 1 2 1 1 2 2 2 1 1 2 1 1 1 1
## [77] 1 1 2 2 1 1 2 1 2
##
## Within cluster sum of squares by cluster:
## [1] 228.3987 138.8358
## (between_SS / total_SS = 37.5 %)
##
## Available components:
##
## [1] "cluster" "centers" "totss" "withinss" "tot.withinss"
## [6] "betweenss" "size" "iter" "ifault"
fviz_cluster(Clustering,
palette = "Set1",
repel = TRUE,
ggtheme = theme_bw(),
labelsize = 8,
data = data_seg_std)
Averages <- Clustering$centers
Averages
## Q51 Q53a_1 Q53b_1 Q53c_1 Q53d_1 Q56 Q58
## 1 0.2299276 0.4860912 -0.4966845 -0.493582 -0.4144892 -0.2202331 0.3804906
## 2 -0.5217587 -1.1030531 1.1270918 1.120051 0.9405716 0.4997598 -0.8634209
Figure <- as.data.frame(Averages)
Figure$id <- 1:nrow(Figure)
Figure <- pivot_longer(Figure, cols = c("Q51", "Q53a_1", "Q53b_1", "Q53c_1", "Q53d_1", "Q56", "Q58"))
Figure$Group <- factor(Figure$id,
levels = c(1, 2),
labels = c("1", "2"))
Figure$ImeF <- factor(Figure$name,
levels = c("Q51", "Q53a_1", "Q53b_1", "Q53c_1", "Q53d_1", "Q56", "Q58"),
labels = c("Q51", "Q53a_1", "Q53b_1", "Q53c_1", "Q53d_1", "Q56", "Q58"))
library(ggplot2)
ggplot(Figure, aes(x = ImeF, y = value)) +
geom_hline(yintercept = 0) +
theme_bw() +
geom_point(aes(shape = Group, col = Group), size = 3) +
geom_line(aes(group = id), linewidth = 1) +
ylab("Averages") +
xlab("Cluster variables") +
scale_color_brewer(palette="Set1") +
ylim(-1.5, 1.5) +
theme(axis.text.x = element_text(angle = 45, vjust = 0.50, size = 10))
data$Group <- Clustering$cluster
data_seg$Group <- Clustering$cluster
fit <- aov(cbind(Q51, Q53a_1, Q53b_1, Q53c_1, Q53d_1, Q56, Q58) ~ as.factor(Group),
data = data)
summary(fit)
## Response Q51 :
## Df Sum Sq Mean Sq F value Pr(>F)
## as.factor(Group) 1 7.669 7.6693 11.468 0.001084 **
## Residuals 83 55.507 0.6688
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Response Q53a_1 :
## Df Sum Sq Mean Sq F value Pr(>F)
## as.factor(Group) 1 161.09 161.085 98.448 9.402e-16 ***
## Residuals 83 135.81 1.636
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Response Q53b_1 :
## Df Sum Sq Mean Sq F value Pr(>F)
## as.factor(Group) 1 220.42 220.418 108.45 < 2.2e-16 ***
## Residuals 83 168.69 2.032
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Response Q53c_1 :
## Df Sum Sq Mean Sq F value Pr(>F)
## as.factor(Group) 1 149.42 149.424 105.39 < 2.2e-16 ***
## Residuals 83 117.68 1.418
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Response Q53d_1 :
## Df Sum Sq Mean Sq F value Pr(>F)
## as.factor(Group) 1 152.56 152.555 54.076 1.24e-10 ***
## Residuals 83 234.15 2.821
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Response Q56 :
## Df Sum Sq Mean Sq F value Pr(>F)
## as.factor(Group) 1 31.232 31.2319 10.403 0.001801 **
## Residuals 83 249.192 3.0023
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Response Q58 :
## Df Sum Sq Mean Sq F value Pr(>F)
## as.factor(Group) 1 29.583 29.5828 41.332 7.709e-09 ***
## Residuals 83 59.405 0.7157
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
kruskal.test(Q50a ~ Group,
data = data)
##
## Kruskal-Wallis rank sum test
##
## data: Q50a by Group
## Kruskal-Wallis chi-squared = 22.579, df = 1, p-value = 2.017e-06
kruskal_effsize(Q50a ~ Group,
data = data)
## # A tibble: 1 × 5
## .y. n effsize method magnitude
## * <chr> <int> <dbl> <chr> <ord>
## 1 Q50a 85 0.260 eta2[H] large
data %>%
group_by(Group) %>%
shapiro_test(Q46)
## # A tibble: 2 × 4
## Group variable statistic p
## <int> <chr> <dbl> <dbl>
## 1 1 Q46 0.837 0.00000153
## 2 2 Q46 0.955 0.296
kruskal.test(Q46 ~ Group,
data = data)
##
## Kruskal-Wallis rank sum test
##
## data: Q46 by Group
## Kruskal-Wallis chi-squared = 34.575, df = 1, p-value = 4.101e-09
kruskal_effsize(Q46 ~ Group,
data = data)
## # A tibble: 1 × 5
## .y. n effsize method magnitude
## * <chr> <int> <dbl> <chr> <ord>
## 1 Q46 85 0.405 eta2[H] large
The clustering process aimed to segment participants based on their responses to key survey questions, specifically targeting variables related to payment methods, financial understanding, and trust in digital transactions. A 2-cluster solution was chosen based on the silhouette method, within-cluster sum of squares, and other clustering metrics. Outliers were iteratively removed to improve clustering consistency and structure.
Q51, Q53a_1, Q53b_1,
Q53c_1, Q53d_1, Q56,
Q58).Q53b_1,
Q53c_1).Q51, Q53a_1).This segmentation provides actionable insights for tailoring strategies to improve digital payment adoption among the targeted groups, addressing both intrinsic and extrinsic motivators specific to each cluster.