# Load dataset from Excel file
library(readxl)
mydata <- read_excel("C:/Users/Tamara/Desktop/R data/DataPerception.xlsx")
# Display first few rows
head(mydata)
## # A tibble: 6 × 22
## ID Q2a_1 Q2b_1 Q2c_1 Q3a_1 Q3b_1 Q3c_1 Q4a_1 Q4b_1 Q4c_1 Q5a_1 Q5b_1 Q5c_1
## <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 1 5 6 5 7 7 5 7 7 5 7 7 6
## 2 2 7 7 6 2 6 6 4 7 7 3 6 6
## 3 3 7 6 6 6 6 6 7 7 5 6 7 5
## 4 4 7 3 5 6 6 3 3 6 5 3 6 6
## 5 5 6 5 5 5 5 6 6 6 7 4 6 7
## 6 6 6 6 6 5 7 6 6 7 5 7 7 5
## # ℹ 9 more variables: Q6a_1 <dbl>, Q6b_1 <dbl>, Q6c_1 <dbl>, Q7a_1 <dbl>,
## # Q7b_1 <dbl>, Q7c_1 <dbl>, Age <dbl>, Gender <dbl>, Location <dbl>
colnames(mydata)
## [1] "ID" "Q2a_1" "Q2b_1" "Q2c_1" "Q3a_1" "Q3b_1"
## [7] "Q3c_1" "Q4a_1" "Q4b_1" "Q4c_1" "Q5a_1" "Q5b_1"
## [13] "Q5c_1" "Q6a_1" "Q6b_1" "Q6c_1" "Q7a_1" "Q7b_1"
## [19] "Q7c_1" "Age" "Gender" "Location"
mydata$GenderFactor <- factor(mydata$Gender,
levels = c(1, 2),
labels = c("Male", "Female"))
mydata$LocationFactor <- factor(mydata$Location,
levels = c(1, 2, 3),
labels = c("Urban", "Suburban", "Rural"))
For the purpose of clustering, I chose 6 cluster variables:!!!
#Saving standardized cluster variables into new data frame
mydata_clu_new <- as.data.frame(scale(mydata[c("Q2a_1", "Q3a_1", "Q4a_1", "Q5a_1", "Q6a_1", "Q7a_1")]))
#Finding outliers
mydata$Dissimilarity <- sqrt(mydata_clu_new$Q2a_1^2 + mydata_clu_new$Q3a_1^2 + mydata_clu_new$Q4a_1^2 + mydata_clu_new$Q5a_1 + mydata_clu_new$Q6a_1^2 + mydata_clu_new$Q7a_1^2)
## Warning in sqrt(mydata_clu_new$Q2a_1^2 + mydata_clu_new$Q3a_1^2 +
## mydata_clu_new$Q4a_1^2 + : NaNs produced
#Finding units with highest value of dissimilarity
head(mydata[order(-mydata$Dissimilarity), c("ID", "Dissimilarity")])
## # A tibble: 6 × 2
## ID Dissimilarity
## <dbl> <dbl>
## 1 14 4.28
## 2 40 4.21
## 3 120 4.14
## 4 71 3.84
## 5 138 3.83
## 6 34 3.39
There is a relatively big jump between third and fourth unit, so I will check first three units.
#Showing units ID14, 40, 120
print(mydata[c(14,40,120), ])
## # A tibble: 3 × 25
## ID Q2a_1 Q2b_1 Q2c_1 Q3a_1 Q3b_1 Q3c_1 Q4a_1 Q4b_1 Q4c_1 Q5a_1 Q5b_1 Q5c_1
## <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 14 7 7 5 3 6 7 6 6 7 3 6 7
## 2 40 5 3 3 2 7 7 2 7 7 2 7 7
## 3 120 1 6 5 6 6 7 7 5 6 6 6 6
## # ℹ 12 more variables: Q6a_1 <dbl>, Q6b_1 <dbl>, Q6c_1 <dbl>, Q7a_1 <dbl>,
## # Q7b_1 <dbl>, Q7c_1 <dbl>, Age <dbl>, Gender <dbl>, Location <dbl>,
## # GenderFactor <fct>, LocationFactor <fct>, Dissimilarity <dbl>
They don’t seem unusual.
#Removing ...
library(factoextra)
## Warning: package 'factoextra' was built under R version 4.4.2
## Loading required package: ggplot2
## Welcome! Want to learn more? See two factoextra-related books at https://goo.gl/ve3WBa
#Finding Euclidean distances based on 6 Cluster variables, then saving them into object Distances
Distances <- get_dist(mydata_clu_new,
method = "euclidian")
#Showing matrix of distances
fviz_dist(Distances,
gradient = list(low = "slateblue4",
mid = "skyblue3",
high = "skyblue"))
There are three or four groups of homogeneous objects forming, but they are not very evident.
#Hopkins statistics
library(factoextra)
get_clust_tendency(mydata_clu_new,
n = nrow(mydata_clu_new) - 1,
graph = FALSE)
## $hopkins_stat
## [1] 0.6418886
##
## $plot
## NULL
Hopkins statistics is above 0.5 - data is clusterable.
#Determining number of clusters for K-means clustering
library(factoextra)
library(NbClust)
fviz_nbclust(mydata_clu_new, kmeans, method = "wss") +
labs(subtitle = "Elbow method")
It seems that the biggest break is at 5, indicating that we should form 5 clusters based on Elbow method.
#Determining number of clusters for K-means clustering
fviz_nbclust(mydata_clu_new, kmeans, method = "silhouette")+
labs(subtitle = "Silhouette analysis")
Since we want average Silhouette to be as high as possible, according to this index, it is the best option to form 2 clusters, but 4 is also almost as good.
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(factoextra)
WARD <- mydata_clu_new %>%
get_dist(method = "euclidean") %>%
hclust(method = "ward.D2")
WARD
##
## Call:
## hclust(d = ., method = "ward.D2")
##
## Cluster method : ward.D2
## Distance : euclidean
## Number of objects: 152
library(factoextra)
fviz_dend(WARD)
## Warning: The `<scale>` argument of `guides()` cannot be `FALSE`. Use "none" instead as
## of ggplot2 3.3.4.
## ℹ The deprecated feature was likely used in the factoextra package.
## Please report the issue at <https://github.com/kassambara/factoextra/issues>.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.
library(NbClust)
NbClust(mydata_clu_new,
distance = "euclidean",
min.nc = 2, max.nc = 10,
method = "kmeans",
index = "all")
## *** : The Hubert index is a graphical method of determining the number of clusters.
## In the plot of Hubert index, we seek a significant knee that corresponds to a
## significant increase of the value of the measure i.e the significant peak in Hubert
## index second differences plot.
##
## *** : The D index is a graphical method of determining the number of clusters.
## In the plot of D index, we seek a significant knee (the significant peak in Dindex
## second differences plot) that corresponds to a significant increase of the value of
## the measure.
##
## *******************************************************************
## * Among all indices:
## * 8 proposed 2 as the best number of clusters
## * 5 proposed 3 as the best number of clusters
## * 3 proposed 5 as the best number of clusters
## * 2 proposed 6 as the best number of clusters
## * 2 proposed 9 as the best number of clusters
## * 3 proposed 10 as the best number of clusters
##
## ***** Conclusion *****
##
## * According to the majority rule, the best number of clusters is 2
##
##
## *******************************************************************
## $All.index
## KL CH Hartigan CCC Scott Marriot TrCovW TraceW
## 2 16.5370 59.5648 22.8451 -1.2199 190.7707 4.307472e+12 12926.431 648.4867
## 3 0.1354 45.4359 23.4358 -1.8840 367.1832 3.036401e+12 9223.823 562.7755
## 4 0.7123 42.5792 23.8440 -1.6591 474.6207 2.662356e+12 6927.557 486.2886
## 5 2.2046 42.7495 14.9309 0.1108 564.7277 2.299495e+12 4648.256 418.8142
## 6 2.5760 40.3828 9.9144 0.8422 663.8276 1.725228e+12 3722.664 380.1973
## 7 0.8891 37.3325 9.0992 0.7345 727.1115 1.548550e+12 3098.369 356.0210
## 8 0.4194 35.0637 11.8946 0.7330 782.5853 1.404137e+12 2947.080 334.9989
## 9 9.5096 34.4609 5.7515 1.5349 847.1365 1.162195e+12 2492.866 309.4387
## 10 0.0898 32.2757 11.9974 1.1503 877.7749 1.172881e+12 2235.064 297.4742
## Friedman Rubin Cindex DB Silhouette Duda Pseudot2 Beale Ratkowsky
## 2 3.2372 1.3971 0.4534 1.5833 0.2516 1.2250 -18.5509 -0.6976 0.3433
## 3 5.4346 1.6099 0.4247 1.7077 0.2047 1.5054 -24.1722 -1.2658 0.3359
## 4 6.8884 1.8631 0.4612 1.5760 0.2104 1.1178 -5.7980 -0.3969 0.3241
## 5 7.5885 2.1633 0.4244 1.3974 0.2328 1.2897 -11.6811 -0.8421 0.3275
## 6 9.3895 2.3830 0.3998 1.3793 0.2277 1.4435 -10.4458 -1.1283 0.3105
## 7 10.5507 2.5448 0.4490 1.4012 0.2230 1.0142 -0.5887 -0.0525 0.2941
## 8 11.6282 2.7045 0.4165 1.4359 0.2077 1.6961 -10.2607 -1.4862 0.2804
## 9 12.8642 2.9279 0.4229 1.4137 0.2152 1.1606 -2.4907 -0.4969 0.2704
## 10 13.4092 3.0456 0.4162 1.3749 0.2092 1.8949 -12.7511 -1.6655 0.2591
## Ball Ptbiserial Frey McClain Dunn Hubert SDindex Dindex SDbw
## 2 324.2434 0.4586 0.8104 0.7394 0.1312 0.0023 1.6254 1.9756 1.2828
## 3 187.5918 0.4468 0.1855 1.4472 0.0831 0.0026 1.6291 1.8218 1.0393
## 4 121.5722 0.4777 0.0745 1.9274 0.0977 0.0028 1.4802 1.6897 0.6214
## 5 83.7628 0.5135 0.2775 2.2588 0.1090 0.0032 1.3569 1.5742 0.5371
## 6 63.3662 0.5105 0.7706 2.6311 0.1078 0.0033 1.3411 1.5061 0.4869
## 7 50.8601 0.4811 1.1917 3.1800 0.1112 0.0034 1.5588 1.4607 0.4626
## 8 41.8749 0.4367 0.0850 4.0759 0.1051 0.0036 1.6493 1.4036 0.4293
## 9 34.3821 0.4416 0.4428 4.3256 0.1112 0.0038 1.5593 1.3558 0.3930
## 10 29.7474 0.4290 -0.1904 4.7313 0.1480 0.0039 1.5085 1.3214 0.3768
##
## $All.CriticalValues
## CritValue_Duda CritValue_PseudoT2 Fvalue_Beale
## 2 0.7006 43.1654 1
## 3 0.6533 38.2096 1
## 4 0.6459 30.1530 1
## 5 0.6222 31.5710 1
## 6 0.5356 29.4770 1
## 7 0.6188 25.8772 1
## 8 0.4889 26.1338 1
## 9 0.4643 20.7641 1
## 10 0.4174 37.6933 1
##
## $Best.nc
## KL CH Hartigan CCC Scott Marriot TrCovW
## Number_clusters 2.000 2.0000 5.0000 9.0000 3.0000 3 3.000
## Value_Index 16.537 59.5648 8.9131 1.5349 176.4125 897026528841 3702.608
## TraceW Friedman Rubin Cindex DB Silhouette Duda
## Number_clusters 5.0000 3.0000 9.0000 6.0000 10.0000 2.0000 2.000
## Value_Index 28.8575 2.1974 -0.1056 0.3998 1.3749 0.2516 1.225
## PseudoT2 Beale Ratkowsky Ball PtBiserial Frey McClain
## Number_clusters 2.0000 2.0000 2.0000 3.0000 5.0000 1 2.0000
## Value_Index -18.5509 -0.6976 0.3433 136.6515 0.5135 NA 0.7394
## Dunn Hubert SDindex Dindex SDbw
## Number_clusters 10.000 0 6.0000 0 10.0000
## Value_Index 0.148 0 1.3411 0 0.3768
##
## $Best.partition
## [1] 1 2 1 2 1 1 2 2 2 1 2 1 1 2 2 1 1 2 2 1 2 1 1 1 1 1 1 1 1 1 2 1 2 2 2 2 1
## [38] 1 1 2 2 2 1 2 2 2 1 1 1 2 1 1 2 1 1 2 2 2 2 2 1 2 2 1 1 1 1 1 2 2 1 1 2 1
## [75] 2 1 2 2 1 2 2 1 2 2 1 2 2 2 2 2 1 2 1 1 1 1 2 2 2 1 2 2 1 2 1 1 1 1 1 1 2
## [112] 1 2 1 2 1 1 1 1 1 1 1 2 1 1 2 2 1 2 2 2 1 1 2 1 1 2 2 1 2 2 1 2 1 2 2 2 2
## [149] 1 2 1 1
According to the majority rule, the best number of clusters is 5.
Clustering <- kmeans(mydata_clu_new,
centers = 4, #Number of groups
nstart = 25) #Number of attempts at different starting leader positions
Clustering
## K-means clustering with 4 clusters of sizes 46, 26, 32, 48
##
## Cluster means:
## Q2a_1 Q3a_1 Q4a_1 Q5a_1 Q6a_1 Q7a_1
## 1 0.10210822 -0.05550022 0.4076716 0.3829262 0.5054298 -0.74807794
## 2 0.27318259 -0.29962981 -0.4237942 -0.2482213 -1.7159132 -0.09898356
## 3 0.06563359 -1.09898469 -1.1577087 -1.2082286 0.2610379 -0.48917780
## 4 -0.28958334 0.94814365 0.6106757 0.5729680 0.2710575 1.09664265
##
## Clustering vector:
## [1] 4 3 1 3 4 1 2 3 2 1 3 1 1 2 3 4 1 3 1 4 3 1 4 4 1 4 4 4 4 1 3 4 2 3 4 2 4
## [38] 1 1 2 1 3 1 3 3 3 4 1 1 2 4 4 2 4 4 3 2 1 3 1 4 1 2 4 4 4 1 4 1 2 4 4 2 4
## [75] 2 1 3 3 4 1 3 1 3 3 1 1 3 1 3 3 1 2 1 4 4 2 3 1 3 1 1 3 4 1 4 4 4 4 4 1 2
## [112] 4 3 4 1 1 1 1 1 4 4 1 3 1 4 1 2 4 2 2 2 1 4 2 4 1 3 2 2 1 2 4 3 4 1 3 2 2
## [149] 4 3 4 4
##
## Within cluster sum of squares by cluster:
## [1] 131.9942 95.0562 119.3242 139.4512
## (between_SS / total_SS = 46.4 %)
##
## Available components:
##
## [1] "cluster" "centers" "totss" "withinss" "tot.withinss"
## [6] "betweenss" "size" "iter" "ifault"
library(factoextra)
fviz_cluster(Clustering,
palette = "Set1",
repel = FALSE,
ggtheme = theme_bw(),
data = mydata_clu_new)
Units ID40, ID84 and ID120 seem to be far away from the center, so I will remove them.
mydata <- mydata %>%
filter(!ID %in% c(40, 84, 120))
mydata$ID <- seq(1, nrow(mydata))
mydata_clu_new <- as.data.frame(scale(mydata[c("Q2a_1", "Q3a_1", "Q4a_1", "Q5a_1", "Q6a_1", "Q7a_1")]))
Clustering <- kmeans(mydata_clu_new,
centers = 4, #Number of groups
nstart = 25) #Number of attempts at different starting leader positions
Clustering
## K-means clustering with 4 clusters of sizes 31, 41, 31, 46
##
## Cluster means:
## Q2a_1 Q3a_1 Q4a_1 Q5a_1 Q6a_1 Q7a_1
## 1 -0.03128611 -0.06424249 -0.2892342 -0.1214947 -1.6084816 0.08157507
## 2 -0.13282415 0.97382376 0.6853525 0.6053649 0.4642818 1.13824513
## 3 0.11828490 -1.13229077 -1.2108368 -1.2313090 0.2424502 -0.47239185
## 4 0.05975712 -0.06161399 0.4000597 0.3721077 0.5067699 -0.75114631
##
## Clustering vector:
## [1] 2 3 4 3 2 4 1 3 1 4 3 4 4 1 3 2 4 3 4 2 3 4 2 2 4 2 2 2 2 4 3 2 1 3 1 1 2
## [38] 4 4 4 3 4 3 3 3 2 4 4 1 1 2 1 1 2 3 1 4 3 4 2 4 1 2 2 2 4 2 4 1 2 2 1 2 1
## [75] 4 3 3 2 4 3 4 3 4 4 3 4 3 3 4 1 4 2 2 1 3 4 3 4 4 3 2 4 2 2 2 2 2 4 1 2 3
## [112] 2 4 4 4 4 4 2 4 3 4 2 4 1 2 1 1 1 4 2 1 1 4 3 1 1 4 1 2 3 1 4 3 1 1 1 3 2
## [149] 2
##
## Within cluster sum of squares by cluster:
## [1] 127.79269 95.91834 114.09601 137.52207
## (between_SS / total_SS = 46.5 %)
##
## Available components:
##
## [1] "cluster" "centers" "totss" "withinss" "tot.withinss"
## [6] "betweenss" "size" "iter" "ifault"
library(factoextra)
fviz_cluster(Clustering,
palette = "Set1",
repel = FALSE,
ggtheme = theme_bw(),
data = mydata_clu_new)
#Average values of cluster variables to describe groups
Averages <- Clustering$centers
Averages
## Q2a_1 Q3a_1 Q4a_1 Q5a_1 Q6a_1 Q7a_1
## 1 -0.03128611 -0.06424249 -0.2892342 -0.1214947 -1.6084816 0.08157507
## 2 -0.13282415 0.97382376 0.6853525 0.6053649 0.4642818 1.13824513
## 3 0.11828490 -1.13229077 -1.2108368 -1.2313090 0.2424502 -0.47239185
## 4 0.05975712 -0.06161399 0.4000597 0.3721077 0.5067699 -0.75114631
Figure <- as.data.frame(Averages)
Figure$ID <- 1:nrow(Figure)
library(tidyr)
Figure <- pivot_longer(Figure, cols = c("Q2a_1", "Q3a_1", "Q4a_1", "Q5a_1", "Q6a_1", "Q7a_1"))
Figure$Group <- factor(Figure$ID,
levels = c(1, 2, 3, 4),
labels = c("1", "2", "3", "4"))
Figure$NameF <- factor(Figure$name,
levels = c("Q2a_1", "Q3a_1", "Q4a_1", "Q5a_1", "Q6a_1", "Q7a_1"),
labels = c("Cash_Safety", "Cash_Speed", "Cash_Ease of Use", "Cash_Convenience", "Cash_Privacy", "Cash_Tracking Expenses"))
library(ggplot2)
ggplot(Figure, aes(x = NameF, y = value)) +
geom_hline(yintercept = 0) +
theme_bw() +
geom_point(aes(shape = Group, col = Group), size = 4) +
geom_line(aes(group = ID), linewidth = 1) +
ylab("Averages") +
xlab("Cluster variables")+
ylim(-2.5, 2.5) +
theme(axis.text.x = element_text(angle = 45, vjust = 0.50, size = 10))
#Saving where each unit belongs
mydata$Group <- Clustering$cluster
#Checking if clustering variables successfully differentiate between groups
fit <- aov(cbind(Q2a_1, Q3a_1, Q4a_1, Q5a_1, Q6a_1, Q7a_1) ~ as.factor(Group),
data = mydata)
summary(fit)
## Response Q2a_1 :
## Df Sum Sq Mean Sq F value Pr(>F)
## as.factor(Group) 3 2.263 0.75445 0.4455 0.7209
## Residuals 145 245.562 1.69353
##
## Response Q3a_1 :
## Df Sum Sq Mean Sq F value Pr(>F)
## as.factor(Group) 3 202.25 67.416 55.231 < 2.2e-16 ***
## Residuals 145 176.99 1.221
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Response Q4a_1 :
## Df Sum Sq Mean Sq F value Pr(>F)
## as.factor(Group) 3 161.36 53.787 49.208 < 2.2e-16 ***
## Residuals 145 158.49 1.093
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Response Q5a_1 :
## Df Sum Sq Mean Sq F value Pr(>F)
## as.factor(Group) 3 216.45 72.149 42.046 < 2.2e-16 ***
## Residuals 145 248.81 1.716
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Response Q6a_1 :
## Df Sum Sq Mean Sq F value Pr(>F)
## as.factor(Group) 3 104.912 34.971 109.5 < 2.2e-16 ***
## Residuals 145 46.309 0.319
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Response Q7a_1 :
## Df Sum Sq Mean Sq F value Pr(>F)
## as.factor(Group) 3 337.88 112.627 67.412 < 2.2e-16 ***
## Residuals 145 242.25 1.671
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
It differs for at least one of the groups for all variables.
#Additional variables
aggregate(mydata$Age,
by = list(mydata$Group),
FUN = mean)
## Group.1 x
## 1 1 37.90323
## 2 2 44.41463
## 3 3 37.22581
## 4 4 36.45652
#Checking normal distribution of variables
library(dplyr)
library(rstatix)
## Warning: package 'rstatix' was built under R version 4.4.2
##
## Attaching package: 'rstatix'
## The following object is masked from 'package:stats':
##
## filter
mydata %>%
group_by(as.factor(mydata$Group)) %>%
shapiro_test(Age)
## # A tibble: 4 × 4
## `as.factor(mydata$Group)` variable statistic p
## <fct> <chr> <dbl> <dbl>
## 1 1 Age 0.863 0.000990
## 2 2 Age 0.896 0.00128
## 3 3 Age 0.896 0.00570
## 4 4 Age 0.928 0.00710
None of the variables are normally distributed. -> Kruskal Walis
kruskal.test(Age ~ as.factor(Group),
data = mydata)
##
## Kruskal-Wallis rank sum test
##
## data: Age by as.factor(Group)
## Kruskal-Wallis chi-squared = 6.971, df = 3, p-value = 0.07283
Not significant.
#Checking the association between the location and classification into 4 groups
chi_square <- chisq.test(mydata$LocationFactor, as.factor(mydata$Group))
chi_square
##
## Pearson's Chi-squared test
##
## data: mydata$LocationFactor and as.factor(mydata$Group)
## X-squared = 7.1807, df = 6, p-value = 0.3045
Not significant.
#Checking the association between the gender and classification into 4 groups
chi_square <- chisq.test(mydata$GenderFactor, as.factor(mydata$Group))
chi_square
##
## Pearson's Chi-squared test
##
## data: mydata$GenderFactor and as.factor(mydata$Group)
## X-squared = 4.6171, df = 3, p-value = 0.2021
Not significant.