NLB_cluster3

#install.packages("ggplot2")
library(ggplot2)
#install.packages("dplyr")
library(dplyr)
#install.packages("Hmisc")
library(Hmisc)
#install.packages("factoextra")
library(factoextra)
#install.packages("cluster")
library(cluster)
#install.packages("magrittr")
library(magrittr)
#install.packages("NbClust")
library(NbClust)
#install.packages("tidyr")
library(tidyr)
#install.packages("rstatix")
library(rstatix)
#install.packages("corrplot")
library(corrplot)
#install.packages("car")
library(car)
#install.packages("GGally")
library(GGally)
#install.packages("factoextra")
library(factoextra)

I started by activating the libraries I might need for the analysis.

data <- read.table("./anketa_final_6.csv", header=TRUE, sep=",", dec=".")
head(data)

##   ID Q47a Q47b Q47c Q47d Q21 Q50a Q50b Q50c Q50d Q51 Q52a Q52b Q52c Q52d Q52e
## 1  1    1    1    0    0   4    2    2    6    6   4    2    4    2    1    1
## 2  2    1    0    0    0   7    2    6    6    6   5    1    1    1    1    1
## 3  3    1    1    0    0   7    1    2    6    6   4    1    1    1    1    1
## 4  4    1    1    1    0   7    1    2    4    6   1    1    3    1    1    1
## 5  5    1    0    1    0   7    1    6    2    6   5    1    3    1    1    1
## 6  6    1    1    0    1   6    2    1    6    4   3    2    4    2    2    2
##   Q52f Q52g Q53a_1 Q53b_1 Q53c_1 Q53d_1 Q54a Q54b Q54c Q54d Q55a Q55b Q55c Q55d
## 1    2    2      4      4      5      3    1    1    0    0    1    0    1    0
## 2    1    1      1      7      7      7    0    0    0    1    1    1    1    1
## 3    1    1      1      7      7      7    0    1    0    0    1    1    1    0
## 4    1    1      3      6      7      7    0    1    0    0    1    1    0    0
## 5    1    1      1      7      7      7    1    1    0    0    1    0    0    0
## 6    1    2      6      5      2      2    1    1    0    0    0    1    1    0
##   Q55e Q55f Q55g Q56 Q57 Q58 Q59 Q60a Q60b Q60c Q60d Q60e Q60f Q60g Q61a Q61b
## 1    0    0    0   5   6   3   5   -2   -2   -2   -2   -2   -2   -2    1    0
## 2    1    1    0   7   1   1   1    0    0    0    1    1   -2    0    0    0
## 3    1    1    0   6   3   5   1    1    1    1    1    0   -2    0    1    0
## 4    1    0    0   5   6   6   4    0    1    1    0    0   -2    0    1    0
## 5    0    0    0   5   1   5   6   -2   -2   -2   -2   -2   -2   -2    0    0
## 6    0    0    0   2   5   6   6   -2   -2   -2   -2   -2   -2   -2    1    0
##   Q61c Q61d Q61e Q61f Q62 Q63a_1 Q63b_1 Q63c_1 Q63d_1 Q63e_1 Q63f_1 Q64 Q65 Q46
## 1    0    0    0    0   2     -2     -2     -2     -2     -2     -2   3   1  66
## 2    0    0    1    0   2     -2     -2     -2     -2     -2     -2   1   1   0
## 3    0    0    0    0   2     -2     -2     -2     -2     -2     -2   5   3  16
## 4    0    0    0    0   1      6      4      6      7      6      7   5   4  70
## 5    1    0    0    0   2     -2     -2     -2     -2     -2     -2   1   1  49
## 6    1    0    0    0   2     -2     -2     -2     -2     -2     -2   6   6  85
##   Q1a_1 Q1b_1 Q1c_1 Q1d_1 Q1e_1 Q1f_1 Q2a_1 Q2b_1 Q2c_1 Q3a_1 Q3b_1 Q3c_1 Q4a_1
## 1     4     6     6     6     5     6     6     6     6     5     6     4     5
## 2     7     7     7     7     7     7     7     1     1     7     1     1     7
## 3     7     7     7     7     7     7     7     5     3     7     7     5     6
## 4     5     7     7     7     3     3     7     6     6     6     7     7     7
## 5     7     7     7     7     7     7     7     1     7     7     1     7     7
## 6     6     6     6     5     5     6     7     6     5     7     6     6     7
##   Q4b_1 Q4c_1 Q5a_1 Q5b_1 Q5c_1 Q6a_1 Q6b_1 Q6c_1 Q7a_1 Q7b_1 Q7c_1 Q39  Q40
## 1     5     4     5     5     4     7     5     5     6     4     4   1 1958
## 2     1     1     7     1     1     7     1     1     7     1     1   2 1942
## 3     7     1     7     7     1     7     5     3     7     3     3   2 1953
## 4     6     5     7     6     5     6     6     6     3     5     6   1 1948
## 5     1     7     7     1     7     7     1     7     7     1     7   1 1953
## 6     6     6     7     6     6     7     6     6     7     6     6   1 1955
##   Q37 Q38 Q41 Q42 Q43 Q44 Q45
## 1   1   3   4   3  -2   2   6
## 2   1   1   2   3  -2   3   7
## 3   2  -2   4   3  -2   3   1
## 4   5  -2   3   3  -2   3   1
## 5   5  -2   5   3  -2   3   4
## 6   1   4   5   3  -2   3   1

Import the data from the survey.

# Keep the respondent ID plus the five clustering variables, then summarise
# the clustering variables only (the ID column is dropped from the summary).
seg_columns <- c("ID", "Q21", "Q56", "Q58", "Q65", "Q7a_1")
data_seg <- as.data.frame(data[seg_columns])
summary(data_seg[, -1])

##       Q21             Q56             Q58             Q65       
##  Min.   :1.000   Min.   :1.000   Min.   :1.000   Min.   :1.000  
##  1st Qu.:2.000   1st Qu.:3.000   1st Qu.:5.000   1st Qu.:2.000  
##  Median :3.000   Median :5.000   Median :6.000   Median :5.000  
##  Mean   :3.585   Mean   :4.575   Mean   :5.594   Mean   :4.406  
##  3rd Qu.:5.750   3rd Qu.:6.000   3rd Qu.:6.000   3rd Qu.:6.000  
##  Max.   :7.000   Max.   :7.000   Max.   :7.000   Max.   :7.000  
##      Q7a_1      
##  Min.   :1.000  
##  1st Qu.:3.000  
##  Median :6.000  
##  Mean   :4.972  
##  3rd Qu.:7.000  
##  Max.   :7.000

I created a new data frame that includes only the clustering variables that we chose:

- Q21: I prefer paying with cash over a card. (Likert scale 1-7)
- Q56: How concerned are you about fraud when using digital payments? (Likert scale 1-7)
- Q58: How much do you trust banks to securely process your digital transactions? (Likert scale 1-7)
- Q65: To what extent do you agree with the statement: ‘Digital payments will completely replace cash in the future’? (Likert scale 1-7)
- Q7a_1: To what extent do you think the following payment methods allow for tracking expenses? Cash (1 - do not allow expense tracking at all, 7 - fully allow expense tracking)

ggpairs(data_seg[, -1])

The correlations between the selected variables are all below 0.4, which was the cut-off we selected to avoid multicollinearity.

data_seg_std <- as.data.frame(scale(data_seg[c("Q21","Q56", "Q58", "Q65", "Q7a_1")]))
head(data_seg_std)

##        Q21        Q56        Q58        Q65      Q7a_1
## 1 0.194196  0.2325582 -1.9729472 -1.5520692  0.5018083
## 2 1.597704  1.3281658 -3.4939101 -1.5520692  0.9898053
## 3 1.597704  0.7803620 -0.4519843 -0.6406048  0.9898053
## 4 1.597704  0.2325582  0.3084972 -0.1848725 -0.9621828
## 5 1.597704  0.2325582 -0.4519843 -1.5520692  0.9898053
## 6 1.129868 -1.4108531  0.3084972  0.7265920  0.9898053

Standardized the data. Since all five variables are measured on the same 1-7 scale, this step is not strictly necessary.

# Euclidean length of each respondent's standardized score vector. Because
# scale() centred every variable, this is the distance from the overall
# centroid; large values flag potential outliers.
z <- data_seg_std
data_seg$Dissimilarity <- sqrt(
  z$Q21^2 + z$Q56^2 + z$Q58^2 + z$Q65^2 + z$Q7a_1^2
)

# Show the 15 respondents that lie furthest from the centroid.
furthest_first <- order(-data_seg$Dissimilarity)
head(data_seg[furthest_first, c("ID", "Dissimilarity")], 15)

##      ID Dissimilarity
## 2     2      4.462367
## 27   27      4.213971
## 92   92      3.895760
## 88   88      3.561759
## 63   63      3.223487
## 106 106      3.159417
## 74   74      3.086032
## 104 104      3.078872
## 78   78      2.974564
## 103 103      2.969696
## 24   24      2.916235
## 22   22      2.879093
## 99   99      2.877978
## 85   85      2.812402
## 45   45      2.810575

# Drop respondent 2 (largest dissimilarity) and renumber the remaining rows so
# that ID again runs 1..n without gaps.
data <- data %>%
  filter(!ID %in% 2) %>%
  mutate(ID = row_number())

# Rebuild the clustering subset from the reduced data.
data_seg <- as.data.frame(data[c("ID", "Q21", "Q56", "Q58", "Q65", "Q7a_1")])

# Re-standardize every column except ID (the first column).
data_seg_std <- as.data.frame(scale(data_seg[-1]))
head(data_seg_std)

##         Q21        Q56        Q58        Q65      Q7a_1
## 1 0.2110182  0.2461331 -2.1252565 -1.5777496  0.5112080
## 2 1.6252894  0.7960050 -0.5140512 -0.6599451  0.9991794
## 3 1.6252894  0.2461331  0.2915514 -0.2010429 -0.9527059
## 4 1.6252894  0.2461331 -0.5140512 -1.5777496  0.9991794
## 5 1.1538656 -1.4034825  0.2915514  0.7167616  0.9991794
## 6 1.1538656  0.7960050  0.2915514  1.1756639  0.9991794

Checked for outliers with the dissimilarity measure and decided to remove ID 2, then reset the ID order.

get_clust_tendency(data_seg_std,
                   n = nrow(data_seg_std) - 1,
                   graph = FALSE)

## $hopkins_stat
## [1] 0.5801586
## 
## $plot
## NULL

The Hopkins statistic is 0.58 > 0.50, so the data is clusterable.

# Hierarchical clustering of the standardized clustering variables:
# Euclidean distances between respondents, agglomerated with Ward's method
# ("ward.D2").
WARD <- data_seg_std %>%
  get_dist(method = "euclidean") %>%
  hclust(method = "ward.D2")

# Print the hclust summary (call, method, distance, number of objects).
WARD

## 
## Call:
## hclust(d = ., method = "ward.D2")
## 
## Cluster method   : ward.D2 
## Distance         : euclidean 
## Number of objects: 105

fviz_dend(WARD)

## Warning: The `<scale>` argument of `guides()` cannot be `FALSE`. Use "none" instead as
## of ggplot2 3.3.4.
## ℹ The deprecated feature was likely used in the factoextra package.
##   Please report the issue at <https://github.com/kassambara/factoextra/issues>.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.

# Pairwise Euclidean distances between respondents on the standardized
# variables. The method name is spelled "euclidean", matching the spelling
# used for the Ward clustering in this script.
Distance <- get_dist(data_seg_std,
                     method = "euclidean")

# Heat map of the distance matrix (dark red = close, white = far apart).
fviz_dist(Distance,
          gradient = list(low = "darkred",
                          mid = "grey95",
                          high = "white"))

# kmeans() starts from randomly chosen centres, so fix the seed before each
# diagnostic to make the plots reproducible from run to run.
set.seed(123)
fviz_nbclust(data_seg_std, kmeans, method = "wss") +
  labs(subtitle = "Elbow method")

set.seed(123)
fviz_nbclust(data_seg_std, kmeans, method = "silhouette") +
  labs(subtitle = "Silhouette analysis")

Most of the methods suggested 2 clusters; since we were not allowed to proceed with 2, I checked the dendrogram for the second-best option, which was 3 clusters.

# k-means with 3 clusters on the standardized variables, keeping the best of
# 25 random starts. The seed is fixed because the starting centres are random:
# without it the cluster numbering (1/2/3) can change between runs.
set.seed(123)
Clustering <- kmeans(data_seg_std,
                     centers = 3,
                     nstart = 25)

# Print cluster sizes, centres, assignments and sums of squares.
Clustering

## K-means clustering with 3 clusters of sizes 35, 40, 30
## 
## Cluster means:
##          Q21         Q56        Q58        Q65      Q7a_1
## 1 -0.5836674 -0.47655561  0.2685342  0.1923019 -1.1618365
## 2 -0.2604055 -0.01505602  0.4929521  0.3381673  0.6697987
## 3  1.0281527  0.57605624 -0.9705594 -0.6752419  0.4624109
## 
## Clustering vector:
##   [1] 3 3 3 3 2 2 1 2 1 1 2 1 3 1 2 2 3 1 3 1 3 2 1 2 2 3 2 1 2 2 2 2 3 3 2 2 1
##  [38] 1 1 1 2 1 3 3 1 3 2 2 1 2 2 2 2 1 2 2 2 2 3 1 2 3 1 2 1 1 2 2 3 2 1 2 1 3
##  [75] 1 2 3 2 2 1 1 3 3 3 3 1 1 3 2 2 3 1 1 1 2 1 1 3 3 3 1 2 3 1 3
## 
## Within cluster sum of squares by cluster:
## [1]  97.44233 106.40500 120.23515
##  (between_SS / total_SS =  37.7 %)
## 
## Available components:
## 
## [1] "cluster"      "centers"      "totss"        "withinss"     "tot.withinss"
## [6] "betweenss"    "size"         "iter"         "ifault"

fviz_cluster(Clustering, 
             palette = "Set1", 
             repel = TRUE,
             ggtheme = theme_bw(),
             labelsize = 8,
             data = data_seg_std)

# Drop the respondent currently numbered 26 and renumber the remaining rows so
# that ID again runs 1..n without gaps.
# NOTE(review): IDs were already reset after the first removal, so this 26 is
# presumably not the respondent with ID 26 in the raw file - confirm.
data <- data %>%
  filter(!ID %in% 26) %>%
  mutate(ID = row_number())

# Rebuild the clustering subset from the reduced data.
data_seg <- as.data.frame(data[c("ID", "Q21", "Q56", "Q58", "Q65", "Q7a_1")])

# Re-standardize every column except ID (the first column).
data_seg_std <- as.data.frame(scale(data_seg[-1]))
head(data_seg_std)

##         Q21        Q56        Q58        Q65      Q7a_1
## 1 0.1998971  0.2601229 -2.3132329 -1.5694611  0.5018437
## 2 1.6173489  0.8122205 -0.5886722 -0.6498895  0.9896171
## 3 1.6173489  0.2601229  0.2736082 -0.1901037 -0.9614763
## 4 1.6173489  0.2601229 -0.5886722 -1.5694611  0.9896171
## 5 1.1448649 -1.3961699  0.2736082  0.7294678  0.9896171
## 6 1.1448649  0.8122205  0.2736082  1.1892536  0.9896171

After the first iteration of the clustering I decided to remove ID 26 (as numbered after the earlier ID reset), as it looks like an outlier.

# Final k-means solution: 3 clusters on the re-standardized data, best of 25
# random starts. The seed is fixed because the starting centres are random;
# the group numbers assigned here are used by every later table and test, so
# they must stay the same from run to run.
set.seed(123)
Clustering <- kmeans(data_seg_std,
                     centers = 3,
                     nstart = 25)

# Print cluster sizes, centres, assignments and sums of squares.
Clustering

## K-means clustering with 3 clusters of sizes 40, 29, 35
## 
## Cluster means:
##          Q21          Q56        Q58        Q65      Q7a_1
## 1 -0.2725869 -0.002123452  0.4891783  0.3501446  0.6603701
## 2  1.0959873  0.564728489 -0.9752117 -0.7291629  0.5018437
## 3 -0.5965759 -0.465491088  0.2489716  0.2039984 -1.1705220
## 
## Clustering vector:
##   [1] 2 2 2 2 1 1 3 1 3 3 1 3 2 3 1 1 2 3 2 3 2 1 3 1 1 1 3 1 1 1 1 2 2 1 1 3 3
##  [38] 3 3 1 3 2 2 3 2 1 1 3 1 1 1 1 3 1 1 1 1 2 3 1 2 3 1 3 3 1 1 2 1 3 1 3 2 3
##  [75] 1 2 1 1 3 3 2 2 2 2 3 3 2 1 1 2 3 3 3 1 3 3 2 2 2 3 1 2 3 2
## 
## Within cluster sum of squares by cluster:
## [1] 108.2046 105.3258 100.5716
##  (between_SS / total_SS =  39.0 %)
## 
## Available components:
## 
## [1] "cluster"      "centers"      "totss"        "withinss"     "tot.withinss"
## [6] "betweenss"    "size"         "iter"         "ifault"

fviz_cluster(Clustering, 
             palette = "Set1", 
             repel = TRUE,
             ggtheme = theme_bw(),
             labelsize = 8,
             data = data_seg_std)

I decided to keep these clusters.

Averages <- Clustering$centers
Averages

##          Q21          Q56        Q58        Q65      Q7a_1
## 1 -0.2725869 -0.002123452  0.4891783  0.3501446  0.6603701
## 2  1.0959873  0.564728489 -0.9752117 -0.7291629  0.5018437
## 3 -0.5965759 -0.465491088  0.2489716  0.2039984 -1.1705220

# Profile plot of the cluster centres: one line per cluster across the five
# (standardized) clustering variables.
cluster_vars <- c("Q21", "Q56", "Q58", "Q65", "Q7a_1")
cluster_var_labels <- c("CashCardPref", "FraudConcern", "TrustBanks",
                        "FutureDigDom", "CashTrack")

# One row per cluster; `id` carries the cluster number through the reshape.
Figure <- as.data.frame(Averages)
Figure$id <- seq_len(nrow(Figure))

# Long format: columns `name` (variable) and `value` (cluster centre).
Figure <- pivot_longer(Figure, cols = -id)

Figure$Group <- factor(Figure$id,
                       levels = 1:3,
                       labels = c("1", "2", "3"))

# Readable axis labels, kept in the original variable order.
Figure$ImeF <- factor(Figure$name,
                      levels = cluster_vars,
                      labels = cluster_var_labels)

ggplot(Figure, aes(x = ImeF, y = value)) +
  geom_hline(yintercept = 0) +
  theme_bw() +
  geom_point(aes(shape = Group, col = Group), size = 3) +
  geom_line(aes(group = id), linewidth = 1) +
  labs(x = "Cluster variables", y = "Averages") +
  scale_color_brewer(palette = "Set1") +
  ylim(-1.5, 1.5) +
  theme(axis.text.x = element_text(angle = 45, vjust = 0.50, size = 10))

The plot shows the averages of the clusters. This already gives us an idea of how these groups are composed, and it also shows that there are differences between the groups in all of the variables that were selected.

data$Group <- Clustering$cluster

data_seg$Group <- Clustering$cluster

Assigned the groups to the observations in the two data frames that I will be using.

fit <- aov(cbind(Q21, Q56, Q58, Q65, Q7a_1) ~ as.factor(Group), 
             data = data)

summary(fit)

##  Response Q21 :
##                   Df Sum Sq Mean Sq F value    Pr(>F)    
## as.factor(Group)   2 225.15 112.576  48.131 2.082e-15 ***
## Residuals        101 236.23   2.339                      
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
##  Response Q56 :
##                   Df  Sum Sq Mean Sq F value   Pr(>F)    
## as.factor(Group)   2  55.223 27.6116  9.8651 0.000122 ***
## Residuals        101 282.690  2.7989                     
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
##  Response Q58 :
##                   Df Sum Sq Mean Sq F value   Pr(>F)    
## as.factor(Group)   2 52.885  26.442  31.184 2.84e-11 ***
## Residuals        101 85.644   0.848                     
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
##  Response Q65 :
##                   Df Sum Sq Mean Sq F value    Pr(>F)    
## as.factor(Group)   2 103.02  51.511  13.541 6.165e-06 ***
## Residuals        101 384.20   3.804                      
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
##  Response Q7a_1 :
##                   Df Sum Sq Mean Sq F value    Pr(>F)    
## as.factor(Group)   2 305.57 152.784  121.17 < 2.2e-16 ***
## Residuals        101 127.35   1.261                      
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

The ANOVA test shows that there is a statistically significant difference between the means of at least two of the groups in all of the selected variables.

kruskal.test(Q50a ~ Group, 
             data = data)

## 
##  Kruskal-Wallis rank sum test
## 
## data:  Q50a by Group
## Kruskal-Wallis chi-squared = 26.973, df = 2, p-value = 1.389e-06

kruskal_effsize(Q50a ~ Group, 
                data = data)

## # A tibble: 1 × 5
##   .y.       n effsize method  magnitude
## * <chr> <int>   <dbl> <chr>   <ord>    
## 1 Q50a    104   0.247 eta2[H] large

data %>%
  group_by(Group) %>%
  shapiro_test(Q46)

## # A tibble: 3 × 4
##   Group variable statistic          p
##   <int> <chr>        <dbl>      <dbl>
## 1     1 Q46          0.787 0.00000374
## 2     2 Q46          0.930 0.0565    
## 3     3 Q46          0.757 0.00000320

kruskal.test(Q46 ~ Group, 
             data = data)

## 
##  Kruskal-Wallis rank sum test
## 
## data:  Q46 by Group
## Kruskal-Wallis chi-squared = 24.21, df = 2, p-value = 5.531e-06

kruskal_effsize(Q46 ~ Group, 
                data = data)

## # A tibble: 1 × 5
##   .y.       n effsize method  magnitude
## * <chr> <int>   <dbl> <chr>   <ord>    
## 1 Q46     104   0.220 eta2[H] large

I did the validation with two different variables, one categorical and one numerical. As the numerical variable showed a violation of normality within two of the three groups (Shapiro-Wilk p < 0.05), I ran a non-parametric test for it as well. Both validations were successful; this was the last step to show that the clustering was successful.

data$Q39F <- factor(data$Q39,
                    levels = c(1, 2),
                    labels = c("Man", "Woman"))

chi_square <- chisq.test(data$Q39F, as.factor(data$Group))
chi_square

## 
##  Pearson's Chi-squared test
## 
## data:  data$Q39F and as.factor(data$Group)
## X-squared = 2.6015, df = 2, p-value = 0.2723

The Chi-squared test showed no association between sex and cluster membership.

# Age group from Q40, which appears to hold the year of birth (values such as
# 1942 or 1958). Someone born before 1951 is the OLDER respondent, so that
# branch must be coded "1" (More_75). The original coded it "0", which swapped
# the Less_75 / More_75 labels.
# NOTE(review): other questions use -2 as a "no answer" code; if Q40 can also
# be -2, those rows would land in More_75 - confirm.
data$Q40G <- ifelse(data$Q40 < 1951, "1", "0")


data$Q40F <- factor(data$Q40G,
                    levels = c("0", "1"),
                    labels = c("Less_75", "More_75"))

# Test of independence between age group and cluster membership.
chi_square <- chisq.test(data$Q40F, as.factor(data$Group))
chi_square

## 
##  Pearson's Chi-squared test
## 
## data:  data$Q40F and as.factor(data$Group)
## X-squared = 5.0269, df = 2, p-value = 0.08099

group_count_age <- table(data$Q40F, data$Group)
print(group_count_age)

##          
##            1  2  3
##   Less_75 17  7 18
##   More_75 23 22 17

I divided the respondents into two age groups: people who are 75 or older, and people who are 74 or younger. The Chi-squared test gives p = 0.081, which points to an association between age group and cluster at the 10% significance level (though not at the conventional 5% level): the age split in the group of people that are less accepting of digital payments is clearly more lopsided than in the other two groups.

data$Q41G <- ifelse(data$Q41 < 5, "0", "1")


data$Q41F <- factor(data$Q41G,
                    levels = c(0, 1),
                    labels = c("Lower_ed", "Higher_ed"))

chi_square <- chisq.test(data$Q41F, as.factor(data$Group))
chi_square

## 
##  Pearson's Chi-squared test
## 
## data:  data$Q41F and as.factor(data$Group)
## X-squared = 20.108, df = 2, p-value = 4.301e-05

group_count_ed <- table(data$Q41F, data$Group)
print(group_count_ed)

##            
##              1  2  3
##   Lower_ed  10 18  4
##   Higher_ed 30 11 31

I did the same for the highest level of education, in this case the p-value clearly shows that there is a big statistical association between the highest education level and the group to which they belong, in this case people belonging to the group that appears to be more reluctant about digital payment have lower degrees of education, the group that seems to be favorable but still in the process of accepting have higher percentage of high education, while the last group which is entirely accepting and pro digital payments have the highest ratio of high educated individuals.

data$Q44G <- ifelse(data$Q44 > 1, "0", "1")


data$Q44F <- factor(data$Q44G,
                    levels = c(0, 1),
                    labels = c("Outside", "City"))

chi_square <- chisq.test(data$Q44F, as.factor(data$Group))
chi_square

## 
##  Pearson's Chi-squared test
## 
## data:  data$Q44F and as.factor(data$Group)
## X-squared = 3.1874, df = 2, p-value = 0.2032

group_count_region <- table(data$Q44F, data$Group)
print(group_count_region)

##          
##            1  2  3
##   Outside 19 20 19
##   City    21  9 16

In the case of the region where different groups live the p-value was too high, it is not valid to describe our groups.

data$Q45G <- ifelse(data$Q45 < 2, "0", "1")


data$Q45F <- factor(data$Q45G,
                    levels = c(0, 1),
                    labels = c("NLB", "Other"))

chi_square <- chisq.test(data$Q45F, as.factor(data$Group))
chi_square

## 
##  Pearson's Chi-squared test
## 
## data:  data$Q45F and as.factor(data$Group)
## X-squared = 0.30967, df = 2, p-value = 0.8566

There is no statistically significant association between the bank that respondents use and the group they belong to.

data$Q51F <- factor(data$Q51,
                    levels = c(1, 2, 3, 4, 5),
                    labels = c("SeveralWeek", "OnceWeek", "TwoThreeMonth", "OnceMonth", "Rarely"))

chi_square <- chisq.test(data$Q51F, as.factor(data$Group))

## Warning in chisq.test(data$Q51F, as.factor(data$Group)): Chi-squared
## approximation may be incorrect

chi_square

## 
##  Pearson's Chi-squared test
## 
## data:  data$Q51F and as.factor(data$Group)
## X-squared = 6.2798, df = 8, p-value = 0.6159

There are probably too few observations in some response categories to perform this test reliably, so I will try to merge some categories.

data$Q51F <- factor(data$Q51,
                    levels = c(1, 2, 3, 4, 5),
                    labels = c("Several times a month", "Several times a month", "Several times a month", "Once a month or less", "Once a month or less"))

chi_square <- chisq.test(data$Q51F, as.factor(data$Group))
chi_square

## 
##  Pearson's Chi-squared test
## 
## data:  data$Q51F and as.factor(data$Group)
## X-squared = 1.4601, df = 2, p-value = 0.4819

After trying different combinations of grouping I couldn’t find an association between the frequency of cash withdrawal and the clusters.

data$Q55cF <- factor(data$Q55c,
                    levels = c(0, 1),
                    labels = c("Not a concern", "Concern"))

chi_square <- chisq.test(data$Q55cF, as.factor(data$Group))
chi_square

## 
##  Pearson's Chi-squared test
## 
## data:  data$Q55cF and as.factor(data$Group)
## X-squared = 5.4779, df = 2, p-value = 0.06464

group_count_difficulties <- table(data$Q55cF, data$Group)
print(group_count_difficulties)

##                
##                  1  2  3
##   Not a concern 29 13 22
##   Concern       11 16 13

I tested whether there is an association between the clusters and the concern that seniors have about the difficulty of using technology. The test gives p = 0.065, which points to an association at the 10% significance level (though not at 5%): the Cash Conscious Group has proportionally more people who are concerned about this, while in the other two groups most seniors do not consider the difficulty of using technology a concern.

# Recode Q62 (1 = Yes, 2 = No) as a labelled factor.
data$Q62F <- factor(data$Q62,
                    levels = c(1, 2),
                    labels = c("Yes", "No"))

# Test of independence between the Q62 answer and cluster membership. Use the
# labelled factor (not the raw numeric column) so this test matches the other
# chi-square tests in the script and any code outside 1/2 is excluded.
chi_square <- chisq.test(data$Q62F, as.factor(data$Group))
chi_square

## 
##  Pearson's Chi-squared test
## 
## data:  data$Q62 and as.factor(data$Group)
## X-squared = 0.62055, df = 2, p-value = 0.7332

group_count_incentives <- table(data$Q62F, data$Group)
print(group_count_incentives)

##      
##        1  2  3
##   Yes 20 12 15
##   No  20 17 20

Extrinsic incentives don’t appear to have any association with the groups, this also shows that even people who are currently not into digital payments wouldn’t be encouraged by incentives.

kruskal.test(Q64 ~ Group, 
             data = data)

## 
##  Kruskal-Wallis rank sum test
## 
## data:  Q64 by Group
## Kruskal-Wallis chi-squared = 6.7353, df = 2, p-value = 0.03447

kruskal_effsize(Q64 ~ Group, 
                data = data)

## # A tibble: 1 × 5
##   .y.       n effsize method  magnitude
## * <chr> <int>   <dbl> <chr>   <ord>    
## 1 Q64     104  0.0469 eta2[H] small

group_count_edu <- table(data$Q64, data$Group)
print(group_count_edu)

##    
##      1  2  3
##   1  2  5  2
##   2  4  3  2
##   3  3  7  3
##   4  2  3  3
##   5  9  3 13
##   6  8  3  8
##   7 12  5  4

library(ggplot2)
library(dplyr)

# Long-format version of the Q64-by-cluster contingency table:
# one row per (answer, cluster) pair with its frequency.
group_count_df <- as.data.frame(group_count_edu)
names(group_count_df) <- c("Q64", "Cluster", "Count")

# Side-by-side bars: how often each Q64 answer occurs within each cluster.
ggplot(group_count_df, aes(x = Q64, y = Count, fill = Cluster)) +
  geom_col(position = "dodge") +
  labs(x = "Response to Q64", y = "Count", fill = "Cluster Group") +
  theme_minimal() +
  theme(axis.text.x = element_text(angle = 45, vjust = 0.5, hjust = 1))

# Mean Q64 answer within each cluster.
avg_response_per_group <- data %>%
  group_by(Group) %>%
  summarise(Average_Q64 = mean(Q64))

# Show the per-cluster means.
print(avg_response_per_group)

## # A tibble: 3 × 2
##   Group Average_Q64
##   <int>       <dbl>
## 1     1        5.1 
## 2     2        3.86
## 3     3        4.8

The interest differs significantly between the three groups (Kruskal-Wallis p = 0.034, small effect size). In the output above, people from Group 2 are on average much less interested (mean 3.86) than the other two groups (5.1 and 4.8).

NLB_cluster3

Jesus Amos Facundo

2025-02-04