This is an R Markdown Notebook. When you execute code within the notebook, the results appear beneath the code.
Try executing this chunk by clicking the Run button within the chunk or by placing your cursor inside it and pressing Ctrl+Shift+Enter.
plot(cars)
Add a new chunk by clicking the Insert Chunk button on the toolbar or by pressing Ctrl+Alt+I.
When you save the notebook, an HTML file containing the code and output will be saved alongside it (click the Preview button or press Ctrl+Shift+K to preview the HTML file).
The preview shows you a rendered HTML copy of the contents of the editor. Consequently, unlike Knit, Preview does not run any R code chunks. Instead, the output of the chunk when it was last run in the editor is displayed.
library(readxl)
library(dplyr)
library(tidyr)
library(tibble)
raw_sheet <- read_excel(
"Questionnaire_results_EN.xlsx",
sheet = "Podatki",
col_names = FALSE
)
raw_sheet
var_names <- raw_sheet |>
slice(1) |>
unlist(use.names = FALSE) |>
as.character()
question_text <- raw_sheet |>
slice(2) |>
unlist(use.names = FALSE) |>
as.character()
data_raw <- raw_sheet |>
slice(-(1:2))
names(data_raw) <- var_names
data_raw <- data_raw |>
mutate(respondent_id = row_number()) |>
relocate(respondent_id)
glimpse(data_raw)
Rows: 338
Columns: 93
$ respondent_id <int> 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18,…
$ Q1 <chr> "1", "1", "1", "1", "1", "1", "1", "1", "1", "1", "1", "1", "1…
$ Q2 <chr> "3", "2", "3", "2", "3", "2", "3", "3", "3", "3", "2", "2", "3…
$ Q3a <chr> "1", "1", "1", "1", "1", "1", "1", "1", "1", "1", "1", "1", "1…
$ Q3b <chr> "1", "1", "1", "1", "0", "1", "1", "1", "1", "1", "1", "1", "1…
$ Q3c <chr> "1", "1", "1", "1", "1", "1", "1", "1", "1", "1", "1", "1", "1…
$ Q3d <chr> "0", "0", "1", "0", "0", "0", "0", "0", "0", "0", "0", "0", "1…
$ Q3e <chr> "1", "1", "1", "1", "1", "1", "1", "1", "1", "1", "1", "0", "1…
$ Q3f <chr> "0", "0", "0", "0", "0", "0", "0", "1", "0", "0", "0", "0", "0…
$ Q3g <chr> "0", "0", "0", "0", "0", "0", "0", "0", "0", "0", "0", "0", "0…
$ Q3g_text <chr> "-2", "-2", "-2", "-2", "-2", "-2", "-2", "-2", "-2", "-2", "-…
$ Q4 <chr> "3", "2", "1", "1", "2", "2", "1", "1", "4", "2", "2", "1", "2…
$ Q4_5_text <chr> "-2", "-2", "-2", "-2", "-2", "-2", "-2", "-2", "-2", "-2", "-…
$ Q5a <chr> "5", "7", "7", "4", "5", "7", "7", "4", "5", "5", "7", "7", "7…
$ Q5b <chr> "5", "6", "5", "4", "2", "5", "2", "4", "4", "2", "2", "2", "1…
$ Q5c <chr> "6", "6", "7", "6", "7", "7", "7", "4", "5", "6", "7", "7", "7…
$ Q5d <chr> "6", "3", "5", "7", "5", "7", "7", "4", "6", "7", "6", "6", "4…
$ Q5e <chr> "5", "5", "6", "1", "4", "5", "4", "4", "3", "1", "6", "5", "1…
$ Q6a <chr> "6", "7", "7", "6", "5", "7", "6", "4", "4", "6", "6", "7", "7…
$ Q6b <chr> "2", "6", "7", "4", "4", "4", "4", "4", "3", "5", "3", "4", "4…
$ Q6c <chr> "6", "7", "7", "6", "5", "5", "3", "4", "6", "5", "5", "5", "4…
$ Q6d <chr> "2", "7", "6", "4", "6", "6", "5", "4", "4", "6", "6", "5", "5…
$ Q6e <chr> "5", "1", "7", "2", "3", "1", "2", "4", "4", "5", "2", "1", "1…
$ Q6f <chr> "6", "5", "3", "4", "3", "1", "2", "4", "5", "5", "1", "1", "1…
$ Q6g <chr> "6", "6", "7", "6", "5", "1", "4", "4", "6", "6", "5", "7", "4…
$ Q6h <chr> "5", "7", "7", "6", "6", "6", "3", "4", "5", "6", "6", "6", "4…
$ Q6i <chr> "3", "6", "5", "4", "4", "5", "3", "4", "4", "5", "2", "5", "4…
$ Q8a <chr> "5", "7", "7", "1", "3", "3", "2", "4", "4", "6", "6", "3", "1…
$ Q8b <chr> "7", "7", "6", "3", "3", "6", "2", "4", "3", "6", "6", "4", "4…
$ Q8c <chr> "2", "6", "5", "1", "2", "1", "2", "4", "3", "3", "1", "1", "1…
$ Q8d <chr> "5", "7", "7", "7", "3", "7", "2", "4", "3", "5", "5", "6", "1…
$ Q8e <chr> "5", "6", "7", "7", "2", "2", "5", "4", "3", "7", "5", "5", "4…
$ Q10a <chr> "7", "7", "7", "6", "3", "6", "2", "4", "1", "6", "6", "5", "4…
$ Q10b <chr> "7", "7", "7", "5", "3", "7", "2", "4", "5", "6", "5", "5", "6…
$ Q10c <chr> "5", "7", "5", "1", "4", "5", "2", "4", "4", "5", "5", "7", "4…
$ Q10d <chr> "3", "3", "6", "2", "3", "2", "2", "4", "4", "5", "4", "4", "4…
$ Q11a <chr> "4", "3", "7", "1", "1", "2", "1", "4", "4", "4", "2", "1", "1…
$ Q11b <chr> "3", "6", "7", "1", "1", "2", "1", "4", "1", "6", "6", "2", "1…
$ Q11c <chr> "6", "7", "5", "4", "1", "6", "1", "4", "4", "2", "3", "4", "1…
$ Q11d <chr> "1", "2", "5", "2", "1", "2", "1", "4", "4", "5", "1", "2", "1…
$ Q11e <chr> "1", "1", "5", "1", "1", "2", "1", "4", "4", "4", "2", "1", "1…
$ Q11f <chr> "3", "6", "7", "4", "1", "2", "1", "4", "4", "2", "3", "6", "1…
$ Q12a <chr> "7", "7", "7", "7", "7", "7", "7", "4", "7", "7", "7", "7", "7…
$ Q12b <chr> "7", "7", "7", "7", "7", "7", "7", "4", "7", "7", "6", "7", "7…
$ Q12c <chr> "7", "5", "7", "7", "7", "7", "7", "4", "7", "6", "7", "7", "7…
$ Q12d <chr> "5", "3", "7", "7", "5", "7", "7", "4", "7", "6", "7", "7", "7…
$ Q12e <chr> "6", "5", "7", "7", "6", "5", "7", "4", "7", "6", "4", "7", "7…
$ Q12f <chr> "4", "3", "6", "5", "4", "5", "7", "4", "7", "6", "1", "6", "7…
$ Q12g <chr> "7", "6", "7", "6", "6", "7", "7", "4", "7", "6", "3", "5", "7…
$ Q12h <chr> "6", "6", "7", "5", "4", "5", "7", "4", "7", "6", "2", "5", "7…
$ Q12i <chr> "4", "3", "7", "3", "2", "7", "7", "4", "7", "6", "7", "5", "7…
$ Q13a <chr> "6", "4", "8", "3", "7", "2", "8", "4", "5", "4", "8", "5", "7…
$ Q13b <chr> "8", "3", "2", "8", "5", "8", "8", "4", "8", "8", "8", "8", "1…
$ Q13c <chr> "3", "5", "8", "6", "4", "8", "8", "4", "8", "8", "2", "8", "8…
$ Q13d <chr> "7", "7", "8", "8", "6", "7", "6", "-1", "8", "8", "4", "8", "…
$ Q13e <chr> "7", "7", "8", "7", "4", "8", "6", "-1", "8", "8", "8", "8", "…
$ Q13f <chr> "8", "5", "8", "8", "4", "8", "8", "4", "8", "8", "8", "8", "8…
$ Q13g <chr> "8", "4", "8", "8", "5", "8", "8", "4", "1", "8", "8", "8", "8…
$ Q14a <chr> "7", "3", "8", "2", "7", "3", "8", "4", "7", "2", "8", "5", "7…
$ Q14b <chr> "8", "4", "4", "8", "5", "8", "8", "4", "8", "8", "8", "8", "1…
$ Q14c <chr> "8", "5", "8", "5", "4", "8", "8", "4", "8", "8", "8", "8", "-…
$ Q14d <chr> "5", "6", "8", "8", "5", "8", "6", "4", "8", "8", "8", "8", "-…
$ Q14e <chr> "8", "6", "8", "5", "8", "8", "5", "4", "8", "8", "8", "8", "-…
$ Q14f <chr> "8", "5", "8", "8", "8", "8", "8", "4", "8", "8", "8", "8", "8…
$ Q14g <chr> "8", "3", "8", "8", "8", "8", "8", "4", "1", "8", "8", "8", "8…
$ Q15a <chr> "7", "4", "8", "3", "7", "7", "6", "4", "7", "5", "8", "6", "7…
$ Q15b <chr> "8", "3", "5", "8", "5", "8", "8", "4", "8", "8", "8", "8", "3…
$ Q15c <chr> "8", "5", "8", "5", "5", "8", "8", "4", "8", "8", "7", "8", "-…
$ Q15d <chr> "7", "6", "7", "8", "5", "7", "6", "4", "8", "8", "6", "8", "8…
$ Q15e <chr> "8", "6", "8", "6", "8", "8", "6", "4", "8", "8", "8", "8", "8…
$ Q15f <chr> "8", "4", "8", "8", "8", "8", "-1", "4", "8", "8", "8", "8", "…
$ Q15g <chr> "8", "3", "8", "8", "8", "8", "8", "4", "1", "8", "8", "8", "8…
$ Q17 <chr> "5", "7", "5", "1", "3", "5", "1", "-1", "4", "6", "6", "3", "…
$ Q18 <chr> "1", "1", "1", "2", "2", "2", "2", "2", "2", "2", "2", "2", "2…
$ Q19 <chr> "1", "3", "1", "-2", "-2", "-2", "-2", "-2", "-2", "-2", "-2",…
$ Q21 <chr> "2002", "2002", "2002", "2001", "2004", "2002", "2000", "2004"…
$ Q22 <chr> "2", "1", "1", "2", "1", "2", "1", "1", "1", "1", "2", "1", "1…
$ Q23 <chr> "3", "1", "1", "2", "1", "1", "1", "3", "1", "1", "3", "2", "3…
$ Q24 <chr> "6", "6", "5", "6", "5", "5", "6", "5", "6", "6", "7", "6", "6…
$ Q25a <chr> "1", "0", "0", "0", "1", "1", "0", "1", "1", "1", "0", "1", "1…
$ Q25b <chr> "0", "0", "1", "0", "0", "0", "0", "0", "0", "0", "0", "0", "0…
$ Q25c <chr> "0", "0", "0", "1", "0", "0", "0", "0", "0", "0", "1", "0", "0…
$ Q25d <chr> "1", "1", "1", "0", "0", "1", "1", "0", "0", "0", "1", "0", "0…
$ Q25e <chr> "0", "0", "0", "1", "0", "0", "0", "0", "0", "0", "0", "0", "0…
$ Q25f <chr> "0", "0", "0", "0", "0", "0", "0", "0", "0", "0", "0", "0", "0…
$ Q25g <chr> "0", "0", "0", "0", "0", "0", "0", "0", "0", "0", "0", "0", "0…
$ Q25h <chr> "0", "1", "0", "0", "0", "0", "0", "0", "0", "0", "0", "0", "0…
$ Q25i <chr> "0", "0", "0", "0", "0", "0", "0", "0", "0", "0", "0", "0", "0…
$ Q25j <chr> "0", "0", "0", "0", "0", "0", "1", "0", "0", "0", "0", "0", "0…
$ Q25k <chr> "0", "0", "0", "0", "0", "0", "0", "0", "0", "0", "0", "0", "0…
$ Q25l <chr> "0", "0", "0", "0", "0", "0", "0", "0", "0", "0", "0", "0", "0…
$ Q25l_text <chr> "-2", "-2", "-2", "-2", "-2", "-2", "-2", "-2", "-2", "-2", "-…
$ Q26 <chr> "2", "3", "3", "2", "1", "3", "5", "1", "4", "1", "1", "1", "4…
question_lookup <- tibble(
variable = var_names,
question = question_text
)
question_lookup
data_clean <- data_raw |>
mutate(
across(
everything(),
~ replace(as.character(.), as.character(.) == "-1", NA)
)
)
glimpse(data_clean)
Rows: 338
Columns: 93
$ respondent_id <chr> "1", "2", "3", "4", "5", "6", "7", "8", "9", "10", "11", "12",…
$ Q1 <chr> "1", "1", "1", "1", "1", "1", "1", "1", "1", "1", "1", "1", "1…
$ Q2 <chr> "3", "2", "3", "2", "3", "2", "3", "3", "3", "3", "2", "2", "3…
$ Q3a <chr> "1", "1", "1", "1", "1", "1", "1", "1", "1", "1", "1", "1", "1…
$ Q3b <chr> "1", "1", "1", "1", "0", "1", "1", "1", "1", "1", "1", "1", "1…
$ Q3c <chr> "1", "1", "1", "1", "1", "1", "1", "1", "1", "1", "1", "1", "1…
$ Q3d <chr> "0", "0", "1", "0", "0", "0", "0", "0", "0", "0", "0", "0", "1…
$ Q3e <chr> "1", "1", "1", "1", "1", "1", "1", "1", "1", "1", "1", "0", "1…
$ Q3f <chr> "0", "0", "0", "0", "0", "0", "0", "1", "0", "0", "0", "0", "0…
$ Q3g <chr> "0", "0", "0", "0", "0", "0", "0", "0", "0", "0", "0", "0", "0…
$ Q3g_text <chr> "-2", "-2", "-2", "-2", "-2", "-2", "-2", "-2", "-2", "-2", "-…
$ Q4 <chr> "3", "2", "1", "1", "2", "2", "1", "1", "4", "2", "2", "1", "2…
$ Q4_5_text <chr> "-2", "-2", "-2", "-2", "-2", "-2", "-2", "-2", "-2", "-2", "-…
$ Q5a <chr> "5", "7", "7", "4", "5", "7", "7", "4", "5", "5", "7", "7", "7…
$ Q5b <chr> "5", "6", "5", "4", "2", "5", "2", "4", "4", "2", "2", "2", "1…
$ Q5c <chr> "6", "6", "7", "6", "7", "7", "7", "4", "5", "6", "7", "7", "7…
$ Q5d <chr> "6", "3", "5", "7", "5", "7", "7", "4", "6", "7", "6", "6", "4…
$ Q5e <chr> "5", "5", "6", "1", "4", "5", "4", "4", "3", "1", "6", "5", "1…
$ Q6a <chr> "6", "7", "7", "6", "5", "7", "6", "4", "4", "6", "6", "7", "7…
$ Q6b <chr> "2", "6", "7", "4", "4", "4", "4", "4", "3", "5", "3", "4", "4…
$ Q6c <chr> "6", "7", "7", "6", "5", "5", "3", "4", "6", "5", "5", "5", "4…
$ Q6d <chr> "2", "7", "6", "4", "6", "6", "5", "4", "4", "6", "6", "5", "5…
$ Q6e <chr> "5", "1", "7", "2", "3", "1", "2", "4", "4", "5", "2", "1", "1…
$ Q6f <chr> "6", "5", "3", "4", "3", "1", "2", "4", "5", "5", "1", "1", "1…
$ Q6g <chr> "6", "6", "7", "6", "5", "1", "4", "4", "6", "6", "5", "7", "4…
$ Q6h <chr> "5", "7", "7", "6", "6", "6", "3", "4", "5", "6", "6", "6", "4…
$ Q6i <chr> "3", "6", "5", "4", "4", "5", "3", "4", "4", "5", "2", "5", "4…
$ Q8a <chr> "5", "7", "7", "1", "3", "3", "2", "4", "4", "6", "6", "3", "1…
$ Q8b <chr> "7", "7", "6", "3", "3", "6", "2", "4", "3", "6", "6", "4", "4…
$ Q8c <chr> "2", "6", "5", "1", "2", "1", "2", "4", "3", "3", "1", "1", "1…
$ Q8d <chr> "5", "7", "7", "7", "3", "7", "2", "4", "3", "5", "5", "6", "1…
$ Q8e <chr> "5", "6", "7", "7", "2", "2", "5", "4", "3", "7", "5", "5", "4…
$ Q10a <chr> "7", "7", "7", "6", "3", "6", "2", "4", "1", "6", "6", "5", "4…
$ Q10b <chr> "7", "7", "7", "5", "3", "7", "2", "4", "5", "6", "5", "5", "6…
$ Q10c <chr> "5", "7", "5", "1", "4", "5", "2", "4", "4", "5", "5", "7", "4…
$ Q10d <chr> "3", "3", "6", "2", "3", "2", "2", "4", "4", "5", "4", "4", "4…
$ Q11a <chr> "4", "3", "7", "1", "1", "2", "1", "4", "4", "4", "2", "1", "1…
$ Q11b <chr> "3", "6", "7", "1", "1", "2", "1", "4", "1", "6", "6", "2", "1…
$ Q11c <chr> "6", "7", "5", "4", "1", "6", "1", "4", "4", "2", "3", "4", "1…
$ Q11d <chr> "1", "2", "5", "2", "1", "2", "1", "4", "4", "5", "1", "2", "1…
$ Q11e <chr> "1", "1", "5", "1", "1", "2", "1", "4", "4", "4", "2", "1", "1…
$ Q11f <chr> "3", "6", "7", "4", "1", "2", "1", "4", "4", "2", "3", "6", "1…
$ Q12a <chr> "7", "7", "7", "7", "7", "7", "7", "4", "7", "7", "7", "7", "7…
$ Q12b <chr> "7", "7", "7", "7", "7", "7", "7", "4", "7", "7", "6", "7", "7…
$ Q12c <chr> "7", "5", "7", "7", "7", "7", "7", "4", "7", "6", "7", "7", "7…
$ Q12d <chr> "5", "3", "7", "7", "5", "7", "7", "4", "7", "6", "7", "7", "7…
$ Q12e <chr> "6", "5", "7", "7", "6", "5", "7", "4", "7", "6", "4", "7", "7…
$ Q12f <chr> "4", "3", "6", "5", "4", "5", "7", "4", "7", "6", "1", "6", "7…
$ Q12g <chr> "7", "6", "7", "6", "6", "7", "7", "4", "7", "6", "3", "5", "7…
$ Q12h <chr> "6", "6", "7", "5", "4", "5", "7", "4", "7", "6", "2", "5", "7…
$ Q12i <chr> "4", "3", "7", "3", "2", "7", "7", "4", "7", "6", "7", "5", "7…
$ Q13a <chr> "6", "4", "8", "3", "7", "2", "8", "4", "5", "4", "8", "5", "7…
$ Q13b <chr> "8", "3", "2", "8", "5", "8", "8", "4", "8", "8", "8", "8", "1…
$ Q13c <chr> "3", "5", "8", "6", "4", "8", "8", "4", "8", "8", "2", "8", "8…
$ Q13d <chr> "7", "7", "8", "8", "6", "7", "6", NA, "8", "8", "4", "8", "8"…
$ Q13e <chr> "7", "7", "8", "7", "4", "8", "6", NA, "8", "8", "8", "8", "8"…
$ Q13f <chr> "8", "5", "8", "8", "4", "8", "8", "4", "8", "8", "8", "8", "8…
$ Q13g <chr> "8", "4", "8", "8", "5", "8", "8", "4", "1", "8", "8", "8", "8…
$ Q14a <chr> "7", "3", "8", "2", "7", "3", "8", "4", "7", "2", "8", "5", "7…
$ Q14b <chr> "8", "4", "4", "8", "5", "8", "8", "4", "8", "8", "8", "8", "1…
$ Q14c <chr> "8", "5", "8", "5", "4", "8", "8", "4", "8", "8", "8", "8", NA…
$ Q14d <chr> "5", "6", "8", "8", "5", "8", "6", "4", "8", "8", "8", "8", NA…
$ Q14e <chr> "8", "6", "8", "5", "8", "8", "5", "4", "8", "8", "8", "8", NA…
$ Q14f <chr> "8", "5", "8", "8", "8", "8", "8", "4", "8", "8", "8", "8", "8…
$ Q14g <chr> "8", "3", "8", "8", "8", "8", "8", "4", "1", "8", "8", "8", "8…
$ Q15a <chr> "7", "4", "8", "3", "7", "7", "6", "4", "7", "5", "8", "6", "7…
$ Q15b <chr> "8", "3", "5", "8", "5", "8", "8", "4", "8", "8", "8", "8", "3…
$ Q15c <chr> "8", "5", "8", "5", "5", "8", "8", "4", "8", "8", "7", "8", NA…
$ Q15d <chr> "7", "6", "7", "8", "5", "7", "6", "4", "8", "8", "6", "8", "8…
$ Q15e <chr> "8", "6", "8", "6", "8", "8", "6", "4", "8", "8", "8", "8", "8…
$ Q15f <chr> "8", "4", "8", "8", "8", "8", NA, "4", "8", "8", "8", "8", "8"…
$ Q15g <chr> "8", "3", "8", "8", "8", "8", "8", "4", "1", "8", "8", "8", "8…
$ Q17 <chr> "5", "7", "5", "1", "3", "5", "1", NA, "4", "6", "6", "3", "2"…
$ Q18 <chr> "1", "1", "1", "2", "2", "2", "2", "2", "2", "2", "2", "2", "2…
$ Q19 <chr> "1", "3", "1", "-2", "-2", "-2", "-2", "-2", "-2", "-2", "-2",…
$ Q21 <chr> "2002", "2002", "2002", "2001", "2004", "2002", "2000", "2004"…
$ Q22 <chr> "2", "1", "1", "2", "1", "2", "1", "1", "1", "1", "2", "1", "1…
$ Q23 <chr> "3", "1", "1", "2", "1", "1", "1", "3", "1", "1", "3", "2", "3…
$ Q24 <chr> "6", "6", "5", "6", "5", "5", "6", "5", "6", "6", "7", "6", "6…
$ Q25a <chr> "1", "0", "0", "0", "1", "1", "0", "1", "1", "1", "0", "1", "1…
$ Q25b <chr> "0", "0", "1", "0", "0", "0", "0", "0", "0", "0", "0", "0", "0…
$ Q25c <chr> "0", "0", "0", "1", "0", "0", "0", "0", "0", "0", "1", "0", "0…
$ Q25d <chr> "1", "1", "1", "0", "0", "1", "1", "0", "0", "0", "1", "0", "0…
$ Q25e <chr> "0", "0", "0", "1", "0", "0", "0", "0", "0", "0", "0", "0", "0…
$ Q25f <chr> "0", "0", "0", "0", "0", "0", "0", "0", "0", "0", "0", "0", "0…
$ Q25g <chr> "0", "0", "0", "0", "0", "0", "0", "0", "0", "0", "0", "0", "0…
$ Q25h <chr> "0", "1", "0", "0", "0", "0", "0", "0", "0", "0", "0", "0", "0…
$ Q25i <chr> "0", "0", "0", "0", "0", "0", "0", "0", "0", "0", "0", "0", "0…
$ Q25j <chr> "0", "0", "0", "0", "0", "0", "1", "0", "0", "0", "0", "0", "0…
$ Q25k <chr> "0", "0", "0", "0", "0", "0", "0", "0", "0", "0", "0", "0", "0…
$ Q25l <chr> "0", "0", "0", "0", "0", "0", "0", "0", "0", "0", "0", "0", "0…
$ Q25l_text <chr> "-2", "-2", "-2", "-2", "-2", "-2", "-2", "-2", "-2", "-2", "-…
$ Q26 <chr> "2", "3", "3", "2", "1", "3", "5", "1", "4", "1", "1", "1", "4…
ids_removed <- data_clean |>
filter(is.na(Q22) | is.na(Q23)) |>
pull(respondent_id)
ids_removed
[1] "63" "64" "181" "272" "325"
data_clean <- data_clean |>
filter(!is.na(Q22), !is.na(Q23))
nrow(data_raw)
[1] 338
nrow(data_clean)
[1] 333
missing_summary <- data_clean |>
summarise(across(everything(), ~ sum(is.na(.)))) |>
pivot_longer(
cols = everything(),
names_to = "variable",
values_to = "n_missing"
) |>
arrange(desc(n_missing))
missing_summary
question_lookup |>
filter(grepl("Q13|Q14|Q15", variable))
data_clean |>
select(Q13a:Q15g) |>
summarise(across(everything(), ~ paste(sort(unique(.)), collapse = ", "))) |>
pivot_longer(
cols = everything(),
names_to = "variable",
values_to = "values"
)
data_clean <- data_clean |>
mutate(
across(
Q13a:Q15g,
~ as.numeric(replace(., . == "8", NA))
)
)
data_clean |>
select(Q13a:Q15g) |>
summarise(across(everything(), ~ paste(sort(unique(.)), collapse = ", "))) |>
pivot_longer(
cols = everything(),
names_to = "variable",
values_to = "values"
)
bank_means <- tibble(
bank = c("OTP", "Gorenjska Banka", "NLB", "Revolut", "N26", "Intesa SanPaolo", "UniCredit"),
innovation = c(
mean(data_clean$Q13a, na.rm = TRUE),
mean(data_clean$Q13b, na.rm = TRUE),
mean(data_clean$Q13c, na.rm = TRUE),
mean(data_clean$Q13d, na.rm = TRUE),
mean(data_clean$Q13e, na.rm = TRUE),
mean(data_clean$Q13f, na.rm = TRUE),
mean(data_clean$Q13g, na.rm = TRUE)
),
customer_support = c(
mean(data_clean$Q14a, na.rm = TRUE),
mean(data_clean$Q14b, na.rm = TRUE),
mean(data_clean$Q14c, na.rm = TRUE),
mean(data_clean$Q14d, na.rm = TRUE),
mean(data_clean$Q14e, na.rm = TRUE),
mean(data_clean$Q14f, na.rm = TRUE),
mean(data_clean$Q14g, na.rm = TRUE)
),
reliability = c(
mean(data_clean$Q15a, na.rm = TRUE),
mean(data_clean$Q15b, na.rm = TRUE),
mean(data_clean$Q15c, na.rm = TRUE),
mean(data_clean$Q15d, na.rm = TRUE),
mean(data_clean$Q15e, na.rm = TRUE),
mean(data_clean$Q15f, na.rm = TRUE),
mean(data_clean$Q15g, na.rm = TRUE)
)
)
bank_means
pca_result <- prcomp(
bank_means[, c("innovation", "customer_support", "reliability")],
scale. = TRUE
)
pca_result
Standard deviations (1, .., p=3):
[1] 1.6443246 0.5059028 0.2006464
Rotation (n x k) = (3 x 3):
PC1 PC2 PC3
innovation 0.5700010 0.66141369 0.4874739
customer_support 0.5614043 -0.74671513 0.3567095
reliability 0.5999367 0.07034521 -0.7969489
bank_coordinates <- as.data.frame(pca_result$x)
bank_coordinates$bank <- bank_means$bank
bank_coordinates
plot(
bank_coordinates$PC1,
bank_coordinates$PC2,
xlab = "PC1",
ylab = "PC2",
main = "Perception Map of Banks",
pch = 19
)
text(
bank_coordinates$PC1,
bank_coordinates$PC2,
labels = bank_coordinates$bank,
pos = 3
)
library(ggplot2)
ggplot(bank_coordinates, aes(x = PC1, y = PC2, label = bank)) +
geom_point(size = 3) +
geom_text(vjust = -0.7) +
labs(
title = "Perception Map of Banks",
x = "Dimension 1",
y = "Dimension 2"
) +
theme_minimal()
library(FactoMineR)
library(factoextra)
mat <- bank_means |>
tibble::column_to_rownames("bank") |>
as.matrix()
pca_bank <- FactoMineR::PCA(mat, scale.unit = TRUE, graph = FALSE)
factoextra::fviz_pca_ind(
pca_bank,
repel = TRUE
) +
ggplot2::ggtitle("Perceptual map of banks (Q13–Q15)")
factoextra::fviz_pca_biplot(
pca_bank,
repel = TRUE,
col.var = "gray30"
) +
ggplot2::ggtitle("Perceptual map of banks with dimensions (Q13–Q15)")
factoextra::fviz_pca_biplot(
pca_bank,
repel = TRUE,
col.var = "gray35",
col.ind = "steelblue",
pointsize = 3
) +
ggplot2::ggtitle("Perceptual Map of Banks") +
ggplot2::xlab("Overall Digital Banking Performance") +
ggplot2::ylab("Customer Support vs Innovation") +
ggplot2::theme_minimal() +
ggplot2::theme(
plot.title = ggplot2::element_text(face = "bold"),
axis.title = ggplot2::element_text(face = "bold")
)
factoextra::fviz_pca_biplot(
pca_bank,
repel = TRUE,
col.var = "gray30"
) +
ggplot2::ggtitle("Perceptual map of banks with dimensions (Q13–Q15)") +
ggplot2::xlab("Dim1 (90.1%): Overall Digital Banking Performance") +
ggplot2::ylab("Dim2 (8.5%): Customer Support vs Innovation")
data_clean |>
select(Q6a, Q6b, Q6d, Q6e, Q6h, Q6i, Q8a, Q8b, Q8c, Q8d, Q8e) |>
summarise(across(everything(), ~ paste(sort(unique(.)), collapse = ", "))) |>
pivot_longer(
cols = everything(),
names_to = "variable",
values_to = "values"
)
cluster_data <- data_clean |>
select(Q6a, Q6b, Q6d, Q6e, Q6h, Q6i, Q8a, Q8b, Q8c, Q8d, Q8e) |>
mutate(across(everything(), as.numeric))
glimpse(cluster_data)
Rows: 333
Columns: 11
$ Q6a <dbl> 6, 7, 7, 6, 5, 7, 6, 4, 4, 6, 6, 7, 7, 5, 7, 7, 6, 6, 5, 4, 6, 7, 7, 7, …
$ Q6b <dbl> 2, 6, 7, 4, 4, 4, 4, 4, 3, 5, 3, 4, 4, 2, 1, 7, NA, 6, 5, 3, 6, 2, 1, 5,…
$ Q6d <dbl> 2, 7, 6, 4, 6, 6, 5, 4, 4, 6, 6, 5, 5, 2, 5, 6, 3, 3, 5, 5, 7, 7, 7, 5, …
$ Q6e <dbl> 5, 1, 7, 2, 3, 1, 2, 4, 4, 5, 2, 1, 1, 1, 2, 5, 2, 7, 2, 5, 5, 2, 1, 2, …
$ Q6h <dbl> 5, 7, 7, 6, 6, 6, 3, 4, 5, 6, 6, 6, 4, 3, 4, 7, NA, 6, 4, 5, 6, 4, 7, 7,…
$ Q6i <dbl> 3, 6, 5, 4, 4, 5, 3, 4, 4, 5, 2, 5, 4, 2, 2, 3, 7, 4, 5, 5, 7, 4, 1, 7, …
$ Q8a <dbl> 5, 7, 7, 1, 3, 3, 2, 4, 4, 6, 6, 3, 1, 3, 1, 5, 6, 4, 4, 4, 7, 4, 1, 6, …
$ Q8b <dbl> 7, 7, 6, 3, 3, 6, 2, 4, 3, 6, 6, 4, 4, 3, 3, 6, 6, 5, 3, 5, 7, 2, 7, 6, …
$ Q8c <dbl> 2, 6, 5, 1, 2, 1, 2, 4, 3, 3, 1, 1, 1, 3, 3, 3, 1, 1, 2, 2, 7, 3, 1, 1, …
$ Q8d <dbl> 5, 7, 7, 7, 3, 7, 2, 4, 3, 5, 5, 6, 1, 3, 3, 7, 4, 1, 5, 5, 7, 6, 7, 7, …
$ Q8e <dbl> 5, 6, 7, 7, 2, 2, 5, 4, 3, 7, 5, 5, 4, 3, 5, 5, 4, 1, 6, 5, 7, 4, 7, 7, …
cluster_data |>
summarise(across(everything(), ~ sum(is.na(.)))) |>
pivot_longer(
cols = everything(),
names_to = "variable",
values_to = "n_missing"
)
cluster_data_complete <- cluster_data |>
tidyr::drop_na()
nrow(cluster_data)
[1] 333
nrow(cluster_data_complete)
[1] 323
cluster_scaled <- scale(cluster_data_complete)
cluster_scaled
Q6a Q6b Q6d Q6e Q6h Q6i
[1,] -0.01386611 -0.8167893 -1.71942066 0.8688513 0.01405589 -0.48919895
[2,] 0.73259278 1.1780913 1.11411441 -1.1392987 1.14906922 1.04489097
[3,] 0.73259278 1.6768114 0.54740739 1.8729263 1.14906922 0.53352766
[4,] -0.01386611 0.1806510 -0.58600663 -0.6372612 0.58156256 0.02216435
[5,] -0.76032500 0.1806510 0.54740739 -0.1352237 0.58156256 0.02216435
[6,] 0.73259278 0.1806510 0.54740739 -1.1392987 0.58156256 0.53352766
[7,] -0.01386611 0.1806510 -0.01929962 -0.6372612 -1.12095744 -0.48919895
[8,] -1.50678388 0.1806510 -0.58600663 0.3668138 -0.55345077 0.02216435
[9,] -1.50678388 -0.3180692 -0.58600663 0.3668138 0.01405589 0.02216435
[10,] -0.01386611 0.6793711 0.54740739 0.8688513 0.58156256 0.53352766
[11,] -0.01386611 -0.3180692 0.54740739 -0.6372612 0.58156256 -1.00056226
[12,] 0.73259278 0.1806510 -0.01929962 -1.1392987 0.58156256 0.53352766
[13,] 0.73259278 0.1806510 -0.01929962 -1.1392987 -0.55345077 0.02216435
[14,] -0.76032500 -0.8167893 -1.71942066 -1.1392987 -1.12095744 -1.00056226
[15,] 0.73259278 -1.3155095 -0.01929962 -0.6372612 -0.55345077 -1.00056226
[16,] 0.73259278 1.6768114 0.54740739 0.8688513 1.14906922 -0.48919895
[17,] -0.01386611 1.1780913 -1.15271365 1.8729263 0.58156256 0.02216435
[18,] -0.76032500 0.6793711 -0.01929962 -0.6372612 -0.55345077 0.53352766
[19,] -1.50678388 -0.3180692 -0.01929962 0.8688513 0.01405589 0.53352766
[20,] -0.01386611 1.1780913 1.11411441 0.8688513 0.58156256 1.55625428
[21,] 0.73259278 -0.8167893 1.11411441 -0.6372612 -0.55345077 0.02216435
[22,] 0.73259278 -1.3155095 1.11411441 -1.1392987 1.14906922 -1.51192557
[23,] 0.73259278 0.6793711 -0.01929962 -0.6372612 1.14906922 1.55625428
[24,] 0.73259278 -0.8167893 0.54740739 -0.1352237 1.14906922 1.55625428
[25,] -0.76032500 -0.3180692 -0.01929962 -0.6372612 -1.12095744 1.04489097
[26,] -2.25324277 -0.3180692 -0.58600663 -0.6372612 0.01405589 0.02216435
[27,] 0.73259278 1.1780913 1.11411441 -0.1352237 0.01405589 0.53352766
[28,] -0.01386611 -0.3180692 -0.01929962 0.3668138 0.01405589 -0.48919895
[29,] 0.73259278 0.6793711 0.54740739 -0.6372612 1.14906922 1.55625428
[30,] 0.73259278 1.6768114 1.11411441 -0.1352237 1.14906922 -1.00056226
[31,] -0.01386611 1.6768114 -1.71942066 0.8688513 -0.55345077 0.53352766
[32,] 0.73259278 0.1806510 0.54740739 -0.1352237 1.14906922 0.02216435
[33,] -0.01386611 -0.8167893 -0.01929962 -1.1392987 0.01405589 -1.00056226
[34,] 0.73259278 1.6768114 1.11411441 0.3668138 1.14906922 -1.51192557
[35,] -0.01386611 0.6793711 -0.58600663 1.8729263 -1.68846410 -1.00056226
[36,] 0.73259278 1.6768114 1.11411441 0.8688513 1.14906922 1.04489097
[37,] -0.01386611 -1.3155095 1.11411441 1.8729263 1.14906922 1.55625428
[38,] -0.76032500 0.1806510 -0.01929962 0.3668138 0.01405589 1.04489097
[39,] 0.73259278 1.6768114 -1.71942066 -1.1392987 -0.55345077 -1.51192557
[40,] -0.01386611 0.1806510 -0.01929962 1.3708888 0.01405589 0.02216435
[41,] -0.76032500 1.6768114 1.11411441 1.8729263 1.14906922 0.53352766
[42,] -0.76032500 0.6793711 -0.01929962 -0.1352237 0.01405589 0.02216435
[43,] 0.73259278 1.1780913 0.54740739 -0.1352237 -0.55345077 0.53352766
[44,] -0.01386611 0.6793711 -1.15271365 0.8688513 0.01405589 1.04489097
[45,] 0.73259278 0.6793711 0.54740739 0.3668138 1.14906922 0.02216435
[46,] -0.01386611 0.6793711 -0.01929962 -0.6372612 0.58156256 -0.48919895
[47,] 0.73259278 -1.3155095 -1.71942066 -0.6372612 1.14906922 1.04489097
[48,] -0.01386611 -0.3180692 -0.01929962 0.8688513 -0.55345077 0.53352766
[49,] 0.73259278 -1.3155095 -2.28612767 -1.1392987 -2.25597077 -1.51192557
[50,] -0.01386611 1.1780913 -0.01929962 -0.1352237 0.01405589 0.53352766
[51,] 0.73259278 1.6768114 1.11411441 1.3708888 1.14906922 0.02216435
[52,] -0.76032500 -1.3155095 -0.58600663 0.8688513 0.58156256 0.53352766
[53,] -0.76032500 -0.8167893 -0.58600663 -0.1352237 0.58156256 0.02216435
[54,] -1.50678388 -0.8167893 -0.01929962 -0.1352237 0.58156256 0.02216435
[55,] -0.01386611 -0.3180692 -1.71942066 -0.6372612 -0.55345077 1.55625428
[56,] -0.01386611 0.1806510 0.54740739 -0.6372612 0.01405589 -0.48919895
[57,] -0.01386611 -0.3180692 0.54740739 -0.1352237 0.58156256 -1.00056226
[58,] -0.01386611 -0.3180692 -0.58600663 -0.6372612 0.01405589 -0.48919895
[59,] -0.01386611 0.1806510 -0.01929962 -0.6372612 0.01405589 -0.48919895
[60,] 0.73259278 1.6768114 -0.01929962 0.8688513 0.58156256 -0.48919895
[61,] -0.01386611 0.6793711 -0.01929962 0.8688513 -1.12095744 0.02216435
[62,] -0.76032500 1.1780913 0.54740739 0.3668138 1.14906922 -1.51192557
[63,] 0.73259278 1.6768114 1.11411441 -0.1352237 1.14906922 1.55625428
[64,] -0.76032500 0.1806510 0.54740739 -0.1352237 1.14906922 0.53352766
[65,] 0.73259278 0.6793711 1.11411441 1.8729263 0.58156256 -1.51192557
[66,] 0.73259278 1.6768114 1.11411441 1.8729263 1.14906922 1.55625428
[67,] -1.50678388 1.6768114 1.11411441 1.8729263 1.14906922 1.55625428
[68,] 0.73259278 1.6768114 1.11411441 1.8729263 1.14906922 1.55625428
[69,] 0.73259278 0.1806510 -0.01929962 0.3668138 0.01405589 0.53352766
[70,] -0.76032500 -1.3155095 0.54740739 -1.1392987 0.01405589 1.04489097
[71,] 0.73259278 0.1806510 1.11411441 1.8729263 1.14906922 1.55625428
[72,] -1.50678388 0.1806510 1.11411441 0.3668138 -0.55345077 0.02216435
[73,] -2.25324277 -0.8167893 0.54740739 0.3668138 1.14906922 1.04489097
[74,] -1.50678388 0.1806510 -0.58600663 -1.1392987 -2.25597077 -1.51192557
[75,] -0.76032500 -0.3180692 1.11411441 -0.1352237 1.14906922 -0.48919895
[76,] 0.73259278 1.6768114 -1.15271365 -1.1392987 -0.55345077 -1.51192557
[77,] -0.01386611 0.1806510 -0.58600663 -0.1352237 -0.55345077 0.53352766
[78,] -0.01386611 -0.8167893 -1.15271365 -1.1392987 0.58156256 -0.48919895
[79,] -0.01386611 -0.3180692 -0.01929962 1.3708888 0.01405589 -0.48919895
[80,] 0.73259278 1.6768114 1.11411441 -1.1392987 0.58156256 1.55625428
[81,] -0.01386611 0.6793711 -0.01929962 0.8688513 0.01405589 -0.48919895
[82,] -0.01386611 -1.3155095 1.11411441 0.8688513 0.58156256 0.02216435
[83,] 0.73259278 -0.3180692 1.11411441 -1.1392987 -2.25597077 -1.51192557
[84,] -0.01386611 1.1780913 0.54740739 1.3708888 0.58156256 -0.48919895
[85,] 0.73259278 0.1806510 1.11411441 1.8729263 1.14906922 1.55625428
[86,] -0.76032500 -1.3155095 -1.15271365 0.8688513 -0.55345077 0.53352766
[87,] 0.73259278 1.6768114 1.11411441 1.8729263 1.14906922 1.55625428
[88,] -0.01386611 1.6768114 -0.01929962 -1.1392987 1.14906922 0.02216435
[89,] -0.01386611 -1.3155095 -0.01929962 -0.1352237 0.01405589 -1.00056226
[90,] -0.01386611 -0.8167893 -1.15271365 -0.1352237 -1.12095744 -0.48919895
Q8a Q8b Q8c Q8d Q8e
[1,] 0.56865867 1.2364653 -0.55192981 0.1609586 -0.1379083
[2,] 1.59766008 1.2364653 1.41251459 1.1607592 0.4053160
[3,] 1.59766008 0.7088850 0.92140349 1.1607592 0.9485403
[4,] -1.48934414 -0.8738559 -1.04304091 1.1607592 0.9485403
[5,] -0.46034273 -0.8738559 -0.55192981 -0.8388419 -1.7675813
[6,] -0.46034273 0.7088850 -1.04304091 1.1607592 -1.7675813
[7,] -0.97484344 -1.4014362 -0.55192981 -1.3387422 -0.1379083
[8,] 0.05415797 -0.3462756 0.43029239 -0.3389417 -0.6811327
[9,] 0.05415797 -0.8738559 -0.06081871 -0.8388419 -1.2243570
[10,] 1.08315937 0.7088850 -0.06081871 0.1609586 0.9485403
[11,] 1.08315937 0.7088850 -1.04304091 0.1609586 -0.1379083
[12,] -0.46034273 -0.3462756 -1.04304091 0.6608589 -0.1379083
[13,] -1.48934414 -0.3462756 -1.04304091 -1.8386425 -0.6811327
[14,] -0.46034273 -0.8738559 -0.06081871 -0.8388419 -1.2243570
[15,] -1.48934414 -0.8738559 -0.06081871 -0.8388419 -0.1379083
[16,] 0.56865867 0.7088850 -0.06081871 1.1607592 -0.1379083
[17,] 0.05415797 0.1813047 -1.04304091 -1.8386425 -2.3108056
[18,] 0.05415797 -0.8738559 -0.55192981 0.1609586 0.4053160
[19,] 0.05415797 0.1813047 -0.55192981 0.1609586 -0.1379083
[20,] 1.59766008 1.2364653 1.90362568 1.1607592 0.9485403
[21,] 0.05415797 -1.4014362 -0.06081871 0.6608589 -0.6811327
[22,] -1.48934414 1.2364653 -1.04304091 1.1607592 0.9485403
[23,] 1.08315937 0.7088850 -1.04304091 1.1607592 0.9485403
[24,] 1.08315937 1.2364653 0.92140349 1.1607592 0.9485403
[25,] 1.59766008 1.2364653 -1.04304091 1.1607592 -0.6811327
[26,] 0.05415797 0.7088850 -1.04304091 0.1609586 0.4053160
[27,] 0.56865867 0.1813047 -0.06081871 0.6608589 0.4053160
[28,] -0.46034273 0.1813047 0.92140349 -0.3389417 -0.1379083
[29,] 0.56865867 1.2364653 0.43029239 1.1607592 0.9485403
[30,] -1.48934414 0.1813047 -1.04304091 1.1607592 0.9485403
[31,] 0.05415797 1.2364653 -1.04304091 0.6608589 0.9485403
[32,] 1.08315937 0.1813047 0.43029239 0.1609586 0.4053160
[33,] 0.56865867 0.1813047 -1.04304091 -0.3389417 -0.1379083
[34,] 0.05415797 0.1813047 -1.04304091 1.1607592 -0.1379083
[35,] -0.97484344 -1.4014362 -0.06081871 -1.3387422 -0.6811327
[36,] 1.59766008 0.7088850 0.43029239 1.1607592 0.9485403
[37,] 1.59766008 1.2364653 1.90362568 0.1609586 0.9485403
[38,] -0.46034273 0.7088850 -0.55192981 0.6608589 0.4053160
[39,] -0.97484344 -0.3462756 -1.04304091 0.6608589 -2.3108056
[40,] -0.97484344 -0.3462756 -0.55192981 -0.3389417 -0.6811327
[41,] 0.05415797 0.1813047 -1.04304091 0.1609586 -0.1379083
[42,] 0.05415797 0.1813047 0.43029239 0.1609586 -0.1379083
[43,] -0.97484344 -0.8738559 0.92140349 0.6608589 0.9485403
[44,] -0.97484344 0.7088850 -1.04304091 1.1607592 -2.3108056
[45,] 1.59766008 1.2364653 1.90362568 1.1607592 0.9485403
[46,] 0.05415797 0.1813047 -0.06081871 0.1609586 -0.1379083
[47,] 1.59766008 1.2364653 -1.04304091 1.1607592 0.9485403
[48,] 0.05415797 -0.3462756 0.43029239 -0.3389417 -0.6811327
[49,] -1.48934414 -1.9290165 -1.04304091 -1.8386425 -2.3108056
[50,] -1.48934414 -0.8738559 0.43029239 -0.8388419 0.4053160
[51,] -0.97484344 0.1813047 -1.04304091 0.1609586 -0.1379083
[52,] 0.56865867 1.2364653 -1.04304091 1.1607592 0.4053160
[53,] -1.48934414 0.1813047 -1.04304091 0.1609586 0.4053160
[54,] 1.59766008 1.2364653 -1.04304091 -0.8388419 -0.1379083
[55,] -0.46034273 0.1813047 -1.04304091 1.1607592 -0.6811327
[56,] 1.08315937 0.7088850 0.43029239 0.1609586 0.9485403
[57,] 1.08315937 1.2364653 -1.04304091 -0.3389417 -0.1379083
[58,] -0.97484344 -0.3462756 -0.06081871 -0.3389417 -1.2243570
[59,] -0.97484344 0.1813047 -1.04304091 -1.3387422 0.9485403
[60,] 1.08315937 0.7088850 0.92140349 0.1609586 0.9485403
[61,] -0.46034273 -0.3462756 -0.06081871 0.1609586 -0.1379083
[62,] 0.05415797 1.2364653 -1.04304091 1.1607592 0.4053160
[63,] 0.05415797 1.2364653 -1.04304091 1.1607592 0.9485403
[64,] 0.56865867 0.7088850 0.92140349 1.1607592 0.4053160
[65,] -1.48934414 1.2364653 -0.06081871 1.1607592 0.9485403
[66,] 1.59766008 1.2364653 1.90362568 1.1607592 0.9485403
[67,] 1.59766008 1.2364653 1.90362568 1.1607592 0.9485403
[68,] 1.08315937 0.7088850 1.41251459 0.6608589 0.4053160
[69,] -0.46034273 0.1813047 -0.06081871 -0.3389417 -0.1379083
[70,] 1.08315937 0.1813047 0.43029239 0.1609586 -0.1379083
[71,] 1.59766008 1.2364653 -1.04304091 1.1607592 0.9485403
[72,] 0.56865867 0.7088850 -0.06081871 0.1609586 0.4053160
[73,] 0.05415797 1.2364653 -0.55192981 1.1607592 0.4053160
[74,] -1.48934414 -1.9290165 -1.04304091 -1.8386425 -2.3108056
[75,] -0.46034273 1.2364653 -1.04304091 1.1607592 -0.1379083
[76,] -1.48934414 -0.3462756 -1.04304091 0.1609586 0.9485403
[77,] 0.05415797 -0.3462756 0.43029239 -0.3389417 -0.1379083
[78,] 0.56865867 0.7088850 -1.04304091 0.6608589 0.9485403
[79,] 0.05415797 1.2364653 -1.04304091 0.6608589 -0.1379083
[80,] 0.56865867 1.2364653 -0.06081871 1.1607592 0.9485403
[81,] 1.08315937 0.1813047 0.43029239 -0.3389417 -0.6811327
[82,] 1.59766008 1.2364653 1.90362568 1.1607592 0.9485403
[83,] 0.05415797 0.1813047 -1.04304091 1.1607592 0.9485403
[84,] 1.08315937 0.7088850 1.90362568 0.6608589 0.4053160
[85,] 0.56865867 0.1813047 0.92140349 0.1609586 0.4053160
[86,] 0.05415797 0.1813047 0.43029239 0.6608589 0.4053160
[87,] 1.59766008 1.2364653 1.90362568 1.1607592 0.9485403
[88,] -1.48934414 -0.3462756 -1.04304091 1.1607592 0.9485403
[89,] -0.97484344 -0.8738559 -1.04304091 0.6608589 -0.6811327
[90,] -0.97484344 -0.8738559 -0.06081871 -0.8388419 -0.1379083
[ reached 'max' / getOption("max.print") -- omitted 233 rows ]
attr(,"scaled:center")
Q6a Q6b Q6d Q6e Q6h Q6i Q8a Q8b Q8c
6.018576 3.637771 5.034056 3.269350 4.975232 3.956656 3.894737 4.656347 3.123839
Q8d Q8e
4.678019 5.253870
attr(,"scaled:scale")
Q6a Q6b Q6d Q6e Q6h Q6i Q8a Q8b Q8c
1.339659 2.005133 1.764580 1.991883 1.762094 1.955557 1.943632 1.895446 2.036199
Q8d Q8e
2.000399 1.840860
library(factoextra)
fviz_nbclust(cluster_scaled, kmeans, method = "wss")
fviz_nbclust(cluster_scaled, kmeans, method = "silhouette")
install.packages("NbClust")
WARNING: Rtools is required to build R packages but is not currently installed. Please download and install the appropriate version of Rtools before proceeding:
https://cran.rstudio.com/bin/windows/Rtools/
trying URL 'https://cran.rstudio.com/bin/windows/contrib/4.5/NbClust_3.0.1.zip'
Content type 'application/zip' length 122135 bytes (119 KB)
downloaded 119 KB
package ‘NbClust’ successfully unpacked and MD5 sums checked
The downloaded binary packages are in
C:\Users\frach\AppData\Local\Temp\Rtmp0giUNF\downloaded_packages
library(NbClust)
nb <- NbClust(
data = cluster_scaled,
distance = "euclidean",
min.nc = 2,
max.nc = 6,
method = "kmeans"
)
*** : The Hubert index is a graphical method of determining the number of clusters.
In the plot of Hubert index, we seek a significant knee that corresponds to a
significant increase of the value of the measure i.e the significant peak in Hubert
index second differences plot.
*** : The D index is a graphical method of determining the number of clusters.
In the plot of D index, we seek a significant knee (the significant peak in Dindex
second differences plot) that corresponds to a significant increase of the value of
the measure.
*******************************************************************
* Among all indices:
* 11 proposed 2 as the best number of clusters
* 9 proposed 3 as the best number of clusters
* 1 proposed 4 as the best number of clusters
* 1 proposed 5 as the best number of clusters
* 2 proposed 6 as the best number of clusters
***** Conclusion *****
* According to the majority rule, the best number of clusters is 2
*******************************************************************
set.seed(123)
k2 <- kmeans(cluster_scaled, centers = 2, nstart = 25)
k2
K-means clustering with 2 clusters of sizes 238, 85
Cluster means:
Q6a Q6b Q6d Q6e Q6h Q6i Q8a
1 0.05513429 0.2037010 0.2259559 0.1579831 0.3264230 0.2542116 0.3459966
2 -0.15437602 -0.5703629 -0.6326766 -0.4423526 -0.9139844 -0.7117924 -0.9687905
Q8b Q8c Q8d Q8e
1 0.4340112 0.2218797 0.4193104 0.3368423
2 -1.2152314 -0.6212631 -1.1740692 -0.9431585
Clustering vector:
[1] 1 1 1 1 2 1 2 1 2 1 1 1 2 2 2 1 2 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 2 1 1 1 2 1
[41] 1 1 1 1 1 1 1 1 2 1 1 1 1 1 1 1 1 2 2 1 1 1 1 1 1 1 1 1 1 1 1 1 1 2 1 2 1 1 1 1
[81] 1 1 1 1 1 1 1 1 2 2 1 1 1 2 1 1 1 2 1 2 1 1 1 1 1 1 1 1 1 1 1 1 2 1 1 1 1 1 1 1
[121] 1 1 1 1 1 1 1 1 1 1 1 2 1 1 1 1 1 1 1 2 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 2 1
[161] 1 2 2 2 2 1 2 1 2 2 1 2 2 1 1 2 1 1 2 1 2 1 1 1 1 1 2 1 2 2 2 2 2 1 1 1 1 1 2 2
[201] 1 1 1 1 1 1 1 1 1 1 1 2 2 1 2 2 2 1 1 2 1 1 1 1 1 1 1 1 2 1 2 1 1 2 2 1 1 2 1 1
[241] 2 2 1 1 1 1 2 1 2 2 1 2 1 2 1 1 1 2 1 1 1 2 1 2 1 1 1 1 1 1 1 1 2 1 1 2 1 2 2 1
[281] 1 1 1 1 2 1 2 1 1 2 1 1 1 2 2 2 1 1 2 2 1 1 1 2 2 1 2 1 2 1 2 1 2 1 1 1 2 1 2 1
[321] 2 1 1
Within cluster sum of squares by cluster:
[1] 1846.8502 846.5349
(between_SS / total_SS = 24.0 %)
Available components:
[1] "cluster" "centers" "totss" "withinss" "tot.withinss"
[6] "betweenss" "size" "iter" "ifault"
cluster_profile <- cluster_data_complete |>
mutate(cluster = factor(k2$cluster))
cluster_profile
cluster_means <- cluster_profile |>
group_by(cluster) |>
summarise(across(everything(), mean))
cluster_means
cluster_means_long <- cluster_means |>
pivot_longer(
cols = -cluster,
names_to = "variable",
values_to = "mean_score"
)
ggplot(cluster_means_long, aes(x = variable, y = mean_score, group = cluster, color = cluster)) +
geom_line(linewidth = 1) +
geom_point(size = 2) +
labs(
title = "Cluster Profiles",
x = "Variables",
y = "Mean score"
) +
theme_minimal()
factoextra::fviz_cluster(
k2,
data = cluster_scaled,
ellipse.type = "norm",
repel = TRUE,
show.clust.cent = TRUE
)
factoextra::fviz_cluster(
k2,
data = cluster_scaled,
geom = "point",
ellipse.type = "norm",
show.clust.cent = TRUE
)
cor(cluster_data_complete)
Q6a Q6b Q6d Q6e Q6h Q6i Q8a
Q6a 1.00000000 0.01176175 0.1389877 -0.07520157 0.1764847 0.07617645 0.03653469
Q6b 0.01176175 1.00000000 0.3414224 0.42261790 0.3745293 0.15121749 0.17824705
Q6d 0.13898774 0.34142235 1.0000000 0.31193154 0.4736980 0.13182600 0.21203009
Q6e -0.07520157 0.42261790 0.3119315 1.00000000 0.3850304 0.21827155 0.18302136
Q6h 0.17648474 0.37452926 0.4736980 0.38503042 1.0000000 0.39082943 0.38280357
Q6i 0.07617645 0.15121749 0.1318260 0.21827155 0.3908294 1.00000000 0.41059955
Q8a 0.03653469 0.17824705 0.2120301 0.18302136 0.3828036 0.41059955 1.00000000
Q8b 0.13460949 0.27520233 0.2848516 0.25079750 0.5134995 0.38807919 0.63081727
Q8c 0.04241674 0.17760187 0.1803331 0.16862760 0.1878175 0.34685944 0.48433201
Q8d 0.05438779 0.26814657 0.2617786 0.20343458 0.4461829 0.40209612 0.50086129
Q8e 0.19956982 0.29759027 0.3816634 0.14136779 0.4021385 0.20320936 0.40589447
Q8b Q8c Q8d Q8e
Q6a 0.1346095 0.04241674 0.05438779 0.1995698
Q6b 0.2752023 0.17760187 0.26814657 0.2975903
Q6d 0.2848516 0.18033308 0.26177856 0.3816634
Q6e 0.2507975 0.16862760 0.20343458 0.1413678
Q6h 0.5134995 0.18781747 0.44618285 0.4021385
Q6i 0.3880792 0.34685944 0.40209612 0.2032094
Q8a 0.6308173 0.48433201 0.50086129 0.4058945
Q8b 1.0000000 0.33453431 0.75702507 0.5697885
Q8c 0.3345343 1.00000000 0.37960444 0.3014531
Q8d 0.7570251 0.37960444 1.00000000 0.5274318
Q8e 0.5697885 0.30145309 0.52743180 1.0000000
install.packages("hopkins")
WARNING: Rtools is required to build R packages but is not currently installed. Please download and install the appropriate version of Rtools before proceeding:
https://cran.rstudio.com/bin/windows/Rtools/
trying URL 'https://cran.rstudio.com/bin/windows/contrib/4.5/hopkins_1.1.zip'
Content type 'application/zip' length 86999 bytes (84 KB)
downloaded 84 KB
package ‘hopkins’ successfully unpacked and MD5 sums checked
The downloaded binary packages are in
C:\Users\frach\AppData\Local\Temp\Rtmp0giUNF\downloaded_packages
library(hopkins)
hopkins(cluster_data_complete)
[1] 0.8835682
hc <- hclust(dist(cluster_scaled), method = "ward.D2")
plot(hc, labels = FALSE, hang = -1, main = "Dendrogram")
summary(aov(Q6a ~ cluster, data = cluster_profile))
Df Sum Sq Mean Sq F value Pr(>F)
cluster 1 4.9 4.934 2.764 0.0974 .
Residuals 321 573.0 1.785
---
Signif. codes: 0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1
summary(aov(Q8a ~ cluster, data = cluster_profile))
Df Sum Sq Mean Sq F value Pr(>F)
cluster 1 409.0 409.0 162.6 <2e-16 ***
Residuals 321 807.4 2.5
---
Signif. codes: 0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1
anova_results <- lapply(names(cluster_data_complete), function(var) {
model <- aov(as.formula(paste(var, "~ cluster")), data = cluster_profile)
data.frame(
variable = var,
p_value = summary(model)[[1]][["Pr(>F)"]][1]
)
})
anova_results <- do.call(rbind, anova_results)
anova_results
3 groups now !!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
set.seed(123)
k3 <- kmeans(cluster_scaled, centers = 3, nstart = 25)
k3
K-means clustering with 3 clusters of sizes 162, 67, 94
Cluster means:
Q6a Q6b Q6d Q6e Q6h Q6i Q8a
1 -0.24886243 -0.0964158 -0.1277436 -0.1662137 0.00354651 -0.04728005 0.03827832
2 0.06412213 -0.7200228 -0.6705898 -0.5698234 -1.10401694 -0.79449048 -1.09770928
3 0.38318649 0.6793711 0.6981273 0.6926041 0.78079362 0.64776840 0.71644079
Q8b Q8c Q8d Q8e
1 0.1454813 -0.1426706 0.1424438 0.06328585
2 -1.4014362 -0.6838701 -1.3536646 -1.01355352
3 0.7481729 0.7333184 0.7193578 0.61335934
Clustering vector:
[1] 1 3 3 1 1 1 2 1 1 3 1 1 2 2 2 3 1 1 1 3 1 1 3 3 1 1 3 1 3 1 1 3 1 3 2 3 3 1 2 1
[41] 3 1 1 1 3 1 1 1 2 1 3 1 1 1 1 1 1 1 1 3 1 1 3 3 3 3 3 3 1 1 3 1 1 2 1 1 1 1 1 3
[81] 1 3 1 3 3 1 3 1 1 2 3 3 1 2 3 3 1 2 3 2 1 3 1 1 3 1 3 1 1 1 1 1 2 1 1 3 3 3 3 1
[121] 1 3 1 1 1 1 1 1 1 1 1 2 3 3 1 1 3 3 1 2 3 1 3 3 1 1 3 3 1 3 3 3 3 3 3 1 3 3 2 3
[161] 1 2 2 2 2 1 2 1 2 2 1 2 2 1 1 1 1 1 2 1 1 1 3 1 1 1 2 1 2 2 2 2 2 3 1 3 1 1 2 2
[201] 1 1 1 1 3 3 1 3 3 3 1 2 2 1 2 2 2 1 3 2 3 1 3 1 1 1 3 3 1 1 2 3 3 1 2 1 1 2 3 1
[241] 2 1 1 1 1 1 2 1 2 2 1 2 1 2 1 3 3 2 1 3 1 2 1 2 1 1 1 1 3 1 3 3 2 3 3 1 3 2 2 1
[281] 3 1 3 1 2 1 2 1 1 2 1 3 3 2 1 2 1 1 1 1 1 1 3 2 2 3 1 3 2 1 2 1 2 1 1 3 2 1 1 1
[321] 2 1 3
Within cluster sum of squares by cluster:
[1] 1167.8778 632.8631 584.9675
(between_SS / total_SS = 32.6 %)
Available components:
[1] "cluster" "centers" "totss" "withinss" "tot.withinss"
[6] "betweenss" "size" "iter" "ifault"
set.seed(123)
k4 <- kmeans(cluster_scaled, centers = 4, nstart = 25)
k4
K-means clustering with 4 clusters of sizes 97, 69, 84, 73
Cluster means:
Q6a Q6b Q6d Q6e Q6h Q6i Q8a
1 -0.59102504 0.3965916 0.08586251 0.4444485 0.18372283 -0.18870711 -0.2269610
2 0.09431634 -0.7517389 -0.69277752 -0.5863299 -1.06338430 -0.79305251 -1.0643218
3 0.27938560 -0.5793036 -0.14748335 -0.7030042 -0.06026046 0.09521625 0.4094085
4 0.37470153 0.8501657 0.71043270 0.7725701 0.83033260 0.89078148 0.8364810
Q8b Q8c Q8d Q8e
1 -0.05801009 -0.2684018 -0.01941779 -0.04270408
2 -1.36320575 -0.6871633 -1.28802770 -1.03540940
3 0.50790201 0.1964300 0.45256710 0.36651424
4 0.78115625 0.7801250 0.72249042 0.61367600
Clustering vector:
[1] 3 4 4 1 1 3 2 1 1 4 3 3 2 2 2 4 1 1 1 4 3 3 4 4 3 1 4 1 4 1 1 4 3 1 2 4 4 1 2 1
[41] 1 1 1 1 4 1 3 1 2 1 1 3 1 1 3 3 3 2 1 4 1 1 4 4 1 4 4 4 1 3 4 1 1 2 1 1 1 3 1 4
[81] 1 4 3 4 4 3 4 1 2 2 4 4 3 2 3 4 1 2 4 2 1 4 3 1 4 3 4 1 1 3 1 1 2 1 3 1 4 1 4 1
[121] 1 3 1 3 1 3 3 1 1 1 3 2 4 1 1 1 3 4 3 2 4 3 3 4 1 3 4 4 1 4 4 1 4 4 4 3 3 4 2 3
[161] 3 2 2 2 2 3 2 1 2 2 3 2 2 3 3 1 3 3 2 3 1 1 3 3 3 3 2 3 2 2 2 2 2 4 3 4 3 3 1 2
[201] 1 3 3 1 4 4 1 4 4 4 1 2 2 1 2 2 2 3 1 2 4 1 4 1 3 1 3 4 1 1 2 4 4 1 2 1 3 2 4 1
[241] 2 1 3 1 3 1 2 1 2 2 3 2 3 2 3 3 4 2 3 4 3 2 3 2 1 3 3 3 3 1 4 4 2 4 3 3 1 2 2 3
[281] 4 3 4 1 2 3 2 3 3 2 1 4 4 2 1 2 3 1 1 1 1 1 4 2 2 4 2 4 2 1 2 3 2 1 3 4 2 3 1 3
[321] 2 3 4
Within cluster sum of squares by cluster:
[1] 687.5222 639.5021 506.3095 371.1580
(between_SS / total_SS = 37.8 %)
Available components:
[1] "cluster" "centers" "totss" "withinss" "tot.withinss"
[6] "betweenss" "size" "iter" "ifault"
cluster_profile_4 <- cluster_data_complete |>
mutate(cluster = factor(k4$cluster))
cluster_means_4 <- cluster_profile_4 |>
group_by(cluster) |>
summarise(across(everything(), mean))
cluster_means_4
factoextra::fviz_cluster(
k4,
data = cluster_scaled,
geom = "point",
ellipse.type = "none",
show.clust.cent = TRUE
)
cluster_sizes_4 <- data.frame(table(k4$cluster))
cluster_sizes_4$percent <- round(100 * cluster_sizes_4$Freq / sum(cluster_sizes_4$Freq), 1)
cluster_sizes_4
cluster_means_4_long <- cluster_means_4 |>
pivot_longer(
cols = -cluster,
names_to = "variable",
values_to = "mean_score"
)
ggplot(
cluster_means_4_long,
aes(x = variable, y = mean_score, group = cluster, color = cluster)
) +
geom_line(linewidth = 1) +
geom_point(size = 2) +
labs(
title = "Cluster Profiles (4-Cluster Solution)",
x = "Variables",
y = "Mean score",
color = "Cluster"
) +
theme_minimal()
pca_cluster <- prcomp(cluster_data_complete, scale. = TRUE)
summary(pca_cluster)
Importance of components:
PC1 PC2 PC3 PC4 PC5 PC6 PC7 PC8
Standard deviation 2.044 1.168 1.0708 0.92877 0.87763 0.80827 0.74116 0.69154
Proportion of Variance 0.380 0.124 0.1042 0.07842 0.07002 0.05939 0.04994 0.04348
Cumulative Proportion 0.380 0.504 0.6082 0.68667 0.75669 0.81608 0.86602 0.90950
PC9 PC10 PC11
Standard deviation 0.6465 0.61845 0.44167
Proportion of Variance 0.0380 0.03477 0.01773
Cumulative Proportion 0.9475 0.98227 1.00000
round(pca_cluster$rotation[, 1:3], 3)
PC1 PC2 PC3
Q6a 0.086 -0.053 0.761
Q6b 0.246 0.480 -0.112
Q6d 0.262 0.409 0.239
Q6e 0.223 0.493 -0.343
Q6h 0.353 0.239 0.121
Q6i 0.276 -0.200 -0.218
Q8a 0.347 -0.314 -0.156
Q8b 0.404 -0.196 0.045
Q8c 0.265 -0.272 -0.235
Q8d 0.381 -0.213 -0.025
Q8e 0.337 -0.055 0.301
pca_scores_2 <- as.data.frame(pca_cluster$x[, 1:2])
head(pca_scores_2)
fviz_nbclust(pca_scores_2, kmeans, method = "wss")
fviz_nbclust(pca_scores_2, kmeans, method = "silhouette")
nb_pca2 <- NbClust(
data = pca_scores_2,
distance = "euclidean",
min.nc = 2,
max.nc = 6,
method = "kmeans"
)
*** : The Hubert index is a graphical method of determining the number of clusters.
In the plot of Hubert index, we seek a significant knee that corresponds to a
significant increase of the value of the measure i.e the significant peak in Hubert
index second differences plot.
*** : The D index is a graphical method of determining the number of clusters.
In the plot of D index, we seek a significant knee (the significant peak in Dindex
second differences plot) that corresponds to a significant increase of the value of
the measure.
*******************************************************************
* Among all indices:
* 10 proposed 2 as the best number of clusters
* 7 proposed 3 as the best number of clusters
* 3 proposed 4 as the best number of clusters
* 2 proposed 5 as the best number of clusters
* 2 proposed 6 as the best number of clusters
***** Conclusion *****
* According to the majority rule, the best number of clusters is 2
*******************************************************************
set.seed(123)
k4_pca2 <- kmeans(pca_scores_2, centers = 4, nstart = 25)
k4_pca2
K-means clustering with 4 clusters of sizes 102, 69, 70, 82
Cluster means:
PC1 PC2
1 0.1690658 0.7994389
2 -3.0310459 0.1049628
3 2.5341729 0.1558428
4 0.1768945 -1.2157828
Clustering vector:
[1] 4 3 3 1 1 1 2 1 1 3 1 1 2 2 2 3 1 1 1 3 4 1 3 3 4 4 3 1 3 1 1 3 4 1 2 3 3 1 2 1
[41] 1 1 1 1 3 1 4 1 2 1 1 4 1 4 4 4 1 2 1 3 1 1 3 3 1 3 3 3 1 4 3 1 3 2 1 1 4 4 1 3
[81] 1 3 4 3 3 4 3 1 2 2 3 3 4 2 4 3 1 2 3 4 1 3 4 1 3 4 3 1 1 4 1 1 2 1 4 1 4 1 3 4
[121] 1 4 4 1 1 4 4 1 1 1 4 2 1 1 4 1 3 3 4 2 3 4 3 3 1 4 3 3 4 3 3 1 1 3 3 1 3 1 2 4
[161] 4 2 2 2 2 4 2 1 2 2 4 2 2 4 4 1 4 4 2 4 4 1 4 4 4 4 2 4 2 2 2 2 2 3 4 1 4 4 1 2
[201] 1 4 4 1 3 3 1 3 3 3 1 2 2 1 2 2 2 4 1 2 3 1 3 1 1 1 4 3 2 1 2 3 3 1 2 1 4 2 1 1
[241] 2 1 1 4 4 1 2 1 2 2 1 2 4 2 4 3 3 2 4 3 4 2 4 2 1 4 4 4 3 1 1 3 2 1 4 4 1 2 2 4
[281] 3 4 3 1 2 4 2 4 4 2 1 1 3 2 4 2 4 1 2 1 1 4 3 2 2 3 4 3 2 1 2 1 2 1 4 3 2 4 1 4
[321] 2 4 3
Within cluster sum of squares by cluster:
[1] 131.7003 167.9382 101.4012 106.3939
(between_SS / total_SS = 71.6 %)
Available components:
[1] "cluster" "centers" "totss" "withinss" "tot.withinss"
[6] "betweenss" "size" "iter" "ifault"
factoextra::fviz_cluster(
k4_pca2,
data = pca_scores_2,
geom = "point",
ellipse.type = "none",
show.clust.cent = TRUE
)
cluster_profile_pca4 <- cluster_data_complete |>
mutate(cluster = factor(k4_pca2$cluster))
cluster_means_pca4 <- cluster_profile_pca4 |>
group_by(cluster) |>
summarise(across(everything(), mean))
cluster_means_pca4
pca_plot_data <- pca_scores_2 |>
mutate(
cluster = factor(
k4_pca2$cluster,
levels = c(1, 2, 3, 4),
labels = c(
"Cautious guidance seekers",
"AI skeptics",
"AI-enthusiastic guidance seekers",
"Feature-oriented adopters"
)
)
)
ggplot(pca_plot_data, aes(x = PC1, y = PC2, color = cluster, shape = cluster)) +
geom_point(size = 2.5, alpha = 0.8) +
labs(
title = "PCA-based 4-cluster solution",
x = "PC1",
y = "PC2",
color = "Cluster",
shape = "Cluster"
) +
theme_minimal()
cluster_means_pca4_named <- cluster_means_pca4 |>
mutate(
cluster = factor(
cluster,
levels = c(1, 2, 3, 4),
labels = c(
"Cautious guidance seekers",
"AI skeptics",
"AI-enthusiastic guidance seekers",
"Feature-oriented adopters"
)
)
)
cluster_means_pca4_long <- cluster_means_pca4_named |>
pivot_longer(
cols = -cluster,
names_to = "variable",
values_to = "mean_score"
)
ggplot(
cluster_means_pca4_long,
aes(x = variable, y = mean_score, group = cluster, color = cluster)
) +
geom_line(linewidth = 1) +
geom_point(size = 2) +
labs(
title = "Cluster Profiles (PCA-based 4-Cluster Solution)",
x = "Variables",
y = "Mean score",
color = "Cluster"
) +
theme_minimal()
cor_matrix <- cor(cluster_data_complete)
round(cor_matrix, 3)
Q6a Q6b Q6d Q6e Q6h Q6i Q8a Q8b Q8c Q8d Q8e
Q6a 1.000 0.012 0.139 -0.075 0.176 0.076 0.037 0.135 0.042 0.054 0.200
Q6b 0.012 1.000 0.341 0.423 0.375 0.151 0.178 0.275 0.178 0.268 0.298
Q6d 0.139 0.341 1.000 0.312 0.474 0.132 0.212 0.285 0.180 0.262 0.382
Q6e -0.075 0.423 0.312 1.000 0.385 0.218 0.183 0.251 0.169 0.203 0.141
Q6h 0.176 0.375 0.474 0.385 1.000 0.391 0.383 0.513 0.188 0.446 0.402
Q6i 0.076 0.151 0.132 0.218 0.391 1.000 0.411 0.388 0.347 0.402 0.203
Q8a 0.037 0.178 0.212 0.183 0.383 0.411 1.000 0.631 0.484 0.501 0.406
Q8b 0.135 0.275 0.285 0.251 0.513 0.388 0.631 1.000 0.335 0.757 0.570
Q8c 0.042 0.178 0.180 0.169 0.188 0.347 0.484 0.335 1.000 0.380 0.301
Q8d 0.054 0.268 0.262 0.203 0.446 0.402 0.501 0.757 0.380 1.000 0.527
Q8e 0.200 0.298 0.382 0.141 0.402 0.203 0.406 0.570 0.301 0.527 1.000
library(corrplot)
corrplot(
cor(cluster_data_complete),
method = "color",
type = "upper",
tl.col = "black",
tl.srt = 45
)
pca_variance <- data.frame(
Component = paste0("PC", 1:length(pca_cluster$sdev)),
Eigenvalue = pca_cluster$sdev^2,
Proportion_Variance = (pca_cluster$sdev^2) / sum(pca_cluster$sdev^2),
Cumulative_Variance = cumsum((pca_cluster$sdev^2) / sum(pca_cluster$sdev^2))
)
pca_variance$Eigenvalue <- round(pca_variance$Eigenvalue, 3)
pca_variance$Proportion_Variance <- round(pca_variance$Proportion_Variance, 3)
pca_variance$Cumulative_Variance <- round(pca_variance$Cumulative_Variance, 3)
pca_variance
pca_loadings <- as.data.frame(pca_cluster$rotation[, 1:3])
pca_loadings$Variable <- rownames(pca_loadings)
rownames(pca_loadings) <- NULL
pca_loadings <- pca_loadings[, c("Variable", "PC1", "PC2", "PC3")]
pca_loadings$PC1 <- round(pca_loadings$PC1, 3)
pca_loadings$PC2 <- round(pca_loadings$PC2, 3)
pca_loadings$PC3 <- round(pca_loadings$PC3, 3)
pca_loadings
hopkins(cluster_data_complete)
[1] 0.9490564
distance_matrix <- dist(cluster_data_complete, method = "euclidean")
distance_matrix
1 2 3 4 5 6 7 8
2 9.797959
3 8.717798 6.480741
4 7.745967 9.899495 10.099505
9 10 11 12 13 14 15 16
2
3
4
17 18 19 20 21 22 23 24
2
3
4
25 26 27 28 29 30 31 32
2
3
4
33 34 35 36 37 38 39 40
2
3
4
41 42 43 44 45 46 47 48
2
3
4
49 50 51 52 53 54 55 56
2
3
4
57 58 59 60 61 62 63 64
2
3
4
65 66 67 68 69 70 71 72
2
3
4
73 74 75 76 77 78 79 80
2
3
4
81 82 83 84 85 86 87 88
2
3
4
89 90 91 92 93 94 95 96
2
3
4
97 98 99 100 101 102 103 104
2
3
4
105 106 107 108 109 110 111 112
2
3
4
113 114 115 116 117 118 119 120
2
3
4
121 122 123 124 125 126 127 128
2
3
4
129 130 131 132 133 134 135 136
2
3
4
137 138 139 140 141 142 143 144
2
3
4
145 146 147 148 149 150 151 152
2
3
4
153 154 155 156 157 158 159 160
2
3
4
161 162 163 164 165 166 167 168
2
3
4
169 170 171 172 173 174 175 176
2
3
4
177 178 179 180 181 182 183 184
2
3
4
185 186 187 188 189 190 191 192
2
3
4
193 194 195 196 197 198 199 200
2
3
4
201 202 203 204 205 206 207 208
2
3
4
209 210 211 212 213 214 215 216
2
3
4
217 218 219 220 221 222 223 224
2
3
4
225 226 227 228 229 230 231 232
2
3
4
233 234 235 236 237 238 239 240
2
3
4
241 242 243 244 245 246 247 248
2
3
4
249 250 251 252 253 254 255 256
2
3
4
257 258 259 260 261 262 263 264
2
3
4
265 266 267 268 269 270 271 272
2
3
4
273 274 275 276 277 278 279 280
2
3
4
281 282 283 284 285 286 287 288
2
3
4
289 290 291 292 293 294 295 296
2
3
4
297 298 299 300 301 302 303 304
2
3
4
305 306 307 308 309 310 311 312
2
3
4
313 314 315 316 317 318 319 320
2
3
4
321 322
2
3
4
[ reached 'max' / getOption("max.print") -- omitted 319 rows ]
distance_matrix_table <- as.matrix(distance_matrix)
distance_matrix_table[1:10, 1:10]
1 2 3 4 5 6 7 8
1 0.000000 9.797959 8.717798 7.745967 7.745967 7.810250 8.306624 5.567764
2 9.797959 0.000000 6.480741 9.899495 9.899495 8.062258 11.269428 8.888194
3 8.717798 6.480741 0.000000 10.099505 10.295630 10.148892 11.618950 8.544004
4 7.745967 9.899495 10.099505 0.000000 7.211103 6.708204 6.557439 7.000000
5 7.745967 9.899495 10.295630 7.211103 0.000000 5.916080 5.000000 4.582576
6 7.810250 8.062258 10.148892 6.708204 5.916080 0.000000 8.246211 7.348469
7 8.306624 11.269428 11.618950 6.557439 5.000000 8.246211 0.000000 5.291503
8 5.567764 8.888194 8.544004 7.000000 4.582576 7.348469 5.291503 0.000000
9 6.082763 10.148892 9.848858 7.416198 3.316625 7.483315 5.099020 2.449490
10 6.082763 6.082763 4.358899 7.549834 7.416198 7.745967 8.366600 5.830952
9 10
1 6.082763 6.082763
2 10.148892 6.082763
3 9.848858 4.358899
4 7.416198 7.549834
5 3.316625 7.416198
6 7.483315 7.745967
7 5.099020 8.366600
8 2.449490 5.830952
9 0.000000 6.928203
10 6.928203 0.000000
hc <- hclust(dist(cluster_data_complete), method = "ward.D2")
plot(hc, labels = FALSE, hang = -1, main = "Dendrogram")
cluster_sizes_pca4 <- data.frame(table(k4_pca2$cluster))
cluster_sizes_pca4$Percent <- round(
100 * cluster_sizes_pca4$Freq / sum(cluster_sizes_pca4$Freq),
1
)
names(cluster_sizes_pca4) <- c("Cluster", "Size", "Percent")
cluster_sizes_pca4
cluster_sizes_pca4 <- data.frame(table(k4_pca2$cluster))
cluster_sizes_pca4$Percent <- round(
100 * cluster_sizes_pca4$Freq / sum(cluster_sizes_pca4$Freq),
1
)
cluster_sizes_pca4$Cluster_Name <- c(
"Cautious guidance seekers",
"AI skeptics",
"AI-enthusiastic guidance seekers",
"Feature-oriented adopters"
)
cluster_sizes_pca4 <- cluster_sizes_pca4[, c("Var1", "Cluster_Name", "Freq", "Percent")]
names(cluster_sizes_pca4) <- c("Cluster", "Cluster_Name", "Size", "Percent")
cluster_sizes_pca4
cluster_mean_table <- cluster_means_pca4 |>
mutate(
cluster = factor(
cluster,
levels = c(1,2,3,4),
labels = c(
"Cautious guidance seekers",
"AI skeptics",
"AI-enthusiastic guidance seekers",
"Feature-oriented adopters"
)
)
)
cluster_mean_table
ggplot(pca_plot_data, aes(x = PC1, y = PC2, color = cluster, shape = cluster)) +
geom_point(size = 2.5, alpha = 0.8) +
labs(
title = "Customer Segmentation Based on PCA Clustering",
x = "Interest in AI banking features (PC1)",
y = "Need for financial guidance (PC2)",
color = "Cluster",
shape = "Cluster"
) +
theme_minimal()
set.seed(123)
k5_pca2 <- kmeans(pca_scores_2, centers = 5, nstart = 25)
k5_pca2
K-means clustering with 5 clusters of sizes 56, 99, 73, 33, 62
Cluster means:
PC1 PC2
1 -1.7951634 0.6037454
2 0.4238399 0.6740437
3 0.3347926 -1.3280570
4 -3.9696761 -0.4418461
5 2.6633622 0.1772421
Clustering vector:
[1] 3 5 5 2 1 2 1 1 1 5 2 2 1 4 1 5 1 2 2 5 1 2 5 5 3 3 2 2 5 2 2 2 3 2 1 5 5 2 1 2
[41] 2 2 2 2 5 2 3 2 4 2 2 3 1 3 3 3 2 1 1 5 2 2 5 5 2 5 5 5 2 3 5 2 5 4 2 1 3 3 2 5
[81] 2 5 3 5 5 3 5 2 1 1 5 2 3 4 3 5 2 1 5 1 2 5 3 2 5 3 5 2 2 3 2 2 4 2 3 2 3 2 5 3
[121] 2 3 3 2 2 3 3 2 2 2 3 4 2 2 3 1 3 5 3 1 5 3 2 5 2 3 5 5 3 5 5 2 2 5 5 2 5 2 1 3
[161] 3 4 1 4 4 3 4 2 4 4 3 1 4 3 3 1 3 3 4 3 1 2 3 2 3 3 1 3 4 1 4 1 1 5 3 2 3 3 1 1
[201] 2 3 3 2 5 2 2 5 5 5 2 1 1 2 4 4 4 3 2 4 5 2 2 2 2 2 3 5 1 2 4 5 5 1 4 2 3 1 2 2
[241] 1 1 2 3 3 2 1 2 4 4 2 1 3 1 3 2 5 1 3 5 3 1 2 4 2 3 3 3 5 2 2 5 1 2 3 3 2 1 4 2
[281] 5 3 5 2 4 3 1 3 3 1 2 2 5 4 1 1 3 2 1 1 2 1 5 1 4 5 1 5 1 2 4 2 4 2 2 5 4 3 1 3
[321] 4 3 5
Within cluster sum of squares by cluster:
[1] 84.46534 102.03941 90.80653 56.55989 82.57159
(between_SS / total_SS = 76.7 %)
Available components:
[1] "cluster" "centers" "totss" "withinss" "tot.withinss"
[6] "betweenss" "size" "iter" "ifault"
cluster_profile_pca5 <- cluster_data_complete |>
mutate(cluster = factor(k5_pca2$cluster))
cluster_means_pca5 <- cluster_profile_pca5 |>
group_by(cluster) |>
summarise(across(everything(), mean))
cluster_means_pca5
cluster_means_pca5_named <- cluster_means_pca5 |>
mutate(
cluster = factor(
cluster,
levels = c(1, 2, 3, 4, 5),
labels = c(
"Skeptical support seekers",
"Cautious guidance seekers",
"Feature-oriented adopters",
"AI-resistant independents",
"AI-enthusiastic guidance seekers"
)
)
)
cluster_means_pca5_long <- cluster_means_pca5_named |>
pivot_longer(
cols = -cluster,
names_to = "variable",
values_to = "mean_score"
)
ggplot(
cluster_means_pca5_long,
aes(x = variable, y = mean_score, group = cluster, color = cluster)
) +
geom_line(linewidth = 1) +
geom_point(size = 2) +
labs(
title = "Cluster Profiles (PCA-based 5-Cluster Solution)",
x = "Variables",
y = "Mean score",
color = "Cluster"
) +
theme_minimal()
cluster_sizes_pca5 <- data.frame(table(k5_pca2$cluster))
cluster_sizes_pca5$Percent <- round(
100 * cluster_sizes_pca5$Freq / sum(cluster_sizes_pca5$Freq),
1
)
cluster_sizes_pca5$Cluster_Name <- c(
"Skeptical support seekers",
"Cautious guidance seekers",
"Feature-oriented adopters",
"AI-resistant independents",
"AI-enthusiastic guidance seekers"
)
cluster_sizes_pca5 <- cluster_sizes_pca5[, c("Var1", "Cluster_Name", "Freq", "Percent")]
names(cluster_sizes_pca5) <- c("Cluster", "Cluster_Name", "Size", "Percent")
cluster_sizes_pca5
cluster_profile_pca5 <- cluster_data_complete |>
mutate(cluster = factor(k5_pca2$cluster))
cluster_means_pca5 <- cluster_profile_pca5 |>
group_by(cluster) |>
summarise(across(everything(), mean))
cluster_mean_table_pca5 <- cluster_means_pca5 |>
mutate(
cluster = factor(
cluster,
levels = c(1, 2, 3, 4, 5),
labels = c(
"Skeptical support seekers",
"Cautious guidance seekers",
"Feature-oriented adopters",
"AI-resistant independents",
"AI-enthusiastic guidance seekers"
)
)
)
cluster_mean_table_pca5
pca_plot_data_5 <- pca_scores_2 |>
mutate(
cluster = factor(
k5_pca2$cluster,
levels = c(1, 2, 3, 4, 5),
labels = c(
"Skeptical support seekers",
"Cautious guidance seekers",
"Feature-oriented adopters",
"AI-resistant independents",
"AI-enthusiastic guidance seekers"
)
)
)
ggplot(pca_plot_data_5, aes(x = PC1, y = PC2, color = cluster, shape = cluster)) +
geom_point(size = 2.5, alpha = 0.8) +
labs(
title = "PCA-based 5-Cluster Solution",
x = "PC1",
y = "PC2",
color = "Cluster",
shape = "Cluster"
) +
theme_minimal()
pca_plot_data_5 <- pca_scores_2 |>
mutate(
cluster = factor(
k5_pca2$cluster,
levels = c(1, 2, 3, 4, 5),
labels = c(
"Skeptical support seekers",
"Cautious guidance seekers",
"Feature-oriented adopters",
"AI-resistant independents",
"AI-enthusiastic guidance seekers"
)
)
)
ggplot(pca_plot_data_5, aes(x = PC1, y = PC2, color = cluster, shape = cluster)) +
geom_point(size = 2.8, alpha = 0.8) +
labs(
title = "Customer Segmentation (PCA-based 5-Cluster Solution)",
x = "Interest in AI Banking Features (PC1)",
y = "Need for Financial Guidance (PC2)",
color = "Cluster",
shape = "Cluster"
) +
theme_minimal()
cluster_profile_pca2_5 <- pca_scores_2 |>
mutate(cluster = factor(k5_pca2$cluster))
cluster_means_pca2_5 <- cluster_profile_pca2_5 |>
group_by(cluster) |>
summarise(across(everything(), mean))
cluster_means_pca2_5_named <- cluster_means_pca2_5 |>
mutate(
cluster = factor(
cluster,
levels = c(1, 2, 3, 4, 5),
labels = c(
"Skeptical support seekers",
"Cautious guidance seekers",
"Feature-oriented adopters",
"AI-resistant independents",
"AI-enthusiastic guidance seekers"
)
)
) |>
rename(
`Interest in AI Banking Features` = PC1,
`Need for Financial Guidance` = PC2
)
cluster_means_pca2_5_long <- cluster_means_pca2_5_named |>
pivot_longer(
cols = -cluster,
names_to = "component",
values_to = "mean_score"
)
ggplot(
cluster_means_pca2_5_long,
aes(x = component, y = mean_score, group = cluster, color = cluster)
) +
geom_line(linewidth = 1) +
geom_point(size = 2) +
labs(
title = "Cluster Profiles (5-Cluster Solution Based on 2 PCs)",
x = "Principal Components",
y = "Mean component score",
color = "Cluster"
) +
theme_minimal()
cluster_profile_pca2_4 <- pca_scores_2 |>
mutate(cluster = factor(k4_pca2$cluster))
cluster_means_pca2_4 <- cluster_profile_pca2_4 |>
group_by(cluster) |>
summarise(across(everything(), mean))
cluster_means_pca2_4_named <- cluster_means_pca2_4 |>
mutate(
cluster = factor(
cluster,
levels = c(1, 2, 3, 4),
labels = c(
"Cautious guidance seekers",
"AI skeptics",
"AI-enthusiastic guidance seekers",
"Feature-oriented adopters"
)
)
) |>
rename(
`Interest in AI Banking Features` = PC1,
`Need for Financial Guidance` = PC2
)
cluster_means_pca2_4_long <- cluster_means_pca2_4_named |>
pivot_longer(
cols = -cluster,
names_to = "component",
values_to = "mean_score"
)
ggplot(
cluster_means_pca2_4_long,
aes(x = component, y = mean_score, group = cluster, color = cluster)
) +
geom_line(linewidth = 1) +
geom_point(size = 2) +
labs(
title = "Cluster Profiles (4-Cluster Solution Based on 2 PCs)",
x = "Principal Components",
y = "Mean component score",
color = "Cluster"
) +
theme_minimal()
data_clean_complete <- data_clean |>
filter(
!is.na(Q6a), !is.na(Q6b), !is.na(Q6d), !is.na(Q6e), !is.na(Q6h), !is.na(Q6i),
!is.na(Q8a), !is.na(Q8b), !is.na(Q8c), !is.na(Q8d), !is.na(Q8e)
)
data_cluster_profile <- data_clean_complete |>
mutate(cluster = factor(k4_pca2$cluster))
data_cluster_profile_5 <- data_clean_complete |>
mutate(cluster = factor(k5_pca2$cluster))
data_cluster_profile_5 |>
group_by(cluster) |>
summarise(mean_age = mean(as.numeric(Q21), na.rm = TRUE))
data_cluster_profile_5 <- data_cluster_profile_5 |>
mutate(age = 2026 - as.numeric(Q21))
data_cluster_profile_5 |>
group_by(cluster) |>
summarise(mean_age = mean(age, na.rm = TRUE))
data_cluster_profile_5 |>
group_by(cluster) |>
summarise(mobile_banking_use = mean(as.numeric(Q2), na.rm = TRUE))
data_cluster_profile_5 |>
group_by(cluster) |>
summarise(trust = mean(as.numeric(Q24), na.rm = TRUE))
names(data_cluster_profile_5)
[1] "respondent_id" "Q1" "Q2" "Q3a" "Q3b"
[6] "Q3c" "Q3d" "Q3e" "Q3f" "Q3g"
[11] "Q3g_text" "Q4" "Q4_5_text" "Q5a" "Q5b"
[16] "Q5c" "Q5d" "Q5e" "Q6a" "Q6b"
[21] "Q6c" "Q6d" "Q6e" "Q6f" "Q6g"
[26] "Q6h" "Q6i" "Q8a" "Q8b" "Q8c"
[31] "Q8d" "Q8e" "Q10a" "Q10b" "Q10c"
[36] "Q10d" "Q11a" "Q11b" "Q11c" "Q11d"
[41] "Q11e" "Q11f" "Q12a" "Q12b" "Q12c"
[46] "Q12d" "Q12e" "Q12f" "Q12g" "Q12h"
[51] "Q12i" "Q13a" "Q13b" "Q13c" "Q13d"
[56] "Q13e" "Q13f" "Q13g" "Q14a" "Q14b"
[61] "Q14c" "Q14d" "Q14e" "Q14f" "Q14g"
[66] "Q15a" "Q15b" "Q15c" "Q15d" "Q15e"
[71] "Q15f" "Q15g" "Q17" "Q18" "Q19"
[76] "Q21" "Q22" "Q23" "Q24" "Q25a"
[81] "Q25b" "Q25c" "Q25d" "Q25e" "Q25f"
[86] "Q25g" "Q25h" "Q25i" "Q25j" "Q25k"
[91] "Q25l" "Q25l_text" "Q26" "cluster" "age"
data_cluster_profile_5 |>
group_by(cluster, Q22) |>
summarise(n = n(), .groups = "drop") |>
group_by(cluster) |>
mutate(percent = round(100 * n / sum(n), 1))
data_cluster_profile_5 |>
group_by(cluster) |>
summarise(Q17_mean = mean(as.numeric(Q17), na.rm = TRUE))
gender_profile_5 <- data_cluster_profile_5 |>
group_by(cluster, Q22) |>
summarise(n = n(), .groups = "drop") |>
group_by(cluster) |>
mutate(percent = round(100 * n / sum(n), 1)) |>
ungroup()
male_profile_5 <- gender_profile_5 |>
filter(Q22 == "1") |>
select(cluster, male_percent = percent)
female_profile_5 <- gender_profile_5 |>
filter(Q22 == "2") |>
select(cluster, female_percent = percent)
profiling_table_5 <- data_cluster_profile_5 |>
group_by(cluster) |>
summarise(
mean_birth_year = mean(as.numeric(Q21), na.rm = TRUE),
mean_age = mean(age, na.rm = TRUE),
mobile_banking_use = mean(as.numeric(Q2), na.rm = TRUE),
trust = mean(as.numeric(Q24), na.rm = TRUE),
Q17_mean = mean(as.numeric(Q17), na.rm = TRUE)
) |>
left_join(male_profile_5, by = "cluster") |>
left_join(female_profile_5, by = "cluster") |>
mutate(
cluster = factor(
cluster,
levels = c(1, 2, 3, 4, 5),
labels = c(
"Skeptical support seekers",
"Cautious guidance seekers",
"Feature-oriented adopters",
"AI-resistant independents",
"AI-enthusiastic guidance seekers"
)
)
) |>
mutate(across(-cluster, ~ round(.x, 2)))
profiling_table_5
pc_loadings <- as.data.frame(pca_cluster$rotation[, 1:2])
pc_loadings$Variable <- rownames(pc_loadings)
rownames(pc_loadings) <- NULL
pc_loadings <- pc_loadings[, c("Variable", "PC1", "PC2")]
pc_loadings$PC1 <- round(pc_loadings$PC1, 3)
pc_loadings$PC2 <- round(pc_loadings$PC2, 3)
pc_loadings
cluster_bank_counts <- data_cluster_profile_5 |>
group_by(cluster) |>
summarise(
OTP = sum(Q25a == 1, na.rm = TRUE),
Gorenjska_Banka = sum(Q25b == 1, na.rm = TRUE),
NLB = sum(Q25c == 1, na.rm = TRUE),
Revolut = sum(Q25d == 1, na.rm = TRUE),
N26 = sum(Q25e == 1, na.rm = TRUE),
Intesa = sum(Q25f == 1, na.rm = TRUE),
UniCredit = sum(Q25g == 1, na.rm = TRUE),
Regional_Bank = sum(Q25h == 1, na.rm = TRUE),
Workers_Savings = sum(Q25i == 1, na.rm = TRUE),
Sparkasse = sum(Q25j == 1, na.rm = TRUE),
Addiko = sum(Q25k == 1, na.rm = TRUE)
)
cluster_bank_counts
library(dplyr)
library(tidyr)
library(ggplot2)
cluster_bank_counts <- data_cluster_profile_5 |>
group_by(cluster) |>
summarise(
OTP = sum(Q25a == 1, na.rm = TRUE),
Gorenjska_Banka = sum(Q25b == 1, na.rm = TRUE),
NLB = sum(Q25c == 1, na.rm = TRUE),
Revolut = sum(Q25d == 1, na.rm = TRUE),
N26 = sum(Q25e == 1, na.rm = TRUE),
Intesa_SanPaolo = sum(Q25f == 1, na.rm = TRUE),
UniCredit = sum(Q25g == 1, na.rm = TRUE),
Regional_Bank = sum(Q25h == 1, na.rm = TRUE),
Workers_Savings = sum(Q25i == 1, na.rm = TRUE),
Sparkasse = sum(Q25j == 1, na.rm = TRUE),
Addiko = sum(Q25k == 1, na.rm = TRUE),
Other = sum(Q25l == 1, na.rm = TRUE)
)
cluster_bank_counts
cluster_bank_long <- cluster_bank_counts |>
pivot_longer(
cols = -cluster,
names_to = "bank",
values_to = "count"
)
cluster_bank_long
ggplot(cluster_bank_long, aes(x = bank, y = factor(cluster), fill = count)) +
geom_tile(color = "white") +
geom_text(aes(label = count), size = 4) +
labs(
title = "Banks Used by Each Cluster",
x = "Bank",
y = "Cluster",
fill = "Count"
) +
theme_minimal() +
theme(
axis.text.x = element_text(angle = 45, hjust = 1)
)
cluster_bank_percent_long <- cluster_bank_long |>
group_by(cluster) |>
mutate(percent = round(100 * count / sum(count), 1)) |>
ungroup()
cluster_bank_percent_long
ggplot(cluster_bank_percent_long, aes(x = bank, y = factor(cluster), fill = percent)) +
geom_tile(color = "white") +
geom_text(aes(label = paste0(percent, "%")), size = 4) +
labs(
title = "Bank Usage by Cluster (%)",
x = "Bank",
y = "Cluster",
fill = "Percent"
) +
theme_minimal() +
theme(
axis.text.x = element_text(angle = 45, hjust = 1)
)
library(dplyr)
library(tidyr)
library(ggplot2)
cluster_bank_counts_named <- data_cluster_profile_5 |>
mutate(
cluster = factor(
cluster,
levels = c(1, 2, 3, 4, 5),
labels = c(
"Skeptical support seekers",
"Cautious guidance seekers",
"Feature-oriented adopters",
"AI-resistant independents",
"AI-enthusiastic guidance seekers"
)
)
) |>
filter(cluster %in% c(
"Cautious guidance seekers",
"Feature-oriented adopters",
"AI-enthusiastic guidance seekers"
)) |>
group_by(cluster) |>
summarise(
OTP = sum(Q25a == 1, na.rm = TRUE),
Gorenjska_Banka = sum(Q25b == 1, na.rm = TRUE),
NLB = sum(Q25c == 1, na.rm = TRUE),
Revolut = sum(Q25d == 1, na.rm = TRUE),
N26 = sum(Q25e == 1, na.rm = TRUE),
Intesa_SanPaolo = sum(Q25f == 1, na.rm = TRUE),
UniCredit = sum(Q25g == 1, na.rm = TRUE),
Regional_Bank = sum(Q25h == 1, na.rm = TRUE),
Workers_Savings = sum(Q25i == 1, na.rm = TRUE),
Sparkasse = sum(Q25j == 1, na.rm = TRUE),
Addiko = sum(Q25k == 1, na.rm = TRUE),
Other = sum(Q25l == 1, na.rm = TRUE)
)
cluster_bank_percent_long_named <- cluster_bank_counts_named |>
pivot_longer(
cols = -cluster,
names_to = "bank",
values_to = "count"
) |>
group_by(cluster) |>
mutate(percent = round(100 * count / sum(count), 1)) |>
ungroup()
ggplot(
cluster_bank_percent_long_named,
aes(x = bank, y = cluster, fill = percent)
) +
geom_tile(color = "white") +
geom_text(aes(label = paste0(percent, "%")), color = "white", size = 4) +
labs(
title = "Bank Usage by Selected Clusters (%)",
x = "Bank",
y = "Cluster",
fill = "Percent"
) +
theme_minimal() +
theme(
axis.text.x = element_text(angle = 45, hjust = 1)
)
library(dplyr)
library(tibble)
library(FactoMineR)
library(factoextra)
library(ggplot2)
data_cluster_profile_5 <- data_clean_complete |>
mutate(cluster = factor(k5_pca2$cluster))
make_perception_map <- function(data, cluster_values, title_text) {
subset_data <- data |>
filter(cluster %in% cluster_values) |>
mutate(
across(Q13a:Q15g, as.numeric)
)
bank_means <- tibble(
bank = c("OTP", "Gorenjska Banka", "NLB", "Revolut", "N26", "Intesa SanPaolo", "UniCredit"),
innovation = c(
mean(subset_data$Q13a, na.rm = TRUE),
mean(subset_data$Q13b, na.rm = TRUE),
mean(subset_data$Q13c, na.rm = TRUE),
mean(subset_data$Q13d, na.rm = TRUE),
mean(subset_data$Q13e, na.rm = TRUE),
mean(subset_data$Q13f, na.rm = TRUE),
mean(subset_data$Q13g, na.rm = TRUE)
),
customer_support = c(
mean(subset_data$Q14a, na.rm = TRUE),
mean(subset_data$Q14b, na.rm = TRUE),
mean(subset_data$Q14c, na.rm = TRUE),
mean(subset_data$Q14d, na.rm = TRUE),
mean(subset_data$Q14e, na.rm = TRUE),
mean(subset_data$Q14f, na.rm = TRUE),
mean(subset_data$Q14g, na.rm = TRUE)
),
reliability = c(
mean(subset_data$Q15a, na.rm = TRUE),
mean(subset_data$Q15b, na.rm = TRUE),
mean(subset_data$Q15c, na.rm = TRUE),
mean(subset_data$Q15d, na.rm = TRUE),
mean(subset_data$Q15e, na.rm = TRUE),
mean(subset_data$Q15f, na.rm = TRUE),
mean(subset_data$Q15g, na.rm = TRUE)
)
)
mat <- bank_means |>
column_to_rownames("bank") |>
as.matrix()
pca_bank <- FactoMineR::PCA(mat, scale.unit = TRUE, graph = FALSE)
factoextra::fviz_pca_biplot(
pca_bank,
repel = TRUE,
col.var = "gray30"
) +
ggplot2::ggtitle(title_text) +
ggplot2::xlab("Dim1: Overall Digital Banking Performance") +
ggplot2::ylab("Dim2: Customer Support vs Innovation")
}
make_perception_map(
data = data_cluster_profile_5,
cluster_values = c("2"),
title_text = "Perceptual Map of Banks - Cluster 2"
)
make_perception_map(
data = data_cluster_profile_5,
cluster_values = c("3"),
title_text = "Perceptual Map of Banks - Cluster 3"
)
make_perception_map(
data = data_cluster_profile_5,
cluster_values = c("5"),
title_text = "Perceptual Map of Banks - Cluster 5"
)
make_perception_map(
data = data_cluster_profile_5,
cluster_values = c("2", "3", "5"),
title_text = "Perceptual Map of Banks - Clusters 2, 3, and 5"
)
library(dplyr)
library(tidyr)
library(ggplot2)
cluster_q4_table_named <- data_cluster_profile_5 |>
mutate(
cluster = factor(
cluster,
levels = c(1, 2, 3, 4, 5),
labels = c(
"Skeptical support seekers",
"Cautious guidance seekers",
"Feature-oriented adopters",
"AI-resistant independents",
"AI-enthusiastic guidance seekers"
)
)
) |>
filter(cluster %in% c(
"Cautious guidance seekers",
"Feature-oriented adopters",
"AI-enthusiastic guidance seekers"
)) |>
group_by(cluster, Q4) |>
summarise(n = n(), .groups = "drop") |>
group_by(cluster) |>
mutate(percent = round(100 * n / sum(n), 1)) |>
ungroup() |>
mutate(
info_source = case_when(
Q4 == "1" ~ "Web browsers",
Q4 == "2" ~ "Family / friends",
Q4 == "3" ~ "Financial media",
Q4 == "4" ~ "ChatGPT / AI models",
Q4 == "5" ~ "Other",
TRUE ~ as.character(Q4)
)
) |>
select(cluster, info_source, n, percent)
cluster_q4_table_named
ggplot(cluster_q4_table_named, aes(x = info_source, y = cluster, fill = percent)) +
geom_tile(color = "white") +
geom_text(aes(label = paste0(percent, "%")), color = "white", size = 4) +
labs(
title = "Information Source by Selected Clusters (%)",
x = "Information Source",
y = "Cluster",
fill = "Percent"
) +
theme_minimal() +
theme(
axis.text.x = element_text(angle = 45, hjust = 1)
)
cluster_q4_wide <- cluster_q4_table_named |>
select(cluster, info_source, percent) |>
pivot_wider(
names_from = info_source,
values_from = percent,
values_fill = 0
)
cluster_q4_wide
library(dplyr)
cluster_vars <- c(
"Q6a", "Q6b", "Q6d", "Q6e", "Q6h", "Q6i",
"Q8a", "Q8b", "Q8c", "Q8d", "Q8e"
)
cluster_validation <- data_clean |>
filter(
!is.na(Q6a), !is.na(Q6b), !is.na(Q6d), !is.na(Q6e), !is.na(Q6h), !is.na(Q6i),
!is.na(Q8a), !is.na(Q8b), !is.na(Q8c), !is.na(Q8d), !is.na(Q8e)
) |>
mutate(
cluster = factor(k5_pca2$cluster),
across(all_of(cluster_vars), as.numeric)
)
anova_results <- lapply(cluster_vars, function(var) {
model <- aov(as.formula(paste(var, "~ cluster")), data = cluster_validation)
data.frame(
variable = var,
p_value = summary(model)[[1]][["Pr(>F)"]][1]
)
})
anova_results <- do.call(rbind, anova_results)
anova_results$p_adjusted <- p.adjust(anova_results$p_value, method = "holm")
anova_results
cluster_means_validation <- cluster_validation |>
group_by(cluster) |>
summarise(
across(all_of(cluster_vars), ~ mean(.x, na.rm = TRUE)),
.groups = "drop"
)
cluster_means_validation
library(dplyr)
age_data <- data_clean |>
filter(
!is.na(Q6a), !is.na(Q6b), !is.na(Q6d), !is.na(Q6e), !is.na(Q6h), !is.na(Q6i),
!is.na(Q8a), !is.na(Q8b), !is.na(Q8c), !is.na(Q8d), !is.na(Q8e)
) |>
mutate(
cluster = factor(
k5_pca2$cluster,
levels = c(1, 2, 3, 4, 5),
labels = c(
"Skeptical support seekers",
"Cautious guidance seekers",
"Feature-oriented adopters",
"AI-resistant independents",
"AI-enthusiastic guidance seekers"
)
)
) |>
filter(!is.na(Q21)) |>
mutate(
age = 2026 - as.numeric(Q21)
)
age_summary <- age_data |>
group_by(cluster) |>
summarise(
n = n(),
mean_age = mean(age, na.rm = TRUE),
sd_age = sd(age, na.rm = TRUE),
median_age = median(age, na.rm = TRUE),
min_age = min(age, na.rm = TRUE),
max_age = max(age, na.rm = TRUE)
)
age_summary
age_anova <- aov(age ~ cluster, data = age_data)
summary(age_anova)
Df Sum Sq Mean Sq F value Pr(>F)
cluster 4 11950 2987.6 12.44 2.09e-09 ***
Residuals 316 75907 240.2
---
Signif. codes: 0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1
TukeyHSD(age_anova)
Tukey multiple comparisons of means
95% family-wise confidence level
Fit: aov(formula = age ~ cluster, data = age_data)
$cluster
diff lwr
Cautious guidance seekers-Skeptical support seekers -7.5313131 -14.6825056
Feature-oriented adopters-Skeptical support seekers 2.1517677 -5.4632570
AI-resistant independents-Skeptical support seekers 10.4484848 1.0853761
AI-enthusiastic guidance seekers-Skeptical support seekers -8.2035191 -16.0800102
Feature-oriented adopters-Cautious guidance seekers 9.6830808 3.0969363
AI-resistant independents-Cautious guidance seekers 17.9797980 9.4324882
AI-enthusiastic guidance seekers-Cautious guidance seekers -0.6722059 -7.5589912
AI-resistant independents-Feature-oriented adopters 8.2967172 -0.6422734
AI-enthusiastic guidance seekers-Feature-oriented adopters -10.3552867 -17.7225697
AI-enthusiastic guidance seekers-AI-resistant independents -18.6520039 -27.8147581
upr p adj
Cautious guidance seekers-Skeptical support seekers -0.3801207 0.0333606
Feature-oriented adopters-Skeptical support seekers 9.7667923 0.9375949
AI-resistant independents-Skeptical support seekers 19.8115936 0.0200832
AI-enthusiastic guidance seekers-Skeptical support seekers -0.3270279 0.0365232
Feature-oriented adopters-Cautious guidance seekers 16.2692253 0.0006548
AI-resistant independents-Cautious guidance seekers 26.5271078 0.0000002
AI-enthusiastic guidance seekers-Cautious guidance seekers 6.2145793 0.9988664
AI-resistant independents-Feature-oriented adopters 17.2357077 0.0831926
AI-enthusiastic guidance seekers-Feature-oriented adopters -2.9880037 0.0013076
AI-enthusiastic guidance seekers-AI-resistant independents -9.4892498 0.0000005
age_summary <- age_data |>
group_by(cluster) |>
summarise(
n = n(),
mean_age = mean(age, na.rm = TRUE),
sd_age = sd(age, na.rm = TRUE),
median_age = median(age, na.rm = TRUE),
min_age = min(age, na.rm = TRUE),
max_age = max(age, na.rm = TRUE)
)
age_summary
age_summary
library(dplyr)
gender_data <- data_clean |>
filter(
!is.na(Q6a), !is.na(Q6b), !is.na(Q6d), !is.na(Q6e), !is.na(Q6h), !is.na(Q6i),
!is.na(Q8a), !is.na(Q8b), !is.na(Q8c), !is.na(Q8d), !is.na(Q8e)
) |>
mutate(
cluster = factor(
k5_pca2$cluster,
levels = c(1, 2, 3, 4, 5),
labels = c(
"Skeptical support seekers",
"Cautious guidance seekers",
"Feature-oriented adopters",
"AI-resistant independents",
"AI-enthusiastic guidance seekers"
)
)
) |>
filter(!is.na(Q22)) |>
mutate(
gender = factor(
Q22,
levels = c("1", "2"),
labels = c("Male", "Female")
)
)
gender_table <- table(gender_data$cluster, gender_data$gender)
gender_table
Male Female
Skeptical support seekers 39 16
Cautious guidance seekers 52 46
Feature-oriented adopters 35 38
AI-resistant independents 14 19
AI-enthusiastic guidance seekers 33 29
gender_chisq <- chisq.test(gender_table)
gender_chisq
Pearson's Chi-squared test
data: gender_table
X-squared = 9.2334, df = 4, p-value = 0.05552
gender_chisq$expected
Male Female
Skeptical support seekers 29.64174 25.35826
Cautious guidance seekers 52.81620 45.18380
Feature-oriented adopters 39.34268 33.65732
AI-resistant independents 17.78505 15.21495
AI-enthusiastic guidance seekers 33.41433 28.58567
library(dplyr)
mobile_data <- data_clean |>
filter(
!is.na(Q6a), !is.na(Q6b), !is.na(Q6d), !is.na(Q6e), !is.na(Q6h), !is.na(Q6i),
!is.na(Q8a), !is.na(Q8b), !is.na(Q8c), !is.na(Q8d), !is.na(Q8e)
) |>
mutate(
cluster = factor(
k5_pca2$cluster,
levels = c(1, 2, 3, 4, 5),
labels = c(
"Skeptical support seekers",
"Cautious guidance seekers",
"Feature-oriented adopters",
"AI-resistant independents",
"AI-enthusiastic guidance seekers"
)
)
) |>
filter(!is.na(Q2)) |>
mutate(
mobile_banking_use = as.numeric(Q2)
)
mobile_data <- data_clean |>
filter(
!is.na(Q6a), !is.na(Q6b), !is.na(Q6d), !is.na(Q6e), !is.na(Q6h), !is.na(Q6i),
!is.na(Q8a), !is.na(Q8b), !is.na(Q8c), !is.na(Q8d), !is.na(Q8e)
) |>
mutate(
cluster = factor(
k5_pca2$cluster,
levels = c(1, 2, 3, 4, 5),
labels = c(
"Skeptical support seekers",
"Cautious guidance seekers",
"Feature-oriented adopters",
"AI-resistant independents",
"AI-enthusiastic guidance seekers"
)
),
mobile_banking_use = as.numeric(Q2),
mobile_banking_use = ifelse(mobile_banking_use == -2, NA, mobile_banking_use)
) |>
filter(!is.na(mobile_banking_use))
table(mobile_data$mobile_banking_use)
1 2 3 4
72 112 105 20
mobile_data |>
count(cluster, mobile_banking_use)
mobile_summary <- mobile_data |>
group_by(cluster) |>
summarise(
n = n(),
mean_mobile_use = mean(mobile_banking_use, na.rm = TRUE),
sd_mobile_use = sd(mobile_banking_use, na.rm = TRUE),
median_mobile_use = median(mobile_banking_use, na.rm = TRUE),
min_mobile_use = min(mobile_banking_use, na.rm = TRUE),
max_mobile_use = max(mobile_banking_use, na.rm = TRUE)
)
mobile_summary
mobile_kw <- kruskal.test(mobile_banking_use ~ cluster, data = mobile_data)
mobile_kw
Kruskal-Wallis rank sum test
data: mobile_banking_use by cluster
Kruskal-Wallis chi-squared = 6.9046, df = 4, p-value = 0.141
library(dplyr)
education_data <- data_clean |>
filter(
!is.na(Q6a), !is.na(Q6b), !is.na(Q6d), !is.na(Q6e), !is.na(Q6h), !is.na(Q6i),
!is.na(Q8a), !is.na(Q8b), !is.na(Q8c), !is.na(Q8d), !is.na(Q8e)
) |>
mutate(
cluster = factor(
k5_pca2$cluster,
levels = c(1,2,3,4,5),
labels = c(
"Skeptical support seekers",
"Cautious guidance seekers",
"Feature-oriented adopters",
"AI-resistant independents",
"AI-enthusiastic guidance seekers"
)
),
education = as.numeric(Q24)
) |>
filter(!is.na(education))
table(education_data$education)
3 4 5 6 7 8
98 34 56 104 26 2
education_data |>
count(cluster, education)
education_table <- table(education_data$cluster, education_data$education)
education_table
3 4 5 6 7 8
Skeptical support seekers 15 2 14 18 6 0
Cautious guidance seekers 34 14 12 33 5 1
Feature-oriented adopters 20 9 12 23 8 0
AI-resistant independents 9 6 5 12 1 0
AI-enthusiastic guidance seekers 20 3 13 18 6 1
education_chisq <- chisq.test(education_table)
education_chisq
Pearson's Chi-squared test
data: education_table
X-squared = 19.208, df = 20, p-value = 0.5083
education_chisq$expected
3 4 5 6 7 8
Skeptical support seekers 16.84375 5.84375 9.625 17.875 4.46875 0.34375
Cautious guidance seekers 30.31875 10.51875 17.325 32.175 8.04375 0.61875
Feature-oriented adopters 22.05000 7.65000 12.600 23.400 5.85000 0.45000
AI-resistant independents 10.10625 3.50625 5.775 10.725 2.68125 0.20625
AI-enthusiastic guidance seekers 18.68125 6.48125 10.675 19.825 4.95625 0.38125
table(education_data$education)
3 4 5 6 7 8
98 34 56 104 26 2
education_table
3 4 5 6 7 8
Skeptical support seekers 15 2 14 18 6 0
Cautious guidance seekers 34 14 12 33 5 1
Feature-oriented adopters 20 9 12 23 8 0
AI-resistant independents 9 6 5 12 1 0
AI-enthusiastic guidance seekers 20 3 13 18 6 1
education_chisq
Pearson's Chi-squared test
data: education_table
X-squared = 19.208, df = 20, p-value = 0.5083
income_data <- data_clean |>
filter(
!is.na(Q6a), !is.na(Q6b), !is.na(Q6d), !is.na(Q6e), !is.na(Q6h), !is.na(Q6i),
!is.na(Q8a), !is.na(Q8b), !is.na(Q8c), !is.na(Q8d), !is.na(Q8e)
) |>
mutate(
cluster = factor(
k5_pca2$cluster,
levels = c(1,2,3,4,5),
labels = c(
"Skeptical support seekers",
"Cautious guidance seekers",
"Feature-oriented adopters",
"AI-resistant independents",
"AI-enthusiastic guidance seekers"
)
),
income = as.numeric(Q26)
) |>
filter(!is.na(income))
table(income_data$income)
1 2 3 4 5 6 7 8
43 43 58 64 49 31 12 20
income_data |>
count(cluster, income)
income_table <- table(income_data$cluster, income_data$income)
income_table
1 2 3 4 5 6 7 8
Skeptical support seekers 8 4 14 11 10 2 1 5
Cautious guidance seekers 23 18 15 20 7 8 4 3
Feature-oriented adopters 5 7 10 13 19 8 4 6
AI-resistant independents 1 6 8 6 3 5 0 4
AI-enthusiastic guidance seekers 6 8 11 14 10 8 3 2
income_chisq <- chisq.test(income_table)
income_chisq
Pearson's Chi-squared test
data: income_table
X-squared = 44.967, df = 28, p-value = 0.02223
income_chisq$expected
1 2 3 4 5
Skeptical support seekers 7.390625 7.390625 9.96875 11.0 8.421875
Cautious guidance seekers 13.168750 13.168750 17.76250 19.6 15.006250
Feature-oriented adopters 9.675000 9.675000 13.05000 14.4 11.025000
AI-resistant independents 4.434375 4.434375 5.98125 6.6 5.053125
AI-enthusiastic guidance seekers 8.331250 8.331250 11.23750 12.4 9.493750
6 7 8
Skeptical support seekers 5.328125 2.0625 3.4375
Cautious guidance seekers 9.493750 3.6750 6.1250
Feature-oriented adopters 6.975000 2.7000 4.5000
AI-resistant independents 3.196875 1.2375 2.0625
AI-enthusiastic guidance seekers 6.006250 2.3250 3.8750
table(income_data$income)
1 2 3 4 5 6 7 8
43 43 58 64 49 31 12 20
income_table
1 2 3 4 5 6 7 8
Skeptical support seekers 8 4 14 11 10 2 1 5
Cautious guidance seekers 23 18 15 20 7 8 4 3
Feature-oriented adopters 5 7 10 13 19 8 4 6
AI-resistant independents 1 6 8 6 3 5 0 4
AI-enthusiastic guidance seekers 6 8 11 14 10 8 3 2
income_chisq
Pearson's Chi-squared test
data: income_table
X-squared = 44.967, df = 28, p-value = 0.02223
income_data_grouped <- data_clean |>
filter(
!is.na(Q6a), !is.na(Q6b), !is.na(Q6d), !is.na(Q6e), !is.na(Q6h), !is.na(Q6i),
!is.na(Q8a), !is.na(Q8b), !is.na(Q8c), !is.na(Q8d), !is.na(Q8e)
) |>
mutate(
cluster = factor(
k5_pca2$cluster,
levels = c(1, 2, 3, 4, 5),
labels = c(
"Skeptical support seekers",
"Cautious guidance seekers",
"Feature-oriented adopters",
"AI-resistant independents",
"AI-enthusiastic guidance seekers"
)
),
income = as.numeric(Q26),
income_group = case_when(
income %in% c(1, 2) ~ "Low income (0–999€)",
income %in% c(3, 4) ~ "Lower-middle income (1000–1999€)",
income %in% c(5, 6) ~ "Upper-middle income (2000–2999€)",
income %in% c(7, 8) ~ "High income (3000€+)",
TRUE ~ NA_character_
)
) |>
filter(!is.na(income_group))
table(income_data_grouped$income_group)
High income (3000€+) Low income (0–999€)
32 86
Lower-middle income (1000–1999€) Upper-middle income (2000–2999€)
122 80
income_data_grouped |>
count(cluster, income_group)
income_group_table <- table(
income_data_grouped$cluster,
income_data_grouped$income_group
)
income_group_table
High income (3000€+) Low income (0–999€)
Skeptical support seekers 6 12
Cautious guidance seekers 7 41
Feature-oriented adopters 10 12
AI-resistant independents 4 7
AI-enthusiastic guidance seekers 5 14
Lower-middle income (1000–1999€)
Skeptical support seekers 25
Cautious guidance seekers 35
Feature-oriented adopters 23
AI-resistant independents 14
AI-enthusiastic guidance seekers 25
Upper-middle income (2000–2999€)
Skeptical support seekers 12
Cautious guidance seekers 15
Feature-oriented adopters 27
AI-resistant independents 8
AI-enthusiastic guidance seekers 18
income_group_chisq <- chisq.test(income_group_table)
income_group_chisq
Pearson's Chi-squared test
data: income_group_table
X-squared = 25.314, df = 12, p-value = 0.0134
income_group_chisq$expected
High income (3000€+) Low income (0–999€)
Skeptical support seekers 5.5 14.78125
Cautious guidance seekers 9.8 26.33750
Feature-oriented adopters 7.2 19.35000
AI-resistant independents 3.3 8.86875
AI-enthusiastic guidance seekers 6.2 16.66250
Lower-middle income (1000–1999€)
Skeptical support seekers 20.96875
Cautious guidance seekers 37.36250
Feature-oriented adopters 27.45000
AI-resistant independents 12.58125
AI-enthusiastic guidance seekers 23.63750
Upper-middle income (2000–2999€)
Skeptical support seekers 13.75
Cautious guidance seekers 24.50
Feature-oriented adopters 18.00
AI-resistant independents 8.25
AI-enthusiastic guidance seekers 15.50
income_profile <- income_data_grouped |>
count(cluster, income_group) |>
group_by(cluster) |>
mutate(percent = round(100 * n / sum(n), 1)) |>
ungroup()
income_profile
income_profile_wide <- income_profile |>
select(cluster, income_group, percent) |>
tidyr::pivot_wider(
names_from = income_group,
values_from = percent,
values_fill = 0
)
income_profile_wide
bank_data <- data_clean |>
filter(
!is.na(Q6a), !is.na(Q6b), !is.na(Q6d), !is.na(Q6e), !is.na(Q6h), !is.na(Q6i),
!is.na(Q8a), !is.na(Q8b), !is.na(Q8c), !is.na(Q8d), !is.na(Q8e)
) |>
mutate(
cluster = factor(
k5_pca2$cluster,
levels = c(1,2,3,4,5),
labels = c(
"Skeptical support seekers",
"Cautious guidance seekers",
"Feature-oriented adopters",
"AI-resistant independents",
"AI-enthusiastic guidance seekers"
)
)
)
bank_data <- bank_data |>
mutate(
across(Q25a:Q25l, ~ as.numeric(.))
)
revolut_table <- table(bank_data$cluster, bank_data$Q25d)
revolut_table
0 1
Skeptical support seekers 44 12
Cautious guidance seekers 68 31
Feature-oriented adopters 55 18
AI-resistant independents 28 4
AI-enthusiastic guidance seekers 43 19
chisq.test(revolut_table)
Pearson's Chi-squared test
data: revolut_table
X-squared = 5.8418, df = 4, p-value = 0.2113
banks <- c("Q25a","Q25b","Q25c","Q25d","Q25e","Q25f","Q25g","Q25h","Q25i","Q25j","Q25k","Q25l")
bank_tests <- lapply(banks, function(var){
tab <- table(bank_data$cluster, bank_data[[var]])
test <- chisq.test(tab)
data.frame(
bank = var,
chi_square = test$statistic,
p_value = test$p.value
)
})
bank_tests <- do.call(rbind, bank_tests)
bank_tests
otp_profile <- bank_data |>
group_by(cluster) |>
summarise(
otp_users = sum(Q25a == 1, na.rm = TRUE),
n = n(),
otp_percent = round(100 * otp_users / n, 1)
)
otp_profile
area_data <- data_clean |>
filter(
!is.na(Q6a), !is.na(Q6b), !is.na(Q6d), !is.na(Q6e), !is.na(Q6h), !is.na(Q6i),
!is.na(Q8a), !is.na(Q8b), !is.na(Q8c), !is.na(Q8d), !is.na(Q8e)
) |>
mutate(
cluster = factor(
k5_pca2$cluster,
levels = c(1,2,3,4,5),
labels = c(
"Skeptical support seekers",
"Cautious guidance seekers",
"Feature-oriented adopters",
"AI-resistant independents",
"AI-enthusiastic guidance seekers"
)
),
area = as.numeric(Q23)
) |>
filter(!is.na(area))
table(area_data$area)
1 2 3
185 68 70
area_table <- table(area_data$cluster, area_data$area)
area_table
1 2 3
Skeptical support seekers 36 11 9
Cautious guidance seekers 57 22 20
Feature-oriented adopters 38 15 20
AI-resistant independents 15 7 11
AI-enthusiastic guidance seekers 39 13 10
area_chisq <- chisq.test(area_table)
area_chisq
Pearson's Chi-squared test
data: area_table
X-squared = 7.067, df = 8, p-value = 0.5294
area_chisq$expected
1 2 3
Skeptical support seekers 32.07430 11.789474 12.136223
Cautious guidance seekers 56.70279 20.842105 21.455108
Feature-oriented adopters 41.81115 15.368421 15.820433
AI-resistant independents 18.90093 6.947368 7.151703
AI-enthusiastic guidance seekers 35.51084 13.052632 13.436533
area_profile <- area_data |>
count(cluster, area) |>
group_by(cluster) |>
mutate(percent = round(100 * n / sum(n), 1))
area_profile
area_profile <- area_data |>
count(cluster, area) |>
group_by(cluster) |>
mutate(percent = round(100 * n / sum(n), 1)) |>
ungroup()
area_profile
ai_data <- data_clean |>
filter(
!is.na(Q6a), !is.na(Q6b), !is.na(Q6d), !is.na(Q6e), !is.na(Q6h), !is.na(Q6i),
!is.na(Q8a), !is.na(Q8b), !is.na(Q8c), !is.na(Q8d), !is.na(Q8e)
) |>
mutate(
cluster = factor(
k5_pca2$cluster,
levels = c(1,2,3,4,5),
labels = c(
"Skeptical support seekers",
"Cautious guidance seekers",
"Feature-oriented adopters",
"AI-resistant independents",
"AI-enthusiastic guidance seekers"
)
),
ai_use = as.numeric(Q17)
) |>
filter(!is.na(ai_use))
ai_summary <- ai_data |>
group_by(cluster) |>
summarise(
n = n(),
mean_ai = mean(ai_use),
sd_ai = sd(ai_use),
median_ai = median(ai_use),
min_ai = min(ai_use),
max_ai = max(ai_use)
)
ai_summary
ai_anova <- aov(ai_use ~ cluster, data = ai_data)
summary(ai_anova)
Df Sum Sq Mean Sq F value Pr(>F)
cluster 4 302.8 75.71 33.2 <2e-16 ***
Residuals 294 670.4 2.28
---
Signif. codes: 0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1
TukeyHSD(ai_anova)
Tukey multiple comparisons of means
95% family-wise confidence level
Fit: aov(formula = ai_use ~ cluster, data = ai_data)
$cluster
diff lwr
Cautious guidance seekers-Skeptical support seekers 1.1444444 0.41801040
Feature-oriented adopters-Skeptical support seekers 1.9019608 1.13419826
AI-resistant independents-Skeptical support seekers -0.5000000 -1.45365166
AI-enthusiastic guidance seekers-Skeptical support seekers 2.5500000 1.76060613
Feature-oriented adopters-Cautious guidance seekers 0.7575163 0.09155992
AI-resistant independents-Cautious guidance seekers -1.6444444 -2.51822367
AI-enthusiastic guidance seekers-Cautious guidance seekers 1.4055556 0.71477242
AI-resistant independents-Feature-oriented adopters -2.4019608 -3.31038966
AI-enthusiastic guidance seekers-Feature-oriented adopters 0.6480392 -0.08608217
AI-enthusiastic guidance seekers-AI-resistant independents 3.0500000 2.12321717
upr p adj
Cautious guidance seekers-Skeptical support seekers 1.8708785 0.0002027
Feature-oriented adopters-Skeptical support seekers 2.6697233 0.0000000
AI-resistant independents-Skeptical support seekers 0.4536517 0.6028939
AI-enthusiastic guidance seekers-Skeptical support seekers 3.3393939 0.0000000
Feature-oriented adopters-Cautious guidance seekers 1.4234728 0.0167712
AI-resistant independents-Cautious guidance seekers -0.7706652 0.0000044
AI-enthusiastic guidance seekers-Cautious guidance seekers 2.0963387 0.0000005
AI-resistant independents-Feature-oriented adopters -1.4935319 0.0000000
AI-enthusiastic guidance seekers-Feature-oriented adopters 1.3821606 0.1120779
AI-enthusiastic guidance seekers-AI-resistant independents 3.9767828 0.0000000
ai_summary
summary(ai_anova)
Df Sum Sq Mean Sq F value Pr(>F)
cluster 4 302.8 75.71 33.2 <2e-16 ***
Residuals 294 670.4 2.28
---
Signif. codes: 0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1
# =========================================================
# HYPOTHESIS TEST – H1c
# Q17: Likelihood of using the AI personal banking agent
# =========================================================
# H0 (Null Hypothesis):
# There are no statistically significant differences in the likelihood
# of using the AI personal banking agent (Q17) between the clusters.
# H1c (Research Hypothesis):
# Feature-oriented adopters are more likely to use the AI personal
# banking agent than cautious guidance seekers.
# =========================================================
# 1 LOAD LIBRARIES
# =========================================================
library(dplyr)
library(ggplot2)
# =========================================================
# 2 PREPARE DATA
# =========================================================
# ensure cluster variable is factor
data_cluster_profile_5$cluster <- as.factor(data_cluster_profile_5$cluster)
# convert Q17 to numeric (Likert scale)
data_cluster_profile_5 <- data_cluster_profile_5 %>%
mutate(Q17 = as.numeric(Q17))
# =========================================================
# 3 DESCRIPTIVE STATISTICS FOR EACH CLUSTER
# =========================================================
cluster_summary_Q17 <- data_cluster_profile_5 %>%
group_by(cluster) %>%
summarise(
n = n(),
mean = mean(Q17, na.rm = TRUE),
median = median(Q17, na.rm = TRUE),
sd = sd(Q17, na.rm = TRUE),
min = min(Q17, na.rm = TRUE),
max = max(Q17, na.rm = TRUE)
)
print(cluster_summary_Q17)
# =========================================================
# 4 TEST DIFFERENCES BETWEEN CLUSTERS
# Kruskal–Wallis test (appropriate for Likert data)
# =========================================================
kruskal_test_Q17 <- kruskal.test(Q17 ~ cluster, data = data_cluster_profile_5)
print(kruskal_test_Q17)
Kruskal-Wallis rank sum test
data: Q17 by cluster
Kruskal-Wallis chi-squared = 91.782, df = 4, p-value < 2.2e-16
# =========================================================
# 5 POST-HOC TEST (PAIRWISE COMPARISON)
# =========================================================
pairwise_results_Q17 <- pairwise.wilcox.test(
data_cluster_profile_5$Q17,
data_cluster_profile_5$cluster,
p.adjust.method = "bonferroni"
)
print(pairwise_results_Q17)
Pairwise comparisons using Wilcoxon rank sum test with continuity correction
data: data_cluster_profile_5$Q17 and data_cluster_profile_5$cluster
1 2 3 4
2 0.00079 - - -
3 2.9e-07 0.00433 - -
4 1.00000 0.00014 1.3e-06 -
5 1.1e-10 1.7e-08 0.03730 6.8e-09
P value adjustment method: bonferroni
# =========================================================
# 6 VISUALIZATION
# =========================================================
ggplot(data_cluster_profile_5, aes(x = cluster, y = Q17)) +
geom_boxplot() +
labs(
title = "Likelihood of Using AI Personal Banking Agent by Cluster",
x = "Cluster",
y = "Likelihood of Use (Q17)"
) +
theme_minimal()
# =========================================================
# 4b EFFECT SIZE FOR KRUSKAL-WALLIS
# =========================================================
library(rstatix)
effect_size_Q17 <- data_cluster_profile_5 %>%
kruskal_effsize(Q17 ~ cluster)
print(effect_size_Q17)
# =====================================================
# ASSUMPTION CHECKS – H1c
# =====================================================
shapiro.test(data_cluster_profile_5$Q17)
Shapiro-Wilk normality test
data: data_cluster_profile_5$Q17
W = 0.90384, p-value = 7.189e-13
ggplot(data_cluster_profile_5, aes(x = Q17)) +
geom_histogram(binwidth = 1) +
theme_minimal()
qqnorm(data_cluster_profile_5$Q17)
qqline(data_cluster_profile_5$Q17)
library(rstatix)
levene_test(Q17 ~ cluster, data = data_cluster_profile_5)
# =====================================================
# KRUSKAL-WALLIS TEST
# =====================================================
kruskal.test(Q17 ~ cluster, data = data_cluster_profile_5)
Kruskal-Wallis rank sum test
data: Q17 by cluster
Kruskal-Wallis chi-squared = 91.782, df = 4, p-value < 2.2e-16
##H1e
# =========================================================
# HYPOTHESIS TEST
# Q12g – Preference for human interaction
# =========================================================
# H0 (Null Hypothesis):
# There are no statistically significant differences in preference
# for human interaction (Q12g) between the clusters.
# H1 (Research Hypothesis):
# Cautious guidance seekers show a higher preference for human
# interaction than AI-enthusiastic guidance seekers and
# feature-oriented adopters.
# =========================================================
# 1 LOAD LIBRARIES
# =========================================================
library(dplyr)
library(ggplot2)
# =========================================================
# 2 PREPARE DATA
# =========================================================
# ensure cluster variable is factor
data_cluster_profile_5$cluster <- as.factor(data_cluster_profile_5$cluster)
# convert Likert variable to numeric
data_cluster_profile_5 <- data_cluster_profile_5 %>%
mutate(Q12g = as.numeric(Q12g))
# =========================================================
# 3 DESCRIPTIVE STATISTICS FOR EACH CLUSTER
# =========================================================
cluster_summary <- data_cluster_profile_5 %>%
group_by(cluster) %>%
summarise(
n = n(),
mean = mean(Q12g, na.rm = TRUE),
median = median(Q12g, na.rm = TRUE),
sd = sd(Q12g, na.rm = TRUE),
min = min(Q12g, na.rm = TRUE),
max = max(Q12g, na.rm = TRUE)
)
print(cluster_summary)
# =========================================================
# 4 TEST DIFFERENCES BETWEEN CLUSTERS
# Kruskal–Wallis test (appropriate for Likert scale)
# =========================================================
kruskal_test <- kruskal.test(Q12g ~ cluster, data = data_cluster_profile_5)
print(kruskal_test)
Kruskal-Wallis rank sum test
data: Q12g by cluster
Kruskal-Wallis chi-squared = 13.865, df = 4, p-value = 0.007737
# =========================================================
# 5 POST-HOC TEST (PAIRWISE COMPARISON BETWEEN CLUSTERS)
# =========================================================
pairwise_results <- pairwise.wilcox.test(
data_cluster_profile_5$Q12g,
data_cluster_profile_5$cluster,
p.adjust.method = "bonferroni"
)
print(pairwise_results)
Pairwise comparisons using Wilcoxon rank sum test with continuity correction
data: data_cluster_profile_5$Q12g and data_cluster_profile_5$cluster
1 2 3 4
2 1.0000 - - -
3 0.1240 0.3440 - -
4 1.0000 1.0000 1.0000 -
5 1.0000 0.6375 0.0046 1.0000
P value adjustment method: bonferroni
# =========================================================
# 6 VISUALIZATION OF DIFFERENCES BETWEEN CLUSTERS
# =========================================================
ggplot(data_cluster_profile_5, aes(x = cluster, y = Q12g)) +
geom_boxplot() +
labs(
title = "Preference for Human Interaction by Cluster",
x = "Cluster",
y = "Preference for Human Interaction (Q12g)"
) +
theme_minimal()
# =====================================================
# ASSUMPTION CHECKS – H1e
# =====================================================
library(ggplot2)
library(car)
# Normality test
shapiro.test(data_cluster_profile_5$Q12g)
Shapiro-Wilk normality test
data: data_cluster_profile_5$Q12g
W = 0.78641, p-value < 2.2e-16
# Histogram
ggplot(data_cluster_profile_5, aes(x = Q12g)) +
geom_histogram(binwidth = 1) +
theme_minimal()
# QQ plot
qqnorm(data_cluster_profile_5$Q12g)
qqline(data_cluster_profile_5$Q12g)
# Homogeneity of variances
leveneTest(Q12g ~ cluster, data = data_cluster_profile_5)
Levene's Test for Homogeneity of Variance (center = median)
Df F value Pr(>F)
group 4 2.6236 0.03484 *
315
---
Signif. codes: 0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1
# =====================================================
# KRUSKAL-WALLIS TEST
# =====================================================
kruskal.test(Q12g ~ cluster, data = data_cluster_profile_5)
Kruskal-Wallis rank sum test
data: Q12g by cluster
Kruskal-Wallis chi-squared = 13.865, df = 4, p-value = 0.007737
# =========================================================
# 4b EFFECT SIZE FOR KRUSKAL-WALLIS
# =========================================================
library(rstatix)
effect_size_Q12g <- data_cluster_profile_5 %>%
kruskal_effsize(Q12g ~ cluster)
print(effect_size_Q12g)
install.packages("coin")
WARNING: Rtools is required to build R packages but is not currently installed. Please download and install the appropriate version of Rtools before proceeding:
https://cran.rstudio.com/bin/windows/Rtools/
trying URL 'https://cran.rstudio.com/bin/windows/contrib/4.5/coin_1.4-3.zip'
Content type 'application/zip' length 1473035 bytes (1.4 MB)
downloaded 1.4 MB
package ‘coin’ successfully unpacked and MD5 sums checked
The downloaded binary packages are in
C:\Users\frach\AppData\Local\Temp\Rtmp0giUNF\downloaded_packages
library(dplyr)
library(rstatix)
library(coin)
library(purrr)
data_cluster_profile_5 <- data_cluster_profile_5 %>%
mutate(
cluster = as.factor(cluster),
Q12g = as.numeric(as.character(Q12g))
) %>%
filter(!is.na(cluster), !is.na(Q12g))
cluster_pairs <- combn(levels(data_cluster_profile_5$cluster), 2, simplify = FALSE)
pairwise_effects_Q12g <- map_dfr(cluster_pairs, function(x) {
tmp <- data_cluster_profile_5 %>%
filter(cluster %in% x) %>%
droplevels()
wilcox_effsize(data = tmp, Q12g ~ cluster) %>%
mutate(comparison = paste(x, collapse = " vs "))
})
print(pairwise_effects_Q12g)
# =========================================================
# HYPOTHESIS TEST – H2d
# The higher the importance of security, the higher the
# willingness to adopt AI agents in personal banking
# Variables:
# Q12a = Importance of security
# Q17 = Willingness to use AI banking agent
# =========================================================
# H0 (Null Hypothesis):
# There is no relationship between the importance of security (Q12a)
# and willingness to adopt AI banking agents (Q17).
# H1 (Research Hypothesis):
# Higher perceived importance of security is associated with a higher
# willingness to adopt AI banking agents.
# =========================================================
# 1 LOAD LIBRARIES
# =========================================================
library(dplyr)
library(ggplot2)
# =========================================================
# 2 PREPARE DATA
# =========================================================
data_cluster_profile_5 <- data_cluster_profile_5 %>%
mutate(
Q12a = as.numeric(Q12a),
Q17 = as.numeric(Q17)
)
# =========================================================
# 3 DESCRIPTIVE STATISTICS
# =========================================================
summary_stats <- data_cluster_profile_5 %>%
summarise(
mean_security = mean(Q12a, na.rm = TRUE),
sd_security = sd(Q12a, na.rm = TRUE),
mean_AI_adoption = mean(Q17, na.rm = TRUE),
sd_AI_adoption = sd(Q17, na.rm = TRUE)
)
print(summary_stats)
# =========================================================
# 4 CORRELATION TEST (Spearman for ordinal data)
# =========================================================
correlation_test <- cor.test(
data_cluster_profile_5$Q12a,
data_cluster_profile_5$Q17,
method = "spearman",
use = "complete.obs"
)
print(correlation_test)
Spearman's rank correlation rho
data: data_cluster_profile_5$Q12a and data_cluster_profile_5$Q17
S = 4229048, p-value = 0.7115
alternative hypothesis: true rho is not equal to 0
sample estimates:
rho
0.02158374
# =========================================================
# 5 VISUALIZATION
# =========================================================
ggplot(data_cluster_profile_5, aes(x = Q12a, y = Q17)) +
geom_jitter(alpha = 0.4) +
geom_smooth(method = "lm", se = FALSE) +
labs(
title = "Relationship Between Security Importance and AI Agent Adoption",
x = "Importance of Security (Q12a)",
y = "Willingness to Use AI Banking Agent (Q17)"
) +
theme_minimal()
# =====================================================
# ASSUMPTION CHECKS – H2d
# =====================================================
# Normality tests
shapiro.test(data_cluster_profile_5$Q12a)
Shapiro-Wilk normality test
data: data_cluster_profile_5$Q12a
W = 0.62052, p-value < 2.2e-16
shapiro.test(data_cluster_profile_5$Q17)
Shapiro-Wilk normality test
data: data_cluster_profile_5$Q17
W = 0.90494, p-value = 1.04e-12
# Histograms
ggplot(data_cluster_profile_5, aes(x = Q12a)) +
geom_histogram(binwidth = 1) +
theme_minimal()
ggplot(data_cluster_profile_5, aes(x = Q17)) +
geom_histogram(binwidth = 1) +
theme_minimal()
# QQ plots
qqnorm(data_cluster_profile_5$Q12a)
qqline(data_cluster_profile_5$Q12a)
qqnorm(data_cluster_profile_5$Q17)
qqline(data_cluster_profile_5$Q17)
# =====================================================
# SPEARMAN CORRELATION
# =====================================================
cor.test(
data_cluster_profile_5$Q12a,
data_cluster_profile_5$Q17,
method = "spearman"
)
Spearman's rank correlation rho
data: data_cluster_profile_5$Q12a and data_cluster_profile_5$Q17
S = 4229048, p-value = 0.7115
alternative hypothesis: true rho is not equal to 0
sample estimates:
rho
0.02158374
# =========================================================
# HYPOTHESIS TEST – H2e
# AI-enthusiastic guidance seekers perceive less financial
# stress than cautious guidance seekers and feature-oriented adopters
# Variable: Q5b
# =========================================================
library(dplyr)
library(ggplot2)
# ensure cluster is factor
data_cluster_profile_5$cluster <- as.factor(data_cluster_profile_5$cluster)
# convert Likert variable to numeric
data_cluster_profile_5 <- data_cluster_profile_5 %>%
mutate(Q5b = as.numeric(Q5b))
# =========================================================
# 1 DESCRIPTIVE STATISTICS
# =========================================================
cluster_summary_Q5b <- data_cluster_profile_5 %>%
group_by(cluster) %>%
summarise(
n = n(),
mean = mean(Q5b, na.rm = TRUE),
median = median(Q5b, na.rm = TRUE),
sd = sd(Q5b, na.rm = TRUE),
min = min(Q5b, na.rm = TRUE),
max = max(Q5b, na.rm = TRUE)
)
print(cluster_summary_Q5b)
# =========================================================
# 2 KRUSKAL-WALLIS TEST
# =========================================================
kruskal_test_Q5b <- kruskal.test(Q5b ~ cluster, data = data_cluster_profile_5)
print(kruskal_test_Q5b)
Kruskal-Wallis rank sum test
data: Q5b by cluster
Kruskal-Wallis chi-squared = 51.518, df = 4, p-value = 1.74e-10
# =========================================================
# 3 POST-HOC TEST (PAIRWISE COMPARISON)
# =========================================================
pairwise_results_Q5b <- pairwise.wilcox.test(
data_cluster_profile_5$Q5b,
data_cluster_profile_5$cluster,
p.adjust.method = "bonferroni"
)
print(pairwise_results_Q5b)
Pairwise comparisons using Wilcoxon rank sum test with continuity correction
data: data_cluster_profile_5$Q5b and data_cluster_profile_5$cluster
1 2 3 4
2 0.00228 - - -
3 1.00000 0.00314 - -
4 0.09388 7.8e-07 0.00759 -
5 0.00072 1.00000 0.00094 2.1e-06
P value adjustment method: bonferroni
# =========================================================
# 4 VISUALIZATION
# =========================================================
ggplot(data_cluster_profile_5, aes(x = cluster, y = Q5b)) +
geom_boxplot() +
labs(
title = "Perceived Financial Stress by Cluster",
x = "Cluster",
y = "Financial Stress (Q5b)"
) +
theme_minimal()
# =====================================================
# ASSUMPTION CHECKS – H2e
# =====================================================
shapiro.test(data_cluster_profile_5$Q5b)
Shapiro-Wilk normality test
data: data_cluster_profile_5$Q5b
W = 0.92498, p-value = 1.51e-11
ggplot(data_cluster_profile_5, aes(x = Q5b)) +
geom_histogram(binwidth = 1) +
theme_minimal()
qqnorm(data_cluster_profile_5$Q5b)
qqline(data_cluster_profile_5$Q5b)
leveneTest(Q5b ~ cluster, data = data_cluster_profile_5)
Levene's Test for Homogeneity of Variance (center = median)
Df F value Pr(>F)
group 4 1.5213 0.1957
313
# =====================================================
# KRUSKAL-WALLIS TEST
# =====================================================
kruskal.test(Q5b ~ cluster, data = data_cluster_profile_5)
Kruskal-Wallis rank sum test
data: Q5b by cluster
Kruskal-Wallis chi-squared = 51.518, df = 4, p-value = 1.74e-10
library(rstatix)
effect_size_Q5b <- data_cluster_profile_5 %>%
kruskal_effsize(Q5b ~ cluster)
print(effect_size_Q5b)
##H2f
# =====================================================
# CLEAN DATA (REMOVE INVALID AND MISSING VALUES)
# =====================================================
library(dplyr)
library(ggplot2)
data_clean <- data_cluster_profile_5 %>%
mutate(Q19 = as.numeric(Q19)) %>% # ensure numeric
filter(Q19 >= 1) # remove invalid values (<1) and NA
# =====================================================
# DESCRIPTIVE STATISTICS BY CLUSTER
# =====================================================
cluster_summary_Q19 <- data_clean %>%
group_by(cluster) %>%
summarise(
n = n(),
mean = mean(Q19, na.rm = TRUE),
median = median(Q19, na.rm = TRUE),
sd = sd(Q19, na.rm = TRUE),
min = min(Q19, na.rm = TRUE),
max = max(Q19, na.rm = TRUE)
)
print(cluster_summary_Q19)
# =====================================================
# KRUSKAL-WALLIS TEST
# =====================================================
kruskal_test_Q19 <- kruskal.test(Q19 ~ cluster, data = data_clean)
print(kruskal_test_Q19)
Kruskal-Wallis rank sum test
data: Q19 by cluster
Kruskal-Wallis chi-squared = 3.9454, df = 4, p-value = 0.4134
# =====================================================
# PAIRWISE WILCOXON TEST
# =====================================================
pairwise_results_Q19 <- pairwise.wilcox.test(
data_clean$Q19,
data_clean$cluster,
p.adjust.method = "bonferroni"
)
print(pairwise_results_Q19)
Pairwise comparisons using Wilcoxon rank sum test with continuity correction
data: data_clean$Q19 and data_clean$cluster
1 2 3 4
2 1 - - -
3 1 1 - -
4 1 1 1 -
5 1 1 1 1
P value adjustment method: bonferroni
# =====================================================
# VISUALIZATION
# =====================================================
ggplot(data_clean, aes(x = cluster, y = Q19)) +
geom_boxplot() +
labs(
title = "Willingness to Pay for AI Banking Assistant by Cluster",
x = "Cluster",
y = "Monthly Willingness to Pay (€)"
) +
theme_minimal()
# =====================================================
# ASSUMPTION CHECKS – H2f
# =====================================================
shapiro.test(data_clean$Q19)
Shapiro-Wilk normality test
data: data_clean$Q19
W = 0.93183, p-value = 0.002376
ggplot(data_clean, aes(x = Q19)) +
geom_histogram(binwidth = 1) +
theme_minimal()
qqnorm(data_clean$Q19)
qqline(data_clean$Q19)
leveneTest(Q19 ~ cluster, data = data_clean)
Levene's Test for Homogeneity of Variance (center = median)
Df F value Pr(>F)
group 4 0.6078 0.6587
55
# =====================================================
# KRUSKAL-WALLIS TEST
# =====================================================
kruskal.test(Q19 ~ cluster, data = data_clean)
Kruskal-Wallis rank sum test
data: Q19 by cluster
Kruskal-Wallis chi-squared = 3.9454, df = 4, p-value = 0.4134
df <- data_cluster_profile_5
# =====================================================
# KRUSKAL-WALLIS TEST
# =====================================================
kruskal_test_Q19 <- kruskal.test(Q19 ~ cluster, data = data_clean)
print(kruskal_test_Q19)
Kruskal-Wallis rank sum test
data: Q19 by cluster
Kruskal-Wallis chi-squared = 3.9454, df = 4, p-value = 0.4134
# =====================================================
# EFFECT SIZE
# =====================================================
library(rstatix)
effect_size_Q19 <- data_clean %>%
kruskal_effsize(Q19 ~ cluster)
print(effect_size_Q19)
library(dplyr)
library(rstatix)
library(coin)
library(purrr)
data_clean <- data_cluster_profile_5 %>%
mutate(
cluster = as.factor(cluster),
Q19 = as.numeric(as.character(Q19))
) %>%
filter(!is.na(cluster), !is.na(Q19), Q19 >= 1)
cluster_pairs <- combn(levels(data_clean$cluster), 2, simplify = FALSE)
pairwise_effects_Q19 <- map_dfr(cluster_pairs, function(x) {
tmp <- data_clean %>%
filter(cluster %in% x) %>%
droplevels()
wilcox_effsize(data = tmp, Q19 ~ cluster) %>%
mutate(comparison = paste(x, collapse = " vs "))
})
print(pairwise_effects_Q19)
library(dplyr)
df <- df |>
mutate(
across(c(Q10a, Q10b, Q10c, Q10d,
Q11a, Q11b, Q11c, Q11d, Q11e, Q11f,
Q17), as.numeric),
low_risk_delegate = rowMeans(pick(Q11a, Q11b, Q11f), na.rm = TRUE),
high_risk_delegate = rowMeans(pick(Q11c, Q11d, Q11e), na.rm = TRUE),
confirm_required = Q10c,
autonomous_decisions = Q10d,
trust_ai = rowMeans(pick(Q10a, Q10b, Q10c, Q10d), na.rm = TRUE),
intention_use = Q17
)
df$cluster <- factor(df$cluster)
table(df$cluster)
1 2 3 4 5
55 98 73 32 62
df |>
summarise(
h1a_low_min = min(low_risk_delegate, na.rm = TRUE),
h1a_low_max = max(low_risk_delegate, na.rm = TRUE),
h1a_high_min = min(high_risk_delegate, na.rm = TRUE),
h1a_high_max = max(high_risk_delegate, na.rm = TRUE),
h1b_confirm_min = min(confirm_required, na.rm = TRUE),
h1b_confirm_max = max(confirm_required, na.rm = TRUE),
h1b_auto_min = min(autonomous_decisions, na.rm = TRUE),
h1b_auto_max = max(autonomous_decisions, na.rm = TRUE),
h2a_trust_min = min(trust_ai, na.rm = TRUE),
h2a_trust_max = max(trust_ai, na.rm = TRUE),
h2c_intention_min = min(intention_use, na.rm = TRUE),
h2c_intention_max = max(intention_use, na.rm = TRUE)
)
h1a_results <- df |>
group_by(cluster) |>
do({
sub <- .
test <- t.test(sub$low_risk_delegate, sub$high_risk_delegate, paired = TRUE)
tibble(
n = nrow(sub),
low_mean = mean(sub$low_risk_delegate, na.rm = TRUE),
high_mean = mean(sub$high_risk_delegate, na.rm = TRUE),
t_value = unname(test$statistic),
p_value = test$p.value
)
})
h1a_results
df <- df |>
mutate(
h1a_diff = low_risk_delegate - high_risk_delegate
)
by(df$h1a_diff, df$cluster, shapiro.test)
df$cluster: 1
Shapiro-Wilk normality test
data: dd[x, ]
W = 0.88369, p-value = 7.063e-05
----------------------------------------------------------------
df$cluster: 2
Shapiro-Wilk normality test
data: dd[x, ]
W = 0.95528, p-value = 0.002138
----------------------------------------------------------------
df$cluster: 3
Shapiro-Wilk normality test
data: dd[x, ]
W = 0.96967, p-value = 0.07566
----------------------------------------------------------------
df$cluster: 4
Shapiro-Wilk normality test
data: dd[x, ]
W = 0.74155, p-value = 3.872e-06
----------------------------------------------------------------
df$cluster: 5
Shapiro-Wilk normality test
data: dd[x, ]
W = 0.94489, p-value = 0.00763
h1a_wilcox_results <- df |>
group_by(cluster) |>
do({
sub <- .
test <- wilcox.test(sub$low_risk_delegate, sub$high_risk_delegate, paired = TRUE)
tibble(
n = nrow(sub),
low_mean = mean(sub$low_risk_delegate, na.rm = TRUE),
high_mean = mean(sub$high_risk_delegate, na.rm = TRUE),
p_value = test$p.value
)
})
h1a_wilcox_results
h1a_final_results <- df |>
group_by(cluster) |>
do({
sub <- .
t_res <- t.test(sub$low_risk_delegate, sub$high_risk_delegate, paired = TRUE)
w_res <- wilcox.test(sub$low_risk_delegate, sub$high_risk_delegate, paired = TRUE)
diff <- sub$low_risk_delegate - sub$high_risk_delegate
diff <- diff[!is.na(diff)]
dz <- if (sd(diff) == 0) NA_real_ else mean(diff) / sd(diff)
diff_nz <- diff[diff != 0]
if (length(diff_nz) == 0) {
rbc <- NA_real_
} else {
ranks <- rank(abs(diff_nz))
W_pos <- sum(ranks[diff_nz > 0])
W_neg <- sum(ranks[diff_nz < 0])
rbc <- (W_pos - W_neg) / (W_pos + W_neg)
}
tibble(
n = length(diff),
low_mean = mean(sub$low_risk_delegate, na.rm = TRUE),
high_mean = mean(sub$high_risk_delegate, na.rm = TRUE),
mean_diff = mean(diff),
t_value = unname(t_res$statistic),
t_p_value = t_res$p.value,
wilcox_p_value = w_res$p.value,
cohen_dz = dz,
rank_biserial = rbc
)
})
h1a_final_results
h1b_results <- df |>
group_by(cluster) |>
do({
sub <- .
test <- t.test(sub$confirm_required, sub$autonomous_decisions, paired = TRUE)
tibble(
n = nrow(sub),
confirm_mean = mean(sub$confirm_required, na.rm = TRUE),
autonomous_mean = mean(sub$autonomous_decisions, na.rm = TRUE),
t_value = unname(test$statistic),
p_value = test$p.value
)
})
h1b_results
df <- df |>
mutate(
h1b_diff = confirm_required - autonomous_decisions
)
by(df$h1b_diff, df$cluster, shapiro.test)
df$cluster: 1
Shapiro-Wilk normality test
data: dd[x, ]
W = 0.86987, p-value = 2.597e-05
----------------------------------------------------------------
df$cluster: 2
Shapiro-Wilk normality test
data: dd[x, ]
W = 0.94847, p-value = 0.0008754
----------------------------------------------------------------
df$cluster: 3
Shapiro-Wilk normality test
data: dd[x, ]
W = 0.89899, p-value = 2.478e-05
----------------------------------------------------------------
df$cluster: 4
Shapiro-Wilk normality test
data: dd[x, ]
W = 0.74448, p-value = 4.324e-06
----------------------------------------------------------------
df$cluster: 5
Shapiro-Wilk normality test
data: dd[x, ]
W = 0.87691, p-value = 1.58e-05
h1b_wilcox_results <- df |>
group_by(cluster) |>
do({
sub <- .
test <- wilcox.test(
sub$confirm_required,
sub$autonomous_decisions,
paired = TRUE,
exact = FALSE
)
tibble(
n = nrow(sub),
confirm_mean = mean(sub$confirm_required, na.rm = TRUE),
autonomous_mean = mean(sub$autonomous_decisions, na.rm = TRUE),
p_value = test$p.value
)
})
h1b_wilcox_results
h1b_final_results <- df |>
group_by(cluster) |>
do({
sub <- .
t_res <- t.test(sub$confirm_required, sub$autonomous_decisions, paired = TRUE)
w_res <- wilcox.test(
sub$confirm_required,
sub$autonomous_decisions,
paired = TRUE,
exact = FALSE
)
diff <- sub$confirm_required - sub$autonomous_decisions
diff <- diff[!is.na(diff)]
dz <- if (sd(diff) == 0) NA_real_ else mean(diff) / sd(diff)
diff_nz <- diff[diff != 0]
if (length(diff_nz) == 0) {
rbc <- NA_real_
} else {
ranks <- rank(abs(diff_nz))
W_pos <- sum(ranks[diff_nz > 0])
W_neg <- sum(ranks[diff_nz < 0])
rbc <- (W_pos - W_neg) / (W_pos + W_neg)
}
tibble(
n = length(diff),
confirm_mean = mean(sub$confirm_required, na.rm = TRUE),
autonomous_mean = mean(sub$autonomous_decisions, na.rm = TRUE),
mean_diff = mean(diff),
t_value = unname(t_res$statistic),
t_p_value = t_res$p.value,
wilcox_p_value = w_res$p.value,
cohen_dz = dz,
rank_biserial = rbc
)
})
h1b_final_results
library(dplyr)
# make sure variables are in the right format
df$cluster <- as.factor(df$cluster)
df$trust_ai <- as.numeric(df$trust_ai)
## H2a
h2a_model <- aov(trust_ai ~ cluster, data = df)
summary(h2a_model)
Df Sum Sq Mean Sq F value Pr(>F)
cluster 4 385.0 96.26 76.05 <2e-16 ***
Residuals 315 398.7 1.27
---
Signif. codes: 0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1
# Post-hoc option 1: Tukey
TukeyHSD(h2a_model)
Tukey multiple comparisons of means
95% family-wise confidence level
Fit: aov(formula = trust_ai ~ cluster, data = df)
$cluster
diff lwr upr p adj
2-1 1.1170686 0.597018142 1.6371191 0.0000001
3-1 1.5861768 1.035043859 2.1373098 0.0000000
4-1 -1.2555398 -1.941813671 -0.5692659 0.0000086
5-1 2.5143695 1.942614555 3.0861244 0.0000000
3-2 0.4691082 -0.008110863 0.9463272 0.0566237
4-2 -2.3726084 -3.001068703 -1.7441481 0.0000000
5-2 1.3973009 0.896407469 1.8981942 0.0000000
4-3 -2.8417166 -3.496130365 -2.1873029 0.0000000
5-3 0.9281927 0.395098660 1.4612867 0.0000268
5-4 3.7699093 3.098036131 4.4417824 0.0000000
# Post-hoc option 2: pairwise t-tests with Bonferroni
pairwise.t.test(
x = df$trust_ai,
g = df$cluster,
p.adjust.method = "bonferroni"
)
Pairwise comparisons using t tests with pooled SD
data: df$trust_ai and df$cluster
1 2 3 4
2 9.7e-08 - - -
3 4.8e-13 0.074 - -
4 8.7e-06 < 2e-16 < 2e-16 -
5 < 2e-16 2.4e-12 2.7e-05 < 2e-16
P value adjustment method: bonferroni
# Descriptive means
aggregate(trust_ai ~ cluster, data = df, mean, na.rm = TRUE)
install.packages("effectsize")
WARNING: Rtools is required to build R packages but is not currently installed. Please download and install the appropriate version of Rtools before proceeding:
https://cran.rstudio.com/bin/windows/Rtools/
There is a binary version available but the source version is later:
trying URL 'https://cran.rstudio.com/src/contrib/effectsize_1.0.2.tar.gz'
Content type 'application/x-gzip' length 396056 bytes (386 KB)
downloaded 386 KB
* installing *source* package 'effectsize' ...
** this is package 'effectsize' version '1.0.2'
** package 'effectsize' successfully unpacked and MD5 sums checked
** using staged installation
** R
** data
** inst
** byte-compile and prepare package for lazy loading
** help
*** installing help indices
*** copying figures
** building package indices
** installing vignettes
** testing if installed package can be loaded from temporary location
** testing if installed package can be loaded from final location
** testing if installed package keeps a record of temporary installation path
* DONE (effectsize)
The downloaded source packages are in
‘C:\Users\frach\AppData\Local\Temp\Rtmp0giUNF\downloaded_packages’
library(effectsize)
library(dplyr)
library(effectsize)
df$cluster <- as.factor(df$cluster)
df$trust_ai <- as.numeric(df$trust_ai)
## H2a
h2a_model <- aov(trust_ai ~ cluster, data = df)
summary(h2a_model)
Df Sum Sq Mean Sq F value Pr(>F)
cluster 4 385.0 96.26 76.05 <2e-16 ***
Residuals 315 398.7 1.27
---
Signif. codes: 0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1
TukeyHSD(h2a_model)
Tukey multiple comparisons of means
95% family-wise confidence level
Fit: aov(formula = trust_ai ~ cluster, data = df)
$cluster
diff lwr upr p adj
2-1 1.1170686 0.597018142 1.6371191 0.0000001
3-1 1.5861768 1.035043859 2.1373098 0.0000000
4-1 -1.2555398 -1.941813671 -0.5692659 0.0000086
5-1 2.5143695 1.942614555 3.0861244 0.0000000
3-2 0.4691082 -0.008110863 0.9463272 0.0566237
4-2 -2.3726084 -3.001068703 -1.7441481 0.0000000
5-2 1.3973009 0.896407469 1.8981942 0.0000000
4-3 -2.8417166 -3.496130365 -2.1873029 0.0000000
5-3 0.9281927 0.395098660 1.4612867 0.0000268
5-4 3.7699093 3.098036131 4.4417824 0.0000000
aggregate(trust_ai ~ cluster, data = df, mean, na.rm = TRUE)
# Effect sizes
eta_squared(h2a_model)
# Effect Size for ANOVA
Parameter | Eta2 | 95% CI
-------------------------------
cluster | 0.49 | [0.43, 1.00]
- One-sided CIs: upper bound fixed at [1.00].
omega_squared(h2a_model)
# Effect Size for ANOVA
Parameter | Omega2 | 95% CI
---------------------------------
cluster | 0.48 | [0.42, 1.00]
- One-sided CIs: upper bound fixed at [1.00].
h2a_eta <- eta_squared(h2a_model)
print(h2a_eta)
# Effect Size for ANOVA
Parameter | Eta2 | 95% CI
-------------------------------
cluster | 0.49 | [0.43, 1.00]
- One-sided CIs: upper bound fixed at [1.00].
h2a_omega <- omega_squared(h2a_model)
print(h2a_omega)
# Effect Size for ANOVA
Parameter | Omega2 | 95% CI
---------------------------------
cluster | 0.48 | [0.42, 1.00]
- One-sided CIs: upper bound fixed at [1.00].
h2c_results <- lapply(levels(df$cluster), function(cl) {
sub <- df |>
filter(cluster == cl)
model <- lm(intention_use ~ trust_ai, data = sub)
coefs <- summary(model)$coefficients
tibble(
cluster = cl,
b_trust = coefs["trust_ai", "Estimate"],
p_value = coefs["trust_ai", "Pr(>|t|)"],
r_squared = summary(model)$r.squared
)
}) |>
bind_rows()
h2c_results
for(cl in levels(df$cluster)) {
sub <- df |>
filter(cluster == cl)
model <- lm(intention_use ~ trust_ai, data = sub)
cat("\n====================================\n")
cat("CLUSTER:", cl, "\n")
cat("====================================\n")
print(shapiro.test(residuals(model)))
}
====================================
CLUSTER: 1
====================================
Shapiro-Wilk normality test
data: residuals(model)
W = 0.96142, p-value = 0.1019
====================================
CLUSTER: 2
====================================
Shapiro-Wilk normality test
data: residuals(model)
W = 0.95435, p-value = 0.003354
====================================
CLUSTER: 3
====================================
Shapiro-Wilk normality test
data: residuals(model)
W = 0.95225, p-value = 0.01102
====================================
CLUSTER: 4
====================================
Shapiro-Wilk normality test
data: residuals(model)
W = 0.87471, p-value = 0.002561
====================================
CLUSTER: 5
====================================
Shapiro-Wilk normality test
data: residuals(model)
W = 0.86598, p-value = 9.328e-06
h2c_spearman <- lapply(levels(df$cluster), function(cl) {
sub <- df |>
filter(cluster == cl)
test <- cor.test(sub$trust_ai, sub$intention_use, method = "spearman")
tibble(
cluster = cl,
rho = unname(test$estimate),
p_value = test$p.value
)
}) |>
bind_rows()
h2c_spearman
h2c_results <- lapply(levels(df$cluster), function(cl) {
sub <- df |>
filter(cluster == cl)
model <- lm(intention_use ~ trust_ai, data = sub)
coefs <- summary(model)$coefficients
tibble(
cluster = cl,
b_trust = coefs["trust_ai", "Estimate"],
p_value = coefs["trust_ai", "Pr(>|t|)"],
r_squared = summary(model)$r.squared,
adj_r_squared = summary(model)$adj.r.squared
)
}) |>
bind_rows()
h2c_results
library(dplyr)
library(tidyr)
library(purrr)
library(tibble)
data_cluster_profile_5 <- data_cluster_profile_5 |>
mutate(
cluster_named = factor(
cluster,
levels = c(1, 2, 3, 4, 5),
labels = c(
"Skeptical support seekers",
"Cautious guidance seekers",
"Feature-oriented adopters",
"AI-resistant independents",
"AI-enthusiastic guidance seekers"
)
)
)
bank_labels <- c(
a = "OTP",
b = "Gorenjska Banka",
c = "NLB",
d = "Revolut",
e = "N26",
f = "Intesa SanPaolo",
g = "UniCredit"
)
run_bank_pairwise_tests <- function(data, cluster_name) {
subset_data <- data |>
filter(cluster_named == cluster_name) |>
mutate(across(Q13a:Q15g, as.numeric))
compare_dimension <- function(df, vars, dimension_name) {
pairs <- combn(vars, 2, simplify = FALSE)
results <- lapply(pairs, function(pair) {
v1 <- pair[1]
v2 <- pair[2]
x <- df[[v1]]
y <- df[[v2]]
complete_idx <- complete.cases(x, y)
x <- x[complete_idx]
y <- y[complete_idx]
test <- wilcox.test(x, y, paired = TRUE, exact = FALSE)
tibble(
cluster = cluster_name,
dimension = dimension_name,
bank_1 = bank_labels[substr(v1, nchar(v1), nchar(v1))],
bank_2 = bank_labels[substr(v2, nchar(v2), nchar(v2))],
n = length(x),
mean_bank_1 = mean(x, na.rm = TRUE),
mean_bank_2 = mean(y, na.rm = TRUE),
p_value = test$p.value
)
})
bind_rows(results) |>
mutate(
p_adjusted = p.adjust(p_value, method = "bonferroni"),
significant = ifelse(p_adjusted < 0.05, "Yes", "No")
) |>
arrange(p_adjusted)
}
innovation_results <- compare_dimension(
subset_data,
vars = c("Q13a", "Q13b", "Q13c", "Q13d", "Q13e", "Q13f", "Q13g"),
dimension_name = "Innovation"
)
support_results <- compare_dimension(
subset_data,
vars = c("Q14a", "Q14b", "Q14c", "Q14d", "Q14e", "Q14f", "Q14g"),
dimension_name = "Customer support"
)
reliability_results <- compare_dimension(
subset_data,
vars = c("Q15a", "Q15b", "Q15c", "Q15d", "Q15e", "Q15f", "Q15g"),
dimension_name = "Reliability"
)
bind_rows(
innovation_results,
support_results,
reliability_results
)
}
all_cluster_bank_tests <- bind_rows(
run_bank_pairwise_tests(data_cluster_profile_5, "Skeptical support seekers"),
run_bank_pairwise_tests(data_cluster_profile_5, "Cautious guidance seekers"),
run_bank_pairwise_tests(data_cluster_profile_5, "Feature-oriented adopters"),
run_bank_pairwise_tests(data_cluster_profile_5, "AI-resistant independents"),
run_bank_pairwise_tests(data_cluster_profile_5, "AI-enthusiastic guidance seekers")
)
all_cluster_bank_tests
all_cluster_bank_tests_sig <- all_cluster_bank_tests |>
filter(p_adjusted < 0.05)
all_cluster_bank_tests_sig
all_cluster_bank_tests_sig <- all_cluster_bank_tests_sig |>
arrange(cluster, dimension, p_adjusted)
all_cluster_bank_tests_sig
all_cluster_bank_tests_sig |>
count(cluster, dimension)
write.csv(
all_cluster_bank_tests,
"all_cluster_bank_pairwise_tests.csv",
row.names = FALSE
)
write.csv(
all_cluster_bank_tests_sig,
"significant_cluster_bank_pairwise_tests.csv",
row.names = FALSE
)
# Recreate readable output tables from all_cluster_bank_tests
interpretable_results <- all_cluster_bank_tests |>
mutate(
mean_bank_1 = round(mean_bank_1, 2),
mean_bank_2 = round(mean_bank_2, 2),
mean_difference = round(mean_bank_1 - mean_bank_2, 2),
higher_rated_bank = case_when(
mean_bank_1 > mean_bank_2 ~ bank_1,
mean_bank_2 > mean_bank_1 ~ bank_2,
TRUE ~ "Equal"
),
interpretation = case_when(
p_adjusted < 0.05 & mean_bank_1 > mean_bank_2 ~
paste0(bank_1, " is rated significantly higher than ", bank_2),
p_adjusted < 0.05 & mean_bank_2 > mean_bank_1 ~
paste0(bank_2, " is rated significantly higher than ", bank_1),
TRUE ~ "No significant difference"
),
p_value = round(p_value, 4),
p_adjusted = round(p_adjusted, 4)
) |>
arrange(cluster, dimension, p_adjusted)
significant_results_clean <- interpretable_results |>
filter(p_adjusted < 0.05) |>
select(
cluster,
dimension,
bank_1,
bank_2,
mean_bank_1,
mean_bank_2,
mean_difference,
higher_rated_bank,
p_value,
p_adjusted,
interpretation
)
report_table <- significant_results_clean |>
transmute(
Cluster = cluster,
Dimension = dimension,
Comparison = paste(bank_1, "vs", bank_2),
`Mean bank 1` = mean_bank_1,
`Mean bank 2` = mean_bank_2,
`Mean difference` = mean_difference,
`Higher-rated bank` = higher_rated_bank,
`Raw p-value` = p_value,
`Adjusted p-value` = p_adjusted,
Interpretation = interpretation
)
# Show outputs
interpretable_results
significant_results_clean
report_table
all_cluster_bank_tests |>
mutate(
mean_bank_1 = round(mean_bank_1, 2),
mean_bank_2 = round(mean_bank_2, 2),
p_adjusted = round(p_adjusted, 4),
result = case_when(
p_adjusted < 0.05 & mean_bank_1 > mean_bank_2 ~ paste(bank_1, ">", bank_2),
p_adjusted < 0.05 & mean_bank_2 > mean_bank_1 ~ paste(bank_2, ">", bank_1),
TRUE ~ "No significant difference"
)
) |>
filter(p_adjusted < 0.05) |>
select(cluster, dimension, bank_1, bank_2, mean_bank_1, mean_bank_2, p_adjusted, result) |>
arrange(cluster, dimension, p_adjusted)
library(dplyr)
q19_cluster_summary <- data_cluster_profile_5 |>
mutate(
cluster = factor(
cluster,
levels = c(1, 2, 3, 4, 5),
labels = c(
"Skeptical support seekers",
"Cautious guidance seekers",
"Feature-oriented adopters",
"AI-resistant independents",
"AI-enthusiastic guidance seekers"
)
),
Q19 = as.numeric(Q19)
) |>
filter(!is.na(Q19), Q19 >= 1) |>
group_by(cluster) |>
summarise(
n = n(),
mean_q19 = mean(Q19, na.rm = TRUE),
median_q19 = median(Q19, na.rm = TRUE),
sd_q19 = sd(Q19, na.rm = TRUE),
min_q19 = min(Q19, na.rm = TRUE),
max_q19 = max(Q19, na.rm = TRUE),
.groups = "drop"
) |>
mutate(
mean_q19 = round(mean_q19, 2),
median_q19 = round(median_q19, 2),
sd_q19 = round(sd_q19, 2)
) |>
arrange(desc(mean_q19))
q19_cluster_summary
kruskal.test(Q19 ~ cluster, data = data_cluster_profile_5 |>
mutate(
cluster = factor(
cluster,
levels = c(1, 2, 3, 4, 5),
labels = c(
"Skeptical support seekers",
"Cautious guidance seekers",
"Feature-oriented adopters",
"AI-resistant independents",
"AI-enthusiastic guidance seekers"
)
),
Q19 = as.numeric(Q19)
) |>
filter(!is.na(Q19), Q19 >= 1)
)
Kruskal-Wallis rank sum test
data: Q19 by cluster
Kruskal-Wallis chi-squared = 3.9454, df = 4, p-value = 0.4134
library(dplyr)
nlb_revolut_n26_overlap <- data_cluster_profile_5 |>
mutate(
cluster = factor(
cluster,
levels = c(1, 2, 3, 4, 5),
labels = c(
"Skeptical support seekers",
"Cautious guidance seekers",
"Feature-oriented adopters",
"AI-resistant independents",
"AI-enthusiastic guidance seekers"
)
),
Q25c = as.numeric(Q25c), # NLB
Q25d = as.numeric(Q25d), # Revolut
Q25e = as.numeric(Q25e) # N26
) |>
group_by(cluster) |>
summarise(
cluster_n = n(),
n_nlb_revolut_n26 = sum(Q25c == 1 & Q25d == 1 & Q25e == 1, na.rm = TRUE),
percent_nlb_revolut_n26 = round(100 * n_nlb_revolut_n26 / cluster_n, 1),
.groups = "drop"
)
nlb_revolut_n26_overlap
library(dplyr)
nlb_revolut_n26_combinations <- data_cluster_profile_5 |>
mutate(
cluster = factor(
cluster,
levels = c(1, 2, 3, 4, 5),
labels = c(
"Skeptical support seekers",
"Cautious guidance seekers",
"Feature-oriented adopters",
"AI-resistant independents",
"AI-enthusiastic guidance seekers"
)
),
Q25c = as.numeric(Q25c), # NLB
Q25d = as.numeric(Q25d), # Revolut
Q25e = as.numeric(Q25e) # N26
) |>
mutate(
bank_combo = case_when(
Q25c == 1 & Q25d == 1 & Q25e == 1 ~ "NLB + Revolut + N26",
Q25c == 1 & Q25d == 1 & (is.na(Q25e) | Q25e != 1) ~ "NLB + Revolut",
Q25c == 1 & Q25e == 1 & (is.na(Q25d) | Q25d != 1) ~ "NLB + N26",
Q25d == 1 & Q25e == 1 & (is.na(Q25c) | Q25c != 1) ~ "Revolut + N26",
Q25c == 1 & (is.na(Q25d) | Q25d != 1) & (is.na(Q25e) | Q25e != 1) ~ "NLB only",
Q25d == 1 & (is.na(Q25c) | Q25c != 1) & (is.na(Q25e) | Q25e != 1) ~ "Revolut only",
Q25e == 1 & (is.na(Q25c) | Q25c != 1) & (is.na(Q25d) | Q25d != 1) ~ "N26 only",
TRUE ~ "None of these three"
)
) |>
group_by(cluster, bank_combo) |>
summarise(n = n(), .groups = "drop_last") |>
mutate(percent = round(100 * n / sum(n), 1)) |>
ungroup()
nlb_revolut_n26_combinations
library(dplyr)
library(tibble)
library(FactoMineR)
library(factoextra)
library(ggplot2)
cluster2_bank_means <- data_cluster_profile_5 |>
filter(cluster == 2) |>
mutate(
across(Q13a:Q15g, as.numeric)
) |>
summarise(
OTP_innovation = mean(Q13a, na.rm = TRUE),
Gorenjska_innovation = mean(Q13b, na.rm = TRUE),
NLB_innovation = mean(Q13c, na.rm = TRUE),
Revolut_innovation = mean(Q13d, na.rm = TRUE),
N26_innovation = mean(Q13e, na.rm = TRUE),
Intesa_innovation = mean(Q13f, na.rm = TRUE),
UniCredit_innovation = mean(Q13g, na.rm = TRUE),
OTP_support = mean(Q14a, na.rm = TRUE),
Gorenjska_support = mean(Q14b, na.rm = TRUE),
NLB_support = mean(Q14c, na.rm = TRUE),
Revolut_support = mean(Q14d, na.rm = TRUE),
N26_support = mean(Q14e, na.rm = TRUE),
Intesa_support = mean(Q14f, na.rm = TRUE),
UniCredit_support = mean(Q14g, na.rm = TRUE),
OTP_reliability = mean(Q15a, na.rm = TRUE),
Gorenjska_reliability = mean(Q15b, na.rm = TRUE),
NLB_reliability = mean(Q15c, na.rm = TRUE),
Revolut_reliability = mean(Q15d, na.rm = TRUE),
N26_reliability = mean(Q15e, na.rm = TRUE),
Intesa_reliability = mean(Q15f, na.rm = TRUE),
UniCredit_reliability = mean(Q15g, na.rm = TRUE)
)
cluster2_bank_means
cluster2_bank_table <- tibble(
bank = c("OTP", "Gorenjska Banka", "NLB", "Revolut", "N26", "Intesa SanPaolo", "UniCredit"),
innovation = c(
mean(data_cluster_profile_5 |> filter(cluster == 2) |> pull(Q13a), na.rm = TRUE),
mean(data_cluster_profile_5 |> filter(cluster == 2) |> pull(Q13b), na.rm = TRUE),
mean(data_cluster_profile_5 |> filter(cluster == 2) |> pull(Q13c), na.rm = TRUE),
mean(data_cluster_profile_5 |> filter(cluster == 2) |> pull(Q13d), na.rm = TRUE),
mean(data_cluster_profile_5 |> filter(cluster == 2) |> pull(Q13e), na.rm = TRUE),
mean(data_cluster_profile_5 |> filter(cluster == 2) |> pull(Q13f), na.rm = TRUE),
mean(data_cluster_profile_5 |> filter(cluster == 2) |> pull(Q13g), na.rm = TRUE)
),
customer_support = c(
mean(data_cluster_profile_5 |> filter(cluster == 2) |> pull(Q14a), na.rm = TRUE),
mean(data_cluster_profile_5 |> filter(cluster == 2) |> pull(Q14b), na.rm = TRUE),
mean(data_cluster_profile_5 |> filter(cluster == 2) |> pull(Q14c), na.rm = TRUE),
mean(data_cluster_profile_5 |> filter(cluster == 2) |> pull(Q14d), na.rm = TRUE),
mean(data_cluster_profile_5 |> filter(cluster == 2) |> pull(Q14e), na.rm = TRUE),
mean(data_cluster_profile_5 |> filter(cluster == 2) |> pull(Q14f), na.rm = TRUE),
mean(data_cluster_profile_5 |> filter(cluster == 2) |> pull(Q14g), na.rm = TRUE)
),
reliability = c(
mean(data_cluster_profile_5 |> filter(cluster == 2) |> pull(Q15a), na.rm = TRUE),
mean(data_cluster_profile_5 |> filter(cluster == 2) |> pull(Q15b), na.rm = TRUE),
mean(data_cluster_profile_5 |> filter(cluster == 2) |> pull(Q15c), na.rm = TRUE),
mean(data_cluster_profile_5 |> filter(cluster == 2) |> pull(Q15d), na.rm = TRUE),
mean(data_cluster_profile_5 |> filter(cluster == 2) |> pull(Q15e), na.rm = TRUE),
mean(data_cluster_profile_5 |> filter(cluster == 2) |> pull(Q15f), na.rm = TRUE),
mean(data_cluster_profile_5 |> filter(cluster == 2) |> pull(Q15g), na.rm = TRUE)
)
)
cluster2_bank_table
cluster2_mat <- cluster2_bank_table |>
column_to_rownames("bank") |>
as.matrix()
pca_cluster2 <- FactoMineR::PCA(cluster2_mat, scale.unit = TRUE, graph = FALSE)
cluster2_coordinates <- as.data.frame(pca_cluster2$ind$coord)
cluster2_coordinates$bank <- rownames(cluster2_coordinates)
rownames(cluster2_coordinates) <- NULL
cluster2_coordinates
cluster2_loadings <- as.data.frame(pca_cluster2$var$coord)
cluster2_loadings$dimension <- rownames(cluster2_loadings)
rownames(cluster2_loadings) <- NULL
cluster2_loadings
cluster2_nlb_explanation <- cluster2_bank_table |>
filter(bank == "NLB") |>
mutate(
Dim1_loading_innovation = pca_cluster2$var$coord["innovation", "Dim.1"],
Dim1_loading_support = pca_cluster2$var$coord["customer_support", "Dim.1"],
Dim1_loading_reliability = pca_cluster2$var$coord["reliability", "Dim.1"],
Dim2_loading_innovation = pca_cluster2$var$coord["innovation", "Dim.2"],
Dim2_loading_support = pca_cluster2$var$coord["customer_support", "Dim.2"],
Dim2_loading_reliability = pca_cluster2$var$coord["reliability", "Dim.2"],
NLB_Dim1 = pca_cluster2$ind$coord["NLB", "Dim.1"],
NLB_Dim2 = pca_cluster2$ind$coord["NLB", "Dim.2"]
)
cluster2_nlb_explanation
factoextra::fviz_pca_biplot(
pca_cluster2,
repel = TRUE,
col.var = "gray30",
col.ind = "steelblue",
pointsize = 3
) +
ggplot2::ggtitle("Perception Map of Banks - Cluster 2") +
ggplot2::xlab("Dim 1") +
ggplot2::ylab("Dim 2") +
ggplot2::theme_minimal()
cluster2_bank_table
cluster2_coordinates
cluster2_loadings
library(dplyr)
library(tibble)
library(FactoMineR)
library(factoextra)
library(ggplot2)
# --------------------------------------------
# 1) Bank mean table for cluster 3
# --------------------------------------------
cluster3_bank_table <- tibble(
bank = c("OTP", "Gorenjska Banka", "NLB", "Revolut", "N26", "Intesa SanPaolo", "UniCredit"),
innovation = c(
mean(data_cluster_profile_5 |> filter(cluster == 3) |> pull(Q13a), na.rm = TRUE),
mean(data_cluster_profile_5 |> filter(cluster == 3) |> pull(Q13b), na.rm = TRUE),
mean(data_cluster_profile_5 |> filter(cluster == 3) |> pull(Q13c), na.rm = TRUE),
mean(data_cluster_profile_5 |> filter(cluster == 3) |> pull(Q13d), na.rm = TRUE),
mean(data_cluster_profile_5 |> filter(cluster == 3) |> pull(Q13e), na.rm = TRUE),
mean(data_cluster_profile_5 |> filter(cluster == 3) |> pull(Q13f), na.rm = TRUE),
mean(data_cluster_profile_5 |> filter(cluster == 3) |> pull(Q13g), na.rm = TRUE)
),
customer_support = c(
mean(data_cluster_profile_5 |> filter(cluster == 3) |> pull(Q14a), na.rm = TRUE),
mean(data_cluster_profile_5 |> filter(cluster == 3) |> pull(Q14b), na.rm = TRUE),
mean(data_cluster_profile_5 |> filter(cluster == 3) |> pull(Q14c), na.rm = TRUE),
mean(data_cluster_profile_5 |> filter(cluster == 3) |> pull(Q14d), na.rm = TRUE),
mean(data_cluster_profile_5 |> filter(cluster == 3) |> pull(Q14e), na.rm = TRUE),
mean(data_cluster_profile_5 |> filter(cluster == 3) |> pull(Q14f), na.rm = TRUE),
mean(data_cluster_profile_5 |> filter(cluster == 3) |> pull(Q14g), na.rm = TRUE)
),
reliability = c(
mean(data_cluster_profile_5 |> filter(cluster == 3) |> pull(Q15a), na.rm = TRUE),
mean(data_cluster_profile_5 |> filter(cluster == 3) |> pull(Q15b), na.rm = TRUE),
mean(data_cluster_profile_5 |> filter(cluster == 3) |> pull(Q15c), na.rm = TRUE),
mean(data_cluster_profile_5 |> filter(cluster == 3) |> pull(Q15d), na.rm = TRUE),
mean(data_cluster_profile_5 |> filter(cluster == 3) |> pull(Q15e), na.rm = TRUE),
mean(data_cluster_profile_5 |> filter(cluster == 3) |> pull(Q15f), na.rm = TRUE),
mean(data_cluster_profile_5 |> filter(cluster == 3) |> pull(Q15g), na.rm = TRUE)
)
)
cluster3_bank_table
# --------------------------------------------
# 2) PCA for cluster 3
# --------------------------------------------
cluster3_mat <- cluster3_bank_table |>
column_to_rownames("bank") |>
as.matrix()
pca_cluster3 <- FactoMineR::PCA(cluster3_mat, scale.unit = TRUE, graph = FALSE)
# --------------------------------------------
# 3) Bank coordinates
# --------------------------------------------
cluster3_coordinates <- as.data.frame(pca_cluster3$ind$coord)
cluster3_coordinates$bank <- rownames(cluster3_coordinates)
rownames(cluster3_coordinates) <- NULL
cluster3_coordinates
# --------------------------------------------
# 4) Variable loadings
# --------------------------------------------
cluster3_loadings <- as.data.frame(pca_cluster3$var$coord)
cluster3_loadings$dimension <- rownames(cluster3_loadings)
rownames(cluster3_loadings) <- NULL
cluster3_loadings
# --------------------------------------------
# 5) NLB explanation table
# --------------------------------------------
cluster3_nlb_explanation <- cluster3_bank_table |>
filter(bank == "NLB") |>
mutate(
Dim1_loading_innovation = pca_cluster3$var$coord["innovation", "Dim.1"],
Dim1_loading_support = pca_cluster3$var$coord["customer_support", "Dim.1"],
Dim1_loading_reliability = pca_cluster3$var$coord["reliability", "Dim.1"],
Dim2_loading_innovation = pca_cluster3$var$coord["innovation", "Dim.2"],
Dim2_loading_support = pca_cluster3$var$coord["customer_support", "Dim.2"],
Dim2_loading_reliability = pca_cluster3$var$coord["reliability", "Dim.2"],
NLB_Dim1 = pca_cluster3$ind$coord["NLB", "Dim.1"],
NLB_Dim2 = pca_cluster3$ind$coord["NLB", "Dim.2"]
)
cluster3_nlb_explanation
# --------------------------------------------
# 6) Plot
# --------------------------------------------
factoextra::fviz_pca_biplot(
pca_cluster3,
repel = TRUE,
col.var = "gray30",
col.ind = "steelblue",
pointsize = 3
) +
ggplot2::ggtitle("Perception Map of Banks - Cluster 3") +
ggplot2::xlab("Dim 1") +
ggplot2::ylab("Dim 2") +
ggplot2::theme_minimal()
# --------------------------------------------
# 7) Compare NLB with Revolut and N26
# --------------------------------------------
cluster3_bank_table |>
filter(bank %in% c("NLB", "Revolut", "N26"))
cluster3_bank_table
library(dplyr)
library(tidyr)
library(purrr)
# -----------------------------
# 1) Q12 item labels
# -----------------------------
q12_labels <- c(
Q12a = "Security",
Q12b = "Personal data protection",
Q12c = "Trust in the bank",
Q12d = "Ease of use",
Q12e = "Transaction execution speed",
Q12f = "Variety of functions",
Q12g = "Availability of human support",
Q12h = "Personalized financial insights",
Q12i = "Understanding how the program makes decisions"
)
q12_vars <- names(q12_labels)
# -----------------------------
# 2) Attach the 5-cluster solution
# -----------------------------
cluster_labels <- c(
"Skeptical support seekers",
"Cautious guidance seekers",
"Feature-oriented adopters",
"AI-resistant independents",
"AI-enthusiastic guidance seekers"
)
data_cluster_profile_5 <- data_clean_complete %>%
mutate(
cluster = factor(
k5_pca2$cluster,
levels = 1:5,
labels = cluster_labels
)
)
# -----------------------------
# 3) Function to profile ONE cluster independently
# -----------------------------
profile_one_cluster_q12 <- function(data, cluster_name) {
data %>%
filter(cluster == cluster_name) %>%
select(all_of(q12_vars)) %>%
mutate(across(everything(), as.numeric)) %>%
pivot_longer(
cols = everything(),
names_to = "item",
values_to = "score"
) %>%
group_by(item) %>%
summarise(
n = sum(!is.na(score)),
mean = round(mean(score, na.rm = TRUE), 2),
sd = round(sd(score, na.rm = TRUE), 2),
median = round(median(score, na.rm = TRUE), 2),
pct_6_7 = round(mean(score %in% c(6, 7), na.rm = TRUE) * 100, 1),
pct_7 = round(mean(score == 7, na.rm = TRUE) * 100, 1),
.groups = "drop"
) %>%
mutate(
label = q12_labels[item],
cluster = cluster_name
) %>%
select(cluster, item, label, n, mean, sd, median, pct_6_7, pct_7) %>%
arrange(desc(mean))
}
# -----------------------------
# 4) Create a separate profile for each cluster
# -----------------------------
cluster_1_q12_profile <- profile_one_cluster_q12(
data_cluster_profile_5,
"Skeptical support seekers"
)
cluster_2_q12_profile <- profile_one_cluster_q12(
data_cluster_profile_5,
"Cautious guidance seekers"
)
cluster_3_q12_profile <- profile_one_cluster_q12(
data_cluster_profile_5,
"Feature-oriented adopters"
)
cluster_4_q12_profile <- profile_one_cluster_q12(
data_cluster_profile_5,
"AI-resistant independents"
)
cluster_5_q12_profile <- profile_one_cluster_q12(
data_cluster_profile_5,
"AI-enthusiastic guidance seekers"
)
# -----------------------------
# 5) View each cluster independently
# -----------------------------
cluster_1_q12_profile
cluster_2_q12_profile
cluster_3_q12_profile
cluster_4_q12_profile
cluster_5_q12_profile
cluster_q12_profiles <- setNames(
lapply(levels(data_cluster_profile_5$cluster), function(cl) {
profile_one_cluster_q12(data_cluster_profile_5, cl)
}),
levels(data_cluster_profile_5$cluster)
)
# Example:
cluster_q12_profiles[["Skeptical support seekers"]]
cluster_q12_profiles[["AI-resistant independents"]]
bank_data <- data_cluster_profile_5 |>
mutate(across(Q25a:Q25l, as.numeric))
bank_combinations <- bank_data |>
mutate(
# Revolut only
revolut_only =
Q25d == 1 &
rowSums(across(Q25a:Q25c)) == 0 &
rowSums(across(Q25e:Q25l)) == 0,
# NLB + Revolut
nlb_revolut =
Q25c == 1 & Q25d == 1,
# Revolut + any other bank except NLB
revolut_other =
Q25d == 1 &
rowSums(across(c(Q25a, Q25b, Q25e:Q25l))) > 0,
# Revolut + grouped "other banks" (all except NLB)
revolut_group_other =
Q25d == 1 &
rowSums(across(c(Q25a, Q25b, Q25e:Q25l))) > 0
)
revolut_cluster_table <- bank_combinations |>
group_by(cluster) |>
summarise(
n_cluster = n(),
revolut_only = sum(revolut_only, na.rm = TRUE),
nlb_revolut = sum(nlb_revolut, na.rm = TRUE),
revolut_other = sum(revolut_other, na.rm = TRUE),
revolut_group_other = sum(revolut_group_other, na.rm = TRUE),
revolut_only_pct = round(100 * revolut_only / n_cluster, 1),
nlb_revolut_pct = round(100 * nlb_revolut / n_cluster, 1),
revolut_other_pct = round(100 * revolut_other / n_cluster, 1),
revolut_group_other_pct = round(100 * revolut_group_other / n_cluster, 1)
)
revolut_cluster_table
NA
library(tidyr)
revolut_cluster_table |>
select(
cluster,
revolut_only_pct,
nlb_revolut_pct,
revolut_other_pct
) |>
pivot_longer(
cols = -cluster,
names_to = "combination",
values_to = "percent"
)
NA
bank_data <- data_cluster_profile_5 |>
mutate(across(Q25a:Q25l, as.numeric))
bank_groups <- bank_data |>
mutate(
# Revolut only
revolut_only =
Q25d == 1 &
rowSums(across(c(Q25a, Q25b, Q25c, Q25e:Q25l))) == 0,
# NLB + Revolut
nlb_revolut =
Q25c == 1 & Q25d == 1,
# OTP + Revolut
otp_revolut =
Q25a == 1 & Q25d == 1,
# Revolut + any other bank except NLB and OTP
revolut_other =
Q25d == 1 &
rowSums(across(c(Q25b, Q25e:Q25l))) > 0
)
revolut_cluster_summary <- bank_groups |>
group_by(cluster) |>
summarise(
n_cluster = n(),
revolut_only = sum(revolut_only, na.rm = TRUE),
nlb_revolut = sum(nlb_revolut, na.rm = TRUE),
otp_revolut = sum(otp_revolut, na.rm = TRUE),
revolut_other = sum(revolut_other, na.rm = TRUE),
revolut_only_pct = round(100 * revolut_only / n_cluster, 1),
nlb_revolut_pct = round(100 * nlb_revolut / n_cluster, 1),
otp_revolut_pct = round(100 * otp_revolut / n_cluster, 1),
revolut_other_pct = round(100 * revolut_other / n_cluster, 1)
)
revolut_cluster_summary
NA
library(tidyr)
revolut_cluster_summary |>
select(
cluster,
revolut_only_pct,
nlb_revolut_pct,
otp_revolut_pct,
revolut_other_pct
) |>
pivot_longer(
cols = -cluster,
names_to = "group",
values_to = "percent"
)
NA
library(dplyr)
bank_segments <- data_cluster_profile_5 |>
mutate(across(Q25a:Q25l, as.numeric)) |>
mutate(
segment = case_when(
Q25c == 1 & Q25d == 0 & Q25a == 0 ~ "NLB only",
Q25d == 1 & Q25c == 0 & Q25a == 0 ~ "Revolut only",
Q25a == 1 & Q25c == 0 & Q25d == 0 ~ "OTP only",
Q25c == 1 & Q25d == 1 & Q25a == 0 ~ "NLB + Revolut",
Q25c == 1 & Q25a == 1 & Q25d == 0 ~ "NLB + OTP",
Q25a == 1 & Q25d == 1 & Q25c == 0 ~ "OTP + Revolut",
Q25a == 1 & Q25c == 1 & Q25d == 1 ~ "NLB + Revolut + OTP",
TRUE ~ "None of these three"
)
)
segment_table <- bank_segments |>
group_by(cluster, segment) |>
summarise(n = n(), .groups = "drop") |>
group_by(cluster) |>
mutate(
percent = round(100 * n / sum(n), 1)
) |>
arrange(cluster, desc(percent))
segment_table
conversion_table <- data_cluster_profile_5 |>
mutate(across(Q25a:Q25l, as.numeric)) |>
group_by(cluster) |>
summarise(
cluster_size = n(),
revolut_users = sum(Q25d == 1, na.rm = TRUE),
nlb_users = sum(Q25c == 1, na.rm = TRUE),
nlb_revolut_users = sum(Q25c == 1 & Q25d == 1, na.rm = TRUE),
revolut_without_nlb = sum(Q25d == 1 & Q25c == 0, na.rm = TRUE),
revolut_users_pct = round(100 * revolut_users / cluster_size, 1),
revolut_without_nlb_pct =
round(100 * revolut_without_nlb / cluster_size, 1)
)
conversion_table
##Cluster 3
library(readxl)
library(dplyr)
library(tidyr)
library(tibble)
library(ggplot2)
## 1. Load and clean data ----
raw_sheet <- read_excel(
"Questionnaire_results_EN.xlsx",
sheet = "Podatki",
col_names = FALSE
)
var_names <- raw_sheet |>
slice(1) |>
unlist(use.names = FALSE) |>
as.character()
question_text <- raw_sheet |>
slice(2) |>
unlist(use.names = FALSE) |>
as.character()
data_raw <- raw_sheet |>
slice(-(1:2))
names(data_raw) <- var_names
data_raw <- data_raw |>
mutate(respondent_id = row_number()) |>
relocate(respondent_id)
data_clean <- data_raw |>
mutate(across(everything(),
~ replace(as.character(.), as.character(.) == "-1", NA))) |>
filter(!is.na(Q22), !is.na(Q23))
data_clean <- data_clean |>
mutate(
across(
Q13a:Q15g,
~ as.numeric(replace(., . == "8", NA))
)
)
## 2. Rebuild the 5 respondent clusters (same logic as before) ----
cluster_data <- data_clean |>
select(Q6a, Q6b, Q6d, Q6e, Q6h, Q6i,
Q8a, Q8b, Q8c, Q8d, Q8e) |>
mutate(across(everything(), as.numeric))
cluster_data_complete <- cluster_data |>
drop_na()
pca_cluster <- prcomp(cluster_data_complete, scale. = TRUE)
pca_scores_2 <- as.data.frame(pca_cluster$x[, 1:2])
set.seed(123)
k5_pca2 <- kmeans(pca_scores_2, centers = 5, nstart = 25)
data_clean_complete <- data_clean |>
filter(
!is.na(Q6a), !is.na(Q6b), !is.na(Q6d), !is.na(Q6e), !is.na(Q6h), !is.na(Q6i),
!is.na(Q8a), !is.na(Q8b), !is.na(Q8c), !is.na(Q8d), !is.na(Q8e)
)
data_cluster <- data_clean_complete |>
mutate(
cluster = factor(
k5_pca2$cluster,
levels = c(1, 2, 3, 4, 5),
labels = c(
"Skeptical support seekers",
"Cautious guidance seekers",
"Feature-oriented adopters", # cluster 3
"AI-resistant independents",
"AI-enthusiastic guidance seekers"
)
)
)
## 3. Hard-code banks and perception items (Q13–Q15) ----
# Q13: innovation, Q14: customer support, Q15: reliability
percept_vars <- c(
"Q13a","Q13b","Q13c","Q13d","Q13e","Q13f","Q13g",
"Q14a","Q14b","Q14c","Q14d","Q14e","Q14f","Q14g",
"Q15a","Q15b","Q15c","Q15d","Q15e","Q15f","Q15g"
)
# explicit mapping from column suffix (a–g) to bank name
bank_lookup <- tibble(
bank_code = letters[1:7],
bank_label = c("OTP",
"Gorenjska Banka",
"NLB",
"Revolut",
"N26",
"Intesa SanPaolo",
"UniCredit")
)
# helper table describing each Q13–15 variable
qmeta <- tibble(variable = percept_vars) %>%
mutate(
qcode = substr(variable, 1, 3), # Q13, Q14, Q15
bank_code = substr(variable, 4, 4) # a–g
) %>%
left_join(bank_lookup, by = "bank_code") %>%
mutate(
attribute = case_when(
qcode == "Q13" ~ "innovation",
qcode == "Q14" ~ "customer_support",
qcode == "Q15" ~ "reliability",
TRUE ~ qcode
)
)
## 4. Build bank x attribute matrix for cluster 3 ----
cluster3 <- data_cluster %>%
filter(cluster == "Feature-oriented adopters")
cluster3_percept <- cluster3 %>%
select(all_of(percept_vars)) %>%
mutate(across(everything(), as.numeric))
means_c3 <- cluster3_percept %>%
summarise(across(everything(), ~ mean(.x, na.rm = TRUE)))
means_long <- means_c3 %>%
pivot_longer(cols = everything(),
names_to = "variable",
values_to = "mean_score") %>%
left_join(qmeta, by = "variable")
bank_attr_mat <- means_long %>%
group_by(bank_label, attribute) %>%
summarise(mean_score = mean(mean_score), .groups = "drop") %>%
pivot_wider(
id_cols = bank_label,
names_from = attribute,
values_from = mean_score
) %>%
as.data.frame()
rownames(bank_attr_mat) <- bank_attr_mat$bank_label
bank_attr_mat$bank_label <- NULL
## 5. PCA for banks (cluster 3) ----
pca_banks_c3 <- prcomp(bank_attr_mat, scale. = TRUE)
scores_c3 <- as.data.frame(pca_banks_c3$x[, 1:2])
scores_c3$bank_label <- rownames(scores_c3)
loadings_c3 <- as.data.frame(pca_banks_c3$rotation[, 1:2])
loadings_c3$attribute <- rownames(loadings_c3)
arrow_scale <- 1.5
loadings_c3 <- loadings_c3 %>%
mutate(PC1 = PC1 * arrow_scale,
PC2 = PC2 * arrow_scale)
## 6. Plot: perceptual map of banks – cluster 3 ----
ggplot() +
geom_point(data = scores_c3,
aes(x = PC1, y = PC2),
size = 2.8, colour = "black") +
geom_text(data = scores_c3,
aes(x = PC1, y = PC2, label = bank_label),
vjust = -0.7, size = 3.2) +
geom_segment(data = loadings_c3,
aes(x = 0, y = 0, xend = PC1, yend = PC2),
arrow = arrow(length = unit(0.2, "cm")),
colour = "grey40") +
geom_text(data = loadings_c3,
aes(x = PC1, y = PC2, label = attribute),
hjust = 0.5, vjust = -0.4, colour = "grey40", size = 3) +
geom_hline(yintercept = 0, linetype = "dashed", colour = "grey75") +
geom_vline(xintercept = 0, linetype = "dashed", colour = "grey75") +
coord_equal(xlim = c(min(scores_c3$PC1) - 0.8, max(scores_c3$PC1) + 0.8),
ylim = c(min(scores_c3$PC2) - 0.8, max(scores_c3$PC2) + 0.8),
expand = TRUE) +
labs(
title = "Perceptual Map of Banks – Cluster 3",
x = "Overall Digital Banking Performance",
y = "Customer Support vs. Innovation"
) +
theme_minimal(base_size = 12) +
theme(
plot.title = element_text(hjust = 0.5, face = "bold"),
panel.grid.minor = element_blank()
)
bank_attr_mat
# loadings for each attribute on PC1 and PC2
load_table <- as.data.frame(pca_banks_c3$rotation[, 1:2])
load_table$attribute <- rownames(load_table)
rownames(load_table) <- NULL
load_table
# bank attribute means we already had:
bank_attr_mat # rows = banks, cols = innovation, customer_support, reliability
# join with scores on PC2
bank_pc2 <- scores_c3 %>%
select(bank_label, PC2) %>%
left_join(
bank_attr_mat %>%
tibble::rownames_to_column("bank_label"),
by = "bank_label"
)
bank_pc2
NA
bank_pc2 <- scores_c3 %>%
select(bank_label, PC2) %>%
left_join(
bank_attr_mat %>%
tibble::rownames_to_column("bank_label"),
by = "bank_label"
)
bank_pc2
# load_table: columns PC1, PC2, attribute (innovation, customer_support, reliability)
load_table
# 1) put PC2 loadings into a named vector
pc2_load <- load_table$PC2
names(pc2_load) <- load_table$attribute
# 2) compute contribution scores: attribute_mean * PC2 loading
contrib_pc2 <- bank_pc2 %>%
mutate(
contrib_customer_support = customer_support * pc2_load["customer_support"],
contrib_innovation = innovation * pc2_load["innovation"],
contrib_reliability = reliability * pc2_load["reliability"]
)
contrib_pc2
NA
contrib_pc2_long <- contrib_pc2 %>%
select(bank_label,
contrib_customer_support,
contrib_innovation,
contrib_reliability) %>%
tidyr::pivot_longer(
cols = starts_with("contrib_"),
names_to = "attribute",
values_to = "contribution"
) %>%
mutate(attribute = gsub("contrib_", "", attribute))
contrib_pc2_long
NA
ggplot(contrib_pc2_long,
aes(x = attribute, y = contribution, fill = attribute)) +
geom_col(show.legend = FALSE) +
facet_wrap(~ bank_label) +
theme_minimal() +
labs(
title = "Approximate attribute contributions to Dimension 2 (cluster 3)",
x = "Attribute",
y = "Contribution to Dim2"
)
load_table3 <- as.data.frame(pca_banks_c3$rotation[, 1:2])
load_table3$attribute <- rownames(load_table3)
rownames(load_table3) <- NULL
load_table3
NA
## Cluster 5: bank x attribute matrix ----
cluster5 <- data_cluster %>%
filter(cluster == "AI-enthusiastic guidance seekers") # cluster 5 label
cluster5_percept <- cluster5 %>%
select(all_of(percept_vars)) %>%
mutate(across(everything(), as.numeric))
means_c5 <- cluster5_percept %>%
summarise(across(everything(), ~ mean(.x, na.rm = TRUE)))
means_c5_long <- means_c5 %>%
pivot_longer(cols = everything(),
names_to = "variable",
values_to = "mean_score") %>%
left_join(qmeta, by = "variable")
bank_attr_mat5 <- means_c5_long %>%
group_by(bank_label, attribute) %>%
summarise(mean_score = mean(mean_score), .groups = "drop") %>%
pivot_wider(
id_cols = bank_label,
names_from = attribute,
values_from = mean_score
) %>%
as.data.frame()
rownames(bank_attr_mat5) <- bank_attr_mat5$bank_label
bank_attr_mat5$bank_label <- NULL
## PCA for banks – cluster 5 ----
pca_banks_c5 <- prcomp(bank_attr_mat5, scale. = TRUE)
scores_c5 <- as.data.frame(pca_banks_c5$x[, 1:2])
scores_c5$bank_label <- rownames(scores_c5)
loadings_c5 <- as.data.frame(pca_banks_c5$rotation[, 1:2])
loadings_c5$attribute <- rownames(loadings_c5)
arrow_scale <- 1.5
loadings_c5 <- loadings_c5 %>%
mutate(PC1 = PC1 * arrow_scale,
PC2 = PC2 * arrow_scale)
## Plot: perceptual map for cluster 5 ----
ggplot() +
geom_point(data = scores_c5,
aes(x = PC1, y = PC2),
size = 2.8, colour = "black") +
geom_text(data = scores_c5,
aes(x = PC1, y = PC2, label = bank_label),
vjust = -0.7, size = 3.2) +
geom_segment(data = loadings_c5,
aes(x = 0, y = 0, xend = PC1, yend = PC2),
arrow = arrow(length = unit(0.2, "cm")),
colour = "grey40") +
geom_text(data = loadings_c5,
aes(x = PC1, y = PC2, label = attribute),
hjust = 0.5, vjust = -0.4, colour = "grey40", size = 3) +
geom_hline(yintercept = 0, linetype = "dashed", colour = "grey75") +
geom_vline(xintercept = 0, linetype = "dashed", colour = "grey75") +
coord_equal(xlim = c(min(scores_c5$PC1) - 0.8, max(scores_c5$PC1) + 0.8),
ylim = c(min(scores_c5$PC2) - 0.8, max(scores_c5$PC2) + 0.8),
expand = TRUE) +
labs(
title = "Perceptual Map of Banks – Cluster 5",
x = "Overall Digital Banking Performance",
y = "Customer Support vs. Relaibility"
) +
theme_minimal(base_size = 12) +
theme(
plot.title = element_text(hjust = 0.5, face = "bold"),
panel.grid.minor = element_blank()
)
## Loadings table for cluster 5 ----
load_table5 <- as.data.frame(pca_banks_c5$rotation[, 1:2])
load_table5$attribute <- rownames(load_table5)
rownames(load_table5) <- NULL
# PC2 loadings as named vector
pc2_load5 <- load_table5$PC2
names(pc2_load5) <- load_table5$attribute
# join PC2 scores with attribute means
bank_pc2_5 <- scores_c5 %>%
select(bank_label, PC2) %>%
left_join(
bank_attr_mat5 %>% tibble::rownames_to_column("bank_label"),
by = "bank_label"
)
# contribution scores
contrib_pc2_5 <- bank_pc2_5 %>%
mutate(
contrib_customer_support = customer_support * pc2_load5["customer_support"],
contrib_innovation = innovation * pc2_load5["innovation"],
contrib_reliability = reliability * pc2_load5["reliability"]
)
contrib_pc2_5
NA
contrib_pc2_5_long <- contrib_pc2_5 %>%
select(bank_label,
contrib_customer_support,
contrib_innovation,
contrib_reliability) %>%
pivot_longer(
cols = starts_with("contrib_"),
names_to = "attribute",
values_to = "contribution"
) %>%
mutate(attribute = gsub("contrib_", "", attribute))
ggplot(contrib_pc2_5_long,
aes(x = attribute, y = contribution, fill = attribute)) +
geom_col(show.legend = FALSE) +
facet_wrap(~ bank_label) +
theme_minimal() +
labs(
title = "Approximate attribute contributions to Dimension 2 (cluster 5)",
x = "Attribute",
y = "Contribution to Dim2"
)
bank_pc2_5 <- scores_c5 %>%
select(bank_label, PC2) %>%
left_join(
bank_attr_mat5 %>%
tibble::rownames_to_column("bank_label"),
by = "bank_label"
)
bank_pc2_5
NA
# loadings for cluster 5 (PC1 and PC2)
load_table5 <- as.data.frame(pca_banks_c5$rotation[, 1:2])
load_table5$attribute <- rownames(load_table5)
rownames(load_table5) <- NULL
load_table5
## loadings for cluster 5 already in load_table5
# pc2_load5: named vector of PC2 loadings
pc2_load5 <- load_table5$PC2
names(pc2_load5) <- load_table5$attribute
## bank_pc2_5 already created like this:
# bank_pc2_5 <- scores_c5 %>%
# select(bank_label, PC2) %>%
# left_join(
# bank_attr_mat5 %>% tibble::rownames_to_column("bank_label"),
# by = "bank_label"
# )
contrib_pc2_5 <- bank_pc2_5 %>%
mutate(
contrib_customer_support = customer_support * pc2_load5["customer_support"],
contrib_innovation = innovation * pc2_load5["innovation"],
contrib_reliability = reliability * pc2_load5["reliability"]
)
contrib_pc2_5
NA
## Cluster 2: bank x attribute matrix ----
cluster2 <- data_cluster %>%
filter(cluster == "Cautious guidance seekers") # cluster 2 label
cluster2_percept <- cluster2 %>%
select(all_of(percept_vars)) %>%
mutate(across(everything(), as.numeric))
means_c2 <- cluster2_percept %>%
summarise(across(everything(), ~ mean(.x, na.rm = TRUE)))
means_c2_long <- means_c2 %>%
pivot_longer(cols = everything(),
names_to = "variable",
values_to = "mean_score") %>%
left_join(qmeta, by = "variable")
bank_attr_mat2 <- means_c2_long %>%
group_by(bank_label, attribute) %>%
summarise(mean_score = mean(mean_score), .groups = "drop") %>%
pivot_wider(
id_cols = bank_label,
names_from = attribute,
values_from = mean_score
) %>%
as.data.frame()
rownames(bank_attr_mat2) <- bank_attr_mat2$bank_label
bank_attr_mat2$bank_label <- NULL
## PCA for banks – cluster 2 ----
pca_banks_c2 <- prcomp(bank_attr_mat2, scale. = TRUE)
scores_c2 <- as.data.frame(pca_banks_c2$x[, 1:2])
scores_c2$bank_label <- rownames(scores_c2)
loadings_c2 <- as.data.frame(pca_banks_c2$rotation[, 1:2])
loadings_c2$attribute <- rownames(loadings_c2)
arrow_scale <- 1.5
loadings_c2 <- loadings_c2 %>%
mutate(PC1 = PC1 * arrow_scale,
PC2 = PC2 * arrow_scale)
## Plot: perceptual map for cluster 2 ----
ggplot() +
geom_point(data = scores_c2,
aes(x = PC1, y = PC2),
size = 2.8, colour = "black") +
geom_text(data = scores_c2,
aes(x = PC1, y = PC2, label = bank_label),
vjust = -0.7, size = 3.2) +
geom_segment(data = loadings_c2,
aes(x = 0, y = 0, xend = PC1, yend = PC2),
arrow = arrow(length = unit(0.2, "cm")),
colour = "grey40") +
geom_text(data = loadings_c2,
aes(x = PC1, y = PC2, label = attribute),
hjust = 0.5, vjust = -0.4, colour = "grey40", size = 3) +
geom_hline(yintercept = 0, linetype = "dashed", colour = "grey75") +
geom_vline(xintercept = 0, linetype = "dashed", colour = "grey75") +
coord_equal(xlim = c(min(scores_c2$PC1) - 0.8, max(scores_c2$PC1) + 0.8),
ylim = c(min(scores_c2$PC2) - 0.8, max(scores_c2$PC2) + 0.8),
expand = TRUE) +
labs(
title = "Perceptual Map of Banks – Cluster 2",
x = "Overall Digital Banking Performance",
y = "Customer Support vs. Reliability"
) +
theme_minimal(base_size = 12) +
theme(
plot.title = element_text(hjust = 0.5, face = "bold"),
panel.grid.minor = element_blank()
)
bank_pc2_2 <- scores_c2 %>%
select(bank_label, PC2) %>%
left_join(
bank_attr_mat2 %>%
tibble::rownames_to_column("bank_label"),
by = "bank_label"
)
bank_pc2_2
NA
load_table2 <- as.data.frame(pca_banks_c2$rotation[, 1:2])
load_table2$attribute <- rownames(load_table2)
rownames(load_table2) <- NULL
load_table2
NA
pc2_load2 <- load_table2$PC2
names(pc2_load2) <- load_table2$attribute
contrib_pc2_2 <- bank_pc2_2 %>%
mutate(
contrib_customer_support = customer_support * pc2_load2["customer_support"],
contrib_innovation = innovation * pc2_load2["innovation"],
contrib_reliability = reliability * pc2_load2["reliability"]
)
contrib_pc2_2
NA
library(ggplot2)
library(dplyr)
library(tidyr)
## PC2 loadings for cluster 2
pc2_load2 <- load_table2$PC2
names(pc2_load2) <- load_table2$attribute
## Contribution scores table for cluster 2
contrib_pc2_2 <- bank_pc2_2 %>%
mutate(
contrib_customer_support = customer_support * pc2_load2["customer_support"],
contrib_innovation = innovation * pc2_load2["innovation"],
contrib_reliability = reliability * pc2_load2["reliability"]
)
## Long format for plotting
contrib_pc2_2_long <- contrib_pc2_2 %>%
select(bank_label,
contrib_customer_support,
contrib_innovation,
contrib_reliability) %>%
pivot_longer(
cols = starts_with("contrib_"),
names_to = "attribute",
values_to = "contribution"
) %>%
mutate(attribute = gsub("contrib_", "", attribute))
## Bar plot
ggplot(contrib_pc2_2_long,
aes(x = attribute, y = contribution, fill = attribute)) +
geom_col(show.legend = FALSE) +
facet_wrap(~ bank_label) +
theme_minimal() +
labs(
title = "Approximate attribute contributions to Dimension 2 – Cluster 2",
x = "Attribute",
y = "Contribution to Dim2"
)
library(dplyr)
library(purrr)
library(tidyr)
# make sure logical combo vars are numeric for testing
bank_groups_test <- bank_groups |>
mutate(
across(
c(revolut_only, nlb_revolut, otp_revolut, revolut_other),
~ as.integer(.)
)
)
# helper: chi-square with simulated p-value fallback if expected counts are small
run_cluster_sig_test <- function(data, var_name) {
tab <- table(data$cluster, data[[var_name]])
chi <- suppressWarnings(chisq.test(tab))
if (any(chi$expected < 5)) {
chi_sim <- chisq.test(tab, simulate.p.value = TRUE, B = 10000)
tibble(
group = var_name,
test = "Chi-square (simulated p-value)",
statistic = unname(chi$statistic),
p_value = chi_sim$p.value
)
} else {
tibble(
group = var_name,
test = "Chi-square",
statistic = unname(chi$statistic),
p_value = chi$p.value
)
}
}
# overall significance for each bank-combo across clusters
combo_significance <- bind_rows(
lapply(
c("revolut_only", "nlb_revolut", "otp_revolut", "revolut_other"),
function(x) run_cluster_sig_test(bank_groups_test, x)
)
) |>
mutate(p_adj_bh = p.adjust(p_value, method = "BH")) |>
arrange(p_value)
combo_significance
pairwise_cluster_props <- function(data, var_name) {
counts <- data |>
group_by(cluster) |>
summarise(
users = sum(.data[[var_name]], na.rm = TRUE),
n = n(),
.groups = "drop"
)
pw <- pairwise.prop.test(
x = counts$users,
n = counts$n,
p.adjust.method = "BH"
)
as.data.frame(as.table(pw$p.value)) |>
filter(!is.na(Freq)) |>
rename(
cluster_1 = Var1,
cluster_2 = Var2,
p_adj_bh = Freq
) |>
mutate(group = var_name, .before = 1)
}
pairwise_cluster_props(bank_groups_test, "revolut_only")
pairwise_cluster_props(bank_groups_test, "nlb_revolut")
pairwise_cluster_props(bank_groups_test, "otp_revolut")
pairwise_cluster_props(bank_groups_test, "revolut_other")
segment_tab <- table(bank_segments$cluster, bank_segments$segment)
segment_test <- chisq.test(segment_tab, simulate.p.value = TRUE, B = 10000)
segment_test
Pearson's Chi-squared test with simulated p-value (based on 10000
replicates)
data: segment_tab
X-squared = 47.692, df = NA, p-value = 0.009699
run_segment_sig <- function(data, segment_name) {
tmp <- data |>
mutate(in_segment = as.integer(segment == segment_name))
run_cluster_sig_test(tmp, "in_segment") |>
mutate(segment = segment_name, .before = 1) |>
select(segment, everything(), -group)
}
segment_significance <- bind_rows(
run_segment_sig(bank_segments, "Revolut only"),
run_segment_sig(bank_segments, "NLB + Revolut"),
run_segment_sig(bank_segments, "OTP + Revolut")
) |>
mutate(p_adj_bh = p.adjust(p_value, method = "BH"))
segment_significance
pairwise_segment_props <- function(data, segment_name) {
counts <- data |>
mutate(in_segment = as.integer(segment == segment_name)) |>
group_by(cluster) |>
summarise(
users = sum(in_segment, na.rm = TRUE),
n = n(),
.groups = "drop"
)
pw <- pairwise.prop.test(
x = counts$users,
n = counts$n,
p.adjust.method = "BH"
)
as.data.frame(as.table(pw$p.value)) |>
filter(!is.na(Freq)) |>
rename(
cluster_1 = Var1,
cluster_2 = Var2,
p_adj_bh = Freq
) |>
mutate(segment = segment_name, .before = 1)
}
pairwise_segment_props(bank_segments, "Revolut only")
pairwise_segment_props(bank_segments, "NLB + Revolut")
pairwise_segment_props(bank_segments, "OTP + Revolut")
cluster_5_wants_needs <- cluster_means_pca2_5_named |>
filter(cluster == "AI-enthusiastic guidance seekers")
cluster_5_wants_needs
library(dplyr)
library(purrr)
library(tidyr)
# keep only clusters 2, 3, and 5
bank_groups_235 <- bank_groups |>
filter(cluster %in% c(
"Cautious guidance seekers",
"Feature-oriented adopters",
"AI-enthusiastic guidance seekers"
)) |>
mutate(
cluster = droplevels(cluster),
across(
c(revolut_only, nlb_revolut, otp_revolut, revolut_other),
~ as.integer(.)
)
)
run_cluster_sig_test <- function(data, var_name) {
tab <- table(data$cluster, data[[var_name]])
chi <- suppressWarnings(chisq.test(tab))
if (any(chi$expected < 5)) {
chi_sim <- chisq.test(tab, simulate.p.value = TRUE, B = 10000)
tibble(
group = var_name,
test = "Chi-square (simulated p-value)",
statistic = unname(chi$statistic),
p_value = chi_sim$p.value
)
} else {
tibble(
group = var_name,
test = "Chi-square",
statistic = unname(chi$statistic),
p_value = chi$p.value
)
}
}
overall_sig_235 <- bind_rows(
run_cluster_sig_test(bank_groups_235, "revolut_only"),
run_cluster_sig_test(bank_groups_235, "nlb_revolut"),
run_cluster_sig_test(bank_groups_235, "otp_revolut"),
run_cluster_sig_test(bank_groups_235, "revolut_other")
) |>
mutate(p_adj_bh = p.adjust(p_value, method = "BH")) |>
arrange(p_value)
overall_sig_235
run_pair_test <- function(data, var_name, cl1, cl2) {
tmp <- data |>
filter(cluster %in% c(cl1, cl2)) |>
mutate(cluster = droplevels(cluster))
counts <- tmp |>
group_by(cluster) |>
summarise(
users = sum(.data[[var_name]], na.rm = TRUE),
n = n(),
.groups = "drop"
)
test <- prop.test(
x = counts$users,
n = counts$n,
correct = FALSE
)
tibble(
group = var_name,
cluster_1 = cl1,
cluster_2 = cl2,
users_1 = counts$users[1],
n_1 = counts$n[1],
pct_1 = round(100 * counts$users[1] / counts$n[1], 1),
users_2 = counts$users[2],
n_2 = counts$n[2],
pct_2 = round(100 * counts$users[2] / counts$n[2], 1),
statistic = unname(test$statistic),
p_value = test$p.value
)
}
cluster_pairs_235 <- list(
c("Cautious guidance seekers", "Feature-oriented adopters"),
c("Cautious guidance seekers", "AI-enthusiastic guidance seekers"),
c("Feature-oriented adopters", "AI-enthusiastic guidance seekers")
)
bank_vars <- c("revolut_only", "nlb_revolut", "otp_revolut", "revolut_other")
pairwise_sig_235 <- map_dfr(bank_vars, function(v) {
map_dfr(cluster_pairs_235, function(p) {
run_pair_test(bank_groups_235, v, p[1], p[2])
})
}) |>
group_by(group) |>
mutate(p_adj_bh = p.adjust(p_value, method = "BH")) |>
ungroup() |>
arrange(group, p_value)
pairwise_sig_235
pairwise_sig_235 |>
filter(p_adj_bh < 0.05)
bank_groups_235 |>
group_by(cluster) |>
summarise(
n_cluster = n(),
revolut_only_n = sum(revolut_only, na.rm = TRUE),
revolut_only_pct = round(100 * revolut_only_n / n_cluster, 1),
nlb_revolut_n = sum(nlb_revolut, na.rm = TRUE),
nlb_revolut_pct = round(100 * nlb_revolut_n / n_cluster, 1),
otp_revolut_n = sum(otp_revolut, na.rm = TRUE),
otp_revolut_pct = round(100 * otp_revolut_n / n_cluster, 1),
revolut_other_n = sum(revolut_other, na.rm = TRUE),
revolut_other_pct = round(100 * revolut_other_n / n_cluster, 1)
)