Data analysis: bots, descriptive stats

Author

Julius Fenn

1 Notes

This script creates a simplified participant-level dataset using the raw study.rds file and produces:

A clean wide table with core variables + associations (wide) + keyStats
Paradata events in long format (clipboard, mouse, scroll, keyIntervals)
Defocus events in long format

2 global variables

data_root <- "../01_dataPreperation/outputs"
output_root <- "outputs"

3 functions

# not needed

4 load packages

require(pacman)
p_load(
  "tidyverse",
  "psych",
  "afex",
  "mlr3",
  "mlr3cluster",
  "cluster",
  "writexl",
  "readxl",
  "DT"
)

5 load processed data

setwd(data_root)
setwd("questionnaire")

dat_ques <- readRDS(file = "clean_complete_wide.rds")

setwd("../paradata")
dat_para_defocus <- readRDS(file = "paradata_defocus_long.rds")
dat_para_events <- readRDS(file = "paradata_events_long.rds")

6 Descriptive Statistics

6.1 sample

socidem_vars <- str_subset(string = colnames(dat_ques), pattern = "^socidem")
socidem_vars

 [1] "socidem_Submission.id"                
 [2] "socidem_Status"                       
 [3] "socidem_Custom.study.tncs.accepted.at"
 [4] "socidem_Started.at"                   
 [5] "socidem_Completed.at"                 
 [6] "socidem_Reviewed.at"                  
 [7] "socidem_Archived.at"                  
 [8] "socidem_Time.taken"                   
 [9] "socidem_Completion.code"              
[10] "socidem_Total.approvals"              
[11] "socidem_Age"                          
[12] "socidem_Sex"                          
[13] "socidem_Ethnicity.simplified"         
[14] "socidem_Country.of.birth"             
[15] "socidem_Country.of.residence"         
[16] "socidem_Nationality"                  
[17] "socidem_Language"                     
[18] "socidem_Student.status"               
[19] "socidem_Employment.status"

socidem_df <- dat_ques %>%
  select(all_of(socidem_vars))

socidem_overview <- tibble(
  variable = socidem_vars,
  class = purrr::map_chr(socidem_df, ~ paste(class(.x), collapse = "/")),
  n = nrow(socidem_df),
  n_missing = purrr::map_int(socidem_df, ~ sum(is.na(.x))),
  missing_pct = round(100 * n_missing / n, 2),
  n_unique = purrr::map_int(socidem_df, ~ dplyr::n_distinct(.x, na.rm = TRUE))
)

socidem_descriptives <- purrr::map_dfr(socidem_vars, function(v) {
  x <- socidem_df[[v]]

  if (is.numeric(x)) {
    n_non_missing <- sum(!is.na(x))

    tibble(
      variable = v,
      type = "numeric",
      mean = if (n_non_missing == 0) NA_real_ else mean(x, na.rm = TRUE),
      sd = if (n_non_missing <= 1) NA_real_ else sd(x, na.rm = TRUE),
      median = if (n_non_missing == 0) NA_real_ else median(x, na.rm = TRUE),
      min = if (n_non_missing == 0) NA_real_ else min(x, na.rm = TRUE),
      max = if (n_non_missing == 0) NA_real_ else max(x, na.rm = TRUE),
      top_levels = NA_character_
    )
  } else {
    x_chr <- as.character(x)
    x_chr[x_chr == ""] <- NA_character_
    tab <- sort(table(x_chr, useNA = "no"), decreasing = TRUE)
    k <- min(5, length(tab))

    tibble(
      variable = v,
      type = "categorical_or_text",
      mean = NA_real_,
      sd = NA_real_,
      median = NA_real_,
      min = NA_real_,
      max = NA_real_,
      top_levels = if (k == 0) {
        NA_character_
      } else {
        paste0(names(tab)[seq_len(k)], " (", as.integer(tab[seq_len(k)]), ")", collapse = "; ")
      }
    )
  }
})

socidem_table <- socidem_overview %>%
  left_join(socidem_descriptives, by = "variable")


DT::datatable(data = socidem_table)

## factor vars:
factor_vars <- as.character(socidem_table$variable[socidem_table$class == "factor"])

for (var_name in factor_vars) {
  cat("\nTable for:", var_name, "\n")
  
  tab <- table(socidem_df[[var_name]], useNA = "ifany")
  pct <- prop.table(tab) * 100
  
  print(cbind(Frequency = tab, Percentage = round(pct, 1)))
}


Table for: socidem_Sex 
       Frequency Percentage
Female        28       40.6
Male          41       59.4

Table for: socidem_Ethnicity.simplified 
      Frequency Percentage
Asian         2        2.9
Black        18       26.1
Mixed         7       10.1
Other         4        5.8
White        38       55.1

Table for: socidem_Student.status 
             Frequency Percentage
DATA_EXPIRED        14       20.3
No                  45       65.2
Yes                 10       14.5

Table for: socidem_Employment.status 
                                                         Frequency Percentage
DATA_EXPIRED                                                    12       17.4
Due to start a new job within the next month                     2        2.9
Full-Time                                                       27       39.1
Not in paid work (e.g. homemaker', 'retired or disabled)         6        8.7
Other                                                            1        1.4
Part-Time                                                       15       21.7
Unemployed (and job seeking)                                     6        8.7

## numeric vars:
numeric_vars <- as.character(socidem_table$variable[socidem_table$class == "numeric"])

for (var_name in numeric_vars) {
  if (var_name %in% names(socidem_df)) {
    x <- socidem_df[[var_name]]
    x <- x[!is.na(x)]
    
    stats <- boxplot.stats(x)$stats
    names(stats) <- c("Min", "Q1", "Median", "Q3", "Max")
    
    mean_x <- mean(x)
    sd_x <- sd(x)
    
    boxplot(
      x,
      main = paste("Boxplot of", var_name),
      ylab = var_name
    )
    
    legend(
      "topright",
      legend = c(
        paste(names(stats), round(stats, 2), sep = ": "),
        paste("Mean:", round(mean_x, 2)),
        paste("SD:", round(sd_x, 2))
      ),
      bty = "n",
      cex = 0.8
    )
    
    cat("\nBoxplot for:", var_name, "\n")
    print(stats)
    cat("Mean:", round(mean_x, 2), "\n")
    cat("SD:", round(sd_x, 2), "\n")
  }
}


Boxplot for: socidem_Time.taken 
   Min     Q1 Median     Q3    Max 
   200    561    832   1248   2276 
Mean: 959.57 
SD: 512.36


Boxplot for: socidem_Total.approvals 
   Min     Q1 Median     Q3    Max 
     1    144    976   3461   8176 
Mean: 2064.29 
SD: 2744.69


Boxplot for: socidem_Age 
   Min     Q1 Median     Q3    Max 
    20     26     30     34     35 
Mean: 29.29 
SD: 4.39

rm(socidem_vars); rm(socidem_df)

6.2 paradata

para_visualTraps_vars <- str_subset(string = colnames(dat_ques), pattern = "^visualTrap")
para_visualTraps_vars

 [1] "visualTrapResponse"                 "visualTrap_Planets_response"       
 [3] "visualTrap_Planets_responseLabel"   "visualTrap_Planets_correct"        
 [5] "visualTrap_Planets_rtMs"            "visualTrap_Collision_response"     
 [7] "visualTrap_Collision_responseLabel" "visualTrap_Collision_correct"      
 [9] "visualTrap_Collision_rtMs"          "visualTrap_CafeWall_response"      
[11] "visualTrap_CafeWall_responseLabel"  "visualTrap_CafeWall_correct"       
[13] "visualTrap_CafeWall_rtMs"           "visualTrap_Circles_response"       
[15] "visualTrap_Circles_responseLabel"   "visualTrap_Circles_correct"        
[17] "visualTrap_Circles_rtMs"            "visualTrap_Lines_response"         
[19] "visualTrap_Lines_responseLabel"     "visualTrap_Lines_correct"          
[21] "visualTrap_Lines_rtMs"              "visualTrap_Robot_response"         
[23] "visualTrap_Robot_responseLabel"     "visualTrap_Robot_correct"          
[25] "visualTrap_Robot_rtMs"              "visualTrap_order"

para_visualTraps_vars_df <- dat_ques %>%
  select(all_of(para_visualTraps_vars))

para_visualTraps_overview <- tibble(
  variable = names(para_visualTraps_vars_df),
  class = purrr::map_chr(para_visualTraps_vars_df, ~ paste(class(.x), collapse = "/")),
  n = nrow(para_visualTraps_vars_df),
  n_missing = purrr::map_int(para_visualTraps_vars_df, ~ sum(is.na(.x))),
  n_unique = purrr::map_int(para_visualTraps_vars_df, ~ dplyr::n_distinct(.x, na.rm = TRUE))
) %>%
  mutate(
    missing_pct = round(100 * n_missing / n, 2)
  )


para_visualTrap_descriptives <- purrr::map_dfr(para_visualTraps_vars, function(v) {
  x <- para_visualTraps_vars_df[[v]]

  if (is.numeric(x)) {
    n_non_missing <- sum(!is.na(x))

    tibble(
      variable = v,
      type = "numeric",
      mean = if (n_non_missing == 0) NA_real_ else mean(x, na.rm = TRUE),
      sd = if (n_non_missing <= 1) NA_real_ else sd(x, na.rm = TRUE),
      median = if (n_non_missing == 0) NA_real_ else median(x, na.rm = TRUE),
      min = if (n_non_missing == 0) NA_real_ else min(x, na.rm = TRUE),
      max = if (n_non_missing == 0) NA_real_ else max(x, na.rm = TRUE),
      top_levels = NA_character_
    )
  } else {
    x_chr <- as.character(x)
    x_chr[x_chr == ""] <- NA_character_
    tab <- sort(table(x_chr, useNA = "no"), decreasing = TRUE)
    k <- min(5, length(tab))

    tibble(
      variable = v,
      type = "categorical_or_text",
      mean = NA_real_,
      sd = NA_real_,
      median = NA_real_,
      min = NA_real_,
      max = NA_real_,
      top_levels = if (k == 0) {
        NA_character_
      } else {
        paste0(names(tab)[seq_len(k)], " (", as.integer(tab[seq_len(k)]), ")", collapse = "; ")
      }
    )
  }
})


para_visualTraps_table <- para_visualTraps_overview %>%
  left_join(para_visualTrap_descriptives, by = "variable")


DT::datatable(data = para_visualTraps_table)

## factor vars:
factor_vars <- as.character(para_visualTraps_table$variable[para_visualTraps_table$class == "factor"])

for (var_name in factor_vars) {
  cat("\nTable for:", var_name, "\n")
  
  tab <- table(para_visualTraps_vars_df[[var_name]], useNA = "ifany")
  pct <- prop.table(tab) * 100
  
  print(cbind(Frequency = tab, Percentage = round(pct, 1)))
}


Table for: visualTrapResponse 
  Frequency Percentage
1        17       24.6
2        28       40.6
3        23       33.3
4         1        1.4

Table for: visualTrap_Planets_response 
  Frequency Percentage
1         1        1.4
2        65       94.2
3         2        2.9
4         1        1.4

Table for: visualTrap_Collision_response 
  Frequency Percentage
1        58       84.1
2         7       10.1
3         3        4.3
4         1        1.4

Table for: visualTrap_CafeWall_response 
  Frequency Percentage
1         3        4.3
3        59       85.5
4         7       10.1

Table for: visualTrap_Circles_response 
  Frequency Percentage
2        66       95.7
3         2        2.9
4         1        1.4

Table for: visualTrap_Lines_response 
  Frequency Percentage
1        67       97.1
2         1        1.4
3         1        1.4

Table for: visualTrap_Robot_response 
  Frequency Percentage
1         1        1.4
2        14       20.3
3        54       78.3

## numeric vars:
numeric_vars <- as.character(para_visualTraps_table$variable[para_visualTraps_table$class %in% c("numeric", "integer")])

for (var_name in numeric_vars) {
  if (var_name %in% names(para_visualTraps_vars_df)) {
    x <- para_visualTraps_vars_df[[var_name]]
    x <- x[!is.na(x)]
    
    stats <- boxplot.stats(x)$stats
    names(stats) <- c("Min", "Q1", "Median", "Q3", "Max")
    
    mean_x <- mean(x)
    sd_x <- sd(x)
    
    boxplot(
      x,
      main = paste("Boxplot of", var_name),
      ylab = var_name
    )
    
    legend(
      "topright",
      legend = c(
        paste(names(stats), round(stats, 2), sep = ": "),
        paste("Mean:", round(mean_x, 2)),
        paste("SD:", round(sd_x, 2))
      ),
      bty = "n",
      cex = 0.8
    )
    
    cat("\nBoxplot for:", var_name, "\n")
    print(stats)
    cat("Mean:", round(mean_x, 2), "\n")
    cat("SD:", round(sd_x, 2), "\n")
  }
}


Boxplot for: visualTrap_Planets_rtMs 
   Min     Q1 Median     Q3    Max 
  7546  18954  26180  40289  60062 
Mean: 37924.93 
SD: 41206.44


Boxplot for: visualTrap_Collision_rtMs 
   Min     Q1 Median     Q3    Max 
  2688  17596  30717  51485  89971 
Mean: 41787.09 
SD: 40415.81


Boxplot for: visualTrap_CafeWall_rtMs 
   Min     Q1 Median     Q3    Max 
  4754  12424  21813  37480  67396 
Mean: 31294.46 
SD: 35666.95


Boxplot for: visualTrap_Circles_rtMs 
   Min     Q1 Median     Q3    Max 
  2836   6732  10175  17395  29661 
Mean: 15166.86 
SD: 20691.36


Boxplot for: visualTrap_Lines_rtMs 
   Min     Q1 Median     Q3    Max 
  3095   6798   9543  14188  22983 
Mean: 23240.86 
SD: 68558.7


Boxplot for: visualTrap_Robot_rtMs 
   Min     Q1 Median     Q3    Max 
  5081  11972  18490  32784  54062 
Mean: 28435.51 
SD: 33601.96

rm(para_visualTraps_vars); rm(para_visualTraps_vars_df)

for static paradata:

para_statics_vars <- str_subset(string = colnames(dat_ques), pattern = "^para")
para_statics_vars

 [1] "para_countclicks"                                   
 [2] "para_static_userAgent"                              
 [3] "para_static_platform"                               
 [4] "para_static_language"                               
 [5] "para_static_languages"                              
 [6] "para_static_cookieEnabled"                          
 [7] "para_static_online"                                 
 [8] "para_static_webdriver"                              
 [9] "para_static_hardwareConcurrency"                    
[10] "para_static_deviceMemory"                           
[11] "para_static_maxTouchPoints"                         
[12] "para_static_screen_width"                           
[13] "para_static_screen_height"                          
[14] "para_static_screen_availWidth"                      
[15] "para_static_screen_availHeight"                     
[16] "para_static_screen_colorDepth"                      
[17] "para_static_screen_pixelDepth"                      
[18] "para_static_viewport_innerWidth"                    
[19] "para_static_viewport_outerWidth"                    
[20] "para_static_viewport_outerHeight"                   
[21] "para_static_viewport_devicePixelRatio"              
[22] "para_static_page_url"                               
[23] "para_static_page_hostname"                          
[24] "para_static_page_pathname"                          
[25] "para_static_page_protocol"                          
[26] "para_static_page_referrer"                          
[27] "para_static_page_title"                             
[28] "para_static_locale_timezone"                        
[29] "para_static_locale_timezoneOffset"                  
[30] "para_static_network_effectiveType"                  
[31] "para_static_network_downlink"                       
[32] "para_static_network_rtt"                            
[33] "para_static_network_saveData"                       
[34] "para_static_mediaQueries_prefersDarkScheme"         
[35] "para_static_mediaQueries_prefersReducedMotion"      
[36] "para_static_mediaQueries_prefersReducedTransparency"
[37] "para_static_mediaQueries_prefersContrastMore"       
[38] "para_static_mediaQueries_pointerCoarse"             
[39] "para_static_mediaQueries_pointerFine"               
[40] "para_static_mediaQueries_hover"                     
[41] "para_static_features_localStorage"                  
[42] "para_static_features_sessionStorage"                
[43] "para_static_features_indexedDB"                     
[44] "para_static_features_serviceWorker"                 
[45] "para_static_features_bluetooth"                     
[46] "para_static_features_usb"                           
[47] "para_static_features_clipboard"                     
[48] "para_static_features_mediaDevices"                  
[49] "para_static_features_permissions"                   
[50] "para_static_features_storageManager"                
[51] "para_static_features_visualViewport"                
[52] "para_ip_ip"                                         
[53] "para_ip_network"                                    
[54] "para_ip_version"                                    
[55] "para_ip_city"                                       
[56] "para_ip_region"                                     
[57] "para_ip_region_code"                                
[58] "para_ip_country"                                    
[59] "para_ip_country_name"                               
[60] "para_ip_country_code"                               
[61] "para_ip_country_code_iso3"                          
[62] "para_ip_country_capital"                            
[63] "para_ip_country_tld"                                
[64] "para_ip_in_eu"                                      
[65] "para_ip_postal"                                     
[66] "para_ip_latitude"                                   
[67] "para_ip_longitude"                                  
[68] "para_ip_timezone"                                   
[69] "para_ip_utc_offset"                                 
[70] "para_ip_country_calling_code"                       
[71] "para_ip_currency"                                   
[72] "para_ip_currency_name"                              
[73] "para_ip_languages"                                  
[74] "para_ip_country_area"                               
[75] "para_ip_country_population"                         
[76] "para_ip_asn"                                        
[77] "para_ip_org"                                        
[78] "para_meta_startedAt"                                
[79] "para_static_viewport_innerHeight"                   
[80] "para_static_doNotTrack"                             
[81] "para_ip_continent_code"

para_statics_vars_df <- dat_ques %>%
  select(all_of(para_statics_vars))

para_static_overview <- tibble(
  variable = names(para_statics_vars_df),
  class = purrr::map_chr(para_statics_vars_df, ~ paste(class(.x), collapse = "/")),
  n = nrow(para_statics_vars_df),
  n_missing = purrr::map_int(para_statics_vars_df, ~ sum(is.na(.x))),
  n_unique = purrr::map_int(para_statics_vars_df, ~ dplyr::n_distinct(.x, na.rm = TRUE))
) %>%
  mutate(
    missing_pct = round(100 * n_missing / n, 2)
  )


para_static_descriptives <- purrr::map_dfr(para_statics_vars, function(v) {
  x <- para_statics_vars_df[[v]]

  if (is.numeric(x)) {
    n_non_missing <- sum(!is.na(x))

    tibble(
      variable = v,
      type = "numeric",
      mean = if (n_non_missing == 0) NA_real_ else mean(x, na.rm = TRUE),
      sd = if (n_non_missing <= 1) NA_real_ else sd(x, na.rm = TRUE),
      median = if (n_non_missing == 0) NA_real_ else median(x, na.rm = TRUE),
      min = if (n_non_missing == 0) NA_real_ else min(x, na.rm = TRUE),
      max = if (n_non_missing == 0) NA_real_ else max(x, na.rm = TRUE),
      top_levels = NA_character_
    )
  } else {
    x_chr <- as.character(x)
    x_chr[x_chr == ""] <- NA_character_
    tab <- sort(table(x_chr, useNA = "no"), decreasing = TRUE)
    k <- min(5, length(tab))

    tibble(
      variable = v,
      type = "categorical_or_text",
      mean = NA_real_,
      sd = NA_real_,
      median = NA_real_,
      min = NA_real_,
      max = NA_real_,
      top_levels = if (k == 0) {
        NA_character_
      } else {
        paste0(names(tab)[seq_len(k)], " (", as.integer(tab[seq_len(k)]), ")", collapse = "; ")
      }
    )
  }
})


para_static_table <- para_static_overview %>%
  left_join(para_static_descriptives, by = "variable")


DT::datatable(data = para_static_table)

rm(para_statics_vars); rm(para_statics_vars_df)

7 Identify potential bots

visualtrap_rt_vars <- str_subset(
  string = colnames(dat_ques),
  pattern = "^visualTrap_.*_rtMs$"
)

visualtrap_items <- stringr::str_remove(visualtrap_rt_vars, "_rtMs$")
visualtrap_correct_vars <- paste0(visualtrap_items, "_correct")

visualtrap_pairs <- tibble(
  item = visualtrap_items,
  rt_var = visualtrap_rt_vars,
  correct_var = visualtrap_correct_vars
) %>%
  filter(correct_var %in% colnames(dat_ques))

parse_correct_binary <- function(x) {
  x_chr <- tolower(trimws(as.character(x)))

  dplyr::case_when(
    x_chr %in% c("1", "true", "t", "yes", "y", "correct", "right") ~ 1,
    x_chr %in% c("0", "false", "f", "no", "n", "incorrect", "wrong") ~ 0,
    TRUE ~ NA_real_
  )
}

visualtrap_rt_ttest <- purrr::map_dfr(seq_len(nrow(visualtrap_pairs)), function(i) {
  item_name <- visualtrap_pairs$item[[i]]
  rt_name <- visualtrap_pairs$rt_var[[i]]
  correct_name <- visualtrap_pairs$correct_var[[i]]

  correct_bin_all <- parse_correct_binary(dat_ques[[correct_name]])

  n_correct_all <- sum(correct_bin_all == 1, na.rm = TRUE)
  n_wrong_all <- sum(correct_bin_all == 0, na.rm = TRUE)
  n_correct_non_missing <- n_correct_all + n_wrong_all

  rt_all <- suppressWarnings(as.numeric(dat_ques[[rt_name]]))

  item_df <- tibble(
    rt_ms = rt_all,
    correct_bin = correct_bin_all
  ) %>%
    filter(!is.na(rt_ms), !is.na(correct_bin)) %>%
    mutate(group = factor(ifelse(correct_bin == 1, "right", "wrong"), levels = c("wrong", "right")))

  n_right_rt <- sum(item_df$group == "right")
  n_wrong_rt <- sum(item_df$group == "wrong")

  mean_right <- if (n_right_rt > 0) mean(item_df$rt_ms[item_df$group == "right"], na.rm = TRUE) else NA_real_
  mean_wrong <- if (n_wrong_rt > 0) mean(item_df$rt_ms[item_df$group == "wrong"], na.rm = TRUE) else NA_real_
  sd_right <- if (n_right_rt > 1) sd(item_df$rt_ms[item_df$group == "right"], na.rm = TRUE) else NA_real_
  sd_wrong <- if (n_wrong_rt > 1) sd(item_df$rt_ms[item_df$group == "wrong"], na.rm = TRUE) else NA_real_

  t_out <- tryCatch(
    {
      if (n_right_rt >= 2 && n_wrong_rt >= 2) {
        t.test(rt_ms ~ group, data = item_df)
      } else {
        NULL
      }
    },
    error = function(e) NULL
  )

  pooled_sd <- if (!is.na(sd_right) && !is.na(sd_wrong) && (n_right_rt + n_wrong_rt - 2) > 0) {
    sqrt(((n_right_rt - 1) * sd_right^2 + (n_wrong_rt - 1) * sd_wrong^2) / (n_right_rt + n_wrong_rt - 2))
  } else {
    NA_real_
  }

  cohen_d <- if (!is.na(pooled_sd) && pooled_sd > 0) {
    (mean_right - mean_wrong) / pooled_sd
  } else {
    NA_real_
  }

  hedges_g <- if (!is.na(cohen_d) && (n_right_rt + n_wrong_rt) > 3) {
    cohen_d * (1 - (3 / (4 * (n_right_rt + n_wrong_rt) - 9)))
  } else {
    NA_real_
  }

  tibble(
    item = item_name,
    correct_var = correct_name,
    rt_var = rt_name,
    n_correct_non_missing = n_correct_non_missing,
    n_right = n_correct_all,
    n_wrong = n_wrong_all,
    prop_right = if (n_correct_non_missing > 0) n_correct_all / n_correct_non_missing else NA_real_,
    prop_wrong = if (n_correct_non_missing > 0) n_wrong_all / n_correct_non_missing else NA_real_,
    n_rt_ttest = nrow(item_df),
    n_right_rt = n_right_rt,
    n_wrong_rt = n_wrong_rt,
    mean_rt_right = mean_right,
    sd_rt_right = sd_right,
    mean_rt_wrong = mean_wrong,
    sd_rt_wrong = sd_wrong,
    mean_diff_right_minus_wrong = mean_right - mean_wrong,
    t_statistic = if (!is.null(t_out)) unname(t_out$statistic) else NA_real_,
    df = if (!is.null(t_out)) unname(t_out$parameter) else NA_real_,
    p_value = if (!is.null(t_out)) t_out$p.value else NA_real_,
    conf_low = if (!is.null(t_out)) t_out$conf.int[[1]] else NA_real_,
    conf_high = if (!is.null(t_out)) t_out$conf.int[[2]] else NA_real_,
    cohen_d = cohen_d,
    hedges_g = hedges_g
  )
}) %>%
  mutate(
    across(c(prop_right, prop_wrong), ~ round(.x, 3)),
    across(c(mean_rt_right, sd_rt_right, mean_rt_wrong, sd_rt_wrong, mean_diff_right_minus_wrong), ~ round(.x, 2)),
    across(c(t_statistic, df, cohen_d, hedges_g), ~ round(.x, 3)),
    p_value = round(p_value, 4),
    across(c(conf_low, conf_high), ~ round(.x, 2))
  )

visualtrap_rt_ttest

# A tibble: 6 × 23
  item       correct_var rt_var n_correct_non_missing n_right n_wrong prop_right
  <chr>      <chr>       <chr>                  <int>   <int>   <int>      <dbl>
1 visualTra… visualTrap… visua…                    69      65       4      0.942
2 visualTra… visualTrap… visua…                    69      58      11      0.841
3 visualTra… visualTrap… visua…                    69      59      10      0.855
4 visualTra… visualTrap… visua…                    69      66       3      0.957
5 visualTra… visualTrap… visua…                    69      67       2      0.971
6 visualTra… visualTrap… visua…                    69      54      15      0.783
# ℹ 16 more variables: prop_wrong <dbl>, n_rt_ttest <int>, n_right_rt <int>,
#   n_wrong_rt <int>, mean_rt_right <dbl>, sd_rt_right <dbl>,
#   mean_rt_wrong <dbl>, sd_rt_wrong <dbl>, mean_diff_right_minus_wrong <dbl>,
#   t_statistic <dbl>, df <dbl>, p_value <dbl>, conf_low <dbl>,
#   conf_high <dbl>, cohen_d <dbl>, hedges_g <dbl>

DT::datatable(
  visualtrap_rt_ttest,
  caption = "Visual-trap item analysis: right vs wrong reaction times (Welch t-test + effect sizes)",
  options = list(pageLength = 10, scrollX = TRUE)
)

parse_bool <- function(x) {
  x_chr <- tolower(trimws(as.character(x)))
  dplyr::case_when(
    x_chr %in% c("1", "true", "t", "yes", "y") ~ 1,
    x_chr %in% c("0", "false", "f", "no", "n") ~ 0,
    TRUE ~ NA_real_
  )
}

safe_num <- function(x) suppressWarnings(as.numeric(as.character(x)))

vt_correct_vars <- str_subset(colnames(dat_ques), "^visualTrap_.*_correct$")
vt_rt_vars <- str_subset(colnames(dat_ques), "^visualTrap_.*_rtMs$")

vt_correct_mat <- dat_ques %>%
  select(all_of(vt_correct_vars)) %>%
  mutate(across(everything(), parse_bool))

vt_rt_mat <- dat_ques %>%
  select(all_of(vt_rt_vars)) %>%
  mutate(across(everything(), safe_num))

# IER / straightlining indicator from attributes scale
attr_cols <- str_subset(colnames(dat_ques), "^attributesFutureSociety-")

attr_mat <- dat_ques %>%
  select(all_of(attr_cols)) %>%
  mutate(across(everything(), safe_num))

attr_answered_count <- attr_mat %>%
  transmute(attr_answered_count = rowSums(!is.na(across(everything())))) %>%
  pull(attr_answered_count)

attr_response_sd <- attr_mat %>%
  transmute(attr_response_sd = apply(
    across(everything()),
    1,
    function(x) {
      x <- as.numeric(x)
      if (sum(!is.na(x)) < 2) return(NA_real_)
      stats::sd(x, na.rm = TRUE)
    }
  )) %>%
  pull(attr_response_sd)

vt_wrong_count <- vt_correct_mat %>%
  mutate(across(everything(), ~ ifelse(.x == 0, 1, 0))) %>%
  transmute(vt_wrong_count = rowSums(across(everything()), na.rm = TRUE)) %>%
  pull(vt_wrong_count)

vt_answered_count <- vt_correct_mat %>%
  transmute(vt_answered_count = rowSums(!is.na(across(everything())))) %>%
  pull(vt_answered_count)

vt_correct_count <- vt_correct_mat %>%
  transmute(vt_correct_count = rowSums(across(everything()), na.rm = TRUE)) %>%
  pull(vt_correct_count)

vt_prop_correct <- ifelse(vt_answered_count > 0, vt_correct_count / vt_answered_count, NA_real_)

vt_mean_rt <- vt_rt_mat %>%
  transmute(vt_mean_rt = rowMeans(across(everything()), na.rm = TRUE)) %>%
  pull(vt_mean_rt)

vt_mean_rt_p05 <- quantile(vt_mean_rt, probs = 0.05, na.rm = TRUE)
vt_mean_rt_p95 <- quantile(vt_mean_rt, probs = 0.95, na.rm = TRUE)


vt_fast_count <- vt_rt_mat %>%
  mutate(across(everything(), ~ ifelse(!is.na(.x) & .x < vt_mean_rt_p05, 1, 0))) %>%
  transmute(vt_fast_count = rowSums(across(everything()), na.rm = TRUE)) %>%
  pull(vt_fast_count)

vt_slow_count <- vt_rt_mat %>%
  mutate(across(everything(), ~ ifelse(!is.na(.x) & .x > vt_mean_rt_p95, 1, 0))) %>%
  transmute(vt_slow_count = rowSums(across(everything()), na.rm = TRUE)) %>%
  pull(vt_slow_count)

webdriver_flag <- parse_bool(dat_ques$para_static_webdriver)
honeypot_flag <- parse_bool(dat_ques$hp_exclusionCriteria)

duration_seconds <- safe_num(dat_ques$duration_seconds)
duration_p05 <- quantile(duration_seconds, probs = 0.05, na.rm = TRUE)
duration_p95 <- quantile(duration_seconds, probs = 0.95, na.rm = TRUE)

very_fast_duration_flag <- ifelse(!is.na(duration_seconds) & duration_seconds < duration_p05, 1, 0)
very_slow_duration_flag <- ifelse(!is.na(duration_seconds) & duration_seconds > duration_p95, 1, 0)

events_by_pid <- dat_para_events %>%
  mutate(
    event_type_norm = tolower(as.character(event_type)),
    clipboard_type_norm = tolower(as.character(clipboard_type)),
    key_interval_ms = safe_num(key_interval_ms),
    clipboard_length = safe_num(clipboard_length)
  ) %>%
  group_by(PROLIFIC_PID) %>%
  summarise(
    n_events = n(),
    n_copy = sum(event_type_norm == "copy" | (event_type_norm == "clipboard" & clipboard_type_norm == "copy"), na.rm = TRUE),
    n_paste = sum(event_type_norm == "paste" | (event_type_norm == "clipboard" & clipboard_type_norm == "paste"), na.rm = TRUE),
    n_mouse = sum(event_type_norm %in% c("mousemove", "mouse"), na.rm = TRUE),
    n_scroll = sum(event_type_norm == "scroll", na.rm = TRUE),
    n_key_intervals = sum(!is.na(key_interval_ms)),
    key_interval_mean = mean(key_interval_ms, na.rm = TRUE),
    key_interval_sd = sd(key_interval_ms, na.rm = TRUE),
    clipboard_total_chars = sum(clipboard_length, na.rm = TRUE),
    .groups = "drop"
  )

defocus_by_pid <- dat_para_defocus %>%
  mutate(durationblur = safe_num(durationblur)) %>%
  group_by(PROLIFIC_PID) %>%
  summarise(
    defocus_n = n(),
    defocus_total_ms = sum(durationblur, na.rm = TRUE),
    defocus_mean_ms = mean(durationblur, na.rm = TRUE),
    .groups = "drop"
  )

bot_df <- dat_ques %>%
  select(PROLIFIC_PID, result_id, duration_seconds = duration_seconds) %>%
  mutate(
    duration_seconds = safe_num(duration_seconds),
    vt_wrong_count = vt_wrong_count,
    vt_answered_count = vt_answered_count,
    vt_correct_count = vt_correct_count,
    vt_prop_correct = vt_prop_correct,
    vt_mean_rt = vt_mean_rt,
    vt_fast_count = vt_fast_count,
    vt_slow_count = vt_slow_count,
    attr_answered_count = attr_answered_count,
    attr_response_sd = attr_response_sd,
    webdriver_flag = webdriver_flag,
    honeypot_flag = honeypot_flag,
    very_fast_duration_flag = very_fast_duration_flag,
    very_slow_duration_flag = very_slow_duration_flag
  ) %>%
  left_join(events_by_pid, by = "PROLIFIC_PID") %>%
  left_join(defocus_by_pid, by = "PROLIFIC_PID") %>%
  mutate(
    across(c(n_events, n_copy, n_paste, n_mouse, n_scroll, n_key_intervals,
             clipboard_total_chars, defocus_n, defocus_total_ms, defocus_mean_ms),
           ~ replace_na(.x, 0)),
    key_interval_mean = ifelse(is.nan(key_interval_mean), NA_real_, key_interval_mean),
    key_interval_sd = ifelse(is.nan(key_interval_sd), NA_real_, key_interval_sd)
  )

vt_rt_p10 <- quantile(bot_df$vt_mean_rt, 0.10, na.rm = TRUE)
vt_rt_p90 <- quantile(bot_df$vt_mean_rt, 0.90, na.rm = TRUE)
paste_p95 <- quantile(bot_df$n_paste, 0.95, na.rm = TRUE)
attr_sd_p10 <- quantile(bot_df$attr_response_sd, 0.10, na.rm = TRUE)

bot_df <- bot_df %>%
  mutate(
    risk_webdriver = ifelse(webdriver_flag == 1, 1, 0),
    risk_honeypot = ifelse(honeypot_flag == 1, 1, 0),
    risk_many_wrong_visual = ifelse(vt_wrong_count >= 3, 1, 0),
    risk_low_visual_accuracy = ifelse(!is.na(vt_prop_correct) & vt_prop_correct < 0.50, 1, 0),
    risk_very_fast_visual_rt = ifelse(!is.na(vt_mean_rt) & vt_mean_rt < vt_rt_p10, 1, 0),
    risk_very_slow_visual_rt = ifelse(!is.na(vt_mean_rt) & vt_mean_rt > vt_rt_p90, 1, 0),
    risk_very_fast_duration = ifelse(very_fast_duration_flag == 1, 1, 0),
    risk_high_paste = ifelse(!is.na(n_paste) & n_paste > paste_p95, 1, 0),
    risk_low_mouse_events = ifelse(!is.na(n_mouse) & n_mouse <= 5, 1, 0),
    risk_high_defocus = ifelse(!is.na(defocus_n) & defocus_n >= 5, 1, 0),
    risk_low_attr_sd = ifelse(!is.na(attr_response_sd) & attr_response_sd <= attr_sd_p10, 1, 0),
    bot_risk_score = risk_webdriver + risk_honeypot + risk_many_wrong_visual +
      risk_low_visual_accuracy + risk_very_fast_visual_rt + risk_very_slow_visual_rt +
      risk_very_fast_duration + risk_high_paste + risk_low_mouse_events + risk_high_defocus +
      risk_low_attr_sd
  )

risk_indicator_table <- bot_df %>%
  summarise(
    n = n(),
    across(starts_with("risk_"), ~ mean(.x, na.rm = TRUE), .names = "{.col}_prop")
  ) %>%
  pivot_longer(cols = starts_with("risk_"), names_to = "indicator", values_to = "proportion_flagged") %>%
  mutate(proportion_flagged = round(proportion_flagged, 3)) %>%
  arrange(desc(proportion_flagged))

risk_indicator_table

# A tibble: 11 × 3
       n indicator                     proportion_flagged
   <int> <chr>                                      <dbl>
 1    69 risk_low_attr_sd_prop                      0.145
 2    69 risk_very_fast_visual_rt_prop              0.101
 3    69 risk_very_slow_visual_rt_prop              0.101
 4    69 risk_high_defocus_prop                     0.087
 5    69 risk_many_wrong_visual_prop                0.072
 6    69 risk_low_visual_accuracy_prop              0.058
 7    69 risk_very_fast_duration_prop               0.058
 8    69 risk_high_paste_prop                       0.058
 9    69 risk_webdriver_prop                        0    
10    69 risk_honeypot_prop                         0    
11    69 risk_low_mouse_events_prop                 0

bot_risk_score_dist <- bot_df %>%
  count(bot_risk_score, name = "n") %>%
  mutate(prop = round(n / sum(n), 3)) %>%
  arrange(bot_risk_score)

bot_risk_score_dist

# A tibble: 5 × 3
  bot_risk_score     n  prop
           <dbl> <int> <dbl>
1              0    42 0.609
2              1    11 0.159
3              2    13 0.188
4              3     2 0.029
5              4     1 0.014

cluster_features <- bot_df %>%
  select(
    vt_wrong_count, vt_prop_correct, vt_mean_rt, vt_fast_count, vt_slow_count,
    attr_response_sd,
    duration_seconds, n_events, n_paste, n_mouse, n_scroll,
    n_key_intervals, key_interval_mean, key_interval_sd,
    defocus_n, defocus_total_ms, bot_risk_score
  ) %>%
  mutate(across(everything(), ~ ifelse(is.infinite(.x), NA_real_, .x)))

# for (j in seq_len(ncol(cluster_features))) {
#   med_j <- median(cluster_features[[j]], na.rm = TRUE)
#   cluster_features[[j]][is.na(cluster_features[[j]])] <- med_j
# }


# robust NA handling: median impute per column (fallback to 0)
cluster_features_ready <- cluster_features %>%
  mutate(across(everything(), ~ {
    x <- as.numeric(.x)
    x[is.infinite(x)] <- NA_real_
    med <- suppressWarnings(stats::median(x, na.rm = TRUE))
    if (!is.finite(med)) med <- 0
    x[is.na(x)] <- med
    x
  }))

# drop zero-variance columns (would create NaN after scaling)
vars_zero_var <- names(cluster_features_ready)[
  vapply(
    cluster_features_ready,
    function(x) {
      s <- stats::sd(x, na.rm = TRUE)
      is.na(s) || s == 0
    },
    logical(1)
  )
]
vars_zero_var

[1] "n_key_intervals"   "key_interval_mean" "key_interval_sd"

cluster_features_model <- cluster_features_ready
if (length(vars_zero_var) > 0) {
  cluster_features_model <- cluster_features_model %>%
    dplyr::select(-dplyr::all_of(vars_zero_var))
}

# scale and force any residual non-finite values to 0
cluster_features_scaled <- cluster_features_model %>%
  scale(center = TRUE, scale = TRUE) %>%
  as.data.frame()

cluster_features_scaled <- cluster_features_scaled %>%
  mutate(across(everything(), ~ {
    x <- as.numeric(.x)
    x[!is.finite(x)] <- 0
    x
  }))

# clustering task
task <- mlr3cluster::TaskClust$new(
  id = "bot_profile",
  backend = cluster_features_scaled
)

# evaluate candidate k values
k_max <- min(6, nrow(cluster_features_scaled) - 1)
k_grid <- if (k_max >= 2) 2:k_max else 2
dmat <- dist(cluster_features_scaled)

k_eval <- purrr::map_dfr(k_grid, function(k) {
  learner_k <- mlr3::lrn("clust.kmeans", centers = k, nstart = 30)
  learner_k$train(task)
  pred_k <- learner_k$predict(task)
  cl <- as.integer(pred_k$partition)

  sil <- tryCatch({
    mean(cluster::silhouette(cl, dmat)[, "sil_width"])
  }, error = function(e) NA_real_)

  tibble::tibble(
    k = k,
    avg_silhouette = sil
  )
})

k_eval

# A tibble: 5 × 2
      k avg_silhouette
  <int>          <dbl>
1     2          0.345
2     3          0.338
3     4          0.364
4     5          0.367
5     6          0.372

# choose best k
best_k <- k_eval %>%
  dplyr::filter(!is.na(avg_silhouette)) %>%
  dplyr::arrange(dplyr::desc(avg_silhouette)) %>%
  dplyr::slice(1) %>%
  dplyr::pull(k)

if (length(best_k) == 0 || is.na(best_k)) best_k <- 3

# final model
learner_final <- mlr3::lrn("clust.kmeans", centers = best_k, nstart = 50)
learner_final$train(task)
pred_final <- learner_final$predict(task)

# attach cluster labels back to original data
bot_df <- bot_df %>%
  dplyr::mutate(bot_cluster = factor(pred_final$partition))

# cluster summaries
cluster_profile <- bot_df %>%
  dplyr::group_by(bot_cluster) %>%
  dplyr::summarise(
    n = dplyr::n(),
    prop_sample = round(n / nrow(bot_df), 3),
    mean_bot_risk_score = round(mean(bot_risk_score, na.rm = TRUE), 3),
    mean_vt_prop_correct = round(mean(vt_prop_correct, na.rm = TRUE), 3),
    mean_vt_mean_rt = round(mean(vt_mean_rt, na.rm = TRUE), 1),
    mean_attr_response_sd = round(mean(attr_response_sd, na.rm = TRUE), 3),
    mean_duration_seconds = round(mean(duration_seconds, na.rm = TRUE), 1),
    mean_n_paste = round(mean(n_paste, na.rm = TRUE), 2),
    mean_n_mouse = round(mean(n_mouse, na.rm = TRUE), 2),
    mean_defocus_n = round(mean(defocus_n, na.rm = TRUE), 2),
    prop_webdriver = round(mean(webdriver_flag == 1, na.rm = TRUE), 3),
    prop_honeypot = round(mean(honeypot_flag == 1, na.rm = TRUE), 3),
    .groups = "drop"
  ) %>%
  dplyr::arrange(dplyr::desc(mean_bot_risk_score))

cluster_profile

# A tibble: 6 × 13
  bot_cluster     n prop_sample mean_bot_risk_score mean_vt_prop_correct
  <fct>       <int>       <dbl>               <dbl>                <dbl>
1 5               5       0.072               2.6                  0.3  
2 4               1       0.014               2                    0.833
3 2               9       0.13                1.33                 0.87 
4 3               6       0.087               1.17                 0.861
5 6              18       0.261               0.556                0.972
6 1              30       0.435               0.1                  0.956
# ℹ 8 more variables: mean_vt_mean_rt <dbl>, mean_attr_response_sd <dbl>,
#   mean_duration_seconds <dbl>, mean_n_paste <dbl>, mean_n_mouse <dbl>,
#   mean_defocus_n <dbl>, prop_webdriver <dbl>, prop_honeypot <dbl>

# identify highest-risk cluster
high_risk_cluster <- cluster_profile %>%
  dplyr::slice(1) %>%
  dplyr::pull(bot_cluster)

suspected_bot_table <- bot_df %>%
  dplyr::mutate(suspected_bot_cluster = ifelse(bot_cluster == high_risk_cluster, 1, 0)) %>%
  dplyr::select(
    PROLIFIC_PID, result_id, bot_cluster, suspected_bot_cluster, bot_risk_score,
    vt_wrong_count, vt_prop_correct, vt_mean_rt, attr_response_sd, duration_seconds,
    webdriver_flag, honeypot_flag, n_paste, n_mouse, defocus_n
  ) %>%
  dplyr::arrange(dplyr::desc(suspected_bot_cluster), dplyr::desc(bot_risk_score), vt_prop_correct)

DT::datatable(
  suspected_bot_table,
  caption = paste0("Bot-risk profiling table (best k = ", best_k, ")"),
  options = list(pageLength = 12, scrollX = TRUE)
)

Interpetation:

cluster_profile

# A tibble: 6 × 13
  bot_cluster     n prop_sample mean_bot_risk_score mean_vt_prop_correct
  <fct>       <int>       <dbl>               <dbl>                <dbl>
1 5               5       0.072               2.6                  0.3  
2 4               1       0.014               2                    0.833
3 2               9       0.13                1.33                 0.87 
4 3               6       0.087               1.17                 0.861
5 6              18       0.261               0.556                0.972
6 1              30       0.435               0.1                  0.956
# ℹ 8 more variables: mean_vt_mean_rt <dbl>, mean_attr_response_sd <dbl>,
#   mean_duration_seconds <dbl>, mean_n_paste <dbl>, mean_n_mouse <dbl>,
#   mean_defocus_n <dbl>, prop_webdriver <dbl>, prop_honeypot <dbl>

Interpretation by cluster (ordered by concern): - Cluster 1 (n=5, 7.2%): highest concern
- risk=2.60 (highest), vt_correct=0.30 (very low), attr_sd=0.334 (very low)
- Pattern fits low-quality / likely IER, possibly suspicious responses. - Cluster 5 (n=1, 1.4%): anomaly case
- risk=2.00, moderate attr SD (0.707), decent visual accuracy (0.833)
- Single case; treat as individual outlier, not a stable subgroup. - Cluster 6 (n=9, 13.0%): moderate concern (slow responders)
- risk=1.33, vt_correct=0.87 (okay), very high RT (61742 ms), attr SD moderate (0.754)
- More like very slow/careful or distracted than clear bot pattern. - Cluster 2 (n=6, 8.7%): moderate concern
- risk=1.17, vt_correct=0.861, high RT (37262 ms), attr SD moderate-high (0.826)
- Could reflect slow/effortful respondents, not strong bot signal. - Cluster 4 (n=18, 26.1%): low concern
- risk=0.556, high visual accuracy (0.972), high attr SD (1.302)
- Looks like engaged human responding. - Cluster 3 (n=30, 43.5%): lowest concern / baseline majority
- risk=0.100, high accuracy (0.956), high attr SD (1.323)
- Most clearly normal-quality respondents.

# barplots of two extreme groups based on number of correctly answered visual traps:

tmp_pids_c1 <- suspected_bot_table$PROLIFIC_PID[suspected_bot_table$bot_cluster == 1]
tmp_pids_c1

 [1] "671537ffd700463e5e5af0ec" "66ec53e096df305784c5aa6f"
 [3] "696967a820034107da9a178c" "69c2cf612f3aaa621cd012c9"
 [5] "697ca395ee14230291313ff7" "6996d201e4546b84c632980a"
 [7] "616c6bcb2a62b082dfe84ca1" "69864918d688ba39aab5c4a9"
 [9] "607f7f18d84d010ea2cd8664" "6980f140677b5e0cbec2561d"
[11] "62d43cee3d60ac98c1dcacc8" "583ef6f5ad2f4300014b358d"
[13] "69cfb45d653ca2bb903544bf" "69ad6b5d144258967269bcef"
[15] "69d384aa090e38d0a5d3da70" "6999787fd639f1bad3dd0cff"
[17] "69a51d629d67227b3f4e89cb" "6983549b2eeaffbc45367e23"
[19] "69b57b2e2c7cfe69c1efee2d" "6949ca3d2fa32d8101dea6a7"
[21] "69b9dc367fe8a0f25c27fd3e" "698499af42aaccba55a1d46f"
[23] "66aa8d04719b0f5ef3433e9c" "69696695a46b968d40daf6e1"
[25] "69b32d9bb8ca8629fc044d19" "69d2a237eb676423789613a4"
[27] "67eff0475e43d9a5390ff55e" "69a396424c59826d1a132598"
[29] "69606ec5fd842b942ecc6932" "69cec122e3cb56c2e7000c6b"

tmp <- rowSums(dat_ques[dat_ques$PROLIFIC_PID %in% tmp_pids_c1, str_subset(string = colnames(dat_ques), pattern = "^visual.*_correct$")])
barplot(table(tmp), main = "Cluster 1")

tmp_pids_notC1 <- suspected_bot_table$PROLIFIC_PID[suspected_bot_table$bot_cluster != 1]
tmp <- rowSums(dat_ques[dat_ques$PROLIFIC_PID %in% tmp_pids_notC1, str_subset(string = colnames(dat_ques), pattern = "^visual.*_correct$")])
barplot(table(tmp), main = "All other clusters")

# t-tests of two extreme groups based on number of correctly answered visual traps:


# Build analysis frame
attr_cols <- str_subset(string = colnames(dat_ques), pattern = "^attributesFutureSociety-")
visual_correct_cols <- str_subset(string = colnames(dat_ques), pattern = "^visual.*_correct$")
ttest_df <- dat_ques %>%
  mutate(
    group_c1 = ifelse(PROLIFIC_PID %in% tmp_pids_c1, "Cluster1", "OtherClusters"),
    duration_seconds_num = suppressWarnings(as.numeric(as.character(duration_seconds))),
    attr_response_sd = apply(
      select(., all_of(attr_cols)) %>% mutate(across(everything(), ~ suppressWarnings(as.numeric(as.character(.x))))),
      1,
      function(x) if (sum(!is.na(x)) < 2) NA_real_ else sd(x, na.rm = TRUE)
    ),
    visual_correct_sum = rowSums(
      select(., all_of(visual_correct_cols)) %>% mutate(across(everything(), ~ suppressWarnings(as.numeric(as.character(.x))))),
      na.rm = TRUE
    )
  ) %>%
  filter(group_c1 %in% c("Cluster1", "OtherClusters"))
# Descriptives by group
ttest_df %>%
  group_by(group_c1) %>%
  summarise(
    n = n(),
    mean_duration = mean(duration_seconds_num, na.rm = TRUE),
    sd_duration = sd(duration_seconds_num, na.rm = TRUE),
    mean_attr_sd = mean(attr_response_sd, na.rm = TRUE),
    sd_attr_sd = sd(attr_response_sd, na.rm = TRUE),
    mean_visual_correct_sum = mean(visual_correct_sum, na.rm = TRUE),
    .groups = "drop"
  )

# A tibble: 2 × 7
  group_c1          n mean_duration sd_duration mean_attr_sd sd_attr_sd
  <chr>         <int>         <dbl>       <dbl>        <dbl>      <dbl>
1 Cluster1         30          877.        296.        1.32       0.543
2 OtherClusters    39          961.        603.        0.979      0.679
# ℹ 1 more variable: mean_visual_correct_sum <dbl>

# t-test 1: duration mean (Cluster 1 vs others)
t_duration <- t.test(duration_seconds_num ~ group_c1, data = ttest_df)
t_duration


    Welch Two Sample t-test

data:  duration_seconds_num by group_c1
t = -0.75609, df = 58.122, p-value = 0.4526
alternative hypothesis: true difference in means between group Cluster1 and group OtherClusters is not equal to 0
95 percent confidence interval:
 -305.0942  137.7968
sample estimates:
     mean in group Cluster1 mean in group OtherClusters 
                   877.3000                    960.9487

# t-test 2: attributes response SD mean (Cluster 1 vs others)
t_attr_sd <- t.test(attr_response_sd ~ group_c1, data = ttest_df)
t_attr_sd


    Welch Two Sample t-test

data:  attr_response_sd by group_c1
t = 2.3169, df = 65.984, p-value = 0.02362
alternative hypothesis: true difference in means between group Cluster1 and group OtherClusters is not equal to 0
95 percent confidence interval:
 0.04749195 0.63948367
sample estimates:
     mean in group Cluster1 mean in group OtherClusters 
                  1.3229126                   0.9794248

# Optional effect sizes (Cohen's d)
cohens_d <- function(x, g) {
  x1 <- x[g == "Cluster1"]; x2 <- x[g == "OtherClusters"]
  x1 <- x1[!is.na(x1)]; x2 <- x2[!is.na(x2)]
  n1 <- length(x1); n2 <- length(x2)
  s1 <- sd(x1); s2 <- sd(x2)
  sp <- sqrt(((n1 - 1) * s1^2 + (n2 - 1) * s2^2) / (n1 + n2 - 2))
  (mean(x1) - mean(x2)) / sp
}
d_duration <- cohens_d(ttest_df$duration_seconds_num, ttest_df$group_c1)
d_attr_sd <- cohens_d(ttest_df$attr_response_sd, ttest_df$group_c1)
d_duration

[1] -0.1693453

d_attr_sd

[1] 0.551157

# check copy & paste events of single respondent:

tmp_pids <- suspected_bot_table$PROLIFIC_PID[suspected_bot_table$bot_cluster == 5]
tmp_pids

[1] "69b86e94de9c503e9a58411d" "66f0143ab60af2f164a772e2"
[3] "69c4c8d23e5ba793afd0e843" "69d2b682cb4fce2cc6c59711"
[5] "69c4c8d23e5ba793afd0e843"

dat_para_events[dat_para_events$PROLIFIC_PID %in% tmp_pids & dat_para_events$event_type %in% c("copy", "paste"), ]

# A tibble: 1 × 22
  PROLIFIC_PID result_id origin event_type event_index timestamp component field
  <chr>        <chr>     <chr>  <chr>            <int> <chr>     <chr>     <chr>
1 69b86e94de9… 15252     parad… copy                 1 2026-04-… Scenario… <NA> 
# ℹ 14 more variables: value <chr>, mouse_x <dbl>, mouse_y <dbl>,
#   scroll_x <dbl>, scroll_y <dbl>, key_interval_ms <dbl>,
#   clipboard_type <chr>, clipboard_text <chr>, clipboard_length <dbl>,
#   target_tag <chr>, target_type <chr>, target_id <chr>, target_name <chr>,
#   target_className <chr>

8 Descriptive statistics - quality

8.1 clearity, bias

table(dat_ques$condition_FutureSociety)


 aicentered    anarchic    futurist    lawBased moderngreen primitivist 
          6          14          16           7          10           7 
  religious 
          8

# ---- Helper ----
safe_num <- function(x) suppressWarnings(as.numeric(as.character(x)))

# ---- Recode to numeric + labeled factors ----
clearbias_df <- dat_ques %>%
  mutate(
    clearUtopia_num  = safe_num(clearUtopia),
    biasUtopia_num   = safe_num(biasUtopia),
    clearUtopia_lab = factor(
      clearUtopia_num,
      levels = 1:4,
      labels = c("Completely unclear", "Somewhat unclear",
                 "Somewhat clear", "Completely clear")
    ),
    biasUtopia_lab = factor(
      biasUtopia_num,
      levels = 1:3,
      labels = c("Opposed", "Neutral", "Supportive")
    ),
    clear_followup_triggered = !is.na(clearUtopiatext) & nchar(trimws(as.character(clearUtopiatext))) > 0,
    bias_followup_triggered  = !is.na(biasUtopiatext)  & nchar(trimws(as.character(biasUtopiatext)))  > 0
  )

# ---- Overall distributions ----
cat("\n--- Clarity (overall) ---\n")


--- Clarity (overall) ---

tab_clear <- table(clearbias_df$clearUtopia_lab, useNA = "ifany")
print(cbind(Frequency = tab_clear, Percentage = round(prop.table(tab_clear) * 100, 1)))

                   Frequency Percentage
Completely unclear         0        0.0
Somewhat unclear           0        0.0
Somewhat clear            27       39.1
Completely clear          41       59.4
<NA>                       1        1.4

cat("\n--- Bias (overall) ---\n")


--- Bias (overall) ---

tab_bias <- table(clearbias_df$biasUtopia_lab, useNA = "ifany")
print(cbind(Frequency = tab_bias, Percentage = round(prop.table(tab_bias) * 100, 1)))

           Frequency Percentage
Opposed            1        1.4
Neutral           53       76.8
Supportive        14       20.3
<NA>               1        1.4

cat("\n--- Follow-up trigger rates ---\n")


--- Follow-up trigger rates ---

clearbias_df %>%
  summarise(
    n = n(),
    n_clear_followup = sum(clear_followup_triggered, na.rm = TRUE),
    pct_clear_followup = round(100 * mean(clear_followup_triggered, na.rm = TRUE), 1),
    n_bias_followup  = sum(bias_followup_triggered, na.rm = TRUE),
    pct_bias_followup = round(100 * mean(bias_followup_triggered, na.rm = TRUE), 1)
  ) %>%
  print()

# A tibble: 1 × 5
      n n_clear_followup pct_clear_followup n_bias_followup pct_bias_followup
  <int>            <int>              <dbl>           <int>             <dbl>
1    69                0                  0              16              23.2

# ---- By condition: cross-tabs + means ----
clearbias_by_cond <- clearbias_df %>%
  group_by(condition_FutureSociety) %>%
  summarise(
    n = n(),
    mean_clear = round(mean(clearUtopia_num, na.rm = TRUE), 2),
    sd_clear   = round(sd(clearUtopia_num,   na.rm = TRUE), 2),
    pct_clear_high = round(100 * mean(clearUtopia_num >= 3, na.rm = TRUE), 1),
    mean_bias  = round(mean(biasUtopia_num,  na.rm = TRUE), 2),
    sd_bias    = round(sd(biasUtopia_num,    na.rm = TRUE), 2),
    pct_neutral = round(100 * mean(biasUtopia_num == 2, na.rm = TRUE), 1),
    .groups = "drop"
  )

DT::datatable(
  clearbias_by_cond,
  caption = "Clarity and bias means by condition (clarity 1–4, bias 1–3 where 2 = neutral)",
  options = list(pageLength = 10, scrollX = TRUE)
)

# ---- Stacked bar plots: clarity by condition ----
clear_plot_df <- clearbias_df %>%
  filter(!is.na(clearUtopia_lab), !is.na(condition_FutureSociety)) %>%
  count(condition_FutureSociety, clearUtopia_lab) %>%
  group_by(condition_FutureSociety) %>%
  mutate(prop = n / sum(n)) %>%
  ungroup()

ggplot(clear_plot_df,
       aes(x = condition_FutureSociety, y = prop, fill = clearUtopia_lab)) +
  geom_col() +
  scale_y_continuous(labels = scales::percent_format(accuracy = 1)) +
  labs(x = "Condition", y = "Proportion", fill = "Clarity",
       title = "Perceived clarity by condition") +
  theme_minimal() +
  theme(axis.text.x = element_text(angle = 30, hjust = 1))

# ---- Stacked bar plots: bias by condition ----
bias_plot_df <- clearbias_df %>%
  filter(!is.na(biasUtopia_lab), !is.na(condition_FutureSociety)) %>%
  count(condition_FutureSociety, biasUtopia_lab) %>%
  group_by(condition_FutureSociety) %>%
  mutate(prop = n / sum(n)) %>%
  ungroup()

ggplot(bias_plot_df,
       aes(x = condition_FutureSociety, y = prop, fill = biasUtopia_lab)) +
  geom_col() +
  scale_y_continuous(labels = scales::percent_format(accuracy = 1)) +
  labs(x = "Condition", y = "Proportion", fill = "Bias perception",
       title = "Perceived bias by condition") +
  theme_minimal() +
  theme(axis.text.x = element_text(angle = 30, hjust = 1))

# ---- Inferential: does condition affect clarity / bias? ----
# Chi-square on raw response distributions
cat("\n--- Chi-square: clarity x condition ---\n")


--- Chi-square: clarity x condition ---

print(chisq.test(table(clearbias_df$condition_FutureSociety, clearbias_df$clearUtopia_lab)))


    Pearson's Chi-squared test

data:  table(clearbias_df$condition_FutureSociety, clearbias_df$clearUtopia_lab)
X-squared = NaN, df = 18, p-value = NA

cat("\n--- Chi-square: bias x condition ---\n")


--- Chi-square: bias x condition ---

print(chisq.test(table(clearbias_df$condition_FutureSociety, clearbias_df$biasUtopia_lab)))


    Pearson's Chi-squared test

data:  table(clearbias_df$condition_FutureSociety, clearbias_df$biasUtopia_lab)
X-squared = 22.493, df = 12, p-value = 0.03235

# Treat clarity as quasi-interval and run an ANOVA (sensitivity check)
anova_clear <- afex::aov_ez(
  id = "PROLIFIC_PID",
  dv = "clearUtopia_num",
  between = "condition_FutureSociety",
  data = clearbias_df %>% drop_na(clearUtopia_num, condition_FutureSociety)
)
summary(anova_clear)

Anova Table (Type 3 tests)

Response: clearUtopia_num
                        num Df den Df     MSE      F     ges Pr(>F)
condition_FutureSociety      6     61 0.23919 1.1769 0.10375 0.3306

# ---- Inspect follow-up free-text answers ----
clearbias_followups <- clearbias_df %>%
  filter(clear_followup_triggered | bias_followup_triggered) %>%
  select(PROLIFIC_PID, condition_FutureSociety,
         clearUtopia_lab, clearUtopiatext,
         biasUtopia_lab,  biasUtopiatext) %>%
  mutate(
    clear_chars = nchar(as.character(clearUtopiatext)),
    bias_chars  = nchar(as.character(biasUtopiatext))
  )

DT::datatable(
  clearbias_followups,
  caption = "Participants who triggered clarity/bias follow-up text fields",
  options = list(pageLength = 10, scrollX = TRUE)
)

writexl::write_xlsx(x = clearbias_followups, path = file.path(output_root, "clearbias_followups.xlsx"))

8.2 prototype assignment

# First, check what condition labels actually exist:
cat("Condition levels in data:\n")

Condition levels in data:

print(levels(dat_ques$condition_FutureSociety))

[1] "aicentered"  "anarchic"    "futurist"    "lawBased"    "moderngreen"
[6] "primitivist" "religious"

# or, if not a factor:
print(unique(dat_ques$condition_FutureSociety))

[1] religious   anarchic    lawBased    futurist    moderngreen primitivist
[7] aicentered  <NA>       
7 Levels: aicentered anarchic futurist lawBased moderngreen ... religious

condition_to_prototype <- c(
  "futurist"    = "futurist",
  "aicentered"  = "ai_centered",
  "primitivist" = "primitivist_arcadian",
  "moderngreen" = "modern_green",
  "religious"   = "religious_millennial",
  "lawBased"    = "institutional_law",
  "anarchic"    = "moral_commonwealth_anarchic"
)

prototype_levels <- c(
  "futurist", "ai_centered", "primitivist_arcadian", "modern_green",
  "religious_millennial", "institutional_law", "moral_commonwealth_anarchic"
)

# ---- Build assignment dataframe ----
proto_df <- dat_ques %>%
  select(PROLIFIC_PID, condition_FutureSociety,
         utopiaPrototypeAssignment,
         protoConfidenceRadio,
         protoReasonText) %>%
  mutate(
    chosen_prototype = factor(as.character(utopiaPrototypeAssignment),
                              levels = prototype_levels),
    correct_prototype = factor(
      unname(condition_to_prototype[as.character(condition_FutureSociety)]),
      levels = prototype_levels
    ),
    proto_correct = as.integer(as.character(chosen_prototype) ==
                               as.character(correct_prototype)),
    confidence = suppressWarnings(as.numeric(as.character(protoConfidenceRadio))),
    proto_reason_provided = !is.na(protoReasonText) &
                            nchar(trimws(as.character(protoReasonText))) > 0
  )

# ---- Sanity check: confirm every condition resolved to a prototype ----
mapping_check <- proto_df %>%
  count(condition_FutureSociety, correct_prototype, name = "n") %>%
  arrange(condition_FutureSociety)

mapping_check

# A tibble: 8 × 3
  condition_FutureSociety correct_prototype               n
  <fct>                   <fct>                       <int>
1 aicentered              ai_centered                     6
2 anarchic                moral_commonwealth_anarchic    14
3 futurist                futurist                       16
4 lawBased                institutional_law               7
5 moderngreen             modern_green                   10
6 primitivist             primitivist_arcadian            7
7 religious               religious_millennial            8
8 <NA>                    <NA>                            1

unmapped <- proto_df %>%
  filter(is.na(correct_prototype) & !is.na(condition_FutureSociety)) %>%
  distinct(condition_FutureSociety)

if (nrow(unmapped) > 0) {
  warning("These condition_FutureSociety values are not in `condition_to_prototype`:\n",
          paste(unmapped$condition_FutureSociety, collapse = ", "))
} else {
  message("All conditions mapped successfully.")
}

# ---- Quick check of the result ----
proto_df %>%
  select(condition_FutureSociety, chosen_prototype, correct_prototype, proto_correct) %>%
  head(10)

# A tibble: 10 × 4
   condition_FutureSociety chosen_prototype      correct_prototype proto_correct
   <fct>                   <fct>                 <fct>                     <int>
 1 religious               religious_millennial  religious_millen…             1
 2 religious               religious_millennial  religious_millen…             1
 3 anarchic                moral_commonwealth_a… moral_commonweal…             1
 4 lawBased                institutional_law     institutional_law             1
 5 futurist                futurist              futurist                      1
 6 moderngreen             futurist              modern_green                  0
 7 futurist                futurist              futurist                      1
 8 moderngreen             futurist              modern_green                  0
 9 lawBased                institutional_law     institutional_law             1
10 futurist                futurist              futurist                      1

# ---- Overall accuracy ----
proto_overall <- proto_df %>%
  summarise(
    n = sum(!is.na(proto_correct)),
    n_correct = sum(proto_correct, na.rm = TRUE),
    accuracy = round(n_correct / n, 3),
    mean_confidence = round(mean(confidence, na.rm = TRUE), 2),
    sd_confidence   = round(sd(confidence,   na.rm = TRUE), 2)
  )
proto_overall

# A tibble: 1 × 5
      n n_correct accuracy mean_confidence sd_confidence
  <int>     <int>    <dbl>           <dbl>         <dbl>
1    68        50    0.735            6.18           0.9

# Compare to chance (1/7 = .143) with a one-sample test on the proportion
n_total <- sum(!is.na(proto_df$proto_correct))
k_correct <- sum(proto_df$proto_correct, na.rm = TRUE)
if (n_total > 0) {
  print(binom.test(k_correct, n_total, p = 1/7, alternative = "greater"))
}


    Exact binomial test

data:  k_correct and n_total
number of successes = 50, number of trials = 68, p-value < 2.2e-16
alternative hypothesis: true probability of success is greater than 0.1428571
95 percent confidence interval:
 0.633159 1.000000
sample estimates:
probability of success 
             0.7352941

# ---- Distribution of chosen prototypes ----
cat("\n--- Distribution of chosen prototypes ---\n")


--- Distribution of chosen prototypes ---

tab_chosen <- table(proto_df$chosen_prototype, useNA = "ifany")
print(cbind(Frequency = tab_chosen,
            Percentage = round(prop.table(tab_chosen) * 100, 1)))

                            Frequency Percentage
futurist                           20       29.0
ai_centered                         6        8.7
primitivist_arcadian                6        8.7
modern_green                       10       14.5
religious_millennial                9       13.0
institutional_law                  10       14.5
moral_commonwealth_anarchic         7       10.1
<NA>                                1        1.4

# ---- Accuracy by condition (i.e., by correct prototype) ----
proto_acc_by_cond <- proto_df %>%
  filter(!is.na(correct_prototype)) %>%
  group_by(condition_FutureSociety, correct_prototype) %>%
  summarise(
    n = n(),
    n_correct = sum(proto_correct, na.rm = TRUE),
    accuracy = round(n_correct / n, 3),
    mean_confidence = round(mean(confidence, na.rm = TRUE), 2),
    mean_confidence_correct =
      round(mean(confidence[proto_correct == 1], na.rm = TRUE), 2),
    mean_confidence_wrong =
      round(mean(confidence[proto_correct == 0], na.rm = TRUE), 2),
    .groups = "drop"
  ) %>%
  arrange(desc(accuracy))

DT::datatable(
  proto_acc_by_cond,
  caption = "Prototype-assignment accuracy and confidence by condition",
  options = list(pageLength = 10, scrollX = TRUE)
)

# ---- Bar plot of accuracy per condition ----
p_proto_acc <- ggplot(proto_acc_by_cond,
       aes(x = reorder(condition_FutureSociety, -accuracy),
           y = accuracy, fill = condition_FutureSociety)) +
  geom_col(width = 0.7) +
  geom_hline(yintercept = 1/7, linetype = "dashed") +
  geom_text(aes(label = scales::percent(accuracy, accuracy = 1)),
            vjust = -0.4, size = 3) +
  scale_y_continuous(labels = scales::percent_format(accuracy = 1),
                     limits = c(0, 1.05)) +
  labs(x = "Condition (correct prototype)", y = "Accuracy",
       title = "Prototype-assignment accuracy by condition",
       subtitle = "Dashed line = chance (1/7)") +
  theme_minimal() +
  theme(legend.position = "none",
        axis.text.x = element_text(angle = 30, hjust = 1))

p_proto_acc

# ---- Save plot ----
ggsave(
  filename = file.path(output_root, "prototype_accuracy_by_condition.png"),
  plot     = p_proto_acc,
  width    = 8, height = 5, dpi = 300, bg = "white"
)

# ---- Confusion matrix: rows = correct (condition), cols = chosen ----
conf_df <- proto_df %>%
  filter(!is.na(correct_prototype), !is.na(chosen_prototype)) %>%
  count(correct_prototype, chosen_prototype, .drop = FALSE) %>%
  group_by(correct_prototype) %>%
  mutate(prop = n / sum(n)) %>%
  ungroup()

# Wide table
conf_wide_n <- conf_df %>%
  select(correct_prototype, chosen_prototype, n) %>%
  pivot_wider(names_from = chosen_prototype, values_from = n, values_fill = 0)

DT::datatable(
  conf_wide_n,
  caption = "Confusion matrix (counts): rows = correct prototype, columns = chosen prototype",
  options = list(pageLength = 10, scrollX = TRUE)
)

# ---- Heatmap of proportions ----
p_proto_confmat <- ggplot(conf_df,
       aes(x = chosen_prototype, y = correct_prototype, fill = prop)) +
  geom_tile(color = "white") +
  geom_text(aes(label = ifelse(prop > 0,
                               scales::percent(prop, accuracy = 1), "")),
            size = 3) +
  scale_fill_gradient(low = "#f7fbff", high = "#08519c",
                      labels = scales::percent_format(accuracy = 1)) +
  labs(x = "Chosen prototype", y = "Correct prototype",
       fill = "Row %",
       title = "Confusion matrix: prototype assignments") +
  theme_minimal() +
  theme(axis.text.x = element_text(angle = 30, hjust = 1))

p_proto_confmat

# ---- Save plot ----
ggsave(
  filename = file.path(output_root, "prototype_confusion_matrix.png"),
  plot     = p_proto_confmat,
  width    = 8, height = 6, dpi = 300, bg = "white"
)

9 Descriptive statistics dependent variables

9.1 descriptive mean differences

assoc_cols <- c("assoc1_valence", "assoc2_valence", "assoc3_valence", "assoc4_valence", "assoc5_valence")
attr_cols <- paste0("attributesFutureSociety-", 1:8)

clean_complete <- dat_ques %>%
  mutate(
    assoc_valence_mean = rowMeans(select(., all_of(assoc_cols)), na.rm = TRUE),
    attributes_mean = rowMeans(select(., all_of(attr_cols)), na.rm = TRUE)
  )

analysis_df <- clean_complete %>%
  select(PROLIFIC_PID, condition_FutureSociety, assoc_valence_mean, attributes_mean, all_of(attr_cols)) %>%
  drop_na(assoc_valence_mean, attributes_mean, condition_FutureSociety)

mean_summary <- analysis_df %>%
  pivot_longer(
    cols = c(assoc_valence_mean, attributes_mean),
    names_to = "measure",
    values_to = "value"
  ) %>%
  group_by(condition_FutureSociety, measure) %>%
  summarise(
    mean = mean(value, na.rm = TRUE),
    sd = sd(value, na.rm = TRUE),
    n = dplyr::n(),
    se = sd / sqrt(n),
    .groups = "drop"
  ) %>%
  mutate(
    measure = recode(
      measure,
      assoc_valence_mean = "Association Valence Mean",
      attributes_mean = "Attributes Mean"
    )
  )

ggplot(mean_summary, aes(x = condition_FutureSociety, y = mean, fill = measure)) +
  geom_col(position = position_dodge(width = 0.8), width = 0.7) +
  geom_errorbar(
    aes(ymin = mean - se, ymax = mean + se),
    position = position_dodge(width = 0.8),
    width = 0.2
  ) +
  labs(x = "Condition", y = "Mean", fill = "Measure") +
  theme_minimal()

anova_assoc <- afex::aov_ez(
  id = "PROLIFIC_PID",
  dv = "assoc_valence_mean",
  between = "condition_FutureSociety",
  data = analysis_df
)

Contrasts set to contr.sum for the following variables: condition_FutureSociety

anova_attr <- afex::aov_ez(
  id = "PROLIFIC_PID",
  dv = "attributes_mean",
  between = "condition_FutureSociety",
  data = analysis_df
)

Contrasts set to contr.sum for the following variables: condition_FutureSociety

summary(anova_assoc)

Anova Table (Type 3 tests)

Response: assoc_valence_mean
                        num Df den Df    MSE      F     ges  Pr(>F)  
condition_FutureSociety      6     61 2.7046 2.6343 0.20579 0.02451 *
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

summary(anova_attr)

Anova Table (Type 3 tests)

Response: attributes_mean
                        num Df den Df    MSE      F     ges   Pr(>F)   
condition_FutureSociety      6     61 1.4635 3.1599 0.23711 0.009164 **
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

attr_labels <- c(
  "1" = "Utopian",
  "2" = "Desirable",
  "3" = "Ideal",
  "4" = "Beneficial for the greater good",
  "5" = "Imaginative",
  "6" = "Innovative",
  "7" = "Creative",
  "8" = "Possible"
)

attr_long <- analysis_df %>%
  pivot_longer(
    cols = all_of(attr_cols),
    names_to = "item",
    values_to = "value"
  ) %>%
  mutate(
    item_num = stringr::str_extract(item, "[0-9]+"),
    item_label = recode(item_num, !!!attr_labels)
  )

attr_effects <- attr_long %>%
  filter(!is.na(value)) %>%
  group_by(item_label) %>%
  summarise(
    eta_sq = {
      fit <- aov(value ~ condition_FutureSociety)
      ss <- summary(fit)[[1]]
      ss_between <- ss["condition_FutureSociety", "Sum Sq"]
      ss_total <- sum(ss[, "Sum Sq"])
      if (is.na(ss_between) || is.na(ss_total) || ss_total == 0) NA_real_ else ss_between / ss_total
    },
    .groups = "drop"
  ) %>%
  mutate(
    eta_label = ifelse(is.na(eta_sq), "eta^2=NA", paste0("eta^2=", round(eta_sq, 2)))
  )

attr_effects_table <- attr_effects %>%
  arrange(desc(eta_sq))

attr_effects_table

# A tibble: 8 × 3
  item_label                      eta_sq eta_label 
  <chr>                            <dbl> <chr>     
1 Innovative                      0.251  eta^2=0.25
2 Desirable                       0.227  eta^2=0.23
3 Ideal                           0.220  eta^2=0.22
4 Utopian                         0.208  eta^2=0.21
5 Beneficial for the greater good 0.176  eta^2=0.18
6 Creative                        0.169  eta^2=0.17
7 Imaginative                     0.159  eta^2=0.16
8 Possible                        0.0144 eta^2=0.01

item_order <- attr_effects %>%
  mutate(eta_order = ifelse(is.na(eta_sq), -Inf, eta_sq)) %>%
  arrange(desc(eta_order)) %>%
  pull(item_label)

label_order <- attr_effects %>%
  mutate(eta_order = ifelse(is.na(eta_sq), -Inf, eta_sq)) %>%
  arrange(desc(eta_order)) %>%
  mutate(facet_label = paste0(item_label, "\n", eta_label)) %>%
  pull(facet_label)

attr_summary <- attr_long %>%
  group_by(condition_FutureSociety, item_label) %>%
  summarise(
    mean = mean(value, na.rm = TRUE),
    sd = sd(value, na.rm = TRUE),
    n = dplyr::n(),
    se = sd / sqrt(n),
    .groups = "drop"
  ) %>%
  left_join(attr_effects %>% select(item_label, eta_label), by = "item_label") %>%
  mutate(
    item_label = factor(item_label, levels = item_order),
    facet_label = factor(paste0(item_label, "\n", eta_label), levels = label_order)
  )

condition_order <- analysis_df %>%
  distinct(condition_FutureSociety) %>%
  arrange(condition_FutureSociety) %>%
  pull(condition_FutureSociety)

attr_summary <- attr_summary %>%
  mutate(condition_FutureSociety = factor(condition_FutureSociety, levels = condition_order))

ggplot(attr_summary, aes(x = condition_FutureSociety, y = mean, fill = condition_FutureSociety)) +
  geom_col(width = 0.7) +
  geom_errorbar(aes(ymin = mean - se, ymax = mean + se), width = 0.2) +
  geom_text(aes(label = round(mean, 2)), vjust = -0.3, size = 3) +
  facet_wrap(~ facet_label, ncol = 4) +
  labs(x = "Condition", y = "Mean", fill = "Condition") +
  theme_minimal() +
  theme(legend.position = "bottom", axis.text.x = element_text(angle = 30, hjust = 1))

tmp <- clean_complete[, str_subset(string = colnames(clean_complete), pattern = "attri.*[1-9]")]
psych::corPlot(r = cor(tmp, use = "pairwise"))

hist(clean_complete$`attributesFutureSociety-8`)

hist(clean_complete$`attributesFutureSociety-4`)

tmp_efa <- psych::fa.parallel(x = tmp)

Parallel analysis suggests that the number of factors =  2  and the number of components =  1

tmp_fa <- psych::fa(r = cor(tmp, use = "pairwise"), nfactors = 2, n.obs = nrow(tmp))

Loading required namespace: GPArotation

tmp_fa

Factor Analysis using method =  minres
Call: psych::fa(r = cor(tmp, use = "pairwise"), nfactors = 2, n.obs = nrow(tmp))
Standardized loadings (pattern matrix) based upon correlation matrix
                            MR1   MR2    h2    u2 com
attributesFutureSociety-5 -0.10  0.95 0.793 0.207 1.0
attributesFutureSociety-6  0.25  0.57 0.576 0.424 1.4
attributesFutureSociety-2  0.97 -0.03 0.911 0.089 1.0
attributesFutureSociety-3  0.96 -0.01 0.917 0.083 1.0
attributesFutureSociety-4  0.68  0.18 0.661 0.339 1.1
attributesFutureSociety-7  0.07  0.91 0.914 0.086 1.0
attributesFutureSociety-1  0.27  0.43 0.412 0.588 1.7
attributesFutureSociety-8  0.23 -0.01 0.052 0.948 1.0

                       MR1  MR2
SS loadings           2.75 2.49
Proportion Var        0.34 0.31
Cumulative Var        0.34 0.65
Proportion Explained  0.52 0.48
Cumulative Proportion 0.52 1.00

 With factor correlations of 
     MR1  MR2
MR1 1.00 0.65
MR2 0.65 1.00

Mean item complexity =  1.1
Test of the hypothesis that 2 factors are sufficient.

df null model =  28  with the objective function =  6.38 with Chi Square =  411.24
df of  the model are 13  and the objective function was  0.41 

The root mean square of the residuals (RMSR) is  0.03 
The df corrected root mean square of the residuals is  0.04 

The harmonic n.obs is  69 with the empirical chi square  3.16  with prob <  1 
The total n.obs was  69  with Likelihood Chi Square =  25.81  with prob <  0.018 

Tucker Lewis Index of factoring reliability =  0.926
RMSEA index =  0.119  and the 90 % confidence intervals are  0.048 0.188
BIC =  -29.23
Fit based upon off diagonal values = 1
Measures of factor score adequacy             
                                                   MR1  MR2
Correlation of (regression) scores with factors   0.94 0.92
Multiple R square of scores with factors          0.87 0.84
Minimum correlation of possible factor scores     0.75 0.68