Data analysis: bots, descriptive stats

Author

Julius Fenn

1 Notes

This script creates a simplified participant-level dataset using the raw study.rds file and produces:

  • A clean wide table with core variables + associations (wide) + keyStats
  • Paradata events in long format (clipboard, mouse, scroll, keyIntervals)
  • Defocus events in long format

2 global variables

data_root <- "../01_dataPreperation/outputs"
output_root <- "outputs"

3 functions

# not needed

4 load packages

require(pacman)
p_load(
  "tidyverse",
  "psych",
  "afex",
  "mlr3",
  "mlr3cluster",
  "cluster",
  "writexl",
  "readxl",
  "DT"
)

5 load processed data

setwd(data_root)
setwd("questionnaire")

dat_ques <- readRDS(file = "clean_complete_wide.rds")

setwd("../paradata")
dat_para_defocus <- readRDS(file = "paradata_defocus_long.rds")
dat_para_events <- readRDS(file = "paradata_events_long.rds")

6 Descriptive Statistics

6.1 sample

socidem_vars <- str_subset(string = colnames(dat_ques), pattern = "^socidem")
socidem_vars
 [1] "socidem_Submission.id"                
 [2] "socidem_Status"                       
 [3] "socidem_Custom.study.tncs.accepted.at"
 [4] "socidem_Started.at"                   
 [5] "socidem_Completed.at"                 
 [6] "socidem_Reviewed.at"                  
 [7] "socidem_Archived.at"                  
 [8] "socidem_Time.taken"                   
 [9] "socidem_Completion.code"              
[10] "socidem_Total.approvals"              
[11] "socidem_Age"                          
[12] "socidem_Sex"                          
[13] "socidem_Ethnicity.simplified"         
[14] "socidem_Country.of.birth"             
[15] "socidem_Country.of.residence"         
[16] "socidem_Nationality"                  
[17] "socidem_Language"                     
[18] "socidem_Student.status"               
[19] "socidem_Employment.status"            
socidem_df <- dat_ques %>%
  select(all_of(socidem_vars))

socidem_overview <- tibble(
  variable = socidem_vars,
  class = purrr::map_chr(socidem_df, ~ paste(class(.x), collapse = "/")),
  n = nrow(socidem_df),
  n_missing = purrr::map_int(socidem_df, ~ sum(is.na(.x))),
  missing_pct = round(100 * n_missing / n, 2),
  n_unique = purrr::map_int(socidem_df, ~ dplyr::n_distinct(.x, na.rm = TRUE))
)

socidem_descriptives <- purrr::map_dfr(socidem_vars, function(v) {
  x <- socidem_df[[v]]

  if (is.numeric(x)) {
    n_non_missing <- sum(!is.na(x))

    tibble(
      variable = v,
      type = "numeric",
      mean = if (n_non_missing == 0) NA_real_ else mean(x, na.rm = TRUE),
      sd = if (n_non_missing <= 1) NA_real_ else sd(x, na.rm = TRUE),
      median = if (n_non_missing == 0) NA_real_ else median(x, na.rm = TRUE),
      min = if (n_non_missing == 0) NA_real_ else min(x, na.rm = TRUE),
      max = if (n_non_missing == 0) NA_real_ else max(x, na.rm = TRUE),
      top_levels = NA_character_
    )
  } else {
    x_chr <- as.character(x)
    x_chr[x_chr == ""] <- NA_character_
    tab <- sort(table(x_chr, useNA = "no"), decreasing = TRUE)
    k <- min(5, length(tab))

    tibble(
      variable = v,
      type = "categorical_or_text",
      mean = NA_real_,
      sd = NA_real_,
      median = NA_real_,
      min = NA_real_,
      max = NA_real_,
      top_levels = if (k == 0) {
        NA_character_
      } else {
        paste0(names(tab)[seq_len(k)], " (", as.integer(tab[seq_len(k)]), ")", collapse = "; ")
      }
    )
  }
})

socidem_table <- socidem_overview %>%
  left_join(socidem_descriptives, by = "variable")


DT::datatable(data = socidem_table)
## factor vars:
factor_vars <- as.character(socidem_table$variable[socidem_table$class == "factor"])

for (var_name in factor_vars) {
  cat("\nTable for:", var_name, "\n")
  
  tab <- table(socidem_df[[var_name]], useNA = "ifany")
  pct <- prop.table(tab) * 100
  
  print(cbind(Frequency = tab, Percentage = round(pct, 1)))
}

Table for: socidem_Sex 
       Frequency Percentage
Female        28       40.6
Male          41       59.4

Table for: socidem_Ethnicity.simplified 
      Frequency Percentage
Asian         2        2.9
Black        18       26.1
Mixed         7       10.1
Other         4        5.8
White        38       55.1

Table for: socidem_Student.status 
             Frequency Percentage
DATA_EXPIRED        14       20.3
No                  45       65.2
Yes                 10       14.5

Table for: socidem_Employment.status 
                                                         Frequency Percentage
DATA_EXPIRED                                                    12       17.4
Due to start a new job within the next month                     2        2.9
Full-Time                                                       27       39.1
Not in paid work (e.g. homemaker', 'retired or disabled)         6        8.7
Other                                                            1        1.4
Part-Time                                                       15       21.7
Unemployed (and job seeking)                                     6        8.7
## numeric vars:
numeric_vars <- as.character(socidem_table$variable[socidem_table$class == "numeric"])

for (var_name in numeric_vars) {
  if (var_name %in% names(socidem_df)) {
    x <- socidem_df[[var_name]]
    x <- x[!is.na(x)]
    
    stats <- boxplot.stats(x)$stats
    names(stats) <- c("Min", "Q1", "Median", "Q3", "Max")
    
    mean_x <- mean(x)
    sd_x <- sd(x)
    
    boxplot(
      x,
      main = paste("Boxplot of", var_name),
      ylab = var_name
    )
    
    legend(
      "topright",
      legend = c(
        paste(names(stats), round(stats, 2), sep = ": "),
        paste("Mean:", round(mean_x, 2)),
        paste("SD:", round(sd_x, 2))
      ),
      bty = "n",
      cex = 0.8
    )
    
    cat("\nBoxplot for:", var_name, "\n")
    print(stats)
    cat("Mean:", round(mean_x, 2), "\n")
    cat("SD:", round(sd_x, 2), "\n")
  }
}


Boxplot for: socidem_Time.taken 
   Min     Q1 Median     Q3    Max 
   200    561    832   1248   2276 
Mean: 959.57 
SD: 512.36 


Boxplot for: socidem_Total.approvals 
   Min     Q1 Median     Q3    Max 
     1    144    976   3461   8176 
Mean: 2064.29 
SD: 2744.69 


Boxplot for: socidem_Age 
   Min     Q1 Median     Q3    Max 
    20     26     30     34     35 
Mean: 29.29 
SD: 4.39 
rm(socidem_vars); rm(socidem_df)

6.2 paradata

para_visualTraps_vars <- str_subset(string = colnames(dat_ques), pattern = "^visualTrap")
para_visualTraps_vars
 [1] "visualTrapResponse"                 "visualTrap_Planets_response"       
 [3] "visualTrap_Planets_responseLabel"   "visualTrap_Planets_correct"        
 [5] "visualTrap_Planets_rtMs"            "visualTrap_Collision_response"     
 [7] "visualTrap_Collision_responseLabel" "visualTrap_Collision_correct"      
 [9] "visualTrap_Collision_rtMs"          "visualTrap_CafeWall_response"      
[11] "visualTrap_CafeWall_responseLabel"  "visualTrap_CafeWall_correct"       
[13] "visualTrap_CafeWall_rtMs"           "visualTrap_Circles_response"       
[15] "visualTrap_Circles_responseLabel"   "visualTrap_Circles_correct"        
[17] "visualTrap_Circles_rtMs"            "visualTrap_Lines_response"         
[19] "visualTrap_Lines_responseLabel"     "visualTrap_Lines_correct"          
[21] "visualTrap_Lines_rtMs"              "visualTrap_Robot_response"         
[23] "visualTrap_Robot_responseLabel"     "visualTrap_Robot_correct"          
[25] "visualTrap_Robot_rtMs"              "visualTrap_order"                  
para_visualTraps_vars_df <- dat_ques %>%
  select(all_of(para_visualTraps_vars))

para_visualTraps_overview <- tibble(
  variable = names(para_visualTraps_vars_df),
  class = purrr::map_chr(para_visualTraps_vars_df, ~ paste(class(.x), collapse = "/")),
  n = nrow(para_visualTraps_vars_df),
  n_missing = purrr::map_int(para_visualTraps_vars_df, ~ sum(is.na(.x))),
  n_unique = purrr::map_int(para_visualTraps_vars_df, ~ dplyr::n_distinct(.x, na.rm = TRUE))
) %>%
  mutate(
    missing_pct = round(100 * n_missing / n, 2)
  )


para_visualTrap_descriptives <- purrr::map_dfr(para_visualTraps_vars, function(v) {
  x <- para_visualTraps_vars_df[[v]]

  if (is.numeric(x)) {
    n_non_missing <- sum(!is.na(x))

    tibble(
      variable = v,
      type = "numeric",
      mean = if (n_non_missing == 0) NA_real_ else mean(x, na.rm = TRUE),
      sd = if (n_non_missing <= 1) NA_real_ else sd(x, na.rm = TRUE),
      median = if (n_non_missing == 0) NA_real_ else median(x, na.rm = TRUE),
      min = if (n_non_missing == 0) NA_real_ else min(x, na.rm = TRUE),
      max = if (n_non_missing == 0) NA_real_ else max(x, na.rm = TRUE),
      top_levels = NA_character_
    )
  } else {
    x_chr <- as.character(x)
    x_chr[x_chr == ""] <- NA_character_
    tab <- sort(table(x_chr, useNA = "no"), decreasing = TRUE)
    k <- min(5, length(tab))

    tibble(
      variable = v,
      type = "categorical_or_text",
      mean = NA_real_,
      sd = NA_real_,
      median = NA_real_,
      min = NA_real_,
      max = NA_real_,
      top_levels = if (k == 0) {
        NA_character_
      } else {
        paste0(names(tab)[seq_len(k)], " (", as.integer(tab[seq_len(k)]), ")", collapse = "; ")
      }
    )
  }
})


para_visualTraps_table <- para_visualTraps_overview %>%
  left_join(para_visualTrap_descriptives, by = "variable")


DT::datatable(data = para_visualTraps_table)
## factor vars:
factor_vars <- as.character(para_visualTraps_table$variable[para_visualTraps_table$class == "factor"])

for (var_name in factor_vars) {
  cat("\nTable for:", var_name, "\n")
  
  tab <- table(para_visualTraps_vars_df[[var_name]], useNA = "ifany")
  pct <- prop.table(tab) * 100
  
  print(cbind(Frequency = tab, Percentage = round(pct, 1)))
}

Table for: visualTrapResponse 
  Frequency Percentage
1        17       24.6
2        28       40.6
3        23       33.3
4         1        1.4

Table for: visualTrap_Planets_response 
  Frequency Percentage
1         1        1.4
2        65       94.2
3         2        2.9
4         1        1.4

Table for: visualTrap_Collision_response 
  Frequency Percentage
1        58       84.1
2         7       10.1
3         3        4.3
4         1        1.4

Table for: visualTrap_CafeWall_response 
  Frequency Percentage
1         3        4.3
3        59       85.5
4         7       10.1

Table for: visualTrap_Circles_response 
  Frequency Percentage
2        66       95.7
3         2        2.9
4         1        1.4

Table for: visualTrap_Lines_response 
  Frequency Percentage
1        67       97.1
2         1        1.4
3         1        1.4

Table for: visualTrap_Robot_response 
  Frequency Percentage
1         1        1.4
2        14       20.3
3        54       78.3
## numeric vars:
numeric_vars <- as.character(para_visualTraps_table$variable[para_visualTraps_table$class %in% c("numeric", "integer")])

for (var_name in numeric_vars) {
  if (var_name %in% names(para_visualTraps_vars_df)) {
    x <- para_visualTraps_vars_df[[var_name]]
    x <- x[!is.na(x)]
    
    stats <- boxplot.stats(x)$stats
    names(stats) <- c("Min", "Q1", "Median", "Q3", "Max")
    
    mean_x <- mean(x)
    sd_x <- sd(x)
    
    boxplot(
      x,
      main = paste("Boxplot of", var_name),
      ylab = var_name
    )
    
    legend(
      "topright",
      legend = c(
        paste(names(stats), round(stats, 2), sep = ": "),
        paste("Mean:", round(mean_x, 2)),
        paste("SD:", round(sd_x, 2))
      ),
      bty = "n",
      cex = 0.8
    )
    
    cat("\nBoxplot for:", var_name, "\n")
    print(stats)
    cat("Mean:", round(mean_x, 2), "\n")
    cat("SD:", round(sd_x, 2), "\n")
  }
}


Boxplot for: visualTrap_Planets_rtMs 
   Min     Q1 Median     Q3    Max 
  7546  18954  26180  40289  60062 
Mean: 37924.93 
SD: 41206.44 


Boxplot for: visualTrap_Collision_rtMs 
   Min     Q1 Median     Q3    Max 
  2688  17596  30717  51485  89971 
Mean: 41787.09 
SD: 40415.81 


Boxplot for: visualTrap_CafeWall_rtMs 
   Min     Q1 Median     Q3    Max 
  4754  12424  21813  37480  67396 
Mean: 31294.46 
SD: 35666.95 


Boxplot for: visualTrap_Circles_rtMs 
   Min     Q1 Median     Q3    Max 
  2836   6732  10175  17395  29661 
Mean: 15166.86 
SD: 20691.36 


Boxplot for: visualTrap_Lines_rtMs 
   Min     Q1 Median     Q3    Max 
  3095   6798   9543  14188  22983 
Mean: 23240.86 
SD: 68558.7 


Boxplot for: visualTrap_Robot_rtMs 
   Min     Q1 Median     Q3    Max 
  5081  11972  18490  32784  54062 
Mean: 28435.51 
SD: 33601.96 
rm(para_visualTraps_vars); rm(para_visualTraps_vars_df)

for static paradata:

para_statics_vars <- str_subset(string = colnames(dat_ques), pattern = "^para")
para_statics_vars
 [1] "para_countclicks"                                   
 [2] "para_static_userAgent"                              
 [3] "para_static_platform"                               
 [4] "para_static_language"                               
 [5] "para_static_languages"                              
 [6] "para_static_cookieEnabled"                          
 [7] "para_static_online"                                 
 [8] "para_static_webdriver"                              
 [9] "para_static_hardwareConcurrency"                    
[10] "para_static_deviceMemory"                           
[11] "para_static_maxTouchPoints"                         
[12] "para_static_screen_width"                           
[13] "para_static_screen_height"                          
[14] "para_static_screen_availWidth"                      
[15] "para_static_screen_availHeight"                     
[16] "para_static_screen_colorDepth"                      
[17] "para_static_screen_pixelDepth"                      
[18] "para_static_viewport_innerWidth"                    
[19] "para_static_viewport_outerWidth"                    
[20] "para_static_viewport_outerHeight"                   
[21] "para_static_viewport_devicePixelRatio"              
[22] "para_static_page_url"                               
[23] "para_static_page_hostname"                          
[24] "para_static_page_pathname"                          
[25] "para_static_page_protocol"                          
[26] "para_static_page_referrer"                          
[27] "para_static_page_title"                             
[28] "para_static_locale_timezone"                        
[29] "para_static_locale_timezoneOffset"                  
[30] "para_static_network_effectiveType"                  
[31] "para_static_network_downlink"                       
[32] "para_static_network_rtt"                            
[33] "para_static_network_saveData"                       
[34] "para_static_mediaQueries_prefersDarkScheme"         
[35] "para_static_mediaQueries_prefersReducedMotion"      
[36] "para_static_mediaQueries_prefersReducedTransparency"
[37] "para_static_mediaQueries_prefersContrastMore"       
[38] "para_static_mediaQueries_pointerCoarse"             
[39] "para_static_mediaQueries_pointerFine"               
[40] "para_static_mediaQueries_hover"                     
[41] "para_static_features_localStorage"                  
[42] "para_static_features_sessionStorage"                
[43] "para_static_features_indexedDB"                     
[44] "para_static_features_serviceWorker"                 
[45] "para_static_features_bluetooth"                     
[46] "para_static_features_usb"                           
[47] "para_static_features_clipboard"                     
[48] "para_static_features_mediaDevices"                  
[49] "para_static_features_permissions"                   
[50] "para_static_features_storageManager"                
[51] "para_static_features_visualViewport"                
[52] "para_ip_ip"                                         
[53] "para_ip_network"                                    
[54] "para_ip_version"                                    
[55] "para_ip_city"                                       
[56] "para_ip_region"                                     
[57] "para_ip_region_code"                                
[58] "para_ip_country"                                    
[59] "para_ip_country_name"                               
[60] "para_ip_country_code"                               
[61] "para_ip_country_code_iso3"                          
[62] "para_ip_country_capital"                            
[63] "para_ip_country_tld"                                
[64] "para_ip_in_eu"                                      
[65] "para_ip_postal"                                     
[66] "para_ip_latitude"                                   
[67] "para_ip_longitude"                                  
[68] "para_ip_timezone"                                   
[69] "para_ip_utc_offset"                                 
[70] "para_ip_country_calling_code"                       
[71] "para_ip_currency"                                   
[72] "para_ip_currency_name"                              
[73] "para_ip_languages"                                  
[74] "para_ip_country_area"                               
[75] "para_ip_country_population"                         
[76] "para_ip_asn"                                        
[77] "para_ip_org"                                        
[78] "para_meta_startedAt"                                
[79] "para_static_viewport_innerHeight"                   
[80] "para_static_doNotTrack"                             
[81] "para_ip_continent_code"                             
para_statics_vars_df <- dat_ques %>%
  select(all_of(para_statics_vars))

para_static_overview <- tibble(
  variable = names(para_statics_vars_df),
  class = purrr::map_chr(para_statics_vars_df, ~ paste(class(.x), collapse = "/")),
  n = nrow(para_statics_vars_df),
  n_missing = purrr::map_int(para_statics_vars_df, ~ sum(is.na(.x))),
  n_unique = purrr::map_int(para_statics_vars_df, ~ dplyr::n_distinct(.x, na.rm = TRUE))
) %>%
  mutate(
    missing_pct = round(100 * n_missing / n, 2)
  )


para_static_descriptives <- purrr::map_dfr(para_statics_vars, function(v) {
  x <- para_statics_vars_df[[v]]

  if (is.numeric(x)) {
    n_non_missing <- sum(!is.na(x))

    tibble(
      variable = v,
      type = "numeric",
      mean = if (n_non_missing == 0) NA_real_ else mean(x, na.rm = TRUE),
      sd = if (n_non_missing <= 1) NA_real_ else sd(x, na.rm = TRUE),
      median = if (n_non_missing == 0) NA_real_ else median(x, na.rm = TRUE),
      min = if (n_non_missing == 0) NA_real_ else min(x, na.rm = TRUE),
      max = if (n_non_missing == 0) NA_real_ else max(x, na.rm = TRUE),
      top_levels = NA_character_
    )
  } else {
    x_chr <- as.character(x)
    x_chr[x_chr == ""] <- NA_character_
    tab <- sort(table(x_chr, useNA = "no"), decreasing = TRUE)
    k <- min(5, length(tab))

    tibble(
      variable = v,
      type = "categorical_or_text",
      mean = NA_real_,
      sd = NA_real_,
      median = NA_real_,
      min = NA_real_,
      max = NA_real_,
      top_levels = if (k == 0) {
        NA_character_
      } else {
        paste0(names(tab)[seq_len(k)], " (", as.integer(tab[seq_len(k)]), ")", collapse = "; ")
      }
    )
  }
})


para_static_table <- para_static_overview %>%
  left_join(para_static_descriptives, by = "variable")


DT::datatable(data = para_static_table)
rm(para_statics_vars); rm(para_statics_vars_df)

7 Identify potential bots

visualtrap_rt_vars <- str_subset(
  string = colnames(dat_ques),
  pattern = "^visualTrap_.*_rtMs$"
)

visualtrap_items <- stringr::str_remove(visualtrap_rt_vars, "_rtMs$")
visualtrap_correct_vars <- paste0(visualtrap_items, "_correct")

visualtrap_pairs <- tibble(
  item = visualtrap_items,
  rt_var = visualtrap_rt_vars,
  correct_var = visualtrap_correct_vars
) %>%
  filter(correct_var %in% colnames(dat_ques))

parse_correct_binary <- function(x) {
  x_chr <- tolower(trimws(as.character(x)))

  dplyr::case_when(
    x_chr %in% c("1", "true", "t", "yes", "y", "correct", "right") ~ 1,
    x_chr %in% c("0", "false", "f", "no", "n", "incorrect", "wrong") ~ 0,
    TRUE ~ NA_real_
  )
}

visualtrap_rt_ttest <- purrr::map_dfr(seq_len(nrow(visualtrap_pairs)), function(i) {
  item_name <- visualtrap_pairs$item[[i]]
  rt_name <- visualtrap_pairs$rt_var[[i]]
  correct_name <- visualtrap_pairs$correct_var[[i]]

  correct_bin_all <- parse_correct_binary(dat_ques[[correct_name]])

  n_correct_all <- sum(correct_bin_all == 1, na.rm = TRUE)
  n_wrong_all <- sum(correct_bin_all == 0, na.rm = TRUE)
  n_correct_non_missing <- n_correct_all + n_wrong_all

  rt_all <- suppressWarnings(as.numeric(dat_ques[[rt_name]]))

  item_df <- tibble(
    rt_ms = rt_all,
    correct_bin = correct_bin_all
  ) %>%
    filter(!is.na(rt_ms), !is.na(correct_bin)) %>%
    mutate(group = factor(ifelse(correct_bin == 1, "right", "wrong"), levels = c("wrong", "right")))

  n_right_rt <- sum(item_df$group == "right")
  n_wrong_rt <- sum(item_df$group == "wrong")

  mean_right <- if (n_right_rt > 0) mean(item_df$rt_ms[item_df$group == "right"], na.rm = TRUE) else NA_real_
  mean_wrong <- if (n_wrong_rt > 0) mean(item_df$rt_ms[item_df$group == "wrong"], na.rm = TRUE) else NA_real_
  sd_right <- if (n_right_rt > 1) sd(item_df$rt_ms[item_df$group == "right"], na.rm = TRUE) else NA_real_
  sd_wrong <- if (n_wrong_rt > 1) sd(item_df$rt_ms[item_df$group == "wrong"], na.rm = TRUE) else NA_real_

  t_out <- tryCatch(
    {
      if (n_right_rt >= 2 && n_wrong_rt >= 2) {
        t.test(rt_ms ~ group, data = item_df)
      } else {
        NULL
      }
    },
    error = function(e) NULL
  )

  pooled_sd <- if (!is.na(sd_right) && !is.na(sd_wrong) && (n_right_rt + n_wrong_rt - 2) > 0) {
    sqrt(((n_right_rt - 1) * sd_right^2 + (n_wrong_rt - 1) * sd_wrong^2) / (n_right_rt + n_wrong_rt - 2))
  } else {
    NA_real_
  }

  cohen_d <- if (!is.na(pooled_sd) && pooled_sd > 0) {
    (mean_right - mean_wrong) / pooled_sd
  } else {
    NA_real_
  }

  hedges_g <- if (!is.na(cohen_d) && (n_right_rt + n_wrong_rt) > 3) {
    cohen_d * (1 - (3 / (4 * (n_right_rt + n_wrong_rt) - 9)))
  } else {
    NA_real_
  }

  tibble(
    item = item_name,
    correct_var = correct_name,
    rt_var = rt_name,
    n_correct_non_missing = n_correct_non_missing,
    n_right = n_correct_all,
    n_wrong = n_wrong_all,
    prop_right = if (n_correct_non_missing > 0) n_correct_all / n_correct_non_missing else NA_real_,
    prop_wrong = if (n_correct_non_missing > 0) n_wrong_all / n_correct_non_missing else NA_real_,
    n_rt_ttest = nrow(item_df),
    n_right_rt = n_right_rt,
    n_wrong_rt = n_wrong_rt,
    mean_rt_right = mean_right,
    sd_rt_right = sd_right,
    mean_rt_wrong = mean_wrong,
    sd_rt_wrong = sd_wrong,
    mean_diff_right_minus_wrong = mean_right - mean_wrong,
    t_statistic = if (!is.null(t_out)) unname(t_out$statistic) else NA_real_,
    df = if (!is.null(t_out)) unname(t_out$parameter) else NA_real_,
    p_value = if (!is.null(t_out)) t_out$p.value else NA_real_,
    conf_low = if (!is.null(t_out)) t_out$conf.int[[1]] else NA_real_,
    conf_high = if (!is.null(t_out)) t_out$conf.int[[2]] else NA_real_,
    cohen_d = cohen_d,
    hedges_g = hedges_g
  )
}) %>%
  mutate(
    across(c(prop_right, prop_wrong), ~ round(.x, 3)),
    across(c(mean_rt_right, sd_rt_right, mean_rt_wrong, sd_rt_wrong, mean_diff_right_minus_wrong), ~ round(.x, 2)),
    across(c(t_statistic, df, cohen_d, hedges_g), ~ round(.x, 3)),
    p_value = round(p_value, 4),
    across(c(conf_low, conf_high), ~ round(.x, 2))
  )

visualtrap_rt_ttest
# A tibble: 6 × 23
  item       correct_var rt_var n_correct_non_missing n_right n_wrong prop_right
  <chr>      <chr>       <chr>                  <int>   <int>   <int>      <dbl>
1 visualTra… visualTrap… visua…                    69      65       4      0.942
2 visualTra… visualTrap… visua…                    69      58      11      0.841
3 visualTra… visualTrap… visua…                    69      59      10      0.855
4 visualTra… visualTrap… visua…                    69      66       3      0.957
5 visualTra… visualTrap… visua…                    69      67       2      0.971
6 visualTra… visualTrap… visua…                    69      54      15      0.783
# ℹ 16 more variables: prop_wrong <dbl>, n_rt_ttest <int>, n_right_rt <int>,
#   n_wrong_rt <int>, mean_rt_right <dbl>, sd_rt_right <dbl>,
#   mean_rt_wrong <dbl>, sd_rt_wrong <dbl>, mean_diff_right_minus_wrong <dbl>,
#   t_statistic <dbl>, df <dbl>, p_value <dbl>, conf_low <dbl>,
#   conf_high <dbl>, cohen_d <dbl>, hedges_g <dbl>
DT::datatable(
  visualtrap_rt_ttest,
  caption = "Visual-trap item analysis: right vs wrong reaction times (Welch t-test + effect sizes)",
  options = list(pageLength = 10, scrollX = TRUE)
)
parse_bool <- function(x) {
  x_chr <- tolower(trimws(as.character(x)))
  dplyr::case_when(
    x_chr %in% c("1", "true", "t", "yes", "y") ~ 1,
    x_chr %in% c("0", "false", "f", "no", "n") ~ 0,
    TRUE ~ NA_real_
  )
}

safe_num <- function(x) suppressWarnings(as.numeric(as.character(x)))

vt_correct_vars <- str_subset(colnames(dat_ques), "^visualTrap_.*_correct$")
vt_rt_vars <- str_subset(colnames(dat_ques), "^visualTrap_.*_rtMs$")

vt_correct_mat <- dat_ques %>%
  select(all_of(vt_correct_vars)) %>%
  mutate(across(everything(), parse_bool))

vt_rt_mat <- dat_ques %>%
  select(all_of(vt_rt_vars)) %>%
  mutate(across(everything(), safe_num))

# IER / straightlining indicator from attributes scale
attr_cols <- str_subset(colnames(dat_ques), "^attributesFutureSociety-")

attr_mat <- dat_ques %>%
  select(all_of(attr_cols)) %>%
  mutate(across(everything(), safe_num))

attr_answered_count <- attr_mat %>%
  transmute(attr_answered_count = rowSums(!is.na(across(everything())))) %>%
  pull(attr_answered_count)

attr_response_sd <- attr_mat %>%
  transmute(attr_response_sd = apply(
    across(everything()),
    1,
    function(x) {
      x <- as.numeric(x)
      if (sum(!is.na(x)) < 2) return(NA_real_)
      stats::sd(x, na.rm = TRUE)
    }
  )) %>%
  pull(attr_response_sd)

vt_wrong_count <- vt_correct_mat %>%
  mutate(across(everything(), ~ ifelse(.x == 0, 1, 0))) %>%
  transmute(vt_wrong_count = rowSums(across(everything()), na.rm = TRUE)) %>%
  pull(vt_wrong_count)

vt_answered_count <- vt_correct_mat %>%
  transmute(vt_answered_count = rowSums(!is.na(across(everything())))) %>%
  pull(vt_answered_count)

vt_correct_count <- vt_correct_mat %>%
  transmute(vt_correct_count = rowSums(across(everything()), na.rm = TRUE)) %>%
  pull(vt_correct_count)

vt_prop_correct <- ifelse(vt_answered_count > 0, vt_correct_count / vt_answered_count, NA_real_)

vt_mean_rt <- vt_rt_mat %>%
  transmute(vt_mean_rt = rowMeans(across(everything()), na.rm = TRUE)) %>%
  pull(vt_mean_rt)

vt_mean_rt_p05 <- quantile(vt_mean_rt, probs = 0.05, na.rm = TRUE)
vt_mean_rt_p95 <- quantile(vt_mean_rt, probs = 0.95, na.rm = TRUE)


vt_fast_count <- vt_rt_mat %>%
  mutate(across(everything(), ~ ifelse(!is.na(.x) & .x < vt_mean_rt_p05, 1, 0))) %>%
  transmute(vt_fast_count = rowSums(across(everything()), na.rm = TRUE)) %>%
  pull(vt_fast_count)

vt_slow_count <- vt_rt_mat %>%
  mutate(across(everything(), ~ ifelse(!is.na(.x) & .x > vt_mean_rt_p95, 1, 0))) %>%
  transmute(vt_slow_count = rowSums(across(everything()), na.rm = TRUE)) %>%
  pull(vt_slow_count)

webdriver_flag <- parse_bool(dat_ques$para_static_webdriver)
honeypot_flag <- parse_bool(dat_ques$hp_exclusionCriteria)

duration_seconds <- safe_num(dat_ques$duration_seconds)
duration_p05 <- quantile(duration_seconds, probs = 0.05, na.rm = TRUE)
duration_p95 <- quantile(duration_seconds, probs = 0.95, na.rm = TRUE)

very_fast_duration_flag <- ifelse(!is.na(duration_seconds) & duration_seconds < duration_p05, 1, 0)
very_slow_duration_flag <- ifelse(!is.na(duration_seconds) & duration_seconds > duration_p95, 1, 0)

events_by_pid <- dat_para_events %>%
  mutate(
    event_type_norm = tolower(as.character(event_type)),
    clipboard_type_norm = tolower(as.character(clipboard_type)),
    key_interval_ms = safe_num(key_interval_ms),
    clipboard_length = safe_num(clipboard_length)
  ) %>%
  group_by(PROLIFIC_PID) %>%
  summarise(
    n_events = n(),
    n_copy = sum(event_type_norm == "copy" | (event_type_norm == "clipboard" & clipboard_type_norm == "copy"), na.rm = TRUE),
    n_paste = sum(event_type_norm == "paste" | (event_type_norm == "clipboard" & clipboard_type_norm == "paste"), na.rm = TRUE),
    n_mouse = sum(event_type_norm %in% c("mousemove", "mouse"), na.rm = TRUE),
    n_scroll = sum(event_type_norm == "scroll", na.rm = TRUE),
    n_key_intervals = sum(!is.na(key_interval_ms)),
    key_interval_mean = mean(key_interval_ms, na.rm = TRUE),
    key_interval_sd = sd(key_interval_ms, na.rm = TRUE),
    clipboard_total_chars = sum(clipboard_length, na.rm = TRUE),
    .groups = "drop"
  )

defocus_by_pid <- dat_para_defocus %>%
  mutate(durationblur = safe_num(durationblur)) %>%
  group_by(PROLIFIC_PID) %>%
  summarise(
    defocus_n = n(),
    defocus_total_ms = sum(durationblur, na.rm = TRUE),
    defocus_mean_ms = mean(durationblur, na.rm = TRUE),
    .groups = "drop"
  )

bot_df <- dat_ques %>%
  select(PROLIFIC_PID, result_id, duration_seconds = duration_seconds) %>%
  mutate(
    duration_seconds = safe_num(duration_seconds),
    vt_wrong_count = vt_wrong_count,
    vt_answered_count = vt_answered_count,
    vt_correct_count = vt_correct_count,
    vt_prop_correct = vt_prop_correct,
    vt_mean_rt = vt_mean_rt,
    vt_fast_count = vt_fast_count,
    vt_slow_count = vt_slow_count,
    attr_answered_count = attr_answered_count,
    attr_response_sd = attr_response_sd,
    webdriver_flag = webdriver_flag,
    honeypot_flag = honeypot_flag,
    very_fast_duration_flag = very_fast_duration_flag,
    very_slow_duration_flag = very_slow_duration_flag
  ) %>%
  left_join(events_by_pid, by = "PROLIFIC_PID") %>%
  left_join(defocus_by_pid, by = "PROLIFIC_PID") %>%
  mutate(
    across(c(n_events, n_copy, n_paste, n_mouse, n_scroll, n_key_intervals,
             clipboard_total_chars, defocus_n, defocus_total_ms, defocus_mean_ms),
           ~ replace_na(.x, 0)),
    key_interval_mean = ifelse(is.nan(key_interval_mean), NA_real_, key_interval_mean),
    key_interval_sd = ifelse(is.nan(key_interval_sd), NA_real_, key_interval_sd)
  )

vt_rt_p10 <- quantile(bot_df$vt_mean_rt, 0.10, na.rm = TRUE)
vt_rt_p90 <- quantile(bot_df$vt_mean_rt, 0.90, na.rm = TRUE)
paste_p95 <- quantile(bot_df$n_paste, 0.95, na.rm = TRUE)
attr_sd_p10 <- quantile(bot_df$attr_response_sd, 0.10, na.rm = TRUE)

bot_df <- bot_df %>%
  mutate(
    risk_webdriver = ifelse(webdriver_flag == 1, 1, 0),
    risk_honeypot = ifelse(honeypot_flag == 1, 1, 0),
    risk_many_wrong_visual = ifelse(vt_wrong_count >= 3, 1, 0),
    risk_low_visual_accuracy = ifelse(!is.na(vt_prop_correct) & vt_prop_correct < 0.50, 1, 0),
    risk_very_fast_visual_rt = ifelse(!is.na(vt_mean_rt) & vt_mean_rt < vt_rt_p10, 1, 0),
    risk_very_slow_visual_rt = ifelse(!is.na(vt_mean_rt) & vt_mean_rt > vt_rt_p90, 1, 0),
    risk_very_fast_duration = ifelse(very_fast_duration_flag == 1, 1, 0),
    risk_high_paste = ifelse(!is.na(n_paste) & n_paste > paste_p95, 1, 0),
    risk_low_mouse_events = ifelse(!is.na(n_mouse) & n_mouse <= 5, 1, 0),
    risk_high_defocus = ifelse(!is.na(defocus_n) & defocus_n >= 5, 1, 0),
    risk_low_attr_sd = ifelse(!is.na(attr_response_sd) & attr_response_sd <= attr_sd_p10, 1, 0),
    bot_risk_score = risk_webdriver + risk_honeypot + risk_many_wrong_visual +
      risk_low_visual_accuracy + risk_very_fast_visual_rt + risk_very_slow_visual_rt +
      risk_very_fast_duration + risk_high_paste + risk_low_mouse_events + risk_high_defocus +
      risk_low_attr_sd
  )

risk_indicator_table <- bot_df %>%
  summarise(
    n = n(),
    across(starts_with("risk_"), ~ mean(.x, na.rm = TRUE), .names = "{.col}_prop")
  ) %>%
  pivot_longer(cols = starts_with("risk_"), names_to = "indicator", values_to = "proportion_flagged") %>%
  mutate(proportion_flagged = round(proportion_flagged, 3)) %>%
  arrange(desc(proportion_flagged))

risk_indicator_table
# A tibble: 11 × 3
       n indicator                     proportion_flagged
   <int> <chr>                                      <dbl>
 1    69 risk_low_attr_sd_prop                      0.145
 2    69 risk_very_fast_visual_rt_prop              0.101
 3    69 risk_very_slow_visual_rt_prop              0.101
 4    69 risk_high_defocus_prop                     0.087
 5    69 risk_many_wrong_visual_prop                0.072
 6    69 risk_low_visual_accuracy_prop              0.058
 7    69 risk_very_fast_duration_prop               0.058
 8    69 risk_high_paste_prop                       0.058
 9    69 risk_webdriver_prop                        0    
10    69 risk_honeypot_prop                         0    
11    69 risk_low_mouse_events_prop                 0    
bot_risk_score_dist <- bot_df %>%
  count(bot_risk_score, name = "n") %>%
  mutate(prop = round(n / sum(n), 3)) %>%
  arrange(bot_risk_score)

bot_risk_score_dist
# A tibble: 5 × 3
  bot_risk_score     n  prop
           <dbl> <int> <dbl>
1              0    42 0.609
2              1    11 0.159
3              2    13 0.188
4              3     2 0.029
5              4     1 0.014
cluster_features <- bot_df %>%
  select(
    vt_wrong_count, vt_prop_correct, vt_mean_rt, vt_fast_count, vt_slow_count,
    attr_response_sd,
    duration_seconds, n_events, n_paste, n_mouse, n_scroll,
    n_key_intervals, key_interval_mean, key_interval_sd,
    defocus_n, defocus_total_ms, bot_risk_score
  ) %>%
  mutate(across(everything(), ~ ifelse(is.infinite(.x), NA_real_, .x)))

# for (j in seq_len(ncol(cluster_features))) {
#   med_j <- median(cluster_features[[j]], na.rm = TRUE)
#   cluster_features[[j]][is.na(cluster_features[[j]])] <- med_j
# }


# robust NA handling: median impute per column (fallback to 0)
cluster_features_ready <- cluster_features %>%
  mutate(across(everything(), ~ {
    x <- as.numeric(.x)
    x[is.infinite(x)] <- NA_real_
    med <- suppressWarnings(stats::median(x, na.rm = TRUE))
    if (!is.finite(med)) med <- 0
    x[is.na(x)] <- med
    x
  }))

# drop zero-variance columns (would create NaN after scaling)
vars_zero_var <- names(cluster_features_ready)[
  vapply(
    cluster_features_ready,
    function(x) {
      s <- stats::sd(x, na.rm = TRUE)
      is.na(s) || s == 0
    },
    logical(1)
  )
]
vars_zero_var
[1] "n_key_intervals"   "key_interval_mean" "key_interval_sd"  
cluster_features_model <- cluster_features_ready
if (length(vars_zero_var) > 0) {
  cluster_features_model <- cluster_features_model %>%
    dplyr::select(-dplyr::all_of(vars_zero_var))
}

# scale and force any residual non-finite values to 0
cluster_features_scaled <- cluster_features_model %>%
  scale(center = TRUE, scale = TRUE) %>%
  as.data.frame()

cluster_features_scaled <- cluster_features_scaled %>%
  mutate(across(everything(), ~ {
    x <- as.numeric(.x)
    x[!is.finite(x)] <- 0
    x
  }))

# clustering task
task <- mlr3cluster::TaskClust$new(
  id = "bot_profile",
  backend = cluster_features_scaled
)

# evaluate candidate k values
k_max <- min(6, nrow(cluster_features_scaled) - 1)
k_grid <- if (k_max >= 2) 2:k_max else 2
dmat <- dist(cluster_features_scaled)

k_eval <- purrr::map_dfr(k_grid, function(k) {
  learner_k <- mlr3::lrn("clust.kmeans", centers = k, nstart = 30)
  learner_k$train(task)
  pred_k <- learner_k$predict(task)
  cl <- as.integer(pred_k$partition)

  sil <- tryCatch({
    mean(cluster::silhouette(cl, dmat)[, "sil_width"])
  }, error = function(e) NA_real_)

  tibble::tibble(
    k = k,
    avg_silhouette = sil
  )
})

k_eval
# A tibble: 5 × 2
      k avg_silhouette
  <int>          <dbl>
1     2          0.345
2     3          0.338
3     4          0.364
4     5          0.367
5     6          0.372
# choose best k
best_k <- k_eval %>%
  dplyr::filter(!is.na(avg_silhouette)) %>%
  dplyr::arrange(dplyr::desc(avg_silhouette)) %>%
  dplyr::slice(1) %>%
  dplyr::pull(k)

if (length(best_k) == 0 || is.na(best_k)) best_k <- 3

# final model
learner_final <- mlr3::lrn("clust.kmeans", centers = best_k, nstart = 50)
learner_final$train(task)
pred_final <- learner_final$predict(task)

# attach cluster labels back to original data
bot_df <- bot_df %>%
  dplyr::mutate(bot_cluster = factor(pred_final$partition))

# cluster summaries
cluster_profile <- bot_df %>%
  dplyr::group_by(bot_cluster) %>%
  dplyr::summarise(
    n = dplyr::n(),
    prop_sample = round(n / nrow(bot_df), 3),
    mean_bot_risk_score = round(mean(bot_risk_score, na.rm = TRUE), 3),
    mean_vt_prop_correct = round(mean(vt_prop_correct, na.rm = TRUE), 3),
    mean_vt_mean_rt = round(mean(vt_mean_rt, na.rm = TRUE), 1),
    mean_attr_response_sd = round(mean(attr_response_sd, na.rm = TRUE), 3),
    mean_duration_seconds = round(mean(duration_seconds, na.rm = TRUE), 1),
    mean_n_paste = round(mean(n_paste, na.rm = TRUE), 2),
    mean_n_mouse = round(mean(n_mouse, na.rm = TRUE), 2),
    mean_defocus_n = round(mean(defocus_n, na.rm = TRUE), 2),
    prop_webdriver = round(mean(webdriver_flag == 1, na.rm = TRUE), 3),
    prop_honeypot = round(mean(honeypot_flag == 1, na.rm = TRUE), 3),
    .groups = "drop"
  ) %>%
  dplyr::arrange(dplyr::desc(mean_bot_risk_score))

cluster_profile
# A tibble: 6 × 13
  bot_cluster     n prop_sample mean_bot_risk_score mean_vt_prop_correct
  <fct>       <int>       <dbl>               <dbl>                <dbl>
1 5               4       0.058               2.5                  0.25 
2 3               1       0.014               2                    0.833
3 2              10       0.145               1.6                  0.817
4 6               3       0.043               1                    0.833
5 1               5       0.072               0.8                  0.9  
6 4              46       0.667               0.261                0.967
# ℹ 8 more variables: mean_vt_mean_rt <dbl>, mean_attr_response_sd <dbl>,
#   mean_duration_seconds <dbl>, mean_n_paste <dbl>, mean_n_mouse <dbl>,
#   mean_defocus_n <dbl>, prop_webdriver <dbl>, prop_honeypot <dbl>
# identify highest-risk cluster
high_risk_cluster <- cluster_profile %>%
  dplyr::slice(1) %>%
  dplyr::pull(bot_cluster)

suspected_bot_table <- bot_df %>%
  dplyr::mutate(suspected_bot_cluster = ifelse(bot_cluster == high_risk_cluster, 1, 0)) %>%
  dplyr::select(
    PROLIFIC_PID, result_id, bot_cluster, suspected_bot_cluster, bot_risk_score,
    vt_wrong_count, vt_prop_correct, vt_mean_rt, attr_response_sd, duration_seconds,
    webdriver_flag, honeypot_flag, n_paste, n_mouse, defocus_n
  ) %>%
  dplyr::arrange(dplyr::desc(suspected_bot_cluster), dplyr::desc(bot_risk_score), vt_prop_correct)

DT::datatable(
  suspected_bot_table,
  caption = paste0("Bot-risk profiling table (best k = ", best_k, ")"),
  options = list(pageLength = 12, scrollX = TRUE)
)

Interpetation:

cluster_profile
# A tibble: 6 × 13
  bot_cluster     n prop_sample mean_bot_risk_score mean_vt_prop_correct
  <fct>       <int>       <dbl>               <dbl>                <dbl>
1 5               4       0.058               2.5                  0.25 
2 3               1       0.014               2                    0.833
3 2              10       0.145               1.6                  0.817
4 6               3       0.043               1                    0.833
5 1               5       0.072               0.8                  0.9  
6 4              46       0.667               0.261                0.967
# ℹ 8 more variables: mean_vt_mean_rt <dbl>, mean_attr_response_sd <dbl>,
#   mean_duration_seconds <dbl>, mean_n_paste <dbl>, mean_n_mouse <dbl>,
#   mean_defocus_n <dbl>, prop_webdriver <dbl>, prop_honeypot <dbl>

Interpretation by cluster (ordered by concern): - Cluster 1 (n=5, 7.2%): highest concern
- risk=2.60 (highest), vt_correct=0.30 (very low), attr_sd=0.334 (very low)
- Pattern fits low-quality / likely IER, possibly suspicious responses. - Cluster 5 (n=1, 1.4%): anomaly case
- risk=2.00, moderate attr SD (0.707), decent visual accuracy (0.833)
- Single case; treat as individual outlier, not a stable subgroup. - Cluster 6 (n=9, 13.0%): moderate concern (slow responders)
- risk=1.33, vt_correct=0.87 (okay), very high RT (61742 ms), attr SD moderate (0.754)
- More like very slow/careful or distracted than clear bot pattern. - Cluster 2 (n=6, 8.7%): moderate concern
- risk=1.17, vt_correct=0.861, high RT (37262 ms), attr SD moderate-high (0.826)
- Could reflect slow/effortful respondents, not strong bot signal. - Cluster 4 (n=18, 26.1%): low concern
- risk=0.556, high visual accuracy (0.972), high attr SD (1.302)
- Looks like engaged human responding. - Cluster 3 (n=30, 43.5%): lowest concern / baseline majority
- risk=0.100, high accuracy (0.956), high attr SD (1.323)
- Most clearly normal-quality respondents.

# barplots of two extreme groups based on number of correctly answered visual traps:

tmp_pids_c1 <- suspected_bot_table$PROLIFIC_PID[suspected_bot_table$bot_cluster == 1]
tmp_pids_c1
[1] "698ce4ba931f89581ecc6d7d" "6998517b5258b155f123c651"
[3] "675c89cc4e1a6bad637f454f" "69cc50b7d8b69400a3cc5e0f"
[5] "69963fa97606bb3a27603d6f"
tmp <- rowSums(dat_ques[dat_ques$PROLIFIC_PID %in% tmp_pids_c1, str_subset(string = colnames(dat_ques), pattern = "^visual.*_correct$")])
barplot(table(tmp), main = "Cluster 1")

tmp_pids_notC1 <- suspected_bot_table$PROLIFIC_PID[suspected_bot_table$bot_cluster != 1]
tmp <- rowSums(dat_ques[dat_ques$PROLIFIC_PID %in% tmp_pids_notC1, str_subset(string = colnames(dat_ques), pattern = "^visual.*_correct$")])
barplot(table(tmp), main = "All other clusters")

# t-tests of two extreme groups based on number of correctly answered visual traps:


# Build analysis frame
attr_cols <- str_subset(string = colnames(dat_ques), pattern = "^attributesFutureSociety-")
visual_correct_cols <- str_subset(string = colnames(dat_ques), pattern = "^visual.*_correct$")
ttest_df <- dat_ques %>%
  mutate(
    group_c1 = ifelse(PROLIFIC_PID %in% tmp_pids_c1, "Cluster1", "OtherClusters"),
    duration_seconds_num = suppressWarnings(as.numeric(as.character(duration_seconds))),
    attr_response_sd = apply(
      select(., all_of(attr_cols)) %>% mutate(across(everything(), ~ suppressWarnings(as.numeric(as.character(.x))))),
      1,
      function(x) if (sum(!is.na(x)) < 2) NA_real_ else sd(x, na.rm = TRUE)
    ),
    visual_correct_sum = rowSums(
      select(., all_of(visual_correct_cols)) %>% mutate(across(everything(), ~ suppressWarnings(as.numeric(as.character(.x))))),
      na.rm = TRUE
    )
  ) %>%
  filter(group_c1 %in% c("Cluster1", "OtherClusters"))
# Descriptives by group
ttest_df %>%
  group_by(group_c1) %>%
  summarise(
    n = n(),
    mean_duration = mean(duration_seconds_num, na.rm = TRUE),
    sd_duration = sd(duration_seconds_num, na.rm = TRUE),
    mean_attr_sd = mean(attr_response_sd, na.rm = TRUE),
    sd_attr_sd = sd(attr_response_sd, na.rm = TRUE),
    mean_visual_correct_sum = mean(visual_correct_sum, na.rm = TRUE),
    .groups = "drop"
  )
# A tibble: 2 × 7
  group_c1          n mean_duration sd_duration mean_attr_sd sd_attr_sd
  <chr>         <int>         <dbl>       <dbl>        <dbl>      <dbl>
1 Cluster1          5         1385.        304.        0.564      0.674
2 OtherClusters    64          889.        487.        1.18       0.623
# ℹ 1 more variable: mean_visual_correct_sum <dbl>
# t-test 1: duration mean (Cluster 1 vs others)
t_duration <- t.test(duration_seconds_num ~ group_c1, data = ttest_df)
t_duration

    Welch Two Sample t-test

data:  duration_seconds_num by group_c1
t = 3.3318, df = 5.7467, p-value = 0.01682
alternative hypothesis: true difference in means between group Cluster1 and group OtherClusters is not equal to 0
95 percent confidence interval:
 128.0148 865.6290
sample estimates:
     mean in group Cluster1 mean in group OtherClusters 
                  1385.4000                    888.5781 
# t-test 2: attributes response SD mean (Cluster 1 vs others)
t_attr_sd <- t.test(attr_response_sd ~ group_c1, data = ttest_df)
t_attr_sd

    Welch Two Sample t-test

data:  attr_response_sd by group_c1
t = -1.963, df = 4.5588, p-value = 0.1124
alternative hypothesis: true difference in means between group Cluster1 and group OtherClusters is not equal to 0
95 percent confidence interval:
 -1.436668  0.213220
sample estimates:
     mean in group Cluster1 mean in group OtherClusters 
                  0.5642195                   1.1759433 
# Optional effect sizes (Cohen's d)
cohens_d <- function(x, g) {
  x1 <- x[g == "Cluster1"]; x2 <- x[g == "OtherClusters"]
  x1 <- x1[!is.na(x1)]; x2 <- x2[!is.na(x2)]
  n1 <- length(x1); n2 <- length(x2)
  s1 <- sd(x1); s2 <- sd(x2)
  sp <- sqrt(((n1 - 1) * s1^2 + (n2 - 1) * s2^2) / (n1 + n2 - 2))
  (mean(x1) - mean(x2)) / sp
}
d_duration <- cohens_d(ttest_df$duration_seconds_num, ttest_df$group_c1)
d_attr_sd <- cohens_d(ttest_df$attr_response_sd, ttest_df$group_c1)
d_duration
[1] 1.038941
d_attr_sd
[1] -0.9769174
# check copy & paste events of single respondent:

tmp_pids <- suspected_bot_table$PROLIFIC_PID[suspected_bot_table$bot_cluster == 5]
tmp_pids
[1] "69b86e94de9c503e9a58411d" "69c4c8d23e5ba793afd0e843"
[3] "69d2b682cb4fce2cc6c59711" "69c4c8d23e5ba793afd0e843"
dat_para_events[dat_para_events$PROLIFIC_PID %in% tmp_pids & dat_para_events$event_type %in% c("copy", "paste"), ]
# A tibble: 1 × 22
  PROLIFIC_PID result_id origin event_type event_index timestamp component field
  <chr>        <chr>     <chr>  <chr>            <int> <chr>     <chr>     <chr>
1 69b86e94de9… 15252     parad… copy                 1 2026-04-… Scenario… <NA> 
# ℹ 14 more variables: value <chr>, mouse_x <dbl>, mouse_y <dbl>,
#   scroll_x <dbl>, scroll_y <dbl>, key_interval_ms <dbl>,
#   clipboard_type <chr>, clipboard_text <chr>, clipboard_length <dbl>,
#   target_tag <chr>, target_type <chr>, target_id <chr>, target_name <chr>,
#   target_className <chr>

8 Descriptive statistics dependent variables

8.1 descriptive mean differences

assoc_cols <- c("assoc1_valence", "assoc2_valence", "assoc3_valence", "assoc4_valence", "assoc5_valence")
attr_cols <- paste0("attributesFutureSociety-", 1:8)

clean_complete <- dat_ques %>%
  mutate(
    assoc_valence_mean = rowMeans(select(., all_of(assoc_cols)), na.rm = TRUE),
    attributes_mean = rowMeans(select(., all_of(attr_cols)), na.rm = TRUE)
  )

analysis_df <- clean_complete %>%
  select(PROLIFIC_PID, condition_FutureSociety, assoc_valence_mean, attributes_mean, all_of(attr_cols)) %>%
  drop_na(assoc_valence_mean, attributes_mean, condition_FutureSociety)

mean_summary <- analysis_df %>%
  pivot_longer(
    cols = c(assoc_valence_mean, attributes_mean),
    names_to = "measure",
    values_to = "value"
  ) %>%
  group_by(condition_FutureSociety, measure) %>%
  summarise(
    mean = mean(value, na.rm = TRUE),
    sd = sd(value, na.rm = TRUE),
    n = dplyr::n(),
    se = sd / sqrt(n),
    .groups = "drop"
  ) %>%
  mutate(
    measure = recode(
      measure,
      assoc_valence_mean = "Association Valence Mean",
      attributes_mean = "Attributes Mean"
    )
  )

ggplot(mean_summary, aes(x = condition_FutureSociety, y = mean, fill = measure)) +
  geom_col(position = position_dodge(width = 0.8), width = 0.7) +
  geom_errorbar(
    aes(ymin = mean - se, ymax = mean + se),
    position = position_dodge(width = 0.8),
    width = 0.2
  ) +
  labs(x = "Condition", y = "Mean", fill = "Measure") +
  theme_minimal()

anova_assoc <- afex::aov_ez(
  id = "PROLIFIC_PID",
  dv = "assoc_valence_mean",
  between = "condition_FutureSociety",
  data = analysis_df
)
Contrasts set to contr.sum for the following variables: condition_FutureSociety
anova_attr <- afex::aov_ez(
  id = "PROLIFIC_PID",
  dv = "attributes_mean",
  between = "condition_FutureSociety",
  data = analysis_df
)
Contrasts set to contr.sum for the following variables: condition_FutureSociety
summary(anova_assoc)
Anova Table (Type 3 tests)

Response: assoc_valence_mean
                        num Df den Df    MSE      F     ges  Pr(>F)  
condition_FutureSociety      6     61 2.7046 2.6343 0.20579 0.02451 *
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
summary(anova_attr)
Anova Table (Type 3 tests)

Response: attributes_mean
                        num Df den Df    MSE      F     ges   Pr(>F)   
condition_FutureSociety      6     61 1.4635 3.1599 0.23711 0.009164 **
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
attr_labels <- c(
  "1" = "Utopian",
  "2" = "Desirable",
  "3" = "Ideal",
  "4" = "Beneficial for the greater good",
  "5" = "Imaginative",
  "6" = "Innovative",
  "7" = "Creative",
  "8" = "Possible"
)

attr_long <- analysis_df %>%
  pivot_longer(
    cols = all_of(attr_cols),
    names_to = "item",
    values_to = "value"
  ) %>%
  mutate(
    item_num = stringr::str_extract(item, "[0-9]+"),
    item_label = recode(item_num, !!!attr_labels)
  )

attr_effects <- attr_long %>%
  filter(!is.na(value)) %>%
  group_by(item_label) %>%
  summarise(
    eta_sq = {
      fit <- aov(value ~ condition_FutureSociety)
      ss <- summary(fit)[[1]]
      ss_between <- ss["condition_FutureSociety", "Sum Sq"]
      ss_total <- sum(ss[, "Sum Sq"])
      if (is.na(ss_between) || is.na(ss_total) || ss_total == 0) NA_real_ else ss_between / ss_total
    },
    .groups = "drop"
  ) %>%
  mutate(
    eta_label = ifelse(is.na(eta_sq), "eta^2=NA", paste0("eta^2=", round(eta_sq, 2)))
  )

attr_effects_table <- attr_effects %>%
  arrange(desc(eta_sq))

attr_effects_table
# A tibble: 8 × 3
  item_label                      eta_sq eta_label 
  <chr>                            <dbl> <chr>     
1 Innovative                      0.251  eta^2=0.25
2 Desirable                       0.227  eta^2=0.23
3 Ideal                           0.220  eta^2=0.22
4 Utopian                         0.208  eta^2=0.21
5 Beneficial for the greater good 0.176  eta^2=0.18
6 Creative                        0.169  eta^2=0.17
7 Imaginative                     0.159  eta^2=0.16
8 Possible                        0.0144 eta^2=0.01
item_order <- attr_effects %>%
  mutate(eta_order = ifelse(is.na(eta_sq), -Inf, eta_sq)) %>%
  arrange(desc(eta_order)) %>%
  pull(item_label)

label_order <- attr_effects %>%
  mutate(eta_order = ifelse(is.na(eta_sq), -Inf, eta_sq)) %>%
  arrange(desc(eta_order)) %>%
  mutate(facet_label = paste0(item_label, "\n", eta_label)) %>%
  pull(facet_label)

attr_summary <- attr_long %>%
  group_by(condition_FutureSociety, item_label) %>%
  summarise(
    mean = mean(value, na.rm = TRUE),
    sd = sd(value, na.rm = TRUE),
    n = dplyr::n(),
    se = sd / sqrt(n),
    .groups = "drop"
  ) %>%
  left_join(attr_effects %>% select(item_label, eta_label), by = "item_label") %>%
  mutate(
    item_label = factor(item_label, levels = item_order),
    facet_label = factor(paste0(item_label, "\n", eta_label), levels = label_order)
  )

condition_order <- analysis_df %>%
  distinct(condition_FutureSociety) %>%
  arrange(condition_FutureSociety) %>%
  pull(condition_FutureSociety)

attr_summary <- attr_summary %>%
  mutate(condition_FutureSociety = factor(condition_FutureSociety, levels = condition_order))

ggplot(attr_summary, aes(x = condition_FutureSociety, y = mean, fill = condition_FutureSociety)) +
  geom_col(width = 0.7) +
  geom_errorbar(aes(ymin = mean - se, ymax = mean + se), width = 0.2) +
  geom_text(aes(label = round(mean, 2)), vjust = -0.3, size = 3) +
  facet_wrap(~ facet_label, ncol = 4) +
  labs(x = "Condition", y = "Mean", fill = "Condition") +
  theme_minimal() +
  theme(legend.position = "bottom", axis.text.x = element_text(angle = 30, hjust = 1))

tmp <- clean_complete[, str_subset(string = colnames(clean_complete), pattern = "attri.*[1-9]")]
psych::corPlot(r = cor(tmp, use = "pairwise"))

hist(clean_complete$`attributesFutureSociety-8`)

hist(clean_complete$`attributesFutureSociety-4`)

tmp_efa <- psych::fa.parallel(x = tmp)

Parallel analysis suggests that the number of factors =  2  and the number of components =  1 
tmp_fa <- psych::fa(r = cor(tmp, use = "pairwise"), nfactors = 2, n.obs = nrow(tmp))
Loading required namespace: GPArotation
tmp_fa
Factor Analysis using method =  minres
Call: psych::fa(r = cor(tmp, use = "pairwise"), nfactors = 2, n.obs = nrow(tmp))
Standardized loadings (pattern matrix) based upon correlation matrix
                            MR1   MR2    h2    u2 com
attributesFutureSociety-5 -0.10  0.95 0.793 0.207 1.0
attributesFutureSociety-6  0.25  0.57 0.576 0.424 1.4
attributesFutureSociety-2  0.97 -0.03 0.911 0.089 1.0
attributesFutureSociety-3  0.96 -0.01 0.917 0.083 1.0
attributesFutureSociety-4  0.68  0.18 0.661 0.339 1.1
attributesFutureSociety-7  0.07  0.91 0.914 0.086 1.0
attributesFutureSociety-1  0.27  0.43 0.412 0.588 1.7
attributesFutureSociety-8  0.23 -0.01 0.052 0.948 1.0

                       MR1  MR2
SS loadings           2.75 2.49
Proportion Var        0.34 0.31
Cumulative Var        0.34 0.65
Proportion Explained  0.52 0.48
Cumulative Proportion 0.52 1.00

 With factor correlations of 
     MR1  MR2
MR1 1.00 0.65
MR2 0.65 1.00

Mean item complexity =  1.1
Test of the hypothesis that 2 factors are sufficient.

df null model =  28  with the objective function =  6.38 with Chi Square =  411.24
df of  the model are 13  and the objective function was  0.41 

The root mean square of the residuals (RMSR) is  0.03 
The df corrected root mean square of the residuals is  0.04 

The harmonic n.obs is  69 with the empirical chi square  3.16  with prob <  1 
The total n.obs was  69  with Likelihood Chi Square =  25.81  with prob <  0.018 

Tucker Lewis Index of factoring reliability =  0.926
RMSEA index =  0.119  and the 90 % confidence intervals are  0.048 0.188
BIC =  -29.23
Fit based upon off diagonal values = 1
Measures of factor score adequacy             
                                                   MR1  MR2
Correlation of (regression) scores with factors   0.94 0.92
Multiple R square of scores with factors          0.87 0.84
Minimum correlation of possible factor scores     0.75 0.68