Data preperation

Author

Julius Fenn

1 Notes

This script creates a simplified participant-level dataset using the raw study.rds file and produces:

A clean wide table with core variables + associations (wide) + keyStats
Paradata events in long format (clipboard, mouse, scroll, keyIntervals)
Defocus events in long format

2 global variables

createRawFiles <- TRUE

data_root <- "data"
output_root <- "outputs"
raw_output_dir <- file.path(output_root, "raw")
questionnaire_output_dir <- file.path(output_root, "questionnaire")
paradata_output_dir <- file.path(output_root, "paradata")

3 functions

last_non_empty <- function(x) {
  if (length(x) == 0) return(NA)
  for (i in seq_along(x)[length(x):1]) {
    val <- x[[i]]
    if (is.null(val)) next
    if (is.list(val) && length(val) == 0) next
    if (!is.list(val) && length(val) == 1 && is.na(val)) next
    if (is.character(val) && length(val) == 1 && isTRUE(val == "")) next
    return(val)
  }
  return(NA)
}

`%||%` <- function(x, y) {
  if (is.null(x)) y else x
}

parse_duration_seconds <- function(x) {
  if (is.na(x) || x == "") return(NA_real_)
  clean <- stringr::str_remove(x, "\\s*\\(.*\\)")
  if (is.na(clean) || clean == "") return(NA_real_)
  as.numeric(hms::as_hms(clean))
}

null_to_empty <- function(x) {
  if (is.null(x)) list() else x
}

normalize_cell <- function(val) {
  if (is.null(val)) return(NA)
  if (is.data.frame(val)) return(list(val))
  if (is.list(val)) {
    if (length(val) == 0) return(NA)
    return(list(val))
  }
  if (length(val) != 1) return(list(val))
  val
}

collapse_list_to_string <- function(x, sep = " // ") {
  if (is.null(x)) return(NA_character_)
  if (is.list(x)) {
    if (length(x) == 0) return(NA_character_)
    flat <- unlist(x, recursive = TRUE, use.names = FALSE)
    if (length(flat) == 0) return(NA_character_)
    return(paste0(flat, collapse = sep))
  }
  if (length(x) > 1) return(paste0(x, collapse = sep))
  as.character(x)
}

sanitize_paradata_name <- function(name) {
  cleaned <- gsub("[^A-Za-z0-9]+", "_", name)
  cleaned <- gsub("^_+|_+$", "", cleaned)
  paste0("para_", cleaned)
}

init_paradata_result <- function() {
  list(
    wide_vals = list(),
    wide_cols = character(),
    wide_cols_output = character(),
    long_rows = list(),
    long_cols = character()
  )
}

append_paradata_result <- function(target, source) {
  target$wide_vals <- c(target$wide_vals, source$wide_vals)
  target$wide_cols <- c(target$wide_cols, source$wide_cols)
  target$wide_cols_output <- c(target$wide_cols_output, source$wide_cols_output)
  target$long_rows <- c(target$long_rows, source$long_rows)
  target$long_cols <- c(target$long_cols, source$long_cols)
  target
}

add_paradata_wide <- function(res, field_label, value) {
  out_col <- sanitize_paradata_name(field_label)
  res$wide_vals[[out_col]] <- value
  res$wide_cols <- c(res$wide_cols, field_label)
  res$wide_cols_output <- c(res$wide_cols_output, out_col)
  res
}

add_paradata_long_row <- function(res, pid, result_id, field_label, value, row_index, timestamp) {
  res$long_rows[[length(res$long_rows) + 1]] <- tibble::tibble(
    PROLIFIC_PID = pid,
    result_id = result_id,
    origin = "paradata_general",
    event_type = "paradata_column",
    event_index = row_index,
    timestamp = timestamp %||% NA_character_,
    component = NA_character_,
    field = field_label,
    value = value,
    mouse_x = NA_real_,
    mouse_y = NA_real_,
    scroll_x = NA_real_,
    scroll_y = NA_real_,
    key_interval_ms = NA_real_,
    clipboard_type = NA_character_,
    clipboard_text = NA_character_,
    clipboard_length = NA_real_,
    target_tag = NA_character_,
    target_type = NA_character_,
    target_id = NA_character_,
    target_name = NA_character_,
    target_className = NA_character_
  )
  res$long_cols <- c(res$long_cols, field_label)
  res
}

get_event_value <- function(ev, name) {
  if (is.null(ev)) return(NA)
  if (is.data.frame(ev)) return(ev[[name]] %||% NA)
  if (is.list(ev)) return(ev[[name]] %||% NA)
  NA
}

get_target_value <- function(ev, field) {
  if (is.null(ev)) return(NA)
  if (is.data.frame(ev)) {
    col <- paste0("target.", field)
    return(ev[[col]] %||% NA)
  }
  if (is.list(ev) && !is.null(ev$target)) return(ev$target[[field]] %||% NA)
  NA
}

is_non_empty_element <- function(x) {
  if (is.null(x)) return(FALSE)
  if (is.data.frame(x)) return(nrow(x) > 0)
  if (is.list(x)) {
    if (length(x) == 0) return(FALSE)
    return(!all(purrr::map_lgl(x, ~ is.null(.x) || (is.list(.x) && length(.x) == 0))))
  }
  length(x) > 0
}

find_last_non_empty_element <- function(col_vals) {
  if (is.null(col_vals)) return(list(value = NULL, index = NA_integer_))
  if (!is.list(col_vals)) {
    if (length(col_vals) == 0) return(list(value = NULL, index = NA_integer_))
    return(list(value = col_vals, index = length(col_vals)))
  }
  for (i in seq_along(col_vals)[length(col_vals):1]) {
    v <- col_vals[[i]]
    if (is_non_empty_element(v)) return(list(value = v, index = i))
  }
  list(value = NULL, index = NA_integer_)
}

process_group_df <- function(df, prefix_label, timestamp_vec, pid, result_id) {
  res <- init_paradata_result()
  if (is.null(df) || !is.data.frame(df)) return(res)

  for (col in names(df)) {
    col_vals <- df[[col]]
    field_label <- paste0(prefix_label, ".", col)

    if (is.data.frame(col_vals)) {
      nested_res <- process_group_df(col_vals, field_label, timestamp_vec, pid, result_id)
      res <- append_paradata_result(res, nested_res)
      next
    }

    if (is.list(col_vals)) {
      vals_json <- vapply(col_vals, function(v) {
        if (is.null(v) || (is.list(v) && length(v) == 0)) return(NA_character_)
        jsonlite::toJSON(v, auto_unbox = TRUE)
      }, character(1))

      non_na <- vals_json[!is.na(vals_json)]
      if (length(non_na) == 0) next
      uniq <- unique(non_na)

      if (length(uniq) == 1) {
        res <- add_paradata_wide(res, field_label, uniq[[1]])
      } else {
        for (r in seq_along(vals_json)) {
          v <- vals_json[[r]]
          if (is.na(v)) next
          ts <- if (!is.null(timestamp_vec) && length(timestamp_vec) >= r) timestamp_vec[[r]] else NA_character_
          res <- add_paradata_long_row(res, pid, result_id, field_label, v, r, ts)
        }
      }
      next
    }

    non_na <- col_vals[!is.na(col_vals)]
    if (length(non_na) == 0) next
    uniq <- unique(non_na)

    if (length(uniq) == 1) {
      res <- add_paradata_wide(res, field_label, uniq[[1]])
    } else {
      for (r in seq_along(col_vals)) {
        v <- col_vals[[r]]
        if (is.na(v)) next
        ts <- if (!is.null(timestamp_vec) && length(timestamp_vec) >= r) timestamp_vec[[r]] else NA_character_
        res <- add_paradata_long_row(res, pid, result_id, field_label, as.character(v), r, ts)
      }
    }
  }

  res
}

flatten_with_prefix <- function(x, prefix) {
  if (is.null(x)) return(list())
  flat <- unlist(x, recursive = TRUE, use.names = TRUE)
  if (length(flat) == 0) return(list())
  names(flat) <- paste0(prefix, names(flat))
  as.list(flat)
}

assoc_to_wide <- function(assoc_list) {
  if (is.null(assoc_list) || length(assoc_list) == 0) {
    return(tibble::tibble(
      assoc1_response = NA_character_, assoc1_valence = NA_character_,
      assoc2_response = NA_character_, assoc2_valence = NA_character_,
      assoc3_response = NA_character_, assoc3_valence = NA_character_,
      assoc4_response = NA_character_, assoc4_valence = NA_character_,
      assoc5_response = NA_character_, assoc5_valence = NA_character_
    ))
  }

  assoc_df <- dplyr::as_tibble(assoc_list)
  assoc_df <- assoc_df[seq_len(min(5, nrow(assoc_df))), ]

  get_val <- function(df, col, idx) {
    if (!col %in% names(df)) return(NA_character_)
    if (nrow(df) < idx) return(NA_character_)
    df[[col]][idx]
  }

  tibble::tibble(
    assoc1_response = get_val(assoc_df, "response", 1),
    assoc1_valence = get_val(assoc_df, "valence", 1),
    assoc2_response = get_val(assoc_df, "response", 2),
    assoc2_valence = get_val(assoc_df, "valence", 2),
    assoc3_response = get_val(assoc_df, "response", 3),
    assoc3_valence = get_val(assoc_df, "valence", 3),
    assoc4_response = get_val(assoc_df, "response", 4),
    assoc4_valence = get_val(assoc_df, "valence", 4),
    assoc5_response = get_val(assoc_df, "response", 5),
    assoc5_valence = get_val(assoc_df, "valence", 5)
  )
}

build_paradata_events <- function(pid, result_id, paradata) {
  if (is.null(paradata) || is.null(paradata$behavior)) return(tibble::tibble())

  events <- list()
  behavior <- paradata$behavior

  clipboard_events <- null_to_empty(behavior$clipboardEvents)
  if (length(clipboard_events)) {
    events[[length(events) + 1]] <- purrr::map_dfr(seq_along(clipboard_events), function(i) {
      ev <- clipboard_events[[i]]
      target <- ev$target
      tibble::tibble(
        PROLIFIC_PID = pid,
        result_id = result_id,
        origin = "paradata_general",
        event_type = "clipboard",
        event_index = i,
        timestamp = ev$timestamp %||% NA_character_,
      component = ev$component %||% NA_character_,
      field = NA_character_,
      value = NA_character_,
      mouse_x = NA_real_,
      mouse_y = NA_real_,
        scroll_x = NA_real_,
        scroll_y = NA_real_,
        key_interval_ms = NA_real_,
        clipboard_type = ev$type %||% NA_character_,
        clipboard_text = ev$text %||% NA_character_,
        clipboard_length = ev$length %||% NA_real_,
        target_tag = target$tag %||% NA_character_,
        target_type = target$type %||% NA_character_,
        target_id = target$id %||% NA_character_,
        target_name = target$name %||% NA_character_,
        target_className = target$className %||% NA_character_
      )
    })
  }

  mouse_events <- null_to_empty(behavior$mouseEvents)
  if (length(mouse_events)) {
    events[[length(events) + 1]] <- purrr::map_dfr(seq_along(mouse_events), function(i) {
      ev <- mouse_events[[i]]
      tibble::tibble(
        PROLIFIC_PID = pid,
        result_id = result_id,
        origin = "paradata_general",
        event_type = "mouse",
        event_index = i,
        timestamp = ev$timestamp %||% NA_character_,
      component = ev$component %||% NA_character_,
      field = NA_character_,
      value = NA_character_,
      mouse_x = ev$x %||% NA_real_,
      mouse_y = ev$y %||% NA_real_,
        scroll_x = NA_real_,
        scroll_y = NA_real_,
        key_interval_ms = NA_real_,
        clipboard_type = NA_character_,
        clipboard_text = NA_character_,
        clipboard_length = NA_real_,
        target_tag = NA_character_,
        target_type = NA_character_,
        target_id = NA_character_,
        target_name = NA_character_,
        target_className = NA_character_
      )
    })
  }

  scroll_events <- null_to_empty(behavior$scrollEvents)
  if (length(scroll_events)) {
    events[[length(events) + 1]] <- purrr::map_dfr(seq_along(scroll_events), function(i) {
      ev <- scroll_events[[i]]
      tibble::tibble(
        PROLIFIC_PID = pid,
        result_id = result_id,
        origin = "paradata_general",
        event_type = "scroll",
        event_index = i,
        timestamp = ev$timestamp %||% NA_character_,
      component = ev$component %||% NA_character_,
      field = NA_character_,
      value = NA_character_,
      mouse_x = NA_real_,
      mouse_y = NA_real_,
        scroll_x = ev$scrollX %||% NA_real_,
        scroll_y = ev$scrollY %||% NA_real_,
        key_interval_ms = NA_real_,
        clipboard_type = NA_character_,
        clipboard_text = NA_character_,
        clipboard_length = NA_real_,
        target_tag = NA_character_,
        target_type = NA_character_,
        target_id = NA_character_,
        target_name = NA_character_,
        target_className = NA_character_
      )
    })
  }


  if (length(events) == 0) return(tibble::tibble())
  dplyr::bind_rows(events)
}

process_paradata_general <- function(pid, result_id, paradata) {
  empty <- list(
    wide = tibble::tibble(),
    long = tibble::tibble(),
    wide_cols = character(),
    wide_cols_output = character(),
    long_cols = character(),
    key_stats = tibble::tibble(
      keyStats_count = NA_real_,
      keyStats_mean = NA_real_,
      keyStats_median = NA_real_,
      keyStats_min = NA_real_,
      keyStats_max = NA_real_
    )
  )

  if (is.null(paradata)) return(empty)

  if (is.data.frame(paradata) && all(c("static", "ip", "behavior", "meta") %in% names(paradata))) {
    static_df <- paradata$static
    ip_df <- paradata$ip
    behavior_df <- paradata$behavior
    meta_df <- paradata$meta
    ip_error <- paradata$ipError

    timestamps <- NULL
    if (is.data.frame(static_df) && "timestamp" %in% names(static_df)) {
      timestamps <- static_df$timestamp
    } else if (is.data.frame(meta_df) && "updatedAt" %in% names(meta_df)) {
      timestamps <- meta_df$updatedAt
    }

    res <- init_paradata_result()
    res <- append_paradata_result(res, process_group_df(static_df, "static", timestamps, pid, result_id))
    res <- append_paradata_result(res, process_group_df(ip_df, "ip", timestamps, pid, result_id))
    res <- append_paradata_result(res, process_group_df(meta_df, "meta", timestamps, pid, result_id))

    if (!is.null(ip_error)) {
      non_na <- ip_error[!is.na(ip_error)]
      if (length(non_na) == 1) {
        res <- add_paradata_wide(res, "ipError", non_na[[1]])
      } else if (length(non_na) > 1) {
        for (r in seq_along(ip_error)) {
          v <- ip_error[[r]]
          if (is.na(v)) next
          ts <- if (!is.null(timestamps) && length(timestamps) >= r) timestamps[[r]] else NA_character_
          res <- add_paradata_long_row(res, pid, result_id, "ipError", as.character(v), r, ts)
        }
      }
    }

    key_stats <- empty$key_stats
    if (is.data.frame(behavior_df) && "keyStats" %in% names(behavior_df)) {
      key_df <- behavior_df$keyStats
      if (is.data.frame(key_df)) {
        key_stats <- tibble::tibble(
          keyStats_count = if ("count" %in% names(key_df)) last_non_empty(key_df$count) else NA_real_,
          keyStats_mean = if ("mean" %in% names(key_df)) last_non_empty(key_df$mean) else NA_real_,
          keyStats_median = if ("median" %in% names(key_df)) last_non_empty(key_df$median) else NA_real_,
          keyStats_min = if ("min" %in% names(key_df)) last_non_empty(key_df$min) else NA_real_,
          keyStats_max = if ("max" %in% names(key_df)) last_non_empty(key_df$max) else NA_real_
        )
      }
    }

    long_rows <- list()
    if (is.data.frame(behavior_df)) {
      if ("clipboardEvents" %in% names(behavior_df)) {
        last_clip <- find_last_non_empty_element(behavior_df$clipboardEvents)
        ev_list <- last_clip$value
        if (!is.null(ev_list)) {
          if (is.data.frame(ev_list)) {
            for (j in seq_len(nrow(ev_list))) {
              ev <- ev_list[j, ]
              long_rows[[length(long_rows) + 1]] <- tibble::tibble(
                PROLIFIC_PID = pid,
                result_id = result_id,
                origin = "paradata_general",
                event_type = "clipboard",
                event_index = j,
                timestamp = get_event_value(ev, "timestamp") %||% (if (!is.null(timestamps) && !is.na(last_clip$index) && length(timestamps) >= last_clip$index) timestamps[[last_clip$index]] else NA_character_),
                component = get_event_value(ev, "component"),
                field = NA_character_,
                value = NA_character_,
                mouse_x = NA_real_,
                mouse_y = NA_real_,
                scroll_x = NA_real_,
                scroll_y = NA_real_,
                key_interval_ms = NA_real_,
                clipboard_type = get_event_value(ev, "type"),
                clipboard_text = get_event_value(ev, "text"),
                clipboard_length = get_event_value(ev, "length"),
                target_tag = get_target_value(ev, "tag"),
                target_type = get_target_value(ev, "type"),
                target_id = get_target_value(ev, "id"),
                target_name = get_target_value(ev, "name"),
                target_className = get_target_value(ev, "className")
              )
            }
            res$long_cols <- c(res$long_cols, "behavior.clipboardEvents")
          } else if (is.list(ev_list)) {
            ev_list <- null_to_empty(ev_list)
            for (j in seq_along(ev_list)) {
              ev <- ev_list[[j]]
              long_rows[[length(long_rows) + 1]] <- tibble::tibble(
                PROLIFIC_PID = pid,
                result_id = result_id,
                origin = "paradata_general",
                event_type = "clipboard",
                event_index = j,
                timestamp = get_event_value(ev, "timestamp") %||% (if (!is.null(timestamps) && !is.na(last_clip$index) && length(timestamps) >= last_clip$index) timestamps[[last_clip$index]] else NA_character_),
                component = get_event_value(ev, "component"),
                field = NA_character_,
                value = NA_character_,
                mouse_x = NA_real_,
                mouse_y = NA_real_,
                scroll_x = NA_real_,
                scroll_y = NA_real_,
                key_interval_ms = NA_real_,
                clipboard_type = get_event_value(ev, "type"),
                clipboard_text = get_event_value(ev, "text"),
                clipboard_length = get_event_value(ev, "length"),
                target_tag = get_target_value(ev, "tag"),
                target_type = get_target_value(ev, "type"),
                target_id = get_target_value(ev, "id"),
                target_name = get_target_value(ev, "name"),
                target_className = get_target_value(ev, "className")
              )
            }
            res$long_cols <- c(res$long_cols, "behavior.clipboardEvents")
          }
        }
      }

      if ("mouseEvents" %in% names(behavior_df)) {
        last_mouse <- find_last_non_empty_element(behavior_df$mouseEvents)
        ev_df <- last_mouse$value
        if (!is.null(ev_df)) {
          if (is.data.frame(ev_df)) {
            for (j in seq_len(nrow(ev_df))) {
              ev <- ev_df[j, ]
              long_rows[[length(long_rows) + 1]] <- tibble::tibble(
                PROLIFIC_PID = pid,
                result_id = result_id,
                origin = "paradata_general",
                event_type = "mouse",
                event_index = j,
                timestamp = ev$timestamp %||% (if (!is.null(timestamps) && !is.na(last_mouse$index) && length(timestamps) >= last_mouse$index) timestamps[[last_mouse$index]] else NA_character_),
                component = ev$component %||% NA_character_,
                field = NA_character_,
                value = NA_character_,
                mouse_x = ev$x %||% NA_real_,
                mouse_y = ev$y %||% NA_real_,
                scroll_x = NA_real_,
                scroll_y = NA_real_,
                key_interval_ms = NA_real_,
                clipboard_type = NA_character_,
                clipboard_text = NA_character_,
                clipboard_length = NA_real_,
                target_tag = NA_character_,
                target_type = NA_character_,
                target_id = NA_character_,
                target_name = NA_character_,
                target_className = NA_character_
              )
            }
            res$long_cols <- c(res$long_cols, "behavior.mouseEvents")
          } else if (is.list(ev_df)) {
            ev_df <- null_to_empty(ev_df)
            for (j in seq_along(ev_df)) {
              ev <- ev_df[[j]]
              long_rows[[length(long_rows) + 1]] <- tibble::tibble(
                PROLIFIC_PID = pid,
                result_id = result_id,
                origin = "paradata_general",
                event_type = "mouse",
                event_index = j,
                timestamp = get_event_value(ev, "timestamp") %||% (if (!is.null(timestamps) && !is.na(last_mouse$index) && length(timestamps) >= last_mouse$index) timestamps[[last_mouse$index]] else NA_character_),
                component = get_event_value(ev, "component"),
                field = NA_character_,
                value = NA_character_,
                mouse_x = get_event_value(ev, "x"),
                mouse_y = get_event_value(ev, "y"),
                scroll_x = NA_real_,
                scroll_y = NA_real_,
                key_interval_ms = NA_real_,
                clipboard_type = NA_character_,
                clipboard_text = NA_character_,
                clipboard_length = NA_real_,
                target_tag = NA_character_,
                target_type = NA_character_,
                target_id = NA_character_,
                target_name = NA_character_,
                target_className = NA_character_
              )
            }
            res$long_cols <- c(res$long_cols, "behavior.mouseEvents")
          }
        }
      }

      if ("scrollEvents" %in% names(behavior_df)) {
        last_scroll <- find_last_non_empty_element(behavior_df$scrollEvents)
        ev_df <- last_scroll$value
        if (!is.null(ev_df)) {
          if (is.data.frame(ev_df)) {
            for (j in seq_len(nrow(ev_df))) {
              ev <- ev_df[j, ]
              long_rows[[length(long_rows) + 1]] <- tibble::tibble(
                PROLIFIC_PID = pid,
                result_id = result_id,
                origin = "paradata_general",
                event_type = "scroll",
                event_index = j,
                timestamp = ev$timestamp %||% (if (!is.null(timestamps) && !is.na(last_scroll$index) && length(timestamps) >= last_scroll$index) timestamps[[last_scroll$index]] else NA_character_),
                component = ev$component %||% NA_character_,
                field = NA_character_,
                value = NA_character_,
                mouse_x = NA_real_,
                mouse_y = NA_real_,
                scroll_x = ev$scrollX %||% NA_real_,
                scroll_y = ev$scrollY %||% NA_real_,
                key_interval_ms = NA_real_,
                clipboard_type = NA_character_,
                clipboard_text = NA_character_,
                clipboard_length = NA_real_,
                target_tag = NA_character_,
                target_type = NA_character_,
                target_id = NA_character_,
                target_name = NA_character_,
                target_className = NA_character_
              )
            }
            res$long_cols <- c(res$long_cols, "behavior.scrollEvents")
          } else if (is.list(ev_df)) {
            ev_df <- null_to_empty(ev_df)
            for (j in seq_along(ev_df)) {
              ev <- ev_df[[j]]
              long_rows[[length(long_rows) + 1]] <- tibble::tibble(
                PROLIFIC_PID = pid,
                result_id = result_id,
                origin = "paradata_general",
                event_type = "scroll",
                event_index = j,
                timestamp = get_event_value(ev, "timestamp") %||% (if (!is.null(timestamps) && !is.na(last_scroll$index) && length(timestamps) >= last_scroll$index) timestamps[[last_scroll$index]] else NA_character_),
                component = get_event_value(ev, "component"),
                field = NA_character_,
                value = NA_character_,
                mouse_x = NA_real_,
                mouse_y = NA_real_,
                scroll_x = get_event_value(ev, "scrollX"),
                scroll_y = get_event_value(ev, "scrollY"),
                key_interval_ms = NA_real_,
                clipboard_type = NA_character_,
                clipboard_text = NA_character_,
                clipboard_length = NA_real_,
                target_tag = NA_character_,
                target_type = NA_character_,
                target_id = NA_character_,
                target_name = NA_character_,
                target_className = NA_character_
              )
            }
            res$long_cols <- c(res$long_cols, "behavior.scrollEvents")
          }
        }
      }

    }

    if (length(long_rows)) {
      res$long_rows <- c(res$long_rows, long_rows)
    }

    return(list(
      wide = if (length(res$wide_vals)) tibble::as_tibble(res$wide_vals) else tibble::tibble(),
      long = if (length(res$long_rows)) dplyr::bind_rows(res$long_rows) else tibble::tibble(),
      wide_cols = res$wide_cols,
      wide_cols_output = res$wide_cols_output,
      long_cols = res$long_cols,
      key_stats = key_stats
    ))
  }

  if (is.list(paradata)) {
    wide_vals <- c(
      flatten_with_prefix(paradata$static, "para_static_"),
      flatten_with_prefix(paradata$ip, "para_ip_"),
      flatten_with_prefix(paradata$meta, "para_meta_")
    )

    key_stats <- tibble::tibble(
      keyStats_count = paradata$behavior$keyStats$count %||% NA_real_,
      keyStats_mean = paradata$behavior$keyStats$mean %||% NA_real_,
      keyStats_median = paradata$behavior$keyStats$median %||% NA_real_,
      keyStats_min = paradata$behavior$keyStats$min %||% NA_real_,
      keyStats_max = paradata$behavior$keyStats$max %||% NA_real_
    )

    return(list(
      wide = if (length(wide_vals)) tibble::as_tibble(wide_vals) else tibble::tibble(),
      long = build_paradata_events(pid, result_id, paradata),
      wide_cols = names(wide_vals),
      wide_cols_output = names(wide_vals),
      long_cols = character(),
      key_stats = key_stats
    ))
  }

  empty
}

build_defocus_events <- function(pid, result_id, defocus_list) {
  if (is.null(defocus_list)) return(tibble::tibble())

  if (is.data.frame(defocus_list)) {
    if (nrow(defocus_list) == 0) return(tibble::tibble())
    return(dplyr::mutate(
      tibble::as_tibble(defocus_list),
      PROLIFIC_PID = pid,
      result_id = result_id,
      origin = "paradata_focus",
      event_index = dplyr::row_number()
    ) %>%
      dplyr::select(PROLIFIC_PID, result_id, origin, event_index, durationblur, senderblur))
  }

  if (is.list(defocus_list)) {
    if (length(defocus_list) == 0) return(tibble::tibble())
    return(purrr::map_dfr(seq_along(defocus_list), function(i) {
      ev <- defocus_list[[i]]
      tibble::tibble(
        PROLIFIC_PID = pid,
        result_id = result_id,
        origin = "paradata_focus",
        event_index = i,
        durationblur = ev$durationblur %||% NA_real_,
        senderblur = ev$senderblur %||% NA_character_
      )
    }))
  }

  tibble::tibble()
}

is_missing_value <- function(x) {
  if (is.null(x)) return(TRUE)
  if (is.list(x)) {
    return(length(x) == 0 || all(is.na(unlist(x))))
  }
  if (length(x) > 1) {
    if (is.character(x)) return(all(is.na(x) | x == ""))
    return(all(is.na(x)))
  }
  if (is.character(x)) return(is.na(x) | x == "")
  is.na(x)
}

4 load packages

require(pacman)
p_load(
  "tidyverse",
  "jsonlite",
  "hms",
  "psych",
  "afex",
  "writexl",
  "readxl",
  "haven"
)

5 create raw data files

if (createRawFiles) {
  dir.create(raw_output_dir, recursive = TRUE, showWarnings = FALSE)

  data_files <- list.files(
    data_root,
    pattern = "data.txt$",
    recursive = TRUE,
    full.names = TRUE
  )

  results <- list()
  for (file in data_files) {
    result_folder <- basename(dirname(dirname(file)))
    result_id <- stringr::str_remove(result_folder, "^study_result_")

    tmp <- jsonlite::fromJSON(file)
    tmp[, colnames(tmp) == ""] <- NULL
    tmp <- dplyr::as_tibble(tmp)
    tmp$result_id <- as.character(result_id)

    results[[length(results) + 1]] <- tmp
  }

  saveRDS(results, file = file.path(raw_output_dir, "study.rds"))
  json_lines <- purrr::map_chr(results, ~ jsonlite::toJSON(.x, auto_unbox = TRUE))
  writeLines(json_lines, con = file.path(raw_output_dir, "study.jsonl"))

  print("Raw files have been created in this run!")
} else {
  print("Raw files have been created in previous run!")
}

[1] "Raw files have been created in this run!"

6 load JATOS metadata

metadata_files <- list.files(
  data_root,
  pattern = "jatos_results_metadata_.*\\.csv$",
  full.names = TRUE
)

metadata_file <- metadata_files[which.max(file.info(metadata_files)$mtime)]
metadata <- readr::read_csv(metadata_file, show_col_types = FALSE)
names(metadata) <- make.names(names(metadata))

metadata <- metadata %>%
  mutate(
    result_id = as.character(Result.ID),
    duration_seconds = purrr::map_dbl(Duration, parse_duration_seconds)
  )

metadata_small <- metadata %>%
  select(result_id, State, Duration, duration_seconds)

table(metadata_small$State)


FINISHED      PRE 
      68       20

7 load Prolific demographics

prolific_demographic_file <- file.path(
  data_root,
  "prolific_demographic_export_69d4ced16907d423ae787668.csv"
)

prolific_demographic_raw <- readr::read_csv(prolific_demographic_file, show_col_types = FALSE)
names(prolific_demographic_raw) <- make.names(names(prolific_demographic_raw))

prolific_demographic <- prolific_demographic_raw %>%
  rename(PROLIFIC_PID = Participant.id) %>%
  mutate(PROLIFIC_PID = as.character(PROLIFIC_PID)) %>%
  distinct(PROLIFIC_PID, .keep_all = TRUE) %>%
  rename_with(~ paste0("socidem_", .x), -PROLIFIC_PID)

8 build participant-level data

study <- readRDS(file.path(raw_output_dir, "study.rds"))

core_vars <- c(
  "dummy_informedconsent",
  "hp_exclusionCriteria",
  "commCheck",
  "counterVisualTraps",
  "visualTrapResponse",
  "visualTrap_Planets_response",
  "visualTrap_Planets_responseLabel",
  "visualTrap_Planets_correct",
  "visualTrap_Planets_rtMs",
  "visualTrap_Collision_response",
  "visualTrap_Collision_responseLabel",
  "visualTrap_Collision_correct",
  "visualTrap_Collision_rtMs",
  "visualTrap_CafeWall_response",
  "visualTrap_CafeWall_responseLabel",
  "visualTrap_CafeWall_correct",
  "visualTrap_CafeWall_rtMs",
  "visualTrap_Circles_response",
  "visualTrap_Circles_responseLabel",
  "visualTrap_Circles_correct",
  "visualTrap_Circles_rtMs",
  "visualTrap_Lines_response",
  "visualTrap_Lines_responseLabel",
  "visualTrap_Lines_correct",
  "visualTrap_Lines_rtMs",
  "visualTrap_Robot_response",
  "visualTrap_Robot_responseLabel",
  "visualTrap_Robot_correct",
  "visualTrap_Robot_rtMs",
  "visualTrap_order",
  "condition_FutureSociety",
  "cue",
  "cue_coding",
  "R1",
  "R2",
  "R3",
  "R4",
  "R5",
  "para_countclicks",
  "AR-2",
  "AR-5",
  "AR-3",
  "AR-1",
  "AR-4",
  "undUtopiaGeneral",
  "undUtopiaGoal",
  "undUtopiaHow",
  "clearUtopia",
  "clearUtopiatext",
  "biasUtopia",
  "biasUtopiatext",
  "attributesFutureSociety-5",
  "attributesFutureSociety-6",
  "attributesFutureSociety-2",
  "attributesFutureSociety-3",
  "attributesFutureSociety-4",
  "attributesFutureSociety-7",
  "attributesFutureSociety-1",
  "attributesFutureSociety-8",
  "utopiaPrototypeAssignment",
  "protoConfidenceRadio",
  "protoReasonText",
  "feedback_critic"
)

core_rows <- list()
paradata_events_rows <- list()
paradata_defocus_rows <- list()
paradata_wide_cols_rows <- list()
paradata_long_cols_rows <- list()

for (i in seq_along(study)) {
  print(i)
  df <- study[[i]]

  pid <- if ("PROLIFIC_PID" %in% names(df)) last_non_empty(df$PROLIFIC_PID) else NA
  result_id <- if ("result_id" %in% names(df)) last_non_empty(df$result_id) else NA

  core_values <- lapply(core_vars, function(v) {
    val <- if (v %in% names(df)) {
      last_non_empty(df[[v]])
    } else if (v == "attributesFutureSociety-8" && "attributesFutureSociety-undefined" %in% names(df)) {
      last_non_empty(df[["attributesFutureSociety-undefined"]])
    } else {
      NA
    }
    normalize_cell(val)
  })
  names(core_values) <- core_vars
  core_row <- tibble::as_tibble(core_values)

  if ("visualTrap_order" %in% names(core_row)) {
    core_row$visualTrap_order <- collapse_list_to_string(core_row$visualTrap_order)
  }

  assoc_list <- if ("sucsessfulAssociations" %in% names(df)) last_non_empty(df$sucsessfulAssociations) else NULL
  assoc_wide <- assoc_to_wide(assoc_list)

  paradata <- if ("paradata_general" %in% names(df)) df$paradata_general else NULL
  paradata_summary <- process_paradata_general(pid, result_id, paradata)
  paradata_wide <- paradata_summary$wide
  key_stats_row <- paradata_summary$key_stats

  core_row <- dplyr::bind_cols(
    tibble::tibble(result_id = result_id, PROLIFIC_PID = pid),
    core_row,
    assoc_wide,
    paradata_wide,
    key_stats_row
  )

  core_rows[[length(core_rows) + 1]] <- core_row

  paradata_events <- paradata_summary$long
  if (nrow(paradata_events) > 0) {
    paradata_events_rows[[length(paradata_events_rows) + 1]] <- paradata_events
  }

  if (length(paradata_summary$wide_cols) > 0) {
    paradata_wide_cols_rows[[length(paradata_wide_cols_rows) + 1]] <- tibble::tibble(
      PROLIFIC_PID = pid,
      result_id = result_id,
      field = paradata_summary$wide_cols,
      output_field = paradata_summary$wide_cols_output,
      destination = "wide"
    )
  }

  if (length(paradata_summary$long_cols) > 0) {
    paradata_long_cols_rows[[length(paradata_long_cols_rows) + 1]] <- tibble::tibble(
      PROLIFIC_PID = pid,
      result_id = result_id,
      field = paradata_summary$long_cols,
      output_field = NA_character_,
      destination = "long"
    )
  }

  defocus_list <- if ("para_defocuscount" %in% names(df)) last_non_empty(df$para_defocuscount) else NULL
  defocus_events <- build_defocus_events(pid, result_id, defocus_list)
  if (nrow(defocus_events) > 0) {
    paradata_defocus_rows[[length(paradata_defocus_rows) + 1]] <- defocus_events
  }
}

[1] 1
[1] 2
[1] 3
[1] 4
[1] 5
[1] 6
[1] 7
[1] 8
[1] 9
[1] 10
[1] 11
[1] 12
[1] 13
[1] 14
[1] 15
[1] 16
[1] 17
[1] 18
[1] 19
[1] 20
[1] 21
[1] 22
[1] 23
[1] 24
[1] 25
[1] 26
[1] 27
[1] 28
[1] 29
[1] 30
[1] 31
[1] 32
[1] 33
[1] 34
[1] 35
[1] 36
[1] 37
[1] 38
[1] 39
[1] 40
[1] 41
[1] 42
[1] 43
[1] 44
[1] 45
[1] 46
[1] 47
[1] 48
[1] 49
[1] 50
[1] 51
[1] 52
[1] 53
[1] 54
[1] 55
[1] 56
[1] 57
[1] 58
[1] 59
[1] 60
[1] 61
[1] 62
[1] 63
[1] 64
[1] 65
[1] 66
[1] 67
[1] 68
[1] 69
[1] 70
[1] 71
[1] 72
[1] 73
[1] 74
[1] 75
[1] 76
[1] 77
[1] 78
[1] 79
[1] 80
[1] 81
[1] 82
[1] 83
[1] 84
[1] 85
[1] 86
[1] 87
[1] 88

core_df <- dplyr::bind_rows(core_rows)
paradata_events_long <- dplyr::bind_rows(paradata_events_rows)
paradata_defocus_long <- dplyr::bind_rows(paradata_defocus_rows)
paradata_column_map <- dplyr::bind_rows(paradata_wide_cols_rows, paradata_long_cols_rows)

if (nrow(paradata_events_long) == 0) {
  paradata_events_long <- tibble::tibble(
    PROLIFIC_PID = character(),
    result_id = character(),
    origin = character(),
    event_type = character(),
    event_index = integer(),
    timestamp = character(),
    component = character(),
    field = character(),
    value = character(),
    mouse_x = double(),
    mouse_y = double(),
    scroll_x = double(),
    scroll_y = double(),
    key_interval_ms = double(),
    clipboard_type = character(),
    clipboard_text = character(),
    clipboard_length = double(),
    target_tag = character(),
    target_type = character(),
    target_id = character(),
    target_name = character(),
    target_className = character()
  )
}

if (nrow(paradata_defocus_long) == 0) {
  paradata_defocus_long <- tibble::tibble(
    PROLIFIC_PID = character(),
    result_id = character(),
    origin = character(),
    event_index = integer(),
    durationblur = double(),
    senderblur = character()
  )
}

if (nrow(paradata_column_map) == 0) {
  paradata_column_map <- tibble::tibble(
    PROLIFIC_PID = character(),
    result_id = character(),
    field = character(),
    output_field = character(),
    destination = character()
  )
}

core_df <- core_df %>%
  left_join(metadata_small, by = "result_id") %>%
  left_join(prolific_demographic, by = "PROLIFIC_PID")

9 filter to complete datasets

optional_text <- c("clearUtopiatext", "biasUtopiatext", "protoReasonText", "feedback_critic")

required_cols <- setdiff(
  c("PROLIFIC_PID",
    core_vars,
    "assoc1_response", "assoc2_response", "assoc3_response", "assoc4_response", "assoc5_response",
    "assoc1_valence", "assoc2_valence", "assoc3_valence", "assoc4_valence", "assoc5_valence"
  ),
  optional_text
)

table(core_df$State)


FINISHED      PRE 
      69       19

# clean_complete <- core_df
clean_complete <- core_df %>%
  filter(State == "FINISHED")

dim(clean_complete)

[1]  69 182

table(clean_complete$condition_FutureSociety)


 aicentered    anarchic    futurist    lawBased moderngreen primitivist 
          6          14          16           7          10           7 
  religious 
          8

to_numeric <- function(x) {
  if (is.numeric(x)) return(x)
  suppressWarnings(as.numeric(as.character(x)))
}

to_logical <- function(x) {
  if (is.logical(x)) return(x)
  if (is.factor(x)) x <- as.character(x)
  as.logical(x)
}

num_cols <- c(
  "counterVisualTraps",
  "para_countclicks",
  "visualTrap_Planets_rtMs",
  "visualTrap_Collision_rtMs",
  "visualTrap_CafeWall_rtMs",
  "visualTrap_Circles_rtMs",
  "visualTrap_Lines_rtMs",
  "visualTrap_Robot_rtMs",
  "AR-1",
  "AR-2",
  "AR-3",
  "AR-4",
  "AR-5",
  "clearUtopia",
  "biasUtopia",
  "attributesFutureSociety-1",
  "attributesFutureSociety-2",
  "attributesFutureSociety-3",
  "attributesFutureSociety-4",
  "attributesFutureSociety-5",
  "attributesFutureSociety-6",
  "attributesFutureSociety-7",
  "attributesFutureSociety-8",
  "protoConfidenceRadio",
  "assoc1_valence",
  "assoc2_valence",
  "assoc3_valence",
  "assoc4_valence",
  "assoc5_valence",
  "keyStats_count",
  "keyStats_mean",
  "keyStats_median",
  "keyStats_min",
  "keyStats_max",
  "duration_seconds",
  "socidem_Age"
)

bool_cols <- c(
  "hp_exclusionCriteria",
  "visualTrap_Planets_correct",
  "visualTrap_Collision_correct",
  "visualTrap_CafeWall_correct",
  "visualTrap_Circles_correct",
  "visualTrap_Lines_correct",
  "visualTrap_Robot_correct"
)

factor_cols <- c(
  "dummy_informedconsent",
  "commCheck",
  "condition_FutureSociety",
  "cue_coding",
  "utopiaPrototypeAssignment",
  "visualTrapResponse",
  "visualTrap_Planets_response",
  "visualTrap_Collision_response",
  "visualTrap_CafeWall_response",
  "visualTrap_Circles_response",
  "visualTrap_Lines_response",
  "visualTrap_Robot_response",
  "socidem_Sex",
  "socidem_Ethnicity.simplified",
  "socidem_Student.status",
  "socidem_Employment.status"
)

clean_complete <- clean_complete %>%
  mutate(
    across(any_of(bool_cols), ~ to_logical(.)),
    across(any_of(num_cols), ~ to_numeric(.)),
    across(any_of(factor_cols), ~ as.factor(.))
  )

10 save outputs

dir.create(questionnaire_output_dir, recursive = TRUE, showWarnings = FALSE)
dir.create(paradata_output_dir, recursive = TRUE, showWarnings = FALSE)

saveRDS(clean_complete, file = file.path(questionnaire_output_dir, "clean_complete_wide.rds"))
writexl::write_xlsx(x = clean_complete, path = file.path(questionnaire_output_dir, "clean_complete_wide.xlsx"))
write.csv(clean_complete, file = file.path(questionnaire_output_dir, "clean_complete_wide.csv"), row.names = FALSE)

saveRDS(paradata_events_long, file = file.path(paradata_output_dir, "paradata_events_long.rds"))
writexl::write_xlsx(x = paradata_events_long, path = file.path(paradata_output_dir, "paradata_events_long.xlsx"))
write.csv(paradata_events_long, file = file.path(paradata_output_dir, "paradata_events_long.csv"), row.names = FALSE)

saveRDS(paradata_defocus_long, file = file.path(paradata_output_dir, "paradata_defocus_long.rds"))
writexl::write_xlsx(x = paradata_defocus_long, path = file.path(paradata_output_dir, "paradata_defocus_long.xlsx"))
write.csv(paradata_defocus_long, file = file.path(paradata_output_dir, "paradata_defocus_long.csv"), row.names = FALSE)

writexl::write_xlsx(x = clean_complete[, c("PROLIFIC_PID", "condition_FutureSociety","clearUtopia", "clearUtopiatext", "biasUtopia", "biasUtopiatext", "protoConfidenceRadio", "protoReasonText")], path = file.path(questionnaire_output_dir, "clean_complete_wide_text.xlsx"))

11 Summary tables

paradata_column_summary <- paradata_column_map %>%
  group_by(destination, field, output_field) %>%
  summarise(n_participants = dplyr::n_distinct(PROLIFIC_PID), .groups = "drop") %>%
  arrange(destination, field)

paradata_participant_summary <- core_df %>%
  select(PROLIFIC_PID, result_id, State, duration_seconds) %>%
  distinct() %>%
  left_join(paradata_events_long %>% count(PROLIFIC_PID, name = "paradata_event_rows"), by = "PROLIFIC_PID") %>%
  left_join(paradata_defocus_long %>% count(PROLIFIC_PID, name = "defocus_event_rows"), by = "PROLIFIC_PID") %>%
  left_join(paradata_column_map %>% filter(destination == "wide") %>% count(PROLIFIC_PID, name = "paradata_wide_fields"), by = "PROLIFIC_PID") %>%
  left_join(paradata_column_map %>% filter(destination == "long") %>% count(PROLIFIC_PID, name = "paradata_long_fields"), by = "PROLIFIC_PID") %>%
  mutate(
    paradata_event_rows = tidyr::replace_na(paradata_event_rows, 0),
    defocus_event_rows = tidyr::replace_na(defocus_event_rows, 0),
    paradata_wide_fields = tidyr::replace_na(paradata_wide_fields, 0),
    paradata_long_fields = tidyr::replace_na(paradata_long_fields, 0)
  )

paradata_column_summary

# A tibble: 93 × 4
   destination field                            output_field n_participants
   <chr>       <chr>                            <chr>                 <int>
 1 long        behavior.clipboardEvents         <NA>                     10
 2 long        behavior.mouseEvents             <NA>                     84
 3 long        behavior.scrollEvents            <NA>                     84
 4 long        meta.updatedAt                   <NA>                     84
 5 long        static.network.downlink          <NA>                      9
 6 long        static.network.effectiveType     <NA>                      3
 7 long        static.network.rtt               <NA>                     19
 8 long        static.timestamp                 <NA>                     84
 9 long        static.viewport.devicePixelRatio <NA>                      2
10 long        static.viewport.innerHeight      <NA>                     12
# ℹ 83 more rows

paradata_participant_summary

# A tibble: 84 × 8
   PROLIFIC_PID             result_id State duration_seconds paradata_event_rows
   <chr>                    <chr>     <chr>            <dbl>               <int>
 1 6980f140677b5e0cbec2561d 15245     FINI…              538                 429
 2 69d46a112c7ba62da0198176 15246     FINI…              579                 724
 3 5f888f1504ea37030833e9ec 15247     FINI…             1561                 667
 4 62d43cee3d60ac98c1dcacc8 15248     FINI…             1062                1018
 5 583ef6f5ad2f4300014b358d 15249     FINI…              871                 504
 6 69ced52e0c7bfe72777b1f89 15250     FINI…              607                 684
 7 66ec53e096df305784c5aa6f 15251     FINI…             1324                 308
 8 69b86e94de9c503e9a58411d 15252     FINI…              504                 443
 9 697ca395ee14230291313ff7 15253     FINI…              587                 803
10 60f657824394dac9902c821f 15255     PRE                243                 246
# ℹ 74 more rows
# ℹ 3 more variables: defocus_event_rows <int>, paradata_wide_fields <int>,
#   paradata_long_fields <int>

12 Summary statistics

summary_stats_df <- clean_complete %>%
  select(
    PROLIFIC_PID,
    duration_seconds,
    keyStats_count,
    keyStats_mean,
    keyStats_median,
    keyStats_min,
    keyStats_max
  ) %>%
  left_join(
    paradata_participant_summary %>%
      select(PROLIFIC_PID, paradata_event_rows, defocus_event_rows, paradata_wide_fields, paradata_long_fields),
    by = "PROLIFIC_PID"
  ) %>%
  mutate(
    paradata_event_rows = tidyr::replace_na(paradata_event_rows, 0),
    defocus_event_rows = tidyr::replace_na(defocus_event_rows, 0),
    paradata_wide_fields = tidyr::replace_na(paradata_wide_fields, 0),
    paradata_long_fields = tidyr::replace_na(paradata_long_fields, 0)
  )

psych::describe(summary_stats_df %>% select(where(is.numeric)))

                     vars  n      mean        sd   median  trimmed      mad
duration_seconds        1 69    924.58    492.08   819.00   877.49   455.16
keyStats_count          2 69    514.55    328.10   395.00   470.81   201.63
keyStats_mean           3 69    963.88    671.07   773.74   870.03   547.06
keyStats_median         4 69    208.15     95.48   183.95   196.63    66.05
keyStats_min            5 69     22.40     34.76     6.60    15.28     9.49
keyStats_max            6 69 120610.58 141136.65 65536.60 93808.39 61007.95
paradata_event_rows     7 69    738.38    396.62   667.00   677.18   244.63
defocus_event_rows      8 69      1.06      1.82     0.00     0.68     0.00
paradata_wide_fields    9 69     78.59     12.96    78.00    76.74     1.48
paradata_long_fields   10 69     63.77     30.24    46.00    58.58     2.97
                         min       max    range skew kurtosis       se
duration_seconds      194.00   2149.00   1955.0 0.82    -0.14    59.24
keyStats_count          3.00   1714.00   1711.0 1.47     2.18    39.50
keyStats_mean         227.77   3031.07   2803.3 1.23     0.84    80.79
keyStats_median        40.20    523.80    483.6 1.30     1.85    11.49
keyStats_min            0.00    179.70    179.7 2.30     5.72     4.18
keyStats_max         1197.70 738032.00 736834.3 2.32     5.89 16990.86
paradata_event_rows   253.00   2267.00   2014.0 1.79     3.61    47.75
defocus_event_rows      0.00      9.00      9.0 2.04     4.23     0.22
paradata_wide_fields   69.00    152.00     83.0 5.24    26.82     1.56
paradata_long_fields   42.00    150.00    108.0 1.36     0.46     3.64

12.1 descriptive sample

socidem_df <- clean_complete %>%
  select(starts_with("socidem_"))

psych::describe(socidem_df %>% select(where(is.numeric)))

                        vars  n    mean      sd median trimmed     mad min
socidem_Time.taken         1 69  959.57  512.36    832  905.11  443.30 200
socidem_Total.approvals    2 69 2064.29 2744.69    976 1557.72 1365.47   1
socidem_Age                3 69   29.29    4.39     30   29.60    5.93  20
                          max range  skew kurtosis     se
socidem_Time.taken       2276  2076  0.88    -0.06  61.68
socidem_Total.approvals 12802 12801  1.91     3.76 330.42
socidem_Age                35    15 -0.44    -0.94   0.53

table(socidem_df$socidem_Sex)


Female   Male 
    28     41

table(socidem_df$socidem_Employment.status)


                                            DATA_EXPIRED 
                                                      12 
            Due to start a new job within the next month 
                                                       2 
                                               Full-Time 
                                                      27 
Not in paid work (e.g. homemaker', 'retired or disabled) 
                                                       6 
                                                   Other 
                                                       1 
                                               Part-Time 
                                                      15 
                            Unemployed (and job seeking) 
                                                       6

12.2 descriptive mean differences

assoc_cols <- c("assoc1_valence", "assoc2_valence", "assoc3_valence", "assoc4_valence", "assoc5_valence")
attr_cols <- paste0("attributesFutureSociety-", 1:8)

clean_complete <- clean_complete %>%
  mutate(
    assoc_valence_mean = rowMeans(select(., all_of(assoc_cols)), na.rm = TRUE),
    attributes_mean = rowMeans(select(., all_of(attr_cols)), na.rm = TRUE)
  )

analysis_df <- clean_complete %>%
  select(PROLIFIC_PID, condition_FutureSociety, assoc_valence_mean, attributes_mean, all_of(attr_cols)) %>%
  drop_na(assoc_valence_mean, attributes_mean, condition_FutureSociety)

mean_summary <- analysis_df %>%
  pivot_longer(
    cols = c(assoc_valence_mean, attributes_mean),
    names_to = "measure",
    values_to = "value"
  ) %>%
  group_by(condition_FutureSociety, measure) %>%
  summarise(
    mean = mean(value, na.rm = TRUE),
    sd = sd(value, na.rm = TRUE),
    n = dplyr::n(),
    se = sd / sqrt(n),
    .groups = "drop"
  ) %>%
  mutate(
    measure = recode(
      measure,
      assoc_valence_mean = "Association Valence Mean",
      attributes_mean = "Attributes Mean"
    )
  )

ggplot(mean_summary, aes(x = condition_FutureSociety, y = mean, fill = measure)) +
  geom_col(position = position_dodge(width = 0.8), width = 0.7) +
  geom_errorbar(
    aes(ymin = mean - se, ymax = mean + se),
    position = position_dodge(width = 0.8),
    width = 0.2
  ) +
  labs(x = "Condition", y = "Mean", fill = "Measure") +
  theme_minimal()

anova_assoc <- afex::aov_ez(
  id = "PROLIFIC_PID",
  dv = "assoc_valence_mean",
  between = "condition_FutureSociety",
  data = analysis_df
)

Contrasts set to contr.sum for the following variables: condition_FutureSociety

anova_attr <- afex::aov_ez(
  id = "PROLIFIC_PID",
  dv = "attributes_mean",
  between = "condition_FutureSociety",
  data = analysis_df
)

Contrasts set to contr.sum for the following variables: condition_FutureSociety

summary(anova_assoc)

Anova Table (Type 3 tests)

Response: assoc_valence_mean
                        num Df den Df    MSE      F     ges  Pr(>F)  
condition_FutureSociety      6     61 2.7046 2.6343 0.20579 0.02451 *
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

summary(anova_attr)

Anova Table (Type 3 tests)

Response: attributes_mean
                        num Df den Df    MSE      F     ges   Pr(>F)   
condition_FutureSociety      6     61 1.4635 3.1599 0.23711 0.009164 **
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

attr_labels <- c(
  "1" = "Utopian",
  "2" = "Desirable",
  "3" = "Ideal",
  "4" = "Beneficial for the greater good",
  "5" = "Imaginative",
  "6" = "Innovative",
  "7" = "Creative",
  "8" = "Possible"
)

attr_long <- analysis_df %>%
  pivot_longer(
    cols = all_of(attr_cols),
    names_to = "item",
    values_to = "value"
  ) %>%
  mutate(
    item_num = stringr::str_extract(item, "[0-9]+"),
    item_label = recode(item_num, !!!attr_labels)
  )

attr_effects <- attr_long %>%
  filter(!is.na(value)) %>%
  group_by(item_label) %>%
  summarise(
    eta_sq = {
      fit <- aov(value ~ condition_FutureSociety)
      ss <- summary(fit)[[1]]
      ss_between <- ss["condition_FutureSociety", "Sum Sq"]
      ss_total <- sum(ss[, "Sum Sq"])
      if (is.na(ss_between) || is.na(ss_total) || ss_total == 0) NA_real_ else ss_between / ss_total
    },
    .groups = "drop"
  ) %>%
  mutate(
    eta_label = ifelse(is.na(eta_sq), "eta^2=NA", paste0("eta^2=", round(eta_sq, 2)))
  )

attr_effects_table <- attr_effects %>%
  arrange(desc(eta_sq))

attr_effects_table

# A tibble: 8 × 3
  item_label                      eta_sq eta_label 
  <chr>                            <dbl> <chr>     
1 Innovative                      0.251  eta^2=0.25
2 Desirable                       0.227  eta^2=0.23
3 Ideal                           0.220  eta^2=0.22
4 Utopian                         0.208  eta^2=0.21
5 Beneficial for the greater good 0.176  eta^2=0.18
6 Creative                        0.169  eta^2=0.17
7 Imaginative                     0.159  eta^2=0.16
8 Possible                        0.0144 eta^2=0.01

item_order <- attr_effects %>%
  mutate(eta_order = ifelse(is.na(eta_sq), -Inf, eta_sq)) %>%
  arrange(desc(eta_order)) %>%
  pull(item_label)

label_order <- attr_effects %>%
  mutate(eta_order = ifelse(is.na(eta_sq), -Inf, eta_sq)) %>%
  arrange(desc(eta_order)) %>%
  mutate(facet_label = paste0(item_label, "\n", eta_label)) %>%
  pull(facet_label)

attr_summary <- attr_long %>%
  group_by(condition_FutureSociety, item_label) %>%
  summarise(
    mean = mean(value, na.rm = TRUE),
    sd = sd(value, na.rm = TRUE),
    n = dplyr::n(),
    se = sd / sqrt(n),
    .groups = "drop"
  ) %>%
  left_join(attr_effects %>% select(item_label, eta_label), by = "item_label") %>%
  mutate(
    item_label = factor(item_label, levels = item_order),
    facet_label = factor(paste0(item_label, "\n", eta_label), levels = label_order)
  )

condition_order <- analysis_df %>%
  distinct(condition_FutureSociety) %>%
  arrange(condition_FutureSociety) %>%
  pull(condition_FutureSociety)

attr_summary <- attr_summary %>%
  mutate(condition_FutureSociety = factor(condition_FutureSociety, levels = condition_order))

ggplot(attr_summary, aes(x = condition_FutureSociety, y = mean, fill = condition_FutureSociety)) +
  geom_col(width = 0.7) +
  geom_errorbar(aes(ymin = mean - se, ymax = mean + se), width = 0.2) +
  geom_text(aes(label = round(mean, 2)), vjust = -0.3, size = 3) +
  facet_wrap(~ facet_label, ncol = 4) +
  labs(x = "Condition", y = "Mean", fill = "Condition") +
  theme_minimal() +
  theme(legend.position = "bottom", axis.text.x = element_text(angle = 30, hjust = 1))

tmp <- clean_complete[, str_subset(string = colnames(clean_complete), pattern = "attri.*[1-9]")]
psych::corPlot(r = cor(tmp, use = "pairwise"))

hist(clean_complete$`attributesFutureSociety-8`)

hist(clean_complete$`attributesFutureSociety-4`)

tmp_efa <- psych::fa.parallel(x = tmp)

Parallel analysis suggests that the number of factors =  2  and the number of components =  1

tmp_fa <- psych::fa(r = cor(tmp, use = "pairwise"), nfactors = 2, n.obs = nrow(tmp))

Loading required namespace: GPArotation

tmp_fa

Factor Analysis using method =  minres
Call: psych::fa(r = cor(tmp, use = "pairwise"), nfactors = 2, n.obs = nrow(tmp))
Standardized loadings (pattern matrix) based upon correlation matrix
                            MR1   MR2    h2    u2 com
attributesFutureSociety-5 -0.10  0.95 0.793 0.207 1.0
attributesFutureSociety-6  0.25  0.57 0.576 0.424 1.4
attributesFutureSociety-2  0.97 -0.03 0.911 0.089 1.0
attributesFutureSociety-3  0.96 -0.01 0.917 0.083 1.0
attributesFutureSociety-4  0.68  0.18 0.661 0.339 1.1
attributesFutureSociety-7  0.07  0.91 0.914 0.086 1.0
attributesFutureSociety-1  0.27  0.43 0.412 0.588 1.7
attributesFutureSociety-8  0.23 -0.01 0.052 0.948 1.0

                       MR1  MR2
SS loadings           2.75 2.49
Proportion Var        0.34 0.31
Cumulative Var        0.34 0.65
Proportion Explained  0.52 0.48
Cumulative Proportion 0.52 1.00

 With factor correlations of 
     MR1  MR2
MR1 1.00 0.65
MR2 0.65 1.00

Mean item complexity =  1.1
Test of the hypothesis that 2 factors are sufficient.

df null model =  28  with the objective function =  6.38 with Chi Square =  411.24
df of  the model are 13  and the objective function was  0.41 

The root mean square of the residuals (RMSR) is  0.03 
The df corrected root mean square of the residuals is  0.04 

The harmonic n.obs is  69 with the empirical chi square  3.16  with prob <  1 
The total n.obs was  69  with Likelihood Chi Square =  25.81  with prob <  0.018 

Tucker Lewis Index of factoring reliability =  0.926
RMSEA index =  0.119  and the 90 % confidence intervals are  0.048 0.188
BIC =  -29.23
Fit based upon off diagonal values = 1
Measures of factor score adequacy             
                                                   MR1  MR2
Correlation of (regression) scores with factors   0.94 0.92
Multiple R square of scores with factors          0.87 0.84
Minimum correlation of possible factor scores     0.75 0.68