Data preperation

Author

Julius Fenn

1 Notes

This script creates a simplified participant-level dataset using the raw study.rds file and produces:

  • A clean wide table with core variables + associations (wide) + keyStats
  • Paradata events in long format (clipboard, mouse, scroll, keyIntervals)
  • Defocus events in long format

2 global variables

createRawFiles <- TRUE

data_root <- "data"
output_root <- "outputs"
raw_output_dir <- file.path(output_root, "raw")
questionnaire_output_dir <- file.path(output_root, "questionnaire")
paradata_output_dir <- file.path(output_root, "paradata")

3 functions

last_non_empty <- function(x) {
  if (length(x) == 0) return(NA)
  for (i in seq_along(x)[length(x):1]) {
    val <- x[[i]]
    if (is.null(val)) next
    if (is.list(val) && length(val) == 0) next
    if (!is.list(val) && length(val) == 1 && is.na(val)) next
    if (is.character(val) && length(val) == 1 && isTRUE(val == "")) next
    return(val)
  }
  return(NA)
}

`%||%` <- function(x, y) {
  if (is.null(x)) y else x
}

parse_duration_seconds <- function(x) {
  if (is.na(x) || x == "") return(NA_real_)
  clean <- stringr::str_remove(x, "\\s*\\(.*\\)")
  if (is.na(clean) || clean == "") return(NA_real_)
  as.numeric(hms::as_hms(clean))
}

null_to_empty <- function(x) {
  if (is.null(x)) list() else x
}

normalize_cell <- function(val) {
  if (is.null(val)) return(NA)
  if (is.data.frame(val)) return(list(val))
  if (is.list(val)) {
    if (length(val) == 0) return(NA)
    return(list(val))
  }
  if (length(val) != 1) return(list(val))
  val
}

collapse_list_to_string <- function(x, sep = " // ") {
  if (is.null(x)) return(NA_character_)
  if (is.list(x)) {
    if (length(x) == 0) return(NA_character_)
    flat <- unlist(x, recursive = TRUE, use.names = FALSE)
    if (length(flat) == 0) return(NA_character_)
    return(paste0(flat, collapse = sep))
  }
  if (length(x) > 1) return(paste0(x, collapse = sep))
  as.character(x)
}

sanitize_paradata_name <- function(name) {
  cleaned <- gsub("[^A-Za-z0-9]+", "_", name)
  cleaned <- gsub("^_+|_+$", "", cleaned)
  paste0("para_", cleaned)
}

init_paradata_result <- function() {
  list(
    wide_vals = list(),
    wide_cols = character(),
    wide_cols_output = character(),
    long_rows = list(),
    long_cols = character()
  )
}

append_paradata_result <- function(target, source) {
  target$wide_vals <- c(target$wide_vals, source$wide_vals)
  target$wide_cols <- c(target$wide_cols, source$wide_cols)
  target$wide_cols_output <- c(target$wide_cols_output, source$wide_cols_output)
  target$long_rows <- c(target$long_rows, source$long_rows)
  target$long_cols <- c(target$long_cols, source$long_cols)
  target
}

add_paradata_wide <- function(res, field_label, value) {
  out_col <- sanitize_paradata_name(field_label)
  res$wide_vals[[out_col]] <- value
  res$wide_cols <- c(res$wide_cols, field_label)
  res$wide_cols_output <- c(res$wide_cols_output, out_col)
  res
}

add_paradata_long_row <- function(res, pid, result_id, field_label, value, row_index, timestamp) {
  res$long_rows[[length(res$long_rows) + 1]] <- tibble::tibble(
    PROLIFIC_PID = pid,
    result_id = result_id,
    origin = "paradata_general",
    event_type = "paradata_column",
    event_index = row_index,
    timestamp = timestamp %||% NA_character_,
    component = NA_character_,
    field = field_label,
    value = value,
    mouse_x = NA_real_,
    mouse_y = NA_real_,
    scroll_x = NA_real_,
    scroll_y = NA_real_,
    key_interval_ms = NA_real_,
    clipboard_type = NA_character_,
    clipboard_text = NA_character_,
    clipboard_length = NA_real_,
    target_tag = NA_character_,
    target_type = NA_character_,
    target_id = NA_character_,
    target_name = NA_character_,
    target_className = NA_character_
  )
  res$long_cols <- c(res$long_cols, field_label)
  res
}

get_event_value <- function(ev, name) {
  if (is.null(ev)) return(NA)
  if (is.data.frame(ev)) return(ev[[name]] %||% NA)
  if (is.list(ev)) return(ev[[name]] %||% NA)
  NA
}

target_type_fallback <- function(type_val, tag_val) {
  if (!is.null(type_val) && length(type_val) > 0 && !is.na(type_val) && type_val != "") {
    return(as.character(type_val))
  }
  if (!is.null(tag_val) && length(tag_val) > 0 && !is.na(tag_val) && tag_val != "") {
    return(tolower(as.character(tag_val)))
  }
  NA_character_
}

get_target_value <- function(ev, field) {
  if (is.null(ev)) return(NA)
  if (is.data.frame(ev)) {
    col <- paste0("target.", field)
    if (col %in% names(ev)) {
      val <- ev[[col]]
      if (length(val) == 0) return(NA)
      return(val[[1]] %||% NA)
    }

    if ("target" %in% names(ev)) {
      target_col <- ev[["target"]]
      if (is.list(target_col) && length(target_col) > 0) {
        target_obj <- target_col[[1]]
        if (is.data.frame(target_obj) && field %in% names(target_obj)) {
          val <- target_obj[[field]]
          if (length(val) == 0) return(NA)
          return(val[[1]] %||% NA)
        }
        if (is.list(target_obj)) return(target_obj[[field]] %||% NA)
      }
      if (is.data.frame(target_col) && field %in% names(target_col)) {
        val <- target_col[[field]]
        if (length(val) == 0) return(NA)
        return(val[[1]] %||% NA)
      }
    }

    return(NA)
  }
  if (is.list(ev) && !is.null(ev$target)) return(ev$target[[field]] %||% NA)
  NA
}

is_non_empty_element <- function(x) {
  if (is.null(x)) return(FALSE)
  if (is.data.frame(x)) return(nrow(x) > 0)
  if (is.list(x)) {
    if (length(x) == 0) return(FALSE)
    return(!all(purrr::map_lgl(x, ~ is.null(.x) || (is.list(.x) && length(.x) == 0))))
  }
  length(x) > 0
}

find_last_non_empty_element <- function(col_vals) {
  if (is.null(col_vals)) return(list(value = NULL, index = NA_integer_))
  if (!is.list(col_vals)) {
    if (length(col_vals) == 0) return(list(value = NULL, index = NA_integer_))
    return(list(value = col_vals, index = length(col_vals)))
  }
  for (i in seq_along(col_vals)[length(col_vals):1]) {
    v <- col_vals[[i]]
    if (is_non_empty_element(v)) return(list(value = v, index = i))
  }
  list(value = NULL, index = NA_integer_)
}

process_group_df <- function(df, prefix_label, timestamp_vec, pid, result_id) {
  res <- init_paradata_result()
  if (is.null(df) || !is.data.frame(df)) return(res)

  for (col in names(df)) {
    col_vals <- df[[col]]
    field_label <- paste0(prefix_label, ".", col)

    if (is.data.frame(col_vals)) {
      nested_res <- process_group_df(col_vals, field_label, timestamp_vec, pid, result_id)
      res <- append_paradata_result(res, nested_res)
      next
    }

    if (is.list(col_vals)) {
      vals_json <- vapply(col_vals, function(v) {
        if (is.null(v) || (is.list(v) && length(v) == 0)) return(NA_character_)
        jsonlite::toJSON(v, auto_unbox = TRUE)
      }, character(1))

      non_na <- vals_json[!is.na(vals_json)]
      if (length(non_na) == 0) next
      uniq <- unique(non_na)

      if (length(uniq) == 1) {
        res <- add_paradata_wide(res, field_label, uniq[[1]])
      } else {
        for (r in seq_along(vals_json)) {
          v <- vals_json[[r]]
          if (is.na(v)) next
          ts <- if (!is.null(timestamp_vec) && length(timestamp_vec) >= r) timestamp_vec[[r]] else NA_character_
          res <- add_paradata_long_row(res, pid, result_id, field_label, v, r, ts)
        }
      }
      next
    }

    non_na <- col_vals[!is.na(col_vals)]
    if (length(non_na) == 0) next
    uniq <- unique(non_na)

    if (length(uniq) == 1) {
      res <- add_paradata_wide(res, field_label, uniq[[1]])
    } else {
      for (r in seq_along(col_vals)) {
        v <- col_vals[[r]]
        if (is.na(v)) next
        ts <- if (!is.null(timestamp_vec) && length(timestamp_vec) >= r) timestamp_vec[[r]] else NA_character_
        res <- add_paradata_long_row(res, pid, result_id, field_label, as.character(v), r, ts)
      }
    }
  }

  res
}

flatten_with_prefix <- function(x, prefix) {
  if (is.null(x)) return(list())
  flat <- unlist(x, recursive = TRUE, use.names = TRUE)
  if (length(flat) == 0) return(list())
  names(flat) <- paste0(prefix, names(flat))
  as.list(flat)
}

assoc_to_wide <- function(assoc_list) {
  if (is.null(assoc_list) || length(assoc_list) == 0) {
    return(tibble::tibble(
      assoc1_response = NA_character_, assoc1_valence = NA_character_,
      assoc2_response = NA_character_, assoc2_valence = NA_character_,
      assoc3_response = NA_character_, assoc3_valence = NA_character_,
      assoc4_response = NA_character_, assoc4_valence = NA_character_,
      assoc5_response = NA_character_, assoc5_valence = NA_character_
    ))
  }

  assoc_df <- dplyr::as_tibble(assoc_list)
  assoc_df <- assoc_df[seq_len(min(5, nrow(assoc_df))), ]

  get_val <- function(df, col, idx) {
    if (!col %in% names(df)) return(NA_character_)
    if (nrow(df) < idx) return(NA_character_)
    df[[col]][idx]
  }

  tibble::tibble(
    assoc1_response = get_val(assoc_df, "response", 1),
    assoc1_valence = get_val(assoc_df, "valence", 1),
    assoc2_response = get_val(assoc_df, "response", 2),
    assoc2_valence = get_val(assoc_df, "valence", 2),
    assoc3_response = get_val(assoc_df, "response", 3),
    assoc3_valence = get_val(assoc_df, "valence", 3),
    assoc4_response = get_val(assoc_df, "response", 4),
    assoc4_valence = get_val(assoc_df, "valence", 4),
    assoc5_response = get_val(assoc_df, "response", 5),
    assoc5_valence = get_val(assoc_df, "valence", 5)
  )
}

build_paradata_events <- function(pid, result_id, paradata) {
  if (is.null(paradata) || is.null(paradata$behavior)) return(tibble::tibble())

  events <- list()
  behavior <- paradata$behavior

  clipboard_events <- null_to_empty(behavior$clipboardEvents)
  if (length(clipboard_events)) {
    events[[length(events) + 1]] <- purrr::map_dfr(seq_along(clipboard_events), function(i) {
      ev <- clipboard_events[[i]]
      target <- ev$target
      tibble::tibble(
        PROLIFIC_PID = pid,
        result_id = result_id,
        origin = "paradata_general",
        event_type = ev$type %||% "clipboard",
        event_index = i,
        timestamp = ev$timestamp %||% NA_character_,
      component = ev$component %||% NA_character_,
      field = NA_character_,
      value = NA_character_,
      mouse_x = NA_real_,
      mouse_y = NA_real_,
        scroll_x = NA_real_,
        scroll_y = NA_real_,
        key_interval_ms = NA_real_,
        clipboard_type = ev$type %||% NA_character_,
        clipboard_text = ev$text %||% NA_character_,
        clipboard_length = ev$length %||% NA_real_,
        target_tag = target$tag %||% NA_character_,
        target_type = target_type_fallback(target$type %||% NA_character_, target$tag %||% NA_character_),
        target_id = target$id %||% NA_character_,
        target_name = target$name %||% NA_character_,
        target_className = target$className %||% NA_character_
      )
    })
  }

  mouse_events <- null_to_empty(behavior$mouseEvents)
  if (length(mouse_events)) {
    events[[length(events) + 1]] <- purrr::map_dfr(seq_along(mouse_events), function(i) {
      ev <- mouse_events[[i]]
      tibble::tibble(
        PROLIFIC_PID = pid,
        result_id = result_id,
        origin = "paradata_general",
        event_type = "mousemove",
        event_index = i,
        timestamp = ev$timestamp %||% NA_character_,
      component = ev$component %||% NA_character_,
      field = NA_character_,
      value = NA_character_,
      mouse_x = ev$x %||% NA_real_,
      mouse_y = ev$y %||% NA_real_,
        scroll_x = NA_real_,
        scroll_y = NA_real_,
        key_interval_ms = NA_real_,
        clipboard_type = NA_character_,
        clipboard_text = NA_character_,
        clipboard_length = NA_real_,
        target_tag = NA_character_,
        target_type = NA_character_,
        target_id = NA_character_,
        target_name = NA_character_,
        target_className = NA_character_
      )
    })
  }

  scroll_events <- null_to_empty(behavior$scrollEvents)
  if (length(scroll_events)) {
    events[[length(events) + 1]] <- purrr::map_dfr(seq_along(scroll_events), function(i) {
      ev <- scroll_events[[i]]
      tibble::tibble(
        PROLIFIC_PID = pid,
        result_id = result_id,
        origin = "paradata_general",
        event_type = "scroll",
        event_index = i,
        timestamp = ev$timestamp %||% NA_character_,
      component = ev$component %||% NA_character_,
      field = NA_character_,
      value = NA_character_,
      mouse_x = NA_real_,
      mouse_y = NA_real_,
        scroll_x = ev$scrollX %||% NA_real_,
        scroll_y = ev$scrollY %||% NA_real_,
        key_interval_ms = NA_real_,
        clipboard_type = NA_character_,
        clipboard_text = NA_character_,
        clipboard_length = NA_real_,
        target_tag = NA_character_,
        target_type = NA_character_,
        target_id = NA_character_,
        target_name = NA_character_,
        target_className = NA_character_
      )
    })
  }


  if (length(events) == 0) return(tibble::tibble())
  dplyr::bind_rows(events)
}

process_paradata_general <- function(pid, result_id, paradata) {
  empty <- list(
    wide = tibble::tibble(),
    long = tibble::tibble(),
    wide_cols = character(),
    wide_cols_output = character(),
    long_cols = character(),
    key_stats = tibble::tibble(
      keyStats_count = NA_real_,
      keyStats_mean = NA_real_,
      keyStats_median = NA_real_,
      keyStats_min = NA_real_,
      keyStats_max = NA_real_
    )
  )

  if (is.null(paradata)) return(empty)

  if (is.data.frame(paradata) && all(c("static", "ip", "behavior", "meta") %in% names(paradata))) {
    static_df <- paradata$static
    ip_df <- paradata$ip
    behavior_df <- paradata$behavior
    meta_df <- paradata$meta
    ip_error <- paradata$ipError

    timestamps <- NULL
    if (is.data.frame(static_df) && "timestamp" %in% names(static_df)) {
      timestamps <- static_df$timestamp
    } else if (is.data.frame(meta_df) && "updatedAt" %in% names(meta_df)) {
      timestamps <- meta_df$updatedAt
    }

    res <- init_paradata_result()
    res <- append_paradata_result(res, process_group_df(static_df, "static", timestamps, pid, result_id))
    res <- append_paradata_result(res, process_group_df(ip_df, "ip", timestamps, pid, result_id))
    res <- append_paradata_result(res, process_group_df(meta_df, "meta", timestamps, pid, result_id))

    if (!is.null(ip_error)) {
      non_na <- ip_error[!is.na(ip_error)]
      if (length(non_na) == 1) {
        res <- add_paradata_wide(res, "ipError", non_na[[1]])
      } else if (length(non_na) > 1) {
        for (r in seq_along(ip_error)) {
          v <- ip_error[[r]]
          if (is.na(v)) next
          ts <- if (!is.null(timestamps) && length(timestamps) >= r) timestamps[[r]] else NA_character_
          res <- add_paradata_long_row(res, pid, result_id, "ipError", as.character(v), r, ts)
        }
      }
    }

    key_stats <- empty$key_stats
    if (is.data.frame(behavior_df) && "keyStats" %in% names(behavior_df)) {
      key_df <- behavior_df$keyStats
      if (is.data.frame(key_df)) {
        key_stats <- tibble::tibble(
          keyStats_count = if ("count" %in% names(key_df)) last_non_empty(key_df$count) else NA_real_,
          keyStats_mean = if ("mean" %in% names(key_df)) last_non_empty(key_df$mean) else NA_real_,
          keyStats_median = if ("median" %in% names(key_df)) last_non_empty(key_df$median) else NA_real_,
          keyStats_min = if ("min" %in% names(key_df)) last_non_empty(key_df$min) else NA_real_,
          keyStats_max = if ("max" %in% names(key_df)) last_non_empty(key_df$max) else NA_real_
        )
      }
    }

    long_rows <- list()
    if (is.data.frame(behavior_df)) {
      if ("clipboardEvents" %in% names(behavior_df)) {
        last_clip <- find_last_non_empty_element(behavior_df$clipboardEvents)
        ev_list <- last_clip$value
        if (!is.null(ev_list)) {
          if (is.data.frame(ev_list)) {
            for (j in seq_len(nrow(ev_list))) {
              ev <- ev_list[j, ]
              long_rows[[length(long_rows) + 1]] <- tibble::tibble(
                PROLIFIC_PID = pid,
                result_id = result_id,
                origin = "paradata_general",
                event_type = get_event_value(ev, "type") %||% "clipboard",
                event_index = j,
                timestamp = get_event_value(ev, "timestamp") %||% (if (!is.null(timestamps) && !is.na(last_clip$index) && length(timestamps) >= last_clip$index) timestamps[[last_clip$index]] else NA_character_),
                component = get_event_value(ev, "component"),
                field = NA_character_,
                value = NA_character_,
                mouse_x = NA_real_,
                mouse_y = NA_real_,
                scroll_x = NA_real_,
                scroll_y = NA_real_,
                key_interval_ms = NA_real_,
                clipboard_type = get_event_value(ev, "type"),
                clipboard_text = get_event_value(ev, "text"),
                clipboard_length = get_event_value(ev, "length"),
                target_tag = get_target_value(ev, "tag"),
                target_type = target_type_fallback(get_target_value(ev, "type"), get_target_value(ev, "tag")),
                target_id = get_target_value(ev, "id"),
                target_name = get_target_value(ev, "name"),
                target_className = get_target_value(ev, "className")
              )
            }
            res$long_cols <- c(res$long_cols, "behavior.clipboardEvents")
          } else if (is.list(ev_list)) {
            ev_list <- null_to_empty(ev_list)
            for (j in seq_along(ev_list)) {
              ev <- ev_list[[j]]
              long_rows[[length(long_rows) + 1]] <- tibble::tibble(
                PROLIFIC_PID = pid,
                result_id = result_id,
                origin = "paradata_general",
                event_type = get_event_value(ev, "type") %||% "clipboard",
                event_index = j,
                timestamp = get_event_value(ev, "timestamp") %||% (if (!is.null(timestamps) && !is.na(last_clip$index) && length(timestamps) >= last_clip$index) timestamps[[last_clip$index]] else NA_character_),
                component = get_event_value(ev, "component"),
                field = NA_character_,
                value = NA_character_,
                mouse_x = NA_real_,
                mouse_y = NA_real_,
                scroll_x = NA_real_,
                scroll_y = NA_real_,
                key_interval_ms = NA_real_,
                clipboard_type = get_event_value(ev, "type"),
                clipboard_text = get_event_value(ev, "text"),
                clipboard_length = get_event_value(ev, "length"),
                target_tag = get_target_value(ev, "tag"),
                target_type = target_type_fallback(get_target_value(ev, "type"), get_target_value(ev, "tag")),
                target_id = get_target_value(ev, "id"),
                target_name = get_target_value(ev, "name"),
                target_className = get_target_value(ev, "className")
              )
            }
            res$long_cols <- c(res$long_cols, "behavior.clipboardEvents")
          }
        }
      }

      if ("mouseEvents" %in% names(behavior_df)) {
        last_mouse <- find_last_non_empty_element(behavior_df$mouseEvents)
        ev_df <- last_mouse$value
        if (!is.null(ev_df)) {
          if (is.data.frame(ev_df)) {
            for (j in seq_len(nrow(ev_df))) {
              ev <- ev_df[j, ]
              long_rows[[length(long_rows) + 1]] <- tibble::tibble(
                PROLIFIC_PID = pid,
                result_id = result_id,
                origin = "paradata_general",
                event_type = "mousemove",
                event_index = j,
                timestamp = ev$timestamp %||% (if (!is.null(timestamps) && !is.na(last_mouse$index) && length(timestamps) >= last_mouse$index) timestamps[[last_mouse$index]] else NA_character_),
                component = ev$component %||% NA_character_,
                field = NA_character_,
                value = NA_character_,
                mouse_x = ev$x %||% NA_real_,
                mouse_y = ev$y %||% NA_real_,
                scroll_x = NA_real_,
                scroll_y = NA_real_,
                key_interval_ms = NA_real_,
                clipboard_type = NA_character_,
                clipboard_text = NA_character_,
                clipboard_length = NA_real_,
                target_tag = NA_character_,
                target_type = NA_character_,
                target_id = NA_character_,
                target_name = NA_character_,
                target_className = NA_character_
              )
            }
            res$long_cols <- c(res$long_cols, "behavior.mouseEvents")
          } else if (is.list(ev_df)) {
            ev_df <- null_to_empty(ev_df)
            for (j in seq_along(ev_df)) {
              ev <- ev_df[[j]]
              long_rows[[length(long_rows) + 1]] <- tibble::tibble(
                PROLIFIC_PID = pid,
                result_id = result_id,
                origin = "paradata_general",
                event_type = "mousemove",
                event_index = j,
                timestamp = get_event_value(ev, "timestamp") %||% (if (!is.null(timestamps) && !is.na(last_mouse$index) && length(timestamps) >= last_mouse$index) timestamps[[last_mouse$index]] else NA_character_),
                component = get_event_value(ev, "component"),
                field = NA_character_,
                value = NA_character_,
                mouse_x = get_event_value(ev, "x"),
                mouse_y = get_event_value(ev, "y"),
                scroll_x = NA_real_,
                scroll_y = NA_real_,
                key_interval_ms = NA_real_,
                clipboard_type = NA_character_,
                clipboard_text = NA_character_,
                clipboard_length = NA_real_,
                target_tag = NA_character_,
                target_type = NA_character_,
                target_id = NA_character_,
                target_name = NA_character_,
                target_className = NA_character_
              )
            }
            res$long_cols <- c(res$long_cols, "behavior.mouseEvents")
          }
        }
      }

      if ("scrollEvents" %in% names(behavior_df)) {
        last_scroll <- find_last_non_empty_element(behavior_df$scrollEvents)
        ev_df <- last_scroll$value
        if (!is.null(ev_df)) {
          if (is.data.frame(ev_df)) {
            for (j in seq_len(nrow(ev_df))) {
              ev <- ev_df[j, ]
              long_rows[[length(long_rows) + 1]] <- tibble::tibble(
                PROLIFIC_PID = pid,
                result_id = result_id,
                origin = "paradata_general",
                event_type = "scroll",
                event_index = j,
                timestamp = ev$timestamp %||% (if (!is.null(timestamps) && !is.na(last_scroll$index) && length(timestamps) >= last_scroll$index) timestamps[[last_scroll$index]] else NA_character_),
                component = ev$component %||% NA_character_,
                field = NA_character_,
                value = NA_character_,
                mouse_x = NA_real_,
                mouse_y = NA_real_,
                scroll_x = ev$scrollX %||% NA_real_,
                scroll_y = ev$scrollY %||% NA_real_,
                key_interval_ms = NA_real_,
                clipboard_type = NA_character_,
                clipboard_text = NA_character_,
                clipboard_length = NA_real_,
                target_tag = NA_character_,
                target_type = NA_character_,
                target_id = NA_character_,
                target_name = NA_character_,
                target_className = NA_character_
              )
            }
            res$long_cols <- c(res$long_cols, "behavior.scrollEvents")
          } else if (is.list(ev_df)) {
            ev_df <- null_to_empty(ev_df)
            for (j in seq_along(ev_df)) {
              ev <- ev_df[[j]]
              long_rows[[length(long_rows) + 1]] <- tibble::tibble(
                PROLIFIC_PID = pid,
                result_id = result_id,
                origin = "paradata_general",
                event_type = "scroll",
                event_index = j,
                timestamp = get_event_value(ev, "timestamp") %||% (if (!is.null(timestamps) && !is.na(last_scroll$index) && length(timestamps) >= last_scroll$index) timestamps[[last_scroll$index]] else NA_character_),
                component = get_event_value(ev, "component"),
                field = NA_character_,
                value = NA_character_,
                mouse_x = NA_real_,
                mouse_y = NA_real_,
                scroll_x = get_event_value(ev, "scrollX"),
                scroll_y = get_event_value(ev, "scrollY"),
                key_interval_ms = NA_real_,
                clipboard_type = NA_character_,
                clipboard_text = NA_character_,
                clipboard_length = NA_real_,
                target_tag = NA_character_,
                target_type = NA_character_,
                target_id = NA_character_,
                target_name = NA_character_,
                target_className = NA_character_
              )
            }
            res$long_cols <- c(res$long_cols, "behavior.scrollEvents")
          }
        }
      }

    }

    if (length(long_rows)) {
      res$long_rows <- c(res$long_rows, long_rows)
    }

    return(list(
      wide = if (length(res$wide_vals)) tibble::as_tibble(res$wide_vals) else tibble::tibble(),
      long = if (length(res$long_rows)) dplyr::bind_rows(res$long_rows) else tibble::tibble(),
      wide_cols = res$wide_cols,
      wide_cols_output = res$wide_cols_output,
      long_cols = res$long_cols,
      key_stats = key_stats
    ))
  }

  if (is.list(paradata)) {
    wide_vals <- c(
      flatten_with_prefix(paradata$static, "para_static_"),
      flatten_with_prefix(paradata$ip, "para_ip_"),
      flatten_with_prefix(paradata$meta, "para_meta_")
    )

    key_stats <- tibble::tibble(
      keyStats_count = paradata$behavior$keyStats$count %||% NA_real_,
      keyStats_mean = paradata$behavior$keyStats$mean %||% NA_real_,
      keyStats_median = paradata$behavior$keyStats$median %||% NA_real_,
      keyStats_min = paradata$behavior$keyStats$min %||% NA_real_,
      keyStats_max = paradata$behavior$keyStats$max %||% NA_real_
    )

    return(list(
      wide = if (length(wide_vals)) tibble::as_tibble(wide_vals) else tibble::tibble(),
      long = build_paradata_events(pid, result_id, paradata),
      wide_cols = names(wide_vals),
      wide_cols_output = names(wide_vals),
      long_cols = character(),
      key_stats = key_stats
    ))
  }

  empty
}

build_defocus_events <- function(pid, result_id, defocus_list) {
  if (is.null(defocus_list)) return(tibble::tibble())

  if (is.data.frame(defocus_list)) {
    if (nrow(defocus_list) == 0) return(tibble::tibble())
    return(dplyr::mutate(
      tibble::as_tibble(defocus_list),
      PROLIFIC_PID = pid,
      result_id = result_id,
      origin = "paradata_focus",
      event_index = dplyr::row_number()
    ) %>%
      dplyr::select(PROLIFIC_PID, result_id, origin, event_index, durationblur, senderblur))
  }

  if (is.list(defocus_list)) {
    if (length(defocus_list) == 0) return(tibble::tibble())
    return(purrr::map_dfr(seq_along(defocus_list), function(i) {
      ev <- defocus_list[[i]]
      tibble::tibble(
        PROLIFIC_PID = pid,
        result_id = result_id,
        origin = "paradata_focus",
        event_index = i,
        durationblur = ev$durationblur %||% NA_real_,
        senderblur = ev$senderblur %||% NA_character_
      )
    }))
  }

  tibble::tibble()
}

is_missing_value <- function(x) {
  if (is.null(x)) return(TRUE)
  if (is.list(x)) {
    return(length(x) == 0 || all(is.na(unlist(x))))
  }
  if (length(x) > 1) {
    if (is.character(x)) return(all(is.na(x) | x == ""))
    return(all(is.na(x)))
  }
  if (is.character(x)) return(is.na(x) | x == "")
  is.na(x)
}

4 load packages

require(pacman)
p_load(
  "tidyverse",
  "jsonlite",
  "hms",
  "psych",
  "afex",
  "writexl",
  "readxl",
  "haven"
)

5 create raw data files

if (createRawFiles) {
  dir.create(raw_output_dir, recursive = TRUE, showWarnings = FALSE)

  data_files <- list.files(
    data_root,
    pattern = "data.txt$",
    recursive = TRUE,
    full.names = TRUE
  )

  results <- list()
  for (file in data_files) {
    result_folder <- basename(dirname(dirname(file)))
    result_id <- stringr::str_remove(result_folder, "^study_result_")

    tmp <- jsonlite::fromJSON(file)
    tmp[, colnames(tmp) == ""] <- NULL
    tmp <- dplyr::as_tibble(tmp)
    tmp$result_id <- as.character(result_id)

    results[[length(results) + 1]] <- tmp
  }

  saveRDS(results, file = file.path(raw_output_dir, "study.rds"))
  json_lines <- purrr::map_chr(results, ~ jsonlite::toJSON(.x, auto_unbox = TRUE))
  writeLines(json_lines, con = file.path(raw_output_dir, "study.jsonl"))

  print("Raw files have been created in this run!")
} else {
  print("Raw files have been created in previous run!")
}
[1] "Raw files have been created in this run!"

6 load JATOS metadata

metadata_files <- list.files(
  data_root,
  pattern = "jatos_results_metadata_.*\\.csv$",
  full.names = TRUE
)

metadata_file <- metadata_files[which.max(file.info(metadata_files)$mtime)]
metadata <- readr::read_csv(metadata_file, show_col_types = FALSE)
names(metadata) <- make.names(names(metadata))

metadata <- metadata %>%
  mutate(
    result_id = as.character(Result.ID),
    duration_seconds = purrr::map_dbl(Duration, parse_duration_seconds)
  )

metadata_small <- metadata %>%
  select(result_id, State, Duration, duration_seconds)

table(metadata_small$State)

FINISHED      PRE 
      68       20 

7 load Prolific demographics

prolific_demographic_file <- file.path(
  data_root,
  "prolific_demographic_export_69d4ced16907d423ae787668.csv"
)

prolific_demographic_raw <- readr::read_csv(prolific_demographic_file, show_col_types = FALSE)
names(prolific_demographic_raw) <- make.names(names(prolific_demographic_raw))

prolific_demographic <- prolific_demographic_raw %>%
  rename(PROLIFIC_PID = Participant.id) %>%
  mutate(PROLIFIC_PID = as.character(PROLIFIC_PID)) %>%
  distinct(PROLIFIC_PID, .keep_all = TRUE) %>%
  rename_with(~ paste0("socidem_", .x), -PROLIFIC_PID)

8 build participant-level data

study <- readRDS(file.path(raw_output_dir, "study.rds"))

core_vars <- c(
  "dummy_informedconsent",
  "hp_exclusionCriteria",
  "commCheck",
  "counterVisualTraps",
  "visualTrapResponse",
  "visualTrap_Planets_response",
  "visualTrap_Planets_responseLabel",
  "visualTrap_Planets_correct",
  "visualTrap_Planets_rtMs",
  "visualTrap_Collision_response",
  "visualTrap_Collision_responseLabel",
  "visualTrap_Collision_correct",
  "visualTrap_Collision_rtMs",
  "visualTrap_CafeWall_response",
  "visualTrap_CafeWall_responseLabel",
  "visualTrap_CafeWall_correct",
  "visualTrap_CafeWall_rtMs",
  "visualTrap_Circles_response",
  "visualTrap_Circles_responseLabel",
  "visualTrap_Circles_correct",
  "visualTrap_Circles_rtMs",
  "visualTrap_Lines_response",
  "visualTrap_Lines_responseLabel",
  "visualTrap_Lines_correct",
  "visualTrap_Lines_rtMs",
  "visualTrap_Robot_response",
  "visualTrap_Robot_responseLabel",
  "visualTrap_Robot_correct",
  "visualTrap_Robot_rtMs",
  "visualTrap_order",
  "condition_FutureSociety",
  "cue",
  "cue_coding",
  "R1",
  "R2",
  "R3",
  "R4",
  "R5",
  "para_countclicks",
  "AR-2",
  "AR-5",
  "AR-3",
  "AR-1",
  "AR-4",
  "undUtopiaGeneral",
  "undUtopiaGoal",
  "undUtopiaHow",
  "clearUtopia",
  "clearUtopiatext",
  "biasUtopia",
  "biasUtopiatext",
  "attributesFutureSociety-5",
  "attributesFutureSociety-6",
  "attributesFutureSociety-2",
  "attributesFutureSociety-3",
  "attributesFutureSociety-4",
  "attributesFutureSociety-7",
  "attributesFutureSociety-1",
  "attributesFutureSociety-8",
  "utopiaPrototypeAssignment",
  "protoConfidenceRadio",
  "protoReasonText",
  "feedback_critic"
)

core_rows <- list()
paradata_events_rows <- list()
paradata_defocus_rows <- list()
paradata_wide_cols_rows <- list()
paradata_long_cols_rows <- list()

for (i in seq_along(study)) {
  print(i)
  df <- study[[i]]

  pid <- if ("PROLIFIC_PID" %in% names(df)) last_non_empty(df$PROLIFIC_PID) else NA
  result_id <- if ("result_id" %in% names(df)) last_non_empty(df$result_id) else NA

  core_values <- lapply(core_vars, function(v) {
    val <- if (v %in% names(df)) {
      last_non_empty(df[[v]])
    } else if (v == "attributesFutureSociety-8" && "attributesFutureSociety-undefined" %in% names(df)) {
      last_non_empty(df[["attributesFutureSociety-undefined"]])
    } else {
      NA
    }
    normalize_cell(val)
  })
  names(core_values) <- core_vars
  core_row <- tibble::as_tibble(core_values)

  if ("visualTrap_order" %in% names(core_row)) {
    core_row$visualTrap_order <- collapse_list_to_string(core_row$visualTrap_order)
  }

  assoc_list <- if ("sucsessfulAssociations" %in% names(df)) last_non_empty(df$sucsessfulAssociations) else NULL
  assoc_wide <- assoc_to_wide(assoc_list)

  paradata <- if ("paradata_general" %in% names(df)) df$paradata_general else NULL
  paradata_summary <- process_paradata_general(pid, result_id, paradata)
  paradata_wide <- paradata_summary$wide
  key_stats_row <- paradata_summary$key_stats

  core_row <- dplyr::bind_cols(
    tibble::tibble(result_id = result_id, PROLIFIC_PID = pid),
    core_row,
    assoc_wide,
    paradata_wide,
    key_stats_row
  )

  core_rows[[length(core_rows) + 1]] <- core_row

  paradata_events <- paradata_summary$long
  if (nrow(paradata_events) > 0) {
    paradata_events_rows[[length(paradata_events_rows) + 1]] <- paradata_events
  }

  if (length(paradata_summary$wide_cols) > 0) {
    paradata_wide_cols_rows[[length(paradata_wide_cols_rows) + 1]] <- tibble::tibble(
      PROLIFIC_PID = pid,
      result_id = result_id,
      field = paradata_summary$wide_cols,
      output_field = paradata_summary$wide_cols_output,
      destination = "wide"
    )
  }

  if (length(paradata_summary$long_cols) > 0) {
    paradata_long_cols_rows[[length(paradata_long_cols_rows) + 1]] <- tibble::tibble(
      PROLIFIC_PID = pid,
      result_id = result_id,
      field = paradata_summary$long_cols,
      output_field = NA_character_,
      destination = "long"
    )
  }

  defocus_list <- if ("para_defocuscount" %in% names(df)) last_non_empty(df$para_defocuscount) else NULL
  defocus_events <- build_defocus_events(pid, result_id, defocus_list)
  if (nrow(defocus_events) > 0) {
    paradata_defocus_rows[[length(paradata_defocus_rows) + 1]] <- defocus_events
  }
}
[1] 1
[1] 2
[1] 3
[1] 4
[1] 5
[1] 6
[1] 7
[1] 8
[1] 9
[1] 10
[1] 11
[1] 12
[1] 13
[1] 14
[1] 15
[1] 16
[1] 17
[1] 18
[1] 19
[1] 20
[1] 21
[1] 22
[1] 23
[1] 24
[1] 25
[1] 26
[1] 27
[1] 28
[1] 29
[1] 30
[1] 31
[1] 32
[1] 33
[1] 34
[1] 35
[1] 36
[1] 37
[1] 38
[1] 39
[1] 40
[1] 41
[1] 42
[1] 43
[1] 44
[1] 45
[1] 46
[1] 47
[1] 48
[1] 49
[1] 50
[1] 51
[1] 52
[1] 53
[1] 54
[1] 55
[1] 56
[1] 57
[1] 58
[1] 59
[1] 60
[1] 61
[1] 62
[1] 63
[1] 64
[1] 65
[1] 66
[1] 67
[1] 68
[1] 69
[1] 70
[1] 71
[1] 72
[1] 73
[1] 74
[1] 75
[1] 76
[1] 77
[1] 78
[1] 79
[1] 80
[1] 81
[1] 82
[1] 83
[1] 84
[1] 85
[1] 86
[1] 87
[1] 88
core_df <- dplyr::bind_rows(core_rows)
paradata_events_long <- dplyr::bind_rows(paradata_events_rows)
paradata_defocus_long <- dplyr::bind_rows(paradata_defocus_rows)
paradata_column_map <- dplyr::bind_rows(paradata_wide_cols_rows, paradata_long_cols_rows)

if (nrow(paradata_events_long) == 0) {
  paradata_events_long <- tibble::tibble(
    PROLIFIC_PID = character(),
    result_id = character(),
    origin = character(),
    event_type = character(),
    event_index = integer(),
    timestamp = character(),
    component = character(),
    field = character(),
    value = character(),
    mouse_x = double(),
    mouse_y = double(),
    scroll_x = double(),
    scroll_y = double(),
    key_interval_ms = double(),
    clipboard_type = character(),
    clipboard_text = character(),
    clipboard_length = double(),
    target_tag = character(),
    target_type = character(),
    target_id = character(),
    target_name = character(),
    target_className = character()
  )
}

if (nrow(paradata_defocus_long) == 0) {
  paradata_defocus_long <- tibble::tibble(
    PROLIFIC_PID = character(),
    result_id = character(),
    origin = character(),
    event_index = integer(),
    durationblur = double(),
    senderblur = character()
  )
}

if (nrow(paradata_column_map) == 0) {
  paradata_column_map <- tibble::tibble(
    PROLIFIC_PID = character(),
    result_id = character(),
    field = character(),
    output_field = character(),
    destination = character()
  )
}

core_df <- core_df %>%
  left_join(metadata_small, by = "result_id") %>%
  left_join(prolific_demographic, by = "PROLIFIC_PID")

9 filter to complete datasets

optional_text <- c("clearUtopiatext", "biasUtopiatext", "protoReasonText", "feedback_critic")

required_cols <- setdiff(
  c("PROLIFIC_PID",
    core_vars,
    "assoc1_response", "assoc2_response", "assoc3_response", "assoc4_response", "assoc5_response",
    "assoc1_valence", "assoc2_valence", "assoc3_valence", "assoc4_valence", "assoc5_valence"
  ),
  optional_text
)

table(core_df$State)

FINISHED      PRE 
      69       19 
# clean_complete <- core_df
clean_complete <- core_df %>%
  filter(State == "FINISHED")

dim(clean_complete)
[1]  69 182
table(clean_complete$condition_FutureSociety)

 aicentered    anarchic    futurist    lawBased moderngreen primitivist 
          6          14          16           7          10           7 
  religious 
          8 
to_numeric <- function(x) {
  if (is.numeric(x)) return(x)
  suppressWarnings(as.numeric(as.character(x)))
}

to_logical <- function(x) {
  if (is.logical(x)) return(x)
  if (is.factor(x)) x <- as.character(x)
  as.logical(x)
}

num_cols <- c(
  "counterVisualTraps",
  "para_countclicks",
  "visualTrap_Planets_rtMs",
  "visualTrap_Collision_rtMs",
  "visualTrap_CafeWall_rtMs",
  "visualTrap_Circles_rtMs",
  "visualTrap_Lines_rtMs",
  "visualTrap_Robot_rtMs",
  "AR-1",
  "AR-2",
  "AR-3",
  "AR-4",
  "AR-5",
  "clearUtopia",
  "biasUtopia",
  "attributesFutureSociety-1",
  "attributesFutureSociety-2",
  "attributesFutureSociety-3",
  "attributesFutureSociety-4",
  "attributesFutureSociety-5",
  "attributesFutureSociety-6",
  "attributesFutureSociety-7",
  "attributesFutureSociety-8",
  "protoConfidenceRadio",
  "assoc1_valence",
  "assoc2_valence",
  "assoc3_valence",
  "assoc4_valence",
  "assoc5_valence",
  "keyStats_count",
  "keyStats_mean",
  "keyStats_median",
  "keyStats_min",
  "keyStats_max",
  "duration_seconds",
  "socidem_Age"
)

bool_cols <- c(
  "hp_exclusionCriteria",
  "visualTrap_Planets_correct",
  "visualTrap_Collision_correct",
  "visualTrap_CafeWall_correct",
  "visualTrap_Circles_correct",
  "visualTrap_Lines_correct",
  "visualTrap_Robot_correct"
)

factor_cols <- c(
  "dummy_informedconsent",
  "commCheck",
  "condition_FutureSociety",
  "cue_coding",
  "utopiaPrototypeAssignment",
  "visualTrapResponse",
  "visualTrap_Planets_response",
  "visualTrap_Collision_response",
  "visualTrap_CafeWall_response",
  "visualTrap_Circles_response",
  "visualTrap_Lines_response",
  "visualTrap_Robot_response",
  "socidem_Sex",
  "socidem_Ethnicity.simplified",
  "socidem_Student.status",
  "socidem_Employment.status"
)

clean_complete <- clean_complete %>%
  mutate(
    across(any_of(bool_cols), ~ to_logical(.)),
    across(any_of(num_cols), ~ to_numeric(.)),
    across(any_of(factor_cols), ~ as.factor(.))
  )

10 save outputs

dir.create(questionnaire_output_dir, recursive = TRUE, showWarnings = FALSE)
dir.create(paradata_output_dir, recursive = TRUE, showWarnings = FALSE)

saveRDS(clean_complete, file = file.path(questionnaire_output_dir, "clean_complete_wide.rds"))
writexl::write_xlsx(x = clean_complete, path = file.path(questionnaire_output_dir, "clean_complete_wide.xlsx"))
write.csv(clean_complete, file = file.path(questionnaire_output_dir, "clean_complete_wide.csv"), row.names = FALSE)

saveRDS(paradata_events_long, file = file.path(paradata_output_dir, "paradata_events_long.rds"))
writexl::write_xlsx(x = paradata_events_long, path = file.path(paradata_output_dir, "paradata_events_long.xlsx"))
write.csv(paradata_events_long, file = file.path(paradata_output_dir, "paradata_events_long.csv"), row.names = FALSE)

saveRDS(paradata_defocus_long, file = file.path(paradata_output_dir, "paradata_defocus_long.rds"))
writexl::write_xlsx(x = paradata_defocus_long, path = file.path(paradata_output_dir, "paradata_defocus_long.xlsx"))
write.csv(paradata_defocus_long, file = file.path(paradata_output_dir, "paradata_defocus_long.csv"), row.names = FALSE)

writexl::write_xlsx(x = clean_complete[, c("PROLIFIC_PID", "condition_FutureSociety","clearUtopia", "clearUtopiatext", "biasUtopia", "biasUtopiatext", "protoConfidenceRadio", "protoReasonText")], path = file.path(questionnaire_output_dir, "clean_complete_wide_text.xlsx"))