createRawFiles <- TRUE
data_root <- "data"
output_root <- "outputs"
raw_output_dir <- file.path(output_root, "raw")
questionnaire_output_dir <- file.path(output_root, "questionnaire")
paradata_output_dir <- file.path(output_root, "paradata")Data preperation
1 Notes
This script creates a simplified participant-level dataset using the raw study.rds file and produces:
- A clean wide table with core variables + associations (wide) + keyStats
- Paradata events in long format (clipboard, mouse, scroll, keyIntervals)
- Defocus events in long format
2 global variables
3 functions
last_non_empty <- function(x) {
if (length(x) == 0) return(NA)
for (i in seq_along(x)[length(x):1]) {
val <- x[[i]]
if (is.null(val)) next
if (is.list(val) && length(val) == 0) next
if (!is.list(val) && length(val) == 1 && is.na(val)) next
if (is.character(val) && length(val) == 1 && isTRUE(val == "")) next
return(val)
}
return(NA)
}
`%||%` <- function(x, y) {
if (is.null(x)) y else x
}
parse_duration_seconds <- function(x) {
if (is.na(x) || x == "") return(NA_real_)
clean <- stringr::str_remove(x, "\\s*\\(.*\\)")
if (is.na(clean) || clean == "") return(NA_real_)
as.numeric(hms::as_hms(clean))
}
null_to_empty <- function(x) {
if (is.null(x)) list() else x
}
normalize_cell <- function(val) {
if (is.null(val)) return(NA)
if (is.data.frame(val)) return(list(val))
if (is.list(val)) {
if (length(val) == 0) return(NA)
return(list(val))
}
if (length(val) != 1) return(list(val))
val
}
collapse_list_to_string <- function(x, sep = " // ") {
if (is.null(x)) return(NA_character_)
if (is.list(x)) {
if (length(x) == 0) return(NA_character_)
flat <- unlist(x, recursive = TRUE, use.names = FALSE)
if (length(flat) == 0) return(NA_character_)
return(paste0(flat, collapse = sep))
}
if (length(x) > 1) return(paste0(x, collapse = sep))
as.character(x)
}
sanitize_paradata_name <- function(name) {
cleaned <- gsub("[^A-Za-z0-9]+", "_", name)
cleaned <- gsub("^_+|_+$", "", cleaned)
paste0("para_", cleaned)
}
init_paradata_result <- function() {
list(
wide_vals = list(),
wide_cols = character(),
wide_cols_output = character(),
long_rows = list(),
long_cols = character()
)
}
append_paradata_result <- function(target, source) {
target$wide_vals <- c(target$wide_vals, source$wide_vals)
target$wide_cols <- c(target$wide_cols, source$wide_cols)
target$wide_cols_output <- c(target$wide_cols_output, source$wide_cols_output)
target$long_rows <- c(target$long_rows, source$long_rows)
target$long_cols <- c(target$long_cols, source$long_cols)
target
}
add_paradata_wide <- function(res, field_label, value) {
out_col <- sanitize_paradata_name(field_label)
res$wide_vals[[out_col]] <- value
res$wide_cols <- c(res$wide_cols, field_label)
res$wide_cols_output <- c(res$wide_cols_output, out_col)
res
}
add_paradata_long_row <- function(res, pid, result_id, field_label, value, row_index, timestamp) {
res$long_rows[[length(res$long_rows) + 1]] <- tibble::tibble(
PROLIFIC_PID = pid,
result_id = result_id,
origin = "paradata_general",
event_type = "paradata_column",
event_index = row_index,
timestamp = timestamp %||% NA_character_,
component = NA_character_,
field = field_label,
value = value,
mouse_x = NA_real_,
mouse_y = NA_real_,
scroll_x = NA_real_,
scroll_y = NA_real_,
key_interval_ms = NA_real_,
clipboard_type = NA_character_,
clipboard_text = NA_character_,
clipboard_length = NA_real_,
target_tag = NA_character_,
target_type = NA_character_,
target_id = NA_character_,
target_name = NA_character_,
target_className = NA_character_
)
res$long_cols <- c(res$long_cols, field_label)
res
}
get_event_value <- function(ev, name) {
if (is.null(ev)) return(NA)
if (is.data.frame(ev)) return(ev[[name]] %||% NA)
if (is.list(ev)) return(ev[[name]] %||% NA)
NA
}
target_type_fallback <- function(type_val, tag_val) {
if (!is.null(type_val) && length(type_val) > 0 && !is.na(type_val) && type_val != "") {
return(as.character(type_val))
}
if (!is.null(tag_val) && length(tag_val) > 0 && !is.na(tag_val) && tag_val != "") {
return(tolower(as.character(tag_val)))
}
NA_character_
}
get_target_value <- function(ev, field) {
if (is.null(ev)) return(NA)
if (is.data.frame(ev)) {
col <- paste0("target.", field)
if (col %in% names(ev)) {
val <- ev[[col]]
if (length(val) == 0) return(NA)
return(val[[1]] %||% NA)
}
if ("target" %in% names(ev)) {
target_col <- ev[["target"]]
if (is.list(target_col) && length(target_col) > 0) {
target_obj <- target_col[[1]]
if (is.data.frame(target_obj) && field %in% names(target_obj)) {
val <- target_obj[[field]]
if (length(val) == 0) return(NA)
return(val[[1]] %||% NA)
}
if (is.list(target_obj)) return(target_obj[[field]] %||% NA)
}
if (is.data.frame(target_col) && field %in% names(target_col)) {
val <- target_col[[field]]
if (length(val) == 0) return(NA)
return(val[[1]] %||% NA)
}
}
return(NA)
}
if (is.list(ev) && !is.null(ev$target)) return(ev$target[[field]] %||% NA)
NA
}
is_non_empty_element <- function(x) {
if (is.null(x)) return(FALSE)
if (is.data.frame(x)) return(nrow(x) > 0)
if (is.list(x)) {
if (length(x) == 0) return(FALSE)
return(!all(purrr::map_lgl(x, ~ is.null(.x) || (is.list(.x) && length(.x) == 0))))
}
length(x) > 0
}
find_last_non_empty_element <- function(col_vals) {
if (is.null(col_vals)) return(list(value = NULL, index = NA_integer_))
if (!is.list(col_vals)) {
if (length(col_vals) == 0) return(list(value = NULL, index = NA_integer_))
return(list(value = col_vals, index = length(col_vals)))
}
for (i in seq_along(col_vals)[length(col_vals):1]) {
v <- col_vals[[i]]
if (is_non_empty_element(v)) return(list(value = v, index = i))
}
list(value = NULL, index = NA_integer_)
}
process_group_df <- function(df, prefix_label, timestamp_vec, pid, result_id) {
res <- init_paradata_result()
if (is.null(df) || !is.data.frame(df)) return(res)
for (col in names(df)) {
col_vals <- df[[col]]
field_label <- paste0(prefix_label, ".", col)
if (is.data.frame(col_vals)) {
nested_res <- process_group_df(col_vals, field_label, timestamp_vec, pid, result_id)
res <- append_paradata_result(res, nested_res)
next
}
if (is.list(col_vals)) {
vals_json <- vapply(col_vals, function(v) {
if (is.null(v) || (is.list(v) && length(v) == 0)) return(NA_character_)
jsonlite::toJSON(v, auto_unbox = TRUE)
}, character(1))
non_na <- vals_json[!is.na(vals_json)]
if (length(non_na) == 0) next
uniq <- unique(non_na)
if (length(uniq) == 1) {
res <- add_paradata_wide(res, field_label, uniq[[1]])
} else {
for (r in seq_along(vals_json)) {
v <- vals_json[[r]]
if (is.na(v)) next
ts <- if (!is.null(timestamp_vec) && length(timestamp_vec) >= r) timestamp_vec[[r]] else NA_character_
res <- add_paradata_long_row(res, pid, result_id, field_label, v, r, ts)
}
}
next
}
non_na <- col_vals[!is.na(col_vals)]
if (length(non_na) == 0) next
uniq <- unique(non_na)
if (length(uniq) == 1) {
res <- add_paradata_wide(res, field_label, uniq[[1]])
} else {
for (r in seq_along(col_vals)) {
v <- col_vals[[r]]
if (is.na(v)) next
ts <- if (!is.null(timestamp_vec) && length(timestamp_vec) >= r) timestamp_vec[[r]] else NA_character_
res <- add_paradata_long_row(res, pid, result_id, field_label, as.character(v), r, ts)
}
}
}
res
}
flatten_with_prefix <- function(x, prefix) {
if (is.null(x)) return(list())
flat <- unlist(x, recursive = TRUE, use.names = TRUE)
if (length(flat) == 0) return(list())
names(flat) <- paste0(prefix, names(flat))
as.list(flat)
}
assoc_to_wide <- function(assoc_list) {
if (is.null(assoc_list) || length(assoc_list) == 0) {
return(tibble::tibble(
assoc1_response = NA_character_, assoc1_valence = NA_character_,
assoc2_response = NA_character_, assoc2_valence = NA_character_,
assoc3_response = NA_character_, assoc3_valence = NA_character_,
assoc4_response = NA_character_, assoc4_valence = NA_character_,
assoc5_response = NA_character_, assoc5_valence = NA_character_
))
}
assoc_df <- dplyr::as_tibble(assoc_list)
assoc_df <- assoc_df[seq_len(min(5, nrow(assoc_df))), ]
get_val <- function(df, col, idx) {
if (!col %in% names(df)) return(NA_character_)
if (nrow(df) < idx) return(NA_character_)
df[[col]][idx]
}
tibble::tibble(
assoc1_response = get_val(assoc_df, "response", 1),
assoc1_valence = get_val(assoc_df, "valence", 1),
assoc2_response = get_val(assoc_df, "response", 2),
assoc2_valence = get_val(assoc_df, "valence", 2),
assoc3_response = get_val(assoc_df, "response", 3),
assoc3_valence = get_val(assoc_df, "valence", 3),
assoc4_response = get_val(assoc_df, "response", 4),
assoc4_valence = get_val(assoc_df, "valence", 4),
assoc5_response = get_val(assoc_df, "response", 5),
assoc5_valence = get_val(assoc_df, "valence", 5)
)
}
build_paradata_events <- function(pid, result_id, paradata) {
if (is.null(paradata) || is.null(paradata$behavior)) return(tibble::tibble())
events <- list()
behavior <- paradata$behavior
clipboard_events <- null_to_empty(behavior$clipboardEvents)
if (length(clipboard_events)) {
events[[length(events) + 1]] <- purrr::map_dfr(seq_along(clipboard_events), function(i) {
ev <- clipboard_events[[i]]
target <- ev$target
tibble::tibble(
PROLIFIC_PID = pid,
result_id = result_id,
origin = "paradata_general",
event_type = ev$type %||% "clipboard",
event_index = i,
timestamp = ev$timestamp %||% NA_character_,
component = ev$component %||% NA_character_,
field = NA_character_,
value = NA_character_,
mouse_x = NA_real_,
mouse_y = NA_real_,
scroll_x = NA_real_,
scroll_y = NA_real_,
key_interval_ms = NA_real_,
clipboard_type = ev$type %||% NA_character_,
clipboard_text = ev$text %||% NA_character_,
clipboard_length = ev$length %||% NA_real_,
target_tag = target$tag %||% NA_character_,
target_type = target_type_fallback(target$type %||% NA_character_, target$tag %||% NA_character_),
target_id = target$id %||% NA_character_,
target_name = target$name %||% NA_character_,
target_className = target$className %||% NA_character_
)
})
}
mouse_events <- null_to_empty(behavior$mouseEvents)
if (length(mouse_events)) {
events[[length(events) + 1]] <- purrr::map_dfr(seq_along(mouse_events), function(i) {
ev <- mouse_events[[i]]
tibble::tibble(
PROLIFIC_PID = pid,
result_id = result_id,
origin = "paradata_general",
event_type = "mousemove",
event_index = i,
timestamp = ev$timestamp %||% NA_character_,
component = ev$component %||% NA_character_,
field = NA_character_,
value = NA_character_,
mouse_x = ev$x %||% NA_real_,
mouse_y = ev$y %||% NA_real_,
scroll_x = NA_real_,
scroll_y = NA_real_,
key_interval_ms = NA_real_,
clipboard_type = NA_character_,
clipboard_text = NA_character_,
clipboard_length = NA_real_,
target_tag = NA_character_,
target_type = NA_character_,
target_id = NA_character_,
target_name = NA_character_,
target_className = NA_character_
)
})
}
scroll_events <- null_to_empty(behavior$scrollEvents)
if (length(scroll_events)) {
events[[length(events) + 1]] <- purrr::map_dfr(seq_along(scroll_events), function(i) {
ev <- scroll_events[[i]]
tibble::tibble(
PROLIFIC_PID = pid,
result_id = result_id,
origin = "paradata_general",
event_type = "scroll",
event_index = i,
timestamp = ev$timestamp %||% NA_character_,
component = ev$component %||% NA_character_,
field = NA_character_,
value = NA_character_,
mouse_x = NA_real_,
mouse_y = NA_real_,
scroll_x = ev$scrollX %||% NA_real_,
scroll_y = ev$scrollY %||% NA_real_,
key_interval_ms = NA_real_,
clipboard_type = NA_character_,
clipboard_text = NA_character_,
clipboard_length = NA_real_,
target_tag = NA_character_,
target_type = NA_character_,
target_id = NA_character_,
target_name = NA_character_,
target_className = NA_character_
)
})
}
if (length(events) == 0) return(tibble::tibble())
dplyr::bind_rows(events)
}
process_paradata_general <- function(pid, result_id, paradata) {
empty <- list(
wide = tibble::tibble(),
long = tibble::tibble(),
wide_cols = character(),
wide_cols_output = character(),
long_cols = character(),
key_stats = tibble::tibble(
keyStats_count = NA_real_,
keyStats_mean = NA_real_,
keyStats_median = NA_real_,
keyStats_min = NA_real_,
keyStats_max = NA_real_
)
)
if (is.null(paradata)) return(empty)
if (is.data.frame(paradata) && all(c("static", "ip", "behavior", "meta") %in% names(paradata))) {
static_df <- paradata$static
ip_df <- paradata$ip
behavior_df <- paradata$behavior
meta_df <- paradata$meta
ip_error <- paradata$ipError
timestamps <- NULL
if (is.data.frame(static_df) && "timestamp" %in% names(static_df)) {
timestamps <- static_df$timestamp
} else if (is.data.frame(meta_df) && "updatedAt" %in% names(meta_df)) {
timestamps <- meta_df$updatedAt
}
res <- init_paradata_result()
res <- append_paradata_result(res, process_group_df(static_df, "static", timestamps, pid, result_id))
res <- append_paradata_result(res, process_group_df(ip_df, "ip", timestamps, pid, result_id))
res <- append_paradata_result(res, process_group_df(meta_df, "meta", timestamps, pid, result_id))
if (!is.null(ip_error)) {
non_na <- ip_error[!is.na(ip_error)]
if (length(non_na) == 1) {
res <- add_paradata_wide(res, "ipError", non_na[[1]])
} else if (length(non_na) > 1) {
for (r in seq_along(ip_error)) {
v <- ip_error[[r]]
if (is.na(v)) next
ts <- if (!is.null(timestamps) && length(timestamps) >= r) timestamps[[r]] else NA_character_
res <- add_paradata_long_row(res, pid, result_id, "ipError", as.character(v), r, ts)
}
}
}
key_stats <- empty$key_stats
if (is.data.frame(behavior_df) && "keyStats" %in% names(behavior_df)) {
key_df <- behavior_df$keyStats
if (is.data.frame(key_df)) {
key_stats <- tibble::tibble(
keyStats_count = if ("count" %in% names(key_df)) last_non_empty(key_df$count) else NA_real_,
keyStats_mean = if ("mean" %in% names(key_df)) last_non_empty(key_df$mean) else NA_real_,
keyStats_median = if ("median" %in% names(key_df)) last_non_empty(key_df$median) else NA_real_,
keyStats_min = if ("min" %in% names(key_df)) last_non_empty(key_df$min) else NA_real_,
keyStats_max = if ("max" %in% names(key_df)) last_non_empty(key_df$max) else NA_real_
)
}
}
long_rows <- list()
if (is.data.frame(behavior_df)) {
if ("clipboardEvents" %in% names(behavior_df)) {
last_clip <- find_last_non_empty_element(behavior_df$clipboardEvents)
ev_list <- last_clip$value
if (!is.null(ev_list)) {
if (is.data.frame(ev_list)) {
for (j in seq_len(nrow(ev_list))) {
ev <- ev_list[j, ]
long_rows[[length(long_rows) + 1]] <- tibble::tibble(
PROLIFIC_PID = pid,
result_id = result_id,
origin = "paradata_general",
event_type = get_event_value(ev, "type") %||% "clipboard",
event_index = j,
timestamp = get_event_value(ev, "timestamp") %||% (if (!is.null(timestamps) && !is.na(last_clip$index) && length(timestamps) >= last_clip$index) timestamps[[last_clip$index]] else NA_character_),
component = get_event_value(ev, "component"),
field = NA_character_,
value = NA_character_,
mouse_x = NA_real_,
mouse_y = NA_real_,
scroll_x = NA_real_,
scroll_y = NA_real_,
key_interval_ms = NA_real_,
clipboard_type = get_event_value(ev, "type"),
clipboard_text = get_event_value(ev, "text"),
clipboard_length = get_event_value(ev, "length"),
target_tag = get_target_value(ev, "tag"),
target_type = target_type_fallback(get_target_value(ev, "type"), get_target_value(ev, "tag")),
target_id = get_target_value(ev, "id"),
target_name = get_target_value(ev, "name"),
target_className = get_target_value(ev, "className")
)
}
res$long_cols <- c(res$long_cols, "behavior.clipboardEvents")
} else if (is.list(ev_list)) {
ev_list <- null_to_empty(ev_list)
for (j in seq_along(ev_list)) {
ev <- ev_list[[j]]
long_rows[[length(long_rows) + 1]] <- tibble::tibble(
PROLIFIC_PID = pid,
result_id = result_id,
origin = "paradata_general",
event_type = get_event_value(ev, "type") %||% "clipboard",
event_index = j,
timestamp = get_event_value(ev, "timestamp") %||% (if (!is.null(timestamps) && !is.na(last_clip$index) && length(timestamps) >= last_clip$index) timestamps[[last_clip$index]] else NA_character_),
component = get_event_value(ev, "component"),
field = NA_character_,
value = NA_character_,
mouse_x = NA_real_,
mouse_y = NA_real_,
scroll_x = NA_real_,
scroll_y = NA_real_,
key_interval_ms = NA_real_,
clipboard_type = get_event_value(ev, "type"),
clipboard_text = get_event_value(ev, "text"),
clipboard_length = get_event_value(ev, "length"),
target_tag = get_target_value(ev, "tag"),
target_type = target_type_fallback(get_target_value(ev, "type"), get_target_value(ev, "tag")),
target_id = get_target_value(ev, "id"),
target_name = get_target_value(ev, "name"),
target_className = get_target_value(ev, "className")
)
}
res$long_cols <- c(res$long_cols, "behavior.clipboardEvents")
}
}
}
if ("mouseEvents" %in% names(behavior_df)) {
last_mouse <- find_last_non_empty_element(behavior_df$mouseEvents)
ev_df <- last_mouse$value
if (!is.null(ev_df)) {
if (is.data.frame(ev_df)) {
for (j in seq_len(nrow(ev_df))) {
ev <- ev_df[j, ]
long_rows[[length(long_rows) + 1]] <- tibble::tibble(
PROLIFIC_PID = pid,
result_id = result_id,
origin = "paradata_general",
event_type = "mousemove",
event_index = j,
timestamp = ev$timestamp %||% (if (!is.null(timestamps) && !is.na(last_mouse$index) && length(timestamps) >= last_mouse$index) timestamps[[last_mouse$index]] else NA_character_),
component = ev$component %||% NA_character_,
field = NA_character_,
value = NA_character_,
mouse_x = ev$x %||% NA_real_,
mouse_y = ev$y %||% NA_real_,
scroll_x = NA_real_,
scroll_y = NA_real_,
key_interval_ms = NA_real_,
clipboard_type = NA_character_,
clipboard_text = NA_character_,
clipboard_length = NA_real_,
target_tag = NA_character_,
target_type = NA_character_,
target_id = NA_character_,
target_name = NA_character_,
target_className = NA_character_
)
}
res$long_cols <- c(res$long_cols, "behavior.mouseEvents")
} else if (is.list(ev_df)) {
ev_df <- null_to_empty(ev_df)
for (j in seq_along(ev_df)) {
ev <- ev_df[[j]]
long_rows[[length(long_rows) + 1]] <- tibble::tibble(
PROLIFIC_PID = pid,
result_id = result_id,
origin = "paradata_general",
event_type = "mousemove",
event_index = j,
timestamp = get_event_value(ev, "timestamp") %||% (if (!is.null(timestamps) && !is.na(last_mouse$index) && length(timestamps) >= last_mouse$index) timestamps[[last_mouse$index]] else NA_character_),
component = get_event_value(ev, "component"),
field = NA_character_,
value = NA_character_,
mouse_x = get_event_value(ev, "x"),
mouse_y = get_event_value(ev, "y"),
scroll_x = NA_real_,
scroll_y = NA_real_,
key_interval_ms = NA_real_,
clipboard_type = NA_character_,
clipboard_text = NA_character_,
clipboard_length = NA_real_,
target_tag = NA_character_,
target_type = NA_character_,
target_id = NA_character_,
target_name = NA_character_,
target_className = NA_character_
)
}
res$long_cols <- c(res$long_cols, "behavior.mouseEvents")
}
}
}
if ("scrollEvents" %in% names(behavior_df)) {
last_scroll <- find_last_non_empty_element(behavior_df$scrollEvents)
ev_df <- last_scroll$value
if (!is.null(ev_df)) {
if (is.data.frame(ev_df)) {
for (j in seq_len(nrow(ev_df))) {
ev <- ev_df[j, ]
long_rows[[length(long_rows) + 1]] <- tibble::tibble(
PROLIFIC_PID = pid,
result_id = result_id,
origin = "paradata_general",
event_type = "scroll",
event_index = j,
timestamp = ev$timestamp %||% (if (!is.null(timestamps) && !is.na(last_scroll$index) && length(timestamps) >= last_scroll$index) timestamps[[last_scroll$index]] else NA_character_),
component = ev$component %||% NA_character_,
field = NA_character_,
value = NA_character_,
mouse_x = NA_real_,
mouse_y = NA_real_,
scroll_x = ev$scrollX %||% NA_real_,
scroll_y = ev$scrollY %||% NA_real_,
key_interval_ms = NA_real_,
clipboard_type = NA_character_,
clipboard_text = NA_character_,
clipboard_length = NA_real_,
target_tag = NA_character_,
target_type = NA_character_,
target_id = NA_character_,
target_name = NA_character_,
target_className = NA_character_
)
}
res$long_cols <- c(res$long_cols, "behavior.scrollEvents")
} else if (is.list(ev_df)) {
ev_df <- null_to_empty(ev_df)
for (j in seq_along(ev_df)) {
ev <- ev_df[[j]]
long_rows[[length(long_rows) + 1]] <- tibble::tibble(
PROLIFIC_PID = pid,
result_id = result_id,
origin = "paradata_general",
event_type = "scroll",
event_index = j,
timestamp = get_event_value(ev, "timestamp") %||% (if (!is.null(timestamps) && !is.na(last_scroll$index) && length(timestamps) >= last_scroll$index) timestamps[[last_scroll$index]] else NA_character_),
component = get_event_value(ev, "component"),
field = NA_character_,
value = NA_character_,
mouse_x = NA_real_,
mouse_y = NA_real_,
scroll_x = get_event_value(ev, "scrollX"),
scroll_y = get_event_value(ev, "scrollY"),
key_interval_ms = NA_real_,
clipboard_type = NA_character_,
clipboard_text = NA_character_,
clipboard_length = NA_real_,
target_tag = NA_character_,
target_type = NA_character_,
target_id = NA_character_,
target_name = NA_character_,
target_className = NA_character_
)
}
res$long_cols <- c(res$long_cols, "behavior.scrollEvents")
}
}
}
}
if (length(long_rows)) {
res$long_rows <- c(res$long_rows, long_rows)
}
return(list(
wide = if (length(res$wide_vals)) tibble::as_tibble(res$wide_vals) else tibble::tibble(),
long = if (length(res$long_rows)) dplyr::bind_rows(res$long_rows) else tibble::tibble(),
wide_cols = res$wide_cols,
wide_cols_output = res$wide_cols_output,
long_cols = res$long_cols,
key_stats = key_stats
))
}
if (is.list(paradata)) {
wide_vals <- c(
flatten_with_prefix(paradata$static, "para_static_"),
flatten_with_prefix(paradata$ip, "para_ip_"),
flatten_with_prefix(paradata$meta, "para_meta_")
)
key_stats <- tibble::tibble(
keyStats_count = paradata$behavior$keyStats$count %||% NA_real_,
keyStats_mean = paradata$behavior$keyStats$mean %||% NA_real_,
keyStats_median = paradata$behavior$keyStats$median %||% NA_real_,
keyStats_min = paradata$behavior$keyStats$min %||% NA_real_,
keyStats_max = paradata$behavior$keyStats$max %||% NA_real_
)
return(list(
wide = if (length(wide_vals)) tibble::as_tibble(wide_vals) else tibble::tibble(),
long = build_paradata_events(pid, result_id, paradata),
wide_cols = names(wide_vals),
wide_cols_output = names(wide_vals),
long_cols = character(),
key_stats = key_stats
))
}
empty
}
build_defocus_events <- function(pid, result_id, defocus_list) {
if (is.null(defocus_list)) return(tibble::tibble())
if (is.data.frame(defocus_list)) {
if (nrow(defocus_list) == 0) return(tibble::tibble())
return(dplyr::mutate(
tibble::as_tibble(defocus_list),
PROLIFIC_PID = pid,
result_id = result_id,
origin = "paradata_focus",
event_index = dplyr::row_number()
) %>%
dplyr::select(PROLIFIC_PID, result_id, origin, event_index, durationblur, senderblur))
}
if (is.list(defocus_list)) {
if (length(defocus_list) == 0) return(tibble::tibble())
return(purrr::map_dfr(seq_along(defocus_list), function(i) {
ev <- defocus_list[[i]]
tibble::tibble(
PROLIFIC_PID = pid,
result_id = result_id,
origin = "paradata_focus",
event_index = i,
durationblur = ev$durationblur %||% NA_real_,
senderblur = ev$senderblur %||% NA_character_
)
}))
}
tibble::tibble()
}
is_missing_value <- function(x) {
if (is.null(x)) return(TRUE)
if (is.list(x)) {
return(length(x) == 0 || all(is.na(unlist(x))))
}
if (length(x) > 1) {
if (is.character(x)) return(all(is.na(x) | x == ""))
return(all(is.na(x)))
}
if (is.character(x)) return(is.na(x) | x == "")
is.na(x)
}4 load packages
require(pacman)
p_load(
"tidyverse",
"jsonlite",
"hms",
"psych",
"afex",
"writexl",
"readxl",
"haven"
)5 create raw data files
if (createRawFiles) {
dir.create(raw_output_dir, recursive = TRUE, showWarnings = FALSE)
data_files <- list.files(
data_root,
pattern = "data.txt$",
recursive = TRUE,
full.names = TRUE
)
results <- list()
for (file in data_files) {
result_folder <- basename(dirname(dirname(file)))
result_id <- stringr::str_remove(result_folder, "^study_result_")
tmp <- jsonlite::fromJSON(file)
tmp[, colnames(tmp) == ""] <- NULL
tmp <- dplyr::as_tibble(tmp)
tmp$result_id <- as.character(result_id)
results[[length(results) + 1]] <- tmp
}
saveRDS(results, file = file.path(raw_output_dir, "study.rds"))
json_lines <- purrr::map_chr(results, ~ jsonlite::toJSON(.x, auto_unbox = TRUE))
writeLines(json_lines, con = file.path(raw_output_dir, "study.jsonl"))
print("Raw files have been created in this run!")
} else {
print("Raw files have been created in previous run!")
}[1] "Raw files have been created in this run!"
6 load JATOS metadata
metadata_files <- list.files(
data_root,
pattern = "jatos_results_metadata_.*\\.csv$",
full.names = TRUE
)
metadata_file <- metadata_files[which.max(file.info(metadata_files)$mtime)]
metadata <- readr::read_csv(metadata_file, show_col_types = FALSE)
names(metadata) <- make.names(names(metadata))
metadata <- metadata %>%
mutate(
result_id = as.character(Result.ID),
duration_seconds = purrr::map_dbl(Duration, parse_duration_seconds)
)
metadata_small <- metadata %>%
select(result_id, State, Duration, duration_seconds)
table(metadata_small$State)
FINISHED PRE
68 20
7 load Prolific demographics
prolific_demographic_file <- file.path(
data_root,
"prolific_demographic_export_69d4ced16907d423ae787668.csv"
)
prolific_demographic_raw <- readr::read_csv(prolific_demographic_file, show_col_types = FALSE)
names(prolific_demographic_raw) <- make.names(names(prolific_demographic_raw))
prolific_demographic <- prolific_demographic_raw %>%
rename(PROLIFIC_PID = Participant.id) %>%
mutate(PROLIFIC_PID = as.character(PROLIFIC_PID)) %>%
distinct(PROLIFIC_PID, .keep_all = TRUE) %>%
rename_with(~ paste0("socidem_", .x), -PROLIFIC_PID)8 build participant-level data
study <- readRDS(file.path(raw_output_dir, "study.rds"))
core_vars <- c(
"dummy_informedconsent",
"hp_exclusionCriteria",
"commCheck",
"counterVisualTraps",
"visualTrapResponse",
"visualTrap_Planets_response",
"visualTrap_Planets_responseLabel",
"visualTrap_Planets_correct",
"visualTrap_Planets_rtMs",
"visualTrap_Collision_response",
"visualTrap_Collision_responseLabel",
"visualTrap_Collision_correct",
"visualTrap_Collision_rtMs",
"visualTrap_CafeWall_response",
"visualTrap_CafeWall_responseLabel",
"visualTrap_CafeWall_correct",
"visualTrap_CafeWall_rtMs",
"visualTrap_Circles_response",
"visualTrap_Circles_responseLabel",
"visualTrap_Circles_correct",
"visualTrap_Circles_rtMs",
"visualTrap_Lines_response",
"visualTrap_Lines_responseLabel",
"visualTrap_Lines_correct",
"visualTrap_Lines_rtMs",
"visualTrap_Robot_response",
"visualTrap_Robot_responseLabel",
"visualTrap_Robot_correct",
"visualTrap_Robot_rtMs",
"visualTrap_order",
"condition_FutureSociety",
"cue",
"cue_coding",
"R1",
"R2",
"R3",
"R4",
"R5",
"para_countclicks",
"AR-2",
"AR-5",
"AR-3",
"AR-1",
"AR-4",
"undUtopiaGeneral",
"undUtopiaGoal",
"undUtopiaHow",
"clearUtopia",
"clearUtopiatext",
"biasUtopia",
"biasUtopiatext",
"attributesFutureSociety-5",
"attributesFutureSociety-6",
"attributesFutureSociety-2",
"attributesFutureSociety-3",
"attributesFutureSociety-4",
"attributesFutureSociety-7",
"attributesFutureSociety-1",
"attributesFutureSociety-8",
"utopiaPrototypeAssignment",
"protoConfidenceRadio",
"protoReasonText",
"feedback_critic"
)
core_rows <- list()
paradata_events_rows <- list()
paradata_defocus_rows <- list()
paradata_wide_cols_rows <- list()
paradata_long_cols_rows <- list()
for (i in seq_along(study)) {
print(i)
df <- study[[i]]
pid <- if ("PROLIFIC_PID" %in% names(df)) last_non_empty(df$PROLIFIC_PID) else NA
result_id <- if ("result_id" %in% names(df)) last_non_empty(df$result_id) else NA
core_values <- lapply(core_vars, function(v) {
val <- if (v %in% names(df)) {
last_non_empty(df[[v]])
} else if (v == "attributesFutureSociety-8" && "attributesFutureSociety-undefined" %in% names(df)) {
last_non_empty(df[["attributesFutureSociety-undefined"]])
} else {
NA
}
normalize_cell(val)
})
names(core_values) <- core_vars
core_row <- tibble::as_tibble(core_values)
if ("visualTrap_order" %in% names(core_row)) {
core_row$visualTrap_order <- collapse_list_to_string(core_row$visualTrap_order)
}
assoc_list <- if ("sucsessfulAssociations" %in% names(df)) last_non_empty(df$sucsessfulAssociations) else NULL
assoc_wide <- assoc_to_wide(assoc_list)
paradata <- if ("paradata_general" %in% names(df)) df$paradata_general else NULL
paradata_summary <- process_paradata_general(pid, result_id, paradata)
paradata_wide <- paradata_summary$wide
key_stats_row <- paradata_summary$key_stats
core_row <- dplyr::bind_cols(
tibble::tibble(result_id = result_id, PROLIFIC_PID = pid),
core_row,
assoc_wide,
paradata_wide,
key_stats_row
)
core_rows[[length(core_rows) + 1]] <- core_row
paradata_events <- paradata_summary$long
if (nrow(paradata_events) > 0) {
paradata_events_rows[[length(paradata_events_rows) + 1]] <- paradata_events
}
if (length(paradata_summary$wide_cols) > 0) {
paradata_wide_cols_rows[[length(paradata_wide_cols_rows) + 1]] <- tibble::tibble(
PROLIFIC_PID = pid,
result_id = result_id,
field = paradata_summary$wide_cols,
output_field = paradata_summary$wide_cols_output,
destination = "wide"
)
}
if (length(paradata_summary$long_cols) > 0) {
paradata_long_cols_rows[[length(paradata_long_cols_rows) + 1]] <- tibble::tibble(
PROLIFIC_PID = pid,
result_id = result_id,
field = paradata_summary$long_cols,
output_field = NA_character_,
destination = "long"
)
}
defocus_list <- if ("para_defocuscount" %in% names(df)) last_non_empty(df$para_defocuscount) else NULL
defocus_events <- build_defocus_events(pid, result_id, defocus_list)
if (nrow(defocus_events) > 0) {
paradata_defocus_rows[[length(paradata_defocus_rows) + 1]] <- defocus_events
}
}[1] 1
[1] 2
[1] 3
[1] 4
[1] 5
[1] 6
[1] 7
[1] 8
[1] 9
[1] 10
[1] 11
[1] 12
[1] 13
[1] 14
[1] 15
[1] 16
[1] 17
[1] 18
[1] 19
[1] 20
[1] 21
[1] 22
[1] 23
[1] 24
[1] 25
[1] 26
[1] 27
[1] 28
[1] 29
[1] 30
[1] 31
[1] 32
[1] 33
[1] 34
[1] 35
[1] 36
[1] 37
[1] 38
[1] 39
[1] 40
[1] 41
[1] 42
[1] 43
[1] 44
[1] 45
[1] 46
[1] 47
[1] 48
[1] 49
[1] 50
[1] 51
[1] 52
[1] 53
[1] 54
[1] 55
[1] 56
[1] 57
[1] 58
[1] 59
[1] 60
[1] 61
[1] 62
[1] 63
[1] 64
[1] 65
[1] 66
[1] 67
[1] 68
[1] 69
[1] 70
[1] 71
[1] 72
[1] 73
[1] 74
[1] 75
[1] 76
[1] 77
[1] 78
[1] 79
[1] 80
[1] 81
[1] 82
[1] 83
[1] 84
[1] 85
[1] 86
[1] 87
[1] 88
core_df <- dplyr::bind_rows(core_rows)
paradata_events_long <- dplyr::bind_rows(paradata_events_rows)
paradata_defocus_long <- dplyr::bind_rows(paradata_defocus_rows)
paradata_column_map <- dplyr::bind_rows(paradata_wide_cols_rows, paradata_long_cols_rows)
if (nrow(paradata_events_long) == 0) {
paradata_events_long <- tibble::tibble(
PROLIFIC_PID = character(),
result_id = character(),
origin = character(),
event_type = character(),
event_index = integer(),
timestamp = character(),
component = character(),
field = character(),
value = character(),
mouse_x = double(),
mouse_y = double(),
scroll_x = double(),
scroll_y = double(),
key_interval_ms = double(),
clipboard_type = character(),
clipboard_text = character(),
clipboard_length = double(),
target_tag = character(),
target_type = character(),
target_id = character(),
target_name = character(),
target_className = character()
)
}
if (nrow(paradata_defocus_long) == 0) {
paradata_defocus_long <- tibble::tibble(
PROLIFIC_PID = character(),
result_id = character(),
origin = character(),
event_index = integer(),
durationblur = double(),
senderblur = character()
)
}
if (nrow(paradata_column_map) == 0) {
paradata_column_map <- tibble::tibble(
PROLIFIC_PID = character(),
result_id = character(),
field = character(),
output_field = character(),
destination = character()
)
}
core_df <- core_df %>%
left_join(metadata_small, by = "result_id") %>%
left_join(prolific_demographic, by = "PROLIFIC_PID")9 filter to complete datasets
optional_text <- c("clearUtopiatext", "biasUtopiatext", "protoReasonText", "feedback_critic")
required_cols <- setdiff(
c("PROLIFIC_PID",
core_vars,
"assoc1_response", "assoc2_response", "assoc3_response", "assoc4_response", "assoc5_response",
"assoc1_valence", "assoc2_valence", "assoc3_valence", "assoc4_valence", "assoc5_valence"
),
optional_text
)
table(core_df$State)
FINISHED PRE
69 19
# clean_complete <- core_df
clean_complete <- core_df %>%
filter(State == "FINISHED")
dim(clean_complete)[1] 69 182
table(clean_complete$condition_FutureSociety)
aicentered anarchic futurist lawBased moderngreen primitivist
6 14 16 7 10 7
religious
8
to_numeric <- function(x) {
if (is.numeric(x)) return(x)
suppressWarnings(as.numeric(as.character(x)))
}
to_logical <- function(x) {
if (is.logical(x)) return(x)
if (is.factor(x)) x <- as.character(x)
as.logical(x)
}
num_cols <- c(
"counterVisualTraps",
"para_countclicks",
"visualTrap_Planets_rtMs",
"visualTrap_Collision_rtMs",
"visualTrap_CafeWall_rtMs",
"visualTrap_Circles_rtMs",
"visualTrap_Lines_rtMs",
"visualTrap_Robot_rtMs",
"AR-1",
"AR-2",
"AR-3",
"AR-4",
"AR-5",
"clearUtopia",
"biasUtopia",
"attributesFutureSociety-1",
"attributesFutureSociety-2",
"attributesFutureSociety-3",
"attributesFutureSociety-4",
"attributesFutureSociety-5",
"attributesFutureSociety-6",
"attributesFutureSociety-7",
"attributesFutureSociety-8",
"protoConfidenceRadio",
"assoc1_valence",
"assoc2_valence",
"assoc3_valence",
"assoc4_valence",
"assoc5_valence",
"keyStats_count",
"keyStats_mean",
"keyStats_median",
"keyStats_min",
"keyStats_max",
"duration_seconds",
"socidem_Age"
)
bool_cols <- c(
"hp_exclusionCriteria",
"visualTrap_Planets_correct",
"visualTrap_Collision_correct",
"visualTrap_CafeWall_correct",
"visualTrap_Circles_correct",
"visualTrap_Lines_correct",
"visualTrap_Robot_correct"
)
factor_cols <- c(
"dummy_informedconsent",
"commCheck",
"condition_FutureSociety",
"cue_coding",
"utopiaPrototypeAssignment",
"visualTrapResponse",
"visualTrap_Planets_response",
"visualTrap_Collision_response",
"visualTrap_CafeWall_response",
"visualTrap_Circles_response",
"visualTrap_Lines_response",
"visualTrap_Robot_response",
"socidem_Sex",
"socidem_Ethnicity.simplified",
"socidem_Student.status",
"socidem_Employment.status"
)
clean_complete <- clean_complete %>%
mutate(
across(any_of(bool_cols), ~ to_logical(.)),
across(any_of(num_cols), ~ to_numeric(.)),
across(any_of(factor_cols), ~ as.factor(.))
)10 save outputs
dir.create(questionnaire_output_dir, recursive = TRUE, showWarnings = FALSE)
dir.create(paradata_output_dir, recursive = TRUE, showWarnings = FALSE)
saveRDS(clean_complete, file = file.path(questionnaire_output_dir, "clean_complete_wide.rds"))
writexl::write_xlsx(x = clean_complete, path = file.path(questionnaire_output_dir, "clean_complete_wide.xlsx"))
write.csv(clean_complete, file = file.path(questionnaire_output_dir, "clean_complete_wide.csv"), row.names = FALSE)
saveRDS(paradata_events_long, file = file.path(paradata_output_dir, "paradata_events_long.rds"))
writexl::write_xlsx(x = paradata_events_long, path = file.path(paradata_output_dir, "paradata_events_long.xlsx"))
write.csv(paradata_events_long, file = file.path(paradata_output_dir, "paradata_events_long.csv"), row.names = FALSE)
saveRDS(paradata_defocus_long, file = file.path(paradata_output_dir, "paradata_defocus_long.rds"))
writexl::write_xlsx(x = paradata_defocus_long, path = file.path(paradata_output_dir, "paradata_defocus_long.xlsx"))
write.csv(paradata_defocus_long, file = file.path(paradata_output_dir, "paradata_defocus_long.csv"), row.names = FALSE)
writexl::write_xlsx(x = clean_complete[, c("PROLIFIC_PID", "condition_FutureSociety","clearUtopia", "clearUtopiatext", "biasUtopia", "biasUtopiatext", "protoConfidenceRadio", "protoReasonText")], path = file.path(questionnaire_output_dir, "clean_complete_wide_text.xlsx"))