Data preparation pilot study

Authors

Julius Fenn, Stephanie Bugler

1 Notes

2 Global variables

Define your global variables (running this can take some time):
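
A minimal sketch of what such settings might look like; the variable names below are placeholders for illustration only, not taken from the project:

# placeholder globals (illustrative names only)
# save_outputs <- TRUE   # write .xlsx / .rds files to the outputs folder
# verbose      <- FALSE  # print extra diagnostics while running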

3 Create raw data files

# set the directory containing this script as the working directory
# setwd(dirname(rstudioapi::getSourceEditorContext()$path))

### load packages
require(pacman)
p_load('tidyverse', 'jsonlite',
       'stargazer',  'DT', 'psych',
       'writexl')


### load socio-demographic data
setwd("data demographic")
# prolific <- read.csv(file = "prolific_export_65d4744b1373145896174eba.csv", header = TRUE)

### load JATOS data
setwd("../data")
suppressMessages(
  read_file('jatos_results_data_20250405122610.txt') %>%
    # ... split it into lines ...
    str_split('\n') %>% first() %>%
    # ... filter empty rows ...
    discard(function(x)
      x == '') %>%
    discard(function(x)
      x == '\r') %>%
    # ... parse JSON into a data.frame
    map_dfr(fromJSON, flatten = TRUE)
) -> dat

# Read and parse each JSON line into a list
# json_data <- suppressMessages(
#   read_file("jatos_results_data_20250405122610.txt") %>%
#     str_split("\n") %>%
#     first() %>%
#     discard(~ .x == "" || .x == "\r") %>%
#     map(~ fromJSON(.x, simplifyVector = FALSE)) # Keep full nested structure
# )

#> add ID counter
dat$ID <- NA

# each "Greetings" screen marks the start of a new participant's data
tmp_IDcounter <- 0
for (i in seq_len(nrow(dat))) {
  if (!is.na(dat$sender[i]) &&
      dat$sender[i] == "Greetings") {
    tmp_IDcounter <- tmp_IDcounter + 1
  }
  dat$ID[i] <- tmp_IDcounter
}
rm(tmp_IDcounter)
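
# vectorized equivalent of the loop above (kept commented out, like the
# parsing alternative further up): count cumulative "Greetings" rows
# dat$ID <- cumsum(!is.na(dat$sender) & dat$sender == "Greetings")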


### load functions
setwd("../functions")
for (i in seq_along(dir())) {
  # print(dir()[i])
  source(dir()[i], encoding = "utf-8")
}

rm(i)


### summary function: mean and standard error of varname per group
data_summary <- function(data, varname, groupnames){
  require(plyr)
  summary_func <- function(x, col){
    c(mean = mean(x[[col]], na.rm=TRUE),
      se = sd(x[[col]], na.rm=TRUE) / sqrt(sum(!is.na(x[[col]]))))
  }
  data_sum<-ddply(data, groupnames, .fun=summary_func,
                  varname)
  data_sum <- plyr::rename(data_sum, c("mean" = varname))
  return(data_sum)
}
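
# hedged usage example: "rating" and "condition" are placeholder column
# names for illustration, not columns of this dataset
# data_summary(data = questionnaire, varname = "rating", groupnames = "condition")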

4 Set up data.frame questionnaires

### keep only complete data sets
sort(table(dat$ID))

 2  6 13 15 25 26 35 44 57 34  1  3  4  5  7  8  9 10 11 12 14 16 17 18 19 20 
 2  2  2  2  2  2  2  2  2 13 17 17 17 17 17 17 17 17 17 17 17 17 17 17 17 17 
21 22 23 24 27 28 29 30 31 32 33 36 37 38 39 40 41 42 43 45 46 47 48 49 50 51 
17 17 17 17 17 17 17 17 17 17 17 17 17 17 17 17 17 17 17 17 17 17 17 17 17 17 
52 53 54 55 56 58 59 
17 17 17 17 17 17 17 
sum(table(dat$ID) != max(table(dat$ID)))
[1] 10
sum(table(dat$ID) == max(table(dat$ID)))
[1] 49
dat <-
  dat[dat$ID %in% names(table(dat$ID))[table(dat$ID) == max(table(dat$ID))], ]
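
# sanity check: by construction, all remaining participants now contribute
# the maximum (i.e. complete) number of rows
# all(table(dat$ID) == max(table(dat$ID)))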



### json (from JATOS) to 2D data.frame
# > pre study
# add paradata
tmp_notNumeric <-
  str_subset(string = colnames(dat), pattern = "^meta|^sustainable|^bioinspired|^rationalCon|^feedback")
tmp_notNumeric <-
  str_subset(string = tmp_notNumeric,
             pattern = "labjs|location",
             negate = TRUE)

### get survey
vec_ques <- c("PROLIFIC_PID",
              "dummy_informedconsent",
              tmp_notNumeric)

vec_notNumeric <- c("PROLIFIC_PID", tmp_notNumeric)

questionnaire <- questionnairetype(
  dataset = dat,
  listvars = vec_ques,
  notNumeric = vec_notNumeric,
  verbose = FALSE
)

dim(questionnaire)
[1] 49 39
### get word list of associations
library(tidyverse)

# select the association columns and reshape them into long format
wordlistAssociations <- questionnaire %>%
  select(PROLIFIC_PID, starts_with("bioinspired_R"), starts_with("sustainable_R")) %>%
  pivot_longer(
    cols = starts_with(c("bioinspired_R", "sustainable_R")),
    names_to = c("typeWord", "orderAssociation"),
    names_pattern = "(bioinspired|sustainable)_R(\\d)",
    values_to = "association"
  ) %>%
  mutate(orderAssociation = as.integer(orderAssociation)) %>%
  arrange(PROLIFIC_PID, typeWord, orderAssociation)
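
# the result is one row per named association
# (columns: PROLIFIC_PID, typeWord, orderAssociation, association)
# head(wordlistAssociations)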



### get edge list of seen connections
edge_list <- list()

for(i in unique(dat$ID)){
  tmp_connections <- dat$drawnConnections[dat$ID == i]
  tmp_connections <- tmp_connections[!sapply(tmp_connections, is.null)]
  tmp_rational <- questionnaire[questionnaire$ID == i, str_subset(colnames(questionnaire), "rationalCon")]

  # Only proceed if there is at least one set of connections
  if (length(tmp_connections) > 0 && !is.null(unlist(tmp_connections))) {
    # Assuming only one non-null element per ID (if not, loop or bind all)
    con_df <- tmp_connections[[1]]
    
    # Flatten tmp_rational into a character vector
    rat_vec <- as.character(tmp_rational[1, ])
    
    # Pad rational vector to match the number of rows in con_df
    if (length(rat_vec) < nrow(con_df)) {
      rat_vec <- c(rat_vec, rep(NA, nrow(con_df) - length(rat_vec)))
    } else if (length(rat_vec) > nrow(con_df)) {
      rat_vec <- rat_vec[1:nrow(con_df)]
    }
    
    con_df$rational <- rat_vec
    
    # Add to result list
    edge_list[[as.character(i)]] <- con_df
  }
}

# Combine all into a single data frame
edgelistAssociations <- bind_rows(edge_list, .id = "ID")
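
# optional glimpse: each row is one drawn connection plus the rationale text
# str(edgelistAssociations)
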
### save files
setwd("outputs")
#> questionnaire
## save as .xlsx file
writexl::write_xlsx(x = questionnaire, path = "questionnaire.xlsx")
## save as R object
saveRDS(questionnaire, file = "questionnaire.rds")

#> wordlistAssociations
## save as .xlsx file
writexl::write_xlsx(x = wordlistAssociations, path = "wordlistAssociations.xlsx")
## save as R object
saveRDS(wordlistAssociations, file = "wordlistAssociations.rds")

#> edgelistAssociations
## save as .xlsx file
writexl::write_xlsx(x = edgelistAssociations, path = "edgelistAssociations.xlsx")
## save as R object
saveRDS(edgelistAssociations, file = "edgelistAssociations.rds")
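
# to reload the saved objects later (paths relative to the outputs folder set
# above), e.g.:
# questionnaire <- readRDS("questionnaire.rds")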

5 Show data

DT::datatable(questionnaire, options = list(pageLength = 5))
summary(as.numeric(table(edgelistAssociations$ID)))
   Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
  1.000   2.000   4.000   4.022   5.000  11.000 
table(edgelistAssociations$ID)

 1 10 11 12 14 16 17 18 19 20 21 22 23 24 28 29  3 30 31 32 33 36 37 38 39  4 
 2  4  5  4  5  5  2  3  8 11  5  4  2  8  6  2  9  5  2  4  3  1  3  1  1  6 
40 41 43 45 46 48 49  5 50 51 52 53 54 56 58 59  7  8  9 
 4  3  6  2  7  2  4  6  1  3  2  5  4  4  2  4  5  3  3 
DT::datatable(wordlistAssociations, options = list(pageLength = 5))
DT::datatable(edgelistAssociations, options = list(pageLength = 5))