Data preperation all studies

Author

Julius Fenn, Katja Pollak

1 Notes

2 global variables

Define your global variables (e.g., to reduce run time):

createRawFiles <- FALSE

3 functions

########################################
# from JATOS to table
########################################
### argss:
# dataset = dat_secondPostCAM
# listvars = vec_ques
# notNumeric = vec_notNumeric
# verbose = TRUE


questionnairetype <- function(dataset,
                              listvars = ques_mixed,
                              notNumeric = vec_notNumeric,
                              verbose=FALSE){

  datasetques <- data.frame(ID = unique(dataset$ID))

  for(c in 1:length(listvars)){
    if(verbose){
      print(c)
    }

    if(any(colnames(dataset) == listvars[c])){
      if(verbose){
        print(listvars[c])
      }

      ## tmp IDs
      tmpid <- dataset$ID[!is.na(dataset[, listvars[c]])]
      ## tmp value variable
      tmpvalue <- dataset[, listvars[c]][!is.na(dataset[, listvars[c]])]
      datasetques[listvars[c]]  <- NA

      if(listvars[c] %in% notNumeric){
        datasetques[datasetques$ID %in% tmpid, listvars[c]] <- tmpvalue
      }else if(is.list(tmpvalue)){
        tmpvalue_tmp <- unique(tmpvalue)
        tmpvalue <- c()
        counter = 1
        for(i in 1:length(tmpvalue_tmp)){
          if(!is.null(tmpvalue_tmp[[i]])){
            tmpvalue[counter] <- paste0(tmpvalue_tmp[[i]], collapse = " - ")
            counter = counter + 1
          }
        }
        datasetques[datasetques$ID %in% tmpid, listvars[c]] <- tmpvalue
      }else{
        datasetques[datasetques$ID %in% tmpid, listvars[c]] <- as.numeric(tmpvalue)
      }
    }
  }
  return(datasetques)
}

4 load packages

### load packages
require(pacman)
p_load('tidyverse', 'jsonlite',
       'stargazer',  'DT', 'psych',
       'writexl')
# devtools::install_github("samuelae/associatoR")

5 create raw data files

if(createRawFiles){
  # Define root paths
  source_root <- "../../data"
  output_dir <- "outputs/raw"
  
  
  # Define studies with paths
  studies <- list(
    study1 = file.path(source_root, "study_1"),
    study2_BMI_high = file.path(source_root, "study_2", "BMI_high"),
    study2_BMI_mixed = file.path(source_root, "study_2", "BMI_mixed")
  )
  
  # Loop over each study
  for (study_name in names(studies)) {
    study_path <- studies[[study_name]]
    
    # Get folders matching pattern "study_result*"
    tmp_folders <- list.files(study_path, pattern = "^study_result.*", full.names = TRUE)
    
    results <- list()
    
    for (folder in tmp_folders) {
      # Look for a single subdirectory inside the folder
      inner_folders <- list.files(folder, full.names = TRUE)
      
      if (length(inner_folders) == 1) {
        data_file <- file.path(inner_folders[1], "data.txt")
        
        if (file.exists(data_file)) {
          # Load and store the JSON data
          tmp <- fromJSON(data_file)
          results[[length(results) + 1]] <- tmp
        }
      }
    }
    
    # Save as RDS
    saveRDS(results, file = file.path(output_dir, paste0(study_name, ".rds")))
    
    # Optionally also save as JSON Lines (.jsonl)
    json_lines <- map_chr(results, ~ toJSON(.x, auto_unbox = TRUE))
    writeLines(json_lines, con = file.path(output_dir, paste0(study_name, ".jsonl")))
  }
}else{
  print("Raw files have been created in previous run!")
}
[1] "Raw files have been created in previous run!"

6 prepare data questionnaires

6.1 study 1

load data:

output_dir <- "outputs/raw"

# Load the RDS files into a named list
study1 <- readRDS(file.path(output_dir, "study1.rds"))

# Combine into single data.frame
suppressMessages({
  dat_study1 <- bind_rows(study1)
})

rm(study1)

counter variable:

dat_study1$ID <- cumsum(dat_study1$sender == "Greetings" & !is.na(dat_study1$sender))

table(dat_study1$ID)

   1    2    3    4    5    6    7    8    9   10   11   12   13   14   15   16 
   5 2291    2 2291    5 2230 2291 2291 2291 2291 2291 2291 2291 2291    5 2291 
  17   18   19   20   21   22   23   24   25   26   27   28   29   30   31   32 
2291 2291 2291 2291 2291 2291 2291 2230    5 2291    5 2291 2291    5 2291 2291 
  33   34   35   36   37   38   39   40   41   42   43   44   45   46   47   48 
2291 2291 2291 2291 2230 2291 2291 2291 2291 2261    2 2291 2291 2291 2291 2291 
  49   50   51   52   53   54   55   56   57   58   59   60   61   62   63   64 
2291 2291 2291 2291 2291 2291 2291 2291 2291 2291 2291 2291    2 2291 2291 2291 
  65   66   67   68   69   70   71   72   73   74   75   76   77   78   79   80 
2291 2291 2291    2    5 2291 2291 2291 2291    5 2291    2 2291 2291 2291 2291 
  81   82   83   84   85   86   87   88   89   90   91   92   93   94   95   96 
2291 2291    2 2291 2291 2291 2291 2291 2291 2291 2291    5 2291 2291 2291 2291 
  97   98   99  100  101  102  103  104  105  106  107  108  109  110  111  112 
2291 2291 2291 2291 2291 2291 2291 2291 2291 2291 2291 2291 2291 2291 2291 2291 
 113  114  115  116  117  118  119  120  121  122  123  124  125  126  127  128 
2291    5 2291 2291 2291 2291 2291 2291 2291 2291    2 2291    5 2291    5 2291 
 129  130  131  132  133  134  135  136  137  138  139  140  141  142  143  144 
2291 2291    2 2291 2291 2291 2291    5    5 2291 2291    5    5 2291 2291 2291 
 145  146  147  148  149  150  151  152  153  154  155  156  157  158  159  160 
2291 2291 2291 2291 2291    2 2291 2291    5    2 2291 2291 2291 2291 2291 2291 
 161  162  163  164  165  166  167  168  169  170  171  172  173  174  175  176 
   2 2291 2291 2291 2291 2291 2291 2291 2291 2291 2291 2291 2291 2291 2291 2291 
 177  178  179  180  181  182  183  184  185  186  187  188  189  190  191  192 
   5 2291    2 2291 2291 2291 2291 2291 2291 2291 2291 2291 2291 2291 2291 2291 
 193  194  195  196  197  198  199  200  201  202  203  204  205  206  207  208 
2291 2291 2291    2 2291 2291    5 2291 2291 2291 2291 2291 2291    2 2291 2291 
 209  210  211  212  213  214  215  216  217  218  219  220  221  222  223  224 
2291    5 2291 2291 2291 2291 2291 2291 2291 2291 2291 2291 2291 2291 2291 2291 
 225  226  227  228  229  230  231  232  233  234  235  236  237  238  239  240 
2291 2291 2291 2291    5    5 2291 2291 2291    5 2291 2291 2291 2291    5 2291 
 241  242  243  244  245  246  247  248  249  250  251  252  253  254  255  256 
2291    5 2291 2291 2291 2291 2291 2291 2291 2291 2291 2291    5 2291 2291 2291 
 257  258  259  260  261  262  263  264  265  266  267  268  269  270  271  272 
2291 2291 2291 2291    5 2291 2291 2291 2291 2291    5 2291    5    2 2291 2291 
 273  274  275  276  277  278  279  280  281  282  283  284  285  286  287  288 
2291 2291 2291 2291 2291 2291 2291 2291 2291 2291 2291 2291 2291 2291 2291 2291 
 289 
   5 

filter:

sum(table(dat_study1$ID) != max(table(dat_study1$ID)))
[1] 49
sum(table(dat_study1$ID) == max(table(dat_study1$ID)))
[1] 240
length(unique(dat_study1$ID))
[1] 289
dat_study1 <-
  dat_study1[dat_study1$ID %in% names(table(dat_study1$ID))[table(dat_study1$ID) == max(table(dat_study1$ID))], ]
length(unique(dat_study1$ID))
[1] 240

questionnaire:

some elements are lists:

colnames(dat_study1)[sapply(dat_study1, is.list)]
[1] "url"               "meta"              "key_assignment"   
[4] "para_defocuscount"

add meta manually and remove it:

meta_df <- dat_study1$meta[, c("language", "screen_width", "screen_height", "userAgent")]
dat_study1 <- bind_cols(dat_study1, meta_df)
New names:
• `...7` -> `...157`
rm(meta_df)
dat_study1$meta <- NULL

create questionnaire:

vec_notNumeric <- c("PROLIFIC_PID",
                    "sociodemo_residency", "sociodemo_gender", "sociodemo_sexualOrientation",
                    "sociodemo_weight_type", "sociodemo_height_type",
                    "feedback_critic", 
                    "language", "screen_width", "screen_height", "userAgent",
                    str_subset(string = colnames(dat_study1), pattern = "^normalweight_R"),
                    str_subset(string = colnames(dat_study1), pattern = "^underweight_R"),
                    str_subset(string = colnames(dat_study1), pattern = "^obesity_R"),
                    str_subset(string = colnames(dat_study1), pattern = "^overweight_R"))

### get survey
vec_ques <- c("PROLIFIC_PID",
              "dummy_informedconsent",
              "commCheck", "attCheck", "feedback_conscientiousCompletion",
              sort(str_subset(string = colnames(dat_study1), pattern = "^pair")),
              sort(str_subset(string = colnames(dat_study1), pattern = "^PathogenDisgus")),
              sort(str_subset(string = colnames(dat_study1), pattern = "^GermAversion")),
              sort(str_subset(string = colnames(dat_study1), pattern = "^BeliefsAboutObesePersons")),
              sort(str_subset(string = colnames(dat_study1), pattern = "^AttitudeTowardsObesePeople")),
              sort(str_subset(string = colnames(dat_study1), pattern = "^PerceivedCausesofObesity")),

              sort(str_subset(string = colnames(dat_study1), pattern = "^sociodemo")),
              
              sort(str_subset(string = colnames(dat_study1), pattern = "^normalweight")),
              sort(str_subset(string = colnames(dat_study1), pattern = "^underweigh")),
              sort(str_subset(string = colnames(dat_study1), pattern = "^obesity")),
              sort(str_subset(string = colnames(dat_study1), pattern = "^overweight")),

              vec_notNumeric)



ques_study1 <- questionnairetype(
  dataset = dat_study1,
  listvars = vec_ques,
  notNumeric = vec_notNumeric,
  verbose = FALSE
)

dim(ques_study1)
[1] 240 130
setwd("outputs/questionnaire")

saveRDS(ques_study1, file = paste0("ques_study1", ".rds"))

6.2 study 2 - BMI high

load data:

output_dir <- "outputs/raw"

# Load the RDS files into a named list
study2_BMI_high <- readRDS(file.path(output_dir, "study2_BMI_high.rds"))

# Combine into single data.frame
suppressMessages({
  dat_study2_BMI_high <- bind_rows(study2_BMI_high)
})

rm(study2_BMI_high)

counter variable:

dat_study2_BMI_high$ID <- cumsum(dat_study2_BMI_high$sender == "Greetings" & !is.na(dat_study2_BMI_high$sender))

table(dat_study2_BMI_high$ID)

   1    2    3    4    5    6    7    8    9   10   11   12   13   14   15   16 
2922 2922 2922 2922 2922 2922 2922 2922 2922 2922 2922    4 2922 2922 2922 2922 
  17   18   19   20   21   22   23   24   25   26   27 
2922    4    4 2922 2922 2922 2922 2922 2922 2922 2920 

filter:

sum(table(dat_study2_BMI_high$ID) != max(table(dat_study2_BMI_high$ID)))
[1] 4
sum(table(dat_study2_BMI_high$ID) == max(table(dat_study2_BMI_high$ID)))
[1] 23
length(unique(dat_study2_BMI_high$ID))
[1] 27
dat_study2_BMI_high <-
  dat_study2_BMI_high[dat_study2_BMI_high$ID %in% names(table(dat_study2_BMI_high$ID))[table(dat_study2_BMI_high$ID) == max(table(dat_study2_BMI_high$ID))], ]
length(unique(dat_study2_BMI_high$ID))
[1] 23

questionnaire:

some elements are lists:

colnames(dat_study2_BMI_high)[sapply(dat_study2_BMI_high, is.list)]
[1] "url"                      "meta"                    
[3] "para_defocuscount"        "key_assignment"          
[5] "unsucsessfulAssociations" "sucsessfulAssociations"  

add meta manually and remove it:

meta_df <- dat_study2_BMI_high$meta[, c("language", "screen_width", "screen_height", "userAgent")]
dat_study2_BMI_high <- bind_cols(dat_study2_BMI_high, meta_df)
New names:
• `...6` -> `...131`
rm(meta_df)
dat_study2_BMI_high$meta <- NULL

create questionnaire:

vec_notNumeric <- c("PROLIFIC_PID", "study_condition",
                    "sociodemo_residency", "sociodemo_gender",
                    "sociodemo_weight_type", "sociodemo_height_type",
                    "feedback_critic", 
                    "language", "screen_width", "screen_height", "userAgent")

### get survey
vec_ques <- c("PROLIFIC_PID", "study_condition",
              "dummy_informedconsent",
              "commCheck", "attCheck", "feedback_conscientiousCompletion",
              sort(str_subset(string = colnames(dat_study2_BMI_high), pattern = "^pair")),
              sort(str_subset(string = colnames(dat_study2_BMI_high), pattern = "^PathogenDisgus")),
              sort(str_subset(string = colnames(dat_study2_BMI_high), pattern = "^GermAversion")),
              sort(str_subset(string = colnames(dat_study2_BMI_high), pattern = "^BeliefsAboutObesePersons")),
              sort(str_subset(string = colnames(dat_study2_BMI_high), pattern = "^AttitudeTowardsObesePeople")),
              sort(str_subset(string = colnames(dat_study2_BMI_high), pattern = "^PerceivedCausesofObesity")),

              sort(str_subset(string = colnames(dat_study2_BMI_high), pattern = "^sociodemo")),
              
              "weight_avatar_choice1", "weight_avatar_choice2",

              vec_notNumeric)



ques_study2_BMI_high <- questionnairetype(
  dataset = dat_study2_BMI_high,
  listvars = vec_ques,
  notNumeric = vec_notNumeric,
  verbose = FALSE
)

dim(ques_study2_BMI_high)
[1] 23 95
setwd("outputs/questionnaire")

saveRDS(ques_study2_BMI_high, file = paste0("ques_study2_BMI_high", ".rds"))

6.3 study 2 - BMI mixed

load data:

output_dir <- "outputs/raw"

# Load the RDS files into a named list
study2_BMI_mixed <- readRDS(file.path(output_dir, "study2_BMI_mixed.rds"))

# Combine into single data.frame
suppressMessages({
  dat_study2_BMI_mixed <- bind_rows(study2_BMI_mixed)
})

rm(study2_BMI_mixed)

counter variable:

dat_study2_BMI_mixed$ID <- cumsum(dat_study2_BMI_mixed$sender == "Greetings" & !is.na(dat_study2_BMI_mixed$sender))

table(dat_study2_BMI_mixed$ID)

   1    2    3    4    5    6    7    8    9   10   11   12   13   14   15   16 
2920 2920 2920 2920 2920 2920    2 2920 2920    2    2 2920 2920 2920 2920 2920 
  17   18   19   20   21   22   23   24   25   26   27   28   29   30   31   32 
2920 2920 2920 2920 2920 2920 2920 2920 2920 2920 2920 2920 2920    2 2920 2920 
  33   34   35   36   37   38   39   40   41   42   43   44   45   46   47   48 
2920    2 2920 2920 2920 2920    2 2920    2 2920 2920 2920    2 2920    2    2 
  49   50   51   52   53   54   55   56   57   58   59   60   61   62   63   64 
2920 2920 2920 2920 2920 2920    2    2 2920 2920 2920 2920 2920 2920 2920 2920 
  65   66   67   68   69   70   71   72   73   74   75   76   77   78   79   80 
2920 2920 2920 2920 2920    2 2920 2920 2920 2920    2 2920 2920    2 2920 2920 
  81   82   83   84   85   86   87   88   89   90   91   92   93   94   95   96 
2920 2920 2920 2920 2920 2920    2    2 2920    2 2920 2920 2920 2920 2920    2 
  97   98   99  100  101  102  103  104  105  106  107  108  109  110  111  112 
2920 2920 2920    2    2    2 2920    2 2920 2920 2920    2 2920 2920    2 2920 
 113  114  115  116  117  118  119  120  121  122  123  124  125  126  127  128 
   2 2913    2    2 2920    2 2920 2920 2920 2920 2920 2920 2920    2 2920 2920 
 129  130  131  132  133  134 
   2    2    2 2920 2920 2920 

filter:

sum(table(dat_study2_BMI_mixed$ID) != max(table(dat_study2_BMI_mixed$ID)))
[1] 34
sum(table(dat_study2_BMI_mixed$ID) == max(table(dat_study2_BMI_mixed$ID)))
[1] 100
length(unique(dat_study2_BMI_mixed$ID))
[1] 134
dat_study2_BMI_mixed <-
  dat_study2_BMI_mixed[dat_study2_BMI_mixed$ID %in% names(table(dat_study2_BMI_mixed$ID))[table(dat_study2_BMI_mixed$ID) == max(table(dat_study2_BMI_mixed$ID))], ]
length(unique(dat_study2_BMI_mixed$ID))
[1] 100

questionnaire:

some elements are lists:

colnames(dat_study2_BMI_mixed)[sapply(dat_study2_BMI_mixed, is.list)]
[1] "url"                      "meta"                    
[3] "key_assignment"           "para_defocuscount"       
[5] "unsucsessfulAssociations" "sucsessfulAssociations"  

add meta manually and remove it:

meta_df <- dat_study2_BMI_mixed$meta[, c("language", "screen_width", "screen_height", "userAgent")]
dat_study2_BMI_mixed <- bind_cols(dat_study2_BMI_mixed, meta_df)
New names:
• `...7` -> `...130`
rm(meta_df)
dat_study2_BMI_mixed$meta <- NULL

create questionnaire:

vec_notNumeric <- c("PROLIFIC_PID", "study_condition",
                    "sociodemo_residency", "sociodemo_gender",
                    "sociodemo_weight_type", "sociodemo_height_type",
                    "feedback_critic", 
                    "language", "screen_width", "screen_height", "userAgent")

### get survey
vec_ques <- c("PROLIFIC_PID", "study_condition",
              "dummy_informedconsent",
              "commCheck", "attCheck", "feedback_conscientiousCompletion",
              sort(str_subset(string = colnames(dat_study2_BMI_mixed), pattern = "^pair")),
              sort(str_subset(string = colnames(dat_study2_BMI_mixed), pattern = "^PathogenDisgus")),
              sort(str_subset(string = colnames(dat_study2_BMI_mixed), pattern = "^GermAversion")),
              sort(str_subset(string = colnames(dat_study2_BMI_mixed), pattern = "^BeliefsAboutObesePersons")),
              sort(str_subset(string = colnames(dat_study2_BMI_mixed), pattern = "^AttitudeTowardsObesePeople")),
              sort(str_subset(string = colnames(dat_study2_BMI_mixed), pattern = "^PerceivedCausesofObesity")),

              sort(str_subset(string = colnames(dat_study2_BMI_mixed), pattern = "^sociodemo")),
              
              "weight_avatar_choice1", "weight_avatar_choice2",

              vec_notNumeric)



ques_study2_BMI_mixed <- questionnairetype(
  dataset = dat_study2_BMI_mixed,
  listvars = vec_ques,
  notNumeric = vec_notNumeric,
  verbose = FALSE
)

dim(ques_study2_BMI_mixed)
[1] 100  94

add BMI:

functions:

# Define the conversion functions (same as your second block)
convert_weight_to_kg <- function(weight, weight_type) {
  if (weight_type == "pounds") {
    return(weight * 0.45359237)  
  } else if (weight_type == "kilograms") {
    return(weight)  
  } else {
    return(NA_real_)
  }
}

convert_height_to_cm <- function(height_cm, height_ft, height_ftin, height_type) {
  if (height_type == "centimeters") {
    return(height_cm) 
  } else if (height_type == "feet-inches") {
    feet_in_inches <- height_ft
    inches <- height_ftin
    return(feet_in_inches * 30.48 + inches * 2.54)
  } else {
    return(NA_real_)
  }
}

# Apply the computation using mutate + mapply
ques_study2_BMI_mixed <- ques_study2_BMI_mixed %>%
  mutate(
    weight_kg = mapply(convert_weight_to_kg, 
                       weight = sociodemo_weight, 
                       weight_type = sociodemo_weight_type),
    
    height_cm_val = mapply(convert_height_to_cm, 
                           height_cm = sociodemo_height_cm, 
                           height_ft = sociodemo_height_ft, 
                           height_ftin = sociodemo_height_ftin, 
                           height_type = sociodemo_height_type),
    
    sociodemo_BMI = ifelse(
      !is.na(weight_kg) & !is.na(height_cm_val), 
      weight_kg / ((height_cm_val / 100)^2), 
      NA_real_
    )
  )

add BMI - ques_study2_BMI_mixed:

# Apply the computation using mutate + mapply
ques_study2_BMI_mixed <- ques_study2_BMI_mixed %>%
  mutate(
    weight_kg = mapply(convert_weight_to_kg, 
                       weight = sociodemo_weight, 
                       weight_type = sociodemo_weight_type),
    
    height_cm_val = mapply(convert_height_to_cm, 
                           height_cm = sociodemo_height_cm, 
                           height_ft = sociodemo_height_ft, 
                           height_ftin = sociodemo_height_ftin, 
                           height_type = sociodemo_height_type),
    
    sociodemo_BMI = ifelse(
      !is.na(weight_kg) & !is.na(height_cm_val), 
      weight_kg / ((height_cm_val / 100)^2), 
      NA_real_
    )
  )

ques_study2_BMI_mixed$sociodemo_BMI[ques_study2_BMI_mixed$weight_kg < 40 | ques_study2_BMI_mixed$height_cm_val < 140 | ques_study2_BMI_mixed$height_cm_val > 230] <- NA
# ques_study2_BMI_mixed$weight_kg[ques_study2_BMI_mixed$sociodemo_BMI < 10]
# ques_study2_BMI_mixed$height_cm_val[ques_study2_BMI_mixed$sociodemo_BMI < 10]

add BMI - ques_study1:

ques_study1$sociodemo_height_ft <- NA
ques_study1$sociodemo_height_ftin <- NA


ques_study1 <- ques_study1 %>%
  mutate(
    weight_kg = mapply(convert_weight_to_kg, 
                       weight = sociodemo_weight, 
                       weight_type = sociodemo_weight_type),
    
    height_cm_val = mapply(convert_height_to_cm, 
                           height_cm = sociodemo_height,
                            height_ft = sociodemo_height_ft, 
                           height_ftin = sociodemo_height_ftin, 
                           height_type = sociodemo_height_type),
    
    sociodemo_BMI = ifelse(
      !is.na(weight_kg) & !is.na(height_cm_val), 
      weight_kg / ((height_cm_val / 100)^2), 
      NA_real_
    )
  )


ques_study1$sociodemo_BMI[ques_study1$weight_kg < 40 | ques_study1$height_cm_val < 140 | ques_study1$height_cm_val > 230] <- NA
# ques_study1$weight_kg[ques_study1$sociodemo_BMI < 10]
# ques_study1$height_cm_val[ques_study1$sociodemo_BMI < 10]
summary(ques_study2_BMI_high$sociodemo_BMI)
   Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
  30.04   32.04   37.30   36.88   38.90   61.50 
summary(ques_study2_BMI_mixed$sociodemo_BMI)
   Min. 1st Qu.  Median    Mean 3rd Qu.    Max.    NA's 
  16.04   22.34   24.45   26.56   28.14   53.37       4 
summary(ques_study1$sociodemo_BMI)
   Min. 1st Qu.  Median    Mean 3rd Qu.    Max.    NA's 
  14.04   22.22   24.57   25.93   28.58   48.21      91 
setwd("outputs/questionnaire")

saveRDS(ques_study2_BMI_mixed, file = paste0("ques_study2_BMI_mixed", ".rds"))

6.4 combine questionnaires

tmp_surveysVars <- c(sort(str_subset(string = colnames(dat_study2_BMI_mixed), pattern = "^pair")),
              sort(str_subset(string = colnames(dat_study2_BMI_mixed), pattern = "^PathogenDisgus")),
              sort(str_subset(string = colnames(dat_study2_BMI_mixed), pattern = "^GermAversion")),
              sort(str_subset(string = colnames(dat_study2_BMI_mixed), pattern = "^BeliefsAboutObesePersons")),
              sort(str_subset(string = colnames(dat_study2_BMI_mixed), pattern = "^AttitudeTowardsObesePeople")),
              sort(str_subset(string = colnames(dat_study2_BMI_mixed), pattern = "^PerceivedCausesofObesity")))


ques_study1$weight_avatar_choice2 <- NA

ques_study1$source <- "study1"
ques_study2_BMI_high$source <- "study2_BMI_high"
ques_study2_BMI_mixed$source <- "study2_BMI_mixed"



ques_combined <- rbind(ques_study1[, c("PROLIFIC_PID", "source", "sociodemo_age", "sociodemo_gender", "sociodemo_BMI", "weight_avatar_choice2", tmp_surveysVars)],
      ques_study2_BMI_high[, c("PROLIFIC_PID", "source", "sociodemo_age", "sociodemo_gender",  "sociodemo_BMI","weight_avatar_choice2", tmp_surveysVars)],
      ques_study2_BMI_mixed[, c("PROLIFIC_PID", "source", "sociodemo_age", "sociodemo_gender", "sociodemo_BMI", "weight_avatar_choice2", tmp_surveysVars)])

dim(ques_combined)
[1] 363  77
names(table(ques_combined$PROLIFIC_PID))[table(ques_combined$PROLIFIC_PID) >= 2]
[1] "601129f77e0c21000b0c408a"

6.4.1 reverse code items

Fat Phobia Scale:

psych::cor.plot(r = cor(ques_combined[,str_subset(string = colnames(ques_combined), pattern = "^pair")]))

# --- 1) Item metadata (from your list) ---
items <- tibble::tribble(
  ~scale,   ~left,                ~right,
  "pair01", "lazy",               "industrious",
  "pair02", "no will power",      "has will power",
  "pair03", "attractive",         "unattractive",
  "pair04", "good self-control",  "poor self-control",
  "pair05", "fast",               "slow",
  "pair06", "having endurance",   "having no endurance",
  "pair07", "active",             "inactive",
  "pair08", "weak",               "strong",
  "pair09", "self-indulgent",     "self-sacrificing",
  "pair10", "dislikes food",      "likes food",
  "pair11", "shapeless",          "shapely",
  "pair12", "undereats",          "overeats",
  "pair13", "insecure",           "secure",
  "pair14", "low self-esteem",    "high self-esteem"
)

# --- 2) Define which *right* words are "positive" ---
# Edit this list as needed. Only items whose RIGHT word is in this list will be reversed.
positive_right_terms <- c(
  "industrious",
  "has will power",
  "strong",
  "self-sacrificing",
  "shapely",
  "secure",
  "high self-esteem"
  # NOTE: "likes food" and "overeats" are context-dependent; add/remove if needed.
)

# Items to reverse: those whose RIGHT term is positive
reverse_items <- items$scale[items$right %in% positive_right_terms]

# --- 3) Reverse-code those columns in your data frame df ---

min_val <- 1
max_val <- 5

# Safety check: only reverse variables that actually exist in df
reverse_items_in_df <- intersect(reverse_items, names(ques_combined))

ques_combined[reverse_items_in_df] <- lapply(ques_combined[reverse_items_in_df], function(x) {
  (max_val + min_val) - x
})

# --- 4) (Optional) Inspect what was reversed ---
list(
  reversed_by_right_positive = reverse_items_in_df,
  not_found_in_df = setdiff(reverse_items, reverse_items_in_df)
)
$reversed_by_right_positive
[1] "pair01" "pair02" "pair08" "pair09" "pair11" "pair13" "pair14"

$not_found_in_df
character(0)
psych::cor.plot(r = cor(ques_combined[,str_subset(string = colnames(ques_combined), pattern = "^pair")]))

Subscale Germ Aversion:

psych::cor.plot(r = cor(ques_combined[,str_subset(string = colnames(ques_combined), pattern = "^GermAversion")]))

# with names like GermAversion1, GermAversion5r, GermAversion7r, etc.
min_val <- 1
max_val <- 7

# 1. Identify all GermAversion columns
germ_cols <- str_subset(colnames(ques_combined), pattern = "^GermAversion")

# 2. Find which of these end with "r"
germ_cols_rev <- str_subset(germ_cols, pattern = "r$")

# 3. Reverse code the ones ending in "r"
ques_combined[germ_cols_rev] <- lapply(ques_combined[germ_cols_rev], function(x) {
  (max_val + min_val) - x
})
psych::cor.plot(r = cor(ques_combined[,str_subset(string = colnames(ques_combined), pattern = "^GermAversion")]))

Beliefs About Obese Persons Scale:

psych::cor.plot(r = cor(ques_combined[,str_subset(string = colnames(ques_combined), pattern = "^BeliefsAboutObesePersons")]))

min_val <- 1
max_val <- 6

# 1. Identify all BeliefsAboutObesePersons columns
beliefsAboutObe_cols <- str_subset(colnames(ques_combined), pattern = "^BeliefsAboutObesePersons")

# 2. Find which of these end with "r"
beliefsAboutObe_cols_rev <- str_subset(beliefsAboutObe_cols, pattern = "r$")

# 3. Reverse code the ones ending in "r"
ques_combined[beliefsAboutObe_cols_rev] <- lapply(ques_combined[beliefsAboutObe_cols_rev], function(x) {
  (max_val + min_val) - x
})
psych::cor.plot(r = cor(ques_combined[,str_subset(string = colnames(ques_combined), pattern = "^BeliefsAboutObesePersons")]))

Attitude Towards Obese People Scale:

psych::cor.plot(r = cor(ques_combined[,str_subset(string = colnames(ques_combined), pattern = "^AttitudeTowardsObesePeople")]))

min_val <- 1
max_val <- 6

# 1. Identify all AttitudeTowardsObesePeople columns
attitudeTowObe_cols <- str_subset(colnames(ques_combined), pattern = "^AttitudeTowardsObesePeople")

# 2. Find which of these end with "r"
attitudeTowObe_cols_rev <- str_subset(attitudeTowObe_cols, pattern = "r$")

# 3. Reverse code the ones ending in "r"
ques_combined[attitudeTowObe_cols_rev] <- lapply(ques_combined[attitudeTowObe_cols_rev], function(x) {
  (max_val + min_val) - x
})
psych::cor.plot(r = cor(ques_combined[,str_subset(string = colnames(ques_combined), pattern = "^AttitudeTowardsObesePeople")]))

6.4.2 save questionnaire

setwd("outputs/questionnaire")


# Save RDS
saveRDS(ques_combined, file = "ques_combined.rds")

# Save CSV
write.csv(ques_combined, file = "ques_combined.csv", row.names = FALSE)

# Save XLSX using writexl
writexl::write_xlsx(ques_combined, path = "ques_combined.xlsx")

7 prepare data associations

colnames(dat_study2_BMI_high)[sapply(dat_study2_BMI_high, is.list)]
[1] "url"                      "para_defocuscount"       
[3] "key_assignment"           "unsucsessfulAssociations"
[5] "sucsessfulAssociations"  
colnames(dat_study2_BMI_mixed)[sapply(dat_study2_BMI_mixed, is.list)]
[1] "url"                      "key_assignment"          
[3] "para_defocuscount"        "unsucsessfulAssociations"
[5] "sucsessfulAssociations"  

Function:

# association_list = sucsessfulAssociations_study2_BMI_mixed
# metadata_df = ques_study2_BMI_mixed
process_association_data <- function(association_list, metadata_df) {
  # Initialize output data frame
  output_df <- data.frame(
    participant_id = character(),
    gender = character(),
    age = integer(),
    BMI = numeric(),
    cue = character(),
    response = character(),
    response_position = integer(),
    response_level = integer(),
    timestamp = character(),
    time_diff_sec = numeric(),
    stringsAsFactors = FALSE
  )
  
  # Loop through all participants
  for (i in seq_along(association_list)) {
    
    # Get associations
    assoc <- association_list[[i]]
    assoc_L1 <- assoc[1:5, ]
    assoc_L2 <- assoc[6:nrow(assoc), ]
    
    # Get participant metadata
    meta <- metadata_df[i, ]
    pid <- meta$PROLIFIC_PID
    gender <- meta$sociodemo_gender
    age <- meta$sociodemo_age 
    BMI <- meta$sociodemo_BMI
    avatar <- meta$weight_avatar_choice2
    
    if(nrow(assoc_L1) != 5){
      cat("assoc_L1 not 5 for:", pid, "\n")
    }
    if(nrow(assoc_L2) != 25){
      cat("assoc_L2 not 25 but", nrow(assoc_L2), "for:", pid, "\n")
    }
    
    # --- Level 1: responses ---
    df_level1 <- data.frame(
      participant_id = pid,
      gender = gender,
      age = age,
      BMI = BMI,
      avatar = avatar,
      cue = assoc_L1$cue,
      response = assoc_L1$response,
      response_position = seq_along(assoc_L1$cue),
      response_level = 1,
      timestamp = assoc_L1$timestamp,
      stringsAsFactors = FALSE
    )
    
    # --- Level 2: responses ---
    df_level2 <- data.frame(
      participant_id = pid,
      gender = gender,
      age = age,
      BMI = BMI,
      avatar = avatar,
      cue = assoc_L2$cue,
      response = assoc_L2$response,
      response_position = as.numeric(ave(assoc_L2$cue, assoc_L2$cue, FUN = seq_along)), #rep(1:5, times = 5)
      response_level = 2,
      timestamp = assoc_L2$timestamp,
      stringsAsFactors = FALSE
    )
    
    # Combine both levels
    combined <- rbind(df_level1, df_level2)
    
    # Convert timestamp and calculate time difference from first response
    combined$timestamp <- as.POSIXct(combined$timestamp, format = "%Y-%m-%dT%H:%M:%OSZ", tz = "UTC")
    combined$time_diff_sec <- as.numeric(difftime(combined$timestamp, combined$timestamp[1], units = "secs"))
    
    # Append to final output
    output_df <- rbind(output_df, combined)
  }
  
  return(output_df)
}

7.1 study 2 - BMI high

sucsessfulAssociations_study2_BMI_high <- dat_study2_BMI_high$sucsessfulAssociations[!sapply(dat_study2_BMI_high$sucsessfulAssociations, is.null)]

ass_study2_BMI_high <- process_association_data(
  association_list = sucsessfulAssociations_study2_BMI_high,
  metadata_df = ques_study2_BMI_high
)

dim(ass_study2_BMI_high)
[1] 690  11
setwd("outputs/associations")

# Save RDS
saveRDS(ass_study2_BMI_high, file = "ass_study2_BMI_high.rds")

# Save CSV
write.csv(ass_study2_BMI_high, file = "ass_study2_BMI_high.csv", row.names = FALSE)

# Save XLSX using writexl
writexl::write_xlsx(ass_study2_BMI_high, path = "ass_study2_BMI_high.xlsx")

7.2 study 2 - BMI mixed

sucsessfulAssociations_study2_BMI_mixed <- dat_study2_BMI_mixed$sucsessfulAssociations[!sapply(dat_study2_BMI_mixed$sucsessfulAssociations, is.null)]

ass_study2_BMI_mixed <- process_association_data(
  association_list = sucsessfulAssociations_study2_BMI_mixed,
  metadata_df = ques_study2_BMI_mixed
)
assoc_L2 not 25 but 23 for: 673b8b36a23f091c707518ae 
assoc_L2 not 25 but 24 for: 6664dfc807e75330a8b54a8b 
assoc_L2 not 25 but 24 for: 67bcdccadec7e4c73404a0ad 
assoc_L2 not 25 but 24 for: 64f3036a5b5f2feae0b1f51b 
assoc_L2 not 25 but 19 for: 65feaaac53eb219f09ad5ea0 
assoc_L2 not 25 but 24 for: 5bb0cc738f3bd70001e513e3 
assoc_L2 not 25 but 24 for: 67edb2c2157a46a0dcb9c5e3 
assoc_L2 not 25 but 24 for: 667c744d940ae66df47ab873 
assoc_L2 not 25 but 24 for: 634688daf4776ef459edf673 
assoc_L2 not 25 but 22 for: 678a406859e436e14f1b1deb 
assoc_L2 not 25 but 24 for: 6165d25922529904e5d7267d 
assoc_L2 not 25 but 24 for: 6776815763e82f23fe4bcd40 
dim(ass_study2_BMI_mixed)
[1] 2980   11
setwd("outputs/associations")

# Save RDS
saveRDS(ass_study2_BMI_mixed, file = "ass_study2_BMI_mixed.rds")

# Save CSV
write.csv(ass_study2_BMI_mixed, file = "ass_study2_BMI_mixed.csv", row.names = FALSE)

# Save XLSX using writexl
writexl::write_xlsx(ass_study2_BMI_mixed, path = "ass_study2_BMI_mixed.xlsx")

7.3 combine associations

ass_combined <- rbind(ass_study2_BMI_high, ass_study2_BMI_mixed)

dim(ass_combined)
[1] 3670   11
names(table(ass_combined$participant_id))[table(ass_combined$participant_id) != 30]
 [1] "5bb0cc738f3bd70001e513e3" "6165d25922529904e5d7267d"
 [3] "634688daf4776ef459edf673" "64f3036a5b5f2feae0b1f51b"
 [5] "65feaaac53eb219f09ad5ea0" "6664dfc807e75330a8b54a8b"
 [7] "667c744d940ae66df47ab873" "673b8b36a23f091c707518ae"
 [9] "6776815763e82f23fe4bcd40" "678a406859e436e14f1b1deb"
[11] "67bcdccadec7e4c73404a0ad" "67edb2c2157a46a0dcb9c5e3"
setwd("outputs/associations")


# Save RDS
saveRDS(ass_combined, file = "ass_combined.rds")

# Save CSV
write.csv(ass_combined, file = "ass_combined.csv", row.names = FALSE)

# Save XLSX using writexl
writexl::write_xlsx(ass_combined, path = "ass_combined.xlsx")