EM6_Preprocessing_performance

First, load libraries!

# Load libraries
library(lme4)

## Loading required package: Matrix

library(lmerTest)

## 
## Attaching package: 'lmerTest'

## The following object is masked from 'package:lme4':
## 
##     lmer

## The following object is masked from 'package:stats':
## 
##     step

library(emmeans)
library(effectsize)
library(tidyverse)

## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr     1.0.10     ✔ readr     2.1.4 
## ✔ forcats   1.0.0      ✔ stringr   1.5.0 
## ✔ ggplot2   3.4.2      ✔ tibble    3.1.8 
## ✔ lubridate 1.9.2      ✔ tidyr     1.2.1 
## ✔ purrr     0.3.4

## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ tidyr::expand() masks Matrix::expand()
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
## ✖ tidyr::pack()   masks Matrix::pack()
## ✖ tidyr::unpack() masks Matrix::unpack()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors

library(ggpubr)

Now, initialize data frames!

# Initialize data frames
# List every csv file in the working directory. The dot is escaped so that only
# a literal ".csv" extension matches (an unescaped "." matches any character).
allFiles <- list.files(pattern = "\\.csv$")
# Empty accumulators that the extraction loop appends to, one participant file
# at a time. Plain `<-` is used because this is top-level code; `<<-` is only
# meant for assigning into an enclosing scope from inside a function.
all_Real_Data <- data.frame()
all_Clicked <- data.frame()
all_Correct_Clicked <- data.frame()

We need to remove these columns. If the experimenter forgets to press “q” at the end of the experiment and presses “esc” instead (I did this a few times), the experiment still ends normally, BUT these columns are not logged into the csv. The result is a discrepancy where some csv files have these columns and others don’t, which causes a problem when the files are combined. So we just remove them. They are not needed for the analysis anyway!

# Columns that are dropped from every csv before processing, so that all files
# end up with the same set of columns. Assigned with `<-` because this is
# top-level code (`<<-` is for assigning into an enclosing scope).
columns_to_remove <- c(
  "goRightMouse.clicked_name", "endExp.keys", "endExp.rt",
  "endExp.started", "endExp.stopped", "expFeedbackText.started",
  "expFeedbackText.stopped"
)

Ginormous for-loop

This is one of the big meats of the preprocessing stage. Pretty much each step is commented. TLDR: it extracts the relevant columns from the csv files and cleans up the names a bit for readability.

# Strip square brackets, spaces and single quotes from a logged list string and
# split it on commas. Returns the one-element list produced by str_split().
parse_name_list <- function(x) {
  x %>%
    gsub("\\[|\\]", "", .) %>%
    gsub(" ", "", .) %>%
    gsub("\'", "", .) %>%
    str_split(",")
}

# Strip square brackets from a logged list of click times, split it on commas
# and convert the pieces to numbers. Returns a one-element list holding the
# numeric vector; an empty entry becomes NA.
parse_time_list <- function(x) {
  x %>%
    gsub("\\[|\\]", "", .) %>%
    str_split(",") %>%
    unlist() %>%
    as.numeric() %>%
    list()
}

# Extract data files
# seq_along() is used instead of 1:length() so that the loop body is skipped
# (rather than run with indices 1 and 0) when no csv files were found.
for (d in seq_along(allFiles)) {
  all_Data <- read.csv(allFiles[d], header = TRUE, fileEncoding = "UTF-8-BOM")
  all_Data <- all_Data[, !(colnames(all_Data) %in% columns_to_remove)]
  # Remove the .csv extension for readability. fixed() makes the "." a literal
  # dot; as a regular expression it would match any character.
  all_Data$condsFile <- all_Data$condsFile %>% str_replace(fixed(".csv"), "")
  real_Data <- all_Data %>%
    # Filter out the practice trials
    filter(realOrNot == 1) %>%
    # Filter out the introduction and instruction screens
    filter(condsFile != "mixed") %>%
    filter(trialFeedBackPress.keys != "q") %>%
    # Select the relevant columns
    select(
      "participant", "condsFile", "delayDuration", "order",
      "multipleTrials.thisN", "oneTrial.thisN", "templateTime",
      "templateFrame", "nCorrect", "nIncorrect",
      "clickedImages", "clickedImagesTime",
      "correctClickedsName", "correctClickedsTime",
      "correctImages", "curCorrectImages",
      "workingTime", "workingFrame", "closing"
    ) %>%
    # Rename for readability
    rename(
      trial = multipleTrials.thisN, trip = oneTrial.thisN,
      listTime = templateTime, listFrame = templateFrame,
      shelfTime = workingTime, shelfFrame = workingFrame,
      remainCorAfterTrip = curCorrectImages,
      numCorrect = nCorrect,
      numIncorrect = nIncorrect
    )

  # A file without any experimental rows has nothing to contribute; skip it
  # with a warning instead of failing further down.
  if (nrow(real_Data) == 0) {
    warning(
      "No experimental trials found in ", allFiles[d], "; skipping this file.",
      call. = FALSE
    )
    next
  }

  # Increment trip and trial index by 1 because the starting point is 0
  real_Data$trial <- real_Data$trial + 1
  real_Data$trip <- real_Data$trip + 1
  # Initialize the per-trip count columns; they are filled in row by row in the
  # loop below (nCurIncorrect is computed after the loop)
  real_Data[, c("nCurCorrect", "nCurIncorrect", "nCurClickeds",
                "nRemainCorrect")] <- NA

  # Data frames that collect, for this participant file, one row per click
  # (curParClicked) and one row per correct click (curParCorrectClicked)
  curParClicked <- data.frame()
  curParCorrectClicked <- data.frame()

  for (n in seq_len(nrow(real_Data))) {
    # current_Info holds the trip-level information for the current row; it is
    # repeated next to every click of that trip
    current_Info <- real_Data[n, c("participant", "condsFile", "delayDuration",
                                   "order", "trial", "trip", "listTime",
                                   "listFrame", "numCorrect", "numIncorrect",
                                   "shelfTime", "shelfFrame", "correctImages")]
    # Count the entries in the list of correct selections that are left.
    # NOTE(review): an empty list string becomes a single empty element after
    # splitting, so this count is 1 (not 0) in that case - confirm that this is
    # intended.
    curCorRemain <- parse_name_list(real_Data$remainCorAfterTrip[n]) %>%
      unlist() %>%
      length()

    # All attempts: names and times of everything that was clicked.
    # setNames() names the list element, which becomes the column name.
    curClickedName <- parse_name_list(real_Data$clickedImages[n]) %>%
      setNames("clickName")
    curClickedTime <- parse_time_list(real_Data$clickedImagesTime[n]) %>%
      setNames("clickTime")
    # Combine the trip info with the names of what was clicked and the times
    curClicked <- cbind(current_Info, curClickedName, row.names = NULL) %>%
      cbind(curClickedTime, row.names = NULL)

    # Correct attempts only: names and times of the correct clicks
    curCorClickedName <- parse_name_list(real_Data$correctClickedsName[n]) %>%
      setNames("corClickName")
    curCorClickedTime <- parse_time_list(real_Data$correctClickedsTime[n]) %>%
      setNames("corClickTime")
    curCorClicked <- cbind(current_Info, curCorClickedName, row.names = NULL) %>%
      cbind(curCorClickedTime, row.names = NULL)

    # nCurCorrect is the count of correct clicks and nCurClickeds the count of
    # all clicks; NA times (no click logged) are not counted
    nCorrectClicks <- length(na.omit(curCorClickedTime$corClickTime))
    nClicks <- length(na.omit(curClickedTime$clickTime))
    real_Data[n, c("nCurCorrect", "nCurClickeds")] <- c(nCorrectClicks, nClicks)
    # nRemainCorrect is the number of remaining correct items plus the number
    # of correct clicks made on this trip
    real_Data[n, "nRemainCorrect"] <- curCorRemain + nCorrectClicks

    # Information about what a participant clicked goes in curParClicked, and
    # info about what they clicked correctly goes in curParCorrectClicked
    curParClicked <- rbind(curParClicked, curClicked)
    curParCorrectClicked <- rbind(curParCorrectClicked, curCorClicked)
  }

  # Incorrect clicks are the number of clicks minus the number of correct
  # clicks
  real_Data$nCurIncorrect <- real_Data$nCurClickeds - real_Data$nCurCorrect
  # Add the current file's data to the mother datasets
  all_Real_Data <- rbind(all_Real_Data, real_Data)
  all_Clicked <- rbind(all_Clicked, curParClicked)
  all_Correct_Clicked <- rbind(all_Correct_Clicked, curParCorrectClicked)
}

Calculating performance

Next are some managerial bits where we initialize the data frames that will store the info about performance.

# Vectors with the distinct participant IDs and the distinct condition names
# found in the click-level data
participants <- unique(all_Clicked[["participant"]])
cond <- unique(all_Clicked[["condsFile"]])
# Empty data frame that will receive the annotated clicks of every trip
newAllclicked <- data.frame()
# Add placeholder (NA) columns to the click-level data; they are filled with
# the performance information afterwards
all_Clicked[c("clickAccuracy", "remainCorImage", "nRemainCor",
              "remainOptions")] <- NA

Here is another big for-loop, where we get the items the participant clicked on in each trip, and which were correct.

# This for-loop iterates over each participant and calculates their
# performance click by click. Finished trips are collected in a list and bound
# together once at the end, instead of growing a data frame inside the
# innermost loop. seq_along()/seq_len() keep the loops from running on empty
# input, and the loop indices no longer reuse the names of base functions
# (c, t).
trip_results <- list()
for (p in seq_along(participants)) {
  # In the for-loop, we are going to look at one participant at a time
  curParClicked <- all_Clicked %>% filter(participant == participants[p])
  # And we are going to look at one condition at a time (only the conditions
  # that this participant actually has data for)
  cond <- unique(curParClicked$condsFile)
  for (ci in seq_along(cond)) {
    curCond <- curParClicked %>% filter(condsFile == cond[ci])
    # Set trials to be the unique values in the "trial" column for this
    # condition
    trials <- unique(curCond$trial)
    # Now we are going to look at one trial at a time
    for (ti in seq_along(trials)) {
      curTrial <- curCond %>% filter(trial == trials[ti])
      # Clean the first value in the correctImages column and split it into a
      # character vector. This vector is set once per trial and shrinks as
      # correct images are clicked, so it carries over from trip to trip.
      curCorrectIm <- curTrial$correctImages[1] %>%
        gsub("\\[|\\]", "", .) %>%
        gsub(" ", "", .) %>%
        gsub("\'", "", .) %>%
        str_split(",") %>%
        unlist()
      # Set trips to be the unique values in the "trip" column for this trial
      trips <- unique(curTrial$trip)
      # Now we are going to look at one trip at a time (within the trial,
      # within the condition, within the participant)
      for (tr in seq_along(trips)) {
        curTrip <- curTrial %>% filter(trip == trips[tr])
        # The remaining options for any given trip, at the beginning, is
        # always 10 (10 pairs of 2-AFC)
        nRemainOption <- 10
        # Go through the clicks of this trip in order
        for (q in seq_len(nrow(curTrip))) {
          # Images that are still correct before this click, comma-separated
          curTrip$remainCorImage[q] <- paste(curCorrectIm, collapse = ",")
          # nRemainCor is the count of how many correct options are left
          curTrip$nRemainCor[q] <- length(curCorrectIm)
          # remainOptions is how many clickable images are left
          curTrip$remainOptions[q] <- nRemainOption
          # clickAccuracy is whether the clicked image is correct (1) or not (0)
          curTrip$clickAccuracy[q] <-
            as.numeric(curTrip$clickName[q] %in% curCorrectIm)
          # One option is used up per logged click; a row with an NA click time
          # (no click) does not reduce the count
          nRemainOption <- nRemainOption - length(na.omit(curTrip$clickTime[q]))
          # And the correct options that are left are the correct options not
          # yet clicked on
          curCorrectIm <- curCorrectIm[!curCorrectIm %in% curTrip$clickName[q]]
        }
        # Store the annotated trip
        trip_results[[length(trip_results) + 1L]] <- curTrip
      }
    }
  }
}
# newAllclicked holds the information about each trip
newAllclicked <- rbind(newAllclicked, do.call(rbind, trip_results))

Now we are going to initialize another data frame. This one will store everything the participant clicked on before they clicked on something incorrect… so, it’s the streak correct!

# Empty data frame that the streak loop below appends to
allClickedBe4Incor <- data.frame()

This is the for-loop that will help us calculate streak correct. The comment towards the end about removing duplicates is because participants 1 and 2 are missing the no_cost condition, so if we don’t remove duplicates, those participants will have duplicate rows for their first blocks because the code wants everyone to end up with the same number of rows, if that makes sense.

# For every trip, keep the clicks that came before the first incorrect click.
# The kept rows are collected in a list and bound together once after the loop.
kept_clicks <- list()
for (p in seq_along(participants)) {
  # In the for-loop, we are going to look at one participant at a time, one
  # condition at a time, one trial and then one trip at a time
  curParClicked <- newAllclicked %>% filter(participant == participants[p])
  # Only the conditions that this participant actually has data for
  cond <- unique(curParClicked$condsFile)
  for (ci in seq_along(cond)) {
    curCond <- curParClicked %>% filter(condsFile == cond[ci])
    trials <- unique(curCond$trial)
    for (ti in seq_along(trials)) {
      curTrial <- curCond %>% filter(trial == trials[ti])
      trips <- unique(curTrial$trip)
      # Now we go within the trip
      for (tr in seq_along(trips)) {
        curTrip <- curTrial %>% filter(trip == trips[tr])
        # correctIndex is the row of the first click whose accuracy is zero,
        # i.e. the first incorrect click; it is NA when there is none
        correctIndex <- which(curTrip$clickAccuracy == 0)[1]
        if (is.na(correctIndex)) {
          # No incorrect click in this trip, so the streak is never broken:
          # keep every row
          validTripClick <- curTrip
        } else if (correctIndex == 1) {
          # The very first row is already incorrect. Keep that single row so
          # the trip is still represented; its clickAccuracy of 0 adds nothing
          # when the accuracies are summed.
          validTripClick <- curTrip[1, ]
        } else {
          # Keep the rows up to (but not including) the first incorrect click.
          # seq_len() spells this out; `1:correctIndex-1` is parsed as
          # `(1:correctIndex) - 1` and only worked because index 0 is ignored.
          validTripClick <- curTrip[seq_len(correctIndex - 1), ]
        }
        kept_clicks[[length(kept_clicks) + 1L]] <- validTripClick
      }
    }
  }
}
# Combine it into a dataframe and remove exact duplicate rows (the condsFile
# mismatch was producing duplicates for Ss 1 and 2). Running distinct() once on
# the combined data gives the same rows as running it after every participant.
allClickedBe4Incor <- rbind(allClickedBe4Incor, do.call(rbind, kept_clicks)) %>%
  distinct()

This part just sums up the streak corrects for us.

# Get the sum of the streak correct for each trip: one row per
# participant/condition/trial/trip, where nCurCorBe4Inc is the number of
# correct clicks made before the first incorrect one.
# TRUE is spelled out because T is an ordinary variable that can be reassigned.
allClickedBe4IncorSum <- allClickedBe4Incor %>%
  group_by(participant, condsFile, trial, trip) %>%
  summarize(nCurCorBe4Inc = sum(clickAccuracy, na.rm = TRUE)) %>%
  ungroup()

## `summarise()` has grouped output by 'participant', 'condsFile', 'trial'. You
## can override using the `.groups` argument.

This part combines streak correct data with the OG data.

# Combine with the mother dataset. The join keys are named explicitly (they are
# the columns the two data frames share) so that a column added to either side
# later cannot silently change the join. merge() keeps only rows with a match
# in both data frames.
all_Real_Data <- merge(
  all_Real_Data, allClickedBe4IncorSum,
  by = c("participant", "condsFile", "trial", "trip")
) %>%
  # Put the streak count right next to the count of correct clicks
  relocate(nCurCorBe4Inc, .after = nCurCorrect)

Calculate Memory Usage!

Yay!

# Calculate Memory Usage
# MU = (((numCorrect/numAttempted)-0.5)/0.5)*numAttempted
# The formula only combines values from the same row, so no grouping is needed;
# leaving it out also avoids handing a still-grouped data frame to later steps.
# Rows with zero clicks give 0/0 = NaN here.
all_Real_Data <- all_Real_Data %>%
  mutate(MU = (((nCurCorrect / nCurClickeds) - 0.5) / 0.5) * nCurClickeds)

When participants don’t click anything, the above calculation sets MU to NaN, so we make sure it’s actually 0 (because they used zero memory…)

# A trip without any clicks produces NaN in MU (zero divided by zero); such
# trips used zero memory, so set those entries to 0
all_Real_Data$MU <- replace(all_Real_Data$MU, is.nan(all_Real_Data$MU), 0)

Now we make sure that every trial (for every participant and condition) has a first trip. We want the below code to return an empty dataset, and it should.

# Sanity check: list every participant/condition/trial combination that has no
# row with trip == 1
unique_combinations_no_trip_1 <-
  summarise(
    group_by(all_Real_Data, participant, condsFile, trial),
    has_trip_1 = any(trip == 1),
    .groups = "drop"
  ) %>%
  filter(!has_trip_1) %>%
  select(-has_trip_1)
# The result is expected to be an empty dataset

Final data summarization!

# Summarize every trial (one row per participant/condition/order/trial) across
# all of its trips: number of trips, list and shelf viewing times, and the
# totals, per-trip means and maxima of correct, incorrect and overall clicks,
# plus the mean streak correct and the mean memory usage.
# TRUE is spelled out because T is an ordinary variable that can be reassigned.
Summarized_data <- all_Real_Data %>%
  group_by(participant, condsFile, order, trial) %>%
  summarise(
    nTrip = n(),
    totalListTime = sum(listTime, na.rm = TRUE),
    meanListTime = mean(listTime, na.rm = TRUE),
    totalShelfTime = sum(shelfTime, na.rm = TRUE),
    meanShelfTime = mean(shelfTime, na.rm = TRUE),
    totalTime = totalListTime + totalShelfTime,
    totalCorrect = sum(nCurCorrect),
    meanCorrect = totalCorrect / nTrip,
    meanCorBefInc = sum(nCurCorBe4Inc) / nTrip,
    totalInCorrect = sum(nCurIncorrect),
    meanIncorrect = totalInCorrect / nTrip,
    maxCorrect = max(nCurCorrect),
    maxIncorrect = max(nCurIncorrect),
    totalClickeds = sum(nCurClickeds),
    meanclickeds = totalClickeds / nTrip,
    meanMU = mean(MU, na.rm = TRUE)
  )

## `summarise()` has grouped output by 'participant', 'condsFile', 'order'. You
## can override using the `.groups` argument.

And again for first trips only…

# Do the same but for first trips only
first_trip_only <- all_Real_Data %>%
  filter(trip == 1)

# Same summary columns as for the full data, computed on the first trip of each
# trial only (one row per participant/condition/order/trial).
# TRUE is spelled out because T is an ordinary variable that can be reassigned.
Summarized_first_trips <- first_trip_only %>%
  group_by(participant, condsFile, order, trial) %>%
  summarise(
    nTrip = n(),
    totalListTime = sum(listTime, na.rm = TRUE),
    meanListTime = mean(listTime, na.rm = TRUE),
    totalShelfTime = sum(shelfTime, na.rm = TRUE),
    meanShelfTime = mean(shelfTime, na.rm = TRUE),
    totalTime = totalListTime + totalShelfTime,
    totalCorrect = sum(nCurCorrect),
    meanCorrect = totalCorrect / nTrip,
    meanCorBefInc = sum(nCurCorBe4Inc) / nTrip,
    totalInCorrect = sum(nCurIncorrect),
    meanIncorrect = totalInCorrect / nTrip,
    maxCorrect = max(nCurCorrect),
    maxIncorrect = max(nCurIncorrect),
    totalClickeds = sum(nCurClickeds),
    meanclickeds = totalClickeds / nTrip,
    meanMU = mean(MU, na.rm = TRUE)
  )

## `summarise()` has grouped output by 'participant', 'condsFile', 'order'. You
## can override using the `.groups` argument.

Save as csv!

#write.csv(Summarized_data,"Summarized_data_EM6_FALLpilot.csv",row.names = FALSE)
#write.csv(Summarized_first_trips,"Summarized_first_trips_EM6_FALLpilot.csv",row.names = FALSE)

EM6_Preprocessing_performance

Candice Koolhaas

2024-11-09

Ginormous for-loop

Calculating performance

Calculate Memory Usage!

Final data summarization!