EM6_Eyetracking_preprocessing

Extracting the eyetracking data from hdf5 file

The code below extracts the relevant data from each hdf5 file into ordinary data frames. It also begins aligning the timing of the eye-tracking samples with the experiment events.

# For each hdf5 file: read the eye-tracking samples and the experiment
# messages, label every sample with the stage it falls in, snap the samples
# onto a 60 Hz frame grid, insert placeholder rows for skipped frames, and
# append the results to rawET, allMissFrame and selectET.
# NOTE(review): rawET, allMissFrame and selectET must already exist before
# this loop runs (their initialisation is not visible here) -- confirm.
for (f in seq_along(hdfFilesList)) {
  # Open the file read-only. The ID is whatever precedes the first
  # underscore in the file name.
  df <- H5File$new(hdfFilesList[f], mode = "r")
  id <- strsplit(hdfFilesList[f], "_")[[1]][1]

  # H5 files are hierarchical (nested groups, a bit like folders).
  # Go to the dataset that stores the eye-tracking samples and read it
  # fully into memory with [].
  et <- df[["data_collection/events/eyetracker/BinocularEyeSampleEvent"]]
  et <- et[]
  # NOTE(review): tt is not used anywhere in this loop; kept in case later
  # code relies on it.
  tt <- data.frame(et)
  # MessageEvent stores the events from the experiment itself
  msg <- df[["data_collection/events/experiment/MessageEvent"]]
  msg <- msg[]
  # Everything needed is now in memory, so release the file handle (and
  # any dataset handles opened from it) instead of leaking one per file.
  df$close_all()

  # Initialise the columns that will hold the start/end time and duration
  msg$tStart <- NA
  msg$tEnd <- NA
  msg$tDur <- NA

  # Initialise the columns that will hold info from the experiment
  msg$stage <- NA
  msg$condsFile <- NA
  msg$realOrNot <- NA
  msg$trial <- NA
  msg$trip <- NA
  et$stage <- NA
  et$condsFile <- NA
  et$realOrNot <- NA
  et$trial <- NA
  et$trip <- NA
  et$participant <- id

  # The "text" column says where each message row comes from: its first
  # five space-separated tokens are stage, condsFile, realOrNot, trial, trip.
  # For rows whose text contains "Start ", the section starts at that row's
  # time and ends at the time of the next message row.
  for (l in seq_len(nrow(msg))) {
    msg[l, c("stage", "condsFile", "realOrNot", "trial", "trip")] <-
      strsplit(msg$text[l], split = " ")[[1]][c(1, 2, 3, 4, 5)]
    if (grepl("Start ", msg$text[l])) {
      msg$tStart[l] <- msg$time[l]
      msg$tEnd[l] <- msg$time[l + 1]
    }
  }
  msg$tDur <- msg$tEnd - msg$tStart

  # Keep only the rows where a stage starts (as opposed to ends)
  msg <- subset(msg, grepl("Start ", msg$text))

  # Label the eye-tracking samples: every sample whose time lies between a
  # message's start and end gets that message's stage, condition, etc.
  for (i in seq_len(nrow(msg))) {
    filt <- between(et$time, msg$tStart[i], msg$tEnd[i])
    et$stage[filt] <- msg$stage[i]
    et$condsFile[filt] <- msg$condsFile[i]
    et$realOrNot[filt] <- msg$realOrNot[i]
    et$trial[filt] <- msg$trial[i]
    et$trip[filt] <- msg$trip[i]
  }

  # Replace all the NaN with NA
  et <- et %>% mutate(across(everything(), ~ ifelse(is.nan(.x), NA, .x)))

  # rawET accumulates the labelled, unfiltered samples of every file
  rawET <- rbind(rawET, et)

  # Drop unneeded columns (by position) and add d_time: device_time
  # relative to its first value, divided by 1000, so it starts from zero.
  # NOTE(review): the original comment said d_time is in seconds, but it is
  # matched below against calTime, a grid in ms -- confirm device_time units.
  #
  # Important (original author's note): where the frame timing is lined up,
  # the maintenance phase AND the list phase end up with NA values in
  # d_time, so that e.g. the 1 second and 4 second maintenance phases have
  # the same number of rows, with the difference made up by NA rows. When
  # calculating study time or plotting anything regarding the list, only
  # use rows where d_time is not NA.
  etSelect <- et %>%
    select(-c(1, 2, 3, 5, 7, 8, 9, 10, 11, 14, 18, 19, 20, 21, 24, 25, 26,
              27, 28, 29, 30, 33, 37, 38, 39, 40, 43, 44, 45,
              46, 47, 48, 49)) %>%
    mutate(d_time = (device_time - device_time[1]) / 1000)

  # Remap the time to the supposed frame time ----
  # calFrame counts frames at 60 per second, up to the last d_time value;
  # calTime (starting from zero) is the time in ms of each frame:
  # 0, 16.67, 33.33, etc.
  newFrame <- data.frame(
    calFrame = seq(1:ceiling(etSelect$d_time[length(etSelect$d_time)] / (1000 / 60)))
  ) %>%
    mutate(calTime = (calFrame - 1) * (1000 / 60))

  # For every sample, find the index of the frame whose "clean" calTime is
  # closest to the sample's d_time
  closestL <- sapply(etSelect$d_time,
                     function(oldStamps, newStamps) {
                       which.min(abs(oldStamps - newStamps))
                     },
                     newFrame$calTime)

  # Give each sample the calFrame / calTime of its closest frame
  etSelect[, c("calFrame", "calTime")] <-
    newFrame[closestL, c("calFrame", "calTime")]

  # Put in the missing frame data and reorganise columns ----
  # nmissFrame is the number of frames skipped between a row and the row
  # before it; keep only the rows where at least one frame was skipped
  missFrame <- etSelect %>%
    mutate(nmissFrame = calFrame - lag(calFrame) - 1) %>%
    filter(nmissFrame > 0)

  # Log where frames went missing (participant / experiment info + count)
  allMissFrame <- missFrame %>%
    select("participant", "realOrNot", "condsFile", "trial",
           "trip", "stage", "nmissFrame") %>%
    rbind(allMissFrame, .)

  # Repeat each row once per missing frame, then renumber the copies so
  # they fill the gap: calFrame runs from (original - nmissFrame) up to
  # (original - 1), and calTime is recomputed from calFrame
  missFrame <- missFrame %>%
    lapply(rep, .$nmissFrame) %>%
    as.data.frame() %>%
    group_by(event_id) %>%
    mutate(
      counter = row_number(event_id),
      calFrame = calFrame - nmissFrame - 1 + counter,
      calTime = (calFrame - 1) * (1000 / 60)
    ) %>%
    select(-c("nmissFrame", "counter"))

  # Blank out every column except the labels and the frame counters, so the
  # filler rows carry no eye-tracking "data"
  missFrame[, !names(missFrame) %in% c("stage", "condsFile", "realOrNot",
                                       "trial", "trip", "participant",
                                       "calFrame", "calTime")] <- NA

  # Add the filler rows to etSelect, sort by frame, and move the label and
  # timing columns in front of event_id
  etSelect <- etSelect %>%
    rbind(missFrame) %>%
    arrange(calFrame) %>%
    relocate(c(participant, realOrNot, condsFile, trial,
               trip, stage, d_time, calFrame, calTime),
             .before = event_id)

  # selectET accumulates the frame-aligned data of every file
  selectET <- rbind(selectET, etSelect)
}

The code below improves readability and fixes the numbering of trials and trips, which start at 0 in the raw data but need to start at 1.

# Housekeeping on the combined data:
# - sort by participant, then by frame
# - remove ".csv" from the condsFile names; fixed() makes the "." a literal
#   dot rather than a regex wildcard that matches any character
# - remove the uninformative "Start" string from stage
# - trials begin at 0, so add 1
# - trips get +1 as well, except in the "listTrip" stage where they get +2
#   NOTE(review): the reason for the extra +1 on listTrip is not stated
#   here -- confirm
# - location is an index of the row number
selectET <- selectET %>%
  arrange(participant, calFrame) %>%
  mutate(
    condsFile = str_replace(condsFile, fixed(".csv"), ""),
    stage = str_replace(stage, "Start", ""),
    trial = as.numeric(trial) + 1,
    trip = ifelse(stage == "listTrip",
                  as.numeric(trip) + 2,
                  as.numeric(trip) + 1),
    location = row_number()
  )

Below sets the row count for each stage, trial, and participant.

# One row per participant / real-vs-practice / condsFile / trial / trip /
# stage, holding:
#   n                    number of rows in that stage
#   startPoint, endPoint first and last calFrame of the stage
#   startLoc, endLoc     first and last row index (location) of the stage
stagePoint <- selectET %>%
  group_by(participant, realOrNot, condsFile, trial, trip, stage) %>%
  # .groups = "drop" returns an ungrouped result explicitly instead of
  # relying on summarise()'s default of peeling off one grouping level
  summarise(
    n = n(),
    startPoint = min(calFrame),
    endPoint = max(calFrame),
    startLoc = min(location),
    endLoc = max(location),
    .groups = "drop"
  ) %>%
  # Remove rows with an NA in any column
  na.omit() %>%
  arrange(participant, startPoint) %>%
  # overLocation is a running index over all rows (all participants), in
  # participant / startPoint order
  mutate(overLocation = row_number()) %>%
  # trialLocation restarts at 1 within each participant's trial; the rows
  # are already in participant / startPoint order at this point
  group_by(participant, condsFile, trial) %>%
  mutate(trialLocation = row_number()) %>%
  ungroup()

## `summarise()` has grouped output by 'participant', 'realOrNot', 'condsFile',
## 'trial', 'trip'. You can override using the `.groups` argument.

Begin aligning the frames by getting the maximum number of frames for each stage

# For each stage (shelf, list, shelfBack, delay, etc.), the largest row
# count n that any single occurrence of that stage has in stagePoint
stageMaxFrame <- summarize(
  group_by(stagePoint, stage),
  maxFrame = max(n)
)

This is making sure that each stage has the same number of frames. This is for lining things up temporally so that the frames perfectly align if you “stack” the stage events on top of each other, but keep in mind that extra padding is added to the end of each stage so that they are all the same “length”.

# Start calibrating the missing frames with the stages.
# Step 1: attach each stage's maximum frame count to stagePoint
stagePoint <- full_join(stagePoint, stageMaxFrame, by = "stage")
# Step 2: needFrame is how many rows a stage occurrence is short of that
# maximum; then sort by participant and starting row
stagePoint <- arrange(
  mutate(stagePoint, needFrame = maxFrame - n),
  participant, startLoc
)
# Step 3: keep only the identifying columns plus needFrame
stagePoint <- select(
  stagePoint,
  all_of(c("participant", "condsFile", "trial", "trip", "stage", "needFrame"))
)

Continuing from last step…

# Build the padding rows that bring every stage occurrence up to the frame
# count of the longest occurrence of that stage.
# This leads to a lot of blank rows, because stages are not equally long
# across participants and conditions. Keep that in mind when plotting!
stageLastRow <- selectET %>%
  filter(!is.na(stage)) %>%
  group_by(participant, condsFile, trial, trip, stage) %>%
  # Take the last row of each stage occurrence as the template for padding
  slice_tail(n = 1) %>%
  # Explicit keys: without `by`, the join would silently use every column
  # the two tables happen to share
  inner_join(stagePoint,
             by = c("participant", "condsFile", "trial", "trip", "stage")) %>%
  arrange(participant, calFrame) %>%
  # Repeat each template row needFrame times (rows needing 0 disappear)
  lapply(rep, .$needFrame) %>%
  data.frame() %>%
  group_by(participant, condsFile, trial, trip, stage) %>%
  # Give the copies fractional locations just after the template row, in
  # steps of 1 / (needFrame + 1), so they sort after it and before the
  # next whole-number location
  mutate(location = location + (1 / (needFrame + 1)) * row_number()) %>%
  select(-"needFrame")

## Joining, by = c("participant", "condsFile", "trial", "trip", "stage")

This part is adding padding to make the stages have equal lengths

# The padding rows must not carry any "data": blank every column except
# the labels and the location
stageLastRow[, setdiff(names(stageLastRow),
                       c("stage", "condsFile", "realOrNot",
                         "trial", "trip", "participant", "location"))] <- NA

# Add the padding rows to selectET and build per-trip frame counters
selectET <- selectET %>%
  rbind(stageLastRow) %>%
  arrange(participant, location) %>%
  # adjustTrip is the trip number for the list, listTrip, shelf and
  # shelfBack stages, and NA for every other stage
  mutate(
    adjustTrip = ifelse(
      stage %in% c("list", "shelfBack", "shelf", "listTrip"),
      trip,
      NA
    )
  ) %>%
  group_by(participant, condsFile, trial, adjustTrip) %>%
  # tripCalFrame counts rows within each group from 1; tripCalTime is the
  # matching time on the 1000/60 ms grid, starting from zero
  mutate(
    tripCalFrame = row_number(),
    tripCalTime = (tripCalFrame - 1) * (1000 / 60)
  ) %>%
  relocate(adjustTrip, .after = trip) %>%
  relocate(c(tripCalFrame, tripCalTime), .after = calTime)

We will make a couple of datasets that hold basic summary info about the number of trials and trips.

# A small lookup table with one row per real (realOrNot == 1) trial that
# has a condsFile. totalTrialNum numbers each participant's trials in the
# order of their first row (smallest location).
totalTrial <- selectET %>%
  group_by(participant, realOrNot, condsFile, trial) %>%
  summarize(startpoint = min(location)) %>%
  filter(realOrNot == 1, !is.na(condsFile)) %>%
  arrange(startpoint) %>%
  group_by(participant) %>%
  mutate(totalTrialNum = row_number()) %>%
  select(-startpoint)

## `summarise()` has grouped output by 'participant', 'realOrNot', 'condsFile'.
## You can override using the `.groups` argument.

# The same lookup table at trip level: one row per real (realOrNot == 1)
# trip that has a condsFile. totalTripNum numbers each participant's trips
# in the order of their first row (smallest location).
totalTrip <- selectET %>%
  group_by(participant, realOrNot, condsFile, trial, trip) %>%
  summarize(startpoint = min(location)) %>%
  filter(realOrNot == 1, !is.na(condsFile)) %>%
  arrange(startpoint) %>%
  group_by(participant) %>%
  mutate(totalTripNum = row_number()) %>%
  select(-startpoint)

## `summarise()` has grouped output by 'participant', 'realOrNot', 'condsFile',
## 'trial'. You can override using the `.groups` argument.

Combine into one dataset, and finally write the csv that will hold all this preprocessed stuff!

# Attach the overall trial and trip counters to selectET, then keep only
# the real (realOrNot == 1) rows
selectET <- selectET %>%
  full_join(totalTrial,
            by = c("participant", "realOrNot", "condsFile", "trial")) %>%
  full_join(totalTrip,
            by = c("participant", "realOrNot", "condsFile", "trial", "trip")) %>%
  filter(realOrNot == 1)

# Write the preprocessed data to disk. FALSE is spelled out because F is an
# ordinary variable that can be reassigned.
data.table::fwrite(selectET, "selectETFallPilot2024.csv", row.names = FALSE)

EM6_Eyetracking_preprocessing_Step1

Candice Koolhaas

2024-11-10

Extracting the eyetracking data from hdf5 file