Read in data and load libraries

data_raw <- read.csv("selectETFallPilot2024.csv", header = TRUE)

library(dplyr)
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
library(tidyr)
library(zoo)
## Warning: package 'zoo' was built under R version 4.2.3
## 
## Attaching package: 'zoo'
## The following objects are masked from 'package:base':
## 
##     as.Date, as.Date.numeric
library(stringr)
## Warning: package 'stringr' was built under R version 4.2.3

The conditions still have their old names, so we need to update those before we do anything else:

# We need to systematically rename the condition labels so that they make sense for this experiment!
# Apply replacements to the values in the condsFile column
data_raw <- data_raw %>%
  mutate(
    condsFile = str_replace(condsFile, "noDelay", "delayAfterList"),
    condsFile = str_replace(condsFile, "longDelay", "delayBeforeList")
  )
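
A quick sanity check (just a sketch, not required for the pipeline) confirms no old labels survived the replacement:

# Verify the old condition labels are gone
stopifnot(!any(str_detect(data_raw$condsFile, "noDelay|longDelay")))
unique(data_raw$condsFile)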

In the first preprocessing step, padding was added to each stage so that everything had the same number of rows, which introduced a lot of NA rows. Here is what Bill said about this: “Let’s say for one trip, the child spends 1 second on the list, then it will have 60 frames. In another trip, the child spends 2 seconds, then it will have 120 frames. With that, they will have different rows of data for the time they spent on the list. This is not good for the data analysis for the eye tracking if I recall correctly. So I pad it with NA rows based on the maximum so they have the same number of rows.” We will keep this in mind moving forward when we visualize/calculate anything.
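
As a concrete check (a sketch; it assumes, consistently with the filters used below, that padded rows have NA in d_time), we can count how many padding rows each trip carries:

# Count padded rows per trip, identified by NA d_time
data_raw %>%
  group_by(participant, condsFile, trial, trip, stage) %>%
  summarise(padded_rows = sum(is.na(d_time)), .groups = "drop") %>%
  arrange(desc(padded_rows)) %>%
  head()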

Next, we compute the mean of the left and right pupil. The two are highly correlated, so averaging them to obtain a single raw pupil value is standard practice (Kret & Sjak-Shie, 2019).

data_raw <- data_raw %>%
  mutate(mean_pupil = if_else(
    is.na(left_pupil_measure1) & is.na(right_pupil_measure1), NA_real_, #If both columns are NA, it will just be NA
    if_else(is.na(left_pupil_measure1), right_pupil_measure1, #If left is na, use right pupil
            if_else(is.na(right_pupil_measure1), left_pupil_measure1, #If right is na, use left pupil
                    (left_pupil_measure1 + right_pupil_measure1) / 2) #If they are both available, get the mean!
    )))
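
The nested if_else() above is equivalent to a rowMeans() call with na.rm = TRUE, except that rowMeans() returns NaN when both eyes are missing. A sketch of that equivalence check (mean_pupil_alt and check are hypothetical names, not part of the pipeline):

# Equivalent formulation via rowMeans(); NaN (both eyes NA) converted back to NA
check <- data_raw %>%
  mutate(mean_pupil_alt = rowMeans(cbind(left_pupil_measure1, right_pupil_measure1), na.rm = TRUE),
         mean_pupil_alt = if_else(is.nan(mean_pupil_alt), NA_real_, mean_pupil_alt))
stopifnot(isTRUE(all.equal(check$mean_pupil, check$mean_pupil_alt)))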

Since we are primarily interested in the pupil during the list phase, let’s isolate that to make the dataset smaller and our lives easier:

L_data <- data_raw %>%
  filter(stage == "list")

Check data quality

Let’s see how many NA rows there are:

na_summary <- L_data %>%
  group_by(participant, condsFile, trial, adjustTrip) %>%
  filter(!is.na(d_time)) %>%  # Only keep rows where d_time is not NA
  summarize(
    total = n(),
    na_count = sum(is.na(mean_pupil)),
    na_percentage = (na_count / total) * 100
  ) %>%
  arrange(participant, condsFile, trial, adjustTrip)
## `summarise()` has grouped output by 'participant', 'condsFile', 'trial'. You
## can override using the `.groups` argument.
# Which trips are over 40% NA?
over_40 <- na_summary %>%
  filter(na_percentage > 40.0)

nrow(over_40)  # count of problematic trips (length() would count the tibble's columns, not its rows)

Participants 12, 15, and 26 have very messy data; they will be excluded. Participant 13’s no-delay (now delay-after-list) trial 1 trip 1, participant 28’s delay-after-list trial 3 trip 3, and participant 29’s delay-before-list trial 4 trip 2 are also bad.

Next, we apply blink correction and a running median filter.

# We need to do this for every trip
L_data2 <- L_data

# Add NextPupil, PrevPupil, NextTotal, PrevTotal columns within groups
L_data2 <- L_data2 %>%
  group_by(participant, condsFile, trial, trip) %>%
  mutate(
    NextPupil = lead(mean_pupil),
    PrevPupil = lag(mean_pupil),
    NextTotal = lead(tripCalTime),
    PrevTotal = lag(tripCalTime)
  ) %>%
  ungroup()

# Calculate x, y, and DilationSpeed
L_data2 <- L_data2 %>%
  mutate(
    x = abs((mean_pupil - PrevPupil) / (tripCalTime - PrevTotal)),
    y = abs((NextPupil - mean_pupil) / (NextTotal - tripCalTime)),
    DilationSpeed = pmax(x, y, na.rm = TRUE)
  )

# Calculate the threshold using the MAD (median absolute deviation) method
MAD <- median(abs(L_data2$DilationSpeed - median(L_data2$DilationSpeed, na.rm = TRUE)), na.rm = TRUE)
n_mad <- 7  # how many MADs above the median a dilation speed may be before we call it a blink
threshold <- median(L_data2$DilationSpeed, na.rm = TRUE) + (n_mad * MAD)

# Identify rows where DilationSpeed exceeds the threshold
blink_rows <- which(L_data2$DilationSpeed >= threshold)

# Replace rows around the blink rows with NA
for (i in blink_rows) {
  start_row <- max(1, i - 2)
  end_row <- min(nrow(L_data2), i + 2)
  L_data2$mean_pupil[start_row:end_row] <- NA
}
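
To make the MAD rule concrete, here is a tiny worked example (a sketch with made-up numbers, not part of the pipeline); base R's mad() with constant = 1 computes the same raw MAD as the line above:

d <- c(0.10, 0.12, 0.11, 0.13, 5.00)          # one blink-like speed spike
thr <- median(d) + 7 * mad(d, constant = 1)   # 0.12 + 7 * 0.01 = 0.19
d > thr                                       # flags only the 5.00 spike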

# Now we check data quality again
na_summary <- L_data2 %>%
  group_by(participant, condsFile, trial, adjustTrip) %>%
  filter(!is.na(d_time)) %>%  # Only keep rows where d_time is not NA
  summarize(
    total = n(),
    na_count = sum(is.na(mean_pupil)),
    na_percentage = (na_count / total) * 100
  ) %>%
  arrange(participant, condsFile, trial, adjustTrip)
## `summarise()` has grouped output by 'participant', 'condsFile', 'trial'. You
## can override using the `.groups` argument.
# Which trips are over 40% NA?
over_40 <- na_summary %>%
  filter(na_percentage > 40.0)

# Looking good. Let's remove those bad trips from the data.

L_data2 <- L_data2 %>%
  anti_join(over_40, by = c("participant", "condsFile", "trial", "adjustTrip"))

# Let's add a running median filter, a good default. runmed() is in the stats
# package, which is attached by default, so no library() call is needed; it
# tolerates the remaining NAs (expect a warning).
L_data2$smoothed_pupil <- runmed(L_data2$mean_pupil, k = 11)  # 11-frame window (~183 ms at 60 Hz)
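
One caveat: applied to the full column like this, runmed() smooths across trip boundaries, so a handful of frames at each trip edge mix data from the neighbouring trip. A group-wise sketch for comparison (L_data2_bytrip and smoothed_pupil_bytrip are hypothetical names; it assumes every trip has at least 11 frames):

L_data2_bytrip <- L_data2 %>%
  group_by(participant, condsFile, trial, trip) %>%
  mutate(smoothed_pupil_bytrip = runmed(mean_pupil, k = 11)) %>%
  ungroup()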

# compare traces
Trace_data <- L_data2 %>%
  filter(participant == "pilotv2027" & 
           condsFile == "delayAfterList" & 
           trial == 1 & 
           adjustTrip == 1 & 
           !is.na(d_time))

plot(Trace_data$tripCalFrame, Trace_data$smoothed_pupil, 
     type = "l",                        # 'l' specifies a line plot
     col = "blue",                      # Line color
     lwd = 2,                           # Line width
     main = "pilotv2027, delayAfterList, trial 1 trip 1, smoothed", 
     xlab = "Frame", 
     ylab = "Pupil change",
     ylim = c(0, 5.0))               # Set y-axis limits

# RAW
plot(Trace_data$tripCalFrame, Trace_data$mean_pupil, 
     type = "l",                        # 'l' specifies a line plot
     col = "blue",                      # Line color
     lwd = 2,                           # Line width
     main = "pilotv2027, delayAfterList, trial 1 trip 1, raw", 
     xlab = "Frame", 
     ylab = "Pupil change",
     ylim = c(0, 5.0))               # Set y-axis limits
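
Instead of eyeballing two separate panels, an overlay makes the effect of the filter easier to see (a sketch using the same base graphics):

plot(Trace_data$tripCalFrame, Trace_data$mean_pupil,
     type = "l", col = "grey60", lwd = 1,
     main = "pilotv2027, delayAfterList, trial 1 trip 1",
     xlab = "Frame", ylab = "Pupil size", ylim = c(0, 5.0))
lines(Trace_data$tripCalFrame, Trace_data$smoothed_pupil, col = "blue", lwd = 2)
legend("topright", legend = c("raw", "smoothed"),
       col = c("grey60", "blue"), lwd = c(1, 2))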

Next, let’s check the data quality of the baseline stages.

baselines <- data_raw %>%
  filter(stage == "baseline")

# blink correction and same running median filter
baselines <- baselines %>%
  group_by(participant, condsFile, trial) %>%
  mutate(
    NextPupil = lead(mean_pupil),
    PrevPupil = lag(mean_pupil),
    NextTotal = lead(tripCalTime),
    PrevTotal = lag(tripCalTime)
  ) %>%
  ungroup()

# Calculate x, y, and DilationSpeed
baselines <- baselines %>%
  mutate(
    x = abs((mean_pupil - PrevPupil) / (tripCalTime - PrevTotal)),
    y = abs((NextPupil - mean_pupil) / (NextTotal - tripCalTime)),
    DilationSpeed = pmax(x, y, na.rm = TRUE)
  )

# Calculate the threshold using the MAD method
MAD <- median(abs(baselines$DilationSpeed - median(baselines$DilationSpeed, na.rm = TRUE)), na.rm = TRUE)
n_mad <- 7  # same multiplier as for the list phase
threshold <- median(baselines$DilationSpeed, na.rm = TRUE) + (n_mad * MAD)

# Identify rows where DilationSpeed exceeds the threshold
blink_rows <- which(baselines$DilationSpeed >= threshold)

# Replace rows around the blink rows with NA
for (i in blink_rows) {
  start_row <- max(1, i - 2)
  end_row <- min(nrow(baselines), i + 2)
  baselines$mean_pupil[start_row:end_row] <- NA
}

na_summary_baselines <- baselines %>%
  group_by(participant, condsFile, trial) %>%
  filter(!is.na(d_time)) %>%  # Only keep rows where d_time is not NA
  summarize(
    total = n(),
    na_count = sum(is.na(mean_pupil)),
    na_percentage = (na_count / total) * 100
  ) %>%
  arrange(participant, condsFile, trial)
## `summarise()` has grouped output by 'participant', 'condsFile'. You can
## override using the `.groups` argument.
over_40 <- na_summary_baselines %>%
  filter(na_percentage > 40.0)

# remove these trials
L_data2 <- L_data2 %>%
  anti_join(over_40, by = c("participant", "condsFile", "trial"))

baselines <- baselines %>%
  anti_join(over_40, by = c("participant", "condsFile", "trial"))

baselines$smoothed_baseline <- runmed(baselines$mean_pupil, k = 11)

Now we need to find a good 500 ms streak of data, take the median of that, and merge it into L_data.

baselines <- baselines %>%
  ungroup() %>% 
  select(participant, condsFile, trial, stage, baseline_d_time = d_time, smoothed_baseline, baselineTime = tripCalTime)

# First let's find where we don't have enough data for a baseline

problematic_cases <- baselines %>%
    group_by(participant, condsFile, trial) %>%
    mutate(
        valid_chunk_found = {
            if (n() <= 30) {
                FALSE  # Too short to even skip the first 30 rows
            } else {
                pupil_values <- smoothed_baseline[31:n()]
                if (length(pupil_values) < 30) {
                    FALSE  # Not enough rows left to form a chunk
                } else {
                    # Check if there is any continuous stretch of 30 non-NA rows
                    any(sapply(1:(length(pupil_values) - 29), function(i) {
                        all(!is.na(pupil_values[i:(i + 29)]))
                    }))
                }
            }
        }
    ) %>%
    filter(!valid_chunk_found) %>%  # Keep only groups where no valid chunk is found
    ungroup()
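
The sapply() scan works, but a run-length encoding of the NA mask expresses the same test more compactly. A sketch of that alternative (has_valid_chunk is a hypothetical helper name):

# TRUE if, after skipping `skip` frames, any non-NA run of length >= len exists
has_valid_chunk <- function(x, skip = 30, len = 30) {
  if (length(x) < skip + len) return(FALSE)   # too short to hold a chunk
  r <- rle(!is.na(x[(skip + 1):length(x)]))   # runs of non-NA (TRUE) / NA (FALSE)
  any(r$values & r$lengths >= len)
}
# e.g. within the grouped pipeline: filter(!has_valid_chunk(smoothed_baseline))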

# remove those instances
baselines <- baselines %>%
  anti_join(problematic_cases, by = c("participant", "condsFile", "trial"))

# Below will skip the first 30 rows for each baseline period, then look for the
# next consecutive non-NA streak of 30 rows (30 because that corresponds to
# 500 ms!)
baselines <- baselines %>%
    group_by(participant, condsFile, trial) %>%
    mutate(
        # Find a continuous stretch of 30 rows with no NA values after skipping the first 30 rows
        baseline = {
            # Check if the group has at least 31 rows
            if (n() > 30) {
                # Get the Pupil values after skipping the first 30 rows
                pupil_values <- smoothed_baseline[31:n()]
                
                # Identify the first continuous chunk of 30 rows without any NAs
                valid_chunk_found <- FALSE
                for (i in seq_len(length(pupil_values) - 29)) {
                    # Check for a chunk of 30 non-NA values
                    chunk <- pupil_values[i:(i + 29)]
                    if (all(!is.na(chunk))) {
                        # If we find a valid chunk, calculate the median and break the loop
                        baseline_median <- median(chunk, na.rm = TRUE)
                        valid_chunk_found <- TRUE
                        break
                    }
                }
                
                # Assign the median of the first valid chunk, or NA if no chunk was found
                if (valid_chunk_found) baseline_median else NA_real_
            } else {
                NA_real_  # Not enough rows to find a valid chunk
            }
        }
    ) %>%
    ungroup() %>%
    # Keep only one row per group with the `baseline` value
    distinct(participant, condsFile, trial, .keep_all = TRUE) %>%
    select(-stage, -baseline_d_time, -smoothed_baseline, -baselineTime)

Actually, I am curious to compare a baseline gathered like the above against simply using the first 500 ms of the list phase itself… so let’s include that too and compare them side by side…

L_data_baseline2 <- L_data2 %>%
  group_by(participant, condsFile, trial, adjustTrip) %>%
  slice_head(n = 30) %>%  
  summarise(baseline2 = median(smoothed_pupil, na.rm = TRUE), .groups = "drop") 
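
A quick way to see how the two baselines relate (a sketch; baseline_compare is a hypothetical name): join them and correlate.

baseline_compare <- L_data_baseline2 %>%
  left_join(baselines, by = c("participant", "condsFile", "trial"))
cor(baseline_compare$baseline, baseline_compare$baseline2, use = "complete.obs")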

Another thing that might be interesting: instead of taking the median change from baseline over the entire list phase, take the median of the last 500 ms (30 rows) of the list phase, because that’s the point where participants have “loaded up” their memory with the items they want to remember.

pupil_end_trip <- L_data2 %>%
  filter(!is.na(d_time)) %>%  # Filter out rows where d_time is NA
  group_by(participant, condsFile, trial, adjustTrip) %>%
  slice_tail(n = 30) %>%  # Select only the last 30 rows of each group
  summarise(last_500ms = median(smoothed_pupil, na.rm = TRUE), .groups = "drop") 

Now combine these measures back with the list-phase data (L_data2):

L_data3 <- L_data2 %>%
  left_join(baselines %>% select(participant, condsFile, trial, baseline), 
            by = c("participant", "condsFile", "trial"))

L_data3 <- L_data3 %>%
  left_join(L_data_baseline2 %>% select(participant, condsFile, trial, adjustTrip, baseline2), 
            by = c("participant", "condsFile", "trial", "adjustTrip"))

L_data3 <- L_data3 %>%
  left_join(pupil_end_trip %>% select(participant, condsFile, trial, adjustTrip, last_500ms), 
            by = c("participant", "condsFile", "trial", "adjustTrip"))

Now subtract the baseline from the pupil values to get change-from-baseline!

#Get change-from-baseline through subtractive correction
baseline_corrected_data <- L_data3 %>%
  mutate(change_from_baseline_classic = ifelse(is.na(smoothed_pupil), NA, smoothed_pupil - baseline))

# Same for backup baseline...
baseline_corrected_data <- baseline_corrected_data %>%
  mutate(change_from_baseline2 = ifelse(is.na(smoothed_pupil), NA, smoothed_pupil - baseline2))

# And for last 500ms pupil...
baseline_corrected_data <- baseline_corrected_data %>%
  mutate(last500_minus_first500 = ifelse(is.na(last_500ms), NA, last_500ms - baseline2)) %>%
  mutate(last500_minus_baseline = ifelse(is.na(last_500ms), NA, last_500ms - baseline))
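
For the plotting step, a one-row-per-trip summary of the classic measure (median change from baseline over the whole list phase) is handy; a sketch (trip_summary is a hypothetical name):

trip_summary <- baseline_corrected_data %>%
  filter(!is.na(d_time)) %>%  # again, drop the NA padding rows
  group_by(participant, condsFile, trial, adjustTrip) %>%
  summarise(median_change = median(change_from_baseline_classic, na.rm = TRUE),
            .groups = "drop")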

Now… we write the data out so we can plot some stuff!

write.csv(baseline_corrected_data, "em6_eyetracking.csv", row.names = F)