setwd("D:/Canna_d/EM6_stuff/Feb28_all_pilot_business/scripts_and_data/processed_data")
<- read.csv("SG_ET_step1.csv", header = TRUE)
data_raw
library(dplyr)
library(tidyr)
library(zoo)
library(stringr)
library(stats)
# We need to rename the conditions
# Previous condition names were from previous version of experiment
<- data_raw %>%
data_raw mutate(
condsFile = str_replace(condsFile, "noDelay", "maintenance_delay"),
condsFile = str_replace(condsFile, "longDelay", "lag_delay")
)
# Also there was an error in how the experiment was logging the stages.
# The maintenance stage was never properly logged as separate from the shelf.
# So we make the first 300 rows of the shelf stage in maintenance conditions to be the maintenance delay.
# (This is specific to the pilot data, this problem si fixed in the final experiment)
<- data_raw %>%
data_raw group_by(participant, condsFile, trial, adjustTrip) %>%
filter(condsFile %in% c("maintenance_delay", "maintenance_delayClosing")) %>%
mutate(stage = if_else(stage == "shelf" & cumsum(stage == "shelf") <= 300, "maintenance_delay", stage))
# get mean of L and R pupil, as they are correlated
# (Jackson & Sirois, 2009)
<- data_raw %>%
data_raw mutate(mean_pupil = if_else(
is.na(left_pupil_measure1) & is.na(right_pupil_measure1), NA_real_,
if_else(is.na(left_pupil_measure1), right_pupil_measure1,
if_else(is.na(right_pupil_measure1), left_pupil_measure1,
+ right_pupil_measure1) / 2)
(left_pupil_measure1
)))
# We don't use the old "baseline" stage for anything
# So we remove it to ease the burden on datafile size
<- data_raw %>%
Less_raw_data filter(stage != "baseline")
# This is to make the visualizations easier to read
# It is just milliseconds for eeach stage
<- Less_raw_data %>%
Less_raw_data group_by(participant, condsFile, trial, adjustTrip, stage) %>%
mutate(milliseconds = seq(16.6, by = 16.6, length.out = n())) %>%
ungroup()
EM6_Preprocessing_step2_March7
Overview
In these preprocessing steps, we follow the guidelines of Mathot et al. (2023) and Kret & Sjak Shie (2018) as closely as possible. Below is the general outline of the steps, and following that will be all of the code and explanations for each step.
- Remove impossible values
- Smooth pupil trace
- Calculate velocity
- Detect blinks
- Interpolate
- Detect blinks again
- Interpolate again
- Smooth final trace
- Calculate baseline
- Perform subtractive baseline correction
0. Load in data
1. Remove impossible values
First we remove any values that would be physiologically impossible (negative or outside the range of 1.5 and 9 mm. This happens due to errors in the recording of data by the eyetracker.
<- Less_raw_data %>%
Less_raw_data # remove values in mean_pupil that are negative
# remove values in mean_pupil that are below 1.5 and above 9
# Below does both at once (only includes pupil values within our range)
# Keep NA for now
filter((mean_pupil >= 1.5 & mean_pupil <= 9) | is.na(mean_pupil))
2. Smooth trace
We use a Hanning window as that is what Mathot & Vilotijevic (2023) use to lightly smooth the trace prior to blink correction.
Note that Mathot’s code is for EyeLink 1000 (1000 Hz). The default window length for his smoother is 21, which is about 20 ms. Since we are using a 60 Hz Tobii, this imperfectly translates to using a window length of 3 (for us it would be a window length of 49.8 ms, because this function has a minimum window size of 3 points).
Also note that I am just using ggplot because it makes it easier to visualize all of the participants before and after the smoother but blinks might show up as gaps because ggplot just removes values that don’t fit in the scale.
library(Convolutioner)
library(ggplot2)
# Below is old
#smoothed_trace_data <- Less_raw_data %>%
# ungroup() %>%
# group_by(participant) %>%
# mutate(smoothed_pupil = Hann(mean_pupil, buffer_size = 7))
# option 1:
<- Less_raw_data %>%
smoothed_trace_data ungroup() %>%
group_by(participant) %>%
mutate(smoothed_pupil = Hann(mean_pupil, buffer_size = 7)) %>%
mutate(smoothed_pupil = ifelse(is.na(smoothed_pupil), mean_pupil, smoothed_pupil))
# Just to compare, lets do median smoother too:
$med_smoother1 <- runmed(smoothed_trace_data$mean_pupil, k =11)
smoothed_trace_data
# visualize raw vs. smooth trace
# Filter the data for trial 1
<- smoothed_trace_data %>%
Trace_data_trial1trip1 filter(condsFile == "maintenance_delay" &
== "maintenance_delay" &
stage == 1 &
trial == 1 & !is.na(d_time))
adjustTrip
# Plot traces for all participants in trial 1
ggplot(Trace_data_trial1trip1, aes(x = milliseconds, y = mean_pupil, group = participant, color = participant)) +
geom_line(size = 1) + # Line plot for each participant's trace
facet_wrap(~ participant, scales = "free_y") + # Facet by participant, with independent y-axis
labs(title = "Raw pupil - Trial 1 Trip 1 (All Participants)",
x = "Frame",
y = "Pupil (mm)") +
theme_minimal() +
theme(legend.position = "none")
ggplot(Trace_data_trial1trip1, aes(x = milliseconds, y = smoothed_pupil, group = participant, color = participant)) +
geom_line(size = 1) + # Line plot for each participant's trace
facet_wrap(~ participant, scales = "free_y") + # Facet by participant, with independent y-axis
labs(title = "Smoothed pupil - Trial 1 Trip 1 (All Participants)",
x = "Frame",
y = "Pupil (mm)") +
theme_minimal() +
theme(legend.position = "none")
2. Calculate velocity
We calculate the velocity by getting the 1st derivative between time points, in the style of Mathot & Vilotijevic (2023).
<- smoothed_trace_data %>%
smoothed_trace_data group_by(participant) %>%
# make first value 0 because we can't get a difference with 1 value
mutate(vtrace = c(0, diff(smoothed_pupil, lag = 1)))
3. Detect blinks
For blink correction, I calculate velocity speed outliers by getting the median absolute deviation and multiplying it by a constant. This method was in Preprocessing pupil size data: Guidelines and code by Kret and Sjak Shie (2019). About the constant, they say it “should be chosen empirically by researchers so that they best fit a particular dataset. It is our experience that no”one size fits all” set of rejection criteria exists, due to differing eyetracker sampling rates, precision, noise susceptibility, and pupil detection algorithms.” From playing around with different values, it seems constants between 5 and 10 do a decent job at removing blink artifacts.
# Calculate the threshold using the MAD method
<- median(abs(smoothed_trace_data$vtrace - median(smoothed_trace_data$vtrace, na.rm = TRUE)), na.rm = TRUE)
MAD # Below is the constant we multiply by the MAD
<- 5
x <- median(smoothed_trace_data$vtrace, na.rm = TRUE) + (x * MAD)
threshold
# Identify rows where velocity exceeds the threshold
# ignore NAs because first moments of the vtrace are always NA
# and its not necessarily a blink
<- which(!is.na(smoothed_trace_data$vtrace)
blink_rows & abs(smoothed_trace_data$vtrace) >= threshold)
# Make table of how many blinks showed up for each participant
<- smoothed_trace_data %>%
blink_data filter(row_number() %in% blink_rows) %>%
select(participant, condsFile, trial, adjustTrip, stage)
# checkpoint
<- smoothed_trace_data
blink_corrected_1
# NEW changing to 30ms
for (i in blink_rows) {
<- max(1, i - 2)
start_row <- min(nrow(blink_corrected_1), i + 2)
end_row $smoothed_pupil[start_row:end_row] <- NA
blink_corrected_1 }
4. Interpolate
We will use cubic spline interpolation as per Mathot & Vilotijevic (2023), but if the gap is over 500ms, we do not interpolate.
# Spline interpolation of mean_pupil
# but DO NOT interpolate over gaps over 31 rows
library(zoo)
<- na.spline(zoo(blink_corrected_1$smoothed_pupil), maxgap = 31)
interpolated_pupil $interpolated_pupil <- interpolated_pupil
blink_corrected_1
## Visualize
# Filter the data for trial 1
<- blink_corrected_1 %>%
Trace_data_trial1trip1 filter(condsFile == "maintenance_delay" &
== "maintenance_delay" &
stage == 1 &
trial == 1 & !is.na(d_time))
adjustTrip
# Plot traces for all participants in trial 2
ggplot(Trace_data_trial1trip1, aes(x = milliseconds, y = smoothed_pupil, group = participant, color = participant)) +
geom_line(size = 1) + # Line plot for each participant's trace
facet_wrap(~ participant, scales = "free_y") + # Facet by participant, with independent y-axis
labs(title = "Blink corrected (1st iteration) pupil - Trial 1 Trip 1 (All Participants)",
x = "Frame",
y = "Pupil (mm)") +
theme_minimal() +
theme(legend.position = "none")
Warning: Removed 321 rows containing missing values or values outside the scale range
(`geom_line()`).
ggplot(Trace_data_trial1trip1, aes(x = milliseconds, y = interpolated_pupil, group = participant, color = participant)) +
geom_line(size = 1) + # Line plot for each participant's trace
facet_wrap(~ participant, scales = "free_y") + # Facet by participant, with independent y-axis
labs(title = "INTERPOLATED pupil - Trial 1 Trip 1 (All Participants)",
x = "Frame",
y = "Pupil (mm)") +
theme_minimal() +
theme(legend.position = "none")
Don't know how to automatically pick scale for object of type <zoo>. Defaulting
to continuous.
Warning: Removed 299 rows containing missing values or values outside the scale range
(`geom_line()`).
5. Smooth trace again
Smooth trace again before second round of blink correction.
# below applies Hann filter
<- blink_corrected_1 %>%
blink_corrected_1 mutate(interpolated_pupil = as.numeric(interpolated_pupil))
# old
#blink_corrected_1 <- blink_corrected_1 %>%
# group_by(participant) %>%
# mutate(smoothed_pupil2 = Hann(interpolated_pupil, buffer_size = 7))
# Just to compare, lets do median smoother too:
$med_smoother2 <- runmed(blink_corrected_1$interpolated_pupil, k =11)
blink_corrected_1
# NEw:
<- blink_corrected_1 %>%
blink_corrected_1 ungroup() %>%
group_by(participant) %>%
mutate(smoothed_pupil2 = Hann(interpolated_pupil, buffer_size = 7)) %>%
mutate(smoothed_pupil2 = ifelse(is.na(smoothed_pupil2), interpolated_pupil, smoothed_pupil2))
6. Detect blinks again
Go through blink correction a second time (recommendation by Kret & Sjak Shie).
# Get new velocity
<- blink_corrected_1 %>%
blink_corrected_1 group_by(participant) %>%
# make first value 0 because we can't get a difference with 1 value
mutate(vtrace2 = c(0, diff(smoothed_pupil2, lag = 1)))
# Calculate the threshold using the MAD method
# Only consider rows in smoothed_pupil that are not NA
<- blink_corrected_1 %>%
MAD filter(!is.na(smoothed_pupil2)) %>% # Keep only rows where smoothed_pupil is not NA
pull(vtrace2) %>% # Extract vtrace as a numeric vector
median(abs(. - median(., na.rm = TRUE)), na.rm = TRUE) } # Compute MAD
{
# Below is the constant we multiply by the MAD
<- 5
x <- median(blink_corrected_1$vtrace2, na.rm = TRUE) + (x * MAD)
threshold
# Identify rows where velocity exceeds the threshold
<- which(!is.na(blink_corrected_1$vtrace2)
blink_rows & abs(blink_corrected_1$vtrace2) >= threshold)
# Make table of how many blinks showed up for each participant
<- blink_corrected_1 %>%
blink_data filter(row_number() %in% blink_rows) %>%
select(participant, trial, adjustTrip, stage)
# checkpoint
<- blink_corrected_1
blink_corrected_2
# Replace ~17ms of rows around the blink rows with NA
for (i in blink_rows) {
<- max(1, i - 2)
start_row <- min(nrow(blink_corrected_2), i + 2)
end_row $smoothed_pupil2[start_row:end_row] <- NA
blink_corrected_2 }
7. Interpolate again
Cubic spline, unless over 250 ms.
# Spline interpolation of mean_pupil
# but DO NOT interpolate over gaps over 15 rows
<- na.spline(zoo(blink_corrected_2$smoothed_pupil2), maxgap = 31)
interpolated_pupil $interpolated_pupil2 <- interpolated_pupil
blink_corrected_2
## Visualize
# Filter the data for trial 1
<- blink_corrected_2 %>%
Trace_data_trial1trip1 filter(condsFile == "maintenance_delay" &
== "maintenance_delay" &
stage == 1 &
trial == 1 & !is.na(d_time))
adjustTrip
# Plot traces for all participants in trial 2
ggplot(Trace_data_trial1trip1, aes(x = milliseconds, y = smoothed_pupil, group = participant, color = participant)) +
geom_line(size = 1) + # Line plot for each participant's trace
facet_wrap(~ participant, scales = "free_y") + # Facet by participant, with independent y-axis
labs(title = "Blink corrected (2nd iteration) pupil - Trial 1 Trip 1 (All Participants)",
x = "Frame",
y = "Pupil (mm)") +
theme_minimal() +
theme(legend.position = "none")
Warning: Removed 321 rows containing missing values or values outside the scale range
(`geom_line()`).
# Same but make it the non-interpolated to compare
ggplot(Trace_data_trial1trip1, aes(x = milliseconds, y = interpolated_pupil2, group = participant, color = participant)) +
geom_line(size = 1) + # Line plot for each participant's trace
facet_wrap(~ participant, scales = "free_y") + # Facet by participant, with independent y-axis
labs(title = "INTERPOLATED pupil - Trial 1 Trip 1 (All Participants)",
x = "Frame",
y = "Pupil (mm)") +
theme_minimal() +
theme(legend.position = "none")
Don't know how to automatically pick scale for object of type <zoo>. Defaulting
to continuous.
Warning: Removed 321 rows containing missing values or values outside the scale range
(`geom_line()`).
8. Smooth final trace
Final smoothing before baseline correction.
# below applies Hann filter
<- blink_corrected_2 %>%
blink_corrected_2 mutate(interpolated_pupil2 = as.numeric(interpolated_pupil2))
# Old
#blink_corrected_2 <- blink_corrected_2 %>%
# group_by(participant) %>%
# mutate(smoothed_pupil = Hann(interpolated_pupil2, buffer_size = 7))
# Just to compare, lets do median smoother too:
$med_smoother3 <- runmed(blink_corrected_2$interpolated_pupil2, k =11)
blink_corrected_2
# NEw:
<- blink_corrected_2 %>%
blink_corrected_2 ungroup() %>%
group_by(participant) %>%
mutate(smoothed_pupil3 = Hann(interpolated_pupil2, buffer_size = 7)) %>%
mutate(smoothed_pupil3 = ifelse(is.na(smoothed_pupil3), interpolated_pupil2, smoothed_pupil3))
9. Calculate baseline and remove invalid trials
Because we are interested in how cognitive effort (measured by pupil size) explains variance in Memory Usage, we want to collect the baseline period before participants undergo any cognitive effort for each trip to the list. Therefore, out baseline period will be the median of the first 500ms of each list trip.
write.csv(blink_corrected_2, "calculate_baseline_step.csv", row.names = F)
<- blink_corrected_2 %>%
baselines_500 filter(stage == "list") %>%
group_by(participant, condsFile, trial, adjustTrip, stage) %>%
mutate(
non_na_count = sum(!is.na(smoothed_pupil3[1:min(30, n())])),
baseline = ifelse(non_na_count >= 14, median(smoothed_pupil3[1:min(30, n())], na.rm = TRUE), NA_real_)
%>%
) select(-non_na_count) %>% # Remove the helper column
ungroup()
# Get the count bad trials
<- baselines_500 %>%
baselines_NA_500 filter(is.na(baseline)) %>%
distinct(participant, condsFile, trial, adjustTrip) %>%
filter(adjustTrip == 1)
cat("There are this many bad baselines:", nrow(baselines_NA_500), "\n")
There are this many bad baselines: 12
# Looks good, now we isolate baselines and combine
<- baselines_500 %>%
baselines_500 select(participant, condsFile, trial, adjustTrip, stage, baseline) %>%
distinct()
<- blink_corrected_2 %>%
blink_corrected_2 left_join(baselines_500, by = c("participant", "condsFile", "trial", "adjustTrip"))
# remove any trips where the baseline could not be collected:
<- blink_corrected_2 %>%
blink_corrected_2 group_by(participant, condsFile, trial, adjustTrip) %>%
filter(!all(is.na(baseline))) %>%
ungroup()
# Visualize the baselines:
hist(blink_corrected_2$baseline)
10. Subtractive baseline correction
We use subtractive correction as it is the most robust form of baseline correction (Mathot et al., 2018).
#Get change-from-baseline through subtractive correction
<- blink_corrected_2 %>%
baseline_corrected_data group_by(participant, condsFile, trial, adjustTrip, stage.x) %>%
mutate(change_from_baseline = ifelse(is.na(smoothed_pupil3), NA, smoothed_pupil3 - baseline))
## Visualize
# Filter the data for trial 1
<- baseline_corrected_data %>%
Trace_data_trial1trip1 filter(condsFile == "maintenance_delay" &
== "maintenance_delay" &
stage.x == 1 &
trial == 1 & !is.na(d_time))
adjustTrip
ggplot(Trace_data_trial1trip1, aes(x = milliseconds, y = smoothed_pupil3, group = participant, color = participant)) +
geom_line(size = 1) + # Line plot for each participant's trace
facet_wrap(~ participant, scales = "free_y") + # Facet by participant, with independent y-axis
labs(title = "Smoothed pupil - Trial 1 Trip 1 (All Participants)",
x = "Frame",
y = "Pupil (mm)") +
theme_minimal() +
theme(legend.position = "none")
Warning: Removed 149 rows containing missing values or values outside the scale range
(`geom_line()`).
ggplot(Trace_data_trial1trip1, aes(x = milliseconds, y = change_from_baseline, group = participant, color = participant)) +
geom_line(size = 1) + # Line plot for each participant's trace
facet_wrap(~ participant, scales = "free_y") + # Facet by participant, with independent y-axis
labs(title = "Change from baseline - Trial 1 Trip 1 (All Participants)",
x = "Frame",
y = "Pupil (mm)") +
theme_minimal() +
theme(legend.position = "none")
Warning: Removed 149 rows containing missing values or values outside the scale range
(`geom_line()`).
Save the data!
#write.csv(baseline_corrected_data, "scripts_and_data/processed_data/SG_ET_step2_March24.csv", row.names = F)
#write.csv(data_raw, "RAW_pupil_data.csv", row.names = F)