Peelle and Van Engen (2020) style multiverse analysis considering possible time windows with logistic growth curve models in a dataset with words of varying frequency, stimuli with varying levels of noise, and with young or old adults.
For our analysis, we will restrict ourselves to familiar words, and will model age effects.
# get local copy by running peekbank_explore.Rmd
load("data/aoi_data_joined.Rds")
fam_data <- aoi_data_joined %>%
filter(age > 12, age <= 60,
stimulus_novelty == "familiar",
dataset_name!="mahr_coartic") %>% # (only has endTimes up to 1800ms)
mutate(age_binned = cut(age, seq(0,60,12)))
rm(aoi_data_joined)
# t_norm = [-1000,4000]
get_alpha <- function (df) {
dfw <- pivot_wider(df,
names_from = english_stimulus_label,
values_from = prop_corr) %>%
select(-administration_id)
alphas <- psych::alpha(dfw)
return(alphas$total$raw_alpha)
}
# inter-item correlations
get_interitem <- function (df) {
dfw <- pivot_wider(df,
names_from = english_stimulus_label,
values_from = prop_corr) %>%
ungroup() %>%
select(-administration_id)
corrs <- corrr::correlate(dfw)
mean_corr <- mean(rowMeans(select(corrs, -term), na.rm=TRUE), na.rm=TRUE)
rm(corrs)
return(mean_corr)
}
# Time bins are 25 ms
startTs <- seq(from = -300, to = 1500, by = 25)
# could use min(pb datasets min time) and max(pb datasets max time)
#windowLengths <- seq(from = 300, to = 2500, by = 25)
endTs <- seq(from = 0, to = 4000, by = 25)
param.grid <- expand_grid(startTs, endTs) %>%
rename(startTimes = startTs, endTimes = endTs) %>%
filter((endTimes - startTimes) > 25)
do_interitem_analysis <- function(startTime, endTime) {
print(paste(startTime, endTime))
fam_data %>% filter(t_norm > startTime,
t_norm < endTime) %>%
group_by(dataset_name, age_binned, administration_id, english_stimulus_label) %>%
summarise(prop_corr = sum(aoi == "target") /
sum(aoi %in% c("target", "distractor"))) %>%
group_by(dataset_name, age_binned) %>%
nest() %>%
mutate(interitem = lapply(data, quietly(get_interitem)),
interitem = interitem[[1]]$result) %>%
select(-data) %>%
unnest(cols = "interitem") %>%
group_by(age_binned) %>%
summarise(mean = mean(interitem, na.rm=TRUE)) %>%
mutate(startTime = startTime,
endTime = endTime)
}
# test
#do_interitem_analysis(-300, 0)
mout <- map2_dfr(param.grid$startTimes, param.grid$endTimes,
do_interitem_analysis)
# this should also work, but does not
#mout <- param.grid %>% map2_dfr(startTimes, endTimes,
# do_interitem_analysis)
# Error in as_mapper(.f, ...) : object 'endTimes' not found
write.table(mout, file="data/time-window-results.csv")
Visualize inter-item correlation (per-subject, dataset, and age-bin) as a function of start time and window length.
Maybe annotate the region that we recommend?
What is the overall relationship between strength of correlation and start time, end time, or window length?
mout <- mout %>% mutate(windowLength = endTime - startTime)
# summary(lm(data=mout, mean ~ startTime + windowLength))
cor(mout[,2:5])[1,2:4]
## startTime endTime windowLength
## 0.3481233 0.6771913 0.4978535
Correlations only among the positive IICs:
prop_positive = length(which(mout$mean>0)) / nrow(mout) # 92.6% positive
hist(mout$mean)
mout_pos <- mout %>% filter(mean > 0)
cor(mout_pos[,2:5])[1,2:4]
## startTime endTime windowLength
## 0.2849253 0.5925008 0.4139564
0.93 of the time windows investigated resulted in positive IICs, and the median IIC from all simulations was 0.05. The correlations of IIC with start time, end time, and window length are shown above: end time is most predictive of higher IIC.
Where do we get bad ICCs? (<.01 – should we consider a higher threshold? e.g. median IIC = .048)
#require(GGally)
mout_neg <- mout %>% filter(mean < 0.01)
#ggpairs(mout_neg, columns=2:5, aes(color=age_binned), alpha=.3)
summary(mout_neg)
## age_binned mean startTime endTime
## Length:4424 Min. :-0.045365 Min. :-300.0 Min. : 0.0
## Class :character 1st Qu.:-0.013926 1st Qu.:-125.0 1st Qu.: 525.0
## Mode :character Median :-0.004539 Median : 125.0 Median : 775.0
## Mean :-0.006594 Mean : 139.4 Mean : 813.2
## 3rd Qu.: 0.003592 3rd Qu.: 300.0 3rd Qu.:1075.0
## Max. : 0.009991 Max. :1450.0 Max. :2150.0
## windowLength
## Min. : 50.0
## 1st Qu.: 300.0
## Median : 600.0
## Mean : 673.8
## 3rd Qu.: 975.0
## Max. :1925.0
start_lt300 = length(which(mout_neg$startTime < 300)) / nrow(mout_neg)
start_lt500 = length(which(mout_neg$startTime < 500)) / nrow(mout_neg)
# condition on start time at least 300ms like reasonable researchers?
# which(mout_neg$startTime >= 300 &
winLength_lt1500 = length(which(mout_neg$windowLength < 1500)) / nrow(mout_neg) # .94
win1500_start300 = length(which(mout_neg$startTime >= 300 & mout_neg$windowLength >= 1500)) / nrow(mout_neg)
rec_settings_iic_avg = mean(subset(mout, startTime>=500 & windowLength>=1500)$mean)
72% of low IICs (<.01) had start times <300ms, but pushing the start time out to at least 500 ms eliminated 90% of the low IICs. A window length of at least 1500 ms eliminated 94% of low ICCs, and this threshold combined with a start time of at least 300 ms eliminated all but 0.5% of low ICCs. A start time of 500 ms and a window of at least 1500 ms resulted in no IICs < .01, and an average IIC of 0.08.