surps.RTs %>%
pivot_longer(starts_with("surp_"), names_to = "model", values_to = "surprisal", names_prefix = "surp_") %>%
drop_na() %>%
ggplot(aes(x=surprisal,y=RT)) +
# geom_density(aes(x=surprisal,y=1200+750*..density..)) +
geom_hex() + scale_fill_gradient(low="#DDDDDD",high="black") +
# geom_point(alpha=.1,color="gray") +
geom_smooth() +
xlab("surprisal") +
facet_wrap(~model)+
ggtitle("surprisal vs self-paced RT on naturalstories data")
`geom_smooth()` using method = 'gam' and formula 'y ~ s(x, bs = "cs")'
# with arithmetic statistics
surps.RTs.perword %>%
pivot_longer(starts_with("surp_"), names_to = "model", values_to = "surprisal", names_prefix = "surp_") %>%
drop_na() %>%
ggplot(aes(x=surprisal,y=meanItemRT)) +
# geom_density(data=surps.RTs %>%
# pivot_longer(starts_with("surp_"), names_to = "model", values_to = "surprisal", names_prefix = "surp_") %>%
# drop_na(),
# aes(x=surprisal,y=-1250*..density..)) +
geom_errorbar(aes(ymin=meanItemRT-sdItemRT, ymax=meanItemRT+sdItemRT),alpha=.1, color="black") +
geom_point(alpha=.1,color="blue") +
xlab("surprisal") +
facet_wrap(~model)+
ggtitle("surprisal vs mean RT (blue) +-1sd (black) per item")
# with geometric statistics
# surps.RTs.perword %>%
# pivot_longer(starts_with("surp_"), names_to = "model", values_to = "surprisal", names_prefix = "surp_") %>%
# drop_na() %>%
# ggplot(aes(x=surprisal,y=gmeanItemRT)) +
# geom_density(aes(x=surprisal,y=1200+750*..density..)) +
# geom_errorbar(aes(ymin=gmeanItemRT/gsdItemRT, ymax=gmeanItemRT*gsdItemRT),alpha=.1, color="black") +
# geom_point(alpha=.1,color="blue") +
# xlab("surprisal") +
# facet_wrap(~model)+
# ggtitle("surprisal vs geometric mean RT (blue) * gsd^{+-1} (black) per item")
Almost all the data is at low surprisal. The variance increases a bit with higher surprisal.
surps.RTs %>%
pivot_longer(starts_with("surp_") &!contains("boyce"),
names_to = "model", values_to = "surprisal", names_prefix = "surp_") %>%
drop_na() %>%
ggplot(aes(x=surprisal,y=RT)) +
stat_boxplot(aes(x=cut_width(surprisal,boundary=0, width=1)),notch=T,varwidth=F,position="identity",alpha=.75,
outlier.color="gray", outlier.alpha = .5, outlier.size= .5,outlier.shape=20, color="blue"
) +
# stat_summary_bin(fun.data="mean_sdl", geom = "crossbar", binwidth = .5, alpha=0.2, fill="blue", size=.1)+
# stat_summary_bin(fun="median", binwidth = 1, alpha=0.2, size=.5, shape=3, color="red")+
xlab("surprisal (binned)") + theme(axis.text.x = element_text(angle=90, vjust=.5, hjust=1)) +
facet_wrap(~model,nrow=1) +
ggtitle("surprisal vs RT, binned by surprisal")
surps.RTs %>%
pivot_longer(starts_with("surp_boyce"),
names_to = "model", values_to = "surprisal", names_prefix = "surp_") %>%
drop_na() %>%
ggplot(aes(x=surprisal,y=RT)) +
stat_boxplot(aes(x=cut_width(surprisal,boundary=0, width=1)),notch=T,varwidth=F,position="identity",alpha=.75,
outlier.color="gray", outlier.alpha = .5, outlier.size= .5,outlier.shape=20, color="blue"
) +
# stat_summary_bin(fun.data="mean_sdl", geom = "crossbar", binwidth = .5, alpha=0.2, fill="blue", size=.1)+
# stat_summary_bin(fun="median", binwidth = 1, alpha=0.2, size=.5, shape=3, color="red")+
xlab("surprisal (binned)") + theme(axis.text.x = element_text(angle=90, vjust=.5, hjust=1)) +
facet_wrap(~model) +
ggtitle("surprisal vs RT, binned by surprisal")
plot_compare_models_smooth<-function(data, what_to_predict, surp_threshold = 100, ...){
data %>%
pivot_longer(starts_with("surp_") & !ends_with("_c"),
names_to = "model", values_to = "surprisal",
names_prefix = "surp_") %>%
filter(!is.na(get(what_to_predict)),!is.na(surprisal)) %>% filter(surprisal<=surp_threshold) %>%
mutate(model=factor(model)) %>%
mutate(model=fct_relevel(model,"GPT2","GPT2-large","GPT-Neo","GPT-J","GPT3", after = 3)) %>%
ggplot(aes(color=model)) +
geom_smooth(aes(x=surprisal,y=get(what_to_predict)), ...) +
labs(x = "surprisal", y=what_to_predict)
}
What to consider in choosing our basis?
bs = 'cr' places knots by quantile (probably not what we want, since it will make things wiggly in the low-surprisal area where all the data are, artificially inflating edf, I think)bs = 'bs', bs = 'ps', bs = 'ad') places knots evenlybs = 'tp' places knots randomly max.knots (see smooth.construct.tp.smooth.spec)#quick and dirty comparison
(
surps.RTs.perword %>%
plot_compare_models_smooth("meanItemRT", #surp_threshold = 30,
method = "gam", formula = y ~ s(x, bs = "tp")) + ggtitle("tp")
)+(
surps.RTs.perword %>%
plot_compare_models_smooth("meanItemRT", #surp_threshold = 30,
method = "gam", formula = y ~ s(x, bs = "cr")) + ggtitle("cr")
)+ plot_layout(guides = 'collect') + plot_annotation("SPRT vs surprisal GAMs quick model comparison")
surps.RTs_lag_c <- surps.RTs %>%
inner_join(surps.RTs_lag_c.perword) %>%
mutate(Word_ID=as_factor(str_c(story_num, word_num_in_story, sep="_"))) %>%
relocate(Word_ID,word)
Joining, by = c("word_num_in_sentence", "sentence_num", "sentence", "word", "offset", "story_num", "surp_GPT2", "surp_GPT2-large", "surp_GPT-Neo", "surp_GPT-J", "surp_GPT3", "nth_occurence_in_story", "word_num_in_story", "nItem", "meanItemRT", "sdItemRT", "gmeanItemRT", "gsdItemRT", "word_human", "wordp_whole", "wordp_word", "wordp_1", "is_in_common_vocab", "wordlength", "Word", "surp_boyce_txl", "tokencount_boyce_txl", "surp_boyce_ngram", "tokencount_boyce_ngram", "surp_boyce_grnn", "tokencount_boyce_grnn", "freq", "length")
Fit a GAM for the GPT data
job::job({
gam_gpt3_RT <- fit_bam_SPRT(formula1, surps.RTs_lag_c, "GPT3")
gam_gpt2_RT <- fit_bam_SPRT(formula1, surps.RTs_lag_c, "GPT2")
gam_gpt2l_RT <- fit_bam_SPRT(formula1, surps.RTs_lag_c, "GPT2-large")
gam_gptj_RT <- fit_bam_SPRT(formula1, surps.RTs_lag_c, "GPT-J")
gam_gptneo_RT <- fit_bam_SPRT(formula1, surps.RTs_lag_c, "GPT-Neo")
write_rds(gam_gpt3_RT , file="scratch/gam_gpt3_RT.rds")
write_rds(gam_gpt2_RT , file="scratch/gam_gpt2_RT.rds")
write_rds(gam_gpt2l_RT , file="scratch/gam_gpt2l_RT.rds")
write_rds(gam_gptj_RT , file="scratch/gam_gptj_RT.rds")
write_rds(gam_gptneo_RT, file="scratch/gam_gptneo_RT.rds")
},
title="fitting GAMs RT v GPT surprisals")
# Data processing as in Boyce and Levy, in data_processing_mazeRTs.Rmd
maze_pre_error<-read_csv("natural-stories-surprisals/maze_data/maze_pre_error.csv")
(surps.RTs_lag_c.maze_pre_error %>%
plot_compare_models_smooth("mazeRT", #surp_threshold = 30,
method = "gam", formula = y ~ s(x, bs = "tp")) + ggtitle("thin plate")
)+(
surps.RTs_lag_c.maze_pre_error %>%
plot_compare_models_smooth("mazeRT",# surp_threshold = 30,
method = "gam", formula = y ~ s(x, bs = "cr")) + ggtitle("cubic regression")
) + plot_layout(guides = 'collect') + plot_annotation("Maze RT vs surprisal GAMs quick model comparison")
# TODO: Morgan suggested adding randomeffects like this
job::job({
gam_gpt3_maze_withrandomeffects <- gam(
rt ~
s(surprisal, subject, bs="fs") + s(subject, bs='re') + s(word, bs='re') +
s(surprisal, bs="cr", k=20) +
te(freq, len, bs="cr") +
s(prev_surp, bs="cr", k=20) +
te(prev_freq, prev_len, bs="cr"),
data=gpt3_maze,
method="REML")
},
title = "gam_gpt3_maze_withrandomeffects")
plot.gam_surpRTmaze
plot.gam_surpRTmaze_boyce
draw_gam_both<-function(gam_model){
print(gam.check(gam_model))
draw(gam_model, seWithMean=T, shift=coef(gam_model)["(Intercept)"], select=c(1,3))
}