Are semantic associations related to individuals differences? Namely: self-report measures of visual, linguistic, spatial, and novel processing biases
The task: participants given cue and asked to produce 4 associates.
d.raw = read.csv("../data/assocData_complete.csv", header = TRUE) %>%
select(workerId, cue, associate, num, forwardStrength, backwardStrength, visual, spatial, novel, word, Dom_PoS_SUBTLEX, Lg10WF)
mean.associates = d.raw %>%
group_by(workerId) %>%
summarise(n = n()) %>%
summarise(mean = mean(n))
There are 65 participants. The mean number of associates per participants is 156.
measures = d.raw %>%
group_by(workerId) %>%
slice(1) %>%
ungroup() %>%
select(workerId, visual, spatial, word, novel)
measures %>%
gather("measure", "value", 2:5) %>%
ggplot(aes(x = value, fill = measure)) +
geom_density(alpha = .4) +
facet_grid(~ measure) +
theme_bw() +
theme(legend.position ="none")
What are the pairwise correlations across measures?
cormat = round(cor(measures[,2:5]),2)
cormat[upper.tri(cormat)]<- NA
cormat.long = reshape2::melt(cormat, na.rm = TRUE)
ggplot(data = cormat.long, aes(x=Var1, y=Var2, fill=value)) +
geom_tile(color = "white")+
geom_tile() +
geom_text(aes(Var1, Var2, label = value),
color = "black", size = 4) +
scale_fill_gradient2(low = "blue", high = "red", mid = "white",
midpoint = 0, limit = c(-1,1), space = "Lab",
name="Pearson\nCorrelation") +
theme_minimal() +
theme(axis.text.x = element_text(angle = 45, vjust = 1,
size = 12, hjust = 1))+
ylab("") +
xlab("") +
coord_fixed()
Visual and spatial correlation
kable(tidy(cor.test(measures$visual, measures$spatial)) %>% select(estimate, statistic, p.value), round = 2)
| estimate | statistic | p.value |
|---|---|---|
| 0.1928055 | 1.55961 | 0.1238619 |
Visual and word correlation
kable(tidy(cor.test(measures$visual, measures$word)) %>% select(estimate, statistic, p.value), round = 2)
| estimate | statistic | p.value |
|---|---|---|
| 0.0018503 | 0.0146864 | 0.9883287 |
Word and spatial correlation
kable(tidy(cor.test(measures$word, measures$spatial)) %>% select(estimate, statistic, p.value), round = 2)
| estimate | statistic | p.value |
|---|---|---|
| -0.0206234 | -0.1637279 | 0.8704695 |
Word and novel correlation
kable(tidy(cor.test(measures$word, measures$novel)) %>% select(estimate, statistic, p.value), round = 2)
| estimate | statistic | p.value |
|---|---|---|
| 0.1558429 | 1.252265 | 0.2151023 |
Spatial and novel correlation
kable(tidy(cor.test(measures$spatial, measures$novel)) %>% select(estimate, statistic, p.value), round = 2)
| estimate | statistic | p.value |
|---|---|---|
| 0.2502204 | 2.051318 | 0.0443993 |
None of the correlations between individual difference are significant except novel and spatial.
Next we look at how responses vary as a funciton of these individual difference meassures. First we ask: within a given bin (word, visual, etc.), and given the same cue, how many participants gave the same associate?
#Make bins for analysis
d.raw$word.bin = as.factor(ifelse(d.raw$word > median(measures$word), 1, 0))
d.raw$visual.bin = as.factor(ifelse(d.raw$visual > median(measures$visual), 1, 0))
d.raw$spatial.bin = as.factor(ifelse(d.raw$spatial > median(measures$spatial), 1, 0))
d.raw$novel.bin = as.factor(ifelse(d.raw$novel > median(measures$novel), 1, 0))
d.raw$word.quant = quantcut(d.raw$word)
By associate
get_num_matchesW <- function(data, this.cue, this.word.bin, this.num, this.associate){
all_associates = filter(data,
cue == this.cue & word.bin == this.word.bin
& num == this.num)$associate
num_matches = sum(all_associates == this.associate)
num_matches
}
same.responses = d.raw %>%
rowwise() %>%
mutate(num_same = get_num_matchesW(d.raw, cue, word.bin, num, associate))
same.responses %>%
group_by(word.bin, num) %>%
multi_boot_standard(column = "num_same", na.rm = T) %>%
ggplot(aes(x = num, y = mean, group = word.bin,
color = word.bin)) +
geom_pointrange(aes(ymax = ci_upper, ymin = ci_lower)) +
xlab("Associate") +
ylab("Num matches") +
geom_point() +
theme_bw(base_size = 15) +
theme(axis.text.x = element_text(angle = 90, hjust = 1))
get_num_matches <- function(data, this.cue, this.bin, this.associate, measure.bin){
names(data)[which(names(data) == measure.bin)] = "measure.bin"
d.sameBin = filter(data, cue == this.cue & measure.bin == this.bin)
all_associates = d.sameBin$associate
num_matches = sum(all_associates == this.associate)
num_matches
}
same.responses.subj.W = d.raw %>%
rowwise() %>%
mutate(num_same = get_num_matches(d.raw, cue, word.bin, associate, "word.bin")) %>%
group_by(workerId, word.bin) %>%
summarise(num_same = mean(num_same)) %>%
gather("measure", "bin", 2)
same.responses.subj.V = d.raw %>%
rowwise() %>%
mutate(num_same = get_num_matches(d.raw, cue, visual.bin, associate, "visual.bin")) %>%
group_by(workerId, visual.bin) %>%
summarise(num_same = mean(num_same)) %>%
gather("measure", "bin", 2)
same.responses.subj.S = d.raw %>%
rowwise() %>%
mutate(num_same = get_num_matches(d.raw, cue, spatial.bin, associate, "spatial.bin")) %>%
group_by(workerId, spatial.bin) %>%
summarise(num_same = mean(num_same)) %>%
gather("measure", "bin", 2)
same.responses.subj.N = d.raw %>%
rowwise() %>%
mutate(num_same = get_num_matches(d.raw, cue, spatial.bin, associate, "novel.bin")) %>%
group_by(workerId, novel.bin) %>%
summarise(num_same = mean(num_same)) %>%
gather("measure", "bin", 2)
same.response.subj = rbind(same.responses.subj.W,
same.responses.subj.V) %>%
rbind(same.responses.subj.S) %>%
rbind(same.responses.subj.N)
same.response.subj.MS = same.response.subj %>%
group_by(measure, bin) %>%
multi_boot_standard(column = "num_same", na.rm = T)
ggplot(same.response.subj.MS, aes(x = measure, y = mean, color = bin, group = bin)) +
geom_pointrange(aes(ymax = ci_upper, ymin = ci_lower),
position = position_dodge(.3)) +
ylab("Num matches") +
theme_bw(base_size = 15) +
theme(axis.text.x = element_text(angle = 90, hjust = 1))
t-tests:
ns = same.response.subj %>%
group_by(measure, bin) %>%
summarize (n = n()) %>%
spread(bin, n) %>%
rename(n1 = `0`, n2 = `1`)
ts = same.response.subj %>%
group_by(measure) %>%
mutate(bin = as.factor(bin), # order bins correctly
bin = factor(bin,levels(bin)[c(2,1)])) %>%
do(tidy(t.test(num_same~bin,data=.))) %>%
select(estimate, statistic, p.value, conf.low, conf.high) %>%
left_join(ns)
kable(ts, digits = 2)
| measure | estimate | statistic | p.value | conf.low | conf.high | n1 | n2 |
|---|---|---|---|---|---|---|---|
| novel.bin | -0.50 | -1.81 | 0.08 | -1.04 | 0.05 | 33 | 32 |
| spatial.bin | -0.16 | -0.60 | 0.55 | -0.71 | 0.38 | 34 | 31 |
| visual.bin | -0.03 | -0.13 | 0.90 | -0.56 | 0.50 | 34 | 31 |
| word.bin | 0.69 | 2.69 | 0.01 | 0.18 | 1.20 | 33 | 32 |
Effect sizes:
ds = ts %>%
mutate(d = tes(statistic, n1, n2, verbose = F)$d,
l.d = tes(statistic, n1, n2, verbose = F)$l.d,
u.d = tes(statistic, n1, n2, verbose = F)$u.d,
p.d = tes(statistic, n1, n2, verbose = F)$pval.d)
ggplot(ds, aes(x = measure, y = d)) +
geom_pointrange(aes(ymax = u.d, ymin = l.d), color = "red") +
ylab("Effect size") +
theme_bw(base_size = 15) +
theme(axis.text.x = element_text(angle = 90, hjust = 1)) +
geom_hline(yintercept = 0, linetype = "longdash") +
coord_flip()
Conditional probability p(associate|cue) using Nelson norms.
FS.by.participant.W = d.raw %>%
group_by(workerId, word.bin) %>%
summarize(log.forwardStrength = mean(log.forwardStrength, na.rm = T)) %>%
gather("measure", "bin", 2)
FS.by.participant.V = d.raw %>%
group_by(workerId, visual.bin) %>%
summarize(log.forwardStrength = mean(log.forwardStrength, na.rm = T)) %>%
gather("measure", "bin", 2)
FS.by.participant.S = d.raw %>%
group_by(workerId, spatial.bin) %>%
summarize(log.forwardStrength = mean(log.forwardStrength, na.rm = T)) %>%
gather("measure", "bin", 2)
FS.by.participant.N = d.raw %>%
group_by(workerId, novel.bin) %>%
summarize(log.forwardStrength = mean(log.forwardStrength, na.rm = T))%>%
gather("measure", "bin", 2)
FS.by.participant= rbind(FS.by.participant.W,
FS.by.participant.V) %>%
rbind(FS.by.participant.S) %>%
rbind(FS.by.participant.N)
FS.by.participant.MS = FS.by.participant %>%
group_by(measure, bin) %>%
multi_boot_standard(column = "log.forwardStrength", na.rm = T)
ggplot(FS.by.participant.MS, aes(x = measure, y = mean, color = bin, group = bin)) +
geom_pointrange(aes(ymax = ci_upper, ymin = ci_lower),
position = position_dodge(.3)) +
ylab("Log forward strength") +
theme_bw(base_size = 15) +
theme(axis.text.x = element_text(angle = 90, hjust = 1))
t-tests:
ns = FS.by.participant %>%
group_by(measure, bin) %>%
summarize (n = n()) %>%
spread(bin, n) %>%
rename(n1 = `0`, n2 = `1`)
ts = FS.by.participant %>%
group_by(measure) %>%
mutate(bin = as.factor(bin), # order bins correctly
bin = factor(bin,levels(bin)[c(2,1)])) %>%
do(tidy(t.test(log.forwardStrength~bin,data=.))) %>%
select(estimate, statistic, p.value, conf.low, conf.high) %>%
left_join(ns)
kable(ts, digits = 2)
| measure | estimate | statistic | p.value | conf.low | conf.high | n1 | n2 |
|---|---|---|---|---|---|---|---|
| novel.bin | -0.01 | -0.24 | 0.81 | -0.08 | 0.06 | 33 | 32 |
| spatial.bin | -0.03 | -0.78 | 0.44 | -0.10 | 0.04 | 34 | 31 |
| visual.bin | 0.04 | 1.03 | 0.31 | -0.03 | 0.11 | 34 | 31 |
| word.bin | 0.07 | 2.01 | 0.05 | 0.00 | 0.14 | 33 | 32 |
Effect sizes:
ds = ts %>%
mutate(d = tes(statistic, n1, n2, verbose = F)$d,
l.d = tes(statistic, n1, n2, verbose = F)$l.d,
u.d = tes(statistic, n1, n2, verbose = F)$u.d,
p.d = tes(statistic, n1, n2, verbose = F)$pval.d)
ggplot(ds, aes(x = measure, y = d)) +
geom_pointrange(aes(ymax = u.d, ymin = l.d), color = "red") +
ylab("Effect size") +
theme_bw(base_size = 15) +
theme(axis.text.x = element_text(angle = 90, hjust = 1)) +
geom_hline(yintercept = 0, linetype = "longdash") +
coord_flip()
FS.by.participant.W.wide = d.raw %>%
group_by(workerId, word.bin) %>%
summarize(log.forwardStrength = mean(log.forwardStrength, na.rm = T))
FS.by.participant.W.wide %>%
group_by(word.bin) %>%
multi_boot_standard(column = "log.forwardStrength", na.rm = T) %>%
ggplot(aes(x = word.bin, y = mean, group = word.bin,
fill = word.bin)) +
xlab("Word bin") +
ylab("Log forward strength") +
geom_bar(position = "dodge", stat = "identity") +
theme_bw(base_size = 15) +
geom_linerange(aes(ymax = ci_upper, ymin = ci_lower)) +
theme(axis.text.x = element_text(angle = 90, hjust = 1), legend.position = "none")
Quantiles
FS.by.participant = d.raw %>%
group_by(workerId, word.quant) %>%
summarize(log.forwardStrength = mean(log.forwardStrength, na.rm = T))
FS.by.participant %>%
group_by(word.quant) %>%
multi_boot_standard(column = "log.forwardStrength", na.rm = T) %>%
ggplot(aes(x = word.quant, y = mean, group = word.quant,
fill = word.quant)) +
xlab("Word quantile") +
ylab("Log forward strength") +
ylim(-2.85,-2.5) +
geom_point() +
theme_bw(base_size = 15) +
geom_linerange(aes(ymax = ci_upper, ymin = ci_lower)) +
theme(axis.text.x = element_text(angle = 90, hjust = 1)) +
theme(legend.position ="none")
Continuous
FS.by.participant = d.raw %>%
group_by(workerId, word) %>%
summarize(log.forwardStrength = mean(log.forwardStrength, na.rm = T))
FS.by.participant %>%
ggplot(aes(x = word, y = log.forwardStrength)) +
xlab("Word") +
ylab("Log forward strength") +
geom_point() +
geom_smooth(method = "lm") +
theme_bw(base_size = 15) +
theme(axis.text.x = element_text(angle = 90, hjust = 1))
corr = tidy(cor.test(FS.by.participant$log.forwardStrength, FS.by.participant$word))
kable(select(corr, estimate, statistic, p.value))
| estimate | statistic | p.value |
|---|---|---|
| 0.2400805 | 1.962992 | 0.0540657 |
Forward strengths are taken from the small world of words (SWOW) dataset. This dataset has only three associates so we weighted the forward strengths by the overall forward strength in the SWOW dataset by fitting a log function to the first three associates. See SWOW_CP.Rmd for this analysis.
swow.FS = read.csv("SWOW_CP/mean_SWOW_CP.csv")
swow.FS.weights = read.csv("SWOW_CP/SWOW_associate_weights.csv")
d.raw = d.raw %>%
mutate(bigram = paste(cue, associate)) %>%
left_join(swow.FS) %>% # merge in cps
left_join(swow.FS.weights, by=c("num" = "associate")) %>% # merge in weights
mutate(swow.weighted.FS = trans.prob * weights) # get weighted cp
FS.by.participant.W = d.raw %>%
group_by(workerId, word.bin) %>%
summarize(log.forwardStrength = mean(log.swow.weighted.FS, na.rm = T)) %>%
gather("measure", "bin", 2)
FS.by.participant.V = d.raw %>%
group_by(workerId, visual.bin) %>%
summarize(log.forwardStrength = mean(log.swow.weighted.FS, na.rm = T)) %>%
gather("measure", "bin", 2)
FS.by.participant.S = d.raw %>%
group_by(workerId, spatial.bin) %>%
summarize(log.forwardStrength = mean(log.swow.weighted.FS, na.rm = T)) %>%
gather("measure", "bin", 2)
FS.by.participant.N = d.raw %>%
group_by(workerId, novel.bin) %>%
summarize(log.forwardStrength = mean(log.swow.weighted.FS, na.rm = T)) %>%
gather("measure", "bin", 2)
FS.by.participant= rbind(FS.by.participant.W,
FS.by.participant.V) %>%
rbind(FS.by.participant.S) %>%
rbind(FS.by.participant.N)
FS.by.participant.MS = FS.by.participant %>%
group_by(measure, bin) %>%
multi_boot_standard(column = "log.forwardStrength", na.rm = T)
ggplot(FS.by.participant.MS, aes(x = measure, y = mean, color = bin, group = bin)) +
geom_pointrange(aes(ymax = ci_upper, ymin = ci_lower),
position = position_dodge(.3)) +
ylab("Log forward strength") +
theme_bw(base_size = 15) +
theme(axis.text.x = element_text(angle = 90, hjust = 1))
t-tests:
ns = FS.by.participant %>%
group_by(measure, bin) %>%
summarize (n = n()) %>%
spread(bin, n) %>%
rename(n1 = `0`, n2 = `1`)
ts = FS.by.participant %>%
group_by(measure) %>%
mutate(bin = as.factor(bin), # order bins correctly
bin = factor(bin,levels(bin)[c(2,1)])) %>%
do(tidy(t.test(log.forwardStrength~bin,data=.))) %>%
select(estimate, statistic, p.value, conf.low, conf.high) %>%
left_join(ns)
kable(ts, digits = 2)
| measure | estimate | statistic | p.value | conf.low | conf.high | n1 | n2 |
|---|---|---|---|---|---|---|---|
| novel.bin | -0.05 | -1.04 | 0.30 | -0.14 | 0.04 | 33 | 32 |
| spatial.bin | 0.00 | -0.09 | 0.93 | -0.10 | 0.09 | 34 | 31 |
| visual.bin | 0.06 | 1.28 | 0.21 | -0.03 | 0.15 | 34 | 31 |
| word.bin | 0.09 | 2.08 | 0.04 | 0.00 | 0.18 | 33 | 32 |
Effect sizes:
ds = ts %>%
mutate(d = tes(statistic, n1, n2, verbose = F)$d,
l.d = tes(statistic, n1, n2, verbose = F)$l.d,
u.d = tes(statistic, n1, n2, verbose = F)$u.d,
p.d = tes(statistic, n1, n2, verbose = F)$pval.d)
ggplot(ds, aes(x = measure, y = d)) +
geom_pointrange(aes(ymax = u.d, ymin = l.d), color = "red") +
ylab("Effect size") +
theme_bw(base_size = 15) +
theme(axis.text.x = element_text(angle = 90, hjust = 1)) +
geom_hline(yintercept = 0, linetype = "longdash") +
coord_flip()
i.e. How similar is a participant to other high/low word/visual/spatial/novel participants?
#write.csv(select(d.raw.wide, workerId, cue, a1, a2, a3, a4, word.bin, visual.bin), "all_associate_strings_bins.csv", row.names=FALSE) # for semantic similarity.py
#freqs = select(d.raw, associate, Lg10WF) %>%
# mutate(Lg10WF = ifelse(is.na(Lg10WF), 0, Lg10WF)) %>%
# unique()write.csv(freqs, "freqs.csv", row.names=FALSE)
sim.word.by.participant =
read.csv("semantic_similarity/all_associate_strings_bins_subtlexus_P_word.csv") %>%
gather("measure", "word_name", 3:6) %>%
group_by(workerId, word.bin) %>%
summarize(sim.word.bin = mean(sim.word.bin, na.rm = T)) %>%
gather("measure", "bin", 2) %>%
gather("temp", "similarity", 2) %>%
select(-temp)
sim.visual.by.participant =
read.csv("semantic_similarity/all_associate_strings_bins_subtlexus_P_visual.csv") %>%
gather("measure", "word_name", 3:6) %>%
group_by(workerId, visual.bin) %>%
summarize(sim.visual.bin = mean(sim.visual.bin, na.rm = T)) %>%
gather("measure", "bin", 2) %>%
gather("temp", "similarity", 2) %>%
select(-temp)
sim.spatial.by.participant =
read.csv("semantic_similarity/all_associate_strings_bins_subtlexus_P_spatial.csv") %>%
gather("measure", "word_name", 3:6) %>%
group_by(workerId, spatial.bin) %>%
summarize(sim.spatial.bin = mean(sim.spatial.bin, na.rm = T)) %>%
gather("measure", "bin", 2) %>%
gather("temp", "similarity", 2) %>%
select(-temp)
sim.novel.by.participant =
read.csv("semantic_similarity/all_associate_strings_bins_subtlexus_P_novel.csv") %>%
gather("measure", "word_name", 3:6) %>%
group_by(workerId, novel.bin) %>%
summarize(sim.novel.bin = mean(sim.novel.bin, na.rm = T)) %>%
gather("measure", "bin", 2) %>%
gather("temp", "similarity", 2) %>%
select(-temp)
sim.by.participant= rbind(sim.word.by.participant,
sim.visual.by.participant) %>%
rbind(sim.spatial.by.participant) %>%
rbind(sim.novel.by.participant)
sim.by.participant.MS = sim.by.participant %>%
mutate(bin = as.factor(bin)) %>%
group_by(measure, bin) %>%
multi_boot_standard(column = "similarity", na.rm = T)
ggplot(sim.by.participant.MS, aes(x = measure, y = mean, color = bin, group = bin)) +
geom_pointrange(aes(ymax = ci_upper, ymin = ci_lower),
position = position_dodge(.3)) +
ylab("Similarity") +
theme_bw(base_size = 15) +
theme(axis.text.x = element_text(angle = 90, hjust = 1))
t-tests:
ns = sim.by.participant %>%
group_by(measure, bin) %>%
summarize (n = n()) %>%
spread(bin, n) %>%
rename(n1 = `0`, n2 = `1`)
ts = sim.by.participant %>%
group_by(measure) %>%
mutate(bin = as.factor(bin), # order bins correctly
bin = factor(bin,levels(bin)[c(2,1)])) %>%
do(tidy(t.test(similarity~bin,data=.))) %>%
select(estimate, statistic, p.value, conf.low, conf.high) %>%
left_join(ns)
kable(ts, digits = 2)
| measure | estimate | statistic | p.value | conf.low | conf.high | n1 | n2 |
|---|---|---|---|---|---|---|---|
| novel.bin | -0.03 | -3.75 | 0.00 | -0.04 | -0.01 | 33 | 32 |
| spatial.bin | 0.02 | 2.24 | 0.03 | 0.00 | 0.03 | 34 | 31 |
| visual.bin | 0.02 | 2.30 | 0.02 | 0.00 | 0.03 | 34 | 31 |
| word.bin | 0.02 | 3.13 | 0.00 | 0.01 | 0.03 | 33 | 32 |
Effect sizes:
ds = ts %>%
mutate(d = tes(statistic, n1, n2, verbose = F)$d,
l.d = tes(statistic, n1, n2, verbose = F)$l.d,
u.d = tes(statistic, n1, n2, verbose = F)$u.d,
p.d = tes(statistic, n1, n2, verbose = F)$pval.d)
ggplot(ds, aes(x = measure, y = d)) +
geom_pointrange(aes(ymax = u.d, ymin = l.d), color = "red") +
ylab("Effect size") +
theme_bw(base_size = 15) +
theme(axis.text.x = element_text(angle = 90, hjust = 1)) +
geom_hline(yintercept = 0, linetype = "longdash") +
coord_flip()