Are semantic associations related to individuals differences? Namely: self-report measures of visual, linguistic, spatial, and novel processing biases

The task: participants given cue and asked to produce 4 associates.


d.raw = read.csv("../data/assocData_complete.csv", header = TRUE) %>%
  select(workerId, cue, associate, num, forwardStrength, backwardStrength, visual, spatial, novel, word, Dom_PoS_SUBTLEX, Lg10WF)

mean.associates = d.raw %>% 
  group_by(workerId) %>% 
  summarise(n = n()) %>%
  summarise(mean = mean(n))

There are 65 participants. The mean number of associates per participants is 156.

Indiviudal difference measures

measures = d.raw %>%
  group_by(workerId) %>%
  slice(1) %>%
  ungroup() %>%
  select(workerId, visual, spatial, word, novel) 
  
measures %>%
  gather("measure", "value", 2:5) %>%
  ggplot(aes(x = value, fill = measure)) +
           geom_density(alpha = .4) +
  facet_grid(~ measure) +
  theme_bw() +
  theme(legend.position ="none") 

What are the pairwise correlations across measures?

cormat = round(cor(measures[,2:5]),2)
cormat[upper.tri(cormat)]<- NA
cormat.long = reshape2::melt(cormat, na.rm = TRUE)
ggplot(data = cormat.long, aes(x=Var1, y=Var2, fill=value)) + 
  geom_tile(color = "white")+
  geom_tile() +
  geom_text(aes(Var1, Var2, label = value), 
            color = "black", size = 4) +
  scale_fill_gradient2(low = "blue", high = "red", mid = "white",
                       midpoint = 0, limit = c(-1,1), space = "Lab",
                       name="Pearson\nCorrelation") +
  theme_minimal() +
  theme(axis.text.x = element_text(angle = 45, vjust = 1, 
    size = 12, hjust = 1))+
  ylab("") +
  xlab("") +
  coord_fixed()

Visual and spatial correlation

kable(tidy(cor.test(measures$visual, measures$spatial)) %>% select(estimate, statistic, p.value), round = 2)
estimate statistic p.value
0.1928055 1.55961 0.1238619

Visual and word correlation

kable(tidy(cor.test(measures$visual, measures$word)) %>% select(estimate, statistic,    p.value), round = 2)
estimate statistic p.value
0.0018503 0.0146864 0.9883287

Word and spatial correlation

kable(tidy(cor.test(measures$word, measures$spatial)) %>% select(estimate, statistic,   p.value), round = 2)
estimate statistic p.value
-0.0206234 -0.1637279 0.8704695

Word and novel correlation

kable(tidy(cor.test(measures$word, measures$novel)) %>% select(estimate, statistic, p.value), round = 2)
estimate statistic p.value
0.1558429 1.252265 0.2151023

Spatial and novel correlation

kable(tidy(cor.test(measures$spatial, measures$novel)) %>% select(estimate, statistic,  p.value), round = 2)
estimate statistic p.value
0.2502204 2.051318 0.0443993

None of the correlations between individual difference are significant except novel and spatial.

Num within-group matches

Next we look at how responses vary as a funciton of these individual difference meassures. First we ask: within a given bin (word, visual, etc.), and given the same cue, how many participants gave the same associate?

#Make bins for analysis
d.raw$word.bin = as.factor(ifelse(d.raw$word > median(measures$word), 1, 0))
d.raw$visual.bin = as.factor(ifelse(d.raw$visual > median(measures$visual), 1, 0))
d.raw$spatial.bin = as.factor(ifelse(d.raw$spatial > median(measures$spatial), 1, 0))
d.raw$novel.bin = as.factor(ifelse(d.raw$novel > median(measures$novel), 1, 0))
d.raw$word.quant = quantcut(d.raw$word)
By associate
get_num_matchesW <- function(data, this.cue, this.word.bin, this.num, this.associate){
  all_associates = filter(data, 
                          cue == this.cue & word.bin == this.word.bin
                          & num == this.num)$associate
  num_matches = sum(all_associates == this.associate) 
  num_matches
}

same.responses = d.raw %>%
  rowwise() %>% 
  mutate(num_same = get_num_matchesW(d.raw, cue, word.bin, num, associate)) 

same.responses %>% 
  group_by(word.bin, num) %>%
  multi_boot_standard(column = "num_same", na.rm = T) %>%
  ggplot(aes(x = num, y = mean, group = word.bin, 
             color = word.bin)) +
  geom_pointrange(aes(ymax = ci_upper, ymin = ci_lower)) +
  xlab("Associate") +
  ylab("Num matches") +
  geom_point() + 
  theme_bw(base_size = 15) +
  theme(axis.text.x = element_text(angle = 90, hjust = 1))
get_num_matches <- function(data, this.cue, this.bin, this.associate, measure.bin){
  names(data)[which(names(data) == measure.bin)] = "measure.bin"
  
  d.sameBin = filter(data, cue == this.cue & measure.bin == this.bin)
  all_associates = d.sameBin$associate
  num_matches = sum(all_associates == this.associate) 
  num_matches
}

same.responses.subj.W = d.raw %>%
  rowwise() %>% 
  mutate(num_same = get_num_matches(d.raw, cue, word.bin, associate, "word.bin")) %>%
  group_by(workerId, word.bin) %>%
  summarise(num_same = mean(num_same)) %>%
  gather("measure", "bin", 2)

same.responses.subj.V = d.raw %>%
  rowwise() %>% 
  mutate(num_same = get_num_matches(d.raw, cue, visual.bin, associate, "visual.bin")) %>%
  group_by(workerId, visual.bin) %>%
  summarise(num_same = mean(num_same))  %>%
  gather("measure", "bin", 2)

same.responses.subj.S = d.raw %>%
  rowwise() %>% 
  mutate(num_same = get_num_matches(d.raw, cue, spatial.bin, associate, "spatial.bin")) %>%
  group_by(workerId, spatial.bin) %>%
  summarise(num_same = mean(num_same))  %>%
  gather("measure", "bin", 2)

same.responses.subj.N = d.raw %>%
  rowwise() %>% 
  mutate(num_same = get_num_matches(d.raw, cue, spatial.bin, associate, "novel.bin")) %>%
  group_by(workerId, novel.bin) %>%
  summarise(num_same = mean(num_same))  %>%
  gather("measure", "bin", 2)

same.response.subj = rbind(same.responses.subj.W, 
                               same.responses.subj.V) %>%
                     rbind(same.responses.subj.S) %>%
                     rbind(same.responses.subj.N)

same.response.subj.MS = same.response.subj %>%
  group_by(measure, bin) %>%
  multi_boot_standard(column = "num_same", na.rm = T)

ggplot(same.response.subj.MS, aes(x = measure, y = mean, color = bin, group = bin)) +
  geom_pointrange(aes(ymax = ci_upper, ymin = ci_lower),
                  position = position_dodge(.3)) +
  ylab("Num matches") +
  theme_bw(base_size = 15) +
  theme(axis.text.x = element_text(angle = 90, hjust = 1))

t-tests:

ns = same.response.subj %>%
  group_by(measure, bin) %>%
  summarize (n = n()) %>%
  spread(bin, n) %>%
  rename(n1 = `0`, n2 = `1`)

ts = same.response.subj %>%
  group_by(measure) %>%
  mutate(bin = as.factor(bin), # order bins correctly
         bin = factor(bin,levels(bin)[c(2,1)])) %>%
  do(tidy(t.test(num_same~bin,data=.))) %>%
  select(estimate, statistic, p.value, conf.low, conf.high) %>%
  left_join(ns)

kable(ts, digits = 2)
measure estimate statistic p.value conf.low conf.high n1 n2
novel.bin -0.50 -1.81 0.08 -1.04 0.05 33 32
spatial.bin -0.16 -0.60 0.55 -0.71 0.38 34 31
visual.bin -0.03 -0.13 0.90 -0.56 0.50 34 31
word.bin 0.69 2.69 0.01 0.18 1.20 33 32

Effect sizes:

ds = ts %>%
  mutate(d = tes(statistic, n1, n2, verbose = F)$d,
         l.d = tes(statistic, n1, n2, verbose = F)$l.d,
         u.d = tes(statistic, n1, n2, verbose = F)$u.d,
         p.d = tes(statistic, n1, n2, verbose = F)$pval.d)

ggplot(ds, aes(x = measure, y = d)) +
  geom_pointrange(aes(ymax = u.d, ymin = l.d), color = "red") +
  ylab("Effect size") +
  theme_bw(base_size = 15) +
  theme(axis.text.x = element_text(angle = 90, hjust = 1)) +
  geom_hline(yintercept = 0, linetype = "longdash") +
  coord_flip()

Forward strength (Nelson)

Conditional probability p(associate|cue) using Nelson norms.

All

FS.by.participant.W = d.raw %>%
  group_by(workerId, word.bin) %>%
  summarize(log.forwardStrength = mean(log.forwardStrength, na.rm = T)) %>%
  gather("measure", "bin", 2)

FS.by.participant.V = d.raw %>%
  group_by(workerId, visual.bin) %>%
  summarize(log.forwardStrength = mean(log.forwardStrength, na.rm = T)) %>%
  gather("measure", "bin", 2)

FS.by.participant.S = d.raw %>%
  group_by(workerId, spatial.bin) %>%
  summarize(log.forwardStrength = mean(log.forwardStrength, na.rm = T)) %>%
  gather("measure", "bin", 2)

FS.by.participant.N = d.raw %>%
  group_by(workerId, novel.bin) %>%
  summarize(log.forwardStrength = mean(log.forwardStrength, na.rm = T))%>%
  gather("measure", "bin", 2)

FS.by.participant= rbind(FS.by.participant.W, 
                               FS.by.participant.V) %>%
                     rbind(FS.by.participant.S) %>%
                     rbind(FS.by.participant.N)

FS.by.participant.MS = FS.by.participant %>%
  group_by(measure, bin) %>%
  multi_boot_standard(column = "log.forwardStrength", na.rm = T)

ggplot(FS.by.participant.MS, aes(x = measure, y = mean, color = bin, group = bin)) +
  geom_pointrange(aes(ymax = ci_upper, ymin = ci_lower),
                  position = position_dodge(.3)) +
  ylab("Log forward strength") +
  theme_bw(base_size = 15) +
  theme(axis.text.x = element_text(angle = 90, hjust = 1))

t-tests:

ns = FS.by.participant %>%
  group_by(measure, bin) %>%
  summarize (n = n()) %>%
  spread(bin, n) %>%
  rename(n1 = `0`, n2 = `1`)

ts = FS.by.participant %>%
  group_by(measure) %>%
  mutate(bin = as.factor(bin), # order bins correctly
         bin = factor(bin,levels(bin)[c(2,1)])) %>%
  do(tidy(t.test(log.forwardStrength~bin,data=.))) %>%
  select(estimate, statistic, p.value, conf.low, conf.high) %>%
  left_join(ns)

kable(ts, digits = 2)
measure estimate statistic p.value conf.low conf.high n1 n2
novel.bin -0.01 -0.24 0.81 -0.08 0.06 33 32
spatial.bin -0.03 -0.78 0.44 -0.10 0.04 34 31
visual.bin 0.04 1.03 0.31 -0.03 0.11 34 31
word.bin 0.07 2.01 0.05 0.00 0.14 33 32

Effect sizes:

ds = ts %>%
  mutate(d = tes(statistic, n1, n2, verbose = F)$d,
         l.d = tes(statistic, n1, n2, verbose = F)$l.d,
         u.d = tes(statistic, n1, n2, verbose = F)$u.d,
         p.d = tes(statistic, n1, n2, verbose = F)$pval.d)

ggplot(ds, aes(x = measure, y = d)) +
  geom_pointrange(aes(ymax = u.d, ymin = l.d), color = "red") +
  ylab("Effect size") +
  theme_bw(base_size = 15) +
  theme(axis.text.x = element_text(angle = 90, hjust = 1)) +
  geom_hline(yintercept = 0, linetype = "longdash") +
  coord_flip()

Word only

FS.by.participant.W.wide = d.raw %>%
  group_by(workerId, word.bin) %>%
  summarize(log.forwardStrength = mean(log.forwardStrength, na.rm = T))

FS.by.participant.W.wide %>%
  group_by(word.bin) %>%
  multi_boot_standard(column = "log.forwardStrength", na.rm = T) %>%
  ggplot(aes(x = word.bin, y = mean, group = word.bin, 
             fill = word.bin)) +
  xlab("Word bin") +
  ylab("Log forward strength") +
  geom_bar(position = "dodge", stat = "identity") + 
  theme_bw(base_size = 15) +
  geom_linerange(aes(ymax = ci_upper, ymin = ci_lower)) +
  theme(axis.text.x = element_text(angle = 90, hjust = 1), legend.position = "none")

Quantiles

FS.by.participant = d.raw %>%
  group_by(workerId, word.quant) %>%
  summarize(log.forwardStrength = mean(log.forwardStrength, na.rm = T))

FS.by.participant %>%
  group_by(word.quant) %>%
  multi_boot_standard(column = "log.forwardStrength", na.rm = T) %>%
  ggplot(aes(x = word.quant, y = mean, group = word.quant, 
             fill = word.quant)) +
  xlab("Word quantile") +
  ylab("Log forward strength") +
  ylim(-2.85,-2.5) +
  geom_point() +
  theme_bw(base_size = 15) +
  geom_linerange(aes(ymax = ci_upper, ymin = ci_lower)) +
  theme(axis.text.x = element_text(angle = 90, hjust = 1)) +
  theme(legend.position ="none")

Continuous

FS.by.participant = d.raw %>%
  group_by(workerId, word) %>%
  summarize(log.forwardStrength = mean(log.forwardStrength, na.rm = T))

FS.by.participant %>%
  ggplot(aes(x = word, y = log.forwardStrength)) +
  xlab("Word") +
  ylab("Log forward strength") +
  geom_point() +
  geom_smooth(method = "lm") +
  theme_bw(base_size = 15) +
  theme(axis.text.x = element_text(angle = 90, hjust = 1))

corr = tidy(cor.test(FS.by.participant$log.forwardStrength, FS.by.participant$word))
kable(select(corr, estimate, statistic, p.value))
estimate statistic p.value
0.2400805 1.962992 0.0540657

Forward strength (SWOW)

Forward strengths are taken from the small world of words (SWOW) dataset. This dataset has only three associates so we weighted the forward strengths by the overall forward strength in the SWOW dataset by fitting a log function to the first three associates. See SWOW_CP.Rmd for this analysis.

swow.FS = read.csv("SWOW_CP/mean_SWOW_CP.csv")
swow.FS.weights = read.csv("SWOW_CP/SWOW_associate_weights.csv")

d.raw = d.raw %>%
        mutate(bigram = paste(cue, associate)) %>%
        left_join(swow.FS) %>% # merge in cps
        left_join(swow.FS.weights, by=c("num" = "associate")) %>% # merge in weights
        mutate(swow.weighted.FS = trans.prob * weights) # get weighted cp
FS.by.participant.W = d.raw %>%
  group_by(workerId, word.bin) %>%
  summarize(log.forwardStrength = mean(log.swow.weighted.FS, na.rm = T)) %>%
  gather("measure", "bin", 2)

FS.by.participant.V = d.raw %>%
  group_by(workerId, visual.bin) %>%
  summarize(log.forwardStrength = mean(log.swow.weighted.FS, na.rm = T)) %>%
  gather("measure", "bin", 2)

FS.by.participant.S = d.raw %>%
  group_by(workerId, spatial.bin) %>%
  summarize(log.forwardStrength = mean(log.swow.weighted.FS, na.rm = T)) %>%
  gather("measure", "bin", 2)

FS.by.participant.N = d.raw %>%
  group_by(workerId, novel.bin) %>%
  summarize(log.forwardStrength = mean(log.swow.weighted.FS, na.rm = T)) %>%
  gather("measure", "bin", 2)

FS.by.participant= rbind(FS.by.participant.W, 
                               FS.by.participant.V) %>%
                   rbind(FS.by.participant.S) %>%
                   rbind(FS.by.participant.N)

FS.by.participant.MS = FS.by.participant %>%
  group_by(measure, bin) %>%
  multi_boot_standard(column = "log.forwardStrength", na.rm = T)

ggplot(FS.by.participant.MS, aes(x = measure, y = mean, color = bin, group = bin)) +
  geom_pointrange(aes(ymax = ci_upper, ymin = ci_lower),
                  position = position_dodge(.3)) +
  ylab("Log forward strength") +
  theme_bw(base_size = 15) +
  theme(axis.text.x = element_text(angle = 90, hjust = 1))

t-tests:

ns = FS.by.participant %>%
  group_by(measure, bin) %>%
  summarize (n = n()) %>%
  spread(bin, n) %>%
  rename(n1 = `0`, n2 = `1`)

ts = FS.by.participant %>%
  group_by(measure) %>%
  mutate(bin = as.factor(bin), # order bins correctly
         bin = factor(bin,levels(bin)[c(2,1)])) %>%
  do(tidy(t.test(log.forwardStrength~bin,data=.))) %>%
  select(estimate, statistic, p.value, conf.low, conf.high) %>%
  left_join(ns)

kable(ts, digits = 2)
measure estimate statistic p.value conf.low conf.high n1 n2
novel.bin -0.05 -1.04 0.30 -0.14 0.04 33 32
spatial.bin 0.00 -0.09 0.93 -0.10 0.09 34 31
visual.bin 0.06 1.28 0.21 -0.03 0.15 34 31
word.bin 0.09 2.08 0.04 0.00 0.18 33 32

Effect sizes:

ds = ts %>%
  mutate(d = tes(statistic, n1, n2, verbose = F)$d,
         l.d = tes(statistic, n1, n2, verbose = F)$l.d,
         u.d = tes(statistic, n1, n2, verbose = F)$u.d,
         p.d = tes(statistic, n1, n2, verbose = F)$pval.d)

ggplot(ds, aes(x = measure, y = d)) +
  geom_pointrange(aes(ymax = u.d, ymin = l.d), color = "red") +
  ylab("Effect size") +
  theme_bw(base_size = 15) +
  theme(axis.text.x = element_text(angle = 90, hjust = 1)) +
  geom_hline(yintercept = 0, linetype = "longdash") +
  coord_flip()

Semantic similarity (by-sentence method)

i.e. How similar is a participant to other high/low word/visual/spatial/novel participants?

#write.csv(select(d.raw.wide, workerId, cue, a1, a2, a3, a4, word.bin, visual.bin), "all_associate_strings_bins.csv",  row.names=FALSE) # for semantic similarity.py

#freqs = select(d.raw, associate, Lg10WF) %>% 
#  mutate(Lg10WF = ifelse(is.na(Lg10WF), 0, Lg10WF))  %>%
#        unique()write.csv(freqs, "freqs.csv",  row.names=FALSE)
sim.word.by.participant =
  read.csv("semantic_similarity/all_associate_strings_bins_subtlexus_P_word.csv") %>%
  gather("measure", "word_name", 3:6) %>%
  group_by(workerId, word.bin) %>%
  summarize(sim.word.bin = mean(sim.word.bin, na.rm = T)) %>%
  gather("measure", "bin", 2) %>%
  gather("temp", "similarity", 2) %>%
  select(-temp)

sim.visual.by.participant =
  read.csv("semantic_similarity/all_associate_strings_bins_subtlexus_P_visual.csv") %>%
  gather("measure", "word_name", 3:6) %>%
  group_by(workerId, visual.bin) %>%
  summarize(sim.visual.bin = mean(sim.visual.bin, na.rm = T)) %>%
  gather("measure", "bin", 2) %>%
  gather("temp", "similarity", 2) %>%
  select(-temp)

sim.spatial.by.participant =
  read.csv("semantic_similarity/all_associate_strings_bins_subtlexus_P_spatial.csv") %>%
  gather("measure", "word_name", 3:6) %>%
  group_by(workerId, spatial.bin) %>%
  summarize(sim.spatial.bin = mean(sim.spatial.bin, na.rm = T)) %>%
  gather("measure", "bin", 2) %>%
  gather("temp", "similarity", 2) %>%
  select(-temp)

sim.novel.by.participant =
  read.csv("semantic_similarity/all_associate_strings_bins_subtlexus_P_novel.csv") %>%
  gather("measure", "word_name", 3:6) %>%
  group_by(workerId, novel.bin) %>%
  summarize(sim.novel.bin = mean(sim.novel.bin, na.rm = T)) %>%
  gather("measure", "bin", 2) %>%
  gather("temp", "similarity", 2) %>%
  select(-temp)

sim.by.participant= rbind(sim.word.by.participant, 
                               sim.visual.by.participant) %>%
                   rbind(sim.spatial.by.participant) %>%
                   rbind(sim.novel.by.participant)

sim.by.participant.MS = sim.by.participant %>%
  mutate(bin = as.factor(bin)) %>%
  group_by(measure, bin) %>%
  multi_boot_standard(column = "similarity", na.rm = T) 

ggplot(sim.by.participant.MS, aes(x = measure, y = mean, color = bin, group = bin)) +
  geom_pointrange(aes(ymax = ci_upper, ymin = ci_lower),
                  position = position_dodge(.3)) +
  ylab("Similarity") +
  theme_bw(base_size = 15) +
  theme(axis.text.x = element_text(angle = 90, hjust = 1))

t-tests:

ns = sim.by.participant %>%
  group_by(measure, bin) %>%
  summarize (n = n()) %>%
  spread(bin, n) %>%
  rename(n1 = `0`, n2 = `1`)

ts = sim.by.participant %>%
  group_by(measure) %>%
  mutate(bin = as.factor(bin), # order bins correctly
         bin = factor(bin,levels(bin)[c(2,1)])) %>%
  do(tidy(t.test(similarity~bin,data=.))) %>%
  select(estimate, statistic, p.value, conf.low, conf.high) %>%
  left_join(ns)

kable(ts, digits = 2)
measure estimate statistic p.value conf.low conf.high n1 n2
novel.bin -0.03 -3.75 0.00 -0.04 -0.01 33 32
spatial.bin 0.02 2.24 0.03 0.00 0.03 34 31
visual.bin 0.02 2.30 0.02 0.00 0.03 34 31
word.bin 0.02 3.13 0.00 0.01 0.03 33 32

Effect sizes:

ds = ts %>%
  mutate(d = tes(statistic, n1, n2, verbose = F)$d,
         l.d = tes(statistic, n1, n2, verbose = F)$l.d,
         u.d = tes(statistic, n1, n2, verbose = F)$u.d,
         p.d = tes(statistic, n1, n2, verbose = F)$pval.d)

ggplot(ds, aes(x = measure, y = d)) +
  geom_pointrange(aes(ymax = u.d, ymin = l.d), color = "red") +
  ylab("Effect size") +
  theme_bw(base_size = 15) +
  theme(axis.text.x = element_text(angle = 90, hjust = 1)) +
  geom_hline(yintercept = 0, linetype = "longdash") +
  coord_flip()