Are semantic associations related to individuals differences? Namely: self-report measures of visual, linguistic, spatial, and novel processing biases

The task: participants given cue and asked to produce 4 associates.

d.raw = read.csv("../data/assocData_complete.csv", header = TRUE) %>%
  select(workerId, cue, associate, num, forwardStrength, backwardStrength, visual, spatial, novel, word, Dom_PoS_SUBTLEX, Lg10WF)

mean.associates = d.raw %>% 
  group_by(workerId) %>% 
  summarise(n = n()) %>%
  summarise(mean = mean(n))

There are 65 participants. The mean number of associates per participants is 156.

Indiviudal difference measures

measures = d.raw %>%
  group_by(workerId) %>%
  slice(1) %>%
  ungroup() %>%
  select(workerId, visual, spatial, word, novel) 
  
measures %>%
  gather("measure", "value", 2:5) %>%
  ggplot(aes(x = value, fill = measure)) +
           geom_density(alpha = .4) +
  facet_grid(~ measure) +
  theme_bw() +
  theme(legend.position ="none")

What are the pairwise correlations across measures?

cormat = round(cor(measures[,2:5]),2)
cormat[upper.tri(cormat)]<- NA
cormat.long = reshape2::melt(cormat, na.rm = TRUE)
ggplot(data = cormat.long, aes(x=Var1, y=Var2, fill=value)) + 
  geom_tile(color = "white")+
  geom_tile() +
  geom_text(aes(Var1, Var2, label = value), 
            color = "black", size = 4) +
  scale_fill_gradient2(low = "blue", high = "red", mid = "white",
                       midpoint = 0, limit = c(-1,1), space = "Lab",
                       name="Pearson\nCorrelation") +
  theme_minimal() +
  theme(axis.text.x = element_text(angle = 45, vjust = 1, 
    size = 12, hjust = 1))+
  ylab("") +
  xlab("") +
  coord_fixed()

Visual and spatial correlation

kable(tidy(cor.test(measures$visual, measures$spatial)) %>% select(estimate, statistic, p.value), round = 2)

estimate	statistic	p.value
0.1928055	1.55961	0.1238619

Visual and word correlation

kable(tidy(cor.test(measures$visual, measures$word)) %>% select(estimate, statistic,    p.value), round = 2)

estimate	statistic	p.value
0.0018503	0.0146864	0.9883287

Word and spatial correlation

kable(tidy(cor.test(measures$word, measures$spatial)) %>% select(estimate, statistic,   p.value), round = 2)

estimate	statistic	p.value
-0.0206234	-0.1637279	0.8704695

Word and novel correlation

kable(tidy(cor.test(measures$word, measures$novel)) %>% select(estimate, statistic, p.value), round = 2)

estimate	statistic	p.value
0.1558429	1.252265	0.2151023

Spatial and novel correlation

kable(tidy(cor.test(measures$spatial, measures$novel)) %>% select(estimate, statistic,  p.value), round = 2)

estimate	statistic	p.value
0.2502204	2.051318	0.0443993

None of the correlations between individual difference are significant except novel and spatial.

Num within-group matches

Next we look at how responses vary as a funciton of these individual difference meassures. First we ask: within a given bin (word, visual, etc.), and given the same cue, how many participants gave the same associate?

#Make bins for analysis
d.raw$word.bin = as.factor(ifelse(d.raw$word > median(measures$word), 1, 0))
d.raw$visual.bin = as.factor(ifelse(d.raw$visual > median(measures$visual), 1, 0))
d.raw$spatial.bin = as.factor(ifelse(d.raw$spatial > median(measures$spatial), 1, 0))
d.raw$novel.bin = as.factor(ifelse(d.raw$novel > median(measures$novel), 1, 0))
d.raw$word.quant = quantcut(d.raw$word)

By associate
get_num_matchesW <- function(data, this.cue, this.word.bin, this.num, this.associate){
  all_associates = filter(data, 
                          cue == this.cue & word.bin == this.word.bin
                          & num == this.num)$associate
  num_matches = sum(all_associates == this.associate) 
  num_matches
}

same.responses = d.raw %>%
  rowwise() %>% 
  mutate(num_same = get_num_matchesW(d.raw, cue, word.bin, num, associate)) 

same.responses %>% 
  group_by(word.bin, num) %>%
  multi_boot_standard(column = "num_same", na.rm = T) %>%
  ggplot(aes(x = num, y = mean, group = word.bin, 
             color = word.bin)) +
  geom_pointrange(aes(ymax = ci_upper, ymin = ci_lower)) +
  xlab("Associate") +
  ylab("Num matches") +
  geom_point() + 
  theme_bw(base_size = 15) +
  theme(axis.text.x = element_text(angle = 90, hjust = 1))

get_num_matches <- function(data, this.cue, this.bin, this.associate, measure.bin){
  names(data)[which(names(data) == measure.bin)] = "measure.bin"
  
  d.sameBin = filter(data, cue == this.cue & measure.bin == this.bin)
  all_associates = d.sameBin$associate
  num_matches = sum(all_associates == this.associate) 
  num_matches
}

same.responses.subj.W = d.raw %>%
  rowwise() %>% 
  mutate(num_same = get_num_matches(d.raw, cue, word.bin, associate, "word.bin")) %>%
  group_by(workerId, word.bin) %>%
  summarise(num_same = mean(num_same)) %>%
  gather("measure", "bin", 2)

same.responses.subj.V = d.raw %>%
  rowwise() %>% 
  mutate(num_same = get_num_matches(d.raw, cue, visual.bin, associate, "visual.bin")) %>%
  group_by(workerId, visual.bin) %>%
  summarise(num_same = mean(num_same))  %>%
  gather("measure", "bin", 2)

same.responses.subj.S = d.raw %>%
  rowwise() %>% 
  mutate(num_same = get_num_matches(d.raw, cue, spatial.bin, associate, "spatial.bin")) %>%
  group_by(workerId, spatial.bin) %>%
  summarise(num_same = mean(num_same))  %>%
  gather("measure", "bin", 2)

same.responses.subj.N = d.raw %>%
  rowwise() %>% 
  mutate(num_same = get_num_matches(d.raw, cue, spatial.bin, associate, "novel.bin")) %>%
  group_by(workerId, novel.bin) %>%
  summarise(num_same = mean(num_same))  %>%
  gather("measure", "bin", 2)

same.response.subj = rbind(same.responses.subj.W, 
                               same.responses.subj.V) %>%
                     rbind(same.responses.subj.S) %>%
                     rbind(same.responses.subj.N)

same.response.subj.MS = same.response.subj %>%
  group_by(measure, bin) %>%
  multi_boot_standard(column = "num_same", na.rm = T)

ggplot(same.response.subj.MS, aes(x = measure, y = mean, color = bin, group = bin)) +
  geom_pointrange(aes(ymax = ci_upper, ymin = ci_lower),
                  position = position_dodge(.3)) +
  ylab("Num matches") +
  theme_bw(base_size = 15) +
  theme(axis.text.x = element_text(angle = 90, hjust = 1))

t-tests:

ns = same.response.subj %>%
  group_by(measure, bin) %>%
  summarize (n = n()) %>%
  spread(bin, n) %>%
  rename(n1 = `0`, n2 = `1`)

ts = same.response.subj %>%
  group_by(measure) %>%
  mutate(bin = as.factor(bin), # order bins correctly
         bin = factor(bin,levels(bin)[c(2,1)])) %>%
  do(tidy(t.test(num_same~bin,data=.))) %>%
  select(estimate, statistic, p.value, conf.low, conf.high) %>%
  left_join(ns)

kable(ts, digits = 2)

measure	estimate	statistic	p.value	conf.low	conf.high	n1	n2
novel.bin	-0.50	-1.81	0.08	-1.04	0.05	33	32
spatial.bin	-0.16	-0.60	0.55	-0.71	0.38	34	31
visual.bin	-0.03	-0.13	0.90	-0.56	0.50	34	31
word.bin	0.69	2.69	0.01	0.18	1.20	33	32

Effect sizes:

ds = ts %>%
  mutate(d = tes(statistic, n1, n2, verbose = F)$d,
         l.d = tes(statistic, n1, n2, verbose = F)$l.d,
         u.d = tes(statistic, n1, n2, verbose = F)$u.d,
         p.d = tes(statistic, n1, n2, verbose = F)$pval.d)

ggplot(ds, aes(x = measure, y = d)) +
  geom_pointrange(aes(ymax = u.d, ymin = l.d), color = "red") +
  ylab("Effect size") +
  theme_bw(base_size = 15) +
  theme(axis.text.x = element_text(angle = 90, hjust = 1)) +
  geom_hline(yintercept = 0, linetype = "longdash") +
  coord_flip()

Forward strength (Nelson)

Conditional probability p(associate|cue) using Nelson norms.

All

FS.by.participant.W = d.raw %>%
  group_by(workerId, word.bin) %>%
  summarize(log.forwardStrength = mean(log.forwardStrength, na.rm = T)) %>%
  gather("measure", "bin", 2)

FS.by.participant.V = d.raw %>%
  group_by(workerId, visual.bin) %>%
  summarize(log.forwardStrength = mean(log.forwardStrength, na.rm = T)) %>%
  gather("measure", "bin", 2)

FS.by.participant.S = d.raw %>%
  group_by(workerId, spatial.bin) %>%
  summarize(log.forwardStrength = mean(log.forwardStrength, na.rm = T)) %>%
  gather("measure", "bin", 2)

FS.by.participant.N = d.raw %>%
  group_by(workerId, novel.bin) %>%
  summarize(log.forwardStrength = mean(log.forwardStrength, na.rm = T))%>%
  gather("measure", "bin", 2)

FS.by.participant= rbind(FS.by.participant.W, 
                               FS.by.participant.V) %>%
                     rbind(FS.by.participant.S) %>%
                     rbind(FS.by.participant.N)

FS.by.participant.MS = FS.by.participant %>%
  group_by(measure, bin) %>%
  multi_boot_standard(column = "log.forwardStrength", na.rm = T)

ggplot(FS.by.participant.MS, aes(x = measure, y = mean, color = bin, group = bin)) +
  geom_pointrange(aes(ymax = ci_upper, ymin = ci_lower),
                  position = position_dodge(.3)) +
  ylab("Log forward strength") +
  theme_bw(base_size = 15) +
  theme(axis.text.x = element_text(angle = 90, hjust = 1))

t-tests:

ns = FS.by.participant %>%
  group_by(measure, bin) %>%
  summarize (n = n()) %>%
  spread(bin, n) %>%
  rename(n1 = `0`, n2 = `1`)

ts = FS.by.participant %>%
  group_by(measure) %>%
  mutate(bin = as.factor(bin), # order bins correctly
         bin = factor(bin,levels(bin)[c(2,1)])) %>%
  do(tidy(t.test(log.forwardStrength~bin,data=.))) %>%
  select(estimate, statistic, p.value, conf.low, conf.high) %>%
  left_join(ns)

kable(ts, digits = 2)

measure	estimate	statistic	p.value	conf.low	conf.high	n1	n2
novel.bin	-0.01	-0.24	0.81	-0.08	0.06	33	32
spatial.bin	-0.03	-0.78	0.44	-0.10	0.04	34	31
visual.bin	0.04	1.03	0.31	-0.03	0.11	34	31
word.bin	0.07	2.01	0.05	0.00	0.14	33	32

Effect sizes:

ds = ts %>%
  mutate(d = tes(statistic, n1, n2, verbose = F)$d,
         l.d = tes(statistic, n1, n2, verbose = F)$l.d,
         u.d = tes(statistic, n1, n2, verbose = F)$u.d,
         p.d = tes(statistic, n1, n2, verbose = F)$pval.d)

ggplot(ds, aes(x = measure, y = d)) +
  geom_pointrange(aes(ymax = u.d, ymin = l.d), color = "red") +
  ylab("Effect size") +
  theme_bw(base_size = 15) +
  theme(axis.text.x = element_text(angle = 90, hjust = 1)) +
  geom_hline(yintercept = 0, linetype = "longdash") +
  coord_flip()

Word only

FS.by.participant.W.wide = d.raw %>%
  group_by(workerId, word.bin) %>%
  summarize(log.forwardStrength = mean(log.forwardStrength, na.rm = T))

FS.by.participant.W.wide %>%
  group_by(word.bin) %>%
  multi_boot_standard(column = "log.forwardStrength", na.rm = T) %>%
  ggplot(aes(x = word.bin, y = mean, group = word.bin, 
             fill = word.bin)) +
  xlab("Word bin") +
  ylab("Log forward strength") +
  geom_bar(position = "dodge", stat = "identity") + 
  theme_bw(base_size = 15) +
  geom_linerange(aes(ymax = ci_upper, ymin = ci_lower)) +
  theme(axis.text.x = element_text(angle = 90, hjust = 1), legend.position = "none")

Quantiles

FS.by.participant = d.raw %>%
  group_by(workerId, word.quant) %>%
  summarize(log.forwardStrength = mean(log.forwardStrength, na.rm = T))

FS.by.participant %>%
  group_by(word.quant) %>%
  multi_boot_standard(column = "log.forwardStrength", na.rm = T) %>%
  ggplot(aes(x = word.quant, y = mean, group = word.quant, 
             fill = word.quant)) +
  xlab("Word quantile") +
  ylab("Log forward strength") +
  ylim(-2.85,-2.5) +
  geom_point() +
  theme_bw(base_size = 15) +
  geom_linerange(aes(ymax = ci_upper, ymin = ci_lower)) +
  theme(axis.text.x = element_text(angle = 90, hjust = 1)) +
  theme(legend.position ="none")

Continuous

FS.by.participant = d.raw %>%
  group_by(workerId, word) %>%
  summarize(log.forwardStrength = mean(log.forwardStrength, na.rm = T))

FS.by.participant %>%
  ggplot(aes(x = word, y = log.forwardStrength)) +
  xlab("Word") +
  ylab("Log forward strength") +
  geom_point() +
  geom_smooth(method = "lm") +
  theme_bw(base_size = 15) +
  theme(axis.text.x = element_text(angle = 90, hjust = 1))

corr = tidy(cor.test(FS.by.participant$log.forwardStrength, FS.by.participant$word))
kable(select(corr, estimate, statistic, p.value))

estimate	statistic	p.value
0.2400805	1.962992	0.0540657

Forward strength (SWOW)

Forward strengths are taken from the small world of words (SWOW) dataset. This dataset has only three associates so we weighted the forward strengths by the overall forward strength in the SWOW dataset by fitting a log function to the first three associates. See SWOW_CP.Rmd for this analysis.

swow.FS = read.csv("SWOW_CP/mean_SWOW_CP.csv")
swow.FS.weights = read.csv("SWOW_CP/SWOW_associate_weights.csv")

d.raw = d.raw %>%
        mutate(bigram = paste(cue, associate)) %>%
        left_join(swow.FS) %>% # merge in cps
        left_join(swow.FS.weights, by=c("num" = "associate")) %>% # merge in weights
        mutate(swow.weighted.FS = trans.prob * weights) # get weighted cp

FS.by.participant.W = d.raw %>%
  group_by(workerId, word.bin) %>%
  summarize(log.forwardStrength = mean(log.swow.weighted.FS, na.rm = T)) %>%
  gather("measure", "bin", 2)

FS.by.participant.V = d.raw %>%
  group_by(workerId, visual.bin) %>%
  summarize(log.forwardStrength = mean(log.swow.weighted.FS, na.rm = T)) %>%
  gather("measure", "bin", 2)

FS.by.participant.S = d.raw %>%
  group_by(workerId, spatial.bin) %>%
  summarize(log.forwardStrength = mean(log.swow.weighted.FS, na.rm = T)) %>%
  gather("measure", "bin", 2)

FS.by.participant.N = d.raw %>%
  group_by(workerId, novel.bin) %>%
  summarize(log.forwardStrength = mean(log.swow.weighted.FS, na.rm = T)) %>%
  gather("measure", "bin", 2)

FS.by.participant= rbind(FS.by.participant.W, 
                               FS.by.participant.V) %>%
                   rbind(FS.by.participant.S) %>%
                   rbind(FS.by.participant.N)

FS.by.participant.MS = FS.by.participant %>%
  group_by(measure, bin) %>%
  multi_boot_standard(column = "log.forwardStrength", na.rm = T)

ggplot(FS.by.participant.MS, aes(x = measure, y = mean, color = bin, group = bin)) +
  geom_pointrange(aes(ymax = ci_upper, ymin = ci_lower),
                  position = position_dodge(.3)) +
  ylab("Log forward strength") +
  theme_bw(base_size = 15) +
  theme(axis.text.x = element_text(angle = 90, hjust = 1))

t-tests:

ns = FS.by.participant %>%
  group_by(measure, bin) %>%
  summarize (n = n()) %>%
  spread(bin, n) %>%
  rename(n1 = `0`, n2 = `1`)

ts = FS.by.participant %>%
  group_by(measure) %>%
  mutate(bin = as.factor(bin), # order bins correctly
         bin = factor(bin,levels(bin)[c(2,1)])) %>%
  do(tidy(t.test(log.forwardStrength~bin,data=.))) %>%
  select(estimate, statistic, p.value, conf.low, conf.high) %>%
  left_join(ns)

kable(ts, digits = 2)

measure	estimate	statistic	p.value	conf.low	conf.high	n1	n2
novel.bin	-0.05	-1.04	0.30	-0.14	0.04	33	32
spatial.bin	0.00	-0.09	0.93	-0.10	0.09	34	31
visual.bin	0.06	1.28	0.21	-0.03	0.15	34	31
word.bin	0.09	2.08	0.04	0.00	0.18	33	32

Effect sizes:

ds = ts %>%
  mutate(d = tes(statistic, n1, n2, verbose = F)$d,
         l.d = tes(statistic, n1, n2, verbose = F)$l.d,
         u.d = tes(statistic, n1, n2, verbose = F)$u.d,
         p.d = tes(statistic, n1, n2, verbose = F)$pval.d)

ggplot(ds, aes(x = measure, y = d)) +
  geom_pointrange(aes(ymax = u.d, ymin = l.d), color = "red") +
  ylab("Effect size") +
  theme_bw(base_size = 15) +
  theme(axis.text.x = element_text(angle = 90, hjust = 1)) +
  geom_hline(yintercept = 0, linetype = "longdash") +
  coord_flip()

Semantic similarity (by-sentence method)

i.e. How similar is a participant to other high/low word/visual/spatial/novel participants?

#write.csv(select(d.raw.wide, workerId, cue, a1, a2, a3, a4, word.bin, visual.bin), "all_associate_strings_bins.csv",  row.names=FALSE) # for semantic similarity.py

#freqs = select(d.raw, associate, Lg10WF) %>% 
#  mutate(Lg10WF = ifelse(is.na(Lg10WF), 0, Lg10WF))  %>%
#        unique()write.csv(freqs, "freqs.csv",  row.names=FALSE)

sim.word.by.participant =
  read.csv("semantic_similarity/all_associate_strings_bins_subtlexus_P_word.csv") %>%
  gather("measure", "word_name", 3:6) %>%
  group_by(workerId, word.bin) %>%
  summarize(sim.word.bin = mean(sim.word.bin, na.rm = T)) %>%
  gather("measure", "bin", 2) %>%
  gather("temp", "similarity", 2) %>%
  select(-temp)

sim.visual.by.participant =
  read.csv("semantic_similarity/all_associate_strings_bins_subtlexus_P_visual.csv") %>%
  gather("measure", "word_name", 3:6) %>%
  group_by(workerId, visual.bin) %>%
  summarize(sim.visual.bin = mean(sim.visual.bin, na.rm = T)) %>%
  gather("measure", "bin", 2) %>%
  gather("temp", "similarity", 2) %>%
  select(-temp)

sim.spatial.by.participant =
  read.csv("semantic_similarity/all_associate_strings_bins_subtlexus_P_spatial.csv") %>%
  gather("measure", "word_name", 3:6) %>%
  group_by(workerId, spatial.bin) %>%
  summarize(sim.spatial.bin = mean(sim.spatial.bin, na.rm = T)) %>%
  gather("measure", "bin", 2) %>%
  gather("temp", "similarity", 2) %>%
  select(-temp)

sim.novel.by.participant =
  read.csv("semantic_similarity/all_associate_strings_bins_subtlexus_P_novel.csv") %>%
  gather("measure", "word_name", 3:6) %>%
  group_by(workerId, novel.bin) %>%
  summarize(sim.novel.bin = mean(sim.novel.bin, na.rm = T)) %>%
  gather("measure", "bin", 2) %>%
  gather("temp", "similarity", 2) %>%
  select(-temp)

sim.by.participant= rbind(sim.word.by.participant, 
                               sim.visual.by.participant) %>%
                   rbind(sim.spatial.by.participant) %>%
                   rbind(sim.novel.by.participant)

sim.by.participant.MS = sim.by.participant %>%
  mutate(bin = as.factor(bin)) %>%
  group_by(measure, bin) %>%
  multi_boot_standard(column = "similarity", na.rm = T) 

ggplot(sim.by.participant.MS, aes(x = measure, y = mean, color = bin, group = bin)) +
  geom_pointrange(aes(ymax = ci_upper, ymin = ci_lower),
                  position = position_dodge(.3)) +
  ylab("Similarity") +
  theme_bw(base_size = 15) +
  theme(axis.text.x = element_text(angle = 90, hjust = 1))

t-tests:

ns = sim.by.participant %>%
  group_by(measure, bin) %>%
  summarize (n = n()) %>%
  spread(bin, n) %>%
  rename(n1 = `0`, n2 = `1`)

ts = sim.by.participant %>%
  group_by(measure) %>%
  mutate(bin = as.factor(bin), # order bins correctly
         bin = factor(bin,levels(bin)[c(2,1)])) %>%
  do(tidy(t.test(similarity~bin,data=.))) %>%
  select(estimate, statistic, p.value, conf.low, conf.high) %>%
  left_join(ns)

kable(ts, digits = 2)

measure	estimate	statistic	p.value	conf.low	conf.high	n1	n2
novel.bin	-0.03	-3.75	0.00	-0.04	-0.01	33	32
spatial.bin	0.02	2.24	0.03	0.00	0.03	34	31
visual.bin	0.02	2.30	0.02	0.00	0.03	34	31
word.bin	0.02	3.13	0.00	0.01	0.03	33	32

Effect sizes:

ds = ts %>%
  mutate(d = tes(statistic, n1, n2, verbose = F)$d,
         l.d = tes(statistic, n1, n2, verbose = F)$l.d,
         u.d = tes(statistic, n1, n2, verbose = F)$u.d,
         p.d = tes(statistic, n1, n2, verbose = F)$pval.d)

ggplot(ds, aes(x = measure, y = d)) +
  geom_pointrange(aes(ymax = u.d, ymin = l.d), color = "red") +
  ylab("Effect size") +
  theme_bw(base_size = 15) +
  theme(axis.text.x = element_text(angle = 90, hjust = 1)) +
  geom_hline(yintercept = 0, linetype = "longdash") +
  coord_flip()

X-Individual Concept Variability

2016-07-17

Indiviudal difference measures

Num within-group matches

Forward strength (Nelson)

All

Word only

Forward strength (SWOW)

Semantic similarity (by-sentence method)