Question: Are there correlates of conceptual complexity in a word association task? Here we look at three measures: conditional probability, entropy of associates, and proportion of similar responses.
library(tidyverse) # dplyr, tidyr, ggplot2, purrr
library(gtools)    # permutations()
library(entropy)   # entropy()

d = read.csv("../data/associations_ppdetails_en_05_01_2015.csv")

d.clean = d %>%
  gather("association", "word", 7:9) %>%
  mutate(word = gsub("\\bx\\b", "NA", word)) %>% # mark missing responses (coded "x") as "NA"
  spread("association", "word") %>%
  rename(a1 = asso1Clean,
         a2 = asso2Clean,
         a3 = asso3Clean)
These data are from the Small World of Words project. In the task, participants are given a cue and asked to generate three associates. Each participant completes 15-19 trials. We have 73256 participants and 10050 distinct cues. After cleaning, each row of d.clean is one trial: participant details plus the cue and its three associates (a1, a2, a3), with missing responses coded as the string "NA".
Number of participants per cue.
word.summary = d.clean %>%
  group_by(cue) %>%
  summarise(n = n())

ggplot(word.summary, aes(x = reorder(cue, -n), y = n)) +
  geom_bar(stat = "identity") +
  ylab("n trials") +
  xlab("distinct cue words") +
  geom_hline(yintercept = mean(word.summary$n), color = "red",
             linetype = "longdash") +
  theme_bw(base_size = 12) +
  theme(legend.position = "none",
        axis.ticks = element_blank(),
        axis.text.x = element_blank())
The conditional probability of w1 -> w2 is p(w2|w1) = count(w1 -> w2) / count(w1). Here, w1 is the cue and w2 is any of the three associates.
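For example, if the cue "dog" occurs 100 times and participants respond "cat" on 30 of those trials, then p(cat|dog) = 30/100 = 0.3.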
Get full conditional probabilities.
# conditional probability function
get_trans_prob <- function(df, w1, w2) {
  # standardize the two column names so we can refer to them as w1 and w2
  names(df)[which(names(df) == w1)] = "w1"
  names(df)[which(names(df) == w2)] = "w2"

  # drop missing responses and form bigrams
  df.f = filter(df, w1 != "NA" & w2 != "NA") %>%
    mutate(bigram = paste(w1, w2))

  # get counts of w1
  w1.counts = df.f %>%
    count(w1) %>%
    rename(w1.counts = n)

  # conditional probability: count(w1 -> w2) / count(w1)
  df.f %>%
    count(bigram, w1) %>%
    rename(joint.counts = n) %>%
    left_join(w1.counts, by = "w1") %>%
    mutate(trans.prob = joint.counts / w1.counts) %>%
    select(bigram, trans.prob, w1) %>%
    arrange(trans.prob) %>%
    ungroup()
}
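As a quick sanity check, here is what get_trans_prob returns on a toy data frame (hypothetical data, not from the corpus):
toy = data.frame(cue = c("dog", "dog", "dog", "cat"),
                 a1  = c("cat", "cat", "bone", "dog"))
get_trans_prob(toy, "cue", "a1")
# expected: p(bone|dog) = 1/3, p(cat|dog) = 2/3, p(dog|cat) = 1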
# get the conditional probability pairs of interest: cue_a1, cue_a2, cue_a3
perms = permutations(4, 2, c(0:3)) %>%
  as.data.frame() %>%
  rename(w1 = V1, w2 = V2) %>%
  filter(w1 < w2) %>%
  mutate(w1 = as.factor(w1),
         w2 = as.factor(w2),
         w1 = plyr::mapvalues(w1, from = c("0", "1", "2", "3"),
                              to = c("cue", "a1", "a2", "a3")),
         w2 = plyr::mapvalues(w2, from = c("0", "1", "2", "3"),
                              to = c("cue", "a1", "a2", "a3")),
         pair = paste(w1, w2, sep = "_")) %>%
  slice(1:3) # keep only the cue -> associate pairs
# get all conditional probabilities
all.cb = pmap(list(as.list(perms$w1), as.list(perms$w2), as.list(perms$pair)),
              function(x, y, z) {
                get_trans_prob(d.clean, x[[1]], y[[1]]) %>%
                  mutate(pair = z)
              }) %>%
  bind_rows() %>%
  mutate(pair = as.factor(pair))
ms = all.cb %>%
  group_by(w1) %>%
  summarise(mean.trans.prob = mean(trans.prob)) %>%
  # log cue length in characters, ignoring spaces and punctuation
  mutate(log.length = log(nchar(gsub(" ", "", gsub("[[:punct:]]", "", as.character(w1)))))) %>%
  mutate(log.mean.trans.prob = log(mean.trans.prob))
Let’s look at the distribution of conditional probabilities.
ggplot(ms, aes(x = log.mean.trans.prob)) +
  geom_histogram() +
  theme_bw()
This looks reasonable.
Entropy of associates across participants

For each cue, we compute the entropy of the distribution of associates across participants: H(cue) = -sum_i p_i * log(p_i), where p_i is the proportion of responses that are associate i. Cues that elicit a more varied set of associates have higher entropy.
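To build intuition, here is entropy() on a few toy count vectors (assuming entropy() is entropy::entropy, whose default maximum-likelihood estimator returns values in nats):
entropy(c(5, 5))  # two equally common associates: log(2), about 0.69
entropy(c(9, 1))  # one dominant associate: about 0.33
entropy(c(10))    # every participant gives the same associate: 0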
associate.counts = d.clean %>%
  gather("associate_type", "associate", a1, a2, a3) %>%
  filter(associate != "NA") %>% # exclude missing responses
  select(cue, associate) %>%
  group_by(cue, associate) %>%
  summarise(n = n()) %>%
  ungroup() %>%
  group_by(cue) %>%
  summarize(entropy = entropy(n)) %>%
  ungroup() %>%
  mutate(log.length =
           log(nchar(gsub(" ", "", gsub("[[:punct:]]", "", as.character(cue))))))
Here we calculate the probability that two randomly sampled participants who are given the same cue will produce the same first associate (a1). To do this, for each cue we randomly sample two participants N_PAIR_SAMPLED times, then take the proportion of agreement across samples.
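In expectation, this proportion equals the sum over associates of p_i^2, where p_i is the probability that a participant produces associate i for the cue. (One caveat: because indices are sampled with replacement, a participant is occasionally paired with themselves, which slightly inflates the estimate for cues with few responses.)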
one_sample <- function(a1) {
  function(k) {
    # draw two participants at random and compare their responses
    sample.is = sample(1:length(a1), 2, replace = T)
    a1[sample.is[1]] == a1[sample.is[2]]
  }
}

all_samples <- function(a1, n_pair_samples) {
  sample_values <- 1:n_pair_samples %>%
    map(one_sample(a1)) %>%
    unlist()
  # proportion of sampled pairs that agree
  data.frame(prop = sum(sample_values) / length(sample_values))
}
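A quick check on toy responses (hypothetical data): with two equally common associates, roughly half of the sampled pairs should agree.
set.seed(1) # for reproducibility
all_samples(c("cat", "cat", "dog", "dog"), 1000)
# prop should be close to sum(p_i^2) = 0.5^2 + 0.5^2 = 0.5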
N_PAIR_SAMPLED = 1000
sim.responses = d.clean %>%
  select(cue, a1) %>%
  filter(a1 != "NA") %>%
  group_by(cue) %>%
  mutate(prop_similar = unlist(all_samples(a1, N_PAIR_SAMPLED))) %>%
  summarise(prop_similar_cue_a1 = mean(prop_similar))
sim.responses = sim.responses %>%
  mutate(log.length =
           log(nchar(gsub(" ", "", gsub("[[:punct:]]", "", as.character(cue))))),
         log.prop_similar_cue_a1 = log(prop_similar_cue_a1))