Question: Are there correlates of conceptual complexity in a word association task? Here we look at three measures: conditional probability, entropy of associates, and proportion of similar responses.
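
The original setup chunk is not shown; as a best guess, the analyses below assume the following packages are attached:

library(dplyr)
library(tidyr)    # gather(), spread()
library(purrr)    # map(), pmap()
library(ggplot2)
library(gtools)   # permutations()
library(entropy)  # entropy()
library(broom)    # tidy()
library(knitr)    # kable()
# plyr is called below via plyr::mapvalues(), so it needs to be installed but not attached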

d = read.csv("../data/associations_ppdetails_en_05_01_2015.csv")

d.clean = d %>%
  gather("association", "word", 7:9) %>%
  mutate(word = gsub("\\bx\\b", "NA", word)) %>% # recode missing responses ("x") as "NA"
  spread("association", "word") %>%
  rename(a1 = asso1Clean,
         a2 = asso2Clean,
         a3 = asso3Clean)

These data are from the Small World of Words project. In the task, participants are given a cue and asked to generate 3 associates. Each participant completes 15-19 trials. We have 73256 participants and 10050 distinct cues.
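
As a quick sanity check, the counts quoted above can be recovered from the cleaned data (participantID below is a placeholder for whatever the raw file calls the participant identifier column):

d.clean %>%
  summarise(n_participants = n_distinct(participantID),
            n_cues = n_distinct(cue))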

Number of participants per cue.

word.summary = d.clean %>%
  group_by(cue) %>%
  summarise(n = n()) 

ggplot(word.summary, aes(x = reorder(cue,-n), y = n)) + 
    geom_bar(stat = "identity") + 
    ylab("n trials")+
    xlab("distinct cue words") +
    geom_hline(yintercept = mean(word.summary$n), color = "red", 
               linetype = "longdash") +
    theme_bw(base_size = 12) +
    theme(legend.position="none",
          axis.ticks = element_blank(), axis.text.x = element_blank())

Conditional probabilities

The conditional probability of w1 -> w2 is p(w2|w1) = count(w1 -> w2) / count(w1). For example, if a cue appears 100 times and 40 of those trials produce a given associate, the conditional probability of that associate is 40/100 = 0.4. Here we consider the case where w1 = cue and w2 = any of the associates.

Get full conditional probabilities.

# conditional probability function
get_trans_prob <- function(df, w1, w2) {
  names(df)[which(names(df) == w1)] = "w1"
  names(df)[which(names(df) == w2)] = "w2"
  
  # remove NAs and get bigrams
  df.f = filter(df, w1 != "NA" & w2 != "NA") %>%
    mutate(bigram = paste(w1, w2))

  # get counts of w1
  w1.counts = df.f %>%
    count(w1) %>%
    rename(w1.counts = n) 
  
  # calculate trans prob [count(w1->w2)/count(w1)]
  df.f %>%
    count(bigram, w1) %>%
    rename(joint.counts = n) %>%
    left_join(w1.counts, by="w1") %>%
    mutate(trans.prob = joint.counts/w1.counts) %>%
    select(bigram,trans.prob, w1) %>%
    arrange(trans.prob) %>%
    ungroup() 
}
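
As a quick sanity check, here is get_trans_prob applied to a toy data frame (made-up responses, not the real data):

toy = data.frame(cue = c("dog", "dog", "dog", "cat"),
                 a1  = c("cat", "bone", "cat", "dog"))
get_trans_prob(toy, "cue", "a1")
# "dog" occurs 3 times as a cue and leads to "cat" twice, so
# p(cat|dog) = 2/3, p(bone|dog) = 1/3, and p(dog|cat) = 1/1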

# get conditional probability pairs of interest: cue -> a1, cue -> a2, cue -> a3
perms = permutations(4, 2, c(0:3)) %>%
  as.data.frame() %>%
  rename(w1 = V1, w2 = V2) %>%
  filter(w1 < w2) %>%
  mutate(w1 = as.factor(w1),
         w2 = as.factor(w2),
         w1 = plyr::mapvalues(w1, from = c("0", "1", "2", "3"), to = c("cue", "a1", "a2", "a3")),
         w2 = plyr::mapvalues(w2, from = c("0", "1", "2", "3"), to = c("cue", "a1", "a2", "a3")),
         pair = paste(w1, w2, sep = "_")) %>%
  slice(1:3)

# get all conditional probabilities
all.cb = pmap(list(as.list(perms$w1), as.list(perms$w2), as.list(perms$pair)), 
     function(x, y, z) {
       get_trans_prob(d.clean, x[[1]], y[[1]]) %>%
         mutate(pair = z)}) %>%
  bind_rows() %>%
  mutate(pair = as.factor(pair))

ms = all.cb %>%
  group_by(w1) %>%
  summarise(mean.trans.prob = mean(trans.prob)) %>%
  mutate(log.length = log(nchar(gsub("[[:punct:] ]", "", as.character(w1))))) %>%
  mutate(log.mean.trans.prob = log(mean.trans.prob))

Let’s look at the distribution of (log) mean conditional probabilities.

ggplot(ms, aes(x = log.mean.trans.prob)) + 
  geom_histogram() + 
  theme_bw()

This looks reasonable.

Are transitional probabilities correlated with complexity norms?

complexity.norms  = read.csv("/Documents/GRADUATE_SCHOOL/Projects/ref_complex/Papers/RC/data/norms/complexityNormsEnglishexp9.csv")

# keep Lg10WF (log word frequency) so it is available for the frequency-controlled regression below
ms = inner_join(ms, select(complexity.norms, complexity, Lg10WF, word), by = c("w1" = "word")) 

ggplot(ms, aes(y = complexity, x = log.mean.trans.prob)) + 
  geom_point() + 
  geom_smooth(method = "lm")+
  theme_bw()

tidy(cor.test(ms$complexity, ms$log.mean.trans.prob)) %>%
   select(-parameter, -method, -conf.low, -conf.high, -alternative) %>%
  kable()
  estimate  statistic  p.value
-0.1922279  -4.057149  5.9e-05
tidy(cor.test(ms$complexity, ms$log.length))  %>%
  select(-parameter, -method, -conf.low, -conf.high, -alternative) %>%
  kable()
 estimate  statistic  p.value
0.6121366   16.03381        0
kable(tidy(lm(log.mean.trans.prob ~ complexity + Lg10WF, ms)))
term           estimate  std.error   statistic  p.value
(Intercept)  -3.6478836  0.0430924  -84.652522        0
complexity   -0.0518408  0.0079542   -6.517397        0
Lg10WF       -0.0596737  0.0083993   -7.104578        0

Yes, a reliable negative correlation: more complex words have lower conditional probability. This holds up when controlling for frequency (Lg10WF).

Entropy

Entropy of associates across participants: for each cue, we compute the entropy of the distribution of associates produced across participants.

associate.counts = d.clean %>%
  gather("associate_type", "associate", 7:9)  %>%
  select(cue, associate) %>%
  group_by(cue, associate) %>%
  summarise(n = n()) %>%
  ungroup() %>%
  group_by(cue) %>%
  summarize(entropy = entropy(n)) %>%
  ungroup() %>%
  mutate(log.length = log(nchar(gsub("[[:punct:] ]", "", as.character(cue)))))
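
To get a feel for the scale of these values (using the entropy package defaults: maximum-likelihood estimate, natural-log units), a cue whose associates are spread evenly scores higher than one dominated by a single response:

entropy(c(5, 5, 5, 5))   # log(4) ~ 1.39: four equally common associates
entropy(c(17, 1, 1, 1))  # ~ 0.59: one dominant associate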

Are entropy norms correlated with complexity?

associate.counts = inner_join(associate.counts, select(complexity.norms, complexity, word), by = c("cue" = "word")) 

ggplot(associate.counts, aes(y = complexity, x = entropy)) + 
  geom_point() + 
  geom_smooth(method = "lm")+
  theme_bw()

tidy(cor.test(associate.counts$complexity, associate.counts$entropy)) %>%
   select(-parameter, -method, -conf.low, -conf.high, -alternative) %>%
   kable()
 estimate  statistic   p.value
0.2077399   4.398738  1.38e-05
Yes: more complex words have higher-entropy associate distributions.

Across-participant variability

Here we calculate the probability that two randomly sampled participants given the same cue will produce the same first associate (a1). To do this, for each cue we sample N_PAIR_SAMPLED random pairs of participants (with replacement) and take the proportion of pairs that agree.

# returns a function that samples one pair of responses (with replacement)
# and checks whether they agree
one_sample <- function(a1) {
  function(k) {
    sample.is = sample(1:length(a1), 2, replace = T)
    a1[sample.is[1]] == a1[sample.is[2]] 
  }
}

# proportion of agreeing pairs across n_pair_samples sampled pairs
all_samples <- function(a1, n_pair_samples) {
  sample_values <- 1:n_pair_samples %>%
    map(one_sample(a1)) %>%
    unlist()
  data.frame(prop = sum(sample_values)/length(sample_values))
}
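
As a quick check of all_samples on a made-up response vector: because pairs are sampled with replacement, the expected agreement is the sum of squared response proportions, here 0.75^2 + 0.25^2 = 0.625.

set.seed(1)
all_samples(c("cat", "cat", "cat", "dog"), 1000)
# prop should come out near 0.625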

N_PAIR_SAMPLED = 1000
sim.responses = d.clean %>%
  select(6, 7) %>% # cue and a1 columns (positional)
  filter(a1 != "NA") %>%
  group_by(cue) %>%
  mutate(prop_similar = unlist(all_samples(a1, N_PAIR_SAMPLED))) %>%
  summarise(prop_similar_cue_a1 = mean(prop_similar))


sim.responses = sim.responses %>%
  mutate(log.length = log(nchar(gsub("[[:punct:] ]", "", as.character(cue)))),
         log.prop_similar_cue_a1 = log(prop_similar_cue_a1))

Is log.prop_similar_cue_a1 correlated with complexity norms?

Look at the subset of normed words:

sim.responses = inner_join(sim.responses, 
                           select(complexity.norms, complexity, word),
                           by = c("cue" = "word"))

ggplot(sim.responses, aes(y = complexity, x = log.prop_similar_cue_a1)) + 
  geom_point() + 
  geom_smooth(method = "lm")+
  theme_bw()

tidy(cor.test(sim.responses$complexity, sim.responses$log.prop_similar_cue_a1)) %>%
   select(-parameter, -method, -conf.low, -conf.high, -alternative) %>%
   kable()
  estimate  statistic    p.value
-0.1468406  -3.074739  0.0022415

Yes: participants tend to generate more similar associates for less complex meanings.


So, in conclusion:

  • Words that have lower conditional probability and less agreement in associates tend to be longer.
  • Transitional probability, associate entropy, and agreement all correlate with complexity norms.