Question: Are there correlates of conceptual complexity in a word association task? Here we look at three measures: conditional probability, entropy of associates, and proportion of similar responses.
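
The original setup chunk is not shown; as a best guess, the analyses below assume the following packages are attached:

library(dplyr)
library(tidyr)    # gather(), spread()
library(purrr)    # map(), pmap()
library(ggplot2)
library(gtools)   # permutations()
library(entropy)  # entropy()
library(broom)    # tidy()
library(knitr)    # kable()
# plyr is called below via plyr::mapvalues(), so it needs to be installed but not attached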

d = read.csv("../data/associations_ppdetails_en_05_01_2015.csv")

d.clean = d %>%
  gather("association", "word", 7:9) %>%
  mutate(word = gsub("\\bx\\b", "NA", word)) %>% # recode missing responses ("x") as "NA"
  spread("association", "word") %>%
  rename(a1 = asso1Clean,
         a2 = asso2Clean,
         a3 = asso3Clean)

These data are from the Small World of Words project. In the task, participants are given a cue and asked to generate 3 associates. Each participant completes 15-19 trials. We have 73256 participants and 10050 distinct cues.
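
As a quick sanity check, the counts quoted above can be recovered from the cleaned data (participantID below is a placeholder for whatever the raw file calls the participant identifier column):

d.clean %>%
  summarise(n_participants = n_distinct(participantID),
            n_cues = n_distinct(cue))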

Number of participants per cue.

word.summary = d.clean %>%
  group_by(cue) %>%
  summarise(n = n()) 

ggplot(word.summary, aes(x = reorder(cue,-n), y = n)) + 
    geom_bar(stat = "identity") + 
    ylab("n trials")+
    xlab("distinct cue words") +
    geom_hline(yintercept = mean(word.summary$n), color = "red", 
               linetype = "longdash") +
    theme_bw(base_size = 12) +
    theme(legend.position="none",
          axis.ticks = element_blank(), axis.text.x = element_blank())

Conditional probabilities

The conditional probability of w1 -> w2 is p(w2|w1) = count(w1 -> w2) / count(w1). For example, if a cue appears 100 times and 40 of those trials produce a given associate, the conditional probability of that associate is 40/100 = 0.4. Here we consider the case where w1 = cue and w2 = any of the associates.

Get full conditional probabilities.

# conditional probability function
get_trans_prob <- function(df, w1, w2) {
  names(df)[which(names(df) == w1)] = "w1"
  names(df)[which(names(df) == w2)] = "w2"
  
  # remove NAs and get bigrams
  df.f = filter(df, w1 != "NA" & w2 != "NA") %>%
    mutate(bigram = paste(w1, w2))

  # get counts of w1
  w1.counts = df.f %>%
    count(w1) %>%
    rename(w1.counts = n) 
  
  # calculate trans prob [count(w1->w2)/count(w1)]
  df.f %>%
    count(bigram, w1) %>%
    rename(joint.counts = n) %>%
    left_join(w1.counts, by="w1") %>%
    mutate(trans.prob = joint.counts/w1.counts) %>%
    select(bigram,trans.prob, w1) %>%
    arrange(trans.prob) %>%
    ungroup() 
}
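
As a quick sanity check, here is get_trans_prob applied to a toy data frame (made-up responses, not the real data):

toy = data.frame(cue = c("dog", "dog", "dog", "cat"),
                 a1  = c("cat", "bone", "cat", "dog"))
get_trans_prob(toy, "cue", "a1")
# "dog" occurs 3 times as a cue and leads to "cat" twice, so
# p(cat|dog) = 2/3, p(bone|dog) = 1/3, and p(dog|cat) = 1/1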

# get conditional probability pairs of interest: cue -> a1, cue -> a2, cue -> a3
perms = permutations(4, 2, c(0:3)) %>%
  as.data.frame() %>%
  rename(w1 = V1, w2 = V2) %>%
  filter(w1 < w2) %>%
  mutate(w1 = as.factor(w1),
         w2 = as.factor(w2),
         w1 = plyr::mapvalues(w1, from = c("0", "1", "2", "3"), to = c("cue", "a1", "a2", "a3")),
         w2 = plyr::mapvalues(w2, from = c("0", "1", "2", "3"), to = c("cue", "a1", "a2", "a3")),
         pair = paste(w1, w2, sep = "_")) %>%
  slice(1:3)

# get all conditional probabilities
all.cb = pmap(list(as.list(perms$w1), as.list(perms$w2), as.list(perms$pair)), 
     function(x, y, z) {
       get_trans_prob(d.clean, x[[1]], y[[1]]) %>%
         mutate(pair = z)}) %>%
  bind_rows() %>%
  mutate(pair = as.factor(pair))

ms = all.cb %>%
  group_by(w1) %>%
  summarise(mean.trans.prob = mean(trans.prob)) %>%
  mutate(log.length = log(nchar(gsub("[[:punct:] ]", "", as.character(w1))))) %>%
  mutate(log.mean.trans.prob = log(mean.trans.prob))

Let’s look at the distribution of (log) mean conditional probabilities.

ggplot(ms, aes(x = log.mean.trans.prob)) + 
  geom_histogram() + 
  theme_bw()

This looks reasonable.

Are transitional probabilities correlated with complexity norms?

complexity.norms  = read.csv("/Documents/GRADUATE_SCHOOL/Projects/ref_complex/Papers/RC/data/norms/complexityNormsEnglishexp9.csv")

# keep Lg10WF (log word frequency) so it is available for the frequency-controlled regression below
ms = inner_join(ms, select(complexity.norms, complexity, Lg10WF, word), by = c("w1" = "word")) 

ggplot(ms, aes(y = complexity, x = log.mean.trans.prob)) + 
  geom_point() + 
  geom_smooth(method = "lm")+
  theme_bw()

tidy(cor.test(ms$complexity, ms$log.mean.trans.prob)) %>%
   select(-parameter, -method, -conf.low, -conf.high, -alternative) %>%
  kable()
  estimate  statistic  p.value
-0.1922279  -4.057149  5.9e-05
tidy(cor.test(ms$complexity, ms$log.length))  %>%
  select(-parameter, -method, -conf.low, -conf.high, -alternative) %>%
  kable()
 estimate  statistic  p.value
0.6121366   16.03381        0
kable(tidy(lm(log.mean.trans.prob ~ complexity + Lg10WF, ms)))
term           estimate  std.error   statistic  p.value
(Intercept)  -3.6478836  0.0430924  -84.652522        0
complexity   -0.0518408  0.0079542   -6.517397        0
Lg10WF       -0.0596737  0.0083993   -7.104578        0

Yes, a reliable negative correlation: more complex words have lower conditional probability. This holds up when controlling for frequency (Lg10WF).

Entropy

Entropy of associates across participants: for each cue, we compute the entropy of the distribution of associates produced across participants.

associate.counts = d.clean %>%
  gather("associate_type", "associate", 7:9)  %>%
  select(cue, associate) %>%
  group_by(cue, associate) %>%
  summarise(n = n()) %>%
  ungroup() %>%
  group_by(cue) %>%
  summarize(entropy = entropy(n)) %>%
  ungroup() %>%
  mutate(log.length = log(nchar(gsub("[[:punct:] ]", "", as.character(cue)))))
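
To get a feel for the scale of these values (using the entropy package defaults: maximum-likelihood estimate, natural-log units), a cue whose associates are spread evenly scores higher than one dominated by a single response:

entropy(c(5, 5, 5, 5))   # log(4) ~ 1.39: four equally common associates
entropy(c(17, 1, 1, 1))  # ~ 0.59: one dominant associate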

Are entropy norms correlated with complexity?

associate.counts = inner_join(associate.counts, select(complexity.norms, complexity, word), by = c("cue" = "word")) 

ggplot(associate.counts, aes(y = complexity, x = entropy)) + 
  geom_point() + 
  geom_smooth(method = "lm")+
  theme_bw()

tidy(cor.test(associate.counts$complexity, associate.counts$entropy)) %>%
   select(-parameter, -method, -conf.low, -conf.high, -alternative) %>%
   kable()
 estimate  statistic   p.value
0.2077399   4.398738  1.38e-05
Yes: more complex words have higher-entropy associate distributions.

Across-participant variability

Here we calculate the probability that two randomly sampled participants given the same cue will produce the same first associate (a1). To do this, for each cue we sample N_PAIR_SAMPLED random pairs of participants (with replacement) and take the proportion of pairs that agree.

# returns a function that samples one pair of responses (with replacement)
# and checks whether they agree
one_sample <- function(a1) {
  function(k) {
    sample.is = sample(1:length(a1), 2, replace = T)
    a1[sample.is[1]] == a1[sample.is[2]] 
  }
}

# proportion of agreeing pairs across n_pair_samples sampled pairs
all_samples <- function(a1, n_pair_samples) {
  sample_values <- 1:n_pair_samples %>%
    map(one_sample(a1)) %>%
    unlist()
  data.frame(prop = sum(sample_values)/length(sample_values))
}
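
As a quick check of all_samples on a made-up response vector: because pairs are sampled with replacement, the expected agreement is the sum of squared response proportions, here 0.75^2 + 0.25^2 = 0.625.

set.seed(1)
all_samples(c("cat", "cat", "cat", "dog"), 1000)
# prop should come out near 0.625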

N_PAIR_SAMPLED = 1000
sim.responses = d.clean %>%
  select(6, 7) %>% # cue and a1 columns (positional)
  filter(a1 != "NA") %>%
  group_by(cue) %>%
  mutate(prop_similar = unlist(all_samples(a1, N_PAIR_SAMPLED))) %>%
  summarise(prop_similar_cue_a1 = mean(prop_similar))


sim.responses = sim.responses %>%
  mutate(log.length = log(nchar(gsub("[[:punct:] ]", "", as.character(cue)))),
         log.prop_similar_cue_a1 = log(prop_similar_cue_a1))

Is log.prop_similar_cue_a1 correlated with complexity norms?

Look at the subset of normed words:

sim.responses = inner_join(sim.responses, 
                           select(complexity.norms, complexity, word),
                           by = c("cue" = "word"))

ggplot(sim.responses, aes(y = complexity, x = log.prop_similar_cue_a1)) + 
  geom_point() + 
  geom_smooth(method = "lm")+
  theme_bw()

tidy(cor.test(sim.responses$complexity, sim.responses$log.prop_similar_cue_a1)) %>%
   select(-parameter, -method, -conf.low, -conf.high, -alternative) %>%
   kable()
  estimate  statistic    p.value
-0.1468406  -3.074739  0.0022415

Yes: participants tend to generate more similar associates for less complex meanings.


So, in conclusion:

  • Words that have lower conditional probability and less agreement in associates tend to be longer.
  • Transitional probability, associate entropy, and agreement all correlate with complexity norms.