To what extent does the cue drive the different associations (i.e., cue vs. a1, cue vs. a2, cue vs. a3), and how does this pattern vary across demographic groups?

Other questions:

  • To what extent does the dependence of the generated word on the previous word vary across the task? [e.g. cue vs. a1, a1 vs. a2, a2 vs. a3]
  • To what extent are the associations related to each other? [e.g. a1 vs. a2, a2 vs. a3, a1 vs. a3]

Several measures:

  • conditional probability [derived both from subset and full set]
  • to what extent does the cue drive associations, relative to a-a pairs: [Pr(a1 | cue) + Pr(a2 | cue) + Pr(a3 | cue)] / [Pr(a2 | a1) + Pr(a3 | a1) + Pr(a3 | a2)]
  • coefficient of variation

I’ve omitted error bars because bootstrapping is slow and almost every difference is significant.

d = read.csv("../data/associations_ppdetails_en_05_01_2015.csv")

d.clean = d %>%
  filter(gender == "Ma" | gender == "Fe") %>%
  filter(education > 0) %>%
  filter(nativeLanguage != "") %>%
  mutate(gender = droplevels(gender),
         gender = plyr::revalue(gender,c("Fe" = "F", "Ma" = "M")),
         userID = as.factor(userID),
         nativeLanguage = as.factor(tolower(nativeLanguage)))

d.clean = d.clean %>%
  gather("association", "word", 7:9) %>%
  mutate(word = gsub("\\bx\\b", "NA", word)) %>% # recode missing responses (entered as "x") as the string "NA"
  spread("association", "word") %>%
  rename(a1 = asso1Clean,
         a2 = asso2Clean,
         a3 = asso3Clean)

Conditional probabilities

The conditional probability of w1 -> w2 is p(w2 | w1) = count(w1 -> w2) / count(w1).
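For example, if a cue occurs 100 times and is followed by a given response 30 times, that response’s conditional probability is 30/100 = 0.3. Here is a toy illustration of the computation with made-up responses (not the real data):

# toy illustration with made-up responses
toy = data.frame(w1 = c("dog", "dog", "dog", "cat"),
                 w2 = c("cat", "cat", "bone", "mouse"))
toy %>%
  count(w1, w2) %>%                  # count(w1 -> w2)
  group_by(w1) %>%
  mutate(trans.prob = n / sum(n))    # divide by count(w1)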

There are at least two ways to calculate these conditional probabilities: from the full dataset, or separately within each subset of interest. I initially calculated them by subset, but sparsity then makes it hard to compare across groups; the better approach is probably to use the full dataset. This is how I’ve done it below.

Conditional probabilities are calculated separately for each pair type (is this the right way to do this?): i.e., I’ve counted up all the times that w1 -> w2 occurs within each pairing (e.g. cue_a1), and calculated the conditional probabilities from those counts.

Get full conditional probabilities.

# conditional probability function
get_trans_prob <- function(df, w1, w2) {
  names(df)[which(names(df) == w1)] = "w1"
  names(df)[which(names(df) == w2)] = "w2"
  
  # remove NAs and get bigrams
  df.f = filter(df, w1 != "NA" & w2 != "NA") %>%
    mutate(bigram = paste(w1, w2))

  # get counts of w1
  w1.counts = df.f %>%
    count(w1) %>%
    rename(w1.counts = n) 
  
  # calculate trans prob [count(w1->w2)/count(w1)]
  df.f %>%
    count(bigram, w1) %>%
    rename(joint.counts = n) %>%
    left_join(w1.counts, by="w1") %>%
    mutate(trans.prob = joint.counts/w1.counts) %>%
    select(bigram,trans.prob) %>%
    arrange(trans.prob) %>%
    ungroup() 
}
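For reference, the helper can be run either on the full dataset (as done below) or on a demographic subset (sparser counts, so noisier estimates); the object names here are just illustrative:

cp.cue_a1.full   = get_trans_prob(d.clean, "cue", "a1")
cp.cue_a1.female = get_trans_prob(filter(d.clean, gender == "F"), "cue", "a1")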

# get the conditional probability pairs of interest (cue_a1, cue_a2, cue_a3, a1_a2, a1_a3, a2_a3)
perms = permutations(4, 2, c(0:3)) %>%
  as.data.frame() %>%
  rename(w1 = V1, w2 = V2) %>%
  filter(w1 < w2) %>%
  mutate(w1 = as.factor(w1),
         w2 = as.factor(w2),
         w1 = plyr::mapvalues(w1, from = c("0", "1", "2", "3"), to = c("cue", "a1", "a2", "a3")),
         w2 = plyr::mapvalues(w2, from = c("0", "1", "2", "3"), to = c("cue", "a1", "a2", "a3")),
         pair = paste(w1, w2, sep = "_"))

# get all conditional probabilities
all.cb = pmap(list(as.list(perms$w1), as.list(perms$w2), as.list(perms$pair)), 
     function(x, y, z) {
       get_trans_prob(d.clean, x[[1]], y[[1]]) %>%
         mutate(pair = z)}) %>%
  bind_rows() %>%
  mutate(pair = as.factor(pair))

Conditional probability distributions

ggplot(all.cb, aes(x = trans.prob, group = pair, fill = pair)) +
  geom_density(alpha = .4) +
  xlab("conditional probability") +
  theme_bw(base_size = 18)

ggplot(filter(all.cb, trans.prob<.03), aes(x = trans.prob, group = pair, fill = pair)) +
  geom_density(alpha = .4) +
  xlab("conditional probability") +
  theme_bw(base_size = 18)

ggplot(filter(all.cb, trans.prob<.03),
       aes(y = trans.prob, x = pair, fill = pair)) +
  geom_boxplot(alpha = .4) +
  ylab("conditional probability") +
  theme_bw(base_size = 18)

This is a sanity check on the conditional probabilities. Cues drive associations more strongly than associations drive other associations (i.e., p(a|cue) > p(a|a)). But the ordering among the p(a|cue) pairs is surprising; I think this is because the probability mass is somewhat bimodal (the means, below, make sense).
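For reference, a quick numerical summary of the distributions by pair (means vs. medians), which is one way to check this:

all.cb %>%
  group_by(pair) %>%
  summarise(mean = mean(trans.prob),
            median = median(trans.prob),
            n = n())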

Full sample

Cue pairs

Merge in bigrams

d.clean = d.clean %>%
  mutate(b.cue_a1 = paste(cue, a1),
         b.cue_a2 = paste(cue, a2),
         b.cue_a3 = paste(cue, a3),
         b.a1_a2 = paste(a1, a2),
         b.a2_a3 = paste(a2, a3),
         b.a1_a3 = paste(a1, a3)) %>%
  mutate_each(funs(ifelse(grepl("NA",.),"NA",.)), b.cue_a1:b.a1_a3) # set any bigram containing a missing word to "NA"

# merge in bigram probabilities
d.clean.bigram = left_join(d.clean, 
                           filter(all.cb, pair == "cue_a1") %>% select(-pair),
                           by=c("b.cue_a1" = "bigram")) %>%
                  rename(tp.cue_a1 = trans.prob) %>%
                  left_join(filter(all.cb, pair == "cue_a2") %>% select(-pair),
                           by=c("b.cue_a2" = "bigram")) %>%
                  rename(tp.cue_a2 = trans.prob) %>%
                  left_join(filter(all.cb, pair == "cue_a3") %>% select(-pair),
                           by=c("b.cue_a3" = "bigram")) %>%
                  rename(tp.cue_a3 = trans.prob)  %>%
                  left_join(filter(all.cb, pair == "a1_a2") %>% select(-pair),
                           by=c("b.a1_a2" = "bigram")) %>%
                  rename(tp.a1_a2 = trans.prob)  %>%
                  left_join(filter(all.cb, pair == "a2_a3") %>% select(-pair),
                           by=c("b.a2_a3" = "bigram")) %>%
                  rename(tp.a2_a3 = trans.prob)  %>%
                  left_join(filter(all.cb, pair == "a1_a3") %>% select(-pair),
                           by=c("b.a1_a3" = "bigram")) %>%
                  rename(tp.a1_a3 = trans.prob) %>%
                  select(userID, age, gender, education, contains("tp."), a1, cue)
tp.full.ms = d.clean.bigram %>%
             gather("pair", "tp", 5:7) %>%
             group_by(userID, pair) %>%
             summarise(mean = mean(tp, na.rm = T)) %>%
             group_by(pair) %>%
             summarise(mean = mean(mean, na.rm = T))

ggplot(tp.full.ms, aes(y = mean, x = pair, group = 1)) +
  geom_point() +
  geom_line()+
  xlab("pair") +
  ylab("mean cp") +
  theme_bw(base_size = 18)

Association pairs

tp.full.ms = d.clean.bigram %>%
             gather("pair", "tp", 8:10) %>%
             group_by(userID, pair) %>%
             summarise(mean = mean(tp, na.rm = T)) %>%
             group_by(pair) %>%
             summarise(mean = mean(mean, na.rm = T))

ggplot(tp.full.ms, aes(y = mean, x = pair, group = 1)) +
  geom_point() +
  geom_line()+
  ylab("mean cp") +
  xlab("pair") +
  theme_bw(base_size = 18)

Education

Cue pairs

tp.educ.ms.c = filter(d.clean.bigram, education > 1) %>%
             gather("pair", "tp", 5:7) %>%
             group_by(userID, pair) %>%
             summarise(mean = mean(tp, na.rm = T)) %>%
             left_join(d.clean %>% group_by(userID) %>% slice(1) 
                         %>% ungroup() %>% select(education, userID)) %>%
             group_by(pair, education) %>%
             summarise(mean = mean(mean, na.rm = T)) %>%
             ungroup() %>%
             mutate(education = as.factor(education))

ggplot(tp.educ.ms.c, aes(y = mean, x = pair, group = education, color = education)) +
  geom_point() + 
  geom_line() +
  xlab("pair") +
  ylab("mean cp")+
  theme_bw(base_size = 18)

ggplot(tp.educ.ms.c, aes(y = mean, x = education, group = pair, color = pair)) +
  geom_point() + 
  geom_line() +
  xlab("education") +
  ylab("mean cp")+
  theme_bw(base_size = 18)

Association pairs

tp.educ.ms.a = filter(d.clean.bigram, education > 1) %>%
             gather("pair", "tp", 8:10) %>%
             group_by(userID, pair) %>%
             summarise(mean = mean(tp, na.rm = T)) %>%
             left_join(d.clean %>% group_by(userID) %>% slice(1) 
                         %>% ungroup() %>% select(education, userID)) %>%
             group_by(pair, education) %>%
             summarise(mean = mean(mean, na.rm = T)) %>%
             ungroup() %>%
             mutate(education = as.factor(education))

ggplot(tp.educ.ms.a, aes(y = mean, x = pair, group = education, color = education)) +
  geom_point() + 
  geom_line()+
  xlab("pair") +
  ylab("mean cp")+
  theme_bw(base_size = 18)

ggplot(tp.educ.ms.a, aes(y = mean, x = education, group = pair, color = pair)) +
  geom_point() + 
  geom_line() +
  xlab("education") +
  ylab("mean cp") +
  theme_bw(base_size = 18)

Age

d.pos.age = d.clean.bigram %>%
            filter(age > 15 & age < 75) %>%
            mutate(age.bin = cut_width(age, width = 10))

d.clean.bigram = d.clean.bigram  %>%
              left_join(d.pos.age %>% group_by(userID) %>% slice(1) 
                         %>% ungroup() %>% select(age.bin, userID)) %>%
              filter(!is.na(age.bin))

Cue pairs

tp.age.ms.c = gather(d.clean.bigram, "pair", "tp", 5:7) %>%
            group_by(pair, userID) %>%
            summarise(mean = mean(tp, na.rm = T)) %>%
            left_join(d.clean.bigram %>% group_by(userID) %>% slice(1) 
                         %>% ungroup() %>% select(age.bin, userID)) %>%
            group_by(pair, age.bin) %>%
            summarise(mean = mean(mean, na.rm = T))

ggplot(tp.age.ms.c, aes(y = mean, x = pair, group = age.bin, color = age.bin)) +
  geom_point() + 
  geom_line() +
  xlab("pair") +
  ylab("mean cp")+
  theme_bw(base_size = 18)

ggplot(tp.age.ms.c, aes(y = mean, x = age.bin, group = pair, color = pair)) +
  geom_point() + 
  geom_line() +
  xlab("age bin") +
  ylab("mean cp")+
  theme_bw(base_size = 18)

Association pairs

tp.age.ms.a = gather(d.clean.bigram, "pair", "tp", 8:10) %>%
            group_by(pair, userID) %>%
            summarise(mean = mean(tp, na.rm = T)) %>%
            left_join(d.clean.bigram %>% group_by(userID) %>% slice(1) 
                         %>% ungroup() %>% select(age.bin, userID)) %>%
            group_by(pair, age.bin) %>%
            summarise(mean = mean(mean, na.rm = T))

ggplot(tp.age.ms.a, aes(y = mean, x = pair, group = age.bin, color = age.bin)) +
  geom_point() + 
  geom_line() +
  xlab("pair") +
  ylab("mean cp")+
  theme_bw(base_size = 18)

ggplot(tp.age.ms.a, aes(y = mean, x = age.bin, group = pair, color = pair)) +
  geom_point() + 
  geom_line() +
  xlab("age.bin") +
  ylab("mean cp")+
  theme_bw(base_size = 18)

As you get older, the associations become more coherent with each other.

Coefficient of variation

Education

# Coefficient of variation for log-normally distributed data; from: https://en.wikipedia.org/wiki/Coefficient_of_variation
cv_log <- function(probs) {
  var.probs.log = var(log(probs), na.rm = T)
  sqrt(exp(var.probs.log) - 1)
}
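As a quick sanity check on cv_log (assuming the counts are roughly log-normal, the result should be close to sd(x)/mean(x) on simulated log-normal data):

set.seed(1)
x = rlnorm(1e5, meanlog = 0, sdlog = 0.5)
cv_log(x)        # ~ sqrt(exp(0.25) - 1), about 0.53
sd(x) / mean(x)  # should be similar for log-normal data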

bigram.counts.educ = d.clean %>%
  gather("pair", "bigram", 10:15) %>%
  mutate(pair = as.factor(pair)) %>%
  filter(bigram != "NA") %>%
  count(bigram, pair, education) %>%
  ungroup()

educ.cv = bigram.counts.educ %>%
          filter(education > 1) %>%
          mutate(education = as.factor(education)) %>%
          group_by(pair, education) %>%
          summarize(cv = cv_log(n)) %>%
          ungroup()

ggplot(educ.cv, aes(y = cv, x = pair, group = education, color = education)) +
  geom_point() + 
  geom_line() +
  xlab("pair") +
  ylab("mean cv")+
  theme_bw(base_size = 18) +
  theme(axis.text.x = element_text(angle = 90, hjust = 1))

ggplot(educ.cv, aes(y = cv, x = education, group = pair, color = pair)) +
  geom_point() + 
  geom_line() +
  xlab("education") +
  ylab("mean cv")+
  theme_bw(base_size = 18)

Age

bigram.counts.age = d.clean %>%
  gather("pair", "bigram", 10:15) %>%
  left_join(d.clean.bigram %>% group_by(userID) %>% slice(1) 
                         %>% ungroup() %>% select(age.bin, userID)) %>%
  mutate(pair = as.factor(pair)) %>%
  filter(bigram != "NA") %>%
  filter(!is.na(age.bin)) %>%
  count(bigram, pair, age.bin) %>%
  ungroup()

age.cv = bigram.counts.age %>%
          group_by(pair, age.bin) %>%
          summarize(cv = cv_log(n)) %>%
          ungroup()

ggplot(age.cv, aes(y = cv, x = pair, group = age.bin, color = age.bin)) +
  geom_point() + 
  geom_line() +
  xlab("pair") +
  ylab("mean cv")+
  theme_bw(base_size = 18) +
  theme(axis.text.x = element_text(angle = 90, hjust = 1))

ggplot(age.cv, aes(y = cv, x = age.bin, group = pair, color = pair)) +
  geom_point() + 
  geom_line() +
  xlab("age.bin") +
  ylab("mean cv")+
  theme_bw(base_size = 18)

Mean CP and CP CV Summary

Munge.

tp.educ.ms.c$group = "cue-associate"
tp.educ.ms.a$group = "associate-associate"
tp.educ = rbind(tp.educ.ms.c, tp.educ.ms.a)
tp.educ$pair = str_split_fixed(tp.educ$pair, "\\.",2)[,2]

tp.age.ms.c$group = "cue-associate"
tp.age.ms.a$group = "associate-associate"
tp.age = rbind(tp.age.ms.c, tp.age.ms.a)
tp.age$pair = str_split_fixed(tp.age$pair, "\\.",2)[,2]

educ.cv$pair = str_split_fixed(educ.cv$pair, "\\.",2)[,2]
age.cv$pair = str_split_fixed(age.cv$pair, "\\.",2)[,2]

Let’s also include log frequency in here just for reference.

subtlexus.url <- getURL("https://raw.githubusercontent.com/mllewis/RC/master/data/corpus/SUBTLEXus_corpus.txt")
freqs <- read.table(text = subtlexus.url, header=TRUE) %>%
          select(Word,Lg10WF)

freq.data = d.clean %>%
            left_join(freqs, by = c("cue" = "Word")) %>%
            rename(cue.freq.log = Lg10WF) %>%
            left_join(freqs, by = c("a1" = "Word")) %>%
            rename(a1.freq.log = Lg10WF) %>%
            left_join(freqs, by = c("a2" = "Word")) %>%
            rename(a2.freq.log = Lg10WF) %>%
            left_join(freqs, by = c("a3" = "Word")) %>%
            rename(a3.freq.log = Lg10WF)

freq.ms = freq.data %>%
               gather("association", "freq", 17:19) %>%
               group_by(userID) %>%
               summarise(mean.assoc.freq = mean(freq, na.rm = T))

freq.age.ms = freq.ms %>%
               left_join(freq.data %>% group_by(userID) %>% slice(1) 
                         %>% ungroup() %>% select(age, userID)) %>%
              filter(age > 15 & age < 75) %>%
              mutate(age.bin = cut_width(age, width = 10)) %>%
              group_by(age.bin) %>%
              summarise(log.freq = mean(mean.assoc.freq, na.rm = T)) %>%
              mutate(group = "associate") %>%
              gather("measure", "value", 2)

freq.ed.ms = freq.ms %>%
              left_join(freq.data %>% group_by(userID) %>% slice(1) 
                         %>% ungroup() %>% select(education, userID)) %>%
              group_by(education) %>%
              summarise(log.freq = mean(mean.assoc.freq, na.rm = T))%>%
              filter(education > 1) %>%
              mutate(group = "associate") %>%
              gather("measure", "value", 2)

Collapsing over associates

age.cp.dist.ms = left_join(tp.age, age.cv) %>%
              rename(cp.mean=mean,cp.cv=cv) %>%
              gather("measure", "value", c(3,5)) %>%
              group_by(group,age.bin,measure) %>%
              summarize(value = mean(value)) %>%
              rbind(freq.age.ms)

ggplot(age.cp.dist.ms, aes(x = age.bin, y = value, group = measure, color = measure)) +
  facet_grid(group~measure, scales = "free") +
  #geom_bar(stat = "identity", position = "dodge") +
  geom_line(size = 1.5, aes(lty = group)) +
  theme_bw(base_size = 18) +
  theme(legend.position="none") +
  theme(axis.text.x = element_text(angle = 90, hjust = 1))

educ.cp.dist.ms = left_join(tp.educ, educ.cv) %>%
                   rename(cp.cv=cv, cp.mean=mean) %>%
              gather("measure", "value", c(3,5))  %>%
              group_by(group,education,measure) %>%
              summarize(value = mean(value)) %>%
              rbind(freq.ed.ms)

ggplot(educ.cp.dist.ms, aes(x = education, y = value, group = measure, color = measure)) +
  facet_grid(group~measure, scales = "free") +
  #geom_bar(stat = "identity", position = "dodge") +
  geom_line(size = 1.5, aes(lty = group)) +
  theme_bw(base_size = 18) +
  theme(legend.position="none")

By associate

age.cp.dist = left_join(tp.age, age.cv) %>%
              gather("measure", "value", c(3,5))

ggplot(age.cp.dist, aes(x = age.bin, y = value, group = measure, color = measure)) +
  facet_grid(measure~pair, scales = "free") +
  #geom_bar(stat = "identity", position = "dodge") +
  geom_line(size = 1.5, aes(lty = group)) +
  theme_bw(base_size = 18) +
  theme(legend.position="none") +
  theme(axis.text.x = element_text(angle = 90, hjust = 1))

educ.cp.dist = left_join(tp.educ, educ.cv) %>%
              gather("measure", "value", c(3,5))

ggplot(educ.cp.dist, aes(x = education, y = value, group = measure, color = measure)) +
  facet_grid(measure~pair, scales = "free") +
  #geom_bar(stat = "identity", position = "dodge") +
  geom_line(size = 1.5, aes(lty = group)) +
  theme_bw(base_size = 18) +
  theme(legend.position="none")

Cue probabilities, relative to associative probabilities

[Pr(a1 | cue) + Pr(a2 | cue) + Pr(a3 | cue)] / [Pr(a2 | a1) + Pr(a3 | a1) + Pr(a3 | a2)]

This seems not terribly useful, since the cue pairs and associate pairs have such different baseline probabilities.
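To make the scale concrete, a worked example with hypothetical values:

# hypothetical conditional probabilities, just to illustrate the ratio
(0.10 + 0.06 + 0.04) / (0.03 + 0.01 + 0.01)  # = 0.20 / 0.05 = 4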

Education

relative.educ.ms = d.clean.bigram %>%
          gather("pair", "tp", 5:10) %>%
          group_by(pair, userID) %>%
          summarise(mean = mean(tp, na.rm = T)) %>%
          left_join(d.clean %>% group_by(userID) %>% slice(1) 
                               %>% ungroup() %>% select(education, userID)) %>%
          spread("pair", "mean")  %>%
          mutate(cue_drivenness = tp.cue_a1 + tp.cue_a2 + tp.cue_a3,
                 a_drivenness = tp.a1_a2 + tp.a2_a3 + tp.a1_a3) %>%
          select(-contains("tp.")) %>%
          mutate(relative_driven = cue_drivenness/a_drivenness) %>%
          group_by(education) %>%
          summarise(mean = mean(relative_driven, na.rm = T)) 

ggplot(relative.educ.ms, aes(y = mean, x = education, group = 1)) +
  geom_point() + 
  geom_line()+
  xlab("education") +
  ylab("relative cp")+
  theme_bw(base_size = 18)

Age

relative.age.ms = d.clean.bigram %>%
          gather("pair", "tp", 5:10) %>%
          group_by(pair, userID) %>%
          summarise(mean = mean(tp, na.rm = T)) %>%
          left_join(d.clean.bigram %>% group_by(userID) %>% slice(1) 
                         %>% ungroup() %>% select(age.bin, userID)) %>%
          spread("pair", "mean")  %>%
          mutate(cue_drivenness = tp.cue_a1 + tp.cue_a2 + tp.cue_a3,
                 a_drivenness = tp.a1_a2 + tp.a2_a3 + tp.a1_a3) %>%
          select(-contains("tp.")) %>%
          mutate(relative_driven = cue_drivenness/a_drivenness) %>%
          group_by(age.bin) %>%
          summarise(mean = mean(relative_driven, na.rm = T)) 

ggplot(relative.age.ms, aes(y = mean, x = age.bin, group = 1)) +
  geom_point() + 
  geom_line()+
  xlab("age") +
  ylab("relative cp")+
  theme_bw(base_size = 18)

Associations are less driven by the cue with age.

Within-group agreement

Here we calculate the probability that two randomly sampled participants given the same cue will produce the same first associate (a1). To do this, for each cue within an education level, we randomly sample 2 participants, N_PAIR_SAMPLED times. We then take the proportion of agreement across samples, and average across cues within each education level. (The same procedure is repeated for age bins below.)
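As a toy illustration of the agreement measure (made-up responses; the helper functions below implement the same idea on the real data):

toy.a1 = c("dog", "dog", "cat")  # hypothetical a1 responses to one cue
mean(replicate(1000, {
  pair = sample(toy.a1, 2, replace = TRUE)
  pair[1] == pair[2]
}))
# expected agreement (sampling with replacement): (2/3)^2 + (1/3)^2 = 5/9, about 0.56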

Education

one_sample <- function(a1) {
  function(k) {
    sample.is = sample(1:length(a1), 2, replace = T)
    a1[sample.is[1]] == a1[sample.is[2]] 
  }
}

all_samples <- function(a1, n_pair_samples) {
  sample_values <- 1:n_pair_samples %>%
    map(one_sample(a1)) %>%
    unlist()
   data.frame(prop = sum(sample_values)/length(sample_values))
}

N_PAIR_SAMPLED = 100
sim.responses = d.clean.bigram %>%
  select(1,4,11,12) %>%
  filter(a1 != "NA") %>%
  group_by(cue, education) %>%
  mutate(prop_similar = unlist(all_samples(a1, N_PAIR_SAMPLED))) %>%
  group_by(education) %>%
  summarise(prop_similar_cue_a1 = mean(prop_similar))

ggplot(sim.responses, aes(x = education, y = prop_similar_cue_a1, group = 1)) +
  geom_point() +
  geom_line()+
  theme_bw(base_size = 18)

Age

one_sample <- function(a1) {
  function(k) {
    sample.is = sample(1:length(a1), 2, replace = T)
    a1[sample.is[1]] == a1[sample.is[2]] 
  }
}

all_samples <- function(a1, n_pair_samples) {
  sample_values <- 1:n_pair_samples %>%
    map(one_sample(a1)) %>%
    unlist()
   data.frame(prop = sum(sample_values)/length(sample_values))
}

N_PAIR_SAMPLED = 100
sim.responses = d.clean.bigram %>%
  select(1,11:13) %>%
  filter(a1 != "NA") %>%
  group_by(cue, age.bin) %>%
  mutate(prop_similar = unlist(all_samples(a1, N_PAIR_SAMPLED))) %>%
  group_by(age.bin) %>%
  summarise(prop_similar_cue_a1 = mean(prop_similar))

ggplot(sim.responses, aes(x = age.bin, y = prop_similar_cue_a1, group = 1)) +
  geom_point() +
  geom_line()+
  theme_bw(base_size = 18)

Cohort Relationships

How are age and education related to each other?

Education and age are positively correlated (r = 0.2).
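A minimal check of this correlation (one row per participant; assumes age and education are numeric at this point):

d.clean %>%
  group_by(userID) %>%
  slice(1) %>%
  ungroup() %>%
  filter(education > 1) %>%
  summarise(r = cor(age, education, use = "complete.obs"))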

d.clean %>%
  group_by(userID) %>%
  slice(1) %>%
  ungroup() %>%
  filter(education >1) %>%
  mutate(education = as.factor(education)) %>%
  ggplot(aes(x = education, y= age, group = education, fill = education)) +
  geom_boxplot() +
  theme_bw(base_size = 18) +
  theme(legend.position="none")