To what extent does the cue drive the different associations (i.e., cue vs. a1, cue vs. a2, cue vs. a3), and how does this pattern vary across demographic groups?
Other questions:
Several measures:
I’ve omitted errorbars because bootstrapping is slow and almost every difference is significant.
d = read.csv("../data/associations_ppdetails_en_05_01_2015.csv")
d.clean = d %>%
filter(gender == "Ma"| gender == "Fe") %>%
filter(education > 0) %>%
filter(nativeLanguage != "") %>%
mutate(gender = droplevels(gender),
gender = plyr::revalue(gender,c("Fe" = "F", "Ma" = "M")),
userID = as.factor(userID),
nativeLanguage = as.factor(tolower(nativeLanguage)))
d.clean = d.clean %>%
gather("association", "word", 7:9) %>%
mutate(word = gsub("\\bx\\b", "NA", word)) %>% # remove missing words
spread("association", "word") %>%
rename(a1 = asso1Clean,
a2 = asso2Clean,
a3 = asso3Clean)
The conditional probability of w1-> w2 is, p(w2|w1) = count(w1-> w2)/count(w1).
There are at least two ways to calculcate these conditional probabilities: based on the full dataset, or across each subset of interest. I initially did it by subset but this makes it so you can’t make inferences across groups because of sparsity; the better way is probably just using the full dataset. This is how I’ve done it below.
Conditional probabilities are calculated for each pair – is this the right way to do this? i.e., I’ve counted up all the times that w1 -> w2 for each pairing (e.g. cue_a1), and calculated the conditional probabilities based on that.
Get full conditional probabilities.
# conditional probability function
get_trans_prob <- function(df, w1, w2) {
names(df)[which(names(df) == w1)] = "w1"
names(df)[which(names(df) == w2)] = "w2"
# remove NAs and get bigrams
df.f = filter(df, w1 != "NA" & w2 != "NA") %>%
mutate(bigram = paste(w1, w2))
# get counts of w1
w1.counts = df.f %>%
count(w1) %>%
rename(w1.counts = n)
# calculate trans prob [count(w1->w2)/count(w1)]
df.f %>%
count(bigram, w1) %>%
rename(joint.counts = n) %>%
left_join(w1.counts, by="w1") %>%
mutate(trans.prob = joint.counts/w1.counts) %>%
select(bigram,trans.prob) %>%
arrange(trans.prob) %>%
ungroup()
}
# get conditional probability pairs of intersest
perms = permutations(4, 2, c(0:3)) %>%
as.data.frame() %>%
rename(w1 = V1, w2 = V2) %>%
filter(w1 < w2) %>%
mutate(w1 = as.factor(w1),
w2 = as.factor(w2),
w1 = plyr::mapvalues(w1, from = c("0", "1", "2", "3"), to = c("cue", "a1", "a2", "a3")),
w2 = plyr::mapvalues(w2, from = c("0", "1", "2", "3"), to = c("cue", "a1", "a2", "a3")),
pair = paste(w1, w2, sep = "_"))
# get all conditional probabilities
all.cb = pmap(list(as.list(perms$w1), as.list(perms$w2), as.list(perms$pair)),
function(x, y, z) {
get_trans_prob(d.clean, x[[1]], y[[1]]) %>%
mutate(pair = z)}) %>%
bind_rows() %>%
mutate(pair = as.factor(pair))
Conditional probability distributions
ggplot(all.cb, aes(x = trans.prob, group = pair, fill = pair)) +
geom_density(alpha = .4) +
xlab("conditional probability") +
theme_bw(base_size = 18)
ggplot(filter(all.cb, trans.prob<.03), aes(x = trans.prob, group = pair, fill = pair)) +
geom_density(alpha = .4) +
xlab("conditional probability") +
theme_bw(base_size = 18)
ggplot(filter(all.cb, trans.prob<.03),
aes(y = trans.prob, x = pair, fill = pair)) +
geom_boxplot(alpha = .4) +
ylab("conditional probability") +
theme_bw(base_size = 18)
This is a sanity check on the conditional probabilities. Cues drive associations more than association associations (i.e. p(a|cue) > p(a|a)). But – the order of p(a|cue) is surprising; I think this is because the mass is somewhat bimodal (the means, below, make sense).
Merge in bigrams
d.clean = d.clean %>%
mutate(b.cue_a1 = paste(cue, a1),
b.cue_a2 = paste(cue, a2),
b.cue_a3 = paste(cue, a3),
b.a1_a2 = paste(a1, a2),
b.a2_a3 = paste(a2, a3),
b.a1_a3 = paste(a1, a3)) %>%
mutate_each(funs(ifelse(grepl("NA",.),"NA",.)), b.cue_a1:b.a1_a3) # remove NA
# merge in bigram probabilities
d.clean.bigram = left_join(d.clean,
filter(all.cb, pair == "cue_a1") %>% select(-pair),
by=c("b.cue_a1" = "bigram")) %>%
rename(tp.cue_a1 = trans.prob) %>%
left_join(filter(all.cb, pair == "cue_a2") %>% select(-pair),
by=c("b.cue_a2" = "bigram")) %>%
rename(tp.cue_a2 = trans.prob) %>%
left_join(filter(all.cb, pair == "cue_a3") %>% select(-pair),
by=c("b.cue_a3" = "bigram")) %>%
rename(tp.cue_a3 = trans.prob) %>%
left_join(filter(all.cb, pair == "a1_a2") %>% select(-pair),
by=c("b.a1_a2" = "bigram")) %>%
rename(tp.a1_a2 = trans.prob) %>%
left_join(filter(all.cb, pair == "a2_a3") %>% select(-pair),
by=c("b.a2_a3" = "bigram")) %>%
rename(tp.a2_a3 = trans.prob) %>%
left_join(filter(all.cb, pair == "a1_a3") %>% select(-pair),
by=c("b.a1_a3" = "bigram")) %>%
rename(tp.a1_a3 = trans.prob) %>%
select(userID, age, gender, education, contains("tp."), a1, cue)
tp.full.ms = d.clean.bigram %>%
gather("pair", "tp", 5:7) %>%
group_by(userID, pair) %>%
summarise(mean = mean(tp, na.rm = T)) %>%
group_by(pair) %>%
summarise(mean = mean(mean, na.rm = T))
ggplot(tp.full.ms, aes(y = mean, x = pair, group = 1)) +
geom_point() +
geom_line()+
xlab("pair") +
ylab("mean cp") +
theme_bw(base_size = 18)
tp.full.ms = d.clean.bigram %>%
gather("pair", "tp", 8:10) %>%
group_by(userID, pair) %>%
summarise(mean = mean(tp, na.rm = T)) %>%
group_by(pair) %>%
summarise(mean = mean(mean, na.rm = T))
ggplot(tp.full.ms, aes(y = mean, x = pair, group = 1)) +
geom_point() +
geom_line()+
ylab("mean cp") +
xlab("pair") +
theme_bw(base_size = 18)
tp.educ.ms.c = filter(d.clean.bigram, education > 1) %>%
gather("pair", "tp", 5:7) %>%
group_by(userID, pair) %>%
summarise(mean = mean(tp, na.rm = T)) %>%
left_join(d.clean %>% group_by(userID) %>% slice(1)
%>% ungroup() %>% select(education, userID)) %>%
group_by(pair, education) %>%
summarise(mean = mean(mean, na.rm = T)) %>%
ungroup() %>%
mutate(education = as.factor(education))
ggplot(tp.educ.ms.c, aes(y = mean, x = pair, group = education, color = education)) +
geom_point() +
geom_line() +
xlab("pair") +
ylab("mean cp")+
theme_bw(base_size = 18)
ggplot(tp.educ.ms.c, aes(y = mean, x = education, group = pair, color = pair)) +
geom_point() +
geom_line() +
xlab("education") +
ylab("mean cp")+
theme_bw(base_size = 18)
tp.educ.ms.a = filter(d.clean.bigram, education > 1) %>%
gather("pair", "tp", 8:10) %>%
group_by(userID, pair) %>%
summarise(mean = mean(tp, na.rm = T)) %>%
left_join(d.clean %>% group_by(userID) %>% slice(1)
%>% ungroup() %>% select(education, userID)) %>%
group_by(pair, education) %>%
summarise(mean = mean(mean, na.rm = T)) %>%
ungroup() %>%
mutate(education = as.factor(education))
ggplot(tp.educ.ms.a, aes(y = mean, x = pair, group = education, color = education)) +
geom_point() +
geom_line()+
xlab("pair") +
ylab("mean cp")+
theme_bw(base_size = 18)
ggplot(tp.educ.ms.a, aes(y = mean, x = education, group = pair, color = pair)) +
geom_point() +
geom_line() +
xlab("education") +
ylab("mean cp") +
theme_bw(base_size = 18)
d.pos.age = d.clean.bigram %>%
filter(age > 15 & age < 75) %>%
mutate(age.bin = cut_width(age, width = 10))
d.clean.bigram = d.clean.bigram %>%
left_join(d.pos.age %>% group_by(userID) %>% slice(1)
%>% ungroup() %>% select(age.bin, userID)) %>%
filter(!is.na(age.bin))
tp.age.ms.c = gather(d.clean.bigram, "pair", "tp", 5:7) %>%
group_by(pair, userID) %>%
summarise(mean = mean(tp, na.rm = T)) %>%
left_join(d.clean.bigram %>% group_by(userID) %>% slice(1)
%>% ungroup() %>% select(age.bin, userID)) %>%
group_by(pair, age.bin) %>%
summarise(mean = mean(mean, na.rm = T))
ggplot(tp.age.ms.c, aes(y = mean, x = pair, group = age.bin, color = age.bin)) +
geom_point() +
geom_line() +
xlab("pair") +
ylab("mean cp")+
theme_bw(base_size = 18)
ggplot(tp.age.ms.c, aes(y = mean, x = age.bin, group = pair, color = pair)) +
geom_point() +
geom_line() +
xlab("age bin") +
ylab("mean cp")+
theme_bw(base_size = 18)
tp.age.ms.a = gather(d.clean.bigram, "pair", "tp", 8:10) %>%
group_by(pair, userID) %>%
summarise(mean = mean(tp, na.rm = T)) %>%
left_join(d.clean.bigram %>% group_by(userID) %>% slice(1)
%>% ungroup() %>% select(age.bin, userID)) %>%
group_by(pair, age.bin) %>%
summarise(mean = mean(mean, na.rm = T))
ggplot(tp.age.ms.a, aes(y = mean, x = pair, group = age.bin, color = age.bin)) +
geom_point() +
geom_line() +
xlab("pair") +
ylab("mean cp")+
theme_bw(base_size = 18)
ggplot(tp.age.ms.a, aes(y = mean, x = age.bin, group = pair, color = pair)) +
geom_point() +
geom_line() +
xlab("age.bin") +
ylab("mean cp")+
theme_bw(base_size = 18)
As you get older, the associations more coherent with eachother.
# Coeffificent of variation (log distribution); from: https://en.wikipedia.org/wiki/Coefficient_of_variation
cv_log <- function(probs) {
var.probs.log = var(log(probs), na.rm = T)
sqrt((exp(1)^var.probs.log)-1)
}
bigram.counts.educ = d.clean %>%
gather("pair", "bigram", 10:15) %>%
mutate(pair = as.factor(pair)) %>%
filter(bigram != "NA") %>%
count(bigram, pair, education) %>%
ungroup()
educ.cv = bigram.counts.educ %>%
filter(education > 1) %>%
mutate(education = as.factor(education)) %>%
group_by(pair, education) %>%
summarize(cv = cv_log(n)) %>%
ungroup()
ggplot(educ.cv, aes(y = cv, x = pair, group = education, color = education)) +
geom_point() +
geom_line() +
xlab("pair") +
ylab("mean cv")+
theme_bw(base_size = 18) +
theme(axis.text.x = element_text(angle = 90, hjust = 1))
ggplot(educ.cv, aes(y = cv, x = education, group = pair, color = pair)) +
geom_point() +
geom_line() +
xlab("education") +
ylab("mean cv")+
theme_bw(base_size = 18)
bigram.counts.age = d.clean %>%
gather("pair", "bigram", 10:15) %>%
left_join(d.clean.bigram %>% group_by(userID) %>% slice(1)
%>% ungroup() %>% select(age.bin, userID)) %>%
mutate(pair = as.factor(pair)) %>%
filter(bigram != "NA") %>%
filter(age.bin != "NA") %>%
count(bigram, pair, age.bin) %>%
ungroup()
age.cv = bigram.counts.age %>%
group_by(pair, age.bin) %>%
summarize(cv = cv_log(n)) %>%
ungroup()
ggplot(age.cv, aes(y = cv, x = pair, group = age.bin, color = age.bin)) +
geom_point() +
geom_line() +
xlab("pair") +
ylab("mean cv")+
theme_bw(base_size = 18) +
theme(axis.text.x = element_text(angle = 90, hjust = 1))
ggplot(age.cv, aes(y = cv, x = age.bin, group = pair, color = pair)) +
geom_point() +
geom_line() +
xlab("age.bin") +
ylab("mean cv")+
theme_bw(base_size = 18)
Munge.
tp.educ.ms.c$group = "cue-associate"
tp.educ.ms.a$group = "associate-associate"
tp.educ = rbind(tp.educ.ms.c, tp.educ.ms.a)
tp.educ$pair = str_split_fixed(tp.educ$pair, "\\.",2)[,2]
tp.age.ms.c$group = "cue-associate"
tp.age.ms.a$group = "associate-associate"
tp.age = rbind(tp.age.ms.c, tp.age.ms.a)
tp.age$pair = str_split_fixed(tp.age$pair, "\\.",2)[,2]
educ.cv$pair = str_split_fixed(educ.cv$pair, "\\.",2)[,2]
age.cv$pair = str_split_fixed(age.cv$pair, "\\.",2)[,2]
Let’s also include log frequency in here just for reference.
subtlexus.url <- getURL("https://raw.githubusercontent.com/mllewis/RC/master/data/corpus/SUBTLEXus_corpus.txt")
freqs <- read.table(text = subtlexus.url, header=TRUE) %>%
select(Word,Lg10WF)
freq.data = d.clean %>%
left_join(freqs, by = c("cue" = "Word")) %>%
rename(cue.freq.log = Lg10WF) %>%
left_join(freqs, by = c("a1" = "Word")) %>%
rename(a1.freq.log = Lg10WF) %>%
left_join(freqs, by = c("a2" = "Word")) %>%
rename(a2.freq.log = Lg10WF) %>%
left_join(freqs, by = c("a3" = "Word")) %>%
rename(a3.freq.log = Lg10WF)
freq.ms = freq.data %>%
gather("association", "freq", 17:19) %>%
group_by(userID) %>%
summarise(mean.assoc.freq = mean(freq, na.rm = T))
freq.age.ms = freq.ms %>%
left_join(freq.data %>% group_by(userID) %>% slice(1)
%>% ungroup() %>% select(age, userID)) %>%
filter(age > 15 & age < 75) %>%
mutate(age.bin = cut_width(age, width = 10)) %>%
group_by(age.bin) %>%
summarise(log.freq = mean(mean.assoc.freq, na.rm = T)) %>%
mutate(group = "associate") %>%
gather("measure", "value", 2)
freq.ed.ms = freq.ms %>%
left_join(freq.data %>% group_by(userID) %>% slice(1)
%>% ungroup() %>% select(education, userID)) %>%
group_by(education) %>%
summarise(log.freq = mean(mean.assoc.freq, na.rm = T))%>%
filter(education > 1) %>%
mutate(group = "associate") %>%
gather("measure", "value", 2)
age.cp.dist.ms = left_join(tp.age, age.cv) %>%
rename(cp.mean=mean,cp.cv=cv) %>%
gather("measure", "value", c(3,5)) %>%
group_by(group,age.bin,measure) %>%
summarize(value = mean(value)) %>%
rbind(freq.age.ms)
ggplot(age.cp.dist.ms, aes(x = age.bin, y = value, group = measure, color = measure)) +
facet_grid(group~measure, scales = "free") +
#geom_bar(stat = "identity", position = "dodge") +
geom_line(size = 1.5, aes(lty = group)) +
theme_bw(base_size = 18) +
theme(legend.position="none") +
theme(axis.text.x = element_text(angle = 90, hjust = 1))
educ.cp.dist.ms = left_join(tp.educ, educ.cv) %>%
rename(cp.cv=cv, cp.mean=mean) %>%
gather("measure", "value", c(3,5)) %>%
group_by(group,education,measure) %>%
summarize(value = mean(value)) %>%
rbind(freq.ed.ms)
ggplot(educ.cp.dist.ms, aes(x = education, y = value, group = measure, color = measure)) +
facet_grid(group~measure, scales = "free") +
#geom_bar(stat = "identity", position = "dodge") +
geom_line(size = 1.5, aes(lty = group)) +
theme_bw(base_size = 18) +
theme(legend.position="none")
age.cp.dist = left_join(tp.age, age.cv) %>%
gather("measure", "value", c(3,5))
ggplot(age.cp.dist, aes(x = age.bin, y = value, group = measure, color = measure)) +
facet_grid(measure~pair, scales = "free") +
#geom_bar(stat = "identity", position = "dodge") +
geom_line(size = 1.5, aes(lty = group)) +
theme_bw(base_size = 18) +
theme(legend.position="none") +
theme(axis.text.x = element_text(angle = 90, hjust = 1))
educ.cp.dist = left_join(tp.educ, educ.cv) %>%
gather("measure", "value", c(3,5))
ggplot(educ.cp.dist, aes(x = education, y = value, group = measure, color = measure)) +
facet_grid(measure~pair, scales = "free") +
#geom_bar(stat = "identity", position = "dodge") +
geom_line(size = 1.5, aes(lty = group)) +
theme_bw(base_size = 18) +
theme(legend.position="none")
[Pr(a1 | cue) + Pr(a2 | cue) + Pr(a3 | cue)] / [Pr(a2 | a1) + Pr(a3 | a1) + Pr(a3 | a2)]
This seems not terribly useful since there are such different baselines in associative probabilities.
relative.educ.ms = d.clean.bigram %>%
gather("pair", "tp", 5:10) %>%
group_by(pair, userID) %>%
summarise(mean = mean(tp, na.rm = T)) %>%
left_join(d.clean %>% group_by(userID) %>% slice(1)
%>% ungroup() %>% select(education, userID)) %>%
spread("pair", "mean") %>%
mutate(cue_drivenness = tp.cue_a1 + tp.cue_a2 + tp.cue_a3,
a_drivenness = tp.a1_a2 + tp.a2_a3 + tp.a1_a3) %>%
select(-contains("tp.")) %>%
mutate(relative_driven = cue_drivenness/a_drivenness) %>%
group_by(education) %>%
summarise(mean = mean(relative_driven, na.rm = T))
ggplot(relative.educ.ms, aes(y = mean, x = education, group = 1)) +
geom_point() +
geom_line()+
xlab("education") +
ylab("relative cp")+
theme_bw(base_size = 18)
relative.age.ms = d.clean.bigram %>%
gather("pair", "tp", 5:10) %>%
group_by(pair, userID) %>%
summarise(mean = mean(tp, na.rm = T)) %>%
left_join(d.clean.bigram %>% group_by(userID) %>% slice(1)
%>% ungroup() %>% select(age.bin, userID)) %>%
spread("pair", "mean") %>%
mutate(cue_drivenness = tp.cue_a1 + tp.cue_a2 + tp.cue_a3,
a_drivenness = tp.a1_a2 + tp.a2_a3 + tp.a1_a3) %>%
select(-contains("tp.")) %>%
mutate(relative_driven = cue_drivenness/a_drivenness) %>%
group_by(age.bin) %>%
summarise(mean = mean(relative_driven, na.rm = T))
ggplot(relative.age.ms, aes(y = mean, x = age.bin, group = 1)) +
geom_point() +
geom_line()+
xlab("age") +
ylab("relative cp")+
theme_bw(base_size = 18)
Less driven by cue with age.
Here we calculate the probability that two randomly sampled participants who are given the same cue, will produce the same first associate (a1). To do this, we randomly sample 2 participants x N_PAIR_SAMPLED for each cue with an education level. We then take the proportion agreement across samples, and then average across cues within an education level.
one_sample <- function(a1) {
function(k) {
sample.is = sample(1:length(a1), 2, replace = T)
a1[sample.is[1]] == a1[sample.is[2]]
}
}
all_samples <- function(a1, n_pair_samples) {
sample_values <- 1:n_pair_samples %>%
map(one_sample(a1)) %>%
unlist()
data.frame(prop = sum(sample_values)/length(sample_values))
}
N_PAIR_SAMPLED = 100
sim.responses = d.clean.bigram %>%
select(1,4,11,12) %>%
filter(a1 != "NA") %>%
group_by(cue, education) %>%
mutate(prop_similar = unlist(all_samples(a1, N_PAIR_SAMPLED))) %>%
group_by(education) %>%
summarise(prop_similar_cue_a1 = mean(prop_similar))
ggplot(sim.responses, aes(x = education, y = prop_similar_cue_a1, group = 1)) +
geom_point() +
geom_line()+
theme_bw(base_size = 18)
one_sample <- function(a1) {
function(k) {
sample.is = sample(1:length(a1), 2, replace = T)
a1[sample.is[1]] == a1[sample.is[2]]
}
}
all_samples <- function(a1, n_pair_samples) {
sample_values <- 1:n_pair_samples %>%
map(one_sample(a1)) %>%
unlist()
data.frame(prop = sum(sample_values)/length(sample_values))
}
N_PAIR_SAMPLED = 100
sim.responses = d.clean.bigram %>%
select(1,11:13) %>%
filter(a1 != "NA") %>%
group_by(cue, age.bin) %>%
mutate(prop_similar = unlist(all_samples(a1, N_PAIR_SAMPLED))) %>%
group_by(age.bin) %>%
summarise(prop_similar_cue_a1 = mean(prop_similar))
ggplot(sim.responses, aes(x = age.bin, y = prop_similar_cue_a1, group = 1)) +
geom_point() +
geom_line()+
theme_bw(base_size = 18)
How are age and education related to each other?
Education and age are positively correlated (r = 0.2).
d.clean %>%
group_by(userID) %>%
slice(1) %>%
ungroup() %>%
filter(education >1) %>%
mutate(education = as.factor(education)) %>%
ggplot(aes(x = education, y= age, group = education, fill = education)) +
geom_boxplot() +
theme_bw(base_size = 18) +
theme(legend.position="none")