# Load the raw association file (English data with participant details,
# going by the file name) into d.
d <- read.csv(file = "/Documents/GRADUATE_SCHOOL/Projects/SEMSOC/data/associations_ppdetails_en_05_01_2015.csv")

# Build d.clean: recode the missing-response placeholder in the three
# association columns and give those columns short names (a1, a2, a3).
# Columns 7:9 of the raw data are stacked into long form so a single recode
# covers all three, then spread back to one column per association slot.
d.clean <- d %>%
  gather("association", "word", 7:9) %>%
  # A response consisting only of "x" marks a missing word. Match the whole
  # response rather than "x" as a token inside it (which would turn "x ray"
  # into "NA ray"), and store a real NA instead of the string "NA" so that
  # is.na() and na.rm work downstream.
  mutate(word = as.character(word),
         word = ifelse(grepl("^\\s*x\\s*$", word), NA_character_, word)) %>%
  spread("association", "word") %>%
  # NOTE: spread() appends the re-widened columns after all other columns, so
  # they may no longer sit at positions 7:9; refer to them by name from here.
  rename(a1 = asso1Clean,
         a2 = asso2Clean,
         a3 = asso3Clean)

These data are from the Small World of Words project. In the task, participants are given a cue and asked to generate 3 associates. Each participant completes 15-19 trials. We have 73256 participants. There are 10050 distinct cues.

Number of participants per cue.

# Count the rows of d.clean for each cue: one output row per cue, with the
# count stored in column n.
word.summary <- summarise(group_by(d.clean, cue), n = n())

# Bar chart of the per-cue counts, with cues ordered from largest to smallest n.
# The dashed red line marks the mean count across cues. Individual cue labels
# and ticks are blanked on the x axis.
ggplot(data = word.summary,
       mapping = aes(x = reorder(cue, -n), y = n)) +
  geom_bar(stat = "identity") +
  geom_hline(yintercept = mean(word.summary$n),
             colour = "red", linetype = "longdash") +
  labs(x = "distinct cue words", y = "n trials") +
  theme_bw(base_size = 12) +
  theme(axis.text.x = element_blank(),
        axis.ticks = element_blank(),
        legend.position = "none")

# Distribution of the per-cue counts (default histogram binning).
ggplot(data = word.summary, mapping = aes(x = n)) +
  geom_histogram() +
  theme_bw()

Number of unique associates per cue.

# Count the distinct associates produced for each cue: one output row per cue,
# with the number of distinct associates in column n.
# The association columns are selected by name. The gather()/spread() round
# trip that builds d.clean re-appends them after every other column, so
# positions 7:9 only point at them when the raw file has exactly nine columns.
# NOTE(review): the missing-response marker, where present for a cue, is still
# counted as one distinct associate here -- confirm whether that is intended.
associate.counts.per.cue <- d.clean %>%
  gather("associate_type", "associate", a1, a2, a3) %>%
  select(cue, associate) %>%
  distinct() %>%
  group_by(cue) %>%
  summarise(n = n())

# Distribution of the number of distinct associates per cue (default binning).
ggplot(data = associate.counts.per.cue, mapping = aes(x = n)) +
  geom_histogram() +
  theme_bw()

So, for each cue, we have a mean of 106.8882587 responses from participants and 124.259801 unique words.

Sample distributions of associations, for particular cues.

# Count how many times each associate was given for each cue: one output row
# per (cue, associate) pair, with the count in column n.
# The association columns are selected by name. The gather()/spread() round
# trip that builds d.clean re-appends them after every other column, so
# positions 7:9 only point at them when the raw file has exactly nine columns.
# The result stays grouped by cue, because summarise() drops only the last
# grouping level.
associate.counts <- d.clean %>%
  gather("associate_type", "associate", a1, a2, a3) %>%
  select(cue, associate) %>%
  group_by(cue, associate) %>%
  summarise(n = n())

# Histogram of how often each associate was given for a single cue.
#   cue_word: the cue to plot, matched exactly against the cue column.
#   counts:   per-(cue, associate) counts with columns cue and n.
# Returns the ggplot object; when called at top level it is printed
# automatically. The title is derived from cue_word so it always matches the
# cue actually plotted.
plot_cue_associates <- function(cue_word, counts = associate.counts) {
  ggplot(filter(counts, cue == cue_word), aes(x = n)) +
    geom_histogram() +
    ggtitle(paste("Cue:", cue_word)) +
    theme_bw()
}

plot_cue_associates("abacus")

plot_cue_associates("zucchini")

plot_cue_associates("abandon")

plot_cue_associates("abbreviation")