library(dplyr)   # pipes, mutate, summarise, ...
library(tidyr)   # gather, spread
library(ggplot2) # plots
library(entropy) # plugin entropy estimates
library(broom)   # tidy()
library(knitr)   # kable()

d = read.csv("/Documents/GRADUATE_SCHOOL/Projects/SEMSOC/data/associations_ppdetails_en_05_01_2015.csv")
d.clean = d %>%
  gather("association", "word", 7:9) %>%
  mutate(word = gsub("\\bx\\b", "NA", word)) %>% # recode missing responses (coded "x") as NA
  spread("association", "word") %>%
  rename(a1 = asso1Clean,
         a2 = asso2Clean,
         a3 = asso3Clean)
cues = d.clean %>%
  gather("associate_type", "associate", 7:9) %>%
  distinct(cue)
These data are from the Small World of Words project. In the task, participants are given a cue and asked to generate three associates. Each participant completes 15-19 trials. The dataset contains 73,256 participants and 10,050 distinct cues.
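As a quick sanity check on those counts (a sketch; the participant-ID column name below is an assumption, since the code above references columns only by position):
# n_distinct(d$userID) # should return 73256 ("userID" is an assumed column name)
n_distinct(d.clean$cue) # should return 10050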
Below are sample distributions of associate counts for particular cues.
associate.counts = d.clean %>%
  gather("associate_type", "associate", 7:9) %>%
  select(cue, associate) %>%
  group_by(cue, associate) %>%
  summarise(n = n())
ggplot(filter(associate.counts, cue == "abacus"), aes(x = n)) +
  geom_histogram() +
  ggtitle("Cue: abacus") +
  theme_bw()

ggplot(filter(associate.counts, cue == "zucchini"), aes(x = n)) +
  geom_histogram() +
  ggtitle("Cue: zucchini") +
  theme_bw()
Save data for the SEMSOC_entropy.py script.
# First, create a dictionary mapping each word to an integer
word.to.int.dict = d.clean %>%
  gather("associate_type", "associate", 7:9) %>%
  select(associate) %>%
  distinct(associate) %>%
  mutate(word.int = 1:n())
#write.csv(word.to.int.dict, "word.to.int.dict.csv")
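A quick lookup shows how the dictionary is meant to be used ("apple" is just an illustrative word; any associate in the data works):
word.to.int.dict %>%
  filter(associate == "apple") # returns that word's integer code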
all.associates.for.py = d.clean %>%
  gather("associate_type", "associate", 7:9) %>%
  select(cue, associate) %>%
  left_join(word.to.int.dict) %>%
  mutate(ca = paste(cue, associate)) %>%
  mutate(n2 = 1:n())
# m = spread(all.associates.for.py, cue, word.int) # this doesn't work; not enough memory
all.associates.for.py = d.clean %>%
  gather("associate_type", "associate", 7:9) %>%
  select(cue, associate) %>%
  left_join(word.to.int.dict) %>%
  select(-associate)
for (i in 1:nrow(cues)) {
  my.df = filter(all.associates.for.py, cue == cues$cue[i])
  word = as.character(my.df$cue[1]) # this is necessary because there's a cue named "cue"
  my.df = select(my.df, -cue)
  names(my.df)[1] = word
  if (i == 1) {
    all.df = my.df
  } else {
    all.df = cbind(all.df, my.df)
  }
  if (i == 100 | i == 500 | i == 5000) {print(i)} # progress markers
}
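A sketch of an equivalent without the loop: split() the integer codes by cue and bind the pieces in one step. Like the cbind above, this assumes every cue has the same number of responses.
# all.df = data.frame(split(all.associates.for.py$word.int,
#                           all.associates.for.py$cue),
#                     check.names = FALSE) # keep raw cue strings as column names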
#write.csv(t(all.df), "JVHW_entropy/JVHW_v3_python/t.all_associates.csv", row.names = FALSE)
#write.csv(t(all.df[,1:5]), "JVHW_entropy/JVHW_v3_python/t.small_associates.csv", row.names = FALSE) # each row is a cue
# JVHW.csv holds the Jiao-Venkat-Han-Weissman (JVHW) minimax entropy
# estimates produced by the SEMSOC_entropy.py script
JVHWent = read.csv("JVHW_entropy/JVHW_v3_python/JVHW.csv")
names(JVHWent) = "JVHW_entropy"
# get the old (plugin) entropies for comparison
old.entropy = d.clean %>%
  gather("associate_type", "associate", 7:9) %>%
  select(cue, associate) %>%
  group_by(cue, associate) %>%
  summarise(n = n()) %>%
  ungroup() %>%
  group_by(cue) %>%
  summarize(entropy = entropy(n)) %>%
  ungroup() %>%
  mutate(log.length = # log character length of the cue, with spaces and punctuation stripped
           log(nchar(gsub(" ", "", gsub("[[:punct:]]", "", as.character(cue))))))
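For reference, entropy() with its defaults (method = "ML", natural log) is the plugin estimate; a minimal sketch of what it computes from a vector of counts:
# Plugin (maximum-likelihood) entropy in nats, as entropy(n) returns by default
plugin.entropy = function(n) {
  p = n / sum(n)
  -sum(p * log(p))
}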
ent.df = cbind(cues, JVHWent) %>%
  left_join(old.entropy)
The two entropy measures are very highly correlated; JVHW has larger variance.
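A quick check of the variance claim (a sketch over the same ent.df; the plugin column is named "entropy"):
ent.df %>%
  summarise(var.jvhw   = var(JVHW_entropy, na.rm = TRUE),
            var.plugin = var(entropy, na.rm = TRUE))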
ent.df %>%
  filter(entropy > 2) %>%
  gather(entropy_measure, entropy_value, 2:3) %>%
  ggplot(aes(x = entropy_value, fill = entropy_measure)) +
  facet_wrap(~entropy_measure) +
  geom_histogram() +
  theme_bw() +
  theme(legend.position = "none")
ggplot(ent.df, aes(x = JVHW_entropy, y = entropy)) +
  geom_point() +
  geom_smooth(method = "lm") +
  theme_bw()
tidy(cor.test(ent.df$JVHW_entropy, ent.df$entropy)) %>%
  select(-parameter, -method, -conf.low, -conf.high, -alternative) %>%
  kable()
| estimate | statistic | p.value |
|---|---|---|
| 0.985063 | 573.4359 | 0 |