d = read.csv("/Documents/GRADUATE_SCHOOL/Projects/SEMSOC/data/associations_ppdetails_en_05_01_2015.csv")

d.clean = d %>%
  gather("association", "word", 7:9) %>%
  mutate(word = gsub("\\bx\\b", "NA", word)) %>% # recode the missing-response placeholder "x" as "NA"
  spread("association", "word") %>%
  rename(a1 = asso1Clean,
         a2 = asso2Clean,
         a3 = asso3Clean)

cues = d.clean %>%
  gather("associate_type", "associate", 7:9) %>%
  distinct(cue) 

These data are from the Small World of Words project. In the task, participants are given a cue and asked to generate three associates. Each participant completes 15-19 trials. There are 73,256 participants and 10,050 distinct cues.
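A quick way to verify those counts against the cleaned data (a minimal sketch; the participant column name participantID is an assumption about the raw file and may need adjusting):

d.clean %>%
  summarise(n_participants = n_distinct(participantID), # assumed column name
            n_cues = n_distinct(cue))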

Compare distribution of associates across cues

Sample distributions of associations for particular cues.

associate.counts = d.clean %>%
  gather("associate_type", "associate", 7:9)  %>%
  select(cue, associate) %>%
  group_by(cue, associate) %>%
  summarise(n = n())

ggplot(filter(associate.counts, cue == "abacus"), aes(x = n)) + 
geom_histogram() + 
ggtitle("Cue: Abacus") +
theme_bw()

ggplot(filter(associate.counts, cue == "zucchini"), aes(x = n)) + 
geom_histogram() + 
ggtitle("Cue: zucchini") +
theme_bw()

Save data for the SEMSOC_entropy.py script.

# First, create a dictionary mapping each word to an integer
word.to.int.dict = d.clean %>%
  gather("associate_type", "associate", 7:9)  %>%
  select(associate) %>%
  distinct(associate) %>%
  mutate(word.int = row_number()) # integer id for each distinct associate
#write.csv(word.to.int.dict, "word.to.int.dict.csv")

all.associates.for.py = d.clean %>%
  gather("associate_type", "associate", 7:9) %>%
  select(cue, associate) %>% 
  left_join(word.to.int.dict) %>%
  mutate(ca = paste(cue,associate)) %>%
  mutate(n2 = 1:n()) 

# m = spread(all.associates.for.py, cue, word.int) # this doesn't work; not enough memory

all.associates.for.py = d.clean %>%
  gather("associate_type", "associate", 7:9) %>%
  select(cue, associate) %>% 
  left_join(word.to.int.dict) %>%
  select(-associate)



for (i in 1:nrow(cues)){
  my.df = filter(all.associates.for.py, cue == cues$cue[i])
  word = as.character(my.df$cue[1]) # this is necessary because there's a cue named "cue"
  my.df = select(my.df, -cue)
  names(my.df)[1] = word

  if (i == 1) {
    all.df = my.df
  } else {
    all.df = cbind(all.df, my.df)
  }

  if (i %in% c(100, 500, 5000)) print(i) # progress check
}
#write.csv(t(all.df), "JVHW_entropy/JVHW_v3_python/t.all_associates.csv", row.names = FALSE)
#write.csv(t(all.df[,1:5]), "JVHW_entropy/JVHW_v3_python/t.small_associates.csv", row.names = FALSE) # each row is a cue
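
As a design note, the cbind loop above assumes every cue contributes the same number of rows; a memory-friendlier sketch that skips the wide intermediate is below (assuming the downstream JVHW script can read ragged comma-separated rows; the output file name is hypothetical):

# one output line per cue: the cue word followed by its integer-coded associates
ids.by.cue = split(all.associates.for.py$word.int, all.associates.for.py$cue)
out.lines = vapply(names(ids.by.cue),
                   function(w) paste(c(w, ids.by.cue[[w]]), collapse = ","),
                   character(1))
# writeLines(out.lines, "JVHW_entropy/JVHW_v3_python/t.all_associates_ragged.csv")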

Merge in JVHW entropies

JVHWent = read.csv("JVHW_entropy/JVHW_v3_python/JVHW.csv")
names(JVHWent) = "JVHW_entropy"

# get old entropies
old.entropy = d.clean %>%
    gather("associate_type", "associate", 7:9)  %>%
    select(cue, associate) %>%
    group_by(cue, associate) %>%
    summarise(n = n()) %>%
    ungroup() %>%
    group_by(cue) %>%
    summarize(entropy = entropy(n)) %>%
    ungroup() %>%
    # log cue length, ignoring spaces and punctuation
    mutate(log.length = log(nchar(gsub(" ", "", gsub("[[:punct:]]", "", as.character(cue))))))
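
For reference, entropy(n) is presumably the plug-in (maximum-likelihood) Shannon entropy from the entropy package, computed over the associate counts for each cue; a minimal hand computation on made-up counts shows what it does:

# plug-in Shannon entropy from a vector of associate counts (illustrative counts only)
n = c(10, 5, 3, 1, 1)
p = n / sum(n)
-sum(p * log(p)) # in nats; should match entropy::entropy(n) with its default ML method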

ent.df = cbind(cues, JVHWent) %>%
  left_join(old.entropy)

Compare two measures of entropy

The two measures are very highly correlated; JVHW_entropy has the larger variance.
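
A quick check of the variance claim (a minimal sketch over the merged data frame):

ent.df %>%
  summarise(var_JVHW = var(JVHW_entropy, na.rm = TRUE),
            var_old = var(entropy, na.rm = TRUE))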

ent.df %>%
  filter(entropy > 2) %>%
  gather(entropy_measure, entropy_value, 2:3) %>%
  ggplot(aes(x = entropy_value, fill = entropy_measure)) +
  facet_wrap(~entropy_measure) + 
  geom_histogram()+
  theme_bw() +
  theme(legend.position="none")

ggplot(ent.df, aes(x = JVHW_entropy, y = entropy)) +
  geom_point()+
  geom_smooth(method = "lm") +
  theme_bw()

tidy(cor.test(ent.df$JVHW_entropy, ent.df$entropy)) %>%
   select(-parameter, -method, -conf.low, -conf.high, -alternative) %>%
   kable()
estimate statistic p.value
0.985063 573.4359 0

Are entropies correlated with cue length?

Yes. Both are correlated with cue length (the correlation is slightly higher for JVHW_entropy).

ent.df %>%
  filter(entropy > 2) %>%
  gather(entropy_measure, entropy_value, 2:3) %>%
  ggplot(aes(y = log.length, x = entropy_value)) +
  facet_wrap(~entropy_measure) + 
  geom_point() + 
  geom_smooth(method = "lm") +
  theme_bw()

tidy(cor.test(ent.df$log.length, ent.df$entropy)) %>%
   select(-parameter, -method, -conf.low, -conf.high, -alternative) %>%
   kable()
estimate statistic p.value
0.1560719 15.8387 0
tidy(cor.test(ent.df$log.length, ent.df$JVHW_entropy)) %>%
   select(-parameter, -method, -conf.low, -conf.high, -alternative) %>%
   kable()
estimate statistic p.value
0.1669151 16.96958 0

Are entropies correlated with conceptual complexity norms?

Yes. The correlation is slightly higher for JVHW_entropy.

complexity.norms  = read.csv("/Documents/GRADUATE_SCHOOL/Projects/ref_complex/Papers/RC/data/norms/complexityNormsEnglishexp9.csv")

ent.df = inner_join(ent.df, select(complexity.norms, complexity, word), 
                    by = c("cue" = "word")) 
ent.df %>%
  filter(entropy > 2) %>%
  gather(entropy_measure, entropy_value, 2:3) %>%
  ggplot( aes(y = complexity, x = entropy_value)) + 
  facet_wrap(~entropy_measure) + 
  geom_point() + 
  geom_smooth(method = "lm")+
  theme_bw()

tidy(cor.test(ent.df$complexity, ent.df$entropy)) %>%
   select(-parameter, -method, -conf.low, -conf.high, -alternative) %>%
   kable()
estimate statistic p.value
0.2077399 4.398738 1.38e-05
tidy(cor.test(ent.df$complexity, ent.df$JVHW_entropy)) %>%
   select(-parameter, -method, -conf.low, -conf.high, -alternative) %>%
   kable()
estimate statistic p.value
0.2275113 4.839191 1.8e-06

Are entropies correlated with length, controlling for conceptual complexity?

No. Controlling for conceptual complexity, JVHW_entropy does not independently predict length.

summary(lm(log.length ~ JVHW_entropy + complexity, ent.df)) %>%
  tidy() %>%
  kable()
term          estimate     std.error   statistic    p.value
(Intercept)    1.0302089   0.1386223    7.4317714   0.0000000
JVHW_entropy  -0.0172879   0.0214332   -0.8065935   0.4203487
complexity     0.2169913   0.0137419   15.7904798   0.0000000