d = read.csv("/Documents/GRADUATE_SCHOOL/Projects/SEMSOC/data/associations_ppdetails_en_05_01_2015.csv")

d.clean = d %>%
  gather("association", "word", 7:9) %>%
  mutate(word = gsub("\\bx\\b", "NA", word)) %>% # recode the missing-response placeholder "x" as "NA"
  spread("association", "word") %>%
  rename(a1 = asso1Clean,
         a2 = asso2Clean,
         a3 = asso3Clean)

cues = d.clean %>%
  gather("associate_type", "associate", 7:9) %>%
  distinct(cue) 

These data are from the Small World of Words project. In the task, participants are given a cue and asked to generate three associates. Each participant completes 15-19 trials. There are 73,256 participants and 10,050 distinct cues.
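A quick way to verify those counts against the cleaned data (a minimal sketch; the participant column name participantID is an assumption about the raw file and may need adjusting):

d.clean %>%
  summarise(n_participants = n_distinct(participantID), # assumed column name
            n_cues = n_distinct(cue))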

Compare distribution of associates across cues

Sample distributions of associations for particular cues.

associate.counts = d.clean %>%
  gather("associate_type", "associate", 7:9)  %>%
  select(cue, associate) %>%
  group_by(cue, associate) %>%
  summarise(n = n())

ggplot(filter(associate.counts, cue == "abacus"), aes(x = n)) + 
geom_histogram() + 
ggtitle("Cue: Abacus") +
theme_bw()

ggplot(filter(associate.counts, cue == "zucchini"), aes(x = n)) + 
geom_histogram() + 
ggtitle("Cue: zucchini") +
theme_bw()

Save data for the SEMSOC_entropy.py script.

# First, create a dictionary mapping each word to an integer
word.to.int.dict = d.clean %>%
  gather("associate_type", "associate", 7:9)  %>%
  select(associate) %>%
  distinct(associate) %>%
  mutate(word.int = row_number()) # integer id for each distinct associate
#write.csv(word.to.int.dict, "word.to.int.dict.csv")

all.associates.for.py = d.clean %>%
  gather("associate_type", "associate", 7:9) %>%
  select(cue, associate) %>% 
  left_join(word.to.int.dict) %>%
  mutate(ca = paste(cue,associate)) %>%
  mutate(n2 = 1:n()) 

# m = spread(all.associates.for.py, cue, word.int) # this doesn't work; not enough memory

all.associates.for.py = d.clean %>%
  gather("associate_type", "associate", 7:9) %>%
  select(cue, associate) %>% 
  left_join(word.to.int.dict) %>%
  select(-associate)



for (i in 1:nrow(cues)){
  my.df = filter(all.associates.for.py, cue == cues$cue[i])
  word = as.character(my.df$cue[1]) # this is necessary because there's a cue named "cue"
  my.df = select(my.df, -cue)
  names(my.df)[1] = word

  if (i == 1) {
    all.df = my.df
  } else {
    all.df = cbind(all.df, my.df)
  }

  if (i %in% c(100, 500, 5000)) print(i) # progress check
}
#write.csv(t(all.df), "JVHW_entropy/JVHW_v3_python/t.all_associates.csv", row.names = FALSE)
#write.csv(t(all.df[,1:5]), "JVHW_entropy/JVHW_v3_python/t.small_associates.csv", row.names = FALSE) # each row is a cue
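
As a design note, the cbind loop above assumes every cue contributes the same number of rows; a memory-friendlier sketch that skips the wide intermediate is below (assuming the downstream JVHW script can read ragged comma-separated rows; the output file name is hypothetical):

# one output line per cue: the cue word followed by its integer-coded associates
ids.by.cue = split(all.associates.for.py$word.int, all.associates.for.py$cue)
out.lines = vapply(names(ids.by.cue),
                   function(w) paste(c(w, ids.by.cue[[w]]), collapse = ","),
                   character(1))
# writeLines(out.lines, "JVHW_entropy/JVHW_v3_python/t.all_associates_ragged.csv")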

Merge in JVHW entropies

JVHWent = read.csv("JVHW_entropy/JVHW_v3_python/JVHW.csv")
names(JVHWent) = "JVHW_entropy"

# get old entropies
old.entropy = d.clean %>%
    gather("associate_type", "associate", 7:9)  %>%
    select(cue, associate) %>%
    group_by(cue, associate) %>%
    summarise(n = n()) %>%
    ungroup() %>%
    group_by(cue) %>%
    summarize(entropy = entropy(n)) %>%
    ungroup() %>%
    # log cue length, ignoring spaces and punctuation
    mutate(log.length = log(nchar(gsub(" ", "", gsub("[[:punct:]]", "", as.character(cue))))))
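
For reference, entropy(n) is presumably the plug-in (maximum-likelihood) Shannon entropy from the entropy package, computed over the associate counts for each cue; a minimal hand computation on made-up counts shows what it does:

# plug-in Shannon entropy from a vector of associate counts (illustrative counts only)
n = c(10, 5, 3, 1, 1)
p = n / sum(n)
-sum(p * log(p)) # in nats; should match entropy::entropy(n) with its default ML method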

ent.df = cbind(cues, JVHWent) %>%
  left_join(old.entropy)

Compare two measures of entropy

The two measures are very highly correlated; JVHW_entropy has the larger variance.
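
A quick check of the variance claim (a minimal sketch over the merged data frame):

ent.df %>%
  summarise(var_JVHW = var(JVHW_entropy, na.rm = TRUE),
            var_old = var(entropy, na.rm = TRUE))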

ent.df %>%
  filter(entropy > 2) %>%
  gather(entropy_measure, entropy_value, 2:3) %>%
  ggplot(aes(x = entropy_value, fill = entropy_measure)) +
  facet_wrap(~entropy_measure) + 
  geom_histogram()+
  theme_bw() +
  theme(legend.position="none")

ggplot(ent.df, aes(x = JVHW_entropy, y = entropy)) +
  geom_point()+
  geom_smooth(method = "lm") +
  theme_bw()

tidy(cor.test(ent.df$JVHW_entropy, ent.df$entropy)) %>%
   select(-parameter, -method, -conf.low, -conf.high, -alternative) %>%
   kable()
estimate statistic p.value
0.985063 573.4359 0

Are entropies correlated with cue length?

Yes. Both are correlated with cue length (the correlation is slightly higher for JVHW_entropy).

ent.df %>%
  filter(entropy > 2) %>%
  gather(entropy_measure, entropy_value, 2:3) %>%
  ggplot(aes(y = log.length, x = entropy_value)) +
  facet_wrap(~entropy_measure) + 
  geom_point() + 
  geom_smooth(method = "lm") +
  theme_bw()

tidy(cor.test(ent.df$log.length, ent.df$entropy)) %>%
   select(-parameter, -method, -conf.low, -conf.high, -alternative) %>%
   kable()
estimate statistic p.value
0.1560719 15.8387 0
tidy(cor.test(ent.df$log.length, ent.df$JVHW_entropy)) %>%
   select(-parameter, -method, -conf.low, -conf.high, -alternative) %>%
   kable()
estimate statistic p.value
0.1669151 16.96958 0

Are entropies correlated with conceptual complexity norms?

Yes. The correlation is slightly higher for JVHW_entropy.

complexity.norms  = read.csv("/Documents/GRADUATE_SCHOOL/Projects/ref_complex/Papers/RC/data/norms/complexityNormsEnglishexp9.csv")

ent.df = inner_join(ent.df, select(complexity.norms, complexity, word), 
                    by = c("cue" = "word")) 
ent.df %>%
  filter(entropy > 2) %>%
  gather(entropy_measure, entropy_value, 2:3) %>%
  ggplot( aes(y = complexity, x = entropy_value)) + 
  facet_wrap(~entropy_measure) + 
  geom_point() + 
  geom_smooth(method = "lm")+
  theme_bw()

tidy(cor.test(ent.df$complexity, ent.df$entropy)) %>%
   select(-parameter, -method, -conf.low, -conf.high, -alternative) %>%
   kable()
estimate statistic p.value
0.2077399 4.398738 1.38e-05
tidy(cor.test(ent.df$complexity, ent.df$JVHW_entropy)) %>%
   select(-parameter, -method, -conf.low, -conf.high, -alternative) %>%
   kable()
estimate statistic p.value
0.2275113 4.839191 1.8e-06

Are entropies correlated with length, controlling for conceptual complexity?

No. Controlling for conceptual complexity, JVHW_entropy does not independently predict length.

summary(lm(log.length ~ JVHW_entropy + complexity, ent.df)) %>%
  tidy() %>%
  kable()
term          estimate     std.error   statistic    p.value
(Intercept)    1.0302089   0.1386223    7.4317714   0.0000000
JVHW_entropy  -0.0172879   0.0214332   -0.8065935   0.4203487
complexity     0.2169913   0.0137419   15.7904798   0.0000000