Do the statistics of cue-associate bigrams differ across languages?

Read in data and munge

d.raw = read.csv("../../data/associations_ppdetails_en_05_01_2015.csv")

lang.codes = read.csv("../../data/language_codes.csv") %>%
  select(ISO639.2BCode, LanguageName)

d.long = d.raw %>%
  gather("association", "word", 7:9) %>%
  mutate(word = gsub("\\bx\\b", "NA", word)) %>% # recode missing responses (entered as "x") as the string "NA"; filtered out below
  spread("association", "word") %>%
  rename(a1 = asso1Clean,
         a2 = asso2Clean,
         a3 = asso3Clean) 

d.clean = d.long %>%
  left_join(lang.codes, by = c("nativeLanguage" = "ISO639.2BCode")) %>%
  filter(nativeLanguage != "eng" & nativeLanguage != "" & nativeLanguage != "99" &
          nativeLanguage != "fla" & nativeLanguage != "can"  & nativeLanguage != "nan"  &
           nativeLanguage != "pun" & nativeLanguage != "nl") %>%
  mutate(LanguageName = ifelse(grepl("^[[:upper:]]+$", nativeLanguage), "English", # all-uppercase values are country codes from native English speakers
                               as.character(LanguageName)),
         LanguageName = as.factor(LanguageName),
         country = ifelse(grepl("^[[:upper:]]+$", nativeLanguage), nativeLanguage, NA),
         country = as.factor(country),
         native.lang = ifelse(LanguageName == "English", "english", "other"),
         native.lang = as.factor(native.lang)) %>%
  select(-nativeLanguage)

Collapse across associate types

d.all.ca = d.clean %>%
  ungroup() %>%
  gather("associate.type", "associate", 6:8) %>%
  filter(cue != "NA" & associate != "NA" ) %>%
  mutate(bigram = paste(cue, associate))

bigram.counts = d.all.ca %>%
  count(native.lang, cue, associate, bigram) 

Participant counts

d.all.ca %>%
  select(userID,  native.lang) %>%
  distinct() %>%
  group_by(native.lang ) %>%
  summarize(n=n()) %>%
  kable()
| native.lang | n |
|:---|---:|
| english | 62412 |
| other | 9369 |

Paired t-test

From Manning and Schuetze (1999, p. 166): discover co-occurrences that best distinguish between words within the same corpus. This is slightly different from our goal, because here we want to compare across corpora, and thus our N's differ. I modify their formula to account for this (analogous to Welch's two-sample t-test).

Manning and Schuetze: \[ t = \frac{\frac{C(v_1w)}{N}-\frac{C(v_2w)}{N}} {\sqrt{\frac{C(v_1w) + C(v_2w)}{N^2}}} \]

Welch’s version: \[ t = \frac{\frac{C(v_1w)}{n_1}-\frac{C(v_2w)}{n_2}} {\sqrt{\frac{C(v_1w)}{n_1} + \frac{C(v_2w)}{n_2}}} \]

Question: Is N a count of bigrams or of words? Assume it is the bigram count.
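As a sanity check, here is a minimal sketch of the modified formula expressed over raw counts (the function name and the counts are made up for illustration; the analysis below computes the same quantity from relative frequencies with dplyr):

welch_bigram_t <- function(c1, c2, n1, n2) {
  # c1, c2: counts of a given cue-associate bigram in each group
  # n1, n2: total bigram counts for each group
  rf1 <- c1 / n1
  rf2 <- c2 / n2
  (rf1 - rf2) / sqrt(rf1 + rf2)
}

# hypothetical counts: 50 of 1,000,000 native bigrams vs. 3 of 150,000 non-native bigrams
welch_bigram_t(c1 = 50, c2 = 3, n1 = 1e6, n2 = 1.5e5)
# ~0.0036, on the same scale as the top t-scores reported below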

Compare native to non-native:

total.bigrams.native.lang = bigram.counts %>%
  group_by(native.lang) %>%
  summarize(total.bigrams = sum(n))

bigram.counts.rf = bigram.counts %>%
  left_join(total.bigrams.native.lang) %>%
  mutate(rf = n/total.bigrams)
  
t.scores <- bigram.counts.rf %>%
  ungroup() %>%
  as_tibble() %>%
  select(native.lang, rf, cue, associate) %>%
  spread(native.lang, rf) %>%
  filter(!is.na(english), !is.na(other)) %>%
  mutate(t = (english - other)/sqrt(english + other)) # Welch-style t computed on relative frequencies (see formula above)

t-scores

ggplot(t.scores, aes(x = t)) +
  geom_histogram() +
  theme_bw() +
  ggtitle("distribution of t-scores")

t.scores %>% 
  arrange(-t) %>%
  select(cue, associate, t) %>%
  slice (1:100) %>%
  kable()
| cue | associate | t |
|:---|:---|---:|
| bashful | shy | 0.0049792 |
| phobia | fear | 0.0049036 |
| venison | deer | 0.0047491 |
| cob | corn | 0.0046577 |
| ought | should | 0.0046195 |
| intoxication | drunk | 0.0045492 |
| syringe | needle | 0.0045082 |
| spud | potato | 0.0045031 |
| crayola | crayon | 0.0044670 |
| eggplant | purple | 0.0043833 |
| dingo | dog | 0.0043410 |
| heifer | cow | 0.0043035 |
| honeydew | melon | 0.0043035 |
| tardy | late | 0.0043035 |
| autumn | fall | 0.0042983 |
| plasma | blood | 0.0042983 |
| nutmeg | spice | 0.0042551 |
| rattle | baby | 0.0042551 |
| request | ask | 0.0042551 |
| require | need | 0.0042116 |
| paprika | spice | 0.0041677 |
| skeleton | bones | 0.0041677 |
| would | should | 0.0041677 |
| belch | burp | 0.0041384 |
| blaring | loud | 0.0041384 |
| intriguing | interesting | 0.0041234 |
| crate | box | 0.0040787 |
| squeak | mouse | 0.0040540 |
| collapse | fall | 0.0040335 |
| not sharp | dull | 0.0040335 |
| you | me | 0.0040112 |
| cuckoo | clock | 0.0039681 |
| present | gift | 0.0039681 |
| felon | criminal | 0.0039418 |
| human | being | 0.0039418 |
| stuffed | full | 0.0039418 |
| celebration | party | 0.0039246 |
| calamari | squid | 0.0038952 |
| outrage | anger | 0.0038952 |
| petroleum | oil | 0.0038952 |
| platter | plate | 0.0038952 |
| shingle | roof | 0.0038952 |
| son | daughter | 0.0038952 |
| bolts | nuts | 0.0038481 |
| glee | happy | 0.0038481 |
| pane | glass | 0.0038481 |
| saucer | cup | 0.0038481 |
| shoot | gun | 0.0038481 |
| sloppy | messy | 0.0038481 |
| canary | bird | 0.0038365 |
| congregation | church | 0.0038365 |
| kid | child | 0.0038365 |
| kitty | cat | 0.0038017 |
| ewe | sheep | 0.0038006 |
| expense | cost | 0.0038006 |
| London | england | 0.0038006 |
| mend | fix | 0.0038006 |
| menstruation | period | 0.0038006 |
| rattle | snake | 0.0038006 |
| woof | bark | 0.0038006 |
| splinter | wood | 0.0037919 |
| brief | short | 0.0037594 |
| conclude | finish | 0.0037525 |
| dill | pickle | 0.0037525 |
| fracture | break | 0.0037525 |
| lime | green | 0.0037525 |
| acquaintance | friend | 0.0037469 |
| greatest | best | 0.0037469 |
| off | on | 0.0037469 |
| untidy | messy | 0.0037469 |
| amen | prayer | 0.0037039 |
| gum | chew | 0.0037039 |
| nimble | quick | 0.0037039 |
| polygon | shape | 0.0037039 |
| spew | vomit | 0.0037039 |
| stork | baby | 0.0037039 |
| youngster | child | 0.0037039 |
| holler | yell | 0.0036557 |
| probable | likely | 0.0036557 |
| stump | tree | 0.0036557 |
| barrister | lawyer | 0.0036547 |
| clef | treble | 0.0036547 |
| donkey | ass | 0.0036547 |
| ladle | spoon | 0.0036547 |
| mustard | yellow | 0.0036547 |
| on time | punctual | 0.0036547 |
| story | tale | 0.0036547 |
| twirl | spin | 0.0036547 |
| vivid | bright | 0.0036547 |
| wizened | old | 0.0036547 |
| picket | fence | 0.0036305 |
| beet | red | 0.0036095 |
| certain | sure | 0.0036095 |
| converse | talk | 0.0036095 |
| frightening | scary | 0.0036095 |
| portly | fat | 0.0036095 |
| taut | tight | 0.0036095 |
| banister | stairs | 0.0036050 |
| corpse | body | 0.0036050 |
| ergo | therefore | 0.0036050 |

Predicting divergence

Now let’s see if we can predict the t-score based on characteristics of the cue.

Create a dataset with many characteristics for each cue.

# all unique cues
cues.chars = d.clean %>%
  group_by(cue) %>%
  slice(1) %>%
  select(cue) 

# read in frequency and concreteness
subtlexus.url <- getURL("https://raw.githubusercontent.com/mllewis/RC/master/data/corpus/SUBTLEXus_corpus.txt")
freqs <- read.table(text = subtlexus.url, header=TRUE) %>%
          select(Word,Lg10WF)
concreteness <-read.csv("data/brysbaert_corpus.csv", header=TRUE)

# merge in all characteristics (note that this adds rows to dataframe where a cue has multiple characteristics)
cues.chars = cues.chars %>%
  left_join(freqs, by=c("cue"= "Word"))  %>%
  left_join(get_sentiments("afinn"), by=c("cue"= "word")) %>%
  rename(quant.sent = score)  %>%
  left_join(get_sentiments("nrc"), by=c("cue"= "word")) %>%
  rename(qual.sent = sentiment) %>%
  left_join(parts_of_speech, by=c("cue"= "word")) %>%
  left_join(concreteness %>% select(Word,Conc.M), by=c("cue"= "Word"))

# write.csv(cues.chars, "data/cues_chars.csv")

Join t.scores and cue characteristics

t.scores.full = t.scores %>%
  left_join(cues.chars) %>%
  select(-english, -other)
mean.t.scores <- t.scores %>%
  select(cue, t) %>%
  distinct() %>%
  group_by(cue) %>%
  summarize(t = mean(t))

write.csv(mean.t.scores, "data/t.scors.by.cue.csv")

Frequency

Does the frequency of a cue predict its mean t?

freq.ts =  t.scores.full %>%
  select(cue, associate, t, Lg10WF) %>%
  distinct() %>%
  group_by(cue) %>%
  summarize(t = mean(t), # mean t for each cue
            Lg10WF = Lg10WF[1])

ggplot(freq.ts, aes(x = Lg10WF, y = t)) +
  geom_smooth(method  = "lm") +
  theme_bw()

freq.ts %>%
  do(tidy(cor.test(.$t, .$Lg10WF))) %>%
  select(estimate, statistic, p.value) %>%
  kable()
| estimate | statistic | p.value |
|---:|---:|---:|
| 0.0069726 | 0.6693821 | 0.5032685 |

No relation between cue frequency and t (r = .007, p = .50).

Sentiment - quant

Does the quantitative (AFINN) sentiment of a cue predict its mean t?

quantsent.ts =  t.scores.full %>%
  select(cue, associate, t, quant.sent) %>%
  distinct() %>%
  group_by(cue) %>%
  summarize(t = mean(t), # mean t for each cue
            quant.sent = quant.sent[1])

ggplot(quantsent.ts, aes(x = quant.sent, y = t)) +
  geom_smooth(method  = "lm") +
  theme_bw()

quantsent.ts %>%
  do(tidy(cor.test(.$t, .$quant.sent))) %>%
  select(estimate, statistic, p.value) %>%
  kable()
| estimate | statistic | p.value |
|---:|---:|---:|
| 0.0262494 | 0.8597384 | 0.3901255 |

No relation between cue sentiment and t (r = .03, p = .39).

Sentiment - qual

Does the qualitative (NRC) sentiment category of a cue predict its mean t?

qual.sent.ts =  t.scores.full %>%
  select(cue, associate, t, qual.sent) %>%
  distinct() %>%
  filter(!is.na(qual.sent)) %>%
  group_by(cue,qual.sent) %>%
  summarize(t = mean(t)) %>% # mean t for each cue x sentiment category
  group_by(qual.sent) %>%
  multi_boot_standard(column = "t")

ggplot(qual.sent.ts, aes(fill = qual.sent, y = mean, x = reorder(qual.sent, mean))) +
    xlab("sentiment") +
    ylab("t") +
    geom_bar(stat = "identity", position = "dodge") +
    geom_linerange(aes(ymax = ci_upper, ymin=ci_lower),
                 position = position_dodge(width = .9)) +
    theme_bw() +
    theme(legend.position="none",
          axis.text.x = element_text(angle = 90, hjust = 1))

Not much going on here; there is at most a small effect of valence, with more divergence for negative than positive words.

Part of speech

Does the part of speech of a cue predict its mean t?

pos.ts =  t.scores.full %>%
  select(cue, associate, t, pos) %>%
  distinct() %>%
  filter(!is.na(pos)) %>%
  group_by(cue,pos) %>%
  summarize(t = mean(t)) %>% # mean t for each cue 
  group_by(pos) %>%
  multi_boot_standard(column = "t")

ggplot(pos.ts, aes(fill = pos, y = mean, x = reorder(pos, mean))) +
    xlab("pos") +
    ylab("t") +
    geom_bar(stat = "identity", position = "dodge") +
    geom_linerange(aes(ymax = ci_upper, ymin=ci_lower),
                 position = position_dodge(width = .9)) +
    theme_bw() +
    theme(legend.position="none",
          axis.text.x = element_text(angle = 90, hjust = 1))

Less variability for function words.

Concreteness

Does the concreteness of a cue predict its mean t?

conc.ts =  t.scores.full %>%
  select(cue, associate, t, Conc.M) %>%
  distinct() %>%
  group_by(cue) %>%
  summarize(t = mean(t), # mean t for each cue
            conc = Conc.M[1])

ggplot(conc.ts, aes(x = conc, y = t)) +
  geom_smooth(method  = "lm") +
  theme_bw()

conc.ts %>%
  do(tidy(cor.test(.$t, .$conc))) %>%
  select(estimate, statistic, p.value) %>%
  kable()
| estimate | statistic | p.value |
|---:|---:|---:|
| 0.0441618 | 4.218039 | 2.49e-05 |

Small effect of concreteness in the predicted direction (r = .04, p < .001): less difference for cues that are more concrete.