Do the statistics of cue-associate bigrams differ across languages?

Read in data and munge

d.raw = read.csv("../../data/associations_ppdetails_en_05_01_2015.csv")

lang.codes = read.csv("../../data/language_codes.csv") %>%
  select(ISO639.2BCode, LanguageName)

d.long = d.raw %>%
  gather("association", "word", 7:9) %>%
  mutate(word = gsub("\\bx\\b", "NA", word)) %>% # remove missing words
  spread("association", "word") %>%
  rename(a1 = asso1Clean,
         a2 = asso2Clean,
         a3 = asso3Clean) 

d.clean = d.long %>%
  left_join(lang.codes, by = c("nativeLanguage" = "ISO639.2BCode")) %>%
  filter(nativeLanguage != "eng" & nativeLanguage != "" & nativeLanguage != "99" &
          nativeLanguage != "fla" & nativeLanguage != "can"  & nativeLanguage != "nan"  &
           nativeLanguage != "pun" & nativeLanguage != "nl") %>%
  mutate(LanguageName = ifelse(grepl("^[[:upper:]]+$", nativeLanguage), "English",
                               as.character(LanguageName)),
         LanguageName = as.factor(LanguageName),
         country = ifelse(grepl("^[[:upper:]]+$", nativeLanguage), nativeLanguage, NA),
         country = as.factor(country),
         native.lang = ifelse(LanguageName == "English", "english", "other"),
         native.lang = as.factor(native.lang)) %>%
  select(-nativeLanguage)

# Get second languages and num participants, with greater than N_CUTOFF participants
N_CUTOFF <- 400

lang.dems = d.clean %>%
    group_by(userID, LanguageName, native.lang) %>%
    slice(1) %>%
    group_by(LanguageName, native.lang) %>%
    summarise(n = n()) %>%
    filter(native.lang == "other") %>%
    arrange(-n) %>%
    filter(n > N_CUTOFF) %>%
    select(-native.lang) %>%
    ungroup()

collapse across associates

d.all.ca = d.clean %>%
  filter(LanguageName %in% lang.dems$LanguageName | LanguageName == "English") %>%
  ungroup() %>%
  gather("associate.type", "associate", 6:8) %>%
  filter(cue != "NA" & associate != "NA" ) %>%
  mutate(bigram = paste(cue, associate))
bigram.counts = d.all.ca %>%
  count(native.lang, cue, associate, bigram) 

Participant counts

d.all.ca %>%
  select(userID, LanguageName, native.lang) %>%
  distinct() %>%
  group_by(LanguageName,native.lang ) %>%
  summarize(n=n()) %>%
  kable()
LanguageName native.lang n
Dutch other 687
English english 62412
Finnish other 459
French other 601
German other 1176
Italian other 411
Spanish other 1071

Paired t-test

From Manning and Schuetze (1999; pg. 166): Discover co-occurences that best-distinguish between words within the same corpus. This is slightly different that our goal, because here we want to compare across corpora, and thus our N’s are different. Here, I modify their formula to account for this (analagous to Welch’s two-sample t-test).

Manning and Schuetze: \[ t = \frac{\frac{C(v_1w)}{N}-\frac{C(v_2w)}{N}} {\sqrt{\frac{C(v_1w) + C(v_2w)}{N^2}}} \]

Welch’s version: \[ t = \frac{\frac{C(v_1w)}{n_1}-\frac{C(v_2w)}{n_2}} {\sqrt{\frac{C(v_1w)}{n_1} + \frac{C(v_2w)}{n_2}}} \]

Question: Are N bigram counts or word counts? Assume they are bigrams.

Compare native to non-native:

total.bigrams.native.lang = bigram.counts %>%
  group_by(native.lang) %>%
  summarize(total.bigrams = sum(n))

bigram.counts.rf = bigram.counts %>%
  left_join(total.bigrams.native.lang) %>%
  mutate(rf = n/total.bigrams)
  
t.scores <- bigram.counts.rf %>%
  ungroup() %>%
  as_tibble() %>%
  select(native.lang, rf, cue, associate) %>%
  spread(native.lang, rf) %>%
  filter(!is.na(english), !is.na(other)) %>%
  mutate(t = (english - other)/sqrt(english  + other)) %>%
  mutate(t = abs(t))

t-scores

ggplot(t.scores, aes(x = t)) +
  geom_histogram() +
  theme_bw() +
  ggtitle("distribution of t-scores")

Large absolute t.scores

t.scores %>% 
  arrange(-t) %>%
  select(cue, associate, t) %>%
  slice (1:100) %>%
  kable()
cue associate t
boiling water 0.0091908
calorie food 0.0077082
jaguar animal 0.0074312
cholesterol fat 0.0069060
hurts pain 0.0067886
polyester plastic 0.0067227
tires car 0.0066291
calorie fat 0.0065609
lollipop sweet 0.0065609
not smart stupid 0.0065203
hold on wait 0.0065012
presents birthday 0.0064531
diapers baby 0.0063988
nirvana band 0.0063562
breast feeding mother 0.0063209
Freud psychoanalysis 0.0062768
lily flower 0.0061485
swimmer water 0.0061346
gladiator Rome 0.0061207
junk food fat 0.0061207
utensil tool 0.0061207
chewing gum 0.0061052
vagina sex 0.0060618
raccoon animal 0.0060035
refugee war 0.0060035
corn-beef meat 0.0059667
Florida miami 0.0059667
sulfur chemistry 0.0059667
brainy intelligent 0.0059348
vagina woman 0.0059345
head & shoulders shampoo 0.0059034
vacancy job 0.0058916
flirt love 0.0058819
cooked food 0.0058719
jaguar car 0.0058404
cents euro 0.0058324
salami food 0.0058324
drums music 0.0058166
fountain water 0.0058068
lunar moon 0.0058068
dealer drugs 0.0057826
leftover food 0.0057768
white powder cocaine 0.0057230
junk food unhealthy 0.0057151
ships water 0.0057151
asteroid space 0.0056679
eyelid eye 0.0056636
lubricant sex 0.0056636
heartbeat love 0.0056432
peroxide chemistry 0.0056432
plates dish 0.0056432
vaseline sex 0.0056432
viola flower 0.0056432
embarrassed shame 0.0056329
atheism god 0.0056241
bunker war 0.0056241
ivory elephant 0.0055633
cornflakes breakfast 0.0055631
bedsheets bed 0.0055515
egocentric me 0.0055504
scuba sea 0.0055504
tangerine fruit 0.0055310
Buddha religion 0.0055225
gifts birthday 0.0055225
isle island 0.0054981
newsstand news 0.0054709
topping cake 0.0054709
catapult war 0.0054586
drinks friends 0.0054586
intestine food 0.0054586
November rain 0.0054586
clementine orange 0.0054580
tutu ballet 0.0054421
Alpine mountain 0.0054002
heroic hero 0.0053911
mayonnaise egg 0.0053911
yours mine 0.0053871
honk car 0.0053823
campfire tent 0.0053680
Disney cartoon 0.0053680
CD music 0.0053538
cornflakes milk 0.0053355
dark brown chocolate 0.0053355
curved road 0.0053120
festival music 0.0053120
caress love 0.0053082
demolition house 0.0053068
mixer kitchen 0.0053068
nerves body 0.0053068
grunge seattle 0.0052784
proven evidence 0.0052784
sheets white 0.0052784
snowball fun 0.0052784
tangerine mandarin 0.0052784
anatomy body 0.0052759
boiling hot 0.0052759
thigh leg 0.0052759
musical instrument guitar 0.0052712
boiled water 0.0052636
chanting singing 0.0052379

Predicting divergence

Now let’s see if we can predict the t-score based on characteristics of the cue.

Join t.scores and cues characteristics

cues.chars = read.csv ("data/cues_chars.csv")

t.scores.full = t.scores %>%
  left_join(cues.chars) %>%
  select(-english, -other)

Write df

mean.t.scores <- t.scores %>%
  select(cue, t) %>%
  distinct() %>%
  group_by(cue) %>%
  summarize(t = mean(t))

#write.csv(mean.t.scores, "data/t.scors.by.cue.filtered.csv")

Frequency

Does frequency of cue predict mean t for a cue?

freq.ts =  t.scores.full %>%
  select(cue, associate, t, Lg10WF) %>%
  distinct() %>%
  group_by(cue) %>%
  summarize(t = mean(t), # mean t for each cue
            Lg10WF = Lg10WF[1])

ggplot(freq.ts, aes(x = Lg10WF, y = t)) +
  geom_smooth(method  = "lm") +
  theme_bw()

freq.ts %>%
  do(tidy(cor.test(.$t, .$Lg10WF))) %>%
  select(estimate, statistic, p.value) %>%
  kable()
estimate statistic p.value
-0.1030076 -9.877217 0

Higher frequency cues have less divergence.

Sentiment - quant

Does sentiment of cue predict mean t for a cue?

quantsent.ts =  t.scores.full %>%
  select(cue, associate, t, quant.sent) %>%
  distinct() %>%
  group_by(cue) %>%
  summarize(t = mean(t), # mean t for each cue
            quant.sent = quant.sent[1])

ggplot(quantsent.ts, aes(x = quant.sent, y = t)) +
  geom_smooth(method  = "lm") +
  theme_bw()

quantsent.ts %>%
  do(tidy(cor.test(.$t, .$quant.sent))) %>%
  select(estimate, statistic, p.value) %>%
  kable()
estimate statistic p.value
-0.0892866 -2.919994 0.0035744

No relation between sentiment of cue and t.

Sentiment - qual

Does sentiment of cue predict mean t for a cue?

qual.sent.ts =  t.scores.full %>%
  select(cue, associate, t, qual.sent) %>%
  distinct() %>%
  filter(!is.na(qual.sent)) %>%
  group_by(cue,qual.sent) %>%
  summarize(t = mean(t)) %>% # mean t for each cue %>%
  group_by(qual.sent) %>%
  multi_boot_standard(column = "t")

ggplot(qual.sent.ts, aes(fill = qual.sent, y = mean, x = reorder(qual.sent, mean))) +
    xlab("sentiment") +
    ylab("t") +
    geom_bar(stat = "identity", position = "dodge") +
    geom_linerange(aes(ymax = ci_upper, ymin=ci_lower),
                 position = position_dodge(width = .9)) +
    theme_bw() +
    theme(legend.position="none",
          axis.text.x = element_text(angle = 90, hjust = 1))

Not much going on here, small effect of negative – more difference for negative words.

Part of speech

Does pos of cue predict mean t for a cue?

pos.ts =  t.scores.full %>%
  select(cue, associate, t, pos) %>%
  distinct() %>%
  filter(!is.na(pos)) %>%
  group_by(cue,pos) %>%
  summarize(t = mean(t)) %>% # mean t for each cue 
  group_by(pos) %>%
  multi_boot_standard(column = "t")

ggplot(pos.ts, aes(fill = pos, y = mean, x = reorder(pos, mean))) +
    xlab("pos") +
    ylab("t") +
    geom_bar(stat = "identity", position = "dodge") +
    geom_linerange(aes(ymax = ci_upper, ymin=ci_lower),
                 position = position_dodge(width = .9)) +
    theme_bw() +
    theme(legend.position="none",
          axis.text.x = element_text(angle = 90, hjust = 1))

Less variability for function words.

Concreteness

Does concreteness of cue predict mean t for a cue?

conc.ts =  t.scores.full %>%
  select(cue, associate, t, Conc.M) %>%
  distinct() %>%
  group_by(cue) %>%
  summarize(t = mean(t), # mean t for each cue
            conc = Conc.M[1])

ggplot(conc.ts, aes(x = conc, y = t)) +
  geom_smooth(method  = "lm") +
  theme_bw()

conc.ts %>%
  do(tidy(cor.test(.$t, .$conc))) %>%
  select(estimate, statistic, p.value) %>%
  kable()
estimate statistic p.value
-0.0226869 -2.152344 0.0313968

Small effect of concreteness in the predicted direction – Less difference for cues that are more concrete.