Do the statistics of cue-associate bigrams differ across languages?

Read in data and munge

d.raw = read.csv("../../data/associations_ppdetails_en_05_01_2015.csv")

lang.codes = read.csv("../../data/language_codes.csv") %>%
  select(ISO639.2BCode, LanguageName)

d.long = d.raw %>%
  gather("association", "word", 7:9) %>%
  mutate(word = gsub("\\bx\\b", "NA", word)) %>% # remove missing words
  spread("association", "word") %>%
  rename(a1 = asso1Clean,
         a2 = asso2Clean,
         a3 = asso3Clean) 

d.clean = d.long %>%
  left_join(lang.codes, by = c("nativeLanguage" = "ISO639.2BCode")) %>%
  filter(nativeLanguage != "eng" & nativeLanguage != "" & nativeLanguage != "99" &
          nativeLanguage != "fla" & nativeLanguage != "can"  & nativeLanguage != "nan"  &
           nativeLanguage != "pun" & nativeLanguage != "nl") %>%
  mutate(LanguageName = ifelse(grepl("^[[:upper:]]+$", nativeLanguage), "English",
                               as.character(LanguageName)),
         LanguageName = as.factor(LanguageName),
         country = ifelse(grepl("^[[:upper:]]+$", nativeLanguage), nativeLanguage, NA),
         country = as.factor(country),
         native.lang = ifelse(LanguageName == "English", "english", "other"),
         native.lang = as.factor(native.lang)) %>%
  select(-nativeLanguage)

# Get second languages and num participants, with greater than N_CUTOFF participants
N_CUTOFF <- 400

lang.dems = d.clean %>%
    group_by(userID, LanguageName, native.lang) %>%
    slice(1) %>%
    group_by(LanguageName, native.lang) %>%
    summarise(n = n()) %>%
    filter(native.lang == "other") %>%
    arrange(-n) %>%
    filter(n > N_CUTOFF) %>%
    select(-native.lang) %>%
    ungroup()

collapse across associates

d.all.ca = d.clean %>%
  filter(LanguageName %in% lang.dems$LanguageName | LanguageName == "English") %>%
  ungroup() %>%
  gather("associate.type", "associate", 6:8) %>%
  filter(cue != "NA" & associate != "NA" ) %>%
  mutate(bigram = paste(cue, associate))
bigram.counts = d.all.ca %>%
  count(native.lang, cue, associate, bigram) 

Participant counts

d.all.ca %>%
  select(userID, LanguageName, native.lang) %>%
  distinct() %>%
  group_by(LanguageName,native.lang ) %>%
  summarize(n=n()) %>%
  kable()
LanguageName native.lang n
Dutch other 687
English english 62412
Finnish other 459
French other 601
German other 1176
Italian other 411
Spanish other 1071

Paired t-test

From Manning and Schuetze (1999; pg. 166): Discover co-occurences that best-distinguish between words within the same corpus. This is slightly different that our goal, because here we want to compare across corpora, and thus our N’s are different. Here, I modify their formula to account for this (analagous to Welch’s two-sample t-test).

Manning and Schuetze: \[ t = \frac{\frac{C(v_1w)}{N}-\frac{C(v_2w)}{N}} {\sqrt{\frac{C(v_1w) + C(v_2w)}{N^2}}} \]

Welch’s version: \[ t = \frac{\frac{C(v_1w)}{n_1}-\frac{C(v_2w)}{n_2}} {\sqrt{\frac{C(v_1w)}{n_1} + \frac{C(v_2w)}{n_2}}} \]

Question: Are N bigram counts or word counts? Assume they are bigrams.

Compare native to non-native:

total.bigrams.native.lang = bigram.counts %>%
  group_by(native.lang) %>%
  summarize(total.bigrams = sum(n))

bigram.counts.rf = bigram.counts %>%
  left_join(total.bigrams.native.lang) %>%
  mutate(rf = n/total.bigrams)
  
t.scores <- bigram.counts.rf %>%
  ungroup() %>%
  as_tibble() %>%
  select(native.lang, rf, cue, associate) %>%
  spread(native.lang, rf) %>%
  filter(!is.na(english), !is.na(other)) %>%
  mutate(t = (english - other)/sqrt(english  + other))

t-scores

ggplot(t.scores, aes(x = t)) +
  geom_histogram() +
  theme_bw() +
  ggtitle("distribution of t-scores")

Large t.scores

t.scores %>% 
  arrange(-t) %>%
  select(cue, associate, t) %>%
  slice (1:100) %>%
  kable()
cue associate t
begin start 0.0046245
mozzarella cheese 0.0046245
cob corn 0.0045862
comprehend understand 0.0044698
exhausted tired 0.0044698
branch tree 0.0044305
smooch kiss 0.0044305
espresso coffee 0.0043107
kitty cat 0.0043107
bashful shy 0.0042703
yolk egg 0.0042703
heifer cow 0.0042295
assist help 0.0041884
freezing cold 0.0041884
ill sick 0.0041884
phobia fear 0.0041884
chubby fat 0.0041470
quick fast 0.0041052
chilly cold 0.0040632
flamingo pink 0.0040632
actual real 0.0040208
polar bear 0.0040208
curl hair 0.0039781
film movie 0.0039781
sterling silver 0.0039781
wager bet 0.0039781
grin smile 0.0039350
parchment paper 0.0039350
weary tired 0.0039350
blackboard chalk 0.0038916
cuckoo clock 0.0038916
present gift 0.0038916
scheme plan 0.0038916
celebration party 0.0038478
crossword puzzle 0.0038478
cub bear 0.0038478
however but 0.0038478
query question 0.0038478
consume eat 0.0038036
finances money 0.0038036
intoxication drunk 0.0038036
stone rock 0.0038036
canary bird 0.0037591
chapel church 0.0037591
congregation church 0.0037591
idiotic stupid 0.0037591
instruct teach 0.0037591
comrade friend 0.0037141
cranny nook 0.0037141
found lost 0.0037141
frequently often 0.0037141
rifle gun 0.0037141
row boat 0.0037141
smiling happy 0.0037141
acquaintance friend 0.0036688
aged old 0.0036688
off on 0.0036688
aroma smell 0.0036230
bestow give 0.0036230
bullet gun 0.0036230
hello hi 0.0036230
heron bird 0.0036230
obsessive compulsive 0.0036230
seek find 0.0036230
station train 0.0036230
thank you 0.0036230
unkind mean 0.0036230
anchovy fish 0.0035769
bumper car 0.0035769
corpse dead 0.0035769
dagger knife 0.0035769
dingo dog 0.0035769
ebony black 0.0035769
gigantic huge 0.0035769
handbag purse 0.0035769
holler yell 0.0035769
pupil student 0.0035769
start begin 0.0035769
stump tree 0.0035769
briefs underwear 0.0035303
certain sure 0.0035303
converse talk 0.0035303
frightening scary 0.0035303
plasma blood 0.0035303
portly fat 0.0035303
reef coral 0.0035303
woodpecker bird 0.0035303
amphibian frog 0.0034833
cuisine food 0.0034833
federal government 0.0034833
height tall 0.0034833
hoot owl 0.0034833
iris flower 0.0034833
jingle bells 0.0034833
licorice black 0.0034833
paste glue 0.0034833
rattle baby 0.0034833
request ask 0.0034833
scissors cut 0.0034833
wellness health 0.0034833

Predicting divergence

Now let’s see if we can predict the t-score based on characteristics of the cue.

Join t.scores and cues characteristics

cues.chars = read.csv ("data/cues_chars.csv")

t.scores.full = t.scores %>%
  left_join(cues.chars) %>%
  select(-english, -other)

Write df

mean.t.scores <- t.scores %>%
  select(cue, t) %>%
  distinct() %>%
  group_by(cue) %>%
  summarize(t = mean(t))

#write.csv(mean.t.scores, "data/t.scors.by.cue.filtered.csv")

Frequency

Does frequency of cue predict mean t for a cue?

freq.ts =  t.scores.full %>%
  select(cue, associate, t, Lg10WF) %>%
  distinct() %>%
  group_by(cue) %>%
  summarize(t = mean(t), # mean t for each cue
            Lg10WF = Lg10WF[1])

ggplot(freq.ts, aes(x = Lg10WF, y = t)) +
  geom_smooth(method  = "lm") +
  theme_bw()

freq.ts %>%
  do(tidy(cor.test(.$t, .$Lg10WF))) %>%
  select(estimate, statistic, p.value) %>%
  kable()
estimate statistic p.value
0.0825235 7.897885 0

Higher frequency cues have less divergence.

Sentiment - quant

Does sentiment of cue predict mean t for a cue?

quantsent.ts =  t.scores.full %>%
  select(cue, associate, t, quant.sent) %>%
  distinct() %>%
  group_by(cue) %>%
  summarize(t = mean(t), # mean t for each cue
            quant.sent = quant.sent[1])

ggplot(quantsent.ts, aes(x = quant.sent, y = t)) +
  geom_smooth(method  = "lm") +
  theme_bw()

quantsent.ts %>%
  do(tidy(cor.test(.$t, .$quant.sent))) %>%
  select(estimate, statistic, p.value) %>%
  kable()
estimate statistic p.value
0.0253056 0.8245423 0.4098168

No relation between sentiment of cue and t.

Sentiment - qual

Does sentiment of cue predict mean t for a cue?

qual.sent.ts =  t.scores.full %>%
  select(cue, associate, t, qual.sent) %>%
  distinct() %>%
  filter(!is.na(qual.sent)) %>%
  group_by(cue,qual.sent) %>%
  summarize(t = mean(t)) %>% # mean t for each cue %>%
  group_by(qual.sent) %>%
  multi_boot_standard(column = "t")

ggplot(qual.sent.ts, aes(fill = qual.sent, y = mean, x = reorder(qual.sent, mean))) +
    xlab("sentiment") +
    ylab("t") +
    geom_bar(stat = "identity", position = "dodge") +
    geom_linerange(aes(ymax = ci_upper, ymin=ci_lower),
                 position = position_dodge(width = .9)) +
    theme_bw() +
    theme(legend.position="none",
          axis.text.x = element_text(angle = 90, hjust = 1))

Not much going on here, small effect of negative – more difference for negative words.

Part of speech

Does pos of cue predict mean t for a cue?

pos.ts =  t.scores.full %>%
  select(cue, associate, t, pos) %>%
  distinct() %>%
  filter(!is.na(pos)) %>%
  group_by(cue,pos) %>%
  summarize(t = mean(t)) %>% # mean t for each cue 
  group_by(pos) %>%
  multi_boot_standard(column = "t")

ggplot(pos.ts, aes(fill = pos, y = mean, x = reorder(pos, mean))) +
    xlab("pos") +
    ylab("t") +
    geom_bar(stat = "identity", position = "dodge") +
    geom_linerange(aes(ymax = ci_upper, ymin=ci_lower),
                 position = position_dodge(width = .9)) +
    theme_bw() +
    theme(legend.position="none",
          axis.text.x = element_text(angle = 90, hjust = 1))

Less variability for function words.

Concreteness

Does concreteness of cue predict mean t for a cue?

conc.ts =  t.scores.full %>%
  select(cue, associate, t, Conc.M) %>%
  distinct() %>%
  group_by(cue) %>%
  summarize(t = mean(t), # mean t for each cue
            conc = Conc.M[1])

ggplot(conc.ts, aes(x = conc, y = t)) +
  geom_smooth(method  = "lm") +
  theme_bw()

conc.ts %>%
  do(tidy(cor.test(.$t, .$conc))) %>%
  select(estimate, statistic, p.value) %>%
  kable()
estimate statistic p.value
0.0607458 5.772228 0

Small effect of concreteness in the predicted direction – Less difference for cues that are more concrete.