Native vs. non-native responses association differences

Do the statistics of cue-associate bigrams differ across languages?

Read in data and munge

d.raw = read.csv("../../data/associations_ppdetails_en_05_01_2015.csv")

lang.codes = read.csv("../../data/language_codes.csv") %>%
  select(ISO639.2BCode, LanguageName)

d.long = d.raw %>%
  gather("association", "word", 7:9) %>%
  mutate(word = gsub("\\bx\\b", "NA", word)) %>% # remove missing words
  spread("association", "word") %>%
  rename(a1 = asso1Clean,
         a2 = asso2Clean,
         a3 = asso3Clean) 

d.clean = d.long %>%
  left_join(lang.codes, by = c("nativeLanguage" = "ISO639.2BCode")) %>%
  filter(nativeLanguage != "eng" & nativeLanguage != "" & nativeLanguage != "99" &
          nativeLanguage != "fla" & nativeLanguage != "can"  & nativeLanguage != "nan"  &
           nativeLanguage != "pun" & nativeLanguage != "nl") %>%
  mutate(LanguageName = ifelse(grepl("^[[:upper:]]+$", nativeLanguage), "English",
                               as.character(LanguageName)),
         LanguageName = as.factor(LanguageName),
         country = ifelse(grepl("^[[:upper:]]+$", nativeLanguage), nativeLanguage, NA),
         country = as.factor(country),
         native.lang = ifelse(LanguageName == "English", "english", "other"),
         native.lang = as.factor(native.lang)) %>%
  select(-nativeLanguage)

# Get second languages and num participants, with greater than N_CUTOFF participants
N_CUTOFF <- 400

lang.dems = d.clean %>%
    group_by(userID, LanguageName, native.lang) %>%
    slice(1) %>%
    group_by(LanguageName, native.lang) %>%
    summarise(n = n()) %>%
    filter(native.lang == "other") %>%
    arrange(-n) %>%
    filter(n > N_CUTOFF) %>%
    select(-native.lang) %>%
    ungroup()

collapse across associates

d.all.ca = d.clean %>%
  filter(LanguageName %in% lang.dems$LanguageName | LanguageName == "English") %>%
  ungroup() %>%
  gather("associate.type", "associate", 6:8) %>%
  filter(cue != "NA" & associate != "NA" ) %>%
  mutate(bigram = paste(cue, associate))

bigram.counts = d.all.ca %>%
  count(native.lang, cue, associate, bigram)

Participant counts

d.all.ca %>%
  select(userID, LanguageName, native.lang) %>%
  distinct() %>%
  group_by(LanguageName,native.lang ) %>%
  summarize(n=n()) %>%
  kable()

LanguageName	native.lang	n
Dutch	other	687
English	english	62412
Finnish	other	459
French	other	601
German	other	1176
Italian	other	411
Spanish	other	1071

Paired t-test

From Manning and Schuetze (1999; pg. 166): Discover co-occurences that best-distinguish between words within the same corpus. This is slightly different that our goal, because here we want to compare across corpora, and thus our N’s are different. Here, I modify their formula to account for this (analagous to Welch’s two-sample t-test).

Manning and Schuetze: \[ t = \frac{\frac{C(v_1w)}{N}-\frac{C(v_2w)}{N}} {\sqrt{\frac{C(v_1w) + C(v_2w)}{N^2}}} \]

Welch’s version: \[ t = \frac{\frac{C(v_1w)}{n_1}-\frac{C(v_2w)}{n_2}} {\sqrt{\frac{C(v_1w)}{n_1} + \frac{C(v_2w)}{n_2}}} \]

Question: Are N bigram counts or word counts? Assume they are bigrams.

Compare native to non-native:

total.bigrams.native.lang = bigram.counts %>%
  group_by(native.lang) %>%
  summarize(total.bigrams = sum(n))

bigram.counts.rf = bigram.counts %>%
  left_join(total.bigrams.native.lang) %>%
  mutate(rf = n/total.bigrams)
  
t.scores <- bigram.counts.rf %>%
  ungroup() %>%
  as_tibble() %>%
  select(native.lang, rf, cue, associate) %>%
  spread(native.lang, rf) %>%
  filter(!is.na(english), !is.na(other)) %>%
  mutate(t = (english - other)/sqrt(english  + other)) %>%
  mutate(t = abs(t))

t-scores

ggplot(t.scores, aes(x = t)) +
  geom_histogram() +
  theme_bw() +
  ggtitle("distribution of t-scores")

Large absolute t.scores

t.scores %>% 
  arrange(-t) %>%
  select(cue, associate, t) %>%
  slice (1:100) %>%
  kable()

cue	associate	t
boiling	water	0.0091908
calorie	food	0.0077082
jaguar	animal	0.0074312
cholesterol	fat	0.0069060
hurts	pain	0.0067886
polyester	plastic	0.0067227
tires	car	0.0066291
calorie	fat	0.0065609
lollipop	sweet	0.0065609
not smart	stupid	0.0065203
hold on	wait	0.0065012
presents	birthday	0.0064531
diapers	baby	0.0063988
nirvana	band	0.0063562
breast feeding	mother	0.0063209
Freud	psychoanalysis	0.0062768
lily	flower	0.0061485
swimmer	water	0.0061346
gladiator	Rome	0.0061207
junk food	fat	0.0061207
utensil	tool	0.0061207
chewing	gum	0.0061052
vagina	sex	0.0060618
raccoon	animal	0.0060035
refugee	war	0.0060035
corn-beef	meat	0.0059667
Florida	miami	0.0059667
sulfur	chemistry	0.0059667
brainy	intelligent	0.0059348
vagina	woman	0.0059345
head & shoulders	shampoo	0.0059034
vacancy	job	0.0058916
flirt	love	0.0058819
cooked	food	0.0058719
jaguar	car	0.0058404
cents	euro	0.0058324
salami	food	0.0058324
drums	music	0.0058166
fountain	water	0.0058068
lunar	moon	0.0058068
dealer	drugs	0.0057826
leftover	food	0.0057768
white powder	cocaine	0.0057230
junk food	unhealthy	0.0057151
ships	water	0.0057151
asteroid	space	0.0056679
eyelid	eye	0.0056636
lubricant	sex	0.0056636
heartbeat	love	0.0056432
peroxide	chemistry	0.0056432
plates	dish	0.0056432
vaseline	sex	0.0056432
viola	flower	0.0056432
embarrassed	shame	0.0056329
atheism	god	0.0056241
bunker	war	0.0056241
ivory	elephant	0.0055633
cornflakes	breakfast	0.0055631
bedsheets	bed	0.0055515
egocentric	me	0.0055504
scuba	sea	0.0055504
tangerine	fruit	0.0055310
Buddha	religion	0.0055225
gifts	birthday	0.0055225
isle	island	0.0054981
newsstand	news	0.0054709
topping	cake	0.0054709
catapult	war	0.0054586
drinks	friends	0.0054586
intestine	food	0.0054586
November	rain	0.0054586
clementine	orange	0.0054580
tutu	ballet	0.0054421
Alpine	mountain	0.0054002
heroic	hero	0.0053911
mayonnaise	egg	0.0053911
yours	mine	0.0053871
honk	car	0.0053823
campfire	tent	0.0053680
Disney	cartoon	0.0053680
CD	music	0.0053538
cornflakes	milk	0.0053355
dark brown	chocolate	0.0053355
curved	road	0.0053120
festival	music	0.0053120
caress	love	0.0053082
demolition	house	0.0053068
mixer	kitchen	0.0053068
nerves	body	0.0053068
grunge	seattle	0.0052784
proven	evidence	0.0052784
sheets	white	0.0052784
snowball	fun	0.0052784
tangerine	mandarin	0.0052784
anatomy	body	0.0052759
boiling	hot	0.0052759
thigh	leg	0.0052759
musical instrument	guitar	0.0052712
boiled	water	0.0052636
chanting	singing	0.0052379

Predicting divergence

Now let’s see if we can predict the t-score based on characteristics of the cue.

Join t.scores and cues characteristics

cues.chars = read.csv ("data/cues_chars.csv")

t.scores.full = t.scores %>%
  left_join(cues.chars) %>%
  select(-english, -other)

Write df

mean.t.scores <- t.scores %>%
  select(cue, t) %>%
  distinct() %>%
  group_by(cue) %>%
  summarize(t = mean(t))

#write.csv(mean.t.scores, "data/t.scors.by.cue.filtered.csv")

Frequency

Does frequency of cue predict mean t for a cue?

freq.ts =  t.scores.full %>%
  select(cue, associate, t, Lg10WF) %>%
  distinct() %>%
  group_by(cue) %>%
  summarize(t = mean(t), # mean t for each cue
            Lg10WF = Lg10WF[1])

ggplot(freq.ts, aes(x = Lg10WF, y = t)) +
  geom_smooth(method  = "lm") +
  theme_bw()

freq.ts %>%
  do(tidy(cor.test(.$t, .$Lg10WF))) %>%
  select(estimate, statistic, p.value) %>%
  kable()

estimate	statistic	p.value
-0.1030076	-9.877217	0

Higher frequency cues have less divergence.

Sentiment - quant

Does sentiment of cue predict mean t for a cue?

quantsent.ts =  t.scores.full %>%
  select(cue, associate, t, quant.sent) %>%
  distinct() %>%
  group_by(cue) %>%
  summarize(t = mean(t), # mean t for each cue
            quant.sent = quant.sent[1])

ggplot(quantsent.ts, aes(x = quant.sent, y = t)) +
  geom_smooth(method  = "lm") +
  theme_bw()

quantsent.ts %>%
  do(tidy(cor.test(.$t, .$quant.sent))) %>%
  select(estimate, statistic, p.value) %>%
  kable()

estimate	statistic	p.value
-0.0892866	-2.919994	0.0035744

No relation between sentiment of cue and t.

Sentiment - qual

Does sentiment of cue predict mean t for a cue?

qual.sent.ts =  t.scores.full %>%
  select(cue, associate, t, qual.sent) %>%
  distinct() %>%
  filter(!is.na(qual.sent)) %>%
  group_by(cue,qual.sent) %>%
  summarize(t = mean(t)) %>% # mean t for each cue %>%
  group_by(qual.sent) %>%
  multi_boot_standard(column = "t")

ggplot(qual.sent.ts, aes(fill = qual.sent, y = mean, x = reorder(qual.sent, mean))) +
    xlab("sentiment") +
    ylab("t") +
    geom_bar(stat = "identity", position = "dodge") +
    geom_linerange(aes(ymax = ci_upper, ymin=ci_lower),
                 position = position_dodge(width = .9)) +
    theme_bw() +
    theme(legend.position="none",
          axis.text.x = element_text(angle = 90, hjust = 1))

Not much going on here, small effect of negative – more difference for negative words.

Part of speech

Does pos of cue predict mean t for a cue?

pos.ts =  t.scores.full %>%
  select(cue, associate, t, pos) %>%
  distinct() %>%
  filter(!is.na(pos)) %>%
  group_by(cue,pos) %>%
  summarize(t = mean(t)) %>% # mean t for each cue 
  group_by(pos) %>%
  multi_boot_standard(column = "t")

ggplot(pos.ts, aes(fill = pos, y = mean, x = reorder(pos, mean))) +
    xlab("pos") +
    ylab("t") +
    geom_bar(stat = "identity", position = "dodge") +
    geom_linerange(aes(ymax = ci_upper, ymin=ci_lower),
                 position = position_dodge(width = .9)) +
    theme_bw() +
    theme(legend.position="none",
          axis.text.x = element_text(angle = 90, hjust = 1))

Less variability for function words.

Concreteness

Does concreteness of cue predict mean t for a cue?

conc.ts =  t.scores.full %>%
  select(cue, associate, t, Conc.M) %>%
  distinct() %>%
  group_by(cue) %>%
  summarize(t = mean(t), # mean t for each cue
            conc = Conc.M[1])

ggplot(conc.ts, aes(x = conc, y = t)) +
  geom_smooth(method  = "lm") +
  theme_bw()

conc.ts %>%
  do(tidy(cor.test(.$t, .$conc))) %>%
  select(estimate, statistic, p.value) %>%
  kable()

estimate	statistic	p.value
-0.0226869	-2.152344	0.0313968

Small effect of concreteness in the predicted direction – Less difference for cues that are more concrete.