Native vs. non-native responses association differences

Do the statistics of cue-associate bigrams differ across languages?

Read in data and munge

d.raw = read.csv("../../data/associations_ppdetails_en_05_01_2015.csv")

lang.codes = read.csv("../../data/language_codes.csv") %>%
  select(ISO639.2BCode, LanguageName)

d.long = d.raw %>%
  gather("association", "word", 7:9) %>%
  mutate(word = gsub("\\bx\\b", "NA", word)) %>% # remove missing words
  spread("association", "word") %>%
  rename(a1 = asso1Clean,
         a2 = asso2Clean,
         a3 = asso3Clean) 

d.clean = d.long %>%
  left_join(lang.codes, by = c("nativeLanguage" = "ISO639.2BCode")) %>%
  filter(nativeLanguage != "eng" & nativeLanguage != "" & nativeLanguage != "99" &
          nativeLanguage != "fla" & nativeLanguage != "can"  & nativeLanguage != "nan"  &
           nativeLanguage != "pun" & nativeLanguage != "nl") %>%
  mutate(LanguageName = ifelse(grepl("^[[:upper:]]+$", nativeLanguage), "English",
                               as.character(LanguageName)),
         LanguageName = as.factor(LanguageName),
         country = ifelse(grepl("^[[:upper:]]+$", nativeLanguage), nativeLanguage, NA),
         country = as.factor(country),
         native.lang = ifelse(LanguageName == "English", "english", "other"),
         native.lang = as.factor(native.lang)) %>%
  select(-nativeLanguage)

# Get second languages and num participants, with greater than N_CUTOFF participants
N_CUTOFF <- 400

lang.dems = d.clean %>%
    group_by(userID, LanguageName, native.lang) %>%
    slice(1) %>%
    group_by(LanguageName, native.lang) %>%
    summarise(n = n()) %>%
    filter(native.lang == "other") %>%
    arrange(-n) %>%
    filter(n > N_CUTOFF) %>%
    select(-native.lang) %>%
    ungroup()

collapse across associates

d.all.ca = d.clean %>%
  filter(LanguageName %in% lang.dems$LanguageName | LanguageName == "English") %>%
  ungroup() %>%
  gather("associate.type", "associate", 6:8) %>%
  filter(cue != "NA" & associate != "NA" ) %>%
  mutate(bigram = paste(cue, associate))

bigram.counts = d.all.ca %>%
  count(native.lang, cue, associate, bigram)

Participant counts

d.all.ca %>%
  select(userID, LanguageName, native.lang) %>%
  distinct() %>%
  group_by(LanguageName,native.lang ) %>%
  summarize(n=n()) %>%
  kable()

LanguageName	native.lang	n
Dutch	other	687
English	english	62412
Finnish	other	459
French	other	601
German	other	1176
Italian	other	411
Spanish	other	1071

Paired t-test

From Manning and Schuetze (1999; pg. 166): Discover co-occurences that best-distinguish between words within the same corpus. This is slightly different that our goal, because here we want to compare across corpora, and thus our N’s are different. Here, I modify their formula to account for this (analagous to Welch’s two-sample t-test).

Manning and Schuetze: \[ t = \frac{\frac{C(v_1w)}{N}-\frac{C(v_2w)}{N}} {\sqrt{\frac{C(v_1w) + C(v_2w)}{N^2}}} \]

Welch’s version: \[ t = \frac{\frac{C(v_1w)}{n_1}-\frac{C(v_2w)}{n_2}} {\sqrt{\frac{C(v_1w)}{n_1} + \frac{C(v_2w)}{n_2}}} \]

Question: Are N bigram counts or word counts? Assume they are bigrams.

Compare native to non-native:

total.bigrams.native.lang = bigram.counts %>%
  group_by(native.lang) %>%
  summarize(total.bigrams = sum(n))

bigram.counts.rf = bigram.counts %>%
  left_join(total.bigrams.native.lang) %>%
  mutate(rf = n/total.bigrams)
  
t.scores <- bigram.counts.rf %>%
  ungroup() %>%
  as_tibble() %>%
  select(native.lang, rf, cue, associate) %>%
  spread(native.lang, rf) %>%
  filter(!is.na(english), !is.na(other)) %>%
  mutate(t = (english - other)/sqrt(english  + other))

t-scores

ggplot(t.scores, aes(x = t)) +
  geom_histogram() +
  theme_bw() +
  ggtitle("distribution of t-scores")

Large t.scores

t.scores %>% 
  arrange(-t) %>%
  select(cue, associate, t) %>%
  slice (1:100) %>%
  kable()

cue	associate	t
begin	start	0.0046245
mozzarella	cheese	0.0046245
cob	corn	0.0045862
comprehend	understand	0.0044698
exhausted	tired	0.0044698
branch	tree	0.0044305
smooch	kiss	0.0044305
espresso	coffee	0.0043107
kitty	cat	0.0043107
bashful	shy	0.0042703
yolk	egg	0.0042703
heifer	cow	0.0042295
assist	help	0.0041884
freezing	cold	0.0041884
ill	sick	0.0041884
phobia	fear	0.0041884
chubby	fat	0.0041470
quick	fast	0.0041052
chilly	cold	0.0040632
flamingo	pink	0.0040632
actual	real	0.0040208
polar	bear	0.0040208
curl	hair	0.0039781
film	movie	0.0039781
sterling	silver	0.0039781
wager	bet	0.0039781
grin	smile	0.0039350
parchment	paper	0.0039350
weary	tired	0.0039350
blackboard	chalk	0.0038916
cuckoo	clock	0.0038916
present	gift	0.0038916
scheme	plan	0.0038916
celebration	party	0.0038478
crossword	puzzle	0.0038478
cub	bear	0.0038478
however	but	0.0038478
query	question	0.0038478
consume	eat	0.0038036
finances	money	0.0038036
intoxication	drunk	0.0038036
stone	rock	0.0038036
canary	bird	0.0037591
chapel	church	0.0037591
congregation	church	0.0037591
idiotic	stupid	0.0037591
instruct	teach	0.0037591
comrade	friend	0.0037141
cranny	nook	0.0037141
found	lost	0.0037141
frequently	often	0.0037141
rifle	gun	0.0037141
row	boat	0.0037141
smiling	happy	0.0037141
acquaintance	friend	0.0036688
aged	old	0.0036688
off	on	0.0036688
aroma	smell	0.0036230
bestow	give	0.0036230
bullet	gun	0.0036230
hello	hi	0.0036230
heron	bird	0.0036230
obsessive	compulsive	0.0036230
seek	find	0.0036230
station	train	0.0036230
thank	you	0.0036230
unkind	mean	0.0036230
anchovy	fish	0.0035769
bumper	car	0.0035769
corpse	dead	0.0035769
dagger	knife	0.0035769
dingo	dog	0.0035769
ebony	black	0.0035769
gigantic	huge	0.0035769
handbag	purse	0.0035769
holler	yell	0.0035769
pupil	student	0.0035769
start	begin	0.0035769
stump	tree	0.0035769
briefs	underwear	0.0035303
certain	sure	0.0035303
converse	talk	0.0035303
frightening	scary	0.0035303
plasma	blood	0.0035303
portly	fat	0.0035303
reef	coral	0.0035303
woodpecker	bird	0.0035303
amphibian	frog	0.0034833
cuisine	food	0.0034833
federal	government	0.0034833
height	tall	0.0034833
hoot	owl	0.0034833
iris	flower	0.0034833
jingle	bells	0.0034833
licorice	black	0.0034833
paste	glue	0.0034833
rattle	baby	0.0034833
request	ask	0.0034833
scissors	cut	0.0034833
wellness	health	0.0034833

Predicting divergence

Now let’s see if we can predict the t-score based on characteristics of the cue.

Join t.scores and cues characteristics

cues.chars = read.csv ("data/cues_chars.csv")

t.scores.full = t.scores %>%
  left_join(cues.chars) %>%
  select(-english, -other)

Write df

mean.t.scores <- t.scores %>%
  select(cue, t) %>%
  distinct() %>%
  group_by(cue) %>%
  summarize(t = mean(t))

#write.csv(mean.t.scores, "data/t.scors.by.cue.filtered.csv")

Frequency

Does frequency of cue predict mean t for a cue?

freq.ts =  t.scores.full %>%
  select(cue, associate, t, Lg10WF) %>%
  distinct() %>%
  group_by(cue) %>%
  summarize(t = mean(t), # mean t for each cue
            Lg10WF = Lg10WF[1])

ggplot(freq.ts, aes(x = Lg10WF, y = t)) +
  geom_smooth(method  = "lm") +
  theme_bw()

freq.ts %>%
  do(tidy(cor.test(.$t, .$Lg10WF))) %>%
  select(estimate, statistic, p.value) %>%
  kable()

estimate	statistic	p.value
0.0825235	7.897885	0

Higher frequency cues have less divergence.

Sentiment - quant

Does sentiment of cue predict mean t for a cue?

quantsent.ts =  t.scores.full %>%
  select(cue, associate, t, quant.sent) %>%
  distinct() %>%
  group_by(cue) %>%
  summarize(t = mean(t), # mean t for each cue
            quant.sent = quant.sent[1])

ggplot(quantsent.ts, aes(x = quant.sent, y = t)) +
  geom_smooth(method  = "lm") +
  theme_bw()

quantsent.ts %>%
  do(tidy(cor.test(.$t, .$quant.sent))) %>%
  select(estimate, statistic, p.value) %>%
  kable()

estimate	statistic	p.value
0.0253056	0.8245423	0.4098168

No relation between sentiment of cue and t.

Sentiment - qual

Does sentiment of cue predict mean t for a cue?

qual.sent.ts =  t.scores.full %>%
  select(cue, associate, t, qual.sent) %>%
  distinct() %>%
  filter(!is.na(qual.sent)) %>%
  group_by(cue,qual.sent) %>%
  summarize(t = mean(t)) %>% # mean t for each cue %>%
  group_by(qual.sent) %>%
  multi_boot_standard(column = "t")

ggplot(qual.sent.ts, aes(fill = qual.sent, y = mean, x = reorder(qual.sent, mean))) +
    xlab("sentiment") +
    ylab("t") +
    geom_bar(stat = "identity", position = "dodge") +
    geom_linerange(aes(ymax = ci_upper, ymin=ci_lower),
                 position = position_dodge(width = .9)) +
    theme_bw() +
    theme(legend.position="none",
          axis.text.x = element_text(angle = 90, hjust = 1))

Not much going on here, small effect of negative – more difference for negative words.

Part of speech

Does pos of cue predict mean t for a cue?

pos.ts =  t.scores.full %>%
  select(cue, associate, t, pos) %>%
  distinct() %>%
  filter(!is.na(pos)) %>%
  group_by(cue,pos) %>%
  summarize(t = mean(t)) %>% # mean t for each cue 
  group_by(pos) %>%
  multi_boot_standard(column = "t")

ggplot(pos.ts, aes(fill = pos, y = mean, x = reorder(pos, mean))) +
    xlab("pos") +
    ylab("t") +
    geom_bar(stat = "identity", position = "dodge") +
    geom_linerange(aes(ymax = ci_upper, ymin=ci_lower),
                 position = position_dodge(width = .9)) +
    theme_bw() +
    theme(legend.position="none",
          axis.text.x = element_text(angle = 90, hjust = 1))

Less variability for function words.

Concreteness

Does concreteness of cue predict mean t for a cue?

conc.ts =  t.scores.full %>%
  select(cue, associate, t, Conc.M) %>%
  distinct() %>%
  group_by(cue) %>%
  summarize(t = mean(t), # mean t for each cue
            conc = Conc.M[1])

ggplot(conc.ts, aes(x = conc, y = t)) +
  geom_smooth(method  = "lm") +
  theme_bw()

conc.ts %>%
  do(tidy(cor.test(.$t, .$conc))) %>%
  select(estimate, statistic, p.value) %>%
  kable()

estimate	statistic	p.value
0.0607458	5.772228	0

Small effect of concreteness in the predicted direction – Less difference for cues that are more concrete.