Do the statistics of cue-associate bigrams differ across languages?

Read in data and munge

d.raw = read.csv("../../data/associations_ppdetails_en_05_01_2015.csv")

lang.codes = read.csv("../../data/language_codes.csv") %>%
  select(ISO639.2BCode, LanguageName)

d.long = d.raw %>%
  gather("association", "word", 7:9) %>%
  mutate(word = gsub("\\bx\\b", "NA", word)) %>% # recode missing responses (entered as "x") as the string "NA"; filtered out below
  spread("association", "word") %>%
  rename(a1 = asso1Clean,
         a2 = asso2Clean,
         a3 = asso3Clean) 

d.clean = d.long %>%
  left_join(lang.codes, by = c("nativeLanguage" = "ISO639.2BCode")) %>%
  filter(nativeLanguage != "eng" & nativeLanguage != "" & nativeLanguage != "99" &
          nativeLanguage != "fla" & nativeLanguage != "can"  & nativeLanguage != "nan"  &
           nativeLanguage != "pun" & nativeLanguage != "nl") %>%
  mutate(LanguageName = ifelse(grepl("^[[:upper:]]+$", nativeLanguage), "English", # all-uppercase values are country codes from native English speakers
                               as.character(LanguageName)),
         LanguageName = as.factor(LanguageName),
         country = ifelse(grepl("^[[:upper:]]+$", nativeLanguage), nativeLanguage, NA),
         country = as.factor(country),
         native.lang = ifelse(LanguageName == "English", "english", "other"),
         native.lang = as.factor(native.lang)) %>%
  select(-nativeLanguage)

Collapse across associate types

d.all.ca = d.clean %>%
  ungroup() %>%
  gather("associate.type", "associate", 6:8) %>%
  filter(cue != "NA" & associate != "NA" ) %>%
  mutate(bigram = paste(cue, associate))

bigram.counts = d.all.ca %>%
  count(native.lang, cue, associate, bigram) 

Participant counts

d.all.ca %>%
  select(userID,  native.lang) %>%
  distinct() %>%
  group_by(native.lang ) %>%
  summarize(n=n()) %>%
  kable()
| native.lang | n |
|:---|---:|
| english | 62412 |
| other | 9369 |

Paired t-test

From Manning and Schuetze (1999, p. 166): discover co-occurrences that best distinguish between words within the same corpus. This is slightly different from our goal, because here we want to compare across corpora, and thus our N's differ. I modify their formula to account for this (analogous to Welch's two-sample t-test).

Manning and Schuetze: \[ t = \frac{\frac{C(v_1w)}{N}-\frac{C(v_2w)}{N}} {\sqrt{\frac{C(v_1w) + C(v_2w)}{N^2}}} \]

Welch’s version: \[ t = \frac{\frac{C(v_1w)}{n_1}-\frac{C(v_2w)}{n_2}} {\sqrt{\frac{C(v_1w)}{n_1} + \frac{C(v_2w)}{n_2}}} \]

Question: Is N a count of bigrams or of words? Assume it is the bigram count.
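As a sanity check, here is a minimal sketch of the modified formula expressed over raw counts (the function name and the counts are made up for illustration; the analysis below computes the same quantity from relative frequencies with dplyr):

welch_bigram_t <- function(c1, c2, n1, n2) {
  # c1, c2: counts of a given cue-associate bigram in each group
  # n1, n2: total bigram counts for each group
  rf1 <- c1 / n1
  rf2 <- c2 / n2
  (rf1 - rf2) / sqrt(rf1 + rf2)
}

# hypothetical counts: 50 of 1,000,000 native bigrams vs. 3 of 150,000 non-native bigrams
welch_bigram_t(c1 = 50, c2 = 3, n1 = 1e6, n2 = 1.5e5)
# ~0.0036, on the same scale as the top t-scores reported below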

Compare native to non-native:

total.bigrams.native.lang = bigram.counts %>%
  group_by(native.lang) %>%
  summarize(total.bigrams = sum(n))

bigram.counts.rf = bigram.counts %>%
  left_join(total.bigrams.native.lang) %>%
  mutate(rf = n/total.bigrams)
  
t.scores <- bigram.counts.rf %>%
  ungroup() %>%
  as_tibble() %>%
  select(native.lang, rf, cue, associate) %>%
  spread(native.lang, rf) %>%
  filter(!is.na(english), !is.na(other)) %>%
  mutate(t = (english - other)/sqrt(english + other)) # Welch-style t computed on relative frequencies (see formula above)

t-scores

ggplot(t.scores, aes(x = t)) +
  geom_histogram() +
  theme_bw() +
  ggtitle("distribution of t-scores")

t.scores %>% 
  arrange(-t) %>%
  select(cue, associate, t) %>%
  slice (1:100) %>%
  kable()
| cue | associate | t |
|:---|:---|---:|
| bashful | shy | 0.0049792 |
| phobia | fear | 0.0049036 |
| venison | deer | 0.0047491 |
| cob | corn | 0.0046577 |
| ought | should | 0.0046195 |
| intoxication | drunk | 0.0045492 |
| syringe | needle | 0.0045082 |
| spud | potato | 0.0045031 |
| crayola | crayon | 0.0044670 |
| eggplant | purple | 0.0043833 |
| dingo | dog | 0.0043410 |
| heifer | cow | 0.0043035 |
| honeydew | melon | 0.0043035 |
| tardy | late | 0.0043035 |
| autumn | fall | 0.0042983 |
| plasma | blood | 0.0042983 |
| nutmeg | spice | 0.0042551 |
| rattle | baby | 0.0042551 |
| request | ask | 0.0042551 |
| require | need | 0.0042116 |
| paprika | spice | 0.0041677 |
| skeleton | bones | 0.0041677 |
| would | should | 0.0041677 |
| belch | burp | 0.0041384 |
| blaring | loud | 0.0041384 |
| intriguing | interesting | 0.0041234 |
| crate | box | 0.0040787 |
| squeak | mouse | 0.0040540 |
| collapse | fall | 0.0040335 |
| not sharp | dull | 0.0040335 |
| you | me | 0.0040112 |
| cuckoo | clock | 0.0039681 |
| present | gift | 0.0039681 |
| felon | criminal | 0.0039418 |
| human | being | 0.0039418 |
| stuffed | full | 0.0039418 |
| celebration | party | 0.0039246 |
| calamari | squid | 0.0038952 |
| outrage | anger | 0.0038952 |
| petroleum | oil | 0.0038952 |
| platter | plate | 0.0038952 |
| shingle | roof | 0.0038952 |
| son | daughter | 0.0038952 |
| bolts | nuts | 0.0038481 |
| glee | happy | 0.0038481 |
| pane | glass | 0.0038481 |
| saucer | cup | 0.0038481 |
| shoot | gun | 0.0038481 |
| sloppy | messy | 0.0038481 |
| canary | bird | 0.0038365 |
| congregation | church | 0.0038365 |
| kid | child | 0.0038365 |
| kitty | cat | 0.0038017 |
| ewe | sheep | 0.0038006 |
| expense | cost | 0.0038006 |
| London | england | 0.0038006 |
| mend | fix | 0.0038006 |
| menstruation | period | 0.0038006 |
| rattle | snake | 0.0038006 |
| woof | bark | 0.0038006 |
| splinter | wood | 0.0037919 |
| brief | short | 0.0037594 |
| conclude | finish | 0.0037525 |
| dill | pickle | 0.0037525 |
| fracture | break | 0.0037525 |
| lime | green | 0.0037525 |
| acquaintance | friend | 0.0037469 |
| greatest | best | 0.0037469 |
| off | on | 0.0037469 |
| untidy | messy | 0.0037469 |
| amen | prayer | 0.0037039 |
| gum | chew | 0.0037039 |
| nimble | quick | 0.0037039 |
| polygon | shape | 0.0037039 |
| spew | vomit | 0.0037039 |
| stork | baby | 0.0037039 |
| youngster | child | 0.0037039 |
| holler | yell | 0.0036557 |
| probable | likely | 0.0036557 |
| stump | tree | 0.0036557 |
| barrister | lawyer | 0.0036547 |
| clef | treble | 0.0036547 |
| donkey | ass | 0.0036547 |
| ladle | spoon | 0.0036547 |
| mustard | yellow | 0.0036547 |
| on time | punctual | 0.0036547 |
| story | tale | 0.0036547 |
| twirl | spin | 0.0036547 |
| vivid | bright | 0.0036547 |
| wizened | old | 0.0036547 |
| picket | fence | 0.0036305 |
| beet | red | 0.0036095 |
| certain | sure | 0.0036095 |
| converse | talk | 0.0036095 |
| frightening | scary | 0.0036095 |
| portly | fat | 0.0036095 |
| taut | tight | 0.0036095 |
| banister | stairs | 0.0036050 |
| corpse | body | 0.0036050 |
| ergo | therefore | 0.0036050 |

Predicting divergence

Now let’s see if we can predict the t-score based on characteristics of the cue.

Create a dataset with many characteristics for each cue.

# all unique cues
cues.chars = d.clean %>%
  group_by(cue) %>%
  slice(1) %>%
  select(cue) 

# read in frequency and concreteness
subtlexus.url <- getURL("https://raw.githubusercontent.com/mllewis/RC/master/data/corpus/SUBTLEXus_corpus.txt")
freqs <- read.table(text = subtlexus.url, header=TRUE) %>%
          select(Word,Lg10WF)
concreteness <-read.csv("data/brysbaert_corpus.csv", header=TRUE)

# merge in all characteristics (note that this adds rows to dataframe where a cue has multiple characteristics)
cues.chars = cues.chars %>%
  left_join(freqs, by=c("cue"= "Word"))  %>%
  left_join(get_sentiments("afinn"), by=c("cue"= "word")) %>%
  rename(quant.sent = score)  %>%
  left_join(get_sentiments("nrc"), by=c("cue"= "word")) %>%
  rename(qual.sent = sentiment) %>%
  left_join(parts_of_speech, by=c("cue"= "word")) %>%
  left_join(concreteness %>% select(Word,Conc.M), by=c("cue"= "Word"))

# write.csv(cues.chars, "data/cues_chars.csv")

Join t.scores and cue characteristics

t.scores.full = t.scores %>%
  left_join(cues.chars) %>%
  select(-english, -other)
mean.t.scores <- t.scores %>%
  select(cue, t) %>%
  distinct() %>%
  group_by(cue) %>%
  summarize(t = mean(t))

write.csv(mean.t.scores, "data/t.scors.by.cue.csv")

Frequency

Does the frequency of a cue predict its mean t?

freq.ts =  t.scores.full %>%
  select(cue, associate, t, Lg10WF) %>%
  distinct() %>%
  group_by(cue) %>%
  summarize(t = mean(t), # mean t for each cue
            Lg10WF = Lg10WF[1])

ggplot(freq.ts, aes(x = Lg10WF, y = t)) +
  geom_smooth(method  = "lm") +
  theme_bw()

freq.ts %>%
  do(tidy(cor.test(.$t, .$Lg10WF))) %>%
  select(estimate, statistic, p.value) %>%
  kable()
| estimate | statistic | p.value |
|---:|---:|---:|
| 0.0069726 | 0.6693821 | 0.5032685 |

No relation between cue frequency and t (r = .007, p = .50).

Sentiment - quant

Does the quantitative (AFINN) sentiment of a cue predict its mean t?

quantsent.ts =  t.scores.full %>%
  select(cue, associate, t, quant.sent) %>%
  distinct() %>%
  group_by(cue) %>%
  summarize(t = mean(t), # mean t for each cue
            quant.sent = quant.sent[1])

ggplot(quantsent.ts, aes(x = quant.sent, y = t)) +
  geom_smooth(method  = "lm") +
  theme_bw()

quantsent.ts %>%
  do(tidy(cor.test(.$t, .$quant.sent))) %>%
  select(estimate, statistic, p.value) %>%
  kable()
| estimate | statistic | p.value |
|---:|---:|---:|
| 0.0262494 | 0.8597384 | 0.3901255 |

No relation between cue sentiment and t (r = .03, p = .39).

Sentiment - qual

Does the qualitative (NRC) sentiment category of a cue predict its mean t?

qual.sent.ts =  t.scores.full %>%
  select(cue, associate, t, qual.sent) %>%
  distinct() %>%
  filter(!is.na(qual.sent)) %>%
  group_by(cue,qual.sent) %>%
  summarize(t = mean(t)) %>% # mean t for each cue x sentiment category
  group_by(qual.sent) %>%
  multi_boot_standard(column = "t")

ggplot(qual.sent.ts, aes(fill = qual.sent, y = mean, x = reorder(qual.sent, mean))) +
    xlab("sentiment") +
    ylab("t") +
    geom_bar(stat = "identity", position = "dodge") +
    geom_linerange(aes(ymax = ci_upper, ymin=ci_lower),
                 position = position_dodge(width = .9)) +
    theme_bw() +
    theme(legend.position="none",
          axis.text.x = element_text(angle = 90, hjust = 1))

Not much going on here; there is at most a small effect of valence, with more divergence for negative than positive words.

Part of speech

Does the part of speech of a cue predict its mean t?

pos.ts =  t.scores.full %>%
  select(cue, associate, t, pos) %>%
  distinct() %>%
  filter(!is.na(pos)) %>%
  group_by(cue,pos) %>%
  summarize(t = mean(t)) %>% # mean t for each cue 
  group_by(pos) %>%
  multi_boot_standard(column = "t")

ggplot(pos.ts, aes(fill = pos, y = mean, x = reorder(pos, mean))) +
    xlab("pos") +
    ylab("t") +
    geom_bar(stat = "identity", position = "dodge") +
    geom_linerange(aes(ymax = ci_upper, ymin=ci_lower),
                 position = position_dodge(width = .9)) +
    theme_bw() +
    theme(legend.position="none",
          axis.text.x = element_text(angle = 90, hjust = 1))

Less variability for function words.

Concreteness

Does the concreteness of a cue predict its mean t?

conc.ts =  t.scores.full %>%
  select(cue, associate, t, Conc.M) %>%
  distinct() %>%
  group_by(cue) %>%
  summarize(t = mean(t), # mean t for each cue
            conc = Conc.M[1])

ggplot(conc.ts, aes(x = conc, y = t)) +
  geom_smooth(method  = "lm") +
  theme_bw()

conc.ts %>%
  do(tidy(cor.test(.$t, .$conc))) %>%
  select(estimate, statistic, p.value) %>%
  kable()
| estimate | statistic | p.value |
|---:|---:|---:|
| 0.0441618 | 4.218039 | 2.49e-05 |

Small effect of concreteness in the predicted direction (r = .04, p < .001): less difference for cues that are more concrete.