Do the statistics of cue-associate bigrams differ across languages?
Read in data and munge
d.raw = read.csv("../../data/associations_ppdetails_en_05_01_2015.csv")
lang.codes = read.csv("../../data/language_codes.csv") %>%
select(ISO639.2BCode, LanguageName)
d.long = d.raw %>%
gather("association", "word", 7:9) %>%
mutate(word = gsub("\\bx\\b", "NA", word)) %>% # remove missing words
spread("association", "word") %>%
rename(a1 = asso1Clean,
a2 = asso2Clean,
a3 = asso3Clean)
d.clean = d.long %>%
left_join(lang.codes, by = c("nativeLanguage" = "ISO639.2BCode")) %>%
filter(nativeLanguage != "eng" & nativeLanguage != "" & nativeLanguage != "99" &
nativeLanguage != "fla" & nativeLanguage != "can" & nativeLanguage != "nan" &
nativeLanguage != "pun" & nativeLanguage != "nl") %>%
mutate(LanguageName = ifelse(grepl("^[[:upper:]]+$", nativeLanguage), "English",
as.character(LanguageName)),
LanguageName = as.factor(LanguageName),
country = ifelse(grepl("^[[:upper:]]+$", nativeLanguage), nativeLanguage, NA),
country = as.factor(country),
native.lang = ifelse(LanguageName == "English", "english", "other"),
native.lang = as.factor(native.lang)) %>%
select(-nativeLanguage)
# Get second languages and num participants, with greater than N_CUTOFF participants
N_CUTOFF <- 400
lang.dems = d.clean %>%
group_by(userID, LanguageName, native.lang) %>%
slice(1) %>%
group_by(LanguageName, native.lang) %>%
summarise(n = n()) %>%
filter(native.lang == "other") %>%
arrange(-n) %>%
filter(n > N_CUTOFF) %>%
select(-native.lang) %>%
ungroup()
collapse across associates
d.all.ca = d.clean %>%
filter(LanguageName %in% lang.dems$LanguageName | LanguageName == "English") %>%
ungroup() %>%
gather("associate.type", "associate", 6:8) %>%
filter(cue != "NA" & associate != "NA" ) %>%
mutate(bigram = paste(cue, associate))
bigram.counts = d.all.ca %>%
count(native.lang, cue, associate, bigram)
d.all.ca %>%
select(userID, LanguageName, native.lang) %>%
distinct() %>%
group_by(LanguageName,native.lang ) %>%
summarize(n=n()) %>%
kable()
| LanguageName | native.lang | n |
|---|---|---|
| Dutch | other | 687 |
| English | english | 62412 |
| Finnish | other | 459 |
| French | other | 601 |
| German | other | 1176 |
| Italian | other | 411 |
| Spanish | other | 1071 |
From Manning and Schuetze (1999; pg. 166): Discover co-occurences that best-distinguish between words within the same corpus. This is slightly different that our goal, because here we want to compare across corpora, and thus our N’s are different. Here, I modify their formula to account for this (analagous to Welch’s two-sample t-test).
Manning and Schuetze: \[ t = \frac{\frac{C(v_1w)}{N}-\frac{C(v_2w)}{N}} {\sqrt{\frac{C(v_1w) + C(v_2w)}{N^2}}} \]
Welch’s version: \[ t = \frac{\frac{C(v_1w)}{n_1}-\frac{C(v_2w)}{n_2}} {\sqrt{\frac{C(v_1w)}{n_1} + \frac{C(v_2w)}{n_2}}} \]
Question: Are N bigram counts or word counts? Assume they are bigrams.
Compare native to non-native:
total.bigrams.native.lang = bigram.counts %>%
group_by(native.lang) %>%
summarize(total.bigrams = sum(n))
bigram.counts.rf = bigram.counts %>%
left_join(total.bigrams.native.lang) %>%
mutate(rf = n/total.bigrams)
t.scores <- bigram.counts.rf %>%
ungroup() %>%
as_tibble() %>%
select(native.lang, rf, cue, associate) %>%
spread(native.lang, rf) %>%
filter(!is.na(english), !is.na(other)) %>%
mutate(t = (english - other)/sqrt(english + other)) %>%
mutate(t = abs(t))
t-scores
ggplot(t.scores, aes(x = t)) +
geom_histogram() +
theme_bw() +
ggtitle("distribution of t-scores")
Large absolute t.scores
t.scores %>%
arrange(-t) %>%
select(cue, associate, t) %>%
slice (1:100) %>%
kable()
| cue | associate | t |
|---|---|---|
| boiling | water | 0.0091908 |
| calorie | food | 0.0077082 |
| jaguar | animal | 0.0074312 |
| cholesterol | fat | 0.0069060 |
| hurts | pain | 0.0067886 |
| polyester | plastic | 0.0067227 |
| tires | car | 0.0066291 |
| calorie | fat | 0.0065609 |
| lollipop | sweet | 0.0065609 |
| not smart | stupid | 0.0065203 |
| hold on | wait | 0.0065012 |
| presents | birthday | 0.0064531 |
| diapers | baby | 0.0063988 |
| nirvana | band | 0.0063562 |
| breast feeding | mother | 0.0063209 |
| Freud | psychoanalysis | 0.0062768 |
| lily | flower | 0.0061485 |
| swimmer | water | 0.0061346 |
| gladiator | Rome | 0.0061207 |
| junk food | fat | 0.0061207 |
| utensil | tool | 0.0061207 |
| chewing | gum | 0.0061052 |
| vagina | sex | 0.0060618 |
| raccoon | animal | 0.0060035 |
| refugee | war | 0.0060035 |
| corn-beef | meat | 0.0059667 |
| Florida | miami | 0.0059667 |
| sulfur | chemistry | 0.0059667 |
| brainy | intelligent | 0.0059348 |
| vagina | woman | 0.0059345 |
| head & shoulders | shampoo | 0.0059034 |
| vacancy | job | 0.0058916 |
| flirt | love | 0.0058819 |
| cooked | food | 0.0058719 |
| jaguar | car | 0.0058404 |
| cents | euro | 0.0058324 |
| salami | food | 0.0058324 |
| drums | music | 0.0058166 |
| fountain | water | 0.0058068 |
| lunar | moon | 0.0058068 |
| dealer | drugs | 0.0057826 |
| leftover | food | 0.0057768 |
| white powder | cocaine | 0.0057230 |
| junk food | unhealthy | 0.0057151 |
| ships | water | 0.0057151 |
| asteroid | space | 0.0056679 |
| eyelid | eye | 0.0056636 |
| lubricant | sex | 0.0056636 |
| heartbeat | love | 0.0056432 |
| peroxide | chemistry | 0.0056432 |
| plates | dish | 0.0056432 |
| vaseline | sex | 0.0056432 |
| viola | flower | 0.0056432 |
| embarrassed | shame | 0.0056329 |
| atheism | god | 0.0056241 |
| bunker | war | 0.0056241 |
| ivory | elephant | 0.0055633 |
| cornflakes | breakfast | 0.0055631 |
| bedsheets | bed | 0.0055515 |
| egocentric | me | 0.0055504 |
| scuba | sea | 0.0055504 |
| tangerine | fruit | 0.0055310 |
| Buddha | religion | 0.0055225 |
| gifts | birthday | 0.0055225 |
| isle | island | 0.0054981 |
| newsstand | news | 0.0054709 |
| topping | cake | 0.0054709 |
| catapult | war | 0.0054586 |
| drinks | friends | 0.0054586 |
| intestine | food | 0.0054586 |
| November | rain | 0.0054586 |
| clementine | orange | 0.0054580 |
| tutu | ballet | 0.0054421 |
| Alpine | mountain | 0.0054002 |
| heroic | hero | 0.0053911 |
| mayonnaise | egg | 0.0053911 |
| yours | mine | 0.0053871 |
| honk | car | 0.0053823 |
| campfire | tent | 0.0053680 |
| Disney | cartoon | 0.0053680 |
| CD | music | 0.0053538 |
| cornflakes | milk | 0.0053355 |
| dark brown | chocolate | 0.0053355 |
| curved | road | 0.0053120 |
| festival | music | 0.0053120 |
| caress | love | 0.0053082 |
| demolition | house | 0.0053068 |
| mixer | kitchen | 0.0053068 |
| nerves | body | 0.0053068 |
| grunge | seattle | 0.0052784 |
| proven | evidence | 0.0052784 |
| sheets | white | 0.0052784 |
| snowball | fun | 0.0052784 |
| tangerine | mandarin | 0.0052784 |
| anatomy | body | 0.0052759 |
| boiling | hot | 0.0052759 |
| thigh | leg | 0.0052759 |
| musical instrument | guitar | 0.0052712 |
| boiled | water | 0.0052636 |
| chanting | singing | 0.0052379 |
Now let’s see if we can predict the t-score based on characteristics of the cue.
Join t.scores and cues characteristics
cues.chars = read.csv ("data/cues_chars.csv")
t.scores.full = t.scores %>%
left_join(cues.chars) %>%
select(-english, -other)
Write df
mean.t.scores <- t.scores %>%
select(cue, t) %>%
distinct() %>%
group_by(cue) %>%
summarize(t = mean(t))
#write.csv(mean.t.scores, "data/t.scors.by.cue.filtered.csv")
Does frequency of cue predict mean t for a cue?
freq.ts = t.scores.full %>%
select(cue, associate, t, Lg10WF) %>%
distinct() %>%
group_by(cue) %>%
summarize(t = mean(t), # mean t for each cue
Lg10WF = Lg10WF[1])
ggplot(freq.ts, aes(x = Lg10WF, y = t)) +
geom_smooth(method = "lm") +
theme_bw()
freq.ts %>%
do(tidy(cor.test(.$t, .$Lg10WF))) %>%
select(estimate, statistic, p.value) %>%
kable()
| estimate | statistic | p.value |
|---|---|---|
| -0.1030076 | -9.877217 | 0 |
Higher frequency cues have less divergence.
Does sentiment of cue predict mean t for a cue?
quantsent.ts = t.scores.full %>%
select(cue, associate, t, quant.sent) %>%
distinct() %>%
group_by(cue) %>%
summarize(t = mean(t), # mean t for each cue
quant.sent = quant.sent[1])
ggplot(quantsent.ts, aes(x = quant.sent, y = t)) +
geom_smooth(method = "lm") +
theme_bw()
quantsent.ts %>%
do(tidy(cor.test(.$t, .$quant.sent))) %>%
select(estimate, statistic, p.value) %>%
kable()
| estimate | statistic | p.value |
|---|---|---|
| -0.0892866 | -2.919994 | 0.0035744 |
No relation between sentiment of cue and t.
Does sentiment of cue predict mean t for a cue?
qual.sent.ts = t.scores.full %>%
select(cue, associate, t, qual.sent) %>%
distinct() %>%
filter(!is.na(qual.sent)) %>%
group_by(cue,qual.sent) %>%
summarize(t = mean(t)) %>% # mean t for each cue %>%
group_by(qual.sent) %>%
multi_boot_standard(column = "t")
ggplot(qual.sent.ts, aes(fill = qual.sent, y = mean, x = reorder(qual.sent, mean))) +
xlab("sentiment") +
ylab("t") +
geom_bar(stat = "identity", position = "dodge") +
geom_linerange(aes(ymax = ci_upper, ymin=ci_lower),
position = position_dodge(width = .9)) +
theme_bw() +
theme(legend.position="none",
axis.text.x = element_text(angle = 90, hjust = 1))
Not much going on here, small effect of negative – more difference for negative words.
Does pos of cue predict mean t for a cue?
pos.ts = t.scores.full %>%
select(cue, associate, t, pos) %>%
distinct() %>%
filter(!is.na(pos)) %>%
group_by(cue,pos) %>%
summarize(t = mean(t)) %>% # mean t for each cue
group_by(pos) %>%
multi_boot_standard(column = "t")
ggplot(pos.ts, aes(fill = pos, y = mean, x = reorder(pos, mean))) +
xlab("pos") +
ylab("t") +
geom_bar(stat = "identity", position = "dodge") +
geom_linerange(aes(ymax = ci_upper, ymin=ci_lower),
position = position_dodge(width = .9)) +
theme_bw() +
theme(legend.position="none",
axis.text.x = element_text(angle = 90, hjust = 1))
Less variability for function words.
Does concreteness of cue predict mean t for a cue?
conc.ts = t.scores.full %>%
select(cue, associate, t, Conc.M) %>%
distinct() %>%
group_by(cue) %>%
summarize(t = mean(t), # mean t for each cue
conc = Conc.M[1])
ggplot(conc.ts, aes(x = conc, y = t)) +
geom_smooth(method = "lm") +
theme_bw()
conc.ts %>%
do(tidy(cor.test(.$t, .$conc))) %>%
select(estimate, statistic, p.value) %>%
kable()
| estimate | statistic | p.value |
|---|---|---|
| -0.0226869 | -2.152344 | 0.0313968 |
Small effect of concreteness in the predicted direction – Less difference for cues that are more concrete.