SEMSOC_x_filtered.Rmd showed that negative words tend to have more divergence. Here we test whether that finding holds up when controlling for frequency, and extend it to other sentiment norms.
read in data
library(tidyverse)  # dplyr, tidyr, ggplot2
library(knitr)      # kable
library(broom)      # tidy
library(corrr)      # correlate, shave, fashion

d.raw = read.csv("../../data/associations_ppdetails_en_05_01_2015.csv")
lang.codes = read.csv("../../data/language_codes.csv") %>%
  select(ISO639.2BCode, LanguageName)

d.long = d.raw %>%
  gather("association", "word", 7:9) %>%
  mutate(word = gsub("\\bx\\b", "NA", word)) %>% # recode missing responses ("x") as "NA"
  spread("association", "word") %>%
  rename(a1 = asso1Clean,
         a2 = asso2Clean,
         a3 = asso3Clean)
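As a quick sanity check on the word-boundary regex (hypothetical inputs, not from the data): `\\bx\\b` matches only a standalone "x", not an "x" inside a word.

gsub("\\bx\\b", "NA", c("x", "box", "x ray"))
## [1] "NA"     "box"    "NA ray"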
d.clean = d.long %>%
  left_join(lang.codes, by = c("nativeLanguage" = "ISO639.2BCode")) %>%
  # drop the "eng" code plus empty, invalid, or ambiguous language codes
  filter(nativeLanguage != "eng" & nativeLanguage != "" & nativeLanguage != "99" &
         nativeLanguage != "fla" & nativeLanguage != "can" & nativeLanguage != "nan" &
         nativeLanguage != "pun" & nativeLanguage != "nl") %>%
  # all-uppercase codes are country codes, used here for native English speakers
  mutate(LanguageName = ifelse(grepl("^[[:upper:]]+$", nativeLanguage), "English",
                               as.character(LanguageName)),
         LanguageName = as.factor(LanguageName),
         country = ifelse(grepl("^[[:upper:]]+$", nativeLanguage), nativeLanguage, NA),
         country = as.factor(country),
         native.lang = ifelse(LanguageName == "English", "english", "other"),
         native.lang = as.factor(native.lang)) %>%
  select(-nativeLanguage)
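A quick sanity check on the cleaned data (a sketch; output not shown): list the language labels that survive the filtering.

d.clean %>%
  distinct(LanguageName, native.lang) %>%
  arrange(LanguageName)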
# Get the non-native languages with more than N_CUTOFF participants each
N_CUTOFF <- 400

lang.dems = d.clean %>%
  group_by(userID, LanguageName, native.lang) %>%
  slice(1) %>%  # one row per participant
  group_by(LanguageName, native.lang) %>%
  summarise(n = n()) %>%
  filter(native.lang == "other") %>%
  arrange(-n) %>%
  filter(n > N_CUTOFF) %>%
  select(-native.lang) %>%
  ungroup()
collapse across associates
d.all.ca = d.clean %>%
  filter(LanguageName %in% lang.dems$LanguageName | LanguageName == "English") %>%
  ungroup() %>%
  gather("associate.type", "associate", 6:8) %>%
  filter(cue != "NA" & associate != "NA") %>%  # drop missing cues and associates
  mutate(bigram = paste(cue, associate))
d.all.ca %>%
  select(userID, LanguageName, native.lang) %>%
  distinct() %>%
  group_by(LanguageName, native.lang) %>%
  summarize(n = n()) %>%
  kable()
| LanguageName | native.lang | n |
|---|---|---|
| Dutch | other | 687 |
| English | english | 62412 |
| Finnish | other | 459 |
| French | other | 601 |
| German | other | 1176 |
| Italian | other | 411 |
| Spanish | other | 1071 |
The chi-squared test for comparing the two distributions follows Manning and Schütze (1999, p. 171).
For each cue, compare the distribution over associates for native and non-native speakers.
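For reference, the statistic is the standard Pearson chi-squared for a $2 \times k$ contingency table (rows: native vs. non-native counts; columns: the $k$ associates produced by both groups):

$$X^2 = \sum_{i=1}^{2}\sum_{j=1}^{k}\frac{(O_{ij}-E_{ij})^2}{E_{ij}}, \qquad E_{ij} = \frac{R_i\,C_j}{N},$$

where $O_{ij}$ is the observed count, $R_i$ and $C_j$ are the row and column totals, and $N$ is the grand total.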
x2.scores <- d.all.ca %>%
  group_by(native.lang, cue, associate) %>%
  summarize(n = n()) %>%
  spread(native.lang, n) %>%
  filter(!is.na(english) & !is.na(other)) %>%  # keep associates produced by both groups
  group_by(cue) %>%
  do(x2 = chisq.test(rbind(.$english, .$other))$statistic,
     p = chisq.test(rbind(.$english, .$other))$p.value) %>%
  mutate(x2 = unlist(x2),
         p = unlist(p))
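To make the computation concrete, here is a toy example with hypothetical counts (three associates shared by both groups; the numbers are made up for illustration):

english.counts <- c(50, 30, 20)  # hypothetical native-speaker counts per associate
other.counts   <- c(10, 25, 5)   # hypothetical non-native counts per associate
chisq.test(rbind(english.counts, other.counts))$statistic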
x2 scores
ggplot(x2.scores, aes(x = x2)) +
  geom_histogram() +
  theme_bw() +
  ggtitle("distribution of x2-scores")
The distribution is right-skewed, but log-transforming doesn’t really help.
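For reference, a sketch of the log-transformed histogram referred to above:

ggplot(x2.scores, aes(x = log(x2))) +
  geom_histogram() +
  theme_bw() +
  ggtitle("distribution of log x2-scores")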
Top x2 scores.
x2.scores %>%
  arrange(-x2) %>%
  select(cue) %>%
  slice(1:100) %>%
  as.list(.)
## $cue
## [1] cob comprehend spud polar bumble
## [6] locate utensil cuisine up pondering
## [11] annual communicate shoot confound fore
## [16] there drop disgrace hyphen frugal
## [21] lavatory embarrass experiment flavor receive
## [26] jaguar dungarees spare envious celebration
## [31] locale reindeer digit coleslaw warrant
## [36] drowsy disciple cents chlorine testament
## [41] cabin searching walk swipe sinus
## [46] latex jogging collision trip flamingo
## [51] mixer tango broom tablespoon Superbowl
## [56] myself pesto pass out shrub pancakes
## [61] difficulty street tricks pulse macaroni
## [66] terrible viola plea sonnet feet
## [71] components quick soy procession mourn
## [76] kid possess Sahara curl punctuation
## [81] prom stealing hoover plates salami
## [86] for sure hello mayonnaise brimstone stream
## [91] snail allow tidy puff archive
## [96] hydrate sphere politeness bashful buttercup
## 10050 Levels: a a few a little a lot aardvark abacus abandon ... zucchini
Many of these seem culture-specific.
Now let’s see if we can predict the x2-score based on characteristics of the cue.
Join x2.scores with the cue characteristics.
cues.chars = read.csv("../second_orderQs/data/cues_chars.csv") %>%
  select(-1)  # drop the row-number column

x2.scores.full = x2.scores %>%
  left_join(cues.chars)
Does the frequency of a cue predict its x2-score?
freq.x2s = x2.scores.full %>%
  select(cue, x2, Lg10WF) %>%  # Lg10WF: log10 word frequency
  distinct() %>%
  filter(!is.na(Lg10WF))

ggplot(freq.x2s, aes(x = Lg10WF, y = x2)) +
  geom_smooth(method = "lm") +
  theme_bw()
freq.x2s %>%
  ungroup() %>%
  do(tidy(cor.test(.$x2, .$Lg10WF))) %>%
  select(estimate, statistic, p.value) %>%
  kable()
| estimate | statistic | p.value |
|---|---|---|
| -0.066198 | -6.327729 | 0 |
Less divergence for high-frequency items, but frequency is clearly not the whole effect.
Sentiment norms: tidytext::get_sentiments("afinn") (Finn Årup Nielsen’s AFINN ratings).
Does the sentiment of a cue predict its x2-score? Note that there are only 1063 words with sentiment ratings here.
quantsent.x2s = x2.scores.full %>%
  select(cue, x2, quant.sent, Lg10WF) %>%
  distinct()

ggplot(quantsent.x2s, aes(x = quant.sent, y = x2)) +
  geom_smooth(method = "lm") +
  theme_bw()
quantsent.x2s %>%
  ungroup() %>%
  do(tidy(cor.test(.$x2, .$quant.sent))) %>%
  select(estimate, statistic, p.value) %>%
  kable()
| estimate | statistic | p.value |
|---|---|---|
| -0.0722092 | -2.358227 | 0.0185431 |
More divergence for negative cues.
full.freq.sent = quantsent.x2s %>%
  filter(!is.na(quant.sent) & !is.na(Lg10WF))
1036 words have both frequency and sentiment data. But, controlling for frequency, there’s still a reliable effect of sentiment: more negative words show more divergence.
kable(tidy(lm(x2 ~ quant.sent + Lg10WF, data = full.freq.sent)))
| term | estimate | std.error | statistic | p.value |
|---|---|---|---|---|
| (Intercept) | 8.0711314 | 0.7683908 | 10.5039410 | 0.0000000 |
| quant.sent | -0.2172664 | 0.0897941 | -2.4196066 | 0.0157091 |
| Lg10WF | -0.0280716 | 0.2696704 | -0.1040961 | 0.9171133 |
Plot the residuals of x2 (after regressing out frequency) against sentiment:
x2.freq.resids = residuals(lm(x2 ~ Lg10WF, data = full.freq.sent))
resid.df = data.frame(resids = x2.freq.resids, sent = full.freq.sent$quant.sent)

ggplot(resid.df, aes(x = sent, y = resids)) +
  # geom_point() +
  geom_smooth(method = "lm") +
  theme_bw()
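The same check as a correlation, a sketch using the residualized scores from above:

resid.df %>%
  do(tidy(cor.test(.$resids, .$sent))) %>%
  select(estimate, statistic, p.value) %>%
  kable()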
Next, the Warriner et al. (2013) valence, arousal, and dominance norms for 13,915 lemmas (https://link.springer.com/article/10.3758%2Fs13428-012-0314-x). These provide a much larger sample for looking at the relationship between valence and divergence: 7290 words have both frequency and emotion norms.
warriner.sent = read.csv("data/BRM-emot-submit.csv") %>%
  select(Word, V.Mean.Sum, A.Mean.Sum, D.Mean.Sum)

x2.scores.full = left_join(x2.scores.full, warriner.sent, by = c("cue" = "Word"))

quantsent.x2s = x2.scores.full %>%
  select(cue, x2, quant.sent, Lg10WF, V.Mean.Sum, A.Mean.Sum, D.Mean.Sum) %>%
  distinct()
Look at the correlations among the norms:
correlate(quantsent.x2s %>% select(-1)) %>%  # drop the cue column
  shave() %>%
  fashion() %>%
  kable()
| rowname | x2 | quant.sent | Lg10WF | V.Mean.Sum | A.Mean.Sum | D.Mean.Sum |
|---|---|---|---|---|---|---|
| x2 | | | | | | |
| quant.sent | -.07 | | | | | |
| Lg10WF | -.07 | .07 | | | | |
| V.Mean.Sum | -.01 | .89 | .15 | | | |
| A.Mean.Sum | .01 | -.19 | .04 | -.17 | | |
| D.Mean.Sum | -.03 | .81 | .13 | .71 | -.17 | |
The relationship between valence and divergence seen previously does not appear here: the correlation between x2 and V.Mean.Sum is near zero.
But, in exploring the relationship between these variables and frequency, there’s an interaction between valence and frequency: we only see the valence effect for high-frequency words.
valence.interaction.df = quantsent.x2s %>%
  filter(!is.na(V.Mean.Sum) & !is.na(Lg10WF)) %>%
  mutate(freq.bin = ifelse(Lg10WF > median(.$Lg10WF), "high", "low"),  # median split on frequency
         freq.bin = as.factor(freq.bin))
kable(tidy(lm(x2 ~ V.Mean.Sum * Lg10WF, data = quantsent.x2s)))
| term | estimate | std.error | statistic | p.value |
|---|---|---|---|---|
| (Intercept) | 8.9541247 | 1.1700303 | 7.6528999 | 0.0000000 |
| V.Mean.Sum | 0.3812893 | 0.2141945 | 1.7801074 | 0.0751001 |
| Lg10WF | -0.0174352 | 0.4582840 | -0.0380444 | 0.9696533 |
| V.Mean.Sum:Lg10WF | -0.1519557 | 0.0820631 | -1.8516929 | 0.0641104 |
ggplot(valence.interaction.df, aes(x = V.Mean.Sum, y = x2, group = freq.bin, color = freq.bin)) +
  geom_smooth(method = "lm") +
  theme_bw()
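One way to unpack the interaction (a sketch, not in the original analysis) is to fit the valence model separately within each frequency bin:

valence.interaction.df %>%
  group_by(freq.bin) %>%
  do(tidy(lm(x2 ~ V.Mean.Sum, data = .))) %>%
  kable()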
And there’s a reliable interaction between arousal and frequency: high-arousal words diverge more, but only for high-frequency words.
arousal.interaction.df = quantsent.x2s %>%
  filter(!is.na(A.Mean.Sum) & !is.na(Lg10WF)) %>%
  mutate(freq.bin = ifelse(Lg10WF > median(.$Lg10WF), "high", "low"),
         freq.bin = as.factor(freq.bin))
kable(tidy(lm(x2 ~ A.Mean.Sum * Lg10WF, data = quantsent.x2s)))
| term | estimate | std.error | statistic | p.value |
|---|---|---|---|---|
| (Intercept) | 13.6985480 | 1.2996694 | 10.540025 | 0.0000000 |
| A.Mean.Sum | -0.6468044 | 0.3053800 | -2.118031 | 0.0342062 |
| Lg10WF | -2.1090484 | 0.5023101 | -4.198698 | 0.0000272 |
| A.Mean.Sum:Lg10WF | 0.3026243 | 0.1175198 | 2.575093 | 0.0100409 |
ggplot(arousal.interaction.df, aes(x = A.Mean.Sum, y = x2, group = freq.bin, color = freq.bin)) +
  geom_smooth(method = "lm") +
  theme_bw()
Finally, dominance: controlling for frequency, there is a marginal trend for low-dominance words to show more divergence.
dominance.interaction.df = quantsent.x2s %>%
  filter(!is.na(D.Mean.Sum) & !is.na(Lg10WF)) %>%
  mutate(freq.bin = ifelse(Lg10WF > median(.$Lg10WF), "high", "low"),
         freq.bin = as.factor(freq.bin))
kable(tidy(lm(x2 ~ D.Mean.Sum + Lg10WF, data = quantsent.x2s)))
| term | estimate | std.error | statistic | p.value |
|---|---|---|---|---|
| (Intercept) | 11.6450557 | 0.4759275 | 24.468131 | 0.0000000 |
| D.Mean.Sum | -0.1326833 | 0.0790447 | -1.678585 | 0.0932758 |
| Lg10WF | -0.8166904 | 0.1108184 | -7.369631 | 0.0000000 |
ggplot(dominance.interaction.df, aes(x = D.Mean.Sum, y = x2, group = freq.bin, color = freq.bin)) +
  geom_smooth(method = "lm") +
  theme_bw()
In sum, for high-frequency words there is more divergence for cues that are low valence, high arousal, and (as a trend) low dominance. This holds controlling for frequency.
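As a final check, a sketch of a single model with all three Warriner dimensions and their frequency interactions (results not shown here):

kable(tidy(lm(x2 ~ (V.Mean.Sum + A.Mean.Sum + D.Mean.Sum) * Lg10WF,
              data = quantsent.x2s)))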