SEMSOC_x_filtered.Rmd showed that negative words tend to have more divergence. Here we test whether that result holds up when controlling for frequency and when using other sentiment norms.

read in data

d.raw = read.csv("../../data/associations_ppdetails_en_05_01_2015.csv")

lang.codes = read.csv("../../data/language_codes.csv") %>%
  select(ISO639.2BCode, LanguageName)

d.long = d.raw %>%
  gather("association", "word", 7:9) %>%
  mutate(word = gsub("\\bx\\b", "NA", word)) %>% # recode missing responses (entered as "x") as "NA"
  spread("association", "word") %>%
  rename(a1 = asso1Clean,
         a2 = asso2Clean,
         a3 = asso3Clean) 

d.clean = d.long %>%
  left_join(lang.codes, by = c("nativeLanguage" = "ISO639.2BCode")) %>%
  filter(nativeLanguage != "eng" & nativeLanguage != "" & nativeLanguage != "99" &
          nativeLanguage != "fla" & nativeLanguage != "can"  & nativeLanguage != "nan"  &
           nativeLanguage != "pun" & nativeLanguage != "nl") %>%
  mutate(LanguageName = ifelse(grepl("^[[:upper:]]+$", nativeLanguage), "English",
                               as.character(LanguageName)),
         LanguageName = as.factor(LanguageName),
         country = ifelse(grepl("^[[:upper:]]+$", nativeLanguage), nativeLanguage, NA),
         country = as.factor(country),
         native.lang = ifelse(LanguageName == "English", "english", "other"),
         native.lang = as.factor(native.lang)) %>%
  select(-nativeLanguage)

# Get non-native languages and participant counts, keeping languages with more than N_CUTOFF participants
N_CUTOFF <- 400

lang.dems = d.clean %>%
    group_by(userID, LanguageName, native.lang) %>%
    slice(1) %>%
    group_by(LanguageName, native.lang) %>%
    summarise(n = n()) %>%
    filter(native.lang == "other") %>%
    arrange(-n) %>%
    filter(n > N_CUTOFF) %>%
    select(-native.lang) %>%
    ungroup()

collapse across associates

d.all.ca = d.clean %>%
  filter(LanguageName %in% lang.dems$LanguageName | LanguageName == "English") %>%
  ungroup() %>%
  gather("associate.type", "associate", 6:8) %>%
  filter(cue != "NA" & associate != "NA" ) %>%
  mutate(bigram = paste(cue, associate))

Participant counts

d.all.ca %>%
  select(userID, LanguageName, native.lang) %>%
  distinct() %>%
  group_by(LanguageName,native.lang ) %>%
  summarize(n=n()) %>%
  kable()
LanguageName   native.lang       n
Dutch          other           687
English        english       62412
Finnish        other           459
French         other           601
German         other          1176
Italian        other           411
Spanish        other          1071

x2

from: Manning and Schütze (1999, p. 171)

For each cue, compare the distribution over associates for native and non-native speakers.
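
What chisq.test computes here is the standard Pearson statistic: for each cue, the counts form a 2 x K contingency table (native vs. non-native speakers by the K associates produced by both groups), and

$$\chi^2 = \sum_{i}\sum_{j}\frac{(O_{ij} - E_{ij})^2}{E_{ij}}, \qquad E_{ij} = \frac{(\text{row total}_i)(\text{column total}_j)}{N}$$

Larger values indicate that the two groups' distributions over associates differ more than expected if both groups shared a single underlying distribution.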

x2.scores <- d.all.ca %>%
  group_by(native.lang,cue,associate) %>%
  summarize(n = n()) %>%
  spread(native.lang, n) %>%
  filter(!is.na(english) & !is.na(other)) %>%
  group_by(cue) %>%
  do(x2 = chisq.test(rbind(.$english,.$other))$statistic,
     p = chisq.test(rbind(.$english,.$other))$p.value) %>%
  mutate(x2 = unlist(x2),
         p = unlist(p))

x2 scores

ggplot(x2.scores, aes(x = x2)) +
  geom_histogram() +
  theme_bw() +
  ggtitle("distribution of x2-scores")

Skewed, but log transforming doesn’t really help.
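
As a quick check of that claim (a sketch; this plot is not part of the original output), the same histogram on a log scale:

# sketch: the log-transformed x2 scores are still right-skewed
ggplot(x2.scores, aes(x = log(x2 + 1))) +
  geom_histogram() +
  theme_bw() +
  ggtitle("distribution of log x2-scores")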

Top x2 scores.

x2.scores %>% 
  arrange(-x2) %>%
  select(cue) %>%
  slice (1:100) %>%
  as.list(.) 
## $cue
##   [1] cob         comprehend  spud        polar       bumble     
##   [6] locate      utensil     cuisine     up          pondering  
##  [11] annual      communicate shoot       confound    fore       
##  [16] there       drop        disgrace    hyphen      frugal     
##  [21] lavatory    embarrass   experiment  flavor      receive    
##  [26] jaguar      dungarees   spare       envious     celebration
##  [31] locale      reindeer    digit       coleslaw    warrant    
##  [36] drowsy      disciple    cents       chlorine    testament  
##  [41] cabin       searching   walk        swipe       sinus      
##  [46] latex       jogging     collision   trip        flamingo   
##  [51] mixer       tango       broom       tablespoon  Superbowl  
##  [56] myself      pesto       pass out    shrub       pancakes   
##  [61] difficulty  street      tricks      pulse       macaroni   
##  [66] terrible    viola       plea        sonnet      feet       
##  [71] components  quick       soy         procession  mourn      
##  [76] kid         possess     Sahara      curl        punctuation
##  [81] prom        stealing    hoover      plates      salami     
##  [86] for sure    hello       mayonnaise  brimstone   stream     
##  [91] snail       allow       tidy        puff        archive    
##  [96] hydrate     sphere      politeness  bashful     buttercup  
## 10050 Levels: a a few a little a lot aardvark abacus abandon ... zucchini

Many of these seem culture-y.

Predicting divergence

Now let’s see if we can predict the x2-score based on characteristics of the cue.

Join x2.scores and cue characteristics

cues.chars = read.csv("../second_orderQs/data/cues_chars.csv") %>%
  select(-1)

x2.scores.full = x2.scores %>%
  left_join(cues.chars) 

Frequency

Does frequency of cue predict x2-score for a cue?

freq.x2s =  x2.scores.full %>%
  select(cue, x2, Lg10WF) %>%
  distinct() %>%
  filter(!is.na(Lg10WF))

ggplot(freq.x2s, aes(x = Lg10WF, y = x2)) +
  geom_smooth(method  = "lm") +
  theme_bw()

freq.x2s %>%
  ungroup() %>%
  do(tidy(cor.test(.$x2, .$Lg10WF))) %>%
  select(estimate, statistic, p.value) %>%
  kable()
 estimate   statistic   p.value
-0.066198   -6.327729         0

Less divergence for high-frequency items, but the correlation is small, so frequency is clearly not the whole effect.
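
To put a rough number on that (a sketch, not in the original output): a correlation of -.066 corresponds to frequency explaining well under 1% of the variance in x2.

# sketch: proportion of variance in x2 shared with frequency (about 0.004)
with(freq.x2s, cor(x2, Lg10WF)^2)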

Sentiment from tidytext package

tidytext::get_sentiments("afinn") (Finn Årup Nielsen emotion ratings)

Does sentiment of the cue predict the x2 score for that cue? Note that there are only 1063 words here.

quantsent.x2s =  x2.scores.full %>%
  select(cue, x2, quant.sent,Lg10WF) %>%
  distinct()

ggplot(quantsent.x2s, aes(x = quant.sent, y = x2)) +
  geom_smooth(method  = "lm") +
  theme_bw()

quantsent.x2s %>%
  ungroup() %>%
  do(tidy(cor.test(.$x2, .$quant.sent))) %>%
  select(estimate, statistic, p.value) %>%
  kable()
  estimate   statistic     p.value
-0.0722092   -2.358227   0.0185431

More divergence for negative cues.

Controlling for frequency

full.freq.sent =  quantsent.x2s %>%
    filter(!is.na(quant.sent) & !is.na(Lg10WF))

There are 1036 words with both frequency and sentiment data. Controlling for frequency, there is still a reliable effect of sentiment: more negative words show more divergence.

kable(tidy(lm(x2 ~ quant.sent + Lg10WF, data = full.freq.sent)))
term            estimate   std.error    statistic     p.value
(Intercept)    8.0711314   0.7683908   10.5039410   0.0000000
quant.sent    -0.2172664   0.0897941   -2.4196066   0.0157091
Lg10WF        -0.0280716   0.2696704   -0.1040961   0.9171133

Plot residuals:

x2.freq.resids = residuals(lm(x2 ~ Lg10WF, data = full.freq.sent))

resid.df = data.frame(resids = x2.freq.resids, sent = full.freq.sent$quant.sent)

ggplot(resid.df, aes(x = sent, y = resids)) +
  #geom_point() +
  geom_smooth(method  = "lm") +
  theme_bw()

Sentiment from Warriner et al. (2013)

Valence, arousal, and dominance norms for 13,915 lemmas (https://link.springer.com/article/10.3758%2Fs13428-012-0314-x). This provides a much larger sample for looking at the relationship between valence and divergence: 7290 cues have both frequency and emotion norms.

warriner.sent = read.csv("data/BRM-emot-submit.csv") %>%
  select(Word, V.Mean.Sum, A.Mean.Sum, D.Mean.Sum)

x2.scores.full = left_join(x2.scores.full, warriner.sent, by=c("cue"= "Word"))

quantsent.x2s =  x2.scores.full %>%
  select(cue, x2, quant.sent,Lg10WF,V.Mean.Sum,A.Mean.Sum, D.Mean.Sum) %>%
  distinct()

Look at the correlations between the norms

correlate(quantsent.x2s %>%  select(-1)) %>%
  shave() %>%
  fashion() %>%
  kable()
rowname        x2   quant.sent   Lg10WF   V.Mean.Sum   A.Mean.Sum   D.Mean.Sum
x2
quant.sent   -.07
Lg10WF       -.07       .07
V.Mean.Sum   -.01       .89        .15
A.Mean.Sum    .01      -.19        .04         -.17
D.Mean.Sum   -.03       .81        .13          .71         -.17

The negative relationship between valence and divergence seen previously does not show up in the raw correlations here (Warriner valence correlates with x2 at only -.01).

But, exploring the relationship between these variables and frequency, there is an interaction between valence and frequency: we only see the effect above for high-frequency words.

valence.interaction.df = quantsent.x2s %>%
     filter(!is.na(V.Mean.Sum) & !is.na(Lg10WF)) %>%
     mutate(freq.bin = ifelse(Lg10WF> median(.$Lg10WF), "high", "low"),
            freq.bin = as.factor(freq.bin))

kable(tidy(lm(x2 ~ V.Mean.Sum  * Lg10WF, data = quantsent.x2s)))
term                   estimate   std.error   statistic     p.value
(Intercept)           8.9541247   1.1700303   7.6528999   0.0000000
V.Mean.Sum            0.3812893   0.2141945   1.7801074   0.0751001
Lg10WF               -0.0174352   0.4582840  -0.0380444   0.9696533
V.Mean.Sum:Lg10WF    -0.1519557   0.0820631  -1.8516929   0.0641104

ggplot(valence.interaction.df, aes(x = V.Mean.Sum, y = x2, group = freq.bin, color = freq.bin)) +
  geom_smooth(method  = "lm") +
  theme_bw()

And a reliable interaction between arousal and frequency: high-arousal words diverge more, but only for high-frequency words.

arousal.interaction.df = quantsent.x2s %>%
     filter(!is.na(A.Mean.Sum) & !is.na(Lg10WF)) %>%
     mutate(freq.bin = ifelse(Lg10WF> median(.$Lg10WF), "high", "low"),
            freq.bin = as.factor(freq.bin))

kable(tidy(lm(x2 ~ A.Mean.Sum * Lg10WF, data = quantsent.x2s)))
term                   estimate   std.error   statistic     p.value
(Intercept)          13.6985480   1.2996694   10.540025   0.0000000
A.Mean.Sum           -0.6468044   0.3053800   -2.118031   0.0342062
Lg10WF               -2.1090484   0.5023101   -4.198698   0.0000272
A.Mean.Sum:Lg10WF     0.3026243   0.1175198    2.575093   0.0100409

ggplot(arousal.interaction.df, aes(x = A.Mean.Sum, y = x2, group = freq.bin, color = freq.bin)) +
  geom_smooth(method  = "lm") +
  theme_bw()

For dominance, controlling for frequency, there is only a marginal (trend-level) effect: lower-dominance words show somewhat more divergence.

dominance.interaction.df = quantsent.x2s %>%
     filter(!is.na(D.Mean.Sum) & !is.na(Lg10WF)) %>%
     mutate(freq.bin = ifelse(Lg10WF > median(.$Lg10WF), "high", "low"),
            freq.bin = as.factor(freq.bin))

kable(tidy(lm(x2 ~ D.Mean.Sum + Lg10WF, data = quantsent.x2s)))
term            estimate   std.error   statistic     p.value
(Intercept)   11.6450557   0.4759275   24.468131   0.0000000
D.Mean.Sum    -0.1326833   0.0790447   -1.678585   0.0932758
Lg10WF        -0.8166904   0.1108184   -7.369631   0.0000000

ggplot(dominance.interaction.df, aes(x = D.Mean.Sum, y = x2, group = freq.bin, color = freq.bin)) +
  geom_smooth(method  = "lm") +
  theme_bw()

In sum, for the x2 measure, high-frequency words show more divergence when the cue is low valence, high arousal, and (as a trend) low dominance, and these effects hold controlling for frequency.
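
As a possible follow-up (a sketch only; this model is not fit above), the three Warriner norms could be entered together with frequency to check whether each contributes independently, using the same quantsent.x2s data frame:

# sketch: all three Warriner norms, each interacting with frequency, in one model
combined.fit = lm(x2 ~ (V.Mean.Sum + A.Mean.Sum + D.Mean.Sum) * Lg10WF,
                  data = quantsent.x2s)
kable(tidy(combined.fit))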