read in data

d.raw = read.csv("../../data/associations_ppdetails_en_05_01_2015.csv")

lang.codes = read.csv("../../data/language_codes.csv") %>%
  select(ISO639.2BCode, LanguageName)

d.long = d.raw %>%
  gather("association", "word", 7:9) %>%
  mutate(word = gsub("\\bx\\b", "NA", word)) %>% # remove missing words
  spread("association", "word") %>%
  rename(a1 = asso1Clean,
         a2 = asso2Clean,
         a3 = asso3Clean) 

d.clean = d.long %>%
  left_join(lang.codes, by = c("nativeLanguage" = "ISO639.2BCode")) %>%
  filter(nativeLanguage != "eng" & nativeLanguage != "" & nativeLanguage != "99" &
          nativeLanguage != "fla" & nativeLanguage != "can"  & nativeLanguage != "nan"  &
           nativeLanguage != "pun" & nativeLanguage != "nl") %>%
  mutate(LanguageName = ifelse(grepl("^[[:upper:]]+$", nativeLanguage), "English",
                               as.character(LanguageName)),
         LanguageName = as.factor(LanguageName),
         country = ifelse(grepl("^[[:upper:]]+$", nativeLanguage), nativeLanguage, NA),
         country = as.factor(country),
         native.lang = ifelse(LanguageName == "English", "english", "other"),
         native.lang = as.factor(native.lang)) %>%
  select(-nativeLanguage)

collapse across associates

d.all.ca = d.clean %>%
  ungroup() %>%
  gather("associate.type", "associate", 6:8) %>%
  filter(cue != "NA" & associate != "NA" ) %>%
  mutate(bigram = paste(cue, associate))

Participant counts

d.all.ca %>%
  select(userID,  native.lang) %>%
  distinct() %>%
  group_by(native.lang ) %>%
  summarize(n=n()) %>%
  kable()
native.lang n
english 62412
other 9369

x2

from: Manning and Schuetze (1999; pg. 171)

For each cue compare the distribtion over associates for native and non-native speakers.

x2.scores <- d.all.ca %>%
  group_by(native.lang,cue,associate) %>%
  summarize(n = n()) %>%
  spread(native.lang, n) %>%
  filter(!is.na(english) & !is.na(other)) %>%
  group_by(cue) %>%
  do(x2 = chisq.test(rbind(.$english,.$other))$statistic,
     p = chisq.test(rbind(.$english,.$other))$p.value) %>%
  mutate(x2 = unlist(x2),
         p = unlist(p))

x2 scores

ggplot(x2.scores, aes(x = x2)) +
  geom_histogram() +
  theme_bw() +
  ggtitle("distribution of t-scores")

Top x2 scores.

x2.scores %>% 
  arrange(-x2) %>%
  select(cue) %>%
  slice (1:100) %>%
  as.list(.) 
## $cue
##   [1] spud         utensil      lavatory     reindeer     belch       
##   [6] tablespoon   bolts        jaguar       rural        humid       
##  [11] kid          guidance     hemisphere   celebration  drowsy      
##  [16] mourn        autumn       scratch      can          procession  
##  [21] warrant      trip         tea          pancakes     conclude    
##  [26] leopard      pavement     doe          college      honeydew    
##  [31] scared       over         projectile   yes          Superbowl   
##  [36] would        receptionist sunrise      dromedary    hunt        
##  [41] mama         saucer       leather      can't        terrible    
##  [46] calf         regulations  naughty      cents        mixer       
##  [51] searching    possess      single       concept      fever       
##  [56] plantation   handbag      beanie       stench       tango       
##  [61] Portland     washcloth    desire       crook        zoom        
##  [66] comma        rubber       excel        chlorine     household   
##  [71] zip          hangover     physics      Washington   woodpecker  
##  [76] cushion      frown        fearful      flexible     summit      
##  [81] unsolved     prom         swipe        brief        theft       
##  [86] final        envious      cafeteria    plates       park        
##  [91] legitimate   zucchini     viola        hamster      symphony    
##  [96] harvest      gingerbread  disciple     average      beg         
## 10050 Levels: a a few a little a lot aardvark abacus abandon ... zucchini

Many of these seem culture-y.

Predicting divergence

Now let’s see if we can predict the x2-score based on characteristics of the cue.

Join x2.scores and cues characteristics

cues.chars = read.csv ("data/cues_chars.csv")

x2.scores.full = x2.scores %>%
  left_join(cues.chars) 

Frequency

Does frequency of cue predict mean x2 for a cue?

freq.x2s =  x2.scores.full %>%
  select(cue, x2, Lg10WF) %>%
  distinct() %>%
  filter(!is.na(Lg10WF))

ggplot(freq.x2s, aes(x = Lg10WF, y = x2)) +
  geom_smooth(method  = "lm") +
  theme_bw()

freq.x2s %>%
  ungroup() %>%
  do(tidy(cor.test(.$x2, .$Lg10WF))) %>%
  select(estimate, statistic, p.value) %>%
  kable()
estimate statistic p.value
0.0243199 2.335397 0.019544

Higher frequency words have more divergence.

Sentiment - Quant

Does sentiment of cue predict mean t for a cue? Sentiments from: Finn Arup Nielse

quantsent.x2s =  x2.scores.full %>%
  select(cue, x2, quant.sent) %>%
  distinct()

ggplot(quantsent.x2s, aes(x = quant.sent, y = x2)) +
  geom_smooth(method  = "lm") +
  theme_bw()

quantsent.x2s %>%
  ungroup() %>%
  do(tidy(cor.test(.$x2, .$quant.sent))) %>%
  select(estimate, statistic, p.value) %>%
  kable()
estimate statistic p.value
-0.0185194 -0.6064539 0.5443418
No effect of sentiment

Sentiment - Qual

Sentiments from NRC Emotion Lexicon from Saif Mohammad and Peter Turney (n ~7000, but some have more than one category)

qual.sent.x2s =x2.scores.full %>%
  select(cue, x2, qual.sent) %>%
  distinct() %>%
  filter(!is.na(qual.sent)) %>%
  group_by(qual.sent) %>%
  multi_boot_standard(column = "x2")

ggplot(qual.sent.x2s, aes(fill = qual.sent, y = mean, x = reorder(qual.sent, mean))) +
    xlab("sentiment") +
    ylab("x2") +
    geom_bar(stat = "identity", position = "dodge") +
    geom_linerange(aes(ymax = ci_upper, ymin=ci_lower),
                 position = position_dodge(width = .9)) +
    theme_bw() +
    theme(legend.position="none",
          axis.text.x = element_text(angle = 90, hjust = 1))

Not much going on here.

Part of speech

Does pos of cue predict mean x2 for a cue?

pos.x2s =  x2.scores.full %>%
  select(cue, x2, pos) %>%
  distinct() %>%
  filter(!is.na(pos)) %>%
  group_by(pos) %>%
  multi_boot_standard(column = "x2")

ggplot(pos.x2s, aes(fill = pos, y = mean, x = reorder(pos, mean))) +
    xlab("pos") +
    ylab("x2") +
    geom_bar(stat = "identity", position = "dodge") +
    geom_linerange(aes(ymax = ci_upper, ymin=ci_lower),
                 position = position_dodge(width = .9)) +
    theme_bw() +
    theme(legend.position="none",
          axis.text.x = element_text(angle = 90, hjust = 1))

Note much going on with part of speech.

Concreteness

Does concreteness of cue predict x2 for a cue?

conc.x2s =  x2.scores.full %>%
  select(cue, x2, Conc.M) %>%
  distinct()

ggplot(conc.x2s, aes(x = Conc.M, y = x2)) +
  geom_point() +
  geom_smooth(method  = "lm") +
  theme_bw()

conc.x2s %>%
  ungroup() %>%
  do(tidy(cor.test(.$x2, .$Conc.M))) %>%
  select(estimate, statistic, p.value) %>%
  kable()
estimate statistic p.value
0.1113864 10.69506 0

More concreteness, more divergence.

What’s the relationship between ts and x2?

t.scores = read.csv("data/t.scors.by.cue.csv") %>%
  select(-1)

x2.scores = x2.scores %>%
  left_join(t.scores)

ggplot(x2.scores, aes(x = abs(t), y = x2)) +
  geom_point() +
  geom_smooth(method  = "lm") +
  theme_bw()

x2.scores %>%
  ungroup() %>%
  do(tidy(cor.test(.$x2, abs(.$t)))) %>%
  select(estimate, statistic, p.value) %>%
  kable()
estimate statistic p.value
-0.0232808 -2.333947 0.0196181

t and x2 are weakly negatively correlated, which is puzzling.