Are semantic networks related to demographic variables? Summary statistics of data from Small World of Words.

d = read.csv("../data/associations_ppdetails_en_05_01_2015.csv")

d.clean = d %>%
  filter(gender == "Ma"| gender ==  "Fe") %>%
  filter(education > 0) %>%
  filter(nativeLanguage != "") %>%
  mutate(gender = droplevels(gender),
         gender = plyr::revalue(gender,c("Fe" = "F", "Ma" = "M")),
         userID = as.factor(userID),
         nativeLanguage = as.factor(tolower(nativeLanguage)))

d.clean = d.clean %>%
  gather("association", "word", 7:9) %>%
  mutate(word = gsub("\\bx\\b", "NA", word)) %>% # remove missing words
  spread("association", "word") %>%
  rename(a1 = asso1Clean,
         a2 = asso2Clean,
         a3 = asso3Clean)

Frequency

Merge in frquency data. Frequency taken here from subtlex-us (Brysbaert & New, 2009).

subtlexus.url <- getURL("https://raw.githubusercontent.com/mllewis/RC/master/data/corpus/SUBTLEXus_corpus.txt")
freqs <- read.table(text = subtlexus.url, header=TRUE) %>%
          select(Word,Lg10WF)

All

freq.data = d.clean %>%
            left_join(freqs, by = c("cue" = "Word")) %>%
            rename(cue.freq.log = Lg10WF) %>%
            left_join(freqs, by = c("a1" = "Word")) %>%
            rename(a1.freq.log = Lg10WF) %>%
            left_join(freqs, by = c("a2" = "Word")) %>%
            rename(a2.freq.log = Lg10WF) %>%
            left_join(freqs, by = c("a3" = "Word")) %>%
            rename(a3.freq.log = Lg10WF)

freq.ms.type = freq.data %>%
               gather("word_type", "freq", 10:13) %>%
              mutate(word_type= as.factor(word_type)) %>%
               group_by(userID, word_type) %>%
               summarise(mean.freq = mean(freq))  %>%
               group_by(word_type) %>%
               multi_boot_standard(column = "mean.freq", na.rm = T)  

levels(freq.ms.type$word_type) = c("a1", "a2", "a3", "cue")
freq.ms.type$word_type = factor(freq.ms.type$word_type,levels(freq.ms.type$word_type)[c(4, 1:3)])

ggplot(freq.ms.type, aes(x = word_type, y = mean, group = 1)) +
  geom_line(color = "black") +
  geom_pointrange(aes(ymax = ci_upper, ymin = ci_lower)) +
  xlab("Word Type") +
  ylab("Mean log frequency") +
  theme_bw(base_size = 15) +
  theme(legend.position="none") 

Might be interesting to condition on cue here, by binning frequency.

Education

Mean frequency

freq.ms = freq.data %>%
               gather("association", "freq", 11:13) %>%
               group_by(userID) %>%
               summarise(mean.assoc.freq = mean(freq, na.rm = T))

freq.ed.ms = freq.ms %>%
              left_join(freq.data %>% group_by(userID) %>% slice(1) 
                         %>% ungroup() %>% select(education, userID)) %>%
              group_by(education) %>%
              multi_boot_standard(column = "mean.assoc.freq", na.rm = T) %>%
              filter(education > 1)

ggplot(freq.ed.ms, aes(y = mean, x = education)) + 
  geom_pointrange(aes(ymax = ci_upper, ymin = ci_lower)) +
  geom_line() +
  theme_bw(base_size = 15) +
  ylab("mean log frequency of associates") +
  scale_x_continuous(breaks = c(2,3,4,5), labels = c("elementary", "HS",
                                                     "bachelor", "master"))

Change in frequency across associates.

freq.ms.as = freq.data %>%
             group_by(userID) %>%
            summarise(mean.a1.freq = mean(a1.freq.log, na.rm = T),
                         mean.a2.freq = mean(a2.freq.log, na.rm = T),
                         mean.a3.freq = mean(a3.freq.log, na.rm = T))
          
freq.ms.as2 = freq.ms.as %>%
          left_join(freq.data %>% group_by(userID) %>% slice(1) 
                         %>% ungroup() %>% select(education, userID)) %>%
          gather(association, value, 2:4) %>%
          group_by(education, association) %>%
          multi_boot_standard(column = "value", na.rm = T) %>%
          filter(education > 1) %>%
          ungroup() %>%
          mutate(education = as.factor(education))

ggplot(freq.ms.as2, aes(y = mean, x = association, 
                        group = education, color = education)) + 
  geom_pointrange(aes(ymax = ci_upper, ymin = ci_lower)) +
  geom_line()+
  theme_bw(base_size = 15) +
  ylab("mean log frequency of associates") +
  scale_x_discrete(labels=c("1", "2", "3")) +
  scale_colour_discrete(labels = c("elementary", "HS", "bachelor", "master"))

Frequency decreases across associates, for all age groups.

Age

Mean frequency

freq.age.ms = freq.ms %>%
               left_join(freq.data %>% group_by(userID) %>% slice(1) 
                         %>% ungroup() %>% select(age, userID)) %>%
               #mutate(age.bin = cut_width(age, width = 2)) %>%
               group_by(age) %>%
               multi_boot_standard(column = "mean.assoc.freq", na.rm = T)

ggplot(freq.age.ms, aes(y = mean, x = age)) + 
  geom_pointrange(aes(ymax = ci_upper, ymin = ci_lower), size = .2) +
  #geom_line() +
  theme_bw(base_size = 15) +
  ylab("mean log frequency of associates")

p2 = freq.age.ms %>%
  filter(age > 15 & age < 80) %>%
  ggplot(aes(y = mean, x = age)) + 
    geom_smooth(method = "lm") +
  geom_pointrange(aes(ymax = ci_upper, ymin = ci_lower)) +
  theme_bw(base_size = 15) +
  xlab("Age (years)") +
  ylab("Mean log frequency of associates") +
  theme(legend.position = "bottom",
        legend.key = element_blank(),
        legend.background = element_rect(fill = "transparent"),
        legend.title = element_blank(),
        axis.line= element_line(size = 40),
        axis.text = element_text(colour = "black", size = 17),
        axis.title = element_text(colour = "black", size = 17),
        plot.background = element_blank(),
        panel.grid.major = element_blank(),
        panel.grid.minor = element_blank(),
        panel.border = element_blank(),
        axis.line.x = element_line(color="black", size = .8),
        axis.line.y = element_line(color="black", size = .8))  

jpeg("freq_SWOW.jpg", width = 5, height = 5, units = 'in', res = 300)
p2
dev.off()
## quartz_off_screen 
##                 2
freq.all.ms = freq.ms %>%
              left_join(freq.data %>% group_by(userID) %>% slice(1) 
                         %>% ungroup() %>% select(education, age, userID)) 

tidy(lm(mean.assoc.freq~age + education, filter(freq.all.ms, education > 0)))
##          term     estimate    std.error   statistic       p.value
## 1 (Intercept)  3.160430294 0.0111460942 283.5459885  0.000000e+00
## 2         age -0.004735110 0.0001364198 -34.7098570 5.867963e-256
## 3   education -0.001223986 0.0026023239  -0.4703436  6.381150e-01

Education does not predict frequency, controling for age.

Change in frequency across associates.

freq.ms.as2 = freq.ms.as %>%
          left_join(freq.data %>% group_by(userID) %>% slice(1) 
                         %>% ungroup() %>% select(age, userID)) %>%
          mutate(age.bin = cut_width(age, width = 25)) %>%
          gather(association, value, 2:4) %>%
          group_by(age.bin, association) %>% 
          multi_boot_standard(column = "value", na.rm = T)

ggplot(freq.ms.as2, aes(y = mean, x = association, 
                        group = age.bin, color = age.bin)) + 
  geom_pointrange(aes(ymax = ci_upper, ymin = ci_lower)) +
  geom_line()+
  facet_grid(.~age.bin) +
  theme_bw(base_size = 15) +
  ylab("mean log frequency of associates") +
  scale_x_discrete(labels=c("1", "2", "3")) 

Gender

Mean frequency

freq.gender.ms = freq.ms %>%
               left_join(freq.data %>% group_by(userID) %>% slice(1) 
                         %>% ungroup() %>% select(gender, userID)) %>%
               group_by(gender) %>%
               multi_boot_standard(column = "mean.assoc.freq", na.rm = T)

ggplot(freq.gender.ms, aes(y = mean, x = gender)) + 
  geom_pointrange(aes(ymax = ci_upper, ymin = ci_lower), size = .2) +
  theme_bw(base_size = 15) +
  ylab("mean log frequency of associates")

freq.all.ms = freq.ms %>%
              left_join(freq.data %>% group_by(userID) %>% slice(1) 
                         %>% ungroup() %>% select(gender, age, education, userID)) 

tidy(lm(mean.assoc.freq~age + gender + education, freq.all.ms))
##          term     estimate    std.error  statistic       p.value
## 1 (Intercept)  3.189741105 0.0113581872 280.831884  0.000000e+00
## 2         age -0.004844193 0.0001361785 -35.572383 2.532683e-268
## 3     genderM -0.049420917 0.0040406778 -12.230848  2.867446e-34
## 4   education -0.003309065 0.0025977441  -1.273822  2.027420e-01

Females have higher frequency associates, controling for age and education.

Change in frequency across associates.

freq.ms.as2 = freq.ms.as %>%
          left_join(freq.data %>% group_by(userID) %>% slice(1) 
                         %>% ungroup() %>% select(gender, userID)) %>%
          gather(association, value, 2:4) %>%
          group_by(gender, association) %>% 
          multi_boot_standard(column = "value", na.rm = T)

ggplot(freq.ms.as2, aes(y = mean, x = association, 
                        group = gender, color = gender)) + 
  geom_pointrange(aes(ymax = ci_upper, ymin = ci_lower)) +
  geom_line()+
  theme_bw(base_size = 15) +
  ylab("mean log frequency of associates") +
  scale_x_discrete(labels=c("1", "2", "3")) 

Language/country

Mean frequency

top_langs = d.clean %>%
        group_by(userID) %>%
        slice(1) %>%
        group_by(nativeLanguage) %>%
        summarise(n = n()) %>%
        top_n(10,n) %>%
        select(nativeLanguage) %>%
        mutate(bilingual = c(1,1,0,0,0,1,0,0,0,1)) %>%
        filter(nativeLanguage != "nan")

freq.ms = freq.data %>%
               gather("association", "freq", 11:13) %>%
               group_by(userID) %>%
               summarise(mean.assoc.freq = mean(freq, na.rm = T))

freq.lang.ms = freq.ms %>%
               left_join(freq.data %>% group_by(userID) %>% slice(1) 
                         %>% ungroup() %>% select(nativeLanguage, userID)) %>%
               filter(nativeLanguage %in% levels(droplevels(top_langs$nativeLanguage))) %>%
               group_by(nativeLanguage) %>%
               multi_boot_standard(column = "mean.assoc.freq", na.rm = T)

ggplot(freq.lang.ms, aes(y = mean, x = reorder(nativeLanguage,-mean))) + 
  geom_pointrange(aes(ymax = ci_upper, ymin = ci_lower), size = .2) +
  theme_bw(base_size = 15) +
  xlab("country (top 10)") +
  ylab("mean log frequency of associates")

Second language speakers have higher frequency associates than native speakers?

freq.all.ms = freq.ms %>%
              left_join(freq.data %>% group_by(userID) %>% slice(1) 
                         %>% ungroup() %>% select(gender, age, education, userID, nativeLanguage)) %>%
               left_join(top_langs, by = "nativeLanguage") %>%
                 filter(nativeLanguage %in% levels(droplevels(top_langs$nativeLanguage)))

tidy(lm(mean.assoc.freq~age + gender + education + nativeLanguage, freq.all.ms))
##                 term     estimate    std.error   statistic       p.value
## 1        (Intercept)  3.185845719 0.0144885033 219.8878420  0.000000e+00
## 2                age -0.004375775 0.0001442277 -30.3393631 1.331313e-196
## 3            genderM -0.053162149 0.0043546879 -12.2080274  4.003483e-34
## 4          education -0.012124774 0.0027919460  -4.3427679  1.415695e-05
## 5  nativeLanguagecan  0.015233118 0.0112818640   1.3502306  1.769612e-01
## 6  nativeLanguagedut  0.166985083 0.0171097170   9.7596637  1.938912e-22
## 7  nativeLanguagefin  0.096732669 0.0178511272   5.4188550  6.085504e-08
## 8  nativeLanguagefla  0.153942809 0.0173505213   8.8725178  7.899749e-19
## 9  nativeLanguagegbr -0.024331508 0.0096554283  -2.5199822  1.174579e-02
## 10 nativeLanguageger  0.122377923 0.0160703280   7.6151478  2.781581e-14
## 11 nativeLanguagespa  0.192901771 0.0138910799  13.8867369  1.369745e-43
## 12 nativeLanguageusa  0.006590866 0.0090269373   0.7301331  4.653196e-01
tidy(lm(mean.assoc.freq~age + gender + education + bilingual, freq.all.ms))
##          term     estimate   std.error  statistic       p.value
## 1 (Intercept)  3.343196970 0.013557141 246.600446  0.000000e+00
## 2         age -0.004489705 0.000143459 -31.296075 1.062193e-208
## 3     genderM -0.055746649 0.004346521 -12.825580  1.807896e-37
## 4   education -0.011980606 0.002796304  -4.284444  1.842732e-05
## 5   bilingual -0.153584905 0.006507269 -23.602052 4.387146e-121

Relationship between native language and frequency, controling for other stuff.

Part of speech

Read in POS

pos2 = d.clean %>%
          gather("category", "word", 6:9) %>%
          select(word) %>%
          filter(!grepl(" ", word)) %>%
          distinct(word)
          
# write.csv(pos2, "all_words.csv", row.names = F, col.names = F)
# run pos_tagger.py script

pos = read.csv("../data/pos_all_words.csv",
               col.names = c("word", "pos"), header = F)
sent = read.csv("../data/sent_all_words.csv", 
                col.names = c("word", "polarity", "subjectivity"), header = F)
lemma = read.csv("../data/lemma_all_words.csv",
                 col.names = c("word", "lemma"), header = F)

Make POS human readable (Penn POS)

pos.collapsed = pos %>%
  mutate(pos = as.character(pos)) %>%
  rowwise() %>%
  mutate(pos = if(is.na(pos)) as.character(pos)
                      else if(pos == "VB" | pos == "VBD" | pos == "VBG" | pos == "VBN" | pos == "VBP" | pos == "VBZ" ) "verb" 
                      else if(pos == "NN" | pos == "NNP" | pos == "NNS") "noun" 
                      else if(pos == "JJ" | pos == "JJR"| pos == "JJS") "adjective" 
                      else if(pos == "RB" | pos == "RBR") "adverb"
                      else if(pos == "DT") "determiner"
                      else if(pos == "PRP" | pos == "PRP$"| pos == "WP" | pos ==  "WDT" | pos == "WP$"  | pos == "WRB" ) "pronoun"
                      else if (pos == "CC") "conjunction"
                      else if (pos == "CD") "number"
                      else if (pos == "FW") "foreign word"
                      else if (pos == "IN") "preposition"
                      else if (pos == "MD") "modal"
                      else pos)

Merge datasets.

all_words = d.clean %>%
          gather("category", "word", 6:9) %>%
          select(word) %>%
          filter(!grepl(" ", word)) %>%
          distinct(word) %>%
          left_join(pos.collapsed) %>%
          left_join(sent) %>%
          left_join(lemma) 

d.all = d.clean %>%
            left_join(all_words, by = c("cue" = "word")) %>%
            rename(cue.pos = pos,
                   cue.polarity = polarity,
                   cue.subjectivity = subjectivity,
                   cue.lemma = lemma) %>%
            left_join(all_words, by = c("a1" = "word")) %>%
            rename(a1.pos = pos,
                   a1.polarity = polarity,
                   a1.subjectivity = subjectivity,
                   a1.lemma = lemma) %>%
            left_join(all_words, by = c("a2" = "word")) %>%
            rename(a2.pos = pos,
                   a2.polarity = polarity,
                   a2.subjectivity = subjectivity,
                   a2.lemma = lemma) %>%
            left_join(all_words, by = c("a3" = "word")) %>%
            rename(a3.pos = pos,
                   a3.polarity = polarity,
                   a3.subjectivity = subjectivity,
                   a3.lemma = lemma) %>%
            select(userID:education, starts_with('cue'), 
                   starts_with('a1'),
                   starts_with('a2'),
                   starts_with('a3'))

All

Distribution of cue words

d.pos.props = d.all %>%
  select(cue.pos, a1.pos, a2.pos,a3.pos) %>%
  gather("word_type", "pos", 1:4) %>%
  mutate(word_type = as.factor(word_type)) %>%
  group_by(word_type, pos) %>%
  filter(!is.na(pos)) %>%
  summarise(n= n()) %>%
  mutate(prop = n / sum(n)) %>%
  filter(prop > .005)  # filter out low frequency POS

levels(d.pos.props$word_type) = c("a1", "a2", "a3", "cue")
d.pos.props$word_type = factor(d.pos.props$word_type,levels(d.pos.props$word_type)[c(4, 1:3)])
  
ggplot(d.pos.props, aes(x = reorder(pos,-prop), y = prop, fill = pos)) + 
    facet_grid(.~word_type) +
    geom_bar(stat = "identity") + 
    ylab("proportion") +
    xlab("part of speech") +
    ggtitle("Distribution of words") +
    ylim(0,1)+
    theme_bw(base_size = 15) +
    theme(legend.position="none",
          axis.text.x = element_text(angle = 90, hjust = 1))

d.pos.props.na = d.all %>%
  select(cue.pos, a1.pos, a2.pos,a3.pos) %>%
  gather("word_type", "pos", 1:4) %>%
  mutate(word_type = as.factor(word_type)) %>%
  group_by(word_type, pos) %>%
  summarise(n= n()) %>%
  mutate(prop = n / sum(n)) %>%
  filter(prop > .005) %>%  # filter out low frequency POS
  filter(is.na(pos))

levels(d.pos.props.na$word_type) = c("a1", "a2", "a3", "cue")
d.pos.props.na$word_type = factor(d.pos.props.na$word_type,levels(d.pos.props.na$word_type)[c(4, 1:3)])
  
ggplot(d.pos.props.na, aes(x = word_type, y = prop,group = word_type)) + 
    geom_bar(stat = "identity") + 
    ylab("proportion") +
    xlab("Condition") +
    ggtitle("Proportion NA") +
    ylim(0,1)+
    theme_bw(base_size = 15) +
    theme(legend.position="none",
          axis.text.x = element_text(angle = 90, hjust = 1))

Most of these untagged words are two word responses, some of which we probably want to include (e.g. “lunch box”), but not sure how to filter those from non-single-item phrases (“not sure” or “pants and trousers”).

Education

Prop category assocations

pos.counts = d.all %>%
  filter(cue.pos == "noun" | cue.pos == "adjective" | cue.pos == "verb") %>%
  gather("a.pos", "pos", c(12,17,22)) %>%
  select(-cue, -a1, -a2, -a3, -contains("polarity"), -contains("subjectivity"), -contains("lemma")) %>%
    filter(pos == "noun" | pos == "adjective" | pos == "verb") 

pos.props.all = pos.counts %>%
  group_by(pos, userID) %>%
  summarise(n = n()) %>%
  left_join(d.all %>% group_by(userID) %>% slice(1) 
                         %>% ungroup() %>% select(education, userID)) %>%
  left_join(pos.counts %>% group_by(userID) %>% filter(!is.na(pos)) %>% 
              summarise(all = n())) %>%
  mutate(prop = n/all) %>%
  group_by(education, pos) %>%
  multi_boot_standard(column = "prop", na.rm = T) %>%
  filter(education > 1) %>%
  ungroup() %>%
  mutate(education = as.factor(education))

ggplot(pos.props.all, aes(x = reorder(pos, -mean), y = mean,
                          group = education, color = education)) +
  geom_pointrange(aes(ymax = ci_upper, ymin = ci_lower)) +
  xlab("association pos") +
  ylab("mean prop association pos") +
  geom_line() +
  geom_point() + 
  theme_bw(base_size = 15)

Condition on cue

pos.counts = d.all %>%
  filter(cue.pos == "noun" | cue.pos == "adjective" | cue.pos == "verb") %>%
  filter(education > 1) %>%
  group_by(cue.pos, education) %>%
  summarise(n = n(),
            a1_noun = length(which(a1.pos == "noun")),
            a1_adjective = length(which(a1.pos == "adjective")),
            a1_verb = length(which(a1.pos == "verb")), 
            a2_noun = length(which(a2.pos == "noun")),
            a2_adjective = length(which(a2.pos == "adjective")),
            a2_verb = length(which(a2.pos == "verb")), 
            a3_noun = length(which(a3.pos == "noun")),
            a3_adjective = length(which(a3.pos == "adjective")),
            a3_verb = length(which(a3.pos == "verb")))

pos.props = pos.counts %>%
  gather("var", "value", 4:12) %>%
  mutate(prop = value/n) %>%
  mutate(a = unlist(lapply(strsplit(var,"_"),function(x) x[1])),
         a_pos = unlist(lapply(strsplit(var,"_"),function(x) x[2]))) %>%
  select(-var) %>%
  mutate(education = as.factor(education))


ggplot(pos.props, aes(x = reorder(a_pos,-prop), 
                      y = prop, group = education, color = education)) +
  geom_line(stat= "identity") +  
  geom_point(stat= "identity") +
  facet_grid(cue.pos~a ) +
  xlab("association part of speech") +
  theme_bw()

No differences of POS by education.

Age

Prop category assocations

d.pos.age = d.all %>%
            filter(age > 15 & age < 75) %>%
            mutate(age.bin = cut_width(age, width = 10))

pos.prop.age = d.all %>%
  select(userID, age, gender, contains("pos")) %>%
  gather("a.pos", "pos", c(5:7)) %>%
  filter(pos == "noun" | pos == "adjective" | pos == "verb") %>%
  group_by(userID) %>%
  summarise(noun = length(which(pos == "noun"))/n(),
            adjective = length(which(pos == "adjective"))/n(),
            verb = length(which(pos == "verb"))/n()) %>%
  left_join(d.pos.age %>% group_by(userID) %>% slice(1) 
                         %>% ungroup() %>% select(age.bin, userID))  %>%
  gather("type", "prop", 2:4) %>%
  group_by(age.bin, type) %>%   
  multi_boot_standard(column = "prop", na.rm = T) %>%
  filter(!is.na(age.bin))

ggplot(pos.prop.age, aes(x = age.bin, y = mean, colour = type, group = type)) +
  geom_pointrange(aes(ymax = ci_upper, ymin = ci_lower)) +
  geom_line() + 
  xlab("association pos") +
  ylab("age bin") +
  geom_line() +
  geom_point() + 
  theme_bw(base_size = 15) +
  theme(axis.text.x = element_text(angle = 90, hjust = 1))

No differences of POS by age.

Condition on cue

pos.prop.age.cuetype = d.all %>%
  select(userID, age, gender, contains("pos")) %>%
  gather("a.pos", "pos", c(5:7)) %>%
  filter(pos == "noun" | pos == "adjective" | pos == "verb") %>%
  filter(cue.pos == "noun" | cue.pos == "adjective" | cue.pos == "verb") %>%
  group_by(userID, cue.pos) %>%
  summarise(noun = length(which(pos == "noun"))/n(),
            adjective = length(which(pos == "adjective"))/n(),
            verb = length(which(pos == "verb"))/n()) %>%
  left_join(d.pos.age %>% group_by(userID) %>% slice(1) 
                         %>% ungroup() %>% select(age.bin, userID))  %>%
  gather("a.pos", "prop", 3:5) %>%
  group_by(age.bin, a.pos, cue.pos) %>%   
  multi_boot_standard(column = "prop", na.rm = T) %>%
  filter(!is.na(age.bin))

ggplot(pos.prop.age.cuetype, aes(x = age.bin, y = mean, 
                                 colour = a.pos, group = a.pos)) +
  geom_pointrange(aes(ymax = ci_upper, ymin = ci_lower)) +
  facet_grid(.~cue.pos) +
  xlab("age bin") +
  ylab("Proportion assocations") +
  geom_line() +
  geom_point() + 
  theme_bw(base_size = 15) +
  theme(axis.text.x = element_text(angle = 90, hjust = 1))

This plot is faceted by the cue POS. So, each point is the probability of giving an associate of a particular POS, given a cue POS, across development.

It looks like there may be an effect of age given a verb cue: Older people are more likely to respond with another verb than younger people (who respond with nouns). A similar story holds for adjectives.

Gender

Prop. category associations

pos.prop.gender = d.all %>%
  select(userID, age, gender, contains("pos")) %>%
  gather("a.pos", "pos", c(5:7)) %>%
  filter(pos == "noun" | pos == "adjective" | pos == "verb") %>%
  group_by(userID) %>%
  summarise(noun = length(which(pos == "noun"))/n(),
            adjective = length(which(pos == "adjective"))/n(),
            verb = length(which(pos == "verb"))/n()) %>%
  left_join(d.pos.age %>% group_by(userID) %>% slice(1) 
                         %>% ungroup() %>% select(gender, userID))  %>%
  gather("type", "prop", 2:4) %>%
  group_by(gender, type) %>%   
  multi_boot_standard(column = "prop", na.rm = T)  %>%
  filter(!is.na(gender))

ggplot(pos.prop.gender, aes(x = reorder(type,-mean), 
                            y = mean, group = gender,
                            color = gender)) +
  geom_pointrange(aes(ymax = ci_upper, ymin = ci_lower)) +
  geom_line() + 
  xlab("association pos") +
  ylab("prop associations") +
  geom_line() +
  geom_point() + 
  theme_bw(base_size = 15) +
  theme(axis.text.x = element_text(angle = 90, hjust = 1))

No effect of gender.

Condition on cue

pos.prop.gender.cuetype = d.all %>%
  select(userID, age, gender, contains("pos")) %>%
  gather("a.pos", "pos", c(5:7)) %>%
  filter(pos == "noun" | pos == "adjective" | pos == "verb") %>%
  filter(cue.pos == "noun" | cue.pos == "adjective" | cue.pos == "verb") %>%
  group_by(userID, cue.pos) %>%
  summarise(noun = length(which(pos == "noun"))/n(),
            adjective = length(which(pos == "adjective"))/n(),
            verb = length(which(pos == "verb"))/n()) %>%
  left_join(d.pos.age %>% group_by(userID) %>% slice(1) 
                         %>% ungroup() %>% select(gender, userID))  %>%
  gather("a.pos", "prop", 3:5) %>%
  group_by(gender, a.pos, cue.pos) %>%   
  multi_boot_standard(column = "prop", na.rm = T) %>%
  filter(!is.na(gender))

ggplot(pos.prop.gender.cuetype, aes(x = reorder(a.pos, -mean), y = mean, 
                                 colour = gender, group = gender)) +
  geom_pointrange(aes(ymax = ci_upper, ymin = ci_lower)) +
  facet_grid(.~cue.pos) +
  xlab("Gender") +
  ylab("Prop. assocations") +
  geom_line() +
  geom_point() + 
  theme_bw(base_size = 15) +
  theme(axis.text.x = element_text(angle = 90, hjust = 1))

Concreteness

Read in concreteness (from Brysbaert et al., 2013).

conc <- read.csv("../data/brysbaert_corpus.csv",header=TRUE) %>%
  select(Word, Conc.M)


d.all.conc = d.all %>%
            left_join(conc, by = c("cue.lemma" = "Word")) %>%
            rename(cue.conc = Conc.M) %>%
            left_join(conc, by = c("a1.lemma" = "Word")) %>%
            rename(a1.conc = Conc.M) %>%
            left_join(conc, by = c("a2.lemma" = "Word")) %>%
            rename(a2.conc = Conc.M) %>%
            left_join(conc, by = c("a3.lemma" = "Word")) %>%
            rename(a3.conc = Conc.M) %>%
            select(userID:education, contains(".conc"))

All

all.conc.means = d.all.conc %>%
  gather("word_type", "concreteness", 6:9) %>%
  mutate(word_type = as.factor(word_type)) %>%
  filter(!is.na(concreteness)) %>%
  group_by(userID, word_type) %>%
  summarise(mean.concreteness = mean(concreteness)) %>%
  group_by(word_type) %>%
  multi_boot_standard(column = "mean.concreteness", na.rm = T) 

levels(all.conc.means$word_type) = c("a1", "a2", "a3", "cue")
all.conc.means$word_type = factor(all.conc.means$word_type,levels(all.conc.means$word_type)[c(4, 1:3)])

ggplot(all.conc.means, aes(x = word_type, y = mean, 
                              group = 1)) +
  geom_line() +
  geom_pointrange(aes(ymax = ci_upper, ymin = ci_lower))+
  xlab("Word type") +
  ylab("Mean concreteness of word type") +
  theme_bw(base_size = 15)  +
  theme(legend.position = "none")

Education

d.conc.educ = d.all.conc %>%
  gather("a.type", "concreteness", 7:9) %>%
  filter(!is.na(concreteness)) %>%
  group_by(userID) %>%
  summarise(mean.concreteness= mean(concreteness)) %>%
  left_join(d.all %>% group_by(userID) %>% slice(1) 
                         %>% ungroup() %>% select(education, userID))  %>%
  group_by(education) %>%
  filter(education > 1) %>%
  filter(!is.na(education)) %>%
  multi_boot_standard(column = "mean.concreteness", na.rm = T)

ggplot(d.conc.educ, aes(x = education, y = mean)) +
  geom_pointrange(aes(ymax = ci_upper, ymin = ci_lower)) +
  xlab("Education") +
  ylab("Mean concreteness of associates") +
  geom_line() +
  geom_point() + 
  theme_bw(base_size = 15) +
  theme(axis.text.x = element_text(angle = 90, hjust = 1))

Age

d.conc.age = d.all.conc %>%
  gather("a.type", "concreteness", 7:9) %>%
  filter(!is.na(concreteness)) %>%
  group_by(userID) %>%
  summarise(mean.concreteness= mean(concreteness)) %>%
  left_join(d.pos.age %>% group_by(userID) %>% slice(1) 
                         %>% ungroup() %>% select(age.bin, userID))  %>%
  group_by(age.bin) %>%
  filter(!is.na(age.bin))%>%
  multi_boot_standard(column = "mean.concreteness", na.rm = T)

ggplot(d.conc.age, aes(x = age.bin, y = mean)) +
  geom_pointrange(aes(ymax = ci_upper, ymin = ci_lower)) +
  xlab("Age bin") +
  ylab("Mean concretness of associates") +
  theme_bw(base_size = 15) +
  theme(axis.text.x = element_text(angle = 90, hjust = 1))

d.conc.age.atype = d.all.conc %>%
  gather("a.type", "concreteness", 7:9) %>%
  filter(!is.na(concreteness)) %>%
  group_by(userID, a.type) %>%
  summarise(mean.concreteness= mean(concreteness)) %>%
  left_join(d.pos.age %>% group_by(userID) %>% slice(1) 
                         %>% ungroup() %>% select(age.bin, userID))  %>%
  group_by(age.bin, a.type) %>%
  filter(!is.na(age.bin))%>%
  multi_boot_standard(column = "mean.concreteness", na.rm = T)

ggplot(d.conc.age.atype, aes(x = age.bin, y = mean,
                             group = a.type, color = a.type)) +
  geom_pointrange(aes(ymax = ci_upper, ymin = ci_lower)) +
  geom_line() +
  xlab("Age bin") +
  ylab("Mean concretness of associates") +
  theme_bw(base_size = 15) +
  theme(axis.text.x = element_text(angle = 90, hjust = 1))

Gender

d.conc.gender = d.all.conc %>%
  gather("a.type", "concreteness", 7:9) %>%
  filter(!is.na(concreteness)) %>%
  group_by(userID) %>%
  summarise(mean.concreteness= mean(concreteness)) %>%
  left_join(d.all %>% group_by(userID) %>% slice(1) 
                         %>% ungroup() %>% select(gender, userID))  %>%
  group_by(gender) %>%
  filter(!is.na(gender)) %>%
  multi_boot_standard(column = "mean.concreteness", na.rm = T)

ggplot(d.conc.gender , aes(x = gender, y = mean)) +
  geom_pointrange(aes(ymax = ci_upper, ymin = ci_lower)) +
  xlab("Gender") +
  ylab("Mean concreteness of associates") +
  geom_line() +
  geom_point() + 
  theme_bw(base_size = 15) 

Polarity

The textblob toolkit has both polarity and subjectivity measures, so I just threw them in, but there’s a lot of missing data for both measures.

All

all.pol.means = d.all %>%
  select(userID, education, contains("polarity")) %>%
  gather("word_type", "polarity", 2:5) %>%
  mutate(word_type = as.factor(word_type)) %>%
  filter(!is.na(polarity)) %>%
  filter(polarity != 0) %>%
  group_by(userID, word_type) %>%
  summarise(mean.polarity= mean(polarity)) %>%
  group_by(word_type) %>%
  multi_boot_standard(column = "mean.polarity", na.rm = T) 

levels(all.pol.means$word_type) = c("a1", "a2", "a3", "cue")
all.pol.means$word_type = factor(all.pol.means$word_type,levels(all.pol.means$word_type)[c(4, 1:3)])

ggplot(all.pol.means, aes(x = word_type, y = mean, 
                              group = 1)) +
  geom_line() +
  geom_pointrange(aes(ymax = ci_upper, ymin = ci_lower))+
  xlab("Word type") +
  ylab("Mean polarity of word type") +
  theme_bw(base_size = 15)  +
  theme(legend.position = "none")

Education

d.polarity.educ = d.all %>%
  select(userID, education, contains("polarity"), -cue.polarity) %>%
  gather("a.type", "polarity", 3:5) %>%
  filter(!is.na(polarity)) %>%
  filter(polarity != 0) %>%
  group_by(userID) %>%
  summarise(mean.polarity = mean(polarity)) %>%
  left_join(d.all %>% group_by(userID) %>% slice(1) 
                         %>% ungroup() %>% select(education, userID))  %>%
  group_by(education) %>%
  filter(education > 1) %>%
  filter(!is.na(education)) %>%
  multi_boot_standard(column = "mean.polarity", na.rm = T)

ggplot(d.polarity.educ, aes(x = education, y = mean)) +
  geom_pointrange(aes(ymax = ci_upper, ymin = ci_lower)) +
  xlab("Education") +
  ylab("Mean polarity of associates") +
  geom_line() +
  geom_point() + 
  theme_bw(base_size = 15) +
  theme(axis.text.x = element_text(angle = 90, hjust = 1))

Age

d.polarity.age = d.all %>%
  select(userID, education, contains("polarity"), -cue.polarity) %>%
  gather("a.type", "polarity", 3:5) %>%
  filter(!is.na(polarity)) %>%
  filter(polarity != 0) %>%
  group_by(userID) %>%
  summarise(mean.polarity = mean(polarity)) %>%
  left_join(d.pos.age %>% group_by(userID) %>% slice(1) 
                         %>% ungroup() %>% select(age.bin, userID))  %>%
  group_by(age.bin) %>%
  filter(!is.na(age.bin)) %>%
  multi_boot_standard(column = "mean.polarity", na.rm = T)

ggplot(d.polarity.age, aes(x = age.bin, y = mean)) +
  geom_pointrange(aes(ymax = ci_upper, ymin = ci_lower)) +
  xlab("Age bin") +
  ylab("Mean polarity of associates") +
  geom_line() +
  geom_point() + 
  theme_bw(base_size = 15) +
  theme(axis.text.x = element_text(angle = 90, hjust = 1))

Older folks are happier?

Gender

d.polarity.gender = d.all %>%
  select(userID, education, contains("polarity"), -cue.polarity) %>%
  gather("a.type", "polarity", 3:5) %>%
  filter(!is.na(polarity)) %>%
  filter(polarity != 0) %>%
  group_by(userID) %>%
  summarise(mean.polarity = mean(polarity)) %>%
  left_join(d.all %>% group_by(userID) %>% slice(1) 
                         %>% ungroup() %>% select(gender, userID))  %>%
  group_by(gender) %>%
  filter(!is.na(gender)) %>%
  multi_boot_standard(column = "mean.polarity", na.rm = T)

ggplot(d.polarity.gender, aes(x = gender, y = mean)) +
  geom_pointrange(aes(ymax = ci_upper, ymin = ci_lower)) +
  xlab("Education") +
  ylab("Mean polarity of associates") +
  geom_line() +
  geom_point() + 
  theme_bw(base_size = 15) +
  theme(axis.text.x = element_text(angle = 90, hjust = 1))

Subjectivity

All

all.subj.means = d.all %>%
  select(userID, education, contains("subjectivity")) %>%
  gather("word_type", "subjectivity", 2:5)%>%
  mutate(word_type = as.factor(word_type)) %>%
  filter(!is.na(subjectivity)) %>%
  filter(subjectivity != 0) %>%
  group_by(userID, word_type) %>%
  summarise(mean.subjectivity = mean(subjectivity)) %>%
  group_by(word_type) %>%
  multi_boot_standard(column = "mean.subjectivity", na.rm = T) 

levels(all.subj.means$word_type) = c("a1", "a2", "a3", "cue")
all.subj.means$word_type = factor(all.subj.means$word_type,levels(all.subj.means$word_type)[c(4, 1:3)])

ggplot(all.subj.means, aes(x = word_type, y = mean, 
                              group = 1)) +
  geom_line() +
  geom_pointrange(aes(ymax = ci_upper, ymin = ci_lower))+
  xlab("Word Type") +
  ylab("Mean subjectivity of word type") +
  theme_bw(base_size = 15)  +
  theme(legend.position = "none")

Education

d.subj.educ = d.all %>%
  select(userID, education, contains("subjectivity"), -cue.subjectivity) %>%
  gather("a.type", "subjectivity", 3:5) %>%
  filter(!is.na(subjectivity)) %>%
  filter(subjectivity != 0) %>%
  group_by(userID) %>%
  summarise(mean.subjectivity = mean(subjectivity)) %>%
  left_join(d.all %>% group_by(userID) %>% slice(1) 
                         %>% ungroup() %>% select(education, userID))  %>%
  group_by(education) %>%
  filter(education > 1) %>%
  filter(!is.na(education)) %>%
  multi_boot_standard(column = "mean.subjectivity", na.rm = T)

ggplot(d.subj.educ, aes(x = education, y = mean)) +
  geom_pointrange(aes(ymax = ci_upper, ymin = ci_lower)) +
  xlab("Education") +
  ylab("Mean subjectivity of associates") +
  geom_line() +
  geom_point() + 
  theme_bw(base_size = 15) +
  theme(axis.text.x = element_text(angle = 90, hjust = 1))

Age

d.subjectivity.age = d.all %>%
  select(userID, education, contains("subjectivity"), -cue.subjectivity) %>%
  gather("a.type", "subjectivity", 3:5) %>%
  filter(!is.na(subjectivity)) %>%
  filter(subjectivity != 0) %>%
  group_by(userID) %>%
  summarise(mean.subjectivity = mean(subjectivity)) %>%
  left_join(d.pos.age %>% group_by(userID) %>% slice(1) 
                         %>% ungroup() %>% select(age.bin, userID))  %>%
  group_by(age.bin) %>%
  filter(!is.na(age.bin))%>%
  multi_boot_standard(column = "mean.subjectivity", na.rm = T)

ggplot(d.subjectivity.age, aes(x = age.bin, y = mean)) +
  geom_pointrange(aes(ymax = ci_upper, ymin = ci_lower)) +
  xlab("Age bin") +
  ylab("Mean subjectivity of associates") +
  theme_bw(base_size = 15) +
  theme(axis.text.x = element_text(angle = 90, hjust = 1))

Gender

d.subjectivity.gender = d.all %>%
  select(userID, gender, contains("subjectivity"), -cue.subjectivity) %>%
  gather("a.type", "subjectivity", 3:5) %>%
  filter(!is.na(subjectivity)) %>%
  filter(subjectivity != 0) %>%
  group_by(userID) %>%
  summarise(mean.subjectivity = mean(subjectivity)) %>%
  left_join(d.all %>% group_by(userID) %>% slice(1) 
                         %>% ungroup() %>% select(gender, userID))  %>%
  group_by(gender) %>%
  filter(!is.na(gender)) %>%
  multi_boot_standard(column = "mean.subjectivity", na.rm = T)

ggplot(d.subjectivity.gender , aes(x = gender, y = mean)) +
  geom_pointrange(aes(ymax = ci_upper, ymin = ci_lower)) +
  xlab("Gender") +
  ylab("Mean subjectivity of associates") +
  geom_line() +
  geom_point() + 
  theme_bw(base_size = 15)