library(knitr)
library(boot)
library(broom)
library(tidyverse)
library(forcats)
library(wordnet)

opts_chunk$set(echo = T, message = F, warning = F, 
               error = F, cache = T, tidy = F, fig.height = 4)

setDict("/Documents/GRADUATE_SCHOOL/Misc./wordnet/dict/")

arsq <- function(formula, data, indices) {
  d <- data[indices,] # allows boot to select sample 
  fit <- lm(formula, data=d)
  return(summary(fit)$adj.r.square)
} 

Read data

simlex = read.table("simlex_with_brysbaert_concreteness.txt",sep="\t", header = T)

Get synsets

Note that were only looking at synsets for meanings of the same POS (which is why “dog” only has 7 here.)

simlex$pos.2 = simlex$Dom_PoS_SUBTLEX.2
simlex$pos.2[882] <- "Noun" # wordnet does not have the verb form for the word "might"

getNumSynsets <- function (word, POS){
      POS = ifelse(as.character(POS) == "Name" |as.character(POS) == "Pronoun", "Noun", as.character(POS))
      filter <- getTermFilter("ExactMatchFilter",
                        as.character(word),
                        TRUE)
      print(as.character(word))
      terms <- getIndexTerms(toupper(POS), 1, filter)
      num_synsets = length(getSynsets(terms[[1]]))
      return(num_synsets)
}

# get synsets
#simlex.clean = simlex %>%
#  rowwise() %>%
#  mutate(num_synsets1 = getNumSynsets(word1, Dom_PoS_SUBTLEX),
#         num_synsets2 = getNumSynsets(word2, pos.2))  

#write.csv(simlex.clean, "simlex_synsets.csv")

simlex.clean = read.csv("simlex_synsets.csv")

# tidy and get means
simlex.clean = simlex.clean %>%
  select(word1,POS,word2, SimLex999, conc.w1., conc.w2.,Lg10WF.2, Lg10WF.1, 
         num_synsets1, num_synsets2, similarity_news, similarity_wiki) %>%
  rowwise() %>%
  mutate(mean.conc = mean(c(conc.w1., conc.w2.)),
         mean.synsets = mean(c(num_synsets1, num_synsets2)), 
         mean.freq = mean(c(Lg10WF.1, Lg10WF.2)))

Synset analyses

By dataset

Controling for mean frequency

syns.meds = simlex.clean %>% 
  group_by(POS) %>%
  summarize(median.synsets = median(mean.synsets))

wiki.reg = simlex.clean %>% 
  left_join(syns.meds) %>%
  mutate(syns = as.factor(ifelse(mean.synsets <= median.synsets, "few", "many"))) %>%
  group_by(POS, syns)  %>%
  do(R2.reg = summary(lm(SimLex999~similarity_wiki + mean.freq , data =.))$adj.r.squared[1]) %>%
  mutate(R2.reg = unlist(R2.reg)) %>%
  mutate(dataset = "wiki")

wiki.cis = simlex.clean %>% 
   left_join(syns.meds) %>%
  mutate(syns = ifelse(mean.synsets <= median.synsets, "few", "many")) %>%
  group_by(POS, syns)  %>%
  do(tidy(boot.ci(boot(data=., statistic=arsq, 
                     R=1000, formula=SimLex999~similarity_wiki + mean.freq),type = "bca")$bca)) %>%
  select(POS, syns, V4, V5) %>%
  rename(low.ci = V4,
         high.ci = V5) %>%
  mutate(dataset = "wiki")

news.reg = simlex.clean %>% 
  left_join(syns.meds) %>%
  mutate(syns = ifelse(mean.synsets <= median.synsets, "few", "many")) %>%
  group_by(POS, syns)  %>%
  do(R2.reg = summary(lm(SimLex999~similarity_news + mean.freq , data =.))$adj.r.squared[1]) %>%
  mutate(R2.reg = unlist(R2.reg))  %>%
  mutate(dataset = "news")

news.cis = simlex.clean %>% 
  left_join(syns.meds) %>%
  mutate(syns = ifelse(mean.synsets <=  median.synsets, "few", "many")) %>%
  group_by(POS, syns)   %>%
  do(tidy(boot.ci(boot(data=., statistic=arsq, 
                     R=1000, formula=SimLex999~similarity_news + mean.freq), type = "bca")$bca)) %>%
  select(POS, syns, V4, V5) %>%
  rename(low.ci = V4,
         high.ci = V5) %>%
  mutate(dataset = "news")

all.d = rbind(left_join(wiki.reg, wiki.cis), 
              left_join(news.reg, news.cis)) %>%
  mutate(POS = plyr::revalue(POS, c("N"="Noun", "V"="Verb", "A" = "Adj.")))

corpus_names <- c(`news`="Google News",`wiki`="Wikipedia")

#pdf("synsetsPOS3.pdf", width = 6, height = 3)
ggplot(all.d, aes(x = POS, y = R2.reg, group = syns, fill = syns)) +
  geom_bar(stat ="identity", position = "dodge") +
  geom_linerange(aes(ymax = low.ci, ymin = high.ci), position = position_dodge(width = 0.9))+
  facet_grid(~dataset, labeller=as_labeller(corpus_names)) +
  xlab("Part of speech") +
  ylab(bquote('Adjusted'~R^2)) +
  labs(fill = "# of synsets") +
  theme_bw() 

#dev.off()

By dataset and POS

controling for mean frequency

wiki.reg = simlex.clean %>% 
  mutate(syns = as.factor(ifelse(mean.synsets <= median(simlex.clean$mean.synsets), "few", "many"))) %>%
  group_by(syns)  %>%
  do(R2.reg = summary(lm(SimLex999~similarity_wiki + mean.freq , data =.))$adj.r.squared[1]) %>%
  mutate(R2.reg = unlist(R2.reg)) %>%
  mutate(dataset = "wiki")

wiki.cis = simlex.clean %>% 
  mutate(syns = ifelse(mean.synsets <=  median(simlex.clean$mean.synsets), "few", "many")) %>%
  group_by(syns)  %>%
  do(tidy(boot.ci(boot(data=., statistic=arsq, 
                     R=1000, formula=SimLex999~similarity_wiki + mean.freq),type = "bca")$bca)) %>%
  select( syns, V4, V5) %>%
  rename(low.ci = V4,
         high.ci = V5) %>%
  mutate(dataset = "wiki")

news.reg = simlex.clean %>% 
  mutate(syns = ifelse(mean.synsets <= median(simlex.clean$mean.synsets), "few", "many")) %>%
  group_by(syns)  %>%
  do(R2.reg = summary(lm(SimLex999~similarity_news + mean.freq , data =.))$adj.r.squared[1]) %>%
  mutate(R2.reg = unlist(R2.reg))  %>%
  mutate(dataset = "news")

news.cis = simlex.clean %>% 
  mutate(syns = ifelse(mean.synsets <= median(simlex.clean$mean.synsets), "few", "many")) %>%
  group_by(syns)%>%
  do(tidy(boot.ci(boot(data=., statistic=arsq, 
                     R=1000, formula=SimLex999~similarity_news + mean.freq),type = "bca")$bca)) %>%
  select(syns, V4, V5) %>%
  rename(low.ci = V4,
         high.ci = V5) %>%
  mutate(dataset = "news")

all.d = rbind(left_join(wiki.reg, wiki.cis) ,left_join(news.reg, news.cis))

#pdf("synsets.pdf", width = 6, height = 3)
corpus_names <- c(`news`="Google News",`wiki`="Wikipedia")
ggplot(all.d, aes(x = syns, y = R2.reg, group = syns, fill = syns)) +
  geom_bar(stat ="identity", position = "dodge") +
  geom_linerange(aes(ymax = low.ci, ymin = high.ci), position = position_dodge(width = 0.9))+
  facet_grid(~dataset, labeller=as_labeller(corpus_names)) +
  xlab("Number of synsets") +
  ylab(bquote('Adjusted'~R^2)) +
  theme_bw() +
  theme(legend.position = "none")

#dev.off()

By dataset and POS

controling for mean frequency and concreteness

wiki.reg = simlex.clean %>% 
  left_join(syns.meds) %>%
  mutate(syns = ifelse(mean.synsets <= median.synsets, "few", "many")) %>%
  group_by(POS, syns) %>%
  do(R2.reg = summary(lm(SimLex999 ~ similarity_wiki + mean.freq + mean.conc, data =.))$adj.r.squared[1]) %>%
  mutate(R2.reg = unlist(R2.reg)) %>%
  mutate(dataset = "wiki")

wiki.cis = simlex.clean %>% 
  left_join(syns.meds) %>%
  mutate(syns = ifelse(mean.synsets <= median.synsets, "few", "many")) %>%
  group_by(POS, syns)  %>%
  do(tidy(boot.ci(boot(data=., statistic=arsq, 
                     R=1000, formula=SimLex999~similarity_wiki + mean.freq +mean.conc),type = "bca")$bca)) %>%
  select(POS, syns, V4, V5) %>%
  rename(low.ci = V4,
         high.ci = V5) %>%
  mutate(dataset = "wiki")

news.reg = simlex.clean %>% 
  left_join(syns.meds) %>%
  mutate(syns = ifelse(mean.synsets <= median.synsets, "few", "many")) %>%
  group_by(POS, syns)  %>%
  do(R2.reg = summary(lm(SimLex999~similarity_news + mean.freq + mean.conc , data =.))$adj.r.squared[1]) %>%
  mutate(R2.reg = unlist(R2.reg))  %>%
  mutate(dataset = "news")

news.cis = simlex.clean %>% 
  left_join(syns.meds) %>%
  mutate(syns = ifelse(mean.synsets <= median.synsets, "few", "many")) %>%
  group_by(POS, syns)   %>%
  do(tidy(boot.ci(boot(data=., statistic=arsq, 
                     R=1000, formula=SimLex999~similarity_news + mean.freq + mean.conc),type = "bca")$bca)) %>%
  select(POS, syns, V4, V5) %>%
  rename(low.ci = V4,
         high.ci = V5) %>%
  mutate(dataset = "news")

all.d = rbind(left_join(wiki.reg, wiki.cis) ,left_join(news.reg, news.cis))

ggplot(all.d, aes(x = POS, y = R2.reg, group = syns, fill = syns)) +
  geom_bar(stat ="identity", position = "dodge") +
  geom_linerange(aes(ymax = low.ci, ymin = high.ci), position = position_dodge(width = 0.9))+
  facet_wrap(~dataset) +
  xlab("Part of speech") +
  theme_bw()

Concreteness analyses

By dataset and POS

Controling for mean frequency (doesn’t matter whether you control for mean or two words separately).

conc.meds = simlex.clean %>% 
  group_by(POS) %>%
  summarize(median.concs= median(mean.conc))

wiki.reg = simlex.clean %>% 
  left_join(conc.meds) %>%
  mutate(syns = ifelse(mean.conc <= median.concs, "few", "many")) %>%
  group_by(POS, syns)  %>%
  do(R2.reg = summary(lm(SimLex999~similarity_wiki + mean.freq , data =.))$adj.r.squared[1]) %>%
  mutate(R2.reg = unlist(R2.reg)) %>%
  mutate(dataset = "wiki")

wiki.cis = simlex.clean %>% 
  left_join(conc.meds) %>%
  mutate(syns = ifelse(mean.conc <= median.concs, "few", "many")) %>%
  group_by(POS, syns)  %>%
  do(tidy(boot.ci(boot(data=., statistic=arsq, 
                     R=1000, formula=SimLex999~similarity_wiki + mean.freq ),type = "bca")$bca)) %>%
  select(POS, syns, V4, V5) %>%
  rename(low.ci = V4,
         high.ci = V5) %>%
  mutate(dataset = "wiki")

news.reg = simlex.clean %>% 
  left_join(conc.meds) %>%
  mutate(syns = ifelse(mean.conc <= median.concs, "few", "many")) %>%
  group_by(POS, syns)  %>%
  do(R2.reg = summary(lm(SimLex999~similarity_news + mean.freq, data =.))$adj.r.squared[1]) %>%
  mutate(R2.reg = unlist(R2.reg))  %>%
  mutate(dataset = "news")

news.cis = simlex.clean %>% 
  left_join(conc.meds) %>%
  mutate(syns = ifelse(mean.conc <= median.concs, "few", "many")) %>%
  group_by(POS, syns)   %>%
  do(tidy(boot.ci(boot(data=., statistic=arsq, 
                     R=1000, formula=SimLex999~similarity_news + mean.freq),type = "bca")$bca)) %>%
  select(POS, syns, V4, V5) %>%
  rename(low.ci = V4,
         high.ci = V5) %>%
  mutate(dataset = "news")

all.d = rbind(left_join(wiki.reg, wiki.cis) ,left_join(news.reg, news.cis))

ggplot(all.d, aes(x = POS, y = R2.reg, group = syns, fill = syns)) +
  geom_bar(stat ="identity", position = "dodge") +
  geom_linerange(aes(ymax = low.ci, ymin = high.ci), position = position_dodge(width = 0.9))+
  facet_wrap(~dataset) +
  xlab("Part of speech") +
  theme_bw()

By dataset and POS

Controling for mean frequency and mean sys

conc.meds = simlex.clean %>% 
  group_by(POS) %>%
  summarize(median.concs= median(mean.conc))

wiki.reg = simlex.clean %>% 
  left_join(conc.meds) %>%
  mutate(syns = ifelse(mean.conc <= median.concs, "few", "many")) %>%
  group_by(POS, syns)  %>%
  do(R2.reg = summary(lm(SimLex999~similarity_wiki + mean.freq+ mean.synsets, data =.))$adj.r.squared[1]) %>%
  mutate(R2.reg = unlist(R2.reg)) %>%
  mutate(dataset = "wiki")

wiki.cis = simlex.clean %>% 
  left_join(conc.meds) %>%
  mutate(syns = ifelse(mean.conc <= median.concs, "few", "many")) %>%
  group_by(POS, syns)  %>%
  do(tidy(boot.ci(boot(data=., statistic=arsq, 
                     R=1000, formula=SimLex999~similarity_wiki + mean.freq + mean.synsets),type = "bca")$bca)) %>%
  select(POS, syns, V4, V5) %>%
  rename(low.ci = V4,
         high.ci = V5) %>%
  mutate(dataset = "wiki")

news.reg = simlex.clean %>% 
  left_join(conc.meds) %>%
  mutate(syns = ifelse(mean.conc <= median.concs, "few", "many")) %>%
  group_by(POS, syns)  %>%
  do(R2.reg = summary(lm(SimLex999~similarity_news + mean.freq + mean.synsets, data =.))$adj.r.squared[1]) %>%
  mutate(R2.reg = unlist(R2.reg))  %>%
  mutate(dataset = "news")

news.cis = simlex.clean %>% 
  left_join(conc.meds) %>%
  mutate(syns = ifelse(mean.conc <= median.concs, "few", "many")) %>%
  group_by(POS, syns)   %>%
  do(tidy(boot.ci(boot(data=., statistic=arsq, 
                     R=1000, formula=SimLex999~similarity_news + mean.freq + mean.synsets),type = "bca")$bca)) %>%
  select(POS, syns, V4, V5) %>%
  rename(low.ci = V4,
         high.ci = V5) %>%
  mutate(dataset = "news")

all.d = rbind(left_join(wiki.reg, wiki.cis) ,left_join(news.reg, news.cis))

ggplot(all.d, aes(x = POS, y = R2.reg, group = syns, fill = syns)) +
  geom_bar(stat ="identity", position = "dodge") +
  geom_linerange(aes(ymax = low.ci, ymin = high.ci), position = position_dodge(width = 0.9))+
  facet_wrap(~dataset) +
  xlab("Part of speech") +
  theme_bw()