library(knitr)
library(boot)
library(broom)
library(tidyverse)
library(forcats)
library(wordnet)
opts_chunk$set(echo = T, message = F, warning = F,
error = F, cache = T, tidy = F, fig.height = 4)
setDict("/Documents/GRADUATE_SCHOOL/Misc./wordnet/dict/")
arsq <- function(formula, data, indices) {
d <- data[indices,] # allows boot to select sample
fit <- lm(formula, data=d)
return(summary(fit)$adj.r.square)
}
Read data
simlex = read.table("simlex_with_brysbaert_concreteness.txt",sep="\t", header = T)
Note that were only looking at synsets for meanings of the same POS (which is why “dog” only has 7 here.)
simlex$pos.2 = simlex$Dom_PoS_SUBTLEX.2
simlex$pos.2[882] <- "Noun" # wordnet does not have the verb form for the word "might"
getNumSynsets <- function (word, POS){
POS = ifelse(as.character(POS) == "Name" |as.character(POS) == "Pronoun", "Noun", as.character(POS))
filter <- getTermFilter("ExactMatchFilter",
as.character(word),
TRUE)
print(as.character(word))
terms <- getIndexTerms(toupper(POS), 1, filter)
num_synsets = length(getSynsets(terms[[1]]))
return(num_synsets)
}
# get synsets
#simlex.clean = simlex %>%
# rowwise() %>%
# mutate(num_synsets1 = getNumSynsets(word1, Dom_PoS_SUBTLEX),
# num_synsets2 = getNumSynsets(word2, pos.2))
#write.csv(simlex.clean, "simlex_synsets.csv")
simlex.clean = read.csv("simlex_synsets.csv")
# tidy and get means
simlex.clean = simlex.clean %>%
select(word1,POS,word2, SimLex999, conc.w1., conc.w2.,Lg10WF.2, Lg10WF.1,
num_synsets1, num_synsets2, similarity_news, similarity_wiki) %>%
rowwise() %>%
mutate(mean.conc = mean(c(conc.w1., conc.w2.)),
mean.synsets = mean(c(num_synsets1, num_synsets2)),
mean.freq = mean(c(Lg10WF.1, Lg10WF.2)))
Controling for mean frequency
syns.meds = simlex.clean %>%
group_by(POS) %>%
summarize(median.synsets = median(mean.synsets))
wiki.reg = simlex.clean %>%
left_join(syns.meds) %>%
mutate(syns = as.factor(ifelse(mean.synsets <= median.synsets, "few", "many"))) %>%
group_by(POS, syns) %>%
do(R2.reg = summary(lm(SimLex999~similarity_wiki + mean.freq , data =.))$adj.r.squared[1]) %>%
mutate(R2.reg = unlist(R2.reg)) %>%
mutate(dataset = "wiki")
wiki.cis = simlex.clean %>%
left_join(syns.meds) %>%
mutate(syns = ifelse(mean.synsets <= median.synsets, "few", "many")) %>%
group_by(POS, syns) %>%
do(tidy(boot.ci(boot(data=., statistic=arsq,
R=1000, formula=SimLex999~similarity_wiki + mean.freq),type = "bca")$bca)) %>%
select(POS, syns, V4, V5) %>%
rename(low.ci = V4,
high.ci = V5) %>%
mutate(dataset = "wiki")
news.reg = simlex.clean %>%
left_join(syns.meds) %>%
mutate(syns = ifelse(mean.synsets <= median.synsets, "few", "many")) %>%
group_by(POS, syns) %>%
do(R2.reg = summary(lm(SimLex999~similarity_news + mean.freq , data =.))$adj.r.squared[1]) %>%
mutate(R2.reg = unlist(R2.reg)) %>%
mutate(dataset = "news")
news.cis = simlex.clean %>%
left_join(syns.meds) %>%
mutate(syns = ifelse(mean.synsets <= median.synsets, "few", "many")) %>%
group_by(POS, syns) %>%
do(tidy(boot.ci(boot(data=., statistic=arsq,
R=1000, formula=SimLex999~similarity_news + mean.freq), type = "bca")$bca)) %>%
select(POS, syns, V4, V5) %>%
rename(low.ci = V4,
high.ci = V5) %>%
mutate(dataset = "news")
all.d = rbind(left_join(wiki.reg, wiki.cis),
left_join(news.reg, news.cis)) %>%
mutate(POS = plyr::revalue(POS, c("N"="Noun", "V"="Verb", "A" = "Adj.")))
corpus_names <- c(`news`="Google News",`wiki`="Wikipedia")
#pdf("synsetsPOS3.pdf", width = 6, height = 3)
ggplot(all.d, aes(x = POS, y = R2.reg, group = syns, fill = syns)) +
geom_bar(stat ="identity", position = "dodge") +
geom_linerange(aes(ymax = low.ci, ymin = high.ci), position = position_dodge(width = 0.9))+
facet_grid(~dataset, labeller=as_labeller(corpus_names)) +
xlab("Part of speech") +
ylab(bquote('Adjusted'~R^2)) +
labs(fill = "# of synsets") +
theme_bw()
#dev.off()
controling for mean frequency
wiki.reg = simlex.clean %>%
mutate(syns = as.factor(ifelse(mean.synsets <= median(simlex.clean$mean.synsets), "few", "many"))) %>%
group_by(syns) %>%
do(R2.reg = summary(lm(SimLex999~similarity_wiki + mean.freq , data =.))$adj.r.squared[1]) %>%
mutate(R2.reg = unlist(R2.reg)) %>%
mutate(dataset = "wiki")
wiki.cis = simlex.clean %>%
mutate(syns = ifelse(mean.synsets <= median(simlex.clean$mean.synsets), "few", "many")) %>%
group_by(syns) %>%
do(tidy(boot.ci(boot(data=., statistic=arsq,
R=1000, formula=SimLex999~similarity_wiki + mean.freq),type = "bca")$bca)) %>%
select( syns, V4, V5) %>%
rename(low.ci = V4,
high.ci = V5) %>%
mutate(dataset = "wiki")
news.reg = simlex.clean %>%
mutate(syns = ifelse(mean.synsets <= median(simlex.clean$mean.synsets), "few", "many")) %>%
group_by(syns) %>%
do(R2.reg = summary(lm(SimLex999~similarity_news + mean.freq , data =.))$adj.r.squared[1]) %>%
mutate(R2.reg = unlist(R2.reg)) %>%
mutate(dataset = "news")
news.cis = simlex.clean %>%
mutate(syns = ifelse(mean.synsets <= median(simlex.clean$mean.synsets), "few", "many")) %>%
group_by(syns)%>%
do(tidy(boot.ci(boot(data=., statistic=arsq,
R=1000, formula=SimLex999~similarity_news + mean.freq),type = "bca")$bca)) %>%
select(syns, V4, V5) %>%
rename(low.ci = V4,
high.ci = V5) %>%
mutate(dataset = "news")
all.d = rbind(left_join(wiki.reg, wiki.cis) ,left_join(news.reg, news.cis))
#pdf("synsets.pdf", width = 6, height = 3)
corpus_names <- c(`news`="Google News",`wiki`="Wikipedia")
ggplot(all.d, aes(x = syns, y = R2.reg, group = syns, fill = syns)) +
geom_bar(stat ="identity", position = "dodge") +
geom_linerange(aes(ymax = low.ci, ymin = high.ci), position = position_dodge(width = 0.9))+
facet_grid(~dataset, labeller=as_labeller(corpus_names)) +
xlab("Number of synsets") +
ylab(bquote('Adjusted'~R^2)) +
theme_bw() +
theme(legend.position = "none")
#dev.off()
controling for mean frequency and concreteness
wiki.reg = simlex.clean %>%
left_join(syns.meds) %>%
mutate(syns = ifelse(mean.synsets <= median.synsets, "few", "many")) %>%
group_by(POS, syns) %>%
do(R2.reg = summary(lm(SimLex999 ~ similarity_wiki + mean.freq + mean.conc, data =.))$adj.r.squared[1]) %>%
mutate(R2.reg = unlist(R2.reg)) %>%
mutate(dataset = "wiki")
wiki.cis = simlex.clean %>%
left_join(syns.meds) %>%
mutate(syns = ifelse(mean.synsets <= median.synsets, "few", "many")) %>%
group_by(POS, syns) %>%
do(tidy(boot.ci(boot(data=., statistic=arsq,
R=1000, formula=SimLex999~similarity_wiki + mean.freq +mean.conc),type = "bca")$bca)) %>%
select(POS, syns, V4, V5) %>%
rename(low.ci = V4,
high.ci = V5) %>%
mutate(dataset = "wiki")
news.reg = simlex.clean %>%
left_join(syns.meds) %>%
mutate(syns = ifelse(mean.synsets <= median.synsets, "few", "many")) %>%
group_by(POS, syns) %>%
do(R2.reg = summary(lm(SimLex999~similarity_news + mean.freq + mean.conc , data =.))$adj.r.squared[1]) %>%
mutate(R2.reg = unlist(R2.reg)) %>%
mutate(dataset = "news")
news.cis = simlex.clean %>%
left_join(syns.meds) %>%
mutate(syns = ifelse(mean.synsets <= median.synsets, "few", "many")) %>%
group_by(POS, syns) %>%
do(tidy(boot.ci(boot(data=., statistic=arsq,
R=1000, formula=SimLex999~similarity_news + mean.freq + mean.conc),type = "bca")$bca)) %>%
select(POS, syns, V4, V5) %>%
rename(low.ci = V4,
high.ci = V5) %>%
mutate(dataset = "news")
all.d = rbind(left_join(wiki.reg, wiki.cis) ,left_join(news.reg, news.cis))
ggplot(all.d, aes(x = POS, y = R2.reg, group = syns, fill = syns)) +
geom_bar(stat ="identity", position = "dodge") +
geom_linerange(aes(ymax = low.ci, ymin = high.ci), position = position_dodge(width = 0.9))+
facet_wrap(~dataset) +
xlab("Part of speech") +
theme_bw()
Controling for mean frequency (doesn’t matter whether you control for mean or two words separately).
conc.meds = simlex.clean %>%
group_by(POS) %>%
summarize(median.concs= median(mean.conc))
wiki.reg = simlex.clean %>%
left_join(conc.meds) %>%
mutate(syns = ifelse(mean.conc <= median.concs, "few", "many")) %>%
group_by(POS, syns) %>%
do(R2.reg = summary(lm(SimLex999~similarity_wiki + mean.freq , data =.))$adj.r.squared[1]) %>%
mutate(R2.reg = unlist(R2.reg)) %>%
mutate(dataset = "wiki")
wiki.cis = simlex.clean %>%
left_join(conc.meds) %>%
mutate(syns = ifelse(mean.conc <= median.concs, "few", "many")) %>%
group_by(POS, syns) %>%
do(tidy(boot.ci(boot(data=., statistic=arsq,
R=1000, formula=SimLex999~similarity_wiki + mean.freq ),type = "bca")$bca)) %>%
select(POS, syns, V4, V5) %>%
rename(low.ci = V4,
high.ci = V5) %>%
mutate(dataset = "wiki")
news.reg = simlex.clean %>%
left_join(conc.meds) %>%
mutate(syns = ifelse(mean.conc <= median.concs, "few", "many")) %>%
group_by(POS, syns) %>%
do(R2.reg = summary(lm(SimLex999~similarity_news + mean.freq, data =.))$adj.r.squared[1]) %>%
mutate(R2.reg = unlist(R2.reg)) %>%
mutate(dataset = "news")
news.cis = simlex.clean %>%
left_join(conc.meds) %>%
mutate(syns = ifelse(mean.conc <= median.concs, "few", "many")) %>%
group_by(POS, syns) %>%
do(tidy(boot.ci(boot(data=., statistic=arsq,
R=1000, formula=SimLex999~similarity_news + mean.freq),type = "bca")$bca)) %>%
select(POS, syns, V4, V5) %>%
rename(low.ci = V4,
high.ci = V5) %>%
mutate(dataset = "news")
all.d = rbind(left_join(wiki.reg, wiki.cis) ,left_join(news.reg, news.cis))
ggplot(all.d, aes(x = POS, y = R2.reg, group = syns, fill = syns)) +
geom_bar(stat ="identity", position = "dodge") +
geom_linerange(aes(ymax = low.ci, ymin = high.ci), position = position_dodge(width = 0.9))+
facet_wrap(~dataset) +
xlab("Part of speech") +
theme_bw()
Controling for mean frequency and mean sys
conc.meds = simlex.clean %>%
group_by(POS) %>%
summarize(median.concs= median(mean.conc))
wiki.reg = simlex.clean %>%
left_join(conc.meds) %>%
mutate(syns = ifelse(mean.conc <= median.concs, "few", "many")) %>%
group_by(POS, syns) %>%
do(R2.reg = summary(lm(SimLex999~similarity_wiki + mean.freq+ mean.synsets, data =.))$adj.r.squared[1]) %>%
mutate(R2.reg = unlist(R2.reg)) %>%
mutate(dataset = "wiki")
wiki.cis = simlex.clean %>%
left_join(conc.meds) %>%
mutate(syns = ifelse(mean.conc <= median.concs, "few", "many")) %>%
group_by(POS, syns) %>%
do(tidy(boot.ci(boot(data=., statistic=arsq,
R=1000, formula=SimLex999~similarity_wiki + mean.freq + mean.synsets),type = "bca")$bca)) %>%
select(POS, syns, V4, V5) %>%
rename(low.ci = V4,
high.ci = V5) %>%
mutate(dataset = "wiki")
news.reg = simlex.clean %>%
left_join(conc.meds) %>%
mutate(syns = ifelse(mean.conc <= median.concs, "few", "many")) %>%
group_by(POS, syns) %>%
do(R2.reg = summary(lm(SimLex999~similarity_news + mean.freq + mean.synsets, data =.))$adj.r.squared[1]) %>%
mutate(R2.reg = unlist(R2.reg)) %>%
mutate(dataset = "news")
news.cis = simlex.clean %>%
left_join(conc.meds) %>%
mutate(syns = ifelse(mean.conc <= median.concs, "few", "many")) %>%
group_by(POS, syns) %>%
do(tidy(boot.ci(boot(data=., statistic=arsq,
R=1000, formula=SimLex999~similarity_news + mean.freq + mean.synsets),type = "bca")$bca)) %>%
select(POS, syns, V4, V5) %>%
rename(low.ci = V4,
high.ci = V5) %>%
mutate(dataset = "news")
all.d = rbind(left_join(wiki.reg, wiki.cis) ,left_join(news.reg, news.cis))
ggplot(all.d, aes(x = POS, y = R2.reg, group = syns, fill = syns)) +
geom_bar(stat ="identity", position = "dodge") +
geom_linerange(aes(ymax = low.ci, ymin = high.ci), position = position_dodge(width = 0.9))+
facet_wrap(~dataset) +
xlab("Part of speech") +
theme_bw()