rsq <- function(formula, data, indices) {
d <- data[indices,] # allows boot to select sample
fit <- lm(formula, data=d)
return(summary(fit)$r.square)
}
arsq <- function(formula, data, indices) {
d <- data[indices,] # allows boot to select sample
fit <- lm(formula, data=d)
return(summary(fit)$adj.r.square)
}
library(knitr)
library(boot)
library(broom)
library(tidyverse)
library(forcats)
opts_chunk$set(echo = T, message = F, warning = F,
error = F, cache = F, tidy = F, fig.height = 4)
Read data
simlex = read.table("simlex_with_brysbaert_concreteness.txt",sep="\t", header = T)
Get R2 via boot
boot code from from http://www.statmethods.net/advstats/bootstrapping.html
Median split by POS Bootstrapped mean
wiki.meds = simlex %>%
group_by(POS) %>%
summarize(median.conc = median(Conc.M.1))
wiki = simlex %>%
left_join(wiki.meds) %>%
mutate(conc = ifelse(Conc.M.1 <= median.conc, "abstract", "concrete")) %>%
group_by(POS, conc) %>%
do(tidy(boot(data=., statistic=rsq,
R=1000, formula=SimLex999~Lg10WF.1+Lg10WF.2+similarity_wiki+conc.w2.))) %>%
select(statistic, std.error) %>%
rename(R2 = statistic) %>%
mutate(dataset = "wiki")
news.meds = simlex %>%
group_by(POS) %>%
summarize(median.conc = median(Conc.M.1))
news = simlex %>%
left_join(news.meds) %>%
mutate(conc = ifelse(Conc.M.1 <= median.conc, "abstract", "concrete")) %>%
group_by(POS, conc) %>%
do(tidy(
boot(data=., statistic=rsq,
R=1000, formula=SimLex999~Lg10WF.1+Lg10WF.2+similarity_news+conc.w2.))) %>%
select(statistic, std.error) %>%
rename(R2 = statistic) %>%
mutate(dataset = "news")
Regular mean
wiki.reg = simlex %>%
left_join(wiki.meds) %>%
mutate(conc = ifelse(Conc.M.1 <= median.conc, "abstract", "concrete")) %>%
group_by(POS, conc) %>%
do(R2.reg = summary(lm(SimLex999~Lg10WF.1+Lg10WF.2+similarity_wiki+conc.w2., data =.))$r.squared[1]) %>%
mutate(R2.reg = unlist(R2.reg))
news.reg = simlex %>%
left_join(news.meds) %>%
mutate(conc = ifelse(Conc.M.1 <= median.conc, "abstract", "concrete")) %>%
group_by(POS, conc) %>%
do(R2.reg = summary(lm(SimLex999~Lg10WF.1+Lg10WF.2+similarity_news+conc.w2., data =.))$r.squared[1]) %>%
mutate(R2.reg = unlist(R2.reg))
Get 95 ci via boot.ci
wiki.cis = simlex %>%
left_join(wiki.meds) %>%
mutate(conc = ifelse(Conc.M.1 <= median.conc, "abstract", "concrete")) %>%
group_by(POS, conc) %>%
do(tidy(boot.ci(boot(data=., statistic=rsq,
R=1000, formula=SimLex999~Lg10WF.1+Lg10WF.2+similarity_wiki+conc.w2.),type = "bca")$bca)) %>%
select(POS, conc, V4, V5) %>%
rename(low.ci = V4,
high.ci = V5) %>%
mutate(dataset = "wiki")
news.cis = simlex %>%
left_join(news.meds) %>%
mutate(conc = ifelse(Conc.M.1 <= median.conc, "abstract", "concrete")) %>% group_by(POS, conc) %>%
do(tidy(boot.ci(boot(data=., statistic=rsq,
R=1000, formula=SimLex999~Lg10WF.1+Lg10WF.2+similarity_news+conc.w2.),type = "bca")$bca))%>%
select(POS, conc, V4, V5) %>%
rename(low.ci = V4,
high.ci = V5) %>%
mutate(dataset = "news")
Adjusted regular mean
wiki.adj.reg = simlex %>%
left_join(wiki.meds) %>%
mutate(conc = ifelse(Conc.M.1 <= median.conc, "abstract", "concrete")) %>%
group_by(POS, conc) %>%
do(adj.R2.reg = summary(lm(SimLex999~Lg10WF.1+Lg10WF.2+similarity_wiki+conc.w2.,
data =.))$adj.r.squared[1]) %>%
mutate(adj.R2.reg = unlist(adj.R2.reg))
news.adj.reg = simlex %>%
left_join(news.meds) %>%
mutate(conc = ifelse(Conc.M.1 <= median.conc, "abstract", "concrete")) %>%
group_by(POS, conc) %>%
do(adj.R2.reg = summary(lm(SimLex999~Lg10WF.1+Lg10WF.2+similarity_news+conc.w2., data =.))$adj.r.squared[1]) %>%
mutate(adj.R2.reg = unlist(adj.R2.reg))
Get 95 ci via boot.ci adj.
wiki.adj.cis = simlex %>%
left_join(wiki.meds) %>%
mutate(conc = ifelse(Conc.M.1 <= median.conc, "abstract", "concrete")) %>%
group_by(POS, conc) %>%
do(tidy(boot.ci(boot(data=., statistic=arsq,
R=1000, formula=SimLex999~Lg10WF.1+Lg10WF.2+similarity_wiki+conc.w2.),type = "bca")$bca)) %>%
select(POS, conc, V4, V5) %>%
rename(adj.low.ci = V4,
adj.high.ci = V5) %>%
mutate(dataset = "wiki")
news.adj.cis = simlex %>%
left_join(news.meds) %>%
mutate(conc = ifelse(Conc.M.1 <= median.conc, "abstract", "concrete")) %>% group_by(POS, conc) %>%
do(tidy(boot.ci(boot(data=., statistic=arsq,
R=1000, formula=SimLex999~Lg10WF.1+Lg10WF.2+similarity_news+conc.w2.),type = "bca")$bca))%>%
select(POS, conc, V4, V5) %>%
rename(adj.low.ci = V4,
adj.high.ci = V5) %>%
mutate(dataset = "news")
Bind everything together
all.wiki = left_join(wiki, wiki.cis) %>%
left_join(wiki.reg) %>%
left_join(wiki.adj.reg) %>%
left_join(wiki.adj.cis)
all.news = left_join(news, news.cis) %>%
left_join(news.reg) %>%
left_join(news.adj.reg) %>%
left_join(news.adj.cis)
all.d = rbind(all.wiki,all.news) %>%
select(dataset, POS, conc, R2, low.ci, high.ci, std.error,
R2.reg, adj.R2.reg, adj.low.ci, adj.high.ci) %>%
ungroup() %>%
mutate(high.ci.se = R2 + (std.error * 1),
low.ci.se = R2 - (std.error * 1),
POS = as.factor(POS))
#fct_recode(POS, "N" = "noun")
ggplot(all.d, aes(x = POS, y = R2, group = conc, fill = conc)) +
geom_bar(stat ="identity", position = "dodge") +
geom_linerange(aes(ymax = low.ci, ymin = high.ci), position = position_dodge(width = 0.9))+
facet_wrap(~dataset) +
xlab("Part of speech") +
theme_bw()
ggplot(all.d, aes(x = POS, y = R2.reg, group = conc, fill = conc)) +
geom_bar(stat ="identity", position = "dodge") +
geom_linerange(aes(ymax = low.ci, ymin = high.ci), position = position_dodge(width = 0.9))+
facet_wrap(~dataset) +
xlab("Part of speech") +
theme_bw()
ggplot(all.d, aes(x = POS, y = adj.R2.reg, group = conc, fill = conc)) +
geom_bar(stat ="identity", position = "dodge") +
geom_linerange(aes(ymax = adj.low.ci, ymin = adj.high.ci), position = position_dodge(width = 0.9))+
facet_wrap(~dataset) +
xlab("Part of speech") +
theme_bw()
All the means:
all.d %>%
rename(boot.R2 = R2) %>%
select(-low.ci, -high.ci, -low.ci.se, -high.ci.se, -std.error) %>%
kable(format = "markdown")
| dataset | POS | conc | boot.R2 | R2.reg | adj.R2.reg | adj.low.ci | adj.high.ci |
|---|---|---|---|---|---|---|---|
| wiki | A | abstract | 0.2908048 | 0.2908048 | 0.2362513 | -0.0124138 | 0.4207099 |
| wiki | A | concrete | 0.1776202 | 0.1776202 | 0.1061089 | -0.0710995 | 0.2735055 |
| wiki | N | abstract | 0.2250343 | 0.2250343 | 0.2154962 | 0.1298778 | 0.3080355 |
| wiki | N | concrete | 0.2320659 | 0.2320659 | 0.2226722 | 0.1372494 | 0.2942879 |
| wiki | V | abstract | 0.2303449 | 0.2303449 | 0.2007427 | 0.0563019 | 0.3159019 |
| wiki | V | concrete | 0.0715155 | 0.0715155 | 0.0361447 | -0.0350556 | 0.1295207 |
| news | A | abstract | 0.4630949 | 0.4630949 | 0.4225737 | 0.1908190 | 0.5804148 |
| news | A | concrete | 0.2919595 | 0.2919595 | 0.2329561 | 0.0051848 | 0.3861982 |
| news | N | abstract | 0.2559431 | 0.2559431 | 0.2467288 | 0.1417995 | 0.3323447 |
| news | N | concrete | 0.3623012 | 0.3623012 | 0.3542545 | 0.2679943 | 0.4290277 |
| news | V | abstract | 0.2846781 | 0.2846781 | 0.2574278 | 0.1032017 | 0.3765030 |
| news | V | concrete | 0.1186857 | 0.1186857 | 0.0854286 | -0.0229473 | 0.2207201 |