rsq <- function(formula, data, indices) {
  d <- data[indices,] # allows boot to select sample 
  fit <- lm(formula, data=d)
  return(summary(fit)$r.square)
} 


arsq <- function(formula, data, indices) {
  d <- data[indices,] # allows boot to select sample 
  fit <- lm(formula, data=d)
  return(summary(fit)$adj.r.square)
} 

library(knitr)
library(boot)
library(broom)
library(tidyverse)
library(forcats)

opts_chunk$set(echo = T, message = F, warning = F, 
               error = F, cache = F, tidy = F, fig.height = 4)

Read data

simlex = read.table("simlex_with_brysbaert_concreteness.txt",sep="\t", header = T)

Get R2 via boot

boot code from from http://www.statmethods.net/advstats/bootstrapping.html

Median split by POS Bootstrapped mean

wiki.meds = simlex %>% 
  group_by(POS) %>%
  summarize(median.conc = median(Conc.M.1))

wiki = simlex %>% 
  left_join(wiki.meds) %>%
  mutate(conc = ifelse(Conc.M.1 <= median.conc, "abstract", "concrete")) %>%
  group_by(POS, conc) %>%
  do(tidy(boot(data=., statistic=rsq, 
       R=1000, formula=SimLex999~Lg10WF.1+Lg10WF.2+similarity_wiki+conc.w2.))) %>%
  select(statistic, std.error) %>%
  rename(R2 = statistic) %>%
  mutate(dataset = "wiki")

news.meds = simlex %>% 
  group_by(POS) %>%
  summarize(median.conc = median(Conc.M.1))

news = simlex %>% 
  left_join(news.meds) %>%
  mutate(conc = ifelse(Conc.M.1 <= median.conc, "abstract", "concrete")) %>%
  group_by(POS, conc)  %>%
  do(tidy(
  boot(data=., statistic=rsq, 
       R=1000, formula=SimLex999~Lg10WF.1+Lg10WF.2+similarity_news+conc.w2.))) %>%
  select(statistic, std.error) %>%
  rename(R2 = statistic) %>%
  mutate(dataset = "news")

Regular mean

wiki.reg = simlex %>% 
  left_join(wiki.meds) %>%
  mutate(conc = ifelse(Conc.M.1 <= median.conc, "abstract", "concrete")) %>%
  group_by(POS, conc)  %>%
  do(R2.reg = summary(lm(SimLex999~Lg10WF.1+Lg10WF.2+similarity_wiki+conc.w2., data =.))$r.squared[1]) %>%
  mutate(R2.reg = unlist(R2.reg))

news.reg = simlex %>% 
  left_join(news.meds) %>%
  mutate(conc = ifelse(Conc.M.1 <= median.conc, "abstract", "concrete")) %>%
  group_by(POS, conc)  %>%
  do(R2.reg = summary(lm(SimLex999~Lg10WF.1+Lg10WF.2+similarity_news+conc.w2., data =.))$r.squared[1]) %>%
  mutate(R2.reg = unlist(R2.reg))

Get 95 ci via boot.ci

wiki.cis = simlex %>% 
  left_join(wiki.meds) %>%
  mutate(conc = ifelse(Conc.M.1 <= median.conc, "abstract", "concrete")) %>%
  group_by(POS, conc) %>%
  do(tidy(boot.ci(boot(data=., statistic=rsq, 
                     R=1000, formula=SimLex999~Lg10WF.1+Lg10WF.2+similarity_wiki+conc.w2.),type = "bca")$bca)) %>%
  select(POS, conc, V4, V5) %>%
  rename(low.ci = V4,
         high.ci = V5) %>%
  mutate(dataset = "wiki")

news.cis = simlex %>% 
  left_join(news.meds) %>%
  mutate(conc = ifelse(Conc.M.1 <= median.conc, "abstract", "concrete")) %>%  group_by(POS, conc) %>%
  do(tidy(boot.ci(boot(data=., statistic=rsq, 
                       R=1000, formula=SimLex999~Lg10WF.1+Lg10WF.2+similarity_news+conc.w2.),type = "bca")$bca))%>%
  select(POS, conc, V4, V5) %>%
  rename(low.ci = V4,
         high.ci = V5) %>%
  mutate(dataset = "news")

Adjusted regular mean

wiki.adj.reg = simlex %>% 
  left_join(wiki.meds) %>%
  mutate(conc = ifelse(Conc.M.1 <= median.conc, "abstract", "concrete")) %>%
  group_by(POS, conc)  %>%
  do(adj.R2.reg = summary(lm(SimLex999~Lg10WF.1+Lg10WF.2+similarity_wiki+conc.w2.,
                             data =.))$adj.r.squared[1]) %>%
  mutate(adj.R2.reg = unlist(adj.R2.reg))

news.adj.reg = simlex %>% 
  left_join(news.meds) %>%
  mutate(conc = ifelse(Conc.M.1 <= median.conc, "abstract", "concrete")) %>%
  group_by(POS, conc)  %>%
  do(adj.R2.reg = summary(lm(SimLex999~Lg10WF.1+Lg10WF.2+similarity_news+conc.w2., data =.))$adj.r.squared[1]) %>%
  mutate(adj.R2.reg = unlist(adj.R2.reg))

Get 95 ci via boot.ci adj.

wiki.adj.cis = simlex %>% 
  left_join(wiki.meds) %>%
  mutate(conc = ifelse(Conc.M.1 <= median.conc, "abstract", "concrete")) %>%
  group_by(POS, conc) %>%
  do(tidy(boot.ci(boot(data=., statistic=arsq, 
                     R=1000, formula=SimLex999~Lg10WF.1+Lg10WF.2+similarity_wiki+conc.w2.),type = "bca")$bca)) %>%
  select(POS, conc, V4, V5) %>%
  rename(adj.low.ci = V4,
         adj.high.ci = V5) %>%
  mutate(dataset = "wiki")

news.adj.cis = simlex %>% 
  left_join(news.meds) %>%
  mutate(conc = ifelse(Conc.M.1 <= median.conc, "abstract", "concrete")) %>%  group_by(POS, conc) %>%
  do(tidy(boot.ci(boot(data=., statistic=arsq, 
                       R=1000, formula=SimLex999~Lg10WF.1+Lg10WF.2+similarity_news+conc.w2.),type = "bca")$bca))%>%
  select(POS, conc, V4, V5) %>%
  rename(adj.low.ci = V4,
         adj.high.ci = V5) %>%
  mutate(dataset = "news")

Bind everything together

all.wiki = left_join(wiki, wiki.cis) %>% 
  left_join(wiki.reg) %>% 
  left_join(wiki.adj.reg) %>%
  left_join(wiki.adj.cis) 

all.news = left_join(news, news.cis) %>%
  left_join(news.reg) %>%  
  left_join(news.adj.reg) %>%
  left_join(news.adj.cis) 

all.d = rbind(all.wiki,all.news) %>%
  select(dataset, POS, conc, R2, low.ci, high.ci, std.error, 
         R2.reg, adj.R2.reg, adj.low.ci, adj.high.ci) %>%
  ungroup() %>%
  mutate(high.ci.se = R2 + (std.error * 1),
         low.ci.se = R2 - (std.error * 1),
         POS = as.factor(POS)) 
  #fct_recode(POS, "N" = "noun")

R2 - bootstrapping mean

ggplot(all.d, aes(x = POS, y = R2, group = conc, fill = conc)) +
  geom_bar(stat ="identity", position = "dodge") +
  geom_linerange(aes(ymax = low.ci, ymin = high.ci), position = position_dodge(width = 0.9))+
  facet_wrap(~dataset) +
  xlab("Part of speech") +
  theme_bw()

R2 - no bootstrapping mean

ggplot(all.d, aes(x = POS, y = R2.reg, group = conc, fill = conc)) +
  geom_bar(stat ="identity", position = "dodge") +
  geom_linerange(aes(ymax = low.ci, ymin = high.ci), position = position_dodge(width = 0.9))+
  facet_wrap(~dataset) +
  xlab("Part of speech") +
  theme_bw()

adjusted R2

ggplot(all.d, aes(x = POS, y = adj.R2.reg, group = conc, fill = conc)) +
  geom_bar(stat ="identity", position = "dodge") +
  geom_linerange(aes(ymax = adj.low.ci, ymin = adj.high.ci), position = position_dodge(width = 0.9))+
  facet_wrap(~dataset) +
  xlab("Part of speech") +
  theme_bw()

All the means:

all.d %>%
  rename(boot.R2 = R2) %>%
  select(-low.ci, -high.ci, -low.ci.se, -high.ci.se, -std.error) %>%
  kable(format = "markdown")
dataset POS conc boot.R2 R2.reg adj.R2.reg adj.low.ci adj.high.ci
wiki A abstract 0.2908048 0.2908048 0.2362513 -0.0124138 0.4207099
wiki A concrete 0.1776202 0.1776202 0.1061089 -0.0710995 0.2735055
wiki N abstract 0.2250343 0.2250343 0.2154962 0.1298778 0.3080355
wiki N concrete 0.2320659 0.2320659 0.2226722 0.1372494 0.2942879
wiki V abstract 0.2303449 0.2303449 0.2007427 0.0563019 0.3159019
wiki V concrete 0.0715155 0.0715155 0.0361447 -0.0350556 0.1295207
news A abstract 0.4630949 0.4630949 0.4225737 0.1908190 0.5804148
news A concrete 0.2919595 0.2919595 0.2329561 0.0051848 0.3861982
news N abstract 0.2559431 0.2559431 0.2467288 0.1417995 0.3323447
news N concrete 0.3623012 0.3623012 0.3542545 0.2679943 0.4290277
news V abstract 0.2846781 0.2846781 0.2574278 0.1032017 0.3765030
news V concrete 0.1186857 0.1186857 0.0854286 -0.0229473 0.2207201