Getting NBER metadata

The NBER folks recommend downloading the working-papers .tab file (https://data.nber.org/nber-wp-logs/working_papers.tab).

You can also download NBER metadata as .txt files from here. (The issue with the .txt files is that they are often missing dates, which I want for a lot of plots.)

I’ll use the .tab file for most of the analysis and the authors .txt file later, when I get into author demographics.

I read the data directly from the NBER URLs with fread(). I’m doing this on 6/28/21; if you run this later, the output will differ slightly because the raw data will include more papers.

library(data.table);library(predictrace);library(tidyverse);library(tidyr)
library(tidytext);library(quickpalette);library(viridis)
library(usmap);library(socviz)
library(ggraph);library(igraph)
library(lubridate);library(patchwork)

# Read the working-paper metadata straight from NBER and keep only rows whose
# abstract is not the literal string "NULL". quote = "" tells fread() to treat
# quote characters as ordinary text instead of field delimiters.
# To work from a local copy, swap the URL for "nber-data/working_papers.tab".
nber <- filter(
  fread("https://data.nber.org/nber-wp-logs/working_papers.tab", quote = ""),
  abstract != "NULL"
)

# Number of distinct paper IDs remaining after the filter.
nrow(distinct(nber, paper))
[1] 30488

There are 30,488 papers in the NBER data.

Data vs models?

I use the same word searches as Currie, Kleven, and Zwiers (2020).1

# Flag each abstract for the presence of method/topic keywords, following the
# word lists of Currie, Kleven, and Zwiers (2020). Every flag is a logical
# column added to `nber`.
#
# str_detect() matching is case-sensitive, which is why capitalisation
# variants are listed explicitly. Each term is kept on its own line and the
# terms are joined with "|" into one regular expression; a term must never be
# wrapped across lines, because a line break inside a string literal becomes
# part of the pattern and silently stops that alternative from matching.
nber <- nber %>%
  mutate(
    # "exploit" (e.g. "we exploit variation in ...")
    ex = str_detect(abstract, "exploit"),
    # Difference-in-differences
    dd = str_detect(abstract, paste(c(
      "Difference in Diff",
      "Difference in diff",
      "difference in diff",
      "Difference-in-Diff",
      "Difference-in-diff",
      "difference-in-diff",
      "Differences in Diff",
      "Differences in diff",
      "differences in diff",
      "Differences-in-Diff",
      "Differences-in-diff",
      "differences-in-diff",
      "diff-in-diff",
      "d-in-d",
      "DiD"
    ), collapse = "|")),
    # Event studies
    event = str_detect(abstract, paste(c(
      "event study",
      "event-study"
    ), collapse = "|")),
    # Instrumental variables
    iv = str_detect(abstract, paste(c(
      "Instrumental Variable",
      "Instrumental variable",
      "instrumental variable",
      "Instrumental-Variable",
      "Instrumental-variable",
      "instrumental-variable",
      "Two Stage Least Squares",
      "Two stage least squares",
      "two stage least squares",
      "2SLS",
      "TSLS",
      "valid instrument",
      "exogenous instrument",
      "IV Estimat",
      "IV estimat",
      "IV-estimat",
      "IV Specification",
      "IV specification",
      "IV-specification",
      "IV Regression",
      "IV regression",
      "IV-regression",
      "IV Strateg",
      "IV strateg",
      "IV-strateg",
      "we instrument",
      "I instrument",
      "paper instruments",
      "exclusion restriction",
      "weak first stage",
      "simulated instrument"
    ), collapse = "|")),
    # Regression discontinuity / regression kink
    rd = str_detect(abstract, paste(c(
      "Regression Discontinuit",
      "Regression discontinuit",
      "regression discontinuit",
      "Regression-discontinuity",
      "regression-discontinuity",
      "Regression Kink",
      "Regression kink",
      "regression kink",
      "RD Design",
      "RD design",
      "RD-design",
      "RD Estimat",
      "RD estimat",
      "RD-estimat",
      "RD Model",
      "RD model",
      "RD-model",
      "RD Regression",
      "RD regression",
      "RD-regression",
      "RD Coefficient",
      "RD coefficient",
      "RD-coefficient",
      "RK Design",
      "RK design",
      "RK-Design",
      "RK-design",
      "RKD"
    ), collapse = "|")),
    # COVID-19
    covid = str_detect(abstract, paste(c(
      "covid",
      "coronavirus",
      "COVID",
      "Covid",
      "CORONAVIRUS"
    ), collapse = "|")),
    # Plain substring matches, so e.g. "database" and "models" also count.
    data = str_detect(abstract, "data"),
    model = str_detect(abstract, "model")
  )

Note: I use the {ggtext} package to color words in the title and subtitle.

library(ggtext)

# Yearly share of abstracts (1980 onward) that mention "data" or "model",
# drawn as two lines whose colors match the colored words in the title.
nber %>%
  # ymd() returns NA for dates it cannot parse; those rows fail the filter.
  mutate(year = year(ymd(public_date))) %>%
  filter(year >= 1980) %>%
  group_by(year) %>%
  # The mean of a logical column is the fraction of TRUE values.
  summarise(
    data = mean(data),
    model = mean(model)
  ) %>%
  pivot_longer(c(data, model)) %>%
  ggplot(
    aes(x = year, y = value, color = name, linetype = name, shape = name)
  ) +
  geom_line() +
  geom_point() +
  scale_color_manual(values = c("#009E73", "gray40"), name = "") +
  scale_y_continuous(
    labels = scales::percent_format(accuracy = 1),
    breaks = seq(0.2, .6, .1),
    limits = c(0.18, 0.5)
  ) +
  labs(
    x = "",
    y = "",
    caption = "Data: NBER working paper metadata. Plot: Alex Albright.",
    # Markdown/HTML title; rendered by element_markdown() in theme() below.
    title = paste0(
      "<span style = 'font-size:22pt'>",
      "<span style = 'color:#009E73;'>Data</span>",
      " has overtaken ",
      "<span style = 'color:gray40;'>Models</span>",
      "</span><br>",
      " % of abstracts including the words ",
      "<span style = 'color:#009E73;'>'data'</span>",
      " or ",
      "<span style = 'color:gray40;'>'model'</span>"
    )
  ) +
  theme_minimal(base_family = "Palatino", base_size = 14) +
  theme(
    plot.title.position = "plot",
    # The colored title words stand in for the legend.
    legend.position = "none",
    plot.title = element_markdown()
  )
 21 failed to parse.
# Write the most recently displayed plot to disk (7 x 5 in at 250 dpi).
ggsave(filename = "graphs/modelvdata1.png", width = 7, height = 5, dpi = 250)