TidyTuesday week 40: NBER papers, data from the National Bureau of Economic Research NBER by way of the nberwp package by Ben Davies.
library(tidyverse)
library(glue)
library(gt)
library(gtExtras)
library(gender)
library(hrbrthemes)
library(ggtext)
library(patchwork)
papers <- readr::read_csv('https://raw.githubusercontent.com/rfordatascience/tidytuesday/master/data/2021/2021-09-28/papers.csv')
Rows: 29434 Columns: 4
── Column specification ─────────────────────────────────────────────────────────────────────────────────────────
Delimiter: ","
chr (2): paper, title
dbl (2): year, month
ℹ Use `spec()` to retrieve the full column specification for this data.
ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
authors <- readr::read_csv('https://raw.githubusercontent.com/rfordatascience/tidytuesday/master/data/2021/2021-09-28/authors.csv')
Rows: 15437 Columns: 4
── Column specification ─────────────────────────────────────────────────────────────────────────────────────────
Delimiter: ","
chr (4): author, name, user_nber, user_repec
ℹ Use `spec()` to retrieve the full column specification for this data.
ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
programs <- readr::read_csv('https://raw.githubusercontent.com/rfordatascience/tidytuesday/master/data/2021/2021-09-28/programs.csv')
Rows: 21 Columns: 3
── Column specification ─────────────────────────────────────────────────────────────────────────────────────────
Delimiter: ","
chr (3): program, program_desc, program_category
ℹ Use `spec()` to retrieve the full column specification for this data.
ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
paper_authors <- readr::read_csv('https://raw.githubusercontent.com/rfordatascience/tidytuesday/master/data/2021/2021-09-28/paper_authors.csv')
Rows: 67090 Columns: 2
── Column specification ─────────────────────────────────────────────────────────────────────────────────────────
Delimiter: ","
chr (2): paper, author
ℹ Use `spec()` to retrieve the full column specification for this data.
ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
paper_programs <- readr::read_csv('https://raw.githubusercontent.com/rfordatascience/tidytuesday/master/data/2021/2021-09-28/paper_programs.csv')
Rows: 53996 Columns: 2
── Column specification ─────────────────────────────────────────────────────────────────────────────────────────
Delimiter: ","
chr (2): paper, program
ℹ Use `spec()` to retrieve the full column specification for this data.
ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
joined_df <- left_join(papers, paper_authors) %>%
left_join(authors) %>%
left_join(paper_programs) %>%
left_join(programs)%>%
mutate(
catalogue_group = str_sub(paper, 1, 1),
catalogue_group = case_when(
catalogue_group == "h" ~ "Historical",
catalogue_group == "t" ~ "Technical",
catalogue_group == "w" ~ "General"
),
.after = paper
)
Joining, by = "paper"
Joining, by = "author"
Joining, by = "paper"
Joining, by = "program"
wp = joined_df %>% group_by(program, program_desc, year) %>%
summarise(n=n_distinct(paper)) %>%
arrange(year) %>%
mutate(decade=case_when(between(year,1980,1989)~"1980s",
between(year,1990,1999)~"1990s",
between(year,2000,2009)~"2000s",
between(year,2010,2019)~"2010s"
)) %>%
drop_na()
`summarise()` has grouped output by 'program', 'program_desc'. You can override using the `.groups` argument.
wp2 = wp %>% mutate(program=glue::glue("{program_desc} ({program})")) %>%
group_by(program) %>% mutate(total=sum(n)) %>%
arrange(year, program) %>%
mutate(spark=list(n)) %>%
select(program, spark, total) %>%
distinct()
wp3 = wp %>% mutate(program=glue::glue("{program_desc} ({program})")) %>%
group_by(program, decade) %>% tally(n) %>%
ungroup() %>%
pivot_wider(names_from = decade, values_from=n) %>%
mutate_if(is.numeric, list(~replace_na(., 0)))
wp3 %>% inner_join(wp2, by="program") %>%
select(Program=program, Total=total, "1980s","1990s","2000s","2010s",Trend=spark) %>%
arrange(desc(Total)) %>%
gt() %>%
gt_theme_espn() %>%
cols_align(Program, align="left") %>%
gt_plt_dot(Total, Program,palette = "rcartocolor::ag_GrnYl", max_value=5246) %>%
gtExtras::gt_sparkline(Trend) %>%
tab_options(table.font.size = 12.5,
heading.subtitle.font.size = 14) %>%
gt_color_box(`1980s`, domain=2:786) %>%
gt_color_box(`1990s`, domain=2:797) %>%
gt_color_box(`2000s`, domain=132:1647) %>%
gt_color_box(`2010s`, domain=200:2424) %>%
tab_header(title="NBER Papers", subtitle="Working papers count by program and decade") %>%
tab_source_note(source_note="TidyTuesday Week 40 | Data source: National Bureau of Economic Research (NBER) by way of the nberwp package by Ben Davies")
| NBER Papers | ||||||
|---|---|---|---|---|---|---|
| Working papers count by program and decade | ||||||
| Program | Total | 1980s | 1990s | 2000s | 2010s | Trend |
Labor Studies (LS)
|
5246 | 489.0
|
797.0
|
1536.0
|
2424.0
|
|
Public Economics (PE)
|
5216 | 506.0
|
764.0
|
1561.0
|
2385.0
|
|
Economic Fluctuations and Growth (EFG)
|
5200 | 562.0
|
680.0
|
1647.0
|
2311.0
|
|
International Finance and Macroeconomics (IFM)
|
3820 | 786.0
|
699.0
|
1145.0
|
1190.0
|
|
International Trade and Investment (ITI)
|
3297 | 782.0
|
725.0
|
911.0
|
879.0
|
|
Monetary Economics (ME)
|
3020 | 509.0
|
504.0
|
782.0
|
1225.0
|
|
Asset Pricing (AP)
|
2426 | 0.0
|
307.0
|
927.0
|
1192.0
|
|
Productivity, Innovation, and Entrepreneurship (PR)
|
2182 | 111.0
|
290.0
|
636.0
|
1145.0
|
|
Health Economics (HE)
|
2039 | 87.0
|
143.0
|
620.0
|
1189.0
|
|
Corporate Finance (CF)
|
2034 | 2.0
|
175.0
|
718.0
|
1139.0
|
|
Development of the American Economy (DAE)
|
1532 | 59.0
|
230.0
|
540.0
|
703.0
|
|
Industrial Organization (IO)
|
1530 | 0.0
|
106.0
|
489.0
|
935.0
|
|
Children (CH)
|
1510 | 2.0
|
60.0
|
565.0
|
883.0
|
|
Economics of Aging (AG)
|
1490 | 43.0
|
221.0
|
495.0
|
731.0
|
|
Health Care (HC)
|
1464 | 0.0
|
153.0
|
479.0
|
832.0
|
|
Economics of Education (ED)
|
1417 | 0.0
|
2.0
|
436.0
|
979.0
|
|
Political Economics (POL)
|
1120 | 0.0
|
0.0
|
260.0
|
860.0
|
|
Environment and Energy Economics (EEE)
|
1096 | 2.0
|
13.0
|
256.0
|
825.0
|
|
Law and Economics (LE)
|
1020 | 20.0
|
72.0
|
353.0
|
575.0
|
|
Development Economics (DEV)
|
999 | 0.0
|
0.0
|
0.0
|
999.0
|
|
Technical Working Papers (TWP)
|
335 | 0.0
|
3.0
|
132.0
|
200.0
|
|
| TidyTuesday Week 40 | Data source: National Bureau of Economic Research (NBER) by way of the nberwp package by Ben Davies | ||||||
ALT text: The table showing the count of NBER papers, by program and decade, from 1980 to 2019, where Labor studies program have the highest total working paper (n=5246) in the time period and Technical Working papers have the lowest total (n=335). The table includes a spark line showing the yearly counts of working paper, where all programs have higher counts of working papers in recent years, except for International Trade and Investment program.
gender_df = joined_df %>% separate(name,c("given",NA))
Expected 2 pieces. Additional pieces discarded in 66921 rows [2, 3, 4, 5, 7, 8, 9, 10, 11, 12, 15, 17, 18, 19, 21, 22, 24, 25, 26, 27, ...].Expected 2 pieces. Missing pieces filled with `NA` in 1 rows [98409].
name<-gender(gender_df$given, years = c(2020-60,2020-30))
gender_df<-gender_df %>%
left_join(name,by= c("given"="name"))
gender_tab = gender_df %>%
select(program_category, program_desc, gender) %>%
drop_na() %>%
count(program_category, program_desc, gender)
gt2 = gender_tab %>%
group_by(gender) %>%
mutate(prop=n/sum(n)) %>%
mutate(rank = rank(prop, ties.method = "random"))
gt2 %>%
ggplot(aes(x=str_to_title(gender), y=rank, color=program_category)) +
geom_point(size=5) +
geom_line(aes(group=program_desc), alpha=.5) +
scale_color_manual(values=c("#219ebc","#e85d04","#023047")) +
geom_text(data=gt2 %>% filter(gender=="female"), aes(label=program_desc), size=2.5, hjust=1, nudge_x = -.07,family="Arial Narrow") +
geom_text(data=gt2 %>% filter(gender=="male"), aes(label=program_desc), size=2.5, hjust=0, nudge_x = .07, family="Arial Narrow") +
geom_text(data=gt2 %>% filter(gender=="female"), aes(label=round(prop*100,1)),
size=2.2, color="white",fontface="bold",family="Arial Narrow") +
geom_text(data=gt2 %>% filter(gender=="male"), aes(label=round(prop*100,1)),
size=2.2, color="white",fontface="bold",family="Arial Narrow") +
scale_x_discrete(position="top", expand=c(0.6,0.6)) +
theme_minimal(base_size = 10, base_family = "Arial Narrow") +
guides(color=guide_legend(reverse=T)) +
theme(legend.margin=margin(t=-5),
axis.title=element_blank(),
panel.grid = element_blank(),
axis.text.y=element_blank(),
axis.text.x.top=element_text(size=10, margin=margin(b=-2), color="black"),
legend.text=element_text(size=8),
legend.title=element_text(size=8),
plot.margin = unit(c(.5, .5, .5, .5), "cm"),
plot.title=element_text(margin=margin(b=13))) +
labs(color="Program Category",
title="NBER Papers: Percentage of programs by gender")