Submission

  1. Fork this repository to your own account
  2. Make changes to the README.Rmd file (including the author field above).
  3. Knit the file to HTML format
  4. Publish HTML on Rpubs (you will need to sign up for the free account) and add the link below.
  5. Submit a PR with your answers.

Link to Rpubs document: XXX

Overview

Take the datacamp course on joining data to learn the join verbs.

You will analyze some data in the nihexporter package to answer some questions. The questions will cover methods in dplyr including two-table verbs.

First you need to install the nihexporter package (and devtools, if you don’t have it already).

# install.packages('devtools')
devtools::install_github("jayhesselberth/nihexporter")

Read the Overview vignette to learn the structure of the pacakge. Take a look at two tables from the package:

library(nihexporter)
#> Loading required package: jsonlite
#> Loading required package: httr
#> Loading required package: dplyr
#> 
#> Attaching package: 'dplyr'
#> The following objects are masked from 'package:stats':
#> 
#>     filter, lag
#> The following objects are masked from 'package:base':
#> 
#>     intersect, setdiff, setequal, union
library(tidyverse)
#> Loading tidyverse: ggplot2
#> Loading tidyverse: tibble
#> Loading tidyverse: tidyr
#> Loading tidyverse: readr
#> Loading tidyverse: purrr
#> Conflicts with tidy packages ----------------------------------------------
#> filter(): dplyr, stats
#> lag():    dplyr, stats
library(broom)

projects
#> # A tibble: 901,059 × 13
#>    application.id institute activity application.type arra.funded
#>             <int>    <fctr>   <fctr>           <fctr>      <fctr>
#> 1         6258248        CA      C06                1          NA
#> 2         6033399        RR      C06                1          NA
#> 3         6039178        RR      C06                1          NA
#> 4         6258225        RR      C06                1          NA
#> 5         6258259        RR      C06                1          NA
#> 6         6258242        RR      C06                1          NA
#> 7         6258249        RR      C06                1          NA
#> 8         6258264        RR      C06                1          NA
#> 9         6254919        RR      C06                1          NA
#> 10        6254932        RR      C06                1          NA
#> # ... with 901,049 more rows, and 8 more variables: project.num <chr>,
#> #   foa.number <fctr>, fiscal.year <int>, project.start <date>,
#> #   project.end <date>, study.section <fctr>, suffix <fctr>, fy.cost <dbl>
project_io
#> # A tibble: 224,899 × 4
#>    project.num n.pubs n.patents total.cost
#>          <chr>  <int>     <int>      <dbl>
#> 1  C06CA091516      1         1    1488000
#> 2  C06RR013577      1         1     329812
#> 3  C06RR014469      3         1    1000000
#> 4  C06RR014488      1         1    1000000
#> 5  C06RR014489     17         1    1250000
#> 6  C06RR014510      1         1     977434
#> 7  C06RR014517      1         1    1032334
#> 8  C06RR014520      1         1    1000000
#> 9  C06RR014524      1         1    1999999
#> 10 C06RR014527      1         1     852265
#> # ... with 224,889 more rows

Problems

Problem 1

The projects and project_io tables have the project.num column in common. Use this column to perform a left_join and identify the most expensive project for each institute. Project institutes are in the projects table, whereas total project costs are available in the project_io table. Note you will have to use the arrange and slice verbs from dplyr.

dplyr::left_join(projects, project_io, by="project.num") %>% 
  select(project.num, institute, total.cost) %>% 
  group_by(institute) %>% 
  arrange(desc(total.cost)) %>% 
  slice(1:1) %>%
  arrange(desc(total.cost)) -> answer

answer

Problem 2

Create a plot that describes the change in output (number of publications) for the most expensive grant from the above list. Note that you cannot use n.pubs for this, as that is an aggregate count over time. You need a tbl of following structure, which you can then use to calculate the number of publications each year (hint: use n() in a summarize statement:

library(cowplot)
#> 
#> Attaching package: 'cowplot'
#> The following object is masked from 'package:ggplot2':
#> 
#>     ggsave
dplyr::left_join(projects, project_io, by="project.num") %>% 
  select(project.num, institute, total.cost) %>% 
  group_by(institute) %>% 
  arrange(desc(total.cost)) %>% 
  slice(1:1) %>%
  arrange(desc(total.cost)) %>%
  ungroup() %>%
  slice(1:1) %>%
  select(project.num) %>%
  left_join(publinks, by ="project.num") %>%
  left_join(publications, by ="pmid") %>%
  select(pub.year, pmid) %>%
  na.omit() -> final.table

# a line graph
final.table %>%
  group_by(pub.year) %>%
  summarise(total.pubs=n()) %>%
  ggplot(aes(x=pub.year, y=total.pubs)) + geom_point() + geom_line() + ylim(0,25) + labs(x = "Year", y = "Number of publications") + theme_classic()

# a bar graph
final.table %>%
  ggplot(aes(factor(pub.year))) + geom_bar() + theme_classic()

Problem 3

Use the projects and clinical_studies table to calculate how many clinical studies have been started by projects from each institute.

projects %>% 
  select(project.num, institute) %>% 
  inner_join(clinical_studies, by = "project.num") %>% 
  group_by(institute) %>% 
  summarise(total.trials = n_distinct(trial.id)) %>% 
  arrange(desc(total.trials))-> total.trials

total.trials
#> # A tibble: 25 × 2
#>    institute total.trials
#>       <fctr>        <int>
#> 1         CA         5232
#> 2         HL         1751
#> 3         MH         1448
#> 4         DA         1157
#> 5         DK          915
#> 6         HD          756
#> 7         AT          559
#> 8         AG          519
#> 9         AI          419
#> 10        NS          378
#> # ... with 15 more rows
total.trials %>% 
  ggplot(aes(x=institute, y=total.trials)) + geom_point() + labs(title = "Total number of clinical trials per institute", x = "Institute", y = "Number of clinical trials") + theme_classic()

Problem 4

Calculate correlations between total.cost and n.pubs for each institute. You will need to join the projects and project_io tables to link institute with total.cost and n.pubs. You will then need to use group_by and do from dplyr and tidy from broom to calculate correlations.

projects %>% 
  left_join(project_io) %>% 
  filter(!duplicated(project.num), !is.na(n.pubs), !is.na(total.cost)) %>%
  select(institute,project.num,fiscal.year, fy.cost, n.pubs, n.patents, total.cost) %>% 
  arrange(institute, project.num) -> tab


tab %>%
  group_by(institute) %>%
  do(tidy(cor(.$n.pubs, .$total.cost))) -> result

result

Problem 5 (extra credit)

Correlate rcr values from the publications table to one of:

  • total cost (overall for NIH, or by institute)
  • budget mechanism (e.g., RFA, PAR, etc.)
  • type of grant (e.g. R01, P01, R21, etc.).
publinks %>% 
  left_join(publications, by ="pmid") %>% 
  left_join(projects, by ="project.num") %>% 
  select(institute, project.num, activity, pmid, rcr) %>% 
  na.omit() -> rcr.table

rcr.table
#> # A tibble: 24,604,181 × 5
#>    institute project.num activity     pmid      rcr
#>       <fctr>       <chr>   <fctr>    <int>    <dbl>
#> 1         RR C06RR014469      C06 17621683 0.927150
#> 2         RR C06RR014469      C06 24059281 1.221055
#> 3         RR C06RR014469      C06 24976670 0.470547
#> 4         RR C06RR014469      C06 25980177 0.405393
#> 5         RR C06RR014489      C06 16200579 1.990231
#> 6         RR C06RR014489      C06 16192645 3.040031
#> 7         RR C06RR014489      C06 16059633 1.032259
#> 8         RR C06RR014489      C06 15872085 2.603867
#> 9         RR C06RR014489      C06 16002251 1.017025
#> 10        RR C06RR014489      C06 16234975 7.211317
#> # ... with 24,604,171 more rows
rcr.table %>% 
  group_by(activity) %>% 
  summarise(n.pub = n(), median.rcr = median(rcr), mean.rcr = mean(rcr), min.rcr = min(rcr), max.rcr=max(rcr)) %>% 
  rename(grant = activity) %>% 
  arrange(desc(n.pub))
#> # A tibble: 154 × 6
#>     grant   n.pub median.rcr mean.rcr min.rcr   max.rcr
#>    <fctr>   <int>      <dbl>    <dbl>   <dbl>     <dbl>
#> 1     R01 9572001   1.147641 2.017117       0 1330.4491
#> 2     P30 4119272   1.073715 1.969296       0  235.3096
#> 3     T32 1646600   1.091107 1.899937       0  253.7495
#> 4     P01 1495422   1.235473 2.224052       0  398.3514
#> 5     U01  928790   1.356669 2.624244       0  309.5266
#> 6     P50  873442   1.347864 2.553361       0  168.4233
#> 7     R37  612591   1.320472 2.450805       0  204.9630
#> 8     ZIA  529732   1.195404 2.276984       0  276.3978
#> 9     U10  519934   1.416343 3.073635       0  197.1685
#> 10    M01  364845   1.271357 2.257545       0  219.5835
#> # ... with 144 more rows

Problem 6 (extra extra credit)

Use gganimate to illustrate the change in cost of R01s for a few institutes over time.

# devtools::install_github("dgrtwo/gganimate")
library(gganimate)
projects %>% 
  filter(activity == "R01", institute == c("RR", "TW")) %>%
  select(institute, project.num, activity, fiscal.year, project.start, project.end, fy.cost) %>% 
  left_join(project_io, by ="project.num") %>%
  na.omit() %>%
  arrange(institute, fiscal.year) -> tmp 
#> Warning in is.na(e1) | is.na(e2): longer object length is not a multiple of
#> shorter object length
#> Warning in `==.default`(structure(c(6L, 23L, 23L, 23L, 23L, 23L, 23L,
#> 23L, : longer object length is not a multiple of shorter object length
p <- ggplot(tmp, aes(x= total.cost, y= fy.cost, size = n.pubs, color = institute, frame = fiscal.year)) +
  geom_point() +
  scale_x_log10()

gganimate(p, "animated.gif")
#> I cannot find ImageMagick with convert = 'convert'
#> Warning in animation::im.convert(filenames[-1], basename(filename),
#> extra.opts = opts, : Please install ImageMagick first or put its bin path
#> into the system PATH variable
Above is the final answer

Above is the final answer