README.Rmd file (including the author field above).Link to Rpubs document: XXX
Take the datacamp course on joining data to learn the join verbs.
You will analyze some data in the nihexporter package to answer some questions. The questions will cover methods in dplyr including two-table verbs.
First you need to install the nihexporter package (and devtools, if you don’t have it already).
# install.packages('devtools')
devtools::install_github("jayhesselberth/nihexporter")
Read the Overview vignette to learn the structure of the pacakge. Take a look at two tables from the package:
library(nihexporter)
#> Loading required package: jsonlite
#> Loading required package: httr
#> Loading required package: dplyr
#>
#> Attaching package: 'dplyr'
#> The following objects are masked from 'package:stats':
#>
#> filter, lag
#> The following objects are masked from 'package:base':
#>
#> intersect, setdiff, setequal, union
library(tidyverse)
#> Loading tidyverse: ggplot2
#> Loading tidyverse: tibble
#> Loading tidyverse: tidyr
#> Loading tidyverse: readr
#> Loading tidyverse: purrr
#> Conflicts with tidy packages ----------------------------------------------
#> filter(): dplyr, stats
#> lag(): dplyr, stats
library(broom)
projects
#> # A tibble: 901,059 × 13
#> application.id institute activity application.type arra.funded
#> <int> <fctr> <fctr> <fctr> <fctr>
#> 1 6258248 CA C06 1 NA
#> 2 6033399 RR C06 1 NA
#> 3 6039178 RR C06 1 NA
#> 4 6258225 RR C06 1 NA
#> 5 6258259 RR C06 1 NA
#> 6 6258242 RR C06 1 NA
#> 7 6258249 RR C06 1 NA
#> 8 6258264 RR C06 1 NA
#> 9 6254919 RR C06 1 NA
#> 10 6254932 RR C06 1 NA
#> # ... with 901,049 more rows, and 8 more variables: project.num <chr>,
#> # foa.number <fctr>, fiscal.year <int>, project.start <date>,
#> # project.end <date>, study.section <fctr>, suffix <fctr>, fy.cost <dbl>
project_io
#> # A tibble: 224,899 × 4
#> project.num n.pubs n.patents total.cost
#> <chr> <int> <int> <dbl>
#> 1 C06CA091516 1 1 1488000
#> 2 C06RR013577 1 1 329812
#> 3 C06RR014469 3 1 1000000
#> 4 C06RR014488 1 1 1000000
#> 5 C06RR014489 17 1 1250000
#> 6 C06RR014510 1 1 977434
#> 7 C06RR014517 1 1 1032334
#> 8 C06RR014520 1 1 1000000
#> 9 C06RR014524 1 1 1999999
#> 10 C06RR014527 1 1 852265
#> # ... with 224,889 more rows
The projects and project_io tables have the project.num column in common. Use this column to perform a left_join and identify the most expensive project for each institute. Project institutes are in the projects table, whereas total project costs are available in the project_io table. Note you will have to use the arrange and slice verbs from dplyr.
dplyr::left_join(projects, project_io, by="project.num") %>%
select(project.num, institute, total.cost) %>%
group_by(institute) %>%
arrange(desc(total.cost)) %>%
slice(1:1) %>%
arrange(desc(total.cost)) -> answer
answer
Create a plot that describes the change in output (number of publications) for the most expensive grant from the above list. Note that you cannot use n.pubs for this, as that is an aggregate count over time. You need a tbl of following structure, which you can then use to calculate the number of publications each year (hint: use n() in a summarize statement:
library(cowplot)
#>
#> Attaching package: 'cowplot'
#> The following object is masked from 'package:ggplot2':
#>
#> ggsave
dplyr::left_join(projects, project_io, by="project.num") %>%
select(project.num, institute, total.cost) %>%
group_by(institute) %>%
arrange(desc(total.cost)) %>%
slice(1:1) %>%
arrange(desc(total.cost)) %>%
ungroup() %>%
slice(1:1) %>%
select(project.num) %>%
left_join(publinks, by ="project.num") %>%
left_join(publications, by ="pmid") %>%
select(pub.year, pmid) %>%
na.omit() -> final.table
# a line graph
final.table %>%
group_by(pub.year) %>%
summarise(total.pubs=n()) %>%
ggplot(aes(x=pub.year, y=total.pubs)) + geom_point() + geom_line() + ylim(0,25) + labs(x = "Year", y = "Number of publications") + theme_classic()
# a bar graph
final.table %>%
ggplot(aes(factor(pub.year))) + geom_bar() + theme_classic()
Use the projects and clinical_studies table to calculate how many clinical studies have been started by projects from each institute.
projects %>%
select(project.num, institute) %>%
inner_join(clinical_studies, by = "project.num") %>%
group_by(institute) %>%
summarise(total.trials = n_distinct(trial.id)) %>%
arrange(desc(total.trials))-> total.trials
total.trials
#> # A tibble: 25 × 2
#> institute total.trials
#> <fctr> <int>
#> 1 CA 5232
#> 2 HL 1751
#> 3 MH 1448
#> 4 DA 1157
#> 5 DK 915
#> 6 HD 756
#> 7 AT 559
#> 8 AG 519
#> 9 AI 419
#> 10 NS 378
#> # ... with 15 more rows
total.trials %>%
ggplot(aes(x=institute, y=total.trials)) + geom_point() + labs(title = "Total number of clinical trials per institute", x = "Institute", y = "Number of clinical trials") + theme_classic()
Calculate correlations between total.cost and n.pubs for each institute. You will need to join the projects and project_io tables to link institute with total.cost and n.pubs. You will then need to use group_by and do from dplyr and tidy from broom to calculate correlations.
projects %>%
left_join(project_io) %>%
filter(!duplicated(project.num), !is.na(n.pubs), !is.na(total.cost)) %>%
select(institute,project.num,fiscal.year, fy.cost, n.pubs, n.patents, total.cost) %>%
arrange(institute, project.num) -> tab
tab %>%
group_by(institute) %>%
do(tidy(cor(.$n.pubs, .$total.cost))) -> result
result
Correlate rcr values from the publications table to one of:
publinks %>%
left_join(publications, by ="pmid") %>%
left_join(projects, by ="project.num") %>%
select(institute, project.num, activity, pmid, rcr) %>%
na.omit() -> rcr.table
rcr.table
#> # A tibble: 24,604,181 × 5
#> institute project.num activity pmid rcr
#> <fctr> <chr> <fctr> <int> <dbl>
#> 1 RR C06RR014469 C06 17621683 0.927150
#> 2 RR C06RR014469 C06 24059281 1.221055
#> 3 RR C06RR014469 C06 24976670 0.470547
#> 4 RR C06RR014469 C06 25980177 0.405393
#> 5 RR C06RR014489 C06 16200579 1.990231
#> 6 RR C06RR014489 C06 16192645 3.040031
#> 7 RR C06RR014489 C06 16059633 1.032259
#> 8 RR C06RR014489 C06 15872085 2.603867
#> 9 RR C06RR014489 C06 16002251 1.017025
#> 10 RR C06RR014489 C06 16234975 7.211317
#> # ... with 24,604,171 more rows
rcr.table %>%
group_by(activity) %>%
summarise(n.pub = n(), median.rcr = median(rcr), mean.rcr = mean(rcr), min.rcr = min(rcr), max.rcr=max(rcr)) %>%
rename(grant = activity) %>%
arrange(desc(n.pub))
#> # A tibble: 154 × 6
#> grant n.pub median.rcr mean.rcr min.rcr max.rcr
#> <fctr> <int> <dbl> <dbl> <dbl> <dbl>
#> 1 R01 9572001 1.147641 2.017117 0 1330.4491
#> 2 P30 4119272 1.073715 1.969296 0 235.3096
#> 3 T32 1646600 1.091107 1.899937 0 253.7495
#> 4 P01 1495422 1.235473 2.224052 0 398.3514
#> 5 U01 928790 1.356669 2.624244 0 309.5266
#> 6 P50 873442 1.347864 2.553361 0 168.4233
#> 7 R37 612591 1.320472 2.450805 0 204.9630
#> 8 ZIA 529732 1.195404 2.276984 0 276.3978
#> 9 U10 519934 1.416343 3.073635 0 197.1685
#> 10 M01 364845 1.271357 2.257545 0 219.5835
#> # ... with 144 more rows
Use gganimate to illustrate the change in cost of R01s for a few institutes over time.
# devtools::install_github("dgrtwo/gganimate")
library(gganimate)
projects %>%
filter(activity == "R01", institute == c("RR", "TW")) %>%
select(institute, project.num, activity, fiscal.year, project.start, project.end, fy.cost) %>%
left_join(project_io, by ="project.num") %>%
na.omit() %>%
arrange(institute, fiscal.year) -> tmp
#> Warning in is.na(e1) | is.na(e2): longer object length is not a multiple of
#> shorter object length
#> Warning in `==.default`(structure(c(6L, 23L, 23L, 23L, 23L, 23L, 23L,
#> 23L, : longer object length is not a multiple of shorter object length
p <- ggplot(tmp, aes(x= total.cost, y= fy.cost, size = n.pubs, color = institute, frame = fiscal.year)) +
geom_point() +
scale_x_log10()
gganimate(p, "animated.gif")
#> I cannot find ImageMagick with convert = 'convert'
#> Warning in animation::im.convert(filenames[-1], basename(filename),
#> extra.opts = opts, : Please install ImageMagick first or put its bin path
#> into the system PATH variable
Above is the final answer