This project was part of an Intro to Data Science assignment where each week we’d get a dataset from the TidyTuesday github and create visualizations from it.
The data this week comes from the European Medicines Agency via Miquel Anglada Girotto on GitHub. We used the source table of all EPARs for human and veterinary medicines, rather than Miquel’s scraped data.
Miquel wrote about his exploration of the data.
More details including the codebook is available here
Has the EMA’s orphan designation (est. 2000) worked to increase incentive for pharmaceutical companies to develop drugs for rare diseases?
table(drugs$orphan_medicine) # num orphan drugs (true) vs non orphan drugs (false)
##
## FALSE TRUE
## 1826 162
dim(drugs) # dataset dimentions
## [1] 1988 28
drugs <- drugs %>%
mutate(year = year(marketing_authorisation_date)) %>% # extract the year from the date variable
filter(category == "human") # remove animal drugs
orphan <- drugs %>%
drop_na(marketing_authorisation_date) %>% # ignore drugs that haven't been approved
select(orphan_medicine, year) # only include orphan drug status in year
# colors
myColors <- brewer.pal(3,"Paired")
names(myColors) <- levels(orphan$orphan_medicine)
fillScale <- scale_fill_manual(name = "Orphan Designation", values = myColors)
ggplot(orphan, aes(x = year, fill = orphan_medicine)) +
geom_bar() + # bar graph, year vs orphan designation (color stack)
labs(x = "Marketing Authorization Date", title = "Drugs with Orphan Designation Status Over Time") + # x/y labels
fillScale + # colors
theme_bw()
Are drugs with orphan designation more likely to be authorised under exceptional circumstance?
# therapeutic area variable includes multiple values for each observation, separated by semicolons
drugs %>% select(medicine_name, therapeutic_area) %>% head(5)
## # A tibble: 5 × 2
## medicine_name therapeutic_area
## <chr> <chr>
## 1 Adcetris Lymphoma, Non-Hodgkin; Hodgkin Disease
## 2 Nityr Tyrosinemias
## 3 Ebvallo Lymphoproliferative Disorders
## 4 Ronapreve COVID-19 virus infection
## 5 Cosentyx Arthritis, Psoriatic; Psoriasis; Spondylitis, Ankylosing
therareas <- drugs %>%
cSplit('therapeutic_area', sep="; ", type.convert=FALSE) # separate each therapeutic area value into new columns (therapeutic_area_1, therapeutic_area_2, etc for each observation)
therareas %>% select(medicine_name, therapeutic_area_01, therapeutic_area_02) %>% head(5)
## medicine_name therapeutic_area_01 therapeutic_area_02
## 1: Adcetris Lymphoma, Non-Hodgkin Hodgkin Disease
## 2: Nityr Tyrosinemias <NA>
## 3: Ebvallo Lymphoproliferative Disorders <NA>
## 4: Ronapreve COVID-19 virus infection <NA>
## 5: Cosentyx Arthritis, Psoriatic Psoriasis
columns <- c(colnames(therareas[,(ncol(therareas)-16+1):ncol(therareas)])) # extract column names for each therapeutic area number
therareas <- therareas %>%
pivot_longer(cols = columns, values_to = "therapeutic_area", names_to = NULL, values_drop_na = TRUE) # pivot longer to have one therapeutic_area column with multiple observations for each drug depending on num of therapeutic areas
therareas %>% select(medicine_name, therapeutic_area) %>% head(5)
## # A tibble: 5 × 2
## medicine_name therapeutic_area
## <chr> <chr>
## 1 Adcetris Lymphoma, Non-Hodgkin
## 2 Adcetris Hodgkin Disease
## 3 Nityr Tyrosinemias
## 4 Ebvallo Lymphoproliferative Disorders
## 5 Ronapreve COVID-19 virus infection
exceptional <- therareas %>% # count for num drugs that are both orphan drugs and approved under exceptional circumstances, drugs that are one or the other, and drugs that are neither
group_by(orphan_medicine,exceptional_circumstances) %>%
count()
exceptional # view table
## # A tibble: 4 × 3
## # Groups: orphan_medicine, exceptional_circumstances [4]
## orphan_medicine exceptional_circumstances n
## <lgl> <lgl> <int>
## 1 FALSE FALSE 2695
## 2 FALSE TRUE 38
## 3 TRUE FALSE 186
## 4 TRUE TRUE 23
# colors
myColors <- brewer.pal(4,"Paired")
names(myColors) <- levels(exceptional$exceptional_circumstances)
fillScale <- scale_fill_manual(name = "Orphan Designation", values = myColors)
# proportional bar graph of drugs authorized under exceptional circumstances vs non exceptional circumstances
prop <- ggplot(exceptional, aes(x = exceptional_circumstances, y = n, fill = orphan_medicine)) +
geom_col(position = "fill") +
labs(x = "Exceptional Circumstances", y = "Proportion of Drugs", title = "The Proportion of Orphan Drugs \nAuthorized Under Exceptional \nCircumstances") +
fillScale +
theme_bw() +
theme(legend.position = "none")
# bar graph of drugs authorized under exceptional circumstances vs non exceptional circumstances
sbs <- ggplot(exceptional, aes(x = exceptional_circumstances, y = n, fill = orphan_medicine)) +
geom_col(position = "dodge") +
labs(x = "Exceptional Circumstances", y = "Number of Drugs", title = "The Number of Orphan Drugs \nAuthorized Under Exceptional \nCircumstances") +
fillScale +
theme_bw() +
theme(legend.position = "none")
(legend extraction code source)
# save legend (to place below both plots)
leg <- ggplot(exceptional, aes(x = exceptional_circumstances, y = n, fill = orphan_medicine)) +
geom_col(position = "dodge") +
labs(x = "Exceptional Circumstances", y = "Number of Drugs", title = "The Number of Orphan Drugs \nAuthorized Under Exceptional \nCircumstances") +
fillScale +
theme(legend.position = "bottom") +
theme_bw()
# extract legend
extract_legend <- function(my_ggp) {
step1 <- ggplot_gtable(ggplot_build(my_ggp))
step2 <- which(sapply(step1$grobs, function(x) x$name) == "guide-box")
step3 <- step1$grobs[[step2]]
return(step3)
}
shared_legend <- extract_legend(leg)
# place plots and legend
grid.arrange(arrangeGrob(prop, sbs, ncol = 2),
shared_legend, nrow = 2, heights = c(5, 1))
What are the current top therapeutic areas pharmaceutical companies make drugs for and how has development in these areas changed over time?
# using same therareas dataset (separated therapeutic areas) from above
therareas_count <- therareas %>%
drop_na(year) %>% # remove any drugs that haven't been approved
group_by(therapeutic_area) %>%
summarise(num_drugs = n()) %>% # count num drugs in each therapeutic area (drugs may appear multiple times, once for each therapeutic area it covers)
arrange(desc(num_drugs)) %>% # arrange in descending order (most drugs to least)
head(9) # top 9 therapeutic areas all time
therareas_year_count <- therareas %>%
drop_na(year) %>% # remove any drugs that haven't been approved
group_by(therapeutic_area, year) %>%
summarise(num_drugs = n()) %>% # count num drugs in each therapeutic area each year
filter(therapeutic_area %in% therareas_count$therapeutic_area)
order <- c(therareas_count$therapeutic_area) # order of plots, most drugs in a therapeutic area to least
therareas_year_count <- therareas_year_count %>%
mutate(therapeutic_area = as.factor(therapeutic_area))
therareas_year_count <- therareas_year_count %>%
mutate(therapeutic_area = fct_relevel(therapeutic_area, order)) # reorder to order list
# colors
myColors <- brewer.pal(9,"Paired")
names(myColors) <- levels(therareas_year_count$therapeutic_area)
colorScale <- scale_color_manual(name = "Therapeutic Area", values = myColors)
ggplot(therareas_year_count, aes(x = year, y = num_drugs, color = therapeutic_area)) +
geom_line(linetype = "dashed") +
geom_point() +
facet_wrap(~therapeutic_area) +
labs(y = "Number of Approved Drugs", x = "Year", title = "Drug Approvals for Top 9 Therapeutic Areas Over Time") +
theme_bw() +
theme(legend.position = "none") +
colorScale