USA drug sales data extraction

## Settings for RMarkdown http://yihui.name/knitr/options#chunk_options
opts_chunk$set(comment = "", warning = FALSE, message = FALSE, echo = TRUE, 
    tidy = FALSE, fig.width = 8, fig.height = 7)
options(width = 116, scipen = 10)

Reference

Code to extract and process data from web (run only once)

## Provide URLs
url2003.2009 <- paste("http://www.drugs.com/top200_", 2003:2009, ".html", sep = "")
url2010      <- "http://www.drugs.com/top200.html"
url2011.2012 <- paste("http://www.drugs.com/stats/top100/", 2011:2012, "/sales", sep = "")


## Load XML package
library(XML)

## Import 2003-2010 data
drugs2003.2010 <-
    lapply(c(url2003.2009, url2010),
           function(url) {
               tab.imported <- readHTMLTable(url, header = FALSE, skip.rows = 1)[[2]][-201,1:4]

               names(tab.imported) <- c("rank","name","company","sales")

               tab.imported <- within(tab.imported, {
                   rank <- seq_along(rank)
                   sales <- as.numeric(gsub(",", "", sales)) * 1000
               })
               tab.imported
           })


## Import 2011-2012 data (different format)
drugs2011.2012 <-
    lapply(url2011.2012,
           function(url) {
               tab.imported <- readHTMLTable(url)[[2]][,1:3]

               names(tab.imported) <- c("rank", "name", "sales")

               tab.imported <- within(tab.imported, {
                   rank <- seq_along(rank)
                   sales <- as.numeric(gsub(",", "", sales)) * 1000
               })
           })


## Extract company names from drug.2003.2010 dataset
companies <- unlist(sapply(drugs2003.2010, "[", "company"))
names(companies) <- NULL
companies <- sort(unique(companies))
companies <- companies[!companies == ""]

## Other company names found in drug.2011.2012 dataset
other.companies <- c("Otsuka Pharmaceutical Co.", "Generic Drug", "Vertex Pharmaceuticals", "MedImmune, Inc", "Sunovion Pharmaceuticals Inc.")

## Removal of company names from drugs2011.2012 dataset (drug name and company name mixed)
replace.pattern <- paste(companies, other.companies, collapse = "|", sep = "|")

drugs2011.2012 <-
    lapply(drugs2011.2012,
           function(y) {
               y$name <- gsub(replace.pattern, "", y$name)
               y
           })

## Save data for backup
save(list = c("drugs2003.2010","drugs2011.2012"), file = "~/Documents/statistics/Rmedstats/drugs.com.2012.RData")

Load saved data to avoid multiple http access

## Load data
load(file = "~/Documents/statistics/Rmedstats/drugs.com.2012.RData")

## Create data frames
df.drugs2003.2010 <- do.call(rbind, drugs2003.2010)
df.drugs2011.2012 <- do.call(rbind, drugs2011.2012)

## Add year variable
df.drugs2003.2010$year <- rep(2003:2010, rep(200, 8))
df.drugs2011.2012$year <- rep(2011:2012, rep(100, 2))

## Create drug-company table
drug.company.table <- unique(df.drugs2003.2010[,c("name","company")])

## Merge company name to 2011-2012 dataset
df.drugs2011.2012 <-
    merge(x = df.drugs2011.2012,
          y = drug.company.table,
          all.x = TRUE)

## All datasets combined as
df.drugs2003.2012 <- rbind(df.drugs2003.2010[,c("year","rank","name","company","sales")],
                           df.drugs2011.2012
                           )

## Order by year and ranking
library(doBy)
df.drugs2003.2012 <- orderBy(data = df.drugs2003.2012, ~ +year +rank)

Extract data necessary

## Extract drugs that entered top 20 at some point
ever.in.top.group      <- subset(df.drugs2003.2012, rank %in% 1:20)$name
data.ever.in.top.group <- subset(df.drugs2003.2012, name %in% ever.in.top.group)

## Give 1 if sales is NA
data.ever.in.top.group[is.na(data.ever.in.top.group$sales), "sales"] <- 1

## Extract data in 2012 for labeling
last.year.top.group <- subset(data.ever.in.top.group, year == 2012)

Plotting with ggplot2

## ggplot2
library(ggplot2)

## set data as subset that include only drugs that ever entered the top group
gg.data <- ggplot(data.ever.in.top.group)

gg.data +
    opts(legend.position = "none") +
    scale_y_reverse(limit = c(30,1), breaks = c(1,10,20,30)) +
    scale_x_continuous(limit = c(2003, 2014)) +
    scale_color_discrete(h = c(0, 120) + 15, c = 100, l = 60, h.start = 0, direction = -1) +
    geom_line(aes(x = year, y = rank, color = name), lwd = 1) +
    geom_text(aes(x = 2013, y = rank, color = name, label = name), data = last.year.top.group) +
    geom_point(aes(x = year, y = rank, color = name, size = sales))

plot of chunk unnamed-chunk-5

Data as table: This is probably the best way of presenting these data.

library(reshape2)
drug.table <- dcast(data = df.drugs2003.2012, rank ~ year, value.var = "name")[1:20,]
drug.table
   rank          2003          2004          2005          2006          2007          2008          2009
1     1       Lipitor       Lipitor       Lipitor       Lipitor       Lipitor       Lipitor       Lipitor
2     2      Prevacid      Prevacid        Nexium        Nexium        Nexium        Nexium        Nexium
3     3         Zocor         Zocor      Prevacid      Prevacid Advair Diskus        Plavix        Plavix
4     4        Nexium        Nexium         Zocor Advair Diskus      Prevacid Advair Diskus Advair Diskus
5     5        Zoloft        Zoloft Advair Diskus     Singulair        Plavix      Prevacid      Seroquel
6     6      Celebrex Advair Diskus        Zoloft    Effexor XR     Singulair      Seroquel       Abilify
7     7       Zyprexa    Effexor XR        Plavix        Plavix      Seroquel     Singulair     Singulair
8     8     Neurontin        Plavix    Effexor XR         Zocor    Effexor XR    Effexor XR     OxyContin
9     9    Effexor XR      Celebrex     Singulair       Norvasc       Lexapro     OxyContin         Actos
10   10 Advair Diskus     Neurontin       Norvasc       Lexapro         Actos         Actos      Prevacid
11   11     OxyContin      Protonix      Protonix      Seroquel      Protonix       Lexapro      Cymbalta
12   12       Norvasc       Norvasc        Ambien      Protonix       Vytorin       Abilify    Effexor XR
13   13 Wellbutrin SR       Zyprexa       Lexapro        Ambien       Topamax       Topamax       Lexapro
14   14        Plavix     Singulair      Seroquel         Actos     Risperdal      Cymbalta       Crestor
15   15     Pravachol        Ambien         Actos        Zoloft       Abilify       Zyprexa       Zyprexa
16   16         Vioxx     OxyContin       Zyprexa Wellbutrin XL      Cymbalta       Valtrex       Valtrex
17   17     Singulair       Lexapro       Fosamax       Avandia      Lamictal       Crestor        Flomax
18   18      Protonix     Pravachol     Risperdal     Risperdal       Zyprexa       Vytorin        Lantus
19   19        Ambien       Fosamax       Avandia       Zyprexa      Levaquin      Lamictal        Lyrica
20   20     Risperdal     Risperdal      Levaquin       Topamax      Celebrex      Celebrex      Celebrex
            2010          2011          2012
1         Nexium       Lipitor        Nexium
2        Lipitor        Plavix       Abilify
3         Plavix        Nexium        Plavix
4  Advair Diskus       Abilify     Singulair
5      OxyContin Advair Diskus Advair Diskus
6        Abilify      Seroquel       Crestor
7      Singulair     Singulair      Cymbalta
8       Seroquel       Crestor        Humira
9        Crestor      Cymbalta        Enbrel
10      Cymbalta        Humira      Remicade
11         Actos        Enbrel  atorvastatin
12       Lexapro      Remicade      Neulasta
13       Zyprexa         Actos       Rituxan
14       Spiriva      Neulasta      Copaxone
15        Lantus       Rituxan       Lipitor
16       Aricept       Zyprexa       Atripla
17        Lyrica      Copaxone     OxyContin
18        Diovan       Lexapro       Spiriva
19    Effexor XR     OxyContin      Seroquel
20      Concerta        Epogen       Avastin

GoogleVis version made possible by dichika http://rpubs.com/dichika/1276

Use “{r, results = "asis”}“ for knitr. Not very useful for these data.

data.ever.in.top.group$year <- as.Date(paste(sep = "", data.ever.in.top.group$year, "-01-01"))

library(googleVis)
res <- gvisMotionChart(data.ever.in.top.group, idvar = "name", timevar = "year")

print(res, "chart")