## Settings for RMarkdown http://yihui.name/knitr/options#chunk_options
opts_chunk$set(comment = "", warning = FALSE, message = FALSE, echo = TRUE,
tidy = FALSE, fig.width = 8, fig.height = 7)
options(width = 116, scipen = 10)
## Provide URLs
url2003.2009 <- paste("http://www.drugs.com/top200_", 2003:2009, ".html", sep = "")
url2010 <- "http://www.drugs.com/top200.html"
url2011.2012 <- paste("http://www.drugs.com/stats/top100/", 2011:2012, "/sales", sep = "")
## Load XML package
library(XML)
## Import 2003-2010 data
drugs2003.2010 <-
lapply(c(url2003.2009, url2010),
function(url) {
tab.imported <- readHTMLTable(url, header = FALSE, skip.rows = 1)[[2]][-201,1:4]
names(tab.imported) <- c("rank","name","company","sales")
tab.imported <- within(tab.imported, {
rank <- seq_along(rank)
sales <- as.numeric(gsub(",", "", sales)) * 1000
})
tab.imported
})
## Import 2011-2012 data (different format)
drugs2011.2012 <-
lapply(url2011.2012,
function(url) {
tab.imported <- readHTMLTable(url)[[2]][,1:3]
names(tab.imported) <- c("rank", "name", "sales")
tab.imported <- within(tab.imported, {
rank <- seq_along(rank)
sales <- as.numeric(gsub(",", "", sales)) * 1000
})
})
## Extract company names from drug.2003.2010 dataset
companies <- unlist(sapply(drugs2003.2010, "[", "company"))
names(companies) <- NULL
companies <- sort(unique(companies))
companies <- companies[!companies == ""]
## Other company names found in drug.2011.2012 dataset
other.companies <- c("Otsuka Pharmaceutical Co.", "Generic Drug", "Vertex Pharmaceuticals", "MedImmune, Inc", "Sunovion Pharmaceuticals Inc.")
## Removal of company names from drugs2011.2012 dataset (drug name and company name mixed)
replace.pattern <- paste(companies, other.companies, collapse = "|", sep = "|")
drugs2011.2012 <-
lapply(drugs2011.2012,
function(y) {
y$name <- gsub(replace.pattern, "", y$name)
y
})
## Save data for backup
save(list = c("drugs2003.2010","drugs2011.2012"), file = "~/Documents/statistics/Rmedstats/drugs.com.2012.RData")
## Load data
load(file = "~/Documents/statistics/Rmedstats/drugs.com.2012.RData")
## Create data frames
df.drugs2003.2010 <- do.call(rbind, drugs2003.2010)
df.drugs2011.2012 <- do.call(rbind, drugs2011.2012)
## Add year variable
df.drugs2003.2010$year <- rep(2003:2010, rep(200, 8))
df.drugs2011.2012$year <- rep(2011:2012, rep(100, 2))
## Create drug-company table
drug.company.table <- unique(df.drugs2003.2010[,c("name","company")])
## Merge company name to 2011-2012 dataset
df.drugs2011.2012 <-
merge(x = df.drugs2011.2012,
y = drug.company.table,
all.x = TRUE)
## All datasets combined as
df.drugs2003.2012 <- rbind(df.drugs2003.2010[,c("year","rank","name","company","sales")],
df.drugs2011.2012
)
## Order by year and ranking
library(doBy)
df.drugs2003.2012 <- orderBy(data = df.drugs2003.2012, ~ +year +rank)
## Extract drugs that entered top 20 at some point
ever.in.top.group <- subset(df.drugs2003.2012, rank %in% 1:20)$name
data.ever.in.top.group <- subset(df.drugs2003.2012, name %in% ever.in.top.group)
## Give 1 if sales is NA
data.ever.in.top.group[is.na(data.ever.in.top.group$sales), "sales"] <- 1
## Extract data in 2012 for labeling
last.year.top.group <- subset(data.ever.in.top.group, year == 2012)
## ggplot2
library(ggplot2)
## set data as subset that include only drugs that ever entered the top group
gg.data <- ggplot(data.ever.in.top.group)
gg.data +
opts(legend.position = "none") +
scale_y_reverse(limit = c(30,1), breaks = c(1,10,20,30)) +
scale_x_continuous(limit = c(2003, 2014)) +
scale_color_discrete(h = c(0, 120) + 15, c = 100, l = 60, h.start = 0, direction = -1) +
geom_line(aes(x = year, y = rank, color = name), lwd = 1) +
geom_text(aes(x = 2013, y = rank, color = name, label = name), data = last.year.top.group) +
geom_point(aes(x = year, y = rank, color = name, size = sales))
library(reshape2)
drug.table <- dcast(data = df.drugs2003.2012, rank ~ year, value.var = "name")[1:20,]
drug.table
rank 2003 2004 2005 2006 2007 2008 2009
1 1 Lipitor Lipitor Lipitor Lipitor Lipitor Lipitor Lipitor
2 2 Prevacid Prevacid Nexium Nexium Nexium Nexium Nexium
3 3 Zocor Zocor Prevacid Prevacid Advair Diskus Plavix Plavix
4 4 Nexium Nexium Zocor Advair Diskus Prevacid Advair Diskus Advair Diskus
5 5 Zoloft Zoloft Advair Diskus Singulair Plavix Prevacid Seroquel
6 6 Celebrex Advair Diskus Zoloft Effexor XR Singulair Seroquel Abilify
7 7 Zyprexa Effexor XR Plavix Plavix Seroquel Singulair Singulair
8 8 Neurontin Plavix Effexor XR Zocor Effexor XR Effexor XR OxyContin
9 9 Effexor XR Celebrex Singulair Norvasc Lexapro OxyContin Actos
10 10 Advair Diskus Neurontin Norvasc Lexapro Actos Actos Prevacid
11 11 OxyContin Protonix Protonix Seroquel Protonix Lexapro Cymbalta
12 12 Norvasc Norvasc Ambien Protonix Vytorin Abilify Effexor XR
13 13 Wellbutrin SR Zyprexa Lexapro Ambien Topamax Topamax Lexapro
14 14 Plavix Singulair Seroquel Actos Risperdal Cymbalta Crestor
15 15 Pravachol Ambien Actos Zoloft Abilify Zyprexa Zyprexa
16 16 Vioxx OxyContin Zyprexa Wellbutrin XL Cymbalta Valtrex Valtrex
17 17 Singulair Lexapro Fosamax Avandia Lamictal Crestor Flomax
18 18 Protonix Pravachol Risperdal Risperdal Zyprexa Vytorin Lantus
19 19 Ambien Fosamax Avandia Zyprexa Levaquin Lamictal Lyrica
20 20 Risperdal Risperdal Levaquin Topamax Celebrex Celebrex Celebrex
2010 2011 2012
1 Nexium Lipitor Nexium
2 Lipitor Plavix Abilify
3 Plavix Nexium Plavix
4 Advair Diskus Abilify Singulair
5 OxyContin Advair Diskus Advair Diskus
6 Abilify Seroquel Crestor
7 Singulair Singulair Cymbalta
8 Seroquel Crestor Humira
9 Crestor Cymbalta Enbrel
10 Cymbalta Humira Remicade
11 Actos Enbrel atorvastatin
12 Lexapro Remicade Neulasta
13 Zyprexa Actos Rituxan
14 Spiriva Neulasta Copaxone
15 Lantus Rituxan Lipitor
16 Aricept Zyprexa Atripla
17 Lyrica Copaxone OxyContin
18 Diovan Lexapro Spiriva
19 Effexor XR OxyContin Seroquel
20 Concerta Epogen Avastin
Use “{r, results = "asis”}“ for knitr. Not very useful for these data.
data.ever.in.top.group$year <- as.Date(paste(sep = "", data.ever.in.top.group$year, "-01-01"))
library(googleVis)
res <- gvisMotionChart(data.ever.in.top.group, idvar = "name", timevar = "year")
print(res, "chart")