While perusing the R-bloggers website for data visuals to reproduce, we came across a post by Julia Silge in which she created graphs of the top 10 causes of death in her home state of Utah using a brand-new R package called gganimate. As public health majors and beginner R users, we were excited to try to reproduce Julia’s graphs with help from the R code provided in her blog post.
Here are the steps that we used to recreate Julia’s graphs:
library(tidyr)
library(dplyr)
library(ggplot2)
library(RSocrata)
library(animation)
library(gganimate)
# Download the cause-of-death table from Utah's open data portal.
deathDF <- read.socrata("https://opendata.utah.gov/resource/fu2n-aa2y.csv")

# Replace the portal's headers with short names; these are assigned by
# position, so the table is expected to have exactly these nine columns.
names(deathDF) <- c(
  "cause", "year", "number", "notes", "population",
  "adjustedrate", "LL95CI", "UL95CI", "standarderror"
)

# Show the class of each column as it was read in.
sapply(deathDF, class)
## cause year number notes population
## "factor" "integer" "integer" "factor" "factor"
## adjustedrate LL95CI UL95CI standarderror
## "factor" "factor" "factor" "factor"

# Discard rows that have no year, then rebuild `cause` from its labels so
# that its levels only cover the rows that are left.
deathDF <- deathDF[!is.na(deathDF[["year"]]), ]
deathDF[["cause"]] <- factor(as.character(deathDF[["cause"]]))
5. Transform the factor columns whose values contain commas into numeric values.
# Population is stored as text; strip every punctuation character (such as
# thousands separators) before converting it to numeric.
deathDF$population <- as.numeric(gsub("[[:punct:]]", "", deathDF$population))
summary(deathDF$population)
## Min. 1st Qu. Median Mean 3rd Qu. Max. NA's
## 2193000 2325000 2526000 2541000 2774000 2901000 67

# Columns 6-9 (adjustedrate, LL95CI, UL95CI, standarderror): remove any
# literal asterisk and convert to numeric.
# fixed = TRUE matches "*" as a plain character; as a regular expression a
# bare "*" is a quantifier with nothing to repeat, so the pattern would be
# relying on how the regex engine treats an invalid expression.
# lapply() works column by column, so the data frame is never coerced to a
# character matrix the way apply() would do.
deathDF[6:9] <- lapply(
  deathDF[6:9],
  function(x) as.numeric(gsub("*", "", x, fixed = TRUE))
)

# Add a row for every cause/year combination; combinations that were absent
# from the data get NA in the remaining columns.
deathDF <- complete(deathDF, cause, year)

# Attach each year's "Total" row (number, population, adjusted rate) to every
# cause recorded for that year.
totalDF <- deathDF[deathDF$cause == "Total", ]
deathDF <- left_join(
  deathDF[, c("cause", "year", "number", "adjustedrate")],
  totalDF[, c("year", "number", "population", "adjustedrate")],
  by = "year"
)
# Rename positionally: the first four columns are per cause, the last three
# come from the "Total" rows.
colnames(deathDF) <- c(
  "cause", "year", "number", "adjustedrate",
  "totalnumber", "population", "totaladjustedrate"
)

# Treat missing counts and rates (including the rows added by complete())
# as zero.
deathDF$number[is.na(deathDF$number)] <- 0
deathDF$adjustedrate[is.na(deathDF$adjustedrate)] <- 0
summary(deathDF$number)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.0 4.0 32.5 473.2 172.2 12670.0
summary(deathDF$adjustedrate)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.00 0.00 1.63 25.31 9.24 657.50
head(deathDF)
## Source: local data frame [6 x 7]
##
## cause year number adjustedrate totalnumber
## (fctr) (int) (dbl) (dbl) (int)
## 1 Acute bronchitis and bronchiolitis 1999 0 0 9959
## 2 Acute bronchitis and bronchiolitis 2000 0 0 10341
## 3 Acute bronchitis and bronchiolitis 2001 0 0 10233
## 4 Acute bronchitis and bronchiolitis 2002 0 0 10635
## 5 Acute bronchitis and bronchiolitis 2003 0 0 10672
## 6 Acute bronchitis and bronchiolitis 2004 0 0 10571
## Variables not shown: population (dbl), totaladjustedrate (dbl)
# Average each cause's age-adjusted rate over all years (leaving out the
# "Total" rows) and keep the ten causes with the highest mean, largest first.
top10 <- deathDF[deathDF$cause != "Total", ] %>%
  group_by(cause) %>%
  summarise(adjustedrate = mean(adjustedrate)) %>%
  top_n(10, adjustedrate) %>%
  arrange(desc(adjustedrate))

# Keep only the rows for those causes and rebuild the factor so that they
# are its only levels.
deathDFtop10 <- deathDF[deathDF$cause %in% top10$cause, ]
deathDFtop10$cause <- factor(as.character(deathDFtop10$cause))

# Short display labels; they are matched by position to the sorted levels
# of `cause`, so their order here must follow that level order.
deathDFtop10$shortcause <- factor(
  deathDFtop10$cause,
  labels = c(
    "Alzheimer's", "Stroke", "COPD", "Diabetes", "Heart disease",
    "Flu/pneumonia", "Suicide", "Cancer", "Kidney disease", "Accident"
  )
)
# Rebuild from the labels so the levels are sorted by the short names.
deathDFtop10$shortcause <- factor(as.character(deathDFtop10$shortcause))
# Plot 1: number of heart disease deaths per year.
ggplot(
  data = deathDF[deathDF$cause == "Diseases of heart", ],
  aes(x = year, y = number)
) +
  geom_line(size = 2.5, alpha = 0.7, color = "mediumseagreen") +
  geom_point(size = 0.5) +
  xlab("Year") +
  ylab("Number of deaths") +
  ggtitle("Heart Disease Deaths in Utah")

# Plot 2: the same counts scaled to deaths per 100,000 population.
ggplot(
  data = deathDF[deathDF$cause == "Diseases of heart", ],
  aes(x = year, y = 1e5 * number / population)
) +
  geom_line(size = 2.5, alpha = 0.7, color = "mediumseagreen") +
  geom_point(size = 0.5) +
  xlab("Year") +
  ylab("Number of deaths per 100,000 population") +
  ggtitle("Heart Disease Deaths in Utah")

# Plot 3: the age-adjusted rate supplied with the data.
# The y-axis label now has its closing parenthesis, matching the label used
# for the top-10 plots.
ggplot(
  data = deathDF[deathDF$cause == "Diseases of heart", ],
  aes(x = year, y = adjustedrate)
) +
  geom_line(size = 2.5, alpha = 0.7, color = "mediumseagreen") +
  geom_point(size = 0.5) +
  xlab("Year") +
  ylab("Age adjusted mortality (deaths per 100,000 population)") +
  ggtitle("Heart Disease Age Adjusted Mortality in Utah")
# Animation 1: age-adjusted rate over time, one frame per cause of death.
# NOTE(review): gg_animate() and the `frame` aesthetic look like the early
# gganimate interface - confirm the installed version still provides them.
p <- ggplot(
  data = deathDFtop10,
  aes(
    x = year,
    y = adjustedrate,
    color = cause,
    frame = as.character(cause)
  )
) +
  geom_line(size = 2.5, alpha = 0.7) +
  geom_point(size = 0.5, color = "black") +
  labs(
    x = "Year",
    y = "Age adjusted mortality (deaths per 100,000 population)"
  ) +
  theme(legend.position = "none")
gg_animate(p)

# Animation 2: bar chart of the ten causes, one frame per year.
p2 <- ggplot(
  data = deathDFtop10,
  aes(
    x = shortcause,
    y = adjustedrate,
    fill = shortcause,
    frame = year
  )
) +
  geom_bar(stat = "identity", position = "dodge") +
  labs(
    x = "Cause of death",
    y = "Age adjusted mortality (deaths per 100,000 population)"
  ) +
  theme(
    legend.position = "none",
    # Tilt the cause names so they fit along the x axis.
    axis.text.x = element_text(angle = 45, hjust = 1)
  )
gg_animate(p2)
Now that we’ve successfully recreated a data visual in R, it’s time for us to get to work on creating our own data visualizations. Stay tuned for an update!