- What to look for in time
- Discrete data over time
- Continuous data over time
- Issues with plotting cycles
Summer 2020
Yau says, when visualizing patterns over time, we typically focus on:
Few has a similar list:
Typical to lay out time along the x-axis from left-to-right
When time data is discretized into “blocks” of time, use a discrete data visualization (e.g., bar plots, dot plots, etc.)
When time data is continuous, use a continuous data visualization (e.g., line plot, stacked area plot, step plot, etc.)
It’s okay to annotate specific points in time to support your narrative or for comparison
Nathan Yau uses standard R functions, but we’ll use libraries like reshape2, dplyr, RColorBrewer, and ggplot2 to make our lives a lot easier
When viewing a time series, keep these in mind:
Also these are sometimes helpful for trends and cycles:
library(ggplot2)
library(dplyr)
# Bar chart of the hot dog eating contest winners: one bar per year, filled
# according to whether that year's result set a new world record.

# Get the data and add a more useful "new world record" field.
# New.record is used as an index (after adding 1) into the two labels below,
# so it is presumably a 0/1 flag in the CSV -- verify against the data.
hotdogs <- read.csv("http://datasets.flowingdata.com/hot-dog-contest-winners.csv")
hotdogs <- mutate(hotdogs,
                  New.record = factor(c("Old record stands",
                                        "New world record")[New.record + 1],
                                      ordered = TRUE,
                                      levels = c("Old record stands",
                                                 "New world record")))

# Set a color palette with two colors (light grey and green)
hdPalette <- c("#999999", "#20B400")

# Your basic bar plot call for ggplot2
ggplot(hotdogs, aes(x = Year, y = Dogs.eaten, fill = New.record)) +
  geom_bar(stat = "identity") +
  #
  # Change the fill color values to use the palette we made
  scale_fill_manual(values = hdPalette) +
  #
  # Make sure the y-axis ticks every 10 HDBs and label everything
  scale_y_continuous(breaks = seq(from = 0, to = 70, by = 10)) +
  ylab("Hotdogs and Buns Eaten (HDBs)") +
  ggtitle("Nathan's Hot Dog Contest Results, 1980-2010") +
  #
  # We don't need a legend title b/c we made the new record
  # labels easier to understand
  guides(fill = guide_legend(title = NULL)) +
  #
  # I like serif fonts, so use Times instead of the default font
  # and make the font bigger
  theme_bw(base_family = "Times") +
  theme(text = element_text(size = 18)) +
  #
  # Get rid of all the grid lines and the border then instead
  # generate Tufte-style "anti-grid" lines *through* the bars
  theme(panel.grid.major = element_blank(),
        panel.grid.minor = element_blank(),
        panel.border = element_blank()) +
  geom_hline(yintercept = seq(from = 0, to = 70, by = 10), color = "white") +
  #
  # Annotate when important changes occurred: a name label plus a horizontal
  # segment spanning the years that label refers to
  annotate("text", x = 2001, y = 56, label = "Takeru Kobayashi", hjust = 0) +
  annotate("segment", x = 2000.6, xend = 2006.4, y = 55, yend = 55, size = 1.2) +
  annotate("text", x = 2007, y = 70, label = "Joey Chestnut", hjust = 0) +
  annotate("segment", x = 2006.6, xend = 2010.4, y = 69, yend = 69, size = 1.2)
library(ggplot2) # For plotting
library(dplyr) # For mutate()
library(RColorBrewer) # For color scaling
library(reshape2) # For melt()
# Stacked bar chart of hot dogs eaten by finishing place, one bar per year.

# Get the data in the right format, with the Year as an ordered
# factor and removing the prefix 'X' from the strings in the
# dataset
rawHotdogs <- read.csv("http://datasets.flowingdata.com/hot-dog-places.csv")

# Each row becomes one "Place"; the row names are used as the place label
rawHotdogs$Place <- rownames(rawHotdogs)

# Go from wide (one column per year) to long (one row per place/year).
# Naming the id column explicitly avoids relying on melt()'s guess.
hotdogs <- melt(rawHotdogs, id.vars = "Place")
YearStr <- gsub("X", "", as.character(hotdogs$variable))
hotdogs <- mutate(hotdogs,
                  Year = factor(YearStr,
                                ordered = TRUE,
                                levels = unique(YearStr)))

# Build a linear color palette of greens, then reverse it
# so that the darkest green is first and so on.
hdPalette <- rev(colorRampPalette(brewer.pal(5, "Greens"))(4))

# Basic stacked bar plot
ggplot(hotdogs, aes(x = Year, y = value, fill = Place)) +
  geom_bar(stat = "identity") +
  #
  # Use the provided color palette for "fill"
  scale_fill_manual(values = hdPalette) +
  #
  # Use a white background and a large serif font
  theme_bw(base_family = "Times") +
  theme(text = element_text(size = 18)) +
  #
  # Tick every 25 HDBs, label the y-axis properly, and give title
  scale_y_continuous(breaks = seq(from = 0, to = 150, by = 25)) +
  ylab("Hotdogs and Buns Eaten (HDBs)") +
  ggtitle("Hot Dog Eating Contest Results, 2000-2010")
# Download the employment series straight from the Bureau of Labor Statistics.
# The first line of the file is skipped and the four columns are named here.
blsDataURL <- "https://download.bls.gov/pub/time.series/ce/ce.data.00a.TotalNonfarm.Employment"
empl <- read.table(
  blsDataURL,
  header = FALSE,
  flush = TRUE,
  skip = 1,
  col.names = c("ID", "Year", "Period", "Value")
)

# Narrow to the rows of interest, then derive a fractional-year position
# ("RawYear") so that each monthly period gets its own spot on a numeric axis.
emplSmall <- empl %>%
  filter(
    (Year > 2000) & (Year < 2015),   # keep 2001-2014 only
    Period != "M13",                 # drop the year-end redundancy rows
    ID == "CES0000000001"            # the Total National data series
  ) %>%
  mutate(
    Month = as.numeric(gsub("M", "", as.character(Period))),
    RawYear = Year + (Month - 1) / 12
  )
#
# More on the next slide ...
# Number of rows (observations)
nr <- nrow(emplSmall)

# Since we are computing the *change* in jobs, each period is subtracted from
# the one that follows it, which leaves one fewer row than emplSmall. diff()
# and [-1] express that directly and, unlike 2:nr / 1:(nr-1), still behave
# when emplSmall has fewer than two rows (they yield an empty data frame).
laterYears <- emplSmall$RawYear[-1]

# While we're at it, add a categorical variable for the administration:
# periods after 2009.08 are labelled "Obama", everything else "Bush".
Employment <- data.frame(
  EmplChange = diff(emplSmall$Value), # Delta per period
  Year = laterYears,
  Administration = factor(c("Bush", "Obama")[1 + (laterYears > 2009.08)])
)
# Bar colors, matched in order to the levels of Employment$Administration
hdPalette <- c("firebrick", "deepskyblue")

# One rectangle per period, running from the zero baseline to that period's
# change in jobs and centred on its fractional-year position.
ggplot(Employment, aes(x = Year, y = EmplChange, fill = Administration)) +
  geom_rect(
    aes(xmin = Year - (1 / 24),
        xmax = Year + (1 / 24),
        ymin = 0,
        ymax = EmplChange),
    color = "white"
  ) +
  scale_fill_manual(values = hdPalette) +
  #
  # Baseline separating gains from losses
  geom_hline(yintercept = 0, col = "black", size = 1) +
  #
  # Tick spacing, axis label, title, and font
  scale_y_continuous(breaks = seq(from = -800, to = 600, by = 200)) +
  labs(y = "Employees (Thousands)",
       title = "New Jobs in the United States, 2001-2014") +
  theme(text = element_text(size = 18, family = "Times")) +
  #
  # Label the upper half of the plot as gains and the lower half as losses
  annotate("text", x = 2001, y = 450, label = "Gain in Jobs",
           hjust = 0, family = "Times", fontface = "bold") +
  annotate("text", x = 2001, y = -450, label = "Loss in Jobs",
           hjust = 0, family = "Times", fontface = "bold")
# Get and reformat the data
subscribers <- read.csv("http://datasets.flowingdata.com/flowingdata_subscribers.csv")

# Inside transform(), `Date` refers to the column as read from the CSV.
#  - Subscribers is rescaled to thousands for plotting.
#  - Date is parsed as month-day-year and stored as POSIXct; strptime() alone
#    returns POSIXlt, which is a poor fit for a data frame column.
#  - Day is a running 1..n index over the rows (seq_along() is safe even if
#    the file has no rows, unlike 1:length()).
subscribers <- transform(subscribers,
                         Subscribers = Subscribers / 1000, # we'll show val per K
                         Date = as.POSIXct(strptime(as.character(Date), "%m-%d-%Y")),
                         Day = seq_along(Date))
# Nathan Yau's x-axis has a tick for every day but only labels every 5th day,
# plus Day 1. Build the 30 tick labels for later use: start with all-blank
# labels, then fill in the days that should actually show a number.
weirdXLabels <- character(30)
weirdXLabels[c(1, seq(from = 5, to = 30, by = 5))] <-
  as.character(c(1, seq(from = 5, to = 30, by = 5)))
# Dot plot of daily subscriber counts (in thousands) against day of month.
# Start the plot
ggplot(subscribers, aes(x = Day, y = Subscribers)) +
  # Place all the annotations. The segments, at least, have to
  # come first so that the points will occlude them...
  #
  # Annotate the errant points
  annotate("text", x = 14, y = 11, label = "Reporting Error",
           hjust = 0, family = "Times", fontface = "bold") +
  annotate("text", x = 14, y = 9, hjust = 0, family = "Times",
           label = "A source reported incorrect subscriber\n counts for these days") +
  #
  # Annotate the first and end point, drawing line segments to
  # the actual point.
  annotate("text", x = 1, y = 22, label = "25,047", family = "Times", fontface = "bold") +
  annotate("segment", x = 1, y = 22.5, xend = 1, yend = 25.047, size = 0.5) +
  annotate("text", x = 30, y = 22, label = "27,611", family = "Times", fontface = "bold") +
  annotate("text", x = 30, y = 21, label = "(+10%)", family = "Times") +
  annotate("segment", x = 30, y = 22.5, xend = 30, yend = 27.611, size = 0.5) +
  #
  # A weird hack to get the top of the graph to look like his does ...
  annotate("text", x = 0.5, y = 30, label = "thousand subscribers",
           hjust = 0, family = "Times") +
  #
  # Draw the actual points
  geom_point(shape = 21, fill = "firebrick", size = 5) +
  #
  # Set all the theme elements
  theme_bw() +
  theme(text = element_text(size = 18, family = "Times"), # OK, so he doesn't use serif ...
        panel.grid.major.x = element_blank(),  # Get rid of major x grid lines
        panel.grid.minor.x = element_blank(),  # Get rid of minor x grid lines
        panel.border = element_blank(),        # Get rid of the border
        axis.ticks.y = element_blank()) +      # Get rid of y-axis tick marks
  #
  # Set the grid lines and labels on the axes, draw the y-axis baseline back in.
  # Day is numeric, so the x-axis needs a continuous scale for the breaks and
  # their labels to be applied.
  scale_y_continuous(breaks = seq(from = 0, to = 30, by = 5)) +
  scale_x_continuous(breaks = 1:30, labels = weirdXLabels) +
  geom_hline(yintercept = 0, color = "black", size = 0.5) +
  ylab("") +
  xlab("January 2010") +
  # The title year matches the x-axis label (January 2010)
  ggtitle("Increase in RSS & Email Subscribers in January 2010")
The simplest way to communicate continuity or connection is by connecting points using line segments
Since all data is discrete at some point, the distinction between when to use a dot plot for “discrete” data and when to use a line plot is not always clear
In the previous examples, we were making observations within a discrete interval of time (e.g., all the subscribers in a given day)
Alternatively, we might be simply sampling some value at discrete intervals (e.g., the amount of CO2 in the atmosphere at periodic intervals)
We sometimes refer to the latter as a time series
Trends – overall tendency of a series (e.g., increase vs. decrease)
Variability – degree of change from one point in time to the next over some time span
Rate of change – overall rate of change over some time span
Co-variation – how variation in one time series affects another
Cycles – tendency for a time series to exhibit periodic patterns
Exceptions – sudden changes in how a time series proceeds
library(dplyr)
library(ggplot2)
# Line plot of personal consumption expenditures over time, using the
# `economics` data set.
ggplot(economics, aes(x = date, y = pce)) +
  geom_line(size = 1.25, color = "darkblue") +
  labs(x = "Date",
       y = "Personal Consumption Expenditures (billions of dollars)") +
  theme(text = element_text(size = 18))
Basic R plotting commands understand ts
Sadly, ggplot2 does not understand ts or ts models
We’ll talk more about “models” another week
# Draw the co2 time series with base graphics, then overlay a grid
plot(co2, lwd = 2, col = "darkblue")
grid()

# Decompose the series with stl() using a periodic seasonal window, and plot
# the resulting components
co2Decomposition <- stl(co2, "per")
plot(co2Decomposition, lwd = 2, col = "darkblue")
grid()
# Fit an ARIMA model to the co2 series, forecast ahead, and plot the observed
# series, two later reference measurements, and the forecast.

# Build a model using appropriate model parameters
myModel <- arima(co2, order = c(10, 2, 1))

# Forecast 22 * 12 steps beyond the end of the series
myPrediction <- predict(myModel, n.ahead = 22 * 12)

# Axis limits. These have to contain everything drawn below, including the
# Jun 2020 marker at (2020.5, 416.27) and its label at y = 417.7; narrower
# limits would leave that annotation clipped outside the plot region.
plotXLim <- c(1959, 2021)
plotYLim <- c(300, 420)

# Setup the initial plot
plot(co2, xlim = plotXLim, ylim = plotYLim,
     lwd = 1.25, col = "darkblue",
     xlab = "Date", ylab = "Atmospheric Concentration of CO2 (ppm)")
grid()

# Sketch out later measurements as a dashed horizontal + vertical guide
# ending in a point, with a text label.
# From: http://co2now.org/current-co2/co2-now/
segments(y0 = 400.26, y1 = 400.26, x0 = 0, x1 = 2015 + 2/12, col = "darkorange", lty = 2)
segments(y0 = 0, y1 = 400.26, x0 = 2015 + 2/12, x1 = 2015 + 2/12, col = "darkorange", lty = 2)
points(2015 + 2/12, 400.26, pch = 19, col = "darkorange")
text(2006, 402, "Atmospheric Concentration of CO2 in Feb 2015", col = "darkorange")

segments(y0 = 416.27, y1 = 416.27, x0 = 0, x1 = 2020 + 6/12, col = "darkred", lty = 2)
segments(y0 = 0, y1 = 416.27, x0 = 2020 + 6/12, x1 = 2020 + 6/12, col = "darkred", lty = 2)
points(2020 + 6/12, 416.27, pch = 19, col = "darkred")
text(2011, 417.7, "Atmospheric Concentration of CO2 in Jun 2020", col = "darkred")

# Add the predictions
lines(myPrediction$pred, lwd = 4, col = "steelblue")
# Load the postage-rate data
postage <- read.csv("http://datasets.flowingdata.com/us-postage.csv")

# Text labels to place above each step: the price multiplied by 100, with the
# unit ("cents") spelled out on the first label only
priceLabels <- as.character(100 * postage$Price)
priceLabels[1] <- paste(priceLabels[1], "cents")
# Irregular x-axis year labels: one entry per year from 1991 to 2010, blank
# except for the years listed here.
yearLabels <- character(20)
yearLabels[match(c(1991, 1995, 1999, 2001, 2002, 2006, 2007, 2008, 2009),
                 1991:2010)] <-
  c("1991", "'95", "'99", "2001", "'02", "'06", "'07", "'08", "'09")
# Step plot of the postage price by year, with each price printed at its step
ggplot(postage, aes(x = Year, y = Price)) +
  geom_step(size = 1.5, color = "firebrick") +
  geom_text(aes(label = priceLabels), vjust = -0.5, hjust = 0) +
  #
  # Strip the grid, the y-axis text and ticks, and both axis titles; set font
  theme(
    panel.grid.major = element_blank(),
    panel.grid.minor = element_blank(),
    axis.text.y = element_blank(),
    axis.ticks.y = element_blank(),
    axis.title = element_blank(),
    text = element_text(size = 18, family = "Times")
  ) +
  #
  # One x break per year, labelled with the irregular labels, then the title
  scale_x_continuous(breaks = seq(from = 1991, to = 2010, by = 1),
                     labels = yearLabels) +
  labs(title = "United States Postage Rate, 1991-2010")
# Scatter of the personal savings rate over time with a straight-line
# (linear model) fit drawn on top and no confidence band.
ggplot(economics, aes(x = date, y = psavert)) +
  geom_point(shape = 21, fill = "white", size = 3) +
  geom_smooth(method = "lm", se = FALSE, color = "darkblue", size = 1.5) +
  theme(text = element_text(size = 18, family = "Times")) +
  labs(x = "Date",
       y = "Personal Savings Rate",
       title = "United States Personal Savings Rates, 1967-2007")
# Load the system-utilization log
utilURL <- "http://eecs.ucf.edu/~wiegand/ids6938/datasets/February-2015-SystemUtilization.txt"
sysutil <- read.csv(utilURL)

# Add two derived columns: a parsed timestamp (month/day/year hour:minute)
# and the utilization scaled by 100 for display as a percentage
sysutil <- transform(
  sysutil,
  Date = strptime(DateTime, "%m/%d/%Y %H:%M"),
  PctUtil = 100 * System.Utilization
)
# Create the plot: raw utilization samples as faint points with a LOESS
# smoothed curve over them.
ggplot(sysutil, aes(x = Date, y = PctUtil)) +
  # Set axis and title labels, also set y-axis limits
  xlab("Date") + ylab("System Utilization (%)") + ylim(c(0, 100)) +
  ggtitle("Stokes System Utilization for February 2015") +
  #
  # Draw the points
  geom_point(shape = 21, size = 2, color = "pink", fill = "pink") +
  #
  # Use a white background and set the font properties
  theme_bw() +
  theme(text = element_text(size = 18, family = "Times")) +
  #
  # Draw the piece-wise, smoothed curve fit using LOESS
  stat_smooth(size = 1.75, color = "firebrick", method = "loess")
When data is cyclical, it can be complicated to visualize
Points that might be placed close together are often physically separated
Plot types that resolve this can be difficult to understand
Multiple data trends on one plot can be confusing and cluttered
# Reshape the ldeaths series into a data frame with one row per observation,
# tagged with an ordered Year (1974-1979, 12 rows each) and an ordered Month
# (Jan..Dec, cycling once per year).
MonthList <- month.abb
Year <- ordered(rep(1974:1979, each = 12))
Month <- factor(rep(MonthList, times = 6),
                levels = MonthList,
                ordered = TRUE)
Deaths <- as.numeric(ldeaths)
UKDeaths <- data.frame(Year, Month, Deaths)
# Option 1: grouped (dodged) bars -- months along the x-axis, one bar per
# year within each month, with a heavy baseline at zero.
ggplot(UKDeaths, aes(x = Month, y = Deaths, fill = factor(Year))) +
  geom_bar(stat = "identity",
           position = "dodge",
           color = "black",
           width = 0.75) +
  geom_hline(yintercept = 0, size = 1.1) +
  scale_fill_brewer(palette = "YlOrRd", name = "Year") +
  theme(text = element_text(size = 18, family = "Times")) +
  labs(title = "Deaths for Lung Disease in the UK")
# Option 2: one line per year, wrapped into polar coordinates with months
# around the circle.
ggplot(UKDeaths,
       aes(x = Month, y = Deaths, group = Year, color = factor(Year))) +
  geom_line(size = 2) +
  coord_polar() +
  scale_color_brewer(palette = "Set1", name = "Year") +
  theme(text = element_text(size = 18, family = "Times")) +
  labs(title = "Deaths for Lung Disease in the UK")
# Option 3: heat map -- month by year grid of tiles, shaded from white (low)
# to steel blue (high) by the number of deaths.
ggplot(UKDeaths, aes(x = Month, y = Year, fill = Deaths)) +
  geom_tile(color = "white") +
  scale_fill_gradient(low = "white", high = "steelblue") +
  theme(text = element_text(size = 18, family = "Times")) +
  labs(title = "Deaths for Lung Disease in the UK")
# Option 4: one box per month, pooling all of the years together
ggplot(UKDeaths, aes(x = Month, y = Deaths)) +
  geom_boxplot() +
  theme(text = element_text(size = 18, family = "Times")) +
  labs(title = "Deaths for Lung Disease in the UK (1974-1979)")
There are no easy answers to how to address this
What is your objective?
What will your audience understand?
Sometimes: What is the least annoying option?