- Plotting Tools
- Visualizing Single Variate Distributions & Values
- Visualizing Multi-Variate Distributions & Values
- Interaction & Navigation
- Analytical Navigation
- Further Help
Summer 2020
Suppose we want to visualize a Binomial distribution, \(n=15,\; p=0.25\)
library(ggplot2)

# Probability mass function of a Binomial(n = 15, p = 0.25) distribution,
# evaluated at every possible outcome k = 0, 1, ..., 15.
k <- 0:15
pmf <- dbinom(k, size = max(k), prob = 0.25)
MyData <- data.frame(k, pmf)

# Each probability is drawn as a vertical line from 0 up to pmf, capped with
# a point.  ymax is mapped inside aes() so it is read from the MyData column
# rather than from the global `pmf` vector (which would silently go out of
# sync if MyData were ever filtered or reordered).
ggplot(MyData, aes(x = k, y = pmf)) +
  geom_linerange(aes(ymax = pmf), ymin = 0, size = 1.25) +
  geom_point(size = 3.5) +
  ylab("Pr{k}") +
  theme(text = element_text(size = 18, family = "Times"))
Suppose we want to visualize a Normal distribution, \(\mu = 5, \sigma=2\)
library(ggplot2)

# The two-row data frame supplies the x range (-5 to 15) for the plot;
# stat_function() then evaluates dnorm(mean = 5, sd = 2) across that range.
ggplot(
  data.frame(x = c(-5, 15), y = c(0, 1)),
  aes(x = x, y = y)
) +
  stat_function(fun = dnorm, args = list(mean = 5, sd = 2)) +
  ggtitle("Normal Distribution, ~N(5,2)") +
  theme(text = element_text(size = 18, family = "Times"))
To get a rough picture of the distribution of a sample, use a histogram
library(ggplot2)

# Example sample: 200 draws from a standard normal distribution.
MyData <- data.frame(val = rnorm(200))

# Histogram of the sample with bins of width 0.5.
ggplot(MyData, aes(x = val)) +
  geom_histogram(binwidth = 0.5, col = "white", fill = "darkblue") +
  xlab("Value") +
  ylab("Count") +
  ggtitle("Histogram of MyData") +
  theme(text = element_text(size = 18, family = "Times"))
Or a density plot
library(ggplot2)

# Example sample: 200 draws from a standard normal distribution.
MyData <- data.frame(val = rnorm(200))

# Kernel density estimate of the sample, filled and drawn without an outline.
ggplot(MyData, aes(x = val)) +
  geom_density(fill = "pink", col = NA) +
  xlab("Value") +
  ylab("Density") +
  ggtitle("Density of MyData") +
  theme(text = element_text(size = 18, family = "Times"))
Or all of these
library(ggplot2)

# Example sample: 200 draws from a standard normal distribution.
MyData <- data.frame(val = rnorm(200))

# Sample mean and standard deviation parameterize the fitted normal curve.
mu <- mean(MyData$val)
sig <- sd(MyData$val)

# Overlay three views of the same sample: kernel density, a histogram scaled
# to density, and the normal curve fitted from mu/sig.
# The function parameters must be passed as `args`; stat_function() declares
# `args` after `...`, so an abbreviated name such as `arg` is not matched and
# the curve would fall back to dnorm()'s defaults (mean 0, sd 1).
ggplot(MyData, aes(x = val)) +
  geom_density(fill = "pink", col = NA) +
  geom_histogram(
    aes(y = ..density..),
    binwidth = 0.5,
    col = "white",
    alpha = 0.4
  ) +
  stat_function(
    fun = dnorm,
    args = list(mean = mu, sd = sig),
    size = 1.5,
    col = "darkred"
  ) +
  xlab("Value") +
  ylab("Density") +
  ggtitle("Estimating MyData Distribution") +
  theme(text = element_text(size = 18, family = "Times"))
Q-Q plots give us a way to see how close to a normal distribution
our data might be
| Right Skew | Short Tails |
| Left Skew | Long Tails |
# Normal Q-Q plot of a 200-point standard-normal sample, with a reference
# line added by qqline().  Each call must be its own statement; fused on one
# line they do not parse.
MyData <- data.frame(val = rnorm(200))
qqnorm(MyData$val, pch = 19, col = "darkgray")
qqline(MyData$val, lwd = 2, col = "darkred")
Dot plots use position to encode a numeric value, proportion, or frequency
library(ggplot2)

# Names and areas of the first ten entries in the built-in state data.
MyData <- data.frame(
  State = state.name[1:10],
  Area = state.area[1:10]
)

# One point per state; horizontal position encodes the area.
ggplot(MyData, aes(x = Area, y = State)) +
  geom_point(size = 4) +
  xlab("Area (sq. miles)") +
  theme(text = element_text(size = 18, family = "Times"))
Dot plots use position to encode a numeric value, proportion, or frequency
Note: There’s no implicit meaning to the \(y\)-axis positions
So we typically order the dot plot by value to make it easier to read
library(ggplot2)

# Names and areas of the first ten entries in the built-in state data.
MyData <- data.frame(
  State = state.name[1:10],
  Area = state.area[1:10]
)

# Copy the data and turn State into a factor whose levels are ordered by
# Area, so the y-axis follows area instead of alphabetical order.
MySortedData <- MyData
MySortedData$State <- reorder(MyData$State, MyData$Area)

ggplot(MySortedData, aes(x = Area, y = State)) +
  geom_point(size = 4) +
  xlab("Area (sq. miles)") +
  theme(text = element_text(size = 18, family = "Times"))
Bar plots use length and position to encode a numeric value
library(ggplot2)

# Names and areas of the first ten entries in the built-in state data.
MyData <- data.frame(
  State = state.name[1:10],
  Area = state.area[1:10]
)

# Copy the data and order the State factor levels by Area.
MySortedData <- MyData
MySortedData$State <- reorder(MyData$State, MyData$Area)

# Bars are built with State on x and Area on y, then coord_flip() turns them
# horizontal.  Because the axes are flipped, the Area label is set with
# ylab() even though it ends up along the horizontal axis.
ggplot(MySortedData, aes(x = State, y = Area)) +
  geom_bar(stat = "identity") +
  coord_flip() +
  ylab("Area (sq. miles)") +
  theme(text = element_text(size = 18, family = "Times"))
Note: Again, these are ordered for ease of reading …
Strip plots encode 1D numeric values and imply distribution information
Lines imply connection … don’t use them if there isn’t any
For example, use lines to connect the same algorithm at different points during a run
library(ggplot2)

# Made-up performance measurements of one algorithm at four points in a run;
# `ci` is the full width of the interval drawn around each measurement.
fakeData <- data.frame(
  evals = c(100, 150, 200, 250),
  performance = c(1000.1, 1300.2, 1410.6, 1470.3),
  ci = c(150, 90, 50, 30)
)

# Error bars extend ci/2 above and below each point; the connecting line is
# appropriate because all points belong to the same run.
ggplot(fakeData, aes(evals, performance)) +
  geom_errorbar(
    aes(ymin = performance - ci / 2, ymax = performance + ci / 2),
    size = 0.5,
    width = 10
  ) +
  geom_line(color = "darkblue", size = 1.25) +
  geom_point(size = 5) +
  xlab("Number of Evaluations") +
  ylab("Algorithm Performance") +
  theme(text = element_text(size = 18, family = "Times"))
Box plots give information about the median, interquartile range, and outliers, as well as confidence intervals
library(ggplot2)

# Single notched box plot of mtcars$mpg.  x is fixed at the constant 1 so
# there is exactly one box; the x-axis text and ticks are blanked because
# that position carries no meaning.
# notch uses TRUE rather than T, since T is an ordinary variable that can be
# reassigned.
ggplot(mtcars, aes(x = 1, y = mpg)) +
  geom_boxplot(notch = TRUE, fill = "pink") +
  theme(axis.text.x = element_blank(), axis.ticks.x = element_blank()) +
  xlim(c(0, 2)) +
  xlab("") +
  ylab("Mileage") +
  ggtitle("Distribution of Car Mileage") +
  theme(text = element_text(size = 18, family = "Times"))
Use dodge to visualize multiple Binomial distributions
library(ggplot2)

# Two Binomial(n = 15) mass functions, p = 0.25 and p = 0.4, stacked into one
# long data frame.  `p` is a factor labelling which distribution each row
# belongs to; `k` (length 16) is recycled by data.frame() to match the 32
# rows of `p` and `pmf`.
k <- 0:15
p <- factor(c(rep(0.25, length(k)), rep(0.4, length(k))))
pmf <- c(
  dbinom(k, size = max(k), prob = 0.25),
  dbinom(k, size = max(k), prob = 0.4)
)
MyData <- data.frame(k, p, pmf)

# position_dodge() nudges the two distributions apart at each k so their
# lines and points do not overlap.
ggplot(MyData, aes(x = k, y = pmf, group = p)) +
  geom_linerange(
    aes(ymax = pmf, color = p),
    ymin = 0,
    size = 1.25,
    position = position_dodge(width = 0.25)
  ) +
  geom_point(
    aes(color = p),
    size = 3.5,
    position = position_dodge(width = 0.25)
  ) +
  ylab("Pr{k}") +
  ggtitle("Two Binomial Distributions, n=15, p=0.25 and p=0.4")
Label text is too small? Use theme()
You can use factors to separate different plots straightforwardly
library(ggplot2)
library(MASS) # Provides the birthwt data set used below

# Work on a copy of birthwt in which the smoke indicator is a factor, so it
# can drive a discrete fill scale.
birthwt1 <- transform(birthwt, smoke = factor(smoke))

# One semi-transparent density curve of birth weight per smoke level.
ggplot(birthwt1, aes(x = bwt, fill = smoke)) +
  geom_density(alpha = 0.3) +
  xlab("Birth Weight (g)") +
  ylab("Distribution Density") +
  scale_fill_discrete(name = "Mom Smoked?", labels = c("No", "Yes")) +
  theme(text = element_text(size = 20, family = "Times"))
library(ggplot2)
library(MASS) # Provides the birthwt data set used below

# Pull out birth weight and the smoke indicator (as a factor) and pair them
# in a two-column data frame.
bwt <- birthwt$bwt
smoke <- as.factor(birthwt$smoke)
MyData <- data.frame(bwt, smoke)

# Density-scaled histograms with 500 g bins, one per smoke level, placed
# side by side via position_dodge().
ggplot(MyData, aes(x = bwt, fill = smoke)) +
  geom_histogram(
    aes(y = ..density..),
    binwidth = 500,
    position = position_dodge(width = 500),
    color = "black"
  ) +
  xlab("Birth Weight (g)") +
  ylab("Distribution Density") +
  scale_fill_discrete(name = "Mom Smoked?", labels = c("No", "Yes")) +
  theme(text = element_text(size = 20, family = "Times"))
You can use stat_density2d to create contour density plots
library(ggplot2)
library(gcookbook)

# Contour lines of the 2D density estimate of eruption length vs. waiting
# time in the `faithful` data set; line color encodes the density level.
# The title spells the geyser's name correctly ("Faithful", not "Gaithful").
ggplot(faithful, aes(x = eruptions, y = waiting)) +
  stat_density2d(aes(color = ..level..), size = 1.5) +
  xlab("Eruption Time (min)") +
  ylab("Time Between Eruptions (min)") +
  scale_color_continuous(name = "Distribution\nDensity") +
  ggtitle("Old Faithful Geyser Eruptions") +
  theme(text = element_text(size = 20, family = "Times"))
Use geom_point for scatter plots of numeric values
library(ggplot2)
library(MASS) # Provides the Boston data set used below

# Scatter plot of the Boston data: x = age, y = medv, with point size mapped
# to crim and point color mapped to dis.
# The size range and the legend title are set in ONE size scale.  Adding two
# separate size scales makes the second replace the first, which silently
# discards the requested range.
ggplot(Boston, aes(x = age, y = medv, size = crim, color = dis)) +
  geom_point() +
  scale_size_continuous(name = "Township\nCrime Rate", range = c(2.5, 10)) +
  scale_color_continuous(name = "Distance to\nEmployment") +
  xlab("Age of Home") +
  ylab("Median Home Value (thousands)") +
  ggtitle("Houses of Boston") +
  theme(text = element_text(size = 20, family = "Times"))
The standard R function pairs allows us to see all pairwise scatter plots
# Scatter-plot matrix of the first four iris columns, drawn with solid dots.
pairs(iris[, 1:4], pch = 19)
If you install the GGally library, you get a ggplot version with ggpairs
library(GGally)

# ggplot-style pairwise plot matrix of every iris column.  The library()
# call and the plot expression are separate statements; fused on one line
# they do not parse.
ggpairs(iris) +
  theme(text = element_text(size = 20, family = "Times"))
library(ggplot2)
library(gcookbook)

# One line per AgeGroup in uspopage; both the color and the thickness of
# each line are mapped to AgeGroup.
ggplot(uspopage, aes(x = Year, y = Thousands, group = AgeGroup)) +
  geom_line(aes(color = AgeGroup, size = AgeGroup)) +
  xlab("Year") +
  ylab("Number of People in US (thousands)") +
  theme(text = element_text(size = 20, family = "Times"))
library(ggplot2)
library(gcookbook)

# Rebuild the uspopage columns with the AgeGroup factor levels reversed.
# The level order is what determines the stacking and legend order; the old
# `order` aesthetic was deprecated in ggplot2 2.0.0 and has no effect, so it
# is no longer mapped.
Year <- uspopage$Year
Thousands <- uspopage$Thousands
AgeGroup <- factor(uspopage$AgeGroup, levels = rev(levels(uspopage$AgeGroup)))
MyData <- data.frame(Year, Thousands, AgeGroup)

# Stacked area chart, one grey-scale band per age group.
ggplot(MyData, aes(x = Year, y = Thousands, fill = AgeGroup)) +
  geom_area() +
  scale_fill_grey(start = 0.8, end = 0) +
  xlab("Year") +
  ylab("Number of People in US (thousands)") +
  theme(text = element_text(size = 20, family = "Times"))
We can make “grouped” bar plots using dodge
library(ggplot2)
library(gcookbook) # Provides the cabbage_exp data set

# Grouped bar chart: for each Date, the bars for the Cultivar levels are
# placed side by side via position = "dodge".  Each call is its own
# statement; fused on one line they do not parse.
ggplot(cabbage_exp, aes(x = Date, y = Weight, fill = Cultivar)) +
  geom_bar(stat = "identity", position = "dodge", color = "white") +
  scale_fill_brewer(palette = "Set1") +
  theme(text = element_text(size = 20, family = "Times"))
By default, ggplot wants to stack …
library(ggplot2)
library(gcookbook) # Provides the cabbage_exp data set

# Same bars without a position argument: geom_bar() stacks the Cultivar
# levels on top of each other for each Date.  Each call is its own
# statement; fused on one line they do not parse.
ggplot(cabbage_exp, aes(x = Date, y = Weight, fill = Cultivar)) +
  geom_bar(stat = "identity", color = "white") +
  scale_fill_brewer(palette = "Set1") +
  theme(text = element_text(size = 20, family = "Times"))
library(vcd)

# Mosaic plot of the HairEyeColor contingency table.
# vcd::mosaic() is not a ggplot object, so a ggplot2 theme() cannot be added
# to it; doing so styled nothing and only produced a stray NULL result.
mosaic(HairEyeColor)
Florence Nightingale used coxcomb plots to convince the British that the biggest threat to their soldiers during the Crimean War was preventable disease
library(ggplot2)

# Load the monthly death counts from the course web site.
nightengale <- read.csv(
  "http://eecs.ucf.edu/~wiegand/ids6938/datasets/nightengale.csv",
  header = TRUE
)

# Date holds month/year text; prefix a day so it can be parsed as a Date.
Month <- as.Date(paste("01", nightengale$Date), "%d %B %Y")
DeathType <- factor(nightengale$DeathType, ordered = TRUE)

# Deaths per 1000 soldiers, divided by pi and square-rooted.  Presumably
# this makes the plotted value the radius of a circle whose area equals the
# rate, so wedge area rather than radius tracks the rate (confirm intent).
DeathRate <- sqrt((1000 * nightengale$NumDeaths / nightengale$AvgArmySize) / pi)
MyData <- data.frame(Month, DeathType, DeathRate)

# Stacked bars wrapped into polar coordinates give the coxcomb shape.
# The `order` aesthetic was deprecated in ggplot2 2.0.0 and has no effect,
# so it is no longer mapped; the stacking follows the DeathType levels.
ggplot(MyData, aes(x = Month, y = DeathRate, fill = DeathType)) +
  geom_bar(stat = "identity") +
  coord_polar() +
  scale_x_date(
    breaks = MyData$Month,
    labels = format(MyData$Month, "%b %Y")
  ) +
  theme(text = element_text(size = 20, family = "Times"))
# Parallel-coordinates plot of the four numeric iris columns, one line per
# flower, colored by Species.  parcoord() comes from MASS, which is attached
# earlier in this document.
parcoord(
  iris[1:4],
  col = iris$Species,
  lwd = 2,
  main = "Iris Dataset"
)
library(ggplot2)

# Notched box plots of sepal length, one box per iris species, with enlarged
# outlier markers.
ggplot(iris, aes(x = Species, y = Sepal.Length)) +
  geom_boxplot(notch = TRUE, outlier.size = 3) +
  ylab("Iris Sepal Length (cm)") +
  theme(text = element_text(size = 20, family = "Times"))
Analytical Navigation: Visual navigation through data as means to learn something about patterns, relationships, and idiosyncrasies within it (and the underlying phenomena that produced it)
“Data analysis, like experimentation, must be considered as an open-minded, highly interactive, iterative process …” – John Tukey
“Overview first, zoom and filter, then details-on-demand” – Ben Shneiderman
Like a paper: abstract, method, then results
We report fuel economy unhelpfully in the U.S.: we care about cost, not distance. Increasing the mileage of inefficient cars a little yields greater cost savings than increasing the mileage of efficient cars a lot.