etienne — Jun 6, 2013, 11:03 PM
# ggplot2 graphics tutorial
# Etienne Laliberte
# June 4, 2013
# the diamonds dataset
library(ggplot2)
data(diamonds)
head(diamonds)
carat cut color clarity depth table price x y z
1 0.23 Ideal E SI2 61.5 55 326 3.95 3.98 2.43
2 0.21 Premium E SI1 59.8 61 326 3.89 3.84 2.31
3 0.23 Good E VS1 56.9 65 327 4.05 4.07 2.31
4 0.29 Premium I VS2 62.4 58 334 4.20 4.23 2.63
5 0.31 Good J SI2 63.3 58 335 4.34 4.35 2.75
6 0.24 Very Good J VVS2 62.8 57 336 3.94 3.96 2.48
# keep only the rows whose cut is "Fair" or "Good", then preview four columns
diam <- diamonds[diamonds$cut %in% c("Fair", "Good"), ]
head(diam[, c("carat", "cut", "color", "price")])
carat cut color price
3 0.23 Good E 327
5 0.31 Good J 335
9 0.22 Fair E 337
11 0.30 Good J 339
18 0.30 Good J 351
19 0.30 Good J 351
# when base graphics get frustrating
# Draws two side-by-side scatterplots of price against carat, one per cut,
# with the points coloured by diamond colour and a legend on the first panel.
# par() returns the settings it replaces; keep them so the one-panel layout
# can be restored once both panels are drawn.
old_par <- par(mfrow = c(1, 2))
# one rainbow colour per level of diam$color, looked up by level position so
# the point colours always agree with the legend built from the same levels
rbow <- rainbow(nlevels(diam$color))
cols <- rbow[as.integer(diam$color)]
diam$cols <- cols
# left panel: "Fair" diamonds
fair <- diam[diam$cut == "Fair", ]
plot(fair$carat, fair$price,
  main = "Fair", xlab = "carat", ylab = "price", pch = 19,
  col = fair$cols)
legend("bottomright", legend = levels(diam$color), pch = 19, col = rbow)
# right panel: "Good" diamonds
good <- diam[diam$cut == "Good", ]
plot(good$carat, good$price,
  main = "Good", xlab = "carat", ylab = "price", pch = 19,
  col = good$cols)
# put the graphics parameters back the way they were
par(old_par)
# multi-panel conditioning using ggplot2
# only one line of code!
# facets = . ~ cut gives one panel per cut, laid out as columns;
# colour = color maps the diamond colour to the point colour
qplot(x = carat, y = price, colour = color, facets = . ~ cut, data = diam)
# getting started with qplot (quick plot)
# fix the random seed so the same 100 rows are sampled on every run
set.seed(1410)
dsmall <- diamonds[sample(nrow(diamonds), 100), ]
qplot(x = carat, y = price, data = dsmall)
# transform the variables themselves before plotting ...
qplot(x = log(carat), y = log(price), data = dsmall)
# ... or keep the raw variables and put both axes on a log scale
qplot(x = carat, y = price, data = dsmall, log = "xy")
# this does the same as above
qplot(x = carat, y = price, data = dsmall) + scale_y_log10() +
scale_x_log10()
# colour, size, shape
# each call maps one variable of dsmall to one visual property of the points
qplot(carat, price, data = dsmall, colour = color)
qplot(x = carat, y = price, data = dsmall, size = x)
qplot(carat, price, data = dsmall, shape = cut)
# setting vs mapping
# I() sets the colour to the constant "blue" ...
qplot(carat, price, data = dsmall, colour = I("blue") )
# ... without I() the string "blue" is mapped like a data value, so the points
# get a colour from the default scale (plus a legend) instead of being blue
qplot(carat, price, data = dsmall, colour = "blue")
# dealing with overplotting
# full dataset: many points overlap
qplot(carat, price, data = diamonds)
# alpha is set (not mapped) to 1/100, making each point almost transparent
qplot(carat, price, data = diamonds, alpha = I(1/100))
# other 2D geoms
qplot(carat, price, data = dsmall, geom = c("point", "smooth"))
geom_smooth: method="auto" and size of largest group is <1000, so using
loess. Use 'method = x' to change the smoothing method.
qplot(carat, price, data = dsmall, geom = "jitter",
position = position_jitter(width = 1, height = 1))
qplot(carat, price, data = dsmall, geom = "polygon")
qplot(carat, price, data = dsmall, geom = "line")
qplot(carat, price, data = dsmall, geom = "path")
# smoothers
qplot(carat, price, data = dsmall, geom = c("point", "smooth"))
geom_smooth: method="auto" and size of largest group is <1000, so using
loess. Use 'method = x' to change the smoothing method.
# se = FALSE draws the smooth without its confidence band
qplot(carat, price, data = dsmall, geom = c("point", "smooth"), se = FALSE)
geom_smooth: method="auto" and size of largest group is <1000, so using
loess. Use 'method = x' to change the smoothing method.
qplot(carat, price, data = dsmall, geom = c("point", "smooth"), span = 0.2)
geom_smooth: method="auto" and size of largest group is <1000, so using
loess. Use 'method = x' to change the smoothing method.
qplot(carat, price, data = dsmall, geom = c("point", "smooth"), method = "lm")
# boxplots and jittered points
qplot(color, price, data = dsmall, geom = "boxplot")
qplot(color, price, data = diamonds, geom = "boxplot")
qplot(color, price, data = diamonds, geom = "jitter", alpha = I(1/10))
# histograms and density plots
qplot(carat, data = diamonds, geom = "histogram") # play with binwidth (1, 0.1, 0.01)
stat_bin: binwidth defaulted to range/30. Use 'binwidth = x' to adjust
this.
qplot(carat, data = diamonds, geom = "density") # play with adjust (0.5, 1, 3)
# splitting by groups
qplot(carat, data = diamonds, geom = "histogram", fill = color)
stat_bin: binwidth defaulted to range/30. Use 'binwidth = x' to adjust
this.
qplot(carat, data = diamonds, geom = "density", colour = color)
# barplots
# by default each bar counts the rows in that colour class
qplot(color, data = dsmall, geom = "bar")
# weight = carat makes each bar the sum of carat in the class instead of a count
qplot(color, data = dsmall, geom = "bar", weight = carat, ylab = "sum carat")
# barplots: stacking, filling, dodging
# fill = cut splits every bar by cut; position decides how the pieces are arranged:
# "stack" piles them on top of each other
qplot(color, data = dsmall, geom = "bar", fill = cut, position = "stack")
# "fill" stacks them and rescales every bar to the same total height
qplot(color, data = dsmall, geom = "bar", fill = cut, position = "fill")
# "dodge" places them side by side
qplot(color, data = dsmall, geom = "bar", fill = cut, position = "dodge")
# time series
economics[1:4,]# US economic data over 40 years
date pce pop psavert uempmed unemploy
1 1967-06-30 507.8 198712 9.8 4.5 2944
2 1967-07-31 510.9 198911 9.8 4.7 2945
3 1967-08-31 516.7 199113 9.0 4.6 2958
4 1967-09-30 513.3 199311 9.8 4.9 3143
# the outer parentheses print the plot while also storing it in p
(p <- qplot(date, unemploy, data = economics, geom = "line", ylab = "unemployed (1000s)") )
# same kind of line plot for the uempmed column (axis label spelling corrected)
qplot(date, uempmed, data = economics, geom = "line", ylab = "unemployment (weeks)")
# time series and groups
library(nlme)
data(Oxboys) ; Oxboys[1:3,]
Grouped Data: height ~ age | Subject
Subject age height Occasion
1 1 -1.0000 140.5 1
2 1 -0.7479 143.4 2
3 1 -0.4630 144.8 3
# without a grouping variable every observation is joined into a single line
qplot(age, height, data = Oxboys, geom = "line")
# group = Subject draws one separate line per Subject
qplot(age, height, data = Oxboys, group = Subject, geom = "line")
# facetting
# one panel per cut, laid out as columns
qplot(carat, price, data = diamonds, facets = . ~ cut)
# facet_grid vs facet_wrap
# facet_grid(. ~ cut): a single row of panels, one column per cut
qplot(carat, price, data = dsmall) + facet_grid(. ~ cut)
# facet_wrap(~ cut, nrow = 2): the same panels wrapped over two rows
qplot(carat, price, data = dsmall) + facet_wrap(~ cut, nrow = 2)
# facets and scales
# default: all panels share the same x and y axis ranges
qplot(carat, price, data = dsmall) + facet_wrap(~ cut, nrow = 1)
# scales = "free": every panel gets its own x and y axis ranges
qplot(carat, price, data = dsmall) + facet_wrap( ~ cut, nrow = 1, scales = "free")
# qplot vs ggplot
# three ways of writing the same scatterplot of price against carat
# 1. qplot shorthand
qplot(x = carat, y = price, data = dsmall)
# 2. fully explicit: data and aesthetic mapping in ggplot(), then a layer with a point geom
# NOTE(review): layer() called with only geom = may require explicit stat and
# position arguments in newer ggplot2 releases; confirm against the installed version
ggplot(data = dsmall, mapping = aes(x = carat, y = price)) +
layer(geom = "point")
# 3. the usual form: positional arguments plus the geom_point() shortcut
ggplot(dsmall, aes(carat, price)) +
geom_point()
# avoiding duplication
# a layer can be stored in a variable and added to any number of plots;
# se is spelled FALSE because F is an ordinary variable that can be reassigned
bestfit <- geom_smooth(method = "lm", se = FALSE, colour = "orange", size = 2)
qplot(carat, price, data = dsmall) + bestfit
qplot(depth, price, data = dsmall) + bestfit
# mapping: global vs local
# colour mapped in qplot() applies to every layer, so geom_smooth() also
# fits and colours one regression line per cut
qplot(carat, price, colour = cut, data = dsmall) + geom_smooth(method = 'lm', se = FALSE)
# colour mapped only inside geom_point(): the points are coloured by cut but
# the smooth layer does not see the mapping and fits a single line
qplot(carat, price, geom = 'smooth', method = 'lm', se = FALSE, data = dsmall) + geom_point(aes(colour = cut))
# FYI, the following two lines do the same in 'full ggplot' grammar
ggplot(dsmall, aes(carat, price, colour = cut)) + geom_point() + geom_smooth(method = "lm", se = FALSE)
ggplot(dsmall, aes(carat, price)) + geom_point(aes(colour = cut)) + geom_smooth(method = "lm", se = FALSE)
# mapping vs setting
qplot(carat, price, data = dsmall) + geom_point(aes(colour = cut))
qplot(carat, price, data = dsmall) + geom_point(colour = "red")
# changing labels
dsmall$microg <- dsmall$carat * 200 * 1000
qplot(microg, price, colour = cut, data = dsmall) +
xlab(expression(Weight~(mu*g)) ) +
ylab("Price ($US)") +
scale_colour_discrete(name = "What is\nthe cut?")
# changing scales (e.g. colour)
# qplot() with x and y already draws the points, so no extra geom_point()
# layer is added (it would only draw every point a second time)
p <- qplot(carat, price, colour = cut, data = dsmall)
# same plot with three different colour scales for cut
p + scale_colour_brewer(palette = "Set1")
p + scale_colour_manual(values = c("red","blue", "green", "orange", "purple"))
p + scale_colour_grey(start = 0, end = 0.9) + theme_bw()
# flipping axes
qplot(color, price, data = dsmall, geom = "boxplot") + coord_flip()
# themes
qplot(carat, price, data = dsmall) + theme_grey()
qplot(carat, price, data = dsmall) + theme_bw()
# arranging multiple plots
library(gridExtra)
Loading required package: grid
p1 <- qplot(carat, price, data = dsmall)
p2 <- qplot(color, price, data = dsmall, geom = "boxplot")
grid.arrange(p1, p2, ncol = 2)
# barplot with error bars
# helpers giving the error-bar limits as mean -/+ one standard error

# se: standard error of the mean of x (standard deviation divided by the
# square root of the number of values)
se <- function(x) {
  n_obs <- length(x)
  sd(x) / sqrt(n_obs)
}

# se.min: lower error-bar limit, the mean of x minus one standard error
se.min <- function(x) {
  centre <- mean(x)
  centre - se(x)
}

# se.max: upper error-bar limit, the mean of x plus one standard error
se.max <- function(x) {
  centre <- mean(x)
  centre + se(x)
}
# plot
# orange bars show the mean carat per colour class; the error bars run from
# se.min to se.max, i.e. mean minus / plus one standard error.
# The outer parentheses print the plot while also storing it in p.
# NOTE(review): fun.y / fun.ymin / fun.ymax were renamed fun / fun.min / fun.max
# in ggplot2 3.3.0; update the argument names if running a recent release.
(p <- ggplot(dsmall, aes(color, carat) ) +
stat_summary(fun.y = mean, geom = "bar", fill = "orange") +
stat_summary(fun.ymin = se.min, fun.ymax = se.max,
geom = "errorbar", width = 0.5) +
# show the y axis from 0 to 1.8
coord_cartesian(ylim = c(0, 1.8)) )
# barplots and error bars
# create a smaller dataset
library(plyr)
# se.max (mean + SE) of every numeric column within each colour class;
# only the grouping column and carat are kept
dsmall2 <- ddply(dsmall, .(color), numcolwise(se.max))[, c("color", "carat")]
# one letter per colour class, in row order: a a a b b c c
dsmall2$lett <- rep(c("a", "b", "c"), times = c(3, 2, 2))
dsmall2
color carat lett
1 D 0.7365 a
2 E 0.6694 a
3 F 0.9159 a
4 G 1.0116 b
5 H 1.1941 b
6 I 1.5020 c
7 J 1.5341 c
# plot the barplot with the letters
# the labels sit at the carat values stored in dsmall2 (mean + SE per colour);
# vjust = -0.5 lifts the text just above that position
p + geom_text(aes(label = lett, vjust = -0.5), data = dsmall2)
# boxplots with letters
letts <- dsmall2$lett
# boxplot(..., plot = FALSE) only computes the statistics; row 5 of $stats is
# the upper whisker end of each box, used here as the label height
lettpos <- boxplot(carat ~ color, data = dsmall, plot = FALSE)$stats[5, ]
# one row per colour level: where to put the letter and which letter to show
lettdf <- data.frame(color = levels(dsmall$color), carat = lettpos, lett = letts)
qplot(color, carat, data = dsmall, geom = "boxplot") +
geom_text(aes(label = lett, vjust = -0.5), data = lettdf) +
ylim(c(0, 2.75))
# map of australia
# NOTE(review): ozdata.rda is read from the working directory and presumably
# provides the ozdata object (columns long, lat, state) used below; confirm
load("ozdata.rda") # check the data
# polygons filled by state; coord_equal() draws one unit of longitude and one
# unit of latitude at the same length
ozplot <- qplot(long, lat, data = ozdata, geom = "polygon", fill = state) + coord_equal()
ozplot
# fraud cases in australia
# ozcrime.txt is read with a header row; its long, lat, fraud and city columns are used below
ozcrime <- read.table("ozcrime.txt", header = TRUE)
# base map from ozdata, then two layers that take their data from ozcrime
ggplot(data = ozdata, aes(x = long, y = lat)) +
geom_polygon(aes(fill = state)) +
# red points whose size is mapped to the fraud column
geom_point(data = ozcrime, aes(x = long, y = lat, size = fraud), colour = "red") +
# city names, shifted below the points by vjust = 1.5
geom_text(data = ozcrime, aes(x = long, y = lat, label = city, vjust = 1.5))+
# point sizes span 3 to 9
scale_size(range = c(3, 9)) +
coord_fixed()
# zooming in on SWA
# restrict the visible window of the existing map to longitude 114 to 118 and
# latitude -35.5 to -31, and hide the legend.
# Both limits are written as c(min, max) so the orientation of the map does
# not depend on how reversed limits are treated.
ozplot +
  coord_fixed(xlim = c(114, 118),
    ylim = c(-35.5, -31) ) +
  theme(legend.position = 'none')
# saving output
# write the same scatterplot to disk twice, 4 wide by 3 high:
# once as a 300 dpi PNG and once as a PDF
p <- qplot(carat, price, data = dsmall)
ggsave(filename = "myplot.png", plot = p, width = 4, height = 3, dpi = 300)
ggsave(filename = "myplot.pdf", plot = p, width = 4, height = 3)