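This document demonstrates different ways of generating scatter plots for large datasets with the ggplot2 plotting package and base R graphics. The plots use an artificial dataset of 100,000 points whose two coordinates are drawn independently from a standard normal distribution.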
library(knitr)
# Global knitr chunk defaults: echo the code, leave it unformatted, silence
# warnings and messages, and use 10 x 5 left-aligned figures.
knitr::opts_chunk$set(
  echo = TRUE,
  tidy = FALSE,
  warning = FALSE,
  message = FALSE,
  fig.width = 10,
  fig.height = 5,
  fig.align = "left"
)

# Allow printed output to use up to 120 characters per line.
options(width = 120)
library(ggplot2)
library(colorspace)
library(gridExtra)
Standard scatterplots of large datasets are blobs that obscure structure in the data. We can mitigate the problem somewhat by manipulating the alpha transparency of geom_point(), which begins to reveal the varying density of data points.
# Generate an artificial, large dataset: two independent standard-normal
# samples of equal length, stored both as vectors (x1, x2) and as the
# two-column data frame bigdf.
#
# The seed is fixed so that every render of the document draws the same
# points and therefore produces the same figures.
set.seed(1234)

# Single source of truth for the sample size (100K points).
n_points <- 100000L

x1 <- rnorm(n_points)
x2 <- rnorm(n_points)

# Build the data frame directly rather than via cbind(), which would first
# create an intermediate matrix only to convert it back.
bigdf <- data.frame(x1 = x1, x2 = x2)
# Base plot shared by both panels: x1 against x2 with a common title and
# axis labels. No geometry is attached yet.
p <- ggplot(bigdf, aes(x = x1, y = x2)) +
  ggtitle("Plot of 100K Point Dataset") +
  labs(x = "x1", y = "x2")

# Left panel: default opaque points.
p1 <- p + geom_point()

# Right panel: nearly transparent purple points on a black-and-white theme,
# so that overlapping points accumulate into darker regions.
p2 <- p +
  theme_bw() +
  geom_point(colour = "purple", alpha = 0.01)

# Draw the two panels side by side.
grid.arrange(p1, p2, ncol = 2)
Using density lines or heatmaps can reveal structure in large datasets.
# Base plot shared by both panels: x1 against x2 with a common title and
# axis labels. No geometry is attached yet.
p <- ggplot(bigdf, aes(x = x1, y = x2)) +
  ggtitle("Plot of 100K Point Dataset") +
  labs(x = "x1", y = "x2")

# Left panel: faint orange points with 2D density contour lines drawn on top.
p1 <- p +
  theme_bw() +
  geom_point(colour = "orange", alpha = 0.01) +
  geom_density2d()

# Right panel: hexagonal binning with white cell borders; the fill follows a
# purple-to-green gradient labelled "Frequency".
p2 <- p +
  stat_bin_hex(na.rm = TRUE, colour = "white") +
  scale_fill_gradientn(
    name = "Frequency",
    colours = c("purple", "green"),
    na.value = NA
  )

# Draw the two panels side by side.
grid.arrange(p1, p2, ncol = 2)
# create a color palette to use in smoothed scatterplot
library(RColorBrewer)
# Eleven-step diverging palette running from dark blue through pale yellow
# to dark red.
buylrd <- c(
  "#313695", "#4575B4", "#74ADD1", "#ABD9E9", "#E0F3F8",
  "#FFFFBF",
  "#FEE090", "#FDAE61", "#F46D43", "#D73027", "#A50026"
)

# Palette function: myColRamp(n) returns n colours interpolated along buylrd.
myColRamp <- colorRampPalette(buylrd)
# Smoothed scatterplot of the same x1/x2 points, drawn with base graphics
# and coloured with the myColRamp palette function.
smoothScatter(
  x1, x2,
  main = "Plot of 100K Point Dataset",
  xlab = "x1",
  ylab = "x2",
  colramp = myColRamp
)
# Report the R version, platform, and package versions in use for this run.
utils::sessionInfo()
## R version 3.3.3 (2017-03-06)
## Platform: x86_64-apple-darwin13.4.0 (64-bit)
## Running under: macOS Sierra 10.12.3
##
## locale:
## [1] en_US.UTF-8/en_US.UTF-8/en_US.UTF-8/C/en_US.UTF-8/en_US.UTF-8
##
## attached base packages:
## [1] stats graphics grDevices utils datasets methods base
##
## other attached packages:
## [1] RColorBrewer_1.1-2 hexbin_1.27.1 gridExtra_2.2.1 colorspace_1.3-2 ggplot2_2.2.1 knitr_1.15.1
##
## loaded via a namespace (and not attached):
## [1] Rcpp_0.12.10 magrittr_1.5 MASS_7.3-45 munsell_0.4.3 lattice_0.20-35 stringr_1.2.0
## [7] plyr_1.8.4 tools_3.3.3 grid_3.3.3 gtable_0.2.0 KernSmooth_2.23-15 htmltools_0.3.5
## [13] yaml_2.1.14 lazyeval_0.2.0 rprojroot_1.2 digest_0.6.12 tibble_1.3.0 evaluate_0.10
## [19] rmarkdown_1.4 labeling_0.3 stringi_1.1.3 scales_0.4.1 backports_1.0.5