This document demonstrates different ways of generating scatter plots for large datasets with the ggplot2 plotting package. Plots make use of a generated dataset.
# Attach knitr first so its chunk options can be configured below.
library(knitr)

# Console output width, in characters, for printed results.
options(width = 120)

# Default chunk options for the document: fixed figure size and alignment,
# warnings and messages turned off, and source code echoed.
knitr::opts_chunk$set(
  tidy = FALSE,
  fig.width = 10,
  fig.height = 5,
  fig.align = "left",
  warning = FALSE,
  message = FALSE,
  echo = TRUE
)

# Packages attached for the plotting chunks; ggplot2 builds the plots and
# gridExtra supplies grid.arrange() for the side-by-side layouts.
library(ggplot2)
library(colorspace)
library(gridExtra)
# Note: package hexbin must be installed for the hex plot below to work.
# Standard scatterplots of large datasets are blobs that obscure structure in the data. We can mitigate the problem somewhat by manipulating the alpha transparency of geom_point(), which begins to reveal the varying density of data points.
# Generate an artificial, large dataset: two independent standard-normal
# variables with 100,000 observations each.
n_points = 100000
x1 = rnorm(n_points)
x2 = rnorm(n_points)
# Build the data frame directly from the vectors rather than via a cbind()
# matrix; the resulting columns are still named x1 and x2.
bigdf = data.frame(x1 = x1, x2 = x2)

# Base plot (data, title and axis labels) shared by both panels.
p = ggplot(bigdf, aes(x = x1, y = x2)) +
  ggtitle("Plot of 100K Point Dataset") +
  xlab("x1") +
  ylab("x2")

# Left panel: default opaque points, which merge into a solid blob.
p1 = p + geom_point()

# Right panel: nearly transparent points, so denser regions appear darker.
p2 = p +
  geom_point(alpha = 0.01, colour = "purple") +
  theme_bw()

# Draw the two panels side by side.
grid.arrange(p1, p2, ncol = 2)

# Using density lines or heatmaps can reveal structure in large datasets.
# Shared base layer: same data, title and axis labels for both panels.
p = ggplot(bigdf, aes(x = x1, y = x2)) +
  labs(title = "Plot of 100K Point Dataset", x = "x1", y = "x2")

# Left panel: faint orange points overlaid with 2D density contour lines.
faint_points = geom_point(alpha = 0.01, colour = "orange")
p1 = p + faint_points + geom_density2d() + theme_bw()

# Right panel: hexagonal bins outlined in white, filled on a purple-to-green
# gradient whose legend is titled "Frequency".
hex_fill = scale_fill_gradientn(
  colours = c("purple", "green"),
  name = "Frequency",
  na.value = NA
)
p2 = p + geom_hex(colour = "white", na.rm = TRUE) + hex_fill

# Draw the two panels side by side.
grid.arrange(p1, p2, ncol = 2)
# create a color palette to use in smoothed scatterplot
# NOTE(review): no RColorBrewer function is called in the visible code; the
# palette below is given as literal hex codes. The library call is kept as-is.
library(RColorBrewer)

# Eleven hex colours running from dark blue through pale yellow to dark red.
buylrd = c("#313695", "#4575B4", "#74ADD1", "#ABD9E9", "#E0F3F8", "#FFFFBF",
           "#FEE090", "#FDAE61", "#F46D43", "#D73027", "#A50026")
# colorRampPalette() returns a function that interpolates these colours into
# a palette of any requested length, as smoothScatter()'s colramp expects.
myColRamp = colorRampPalette(buylrd)

# Smoothed scatterplot of the two vectors, coloured with the ramp above.
smoothScatter(x = x1, y = x2,
              colramp = myColRamp,
              main = "Plot of 100K Point Dataset",
              xlab = "x1",
              ylab = "x2")

# Record the R version and package versions used to build this document.
sessionInfo()
## R version 3.4.3 (2017-11-30)
## Platform: x86_64-apple-darwin15.6.0 (64-bit)
## Running under: macOS High Sierra 10.13.6
##
## Matrix products: default
## BLAS: /Library/Frameworks/R.framework/Versions/3.4/Resources/lib/libRblas.0.dylib
## LAPACK: /Library/Frameworks/R.framework/Versions/3.4/Resources/lib/libRlapack.dylib
##
## locale:
## [1] en_US.UTF-8/en_US.UTF-8/en_US.UTF-8/C/en_US.UTF-8/en_US.UTF-8
##
## attached base packages:
## [1] stats graphics grDevices utils datasets methods base
##
## other attached packages:
## [1] RColorBrewer_1.1-2 hexbin_1.27.2 gridExtra_2.3 colorspace_1.3-2 ggplot2_3.0.0 knitr_1.20
##
## loaded via a namespace (and not attached):
## [1] Rcpp_0.12.18 pillar_1.3.0 compiler_3.4.3 plyr_1.8.4 bindr_0.1.1 tools_3.4.3
## [7] digest_0.6.15 evaluate_0.11 tibble_1.4.2 gtable_0.2.0 lattice_0.20-35 pkgconfig_2.0.1
## [13] rlang_0.2.1 yaml_2.2.0 bindrcpp_0.2.2 withr_2.1.2 dplyr_0.7.6 stringr_1.3.1
## [19] rprojroot_1.3-2 grid_3.4.3 tidyselect_0.2.4 glue_1.3.0 R6_2.2.2 rmarkdown_1.10
## [25] purrr_0.2.5 magrittr_1.5 backports_1.1.2 scales_0.5.0 htmltools_0.3.6 MASS_7.3-50
## [31] assertthat_0.2.0 labeling_0.3 KernSmooth_2.23-15 stringi_1.2.4 lazyeval_0.2.1 munsell_0.5.0
## [37] crayon_1.3.4