Introduction

This document demonstrates several ways of drawing scatter plots of large datasets, using the ggplot2 package, the base-graphics smoothScatter() function, and the tabplot package. All plots use the diamonds dataset that ships with ggplot2.

# Document setup: load knitr and set the defaults applied to every chunk --
# tidied code echo, 10 x 5 left-aligned figures, and no warnings or messages
# in the rendered output.
library(knitr)
knitr::opts_chunk$set(
  tidy = TRUE, # spelled out: `T` is an ordinary variable that can be reassigned
  fig.width = 10,
  fig.height = 5,
  fig.align = "left",
  warning = FALSE,
  message = FALSE,
  echo = TRUE
)

# Allow printed output to use up to 120 columns.
options(width = 120)

# Plotting packages used throughout the document.
library(ggplot2)
library(colorspace)
library(gridExtra)

# NOTE(review): attach() is generally discouraged because it puts the columns
# of `diamonds` on the search path, where they can be masked by (or mask)
# other objects. It is kept here because code elsewhere in this document may
# refer to columns by bare name; prefer `diamonds$col` or `with(diamonds, ...)`
# in new code.
attach(diamonds)

The Overplotting Problem

Standard scatterplots of large datasets are blobs that obscure structure in the data. We can mitigate the problem somewhat by manipulating the alpha transparency of geom_point().

# Base plot shared by both panels: price against carat weight, raw scales.
p <- ggplot(diamonds, aes(x = carat, y = price)) +
  labs(
    title = "Price vs. Carat Weight",
    x = "Carat Weight",
    y = "Price"
  )

# Left panel: default opaque points, which pile up into a solid mass.
p1 <- p + geom_point()

# Right panel: nearly transparent points so dense regions appear darker.
p2 <- p +
  geom_point(alpha = 0.1, colour = "purple") +
  theme_bw()

# Show the two variants side by side.
grid.arrange(p1, p2, ncol = 2)

Better Solutions to Overplotting

ggplot2 Scatterplots

Overlaying 2-D density contours or binning the points into a heatmap can reveal structure in large datasets. Applying a log10 transform to both price and carat further helps to expose clusters and price points around standard carat weights (1/4, 1/2, 1, 1.5, 2, 3).

# Base plot on log10 scales, with the y axis restricted to [2.5, 4.5].
# `p` is defined at top level so that following chunks can build on it.
p <- ggplot(diamonds, aes(x = log10(carat), y = log10(price))) +
  labs(
    title = "Price vs. Carat Weight",
    x = "log10(Carat Weight)",
    y = "log10(Price)"
  ) +
  ylim(2.5, 4.5)

# Left panel: translucent points with 2-D density contours drawn on top.
p1 <- p +
  geom_point(alpha = 0.1, colour = "orange") +
  geom_density2d() +
  theme_bw()

# Right panel: the density contours alone.
p2 <- p +
  geom_density2d() +
  theme_bw()

grid.arrange(p1, p2, ncol = 2)

# Hexagonal-bin heatmaps built on the log10 base plot `p` defined earlier.
# Both panels bin the same data; they differ only in the fill gradient and
# in the theme.

# Left panel: purple-to-green gradient on the default theme.
p1 <- p +
  stat_bin_hex(colour = "white", na.rm = TRUE) +
  scale_fill_gradientn(
    colours = c("purple", "green"),
    name = "Frequency",
    na.value = NA
  )

# Right panel: blue-to-orange gradient on the black-and-white theme.
p2 <- p +
  stat_bin_hex(colour = "white", na.rm = TRUE) +
  scale_fill_gradientn(
    colours = c("blue", "orange"),
    name = "Frequency",
    na.value = NA
  ) +
  theme_bw()

grid.arrange(p1, p2, ncol = 2)

# Two log10 scatterplots that differ only in the variable mapped to point
# colour.

# Left panel: points coloured by the `color` column.
p1 <- ggplot(
  diamonds,
  aes(x = log10(carat), y = log10(price), colour = color)
) +
  labs(
    title = "Price vs. Carat Weight",
    x = "log10(Carat Weight)",
    y = "log10(Price)"
  ) +
  ylim(2.5, 4.5) +
  geom_point()

# Right panel: points coloured by the `cut` column.
p2 <- ggplot(
  diamonds,
  aes(x = log10(carat), y = log10(price), colour = cut)
) +
  labs(
    title = "Price vs. Carat Weight",
    x = "log10(Carat Weight)",
    y = "log10(Price)"
  ) +
  ylim(2.5, 4.5) +
  geom_point()

grid.arrange(p1, p2, ncol = 2)

Smoothed Scatterplots

# Create a colour palette to use in the smoothed scatterplot.
library(RColorBrewer)

# Eleven hex colours running from dark blue through pale yellow to dark red.
buylrd <- c(
  "#313695", "#4575B4", "#74ADD1", "#ABD9E9", "#E0F3F8", "#FFFFBF",
  "#FEE090", "#FDAE61", "#F46D43", "#D73027", "#A50026"
)

# colorRampPalette() returns a function that interpolates any number of
# colours along the ramp; smoothScatter() calls it through `colramp`.
myColRamp <- colorRampPalette(buylrd)

# Smoothed scatterplot of log10(price) against log10(carat).
# The columns are taken from `diamonds` explicitly so the call does not
# depend on the data frame having been attach()ed to the search path.
smoothScatter(
  x = log10(diamonds$carat),
  y = log10(diamonds$price),
  colramp = myColRamp,
  main = "Price vs. Carat Weight", # was misspelled "Caret" in the plot title
  xlab = "log10(Carat Weight)",
  ylab = "log10(Price)"
)

Visualization of large datasets with tabplot

# Table plot of five diamonds columns; rows are sorted on the first selected
# column (sortCol = 1).
library(tabplot)
tableplot(
  diamonds,
  select = c(carat, price, cut, color, clarity),
  sortCol = 1
)

Programming Environment

# Print the R version, platform, locale and attached/loaded package versions
# so the environment used to build this document is recorded with it.
sessionInfo()
## R version 3.3.3 (2017-03-06)
## Platform: x86_64-apple-darwin13.4.0 (64-bit)
## Running under: macOS Sierra 10.12.3
## 
## locale:
## [1] en_US.UTF-8/en_US.UTF-8/en_US.UTF-8/C/en_US.UTF-8/en_US.UTF-8
## 
## attached base packages:
## [1] stats     graphics  grDevices utils     datasets  methods   base     
## 
## other attached packages:
##  [1] tabplot_1.3-1      ffbase_0.12.3      ff_2.2-13          bit_1.1-12         RColorBrewer_1.1-2 hexbin_1.27.1     
##  [7] gridExtra_2.2.1    colorspace_1.3-2   ggplot2_2.2.1      knitr_1.15.1      
## 
## loaded via a namespace (and not attached):
##  [1] Rcpp_0.12.10       magrittr_1.5       MASS_7.3-45        munsell_0.4.3      lattice_0.20-35    fastmatch_1.1-0   
##  [7] stringr_1.2.0      plyr_1.8.4         tools_3.3.3        grid_3.3.3         gtable_0.2.0       KernSmooth_2.23-15
## [13] htmltools_0.3.5    yaml_2.1.14        lazyeval_0.2.0     rprojroot_1.2      digest_0.6.12      tibble_1.3.0      
## [19] formatR_1.4        evaluate_0.10      rmarkdown_1.4      labeling_0.3       stringi_1.1.3      scales_0.4.1      
## [25] backports_1.0.5