This document demonstrates different ways of generating scatter plots for large datasets with the ggplot2 and tabplot plotting packages. Plots make use of the diamonds dataset.
library(knitr)
# Global knitr options applied to every code chunk in this document:
# tidy the echoed source, use 10 x 5 left-aligned figures, suppress warnings
# and messages, and echo the code alongside its output.
knitr::opts_chunk$set(
  # Spelled out as TRUE: `T` is an ordinary variable that can be reassigned.
  tidy = TRUE,
  fig.width = 10,
  fig.height = 5,
  fig.align = "left",
  warning = FALSE,
  message = FALSE,
  echo = TRUE
)

# Allow up to 120 characters per line of printed console output.
options(width = 120)
library(ggplot2)
library(colorspace)
library(gridExtra)
attach(diamonds)

Standard scatterplots of large datasets are blobs that obscure structure in the data. We can mitigate the problem somewhat by lowering the alpha transparency of the points drawn by geom_point().
# Base plot of price against carat weight; the point layers are added below.
p <- ggplot(diamonds, aes(x = carat, y = price)) +
  labs(title = "Price vs. Carat Weight", x = "Carat Weight", y = "Price")

# Default, fully opaque points.
p1 <- p + geom_point()

# Mostly transparent purple points on the black-and-white theme, so that
# heavily overplotted regions appear darker than sparse ones.
p2 <- p + geom_point(alpha = 0.1, colour = "purple") + theme_bw()
grid.arrange(p1,p2,ncol=2)

Using density contour lines or heatmaps can reveal structure in large datasets. In addition, a log10 transform of the price and carat variables helps to reveal clusters and price points around standard carat weights (1/4, 1/2, 1, 1.5, 2, 3).
# Base plot on log10-transformed carat and price. ylim() limits the y axis
# (log10(price)) to the range 2.5 to 4.5.
p <- ggplot(diamonds, aes(x = log10(carat), y = log10(price))) +
  labs(
    title = "Price vs. Carat Weight",
    x = "log10(Carat Weight)",
    y = "log10(Price)"
  ) +
  ylim(2.5, 4.5)

# Faint orange points with 2D density contours drawn on top of them.
p1 <- p +
  geom_point(alpha = 0.1, colour = "orange") +
  geom_density2d() +
  theme_bw()

# The same density contours without the underlying points.
p2 <- p + geom_density2d() + theme_bw()
# Draw the two plots currently stored in p1 and p2 in a two-column layout.
grid.arrange(p1, p2, ncol = 2)

# Hexagonal-bin version of the plot stored in `p`, with white hexagon
# outlines and a purple-to-green fill gradient labelled "Frequency".
p1 <- p +
  stat_bin_hex(colour = "white", na.rm = TRUE) +
  scale_fill_gradientn(
    colours = c("purple", "green"),
    name = "Frequency",
    na.value = NA
  )

# Same binning with a blue-to-orange gradient on the black-and-white theme.
p2 <- p +
  stat_bin_hex(colour = "white", na.rm = TRUE) +
  scale_fill_gradientn(
    colours = c("blue", "orange"),
    name = "Frequency",
    na.value = NA
  ) +
  theme_bw()
# Draw the two plots currently stored in p1 and p2 in a two-column layout.
grid.arrange(p1, p2, ncol = 2)

# log10(price) vs. log10(carat) with points coloured by the `color` column.
p1 <- ggplot(
  diamonds,
  aes(x = log10(carat), y = log10(price), colour = color)
) +
  labs(
    title = "Price vs. Carat Weight",
    x = "log10(Carat Weight)",
    y = "log10(Price)"
  ) +
  ylim(2.5, 4.5) +
  geom_point()

# Same plot with points coloured by the `cut` column instead.
p2 <- ggplot(
  diamonds,
  aes(x = log10(carat), y = log10(price), colour = cut)
) +
  labs(
    title = "Price vs. Carat Weight",
    x = "log10(Carat Weight)",
    y = "log10(Price)"
  ) +
  ylim(2.5, 4.5) +
  geom_point()
# Draw the two plots currently stored in p1 and p2 in a two-column layout.
grid.arrange(p1, p2, ncol = 2)

# create a color palette to use in smoothed scatterplot
library(RColorBrewer)
# Eleven hex colours running from dark blue through pale yellow to dark red.
buylrd <- c(
  "#313695", "#4575B4", "#74ADD1", "#ABD9E9", "#E0F3F8", "#FFFFBF",
  "#FEE090", "#FDAE61", "#F46D43", "#D73027", "#A50026"
)
# colorRampPalette() returns a function that interpolates between the colours.
myColRamp <- colorRampPalette(buylrd)

# smoothed scatterplot
# The columns are referenced through `diamonds$` so this call does not depend
# on an earlier attach(diamonds) having put them on the search path.
smoothScatter(
  x = log10(diamonds$carat),
  y = log10(diamonds$price),
  colramp = myColRamp,
  main = "Price vs. Carat Weight",
  xlab = "log10(Carat Weight)",
  ylab = "log10(Price)"
)
library(tabplot)
# Tableplot of five diamond attributes. sortCol = 1 selects the sort column
# by index; carat is the first column both in `diamonds` and in `select`.
tableplot(
  diamonds,
  select = c(carat, price, cut, color, clarity),
  sortCol = 1
)

# Report the R version and packages used to build this document.
sessionInfo()
## R version 3.3.3 (2017-03-06)
## Platform: x86_64-apple-darwin13.4.0 (64-bit)
## Running under: macOS Sierra 10.12.3
##
## locale:
## [1] en_US.UTF-8/en_US.UTF-8/en_US.UTF-8/C/en_US.UTF-8/en_US.UTF-8
##
## attached base packages:
## [1] stats graphics grDevices utils datasets methods base
##
## other attached packages:
## [1] tabplot_1.3-1 ffbase_0.12.3 ff_2.2-13 bit_1.1-12 RColorBrewer_1.1-2 hexbin_1.27.1
## [7] gridExtra_2.2.1 colorspace_1.3-2 ggplot2_2.2.1 knitr_1.15.1
##
## loaded via a namespace (and not attached):
## [1] Rcpp_0.12.10 magrittr_1.5 MASS_7.3-45 munsell_0.4.3 lattice_0.20-35 fastmatch_1.1-0
## [7] stringr_1.2.0 plyr_1.8.4 tools_3.3.3 grid_3.3.3 gtable_0.2.0 KernSmooth_2.23-15
## [13] htmltools_0.3.5 yaml_2.1.14 lazyeval_0.2.0 rprojroot_1.2 digest_0.6.12 tibble_1.3.0
## [19] formatR_1.4 evaluate_0.10 rmarkdown_1.4 labeling_0.3 stringi_1.1.3 scales_0.4.1
## [25] backports_1.0.5