Introduction

This document demonstrates different ways of generating histograms, density plots and bar charts with the base, lattice and ggplot2 plotting packages. All plots use the built-in iris dataset. Summary statistics computed with dplyr (plus skew and kurtosis from psych) are included where appropriate.

library(knitr)
library(dplyr)
library(psych)
# Global knitr chunk options: tidy the echoed code, centre 8x6 figures, and
# hide warnings and messages in the rendered output.
knitr::opts_chunk$set(
  tidy = TRUE,  # spelled out; `T` is an ordinary variable that can be reassigned
  fig.width = 8,
  fig.height = 6,
  fig.align = "center",
  warning = FALSE,
  message = FALSE,
  echo = TRUE
)
# Widen console output so wide tables and sessionInfo() are not wrapped early.
options(width = 120)
data(iris)
# NOTE(review): attach() is generally discouraged because it puts the columns
# on the search path. It is kept so that any chunk referring to iris columns
# by bare name keeps working.
attach(iris)

Base Plotting Package

# Three base-graphics histograms of sepal length drawn side by side.
# The column is read explicitly from `iris`, so the chunk does not rely on
# attach(). The previous graphics settings are saved and restored at the end.
sepal_len <- iris$Sepal.Length
old_par <- par(mfrow = c(1, 3))

# frequencies with line for mean
hist(sepal_len,
     col = "orange",
     main = "Sepal Length (Frequency)",
     xlab = "Sepal Length (cm)")
abline(v = mean(sepal_len), col = "blue", lwd = 2)

# density with superimposed normal curve
# (freq = FALSE replaces the partially matched `prob = TRUE`)
hist(sepal_len,
     freq = FALSE,
     col = "gray",
     main = "Sepal Length (Density)",
     xlab = "Sepal Length (cm)",
     ylim = c(0.0, 0.7))
curve(dnorm(x, mean = mean(sepal_len), sd = sd(sepal_len)),
      add = TRUE, col = "purple")

# density with superimposed smoothed density curves; note how the adjust
# parameter reveals the multi-modal distribution
hist(sepal_len,
     freq = FALSE,
     col = "violet",
     main = "Sepal Length (Density)",
     xlab = "Sepal Length (cm)",
     ylim = c(0.0, 0.7))
lines(density(sepal_len, adjust = 0.50), col = "red", lwd = 2)
lines(density(sepal_len), col = "blue", lwd = 2)
lines(density(sepal_len, adjust = 2), lty = "dotted", col = "darkgreen", lwd = 2)

# restore whatever layout was active before this chunk
par(old_par)

Summary Statistics - Sepal Length

# Standard error of the mean: the sample standard deviation (missing values
# removed) divided by the square root of the number of non-missing values.
se <- function(x) {
  n_obs <- sum(!is.na(x))
  sd(x, na.rm = TRUE) / sqrt(n_obs)
}

# Summary statistics for sepal length over the whole dataset, using dplyr.
# Each column gets its display name directly inside summarise(), so there is
# no positional colnames() rename to keep in sync, and result columns no
# longer shadow the base functions min(), max() and sd().
# Uses se() defined earlier in this document; skew() and kurtosi() come from
# the psych package.
sepal_length <- iris %>%
  summarise(
    Count = n(),
    Mean = mean(Sepal.Length),
    Median = median(Sepal.Length),
    Min = min(Sepal.Length),
    Max = max(Sepal.Length),
    Range = Max - Min,  # refers to the Min/Max columns created just above
    `Std Dev` = sd(Sepal.Length),
    `Std Err` = se(Sepal.Length),
    Skew = skew(Sepal.Length),
    Kurtosis = kurtosi(Sepal.Length)
  )
kable(sepal_length, format = "markdown", digits = 4)
Count Mean Median Min Max Range Std Dev Std Err Skew Kurtosis
150 5.8433 5.8 4.3 7.9 3.6 0.8281 0.0676 0.3086 -0.6058
# Selected percentiles of sepal length, shown as a one-row table.
# `probs` is written out in full; the previous `prob =` only worked through
# partial argument matching.
sepal_length_percentiles <- iris %>%
  summarise(
    `Min` = min(Sepal.Length),
    `10%` = quantile(Sepal.Length, probs = 0.10),
    `20%` = quantile(Sepal.Length, probs = 0.20),
    `30%` = quantile(Sepal.Length, probs = 0.30),
    `40%` = quantile(Sepal.Length, probs = 0.40),
    `50% / Median` = quantile(Sepal.Length, probs = 0.50),
    `60%` = quantile(Sepal.Length, probs = 0.60),
    `70%` = quantile(Sepal.Length, probs = 0.70),
    `80%` = quantile(Sepal.Length, probs = 0.80),
    `90%` = quantile(Sepal.Length, probs = 0.90),
    `95%` = quantile(Sepal.Length, probs = 0.95),
    `Max` = max(Sepal.Length)
  )
kable(sepal_length_percentiles, format = "markdown", digits = 1)
Min 10% 20% 30% 40% 50% / Median 60% 70% 80% 90% 95% Max
4.3 4.8 5 5.3 5.6 5.8 6.1 6.3 6.5 6.9 7.3 7.9

Lattice Plotting Package

library(lattice)
# Lattice histogram of sepal length, one stacked panel per species, with a
# different fill colour in each panel.
# `data = iris` is passed explicitly so the call does not depend on attach().
histogram(~ Sepal.Length | Species,
          data = iris,
          main = "Sepal Length By Species",
          xlab = "Sepal Length (cm)",
          breaks = seq(from = 4, to = 9, by = 0.25),
          col = c("orange", "lightblue", "purple"),
          # `col` is the colour vector given above; it has no default here
          # because a `col = col` default is a self-referential promise that
          # errors if it is ever evaluated.
          panel = function(x, col, ...) {
            # pick this panel's colour by its packet (panel) number
            panel.histogram(x, col = col[packet.number()], ...)
          },
          layout = c(1, 3))

# Lattice kernel density plot of sepal length, one stacked panel per species.
# `data = iris` is passed explicitly so the call does not depend on attach().
densityplot(~ Sepal.Length | Species,
            data = iris,
            main = "Sepal Length By Species",
            xlab = "Sepal Length (cm)",
            layout = c(1, 3))

# Lattice kernel density plot with the three species overlaid in one panel,
# the individual data points hidden, and an automatic legend.
# `data = iris` is passed explicitly so the call does not depend on attach().
densityplot(~ Sepal.Length,
            data = iris,
            groups = Species,
            xlab = "Sepal Length (cm)",
            main = "Sepal Length By Species",
            plot.points = FALSE,
            auto.key = TRUE)

Summary Statistics - Sepal Length by Species

# Summary statistics for sepal length, computed separately for each species.
# Each column gets its display name directly inside summarise(), so there is
# no positional colnames() rename to keep in sync, and result columns no
# longer shadow the base functions min(), max() and sd().
# Uses se() defined earlier in this document; skew() and kurtosi() come from
# the psych package.
sepal_length_by_species <- iris %>%
  group_by(Species) %>%
  summarise(
    Count = n(),
    Mean = mean(Sepal.Length),
    Median = median(Sepal.Length),
    Min = min(Sepal.Length),
    Max = max(Sepal.Length),
    Range = Max - Min,  # refers to the Min/Max columns created just above
    `Std Dev` = sd(Sepal.Length),
    `Std Err` = se(Sepal.Length),
    Skew = skew(Sepal.Length),
    Kurtosis = kurtosi(Sepal.Length)
  )
kable(sepal_length_by_species, format = "markdown", digits = 4)
Species Count Mean Median Min Max Range Std Dev Std Err Skew Kurtosis
setosa 50 5.006 5.0 4.3 5.8 1.5 0.3525 0.0498 0.1130 -0.4509
versicolor 50 5.936 5.9 4.9 7.0 2.1 0.5162 0.0730 0.0991 -0.6939
virginica 50 6.588 6.5 4.9 7.9 3.0 0.6359 0.0899 0.1110 -0.2033

ggplot2 Plotting Package

library(ggplot2)
library(ggthemes)
library(gridExtra)
# Density-scaled histogram of sepal length with three kernel density lines
# overlaid: adjust = 0.5 in red, the default bandwidth, and adjust = 2 in blue.
ggplot(iris, aes(x = Sepal.Length, y = ..density..)) +
  geom_histogram(colour = "grey60", fill = "cornsilk", size = 0.5) +
  geom_line(stat = "density", colour = "red", adjust = 0.5) +
  geom_line(stat = "density") +
  geom_line(stat = "density", colour = "blue", adjust = 2.0) +
  labs(x = "Sepal Length (cm)", y = "Density") +
  xlim(3, 9) +
  theme_economist() +
  ggtitle("Sepal Length\n")

# Frequency histogram of sepal length, one facet row per species, with bars
# filled by species using the ColorBrewer "Set2" palette.
ggplot(iris, aes(x = Sepal.Length)) +
  geom_histogram(aes(fill = Species)) +
  scale_fill_brewer(palette = "Set2") +
  facet_wrap(~ Species, ncol = 1) +
  labs(x = "Sepal Length (cm)", y = "Frequency") +
  xlim(3, 9) +
  theme_bw() +
  ggtitle("Sepal Length By Species\n")

# Two views of the per-species density of sepal length, stacked vertically:
# p1 draws one coloured density line per species; p2 draws translucent filled
# density areas with a density line on top.
p1 <- ggplot(iris, aes(x = Sepal.Length, colour = Species)) +
  geom_line(stat = "density") +
  xlim(3, 9) +
  labs(x = "Sepal Length (cm)", y = "Density") +
  ggtitle("Sepal Length By Species (Density)\n") +
  theme(plot.title = element_text(lineheight = 0.8, face = "plain"))

p2 <- ggplot(iris, aes(x = Sepal.Length, fill = Species)) +
  geom_density(colour = NA, alpha = 0.2) +
  geom_line(stat = "density") +
  xlim(3, 9) +
  labs(x = "Sepal Length (cm)", y = "Density") +
  ggtitle("Sepal Length By Species (Density)\n") +
  theme(plot.title = element_text(lineheight = 0.8, face = "plain"))

grid.arrange(p1, p2, nrow = 2)

# Two bar charts of sepal length filled by species, stacked vertically:
# p1 shows stacked counts for each distinct sepal length; p2 uses
# position = "fill", which rescales every bar to height 1 so the segments
# show each species' share at that length.
p1 <- ggplot(data = iris, aes(x = Sepal.Length, fill = Species)) +
  geom_bar() +
  xlab("Sepal Length (cm)") +
  ylab("Frequency") +
  ggtitle("Sepal Length By Species\n") +
  theme_bw()

p2 <- ggplot(data = iris, aes(x = Sepal.Length, fill = Species)) +
  geom_bar(position = "fill") +
  xlab("Sepal Length (cm)") +
  # the y axis runs from 0 to 1 here, so "Frequency" was a misleading label
  ylab("Proportion") +
  ggtitle("Sepal Length By Species\n") +
  theme_bw()

grid.arrange(p1, p2, nrow = 2)

Programming Environment

# Print the R version, platform, locale and attached/loaded package versions
# used to render this document, for reproducibility.
sessionInfo()
## R version 3.4.3 (2017-11-30)
## Platform: x86_64-apple-darwin15.6.0 (64-bit)
## Running under: macOS High Sierra 10.13.2
## 
## Matrix products: default
## BLAS: /Library/Frameworks/R.framework/Versions/3.4/Resources/lib/libRblas.0.dylib
## LAPACK: /Library/Frameworks/R.framework/Versions/3.4/Resources/lib/libRlapack.dylib
## 
## locale:
## [1] en_US.UTF-8/en_US.UTF-8/en_US.UTF-8/C/en_US.UTF-8/en_US.UTF-8
## 
## attached base packages:
## [1] stats     graphics  grDevices utils     datasets  methods   base     
## 
## other attached packages:
## [1] gridExtra_2.3   ggthemes_3.4.0  ggplot2_2.2.1   lattice_0.20-35 bindrcpp_0.2    psych_1.7.8     dplyr_0.7.4    
## [8] knitr_1.18     
## 
## loaded via a namespace (and not attached):
##  [1] Rcpp_0.12.14       formatR_1.5        RColorBrewer_1.1-2 pillar_1.0.1       compiler_3.4.3     highr_0.6         
##  [7] plyr_1.8.4         bindr_0.1          tools_3.4.3        digest_0.6.13      evaluate_0.10.1    tibble_1.4.1      
## [13] nlme_3.1-131       gtable_0.2.0       pkgconfig_2.0.1    rlang_0.1.6        yaml_2.1.16        parallel_3.4.3    
## [19] stringr_1.2.0      rprojroot_1.3-1    grid_3.4.3         glue_1.2.0         R6_2.2.2           foreign_0.8-69    
## [25] rmarkdown_1.8      magrittr_1.5       backports_1.1.2    scales_0.5.0       htmltools_0.3.6    assertthat_0.2.0  
## [31] mnormt_1.5-5       colorspace_1.3-2   labeling_0.3       stringi_1.1.6      lazyeval_0.2.1     munsell_0.4.3