This document demonstrates different ways of generating histograms with the base, lattice and ggplot2 plotting packages. Plots make use of the iris dataset. Summary statistics generated with dplyr have been included where appropriate.
library(knitr)
library(dplyr)
library(psych)
# Global knitr defaults applied to every chunk: tidy the echoed source, use a
# fixed, centred figure size, hide warnings and messages, and echo the code.
knitr::opts_chunk$set(
  tidy = TRUE, # spelled out: `T` is an ordinary variable that can be reassigned
  fig.width = 8,
  fig.height = 6,
  fig.align = "center",
  warning = FALSE,
  message = FALSE,
  echo = TRUE
)
# Allow up to 120 characters per line of printed output.
options(width = 120)
# Load the iris data. attach() is kept so that any later code referring to the
# iris columns by bare name keeps working; the plots in this chunk address the
# column explicitly instead of relying on the search path.
data(iris)
attach(iris)
sepal_len <- iris$Sepal.Length

# Draw three panels side by side. par() returns the previous settings, which
# are restored once the panels have been drawn.
old_par <- par(mfrow = c(1, 3))

# frequencies with line for mean
hist(sepal_len,
  col = "orange",
  main = "Sepal Length (Frequency)",
  xlab = "Sepal Length (cm)"
)
abline(v = mean(sepal_len), col = "blue", lwd = 2)

# density with superimposed normal curve
# (freq = FALSE is the full spelling of the partially matched `prob = TRUE`)
hist(sepal_len,
  freq = FALSE,
  col = "gray",
  main = "Sepal Length (Density)",
  xlab = "Sepal Length (cm)",
  ylim = c(0.0, 0.7)
)
curve(dnorm(x, mean = mean(sepal_len), sd = sd(sepal_len)),
  add = TRUE, col = "purple"
)

# density with superimposed smoothed density curves; note how adjust parameter
# reveals multi-modal distribution
hist(sepal_len,
  freq = FALSE,
  col = "violet",
  main = "Sepal Length (Density)",
  xlab = "Sepal Length (cm)",
  ylim = c(0.0, 0.7)
)
lines(density(sepal_len, adjust = 0.50), col = "red", lwd = 2)
lines(density(sepal_len), col = "blue", lwd = 2)
lines(density(sepal_len, adjust = 2), lty = "dotted", col = "darkgreen", lwd = 2)

# Put the plotting layout back the way it was before this chunk.
par(old_par)

# function for standard error of the mean
# Standard error of the mean: the sample standard deviation of `x` divided by
# the square root of the number of non-missing values. Missing values in `x`
# are left out of both the standard deviation and the count.
se <- function(x) {
  n_obs <- sum(!is.na(x))
  sd(x, na.rm = TRUE) / sqrt(n_obs)
}
# summary statistics with dplyr
# One-row summary of Sepal.Length over the whole data set. Every column is
# labelled where it is computed, so the labels cannot drift out of step with
# the values the way a positional colnames() overwrite can.
sepal_length <- iris %>%
  summarise(
    Count = n(),
    Mean = mean(Sepal.Length),
    Median = median(Sepal.Length),
    Min = min(Sepal.Length),
    Max = max(Sepal.Length),
    Range = Max - Min, # built from the Min and Max columns created just above
    `Std Dev` = sd(Sepal.Length),
    `Std Err` = se(Sepal.Length),
    Skew = skew(Sepal.Length),
    Kurtosis = kurtosi(Sepal.Length)
  )
kable(sepal_length, format = "markdown", digits = 4)

| Count | Mean | Median | Min | Max | Range | Std Dev | Std Err | Skew | Kurtosis |
|---|---|---|---|---|---|---|---|---|---|
| 150 | 5.8433 | 5.8 | 4.3 | 7.9 | 3.6 | 0.8281 | 0.0676 | 0.3086 | -0.6058 |
# Minimum, deciles, 95th percentile and maximum of Sepal.Length in one row.
# `probs` is written out in full; the shortened `prob` only worked through
# partial argument matching.
sepal_length_percentiles <- iris %>%
  summarise(
    `Min` = min(Sepal.Length),
    `10%` = quantile(Sepal.Length, probs = 0.10),
    `20%` = quantile(Sepal.Length, probs = 0.20),
    `30%` = quantile(Sepal.Length, probs = 0.30),
    `40%` = quantile(Sepal.Length, probs = 0.40),
    `50% / Median` = quantile(Sepal.Length, probs = 0.50),
    `60%` = quantile(Sepal.Length, probs = 0.60),
    `70%` = quantile(Sepal.Length, probs = 0.70),
    `80%` = quantile(Sepal.Length, probs = 0.80),
    `90%` = quantile(Sepal.Length, probs = 0.90),
    `95%` = quantile(Sepal.Length, probs = 0.95),
    `Max` = max(Sepal.Length)
  )
kable(sepal_length_percentiles, format = "markdown", digits = 1)

| Min | 10% | 20% | 30% | 40% | 50% / Median | 60% | 70% | 80% | 90% | 95% | Max |
|---|---|---|---|---|---|---|---|---|---|---|---|
| 4.3 | 4.8 | 5 | 5.3 | 5.6 | 5.8 | 6.1 | 6.3 | 6.5 | 6.9 | 7.3 | 7.9 |
library(lattice)
# Histogram of sepal length with one panel per species, stacked vertically.
# `data = iris` is passed explicitly so the formula does not depend on iris
# having been attached.
histogram(~ Sepal.Length | Species,
  data = iris,
  main = "Sepal Length By Species",
  xlab = "Sepal Length (cm)",
  breaks = seq(from = 4, to = 9, by = 0.25),
  col = c("orange", "lightblue", "purple"),
  # Give each panel its own fill: packet.number() indexes the panel being
  # drawn, and is used to pick that panel's entry from the colour vector.
  panel = function(x, col, ...) {
    panel.histogram(x, col = col[packet.number()], ...)
  },
  layout = c(1, 3)
)

# Kernel density of sepal length, one panel per species.
densityplot(~ Sepal.Length | Species,
  data = iris,
  main = "Sepal Length By Species",
  xlab = "Sepal Length (cm)",
  layout = c(1, 3)
)

# The same densities overlaid in a single panel, one curve per species, with
# the individual points hidden and a legend added.
densityplot(~ Sepal.Length,
  data = iris,
  groups = Species,
  xlab = "Sepal Length (cm)",
  main = "Sepal Length By Species",
  plot.points = FALSE,
  auto.key = TRUE
)

# Summary statistics of Sepal.Length computed separately for each species.
# Columns are labelled where they are computed instead of being renamed by
# position afterwards.
sepal_length_by_species <- iris %>%
  group_by(Species) %>%
  summarise(
    Count = n(),
    Mean = mean(Sepal.Length),
    Median = median(Sepal.Length),
    Min = min(Sepal.Length),
    Max = max(Sepal.Length),
    Range = Max - Min, # built from the Min and Max columns created just above
    `Std Dev` = sd(Sepal.Length),
    `Std Err` = se(Sepal.Length),
    Skew = skew(Sepal.Length),
    Kurtosis = kurtosi(Sepal.Length)
  )
kable(sepal_length_by_species, format = "markdown", digits = 4)

| Species | Count | Mean | Median | Min | Max | Range | Std Dev | Std Err | Skew | Kurtosis |
|---|---|---|---|---|---|---|---|---|---|---|
| setosa | 50 | 5.006 | 5.0 | 4.3 | 5.8 | 1.5 | 0.3525 | 0.0498 | 0.1130 | -0.4509 |
| versicolor | 50 | 5.936 | 5.9 | 4.9 | 7.0 | 2.1 | 0.5162 | 0.0730 | 0.0991 | -0.6939 |
| virginica | 50 | 6.588 | 6.5 | 4.9 | 7.9 | 3.0 | 0.6359 | 0.0899 | 0.1110 | -0.2033 |
library(ggplot2)
library(ggthemes)
library(gridExtra)
# Density-scaled histogram of sepal length with three kernel density curves
# that differ only in bandwidth adjustment (0.5, default, 2.0).
ggplot(data = iris, aes(x = Sepal.Length, y = ..density..)) +
  geom_histogram(fill = "cornsilk", colour = "grey60", size = .5) +
  geom_line(stat = "density", adjust = 0.5, colour = "red") +
  geom_line(stat = "density") +
  geom_line(stat = "density", adjust = 2.0, colour = "blue") +
  xlab("Sepal Length (cm)") +
  ylab("Density") +
  xlim(3, 9) +
  theme_economist() +
  ggtitle("Sepal Length\n")

# Frequency histograms, one facet per species in a single column.
ggplot(data = iris, aes(x = Sepal.Length)) +
  geom_histogram(aes(fill = Species)) +
  scale_fill_brewer(palette = "Set2") +
  facet_wrap(~Species, ncol = 1) +
  xlab("Sepal Length (cm)") +
  ylab("Frequency") +
  xlim(3, 9) +
  theme_bw() +
  ggtitle("Sepal Length By Species\n")

# Density curves by species: coloured lines only (p1) and translucent filled
# areas with an outline (p2), arranged one above the other.
p1 <- ggplot(data = iris, aes(x = Sepal.Length, colour = Species)) +
  geom_line(stat = "density") +
  xlim(3, 9) +
  xlab("Sepal Length (cm)") +
  ylab("Density") +
  ggtitle("Sepal Length By Species (Density)\n") +
  theme(plot.title = element_text(lineheight = .8, face = "plain"))
p2 <- ggplot(data = iris, aes(x = Sepal.Length, fill = Species)) +
  geom_density(colour = NA, alpha = 0.2) +
  geom_line(stat = "density") +
  xlim(3, 9) +
  xlab("Sepal Length (cm)") +
  ylab("Density") +
  ggtitle("Sepal Length By Species (Density)\n") +
  theme(plot.title = element_text(lineheight = .8, face = "plain"))
grid.arrange(p1, p2, nrow = 2)

# Bar charts of the observed sepal lengths by species: stacked counts (p1) and
# the same bars rescaled to a common height (p2).
p1 <- ggplot(data = iris, aes(x = Sepal.Length, fill = Species)) +
  geom_bar() +
  xlab("Sepal Length (cm)") +
  ylab("Frequency") +
  ggtitle("Sepal Length By Species\n") +
  theme_bw()
p2 <- ggplot(data = iris, aes(x = Sepal.Length, fill = Species)) +
  geom_bar(position = "fill") +
  xlab("Sepal Length (cm)") +
  # position = "fill" normalises every bar to 1, so the axis shows shares,
  # not counts.
  ylab("Proportion") +
  ggtitle("Sepal Length By Species\n") +
  theme_bw()
grid.arrange(p1, p2, nrow = 2)

sessionInfo()
## R version 3.4.3 (2017-11-30)
## Platform: x86_64-apple-darwin15.6.0 (64-bit)
## Running under: macOS High Sierra 10.13.2
##
## Matrix products: default
## BLAS: /Library/Frameworks/R.framework/Versions/3.4/Resources/lib/libRblas.0.dylib
## LAPACK: /Library/Frameworks/R.framework/Versions/3.4/Resources/lib/libRlapack.dylib
##
## locale:
## [1] en_US.UTF-8/en_US.UTF-8/en_US.UTF-8/C/en_US.UTF-8/en_US.UTF-8
##
## attached base packages:
## [1] stats graphics grDevices utils datasets methods base
##
## other attached packages:
## [1] gridExtra_2.3 ggthemes_3.4.0 ggplot2_2.2.1 lattice_0.20-35 bindrcpp_0.2 psych_1.7.8 dplyr_0.7.4
## [8] knitr_1.18
##
## loaded via a namespace (and not attached):
## [1] Rcpp_0.12.14 formatR_1.5 RColorBrewer_1.1-2 pillar_1.0.1 compiler_3.4.3 highr_0.6
## [7] plyr_1.8.4 bindr_0.1 tools_3.4.3 digest_0.6.13 evaluate_0.10.1 tibble_1.4.1
## [13] nlme_3.1-131 gtable_0.2.0 pkgconfig_2.0.1 rlang_0.1.6 yaml_2.1.16 parallel_3.4.3
## [19] stringr_1.2.0 rprojroot_1.3-1 grid_3.4.3 glue_1.2.0 R6_2.2.2 foreign_0.8-69
## [25] rmarkdown_1.8 magrittr_1.5 backports_1.1.2 scales_0.5.0 htmltools_0.3.6 assertthat_0.2.0
## [31] mnormt_1.5-5 colorspace_1.3-2 labeling_0.3 stringi_1.1.6 lazyeval_0.2.1 munsell_0.4.3