Let \({\bf x} = (x_1,\ldots,x_m)\) and \({\bf y} = (y_1,\ldots,y_n)\) be two samples from continuous distributions, say \(F:{\mathbb R} \rightarrow (0,1)\) and \(G:{\mathbb R} \rightarrow (0,1)\), respectively. The following R code presents several tools for visually comparing the distribution of these two populations. There exist formal tools for testing \(F=G\), but it is often helpful to visualise possible discrepancies between the samples.
You can find more information about boxplots in the corresponding Wikipedia entry and additional details in this tutorial.
rm(list=ls())
# Required packages
# install.packages("twopiece", repos="http://R-Forge.R-project.org")
library(twopiece)
## Loading required package: flexclust
## Loading required package: grid
## Loading required package: lattice
## Loading required package: modeltools
## Loading required package: stats4
## Loading required package: mvtnorm
## Loading required package: label.switching
n <- 500 # Sample size
# Equal distribution
set.seed(123)
x <- rnorm(n,0,1)
y <- rnorm(n,0,1)
boxplot(x,y, names = c("Population 1","Population 2"), cex.lab=1.5, cex.axis = 1.5, col = c('red','blue'))
# Different distribution
set.seed(123)
x <- rnorm(n,0,1)
y <- rtp3(n,0,1,0.5,param="eps",FUN=rnorm)
boxplot(x,y, names = c("Population 1","Population 2"), cex.lab=1.5, cex.axis = 1.5, col = c('red','blue'))
You can find more information about histograms in the corresponding Wikipedia entry and additional details in this tutorial.
rm(list=ls())
# Required packages
library(twopiece)
library(ggplot2)
n <- 500 # Sample size
# Equal distribution
set.seed(123)
x <- rnorm(n,0,1)
y <- rnorm(n,0,1)
# Overplot
hist(x, probability=T, col=rgb(1,0,0,0.75), xlim = c(-4,4), ylim = c(0,0.5), xlab ="Data", cex.axis =1.5, cex.lab = 1.5, main = "Histograms")
hist(y,add=T, probability=T, col=rgb(0,0,1,0.75))
box()
legend(1.1, 0.5, c("Population 1","Population 2"), col=c(rgb(1,0,0,0.75),rgb(0,0,1,0.75)),
text.col = "black", lty = c(1, 1), lwd = c(2,2),
merge = TRUE, bg = "gray90",cex=1.1)
# Back-to-back
df = data.frame(x = x, x2 = y)
bb = ggplot(df, aes(x)) + geom_histogram( aes(x = x, y = ..density..), fill="red", bins = 20) +
geom_histogram( aes(x = x2, y = -..density..), fill= "blue", bins = 20) +
xlab("Data") + ylab("Density") + theme(axis.text.x = element_text(size = 14), axis.text.y = element_text(size = 14))
bb
# Different distribution
set.seed(123)
x <- rnorm(n,0,1)
y <- rtp3(n,0,1,0.5,param="eps",FUN=rnorm)
# Overplot
hist(x, probability=T, col=rgb(1,0,0,0.75), xlim = c(-5,4.5), ylim = c(0,0.5), xlab ="Data", cex.axis =1.5, cex.lab = 1.5, main = "Histograms")
hist(y,add=T, probability=T, col=rgb(0,0,1,0.75))
box()
legend(1.1, 0.5, c("Population 1","Population 2"), col=c(rgb(1,0,0,0.75),rgb(0,0,1,0.75)),
text.col = "black", lty = c(1, 1), lwd = c(2,2),
merge = TRUE, bg = "gray90",cex=1.1)
# Back-to-back
df = data.frame(x = x, x2 = y)
bb = ggplot(df, aes(x)) + geom_histogram( aes(x = x, y = ..density..), fill="red", bins = 20) +
geom_histogram( aes(x = x2, y = -..density..), fill= "blue", bins = 20) +
xlab("Data") + ylab("Density") + theme(axis.text.x = element_text(size = 14), axis.text.y = element_text(size = 14))
bb
You can find more information about kernel density estimaton in the corresponding Wikipedia entry and additional details in this tutorial.
rm(list=ls())
# Required packages
library(twopiece)
######################################################################################################
# Kernel density estimator (PDF)
######################################################################################################
# x: vector of quantiles
# h: smoothing parameter (bandwidth)
# data: data set
# Function
kde.dnorm <- function(x,h,data) mean( dnorm( x-data, 0, h ) )
n <- 500 # Sample size
# Equal distribution
set.seed(123)
x <- rnorm(n,0,1)
y <- rnorm(n,0,1)
hx <- bw.nrd0(x) # bandwith for x
hy <- bw.nrd0(y) # bandwith for y
# Kernel density estimators
kdex <- Vectorize(function(t) kde.dnorm(t,hx,x))
kdey <- Vectorize(function(t) kde.dnorm(t,hy,y))
curve(kdex, xlim = c(-4,4), ylim = c(0,0.5), xlab ="", ylab = "Density", cex.axis =1.5, cex.lab = 1.5, main = "KDE", col="red",lwd=2)
curve(kdey, xlim = c(-4,4), ylim = c(0,0.5), xlab ="", ylab = "Density", cex.axis =1.5, cex.lab = 1.5, main = "KDE", col="blue", lwd=2, add=T)
legend(1.1, 0.5, c("Population 1","Population 2"), col=c("red","blue"),
text.col = "black", lty = c(1, 1), lwd = c(2,2),
merge = TRUE, bg = "gray90",cex=1.1)
# Different distribution
set.seed(123)
x <- rnorm(n,0,1)
y <- rtp3(n,0,1,0.5,param="eps",FUN=rnorm)
hx <- bw.nrd0(x) # bandwith for x
hy <- bw.nrd0(y) # bandwith for y
# Kernel density estimators
kdex <- Vectorize(function(t) kde.dnorm(t,hx,x))
kdey <- Vectorize(function(t) kde.dnorm(t,hy,y))
curve(kdex, xlim = c(-5,4.5), ylim = c(0,0.5), xlab ="", ylab = "Density", cex.axis =1.5, cex.lab = 1.5, main = "KDE", col="red",lwd=2)
curve(kdey, xlim = c(-5,4.5), ylim = c(0,0.5), xlab ="", ylab = "Density", cex.axis =1.5, cex.lab = 1.5, main = "KDE", col="blue", lwd=2, add=T)
legend(1.1, 0.5, c("Population 1","Population 2"), col=c("red","blue"),
text.col = "black", lty = c(1, 1), lwd = c(2,2),
merge = TRUE, bg = "gray90",cex=1.1)
You can find more information about violin plots in the corresponding Wikipedia entry and additional details in this tutorial.
rm(list=ls())
# Required packages
library(twopiece)
library(ggplot2)
n <- 500 # Sample size
# Equal distribution
set.seed(123)
x <- rnorm(n,0,1)
y <- rnorm(n,0,1)
n <- 500 # Sample size
# Equal distribution
set.seed(123)
x <- rnorm(n,0,1)
y <- rnorm(n,0,1)
# Violin plots of the two populations
df <- reshape2::melt(data.frame(cbind(x,y)), id.vars = NULL)
vp <- ggplot(df, aes(x = variable, y = value)) + geom_violin(scale="width",adjust = 1,width = 0.5,fill = "grey80") + geom_boxplot(width=0.075,fatten = 3) +
theme_bw() + xlab("Data") + ylab("Density") + theme(axis.text.x = element_text(size = 14), axis.text.y = element_text(size = 14)) +
theme(axis.title.x = element_text(size = 14), axis.title.y = element_text(size = 14)) +
stat_summary(fun.y = mean, colour = c("red","blue"), geom="point", shape=18, size=4) +
scale_x_discrete(labels = c("Population 1", "Population 2"))
vp
# Different distribution
set.seed(123)
x <- rnorm(n,0,1)
y <- rtp3(n,0,1,0.5,param="eps",FUN=rnorm)
# Violin plots of the two populations
df <- reshape2::melt(data.frame(cbind(x,y)), id.vars = NULL)
vp <- ggplot(df, aes(x = variable, y = value)) + geom_violin(scale="width",adjust = 1,width = 0.5,fill = "grey80") + geom_boxplot(width=0.075,fatten = 3) +
theme_bw() + xlab("Data") + ylab("Density") + theme(axis.text.x = element_text(size = 14), axis.text.y = element_text(size = 14)) +
theme(axis.title.x = element_text(size = 14), axis.title.y = element_text(size = 14)) +
stat_summary(fun.y = mean, colour = c("red","blue"), geom="point", shape=18, size=4) +
scale_x_discrete(labels = c("Population 1", "Population 2"))
vp
You can find more information about the ROC curve in the corresponding Wikipedia entry.
rm(list=ls())
# Required packages
library(twopiece)
n <- 500 # Sample size
# Equal distribution
set.seed(123)
x <- rnorm(n,0,1)
y <- rnorm(n,0,1)
# Empirical ROC curve
xx = c(-Inf, sort(unique(c(x,y))), Inf)
sens = sapply(xx, function(t){mean(x >= t)})
spec = sapply(xx, function(t){mean(y < t)})
plot(0, 0, xlim = c(0, 1), ylim = c(0, 1), type = 'l', lwd = 2 , xlab = "1-Specificity", ylab="Sensitivity", main ="Empirical ROC", cex.axis=1.5, cex.lab=1.5)
segments(0, 0, 1, 1, col = 1)
lines(1 - spec, sens, type = 'l', col = "green", lwd = 2)
# Different distribution
set.seed(123)
x <- rnorm(n,0,1)
y <- rtp3(n,0,1,0.5,param="eps",FUN=rnorm)
# Empirical ROC curve
xx = c(-Inf, sort(unique(c(x,y))), Inf)
sens = sapply(xx, function(t){mean(x >= t)})
spec = sapply(xx, function(t){mean(y < t)})
plot(0, 0, xlim = c(0, 1), ylim = c(0, 1), type = 'l', lwd = 2 , xlab = "1-Specificity", ylab="Sensitivity", main ="Empirical ROC", cex.axis=1.5, cex.lab=1.5)
segments(0, 0, 1, 1, col = 1)
lines(1 - spec, sens, type = 'l', col = "green", lwd = 2)
You can find more information about the empirical cumulative distribution function (ECDF) in the corresponding Wikipedia entry. The Kolmogorov Smirnov test is related to the comparison of the ECDFs associated to each population.
rm(list=ls())
# Required packages
library(twopiece)
n <- 500 # Sample size
# Equal distribution
set.seed(123)
x <- rnorm(n,0,1)
y <- rnorm(n,0,1)
# Kolmogorov-Smirnov test
ks.test(x,y)
##
## Two-sample Kolmogorov-Smirnov test
##
## data: x and y
## D = 0.042, p-value = 0.7699
## alternative hypothesis: two-sided
# Empirical CDF
plot(ecdf(x), col="red", xlab ="", ylab = "ECDF", cex.axis =1.5, cex.lab = 1.5, main = "ECDF",lwd=2)
lines(ecdf(y), col="blue", cex.axis =1.5, cex.lab = 1.5, main = "ECDF",lwd=2)
legend(1.1, 0.5, c("Population 1","Population 2"), col=c(rgb(1,0,0,0.75),rgb(0,0,1,0.75)),
text.col = "black", lty = c(1, 1), lwd = c(2,2),
merge = TRUE, bg = "gray90",cex=1.1)
# Different distribution
set.seed(123)
x <- rnorm(n,0,1)
y <- rtp3(n,0,1,0.5,param="eps",FUN=rnorm)
# Kolmogorov-Smirnov test
ks.test(x,y)
##
## Two-sample Kolmogorov-Smirnov test
##
## data: x and y
## D = 0.29, p-value < 2.2e-16
## alternative hypothesis: two-sided
# Empirical CDF
plot(ecdf(x), col="red", xlim = c(-5,4.5), xlab ="", ylab = "ECDF", cex.axis =1.5, cex.lab = 1.5, main = "ECDF",lwd=2)
lines(ecdf(y), col="blue", cex.axis =1.5, cex.lab = 1.5, main = "ECDF",lwd=2)
legend(1.1, 0.5, c("Population 1","Population 2"), col=c(rgb(1,0,0,0.75),rgb(0,0,1,0.75)),
text.col = "black", lty = c(1, 1), lwd = c(2,2),
merge = TRUE, bg = "gray90",cex=1.1)
Bean plots are similar to violin plots with some additional visual features such as bars indicating the location of the observations (see: Beanplot: A Boxplot Alternative for Visual Comparison of Distributions).
rm(list=ls())
# Required packages
library(twopiece)
library(beanplot)
n <- 500 # Sample size
# Equal distribution
set.seed(123)
x <- rnorm(n,0,1)
y <- rnorm(n,0,1)
# Beanplots
beanplot(x,y, col = list("red",c("blue","black")))
legend("topright", fill = c("red", "blue"),
legend = c("Population 1", "Population 2"), box.lty=0)
# Back to back beanplot
df <- data.frame(rbind(cbind(x,1),cbind(y,2)))
colnames(df) <- c("values","group")
values <- df[,1]
group <- df[,2]
beanplot(values ~ group, ll = 0.04,
main = "Bean plot", side = "both", xlab="value", ylab ="Density",
col = list("red", c("blue", "black")), axes = FALSE)
box()
legend("bottomright", fill = c("red", "blue"),
legend = c("Population 1", "Population 2"), box.lty=0)
# Different distribution
set.seed(123)
x <- rnorm(n,0,1)
y <- rtp3(n,0,1,0.5,param="eps",FUN=rnorm)
# Beanplots
beanplot(x,y, col = list("red",c("blue","black")))
box()
legend("topright", fill = c("red", "blue"),
legend = c("Population 1", "Population 2"), box.lty=0)
# Back to back beanplot
df <- data.frame(rbind(cbind(x,1),cbind(y,2)))
colnames(df) <- c("values","group")
values <- df[,1]
group <- df[,2]
beanplot(values ~ group, ll = 0.04,
main = "Bean plot", side = "both", xlab="value", ylab ="Density",
col = list("red", c("blue", "black")), axes = FALSE)
box()
legend("bottomright", fill = c("red", "blue"),
legend = c("Population 1", "Population 2"), box.lty=0)
Bee Swarm plots represent a different way of visualising the distribution of a sample. I am not a big fan of this visual tool as the discretisation of the data can be very influential on the shape of the bee swarm (similar to histograms, see: Dot Plots). For this reason, it is often recommended to couple its use with boxplots or violin plots. For additional details on this kind of plots see this tutorial.
rm(list=ls())
# Required packages
library(beeswarm)
library(twopiece)
n<- 500
# Equal distribution
set.seed(123)
x <- rnorm(n,0,1)
y <- rnorm(n,0,1)
# Bee Swarm plots of the two populations
df <- data.frame(rbind(cbind(x,1),cbind(y,2)))
colnames(df) <- c("values","group")
values <- df[,1]
group <- df[,2]
boxplot(x,y,ylim=c(-4,4), lwd = 2, names = c(1,2), cex.axis = 1.5, cex.lab = 1.5)
beeswarm(values ~ group, data=df, method="swarm", col=c("red","blue"), cex=0.75, cex.axis = 1.5, cex.lab=1.5, add = TRUE)
legend("topright", fill = c("red", "blue"),
legend = c("Population 1", "Population 2"), box.lty=0)
box()
# Different distribution
set.seed(123)
x <- rnorm(n,0,1)
y <- rtp3(n,0,1,0.5,param="eps",FUN=rnorm)
# Bee Swarm plots of the two populations
df <- data.frame(rbind(cbind(x,1),cbind(y,2)))
colnames(df) <- c("values","group")
values <- df[,1]
group <- df[,2]
boxplot(x,y,ylim=c(-5,4), lwd = 2, names = c(1,2), cex.axis = 1.5, cex.lab = 1.5)
beeswarm(values ~ group, data=df, method="swarm", col=c("red","blue"), cex=0.75, cex.axis = 1.5, cex.lab=1.5, add = TRUE)
legend("topright", fill = c("red", "blue"),
legend = c("Population 1", "Population 2"), box.lty=0)
box()
This is an old data visualisation tool, which similar to a histogram. You can find more information about stem and leaf diagrams in the corresponding Wikipedia entry.
rm(list=ls())
# Required packages
library(twopiece)
library(aplpack)
## Loading required package: tcltk
## Warning: running command ''/usr/bin/otool' -L '/Library/Frameworks/
## R.framework/Resources/library/tcltk/libs//tcltk.so'' had status 1
n <- 500 # Sample size
# Equal distribution
set.seed(123)
x <- rnorm(n,0,1)
y <- rnorm(n,0,1)
# Stem-and-leaf plots
stem(x)
##
## The decimal point is at the |
##
## -2 | 765
## -2 | 322111000
## -1 | 888777777666655555
## -1 | 444444433333333333322222211111111111110000000000000
## -0 | 99999998888888877777777777777776666666666666666666666555555555555555
## -0 | 44444444444444444444444333333333333333333322222222222222222222221111+11
## 0 | 00000000000111111111111111111111111112222222222222222222223333333333+16
## 0 | 55555555555555566666666666677777777777777777778888888888888899999999
## 1 | 0000000000111111111111222222222333333333444444
## 1 | 555555666667777777888999
## 2 | 00000111222344
## 2 | 6
## 3 | 2
stem(y)
##
## The decimal point is at the |
##
## -2 | 865
## -2 | 22111000
## -1 | 99999999988877777666655555555555
## -1 | 44444444443333222222222222221111111111100000000000
## -0 | 9999999888888888888888877777777777777776666666666666666555555555555
## -0 | 44444444444333333333333333332222222222222222222222221111111111111111+10
## 0 | 00000011111111111111111111112222222222222222222222333333333333333333+8
## 0 | 55555555555555555566666666666666666666677777777788888888888888888899
## 1 | 0000000000000000011111111111122222222233444444
## 1 | 555555566667777788999
## 2 | 00012333444
## 2 | 556677
# Back-to-back stem-and-leaf plots
stem.leaf.backback(x,y)
## ___________________________________________________________________________________________________________________________
## 1 | 2: represents 1.2, leaf unit: 0.1
## x y
## ___________________________________________________________________________________________________________________________
## | -3* |
## | -2. |8 1
## 2 66| s |6 2
## 3 4| f |5 3
## 5 23| t |22 5
## 10 00001| -2* |1000 9
## 12 99| -1. |999999988888 21
## 24 666666667777| s |7776666666 31
## 32 44445555| f |55555554444444444 48
## 52 22222222222333333333| t |33333333322222 62
## 76 000000000000000001111111| -1* |111111111111110000000000000 89
## 90 88888899999999| -0. |999999988888888888 107
## 121 6666666666666677777777777777777| s |777777777777777666666666666666666 140
## 165 44444444444444444444444455555555555555555555| f |5555555555555544444444444 165
## 212 22222222222222222222222223333333333333333333333| t |33333333333332222222222222222222222 200
## 245 000000000000000000111111111111111| -0* |11111111111111111111100000000000000000000000000000 250
## (47) 11111111111111111111110000000000000000000000000| 0* |0000000000000001111111111111111111111 (37)
## 208 33333333333333333333322222222222222222| t |2222222222222222222222223333333333333333333333 213
## 170 555555555554444444444444444444| f |444444444444444555555555555555555 167
## 140 7777777777777777766666666666666666| s |66666666666666677777777777777 134
## 106 999999999888888888888888| 0. |888888888888888889999999999999 105
## 82 1111111100000000000| 1* |0000000000000001111111111 75
## 63 33333333222222222222| t |2222233333 50
## 43 55555554444| f |4444455555 40
## 32 777776666666| s |66666667 30
## 20 9999988| 1. |8888999 22
## 13 11111000| 2* |001 15
## 5 32| t |23333 12
## 3 54| f |44455 7
## | s |66 2
## | 2. |
## | 3* |
## ___________________________________________________________________________________________________________________________
## HI: 3.2410399349424
## n: 500 500
## ___________________________________________________________________________________________________________________________
# Different distribution
set.seed(123)
x <- rnorm(n,0,1)
y <- rtp3(n,0,1,0.5,param="eps",FUN=rnorm)
# Stem-and-leaf plots
stem(x)
##
## The decimal point is at the |
##
## -2 | 765
## -2 | 322111000
## -1 | 888777777666655555
## -1 | 444444433333333333322222211111111111110000000000000
## -0 | 99999998888888877777777777777776666666666666666666666555555555555555
## -0 | 44444444444444444444444333333333333333333322222222222222222222221111+11
## 0 | 00000000000111111111111111111111111112222222222222222222223333333333+16
## 0 | 55555555555555566666666666677777777777777777778888888888888899999999
## 1 | 0000000000111111111111222222222333333333444444
## 1 | 555555666667777777888999
## 2 | 00000111222344
## 2 | 6
## 3 | 2
stem(y)
##
## The decimal point is at the |
##
## -4 | 220
## -3 | 98887665
## -3 | 3322211000
## -2 | 9999998888887777665555555
## -2 | 4444444333322222222222211111111000000
## -1 | 998888888888888777777777766666666666666555555555
## -1 | 444444444444443333333322222222222222222222211111111110000000000000
## -0 | 99999999999999999999998888888888888777777777777777777766666666666666+9
## -0 | 44444444444444444444443333333333333333333332222222222222222222222221+17
## 0 | 000000011111111111122222222222222222233333333333333333444444444
## 0 | 555555555555555666666666666666777788888999999
## 1 | 00001123
## 1 | 7
# Back-to-back stem-and-leaf plots
stem.leaf.backback(x,y)
## _____________________________________________________________________________________________________________________
## 1 | 2: represents 1.2, leaf unit: 0.1
## x y
## LO: -4.24833903587326
## _____________________________________________________________________________________________________________________
## | -4* |10 3
## | -3. |888 6
## | s |776 9
## | f |54 11
## | t |2222 15
## | -3* |100000 21
## | -2. |99998888 29
## 2 66| s |7777766666 39
## 3 4| f |5555444444 49
## 5 23| t |33333322222222 63
## 10 00001| -2* |11111111100000000 80
## 12 99| -1. |99998888888 91
## 24 666666667777| s |77777777777776666666666 114
## 32 44445555| f |555555555555444444444444 138
## 52 22222222222333333333| t |33333333332222222222222222 164
## 76 000000000000000001111111| -1* |111111111111110000000000 188
## 90 88888899999999| -0. |9999999999999999999998888888888888888 225
## 121 6666666666666677777777777777777| s |7777777777777766666666666666666666 (34)
## 165 44444444444444444444444455555555555555555555| f |555555555555555544444444444444444444444444 241
## 212 22222222222222222222222223333333333333333333333| t |3333333333333333332222222222222222222222222 199
## 245 000000000000000000111111111111111| -0* |111111111111111111111111000000000000000 156
## (47) 11111111111111111111110000000000000000000000000| 0* |0000000000000000111111111111 117
## 208 33333333333333333333322222222222222222| t |222222222222222222333333333333 89
## 170 555555555554444444444444444444| f |444444444444455555555555555 59
## 140 7777777777777777766666666666666666| s |6666666666677777 32
## 106 999999999888888888888888| 0. |88889999999 16
## 82 1111111100000000000| 1* |01 5
## 63 33333333222222222222| t |23 3
## 43 55555554444| f |
## 32 777776666666| s |6 1
## 20 9999988| 1. |
## 13 11111000| 2* |
## 5 32| t |
## 3 54| f |
## | s |
## | 2. |
## | 3* |
## _____________________________________________________________________________________________________________________
## HI: 3.2410399349424
## n: 500 500
## _____________________________________________________________________________________________________________________