Introduction


English statistician Frank Anscombe designed four small datasets with the aim of demonstrating the importance of visualising data and the dangers of relying on simple summary statistics alone.

His original paper, "Graphs in Statistical Analysis" (1973), can be retrieved from the JSTOR website.

1. Dataset


Like many classic datasets, the quartet is included in R's datasets package. First, load the required libraries and the data, and print the data frame:

library(ggplot2)
library(gridExtra)
# Load the quartet from the built-in datasets package, then print the whole
# data frame (columns x1..x4 and y1..y4).
data("anscombe", package = "datasets")
anscombe
##    x1 x2 x3 x4    y1   y2    y3    y4
## 1  10 10 10  8  8.04 9.14  7.46  6.58
## 2   8  8  8  8  6.95 8.14  6.77  5.76
## 3  13 13 13  8  7.58 8.74 12.74  7.71
## 4   9  9  9  8  8.81 8.77  7.11  8.84
## 5  11 11 11  8  8.33 9.26  7.81  8.47
## 6  14 14 14  8  9.96 8.10  8.84  7.04
## 7   6  6  6  8  7.24 6.13  6.08  5.25
## 8   4  4  4 19  4.26 3.10  5.39 12.50
## 9  12 12 12  8 10.84 9.13  8.15  5.56
## 10  7  7  7  8  4.82 7.26  6.42  7.91
## 11  5  5  5  8  5.68 4.74  5.73  6.89

2. Statistics Summary


The summary of the four datasets shows how similar they are in terms of the mean:

# Per-column summary (min, quartiles, median, mean, max) of all eight variables.
summary(object = anscombe)
##        x1             x2             x3             x4    
##  Min.   : 4.0   Min.   : 4.0   Min.   : 4.0   Min.   : 8  
##  1st Qu.: 6.5   1st Qu.: 6.5   1st Qu.: 6.5   1st Qu.: 8  
##  Median : 9.0   Median : 9.0   Median : 9.0   Median : 8  
##  Mean   : 9.0   Mean   : 9.0   Mean   : 9.0   Mean   : 9  
##  3rd Qu.:11.5   3rd Qu.:11.5   3rd Qu.:11.5   3rd Qu.: 8  
##  Max.   :14.0   Max.   :14.0   Max.   :14.0   Max.   :19  
##        y1               y2              y3              y4        
##  Min.   : 4.260   Min.   :3.100   Min.   : 5.39   Min.   : 5.250  
##  1st Qu.: 6.315   1st Qu.:6.695   1st Qu.: 6.25   1st Qu.: 6.170  
##  Median : 7.580   Median :8.140   Median : 7.11   Median : 7.040  
##  Mean   : 7.501   Mean   :7.501   Mean   : 7.50   Mean   : 7.501  
##  3rd Qu.: 8.570   3rd Qu.:8.950   3rd Qu.: 7.98   3rd Qu.: 8.190  
##  Max.   :10.840   Max.   :9.260   Max.   :12.74   Max.   :12.500

It is also easy to see the similarities in terms of the correlation coefficient between x and y, the variance of y, and the fitted linear regression:

# correlation between x_i and y_i for each of the four datasets; columns are
# paired by name rather than by a positional offset, and vapply() guarantees
# a plain numeric vector of length 4
vapply(
    1:4,
    function(i) cor(anscombe[[paste0("x", i)]], anscombe[[paste0("y", i)]]),
    numeric(1)
)
## [1] 0.8164205 0.8162365 0.8162867 0.8165214
# variance of each response column y1..y4, selected by name instead of by
# column position; vapply() guarantees a plain numeric vector of length 4
vapply(1:4, function(i) var(anscombe[[paste0("y", i)]]), numeric(1))
## [1] 4.127269 4.127629 4.122620 4.123249
# linear regression for dataset 1: least-squares fit of y1 on x1; printing
# the fit shows the call and the estimated intercept and slope
lm(y1 ~ x1, data = anscombe)
## 
## Call:
## lm(formula = y1 ~ x1, data = anscombe)
## 
## Coefficients:
## (Intercept)           x1  
##      3.0001       0.5001
# linear regression for dataset 2: least-squares fit of y2 on x2
lm(y2 ~ x2, data = anscombe)
## 
## Call:
## lm(formula = y2 ~ x2, data = anscombe)
## 
## Coefficients:
## (Intercept)           x2  
##       3.001        0.500
# linear regression for dataset 3: least-squares fit of y3 on x3
lm(y3 ~ x3, data = anscombe)
## 
## Call:
## lm(formula = y3 ~ x3, data = anscombe)
## 
## Coefficients:
## (Intercept)           x3  
##      3.0025       0.4997
# linear regression for dataset 4: least-squares fit of y4 on x4
lm(y4 ~ x4, data = anscombe)
## 
## Call:
## lm(formula = y4 ~ x4, data = anscombe)
## 
## Coefficients:
## (Intercept)           x4  
##      3.0017       0.4999

3. Plotting the quartet with ggplot2 package


# Build one scatter panel of the quartet.
#
# index:      which dataset to draw (1-4); selects columns x<index>, y<index>
#             of `anscombe`.
# intercept,
# slope:      reference line drawn over the points. The defaults (3 and 0.5)
#             are the rounded regression coefficients that all four datasets
#             share.
#
# Returns a ggplot object titled "dataset <index>". Every panel uses the same
# axis breaks and is forced to include the origin, so the four panels can be
# compared directly.
quartet_panel <- function(index, intercept = 3, slope = 0.5) {
    x_name <- paste0("x", index)
    y_name <- paste0("y", index)

    ggplot(anscombe) +
        geom_point(
            aes(.data[[x_name]], .data[[y_name]]),
            color = "darkred",
            size = 3
        ) +
        scale_x_continuous(breaks = seq(0, 20, 2)) +
        scale_y_continuous(breaks = seq(0, 12, 2)) +
        geom_abline(intercept = intercept, slope = slope, color = "darkblue") +
        expand_limits(x = 0, y = 0) +
        # axis labels are set explicitly so they stay "x1", "y1", ... even
        # though the columns are looked up through the .data pronoun
        labs(title = paste("dataset", index), x = x_name, y = y_name)
}

# One panel per dataset; the individual names are kept for later reference.
p1 <- quartet_panel(1)
p2 <- quartet_panel(2)
p3 <- quartet_panel(3)
p4 <- quartet_panel(4)

p <- list(p1, p2, p3, p4)

# Arrange the four panels on a 2 x 2 grid.
grid.arrange(grobs = p, ncol = 2)