Data Visualisation

This is an R Markdown document written by Professor Carlos Rodriguez. It is intended to show the exceptional power of data visualisation.

As a first step, we retrieve a dataset from an HTTPS site. The dataset contains Anscombe's Quartet (F. J. Anscombe, "Graphs in Statistical Analysis", The American Statistician, Vol. 27, No. 1 (Feb. 1973), pp. 17-21):

# Load Anscombe's Quartet into `anscombe`.
# The remote CSV is tried first. If the download or the parse fails (offline,
# dead link, unexpected content), fall back to the `anscombe` data set that
# ships with R in the `datasets` package, with its columns reordered to the
# x1, y1, ..., x4, y4 layout used by the CSV.
anscombe_url <- "https://dl.dropboxusercontent.com/u/95175494/datasets/anscombe-quartet.csv"
anscombe_cols <- c("x1", "y1", "x2", "y2", "x3", "y3", "x4", "y4")

anscombe <- tryCatch(
  {
    # `downloader` is optional: use it when installed, otherwise rely on the
    # base downloader from `utils`.
    fetch <- if (requireNamespace("downloader", quietly = TRUE)) {
      downloader::download
    } else {
      utils::download.file
    }
    fetch(anscombe_url, destfile = "anscombe.csv")
    remote <- read.csv("anscombe.csv", header = TRUE)
    # Reject anything that is not the expected table (e.g. an HTML error page).
    stopifnot(all(anscombe_cols %in% names(remote)))
    remote
  },
  error = function(e) {
    warning(
      "Could not retrieve the remote Anscombe data (", conditionMessage(e),
      "); using datasets::anscombe instead.",
      call. = FALSE
    )
    datasets::anscombe[, anscombe_cols]
  }
)

# The rest of the document refers to x1 ... y4 by bare name, so the columns
# are placed on the search path.
attach(anscombe)
anscombe

##    x1    y1 x2   y2 x3    y3 x4    y4
## 1  10  8.04 10 9.14 10  7.46  8  6.58
## 2   8  6.95  8 8.14  8  6.77  8  5.76
## 3  13  7.58 13 8.74 13 12.74  8  7.71
## 4   9  8.81  9 8.77  9  7.11  8  8.84
## 5  11  8.33 11 9.26 11  7.81  8  8.47
## 6  14  9.96 14 8.10 14  8.84  8  7.04
## 7   6  7.24  6 6.13  6  6.08  8  5.25
## 8   4  4.26  4 3.10  4  5.39 19 12.50
## 9  12 10.84 12 9.13 12  8.15  8  5.56
## 10  7  4.82  7 7.26  7  6.42  8  7.91
## 11  5  5.68  5 4.74  5  5.73  8  6.89

To visualise the readings of each dataset, we render them as graphical tables with the gridExtra package:

# Pair each x column with its y column as a two-column table.
Dataset1 <- as.table(cbind(x1, y1))
Dataset2 <- as.table(cbind(x2, y2))
Dataset3 <- as.table(cbind(x3, y3))
Dataset4 <- as.table(cbind(x4, y4))

# Load the package used to display tables as graphics
# library(OIdata)
library(gridExtra)

# Draw every table on a fresh page of its own.
for (quartet_table in list(Dataset1, Dataset2, Dataset3, Dataset4)) {
  plot.new()
  grid.table(quartet_table)
}

You can proceed to perform traditional statistical analyses:

# Mean of every x and y series, printed one per line.
mean(x1)
mean(y1)
mean(x2)
mean(y2)
mean(x3)
mean(y3)
mean(x4)
mean(y4)

## [1] 9

## [1] 7.500909

## [1] 9

## [1] 7.500909

## [1] 9

## [1] 7.5

## [1] 9

## [1] 7.500909

# Standard deviation of every x and y series, printed one per line.
sd(x1)
sd(y1)
sd(x2)
sd(y2)
sd(x3)
sd(y3)
sd(x4)
sd(y4)

## [1] 3.316625

## [1] 2.031568

## [1] 3.316625

## [1] 2.031657

## [1] 3.316625

## [1] 2.030424

## [1] 3.316625

## [1] 2.030579

# Correlation between x and y within each of the four datasets.
cor(x1, y1)
cor(x2, y2)
cor(x3, y3)
cor(x4, y4)

## [1] 0.8164205

## [1] 0.8162365

## [1] 0.8162867

## [1] 0.8165214

As you can see, the four datasets have virtually the same summary statistics (mean and standard deviation), and even the same correlation.

To find out whether there are any differences between the datasets, we need to generate data visualisations:

# Scatter plot plus least-squares line for each of the four datasets,
# arranged in a 2 x 2 grid on a cornsilk background.
par(mfrow = c(2, 2), bg = "cornsilk")
invisible(lapply(seq_len(4), function(i) {
  x_name <- paste0("x", i)
  y_name <- paste0("y", i)
  x <- anscombe[[x_name]]
  y <- anscombe[[y_name]]
  plot(
    x, y,
    main = paste("Plot X and Y for dataset", i),
    xlab = x_name, ylab = y_name, # keep the axis labels of plot(x1, y1), ...
    col = "red", ylim = c(0, 15)  # shared y range so the panels are comparable
  )
  # `lwd` is spelled out in full; the abbreviation `lw` only works through
  # partial argument matching inside abline().
  abline(lm(y ~ x), col = "red", lwd = 2)
}))

To show another example of the power of data visualisation, we replicate part of Galton's analysis (Francis Galton, F.R.S., Natural Inheritance):

par(mfrow = c(1, 1), bg = "white") # Back to a single panel on a white background
# ---- assocplot for Galton Investigation ----
# 3 x 3 contingency table: rows are labelled by Wife, columns by Husband.
galton <- as.table(cbind(c(12, 25, 9), c(20, 51, 28), c(18, 28, 14)))
dimnames(galton) <- list(
  Wife = c("tall", "medium", "short"),
  Husband = c("Short", "Medium", "Tall")
)
plot.new()
# Enlarge the header fonts. gridExtra >= 2.0 no longer accepts the
# gpar.coltext / gpar.rowtext arguments; text styling goes through the theme.
grid.table(
  galton,
  theme = ttheme_default(
    colhead = list(fg_params = list(cex = 1.2)),
    rowhead = list(fg_params = list(cex = 1.2))
  )
)

# Switch to a cornsilk background, remembering the previous setting in `opar`
# so it can be restored later.
opar <- par(bg = "cornsilk")
assocplot(galton, col = c("green", "red"))
title(
  main = "assocplot for the Galton Investigation",
  col.main = "darkblue",
  sub = "Graphic by Carlos Rodriguez, PhD",
  col.sub = "red",
  cex.sub = 0.7
)

Finally, a spineplot for the same dataset:

# Spine plot of the same contingency table, with a title and credit line.
spineplot(galton, col = c("green", "red"))
title(
  main = "spineplot for the Galton Investigation",
  col.main = "darkblue",
  sub = "Graphic by Carlos Rodriguez, PhD",
  col.sub = "red",
  cex.sub = 0.7
)

# Put back the graphical parameters saved in `opar`.
par(opar)

Data Visualisation

Carlos Rodríguez, PhD

04/12/2014