This is an R Markdown document written by Professor Carlos Rodriguez. It is intended to show the exceptional power of data visualisation.
As a first step, we need to retrieve a dataset from an HTTPS site. This dataset contains Anscombe's Quartet (F. J. Anscombe, "Graphs in Statistical Analysis", The American Statistician, Vol. 27, No. 1 (Feb. 1973), pp. 17-21):
# Download the Anscombe's Quartet CSV, read it into a data frame and print it.
# library() stops immediately if the package is missing; require() would only
# return FALSE and let the script fail later at the download() call.
library(downloader)
# NOTE(review): Dropbox "/u/" public-folder links may no longer resolve --
# confirm the URL; the built-in datasets::anscombe appears to hold the same
# values and could serve as a fallback.
download("https://dl.dropboxusercontent.com/u/95175494/datasets/anscombe-quartet.csv", destfile = "anscombe.csv")
anscombe <- read.csv("anscombe.csv", header = TRUE)
# attach() is kept on purpose: the chunks below refer to the columns
# x1..x4 / y1..y4 by their bare names.
attach(anscombe)
anscombe
## x1 y1 x2 y2 x3 y3 x4 y4
## 1 10 8.04 10 9.14 10 7.46 8 6.58
## 2 8 6.95 8 8.14 8 6.77 8 5.76
## 3 13 7.58 13 8.74 13 12.74 8 7.71
## 4 9 8.81 9 8.77 9 7.11 8 8.84
## 5 11 8.33 11 9.26 11 7.81 8 8.47
## 6 14 9.96 14 8.10 14 8.84 8 7.04
## 7 6 7.24 6 6.13 6 6.08 8 5.25
## 8 4 4.26 4 3.10 4 5.39 19 12.50
## 9 12 10.84 12 9.13 12 8.15 8 5.56
## 10 7 4.82 7 7.26 7 6.42 8 7.91
## 11 5 5.68 5 4.74 5 5.73 8 6.89
To visualise the readings of each dataset, we generate tables in a graphic format with the gridExtra package:
# Pair each x column with its matching y column as a two-column table.
Dataset1 <- as.table(cbind(x1, y1))
Dataset2 <- as.table(cbind(x2, y2))
Dataset3 <- as.table(cbind(x3, y3))
Dataset4 <- as.table(cbind(x4, y4))

# Loading packages for displaying tables and graphs
# library(OIdata)
library(gridExtra)

# Start a fresh page before every table, then draw the table on it.
invisible(lapply(
  list(Dataset1, Dataset2, Dataset3, Dataset4),
  function(quartet_table) {
    plot.new()
    grid.table(quartet_table)
  }
))
You can proceed to perform traditional statistical analyses:
# Print the mean of every column, in the order x1, y1, x2, y2, x3, y3, x4, y4.
invisible(lapply(
  list(x1, y1, x2, y2, x3, y3, x4, y4),
  function(column) print(mean(column))
))
## [1] 9
## [1] 7.500909
## [1] 9
## [1] 7.500909
## [1] 9
## [1] 7.5
## [1] 9
## [1] 7.500909
# Print the standard deviation of every column, in the order
# x1, y1, x2, y2, x3, y3, x4, y4.
invisible(lapply(
  list(x1, y1, x2, y2, x3, y3, x4, y4),
  function(column) print(sd(column))
))
## [1] 3.316625
## [1] 2.031568
## [1] 3.316625
## [1] 2.031657
## [1] 3.316625
## [1] 2.030424
## [1] 3.316625
## [1] 2.030579
# Correlation between x and y within each of the four datasets.
cor(x1, y1)
cor(x2, y2)
cor(x3, y3)
cor(x4, y4)
## [1] 0.8164205
## [1] 0.8162365
## [1] 0.8162867
## [1] 0.8165214
As can be seen, the four datasets have practically the same summary statistics, and even the same correlation.
To find out whether there are any differences between the datasets, we need to generate data visualisations:
# Draw the four x/y scatter plots in a 2 x 2 grid on a cornsilk background,
# each with the fitted line from lm(y ~ x). All panels share ylim = c(0, 15).
# The line width is passed as `lwd`, spelled out in full: the abbreviated `lw`
# only worked through partial argument matching.
# The plot() calls stay explicit (no loop) because the default axis labels are
# taken from the argument expressions (x1, y1, ...).
par(mfrow = c(2, 2), bg = "cornsilk")

plot(x1, y1, main = "Plot X and Y for dataset 1", col = "red", ylim = c(0, 15))
abline(lm(y1 ~ x1), col = "red", lwd = 2)

plot(x2, y2, main = "Plot X and Y for dataset 2", col = "red", ylim = c(0, 15))
abline(lm(y2 ~ x2), col = "red", lwd = 2)

plot(x3, y3, main = "Plot X and Y for dataset 3", col = "red", ylim = c(0, 15))
abline(lm(y3 ~ x3), col = "red", lwd = 2)

plot(x4, y4, main = "Plot X and Y for dataset 4", col = "red", ylim = c(0, 15))
abline(lm(y4 ~ x4), col = "red", lwd = 2)
To show another example of the power of data visualisation, we need to replicate part of the analysis of Galton (Natural Inheritance, Francis Galton, F.R.S.):
par(mfrow = c(1, 1), bg = "white") # Returning the display to original settings
# ---- assocplot for Galton Investigation ----
# Build the 3 x 3 contingency table: rows are indexed by Wife, columns by
# Husband (each inner c(...) below is one column of counts).
galton <- as.table(cbind(c(12, 25, 9), c(20, 51, 28), c(18, 28, 14)))
dimnames(galton) <- list(Wife = c("tall", "medium", "short"), Husband = c("Short", "Medium", "Tall"))
plot.new()
# Change the font size of the column and row headers. gridExtra 2.x styles
# tables through `theme`; the older gpar.coltext / gpar.rowtext arguments are
# no longer accepted by tableGrob().
grid.table(
  galton,
  theme = ttheme_default(
    colhead = list(fg_params = list(cex = 1.2)),
    rowhead = list(fg_params = list(cex = 1.2))
  )
)
# Keep the background that was active before this point in `opar`, then
# switch to cornsilk for the association plot.
opar <- par(bg = "white")
par(bg = "cornsilk")

assocplot(galton, col = c("green", "red"))
title(
  main = "assocplot for the Galton Investigation",
  col.main = "darkblue",
  sub = "Graphic by Carlos Rodriguez, PhD",
  col.sub = "red",
  cex.sub = 0.7
)
Finally, a spineplot for the same dataset:
# Spine plot of the same Galton table, titled like the association plot.
spineplot(galton, col = c("green", "red"))
title(
  main = "spineplot for the Galton Investigation",
  col.main = "darkblue",
  sub = "Graphic by Carlos Rodriguez, PhD",
  col.sub = "red",
  cex.sub = 0.7
)
# Put back the graphical parameters saved in `opar`.
par(opar)