1 Descriptive Statistics?

“The greatest value of a picture is when it forces us to notice what we never expected to see.” John W. Tukey, Exploratory Data Analysis, 1977

  1. EDA(Exploratory Data Analysis) phase : At the beginning of anlaysis
  2. Share phase : At the end of analysis

2 Theory : When and How to Use Appropriate One

  1. Histogram vs Stem & Leaf plot
  2. Boxplot
  3. Representation
  • Exponential, Log, Square & Cubic Transformation, Normalization
  1. P-P plot vs Q-Q plot
  2. Two-Way Data Table
  3. Time-series Data : Smoothing
  4. 3D plot

2.1 Stem & Leaf plot vs Histogram

woolA = warpbreaks[warpbreaks$wool=="A",]
stem(woolA$breaks,)
## 
##   The decimal point is 1 digit(s) to the right of the |
## 
##   1 | 0257888
##   2 | 114566689
##   3 | 00566
##   4 | 3
##   5 | 124
##   6 | 7
##   7 | 0
hist(woolA$breaks)

2.2 Box(-and-Whisker) Plot

  • Based on Five Number Summary : Min, First Quantile, Median, Third Quantile, Maximum
  • H-spread similar to IQR
  • inner fence = \(H_u + 1.5{H-spread}, H_l - 1.5{H-spread}\)
  • outer fence = \(H_u + 3{H-spread}, H_l - 3{H-spread}\)
  • notch = median \(\pm 1.58 \frac{H-spread}{\sqrt{n}}\)
    • If two notched interval do not overlap, two medians are significantly different at roughly 5% level
par(mfrow=c(1,2),mar=c(0,4,2,2)) # bottom, left, top , right
boxplot(Nile[time(Nile)<1898],main="Before Aswan Dam",
        notch = T, ylim= c(600,1400),col = "lightyellow")
boxplot(Nile[time(Nile)>=1898],main="After Aswan Dam",
        notch = T ,ylim= c(600,1400),col="lightgrey")

2.3 Representation

2.3.1 Aims of re-expression

  • to achieve symmetry within batches
  • to make spread between batches alike
    • Usually these two objects are achieved simultaneously; Re-expression usually needs not to balance these two.

2.3.2 Why symmetrize?

  • A large fraction of data are smashed into small regions of graphs and impair visual assessment
  • No unique notion of location; mean ̸= median ̸= mode
  • Many probabilistic model are based on normal assumptions

2.3.3 Ladder of Power

library(wesanderson)
curve(2*((x+1)^(1/2)-1),0,5,col= wes_palette(n=5, name="Darjeeling1")[2],
      lwd=2, main = "Ladder of Power", xlab = "X",ylab="Y", ylim = c(0,3.0))
curve(3*((x+1)^(1/3)-1),0,5,col = wes_palette(n=5, name="Darjeeling1")[3],add=T,lwd=2)
curve(log(x+1),0,5,col=wes_palette(n=5, name="Darjeeling1")[1],add=T,lwd=2)
curve(-3*((x+1)^(-1/3)-1),0,5,col=wes_palette(n=5, name="Darjeeling1")[4],add=T,lwd=2)
curve(-2*((x+1)^(-1/2)-1),0,5,col=wes_palette(n=5, name="Darjeeling1")[5],add=T,lwd=2)

legend(0.2,2.8,c("Square Root", "Cubic Root", "Log", "1/Cube Root", "1/Square Root"),
       col=wes_palette(n=5, name="Darjeeling1")[c(2,3,1,4,5)],pch=1)

## Loading required package: zoo
## 
## Attaching package: 'zoo'
## The following objects are masked from 'package:base':
## 
##     as.Date, as.Date.numeric
## Loading required package: TTR
## Registered S3 method overwritten by 'quantmod':
##   method            from
##   as.zoo.data.frame zoo
stock.data = getSymbols('005930.KS',from='2020-01-01',to='2020-12-31',auto.assign = FALSE)
## 'getSymbols' currently uses auto.assign=TRUE by default, but will
## use auto.assign=FALSE in 0.5-0. You will still be able to use
## 'loadSymbols' to automatically load data. getOption("getSymbols.env")
## and getOption("getSymbols.auto.assign") will still be checked for
## alternate defaults.
## 
## This message is shown once per session and may be disabled by setting 
## options("getSymbols.warning4.0"=FALSE). See ?getSymbols for details.
colnames(stock.data) = c("open","high","low","close","volume","adjusted")
stock.data = stock.data[stock.data$volume>0,]
plot(stock.data$close,main = "Time-Series Plot of Stock Price")

par(mfrow=c(1,2)) 
boxplot(stock.data$close,main = "Boxplot of Stock Price")
plot(density(stock.data$close), main = "Desnity Plot of Stock Price")

par(mfrow= c(1,2))
boxplot(stock.data$close, main = "Original Data")
boxplot(log(stock.data$close),main = "P = 0 Transformation")

2.4 p-p plot vs q-q plot

2.4.1 P-P plot : for same x, \(F_1(x)\) ~ \(F_2(x)\) / empirical CDF - theoretical CDF

2.4.2 Q-Q plot : If \(F_1\) and \(F_2\) are the same, for \(0\le p \le 1\), (\(F_1^{-1}\),\(F_2^{-1}\)) will lie on a straight line with the slope 1 and the intercept 0

  • variance difference : slope changes
  • mean difference : y-intercept changes
par(mfrow=c(2,2), mar = c(4,4,2,4))
set.seed(1)
sp = rnorm(50,mean = 1, sd = 2)
sp = sp[order(sp)]
qn = qnorm(seq(from=0.01,to=0.99,length=50))
plot(ecdf(sp), main = "Empirical CDF  vs Theoretical CDF");lines(seq(-2,2,length=50),pnorm(seq(-2,2,length=50)),col="red")
plot(qn, sp, main = "Normal Q-Q Plot", xlim = c(-4,4),ylim = c(-4,4)); abline(0,1,col="red") ; abline(1,2,col="blue")
plot(pnorm(sp),seq(0.02,1,by=0.02), main = "Normal P-P Plot with mu = 0 , sd = 1"); abline(0,1, col="red")
plot(pnorm(sp,mean=1, sd=2),seq(0.02,1,by=0.02), main = "Normal P-P Plot with mu = 1 , sd = 2"); abline(0,1, col="red")

2.5 Two-way data table(Contingency Table)

  • Visualization of ANOVA : Median Polish
require(graphics)

## Deaths from sport parachuting;  from ABC of EDA, p.224:
deaths = rbind(c(14,15,14),
          c( 7, 4, 7),
          c( 8, 2,10),
          c(15, 9,10),
          c( 0, 2, 0))
dimnames(deaths) <- list(c("1-24", "25-74", "75-199", "200++", "Not Reported"),
                         paste(1973:1975))
deaths
##              1973 1974 1975
## 1-24           14   15   14
## 25-74           7    4    7
## 75-199          8    2   10
## 200++          15    9   10
## Not Reported    0    2    0
(med.d <- medpolish(deaths))
## 1: 19
## Final: 19
## 
## Median Polish Results (Dataset: "deaths")
## 
## Overall: 8
## 
## Row Effects:
##         1-24        25-74       75-199        200++ Not Reported 
##            6           -1            0            2           -8 
## 
## Column Effects:
## 1973 1974 1975 
##    0   -1    0 
## 
## Residuals:
##              1973 1974 1975
## 1-24            0    2    0
## 25-74           0   -2    0
## 75-199          0   -5    2
## 200++           5    0    0
## Not Reported    0    3    0
plot(med.d)

## Coded Plot
CodedPlot = function(x){
  result = x
  criteria = fivenum(x)
  hspread = criteria[4]-criteria[2]
  outerU = criteria[4]+3*hspread
  outerL = criteria[2]-3*hspread
  innerU = criteria[4]+1.5*hspread
  innerL = criteria[2]+1.5*hspread
  for(i in 1:ncol(x)){
    for(j in 1:nrow(x)){
      if(x[j,i]>outerU) result[j,i] = "P"
      else if(x[j,i]>innerU) result[j,i] = "#"
      else if(x[j,i]>criteria[4]) result[j,i] = "+"
      else if(x[j,i]>criteria[2]) result[j,i] = "."
      else if(x[j,i]>innerL) result[j,i] = "-"
      else if(x[j,i]>outerL) result[j,i] = "="
      else result[j,i] = "M"
    }
  }
  return(result)
}
CodedPlot(deaths)
##              1973 1974 1975
## 1-24         "+"  "+"  "+" 
## 25-74        "."  "."  "." 
## 75-199       "."  "="  "." 
## 200++        "+"  "."  "." 
## Not Reported "="  "="  "="

2.7 3D Plot

library(scatterplot3d)
attach(iris)
scatterplot3d(Sepal.Length, Sepal.Width, Petal.Length)

4 References

  1. R을 활용한 탐색적 자료분석 (허명회 저)
  2. Data Visualization with R by Rob Kabacoff
  3. R Markdown: The Definitive Guide by Yihui Xie