Notes on outliers

An outlier tells a story! Don't ignore this story.

Oultiers can't be removed just because they are outliers! There must be a reason.

Before removing outliers, ask the following questions:

For packages that can handle data with outliers, see “Robust statistical methods” http://cran.r-project.org/web/views/Robust.html

Identifying outliers via boxplots and corresponding functions

# some libraries for general data wrangling
library(plyr)
library(ggplot2)

# set working directory
setwd("~/AgFace/Plant_Production/Silverstar_tin")
load(file = "Silverstar_tin_environment.RData")

# prepare a data set:
GRDC.para <- "Yield..g.m2."
GRDC.yield <- df.melt[df.melt$variable %in% GRDC.para, ]

# Manual visualisation and calculation
p <- ggplot(GRDC.yield, aes(x = "All samples", y = value))
     p <- p + geom_boxplot(outlier.colour = "tomato")
p
## Warning: Removed 256 rows containing non-finite values (stat_boxplot).

plot of chunk unnamed-chunk-1


# get the summary statistics
summary(GRDC.yield$value)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max.    NA's 
##     240     509     594     602     718    1060     256

my.median <- median(GRDC.yield$value, na.rm = TRUE)

# add corresponding lines to the plot
p <- p + geom_hline(yintercept = my.median, colour = "red")
p
## Warning: Removed 256 rows containing non-finite values (stat_boxplot).

plot of chunk unnamed-chunk-1


# other summary functions
fivenum(GRDC.yield$value, na.rm = TRUE)
## [1]  239.9  509.4  593.6  718.5 1060.6
quantile(GRDC.yield$value, na.rm = TRUE)
##     0%    25%    50%    75%   100% 
##  239.9  509.5  593.6  718.0 1060.6

p <- p + geom_hline(yintercept = my.median, colour = "red")

# grab the first and third quantile (lower and upper hinge)
first.quant <- quantile(GRDC.yield$value, na.rm = TRUE)[2]
third.quant <- quantile(GRDC.yield$value, na.rm = TRUE)[4]

my.quantiles <- c(first.quant, third.quant)
p <- p + geom_hline(yintercept = my.quantiles, colour = "green")
p
## Warning: Removed 256 rows containing non-finite values (stat_boxplot).

plot of chunk unnamed-chunk-1


# calculate end of whiskers
# whisker commonly extend to 1.5 * interquartile range (IQR)
# different definitions exist!

# get the IQR
IQR(GRDC.yield$value, na.rm = TRUE)
## [1] 208.5
my.IQR <- 1.5 * IQR(GRDC.yield$value, na.rm = TRUE)

upper.IQR <- third.quant + my.IQR
lower.IQR <- first.quant - my.IQR

upper.lower.range <- c(upper.IQR, lower.IQR)

# in the graph, the whisker extends to the sample that is still within the 1.5*IQR range, it does not extend to the calculated 1.5*IQR!
p <- p + geom_hline(yintercept = upper.lower.range, colour = "yellow")
p
## Warning: Removed 256 rows containing non-finite values (stat_boxplot).

plot of chunk unnamed-chunk-1


# identify samples outside the upper/lower 1.5*IQR range

which(GRDC.yield$value > upper.IQR | 
      GRDC.yield$value < lower.IQR)
## [1] 359

# check the sample in context of the data frame:
GRDC.yield[359, ]
##       Year Irrigation   Cultivar Stage Trmt.Order Plot.Order
## 47343 2012        Sup Silverstar  DC90         27         55
##       CO2.Irr.Cultivar.DATABASE Trial.ID RingID PlotID HalfRing RingPos.
## 47343       eCO2 Sup Silverstar      PBC      5      G        1        W
##          RingTrt  CO2  Crop Qplot QHalfRing Qring Bulk_Trait Sample.date
## 47343 Wheat eCO2 eCO2 Wheat    p3         2     2      Trait          NA
##       Sowing.date Environment my.HalfringID   my.Trait Ord.Environment
## 47343          NA    2012.Sup      2012.5.W Silverstar        2012.Sup
##           variable value    DC_date
## 47343 Yield..g.m2.  1061 2012-12-05


# Calculation using boxplots.stats()
# Back to the original simple boxplot
p <- ggplot(GRDC.yield, aes(x = "All", y = value))
     p <- p + geom_boxplot()
p
## Warning: Removed 256 rows containing non-finite values (stat_boxplot).

plot of chunk unnamed-chunk-1


# using the boxplots.stats function
# Uses a different approach than our manual calculation!
# see ?boxplots.stats, ?boxplot, and ?geom_boxplotfor details!

boxplot.stats(GRDC.yield$value)
## $stats
## [1]  239.9  509.4  593.6  718.5 1008.4
## 
## $n
## [1] 128
## 
## $conf
## [1] 564.4 622.8
## 
## $out
## [1] 1061

# list of sample-values that are outside the whiskers:
boxplot.stats(GRDC.yield$value)$out
## [1] 1061

# Multiple groups
p <- ggplot(GRDC.yield, aes(x = Cultivar, y = value))
     p <- p + geom_boxplot()
p
## Warning: Removed 256 rows containing non-finite values (stat_boxplot).

plot of chunk unnamed-chunk-1


# multiple groups via plyr
ddply(GRDC.yield,
      .(Cultivar),
      summarise,
      outliers = boxplot.stats(value)$out)
##    Cultivar outliers
## 1 SB003 low    273.9
## 2 SB003 low    879.3

# facet plot
p <- ggplot(GRDC.yield, aes(x = Environment, y = value))
     p <- p + geom_boxplot(aes(fill = Cultivar, linetype = CO2), notch = FALSE)
     p <- p + labs(y = expression("Yield"~"["~g~m^2~"]"),
                   x = "Environment")
     p <- p + theme_bw()
     p <- p + facet_grid(. ~ my.Trait)
p
## Warning: Removed 128 rows containing non-finite values (stat_boxplot).
## Warning: Removed 128 rows containing non-finite values (stat_boxplot).

plot of chunk unnamed-chunk-1


# corresponding boxplot.stats - doesn't return any outliers contrary to the graph?
dlply(GRDC.yield,
      .(Environment, Cultivar, CO2, my.Trait),
      summarise,
      outliers = boxplot.stats(value))
## $`2011.Rain.SB003 low.aCO2.SB`
##                            outliers
## 1 400.6, 454.6, 529.4, 628.6, 707.0
## 2                                 4
## 3                      392.0, 666.8
## 4                                  
## 
## $`2011.Rain.SB003 low.eCO2.SB`
##                            outliers
## 1 524.7, 540.7, 580.5, 647.4, 690.3
## 2                                 4
## 3                      496.2, 664.8
## 4                                  
## 
## $`2011.Rain.SB062 high.aCO2.SB`
##                            outliers
## 1 466.4, 531.0, 604.0, 644.9, 677.5
## 2                                 4
## 3                      514.0, 693.9
## 4                                  
## 
## $`2011.Rain.SB062 high.eCO2.SB`
##                            outliers
## 1 579.8, 648.7, 760.8, 846.9, 889.6
## 2                                 4
## 3                      604.2, 917.5
## 4                                  
## 
## $`2011.Rain.Silverstar.aCO2.Silverstar`
##                            outliers
## 1 369.3, 435.3, 559.5, 632.8, 647.9
## 2                                 4
## 3                      403.4, 715.6
## 4                                  
## 
## $`2011.Rain.Silverstar.eCO2.Silverstar`
##                            outliers
## 1 592.0, 655.8, 762.4, 824.0, 842.7
## 2                                 4
## 3                      629.5, 895.3
## 4                                  
## 
## $`2011.Rain.SSR T65.aCO2.Silverstar`
##                            outliers
## 1 239.9, 247.0, 387.1, 534.8, 549.4
## 2                                 4
## 3                      159.8, 614.4
## 4                                  
## 
## $`2011.Rain.SSR T65.eCO2.Silverstar`
##                            outliers
## 1 327.4, 387.2, 469.8, 570.5, 648.5
## 2                                 4
## 3                      325.0, 614.6
## 4                                  
## 
## $`2012.Rain.SB003 low.aCO2.SB`
##                            outliers
## 1 273.9, 305.0, 384.0, 511.4, 590.9
## 2                                 4
## 3                      220.9, 547.1
## 4                                  
## 
## $`2012.Rain.SB003 low.eCO2.SB`
##                            outliers
## 1 331.5, 380.3, 481.2, 613.9, 694.5
## 2                                 4
## 3                      296.6, 665.8
## 4                                  
## 
## $`2012.Rain.SB062 high.aCO2.SB`
##                            outliers
## 1 273.3, 307.4, 399.0, 630.6, 804.7
## 2                                 4
## 3                      143.6, 654.4
## 4                                  
## 
## $`2012.Rain.SB062 high.eCO2.SB`
##                            outliers
## 1 422.9, 473.6, 548.7, 662.7, 752.2
## 2                                 4
## 3                      399.3, 698.1
## 4                                  
## 
## $`2012.Rain.Silverstar.aCO2.Silverstar`
##                            outliers
## 1 428.3, 442.9, 513.7, 668.1, 766.4
## 2                                 4
## 3                      335.7, 691.6
## 4                                  
## 
## $`2012.Rain.Silverstar.eCO2.Silverstar`
##                            outliers
## 1 523.8, 536.1, 579.5, 612.1, 613.4
## 2                                 4
## 3                      519.5, 639.5
## 4                                  
## 
## $`2012.Rain.SSR T65.aCO2.Silverstar`
##                            outliers
## 1 424.2, 443.4, 462.8, 494.4, 525.7
## 2                                 4
## 3                      422.5, 503.1
## 4                                  
## 
## $`2012.Rain.SSR T65.eCO2.Silverstar`
##                            outliers
## 1 412.6, 468.1, 588.9, 658.9, 663.6
## 2                                 4
## 3                      438.1, 739.6
## 4                                  
## 
## $`2011.Sup.SB003 low.aCO2.SB`
##                            outliers
## 1 542.5, 548.4, 554.5, 566.4, 578.2
## 2                                 4
## 3                      540.3, 568.7
## 4                                  
## 
## $`2011.Sup.SB003 low.eCO2.SB`
##                            outliers
## 1 509.5, 559.1, 631.6, 709.1, 763.5
## 2                                 4
## 3                      513.1, 750.1
## 4                                  
## 
## $`2011.Sup.SB062 high.aCO2.SB`
##                            outliers
## 1 524.5, 569.7, 625.2, 698.5, 761.5
## 2                                 4
## 3                      523.4, 727.0
## 4                                  
## 
## $`2011.Sup.SB062 high.eCO2.SB`
##                             outliers
## 1 634.4, 645.2, 702.6, 878.9, 1008.4
## 2                                  4
## 3                       518.0, 887.3
## 4                                   
## 
## $`2011.Sup.Silverstar.aCO2.Silverstar`
##                            outliers
## 1 492.3, 563.0, 676.8, 744.1, 768.1
## 2                                 4
## 3                      533.8, 819.9
## 4                                  
## 
## $`2011.Sup.Silverstar.eCO2.Silverstar`
##                            outliers
## 1 723.5, 758.9, 860.8, 939.0, 950.7
## 2                                 4
## 3                     718.5, 1003.0
## 4                                  
## 
## $`2011.Sup.SSR T65.aCO2.Silverstar`
##                            outliers
## 1 354.8, 356.1, 389.8, 487.6, 552.8
## 2                                 4
## 3                      285.9, 493.7
## 4                                  
## 
## $`2011.Sup.SSR T65.eCO2.Silverstar`
##                            outliers
## 1 475.2, 509.5, 569.5, 669.6, 744.0
## 2                                 4
## 3                      443.0, 695.9
## 4                                  
## 
## $`2012.Sup.SB003 low.aCO2.SB`
##                            outliers
## 1 521.7, 529.7, 541.4, 603.7, 662.1
## 2                                 4
## 3                      483.0, 599.9
## 4                                  
## 
## $`2012.Sup.SB003 low.eCO2.SB`
##                            outliers
## 1 635.7, 709.4, 805.8, 853.9, 879.3
## 2                                 4
## 3                      691.6, 919.9
## 4                                  
## 
## $`2012.Sup.SB062 high.aCO2.SB`
##                            outliers
## 1 522.3, 527.2, 550.2, 653.4, 738.3
## 2                                 4
## 3                      450.5, 649.9
## 4                                  
## 
## $`2012.Sup.SB062 high.eCO2.SB`
##                            outliers
## 1 735.2, 743.0, 761.3, 779.1, 786.4
## 2                                 4
## 3                      732.8, 789.7
## 4                                  
## 
## $`2012.Sup.Silverstar.aCO2.Silverstar`
##                            outliers
## 1 543.9, 575.3, 609.8, 640.3, 667.6
## 2                                 4
## 3                      558.4, 661.3
## 4                                  
## 
## $`2012.Sup.Silverstar.eCO2.Silverstar`
##                              outliers
## 1 713.0, 732.0, 865.9, 1020.7, 1060.6
## 2                                   4
## 3                       637.8, 1093.9
## 4                                    
## 
## $`2012.Sup.SSR T65.aCO2.Silverstar`
##                            outliers
## 1 423.3, 466.3, 567.0, 647.4, 670.0
## 2                                 4
## 3                      424.0, 710.1
## 4                                  
## 
## $`2012.Sup.SSR T65.eCO2.Silverstar`
##                            outliers
## 1 531.1, 615.3, 730.0, 818.5, 876.5
## 2                                 4
## 3                      569.5, 890.5
## 4                                  
## 
## attr(,"split_type")
## [1] "data.frame"
## attr(,"split_labels")
##    Environment   Cultivar  CO2   my.Trait
## 1    2011.Rain  SB003 low aCO2         SB
## 2    2011.Rain  SB003 low eCO2         SB
## 3    2011.Rain SB062 high aCO2         SB
## 4    2011.Rain SB062 high eCO2         SB
## 5    2011.Rain Silverstar aCO2 Silverstar
## 6    2011.Rain Silverstar eCO2 Silverstar
## 7    2011.Rain    SSR T65 aCO2 Silverstar
## 8    2011.Rain    SSR T65 eCO2 Silverstar
## 9    2012.Rain  SB003 low aCO2         SB
## 10   2012.Rain  SB003 low eCO2         SB
## 11   2012.Rain SB062 high aCO2         SB
## 12   2012.Rain SB062 high eCO2         SB
## 13   2012.Rain Silverstar aCO2 Silverstar
## 14   2012.Rain Silverstar eCO2 Silverstar
## 15   2012.Rain    SSR T65 aCO2 Silverstar
## 16   2012.Rain    SSR T65 eCO2 Silverstar
## 17    2011.Sup  SB003 low aCO2         SB
## 18    2011.Sup  SB003 low eCO2         SB
## 19    2011.Sup SB062 high aCO2         SB
## 20    2011.Sup SB062 high eCO2         SB
## 21    2011.Sup Silverstar aCO2 Silverstar
## 22    2011.Sup Silverstar eCO2 Silverstar
## 23    2011.Sup    SSR T65 aCO2 Silverstar
## 24    2011.Sup    SSR T65 eCO2 Silverstar
## 25    2012.Sup  SB003 low aCO2         SB
## 26    2012.Sup  SB003 low eCO2         SB
## 27    2012.Sup SB062 high aCO2         SB
## 28    2012.Sup SB062 high eCO2         SB
## 29    2012.Sup Silverstar aCO2 Silverstar
## 30    2012.Sup Silverstar eCO2 Silverstar
## 31    2012.Sup    SSR T65 aCO2 Silverstar
## 32    2012.Sup    SSR T65 eCO2 Silverstar

using the library(outliers)

Might not be very helpful in the context of biological, or generally noisy data.

library(outliers)

# One-group boxplot
p <- ggplot(GRDC.yield, aes(x = "All samples", y = value))
p <- p + geom_boxplot(outlier.colour = "tomato")
p
## Warning: Removed 256 rows containing non-finite values (stat_boxplot).

plot of chunk unnamed-chunk-2


# be rather careful with these functions, make sure you read and understand the helpfiles!
# Grubbs tests for one or two outliers in data sample
grubbs.test(GRDC.yield$value)
## 
##  Grubbs test for one outlier
## 
## data:  GRDC.yield$value
## G = 2.8416, U = 0.9359, p-value = 0.2521
## alternative hypothesis: highest value 1060.55111111111 is an outlier

# two outliers
p <- ggplot(GRDC.yield, aes(x = Cultivar, y = value))
p <- p + geom_boxplot(aes(fill = Cultivar))
p
## Warning: Removed 256 rows containing non-finite values (stat_boxplot).

plot of chunk unnamed-chunk-2


# checking for two outliers on opposite ends of the range (type 11)
# this finds two outliers for all groups that we don't expect from the quantile based methods described above'! So be extemely cautious!!!
dlply(GRDC.yield,
      .(Cultivar),
      summarise,
      grubbs.test(value, type = 11))
## $`SB003 low`
##                                                  ..1
## 1                                     4.3498, 0.6948
## 2 273.934222222222 and 879.345777777778 are outliers
## 3                                             0.5178
## 4              Grubbs test for two opposite outliers
## 5                                              value
## 
## $`SB062 high`
##                                                  ..1
## 1                                     4.6572, 0.6502
## 2 273.322666666667 and 1008.40103703704 are outliers
## 3                                             0.1804
## 4              Grubbs test for two opposite outliers
## 5                                              value
## 
## $Silverstar
##                                                  ..1
## 1                                     4.2164, 0.7089
## 2 369.344962962963 and 1060.55111111111 are outliers
## 3                                             0.7873
## 4              Grubbs test for two opposite outliers
## 5                                              value
## 
## $`SSR T65`
##                                                  ..1
## 1                                     4.3685, 0.6883
## 2 239.926444444444 and 876.501333333333 are outliers
## 3                                             0.4874
## 4              Grubbs test for two opposite outliers
## 5                                              value
## 
## attr(,"split_type")
## [1] "data.frame"
## attr(,"split_labels")
##     Cultivar
## 1  SB003 low
## 2 SB062 high
## 3 Silverstar
## 4    SSR T65


# A very fuzzy method, seems to be even more dangerous:
# outlier Find value with largest difference from the mean
# Finds value with largest difference between it and sample mean, which can be an outlier
outlier(GRDC.yield$value)
## [1] 1061

ddply(GRDC.yield,
      .(Cultivar),
      summarise,
      outlier(value))
##     Cultivar    ..1
## 1  SB003 low  879.3
## 2 SB062 high 1008.4
## 3 Silverstar 1060.6
## 4    SSR T65  876.5

Notched boxplots:

See ?geom_boxplot notch: if FALSE (default) make a standard box plot. If TRUE, make a notched box plot. Notches are used to compare groups; if the notches of two boxes do not overlap, this is strong evidence that the medians differ. […] In a notched box plot, the notches extend 1.58 * IQR / sqrt(n). This gives a roughly 95% interval for comparing medians. See McGill et al. (1978) for more details.

# Notched boxplots example:

p <- ggplot(GRDC.yield, aes(x = Cultivar, y = value))
p <- p + geom_boxplot(aes(fill = Cultivar), notch = TRUE)
p
## Warning: Removed 256 rows containing non-finite values (stat_boxplot).
## notch went outside hinges. Try setting notch=FALSE.

plot of chunk unnamed-chunk-3


# Silverstar and SSR T65 seem to have different medians.
# testing the differences via aov and t.test for comparison
summary(aov(value ~ Cultivar, data = GRDC.yield[GRDC.yield$my.Trait == "Silverstar", ]))
##             Df  Sum Sq Mean Sq F value  Pr(>F)    
## Cultivar     1  361777  361777      15 0.00026 ***
## Residuals   62 1491349   24054                    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 128 observations deleted due to missingness
t.test(value ~ Cultivar, data = GRDC.yield[GRDC.yield$my.Trait == "Silverstar", ])
## 
##  Welch Two Sample t-test
## 
## data:  value by Cultivar
## t = 3.878, df = 61.16, p-value = 0.0002598
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
##   72.84 227.90
## sample estimates:
## mean in group Silverstar    mean in group SSR T65 
##                    673.8                    523.4

summary(aov(value ~ Cultivar, data = GRDC.yield[GRDC.yield$my.Trait == "SB", ]))
##             Df  Sum Sq Mean Sq F value Pr(>F)  
## Cultivar     1   72781   72781    3.29  0.075 .
## Residuals   62 1372799   22142                 
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 128 observations deleted due to missingness
t.test(value ~ Cultivar, data = GRDC.yield[GRDC.yield$my.Trait == "SB", ])
## 
##  Welch Two Sample t-test
## 
## data:  value by Cultivar
## t = -1.813, df = 61.04, p-value = 0.07475
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
##  -141.831    6.941
## sample estimates:
##  mean in group SB003 low mean in group SB062 high 
##                    572.6                    640.0

# tillering boxplots with notches - where notches back-fire due to small sample sizes.
GRDC.para <- "Tillers.m2..SS.dry."
GRDC.tiller <- df.melt[df.melt$variable %in% GRDC.para &
                       df.melt$my.Trait == "Silverstar", ]

p <- ggplot(GRDC.tiller, aes(x = Environment, y = value))
     p <- p + geom_boxplot(aes(fill = Cultivar, linetype = CO2), notch = TRUE)
     p <- p + facet_grid(Stage ~ .)
     p <- p + labs(y = expression(Tillers~m^-2),
                   x = "Environment")
     p <- p + theme_bw()
# as there was a question on how to turn the axis-text and change the size
     p <- p + theme(axis.text.x = element_text(angle = 90, hjust = 0, size = rel(0.5)))
p
## notch went outside hinges. Try setting notch=FALSE.
## notch went outside hinges. Try setting notch=FALSE.
## notch went outside hinges. Try setting notch=FALSE.
## notch went outside hinges. Try setting notch=FALSE.
## notch went outside hinges. Try setting notch=FALSE.
## notch went outside hinges. Try setting notch=FALSE.
## notch went outside hinges. Try setting notch=FALSE.
## notch went outside hinges. Try setting notch=FALSE.
## notch went outside hinges. Try setting notch=FALSE.
## notch went outside hinges. Try setting notch=FALSE.
## notch went outside hinges. Try setting notch=FALSE.
## notch went outside hinges. Try setting notch=FALSE.
## notch went outside hinges. Try setting notch=FALSE.
## notch went outside hinges. Try setting notch=FALSE.
## notch went outside hinges. Try setting notch=FALSE.
## notch went outside hinges. Try setting notch=FALSE.
## notch went outside hinges. Try setting notch=FALSE.
## notch went outside hinges. Try setting notch=FALSE.
## notch went outside hinges. Try setting notch=FALSE.
## notch went outside hinges. Try setting notch=FALSE.
## notch went outside hinges. Try setting notch=FALSE.
## notch went outside hinges. Try setting notch=FALSE.
## notch went outside hinges. Try setting notch=FALSE.
## notch went outside hinges. Try setting notch=FALSE.
## notch went outside hinges. Try setting notch=FALSE.
## notch went outside hinges. Try setting notch=FALSE.
## notch went outside hinges. Try setting notch=FALSE.
## notch went outside hinges. Try setting notch=FALSE.
## notch went outside hinges. Try setting notch=FALSE.
## notch went outside hinges. Try setting notch=FALSE.
## notch went outside hinges. Try setting notch=FALSE.
## notch went outside hinges. Try setting notch=FALSE.
## notch went outside hinges. Try setting notch=FALSE.
## notch went outside hinges. Try setting notch=FALSE.
## notch went outside hinges. Try setting notch=FALSE.
## notch went outside hinges. Try setting notch=FALSE.
## notch went outside hinges. Try setting notch=FALSE.
## notch went outside hinges. Try setting notch=FALSE.
## notch went outside hinges. Try setting notch=FALSE.
## notch went outside hinges. Try setting notch=FALSE.
## notch went outside hinges. Try setting notch=FALSE.
## notch went outside hinges. Try setting notch=FALSE.
## notch went outside hinges. Try setting notch=FALSE.
## notch went outside hinges. Try setting notch=FALSE.
## notch went outside hinges. Try setting notch=FALSE.
## notch went outside hinges. Try setting notch=FALSE.
## notch went outside hinges. Try setting notch=FALSE.
## notch went outside hinges. Try setting notch=FALSE.

plot of chunk unnamed-chunk-3