An outlier tells a story! Don't ignore this story.
Oultiers can't be removed just because they are outliers! There must be a reason.
Before removing outliers, ask the following questions:
For packages that can handle data with outliers, see “Robust statistical methods” http://cran.r-project.org/web/views/Robust.html
# some libraries for general data wrangling
library(plyr)
library(ggplot2)
# set working directory
setwd("~/AgFace/Plant_Production/Silverstar_tin")
load(file = "Silverstar_tin_environment.RData")
# prepare a data set:
GRDC.para <- "Yield..g.m2."
GRDC.yield <- df.melt[df.melt$variable %in% GRDC.para, ]
# Manual visualisation and calculation
p <- ggplot(GRDC.yield, aes(x = "All samples", y = value))
p <- p + geom_boxplot(outlier.colour = "tomato")
p
## Warning: Removed 256 rows containing non-finite values (stat_boxplot).
# get the summary statistics
summary(GRDC.yield$value)
## Min. 1st Qu. Median Mean 3rd Qu. Max. NA's
## 240 509 594 602 718 1060 256
my.median <- median(GRDC.yield$value, na.rm = TRUE)
# add corresponding lines to the plot
p <- p + geom_hline(yintercept = my.median, colour = "red")
p
## Warning: Removed 256 rows containing non-finite values (stat_boxplot).
# other summary functions
fivenum(GRDC.yield$value, na.rm = TRUE)
## [1] 239.9 509.4 593.6 718.5 1060.6
quantile(GRDC.yield$value, na.rm = TRUE)
## 0% 25% 50% 75% 100%
## 239.9 509.5 593.6 718.0 1060.6
p <- p + geom_hline(yintercept = my.median, colour = "red")
# grab the first and third quantile (lower and upper hinge)
first.quant <- quantile(GRDC.yield$value, na.rm = TRUE)[2]
third.quant <- quantile(GRDC.yield$value, na.rm = TRUE)[4]
my.quantiles <- c(first.quant, third.quant)
p <- p + geom_hline(yintercept = my.quantiles, colour = "green")
p
## Warning: Removed 256 rows containing non-finite values (stat_boxplot).
# calculate end of whiskers
# whisker commonly extend to 1.5 * interquartile range (IQR)
# different definitions exist!
# get the IQR
IQR(GRDC.yield$value, na.rm = TRUE)
## [1] 208.5
my.IQR <- 1.5 * IQR(GRDC.yield$value, na.rm = TRUE)
upper.IQR <- third.quant + my.IQR
lower.IQR <- first.quant - my.IQR
upper.lower.range <- c(upper.IQR, lower.IQR)
# in the graph, the whisker extends to the sample that is still within the 1.5*IQR range, it does not extend to the calculated 1.5*IQR!
p <- p + geom_hline(yintercept = upper.lower.range, colour = "yellow")
p
## Warning: Removed 256 rows containing non-finite values (stat_boxplot).
# identify samples outside the upper/lower 1.5*IQR range
which(GRDC.yield$value > upper.IQR |
GRDC.yield$value < lower.IQR)
## [1] 359
# check the sample in context of the data frame:
GRDC.yield[359, ]
## Year Irrigation Cultivar Stage Trmt.Order Plot.Order
## 47343 2012 Sup Silverstar DC90 27 55
## CO2.Irr.Cultivar.DATABASE Trial.ID RingID PlotID HalfRing RingPos.
## 47343 eCO2 Sup Silverstar PBC 5 G 1 W
## RingTrt CO2 Crop Qplot QHalfRing Qring Bulk_Trait Sample.date
## 47343 Wheat eCO2 eCO2 Wheat p3 2 2 Trait NA
## Sowing.date Environment my.HalfringID my.Trait Ord.Environment
## 47343 NA 2012.Sup 2012.5.W Silverstar 2012.Sup
## variable value DC_date
## 47343 Yield..g.m2. 1061 2012-12-05
# Calculation using boxplots.stats()
# Back to the original simple boxplot
p <- ggplot(GRDC.yield, aes(x = "All", y = value))
p <- p + geom_boxplot()
p
## Warning: Removed 256 rows containing non-finite values (stat_boxplot).
# using the boxplots.stats function
# Uses a different approach than our manual calculation!
# see ?boxplots.stats, ?boxplot, and ?geom_boxplotfor details!
boxplot.stats(GRDC.yield$value)
## $stats
## [1] 239.9 509.4 593.6 718.5 1008.4
##
## $n
## [1] 128
##
## $conf
## [1] 564.4 622.8
##
## $out
## [1] 1061
# list of sample-values that are outside the whiskers:
boxplot.stats(GRDC.yield$value)$out
## [1] 1061
# Multiple groups
p <- ggplot(GRDC.yield, aes(x = Cultivar, y = value))
p <- p + geom_boxplot()
p
## Warning: Removed 256 rows containing non-finite values (stat_boxplot).
# multiple groups via plyr
ddply(GRDC.yield,
.(Cultivar),
summarise,
outliers = boxplot.stats(value)$out)
## Cultivar outliers
## 1 SB003 low 273.9
## 2 SB003 low 879.3
# facet plot
p <- ggplot(GRDC.yield, aes(x = Environment, y = value))
p <- p + geom_boxplot(aes(fill = Cultivar, linetype = CO2), notch = FALSE)
p <- p + labs(y = expression("Yield"~"["~g~m^2~"]"),
x = "Environment")
p <- p + theme_bw()
p <- p + facet_grid(. ~ my.Trait)
p
## Warning: Removed 128 rows containing non-finite values (stat_boxplot).
## Warning: Removed 128 rows containing non-finite values (stat_boxplot).
# corresponding boxplot.stats - doesn't return any outliers contrary to the graph?
dlply(GRDC.yield,
.(Environment, Cultivar, CO2, my.Trait),
summarise,
outliers = boxplot.stats(value))
## $`2011.Rain.SB003 low.aCO2.SB`
## outliers
## 1 400.6, 454.6, 529.4, 628.6, 707.0
## 2 4
## 3 392.0, 666.8
## 4
##
## $`2011.Rain.SB003 low.eCO2.SB`
## outliers
## 1 524.7, 540.7, 580.5, 647.4, 690.3
## 2 4
## 3 496.2, 664.8
## 4
##
## $`2011.Rain.SB062 high.aCO2.SB`
## outliers
## 1 466.4, 531.0, 604.0, 644.9, 677.5
## 2 4
## 3 514.0, 693.9
## 4
##
## $`2011.Rain.SB062 high.eCO2.SB`
## outliers
## 1 579.8, 648.7, 760.8, 846.9, 889.6
## 2 4
## 3 604.2, 917.5
## 4
##
## $`2011.Rain.Silverstar.aCO2.Silverstar`
## outliers
## 1 369.3, 435.3, 559.5, 632.8, 647.9
## 2 4
## 3 403.4, 715.6
## 4
##
## $`2011.Rain.Silverstar.eCO2.Silverstar`
## outliers
## 1 592.0, 655.8, 762.4, 824.0, 842.7
## 2 4
## 3 629.5, 895.3
## 4
##
## $`2011.Rain.SSR T65.aCO2.Silverstar`
## outliers
## 1 239.9, 247.0, 387.1, 534.8, 549.4
## 2 4
## 3 159.8, 614.4
## 4
##
## $`2011.Rain.SSR T65.eCO2.Silverstar`
## outliers
## 1 327.4, 387.2, 469.8, 570.5, 648.5
## 2 4
## 3 325.0, 614.6
## 4
##
## $`2012.Rain.SB003 low.aCO2.SB`
## outliers
## 1 273.9, 305.0, 384.0, 511.4, 590.9
## 2 4
## 3 220.9, 547.1
## 4
##
## $`2012.Rain.SB003 low.eCO2.SB`
## outliers
## 1 331.5, 380.3, 481.2, 613.9, 694.5
## 2 4
## 3 296.6, 665.8
## 4
##
## $`2012.Rain.SB062 high.aCO2.SB`
## outliers
## 1 273.3, 307.4, 399.0, 630.6, 804.7
## 2 4
## 3 143.6, 654.4
## 4
##
## $`2012.Rain.SB062 high.eCO2.SB`
## outliers
## 1 422.9, 473.6, 548.7, 662.7, 752.2
## 2 4
## 3 399.3, 698.1
## 4
##
## $`2012.Rain.Silverstar.aCO2.Silverstar`
## outliers
## 1 428.3, 442.9, 513.7, 668.1, 766.4
## 2 4
## 3 335.7, 691.6
## 4
##
## $`2012.Rain.Silverstar.eCO2.Silverstar`
## outliers
## 1 523.8, 536.1, 579.5, 612.1, 613.4
## 2 4
## 3 519.5, 639.5
## 4
##
## $`2012.Rain.SSR T65.aCO2.Silverstar`
## outliers
## 1 424.2, 443.4, 462.8, 494.4, 525.7
## 2 4
## 3 422.5, 503.1
## 4
##
## $`2012.Rain.SSR T65.eCO2.Silverstar`
## outliers
## 1 412.6, 468.1, 588.9, 658.9, 663.6
## 2 4
## 3 438.1, 739.6
## 4
##
## $`2011.Sup.SB003 low.aCO2.SB`
## outliers
## 1 542.5, 548.4, 554.5, 566.4, 578.2
## 2 4
## 3 540.3, 568.7
## 4
##
## $`2011.Sup.SB003 low.eCO2.SB`
## outliers
## 1 509.5, 559.1, 631.6, 709.1, 763.5
## 2 4
## 3 513.1, 750.1
## 4
##
## $`2011.Sup.SB062 high.aCO2.SB`
## outliers
## 1 524.5, 569.7, 625.2, 698.5, 761.5
## 2 4
## 3 523.4, 727.0
## 4
##
## $`2011.Sup.SB062 high.eCO2.SB`
## outliers
## 1 634.4, 645.2, 702.6, 878.9, 1008.4
## 2 4
## 3 518.0, 887.3
## 4
##
## $`2011.Sup.Silverstar.aCO2.Silverstar`
## outliers
## 1 492.3, 563.0, 676.8, 744.1, 768.1
## 2 4
## 3 533.8, 819.9
## 4
##
## $`2011.Sup.Silverstar.eCO2.Silverstar`
## outliers
## 1 723.5, 758.9, 860.8, 939.0, 950.7
## 2 4
## 3 718.5, 1003.0
## 4
##
## $`2011.Sup.SSR T65.aCO2.Silverstar`
## outliers
## 1 354.8, 356.1, 389.8, 487.6, 552.8
## 2 4
## 3 285.9, 493.7
## 4
##
## $`2011.Sup.SSR T65.eCO2.Silverstar`
## outliers
## 1 475.2, 509.5, 569.5, 669.6, 744.0
## 2 4
## 3 443.0, 695.9
## 4
##
## $`2012.Sup.SB003 low.aCO2.SB`
## outliers
## 1 521.7, 529.7, 541.4, 603.7, 662.1
## 2 4
## 3 483.0, 599.9
## 4
##
## $`2012.Sup.SB003 low.eCO2.SB`
## outliers
## 1 635.7, 709.4, 805.8, 853.9, 879.3
## 2 4
## 3 691.6, 919.9
## 4
##
## $`2012.Sup.SB062 high.aCO2.SB`
## outliers
## 1 522.3, 527.2, 550.2, 653.4, 738.3
## 2 4
## 3 450.5, 649.9
## 4
##
## $`2012.Sup.SB062 high.eCO2.SB`
## outliers
## 1 735.2, 743.0, 761.3, 779.1, 786.4
## 2 4
## 3 732.8, 789.7
## 4
##
## $`2012.Sup.Silverstar.aCO2.Silverstar`
## outliers
## 1 543.9, 575.3, 609.8, 640.3, 667.6
## 2 4
## 3 558.4, 661.3
## 4
##
## $`2012.Sup.Silverstar.eCO2.Silverstar`
## outliers
## 1 713.0, 732.0, 865.9, 1020.7, 1060.6
## 2 4
## 3 637.8, 1093.9
## 4
##
## $`2012.Sup.SSR T65.aCO2.Silverstar`
## outliers
## 1 423.3, 466.3, 567.0, 647.4, 670.0
## 2 4
## 3 424.0, 710.1
## 4
##
## $`2012.Sup.SSR T65.eCO2.Silverstar`
## outliers
## 1 531.1, 615.3, 730.0, 818.5, 876.5
## 2 4
## 3 569.5, 890.5
## 4
##
## attr(,"split_type")
## [1] "data.frame"
## attr(,"split_labels")
## Environment Cultivar CO2 my.Trait
## 1 2011.Rain SB003 low aCO2 SB
## 2 2011.Rain SB003 low eCO2 SB
## 3 2011.Rain SB062 high aCO2 SB
## 4 2011.Rain SB062 high eCO2 SB
## 5 2011.Rain Silverstar aCO2 Silverstar
## 6 2011.Rain Silverstar eCO2 Silverstar
## 7 2011.Rain SSR T65 aCO2 Silverstar
## 8 2011.Rain SSR T65 eCO2 Silverstar
## 9 2012.Rain SB003 low aCO2 SB
## 10 2012.Rain SB003 low eCO2 SB
## 11 2012.Rain SB062 high aCO2 SB
## 12 2012.Rain SB062 high eCO2 SB
## 13 2012.Rain Silverstar aCO2 Silverstar
## 14 2012.Rain Silverstar eCO2 Silverstar
## 15 2012.Rain SSR T65 aCO2 Silverstar
## 16 2012.Rain SSR T65 eCO2 Silverstar
## 17 2011.Sup SB003 low aCO2 SB
## 18 2011.Sup SB003 low eCO2 SB
## 19 2011.Sup SB062 high aCO2 SB
## 20 2011.Sup SB062 high eCO2 SB
## 21 2011.Sup Silverstar aCO2 Silverstar
## 22 2011.Sup Silverstar eCO2 Silverstar
## 23 2011.Sup SSR T65 aCO2 Silverstar
## 24 2011.Sup SSR T65 eCO2 Silverstar
## 25 2012.Sup SB003 low aCO2 SB
## 26 2012.Sup SB003 low eCO2 SB
## 27 2012.Sup SB062 high aCO2 SB
## 28 2012.Sup SB062 high eCO2 SB
## 29 2012.Sup Silverstar aCO2 Silverstar
## 30 2012.Sup Silverstar eCO2 Silverstar
## 31 2012.Sup SSR T65 aCO2 Silverstar
## 32 2012.Sup SSR T65 eCO2 Silverstar
Might not be very helpful in the context of biological, or generally noisy data.
library(outliers)
# One-group boxplot
p <- ggplot(GRDC.yield, aes(x = "All samples", y = value))
p <- p + geom_boxplot(outlier.colour = "tomato")
p
## Warning: Removed 256 rows containing non-finite values (stat_boxplot).
# be rather careful with these functions, make sure you read and understand the helpfiles!
# Grubbs tests for one or two outliers in data sample
grubbs.test(GRDC.yield$value)
##
## Grubbs test for one outlier
##
## data: GRDC.yield$value
## G = 2.8416, U = 0.9359, p-value = 0.2521
## alternative hypothesis: highest value 1060.55111111111 is an outlier
# two outliers
p <- ggplot(GRDC.yield, aes(x = Cultivar, y = value))
p <- p + geom_boxplot(aes(fill = Cultivar))
p
## Warning: Removed 256 rows containing non-finite values (stat_boxplot).
# checking for two outliers on opposite ends of the range (type 11)
# this finds two outliers for all groups that we don't expect from the quantile based methods described above'! So be extemely cautious!!!
dlply(GRDC.yield,
.(Cultivar),
summarise,
grubbs.test(value, type = 11))
## $`SB003 low`
## ..1
## 1 4.3498, 0.6948
## 2 273.934222222222 and 879.345777777778 are outliers
## 3 0.5178
## 4 Grubbs test for two opposite outliers
## 5 value
##
## $`SB062 high`
## ..1
## 1 4.6572, 0.6502
## 2 273.322666666667 and 1008.40103703704 are outliers
## 3 0.1804
## 4 Grubbs test for two opposite outliers
## 5 value
##
## $Silverstar
## ..1
## 1 4.2164, 0.7089
## 2 369.344962962963 and 1060.55111111111 are outliers
## 3 0.7873
## 4 Grubbs test for two opposite outliers
## 5 value
##
## $`SSR T65`
## ..1
## 1 4.3685, 0.6883
## 2 239.926444444444 and 876.501333333333 are outliers
## 3 0.4874
## 4 Grubbs test for two opposite outliers
## 5 value
##
## attr(,"split_type")
## [1] "data.frame"
## attr(,"split_labels")
## Cultivar
## 1 SB003 low
## 2 SB062 high
## 3 Silverstar
## 4 SSR T65
# A very fuzzy method, seems to be even more dangerous:
# outlier Find value with largest difference from the mean
# Finds value with largest difference between it and sample mean, which can be an outlier
outlier(GRDC.yield$value)
## [1] 1061
ddply(GRDC.yield,
.(Cultivar),
summarise,
outlier(value))
## Cultivar ..1
## 1 SB003 low 879.3
## 2 SB062 high 1008.4
## 3 Silverstar 1060.6
## 4 SSR T65 876.5
See ?geom_boxplot notch: if FALSE (default) make a standard box plot. If TRUE, make a notched box plot. Notches are used to compare groups; if the notches of two boxes do not overlap, this is strong evidence that the medians differ. […] In a notched box plot, the notches extend 1.58 * IQR / sqrt(n). This gives a roughly 95% interval for comparing medians. See McGill et al. (1978) for more details.
# Notched boxplots example:
p <- ggplot(GRDC.yield, aes(x = Cultivar, y = value))
p <- p + geom_boxplot(aes(fill = Cultivar), notch = TRUE)
p
## Warning: Removed 256 rows containing non-finite values (stat_boxplot).
## notch went outside hinges. Try setting notch=FALSE.
# Silverstar and SSR T65 seem to have different medians.
# testing the differences via aov and t.test for comparison
summary(aov(value ~ Cultivar, data = GRDC.yield[GRDC.yield$my.Trait == "Silverstar", ]))
## Df Sum Sq Mean Sq F value Pr(>F)
## Cultivar 1 361777 361777 15 0.00026 ***
## Residuals 62 1491349 24054
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 128 observations deleted due to missingness
t.test(value ~ Cultivar, data = GRDC.yield[GRDC.yield$my.Trait == "Silverstar", ])
##
## Welch Two Sample t-test
##
## data: value by Cultivar
## t = 3.878, df = 61.16, p-value = 0.0002598
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
## 72.84 227.90
## sample estimates:
## mean in group Silverstar mean in group SSR T65
## 673.8 523.4
summary(aov(value ~ Cultivar, data = GRDC.yield[GRDC.yield$my.Trait == "SB", ]))
## Df Sum Sq Mean Sq F value Pr(>F)
## Cultivar 1 72781 72781 3.29 0.075 .
## Residuals 62 1372799 22142
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 128 observations deleted due to missingness
t.test(value ~ Cultivar, data = GRDC.yield[GRDC.yield$my.Trait == "SB", ])
##
## Welch Two Sample t-test
##
## data: value by Cultivar
## t = -1.813, df = 61.04, p-value = 0.07475
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
## -141.831 6.941
## sample estimates:
## mean in group SB003 low mean in group SB062 high
## 572.6 640.0
# tillering boxplots with notches - where notches back-fire due to small sample sizes.
GRDC.para <- "Tillers.m2..SS.dry."
GRDC.tiller <- df.melt[df.melt$variable %in% GRDC.para &
df.melt$my.Trait == "Silverstar", ]
p <- ggplot(GRDC.tiller, aes(x = Environment, y = value))
p <- p + geom_boxplot(aes(fill = Cultivar, linetype = CO2), notch = TRUE)
p <- p + facet_grid(Stage ~ .)
p <- p + labs(y = expression(Tillers~m^-2),
x = "Environment")
p <- p + theme_bw()
# as there was a question on how to turn the axis-text and change the size
p <- p + theme(axis.text.x = element_text(angle = 90, hjust = 0, size = rel(0.5)))
p
## notch went outside hinges. Try setting notch=FALSE.
## notch went outside hinges. Try setting notch=FALSE.
## notch went outside hinges. Try setting notch=FALSE.
## notch went outside hinges. Try setting notch=FALSE.
## notch went outside hinges. Try setting notch=FALSE.
## notch went outside hinges. Try setting notch=FALSE.
## notch went outside hinges. Try setting notch=FALSE.
## notch went outside hinges. Try setting notch=FALSE.
## notch went outside hinges. Try setting notch=FALSE.
## notch went outside hinges. Try setting notch=FALSE.
## notch went outside hinges. Try setting notch=FALSE.
## notch went outside hinges. Try setting notch=FALSE.
## notch went outside hinges. Try setting notch=FALSE.
## notch went outside hinges. Try setting notch=FALSE.
## notch went outside hinges. Try setting notch=FALSE.
## notch went outside hinges. Try setting notch=FALSE.
## notch went outside hinges. Try setting notch=FALSE.
## notch went outside hinges. Try setting notch=FALSE.
## notch went outside hinges. Try setting notch=FALSE.
## notch went outside hinges. Try setting notch=FALSE.
## notch went outside hinges. Try setting notch=FALSE.
## notch went outside hinges. Try setting notch=FALSE.
## notch went outside hinges. Try setting notch=FALSE.
## notch went outside hinges. Try setting notch=FALSE.
## notch went outside hinges. Try setting notch=FALSE.
## notch went outside hinges. Try setting notch=FALSE.
## notch went outside hinges. Try setting notch=FALSE.
## notch went outside hinges. Try setting notch=FALSE.
## notch went outside hinges. Try setting notch=FALSE.
## notch went outside hinges. Try setting notch=FALSE.
## notch went outside hinges. Try setting notch=FALSE.
## notch went outside hinges. Try setting notch=FALSE.
## notch went outside hinges. Try setting notch=FALSE.
## notch went outside hinges. Try setting notch=FALSE.
## notch went outside hinges. Try setting notch=FALSE.
## notch went outside hinges. Try setting notch=FALSE.
## notch went outside hinges. Try setting notch=FALSE.
## notch went outside hinges. Try setting notch=FALSE.
## notch went outside hinges. Try setting notch=FALSE.
## notch went outside hinges. Try setting notch=FALSE.
## notch went outside hinges. Try setting notch=FALSE.
## notch went outside hinges. Try setting notch=FALSE.
## notch went outside hinges. Try setting notch=FALSE.
## notch went outside hinges. Try setting notch=FALSE.
## notch went outside hinges. Try setting notch=FALSE.
## notch went outside hinges. Try setting notch=FALSE.
## notch went outside hinges. Try setting notch=FALSE.
## notch went outside hinges. Try setting notch=FALSE.