Comparison of two methods for outlier detection in a simple example.

References

  1. Robust Statistics

  2. Detecting outliers: Do not use standard deviation around the mean, use absolute deviation around the median

Robust outlier detection

##################################################################################################
# Robust outlier detection
# Location estimated with the median
# Scale estimated with the normalised mean absolute deviation
##################################################################################################
rm(list=ls())

# Normalised Median Absolute Deviation
b <- 1/qnorm(0.75) # Normalisation constant
NMAD <- function(data)  b*median( abs(data - median(data)))

# Data (guess which entries are outliers by eye)
data <- c(3, 5, 5, 7, 9, 11, 11, 1000)

# Number of NMADS from the Median
a <- 3

# NMAD and Median
NMAD.data <- NMAD(data)
med <- median(data)

c(med,NMAD.data)
## [1] 8.000000 4.447807
# Robust outlier detection
Rob.Out.detect <- Vectorize(function(x){
  val <- abs((x - med)/NMAD.data)
  return(ifelse(val>a, TRUE, FALSE))
}  )

# Identifying outliers in the data with the robust method
cbind.data.frame(data,Rob.Out.detect(data))
##   data Rob.Out.detect(data)
## 1    3                FALSE
## 2    5                FALSE
## 3    5                FALSE
## 4    7                FALSE
## 5    9                FALSE
## 6   11                FALSE
## 7   11                FALSE
## 8 1000                 TRUE

Non-Robust outlier detection

##################################################################################################
# Non-Robust outlier detection
# Location estimated with the mean
# Scale estimated with the standard deviation
##################################################################################################


# Sample standard deviation
s.sigma <- function(data) sqrt( mean((data - mean(data))^2) )

# Number of SDs from the Mean
a <- 3

# Mean and standard deviation
mean.data <- mean(data)
sd.data <- s.sigma (data)

c(mean.data,sd.data)
## [1] 131.3750 328.3207
# Non-Robust outlier detection
Out.detect <- Vectorize(function(x){
  val <- abs((x - mean.data)/sd.data)
  return(ifelse(val>a, TRUE, FALSE))
}  )

# Identifying outliers in the data with the non-robust method
cbind.data.frame(data,Out.detect(data))
##   data Out.detect(data)
## 1    3            FALSE
## 2    5            FALSE
## 3    5            FALSE
## 4    7            FALSE
## 5    9            FALSE
## 6   11            FALSE
## 7   11            FALSE
## 8 1000            FALSE