Ví dụ về loại bỏ outlier có thể xem ở link sau:

Link này bao gồm 2 hàm để phát hiện (detect) outlier: <http://rpubs.com/hauselin/outliersDetect>

require(DMwR)

Chuẩn bị data

# Simulate 50 (x, y) observations; the seed is fixed so results reproduce.
set.seed(12345)
x <- rnorm(n = 50, mean = 5, sd = 0.5)
# NOTE(review): only 10 noise values are drawn here, so R recycles them
# across the 50 elements of x -- confirm whether rnorm(50) was intended.
y <- 2 * x + rnorm(n = 10) / 5
x[9:10] <- c(8, 7)              # Inject two outlier values into x
y[c(8, 11)] <- c(15, 16)        # Inject two outlier values into y
df <- data.frame(x = x, y = y)  # Data frame holding both variables

# Flag outlier values in each column with the boxplot rule: the $out
# component of boxplot.stats() holds the points beyond the whisker extremes.
outlier_values_x <- boxplot.stats(df$x)$out
outlier_values_y <- boxplot.stats(df$y)$out

# Row positions of the flagged values in each column.
idx <- which(df$x %in% outlier_values_x)
idy <- which(df$y %in% outlier_values_y)
# Rows flagged in either column (x hits first, then y, duplicates dropped).
id <- union(idx, idy)

# New data frame without outliers. The rows to keep are selected explicitly
# instead of using df[-id, ]: when no outlier is found `id` is empty, and
# df[-integer(0), ] would return zero rows rather than the full data frame.
df.new <- df[setdiff(seq_len(nrow(df)), id), ]
cor(df.new$x, df.new$y)

## [1] 0.9807549

# Rows flagged as outliers, kept separately in the order stored in `id`.
df.out <- df[id, ]

# Linear regression of y on x, fitted on the outlier-free data only.
model <- lm(y ~ x, data = df.new)

Vẽ đồ thị.

require(ggplot2)

## Loading required package: ggplot2

# Base layer: the cleaned data drawn as open circles, with both axes fixed
# to the range of the full data so the removed outliers remain in view.
p <- ggplot(data = df.new, mapping = aes(x, y)) +
  geom_point(shape = 1) +
  xlim(range(df$x)) +
  ylim(range(df$y))

# Overlay the linear fit (with standard-error band, extended over the full
# x-range) and draw the removed outliers in red, labelled with their row
# index taken from `id`.
p +
  geom_smooth(method = lm, se = TRUE, fullrange = TRUE) +
  geom_point(
    data = df.out, aes(x, y),
    shape = 16, colour = "red", size = 2
  ) +
  geom_text(
    data = df.out, aes(x, y),
    label = id, vjust = -0.5, hjust = 0
  )

## Warning: Removed 8 rows containing missing values (geom_smooth).

Another approach

Chú ý: kết quả của phương pháp này phụ thuộc rất nhiều vào bội số của giá trị trung bình Cook's distance được chọn làm ngưỡng (đoạn code bên dưới dùng ngưỡng bằng 2 lần giá trị trung bình). In general use, those observations that have a Cook's distance greater than 4 times the mean may be classified as influential. This is not a hard boundary. [http://r-statistics.co/Outlier-Treatment-With-R.html#outliers%20package]

# Fit the regression on the full data (outliers included) and compute the
# Cook's distance of every observation.
mod <- lm(y ~ x, data = df)
cooksd <- cooks.distance(mod)

# Observations are flagged when their Cook's distance exceeds 2 times the
# mean distance. The threshold is computed once so that the cutoff line,
# the labels and the row selection below always use the same value.
cooks_cutoff <- 2 * mean(cooksd, na.rm = TRUE)

plot(cooksd, pch = "*", cex = 2,
     main = "Influential Obs by Cooks distance")  # plot Cook's distance
abline(h = cooks_cutoff, col = "red")             # add cutoff line
# Label the flagged points with their row name; labels are drawn one unit
# to the right of the point on the x axis.
text(x = seq_along(cooksd) + 1, y = cooksd,
     labels = ifelse(cooksd > cooks_cutoff, names(cooksd), ""),
     col = "red")

# Influential row numbers; which() drops any comparison that evaluates to NA.
influential <- as.numeric(names(cooksd)[which(cooksd > cooks_cutoff)])
df[influential, ]

##           x         y
## 8  4.861908 15.000000
## 9  8.000000 10.144853
## 10 7.000000  8.611289
## 11 4.941876 16.000000

Outliers

LDA

April 8, 2016

Chuẩn bị data

Vẽ đồ thị.

Another approach