library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
df <- data.frame(interval = rep(0, 100000))
df$val <- 0
for(i in 1:1000) {
n.dist <- rnorm(100, mean=10, sd=1)
s <- sample(1:length(n.dist), 5)
n.dist[s] <- n.dist[s] * runif(1, 1.5, 10) # add outliers
start <- (i-1) * 100 + 1
finish <- start + 99
df[start:finish, 'interval'] <- i
df[start:finish, 'val'] <- n.dist
}
hist(df$val, breaks = 100)
quantile(df$val, c(0, 25, 50, 75, 90, 95, 99, 100) / 100)
## 0% 25% 50% 75% 90% 95%
## 5.687101 9.367206 10.069279 10.801035 11.617129 13.654955
## 99% 100%
## 82.850342 122.334509
q_df <- df %>% group_by(interval) %>% summarise(q95 = quantile(val, 0.95), q99 = quantile(val, 0.99))
mean(q_df$q95)
## [1] 14.37472
mean(q_df$q99)
## [1] 59.42437
There is significant difference for 99% percentile.
df <- inner_join(df, q_df, by = 'interval')
df$outlier <- 0
df[df$val > df$q99, 'outlier'] <- 1
o.i <- df$outlier == 1
df[o.i, 'diff'] <- df[o.i, 'val'] - df[o.i, 'q99']
q_df[, c('err_sum', 'err_mean')] <- df %>% group_by(interval) %>% summarise(err_sum = sum(diff, na.rm = T), err_mean = mean(diff, na.rm = T)) %>% select(err_sum, err_mean)
head(q_df, 10)
## # A tibble: 10 <U+00D7> 5
## interval q95 q99 err_sum err_mean
## <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 1 12.67511 31.26303 2.2721062 2.2721062
## 2 2 14.59902 58.89609 3.1586920 3.1586920
## 3 3 13.47868 37.18748 2.1591758 2.1591758
## 4 4 13.50213 42.80225 0.7815183 0.7815183
## 5 5 12.48462 19.82491 0.5013201 0.5013201
## 6 6 16.26485 92.26356 12.1385140 12.1385140
## 7 7 12.13830 16.14357 2.5034237 2.5034237
## 8 8 15.18040 80.28339 0.4458414 0.4458414
## 9 9 13.76332 45.24825 1.4632086 1.4632086
## 10 10 13.03196 24.13079 9.1705759 9.1705759
It happends just beacause we have wide-range outliers and we get completely different percentiles for each interval.
bin.right.ranges <- seq(min(df$val), max(df$val), (max(df$val) - min(df$val)) / 100)
bin.val.counts <- rep(0, 100)
for(i in 1:nrow(df)) {
bin.index <- which.min(bin.right.ranges <= df[i, 'val'])
bin.val.counts[bin.index] <- bin.val.counts[bin.index] + 1
}
bin.val.counts <- bin.val.counts[!is.na(bin.val.counts)]
quantile(df$val, c(0, 25, 50, 75, 90, 95, 99, 100) / 100)
## 0% 25% 50% 75% 90% 95%
## 5.687101 9.367206 10.069279 10.801035 11.617129 13.654955
## 99% 100%
## 82.850342 122.334509
bin.right.ranges[which.max(cumsum(bin.val.counts) / sum(bin.val.counts) >= 0.95)]
## [1] 13.85242
bin.right.ranges[which.max(cumsum(bin.val.counts) / sum(bin.val.counts) >= 0.99)]
## [1] 83.84086