Nir Regev
Chief Data Scientist
Sisense Ltd.
May 16th, 2016
library(ggplot2)## Warning: package 'ggplot2' was built under R version 3.2.5
library(Rmpfr)
# create data from binomail distribution
mass.traffic <- data.frame(conversion = rbinom(10000,10000,.05))
# create cdf for mass.traffic
mass.traffic.cdf <- ecdf(rbinom(10000,10000,.05))
# let's plot the binomial distribution
a <- ggplot(mass.traffic, aes(conversion))
a + geom_density(kernel="gaussian")# let's plot the cdf
ggplot(mass.traffic, aes(conversion)) + stat_ecdf(geom = "step")# the probability of P(conversions < 400)
mass.traffic.cdf(400)## [1] 0
mass.traffic <- data.frame(conversion = rbinom(1000,1000,.05))
# create cdf for mass.traffic
mass.traffic.cdf <- ecdf(rbinom(1000,1000,.05))
# let's plot the binomial distribution
a <- ggplot(mass.traffic, aes(conversion))
a + geom_density(kernel="gaussian")# let's plot the cdf
ggplot(mass.traffic, aes(conversion)) + stat_ecdf(geom = "step")# the probability of P(conversions < 40)mass.traffic.cdf(40)## [1] 0.073
Our goal is to estimate the posterior probability distribution function
Let’s define a likelihood function in R to calculate the probability to encounter a series of data points {Ci,Ni}
log_likelihood <- function(n, c, theta){
return (sum(log(dbinom(x = c, size = n, prob = theta))))
}bayesian_anomaly_detector <- function(n, c, base_cr=0.05, null_prior=0.98, post_jump_cr=0.03){
# Returns a posterior representing the beliefs on the probability of a
# change in conversion rate
# First value in the returned vector - represent the null hypothesis prior (our belief that there will be no change in conversion rate)
# Then, 2nd value represents our belief that conversion rate changed at time 1, 3rd value at time 2 ...
theta = rep(base_cr, length(n))
likelihood = rep(0 , length(n)+1) #First element represents the probability of no change
likelihood[1] = null_prior #Set likelihood equal to prior
likelihood[2:length(likelihood)] = (1.0-null_prior) / length(n) #Remainder represents probability of a change in c. rate
likelihood[1] = likelihood[1] * exp(log_likelihood(n, c, theta))
for (i in 1:(length(n))){
theta[] = base_cr
theta[i:length(n)] = post_jump_cr
likelihood[i+1] = likelihood[i+1] * exp(log_likelihood(n, c, theta))
}
likelihood = likelihood / sum(likelihood)
return (c(likelihood[1], likelihood[2:(length(likelihood))]))
}# traffic for 20 days
traffic = c( 1000, 1000, 1000, 1000, 1000,
1000, 1000, 1000, 1000, 1000,
1000, 1000, 1000, 1000, 1000,
1000, 1000, 1000, 1000, 1000)
# conversions for 20 days
conversions = c(51, 40, 51, 41, 44,
52, 40, 51, 42, 44,
50, 41, 54, 41, 44,
53, 40, 50, 40, 44)
# priors for a change in conversion rate (a fall from 5% to 4%)
theta = c( 0.05, 0.05, 0.05, 0.05, 0.05,
0.05, 0.05, 0.05, 0.05, 0.05,
0.05, 0.05, 0.05, 0.05, 0.05,
0.05, 0.05, 0.05, 0.05, 0.05)
#n = rep(100, 20)
#c = rbinom(20,100,.05)
#c[14:length(c)] = rbinom(7,100,.03) #Jump occurs at t=13
#base_cr - base conversion rate
#post_change_cr - conversion rate after a change
# null_prior - prior belief that no change occurs
base_cr=0.05; null_prior=0.9; post_change_cr=0.03
posterior <- bayesian_anomaly_detector(traffic, conversions,base_cr, null_prior, post_change_cr)
posterior## [1] 9.992901e-01 7.909490e-33 4.224354e-30 6.508883e-30 3.476308e-27
## [6] 9.115098e-27 1.177864e-25 1.070542e-22 1.649491e-22 8.809712e-20
## [11] 3.930987e-19 5.079668e-18 1.594227e-15 4.180164e-15 1.100261e-11
## [16] 2.884954e-11 3.727971e-10 5.766046e-07 8.884322e-07 2.788298e-04
## [21] 4.296209e-04
setwd("C:/Users/Nir.Regev/Documents/Target")
source("bayesian_anomaly_detector.R")
time <- 40
traffic.mean <- 1000
traffic.sd <- 100
traffic = rep(traffic.mean, time)
conversions = rbinom(time,traffic.mean,.05)
#c[14:length(c)] = rbinom(7,100,.03) #Jump occurs at t=13
base_cr=0.05; null_prior=0.9; post_jump_cr=0.03
posterior <- bayesian_anomaly_detector(traffic,conversions,base_cr, null_prior, post_jump_cr)
posterior## [1] 1.000000e+00 6.397294e-103 4.857727e-102 6.301242e-98 3.352576e-98
## [6] 6.182963e-96 5.619600e-93 6.090130e-91 1.602983e-87 5.030880e-85
## [11] 9.278165e-83 1.198935e-81 2.636491e-80 4.077856e-77 2.606848e-72
## [16] 8.212777e-67 1.806012e-65 1.154528e-60 6.166179e-58 1.132856e-58
## [21] 3.541851e-59 7.788627e-58 1.719297e-53 2.221696e-52 3.423188e-52
## [26] 5.294638e-49 9.764598e-47 1.062270e-41 5.673445e-39 5.156506e-36
## [31] 4.686669e-33 2.984617e-31 4.598696e-31 2.051987e-30 3.173805e-27
## [36] 4.908918e-24 5.340310e-19 2.392025e-15 1.067347e-14 2.809365e-11
## [41] 3.044594e-09
df.posterior <- data.frame(time_of_jump = c(1:time), probability = posterior[2:length(posterior)])
df.prior <- data.frame(time_of_jump = c(1:time), probability = rep((1-null_prior)/time,time))
ymin <- min(min(posterior[2:length(posterior)]),min(df.prior$probability))
ymax <- max(max(posterior[2:length(posterior)]),max(df.prior$probability))
three_peaks <- sort(mpfr(posterior,1024),decreasing = T)[2:4]
p <- ggplot() +
geom_line(data = df.posterior, aes(x = time_of_jump, y = probability, color = "Posterior")) +
geom_line(data = df.prior, aes(x = time_of_jump, y = probability, color = "Prior")) +
xlab('Time of Jump') +
ylab('Probability') +
ylim(ymin,ymax)
p# p + annotate("text", time-2, y = as.numeric(three_peaks[1]), label = "Conversion Dropped ?",
# colour = "blue", size = 3)time <- 40
traffic.mean <- 1000
traffic.sd <- 100
conversions = rbinom(time,traffic.mean,.05)
traffic = abs(floor(rnorm(time,traffic.mean,traffic.sd))) # generate random traffic from gaussian distribution [mean=10000,sd=5000]
conversions = rbinom(time,traffic.mean,.05)
# jump in time time - 5
fill.jump <- length(conversions) - (time-5)
conversions[(time-5):length(conversions)] = rbinom(fill.jump+1,traffic.mean,.03)
base_cr=0.05; null_prior=0.8; post_jump_cr=0.03
posterior <- bayesian_anomaly_detector(traffic,conversions,base_cr,null_prior,post_jump_cr)
df.posterior <- data.frame(time_of_jump = c(1:time), probability = posterior[2:length(posterior)])
df.prior <- data.frame(time_of_jump = c(1:time), probability = rep((1-null_prior)/time,time))
ymin <- min(min(posterior[2:length(posterior)]),min(df.prior$probability))
ymax <- max(max(posterior[2:length(posterior)]),max(df.prior$probability))
p <- ggplot() +
geom_line(data = df.posterior, aes(x = time_of_jump, y = probability, color = "Posterior")) +
geom_line(data = df.prior, aes(x = time_of_jump, y = probability, color = "Prior")) +
xlab('Time of Jump') +
ylab('Probability') +
ylim(ymin,ymax)
three_peaks <- sort(mpfr(posterior,1024),decreasing = T)[2:4]
p + annotate("text", time-5, y = as.numeric(three_peaks[1]), label = "Conversion Dropped",
colour = "blue", size = 4)posterior <- mpfr(posterior,1024)
prior <- df.prior$probability for (i in 1:20){
time <- 40
traffic.mean <- 1000
traffic.sd <- 100
conversions = rbinom(time,traffic.mean,.05)
traffic = abs(floor(rnorm(time,traffic.mean,traffic.sd))) # generate random traffic from gaussian distribution [mean=10000,sd=5000]
# jump in time time - 5
fill.jump <- length(conversions) - (time-5)
conversions[(time-5):length(conversions)] = rbinom(fill.jump+1,traffic.mean,.04) #Jump occurs at t=13
base_cr=0.05; null_prior=0.8; post_jump_cr=0.03
posterior <- bayesian_anomaly_detector(traffic,conversions,base_cr,null_prior,post_jump_cr)
df.posterior <- data.frame(time_of_jump = c(1:time), probability = posterior[2:length(posterior)])
df.prior <- data.frame(time_of_jump = c(1:time), probability = rep((1-null_prior)/time,time))
ymin <- min(min(posterior[2:length(posterior)]),min(df.prior$probability))
ymax <- max(max(posterior[2:length(posterior)]),max(df.prior$probability))
p <- ggplot() +
geom_line(data = df.posterior, aes(x = time_of_jump, y = probability, color = "Posterior")) +
geom_line(data = df.prior, aes(x = time_of_jump, y = probability, color = "Prior")) +
xlab('Time of Jump') +
ylab('Probability') +
ylim(ymin,ymax) #+
#theme_grey(base_size = 15)
three_peaks <- sort(mpfr(posterior,1024),decreasing = T)[2:4]
print(p + annotate("text", time-5, y = as.numeric(three_peaks[1]), label = "Conversion Dropped",
colour = "blue", size = 4))
posterior <- mpfr(posterior,1024)
prior <- df.prior$probability
}