Conversion Rate Analysis

Nir Regev
Principal Data Scientist
Sisense Ltd.

May 16th, 2016

Conversion Anomaly Detection

The frequentist statistics threshold approach

library(ggplot2)
library(Rmpfr)
## Warning: package 'Rmpfr' was built under R version 3.2.5
## Loading required package: gmp
## Warning: package 'gmp' was built under R version 3.2.5
## 
## Attaching package: 'gmp'
## The following objects are masked from 'package:base':
## 
##     %*%, apply, crossprod, matrix, tcrossprod
## C code of R package 'Rmpfr': GMP using 64 bits per limb
## 
## Attaching package: 'Rmpfr'
## The following objects are masked from 'package:stats':
## 
##     dbinom, dnorm, dpois, pnorm
## The following objects are masked from 'package:base':
## 
##     cbind, pmax, pmin, rbind
# create data from a binomial distribution: 10000 draws, each the number of
# successes in 10000 trials with success probability 0.05
mass.traffic <- data.frame(conversion = rbinom(10000, 10000, .05))
# create the empirical cdf from the very sample that is plotted below, so the
# probability reported at the end describes the same data as the plots
mass.traffic.cdf <- ecdf(mass.traffic$conversion)
# let's plot the binomial distribution
a <- ggplot(mass.traffic, aes(conversion))
a + geom_density(kernel = "gaussian")

# let's plot the cdf
ggplot(mass.traffic, aes(conversion)) + stat_ecdf(geom = "step")

# the probability P(conversions <= 400): an ecdf evaluates F(x) = P(X <= x)
mass.traffic.cdf(400)
## [1] 0
# same experiment at a smaller scale: 1000 draws, each the number of
# successes in 1000 trials with success probability 0.05
mass.traffic <- data.frame(conversion = rbinom(1000, 1000, .05))
# create the empirical cdf from the very sample that is plotted below, so the
# probability reported at the end describes the same data as the plots
mass.traffic.cdf <- ecdf(mass.traffic$conversion)
# let's plot the binomial distribution
a <- ggplot(mass.traffic, aes(conversion))
a + geom_density(kernel = "gaussian")

# let's plot the cdf
ggplot(mass.traffic, aes(conversion)) + stat_ecdf(geom = "step")

# the probability P(conversions <= 40): an ecdf evaluates F(x) = P(X <= x)
mass.traffic.cdf(40)
## [1] 0.082

Static threshold ? Better call Saul…

log_likelihood <- function(n, c, theta){
  # Binomial log-likelihood of the observed data, summed over observations.
  #   n     - number of trials for each observation (binomial `size`)
  #   c     - number of successes for each observation
  #   theta - success probability for each observation
  # Returns a single number: sum of log P(c | n, theta).
  #
  # The log density is requested from dbinom() directly (log = TRUE) instead
  # of taking log() of the density: a probability too small for a double
  # would otherwise underflow to 0 and turn the whole sum into -Inf.
  sum(dbinom(x = c, size = n, prob = theta, log = TRUE))
}
bayesian_anomaly_detector <- function(n, c, base_cr=0.05, null_prior=0.98, post_jump_cr=0.03){
  # Returns the posterior beliefs about a change in conversion rate.
  #   n            - trials (traffic) per observation
  #   c            - successes (conversions) per observation
  #   base_cr      - conversion rate before any change
  #   null_prior   - prior probability that the rate never changes
  #   post_jump_cr - conversion rate from the change onwards
  #
  # The result has length(n) + 1 entries that sum to 1:
  #   [1]     posterior probability that the rate stayed at base_cr throughout
  #   [i + 1] posterior probability that the rate switched to post_jump_cr
  #           starting at observation i
  # The prior mass (1 - null_prior) is spread evenly over the change times.
  #
  # Everything is accumulated as log(prior) + log-likelihood and normalised
  # with the log-sum-exp trick. Exponentiating the raw log-likelihoods first
  # underflows to 0 on larger data sets and makes the normalisation 0 / 0.
  num_periods <- length(n)
  log_weight <- numeric(num_periods + 1)

  # Hypothesis 1: no change, every observation uses base_cr
  log_weight[1] <- log(null_prior) +
    log_likelihood(n, c, rep(base_cr, num_periods))

  # Hypotheses 2..: the rate becomes post_jump_cr from observation i onwards
  log_change_prior <- log((1.0 - null_prior) / num_periods)
  for (i in seq_along(n)) {
    theta <- rep(base_cr, num_periods)
    theta[i:num_periods] <- post_jump_cr
    log_weight[i + 1] <- log_change_prior + log_likelihood(n, c, theta)
  }

  # Shift by the largest log weight so the biggest term is exp(0) = 1
  weight <- exp(log_weight - max(log_weight))
  weight / sum(weight)
}

Let’s simulate some data and see how the model works

# daily traffic: the same 1000 visits on each of the 20 days
traffic <- rep(1000, 20)
# conversions observed on each of the 20 days
conversions <- c(
  51, 40, 51, 41, 44, 39, 54, 41, 61, 52,
  65, 58, 44, 49, 34, 39, 24, 28, 36, 43
)
# constant 5% rate for every day; the detector call below does not take this
# vector, it receives the baseline through base_cr instead
theta <- rep(0.05, 20)

# alternative: simulate the data instead of using the fixed vectors above
#n = rep(100, 20)
#c = rbinom(20,100,.05)
#c[14:length(c)] = rbinom(7,100,.03) #Jump occurs at t=13

# base_cr        - base conversion rate
# null_prior     - prior belief that no change occurs
# post_change_cr - conversion rate after a change (here a fall from 5% to 3%)
base_cr <- 0.05
null_prior <- 0.9
post_change_cr <- 0.03
posterior <- bayesian_anomaly_detector(
  traffic,
  conversions,
  base_cr = base_cr,
  null_prior = null_prior,
  post_jump_cr = post_change_cr
)
posterior
##  [1] 1.041438e-05 2.009700e-33 1.073354e-30 1.653824e-30 8.832853e-28
##  [6] 2.316029e-27 2.992800e-26 2.709736e-26 7.132298e-23 1.870133e-22
## [11] 2.034479e-17 1.849106e-14 1.687058e-08 3.724088e-04 4.812311e-03
## [16] 8.875069e-01 5.630366e-02 5.097837e-02 1.587750e-05 4.147302e-08
## [21] 7.619462e-09
# Switch to the author's project directory only when it exists: an unguarded
# setwd() on a missing path stops the script on every other machine.
project_dir <- "C:/Users/Nir.Regev/Documents/Target"
if (dir.exists(project_dir)) {
  setwd(project_dir)
}
# Load the detector from its script when the file is present; otherwise fall
# back to a definition already available in the session, and fail with a
# clear message when there is neither.
detector_file <- "bayesian_anomaly_detector.R"
if (file.exists(detector_file)) {
  source(detector_file)
} else if (!exists("bayesian_anomaly_detector", mode = "function")) {
  stop("bayesian_anomaly_detector() is not defined and '", detector_file,
       "' was not found in ", getwd(), call. = FALSE)
}
library(ggplot2)
# n = c( 1000.,  1000.,  1000.,  1000.,  1000.,  1000.,  1000.,  1000.,
#             1000.,  1000.,  1000.,  1000.,  1000.,  1000.,  1000.,  1000.,
#             1000.,  1000.,  1000.,  1000.)
# 
# c = c(51, 40, 51, 41, 44, 39, 54, 41, 61, 52, 65, 58, 44, 49, 34, 39, 24,
#            28, 36, 43)
# theta = c( 0.05,  0.05,  0.05,  0.05,  0.05,  0.05,  0.05,  0.05,  0.05, 0.05,
#            0.05,  0.05,  0.05,  0.05,  0.05,  0.05,  0.05,  0.05, 0.05,  0.05)

# no jumps in conversion rates
time <- 20           # number of simulated periods
traffic.mean <- 100  # visits per period
traffic.sd <- 10     # not used in this constant-traffic scenario
traffic <- rep(traffic.mean, time)
# every period converts at the 5% base rate, i.e. there is no change to find
conversions <- rbinom(time, traffic.mean, .05)
#c[14:length(c)] = rbinom(7,100,.03) #Jump occurs at t=13
base_cr <- 0.05
null_prior <- 0.9
post_jump_cr <- 0.04
posterior <- bayesian_anomaly_detector(traffic, conversions, base_cr, null_prior, post_jump_cr)
posterior
##  [1] 9.909206e-01 7.819393e-04 4.378502e-04 3.911951e-04 2.190513e-04
##  [6] 2.472130e-04 3.524150e-04 1.973363e-04 1.763091e-04 1.989757e-04
## [11] 1.407376e-04 9.954521e-05 1.419069e-04 4.077176e-04 3.642732e-04
## [16] 5.192904e-04 4.639574e-04 5.236047e-04 4.678119e-04 5.279548e-04
## [21] 2.420293e-03
# Posterior vs. prior probability of a change at each candidate time.
# posterior[1] is the "no change" hypothesis, so it is dropped here.
change_posterior <- posterior[-1]
df.posterior <- data.frame(time_of_jump = seq_len(time), probability = change_posterior)
df.prior <- data.frame(time_of_jump = seq_len(time), probability = rep((1 - null_prior) / time, time))
ymin <- min(change_posterior, df.prior$probability)
ymax <- max(change_posterior, df.prior$probability)
# Three most probable change times. Ranking only the change-time entries
# keeps this correct whether or not "no change" is the largest posterior;
# sorting the full vector and skipping its top value silently discards the
# real peak whenever a change hypothesis outweighs the null.
three_peaks <- sort(change_posterior, decreasing = TRUE)[1:3]
p <- ggplot() + 
  geom_line(data = df.posterior, aes(x = time_of_jump, y = probability, color = "Posterior")) +
  geom_line(data = df.prior, aes(x = time_of_jump, y = probability, color = "Prior"))  +
  xlab('Time of Jump') +
  ylab('Probability') +
  ylim(ymin, ymax)
# label placed at the height of the tallest change-time posterior
p + annotate("text", x = time - 2, y = as.numeric(three_peaks[1]), label = "Conversion Dropped ?",
             colour = "blue", size = 4)

# jump at time (time - 5)
jump_time <- time - 5
# random traffic from a gaussian distribution [mean = traffic.mean,
# sd = traffic.sd], floored to whole visits and forced non-negative
traffic <- abs(floor(rnorm(time, traffic.mean, traffic.sd)))
# Conversions are drawn from the simulated traffic of each period, so the
# counts are consistent with the `traffic` vector handed to the detector.
conversions <- rbinom(time, traffic, .05)
# from jump_time to the end the conversion rate drops from 5% to 3%
post_jump <- jump_time:time
conversions[post_jump] <- rbinom(length(post_jump), traffic[post_jump], .03)
base_cr <- 0.05
null_prior <- 0.99
post_jump_cr <- 0.03
posterior <- bayesian_anomaly_detector(traffic, conversions, base_cr, null_prior, post_jump_cr)
# posterior[1] is the "no change" hypothesis, so it is dropped from the plot
change_posterior <- posterior[-1]
df.posterior <- data.frame(time_of_jump = seq_len(time), probability = change_posterior)
df.prior <- data.frame(time_of_jump = seq_len(time), probability = rep((1 - null_prior) / time, time))
ymin <- min(change_posterior, df.prior$probability)
ymax <- max(change_posterior, df.prior$probability)
p <- ggplot() + 
  geom_line(data = df.posterior, aes(x = time_of_jump, y = probability, color = "Posterior")) +
  geom_line(data = df.prior, aes(x = time_of_jump, y = probability, color = "Prior"))  +
  xlab('Time of Jump') +
  ylab('Probability') +
  ylim(ymin, ymax)
# Three most probable change times, ranked among the change-time entries
# only: once a drop is detected a change hypothesis can outweigh "no change",
# and skipping the top value of the full posterior would then drop the peak.
three_peaks <- sort(change_posterior, decreasing = TRUE)[1:3]
p + annotate("text", x = jump_time, y = as.numeric(three_peaks[1]), label = "Conversion Dropped",
             colour = "blue", size = 4)

     # annotate("pointrange", x = time-5, y = as.numeric(three_peaks[1]), ymin = as.numeric(three_peaks[1]), ymax = as.numeric(three_peaks[1]), colour = "red", size = 1)

Choose prior wisely